ensembl-metadata/MetaDataProcessor_8pm_source.html

00001
00002 =head1 LICENSE
00003
00004 Copyright [1999-2014] EMBL-European Bioinformatics Institute
00005
00006 Licensed under the Apache License, Version 2.0 (the "License");
00007 you may not use this file except in compliance with the License.
00008 You may obtain a copy of the License at
00009
00010      http://www.apache.org/licenses/LICENSE-2.0
00011
00012 Unless required by applicable law or agreed to in writing, software
00013 distributed under the License is distributed on an "AS IS" BASIS,
00014 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015 See the License for the specific language governing permissions and
00016 limitations under the License.
00017
00018 =head1 NAME
00019
00020 Bio::EnsEMBL::MetaData::MetaDataProcessor
00021
00022 =head1 SYNOPSIS
00023
00024 my $processor = Bio::EnsEMBL::MetaData::MetaDataProcessor->new();
00025 my $info = $processor->process_genome($dba);
00026
00027 =head1 DESCRIPTION
00028
00029 Object for generating GenomeInfo objects from DBAdaptor objects.
00030
00031 =head1 SEE ALSO
00032
00033 Bio::EnsEMBL::MetaData::BaseInfo
00034 Bio::EnsEMBL::MetaData::DBSQL::GenomeOrganismInfoAdaptor
00035
00036 =head1 AUTHOR
00037
00038 Dan Staines
00039
00040 =cut
00041
00042 package Bio::EnsEMBL::MetaData::MetaDataProcessor;
00043 use Bio::EnsEMBL::MetaData::GenomeInfo;
00044 use Bio::EnsEMBL::MetaData::GenomeComparaInfo;
00045 use Bio::EnsEMBL::Registry;
00046 use Bio::EnsEMBL::Utils::Exception qw/throw warning/;
00047 use Bio::EnsEMBL::Utils::Argument qw(rearrange);
00048 use Data::Dumper;
00049 use Log::Log4perl qw(get_logger);
00050 use Carp qw/confess croak/;
00051 use strict;
00052 use warnings;
00053
00054 =head1 SUBROUTINES/METHODS
00055
00056 =head2 new
00057   Arg [-CONTIGS] : Integer
00058     Set to retrieve a list of sequences
00059   Arg [-ANNOTATION_ANALYZER] : Bio::EnsEMBL::MetaData::AnnotationAnalyzer
00060   Arg [-VARIATION] : Integer
00061     Set to process variation
00062   Arg [-COMPARA] : Integer
00063     Set to process compara
00064   Arg [-INFO_ADAPTOR] : Bio::EnsEMBL::MetaData::DBSQL::GenomeInfoAdaptor
00065   Arg [-FORCE_UPDATE] : Integer
00066     Set to force update of existing metadata
00067   Description: Return a new instance of MetaDataProcessor
00068   Returntype : Bio::EnsEMBL::MetaData::MetaDataProcessor
00069   Exceptions : none
00070   Caller     : general
00071   Status     : Stable
00072 =cut
00073 sub new {
00074   my ( $caller, @args ) = @_;
00075   my $class = ref($caller) || $caller;
00076   my $self = bless( {}, $class );
00077   ( $self->{contigs},      $self->{annotation_analyzer},
00078     $self->{variation},    $self->{compara},
00079     $self->{info_adaptor}, $self->{force_update},
00080     $self->{release},      $self->{eg_release} )
00081     = rearrange( [ 'CONTIGS',      'ANNOTATION_ANALYZER',
00082                    'VARIATION',    'COMPARA',
00083                    'INFO_ADAPTOR', 'FORCE_UPDATE' ],
00084                  @args );
00085   $self->{logger} = get_logger();
00086   return $self;
00087 }
00088
00089 =head2 process_metadata
00090   Arg        : Arrayref of DBAdaptors
00091   Description: Process supplied DBAdaptors
00092   Returntype : Arrayref of Bio::EnsEMBL::MetaData::GenomeInfo
00093   Exceptions : none
00094   Caller     : general
00095   Status     : Stable
00096 =cut
00097 sub process_metadata {
00098   my ( $self, $dbas ) = @_;
00099
00100   # 1. create hash of DBAs
00101   my $dba_hash = {};
00102   my $comparas = [];
00103   for my $dba ( grep { $_->dbc()->dbname() !~ /ancestral/ } @{$dbas} ) {
00104     my $type;
00105     my $species = $dba->species();
00106     for my $t (qw(core otherfeatures variation funcgen)) {
00107       if ( $dba->dbc()->dbname() =~ m/$t/ ) {
00108         $type = $t;
00109         last;
00110       }
00111     }
00112     if ( defined $type ) {
00113       $dba_hash->{$species}{$type} = $dba if defined $type;
00114     }
00115     elsif ( $dba->dbc()->dbname() =~ m/_compara_/ ) {
00116       push @$comparas, $dba;
00117     }
00118   }
00119
00120   # 2. iterate through each genome
00121   my $genome_infos = {};
00122   my $n            = 0;
00123   my $total        = scalar( keys %$dba_hash );
00124   while ( my ( $genome, $dbas ) = each %$dba_hash ) {
00125     $self->{logger}->info( "Processing " . $genome . " (" . ++$n . "/$total)" );
00126     $genome_infos->{$genome} = $self->process_genome($dbas);
00127   }
00128
00129   # 3. apply compara
00130   for my $compara (@$comparas) {
00131     $self->process_compara( $compara, $genome_infos );
00132   }
00133
00134   return [ values(%$genome_infos) ];
00135 } ## end sub process_metadata
00136
00137 =head2 process_genome
00138   Arg        : Hashref of DBAdaptors
00139   Description: Process supplied genome
00140   Returntype : Bio::EnsEMBL::MetaData::GenomeInfo
00141   Exceptions : none
00142   Caller     : general
00143   Status     : Stable
00144 =cut
00145 sub process_genome {
00146   my ( $self, $dbas ) = @_;
00147   my $dba = $dbas->{core};
00148   if ( !defined $dba ) {
00149     confess "DBA not defined for processing";
00150   }
00151   $dba->dbc()->reconnect_when_lost(1);
00152
00153   # get metadata container
00154   my $meta   = $dba->get_MetaContainer();
00155   my $dbname = $dba->dbc()->dbname();
00156   my $size   = get_dbsize($dba);
00157   my $tableN =
00158     $dba->dbc()->sql_helper()->execute_single_result(
00159         -SQL =>
00160           "select count(*) from information_schema.tables where table_schema=?",
00161         -PARAMS => [$dbname] );
00162
00163   my $strain      = $meta->single_value_by_key('species.strain');
00164   my $serotype    = $meta->single_value_by_key('species.serotype');
00165   my $name        = $meta->get_display_name();
00166   my $taxonomy_id = $meta->get_taxonomy_id();
00167   my $species_taxonomy_id =
00168     $meta->single_value_by_key('species.species_taxonomy_id') || $taxonomy_id;
00169   my $assembly_accession = $meta->single_value_by_key('assembly.accession');
00170   my $assembly_name      = $meta->single_value_by_key('assembly.name');
00171   my $genebuild          = $meta->single_value_by_key('genebuild.start_date');
00172
00173   # get highest assembly level
00174   my ($assembly_level) =
00175     @{
00176     $dba->dbc()->sql_helper()->execute_simple(
00177          -SQL =>
00178            'select name from coord_system where species_id=? order by rank asc',
00179          -PARAMS => [ $dba->species_id() ] ) };
00180   my $division  = 'Ensembl';
00181   my @divisions = sort @{ $meta->list_value_by_key('species.division') };
00182   if ( scalar @divisions > 0 ) {
00183     $division = $divisions[-1];
00184   }
00185
00186   my $md =
00187     Bio::EnsEMBL::MetaData::GenomeInfo->new(
00188                                    -name         => $dba->species(),
00189                                    -species_id   => $dba->species_id(),
00190                                    -division     => $division,
00191                                    -dbname       => $dbname,
00192                                    -data_release => $self->{data_release},
00193                                    -strain       => $strain,
00194                                    -serotype     => $serotype,
00195                                    -display_name => $name,
00196                                    -taxonomy_id  => $taxonomy_id,
00197                                    -species_taxonomy_id => $species_taxonomy_id,
00198                                    -assembly_accession  => $assembly_accession,
00199                                    -assembly_name       => $assembly_name,
00200                                    -genebuild           => $genebuild,
00201                                    -assembly_level      => $assembly_level );
00202
00203   # get list of seq names
00204   my $seqs_arr = [];
00205
00206   if ( defined $self->{contigs} ) {
00207     my $seqs = {};
00208
00209     # 1. get complete list of seq_regions as a hash vs. ENA synonyms
00210     $dba->dbc()->sql_helper()->execute_no_return(
00211       -SQL => q/select distinct s.name, ss.synonym
00212       from coord_system c
00213       join seq_region s using (coord_system_id)
00214       left join seq_region_synonym ss on
00215         (ss.seq_region_id=s.seq_region_id and ss.external_db_id in
00216             (select external_db_id from external_db where db_name='INSDC'))
00217       where c.species_id=? and attrib like '%default_version%'/,
00218       -PARAMS   => [ $dba->species_id() ],
00219       -CALLBACK => sub {
00220         my ( $name, $acc ) = @{ shift @_ };
00221         $seqs->{$name} = $acc;
00222         return;
00223       } );
00224
00225     # 2. add accessions where the name is flagged as being in ENA
00226     $dba->dbc()->sql_helper()->execute_no_return(
00227       -SQL => q/
00228       select s.name
00229       from coord_system c
00230       join seq_region s using (coord_system_id)
00231       join seq_region_attrib sa using (seq_region_id)
00232       where sa.value='ENA' and c.species_id=? and attrib like '%default_version%'/,
00233       -PARAMS   => [ $dba->species_id() ],
00234       -CALLBACK => sub {
00235         my ($acc) = @{ shift @_ };
00236         $seqs->{$acc} = $acc;
00237         return;
00238       } );
00239
00240     while ( my ( $key, $acc ) = each %$seqs ) {
00241       push @$seqs_arr, { name => $key, acc => $acc };
00242     }
00243   } ## end if ( defined $self->{contigs...})
00244
00245   $md->assembly()->sequences($seqs_arr);
00246
00247   # get toplevel base count
00248   my $base_counts =
00249     $dba->dbc()->sql_helper()->execute_simple(
00250     -SQL =>
00251 q/select value from genome_statistics where statistic='ref_length' and species_id=?/,
00252     -PARAMS => [ $dba->species_id() ] );
00253
00254   if ( scalar @$base_counts == 0 ) {
00255     $base_counts = $dba->dbc()->sql_helper()->execute_simple(
00256       -SQL => q/select sum(s.length) from seq_region s
00257 join coord_system c using (coord_system_id)
00258 join seq_region_attrib sa using (seq_region_id)
00259 join attrib_type a using (attrib_type_id)
00260 where a.code='toplevel' and species_id=?/,
00261       -PARAMS => [ $dba->species_id() ] );
00262   }
00263
00264   $md->assembly()->base_count( $base_counts->[0] );
00265
00266   # get associated PMIDs
00267   $md->organism()->publications(
00268     $dba->dbc()->sql_helper()->execute_simple(
00269       -SQL => q/select distinct dbprimary_acc from
00270       xref
00271       join external_db using (external_db_id)
00272       join seq_region_attrib sa on (xref.xref_id=sa.value)
00273       join attrib_type using (attrib_type_id)
00274       join seq_region using (seq_region_id)
00275       join coord_system using (coord_system_id)
00276       where species_id=? and code='xref_id' and db_name in ('PUBMED')/,
00277       -PARAMS => [ $dba->species_id() ] ) );
00278
00279   # add aliases
00280   $md->organism()->aliases(
00281     $dba->dbc()->sql_helper()->execute_simple(
00282       -SQL => q/select distinct meta_value from meta
00283       where species_id=? and meta_key='species.alias'/,
00284       -PARAMS => [ $dba->species_id() ] ) );
00285
00286   if ( defined $self->{annotation_analyzer} ) {
00287
00288     # core annotation
00289     $self->{logger}
00290       ->info( "Processing " . $dba->species() . " core annotation" );
00291     $md->annotations( $self->{annotation_analyzer}->analyze_annotation($dba) );
00292
00293     # features
00294     my $core_ali  = $self->{annotation_analyzer}->analyze_alignments($dba);
00295     my $other_ali = {};
00296     $md->features( $self->{annotation_analyzer}->analyze_features($dba) );
00297     my $other_features = $dbas->{otherfeatures};
00298     if ( defined $other_features ) {
00299       $self->{logger}
00300         ->info( "Processing " . $dba->species() . " otherfeatures annotation" );
00301       my %features = ( %{ $md->features() },
00302                        %{$self->{annotation_analyzer}
00303                            ->analyze_features($other_features) } );
00304       $other_ali =
00305         $self->{annotation_analyzer}->analyze_alignments($other_features);
00306       $size += get_dbsize($other_features);
00307       $md->features( \%features );
00308       $md->add_database( $other_features->dbc()->dbname() );
00309     }
00310     my $variation = $dbas->{variation};
00311
00312     # variation
00313     if ( defined $variation ) {
00314       $self->{logger}
00315         ->info( "Processing " . $dba->species() . " variation annotation" );
00316       $md->variations(
00317                   $self->{annotation_analyzer}->analyze_variation($variation) );
00318       $size += get_dbsize($variation);
00319       $md->add_database( $variation->dbc()->dbname() );
00320     }
00321
00322     my $funcgen = $dbas->{funcgen};
00323     if ( defined $funcgen ) {
00324       $self->{logger}
00325         ->info( "Processing " . $dba->species() . " funcgen annotation" );
00326       $md->add_database( $funcgen->dbc()->dbname() );
00327     }
00328
00329     # BAM
00330     $self->{logger}
00331       ->info( "Processing " . $dba->species() . " read aligments" );
00332     my $read_ali =
00333       $self->{annotation_analyzer}
00334       ->analyze_tracks( $md->name(), $md->division() );
00335     my %all_ali = ( %{$core_ali}, %{$other_ali} );
00336
00337     # add bam tracks by count - use source name
00338     for my $bam ( @{ $read_ali->{bam} } ) {
00339       $all_ali{bam}{ $bam->{id} }++;
00340     }
00341     $md->other_alignments( \%all_ali );
00342     $md->db_size($size);
00343
00344   } ## end if ( defined $self->{annotation_analyzer...})
00345   $dba->dbc()->disconnect_if_idle();
00346   return $md;
00347 } ## end sub process_genome
00348
00349 my $DIVISION_NAMES = { 'bacteria'     => 'EnsemblBacteria',
00350                        'plants'       => 'EnsemblPlants',
00351                        'protists'     => 'EnsemblProtists',
00352                        'fungi'        => 'EnsemblFungi',
00353                        'metazoa'      => 'EnsemblMetazoa',
00354                        'pan_homology' => 'EnsemblPan' };
00355
00356 =head2 process_compara
00357   Arg        : Bio::EnsEMBL::Compara::DBSQL::DBAdaptor
00358   Arg        : Arrayref of Bio::EnsEMBL::MetaData::GenomeInfo
00359   Description: Process supplied compara
00360   Returntype : Bio::EnsEMBL::MetaData::GenomeComparaInfo
00361   Exceptions : none
00362   Caller     : general
00363   Status     : Stable
00364 =cut
00365 sub process_compara {
00366   my ( $self, $compara, $genomes ) = @_;
00367   if ( !defined $genomes ) {
00368     $genomes = {};
00369   }
00370   my $comparas = [];
00371   eval {
00372     $self->{logger}
00373       ->info( "Processing compara database " . $compara->dbc()->dbname() );
00374
00375     ( my $division = $compara->dbc()->dbname() ) =~
00376       s/ensembl_compara_([a-z_]+)_[0-9]+_[0-9]+/$1/;
00377
00378     $division = $DIVISION_NAMES->{$division} || $division;
00379
00380     my $adaptor = $compara->get_MethodLinkSpeciesSetAdaptor();
00381
00382     for my $method (
00383       qw/PROTEIN_TREES BLASTZ_NET LASTZ_NET TRANSLATED_BLAT TRANSLATED_BLAT_NET SYNTENY/
00384       )
00385     {
00386
00387       $self->{logger}
00388         ->info( "Processing method type $method from compara database " .
00389                 $compara->dbc()->dbname() );
00390
00391       # group by species_set
00392       my $mlss_by_ss = {};
00393       for my $mlss ( @{ $adaptor->fetch_all_by_method_link_type($method) } ) {
00394         push @{ $mlss_by_ss->{ $mlss->species_set_obj()->dbID() } }, $mlss;
00395       }
00396
00397       for my $mlss_list ( values %$mlss_by_ss ) {
00398
00399         my $dbs = {};
00400         my $ss_name;
00401         for my $mlss ( @{$mlss_list} ) {
00402
00403           $ss_name ||= $mlss->species_set_obj()->get_tagvalue('name');
00404           for my $gdb ( @{ $mlss->species_set_obj()->genome_dbs() } ) {
00405             $dbs->{ $gdb->name() } = $gdb;
00406           }
00407           if ( !defined $ss_name ) {
00408             $ss_name = $mlss->name();
00409           }
00410           if ( defined $ss_name ) {
00411             last;
00412           }
00413         }
00414
00415         $self->{logger}->info(
00416 "Processing species set $ss_name for method $method from compara database "
00417             . $compara->dbc()->dbname() );
00418
00419         my $compara_info =
00420           Bio::EnsEMBL::MetaData::GenomeComparaInfo->new(
00421                                            -DBNAME => $compara->dbc()->dbname(),
00422                                            -DIVISION => $division,
00423                                            -METHOD   => $method,
00424                                            -SET_NAME => $ss_name,
00425                                            -GENOMES  => [] );
00426
00427         for my $gdb ( values %{$dbs} ) {
00428
00429           $self->{logger}->info( "Processing species " . $gdb->name() .
00430 " from species set $ss_name for method $method from compara database "
00431             . $compara->dbc()->dbname() );
00432
00433           my $genomeInfo = $genomes->{ $gdb->name() };
00434
00435           # have we got one in the database already?
00436           if ( !defined $genomeInfo && defined $self->{info_adaptor} ) {
00437             $self->{logger}->debug("Checking in the database");
00438             $genomeInfo = $self->{info_adaptor}->fetch_by_name( $gdb->name() );
00439
00440             if ( !defined $genomeInfo ) {
00441               my $current_release = $self->{info_adaptor}->data_release();
00442               if ( defined $current_release->ensembl_genomes_version() ) {
00443                 # try the ensembl release
00444                 my $ensembl_release =
00445                   $self->{info_adaptor}->db()->get_DataReleaseInfoAdaptor()
00446                   ->fetch_by_ensembl_release(
00447                                           $current_release->ensembl_version() );
00448                 $self->{info_adaptor}->data_release($ensembl_release);
00449                 $genomeInfo =
00450                   $self->{info_adaptor}->fetch_by_name( $gdb->name() );
00451                 $self->{info_adaptor}->data_release($current_release);
00452               }
00453             }
00454
00455             if ( !defined $genomeInfo ) {
00456               croak "Could not find genome info object for " . $gdb->name();
00457             }
00458             $self->{logger}->debug("Got one from the database");
00459             $genomes->{ $gdb->name() } = $genomeInfo;
00460           } ## end if ( !defined $genomeInfo...)
00461
00462           # last attempt, create one
00463           if ( !defined $genomeInfo ) {
00464             $self->{logger}->info( "Creating info object for " . $gdb->name() );
00465
00466             # get core dba
00467             my $dba;
00468             eval {
00469               $dba =
00470                 Bio::EnsEMBL::Registry->get_DBAdaptor( $gdb->name(), 'core' );
00471             };
00472             if ( defined $dba ) {
00473               $genomeInfo = $self->process_genome( { core => $dba } );
00474             }
00475             else {
00476               croak "Could not find DBAdaptor for " . $gdb->name();
00477             }
00478             $genomeInfo->base_count(0);
00479             if ( !defined $genomeInfo ) {
00480               croak "Could not create a genome info object for " . $gdb->name();
00481             }
00482             $genomes->{ $gdb->name() } = $genomeInfo;
00483           }
00484           croak "Could not find genome info for " . $gdb->name()
00485             unless defined $genomeInfo;
00486
00487           push @{ $compara_info->genomes() }, $genomeInfo;
00488
00489           if ( !defined $genomeInfo->compara() ) {
00490             $genomeInfo->compara( [$compara_info] );
00491           }
00492           else {
00493             push @{ $genomeInfo->compara() }, $compara_info;
00494           }
00495         } ## end for my $gdb ( values %{...})
00496         $self->{logger}
00497           ->info("Adding compara info to list of analyses to store");
00498         push @$comparas, $compara_info if defined $compara_info;
00499       } ## end for my $mlss_list ( values...)
00500     } ## end for my $method ( ...)
00501
00502     $self->{logger}->info(
00503          "Completed processing compara database " . $compara->dbc()->dbname() );
00504   };    ## end eval
00505   if ($@) {
00506     $self->{logger}->warn( "Could not process compara: " . $@ );
00507   }
00508   return $comparas;
00509 } ## end sub process_compara
00510
00511 =head2 get_dbsize
00512   Arg        : DBAdaptor
00513   Description: Calculate size of supplied database
00514   Returntype : Long
00515   Exceptions : none
00516   Caller     : general
00517   Status     : Stable
00518 =cut
00519 sub get_dbsize {
00520   my ($dba) = @_;
00521   return
00522     $dba->dbc()->sql_helper()->execute_single_result(
00523     -SQL =>
00524 "select SUM(data_length + index_length) from information_schema.tables where table_schema=?",
00525     -PARAMS => [ $dba->dbc()->dbname() ] );
00526 }
00527
00528 1;