00001
00002 =head1 LICENSE
00003
00004 Copyright [1999-2014] EMBL-European Bioinformatics Institute
00005
00006 Licensed under the Apache License, Version 2.0 (the "License");
00007 you may not use this file except in compliance with the License.
00008 You may obtain a copy of the License at
00009
00010 http:
00011
00012 Unless required by applicable law or agreed to in writing, software
00013 distributed under the License is distributed on an "AS IS" BASIS,
00014 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015 See the License for the specific language governing permissions and
00016 limitations under the License.
00017
00018 =head1 NAME
00019
00020 Bio::EnsEMBL::MetaData::MetaDataProcessor
00021
00022 =head1 SYNOPSIS
00023
00024 my $processor = Bio::EnsEMBL::MetaData::MetaDataProcessor->new();
00025 my $info = $processor->process_genome($dba);
00026
00027 =head1 DESCRIPTION
00028
00029 Object for generating GenomeInfo objects from DBAdaptor objects.
00030
00031 =head1 SEE ALSO
00032
00033 Bio::EnsEMBL::MetaData::BaseInfo
00034 Bio::EnsEMBL::MetaData::DBSQL::GenomeOrganismInfoAdaptor
00035
00036 =head1 AUTHOR
00037
00038 Dan Staines
00039
00040 =cut
00041
00042 package Bio::EnsEMBL::MetaData::MetaDataProcessor;
00043 use Bio::EnsEMBL::MetaData::GenomeInfo;
00044 use Bio::EnsEMBL::MetaData::GenomeComparaInfo;
00045 use Bio::EnsEMBL::Registry;
00046 use Bio::EnsEMBL::Utils::Exception qw/throw warning/;
00047 use Bio::EnsEMBL::Utils::Argument qw(rearrange);
00048 use Data::Dumper;
00049 use Log::Log4perl qw(get_logger);
00050 use Carp qw/confess croak/;
00051 use strict;
00052 use warnings;
00053
00054 =head1 SUBROUTINES/METHODS
00055
00056 =head2 new
00057 Arg [-CONTIGS] : Integer
00058 Set to retrieve a list of sequences
00059 Arg [-ANNOTATION_ANALYZER] : Bio::EnsEMBL::MetaData::AnnotationAnalyzer
00060 Arg [-VARIATION] : Integer
00061 Set to process variation
00062 Arg [-COMPARA] : Integer
00063 Set to process compara
00064 Arg [-INFO_ADAPTOR] : Bio::EnsEMBL::MetaData::DBSQL::GenomeInfoAdaptor
00065 Arg [-FORCE_UPDATE] : Integer
00066 Set to force update of existing metadata
00067 Description: Return a new instance of MetaDataProcessor
00068 Returntype : Bio::EnsEMBL::MetaData::MetaDataProcessor
00069 Exceptions : none
00070 Caller : general
00071 Status : Stable
00072 =cut
00073 sub new {
00074 my ( $caller, @args ) = @_;
00075 my $class = ref($caller) || $caller;
00076 my $self = bless( {}, $class );
00077 ( $self->{contigs}, $self->{annotation_analyzer},
00078 $self->{variation}, $self->{compara},
00079 $self->{info_adaptor}, $self->{force_update},
00080 $self->{release}, $self->{eg_release} )
00081 = rearrange( [ 'CONTIGS', 'ANNOTATION_ANALYZER',
00082 'VARIATION', 'COMPARA',
00083 'INFO_ADAPTOR', 'FORCE_UPDATE' ],
00084 @args );
00085 $self->{logger} = get_logger();
00086 return $self;
00087 }
00088
00089 =head2 process_metadata
00090 Arg : Arrayref of DBAdaptors
00091 Description: Process supplied DBAdaptors
00092 Returntype : Arrayref of Bio::EnsEMBL::MetaData::GenomeInfo
00093 Exceptions : none
00094 Caller : general
00095 Status : Stable
00096 =cut
00097 sub process_metadata {
00098 my ( $self, $dbas ) = @_;
00099
00100 # 1. create hash of DBAs
00101 my $dba_hash = {};
00102 my $comparas = [];
00103 for my $dba ( grep { $_->dbc()->dbname() !~ /ancestral/ } @{$dbas} ) {
00104 my $type;
00105 my $species = $dba->species();
00106 for my $t (qw(core otherfeatures variation funcgen)) {
00107 if ( $dba->dbc()->dbname() =~ m/$t/ ) {
00108 $type = $t;
00109 last;
00110 }
00111 }
00112 if ( defined $type ) {
00113 $dba_hash->{$species}{$type} = $dba if defined $type;
00114 }
00115 elsif ( $dba->dbc()->dbname() =~ m/_compara_/ ) {
00116 push @$comparas, $dba;
00117 }
00118 }
00119
00120 # 2. iterate through each genome
00121 my $genome_infos = {};
00122 my $n = 0;
00123 my $total = scalar( keys %$dba_hash );
00124 while ( my ( $genome, $dbas ) = each %$dba_hash ) {
00125 $self->{logger}->info( "Processing " . $genome . " (" . ++$n . "/$total)" );
00126 $genome_infos->{$genome} = $self->process_genome($dbas);
00127 }
00128
00129 # 3. apply compara
00130 for my $compara (@$comparas) {
00131 $self->process_compara( $compara, $genome_infos );
00132 }
00133
00134 return [ values(%$genome_infos) ];
00135 } ## end sub process_metadata
00136
00137 =head2 process_genome
00138 Arg : Hashref of DBAdaptors
00139 Description: Process supplied genome
00140 Returntype : Bio::EnsEMBL::MetaData::GenomeInfo
00141 Exceptions : none
00142 Caller : general
00143 Status : Stable
00144 =cut
00145 sub process_genome {
00146 my ( $self, $dbas ) = @_;
00147 my $dba = $dbas->{core};
00148 if ( !defined $dba ) {
00149 confess "DBA not defined for processing";
00150 }
00151 $dba->dbc()->reconnect_when_lost(1);
00152
00153 # get metadata container
00154 my $meta = $dba->get_MetaContainer();
00155 my $dbname = $dba->dbc()->dbname();
00156 my $size = get_dbsize($dba);
00157 my $tableN =
00158 $dba->dbc()->sql_helper()->execute_single_result(
00159 -SQL =>
00160 "select count(*) from information_schema.tables where table_schema=?",
00161 -PARAMS => [$dbname] );
00162
00163 my $strain = $meta->single_value_by_key('species.strain');
00164 my $serotype = $meta->single_value_by_key('species.serotype');
00165 my $name = $meta->get_display_name();
00166 my $taxonomy_id = $meta->get_taxonomy_id();
00167 my $species_taxonomy_id =
00168 $meta->single_value_by_key('species.species_taxonomy_id') || $taxonomy_id;
00169 my $assembly_accession = $meta->single_value_by_key('assembly.accession');
00170 my $assembly_name = $meta->single_value_by_key('assembly.name');
00171 my $genebuild = $meta->single_value_by_key('genebuild.start_date');
00172
00173 # get highest assembly level
00174 my ($assembly_level) =
00175 @{
00176 $dba->dbc()->sql_helper()->execute_simple(
00177 -SQL =>
00178 'select name from coord_system where species_id=? order by rank asc',
00179 -PARAMS => [ $dba->species_id() ] ) };
00180 my $division = 'Ensembl';
00181 my @divisions = sort @{ $meta->list_value_by_key('species.division') };
00182 if ( scalar @divisions > 0 ) {
00183 $division = $divisions[-1];
00184 }
00185
00186 my $md =
00187 Bio::EnsEMBL::MetaData::GenomeInfo->new(
00188 -name => $dba->species(),
00189 -species_id => $dba->species_id(),
00190 -division => $division,
00191 -dbname => $dbname,
00192 -data_release => $self->{data_release},
00193 -strain => $strain,
00194 -serotype => $serotype,
00195 -display_name => $name,
00196 -taxonomy_id => $taxonomy_id,
00197 -species_taxonomy_id => $species_taxonomy_id,
00198 -assembly_accession => $assembly_accession,
00199 -assembly_name => $assembly_name,
00200 -genebuild => $genebuild,
00201 -assembly_level => $assembly_level );
00202
00203 # get list of seq names
00204 my $seqs_arr = [];
00205
00206 if ( defined $self->{contigs} ) {
00207 my $seqs = {};
00208
00209 # 1. get complete list of seq_regions as a hash vs. ENA synonyms
00210 $dba->dbc()->sql_helper()->execute_no_return(
00211 -SQL => q/select distinct s.name, ss.synonym
00212 from coord_system c
00213 join seq_region s using (coord_system_id)
00214 left join seq_region_synonym ss on
00215 (ss.seq_region_id=s.seq_region_id and ss.external_db_id in
00216 (select external_db_id from external_db where db_name='INSDC'))
00217 where c.species_id=? and attrib like '%default_version%'/,
00218 -PARAMS => [ $dba->species_id() ],
00219 -CALLBACK => sub {
00220 my ( $name, $acc ) = @{ shift @_ };
00221 $seqs->{$name} = $acc;
00222 return;
00223 } );
00224
00225 # 2. add accessions where the name is flagged as being in ENA
00226 $dba->dbc()->sql_helper()->execute_no_return(
00227 -SQL => q/
00228 select s.name
00229 from coord_system c
00230 join seq_region s using (coord_system_id)
00231 join seq_region_attrib sa using (seq_region_id)
00232 where sa.value='ENA' and c.species_id=? and attrib like '%default_version%'/,
00233 -PARAMS => [ $dba->species_id() ],
00234 -CALLBACK => sub {
00235 my ($acc) = @{ shift @_ };
00236 $seqs->{$acc} = $acc;
00237 return;
00238 } );
00239
00240 while ( my ( $key, $acc ) = each %$seqs ) {
00241 push @$seqs_arr, { name => $key, acc => $acc };
00242 }
00243 } ## end if ( defined $self->{contigs...})
00244
00245 $md->assembly()->sequences($seqs_arr);
00246
00247 # get toplevel base count
00248 my $base_counts =
00249 $dba->dbc()->sql_helper()->execute_simple(
00250 -SQL =>
00251 q/select value from genome_statistics where statistic='ref_length' and species_id=?/,
00252 -PARAMS => [ $dba->species_id() ] );
00253
00254 if ( scalar @$base_counts == 0 ) {
00255 $base_counts = $dba->dbc()->sql_helper()->execute_simple(
00256 -SQL => q/select sum(s.length) from seq_region s
00257 join coord_system c using (coord_system_id)
00258 join seq_region_attrib sa using (seq_region_id)
00259 join attrib_type a using (attrib_type_id)
00260 where a.code='toplevel' and species_id=?/,
00261 -PARAMS => [ $dba->species_id() ] );
00262 }
00263
00264 $md->assembly()->base_count( $base_counts->[0] );
00265
00266 # get associated PMIDs
00267 $md->organism()->publications(
00268 $dba->dbc()->sql_helper()->execute_simple(
00269 -SQL => q/select distinct dbprimary_acc from
00270 xref
00271 join external_db using (external_db_id)
00272 join seq_region_attrib sa on (xref.xref_id=sa.value)
00273 join attrib_type using (attrib_type_id)
00274 join seq_region using (seq_region_id)
00275 join coord_system using (coord_system_id)
00276 where species_id=? and code='xref_id' and db_name in ('PUBMED')/,
00277 -PARAMS => [ $dba->species_id() ] ) );
00278
00279 # add aliases
00280 $md->organism()->aliases(
00281 $dba->dbc()->sql_helper()->execute_simple(
00282 -SQL => q/select distinct meta_value from meta
00283 where species_id=? and meta_key='species.alias'/,
00284 -PARAMS => [ $dba->species_id() ] ) );
00285
00286 if ( defined $self->{annotation_analyzer} ) {
00287
00288 # core annotation
00289 $self->{logger}
00290 ->info( "Processing " . $dba->species() . " core annotation" );
00291 $md->annotations( $self->{annotation_analyzer}->analyze_annotation($dba) );
00292
00293 # features
00294 my $core_ali = $self->{annotation_analyzer}->analyze_alignments($dba);
00295 my $other_ali = {};
00296 $md->features( $self->{annotation_analyzer}->analyze_features($dba) );
00297 my $other_features = $dbas->{otherfeatures};
00298 if ( defined $other_features ) {
00299 $self->{logger}
00300 ->info( "Processing " . $dba->species() . " otherfeatures annotation" );
00301 my %features = ( %{ $md->features() },
00302 %{$self->{annotation_analyzer}
00303 ->analyze_features($other_features) } );
00304 $other_ali =
00305 $self->{annotation_analyzer}->analyze_alignments($other_features);
00306 $size += get_dbsize($other_features);
00307 $md->features( \%features );
00308 $md->add_database( $other_features->dbc()->dbname() );
00309 }
00310 my $variation = $dbas->{variation};
00311
00312 # variation
00313 if ( defined $variation ) {
00314 $self->{logger}
00315 ->info( "Processing " . $dba->species() . " variation annotation" );
00316 $md->variations(
00317 $self->{annotation_analyzer}->analyze_variation($variation) );
00318 $size += get_dbsize($variation);
00319 $md->add_database( $variation->dbc()->dbname() );
00320 }
00321
00322 my $funcgen = $dbas->{funcgen};
00323 if ( defined $funcgen ) {
00324 $self->{logger}
00325 ->info( "Processing " . $dba->species() . " funcgen annotation" );
00326 $md->add_database( $funcgen->dbc()->dbname() );
00327 }
00328
00329 # BAM
00330 $self->{logger}
00331 ->info( "Processing " . $dba->species() . " read aligments" );
00332 my $read_ali =
00333 $self->{annotation_analyzer}
00334 ->analyze_tracks( $md->name(), $md->division() );
00335 my %all_ali = ( %{$core_ali}, %{$other_ali} );
00336
00337 # add bam tracks by count - use source name
00338 for my $bam ( @{ $read_ali->{bam} } ) {
00339 $all_ali{bam}{ $bam->{id} }++;
00340 }
00341 $md->other_alignments( \%all_ali );
00342 $md->db_size($size);
00343
00344 } ## end if ( defined $self->{annotation_analyzer...})
00345 $dba->dbc()->disconnect_if_idle();
00346 return $md;
00347 } ## end sub process_genome
00348
00349 my $DIVISION_NAMES = { 'bacteria' => 'EnsemblBacteria',
00350 'plants' => 'EnsemblPlants',
00351 'protists' => 'EnsemblProtists',
00352 'fungi' => 'EnsemblFungi',
00353 'metazoa' => 'EnsemblMetazoa',
00354 'pan_homology' => 'EnsemblPan' };
00355
00356 =head2 process_compara
00357 Arg : Bio::EnsEMBL::Compara::DBSQL::DBAdaptor
00358 Arg : Arrayref of Bio::EnsEMBL::MetaData::GenomeInfo
00359 Description: Process supplied compara
00360 Returntype : Bio::EnsEMBL::MetaData::GenomeComparaInfo
00361 Exceptions : none
00362 Caller : general
00363 Status : Stable
00364 =cut
00365 sub process_compara {
00366 my ( $self, $compara, $genomes ) = @_;
00367 if ( !defined $genomes ) {
00368 $genomes = {};
00369 }
00370 my $comparas = [];
00371 eval {
00372 $self->{logger}
00373 ->info( "Processing compara database " . $compara->dbc()->dbname() );
00374
00375 ( my $division = $compara->dbc()->dbname() ) =~
00376 s/ensembl_compara_([a-z_]+)_[0-9]+_[0-9]+/$1/;
00377
00378 $division = $DIVISION_NAMES->{$division} || $division;
00379
00380 my $adaptor = $compara->get_MethodLinkSpeciesSetAdaptor();
00381
00382 for my $method (
00383 qw/PROTEIN_TREES BLASTZ_NET LASTZ_NET TRANSLATED_BLAT TRANSLATED_BLAT_NET SYNTENY/
00384 )
00385 {
00386
00387 $self->{logger}
00388 ->info( "Processing method type $method from compara database " .
00389 $compara->dbc()->dbname() );
00390
00391 # group by species_set
00392 my $mlss_by_ss = {};
00393 for my $mlss ( @{ $adaptor->fetch_all_by_method_link_type($method) } ) {
00394 push @{ $mlss_by_ss->{ $mlss->species_set_obj()->dbID() } }, $mlss;
00395 }
00396
00397 for my $mlss_list ( values %$mlss_by_ss ) {
00398
00399 my $dbs = {};
00400 my $ss_name;
00401 for my $mlss ( @{$mlss_list} ) {
00402
00403 $ss_name ||= $mlss->species_set_obj()->get_tagvalue('name');
00404 for my $gdb ( @{ $mlss->species_set_obj()->genome_dbs() } ) {
00405 $dbs->{ $gdb->name() } = $gdb;
00406 }
00407 if ( !defined $ss_name ) {
00408 $ss_name = $mlss->name();
00409 }
00410 if ( defined $ss_name ) {
00411 last;
00412 }
00413 }
00414
00415 $self->{logger}->info(
00416 "Processing species set $ss_name for method $method from compara database "
00417 . $compara->dbc()->dbname() );
00418
00419 my $compara_info =
00420 Bio::EnsEMBL::MetaData::GenomeComparaInfo->new(
00421 -DBNAME => $compara->dbc()->dbname(),
00422 -DIVISION => $division,
00423 -METHOD => $method,
00424 -SET_NAME => $ss_name,
00425 -GENOMES => [] );
00426
00427 for my $gdb ( values %{$dbs} ) {
00428
00429 $self->{logger}->info( "Processing species " . $gdb->name() .
00430 " from species set $ss_name for method $method from compara database "
00431 . $compara->dbc()->dbname() );
00432
00433 my $genomeInfo = $genomes->{ $gdb->name() };
00434
00435 # have we got one in the database already?
00436 if ( !defined $genomeInfo && defined $self->{info_adaptor} ) {
00437 $self->{logger}->debug("Checking in the database");
00438 $genomeInfo = $self->{info_adaptor}->fetch_by_name( $gdb->name() );
00439
00440 if ( !defined $genomeInfo ) {
00441 my $current_release = $self->{info_adaptor}->data_release();
00442 if ( defined $current_release->ensembl_genomes_version() ) {
00443 # try the ensembl release
00444 my $ensembl_release =
00445 $self->{info_adaptor}->db()->get_DataReleaseInfoAdaptor()
00446 ->fetch_by_ensembl_release(
00447 $current_release->ensembl_version() );
00448 $self->{info_adaptor}->data_release($ensembl_release);
00449 $genomeInfo =
00450 $self->{info_adaptor}->fetch_by_name( $gdb->name() );
00451 $self->{info_adaptor}->data_release($current_release);
00452 }
00453 }
00454
00455 if ( !defined $genomeInfo ) {
00456 croak "Could not find genome info object for " . $gdb->name();
00457 }
00458 $self->{logger}->debug("Got one from the database");
00459 $genomes->{ $gdb->name() } = $genomeInfo;
00460 } ## end if ( !defined $genomeInfo...)
00461
00462 # last attempt, create one
00463 if ( !defined $genomeInfo ) {
00464 $self->{logger}->info( "Creating info object for " . $gdb->name() );
00465
00466 # get core dba
00467 my $dba;
00468 eval {
00469 $dba =
00470 Bio::EnsEMBL::Registry->get_DBAdaptor( $gdb->name(), 'core' );
00471 };
00472 if ( defined $dba ) {
00473 $genomeInfo = $self->process_genome( { core => $dba } );
00474 }
00475 else {
00476 croak "Could not find DBAdaptor for " . $gdb->name();
00477 }
00478 $genomeInfo->base_count(0);
00479 if ( !defined $genomeInfo ) {
00480 croak "Could not create a genome info object for " . $gdb->name();
00481 }
00482 $genomes->{ $gdb->name() } = $genomeInfo;
00483 }
00484 croak "Could not find genome info for " . $gdb->name()
00485 unless defined $genomeInfo;
00486
00487 push @{ $compara_info->genomes() }, $genomeInfo;
00488
00489 if ( !defined $genomeInfo->compara() ) {
00490 $genomeInfo->compara( [$compara_info] );
00491 }
00492 else {
00493 push @{ $genomeInfo->compara() }, $compara_info;
00494 }
00495 } ## end for my $gdb ( values %{...})
00496 $self->{logger}
00497 ->info("Adding compara info to list of analyses to store");
00498 push @$comparas, $compara_info if defined $compara_info;
00499 } ## end for my $mlss_list ( values...)
00500 } ## end for my $method ( ...)
00501
00502 $self->{logger}->info(
00503 "Completed processing compara database " . $compara->dbc()->dbname() );
00504 }; ## end eval
00505 if ($@) {
00506 $self->{logger}->warn( "Could not process compara: " . $@ );
00507 }
00508 return $comparas;
00509 } ## end sub process_compara
00510
00511 =head2 get_dbsize
00512 Arg : DBAdaptor
00513 Description: Calculate size of supplied database
00514 Returntype : Long
00515 Exceptions : none
00516 Caller : general
00517 Status : Stable
00518 =cut
00519 sub get_dbsize {
00520 my ($dba) = @_;
00521 return
00522 $dba->dbc()->sql_helper()->execute_single_result(
00523 -SQL =>
00524 "select SUM(data_length + index_length) from information_schema.tables where table_schema=?",
00525 -PARAMS => [ $dba->dbc()->dbname() ] );
00526 }
00527
00528 1;