00001
00002 =head1 LICENSE
00003
00004 Copyright [2009-2016] EMBL-European Bioinformatics Institute
00005
00006 Licensed under the Apache License, Version 2.0 (the "License");
00007 you may not use this file except in compliance with the License.
00008 You may obtain a copy of the License at
00009
00010 http:
00011
00012 Unless required by applicable law or agreed to in writing, software
00013 distributed under the License is distributed on an "AS IS" BASIS,
00014 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015 See the License for the specific language governing permissions and
00016 limitations under the License.
00017
00018 =head1 CONTACT
00019
00020 Please email comments or questions to the public Ensembl
00021 developers list at <dev@ensembl.org>.
00022
00023 Questions may also be sent to the Ensembl help desk at
00024 <helpdesk@ensembl.org>.
00025
00026 =head1 NAME
00027
00028 Bio::EnsEMBL::MetaData::AnnotationAnalyzer
00029
00030 =head1 SYNOPSIS
00031
00032 my $anal = Bio::EnsEMBL::MetaData::AnnotationAnalyzer->new();
00033 my $details = $anal->analyze_annotation($dba);
00034
00035 =head1 DESCRIPTION
00036
00037 Utility class for counting xrefs, variations etc. associated with a genome
00038
00039 =head1 AUTHOR
00040
00041 Dan Staines
00042
00043 =cut
00044
00045 use strict;
00046 use warnings;
00047
00048 package Bio::EnsEMBL::MetaData::AnnotationAnalyzer;
00049 use Bio::EnsEMBL::Utils::Exception qw/throw/;
00050
00051 use Log::Log4perl qw(get_logger);
00052 use Data::Dumper;
00053 use Config::IniFiles;
00054 use LWP::UserAgent;
00055 my $ua = LWP::UserAgent->new();
00056
00057 my $url_template =
00058 'https://raw.github.com/EnsemblGenomes/eg-web-DIVISION/master/conf/ini-files/SPECIES.ini';
00059
00060 =head1 SUBROUTINES/METHODS
00061
00062 =head2 new
00063 Description: Return a new instance of AnnotationAnalyzer
00064 Returntype : Bio::EnsEMBL::MetaData::AnnotationAnalyzer
00065 Exceptions : none
00066 Caller : general
00067 Status : Stable
00068 =cut
00069
00070 sub new {
00071 my $caller = shift;
00072 my $class = ref($caller) || $caller;
00073 my $self = bless( {}, $class );
00074 $self->{logger} = get_logger();
00075 return $self;
00076 }
00077
00078 =head2 analyze_annotation
00079 Arg : Bio::EnsEMBL::DBSQL::DBAdaptor
00080 Description: Analyzes annotation content of the supplied core DB
00081 Returntype : Hash ref with the following keys:
00082 nProteinCoding - number of protein coding genes
00083 nProteinCodingUniProtKBSwissProt - number of protein coding genes with at least one SwissPROT entry
00084 nProteinCodingUniProtKBTrEMBL - number of protein coding genes with at least one TrEMBL entry
00085 nProteinCodingGO - number of protein coding genes with at least one GO term
00086 nProteinCodingInterPro - number of protein coding genes with at least one InterPro match
00087 nUniProtKBSwissProt - number of distinct UniProtKB/TrEMBL entries matching at least one translation
00088 nUniProtKBTrEMBL - number of distinct UniProtKB/TrEMBL entries matching at least one translation
00089 nGO - number of distinct GO terms matching at least one translation
00090 nInterPro - number of distinct InterPro entries matching at least one feature
00091 nInterProDomains - number of distinct protein features matching an InterPro domains
00092 Exceptions : none
00093 Caller : general
00094 Status : Stable
00095 =cut
00096
00097 sub analyze_annotation {
00098 my ( $self, $dba ) = @_;
00099 $self->{logger}->debug( "Analysing annotation for " . $dba->species() );
00100 return {
00101 nProteinCoding => $self->count_by_biotype( $dba, 'protein_coding' ),
00102 nProteinCodingGO => $self->count_by_xref( $dba, 'GO', 'protein_coding' ),
00103 nProteinCodingUniProtKB =>
00104 $self->count_by_xref( $dba, [ 'Uniprot/SWISSPROT', 'Uniprot/SPTREMBL' ],
00105 'protein_coding' ),
00106 nProteinCodingUniProtKBSwissProt =>
00107 $self->count_by_xref( $dba, 'Uniprot/SWISSPROT', 'protein_coding' ),
00108 nProteinCodingUniProtKBTrEMBL =>
00109 $self->count_by_xref( $dba, 'Uniprot/SPTREMBL', 'protein_coding' ),
00110 nProteinCodingInterPro => $self->count_by_interpro($dba),
00111 nGO => $self->count_by_xref( $dba, 'GO', 'protein_coding' ),
00112 nUniProtKBSwissProt => $self->count_xrefs( $dba, 'Uniprot/SWISSPROT' ),
00113 nUniProtKBTrEMBL => $self->count_xrefs( $dba, 'Uniprot/SPTREMBL' ),
00114 nInterPro => $self->count_interpro($dba),
00115 nInterProDomains => $self->count_interpro_domains($dba) };
00116 }
00117
00118 =head2 analyze_features
00119 Arg : Bio::EnsEMBL::DBSQL::DBAdaptor
00120 Description: Analyzes features found in the supplied core/otherfeatures database (simple_feature,repeat_feature,protein_align_feature,dna_align_features)
00121 Returntype : Hash ref (keys are feature tables, values are hashrefs of count by analysis name)
00122 Exceptions : none
00123 Caller : general
00124 Status : Stable
00125 =cut
00126
00127 sub analyze_features {
00128 my ( $self, $dba ) = @_;
00129 return { simpleFeatures => $self->count_features( $dba, 'simple_feature' ),
00130 repeatFeatures => $self->count_features( $dba, 'repeat_feature' ) };
00131 }
00132
00133 =head2 analyze_variation
00134 Arg : Bio::EnsEMBL::Variation::DBSQL::DBAdaptor
00135 Description: Analyzes variation found in the supplied variation database
00136 Returntype : Hash ref with the following keys
00137 variations - count of variations per source
00138 structuralVariations - count of structural variations per source
00139 genotypes - count of genotypes per sample
00140 phenotypes - count of variation annotations per phenotype
00141 Exceptions : none
00142 Caller : general
00143 Status : Stable
00144 =cut
00145
00146 sub analyze_variation {
00147 my ( $self, $dba ) = @_;
00148 return { variations => $self->count_variations($dba),
00149 structuralVariations => $self->count_structural_variations($dba),
00150 genotypes => $self->count_genotypes($dba),
00151 phenotypes => $self->count_phenotypes($dba) };
00152 }
00153
00154 =head2 analyze_compara
00155 Arg : Bio::EnsEMBL::DBSQL::DBAdaptor
00156 Arg : Bio::EnsEMBL::Compara::DBSQL::DBAdaptor
00157 Description: Analyzes compara methods found in the supplied compara database involving the supplied core species
00158 Returntype : Hash ref (keys are method names, values are arrays of partner species)
00159 Exceptions : none
00160 Caller : general
00161 Status : Stable
00162 =cut
00163
00164 sub analyze_compara {
00165 my ( $self, $dba, $core ) = @_;
00166 my $compara = {};
00167 eval {
00168 my $gdb =
00169 $dba->get_GenomeDBAdaptor()->fetch_by_registry_name( $core->species() );
00170 for my $mlss (
00171 @{ $dba->get_MethodLinkSpeciesSetAdaptor()->fetch_all_by_GenomeDB($gdb) }
00172 )
00173 {
00174 my $t = $mlss->method()->type();
00175 next
00176 if ( $t eq 'FAMILY' ||
00177 $t eq 'ENSEMBL_ORTHOLOGUES' ||
00178 $t eq 'ENSEMBL_PARALOGUES' );
00179 for my $gdb2 ( grep { $gdb->dbID() ne $_->dbID() }
00180 @{ $mlss->species_set_obj->genome_dbs() } )
00181 {
00182 push( @{ $compara->{$t} }, $gdb2->name() );
00183 }
00184 }
00185 };
00186 if ($@) {
00187 warn "No compara entry found for " . $dba->species();
00188 }
00189 return $compara;
00190 } ## end sub analyze_compara
00191
00192 =head2 analyze_alignments
00193 Arg : Bio::EnsEMBL::DBSQL::DBAdaptor
00194 Description: Analyzes alignments for sequences to the assembly
00195 Returntype : Hash ref - keys are source_type (e.g. RNA seq), values are arrayrefs of hashrefs containing track details (source_name,description,source_url)
00196 Exceptions : none
00197 Caller : general
00198 Status : Stable
00199 =cut
00200
00201 sub analyze_alignments {
00202 my ( $self, $dba ) = @_;
00203 my $ali = {};
00204 my $pf = $self->count_features( $dba, 'protein_align_feature' );
00205 if ( scalar( keys %$pf ) > 0 ) {
00206 $ali->{proteinAlignFeatures} = $pf;
00207 }
00208 my $df = $self->count_features( $dba, 'dna_align_feature' );
00209 if ( scalar( keys %$df ) > 0 ) {
00210 $ali->{dnaAlignFeatures} = $df;
00211 }
00212 return $ali;
00213 }
00214
00215 =head2 analyze_tracks
00216 Arg : Bio::EnsEMBL::DBSQL::DBAdaptor
00217 Description: Analyzes tracks attached to the supplied core database
00218 Returntype : Hash ref - keys are source_type (e.g. RNA seq), values are arrayrefs of hashrefs containing track details (source_name,description,source_url)
00219 Exceptions : none
00220 Caller : general
00221 Status : Stable
00222 =cut
00223
00224 sub analyze_tracks {
00225 my ( $self, $species, $division ) = @_;
00226 $species = ucfirst($species);
00227 ( $division = lc $division ) =~ s/ensembl
00228 # get the ini file from git
00229 ( my $ini_url = $url_template ) =~ s/SPECIES/$species/;
00230 $ini_url =~ s/DIVISION/$division/;
00231
00232 my $req = HTTP::Request->new( GET => $ini_url );
00233 # Pass request to the user agent and get a response back
00234 my $res = $ua->request($req);
00235 my $ini;
00236 # Check the outcome of the response
00237 if ( $res->is_success ) {
00238 $ini = $res->content;
00239 }
00240 else {
00241 $self->{logger}
00242 ->debug( "Could not retrieve $ini_url: " . $res->status_line );
00243 }
00244
00245 # parse out and look at:
00246 # [ENSEMBL_INTERNAL_BAM_SOURCES]
00247 # 1_Puccinia_triticina_SRR035315 = dna_align_est
00248 # walk through keys and use keys to find sections like:
00249 # [1_Puccinia_triticina_SRR035315]
00250 # source_name = Transcriptomics sequences of fresh spores. Run id: SRR035315
00251 # description = RNA-Seq study.
00252 # source_url = http://ftp.sra.ebi.ac.uk/vol1/ERZ000/ERZ000002/SRR035315.bam
00253 # source_type = rnaseq
00254 # display = normal
00255 # then store some or all of this in my output e.g. {bam}{source_type}[{source_name,description,source_url}]
00256 my $bams = {};
00257 if ( defined $ini ) {
00258 my $cfg = Config::IniFiles->new( -file => \$ini );
00259 if ( defined $cfg ) {
00260 for my $bam ( $cfg->Parameters("ENSEMBL_INTERNAL_BAM_SOURCES") ) {
00261 push @{ $bams->{ $cfg->val( $bam, 'source_type' ) } }, {
00262 id => $bam,
00263 source_name => $cfg->val( $bam, 'source_name' ),
00264 source_url => $cfg->val( $bam, 'source_url' ),
00265 description => $cfg->val( $bam, 'description' ) };
00266 }
00267 }
00268 }
00269 return $bams;
00270 } ## end sub analyze_tracks
00271
00272 =head2 count_by_biotype
00273 Arg : Bio::EnsEMBL::DBSQL::DBAdaptor
00274 Arg : String (biotype)
00275 Description: Returns count of genes with specified biotype
00276 Returntype : Integer
00277 Exceptions : none
00278 Caller : internal
00279 Status : Stable
00280 =cut
00281
00282 sub count_by_biotype {
00283 my ( $self, $dba, $biotype ) = @_;
00284 return $dba->get_GeneAdaptor()->count_all_by_biotype($biotype);
00285 }
00286
00287 =head2 count_by_xref
00288 Arg : Bio::EnsEMBL::DBSQL::DBAdaptor
00289 Arg : Arrayref of dbnames
00290 Arg : (optional) String (biotype)
00291 Description: Returns count of genes with specified cross-reference
00292 Returntype : Integer
00293 Exceptions : none
00294 Caller : internal
00295 Status : Stable
00296 =cut
00297
00298 my $xref_gene_sql = q/select count(distinct(gene_id))
00299 from gene g
00300 join seq_region s using (seq_region_id)
00301 join coord_system c using (coord_system_id)
00302 join transcript tr using (gene_id)
00303 join translation t using (transcript_id)
00304 join object_xref ox on (ox.ensembl_id=t.translation_id and ox.ensembl_object_type='Translation')
00305 join xref x using (xref_id)
00306 join external_db d using (external_db_id)
00307 where species_id=? and d.db_name in (NAMES)/;
00308
00309 my $biotype_clause = q/ and g.biotype=?/;
00310
00311 sub count_by_xref {
00312 my ( $self, $dba, $db_names, $biotype ) = @_;
00313 $self->{logger}->debug( "Counting genes by " .
00314 join( ",", $db_names ) . " xref for " . $dba->species() );
00315 $db_names = [$db_names] if ( ref($db_names) ne 'ARRAY' );
00316 my $sql = $xref_gene_sql;
00317 my $db_name = join ',', map { "\"$_\"" } @$db_names;
00318 $sql =~ s/NAMES/$db_name/;
00319 my $params = [ $dba->species_id() ];
00320 if ( defined $biotype ) {
00321 $sql .= $biotype_clause;
00322 push @$params, $biotype;
00323 }
00324 $self->{logger}
00325 ->debug( "Executing $sql with params: [" . join( ",", @$params ) );
00326 return $dba->dbc()->sql_helper()
00327 ->execute_single_result( -SQL => $sql, -PARAMS => $params );
00328 }
00329
00330 =head2 count_xrefs
00331 Arg : Bio::EnsEMBL::DBSQL::DBAdaptor
00332 Arg : One or more dbnames
00333 Description: Returns count of numbers of cross-references to supplied database(s)
00334 Returntype : Integer
00335 Exceptions : none
00336 Caller : internal
00337 Status : Stable
00338 =cut
00339
00340 my $gene_xref_count_sql = q/
00341 select count(distinct(dbprimary_acc))
00342 from gene g
00343 join seq_region s using (seq_region_id)
00344 join coord_system c using (coord_system_id)
00345 join object_xref ox on (ox.ensembl_id=g.gene_id and ox.ensembl_object_type='Gene')
00346 join xref x using (xref_id)
00347 join external_db d using (external_db_id)
00348 where species_id=? and d.db_name=?/;
00349
00350 my $transcript_xref_count_sql = q/
00351 select count(distinct(dbprimary_acc))
00352 from transcript tr
00353 join seq_region s using (seq_region_id)
00354 join coord_system c using (coord_system_id)
00355 join object_xref ox on (ox.ensembl_id=tr.transcript_id and ox.ensembl_object_type='Transcript')
00356 join xref x using (xref_id)
00357 join external_db d using (external_db_id)
00358 where species_id=? and d.db_name=?/;
00359
00360 my $translation_xref_count_sql = q/
00361 select count(distinct(dbprimary_acc))
00362 from transcript tr
00363 join translation t using (transcript_id)
00364 join seq_region s using (seq_region_id)
00365 join coord_system c using (coord_system_id)
00366 join object_xref ox on (ox.ensembl_id=t.translation_id and ox.ensembl_object_type='Translation')
00367 join xref x using (xref_id)
00368 join external_db d using (external_db_id)
00369 where species_id=? and d.db_name=?/;
00370
00371 sub count_xrefs {
00372 my $self = shift;
00373 my $dba = shift;
00374 my $tot = 0;
00375 for my $db_name (@_) {
00376 $tot +=
00377 $dba->dbc()->sql_helper()->execute_single_result(
00378 -SQL => $gene_xref_count_sql,
00379 -PARAMS => [ $dba->species_id(), $db_name ]
00380 );
00381 $tot +=
00382 $dba->dbc()->sql_helper()->execute_single_result(
00383 -SQL => $transcript_xref_count_sql,
00384 -PARAMS => [ $dba->species_id(), $db_name ]
00385 );
00386 $tot +=
00387 $dba->dbc()->sql_helper()->execute_single_result(
00388 -SQL => $translation_xref_count_sql,
00389 -PARAMS => [ $dba->species_id(), $db_name ]
00390 );
00391 }
00392 return $tot;
00393 }
00394
00395 =head2 count_by_interpro
00396 Arg : Bio::EnsEMBL::DBSQL::DBAdaptor
00397 Description: Returns count of genes with InterPro domains
00398 Returntype : Integer
00399 Exceptions : none
00400 Caller : internal
00401 Status : Stable
00402 =cut
00403
00404 my $count_by_interpro_base = q/ from interpro
00405 join protein_feature on (id=hit_name)
00406 join translation using (translation_id)
00407 join transcript using (transcript_id)
00408 join seq_region using (seq_region_id)
00409 join coord_system using (coord_system_id)
00410 where species_id=?
00411 /;
00412
00413 my $count_by_interpro =
00414 'select count(distinct(interpro_ac)) ' . $count_by_interpro_base;
00415
00416 sub count_by_interpro {
00417 my ( $self, $dba ) = @_;
00418 return
00419 $dba->dbc()->sql_helper()->execute_single_result(
00420 -SQL => $count_by_interpro,
00421 -PARAMS => [ $dba->species_id() ]
00422 );
00423 }
00424
00425 =head2 count_interpro
00426 Arg : Bio::EnsEMBL::DBSQL::DBAdaptor
00427 Description: Returns count of InterPro cross-references
00428 Returntype : Integer
00429 Exceptions : none
00430 Caller : internal
00431 Status : Stable
00432 =cut
00433
00434 my $count_interpro =
00435 q/select count(distinct(translation_id)) / . $count_by_interpro_base;
00436
00437 sub count_interpro {
00438 my ( $self, $dba ) = @_;
00439 return
00440 $dba->dbc()->sql_helper()->execute_single_result(
00441 -SQL => $count_interpro,
00442 -PARAMS => [ $dba->species_id() ]
00443 );
00444 }
00445
00446 =head2 count_interpro_domains
00447 Arg : Bio::EnsEMBL::DBSQL::DBAdaptor
00448 Description: Returns count of InterPro domains
00449 Returntype : Integer
00450 Exceptions : none
00451 Caller : internal
00452 Status : Stable
00453 =cut
00454
00455 my $count_interpro_domains =
00456 q/select count(distinct(protein_feature_id)) / . $count_by_interpro_base;
00457
00458 sub count_interpro_domains {
00459 my ( $self, $dba ) = @_;
00460 return
00461 $dba->dbc()->sql_helper()->execute_single_result(
00462 -SQL => $count_interpro_domains,
00463 -PARAMS => [ $dba->species_id() ]
00464 );
00465 }
00466
00467 =head2 count_toplevel
00468 Arg : Bio::EnsEMBL::DBSQL::DBAdaptor
00469 Description: Returns count of top-level regions
00470 Returntype : Integer
00471 Exceptions : none
00472 Caller : internal
00473 Status : Stable
00474 =cut
00475
00476 my $count_toplevel = q/select count(*) from seq_region
00477 join seq_region_attrib using (seq_region_id)
00478 join coord_system using (coord_system_id)
00479 join attrib_type using (attrib_type_id)
00480 where species_id=? and code='toplevel'/;
00481
00482 sub count_toplevel {
00483 my ( $self, $dba ) = @_;
00484 return
00485 $dba->dbc()->sql_helper()->execute_single_result(
00486 -SQL => $count_toplevel,
00487 -PARAMS => [ $dba->species_id() ]
00488 );
00489 }
00490
00491 =head2 count_features
00492 Arg : Bio::EnsEMBL::DBSQL::DBAdaptor
00493 Arg : String - feature type (e.g. RepeatFeature)
00494 Description: Returns count of specified feature by analysis
00495 Returntype : Hashref
00496 Exceptions : none
00497 Caller : internal
00498 Status : Stable
00499 =cut
00500
00501 my $count_features = q/select logic_name,count(*)
00502 from TABLE join analysis using (analysis_id)
00503 join seq_region using (seq_region_id)
00504 join coord_system using (coord_system_id)
00505 where species_id=? group by logic_name/;
00506
00507 sub count_features {
00508 my ( $self, $dba, $table ) = @_;
00509 my $sql = $count_features;
00510 $sql =~ s/TABLE/$table/;
00511 return $dba->dbc()->sql_helper()
00512 ->execute_into_hash( -SQL => $sql, -PARAMS => [ $dba->species_id() ] );
00513 }
00514
00515 =head2 count_variations
00516 Arg : Bio::EnsEMBL::Variation::DBSQL::DBAdaptor
00517 Description: Returns count of variations by source
00518 Returntype : Hashref
00519 Exceptions : none
00520 Caller : internal
00521 Status : Stable
00522 =cut
00523
00524 my $count_variation = q/select ifnull(s.name,"unknown"),count(*)
00525 from variation v
00526 join source s using (source_id)
00527 group by s.name/;
00528
00529 sub count_variations {
00530 my ( $self, $dba ) = @_;
00531 return $dba->dbc()->sql_helper()
00532 ->execute_into_hash( -SQL => $count_variation, -PARAMS => [] );
00533 }
00534
00535 =head2 count_structural_variations
00536 Arg : Bio::EnsEMBL::Variation::DBSQL::DBAdaptor
00537 Description: Returns count of structural variations by source
00538 Returntype : Hashref
00539 Exceptions : none
00540 Caller : internal
00541 Status : Stable
00542 =cut
00543
00544 my $count_structural_variation = q/select ifnull(s.name,"unknown"),count(*)
00545 from structural_variation v
00546 join source s using (source_id)
00547 group by s.name/;
00548
00549 sub count_structural_variations {
00550 my ( $self, $dba ) = @_;
00551 return $dba->dbc()->sql_helper()
00552 ->execute_into_hash( -SQL => $count_structural_variation, -PARAMS => [] );
00553 }
00554
00555 =head2 count_genotypes
00556 Arg : Bio::EnsEMBL::Variation::DBSQL::DBAdaptor
00557 Description: Returns count of genotypes by source
00558 Returntype : Hashref
00559 Exceptions : none
00560 Caller : internal
00561 Status : Stable
00562 =cut
00563
00564 my $count_genotypes = q/
00565 select ifnull(name,"unknown"),count(*)
00566 from tmp_individual_genotype_single_bp
00567 join individual using (individual_id) group by name/;
00568
00569 sub count_genotypes {
00570 my ( $self, $dba ) = @_;
00571 my $cnt = {};
00572 eval {
00573 $cnt =
00574 $dba->dbc()->sql_helper()
00575 ->execute_into_hash( -SQL => $count_genotypes, -PARAMS => [] );
00576 };
00577 return $cnt;
00578 }
00579
00580 =head2 count_phenotypes
00581 Arg : Bio::EnsEMBL::Variation::DBSQL::DBAdaptor
00582 Description: Returns count of phenotypes by name
00583 Returntype : Integer
00584 Exceptions : none
00585 Caller : internal
00586 Status : Stable
00587 =cut
00588
00589 my $count_phenotypes = q/
00590 select ifnull(p.name,"unknown"),count(*)
00591 from phenotype p
00592 join phenotype_feature pf using (phenotype_id)
00593 join variation v on (object_id=v.name and type='Variation')
00594 group by p.name;
00595 /;
00596
00597 sub count_phenotypes {
00598 my ( $self, $dba ) = @_;
00599 return $dba->dbc()->sql_helper()
00600 ->execute_into_hash( -SQL => $count_phenotypes, -PARAMS => [] );
00601 }
00602
00603 1;