AnnotationAnalyzer.pm
Go to the documentation of this file.
00001 
00002 =head1 LICENSE
00003 
00004 Copyright [2009-2016] EMBL-European Bioinformatics Institute
00005 
00006 Licensed under the Apache License, Version 2.0 (the "License");
00007 you may not use this file except in compliance with the License.
00008 You may obtain a copy of the License at
00009 
00010      http://www.apache.org/licenses/LICENSE-2.0
00011 
00012 Unless required by applicable law or agreed to in writing, software
00013 distributed under the License is distributed on an "AS IS" BASIS,
00014 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015 See the License for the specific language governing permissions and
00016 limitations under the License.
00017 
00018 =head1 CONTACT
00019 
00020   Please email comments or questions to the public Ensembl
00021   developers list at <dev@ensembl.org>.
00022 
00023   Questions may also be sent to the Ensembl help desk at
00024   <helpdesk@ensembl.org>.
00025 
00026 =head1 NAME
00027 
00028 Bio::EnsEMBL::MetaData::AnnotationAnalyzer
00029 
00030 =head1 SYNOPSIS
00031 
00032 my $anal = Bio::EnsEMBL::MetaData::AnnotationAnalyzer->new();
00033 my $details = $anal->analyze_annotation($dba);
00034 
00035 =head1 DESCRIPTION
00036 
00037 Utility class for counting xrefs, variations etc. associated with a genome
00038 
00039 =head1 AUTHOR
00040 
00041 Dan Staines
00042 
00043 =cut
00044 
00045 use strict;
00046 use warnings;
00047 
00048 package Bio::EnsEMBL::MetaData::AnnotationAnalyzer;
00049 use Bio::EnsEMBL::Utils::Exception qw/throw/;
00050 
00051 use Log::Log4perl qw(get_logger);
00052 use Data::Dumper;
00053 use Config::IniFiles;
00054 use LWP::UserAgent;
00055 my $ua = LWP::UserAgent->new();
00056 
00057 my $url_template =
00058 'https://raw.github.com/EnsemblGenomes/eg-web-DIVISION/master/conf/ini-files/SPECIES.ini';
00059 
00060 =head1 SUBROUTINES/METHODS
00061 
00062 =head2 new
00063   Description: Return a new instance of AnnotationAnalyzer
00064   Returntype : Bio::EnsEMBL::MetaData::AnnotationAnalyzer
00065   Exceptions : none
00066   Caller     : general
00067   Status     : Stable
00068 =cut
00069 
00070 sub new {
00071   my $caller = shift;
00072   my $class  = ref($caller) || $caller;
00073   my $self   = bless( {}, $class );
00074   $self->{logger} = get_logger();
00075   return $self;
00076 }
00077 
00078 =head2 analyze_annotation
00079   Arg        : Bio::EnsEMBL::DBSQL::DBAdaptor
00080   Description: Analyzes annotation content of the supplied core DB
00081   Returntype :  Hash ref with the following keys:
00082                     nProteinCoding - number of protein coding genes
00083                     nProteinCodingUniProtKBSwissProt - number of protein coding genes with at least one SwissPROT entry
00084                     nProteinCodingUniProtKBTrEMBL - number of protein coding genes with at least one TrEMBL entry
00085                     nProteinCodingGO - number of protein coding genes with at least one GO term
00086                     nProteinCodingInterPro - number of protein coding genes with at least one InterPro match
00087                     nUniProtKBSwissProt - number of distinct UniProtKB/TrEMBL entries matching at least one translation
00088                     nUniProtKBTrEMBL - number of distinct UniProtKB/TrEMBL entries matching at least one translation
00089                     nGO - number of distinct GO terms matching at least one translation
00090                     nInterPro - number of distinct InterPro entries matching at least one feature
00091                     nInterProDomains - number of distinct protein features matching an InterPro domains
00092   Exceptions : none
00093   Caller     : general
00094   Status     : Stable
00095 =cut
00096 
00097 sub analyze_annotation {
00098   my ( $self, $dba ) = @_;
00099   $self->{logger}->debug( "Analysing annotation for " . $dba->species() );
00100   return {
00101       nProteinCoding => $self->count_by_biotype( $dba, 'protein_coding' ),
00102       nProteinCodingGO => $self->count_by_xref( $dba, 'GO', 'protein_coding' ),
00103       nProteinCodingUniProtKB =>
00104         $self->count_by_xref( $dba, [ 'Uniprot/SWISSPROT', 'Uniprot/SPTREMBL' ],
00105                               'protein_coding' ),
00106       nProteinCodingUniProtKBSwissProt =>
00107         $self->count_by_xref( $dba, 'Uniprot/SWISSPROT', 'protein_coding' ),
00108       nProteinCodingUniProtKBTrEMBL =>
00109         $self->count_by_xref( $dba, 'Uniprot/SPTREMBL', 'protein_coding' ),
00110       nProteinCodingInterPro => $self->count_by_interpro($dba),
00111       nGO => $self->count_by_xref( $dba, 'GO', 'protein_coding' ),
00112       nUniProtKBSwissProt => $self->count_xrefs( $dba, 'Uniprot/SWISSPROT' ),
00113       nUniProtKBTrEMBL    => $self->count_xrefs( $dba, 'Uniprot/SPTREMBL' ),
00114       nInterPro           => $self->count_interpro($dba),
00115       nInterProDomains => $self->count_interpro_domains($dba) };
00116 }
00117 
00118 =head2 analyze_features
00119   Arg        : Bio::EnsEMBL::DBSQL::DBAdaptor
00120   Description: Analyzes features found in the supplied core/otherfeatures database (simple_feature,repeat_feature,protein_align_feature,dna_align_features)
00121   Returntype :  Hash ref (keys are feature tables, values are hashrefs of count by analysis name)
00122   Exceptions : none
00123   Caller     : general
00124   Status     : Stable
00125 =cut
00126 
00127 sub analyze_features {
00128   my ( $self, $dba ) = @_;
00129   return { simpleFeatures => $self->count_features( $dba, 'simple_feature' ),
00130            repeatFeatures => $self->count_features( $dba, 'repeat_feature' ) };
00131 }
00132 
00133 =head2 analyze_variation
00134   Arg        : Bio::EnsEMBL::Variation::DBSQL::DBAdaptor
00135   Description: Analyzes variation found in the supplied variation database
00136   Returntype :  Hash ref with the following keys
00137                     variations - count of variations per source
00138                     structuralVariations - count of structural variations per source
00139                     genotypes - count of genotypes per sample
00140                     phenotypes - count of variation annotations per phenotype
00141   Exceptions : none
00142   Caller     : general
00143   Status     : Stable
00144 =cut
00145 
00146 sub analyze_variation {
00147   my ( $self, $dba ) = @_;
00148   return { variations           => $self->count_variations($dba),
00149            structuralVariations => $self->count_structural_variations($dba),
00150            genotypes            => $self->count_genotypes($dba),
00151            phenotypes           => $self->count_phenotypes($dba) };
00152 }
00153 
00154 =head2 analyze_compara
00155   Arg        : Bio::EnsEMBL::DBSQL::DBAdaptor
00156   Arg        : Bio::EnsEMBL::Compara::DBSQL::DBAdaptor
00157   Description: Analyzes compara methods found in the supplied compara database involving the supplied core species
00158   Returntype :  Hash ref (keys are method names, values are arrays of partner species)
00159   Exceptions : none
00160   Caller     : general
00161   Status     : Stable
00162 =cut
00163 
00164 sub analyze_compara {
00165   my ( $self, $dba, $core ) = @_;
00166   my $compara = {};
00167   eval {
00168     my $gdb =
00169       $dba->get_GenomeDBAdaptor()->fetch_by_registry_name( $core->species() );
00170     for my $mlss (
00171        @{ $dba->get_MethodLinkSpeciesSetAdaptor()->fetch_all_by_GenomeDB($gdb) }
00172       )
00173     {
00174       my $t = $mlss->method()->type();
00175       next
00176         if ( $t eq 'FAMILY' ||
00177              $t eq 'ENSEMBL_ORTHOLOGUES' ||
00178              $t eq 'ENSEMBL_PARALOGUES' );
00179       for my $gdb2 ( grep { $gdb->dbID() ne $_->dbID() }
00180                      @{ $mlss->species_set_obj->genome_dbs() } )
00181       {
00182         push( @{ $compara->{$t} }, $gdb2->name() );
00183       }
00184     }
00185   };
00186   if ($@) {
00187     warn "No compara entry found for " . $dba->species();
00188   }
00189   return $compara;
00190 } ## end sub analyze_compara
00191 
00192 =head2 analyze_alignments
00193   Arg        : Bio::EnsEMBL::DBSQL::DBAdaptor
00194   Description: Analyzes alignments for sequences to the assembly 
00195   Returntype : Hash ref - keys are source_type (e.g. RNA seq), values are arrayrefs of hashrefs containing track details (source_name,description,source_url)
00196   Exceptions : none
00197   Caller     : general
00198   Status     : Stable
00199 =cut
00200 
00201 sub analyze_alignments {
00202   my ( $self, $dba ) = @_;
00203   my $ali = {};
00204   my $pf = $self->count_features( $dba, 'protein_align_feature' );
00205   if ( scalar( keys %$pf ) > 0 ) {
00206     $ali->{proteinAlignFeatures} = $pf;
00207   }
00208   my $df = $self->count_features( $dba, 'dna_align_feature' );
00209   if ( scalar( keys %$df ) > 0 ) {
00210     $ali->{dnaAlignFeatures} = $df;
00211   }
00212   return $ali;
00213 }
00214 
00215 =head2 analyze_tracks
00216   Arg        : Bio::EnsEMBL::DBSQL::DBAdaptor
00217   Description: Analyzes tracks attached to the supplied core database
00218   Returntype : Hash ref - keys are source_type (e.g. RNA seq), values are arrayrefs of hashrefs containing track details (source_name,description,source_url)
00219   Exceptions : none
00220   Caller     : general
00221   Status     : Stable   
00222 =cut
00223 
00224 sub analyze_tracks {
00225   my ( $self, $species, $division ) = @_;
00226   $species = ucfirst($species);
00227   ( $division = lc $division ) =~ s/ensembl//;
00228   # get the ini file from git
00229   ( my $ini_url = $url_template ) =~ s/SPECIES/$species/;
00230   $ini_url =~ s/DIVISION/$division/;
00231 
00232   my $req = HTTP::Request->new( GET => $ini_url );
00233   # Pass request to the user agent and get a response back
00234   my $res = $ua->request($req);
00235   my $ini;
00236   # Check the outcome of the response
00237   if ( $res->is_success ) {
00238     $ini = $res->content;
00239   }
00240   else {
00241     $self->{logger}
00242       ->debug( "Could not retrieve $ini_url: " . $res->status_line );
00243   }
00244 
00245 # parse out and look at:
00246 # [ENSEMBL_INTERNAL_BAM_SOURCES]
00247 # 1_Puccinia_triticina_SRR035315 = dna_align_est
00248 # walk through keys and use keys to find sections like:
00249 # [1_Puccinia_triticina_SRR035315]
00250 # source_name        = Transcriptomics sequences of fresh spores. Run id: SRR035315
00251 # description        = RNA-Seq study.
00252 # source_url         = http://ftp.sra.ebi.ac.uk/vol1/ERZ000/ERZ000002/SRR035315.bam
00253 # source_type        = rnaseq
00254 # display            = normal
00255 # then store some or all of this in my output e.g. {bam}{source_type}[{source_name,description,source_url}]
00256   my $bams = {};
00257   if ( defined $ini ) {
00258     my $cfg = Config::IniFiles->new( -file => \$ini );
00259     if ( defined $cfg ) {
00260       for my $bam ( $cfg->Parameters("ENSEMBL_INTERNAL_BAM_SOURCES") ) {
00261         push @{ $bams->{ $cfg->val( $bam, 'source_type' ) } }, {
00262             id          => $bam,
00263             source_name => $cfg->val( $bam, 'source_name' ),
00264             source_url  => $cfg->val( $bam, 'source_url' ),
00265             description => $cfg->val( $bam, 'description' ) };
00266       }
00267     }
00268   }
00269   return $bams;
00270 } ## end sub analyze_tracks
00271 
00272 =head2 count_by_biotype
00273   Arg        : Bio::EnsEMBL::DBSQL::DBAdaptor
00274   Arg        : String (biotype)
00275   Description: Returns count of genes with specified biotype
00276   Returntype : Integer
00277   Exceptions : none
00278   Caller     : internal
00279   Status     : Stable
00280 =cut
00281 
00282 sub count_by_biotype {
00283   my ( $self, $dba, $biotype ) = @_;
00284   return $dba->get_GeneAdaptor()->count_all_by_biotype($biotype);
00285 }
00286 
00287 =head2 count_by_xref
00288   Arg        : Bio::EnsEMBL::DBSQL::DBAdaptor
00289   Arg        : Arrayref of dbnames 
00290   Arg        : (optional) String (biotype)
00291   Description: Returns count of genes with specified cross-reference
00292   Returntype : Integer
00293   Exceptions : none
00294   Caller     : internal
00295   Status     : Stable
00296 =cut
00297 
00298 my $xref_gene_sql = q/select count(distinct(gene_id)) 
00299 from gene g
00300 join seq_region s using (seq_region_id) 
00301 join coord_system c using (coord_system_id) 
00302 join transcript tr using (gene_id)
00303 join translation t using (transcript_id)
00304 join object_xref ox on (ox.ensembl_id=t.translation_id and ox.ensembl_object_type='Translation')
00305 join xref x using (xref_id)
00306 join external_db d using (external_db_id)
00307 where species_id=? and d.db_name in (NAMES)/;
00308 
00309 my $biotype_clause = q/ and g.biotype=?/;
00310 
00311 sub count_by_xref {
00312   my ( $self, $dba, $db_names, $biotype ) = @_;
00313   $self->{logger}->debug( "Counting genes by " .
00314                       join( ",", $db_names ) . " xref for " . $dba->species() );
00315   $db_names = [$db_names] if ( ref($db_names) ne 'ARRAY' );
00316   my $sql = $xref_gene_sql;
00317   my $db_name = join ',', map { "\"$_\"" } @$db_names;
00318   $sql =~ s/NAMES/$db_name/;
00319   my $params = [ $dba->species_id() ];
00320   if ( defined $biotype ) {
00321     $sql .= $biotype_clause;
00322     push @$params, $biotype;
00323   }
00324   $self->{logger}
00325     ->debug( "Executing $sql with params: [" . join( ",", @$params ) );
00326   return $dba->dbc()->sql_helper()
00327     ->execute_single_result( -SQL => $sql, -PARAMS => $params );
00328 }
00329 
00330 =head2 count_xrefs
00331   Arg        : Bio::EnsEMBL::DBSQL::DBAdaptor
00332   Arg        : One or more dbnames
00333   Description: Returns count of numbers of cross-references to supplied database(s)
00334   Returntype : Integer
00335   Exceptions : none
00336   Caller     : internal
00337   Status     : Stable
00338 =cut
00339 
00340 my $gene_xref_count_sql = q/
00341 select count(distinct(dbprimary_acc)) 
00342 from gene g
00343 join seq_region s using (seq_region_id) 
00344 join coord_system c using (coord_system_id) 
00345 join object_xref ox on (ox.ensembl_id=g.gene_id and ox.ensembl_object_type='Gene')
00346 join xref x using (xref_id)
00347 join external_db d using (external_db_id)
00348 where species_id=? and d.db_name=?/;
00349 
00350 my $transcript_xref_count_sql = q/
00351 select count(distinct(dbprimary_acc)) 
00352 from transcript tr
00353 join seq_region s using (seq_region_id) 
00354 join coord_system c using (coord_system_id) 
00355 join object_xref ox on (ox.ensembl_id=tr.transcript_id and ox.ensembl_object_type='Transcript')
00356 join xref x using (xref_id)
00357 join external_db d using (external_db_id)
00358 where species_id=? and d.db_name=?/;
00359 
00360 my $translation_xref_count_sql = q/
00361 select count(distinct(dbprimary_acc)) 
00362 from transcript tr
00363 join translation t using (transcript_id)
00364 join seq_region s using (seq_region_id) 
00365 join coord_system c using (coord_system_id) 
00366 join object_xref ox on (ox.ensembl_id=t.translation_id and ox.ensembl_object_type='Translation')
00367 join xref x using (xref_id)
00368 join external_db d using (external_db_id)
00369 where species_id=? and d.db_name=?/;
00370 
00371 sub count_xrefs {
00372   my $self = shift;
00373   my $dba  = shift;
00374   my $tot  = 0;
00375   for my $db_name (@_) {
00376     $tot +=
00377       $dba->dbc()->sql_helper()->execute_single_result(
00378                                      -SQL    => $gene_xref_count_sql,
00379                                      -PARAMS => [ $dba->species_id(), $db_name ]
00380       );
00381     $tot +=
00382       $dba->dbc()->sql_helper()->execute_single_result(
00383                                      -SQL    => $transcript_xref_count_sql,
00384                                      -PARAMS => [ $dba->species_id(), $db_name ]
00385       );
00386     $tot +=
00387       $dba->dbc()->sql_helper()->execute_single_result(
00388                                      -SQL    => $translation_xref_count_sql,
00389                                      -PARAMS => [ $dba->species_id(), $db_name ]
00390       );
00391   }
00392   return $tot;
00393 }
00394 
00395 =head2 count_by_interpro
00396   Arg        : Bio::EnsEMBL::DBSQL::DBAdaptor
00397   Description: Returns count of genes with InterPro domains
00398   Returntype : Integer
00399   Exceptions : none
00400   Caller     : internal
00401   Status     : Stable
00402 =cut
00403 
00404 my $count_by_interpro_base = q/ from interpro
00405 join protein_feature on (id=hit_name)
00406 join translation using (translation_id)
00407 join transcript using (transcript_id)
00408 join seq_region using (seq_region_id)
00409 join coord_system using (coord_system_id)
00410 where species_id=?
00411 /;
00412 
00413 my $count_by_interpro =
00414   'select count(distinct(interpro_ac)) ' . $count_by_interpro_base;
00415 
00416 sub count_by_interpro {
00417   my ( $self, $dba ) = @_;
00418   return
00419     $dba->dbc()->sql_helper()->execute_single_result(
00420                                                -SQL    => $count_by_interpro,
00421                                                -PARAMS => [ $dba->species_id() ]
00422     );
00423 }
00424 
00425 =head2 count_interpro
00426   Arg        : Bio::EnsEMBL::DBSQL::DBAdaptor
00427   Description: Returns count of InterPro cross-references
00428   Returntype : Integer
00429   Exceptions : none
00430   Caller     : internal
00431   Status     : Stable
00432 =cut
00433 
00434 my $count_interpro =
00435   q/select count(distinct(translation_id)) / . $count_by_interpro_base;
00436 
00437 sub count_interpro {
00438   my ( $self, $dba ) = @_;
00439   return
00440     $dba->dbc()->sql_helper()->execute_single_result(
00441                                                -SQL    => $count_interpro,
00442                                                -PARAMS => [ $dba->species_id() ]
00443     );
00444 }
00445 
00446 =head2 count_interpro_domains
00447   Arg        : Bio::EnsEMBL::DBSQL::DBAdaptor
00448   Description: Returns count of InterPro domains
00449   Returntype : Integer
00450   Exceptions : none
00451   Caller     : internal
00452   Status     : Stable
00453 =cut
00454 
00455 my $count_interpro_domains =
00456   q/select count(distinct(protein_feature_id)) / . $count_by_interpro_base;
00457 
00458 sub count_interpro_domains {
00459   my ( $self, $dba ) = @_;
00460   return
00461     $dba->dbc()->sql_helper()->execute_single_result(
00462                                                -SQL => $count_interpro_domains,
00463                                                -PARAMS => [ $dba->species_id() ]
00464     );
00465 }
00466 
00467 =head2 count_toplevel
00468   Arg        : Bio::EnsEMBL::DBSQL::DBAdaptor
00469   Description: Returns count of top-level regions
00470   Returntype : Integer
00471   Exceptions : none
00472   Caller     : internal
00473   Status     : Stable
00474 =cut
00475 
00476 my $count_toplevel = q/select count(*) from seq_region 
00477 join seq_region_attrib using (seq_region_id) 
00478 join coord_system using (coord_system_id) 
00479 join attrib_type using (attrib_type_id)
00480 where species_id=? and code='toplevel'/;
00481 
00482 sub count_toplevel {
00483   my ( $self, $dba ) = @_;
00484   return
00485     $dba->dbc()->sql_helper()->execute_single_result(
00486                                                -SQL    => $count_toplevel,
00487                                                -PARAMS => [ $dba->species_id() ]
00488     );
00489 }
00490 
00491 =head2 count_features
00492   Arg        : Bio::EnsEMBL::DBSQL::DBAdaptor
00493   Arg        : String - feature type (e.g. RepeatFeature)
00494   Description: Returns count of specified feature by analysis
00495   Returntype : Hashref
00496   Exceptions : none
00497   Caller     : internal
00498   Status     : Stable
00499 =cut
00500 
00501 my $count_features = q/select logic_name,count(*) 
00502 from TABLE join analysis using (analysis_id) 
00503 join seq_region using (seq_region_id) 
00504 join coord_system using (coord_system_id) 
00505 where species_id=? group by logic_name/;
00506 
00507 sub count_features {
00508   my ( $self, $dba, $table ) = @_;
00509   my $sql = $count_features;
00510   $sql =~ s/TABLE/$table/;
00511   return $dba->dbc()->sql_helper()
00512     ->execute_into_hash( -SQL => $sql, -PARAMS => [ $dba->species_id() ] );
00513 }
00514 
00515 =head2 count_variations
00516   Arg        : Bio::EnsEMBL::Variation::DBSQL::DBAdaptor
00517   Description: Returns count of variations by source
00518   Returntype : Hashref
00519   Exceptions : none
00520   Caller     : internal
00521   Status     : Stable
00522 =cut
00523 
00524 my $count_variation = q/select ifnull(s.name,"unknown"),count(*) 
00525 from variation v 
00526 join source s using (source_id) 
00527 group by s.name/;
00528 
00529 sub count_variations {
00530   my ( $self, $dba ) = @_;
00531   return $dba->dbc()->sql_helper()
00532     ->execute_into_hash( -SQL => $count_variation, -PARAMS => [] );
00533 }
00534 
00535 =head2 count_structural_variations
00536   Arg        : Bio::EnsEMBL::Variation::DBSQL::DBAdaptor
00537   Description: Returns count of structural variations by source
00538   Returntype : Hashref
00539   Exceptions : none
00540   Caller     : internal
00541   Status     : Stable
00542 =cut
00543 
00544 my $count_structural_variation = q/select ifnull(s.name,"unknown"),count(*) 
00545 from structural_variation v 
00546 join source s using (source_id) 
00547 group by s.name/;
00548 
00549 sub count_structural_variations {
00550   my ( $self, $dba ) = @_;
00551   return $dba->dbc()->sql_helper()
00552     ->execute_into_hash( -SQL => $count_structural_variation, -PARAMS => [] );
00553 }
00554 
00555 =head2 count_genotypes
00556   Arg        : Bio::EnsEMBL::Variation::DBSQL::DBAdaptor
00557   Description: Returns count of genotypes by source
00558   Returntype : Hashref
00559   Exceptions : none
00560   Caller     : internal
00561   Status     : Stable
00562 =cut
00563 
00564 my $count_genotypes = q/
00565 select ifnull(name,"unknown"),count(*) 
00566 from tmp_individual_genotype_single_bp 
00567 join individual using (individual_id) group by name/;
00568 
00569 sub count_genotypes {
00570   my ( $self, $dba ) = @_;
00571   my $cnt = {};
00572   eval {
00573     $cnt =
00574       $dba->dbc()->sql_helper()
00575       ->execute_into_hash( -SQL => $count_genotypes, -PARAMS => [] );
00576   };
00577   return $cnt;
00578 }
00579 
00580 =head2 count_phenotypes
00581   Arg        : Bio::EnsEMBL::Variation::DBSQL::DBAdaptor
00582   Description: Returns count of phenotypes by name
00583   Returntype : Integer
00584   Exceptions : none
00585   Caller     : internal
00586   Status     : Stable
00587 =cut
00588 
00589 my $count_phenotypes = q/
00590 select ifnull(p.name,"unknown"),count(*) 
00591 from phenotype p 
00592 join phenotype_feature pf using (phenotype_id) 
00593 join variation v on (object_id=v.name and type='Variation') 
00594 group by p.name;
00595 /;
00596 
00597 sub count_phenotypes {
00598   my ( $self, $dba ) = @_;
00599   return $dba->dbc()->sql_helper()
00600     ->execute_into_hash( -SQL => $count_phenotypes, -PARAMS => [] );
00601 }
00602 
00603 1;