LocalLookUp.pm
Go to the documentation of this file.
00001 
00002 =head1 LICENSE
00003 
00004 Copyright [2009-2016] EMBL-European Bioinformatics Institute
00005 
00006 Licensed under the Apache License, Version 2.0 (the "License");
00007 you may not use this file except in compliance with the License.
00008 You may obtain a copy of the License at
00009 
00010      http://www.apache.org/licenses/LICENSE-2.0
00011 
00012 Unless required by applicable law or agreed to in writing, software
00013 distributed under the License is distributed on an "AS IS" BASIS,
00014 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015 See the License for the specific language governing permissions and
00016 limitations under the License.
00017 
00018 =head1 CONTACT
00019 
00020   Please email comments or questions to the public Ensembl
00021   developers list at <dev@ensembl.org>.
00022 
00023   Questions may also be sent to the Ensembl help desk at
00024   <helpdesk@ensembl.org>.
00025 
00026 =head1 NAME
00027 
00028 Bio::EnsEMBL::LookUp::LocalLookUp
00029 
00030 =head1 SYNOPSIS
00031 
00032 # creation from a database server
00033 Bio::EnsEMBL::LookUp::LocalLookUp->register_all_dbs( $conf->{host},
00034        $conf->{port}, $conf->{user}, $conf->{pass}, $conf->{db});
00035 my $lookup = Bio::EnsEMBL::LookUp::LocalLookUp->new();
00036 my $dbas = $lookup->registry()->get_all();
00037 $dbas = $lookup->get_all_by_taxon_id(388919);
00038 $dbas = $lookup->get_by_name_pattern("Escherichia.*");
00039 
00040 # creation from a URL
00041 my $lookup = Bio::EnsEMBL::LookUp::::LocalLookUp->new(-URL=>'http://bacteria.ensembl.org/registry.json');
00042 
00043 =head1 DESCRIPTION
00044 
00045 This module is an implementation of Bio::EnsEMBL::LookUp that uses a local hash to store information about available Ensembl Genomes data. 
00046 It can be constructed in a number of different ways:
00047 
00048 A remote cache file containing a JSON representation of the hash can be supplied:
00049 
00050     my $lookup = Bio::EnsEMBL::LookUp::LocalLookUp->new(-URL=>'http://bacteria.ensembl.org/registry.json');
00051 
00052 Alternatively, a local file containing the required JSON can be specified instead:
00053 
00054     my $lookup = Bio::EnsEMBL::LookUp::LocalLookUp->new(-FILE=>"/path/to/reg.json");
00055 
00056 Finally, a Registry already loaded with core databases can be supplied:
00057 
00058     my $lookup = Bio::EnsEMBL::LookUp::LocalLookUp->new(-REGISTRY=>'Bio::EnsEMBL::Registry');
00059 
00060 If the standard Registry is used, the Arg can be omitted completely:
00061 
00062     my $lookup = Bio::EnsEMBL::LookUp::LocalLookUp->new();
00063 
00064 To populate the registry with just the Ensembl Genomes databases for the current software 
00065 release on a specified server, the following method can be used:
00066 
00067     Bio::EnsEMBL::LookUp::LocalLookUp->register_all_dbs( $host,
00068        $port, $user, $pass);
00069 
00070 Once a lookup has been created, it can be used as specified in Bio::EnsEMBL::LookUp
00071 
00072 =head1 SEE ALSO
00073 
00074 Bio::EnsEMBL::LookUp
00075 
00076 =head1 AUTHOR
00077 
00078 Dan Staines
00079 
00080 =cut
00081 
00082 package Bio::EnsEMBL::LookUp::LocalLookUp;
00083 
00084 use warnings;
00085 use strict;
00086 use Bio::EnsEMBL::ApiVersion;
00087 use Bio::EnsEMBL::Utils::ConfigRegistry;
00088 use Bio::EnsEMBL::DBSQL::DBAdaptor;
00089 use Bio::EnsEMBL::Utils::Argument qw(rearrange);
00090 use Bio::EnsEMBL::Utils::Scalar qw(assert_ref check_ref);
00091 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
00092 use Bio::EnsEMBL::Taxonomy::DBSQL::TaxonomyNodeAdaptor;
00093 use List::MoreUtils qw(uniq);
00094 use Scalar::Util qw(looks_like_number);
00095 use DBI;
00096 use JSON;
00097 use LWP::Simple;
00098 use Carp;
00099 
00100 my $default_cache_file = qw/lookup_cache.json/;
00101 
00102 =head1 SUBROUTINES/METHODS
00103 
00104 =head2 new
00105 
00106   Description       : Creates a new instance of this object. 
00107   Arg [-REGISTRY]   : (optional) Registry module to use (default is Bio::EnsEMBL::Registry)
00108   Arg [-NO_CACHE]   : (optional) int 1
00109                This option will turn off the use of a local cache file for storing species details
00110   Arg [-CACHE_FILE] : (optional) String (default lookup_cache.json)
00111                 This option allows use of a user specified local cache file
00112   Arg [-URL]        : (optional) 
00113               This option allows the use of a remote resource supplying species details in JSON format. This is not used if a local cache exists and -NO_CACHE is not set
00114   Arg [-FILE]       : (optional) 
00115               This option allows the use of a file supplying species details in JSON format. This is not used if a local cache exists and -NO_CACHE is not set
00116   Returntype        : Instance of lookup
00117   Status            : Stable
00118 
00119   Example           : 
00120   my $lookup = Bio::EnsEMBL::LookUp::LocalLookUp->new(
00121                                         -REGISTRY => $reg);
00122 =cut
00123 
00124 sub new {
00125   my ( $class, @args ) = @_;
00126   my $self = bless( {}, ref($class) || $class );
00127   my ( $reg, $url, $file, $nocache, $cache_file, $clear_cache, $skip_contigs ) =
00128     rearrange( [qw(registry url file no_cache cache_file clear_cache skip_contigs)],
00129                @args );
00130   if ( !defined $reg ) { $reg = 'Bio::EnsEMBL::Registry' }
00131   $self->{registry}      = $reg;
00132   $self->{dbas_by_taxid} = {};
00133   $self->{dbas_by_name}  = {};
00134   $self->{dbas_by_acc}   = {};
00135   $self->{dbas_by_vacc}  = {};
00136   $self->{dbas_by_gc}    = {};
00137   $self->{dbas_by_vgc}   = {};
00138   $self->{dbas}          = {};
00139   $self->{dbc_meta}      = {};
00140   $self->{skip_contigs}  = $skip_contigs;
00141   $cache_file ||= $default_cache_file;
00142   $self->cache_file($cache_file);
00143 
00144   if ( defined $clear_cache && $clear_cache == 1 ) {
00145     $self->clear_cache();
00146   }
00147   if ( !$nocache && !defined $file && -e $cache_file ) {
00148     $file = $cache_file;
00149   }
00150   if ( defined $file ) {
00151     $self->load_registry_from_file($file);
00152   }
00153   elsif ( defined $url ) {
00154     $self->load_registry_from_url($url);
00155   }
00156   else {
00157     $self->update_from_registry();
00158   }
00159   if ( !$nocache && !-e $cache_file ) {
00160     $self->write_registry_to_file($cache_file);
00161   }
00162   return $self;
00163 } ## end sub new
00164 
00165 =head2 registry
00166 
00167   Arg [1]     : Registry module to use (default is Bio::EnsEMBL::Registry)
00168   Description : Sets and retrieves the Registry 
00169   Returntype  : Registry if set; otherwise undef
00170   Exceptions  : if an attempt is made to set the value more than once
00171   Status      : Stable
00172 =cut
00173 
00174 sub registry {
00175   my ( $self, $registry ) = @_;
00176   if ( defined $registry ) {
00177     if ( exists $self->{registry} ) {
00178       throw('Cannot reset the Registry object; already defined ');
00179     }
00180     $self->{registry} = $registry;
00181     $self->update_from_registry();
00182   }
00183   return $self->{registry};
00184 }
00185 
00186 =head2 cache_file
00187   Arg [1]     : File to use for local caching
00188   Description : Sets and retrieves the local cache file 
00189   Returntype  : File name if set; otherwise undef
00190   Status      : Stable
00191 =cut
00192 
00193 sub cache_file {
00194   my ( $self, $cache_file ) = @_;
00195   if ( defined $cache_file ) {
00196     $self->{cache_file} = $cache_file;
00197   }
00198   return $self->{cache_file};
00199 }
00200 
00201 =head2  update_from_registry
00202     Description : Update internal hashes of database adaptors by name/taxids from the registry. Invoke when registry has been updated independently.
00203     Arg     : (optional) If set to 1, do not update from a DBAdaptor if already processed (allows updates of an existing LookUp instance)
00204     Exceptions  : None
00205 =cut
00206 
00207 sub update_from_registry {
00208   my ( $self, $update_only ) = @_;
00209   $self->_intern_db_connections();
00210   for my $dba (
00211         @{ $self->registry()->get_all_DBAdaptors( -group => 'core' ) } )
00212   {
00213     $self->_hash_dba( $dba, $update_only );
00214         $dba->dbc()->disconnect_if_idle();
00215   }
00216   return;
00217 }
00218 
00219 =head2 taxonomy_adaptor
00220     Description : Get/set the taxonomy adaptor
00221     Arg : (optional) Bio::EnsEMBL::Taxonomy::DBSQL::DBAdaptor
00222     Caller    : Internal
00223     Status    : Stable
00224     Return      : Bio::EnsEMBL::Taxonomy::DBSQL::DBAdaptor
00225 =cut
00226 sub taxonomy_adaptor {
00227   my ( $self, $adaptor ) = @_;
00228   if ( defined $adaptor ) {
00229     $self->{taxonomy_adaptor} = $adaptor;
00230   }
00231   elsif ( !defined $self->{taxonomy_adaptor} ) {
00232     $self->{taxonomy_adaptor} =
00233       Bio::EnsEMBL::Taxonomy::DBSQL::TaxonomyNodeAdaptor->new_public();
00234   }
00235   return $self->{taxonomy_adaptor};
00236 }
00237 
00238 =head2 _register_dba
00239     Description : Add a single DBAdaptor to the registry and the internal hashes of details
00240     Arg : Bio::EnsEMBL::DBSQL::DBAdaptor
00241     Return      : None
00242 =cut
00243 
00244 sub _register_dba {
00245   my ( $self, $species, $dba ) = @_;
00246   $self->registry()->add_DBAdaptor( $species, "core", $dba );
00247   $self->_hash_dba($dba);
00248   return;
00249 }
00250 
00251 sub _dba_id {
00252   my ( $self, $dba ) = @_;
00253   return $dba->dbc()->dbname() . '/' . $dba->species_id();
00254 }
00255 
00256 sub _get_dbc_meta {
00257   my ( $self, $dbc ) = @_;
00258   my $name = $dbc->dbname();
00259   if ( !defined $self->{dbc_meta}{$name} ) {
00260     # load metadata in batch for the whole database
00261     $self->{dbc_meta}{$name} = {};
00262     # taxid
00263 
00264     $dbc->sql_helper()->execute(
00265       -SQL =>
00266 q/select species_id,meta_value from meta where meta_key='species.taxonomy_id'/,
00267       -PARAMS   => [],
00268       -CALLBACK => sub {
00269         my $row = shift @_;
00270         $self->{dbc_meta}{$name}{ $row->[0] }{taxid} = $row->[1];
00271       } );
00272     # aliases
00273     $dbc->sql_helper()->execute(
00274       -SQL =>
00275 'select species_id,meta_value from meta where meta_key=\'species.alias\'',
00276       -PARAMS   => [],
00277       -CALLBACK => sub {
00278         my @row = @{ shift @_ };
00279         push @{ $self->{dbc_meta}{$name}{ $row[0] }{aliases} }, $row[1];
00280       } );
00281     # assembly.accession
00282     $dbc->sql_helper()->execute(
00283       -SQL =>
00284 'select species_id,meta_value from meta where meta_key=\'assembly.accession\'',
00285       -PARAMS   => [],
00286       -CALLBACK => sub {
00287         my @row = @{ shift @_ };
00288         $self->{dbc_meta}{$name}{ $row[0] }{assembly_accession} =
00289           $row[1];
00290       } );
00291 
00292         if(!defined $self->{skip_contigs}) {
00293     # seq_region_names and synonyms
00294     $dbc->sql_helper()->execute(
00295       -SQL =>
00296         q/select cs.species_id,sr.name,ss.synonym from coord_system cs 
00297       join seq_region sr using (coord_system_id) 
00298       left join seq_region_synonym ss using (seq_region_id)/,
00299       -PARAMS   => [],
00300       -CALLBACK => sub {
00301         my @row = @{ shift @_ };
00302         push @{ $self->{dbc_meta}{$name}{ $row[0] }{accessions} },
00303           $row[1];
00304         if ( defined $row[2] ) {
00305           push @{ $self->{dbc_meta}{$name}{ $row[0] }{accessions} },
00306             $row[2];
00307         }
00308       } );
00309         }
00310   } ## end if ( !defined $self->{...})
00311   return $self->{dbc_meta}{$name};
00312 } ## end sub _get_dbc_meta
00313 
00314 =head2 _hash_dba
00315     Description : Add details from a DBAdaptor to the internal hashes of details
00316     Arg : Bio::EnsEMBL::DBAdaptor
00317     Arg     : (optional) If set to 1, do not update from a DBAdaptor if already processed (allows updates of an existing LookUp instance)
00318 =cut
00319 
00320 sub _hash_dba {
00321   my ( $self, $dba, $update_only ) = @_;
00322   my $nom = $self->_dba_id($dba);
00323   if ( !defined $update_only || !defined $self->{dbas}{$nom} ) {
00324     my $dbc_meta = $self->_get_dbc_meta( $dba->dbc() );
00325     my $dba_meta = $dbc_meta->{ $dba->species_id() };
00326     $self->_hash_dba_from_values( $dba,
00327                                   [ $dba_meta->{taxid} ],
00328                                   $dba_meta->{aliases},
00329                                   $dba_meta->{accessions},
00330                                   $dba_meta->{assembly_accession} );
00331   }
00332   return;
00333 }
00334 
00335 =head2 _hash_dba_from_values
00336     Description : Add supplied details to the internal hashes of details
00337     Arg : Bio::EnsEMBL::DBAdaptor to hash
00338     Arg : Arrayref of taxonomy IDs to use as keys
00339     Arg : Arrayref of aliases to use as keys
00340     Arg : Arrayref of ENA accessions to use as keys
00341 =cut
00342 
00343 sub _hash_dba_from_values {
00344   my ( $self, $dba, $taxids, $aliases, $accessions, $vgc ) = @_;
00345 
00346   for my $taxid ( @{$taxids} ) {
00347     if ( defined $taxid ) {
00348       push @{ $self->{dbas_by_taxid}{$taxid} }, $dba;
00349     }
00350   }
00351   for my $name ( uniq( @{$aliases} ) ) {
00352     push @{ $self->{dbas_by_name}{$name} }, $dba;
00353   }
00354   for my $acc ( uniq( @{$accessions} ) ) {
00355     push @{ $self->{dbas_by_vacc}{$acc} }, $dba;
00356     $acc =~ s/\.[0-9]+$//x;
00357     push @{ $self->{dbas_by_acc}{$acc} }, $dba;
00358   }
00359   if ( defined $vgc ) {
00360     if ( defined $self->{dbas_by_vgc}{$vgc} ) {
00361       croak "DBA with GC accession $vgc already registered";
00362     }
00363     $self->{dbas_by_vgc}{$vgc} = $dba;
00364     $vgc =~ s/\.[0-9]+$//x;
00365     if ( defined $self->{dbas_by_gc}{$vgc} ) {
00366       croak "DBA with GC set chain $vgc already registered";
00367     }
00368     $self->{dbas_by_gc}{$vgc} = $dba;
00369   }
00370   $self->{dbas}{ $self->_dba_id($dba) } = $dba;
00371   return;
00372 } ## end sub _hash_dba_from_values
00373 
00374 sub _invert_dba_hash {
00375   my ($hash) = @_;
00376   my $inv_hash;
00377   while ( my ( $item, $dbas ) = each( %{$hash} ) ) {
00378     if ( ref $dbas ne 'ARRAY' ) {
00379       $dbas = [$dbas];
00380     }
00381     for my $dba ( @{$dbas} ) {
00382       my $dba_loc = _dba_to_locator($dba);
00383       push @{ $inv_hash->{$dba_loc} }, $item;
00384     }
00385   }
00386   return $inv_hash;
00387 }
00388 
00389 =head2 _registry_to_hash
00390     Description : Generate a hash array structure for the current registry and species details for turning to JSON
00391     Returns     : Arrayref of hashes
00392 =cut
00393 
00394 sub _registry_to_hash {
00395   my ($self) = @_;
00396   # hash dbcs and dbas by locators
00397   my $dbc_hash;
00398   my $dba_hash;
00399   for my $dba ( values %{ $self->{dbas} } ) {
00400     my $dbc_loc = _dbc_to_locator( $dba->dbc() );
00401     $dbc_hash->{$dbc_loc} = $dba->dbc();
00402     push @{ $dba_hash->{$dbc_loc} }, $dba;
00403   }
00404   # create auxillary hashes
00405   my $acc_hash   = _invert_dba_hash( $self->{dbas_by_vacc} );
00406   my $name_hash  = _invert_dba_hash( $self->{dbas_by_name} );
00407   my $taxid_hash = _invert_dba_hash( $self->{dbas_by_taxid} );
00408   my $gc_hash    = _invert_dba_hash( $self->{dbas_by_vgc} );
00409   # create array of hashes
00410   my $out_arr;
00411   while ( my ( $dbc_loc, $dbconn ) = each( %{$dbc_hash} ) ) {
00412     my $dbc_h = { driver   => $dbconn->driver(),
00413                   host     => $dbconn->host(),
00414                   port     => $dbconn->port(),
00415                   username => $dbconn->username(),
00416                   password => $dbconn->password(),
00417                   dbname   => $dbconn->dbname(),
00418                   species  => [] };
00419     for my $dba ( @{ $dba_hash->{$dbc_loc} } ) {
00420       my $dba_loc = _dba_to_locator($dba);
00421       my $gc      = $gc_hash->{$dba_loc};
00422       if ( ref $gc eq 'ARRAY' ) {
00423         $gc = $gc->[0];
00424       }
00425       push @{ $dbc_h->{species} },
00426         { species_id => $dba->species_id(),
00427           species    => $dba->species(),
00428           taxids     => $taxid_hash->{$dba_loc},
00429           aliases    => $name_hash->{$dba_loc},
00430           accessions => $acc_hash->{$dba_loc},
00431           gc         => $gc };
00432     }
00433     push @{$out_arr}, $dbc_h;
00434   }
00435   return $out_arr;
00436 } ## end sub _registry_to_hash
00437 
00438 sub _registry_to_json {
00439   my ($self) = @_;
00440   my $out_arr = $self->_registry_to_hash();
00441   my $json;
00442   if ($out_arr) {
00443     $json = encode_json($out_arr);
00444   }
00445   else {
00446     carp('No DBAs found');
00447   }
00448   return $json;
00449 }
00450 
00451 =head2 write_registry_to_file
00452     Description : Write the contents of the registry and species lists to a JSON file
00453     Arg : File name
00454     Return      : None
00455 =cut
00456 
00457 sub write_registry_to_file {
00458   my ( $self, $file ) = @_;
00459   my $json = $self->_registry_to_json();
00460   if ($json) {
00461     open( my $fh, '>', $file ) ||
00462       croak "Cannot open '${file}' for writing: $!";
00463     print $fh $json;
00464     close($fh);
00465   }
00466   return;
00467 }
00468 
00469 sub clear_cache {
00470   my ($self) = @_;
00471   if ( defined $self->cache_file() && -e $self->cache_file() ) {
00472     unlink $self->cache_file();
00473   }
00474   return;
00475 }
00476 
00477 =head2 _load_registry_from_json
00478     Description : load the registry from the supplied JSON string
00479     Arg : JSON string
00480 =cut
00481 
00482 sub _load_registry_from_json {
00483   my ( $self, $json ) = @_;
00484   my $reg_arr = decode_json($json);
00485   for my $dbc_h ( @{$reg_arr} ) {
00486     my $dbc =
00487       Bio::EnsEMBL::DBSQL::DBConnection->new(
00488                                             -driver => $dbc_h->{driver},
00489                                             -host   => $dbc_h->{host},
00490                                             -port   => $dbc_h->{port},
00491                                             -user => $dbc_h->{username},
00492                                             -pass => $dbc_h->{password},
00493                                             -dbname => $dbc_h->{dbname}
00494       );
00495     for my $species ( @{ $dbc_h->{species} } ) {
00496       my $dba =
00497         Bio::EnsEMBL::DBSQL::DBAdaptor->new(
00498                                   -DBCONN     => $dbc,
00499                                   -species    => $species->{species},
00500                                   -species_id => $species->{species_id},
00501                                   -multispecies_db => 1,
00502                                   -group           => 'core', );
00503       # no need to register - found automagically
00504       my $gc = $species->{gc};
00505       if ( ref $gc eq 'ARRAY' ) { $gc = $gc->[0]; }
00506       $self->_hash_dba_from_values( $dba,
00507                                     $species->{taxids},
00508                                     $species->{aliases},
00509                                     $species->{accessions},
00510                                     $species->{gc} );
00511     }
00512   } ## end for my $dbc_h ( @{$reg_arr...})
00513   $self->_intern_db_connections();
00514   return;
00515 } ## end sub _load_registry_from_json
00516 
00517 sub load_registry_from_file {
00518   my ( $self, $file ) = @_;
00519   open( my $fh, '<', $file ) or croak "Cannot open file $file: $!";
00520   my $data = do { local $/ = undef; <$fh> };
00521   close($fh);
00522   if ( defined $data && $data ne '' ) {
00523     $self->_load_registry_from_json($data);
00524   }
00525   return;
00526 }
00527 
00528 sub load_registry_from_url {
00529   my ( $self, $url ) = @_;
00530   my $json = get($url);
00531   croak "Could not retrieve JSON from $url: $@" unless defined $json;
00532   $self->_load_registry_from_json($json);
00533   return;
00534 }
00535 
00536 =head2 dba_to_args
00537     Description : Dump the Args needed for contructing a DBA
00538     Arg : Bio::EnsEMBL::DBAdaptor
00539     Return      : Arrayref of args
00540 =cut
00541 
00542 sub dba_to_args {
00543   my ( $self, $dba ) = @_;
00544   my $dbc = $dba->dbc();
00545   my $args = [ -species         => $dba->species(),
00546                -multispecies_db => $dba->is_multispecies(),
00547                -species_id      => $dba->species_id(),
00548                -group           => $dba->group(),
00549                -host            => $dbc->host(),
00550                -port            => $dbc->port(),
00551                -user            => $dbc->username(),
00552                -pass            => $dbc->password(),
00553                -driver          => $dbc->driver(),
00554                -dbname          => $dbc->dbname() ];
00555   return $args;
00556 }
00557 
00558 =head2 _dba_to_locator
00559     Description : return a hash key for a DBAdaptor
00560     Arg : Bio::EnsEMBL::DBAdaptor
00561 =cut
00562 
00563 sub _dba_to_locator {
00564   my ($dba) = @_;
00565   confess(
00566     "Arg must be Bio::EnsEMBL::DBSQL::DBAdaptor not " . ref($dba) )
00567     unless ref($dba) eq "Bio::EnsEMBL::DBSQL::DBAdaptor";
00568   my $locator =
00569     join( q{!-!}, $dba->species_id(), _dbc_to_locator( $dba->dbc() ) );
00570   return $locator;
00571 }
00572 
00573 =head2 _dbc_to_locator
00574     Description : return a hash key for a DBConnection
00575     Arg : Bio::EnsEMBL::DBConnection
00576 =cut
00577 
00578 sub _dbc_to_locator {
00579   my ($dbc) = @_;
00580   my $locator = join( q{!-!},
00581                       $dbc->host(),   $dbc->dbname(),
00582                       $dbc->driver(), $dbc->port(),
00583                       $dbc->username() );
00584   return $locator;
00585 }
00586 
00587 =head2 _intern_db_connections()
00588   Description : Go through all available DBAdaptors of registry and ensure they use the same
00589                 DBConnection instance.
00590   Exceptions  : None
00591   Status      : At Risk
00592 =cut
00593 
00594 sub _intern_db_connections {
00595   my ($self) = @_;
00596   my $adaptors = $self->registry()->get_all_DBAdaptors();
00597   my %dbc_intern;
00598   foreach my $dba ( @{$adaptors} ) {
00599     my $dbc     = $dba->dbc();
00600     my $locator = _dbc_to_locator($dbc);
00601     if ( exists $dbc_intern{$locator} ) {
00602       $dba->dbc( $dbc_intern{$locator} );
00603     }
00604     else {
00605       $dbc_intern{$locator} = $dba->dbc();
00606     }
00607   }
00608   return;
00609 }
00610 
00611 =head2 get_all_DBConnections
00612     Description : Return all database connections used by the DBAs retrieved from the registry
00613     Arg    : None
00614     Exceptions  : None
00615     Return type : Arrayref of Bio::EnsEMBL::DBSQL::DBConnection
00616 =cut
00617 
00618 sub get_all_DBConnections {
00619   my ($self) = @_;
00620   my $adaptors = $self->registry()->get_all_DBAdaptors();
00621   my %dbcs;
00622   foreach my $dba ( values %{ $self->{dbas} } ) {
00623     my $dbc = $dba->dbc();
00624     $dbcs{ _dbc_to_locator($dbc) } = $dbc;
00625   }
00626   return [ values %dbcs ];
00627 }
00628 
00629 =head2 get_all_dbnames
00630     Description : Return all database names used by the DBAs retrieved from the registry
00631     Arg    : None
00632     Exceptions  : None
00633     Return type : Arrayref of strings
00634 =cut
00635 
00636 sub get_all_dbnames {
00637   my ($self) = @_;
00638   my %dbnames;
00639   foreach my $dba ( values %{ $self->{dbas} } ) {
00640     my $dbc = $dba->dbc();
00641     $dbnames{ $dbc->dbname }++;
00642   }
00643   return [ keys %dbnames ];
00644 }
00645 
00646 =head2 get_all
00647     Description : Return all database adaptors that have been retrieved from registry
00648     Arg    : None
00649     Exceptions  : None
00650     Return type : Arrayref of Bio::EnsEMBL::DBSQL::DatabaseAdaptor
00651 =cut
00652 
00653 sub get_all {
00654   my ($self) = @_;
00655   return [ values %{ $self->{dbas} } ];
00656 }
00657 
00658 sub get_all_by_taxon_branch {
00659   my ( $self, $root ) = @_;
00660   if ( ref($root) ne 'Bio::EnsEMBL::Taxonomy::TaxonomyNode' ) {
00661     if ( looks_like_number($root) ) {
00662       $root = $self->taxonomy_adaptor()->fetch_by_taxon_id($root);
00663     }
00664     else {
00665       ($root) =
00666         @{ $self->taxonomy_adaptor()->fetch_all_by_name($root) };
00667     }
00668   }
00669   if ( !defined $root ) {
00670     return [];
00671   }
00672   my @genomes = @{ $self->get_all_by_taxon_id( $root->taxon_id() ) };
00673   for my $node ( @{ $root->adaptor()->fetch_descendants($root) } ) {
00674     @genomes = ( @genomes,
00675                  @{ $self->get_all_by_taxon_id( $node->taxon_id() ) } );
00676   }
00677   return \@genomes;
00678 }
00679 
00680 =head2 get_all_by_taxon_id
00681     Description : Returns all database adaptors that have the supplied taxonomy ID
00682     Arg    : Int
00683     Exceptions  : None
00684     Return type : Arrayref of Bio::EnsEMBL::DBSQL::DatabaseAdaptor
00685 =cut
00686 
00687 sub get_all_by_taxon_id {
00688   my ( $self, $id ) = @_;
00689   return $self->{dbas_by_taxid}{$id} || [];
00690 }
00691 
00692 =head2 get_by_name_exact
00693     Description : Return all database adaptors that have the supplied string as an alias/name
00694     Arg    : String
00695     Exceptions  : None
00696     Return type : Arrayref of Bio::EnsEMBL::DBSQL::DatabaseAdaptor
00697 =cut
00698 
00699 sub get_by_name_exact {
00700   my ( $self, $name ) = @_;
00701   return $self->{dbas_by_name}{$name};
00702 }
00703 
00704 =head2 get_all_by_accession
00705     Description : Returns the database adaptor(s) that contains a seq_region with the supplied INSDC accession (or other seq_region name)
00706     Arg    : Int
00707     Exceptions  : None
00708     Return type : Arrayref of Bio::EnsEMBL::DBSQL::DatabaseAdaptor
00709 =cut    
00710 
00711 sub get_all_by_accession {
00712   my ( $self, $acc ) = @_;
00713   my $dba = $self->{dbas_by_vacc}{$acc};
00714   if ( !defined $dba ) {
00715     $acc =~ s/\.[0-9]+$//x;
00716     $dba = $self->{dbas_by_acc}{$acc};
00717   }
00718   return $dba;
00719 }
00720 
00721 =head2 get_by_assembly_accession
00722     Description : Returns the database adaptor that contains the assembly with the supplied INSDC assembly accession
00723     Arg    : Int
00724     Exceptions  : None
00725     Return type : Bio::EnsEMBL::DBSQL::DatabaseAdaptor
00726 =cut
00727 
00728 sub get_by_assembly_accession {
00729   my ( $self, $acc ) = @_;
00730   my $dba = $self->{dbas_by_vgc}{$acc};
00731   if ( !defined $dba ) {
00732     $acc =~ s/\.[0-9]+$//x;
00733     $dba = $self->{dbas_by_gc}{$acc};
00734   }
00735   return $dba;
00736 }
00737 
00738 =head2 get_all_by_name_pattern
00739     Description : Return all database adaptors that have an alias/name that match the supplied regexp
00740     Arg    : String
00741     Exceptions  : None
00742     Return type : Arrayref of Bio::EnsEMBL::DBSQL::DatabaseAdaptor
00743 =cut    
00744 
00745 sub get_all_by_name_pattern {
00746   my ( $self, $pattern ) = @_;
00747   my %dbas;
00748   for my $name ( keys %{ $self->{dbas_by_name} } ) {
00749     if ( $name =~ m/$pattern/x ) {
00750       for my $dba ( @{ $self->{dbas_by_name}{$name} } ) {
00751         my $key = _dba_to_locator($dba);
00752         $dbas{$key} = $dba;
00753       }
00754     }
00755   }
00756   return [ values(%dbas) ];
00757 }
00758 
00759 =head2 get_all_by_dbname
00760     Description : Returns all database adaptors that have the supplied dbname
00761     Arg    : String
00762     Exceptions  : None
00763     Return type : Arrayref of Bio::EnsEMBL::DBSQL::DatabaseAdaptor
00764 =cut
00765 
00766 sub get_all_by_dbname {
00767   my ( $self, $dbname ) = @_;
00768 
00769   my @filtered_dbas;
00770 
00771   my $all = $self->get_all();
00772   foreach my $dba ( values %{ $self->{dbas} } ) {
00773     if ( $dba->dbc->dbname eq $dbname ) {
00774       push @filtered_dbas, $dba;
00775     }
00776   }
00777 
00778   return \@filtered_dbas;
00779 }
00780 
00781 =head2 get_all_taxon_ids
00782     Description : Return list of all taxon IDs registered with the helper
00783     Exceptions  : None
00784     Return type : Arrayref of integers
00785 =cut
00786 
00787 sub get_all_taxon_ids {
00788   my ($self) = @_;
00789   return [ keys( %{ $self->{dbas_by_taxid} } ) ];
00790 }
00791 
00792 =head2 get_all_names
00793     Description : Return list of all species names registered with the helper
00794     Exceptions  : None
00795     Return type : Arrayref of strings
00796 =cut
00797 
00798 sub get_all_names {
00799   my ($self) = @_;
00800   return [ keys( %{ $self->{dbas_by_name} } ) ];
00801 }
00802 
00803 =head2 get_all_accessions
00804     Description : Return list of all INSDC sequence accessions (or other seq_region names) registered with the helper
00805     Exceptions  : None
00806     Return type : Arrayref of strings
00807 =cut
00808 
00809 sub get_all_accessions {
00810   my ($self) = @_;
00811   return [ keys( %{ $self->{dbas_by_acc} } ) ];
00812 }
00813 
00814 =head2 get_all_versioned_accessions
00815     Description : Return list of all versioned INSDC sequence accessions (or other seq_region names) registered with the helper
00816     Exceptions  : None
00817     Return type : Arrayref of strings
00818 =cut
00819 
00820 sub get_all_versioned_accessions {
00821   my ($self) = @_;
00822   return [ keys( %{ $self->{dbas_by_vacc} } ) ];
00823 }
00824 
00825 =head2 get_all_assemblies
00826     Description : Return list of all INSDC assembly accessions registered with the helper
00827     Exceptions  : None
00828     Return type : Arrayref of strings
00829 =cut
00830 
00831 sub get_all_assemblies {
00832   my ($self) = @_;
00833   return [ keys( %{ $self->{dbas_by_gc} } ) ];
00834 }
00835 
00836 =head2 get_all_versioned_assemblies
00837     Description : Return list of all versioned INSDC assembly accessions registered with the helper
00838     Exceptions  : None
00839     Return type : Arrayref of strings
00840 =cut
00841 
00842 sub get_all_versioned_assemblies {
00843   my ($self) = @_;
00844   return [ keys( %{ $self->{dbas_by_vgc} } ) ];
00845 }
00846 
00847 =head2 register_all_dbs
00848     Description : Helper method to load the registry with all multispecies core databases on the supplied server
00849     Arg    : Host
00850     Arg    : Port
00851     Arg    : User
00852     Arg    : Password
00853     Arg    : (optional) String with database regexp (default is _collection_core_[0-9]_eVersion_[0-9]+)
00854     Exceptions  : None
00855     Return type : None
00856 =cut
00857 
00858 sub register_all_dbs {
00859   my ( $class, $host, $port, $user, $pass, $regexp ) = @_;
00860 
00861   if ( !$regexp ) {
00862     $regexp = '_core_[0-9]+_' . software_version() . '_[0-9]+';
00863   }
00864 
00865   if ( !defined $host ) {
00866     croak "Host must be supplied for registration of databases";
00867   }
00868 
00869   if ( !defined $port ) {
00870     croak
00871 "Port must be supplied for registration of databases on host $host";
00872   }
00873 
00874   my $str = "DBI:mysql:host=$host;port=$port";
00875   my $dbh = DBI->connect( $str, $user, $pass ) ||
00876     croak "Could not connect to database $str (user=$user, pass=$pass)";
00877 
00878   my @dbnames =
00879     grep { m/$regexp/xi }
00880     map  { $_->[0] } @{ $dbh->selectall_arrayref('SHOW DATABASES') };
00881 
00882   for my $db (@dbnames) {
00883     if ( $db =~ m/.*_collection_core_.*/ ) {
00884       _register_multispecies_core( $host, $port, $user, $pass, $db );
00885     }
00886     else {
00887       _register_singlespecies_core( $host, $port, $user, $pass, $db );
00888     }
00889   }
00890   return;
00891 } ## end sub register_all_dbs
00892 
00893 =head2 _register_singlespecies_core
00894     Description : Register core dbas for all species in the supplied database
00895 =cut
00896 
00897 sub _register_singlespecies_core {
00898   my ( $host, $port, $user, $pass, $db ) = @_;
00899   my ( $species, $num ) = (
00900     $db =~ /(^[a-z]+_[a-z0-9]+(?:_[a-z0-9]+)?)  # species name
00901                      _
00902                      core                   # type
00903                      _
00904                      (?:\d+_)?               # optional endbit for ensembl genomes
00905                      (\d+)                   # databases release
00906                      _
00907                       /x );
00908   return
00909     Bio::EnsEMBL::DBSQL::DBAdaptor->new( -HOST    => $host,
00910                                          -PORT    => $port,
00911                                          -USER    => $user,
00912                                          -PASS    => $pass,
00913                                          -DBNAME  => $db,
00914                                          -SPECIES => $species );
00915 }
00916 
00917 =head2 _register_multispecies_core
00918     Description : Register core dbas for all species in the supplied database
00919 =cut
00920 
00921 sub _register_multispecies_core {
00922   return _register_multispecies_x(
00923     'Bio::EnsEMBL::DBSQL::DBAdaptor',
00924     sub {
00925       my %args = @_;
00926       return Bio::EnsEMBL::DBSQL::DBAdaptor->new(%args);
00927     },
00928     @_ );
00929 }
00930 
00931 =head2 _register_multispecies_x
00932     Description : Register specified dba type for all species in the supplied database
00933 =cut
00934 
00935 sub _register_multispecies_x {
00936 
00937   my ( $adaptor, $closure, $host, $port, $user, $pass, $instance,
00938        $species, $alias_arr_ref, $disconnect_when_idle )
00939     = @_;
00940   _runtime_include($adaptor);
00941   _check_name($instance);
00942   my %args =
00943     ( _login_hash( $host, $port, $user, $pass ), '-DBNAME', $instance );
00944   my @species_array = @{ _query_multispecies_db( $species, %args ) };
00945   my @dbas;
00946   foreach my $species_args (@species_array) {
00947     my %new_args = ( %args, %{$species_args}, '-MULTISPECIES_DB' => 1 );
00948     $new_args{-DISCONNECT_WHEN_INACTIVE} = 1 if $disconnect_when_idle;
00949     my $dba = $closure->(%new_args);
00950     if ( defined $species && defined $alias_arr_ref ) {
00951       _add_aliases( $species, $alias_arr_ref );
00952     }
00953     push( @dbas, $dba );
00954   }
00955   return \@dbas;
00956 }
00957 
00958 =head2 _query_multispecies_db
00959     Description : Find all species in the multispecies database
00960 =cut
00961 
00962 sub _query_multispecies_db {
00963   my ( $species, %db_args ) = @_;
00964   my @species_array;
00965   eval {
00966     my $dbh = Bio::EnsEMBL::DBSQL::DBConnection->new(%db_args);
00967     my $sql;
00968     my @params = ('species.db_name');
00969 
00970     if ( defined $species ) {
00971       $sql =
00972 'select meta_value, species_id from meta where meta_key =? and meta_value =?';
00973       push( @params, $species );
00974     }
00975     else {
00976       $sql =
00977 'select distinct meta_value, species_id from meta where meta_key =? and species_id is not null';
00978     }
00979     my $sth = $dbh->prepare($sql);
00980     $sth->execute(@params);
00981     while ( my $row = $sth->fetchrow_arrayref() ) {
00982       my ( $species_name, $species_id ) = @{$row};
00983       push( @species_array,
00984             { '-SPECIES'    => $species_name,
00985               '-SPECIES_ID' => $species_id } );
00986     }
00987     $sth->finish();
00988   };
00989   throw("Error detected: $@") if $@;
00990   return \@species_array;
00991 } ## end sub _query_multispecies_db
00992 
00993 =head2 _add_aliases
00994     Description : Registry all aliases
00995     Arg : Species name
00996     Arg : Arrayref of alias strings
00997 =cut
00998 
00999 sub _add_aliases {
01000   my ( $species_name, $alias_ref ) = @_;
01001   if ( defined $alias_ref ) {
01002     if ( ref($alias_ref) ne 'ARRAY' ) {
01003       $alias_ref = [$alias_ref];
01004     }
01005 
01006     if ( scalar( @{$alias_ref} ) > 0 ) {
01007       Bio::EnsEMBL::Utils::ConfigRegistry->add_alias(
01008                                               -species => $species_name,
01009                                               -alias   => $alias_ref );
01010     }
01011   }
01012   return;
01013 }
01014 
01015 =head2 _runtime_include
01016     Description : Load the specified module (usually a DatabaseAdaptor)
01017     Arg     : Module name
01018 =cut
01019 
01020 sub _runtime_include {
01021   my ($full_module) = @_;
01022   ## no critic (ProhibitStringyEval)
01023   eval "use ${full_module}" ||
01024     throw("Cannot import module '${full_module}': $@")
01025     if $@;
01026   ## use critic
01027   return;
01028 }
01029 
01030 =head2 _check_name
01031     Description : Check that the name is not longer than 64 characters
01032     Arg : Name to check
01033 =cut
01034 
01035 sub _check_name {
01036   #Throws a wobbly if the db name is longer than 64 characters
01037   my ($name) = @_;
01038   my $length = length($name);
01039   throw(
01040 "Invalid length for database name used. Max is 64. The name ${name}- was ${length}"
01041   ) if $length > 64;
01042   return;
01043 }
01044 
01045 =head2 _login_hash
01046     Description : Generate dbadaptor login hash from supplied Args
01047     Arg : Host
01048     Arg : Port
01049     Arg : User
01050     Arg : Password
01051 =cut
01052 
01053 sub _login_hash {
01054   my ( $host, $port, $user, $pass ) = @_;
01055   my %details = ( -HOST => $host, -PORT => $port, -USER => $user );
01056   $details{-PASS} = $pass if defined $pass;
01057   return %details;
01058 }
01059 
01060 1;
01061