GenomeAssemblyInfoAdaptor.pm
Go to the documentation of this file.
00001 
00002 =head1 LICENSE
00003 
00004 Copyright [1999-2014] EMBL-European Bioinformatics Institute
00005 
00006 Licensed under the Apache License, Version 2.0 (the "License");
00007 you may not use this file except in compliance with the License.
00008 You may obtain a copy of the License at
00009 
00010      http://www.apache.org/licenses/LICENSE-2.0
00011 
00012 Unless required by applicable law or agreed to in writing, software
00013 distributed under the License is distributed on an "AS IS" BASIS,
00014 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015 See the License for the specific language governing permissions and
00016 limitations under the License.
00017 
00018 =cut
00019 
00020 =pod
00021 
00022 =head1 NAME
00023 
00024 Bio::EnsEMBL::MetaData::DBSQL::GenomeAssemblyInfoAdaptor
00025 
00026 =head1 SYNOPSIS
00027 
00028 # metadata_db is an instance of MetaDataDBAdaptor
00029 my $adaptor = $metadata_db->get_GenomeAssemblyInfoAdaptor();
00030 my $assembly = $adaptor->fetch_by_assembly_accession('GCA_000001405.15');
00031 
00032 =head1 DESCRIPTION
00033 
00034 Adaptor for storing and retrieving GenomeAssemblyInfo objects from MySQL ensembl_metadata database
00035 
00036 =head1 SEE ALSO
00037 
00038 Bio::EnsEMBL::MetaData::GenomeAssemblyInfo
00039 Bio::EnsEMBL::MetaData::GenomeOrganismInfo
00040 
00041 =head1 Author
00042 
00043 Dan Staines
00044 
00045 =cut
00046 
00047 package Bio::EnsEMBL::MetaData::DBSQL::GenomeAssemblyInfoAdaptor;
00048 
00049 use strict;
00050 use warnings;
00051 
00052 use base qw/Bio::EnsEMBL::MetaData::DBSQL::BaseInfoAdaptor/;
00053 
00054 use Carp qw(cluck croak);
00055 use Bio::EnsEMBL::Utils::Argument qw( rearrange );
00056 use Scalar::Util qw(looks_like_number);
00057 use Bio::EnsEMBL::DBSQL::DBConnection;
00058 use Bio::EnsEMBL::MetaData::GenomeAssemblyInfo;
00059 use List::MoreUtils qw(natatime);
00060 
00061 =head1 METHODS
00062 =cut
00063 
=head2 store
  Arg        : Bio::EnsEMBL::MetaData::GenomeAssemblyInfo
  Description: Store the supplied assembly. The associated organism is stored
               first if it has no dbID. If the assembly has no dbID but a row
               with the same organism and assembly name already exists, that
               row's ID is adopted and the row is updated. Otherwise a new row
               is inserted and the assembly's sequences are stored with it.
  Returntype : none
  Exceptions : croaks if the assembly has no associated organism
  Caller     : general
  Status     : Stable
=cut

sub store {
  my ( $self, $assembly ) = @_;
  if ( !defined $assembly->organism() ) {
    # croak is imported from Carp by this package; throw() is not imported
    # here, so calling it died with "Undefined subroutine" and hid this message
    croak "Assembly must be associated with an organism";
  }
  if ( !defined $assembly->organism()->dbID() ) {
    # the assembly row needs an organism_id, so persist the organism first
    $self->db()->get_GenomeOrganismInfoAdaptor()
      ->store( $assembly->organism() );
  }
  if ( !defined $assembly->dbID() ) {
    # find out if the assembly exists first (same organism and assembly name)
    my ($dbID) =
      @{
      $self->dbc()->sql_helper()->execute_simple(
        -SQL =>
"select assembly_id from assembly where organism_id=? and assembly_name=?",
        -PARAMS => [ $assembly->organism()->dbID(), $assembly->assembly_name() ]
      ) };
    if ( defined $dbID ) {
      # adopt the existing row so that the update branch below is taken
      $assembly->dbID($dbID);
      $assembly->adaptor($self);
    }
  }
  if ( defined $assembly->dbID() ) {
    $self->update($assembly);
  }
  else {
    $self->dbc()->sql_helper()->execute_update(
      -SQL =>
q/insert into assembly(assembly_accession,assembly_name,assembly_level,base_count,organism_id)
        values(?,?,?,?,?)/,
      -PARAMS => [ $assembly->assembly_accession(),
                   $assembly->assembly_name(),
                   $assembly->assembly_level(),
                   $assembly->base_count(),
                   $assembly->organism()->dbID() ],
      -CALLBACK => sub {
        my ( $sth, $dbh, $rv ) = @_;
        # pick up the auto-generated key of the row just inserted
        $assembly->dbID( $dbh->{mysql_insertid} );
      } );
    $self->_store_sequences($assembly);
    $assembly->adaptor($self);
    $self->_store_cached_obj($assembly);
  }
  return;
} ## end sub store
00119 
=head2 update
  Arg        : Bio::EnsEMBL::MetaData::GenomeAssemblyInfo
  Description: Write the current attribute values of a previously stored
               assembly back to its row in the assembly table
  Returntype : none
  Exceptions : croaks if the object has no dbID
  Caller     : general
  Status     : Stable
=cut

sub update {
  my ( $self, $assembly ) = @_;

  # only rows that already have a primary key can be updated
  croak "Cannot update an object that has not already been stored"
    unless defined $assembly->dbID();

  my $helper = $self->dbc()->sql_helper();
  my $sql =
q/update assembly set assembly_accession=?,assembly_name=?,assembly_level=?,base_count=?,organism_id=? where assembly_id=?/;

  # bind values in the same order as the placeholders above
  my @bind = ( $assembly->assembly_accession(),
               $assembly->assembly_name(),
               $assembly->assembly_level(),
               $assembly->base_count(),
               $assembly->organism()->dbID(),
               $assembly->dbID() );

  $helper->execute_update( -SQL => $sql, -PARAMS => \@bind );

  return;
}
00144 
=head2 fetch_all_by_sequence_accession
  Arg        : INSDC sequence accession e.g. U00096.1 or U00096
  Arg        : (optional) if 1, expand children of genome info
  Description: Fetch assembly info for the specified sequence accession. An
               accession ending in ".<digits>" is looked up as versioned, any
               other accession as unversioned.
  Returntype : whatever the versioned/unversioned fetch method returns
  Exceptions : none
  Caller     : general
  Status     : Stable
=cut

sub fetch_all_by_sequence_accession {
  my ( $self, $id, $keen ) = @_;

  # a trailing ".N" marks a versioned accession
  my $fetcher =
    ( $id =~ m/\.[0-9]+$/ )
    ? 'fetch_all_by_sequence_accession_versioned'
    : 'fetch_all_by_sequence_accession_unversioned';

  return $self->$fetcher( $id, $keen );
}
00164 
=head2 fetch_all_by_sequence_accession_unversioned
  Arg        : INSDC sequence accession e.g. U00096
  Arg        : (optional) if 1, expand children of genome info
  Description: Fetch assembly info for every assembly with a sequence whose
               accession or name matches "<accession>.<anything>"
  Returntype : whatever _fetch_generic returns (presumably an arrayref of
               Bio::EnsEMBL::MetaData::GenomeAssemblyInfo - confirm against
               the base adaptor)
  Exceptions : none
  Caller     : general
  Status     : Stable
=cut

sub fetch_all_by_sequence_accession_unversioned {
  my ( $self, $id, $keen ) = @_;

  my $sql = $self->_get_base_sql() .
' where assembly_id in (select distinct(assembly_id) from assembly_sequence where acc like ? or name like ?)';

  # match any version of the accession; the same pattern is bound to acc and name
  my $pattern = $id . '.%';

  return $self->_fetch_generic( $sql, [ $pattern, $pattern ], $keen );
}
00184 
=head2 fetch_all_by_sequence_accession_versioned
  Arg        : INSDC sequence accession e.g. U00096.1
  Arg        : (optional) if 1, expand children of genome info
  Description: Fetch assembly info for every assembly with a sequence whose
               accession or name equals the supplied versioned accession
  Returntype : whatever _fetch_generic returns (presumably an arrayref of
               Bio::EnsEMBL::MetaData::GenomeAssemblyInfo - confirm against
               the base adaptor)
  Exceptions : none
  Caller     : general
  Status     : Stable
=cut

sub fetch_all_by_sequence_accession_versioned {
  my ( $self, $id, $keen ) = @_;

  my $sql = $self->_get_base_sql() .
' where assembly_id in (select distinct(assembly_id) from assembly_sequence where acc=? or name=?)';

  # exact match; the accession is bound once for acc and once for name
  my @bind = ( $id, $id );

  return $self->_fetch_generic( $sql, \@bind, $keen );
}
00204 
=head2 fetch_by_assembly_accession
  Arg        : INSDC assembly accession
  Arg        : (optional) if 1, expand children of genome info
  Description: Fetch assembly info for the assembly whose assembly_accession
               equals the supplied accession
  Returntype : result of _first_element applied to the matching rows
               (presumably a single
               Bio::EnsEMBL::MetaData::GenomeAssemblyInfo - confirm against
               the base adaptor)
  Exceptions : none
  Caller     : general
  Status     : Stable
=cut

sub fetch_by_assembly_accession {
  my ( $self, $id, $keen ) = @_;

  my $sql = $self->_get_base_sql() . ' where assembly_accession=?';

  # list assignment keeps _fetch_generic in list context, as when its result
  # is passed straight through as an argument list
  my @matches = $self->_fetch_generic( $sql, [$id], $keen );

  return $self->_first_element(@matches);
}
00224 
=head2 fetch_all_by_assembly_set_chain
  Arg        : INSDC assembly set chain (unversioned accession)
  Arg        : (optional) if 1, expand children of genome info
  Description: Fetch assembly info for every assembly whose accession matches
               "<set chain>.<anything>"
  Returntype : whatever _fetch_generic returns (presumably an arrayref of
               Bio::EnsEMBL::MetaData::GenomeAssemblyInfo - confirm against
               the base adaptor)
  Exceptions : none
  Caller     : general
  Status     : Stable
=cut

sub fetch_all_by_assembly_set_chain {
  my ( $self, $id, $keen ) = @_;

  my $sql = $self->_get_base_sql() . ' where assembly_accession like ?';

  # any version of the set chain
  my @bind = ( $id . '.%' );

  return $self->_fetch_generic( $sql, \@bind, $keen );
}
00242 
=head2 fetch_all_by_organism
  Arg        : Bio::EnsEMBL::MetaData::GenomeOrganismInfo object (or subclass
               instance), or an organism dbID
  Arg        : (optional) if 1, expand children of genome info
  Description: Fetch assembly info for all assemblies of the specified organism
  Returntype : whatever _fetch_generic returns (presumably an arrayref of
               Bio::EnsEMBL::MetaData::GenomeAssemblyInfo - confirm against
               the base adaptor)
  Exceptions : none
  Caller     : general
  Status     : Stable
=cut

sub fetch_all_by_organism {
  my ( $self, $organism_id, $keen ) = @_;

  # Accept any GenomeOrganismInfo instance, including subclasses. An exact
  # ref() string comparison would leave a subclass object in place and bind
  # the object itself as the SQL parameter.
  if ( Scalar::Util::blessed($organism_id) &&
       $organism_id->isa('Bio::EnsEMBL::MetaData::GenomeOrganismInfo') )
  {
    $organism_id = $organism_id->dbID();
  }
  return
    $self->_fetch_generic( $self->_get_base_sql() . ' where organism_id = ?',
                           [$organism_id], $keen );
}
00262 
=head1 INTERNAL METHODS
=head2 _store_sequences
  Arg        : Bio::EnsEMBL::MetaData::GenomeAssemblyInfo
  Description: Replace the stored sequences of the supplied assembly: all
               existing assembly_sequence rows for its dbID are deleted, then
               the object's sequences (hashrefs with "name" and "acc" keys)
               are inserted in batches of 1000 using bound parameters.
  Returntype : None
  Exceptions : none
  Caller     : internal
  Status     : Stable
=cut

sub _store_sequences {
  my ( $self, $assembly ) = @_;

  my $helper      = $self->dbc()->sql_helper();
  my $assembly_id = $assembly->dbID();

  $helper->execute_update(
                   -SQL => q/delete from assembly_sequence where assembly_id=?/,
                   -PARAMS => [$assembly_id] );

  my $sequences = $assembly->sequences();
  return if !defined $sequences;

  # work on a shallow copy so that splice does not empty the caller's list
  my @pending = @{$sequences};
  while ( my @batch = splice( @pending, 0, 1000 ) ) {
    # one "(?,?,?)" group per sequence; values are bound rather than
    # interpolated so that quotes or backslashes in names cannot break the SQL
    my $sql =
      'insert ignore into assembly_sequence(assembly_id,name,acc) values ' .
      join( ',', ('(?,?,?)') x scalar(@batch) );
    my @params = map {
      # a false accession (undef, '' or 0) is stored as NULL
      ( $assembly_id, $_->{name}, ( $_->{acc} ? $_->{acc} : undef ) )
    } @batch;
    $helper->execute_update( -SQL => $sql, -PARAMS => \@params );
  }
  return;
}
00296 
=head2 _fetch_sequences
  Arg        : Bio::EnsEMBL::MetaData::GenomeAssemblyInfo
  Description: Load the name/acc rows stored in assembly_sequence for the
               supplied object's dbID and set them on the object
  Returntype : none
  Exceptions : croaks if the object has no dbID
  Caller     : internal
  Status     : Stable
=cut

sub _fetch_sequences {
  my ( $self, $genome ) = @_;

  my $assembly_id = $genome->dbID();
  unless ( defined $assembly_id ) {
    croak
"Cannot fetch sequences for a GenomeAssemblyInfo object that has not been stored";
  }

  # -USE_HASHREFS asks the helper for one hashref per row
  my %query = (
           -USE_HASHREFS => 1,
           -SQL => 'select name,acc from assembly_sequence where assembly_id=?',
           -PARAMS => [$assembly_id] );
  my $rows = $self->dbc()->sql_helper()->execute(%query);

  $genome->sequences($rows);
  return;
}
00319 
=head2 _fetch_organism
  Arg        : object with an organism_id hash entry and an organism() setter
  Description: If the object carries an organism_id, fetch that organism via
               the GenomeOrganismInfoAdaptor and attach it to the object
  Returntype : none
  Exceptions : none
  Caller     : internal
  Status     : Stable
=cut

sub _fetch_organism {
  my ( $self, $md ) = @_;

  # nothing to attach when no organism is referenced
  my $organism_id = $md->{organism_id};
  return if !defined $organism_id;

  my $organism_adaptor = $self->db()->get_GenomeOrganismInfoAdaptor();
  $md->organism( $organism_adaptor->fetch_by_dbID($organism_id) );
  return;
}
00328 
=head2 _fetch_children
  Arg        : Bio::EnsEMBL::MetaData::GenomeAssemblyInfo
  Description: Populate the child data of the supplied object: its sequences
               first, then its organism
  Returntype : none
  Exceptions : none
  Caller     : internal
  Status     : Stable
=cut

sub _fetch_children {
  my ( $self, $md ) = @_;

  # loaders run in this fixed order
  for my $loader (qw(_fetch_sequences _fetch_organism)) {
    $self->$loader($md);
  }
  return;
}
00344 
# Columns selected for every assembly fetch; assembly_id is aliased to dbID.
my @assembly_fetch_columns = ( 'assembly_id as dbID', 'organism_id',
                               'assembly_accession',  'assembly_name',
                               'assembly_level',      'base_count' );

# Base select statement, built once at load time; callers append "where" clauses.
my $assembly_fetch_sql =
  'select ' . join( ', ', @assembly_fetch_columns ) . ' from assembly';

=head2 _get_base_sql
  Description: Return the base select statement over the assembly table
  Returntype : String
  Exceptions : none
  Caller     : internal
  Status     : Stable
=cut

sub _get_base_sql {
  return $assembly_fetch_sql;
}
00351 
=head2 _get_id_field
  Description: Return the name of the primary key column of the assembly table
  Returntype : String
  Exceptions : none
  Caller     : internal
  Status     : Stable
=cut

sub _get_id_field {
  my $id_column = 'assembly_id';
  return $id_column;
}
00355 
=head2 _get_obj_class
  Description: Return the name of the class this adaptor handles
  Returntype : String
  Exceptions : none
  Caller     : internal
  Status     : Stable
=cut

sub _get_obj_class {
  my $class_name = 'Bio::EnsEMBL::MetaData::GenomeAssemblyInfo';
  return $class_name;
}
00359 
00360 1;