# Build a Bio::EnsEMBL::MetaData::GenomeInfo object describing one genome.
#
# Arguments:
#   $self - processor object; this sub reads $self->{data_release},
#           $self->{contigs}, $self->{annotation_analyzer} and
#           $self->{logger}
#   $dbas - hashref of database adaptors keyed by group: 'core' is required;
#           'otherfeatures', 'variation' and 'funcgen' are optional
# Returns : the populated GenomeInfo object
# Throws  : confess()es if $dbas->{core} is not defined
#
# Sequence names are only collected when $self->{contigs} is defined.
# Annotation, feature, variation and alignment details (and db_size) are only
# filled in when $self->{annotation_analyzer} is defined.
sub process_genome {
  my ( $self, $dbas ) = @_;
  my $dba = $dbas->{core};
  if ( !defined $dba ) {
    confess "DBA not defined for processing";
  }
  $dba->dbc()->reconnect_when_lost(1);

  # get metadata container
  my $meta   = $dba->get_MetaContainer();
  my $dbname = $dba->dbc()->dbname();
  my $size   = get_dbsize($dba);

  # Looked up once and reused by every query/log line below.
  my $helper     = $dba->dbc()->sql_helper();
  my $species_id = $dba->species_id();
  my $species    = $dba->species();

  my $strain      = $meta->single_value_by_key('species.strain');
  my $serotype    = $meta->single_value_by_key('species.serotype');
  my $name        = $meta->get_display_name();
  my $taxonomy_id = $meta->get_taxonomy_id();
  # fall back to the genome's own taxon when no species-level taxon is stored
  my $species_taxonomy_id =
    $meta->single_value_by_key('species.species_taxonomy_id') || $taxonomy_id;
  my $assembly_accession = $meta->single_value_by_key('assembly.accession');
  my $assembly_name      = $meta->single_value_by_key('assembly.name');
  my $genebuild          = $meta->single_value_by_key('genebuild.start_date');

  # get highest assembly level: first coord_system name when ordered by rank
  my ($assembly_level) = @{
    $helper->execute_simple(
      -SQL =>
        'select name from coord_system where species_id=? order by rank asc',
      -PARAMS => [$species_id] ) };

  # default division, overridden by the last (sorted) species.division value
  my $division  = 'Ensembl';
  my @divisions = sort @{ $meta->list_value_by_key('species.division') };
  if ( scalar @divisions > 0 ) {
    $division = $divisions[-1];
  }

  my $md =
    Bio::EnsEMBL::MetaData::GenomeInfo->new(
                               -name                => $species,
                               -species_id          => $species_id,
                               -division            => $division,
                               -dbname              => $dbname,
                               -data_release        => $self->{data_release},
                               -strain              => $strain,
                               -serotype            => $serotype,
                               -display_name        => $name,
                               -taxonomy_id         => $taxonomy_id,
                               -species_taxonomy_id => $species_taxonomy_id,
                               -assembly_accession  => $assembly_accession,
                               -assembly_name       => $assembly_name,
                               -genebuild           => $genebuild,
                               -assembly_level      => $assembly_level );

  # get list of seq names
  my $seqs_arr = [];
  if ( defined $self->{contigs} ) {
    my $seqs = {};

    # 1. get complete list of seq_regions as a hash vs. ENA synonyms
    #    (synonym is undef for regions without an INSDC synonym: left join)
    $helper->execute_no_return(
      -SQL => q/select distinct s.name, ss.synonym
from coord_system c
join seq_region s using (coord_system_id)
left join seq_region_synonym ss on
(ss.seq_region_id=s.seq_region_id and ss.external_db_id in
(select external_db_id from external_db where db_name='INSDC'))
where c.species_id=? and attrib like '%default_version%'/,
      -PARAMS   => [$species_id],
      -CALLBACK => sub {
        my ( $seq_name, $acc ) = @{ shift @_ };
        $seqs->{$seq_name} = $acc;
        return;
      } );

    # 2. add accessions where the name is flagged as being in ENA
    #    (the name itself is used as the accession, replacing any synonym)
    $helper->execute_no_return(
      -SQL => q/
select s.name
from coord_system c
join seq_region s using (coord_system_id)
join seq_region_attrib sa using (seq_region_id)
where sa.value='ENA' and c.species_id=? and attrib like '%default_version%'/,
      -PARAMS   => [$species_id],
      -CALLBACK => sub {
        my ($acc) = @{ shift @_ };
        $seqs->{$acc} = $acc;
        return;
      } );

    for my $seq_name ( keys %$seqs ) {
      push @$seqs_arr, { name => $seq_name, acc => $seqs->{$seq_name} };
    }
  } ## end if ( defined $self->{contigs...})
  $md->assembly()->sequences($seqs_arr);

  # get toplevel base count: prefer the precomputed statistic, otherwise sum
  # the lengths of the toplevel seq_regions
  my $base_counts =
    $helper->execute_simple(
    -SQL =>
q/select value from genome_statistics where statistic='ref_length' and species_id=?/,
    -PARAMS => [$species_id] );
  if ( scalar @$base_counts == 0 ) {
    $base_counts = $helper->execute_simple(
      -SQL => q/select sum(s.length) from seq_region s
join coord_system c using (coord_system_id)
join seq_region_attrib sa using (seq_region_id)
join attrib_type a using (attrib_type_id)
where a.code='toplevel' and species_id=?/,
      -PARAMS => [$species_id] );
  }
  $md->assembly()->base_count( $base_counts->[0] );

  # get associated PMIDs
  $md->organism()->publications(
    $helper->execute_simple(
      -SQL => q/select distinct dbprimary_acc from
xref
join external_db using (external_db_id)
join seq_region_attrib sa on (xref.xref_id=sa.value)
join attrib_type using (attrib_type_id)
join seq_region using (seq_region_id)
join coord_system using (coord_system_id)
where species_id=? and code='xref_id' and db_name in ('PUBMED')/,
      -PARAMS => [$species_id] ) );

  # add aliases
  $md->organism()->aliases(
    $helper->execute_simple(
      -SQL => q/select distinct meta_value from meta
where species_id=? and meta_key='species.alias'/,
      -PARAMS => [$species_id] ) );

  my $analyzer = $self->{annotation_analyzer};
  if ( defined $analyzer ) {
    my $logger = $self->{logger};

    # core annotation
    $logger->info( "Processing " . $species . " core annotation" );
    $md->annotations( $analyzer->analyze_annotation($dba) );

    # features; "|| {}" keeps the hash merges below safe should an analyzer
    # hand back nothing
    my $core_ali  = $analyzer->analyze_alignments($dba) || {};
    my $other_ali = {};
    $md->features( $analyzer->analyze_features($dba) );

    my $other_features = $dbas->{otherfeatures};
    if ( defined $other_features ) {
      $logger->info( "Processing " . $species . " otherfeatures annotation" );
      # shallow merge: otherfeatures entries win on duplicate top-level keys
      my %features = (
                  %{ $md->features() || {} },
                  %{ $analyzer->analyze_features($other_features) || {} } );
      $other_ali = $analyzer->analyze_alignments($other_features) || {};
      $size += get_dbsize($other_features);
      $md->features( \%features );
      $md->add_database( $other_features->dbc()->dbname() );
    }

    # variation
    my $variation = $dbas->{variation};
    if ( defined $variation ) {
      $logger->info( "Processing " . $species . " variation annotation" );
      $md->variations( $analyzer->analyze_variation($variation) );
      $size += get_dbsize($variation);
      $md->add_database( $variation->dbc()->dbname() );
    }

    # funcgen: only the database name is recorded
    my $funcgen = $dbas->{funcgen};
    if ( defined $funcgen ) {
      $logger->info( "Processing " . $species . " funcgen annotation" );
      $md->add_database( $funcgen->dbc()->dbname() );
    }

    # BAM
    $logger->info( "Processing " . $species . " read aligments" );
    my $read_ali = $analyzer->analyze_tracks( $md->name(), $md->division() );
    # shallow merge: otherfeatures alignments win on duplicate top-level keys
    my %all_ali = ( %{$core_ali}, %{$other_ali} );

    # add bam tracks by count - use source name
    for my $bam ( @{ $read_ali->{bam} } ) {
      $all_ali{bam}{ $bam->{id} }++;
    }
    $md->other_alignments( \%all_ali );

    # total size of the core database plus any otherfeatures/variation ones
    $md->db_size($size);
  } ## end if ( defined $analyzer)

  $dba->dbc()->disconnect_if_idle();
  return $md;
} ## end sub process_genome