Skip to content

genome_metadata

ensembl.io.genomio.genome_metadata

Genome metadata handling module.

PROVIDER_DATA = {'GenBank': {'assembly': {'provider_name': 'GenBank', 'provider_url': 'https://www.ncbi.nlm.nih.gov/datasets/genome'}, 'annotation': {'provider_name': 'GenBank', 'provider_url': 'https://www.ncbi.nlm.nih.gov/datasets/genome'}}, 'RefSeq': {'assembly': {'provider_name': 'RefSeq', 'provider_url': 'https://www.ncbi.nlm.nih.gov/datasets/genome'}, 'annotation': {'provider_name': 'RefSeq', 'provider_url': 'https://www.ncbi.nlm.nih.gov/datasets/genome'}}} module-attribute

MetadataError

Bases: Exception

When a metadata value is not expected.

Source code in src/python/ensembl/io/genomio/genome_metadata/prepare.py
68
69
class MetadataError(Exception):
    """Raised when a metadata value is not what is expected.

    `add_provider` raises it when the assembly accession starts with neither "GCA" nor "GCF".
    """

MissingNodeError

Bases: Exception

When a taxon XML node cannot be found.

Source code in src/python/ensembl/io/genomio/genome_metadata/prepare.py
64
65
class MissingNodeError(Exception):
    """Raised when a taxon XML node cannot be found."""

add_assembly_version(genome_data)

Adds version number to the genome's assembly information if one is not present already.

Parameters:

Name Type Description Default
genome_data Dict

Genome information of assembly, accession and annotation.

required
Source code in src/python/ensembl/io/genomio/genome_metadata/prepare.py
108
109
110
111
112
113
114
115
116
117
118
119
def add_assembly_version(genome_data: Dict) -> None:
    """Fills in the assembly version from the accession when no version is set yet.

    The version is the text following the first "." of the assembly accession, converted to an integer
    (e.g. "GCA_000001405.29" gives 29). The assembly is left untouched if it already has a "version" key
    or if nothing follows the "." in the accession.

    Args:
        genome_data: Genome information of assembly, accession and annotation. Updated in place.
    """
    assembly = genome_data["assembly"]
    if "version" in assembly:
        return
    _, _, suffix = assembly["accession"].partition(".")
    if suffix:
        assembly["version"] = int(suffix)

add_genebuild_metadata(genome_data)

Adds genebuild metadata to genome information if not present already.

The default convention is to use the current date as "version" and "start_date".

Parameters:

Name Type Description Default
genome_data Dict

Genome information of assembly, accession and annotation.

required
Source code in src/python/ensembl/io/genomio/genome_metadata/prepare.py
122
123
124
125
126
127
128
129
130
131
132
133
134
135
def add_genebuild_metadata(genome_data: Dict) -> None:
    """Ensures the genome information has genebuild "version" and "start_date" values.

    A "genebuild" section is created if there is none. Any of the two keys that is missing is set to
    the current date in ISO format; values already present are kept as they are.

    Args:
        genome_data: Genome information of assembly, accession and annotation. Updated in place.
    """
    genebuild = genome_data.setdefault("genebuild", {})
    today = datetime.date.today().isoformat()
    for field in ("version", "start_date"):
        genebuild.setdefault(field, today)

add_provider(genome_metadata, ncbi_data)

Updates the genome metadata adding provider information for assembly and gene models.

Assembly provider metadata will only be added if it is missing, i.e. neither "provider_name" nor "provider_url" is present. The gene model provider metadata will only be added if ncbi_data contains an "annotation_info" entry.

Parameters:

Name Type Description Default
genome_metadata Dict

Genome information of assembly, accession and annotation.

required
ncbi_data Dict

Report data from NCBI datasets.

required

Raises:

Type Description
MetadataError

If accession's format in genome metadata does not match with a known provider.

Source code in src/python/ensembl/io/genomio/genome_metadata/prepare.py
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
def add_provider(genome_metadata: Dict, ncbi_data: Dict) -> None:
    """Adds provider information for the assembly and the gene models to the genome metadata.

    The provider is chosen from the assembly accession prefix: "GCF" for RefSeq, "GCA" for GenBank.
    Provider name and URL are only written to a section when that section has neither
    `"provider_name"` nor `"provider_url"`. The annotation section is only handled when `ncbi_data`
    has an `"annotation_info"` key.

    Args:
        genome_metadata: Genome information of assembly, accession and annotation. Updated in place.
        ncbi_data: Report data from NCBI datasets.

    Raises:
        MetadataError: If accession's format in genome metadata does not match with a known provider.
    """
    assembly = genome_metadata["assembly"]
    accession = assembly["accession"]

    # Select the provider from the accession prefix
    if accession.startswith("GCF"):
        provider_key = "RefSeq"
    elif accession.startswith("GCA"):
        provider_key = "GenBank"
    else:
        raise MetadataError(f"Accession does not look like an INSDC or RefSeq accession: {accession}")
    provider = PROVIDER_DATA[provider_key]

    # Assembly provider: only filled in when no provider key is present at all
    if not ("provider_name" in assembly or "provider_url" in assembly):
        assembly_provider = provider["assembly"]
        assembly["provider_name"] = assembly_provider["provider_name"]
        assembly["provider_url"] = f'{assembly_provider["provider_url"]}/{accession}'

    # Annotation provider: only relevant when the NCBI report describes an annotation
    if "annotation_info" not in ncbi_data:
        return
    annotation = genome_metadata.setdefault("annotation", {})
    if not ("provider_name" in annotation or "provider_url" in annotation):
        annotation_provider = provider["annotation"]
        annotation["provider_name"] = annotation_provider["provider_name"]
        annotation["provider_url"] = f'{annotation_provider["provider_url"]}/{accession}'

add_species_metadata(genome_metadata, ncbi_data)

Adds taxonomy ID, scientific name and strain (if present) from the NCBI dataset report.

Parameters:

Name Type Description Default
genome_metadata Dict

Genome information of assembly, accession and annotation.

required
ncbi_data Dict

Report data from NCBI datasets.

required
Source code in src/python/ensembl/io/genomio/genome_metadata/prepare.py
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
def add_species_metadata(genome_metadata: Dict, ncbi_data: Dict) -> None:
    """Copies taxonomy ID, scientific name and strain from the NCBI dataset report, when available.

    A "species" section is always created if missing. Values already present in that section are
    never overwritten. Nothing else is done if the report has no "organism" entry.

    Args:
        genome_metadata: Genome information of assembly, accession and annotation. Updated in place.
        ncbi_data: Report data from NCBI datasets.

    """
    species = genome_metadata.setdefault("species", {})
    try:
        organism = ncbi_data["organism"]
    except KeyError:
        return

    # (key in the NCBI report, key in the genome metadata)
    key_mapping = (("tax_id", "taxonomy_id"), ("organism_name", "scientific_name"))
    for ncbi_key, species_key in key_mapping:
        if ncbi_key in organism:
            species.setdefault(species_key, organism[ncbi_key])

    # The strain is optional and nested one level deeper
    try:
        strain = organism["infraspecific_names"]["strain"]
    except KeyError:
        return
    species.setdefault("strain", strain)

amend_genome_metadata(genome_infile, genome_outfile, report_file=None, genbank_file=None)

Parameters:

Name Type Description Default
genome_infile PathLike

Genome metadata following the src/python/ensembl/io/genomio/data/schemas/genome.json.

required
genome_outfile PathLike

Amended genome metadata file.

required
report_file Optional[PathLike]

INSDC/RefSeq sequences report file.

None
genbank_file Optional[PathLike]

INSDC/RefSeq GBFF file.

None
Source code in src/python/ensembl/io/genomio/genome_metadata/extend.py
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
def amend_genome_metadata(
    genome_infile: PathLike,
    genome_outfile: PathLike,
    report_file: Optional[PathLike] = None,
    genbank_file: Optional[PathLike] = None,
) -> None:
    """Writes the genome metadata, adding the sequence regions found in the report but not in the GBFF data.

    The additional regions are only looked up when `report_file` is provided; if there are any, they
    are stored under `"added_seq"` / `"region_name"`. The metadata is written out in every case.

    Args:
        genome_infile: Genome metadata following the `src/python/ensembl/io/genomio/data/schemas/genome.json`.
        genome_outfile: Amended genome metadata file.
        report_file: INSDC/RefSeq sequences report file.
        genbank_file: INSDC/RefSeq GBFF file.
    """
    metadata = get_json(genome_infile)
    if report_file:
        gbff_path = None
        if genbank_file:
            gbff_path = Path(genbank_file)
        # Sequences in the assembly report that are absent from the GBFF data
        extra_regions = get_additions(report_file, gbff_path)
        if extra_regions:
            metadata["added_seq"] = {"region_name": extra_regions}
    print_json(Path(genome_outfile), metadata)

check_assembly_version(genome_metadata)

Updates the assembly version of the genome metadata provided.

If the version meta key is not an integer or is not available, the assembly accession's version will be used instead.

Parameters:

Name Type Description Default
genome_metadata dict[str, Any]

Nested metadata key values from the core metadata table.

required

Raises:

Type Description
ValueError

If both version and the assembly accession's version are not integers or are missing.

Source code in src/python/ensembl/io/genomio/genome_metadata/dump.py
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
def check_assembly_version(genome_metadata: dict[str, Any]) -> None:
    """Updates the assembly version of the genome metadata provided.

    If `version` meta key is not an integer or it is not available, the assembly accession's version
    (the text after the first "." of the accession) will be used instead.

    Args:
        genome_metadata: Nested metadata key values from the core metadata table. Its "assembly"
            section is updated in place.

    Raises:
        ValueError: If both `version` and the assembly accession's version are not integers or are missing.
    """
    assembly = genome_metadata["assembly"]
    version = assembly.get("version")
    # Check the version is an integer
    try:
        assembly["version"] = int(version)
    except (ValueError, TypeError) as exc:
        # Get the version from the assembly accession. A missing accession is treated like an
        # accession without version so that the documented ValueError is raised, not a KeyError.
        accession = assembly.get("accession", "")
        version = accession.partition(".")[2]
        try:
            assembly["version"] = int(version)
        except ValueError:
            raise ValueError(f"Assembly version is not an integer in {assembly}") from exc
        logging.info(f"Assembly version [v{version}] obtained from assembly accession ({accession}).")
    else:
        logging.info(f'Located version [v{assembly["version"]}] info from meta data.')

check_genebuild_version(genome_metadata)

Updates the genebuild version (if not present) from the genebuild ID, removing the latter.

Parameters:

Name Type Description Default
genome_metadata dict[str, Any]

Nested metadata key values from the core metadata table.

required

Raises:

Type Description
ValueError

If there is no genebuild version or ID available.

Source code in src/python/ensembl/io/genomio/genome_metadata/dump.py
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
def check_genebuild_version(genome_metadata: dict[str, Any]) -> None:
    """Makes sure the genebuild section has a version, taken from the genebuild ID if needed.

    Nothing is done when there is no "genebuild" section. Otherwise a missing "version" is set to the
    string form of "id", and "id" is always removed afterwards.

    Args:
        genome_metadata: Nested metadata key values from the core metadata table. Updated in place.

    Raises:
        ValueError: If there is no genebuild version or ID available.
    """
    if "genebuild" not in genome_metadata:
        return
    genebuild = genome_metadata["genebuild"]
    if "version" not in genebuild:
        try:
            fallback_id = genebuild["id"]
        except KeyError:
            # pylint: disable=raise-missing-from
            raise ValueError("No genebuild version or ID found")
        genebuild["version"] = str(fallback_id)
    # The ID is not kept once a genebuild version is available
    genebuild.pop("id", None)

filter_genome_meta(genome_metadata, metafilter, meta_update)

Returns a filtered metadata dictionary with only the predefined keys in METADATA_FILTER.

Also converts to expected data types (to follow the genome JSON schema).

Parameters:

Name Type Description Default
genome_metadata dict[str, Any]

Nested metadata key values from the core metadata table.

required
metafilter dict | None

Input JSON containing subset of meta table values to filter on.

required
meta_update bool

Whether to run the additional assembly and genebuild version checks on the filtered metadata.

required
Source code in src/python/ensembl/io/genomio/genome_metadata/dump.py
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
def filter_genome_meta(
    genome_metadata: dict[str, Any], metafilter: dict | None, meta_update: bool
) -> dict[str, Any]:
    """Returns a filtered metadata dictionary with only the predefined keys in METADATA_FILTER.

    Also converts to expected data types (to follow the genome JSON schema).

    Args:
        genome_metadata: Nested metadata key values from the core metadata table.
        metafilter: Input JSON containing subset of meta table values to filter on.
        meta_update: Deactivates additional meta updating.

    """
    filtered_metadata: dict[str, Any] = {}

    if metafilter:
        metadata_filter: dict[str, dict[str, type]] = metafilter
    else:
        metadata_filter = DEFAULT_FILTER

    for key, subfilter in metadata_filter.items():
        if key in genome_metadata:
            filtered_metadata[key] = {}
            for subkey, value_type in subfilter.items():
                if isinstance(value_type, str):
                    value_type = type(value_type)
                if isinstance(value_type, int):
                    value_type = type(value_type)
                if subkey in genome_metadata[key]:
                    value = genome_metadata[key][subkey]
                    if isinstance(value, list):
                        value = [value_type(x) for x in value]
                    else:
                        value = value_type(value)
                    filtered_metadata[key][subkey] = value

    # Optional assembly and genebuild based filtering:
    if meta_update:
        # Check assembly and genebuild versions
        check_assembly_refseq(filtered_metadata)
        check_assembly_version(filtered_metadata)
        check_genebuild_version(filtered_metadata)

    return filtered_metadata

get_additions(report_path, gbff_path)

Returns all seq_regions that are mentioned in the report but that are not in the data.

Parameters:

Name Type Description Default
report_path PathLike

Path to the report file.

required
gbff_path Optional[PathLike]

Path to the GBFF file.

required
Source code in src/python/ensembl/io/genomio/genome_metadata/extend.py
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
def get_additions(report_path: PathLike, gbff_path: Optional[PathLike]) -> List[str]:
    """Returns the sorted `seq_regions` that are listed in the report but absent from the GBFF data.

    A region counts as absent when neither its GenBank name nor its RefSeq name is among the GBFF
    regions. The RefSeq name is reported when there is one, the GenBank name otherwise.

    Args:
        report_path: Path to the report file.
        gbff_path: Path to the GBFF file.
    """
    known_regions = set(get_gbff_regions(gbff_path))
    missing = [
        refseq_name or genbank_name
        for genbank_name, refseq_name in get_report_regions_names(report_path)
        if genbank_name not in known_regions and refseq_name not in known_regions
    ]
    return sorted(missing)

get_gbff_regions(gbff_path)

Returns the seq_region data from a GBFF file.

Parameters:

Name Type Description Default
gbff_path Optional[PathLike]

GBFF file path to use.

required
Source code in src/python/ensembl/io/genomio/genome_metadata/extend.py
63
64
65
66
67
68
69
70
71
72
73
74
75
def get_gbff_regions(gbff_path: Optional[PathLike]) -> List[str]:
    """Returns the `seq_region` names found in a GBFF file, one per record.

    Each name is the record ID with whatever matches `_VERSION_END` removed. An empty list is
    returned when no GBFF path is given.

    Args:
        gbff_path: GBFF file path to use.
    """
    if not gbff_path:
        return []
    with open_gz_file(gbff_path) as gbff_file:
        return [
            re.sub(_VERSION_END, "", record.id)
            for record in SeqIO.parse(gbff_file, "genbank")
        ]

get_genome_metadata(session, db_name)

Returns the meta table content from the core database in a nested dictionary.

Parameters:

Name Type Description Default
session Session

Session for the current core.

required
db_name str | None

Target database name

required
Source code in src/python/ensembl/io/genomio/genome_metadata/dump.py
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
def get_genome_metadata(session: Session, db_name: str | None) -> dict[str, Any]:
    """Returns the meta table content from the core database in a nested dictionary.

    Each meta key is split at its first "." into a main key and a subkey (an empty subkey when there
    is no "."). Values sharing the same main key and subkey are collected in a list; lists with one
    element are replaced by that element, and a main key that only has the empty subkey maps directly
    to its value.

    Args:
        session: Session for the current core.
        db_name: Target database name, stored under `"database"` / `"name"` when provided.

    Raises:
        ValueError: If a main key is found both without subkey and with at least one subkey.
    """
    genome_metadata: dict[str, Any] = {}

    for row in session.execute(select(Meta)).unique().all():
        meta = row[0]
        main_key, _, subkey = meta.meta_key.partition(".")
        # Keys without "." end up under the empty subkey, which is flattened further down
        genome_metadata.setdefault(main_key, {}).setdefault(subkey, []).append(meta.meta_value)

    if db_name:
        genome_metadata["database"] = {"name": f"{db_name}"}

    # Simplify the nested dictionary and check data consistency
    for main_key, subkeys_dict in genome_metadata.items():
        # Unwrap the lists that hold a single value
        for subkey, value in subkeys_dict.items():
            if len(value) == 1:
                subkeys_dict[subkey] = value[0]
        if "" not in subkeys_dict:
            continue
        # The empty subkey is only valid on its own: its value then replaces the nested dictionary
        if len(subkeys_dict) != 1:
            raise ValueError(f"Unexpected meta keys for '{main_key}': {', '.join(subkeys_dict.keys())}")
        genome_metadata[main_key] = subkeys_dict.pop("")
    return genome_metadata

get_report_regions_names(report_path)

Returns a list of GenBank-RefSeq seq_region names from the assembly report file.

Parameters:

Name Type Description Default
report_path PathLike

Path to the assembly report file from INSDC/RefSeq.

required
Source code in src/python/ensembl/io/genomio/genome_metadata/extend.py
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
def get_report_regions_names(report_path: PathLike) -> List[Tuple[str, str]]:
    """Returns the (GenBank, RefSeq) `seq_region` name pairs listed in the assembly report file.

    Names are read from the "GenBank-Accn" and "RefSeq-Accn" columns. A name given as "na" becomes an
    empty string, and whatever matches `_VERSION_END` is removed from the others.

    Args:
        report_path: Path to the assembly report file from INSDC/RefSeq.
    """
    # The report is first converted to tab-separated text, which the CSV reader can parse directly
    report_csv, _ = _report_to_csv(report_path)
    rows = csv.DictReader(report_csv.splitlines(), delimiter="\t", quoting=csv.QUOTE_NONE)

    seq_regions = []
    for row in rows:
        raw_names = (row["RefSeq-Accn"], row["GenBank-Accn"])
        refseq_name, genbank_name = (
            re.sub(_VERSION_END, "", "" if raw_name == "na" else raw_name) for raw_name in raw_names
        )
        seq_regions.append((genbank_name, refseq_name))
    return seq_regions

metadata_dump_setup(db_url, input_filter, meta_update, append_db)

Sets up the main stages of the genome metadata dump from the user input arguments provided.

Parameters:

- db_url: Target core database URL.
- input_filter: Input JSON containing the subset of meta table values to filter on.
- meta_update: Whether to run the additional assembly and genebuild version checks on the filtered metadata.
- append_db: Append the target core database name to the output JSON.

Source code in src/python/ensembl/io/genomio/genome_metadata/dump.py
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
def metadata_dump_setup(
    db_url: URL, input_filter: StrPath | None, meta_update: bool, append_db: bool
) -> dict[str, Any]:
    """Runs the main stages of the genome meta dump from the user input arguments provided.

    The meta table of the core database is loaded and then filtered, either with the filter read from
    `input_filter` or, when none is given, with an empty filter passed on to `filter_genome_meta`.

    Args:
        db_url: Target core database URL.
        input_filter: Input JSON containing subset of meta table values to filter on.
        meta_update: Passed on to `filter_genome_meta`, where it enables the additional assembly and
            genebuild checks.
        append_db: Append target core database name to output JSON.

    Returns:
        The filtered genome metadata.
    """
    dbc = DBConnectionLite(db_url)
    db_name = db_url.database if append_db else None
    meta_filter = convert_dict(get_json(input_filter)) if input_filter else {}

    with dbc.session_scope() as session:
        raw_meta = get_genome_metadata(session, db_name)
        filtered_meta = filter_genome_meta(raw_meta, meta_filter, meta_update)

    return filtered_meta

prepare_genome_metadata(input_file, output_file, ncbi_meta)

Updates the genome metadata JSON file with additional information.

In particular, more information is added about the provider, the assembly and its gene build version, and the taxonomy.

Parameters:

Name Type Description Default
input_file PathLike

Path to JSON file with genome metadata.

required
output_file PathLike

Output directory where to generate the final genome.json file.

required
ncbi_meta PathLike

JSON file from NCBI datasets.

required
Source code in src/python/ensembl/io/genomio/genome_metadata/prepare.py
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
def prepare_genome_metadata(
    input_file: PathLike,
    output_file: PathLike,
    ncbi_meta: PathLike,
) -> None:
    """Completes the genome metadata from a JSON file and writes out the result.

    Missing information is added about the provider, the assembly version, the genebuild and the
    species. The NCBI data used for it is the first entry under `"reports"` in `ncbi_meta`, or an
    empty dictionary when `ncbi_meta` is not set.

    Args:
        input_file: Path to JSON file with genome metadata.
        output_file: Destination handed to `print_json` for the updated genome metadata.
        ncbi_meta: JSON file from NCBI datasets.

    """
    genome_data = get_json(input_file)
    ncbi_data = get_json(ncbi_meta)["reports"][0] if ncbi_meta else {}

    # Fill in whatever is missing, section by section
    add_provider(genome_data, ncbi_data)
    add_assembly_version(genome_data)
    add_genebuild_metadata(genome_data)
    add_species_metadata(genome_data, ncbi_data)

    # Dump updated genome metadata
    print_json(output_file, genome_data)