Skip to content

prepare

ensembl.io.genomio.genome_metadata.prepare

Expand the genome metadata file by adding information about the provider, the taxonomy, and the assembly and gene build versions.

PROVIDER_DATA = {'GenBank': {'assembly': {'provider_name': 'GenBank', 'provider_url': 'https://www.ncbi.nlm.nih.gov/datasets/genome'}, 'annotation': {'provider_name': 'GenBank', 'provider_url': 'https://www.ncbi.nlm.nih.gov/datasets/genome'}}, 'RefSeq': {'assembly': {'provider_name': 'RefSeq', 'provider_url': 'https://www.ncbi.nlm.nih.gov/datasets/genome'}, 'annotation': {'provider_name': 'RefSeq', 'provider_url': 'https://www.ncbi.nlm.nih.gov/datasets/genome'}}} module-attribute

MetadataError

Bases: Exception

When a metadata value is not expected.

Source code in src/python/ensembl/io/genomio/genome_metadata/prepare.py
68
69
class MetadataError(Exception):
    """When a metadata value is not expected.

    Raised by `add_provider()` when the assembly accession starts with neither `GCF` nor `GCA`.
    """

MissingNodeError

Bases: Exception

When a taxon XML node cannot be found.

Source code in src/python/ensembl/io/genomio/genome_metadata/prepare.py
64
65
class MissingNodeError(Exception):
    """When a taxon XML node cannot be found.

    NOTE(review): no raise site is visible in this part of the module; confirm it is still in use.
    """

add_assembly_version(genome_data)

Adds version number to the genome's assembly information if one is not present already.

Parameters:

Name Type Description Default
genome_data Dict

Genome information of assembly, accession and annotation.

required
Source code in src/python/ensembl/io/genomio/genome_metadata/prepare.py
108
109
110
111
112
113
114
115
116
117
118
119
def add_assembly_version(genome_data: Dict) -> None:
    """Derives the assembly version from its accession when no version has been set.

    The version is whatever follows the first `"."` in the accession, converted to an integer. Nothing
    is changed if a `"version"` key already exists or if the accession has no version suffix.

    Args:
        genome_data: Genome information of assembly, accession and annotation.
    """
    assembly_info = genome_data["assembly"]
    # An explicit version always takes precedence over the one encoded in the accession
    if "version" in assembly_info:
        return

    _, _, suffix = assembly_info["accession"].partition(".")
    # Unversioned accession (no "." or nothing after it): leave the assembly untouched
    if not suffix:
        return

    assembly_info["version"] = int(suffix)

add_genebuild_metadata(genome_data)

Adds genebuild metadata to genome information if not present already.

The default convention is to use the current date as "version" and "start_date".

Parameters:

Name Type Description Default
genome_data Dict

Genome information of assembly, accession and annotation.

required
Source code in src/python/ensembl/io/genomio/genome_metadata/prepare.py
122
123
124
125
126
127
128
129
130
131
132
133
134
135
def add_genebuild_metadata(genome_data: Dict) -> None:
    """Fills in any missing genebuild `"version"` and `"start_date"` with today's date.

    A `"genebuild"` section is created if the genome information has none. Values already present
    are kept as they are; missing ones are set to the current date in ISO format.

    Args:
        genome_data: Genome information of assembly, accession and annotation.
    """
    today = datetime.date.today().isoformat()
    genebuild_info = genome_data.setdefault("genebuild", {})
    # setdefault() only writes the key when it is absent, so existing values are preserved
    for key in ("version", "start_date"):
        genebuild_info.setdefault(key, today)

add_provider(genome_metadata, ncbi_data)

Updates the genome metadata adding provider information for assembly and gene models.

Assembly provider metadata will only be added if it is missing, i.e. neither "provider_name" nor "provider_url" is present. The gene model (annotation) provider metadata will only be added, under the same condition, if ncbi_data contains an "annotation_info" entry.

Parameters:

Name Type Description Default
genome_metadata Dict

Genome information of assembly, accession and annotation.

required
ncbi_data Dict

Report data from NCBI datasets.

required

Raises:

Type Description
MetadataError

If the format of the accession in the genome metadata does not match a known provider.

Source code in src/python/ensembl/io/genomio/genome_metadata/prepare.py
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
def add_provider(genome_metadata: Dict, ncbi_data: Dict) -> None:
    """Fills in the provider name and URL for the assembly and, when applicable, the annotation.

    The provider (RefSeq or GenBank) is selected from the prefix of the assembly accession. Provider
    details are only written to a section when neither `"provider_name"` nor `"provider_url"` is
    already present in it. The annotation section is only considered (and created if absent) when
    `ncbi_data` has an `"annotation_info"` entry.

    Args:
        genome_metadata: Genome information of assembly, accession and annotation.
        ncbi_data: Report data from NCBI datasets.

    Raises:
        MetadataError: If the accession starts with neither `GCF` nor `GCA`.
    """
    accession = genome_metadata["assembly"]["accession"]

    # Map the accession prefix to its provider entry
    for prefix, source in (("GCF", "RefSeq"), ("GCA", "GenBank")):
        if accession.startswith(prefix):
            provider = PROVIDER_DATA[source]
            break
    else:
        raise MetadataError(f"Accession does not look like an INSDC or RefSeq accession: {accession}")

    # Assembly provider: leave untouched if any provider field has been supplied already
    assembly_info = genome_metadata["assembly"]
    if not ("provider_name" in assembly_info or "provider_url" in assembly_info):
        assembly_defaults = provider["assembly"]
        assembly_info["provider_name"] = assembly_defaults["provider_name"]
        assembly_info["provider_url"] = f'{assembly_defaults["provider_url"]}/{accession}'

    # Annotation provider: only relevant when the NCBI report describes gene models
    if "annotation_info" not in ncbi_data:
        return
    annotation_info = genome_metadata.setdefault("annotation", {})
    if not ("provider_name" in annotation_info or "provider_url" in annotation_info):
        annotation_defaults = provider["annotation"]
        annotation_info["provider_name"] = annotation_defaults["provider_name"]
        annotation_info["provider_url"] = f'{annotation_defaults["provider_url"]}/{accession}'

add_species_metadata(genome_metadata, ncbi_data)

Adds taxonomy ID, scientific name and strain (if present) from the NCBI dataset report.

Parameters:

Name Type Description Default
genome_metadata Dict

Genome information of assembly, accession and annotation.

required
ncbi_data Dict

Report data from NCBI datasets.

required
Source code in src/python/ensembl/io/genomio/genome_metadata/prepare.py
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
def add_species_metadata(genome_metadata: Dict, ncbi_data: Dict) -> None:
    """Copies taxonomy ID, scientific name and strain from the NCBI report into the species section.

    A `"species"` section is created if missing. Values already present in it are never overwritten,
    and nothing else is done if the report has no `"organism"` entry.

    Args:
        genome_metadata: Genome information of assembly, accession and annotation.
        ncbi_data: Report data from NCBI datasets.

    """
    species_info = genome_metadata.setdefault("species", {})
    try:
        organism = ncbi_data["organism"]
    except KeyError:
        return

    # (key in the NCBI organism record, key in the species metadata)
    direct_fields = (
        ("tax_id", "taxonomy_id"),
        ("organism_name", "scientific_name"),
    )
    for ncbi_key, species_key in direct_fields:
        if ncbi_key in organism:
            species_info.setdefault(species_key, organism[ncbi_key])

    # The strain is nested and optional: skip it when any level is missing
    try:
        strain = organism["infraspecific_names"]["strain"]
    except KeyError:
        return
    species_info.setdefault("strain", strain)

main()

Module's entry-point.

Source code in src/python/ensembl/io/genomio/genome_metadata/prepare.py
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
def main() -> None:
    """Module's entry-point.

    Parses the command-line arguments, sets up logging from them and runs `prepare_genome_metadata()`.
    """
    parser = ArgumentParser(description=__doc__)
    # Mandatory input/output locations
    parser.add_argument_src_path("--input_file", help="Genome metadata JSON file", required=True)
    parser.add_argument_dst_path(
        "--output_file", help="Output path for the new genome metadata file", required=True
    )
    parser.add_argument_src_path(
        "--ncbi_meta", help="JSON file from NCBI datasets for this genome.", required=True
    )
    # Generic options
    parser.add_argument("--version", version=ensembl.io.genomio.__version__, action="version")
    parser.add_log_arguments()

    cli_args = parser.parse_args()
    init_logging_with_args(cli_args)

    prepare_genome_metadata(
        input_file=cli_args.input_file,
        output_file=cli_args.output_file,
        ncbi_meta=cli_args.ncbi_meta,
    )

prepare_genome_metadata(input_file, output_file, ncbi_meta)

Updates the genome metadata JSON file with additional information.

In particular, more information is added about the provider, the assembly and its gene build version, and the taxonomy.

Parameters:

Name Type Description Default
input_file PathLike

Path to JSON file with genome metadata.

required
output_file PathLike

Output path where the updated genome metadata JSON file is written.

required
ncbi_meta PathLike

JSON file from NCBI datasets.

required
Source code in src/python/ensembl/io/genomio/genome_metadata/prepare.py
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
def prepare_genome_metadata(
    input_file: PathLike,
    output_file: PathLike,
    ncbi_meta: PathLike,
) -> None:
    """Loads the genome metadata, completes any missing information and writes the result.

    The provider, the assembly version, the gene build version and start date, and the species
    taxonomy are added where missing, using the first report in the NCBI datasets file when one is
    given.

    Args:
        input_file: Path to JSON file with genome metadata.
        output_file: Path passed to `print_json()` for the updated genome metadata.
        ncbi_meta: JSON file from NCBI datasets.

    """
    genome_data = get_json(input_file)
    # Only the first report of the NCBI datasets file is used; without a file, fall back to no data
    ncbi_data = get_json(ncbi_meta)["reports"][0] if ncbi_meta else {}

    # Complete each metadata section in turn (existing values are left untouched)
    add_provider(genome_data, ncbi_data)
    add_assembly_version(genome_data)
    add_genebuild_metadata(genome_data)
    add_species_metadata(genome_data, ncbi_data)

    # Write out the completed genome metadata
    print_json(output_file, genome_data)