Skip to content

dump

ensembl.io.genomio.genome_metadata.dump

Generates a JSON file representing the genome metadata from a core database.

DEFAULT_FILTER = {'database': {'name': str}, 'added_seq': {'region_name': str}, 'annotation': {'provider_name': str, 'provider_url': str}, 'assembly': {'accession': str, 'date': str, 'name': str, 'provider_name': str, 'provider_url': str, 'version': int}, 'BRC4': {'organism_abbrev': str, 'component': str}, 'genebuild': {'id': str, 'method': str, 'method_display': str, 'start_date': str, 'version': str}, 'species': {'alias': str, 'annotation_source': str, 'display_name': str, 'division': str, 'production_name': str, 'scientific_name': str, 'strain': str, 'taxonomy_id': int}} module-attribute

check_assembly_refseq(gmeta_out)

Update the GCA accession to use GCF if it is from RefSeq.

Parameters:

Name Type Description Default
gmeta_out dict[str, Any]

Nested metadata key values from the core metadata table.

required
Source code in src/python/ensembl/io/genomio/genome_metadata/dump.py
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
def check_assembly_refseq(gmeta_out: dict[str, Any]) -> None:
    """Update the GCA accession to use GCF if it is from RefSeq.

    The assembly section of `gmeta_out` is modified in place. Nothing is changed when the assembly
    provider name is missing, when it is not "RefSeq", or when there is no accession to update.

    Args:
        gmeta_out: Nested metadata key values from the core metadata table.
    """
    assembly = gmeta_out.get("assembly", {})
    provider_name = assembly.get("provider_name")
    if not provider_name:
        # Adjacent string literals are used instead of a backslash continuation so the indentation
        # of the source code does not leak into the logged message
        logging.debug(
            "Meta filter update to RefSeq accession not done: user meta filter missing: "
            "'assembly.provider_name'"
        )
        return
    if provider_name != "RefSeq":
        logging.info(f"Meta check 'assembly is RefSeq': Asm provider = {provider_name}")
        return
    accession = assembly.get("accession")
    if not accession:
        # A user filter may keep the provider name but drop the accession: nothing to update then
        logging.debug("Meta update to RefSeq accession not done: 'assembly.accession' is missing")
        return
    assembly["accession"] = accession.replace("GCA", "GCF")
    logging.info("GCA accession updated to RefSeq GCF accession.")

check_assembly_version(genome_metadata)

Updates the assembly version of the genome metadata provided.

If the version meta key is not an integer or is not available, the assembly accession's version will be used instead.

Parameters:

Name Type Description Default
genome_metadata dict[str, Any]

Nested metadata key values from the core metadata table.

required

Raises:

Type Description
ValueError

If both version and the assembly accession's version are not integers or are missing.

Source code in src/python/ensembl/io/genomio/genome_metadata/dump.py
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
def check_assembly_version(genome_metadata: dict[str, Any]) -> None:
    """Updates the assembly version of the genome metadata provided.

    If the `version` meta key is not an integer or it is not available, the assembly accession's
    version (the part after the first ".") will be used instead. The assembly section is modified
    in place.

    Args:
        genome_metadata: Nested metadata key values from the core metadata table.

    Raises:
        KeyError: If there is no "assembly" section in the metadata.
        ValueError: If both `version` and the assembly accession's version are not integers or are missing.
    """
    assembly = genome_metadata["assembly"]
    version = assembly.get("version")
    # Check the version is an integer
    try:
        assembly["version"] = int(version)
    except (ValueError, TypeError) as exc:
        # Get the version from the assembly accession; a missing (or non-text) accession yields an
        # empty version so it is reported with the documented ValueError instead of a KeyError
        accession = assembly.get("accession")
        version = accession.partition(".")[2] if isinstance(accession, str) else ""
        try:
            assembly["version"] = int(version)
        except ValueError:
            raise ValueError(f"Assembly version is not an integer in {assembly}") from exc
        logging.info(f"Assembly version [v{version}] obtained from assembly accession ({accession}).")
    else:
        logging.info(f'Located version [v{assembly["version"]}] info from meta data.')

check_genebuild_version(genome_metadata)

Updates the genebuild version (if not present) from the genebuild ID, removing the latter.

Parameters:

Name Type Description Default
genome_metadata dict[str, Any]

Nested metadata key values from the core metadata table.

required

Raises:

Type Description
ValueError

If there is no genebuild version or ID available.

Source code in src/python/ensembl/io/genomio/genome_metadata/dump.py
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
def check_genebuild_version(genome_metadata: dict[str, Any]) -> None:
    """Ensures the genebuild section carries a version, falling back to the genebuild ID.

    When there is no "genebuild" section nothing is done. Otherwise, a missing `version` is filled in
    with the text form of `id`, and `id` is always removed afterwards. The metadata is modified in place.

    Args:
        genome_metadata: Nested metadata key values from the core metadata table.

    Raises:
        ValueError: If there is no genebuild version or ID available.
    """
    if "genebuild" not in genome_metadata:
        return
    build_info = genome_metadata["genebuild"]
    if "version" not in build_info:
        if "id" not in build_info:
            raise ValueError("No genebuild version or ID found")
        build_info["version"] = str(build_info["id"])
    # The ID is redundant once a version is available
    build_info.pop("id", None)

convert_dict(meta_dict)

Converts a dictionary loaded from JSON, whose values are type names given as strings, into one whose values are the corresponding Python types.

Parameters:

Name Type Description Default
meta_dict dict

User meta dictionary with literal string typing to be converted.

required
Source code in src/python/ensembl/io/genomio/genome_metadata/dump.py
232
233
234
235
236
237
238
239
240
241
242
243
244
def convert_dict(meta_dict: dict) -> dict:
    """Returns a copy of the dictionary where every type name has been replaced by the object it names.

    Nested dictionaries are converted recursively; any other value is resolved with `pydoc.locate()`.
    The input dictionary is left untouched.

    Args:
        meta_dict: User meta dictionary with literal string typing to be converted.
    """

    def _resolve(entry):
        # Sub-dictionaries are walked recursively, leaves are looked up by name
        return convert_dict(entry) if isinstance(entry, dict) else locate(entry)

    converted = meta_dict.copy()
    converted.update((name, _resolve(entry)) for name, entry in meta_dict.items())
    return converted

filter_genome_meta(genome_metadata, metafilter, meta_update)

Returns a filtered metadata dictionary with only the keys defined in the provided filter, or in DEFAULT_FILTER when no filter is given.

Also converts to expected data types (to follow the genome JSON schema).

Parameters:

Name Type Description Default
genome_metadata dict[str, Any]

Nested metadata key values from the core metadata table.

required
metafilter dict | None

Input JSON containing subset of meta table values to filter on.

required
meta_update bool

Activates the additional assembly and genebuild metadata checks and updates.

required
Source code in src/python/ensembl/io/genomio/genome_metadata/dump.py
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
def filter_genome_meta(
    genome_metadata: dict[str, Any], metafilter: dict | None, meta_update: bool
) -> dict[str, Any]:
    """Returns a filtered metadata dictionary with only the predefined keys in METADATA_FILTER.

    Also converts to expected data types (to follow the genome JSON schema).

    Args:
        genome_metadata: Nested metadata key values from the core metadata table.
        metafilter: Input JSON containing subset of meta table values to filter on.
        meta_update: Deactivates additional meta updating.

    """
    filtered_metadata: dict[str, Any] = {}

    if metafilter:
        metadata_filter: dict[str, dict[str, type]] = metafilter
    else:
        metadata_filter = DEFAULT_FILTER

    for key, subfilter in metadata_filter.items():
        if key in genome_metadata:
            filtered_metadata[key] = {}
            for subkey, value_type in subfilter.items():
                if isinstance(value_type, str):
                    value_type = type(value_type)
                if isinstance(value_type, int):
                    value_type = type(value_type)
                if subkey in genome_metadata[key]:
                    value = genome_metadata[key][subkey]
                    if isinstance(value, list):
                        value = [value_type(x) for x in value]
                    else:
                        value = value_type(value)
                    filtered_metadata[key][subkey] = value

    # Optional assembly and genebuild based filtering:
    if meta_update:
        # Check assembly and genebuild versions
        check_assembly_refseq(filtered_metadata)
        check_assembly_version(filtered_metadata)
        check_genebuild_version(filtered_metadata)

    return filtered_metadata

get_genome_metadata(session, db_name)

Returns the meta table content from the core database in a nested dictionary.

Parameters:

Name Type Description Default
session Session

Session for the current core.

required
db_name str | None

Target database name

required
Source code in src/python/ensembl/io/genomio/genome_metadata/dump.py
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
def get_genome_metadata(session: Session, db_name: str | None) -> dict[str, Any]:
    """Loads every row of the meta table into a nested dictionary keyed by the two parts of the meta key.

    Each meta key is split at its first "."; the left part is the main key and the remainder the
    subkey. Values sharing the same key are gathered in a list, which is unwrapped when it only has
    one element. A main key that never had a subkey is mapped directly to its value.

    Args:
        session: Session for the current core.
        db_name: Target database name; if provided it is stored under `database.name`.

    Raises:
        ValueError: If a main key is found both with and without subkeys.
    """
    genome_metadata: dict[str, Any] = {}

    for row in session.execute(select(Meta)).unique().all():
        meta_row = row[0]
        # Keys without a "." get an empty string as subkey, so every entry has the same shape
        section_name, _, field_name = meta_row.meta_key.partition(".")
        genome_metadata.setdefault(section_name, {}).setdefault(field_name, []).append(meta_row.meta_value)

    if db_name:
        genome_metadata["database"] = {"name": f"{db_name}"}

    # Simplify the dictionary and check data consistency
    for section_name, fields in genome_metadata.items():
        # Single-value lists are replaced by the value they hold
        for field_name, content in fields.items():
            if len(content) == 1:
                fields[field_name] = content[0]
        if "" not in fields:
            continue
        # A section made only of the "" subkey is flattened into its value
        if len(fields) != 1:
            raise ValueError(f"Unexpected meta keys for '{section_name}': {', '.join(fields.keys())}")
        genome_metadata[section_name] = fields.pop("")
    return genome_metadata

main(arg_list=None)

Main script entry-point.

Parameters:

Name Type Description Default
arg_list list[str] | None

Arguments to parse passing list to parse_args().

None
Source code in src/python/ensembl/io/genomio/genome_metadata/dump.py
298
299
300
301
302
303
304
305
306
307
308
309
310
311
def main(arg_list: list[str] | None = None) -> None:
    """Script entry-point: prints the genome metadata of a core database as JSON.

    Args:
        arg_list: Arguments handed over to `parse_args()`.
    """
    args = parse_args(arg_list)
    init_logging_with_args(args)

    genome_meta = metadata_dump_setup(
        db_url=args.url,
        input_filter=args.metafilter,
        meta_update=args.meta_update,
        append_db=args.append_db,
    )

    # Sorted keys keep the printed output stable
    rendered = json.dumps(genome_meta, indent=2, sort_keys=True)
    print(rendered)

metadata_dump_setup(db_url, input_filter, meta_update, append_db)

Sets up the main stages of the genome meta dump from the user input arguments provided.

Parameters:

db_url: Target core database URL.

input_filter: Input JSON containing subset of meta table values to filter on.

meta_update: Activates the additional assembly and genebuild metadata checks and updates.

append_db: Append target core database name to output JSON.

Source code in src/python/ensembl/io/genomio/genome_metadata/dump.py
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
def metadata_dump_setup(
    db_url: URL, input_filter: StrPath | None, meta_update: bool, append_db: bool
) -> dict[str, Any]:
    """Runs the main stages of the genome meta dump from the user input arguments provided.

    Args:
        db_url: Target core database URL.
        input_filter: Input JSON containing subset of meta table values to filter on.
        meta_update: Forwarded to `filter_genome_meta()` to control the additional meta updating.
        append_db: Append target core database name to output JSON.

    """
    dbc = DBConnectionLite(db_url)
    db_name = db_url.database if append_db else None

    meta_filter = {}
    if input_filter:
        # Type names in the user filter are given as text and need converting
        meta_filter = convert_dict(get_json(input_filter))

    with dbc.session_scope() as session:
        raw_meta = get_genome_metadata(session, db_name)
        genome_meta = filter_genome_meta(raw_meta, meta_filter, meta_update)

    return genome_meta

parse_args(arg_list)

Return a populated namespace with the arguments parsed from a list or from the command line.

Parameters:

Name Type Description Default
arg_list list[str] | None

List of arguments to parse. If None, grab them from the command line.

required
Source code in src/python/ensembl/io/genomio/genome_metadata/dump.py
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
def parse_args(arg_list: list[str] | None) -> argparse.Namespace:
    """Builds the command-line parser of this script and returns the parsed arguments.

    Args:
        arg_list: List of arguments to parse. If `None`, grab them from the command line.

    """
    cli = ArgumentParser(description=__doc__)
    cli.add_server_arguments(include_database=True, help="server url and core database")
    cli.add_argument_src_path(
        "--metafilter",
        default=None,
        help="JSON file of nested meta_key:meta_value to filter dump output.",
    )
    # Boolean switches, registered in the order they are listed
    switches = (
        ("--meta_update", "Perform assembly and genebuild 'version' metadata checks & update if needed."),
        ("--append_db", "Append core database name to output JSON."),
    )
    for flag, description in switches:
        cli.add_argument(flag, action="store_true", help=description)
    cli.add_argument("--version", action="version", version=ensembl.io.genomio.__version__)
    cli.add_log_arguments(add_log_file=True)
    return cli.parse_args(arg_list)