Skip to content

prepare

ensembl.io.genomio.seq_region.prepare

Construct a seq_region metadata file from INSDC files.

main()

Module's entry-point.

Source code in src/python/ensembl/io/genomio/seq_region/prepare.py
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
def main() -> None:
    """Command-line entry point: parse the arguments and build the seq_region metadata file.

    The parsed options are forwarded unchanged to `prepare_seq_region_metadata()`.
    """
    cli = ArgumentParser(description="Construct a sequence region metadata file from INSDC files.")

    # Input files
    cli.add_argument_src_path(
        "--genome_file",
        required=True,
        help="Genome metadata JSON file",
    )
    cli.add_argument_src_path(
        "--report_file",
        required=True,
        help="INSDC/RefSeq sequences report file to parse",
    )
    cli.add_argument_src_path(
        "--gbff_file",
        help="INSDC/RefSeq GBFF file to parse",
    )

    # Output file
    cli.add_argument_dst_path(
        "--dst_file",
        default="seq_region.json",
        help="Output JSON file for the processed sequence regions",
    )

    # Behaviour switches
    cli.add_argument(
        "--to_exclude",
        nargs="*",
        metavar="SEQ_REGION_NAME",
        help="Sequence region names to exclude",
    )
    cli.add_argument(
        "--mock_run",
        action="store_true",
        help="Do not call external APIs",
    )
    cli.add_argument(
        "--version",
        action="version",
        version=ensembl.io.genomio.__version__,
    )
    cli.add_log_arguments()

    options = cli.parse_args()
    # Logging is configured from the parsed options before any processing starts
    init_logging_with_args(options)

    prepare_seq_region_metadata(
        options.genome_file,
        options.report_file,
        options.dst_file,
        gbff_file=options.gbff_file,
        to_exclude=options.to_exclude,
        mock_run=options.mock_run,
    )

prepare_seq_region_metadata(genome_file, report_file, dst_file, *, gbff_file=None, to_exclude=None, mock_run=False)

Prepares the sequence region metadata found in the INSDC/RefSeq report and GBFF files.

The sequence region information is loaded from both sources and combined. Elements are added/excluded as requested, and the final sequence region metadata is dumped in a JSON file that follows the schema defined in "src/python/ensembl/io/genomio/data/schemas/seq_region.json".

Parameters:

Name Type Description Default
genome_file StrPath

Genome metadata JSON file path.

required
report_file StrPath

INSDC/RefSeq sequences report file path to parse.

required
gbff_file StrPath | None

INSDC/RefSeq GBFF file path to parse.

None
dst_file StrPath

Output JSON file path for the processed sequence regions.

required
to_exclude list[str] | None

Sequence region names to exclude.

None
mock_run bool

Do not call the external taxonomy service.

False
Source code in src/python/ensembl/io/genomio/seq_region/prepare.py
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
def prepare_seq_region_metadata(
    genome_file: StrPath,
    report_file: StrPath,
    dst_file: StrPath,
    *,
    gbff_file: StrPath | None = None,
    to_exclude: list[str] | None = None,
    mock_run: bool = False,
) -> None:
    """Builds the sequence region metadata JSON from the INSDC/RefSeq report and GBFF files.

    Sequence regions are loaded from the report file and, when provided, from the GBFF file, and the
    information from both is combined. Requested sequence regions are then excluded, translation and
    mitochondrial codon tables are added, and the result is written as a JSON file that follows the
    schema defined in "src/python/ensembl/io/genomio/data/schemas/seq_region.json".

    Args:
        genome_file: Genome metadata JSON file path.
        report_file: INSDC/RefSeq sequences report file path to parse.
        dst_file: Output JSON file path for the processed sequence regions.
        gbff_file: INSDC/RefSeq GBFF file path to parse.
        to_exclude: Sequence region names to exclude.
        mock_run: Do not call external taxonomy service.

    """
    genome_meta = get_json(genome_file)
    output_path = Path(dst_file)

    # Assembly accessions starting with "GCF_" are handled as RefSeq when parsing the report
    accession = genome_meta["assembly"]["accession"]
    from_refseq = accession.startswith("GCF_")

    # Load the sequence regions: the report is mandatory, the GBFF is optional
    collection = SeqCollection(mock=mock_run)
    collection.from_report(Path(report_file), from_refseq)
    if gbff_file:
        collection.from_gbff(Path(gbff_file))

    # Drop any sequence region the caller asked to exclude
    if to_exclude:
        collection.remove(to_exclude)

    # Annotate with the translation table and the mitochondrial codon table
    collection.add_translation_table()
    taxonomy_id = genome_meta["species"]["taxonomy_id"]
    collection.add_mitochondrial_codon_table(taxonomy_id)

    # Dump the final list of sequence regions to the output file
    print_json(output_path, collection.to_list())