Skip to content

extend

ensembl.io.genomio.genome_metadata.extend

Update a genome metadata file to include additional sequence regions (e.g. MT chromosome).

amend_genome_metadata(genome_infile, genome_outfile, report_file=None, genbank_file=None)

Parameters:

Name Type Description Default
genome_infile PathLike

Genome metadata following the src/python/ensembl/io/genomio/data/schemas/genome.json.

required
genome_outfile PathLike

Amended genome metadata file.

required
report_file Optional[PathLike]

INSDC/RefSeq sequences report file.

None
genbank_file Optional[PathLike]

INSDC/RefSeq GBFF file.

None
Source code in src/python/ensembl/io/genomio/genome_metadata/extend.py
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
def amend_genome_metadata(
    genome_infile: PathLike,
    genome_outfile: PathLike,
    report_file: Optional[PathLike] = None,
    genbank_file: Optional[PathLike] = None,
) -> None:
    """
    Args:
        genome_infile: Genome metadata following the `src/python/ensembl/io/genomio/data/schemas/genome.json`.
        genome_outfile: Amended genome metadata file.
        report_file: INSDC/RefSeq sequences report file.
        genbank_file: INSDC/RefSeq GBFF file.
    """
    genome_metadata = get_json(genome_infile)
    # Get additional sequences in the assembly but not in the data
    if report_file:
        genbank_path = Path(genbank_file) if genbank_file else None
        additions = get_additions(report_file, genbank_path)
        if additions:
            genome_metadata["added_seq"] = {"region_name": additions}
    # Print out the file
    genome_outfile = Path(genome_outfile)
    print_json(genome_outfile, genome_metadata)

get_additions(report_path, gbff_path)

Returns all seq_regions that are mentioned in the report but that are not in the data.

Parameters:

Name Type Description Default
report_path PathLike

Path to the report file.

required
gbff_path Optional[PathLike]

Path to the GBFF file.

required
Source code in src/python/ensembl/io/genomio/genome_metadata/extend.py
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
def get_additions(report_path: PathLike, gbff_path: Optional[PathLike]) -> List[str]:
    """Returns all `seq_regions` that are mentioned in the report but that are not in the data.

    Args:
        report_path: Path to the report file.
        gbff_path: Path to the GBFF file.
    """
    gbff_regions = set(get_gbff_regions(gbff_path))
    report_regions = get_report_regions_names(report_path)
    additions = []
    for seq_region_name in report_regions:
        (genbank_seq_name, refseq_seq_name) = seq_region_name
        if genbank_seq_name not in gbff_regions and refseq_seq_name not in gbff_regions:
            if refseq_seq_name:
                additions.append(refseq_seq_name)
            else:
                additions.append(genbank_seq_name)
    additions = sorted(additions)
    return additions

get_gbff_regions(gbff_path)

Returns the seq_region data from a GBFF file.

Parameters:

Name Type Description Default
gbff_path Optional[PathLike]

GBFF file path to use.

required
Source code in src/python/ensembl/io/genomio/genome_metadata/extend.py
63
64
65
66
67
68
69
70
71
72
73
74
75
def get_gbff_regions(gbff_path: Optional[PathLike]) -> List[str]:
    """Returns the `seq_region` data from a GBFF file.

    Args:
        gbff_path: GBFF file path to use.
    """
    seq_regions = []
    if gbff_path:
        with open_gz_file(gbff_path) as gbff_file:
            for record in SeqIO.parse(gbff_file, "genbank"):
                record_id = re.sub(_VERSION_END, "", record.id)
                seq_regions.append(record_id)
    return seq_regions

get_report_regions_names(report_path)

Returns a list of GenBank-RefSeq seq_region names from the assembly report file.

Parameters:

Name Type Description Default
report_path PathLike

Path to the assembly report file from INSDC/RefSeq.

required
Source code in src/python/ensembl/io/genomio/genome_metadata/extend.py
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
def get_report_regions_names(report_path: PathLike) -> List[Tuple[str, str]]:
    """Returns a list of GenBank-RefSeq `seq_region` names from the assembly report file.

    Args:
        report_path: Path to the assembly report file from INSDC/RefSeq.
    """
    # Get the report in a CSV format, easier to manipulate
    report_csv, _ = _report_to_csv(report_path)
    # Feed the CSV string to the CSV reader
    reader = csv.DictReader(report_csv.splitlines(), delimiter="\t", quoting=csv.QUOTE_NONE)
    # Create the seq_regions
    seq_regions = []
    for row in reader:
        refseq_name = row["RefSeq-Accn"]
        genbank_name = row["GenBank-Accn"]
        if refseq_name == "na":
            refseq_name = ""
        if genbank_name == "na":
            genbank_name = ""
        refseq_name = re.sub(_VERSION_END, "", refseq_name)
        genbank_name = re.sub(_VERSION_END, "", genbank_name)
        seq_regions.append((genbank_name, refseq_name))
    return seq_regions

main()

Module's entry-point.

Source code in src/python/ensembl/io/genomio/genome_metadata/extend.py
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
def main() -> None:
    """Module's entry-point."""
    parser = ArgumentParser(description=__doc__)
    parser.add_argument_src_path(
        "--genome_infile",
        required=True,
        help="Input genome metadata file (following src/python/ensembl/io/genomio/data/schemas/genome.json)",
    )
    parser.add_argument_dst_path(
        "--genome_outfile", required=True, help="Path to the new amended genome metadata file"
    )
    parser.add_argument_src_path("--report_file", help="INSDC/RefSeq sequences report file")
    parser.add_argument_src_path("--genbank_file", help="INSDC/RefSeq GBFF file")
    parser.add_argument("--version", action="version", version=ensembl.io.genomio.__version__)
    parser.add_log_arguments()
    args = parser.parse_args()
    init_logging_with_args(args)

    amend_genome_metadata(
        genome_infile=args.genome_infile,
        genome_outfile=args.genome_outfile,
        report_file=args.report_file,
        genbank_file=args.genbank_file,
    )