Skip to content

process

ensembl.io.genomio.gff3.process

Simplify and fix a GFF3 file and returns both a cleaned up GFF3 file and a functional annotation JSON file.

main()

Main script entry-point.

Source code in src/python/ensembl/io/genomio/gff3/process.py
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
def main() -> None:
    """Main script entry-point."""
    parser = ArgumentParser(
        description=(
            "Standardize the gene model representation of a GFF3 file, and extract the functional "
            "annotation in a separate file."
        )
    )
    parser.add_argument_src_path("--in_gff_path", required=True, help="Input GFF3 file")
    parser.add_argument_src_path("--genome_data", required=True, help="Genome JSON file")
    parser.add_argument(
        "--fail_missing_stable_ids", action="store_true", help="Do not generate IDs when missing/invalid"
    )
    parser.add_argument_dst_path("--out_gff_path", default=Path("gene_models.gff3"), help="Output GFF3 file")
    parser.add_argument_dst_path(
        "--out_func_path",
        default=Path("functional_annotation.json"),
        help="Output functional annotation JSON file",
    )
    parser.add_argument("--version", action="version", version=ensembl.io.genomio.__version__)
    parser.add_log_arguments(add_log_file=True)
    args = parser.parse_args()
    init_logging_with_args(args)

    # Merge multiline gene features in a separate file
    logging.info("Checking for genes to merge...")
    interim_gff_path = Path(f"{args.in_gff_path}_INTERIM_MERGE")
    merger = GFFGeneMerger()
    merged_genes = merger.merge(args.in_gff_path, interim_gff_path)
    num_merged_genes = len(merged_genes)
    in_gff_path = args.in_gff_path
    # If there are split genes, decide to merge, or just die
    if num_merged_genes > 0:
        # Report the list of merged genes in case something does not look right
        logging.info(f"{num_merged_genes} genes merged")
        logging.debug("\n".join(merged_genes))
        # Use the GFF with the merged genes for the next part
        in_gff_path = interim_gff_path

    # Load GFF3 data and write a simpler version that follows our specifications as well as a
    # functional annotation JSON file
    logging.info("Simplify and fix GFF3")
    gff_data = GFFSimplifier(args.genome_data)
    if args.fail_missing_stable_ids:
        gff_data.stable_ids.make_missing_stable_ids = False
    gff_data.simpler_gff3(in_gff_path)
    gff_data.records.to_gff(args.out_gff_path)
    gff_data.annotations.to_json(args.out_func_path)