Skip to content

update_description

ensembl.io.genomio.annotation.update_description

Update descriptions from a functional annotation file into a core database.

FEAT_TABLE = {'gene': 'gene', 'mobile_element': 'gene', 'transcript': 'transcript'} module-attribute

FeatStruct = Tuple[str, str, str] module-attribute

get_core_data(session, table, match_xrefs=False)

Returns the table descriptions from a core database.

Parameters:

Name Type Description Default
session Session

Session open on a core database.

required
table str

"gene" or "transcript" table from the core database.

required
match_xrefs bool

If the IDs do not match, try to match an Xref ID instead.

False
Source code in src/python/ensembl/io/genomio/annotation/update_description.py
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
def get_core_data(session: Session, table: str, match_xrefs: bool = False) -> Dict[str, FeatStruct]:
    """Returns the table descriptions from a core database.

    Every feature is indexed by its lower-cased stable ID. When `match_xrefs` is set, a feature is
    also indexed by the lower-cased accession of each of its xrefs, but only as a fallback: an xref
    accession never replaces an entry that is keyed by a real stable ID.

    Args:
        session: Session open on a core database.
        table: "gene" or "transcript" table from the core database.
        match_xrefs: If the IDs do not match, try to match an Xref ID instead.

    Returns:
        A dict mapping each lower-cased identifier to a tuple of
        (internal feature ID, stable ID, description).

    Raises:
        ValueError: If `table` is neither "gene" nor "transcript".
    """
    if table == "gene":
        model, id_column = Gene, Gene.gene_id
    elif table == "transcript":
        model, id_column = Transcript, Transcript.transcript_id
    else:
        raise ValueError(f"Table {table} is not supported")

    # Outer joins keep the features that have no xref at all (their accession comes back empty).
    # A feature with several xrefs yields one row per xref.
    stmt = (
        select(id_column, model.stable_id, model.description, Xref.dbprimary_acc)
        .select_from(model)
        .outerjoin(
            ObjectXref,
            and_(id_column == ObjectXref.ensembl_id, ObjectXref.ensembl_object_type == table),
        )
        .outerjoin(Xref)
    )

    feat_data: Dict[str, FeatStruct] = {}
    xref_data: Dict[str, FeatStruct] = {}
    for feat_id, stable_id, desc, xref_name in session.execute(stmt):
        feat_struct: FeatStruct = (feat_id, stable_id, desc)
        # Guard against a feature without stable ID instead of failing on `None.lower()`
        # (NOTE(review): the column is presumably nullable - confirm against the schema).
        if stable_id:
            feat_data[stable_id.lower()] = feat_struct
        if match_xrefs and xref_name:
            xref_data[xref_name.lower()] = feat_struct

    # Xrefs are merged last and never overwrite: whatever the row order, an accession that happens
    # to be identical to another feature's stable ID must not hide that feature.
    for xref_key, feat_struct in xref_data.items():
        feat_data.setdefault(xref_key, feat_struct)

    return feat_data

load_descriptions(session, func_file, report=False, do_update=False, match_xrefs=True)

Loads gene and transcript descriptions into a core database.

Parameters:

Name Type Description Default
session Session

Session open on a core database.

required
func_file Path

JSON file with the annotation information.

required
report bool

Print the mapping of changes to perform in the standard output.

False
do_update bool

Actually update the core database.

False
match_xrefs bool

If the IDs do not match, try to match an Xref ID instead.

True
Source code in src/python/ensembl/io/genomio/annotation/update_description.py
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
def load_descriptions(
    session: Session,
    func_file: Path,
    report: bool = False,
    do_update: bool = False,
    match_xrefs: bool = True,
) -> None:
    """Loads gene and transcript descriptions into a core database.

    The annotations are read once from `func_file`, then the "gene" and "transcript" tables are
    processed one after the other: the annotations of that type are compared with the current
    content of the core database, the non-zero counters are logged, and the rows to change are
    written and committed only when `do_update` is set.

    Args:
        session: Session open on a core database.
        func_file: JSON file with the annotation information.
        report: Print the mapping of changes to perform in the standard output.
        do_update: Actually update the core database.
        match_xrefs: If the IDs do not match, try to match an Xref ID instead.
    """
    annotations = get_json(func_file)
    logging.info(f"{len(annotations)} annotations from {func_file}")

    stat_names = (
        "not_supported",
        "not_found",
        "same",
        "same_empty",
        "empty_but_xref",
        "to_update_replace",
        "to_update_remove",
    )
    for table, mapped_table in (("gene", Gene), ("transcript", Transcript)):
        logging.info(f"Checking {table} descriptions")
        table_annotations = [annot for annot in annotations if annot["object_type"] == table]
        logging.info(f"{len(table_annotations)} {table} annotations from {func_file}")
        core_data = get_core_data(session, table, match_xrefs)
        logging.info(f"Loaded {len(core_data)} {table} data")

        # Fresh counters for each table; they are handed to the comparison step and logged after
        # it, so that step is presumably the one incrementing them.
        stats = dict.fromkeys(stat_names, 0)

        # Only the features whose description has changed are kept
        changed_rows = _get_features_to_update(
            table,
            table_annotations,
            core_data,
            stats,
            report=report,
            do_update=do_update,
            match_xrefs=match_xrefs,
        )

        # Report the counters of this feature type, leaving out the ones that stayed at zero
        for stat_name, stat_count in stats.items():
            if stat_count != 0:
                logging.info(f"{stat_name} = {stat_count}")

        if not do_update:
            continue
        logging.info(f"Now updating {len(changed_rows)} rows...")
        session.bulk_update_mappings(mapped_table, changed_rows)
        session.commit()

main()

Main script entry-point.

Source code in src/python/ensembl/io/genomio/annotation/update_description.py
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
def main() -> None:
    """Main script entry-point.

    Parses the command line, sets up logging, then opens a session on the requested core database
    and loads the descriptions from the functional annotation file into it.
    """
    parser = ArgumentParser(description=__doc__)
    parser.add_server_arguments(include_database=True)
    parser.add_argument_src_path("--func_file", required=True, help="Input functional annotation JSON")
    parser.add_argument("--report", action="store_true", help="Show what change would be made")
    parser.add_argument("--update", action="store_true", help="Make the changes to the database")
    parser.add_argument(
        "--match_xrefs",
        action="store_true",
        help="Use xref IDs to match features if IDs do not work",
    )
    parser.add_argument("--version", action="version", version=ensembl.io.genomio.__version__)
    parser.add_log_arguments(add_log_file=True)

    args = parser.parse_args()
    init_logging_with_args(args)

    # The database is only modified when --update is given (forwarded as `do_update`)
    connection = DBConnection(args.url)
    with connection.session_scope() as session:
        load_descriptions(
            session,
            args.func_file,
            report=args.report,
            do_update=args.update,
            match_xrefs=args.match_xrefs,
        )