Skip to content

annotation

ensembl.io.genomio.annotation

Annotation files processing module.

get_core_data(session, table, match_xrefs=False)

Returns the table descriptions from a core database.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `session` | `Session` | Session open on a core database. | *required* |
| `table` | `str` | "gene" or "transcript" table from the core database. | *required* |
| `match_xrefs` | `bool` | If the IDs do not match, try to match an Xref ID instead. | `False` |
Source code in src/python/ensembl/io/genomio/annotation/update_description.py
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
def get_core_data(session: Session, table: str, match_xrefs: bool = False) -> Dict[str, FeatStruct]:
    """Returns the table descriptions from a core database.

    Every feature is indexed by its lower-cased stable ID. When `match_xrefs` is set, each feature is
    also indexed by the lower-cased primary accession of every xref attached to it. An xref key never
    replaces a stable ID key: xrefs are only a fallback for IDs that do not match any stable ID.

    Args:
        session: Session open on a core database.
        table: "gene" or "transcript" table from the core database.
        match_xrefs: If the IDs do not match, try to match an Xref ID instead.

    Returns:
        A dict mapping each lower-cased key to a tuple (internal ID, stable ID, description).

    Raises:
        ValueError: If `table` is neither "gene" nor "transcript".
    """

    if table == "gene":
        stmt = (
            select(Gene.gene_id, Gene.stable_id, Gene.description, Xref.dbprimary_acc)
            .select_from(Gene)
            .outerjoin(
                ObjectXref,
                and_(Gene.gene_id == ObjectXref.ensembl_id, ObjectXref.ensembl_object_type == "gene"),
            )
            .outerjoin(Xref)
        )
    elif table == "transcript":
        stmt = (
            select(Transcript.transcript_id, Transcript.stable_id, Transcript.description, Xref.dbprimary_acc)
            .select_from(Transcript)
            .outerjoin(
                ObjectXref,
                and_(
                    Transcript.transcript_id == ObjectXref.ensembl_id,
                    ObjectXref.ensembl_object_type == "transcript",
                ),
            )
            .outerjoin(Xref)
        )
    else:
        raise ValueError(f"Table {table} is not supported")

    # The outer joins yield one row per (feature, xref) pair, and a single row with a null xref for
    # features without any xref, so the same stable ID may be seen several times with the same data.
    feat_data: Dict[str, FeatStruct] = {}
    xref_data: Dict[str, FeatStruct] = {}
    for feat_id, stable_id, desc, xref_name in session.execute(stmt):
        feat_struct: FeatStruct = (feat_id, stable_id, desc)
        feat_data[stable_id.lower()] = feat_struct
        if match_xrefs and xref_name:
            # Keep xref aliases apart for now: whether one clashes with a stable ID can only be known
            # once all the rows have been read
            xref_data[xref_name.lower()] = feat_struct

    # Only add the xref aliases that do not shadow a real stable ID, whatever the order of the rows
    for xref_key, xref_struct in xref_data.items():
        feat_data.setdefault(xref_key, xref_struct)

    return feat_data

load_descriptions(session, func_file, report=False, do_update=False, match_xrefs=True)

Loads gene and transcript descriptions into a core database.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `session` | `Session` | Session open on a core database. | *required* |
| `func_file` | `Path` | JSON file with the annotation information. | *required* |
| `report` | `bool` | Print the mapping of changes to perform in the standard output. | `False` |
| `do_update` | `bool` | Actually update the core database. | `False` |
| `match_xrefs` | `bool` | If the IDs do not match, try to match an Xref ID instead. | `True` |
Source code in src/python/ensembl/io/genomio/annotation/update_description.py
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
def load_descriptions(
    session: Session,
    func_file: Path,
    report: bool = False,
    do_update: bool = False,
    match_xrefs: bool = True,
) -> None:
    """Loads gene and transcript descriptions into a core database.

    Genes are processed first, then transcripts. For each feature type, the annotations of that type
    are compared with the data from the core database, the non-zero counters are logged, and the
    selected rows are written and committed only if `do_update` is set.

    Args:
        session: Session open on a core database.
        func_file: JSON file with the annotation information.
        report: Print the mapping of changes to perform in the standard output.
        do_update: Actually update the core database.
        match_xrefs: If the IDs do not match, try to match an Xref ID instead.
    """
    annotations = get_json(func_file)
    logging.info(f"{len(annotations)} annotations from {func_file}")

    # Counter names, in the order they are reported
    stat_names = (
        "not_supported",
        "not_found",
        "same",
        "same_empty",
        "empty_but_xref",
        "to_update_replace",
        "to_update_remove",
    )

    for table, mapped_table in (("gene", Gene), ("transcript", Transcript)):
        logging.info(f"Checking {table} descriptions")
        table_annotations = [annot for annot in annotations if annot["object_type"] == table]
        logging.info(f"{len(table_annotations)} {table} annotations from {func_file}")
        core_data = get_core_data(session, table, match_xrefs)
        logging.info(f"Loaded {len(core_data)} {table} data")

        # Fresh counters for this feature type, all starting at zero
        stats = dict.fromkeys(stat_names, 0)

        # Compare both sides and only keep the descriptions that have changed
        features_to_update = _get_features_to_update(
            table,
            table_annotations,
            core_data,
            stats,
            report=report,
            do_update=do_update,
            match_xrefs=match_xrefs,
        )

        # Report the counters for this feature type, leaving out the ones that stayed at zero
        for stat, count in stats.items():
            if count != 0:
                logging.info(f"{stat} = {count}")

        # Dry run: nothing is written to the database
        if not do_update:
            continue

        logging.info(f"Now updating {len(features_to_update)} rows...")
        session.bulk_update_mappings(mapped_table, features_to_update)
        session.commit()