Skip to content

extract_annotation

ensembl.io.genomio.gff3.extract_annotation

Simple representation of the functional annotation of gene features, as extracted from a GFF3 file.

Annotation = Dict[str, Any] module-attribute

AnnotationError

Bases: Exception

Raised if anything goes wrong when recording annotations.

Source code in src/python/ensembl/io/genomio/gff3/extract_annotation.py
51
52
class AnnotationError(Exception):
    """Raised when a functional annotation cannot be recorded."""

DuplicateIdError

Bases: Exception

Trying to add a feature with an ID already in use.

Source code in src/python/ensembl/io/genomio/gff3/extract_annotation.py
43
44
class DuplicateIdError(Exception):
    """Raised when a feature is added with an ID that is already in use."""

FunctionalAnnotations

List of annotations extracted from a GFF3 file.

Source code in src/python/ensembl/io/genomio/gff3/extract_annotation.py
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
class FunctionalAnnotations:
    """List of annotations extracted from a GFF3 file.

    Annotations are stored per feature type and keyed by feature ID. Child-to-parent ID links are
    kept separately so that descriptions can later be transferred up to the parent features.
    """

    # Xref database names (lowercase) that are never recorded
    ignored_xrefs = {"go", "interpro", "uniprot"}

    def __init__(self, provider_name: str = "") -> None:
        """Create an empty annotation store.

        Args:
            provider_name: Name of the annotation provider; "RefSeq" enables RefSeq-specific xrefs.
        """
        self.annotations: List[Annotation] = []
        self.provider_name = provider_name
        # Annotated features
        # Under each feature, each dict's key is a feature ID
        self.features: Dict[str, Dict[str, Annotation]] = {
            "gene": {},
            "transcript": {},
            "translation": {},
            "transposable_element": {},
        }
        # Keep parent info: key is the feature ID, value is the parent ID
        self.parents: Dict[str, Dict[str, str]] = {
            "gene": {},
            "transcript": {},
        }

    def get_xrefs(self, feature: GFFSeqFeature) -> List[Dict[str, Any]]:
        """Get the xrefs from the Dbxref field.

        Each Dbxref value is expected to look like "dbname:id" (split on the first colon only).
        Xrefs whose database is listed in `ignored_xrefs` are skipped. For a RefSeq provider,
        "GenBank" xrefs are renamed "RefSeq", and a gene whose ID starts with "LOC" gets its own ID
        as a RefSeq xref if no RefSeq xref was found.

        Args:
            feature: The feature to extract the xrefs from.

        Returns:
            A list of dicts with the keys "dbname" and "id".

        Raises:
            ValueError: If a Dbxref value does not contain any colon.
        """
        all_xref: List[Dict[str, Any]] = []

        if "Dbxref" in feature.qualifiers:
            for xref in feature.qualifiers["Dbxref"]:
                dbname, name = xref.split(":", maxsplit=1)
                if dbname == "GenBank" and self.provider_name == "RefSeq":
                    dbname = "RefSeq"

                if dbname.lower() in self.ignored_xrefs:
                    continue

                xrefs = {"dbname": dbname, "id": name}
                all_xref.append(xrefs)

        # Add RefSeq ID xref if it looks like one
        if self.provider_name == "RefSeq":
            if feature.type == "gene" and feature.id.startswith("LOC"):
                xref_dbs = {x["dbname"] for x in all_xref}
                if "RefSeq" not in xref_dbs:
                    all_xref.append({"dbname": "RefSeq", "id": feature.id})

        return all_xref

    def get_features(self, feat_type: str) -> Dict[str, Annotation]:
        """Get all feature annotations for the requested type.

        Raises:
            KeyError: If the feature type is not one of the supported types.
        """
        try:
            return self.features[feat_type]
        except KeyError as err:
            raise KeyError(f"No such feature type {feat_type}") from err

    def add_parent_link(self, parent_type: str, parent_id: str, child_id: str) -> None:
        """Record a parent-child IDs relationship for a given parent biotype.

        Raises:
            MissingParentError: If no feature `parent_id` of type `parent_type` has been added yet.
        """
        features = self.get_features(parent_type)
        if parent_id not in features:
            raise MissingParentError(f"Parent {parent_type}:{parent_id} not found for {child_id}")
        self.parents[parent_type][child_id] = parent_id

    def get_parent(self, parent_type: str, child_id: str) -> str:
        """Returns the parent ID of a given child for a given parent biotype.

        Raises:
            KeyError: If `parent_type` cannot be a parent type.
            MissingParentError: If no parent has been recorded for `child_id`.
        """
        try:
            parents = self.parents[parent_type]
        except KeyError as err:
            raise KeyError(f"Unsupported parent type {parent_type}") from err

        parent_id = parents.get(child_id)
        if parent_id is None:
            raise MissingParentError(f"Can't find {parent_type} parent for {child_id}")
        return parent_id

    def add_feature(
        self,
        feature: GFFSeqFeature,
        feat_type: str,
        parent_id: Optional[str] = None,
        all_parent_ids: Optional[List[str]] = None,
    ) -> None:
        """Add annotation for a feature of a given type. If a parent_id is provided, record the relationship.

        The feature is only stored once its parent link (if any) has been validated, so a failed
        call leaves this object unchanged.

        Args:
            feature: The feature to create an annotation.
            feat_type: Type of the feature to annotate.
            parent_id: Parent ID of this feature to keep it linked.
            all_parent_ids: All parent IDs to remove from non-informative descriptions.

        Raises:
            AnnotationError: If the feature ID was already added for this type, or if a parent is
                provided for a feature type that cannot have one.
            MissingParentError: If the parent feature has not been added.
            KeyError: If `feat_type` is not a supported feature type.
        """
        if all_parent_ids is None:
            all_parent_ids = []
        features = self.get_features(feat_type)
        if feature.id in features:
            raise AnnotationError(f"Feature {feat_type} ID {feature.id} already added")

        feature_object = self._generic_feature(feature, feat_type, all_parent_ids)

        # Validate and record the parent link BEFORE storing the feature: otherwise a failure here
        # would leave an orphan annotation behind, and an orphan with a description makes
        # transfer_descriptions() fail later with a MissingParentError.
        if parent_id:
            if feat_type not in _PARENTS:
                raise AnnotationError(f"No parent possible for {feat_type} {feature.id}")
            self.add_parent_link(_PARENTS[feat_type], parent_id, feature.id)

        self.features[feat_type][feature.id] = feature_object

    def _generic_feature(
        self, feature: GFFSeqFeature, feat_type: str, parent_ids: Optional[List[str]] = None
    ) -> Dict[str, Any]:
        """Create a feature object following the specifications.

        Args:
            feature: The GFFSeqFeature to add to the list.
            feat_type: Feature type of the feature to store (e.g. gene, transcript, translation).
            parent_ids: All parent IDs to remove from non-informative descriptions.

        Returns:
            A dict with the keys "object_type" and "id", plus "description", "xrefs", "synonyms" and
            "is_pseudogene" when they apply.
        """
        if parent_ids is None:
            parent_ids = []

        feature_object: Annotation = {"object_type": feat_type, "id": feature.id}

        # Description? Use the first informative one among the "description" and "product" qualifiers
        for qname in ("description", "product"):
            if qname in feature.qualifiers:
                description = feature.qualifiers[qname][0]
                if self.product_is_informative(description, feat_ids=parent_ids + [feature.id]):
                    feature_object["description"] = description
                    break
                logging.debug(f"Non informative description for {feature.id}: {description}")

        # NOTE(review): get_xrefs() can also add a RefSeq xref for "LOC" genes that have no Dbxref at
        # all, but it is only called here when a Dbxref qualifier exists - confirm this is intended.
        feature_object["xrefs"] = []
        if "Dbxref" in feature.qualifiers:
            all_xref = self.get_xrefs(feature)
            feature_object["xrefs"] = all_xref

        xref_values = {xref["id"].lower() for xref in feature_object["xrefs"]}

        # Synonyms?
        # We add synonyms to the external_synonym table
        # which is associated with the first xref of that feature type
        # The name is skipped if it only repeats the feature ID or one of the xref IDs
        if "Name" in feature.qualifiers:
            feat_name = feature.qualifiers["Name"][0]
            if feat_name.lower() != feature.id.lower() and feat_name.lower() not in xref_values:
                feature_object["synonyms"] = [feat_name]

        # is_pseudogene?
        if feature.type.startswith("pseudogen"):
            feature_object["is_pseudogene"] = True

        # Don't keep empty xref
        if not feature_object["xrefs"]:
            del feature_object["xrefs"]
        return feature_object

    def transfer_descriptions(self) -> None:
        """Transfers the feature descriptions in 2 steps:
        - from translations to transcripts (if the transcript description is empty)
        - from transcripts to genes (same case)

        """
        self._transfer_description_up("translation")
        self._transfer_description_up("transcript")

    def _transfer_description_up(self, child_feature: str) -> None:
        """Transfer descriptions from all feature of a given type, up to their parent.

        A parent that already has a description is left untouched.

        Args:
            child_feature: Either "translation" (transfer to transcript) or "transcript" (to gene).

        Raises:
            MissingParentError: If a child with a description has no recorded parent.
        """
        children_features = self.get_features(child_feature)
        parent_type = _PARENTS[child_feature]
        parent_features = self.get_features(parent_type)

        # Transfer description from children to their parent
        for child_id, child in children_features.items():
            child_description = child.get("description")
            if child_description is None:
                continue
            parent_id = self.get_parent(parent_type, child_id)
            parent = parent_features[parent_id]
            if parent.get("description") is None:
                # The parent is not variant-specific: drop the "transcript variant" suffix
                parent["description"] = self._clean_description(child_description)

    @staticmethod
    def _clean_description(description: str) -> str:
        """Returns the description without "transcript variant" information."""
        return re.sub(r", transcript variant [A-Z][0-9]+$", "", description, flags=re.IGNORECASE)

    @staticmethod
    def product_is_informative(product: str, feat_ids: Optional[List[str]] = None) -> bool:
        """Returns True if the product name contains informative words, False otherwise.

        The product is considered uninformative when nothing is left of it once the punctuation and
        the non-informative words (such as "hypothetical" or "putative") have been removed. If
        feature IDs are provided, they are removed from the product first (we do not want
        descriptions to be just the ID).

        Args:
            product: A product name.
            feat_ids: List of feature IDs.

        Raises:
            TypeError: If one of the feature IDs cannot be searched in the product (e.g. not a string).
        """
        non_informative_words = [
            "hypothetical",
            "putative",
            "uncharacterized",
            "unspecified",
            "unknown",
            r"(of )?unknown function",
            "conserved",
            "predicted",
            "fragment",
            "product",
            "function",
            "protein",
            "transcript",
            "gene",
            "RNA",
            r"(variant|isoform)( X?\d+)?",
            r"low quality protein",
        ]
        non_informative_re = re.compile(r"|".join(non_informative_words), re.IGNORECASE)

        # Remove all IDs that are in the description
        if feat_ids:
            logging.debug(f"Filter out {feat_ids} from {product}")
            for feat_id in feat_ids:
                try:
                    # IDs are literal text, not patterns: escape them so that characters such as
                    # ".", "+", "(" or "[" neither change what is matched nor raise a re.error
                    product = re.sub(re.escape(feat_id), "", product, flags=re.IGNORECASE)
                except TypeError as err:
                    raise TypeError(f"Failed to search {feat_id!r} in '{product}'") from err

        # Remove punctuations
        punct_re = re.compile(r"[,;: _()-]+")
        product = re.sub(punct_re, " ", product)

        # Then remove non informative words
        product = re.sub(non_informative_re, " ", product)

        # Anything (informative) left?
        empty_re = re.compile(r"^[ ]*$")
        return not bool(empty_re.match(product))

    def _to_list(self) -> list[Annotation]:
        """Returns all the annotations of all feature types as one flat list."""
        all_list: list[Annotation] = []
        for feat_dict in self.features.values():
            all_list += feat_dict.values()
        return all_list

    def to_json(self, out_path: PathLike) -> None:
        """Print out the current annotation list in a json file.

        The descriptions are transferred up to the parent features before the data is written.

        Args:
            out_path: JSON file path where to write the data.

        """
        self.transfer_descriptions()
        feats_list = self._to_list()
        print_json(Path(out_path), feats_list)

    def store_gene(self, gene: GFFSeqFeature) -> None:
        """Record the functional_annotations of a gene and its children features.

        Every sub-feature of the gene is stored as a transcript, and the first CDS found under each
        transcript is stored as its translation.
        """
        self.add_feature(gene, "gene")

        for transcript in gene.sub_features:
            self.add_feature(transcript, "transcript", gene.id, [gene.id])
            for feat in transcript.sub_features:
                if feat.type == "CDS":
                    self.add_feature(feat, "translation", transcript.id, [gene.id, transcript.id])
                    # Store CDS functional annotation only once
                    break

annotations = [] instance-attribute

features = {'gene': {}, 'transcript': {}, 'translation': {}, 'transposable_element': {}} instance-attribute

ignored_xrefs = {'go', 'interpro', 'uniprot'} class-attribute instance-attribute

parents = {'gene': {}, 'transcript': {}} instance-attribute

provider_name = provider_name instance-attribute

add_feature(feature, feat_type, parent_id=None, all_parent_ids=None)

Add annotation for a feature of a given type. If a parent_id is provided, record the relationship.

Parameters:

Name Type Description Default
feature GFFSeqFeature

The feature to create an annotation.

required
feat_type str

Type of the feature to annotate.

required
parent_id Optional[str]

Parent ID of this feature to keep it linked.

None
all_parent_ids Optional[List[str]]

All parent IDs to remove from non-informative descriptions.

None
Source code in src/python/ensembl/io/genomio/gff3/extract_annotation.py
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
def add_feature(
    self,
    feature: GFFSeqFeature,
    feat_type: str,
    parent_id: Optional[str] = None,
    all_parent_ids: Optional[List[str]] = None,
) -> None:
    """Add annotation for a feature of a given type. If a parent_id is provided, record the relationship.

    The feature is only stored once its parent link (if any) has been validated, so a failed call
    does not leave an annotation without its parent link.

    Args:
        feature: The feature to create an annotation.
        feat_type: Type of the feature to annotate.
        parent_id: Parent ID of this feature to keep it linked.
        all_parent_ids: All parent IDs to remove from non-informative descriptions.

    Raises:
        AnnotationError: If the feature ID was already added for this type, or if a parent is
            provided for a feature type that cannot have one.
        MissingParentError: If the parent feature has not been added.
        KeyError: If `feat_type` is not a supported feature type.
    """
    if all_parent_ids is None:
        all_parent_ids = []
    features = self.get_features(feat_type)
    if feature.id in features:
        raise AnnotationError(f"Feature {feat_type} ID {feature.id} already added")

    feature_object = self._generic_feature(feature, feat_type, all_parent_ids)

    # Validate and record the parent link BEFORE storing the feature: otherwise a failure here
    # would leave an orphan annotation behind, and an orphan with a description makes
    # transfer_descriptions() fail later with a MissingParentError.
    if parent_id:
        if feat_type not in _PARENTS:
            raise AnnotationError(f"No parent possible for {feat_type} {feature.id}")
        self.add_parent_link(_PARENTS[feat_type], parent_id, feature.id)

    self.features[feat_type][feature.id] = feature_object

add_parent_link(parent_type, parent_id, child_id)

Record a parent-child ID relationship for a given parent biotype.

Source code in src/python/ensembl/io/genomio/gff3/extract_annotation.py
109
110
111
112
113
114
def add_parent_link(self, parent_type: str, parent_id: str, child_id: str) -> None:
    """Link a child feature ID to its parent ID, for a parent of the given biotype.

    Raises:
        MissingParentError: If the parent is not among the features of type `parent_type`.
    """
    known_parents = self.get_features(parent_type)
    if parent_id in known_parents:
        # The links are stored per parent type: child ID -> parent ID
        self.parents[parent_type][child_id] = parent_id
        return
    raise MissingParentError(f"Parent {parent_type}:{parent_id} not found for {child_id}")

get_features(feat_type)

Get all feature annotations for the requested type.

Source code in src/python/ensembl/io/genomio/gff3/extract_annotation.py
102
103
104
105
106
107
def get_features(self, feat_type: str) -> Dict[str, Annotation]:
    """Return the annotations stored for one feature type, keyed by feature ID.

    Raises:
        KeyError: If `feat_type` is not one of the stored feature types.
    """
    try:
        type_annotations = self.features[feat_type]
    except KeyError as err:
        raise KeyError(f"No such feature type {feat_type}") from err
    return type_annotations

get_parent(parent_type, child_id)

Returns the parent ID of a given child for a given parent biotype.

Source code in src/python/ensembl/io/genomio/gff3/extract_annotation.py
116
117
118
119
120
121
122
123
124
125
126
def get_parent(self, parent_type: str, child_id: str) -> str:
    """Look up the ID of the parent of a child feature, for a parent of the given biotype.

    Raises:
        KeyError: If no parent links are kept for `parent_type`.
        MissingParentError: If no parent has been recorded for `child_id`.
    """
    try:
        links = self.parents[parent_type]
    except KeyError as err:
        raise KeyError(f"Unsupported parent type {parent_type}") from err

    found_id = links.get(child_id)
    if found_id is not None:
        return found_id
    raise MissingParentError(f"Can't find {parent_type} parent for {child_id}")

get_xrefs(feature)

Get the xrefs from the Dbxref field.

Source code in src/python/ensembl/io/genomio/gff3/extract_annotation.py
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
def get_xrefs(self, feature: GFFSeqFeature) -> List[Dict[str, Any]]:
    """Collect the cross-references of a feature from its Dbxref qualifier.

    Each Dbxref value is split on its first colon into a database name and an ID. Databases listed
    in `ignored_xrefs` are left out. When the provider is RefSeq, "GenBank" is renamed "RefSeq", and
    a gene whose ID starts with "LOC" gets its own ID as a RefSeq xref if it has none.

    Returns:
        A list of dicts with the keys "dbname" and "id".
    """
    from_refseq = self.provider_name == "RefSeq"
    collected: List[Dict[str, str]] = []

    raw_xrefs = feature.qualifiers["Dbxref"] if "Dbxref" in feature.qualifiers else []
    for raw_xref in raw_xrefs:
        dbname, name = raw_xref.split(":", maxsplit=1)
        if from_refseq and dbname == "GenBank":
            dbname = "RefSeq"
        if dbname.lower() not in self.ignored_xrefs:
            collected.append({"dbname": dbname, "id": name})

    # A RefSeq gene ID that looks like "LOC..." is an xref in itself
    is_loc_gene = from_refseq and feature.type == "gene" and feature.id.startswith("LOC")
    if is_loc_gene and all(xref["dbname"] != "RefSeq" for xref in collected):
        collected.append({"dbname": "RefSeq", "id": feature.id})

    return collected

product_is_informative(product, feat_ids=None) staticmethod

Returns True if the product name contains informative words, False otherwise.

The product is considered uninformative when nothing is left of it once the punctuation and the non-informative words (such as "hypothetical" or "putative") have been removed. If feature IDs are provided, they are removed from the product first (we do not want descriptions to be just the ID).

Parameters:

Name Type Description Default
product str

A product name.

required
feat_ids Optional[List[str]]

List of feature IDs.

None
Source code in src/python/ensembl/io/genomio/gff3/extract_annotation.py
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
@staticmethod
def product_is_informative(product: str, feat_ids: Optional[List[str]] = None) -> bool:
    """Returns True if the product name contains informative words, False otherwise.

    It is considered uninformative when the description contains words such as "hypothetical" or
    or "putative". If feature IDs are provided, consider it uninformative as well (we do not want
    descriptions to be just the ID).

    Args:
        product: A product name.
        feat_ids: List of feature IDs.

    """
    non_informative_words = [
        "hypothetical",
        "putative",
        "uncharacterized",
        "unspecified",
        "unknown",
        r"(of )?unknown function",
        "conserved",
        "predicted",
        "fragment",
        "product",
        "function",
        "protein",
        "transcript",
        "gene",
        "RNA",
        r"(variant|isoform)( X?\d+)?",
        r"low quality protein",
    ]
    non_informative_re = re.compile(r"|".join(non_informative_words), re.IGNORECASE)

    # Remove all IDs that are in the description
    if feat_ids:
        logging.debug(f"Filter out {feat_ids} from {product}")
        try:
            for feat_id in feat_ids:
                feat_id_re = re.compile(feat_id, re.IGNORECASE)
                product = re.sub(feat_id_re, "", product)
        except TypeError as err:
            raise TypeError(f"Failed to search {feat_id_re} in '{product}'") from err

    # Remove punctuations
    punct_re = re.compile(r"[,;: _()-]+")
    product = re.sub(punct_re, " ", product)

    # Then remove non informative words
    product = re.sub(non_informative_re, " ", product)

    # Anything (informative) left?
    empty_re = re.compile(r"^[ ]*$")
    return not bool(empty_re.match(product))

store_gene(gene)

Record the functional_annotations of a gene and its children features.

Source code in src/python/ensembl/io/genomio/gff3/extract_annotation.py
319
320
321
322
323
324
325
326
327
328
329
def store_gene(self, gene: GFFSeqFeature) -> None:
    """Store the annotation of a gene, of its transcripts and of their translations.

    Every sub-feature of the gene is added as a transcript. Under each transcript, only the first
    sub-feature of type "CDS" (if any) is added as the translation.
    """
    self.add_feature(gene, "gene")

    for transcript in gene.sub_features:
        self.add_feature(transcript, "transcript", gene.id, [gene.id])
        # One translation annotation per transcript: all the others CDSs are skipped
        first_cds = next((sub for sub in transcript.sub_features if sub.type == "CDS"), None)
        if first_cds is not None:
            self.add_feature(first_cds, "translation", transcript.id, [gene.id, transcript.id])

to_json(out_path)

Print out the current annotation list in a json file.

Parameters:

Name Type Description Default
out_path PathLike

JSON file path where to write the data.

required
Source code in src/python/ensembl/io/genomio/gff3/extract_annotation.py
308
309
310
311
312
313
314
315
316
317
def to_json(self, out_path: PathLike) -> None:
    """Write all the annotations to a JSON file.

    The descriptions are first transferred up to the parent features, then the annotations of all
    feature types are written as one list.

    Args:
        out_path: JSON file path where to write the data.

    """
    self.transfer_descriptions()
    print_json(Path(out_path), self._to_list())

transfer_descriptions()

Transfers the feature descriptions in 2 steps: (1) from translations to transcripts, if the transcript description is empty; (2) from transcripts to genes, if the gene description is empty.

Source code in src/python/ensembl/io/genomio/gff3/extract_annotation.py
208
209
210
211
212
213
214
215
def transfer_descriptions(self) -> None:
    """Copy the descriptions up to the parent features that have none, in 2 steps:
    - from translations to transcripts
    - from transcripts to genes

    """
    # The order matters: a description that reached a transcript can then reach its gene
    for child_type in ("translation", "transcript"):
        self._transfer_description_up(child_type)

MissingParentError

Bases: Exception

Trying to add a feature without an expected parent.

Source code in src/python/ensembl/io/genomio/gff3/extract_annotation.py
47
48
class MissingParentError(Exception):
    """Raised when a feature is added without its expected parent."""