Skip to content

restructure

ensembl.io.genomio.gff3.restructure

Restructure a gene model to a standard representation: gene -> [ mRNAs -> [CDSs, exons] ]

add_transcript_to_naked_gene(gene)

Add an unspecific transcript to a gene without any sub features.

Source code in src/python/ensembl/io/genomio/gff3/restructure.py
70
71
72
73
74
75
76
77
78
79
def add_transcript_to_naked_gene(gene: GFFSeqFeature) -> None:
    """Give a childless gene a single generic transcript.

    The feature is left untouched unless its type is exactly "gene" and it has no sub-features.
    Otherwise a "transcript" feature spanning the gene location is created, inherits the gene
    "source" qualifier, and becomes the only child of the gene.

    Args:
        gene: Gene feature to update in place.
    """
    is_naked_gene = (len(gene.sub_features) == 0) and (gene.type == "gene")
    if is_naked_gene:
        new_transcript = GFFSeqFeature(gene.location, type="transcript")
        new_transcript.qualifiers["source"] = gene.qualifiers["source"]
        gene.sub_features = [new_transcript]
        logging.debug(f"Inserted 1 transcript for a lone gene {gene.id}")

move_cds_to_existing_mrna(gene)

Move CDS child features of a gene to the mRNA.

This is to fix the case where we have the following structure: `gene -> [ mRNA, CDSs ]`

and change it to: `gene -> [ mRNA -> [ CDSs ] ]`

The mRNA itself might have exons, in which case check that they match the CDS coordinates.

Parameters:

Name Type Description Default
gene GFFSeqFeature

Gene feature to update.

required

Raises:

Type Description
GFFParserError

If the feature structure is not recognized.

Source code in src/python/ensembl/io/genomio/gff3/restructure.py
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
def move_cds_to_existing_mrna(gene: GFFSeqFeature) -> None:
    """Re-parent the CDSs found directly under a gene onto its single mRNA.

    Turns the structure::
        gene -> [ mRNA, CDSs ]

    into::
        gene -> [ mRNA -> [ CDSs ] ]

    Nothing is done unless the gene has at least one mRNA and one CDS as direct children. Any exons
    already under the mRNA are handed, together with the CDSs, to `_check_sub_exons` before the move.
    After the move the mRNA is placed last among the gene children.

    Args:
        gene: Gene feature to update.

    Raises:
        GFFParserError: If the gene has several mRNAs alongside the CDSs, or if the mRNA already
            has CDS children of its own.
    """
    type_counts = _get_feat_counts(gene)
    if not (type_counts.get("mRNA") and type_counts.get("CDS")):
        return
    if type_counts["mRNA"] > 1:
        raise GFFParserError(
            f"Can't fix gene {gene.id}: contains several mRNAs and CDSs, all children of the gene"
        )

    # Split the direct children of the gene by type
    gene_children = gene.sub_features
    mrnas = [child for child in gene_children if child.type == "mRNA"]
    gene_cdss = [child for child in gene_children if child.type == "CDS"]
    remaining = [child for child in gene_children if child.type not in ("mRNA", "CDS")]

    mrna = mrnas[0]

    # An mRNA that already owns CDSs is a structure we do not know how to merge
    mrna_exons = [child for child in mrna.sub_features if child.type == "exon"]
    if any(child.type == "CDS" for child in mrna.sub_features):
        raise GFFParserError(f"Gene {gene.id} has CDSs as children in both the gene and mRNA")

    # Let the helper validate the existing exons against the CDSs to be moved
    _check_sub_exons(mrna, gene_cdss, mrna_exons)

    # All clear: attach the CDSs to the mRNA and drop them from the gene level
    mrna.sub_features += gene_cdss
    gene.sub_features = remaining + [mrna]
    logging.debug(f"Gene {gene.id}: moved {len(gene_cdss)} CDSs to the mRNA")

move_only_cdss_to_new_mrna(gene)

Add intermediate mRNAs to a gene with only CDS children. Do nothing if some sub-features are not CDS.

Source code in src/python/ensembl/io/genomio/gff3/restructure.py
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
def move_only_cdss_to_new_mrna(gene: GFFSeqFeature) -> None:
    """Insert mRNAs between a gene and its CDSs when the gene has nothing but CDS children.

    One mRNA (spanning the gene location) is created per distinct CDS ID. Each CDS is moved under
    the mRNA matching its ID, together with a new exon at the same location as that CDS.
    The gene is left untouched if any of its sub-features is not a CDS.

    Args:
        gene: Gene feature to update in place.
    """
    type_counts = _get_feat_counts(gene)
    only_cdss = (len(type_counts) == 1) and type_counts.get("CDS")
    if not only_cdss:
        return

    mrna_by_cds_id = {}
    for cds in gene.sub_features:
        # CDS segments sharing an ID go under the same mRNA
        mrna = mrna_by_cds_id.get(cds.id)
        if mrna is None:
            logging.debug(f"Create a new mRNA for {cds.id}")
            mrna = GFFSeqFeature(gene.location, type="mRNA")
            mrna.qualifiers["source"] = gene.qualifiers["source"]
            mrna_by_cds_id[cds.id] = mrna

        mrna.sub_features.append(cds)

        # Pair every CDS with an exon covering the same coordinates
        paired_exon = GFFSeqFeature(cds.location, type="exon")
        paired_exon.qualifiers["source"] = gene.qualifiers["source"]
        mrna.sub_features.append(paired_exon)

    new_mrnas = list(mrna_by_cds_id.values())
    gene.sub_features = new_mrnas

    logging.debug(f"Insert transcript-exon feats for {gene.id} ({len(new_mrnas)} CDSs)")

move_only_exons_to_new_mrna(gene)

Add an mRNA for a gene that only has exons and move the exons under the mRNA. No change if the gene has other sub_features than exon.

Source code in src/python/ensembl/io/genomio/gff3/restructure.py
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
def move_only_exons_to_new_mrna(gene: GFFSeqFeature) -> None:
    """Add an mRNA for a gene that only has exons and move the exons under the mRNA.

    No change if the gene has other sub_features than exon.

    The new mRNA spans the gene location and inherits the gene "source" qualifier; it becomes the
    only child of the gene.

    Args:
        gene: Gene feature to update in place.
    """

    counts = _get_feat_counts(gene)
    if (len(counts) != 1) or not counts.get("exon"):
        return

    # Keep a handle on the exons: the gene's children are about to be replaced by the mRNA
    exons = gene.sub_features

    transcript = GFFSeqFeature(gene.location, type="mRNA")
    transcript.qualifiers["source"] = gene.qualifiers["source"]
    transcript.sub_features = exons
    gene.sub_features = [transcript]

    # Report the number of exons moved (the gene itself now always has exactly one child)
    logging.debug(f"Insert transcript for {gene.id} ({len(exons)} exons)")

remove_cds_from_pseudogene(gene)

Removes the CDSs from a pseudogene.

This assumes the CDSs are sub features of the transcript or the gene.

Source code in src/python/ensembl/io/genomio/gff3/restructure.py
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
def remove_cds_from_pseudogene(gene: GFFSeqFeature) -> None:
    """Strip every CDS from a pseudogene.

    CDSs are looked for at two levels only: directly under the gene, and directly under each of
    the remaining gene children (the transcripts). Features of any type other than "pseudogene"
    are left untouched.

    Args:
        gene: Gene feature to update in place.
    """
    if gene.type != "pseudogene":
        return

    kept_children = []
    for child in gene.sub_features:
        # A CDS sitting directly under the pseudogene is simply dropped
        if child.type == "CDS":
            logging.debug(f"Remove pseudo CDS {child.id}")
            continue

        # Otherwise keep the child, minus any CDS it contains
        kept_grandchildren = []
        for grandchild in child.sub_features:
            if grandchild.type == "CDS":
                logging.debug(f"Remove pseudo CDS {grandchild.id}")
                continue
            kept_grandchildren.append(grandchild)
        child.sub_features = kept_grandchildren
        kept_children.append(child)

    gene.sub_features = kept_children

remove_extra_exons(gene)

Remove duplicated exons existing in both the gene and the mRNAs.

This is a special case where a gene contains proper mRNAs, etc. but also extra exons for the same features. Those exons usually have an ID starting with "id-", so that is what we use to detect them.

Parameters:

Name Type Description Default
gene GFFSeqFeature

Gene feature to update.

required

Raises:

Type Description
GFFParserError

If not all exons of this gene start with "id-".

Source code in src/python/ensembl/io/genomio/gff3/restructure.py
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
def remove_extra_exons(gene: GFFSeqFeature) -> None:
    """Drop the exons that sit directly under a gene which also has mRNAs.

    Some genes carry proper mRNAs but also extra gene-level exons for the same features. Those
    exons usually have an ID starting with "id-", which is the marker used here to decide that
    they can be discarded. When they are removed, the gene children are reordered as the mRNAs
    followed by every other remaining feature.

    Args:
        gene: Gene feature to update.

    Raises:
        GFFParserError: If not all exons of this gene start with "id-".
    """
    type_counts = _get_feat_counts(gene)
    if not (type_counts.get("mRNA") or type_counts.get("exon")):
        return

    children = gene.sub_features
    gene_exons = [child for child in children if child.type == "exon"]
    mrnas = [child for child in children if child.type == "mRNA"]
    others = [child for child in children if child.type not in ("exon", "mRNA")]

    # Only the mixed case (gene-level exons next to mRNAs) needs fixing
    if not (gene_exons and mrnas):
        return

    # An "id-" prefix is the hint that an exon does not belong at the gene level
    num_flagged = sum(1 for exon in gene_exons if exon.id.startswith("id-"))
    if num_flagged != len(gene_exons):
        raise GFFParserError(f"Can't remove extra exons for {gene.id}, not all start with 'id-'")

    logging.debug(f"Remove {num_flagged} extra exons from {gene.id}")
    gene.sub_features = mrnas + others

restructure_gene(gene)

Standardize the structure of a gene model:

- Add a transcript if there are no children.
- Move the CDSs and exons to an mRNA if they are directly under the gene.

Parameters:

Name Type Description Default
gene GFFSeqFeature

Gene feature to restructure.

required

Raises:

Type Description
GFFParserError

If there are CDSs/exons remaining under the gene after applying the fixes.

Source code in src/python/ensembl/io/genomio/gff3/restructure.py
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
def restructure_gene(gene: GFFSeqFeature) -> None:
    """Bring a gene model to the standard gene -> transcript -> (exons, CDSs) layout.

    A gene that already has children, none of which is a CDS or an exon, is left as is.
    Otherwise the following fixes are applied in order (each one is a no-op when not relevant):
    - Add a transcript if there are no children
    - Move the CDSs, or the exons, to a new mRNA if they are the only children of the gene
    - Move the CDSs to the existing mRNA if both are direct children of the gene
    - Remove the gene-level exons that duplicate those of the mRNAs

    Args:
        gene: Gene feature to restructure.

    Raises:
        GFFParserError: If there are CDSs/exons remaining under the gene after applying the fixes.
    """
    initial_counts = _get_feat_counts(gene)
    if len(initial_counts) > 0 and not (initial_counts.get("CDS") or initial_counts.get("exon")):
        # The gene has children and none of them is misplaced
        return

    # The order matters: each fix works on whatever structure the previous ones left behind
    fixes = (
        add_transcript_to_naked_gene,
        move_only_cdss_to_new_mrna,
        move_only_exons_to_new_mrna,
        move_cds_to_existing_mrna,
        remove_extra_exons,
    )
    for apply_fix in fixes:
        apply_fix(gene)

    # Anything still sitting directly under the gene could not be fixed
    final_counts = _get_feat_counts(gene)
    if final_counts.get("CDS") or final_counts.get("exon"):
        raise GFFParserError(f"Gene {gene.id} contains direct CDSs and exons children")