Skip to content

id_allocator

ensembl.io.genomio.gff3.id_allocator

Check and allocate IDs for gene features in a GFF3 file.

InvalidStableID

Bases: ValueError

Raised when there is a problem with a stable ID.

Source code in src/python/ensembl/io/genomio/gff3/id_allocator.py
27
28
class InvalidStableID(ValueError):
    """Raised when there is a problem with a stable ID."""

StableIDAllocator dataclass

Set of tools to check and allocate stable IDs.

Source code in src/python/ensembl/io/genomio/gff3/id_allocator.py
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
@dataclass
class StableIDAllocator:
    """Set of tools to check and allocate stable IDs."""

    # Multiple parameters to automate various fixes
    # If True, `is_valid()` accepts any ID without checking it.
    skip_gene_id_validation: bool = False
    # Minimum number of characters an ID needs to be considered valid.
    min_id_length: int = 7
    # Counter used by `generate_gene_id()`; it is incremented before each new ID is made.
    current_id_number: int = 0
    # If True, `normalize_gene_id()` makes a new ID when no usable one is found (it raises otherwise).
    make_missing_stable_ids: bool = True
    # Prefix of the IDs made by `generate_gene_id()`.
    prefix: str = "TMP_"
    # GeneID-based IDs already returned by `normalize_gene_id()`, used to avoid duplicates.
    _loaded_ids: Set[str] = field(default_factory=set)

    def set_prefix(self, genome: Dict) -> None:
        """Sets the ID prefix using the organism abbrev if it exists in the genome metadata.

        The prefix becomes `TMP_<organism_abbrev>_`, or `TMP_PREFIX_` if the genome metadata has no
        `BRC4` / `organism_abbrev` key.

        Args:
            genome: Genome metadata.
        """
        try:
            org = genome["BRC4"]["organism_abbrev"]
        except KeyError:
            prefix = "TMP_PREFIX_"
        else:
            prefix = "TMP_" + org + "_"
        self.prefix = prefix

    def generate_gene_id(self) -> str:
        """Returns a new gene stable_id with a prefix.

        The ID is made up of a prefix and a number, which is auto incremented. The new ID is not
        compared against any ID already in use.

        """
        self.current_id_number += 1
        new_id = f"{self.prefix}{self.current_id_number}"
        return new_id

    def is_valid(self, stable_id: str) -> bool:
        """Checks that the format of a stable ID is valid.

        An ID is not valid if it starts with "trna" (case insensitive), looks like a coordinate
        (`name:start..end`), contains a space or a pipe, or is shorter than `min_id_length`.

        Args:
            stable_id: Stable ID to validate.

        Returns:
            True if the ID passes every check (or if the validation is deactivated), False otherwise.
        """

        if self.skip_gene_id_validation:
            logging.debug(f"Validation deactivated by user: '{stable_id}' not checked")
            return True

        # Trna (from tRNAscan)
        if re.search(r"^Trna", stable_id, re.IGNORECASE):
            logging.debug(f"Stable ID is a tRNA from tRNA-scan: {stable_id}")
            return False

        # Coordinates: the dots are escaped so that only a literal ".." separator matches, otherwise
        # any ID with a colon followed by four or more digits would be mistaken for a coordinate
        if re.search(r"^.+:\d+\.\.\d+", stable_id):
            logging.debug(f"Stable id is a coordinate: {stable_id}")
            return False

        # Special characters
        if re.search(r"[ |]", stable_id):
            logging.debug(f"Stable id contains special characters: {stable_id}")
            return False

        # Min length
        if len(stable_id) < self.min_id_length:
            logging.debug(f"Stable id is too short (<{self.min_id_length}) {stable_id}")
            return False

        return True

    @staticmethod
    def remove_prefix(stable_id: str, prefixes: List[str]) -> str:
        """Returns the stable ID after removing its prefix (if any).

        If more than one prefix may be found, only the first one is removed.

        Args:
            stable_id: Stable ID to process.
            prefixes: List of prefixes to search for.
        """

        for prefix in prefixes:
            if stable_id.startswith(prefix):
                return stable_id[len(prefix) :]
        return stable_id

    @staticmethod
    def generate_transcript_id(gene_id: str, number: int) -> str:
        """Returns a formatted transcript ID generated from a gene ID and number.

        Args:
            gene_id: Gene stable ID.
            number: Positive number.

        Raises:
            ValueError: If the number provided is not greater than zero.

        """
        if number < 1:
            raise ValueError("Number has to be a positive integer.")

        transcript_id = f"{gene_id}_t{number}"
        return transcript_id

    def normalize_cds_id(self, cds_id: str) -> str:
        """Returns a normalized version of the provided CDS ID.

        The normalisation consists of removing a leading "cds-" or "cds:" prefix from the CDS ID.
        However, if the resulting ID is still not valid, an empty string is returned.

        Args:
            cds_id: CDS ID to normalize.

        """

        prefixes = ["cds-", "cds:"]
        normalized_cds_id = StableIDAllocator.remove_prefix(cds_id, prefixes)

        # Special case: if the ID doesn't look like one, remove it - it needs to be regenerated
        if not self.is_valid(normalized_cds_id):
            return ""
        return normalized_cds_id

    def normalize_pseudogene_cds_id(self, pseudogene: GFFSeqFeature) -> None:
        """Normalizes every CDS ID of the provided pseudogene.

        Ensure each CDS from a pseudogene has a proper ID:
        - Different from the gene
        - Derived from its transcript ID (`<transcript ID>_cds`) if it is not proper

        The features are updated in place.

        Args:
            pseudogene: Pseudogene feature.
        """
        for transcript in pseudogene.sub_features:
            for feat in transcript.sub_features:
                if feat.type == "CDS":
                    feat.id = self.normalize_cds_id(feat.id)
                    # An empty ID means the original one was rejected by the normalization
                    if feat.id in ("", pseudogene.id):
                        feat.id = f"{transcript.id}_cds"
                        feat.qualifiers["ID"] = feat.id

    def normalize_gene_id(self, gene: GFFSeqFeature, refseq: Optional[bool] = False) -> str:
        """Returns a normalized gene stable ID.

        Removes any unnecessary prefixes, but will generate a new stable ID if the normalized one is
        not recognized as valid. The replacement is based on the `GeneID` entry of the gene `Dbxref`
        qualifier if there is one, otherwise it is made with `generate_gene_id()`.

        Args:
            gene: Gene feature to normalize.
            refseq: If true, only IDs starting with "LOC" are considered valid (the usual validation
                is not applied).

        Returns:
            The normalized, GeneID-based or newly generated gene stable ID.

        Raises:
            InvalidStableID: If the GeneID-based ID and its numbered variants (up to `_10`) are all
                already in use, or if no usable ID is found and `make_missing_stable_ids` is false.
        """
        prefixes = ["gene-", "gene:"]
        new_gene_id = StableIDAllocator.remove_prefix(gene.id, prefixes)

        is_valid = False
        # Special case for RefSeq: only valid Gene IDs are LOC*
        if refseq:
            if new_gene_id.startswith("LOC"):
                is_valid = True
        else:
            is_valid = self.is_valid(new_gene_id)

        if is_valid:
            return new_gene_id

        # In case the normalized gene ID is not valid, use the GeneID
        logging.debug(f"Gene ID is not valid: {new_gene_id}")
        qual = gene.qualifiers
        if "Dbxref" in qual:
            for xref in qual["Dbxref"]:
                # Only split on the first colon: the value itself may contain colons
                # (e.g. "HGNC:HGNC:1100"), and an xref without any colon is simply skipped
                db, _, value = xref.partition(":")
                if db != "GeneID":
                    continue
                new_gene_id_base = f"{db}_{value}"
                new_gene_id = new_gene_id_base
                number = 1
                # Append an increasing number until the ID has not been used yet
                while new_gene_id in self._loaded_ids:
                    number += 1
                    new_gene_id = f"{new_gene_id_base}_{number}"
                    if number > 10:
                        raise InvalidStableID(f"Duplicate ID {new_gene_id_base} (up to {new_gene_id})")
                self._loaded_ids.add(new_gene_id)
                logging.debug(f"Using GeneID {new_gene_id} for stable_id instead of {gene.id}")
                return new_gene_id

        # Make a new stable_id
        if self.make_missing_stable_ids:
            new_gene_id = self.generate_gene_id()
            logging.debug(f"New ID: {gene.id} -> {new_gene_id}")
            return new_gene_id
        raise InvalidStableID(f"Can't use invalid gene id for {gene}")

current_id_number = 0 class-attribute instance-attribute

make_missing_stable_ids = True class-attribute instance-attribute

min_id_length = 7 class-attribute instance-attribute

prefix = 'TMP_' class-attribute instance-attribute

skip_gene_id_validation = False class-attribute instance-attribute

generate_gene_id()

Returns a new unique gene stable_id with a prefix.

The ID is made up of a prefix and a number, which is auto incremented.

Source code in src/python/ensembl/io/genomio/gff3/id_allocator.py
53
54
55
56
57
58
59
60
61
def generate_gene_id(self) -> str:
    """Makes the next gene stable_id from the prefix and an auto-incremented number.

    The internal counter is increased by one, then appended to the current prefix.

    """
    next_number = self.current_id_number + 1
    self.current_id_number = next_number
    return f"{self.prefix}{next_number}"

generate_transcript_id(gene_id, number) staticmethod

Returns a formatted transcript ID generated from a gene ID and a number.

Parameters: gene_id (str): Gene stable ID. number (int): Positive number.

Raises: ValueError: If the number provided is not greater than zero.

Source code in src/python/ensembl/io/genomio/gff3/id_allocator.py
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
@staticmethod
def generate_transcript_id(gene_id: str, number: int) -> str:
    """Returns a formatted transcript ID generated from a gene ID and number.
    Args:
        gene_id: Gene stable ID.
        number: Positive number.
    Raises:
        ValueError: If the number provided is not greater than zero.

    """
    if number < 1:
        raise ValueError("Number has to be a positive integer.")

    transcript_id = f"{gene_id}_t{number}"
    return transcript_id

is_valid(stable_id)

Checks that the format of a stable ID is valid.

Parameters: stable_id (str): Stable ID to validate.

Source code in src/python/ensembl/io/genomio/gff3/id_allocator.py
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
def is_valid(self, stable_id: str) -> bool:
    """Checks that the format of a stable ID is valid.

    An ID is not valid if it starts with "trna" (case insensitive), looks like a coordinate
    (`name:start..end`), contains a space or a pipe, or is shorter than `min_id_length`.

    Args:
        stable_id: Stable ID to validate.

    Returns:
        True if the ID passes every check (or if the validation is deactivated), False otherwise.
    """

    if self.skip_gene_id_validation:
        logging.debug(f"Validation deactivated by user: '{stable_id}' not checked")
        return True

    # Trna (from tRNAscan)
    if re.search(r"^Trna", stable_id, re.IGNORECASE):
        logging.debug(f"Stable ID is a tRNA from tRNA-scan: {stable_id}")
        return False

    # Coordinates: the dots are escaped so that only a literal ".." separator matches, otherwise
    # any ID with a colon followed by four or more digits would be mistaken for a coordinate
    if re.search(r"^.+:\d+\.\.\d+", stable_id):
        logging.debug(f"Stable id is a coordinate: {stable_id}")
        return False

    # Special characters
    if re.search(r"[ |]", stable_id):
        logging.debug(f"Stable id contains special characters: {stable_id}")
        return False

    # Min length
    if len(stable_id) < self.min_id_length:
        logging.debug(f"Stable id is too short (<{self.min_id_length}) {stable_id}")
        return False

    return True

normalize_cds_id(cds_id)

Returns a normalized version of the provided CDS ID.

The normalisation consists of removing any unnecessary prefix from the CDS ID. However, if the resulting CDS ID is still not valid, an empty string is returned.

Parameters:

Name Type Description Default
cds_id str

CDS ID to normalize.

required
Source code in src/python/ensembl/io/genomio/gff3/id_allocator.py
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
def normalize_cds_id(self, cds_id: str) -> str:
    """Strips the known CDS prefixes from a CDS ID and checks the result.

    Only a leading "cds-" or "cds:" is removed. If what remains is not a valid ID, an empty
    string is returned instead so that the caller can regenerate it.

    Args:
        cds_id: CDS ID to normalize.

    """
    stripped_id = StableIDAllocator.remove_prefix(cds_id, ["cds-", "cds:"])

    # An ID that does not look like one is dropped: it needs to be regenerated
    return stripped_id if self.is_valid(stripped_id) else ""

normalize_gene_id(gene, refseq=False)

Returns a normalized gene stable ID.

Removes any unnecessary prefixes, but will generate a new stable ID if the normalized one is not recognized as valid.

Parameters:

Name Type Description Default
gene GFFSeqFeature

Gene feature to normalize.

required
Source code in src/python/ensembl/io/genomio/gff3/id_allocator.py
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
def normalize_gene_id(self, gene: GFFSeqFeature, refseq: Optional[bool] = False) -> str:
    """Returns a normalized gene stable ID.

    Removes any unnecessary prefixes, but will generate a new stable ID if the normalized one is
    not recognized as valid. The replacement is based on the `GeneID` entry of the gene `Dbxref`
    qualifier if there is one, otherwise it is made with `generate_gene_id()`.

    Args:
        gene: Gene feature to normalize.
        refseq: If true, only IDs starting with "LOC" are considered valid (the usual validation
            is not applied).

    Returns:
        The normalized, GeneID-based or newly generated gene stable ID.

    Raises:
        InvalidStableID: If the GeneID-based ID and its numbered variants (up to `_10`) are all
            already in use, or if no usable ID is found and `make_missing_stable_ids` is false.
    """
    prefixes = ["gene-", "gene:"]
    new_gene_id = StableIDAllocator.remove_prefix(gene.id, prefixes)

    is_valid = False
    # Special case for RefSeq: only valid Gene IDs are LOC*
    if refseq:
        if new_gene_id.startswith("LOC"):
            is_valid = True
    else:
        is_valid = self.is_valid(new_gene_id)

    if is_valid:
        return new_gene_id

    # In case the normalized gene ID is not valid, use the GeneID
    logging.debug(f"Gene ID is not valid: {new_gene_id}")
    qual = gene.qualifiers
    if "Dbxref" in qual:
        for xref in qual["Dbxref"]:
            # Only split on the first colon: the value itself may contain colons
            # (e.g. "HGNC:HGNC:1100"), and an xref without any colon is simply skipped
            db, _, value = xref.partition(":")
            if db != "GeneID":
                continue
            new_gene_id_base = f"{db}_{value}"
            new_gene_id = new_gene_id_base
            number = 1
            # Append an increasing number until the ID has not been used yet
            while new_gene_id in self._loaded_ids:
                number += 1
                new_gene_id = f"{new_gene_id_base}_{number}"
                if number > 10:
                    raise InvalidStableID(f"Duplicate ID {new_gene_id_base} (up to {new_gene_id})")
            self._loaded_ids.add(new_gene_id)
            logging.debug(f"Using GeneID {new_gene_id} for stable_id instead of {gene.id}")
            return new_gene_id

    # Make a new stable_id
    if self.make_missing_stable_ids:
        new_gene_id = self.generate_gene_id()
        logging.debug(f"New ID: {gene.id} -> {new_gene_id}")
        return new_gene_id
    raise InvalidStableID(f"Can't use invalid gene id for {gene}")

normalize_pseudogene_cds_id(pseudogene)

Normalizes every CDS ID of the provided pseudogene.

Ensures each CDS from a pseudogene has a proper ID:

- Different from the gene ID
- Derived from the transcript ID (`<transcript ID>_cds`) if it is not proper

Parameters:

Name Type Description Default
pseudogene GFFSeqFeature

Pseudogene feature.

required
Source code in src/python/ensembl/io/genomio/gff3/id_allocator.py
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
def normalize_pseudogene_cds_id(self, pseudogene: GFFSeqFeature) -> None:
    """Gives a proper ID to every CDS of the provided pseudogene, in place.

    Each CDS ID is normalized first. If the result is empty or is the same as the pseudogene ID,
    the CDS gets `<transcript ID>_cds` as its ID, and its "ID" qualifier is updated to match.

    Args:
        pseudogene: Pseudogene feature.
    """
    for transcript in pseudogene.sub_features:
        cds_features = (sub_feat for sub_feat in transcript.sub_features if sub_feat.type == "CDS")
        for cds in cds_features:
            cds.id = self.normalize_cds_id(cds.id)
            # Nothing more to do if the normalized ID is usable and differs from the gene ID
            if cds.id not in ("", pseudogene.id):
                continue
            cds.id = f"{transcript.id}_cds"
            cds.qualifiers["ID"] = cds.id

remove_prefix(stable_id, prefixes) staticmethod

Returns the stable ID after removing its prefix (if any).

If more than one prefix may be found, only the first one is removed.

Parameters:

Name Type Description Default
stable_id str

Stable ID to process.

required
prefixes List[str]

List of prefixes to search for.

required
Source code in src/python/ensembl/io/genomio/gff3/id_allocator.py
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
@staticmethod
def remove_prefix(stable_id: str, prefixes: List[str]) -> str:
    """Returns the stable ID after removing its prefix (if any).

    If more than one prefix may be found, only the first one is removed.

    Args:
        stable_id: Stable ID to process.
        prefixes: List of prefixes to search for.
    """

    for prefix in prefixes:
        if stable_id.startswith(prefix):
            return stable_id[len(prefix) :]
    return stable_id

set_prefix(genome)

Sets the ID prefix using the organism abbrev if it exists in the genome metadata.

Source code in src/python/ensembl/io/genomio/gff3/id_allocator.py
43
44
45
46
47
48
49
50
51
def set_prefix(self, genome: Dict) -> None:
    """Updates the ID prefix from the organism abbrev found in the genome metadata.

    The prefix is set to `TMP_<organism_abbrev>_`, or to `TMP_PREFIX_` if the metadata has no
    `BRC4` / `organism_abbrev` key.
    """
    try:
        abbrev = genome["BRC4"]["organism_abbrev"]
    except KeyError:
        # No organism abbrev available: fall back on a generic prefix
        self.prefix = "TMP_PREFIX_"
        return
    self.prefix = "TMP_" + abbrev + "_"