Skip to content

collection

ensembl.io.genomio.seq_region.collection

SeqCollection as a collection of seq_regions metadata.

SeqRegionDict = dict[str, Any] module-attribute

SeqCollection

Represent a collection of seq_regions metadata.

Source code in src/python/ensembl/io/genomio/seq_region/collection.py
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
class SeqCollection:
    """Represent a collection of seq_regions metadata."""

    mock: bool
    seqs: dict

    def __init__(self, mock: bool = False) -> None:
        self.seqs = {}
        self.mock = mock

    def from_gbff(self, gbff_path: Path) -> None:
        """Store seq_regions extracted from a GBFF file.

        If a seq_region with the same ID exists in the collection, it will be replaced.
        """
        with open_gz_file(gbff_path) as gbff_file:
            for record in SeqIO.parse(gbff_file, "genbank"):
                record_data = GBFFRecord(record)
                new_seq: SeqRegionDict = self.make_seqregion_from_gbff(record_data)
                name = record.id
                merged_seq = self._merge(new_seq, self.seqs.get(name, {}))
                self.seqs[name] = merged_seq

    def _merge(self, source: SeqRegionDict, destination: SeqRegionDict) -> SeqRegionDict:
        """Merge a source dict in a destination dict (only add values or append to lists)."""
        if not destination:
            return source
        for key, value in source.items():
            if isinstance(value, list):
                destination[key] += value
            else:
                destination[key] = value

        return destination

    @staticmethod
    def make_seqregion_from_gbff(record_data: GBFFRecord) -> SeqRegionDict:
        """Returns a seq_region dict extracted from a GBFF record."""
        seqr: SeqRegionDict = {"length": len(record_data.record.seq)}  # type: ignore[arg-type]

        if record_data.is_circular():
            seqr["circular"] = True

        # Is there a genetic code defined?
        codon_table = record_data.get_codon_table()
        if codon_table is not None:
            seqr["codon_table"] = codon_table

        # Is it an organelle?
        location = record_data.get_organelle()
        if location is not None:
            seqr["location"] = location

        # Is there a comment stating the Genbank record this is based on?
        genbank_id = record_data.get_genbank_id()
        if genbank_id is not None:
            seqr["synonyms"] = [{"source": "INSDC", "name": genbank_id}]

        return seqr

    def from_report(self, report_path: Path, is_refseq: bool = False) -> None:
        """Store seq_regions extracted from an INSDC assembly report file.

        If a seq_region with the same id exists in the collection, it will be replaced.

        Args:
            report_path: Path to the sequence regions report file.
            is_refseq: True if the source of the report is RefSeq, false if INSDC.

        """
        report = ReportRecord(report_path)
        for seq_data in report.reader:
            new_seq = self.make_seq_region_from_report(seq_data, is_refseq)
            name = new_seq["name"]
            merged_seq = self._merge(new_seq, self.seqs.get(name, {}))
            self.seqs[name] = merged_seq

    @staticmethod
    def make_seq_region_from_report(
        seq_data: dict[str, Any],
        is_refseq: bool,
        synonym_map: Mapping[str, str] = SYNONYM_MAP,
        molecule_location: Mapping[str, str] = MOLECULE_LOCATION,
    ) -> SeqRegionDict:
        """Returns a sequence region from the information provided.

        An empty sequence region will be returned if no accession information is found.

        Args:
            data: Dict from the report representing one line, where the key is the column name.
            is_refseq: True if the source is RefSeq, false if INSDC.
            synonym_map: Map of INSDC report column names to sequence region field names.
            molecule_location: Map of sequence type to SO location.

        Raises:
            UnknownMetadata: If the sequence role or location is not recognised.

        """
        seq_region = {}

        # Set accession as the sequence region name
        src = "RefSeq" if is_refseq else "GenBank"
        accession_id = seq_data.get(f"{src}-Accn", "")
        if not accession_id or (accession_id == "na"):
            logging.warning(f'No {src} accession ID found for {seq_data["Sequence-Name"]}')
            return {}
        seq_region["name"] = accession_id

        # Add synonyms
        synonyms = []
        for field, source in synonym_map.items():
            if (field in seq_data) and (seq_data[field].casefold() != "na"):
                synonym = {"source": source, "name": seq_data[field]}
                synonyms.append(synonym)
        synonyms.sort(key=lambda x: x["source"])
        seq_region["synonyms"] = synonyms

        # Add sequence length
        field = "Sequence-Length"
        if (field in seq_data) and (seq_data[field].casefold() != "na"):
            seq_region["length"] = int(seq_data[field])

        # Add coordinate system and location
        seq_role = seq_data["Sequence-Role"]
        # Scaffold?
        if seq_role in ("unplaced-scaffold", "unlocalized-scaffold"):
            seq_region["coord_system_level"] = "scaffold"
        # Chromosome? Check location
        elif seq_role == "assembled-molecule":
            seq_region["coord_system_level"] = "chromosome"
            location = seq_data["Assigned-Molecule-Location/Type"].lower()
            # Get location metadata
            try:
                seq_region["location"] = molecule_location[location]
            except KeyError as exc:
                raise UnknownMetadata(f"Unrecognized sequence location: {location}") from exc
        else:
            raise UnknownMetadata(f"Unrecognized sequence role: {seq_role}")

        return seq_region

    def remove(self, to_exclude: list[str]) -> None:
        """Remove seq_regions based on a provided list of accessions."""
        for seq_name in to_exclude:
            if seq_name in self.seqs:
                del self.seqs[seq_name]
            else:
                logging.info(f"Cannot exclude seq not found: {seq_name}")

    def add_translation_table(self, location_codon: Mapping[str, int] = LOCATION_CODON) -> None:
        """Adds the translation codon table to each sequence region (when missing) based on its location.

        Args:
            location_codon: Map of known codon tables for known locations.

        """
        for seqr in self.seqs.values():
            if "codon_table" in seqr:
                continue
            if seqr.get("location", "") in location_codon:
                seqr["codon_table"] = location_codon[seqr["location"]]

    def add_mitochondrial_codon_table(self, taxon_id: int) -> None:
        """Adds the mitochondrial codon table to each sequence region (when missing) based on the taxon ID.

        If no mitochondrial genetic code can be found for the given taxon ID nothing will be changed.

        Args:
            taxon_id: The species taxon ID.

        """
        if self.mock:
            logging.info("Skip mitochondrial codon table: mock")
            return
        if not taxon_id:
            logging.info("Skip mitochondrial codon table: no taxon_id to use")
            return

        url = f"https://www.ebi.ac.uk/ena/taxonomy/rest/tax-id/{str(taxon_id)}"
        response = requests.get(url, headers={"Content-Type": "application/json"}, timeout=60)
        response.raise_for_status()
        # In case we have been redirected, check for HTML opening tag
        if response.text.startswith("<"):
            raise ValueError(f"Response from {url} is not JSON")
        decoded = response.json()
        genetic_code = int(decoded.get("mitochondrialGeneticCode", 0))
        if genetic_code == 0:
            logging.warning(f"No mitochondria genetic code found for taxon {taxon_id}")
            return

        for seqr in self.seqs.values():
            if ("codon_table" not in seqr) and (seqr.get("location", "") == "mitochondrial_chromosome"):
                seqr["codon_table"] = genetic_code

    def to_list(self) -> list[SeqRegionDict]:
        """Returns the sequences as a simple list of `SeqRegionDict` objects."""
        return list(self.seqs.values())

mock = mock instance-attribute

seqs = {} instance-attribute

add_mitochondrial_codon_table(taxon_id)

Adds the mitochondrial codon table to each sequence region (when missing) based on the taxon ID.

If no mitochondrial genetic code can be found for the given taxon ID nothing will be changed.

Parameters:

Name Type Description Default
taxon_id int

The species taxon ID.

required
Source code in src/python/ensembl/io/genomio/seq_region/collection.py
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
def add_mitochondrial_codon_table(self, taxon_id: int) -> None:
    """Adds the mitochondrial codon table to each sequence region (when missing) based on the taxon ID.

    If no mitochondrial genetic code can be found for the given taxon ID nothing will be changed.

    Args:
        taxon_id: The species taxon ID.

    """
    if self.mock:
        logging.info("Skip mitochondrial codon table: mock")
        return
    if not taxon_id:
        logging.info("Skip mitochondrial codon table: no taxon_id to use")
        return

    url = f"https://www.ebi.ac.uk/ena/taxonomy/rest/tax-id/{str(taxon_id)}"
    response = requests.get(url, headers={"Content-Type": "application/json"}, timeout=60)
    response.raise_for_status()
    # In case we have been redirected, check for HTML opening tag
    if response.text.startswith("<"):
        raise ValueError(f"Response from {url} is not JSON")
    decoded = response.json()
    genetic_code = int(decoded.get("mitochondrialGeneticCode", 0))
    if genetic_code == 0:
        logging.warning(f"No mitochondria genetic code found for taxon {taxon_id}")
        return

    for seqr in self.seqs.values():
        if ("codon_table" not in seqr) and (seqr.get("location", "") == "mitochondrial_chromosome"):
            seqr["codon_table"] = genetic_code

add_translation_table(location_codon=LOCATION_CODON)

Adds the translation codon table to each sequence region (when missing) based on its location.

Parameters:

Name Type Description Default
location_codon Mapping[str, int]

Map of known codon tables for known locations.

LOCATION_CODON
Source code in src/python/ensembl/io/genomio/seq_region/collection.py
187
188
189
190
191
192
193
194
195
196
197
198
def add_translation_table(self, location_codon: Mapping[str, int] = LOCATION_CODON) -> None:
    """Adds the translation codon table to each sequence region (when missing) based on its location.

    Args:
        location_codon: Map of known codon tables for known locations.

    """
    for seqr in self.seqs.values():
        if "codon_table" in seqr:
            continue
        if seqr.get("location", "") in location_codon:
            seqr["codon_table"] = location_codon[seqr["location"]]

from_gbff(gbff_path)

Store seq_regions extracted from a GBFF file.

If a seq_region with the same ID exists in the collection, it will be replaced.

Source code in src/python/ensembl/io/genomio/seq_region/collection.py
48
49
50
51
52
53
54
55
56
57
58
59
def from_gbff(self, gbff_path: Path) -> None:
    """Store seq_regions extracted from a GBFF file.

    If a seq_region with the same ID exists in the collection, it will be replaced.
    """
    with open_gz_file(gbff_path) as gbff_file:
        for record in SeqIO.parse(gbff_file, "genbank"):
            record_data = GBFFRecord(record)
            new_seq: SeqRegionDict = self.make_seqregion_from_gbff(record_data)
            name = record.id
            merged_seq = self._merge(new_seq, self.seqs.get(name, {}))
            self.seqs[name] = merged_seq

from_report(report_path, is_refseq=False)

Store seq_regions extracted from an INSDC assembly report file.

If a seq_region with the same id exists in the collection, it will be replaced.

Parameters:

Name Type Description Default
report_path Path

Path to the sequence regions report file.

required
is_refseq bool

True if the source of the report is RefSeq, false if INSDC.

False
Source code in src/python/ensembl/io/genomio/seq_region/collection.py
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
def from_report(self, report_path: Path, is_refseq: bool = False) -> None:
    """Store seq_regions extracted from an INSDC assembly report file.

    If a seq_region with the same id exists in the collection, it will be replaced.

    Args:
        report_path: Path to the sequence regions report file.
        is_refseq: True if the source of the report is RefSeq, false if INSDC.

    """
    report = ReportRecord(report_path)
    for seq_data in report.reader:
        new_seq = self.make_seq_region_from_report(seq_data, is_refseq)
        name = new_seq["name"]
        merged_seq = self._merge(new_seq, self.seqs.get(name, {}))
        self.seqs[name] = merged_seq

make_seq_region_from_report(seq_data, is_refseq, synonym_map=SYNONYM_MAP, molecule_location=MOLECULE_LOCATION) staticmethod

Returns a sequence region from the information provided.

An empty sequence region will be returned if no accession information is found.

Parameters:

Name Type Description Default
data

Dict from the report representing one line, where the key is the column name.

required
is_refseq bool

True if the source is RefSeq, false if INSDC.

required
synonym_map Mapping[str, str]

Map of INSDC report column names to sequence region field names.

SYNONYM_MAP
molecule_location Mapping[str, str]

Map of sequence type to SO location.

MOLECULE_LOCATION

Raises:

Type Description
UnknownMetadata

If the sequence role or location is not recognised.

Source code in src/python/ensembl/io/genomio/seq_region/collection.py
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
@staticmethod
def make_seq_region_from_report(
    seq_data: dict[str, Any],
    is_refseq: bool,
    synonym_map: Mapping[str, str] = SYNONYM_MAP,
    molecule_location: Mapping[str, str] = MOLECULE_LOCATION,
) -> SeqRegionDict:
    """Returns a sequence region from the information provided.

    An empty sequence region will be returned if no accession information is found.

    Args:
        data: Dict from the report representing one line, where the key is the column name.
        is_refseq: True if the source is RefSeq, false if INSDC.
        synonym_map: Map of INSDC report column names to sequence region field names.
        molecule_location: Map of sequence type to SO location.

    Raises:
        UnknownMetadata: If the sequence role or location is not recognised.

    """
    seq_region = {}

    # Set accession as the sequence region name
    src = "RefSeq" if is_refseq else "GenBank"
    accession_id = seq_data.get(f"{src}-Accn", "")
    if not accession_id or (accession_id == "na"):
        logging.warning(f'No {src} accession ID found for {seq_data["Sequence-Name"]}')
        return {}
    seq_region["name"] = accession_id

    # Add synonyms
    synonyms = []
    for field, source in synonym_map.items():
        if (field in seq_data) and (seq_data[field].casefold() != "na"):
            synonym = {"source": source, "name": seq_data[field]}
            synonyms.append(synonym)
    synonyms.sort(key=lambda x: x["source"])
    seq_region["synonyms"] = synonyms

    # Add sequence length
    field = "Sequence-Length"
    if (field in seq_data) and (seq_data[field].casefold() != "na"):
        seq_region["length"] = int(seq_data[field])

    # Add coordinate system and location
    seq_role = seq_data["Sequence-Role"]
    # Scaffold?
    if seq_role in ("unplaced-scaffold", "unlocalized-scaffold"):
        seq_region["coord_system_level"] = "scaffold"
    # Chromosome? Check location
    elif seq_role == "assembled-molecule":
        seq_region["coord_system_level"] = "chromosome"
        location = seq_data["Assigned-Molecule-Location/Type"].lower()
        # Get location metadata
        try:
            seq_region["location"] = molecule_location[location]
        except KeyError as exc:
            raise UnknownMetadata(f"Unrecognized sequence location: {location}") from exc
    else:
        raise UnknownMetadata(f"Unrecognized sequence role: {seq_role}")

    return seq_region

make_seqregion_from_gbff(record_data) staticmethod

Returns a seq_region dict extracted from a GBFF record.

Source code in src/python/ensembl/io/genomio/seq_region/collection.py
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
@staticmethod
def make_seqregion_from_gbff(record_data: GBFFRecord) -> SeqRegionDict:
    """Returns a seq_region dict extracted from a GBFF record."""
    seqr: SeqRegionDict = {"length": len(record_data.record.seq)}  # type: ignore[arg-type]

    if record_data.is_circular():
        seqr["circular"] = True

    # Is there a genetic code defined?
    codon_table = record_data.get_codon_table()
    if codon_table is not None:
        seqr["codon_table"] = codon_table

    # Is it an organelle?
    location = record_data.get_organelle()
    if location is not None:
        seqr["location"] = location

    # Is there a comment stating the Genbank record this is based on?
    genbank_id = record_data.get_genbank_id()
    if genbank_id is not None:
        seqr["synonyms"] = [{"source": "INSDC", "name": genbank_id}]

    return seqr

remove(to_exclude)

Remove seq_regions based on a provided list of accessions.

Source code in src/python/ensembl/io/genomio/seq_region/collection.py
179
180
181
182
183
184
185
def remove(self, to_exclude: list[str]) -> None:
    """Remove seq_regions based on a provided list of accessions."""
    for seq_name in to_exclude:
        if seq_name in self.seqs:
            del self.seqs[seq_name]
        else:
            logging.info(f"Cannot exclude seq not found: {seq_name}")

to_list()

Returns the sequences as a simple list of SeqRegionDict objects.

Source code in src/python/ensembl/io/genomio/seq_region/collection.py
232
233
234
def to_list(self) -> list[SeqRegionDict]:
    """Returns the sequences as a simple list of `SeqRegionDict` objects."""
    return list(self.seqs.values())