seq_region

ensembl.io.genomio.seq_region

Sequence regions handling module.

StrPath = TypeVar('StrPath', str, os.PathLike) module-attribute

ArgumentParser

Bases: ArgumentParser

Extends argparse.ArgumentParser with additional methods and functionality.

The default behaviour of the help text is to display the default values of every non-required argument, i.e. optional arguments with required=False.

Source code in ensembl/utils/argparse.py
class ArgumentParser(argparse.ArgumentParser):
    """Extends `argparse.ArgumentParser` with additional methods and functionality.

    The default behaviour of the help text will be to display the default values on every non-required
    argument, i.e. optional arguments with `required=False`.

    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """Extends the base class to include the information about default argument values by default."""
        super().__init__(*args, **kwargs)
        self.formatter_class = argparse.ArgumentDefaultsHelpFormatter

    def _validate_src_path(self, src_path: StrPath) -> Path:
        """Returns the path if exists and it is readable, raises an error through the parser otherwise.

        Args:
            src_path: File or directory path to check.

        """
        src_path = Path(src_path)
        if not src_path.exists():
            self.error(f"'{src_path}' not found")
        elif not os.access(src_path, os.R_OK):
            self.error(f"'{src_path}' not readable")
        return src_path

    def _validate_dst_path(self, dst_path: StrPath, exists_ok: bool = False) -> Path:
        """Returns the path if it is writable, raises an error through the parser otherwise.

        Args:
            dst_path: File or directory path to check.
            exists_ok: Do not raise an error during parsing if the destination path already exists.

        """
        dst_path = Path(dst_path)
        if dst_path.exists():
            if os.access(dst_path, os.W_OK):
                if exists_ok:
                    return dst_path
                self.error(f"'{dst_path}' already exists")
            else:
                self.error(f"'{dst_path}' is not writable")
        # Check if the first parent directory that exists is writable
        for parent_path in dst_path.parents:
            if parent_path.exists():
                if not os.access(parent_path, os.W_OK):
                    self.error(f"'{dst_path}' is not writable")
                break
        return dst_path

    def _validate_number(
        self,
        value: str,
        value_type: Callable[[str], int | float],
        min_value: int | float | None,
        max_value: int | float | None,
    ) -> int | float:
        """Returns the numeric value if it is of the expected type and it is within the specified range.

        Args:
            value: String representation of numeric value to check.
            value_type: Expected type of the numeric value.
            min_value: Minimum value constrain. If `None`, no minimum value constrain.
            max_value: Maximum value constrain. If `None`, no maximum value constrain.

        """
        # Check if the string representation can be converted to the expected type
        try:
            result = value_type(value)
        except (TypeError, ValueError):
            self.error(f"invalid {value_type.__name__} value: {value}")
        # Check if numeric value is within range
        if (min_value is not None) and (result < min_value):
            self.error(f"{value} is lower than minimum value ({min_value})")
        if (max_value is not None) and (result > max_value):
            self.error(f"{value} is greater than maximum value ({max_value})")
        return result

    def add_argument(self, *args: Any, **kwargs: Any) -> None:  # type: ignore[override]
        """Extends the parent function by excluding the default value in the help text when not provided.

        Only applied to required arguments without a default value, i.e. positional arguments or optional
        arguments with `required=True`.

        """
        if kwargs.get("required", False):
            kwargs.setdefault("default", argparse.SUPPRESS)
        super().add_argument(*args, **kwargs)

    def add_argument_src_path(self, *args: Any, **kwargs: Any) -> None:
        """Adds `pathlib.Path` argument, checking if it exists and it is readable at parsing time.

        If "metavar" is not defined, it is added with "PATH" as value to improve help text readability.

        """
        kwargs.setdefault("metavar", "PATH")
        kwargs["type"] = self._validate_src_path
        self.add_argument(*args, **kwargs)

    def add_argument_dst_path(self, *args: Any, exists_ok: bool = True, **kwargs: Any) -> None:
        """Adds `pathlib.Path` argument, checking if it is writable at parsing time.

        If "metavar" is not defined it is added with "PATH" as value to improve help text readability.

        Args:
            exists_ok: Do not raise an error if the destination path already exists.

        """
        kwargs.setdefault("metavar", "PATH")
        kwargs["type"] = lambda x: self._validate_dst_path(x, exists_ok)
        self.add_argument(*args, **kwargs)

    def add_argument_url(self, *args: Any, **kwargs: Any) -> None:
        """Adds `sqlalchemy.engine.URL` argument.

        If "metavar" is not defined it is added with "URI" as value to improve help text readability.

        """
        kwargs.setdefault("metavar", "URI")
        kwargs["type"] = make_url
        self.add_argument(*args, **kwargs)

    # pylint: disable=redefined-builtin
    def add_numeric_argument(
        self,
        *args: Any,
        type: Callable[[str], int | float] = float,
        min_value: int | float | None = None,
        max_value: int | float | None = None,
        **kwargs: Any,
    ) -> None:
        """Adds a numeric argument with constrains on its type and its minimum or maximum value.

        Note that the default value (if defined) is not checked unless the argument is an optional argument
        and no value is provided in the command line.

        Args:
            type: Type to convert the argument value to when parsing.
            min_value: Minimum value constrain. If `None`, no minimum value constrain.
            max_value: Maximum value constrain. If `None`, no maximum value constrain.

        """
        # If both minimum and maximum values are defined, ensure min_value <= max_value
        if (min_value is not None) and (max_value is not None) and (min_value > max_value):
            raise ArgumentError("minimum value is greater than maximum value")
        # Add lambda function to check numeric constrains when parsing argument
        kwargs["type"] = lambda x: self._validate_number(x, type, min_value, max_value)
        self.add_argument(*args, **kwargs)

    # pylint: disable=redefined-builtin
    def add_server_arguments(
        self, prefix: str = "", include_database: bool = False, help: str | None = None
    ) -> None:
        """Adds the usual set of arguments needed to connect to a server, i.e. `--host`, `--port`, `--user`
        and `--password` (optional).

        Note that the parser will assume this is a MySQL server.

        Args:
            prefix: Prefix to add the each argument, e.g. if prefix is `src_`, the arguments will be
                `--src_host`, etc.
            include_database: Include `--database` argument.
            help: Description message to include for this set of arguments.

        """
        group = self.add_argument_group(f"{prefix}server connection arguments", description=help)
        group.add_argument(
            f"--{prefix}host", required=True, metavar="HOST", default=argparse.SUPPRESS, help="host name"
        )
        group.add_argument(
            f"--{prefix}port",
            required=True,
            type=int,
            metavar="PORT",
            default=argparse.SUPPRESS,
            help="port number",
        )
        group.add_argument(
            f"--{prefix}user", required=True, metavar="USER", default=argparse.SUPPRESS, help="user name"
        )
        group.add_argument(f"--{prefix}password", metavar="PWD", help="host password")
        if include_database:
            group.add_argument(
                f"--{prefix}database",
                required=True,
                metavar="NAME",
                default=argparse.SUPPRESS,
                help="database name",
            )

    def add_log_arguments(self, add_log_file: bool = False) -> None:
        """Adds the usual set of arguments required to set and initialise a logging system.

        The current set includes a mutually exclusive group for the default logging level: `--verbose`,
        `--debug` or `--log LEVEL`.

        Args:
            add_log_file: Add arguments to allow storing messages into a file, i.e. `--log_file` and
                `--log_file_level`.

        """
        # Define the list of log levels available
        log_levels = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]
        # NOTE: from 3.11 this list can be changed to: logging.getLevelNamesMapping().keys()
        # Create logging arguments group
        group = self.add_argument_group("logging arguments")
        # Add 3 mutually exclusive options to set the logging level
        subgroup = group.add_mutually_exclusive_group()
        subgroup.add_argument(
            "-v",
            "--verbose",
            action="store_const",
            const="INFO",
            dest="log_level",
            help="verbose mode, i.e. 'INFO' log level",
        )
        subgroup.add_argument(
            "--debug",
            action="store_const",
            const="DEBUG",
            dest="log_level",
            help="debugging mode, i.e. 'DEBUG' log level",
        )
        subgroup.add_argument(
            "--log",
            choices=log_levels,
            type=str.upper,
            default="WARNING",
            metavar="LEVEL",
            dest="log_level",
            help="level of the events to track: %(choices)s",
        )
        subgroup.set_defaults(log_level="WARNING")
        if add_log_file:
            # Add log file-related arguments
            group.add_argument(
                "--log_file",
                type=lambda x: self._validate_dst_path(x, exists_ok=True),
                metavar="PATH",
                default=None,
                help="log file path",
            )
            group.add_argument(
                "--log_file_level",
                choices=log_levels,
                type=str.upper,
                default="DEBUG",
                metavar="LEVEL",
                help="level of the events to track in the log file: %(choices)s",
            )

    def parse_args(self, *args: Any, **kwargs: Any) -> argparse.Namespace:  # type: ignore[override]
        """Extends the parent function by adding a new URL argument for every server group added.

        The type of this new argument will be `sqlalchemy.engine.URL`. It also logs all the parsed
        arguments for debugging purposes when logging arguments have been added.

        """
        arguments = super().parse_args(*args, **kwargs)
        # Build and add an sqlalchemy.engine.URL object for every server group added
        pattern = re.compile(r"([\w-]*)host$")
        server_prefixes = [x.group(1) for x in map(pattern.match, vars(arguments)) if x]
        for prefix in server_prefixes:
            # Raise an error rather than overwriting when the URL argument is already present
            if f"{prefix}url" in arguments:
                self.error(f"argument '{prefix}url' is already present")
            try:
                server_url = URL.create(
                    "mysql",
                    getattr(arguments, f"{prefix}user"),
                    getattr(arguments, f"{prefix}password"),
                    getattr(arguments, f"{prefix}host"),
                    getattr(arguments, f"{prefix}port"),
                    getattr(arguments, f"{prefix}database", None),
                )
            except AttributeError:
                # Not a database server host argument
                continue
            setattr(arguments, f"{prefix}url", server_url)
        return arguments
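
A minimal usage sketch (illustrative only): it assumes the class can be imported from ensembl.utils.argparse, as the source path above suggests, and the argument names are hypothetical.

from ensembl.utils.argparse import ArgumentParser

parser = ArgumentParser(description="Example script")
# Optional argument: its default value is shown in the help text automatically
parser.add_argument("--limit", type=int, default=10, help="maximum number of records")
# Required argument: the default is suppressed, so no "(default: ...)" in the help
parser.add_argument("--name", required=True, help="record name")
args = parser.parse_args(["--name", "chr1"])
print(args.name, args.limit)  # chr1 10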

formatter_class = argparse.ArgumentDefaultsHelpFormatter instance-attribute

add_argument(*args, **kwargs)

Extends the parent function by excluding the default value in the help text when not provided.

Only applied to required arguments without a default value, i.e. positional arguments or optional arguments with required=True.

Source code in ensembl/utils/argparse.py
def add_argument(self, *args: Any, **kwargs: Any) -> None:  # type: ignore[override]
    """Extends the parent function by excluding the default value in the help text when not provided.

    Only applied to required arguments without a default value, i.e. positional arguments or optional
    arguments with `required=True`.

    """
    if kwargs.get("required", False):
        kwargs.setdefault("default", argparse.SUPPRESS)
    super().add_argument(*args, **kwargs)

add_argument_dst_path(*args, exists_ok=True, **kwargs)

Adds pathlib.Path argument, checking if it is writable at parsing time.

If "metavar" is not defined it is added with "PATH" as value to improve help text readability.

Parameters:
    exists_ok (bool): Do not raise an error if the destination path already exists. Default: True
Source code in ensembl/utils/argparse.py
def add_argument_dst_path(self, *args: Any, exists_ok: bool = True, **kwargs: Any) -> None:
    """Adds `pathlib.Path` argument, checking if it is writable at parsing time.

    If "metavar" is not defined it is added with "PATH" as value to improve help text readability.

    Args:
        exists_ok: Do not raise an error if the destination path already exists.

    """
    kwargs.setdefault("metavar", "PATH")
    kwargs["type"] = lambda x: self._validate_dst_path(x, exists_ok)
    self.add_argument(*args, **kwargs)
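
A minimal sketch of both path helpers (add_argument_src_path is documented just below). The import path is assumed from the source location above, and the temporary file stands in for a real input.

import tempfile
from ensembl.utils.argparse import ArgumentParser

parser = ArgumentParser(description="Path arguments example")
parser.add_argument_src_path("--input", required=True, help="existing, readable input file")
parser.add_argument_dst_path("--output", default="out.json", help="writable output file")
with tempfile.NamedTemporaryFile() as tmp:
    args = parser.parse_args(["--input", tmp.name, "--output", "out.json"])
print(type(args.input), type(args.output))  # both are pathlib.Path objects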

add_argument_src_path(*args, **kwargs)

Adds pathlib.Path argument, checking if it exists and it is readable at parsing time.

If "metavar" is not defined, it is added with "PATH" as value to improve help text readability.

Source code in ensembl/utils/argparse.py
def add_argument_src_path(self, *args: Any, **kwargs: Any) -> None:
    """Adds `pathlib.Path` argument, checking if it exists and it is readable at parsing time.

    If "metavar" is not defined, it is added with "PATH" as value to improve help text readability.

    """
    kwargs.setdefault("metavar", "PATH")
    kwargs["type"] = self._validate_src_path
    self.add_argument(*args, **kwargs)

add_argument_url(*args, **kwargs)

Adds sqlalchemy.engine.URL argument.

If "metavar" is not defined it is added with "URI" as value to improve help text readability.

Source code in ensembl/utils/argparse.py
def add_argument_url(self, *args: Any, **kwargs: Any) -> None:
    """Adds `sqlalchemy.engine.URL` argument.

    If "metavar" is not defined it is added with "URI" as value to improve help text readability.

    """
    kwargs.setdefault("metavar", "URI")
    kwargs["type"] = make_url
    self.add_argument(*args, **kwargs)
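
A minimal sketch (import path assumed from the source location above): the argument string is parsed with sqlalchemy.engine.make_url, so its components are available on the resulting URL object.

from ensembl.utils.argparse import ArgumentParser

parser = ArgumentParser(description="URL argument example")
parser.add_argument_url("--db_url", required=True, help="database connection URL")
args = parser.parse_args(["--db_url", "mysql://reader@localhost:3306/my_core"])
print(args.db_url.drivername, args.db_url.host, args.db_url.database)  # mysql localhost my_core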

add_log_arguments(add_log_file=False)

Adds the usual set of arguments required to set and initialise a logging system.

The current set includes a mutually exclusive group for the default logging level: --verbose, --debug or --log LEVEL.

Parameters:
    add_log_file (bool): Add arguments to allow storing messages into a file, i.e. --log_file and --log_file_level. Default: False
Source code in ensembl/utils/argparse.py
def add_log_arguments(self, add_log_file: bool = False) -> None:
    """Adds the usual set of arguments required to set and initialise a logging system.

    The current set includes a mutually exclusive group for the default logging level: `--verbose`,
    `--debug` or `--log LEVEL`.

    Args:
        add_log_file: Add arguments to allow storing messages into a file, i.e. `--log_file` and
            `--log_file_level`.

    """
    # Define the list of log levels available
    log_levels = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]
    # NOTE: from 3.11 this list can be changed to: logging.getLevelNamesMapping().keys()
    # Create logging arguments group
    group = self.add_argument_group("logging arguments")
    # Add 3 mutually exclusive options to set the logging level
    subgroup = group.add_mutually_exclusive_group()
    subgroup.add_argument(
        "-v",
        "--verbose",
        action="store_const",
        const="INFO",
        dest="log_level",
        help="verbose mode, i.e. 'INFO' log level",
    )
    subgroup.add_argument(
        "--debug",
        action="store_const",
        const="DEBUG",
        dest="log_level",
        help="debugging mode, i.e. 'DEBUG' log level",
    )
    subgroup.add_argument(
        "--log",
        choices=log_levels,
        type=str.upper,
        default="WARNING",
        metavar="LEVEL",
        dest="log_level",
        help="level of the events to track: %(choices)s",
    )
    subgroup.set_defaults(log_level="WARNING")
    if add_log_file:
        # Add log file-related arguments
        group.add_argument(
            "--log_file",
            type=lambda x: self._validate_dst_path(x, exists_ok=True),
            metavar="PATH",
            default=None,
            help="log file path",
        )
        group.add_argument(
            "--log_file_level",
            choices=log_levels,
            type=str.upper,
            default="DEBUG",
            metavar="LEVEL",
            help="level of the events to track in the log file: %(choices)s",
        )
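
A minimal sketch wiring these arguments to init_logging_with_args() (documented further down this page, and assumed importable from ensembl.utils.logging per its source path); the log file name is a placeholder.

from ensembl.utils.argparse import ArgumentParser
from ensembl.utils.logging import init_logging_with_args

parser = ArgumentParser(description="Logging arguments example")
parser.add_log_arguments(add_log_file=True)
args = parser.parse_args(["--debug", "--log_file", "run.log"])
# args.log_level == "DEBUG", args.log_file == Path("run.log"), args.log_file_level == "DEBUG"
init_logging_with_args(args)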

add_numeric_argument(*args, type=float, min_value=None, max_value=None, **kwargs)

Adds a numeric argument with constraints on its type and its minimum or maximum value.

Note that the default value (if defined) is not checked unless the argument is an optional argument and no value is provided on the command line.

Parameters:
    type (Callable[[str], int | float]): Type to convert the argument value to when parsing. Default: float
    min_value (int | float | None): Minimum value constraint. If None, no minimum value constraint. Default: None
    max_value (int | float | None): Maximum value constraint. If None, no maximum value constraint. Default: None
Source code in ensembl/utils/argparse.py
def add_numeric_argument(
    self,
    *args: Any,
    type: Callable[[str], int | float] = float,
    min_value: int | float | None = None,
    max_value: int | float | None = None,
    **kwargs: Any,
) -> None:
    """Adds a numeric argument with constrains on its type and its minimum or maximum value.

    Note that the default value (if defined) is not checked unless the argument is an optional argument
    and no value is provided in the command line.

    Args:
        type: Type to convert the argument value to when parsing.
        min_value: Minimum value constrain. If `None`, no minimum value constrain.
        max_value: Maximum value constrain. If `None`, no maximum value constrain.

    """
    # If both minimum and maximum values are defined, ensure min_value <= max_value
    if (min_value is not None) and (max_value is not None) and (min_value > max_value):
        raise ArgumentError("minimum value is greater than maximum value")
    # Add lambda function to check numeric constrains when parsing argument
    kwargs["type"] = lambda x: self._validate_number(x, type, min_value, max_value)
    self.add_argument(*args, **kwargs)
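
A minimal sketch: the range check runs at parsing time, so an out-of-range value makes the parser exit with an error.

from ensembl.utils.argparse import ArgumentParser

parser = ArgumentParser(description="Numeric argument example")
parser.add_numeric_argument(
    "--threshold", type=float, min_value=0.0, max_value=1.0, help="score cut-off"
)
args = parser.parse_args(["--threshold", "0.75"])
print(args.threshold)  # 0.75
# parser.parse_args(["--threshold", "2"]) would exit: greater than maximum value (1.0)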

add_server_arguments(prefix='', include_database=False, help=None)

Adds the usual set of arguments needed to connect to a server, i.e. --host, --port, --user and --password (optional).

Note that the parser will assume this is a MySQL server.

Parameters:
    prefix (str): Prefix to add to each argument, e.g. if prefix is src_, the arguments will be --src_host, etc. Default: ''
    include_database (bool): Include --database argument. Default: False
    help (str | None): Description message to include for this set of arguments. Default: None
Source code in ensembl/utils/argparse.py
def add_server_arguments(
    self, prefix: str = "", include_database: bool = False, help: str | None = None
) -> None:
    """Adds the usual set of arguments needed to connect to a server, i.e. `--host`, `--port`, `--user`
    and `--password` (optional).

    Note that the parser will assume this is a MySQL server.

    Args:
        prefix: Prefix to add the each argument, e.g. if prefix is `src_`, the arguments will be
            `--src_host`, etc.
        include_database: Include `--database` argument.
        help: Description message to include for this set of arguments.

    """
    group = self.add_argument_group(f"{prefix}server connection arguments", description=help)
    group.add_argument(
        f"--{prefix}host", required=True, metavar="HOST", default=argparse.SUPPRESS, help="host name"
    )
    group.add_argument(
        f"--{prefix}port",
        required=True,
        type=int,
        metavar="PORT",
        default=argparse.SUPPRESS,
        help="port number",
    )
    group.add_argument(
        f"--{prefix}user", required=True, metavar="USER", default=argparse.SUPPRESS, help="user name"
    )
    group.add_argument(f"--{prefix}password", metavar="PWD", help="host password")
    if include_database:
        group.add_argument(
            f"--{prefix}database",
            required=True,
            metavar="NAME",
            default=argparse.SUPPRESS,
            help="database name",
        )
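
A minimal sketch with a prefixed group; all connection details are placeholders. Note that parse_args() (below) will also build a src_url attribute from these values.

from ensembl.utils.argparse import ArgumentParser

parser = ArgumentParser(description="Server arguments example")
parser.add_server_arguments(prefix="src_", include_database=True, help="source core database")
args = parser.parse_args([
    "--src_host", "localhost", "--src_port", "3306",
    "--src_user", "reader", "--src_database", "my_core",
])
print(args.src_host, args.src_port, args.src_database)  # localhost 3306 my_core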

parse_args(*args, **kwargs)

Extends the parent function by adding a new URL argument for every server group added.

The type of this new argument will be sqlalchemy.engine.URL. It also logs all the parsed arguments for debugging purposes when logging arguments have been added.

Source code in ensembl/utils/argparse.py
def parse_args(self, *args: Any, **kwargs: Any) -> argparse.Namespace:  # type: ignore[override]
    """Extends the parent function by adding a new URL argument for every server group added.

    The type of this new argument will be `sqlalchemy.engine.URL`. It also logs all the parsed
    arguments for debugging purposes when logging arguments have been added.

    """
    arguments = super().parse_args(*args, **kwargs)
    # Build and add an sqlalchemy.engine.URL object for every server group added
    pattern = re.compile(r"([\w-]*)host$")
    server_prefixes = [x.group(1) for x in map(pattern.match, vars(arguments)) if x]
    for prefix in server_prefixes:
        # Raise an error rather than overwriting when the URL argument is already present
        if f"{prefix}url" in arguments:
            self.error(f"argument '{prefix}url' is already present")
        try:
            server_url = URL.create(
                "mysql",
                getattr(arguments, f"{prefix}user"),
                getattr(arguments, f"{prefix}password"),
                getattr(arguments, f"{prefix}host"),
                getattr(arguments, f"{prefix}port"),
                getattr(arguments, f"{prefix}database", None),
            )
        except AttributeError:
            # Not a database server host argument
            continue
        setattr(arguments, f"{prefix}url", server_url)
    return arguments
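
A minimal sketch of the URL construction: for an unprefixed server group, the extra attribute is simply named url. All connection values are placeholders.

from ensembl.utils.argparse import ArgumentParser

parser = ArgumentParser(description="parse_args example")
parser.add_server_arguments(include_database=True)
args = parser.parse_args(
    ["--host", "myhost", "--port", "3306", "--user", "reader", "--database", "my_core"]
)
print(args.url)  # mysql://reader@myhost:3306/my_core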

SeqCollection

Represent a collection of seq_regions metadata.

Source code in src/python/ensembl/io/genomio/seq_region/collection.py
class SeqCollection:
    """Represent a collection of seq_regions metadata."""

    mock: bool
    seqs: dict

    def __init__(self, mock: bool = False) -> None:
        self.seqs = {}
        self.mock = mock

    def from_gbff(self, gbff_path: Path) -> None:
        """Store seq_regions extracted from a GBFF file.

        If a seq_region with the same ID exists in the collection, it will be replaced.
        """
        with open_gz_file(gbff_path) as gbff_file:
            for record in SeqIO.parse(gbff_file, "genbank"):
                record_data = GBFFRecord(record)
                new_seq: SeqRegionDict = self.make_seqregion_from_gbff(record_data)
                name = record.id
                merged_seq = self._merge(new_seq, self.seqs.get(name, {}))
                self.seqs[name] = merged_seq

    def _merge(self, source: SeqRegionDict, destination: SeqRegionDict) -> SeqRegionDict:
        """Merge a source dict in a destination dict (only add values or append to lists)."""
        if not destination:
            return source
        for key, value in source.items():
            if isinstance(value, list):
                destination[key] += value
            else:
                destination[key] = value

        return destination

    @staticmethod
    def make_seqregion_from_gbff(record_data: GBFFRecord) -> SeqRegionDict:
        """Returns a seq_region dict extracted from a GBFF record."""
        seqr: SeqRegionDict = {"length": len(record_data.record.seq)}  # type: ignore[arg-type]

        if record_data.is_circular():
            seqr["circular"] = True

        # Is there a genetic code defined?
        codon_table = record_data.get_codon_table()
        if codon_table is not None:
            seqr["codon_table"] = codon_table

        # Is it an organelle?
        location = record_data.get_organelle()
        if location is not None:
            seqr["location"] = location

        # Is there a comment stating the Genbank record this is based on?
        genbank_id = record_data.get_genbank_id()
        if genbank_id is not None:
            seqr["synonyms"] = [{"source": "INSDC", "name": genbank_id}]

        return seqr

    def from_report(self, report_path: Path, is_refseq: bool = False) -> None:
        """Store seq_regions extracted from an INSDC assembly report file.

        If a seq_region with the same id exists in the collection, it will be replaced.

        Args:
            report_path: Path to the sequence regions report file.
            is_refseq: True if the source of the report is RefSeq, false if INSDC.

        """
        report = ReportRecord(report_path)
        for seq_data in report.reader:
            new_seq = self.make_seq_region_from_report(seq_data, is_refseq)
            name = new_seq["name"]
            merged_seq = self._merge(new_seq, self.seqs.get(name, {}))
            self.seqs[name] = merged_seq

    @staticmethod
    def make_seq_region_from_report(
        seq_data: dict[str, Any],
        is_refseq: bool,
        synonym_map: Mapping[str, str] = SYNONYM_MAP,
        molecule_location: Mapping[str, str] = MOLECULE_LOCATION,
    ) -> SeqRegionDict:
        """Returns a sequence region from the information provided.

        An empty sequence region will be returned if no accession information is found.

        Args:
            data: Dict from the report representing one line, where the key is the column name.
            is_refseq: True if the source is RefSeq, false if INSDC.
            synonym_map: Map of INSDC report column names to sequence region field names.
            molecule_location: Map of sequence type to SO location.

        Raises:
            UnknownMetadata: If the sequence role or location is not recognised.

        """
        seq_region = {}

        # Set accession as the sequence region name
        src = "RefSeq" if is_refseq else "GenBank"
        accession_id = seq_data.get(f"{src}-Accn", "")
        if not accession_id or (accession_id == "na"):
            logging.warning(f'No {src} accession ID found for {seq_data["Sequence-Name"]}')
            return {}
        seq_region["name"] = accession_id

        # Add synonyms
        synonyms = []
        for field, source in synonym_map.items():
            if (field in seq_data) and (seq_data[field].casefold() != "na"):
                synonym = {"source": source, "name": seq_data[field]}
                synonyms.append(synonym)
        synonyms.sort(key=lambda x: x["source"])
        seq_region["synonyms"] = synonyms

        # Add sequence length
        field = "Sequence-Length"
        if (field in seq_data) and (seq_data[field].casefold() != "na"):
            seq_region["length"] = int(seq_data[field])

        # Add coordinate system and location
        seq_role = seq_data["Sequence-Role"]
        # Scaffold?
        if seq_role in ("unplaced-scaffold", "unlocalized-scaffold"):
            seq_region["coord_system_level"] = "scaffold"
        # Chromosome? Check location
        elif seq_role == "assembled-molecule":
            seq_region["coord_system_level"] = "chromosome"
            location = seq_data["Assigned-Molecule-Location/Type"].lower()
            # Get location metadata
            try:
                seq_region["location"] = molecule_location[location]
            except KeyError as exc:
                raise UnknownMetadata(f"Unrecognized sequence location: {location}") from exc
        else:
            raise UnknownMetadata(f"Unrecognized sequence role: {seq_role}")

        return seq_region

    def remove(self, to_exclude: list[str]) -> None:
        """Remove seq_regions based on a provided list of accessions."""
        for seq_name in to_exclude:
            if seq_name in self.seqs:
                del self.seqs[seq_name]
            else:
                logging.info(f"Cannot exclude seq not found: {seq_name}")

    def add_translation_table(self, location_codon: Mapping[str, int] = LOCATION_CODON) -> None:
        """Adds the translation codon table to each sequence region (when missing) based on its location.

        Args:
            location_codon: Map of known codon tables for known locations.

        """
        for seqr in self.seqs.values():
            if "codon_table" in seqr:
                continue
            if seqr.get("location", "") in location_codon:
                seqr["codon_table"] = location_codon[seqr["location"]]

    def add_mitochondrial_codon_table(self, taxon_id: int) -> None:
        """Adds the mitochondrial codon table to each sequence region (when missing) based on the taxon ID.

        If no mitochondrial genetic code can be found for the given taxon ID nothing will be changed.

        Args:
            taxon_id: The species taxon ID.

        """
        if self.mock:
            logging.info("Skip mitochondrial codon table: mock")
            return
        if not taxon_id:
            logging.info("Skip mitochondrial codon table: no taxon_id to use")
            return

        url = f"https://www.ebi.ac.uk/ena/taxonomy/rest/tax-id/{str(taxon_id)}"
        response = requests.get(url, headers={"Content-Type": "application/json"}, timeout=60)
        response.raise_for_status()
        # In case we have been redirected, check for HTML opening tag
        if response.text.startswith("<"):
            raise ValueError(f"Response from {url} is not JSON")
        decoded = response.json()
        genetic_code = int(decoded.get("mitochondrialGeneticCode", 0))
        if genetic_code == 0:
            logging.warning(f"No mitochondria genetic code found for taxon {taxon_id}")
            return

        for seqr in self.seqs.values():
            if ("codon_table" not in seqr) and (seqr.get("location", "") == "mitochondrial_chromosome"):
                seqr["codon_table"] = genetic_code

    def to_list(self) -> list[SeqRegionDict]:
        """Returns the sequences as a simple list of `SeqRegionDict` objects."""
        return list(self.seqs.values())
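
A workflow sketch: the file names are placeholders, the import path is assumed from the source location above, and mock=True makes add_mitochondrial_codon_table() skip the external ENA taxonomy call.

from pathlib import Path
from ensembl.io.genomio.seq_region.collection import SeqCollection

seqs = SeqCollection(mock=True)
seqs.from_report(Path("assembly_report.txt"), is_refseq=False)  # INSDC assembly report
seqs.from_gbff(Path("genome.gbff.gz"))                          # optional GBFF metadata
seqs.remove(["scaffold_to_drop"])                               # drop unwanted accessions
seqs.add_translation_table()
seqs.add_mitochondrial_codon_table(taxon_id=9606)
seq_regions = seqs.to_list()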

mock = mock instance-attribute

seqs = {} instance-attribute

add_mitochondrial_codon_table(taxon_id)

Adds the mitochondrial codon table to each sequence region (when missing) based on the taxon ID.

If no mitochondrial genetic code can be found for the given taxon ID nothing will be changed.

Parameters:
    taxon_id (int): The species taxon ID. Required.
Source code in src/python/ensembl/io/genomio/seq_region/collection.py
def add_mitochondrial_codon_table(self, taxon_id: int) -> None:
    """Adds the mitochondrial codon table to each sequence region (when missing) based on the taxon ID.

    If no mitochondrial genetic code can be found for the given taxon ID nothing will be changed.

    Args:
        taxon_id: The species taxon ID.

    """
    if self.mock:
        logging.info("Skip mitochondrial codon table: mock")
        return
    if not taxon_id:
        logging.info("Skip mitochondrial codon table: no taxon_id to use")
        return

    url = f"https://www.ebi.ac.uk/ena/taxonomy/rest/tax-id/{str(taxon_id)}"
    response = requests.get(url, headers={"Content-Type": "application/json"}, timeout=60)
    response.raise_for_status()
    # In case we have been redirected, check for HTML opening tag
    if response.text.startswith("<"):
        raise ValueError(f"Response from {url} is not JSON")
    decoded = response.json()
    genetic_code = int(decoded.get("mitochondrialGeneticCode", 0))
    if genetic_code == 0:
        logging.warning(f"No mitochondria genetic code found for taxon {taxon_id}")
        return

    for seqr in self.seqs.values():
        if ("codon_table" not in seqr) and (seqr.get("location", "") == "mitochondrial_chromosome"):
            seqr["codon_table"] = genetic_code

add_translation_table(location_codon=LOCATION_CODON)

Adds the translation codon table to each sequence region (when missing) based on its location.

Parameters:
    location_codon (Mapping[str, int]): Map of known codon tables for known locations. Default: LOCATION_CODON
Source code in src/python/ensembl/io/genomio/seq_region/collection.py
def add_translation_table(self, location_codon: Mapping[str, int] = LOCATION_CODON) -> None:
    """Adds the translation codon table to each sequence region (when missing) based on its location.

    Args:
        location_codon: Map of known codon tables for known locations.

    """
    for seqr in self.seqs.values():
        if "codon_table" in seqr:
            continue
        if seqr.get("location", "") in location_codon:
            seqr["codon_table"] = location_codon[seqr["location"]]

from_gbff(gbff_path)

Store seq_regions extracted from a GBFF file.

If a seq_region with the same ID exists in the collection, it will be replaced.

Source code in src/python/ensembl/io/genomio/seq_region/collection.py
def from_gbff(self, gbff_path: Path) -> None:
    """Store seq_regions extracted from a GBFF file.

    If a seq_region with the same ID exists in the collection, it will be replaced.
    """
    with open_gz_file(gbff_path) as gbff_file:
        for record in SeqIO.parse(gbff_file, "genbank"):
            record_data = GBFFRecord(record)
            new_seq: SeqRegionDict = self.make_seqregion_from_gbff(record_data)
            name = record.id
            merged_seq = self._merge(new_seq, self.seqs.get(name, {}))
            self.seqs[name] = merged_seq

from_report(report_path, is_refseq=False)

Store seq_regions extracted from an INSDC assembly report file.

If a seq_region with the same id exists in the collection, it will be replaced.

Parameters:
    report_path (Path): Path to the sequence regions report file. Required.
    is_refseq (bool): True if the source of the report is RefSeq, false if INSDC. Default: False
Source code in src/python/ensembl/io/genomio/seq_region/collection.py
def from_report(self, report_path: Path, is_refseq: bool = False) -> None:
    """Store seq_regions extracted from an INSDC assembly report file.

    If a seq_region with the same id exists in the collection, it will be replaced.

    Args:
        report_path: Path to the sequence regions report file.
        is_refseq: True if the source of the report is RefSeq, false if INSDC.

    """
    report = ReportRecord(report_path)
    for seq_data in report.reader:
        new_seq = self.make_seq_region_from_report(seq_data, is_refseq)
        name = new_seq["name"]
        merged_seq = self._merge(new_seq, self.seqs.get(name, {}))
        self.seqs[name] = merged_seq

make_seq_region_from_report(seq_data, is_refseq, synonym_map=SYNONYM_MAP, molecule_location=MOLECULE_LOCATION) staticmethod

Returns a sequence region from the information provided.

An empty sequence region will be returned if no accession information is found.

Parameters:
    seq_data (dict[str, Any]): Dict from the report representing one line, where the key is the column name. Required.
    is_refseq (bool): True if the source is RefSeq, false if INSDC. Required.
    synonym_map (Mapping[str, str]): Map of INSDC report column names to sequence region field names. Default: SYNONYM_MAP
    molecule_location (Mapping[str, str]): Map of sequence type to SO location. Default: MOLECULE_LOCATION

Raises:
    UnknownMetadata: If the sequence role or location is not recognised.

Source code in src/python/ensembl/io/genomio/seq_region/collection.py
@staticmethod
def make_seq_region_from_report(
    seq_data: dict[str, Any],
    is_refseq: bool,
    synonym_map: Mapping[str, str] = SYNONYM_MAP,
    molecule_location: Mapping[str, str] = MOLECULE_LOCATION,
) -> SeqRegionDict:
    """Returns a sequence region from the information provided.

    An empty sequence region will be returned if no accession information is found.

    Args:
        data: Dict from the report representing one line, where the key is the column name.
        is_refseq: True if the source is RefSeq, false if INSDC.
        synonym_map: Map of INSDC report column names to sequence region field names.
        molecule_location: Map of sequence type to SO location.

    Raises:
        UnknownMetadata: If the sequence role or location is not recognised.

    """
    seq_region = {}

    # Set accession as the sequence region name
    src = "RefSeq" if is_refseq else "GenBank"
    accession_id = seq_data.get(f"{src}-Accn", "")
    if not accession_id or (accession_id == "na"):
        logging.warning(f'No {src} accession ID found for {seq_data["Sequence-Name"]}')
        return {}
    seq_region["name"] = accession_id

    # Add synonyms
    synonyms = []
    for field, source in synonym_map.items():
        if (field in seq_data) and (seq_data[field].casefold() != "na"):
            synonym = {"source": source, "name": seq_data[field]}
            synonyms.append(synonym)
    synonyms.sort(key=lambda x: x["source"])
    seq_region["synonyms"] = synonyms

    # Add sequence length
    field = "Sequence-Length"
    if (field in seq_data) and (seq_data[field].casefold() != "na"):
        seq_region["length"] = int(seq_data[field])

    # Add coordinate system and location
    seq_role = seq_data["Sequence-Role"]
    # Scaffold?
    if seq_role in ("unplaced-scaffold", "unlocalized-scaffold"):
        seq_region["coord_system_level"] = "scaffold"
    # Chromosome? Check location
    elif seq_role == "assembled-molecule":
        seq_region["coord_system_level"] = "chromosome"
        location = seq_data["Assigned-Molecule-Location/Type"].lower()
        # Get location metadata
        try:
            seq_region["location"] = molecule_location[location]
        except KeyError as exc:
            raise UnknownMetadata(f"Unrecognized sequence location: {location}") from exc
    else:
        raise UnknownMetadata(f"Unrecognized sequence role: {seq_role}")

    return seq_region
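
An illustrative call with a hand-written report row (real rows come from ReportRecord; the column names follow the INSDC assembly report format). It assumes the default MOLECULE_LOCATION map knows the 'chromosome' location.

from ensembl.io.genomio.seq_region.collection import SeqCollection

row = {
    "Sequence-Name": "1",
    "Sequence-Role": "assembled-molecule",
    "Assigned-Molecule-Location/Type": "Chromosome",
    "GenBank-Accn": "CM000001.1",
    "RefSeq-Accn": "NC_000001.1",
    "Sequence-Length": "248956422",
}
seq_region = SeqCollection.make_seq_region_from_report(row, is_refseq=False)
print(seq_region["name"], seq_region["coord_system_level"], seq_region["length"])
# CM000001.1 chromosome 248956422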

make_seqregion_from_gbff(record_data) staticmethod

Returns a seq_region dict extracted from a GBFF record.

Source code in src/python/ensembl/io/genomio/seq_region/collection.py
@staticmethod
def make_seqregion_from_gbff(record_data: GBFFRecord) -> SeqRegionDict:
    """Returns a seq_region dict extracted from a GBFF record."""
    seqr: SeqRegionDict = {"length": len(record_data.record.seq)}  # type: ignore[arg-type]

    if record_data.is_circular():
        seqr["circular"] = True

    # Is there a genetic code defined?
    codon_table = record_data.get_codon_table()
    if codon_table is not None:
        seqr["codon_table"] = codon_table

    # Is it an organelle?
    location = record_data.get_organelle()
    if location is not None:
        seqr["location"] = location

    # Is there a comment stating the Genbank record this is based on?
    genbank_id = record_data.get_genbank_id()
    if genbank_id is not None:
        seqr["synonyms"] = [{"source": "INSDC", "name": genbank_id}]

    return seqr

remove(to_exclude)

Remove seq_regions based on a provided list of accessions.

Source code in src/python/ensembl/io/genomio/seq_region/collection.py
def remove(self, to_exclude: list[str]) -> None:
    """Remove seq_regions based on a provided list of accessions."""
    for seq_name in to_exclude:
        if seq_name in self.seqs:
            del self.seqs[seq_name]
        else:
            logging.info(f"Cannot exclude seq not found: {seq_name}")

to_list()

Returns the sequences as a simple list of SeqRegionDict objects.

Source code in src/python/ensembl/io/genomio/seq_region/collection.py
def to_list(self) -> list[SeqRegionDict]:
    """Returns the sequences as a simple list of `SeqRegionDict` objects."""
    return list(self.seqs.values())

add_attribs(seq_region, seq_region_attrib)

Map seq_regions attribs to a specific name and type defined below.

Parameters:
    seq_region (dict): A seq_region dict to modify. Required.
    seq_region_attrib (dict): The attribs for this seq_region. Required.
Source code in src/python/ensembl/io/genomio/seq_region/dump.py
def add_attribs(seq_region: dict, seq_region_attrib: dict) -> None:
    """Map seq_regions attribs to a specific name and type defined below.

    Args:
        seq_region: A seq_region dict to modify.
        seq_region_attrib: The attribs for this seq_region.
    """
    bool_attribs = {
        "circular_seq": "circular",
        "non_ref": "non_ref",
    }
    int_attribs = {
        "codon_table": "codon_table",
    }
    string_attribs = {
        "BRC4_seq_region_name": "BRC4_seq_region_name",
        "EBI_seq_region_name": "EBI_seq_region_name",
        "coord_system_tag": "coord_system_level",
        "sequence_location": "location",
    }

    for name, key in bool_attribs.items():
        # Make sure "0" means False, i.e. not added
        value = int(seq_region_attrib.get(name, "0"))
        if value:
            seq_region[key] = bool(value)

    for name, key in int_attribs.items():
        value = seq_region_attrib.get(name, "")
        if value:
            seq_region[key] = int(value)

    for name, key in string_attribs.items():
        value = seq_region_attrib.get(name, "")
        if value:
            seq_region[key] = str(value)
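
A minimal sketch (import path assumed from the source location above) showing how raw attrib strings are mapped onto typed seq_region keys; the attrib values are illustrative.

from ensembl.io.genomio.seq_region.dump import add_attribs

seq_region = {"name": "MT", "length": 16569}
attribs = {"circular_seq": "1", "codon_table": "2", "coord_system_tag": "chromosome"}
add_attribs(seq_region, attribs)
print(seq_region)
# {'name': 'MT', 'length': 16569, 'circular': True, 'codon_table': 2, 'coord_system_level': 'chromosome'}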

fetch_coord_systems(session)

Retrieve the coord_system metadata from the current core.

Parameters:
    session (Session): Session for the current core database. Required.

Yields:
    CoordSystem: All default coord_systems in the core database.

Source code in src/python/ensembl/io/genomio/seq_region/dump.py
def fetch_coord_systems(session: Session) -> Iterator[CoordSystem]:
    """Retrieve the coord_system metadata from the current core.

    Args:
        session: Session for the current core database.

    Yields:
        All default coord_systems in the core database.
    """
    coord_system_select = select(CoordSystem).filter(CoordSystem.attrib.like(r"%default_version%"))
    for row in session.execute(coord_system_select).unique().all():
        coord: CoordSystem = row[0]
        yield coord

get_json(src_path, **kwargs)

Generic data JSON loader.

Parameters:
    src_path (StrPath): Path to the JSON file to load. Required.
Source code in src/python/ensembl/io/genomio/utils/json_utils.py
def get_json(src_path: StrPath, **kwargs: Any) -> Any:
    """Generic data JSON loader.

    Args:
        src_path: Path to the JSON file to load.

    """
    with Path(src_path).open("r", encoding="utf-8") as json_file:
        return json.load(json_file, **kwargs)

get_karyotype(seq_region)

Given a seq_region, extract the karyotype bands.

Parameters:
    seq_region (SeqRegion): The seq_region from which the karyotype bands are extracted. Required.

Returns:
    list[dict[str, str]]: List of all karyotype bands as a dict with keys 'start', 'end', 'name', 'stain' and 'structure'.

Source code in src/python/ensembl/io/genomio/seq_region/dump.py
def get_karyotype(seq_region: SeqRegion) -> list[dict[str, str]]:
    """Given a seq_region, extract the karyotype bands.

    Args:
        seq_region: The seq_region from which the karyotype bands are extracted.

    Returns:
        List of all karyotype bands as a dict with values 'start', 'end', 'name' 'stain', 'structure'.
    """
    bands = seq_region.karyotype
    kars = []
    if bands:
        for band in bands:
            kar = {"start": band.seq_region_start, "end": band.seq_region_end}
            if band.band:
                kar["name"] = band.band
            if band.stain:
                kar["stain"] = band.stain
                structure = _KARYOTYPE_STRUCTURE.get(band.stain, "")
                if structure:
                    kar["structure"] = structure
            kars.append(kar)

    kars = sorted(kars, key=lambda kar: kar.get("name", ""))
    return kars

get_seq_regions(session, external_db_map)

Returns all the sequence regions from the current core database.

Include synonyms, attribs and karyotypes. Only the top level sequences are exported.

Parameters:
    session (Session): Session from the current core database. Required.
    external_db_map (dict): Mapping of external_db names for the synonyms. Required.
Source code in src/python/ensembl/io/genomio/seq_region/dump.py
def get_seq_regions(session: Session, external_db_map: dict) -> list[SeqRegion]:
    """Returns all the sequence regions from the current core database.

    Include synonyms, attribs and karyotypes. Only the top level sequences are exported.

    Args:
        session: Session from the current core database.
        external_db_map: Mapping of external_db names for the synonyms.

    """
    seq_regions = []

    for coord_system in fetch_coord_systems(session):
        logging.debug(f"Dump coord {coord_system.name}")
        for seqr in fetch_seq_regions(session, coord_system):
            seq_region: dict[str, Any] = {}
            seq_region = {"name": seqr.name, "length": seqr.length}
            synonyms = get_synonyms(seqr, external_db_map)
            if synonyms:
                seq_region["synonyms"] = synonyms

            attribs = get_attribs_dict(seqr)
            if not attribs or "toplevel" not in attribs:
                # Skip seq_region without attribs or not toplevel
                continue
            add_attribs(seq_region, attribs)

            karyotype = get_karyotype(seqr)
            if karyotype:
                seq_region["karyotype_bands"] = karyotype

            added_seq = get_added_sequence(seqr)
            if added_seq:
                seq_region["added_sequence"] = added_seq

            if "coord_system_level" not in seq_region:
                seq_region["coord_system_level"] = coord_system.name

            seq_regions.append(seq_region)

    seq_regions = sorted(seq_regions, key=lambda seqr: (seqr["coord_system_level"], seqr["name"]))
    return seq_regions
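
A connection sketch: the database URL is a placeholder, a MySQL driver is assumed to be installed, and an empty external_db_map keeps synonym sources unmapped.

from sqlalchemy import create_engine
from sqlalchemy.orm import Session
from ensembl.io.genomio.seq_region.dump import get_seq_regions

engine = create_engine("mysql://reader@localhost:3306/species_core_110_1")  # placeholder URL
external_db_map = {}  # no renaming of external_db names
with Session(engine) as session:
    seq_regions = get_seq_regions(session, external_db_map)
print(f"{len(seq_regions)} top-level seq_regions")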

get_synonyms(seq_region, external_db_map)

Get all synonyms for a given seq_region. Use the mapping for synonym source names.

Parameters:
    seq_region (SeqRegion): Seq_region from which the synonyms are extracted. Required.
    external_db_map (dict[str, str]): To map the synonym source names. Required.

Returns:
    list[dict[str, str]]: List of all synonyms as a dict with 'name' and 'source' keys.

Source code in src/python/ensembl/io/genomio/seq_region/dump.py
def get_synonyms(seq_region: SeqRegion, external_db_map: dict[str, str]) -> list[dict[str, str]]:
    """Get all synonyms for a given seq_region. Use the mapping for synonym source names.

    Args:
        seq_region: Seq_region from which the synonyms are extracted.
        external_db_map: To map the synonym source names.

    Returns:
        List of all synonyms as a dict with 'name' and 'source' keys.
    """
    synonyms = seq_region.seq_region_synonym
    syns = []
    if synonyms:
        for syn in synonyms:
            if syn.external_db:
                source = syn.external_db.db_name
                if source in external_db_map:
                    source = external_db_map[source]
                syn_obj = {"name": syn.synonym, "source": source}
            else:
                syn_obj = {"name": syn.synonym}
            syns.append(syn_obj)

    syns = sorted(syns, key=lambda syn: (syn["name"], syn.get("source", "")))
    return syns

init_logging_with_args(args)

Processes the Namespace object provided to call init_logging() with the correct arguments.

Parameters:
    args (Namespace): Namespace populated by an argument parser. Required.
Source code in ensembl/utils/logging.py
def init_logging_with_args(args: argparse.Namespace) -> None:
    """Processes the Namespace object provided to call `init_logging()` with the correct arguments.

    Args:
        args: Namespace populated by an argument parser.

    """
    args_dict = vars(args)
    log_args = {x: args_dict[x] for x in ["log_level", "log_file", "log_file_level"] if x in args_dict}
    init_logging(**log_args)

main()

Module's entry-point.

Source code in src/python/ensembl/io/genomio/seq_region/prepare.py
def main() -> None:
    """Module's entry-point."""
    parser = ArgumentParser(description="Construct a sequence region metadata file from INSDC files.")
    parser.add_argument_src_path("--genome_file", required=True, help="Genome metadata JSON file")
    parser.add_argument_src_path(
        "--report_file", required=True, help="INSDC/RefSeq sequences report file to parse"
    )
    parser.add_argument_src_path("--gbff_file", help="INSDC/RefSeq GBFF file to parse")
    parser.add_argument_dst_path(
        "--dst_file", default="seq_region.json", help="Output JSON file for the processed sequence regions"
    )
    parser.add_argument(
        "--to_exclude", nargs="*", metavar="SEQ_REGION_NAME", help="Sequence region names to exclude"
    )
    parser.add_argument("--mock_run", action="store_true", help="Do not call external APIs")
    parser.add_argument("--version", action="version", version=ensembl.io.genomio.__version__)
    parser.add_log_arguments()
    args = parser.parse_args()
    init_logging_with_args(args)

    prepare_seq_region_metadata(
        genome_file=args.genome_file,
        report_file=args.report_file,
        dst_file=args.dst_file,
        gbff_file=args.gbff_file,
        to_exclude=args.to_exclude,
        mock_run=args.mock_run,
    )

prepare_seq_region_metadata(genome_file, report_file, dst_file, *, gbff_file=None, to_exclude=None, mock_run=False)

Prepares the sequence region metadata found in the INSDC/RefSeq report and GBFF files.

The sequence region information is loaded from both sources and combined. Elements are added/excluded as requested, and the final sequence region metadata is dumped in a JSON file that follows the schema defined in "src/python/ensembl/io/genomio/data/schemas/seq_region.json".

Parameters:
    genome_file (StrPath): Genome metadata JSON file path. Required.
    report_file (StrPath): INSDC/RefSeq sequences report file path to parse. Required.
    gbff_file (StrPath | None): INSDC/RefSeq GBFF file path to parse. Default: None
    dst_file (StrPath): Output JSON file path for the processed sequence regions. Required.
    to_exclude (list[str] | None): Sequence region names to exclude. Default: None
    mock_run (bool): Do not call the external taxonomy service. Default: False
Source code in src/python/ensembl/io/genomio/seq_region/prepare.py
def prepare_seq_region_metadata(
    genome_file: StrPath,
    report_file: StrPath,
    dst_file: StrPath,
    *,
    gbff_file: StrPath | None = None,
    to_exclude: list[str] | None = None,
    mock_run: bool = False,
) -> None:
    """Prepares the sequence region metadata found in the INSDC/RefSeq report and GBFF files.

    The sequence region information is loaded from both sources and combined. Elements are added/excluded
    as requested, and the final sequence region metadata is dumped in a JSON file that follows the schema
    defined in "src/python/ensembl/io/genomio/data/schemas/seq_region.json".

    Args:
        genome_file: Genome metadata JSON file path.
        report_file: INSDC/RefSeq sequences report file path to parse.
        gbff_file: INSDC/RefSeq GBFF file path to parse.
        dst_file: JSON file output for the processed sequence regions JSON.
        to_exclude: Sequence region names to exclude.
        mock_run: Do not call external taxonomy service.

    """
    genome_data = get_json(genome_file)
    dst_file = Path(dst_file)
    is_refseq = genome_data["assembly"]["accession"].startswith("GCF_")

    seqs = SeqCollection(mock=mock_run)
    seqs.from_report(Path(report_file), is_refseq)
    if gbff_file:
        seqs.from_gbff(Path(gbff_file))

    # Exclude seq_regions from a list
    if to_exclude:
        seqs.remove(to_exclude)

    # Add translation and mitochondrial codon tables
    seqs.add_translation_table()
    seqs.add_mitochondrial_codon_table(genome_data["species"]["taxonomy_id"])

    # Print out the file
    print_json(dst_file, seqs.to_list())
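
A programmatic sketch of the same pipeline driven by main() above: the file names are placeholders, the import path is assumed from the source location shown, and mock_run=True avoids the external taxonomy call.

from ensembl.io.genomio.seq_region.prepare import prepare_seq_region_metadata

prepare_seq_region_metadata(
    genome_file="genome.json",
    report_file="assembly_report.txt",
    dst_file="seq_region.json",
    gbff_file="genome.gbff.gz",
    to_exclude=None,
    mock_run=True,
)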

print_json(dst_path, data, **kwargs)

Generic data JSON dumper to a file, with keys sorted and pretty-printed with indent 4 by default.

Parameters:
    dst_path (StrPath): Path to the JSON file to create. Required.
    data (Any): Any data to store into the file. Required.
Source code in src/python/ensembl/io/genomio/utils/json_utils.py
def print_json(dst_path: StrPath, data: Any, **kwargs: Any) -> None:
    """Generic data JSON dumper to a file, with keys sorted and pretty-printed with indent 4 by default.

    Args:
        dst_path: Path to the JSON file to create.
        data: Any data to store into the file.

    """
    kwargs.setdefault("sort_keys", True)
    kwargs.setdefault("indent", 4)
    with Path(dst_path).open("w", encoding="utf-8") as json_file:
        json_file.write(json.dumps(data, **kwargs))
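
A quick round trip through print_json() and get_json() (documented above); the output file name is a placeholder and the import path is assumed from the source location shown.

from ensembl.io.genomio.utils.json_utils import get_json, print_json

data = [{"name": "chr1", "length": 1000}, {"name": "chr2", "length": 2000}]
print_json("seq_region_demo.json", data)   # sorted keys, 4-space indent by default
assert get_json("seq_region_demo.json") == data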