Skip to content

database

ensembl.io.genomio.database

Ensembl core database interface module.

CoreServer

Basic interface to a MySQL server with core databases.

Provides a filtered list of the core databases on the server.

Source code in src/python/ensembl/io/genomio/database/core_server.py
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
class CoreServer:
    """Basic interface to a MySQL server with core databases.

    Provides a filtered list of the core databases hosted on the server.

    Attributes:
        engine: SQLAlchemy engine created from the server URL.
    """

    def __init__(self, server_url: URL) -> None:
        """Creates the engine used to query the server.

        Args:
            server_url: URL of the server to query.
        """
        logging.debug(f"Connect to {server_url}")
        self.engine = sqlalchemy.create_engine(server_url)

    def get_all_core_names(self) -> List[str]:
        """Query the server and retrieve all database names that look like Ensembl cores."""

        with self.engine.connect() as connection:
            all_query = connection.execute(text(r"SHOW DATABASES LIKE '%%_core_%%'"))
            dbs = [row[0] for row in all_query.fetchall()]
        logging.info(f"{len(dbs)} core databases on the server")
        return dbs

    def get_cores(
        self,
        *,
        prefix: str = "",
        build: Optional[int] = None,
        version: Optional[int] = None,
        dbname_re: str = "",
        db_list: Optional[List[str]] = None,
    ) -> List[str]:
        """Returns a list of core databases, filtered if requested.

        All the filters provided are applied one after the other, so a database has to pass every
        one of them to be returned. An empty `prefix`, `dbname_re` or `db_list` disables that filter.

        Args:
            prefix: Filter by prefix (no "_" is added automatically).
            build: Filter by VEuPathDB build number, i.e. names ending in `_core_<build>_<N>_<N>`.
            version: Filter by Ensembl version, i.e. names ending in `_core_<N>_<version>_<N>`.
            dbname_re: Filter by dbname regular expression (searched anywhere in the name).
            db_list: Explicit list of database names.

        Returns:
            The names of the core databases that passed all the filters, in the order returned by
            the server.
        """
        dbs = self.get_all_core_names()

        # Check if there are databases returned from query to host
        if not dbs:
            logging.warning("No databases returned from query")

        if db_list:
            logging.debug(f"Filter with db list: {db_list}")
            # Build the lookup set once instead of scanning the list for every database
            allowed_dbs = set(db_list)
            dbs = [db for db in dbs if db in allowed_dbs]
        if prefix:
            dbs = [db for db in dbs if db.startswith(prefix)]
        if dbname_re:
            dbname_m = re.compile(dbname_re)
            dbs = [db for db in dbs if dbname_m.search(db)]
        if build is not None:
            build_m = re.compile(rf"_core_{build}_\d+_\d+$")
            dbs = [db for db in dbs if build_m.search(db)]
        if version is not None:
            version_m = re.compile(rf"_core_\d+_{version}_\d+$")
            dbs = [db for db in dbs if version_m.search(db)]

        logging.info(f"{len(dbs)} core databases remain after filtering")

        return dbs

engine = sqlalchemy.create_engine(server_url) instance-attribute

get_all_core_names()

Query the server and retrieve all database names that look like Ensembl cores.

Source code in src/python/ensembl/io/genomio/database/core_server.py
38
39
40
41
42
43
44
45
def get_all_core_names(self) -> List[str]:
    """Lists the databases on the server whose name matches the Ensembl core pattern.

    Returns:
        The first column of every row returned by the ``SHOW DATABASES`` query.
    """
    statement = text(r"SHOW DATABASES LIKE '%%_core_%%'")
    with self.engine.connect() as connection:
        rows = connection.execute(statement).fetchall()
    core_names = [row[0] for row in rows]
    logging.info(f"{len(core_names)} core databases on the server")
    return core_names

get_cores(*, prefix='', build=None, version=None, dbname_re='', db_list=None)

Returns a list of core databases, filtered if requested.

Parameters:

Name Type Description Default
prefix str

Filter by prefix (no "_" is added automatically).

''
build Optional[int]

Filter by VEuPathDB build number.

None
version Optional[int]

Filter by Ensembl version.

None
dbname_re str

Filter by dbname regular expression.

''
db_list Optional[List[str]]

Explicit list of database names.

None
Source code in src/python/ensembl/io/genomio/database/core_server.py
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
def get_cores(
    self,
    *,
    prefix: str = "",
    build: Optional[int] = None,
    version: Optional[int] = None,
    dbname_re: str = "",
    db_list: Optional[List[str]] = None,
) -> List[str]:
    """Returns a list of core databases, filtered if requested.

    All the filters provided are applied one after the other, so a database has to pass every
    one of them to be returned. An empty `prefix`, `dbname_re` or `db_list` disables that filter.

    Args:
        prefix: Filter by prefix (no "_" is added automatically).
        build: Filter by VEuPathDB build number, i.e. names ending in `_core_<build>_<N>_<N>`.
        version: Filter by Ensembl version, i.e. names ending in `_core_<N>_<version>_<N>`.
        dbname_re: Filter by dbname regular expression (searched anywhere in the name).
        db_list: Explicit list of database names.

    Returns:
        The names of the core databases that passed all the filters, in the order returned by
        the server.
    """
    dbs = self.get_all_core_names()

    # Check if there are databases returned from query to host
    if not dbs:
        logging.warning("No databases returned from query")

    if db_list:
        logging.debug(f"Filter with db list: {db_list}")
        # Build the lookup set once instead of scanning the list for every database
        allowed_dbs = set(db_list)
        dbs = [db for db in dbs if db in allowed_dbs]
    if prefix:
        dbs = [db for db in dbs if db.startswith(prefix)]
    if dbname_re:
        dbname_m = re.compile(dbname_re)
        dbs = [db for db in dbs if dbname_m.search(db)]
    if build is not None:
        build_m = re.compile(rf"_core_{build}_\d+_\d+$")
        dbs = [db for db in dbs if build_m.search(db)]
    if version is not None:
        version_m = re.compile(rf"_core_\d+_{version}_\d+$")
        dbs = [db for db in dbs if version_m.search(db)]

    logging.info(f"{len(dbs)} core databases remain after filtering")

    return dbs

DBConnectionLite

Bases: DBConnection

Extension to get metadata directly from a database, assuming it has a metadata table.

Source code in src/python/ensembl/io/genomio/database/dbconnection_lite.py
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
class DBConnectionLite(DBConnection):
    """Database connection that can also read the contents of the database's `meta` table."""

    def __init__(self, url: StrURL, reflect: bool = False, **kwargs: Any) -> None:
        super().__init__(url, reflect, **kwargs)
        # Maps each meta_key to the list of its meta_value entries; filled by _load_metadata()
        self._metadata: Dict[str, List] = {}

    def get_metadata(self) -> Dict[str, List]:
        """Retrieves all metadata from the `meta` table in the database.

        Returns:
            A dict with each meta_key as key and the list of its meta_value entries as value.

        """
        self._load_metadata()
        return self._metadata

    def _load_metadata(self) -> None:
        """Fills the metadata cache from the `meta` table, unless it is already populated."""

        if self._metadata:
            return

        with Session(self._engine) as session:
            meta_rows = session.scalars(select(Meta)).unique().all()
            for row in meta_rows:
                # Group every value found under its key, creating the list on first sight
                self._metadata.setdefault(row.meta_key, []).append(row.meta_value)

    def get_meta_value(self, meta_key: str) -> Optional[str]:
        """Returns the first meta_value for a given meta_key, or None if the key is not present."""

        self._load_metadata()
        values = self._metadata.get(meta_key)
        if values is None:
            logging.debug(f"No meta_key {meta_key}")
            return None
        return values[0]

    def get_project_release(self) -> str:
        """Returns the project release number from the database name. Returns empty string if not found."""

        found = re.search(_DB_PATTERN_RELEASE, self.db_name)
        return found.group(1) if found else ""

get_meta_value(meta_key)

Returns the first meta_value for a given meta_key.

Source code in src/python/ensembl/io/genomio/database/dbconnection_lite.py
66
67
68
69
70
71
72
73
74
def get_meta_value(self, meta_key: str) -> Optional[str]:
    """Looks up a meta_key in the loaded metadata and returns its first meta_value.

    Args:
        meta_key: Key to look for.

    Returns:
        The first value stored for the key, or None (with a debug message) if the key is absent.
    """
    self._load_metadata()
    values = self._metadata.get(meta_key)
    if values is None:
        logging.debug(f"No meta_key {meta_key}")
        return None
    return values[0]

get_metadata()

Retrieves all metadata from the meta table in the database.

Returns:

Type Description
Dict[str, List]

A dict with each meta_key as key and the list of its meta_value entries as value.

Source code in src/python/ensembl/io/genomio/database/dbconnection_lite.py
39
40
41
42
43
44
45
46
47
def get_metadata(self) -> Dict[str, List]:
    """Retrieves all metadata from the `meta` table in the database.

    The metadata is loaded through `_load_metadata()` before being returned. The dict returned is
    the object stored in `self._metadata` itself, not a copy.

    Returns:
        A dict with each meta_key as key and the list of its meta_value entries as value.

    """
    self._load_metadata()
    return self._metadata

get_project_release()

Returns the project release number from the database name. Returns empty string if not found.

Source code in src/python/ensembl/io/genomio/database/dbconnection_lite.py
76
77
78
79
80
81
82
def get_project_release(self) -> str:
    """Extracts the project release number from the database name.

    Returns:
        The first group captured by the release pattern in the database name, or an empty string
        if the name does not match.
    """
    found = re.search(_DB_PATTERN_RELEASE, self.db_name)
    return found.group(1) if found else ""

format_db_data(server_url, dbs, brc_mode=False)

Returns a metadata list from the given databases on a server.

Parameters:

Name Type Description Default
server_url URL

Server URL where all the databases are hosted.

required
dbs list[str]

List of database names.

required
brc_mode bool

If true, assign BRC4.organism_abbrev as the species, and BRC4.component as the division. Otherwise, the species will be species.production_name and the division will be species.division.

False

Returns:

Type Description
list[dict]

List of dictionaries, one per database, with the keys "server", "production_name", "species", "division", "accession" and "release".

Source code in src/python/ensembl/io/genomio/database/factory.py
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
def format_db_data(server_url: URL, dbs: list[str], brc_mode: bool = False) -> list[dict]:
    """Returns a metadata list from the given databases on a server.

    Args:
        server_url: Server URL where all the databases are hosted.
        dbs: List of database names.
        brc_mode: If true, assign ``BRC4.organism_abbrev`` as the species, and ``BRC4.component`` as the
            division (when those meta keys are present). Otherwise, the species will be
            ``species.production_name`` and the division will be ``species.division``.

    Returns:
        List of dictionaries, one per database and in the same order as `dbs`, with the keys:
        "server" (a dictionary with the keys "host", "user", "port", "password" and "database"),
        "production_name", "species", "division", "accession" and "release".
    """
    databases_data = []
    for db_name in dbs:
        logging.debug(f"Get metadata for {db_name}")
        db_url = server_url.set(database=db_name)
        core_db = DBConnectionLite(db_url)

        prod_name = core_db.get_meta_value("species.production_name")
        species = prod_name
        division = core_db.get_meta_value("species.division")
        accession = core_db.get_meta_value("assembly.accession")
        project_release = core_db.get_project_release()

        if brc_mode:
            # The BRC4 values only replace the default ones when they are present in the database
            brc_organism = core_db.get_meta_value("BRC4.organism_abbrev")
            brc_component = core_db.get_meta_value("BRC4.component")
            if brc_organism is not None:
                species = brc_organism
            if brc_component is not None:
                division = brc_component

        # Use a default division when none is defined (or it is empty)
        if not division:
            division = "all"

        server_data = {
            "host": db_url.host,
            "user": db_url.username,
            "port": db_url.port,
            "password": db_url.password,
            "database": db_url.database,
        }
        db_data = {
            "server": server_data,
            "production_name": prod_name,
            "species": species,
            "division": division,
            "accession": accession,
            "release": project_release,
        }

        databases_data.append(db_data)
    return databases_data

get_core_dbs_metadata(server_url, *, prefix='', build=None, version=None, db_regex='', db_list=None, brc_mode=False)

Returns all the metadata fetched for the selected core databases.

Parameters:

Name Type Description Default
server_url URL

Server URL where the core databases are stored.

required
prefix str

Filter by prefix (no "_" is added automatically).

''
build int | None

Filter by VEuPathDB build number.

None
version int | None

Filter by Ensembl version.

None
db_regex str

Filter by dbname regular expression.

''
db_list Path | None

Path to a file with an explicit list of database names, one per line.

None
brc_mode bool

Enable BRC mode.

False

Returns:

Type Description
list[dict]

List of dictionaries, one per selected database, with the keys "server", "production_name", "species", "division", "accession" and "release".

Source code in src/python/ensembl/io/genomio/database/factory.py
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
def get_core_dbs_metadata(
    server_url: URL,
    *,
    prefix: str = "",
    build: int | None = None,
    version: int | None = None,
    db_regex: str = "",
    db_list: Path | None = None,
    brc_mode: bool = False,
) -> list[dict]:
    """Returns all the metadata fetched for the selected core databases.

    Args:
        server_url: Server URL where the core databases are stored.
        prefix: Filter by prefix (no "_" is added automatically).
        build: Filter by VEuPathDB build number.
        version: Filter by Ensembl version.
        db_regex: Filter by dbname regular expression.
        db_list: Path to a file with an explicit list of database names, one per line.
        brc_mode: Enable BRC mode.

    Returns:
        List of dictionaries, one per selected database, as built by `format_db_data()`: each has
        the keys "server", "production_name", "species", "division", "accession" and "release".
    """
    db_list_file = None
    if db_list:
        # Each line of the file is one database name (surrounding whitespace is removed)
        with db_list.open("r") as infile_fh:
            db_list_file = [line.strip() for line in infile_fh]
    # Get all database names
    server = CoreServer(server_url)
    logging.debug("Fetching databases...")
    databases = server.get_cores(
        prefix=prefix, build=build, version=version, dbname_re=db_regex, db_list=db_list_file
    )
    logging.info(f"Got {len(databases)} databases")
    logging.debug("\n".join(databases))
    return format_db_data(server_url, databases, brc_mode)

get_meta_values(db_url, meta_keys)

Returns a dict of meta values for a set of one or more input DB meta_keys.

Parameters:

Name Type Description Default
db_url URL

Target core database URL.

required
meta_keys StrPath | list[str]

File path with one meta key per line or list of meta keys.

required
Source code in src/python/ensembl/io/genomio/database/meta_getter.py
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
def get_meta_values(db_url: URL, meta_keys: StrPath | list[str]) -> dict[str, str]:
    """Returns a dict of meta values based on a set of 1 or more input DB meta_keys.

    When at least one meta key is found, the resulting dict is also printed to standard output as
    JSON.

    Args:
        db_url: Target core database URL.
        meta_keys: File path with one meta key per line or list of meta keys.

    Returns:
        A dict with the first meta value of every meta key found, plus a "database_name" key with
        the name of the database. An empty dict if none of the meta keys was found.

    """
    core_db = DBConnectionLite(db_url)

    # Check input type and populate the query list
    query_meta_keys: list[str] = []
    if isinstance(meta_keys, list):
        query_meta_keys = meta_keys
    elif isinstance(meta_keys, (str, Path)):
        # Accept any kind of path (string or Path of any flavour), not just PosixPath
        with Path(meta_keys).open(mode="r", encoding="UTF-8") as fh:
            stripped_lines = (line.strip() for line in fh)
            # Blank lines do not hold a meta key
            query_meta_keys = [meta_key for meta_key in stripped_lines if meta_key]

    # Loop over input meta_key(s) and query DB
    meta_values_located: dict[str, str] = {}
    unpopulated_meta_keys: list[str] = []
    for meta_key in query_meta_keys:
        meta_value = core_db.get_meta_value(meta_key)
        if meta_value is not None:
            meta_values_located[meta_key] = meta_value
        else:
            unpopulated_meta_keys.append(meta_key)
            logging.info(f"Meta query returned no entry on meta_key: '{meta_key}'")

    # Now assess what meta info was recovered and dump to JSON
    if not meta_values_located:
        logging.warning("Zero input query meta_keys present/populated.")
        return {}
    if unpopulated_meta_keys:
        logging.info(f"Missing meta_key(s)-> {unpopulated_meta_keys}")

    meta_values_located["database_name"] = f"{db_url.database}"
    print(json.dumps(meta_values_located, sort_keys=True, indent=2))
    return meta_values_located