Skip to content

dump

ensembl.io.genomio.genome_stats.dump

Generates a JSON representation of the genome stats (assembly and annotation) from a core database.

StatsGenerator dataclass

Interface to extract genome stats from a core database.

Source code in src/python/ensembl/io/genomio/genome_stats/dump.py
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
@dataclass
class StatsGenerator:
    """Interface to extract genome stats from a core database.

    Attributes:
        session: Session on which every statistics query is executed.

    """

    session: Session

    def get_assembly_stats(self) -> Dict[str, Any]:
        """Returns a dict of stats about the assembly.

        The dict has three keys (`coord_system`, `locations` and `codon_table`), each holding the
        counts returned by `get_attrib_counts()` for the corresponding attrib_type code.

        """
        stats = {
            "coord_system": self.get_attrib_counts("coord_system_tag"),
            "locations": self.get_attrib_counts("sequence_location"),
            "codon_table": self.get_attrib_counts("codon_table"),
        }
        # Special: rename supercontigs to scaffolds for homogeneity
        StatsGenerator._fix_scaffolds(stats)
        return stats

    @staticmethod
    def _fix_scaffolds(stats: Dict[str, Any]) -> None:
        """Renames supercontigs to scaffolds in the provided stats (modified in place).

        If scaffolds are present already, nothing is done.

        Args:
            stats: Statistics dictionary.

        """
        coords = stats.get("coord_system", {})
        if "supercontig" in coords and "scaffold" not in coords:
            # Move the count under the new key and drop the old key in a single step
            coords["scaffold"] = coords.pop("supercontig")

    def get_attrib_counts(self, code: str) -> Dict[str, Any]:
        """Returns a dict of count for each value counted with the attrib_type code provided.

        Args:
            code: Ensembl database attrib_type code.

        """
        seqs_st = (
            select(SeqRegionAttrib.value, func.count())  # pylint: disable=not-callable
            .join(AttribType)
            .where(AttribType.code == code)
            .group_by(SeqRegionAttrib.value)
        )
        # Each row is a (value, count) pair
        return {attribute_name: count for (attribute_name, count) in self.session.execute(seqs_st)}

    def get_annotation_stats(self) -> Dict[str, Any]:
        """Returns a dict of stats about the annotation (genes and transcripts)."""
        stats = {
            "genes": self.get_feature_stats(Gene),
            "transcripts": self.get_feature_stats(Transcript),
        }
        return stats

    def get_biotypes(self, table: Any) -> Dict[str, int]:
        """Returns a dict with the number of features for each biotype.

        Args:
            table: Feature table to query; it must have a `biotype` column.

        """
        # pylint: disable-next=not-callable
        seqs_st = select(table.biotype, func.count()).group_by(table.biotype)
        # Each row is a (biotype, count) pair
        return {biotype: count for (biotype, count) in self.session.execute(seqs_st)}

    def get_feature_stats(self, table: Any) -> Dict[str, Any]:
        """Returns a dict of stats about a given feature.

        The dict contains the total number of features (`total`), the count per biotype (`biotypes`)
        and a `description` breakdown: `empty` (description is NULL), `source_xref` (description
        contains "[Source:") and `normal` (every other feature).

        Args:
            table: Feature table to query (e.g. `Gene` or `Transcript`); it must have `biotype` and
                `description` columns.

        """
        session = self.session
        totals_st = select(func.count()).select_from(table)  # pylint: disable=not-callable
        (total,) = session.execute(totals_st).one()
        # pylint: disable-next=not-callable
        no_desc_st = select(func.count()).where(table.description.is_(None))
        (no_desc,) = session.execute(no_desc_st).one()
        # pylint: disable-next=not-callable
        xref_desc_st = select(func.count()).where(table.description.like("%[Source:%"))
        (xref_desc,) = session.execute(xref_desc_st).one()
        # Whatever is neither empty nor xref-derived is considered a normal description
        left_over = total - no_desc - xref_desc
        feat_stats = {
            "total": total,
            "biotypes": self.get_biotypes(table),
            "description": {
                "empty": no_desc,
                "source_xref": xref_desc,
                "normal": left_over,
            },
        }
        return feat_stats

    def get_genome_stats(self) -> Dict[str, Any]:
        """Returns a dict of stats about the assembly and annotation."""
        genome_stats = {
            "assembly_stats": self.get_assembly_stats(),
            "annotation_stats": self.get_annotation_stats(),
        }
        return genome_stats

session instance-attribute

get_annotation_stats()

Returns a dict of stats about the annotation (genes and transcripts: totals, biotypes, descriptions).

Source code in src/python/ensembl/io/genomio/genome_stats/dump.py
85
86
87
88
89
90
91
def get_annotation_stats(self) -> Dict[str, Any]:
    """Returns a dict of stats about the annotation (genes and transcripts).

    The `genes` and `transcripts` entries each hold the result of `get_feature_stats()` for the
    corresponding feature.

    """
    gene_stats = self.get_feature_stats(Gene)
    transcript_stats = self.get_feature_stats(Transcript)
    return {"genes": gene_stats, "transcripts": transcript_stats}

get_assembly_stats()

Returns a dict of stats about the assembly.

Source code in src/python/ensembl/io/genomio/genome_stats/dump.py
40
41
42
43
44
45
46
47
48
49
def get_assembly_stats(self) -> Dict[str, Any]:
    """Returns a dict of stats about the assembly.

    Each key of the result holds the counts returned by `get_attrib_counts()` for one attrib_type
    code.

    """
    # Output key -> attrib_type code to count (queried in this order)
    attrib_codes = {
        "coord_system": "coord_system_tag",
        "locations": "sequence_location",
        "codon_table": "codon_table",
    }
    assembly = {key: self.get_attrib_counts(code) for key, code in attrib_codes.items()}
    # Special: supercontigs are reported as scaffolds for homogeneity
    StatsGenerator._fix_scaffolds(assembly)
    return assembly

get_attrib_counts(code)

Returns a dict of count for each value counted with the attrib_type code provided.

Parameters:

| Name | Type | Description | Default |
| ---- | ---- | ----------- | ------- |
| code | str | Ensembl database attrib_type code. | required |

Source code in src/python/ensembl/io/genomio/genome_stats/dump.py
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
def get_attrib_counts(self, code: str) -> Dict[str, Any]:
    """Counts the seq_region attributes of a given type, grouped by attribute value.

    Args:
        code: Ensembl database attrib_type code.

    Returns:
        A dict mapping each attribute value to the number of times it appears.

    """
    counts_st = (
        select(SeqRegionAttrib.value, func.count())  # pylint: disable=not-callable
        .join(AttribType)
        .filter(AttribType.code == code)
        .group_by(SeqRegionAttrib.value)
    )
    # Each row is a (value, count) pair
    return {value: number for (value, number) in self.session.execute(counts_st)}

get_biotypes(table)

Returns a dict of stats about the feature biotypes.

Source code in src/python/ensembl/io/genomio/genome_stats/dump.py
 93
 94
 95
 96
 97
 98
 99
100
101
def get_biotypes(self, table: Any) -> Dict[str, int]:
    """Returns a dict with the number of features for each biotype.

    Args:
        table: Feature table to query; it must have a `biotype` column.

    """
    # pylint: disable-next=not-callable
    biotypes_st = select(table.biotype, func.count()).group_by(table.biotype)
    # Each row is a (biotype, count) pair
    return {name: number for (name, number) in self.session.execute(biotypes_st)}

get_feature_stats(table)

Returns a dict of stats about a given feature.

Source code in src/python/ensembl/io/genomio/genome_stats/dump.py
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
def get_feature_stats(self, table: Any) -> Dict[str, Any]:
    """Returns a dict of stats about a given feature.

    The dict contains the total number of features (`total`), the count per biotype (`biotypes`)
    and a `description` breakdown: `empty` (description is NULL), `source_xref` (description
    contains "[Source:") and `normal` (every other feature).

    Args:
        table: Feature table to query; it must have `biotype` and `description` columns.

    """
    session = self.session
    totals_st = select(func.count()).select_from(table)  # pylint: disable=not-callable
    (total,) = session.execute(totals_st).one()
    # pylint: disable-next=not-callable
    no_desc_st = select(func.count()).where(table.description.is_(None))
    (no_desc,) = session.execute(no_desc_st).one()
    # pylint: disable-next=not-callable
    xref_desc_st = select(func.count()).where(table.description.like("%[Source:%"))
    (xref_desc,) = session.execute(xref_desc_st).one()
    # Whatever is neither empty nor xref-derived is considered a normal description
    left_over = total - no_desc - xref_desc
    feat_stats = {
        "total": total,
        "biotypes": self.get_biotypes(table),
        "description": {
            "empty": no_desc,
            "source_xref": xref_desc,
            "normal": left_over,
        },
    }
    return feat_stats

get_genome_stats()

Returns a dict of stats about the assembly and annotation.

Source code in src/python/ensembl/io/genomio/genome_stats/dump.py
126
127
128
129
130
131
132
def get_genome_stats(self) -> Dict[str, Any]:
    """Collects the assembly stats and the annotation stats in a single dict.

    Returns:
        A dict with the keys `assembly_stats` and `annotation_stats`.

    """
    assembly = self.get_assembly_stats()
    annotation = self.get_annotation_stats()
    return {"assembly_stats": assembly, "annotation_stats": annotation}

dump_genome_stats(url)

Returns JSON object containing the genome stats (assembly and annotation) of the given core database.

Parameters:

| Name | Type | Description | Default |
| ---- | ---- | ----------- | ------- |
| url | StrURL | Core database URL. | required |

Source code in src/python/ensembl/io/genomio/genome_stats/dump.py
135
136
137
138
139
140
141
142
143
144
145
146
def dump_genome_stats(url: StrURL) -> Dict[str, Any]:
    """Returns the genome stats (assembly and annotation) of the given core database as a dict.

    Args:
        url: Core database URL.

    """
    connection = DBConnectionLite(url)
    # The stats are computed while the session scope is still open
    with connection.session_scope() as session:
        return StatsGenerator(session).get_genome_stats()

main()

Main script entry-point.

Source code in src/python/ensembl/io/genomio/genome_stats/dump.py
149
150
151
152
153
154
155
156
157
158
159
def main() -> None:
    """Main script entry-point.

    Parses the command line arguments, initialises the logging and prints the genome stats of the
    requested core database as indented JSON with sorted keys.

    """
    arg_parser = ArgumentParser(description=__doc__)
    arg_parser.add_server_arguments(include_database=True)
    arg_parser.add_argument("--version", action="version", version=ensembl.io.genomio.__version__)
    arg_parser.add_log_arguments(add_log_file=True)
    cli_args = arg_parser.parse_args()
    init_logging_with_args(cli_args)

    print(json.dumps(dump_genome_stats(cli_args.url), indent=2, sort_keys=True))