Skip to content

seqregion_parser

ensembl.brc4.runnable.seqregion_parser

SeqregionParser

Parser of a seq_region report from INSDC/RefSeq.

The main method of the Parser is get_report_regions, which returns a Dict of seq_regions, where the keys are the names.

Source code in src/python/ensembl/brc4/runnable/seqregion_parser.py
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
class SeqregionParser:
    """Parser of a seq_region report from INSDC/RefSeq.

    The main method of the Parser is get_report_regions, which returns a Dict of seq_regions,
    where the keys are the names.
    """

    synonym_map = {
        "Sequence-Name": "INSDC_submitted_name",
        "GenBank-Accn": "INSDC",
        "RefSeq-Accn": "RefSeq",
        "Assigned-Molecule": "GenBank",
    }

    molecule_location = {
        "chromosome": "nuclear_chromosome",
        "mitochondrion": "mitochondrial_chromosome",
        "apicoplast": "apicoplast_chromosome",
        "plasmid": "plasmid",
        "linkage group": "linkage_group",
        "kinetoplast": "kinetoplast",
    }

    def get_report_regions(
        self, report_path: Path, accession: str, use_refseq: bool = False
    ) -> Dict[str, dict]:
        """Get seq_region data from report file.

        Args:
            report_path: Path to the INSDC seq_region report.
            use_refseq: Expect a RefSeq seq_region report.
        Returns:
            A dict of seq_regions dicts, with their name as the key
        """
        if accession.startswith("GCF"):
            use_refseq = True
        # Get the report in a CSV format, easier to manipulate
        report_csv, metadata = self.report_to_csv(report_path)

        # Feed the csv string to the CSV reader
        reader = csv.DictReader(report_csv.splitlines(), delimiter="\t", quoting=csv.QUOTE_NONE)

        # Metadata
        assembly_level = "contig"
        if "Assembly level" in metadata:
            assembly_level = metadata["Assembly level"].lower()

        # Create the seq_regions
        seq_regions = {}
        for row in reader:
            seq_region = self.make_seq_region(row, assembly_level, use_refseq)
            name = seq_region["name"]
            seq_regions[name] = seq_region

        return seq_regions

    def report_to_csv(self, report_path: Path) -> Tuple[str, dict]:
        """Load an assembly report as a csv string.

        Args:
            report_path: Path to the INSDC seq_region report.
        Returns:
            The csv as a string, and the head metadata as a dict.
        """

        with open_gz_file(report_path) as report:
            data = ""
            metadata = {}
            last_head = ""
            for line in report:
                # Ignore header
                if line.startswith("#"):
                    # Get metadata values if possible
                    match = re.search("# (.+?): (.+?)$", line)
                    if match:
                        metadata[match.group(1)] = match.group(2)
                    last_head = line
                    continue
                else:
                    if last_head:
                        data += last_head[2:].strip() + "\n"
                        last_head = ""
                    data += line
            return data, metadata

    def make_seq_region(self, row: Dict[str, str], assembly_level: str, use_refseq: bool) -> Dict[str, Any]:
        """From a row of the report, create one seq_region dict.

        Args:
            row: A seq_region row from the INSDC report.
            assembly_level: what level is the seq_region (chromosome, contig, etc.)
            use_refseq: Expect a RefSeq seq_region report.

        Returns:
            A seq_region dict.
        """
        seq_region: Dict[str, Any] = {}

        # Map the fields to their synonym name
        synonym_map = self.synonym_map
        molecule_location = self.molecule_location

        # Synonyms
        synonyms = []
        for field, source in synonym_map.items():
            if field in row and row[field].lower() != "na":
                synonym = {"source": source, "name": row[field]}
                synonyms.append(synonym)
        if len(synonyms) > 0:
            synonyms.sort(key=lambda x: x["source"])
            seq_region["synonyms"] = synonyms

        # Length
        field = "Sequence-Length"
        if field in row and row[field].lower() != "na":
            seq_region["length"] = int(row[field])

        if use_refseq:
            if "RefSeq-Accn" in row:
                seq_region["name"] = row["RefSeq-Accn"]
            else:
                raise Exception(f"No RefSeq name for {row['Sequence-Name']}")

        else:
            if "GenBank-Accn" in row:
                seq_region["name"] = row["GenBank-Accn"]
            else:
                raise Exception(f"No INSDC name for {row['Sequence-Name']}")

        # Coord system and location
        seq_role = row["Sequence-Role"]

        # Scaffold?
        if seq_role in ("unplaced-scaffold", "unlocalized-scaffold", "alt-scaffold"):
            seq_region["coord_system_level"] = "scaffold"

        # Chromosome? Check location
        elif seq_role == "assembled-molecule":
            seq_region["coord_system_level"] = "chromosome"
            location = row["Assigned-Molecule-Location/Type"].lower()

            # Get location metadata
            if location in molecule_location:
                seq_location = molecule_location[location]
                seq_region["location"] = seq_location
            else:
                raise Exception(f"Unrecognized sequence location: {location}")
        else:
            raise Exception(f"Unrecognized sequence role: {seq_role}")
        return seq_region

molecule_location = {'chromosome': 'nuclear_chromosome', 'mitochondrion': 'mitochondrial_chromosome', 'apicoplast': 'apicoplast_chromosome', 'plasmid': 'plasmid', 'linkage group': 'linkage_group', 'kinetoplast': 'kinetoplast'} class-attribute instance-attribute

synonym_map = {'Sequence-Name': 'INSDC_submitted_name', 'GenBank-Accn': 'INSDC', 'RefSeq-Accn': 'RefSeq', 'Assigned-Molecule': 'GenBank'} class-attribute instance-attribute

get_report_regions(report_path, accession, use_refseq=False)

Get seq_region data from report file.

Parameters:

Name Type Description Default
report_path Path

Path to the INSDC seq_region report.

required
use_refseq bool

Expect a RefSeq seq_region report.

False

Returns: A dict of seq_regions dicts, with their name as the key

Source code in src/python/ensembl/brc4/runnable/seqregion_parser.py
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
def get_report_regions(
    self, report_path: Path, accession: str, use_refseq: bool = False
) -> Dict[str, dict]:
    """Get seq_region data from report file.

    Args:
        report_path: Path to the INSDC seq_region report.
        use_refseq: Expect a RefSeq seq_region report.
    Returns:
        A dict of seq_regions dicts, with their name as the key
    """
    if accession.startswith("GCF"):
        use_refseq = True
    # Get the report in a CSV format, easier to manipulate
    report_csv, metadata = self.report_to_csv(report_path)

    # Feed the csv string to the CSV reader
    reader = csv.DictReader(report_csv.splitlines(), delimiter="\t", quoting=csv.QUOTE_NONE)

    # Metadata
    assembly_level = "contig"
    if "Assembly level" in metadata:
        assembly_level = metadata["Assembly level"].lower()

    # Create the seq_regions
    seq_regions = {}
    for row in reader:
        seq_region = self.make_seq_region(row, assembly_level, use_refseq)
        name = seq_region["name"]
        seq_regions[name] = seq_region

    return seq_regions

make_seq_region(row, assembly_level, use_refseq)

From a row of the report, create one seq_region dict.

Parameters:

Name Type Description Default
row Dict[str, str]

A seq_region row from the INSDC report.

required
assembly_level str

what level is the seq_region (chromosome, contig, etc.)

required
use_refseq bool

Expect a RefSeq seq_region report.

required

Returns:

Type Description
Dict[str, Any]

A seq_region dict.

Source code in src/python/ensembl/brc4/runnable/seqregion_parser.py
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
def make_seq_region(self, row: Dict[str, str], assembly_level: str, use_refseq: bool) -> Dict[str, Any]:
    """From a row of the report, create one seq_region dict.

    Args:
        row: A seq_region row from the INSDC report.
        assembly_level: what level is the seq_region (chromosome, contig, etc.)
        use_refseq: Expect a RefSeq seq_region report.

    Returns:
        A seq_region dict.
    """
    seq_region: Dict[str, Any] = {}

    # Map the fields to their synonym name
    synonym_map = self.synonym_map
    molecule_location = self.molecule_location

    # Synonyms
    synonyms = []
    for field, source in synonym_map.items():
        if field in row and row[field].lower() != "na":
            synonym = {"source": source, "name": row[field]}
            synonyms.append(synonym)
    if len(synonyms) > 0:
        synonyms.sort(key=lambda x: x["source"])
        seq_region["synonyms"] = synonyms

    # Length
    field = "Sequence-Length"
    if field in row and row[field].lower() != "na":
        seq_region["length"] = int(row[field])

    if use_refseq:
        if "RefSeq-Accn" in row:
            seq_region["name"] = row["RefSeq-Accn"]
        else:
            raise Exception(f"No RefSeq name for {row['Sequence-Name']}")

    else:
        if "GenBank-Accn" in row:
            seq_region["name"] = row["GenBank-Accn"]
        else:
            raise Exception(f"No INSDC name for {row['Sequence-Name']}")

    # Coord system and location
    seq_role = row["Sequence-Role"]

    # Scaffold?
    if seq_role in ("unplaced-scaffold", "unlocalized-scaffold", "alt-scaffold"):
        seq_region["coord_system_level"] = "scaffold"

    # Chromosome? Check location
    elif seq_role == "assembled-molecule":
        seq_region["coord_system_level"] = "chromosome"
        location = row["Assigned-Molecule-Location/Type"].lower()

        # Get location metadata
        if location in molecule_location:
            seq_location = molecule_location[location]
            seq_region["location"] = seq_location
        else:
            raise Exception(f"Unrecognized sequence location: {location}")
    else:
        raise Exception(f"Unrecognized sequence role: {seq_role}")
    return seq_region

report_to_csv(report_path)

Load an assembly report as a csv string.

Parameters:

Name Type Description Default
report_path Path

Path to the INSDC seq_region report.

required

Returns: The csv as a string, and the head metadata as a dict.

Source code in src/python/ensembl/brc4/runnable/seqregion_parser.py
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
def report_to_csv(self, report_path: Path) -> Tuple[str, dict]:
    """Load an assembly report as a csv string.

    Args:
        report_path: Path to the INSDC seq_region report.
    Returns:
        The csv as a string, and the head metadata as a dict.
    """

    with open_gz_file(report_path) as report:
        data = ""
        metadata = {}
        last_head = ""
        for line in report:
            # Ignore header
            if line.startswith("#"):
                # Get metadata values if possible
                match = re.search("# (.+?): (.+?)$", line)
                if match:
                    metadata[match.group(1)] = match.group(2)
                last_head = line
                continue
            else:
                if last_head:
                    data += last_head[2:].strip() + "\n"
                    last_head = ""
                data += line
        return data, metadata