Skip to content

process

ensembl.io.genomio.fasta.process

Takes a FASTA file (DNA or peptide), cleans it up and optionally excludes some IDs.

exclude_seq_regions = [] module-attribute

FastaParserError

Bases: Exception

Error while parsing a FASTA file.

Source code in src/python/ensembl/io/genomio/fasta/process.py
35
36
class FastaParserError(Exception):
    """Signals a problem encountered while parsing a FASTA file."""

get_peptides_to_exclude(genbank_path, seqr_to_exclude)

Extract the peptide IDs from a GenBank file that belong to a given set of seq regions.

Source code in src/python/ensembl/io/genomio/fasta/process.py
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
def get_peptides_to_exclude(genbank_path: PathLike, seqr_to_exclude: Set[str]) -> Set[str]:
    """Extract the peptide IDs of the CDS features found on the seq regions to exclude.

    Args:
        genbank_path: Input GenBank file (opened with `open_gz_file`).
        seqr_to_exclude: IDs of the GenBank records (seq regions) whose peptides must be excluded.

    Returns:
        The first `protein_id` qualifier value of every CDS feature in the excluded records.

    Raises:
        FastaParserError: If a CDS feature of an excluded record has no `protein_id` qualifier.
    """
    peptides_to_exclude: Set[str] = set()
    with open_gz_file(genbank_path) as in_genbank:
        for record in SeqIO.parse(in_genbank, "genbank"):
            # Only records listed for exclusion contribute peptide IDs
            if record.id not in seqr_to_exclude:
                continue
            logging.info(f"Skip sequence {record.id}")
            for feat in record.features:
                if feat.type != "CDS":
                    continue
                if "protein_id" not in feat.qualifiers:
                    # No stray "$" in the message: f-strings interpolate with plain braces
                    raise FastaParserError(f"Peptide without peptide ID {feat}")
                # Qualifier values are indexed as a sequence; the first entry is the peptide ID
                peptides_to_exclude.add(feat.qualifiers["protein_id"][0])
    return peptides_to_exclude

main()

Module's entry-point.

Source code in src/python/ensembl/io/genomio/fasta/process.py
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
def main() -> None:
    """Command-line entry point: parse the arguments, set up logging and run `prep_fasta_data`."""
    arg_parser = ArgumentParser(description="Clean-up a given FASTA file to remove unwanted elements.")

    # Input and output files
    arg_parser.add_argument_src_path(
        "--fasta_infile", required=True, help="Input FASTA file - DNA / Protein"
    )
    arg_parser.add_argument_src_path("--genbank_infile", help="Input GenBank GBFF file")
    arg_parser.add_argument_dst_path("--fasta_outfile", required=True, help="Output FASTA file")

    # Processing mode, version and logging options
    arg_parser.add_argument(
        "--peptide_mode", action="store_true", help="Process proteins instead of DNA"
    )
    arg_parser.add_argument("--version", action="version", version=ensembl.io.genomio.__version__)
    arg_parser.add_log_arguments(add_log_file=True)

    cli_args = arg_parser.parse_args()
    init_logging_with_args(cli_args)

    prep_fasta_data(
        fasta_infile=cli_args.fasta_infile,
        genbank_infile=cli_args.genbank_infile,
        fasta_outfile=cli_args.fasta_outfile,
        peptide_mode=cli_args.peptide_mode,
    )

prep_fasta_data(fasta_infile, genbank_infile, fasta_outfile, peptide_mode=False)

Parameters:

Name Type Description Default
fasta_infile PathLike

Input FASTA file - DNA / Protein

required
genbank_infile Optional[PathLike]

Input GenBank GBFF file (Optional)

required
fasta_outfile PathLike

Output FASTA sequence file.

required
peptide_mode bool

Process proteins instead of DNA

False
Source code in src/python/ensembl/io/genomio/fasta/process.py
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
def prep_fasta_data(
    fasta_infile: PathLike,
    genbank_infile: Optional[PathLike],
    fasta_outfile: PathLike,
    peptide_mode: bool = False,
) -> None:
    """Copy a FASTA file to a new file, leaving out the records whose ID must be excluded.

    In DNA mode the excluded IDs are the seq regions listed in the module-level
    `exclude_seq_regions`. In peptide mode they are the peptide IDs returned by
    `get_peptides_to_exclude` for those seq regions; without a GenBank file nothing is excluded.

    Args:
        fasta_infile: Input FASTA file - DNA / Protein
        genbank_infile: Input GenBank GBFF file (Optional)
        fasta_outfile: Output FASTA sequence file.
        peptide_mode: Process proteins instead of DNA
    """
    file_path = Path(fasta_infile)

    seqr_to_exclude = set(exclude_seq_regions)
    to_exclude = set()
    if not peptide_mode:
        to_exclude = seqr_to_exclude
    elif genbank_infile is not None:
        # Peptide IDs can only be mapped to the excluded seq regions through the GenBank file
        to_exclude = get_peptides_to_exclude(Path(genbank_infile), seqr_to_exclude)

    # Read every kept record before the output is opened, so the input is fully consumed first
    records = []
    with open_gz_file(file_path) as in_fasta:
        for record in SeqIO.parse(in_fasta, "fasta"):
            if record.id in to_exclude:
                logging.info(f"Skip record {record.id}")
            else:
                records.append(record)

    with Path(fasta_outfile).open("w") as out_fasta:
        SeqIO.write(records, out_fasta, "fasta")