Source code for ensembl.tools.anno.transcriptomic_annotation.minimap

# See the NOTICE file distributed with this work for additional information #pylint: disable=missing-module-docstring
# regarding copyright ownership.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Minimap2 is a pairwise sequence alignment algorithm designed for efficiently comparing nucleotide sequences.
The algorithm uses a versatile indexing strategy to quickly find approximate matches between sequences, 
allowing it to efficiently align long sequences against reference genomes or other sequences.

Li, H. (2018). Minimap2: pairwise alignment for nucleotide sequences. Bioinformatics, 34(18), 3094-3100.
"""

__all__ = ["run_minimap2"]

import argparse
import logging
import logging.config
from pathlib import Path
import subprocess
from typing import List

from ensembl.tools.anno.utils._utils import (
    check_exe,
    create_dir,
    check_gtf_content,
)


[docs]def run_minimap2( output_dir: Path, long_read_fastq_dir: Path, genome_file: Path, minimap2_bin: Path = Path("minimap2"), paftools_bin: Path = Path("paftools.js"), max_intron_length: int = 100000, num_threads: int = 1, ) -> None: """ Run Minimap2 to align long read data against genome file. Default Minimap set for PacBio data. :param output_dir: Working directory path. :type output_dir: Path :param long_read_fastq_dir: Long read directory path. :type long_read_fastq_dir: Path :param genome_file: Genome file path. :type genome_file: Path :param minimap2_bin: Software path. :type minimap2_bin: Path, default minimap2 :param paftools_bin: Js path. :type paftools_bin: Path, default paftools.js :param max_intron_length: The maximum intron size for alignments. Defaults to 100000. :type max_intron_length: int, default 100000 :param num_threads: Number of available threads. :type num_threads: int, default 1 :return: None :rtype: None """ check_exe(minimap2_bin) check_exe(paftools_bin) minimap2_dir = create_dir(output_dir, "minimap2_output") logging.info("Skip analysis if the gtf file already exists") output_file = minimap2_dir / "annotation.gtf" if output_file.exists(): transcript_count = check_gtf_content(output_file, "transcript") if transcript_count > 0: logging.info("Minimap2 gtf file exists, skipping analysis") return minimap2_index_file = minimap2_dir / f"{Path(genome_file).name}.mmi" # minimap2_hints_file = minimap2_dir /"minimap2_hints.gff" file_types = ("*.fastq", "*.fq") fastq_file_list = [ path for file_type in file_types for path in Path(long_read_fastq_dir).rglob(file_type) ] if len(fastq_file_list) == 0: raise IndexError(f"The list of fastq files is empty. Fastq dir:\n{long_read_fastq_dir}") if not minimap2_index_file.exists(): logging.info("Did not find an index file for minimap2. Will create now") try: subprocess.run( # pylint:disable=subprocess-run-check [ minimap2_bin, "-t", str(num_threads), "-d", str(minimap2_index_file), genome_file, ] ) except subprocess.CalledProcessError as e: logging.error("An error occurred while creating minimap2 index: %s", e) except OSError as e: logging.error("An OS error occurred: %s", e) logging.info("Running minimap2 on the files in the long read fastq dir") for fastq_file in fastq_file_list: sam_file = minimap2_dir / f"{fastq_file.name}.sam" bed_file = minimap2_dir / f"{fastq_file.name}.bed" logging.info("Processing %s", fastq_file) with open(bed_file, "w+", encoding="utf8") as bed_file_out: subprocess.run( # pylint:disable=subprocess-run-check [ minimap2_bin, "-G", str(max_intron_length), "-t", str(num_threads), "--cs", "--secondary=no", "-ax", "splice", "-u", "b", minimap2_index_file, fastq_file, "-o", sam_file, ] ) logging.info("Creating bed file from SAM") subprocess.run( [paftools_bin, "splice2bed", sam_file], stdout=bed_file_out ) # pylint:disable=subprocess-run-check _bed_to_gtf(minimap2_dir) logging.info("Completed running minimap2")
def _bed_to_gtf(output_dir: Path) -> None: """ Convert bed file into gtf file Args: output_dir : Working directory path. """ gtf_file_path = output_dir / "annotation.gtf" with open(gtf_file_path, "w+", encoding="utf8") as gtf_out: gene_id = 1 for bed_file in output_dir.glob("*.bed"): logging.info("Converting bed to GTF: %s", str(bed_file)) with open(bed_file, "r", encoding="utf8") as bed_in: for line in bed_in: elements = line.rstrip().split("\t") seq_region_name = elements[0] offset = int(elements[1]) strand = elements[5] # sizes of individual block of exons block_sizes = [size for size in elements[10].split(",") if size] block_starts = [size for size in elements[11].split(",") if size] exons = _bed_block_to_exons(block_sizes, block_starts, offset) transcript_start = None transcript_end = None exon_records = [] for i, exon_coords in enumerate(exons): if transcript_start is None or exon_coords[0] < transcript_start: transcript_start = exon_coords[0] if transcript_end is None or exon_coords[1] > transcript_end: transcript_end = exon_coords[1] exon_line = ( f"{seq_region_name}\tminimap\texon\t{exon_coords[0]}\t" f"{exon_coords[1]}\t.\t{strand}\t.\t" f'gene_id "minimap_{gene_id}"; transcript_id "minimap_{gene_id}"; ' f'exon_number "{i+ 1}";\n' ) exon_records.append(exon_line) transcript_line = ( f"{seq_region_name}\tminimap\ttranscript\t{transcript_start}\t" f"{transcript_end}\t.\t{strand}\t.\t" f'gene_id "minimap_{gene_id}"; transcript_id "minimap_{gene_id}"\n' ) gtf_out.write(transcript_line) for exon_line in exon_records: gtf_out.write(exon_line) gene_id += 1 def _bed_block_to_exons(block_sizes: List, block_starts: List, offset: int) -> List: """ Extract exon size and start from exon feature block Args: block_sizes : Block feature size. block_starts : Block feature starts. offset : Feature offset. Returns: List of exon coordinates """ exons = [] for i, _ in enumerate(block_sizes): block_start = offset + int(block_starts[i]) + 1 block_end = block_start + int(block_sizes[i]) - 1 if block_end < block_start: logging.warning("Warning: block end is less than block start, skipping exon") continue exon_coords = [str(block_start), str(block_end)] exons.append(exon_coords) return exons def parse_args(): """Parse command line arguments.""" parser = argparse.ArgumentParser(description="Minimap2's arguments") parser.add_argument("--output_dir", required=True, help="Output directory path") parser.add_argument("--long_read_fastq_dir", required=True, help="Long read directory path") parser.add_argument("--genome_file", required=True, help="Genome file path") parser.add_argument("--minimap2_bin", default="minimap2", help="Minimap2 software path") parser.add_argument("--paftools_bin", default="paftools.js", help="Paftools js path") parser.add_argument("--max_intron_length", type=int, default=100000, help="The maximum intron length.") parser.add_argument("--num_threads", type=int, default=1, help="Number of threads") return parser.parse_args() def main(): """Minimap2's entry-point.""" args = parse_args() log_file_path = create_dir(args.output_dir, "log") / "minimap.log" loginipath = Path(__file__).parents[6] / "conf" / "logging.conf" logging.config.fileConfig( loginipath, defaults={"logfilename": str(log_file_path)}, disable_existing_loggers=False, ) run_minimap2( args.output_dir, args.long_read_fastq_dir, args.genome_file, args.minimap2_bin, args.paftools_bin, args.max_intron_length, args.num_threads, ) if __name__ == "__main__": main()