Source code for ensembl.tools.anno.transcriptomic_annotation.minimap

# See the NOTICE file distributed with this work for additional information #pylint: disable=missing-module-docstring
# regarding copyright ownership.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Minimap2 is a pairwise sequence alignment algorithm designed for efficiently comparing nucleotide sequences.
The algorithm uses a versatile indexing strategy to quickly find approximate matches between sequences, 
allowing it to efficiently align long sequences against reference genomes or other sequences.

Li, H. (2018). Minimap2: pairwise alignment for nucleotide sequences. Bioinformatics, 34(18), 3094-3100.
"""

__all__ = ["run_minimap2"]

import argparse
import logging
import logging.config
from pathlib import Path
import subprocess
from typing import List

from ensembl.tools.anno.utils._utils import (
    check_exe,
    create_dir,
    check_gtf_content,
)


[docs]def run_minimap2(
    output_dir: Path,
    long_read_fastq_dir: Path,
    genome_file: Path,
    minimap2_bin: Path = Path("minimap2"),
    paftools_bin: Path = Path("paftools.js"),
    max_intron_length: int = 100000,
    num_threads: int = 1,
) -> None:
    """
    Run Minimap2 to align long read data against genome file.
    Default Minimap set for PacBio data.

        :param output_dir: Working directory path.
        :type output_dir: Path
        :param long_read_fastq_dir: Long read directory path.
        :type long_read_fastq_dir: Path
        :param genome_file: Genome file path.
        :type genome_file: Path
        :param minimap2_bin: Software path.
        :type minimap2_bin: Path, default minimap2
        :param paftools_bin: Js path.
        :type paftools_bin: Path, default paftools.js
        :param max_intron_length: The maximum intron size for alignments. Defaults to 100000.
        :type max_intron_length: int, default 100000
        :param num_threads: Number of available threads.
        :type num_threads: int, default 1

        :return: None
        :rtype: None
    """
    check_exe(minimap2_bin)
    check_exe(paftools_bin)
    minimap2_dir = create_dir(output_dir, "minimap2_output")

    logging.info("Skip analysis if the gtf file already exists")
    output_file = minimap2_dir / "annotation.gtf"
    if output_file.exists():
        transcript_count = check_gtf_content(output_file, "transcript")
        if transcript_count > 0:
            logging.info("Minimap2 gtf file exists, skipping analysis")
            return
    minimap2_index_file = minimap2_dir / f"{Path(genome_file).name}.mmi"
    # minimap2_hints_file = minimap2_dir /"minimap2_hints.gff"
    file_types = ("*.fastq", "*.fq")
    fastq_file_list = [
        path for file_type in file_types for path in Path(long_read_fastq_dir).rglob(file_type)
    ]
    if len(fastq_file_list) == 0:
        raise IndexError(f"The list of fastq files is empty. Fastq dir:\n{long_read_fastq_dir}")

    if not minimap2_index_file.exists():
        logging.info("Did not find an index file for minimap2. Will create now")
        try:
            subprocess.run(  # pylint:disable=subprocess-run-check
                [
                    minimap2_bin,
                    "-t",
                    str(num_threads),
                    "-d",
                    str(minimap2_index_file),
                    genome_file,
                ]
            )
        except subprocess.CalledProcessError as e:
            logging.error("An error occurred while creating minimap2 index: %s", e)
        except OSError as e:
            logging.error("An OS error occurred: %s", e)

    logging.info("Running minimap2 on the files in the long read fastq dir")
    for fastq_file in fastq_file_list:
        sam_file = minimap2_dir / f"{fastq_file.name}.sam"
        bed_file = minimap2_dir / f"{fastq_file.name}.bed"
        logging.info("Processing %s", fastq_file)
        with open(bed_file, "w+", encoding="utf8") as bed_file_out:
            subprocess.run(  # pylint:disable=subprocess-run-check
                [
                    minimap2_bin,
                    "-G",
                    str(max_intron_length),
                    "-t",
                    str(num_threads),
                    "--cs",
                    "--secondary=no",
                    "-ax",
                    "splice",
                    "-u",
                    "b",
                    minimap2_index_file,
                    fastq_file,
                    "-o",
                    sam_file,
                ]
            )
            logging.info("Creating bed file from SAM")
            subprocess.run(
                [paftools_bin, "splice2bed", sam_file], stdout=bed_file_out
            )  # pylint:disable=subprocess-run-check

    _bed_to_gtf(minimap2_dir)

    logging.info("Completed running minimap2")


def _bed_to_gtf(output_dir: Path) -> None:
    """
    Convert bed file into gtf file
    Args:
        output_dir : Working directory path.
    """
    gtf_file_path = output_dir / "annotation.gtf"
    with open(gtf_file_path, "w+", encoding="utf8") as gtf_out:
        gene_id = 1
        for bed_file in output_dir.glob("*.bed"):
            logging.info("Converting bed to GTF: %s", str(bed_file))
            with open(bed_file, "r", encoding="utf8") as bed_in:
                for line in bed_in:
                    elements = line.rstrip().split("\t")
                    seq_region_name = elements[0]
                    offset = int(elements[1])
                    strand = elements[5]
                    # sizes of individual block of exons
                    block_sizes = [size for size in elements[10].split(",") if size]
                    block_starts = [size for size in elements[11].split(",") if size]
                    exons = _bed_block_to_exons(block_sizes, block_starts, offset)
                    transcript_start = None
                    transcript_end = None
                    exon_records = []
                    for i, exon_coords in enumerate(exons):
                        if transcript_start is None or exon_coords[0] < transcript_start:
                            transcript_start = exon_coords[0]

                        if transcript_end is None or exon_coords[1] > transcript_end:
                            transcript_end = exon_coords[1]

                        exon_line = (
                            f"{seq_region_name}\tminimap\texon\t{exon_coords[0]}\t"
                            f"{exon_coords[1]}\t.\t{strand}\t.\t"
                            f'gene_id "minimap_{gene_id}"; transcript_id "minimap_{gene_id}"; '
                            f'exon_number "{i+ 1}";\n'
                        )
                        exon_records.append(exon_line)
                    transcript_line = (
                        f"{seq_region_name}\tminimap\ttranscript\t{transcript_start}\t"
                        f"{transcript_end}\t.\t{strand}\t.\t"
                        f'gene_id "minimap_{gene_id}"; transcript_id "minimap_{gene_id}"\n'
                    )
                    gtf_out.write(transcript_line)
                    for exon_line in exon_records:
                        gtf_out.write(exon_line)
                    gene_id += 1


def _bed_block_to_exons(block_sizes: List, block_starts: List, offset: int) -> List:
    """
    Extract exon size and start from exon feature block
    Args:
        block_sizes : Block feature size.
        block_starts : Block feature starts.
        offset : Feature offset.

    Returns:
        List of exon coordinates
    """
    exons = []
    for i, _ in enumerate(block_sizes):
        block_start = offset + int(block_starts[i]) + 1
        block_end = block_start + int(block_sizes[i]) - 1
        if block_end < block_start:
            logging.warning("Warning: block end is less than block start, skipping exon")
            continue
        exon_coords = [str(block_start), str(block_end)]
        exons.append(exon_coords)
    return exons


def parse_args():
    """Parse command line arguments."""
    parser = argparse.ArgumentParser(description="Minimap2's arguments")
    parser.add_argument("--output_dir", required=True, help="Output directory path")
    parser.add_argument("--long_read_fastq_dir", required=True, help="Long read directory path")
    parser.add_argument("--genome_file", required=True, help="Genome file path")
    parser.add_argument("--minimap2_bin", default="minimap2", help="Minimap2 software path")
    parser.add_argument("--paftools_bin", default="paftools.js", help="Paftools js path")
    parser.add_argument("--max_intron_length", type=int, default=100000, help="The maximum intron length.")
    parser.add_argument("--num_threads", type=int, default=1, help="Number of threads")
    return parser.parse_args()


def main():
    """Minimap2's entry-point."""
    args = parse_args()

    log_file_path = create_dir(args.output_dir, "log") / "minimap.log"
    loginipath = Path(__file__).parents[6] / "conf" / "logging.conf"

    logging.config.fileConfig(
        loginipath,
        defaults={"logfilename": str(log_file_path)},
        disable_existing_loggers=False,
    )

    run_minimap2(
        args.output_dir,
        args.long_read_fastq_dir,
        args.genome_file,
        args.minimap2_bin,
        args.paftools_bin,
        args.max_intron_length,
        args.num_threads,
    )


if __name__ == "__main__":
    main()
Source code for ensembl.tools.anno.transcriptomic_annotation.minimap

Table of Contents

Search