Source code for ensembl.tools.anno.transcriptomic_annotation.stringtie

# See the NOTICE file distributed with this work for additional information
# regarding copyright ownership.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
StringTie is a fast and highly efficient assembler of RNA-Seq alignments into potential transcripts.
It uses a novel network flow algorithm as well as an optional de novo assembly step to assemble and
quantitate full-length transcripts representing multiple splice variants for each gene locus.
Pertea M, Pertea GM, Antonescu CM, Chang TC, Mendell JT & Salzberg SL. StringTie enables improved 
reconstruction of a transcriptome from RNA-seq reads Nature Biotechnology 2015, doi:10.1038/nbt.3122
"""

__all__ = ["run_stringtie"]

import argparse
import logging
import logging.config
from pathlib import Path
import re
import subprocess

from ensembl.tools.anno.utils._utils import (
    check_exe,
    create_dir,
    check_gtf_content,
)


[docs]def run_stringtie( output_dir: Path, stringtie_bin: Path = Path("stringtie"), num_threads: int = 1, ) -> None: """ StringTie assembler of short read data. :param output_dir: Working directory path. :type output_dir: Path :param stringtie_bin: Software path. :type stringtie_bin: Path, default stringtie :param num_threads: Number of available threads. :type num_threads: int, default 1 :return: None :rtype: None """ check_exe(stringtie_bin) stringtie_dir = create_dir(output_dir, "stringtie_output") logging.info("Skip analysis if the gtf file already exists") output_file = stringtie_dir / "annotation.gtf" if output_file.exists(): transcript_count = check_gtf_content(output_file, "transcript") if transcript_count > 0: logging.info("Stringtie gtf file exists, skipping analysis") return stringtie_merge_input_file = stringtie_dir / "stringtie_assemblies.txt" stringtie_merge_output_file = stringtie_dir / "annotation.gtf" star_dir = Path(f"{output_dir}/star_output") if star_dir.exists() and len(list(star_dir.glob("*.bam"))) != 0: for sorted_bam_file in star_dir.glob("*.bam"): transcript_file_name = re.sub(".bam", ".stringtie.gtf", sorted_bam_file.name) transcript_file = stringtie_dir / transcript_file_name if transcript_file.exists(): logging.info( "Found an existing stringtie gtf file, will not overwrite. \ File found: %s", transcript_file, ) else: logging.info("Running Stringtie on: %s", sorted_bam_file.name) try: subprocess.check_output( # pylint:disable=subprocess-run-check [ stringtie_bin, sorted_bam_file, "-o", transcript_file, "-p", str(num_threads), "-t", # disable trimming of predicted transcripts based on coverage "-a", # minimum anchor length for junctions "15", ] ) except subprocess.CalledProcessError as e: logging.error("Error running Stringtie command: %s", e) logging.error("Return code: %s", str(e.returncode)) logging.error("Output and error messages:%s\n", e.output) else: raise IndexError(f"The list of sorted bam files is empty, Star output dir: {star_dir}") logging.info("Creating Stringtie merge input file: %s", stringtie_merge_input_file) with open(stringtie_merge_input_file, "w+", encoding="utf8") as gtf_list_out: for gtf_file in stringtie_dir.glob("*.stringtie.gtf"): transcript_count = check_gtf_content(gtf_file, "transcript") if transcript_count > 0: gtf_list_out.write(f"{gtf_file}\n") else: logging.warning("Warning, skipping file with no transcripts. Path:%s", gtf_file) logging.info("Merging Stringtie results.") try: subprocess.run( # pylint:disable=subprocess-run-check [ stringtie_bin, "--merge", "-o", stringtie_merge_output_file, stringtie_merge_input_file, ] ) except subprocess.CalledProcessError as e: logging.error("Error running Stringtie merging command: %s", e)
def parse_args(): """Parse command line arguments.""" parser = argparse.ArgumentParser(description="StringTie's arguments") parser.add_argument("--output_dir", required=True, help="Output directory path") parser.add_argument("--stringtie_bin", default="stringtie", help="StringTie software path") parser.add_argument("--num_threads", type=int, default=1, help="Number of threads") return parser.parse_args() def main(): """StringTie's entry-point.""" args = parse_args() log_file_path = create_dir(args.output_dir, "log") / "stringtie.log" loginipath = Path(__file__).parents[6] / "conf" / "logging.conf" logging.config.fileConfig( loginipath, defaults={"logfilename": str(log_file_path)}, disable_existing_loggers=False, ) run_stringtie( args.output_dir, args.stringtie_bin, args.num_threads, ) if __name__ == "__main__": main()