# See the NOTICE file distributed with this work for additional information
# regarding copyright ownership.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
The STAR (Spliced Transcripts Alignment to a Reference) alignment tool is widely used
in genomics research for aligning RNA-seq data to a reference genome.
Dobin A, Davis CA, Schlesinger F, et al. STAR: ultrafast universal RNA-seq aligner.
Bioinformatics. 2013;29(1):15-21. doi:10.1093/bioinformatics/bts635
"""
__all__ = ["run_star", "subsample_transcriptomic_data", "run_trimming"]
import argparse
import logging
import logging.config
import gzip
import math
import multiprocessing
from pathlib import Path
import random
import re
import shutil
import subprocess
from typing import Dict, List, Set
from ensembl.tools.anno.utils._utils import (
check_exe,
create_dir,
check_gtf_content,
get_seq_region_length,
)
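# Example invocation of this module's CLI (hypothetical paths; the exact script/module
# name depends on where this file lives in the package; flags are defined in parse_args below):
# python star.py --genome_file genome.fa --output_dir /path/to/out \
# --short_read_fastq_dir /path/to/fastq --run_star True --num_threads 8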
def run_star( # pylint:disable=too-many-branches
genome_file: Path,
output_dir: Path,
short_read_fastq_dir: Path,
delete_pre_trim_fastq: bool = False,
trim_fastq: bool = False,
max_reads_per_sample: int = 0,
max_intron_length: int = 100000,
subsample_read_limit: int = 100000000,
subsample_percentage: float = 0.25,
sampling_via_read_limit: bool = False,
sampling_via_percentage: bool = False,
sampling_via_read_limit_percentage: bool = False,
num_threads: int = 1,
star_bin: Path = Path("star"),
samtools_bin: Path = Path("samtools"),
trim_galore_bin: Path = Path("trim_galore"),
) -> None:
"""
Run STAR alignment on a list of short-read fastq files.
:param genome_file: Genome file path.
:type genome_file: Path
:param output_dir: Working directory path.
:type output_dir: Path
:param short_read_fastq_dir: Short read directory path.
:type short_read_fastq_dir: Path
:param delete_pre_trim_fastq: Delete the original fastq files after trimming. Defaults to False.
:type delete_pre_trim_fastq: boolean, default False
:param trim_fastq: Trim short read files using TrimGalore. Defaults to False.
:type trim_fastq: boolean, default False
:param max_reads_per_sample: Max number of reads per sample. Defaults to 0 (unlimited).
:type max_reads_per_sample: int, default 0
:param max_intron_length: The maximum intron size for alignments. Defaults to 100000.
:type max_intron_length: int, default 100000
:param subsample_read_limit: Maximum number of reads to subsample. Defaults to 100000000.
:type subsample_read_limit: int, default 100000000
:param subsample_percentage: Maximum percentage of reads to subsample. Defaults to 0.25.
:type subsample_percentage: float, default 0.25
:param sampling_via_read_limit: Subsample fastq files using subsample_read_limit. Defaults to False.
:type sampling_via_read_limit: boolean, default False
:param sampling_via_percentage: Subsample fastq files using subsample_percentage. Defaults to False.
:type sampling_via_percentage: boolean, default False
:param sampling_via_read_limit_percentage: Subsample using both subsample_read_limit and \
subsample_percentage; the lower number of reads is taken. Defaults to False.
:type sampling_via_read_limit_percentage: boolean, default False
:param num_threads: Number of available threads.
:type num_threads: int, default 1
:param star_bin: Path to the STAR binary.
:type star_bin: Path, default star
:param samtools_bin: Path to the samtools binary.
:type samtools_bin: Path, default samtools
:param trim_galore_bin: Path to the Trim Galore binary.
:type trim_galore_bin: Path, default trim_galore
:return: None
:rtype: None
"""
check_exe(star_bin)
check_exe(samtools_bin)
# If trimming has been enabled then switch the path for
# short_read_fastq_dir from the original location to the trimmed fastq dir
if trim_fastq:
run_trimming(output_dir, short_read_fastq_dir, delete_pre_trim_fastq, num_threads, trim_galore_bin)
short_read_fastq_dir = output_dir / "trim_galore_output"
star_dir = create_dir(output_dir, "star_output")
for output_file in [
Path(f"{output_dir}/stringtie_output/annotation.gtf"),
Path(f"{output_dir}/scallop_output/annotation.gtf"),
]:
if output_file.exists():
transcript_count = check_gtf_content(output_file, "transcript")  # count transcript entries in the GTF
if transcript_count > 0:
logging.info("Transcriptomic alignment exists")
return
star_index_file = star_dir / "SAindex"
file_types = ("*.fastq", "*.fq", "*.fastq.gz", "*.fq.gz")
fastq_file_list: List[Path] = [
path for file_type in file_types for path in Path(short_read_fastq_dir).rglob(file_type)
]
if len(fastq_file_list) == 0:
raise IndexError(f"The list of fastq files is empty. Fastq dir:\n{short_read_fastq_dir}")
# Get list of paired paths
paired_fastq_file_list = _create_paired_paths(fastq_file_list)
# Subsample in parallel if a per-sample read limit is set
if max_reads_per_sample:
paired_fastq_file_list_sub: List[List[Path]] = []
for paired_fastq_files in paired_fastq_file_list:
paired_fastq_files_sub = subsample_transcriptomic_data(
paired_fastq_files,
subsample_read_limit,
subsample_percentage,
sampling_via_read_limit,
sampling_via_percentage,
sampling_via_read_limit_percentage,
num_threads,
)
paired_fastq_file_list_sub.append(paired_fastq_files_sub)
paired_fastq_file_list = paired_fastq_file_list_sub
if not star_index_file.exists():
logging.info("Did not find an index file for Star. Will create now")
seq_region_to_length = get_seq_region_length(genome_file, 0)
genome_size = sum(seq_region_to_length.values())
# STAR's --genomeSAindexNbases sets the length (in bases) of the SA pre-indexing string.
# For small genomes it must be scaled down; the STAR manual recommends
# min(14, log2(genome_size) / 2 - 1). Too large a value increases memory usage and can
# slow indexing, while too small a value can hurt alignment performance.
index_bases = min(14, math.floor((math.log(genome_size, 2) / 2) - 1))
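# For example, a 3.1 Gb genome gives floor(log2(3.1e9) / 2 - 1) = floor(14.77) = 14
# (capped at 14), while a 100 Mb genome gives floor(log2(1e8) / 2 - 1) = 12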
try:
subprocess.run( # pylint:disable=subprocess-run-check
[
str(star_bin),
"--runThreadN",
str(num_threads),
"--runMode",
"genomeGenerate",
"--outFileNamePrefix",
f"{star_dir}/",
"--genomeDir",
str(star_dir),
"--genomeSAindexNbases",
str(index_bases),
"--genomeFastaFiles",
str(genome_file),
]
)
except Exception as e: # pylint:disable=broad-exception-caught
logging.error("An error occurred while creating star index: %s", e)
logging.info("Running Star on the files in the fastq dir")
for paired_files in paired_fastq_file_list:
first_file_name = paired_files[0].name
match = re.search(r"(.+)_\d+\.(fastq|fq)", first_file_name)
if match:
first_part_of_name = match.group(1)
else:
# Unpaired files pass through _create_paired_paths without the _N suffix,
# so fall back to stripping the extension
first_part_of_name = re.sub(r"\.(fastq|fq)(\.gz)?$", "", first_file_name)
star_tmp_dir = star_dir / "tmp"
if star_tmp_dir.exists():
shutil.rmtree(star_tmp_dir)
sam_file = Path(f"{star_dir}/{first_part_of_name}.sam")
junctions_file = Path(f"{star_dir}/{first_part_of_name}.sj.tab")
sam_file_name = sam_file.name
sam_temp_file = Path(f"{star_dir}/{sam_file_name}.tmp")
bam_file = re.sub(r"\.sam$", ".bam", sam_file_name)
bam_sort_file = Path(f"{star_dir}/{bam_file}")
log_out_file = Path(f"{star_dir}/{first_part_of_name}.Log.final.out")
if log_out_file.exists() and bam_sort_file.exists() and bam_sort_file.stat().st_size != 0:
logging.info(
"Found an existing bam file for this fastq file; presuming it has already been processed, skipping"
)
continue
read_files_in = ",".join(str(fastq_file) for fastq_file in paired_files)
logging.info("Processing %s", list(paired_files))
star_command = [
str(star_bin),
"--outFilterIntronMotifs",
"RemoveNoncanonicalUnannotated",
"--outSAMstrandField",
"intronMotif",
"--runThreadN",
str(num_threads),
"--twopassMode",
"Basic",
"--runMode",
"alignReads",
"--genomeDir",
str(star_dir),
"--readFilesIn",
read_files_in,
"--outFileNamePrefix",
f"{star_dir}/",
"--outTmpDir",
str(star_tmp_dir),
"--outSAMtype",
"SAM",
"--alignIntronMax",
str(max_intron_length),
]
if paired_files[0].name.endswith(".gz"):
star_command.extend(["--readFilesCommand", "gunzip", "-c"])
subprocess.run(star_command) # pylint:disable=subprocess-run-check
shutil.move(Path(f"{star_dir}/Aligned.out.sam"), sam_file)
shutil.move(Path(f"{star_dir}/SJ.out.tab"), junctions_file)
logging.info("Converting samfile into sorted bam file. Bam file: %s", bam_file)
subprocess.run( # pylint:disable=subprocess-run-check
[
str(samtools_bin),
"sort",
"-@",
str(num_threads),
"-T",
str(sam_temp_file),
"-o",
str(bam_sort_file),
str(sam_file),
]
)
shutil.move(star_dir / "Log.final.out", log_out_file)
sam_file.unlink()
logging.info("Completed running STAR")
def _create_paired_paths(fastq_file_paths: List[Path]) -> List[List[Path]]:
"""
Create list of paired transcriptomic fastq files
Args:
fastq_file_paths (List): List of transcriptomic file paths.
Returns:
List[List[Path]]: List of paired transcriptomic files
"""
path_dict: Dict[str, List[Path]] = {}
final_list: List[List[Path]] = []
for fastq_file in fastq_file_paths:
paired_name = re.search(r"(.+)_\d+\.(fastq|fq)", str(fastq_file))
if not paired_name:
logging.warning(
"Could not find an _1/_2 suffix in the file name; assuming the file is not paired: %s",
str(fastq_file),
)
final_list.append([fastq_file])
continue
run_accession = paired_name.group(1)
if run_accession in path_dict:
path_dict[run_accession].append(fastq_file)
else:
path_dict[run_accession] = [fastq_file]
final_list.extend(path_dict.values())
return final_list
# For a more advanced and optimised subsampling we could use seqtk:
# https://github.com/lh3/seqtk
def subsample_transcriptomic_data(
fastq_file_list: List[Path],
subsample_read_limit: int = 100000000,
subsample_percentage: float = 0.25,
sampling_via_read_limit: bool = False,
sampling_via_percentage: bool = False,
sampling_via_read_limit_percentage: bool = True,
num_threads: int = 2,
) -> List[Path]:
"""
Subsample list of paired files.
Args:
fastq_file_list : List of single or paired fastq files to subsample.
subsample_read_limit : Maximum number of reads to subsample, defaults to 100000000.
subsample_percentage : Maximum percentage of reads to subsample, defaults to 0.25.
sampling_via_read_limit : If True, subsample using the --subsample_read_limit value.
sampling_via_percentage : If True, subsample using the --subsample_percentage value.
sampling_via_read_limit_percentage : If True, subsample using both --subsample_read_limit \
and --subsample_percentage; the lower number of reads is taken.
num_threads : Number of threads, defaults to 2.
Returns:
List[Path]: List of subsampled paired transcriptomic files
"""
subsampled_fastq_files: List[Path] = []
if len(fastq_file_list) == 1:
fastq_file_1 = fastq_file_list[0]
if Path(f"{fastq_file_1}.sub").exists():
logging.info(
"Found an existing .sub file on the fastq path, will use that instead. File: %s.sub",
fastq_file_1,
)
else:
_subsample_paired_fastq_files(
fastq_file_list,
subsample_read_limit,
subsample_percentage,
sampling_via_read_limit,
sampling_via_percentage,
sampling_via_read_limit_percentage,
num_threads,
)
subsampled_fastq_files = [Path(f"{fastq_file_1}.sub")]
if len(fastq_file_list) == 2:
fastq_file_1, fastq_file_2 = fastq_file_list
if Path(f"{fastq_file_1}.sub").exists() and Path(f"{fastq_file_2}.sub").exists():
logging.info(
"Found existing .sub files for both members of the pair, will use those "
"instead of subsampling again. Files: %s.sub, %s.sub",
fastq_file_1,
fastq_file_2,
)
else:
_subsample_paired_fastq_files(
fastq_file_list,
subsample_read_limit,
subsample_percentage,
sampling_via_read_limit,
sampling_via_percentage,
sampling_via_read_limit_percentage,
num_threads,
)
subsampled_fastq_files = [Path(f"{fastq_file_1}.sub"), Path(f"{fastq_file_2}.sub")]
return subsampled_fastq_files
def _subsample_paired_fastq_files( # pylint:disable=too-many-branches
fastq_files: List[Path],
subsample_read_limit: int = 100000000,
subsample_percentage: float = 0.25,
sampling_via_read_limit: bool = False,
sampling_via_percentage: bool = False,
sampling_via_read_limit_percentage: bool = True,
num_threads: int = 2,
) -> None:
"""
Perform subsampling on two paired FastQ files in parallel using multiple threads.
Args:
fastq_files : Paths of single or paired fastq files.
subsample_read_limit : Maximum number of reads to subsample, defaults to 100000000.
subsample_percentage : Maximum percentage of reads to subsample, defaults to 0.25.
sampling_via_read_limit : If True, subsample using the --subsample_read_limit value.
sampling_via_percentage : If True, subsample using the --subsample_percentage value.
sampling_via_read_limit_percentage : If True, subsample using both --subsample_read_limit \
and --subsample_percentage; the lower number of reads is taken.
num_threads : Number of threads, defaults to 2.
"""
if len(fastq_files) == 2:
fastq_file_1, fastq_file_2 = fastq_files
output_file_1, output_file_2 = [Path(f"{fastq_file_1}.sub"), Path(f"{fastq_file_2}.sub")]
elif len(fastq_files) == 1:
fastq_file_1 = fastq_files[0]
output_file_1 = Path(f"{fastq_file_1}.sub")
else:
raise FileNotFoundError("No fastq file found")
if fastq_file_1.name.endswith(".gz"):
compressed = True
with gzip.open(fastq_file_1, "rt") as file_in:
num_lines = sum(1 for _ in file_in)
else:
compressed = False
with open(fastq_file_1) as file_in:
num_lines = sum(1 for _ in file_in)
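# A FastQ record is four lines (header, sequence, separator, qualities),
# so the read count is the line count divided by four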
range_limit = num_lines // 4
sampling_size = 0
if sampling_via_read_limit and subsample_read_limit:
sampling_size = subsample_read_limit
elif sampling_via_percentage and subsample_percentage:
sampling_size = round(range_limit * subsample_percentage)
elif sampling_via_read_limit_percentage and subsample_percentage and subsample_read_limit:
sampling_size = min(subsample_read_limit, round(range_limit * subsample_percentage))
if range_limit <= sampling_size:
logging.info(
"Number of reads (%s) does not exceed the maximum allowed read count (%s), no need to subsample",
str(range_limit),
str(sampling_size),
)
return
rand_list = random.sample(range(range_limit), sampling_size)
random_indices = {idx * 4 for idx in rand_list}
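# Sampled read indices are mapped to their starting line numbers:
# read i occupies lines 4*i .. 4*i + 3 of the FastQ file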
logging.info("Processing paired files in parallel")
if num_threads >= 2:
pool = multiprocessing.Pool(int(num_threads)) # pylint:disable=consider-using-with
pool.apply_async(
_subsample_fastq_subset,
args=(
fastq_file_1,
output_file_1,
random_indices,
compressed,
),
)
if len(fastq_files) == 2:
pool.apply_async(
_subsample_fastq_subset,
args=(
fastq_file_2,
output_file_2,
random_indices,
compressed,
),
)
pool.close()
pool.join()
else:
_subsample_fastq_subset(
fastq_file_1,
output_file_1,
random_indices,
compressed,
)
if len(fastq_files) == 2:
_subsample_fastq_subset(
fastq_file_2,
output_file_2,
random_indices,
compressed,
)
def _subsample_fastq_subset(
fastq_file: Path, output_file: Path, random_indices: Set[int], compressed: bool
) -> None:
"""
Select specific sets of four lines (one read each) from an input FastQ file and write them to an output file.
Args:
fastq_file : Path for the fastq file.
output_file : Path for the output file.
random_indices : Set of starting line indices to keep.
compressed : Whether the input file is gzip-compressed.
"""
line_index = 0
read_block = []
with gzip.open(fastq_file, "rt") if compressed else open(fastq_file) as file_in, open(
output_file, "w+"
) as file_out:
for line in file_in:
read_block.append(line)
if len(read_block) == 4:
if line_index in random_indices:
file_out.writelines(read_block)
read_block = []
line_index += 4
def run_trimming(
output_dir: Path,
short_read_fastq_dir: Path,
delete_pre_trim_fastq: bool = False,
num_threads: int = 1,
trim_galore_bin="trim_galore",
) -> None:
"""
Trim list of short read fastq files.
Args:
output_dir : Working directory path.
short_read_fastq_dir : Short read directory path.
delete_pre_trim_fastq : Remove the original fastq files after trimming. Defaults to False.
num_threads : Number of threads.
trim_galore_bin : Path to the Trim Galore binary.
"""
check_exe(trim_galore_bin)
trim_dir = create_dir(output_dir, "trim_galore_output")
file_types = ("*.fastq", "*.fq", "*.fastq.gz", "*.fq.gz")
fastq_file_list: List[Path] = [
path for file_type in file_types for path in Path(short_read_fastq_dir).rglob(file_type)
]
paired_fastq_file_list = _create_paired_paths(fastq_file_list)
trim_galore_cmd = [
str(trim_galore_bin),
"--illumina",
"--quality",
"20",
"--length",
"50",
"--output_dir",
str(trim_dir),
]
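# Trim Galore typically names paired output <name>_val_1.fq.gz / <name>_val_2.fq.gz and
# single-end output <name>_trimmed.fq.gz; those suffixes are renamed back to .fq below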
pool = multiprocessing.Pool(int(num_threads)) # pylint:disable=consider-using-with
for fastq_paired_files in paired_fastq_file_list:
pool.apply_async(
multiprocess_trim_galore,
args=(
list(trim_galore_cmd),
fastq_paired_files,
),
)
pool.close()
pool.join()
if delete_pre_trim_fastq:
# Remove the original fastq files only once trimming has completed,
# and before the trimmed files are renamed into short_read_fastq_dir
for file_type in file_types:
for file_to_delete in short_read_fastq_dir.glob(file_type):
file_to_delete.unlink()
trimmed_fastq_list = trim_dir.glob("*.fq.gz")
for trimmed_fastq_path in trimmed_fastq_list:
logging.info("Trimmed file path: %s", str(trimmed_fastq_path))
sub_patterns = re.compile(r"|".join(("_val_1.fq", "_val_2.fq", "_trimmed.fq")))
updated_file_path_name = sub_patterns.sub(".fq", trimmed_fastq_path.name)
updated_file_path = short_read_fastq_dir / updated_file_path_name
logging.info("Updated file path: %s", str(updated_file_path))
trimmed_fastq_path.rename(updated_file_path)
def multiprocess_trim_galore(trim_galore_cmd: List, fastq_paired_files: List[Path]) -> None:
"""
Trim a single or paired set of short read fastq files.
Args:
trim_galore_cmd : Generic command.
fastq_paired_files : List of single or paired fastq files.
"""
if len(fastq_paired_files) == 2:
fastq_file, fastq_file_pair = fastq_paired_files
trim_galore_cmd.append("--paired")
trim_galore_cmd.append(str(fastq_file))
trim_galore_cmd.append(str(fastq_file_pair))
elif len(fastq_paired_files) == 1:
trim_galore_cmd.append(str(fastq_paired_files[0]))
logging.info("Running Trim Galore with the following command: %s", " ".join(trim_galore_cmd))
subprocess.run(trim_galore_cmd, check=True)
def parse_args():
"""Parse command line arguments."""
parser = argparse.ArgumentParser(description="STAR's arguments")
parser.add_argument("--genome_file", help="Genome file path")
parser.add_argument("--output_dir", help="Output directory path")
parser.add_argument("--short_read_fastq_dir", help="Short read directory path")
parser.add_argument(
"--delete_pre_trim_fastq",
action="store_true",
default=False,
help="Delete the original fastq files after trimming",
)
parser.add_argument(
"--trim_fastq", action="store_true", default=False, help="Trim the short read files using Trim Galore"
)
parser.add_argument(
"--max_reads_per_sample", type=int, default=0, help="The maximum number of reads to use per sample"
)
parser.add_argument(
"--max_intron_length", type=int, default=100000, help="The maximum intron size for alignments"
)
parser.add_argument("--num_threads", type=int, default=1, help="Number of threads")
parser.add_argument("--star_bin", default="STAR", help="Star software path")
parser.add_argument("--samtools_bin", default="samtools", help="Samtools software path")
parser.add_argument("--trim_galore_bin", default="trim_galore", help="Trim Galore software path")
parser.add_argument(
"--subsample_read_limit",
type=int,
default=100000000,
help="Maximum number of reads to subsample. Default 1 hundred million reads",
required=False,
)
parser.add_argument(
"--subsample_percentage",
type=float,
default=0.25,
help="Maximun percentage of reads to subsample (0 to 1)",
required=False,
)
parser.add_argument(
"--sampling_via_read_limit",
type=bool,
default=False,
help="If True will subsample an input dataset of fastq files using --subsample_read_limit value.",
required=False,
)
parser.add_argument(
"--sampling_via_percentage",
type=bool,
default=False,
help="If True will subsample an input dataset of fastq files using --subsample_percentage value.",
required=False,
)
parser.add_argument(
"--sampling_via_read_limit_percentage",
type=bool,
default=True,
help="If True will subsample an input dataset of fastq files using --subsample_read_limit \
and --subsample_percentage value; the lowest number of reads is taken.",
required=False,
)
parser.add_argument(
"--paired_file_1",
help="Path for single or paired fastq file; used when --run_subsampling \
or --run_trimming are enabled.",
required=False,
)
parser.add_argument(
"--paired_file_2",
help="Path for single or paired fastq file; used when --run_subsampling \
or --run_trimming are enabled.",
required=False,
)
parser.add_argument(
"--run_star",
type=bool,
help="If True will run STAR alignment given an input dataset of fastq files.",
required=False,
)
parser.add_argument(
"--run_subsampling",
type=bool,
default=False,
help="If True will subsample an input dataset of fastq files.",
required=False,
)
parser.add_argument(
"--run_trimming",
type=bool,
default=False,
help="If True will trim input dataset of fastq files using TrimGalore suite.",
required=False,
)
return parser.parse_args()
def main():
"""STAR's entry-point."""
args = parse_args()
log_file_path = create_dir(args.output_dir, "log") / "star.log"
loginipath = Path(__file__).parents[6] / "conf" / "logging.conf"
logging.config.fileConfig(
loginipath,
defaults={"logfilename": str(log_file_path)},
disable_existing_loggers=False,
)
if args.run_star:
run_star(
args.genome_file,
args.output_dir,
args.short_read_fastq_dir,
args.delete_pre_trim_fastq,
args.trim_fastq,
args.max_reads_per_sample,
args.max_intron_length,
args.subsample_read_limit,
args.subsample_percentage,
args.sampling_via_read_limit,
args.sampling_via_percentage,
args.sampling_via_read_limit_percentage,
args.num_threads,
args.star_bin,
args.samtools_bin,
args.trim_galore_bin,
)
if args.run_subsampling:
fastq_file_list = [Path(args.paired_file_1)]
if args.paired_file_2:
fastq_file_list.append(Path(args.paired_file_2))
subsample_transcriptomic_data(
fastq_file_list,
args.subsample_read_limit,
args.subsample_percentage,
args.sampling_via_read_limit,
args.sampling_via_percentage,
args.sampling_via_read_limit_percentage,
args.num_threads,
)
if args.run_trimming:
run_trimming(
args.output_dir,
args.short_read_fastq_dir,
args.delete_pre_trim_fastq,
args.num_threads,
args.trim_galore_bin,
)
if __name__ == "__main__":
main()
# pylint:disable=pointless-string-statement
"""
def model_builder(work_dir):
star_output_dir = os.path.join(work_dir, "star_output")
all_junctions_file = os.path.join(star_output_dir, "all_junctions.sj")
sjf_out = open(all_junctions_file, "w+")
for sj_tab_file in glob.glob(input_dir + "/*.sj.tab"):
sjf_in = open(sj_tab_file)
sjf_lines = sjf_in.readlines()
for line in sjf_lines:
elements = line.split("\t")
strand = "+"
# my $slice_name = $eles[0];
# my $start = $eles[1];
# my $end = $eles[2];
# my $strand = $eles[3];
# If the strand is undefined then skip, Augustus expects a strand
if elements[3] == "0":
continue
elif elements[3] == "2":
strand = "-"
junction_length = int(elements[2]) - int(elements[1]) + 1
if junction_length < 100:
continue
if not elements[4] and elements[7] < 10:
continue
# For the moment treat multimapping and single
# mapping things as a combined score
score = float(elements[6]) + float(elements[7])
score = str(score)
output_line = [
elements[0],
"RNASEQ",
"intron",
elements[1],
elements[2],
score,
strand,
".",
("src=W;mul=" + score + ";"),
]
sjf_out.write("\t".join(output_line) + "\n")
sjf_out.close()
"""