Source code for ensembl.tools.anno.repeat_annotation.red

# See the NOTICE file distributed with this work for additional information
# regarding copyright ownership.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Red is the first repeat-detection tool capable of labeling its training data
and training itself automatically on an entire genome.
Girgis, H.Z. Red: an intelligent, rapid, accurate tool for detecting repeats
de-novo on the genomic scale. BMC Bioinformatics 16, 227 (2015).
https://doi.org/10.1186/s12859-015-0654-5
"""
__all__ = ["run_red"]

import argparse
import logging
import logging.config
from pathlib import Path
import re
import subprocess


from ensembl.tools.anno.utils._utils import (
    check_exe,
    create_dir,
)

logger = logging.getLogger(__name__)


[docs]def run_red( genome_file: Path, output_dir: Path, red_bin: Path = Path("Red"), ) -> str: """ Run Red on genome file :param genome_file: Genome file path. :type genome_file: Path :param output_dir: Working directory path. :type output_dir: Path :param red_bin: Red software path. :type red_bin: Path, default Red :return: Masked genome file :rtype: str """ check_exe(red_bin) red_dir = create_dir(output_dir, "red_output") red_mask_dir = create_dir(red_dir, "mask_output") red_repeat_dir = create_dir(red_dir, "repeat_output") red_genome_dir = create_dir(red_dir, "genome_dir") sym_link_genome_cmd = "ln -s " + str(genome_file) + " " + str(red_genome_dir) genome_file_name = genome_file.name red_genome_file = red_genome_dir / genome_file_name genome_file_stem = genome_file.stem masked_genome_file = red_mask_dir / f"{genome_file_stem}.msk" repeat_coords_file = red_repeat_dir / f"{genome_file_stem}.rpt" output_file = red_dir / "annotation.gtf" if masked_genome_file.exists(): logger.warning( "Masked Genome file already found on the path to the Red mask output dir. \ Will not create a new file" ) # _create_red_gtf(repeat_coords_file, output_file) return str(masked_genome_file) if red_genome_file.exists(): logger.warning( "Unmasked genome file already found on the path to the Red genome dir, \ will not create a sym link" ) else: logger.info( "Preparing to sym link the genome file to the Red genome dir. Cmd\n %s", sym_link_genome_cmd, ) # subprocess.run(["ln", "-s", genome_file, red_genome_dir]) red_genome_file.symlink_to(genome_file) try: if red_genome_file.exists(): logger.info("Running Red") subprocess.run( [ red_bin, "-gnm", red_genome_dir, "-msk", red_mask_dir, "-rpt", red_repeat_dir, ], check=True, ) except:#pylint:disable=bare-except logger.error( "Could not find the genome file in the Red genome dir or sym link \ to the original file. Path expected:\n%s", genome_file, ) _create_red_gtf(repeat_coords_file, output_file) return str(masked_genome_file)
def _create_red_gtf(repeat_coords_file: Path, output_file: Path): """ Create Red gtf file from masked genome file Args: repeat_coords_file: Coordinates for repeats. output_file : GTF file with the final results. """ with open(repeat_coords_file, "r", encoding="utf8") as red_in, open( output_file, "w+", encoding="utf8" ) as red_out: for repeat_id, line in enumerate(red_in, start=1): result_match = re.search(r"^\>(.+)\:(\d+)\-(\d+)", line) if result_match: region_name = result_match.group(1) # Note that Red is 0-based, so add 1 start = int(result_match.group(2)) + 1 end = int(result_match.group(3)) + 1 gtf_line = ( f"{region_name}\tRed\trepeat\t{start}\t" f'{end}\t.\t+\t.\trepeat_id "{repeat_id}";\n' ) red_out.write(gtf_line) def parse_args(): """Parse command line arguments.""" parser = argparse.ArgumentParser(description="Red's arguments") parser.add_argument("--genome_file", required=True, help="Genome file path") parser.add_argument("--output_dir", required=True, help="Output directory path") parser.add_argument( "--red_bin", default="red", help="Red executable path", ) return parser.parse_args() def main(): """Red's entry-point.""" args = parse_args() log_file_path = create_dir(args.output_dir, "log") / "red.log" loginipath = Path(__file__).parents[6] / "conf" / "logging.conf" logging.config.fileConfig( loginipath, defaults={"logfilename": str(log_file_path)}, disable_existing_loggers=False, ) run_red( Path(args.genome_file), args.output_dir, args.red_bin, ) if __name__ == "__main__": main()