Skip to content

overlaps

ensembl.io.genomio.gff3.overlaps

Scan a GFF3 file to detect overlapping SeqFeature objects. By default, features of type `gene` are compared.

get_intervals(record, genes_dict, seq_dict, seq_name)

Extract start/stop feature coordinates for use in creating intervaltree object.

Parameters:

Name Type Description Default
record SeqRecord

Individual sequence record.

required
genes_dict dict

Genes.

required
seq_dict dict

Sequences.

required
seq_name str

Feature sequence name.

required
Source code in src/python/ensembl/io/genomio/gff3/overlaps.py
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
def get_intervals(record: SeqRecord, genes_dict: dict, seq_dict: dict, seq_name: str) -> None:
    """Extract start/stop feature coordinates for use in creating intervaltree object.

    Every feature of `record` is registered in `genes_dict` (keyed by feature ID) and, when its
    strand is 1 or -1, its `(start, end, ID)` tuple is appended to the matching strand list of
    `seq_dict[seq_name]`. Both dictionaries are updated in place; nothing is returned.

    Args:
        record: Individual sequence record.
        genes_dict: Feature details keyed by feature ID. Each entry holds the keys "sequence",
            "start", "end", "strand" and "name", all stored as strings.
        seq_dict: Per-sequence interval lists, stored under the "plus" and "minus" keys. Missing
            entries are created here, so the caller does not have to pre-populate them.
        seq_name: Feature sequence name.
    """
    strand_keys = {1: "plus", -1: "minus"}

    # Create the strand buckets on demand instead of relying on the caller having done so,
    # which would otherwise surface as a KeyError on the first append.
    strand_intervals = seq_dict.setdefault(seq_name, {})
    for strand_key in strand_keys.values():
        strand_intervals.setdefault(strand_key, [])

    sequence_id = str(record.id)
    for feature in record.features:
        location = feature.location
        start = int(location.start)
        end = int(location.end)
        feature_id = str(feature.id)

        genes_dict[feature_id] = {
            "sequence": sequence_id,
            # Reported start is shifted by one relative to the interval start kept below
            # (presumably 0-based parser coordinates -> 1-based GFF3 coordinates; TODO confirm).
            "start": str(start + 1),
            "end": str(end),
            "strand": str(location.strand),
            "name": feature_id,
        }

        strand_key = strand_keys.get(location.strand)
        if strand_key is None:
            # Features without a usable strand stay in genes_dict but are not added to any
            # interval list.
            logging.critical("Something went wrong with the strand processing!")
            continue
        strand_intervals[strand_key].append((start, end, feature_id))

identify_feature_overlaps(gff_in, output_file, isolate_feature)

Detect overlapping GFF3 SeqFeature objects and dump to a report.

Parameters:

Name Type Description Default
gff_in Path

User supplied GFF3 input file.

required
output_file Path

Output file to write feature overlaps.

required
isolate_feature str

Sequence feature type to filter by.

required
Source code in src/python/ensembl/io/genomio/gff3/overlaps.py
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
def identify_feature_overlaps(gff_in: Path, output_file: Path, isolate_feature: str) -> None:
    """Find overlapping features of one type in a GFF3 file and write them to a report.

    The GFF3 file is parsed with a type filter, the intervals of every sequence record are
    collected per strand, and the report is written. Totals for scanned features and detected
    overlaps are both printed and logged.

    Args:
        gff_in: User supplied GFF3 input file.
        output_file: Output file to write feature overlaps.
        isolate_feature: Sequence feature type to filter by.
    """
    logging.info("Processing sequence feature overlaps!")
    logging.info(f"Output file = {output_file}")
    logging.info(f"Features filtered by type: {isolate_feature}")

    genes_dict: dict = {}
    seq_dict: dict = defaultdict(dict)
    # Only features of the requested type are handed over by the parser
    limit_info: dict = {"gff_type": [isolate_feature]}

    with gff_in.open("r", encoding="utf-8") as gff_handle:
        for record in GFF.parse(gff_handle, limit_info=limit_info):
            seq_name = str(record.id)
            # First time this sequence is seen: start with empty strand lists
            if seq_name not in seq_dict:
                seq_dict[seq_name] = {"plus": [], "minus": []}
            get_intervals(record, genes_dict, seq_dict, seq_name)

    overlap_count = _write_report(output_file, seq_dict, genes_dict)

    # Each summary line goes to both stdout and the log
    summaries = (
        f"In total {len(genes_dict)} {isolate_feature} features were scanned.",
        f"In total {overlap_count} overlaps were detected.",
    )
    for summary in summaries:
        print(summary)
        logging.info(summary)

    logging.info("Finished all processing.")

main()

Module entry-point.

Source code in src/python/ensembl/io/genomio/gff3/overlaps.py
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
def main() -> None:
    """Module entry-point.

    Parses the command line and runs the selected subcommand: "stats" summarises the feature
    types of the input GFF3 file, "overlaps" reports overlapping features.
    """
    # Arguments shared by both subcommands live in a help-less parent parser
    common_args = ArgumentParser(add_help=False)
    common_args.add_argument_src_path("--input_gff", required=True, help="path of GFF3 file to process")
    common_args.add_log_arguments(add_log_file=True)

    parser = ArgumentParser(description=__doc__)
    parser.add_argument("--version", action="version", version=ensembl.io.genomio.__version__)

    # Both subcommands inherit the common arguments through `parents`
    subcommands = parser.add_subparsers(title="Parse GFF3 and ", required=True, dest="subcommand")
    subcommands.add_parser("stats", parents=[common_args], help="Provide summary of feature types")
    overlaps_cmd = subcommands.add_parser("overlaps", parents=[common_args], help="Find feature overlaps")
    overlaps_cmd.add_argument_dst_path(
        "--output_file", default="feature_overlaps.txt", help="path of output file"
    )
    overlaps_cmd.add_argument(
        "--filter_type", default="gene", help="sequence feature type used for overlap isolation"
    )

    args = parser.parse_args()
    init_logging_with_args(args)

    logging.info("Starting processing...")
    logging.info(f"GFF input file = {args.input_gff}")

    # Anything other than "stats" is handled as the overlaps subcommand
    if args.subcommand == "stats":
        summarize_feature_stats(args.input_gff)
        return
    identify_feature_overlaps(args.input_gff, args.output_file, args.filter_type)

scan_tree(feature_intervals)

Construct an interval tree from the supplied genomic intervals, check every interval against the tree, and return any that hit 2 or more intervals (i.e. itself plus at least one other).

Parameters:

Name Type Description Default
feature_intervals list

Genome features to examine for coordinate (start/end) overlaps.

required
Return

Set of intervals identified in the input GFF3 file that overlap with 2 or more intervals.

Source code in src/python/ensembl/io/genomio/gff3/overlaps.py
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
def scan_tree(feature_intervals: list) -> set:
    """Construct an interval tree using supplied genomic intervals, check every interval against
    the tree and collect those that hit 2 or more intervals (i.e. itself + at least 1 other).

    Args:
        feature_intervals: Genome features to examine for coordinate (start/end) overlaps. Each
            element is unpacked into `Interval(*element)`; its first two items are used as the
            query start and end.

    Return:
        Set with the `data` payload of every interval that overlaps with at least one other
        interval. Empty if no overlaps were found.
    """

    interval_sets = set()
    traversed_tree = IntervalTree(Interval(*iv) for iv in feature_intervals)

    for interval in feature_intervals:
        # Query the tree once per interval and reuse the result for both the size check and
        # the collection step.
        hits = traversed_tree.overlap(interval[0], interval[1])
        # A single hit is just the interval matching itself
        if len(hits) > 1:
            interval_sets.update(hit.data for hit in hits)

    return interval_sets

summarize_feature_stats(gff_in)

Analyse a GFF3 file and produce a summary of its feature types.

Parameters:

Name Type Description Default
gff_in Path

User supplied GFF3 input file.

required
Source code in src/python/ensembl/io/genomio/gff3/overlaps.py
41
42
43
44
45
46
47
48
49
50
51
52
53
def summarize_feature_stats(gff_in: Path) -> None:
    """Analyse a GFF3 file and produce a summary of its feature types.

    The summary returned by `GFFExaminer.available_limits()` is pretty-printed to standard output.

    Args:
        gff_in: User supplied GFF3 input file.
    """

    logging.info("Alt processing: Not parsing the GFF3, producing summary feature stats instead!")

    examiner = GFFExaminer()
    # The context manager closes the handle on exit, so no explicit close() is needed
    with gff_in.open("r", encoding="utf-8") as input_handle:
        pprint(examiner.available_limits(input_handle))