From 2581de3dc76321a8dcbe462cfaac25c107318d86 Mon Sep 17 00:00:00 2001 From: Darren Wight Date: Wed, 24 Sep 2025 18:16:08 +0200 Subject: [PATCH] feat: scripts to get coverage across panel added --- src/get_panel_coverage.py | 86 +++++++++++++++++++++++++++++++++++++++ src/parse_cov.py | 11 +++++ 2 files changed, 97 insertions(+) create mode 100644 src/get_panel_coverage.py create mode 100644 src/parse_cov.py diff --git a/src/get_panel_coverage.py b/src/get_panel_coverage.py new file mode 100644 index 0000000..2f651c4 --- /dev/null +++ b/src/get_panel_coverage.py @@ -0,0 +1,86 @@ +import os +from subprocess import run +import logging +import sys + +logging.basicConfig( + format="%(asctime)s [%(levelname)s] %(message)s", + level=logging.INFO, + stream=sys.stderr, +) + + +def main(s3_paths: list[str], bed_path: str, ref_path: str, cli_bin: str) -> None: + logging.info("Script started...") + logging.info(f"Using reference: {ref_path}") + logging.info(f"Using bed file: {bed_path}") + logging.info(f"Number of BAM/CRAMs to process: {len(s3_paths)}") + logging.info(f"BAM/CRAMs to process: {s3_paths}") + + for idx, file in enumerate(s3_paths): + name = file.split("/")[-1] + file_num = f"{idx+1}/{len(s3_paths)}" + + logging.info(f"({file_num}) Downloading file: {name}") + down_cmd = ( + f"{cli_bin} s3 cp {file} . --profile=se" + if cli_bin == "aws" + else f"{cli_bin} get {file} ." + ) + run(down_cmd, shell=True, capture_output=False) + + sort_name = f"{name.split('.')[0]}.sorted.cram" + sort_cmd = f"samtools sort -@ 18 {name} --reference {ref_path} > {sort_name}" + run(sort_cmd, shell=True, capture_output=False) + + logging.info(f"({file_num}) Getting coverage for '{name}'") + cov_cmd = ( + f"bedtools coverage -d -sorted -a {bed_path} -b {sort_name} | " + f"python parse_cov.py > {name.split(".")[0]}_cov.json" + ) + run(cov_cmd, shell=True, capture_output=False) + os.remove(name) + os.remove(sort_name) + + logging.info(f"({file_num}) Processing of '{name}' completed!") + + +if __name__ == "__main__": + s3_paths = [ + # "s3://serenomica-pipeline-archive/se1-prd-2.1.1/LB-25-0001-pa-21Mar2025_S33.collapsed.cram", + "s3://serenomica-pipeline-archive/se1-prd-2.1.1/LB-25-0002-pa-21Mar2025_S34.collapsed.cram", + "s3://serenomica-pipeline-archive/se1-prd-2.1.1/LB-25-0003-pa-21Mar2025_S35.collapsed.cram", + "s3://serenomica-pipeline-archive/se1-prd-2.1.1/LB-25-0004-pa-21Mar2025_S36.collapsed.cram", + "s3://serenomica-pipeline-archive/se1-prd-2.1.1/LB-25-0005-pa-21Mar2025_S37.collapsed.cram", + "s3://serenomica-pipeline-archive/se1-prd-2.1.1/LB-25-0006-pa-21Mar2025_S38.collapsed.cram", + "s3://serenomica-pipeline-archive/se1-prd-2.1.1/LB-SeraPlasma-0d125pc-1-28Mar2025_S41.collapsed.cram", + "s3://serenomica-pipeline-archive/se1-prd-2.1.1/LB-SeraPlasma-0d125pc-2-28Mar2025_S42.collapsed.cram", + "s3://serenomica-pipeline-archive/se1-prd-2.1.1/LB-SeraPlasma-0d25pc-1-28Mar2025_S43.collapsed.cram", + "s3://serenomica-pipeline-archive/se1-prd-2.1.1/LB-SeraPlasma-0d25pc-2-28Mar2025_S44.collapsed.cram", + "s3://serenomica-pipeline-archive/se1-prd-2.1.1/LB-SeraPlasma-0d5pc-1-28Mar2025_S45.collapsed.cram", + "s3://serenomica-pipeline-archive/se1-prd-2.1.1/LB-SeraPlasma-0d5pc-2-28Mar2025_S46.collapsed.cram", + "s3://serenomica-pipeline-archive/se1-prd-2.1.1/LB-SeraPlasma-0pc-1-28Mar2025_S39.collapsed.cram", + "s3://serenomica-pipeline-archive/se1-prd-2.1.1/LB-SeraPlasma-0pc-2-28Mar2025_S40.collapsed.cram", + "s3://serenomica-pipeline-archive/se1-prd-2.1.1/LB-SeraPlasma-1pc-1-28Mar2025_S48.collapsed.cram", + "s3://serenomica-pipeline-archive/se1-prd-2.1.1/LB-SeraPlasma-1pc-2-28Mar2025_S47.collapsed.cram", + "s3://serenomica-pipeline-archive/se2-lb-1/LB-25-0007-pa-1-23May2025_S20.collapsed.cram", + "s3://serenomica-pipeline-archive/se2-lb-1/LB-25-0007-pa-2-23May2025_S21.collapsed.cram", + "s3://serenomica-pipeline-archive/se2-lb-1/LB-25-0008-pa-1-23May2025_S22.collapsed.cram", + "s3://serenomica-pipeline-archive/se2-lb-1/LB-25-0008-pa-2-23May2025_S23.collapsed.cram", + "s3://serenomica-pipeline-archive/se2-lb-1/LB-25-0009-pa-1-23May2025_S24.collapsed.cram", + "s3://serenomica-pipeline-archive/se2-lb-1/LB-25-0009-pa-2-23May2025_S25.collapsed.cram", + "s3://serenomica-pipeline-archive/se2-lb-1/LB-25-0010-pa-1-23May2025_S26.collapsed.cram", + "s3://serenomica-pipeline-archive/se2-lb-1/LB-25-0010-pa-2-23May2025_S27.collapsed.cram", + "s3://serenomica-pipeline-archive/se2-lb-1/LB-25-0011-pa-1-23May2025_S32.collapsed.cram", + "s3://serenomica-pipeline-archive/se2-lb-1/LB-25-0012-pa-1-23May2025_S28.collapsed.cram", + "s3://serenomica-pipeline-archive/se2-lb-1/LB-25-0012-pa-2-23May2025_S29.collapsed.cram", + "s3://serenomica-pipeline-archive/se2-lb-1/LB-25-0032-pa-1-23May2025_S30.collapsed.cram", + "s3://serenomica-pipeline-archive/se2-lb-1/LB-25-0032-pa-2-23May2025_S31.collapsed.cram", + "s3://serenomica-pipeline-archive/se2-lb-1/LB-SeraCare-0d25pc-1-21Mar2025_S17.collapsed.cram", + "s3://serenomica-pipeline-archive/se2-lb-1/LB-SeraCare-0d25pc-2-21Mar2025_S18.collapsed.cram", + "s3://serenomica-pipeline-archive/se2-lb-1/LB-SeraPlasma-0pc-3-28Mar2025_S19.collapsed.cram", + ] + # "/home/darren/Documents/4_data/3_internal/1_panels/TWIST/twist_LB_probes_plus_30bp_merged.sorted.bed" + bed_path = "/home/darren/Documents/4_data/3_internal/1_panels/TWIST/twist_LB_probes_plus_30bp_merged.sorted.bed" + ref_path = "/home/darren/Documents/4_data/1_genomes/human/hg38/Homo_sapiens_assembly38.fasta" + main(s3_paths, bed_path, ref_path, cli_bin="aws") diff --git a/src/parse_cov.py b/src/parse_cov.py new file mode 100644 index 0000000..b477208 --- /dev/null +++ b/src/parse_cov.py @@ -0,0 +1,11 @@ +import sys +from collections import defaultdict +import json + +data = defaultdict(lambda: []) + +for line in sys.stdin: + chrom, start, stop, _, depth = line.rstrip().split("\t") + data[f"{chrom}_{start}_{stop}"].append(depth) + +print(json.dumps(dict(data)))