feat: script to download and filter Coriell WGS BAMs

This commit is contained in:
2025-09-08 20:44:44 +02:00
parent 66d97b290b
commit 485c1ea0b5

46
filter_wgs_bams.py Normal file
View File

@@ -0,0 +1,46 @@
import os
import logging
from subprocess import run
from time import sleep
# BED file handed to `samtools view -L` in main(); judging by its name it
# lists the BRCA1/PALB2/CHEK2/ATM gene regions (Ensembl 115) -- confirm.
FILTER_BED = "brca1_palb2_chek2_atm_genes_ensembl115.bed"

# Root logger: timestamped, level-tagged messages at INFO and above.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)
def main(ftp_paths: list[str]) -> None:
    """Download each BAM, filter it against FILTER_BED, then delete the original.

    For every URL in *ftp_paths* the file is fetched into the current working
    directory with ``wget -nc``, filtered with ``samtools view -L FILTER_BED``
    into ``<name up to the first dot>.filtered.bam``, and the downloaded BAM
    is removed afterwards.

    Args:
        ftp_paths: URLs of the BAM files to process, handled in order.

    Raises:
        RuntimeError: if wget or samtools exits with a non-zero status.
            Processing stops at that file and the downloaded BAM is NOT
            deleted, so a failed filtering step never destroys its input.
        OSError: if wget/samtools cannot be started or the downloaded file
            cannot be removed.
    """
    n_ftps = len(ftp_paths)
    logging.info("Script started")
    logging.info(f"{n_ftps} BAMs to be processed")
    logging.info(f"FTP paths: {ftp_paths}")

    for idx, ftp in enumerate(ftp_paths, start=1):
        # wget stores the file under the last URL path component.
        fname = ftp.split("/")[-1]
        # Computed outside the f-string: reusing the enclosing quote inside a
        # replacement field is a SyntaxError before Python 3.12.
        filtered_name = fname.split(".")[0] + ".filtered.bam"

        # download
        logging.info(f"({idx}/{n_ftps}) Downloading: '{ftp}'")
        # Argument list instead of a shell string: the URL reaches wget
        # verbatim, with no shell word-splitting or metacharacter expansion.
        download = run(["wget", "-nc", ftp])
        if download.returncode != 0:
            raise RuntimeError(
                f"wget exited with status {download.returncode} for '{ftp}'; "
                f"delete any partial '{fname}' before re-running, because "
                "'wget -nc' will not re-fetch an existing file"
            )
        logging.info("Download completed...")

        # filtering
        logging.info(f"Filtering '{fname}'")
        # All options precede the input file so the call does not depend on
        # getopt permuting arguments.
        filtering = run(
            [
                "samtools",
                "view",
                "--bam",
                "-h",
                "-L",
                FILTER_BED,
                "-o",
                filtered_name,
                fname,
            ]
        )
        if filtering.returncode != 0:
            raise RuntimeError(
                f"samtools exited with status {filtering.returncode} while "
                f"filtering '{fname}'; the downloaded BAM was kept"
            )
        logging.info("Filtering completed...")

        # Only reached when filtering succeeded, so the input is safe to drop.
        logging.info(f"Removing large BAM {fname}")
        os.remove(fname)
        logging.info(f"({idx}/{n_ftps}) '{fname}' completed!")
if __name__ == "__main__":
    # Script entry point: BAM files on the EBI SRA FTP server, processed
    # one after another in the order listed.
    main(
        [
            "ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR147/ERR14793899/NA14626.bam",
            "ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR147/ERR14793893/HG00343_20221005Run.bam",
            "ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR147/ERR14793896/HG03694.bam",
        ]
    )