"""Download BAM files over FTP and reduce each one to the regions in FILTER_BED.

For every URL the script runs ``wget -nc`` to fetch the BAM into the current
working directory, runs ``samtools view -L FILTER_BED`` to write
``<name>.filtered.bam`` next to it, and then deletes the downloaded BAM.

Requires ``wget`` and ``samtools`` on PATH and FILTER_BED in the working
directory.
"""
import os
import logging
from subprocess import run
from time import sleep  # not used in the visible code; kept for compatibility

# BED file handed to `samtools view -L`; resolved relative to the working dir.
FILTER_BED = "brca1_palb2_chek2_atm_genes_ensembl115.bed"

logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s", level=logging.INFO
)


def _bam_name_from_url(ftp: str) -> str:
    """Return the last path component of *ftp*, i.e. the local file name."""
    fname = ftp.rsplit("/", 1)[-1]
    if not fname:
        raise ValueError(f"Cannot derive a file name from URL: '{ftp}'")
    return fname


def _filtered_bam_name(fname: str) -> str:
    """Return the output name: *fname* minus its last extension + '.filtered.bam'."""
    # Strip only the final extension so that 'a.1.bam' and 'a.2.bam' do not
    # both collapse to 'a.filtered.bam'.
    return f"{os.path.splitext(fname)[0]}.filtered.bam"


def main(ftp_paths: list[str]) -> None:
    """Download, filter and clean up every BAM listed in *ftp_paths*.

    Args:
        ftp_paths: URLs of the BAM files to process, handled in order.

    Raises:
        ValueError: If a URL has no file name component (e.g. ends in '/').
            All URLs are checked before the first download starts.
        subprocess.CalledProcessError: If ``wget`` or ``samtools`` exits with
            a non-zero status. The downloaded BAM is left on disk in that case.
        FileNotFoundError: If ``wget`` or ``samtools`` is not installed.
    """
    n_ftps = len(ftp_paths)
    logging.info("Script started")
    logging.info(f"{n_ftps} BAMs to be processed")
    logging.info(f"FTP paths: {ftp_paths}")

    # Validate up front: a malformed URL late in the list should not be
    # discovered only after earlier (large) downloads have finished.
    fnames = [_bam_name_from_url(ftp) for ftp in ftp_paths]

    for idx, (ftp, fname) in enumerate(zip(ftp_paths, fnames), start=1):
        # download
        logging.info(f"({idx}/{n_ftps}) Downloading: '{ftp}'")
        # argv list instead of a shell string: no quoting issues, and
        # check=True stops the pipeline when the download fails.
        run(["wget", "-nc", ftp], check=True)
        logging.info("Download completed...")

        # filtering
        logging.info(f"Filtering '{fname}'")
        filter_cmd = [
            "samtools", "view", "--bam", "-h",
            "-L", FILTER_BED,
            fname,
            "-o", _filtered_bam_name(fname),
        ]
        # check=True is what protects the source BAM: it is only removed
        # below once samtools has reported success.
        run(filter_cmd, check=True)
        logging.info("Filtering completed...")

        logging.info(f"Removing large BAM {fname}")
        os.remove(fname)
        logging.info(f"({idx}/{n_ftps}) '{fname}' completed!")


if __name__ == "__main__":
    ftp_paths = [
        "ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR147/ERR14793899/NA14626.bam",
        "ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR147/ERR14793893/HG00343_20221005Run.bam",
        "ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR147/ERR14793896/HG03694.bam",
    ]
    main(ftp_paths)