feat: script to download and filter Coriell WGS BAMs
This commit is contained in:
46
filter_wgs_bams.py
Normal file
46
filter_wgs_bams.py
Normal file
@@ -0,0 +1,46 @@
|
||||
import os
|
||||
import logging
|
||||
from subprocess import run
|
||||
from time import sleep
|
||||
|
||||
# BED file passed to `samtools view -L`; only reads overlapping its regions
# are kept in the filtered BAMs.
FILTER_BED = "brca1_palb2_chek2_atm_genes_ensembl115.bed"

# Timestamped, level-tagged log lines at INFO verbosity and above.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)
|
||||
|
||||
|
||||
def main(ftp_paths: list[str]) -> None:
    """Download each BAM, filter it to the FILTER_BED regions, delete the original.

    For every URL in *ftp_paths* the file is fetched into the current working
    directory with ``wget -nc``, filtered with ``samtools view -L FILTER_BED``
    into ``<prefix>.filtered.bam`` (``<prefix>`` being the file name up to its
    first "."), and the unfiltered BAM is then removed.

    Args:
        ftp_paths: URLs of the BAM files to process. The last "/"-separated
            component of each URL is used as the local file name.

    Raises:
        subprocess.CalledProcessError: If ``wget`` or ``samtools`` exits with
            a non-zero status. The unfiltered BAM is left on disk in that case
            so a failed filtering step never destroys the download.
        FileNotFoundError: If the ``wget`` or ``samtools`` executable cannot
            be found.
    """
    logging.info("Script started")
    n_ftps = len(ftp_paths)
    logging.info(f"{n_ftps} BAMs to be processed")
    logging.info(f"FTP paths: {ftp_paths}")

    for idx, ftp in enumerate(ftp_paths, start=1):
        fname = ftp.split("/")[-1]
        # Built outside the command so no quote is nested inside an f-string
        # that reuses it (a SyntaxError before Python 3.12).
        filtered_name = f"{fname.split('.')[0]}.filtered.bam"

        # download
        logging.info(f"({idx}/{n_ftps}) Downloading: '{ftp}'")
        # Argument list instead of a shell string: the URL is passed verbatim
        # and never interpreted by a shell. check=True aborts on failure.
        run(["wget", "-nc", ftp], check=True)
        logging.info("Download completed...")

        # filtering
        logging.info(f"Filtering '{fname}'")
        run(
            [
                "samtools", "view", "--bam", "-h",
                "-L", FILTER_BED,
                fname,
                "-o", filtered_name,
            ],
            check=True,
        )
        logging.info("Filtering completed...")

        # Only reached when filtering succeeded, so the large BAM is safe to drop.
        logging.info(f"Removing large BAM {fname}")
        os.remove(fname)
        logging.info(f"({idx}/{n_ftps}) '{fname}' completed!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # BAM files on ftp.sra.ebi.ac.uk to download and filter in this run.
    bam_urls = [
        "ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR147/ERR14793899/NA14626.bam",
        "ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR147/ERR14793893/HG00343_20221005Run.bam",
        "ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR147/ERR14793896/HG03694.bam",
    ]
    main(bam_urls)
|
||||
Reference in New Issue
Block a user