Compare commits

...

5 Commits

6 changed files with 583 additions and 2 deletions

1
.gitignore vendored
View File

@@ -188,3 +188,4 @@ cython_debug/
# PyPI configuration file
.pypirc
data/

View File

@@ -0,0 +1,156 @@
import marimo
__generated_with = "0.16.4"
app = marimo.App(width="medium")
@app.cell
def _():
import os
import json
import statistics
import marimo as mo
import pandas as pd
import plotly.express as px
return json, mo, os, pd, statistics
@app.cell
def _(mo):
mo.md(
r"""
# EDA of LB Assay Coverage
Data was generated by `get_panel_coverage.py` that uses the LB panels LB covered bed (with flankning sequence added, i.e. the variant calling bed) with the deduplicated bam/cram.
The output json from this analysis for each sample has the following form🥇
```json
{
"chr<#>_<start>_<stop>": [
"depth_pos1",
"depth_pos2",
"depth_pos3",
...
"depth_posN"
]
}
```
Analysis Plan:
- Poorly covered regions (i.e. bad probes/challenging areas to align)
- Overly covered regions (i.e. potential areas that are challenging for aligners or have low complexity)
- Comparison of Serenomica and Quantgene in overlap areas
"""
)
return
@app.cell
def _():
# data
se_path = "../data/coverage_serenomica_QG/se1_se2/"
qg_path = "../data/coverage_serenomica_QG/QG_historical/"
return qg_path, se_path
@app.cell
def _(json, os):
def parse_data(dpath):
files = [f for f in os.listdir(dpath) if f.endswith(".json")]
parsed_data, mean_data = {}
for file in files:
name = file.split("_cov")[0]
with open(os.path.join(dpath, file), "r") as handle:
parsed_data[name] = json.load(handle)
return parsed_data
return (parse_data,)
@app.cell
def _(parse_data, qg_path, se_path):
se_data = parse_data(se_path)
qg_data = parse_data(qg_path)
return qg_data, se_data
@app.cell
def _(statistics):
def summerize_list(lst):
vals = list(map(int, lst))
return (
statistics.mean(vals),
statistics.median(vals),
statistics.stdev(vals),
min(vals),
max(vals),
sum(vals),
len(vals),
)
def get_summary_data(sample_data):
out_data = {}
sample_sum, sample_pos_count = 0, 0
for region, depths in sample_data.items():
(mean, median, std, min_val, max_val, sum_vals, count) = (
summerize_list(depths)
)
out_data[region] = {
"min": min_val,
"max": max_val,
"mean": mean,
"median": median,
"std": std,
}
sample_sum += sum_vals
sample_pos_count += count
out_data["overall"] = {"mean": sample_sum / sample_pos_count}
return out_data
return (get_summary_data,)
@app.cell
def _(get_summary_data, qg_data, se_data):
se_summary_data = {}
for se_sample, se_sample_data in se_data.items():
se_summary_data[se_sample] = get_summary_data(se_sample_data)
qg_summary_data = {}
for qg_sample, qg_sample_data in qg_data.items():
qg_summary_data[qg_sample] = get_summary_data(qg_sample_data)
return (se_summary_data,)
@app.cell
def _():
return
@app.cell
def _(pd):
def extract_stat_as_df(data_dict, access_key):
tmp_dict = {}
for name, sample_data in data_dict.items():
tmp_dict[name] = {
region: data[access_key]
for region, data in sample_data.items()
if region != "overall"
}
return pd.DataFrame.from_dict(tmp_dict, orient="index")
return (extract_stat_as_df,)
@app.cell
def _(extract_stat_as_df, se_summary_data):
extract_stat_as_df(se_summary_data, access_key="std")
return
@app.cell
def _():
return
if __name__ == "__main__":
app.run()

View File

@@ -1,6 +1,6 @@
import marimo
__generated_with = "0.14.16"
__generated_with = "0.16.4"
app = marimo.App(width="medium")
@@ -168,7 +168,7 @@ def _(mo):
def _(os, pd, qc_hist_dir):
# QG historical data
qg_hist_stats = pd.read_csv(
os.path.join(qc_hist_dir, "historical_data_LB.csv"), index_col=0
os.path.join(qc_hist_dir, "historical_data_LB-QG.csv"), index_col=0
)
qg_hist_stats.columns = [i.replace("|", " / ") for i in qg_hist_stats.columns]
qg_hist_stats = qg_hist_stats.drop(["batch"], axis=1)
@@ -840,6 +840,270 @@ def _(sc, variants_high_cov_ids, variants_lost_cov_ids, variants_low_cov_ids):
return
@app.cell
def _(mo):
mo.md(r"""# (3) Stats and Metadata""")
return
@app.cell
def _(pd):
meta = pd.read_excel(
"/home/darren/Documents/4_data/3_internal/2_lb/Metadata_LB_20250924.xlsx",
index_col="fastq_id",
)
meta
return (meta,)
@app.cell
def _(clinicals, meta, pd, stats):
# merge stats and metadata and remove seracares
stats_meta = pd.concat([stats, meta], axis=1)
stats_meta = stats_meta.loc[clinicals, :]
stats_meta = stats_meta.reset_index()
stats_meta.head()
return (stats_meta,)
@app.cell
def _(pd):
meta_qg = pd.read_excel(
"/home/darren/Documents/4_data/3_internal/2_lb/QC/lb2_metadata.xlsx",
index_col="fastq_id",
)
meta_qg
return (meta_qg,)
@app.cell
def _(meta_qg, pd, qg_hist_stats):
# merge stats and metadata and remove seracares
stats_meta_qg = pd.concat([qg_hist_stats, meta_qg], axis=1)
stats_meta_qg = stats_meta_qg.loc[
[
i
for i in stats_meta_qg.index
if "twist" not in i.lower() and "sera" not in i.lower()
],
:,
]
stats_meta_qg = stats_meta_qg.reset_index()
stats_meta_qg.head()
return (stats_meta_qg,)
@app.cell
def _(mo, stats_meta):
var1 = mo.ui.dropdown(options=stats_meta.columns, label="Variable 1")
var2 = mo.ui.dropdown(options=stats_meta.columns, label="Variable 2")
mo.vstack([var1, var2])
return var1, var2
@app.cell
def _(px, stats_meta, var1, var2):
# note that all SE1 clinical samples are older QG extracted cfDNAs
fig5 = px.scatter(
data_frame=stats_meta,
x=var1.value,
y=var2.value,
color="Flow cell #",
width=700,
height=500,
template="simple_white",
hover_data=["index"],
color_discrete_sequence=px.colors.qualitative.Dark2,
category_orders={"Flow cell #": ["SE1", "SE2"]},
trendline="ols",
)
fig5.update_traces(
marker=dict(size=9, line=dict(width=1, color="DarkSlateGrey")),
selector=dict(mode="markers"),
)
fig5.add_vline(
x=30,
line_width=3,
line_dash="dot",
annotation_text="30ng",
annotation_position="top left",
)
fig5.show()
return
@app.cell
def _(stats_meta):
stats_meta["ng_group"] = [
"< 30ng" if i < 30 else "> 30ng" for i in stats_meta["Input DNA used (ng)"]
]
return
@app.cell
def _(pd, stats_meta):
stats_meta_melt = pd.melt(
stats_meta, id_vars=["index", "sample_type", "ng_group", "Flow cell #"]
)
stats_meta_melt.head()
return (stats_meta_melt,)
@app.cell
def _(mo, stats_meta_melt):
box_vars = mo.ui.dropdown(
options=stats_meta_melt["variable"].unique(), label="Variables to plot"
)
mo.hstack([box_vars])
return (box_vars,)
@app.cell
def _(box_vars, px, stats_meta_melt):
fig6 = px.box(
data_frame=stats_meta_melt.loc[
(stats_meta_melt["variable"] == box_vars.value)
& (stats_meta_melt["Flow cell #"] == "SE2")
],
color="ng_group",
y="value",
x="variable",
template="simple_white",
points="all",
labels={
"ng_group": "cfDNA Input for Library",
"value": box_vars.value,
"variable": "",
},
color_discrete_sequence=px.colors.qualitative.Dark2,
category_orders={"ng_group": ["< 30ng", "> 30ng"]},
hover_data=["index"],
width=800,
)
fig6.update_traces(
marker=dict(size=10, line=dict(width=1, color="DarkSlateGrey")),
selector=dict(type="points"),
)
fig6.update_xaxes(ticks="", showticklabels=False)
fig6.update_yaxes(showgrid=True, tickfont_size=14)
fig6.update_legends(font_size=14)
for trace in fig6.select_traces():
trace.marker.update(size=10, line=dict(width=1, color="DarkSlateGrey"))
fig6.show()
return
@app.cell
def _(stats_meta):
stats_meta["ng_group"].value_counts()
return
@app.cell
def _(stats_meta_melt):
stats_meta_melt["Flow cell #"].unique()
return
@app.cell
def _(qg_hist_stats):
qg_hist_stats
return
@app.cell
def _(pd, stats_meta, stats_meta_qg):
stats_meta["lab"] = "Serenomica"
stats_meta_qg["lab"] = "Quantgene"
all_stats_meta = pd.concat([stats_meta, stats_meta_qg], axis=0)
all_stats_meta.shape
return (all_stats_meta,)
@app.cell
def _(all_stats_meta):
all_stats_meta["ng_group"] = [
"< 30ng" if i < 30 else "> 30ng"
for i in all_stats_meta["Input DNA used (ng)"]
]
all_stats_meta[["lab", "ng_group"]].value_counts()
return
@app.cell
def _(all_stats_meta, pd):
all_stats_meta_melt = pd.melt(
all_stats_meta,
id_vars=["index", "sample_type", "ng_group", "Flow cell #", "lab"],
)
all_stats_meta_melt
return (all_stats_meta_melt,)
@app.cell
def _(all_stats_meta_melt, mo):
box_vars2 = mo.ui.dropdown(
options=all_stats_meta_melt["variable"].unique(), label="Variables to plot"
)
mo.hstack([box_vars2])
return (box_vars2,)
@app.cell
def _(all_stats_meta_melt, box_vars2, px):
fig7 = px.box(
data_frame=all_stats_meta_melt.loc[
(all_stats_meta_melt["variable"] == box_vars2.value)
],
color="ng_group",
y="value",
x="lab",
template="simple_white",
points="all",
labels={
"ng_group": "cfDNA Input for Library",
"value": box_vars2.value,
"lab": "Laboratory",
},
color_discrete_sequence=px.colors.qualitative.Dark2,
category_orders={"ng_group": ["< 30ng", "> 30ng"]},
hover_data=["index"],
width=800,
)
fig7.update_traces(
marker=dict(size=10, line=dict(width=1, color="DarkSlateGrey")),
selector=dict(type="points"),
)
fig7.update_xaxes(tickfont_size=14)
fig7.update_yaxes(showgrid=True, tickfont_size=14)
fig7.update_legends(font_size=14)
for trace2 in fig7.select_traces():
trace2.marker.update(
size=7, line=dict(width=1, color="DarkSlateGrey"), opacity=0.8
)
fig7.show()
return
@app.cell
def _(all_stats_meta):
all_stats_meta[["lab", "ng_group"]].value_counts()
return
@app.cell
def _(stats_meta):
stats_meta.sort_values("Input DNA used (ng)")[
["index", "Input DNA used (ng)", "Flow cell #"]
]
return
@app.cell
def _():
return

View File

@@ -0,0 +1,9 @@
marimo==0.16.4
matplotlib==3.10.3
matplotlib-inline==0.1.7
matplotlib-venn==1.1.2
openpyxl==3.1.5
pandas==2.3.1
plotly==6.2.0
pyyaml==6.0.3
seaborn==0.13.2

140
src/get_panel_coverage.py Normal file
View File

@@ -0,0 +1,140 @@
import os
from subprocess import run
import logging
import sys
logging.basicConfig(
format="%(asctime)s [%(levelname)s] %(message)s",
level=logging.INFO,
stream=sys.stderr,
)
def main(
s3_paths: list[str], bed_path: str, ref_path: str, cli_bin: str, tars: bool
) -> None:
logging.info("Script started...")
logging.info(f"Using reference: {ref_path}")
logging.info(f"Using bed file: {bed_path}")
logging.info(f"Number of BAM/CRAMs to process: {len(s3_paths)}")
logging.info(f"BAM/CRAMs to process: {s3_paths}")
for idx, file in enumerate(s3_paths):
name = file.split("/")[-1]
file_num = f"{idx+1}/{len(s3_paths)}"
logging.info(f"({file_num}) Downloading file: {name}")
if tars:
down_unpack_cmd = (
f"{cli_bin} s3 cp {file} . --profile=se"
if cli_bin == "aws"
else f"{cli_bin} get {file} . && tar -xf {name} --strip-components=4 --wildcards --no-anchored '*.cram' && rm {name}"
)
run(down_unpack_cmd, shell=True, capture_output=False)
name = name.split(".")[0] + ".collapsed.cram"
else:
down_cmd = (
f"{cli_bin} s3 cp {file} . --profile=se"
if cli_bin == "aws"
else f"{cli_bin} get {file} ."
)
run(down_cmd, shell=True, capture_output=False)
sort_name = f"{name.split('.')[0]}.sorted.cram"
sort_cmd = f"samtools sort -@ 18 {name} --reference {ref_path} > {sort_name}"
run(sort_cmd, shell=True, capture_output=False)
logging.info(f"({file_num}) Getting coverage for '{name}'")
cov_cmd = (
f"bedtools coverage -d -sorted -a {bed_path} -b {sort_name} | "
f"python parse_cov.py > {name.split(".")[0]}_cov.json"
)
run(cov_cmd, shell=True, capture_output=False)
os.remove(name)
os.remove(sort_name)
logging.info(f"({file_num}) Processing of '{name}' completed!")
if __name__ == "__main__":
s3_paths = [
# "s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-23-0087-pa-13Jun2023_S1.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-23-0089-pa-13Jun2023_S2.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-23-0090-pa-13Jun2023_S3.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-23-0091-pa-13Jun2023_S4.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-23-0092-pa-13Jun2023_S5.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-23-0096-pa-13Jun2023_S6.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-23-0098-pa-29Jun2023_S1.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-23-0099-pa-29Jun2023_S2.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-23-0100-pa-29Jun2023_S3.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-23-0103-pa-29Jun2023_S4.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-23-0105-pa-29Jun2023_S5.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-23-0107-pa-29Jun2023_S6.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-23-0112-pa-19Jul2023_S1.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-23-0113-pa-19Jul2023_S2.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-23-0119-pa-19Jul2023_S4.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-23-0122-pa-19Jul2023_S5.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-23-0123-pa-19Jul2023_S6.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-23-0124-pa-19Jul2023_S7.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-23-0125-pa-19Jul2023_S8.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-23-0126-pa-19Jul2023_S9.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-23-0128-pa-19Jul2023_S10.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-23-0129-pa-19Jul2023_S3.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-23-0130-pa-09Aug2023_S1.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-23-0132-pa-09Aug2023_S2.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-23-0133-pa-09Aug2023_S3.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-23-0134-pa-09Aug2023_S4.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-23-0135-pa-09Aug2023_S5.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-23-0136-pa-09Aug2023_S6.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-23-0137-pa-09Aug2023_S7.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-23-0138-pa-30Aug2023_S1.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-23-0140-pa-30Aug2023_S2.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-23-0141-pa-30Aug2023_S3.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-23-0142-pa-30Aug2023_S4.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-23-0144-pa-30Aug2023_S5.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-23-0145-pa-30Aug2023_S6.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-23-0146-pa-30Aug2023_S7.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-23-0147-pa-30Aug2023_S8.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-23-0153-pa-25Oct2023_S1.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-23-0156-pa-25Oct2023_S2.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-23-0159-pa-25Oct2023_S3.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-23-0160-pa-25Oct2023_S4.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-23-0186-pa-14Feb2024_S1.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-24-0001-pa-14Feb2024_S2.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-24-0015-pa-14Feb2024_S3.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-24-0019-pa-14Feb2024_S4.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-24-0023-pa-14Feb2024_S5.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-24-0024-pa-14Feb2024_S6.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-24-0027-pa-14Feb2024_S7.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-24-0037-pa-28Mar2024_S16.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-24-0038-pa-28Mar2024_S17.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-24-0044-pa-13May2024_S2.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-24-0046-pa-13May2024_S3.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-24-0047-pa-13May2024_S4.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-24-0047-pa-23May2024_S1.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-24-0048-pa-13May2024_S5.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-24-0049-pa-13May2024_S6.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-24-0049-pa-23May2024_S2.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-24-0050-pa-13May2024_S7.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-24-0050-pa-23May2024_S3.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-24-0053-pa-13May2024_S8.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-24-0053-pa-23May2024_S4.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-24-0054-pa-13May2024_S9.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-24-0055-pa-13May2024_S10.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-24-0056-pa-13May2024_S11.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-24-0057-pa-13May2024_S12.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-24-0058-pa-13May2024_S13.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-24-0059-pa-23May2024_S5.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-24-0060-pa-23May2024_S6.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-24-0063-pa-23May2024_S7.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-24-0064-pa-23May2024_S8.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-24-0065-pa-23May2024_S9.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-GVA-HC-246-13Jun2023_S10.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-GVA-ME-339-13Jun2023_S9.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-GVA-PA-307-13Jun2023_S8.bams.tar",
"s3://serenomica-pipeline-archive/historical-qg-qc-samples/LB-GVA-PA-308-13Jun2023_S7.bams.tar",
]
# "/home/darren/Documents/4_data/3_internal/1_panels/TWIST/twist_LB_probes_plus_30bp_merged.sorted.bed"
bed_path = "/home/darren/Documents/4_data/3_internal/1_panels/Aglient/LB_3413441_Covered_50ext.sorted.bed"
ref_path = "/home/darren/Documents/4_data/1_genomes/human/hg38/Homo_sapiens_assembly38.fasta"
main(s3_paths, bed_path, ref_path, cli_bin="s3cmd", tars=True)

11
src/parse_cov.py Normal file
View File

@@ -0,0 +1,11 @@
import sys
from collections import defaultdict
import json
data = defaultdict(lambda: [])
for line in sys.stdin:
chrom, start, stop, _, depth = line.rstrip().split("\t")
data[f"{chrom}_{start}_{stop}"].append(depth)
print(json.dumps(dict(data)))