feat: eda analysis continued and samples from QG search added

This commit is contained in:
2025-09-22 17:31:11 +02:00
parent ab15df85ad
commit e03ede9c28
2 changed files with 677 additions and 30 deletions

View File

@@ -0,0 +1,116 @@
import marimo
__generated_with = "0.14.16"
app = marimo.App(width="medium")
@app.cell
def _():
    # Shared imports for the notebook; the returned names are the only ones
    # visible to downstream cells.
    # Fix: `collections.Counter` was imported but never used nor returned, so
    # the dead import is removed.
    import os

    import marimo as mo
    import pandas as pd

    return mo, os, pd
@app.cell
def _(mo):
    # Intro cell: motivation for locating QG samples that still have
    # `.mutations` outputs from the historical QC service.
    mo.md(
        r"""
    # Search for Samples
    Not all QG samples have the `.mutations` files. It would be helpful to have some from those that were part of the historical QC service.
    Samples we have: `../data/qg_avail_dot_mutations.txt`
    """
    )
    return
@app.cell
def _():
    # Curated list of production-quality LB2 pipeline run prefixes on S3.
    # Only samples under these prefixes are considered downstream; the
    # inline comments record why individual plates were excluded.
    # pipeline runs used in the historical data (from repo)
    LB2_PIPELINE_RUNS = [
        "s3://quantgene-portland-lb/20221123-sp-pl34-hs2/lb-pl34-all-dev56/",
        "s3://quantgene-portland-lb/20221202-s1-pl36-hs2/lb-pl36-all-dev56/",
        "s3://quantgene-portland-lb/20221207-s2-pl37-hs2/lb-pl37-all-dev56/",
        "s3://quantgene-portland-lb/20230109-sp-pl41-hs2/lb-pl41-dev56/",
        "s3://quantgene-portland-lb/20230124-sp-pl42-hs2/lb-pl42-dev56/",
        # PL43 skipped, mostly Geneva samples with stats all over the place
        "s3://quantgene-portland-lb/20230210-sp-pl44-hs2/lb-pl44-dev59/",
        "s3://quantgene-portland-lb/20230224-s1-pl47-hs2/lb-pl47-dev59/",
        "s3://quantgene-portland-lb/20230316-s1-pl49-hs2/lb-pl49-dev59/",
        "s3://quantgene-portland-lb/20230330-sp-pl50-hs2/lb-pl50-dev59/",
        "s3://quantgene-portland-lb/20230417-s1-pl51-hs2/lb-pl51-dev59/",
        "s3://quantgene-portland-lb/20230512-sp-pl52-hs2/lb-pl52-prd-2-1-0/",
        "s3://quantgene-portland-lb/20230602-sp-pl53-hs2/lb-pl53-prd-2-1-1/",
        "s3://quantgene-portland-lb/20230615-sp-pl54-hs2/lb-pl54-prd-2-1-1/",
        "s3://quantgene-portland-lb/20230630-sp-pl55-hs2/lb-pl55-prd-2-1-1/",
        "s3://quantgene-portland-lb/20230721-sp-pl56-hs2/lb-pl56-prd212/",
        "s3://quantgene-portland-lb/20230811-sp-pl57-hs2/lb-pl57-prd212/",
        "s3://quantgene-portland-lb/20230901-sp-pl58-hs2/lb-pl58-prd212/",
        "s3://quantgene-portland-lb/20231027-sp-pl59-hs2/lb-pl59-prd212/",
        # PL60 skipped due to low amount of input reads in all samples
        "s3://quantgene-portland-lb/20240216-sp-pl61-hs2/lb-pl61-prd212/",
        # PL62 skipped due to low amount of input reads and insert size issues
        "s3://quantgene-portland-lb/20240516-s1-pl63-hs2/lb-pl63-prd212/",
        "s3://quantgene-portland-lb/20240529-sp-pl64-hs2/lb-pl64-prd212/",
    ]
    return (LB2_PIPELINE_RUNS,)
@app.cell
def _(pd):
    # Historical QC samples: the index of the historical LB dataset holds
    # the sample IDs (one row per sample measurement, so IDs can repeat).
    # hist samples
    qg_hist_data = pd.read_csv(
        "/home/darren/Documents/2_repos/serenomica/flowcell_qc/historical_data/historical_data_LB.csv",
        index_col=0,
    )
    # Fix: sorted() makes the de-duplicated ID list deterministic across runs;
    # plain list(set(...)) order varies with hash randomization.
    qc_samples = sorted(set(qg_hist_data.index))
    qc_samples
    return (qc_samples,)
@app.cell
def _(LB2_PIPELINE_RUNS, os, qc_samples):
    # Walk the list of available `.mutations` S3 paths and keep only the ones
    # belonging to curated production pipeline runs; of those, record which
    # samples are also part of the historical QC set.
    _run_set = set(LB2_PIPELINE_RUNS)  # O(1) membership in the loop below
    _qc_sample_set = set(qc_samples)
    with open("../data/qg_avail_dot_mutations.txt", "r") as handle:
        original, avail_samples, needed_samples = [], [], []
        for line in handle:
            fpath, samp = os.path.split(line.rstrip())
            fpath = str(fpath).replace("-lb//", "-lb/") + "/"
            # BUG FIX: str.strip(".mutations") removes any of the characters
            # ".mutations" from BOTH ends, corrupting sample names that start
            # or end with one of those characters; removesuffix() drops the
            # literal extension only.
            samp = samp.removesuffix(".mutations")
            original.append(line.rstrip())
            # skip non-prod versions
            if fpath not in _run_set:
                continue
            avail_samples.append(samp)
            if samp in _qc_sample_set:
                needed_samples.append(line.rstrip())
    print(
        f"Number of samples available from runs = {(n_avail := len(avail_samples))}"
    )
    print(f"Number of historical QC samples = {(n_qc := len(qc_samples))}")
    print(
        f"Number of historical QC samples available = {(n_needed := len(needed_samples))} ({round((n_needed / n_qc) * 100, 2)}%)"
    )
    return (needed_samples,)
@app.cell
def _(needed_samples):
    # Persist the matched sample paths to disk, one entry per line.
    with open("../data/qg_needed.txt", "w") as whandle:
        whandle.writelines(f"{entry}\n" for entry in needed_samples)
    return
@app.cell
def _():
    # Intentionally empty scratch cell (marimo placeholder).
    return
# Entry point when the notebook file is executed as a plain script.
if __name__ == "__main__":
    app.run()

View File

@@ -36,6 +36,12 @@ def _(mo):
return
@app.cell
def _(mo):
mo.md(r"""## (1) Preprocess Data""")
return
@app.cell
def _():
se1_dir = "/home/darren/Documents/4_data/3_internal/2_lb/se1-prd-2.1.1/"
@@ -44,7 +50,9 @@ def _():
qc_hist_dir = (
"/home/darren/Documents/2_repos/serenomica/flowcell_qc/historical_data/"
)
return qc_hist_dir, se1_dir, se2_dir
qc_hist_mutations_dir = "/home/darren/Documents/4_data/3_internal/2_lb/qg_historical_qc_service/dot_mutations/"
sc_path = "/home/darren/Documents/2_repos/serenomica/supercolumn/panel_v2/supercolumn_v2.csv"
return qc_hist_dir, qc_hist_mutations_dir, sc_path, se1_dir, se2_dir
@app.cell
@@ -74,6 +82,12 @@ def _(return_paths, se1_dir, se2_dir):
return mutations_fpaths, stats_fpaths
@app.cell
def _(mo):
mo.md(r"""### (1.1) Stats""")
return
@app.cell
def _(pd, stats_fpaths, yaml):
stats = pd.DataFrame()
@@ -101,31 +115,52 @@ def _(pd, stats_fpaths, yaml):
@app.cell
def _(mutations_fpaths, pd):
cols = []
mutations_vaf, mutations_counts, mutations_depth = (
pd.DataFrame(),
pd.DataFrame(),
pd.DataFrame(),
)
for mf in mutations_fpaths:
tmp = pd.read_csv(mf, sep="\t", header=None)
tmp["mutation_id"] = tmp.apply(
lambda x: "_".join([str(x[0]), str(x[1]), str(x[2])]), axis=1
def _(mo):
mo.md(r"""### (1.2) .mutations""")
return
@app.cell
def _(pd):
def process_mutations_files(mutation_file_paths):
cols = []
vaf, counts, depth = (
pd.DataFrame(),
pd.DataFrame(),
pd.DataFrame(),
)
tmp.set_index("mutation_id", inplace=True)
for mf in mutation_file_paths:
tmp = pd.read_csv(mf, sep="\t", header=None)
tmp["mutation_id"] = tmp.apply(
lambda x: "_".join([str(x[0]), str(x[1]), str(x[2])]), axis=1
)
tmp.set_index("mutation_id", inplace=True)
cols.append(mf.split("/")[-1].strip(".mutations"))
mutations_vaf = pd.concat([mutations_vaf, tmp[3]], axis=1)
mutations_counts = pd.concat([mutations_counts, tmp[4]], axis=1)
mutations_depth = pd.concat([mutations_depth, tmp[5]], axis=1)
cols.append(mf.split("/")[-1].strip(".mutations"))
vaf = pd.concat([vaf, tmp[3]], axis=1)
counts = pd.concat([counts, tmp[4]], axis=1)
depth = pd.concat([depth, tmp[5]], axis=1)
mutations_vaf.columns = cols
mutations_counts.columns = cols
mutations_depth.columns = cols
vaf.columns = cols
counts.columns = cols
depth.columns = cols
return vaf, counts, depth
return (process_mutations_files,)
@app.cell
def _(mutations_fpaths, process_mutations_files):
mutations_vaf, mutations_counts, mutations_depth = process_mutations_files(
mutations_fpaths
)
mutations_vaf
return (mutations_vaf,)
return mutations_depth, mutations_vaf
@app.cell
def _(mo):
    # Sub-section header: stats from the QG historical QC service.
    mo.md(r"""### (1.3) Stats QG Historical""")
    return
@app.cell
@@ -156,7 +191,39 @@ def _(pd, qg_hist_stats, stats):
@app.cell
def _(mo):
mo.md(r"""## (1) Stats""")
mo.md(r"""### (1.4) .mutations QG Historical""")
return
@app.cell
def _(process_mutations_files, qc_hist_mutations_dir, return_paths):
    # Load the QG historical-service `.mutations` files into VAF / count /
    # depth tables; the count table is parsed but not needed downstream.
    _hist_paths = return_paths(".mutations", qc_hist_mutations_dir)
    qg_hist_mut_vaf, qg_hist_mut_count, qg_hist_mut_depth = (
        process_mutations_files(_hist_paths)
    )
    qg_hist_mut_vaf
    return qg_hist_mut_depth, qg_hist_mut_vaf
@app.cell
def _(mutations_depth, mutations_vaf, pd, qg_hist_mut_depth, qg_hist_mut_vaf):
    # Sanity check: both VAF tables must describe the same variant set (the
    # row index), then the sample columns are combined side by side.
    # (The original comment said "columns match", but the assert checks the
    # index — i.e. the mutation IDs, not the sample columns.)
    assert set(qg_hist_mut_vaf.index) == set(mutations_vaf.index)
    all_vaf = pd.concat([mutations_vaf, qg_hist_mut_vaf], axis=1)
    all_depth = pd.concat([mutations_depth, qg_hist_mut_depth], axis=1)
    return all_depth, all_vaf
@app.cell
def _(qg_hist_mut_vaf):
    # Column labels identify which samples came from the QG historical
    # `.mutations` files; used downstream to tag sample provenance.
    qg_samples_dot_mutations = qg_hist_mut_vaf.columns
    return (qg_samples_dot_mutations,)
@app.cell
def _(mo):
    # Section header: analysis of the stats tables.
    mo.md(r"""## (1) Stats Analysis""")
    return
@@ -197,13 +264,13 @@ def _(multiselect, px, stats_melt):
def _(mo):
mo.md(
r"""
## (2) .mutations Files
## (2) .mutations Analysis
Analysis ideas💡
- Always/majority called - potentially suspicious
- Never called
- High VAF targets
- Always/majority called - potentially suspicious
- Never called
- High VAF targets
- Poorly covered targets
- Overly covered targets
"""
@@ -211,17 +278,481 @@ def _(mo):
return
@app.cell
def _(pd, sc_path):
    # Supercolumn (panel annotation) table; construct the same
    # "chr<chrom>_<position>_<ref>/<alt>" key used by the `.mutations` tables
    # so rows can be joined on mutation_id.
    sc = pd.read_csv(sc_path, sep=";")
    _chrom = sc["chrom"].astype(str)
    _pos = sc["position"].astype(str)
    _ref = sc["ref"].astype(str)
    _alt = sc["alt"].astype(str)
    sc["mutation_id"] = "chr" + _chrom + "_" + _pos + "_" + _ref + "/" + _alt
    return (sc,)
@app.cell
def _(sc):
    # Collect the IDs of variants flagged "Yes" in the blacklist column.
    _is_black = sc["blacklisted"] == "Yes"
    black_mut_ids = sc.loc[_is_black, "mutation_id"].tolist()
    print(f"Number of blacklisted variants = {len(black_mut_ids)}")
    return (black_mut_ids,)
@app.cell
def _(mutations_vaf):
((mutations_vaf == 0).sum(axis=1) / len(mutations_vaf.columns)).sort_values(
ascending=False
seracares = [i for i in mutations_vaf.columns if "Sera" in i]
clinicals = [i for i in mutations_vaf.columns if i not in seracares]
print(f"Number of seracares = {len(seracares)}")
print(f"Number of volunteers = {len(clinicals)}")
print("Both numbers include replicates!")
return (clinicals,)
@app.cell
def _(mo):
    # Sub-section header: call-frequency analysis.
    mo.md(r"""### (2.1) Frequently and infrequently called variants""")
    return
@app.cell
def _(clinicals, mutations_vaf, pd):
    # For each variant, count how many clinical samples call it (VAF > 0),
    # then flag variants called in every sample or in none.
    # always called and never called
    called_gt0 = pd.DataFrame(
        (mutations_vaf.loc[:, clinicals] > 0).sum(axis=1),
        columns=["samples_called_in"],
    )
    # Vectorized replacements for the original per-row list comprehensions
    # (same values and dtypes, one pandas op instead of a Python loop; also
    # drops the `True if … else False` anti-pattern).
    called_gt0["prop_samples_called"] = called_gt0["samples_called_in"] / len(
        clinicals
    )
    called_gt0["called_in_ALL"] = called_gt0["samples_called_in"] == len(clinicals)
    called_gt0["called_in_NONE"] = called_gt0["samples_called_in"] == 0
    var_always_called = called_gt0.loc[called_gt0["called_in_ALL"]].index
    var_never_called = called_gt0.loc[called_gt0["called_in_NONE"]].index
    # Display: totals of always/never-called variants.
    called_gt0[["called_in_ALL", "called_in_NONE"]].sum(axis=0)
    return (var_always_called,)
@app.cell
def _(sc, var_always_called):
    # all 9 variants that occur in all clinical samples are blacklisted variants
    # (look the always-called IDs up in the supercolumn annotation table).
    sc.loc[sc["mutation_id"].isin(var_always_called)][
        ["chrom", "position", "ref", "alt", "gene", "blacklisted"]
    ]
    return
@app.cell
def _(mo):
    # Sub-section header: per-sample variant counts above VAF thresholds.
    mo.md(r"""### (2.2) Variants called above VAF thresholds""")
    return
@app.cell
def _(all_vaf, black_mut_ids, pd, qg_samples_dot_mutations):
    # Per-sample counts of variants whose VAF exceeds each threshold, with and
    # without the blacklisted panel positions.
    _blacklist = set(black_mut_ids)  # O(1) membership instead of list scans
    # Hoisted: the non-blacklisted row selection is loop-invariant and was
    # recomputed for every threshold.
    _non_black_vaf = all_vaf.loc[
        [i for i in all_vaf.index if i not in _blacklist], :
    ]
    df_mut_per_sample_by_thresh = pd.DataFrame()
    for t in [0.00125, 0.0025, 0.005, 0.01]:
        t_data_all = pd.DataFrame(
            (all_vaf > t).sum(axis=0),
            columns=[f"variants_gt_{t}_all"],
        )
        t_data = pd.DataFrame(
            (_non_black_vaf > t).sum(axis=0),
            columns=[f"variants_gt_{t}_wo_blacklisted"],
        )
        df_mut_per_sample_by_thresh = pd.concat(
            [df_mut_per_sample_by_thresh, t_data, t_data_all], axis=1
        )

    def _sample_type(name):
        # Provenance label: QG historical vs Serenomica, SeraCare vs clinical.
        _is_qg = name in qg_samples_dot_mutations
        if "Sera" in name:
            return "QG-Sera" if _is_qg else "Serenomica-Sera"
        # Typo fix: label was "Serenomical-Clinincal".
        return "QG-Clinical" if _is_qg else "Serenomica-Clinical"

    df_mut_per_sample_by_thresh["sample_type"] = [
        _sample_type(i) for i in df_mut_per_sample_by_thresh.index
    ]
    df_mut_per_sample_by_thresh
    return (df_mut_per_sample_by_thresh,)
@app.cell
def _(df_mut_per_sample_by_thresh, pd):
    # Long format for plotting: one row per (sample, metric) pair.
    _wide = df_mut_per_sample_by_thresh.reset_index()
    df_mut_per_sample_by_thresh_melt = pd.melt(
        _wide, id_vars=["index", "sample_type"]
    )
    return (df_mut_per_sample_by_thresh_melt,)
@app.cell
def _(df_mut_per_sample_by_thresh, mo):
    # UI: pick which threshold columns to show in the box plot below.
    _options = df_mut_per_sample_by_thresh.columns
    ms_gt_thresh = mo.ui.multiselect(options=_options)
    mo.hstack([ms_gt_thresh])
    return (ms_gt_thresh,)
@app.cell
def _(df_mut_per_sample_by_thresh_melt, ms_gt_thresh, px):
    # Box plot of variants-per-sample for the metrics selected in the
    # multiselect, coloured by sample provenance.
    fig2 = px.box(
        df_mut_per_sample_by_thresh_melt.loc[
            df_mut_per_sample_by_thresh_melt["variable"].isin(ms_gt_thresh.value)
        ],
        x="variable",
        y="value",
        color="sample_type",
        points="all",
        template="simple_white",
        labels={"variable": "", "value": "Variants per sample"},
        hover_data=["index"],
    )
    fig2.update_yaxes(showgrid=True)
    fig2.show()
    return
@app.cell
def _(mo):
    # Sub-section header; spelling fixed ("Arbritrarily" -> "Arbitrarily").
    mo.md(
        r"""
    ### (2.3) Mutations with very High VAF (in some samples)
    Arbitrarily looking at >10%
    """
    )
    return
@app.cell
def _(mutations_vaf):
mutations_vaf.shape
def _(clinicals, mutations_vaf, pd, sc):
# looking just at the serenomica samples
high_vaf_serenomica = pd.DataFrame(
(mutations_vaf.loc[:, clinicals] > 0.1)
.sum(axis=1)
.sort_values(ascending=False),
columns=["count_vaf_gt0.1"],
)
sc.loc[
sc["mutation_id"].isin(
high_vaf_serenomica.loc[
high_vaf_serenomica["count_vaf_gt0.1"] > 0
].index
)
][["mutation_id", "blacklisted", "rsID", "gene", "Consequence"]]
return (high_vaf_serenomica,)
@app.cell
def _(high_vaf_serenomica):
    # Show the top variants by number of samples with VAF > 10%.
    high_vaf_serenomica.head(9)
    return
@app.cell
def _(mo):
    # Manual notes on the population allele frequencies of the high-VAF rsIDs.
    # NOTE(review): consecutive markdown lines without trailing spaces render
    # as one paragraph — confirm the intended layout in the notebook.
    mo.md(
        r"""
    rs587780751 - low pop AF
    rs2256740 - high pop AF, especially in EU, Africans and latin americans (13/19 samples)
    rs34094720 - relatively low pop AF
    rs2293347 - relatively high pop AF (7/19 samples)
    rs138327406 - relatively low pop AF
    rs11214077 - relatively low but > 1% in latin pop
    rs121913530 - very low pop AF
    """
    )
    return
@app.cell
def _(mo):
    # Sub-section header: coverage-threshold analysis.
    # NOTE(review): section number "(2.3)" duplicates the high-VAF section
    # above — renumbering the remaining headings would resolve this.
    mo.md(r"""### (2.3) Global Coverage""")
    return
@app.cell
def _(
    all_depth,
    black_mut_ids,
    df_mut_per_sample_by_thresh,
    pd,
    qg_samples_dot_mutations,
):
    # Per-sample proportion of panel sites covered above each depth threshold,
    # with and without the blacklisted positions.
    # NOTE(review): the "wo_blacklisted" proportions still divide by the full
    # site count (all_depth.shape[0]) as in the original — confirm intended.
    _blacklist = set(black_mut_ids)  # O(1) membership
    # Hoisted: loop-invariant non-blacklisted row selection.
    _non_black_depth = all_depth.loc[
        [i for i in all_depth.index if i not in _blacklist], :
    ]
    df_depth_by_thresh = pd.DataFrame()
    for d in [250, 500, 1000, 2000, 5000, 10000]:
        d_data_all = pd.DataFrame(
            (all_depth > d).sum(axis=0) / all_depth.shape[0],
            columns=[f"coverage_gt_{d}_all"],
        )
        d_data = pd.DataFrame(
            (_non_black_depth > d).sum(axis=0) / all_depth.shape[0],
            columns=[f"coverage_gt_{d}_wo_blacklisted"],
        )
        df_depth_by_thresh = pd.concat(
            [df_depth_by_thresh, d_data, d_data_all], axis=1
        )

    def _sample_type(name):
        # Provenance label: QG historical vs Serenomica, SeraCare vs clinical.
        _is_qg = name in qg_samples_dot_mutations
        if "Sera" in name:
            return "QG-Sera" if _is_qg else "Serenomica-Sera"
        # Typo fix: label was "Serenomical-Clinincal".
        return "QG-Clinical" if _is_qg else "Serenomica-Clinical"

    # BUG FIX: labels were derived from df_mut_per_sample_by_thresh.index;
    # this frame is now labelled from its own index (same samples, but no
    # accidental coupling). The old parameter is kept for compatibility.
    df_depth_by_thresh["sample_type"] = [
        _sample_type(i) for i in df_depth_by_thresh.index
    ]
    df_depth_by_thresh
    return (df_depth_by_thresh,)
@app.cell
def _(df_depth_by_thresh, pd):
    # Long format for plotting: one row per (sample, metric) pair.
    _wide = df_depth_by_thresh.reset_index()
    df_depth_by_thresh_melt = pd.melt(_wide, id_vars=["index", "sample_type"])
    return (df_depth_by_thresh_melt,)
@app.cell
def _(df_depth_by_thresh, mo):
    # UI: pick which depth-threshold columns to show in the box plot below.
    _options = df_depth_by_thresh.columns
    ms_gt_dep_thresh = mo.ui.multiselect(options=_options)
    mo.hstack([ms_gt_dep_thresh])
    return (ms_gt_dep_thresh,)
@app.cell
def _(df_depth_by_thresh_melt, ms_gt_dep_thresh, px):
    # Box plot of the coverage proportions for the selected depth thresholds,
    # coloured by sample provenance.
    fig3 = px.box(
        df_depth_by_thresh_melt.loc[
            df_depth_by_thresh_melt["variable"].isin(ms_gt_dep_thresh.value)
        ],
        x="variable",
        y="value",
        color="sample_type",
        points="all",
        template="simple_white",
        # Typo fix: axis label read "threhold".
        labels={"variable": "", "value": "Prop of sc sites > threshold per sample"},
        hover_data=["index"],
    )
    fig3.update_yaxes(showgrid=True)
    fig3.show()
    return
@app.cell
def _(mo):
    # Sub-section header: variants whose depth cannot support a 1% VAF call.
    mo.md(
        r"""
    ### (2.4) Missed Targets
    Look into variants that only have a depth that could support a VAF of 1% (i.e. 200 reads)
    """
    )
    return
@app.cell
def _(black_mut_ids, mutations_depth):
    # removal of blacklisted variants for this analysis
    # Index.isin keeps the original row order while replacing the
    # O(rows x blacklist) list-comprehension membership scan with a single
    # vectorized mask.
    mutations_depth_wo_black = mutations_depth.loc[
        ~mutations_depth.index.isin(black_mut_ids)
    ].copy()
    return (mutations_depth_wo_black,)
@app.cell
def _(clinicals, mutations_depth_wo_black, pd):
    # Proportion of clinical samples in which each non-blacklisted variant has
    # depth < 200, i.e. cannot theoretically support a 1% VAF call
    # (2 supporting reads out of 200).
    _under_200 = mutations_depth_wo_black.loc[:, clinicals] < 200
    _prop_low = (_under_200.sum(axis=1) / len(clinicals)).sort_values(
        ascending=False
    )
    variants_lost_global_coverage = pd.DataFrame(
        _prop_low,
        columns=["prop_cov_lt200"],
    )
    variants_lost_global_coverage
    return (variants_lost_global_coverage,)
@app.cell
def _(variants_lost_global_coverage):
    # Variants under-covered (<200x) in more than half of the samples.
    _mask = variants_lost_global_coverage["prop_cov_lt200"] > 0.5
    variants_lost_cov_ids = list(variants_lost_global_coverage.loc[_mask].index)
    variants_lost_cov_ids
    return (variants_lost_cov_ids,)
@app.cell
def _(mo):
    # Sub-section header: depth needed for the lowest reportable VAF (0.2%).
    mo.md(
        r"""
    ### (2.5) Poorly Covered for low VAF detection
    As we need 2 reads before we report a variant, this analysis looks at variants with lower than 1000 read depth that would enable a lowest VAF of 0.2% to be found.
    """
    )
    return
@app.cell
def _(clinicals, mutations_depth_wo_black, pd):
    # Variants too shallow (<1000x) to support the lowest reportable VAF of
    # 0.2% (2 reads out of 1000) in more than 75% of clinical samples.
    # Typo fix: column renamed "pro_cov_lt1000" -> "prop_cov_lt1000"; the
    # frame is only used inside this cell, so nothing downstream changes.
    variants_low_global_coverage = pd.DataFrame(
        (
            (mutations_depth_wo_black.loc[:, clinicals] < 1000).sum(axis=1)
            / len(clinicals)
        ).sort_values(ascending=False),
        columns=["prop_cov_lt1000"],
    )
    variants_low_cov_ids = list(
        variants_low_global_coverage.loc[
            variants_low_global_coverage["prop_cov_lt1000"] > 0.75
        ].index
    )
    return (variants_low_cov_ids,)
@app.cell
def _():
    # Intentionally empty scratch cell (marimo placeholder).
    return
@app.cell
def _(mo):
    # Sub-section header: over-represented panel targets.
    mo.md(
        r"""
    ### (2.6) Overly covered targets
    Look to see if any sc targets are overly represented in the reads
    """
    )
    return
@app.cell
def _(clinicals, mutations_depth_wo_black, pd):
    # Mean depth per non-blacklisted variant across clinical samples,
    # highest first.
    _per_variant_mean = mutations_depth_wo_black.loc[:, clinicals].mean(axis=1)
    mean_cov = pd.DataFrame(
        _per_variant_mean.sort_values(ascending=False),
        columns=["mean_cov"],
    )
    mean_cov
    return (mean_cov,)
@app.cell
def _(mean_cov, mo):
    # Slider controlling how many of the top-covered variants are plotted.
    slider = mo.ui.slider(
        start=100,
        stop=len(mean_cov),
        step=200,
        show_value=True,
        label="Number of variants to show",
    )
    mo.hstack([slider])
    return (slider,)
@app.cell
def _(mean_cov, px, slider):
    # Bar chart of the top-N variants by mean coverage.
    _top_n = mean_cov.iloc[: slider.value, :]
    fig4 = px.bar(_top_n, template="simple_white")
    fig4.show()
    return
@app.cell
def _(clinicals, mutations_depth_wo_black, pd):
    # Variants covered beyond 10000x in more than 75% of clinical samples —
    # candidates for over-representation in the panel.
    # Typo fix: column renamed "pro_cov_gt10000" -> "prop_cov_gt10000"; the
    # frame is only used inside this cell, so nothing downstream changes.
    variants_high_global_coverage = pd.DataFrame(
        (
            (mutations_depth_wo_black.loc[:, clinicals] > 10000).sum(axis=1)
            / len(clinicals)
        ).sort_values(ascending=False),
        columns=["prop_cov_gt10000"],
    )
    variants_high_cov_ids = list(
        variants_high_global_coverage.loc[
            variants_high_global_coverage["prop_cov_gt10000"] > 0.75
        ].index
    )
    return (variants_high_cov_ids,)
@app.cell
def _(sc, variants_high_cov_ids, variants_lost_cov_ids, variants_low_cov_ids):
    # Annotate each variant with its coverage issue and export the table.
    _lost = set(variants_lost_cov_ids)  # O(1) lookups instead of list scans
    _low = set(variants_low_cov_ids)
    _high = set(variants_high_cov_ids)

    def _coverage_issue(mut_id):
        # Precedence mirrors the original nested conditional:
        # lost > poorly_covered > highly_covered.
        if mut_id in _lost:
            return "lost"
        if mut_id in _low:
            return "poorly_covered"
        if mut_id in _high:
            return "highly_covered"
        return ""

    vars_of_interest = sc.loc[
        sc["mutation_id"].isin(
            variants_lost_cov_ids + variants_low_cov_ids + variants_high_cov_ids
        )
    ][
        [
            "chrom",
            "position",
            "mutation_id",
            "gene",
            "HGVS",
            "Consequence",
            "region_type",
            "blacklisted",
            "cancer_fda",
            "FDA_drugs",
            "cancer_clinvar",
            "cancer_rank_prevalence_combined",
        ]
    ].copy()
    vars_of_interest["coverage_issue"] = [
        _coverage_issue(m) for m in vars_of_interest["mutation_id"]
    ]
    vars_of_interest.to_csv(
        "../data/coverage_issue_variants.tsv", sep="\t", index=False
    )
    return