Files
lb_eda/notebooks/initial_eda_lb_20250919.py

850 lines
22 KiB
Python

import marimo
__generated_with = "0.14.16"
app = marimo.App(width="medium")
@app.cell
def _():
import os
import itertools
import marimo as mo
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
import yaml
from matplotlib_venn import venn2
return itertools, mo, os, pd, plt, px, venn2, yaml
@app.cell
def _(mo):
mo.md(
r"""
# Initial EDA of the Twist LB data (19-Sep-2025)
So far no analysis have been done on the data from the LB assay. This analysis will take the first look.
Analysis ideas:
- Stats data analysis - box plots and outliers
- Mutations - agg stats -> variants called, coverage of called variants in supercol, common called variants, unique variants
- vcfs - agg stats -> variants called, coverage of called variants in supercol, common called variants
"""
)
return
@app.cell
def _(mo):
mo.md(r"""## (1) Preprocess Data""")
return
@app.cell
def _():
se1_dir = "/home/darren/Documents/4_data/3_internal/2_lb/se1-prd-2.1.1/"
se2_dir = "/home/darren/Documents/4_data/3_internal/2_lb/se2-lb-1/"
qc_hist_dir = (
"/home/darren/Documents/2_repos/serenomica/flowcell_qc/historical_data/"
)
qc_hist_mutations_dir = "/home/darren/Documents/4_data/3_internal/2_lb/qg_historical_qc_service/dot_mutations/"
sc_path = "/home/darren/Documents/2_repos/serenomica/supercolumn/panel_v2/supercolumn_v2.csv"
return qc_hist_dir, qc_hist_mutations_dir, sc_path, se1_dir, se2_dir
@app.cell
def _(itertools, os):
def return_paths(suffix, *dirs):
return list(
itertools.chain.from_iterable(
[
[
os.path.join(d, f)
for f in os.listdir(d)
if f.endswith(suffix)
]
for d in dirs
]
)
)
return (return_paths,)
@app.cell
def _(return_paths, se1_dir, se2_dir):
stats_fpaths = return_paths(".stats", se1_dir, se2_dir)
print(len(stats_fpaths))
mutations_fpaths = return_paths(".mutations", se1_dir, se2_dir)
print(len(mutations_fpaths))
return mutations_fpaths, stats_fpaths
@app.cell
def _(mo):
mo.md(r"""### (1.1) Stats""")
return
@app.cell
def _(pd, stats_fpaths, yaml):
stats = pd.DataFrame()
for sf in stats_fpaths:
with open(sf, "r") as handle:
stats = pd.concat(
[
stats,
pd.json_normalize(yaml.safe_load(handle), sep=" / ").rename(
{0: sf.split("/")[-1].strip(".stats")}, axis=0
),
],
axis=0,
)
stats["sample_type"] = [
"Serenomica-Sera" if "Sera" in i else "Serenomica-Clinical"
for i in stats.index
]
stats["source"] = "Serenomica"
stats = stats.drop(
["miscellaneous / stats_file_version", "miscellaneous / target_panel_bed"],
axis=1,
)
return (stats,)
@app.cell
def _(mo):
mo.md(r"""### (1.2) .mutations""")
return
@app.cell
def _(pd):
def process_mutations_files(mutation_file_paths):
cols = []
vaf, counts, depth = (
pd.DataFrame(),
pd.DataFrame(),
pd.DataFrame(),
)
for mf in mutation_file_paths:
tmp = pd.read_csv(mf, sep="\t", header=None)
tmp["mutation_id"] = tmp.apply(
lambda x: "_".join([str(x[0]), str(x[1]), str(x[2])]), axis=1
)
tmp.set_index("mutation_id", inplace=True)
cols.append(mf.split("/")[-1].strip(".mutations"))
vaf = pd.concat([vaf, tmp[3]], axis=1)
counts = pd.concat([counts, tmp[4]], axis=1)
depth = pd.concat([depth, tmp[5]], axis=1)
vaf.columns = cols
counts.columns = cols
depth.columns = cols
return vaf, counts, depth
return (process_mutations_files,)
@app.cell
def _(mutations_fpaths, process_mutations_files):
mutations_vaf, mutations_counts, mutations_depth = process_mutations_files(
mutations_fpaths
)
mutations_vaf
return mutations_depth, mutations_vaf
@app.cell
def _(mo):
mo.md(r"""### (1.3) Stats QG Historical""")
return
@app.cell
def _(os, pd, qc_hist_dir):
# QG historical data
qg_hist_stats = pd.read_csv(
os.path.join(qc_hist_dir, "historical_data_LB.csv"), index_col=0
)
qg_hist_stats.columns = [i.replace("|", " / ") for i in qg_hist_stats.columns]
qg_hist_stats = qg_hist_stats.drop(["batch"], axis=1)
qg_hist_stats["sample_type"] = [
"QG-Sera" if "Sera" in i else "QG-Clinical" for i in qg_hist_stats.index
]
qg_hist_stats["source"] = "QG"
qg_hist_stats
return (qg_hist_stats,)
@app.cell
def _(pd, qg_hist_stats, stats):
# ensure columns match
assert set(qg_hist_stats.columns) == set(stats.columns)
all_stats = pd.concat([stats, qg_hist_stats], axis=0)
all_stats.head()
return (all_stats,)
@app.cell
def _(mo):
mo.md(r"""### (1.4) .mutations QG Historical""")
return
@app.cell
def _(process_mutations_files, qc_hist_mutations_dir, return_paths):
qg_hist_mutations_paths = return_paths(".mutations", qc_hist_mutations_dir)
qg_hist_mut_vaf, qg_hist_mut_count, qg_hist_mut_depth = (
process_mutations_files(qg_hist_mutations_paths)
)
qg_hist_mut_vaf
return qg_hist_mut_depth, qg_hist_mut_vaf
@app.cell
def _(mutations_depth, mutations_vaf, pd, qg_hist_mut_depth, qg_hist_mut_vaf):
# ensure columns match
assert set(qg_hist_mut_vaf.index) == set(mutations_vaf.index)
all_vaf = pd.concat([mutations_vaf, qg_hist_mut_vaf], axis=1)
all_depth = pd.concat([mutations_depth, qg_hist_mut_depth], axis=1)
return all_depth, all_vaf
@app.cell
def _(qg_hist_mut_vaf):
qg_samples_dot_mutations = qg_hist_mut_vaf.columns
return (qg_samples_dot_mutations,)
@app.cell
def _(mo):
mo.md(r"""## (1) Stats Analysis""")
return
@app.cell
def _(all_stats, pd):
stats_melt = pd.melt(
all_stats.reset_index(), id_vars=["index", "sample_type", "source"]
)
stats_melt.head()
return (stats_melt,)
@app.cell
def _(mo, stats):
multiselect = mo.ui.multiselect(options=stats.columns)
mo.hstack([multiselect])
return (multiselect,)
@app.cell
def _(multiselect, px, stats_melt):
fig = px.box(
stats_melt.loc[stats_melt["variable"].isin(multiselect.value)],
x="variable",
y="value",
color="sample_type",
points="all",
template="simple_white",
labels={"variable": "", "value": "Value from stats file"},
hover_data=["index", "source"],
category_orders={
"sample_type": sorted(stats_melt["sample_type"].unique())
},
)
fig.update_yaxes(showgrid=True)
fig.show()
return
@app.cell
def _(mo):
mo.md(
r"""
## (2) .mutations Analysis
Analysis ideas💡
- Always/majority called - potentially suspicious ✅
- Never called ✅
- High VAF targets ✅
- Poorly covered targets ✅
- Overly covered targets ✅
"""
)
return
@app.cell
def _(pd, sc_path):
# supercolumn for analysis
sc = pd.read_csv(sc_path, sep=";")
sc["mutation_id"] = (
"chr"
+ sc["chrom"].astype(str)
+ "_"
+ sc["position"].astype(str)
+ "_"
+ sc["ref"].astype(str)
+ "/"
+ sc["alt"].astype(str)
)
return (sc,)
@app.cell
def _(sc):
black_mut_ids = sc.loc[sc["blacklisted"] == "Yes"]["mutation_id"].tolist()
print(f"Number of blacklisted variants = {len(black_mut_ids)}")
return (black_mut_ids,)
@app.cell
def _(mutations_vaf):
seracares = [i for i in mutations_vaf.columns if "Sera" in i]
clinicals = [i for i in mutations_vaf.columns if i not in seracares]
print(f"Number of seracares = {len(seracares)}")
print(f"Number of volunteers = {len(clinicals)}")
print("Both numbers include replicates!")
return (clinicals,)
@app.cell
def _(mo):
mo.md(r"""### (2.1) Frequently and infrequently called variants""")
return
@app.cell
def _(clinicals, mutations_vaf, pd):
# always called and never called
called_gt0 = pd.DataFrame(
(mutations_vaf.loc[:, clinicals] > 0).sum(axis=1),
columns=["samples_called_in"],
)
called_gt0["prop_samples_called"] = [
row / len(clinicals) for row in called_gt0["samples_called_in"]
]
called_gt0["called_in_ALL"] = [
True if val == len(clinicals) else False
for val in called_gt0["samples_called_in"]
]
called_gt0["called_in_NONE"] = [
True if val == 0 else False for val in called_gt0["samples_called_in"]
]
var_always_called = called_gt0[called_gt0["called_in_ALL"] == True].index
var_never_called = called_gt0[called_gt0["called_in_NONE"] == True].index
called_gt0[["called_in_ALL", "called_in_NONE"]].sum(axis=0)
return (var_always_called,)
@app.cell
def _(sc, var_always_called):
# all 9 variants that occur in all clinical samples are blacklisted variants
sc.loc[sc["mutation_id"].isin(var_always_called)][
["chrom", "position", "ref", "alt", "gene", "blacklisted"]
]
return
@app.cell
def _(mo):
mo.md(r"""### (2.2) Variants called above VAF thresholds""")
return
@app.cell
def _(all_vaf, black_mut_ids, pd, qg_samples_dot_mutations):
# mutations over thresholds, excluding the blacklisted hits
df_mut_per_sample_by_thresh = pd.DataFrame()
for t in [0.00125, 0.0025, 0.005, 0.01]:
t_data_all = pd.DataFrame(
(all_vaf > t).sum(axis=0),
columns=[f"variants_gt_{t}_all"],
)
t_data = pd.DataFrame(
(
all_vaf.loc[
[i for i in all_vaf.index if i not in black_mut_ids], :
]
> t
).sum(axis=0),
columns=[f"variants_gt_{t}_wo_blacklisted"],
)
df_mut_per_sample_by_thresh = pd.concat(
[df_mut_per_sample_by_thresh, t_data, t_data_all], axis=1
)
df_mut_per_sample_by_thresh["sample_type"] = [
"Serenomica-Sera"
if ("Sera" in i) and (i not in qg_samples_dot_mutations)
else "QG-Sera"
if (i in qg_samples_dot_mutations) and ("Sera" in i)
else "Serenomica-Clinincal"
if i not in qg_samples_dot_mutations
else "QG-Clinical"
for i in df_mut_per_sample_by_thresh.index
]
df_mut_per_sample_by_thresh
return (df_mut_per_sample_by_thresh,)
@app.cell
def _(df_mut_per_sample_by_thresh, pd):
df_mut_per_sample_by_thresh_melt = pd.melt(
df_mut_per_sample_by_thresh.reset_index(), id_vars=["index", "sample_type"]
)
return (df_mut_per_sample_by_thresh_melt,)
@app.cell
def _(df_mut_per_sample_by_thresh, mo):
ms_gt_thresh = mo.ui.multiselect(options=df_mut_per_sample_by_thresh.columns)
mo.hstack([ms_gt_thresh])
return (ms_gt_thresh,)
@app.cell
def _(df_mut_per_sample_by_thresh_melt, ms_gt_thresh, px):
fig2 = px.box(
df_mut_per_sample_by_thresh_melt.loc[
df_mut_per_sample_by_thresh_melt["variable"].isin(ms_gt_thresh.value)
],
x="variable",
y="value",
color="sample_type",
points="all",
template="simple_white",
labels={"variable": "", "value": "Variants per sample"},
hover_data=["index"],
category_orders={
"sample_type": sorted(
df_mut_per_sample_by_thresh_melt["sample_type"].unique()
)
},
)
fig2.update_yaxes(showgrid=True)
fig2.show()
return
@app.cell
def _(mo):
mo.md(
r"""
### (2.3) Mutations with very High VAF (in some samples)
Arbritrarily looking at >10%
"""
)
return
@app.cell
def _(clinicals, mutations_vaf, pd, sc):
# looking just at the serenomica samples
high_vaf_serenomica = pd.DataFrame(
(mutations_vaf.loc[:, clinicals] > 0.1)
.sum(axis=1)
.sort_values(ascending=False),
columns=["count_vaf_gt0.1"],
)
sc.loc[
sc["mutation_id"].isin(
high_vaf_serenomica.loc[
high_vaf_serenomica["count_vaf_gt0.1"] > 0
].index
)
][["mutation_id", "blacklisted", "rsID", "gene", "Consequence"]]
return (high_vaf_serenomica,)
@app.cell
def _(high_vaf_serenomica):
high_vaf_serenomica.head(9)
return
@app.cell
def _(mo):
mo.md(
r"""
rs587780751 - low pop AF
rs2256740 - high pop AF, especially in EU, Africans and latin americans (13/19 samples)
rs34094720 - relatively low pop AF
rs2293347 - relatively high pop AF (7/19 samples)
rs138327406 - relatively low pop AF
rs11214077 - relatively low but > 1% in latin pop
rs121913530 - very low pop AF
"""
)
return
@app.cell
def _(mo):
mo.md(r"""### (2.3) Global Coverage""")
return
@app.cell
def _(
all_depth,
black_mut_ids,
df_mut_per_sample_by_thresh,
pd,
qg_samples_dot_mutations,
):
# coverage over thresholds
df_depth_by_thresh = pd.DataFrame()
for d in [250, 500, 1000, 2000, 5000, 10000]:
d_data_all = pd.DataFrame(
(all_depth > d).sum(axis=0) / all_depth.shape[0],
columns=[f"coverage_gt_{d}_all"],
)
d_data = pd.DataFrame(
(
all_depth.loc[
[i for i in all_depth.index if i not in black_mut_ids], :
]
> d
).sum(axis=0)
/ all_depth.shape[0],
columns=[f"coverage_gt_{d}_wo_blacklisted"],
)
df_depth_by_thresh = pd.concat(
[df_depth_by_thresh, d_data, d_data_all], axis=1
)
df_depth_by_thresh["sample_type"] = [
"Serenomica-Sera"
if ("Sera" in i) and (i not in qg_samples_dot_mutations)
else "QG-Sera"
if (i in qg_samples_dot_mutations) and ("Sera" in i)
else "Serenomica-Clinincal"
if i not in qg_samples_dot_mutations
else "QG-Clinical"
for i in df_mut_per_sample_by_thresh.index
]
df_depth_by_thresh
return (df_depth_by_thresh,)
@app.cell
def _(df_depth_by_thresh, pd):
df_depth_by_thresh_melt = pd.melt(
df_depth_by_thresh.reset_index(), id_vars=["index", "sample_type"]
)
return (df_depth_by_thresh_melt,)
@app.cell
def _(df_depth_by_thresh, mo):
ms_gt_dep_thresh = mo.ui.multiselect(options=df_depth_by_thresh.columns)
mo.hstack([ms_gt_dep_thresh])
return (ms_gt_dep_thresh,)
@app.cell
def _(df_depth_by_thresh_melt, ms_gt_dep_thresh, px):
fig3 = px.box(
df_depth_by_thresh_melt.loc[
df_depth_by_thresh_melt["variable"].isin(ms_gt_dep_thresh.value)
],
x="variable",
y="value",
color="sample_type",
points="all",
template="simple_white",
labels={"variable": "", "value": "Prop of sc sites > threhold per sample"},
hover_data=["index"],
category_orders={
"sample_type": sorted(df_depth_by_thresh_melt["sample_type"].unique())
},
)
fig3.update_yaxes(showgrid=True)
fig3.show()
return
@app.cell
def _(mo):
mo.md(
r"""
### (2.4) Missed Targets
Look into variants that only have a depth that could support a VAF of 1% (i.e. 200 reads)
"""
)
return
@app.cell
def _(black_mut_ids, mutations_depth, qg_hist_mut_depth):
# removal of blacklisted variants for this analysis
mutations_depth_wo_black = mutations_depth.loc[
[i for i in mutations_depth.index if i not in black_mut_ids], :
].copy()
qg_mutations_depth_wo_black = qg_hist_mut_depth.loc[
[i for i in qg_hist_mut_depth.index if i not in black_mut_ids], :
].copy()
return mutations_depth_wo_black, qg_mutations_depth_wo_black
@app.cell
def _(pd):
def get_variant_group_by_depth(df, depth_thresh, sample_prop, mode="lt"):
if mode == "lt":
variants_group = pd.DataFrame(
((df < depth_thresh).sum(axis=1) / df.shape[1]).sort_values(
ascending=False
),
columns=[f"prop_cov_{mode}{depth_thresh}"],
)
elif mode == "gt":
variants_group = pd.DataFrame(
((df > depth_thresh).sum(axis=1) / df.shape[1]).sort_values(
ascending=False
),
columns=[f"prop_cov_{mode}{depth_thresh}"],
)
variant_ids = list(
variants_group.loc[
variants_group[f"prop_cov_{mode}{depth_thresh}"] > sample_prop
].index
)
return variants_group, variant_ids
return (get_variant_group_by_depth,)
@app.cell
def _(clinicals, get_variant_group_by_depth, mutations_depth_wo_black):
variants_lost_global_coverage, variants_lost_cov_ids = (
get_variant_group_by_depth(
mutations_depth_wo_black.loc[:, clinicals], 400, 0.5
)
)
variants_lost_cov_ids
return (variants_lost_cov_ids,)
@app.cell
def _(get_variant_group_by_depth, qg_mutations_depth_wo_black):
qg_variants_lost_global_coverage, qg_variants_lost_cov_ids = (
get_variant_group_by_depth(qg_mutations_depth_wo_black, 400, 0.5)
)
qg_variants_lost_cov_ids
return (qg_variants_lost_cov_ids,)
@app.cell
def _(plt, qg_variants_lost_cov_ids, variants_lost_cov_ids, venn2):
venn2(
subsets=(
len(qg_variants_lost_cov_ids),
len(
set(qg_variants_lost_cov_ids).intersection(
set(variants_lost_cov_ids)
)
),
len(set(variants_lost_cov_ids)),
),
set_labels=("QG", "Serenomica"),
set_colors=("blue", "green"),
alpha=0.5,
)
plt.title("Overlap of missed supercolumn targets (w/o blacklist targets)")
plt.show()
return
@app.cell
def _(mo):
mo.md(
r"""
### (2.5) Poorly Covered for low VAF detection
As we need 2 reads before we report a variant, this analysis looks at variants with lower than 1000 read depth that would enable a lowest VAF of 0.2% to be found.
"""
)
return
@app.cell
def _(clinicals, get_variant_group_by_depth, mutations_depth_wo_black):
variants_low_global_coverage, variants_low_cov_ids = (
get_variant_group_by_depth(
mutations_depth_wo_black.loc[:, clinicals], 1000, 0.75
)
)
variants_low_cov_ids
return (variants_low_cov_ids,)
@app.cell
def _(get_variant_group_by_depth, qg_mutations_depth_wo_black):
qg_variants_low_global_coverage, qg_variants_low_cov_ids = (
get_variant_group_by_depth(qg_mutations_depth_wo_black, 1000, 0.75)
)
qg_variants_low_cov_ids
return (qg_variants_low_cov_ids,)
@app.cell
def _(
plt,
qg_variants_lost_cov_ids,
qg_variants_low_cov_ids,
variants_lost_cov_ids,
variants_low_cov_ids,
venn2,
):
qg_low = set(
[q for q in qg_variants_low_cov_ids if q not in qg_variants_lost_cov_ids]
)
sere_low = set(
[v for v in variants_low_cov_ids if v not in variants_lost_cov_ids]
)
venn2(
subsets=(
len(qg_low),
len(qg_low.intersection(set(sere_low))),
len(sere_low),
),
set_labels=("QG", "Serenomica"),
set_colors=("blue", "green"),
alpha=0.5,
)
plt.title(
"Overlap of lowly covered supercolumn targets (w/o blacklist targets)"
)
plt.show()
return
@app.cell
def _(mo):
mo.md(
r"""
### (2.6) Overly covered targets
Look to see if any sc targets are overly represented in the reads
"""
)
return
@app.cell
def _(clinicals, mutations_depth_wo_black, pd):
mean_cov = pd.DataFrame(
mutations_depth_wo_black.loc[:, clinicals]
.mean(axis=1)
.sort_values(ascending=False),
columns=["mean_cov"],
)
mean_cov
return (mean_cov,)
@app.cell
def _(mean_cov, mo):
slider = mo.ui.slider(
start=100,
stop=len(mean_cov),
step=200,
label="Number of variants to show",
show_value=True,
)
mo.hstack([slider])
return (slider,)
@app.cell
def _(mean_cov, px, slider):
fig4 = px.bar(mean_cov.iloc[: slider.value, :], template="simple_white")
fig4.show()
return
@app.cell
def _(clinicals, get_variant_group_by_depth, mutations_depth_wo_black):
variants_high_global_coverage, variants_high_cov_ids = (
get_variant_group_by_depth(
mutations_depth_wo_black.loc[:, clinicals], 10000, 0.75, mode="gt"
)
)
variants_high_cov_ids
return (variants_high_cov_ids,)
@app.cell
def _(get_variant_group_by_depth, qg_mutations_depth_wo_black):
qg_variants_high_global_coverage, qg_variants_high_cov_ids = (
get_variant_group_by_depth(
qg_mutations_depth_wo_black, 10000, 0.75, mode="gt"
)
)
qg_variants_high_cov_ids
return
@app.cell
def _(sc, variants_high_cov_ids, variants_lost_cov_ids, variants_low_cov_ids):
vars_of_interest = sc.loc[
sc["mutation_id"].isin(
variants_lost_cov_ids + variants_low_cov_ids + variants_high_cov_ids
)
][
[
"chrom",
"position",
"mutation_id",
"gene",
"HGVS",
"Consequence",
"region_type",
"blacklisted",
"cancer_fda",
"FDA_drugs",
"cancer_clinvar",
"cancer_rank_prevalence_combined",
]
].copy()
vars_of_interest["coverage_issue"] = [
"lost"
if i in variants_lost_cov_ids
else "poorly_covered"
if i in variants_low_cov_ids
else "highly_covered"
if i in variants_high_cov_ids
else ""
for i in vars_of_interest["mutation_id"]
]
vars_of_interest.to_csv(
"../data/coverage_issue_variants.tsv", sep="\t", index=False
)
return
@app.cell
def _():
return
if __name__ == "__main__":
app.run()