feat: analysis completed in the eda on stats and mutations

This commit is contained in:
2025-09-23 16:21:03 +02:00
parent e03ede9c28
commit 39ef0b93ea

View File

@@ -12,10 +12,11 @@ def _():
import pandas as pd import pandas as pd
import plotly.express as px import plotly.express as px
import plotly.graph_objects as go import plotly.graph_objects as go
import matplotlib as plt import matplotlib.pyplot as plt
import seaborn as sns import seaborn as sns
import yaml import yaml
return itertools, mo, os, pd, px, yaml from matplotlib_venn import venn2
return itertools, mo, os, pd, plt, px, venn2, yaml
@app.cell @app.cell
@@ -254,6 +255,9 @@ def _(multiselect, px, stats_melt):
template="simple_white", template="simple_white",
labels={"variable": "", "value": "Value from stats file"}, labels={"variable": "", "value": "Value from stats file"},
hover_data=["index", "source"], hover_data=["index", "source"],
category_orders={
"sample_type": sorted(stats_melt["sample_type"].unique())
},
) )
fig.update_yaxes(showgrid=True) fig.update_yaxes(showgrid=True)
fig.show() fig.show()
@@ -271,8 +275,8 @@ def _(mo):
- Always/majority called - potentially suspicious ✅ - Always/majority called - potentially suspicious ✅
- Never called ✅ - Never called ✅
- High VAF targets ✅ - High VAF targets ✅
- Poorly covered targets - Poorly covered targets
- Overly covered targets - Overly covered targets
""" """
) )
return return
@@ -384,7 +388,7 @@ def _(all_vaf, black_mut_ids, pd, qg_samples_dot_mutations):
if ("Sera" in i) and (i not in qg_samples_dot_mutations) if ("Sera" in i) and (i not in qg_samples_dot_mutations)
else "QG-Sera" else "QG-Sera"
if (i in qg_samples_dot_mutations) and ("Sera" in i) if (i in qg_samples_dot_mutations) and ("Sera" in i)
else "Serenomical-Clinincal" else "Serenomica-Clinincal"
if i not in qg_samples_dot_mutations if i not in qg_samples_dot_mutations
else "QG-Clinical" else "QG-Clinical"
for i in df_mut_per_sample_by_thresh.index for i in df_mut_per_sample_by_thresh.index
@@ -421,6 +425,11 @@ def _(df_mut_per_sample_by_thresh_melt, ms_gt_thresh, px):
template="simple_white", template="simple_white",
labels={"variable": "", "value": "Variants per sample"}, labels={"variable": "", "value": "Variants per sample"},
hover_data=["index"], hover_data=["index"],
category_orders={
"sample_type": sorted(
df_mut_per_sample_by_thresh_melt["sample_type"].unique()
)
},
) )
fig2.update_yaxes(showgrid=True) fig2.update_yaxes(showgrid=True)
fig2.show() fig2.show()
@@ -525,7 +534,7 @@ def _(
if ("Sera" in i) and (i not in qg_samples_dot_mutations) if ("Sera" in i) and (i not in qg_samples_dot_mutations)
else "QG-Sera" else "QG-Sera"
if (i in qg_samples_dot_mutations) and ("Sera" in i) if (i in qg_samples_dot_mutations) and ("Sera" in i)
else "Serenomical-Clinincal" else "Serenomica-Clinincal"
if i not in qg_samples_dot_mutations if i not in qg_samples_dot_mutations
else "QG-Clinical" else "QG-Clinical"
for i in df_mut_per_sample_by_thresh.index for i in df_mut_per_sample_by_thresh.index
@@ -562,6 +571,9 @@ def _(df_depth_by_thresh_melt, ms_gt_dep_thresh, px):
template="simple_white", template="simple_white",
labels={"variable": "", "value": "Prop of sc sites > threhold per sample"}, labels={"variable": "", "value": "Prop of sc sites > threhold per sample"},
hover_data=["index"], hover_data=["index"],
category_orders={
"sample_type": sorted(df_depth_by_thresh_melt["sample_type"].unique())
},
) )
fig3.update_yaxes(showgrid=True) fig3.update_yaxes(showgrid=True)
fig3.show() fig3.show()
@@ -581,45 +593,85 @@ def _(mo):
@app.cell @app.cell
def _(black_mut_ids, mutations_depth): def _(black_mut_ids, mutations_depth, qg_hist_mut_depth):
# removal of blacklisted variants for this analysis # removal of blacklisted variants for this analysis
mutations_depth_wo_black = mutations_depth.loc[ mutations_depth_wo_black = mutations_depth.loc[
[i for i in mutations_depth.index if i not in black_mut_ids], : [i for i in mutations_depth.index if i not in black_mut_ids], :
].copy() ].copy()
return (mutations_depth_wo_black,)
qg_mutations_depth_wo_black = qg_hist_mut_depth.loc[
[i for i in qg_hist_mut_depth.index if i not in black_mut_ids], :
].copy()
return mutations_depth_wo_black, qg_mutations_depth_wo_black
@app.cell @app.cell
def _(clinicals, mutations_depth_wo_black, pd): def _(pd):
# missed non-blacklisted variants where theoretical detection only >1% (i.e. 2 reads in 200 supporting variant) def get_variant_group_by_depth(df, depth_thresh, sample_prop, mode="lt"):
variants_lost_global_coverage = pd.DataFrame( if mode == "lt":
( variants_group = pd.DataFrame(
( ((df < depth_thresh).sum(axis=1) / df.shape[1]).sort_values(
mutations_depth_wo_black.loc[ ascending=False
:, ),
clinicals, columns=[f"prop_cov_{mode}{depth_thresh}"],
]
< 200
).sum(axis=1)
/ len(clinicals)
).sort_values(ascending=False),
columns=["prop_cov_lt200"],
) )
variants_lost_global_coverage elif mode == "gt":
return (variants_lost_global_coverage,) variants_group = pd.DataFrame(
((df > depth_thresh).sum(axis=1) / df.shape[1]).sort_values(
ascending=False
),
columns=[f"prop_cov_{mode}{depth_thresh}"],
)
variant_ids = list(
variants_group.loc[
variants_group[f"prop_cov_{mode}{depth_thresh}"] > sample_prop
].index
)
return variants_group, variant_ids
return (get_variant_group_by_depth,)
@app.cell @app.cell
def _(variants_lost_global_coverage): def _(clinicals, get_variant_group_by_depth, mutations_depth_wo_black):
variants_lost_cov_ids = list( variants_lost_global_coverage, variants_lost_cov_ids = (
variants_lost_global_coverage.loc[ get_variant_group_by_depth(
variants_lost_global_coverage["prop_cov_lt200"] > 0.5 mutations_depth_wo_black.loc[:, clinicals], 400, 0.5
].index )
) )
variants_lost_cov_ids variants_lost_cov_ids
return (variants_lost_cov_ids,) return (variants_lost_cov_ids,)
@app.cell
def _(get_variant_group_by_depth, qg_mutations_depth_wo_black):
qg_variants_lost_global_coverage, qg_variants_lost_cov_ids = (
get_variant_group_by_depth(qg_mutations_depth_wo_black, 400, 0.5)
)
qg_variants_lost_cov_ids
return (qg_variants_lost_cov_ids,)
@app.cell
def _(plt, qg_variants_lost_cov_ids, variants_lost_cov_ids, venn2):
venn2(
subsets=(
len(qg_variants_lost_cov_ids),
len(
set(qg_variants_lost_cov_ids).intersection(
set(variants_lost_cov_ids)
)
),
len(set(variants_lost_cov_ids)),
),
set_labels=("QG", "Serenomica"),
set_colors=("blue", "green"),
alpha=0.5,
)
plt.title("Overlap of missed supercolumn targets (w/o blacklist targets)")
plt.show()
return
@app.cell @app.cell
def _(mo): def _(mo):
mo.md( mo.md(
@@ -633,26 +685,54 @@ def _(mo):
@app.cell @app.cell
def _(clinicals, mutations_depth_wo_black, pd): def _(clinicals, get_variant_group_by_depth, mutations_depth_wo_black):
# variants that always fail a theoretical report coverage >= 0.2% VAF (i.e. needs 2 reads out of at least 1000) variants_low_global_coverage, variants_low_cov_ids = (
variants_low_global_coverage = pd.DataFrame( get_variant_group_by_depth(
( mutations_depth_wo_black.loc[:, clinicals], 1000, 0.75
(mutations_depth_wo_black.loc[:, clinicals] < 1000).sum(axis=1)
/ len(clinicals)
).sort_values(ascending=False),
columns=["pro_cov_lt1000"],
) )
variants_low_cov_ids = list(
variants_low_global_coverage.loc[
variants_low_global_coverage["pro_cov_lt1000"] > 0.75
].index
) )
variants_low_cov_ids
return (variants_low_cov_ids,) return (variants_low_cov_ids,)
@app.cell @app.cell
def _(): def _(get_variant_group_by_depth, qg_mutations_depth_wo_black):
qg_variants_low_global_coverage, qg_variants_low_cov_ids = (
get_variant_group_by_depth(qg_mutations_depth_wo_black, 1000, 0.75)
)
qg_variants_low_cov_ids
return (qg_variants_low_cov_ids,)
@app.cell
def _(
plt,
qg_variants_lost_cov_ids,
qg_variants_low_cov_ids,
variants_lost_cov_ids,
variants_low_cov_ids,
venn2,
):
qg_low = set(
[q for q in qg_variants_low_cov_ids if q not in qg_variants_lost_cov_ids]
)
sere_low = set(
[v for v in variants_low_cov_ids if v not in variants_lost_cov_ids]
)
venn2(
subsets=(
len(qg_low),
len(qg_low.intersection(set(sere_low))),
len(sere_low),
),
set_labels=("QG", "Serenomica"),
set_colors=("blue", "green"),
alpha=0.5,
)
plt.title(
"Overlap of lowly covered supercolumn targets (w/o blacklist targets)"
)
plt.show()
return return
@@ -701,23 +781,27 @@ def _(mean_cov, px, slider):
@app.cell @app.cell
def _(clinicals, mutations_depth_wo_black, pd): def _(clinicals, get_variant_group_by_depth, mutations_depth_wo_black):
variants_high_global_coverage = pd.DataFrame( variants_high_global_coverage, variants_high_cov_ids = (
( get_variant_group_by_depth(
(mutations_depth_wo_black.loc[:, clinicals] > 10000).sum(axis=1) mutations_depth_wo_black.loc[:, clinicals], 10000, 0.75, mode="gt"
/ len(clinicals)
).sort_values(ascending=False),
columns=["pro_cov_gt10000"],
) )
variants_high_cov_ids = list(
variants_high_global_coverage.loc[
variants_high_global_coverage["pro_cov_gt10000"] > 0.75
].index
) )
variants_high_cov_ids
return (variants_high_cov_ids,) return (variants_high_cov_ids,)
@app.cell
def _(get_variant_group_by_depth, qg_mutations_depth_wo_black):
qg_variants_high_global_coverage, qg_variants_high_cov_ids = (
get_variant_group_by_depth(
qg_mutations_depth_wo_black, 10000, 0.75, mode="gt"
)
)
qg_variants_high_cov_ids
return
@app.cell @app.cell
def _(sc, variants_high_cov_ids, variants_lost_cov_ids, variants_low_cov_ids): def _(sc, variants_high_cov_ids, variants_lost_cov_ids, variants_low_cov_ids):
vars_of_interest = sc.loc[ vars_of_interest = sc.loc[