feat: analysis completed in the eda on stats and mutations

This commit is contained in:
2025-09-23 16:21:03 +02:00
parent e03ede9c28
commit 39ef0b93ea

View File

@@ -12,10 +12,11 @@ def _():
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import matplotlib as plt
import matplotlib.pyplot as plt
import seaborn as sns
import yaml
return itertools, mo, os, pd, px, yaml
from matplotlib_venn import venn2
return itertools, mo, os, pd, plt, px, venn2, yaml
@app.cell
@@ -254,6 +255,9 @@ def _(multiselect, px, stats_melt):
template="simple_white",
labels={"variable": "", "value": "Value from stats file"},
hover_data=["index", "source"],
category_orders={
"sample_type": sorted(stats_melt["sample_type"].unique())
},
)
fig.update_yaxes(showgrid=True)
fig.show()
@@ -271,8 +275,8 @@ def _(mo):
- Always/majority called - potentially suspicious ✅
- Never called ✅
- High VAF targets ✅
- Poorly covered targets
- Overly covered targets
- Poorly covered targets
- Overly covered targets
"""
)
return
@@ -384,7 +388,7 @@ def _(all_vaf, black_mut_ids, pd, qg_samples_dot_mutations):
if ("Sera" in i) and (i not in qg_samples_dot_mutations)
else "QG-Sera"
if (i in qg_samples_dot_mutations) and ("Sera" in i)
else "Serenomical-Clinincal"
else "Serenomica-Clinincal"
if i not in qg_samples_dot_mutations
else "QG-Clinical"
for i in df_mut_per_sample_by_thresh.index
@@ -421,6 +425,11 @@ def _(df_mut_per_sample_by_thresh_melt, ms_gt_thresh, px):
template="simple_white",
labels={"variable": "", "value": "Variants per sample"},
hover_data=["index"],
category_orders={
"sample_type": sorted(
df_mut_per_sample_by_thresh_melt["sample_type"].unique()
)
},
)
fig2.update_yaxes(showgrid=True)
fig2.show()
@@ -525,7 +534,7 @@ def _(
if ("Sera" in i) and (i not in qg_samples_dot_mutations)
else "QG-Sera"
if (i in qg_samples_dot_mutations) and ("Sera" in i)
else "Serenomical-Clinincal"
else "Serenomica-Clinincal"
if i not in qg_samples_dot_mutations
else "QG-Clinical"
for i in df_mut_per_sample_by_thresh.index
@@ -562,6 +571,9 @@ def _(df_depth_by_thresh_melt, ms_gt_dep_thresh, px):
template="simple_white",
labels={"variable": "", "value": "Prop of sc sites > threhold per sample"},
hover_data=["index"],
category_orders={
"sample_type": sorted(df_depth_by_thresh_melt["sample_type"].unique())
},
)
fig3.update_yaxes(showgrid=True)
fig3.show()
@@ -581,45 +593,85 @@ def _(mo):
@app.cell
def _(black_mut_ids, mutations_depth):
def _(black_mut_ids, mutations_depth, qg_hist_mut_depth):
# removal of blacklisted variants for this analysis
mutations_depth_wo_black = mutations_depth.loc[
[i for i in mutations_depth.index if i not in black_mut_ids], :
].copy()
return (mutations_depth_wo_black,)
qg_mutations_depth_wo_black = qg_hist_mut_depth.loc[
[i for i in qg_hist_mut_depth.index if i not in black_mut_ids], :
].copy()
return mutations_depth_wo_black, qg_mutations_depth_wo_black
@app.cell
def _(clinicals, mutations_depth_wo_black, pd):
# missed non-blacklisted variants where theoretical detection only >1% (i.e. 2 reads in 200 supporting variant)
variants_lost_global_coverage = pd.DataFrame(
(
(
mutations_depth_wo_black.loc[
:,
clinicals,
]
< 200
).sum(axis=1)
/ len(clinicals)
).sort_values(ascending=False),
columns=["prop_cov_lt200"],
)
variants_lost_global_coverage
return (variants_lost_global_coverage,)
def _(pd):
def get_variant_group_by_depth(df, depth_thresh, sample_prop, mode="lt"):
if mode == "lt":
variants_group = pd.DataFrame(
((df < depth_thresh).sum(axis=1) / df.shape[1]).sort_values(
ascending=False
),
columns=[f"prop_cov_{mode}{depth_thresh}"],
)
elif mode == "gt":
variants_group = pd.DataFrame(
((df > depth_thresh).sum(axis=1) / df.shape[1]).sort_values(
ascending=False
),
columns=[f"prop_cov_{mode}{depth_thresh}"],
)
variant_ids = list(
variants_group.loc[
variants_group[f"prop_cov_{mode}{depth_thresh}"] > sample_prop
].index
)
return variants_group, variant_ids
return (get_variant_group_by_depth,)
@app.cell
def _(variants_lost_global_coverage):
variants_lost_cov_ids = list(
variants_lost_global_coverage.loc[
variants_lost_global_coverage["prop_cov_lt200"] > 0.5
].index
def _(clinicals, get_variant_group_by_depth, mutations_depth_wo_black):
variants_lost_global_coverage, variants_lost_cov_ids = (
get_variant_group_by_depth(
mutations_depth_wo_black.loc[:, clinicals], 400, 0.5
)
)
variants_lost_cov_ids
return (variants_lost_cov_ids,)
@app.cell
def _(get_variant_group_by_depth, qg_mutations_depth_wo_black):
qg_variants_lost_global_coverage, qg_variants_lost_cov_ids = (
get_variant_group_by_depth(qg_mutations_depth_wo_black, 400, 0.5)
)
qg_variants_lost_cov_ids
return (qg_variants_lost_cov_ids,)
@app.cell
def _(plt, qg_variants_lost_cov_ids, variants_lost_cov_ids, venn2):
venn2(
subsets=(
len(qg_variants_lost_cov_ids),
len(
set(qg_variants_lost_cov_ids).intersection(
set(variants_lost_cov_ids)
)
),
len(set(variants_lost_cov_ids)),
),
set_labels=("QG", "Serenomica"),
set_colors=("blue", "green"),
alpha=0.5,
)
plt.title("Overlap of missed supercolumn targets (w/o blacklist targets)")
plt.show()
return
@app.cell
def _(mo):
mo.md(
@@ -633,26 +685,54 @@ def _(mo):
@app.cell
def _(clinicals, mutations_depth_wo_black, pd):
# variants that always fail a theoretical report coverage >= 0.2% VAF (i.e. needs 2 reads out of at least 1000)
variants_low_global_coverage = pd.DataFrame(
(
(mutations_depth_wo_black.loc[:, clinicals] < 1000).sum(axis=1)
/ len(clinicals)
).sort_values(ascending=False),
columns=["pro_cov_lt1000"],
)
variants_low_cov_ids = list(
variants_low_global_coverage.loc[
variants_low_global_coverage["pro_cov_lt1000"] > 0.75
].index
def _(clinicals, get_variant_group_by_depth, mutations_depth_wo_black):
variants_low_global_coverage, variants_low_cov_ids = (
get_variant_group_by_depth(
mutations_depth_wo_black.loc[:, clinicals], 1000, 0.75
)
)
variants_low_cov_ids
return (variants_low_cov_ids,)
@app.cell
def _():
def _(get_variant_group_by_depth, qg_mutations_depth_wo_black):
qg_variants_low_global_coverage, qg_variants_low_cov_ids = (
get_variant_group_by_depth(qg_mutations_depth_wo_black, 1000, 0.75)
)
qg_variants_low_cov_ids
return (qg_variants_low_cov_ids,)
@app.cell
def _(
plt,
qg_variants_lost_cov_ids,
qg_variants_low_cov_ids,
variants_lost_cov_ids,
variants_low_cov_ids,
venn2,
):
qg_low = set(
[q for q in qg_variants_low_cov_ids if q not in qg_variants_lost_cov_ids]
)
sere_low = set(
[v for v in variants_low_cov_ids if v not in variants_lost_cov_ids]
)
venn2(
subsets=(
len(qg_low),
len(qg_low.intersection(set(sere_low))),
len(sere_low),
),
set_labels=("QG", "Serenomica"),
set_colors=("blue", "green"),
alpha=0.5,
)
plt.title(
"Overlap of lowly covered supercolumn targets (w/o blacklist targets)"
)
plt.show()
return
@@ -701,23 +781,27 @@ def _(mean_cov, px, slider):
@app.cell
def _(clinicals, mutations_depth_wo_black, pd):
variants_high_global_coverage = pd.DataFrame(
(
(mutations_depth_wo_black.loc[:, clinicals] > 10000).sum(axis=1)
/ len(clinicals)
).sort_values(ascending=False),
columns=["pro_cov_gt10000"],
)
variants_high_cov_ids = list(
variants_high_global_coverage.loc[
variants_high_global_coverage["pro_cov_gt10000"] > 0.75
].index
def _(clinicals, get_variant_group_by_depth, mutations_depth_wo_black):
variants_high_global_coverage, variants_high_cov_ids = (
get_variant_group_by_depth(
mutations_depth_wo_black.loc[:, clinicals], 10000, 0.75, mode="gt"
)
)
variants_high_cov_ids
return (variants_high_cov_ids,)
@app.cell
def _(get_variant_group_by_depth, qg_mutations_depth_wo_black):
qg_variants_high_global_coverage, qg_variants_high_cov_ids = (
get_variant_group_by_depth(
qg_mutations_depth_wo_black, 10000, 0.75, mode="gt"
)
)
qg_variants_high_cov_ids
return
@app.cell
def _(sc, variants_high_cov_ids, variants_lost_cov_ids, variants_low_cov_ids):
vars_of_interest = sc.loc[