feat: analysis completed in the eda on stats and mutations
This commit is contained in:
@@ -12,10 +12,11 @@ def _():
|
||||
import pandas as pd
|
||||
import plotly.express as px
|
||||
import plotly.graph_objects as go
|
||||
import matplotlib as plt
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
import yaml
|
||||
return itertools, mo, os, pd, px, yaml
|
||||
from matplotlib_venn import venn2
|
||||
return itertools, mo, os, pd, plt, px, venn2, yaml
|
||||
|
||||
|
||||
@app.cell
|
||||
@@ -254,6 +255,9 @@ def _(multiselect, px, stats_melt):
|
||||
template="simple_white",
|
||||
labels={"variable": "", "value": "Value from stats file"},
|
||||
hover_data=["index", "source"],
|
||||
category_orders={
|
||||
"sample_type": sorted(stats_melt["sample_type"].unique())
|
||||
},
|
||||
)
|
||||
fig.update_yaxes(showgrid=True)
|
||||
fig.show()
|
||||
@@ -271,8 +275,8 @@ def _(mo):
|
||||
- Always/majority called - potentially suspicious ✅
|
||||
- Never called ✅
|
||||
- High VAF targets ✅
|
||||
- Poorly covered targets
|
||||
- Overly covered targets
|
||||
- Poorly covered targets ✅
|
||||
- Overly covered targets ✅
|
||||
"""
|
||||
)
|
||||
return
|
||||
@@ -384,7 +388,7 @@ def _(all_vaf, black_mut_ids, pd, qg_samples_dot_mutations):
|
||||
if ("Sera" in i) and (i not in qg_samples_dot_mutations)
|
||||
else "QG-Sera"
|
||||
if (i in qg_samples_dot_mutations) and ("Sera" in i)
|
||||
else "Serenomical-Clinincal"
|
||||
else "Serenomica-Clinincal"
|
||||
if i not in qg_samples_dot_mutations
|
||||
else "QG-Clinical"
|
||||
for i in df_mut_per_sample_by_thresh.index
|
||||
@@ -421,6 +425,11 @@ def _(df_mut_per_sample_by_thresh_melt, ms_gt_thresh, px):
|
||||
template="simple_white",
|
||||
labels={"variable": "", "value": "Variants per sample"},
|
||||
hover_data=["index"],
|
||||
category_orders={
|
||||
"sample_type": sorted(
|
||||
df_mut_per_sample_by_thresh_melt["sample_type"].unique()
|
||||
)
|
||||
},
|
||||
)
|
||||
fig2.update_yaxes(showgrid=True)
|
||||
fig2.show()
|
||||
@@ -525,7 +534,7 @@ def _(
|
||||
if ("Sera" in i) and (i not in qg_samples_dot_mutations)
|
||||
else "QG-Sera"
|
||||
if (i in qg_samples_dot_mutations) and ("Sera" in i)
|
||||
else "Serenomical-Clinincal"
|
||||
else "Serenomica-Clinincal"
|
||||
if i not in qg_samples_dot_mutations
|
||||
else "QG-Clinical"
|
||||
for i in df_mut_per_sample_by_thresh.index
|
||||
@@ -562,6 +571,9 @@ def _(df_depth_by_thresh_melt, ms_gt_dep_thresh, px):
|
||||
template="simple_white",
|
||||
labels={"variable": "", "value": "Prop of sc sites > threhold per sample"},
|
||||
hover_data=["index"],
|
||||
category_orders={
|
||||
"sample_type": sorted(df_depth_by_thresh_melt["sample_type"].unique())
|
||||
},
|
||||
)
|
||||
fig3.update_yaxes(showgrid=True)
|
||||
fig3.show()
|
||||
@@ -581,45 +593,85 @@ def _(mo):
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(black_mut_ids, mutations_depth):
|
||||
def _(black_mut_ids, mutations_depth, qg_hist_mut_depth):
|
||||
# removal of blacklisted variants for this analysis
|
||||
mutations_depth_wo_black = mutations_depth.loc[
|
||||
[i for i in mutations_depth.index if i not in black_mut_ids], :
|
||||
].copy()
|
||||
return (mutations_depth_wo_black,)
|
||||
|
||||
qg_mutations_depth_wo_black = qg_hist_mut_depth.loc[
|
||||
[i for i in qg_hist_mut_depth.index if i not in black_mut_ids], :
|
||||
].copy()
|
||||
return mutations_depth_wo_black, qg_mutations_depth_wo_black
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(clinicals, mutations_depth_wo_black, pd):
|
||||
# missed non-blacklisted variants where theoretical detection only >1% (i.e. 2 reads in 200 supporting variant)
|
||||
variants_lost_global_coverage = pd.DataFrame(
|
||||
(
|
||||
(
|
||||
mutations_depth_wo_black.loc[
|
||||
:,
|
||||
clinicals,
|
||||
]
|
||||
< 200
|
||||
).sum(axis=1)
|
||||
/ len(clinicals)
|
||||
).sort_values(ascending=False),
|
||||
columns=["prop_cov_lt200"],
|
||||
)
|
||||
variants_lost_global_coverage
|
||||
return (variants_lost_global_coverage,)
|
||||
def _(pd):
|
||||
def get_variant_group_by_depth(df, depth_thresh, sample_prop, mode="lt"):
|
||||
if mode == "lt":
|
||||
variants_group = pd.DataFrame(
|
||||
((df < depth_thresh).sum(axis=1) / df.shape[1]).sort_values(
|
||||
ascending=False
|
||||
),
|
||||
columns=[f"prop_cov_{mode}{depth_thresh}"],
|
||||
)
|
||||
elif mode == "gt":
|
||||
variants_group = pd.DataFrame(
|
||||
((df > depth_thresh).sum(axis=1) / df.shape[1]).sort_values(
|
||||
ascending=False
|
||||
),
|
||||
columns=[f"prop_cov_{mode}{depth_thresh}"],
|
||||
)
|
||||
variant_ids = list(
|
||||
variants_group.loc[
|
||||
variants_group[f"prop_cov_{mode}{depth_thresh}"] > sample_prop
|
||||
].index
|
||||
)
|
||||
return variants_group, variant_ids
|
||||
return (get_variant_group_by_depth,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(variants_lost_global_coverage):
|
||||
variants_lost_cov_ids = list(
|
||||
variants_lost_global_coverage.loc[
|
||||
variants_lost_global_coverage["prop_cov_lt200"] > 0.5
|
||||
].index
|
||||
def _(clinicals, get_variant_group_by_depth, mutations_depth_wo_black):
|
||||
variants_lost_global_coverage, variants_lost_cov_ids = (
|
||||
get_variant_group_by_depth(
|
||||
mutations_depth_wo_black.loc[:, clinicals], 400, 0.5
|
||||
)
|
||||
)
|
||||
variants_lost_cov_ids
|
||||
return (variants_lost_cov_ids,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(get_variant_group_by_depth, qg_mutations_depth_wo_black):
|
||||
qg_variants_lost_global_coverage, qg_variants_lost_cov_ids = (
|
||||
get_variant_group_by_depth(qg_mutations_depth_wo_black, 400, 0.5)
|
||||
)
|
||||
qg_variants_lost_cov_ids
|
||||
return (qg_variants_lost_cov_ids,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(plt, qg_variants_lost_cov_ids, variants_lost_cov_ids, venn2):
|
||||
venn2(
|
||||
subsets=(
|
||||
len(qg_variants_lost_cov_ids),
|
||||
len(
|
||||
set(qg_variants_lost_cov_ids).intersection(
|
||||
set(variants_lost_cov_ids)
|
||||
)
|
||||
),
|
||||
len(set(variants_lost_cov_ids)),
|
||||
),
|
||||
set_labels=("QG", "Serenomica"),
|
||||
set_colors=("blue", "green"),
|
||||
alpha=0.5,
|
||||
)
|
||||
plt.title("Overlap of missed supercolumn targets (w/o blacklist targets)")
|
||||
plt.show()
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo):
|
||||
mo.md(
|
||||
@@ -633,26 +685,54 @@ def _(mo):
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(clinicals, mutations_depth_wo_black, pd):
|
||||
# variants that always fail a theoretical report coverage >= 0.2% VAF (i.e. needs 2 reads out of at least 1000)
|
||||
variants_low_global_coverage = pd.DataFrame(
|
||||
(
|
||||
(mutations_depth_wo_black.loc[:, clinicals] < 1000).sum(axis=1)
|
||||
/ len(clinicals)
|
||||
).sort_values(ascending=False),
|
||||
columns=["pro_cov_lt1000"],
|
||||
)
|
||||
|
||||
variants_low_cov_ids = list(
|
||||
variants_low_global_coverage.loc[
|
||||
variants_low_global_coverage["pro_cov_lt1000"] > 0.75
|
||||
].index
|
||||
def _(clinicals, get_variant_group_by_depth, mutations_depth_wo_black):
|
||||
variants_low_global_coverage, variants_low_cov_ids = (
|
||||
get_variant_group_by_depth(
|
||||
mutations_depth_wo_black.loc[:, clinicals], 1000, 0.75
|
||||
)
|
||||
)
|
||||
variants_low_cov_ids
|
||||
return (variants_low_cov_ids,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
def _(get_variant_group_by_depth, qg_mutations_depth_wo_black):
|
||||
qg_variants_low_global_coverage, qg_variants_low_cov_ids = (
|
||||
get_variant_group_by_depth(qg_mutations_depth_wo_black, 1000, 0.75)
|
||||
)
|
||||
qg_variants_low_cov_ids
|
||||
return (qg_variants_low_cov_ids,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(
|
||||
plt,
|
||||
qg_variants_lost_cov_ids,
|
||||
qg_variants_low_cov_ids,
|
||||
variants_lost_cov_ids,
|
||||
variants_low_cov_ids,
|
||||
venn2,
|
||||
):
|
||||
qg_low = set(
|
||||
[q for q in qg_variants_low_cov_ids if q not in qg_variants_lost_cov_ids]
|
||||
)
|
||||
sere_low = set(
|
||||
[v for v in variants_low_cov_ids if v not in variants_lost_cov_ids]
|
||||
)
|
||||
venn2(
|
||||
subsets=(
|
||||
len(qg_low),
|
||||
len(qg_low.intersection(set(sere_low))),
|
||||
len(sere_low),
|
||||
),
|
||||
set_labels=("QG", "Serenomica"),
|
||||
set_colors=("blue", "green"),
|
||||
alpha=0.5,
|
||||
)
|
||||
plt.title(
|
||||
"Overlap of lowly covered supercolumn targets (w/o blacklist targets)"
|
||||
)
|
||||
plt.show()
|
||||
return
|
||||
|
||||
|
||||
@@ -701,23 +781,27 @@ def _(mean_cov, px, slider):
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(clinicals, mutations_depth_wo_black, pd):
|
||||
variants_high_global_coverage = pd.DataFrame(
|
||||
(
|
||||
(mutations_depth_wo_black.loc[:, clinicals] > 10000).sum(axis=1)
|
||||
/ len(clinicals)
|
||||
).sort_values(ascending=False),
|
||||
columns=["pro_cov_gt10000"],
|
||||
)
|
||||
|
||||
variants_high_cov_ids = list(
|
||||
variants_high_global_coverage.loc[
|
||||
variants_high_global_coverage["pro_cov_gt10000"] > 0.75
|
||||
].index
|
||||
def _(clinicals, get_variant_group_by_depth, mutations_depth_wo_black):
|
||||
variants_high_global_coverage, variants_high_cov_ids = (
|
||||
get_variant_group_by_depth(
|
||||
mutations_depth_wo_black.loc[:, clinicals], 10000, 0.75, mode="gt"
|
||||
)
|
||||
)
|
||||
variants_high_cov_ids
|
||||
return (variants_high_cov_ids,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(get_variant_group_by_depth, qg_mutations_depth_wo_black):
|
||||
qg_variants_high_global_coverage, qg_variants_high_cov_ids = (
|
||||
get_variant_group_by_depth(
|
||||
qg_mutations_depth_wo_black, 10000, 0.75, mode="gt"
|
||||
)
|
||||
)
|
||||
qg_variants_high_cov_ids
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(sc, variants_high_cov_ids, variants_lost_cov_ids, variants_low_cov_ids):
|
||||
vars_of_interest = sc.loc[
|
||||
|
||||
Reference in New Issue
Block a user