feat: analysis completed in the eda on stats and mutations
This commit is contained in:
@@ -12,10 +12,11 @@ def _():
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
import plotly.express as px
|
import plotly.express as px
|
||||||
import plotly.graph_objects as go
|
import plotly.graph_objects as go
|
||||||
import matplotlib as plt
|
import matplotlib.pyplot as plt
|
||||||
import seaborn as sns
|
import seaborn as sns
|
||||||
import yaml
|
import yaml
|
||||||
return itertools, mo, os, pd, px, yaml
|
from matplotlib_venn import venn2
|
||||||
|
return itertools, mo, os, pd, plt, px, venn2, yaml
|
||||||
|
|
||||||
|
|
||||||
@app.cell
|
@app.cell
|
||||||
@@ -254,6 +255,9 @@ def _(multiselect, px, stats_melt):
|
|||||||
template="simple_white",
|
template="simple_white",
|
||||||
labels={"variable": "", "value": "Value from stats file"},
|
labels={"variable": "", "value": "Value from stats file"},
|
||||||
hover_data=["index", "source"],
|
hover_data=["index", "source"],
|
||||||
|
category_orders={
|
||||||
|
"sample_type": sorted(stats_melt["sample_type"].unique())
|
||||||
|
},
|
||||||
)
|
)
|
||||||
fig.update_yaxes(showgrid=True)
|
fig.update_yaxes(showgrid=True)
|
||||||
fig.show()
|
fig.show()
|
||||||
@@ -271,8 +275,8 @@ def _(mo):
|
|||||||
- Always/majority called - potentially suspicious ✅
|
- Always/majority called - potentially suspicious ✅
|
||||||
- Never called ✅
|
- Never called ✅
|
||||||
- High VAF targets ✅
|
- High VAF targets ✅
|
||||||
- Poorly covered targets
|
- Poorly covered targets ✅
|
||||||
- Overly covered targets
|
- Overly covered targets ✅
|
||||||
"""
|
"""
|
||||||
)
|
)
|
||||||
return
|
return
|
||||||
@@ -384,7 +388,7 @@ def _(all_vaf, black_mut_ids, pd, qg_samples_dot_mutations):
|
|||||||
if ("Sera" in i) and (i not in qg_samples_dot_mutations)
|
if ("Sera" in i) and (i not in qg_samples_dot_mutations)
|
||||||
else "QG-Sera"
|
else "QG-Sera"
|
||||||
if (i in qg_samples_dot_mutations) and ("Sera" in i)
|
if (i in qg_samples_dot_mutations) and ("Sera" in i)
|
||||||
else "Serenomical-Clinincal"
|
else "Serenomica-Clinincal"
|
||||||
if i not in qg_samples_dot_mutations
|
if i not in qg_samples_dot_mutations
|
||||||
else "QG-Clinical"
|
else "QG-Clinical"
|
||||||
for i in df_mut_per_sample_by_thresh.index
|
for i in df_mut_per_sample_by_thresh.index
|
||||||
@@ -421,6 +425,11 @@ def _(df_mut_per_sample_by_thresh_melt, ms_gt_thresh, px):
|
|||||||
template="simple_white",
|
template="simple_white",
|
||||||
labels={"variable": "", "value": "Variants per sample"},
|
labels={"variable": "", "value": "Variants per sample"},
|
||||||
hover_data=["index"],
|
hover_data=["index"],
|
||||||
|
category_orders={
|
||||||
|
"sample_type": sorted(
|
||||||
|
df_mut_per_sample_by_thresh_melt["sample_type"].unique()
|
||||||
|
)
|
||||||
|
},
|
||||||
)
|
)
|
||||||
fig2.update_yaxes(showgrid=True)
|
fig2.update_yaxes(showgrid=True)
|
||||||
fig2.show()
|
fig2.show()
|
||||||
@@ -525,7 +534,7 @@ def _(
|
|||||||
if ("Sera" in i) and (i not in qg_samples_dot_mutations)
|
if ("Sera" in i) and (i not in qg_samples_dot_mutations)
|
||||||
else "QG-Sera"
|
else "QG-Sera"
|
||||||
if (i in qg_samples_dot_mutations) and ("Sera" in i)
|
if (i in qg_samples_dot_mutations) and ("Sera" in i)
|
||||||
else "Serenomical-Clinincal"
|
else "Serenomica-Clinincal"
|
||||||
if i not in qg_samples_dot_mutations
|
if i not in qg_samples_dot_mutations
|
||||||
else "QG-Clinical"
|
else "QG-Clinical"
|
||||||
for i in df_mut_per_sample_by_thresh.index
|
for i in df_mut_per_sample_by_thresh.index
|
||||||
@@ -562,6 +571,9 @@ def _(df_depth_by_thresh_melt, ms_gt_dep_thresh, px):
|
|||||||
template="simple_white",
|
template="simple_white",
|
||||||
labels={"variable": "", "value": "Prop of sc sites > threhold per sample"},
|
labels={"variable": "", "value": "Prop of sc sites > threhold per sample"},
|
||||||
hover_data=["index"],
|
hover_data=["index"],
|
||||||
|
category_orders={
|
||||||
|
"sample_type": sorted(df_depth_by_thresh_melt["sample_type"].unique())
|
||||||
|
},
|
||||||
)
|
)
|
||||||
fig3.update_yaxes(showgrid=True)
|
fig3.update_yaxes(showgrid=True)
|
||||||
fig3.show()
|
fig3.show()
|
||||||
@@ -581,45 +593,85 @@ def _(mo):
|
|||||||
|
|
||||||
|
|
||||||
@app.cell
|
@app.cell
|
||||||
def _(black_mut_ids, mutations_depth):
|
def _(black_mut_ids, mutations_depth, qg_hist_mut_depth):
|
||||||
# removal of blacklisted variants for this analysis
|
# removal of blacklisted variants for this analysis
|
||||||
mutations_depth_wo_black = mutations_depth.loc[
|
mutations_depth_wo_black = mutations_depth.loc[
|
||||||
[i for i in mutations_depth.index if i not in black_mut_ids], :
|
[i for i in mutations_depth.index if i not in black_mut_ids], :
|
||||||
].copy()
|
].copy()
|
||||||
return (mutations_depth_wo_black,)
|
|
||||||
|
qg_mutations_depth_wo_black = qg_hist_mut_depth.loc[
|
||||||
|
[i for i in qg_hist_mut_depth.index if i not in black_mut_ids], :
|
||||||
|
].copy()
|
||||||
|
return mutations_depth_wo_black, qg_mutations_depth_wo_black
|
||||||
|
|
||||||
|
|
||||||
@app.cell
|
@app.cell
|
||||||
def _(clinicals, mutations_depth_wo_black, pd):
|
def _(pd):
|
||||||
# missed non-blacklisted variants where theoretical detection only >1% (i.e. 2 reads in 200 supporting variant)
|
def get_variant_group_by_depth(df, depth_thresh, sample_prop, mode="lt"):
|
||||||
variants_lost_global_coverage = pd.DataFrame(
|
if mode == "lt":
|
||||||
(
|
variants_group = pd.DataFrame(
|
||||||
(
|
((df < depth_thresh).sum(axis=1) / df.shape[1]).sort_values(
|
||||||
mutations_depth_wo_black.loc[
|
ascending=False
|
||||||
:,
|
),
|
||||||
clinicals,
|
columns=[f"prop_cov_{mode}{depth_thresh}"],
|
||||||
]
|
)
|
||||||
< 200
|
elif mode == "gt":
|
||||||
).sum(axis=1)
|
variants_group = pd.DataFrame(
|
||||||
/ len(clinicals)
|
((df > depth_thresh).sum(axis=1) / df.shape[1]).sort_values(
|
||||||
).sort_values(ascending=False),
|
ascending=False
|
||||||
columns=["prop_cov_lt200"],
|
),
|
||||||
)
|
columns=[f"prop_cov_{mode}{depth_thresh}"],
|
||||||
variants_lost_global_coverage
|
)
|
||||||
return (variants_lost_global_coverage,)
|
variant_ids = list(
|
||||||
|
variants_group.loc[
|
||||||
|
variants_group[f"prop_cov_{mode}{depth_thresh}"] > sample_prop
|
||||||
|
].index
|
||||||
|
)
|
||||||
|
return variants_group, variant_ids
|
||||||
|
return (get_variant_group_by_depth,)
|
||||||
|
|
||||||
|
|
||||||
@app.cell
|
@app.cell
|
||||||
def _(variants_lost_global_coverage):
|
def _(clinicals, get_variant_group_by_depth, mutations_depth_wo_black):
|
||||||
variants_lost_cov_ids = list(
|
variants_lost_global_coverage, variants_lost_cov_ids = (
|
||||||
variants_lost_global_coverage.loc[
|
get_variant_group_by_depth(
|
||||||
variants_lost_global_coverage["prop_cov_lt200"] > 0.5
|
mutations_depth_wo_black.loc[:, clinicals], 400, 0.5
|
||||||
].index
|
)
|
||||||
)
|
)
|
||||||
variants_lost_cov_ids
|
variants_lost_cov_ids
|
||||||
return (variants_lost_cov_ids,)
|
return (variants_lost_cov_ids,)
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
|
||||||
|
def _(get_variant_group_by_depth, qg_mutations_depth_wo_black):
|
||||||
|
qg_variants_lost_global_coverage, qg_variants_lost_cov_ids = (
|
||||||
|
get_variant_group_by_depth(qg_mutations_depth_wo_black, 400, 0.5)
|
||||||
|
)
|
||||||
|
qg_variants_lost_cov_ids
|
||||||
|
return (qg_variants_lost_cov_ids,)
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
|
||||||
|
def _(plt, qg_variants_lost_cov_ids, variants_lost_cov_ids, venn2):
|
||||||
|
venn2(
|
||||||
|
subsets=(
|
||||||
|
len(qg_variants_lost_cov_ids),
|
||||||
|
len(
|
||||||
|
set(qg_variants_lost_cov_ids).intersection(
|
||||||
|
set(variants_lost_cov_ids)
|
||||||
|
)
|
||||||
|
),
|
||||||
|
len(set(variants_lost_cov_ids)),
|
||||||
|
),
|
||||||
|
set_labels=("QG", "Serenomica"),
|
||||||
|
set_colors=("blue", "green"),
|
||||||
|
alpha=0.5,
|
||||||
|
)
|
||||||
|
plt.title("Overlap of missed supercolumn targets (w/o blacklist targets)")
|
||||||
|
plt.show()
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
@app.cell
|
@app.cell
|
||||||
def _(mo):
|
def _(mo):
|
||||||
mo.md(
|
mo.md(
|
||||||
@@ -633,26 +685,54 @@ def _(mo):
|
|||||||
|
|
||||||
|
|
||||||
@app.cell
|
@app.cell
|
||||||
def _(clinicals, mutations_depth_wo_black, pd):
|
def _(clinicals, get_variant_group_by_depth, mutations_depth_wo_black):
|
||||||
# variants that always fail a theoretical report coverage >= 0.2% VAF (i.e. needs 2 reads out of at least 1000)
|
variants_low_global_coverage, variants_low_cov_ids = (
|
||||||
variants_low_global_coverage = pd.DataFrame(
|
get_variant_group_by_depth(
|
||||||
(
|
mutations_depth_wo_black.loc[:, clinicals], 1000, 0.75
|
||||||
(mutations_depth_wo_black.loc[:, clinicals] < 1000).sum(axis=1)
|
)
|
||||||
/ len(clinicals)
|
|
||||||
).sort_values(ascending=False),
|
|
||||||
columns=["pro_cov_lt1000"],
|
|
||||||
)
|
|
||||||
|
|
||||||
variants_low_cov_ids = list(
|
|
||||||
variants_low_global_coverage.loc[
|
|
||||||
variants_low_global_coverage["pro_cov_lt1000"] > 0.75
|
|
||||||
].index
|
|
||||||
)
|
)
|
||||||
|
variants_low_cov_ids
|
||||||
return (variants_low_cov_ids,)
|
return (variants_low_cov_ids,)
|
||||||
|
|
||||||
|
|
||||||
@app.cell
|
@app.cell
|
||||||
def _():
|
def _(get_variant_group_by_depth, qg_mutations_depth_wo_black):
|
||||||
|
qg_variants_low_global_coverage, qg_variants_low_cov_ids = (
|
||||||
|
get_variant_group_by_depth(qg_mutations_depth_wo_black, 1000, 0.75)
|
||||||
|
)
|
||||||
|
qg_variants_low_cov_ids
|
||||||
|
return (qg_variants_low_cov_ids,)
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
|
||||||
|
def _(
|
||||||
|
plt,
|
||||||
|
qg_variants_lost_cov_ids,
|
||||||
|
qg_variants_low_cov_ids,
|
||||||
|
variants_lost_cov_ids,
|
||||||
|
variants_low_cov_ids,
|
||||||
|
venn2,
|
||||||
|
):
|
||||||
|
qg_low = set(
|
||||||
|
[q for q in qg_variants_low_cov_ids if q not in qg_variants_lost_cov_ids]
|
||||||
|
)
|
||||||
|
sere_low = set(
|
||||||
|
[v for v in variants_low_cov_ids if v not in variants_lost_cov_ids]
|
||||||
|
)
|
||||||
|
venn2(
|
||||||
|
subsets=(
|
||||||
|
len(qg_low),
|
||||||
|
len(qg_low.intersection(set(sere_low))),
|
||||||
|
len(sere_low),
|
||||||
|
),
|
||||||
|
set_labels=("QG", "Serenomica"),
|
||||||
|
set_colors=("blue", "green"),
|
||||||
|
alpha=0.5,
|
||||||
|
)
|
||||||
|
plt.title(
|
||||||
|
"Overlap of lowly covered supercolumn targets (w/o blacklist targets)"
|
||||||
|
)
|
||||||
|
plt.show()
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
@@ -701,23 +781,27 @@ def _(mean_cov, px, slider):
|
|||||||
|
|
||||||
|
|
||||||
@app.cell
|
@app.cell
|
||||||
def _(clinicals, mutations_depth_wo_black, pd):
|
def _(clinicals, get_variant_group_by_depth, mutations_depth_wo_black):
|
||||||
variants_high_global_coverage = pd.DataFrame(
|
variants_high_global_coverage, variants_high_cov_ids = (
|
||||||
(
|
get_variant_group_by_depth(
|
||||||
(mutations_depth_wo_black.loc[:, clinicals] > 10000).sum(axis=1)
|
mutations_depth_wo_black.loc[:, clinicals], 10000, 0.75, mode="gt"
|
||||||
/ len(clinicals)
|
)
|
||||||
).sort_values(ascending=False),
|
|
||||||
columns=["pro_cov_gt10000"],
|
|
||||||
)
|
|
||||||
|
|
||||||
variants_high_cov_ids = list(
|
|
||||||
variants_high_global_coverage.loc[
|
|
||||||
variants_high_global_coverage["pro_cov_gt10000"] > 0.75
|
|
||||||
].index
|
|
||||||
)
|
)
|
||||||
|
variants_high_cov_ids
|
||||||
return (variants_high_cov_ids,)
|
return (variants_high_cov_ids,)
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
|
||||||
|
def _(get_variant_group_by_depth, qg_mutations_depth_wo_black):
|
||||||
|
qg_variants_high_global_coverage, qg_variants_high_cov_ids = (
|
||||||
|
get_variant_group_by_depth(
|
||||||
|
qg_mutations_depth_wo_black, 10000, 0.75, mode="gt"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
qg_variants_high_cov_ids
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
@app.cell
|
@app.cell
|
||||||
def _(sc, variants_high_cov_ids, variants_lost_cov_ids, variants_low_cov_ids):
|
def _(sc, variants_high_cov_ids, variants_lost_cov_ids, variants_low_cov_ids):
|
||||||
vars_of_interest = sc.loc[
|
vars_of_interest = sc.loc[
|
||||||
|
|||||||
Reference in New Issue
Block a user