import marimo

__generated_with = "0.14.16"
app = marimo.App(width="medium")


@app.cell
def _():
    import os
    import itertools
    import marimo as mo
    import pandas as pd
    import plotly.express as px
    import plotly.graph_objects as go
    import matplotlib.pyplot as plt
    import seaborn as sns
    import yaml
    from matplotlib_venn import venn2

    return itertools, mo, os, pd, plt, px, venn2, yaml


@app.cell
def _(mo):
    mo.md(
        r"""
    # Initial EDA of the Twist LB data (19-Sep-2025)

    So far no analysis has been done on the data from the LB assay; this notebook takes a first look.

    Analysis ideas:

    - Stats data analysis - box plots and outliers
    - Mutations - agg stats -> variants called, coverage of called variants in the supercolumn, common called variants, unique variants
    - vcfs - agg stats -> variants called, coverage of called variants in the supercolumn, common called variants
    """
    )
    return


@app.cell
def _(mo):
    mo.md(r"""## (1) Preprocess Data""")
    return


@app.cell
def _():
    se1_dir = "/home/darren/Documents/4_data/3_internal/2_lb/se1-prd-2.1.1/"
    se2_dir = "/home/darren/Documents/4_data/3_internal/2_lb/se2-lb-1/"

    qc_hist_dir = (
        "/home/darren/Documents/2_repos/serenomica/flowcell_qc/historical_data/"
    )
    qc_hist_mutations_dir = "/home/darren/Documents/4_data/3_internal/2_lb/qg_historical_qc_service/dot_mutations/"
    sc_path = "/home/darren/Documents/2_repos/serenomica/supercolumn/panel_v2/supercolumn_v2.csv"
    return qc_hist_dir, qc_hist_mutations_dir, sc_path, se1_dir, se2_dir


@app.cell
def _(itertools, os):
    def return_paths(suffix, *dirs):
        # Collect every file ending in `suffix` across the given directories.
        return list(
            itertools.chain.from_iterable(
                [os.path.join(d, f) for f in os.listdir(d) if f.endswith(suffix)]
                for d in dirs
            )
        )

    return (return_paths,)


@app.cell
def _(return_paths, se1_dir, se2_dir):
    stats_fpaths = return_paths(".stats", se1_dir, se2_dir)
    print(len(stats_fpaths))
    mutations_fpaths = return_paths(".mutations", se1_dir, se2_dir)
    print(len(mutations_fpaths))
    return mutations_fpaths, stats_fpaths


@app.cell
def _(mo):
    mo.md(r"""### (1.1) Stats""")
    return


@app.cell
def _(pd, stats_fpaths, yaml):
    # Load each YAML .stats file as a single row indexed by its sample name.
    stats = pd.DataFrame()
    for sf in stats_fpaths:
        with open(sf, "r") as handle:
            stats = pd.concat(
                [
                    stats,
                    pd.json_normalize(yaml.safe_load(handle), sep=" / ").rename(
                        {0: sf.split("/")[-1].removesuffix(".stats")}, axis=0
                    ),
                ],
                axis=0,
            )
    stats["sample_type"] = [
        "Serenomica-Sera" if "Sera" in i else "Serenomica-Clinical"
        for i in stats.index
    ]
    stats["source"] = "Serenomica"
    stats = stats.drop(
        ["miscellaneous / stats_file_version", "miscellaneous / target_panel_bed"],
        axis=1,
    )
    return (stats,)


@app.cell
def _(mo):
    mo.md(r"""### (1.2) .mutations""")
    return


@app.cell
def _(pd):
    def process_mutations_files(mutation_file_paths):
        # Build per-sample VAF, count, and depth matrices from tab-separated
        # .mutations files: columns 0-2 identify the variant, column 3 is the
        # VAF, column 4 the supporting read count, and column 5 the read depth.
        cols = []
        vaf, counts, depth = (
            pd.DataFrame(),
            pd.DataFrame(),
            pd.DataFrame(),
        )
        for mf in mutation_file_paths:
            tmp = pd.read_csv(mf, sep="\t", header=None)
            tmp["mutation_id"] = tmp.apply(
                lambda x: "_".join([str(x[0]), str(x[1]), str(x[2])]), axis=1
            )
            tmp.set_index("mutation_id", inplace=True)

            cols.append(mf.split("/")[-1].removesuffix(".mutations"))
            vaf = pd.concat([vaf, tmp[3]], axis=1)
            counts = pd.concat([counts, tmp[4]], axis=1)
            depth = pd.concat([depth, tmp[5]], axis=1)

        vaf.columns = cols
        counts.columns = cols
        depth.columns = cols
        return vaf, counts, depth

    return (process_mutations_files,)


@app.cell
def _(mutations_fpaths, process_mutations_files):
    mutations_vaf, mutations_counts, mutations_depth = process_mutations_files(
        mutations_fpaths
    )
    mutations_vaf
    return mutations_depth, mutations_vaf


@app.cell
def _(mo):
    mo.md(r"""### (1.3) Stats QG Historical""")
    return


@app.cell
def _(os, pd, qc_hist_dir):
    # QG historical data
    qg_hist_stats = pd.read_csv(
        os.path.join(qc_hist_dir, "historical_data_LB.csv"), index_col=0
    )
    qg_hist_stats.columns = [i.replace("|", " / ") for i in qg_hist_stats.columns]
    qg_hist_stats = qg_hist_stats.drop(["batch"], axis=1)
    qg_hist_stats["sample_type"] = [
        "QG-Sera" if "Sera" in i else "QG-Clinical" for i in qg_hist_stats.index
    ]
    qg_hist_stats["source"] = "QG"
    qg_hist_stats
    return (qg_hist_stats,)


@app.cell
def _(pd, qg_hist_stats, stats):
    # ensure columns match
    assert set(qg_hist_stats.columns) == set(stats.columns)

    all_stats = pd.concat([stats, qg_hist_stats], axis=0)
    all_stats.head()
    return (all_stats,)


@app.cell
def _(mo):
    mo.md(r"""### (1.4) .mutations QG Historical""")
    return


@app.cell
def _(process_mutations_files, qc_hist_mutations_dir, return_paths):
    qg_hist_mutations_paths = return_paths(".mutations", qc_hist_mutations_dir)
    qg_hist_mut_vaf, qg_hist_mut_count, qg_hist_mut_depth = (
        process_mutations_files(qg_hist_mutations_paths)
    )
    qg_hist_mut_vaf
    return qg_hist_mut_depth, qg_hist_mut_vaf


@app.cell
def _(mutations_depth, mutations_vaf, pd, qg_hist_mut_depth, qg_hist_mut_vaf):
    # ensure the variant indices match before concatenating samples side by side
    assert set(qg_hist_mut_vaf.index) == set(mutations_vaf.index)

    all_vaf = pd.concat([mutations_vaf, qg_hist_mut_vaf], axis=1)
    all_depth = pd.concat([mutations_depth, qg_hist_mut_depth], axis=1)
    return all_depth, all_vaf


@app.cell
def _(qg_hist_mut_vaf):
    qg_samples_dot_mutations = qg_hist_mut_vaf.columns
    return (qg_samples_dot_mutations,)


@app.cell
def _(mo):
    mo.md(r"""## (2) Stats Analysis""")
    return


@app.cell
def _(all_stats, pd):
    stats_melt = pd.melt(
        all_stats.reset_index(), id_vars=["index", "sample_type", "source"]
    )
    stats_melt.head()
    return (stats_melt,)


@app.cell
def _(mo, stats):
    multiselect = mo.ui.multiselect(options=stats.columns)
    mo.hstack([multiselect])
    return (multiselect,)


@app.cell
def _(multiselect, px, stats_melt):
    fig = px.box(
        stats_melt.loc[stats_melt["variable"].isin(multiselect.value)],
        x="variable",
        y="value",
        color="sample_type",
        points="all",
        template="simple_white",
        labels={"variable": "", "value": "Value from stats file"},
        hover_data=["index", "source"],
        category_orders={
            "sample_type": sorted(stats_melt["sample_type"].unique())
        },
    )
    fig.update_yaxes(showgrid=True)
    fig.show()
    return


@app.cell
def _(mo):
    mo.md(
        r"""
    ## (3) .mutations Analysis

    Analysis ideas 💡

    - Always/majority called - potentially suspicious ✅
    - Never called ✅
    - High VAF targets ✅
    - Poorly covered targets ✅
    - Overly covered targets ✅
    """
    )
    return


@app.cell
def _(pd, sc_path):
    # supercolumn for analysis
    sc = pd.read_csv(sc_path, sep=";")
    sc["mutation_id"] = (
        "chr"
        + sc["chrom"].astype(str)
        + "_"
        + sc["position"].astype(str)
        + "_"
        + sc["ref"].astype(str)
        + "/"
        + sc["alt"].astype(str)
    )
    return (sc,)


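@app.cell
def _(sc):
    # Illustrative sanity check (added, not part of the original analysis):
    # the supercolumn mutation_id is built as "chr<chrom>_<position>_<ref>/<alt>",
    # the same key format used by the .mutations index, so the two sources can
    # be matched directly in the cells below.
    print(sc["mutation_id"].head())
    return

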
@app.cell
def _(sc):
    black_mut_ids = sc.loc[sc["blacklisted"] == "Yes"]["mutation_id"].tolist()
    print(f"Number of blacklisted variants = {len(black_mut_ids)}")
    return (black_mut_ids,)


@app.cell
def _(mutations_vaf):
    seracares = [i for i in mutations_vaf.columns if "Sera" in i]
    clinicals = [i for i in mutations_vaf.columns if i not in seracares]
    print(f"Number of seracares = {len(seracares)}")
    print(f"Number of volunteers = {len(clinicals)}")
    print("Both numbers include replicates!")
    return (clinicals,)


@app.cell
def _(mo):
    mo.md(r"""### (3.1) Frequently and infrequently called variants""")
    return


@app.cell
def _(clinicals, mutations_vaf, pd):
    # always called and never called
    called_gt0 = pd.DataFrame(
        (mutations_vaf.loc[:, clinicals] > 0).sum(axis=1),
        columns=["samples_called_in"],
    )
    called_gt0["prop_samples_called"] = [
        row / len(clinicals) for row in called_gt0["samples_called_in"]
    ]
    called_gt0["called_in_ALL"] = [
        val == len(clinicals) for val in called_gt0["samples_called_in"]
    ]
    called_gt0["called_in_NONE"] = [
        val == 0 for val in called_gt0["samples_called_in"]
    ]

    var_always_called = called_gt0[called_gt0["called_in_ALL"]].index
    var_never_called = called_gt0[called_gt0["called_in_NONE"]].index

    called_gt0[["called_in_ALL", "called_in_NONE"]].sum(axis=0)
    return (var_always_called,)


@app.cell
def _(sc, var_always_called):
    # all 9 variants that occur in all clinical samples are blacklisted variants
    sc.loc[sc["mutation_id"].isin(var_always_called)][
        ["chrom", "position", "ref", "alt", "gene", "blacklisted"]
    ]
    return


@app.cell
def _(mo):
    mo.md(r"""### (3.2) Variants called above VAF thresholds""")
    return


@app.cell
def _(all_vaf, black_mut_ids, pd, qg_samples_dot_mutations):
    # mutations over thresholds, excluding the blacklisted hits
    df_mut_per_sample_by_thresh = pd.DataFrame()
    for t in [0.00125, 0.0025, 0.005, 0.01]:
        t_data_all = pd.DataFrame(
            (all_vaf > t).sum(axis=0),
            columns=[f"variants_gt_{t}_all"],
        )
        t_data = pd.DataFrame(
            (
                all_vaf.loc[
                    [i for i in all_vaf.index if i not in black_mut_ids], :
                ]
                > t
            ).sum(axis=0),
            columns=[f"variants_gt_{t}_wo_blacklisted"],
        )
        df_mut_per_sample_by_thresh = pd.concat(
            [df_mut_per_sample_by_thresh, t_data, t_data_all], axis=1
        )
    df_mut_per_sample_by_thresh["sample_type"] = [
        "Serenomica-Sera"
        if ("Sera" in i) and (i not in qg_samples_dot_mutations)
        else "QG-Sera"
        if (i in qg_samples_dot_mutations) and ("Sera" in i)
        else "Serenomica-Clinical"
        if i not in qg_samples_dot_mutations
        else "QG-Clinical"
        for i in df_mut_per_sample_by_thresh.index
    ]
    df_mut_per_sample_by_thresh
    return (df_mut_per_sample_by_thresh,)


@app.cell
def _(df_mut_per_sample_by_thresh, pd):
    df_mut_per_sample_by_thresh_melt = pd.melt(
        df_mut_per_sample_by_thresh.reset_index(), id_vars=["index", "sample_type"]
    )
    return (df_mut_per_sample_by_thresh_melt,)


@app.cell
def _(df_mut_per_sample_by_thresh, mo):
    ms_gt_thresh = mo.ui.multiselect(options=df_mut_per_sample_by_thresh.columns)
    mo.hstack([ms_gt_thresh])
    return (ms_gt_thresh,)


@app.cell
def _(df_mut_per_sample_by_thresh_melt, ms_gt_thresh, px):
    fig2 = px.box(
        df_mut_per_sample_by_thresh_melt.loc[
            df_mut_per_sample_by_thresh_melt["variable"].isin(ms_gt_thresh.value)
        ],
        x="variable",
        y="value",
        color="sample_type",
        points="all",
        template="simple_white",
        labels={"variable": "", "value": "Variants per sample"},
        hover_data=["index"],
        category_orders={
            "sample_type": sorted(
                df_mut_per_sample_by_thresh_melt["sample_type"].unique()
            )
        },
    )
    fig2.update_yaxes(showgrid=True)
    fig2.show()
    return


@app.cell
def _(mo):
    mo.md(
        r"""
    ### (3.3) Mutations with very high VAF (in some samples)

    Arbitrarily looking at VAF > 10%.
    """
    )
    return


@app.cell
def _(clinicals, mutations_vaf, pd, sc):
    # looking just at the serenomica samples
    high_vaf_serenomica = pd.DataFrame(
        (mutations_vaf.loc[:, clinicals] > 0.1)
        .sum(axis=1)
        .sort_values(ascending=False),
        columns=["count_vaf_gt0.1"],
    )
    sc.loc[
        sc["mutation_id"].isin(
            high_vaf_serenomica.loc[
                high_vaf_serenomica["count_vaf_gt0.1"] > 0
            ].index
        )
    ][["mutation_id", "blacklisted", "rsID", "gene", "Consequence"]]
    return (high_vaf_serenomica,)


@app.cell
def _(high_vaf_serenomica):
    high_vaf_serenomica.head(9)
    return


@app.cell
def _(mo):
    mo.md(
        r"""
    - rs587780751 - low population AF
    - rs2256740 - high population AF, especially in European, African and Latin American populations (13/19 samples)
    - rs34094720 - relatively low population AF
    - rs2293347 - relatively high population AF (7/19 samples)
    - rs138327406 - relatively low population AF
    - rs11214077 - relatively low population AF, but > 1% in Latin American populations
    - rs121913530 - very low population AF
    """
    )
    return


@app.cell
def _(mo):
    mo.md(r"""### (3.4) Global Coverage""")
    return


@app.cell
def _(
    all_depth,
    black_mut_ids,
    pd,
    qg_samples_dot_mutations,
):
    # coverage over thresholds
    df_depth_by_thresh = pd.DataFrame()
    for d in [250, 500, 1000, 2000, 5000, 10000]:
        d_data_all = pd.DataFrame(
            (all_depth > d).sum(axis=0) / all_depth.shape[0],
            columns=[f"coverage_gt_{d}_all"],
        )
        d_data = pd.DataFrame(
            (
                all_depth.loc[
                    [i for i in all_depth.index if i not in black_mut_ids], :
                ]
                > d
            ).sum(axis=0)
            / all_depth.shape[0],
            columns=[f"coverage_gt_{d}_wo_blacklisted"],
        )
        df_depth_by_thresh = pd.concat(
            [df_depth_by_thresh, d_data, d_data_all], axis=1
        )
    df_depth_by_thresh["sample_type"] = [
        "Serenomica-Sera"
        if ("Sera" in i) and (i not in qg_samples_dot_mutations)
        else "QG-Sera"
        if (i in qg_samples_dot_mutations) and ("Sera" in i)
        else "Serenomica-Clinical"
        if i not in qg_samples_dot_mutations
        else "QG-Clinical"
        for i in df_depth_by_thresh.index
    ]
    df_depth_by_thresh
    return (df_depth_by_thresh,)


@app.cell
def _(df_depth_by_thresh, pd):
    df_depth_by_thresh_melt = pd.melt(
        df_depth_by_thresh.reset_index(), id_vars=["index", "sample_type"]
    )
    return (df_depth_by_thresh_melt,)


@app.cell
def _(df_depth_by_thresh, mo):
    ms_gt_dep_thresh = mo.ui.multiselect(options=df_depth_by_thresh.columns)
    mo.hstack([ms_gt_dep_thresh])
    return (ms_gt_dep_thresh,)


@app.cell
def _(df_depth_by_thresh_melt, ms_gt_dep_thresh, px):
    fig3 = px.box(
        df_depth_by_thresh_melt.loc[
            df_depth_by_thresh_melt["variable"].isin(ms_gt_dep_thresh.value)
        ],
        x="variable",
        y="value",
        color="sample_type",
        points="all",
        template="simple_white",
        labels={
            "variable": "",
            "value": "Prop. of supercolumn sites > threshold per sample",
        },
        hover_data=["index"],
        category_orders={
            "sample_type": sorted(df_depth_by_thresh_melt["sample_type"].unique())
        },
    )
    fig3.update_yaxes(showgrid=True)
    fig3.show()
    return


@app.cell
def _(mo):
    mo.md(
        r"""
    ### (3.5) Missed Targets

    Look into variants whose depth could only support detection at a VAF of about 1% (i.e. roughly 200 reads).
    """
    )
    return


@app.cell
def _(black_mut_ids, mutations_depth, qg_hist_mut_depth):
    # removal of blacklisted variants for this analysis
    mutations_depth_wo_black = mutations_depth.loc[
        [i for i in mutations_depth.index if i not in black_mut_ids], :
    ].copy()

    qg_mutations_depth_wo_black = qg_hist_mut_depth.loc[
        [i for i in qg_hist_mut_depth.index if i not in black_mut_ids], :
    ].copy()
    return mutations_depth_wo_black, qg_mutations_depth_wo_black


@app.cell
def _(pd):
    def get_variant_group_by_depth(df, depth_thresh, sample_prop, mode="lt"):
        # For each variant (row), compute the proportion of samples (columns)
        # whose depth is below ("lt") or above ("gt") `depth_thresh`, then
        # return both the full table and the variant IDs where that proportion
        # exceeds `sample_prop`.
        if mode == "lt":
            variants_group = pd.DataFrame(
                ((df < depth_thresh).sum(axis=1) / df.shape[1]).sort_values(
                    ascending=False
                ),
                columns=[f"prop_cov_{mode}{depth_thresh}"],
            )
        elif mode == "gt":
            variants_group = pd.DataFrame(
                ((df > depth_thresh).sum(axis=1) / df.shape[1]).sort_values(
                    ascending=False
                ),
                columns=[f"prop_cov_{mode}{depth_thresh}"],
            )
        variant_ids = list(
            variants_group.loc[
                variants_group[f"prop_cov_{mode}{depth_thresh}"] > sample_prop
            ].index
        )
        return variants_group, variant_ids

    return (get_variant_group_by_depth,)


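@app.cell
def _(get_variant_group_by_depth, pd):
    # Hedged usage sketch on toy data (added for illustration; the values and
    # variant IDs below are made up, not real samples). With mode="lt", a
    # variant is returned when more than `sample_prop` of the samples have
    # depth below `depth_thresh`.
    _toy_depth = pd.DataFrame(
        {"sample_a": [100, 5000], "sample_b": [300, 8000]},
        index=["chr1_100_A/T", "chr2_200_C/G"],
    )
    _grp, _ids = get_variant_group_by_depth(_toy_depth, 400, 0.5)
    print(_grp)
    print(_ids)  # expected: ['chr1_100_A/T'] - both toy samples are < 400 reads
    return

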
@app.cell
def _(clinicals, get_variant_group_by_depth, mutations_depth_wo_black):
    variants_lost_global_coverage, variants_lost_cov_ids = (
        get_variant_group_by_depth(
            mutations_depth_wo_black.loc[:, clinicals], 400, 0.5
        )
    )
    variants_lost_cov_ids
    return (variants_lost_cov_ids,)


@app.cell
def _(get_variant_group_by_depth, qg_mutations_depth_wo_black):
    qg_variants_lost_global_coverage, qg_variants_lost_cov_ids = (
        get_variant_group_by_depth(qg_mutations_depth_wo_black, 400, 0.5)
    )
    qg_variants_lost_cov_ids
    return (qg_variants_lost_cov_ids,)


@app.cell
def _(plt, qg_variants_lost_cov_ids, variants_lost_cov_ids, venn2):
    # pass the two sets directly so venn2 works out the region sizes itself
    venn2(
        subsets=(
            set(qg_variants_lost_cov_ids),
            set(variants_lost_cov_ids),
        ),
        set_labels=("QG", "Serenomica"),
        set_colors=("blue", "green"),
        alpha=0.5,
    )
    plt.title("Overlap of missed supercolumn targets (w/o blacklist targets)")
    plt.show()
    return


@app.cell
def _(mo):
    mo.md(
        r"""
    ### (3.6) Poorly covered for low-VAF detection

    Since at least 2 supporting reads are needed before a variant is reported, this analysis looks at variants with a read depth below 1000, the depth required to detect a VAF as low as 0.2%.
    """
    )
    return


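@app.cell
def _():
    # Worked example (added for illustration, using the 2-supporting-read
    # minimum stated above): the depth needed to detect a given VAF is roughly
    # 2 / VAF, e.g. 0.2% needs ~1000 reads - the cutoff used in the next cell.
    _min_supporting_reads = 2
    for _vaf in (0.01, 0.005, 0.002):
        print(
            f"VAF {_vaf:.1%}: required depth ~ {_min_supporting_reads / _vaf:.0f} reads"
        )
    return

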
@app.cell
def _(clinicals, get_variant_group_by_depth, mutations_depth_wo_black):
    variants_low_global_coverage, variants_low_cov_ids = (
        get_variant_group_by_depth(
            mutations_depth_wo_black.loc[:, clinicals], 1000, 0.75
        )
    )
    variants_low_cov_ids
    return (variants_low_cov_ids,)


@app.cell
def _(get_variant_group_by_depth, qg_mutations_depth_wo_black):
    qg_variants_low_global_coverage, qg_variants_low_cov_ids = (
        get_variant_group_by_depth(qg_mutations_depth_wo_black, 1000, 0.75)
    )
    qg_variants_low_cov_ids
    return (qg_variants_low_cov_ids,)


@app.cell
def _(
    plt,
    qg_variants_lost_cov_ids,
    qg_variants_low_cov_ids,
    variants_lost_cov_ids,
    variants_low_cov_ids,
    venn2,
):
    # exclude targets already counted as "missed" so the groups don't overlap
    qg_low = set(
        q for q in qg_variants_low_cov_ids if q not in qg_variants_lost_cov_ids
    )
    sere_low = set(
        v for v in variants_low_cov_ids if v not in variants_lost_cov_ids
    )
    # pass the two sets directly so venn2 works out the region sizes itself
    venn2(
        subsets=(qg_low, sere_low),
        set_labels=("QG", "Serenomica"),
        set_colors=("blue", "green"),
        alpha=0.5,
    )
    plt.title(
        "Overlap of poorly covered supercolumn targets (w/o blacklist targets)"
    )
    plt.show()
    return


@app.cell
def _(mo):
    mo.md(
        r"""
    ### (3.7) Overly covered targets

    Check whether any supercolumn targets are over-represented in the reads.
    """
    )
    return


@app.cell
def _(clinicals, mutations_depth_wo_black, pd):
    mean_cov = pd.DataFrame(
        mutations_depth_wo_black.loc[:, clinicals]
        .mean(axis=1)
        .sort_values(ascending=False),
        columns=["mean_cov"],
    )
    mean_cov
    return (mean_cov,)


@app.cell
def _(mean_cov, mo):
    slider = mo.ui.slider(
        start=100,
        stop=len(mean_cov),
        step=200,
        label="Number of variants to show",
        show_value=True,
    )
    mo.hstack([slider])
    return (slider,)


@app.cell
def _(mean_cov, px, slider):
    fig4 = px.bar(mean_cov.iloc[: slider.value, :], template="simple_white")
    fig4.show()
    return


@app.cell
def _(clinicals, get_variant_group_by_depth, mutations_depth_wo_black):
    variants_high_global_coverage, variants_high_cov_ids = (
        get_variant_group_by_depth(
            mutations_depth_wo_black.loc[:, clinicals], 10000, 0.75, mode="gt"
        )
    )
    variants_high_cov_ids
    return (variants_high_cov_ids,)


@app.cell
def _(get_variant_group_by_depth, qg_mutations_depth_wo_black):
    qg_variants_high_global_coverage, qg_variants_high_cov_ids = (
        get_variant_group_by_depth(
            qg_mutations_depth_wo_black, 10000, 0.75, mode="gt"
        )
    )
    qg_variants_high_cov_ids
    return


@app.cell
def _(sc, variants_high_cov_ids, variants_lost_cov_ids, variants_low_cov_ids):
    vars_of_interest = sc.loc[
        sc["mutation_id"].isin(
            variants_lost_cov_ids + variants_low_cov_ids + variants_high_cov_ids
        )
    ][
        [
            "chrom",
            "position",
            "mutation_id",
            "gene",
            "HGVS",
            "Consequence",
            "region_type",
            "blacklisted",
            "cancer_fda",
            "FDA_drugs",
            "cancer_clinvar",
            "cancer_rank_prevalence_combined",
        ]
    ].copy()
    vars_of_interest["coverage_issue"] = [
        "lost"
        if i in variants_lost_cov_ids
        else "poorly_covered"
        if i in variants_low_cov_ids
        else "highly_covered"
        if i in variants_high_cov_ids
        else ""
        for i in vars_of_interest["mutation_id"]
    ]
    vars_of_interest.to_csv(
        "../data/coverage_issue_variants.tsv", sep="\t", index=False
    )
    return


@app.cell
def _():
    return


if __name__ == "__main__":
    app.run()