diff --git a/notebooks/initial_eda_lb_20250919.py b/notebooks/initial_eda_lb_20250919.py index e9ab295..04028a1 100644 --- a/notebooks/initial_eda_lb_20250919.py +++ b/notebooks/initial_eda_lb_20250919.py @@ -12,10 +12,11 @@ def _(): import pandas as pd import plotly.express as px import plotly.graph_objects as go - import matplotlib as plt + import matplotlib.pyplot as plt import seaborn as sns import yaml - return itertools, mo, os, pd, px, yaml + from matplotlib_venn import venn2 + return itertools, mo, os, pd, plt, px, venn2, yaml @app.cell @@ -254,6 +255,9 @@ def _(multiselect, px, stats_melt): template="simple_white", labels={"variable": "", "value": "Value from stats file"}, hover_data=["index", "source"], + category_orders={ + "sample_type": sorted(stats_melt["sample_type"].unique()) + }, ) fig.update_yaxes(showgrid=True) fig.show() @@ -271,8 +275,8 @@ def _(mo): - Always/majority called - potentially suspicious ✅ - Never called ✅ - High VAF targets ✅ - - Poorly covered targets - - Overly covered targets + - Poorly covered targets ✅ + - Overly covered targets ✅ """ ) return @@ -384,7 +388,7 @@ def _(all_vaf, black_mut_ids, pd, qg_samples_dot_mutations): if ("Sera" in i) and (i not in qg_samples_dot_mutations) else "QG-Sera" if (i in qg_samples_dot_mutations) and ("Sera" in i) - else "Serenomical-Clinincal" + else "Serenomica-Clinincal" if i not in qg_samples_dot_mutations else "QG-Clinical" for i in df_mut_per_sample_by_thresh.index @@ -421,6 +425,11 @@ def _(df_mut_per_sample_by_thresh_melt, ms_gt_thresh, px): template="simple_white", labels={"variable": "", "value": "Variants per sample"}, hover_data=["index"], + category_orders={ + "sample_type": sorted( + df_mut_per_sample_by_thresh_melt["sample_type"].unique() + ) + }, ) fig2.update_yaxes(showgrid=True) fig2.show() @@ -525,7 +534,7 @@ def _( if ("Sera" in i) and (i not in qg_samples_dot_mutations) else "QG-Sera" if (i in qg_samples_dot_mutations) and ("Sera" in i) - else "Serenomical-Clinincal" + else "Serenomica-Clinincal" if i not in qg_samples_dot_mutations else "QG-Clinical" for i in df_mut_per_sample_by_thresh.index @@ -562,6 +571,9 @@ def _(df_depth_by_thresh_melt, ms_gt_dep_thresh, px): template="simple_white", labels={"variable": "", "value": "Prop of sc sites > threhold per sample"}, hover_data=["index"], + category_orders={ + "sample_type": sorted(df_depth_by_thresh_melt["sample_type"].unique()) + }, ) fig3.update_yaxes(showgrid=True) fig3.show() @@ -581,45 +593,85 @@ def _(mo): @app.cell -def _(black_mut_ids, mutations_depth): +def _(black_mut_ids, mutations_depth, qg_hist_mut_depth): # removal of blacklisted variants for this analysis mutations_depth_wo_black = mutations_depth.loc[ [i for i in mutations_depth.index if i not in black_mut_ids], : ].copy() - return (mutations_depth_wo_black,) + + qg_mutations_depth_wo_black = qg_hist_mut_depth.loc[ + [i for i in qg_hist_mut_depth.index if i not in black_mut_ids], : + ].copy() + return mutations_depth_wo_black, qg_mutations_depth_wo_black @app.cell -def _(clinicals, mutations_depth_wo_black, pd): - # missed non-blacklisted variants where theoretical detection only >1% (i.e. 2 reads in 200 supporting variant) - variants_lost_global_coverage = pd.DataFrame( - ( - ( - mutations_depth_wo_black.loc[ - :, - clinicals, - ] - < 200 - ).sum(axis=1) - / len(clinicals) - ).sort_values(ascending=False), - columns=["prop_cov_lt200"], - ) - variants_lost_global_coverage - return (variants_lost_global_coverage,) +def _(pd): + def get_variant_group_by_depth(df, depth_thresh, sample_prop, mode="lt"): + if mode == "lt": + variants_group = pd.DataFrame( + ((df < depth_thresh).sum(axis=1) / df.shape[1]).sort_values( + ascending=False + ), + columns=[f"prop_cov_{mode}{depth_thresh}"], + ) + elif mode == "gt": + variants_group = pd.DataFrame( + ((df > depth_thresh).sum(axis=1) / df.shape[1]).sort_values( + ascending=False + ), + columns=[f"prop_cov_{mode}{depth_thresh}"], + ) + variant_ids = list( + variants_group.loc[ + variants_group[f"prop_cov_{mode}{depth_thresh}"] > sample_prop + ].index + ) + return variants_group, variant_ids + return (get_variant_group_by_depth,) @app.cell -def _(variants_lost_global_coverage): - variants_lost_cov_ids = list( - variants_lost_global_coverage.loc[ - variants_lost_global_coverage["prop_cov_lt200"] > 0.5 - ].index +def _(clinicals, get_variant_group_by_depth, mutations_depth_wo_black): + variants_lost_global_coverage, variants_lost_cov_ids = ( + get_variant_group_by_depth( + mutations_depth_wo_black.loc[:, clinicals], 400, 0.5 + ) ) variants_lost_cov_ids return (variants_lost_cov_ids,) +@app.cell +def _(get_variant_group_by_depth, qg_mutations_depth_wo_black): + qg_variants_lost_global_coverage, qg_variants_lost_cov_ids = ( + get_variant_group_by_depth(qg_mutations_depth_wo_black, 400, 0.5) + ) + qg_variants_lost_cov_ids + return (qg_variants_lost_cov_ids,) + + +@app.cell +def _(plt, qg_variants_lost_cov_ids, variants_lost_cov_ids, venn2): + venn2( + subsets=( + len(qg_variants_lost_cov_ids), + len( + set(qg_variants_lost_cov_ids).intersection( + set(variants_lost_cov_ids) + ) + ), + len(set(variants_lost_cov_ids)), + ), + set_labels=("QG", "Serenomica"), + set_colors=("blue", "green"), + alpha=0.5, + ) + plt.title("Overlap of missed supercolumn targets (w/o blacklist targets)") + plt.show() + return + + @app.cell def _(mo): mo.md( @@ -633,26 +685,54 @@ def _(mo): @app.cell -def _(clinicals, mutations_depth_wo_black, pd): - # variants that always fail a theoretical report coverage >= 0.2% VAF (i.e. needs 2 reads out of at least 1000) - variants_low_global_coverage = pd.DataFrame( - ( - (mutations_depth_wo_black.loc[:, clinicals] < 1000).sum(axis=1) - / len(clinicals) - ).sort_values(ascending=False), - columns=["pro_cov_lt1000"], - ) - - variants_low_cov_ids = list( - variants_low_global_coverage.loc[ - variants_low_global_coverage["pro_cov_lt1000"] > 0.75 - ].index +def _(clinicals, get_variant_group_by_depth, mutations_depth_wo_black): + variants_low_global_coverage, variants_low_cov_ids = ( + get_variant_group_by_depth( + mutations_depth_wo_black.loc[:, clinicals], 1000, 0.75 + ) ) + variants_low_cov_ids return (variants_low_cov_ids,) @app.cell -def _(): +def _(get_variant_group_by_depth, qg_mutations_depth_wo_black): + qg_variants_low_global_coverage, qg_variants_low_cov_ids = ( + get_variant_group_by_depth(qg_mutations_depth_wo_black, 1000, 0.75) + ) + qg_variants_low_cov_ids + return (qg_variants_low_cov_ids,) + + +@app.cell +def _( + plt, + qg_variants_lost_cov_ids, + qg_variants_low_cov_ids, + variants_lost_cov_ids, + variants_low_cov_ids, + venn2, +): + qg_low = set( + [q for q in qg_variants_low_cov_ids if q not in qg_variants_lost_cov_ids] + ) + sere_low = set( + [v for v in variants_low_cov_ids if v not in variants_lost_cov_ids] + ) + venn2( + subsets=( + len(qg_low), + len(qg_low.intersection(set(sere_low))), + len(sere_low), + ), + set_labels=("QG", "Serenomica"), + set_colors=("blue", "green"), + alpha=0.5, + ) + plt.title( + "Overlap of lowly covered supercolumn targets (w/o blacklist targets)" + ) + plt.show() return @@ -701,23 +781,27 @@ def _(mean_cov, px, slider): @app.cell -def _(clinicals, mutations_depth_wo_black, pd): - variants_high_global_coverage = pd.DataFrame( - ( - (mutations_depth_wo_black.loc[:, clinicals] > 10000).sum(axis=1) - / len(clinicals) - ).sort_values(ascending=False), - columns=["pro_cov_gt10000"], - ) - - variants_high_cov_ids = list( - variants_high_global_coverage.loc[ - variants_high_global_coverage["pro_cov_gt10000"] > 0.75 - ].index +def _(clinicals, get_variant_group_by_depth, mutations_depth_wo_black): + variants_high_global_coverage, variants_high_cov_ids = ( + get_variant_group_by_depth( + mutations_depth_wo_black.loc[:, clinicals], 10000, 0.75, mode="gt" + ) ) + variants_high_cov_ids return (variants_high_cov_ids,) +@app.cell +def _(get_variant_group_by_depth, qg_mutations_depth_wo_black): + qg_variants_high_global_coverage, qg_variants_high_cov_ids = ( + get_variant_group_by_depth( + qg_mutations_depth_wo_black, 10000, 0.75, mode="gt" + ) + ) + qg_variants_high_cov_ids + return + + @app.cell def _(sc, variants_high_cov_ids, variants_lost_cov_ids, variants_low_cov_ids): vars_of_interest = sc.loc[