feat: analysis completed in the eda on stats and mutations

2025-09-23 16:21:03 +02:00
parent e03ede9c28
commit 39ef0b93ea
1 changed files with 142 additions and 58 deletions
--- a/notebooks/initial_eda_lb_20250919.py
+++ b/notebooks/initial_eda_lb_20250919.py
@@ -12,10 +12,11 @@ def _():
    import pandas as pd
    import plotly.express as px
    import plotly.graph_objects as go
-    import matplotlib as plt
+    import matplotlib.pyplot as plt
    import seaborn as sns
    import yaml
-    return itertools, mo, os, pd, px, yaml
+    from matplotlib_venn import venn2
    return itertools, mo, os, pd, plt, px, venn2, yaml
@app.cell
@@ -254,6 +255,9 @@ def _(multiselect, px, stats_melt):
        template="simple_white",
        labels={"variable": "", "value": "Value from stats file"},
        hover_data=["index", "source"],
        category_orders={
            "sample_type": sorted(stats_melt["sample_type"].unique())
        },
    )
    fig.update_yaxes(showgrid=True)
    fig.show()
@@ -271,8 +275,8 @@ def _(mo):
    - Always/majority called - potentially suspicious ✅
    - Never called ✅
    - High VAF targets ✅
-    - Poorly covered targets
+    - Poorly covered targets ✅
-    - Overly covered targets
+    - Overly covered targets ✅
    """
    )
    return
@@ -384,7 +388,7 @@ def _(all_vaf, black_mut_ids, pd, qg_samples_dot_mutations):
        if ("Sera" in i) and (i not in qg_samples_dot_mutations)
        else "QG-Sera"
        if (i in qg_samples_dot_mutations) and ("Sera" in i)
-        else "Serenomical-Clinincal"
+        else "Serenomica-Clinincal"
        if i not in qg_samples_dot_mutations
        else "QG-Clinical"
        for i in df_mut_per_sample_by_thresh.index
@@ -421,6 +425,11 @@ def _(df_mut_per_sample_by_thresh_melt, ms_gt_thresh, px):
        template="simple_white",
        labels={"variable": "", "value": "Variants per sample"},
        hover_data=["index"],
        category_orders={
            "sample_type": sorted(
                df_mut_per_sample_by_thresh_melt["sample_type"].unique()
            )
        },
    )
    fig2.update_yaxes(showgrid=True)
    fig2.show()
@@ -525,7 +534,7 @@ def _(
        if ("Sera" in i) and (i not in qg_samples_dot_mutations)
        else "QG-Sera"
        if (i in qg_samples_dot_mutations) and ("Sera" in i)
-        else "Serenomical-Clinincal"
+        else "Serenomica-Clinincal"
        if i not in qg_samples_dot_mutations
        else "QG-Clinical"
        for i in df_mut_per_sample_by_thresh.index
@@ -562,6 +571,9 @@ def _(df_depth_by_thresh_melt, ms_gt_dep_thresh, px):
        template="simple_white",
        labels={"variable": "", "value": "Prop of sc sites > threhold per sample"},
        hover_data=["index"],
        category_orders={
            "sample_type": sorted(df_depth_by_thresh_melt["sample_type"].unique())
        },
    )
    fig3.update_yaxes(showgrid=True)
    fig3.show()
@@ -581,45 +593,85 @@ def _(mo):
@app.cell
def _(black_mut_ids, mutations_depth, qg_hist_mut_depth):
    # Removal of blacklisted variants for this analysis, applied the same way
    # to both depth tables.
    def _without_blacklisted(depth_table):
        # A boolean mask does the membership test in one vectorised call and
        # keeps every surviving row exactly once, in its original order.
        _keep = ~depth_table.index.isin(black_mut_ids)
        return depth_table.loc[_keep, :].copy()

    mutations_depth_wo_black = _without_blacklisted(mutations_depth)
    qg_mutations_depth_wo_black = _without_blacklisted(qg_hist_mut_depth)
    return mutations_depth_wo_black, qg_mutations_depth_wo_black
@app.cell
def _(pd):
    def get_variant_group_by_depth(df, depth_thresh, sample_prop, mode="lt"):
        """Score each row by how often it crosses a depth threshold.

        Args:
            df: DataFrame of numeric values. Every row is scored across all of
                its columns (presumably rows are targets and columns are
                samples; confirm against the callers).
            depth_thresh: Value each cell is compared against.
            sample_prop: A row is selected when its proportion of matching
                columns is strictly greater than this value.
            mode: "lt" counts cells strictly below ``depth_thresh``; "gt"
                counts cells strictly above it.

        Returns:
            A tuple ``(variants_group, variant_ids)``. ``variants_group`` is a
            one-column DataFrame named ``prop_cov_{mode}{depth_thresh}`` with
            the per-row proportion, sorted in descending order.
            ``variant_ids`` is the list of index labels whose proportion
            exceeds ``sample_prop``.

        Raises:
            ValueError: If ``mode`` is neither "lt" nor "gt".
        """
        if mode == "lt":
            crossed = df < depth_thresh
        elif mode == "gt":
            crossed = df > depth_thresh
        else:
            # Fail with a clear message instead of hitting an unbound local
            # further down.
            raise ValueError(f"mode must be 'lt' or 'gt', got {mode!r}")

        column = f"prop_cov_{mode}{depth_thresh}"
        variants_group = pd.DataFrame(
            (crossed.sum(axis=1) / df.shape[1]).sort_values(ascending=False),
            columns=[column],
        )
        variant_ids = list(
            variants_group.loc[variants_group[column] > sample_prop].index
        )
        return variants_group, variant_ids
    return (get_variant_group_by_depth,)
@app.cell
def _(clinicals, get_variant_group_by_depth, mutations_depth_wo_black):
    # Rows of the blacklist-filtered depth table whose value is below 400 in
    # more than half of the `clinicals` columns.
    # NOTE(review): the pre-refactor version of this cell used a threshold of
    # 200 (2 supporting reads in 200 = 1% VAF) - confirm 400 is intended.
    variants_lost_global_coverage, variants_lost_cov_ids = (
        get_variant_group_by_depth(
            mutations_depth_wo_black.loc[:, clinicals], 400, 0.5
        )
    )
    variants_lost_cov_ids
    return (variants_lost_cov_ids,)
@app.cell
def _(get_variant_group_by_depth, qg_mutations_depth_wo_black):
    # Same lost-coverage selection as for the clinical samples, run on the QG
    # depth table: value below 400 in more than half of the columns.
    _qg_lost = get_variant_group_by_depth(qg_mutations_depth_wo_black, 400, 0.5)
    qg_variants_lost_global_coverage, qg_variants_lost_cov_ids = _qg_lost
    qg_variants_lost_cov_ids
    return (qg_variants_lost_cov_ids,)
@app.cell
def _(plt, qg_variants_lost_cov_ids, variants_lost_cov_ids, venn2):
    # Compare the lost-coverage ID lists of the two data sets.
    _qg_lost = set(qg_variants_lost_cov_ids)
    _sere_lost = set(variants_lost_cov_ids)
    venn2(
        # venn2 reads a 3-tuple as (left only, right only, shared), so the
        # exclusive region sizes are passed, not the full set sizes.
        subsets=(
            len(_qg_lost - _sere_lost),
            len(_sere_lost - _qg_lost),
            len(_qg_lost & _sere_lost),
        ),
        set_labels=("QG", "Serenomica"),
        set_colors=("blue", "green"),
        alpha=0.5,
    )
    plt.title("Overlap of missed supercolumn targets (w/o blacklist targets)")
    plt.show()
    return
@app.cell
 def _(mo):
    mo.md(
@@ -633,26 +685,54 @@ def _(mo):
@app.cell
def _(clinicals, get_variant_group_by_depth, mutations_depth_wo_black):
    # Rows of the blacklist-filtered depth table whose value is below 1000 in
    # more than 75% of the `clinicals` columns. 1000 is the depth at which
    # 2 supporting reads correspond to a 0.2% VAF.
    variants_low_global_coverage, variants_low_cov_ids = (
        get_variant_group_by_depth(
            mutations_depth_wo_black.loc[:, clinicals], 1000, 0.75
        )
    )
    variants_low_cov_ids
    return (variants_low_cov_ids,)
@app.cell
def _(get_variant_group_by_depth, qg_mutations_depth_wo_black):
    # Low-coverage selection on the QG depth table: value below 1000 in more
    # than 75% of the columns.
    qg_variants_low_global_coverage, qg_variants_low_cov_ids = (
        get_variant_group_by_depth(qg_mutations_depth_wo_black, 1000, 0.75)
    )
    qg_variants_low_cov_ids
    return (qg_variants_low_cov_ids,)
@app.cell
def _(
    plt,
    qg_variants_lost_cov_ids,
    qg_variants_low_cov_ids,
    variants_lost_cov_ids,
    variants_low_cov_ids,
    venn2,
):
    # "Low" excludes IDs already reported as lost, so the two plots do not
    # count the same target twice.
    qg_low = set(qg_variants_low_cov_ids) - set(qg_variants_lost_cov_ids)
    sere_low = set(variants_low_cov_ids) - set(variants_lost_cov_ids)
    venn2(
        # venn2 reads a 3-tuple as (left only, right only, shared), so the
        # exclusive region sizes are passed, not the full set sizes.
        subsets=(
            len(qg_low - sere_low),
            len(sere_low - qg_low),
            len(qg_low & sere_low),
        ),
        set_labels=("QG", "Serenomica"),
        set_colors=("blue", "green"),
        alpha=0.5,
    )
    plt.title(
        "Overlap of lowly covered supercolumn targets (w/o blacklist targets)"
    )
    plt.show()
    return
@@ -701,23 +781,27 @@ def _(mean_cov, px, slider):
@app.cell
def _(clinicals, get_variant_group_by_depth, mutations_depth_wo_black):
    # Rows of the blacklist-filtered depth table whose value is above 10000 in
    # more than 75% of the `clinicals` columns.
    variants_high_global_coverage, variants_high_cov_ids = (
        get_variant_group_by_depth(
            mutations_depth_wo_black.loc[:, clinicals], 10000, 0.75, mode="gt"
        )
    )
    variants_high_cov_ids
    return (variants_high_cov_ids,)
@app.cell
def _(get_variant_group_by_depth, qg_mutations_depth_wo_black):
    # High-coverage selection on the QG depth table: value above 10000 in more
    # than 75% of the columns.
    _qg_high = get_variant_group_by_depth(
        qg_mutations_depth_wo_black, 10000, 0.75, mode="gt"
    )
    qg_variants_high_global_coverage, qg_variants_high_cov_ids = _qg_high
    # Nothing is returned from this cell; the ID list is only evaluated here.
    qg_variants_high_cov_ids
    return
@app.cell
 def _(sc, variants_high_cov_ids, variants_lost_cov_ids, variants_low_cov_ids):
    vars_of_interest = sc.loc[