feat: analysis completed in the eda on stats and mutations

2025-09-23 16:21:03 +02:00
parent e03ede9c28
commit 39ef0b93ea
1 changed files with 142 additions and 58 deletions
--- a/notebooks/initial_eda_lb_20250919.py
+++ b/notebooks/initial_eda_lb_20250919.py
@@ -12,10 +12,11 @@ def _():
    import pandas as pd
    import plotly.express as px
    import plotly.graph_objects as go
-    import matplotlib as plt
+    import matplotlib.pyplot as plt
    import seaborn as sns
    import yaml
-    return itertools, mo, os, pd, px, yaml
+    from matplotlib_venn import venn2
+    return itertools, mo, os, pd, plt, px, venn2, yaml


@app.cell
@@ -254,6 +255,9 @@ def _(multiselect, px, stats_melt):
        template="simple_white",
        labels={"variable": "", "value": "Value from stats file"},
        hover_data=["index", "source"],
+        category_orders={
+            "sample_type": sorted(stats_melt["sample_type"].unique())
+        },
    )
    fig.update_yaxes(showgrid=True)
    fig.show()
@@ -271,8 +275,8 @@ def _(mo):
    - Always/majority called - potentially suspicious ✅
    - Never called ✅
    - High VAF targets ✅
-    - Poorly covered targets
-    - Overly covered targets
+    - Poorly covered targets ✅
+    - Overly covered targets ✅
    """
    )
    return
@@ -384,7 +388,7 @@ def _(all_vaf, black_mut_ids, pd, qg_samples_dot_mutations):
        if ("Sera" in i) and (i not in qg_samples_dot_mutations)
        else "QG-Sera"
        if (i in qg_samples_dot_mutations) and ("Sera" in i)
-        else "Serenomical-Clinincal"
+        else "Serenomica-Clinincal"
        if i not in qg_samples_dot_mutations
        else "QG-Clinical"
        for i in df_mut_per_sample_by_thresh.index
@@ -421,6 +425,11 @@ def _(df_mut_per_sample_by_thresh_melt, ms_gt_thresh, px):
        template="simple_white",
        labels={"variable": "", "value": "Variants per sample"},
        hover_data=["index"],
+        category_orders={
+            "sample_type": sorted(
+                df_mut_per_sample_by_thresh_melt["sample_type"].unique()
+            )
+        },
    )
    fig2.update_yaxes(showgrid=True)
    fig2.show()
@@ -525,7 +534,7 @@ def _(
        if ("Sera" in i) and (i not in qg_samples_dot_mutations)
        else "QG-Sera"
        if (i in qg_samples_dot_mutations) and ("Sera" in i)
-        else "Serenomical-Clinincal"
+        else "Serenomica-Clinincal"
        if i not in qg_samples_dot_mutations
        else "QG-Clinical"
        for i in df_mut_per_sample_by_thresh.index
@@ -562,6 +571,9 @@ def _(df_depth_by_thresh_melt, ms_gt_dep_thresh, px):
        template="simple_white",
        labels={"variable": "", "value": "Prop of sc sites > threhold per sample"},
        hover_data=["index"],
+        category_orders={
+            "sample_type": sorted(df_depth_by_thresh_melt["sample_type"].unique())
+        },
    )
    fig3.update_yaxes(showgrid=True)
    fig3.show()
@@ -581,45 +593,85 @@ def _(mo):


@app.cell
-def _(black_mut_ids, mutations_depth):
+def _(black_mut_ids, mutations_depth, qg_hist_mut_depth):
    # removal of blacklisted variants for this analysis
    mutations_depth_wo_black = mutations_depth.loc[
        [i for i in mutations_depth.index if i not in black_mut_ids], :
    ].copy()
-    return (mutations_depth_wo_black,)
+
+    qg_mutations_depth_wo_black = qg_hist_mut_depth.loc[
+        [i for i in qg_hist_mut_depth.index if i not in black_mut_ids], :
+    ].copy()
+    return mutations_depth_wo_black, qg_mutations_depth_wo_black


@app.cell
-def _(clinicals, mutations_depth_wo_black, pd):
-    # missed non-blacklisted variants where theoretical detection only >1% (i.e. 2 reads in 200 supporting variant)
-    variants_lost_global_coverage = pd.DataFrame(
-        (
-            (
-                mutations_depth_wo_black.loc[
-                    :,
-                    clinicals,
-                ]
-                < 200
-            ).sum(axis=1)
-            / len(clinicals)
-        ).sort_values(ascending=False),
-        columns=["prop_cov_lt200"],
-    )
-    variants_lost_global_coverage
-    return (variants_lost_global_coverage,)
+def _(pd):
+    def get_variant_group_by_depth(df, depth_thresh, sample_prop, mode="lt"):
+        if mode == "lt":
+            variants_group = pd.DataFrame(
+                ((df < depth_thresh).sum(axis=1) / df.shape[1]).sort_values(
+                    ascending=False
+                ),
+                columns=[f"prop_cov_{mode}{depth_thresh}"],
+            )
+        elif mode == "gt":
+            variants_group = pd.DataFrame(
+                ((df > depth_thresh).sum(axis=1) / df.shape[1]).sort_values(
+                    ascending=False
+                ),
+                columns=[f"prop_cov_{mode}{depth_thresh}"],
+            )
+        variant_ids = list(
+            variants_group.loc[
+                variants_group[f"prop_cov_{mode}{depth_thresh}"] > sample_prop
+            ].index
+        )
+        return variants_group, variant_ids
+    return (get_variant_group_by_depth,)


@app.cell
-def _(variants_lost_global_coverage):
-    variants_lost_cov_ids = list(
-        variants_lost_global_coverage.loc[
-            variants_lost_global_coverage["prop_cov_lt200"] > 0.5
-        ].index
+def _(clinicals, get_variant_group_by_depth, mutations_depth_wo_black):
+    variants_lost_global_coverage, variants_lost_cov_ids = (
+        get_variant_group_by_depth(
+            mutations_depth_wo_black.loc[:, clinicals], 400, 0.5
+        )
    )
    variants_lost_cov_ids
    return (variants_lost_cov_ids,)


+@app.cell
+def _(get_variant_group_by_depth, qg_mutations_depth_wo_black):
+    qg_variants_lost_global_coverage, qg_variants_lost_cov_ids = (
+        get_variant_group_by_depth(qg_mutations_depth_wo_black, 400, 0.5)
+    )
+    qg_variants_lost_cov_ids
+    return (qg_variants_lost_cov_ids,)
+
+
+@app.cell
+def _(plt, qg_variants_lost_cov_ids, variants_lost_cov_ids, venn2):
+    venn2(
+        subsets=(
+            len(qg_variants_lost_cov_ids),
+            len(
+                set(qg_variants_lost_cov_ids).intersection(
+                    set(variants_lost_cov_ids)
+                )
+            ),
+            len(set(variants_lost_cov_ids)),
+        ),
+        set_labels=("QG", "Serenomica"),
+        set_colors=("blue", "green"),
+        alpha=0.5,
+    )
+    plt.title("Overlap of missed supercolumn targets (w/o blacklist targets)")
+    plt.show()
+    return
+
+
@app.cell
 def _(mo):
    mo.md(
@@ -633,26 +685,54 @@ def _(mo):


@app.cell
-def _(clinicals, mutations_depth_wo_black, pd):
-    # variants that always fail a theoretical report coverage >= 0.2% VAF (i.e. needs 2 reads out of at least 1000)
-    variants_low_global_coverage = pd.DataFrame(
-        (
-            (mutations_depth_wo_black.loc[:, clinicals] < 1000).sum(axis=1)
-            / len(clinicals)
-        ).sort_values(ascending=False),
-        columns=["pro_cov_lt1000"],
-    )
-
-    variants_low_cov_ids = list(
-        variants_low_global_coverage.loc[
-            variants_low_global_coverage["pro_cov_lt1000"] > 0.75
-        ].index
+def _(clinicals, get_variant_group_by_depth, mutations_depth_wo_black):
+    variants_low_global_coverage, variants_low_cov_ids = (
+        get_variant_group_by_depth(
+            mutations_depth_wo_black.loc[:, clinicals], 1000, 0.75
+        )
    )
+    variants_low_cov_ids
    return (variants_low_cov_ids,)


@app.cell
-def _():
+def _(get_variant_group_by_depth, qg_mutations_depth_wo_black):
+    qg_variants_low_global_coverage, qg_variants_low_cov_ids = (
+        get_variant_group_by_depth(qg_mutations_depth_wo_black, 1000, 0.75)
+    )
+    qg_variants_low_cov_ids
+    return (qg_variants_low_cov_ids,)
+
+
+@app.cell
+def _(
+    plt,
+    qg_variants_lost_cov_ids,
+    qg_variants_low_cov_ids,
+    variants_lost_cov_ids,
+    variants_low_cov_ids,
+    venn2,
+):
+    qg_low = set(
+        [q for q in qg_variants_low_cov_ids if q not in qg_variants_lost_cov_ids]
+    )
+    sere_low = set(
+        [v for v in variants_low_cov_ids if v not in variants_lost_cov_ids]
+    )
+    venn2(
+        subsets=(
+            len(qg_low),
+            len(qg_low.intersection(set(sere_low))),
+            len(sere_low),
+        ),
+        set_labels=("QG", "Serenomica"),
+        set_colors=("blue", "green"),
+        alpha=0.5,
+    )
+    plt.title(
+        "Overlap of lowly covered supercolumn targets (w/o blacklist targets)"
+    )
+    plt.show()
    return


@@ -701,23 +781,27 @@ def _(mean_cov, px, slider):


@app.cell
-def _(clinicals, mutations_depth_wo_black, pd):
-    variants_high_global_coverage = pd.DataFrame(
-        (
-            (mutations_depth_wo_black.loc[:, clinicals] > 10000).sum(axis=1)
-            / len(clinicals)
-        ).sort_values(ascending=False),
-        columns=["pro_cov_gt10000"],
-    )
-
-    variants_high_cov_ids = list(
-        variants_high_global_coverage.loc[
-            variants_high_global_coverage["pro_cov_gt10000"] > 0.75
-        ].index
+def _(clinicals, get_variant_group_by_depth, mutations_depth_wo_black):
+    variants_high_global_coverage, variants_high_cov_ids = (
+        get_variant_group_by_depth(
+            mutations_depth_wo_black.loc[:, clinicals], 10000, 0.75, mode="gt"
+        )
    )
+    variants_high_cov_ids
    return (variants_high_cov_ids,)


+@app.cell
+def _(get_variant_group_by_depth, qg_mutations_depth_wo_black):
+    qg_variants_high_global_coverage, qg_variants_high_cov_ids = (
+        get_variant_group_by_depth(
+            qg_mutations_depth_wo_black, 10000, 0.75, mode="gt"
+        )
+    )
+    qg_variants_high_cov_ids
+    return
+
+
@app.cell
 def _(sc, variants_high_cov_ids, variants_lost_cov_ids, variants_low_cov_ids):
    vars_of_interest = sc.loc[