From ab15df85adfd25f53ef33f5863439fa7a172ba05 Mon Sep 17 00:00:00 2001
From: Darren Wight <darren@serenomica.com>
Date: Sun, 21 Sep 2025 22:26:09 +0200
Subject: [PATCH] feat: added code for eda with marimo

---
 notebooks/initial_eda_lb_20250919.py | 234 +++++++++++++++++++++++++++
 1 file changed, 234 insertions(+)
 create mode 100644 notebooks/initial_eda_lb_20250919.py

diff --git a/notebooks/initial_eda_lb_20250919.py b/notebooks/initial_eda_lb_20250919.py
new file mode 100644
index 0000000..a741d34
--- /dev/null
+++ b/notebooks/initial_eda_lb_20250919.py
@@ -0,0 +1,234 @@
+import marimo
+
+__generated_with = "0.14.16"
+app = marimo.App(width="medium")
+
+
+@app.cell
+def _():
+    import os
+    import itertools
+    import marimo as mo
+    import pandas as pd
+    import plotly.express as px
+    import plotly.graph_objects as go
+    import matplotlib as plt
+    import seaborn as sns
+    import yaml
+    return itertools, mo, os, pd, px, yaml
+
+
+@app.cell
+def _(mo):
+    mo.md(
+        r"""
+    # Initial EDA of the Twist LB data (19-Sep-2025)
+
+    So far no analysis have been done on the data from the LB assay. This analysis will take the first look.
+
+    Analysis ideas:
+
+    - Stats data analysis - box plots and outliers
+    - Mutations - agg stats -> variants called, coverage of called variants in supercol, common called variants, unique variants
+    - vcfs - agg stats -> variants called, coverage of called variants in supercol, common called variants
+    """
+    )
+    return
+
+
+@app.cell
+def _():
+    se1_dir = "/home/darren/Documents/4_data/3_internal/2_lb/se1-prd-2.1.1/"
+    se2_dir = "/home/darren/Documents/4_data/3_internal/2_lb/se2-lb-1/"
+
+    qc_hist_dir = (
+        "/home/darren/Documents/2_repos/serenomica/flowcell_qc/historical_data/"
+    )
+    return qc_hist_dir, se1_dir, se2_dir
+
+
+@app.cell
+def _(itertools, os):
+    def return_paths(suffix, *dirs):
+        return list(
+            itertools.chain.from_iterable(
+                [
+                    [
+                        os.path.join(d, f)
+                        for f in os.listdir(d)
+                        if f.endswith(suffix)
+                    ]
+                    for d in dirs
+                ]
+            )
+        )
+    return (return_paths,)
+
+
+@app.cell
+def _(return_paths, se1_dir, se2_dir):
+    stats_fpaths = return_paths(".stats", se1_dir, se2_dir)
+    print(len(stats_fpaths))
+    mutations_fpaths = return_paths(".mutations", se1_dir, se2_dir)
+    print(len(mutations_fpaths))
+    return mutations_fpaths, stats_fpaths
+
+
+@app.cell
+def _(pd, stats_fpaths, yaml):
+    stats = pd.DataFrame()
+    for sf in stats_fpaths:
+        with open(sf, "r") as handle:
+            stats = pd.concat(
+                [
+                    stats,
+                    pd.json_normalize(yaml.safe_load(handle), sep=" / ").rename(
+                        {0: sf.split("/")[-1].strip(".stats")}, axis=0
+                    ),
+                ],
+                axis=0,
+            )
+    stats["sample_type"] = [
+        "Serenomica-Sera" if "Sera" in i else "Serenomica-Clinical"
+        for i in stats.index
+    ]
+    stats["source"] = "Serenomica"
+    stats = stats.drop(
+        ["miscellaneous / stats_file_version", "miscellaneous / target_panel_bed"],
+        axis=1,
+    )
+    return (stats,)
+
+
+@app.cell
+def _(mutations_fpaths, pd):
+    cols = []
+    mutations_vaf, mutations_counts, mutations_depth = (
+        pd.DataFrame(),
+        pd.DataFrame(),
+        pd.DataFrame(),
+    )
+    for mf in mutations_fpaths:
+        tmp = pd.read_csv(mf, sep="\t", header=None)
+        tmp["mutation_id"] = tmp.apply(
+            lambda x: "_".join([str(x[0]), str(x[1]), str(x[2])]), axis=1
+        )
+        tmp.set_index("mutation_id", inplace=True)
+
+        cols.append(mf.split("/")[-1].strip(".mutations"))
+        mutations_vaf = pd.concat([mutations_vaf, tmp[3]], axis=1)
+        mutations_counts = pd.concat([mutations_counts, tmp[4]], axis=1)
+        mutations_depth = pd.concat([mutations_depth, tmp[5]], axis=1)
+
+    mutations_vaf.columns = cols
+    mutations_counts.columns = cols
+    mutations_depth.columns = cols
+
+    mutations_vaf
+    return (mutations_vaf,)
+
+
+@app.cell
+def _(os, pd, qc_hist_dir):
+    # QG historical data
+    qg_hist_stats = pd.read_csv(
+        os.path.join(qc_hist_dir, "historical_data_LB.csv"), index_col=0
+    )
+    qg_hist_stats.columns = [i.replace("|", " / ") for i in qg_hist_stats.columns]
+    qg_hist_stats = qg_hist_stats.drop(["batch"], axis=1)
+    qg_hist_stats["sample_type"] = [
+        "QG-Sera" if "Sera" in i else "QG-Clinical" for i in qg_hist_stats.index
+    ]
+    qg_hist_stats["source"] = "QG"
+    qg_hist_stats
+    return (qg_hist_stats,)
+
+
+@app.cell
+def _(pd, qg_hist_stats, stats):
+    # ensure columns match
+    assert set(qg_hist_stats.columns) == set(stats.columns)
+
+    all_stats = pd.concat([stats, qg_hist_stats], axis=0)
+    all_stats.head()
+    return (all_stats,)
+
+
+@app.cell
+def _(mo):
+    mo.md(r"""## (1) Stats""")
+    return
+
+
+@app.cell
+def _(all_stats, pd):
+    stats_melt = pd.melt(
+        all_stats.reset_index(), id_vars=["index", "sample_type", "source"]
+    )
+    stats_melt.head()
+    return (stats_melt,)
+
+
+@app.cell
+def _(mo, stats):
+    multiselect = mo.ui.multiselect(options=stats.columns)
+    mo.hstack([multiselect])
+    return (multiselect,)
+
+
+@app.cell
+def _(multiselect, px, stats_melt):
+    fig = px.box(
+        stats_melt.loc[stats_melt["variable"].isin(multiselect.value)],
+        x="variable",
+        y="value",
+        color="sample_type",
+        points="all",
+        template="simple_white",
+        labels={"variable": "", "value": "Value from stats file"},
+        hover_data=["index", "source"],
+    )
+    fig.update_yaxes(showgrid=True)
+    fig.show()
+    return
+
+
+@app.cell
+def _(mo):
+    mo.md(
+        r"""
+    ## (2) .mutations Files
+
+    Analysis ideas💡
+
+    - Always/majority called - potentially suspicious
+    - Never called
+    - High VAF targets
+    - Poorly covered targets
+    - Overly covered targets
+    """
+    )
+    return
+
+
+@app.cell
+def _(mutations_vaf):
+    ((mutations_vaf == 0).sum(axis=1) / len(mutations_vaf.columns)).sort_values(
+        ascending=False
+    )
+    return
+
+
+@app.cell
+def _(mutations_vaf):
+    mutations_vaf.shape
+    return
+
+
+@app.cell
+def _():
+    return
+
+
+if __name__ == "__main__":
+    app.run()