feat: update low cfDNA input analysis to add comparison with QG (Quantgene)

2025-10-17 09:30:11 +02:00
parent 2581de3dc7
commit 7a35a49084
3 changed files with 227 additions and 43 deletions
--- a/notebooks/initial_eda_lb_20250919.py
+++ b/notebooks/initial_eda_lb_20250919.py
@@ -1,6 +1,6 @@
 import marimo

-__generated_with = "0.14.16"
+__generated_with = "0.16.4"
 app = marimo.App(width="medium")


@@ -168,7 +168,7 @@ def _(mo):
 def _(os, pd, qc_hist_dir):
    # QG historical data
    qg_hist_stats = pd.read_csv(
-        os.path.join(qc_hist_dir, "historical_data_LB.csv"), index_col=0
+        os.path.join(qc_hist_dir, "historical_data_LB-QG.csv"), index_col=0
    )
    qg_hist_stats.columns = [i.replace("|", " / ") for i in qg_hist_stats.columns]
    qg_hist_stats = qg_hist_stats.drop(["batch"], axis=1)
@@ -866,6 +866,33 @@ def _(clinicals, meta, pd, stats):
    return (stats_meta,)


+@app.cell
+def _(pd):
+    meta_qg = pd.read_excel(
+        "/home/darren/Documents/4_data/3_internal/2_lb/QC/lb2_metadata.xlsx",
+        index_col="fastq_id",
+    )
+    meta_qg
+    return (meta_qg,)
+
+
+@app.cell
+def _(meta_qg, pd, qg_hist_stats):
+    # merge stats and metadata, then drop rows whose index contains "twist" or "sera" (e.g. seracares)
+    stats_meta_qg = pd.concat([qg_hist_stats, meta_qg], axis=1)
+    stats_meta_qg = stats_meta_qg.loc[
+        [
+            i
+            for i in stats_meta_qg.index
+            if "twist" not in i.lower() and "sera" not in i.lower()
+        ],
+        :,
+    ]
+    stats_meta_qg = stats_meta_qg.reset_index()
+    stats_meta_qg.head()
+    return (stats_meta_qg,)
+
+
@app.cell
 def _(mo, stats_meta):
    var1 = mo.ui.dropdown(options=stats_meta.columns, label="Variable 1")
@@ -975,6 +1002,108 @@ def _(stats_meta):
    return


+@app.cell
+def _(stats_meta_melt):
+    stats_meta_melt["Flow cell #"].unique()
+    return
+
+
+@app.cell
+def _(qg_hist_stats):
+    qg_hist_stats
+    return
+
+
+@app.cell
+def _(pd, stats_meta, stats_meta_qg):
+    stats_meta["lab"] = "Serenomica"
+    stats_meta_qg["lab"] = "Quantgene"
+    all_stats_meta = pd.concat([stats_meta, stats_meta_qg], axis=0)
+    all_stats_meta.shape
+    return (all_stats_meta,)
+
+
+@app.cell
+def _(all_stats_meta):
+    all_stats_meta["ng_group"] = [
+        "< 30ng" if i < 30 else "> 30ng"
+        for i in all_stats_meta["Input DNA used (ng)"]
+    ]
+    all_stats_meta[["lab", "ng_group"]].value_counts()
+    return
+
+
+@app.cell
+def _(all_stats_meta, pd):
+    all_stats_meta_melt = pd.melt(
+        all_stats_meta,
+        id_vars=["index", "sample_type", "ng_group", "Flow cell #", "lab"],
+    )
+    all_stats_meta_melt
+    return (all_stats_meta_melt,)
+
+
+@app.cell
+def _(all_stats_meta_melt, mo):
+    box_vars2 = mo.ui.dropdown(
+        options=all_stats_meta_melt["variable"].unique(), label="Variables to plot"
+    )
+    mo.hstack([box_vars2])
+    return (box_vars2,)
+
+
+@app.cell
+def _(all_stats_meta_melt, box_vars2, px):
+    fig7 = px.box(
+        data_frame=all_stats_meta_melt.loc[
+            (all_stats_meta_melt["variable"] == box_vars2.value)
+        ],
+        color="ng_group",
+        y="value",
+        x="lab",
+        template="simple_white",
+        points="all",
+        labels={
+            "ng_group": "cfDNA Input for Library",
+            "value": box_vars2.value,
+            "lab": "Laboratory",
+        },
+        color_discrete_sequence=px.colors.qualitative.Dark2,
+        category_orders={"ng_group": ["< 30ng", "> 30ng"]},
+        hover_data=["index"],
+        width=800,
+    )
+    fig7.update_traces(
+        marker=dict(size=10, line=dict(width=1, color="DarkSlateGrey")),
+        selector=dict(type="points"),
+    )
+    fig7.update_xaxes(tickfont_size=14)
+    fig7.update_yaxes(showgrid=True, tickfont_size=14)
+    fig7.update_legends(font_size=14)
+
+    for trace2 in fig7.select_traces():
+        trace2.marker.update(
+            size=7, line=dict(width=1, color="DarkSlateGrey"), opacity=0.8
+        )
+
+    fig7.show()
+    return
+
+
+@app.cell
+def _(all_stats_meta):
+    all_stats_meta[["lab", "ng_group"]].value_counts()
+    return
+
+
+@app.cell
+def _(stats_meta):
+    stats_meta.sort_values("Input DNA used (ng)")[
+        ["index", "Input DNA used (ng)", "Flow cell #"]
+    ]
+    return
+
+
@app.cell
 def _():
    return