feat: add marimo notebook for initial EDA of the Twist LB data
234
notebooks/initial_eda_lb_20250919.py
Normal file
@@ -0,0 +1,234 @@
import marimo

__generated_with = "0.14.16"
app = marimo.App(width="medium")


@app.cell
def _():
    import os
    import itertools
    import marimo as mo
    import pandas as pd
    import plotly.express as px
    import plotly.graph_objects as go
    import matplotlib.pyplot as plt  # pyplot, not the bare matplotlib package
    import seaborn as sns
    import yaml
    return itertools, mo, os, pd, px, yaml


@app.cell
def _(mo):
    mo.md(
        r"""
    # Initial EDA of the Twist LB data (19-Sep-2025)

    So far, no analysis has been done on the data from the LB assay; this notebook takes a first look.

    Analysis ideas:

    - Stats data analysis: box plots and outliers
    - Mutations: aggregate stats -> variants called, coverage of called variants in supercol, common called variants, unique variants
    - VCFs: aggregate stats -> variants called, coverage of called variants in supercol, common called variants
    """
    )
    return


@app.cell
def _():
    se1_dir = "/home/darren/Documents/4_data/3_internal/2_lb/se1-prd-2.1.1/"
    se2_dir = "/home/darren/Documents/4_data/3_internal/2_lb/se2-lb-1/"

    qc_hist_dir = (
        "/home/darren/Documents/2_repos/serenomica/flowcell_qc/historical_data/"
    )
    return qc_hist_dir, se1_dir, se2_dir


@app.cell
def _(itertools, os):
    def return_paths(suffix, *dirs):
        """Return the paths of all files in the given directories that end with `suffix`."""
        return list(
            itertools.chain.from_iterable(
                [
                    [
                        os.path.join(d, f)
                        for f in os.listdir(d)
                        if f.endswith(suffix)
                    ]
                    for d in dirs
                ]
            )
        )
    return (return_paths,)


@app.cell
def _(return_paths, se1_dir, se2_dir):
    stats_fpaths = return_paths(".stats", se1_dir, se2_dir)
    print(len(stats_fpaths))
    mutations_fpaths = return_paths(".mutations", se1_dir, se2_dir)
    print(len(mutations_fpaths))
    return mutations_fpaths, stats_fpaths


@app.cell
def _(pd, stats_fpaths, yaml):
    stats = pd.DataFrame()
    for sf in stats_fpaths:
        with open(sf, "r") as handle:
            # .removesuffix rather than .strip: strip(".stats") would trim any of
            # the characters ".sta" from both ends of the sample name
            stats = pd.concat(
                [
                    stats,
                    pd.json_normalize(yaml.safe_load(handle), sep=" / ").rename(
                        {0: sf.split("/")[-1].removesuffix(".stats")}, axis=0
                    ),
                ],
                axis=0,
            )
    stats["sample_type"] = [
        "Serenomica-Sera" if "Sera" in i else "Serenomica-Clinical"
        for i in stats.index
    ]
    stats["source"] = "Serenomica"
    stats = stats.drop(
        ["miscellaneous / stats_file_version", "miscellaneous / target_panel_bed"],
        axis=1,
    )
    return (stats,)


@app.cell
def _(mutations_fpaths, pd):
    cols = []
    mutations_vaf, mutations_counts, mutations_depth = (
        pd.DataFrame(),
        pd.DataFrame(),
        pd.DataFrame(),
    )
    for mf in mutations_fpaths:
        # columns 0-2 identify the mutation; columns 3, 4 and 5 hold VAF,
        # counts and depth respectively
        tmp = pd.read_csv(mf, sep="\t", header=None)
        tmp["mutation_id"] = tmp.apply(
            lambda x: "_".join([str(x[0]), str(x[1]), str(x[2])]), axis=1
        )
        tmp.set_index("mutation_id", inplace=True)

        cols.append(mf.split("/")[-1].removesuffix(".mutations"))
        mutations_vaf = pd.concat([mutations_vaf, tmp[3]], axis=1)
        mutations_counts = pd.concat([mutations_counts, tmp[4]], axis=1)
        mutations_depth = pd.concat([mutations_depth, tmp[5]], axis=1)

    mutations_vaf.columns = cols
    mutations_counts.columns = cols
    mutations_depth.columns = cols

    mutations_vaf
    return (mutations_vaf,)


@app.cell
def _(os, pd, qc_hist_dir):
    # QG historical data
    qg_hist_stats = pd.read_csv(
        os.path.join(qc_hist_dir, "historical_data_LB.csv"), index_col=0
    )
    qg_hist_stats.columns = [i.replace("|", " / ") for i in qg_hist_stats.columns]
    qg_hist_stats = qg_hist_stats.drop(["batch"], axis=1)
    qg_hist_stats["sample_type"] = [
        "QG-Sera" if "Sera" in i else "QG-Clinical" for i in qg_hist_stats.index
    ]
    qg_hist_stats["source"] = "QG"
    qg_hist_stats
    return (qg_hist_stats,)


@app.cell
def _(pd, qg_hist_stats, stats):
    # ensure columns match
    assert set(qg_hist_stats.columns) == set(stats.columns)

    all_stats = pd.concat([stats, qg_hist_stats], axis=0)
    all_stats.head()
    return (all_stats,)


@app.cell
def _(mo):
    mo.md(r"""## (1) Stats""")
    return


@app.cell
def _(all_stats, pd):
    stats_melt = pd.melt(
        all_stats.reset_index(), id_vars=["index", "sample_type", "source"]
    )
    stats_melt.head()
    return (stats_melt,)


@app.cell
def _(mo, stats):
    # metric selector; the selection drives the box plot in the next cell
    multiselect = mo.ui.multiselect(options=stats.columns)
    mo.hstack([multiselect])
    return (multiselect,)


@app.cell
def _(multiselect, px, stats_melt):
    fig = px.box(
        stats_melt.loc[stats_melt["variable"].isin(multiselect.value)],
        x="variable",
        y="value",
        color="sample_type",
        points="all",
        template="simple_white",
        labels={"variable": "", "value": "Value from stats file"},
        hover_data=["index", "source"],
    )
    fig.update_yaxes(showgrid=True)
    fig.show()
    return


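@app.cell
def _(pd, stats_melt):
    # Hedged sketch of the "box plots and outliers" idea from the intro: flag
    # values lying outside 1.5 * IQR of their metric. The 1.5 multiplier, the
    # numeric coercion and the name `stats_outliers` are assumptions for this
    # sketch, not choices made elsewhere in the notebook.
    _vals = pd.to_numeric(stats_melt["value"], errors="coerce")
    _num = stats_melt.assign(value=_vals).dropna(subset=["value"])
    _q = _num.groupby("variable")["value"].quantile([0.25, 0.75]).unstack()
    _iqr = _q[0.75] - _q[0.25]
    _lower = _num["variable"].map(_q[0.25] - 1.5 * _iqr)
    _upper = _num["variable"].map(_q[0.75] + 1.5 * _iqr)
    stats_outliers = _num.loc[(_num["value"] < _lower) | (_num["value"] > _upper)]
    stats_outliers
    return (stats_outliers,)

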
@app.cell
def _(mo):
    mo.md(
        r"""
    ## (2) .mutations Files

    Analysis ideas 💡

    - Always/majority called (potentially suspicious)
    - Never called (see the zero-VAF fractions below)
    - High VAF targets (see the sketch cell below)
    - Poorly covered targets
    - Overly covered targets
    """
    )
    return


@app.cell
def _(mutations_vaf):
    # fraction of samples in which each target has VAF == 0 ("never called" when 1.0)
    ((mutations_vaf == 0).sum(axis=1) / len(mutations_vaf.columns)).sort_values(
        ascending=False
    )
    return


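@app.cell
def _(mutations_vaf):
    # Hedged sketch of the "High VAF targets" idea: the highest VAF observed for
    # each target across samples, largest first. The head(20) cut-off is an
    # assumption for display only.
    mutations_vaf.max(axis=1).sort_values(ascending=False).head(20)
    return

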
@app.cell
def _(mutations_vaf):
    mutations_vaf.shape
    return


@app.cell
def _():
    return


if __name__ == "__main__":
    app.run()