lb_eda/notebooks/initial_eda_lb_20250919.py

import marimo

__generated_with = "0.14.16"
app = marimo.App(width="medium")


@app.cell
def _():
    import os
    import itertools
    import marimo as mo
    import pandas as pd
    import plotly.express as px
    import plotly.graph_objects as go
    import matplotlib.pyplot as plt
    import seaborn as sns
    import yaml
    from matplotlib_venn import venn2
    return itertools, mo, os, pd, plt, px, venn2, yaml


@app.cell
def _(mo):
    mo.md(
        r"""
    # Initial EDA of the Twist LB data (19-Sep-2025)

    So far no analysis have been done on the data from the LB assay. This analysis will take the first look.

    Analysis ideas:

    - Stats data analysis - box plots and outliers
    - Mutations - agg stats -> variants called, coverage of called variants in supercol, common called variants, unique variants
    - vcfs - agg stats -> variants called, coverage of called variants in supercol, common called variants
    """
    )
    return


@app.cell
def _(mo):
    mo.md(r"""## (1) Preprocess Data""")
    return


@app.cell
def _():
    se1_dir = "/home/darren/Documents/4_data/3_internal/2_lb/se1-prd-2.1.1/"
    se2_dir = "/home/darren/Documents/4_data/3_internal/2_lb/se2-lb-1/"

    qc_hist_dir = (
        "/home/darren/Documents/2_repos/serenomica/flowcell_qc/historical_data/"
    )
    qc_hist_mutations_dir = "/home/darren/Documents/4_data/3_internal/2_lb/qg_historical_qc_service/dot_mutations/"
    sc_path = "/home/darren/Documents/2_repos/serenomica/supercolumn/panel_v2/supercolumn_v2.csv"
    return qc_hist_dir, qc_hist_mutations_dir, sc_path, se1_dir, se2_dir


@app.cell
def _(itertools, os):
    def return_paths(suffix, *dirs):
        return list(
            itertools.chain.from_iterable(
                [
                    [
                        os.path.join(d, f)
                        for f in os.listdir(d)
                        if f.endswith(suffix)
                    ]
                    for d in dirs
                ]
            )
        )
    return (return_paths,)


@app.cell
def _(return_paths, se1_dir, se2_dir):
    stats_fpaths = return_paths(".stats", se1_dir, se2_dir)
    print(len(stats_fpaths))
    mutations_fpaths = return_paths(".mutations", se1_dir, se2_dir)
    print(len(mutations_fpaths))
    return mutations_fpaths, stats_fpaths


@app.cell
def _(mo):
    mo.md(r"""### (1.1) Stats""")
    return


@app.cell
def _(pd, stats_fpaths, yaml):
    stats = pd.DataFrame()
    for sf in stats_fpaths:
        with open(sf, "r") as handle:
            stats = pd.concat(
                [
                    stats,
                    pd.json_normalize(yaml.safe_load(handle), sep=" / ").rename(
                        {0: sf.split("/")[-1].strip(".stats")}, axis=0
                    ),
                ],
                axis=0,
            )
    stats["sample_type"] = [
        "Serenomica-Sera" if "Sera" in i else "Serenomica-Clinical"
        for i in stats.index
    ]
    stats["source"] = "Serenomica"
    stats = stats.drop(
        ["miscellaneous / stats_file_version", "miscellaneous / target_panel_bed"],
        axis=1,
    )
    return (stats,)


@app.cell
def _(mo):
    mo.md(r"""### (1.2) .mutations""")
    return


@app.cell
def _(pd):
    def process_mutations_files(mutation_file_paths):
        cols = []
        vaf, counts, depth = (
            pd.DataFrame(),
            pd.DataFrame(),
            pd.DataFrame(),
        )
        for mf in mutation_file_paths:
            tmp = pd.read_csv(mf, sep="\t", header=None)
            tmp["mutation_id"] = tmp.apply(
                lambda x: "_".join([str(x[0]), str(x[1]), str(x[2])]), axis=1
            )
            tmp.set_index("mutation_id", inplace=True)

            cols.append(mf.split("/")[-1].strip(".mutations"))
            vaf = pd.concat([vaf, tmp[3]], axis=1)
            counts = pd.concat([counts, tmp[4]], axis=1)
            depth = pd.concat([depth, tmp[5]], axis=1)

        vaf.columns = cols
        counts.columns = cols
        depth.columns = cols
        return vaf, counts, depth
    return (process_mutations_files,)


@app.cell
def _(mutations_fpaths, process_mutations_files):
    mutations_vaf, mutations_counts, mutations_depth = process_mutations_files(
        mutations_fpaths
    )
    mutations_vaf
    return mutations_depth, mutations_vaf


@app.cell
def _(mo):
    mo.md(r"""### (1.3) Stats QG Historical""")
    return


@app.cell
def _(os, pd, qc_hist_dir):
    # QG historical data
    qg_hist_stats = pd.read_csv(
        os.path.join(qc_hist_dir, "historical_data_LB.csv"), index_col=0
    )
    qg_hist_stats.columns = [i.replace("|", " / ") for i in qg_hist_stats.columns]
    qg_hist_stats = qg_hist_stats.drop(["batch"], axis=1)
    qg_hist_stats["sample_type"] = [
        "QG-Sera" if "Sera" in i else "QG-Clinical" for i in qg_hist_stats.index
    ]
    qg_hist_stats["source"] = "QG"
    qg_hist_stats
    return (qg_hist_stats,)


@app.cell
def _(pd, qg_hist_stats, stats):
    # ensure columns match
    assert set(qg_hist_stats.columns) == set(stats.columns)

    all_stats = pd.concat([stats, qg_hist_stats], axis=0)
    all_stats.head()
    return (all_stats,)


@app.cell
def _(mo):
    mo.md(r"""### (1.4) .mutations QG Historical""")
    return


@app.cell
def _(process_mutations_files, qc_hist_mutations_dir, return_paths):
    qg_hist_mutations_paths = return_paths(".mutations", qc_hist_mutations_dir)
    qg_hist_mut_vaf, qg_hist_mut_count, qg_hist_mut_depth = (
        process_mutations_files(qg_hist_mutations_paths)
    )
    qg_hist_mut_vaf
    return qg_hist_mut_depth, qg_hist_mut_vaf


@app.cell
def _(mutations_depth, mutations_vaf, pd, qg_hist_mut_depth, qg_hist_mut_vaf):
    # ensure columns match
    assert set(qg_hist_mut_vaf.index) == set(mutations_vaf.index)

    all_vaf = pd.concat([mutations_vaf, qg_hist_mut_vaf], axis=1)
    all_depth = pd.concat([mutations_depth, qg_hist_mut_depth], axis=1)
    return all_depth, all_vaf


@app.cell
def _(qg_hist_mut_vaf):
    qg_samples_dot_mutations = qg_hist_mut_vaf.columns
    return (qg_samples_dot_mutations,)


@app.cell
def _(mo):
    mo.md(r"""## (1) Stats Analysis""")
    return


@app.cell
def _(all_stats, pd):
    stats_melt = pd.melt(
        all_stats.reset_index(), id_vars=["index", "sample_type", "source"]
    )
    stats_melt.head()
    return (stats_melt,)


@app.cell
def _(mo, stats):
    multiselect = mo.ui.multiselect(options=stats.columns)
    mo.hstack([multiselect])
    return (multiselect,)


@app.cell
def _(multiselect, px, stats_melt):
    fig = px.box(
        stats_melt.loc[stats_melt["variable"].isin(multiselect.value)],
        x="variable",
        y="value",
        color="sample_type",
        points="all",
        template="simple_white",
        labels={"variable": "", "value": "Value from stats file"},
        hover_data=["index", "source"],
        category_orders={
            "sample_type": sorted(stats_melt["sample_type"].unique())
        },
    )
    fig.update_yaxes(showgrid=True)
    fig.show()
    return


@app.cell
def _(mo):
    mo.md(
        r"""
    ## (2) .mutations Analysis

    Analysis ideas💡

    - Always/majority called - potentially suspicious ✅
    - Never called ✅
    - High VAF targets ✅
    - Poorly covered targets ✅
    - Overly covered targets ✅
    """
    )
    return


@app.cell
def _(pd, sc_path):
    # supercolumn for analysis
    sc = pd.read_csv(sc_path, sep=";")
    sc["mutation_id"] = (
        "chr"
        + sc["chrom"].astype(str)
        + "_"
        + sc["position"].astype(str)
        + "_"
        + sc["ref"].astype(str)
        + "/"
        + sc["alt"].astype(str)
    )
    return (sc,)


@app.cell
def _(sc):
    black_mut_ids = sc.loc[sc["blacklisted"] == "Yes"]["mutation_id"].tolist()
    print(f"Number of blacklisted variants = {len(black_mut_ids)}")
    return (black_mut_ids,)


@app.cell
def _(mutations_vaf):
    seracares = [i for i in mutations_vaf.columns if "Sera" in i]
    clinicals = [i for i in mutations_vaf.columns if i not in seracares]
    print(f"Number of seracares = {len(seracares)}")
    print(f"Number of volunteers = {len(clinicals)}")
    print("Both numbers include replicates!")
    return (clinicals,)


@app.cell
def _(mo):
    mo.md(r"""### (2.1) Frequently and infrequently called variants""")
    return


@app.cell
def _(clinicals, mutations_vaf, pd):
    # always called and never called
    called_gt0 = pd.DataFrame(
        (mutations_vaf.loc[:, clinicals] > 0).sum(axis=1),
        columns=["samples_called_in"],
    )
    called_gt0["prop_samples_called"] = [
        row / len(clinicals) for row in called_gt0["samples_called_in"]
    ]
    called_gt0["called_in_ALL"] = [
        True if val == len(clinicals) else False
        for val in called_gt0["samples_called_in"]
    ]
    called_gt0["called_in_NONE"] = [
        True if val == 0 else False for val in called_gt0["samples_called_in"]
    ]

    var_always_called = called_gt0[called_gt0["called_in_ALL"] == True].index
    var_never_called = called_gt0[called_gt0["called_in_NONE"] == True].index

    called_gt0[["called_in_ALL", "called_in_NONE"]].sum(axis=0)
    return (var_always_called,)


@app.cell
def _(sc, var_always_called):
    # all 9 variants that occur in all clinical samples are blacklisted variants
    sc.loc[sc["mutation_id"].isin(var_always_called)][
        ["chrom", "position", "ref", "alt", "gene", "blacklisted"]
    ]
    return


@app.cell
def _(mo):
    mo.md(r"""### (2.2) Variants called above VAF thresholds""")
    return


@app.cell
def _(all_vaf, black_mut_ids, pd, qg_samples_dot_mutations):
    # mutations over thresholds, excluding the blacklisted hits
    df_mut_per_sample_by_thresh = pd.DataFrame()
    for t in [0.00125, 0.0025, 0.005, 0.01]:
        t_data_all = pd.DataFrame(
            (all_vaf > t).sum(axis=0),
            columns=[f"variants_gt_{t}_all"],
        )
        t_data = pd.DataFrame(
            (
                all_vaf.loc[
                    [i for i in all_vaf.index if i not in black_mut_ids], :
                ]
                > t
            ).sum(axis=0),
            columns=[f"variants_gt_{t}_wo_blacklisted"],
        )
        df_mut_per_sample_by_thresh = pd.concat(
            [df_mut_per_sample_by_thresh, t_data, t_data_all], axis=1
        )
    df_mut_per_sample_by_thresh["sample_type"] = [
        "Serenomica-Sera"
        if ("Sera" in i) and (i not in qg_samples_dot_mutations)
        else "QG-Sera"
        if (i in qg_samples_dot_mutations) and ("Sera" in i)
        else "Serenomica-Clinincal"
        if i not in qg_samples_dot_mutations
        else "QG-Clinical"
        for i in df_mut_per_sample_by_thresh.index
    ]
    df_mut_per_sample_by_thresh
    return (df_mut_per_sample_by_thresh,)


@app.cell
def _(df_mut_per_sample_by_thresh, pd):
    df_mut_per_sample_by_thresh_melt = pd.melt(
        df_mut_per_sample_by_thresh.reset_index(), id_vars=["index", "sample_type"]
    )
    return (df_mut_per_sample_by_thresh_melt,)


@app.cell
def _(df_mut_per_sample_by_thresh, mo):
    ms_gt_thresh = mo.ui.multiselect(options=df_mut_per_sample_by_thresh.columns)
    mo.hstack([ms_gt_thresh])
    return (ms_gt_thresh,)


@app.cell
def _(df_mut_per_sample_by_thresh_melt, ms_gt_thresh, px):
    fig2 = px.box(
        df_mut_per_sample_by_thresh_melt.loc[
            df_mut_per_sample_by_thresh_melt["variable"].isin(ms_gt_thresh.value)
        ],
        x="variable",
        y="value",
        color="sample_type",
        points="all",
        template="simple_white",
        labels={"variable": "", "value": "Variants per sample"},
        hover_data=["index"],
        category_orders={
            "sample_type": sorted(
                df_mut_per_sample_by_thresh_melt["sample_type"].unique()
            )
        },
    )
    fig2.update_yaxes(showgrid=True)
    fig2.show()
    return


@app.cell
def _(mo):
    mo.md(
        r"""
    ### (2.3) Mutations with very High VAF (in some samples)

    Arbritrarily looking at >10%
    """
    )
    return


@app.cell
def _(clinicals, mutations_vaf, pd, sc):
    # looking just at the serenomica samples
    high_vaf_serenomica = pd.DataFrame(
        (mutations_vaf.loc[:, clinicals] > 0.1)
        .sum(axis=1)
        .sort_values(ascending=False),
        columns=["count_vaf_gt0.1"],
    )
    sc.loc[
        sc["mutation_id"].isin(
            high_vaf_serenomica.loc[
                high_vaf_serenomica["count_vaf_gt0.1"] > 0
            ].index
        )
    ][["mutation_id", "blacklisted", "rsID", "gene", "Consequence"]]
    return (high_vaf_serenomica,)


@app.cell
def _(high_vaf_serenomica):
    high_vaf_serenomica.head(9)
    return


@app.cell
def _(mo):
    mo.md(
        r"""
    rs587780751 - low pop AF

    rs2256740 - high pop AF, especially in EU, Africans and latin americans (13/19 samples)

    rs34094720 - relatively low pop AF

    rs2293347 - relatively high pop AF (7/19 samples)

    rs138327406 - relatively low pop AF

    rs11214077 - relatively low but > 1% in latin pop

    rs121913530 - very low pop AF
    """
    )
    return


@app.cell
def _(mo):
    mo.md(r"""### (2.3) Global Coverage""")
    return


@app.cell
def _(
    all_depth,
    black_mut_ids,
    df_mut_per_sample_by_thresh,
    pd,
    qg_samples_dot_mutations,
):
    # coverage over thresholds
    df_depth_by_thresh = pd.DataFrame()
    for d in [250, 500, 1000, 2000, 5000, 10000]:
        d_data_all = pd.DataFrame(
            (all_depth > d).sum(axis=0) / all_depth.shape[0],
            columns=[f"coverage_gt_{d}_all"],
        )
        d_data = pd.DataFrame(
            (
                all_depth.loc[
                    [i for i in all_depth.index if i not in black_mut_ids], :
                ]
                > d
            ).sum(axis=0)
            / all_depth.shape[0],
            columns=[f"coverage_gt_{d}_wo_blacklisted"],
        )
        df_depth_by_thresh = pd.concat(
            [df_depth_by_thresh, d_data, d_data_all], axis=1
        )
    df_depth_by_thresh["sample_type"] = [
        "Serenomica-Sera"
        if ("Sera" in i) and (i not in qg_samples_dot_mutations)
        else "QG-Sera"
        if (i in qg_samples_dot_mutations) and ("Sera" in i)
        else "Serenomica-Clinincal"
        if i not in qg_samples_dot_mutations
        else "QG-Clinical"
        for i in df_mut_per_sample_by_thresh.index
    ]
    df_depth_by_thresh
    return (df_depth_by_thresh,)


@app.cell
def _(df_depth_by_thresh, pd):
    df_depth_by_thresh_melt = pd.melt(
        df_depth_by_thresh.reset_index(), id_vars=["index", "sample_type"]
    )
    return (df_depth_by_thresh_melt,)


@app.cell
def _(df_depth_by_thresh, mo):
    ms_gt_dep_thresh = mo.ui.multiselect(options=df_depth_by_thresh.columns)
    mo.hstack([ms_gt_dep_thresh])
    return (ms_gt_dep_thresh,)


@app.cell
def _(df_depth_by_thresh_melt, ms_gt_dep_thresh, px):
    fig3 = px.box(
        df_depth_by_thresh_melt.loc[
            df_depth_by_thresh_melt["variable"].isin(ms_gt_dep_thresh.value)
        ],
        x="variable",
        y="value",
        color="sample_type",
        points="all",
        template="simple_white",
        labels={"variable": "", "value": "Prop of sc sites > threhold per sample"},
        hover_data=["index"],
        category_orders={
            "sample_type": sorted(df_depth_by_thresh_melt["sample_type"].unique())
        },
    )
    fig3.update_yaxes(showgrid=True)
    fig3.show()
    return


@app.cell
def _(mo):
    mo.md(
        r"""
    ### (2.4) Missed Targets

    Look into variants that only have a depth that could support a VAF of 1% (i.e. 200 reads)
    """
    )
    return


@app.cell
def _(black_mut_ids, mutations_depth, qg_hist_mut_depth):
    # removal of blacklisted variants for this analysis
    mutations_depth_wo_black = mutations_depth.loc[
        [i for i in mutations_depth.index if i not in black_mut_ids], :
    ].copy()

    qg_mutations_depth_wo_black = qg_hist_mut_depth.loc[
        [i for i in qg_hist_mut_depth.index if i not in black_mut_ids], :
    ].copy()
    return mutations_depth_wo_black, qg_mutations_depth_wo_black


@app.cell
def _(pd):
    def get_variant_group_by_depth(df, depth_thresh, sample_prop, mode="lt"):
        if mode == "lt":
            variants_group = pd.DataFrame(
                ((df < depth_thresh).sum(axis=1) / df.shape[1]).sort_values(
                    ascending=False
                ),
                columns=[f"prop_cov_{mode}{depth_thresh}"],
            )
        elif mode == "gt":
            variants_group = pd.DataFrame(
                ((df > depth_thresh).sum(axis=1) / df.shape[1]).sort_values(
                    ascending=False
                ),
                columns=[f"prop_cov_{mode}{depth_thresh}"],
            )
        variant_ids = list(
            variants_group.loc[
                variants_group[f"prop_cov_{mode}{depth_thresh}"] > sample_prop
            ].index
        )
        return variants_group, variant_ids
    return (get_variant_group_by_depth,)


@app.cell
def _(clinicals, get_variant_group_by_depth, mutations_depth_wo_black):
    variants_lost_global_coverage, variants_lost_cov_ids = (
        get_variant_group_by_depth(
            mutations_depth_wo_black.loc[:, clinicals], 400, 0.5
        )
    )
    variants_lost_cov_ids
    return (variants_lost_cov_ids,)


@app.cell
def _(get_variant_group_by_depth, qg_mutations_depth_wo_black):
    qg_variants_lost_global_coverage, qg_variants_lost_cov_ids = (
        get_variant_group_by_depth(qg_mutations_depth_wo_black, 400, 0.5)
    )
    qg_variants_lost_cov_ids
    return (qg_variants_lost_cov_ids,)


@app.cell
def _(plt, qg_variants_lost_cov_ids, variants_lost_cov_ids, venn2):
    venn2(
        subsets=(
            len(qg_variants_lost_cov_ids),
            len(
                set(qg_variants_lost_cov_ids).intersection(
                    set(variants_lost_cov_ids)
                )
            ),
            len(set(variants_lost_cov_ids)),
        ),
        set_labels=("QG", "Serenomica"),
        set_colors=("blue", "green"),
        alpha=0.5,
    )
    plt.title("Overlap of missed supercolumn targets (w/o blacklist targets)")
    plt.show()
    return


@app.cell
def _(mo):
    mo.md(
        r"""
    ### (2.5) Poorly Covered for low VAF detection

    As we need 2 reads before we report a variant, this analysis looks at variants with lower than 1000 read depth that would enable a lowest VAF of 0.2% to be found.
    """
    )
    return


@app.cell
def _(clinicals, get_variant_group_by_depth, mutations_depth_wo_black):
    variants_low_global_coverage, variants_low_cov_ids = (
        get_variant_group_by_depth(
            mutations_depth_wo_black.loc[:, clinicals], 1000, 0.75
        )
    )
    variants_low_cov_ids
    return (variants_low_cov_ids,)


@app.cell
def _(get_variant_group_by_depth, qg_mutations_depth_wo_black):
    qg_variants_low_global_coverage, qg_variants_low_cov_ids = (
        get_variant_group_by_depth(qg_mutations_depth_wo_black, 1000, 0.75)
    )
    qg_variants_low_cov_ids
    return (qg_variants_low_cov_ids,)


@app.cell
def _(
    plt,
    qg_variants_lost_cov_ids,
    qg_variants_low_cov_ids,
    variants_lost_cov_ids,
    variants_low_cov_ids,
    venn2,
):
    qg_low = set(
        [q for q in qg_variants_low_cov_ids if q not in qg_variants_lost_cov_ids]
    )
    sere_low = set(
        [v for v in variants_low_cov_ids if v not in variants_lost_cov_ids]
    )
    venn2(
        subsets=(
            len(qg_low),
            len(qg_low.intersection(set(sere_low))),
            len(sere_low),
        ),
        set_labels=("QG", "Serenomica"),
        set_colors=("blue", "green"),
        alpha=0.5,
    )
    plt.title(
        "Overlap of lowly covered supercolumn targets (w/o blacklist targets)"
    )
    plt.show()
    return


@app.cell
def _(mo):
    mo.md(
        r"""
    ### (2.6) Overly covered targets

    Look to see if any sc targets are overly represented in the reads
    """
    )
    return


@app.cell
def _(clinicals, mutations_depth_wo_black, pd):
    mean_cov = pd.DataFrame(
        mutations_depth_wo_black.loc[:, clinicals]
        .mean(axis=1)
        .sort_values(ascending=False),
        columns=["mean_cov"],
    )
    mean_cov
    return (mean_cov,)


@app.cell
def _(mean_cov, mo):
    slider = mo.ui.slider(
        start=100,
        stop=len(mean_cov),
        step=200,
        label="Number of variants to show",
        show_value=True,
    )
    mo.hstack([slider])
    return (slider,)


@app.cell
def _(mean_cov, px, slider):
    fig4 = px.bar(mean_cov.iloc[: slider.value, :], template="simple_white")
    fig4.show()
    return


@app.cell
def _(clinicals, get_variant_group_by_depth, mutations_depth_wo_black):
    variants_high_global_coverage, variants_high_cov_ids = (
        get_variant_group_by_depth(
            mutations_depth_wo_black.loc[:, clinicals], 10000, 0.75, mode="gt"
        )
    )
    variants_high_cov_ids
    return (variants_high_cov_ids,)


@app.cell
def _(get_variant_group_by_depth, qg_mutations_depth_wo_black):
    qg_variants_high_global_coverage, qg_variants_high_cov_ids = (
        get_variant_group_by_depth(
            qg_mutations_depth_wo_black, 10000, 0.75, mode="gt"
        )
    )
    qg_variants_high_cov_ids
    return


@app.cell
def _(sc, variants_high_cov_ids, variants_lost_cov_ids, variants_low_cov_ids):
    vars_of_interest = sc.loc[
        sc["mutation_id"].isin(
            variants_lost_cov_ids + variants_low_cov_ids + variants_high_cov_ids
        )
    ][
        [
            "chrom",
            "position",
            "mutation_id",
            "gene",
            "HGVS",
            "Consequence",
            "region_type",
            "blacklisted",
            "cancer_fda",
            "FDA_drugs",
            "cancer_clinvar",
            "cancer_rank_prevalence_combined",
        ]
    ].copy()
    vars_of_interest["coverage_issue"] = [
        "lost"
        if i in variants_lost_cov_ids
        else "poorly_covered"
        if i in variants_low_cov_ids
        else "highly_covered"
        if i in variants_high_cov_ids
        else ""
        for i in vars_of_interest["mutation_id"]
    ]
    vars_of_interest.to_csv(
        "../data/coverage_issue_variants.tsv", sep="\t", index=False
    )
    return


@app.cell
def _():
    return


if __name__ == "__main__":
    app.run()