From e03ede9c2887c44d21975bbe369dff5101a6d611 Mon Sep 17 00:00:00 2001
From: Darren Wight
Date: Mon, 22 Sep 2025 17:31:11 +0200
Subject: [PATCH] feat: eda analysis continued and samples from QG search added

---
 notebooks/find_qg_dot_mutations.py   | 116 ++++++
 notebooks/initial_eda_lb_20250919.py | 591 +++++++++++++++++++++++++--
 2 files changed, 677 insertions(+), 30 deletions(-)
 create mode 100644 notebooks/find_qg_dot_mutations.py

diff --git a/notebooks/find_qg_dot_mutations.py b/notebooks/find_qg_dot_mutations.py
new file mode 100644
index 0000000..efd30c7
--- /dev/null
+++ b/notebooks/find_qg_dot_mutations.py
@@ -0,0 +1,116 @@
+import marimo
+
+__generated_with = "0.14.16"
+app = marimo.App(width="medium")
+
+
+@app.cell
+def _():
+    import os
+    import marimo as mo
+    import pandas as pd
+    return mo, os, pd
+
+
+@app.cell
+def _(mo):
+    mo.md(
+        r"""
+    # Search for Samples
+
+    Not all QG samples have `.mutations` files. It would be helpful to have them for the samples that were part of the historical QC service.
+
+    Samples we have: `../data/qg_avail_dot_mutations.txt`
+    """
+    )
+    return
+
+
+@app.cell
+def _():
+    # pipeline runs used in the historical data (from repo)
+    LB2_PIPELINE_RUNS = [
+        "s3://quantgene-portland-lb/20221123-sp-pl34-hs2/lb-pl34-all-dev56/",
+        "s3://quantgene-portland-lb/20221202-s1-pl36-hs2/lb-pl36-all-dev56/",
+        "s3://quantgene-portland-lb/20221207-s2-pl37-hs2/lb-pl37-all-dev56/",
+        "s3://quantgene-portland-lb/20230109-sp-pl41-hs2/lb-pl41-dev56/",
+        "s3://quantgene-portland-lb/20230124-sp-pl42-hs2/lb-pl42-dev56/",
+        # PL43 skipped, mostly Geneva samples with stats all over the place
+        "s3://quantgene-portland-lb/20230210-sp-pl44-hs2/lb-pl44-dev59/",
+        "s3://quantgene-portland-lb/20230224-s1-pl47-hs2/lb-pl47-dev59/",
+        "s3://quantgene-portland-lb/20230316-s1-pl49-hs2/lb-pl49-dev59/",
+        "s3://quantgene-portland-lb/20230330-sp-pl50-hs2/lb-pl50-dev59/",
+        "s3://quantgene-portland-lb/20230417-s1-pl51-hs2/lb-pl51-dev59/",
+        "s3://quantgene-portland-lb/20230512-sp-pl52-hs2/lb-pl52-prd-2-1-0/",
+        "s3://quantgene-portland-lb/20230602-sp-pl53-hs2/lb-pl53-prd-2-1-1/",
+        "s3://quantgene-portland-lb/20230615-sp-pl54-hs2/lb-pl54-prd-2-1-1/",
+        "s3://quantgene-portland-lb/20230630-sp-pl55-hs2/lb-pl55-prd-2-1-1/",
+        "s3://quantgene-portland-lb/20230721-sp-pl56-hs2/lb-pl56-prd212/",
+        "s3://quantgene-portland-lb/20230811-sp-pl57-hs2/lb-pl57-prd212/",
+        "s3://quantgene-portland-lb/20230901-sp-pl58-hs2/lb-pl58-prd212/",
+        "s3://quantgene-portland-lb/20231027-sp-pl59-hs2/lb-pl59-prd212/",
+        # PL60 skipped due to low amount of input reads in all samples
+        "s3://quantgene-portland-lb/20240216-sp-pl61-hs2/lb-pl61-prd212/",
+        # PL62 skipped due to low amount of input reads and insert size issues
+        "s3://quantgene-portland-lb/20240516-s1-pl63-hs2/lb-pl63-prd212/",
+        "s3://quantgene-portland-lb/20240529-sp-pl64-hs2/lb-pl64-prd212/",
+    ]
+    return (LB2_PIPELINE_RUNS,)
+
+
+@app.cell
+def _(pd):
+    # historical QC samples
+    qg_hist_data = pd.read_csv(
+        "/home/darren/Documents/2_repos/serenomica/flowcell_qc/historical_data/historical_data_LB.csv",
+        index_col=0,
+    )
+    qc_samples = list(set(qg_hist_data.index))
+    qc_samples
+    return (qc_samples,)
+
+
+@app.cell
+def _(LB2_PIPELINE_RUNS, os, qc_samples):
+    with open("../data/qg_avail_dot_mutations.txt", "r") as handle:
+        original, avail_samples, needed_samples = [], [], []
+        for line in handle.readlines():
+            fpath, samp = os.path.split(line.rstrip())
+
+            fpath = fpath.replace("-lb//", "-lb/") + "/"
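+            # note (illustrative): str.strip(".mutations") would remove any of the
+            # characters ".mutations" from BOTH ends of the name, e.g.
+            # "sample_no.mutations".strip(".mutations") -> "ple_", silently
+            # corrupting sample IDs; removesuffix() (Python >= 3.9) drops only
+            # the exact trailing substring.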
samp.strip(".mutations") + + original.append(line.rstrip()) + + # skip non-prod versions + if fpath not in LB2_PIPELINE_RUNS: + continue + + avail_samples.append(samp) + if samp in qc_samples: + needed_samples.append(line.rstrip()) + print( + f"Number of samples available from runs = {(n_avail := len(avail_samples))}" + ) + print(f"Number of historical QC samples = {(n_qc := len(qc_samples))}") + print( + f"Number of historical QC samples available = {(n_needed := len(needed_samples))} ({round((n_needed / n_qc) * 100, 2)}%)" + ) + return (needed_samples,) + + +@app.cell +def _(needed_samples): + with open("../data/qg_needed.txt", "w") as whandle: + whandle.writelines([i + "\n" for i in needed_samples]) + return + + +@app.cell +def _(): + return + + +if __name__ == "__main__": + app.run() diff --git a/notebooks/initial_eda_lb_20250919.py b/notebooks/initial_eda_lb_20250919.py index a741d34..e9ab295 100644 --- a/notebooks/initial_eda_lb_20250919.py +++ b/notebooks/initial_eda_lb_20250919.py @@ -36,6 +36,12 @@ def _(mo): return +@app.cell +def _(mo): + mo.md(r"""## (1) Preprocess Data""") + return + + @app.cell def _(): se1_dir = "/home/darren/Documents/4_data/3_internal/2_lb/se1-prd-2.1.1/" @@ -44,7 +50,9 @@ def _(): qc_hist_dir = ( "/home/darren/Documents/2_repos/serenomica/flowcell_qc/historical_data/" ) - return qc_hist_dir, se1_dir, se2_dir + qc_hist_mutations_dir = "/home/darren/Documents/4_data/3_internal/2_lb/qg_historical_qc_service/dot_mutations/" + sc_path = "/home/darren/Documents/2_repos/serenomica/supercolumn/panel_v2/supercolumn_v2.csv" + return qc_hist_dir, qc_hist_mutations_dir, sc_path, se1_dir, se2_dir @app.cell @@ -74,6 +82,12 @@ def _(return_paths, se1_dir, se2_dir): return mutations_fpaths, stats_fpaths +@app.cell +def _(mo): + mo.md(r"""### (1.1) Stats""") + return + + @app.cell def _(pd, stats_fpaths, yaml): stats = pd.DataFrame() @@ -101,31 +115,52 @@ def _(pd, stats_fpaths, yaml): @app.cell -def _(mutations_fpaths, pd): - cols = [] - mutations_vaf, mutations_counts, mutations_depth = ( - pd.DataFrame(), - pd.DataFrame(), - pd.DataFrame(), - ) - for mf in mutations_fpaths: - tmp = pd.read_csv(mf, sep="\t", header=None) - tmp["mutation_id"] = tmp.apply( - lambda x: "_".join([str(x[0]), str(x[1]), str(x[2])]), axis=1 +def _(mo): + mo.md(r"""### (1.2) .mutations""") + return + + +@app.cell +def _(pd): + def process_mutations_files(mutation_file_paths): + cols = [] + vaf, counts, depth = ( + pd.DataFrame(), + pd.DataFrame(), + pd.DataFrame(), ) - tmp.set_index("mutation_id", inplace=True) + for mf in mutation_file_paths: + tmp = pd.read_csv(mf, sep="\t", header=None) + tmp["mutation_id"] = tmp.apply( + lambda x: "_".join([str(x[0]), str(x[1]), str(x[2])]), axis=1 + ) + tmp.set_index("mutation_id", inplace=True) - cols.append(mf.split("/")[-1].strip(".mutations")) - mutations_vaf = pd.concat([mutations_vaf, tmp[3]], axis=1) - mutations_counts = pd.concat([mutations_counts, tmp[4]], axis=1) - mutations_depth = pd.concat([mutations_depth, tmp[5]], axis=1) + cols.append(mf.split("/")[-1].strip(".mutations")) + vaf = pd.concat([vaf, tmp[3]], axis=1) + counts = pd.concat([counts, tmp[4]], axis=1) + depth = pd.concat([depth, tmp[5]], axis=1) - mutations_vaf.columns = cols - mutations_counts.columns = cols - mutations_depth.columns = cols + vaf.columns = cols + counts.columns = cols + depth.columns = cols + return vaf, counts, depth + return (process_mutations_files,) + +@app.cell +def _(mutations_fpaths, process_mutations_files): + mutations_vaf, mutations_counts, 
 
+
+@app.cell
+def _(mutations_fpaths, process_mutations_files):
+    mutations_vaf, mutations_counts, mutations_depth = process_mutations_files(
+        mutations_fpaths
+    )
     mutations_vaf
-    return (mutations_vaf,)
+    return mutations_depth, mutations_vaf
+
+
+@app.cell
+def _(mo):
+    mo.md(r"""### (1.3) Stats QG Historical""")
+    return
 
 
 @app.cell
@@ -156,7 +191,39 @@ def _(pd, qg_hist_stats, stats):
 
 
 @app.cell
 def _(mo):
-    mo.md(r"""## (1) Stats""")
+    mo.md(r"""### (1.4) .mutations QG Historical""")
+    return
+
+
+@app.cell
+def _(process_mutations_files, qc_hist_mutations_dir, return_paths):
+    qg_hist_mutations_paths = return_paths(".mutations", qc_hist_mutations_dir)
+    qg_hist_mut_vaf, qg_hist_mut_count, qg_hist_mut_depth = (
+        process_mutations_files(qg_hist_mutations_paths)
+    )
+    qg_hist_mut_vaf
+    return qg_hist_mut_depth, qg_hist_mut_vaf
+
+
+@app.cell
+def _(mutations_depth, mutations_vaf, pd, qg_hist_mut_depth, qg_hist_mut_vaf):
+    # ensure the variant indices match before concatenating sample columns
+    assert set(qg_hist_mut_vaf.index) == set(mutations_vaf.index)
+
+    all_vaf = pd.concat([mutations_vaf, qg_hist_mut_vaf], axis=1)
+    all_depth = pd.concat([mutations_depth, qg_hist_mut_depth], axis=1)
+    return all_depth, all_vaf
+
+
+@app.cell
+def _(qg_hist_mut_vaf):
+    qg_samples_dot_mutations = qg_hist_mut_vaf.columns
+    return (qg_samples_dot_mutations,)
+
+
+@app.cell
+def _(mo):
+    mo.md(r"""## (2) Stats Analysis""")
     return
 
 
@@ -197,13 +264,13 @@ def _(multiselect, px, stats_melt):
 def _(mo):
     mo.md(
         r"""
-    ## (2) .mutations Files
+    ## (3) .mutations Analysis
 
     Analysis ideas💡
 
-    - Always/majority called - potentially suspicious
-    - Never called
-    - High VAF targets
+    - Always/majority called - potentially suspicious ✅ (see the majority-called sketch in (3.1))
+    - Never called ✅
+    - High VAF targets ✅
     - Poorly covered targets
     - Overly covered targets
     """
     )
     return
 
 
@@ -211,17 +278,481 @@ def _(mo):
     )
     return
 
 
+@app.cell
+def _(pd, sc_path):
+    # supercolumn for analysis
+    sc = pd.read_csv(sc_path, sep=";")
+    sc["mutation_id"] = (
+        "chr"
+        + sc["chrom"].astype(str)
+        + "_"
+        + sc["position"].astype(str)
+        + "_"
+        + sc["ref"].astype(str)
+        + "/"
+        + sc["alt"].astype(str)
+    )
+    return (sc,)
+
+
+@app.cell
+def _(sc):
+    black_mut_ids = sc.loc[sc["blacklisted"] == "Yes"]["mutation_id"].tolist()
+    print(f"Number of blacklisted variants = {len(black_mut_ids)}")
+    return (black_mut_ids,)
+
+
 @app.cell
 def _(mutations_vaf):
-    ((mutations_vaf == 0).sum(axis=1) / len(mutations_vaf.columns)).sort_values(
-        ascending=False
-    )
-    return
+    seracares = [i for i in mutations_vaf.columns if "Sera" in i]
+    clinicals = [i for i in mutations_vaf.columns if i not in seracares]
+    print(f"Number of seracares = {len(seracares)}")
+    print(f"Number of volunteers = {len(clinicals)}")
+    print("Both numbers include replicates!")
+    return (clinicals,)
 
 
 @app.cell
-def _(mutations_vaf):
-    mutations_vaf.shape
+def _(mo):
+    mo.md(r"""### (3.1) Frequently and infrequently called variants""")
+    return
+
+
+@app.cell
+def _(clinicals, mutations_vaf, pd):
+    # always called and never called
+    called_gt0 = pd.DataFrame(
+        (mutations_vaf.loc[:, clinicals] > 0).sum(axis=1),
+        columns=["samples_called_in"],
+    )
+    called_gt0["prop_samples_called"] = called_gt0["samples_called_in"] / len(
+        clinicals
+    )
+    called_gt0["called_in_ALL"] = called_gt0["samples_called_in"] == len(clinicals)
+    called_gt0["called_in_NONE"] = called_gt0["samples_called_in"] == 0
+
+    var_always_called = called_gt0[called_gt0["called_in_ALL"]].index
+    var_never_called = called_gt0[called_gt0["called_in_NONE"]].index
+
+    called_gt0[["called_in_ALL", "called_in_NONE"]].sum(axis=0)
+    return (var_always_called,)
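+
+
+@app.cell
+def _(clinicals, mutations_vaf):
+    # Sketch for the "majority called" idea in the list above; 90% is an
+    # arbitrary, illustrative cutoff, since the ALL/NONE flags only capture
+    # the two extremes.
+    _n_called = (mutations_vaf.loc[:, clinicals] > 0).sum(axis=1)
+    majority_called = _n_called[_n_called >= 0.9 * len(clinicals)].index
+    majority_called
+    return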
+
+
+@app.cell
+def _(sc, var_always_called):
+    # all 9 variants that occur in all clinical samples are blacklisted variants
+    sc.loc[sc["mutation_id"].isin(var_always_called)][
+        ["chrom", "position", "ref", "alt", "gene", "blacklisted"]
+    ]
+    return
+
+
+@app.cell
+def _(mo):
+    mo.md(r"""### (3.2) Variants called above VAF thresholds""")
+    return
+
+
+@app.cell
+def _(all_vaf, black_mut_ids, pd, qg_samples_dot_mutations):
+    # mutations over thresholds, excluding the blacklisted hits
+    df_mut_per_sample_by_thresh = pd.DataFrame()
+    for t in [0.00125, 0.0025, 0.005, 0.01]:
+        t_data_all = pd.DataFrame(
+            (all_vaf > t).sum(axis=0),
+            columns=[f"variants_gt_{t}_all"],
+        )
+        t_data = pd.DataFrame(
+            (
+                all_vaf.loc[
+                    [i for i in all_vaf.index if i not in black_mut_ids], :
+                ]
+                > t
+            ).sum(axis=0),
+            columns=[f"variants_gt_{t}_wo_blacklisted"],
+        )
+        df_mut_per_sample_by_thresh = pd.concat(
+            [df_mut_per_sample_by_thresh, t_data, t_data_all], axis=1
+        )
+    df_mut_per_sample_by_thresh["sample_type"] = [
+        "Serenomica-Sera"
+        if ("Sera" in i) and (i not in qg_samples_dot_mutations)
+        else "QG-Sera"
+        if (i in qg_samples_dot_mutations) and ("Sera" in i)
+        else "Serenomica-Clinical"
+        if i not in qg_samples_dot_mutations
+        else "QG-Clinical"
+        for i in df_mut_per_sample_by_thresh.index
+    ]
+    df_mut_per_sample_by_thresh
+    return (df_mut_per_sample_by_thresh,)
+
+
+@app.cell
+def _(df_mut_per_sample_by_thresh, pd):
+    df_mut_per_sample_by_thresh_melt = pd.melt(
+        df_mut_per_sample_by_thresh.reset_index(), id_vars=["index", "sample_type"]
+    )
+    return (df_mut_per_sample_by_thresh_melt,)
+
+
+@app.cell
+def _(df_mut_per_sample_by_thresh, mo):
+    ms_gt_thresh = mo.ui.multiselect(options=df_mut_per_sample_by_thresh.columns)
+    mo.hstack([ms_gt_thresh])
+    return (ms_gt_thresh,)
+
+
+@app.cell
+def _(df_mut_per_sample_by_thresh_melt, ms_gt_thresh, px):
+    fig2 = px.box(
+        df_mut_per_sample_by_thresh_melt.loc[
+            df_mut_per_sample_by_thresh_melt["variable"].isin(ms_gt_thresh.value)
+        ],
+        x="variable",
+        y="value",
+        color="sample_type",
+        points="all",
+        template="simple_white",
+        labels={"variable": "", "value": "Variants per sample"},
+        hover_data=["index"],
+    )
+    fig2.update_yaxes(showgrid=True)
+    fig2.show()
+    return
+
+
+@app.cell
+def _(mo):
+    mo.md(
+        r"""
+    ### (3.3) Mutations with very high VAF (in some samples)
+
+    Arbitrarily looking at >10%
+    """
+    )
+    return
+
+
+@app.cell
+def _(clinicals, mutations_vaf, pd, sc):
+    # looking just at the serenomica samples
+    high_vaf_serenomica = pd.DataFrame(
+        (mutations_vaf.loc[:, clinicals] > 0.1)
+        .sum(axis=1)
+        .sort_values(ascending=False),
+        columns=["count_vaf_gt0.1"],
+    )
+    sc.loc[
+        sc["mutation_id"].isin(
+            high_vaf_serenomica.loc[
+                high_vaf_serenomica["count_vaf_gt0.1"] > 0
+            ].index
+        )
+    ][["mutation_id", "blacklisted", "rsID", "gene", "Consequence"]]
+    return (high_vaf_serenomica,)
+
+
+@app.cell
+def _(high_vaf_serenomica):
+    high_vaf_serenomica.head(9)
+    return
+
+
+@app.cell
+def _(mo):
+    mo.md(
+        r"""
+    rs587780751 - low population AF
+
+    rs2256740 - high population AF, especially in Europeans, Africans and Latin Americans (13/19 samples)
+
+    rs34094720 - relatively low population AF
+
+    rs2293347 - relatively high population AF (7/19 samples)
+
+    rs138327406 - relatively low population AF
+
+    rs11214077 - relatively low population AF, but >1% in Latin American populations
+
+    rs121913530 - very low population AF
+    """
+    )
+    return
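+
+
+@app.cell
+def _(qg_samples_dot_mutations):
+    # Sketch: (3.2) above and (3.4) below both rebuild the same four labels
+    # with nested conditional comprehensions; a single helper like this would
+    # keep the two analyses in sync.
+    def classify_sample(sample_id):
+        is_qg = sample_id in qg_samples_dot_mutations
+        is_sera = "Sera" in sample_id
+        if is_qg:
+            return "QG-Sera" if is_sera else "QG-Clinical"
+        return "Serenomica-Sera" if is_sera else "Serenomica-Clinical"
+    return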
+
+
+@app.cell
+def _(mo):
+    mo.md(r"""### (3.4) Global Coverage""")
+    return
+
+
+@app.cell
+def _(all_depth, black_mut_ids, pd, qg_samples_dot_mutations):
+    # coverage over thresholds
+    non_black = [i for i in all_depth.index if i not in black_mut_ids]
+    df_depth_by_thresh = pd.DataFrame()
+    for d in [250, 500, 1000, 2000, 5000, 10000]:
+        d_data_all = pd.DataFrame(
+            (all_depth > d).sum(axis=0) / all_depth.shape[0],
+            columns=[f"coverage_gt_{d}_all"],
+        )
+        # proportion over the matching (non-blacklisted) denominator
+        d_data = pd.DataFrame(
+            (all_depth.loc[non_black, :] > d).sum(axis=0) / len(non_black),
+            columns=[f"coverage_gt_{d}_wo_blacklisted"],
+        )
+        df_depth_by_thresh = pd.concat(
+            [df_depth_by_thresh, d_data, d_data_all], axis=1
+        )
+    df_depth_by_thresh["sample_type"] = [
+        "Serenomica-Sera"
+        if ("Sera" in i) and (i not in qg_samples_dot_mutations)
+        else "QG-Sera"
+        if (i in qg_samples_dot_mutations) and ("Sera" in i)
+        else "Serenomica-Clinical"
+        if i not in qg_samples_dot_mutations
+        else "QG-Clinical"
+        for i in df_depth_by_thresh.index
+    ]
+    df_depth_by_thresh
+    return (df_depth_by_thresh,)
+
+
+@app.cell
+def _(df_depth_by_thresh, pd):
+    df_depth_by_thresh_melt = pd.melt(
+        df_depth_by_thresh.reset_index(), id_vars=["index", "sample_type"]
+    )
+    return (df_depth_by_thresh_melt,)
+
+
+@app.cell
+def _(df_depth_by_thresh, mo):
+    ms_gt_dep_thresh = mo.ui.multiselect(options=df_depth_by_thresh.columns)
+    mo.hstack([ms_gt_dep_thresh])
+    return (ms_gt_dep_thresh,)
+
+
+@app.cell
+def _(df_depth_by_thresh_melt, ms_gt_dep_thresh, px):
+    fig3 = px.box(
+        df_depth_by_thresh_melt.loc[
+            df_depth_by_thresh_melt["variable"].isin(ms_gt_dep_thresh.value)
+        ],
+        x="variable",
+        y="value",
+        color="sample_type",
+        points="all",
+        template="simple_white",
+        labels={"variable": "", "value": "Prop of sc sites > threshold per sample"},
+        hover_data=["index"],
+    )
+    fig3.update_yaxes(showgrid=True)
+    fig3.show()
+    return
+
+
+@app.cell
+def _(mo):
+    mo.md(
+        r"""
+    ### (3.5) Missed Targets
+
+    Look into variants whose depth could only support detection of VAFs of 1% or higher (i.e. fewer than 200 reads).
+    """
+    )
+    return
+
+
+@app.cell
+def _(black_mut_ids, mutations_depth):
+    # remove blacklisted variants for this analysis
+    mutations_depth_wo_black = mutations_depth.loc[
+        [i for i in mutations_depth.index if i not in black_mut_ids], :
+    ].copy()
+    return (mutations_depth_wo_black,)
+
+
+@app.cell
+def _(clinicals, mutations_depth_wo_black, pd):
+    # missed non-blacklisted variants where theoretical detection is only >1%
+    # (i.e. 2 supporting reads out of fewer than 200)
+    variants_lost_global_coverage = pd.DataFrame(
+        (
+            (mutations_depth_wo_black.loc[:, clinicals] < 200).sum(axis=1)
+            / len(clinicals)
+        ).sort_values(ascending=False),
+        columns=["prop_cov_lt200"],
+    )
+    variants_lost_global_coverage
+    return (variants_lost_global_coverage,)
+
+
+@app.cell
+def _(variants_lost_global_coverage):
+    variants_lost_cov_ids = list(
+        variants_lost_global_coverage.loc[
+            variants_lost_global_coverage["prop_cov_lt200"] > 0.5
+        ].index
+    )
+    variants_lost_cov_ids
+    return (variants_lost_cov_ids,)
+
+
+@app.cell
+def _(mo):
+    mo.md(
+        r"""
+    ### (3.6) Poorly covered for low-VAF detection
+
+    Since we require at least 2 supporting reads before a variant is reported, this analysis looks at variants with read depth below 1000, the minimum depth at which a VAF as low as 0.2% can be detected.
+    """
+    )
+    return
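+
+
+@app.cell
+def _():
+    # Worked check of the 2-read rule used in (3.5) and (3.6): requiring at
+    # least 2 supporting reads means the lowest reportable VAF at depth d is
+    # 2 / d, so 200 reads -> 1% and 1000 reads -> 0.2%.
+    for _d in (200, 1000, 10000):
+        print(f"depth {_d:>6}: minimum detectable VAF = {2 / _d:.2%}")
+    return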
+
+
+@app.cell
+def _(clinicals, mutations_depth_wo_black, pd):
+    # variants that, in >75% of samples, lack the coverage for a theoretical
+    # report at >= 0.2% VAF (i.e. 2 supporting reads out of at least 1000)
+    variants_low_global_coverage = pd.DataFrame(
+        (
+            (mutations_depth_wo_black.loc[:, clinicals] < 1000).sum(axis=1)
+            / len(clinicals)
+        ).sort_values(ascending=False),
+        columns=["prop_cov_lt1000"],
+    )
+
+    variants_low_cov_ids = list(
+        variants_low_global_coverage.loc[
+            variants_low_global_coverage["prop_cov_lt1000"] > 0.75
+        ].index
+    )
+    return (variants_low_cov_ids,)
+
+
+@app.cell
+def _():
+    return
+
+
+@app.cell
+def _(mo):
+    mo.md(
+        r"""
+    ### (3.7) Overly covered targets
+
+    Check whether any supercolumn targets are over-represented in the reads.
+    """
+    )
+    return
+
+
+@app.cell
+def _(clinicals, mutations_depth_wo_black, pd):
+    mean_cov = pd.DataFrame(
+        mutations_depth_wo_black.loc[:, clinicals]
+        .mean(axis=1)
+        .sort_values(ascending=False),
+        columns=["mean_cov"],
+    )
+    mean_cov
+    return (mean_cov,)
+
+
+@app.cell
+def _(mean_cov, mo):
+    slider = mo.ui.slider(
+        start=100,
+        stop=len(mean_cov),
+        step=200,
+        label="Number of variants to show",
+        show_value=True,
+    )
+    mo.hstack([slider])
+    return (slider,)
+
+
+@app.cell
+def _(mean_cov, px, slider):
+    fig4 = px.bar(mean_cov.iloc[: slider.value, :], template="simple_white")
+    fig4.show()
+    return
+
+
+@app.cell
+def _(clinicals, mutations_depth_wo_black, pd):
+    variants_high_global_coverage = pd.DataFrame(
+        (
+            (mutations_depth_wo_black.loc[:, clinicals] > 10000).sum(axis=1)
+            / len(clinicals)
+        ).sort_values(ascending=False),
+        columns=["prop_cov_gt10000"],
+    )
+
+    variants_high_cov_ids = list(
+        variants_high_global_coverage.loc[
+            variants_high_global_coverage["prop_cov_gt10000"] > 0.75
+        ].index
+    )
+    return (variants_high_cov_ids,)
+
+
+@app.cell
+def _(sc, variants_high_cov_ids, variants_lost_cov_ids, variants_low_cov_ids):
+    vars_of_interest = sc.loc[
+        sc["mutation_id"].isin(
+            variants_lost_cov_ids + variants_low_cov_ids + variants_high_cov_ids
+        )
+    ][
+        [
+            "chrom",
+            "position",
+            "mutation_id",
+            "gene",
+            "HGVS",
+            "Consequence",
+            "region_type",
+            "blacklisted",
+            "cancer_fda",
+            "FDA_drugs",
+            "cancer_clinvar",
+            "cancer_rank_prevalence_combined",
+        ]
+    ].copy()
+    vars_of_interest["coverage_issue"] = [
+        "lost"
+        if i in variants_lost_cov_ids
+        else "poorly_covered"
+        if i in variants_low_cov_ids
+        else "highly_covered"
+        if i in variants_high_cov_ids
+        else ""
+        for i in vars_of_interest["mutation_id"]
+    ]
+    vars_of_interest.to_csv(
+        "../data/coverage_issue_variants.tsv", sep="\t", index=False
+    )
+    return