From ab15df85adfd25f53ef33f5863439fa7a172ba05 Mon Sep 17 00:00:00 2001 From: Darren Wight Date: Sun, 21 Sep 2025 22:26:09 +0200 Subject: [PATCH] feat: added code for eda with marimo --- notebooks/initial_eda_lb_20250919.py | 234 +++++++++++++++++++++++++++ 1 file changed, 234 insertions(+) create mode 100644 notebooks/initial_eda_lb_20250919.py diff --git a/notebooks/initial_eda_lb_20250919.py b/notebooks/initial_eda_lb_20250919.py new file mode 100644 index 0000000..a741d34 --- /dev/null +++ b/notebooks/initial_eda_lb_20250919.py @@ -0,0 +1,234 @@ +import marimo + +__generated_with = "0.14.16" +app = marimo.App(width="medium") + + +@app.cell +def _(): + import os + import itertools + import marimo as mo + import pandas as pd + import plotly.express as px + import plotly.graph_objects as go + import matplotlib as plt + import seaborn as sns + import yaml + return itertools, mo, os, pd, px, yaml + + +@app.cell +def _(mo): + mo.md( + r""" + # Initial EDA of the Twist LB data (19-Sep-2025) + + So far no analysis have been done on the data from the LB assay. This analysis will take the first look. + + Analysis ideas: + + - Stats data analysis - box plots and outliers + - Mutations - agg stats -> variants called, coverage of called variants in supercol, common called variants, unique variants + - vcfs - agg stats -> variants called, coverage of called variants in supercol, common called variants + """ + ) + return + + +@app.cell +def _(): + se1_dir = "/home/darren/Documents/4_data/3_internal/2_lb/se1-prd-2.1.1/" + se2_dir = "/home/darren/Documents/4_data/3_internal/2_lb/se2-lb-1/" + + qc_hist_dir = ( + "/home/darren/Documents/2_repos/serenomica/flowcell_qc/historical_data/" + ) + return qc_hist_dir, se1_dir, se2_dir + + +@app.cell +def _(itertools, os): + def return_paths(suffix, *dirs): + return list( + itertools.chain.from_iterable( + [ + [ + os.path.join(d, f) + for f in os.listdir(d) + if f.endswith(suffix) + ] + for d in dirs + ] + ) + ) + return (return_paths,) + + +@app.cell +def _(return_paths, se1_dir, se2_dir): + stats_fpaths = return_paths(".stats", se1_dir, se2_dir) + print(len(stats_fpaths)) + mutations_fpaths = return_paths(".mutations", se1_dir, se2_dir) + print(len(mutations_fpaths)) + return mutations_fpaths, stats_fpaths + + +@app.cell +def _(pd, stats_fpaths, yaml): + stats = pd.DataFrame() + for sf in stats_fpaths: + with open(sf, "r") as handle: + stats = pd.concat( + [ + stats, + pd.json_normalize(yaml.safe_load(handle), sep=" / ").rename( + {0: sf.split("/")[-1].strip(".stats")}, axis=0 + ), + ], + axis=0, + ) + stats["sample_type"] = [ + "Serenomica-Sera" if "Sera" in i else "Serenomica-Clinical" + for i in stats.index + ] + stats["source"] = "Serenomica" + stats = stats.drop( + ["miscellaneous / stats_file_version", "miscellaneous / target_panel_bed"], + axis=1, + ) + return (stats,) + + +@app.cell +def _(mutations_fpaths, pd): + cols = [] + mutations_vaf, mutations_counts, mutations_depth = ( + pd.DataFrame(), + pd.DataFrame(), + pd.DataFrame(), + ) + for mf in mutations_fpaths: + tmp = pd.read_csv(mf, sep="\t", header=None) + tmp["mutation_id"] = tmp.apply( + lambda x: "_".join([str(x[0]), str(x[1]), str(x[2])]), axis=1 + ) + tmp.set_index("mutation_id", inplace=True) + + cols.append(mf.split("/")[-1].strip(".mutations")) + mutations_vaf = pd.concat([mutations_vaf, tmp[3]], axis=1) + mutations_counts = pd.concat([mutations_counts, tmp[4]], axis=1) + mutations_depth = pd.concat([mutations_depth, tmp[5]], axis=1) + + mutations_vaf.columns = cols + mutations_counts.columns = cols + mutations_depth.columns = cols + + mutations_vaf + return (mutations_vaf,) + + +@app.cell +def _(os, pd, qc_hist_dir): + # QG historical data + qg_hist_stats = pd.read_csv( + os.path.join(qc_hist_dir, "historical_data_LB.csv"), index_col=0 + ) + qg_hist_stats.columns = [i.replace("|", " / ") for i in qg_hist_stats.columns] + qg_hist_stats = qg_hist_stats.drop(["batch"], axis=1) + qg_hist_stats["sample_type"] = [ + "QG-Sera" if "Sera" in i else "QG-Clinical" for i in qg_hist_stats.index + ] + qg_hist_stats["source"] = "QG" + qg_hist_stats + return (qg_hist_stats,) + + +@app.cell +def _(pd, qg_hist_stats, stats): + # ensure columns match + assert set(qg_hist_stats.columns) == set(stats.columns) + + all_stats = pd.concat([stats, qg_hist_stats], axis=0) + all_stats.head() + return (all_stats,) + + +@app.cell +def _(mo): + mo.md(r"""## (1) Stats""") + return + + +@app.cell +def _(all_stats, pd): + stats_melt = pd.melt( + all_stats.reset_index(), id_vars=["index", "sample_type", "source"] + ) + stats_melt.head() + return (stats_melt,) + + +@app.cell +def _(mo, stats): + multiselect = mo.ui.multiselect(options=stats.columns) + mo.hstack([multiselect]) + return (multiselect,) + + +@app.cell +def _(multiselect, px, stats_melt): + fig = px.box( + stats_melt.loc[stats_melt["variable"].isin(multiselect.value)], + x="variable", + y="value", + color="sample_type", + points="all", + template="simple_white", + labels={"variable": "", "value": "Value from stats file"}, + hover_data=["index", "source"], + ) + fig.update_yaxes(showgrid=True) + fig.show() + return + + +@app.cell +def _(mo): + mo.md( + r""" + ## (2) .mutations Files + + Analysis ideas💡 + + - Always/majority called - potentially suspicious + - Never called + - High VAF targets + - Poorly covered targets + - Overly covered targets + """ + ) + return + + +@app.cell +def _(mutations_vaf): + ((mutations_vaf == 0).sum(axis=1) / len(mutations_vaf.columns)).sort_values( + ascending=False + ) + return + + +@app.cell +def _(mutations_vaf): + mutations_vaf.shape + return + + +@app.cell +def _(): + return + + +if __name__ == "__main__": + app.run()