import marimo __generated_with = "0.14.16" app = marimo.App(width="medium") @app.cell def _(): import os import itertools import marimo as mo import pandas as pd import plotly.express as px import plotly.graph_objects as go import matplotlib as plt import seaborn as sns import yaml return itertools, mo, os, pd, px, yaml @app.cell def _(mo): mo.md( r""" # Initial EDA of the Twist LB data (19-Sep-2025) So far no analysis have been done on the data from the LB assay. This analysis will take the first look. Analysis ideas: - Stats data analysis - box plots and outliers - Mutations - agg stats -> variants called, coverage of called variants in supercol, common called variants, unique variants - vcfs - agg stats -> variants called, coverage of called variants in supercol, common called variants """ ) return @app.cell def _(): se1_dir = "/home/darren/Documents/4_data/3_internal/2_lb/se1-prd-2.1.1/" se2_dir = "/home/darren/Documents/4_data/3_internal/2_lb/se2-lb-1/" qc_hist_dir = ( "/home/darren/Documents/2_repos/serenomica/flowcell_qc/historical_data/" ) return qc_hist_dir, se1_dir, se2_dir @app.cell def _(itertools, os): def return_paths(suffix, *dirs): return list( itertools.chain.from_iterable( [ [ os.path.join(d, f) for f in os.listdir(d) if f.endswith(suffix) ] for d in dirs ] ) ) return (return_paths,) @app.cell def _(return_paths, se1_dir, se2_dir): stats_fpaths = return_paths(".stats", se1_dir, se2_dir) print(len(stats_fpaths)) mutations_fpaths = return_paths(".mutations", se1_dir, se2_dir) print(len(mutations_fpaths)) return mutations_fpaths, stats_fpaths @app.cell def _(pd, stats_fpaths, yaml): stats = pd.DataFrame() for sf in stats_fpaths: with open(sf, "r") as handle: stats = pd.concat( [ stats, pd.json_normalize(yaml.safe_load(handle), sep=" / ").rename( {0: sf.split("/")[-1].strip(".stats")}, axis=0 ), ], axis=0, ) stats["sample_type"] = [ "Serenomica-Sera" if "Sera" in i else "Serenomica-Clinical" for i in stats.index ] stats["source"] = "Serenomica" stats = stats.drop( ["miscellaneous / stats_file_version", "miscellaneous / target_panel_bed"], axis=1, ) return (stats,) @app.cell def _(mutations_fpaths, pd): cols = [] mutations_vaf, mutations_counts, mutations_depth = ( pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), ) for mf in mutations_fpaths: tmp = pd.read_csv(mf, sep="\t", header=None) tmp["mutation_id"] = tmp.apply( lambda x: "_".join([str(x[0]), str(x[1]), str(x[2])]), axis=1 ) tmp.set_index("mutation_id", inplace=True) cols.append(mf.split("/")[-1].strip(".mutations")) mutations_vaf = pd.concat([mutations_vaf, tmp[3]], axis=1) mutations_counts = pd.concat([mutations_counts, tmp[4]], axis=1) mutations_depth = pd.concat([mutations_depth, tmp[5]], axis=1) mutations_vaf.columns = cols mutations_counts.columns = cols mutations_depth.columns = cols mutations_vaf return (mutations_vaf,) @app.cell def _(os, pd, qc_hist_dir): # QG historical data qg_hist_stats = pd.read_csv( os.path.join(qc_hist_dir, "historical_data_LB.csv"), index_col=0 ) qg_hist_stats.columns = [i.replace("|", " / ") for i in qg_hist_stats.columns] qg_hist_stats = qg_hist_stats.drop(["batch"], axis=1) qg_hist_stats["sample_type"] = [ "QG-Sera" if "Sera" in i else "QG-Clinical" for i in qg_hist_stats.index ] qg_hist_stats["source"] = "QG" qg_hist_stats return (qg_hist_stats,) @app.cell def _(pd, qg_hist_stats, stats): # ensure columns match assert set(qg_hist_stats.columns) == set(stats.columns) all_stats = pd.concat([stats, qg_hist_stats], axis=0) all_stats.head() return (all_stats,) @app.cell def _(mo): mo.md(r"""## (1) Stats""") return @app.cell def _(all_stats, pd): stats_melt = pd.melt( all_stats.reset_index(), id_vars=["index", "sample_type", "source"] ) stats_melt.head() return (stats_melt,) @app.cell def _(mo, stats): multiselect = mo.ui.multiselect(options=stats.columns) mo.hstack([multiselect]) return (multiselect,) @app.cell def _(multiselect, px, stats_melt): fig = px.box( stats_melt.loc[stats_melt["variable"].isin(multiselect.value)], x="variable", y="value", color="sample_type", points="all", template="simple_white", labels={"variable": "", "value": "Value from stats file"}, hover_data=["index", "source"], ) fig.update_yaxes(showgrid=True) fig.show() return @app.cell def _(mo): mo.md( r""" ## (2) .mutations Files Analysis ideas💡 - Always/majority called - potentially suspicious - Never called - High VAF targets - Poorly covered targets - Overly covered targets """ ) return @app.cell def _(mutations_vaf): ((mutations_vaf == 0).sum(axis=1) / len(mutations_vaf.columns)).sort_values( ascending=False ) return @app.cell def _(mutations_vaf): mutations_vaf.shape return @app.cell def _(): return if __name__ == "__main__": app.run()