feat: added code for eda with marimo

This commit is contained in:
2025-09-21 22:26:09 +02:00
parent a433e4df51
commit ab15df85ad

View File

@@ -0,0 +1,234 @@
import marimo
# marimo version stamp stored alongside the notebook code.
__generated_with = "0.14.16"
# Notebook application object; each cell below is registered through @app.cell.
app = marimo.App(width="medium")
@app.cell
def _():
    # Shared imports for the notebook. Only the names listed in the return
    # tuple are handed on to other cells.
    import os
    import itertools
    import marimo as mo
    import pandas as pd
    import plotly.express as px
    import plotly.graph_objects as go
    # `plt` has to alias the pyplot interface: the bare `matplotlib` package
    # does not provide plotting helpers such as `plt.subplots()`.
    import matplotlib.pyplot as plt
    import seaborn as sns
    import yaml
    return itertools, mo, os, pd, px, yaml
@app.cell
def _(mo):
    # Notebook introduction: title, context and the list of planned analyses.
    _intro_md = r"""
# Initial EDA of the Twist LB data (19-Sep-2025)
So far no analysis have been done on the data from the LB assay. This analysis will take the first look.
Analysis ideas:
- Stats data analysis - box plots and outliers
- Mutations - agg stats -> variants called, coverage of called variants in supercol, common called variants, unique variants
- vcfs - agg stats -> variants called, coverage of called variants in supercol, common called variants
"""
    # Keep the rendered markdown as the last expression so it is the cell output.
    mo.md(_intro_md)
    return
@app.cell
def _():
    # Hard-coded input directories (absolute paths on the author's machine):
    # the flowcell QC historical data folder and the two "se1"/"se2" data
    # folders that are scanned for per-sample files.
    qc_hist_dir = "/home/darren/Documents/2_repos/serenomica/flowcell_qc/historical_data/"
    se1_dir = "/home/darren/Documents/4_data/3_internal/2_lb/se1-prd-2.1.1/"
    se2_dir = "/home/darren/Documents/4_data/3_internal/2_lb/se2-lb-1/"
    return qc_hist_dir, se1_dir, se2_dir
@app.cell
def _(itertools, os):
    def return_paths(suffix, *dirs):
        """Return the paths of all files ending in `suffix` inside `dirs`.

        Each directory is listed non-recursively with `os.listdir`, in the
        order the directories are given, and the matches are returned as one
        flat list of `os.path.join(directory, filename)` strings.
        """

        def _matching(folder):
            # Entries of a single directory that carry the requested suffix.
            return [
                os.path.join(folder, entry)
                for entry in os.listdir(folder)
                if entry.endswith(suffix)
            ]

        return list(itertools.chain.from_iterable(map(_matching, dirs)))

    return (return_paths,)
@app.cell
def _(return_paths, se1_dir, se2_dir):
    # Collect the .stats and .mutations files from both data directories and
    # print how many of each were found.
    _data_dirs = (se1_dir, se2_dir)
    stats_fpaths = return_paths(".stats", *_data_dirs)
    print(len(stats_fpaths))
    mutations_fpaths = return_paths(".mutations", *_data_dirs)
    print(len(mutations_fpaths))
    return mutations_fpaths, stats_fpaths
@app.cell
def _(pd, stats_fpaths, yaml):
    # Build one row per .stats file: parse the YAML, flatten nested keys into
    # "section / key" column names and label the row with the sample name.
    _rows = []
    for sf in stats_fpaths:
        # removesuffix drops only the extension. str.strip(".stats") would
        # remove any of the characters ".", "s", "t", "a" from BOTH ends of
        # the name (e.g. "Sera.stats" -> "Ser") and corrupt sample names.
        _sample = sf.split("/")[-1].removesuffix(".stats")
        with open(sf, "r") as handle:
            _row = pd.json_normalize(yaml.safe_load(handle), sep=" / ")
        # json_normalize yields a single row labelled 0; relabel it.
        _rows.append(_row.rename({0: _sample}, axis=0))
    # Concatenate once instead of re-copying the growing frame per file.
    stats = pd.concat(_rows, axis=0) if _rows else pd.DataFrame()
    # Samples whose name contains "Sera" are tagged separately from the rest.
    stats["sample_type"] = [
        "Serenomica-Sera" if "Sera" in i else "Serenomica-Clinical"
        for i in stats.index
    ]
    stats["source"] = "Serenomica"
    # These two columns are removed before the stats are compared/combined.
    stats = stats.drop(
        ["miscellaneous / stats_file_version", "miscellaneous / target_panel_bed"],
        axis=1,
    )
    return (stats,)
@app.cell
def _(mutations_fpaths, pd):
    # Build three mutation-by-sample tables from the headerless, tab-separated
    # .mutations files: file column 3 goes into mutations_vaf, column 4 into
    # mutations_counts and column 5 into mutations_depth.
    cols = []
    _vaf, _counts, _depth = [], [], []
    for mf in mutations_fpaths:
        tmp = pd.read_csv(mf, sep="\t", header=None)
        # Row key: the first three columns joined with "_". Built column-wise
        # rather than with a row-wise DataFrame.apply.
        tmp["mutation_id"] = [
            "_".join(map(str, _key)) for _key in zip(tmp[0], tmp[1], tmp[2])
        ]
        tmp = tmp.set_index("mutation_id")
        # removesuffix drops only the extension. str.strip(".mutations") would
        # remove any of those characters from BOTH ends of the name
        # (e.g. "tumor_a.mutations" -> "r_").
        cols.append(mf.split("/")[-1].removesuffix(".mutations"))
        _vaf.append(tmp[3])
        _counts.append(tmp[4])
        _depth.append(tmp[5])

    def _combine(series_list):
        # Align the per-sample Series on mutation_id, one column per sample.
        if not series_list:
            return pd.DataFrame()
        combined = pd.concat(series_list, axis=1)
        combined.columns = cols
        return combined

    mutations_vaf = _combine(_vaf)
    mutations_counts = _combine(_counts)
    mutations_depth = _combine(_depth)
    # Last expression: display the VAF table as the cell output.
    mutations_vaf
    return (mutations_vaf,)
@app.cell
def _(os, pd, qc_hist_dir):
    # QG historical data: load the CSV (first column as index), replace "|"
    # in the column names with " / ", drop the "batch" column and tag rows.
    _csv_path = os.path.join(qc_hist_dir, "historical_data_LB.csv")
    qg_hist_stats = (
        pd.read_csv(_csv_path, index_col=0)
        .rename(columns=lambda _name: _name.replace("|", " / "))
        .drop(columns=["batch"])
    )
    # Samples whose index label contains "Sera" are tagged separately.
    _labels = []
    for _sample in qg_hist_stats.index:
        _labels.append("QG-Sera" if "Sera" in _sample else "QG-Clinical")
    qg_hist_stats["sample_type"] = _labels
    qg_hist_stats["source"] = "QG"
    # Last expression: display the table as the cell output.
    qg_hist_stats
    return (qg_hist_stats,)
@app.cell
def _(pd, qg_hist_stats, stats):
    # Both tables must expose exactly the same columns before being stacked.
    # An explicit check (rather than a bare assert) still runs under
    # `python -O` and reports which columns are present on one side only.
    _mismatch = set(qg_hist_stats.columns) ^ set(stats.columns)
    if _mismatch:
        raise ValueError(
            "Column mismatch between stats and qg_hist_stats: "
            f"{sorted(map(str, _mismatch))}"
        )
    all_stats = pd.concat([stats, qg_hist_stats], axis=0)
    # Last expression: preview the combined table as the cell output.
    all_stats.head()
    return (all_stats,)
@app.cell
def _(mo):
    # Section heading for the stats analysis.
    _heading_md = r"""## (1) Stats"""
    mo.md(_heading_md)
    return
@app.cell
def _(all_stats, pd):
    # Reshape to long format: one row per (sample, variable) pair, keeping the
    # sample label ("index"), sample_type and source as identifier columns.
    _wide = all_stats.reset_index()
    _id_columns = ["index", "sample_type", "source"]
    stats_melt = pd.melt(_wide, id_vars=_id_columns)
    # Last expression: preview the long table as the cell output.
    stats_melt.head()
    return (stats_melt,)
@app.cell
def _(mo, stats):
    # Multi-select widget offering the columns of `stats` as choices.
    _choices = stats.columns
    multiselect = mo.ui.multiselect(options=_choices)
    # Last expression: show the widget as the cell output.
    mo.hstack([multiselect])
    return (multiselect,)
@app.cell
def _(multiselect, px, stats_melt):
    # Box plot of the variables currently picked in the multi-select widget,
    # coloured by sample type, with every individual point drawn.
    _is_selected = stats_melt["variable"].isin(multiselect.value)
    _selected_rows = stats_melt.loc[_is_selected]
    fig = px.box(
        _selected_rows,
        x="variable",
        y="value",
        color="sample_type",
        points="all",
        hover_data=["index", "source"],
        labels={"variable": "", "value": "Value from stats file"},
        template="simple_white",
    )
    fig.update_yaxes(showgrid=True)
    fig.show()
    return
@app.cell
def _(mo):
    # Section heading and analysis ideas for the .mutations files.
    _section_md = r"""
## (2) .mutations Files
Analysis ideas💡
- Always/majority called - potentially suspicious
- Never called
- High VAF targets
- Poorly covered targets
- Overly covered targets
"""
    # Keep the rendered markdown as the last expression so it is the cell output.
    mo.md(_section_md)
    return
@app.cell
def _(mutations_vaf):
    # Per row: the fraction of columns whose value equals 0, listed from the
    # highest fraction to the lowest.
    _zero_counts = (mutations_vaf == 0).sum(axis=1)
    _n_columns = len(mutations_vaf.columns)
    _zero_fraction = _zero_counts / _n_columns
    # Last expression: display the sorted fractions as the cell output.
    _zero_fraction.sort_values(ascending=False)
    return
@app.cell
def _(mutations_vaf):
    # Display the (rows, columns) dimensions of the VAF table.
    _dimensions = mutations_vaf.shape
    _dimensions
    return
@app.cell
def _():
    # Empty placeholder cell: takes no inputs and defines nothing.
    return
# Run the notebook app when this file is executed directly as a script.
if __name__ == "__main__":
    app.run()