feat: add marimo notebook for initial EDA of the Twist LB data
234
notebooks/initial_eda_lb_20250919.py
Normal file
@@ -0,0 +1,234 @@
import marimo

__generated_with = "0.14.16"
app = marimo.App(width="medium")


@app.cell
def _():
    import os
    import itertools
    import marimo as mo
    import pandas as pd
    import plotly.express as px
    import plotly.graph_objects as go
    import matplotlib.pyplot as plt  # pyplot, not the bare matplotlib package
    import seaborn as sns
    import yaml
    return itertools, mo, os, pd, px, yaml


@app.cell
def _(mo):
    mo.md(
        r"""
    # Initial EDA of the Twist LB data (19-Sep-2025)

    So far, no analysis has been done on the data from the LB assay; this notebook takes a first look.

    Analysis ideas:

    - Stats data analysis: box plots and outliers
    - Mutations: aggregate stats -> variants called, coverage of called variants in supercol, common called variants, unique variants
    - VCFs: aggregate stats -> variants called, coverage of called variants in supercol, common called variants
    """
    )
    return


@app.cell
def _():
    se1_dir = "/home/darren/Documents/4_data/3_internal/2_lb/se1-prd-2.1.1/"
    se2_dir = "/home/darren/Documents/4_data/3_internal/2_lb/se2-lb-1/"

    qc_hist_dir = (
        "/home/darren/Documents/2_repos/serenomica/flowcell_qc/historical_data/"
    )
    return qc_hist_dir, se1_dir, se2_dir


@app.cell
def _(itertools, os):
    def return_paths(suffix, *dirs):
        """Return the paths of all files in the given directories that end with `suffix`."""
        return list(
            itertools.chain.from_iterable(
                [
                    [
                        os.path.join(d, f)
                        for f in os.listdir(d)
                        if f.endswith(suffix)
                    ]
                    for d in dirs
                ]
            )
        )
    return (return_paths,)


@app.cell
def _(return_paths, se1_dir, se2_dir):
    stats_fpaths = return_paths(".stats", se1_dir, se2_dir)
    print(len(stats_fpaths))
    mutations_fpaths = return_paths(".mutations", se1_dir, se2_dir)
    print(len(mutations_fpaths))
    return mutations_fpaths, stats_fpaths


@app.cell
def _(pd, stats_fpaths, yaml):
    stats = pd.DataFrame()
    for sf in stats_fpaths:
        with open(sf, "r") as handle:
            # .removesuffix rather than .strip: strip(".stats") would trim any of
            # the characters ".sta" from both ends of the sample name
            stats = pd.concat(
                [
                    stats,
                    pd.json_normalize(yaml.safe_load(handle), sep=" / ").rename(
                        {0: sf.split("/")[-1].removesuffix(".stats")}, axis=0
                    ),
                ],
                axis=0,
            )
    stats["sample_type"] = [
        "Serenomica-Sera" if "Sera" in i else "Serenomica-Clinical"
        for i in stats.index
    ]
    stats["source"] = "Serenomica"
    stats = stats.drop(
        ["miscellaneous / stats_file_version", "miscellaneous / target_panel_bed"],
        axis=1,
    )
    return (stats,)


@app.cell
def _(mutations_fpaths, pd):
    cols = []
    mutations_vaf, mutations_counts, mutations_depth = (
        pd.DataFrame(),
        pd.DataFrame(),
        pd.DataFrame(),
    )
    for mf in mutations_fpaths:
        # columns 0-2 identify the mutation; columns 3, 4 and 5 hold VAF,
        # counts and depth respectively
        tmp = pd.read_csv(mf, sep="\t", header=None)
        tmp["mutation_id"] = tmp.apply(
            lambda x: "_".join([str(x[0]), str(x[1]), str(x[2])]), axis=1
        )
        tmp.set_index("mutation_id", inplace=True)

        cols.append(mf.split("/")[-1].removesuffix(".mutations"))
        mutations_vaf = pd.concat([mutations_vaf, tmp[3]], axis=1)
        mutations_counts = pd.concat([mutations_counts, tmp[4]], axis=1)
        mutations_depth = pd.concat([mutations_depth, tmp[5]], axis=1)

    mutations_vaf.columns = cols
    mutations_counts.columns = cols
    mutations_depth.columns = cols

    mutations_vaf
    return (mutations_vaf,)


@app.cell
def _(os, pd, qc_hist_dir):
    # QG historical data
    qg_hist_stats = pd.read_csv(
        os.path.join(qc_hist_dir, "historical_data_LB.csv"), index_col=0
    )
    qg_hist_stats.columns = [i.replace("|", " / ") for i in qg_hist_stats.columns]
    qg_hist_stats = qg_hist_stats.drop(["batch"], axis=1)
    qg_hist_stats["sample_type"] = [
        "QG-Sera" if "Sera" in i else "QG-Clinical" for i in qg_hist_stats.index
    ]
    qg_hist_stats["source"] = "QG"
    qg_hist_stats
    return (qg_hist_stats,)


@app.cell
def _(pd, qg_hist_stats, stats):
    # ensure columns match
    assert set(qg_hist_stats.columns) == set(stats.columns)

    all_stats = pd.concat([stats, qg_hist_stats], axis=0)
    all_stats.head()
    return (all_stats,)


@app.cell
def _(mo):
    mo.md(r"""## (1) Stats""")
    return


@app.cell
def _(all_stats, pd):
    stats_melt = pd.melt(
        all_stats.reset_index(), id_vars=["index", "sample_type", "source"]
    )
    stats_melt.head()
    return (stats_melt,)


@app.cell
def _(mo, stats):
    # metric selector; the selection drives the box plot in the next cell
    multiselect = mo.ui.multiselect(options=stats.columns)
    mo.hstack([multiselect])
    return (multiselect,)


@app.cell
def _(multiselect, px, stats_melt):
    fig = px.box(
        stats_melt.loc[stats_melt["variable"].isin(multiselect.value)],
        x="variable",
        y="value",
        color="sample_type",
        points="all",
        template="simple_white",
        labels={"variable": "", "value": "Value from stats file"},
        hover_data=["index", "source"],
    )
    fig.update_yaxes(showgrid=True)
    fig.show()
    return


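@app.cell
def _(pd, stats_melt):
    # Hedged sketch of the "box plots and outliers" idea from the intro: flag
    # values lying outside 1.5 * IQR of their metric. The 1.5 multiplier, the
    # numeric coercion and the name `stats_outliers` are assumptions for this
    # sketch, not choices made elsewhere in the notebook.
    _vals = pd.to_numeric(stats_melt["value"], errors="coerce")
    _num = stats_melt.assign(value=_vals).dropna(subset=["value"])
    _q = _num.groupby("variable")["value"].quantile([0.25, 0.75]).unstack()
    _iqr = _q[0.75] - _q[0.25]
    _lower = _num["variable"].map(_q[0.25] - 1.5 * _iqr)
    _upper = _num["variable"].map(_q[0.75] + 1.5 * _iqr)
    stats_outliers = _num.loc[(_num["value"] < _lower) | (_num["value"] > _upper)]
    stats_outliers
    return (stats_outliers,)

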
@app.cell
def _(mo):
    mo.md(
        r"""
    ## (2) .mutations Files

    Analysis ideas 💡

    - Always/majority called (potentially suspicious)
    - Never called (see the zero-VAF fractions below)
    - High VAF targets (see the sketch cell below)
    - Poorly covered targets
    - Overly covered targets
    """
    )
    return


@app.cell
def _(mutations_vaf):
    # fraction of samples in which each target has VAF == 0 ("never called" when 1.0)
    ((mutations_vaf == 0).sum(axis=1) / len(mutations_vaf.columns)).sort_values(
        ascending=False
    )
    return


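@app.cell
def _(mutations_vaf):
    # Hedged sketch of the "High VAF targets" idea: the highest VAF observed for
    # each target across samples, largest first. The head(20) cut-off is an
    # assumption for display only.
    mutations_vaf.max(axis=1).sort_values(ascending=False).head(20)
    return

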
@app.cell
def _(mutations_vaf):
    mutations_vaf.shape
    return


@app.cell
def _():
    return


if __name__ == "__main__":
    app.run()