feat: added code for eda with marimo
This commit is contained in:
234
notebooks/initial_eda_lb_20250919.py
Normal file
234
notebooks/initial_eda_lb_20250919.py
Normal file
@@ -0,0 +1,234 @@
|
||||
import marimo
|
||||
|
||||
__generated_with = "0.14.16"
|
||||
app = marimo.App(width="medium")
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
import os
|
||||
import itertools
|
||||
import marimo as mo
|
||||
import pandas as pd
|
||||
import plotly.express as px
|
||||
import plotly.graph_objects as go
|
||||
import matplotlib as plt
|
||||
import seaborn as sns
|
||||
import yaml
|
||||
return itertools, mo, os, pd, px, yaml
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""
|
||||
# Initial EDA of the Twist LB data (19-Sep-2025)
|
||||
|
||||
So far no analysis have been done on the data from the LB assay. This analysis will take the first look.
|
||||
|
||||
Analysis ideas:
|
||||
|
||||
- Stats data analysis - box plots and outliers
|
||||
- Mutations - agg stats -> variants called, coverage of called variants in supercol, common called variants, unique variants
|
||||
- vcfs - agg stats -> variants called, coverage of called variants in supercol, common called variants
|
||||
"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
se1_dir = "/home/darren/Documents/4_data/3_internal/2_lb/se1-prd-2.1.1/"
|
||||
se2_dir = "/home/darren/Documents/4_data/3_internal/2_lb/se2-lb-1/"
|
||||
|
||||
qc_hist_dir = (
|
||||
"/home/darren/Documents/2_repos/serenomica/flowcell_qc/historical_data/"
|
||||
)
|
||||
return qc_hist_dir, se1_dir, se2_dir
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(itertools, os):
|
||||
def return_paths(suffix, *dirs):
|
||||
return list(
|
||||
itertools.chain.from_iterable(
|
||||
[
|
||||
[
|
||||
os.path.join(d, f)
|
||||
for f in os.listdir(d)
|
||||
if f.endswith(suffix)
|
||||
]
|
||||
for d in dirs
|
||||
]
|
||||
)
|
||||
)
|
||||
return (return_paths,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(return_paths, se1_dir, se2_dir):
|
||||
stats_fpaths = return_paths(".stats", se1_dir, se2_dir)
|
||||
print(len(stats_fpaths))
|
||||
mutations_fpaths = return_paths(".mutations", se1_dir, se2_dir)
|
||||
print(len(mutations_fpaths))
|
||||
return mutations_fpaths, stats_fpaths
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(pd, stats_fpaths, yaml):
|
||||
stats = pd.DataFrame()
|
||||
for sf in stats_fpaths:
|
||||
with open(sf, "r") as handle:
|
||||
stats = pd.concat(
|
||||
[
|
||||
stats,
|
||||
pd.json_normalize(yaml.safe_load(handle), sep=" / ").rename(
|
||||
{0: sf.split("/")[-1].strip(".stats")}, axis=0
|
||||
),
|
||||
],
|
||||
axis=0,
|
||||
)
|
||||
stats["sample_type"] = [
|
||||
"Serenomica-Sera" if "Sera" in i else "Serenomica-Clinical"
|
||||
for i in stats.index
|
||||
]
|
||||
stats["source"] = "Serenomica"
|
||||
stats = stats.drop(
|
||||
["miscellaneous / stats_file_version", "miscellaneous / target_panel_bed"],
|
||||
axis=1,
|
||||
)
|
||||
return (stats,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mutations_fpaths, pd):
|
||||
cols = []
|
||||
mutations_vaf, mutations_counts, mutations_depth = (
|
||||
pd.DataFrame(),
|
||||
pd.DataFrame(),
|
||||
pd.DataFrame(),
|
||||
)
|
||||
for mf in mutations_fpaths:
|
||||
tmp = pd.read_csv(mf, sep="\t", header=None)
|
||||
tmp["mutation_id"] = tmp.apply(
|
||||
lambda x: "_".join([str(x[0]), str(x[1]), str(x[2])]), axis=1
|
||||
)
|
||||
tmp.set_index("mutation_id", inplace=True)
|
||||
|
||||
cols.append(mf.split("/")[-1].strip(".mutations"))
|
||||
mutations_vaf = pd.concat([mutations_vaf, tmp[3]], axis=1)
|
||||
mutations_counts = pd.concat([mutations_counts, tmp[4]], axis=1)
|
||||
mutations_depth = pd.concat([mutations_depth, tmp[5]], axis=1)
|
||||
|
||||
mutations_vaf.columns = cols
|
||||
mutations_counts.columns = cols
|
||||
mutations_depth.columns = cols
|
||||
|
||||
mutations_vaf
|
||||
return (mutations_vaf,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(os, pd, qc_hist_dir):
|
||||
# QG historical data
|
||||
qg_hist_stats = pd.read_csv(
|
||||
os.path.join(qc_hist_dir, "historical_data_LB.csv"), index_col=0
|
||||
)
|
||||
qg_hist_stats.columns = [i.replace("|", " / ") for i in qg_hist_stats.columns]
|
||||
qg_hist_stats = qg_hist_stats.drop(["batch"], axis=1)
|
||||
qg_hist_stats["sample_type"] = [
|
||||
"QG-Sera" if "Sera" in i else "QG-Clinical" for i in qg_hist_stats.index
|
||||
]
|
||||
qg_hist_stats["source"] = "QG"
|
||||
qg_hist_stats
|
||||
return (qg_hist_stats,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(pd, qg_hist_stats, stats):
|
||||
# ensure columns match
|
||||
assert set(qg_hist_stats.columns) == set(stats.columns)
|
||||
|
||||
all_stats = pd.concat([stats, qg_hist_stats], axis=0)
|
||||
all_stats.head()
|
||||
return (all_stats,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo):
|
||||
mo.md(r"""## (1) Stats""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(all_stats, pd):
|
||||
stats_melt = pd.melt(
|
||||
all_stats.reset_index(), id_vars=["index", "sample_type", "source"]
|
||||
)
|
||||
stats_melt.head()
|
||||
return (stats_melt,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo, stats):
|
||||
multiselect = mo.ui.multiselect(options=stats.columns)
|
||||
mo.hstack([multiselect])
|
||||
return (multiselect,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(multiselect, px, stats_melt):
|
||||
fig = px.box(
|
||||
stats_melt.loc[stats_melt["variable"].isin(multiselect.value)],
|
||||
x="variable",
|
||||
y="value",
|
||||
color="sample_type",
|
||||
points="all",
|
||||
template="simple_white",
|
||||
labels={"variable": "", "value": "Value from stats file"},
|
||||
hover_data=["index", "source"],
|
||||
)
|
||||
fig.update_yaxes(showgrid=True)
|
||||
fig.show()
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""
|
||||
## (2) .mutations Files
|
||||
|
||||
Analysis ideas💡
|
||||
|
||||
- Always/majority called - potentially suspicious
|
||||
- Never called
|
||||
- High VAF targets
|
||||
- Poorly covered targets
|
||||
- Overly covered targets
|
||||
"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mutations_vaf):
|
||||
((mutations_vaf == 0).sum(axis=1) / len(mutations_vaf.columns)).sort_values(
|
||||
ascending=False
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mutations_vaf):
|
||||
mutations_vaf.shape
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
return
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app.run()
|
||||
Reference in New Issue
Block a user