From d9cbe4f99803b84ad0e5da9fde4f813ae89a00bc Mon Sep 17 00:00:00 2001
From: Darren Wight
Date: Wed, 24 Sep 2025 18:15:18 +0200
Subject: [PATCH] feat: further analysis added

---
 notebooks/initial_eda_lb_20250919.py | 149 +++++++++++++++++++++++++++
 1 file changed, 149 insertions(+)

diff --git a/notebooks/initial_eda_lb_20250919.py b/notebooks/initial_eda_lb_20250919.py
index 04028a1..402907c 100644
--- a/notebooks/initial_eda_lb_20250919.py
+++ b/notebooks/initial_eda_lb_20250919.py
@@ -840,6 +840,155 @@ def _(sc, variants_high_cov_ids, variants_lost_cov_ids, variants_low_cov_ids):
     return
 
 
+@app.cell
+def _(mo):
+    mo.md(r"""# (3) Stats and Metadata""")
+    return
+
+
+@app.cell
+def _(pd):
+    # sample metadata sheet; the "fastq_id" column becomes the index
+    # NOTE(review): absolute, user-specific path - this cell only runs where
+    # the workbook exists at exactly this location
+    meta = pd.read_excel(
+        "/home/darren/Documents/4_data/3_internal/2_lb/Metadata_LB_20250924.xlsx",
+        index_col="fastq_id",
+    )
+    meta
+    return (meta,)
+
+
+@app.cell
+def _(clinicals, meta, pd, stats):
+    # merge stats and metadata and remove seracares
+    stats_meta = pd.concat([stats, meta], axis=1)
+    stats_meta = stats_meta.loc[clinicals, :]
+    stats_meta = stats_meta.reset_index()
+    # ng_group is added here, in the cell that defines stats_meta, because
+    # marimo does not track mutations made from other cells
+    # NOTE: a missing input compares False against 30, so it is labelled
+    # ">= 30ng" as well
+    stats_meta["ng_group"] = [
+        "< 30ng" if i < 30 else ">= 30ng" for i in stats_meta["Input DNA used (ng)"]
+    ]
+    stats_meta.head()
+    return (stats_meta,)
+
+
+@app.cell
+def _(mo, stats_meta):
+    # pick the two columns plotted against each other in the scatter below
+    var1 = mo.ui.dropdown(options=stats_meta.columns, label="Variable 1")
+    var2 = mo.ui.dropdown(options=stats_meta.columns, label="Variable 2")
+
+    mo.vstack([var1, var2])
+    return var1, var2
+
+
+@app.cell
+def _(mo, px, stats_meta, var1, var2):
+    # both dropdowns start without a selection; wait for one instead of
+    # handing x=None / y=None to px.scatter
+    mo.stop(
+        var1.value is None or var2.value is None,
+        mo.md("Select **Variable 1** and **Variable 2** to draw the scatter."),
+    )
+    # note that all SE1 clinical samples are older QG extracted cfDNAs
+    fig5 = px.scatter(
+        data_frame=stats_meta,
+        x=var1.value,
+        y=var2.value,
+        color="Flow cell #",
+        width=700,
+        height=500,
+        template="simple_white",
+        hover_data=["index"],
+        color_discrete_sequence=px.colors.qualitative.Dark2,
+        category_orders={"Flow cell #": ["SE1", "SE2"]},
+        trendline="ols",
+    )
+    fig5.update_traces(
+        marker=dict(size=9, line=dict(width=1, color="DarkSlateGrey")),
+        selector=dict(mode="markers"),
+    )
+    # the 30 ng guide line is only meaningful on the DNA input axis
+    if var1.value == "Input DNA used (ng)":
+        fig5.add_vline(
+            x=30,
+            line_width=3,
+            line_dash="dot",
+            annotation_text="30ng",
+            annotation_position="top left",
+        )
+    fig5.show()
+    return
+
+
+@app.cell
+def _(pd, stats_meta):
+    # long format: one row per (sample, variable) so any column can be boxed
+    stats_meta_melt = pd.melt(
+        stats_meta, id_vars=["index", "sample_type", "ng_group", "Flow cell #"]
+    )
+    stats_meta_melt.head()
+    return (stats_meta_melt,)
+
+
+@app.cell
+def _(mo, stats_meta_melt):
+    # choose which melted variable the box plot below shows
+    box_vars = mo.ui.dropdown(
+        options=stats_meta_melt["variable"].unique(), label="Variables to plot"
+    )
+    mo.hstack([box_vars])
+    return (box_vars,)
+
+
+@app.cell
+def _(box_vars, px, stats_meta_melt):
+    # selected variable for SE2 rows only, coloured by cfDNA input group
+    fig6 = px.box(
+        data_frame=stats_meta_melt.loc[
+            (stats_meta_melt["variable"] == box_vars.value)
+            & (stats_meta_melt["Flow cell #"] == "SE2")
+        ],
+        color="ng_group",
+        y="value",
+        x="variable",
+        template="simple_white",
+        points="all",
+        labels={
+            "ng_group": "cfDNA Input for Library",
+            "value": box_vars.value,
+            "variable": "",
+        },
+        color_discrete_sequence=px.colors.qualitative.Dark2,
+        category_orders={"ng_group": ["< 30ng", ">= 30ng"]},
+        hover_data=["index"],
+        width=800,
+    )
+    # "points" is not a plotly trace type, so the selector has to be "box"
+    # for the marker styling to reach the plotted points
+    fig6.update_traces(
+        marker=dict(size=10, line=dict(width=1, color="DarkSlateGrey")),
+        selector=dict(type="box"),
+    )
+    fig6.update_xaxes(ticks="", showticklabels=False)
+    fig6.update_yaxes(showgrid=True, tickfont_size=14)
+    fig6.update_legends(font_size=14)
+
+    fig6.show()
+    return
+
+
+@app.cell
+def _(stats_meta):
+    # number of samples in each input group
+    stats_meta["ng_group"].value_counts()
+    return
+
+
 @app.cell
 def _():
     return