import marimo __generated_with = "0.14.16" app = marimo.App(width="medium") @app.cell def _(): import os import itertools import marimo as mo import pandas as pd import plotly.express as px import plotly.graph_objects as go import matplotlib.pyplot as plt import seaborn as sns import yaml from matplotlib_venn import venn2 return itertools, mo, os, pd, plt, px, venn2, yaml @app.cell def _(mo): mo.md( r""" # Initial EDA of the Twist LB data (19-Sep-2025) So far no analysis have been done on the data from the LB assay. This analysis will take the first look. Analysis ideas: - Stats data analysis - box plots and outliers - Mutations - agg stats -> variants called, coverage of called variants in supercol, common called variants, unique variants - vcfs - agg stats -> variants called, coverage of called variants in supercol, common called variants """ ) return @app.cell def _(mo): mo.md(r"""## (1) Preprocess Data""") return @app.cell def _(): se1_dir = "/home/darren/Documents/4_data/3_internal/2_lb/se1-prd-2.1.1/" se2_dir = "/home/darren/Documents/4_data/3_internal/2_lb/se2-lb-1/" qc_hist_dir = ( "/home/darren/Documents/2_repos/serenomica/flowcell_qc/historical_data/" ) qc_hist_mutations_dir = "/home/darren/Documents/4_data/3_internal/2_lb/qg_historical_qc_service/dot_mutations/" sc_path = "/home/darren/Documents/2_repos/serenomica/supercolumn/panel_v2/supercolumn_v2.csv" return qc_hist_dir, qc_hist_mutations_dir, sc_path, se1_dir, se2_dir @app.cell def _(itertools, os): def return_paths(suffix, *dirs): return list( itertools.chain.from_iterable( [ [ os.path.join(d, f) for f in os.listdir(d) if f.endswith(suffix) ] for d in dirs ] ) ) return (return_paths,) @app.cell def _(return_paths, se1_dir, se2_dir): stats_fpaths = return_paths(".stats", se1_dir, se2_dir) print(len(stats_fpaths)) mutations_fpaths = return_paths(".mutations", se1_dir, se2_dir) print(len(mutations_fpaths)) return mutations_fpaths, stats_fpaths @app.cell def _(mo): mo.md(r"""### (1.1) Stats""") return @app.cell def _(pd, stats_fpaths, yaml): stats = pd.DataFrame() for sf in stats_fpaths: with open(sf, "r") as handle: stats = pd.concat( [ stats, pd.json_normalize(yaml.safe_load(handle), sep=" / ").rename( {0: sf.split("/")[-1].strip(".stats")}, axis=0 ), ], axis=0, ) stats["sample_type"] = [ "Serenomica-Sera" if "Sera" in i else "Serenomica-Clinical" for i in stats.index ] stats["source"] = "Serenomica" stats = stats.drop( ["miscellaneous / stats_file_version", "miscellaneous / target_panel_bed"], axis=1, ) return (stats,) @app.cell def _(mo): mo.md(r"""### (1.2) .mutations""") return @app.cell def _(pd): def process_mutations_files(mutation_file_paths): cols = [] vaf, counts, depth = ( pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), ) for mf in mutation_file_paths: tmp = pd.read_csv(mf, sep="\t", header=None) tmp["mutation_id"] = tmp.apply( lambda x: "_".join([str(x[0]), str(x[1]), str(x[2])]), axis=1 ) tmp.set_index("mutation_id", inplace=True) cols.append(mf.split("/")[-1].strip(".mutations")) vaf = pd.concat([vaf, tmp[3]], axis=1) counts = pd.concat([counts, tmp[4]], axis=1) depth = pd.concat([depth, tmp[5]], axis=1) vaf.columns = cols counts.columns = cols depth.columns = cols return vaf, counts, depth return (process_mutations_files,) @app.cell def _(mutations_fpaths, process_mutations_files): mutations_vaf, mutations_counts, mutations_depth = process_mutations_files( mutations_fpaths ) mutations_vaf return mutations_depth, mutations_vaf @app.cell def _(mo): mo.md(r"""### (1.3) Stats QG Historical""") return @app.cell def _(os, pd, qc_hist_dir): # QG historical data qg_hist_stats = pd.read_csv( os.path.join(qc_hist_dir, "historical_data_LB.csv"), index_col=0 ) qg_hist_stats.columns = [i.replace("|", " / ") for i in qg_hist_stats.columns] qg_hist_stats = qg_hist_stats.drop(["batch"], axis=1) qg_hist_stats["sample_type"] = [ "QG-Sera" if "Sera" in i else "QG-Clinical" for i in qg_hist_stats.index ] qg_hist_stats["source"] = "QG" qg_hist_stats return (qg_hist_stats,) @app.cell def _(pd, qg_hist_stats, stats): # ensure columns match assert set(qg_hist_stats.columns) == set(stats.columns) all_stats = pd.concat([stats, qg_hist_stats], axis=0) all_stats.head() return (all_stats,) @app.cell def _(mo): mo.md(r"""### (1.4) .mutations QG Historical""") return @app.cell def _(process_mutations_files, qc_hist_mutations_dir, return_paths): qg_hist_mutations_paths = return_paths(".mutations", qc_hist_mutations_dir) qg_hist_mut_vaf, qg_hist_mut_count, qg_hist_mut_depth = ( process_mutations_files(qg_hist_mutations_paths) ) qg_hist_mut_vaf return qg_hist_mut_depth, qg_hist_mut_vaf @app.cell def _(mutations_depth, mutations_vaf, pd, qg_hist_mut_depth, qg_hist_mut_vaf): # ensure columns match assert set(qg_hist_mut_vaf.index) == set(mutations_vaf.index) all_vaf = pd.concat([mutations_vaf, qg_hist_mut_vaf], axis=1) all_depth = pd.concat([mutations_depth, qg_hist_mut_depth], axis=1) return all_depth, all_vaf @app.cell def _(qg_hist_mut_vaf): qg_samples_dot_mutations = qg_hist_mut_vaf.columns return (qg_samples_dot_mutations,) @app.cell def _(mo): mo.md(r"""## (1) Stats Analysis""") return @app.cell def _(all_stats, pd): stats_melt = pd.melt( all_stats.reset_index(), id_vars=["index", "sample_type", "source"] ) stats_melt.head() return (stats_melt,) @app.cell def _(mo, stats): multiselect = mo.ui.multiselect(options=stats.columns) mo.hstack([multiselect]) return (multiselect,) @app.cell def _(multiselect, px, stats_melt): fig = px.box( stats_melt.loc[stats_melt["variable"].isin(multiselect.value)], x="variable", y="value", color="sample_type", points="all", template="simple_white", labels={"variable": "", "value": "Value from stats file"}, hover_data=["index", "source"], category_orders={ "sample_type": sorted(stats_melt["sample_type"].unique()) }, ) fig.update_yaxes(showgrid=True) fig.show() return @app.cell def _(mo): mo.md( r""" ## (2) .mutations Analysis Analysis ideasπŸ’‘ - Always/majority called - potentially suspicious βœ… - Never called βœ… - High VAF targets βœ… - Poorly covered targets βœ… - Overly covered targets βœ… """ ) return @app.cell def _(pd, sc_path): # supercolumn for analysis sc = pd.read_csv(sc_path, sep=";") sc["mutation_id"] = ( "chr" + sc["chrom"].astype(str) + "_" + sc["position"].astype(str) + "_" + sc["ref"].astype(str) + "/" + sc["alt"].astype(str) ) return (sc,) @app.cell def _(sc): black_mut_ids = sc.loc[sc["blacklisted"] == "Yes"]["mutation_id"].tolist() print(f"Number of blacklisted variants = {len(black_mut_ids)}") return (black_mut_ids,) @app.cell def _(mutations_vaf): seracares = [i for i in mutations_vaf.columns if "Sera" in i] clinicals = [i for i in mutations_vaf.columns if i not in seracares] print(f"Number of seracares = {len(seracares)}") print(f"Number of volunteers = {len(clinicals)}") print("Both numbers include replicates!") return (clinicals,) @app.cell def _(mo): mo.md(r"""### (2.1) Frequently and infrequently called variants""") return @app.cell def _(clinicals, mutations_vaf, pd): # always called and never called called_gt0 = pd.DataFrame( (mutations_vaf.loc[:, clinicals] > 0).sum(axis=1), columns=["samples_called_in"], ) called_gt0["prop_samples_called"] = [ row / len(clinicals) for row in called_gt0["samples_called_in"] ] called_gt0["called_in_ALL"] = [ True if val == len(clinicals) else False for val in called_gt0["samples_called_in"] ] called_gt0["called_in_NONE"] = [ True if val == 0 else False for val in called_gt0["samples_called_in"] ] var_always_called = called_gt0[called_gt0["called_in_ALL"] == True].index var_never_called = called_gt0[called_gt0["called_in_NONE"] == True].index called_gt0[["called_in_ALL", "called_in_NONE"]].sum(axis=0) return (var_always_called,) @app.cell def _(sc, var_always_called): # all 9 variants that occur in all clinical samples are blacklisted variants sc.loc[sc["mutation_id"].isin(var_always_called)][ ["chrom", "position", "ref", "alt", "gene", "blacklisted"] ] return @app.cell def _(mo): mo.md(r"""### (2.2) Variants called above VAF thresholds""") return @app.cell def _(all_vaf, black_mut_ids, pd, qg_samples_dot_mutations): # mutations over thresholds, excluding the blacklisted hits df_mut_per_sample_by_thresh = pd.DataFrame() for t in [0.00125, 0.0025, 0.005, 0.01]: t_data_all = pd.DataFrame( (all_vaf > t).sum(axis=0), columns=[f"variants_gt_{t}_all"], ) t_data = pd.DataFrame( ( all_vaf.loc[ [i for i in all_vaf.index if i not in black_mut_ids], : ] > t ).sum(axis=0), columns=[f"variants_gt_{t}_wo_blacklisted"], ) df_mut_per_sample_by_thresh = pd.concat( [df_mut_per_sample_by_thresh, t_data, t_data_all], axis=1 ) df_mut_per_sample_by_thresh["sample_type"] = [ "Serenomica-Sera" if ("Sera" in i) and (i not in qg_samples_dot_mutations) else "QG-Sera" if (i in qg_samples_dot_mutations) and ("Sera" in i) else "Serenomica-Clinincal" if i not in qg_samples_dot_mutations else "QG-Clinical" for i in df_mut_per_sample_by_thresh.index ] df_mut_per_sample_by_thresh return (df_mut_per_sample_by_thresh,) @app.cell def _(df_mut_per_sample_by_thresh, pd): df_mut_per_sample_by_thresh_melt = pd.melt( df_mut_per_sample_by_thresh.reset_index(), id_vars=["index", "sample_type"] ) return (df_mut_per_sample_by_thresh_melt,) @app.cell def _(df_mut_per_sample_by_thresh, mo): ms_gt_thresh = mo.ui.multiselect(options=df_mut_per_sample_by_thresh.columns) mo.hstack([ms_gt_thresh]) return (ms_gt_thresh,) @app.cell def _(df_mut_per_sample_by_thresh_melt, ms_gt_thresh, px): fig2 = px.box( df_mut_per_sample_by_thresh_melt.loc[ df_mut_per_sample_by_thresh_melt["variable"].isin(ms_gt_thresh.value) ], x="variable", y="value", color="sample_type", points="all", template="simple_white", labels={"variable": "", "value": "Variants per sample"}, hover_data=["index"], category_orders={ "sample_type": sorted( df_mut_per_sample_by_thresh_melt["sample_type"].unique() ) }, ) fig2.update_yaxes(showgrid=True) fig2.show() return @app.cell def _(mo): mo.md( r""" ### (2.3) Mutations with very High VAF (in some samples) Arbritrarily looking at >10% """ ) return @app.cell def _(clinicals, mutations_vaf, pd, sc): # looking just at the serenomica samples high_vaf_serenomica = pd.DataFrame( (mutations_vaf.loc[:, clinicals] > 0.1) .sum(axis=1) .sort_values(ascending=False), columns=["count_vaf_gt0.1"], ) sc.loc[ sc["mutation_id"].isin( high_vaf_serenomica.loc[ high_vaf_serenomica["count_vaf_gt0.1"] > 0 ].index ) ][["mutation_id", "blacklisted", "rsID", "gene", "Consequence"]] return (high_vaf_serenomica,) @app.cell def _(high_vaf_serenomica): high_vaf_serenomica.head(9) return @app.cell def _(mo): mo.md( r""" rs587780751 - low pop AF rs2256740 - high pop AF, especially in EU, Africans and latin americans (13/19 samples) rs34094720 - relatively low pop AF rs2293347 - relatively high pop AF (7/19 samples) rs138327406 - relatively low pop AF rs11214077 - relatively low but > 1% in latin pop rs121913530 - very low pop AF """ ) return @app.cell def _(mo): mo.md(r"""### (2.3) Global Coverage""") return @app.cell def _( all_depth, black_mut_ids, df_mut_per_sample_by_thresh, pd, qg_samples_dot_mutations, ): # coverage over thresholds df_depth_by_thresh = pd.DataFrame() for d in [250, 500, 1000, 2000, 5000, 10000]: d_data_all = pd.DataFrame( (all_depth > d).sum(axis=0) / all_depth.shape[0], columns=[f"coverage_gt_{d}_all"], ) d_data = pd.DataFrame( ( all_depth.loc[ [i for i in all_depth.index if i not in black_mut_ids], : ] > d ).sum(axis=0) / all_depth.shape[0], columns=[f"coverage_gt_{d}_wo_blacklisted"], ) df_depth_by_thresh = pd.concat( [df_depth_by_thresh, d_data, d_data_all], axis=1 ) df_depth_by_thresh["sample_type"] = [ "Serenomica-Sera" if ("Sera" in i) and (i not in qg_samples_dot_mutations) else "QG-Sera" if (i in qg_samples_dot_mutations) and ("Sera" in i) else "Serenomica-Clinincal" if i not in qg_samples_dot_mutations else "QG-Clinical" for i in df_mut_per_sample_by_thresh.index ] df_depth_by_thresh return (df_depth_by_thresh,) @app.cell def _(df_depth_by_thresh, pd): df_depth_by_thresh_melt = pd.melt( df_depth_by_thresh.reset_index(), id_vars=["index", "sample_type"] ) return (df_depth_by_thresh_melt,) @app.cell def _(df_depth_by_thresh, mo): ms_gt_dep_thresh = mo.ui.multiselect(options=df_depth_by_thresh.columns) mo.hstack([ms_gt_dep_thresh]) return (ms_gt_dep_thresh,) @app.cell def _(df_depth_by_thresh_melt, ms_gt_dep_thresh, px): fig3 = px.box( df_depth_by_thresh_melt.loc[ df_depth_by_thresh_melt["variable"].isin(ms_gt_dep_thresh.value) ], x="variable", y="value", color="sample_type", points="all", template="simple_white", labels={"variable": "", "value": "Prop of sc sites > threhold per sample"}, hover_data=["index"], category_orders={ "sample_type": sorted(df_depth_by_thresh_melt["sample_type"].unique()) }, ) fig3.update_yaxes(showgrid=True) fig3.show() return @app.cell def _(mo): mo.md( r""" ### (2.4) Missed Targets Look into variants that only have a depth that could support a VAF of 1% (i.e. 200 reads) """ ) return @app.cell def _(black_mut_ids, mutations_depth, qg_hist_mut_depth): # removal of blacklisted variants for this analysis mutations_depth_wo_black = mutations_depth.loc[ [i for i in mutations_depth.index if i not in black_mut_ids], : ].copy() qg_mutations_depth_wo_black = qg_hist_mut_depth.loc[ [i for i in qg_hist_mut_depth.index if i not in black_mut_ids], : ].copy() return mutations_depth_wo_black, qg_mutations_depth_wo_black @app.cell def _(pd): def get_variant_group_by_depth(df, depth_thresh, sample_prop, mode="lt"): if mode == "lt": variants_group = pd.DataFrame( ((df < depth_thresh).sum(axis=1) / df.shape[1]).sort_values( ascending=False ), columns=[f"prop_cov_{mode}{depth_thresh}"], ) elif mode == "gt": variants_group = pd.DataFrame( ((df > depth_thresh).sum(axis=1) / df.shape[1]).sort_values( ascending=False ), columns=[f"prop_cov_{mode}{depth_thresh}"], ) variant_ids = list( variants_group.loc[ variants_group[f"prop_cov_{mode}{depth_thresh}"] > sample_prop ].index ) return variants_group, variant_ids return (get_variant_group_by_depth,) @app.cell def _(clinicals, get_variant_group_by_depth, mutations_depth_wo_black): variants_lost_global_coverage, variants_lost_cov_ids = ( get_variant_group_by_depth( mutations_depth_wo_black.loc[:, clinicals], 400, 0.5 ) ) variants_lost_cov_ids return (variants_lost_cov_ids,) @app.cell def _(get_variant_group_by_depth, qg_mutations_depth_wo_black): qg_variants_lost_global_coverage, qg_variants_lost_cov_ids = ( get_variant_group_by_depth(qg_mutations_depth_wo_black, 400, 0.5) ) qg_variants_lost_cov_ids return (qg_variants_lost_cov_ids,) @app.cell def _(plt, qg_variants_lost_cov_ids, variants_lost_cov_ids, venn2): venn2( subsets=( len(qg_variants_lost_cov_ids), len( set(qg_variants_lost_cov_ids).intersection( set(variants_lost_cov_ids) ) ), len(set(variants_lost_cov_ids)), ), set_labels=("QG", "Serenomica"), set_colors=("blue", "green"), alpha=0.5, ) plt.title("Overlap of missed supercolumn targets (w/o blacklist targets)") plt.show() return @app.cell def _(mo): mo.md( r""" ### (2.5) Poorly Covered for low VAF detection As we need 2 reads before we report a variant, this analysis looks at variants with lower than 1000 read depth that would enable a lowest VAF of 0.2% to be found. """ ) return @app.cell def _(clinicals, get_variant_group_by_depth, mutations_depth_wo_black): variants_low_global_coverage, variants_low_cov_ids = ( get_variant_group_by_depth( mutations_depth_wo_black.loc[:, clinicals], 1000, 0.75 ) ) variants_low_cov_ids return (variants_low_cov_ids,) @app.cell def _(get_variant_group_by_depth, qg_mutations_depth_wo_black): qg_variants_low_global_coverage, qg_variants_low_cov_ids = ( get_variant_group_by_depth(qg_mutations_depth_wo_black, 1000, 0.75) ) qg_variants_low_cov_ids return (qg_variants_low_cov_ids,) @app.cell def _( plt, qg_variants_lost_cov_ids, qg_variants_low_cov_ids, variants_lost_cov_ids, variants_low_cov_ids, venn2, ): qg_low = set( [q for q in qg_variants_low_cov_ids if q not in qg_variants_lost_cov_ids] ) sere_low = set( [v for v in variants_low_cov_ids if v not in variants_lost_cov_ids] ) venn2( subsets=( len(qg_low), len(qg_low.intersection(set(sere_low))), len(sere_low), ), set_labels=("QG", "Serenomica"), set_colors=("blue", "green"), alpha=0.5, ) plt.title( "Overlap of lowly covered supercolumn targets (w/o blacklist targets)" ) plt.show() return @app.cell def _(mo): mo.md( r""" ### (2.6) Overly covered targets Look to see if any sc targets are overly represented in the reads """ ) return @app.cell def _(clinicals, mutations_depth_wo_black, pd): mean_cov = pd.DataFrame( mutations_depth_wo_black.loc[:, clinicals] .mean(axis=1) .sort_values(ascending=False), columns=["mean_cov"], ) mean_cov return (mean_cov,) @app.cell def _(mean_cov, mo): slider = mo.ui.slider( start=100, stop=len(mean_cov), step=200, label="Number of variants to show", show_value=True, ) mo.hstack([slider]) return (slider,) @app.cell def _(mean_cov, px, slider): fig4 = px.bar(mean_cov.iloc[: slider.value, :], template="simple_white") fig4.show() return @app.cell def _(clinicals, get_variant_group_by_depth, mutations_depth_wo_black): variants_high_global_coverage, variants_high_cov_ids = ( get_variant_group_by_depth( mutations_depth_wo_black.loc[:, clinicals], 10000, 0.75, mode="gt" ) ) variants_high_cov_ids return (variants_high_cov_ids,) @app.cell def _(get_variant_group_by_depth, qg_mutations_depth_wo_black): qg_variants_high_global_coverage, qg_variants_high_cov_ids = ( get_variant_group_by_depth( qg_mutations_depth_wo_black, 10000, 0.75, mode="gt" ) ) qg_variants_high_cov_ids return @app.cell def _(sc, variants_high_cov_ids, variants_lost_cov_ids, variants_low_cov_ids): vars_of_interest = sc.loc[ sc["mutation_id"].isin( variants_lost_cov_ids + variants_low_cov_ids + variants_high_cov_ids ) ][ [ "chrom", "position", "mutation_id", "gene", "HGVS", "Consequence", "region_type", "blacklisted", "cancer_fda", "FDA_drugs", "cancer_clinvar", "cancer_rank_prevalence_combined", ] ].copy() vars_of_interest["coverage_issue"] = [ "lost" if i in variants_lost_cov_ids else "poorly_covered" if i in variants_low_cov_ids else "highly_covered" if i in variants_high_cov_ids else "" for i in vars_of_interest["mutation_id"] ] vars_of_interest.to_csv( "../data/coverage_issue_variants.tsv", sep="\t", index=False ) return @app.cell def _(): return if __name__ == "__main__": app.run()