From e03ede9c2887c44d21975bbe369dff5101a6d611 Mon Sep 17 00:00:00 2001
From: Darren Wight
Date: Mon, 22 Sep 2025 17:31:11 +0200
Subject: [PATCH] feat: eda analysis continued and samples from QG search added

---
 notebooks/find_qg_dot_mutations.py   | 116 ++++++
 notebooks/initial_eda_lb_20250919.py | 591 +++++++++++++++++++++++++--
 2 files changed, 677 insertions(+), 30 deletions(-)
 create mode 100644 notebooks/find_qg_dot_mutations.py

diff --git a/notebooks/find_qg_dot_mutations.py b/notebooks/find_qg_dot_mutations.py
new file mode 100644
index 0000000..efd30c7
--- /dev/null
+++ b/notebooks/find_qg_dot_mutations.py
@@ -0,0 +1,116 @@
+import marimo
+
+__generated_with = "0.14.16"
+app = marimo.App(width="medium")
+
+
+@app.cell
+def _():
+    import os
+    import marimo as mo
+    import pandas as pd
+    return mo, os, pd
+
+
+@app.cell
+def _(mo):
+    mo.md(
+        r"""
+    # Search for Samples
+
+    Not all QG samples have `.mutations` files. It would be helpful to have them for the samples that were part of the historical QC service.
+
+    Samples we have: `../data/qg_avail_dot_mutations.txt`
+    """
+    )
+    return
+
+
+@app.cell
+def _():
+    # pipeline runs used in the historical data (from repo)
+    LB2_PIPELINE_RUNS = [
+        "s3://quantgene-portland-lb/20221123-sp-pl34-hs2/lb-pl34-all-dev56/",
+        "s3://quantgene-portland-lb/20221202-s1-pl36-hs2/lb-pl36-all-dev56/",
+        "s3://quantgene-portland-lb/20221207-s2-pl37-hs2/lb-pl37-all-dev56/",
+        "s3://quantgene-portland-lb/20230109-sp-pl41-hs2/lb-pl41-dev56/",
+        "s3://quantgene-portland-lb/20230124-sp-pl42-hs2/lb-pl42-dev56/",
+        # PL43 skipped, mostly Geneva samples with stats all over the place
+        "s3://quantgene-portland-lb/20230210-sp-pl44-hs2/lb-pl44-dev59/",
+        "s3://quantgene-portland-lb/20230224-s1-pl47-hs2/lb-pl47-dev59/",
+        "s3://quantgene-portland-lb/20230316-s1-pl49-hs2/lb-pl49-dev59/",
+        "s3://quantgene-portland-lb/20230330-sp-pl50-hs2/lb-pl50-dev59/",
+        "s3://quantgene-portland-lb/20230417-s1-pl51-hs2/lb-pl51-dev59/",
+        "s3://quantgene-portland-lb/20230512-sp-pl52-hs2/lb-pl52-prd-2-1-0/",
+        "s3://quantgene-portland-lb/20230602-sp-pl53-hs2/lb-pl53-prd-2-1-1/",
+        "s3://quantgene-portland-lb/20230615-sp-pl54-hs2/lb-pl54-prd-2-1-1/",
+        "s3://quantgene-portland-lb/20230630-sp-pl55-hs2/lb-pl55-prd-2-1-1/",
+        "s3://quantgene-portland-lb/20230721-sp-pl56-hs2/lb-pl56-prd212/",
+        "s3://quantgene-portland-lb/20230811-sp-pl57-hs2/lb-pl57-prd212/",
+        "s3://quantgene-portland-lb/20230901-sp-pl58-hs2/lb-pl58-prd212/",
+        "s3://quantgene-portland-lb/20231027-sp-pl59-hs2/lb-pl59-prd212/",
+        # PL60 skipped due to low amount of input reads in all samples
+        "s3://quantgene-portland-lb/20240216-sp-pl61-hs2/lb-pl61-prd212/",
+        # PL62 skipped due to low amount of input reads and insert size issues
+        "s3://quantgene-portland-lb/20240516-s1-pl63-hs2/lb-pl63-prd212/",
+        "s3://quantgene-portland-lb/20240529-sp-pl64-hs2/lb-pl64-prd212/",
+    ]
+    return (LB2_PIPELINE_RUNS,)
+
+
+@app.cell
+def _(pd):
+    # historical QC samples
+    qg_hist_data = pd.read_csv(
+        "/home/darren/Documents/2_repos/serenomica/flowcell_qc/historical_data/historical_data_LB.csv",
+        index_col=0,
+    )
+    qc_samples = list(set(qg_hist_data.index))
+    qc_samples
+    return (qc_samples,)
+
+
+@app.cell
+def _(LB2_PIPELINE_RUNS, os, qc_samples):
+    with open("../data/qg_avail_dot_mutations.txt", "r") as handle:
+        original, avail_samples, needed_samples = [], [], []
+        for line in handle.readlines():
+            fpath, samp = os.path.split(line.rstrip())
+
+            fpath = fpath.replace("-lb//", "-lb/") + "/"
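+            # note (illustrative): str.strip(".mutations") would remove any of the
+            # characters ".mutations" from BOTH ends of the name, e.g.
+            # "sample_no.mutations".strip(".mutations") -> "ple_", silently
+            # corrupting sample IDs; removesuffix() (Python >= 3.9) drops only
+            # the exact trailing substring.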
samp.strip(".mutations") + + original.append(line.rstrip()) + + # skip non-prod versions + if fpath not in LB2_PIPELINE_RUNS: + continue + + avail_samples.append(samp) + if samp in qc_samples: + needed_samples.append(line.rstrip()) + print( + f"Number of samples available from runs = {(n_avail := len(avail_samples))}" + ) + print(f"Number of historical QC samples = {(n_qc := len(qc_samples))}") + print( + f"Number of historical QC samples available = {(n_needed := len(needed_samples))} ({round((n_needed / n_qc) * 100, 2)}%)" + ) + return (needed_samples,) + + +@app.cell +def _(needed_samples): + with open("../data/qg_needed.txt", "w") as whandle: + whandle.writelines([i + "\n" for i in needed_samples]) + return + + +@app.cell +def _(): + return + + +if __name__ == "__main__": + app.run() diff --git a/notebooks/initial_eda_lb_20250919.py b/notebooks/initial_eda_lb_20250919.py index a741d34..e9ab295 100644 --- a/notebooks/initial_eda_lb_20250919.py +++ b/notebooks/initial_eda_lb_20250919.py @@ -36,6 +36,12 @@ def _(mo): return +@app.cell +def _(mo): + mo.md(r"""## (1) Preprocess Data""") + return + + @app.cell def _(): se1_dir = "/home/darren/Documents/4_data/3_internal/2_lb/se1-prd-2.1.1/" @@ -44,7 +50,9 @@ def _(): qc_hist_dir = ( "/home/darren/Documents/2_repos/serenomica/flowcell_qc/historical_data/" ) - return qc_hist_dir, se1_dir, se2_dir + qc_hist_mutations_dir = "/home/darren/Documents/4_data/3_internal/2_lb/qg_historical_qc_service/dot_mutations/" + sc_path = "/home/darren/Documents/2_repos/serenomica/supercolumn/panel_v2/supercolumn_v2.csv" + return qc_hist_dir, qc_hist_mutations_dir, sc_path, se1_dir, se2_dir @app.cell @@ -74,6 +82,12 @@ def _(return_paths, se1_dir, se2_dir): return mutations_fpaths, stats_fpaths +@app.cell +def _(mo): + mo.md(r"""### (1.1) Stats""") + return + + @app.cell def _(pd, stats_fpaths, yaml): stats = pd.DataFrame() @@ -101,31 +115,52 @@ def _(pd, stats_fpaths, yaml): @app.cell -def _(mutations_fpaths, pd): - cols = [] - mutations_vaf, mutations_counts, mutations_depth = ( - pd.DataFrame(), - pd.DataFrame(), - pd.DataFrame(), - ) - for mf in mutations_fpaths: - tmp = pd.read_csv(mf, sep="\t", header=None) - tmp["mutation_id"] = tmp.apply( - lambda x: "_".join([str(x[0]), str(x[1]), str(x[2])]), axis=1 +def _(mo): + mo.md(r"""### (1.2) .mutations""") + return + + +@app.cell +def _(pd): + def process_mutations_files(mutation_file_paths): + cols = [] + vaf, counts, depth = ( + pd.DataFrame(), + pd.DataFrame(), + pd.DataFrame(), ) - tmp.set_index("mutation_id", inplace=True) + for mf in mutation_file_paths: + tmp = pd.read_csv(mf, sep="\t", header=None) + tmp["mutation_id"] = tmp.apply( + lambda x: "_".join([str(x[0]), str(x[1]), str(x[2])]), axis=1 + ) + tmp.set_index("mutation_id", inplace=True) - cols.append(mf.split("/")[-1].strip(".mutations")) - mutations_vaf = pd.concat([mutations_vaf, tmp[3]], axis=1) - mutations_counts = pd.concat([mutations_counts, tmp[4]], axis=1) - mutations_depth = pd.concat([mutations_depth, tmp[5]], axis=1) + cols.append(mf.split("/")[-1].strip(".mutations")) + vaf = pd.concat([vaf, tmp[3]], axis=1) + counts = pd.concat([counts, tmp[4]], axis=1) + depth = pd.concat([depth, tmp[5]], axis=1) - mutations_vaf.columns = cols - mutations_counts.columns = cols - mutations_depth.columns = cols + vaf.columns = cols + counts.columns = cols + depth.columns = cols + return vaf, counts, depth + return (process_mutations_files,) + +@app.cell +def _(mutations_fpaths, process_mutations_files): + mutations_vaf, mutations_counts, 
 
+
+@app.cell
+def _(mutations_fpaths, process_mutations_files):
+    mutations_vaf, mutations_counts, mutations_depth = process_mutations_files(
+        mutations_fpaths
+    )
     mutations_vaf
-    return (mutations_vaf,)
+    return mutations_depth, mutations_vaf
+
+
+@app.cell
+def _(mo):
+    mo.md(r"""### (1.3) Stats QG Historical""")
+    return
 
 
 @app.cell
@@ -156,7 +191,39 @@ def _(pd, qg_hist_stats, stats):
 
 
 @app.cell
 def _(mo):
-    mo.md(r"""## (1) Stats""")
+    mo.md(r"""### (1.4) .mutations QG Historical""")
+    return
+
+
+@app.cell
+def _(process_mutations_files, qc_hist_mutations_dir, return_paths):
+    qg_hist_mutations_paths = return_paths(".mutations", qc_hist_mutations_dir)
+    qg_hist_mut_vaf, qg_hist_mut_count, qg_hist_mut_depth = (
+        process_mutations_files(qg_hist_mutations_paths)
+    )
+    qg_hist_mut_vaf
+    return qg_hist_mut_depth, qg_hist_mut_vaf
+
+
+@app.cell
+def _(mutations_depth, mutations_vaf, pd, qg_hist_mut_depth, qg_hist_mut_vaf):
+    # ensure the variant indices match before concatenating sample columns
+    assert set(qg_hist_mut_vaf.index) == set(mutations_vaf.index)
+
+    all_vaf = pd.concat([mutations_vaf, qg_hist_mut_vaf], axis=1)
+    all_depth = pd.concat([mutations_depth, qg_hist_mut_depth], axis=1)
+    return all_depth, all_vaf
+
+
+@app.cell
+def _(qg_hist_mut_vaf):
+    qg_samples_dot_mutations = qg_hist_mut_vaf.columns
+    return (qg_samples_dot_mutations,)
+
+
+@app.cell
+def _(mo):
+    mo.md(r"""## (2) Stats Analysis""")
     return
 
 
@@ -197,13 +264,13 @@ def _(multiselect, px, stats_melt):
 def _(mo):
     mo.md(
         r"""
-    ## (2) .mutations Files
+    ## (3) .mutations Analysis
 
     Analysis ideas💡
 
-    - Always/majority called - potentially suspicious
-    - Never called
-    - High VAF targets
+    - Always/majority called - potentially suspicious ✅ (see the majority-called sketch in (3.1))
+    - Never called ✅
+    - High VAF targets ✅
     - Poorly covered targets
     - Overly covered targets
     """
     )
     return
 
 
@@ -211,17 +278,481 @@ def _(mo):
     )
     return
 
 
+@app.cell
+def _(pd, sc_path):
+    # supercolumn for analysis
+    sc = pd.read_csv(sc_path, sep=";")
+    sc["mutation_id"] = (
+        "chr"
+        + sc["chrom"].astype(str)
+        + "_"
+        + sc["position"].astype(str)
+        + "_"
+        + sc["ref"].astype(str)
+        + "/"
+        + sc["alt"].astype(str)
+    )
+    return (sc,)
+
+
+@app.cell
+def _(sc):
+    black_mut_ids = sc.loc[sc["blacklisted"] == "Yes"]["mutation_id"].tolist()
+    print(f"Number of blacklisted variants = {len(black_mut_ids)}")
+    return (black_mut_ids,)
+
+
 @app.cell
 def _(mutations_vaf):
-    ((mutations_vaf == 0).sum(axis=1) / len(mutations_vaf.columns)).sort_values(
-        ascending=False
-    )
-    return
+    seracares = [i for i in mutations_vaf.columns if "Sera" in i]
+    clinicals = [i for i in mutations_vaf.columns if i not in seracares]
+    print(f"Number of seracares = {len(seracares)}")
+    print(f"Number of volunteers = {len(clinicals)}")
+    print("Both numbers include replicates!")
+    return (clinicals,)
 
 
 @app.cell
-def _(mutations_vaf):
-    mutations_vaf.shape
+def _(mo):
+    mo.md(r"""### (3.1) Frequently and infrequently called variants""")
+    return
+
+
+@app.cell
+def _(clinicals, mutations_vaf, pd):
+    # always called and never called
+    called_gt0 = pd.DataFrame(
+        (mutations_vaf.loc[:, clinicals] > 0).sum(axis=1),
+        columns=["samples_called_in"],
+    )
+    called_gt0["prop_samples_called"] = called_gt0["samples_called_in"] / len(
+        clinicals
+    )
+    called_gt0["called_in_ALL"] = called_gt0["samples_called_in"] == len(clinicals)
+    called_gt0["called_in_NONE"] = called_gt0["samples_called_in"] == 0
+
+    var_always_called = called_gt0[called_gt0["called_in_ALL"]].index
+    var_never_called = called_gt0[called_gt0["called_in_NONE"]].index
+
+    called_gt0[["called_in_ALL", "called_in_NONE"]].sum(axis=0)
+    return (var_always_called,)
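+
+
+@app.cell
+def _(clinicals, mutations_vaf):
+    # Sketch for the "majority called" idea in the list above; 90% is an
+    # arbitrary, illustrative cutoff, since the ALL/NONE flags only capture
+    # the two extremes.
+    _n_called = (mutations_vaf.loc[:, clinicals] > 0).sum(axis=1)
+    majority_called = _n_called[_n_called >= 0.9 * len(clinicals)].index
+    majority_called
+    return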
+
+
+@app.cell
+def _(sc, var_always_called):
+    # all 9 variants that occur in all clinical samples are blacklisted variants
+    sc.loc[sc["mutation_id"].isin(var_always_called)][
+        ["chrom", "position", "ref", "alt", "gene", "blacklisted"]
+    ]
+    return
+
+
+@app.cell
+def _(mo):
+    mo.md(r"""### (3.2) Variants called above VAF thresholds""")
+    return
+
+
+@app.cell
+def _(all_vaf, black_mut_ids, pd, qg_samples_dot_mutations):
+    # mutations over thresholds, excluding the blacklisted hits
+    df_mut_per_sample_by_thresh = pd.DataFrame()
+    for t in [0.00125, 0.0025, 0.005, 0.01]:
+        t_data_all = pd.DataFrame(
+            (all_vaf > t).sum(axis=0),
+            columns=[f"variants_gt_{t}_all"],
+        )
+        t_data = pd.DataFrame(
+            (
+                all_vaf.loc[
+                    [i for i in all_vaf.index if i not in black_mut_ids], :
+                ]
+                > t
+            ).sum(axis=0),
+            columns=[f"variants_gt_{t}_wo_blacklisted"],
+        )
+        df_mut_per_sample_by_thresh = pd.concat(
+            [df_mut_per_sample_by_thresh, t_data, t_data_all], axis=1
+        )
+    df_mut_per_sample_by_thresh["sample_type"] = [
+        "Serenomica-Sera"
+        if ("Sera" in i) and (i not in qg_samples_dot_mutations)
+        else "QG-Sera"
+        if (i in qg_samples_dot_mutations) and ("Sera" in i)
+        else "Serenomica-Clinical"
+        if i not in qg_samples_dot_mutations
+        else "QG-Clinical"
+        for i in df_mut_per_sample_by_thresh.index
+    ]
+    df_mut_per_sample_by_thresh
+    return (df_mut_per_sample_by_thresh,)
+
+
+@app.cell
+def _(df_mut_per_sample_by_thresh, pd):
+    df_mut_per_sample_by_thresh_melt = pd.melt(
+        df_mut_per_sample_by_thresh.reset_index(), id_vars=["index", "sample_type"]
+    )
+    return (df_mut_per_sample_by_thresh_melt,)
+
+
+@app.cell
+def _(df_mut_per_sample_by_thresh, mo):
+    ms_gt_thresh = mo.ui.multiselect(options=df_mut_per_sample_by_thresh.columns)
+    mo.hstack([ms_gt_thresh])
+    return (ms_gt_thresh,)
+
+
+@app.cell
+def _(df_mut_per_sample_by_thresh_melt, ms_gt_thresh, px):
+    fig2 = px.box(
+        df_mut_per_sample_by_thresh_melt.loc[
+            df_mut_per_sample_by_thresh_melt["variable"].isin(ms_gt_thresh.value)
+        ],
+        x="variable",
+        y="value",
+        color="sample_type",
+        points="all",
+        template="simple_white",
+        labels={"variable": "", "value": "Variants per sample"},
+        hover_data=["index"],
+    )
+    fig2.update_yaxes(showgrid=True)
+    fig2.show()
+    return
+
+
+@app.cell
+def _(mo):
+    mo.md(
+        r"""
+    ### (3.3) Mutations with very high VAF (in some samples)
+
+    Arbitrarily looking at >10%
+    """
+    )
+    return
+
+
+@app.cell
+def _(clinicals, mutations_vaf, pd, sc):
+    # looking just at the serenomica samples
+    high_vaf_serenomica = pd.DataFrame(
+        (mutations_vaf.loc[:, clinicals] > 0.1)
+        .sum(axis=1)
+        .sort_values(ascending=False),
+        columns=["count_vaf_gt0.1"],
+    )
+    sc.loc[
+        sc["mutation_id"].isin(
+            high_vaf_serenomica.loc[
+                high_vaf_serenomica["count_vaf_gt0.1"] > 0
+            ].index
+        )
+    ][["mutation_id", "blacklisted", "rsID", "gene", "Consequence"]]
+    return (high_vaf_serenomica,)
+
+
+@app.cell
+def _(high_vaf_serenomica):
+    high_vaf_serenomica.head(9)
+    return
+
+
+@app.cell
+def _(mo):
+    mo.md(
+        r"""
+    rs587780751 - low population AF
+
+    rs2256740 - high population AF, especially in Europeans, Africans and Latin Americans (13/19 samples)
+
+    rs34094720 - relatively low population AF
+
+    rs2293347 - relatively high population AF (7/19 samples)
+
+    rs138327406 - relatively low population AF
+
+    rs11214077 - relatively low population AF, but >1% in Latin American populations
+
+    rs121913530 - very low population AF
+    """
+    )
+    return
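+
+
+@app.cell
+def _(qg_samples_dot_mutations):
+    # Sketch: (3.2) above and (3.4) below both rebuild the same four labels
+    # with nested conditional comprehensions; a single helper like this would
+    # keep the two analyses in sync.
+    def classify_sample(sample_id):
+        is_qg = sample_id in qg_samples_dot_mutations
+        is_sera = "Sera" in sample_id
+        if is_qg:
+            return "QG-Sera" if is_sera else "QG-Clinical"
+        return "Serenomica-Sera" if is_sera else "Serenomica-Clinical"
+    return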
+
+
+@app.cell
+def _(mo):
+    mo.md(r"""### (3.4) Global Coverage""")
+    return
+
+
+@app.cell
+def _(all_depth, black_mut_ids, pd, qg_samples_dot_mutations):
+    # coverage over thresholds
+    non_black = [i for i in all_depth.index if i not in black_mut_ids]
+    df_depth_by_thresh = pd.DataFrame()
+    for d in [250, 500, 1000, 2000, 5000, 10000]:
+        d_data_all = pd.DataFrame(
+            (all_depth > d).sum(axis=0) / all_depth.shape[0],
+            columns=[f"coverage_gt_{d}_all"],
+        )
+        # proportion over the matching (non-blacklisted) denominator
+        d_data = pd.DataFrame(
+            (all_depth.loc[non_black, :] > d).sum(axis=0) / len(non_black),
+            columns=[f"coverage_gt_{d}_wo_blacklisted"],
+        )
+        df_depth_by_thresh = pd.concat(
+            [df_depth_by_thresh, d_data, d_data_all], axis=1
+        )
+    df_depth_by_thresh["sample_type"] = [
+        "Serenomica-Sera"
+        if ("Sera" in i) and (i not in qg_samples_dot_mutations)
+        else "QG-Sera"
+        if (i in qg_samples_dot_mutations) and ("Sera" in i)
+        else "Serenomica-Clinical"
+        if i not in qg_samples_dot_mutations
+        else "QG-Clinical"
+        for i in df_depth_by_thresh.index
+    ]
+    df_depth_by_thresh
+    return (df_depth_by_thresh,)
+
+
+@app.cell
+def _(df_depth_by_thresh, pd):
+    df_depth_by_thresh_melt = pd.melt(
+        df_depth_by_thresh.reset_index(), id_vars=["index", "sample_type"]
+    )
+    return (df_depth_by_thresh_melt,)
+
+
+@app.cell
+def _(df_depth_by_thresh, mo):
+    ms_gt_dep_thresh = mo.ui.multiselect(options=df_depth_by_thresh.columns)
+    mo.hstack([ms_gt_dep_thresh])
+    return (ms_gt_dep_thresh,)
+
+
+@app.cell
+def _(df_depth_by_thresh_melt, ms_gt_dep_thresh, px):
+    fig3 = px.box(
+        df_depth_by_thresh_melt.loc[
+            df_depth_by_thresh_melt["variable"].isin(ms_gt_dep_thresh.value)
+        ],
+        x="variable",
+        y="value",
+        color="sample_type",
+        points="all",
+        template="simple_white",
+        labels={"variable": "", "value": "Prop of sc sites > threshold per sample"},
+        hover_data=["index"],
+    )
+    fig3.update_yaxes(showgrid=True)
+    fig3.show()
+    return
+
+
+@app.cell
+def _(mo):
+    mo.md(
+        r"""
+    ### (3.5) Missed Targets
+
+    Look into variants whose depth could only support detection of VAFs of 1% or higher (i.e. fewer than 200 reads).
+    """
+    )
+    return
+
+
+@app.cell
+def _(black_mut_ids, mutations_depth):
+    # remove blacklisted variants for this analysis
+    mutations_depth_wo_black = mutations_depth.loc[
+        [i for i in mutations_depth.index if i not in black_mut_ids], :
+    ].copy()
+    return (mutations_depth_wo_black,)
+
+
+@app.cell
+def _(clinicals, mutations_depth_wo_black, pd):
+    # missed non-blacklisted variants where theoretical detection is only >1%
+    # (i.e. 2 supporting reads out of fewer than 200)
+    variants_lost_global_coverage = pd.DataFrame(
+        (
+            (mutations_depth_wo_black.loc[:, clinicals] < 200).sum(axis=1)
+            / len(clinicals)
+        ).sort_values(ascending=False),
+        columns=["prop_cov_lt200"],
+    )
+    variants_lost_global_coverage
+    return (variants_lost_global_coverage,)
+
+
+@app.cell
+def _(variants_lost_global_coverage):
+    variants_lost_cov_ids = list(
+        variants_lost_global_coverage.loc[
+            variants_lost_global_coverage["prop_cov_lt200"] > 0.5
+        ].index
+    )
+    variants_lost_cov_ids
+    return (variants_lost_cov_ids,)
+
+
+@app.cell
+def _(mo):
+    mo.md(
+        r"""
+    ### (3.6) Poorly covered for low-VAF detection
+
+    Since we require at least 2 supporting reads before a variant is reported, this analysis looks at variants with read depth below 1000, the minimum depth at which a VAF as low as 0.2% can be detected.
+    """
+    )
+    return
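+
+
+@app.cell
+def _():
+    # Worked check of the 2-read rule used in (3.5) and (3.6): requiring at
+    # least 2 supporting reads means the lowest reportable VAF at depth d is
+    # 2 / d, so 200 reads -> 1% and 1000 reads -> 0.2%.
+    for _d in (200, 1000, 10000):
+        print(f"depth {_d:>6}: minimum detectable VAF = {2 / _d:.2%}")
+    return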
+
+
+@app.cell
+def _(clinicals, mutations_depth_wo_black, pd):
+    # variants that, in >75% of samples, lack the coverage for a theoretical
+    # report at >= 0.2% VAF (i.e. 2 supporting reads out of at least 1000)
+    variants_low_global_coverage = pd.DataFrame(
+        (
+            (mutations_depth_wo_black.loc[:, clinicals] < 1000).sum(axis=1)
+            / len(clinicals)
+        ).sort_values(ascending=False),
+        columns=["prop_cov_lt1000"],
+    )
+
+    variants_low_cov_ids = list(
+        variants_low_global_coverage.loc[
+            variants_low_global_coverage["prop_cov_lt1000"] > 0.75
+        ].index
+    )
+    return (variants_low_cov_ids,)
+
+
+@app.cell
+def _():
+    return
+
+
+@app.cell
+def _(mo):
+    mo.md(
+        r"""
+    ### (3.7) Overly covered targets
+
+    Check whether any supercolumn targets are over-represented in the reads.
+    """
+    )
+    return
+
+
+@app.cell
+def _(clinicals, mutations_depth_wo_black, pd):
+    mean_cov = pd.DataFrame(
+        mutations_depth_wo_black.loc[:, clinicals]
+        .mean(axis=1)
+        .sort_values(ascending=False),
+        columns=["mean_cov"],
+    )
+    mean_cov
+    return (mean_cov,)
+
+
+@app.cell
+def _(mean_cov, mo):
+    slider = mo.ui.slider(
+        start=100,
+        stop=len(mean_cov),
+        step=200,
+        label="Number of variants to show",
+        show_value=True,
+    )
+    mo.hstack([slider])
+    return (slider,)
+
+
+@app.cell
+def _(mean_cov, px, slider):
+    fig4 = px.bar(mean_cov.iloc[: slider.value, :], template="simple_white")
+    fig4.show()
+    return
+
+
+@app.cell
+def _(clinicals, mutations_depth_wo_black, pd):
+    variants_high_global_coverage = pd.DataFrame(
+        (
+            (mutations_depth_wo_black.loc[:, clinicals] > 10000).sum(axis=1)
+            / len(clinicals)
+        ).sort_values(ascending=False),
+        columns=["prop_cov_gt10000"],
+    )
+
+    variants_high_cov_ids = list(
+        variants_high_global_coverage.loc[
+            variants_high_global_coverage["prop_cov_gt10000"] > 0.75
+        ].index
+    )
+    return (variants_high_cov_ids,)
+
+
+@app.cell
+def _(sc, variants_high_cov_ids, variants_lost_cov_ids, variants_low_cov_ids):
+    vars_of_interest = sc.loc[
+        sc["mutation_id"].isin(
+            variants_lost_cov_ids + variants_low_cov_ids + variants_high_cov_ids
+        )
+    ][
+        [
+            "chrom",
+            "position",
+            "mutation_id",
+            "gene",
+            "HGVS",
+            "Consequence",
+            "region_type",
+            "blacklisted",
+            "cancer_fda",
+            "FDA_drugs",
+            "cancer_clinvar",
+            "cancer_rank_prevalence_combined",
+        ]
+    ].copy()
+    vars_of_interest["coverage_issue"] = [
+        "lost"
+        if i in variants_lost_cov_ids
+        else "poorly_covered"
+        if i in variants_low_cov_ids
+        else "highly_covered"
+        if i in variants_high_cov_ids
+        else ""
+        for i in vars_of_interest["mutation_id"]
+    ]
+    vars_of_interest.to_csv(
+        "../data/coverage_issue_variants.tsv", sep="\t", index=False
+    )
+    return