diff --git a/notebooks/coverage_eda_lb_20251017.py b/notebooks/coverage_eda_lb_20251017.py
new file mode 100644
index 0000000..9f309f2
--- /dev/null
+++ b/notebooks/coverage_eda_lb_20251017.py
@@ -0,0 +1,156 @@
+import marimo
+
+__generated_with = "0.16.4"
+app = marimo.App(width="medium")
+
+
+@app.cell
+def _():
+    import os
+    import json
+    import statistics
+    import marimo as mo
+    import pandas as pd
+    import plotly.express as px
+    return json, mo, os, pd, px, statistics
+
+
+@app.cell
+def _(mo):
+    mo.md(
+        r"""
+    # EDA of LB Assay Coverage
+
+    Data was generated by `get_panel_coverage.py`, which uses the LB panel's covered BED (with flanking sequence added, i.e. the variant-calling BED) together with the deduplicated BAM/CRAM.
+
+    The output JSON from this analysis has the following form for each sample:
+
+    ```json
+    {
+        "chr<#>__": [
+            "depth_pos1",
+            "depth_pos2",
+            "depth_pos3",
+            ...
+            "depth_posN"
+        ]
+    }
+    ```
+
+    Analysis Plan (a rough sketch for these appears in the final cell):
+
+    - Poorly covered regions (i.e. bad probes or areas that are challenging to align)
+    - Overly covered regions (i.e. areas that are potentially challenging for aligners or have low complexity)
+    - Comparison of Serenomica and Quantgene in their overlapping regions
+    """
+    )
+    return
+
+
+@app.cell
+def _():
+    # input directories with one coverage JSON per sample
+    se_path = "../data/coverage_serenomica_QG/se1_se2/"
+    qg_path = "../data/coverage_serenomica_QG/QG_historical/"
+    return qg_path, se_path
+
+
+@app.cell
+def _(json, os):
+    def parse_data(dpath):
+        """Load every per-sample coverage JSON in a directory, keyed by sample name."""
+        files = [f for f in os.listdir(dpath) if f.endswith(".json")]
+        parsed_data = {}
+        for file in files:
+            name = file.split("_cov")[0]
+            with open(os.path.join(dpath, file), "r") as handle:
+                parsed_data[name] = json.load(handle)
+        return parsed_data
+    return (parse_data,)
+
+
+@app.cell
+def _(parse_data, qg_path, se_path):
+    se_data = parse_data(se_path)
+    qg_data = parse_data(qg_path)
+    return qg_data, se_data
+
+
+@app.cell
+def _(statistics):
+    def summarize_list(lst):
+        """Return (mean, median, stdev, min, max, sum, count) for one region's depths."""
+        vals = list(map(int, lst))
+        return (
+            statistics.mean(vals),
+            statistics.median(vals),
+            statistics.stdev(vals),
+            min(vals),
+            max(vals),
+            sum(vals),
+            len(vals),
+        )
+
+
+    def get_summary_data(sample_data):
+        """Summarize each region's depths plus a position-weighted overall mean."""
+        out_data = {}
+        sample_sum, sample_pos_count = 0, 0
+        for region, depths in sample_data.items():
+            mean, median, std, min_val, max_val, sum_vals, count = summarize_list(depths)
+            out_data[region] = {
+                "min": min_val,
+                "max": max_val,
+                "mean": mean,
+                "median": median,
+                "std": std,
+            }
+            sample_sum += sum_vals
+            sample_pos_count += count
+        out_data["overall"] = {"mean": sample_sum / sample_pos_count}
+        return out_data
+    return (get_summary_data,)
+
+
+@app.cell
+def _(get_summary_data, qg_data, se_data):
+    se_summary_data = {}
+    for se_sample, se_sample_data in se_data.items():
+        se_summary_data[se_sample] = get_summary_data(se_sample_data)
+
+    qg_summary_data = {}
+    for qg_sample, qg_sample_data in qg_data.items():
+        qg_summary_data[qg_sample] = get_summary_data(qg_sample_data)
+    return qg_summary_data, se_summary_data
+
+
+@app.cell
+def _():
+    return
+
+
+@app.cell
+def _(pd):
+    def extract_stat_as_df(data_dict, access_key):
+        """Build a samples x regions DataFrame for one statistic (e.g. "mean", "std")."""
+        tmp_dict = {}
+        for name, sample_data in data_dict.items():
+            tmp_dict[name] = {
+                region: data[access_key]
+                for region, data in sample_data.items()
+                if region != "overall"
+            }
+        return pd.DataFrame.from_dict(tmp_dict, orient="index")
+    return (extract_stat_as_df,)
+
+
+@app.cell
+def _(extract_stat_as_df, se_summary_data):
+    extract_stat_as_df(se_summary_data, access_key="std")
+    return
+
+
+@app.cell
+def _():
+    return
+
+
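+@app.cell
+def _(extract_stat_as_df, pd, px, qg_summary_data, se_summary_data):
+    # Minimal sketch for the Analysis Plan bullets above; not part of the
+    # original pipeline. MIN_DEPTH and MAX_DEPTH are hypothetical cutoffs to
+    # tune against the real depth distributions, not values from the source.
+    MIN_DEPTH = 100
+    MAX_DEPTH = 5000
+
+    _se_means = extract_stat_as_df(se_summary_data, access_key="mean")
+    _qg_means = extract_stat_as_df(qg_summary_data, access_key="mean")
+
+    # Regions whose mean depth is out of range in every Serenomica sample:
+    # candidates for bad probes (low) or low-complexity/aligner-trap areas (high).
+    low_regions = _se_means.columns[(_se_means < MIN_DEPTH).all()]
+    high_regions = _se_means.columns[(_se_means > MAX_DEPTH).all()]
+
+    # Serenomica vs Quantgene: mean-of-means per region over the panel overlap.
+    _overlap = _se_means.columns.intersection(_qg_means.columns)
+    _cmp = pd.DataFrame(
+        {
+            "Serenomica": _se_means[_overlap].mean(),
+            "Quantgene": _qg_means[_overlap].mean(),
+        }
+    )
+    px.scatter(_cmp, x="Serenomica", y="Quantgene", hover_name=_cmp.index)
+    return high_regions, low_regions
+
+
+if __name__ == "__main__":
+    app.run()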