import marimo __generated_with = "0.16.4" app = marimo.App(width="medium") @app.cell def _(): import os import json import statistics import marimo as mo import pandas as pd import plotly.express as px return json, mo, os, pd, statistics @app.cell def _(mo): mo.md( r""" # EDA of LB Assay Coverage Data was generated by `get_panel_coverage.py` that uses the LB panels LB covered bed (with flankning sequence added, i.e. the variant calling bed) with the deduplicated bam/cram. The output json from this analysis for each sample has the following form🥇 ```json { "chr<#>__": [ "depth_pos1", "depth_pos2", "depth_pos3", ... "depth_posN" ] } ``` Analysis Plan: - Poorly covered regions (i.e. bad probes/challenging areas to align) - Overly covered regions (i.e. potential areas that are challenging for aligners or have low complexity) - Comparison of Serenomica and Quantgene in overlap areas """ ) return @app.cell def _(): # data se_path = "../data/coverage_serenomica_QG/se1_se2/" qg_path = "../data/coverage_serenomica_QG/QG_historical/" return qg_path, se_path @app.cell def _(json, os): def parse_data(dpath): files = [f for f in os.listdir(dpath) if f.endswith(".json")] parsed_data, mean_data = {} for file in files: name = file.split("_cov")[0] with open(os.path.join(dpath, file), "r") as handle: parsed_data[name] = json.load(handle) return parsed_data return (parse_data,) @app.cell def _(parse_data, qg_path, se_path): se_data = parse_data(se_path) qg_data = parse_data(qg_path) return qg_data, se_data @app.cell def _(statistics): def summerize_list(lst): vals = list(map(int, lst)) return ( statistics.mean(vals), statistics.median(vals), statistics.stdev(vals), min(vals), max(vals), sum(vals), len(vals), ) def get_summary_data(sample_data): out_data = {} sample_sum, sample_pos_count = 0, 0 for region, depths in sample_data.items(): (mean, median, std, min_val, max_val, sum_vals, count) = ( summerize_list(depths) ) out_data[region] = { "min": min_val, "max": max_val, "mean": mean, "median": median, "std": std, } sample_sum += sum_vals sample_pos_count += count out_data["overall"] = {"mean": sample_sum / sample_pos_count} return out_data return (get_summary_data,) @app.cell def _(get_summary_data, qg_data, se_data): se_summary_data = {} for se_sample, se_sample_data in se_data.items(): se_summary_data[se_sample] = get_summary_data(se_sample_data) qg_summary_data = {} for qg_sample, qg_sample_data in qg_data.items(): qg_summary_data[qg_sample] = get_summary_data(qg_sample_data) return (se_summary_data,) @app.cell def _(): return @app.cell def _(pd): def extract_stat_as_df(data_dict, access_key): tmp_dict = {} for name, sample_data in data_dict.items(): tmp_dict[name] = { region: data[access_key] for region, data in sample_data.items() if region != "overall" } return pd.DataFrame.from_dict(tmp_dict, orient="index") return (extract_stat_as_df,) @app.cell def _(extract_stat_as_df, se_summary_data): extract_stat_as_df(se_summary_data, access_key="std") return @app.cell def _(): return if __name__ == "__main__": app.run()