import marimo

__generated_with = "0.14.16"
app = marimo.App(width="medium")


@app.cell
def _(mo):
    # Title cell. The citation uses the article DOI rather than a
    # session-signed download URL, whose token expires.
    mo.md(
        r"""
    # Coriell Samples with a CNV
    Samples were curated by [de la Vega *et al.* 2025](https://doi.org/10.1093/bioadv/vbaf071)
    """
    )
    return


@app.cell
def _():
    import marimo as mo
    import pandas as pd
    import itertools
    from collections import Counter
    return Counter, itertools, mo, pd


@app.cell
def _(pd):
    # Curated sample table. Despite the .csv extension the file is read as
    # tab-separated, and its first line is skipped before the header row.
    # The path is relative to the current working directory.
    delavega_df = pd.read_csv("coriell_samples_cnv.csv", sep="\t", skiprows=1)
    delavega_df.head()
    return (delavega_df,)


@app.cell
def _(pd):
    # twist panel design (tab-separated, path relative to the working dir)
    twist_design_input = pd.read_csv("hercan_twist_design.csv", sep="\t")
    print(twist_design_input.shape)
    twist_design_input.head()
    return (twist_design_input,)


@app.cell
def _(Counter, itertools, pd, twist_design_input):
    # Each "Annotation*" entry is a comma-separated list of labels.
    # Rows without an annotation are skipped: a missing value is a float
    # NaN and has no .split().
    annotations = list(
        itertools.chain.from_iterable(
            [label.strip() for label in entry.split(",")]
            for entry in twist_design_input["Annotation*"].dropna()
        )
    )
    print(f"Total annotations: {len(annotations)}")

    # Labels ending in "__cds" carry the gene name before the first "__".
    cds_annotations = [
        label.split("__")[0] for label in annotations if label.endswith("__cds")
    ]

    # Displayed as the cell output: number of "__cds" labels per gene,
    # most frequent first.
    pd.DataFrame.from_dict(
        dict(Counter(cds_annotations)), orient="index", columns=["cds_count"]
    ).sort_values(by="cds_count", ascending=False)
    return (cds_annotations,)


@app.cell
def _(cds_annotations, delavega_df):
    # A set gives O(1) membership tests for the per-row lookups below.
    _panel_genes = set(cds_annotations)

    def _panel_flag(genes_affected):
        """Return "YES" if any "-"-separated gene is in the panel, else ""."""
        # Missing entries arrive as NaN (a float) and cannot match a gene.
        if not isinstance(genes_affected, str):
            return ""
        # NOTE(review): splitting on "-" also splits gene symbols that
        # themselves contain a hyphen, so such genes are never matched.
        hit = any(
            gene.strip() in _panel_genes for gene in genes_affected.split("-")
        )
        return "YES" if hit else ""

    # Build a new frame instead of mutating `delavega_df` in place: the
    # export cell then depends on `annotated_df` explicitly, so it can only
    # run after the flag column exists.
    annotated_df = delavega_df.assign(
        potentially_in_serenomica_panel=delavega_df["Gene(s) affected"].map(
            _panel_flag
        )
    )
    annotated_df
    return (annotated_df,)


@app.cell
def _(annotated_df):
    # Write the flagged table as TSV next to the working directory.
    annotated_df.to_csv("potential_coriell_samples.tsv", sep="\t", index=False)
    return


@app.cell
def _():
    return


if __name__ == "__main__":
    app.run()