feat: EDA of the CNV Coriell samples
This commit is contained in:
85
notebooks/2025Aug11_coriell_cnv_samples_Vega.py
Normal file
85
notebooks/2025Aug11_coriell_cnv_samples_Vega.py
Normal file
@@ -0,0 +1,85 @@
|
|||||||
|
import marimo

# marimo version string recorded in this notebook file.
__generated_with = "0.14.16"
# Notebook application object; the cells below register on it via @app.cell.
app = marimo.App(width="medium")
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _(mo):
    # Notebook title plus a pointer to the publication the sample list was
    # curated from.
    # NOTE(review): the previous link was a signed, time-limited
    # watermark.silverchair.com download URL (file vbaf071.pdf) that stops
    # resolving once its token expires. It is replaced by a DOI link; the DOI
    # is presumed to be 10.1093/bioadv/vbaf071 from the file id -- confirm.
    mo.md(
        r"""
    # Coriell Samples with a CNV
    Samples were curated by [de la Vega *et al.* 2025](https://doi.org/10.1093/bioadv/vbaf071)
    """
    )
    return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _():
    # Shared imports: the names returned here are the ones the other cells
    # receive as parameters.
    from collections import Counter
    import itertools

    import marimo as mo
    import pandas as pd

    return Counter, itertools, mo, pd
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _(pd):
    # Load the curated sample table. The file is parsed as tab-delimited
    # (despite its .csv extension) and its first line is skipped.
    delavega_df = pd.read_csv(
        "coriell_samples_cnv.csv",
        skiprows=1,
        sep="\t",
    )
    # Last expression of the cell: preview of the first rows.
    delavega_df.head()
    return (delavega_df,)
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _(pd):
    # twist panel design, parsed as a tab-delimited table
    twist_design_input = pd.read_csv(
        "hercan_twist_design.csv",
        sep="\t",
    )
    # Report the (rows, columns) shape, then preview the first rows as the
    # cell's last expression.
    print(twist_design_input.shape)
    twist_design_input.head()
    return (twist_design_input,)
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _(Counter, itertools, pd, twist_design_input):
    # Flatten the comma-separated "Annotation*" column into a single list
    # holding every individual annotation of the panel design. A generator is
    # enough here: chain.from_iterable consumes it lazily.
    annotations = list(
        itertools.chain.from_iterable(
            annot.split(",") for annot in twist_design_input["Annotation*"]
        )
    )
    print(f"Total annotations: {len(annotations)}")

    # Keep only the annotations ending in "__cds", reduced to the text before
    # the first "__".
    cds_annotations = [
        annot.split("__")[0] for annot in annotations if annot.endswith("__cds")
    ]
    # Last expression of the cell: occurrences per name, most frequent first.
    # Counter is a dict subclass, so it is passed to from_dict without copying.
    pd.DataFrame.from_dict(
        Counter(cds_annotations), orient="index", columns=["cds_count"]
    ).sort_values(by="cds_count", ascending=False)
    return (cds_annotations,)
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _(cds_annotations, delavega_df):
    # Unique names from the panel's "__cds" annotations, kept as a set so each
    # membership test below is a hash lookup rather than a list scan.
    genes = set(cds_annotations)

    # Flag a sample with "YES" when any "-"-separated part of its
    # "Gene(s) affected" entry is in the panel set; otherwise leave it empty.
    # Non-string entries (e.g. a missing value read as NaN) cannot be split
    # and are left unflagged instead of raising AttributeError.
    # NOTE(review): splitting on "-" also breaks up names that themselves
    # contain a hyphen -- confirm against the input file.
    delavega_df["potentially_in_serenomica_panel"] = [
        "YES"
        if isinstance(affected, str)
        and any(gene in genes for gene in affected.split("-"))
        else ""
        for affected in delavega_df["Gene(s) affected"]
    ]
    # Last expression of the cell: the table including the new column.
    delavega_df
    return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _(delavega_df):
    # Write the sample table to a tab-separated file without the index column.
    # NOTE(review): the "potentially_in_serenomica_panel" column is added to
    # this frame in place by another cell; confirm that cell has run before
    # relying on the exported file.
    delavega_df.to_csv(
        "potential_coriell_samples.tsv",
        index=False,
        sep="\t",
    )
    return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _():
    # Empty cell: it runs no code and defines no names.
    return
|
||||||
|
|
||||||
|
|
||||||
|
# Run the notebook app when this file is executed directly as a script.
if __name__ == "__main__":
    app.run()
|
||||||
Reference in New Issue
Block a user