feat: eda of the CNV coriell samples
This commit is contained in:
85
notebooks/2025Aug11_coriell_cnv_samples_Vega.py
Normal file
85
notebooks/2025Aug11_coriell_cnv_samples_Vega.py
Normal file
@@ -0,0 +1,85 @@
|
||||
import marimo
|
||||
|
||||
# Presumably the marimo release that wrote this file -- confirm before editing.
__generated_with = "0.14.16"

# Notebook application object; each cell below registers on it via @app.cell.
app = marimo.App(width="medium")
|
||||
|
||||
|
||||
@app.cell
def _(mo):
    # Notebook title and provenance of the sample list.
    # The previous link was a token-signed, time-limited Silverchair download
    # URL, which stops working once the token expires; a DOI is stable.
    # NOTE(review): the DOI is derived from the article id "vbaf071" in the
    # original PDF file name -- confirm it resolves to the intended paper.
    mo.md(
        r"""
    # Coriell Samples with a CNV
    Samples were curated by [de la Vega *et al.* 2025](https://doi.org/10.1093/bioadv/vbaf071)
    """
    )
    return
|
||||
|
||||
|
||||
@app.cell
def _():
    # Imports used by the notebook. The returned names are the ones the
    # other cells take as parameters.
    # Standard library first, then third-party packages.
    import itertools
    from collections import Counter

    import marimo as mo
    import pandas as pd

    return Counter, itertools, mo, pd
|
||||
|
||||
|
||||
@app.cell
def _(pd):
    # Load the sample table: tab-separated despite the .csv extension, with
    # the first line of the file skipped.
    delavega_df = pd.read_csv(
        "coriell_samples_cnv.csv",
        sep="\t",
        skiprows=1,
    )
    # Preview of the first rows.
    delavega_df.head()
    return (delavega_df,)
|
||||
|
||||
|
||||
@app.cell
def _(pd):
    # Twist panel design: tab-separated despite the .csv extension.
    twist_design_input = pd.read_csv(
        "hercan_twist_design.csv",
        sep="\t",
    )
    # Report (rows, columns), then preview the first rows.
    print(twist_design_input.shape)
    twist_design_input.head()
    return (twist_design_input,)
|
||||
|
||||
|
||||
@app.cell
def _(Counter, itertools, pd, twist_design_input):
    # Flatten the comma-separated "Annotation*" column into one list of
    # annotation labels. Missing cells are dropped first: pandas reads them
    # as float NaN, which has no .split() and would raise AttributeError.
    annotations = list(
        itertools.chain.from_iterable(
            annot.split(",")
            for annot in twist_design_input["Annotation*"].dropna()
        )
    )
    print(f"Total annotations: {len(annotations)}")

    # For every label ending in "__cds", keep the text before the first "__".
    # presumably that prefix is the gene symbol -- verify against the design file
    cds_annotations = [
        label.split("__")[0]
        for label in annotations
        if label.endswith("__cds")
    ]

    # Table of how many "__cds" labels each prefix has, largest count first.
    pd.DataFrame.from_dict(
        dict(Counter(cds_annotations)), orient="index", columns=["cds_count"]
    ).sort_values(by="cds_count", ascending=False)
    return (cds_annotations,)
|
||||
|
||||
|
||||
@app.cell
def _(cds_annotations, delavega_df):
    # A set gives constant-time membership tests; order is irrelevant here.
    genes = set(cds_annotations)

    # Flag every row whose "Gene(s) affected" entry, split on "-", contains at
    # least one name from `genes`. Non-string entries (e.g. NaN for an empty
    # cell) are left unflagged instead of raising on .split().
    # NOTE(review): splitting on "-" also breaks up hyphenated gene symbols --
    # confirm the column only uses "-" as a separator between genes.
    #
    # The result is a NEW frame rather than a column added to `delavega_df`
    # in place: marimo does not track mutations, so the export cell below must
    # receive the annotated frame as an explicit input to stay in sync.
    annotated_df = delavega_df.assign(
        potentially_in_serenomica_panel=[
            "YES"
            if isinstance(entry, str)
            and any(gene in genes for gene in entry.split("-"))
            else ""
            for entry in delavega_df["Gene(s) affected"]
        ]
    )
    annotated_df
    return (annotated_df,)


@app.cell
def _(annotated_df):
    # Write the annotated sample table as a tab-separated file, without the
    # index column.
    annotated_df.to_csv("potential_coriell_samples.tsv", sep="\t", index=False)
    return
|
||||
|
||||
|
||||
@app.cell
def _():
    # Empty cell: defines nothing and produces no output.
    return
|
||||
|
||||
|
||||
# Run the notebook app when this file is executed directly as a script.
if __name__ == "__main__":
    app.run()
|
||||
Reference in New Issue
Block a user