108 lines
3.6 KiB
Python
108 lines
3.6 KiB
Python
import marimo
|
|
|
|
# Version string recorded in this notebook file (presumably the marimo
# release that wrote it — confirm against marimo's file-format docs).
__generated_with = "0.14.16"
# Notebook application object; every `@app.cell` function below is attached to it.
app = marimo.App(width="medium")
|
|
|
|
|
|
@app.cell
def _():
    # Import the libraries used by the notebook and return them; the later
    # cells take `mo` and `pd` as parameters.
    import marimo as mo
    import pandas as pd
    return mo, pd
|
|
|
|
|
|
@app.cell
def _(mo):
    # Notebook introduction: where the haplotype table was downloaded from,
    # which pgx-engine scripts consume it, and the traceback that motivated
    # this exploration. Rendered as markdown via `mo.md`.
    mo.md(
        r"""
    # EDA of the NAT2 Haplotypes table

    Downloaded from [PharmVar](https://www.pharmvar.org/gene/NAT2?ref=blog.clinpgx.org)

    Ajit had scripts to convert from this format into the json used by PharmCAT

    `pgx-engine/scripts/pharmvar_to_translation_table.py`
    `pgx-engine/scripts/translation_to_json.py`

    Where the pharmvar haplotypes table should go into the first and the translation table can be fed into the second one to get the json.

    Running these causes an error because the alts element is None

    ```python
    Traceback (most recent call last):
      File "/home/darren/Documents/2_repos/serenomica/pgx-engine/scripts/translation_to_json.py", line 100, in <module>
        main()
      File "/home/darren/Documents/2_repos/serenomica/pgx-engine/scripts/translation_to_json.py", line 90, in main
        convert(
      File "/home/darren/Documents/2_repos/serenomica/pgx-engine/scripts/translation_to_json.py", line 40, in convert
        variants = table.apply(make_variant, axis=1).to_list()
      File "/home/darren/.venv/py310/lib/python3.10/site-packages/pandas/core/frame.py", line 10381, in apply
        return op.apply().__finalize__(self, method="apply")
      File "/home/darren/.venv/py310/lib/python3.10/site-packages/pandas/core/apply.py", line 916, in apply
        return self.apply_standard()
      File "/home/darren/.venv/py310/lib/python3.10/site-packages/pandas/core/apply.py", line 1063, in apply_standard
        results, res_index = self.apply_series_generator()
      File "/home/darren/.venv/py310/lib/python3.10/site-packages/pandas/core/apply.py", line 1081, in apply_series_generator
        results[i] = self.func(v, *self.args, **self.kwargs)
      File "/home/darren/Documents/2_repos/serenomica/pgx-engine/scripts/translation_to_json.py", line 20, in make_variant
        alts = row["alts"].split(",")
    AttributeError: 'NoneType' object has no attribute 'split'
    ```

    """
    )
    return
|
|
|
|
|
|
@app.cell
def _(pd):
    # Load the PharmVar NAT2 haplotype table. The file is tab-separated and
    # `comment="#"` makes the parser ignore everything from a "#" to the end
    # of that line.
    _tsv_path = "data/NAT2-6.2.15/GRCh38/NAT2.NC_000008.11.haplotypes.tsv"
    _raw = pd.read_csv(_tsv_path, sep="\t", comment="#")

    # Report the shape before and after removing fully identical rows.
    print(_raw.shape)
    pharmvar = _raw.drop_duplicates()
    print(pharmvar.shape)

    # Preview of the first rows.
    pharmvar.head()
    return (pharmvar,)
|
|
|
|
|
|
@app.cell
def _(pharmvar):
    # Reproduction of Ajit's position lookup, kept here to show its problems.
    # Author's notes on the original approach:
    #   * the alt alleles are not carried through, so they end up missing
    #     from the output;
    #   * duplicates are dropped on position only, which is dangerous;
    #   * collapsing on the start position would lose any variant that has
    #     multiple possible alts (there are multiple of these).
    _with_rsid = pharmvar.loc[pharmvar["rsID"].notna()]

    # Keeps only the first row seen for each "Variant Start".
    _first_per_start = _with_rsid.drop_duplicates(subset=["Variant Start"])

    _by_start = _first_per_start.set_index("Variant Start")
    position_dict = _by_start[
        ["rsID", "Reference Allele", "Variant Allele"]
    ].to_dict(orient="index")
    position_dict
    return
|
|
|
|
|
|
@app.cell
def _(pharmvar):
    # Alternative lookup: instead of dropping rows by position, collect every
    # distinct variant allele seen for each (rsID, start, reference allele)
    # combination into one comma-separated string.
    def _join_unique(alleles):
        # Distinct alleles, in order of first appearance, joined with commas.
        return ",".join(alleles.unique())

    _group_keys = ["rsID", "Variant Start", "Reference Allele"]
    _with_rsid = pharmvar.loc[pharmvar["rsID"].notna()]
    _alts_per_variant = (
        _with_rsid.groupby(_group_keys)["Variant Allele"]
        .agg(_join_unique)
        .reset_index()
    )

    # Key the result by start position, mirroring the layout of the original
    # position lookup.
    _by_start = _alts_per_variant.set_index("Variant Start")
    correct_pos_dict = _by_start[
        ["rsID", "Reference Allele", "Variant Allele"]
    ].to_dict(orient="index")
    correct_pos_dict
    return
|
|
|
|
|
|
@app.cell
def _():
    # Empty cell: it runs no code and returns nothing.
    return
|
|
|
|
|
|
# Script entry point: run the notebook app when this file is executed directly.
if __name__ == "__main__":
    app.run()
|