eda: look into the variant allele issue that caused Ajit's script to break

This commit is contained in:
2025-08-21 17:27:37 +02:00
parent dfe19fa611
commit 68ae6746e0
3 changed files with 122 additions and 0 deletions

2
.gitignore vendored
View File

@@ -188,3 +188,5 @@ ipython_config.py
# Remove previous ipynb_checkpoints
# git rm -r .ipynb_checkpoints/
data
__marimo__

13
cmds.sh Normal file
View File

@@ -0,0 +1,13 @@
# Step 1: create the translation table after fixing the issue with the variant allele encoding.
# Positional arguments: the PharmVar haplotypes TSV, then (presumably) the gene symbol and the chromosome name.
python /home/darren/Documents/2_repos/serenomica/pgx-engine/scripts/pharmvar_to_translation_table.py \
data/NAT2-6.2.15/GRCh38/NAT2.NC_000008.11.haplotypes.tsv \
"NAT2" \
"chr8"
# Step 2: create the json (the saving location had to be changed for this to work).
# Positional arguments: the translation table TSV, then (presumably) gene symbol, chromosome,
# genome build and reference sequence accession.
# NOTE(review): temp_NAT2_translation_table.tsv is presumably the file written by step 1 - confirm,
# and make sure step 1 succeeded before running this step.
python /home/darren/Documents/2_repos/serenomica/pgx-engine/scripts/translation_to_json.py \
temp_NAT2_translation_table.tsv \
"NAT2" \
"chr8" \
"GRCh38" \
"NC_000008.11"

View File

@@ -0,0 +1,107 @@
import marimo
# Version string stored in the notebook file (presumably the marimo release that generated it).
__generated_with = "0.14.16"
# Notebook application object; every cell below registers itself on it through @app.cell.
app = marimo.App(width="medium")
@app.cell
def _():
    # Shared imports: returned so that later cells can receive `mo` and `pd` as parameters.
    import marimo as mo
    import pandas as pd
    return mo, pd
@app.cell
def _(mo):
    # Introductory notes for the notebook: where the table came from, which
    # scripts consume it, and the traceback that motivated this investigation.
    # The markdown is the cell's last expression, so it is what the cell displays.
    mo.md(
        r"""
    # EDA of the NAT2 Haplotypes table

    Downloaded from [PharmVar](https://www.pharmvar.org/gene/NAT2?ref=blog.clinpgx.org)

    Ajit had scripts to convert from this format into the json used by PharmCAT

    - `pgx-engine/scripts/pharmvar_to_translation_table.py`
    - `pgx-engine/scripts/translation_to_json.py`

    The PharmVar haplotypes table goes into the first script, and the translation table it produces can be fed into the second one to get the json.

    Running these causes an error because the alts element is None

    ```python
    Traceback (most recent call last):
      File "/home/darren/Documents/2_repos/serenomica/pgx-engine/scripts/translation_to_json.py", line 100, in <module>
        main()
      File "/home/darren/Documents/2_repos/serenomica/pgx-engine/scripts/translation_to_json.py", line 90, in main
        convert(
      File "/home/darren/Documents/2_repos/serenomica/pgx-engine/scripts/translation_to_json.py", line 40, in convert
        variants = table.apply(make_variant, axis=1).to_list()
      File "/home/darren/.venv/py310/lib/python3.10/site-packages/pandas/core/frame.py", line 10381, in apply
        return op.apply().__finalize__(self, method="apply")
      File "/home/darren/.venv/py310/lib/python3.10/site-packages/pandas/core/apply.py", line 916, in apply
        return self.apply_standard()
      File "/home/darren/.venv/py310/lib/python3.10/site-packages/pandas/core/apply.py", line 1063, in apply_standard
        results, res_index = self.apply_series_generator()
      File "/home/darren/.venv/py310/lib/python3.10/site-packages/pandas/core/apply.py", line 1081, in apply_series_generator
        results[i] = self.func(v, *self.args, **self.kwargs)
      File "/home/darren/Documents/2_repos/serenomica/pgx-engine/scripts/translation_to_json.py", line 20, in make_variant
        alts = row["alts"].split(",")
    AttributeError: 'NoneType' object has no attribute 'split'
    ```
    """
    )
    return
@app.cell
def _(pd):
    # Read the PharmVar haplotypes table: tab-separated, with "#" marking comment text.
    _raw = pd.read_csv(
        "data/NAT2-6.2.15/GRCh38/NAT2.NC_000008.11.haplotypes.tsv",
        comment="#",
        sep="\t",
    )
    print(_raw.shape)  # table size before de-duplication
    # Drop rows that are exact copies of an earlier row (all columns compared).
    pharmvar = _raw.drop_duplicates()
    print(pharmvar.shape)  # table size after de-duplication
    # Last expression of the cell: a preview of the first rows.
    pharmvar.head()
    return (pharmvar,)
@app.cell
def _(pharmvar):
    # Ajit's approach, reproduced here for comparison. Author's notes on it:
    # it doesn't take the alt alleles, which are then missing in the output.
    # It also de-duplicates on "Variant Start" alone, so a single row survives
    # per start position and any variant with multiple possible alts loses the
    # others (the author notes there are multiple of these in this table).
    _with_rsid = pharmvar[pharmvar["rsID"].notna()]
    _one_row_per_start = _with_rsid.drop_duplicates(subset=["Variant Start"])
    _wanted = ["rsID", "Reference Allele", "Variant Allele"]
    position_dict = _one_row_per_start.set_index("Variant Start")[_wanted].to_dict(
        orient="index"
    )
    # Last expression of the cell: show the resulting lookup.
    position_dict
    return
@app.cell
def _(pharmvar):
    # Author's fix: instead of de-duplicating, collect every distinct variant
    # allele seen for each (rsID, start, reference allele) combination and store
    # them as one comma-separated string.
    _with_rsid = pharmvar[pharmvar["rsID"].notna()]
    _alts_per_variant = (
        _with_rsid.groupby(["rsID", "Variant Start", "Reference Allele"])["Variant Allele"]
        .agg(lambda alleles: ",".join(alleles.unique()))
        .reset_index()
    )
    _wanted = ["rsID", "Reference Allele", "Variant Allele"]
    # NOTE(review): the dict is keyed by "Variant Start" only, so this relies on
    # each start position ending up in exactly one group - confirm before reusing
    # this on other genes.
    correct_pos_dict = _alts_per_variant.set_index("Variant Start")[_wanted].to_dict(
        orient="index"
    )
    # Last expression of the cell: show the resulting lookup.
    correct_pos_dict
    return
@app.cell
def _():
    # Empty placeholder cell: nothing is defined or displayed here.
    return
# Run the app when this file is executed directly rather than imported.
if __name__ == "__main__":
    app.run()