eda: look into the variant allele issue that caused Ajit's script to break
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -188,3 +188,5 @@ ipython_config.py
|
|||||||
# Remove previous ipynb_checkpoints
|
# Remove previous ipynb_checkpoints
|
||||||
# git rm -r .ipynb_checkpoints/
|
# git rm -r .ipynb_checkpoints/
|
||||||
|
|
||||||
|
data
|
||||||
|
__marimo__
|
||||||
13
cmds.sh
Normal file
13
cmds.sh
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
# create the translation table after fixing the issue with the variant allele encoding
|
||||||
|
python /home/darren/Documents/2_repos/serenomica/pgx-engine/scripts/pharmvar_to_translation_table.py \
|
||||||
|
data/NAT2-6.2.15/GRCh38/NAT2.NC_000008.11.haplotypes.tsv \
|
||||||
|
"NAT2" \
|
||||||
|
"chr8"
|
||||||
|
|
||||||
|
# create the json (needed to change the saving location)
|
||||||
|
python /home/darren/Documents/2_repos/serenomica/pgx-engine/scripts/translation_to_json.py \
|
||||||
|
temp_NAT2_translation_table.tsv \
|
||||||
|
"NAT2" \
|
||||||
|
"chr8" \
|
||||||
|
"GRCh38" \
|
||||||
|
"NC_000008.11"
|
||||||
107
notebooks/translation_table_eda.py
Normal file
107
notebooks/translation_table_eda.py
Normal file
@@ -0,0 +1,107 @@
|
|||||||
|
import marimo
|
||||||
|
|
||||||
|
__generated_with = "0.14.16"
|
||||||
|
app = marimo.App(width="medium")
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
|
||||||
|
def _():
|
||||||
|
import marimo as mo
|
||||||
|
import pandas as pd
|
||||||
|
return mo, pd
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
|
||||||
|
def _(mo):
|
||||||
|
mo.md(
|
||||||
|
r"""
|
||||||
|
# EDA of the NAT2 Haplotypes table
|
||||||
|
|
||||||
|
Downloaded from [PharmVar](https://www.pharmvar.org/gene/NAT2?ref=blog.clinpgx.org)
|
||||||
|
|
||||||
|
Ajit had scripts to convert from this format into the json used by PharmCAT
|
||||||
|
|
||||||
|
`pgx-engine/scripts/pharmvar_to_translation_table.py`
|
||||||
|
`pgx-engine/scripts/translation_to_json.py`
|
||||||
|
|
||||||
|
The PharmVar haplotypes table is the input to the first script; the translation table it produces is then fed into the second script to get the JSON.
|
||||||
|
|
||||||
|
Running these causes an error because the `alts` element is None
|
||||||
|
|
||||||
|
```python
|
||||||
|
Traceback (most recent call last):
|
||||||
|
File "/home/darren/Documents/2_repos/serenomica/pgx-engine/scripts/translation_to_json.py", line 100, in <module>
|
||||||
|
main()
|
||||||
|
File "/home/darren/Documents/2_repos/serenomica/pgx-engine/scripts/translation_to_json.py", line 90, in main
|
||||||
|
convert(
|
||||||
|
File "/home/darren/Documents/2_repos/serenomica/pgx-engine/scripts/translation_to_json.py", line 40, in convert
|
||||||
|
variants = table.apply(make_variant, axis=1).to_list()
|
||||||
|
File "/home/darren/.venv/py310/lib/python3.10/site-packages/pandas/core/frame.py", line 10381, in apply
|
||||||
|
return op.apply().__finalize__(self, method="apply")
|
||||||
|
File "/home/darren/.venv/py310/lib/python3.10/site-packages/pandas/core/apply.py", line 916, in apply
|
||||||
|
return self.apply_standard()
|
||||||
|
File "/home/darren/.venv/py310/lib/python3.10/site-packages/pandas/core/apply.py", line 1063, in apply_standard
|
||||||
|
results, res_index = self.apply_series_generator()
|
||||||
|
File "/home/darren/.venv/py310/lib/python3.10/site-packages/pandas/core/apply.py", line 1081, in apply_series_generator
|
||||||
|
results[i] = self.func(v, *self.args, **self.kwargs)
|
||||||
|
File "/home/darren/Documents/2_repos/serenomica/pgx-engine/scripts/translation_to_json.py", line 20, in make_variant
|
||||||
|
alts = row["alts"].split(",")
|
||||||
|
AttributeError: 'NoneType' object has no attribute 'split'
|
||||||
|
```
|
||||||
|
|
||||||
|
"""
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
|
||||||
|
def _(pd):
|
||||||
|
# load the Pharmvar table
|
||||||
|
pharmvar = pd.read_csv(
|
||||||
|
"data/NAT2-6.2.15/GRCh38/NAT2.NC_000008.11.haplotypes.tsv",
|
||||||
|
sep="\t",
|
||||||
|
comment="#",
|
||||||
|
)
|
||||||
|
print(pharmvar.shape)
|
||||||
|
pharmvar.drop_duplicates(inplace=True)
|
||||||
|
print(pharmvar.shape)
|
||||||
|
pharmvar.head()
|
||||||
|
return (pharmvar,)
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
|
||||||
|
def _(pharmvar):
|
||||||
|
# code from Ajit that doesn't take all the alt alleles, which are then missing in the output - plus it dangerously drops dups on position only
|
||||||
|
# also dangerously compresses over the start position and would lose any variant that has multiple possible alts (there are multiple of these)
|
||||||
|
position_dict = (
|
||||||
|
pharmvar[pharmvar.rsID.notna()]
|
||||||
|
.drop_duplicates(["Variant Start"])
|
||||||
|
.set_index("Variant Start")[["rsID", "Reference Allele", "Variant Allele"]]
|
||||||
|
.to_dict("index")
|
||||||
|
)
|
||||||
|
position_dict
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
|
||||||
|
def _(pharmvar):
|
||||||
|
# my solution: gather all unique variant alleles for each (rsID, variant start, reference allele) group instead of dropping duplicate positions
|
||||||
|
correct_pos_dict = (
|
||||||
|
pharmvar[pharmvar.rsID.notna()]
|
||||||
|
.groupby(["rsID", "Variant Start", "Reference Allele"])["Variant Allele"]
|
||||||
|
.agg(lambda x: ",".join(x.unique()))
|
||||||
|
.reset_index()
|
||||||
|
.set_index("Variant Start")[["rsID", "Reference Allele", "Variant Allele"]]
|
||||||
|
.to_dict("index")
|
||||||
|
)
|
||||||
|
correct_pos_dict
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
|
||||||
|
def _():
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
app.run()
|
||||||
Reference in New Issue
Block a user