eda: look into the variant allele issue that caused Ajit's script to break
This commit is contained in:
107
notebooks/translation_table_eda.py
Normal file
107
notebooks/translation_table_eda.py
Normal file
@@ -0,0 +1,107 @@
|
||||
import marimo

# Version of marimo that generated this notebook file.
__generated_with = "0.14.16"

# The notebook application object; every cell below registers itself on it
# through the @app.cell decorator.
app = marimo.App(width="medium")
|
||||
|
||||
|
||||
@app.cell
def _():
    # Shared imports: the returned names are what the other cells receive
    # as their `mo` and `pd` parameters.
    import pandas as pd
    import marimo as mo

    return (mo, pd)
|
||||
|
||||
|
||||
@app.cell
def _(mo):
    # Notebook introduction: where the data came from, which scripts consume
    # it, and the traceback that motivated this exploration.
    mo.md(
        r"""
    # EDA of the NAT2 Haplotypes table

    Downloaded from [PharmVar](https://www.pharmvar.org/gene/NAT2?ref=blog.clinpgx.org)

    Ajit had scripts to convert from this format into the json used by PharmCAT

    `pgx-engine/scripts/pharmvar_to_translation_table.py`
    `pgx-engine/scripts/translation_to_json.py`

    Where the pharmvar haplotypes table should go into the first and the translation table can be fed into the second one to get the json.

    Running these causes an error because the alts element is None

    ```python
    Traceback (most recent call last):
    File "/home/darren/Documents/2_repos/serenomica/pgx-engine/scripts/translation_to_json.py", line 100, in <module>
    main()
    File "/home/darren/Documents/2_repos/serenomica/pgx-engine/scripts/translation_to_json.py", line 90, in main
    convert(
    File "/home/darren/Documents/2_repos/serenomica/pgx-engine/scripts/translation_to_json.py", line 40, in convert
    variants = table.apply(make_variant, axis=1).to_list()
    File "/home/darren/.venv/py310/lib/python3.10/site-packages/pandas/core/frame.py", line 10381, in apply
    return op.apply().__finalize__(self, method="apply")
    File "/home/darren/.venv/py310/lib/python3.10/site-packages/pandas/core/apply.py", line 916, in apply
    return self.apply_standard()
    File "/home/darren/.venv/py310/lib/python3.10/site-packages/pandas/core/apply.py", line 1063, in apply_standard
    results, res_index = self.apply_series_generator()
    File "/home/darren/.venv/py310/lib/python3.10/site-packages/pandas/core/apply.py", line 1081, in apply_series_generator
    results[i] = self.func(v, *self.args, **self.kwargs)
    File "/home/darren/Documents/2_repos/serenomica/pgx-engine/scripts/translation_to_json.py", line 20, in make_variant
    alts = row["alts"].split(",")
    AttributeError: 'NoneType' object has no attribute 'split'
    ```

    """
    )
    return
|
||||
|
||||
|
||||
@app.cell
def _(pd):
    # Load the PharmVar haplotypes table (tab-separated; lines starting with
    # "#" are treated as comments and skipped).
    _haplotypes_tsv = "data/NAT2-6.2.15/GRCh38/NAT2.NC_000008.11.haplotypes.tsv"
    pharmvar = pd.read_csv(_haplotypes_tsv, comment="#", sep="\t")

    # Print the shape before and after removing fully duplicated rows so the
    # number of dropped rows is visible in the cell output.
    print(pharmvar.shape)
    pharmvar = pharmvar.drop_duplicates()
    print(pharmvar.shape)

    pharmvar.head()
    return (pharmvar,)
|
||||
|
||||
|
||||
@app.cell
def _(pharmvar):
    # Reference reproduction of Ajit's lookup (per the original note): it
    # does not carry the alt alleles through to the output, and it
    # de-duplicates on the start position only.
    # Keeping a single row per "Variant Start" means any position with
    # several possible variant alleles would lose all but the first one
    # (the original note says there are multiple such positions).
    _has_rsid = pharmvar[pharmvar["rsID"].notna()]
    _first_row_per_start = _has_rsid.drop_duplicates(["Variant Start"])
    _allele_columns = ["rsID", "Reference Allele", "Variant Allele"]
    _indexed_by_start = _first_row_per_start.set_index("Variant Start")
    position_dict = _indexed_by_start[_allele_columns].to_dict("index")

    position_dict
    return
|
||||
|
||||
|
||||
@app.cell
def _(pharmvar):
    # Alternative lookup: instead of de-duplicating on the start position,
    # collect every distinct variant allele seen for each
    # (rsID, start, reference allele) combination into one comma-separated
    # string.
    _has_rsid = pharmvar[pharmvar["rsID"].notna()]
    _group_keys = ["rsID", "Variant Start", "Reference Allele"]
    _alts_per_variant = (
        _has_rsid.groupby(_group_keys)["Variant Allele"]
        .agg(lambda alleles: ",".join(alleles.unique()))
        .reset_index()
    )

    # NOTE(review): to_dict("index") needs a unique index, so this presumably
    # relies on each "Variant Start" mapping to a single rsID/reference
    # allele pair - confirm against the data.
    _allele_columns = ["rsID", "Reference Allele", "Variant Allele"]
    _indexed_by_start = _alts_per_variant.set_index("Variant Start")
    correct_pos_dict = _indexed_by_start[_allele_columns].to_dict("index")

    correct_pos_dict
    return
|
||||
|
||||
|
||||
@app.cell
def _():
    # Empty cell: no code and nothing returned.
    return
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run the notebook app when this file is executed directly as a script.
    app.run()
|
||||
Reference in New Issue
Block a user