diff --git a/.gitignore b/.gitignore index cac109b..87c9263 100644 --- a/.gitignore +++ b/.gitignore @@ -188,3 +188,5 @@ ipython_config.py # Remove previous ipynb_checkpoints # git rm -r .ipynb_checkpoints/ +data +__marimo__ \ No newline at end of file diff --git a/cmds.sh b/cmds.sh new file mode 100644 index 0000000..8651200 --- /dev/null +++ b/cmds.sh @@ -0,0 +1,13 @@ +# create the translation table after fixing the issue with the variant allele encoding +python /home/darren/Documents/2_repos/serenomica/pgx-engine/scripts/pharmvar_to_translation_table.py \ + data/NAT2-6.2.15/GRCh38/NAT2.NC_000008.11.haplotypes.tsv \ + "NAT2" \ + "chr8" + +# create the json (needed to change the saving location) +python /home/darren/Documents/2_repos/serenomica/pgx-engine/scripts/translation_to_json.py \ + temp_NAT2_translation_table.tsv \ + "NAT2" \ + "chr8" \ + "GRCh38" \ + "NC_000008.11" diff --git a/notebooks/translation_table_eda.py b/notebooks/translation_table_eda.py new file mode 100644 index 0000000..dc11117 --- /dev/null +++ b/notebooks/translation_table_eda.py @@ -0,0 +1,107 @@ +import marimo + +__generated_with = "0.14.16" +app = marimo.App(width="medium") + + +@app.cell +def _(): + import marimo as mo + import pandas as pd + return mo, pd + + +@app.cell +def _(mo): + mo.md( + r""" + # EDA of the NAT2 Haplotypes table + + Downloaded from [PharmVar](https://www.pharmvar.org/gene/NAT2?ref=blog.clinpgx.org) + + Ajit had scripts to convert from this format into the json used by PharmCAT + + `pgx-engine/scripts/pharmvar_to_translation_table.py` + `pgx-engine/scripts/translation_to_json.py` + + The PharmVar haplotypes table is the input to the first script, and the translation table it produces is fed into the second script to get the json. 
+ + Running these causes an error because the alts element is None + + ```python + Traceback (most recent call last): + File "/home/darren/Documents/2_repos/serenomica/pgx-engine/scripts/translation_to_json.py", line 100, in + main() + File "/home/darren/Documents/2_repos/serenomica/pgx-engine/scripts/translation_to_json.py", line 90, in main + convert( + File "/home/darren/Documents/2_repos/serenomica/pgx-engine/scripts/translation_to_json.py", line 40, in convert + variants = table.apply(make_variant, axis=1).to_list() + File "/home/darren/.venv/py310/lib/python3.10/site-packages/pandas/core/frame.py", line 10381, in apply + return op.apply().__finalize__(self, method="apply") + File "/home/darren/.venv/py310/lib/python3.10/site-packages/pandas/core/apply.py", line 916, in apply + return self.apply_standard() + File "/home/darren/.venv/py310/lib/python3.10/site-packages/pandas/core/apply.py", line 1063, in apply_standard + results, res_index = self.apply_series_generator() + File "/home/darren/.venv/py310/lib/python3.10/site-packages/pandas/core/apply.py", line 1081, in apply_series_generator + results[i] = self.func(v, *self.args, **self.kwargs) + File "/home/darren/Documents/2_repos/serenomica/pgx-engine/scripts/translation_to_json.py", line 20, in make_variant + alts = row["alts"].split(",") + AttributeError: 'NoneType' object has no attribute 'split' + ``` + + """ + ) + return + + +@app.cell +def _(pd): + # load the Pharmvar table + pharmvar = pd.read_csv( + "data/NAT2-6.2.15/GRCh38/NAT2.NC_000008.11.haplotypes.tsv", + sep="\t", + comment="#", + ) + print(pharmvar.shape) + pharmvar.drop_duplicates(inplace=True) + print(pharmvar.shape) + pharmvar.head() + return (pharmvar,) + + +@app.cell +def _(pharmvar): + # code from Ajit that doesn't take the alt alleles, which are then missing in the output - plus dangerously drops dups on position only + # also dangerously compresses over the start position and would lose any variant that has multiple possible alts (there 
are multiple of these) + position_dict = ( + pharmvar[pharmvar.rsID.notna()] + .drop_duplicates(["Variant Start"]) + .set_index("Variant Start")[["rsID", "Reference Allele", "Variant Allele"]] + .to_dict("index") + ) + position_dict + return + + +@app.cell +def _(pharmvar): + # my solution to gather all possible variant alleles under each variant start without de-duplicating + correct_pos_dict = ( + pharmvar[pharmvar.rsID.notna()] + .groupby(["rsID", "Variant Start", "Reference Allele"])["Variant Allele"] + .agg(lambda x: ",".join(x.unique())) + .reset_index() + .set_index("Variant Start")[["rsID", "Reference Allele", "Variant Allele"]] + .to_dict("index") + ) + correct_pos_dict + return + + +@app.cell +def _(): + return + + +if __name__ == "__main__": + app.run()