eda: look into the variant allele issue that caused Ajit's script to break

2025-08-21 17:27:37 +02:00
parent dfe19fa611
commit 68ae6746e0
3 changed files with 122 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -188,3 +188,5 @@ ipython_config.py
 # Remove previous ipynb_checkpoints
 #   git rm -r .ipynb_checkpoints/
 data
 __marimo__
--- a/cmds.sh
+++ b/cmds.sh
@@ -0,0 +1,13 @@
 # Directory holding the pgx-engine helper scripts. Export PGX_ENGINE_SCRIPTS to
 # point at a different checkout; the default is the path these commands were
 # originally run with.
 PGX_ENGINE_SCRIPTS="${PGX_ENGINE_SCRIPTS:-/home/darren/Documents/2_repos/serenomica/pgx-engine/scripts}"

 # create the translation table after fixing the issue with the variant allele encoding
 python "${PGX_ENGINE_SCRIPTS}/pharmvar_to_translation_table.py" \
    data/NAT2-6.2.15/GRCh38/NAT2.NC_000008.11.haplotypes.tsv \
    "NAT2" \
    "chr8"
 # create the json (needed to change the saving location)
 python "${PGX_ENGINE_SCRIPTS}/translation_to_json.py" \
    temp_NAT2_translation_table.tsv \
    "NAT2" \
    "chr8" \
    "GRCh38" \
    "NC_000008.11"
--- a/notebooks/translation_table_eda.py
+++ b/notebooks/translation_table_eda.py
@@ -0,0 +1,107 @@
 import marimo
 # marimo version string stored with the notebook (presumably the version that generated this file).
 __generated_with = "0.14.16"
 # Notebook application object; every cell below registers on it through @app.cell.
 app = marimo.App(width="medium")
@app.cell
def _():
    # Shared imports for the notebook: the returned names are what the other
    # cells receive as their `mo` / `pd` parameters.
    import pandas as pd
    import marimo as mo

    return (mo, pd)
@app.cell
 def _(mo):
    mo.md(
        r"""
    # EDA of the NAT2 Haplotypes table
    Downloaded from [PharmVar](https://www.pharmvar.org/gene/NAT2?ref=blog.clinpgx.org)
    Ajit had scripts to convert from this format into the json used by PharmCAT
    `pgx-engine/scripts/pharmvar_to_translation_table.py`
    `pgx-engine/scripts/translation_to_json.py`
    The PharmVar haplotypes table is the input to the first script, and the translation table it produces is fed into the second one to get the json.
    Running these causes an error because the alts element is None
    ```python
    Traceback (most recent call last):
      File "/home/darren/Documents/2_repos/serenomica/pgx-engine/scripts/translation_to_json.py", line 100, in <module>
        main()
      File "/home/darren/Documents/2_repos/serenomica/pgx-engine/scripts/translation_to_json.py", line 90, in main
        convert(
      File "/home/darren/Documents/2_repos/serenomica/pgx-engine/scripts/translation_to_json.py", line 40, in convert
        variants = table.apply(make_variant, axis=1).to_list()
      File "/home/darren/.venv/py310/lib/python3.10/site-packages/pandas/core/frame.py", line 10381, in apply
        return op.apply().__finalize__(self, method="apply")
      File "/home/darren/.venv/py310/lib/python3.10/site-packages/pandas/core/apply.py", line 916, in apply
        return self.apply_standard()
      File "/home/darren/.venv/py310/lib/python3.10/site-packages/pandas/core/apply.py", line 1063, in apply_standard
        results, res_index = self.apply_series_generator()
      File "/home/darren/.venv/py310/lib/python3.10/site-packages/pandas/core/apply.py", line 1081, in apply_series_generator
        results[i] = self.func(v, *self.args, **self.kwargs)
      File "/home/darren/Documents/2_repos/serenomica/pgx-engine/scripts/translation_to_json.py", line 20, in make_variant
        alts = row["alts"].split(",")
    AttributeError: 'NoneType' object has no attribute 'split'
    ```
    """
    )
    return
@app.cell
def _(pd):
    # Load the PharmVar haplotypes table; lines starting with "#" are skipped
    # by the parser (comment="#").
    _tsv_path = "data/NAT2-6.2.15/GRCh38/NAT2.NC_000008.11.haplotypes.tsv"
    _raw = pd.read_csv(_tsv_path, sep="\t", comment="#")

    # Report the table size before and after removing fully duplicated rows.
    print(_raw.shape)
    pharmvar = _raw.drop_duplicates()
    print(pharmvar.shape)

    # Bare expression: preview of the first rows as the cell's output.
    pharmvar.head()
    return (pharmvar,)
@app.cell
 def _(pharmvar):
    # Position lookup as built by Ajit's code, kept here for comparison with the fix.
    # Author's notes: it does not take the alt alleles, which are then missing in the
    # output, and it dangerously drops duplicates on position only.
    # Because drop_duplicates(["Variant Start"]) keeps a single row per start position,
    # any variant with multiple possible alts (there are several of these) would lose
    # all but one of them.
    position_dict = (
        pharmvar[pharmvar.rsID.notna()]
        .drop_duplicates(["Variant Start"])
        .set_index("Variant Start")[["rsID", "Reference Allele", "Variant Allele"]]
        .to_dict("index")
    )
    position_dict
    return
@app.cell
def _(pharmvar):
    # Fix: instead of de-duplicating on position, collect every distinct variant
    # allele seen for each (rsID, start, reference allele) into one comma-joined
    # string, then key the lookup by start position.
    _with_rsid = pharmvar[pharmvar.rsID.notna()]
    _alts_per_variant = (
        _with_rsid.groupby(["rsID", "Variant Start", "Reference Allele"])["Variant Allele"]
        .agg(lambda alleles: ",".join(alleles.unique()))
        .reset_index()
    )
    _by_start = _alts_per_variant.set_index("Variant Start")
    correct_pos_dict = _by_start[["rsID", "Reference Allele", "Variant Allele"]].to_dict(
        "index"
    )

    # Bare expression: show the resulting mapping as the cell's output.
    correct_pos_dict
    return
@app.cell
 def _():
    # Empty cell: contains no code yet.
    return
 # Run the notebook app when this file is executed directly as a script.
 if __name__ == "__main__":
    app.run()