# pgx_nat2_issue/notebooks/translation_table_eda.py

import marimo

# Version stamp stored with the notebook (presumably the marimo release that
# generated this file -- confirm against marimo's file-format docs).
__generated_with = "0.14.16"
# Application object; every cell below registers itself on it via @app.cell.
app = marimo.App(width="medium")


@app.cell
def _():
    # Shared third-party imports; returned so that other cells can receive
    # them as arguments (`mo` for markdown, `pd` for loading the table).
    import pandas as pd
    import marimo as mo
    return (mo, pd)


# Introductory markdown cell: states where the NAT2 haplotypes table came from,
# which two conversion scripts are involved, and the traceback they produce.
@app.cell
def _(mo):
    mo.md(
        r"""
    # EDA of the NAT2 Haplotypes table

    Downloaded from [PharmVar](https://www.pharmvar.org/gene/NAT2?ref=blog.clinpgx.org)

    Ajit had scripts to convert from this format into the JSON used by PharmCAT:

    - `pgx-engine/scripts/pharmvar_to_translation_table.py`
    - `pgx-engine/scripts/translation_to_json.py`

    The PharmVar haplotypes table goes into the first script, and the translation table can then be fed into the second one to get the JSON.

    Running these causes an error because the `alts` element is `None`:

    ```python
    Traceback (most recent call last):
      File "/home/darren/Documents/2_repos/serenomica/pgx-engine/scripts/translation_to_json.py", line 100, in <module>
        main()
      File "/home/darren/Documents/2_repos/serenomica/pgx-engine/scripts/translation_to_json.py", line 90, in main
        convert(
      File "/home/darren/Documents/2_repos/serenomica/pgx-engine/scripts/translation_to_json.py", line 40, in convert
        variants = table.apply(make_variant, axis=1).to_list()
      File "/home/darren/.venv/py310/lib/python3.10/site-packages/pandas/core/frame.py", line 10381, in apply
        return op.apply().__finalize__(self, method="apply")
      File "/home/darren/.venv/py310/lib/python3.10/site-packages/pandas/core/apply.py", line 916, in apply
        return self.apply_standard()
      File "/home/darren/.venv/py310/lib/python3.10/site-packages/pandas/core/apply.py", line 1063, in apply_standard
        results, res_index = self.apply_series_generator()
      File "/home/darren/.venv/py310/lib/python3.10/site-packages/pandas/core/apply.py", line 1081, in apply_series_generator
        results[i] = self.func(v, *self.args, **self.kwargs)
      File "/home/darren/Documents/2_repos/serenomica/pgx-engine/scripts/translation_to_json.py", line 20, in make_variant
        alts = row["alts"].split(",")
    AttributeError: 'NoneType' object has no attribute 'split'
    ```

    """
    )
    return


@app.cell
def _(pd):
    # Read the PharmVar haplotypes TSV; lines starting with "#" are treated
    # as comments and skipped by the parser.
    _haplotypes_tsv = "data/NAT2-6.2.15/GRCh38/NAT2.NC_000008.11.haplotypes.tsv"
    _raw = pd.read_csv(_haplotypes_tsv, sep="\t", comment="#")

    # Print the shape before and after removing fully identical rows so the
    # number of dropped duplicates is visible.
    print(_raw.shape)
    pharmvar = _raw.drop_duplicates()
    print(pharmvar.shape)

    # Preview of the first rows of the de-duplicated table.
    pharmvar.head()
    return (pharmvar,)


@app.cell
def _(pharmvar):
    # Reproduction of Ajit's approach, kept to show what it loses.
    # Original author's note: that code does not carry the alt alleles through,
    # so they end up missing in the output.
    # It de-duplicates on "Variant Start" alone, so only the first row per
    # start position survives; any position with several possible variant
    # alleles is collapsed to a single one (the original note says there are
    # multiple such positions in this table).
    _has_rsid = pharmvar.rsID.notna()
    _first_row_per_start = pharmvar[_has_rsid].drop_duplicates(["Variant Start"])
    _allele_columns = ["rsID", "Reference Allele", "Variant Allele"]
    position_dict = _first_row_per_start.set_index("Variant Start")[
        _allele_columns
    ].to_dict("index")
    position_dict
    return


@app.cell
def _(pharmvar):
    # Alternative that keeps every variant allele: rows with an rsID are
    # grouped by (rsID, Variant Start, Reference Allele) and the distinct
    # "Variant Allele" values of each group are joined into one
    # comma-separated string instead of being de-duplicated away.
    def _join_unique(alleles):
        return ",".join(alleles.unique())

    _with_rsid = pharmvar[pharmvar.rsID.notna()]
    _grouped_alts = _with_rsid.groupby(["rsID", "Variant Start", "Reference Allele"])[
        "Variant Allele"
    ]
    _alts_table = _grouped_alts.agg(_join_unique).reset_index()

    # NOTE(review): to_dict("index") needs a unique index, so this presumably
    # relies on each "Variant Start" mapping to one (rsID, Reference Allele)
    # pair in this table -- confirm if the input data changes.
    correct_pos_dict = _alts_table.set_index("Variant Start")[
        ["rsID", "Reference Allele", "Variant Allele"]
    ].to_dict("index")
    correct_pos_dict
    return


@app.cell
def _():
    # Empty placeholder cell: defines nothing and produces no output.
    return


# Call app.run() only when this file is executed directly as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    app.run()