# pgx_nat2_issue/notebooks/check_nat2_perf.py

import marimo

# Version string recorded in this notebook file (presumably the marimo
# release that generated it — confirm against marimo's file format docs).
__generated_with = "0.14.16"
# Notebook application object; every function decorated with @app.cell
# below is registered on it.
app = marimo.App(width="medium")


@app.cell
def _():
    # Standard-library modules shared with the other cells
    import json
    import os

    # Third-party: notebook UI helpers and tabular data handling
    import marimo as mo
    import pandas as pd

    return json, mo, os, pd


@app.cell
def _(mo):
    # Notebook header: summarises the pgx-engine changes whose effect on the
    # NAT2 calls is inspected in the cells below.
    mo.md(
        r"""
    # Analyse the NAT2 Results post-update of the JSON with the latest star allele mapping.

    ### What changes were made to the pgx-engine
    - Json updated with the latest [NAT2 allele definitions](https://www.pharmvar.org/gene/NAT2?ref=blog.clinpgx.org)
        - Used these scripts [pharmvar_to_translation_table.py](https://git.serenomica.com/serenomica/pgx-engine/src/branch/master/scripts/pharmvar_to_translation_table.py) >> [translation_to_json.py](https://git.serenomica.com/serenomica/pgx-engine/src/branch/master/scripts/translation_to_json.py) >> new json
    - Updated the diplotypes_to_phenotypes.tsv to have the corrected new allele mapping
        - [Mapping source](https://a.storyblok.com/f/70677/x/301f6834b5/nat2_look-up-table-v1-1.xlsx) PharmVar
        - Done using [update_nat2_d2p_tsv.py](https://git.serenomica.com/serenomica/pgx-engine/src/branch/fix-pharmvar-to-json-scripts/scripts/update_nat2_d2p_tsv.py)
    - All validation samples run on server with updated PGx engine

    See [PR#3](https://git.serenomica.com/serenomica/pgx-engine/pulls/3) for more details
    """
    )
    return


@app.cell
def _(pd):
    # NAT2 look-up table spreadsheet; the first row of the sheet is skipped
    nat2_mapping = pd.read_excel(
        "data/nat2_look-up-table-v1-1.xlsx",
        skiprows=1,
    )

    # directories holding the PGx output for the new and the old NAT2 runs
    path_new_nat2 = "data/results_nat2_update_validation/"
    path_old_nat2 = "../reproduce_setup/data/full_validation_data/"

    # sample IDs: one path per line in the list file; keep only the last
    # path component and drop the ".vcf" text from it
    with open("../reproduce_setup/data/se_val_samples.txt", "r") as handle:
        samples = [
            line.rstrip().split("/")[-1].replace(".vcf", "")
            for line in handle
        ]
    print(f"Number of samples: {len(samples)}")
    return path_new_nat2, path_old_nat2, samples


@app.cell
def _(json, os, path_new_nat2, path_old_nat2, samples):
    def parse_result(dir, sample, genes):
        """Return the called genotype of each requested gene for one sample.

        Loads ``<dir>/<sample>_pgx_result/output.json`` and maps every gene
        in ``genes`` to its entry under the ``"called_genotypes"`` key.
        Raises ``KeyError`` if a requested gene is absent from the file.
        """
        result_file = os.path.join(dir, f"{sample}_pgx_result", "output.json")

        with open(result_file, "r") as handle:
            sample_data = json.load(handle)

        genotypes = {}
        for gene in genes:
            genotypes[gene] = sample_data["called_genotypes"][gene]
        return genotypes


    # One line per sample: old NAT2 call, a tab, then the new NAT2 call
    for sample in samples:
        print(
            parse_result(path_old_nat2, sample, ["NAT2"]),
            "\t",
            parse_result(path_new_nat2, sample, ["NAT2"]),
        )
    return


@app.cell
def _():
    # Empty cell: contains no code and defines nothing for other cells.
    return


# Run the notebook app when this file is executed directly as a script.
if __name__ == "__main__":
    app.run()