import marimo

__generated_with = "0.14.16"
app = marimo.App(width="medium")


@app.cell
def _():
    """Import the libraries shared by the other cells."""
    import os
    import json
    import marimo as mo
    import pandas as pd
    return json, mo, os, pd


@app.cell
def _(mo):
    """Render the notebook introduction describing the NAT2 update."""
    mo.md(
        r"""
    # Analyse the NAT2 Results post-update of the JSON with the latest star allele mapping.

    ### What changes were made to the pgx-engine
    - Json updated with the latest [NAT2 allele defintions](https://www.pharmvar.org/gene/NAT2?ref=blog.clinpgx.org)
        - Used these scripts [pharmvar_to_translation_table.py](https://git.serenomica.com/serenomica/pgx-engine/src/branch/master/scripts/pharmvar_to_translation_table.py) >> [translation_to_json.py](https://git.serenomica.com/serenomica/pgx-engine/src/branch/master/scripts/translation_to_json.py) >> new json
    - Updated the diplotypes_to_phenotypes.tsv to have the corrected new allele mapping
        - [Mapping source](https://a.storyblok.com/f/70677/x/301f6834b5/nat2_look-up-table-v1-1.xlsx) PharmVar
        - Done using [update_nat2_d2p_tsv.py](https://git.serenomica.com/serenomica/pgx-engine/src/branch/fix-pharmvar-to-json-scripts/scripts/update_nat2_d2p_tsv.py)
    - All validation samples run on server with updated PGx engine

    See [PR#3](https://git.serenomica.com/serenomica/pgx-engine/pulls/3) for more details
    """
    )
    return


@app.cell
def _(pd):
    """Load the NAT2 look-up table, the result directories and the sample list."""
    # import mapping file (the first spreadsheet row is skipped)
    # NOTE(review): nat2_mapping is not used by any cell visible in this file.
    nat2_mapping = pd.read_excel("data/nat2_look-up-table-v1-1.xlsx", skiprows=1)

    # paths to the PGx output
    path_new_nat2 = "data/results_nat2_update_validation/"
    path_old_nat2 = "../reproduce_setup/data/full_validation_data/"

    # sample list: keep the last path component of each entry and drop ".vcf"
    with open(
        "../reproduce_setup/data/se_val_samples.txt", "r", encoding="utf-8"
    ) as handle:
        samples = [
            s.rstrip().split("/")[-1].replace(".vcf", "")
            for s in handle
            # Skip blank lines so a trailing newline does not produce an
            # empty sample name that later fails as a missing result file.
            if s.strip()
        ]
    print(f"Number of samples: {len(samples)}")
    return path_new_nat2, path_old_nat2, samples


@app.cell
def _(json, os, path_new_nat2, path_old_nat2, samples):
    """Print the old and new NAT2 calls side by side for every sample."""

    def parse_result(result_dir, sample, genes):
        """Return the called genotype of each requested gene for one sample.

        Reads ``<result_dir>/<sample>_pgx_result/output.json`` and looks up
        every gene in its ``"called_genotypes"`` mapping.

        Args:
            result_dir: Directory holding the per-sample result folders.
            sample: Sample name used as the result folder prefix.
            genes: Iterable of gene names to extract.

        Returns:
            A dict mapping each gene name to its ``called_genotypes`` entry.

        Raises:
            FileNotFoundError: If the sample's ``output.json`` is missing.
            KeyError: If ``"called_genotypes"`` or a requested gene is absent.
        """
        sample_path = os.path.join(
            result_dir, f"{sample}_pgx_result", "output.json"
        )
        with open(sample_path, "r", encoding="utf-8") as handle:
            sample_data = json.load(handle)
        return {gene: sample_data["called_genotypes"][gene] for gene in genes}

    # One line per sample: old call, tab, new call.
    for sample in samples:
        print(
            parse_result(path_old_nat2, sample, ["NAT2"]),
            "\t",
            parse_result(path_new_nat2, sample, ["NAT2"]),
        )
    return


@app.cell
def _():
    return


if __name__ == "__main__":
    app.run()