From 4fe9a42a56bbcc3159bc01885963f13c0d03ec06 Mon Sep 17 00:00:00 2001
From: Darren Wight
Date: Tue, 26 Aug 2025 12:41:50 +0200
Subject: [PATCH] feat: code to validate the NAT2 json made by scripts in the pgx-engine added

---
 notebooks/validate_json.py | 153 +++++++++++++++++++++++++++++++++++++
 1 file changed, 153 insertions(+)
 create mode 100644 notebooks/validate_json.py

diff --git a/notebooks/validate_json.py b/notebooks/validate_json.py
new file mode 100644
index 0000000..7a1bef1
--- /dev/null
+++ b/notebooks/validate_json.py
@@ -0,0 +1,153 @@
+import marimo
+
+__generated_with = "0.14.16"
+app = marimo.App(width="medium")
+
+
+@app.cell
+def _():
+    import json
+    import marimo as mo
+    import pandas as pd
+    import numpy as np
+    return json, mo, np, pd
+
+
+@app.cell
+def _(mo):
+    mo.md(
+        r"""
+    # Validate the Json created by the pgx-pipeline
+
+    This notebook is to validate the json created by the scripts in the pgx-pipeline repo with the data from [pharmvar](https://www.pharmvar.org/).
+    """
+    )
+    return
+
+
+@app.cell
+def _(pd):
+    # original file downloaded from pharmvar for NAT2; lines starting with
+    # "#" are treated as comments and skipped
+    pharmvar = pd.read_csv(
+        "../data/NAT2-6.2.15/GRCh38/NAT2.NC_000008.11.haplotypes.tsv",
+        sep="\t",
+        comment="#",
+    )
+    pharmvar.head(10)
+    return (pharmvar,)
+
+
+@app.cell
+def _(json):
+    # json created with the scripts in the PGx-engine using above file;
+    # the encoding is explicit so the read does not depend on the locale
+    with open("../data/NAT2_translation.json", "r", encoding="utf-8") as handle:
+        json_data = json.load(handle)
+    print(json_data.keys())
+
+    # parse vars into [position, rsid, ref, alts] lists to use later for
+    # mapping (position is stringified)
+    j_var_list = [
+        [str(j["position"]), j["rsid"], j["ref"], j["alts"]]
+        for j in json_data["variants"]
+    ]
+    print(j_var_list[0])
+    print(len(j_var_list))
+    return j_var_list, json_data
+
+
+@app.cell
+def _(j_var_list, json_data, np):
+    # extract the allele info for each allele and map with the variant details
+    allele_dict = {}
+    for allele in json_data["namedAlleles"]:
+        allele_id = allele["id"]
+
+        # if allele has no variants (i.e. wild type) give it a fixed entry;
+        # otherwise keep the variant at every index where the allele has a
+        # non-null call (assumes allele["alleles"] is index-aligned with
+        # json_data["variants"])
+        allele_dict[allele_id] = (
+            [
+                j_var_list[idx]
+                for idx, var in enumerate(allele["alleles"])
+                if var is not None
+            ]
+            if allele_id not in ("NAT2*1", "NAT2*1.001")
+            else [[".", np.nan, np.nan, [np.nan]]]
+        )
+    # look at one allele as a spot check
+    allele_dict["NAT2*4.002"]
+    return (allele_dict,)
+
+
+@app.cell
+def _(allele_dict, pharmvar):
+    # check allele names match between json and the original file
+    pharmvar_hap_names = pharmvar["Haplotype Name"].unique().tolist()
+    json_hap_names = list(allele_dict.keys())
+
+    # check allele ids are same between json and pharmvar sources
+    assert len(pharmvar_hap_names) == len(json_hap_names)
+    assert set(pharmvar_hap_names) == set(json_hap_names)
+    return (pharmvar_hap_names,)
+
+
+@app.cell
+def _(pharmvar, pharmvar_hap_names):
+    # parse the pharmvar table entries for each allele into a list for comparison with the json data
+    def get_pharmvar_vars(pharmvar_df, star_id):
+        vals = pharmvar_df.loc[pharmvar_df["Haplotype Name"] == star_id][
+            ["Variant Start", "rsID", "Reference Allele", "Variant Allele"]
+        ].values
+        # the variant allele is wrapped in a list so it can be compared with the json "alts" value
+        return [[v[0], v[1], v[2], [v[3]]] for v in vals]
+
+
+    pharmvar_dict = {}
+    for name in pharmvar_hap_names:
+        pharmvar_dict[name] = get_pharmvar_vars(pharmvar, name)
+    # look at one allele as a spot check
+    pharmvar_dict["NAT2*4.002"]
+    return (pharmvar_dict,)
+
+
+@app.cell
+def _(allele_dict, pd, pharmvar_dict):
+    # compare the json to what is in the pharmvar; both sides are sorted so
+    # the order of variants within an allele does not matter
+    # NOTE(review): the wild-type placeholder holds NaN values, which only
+    # compare equal inside lists when both sides are the same NaN object -
+    # confirm the wild-type rows match for that reason
+    results = []
+    for pharm_name, pharm_data in pharmvar_dict.items():
+        if sorted(pharm_data) == sorted(allele_dict[pharm_name]):
+            results.append([pharm_name, "MATCH", ".", "."])
+        else:
+            results.append(
+                [pharm_name, "NO MATCH", pharm_data, allele_dict[pharm_name]]
+            )
+    df = pd.DataFrame(
+        results, columns=["Haplotype Name", "Match", "PhamVar Data", "Json Data"]
+    )
+    df.value_counts("Match")
+    return
+
+
+@app.cell
+def _(mo):
+    mo.md(
+        r"""
+    ## Conclusions
+
+    - Alleles are all in the json from the Phamvar table.
+    - Allele definitions were correctly copied over.
+    - Scripts appear to make a correct form of the pharmvar data into json.
+    """
+    )
+    return
+
+
+if __name__ == "__main__":
+    app.run()