import marimo __generated_with = "0.14.16" app = marimo.App(width="medium") @app.cell def _(): import json import marimo as mo import pandas as pd import numpy as np return json, mo, np, pd @app.cell def _(mo): mo.md( r""" # Validate the Json created by the pgx-pipeline This notebook is to validate the json created by the scripts in the pgx-pipeline repo with the data from [pharmvar](https://www.pharmvar.org/). """ ) return @app.cell def _(pd): # original file downloaded from pharmvar for NAT2 pharmvar = pd.read_csv( "../data/NAT2-6.2.15/GRCh38/NAT2.NC_000008.11.haplotypes.tsv", sep="\t", comment="#", ) pharmvar.head(10) return (pharmvar,) @app.cell def _(json): # json created with the scripts in the PGx-engine using above file with open("../data/NAT2_translation.json", "r") as handle: json_data = json.load(handle) print(json_data.keys()) # parse vars into list to use later for mapping j_var_list = [ [str(j["position"]), j["rsid"], j["ref"], j["alts"]] for j in json_data["variants"] ] print(j_var_list[0]) print(len(j_var_list)) return j_var_list, json_data @app.cell def _(j_var_list, json_data, np): # extract the allele info for each allele and map with the variant details allele_dict = {} for allele in json_data["namedAlleles"]: allele_id = allele["id"] # if allele has no variants (i.e. wild type) give it a fixed entry allele_dict[allele_id] = ( [ j_var_list[idx] for idx, var in enumerate(allele["alleles"]) if var != None ] if allele_id not in ("NAT2*1", "NAT2*1.001") else [[".", np.nan, np.nan, [np.nan]]] ) allele_dict["NAT2*4.002"] return (allele_dict,) @app.cell def _(allele_dict, pharmvar): # check allele names match between json and the original file pharmvar_hap_names = pharmvar["Haplotype Name"].unique().tolist() json_hap_names = list(allele_dict.keys()) # check allele ids are same between json and pharvar sources assert len(pharmvar_hap_names) == len(json_hap_names) assert set(pharmvar_hap_names) == set(json_hap_names) return (pharmvar_hap_names,) @app.cell def _(pharmvar, pharmvar_hap_names): # parse the pharmvar table entries for each allele into a list for comparison with the json data def get_pharmvar_vars(pharmvar_df, star_id): vals = pharmvar_df.loc[pharmvar_df["Haplotype Name"] == star_id][ ["Variant Start", "rsID", "Reference Allele", "Variant Allele"] ].values return [[v[0], v[1], v[2], [v[3]]] for v in vals] pharmvar_dict = {} for name in pharmvar_hap_names: pharmvar_dict[name] = get_pharmvar_vars(pharmvar, name) pharmvar_dict["NAT2*4.002"] return (pharmvar_dict,) @app.cell def _(allele_dict, pd, pharmvar_dict): # compare the json to what is in the pharmvar results = [] for pharm_name, pharm_data in pharmvar_dict.items(): if sorted(pharm_data) == sorted(allele_dict[pharm_name]): # print(f"{pharm_name}\t\tMATCH") results.append([pharm_name, "MATCH", ".", "."]) else: # print([pharm_name, "NO MATCH", pharm_data, allele_dict[pharm_name]]) results.append( [pharm_name, "NO MATCH", pharm_data, allele_dict[pharm_name]] ) df = pd.DataFrame( results, columns=["Haplotype Name", "Match", "PhamVar Data", "Json Data"] ) df.value_counts("Match") return @app.cell def _(mo): mo.md( r""" ## Conclusions - Alleles are all in the json from the Phamvar table. - Allele definitions were correctly copied over. - Scripts appear to make a correct form of the pharmvar data into json. """ ) return if __name__ == "__main__": app.run()