Files
pgx_nat2_issue/notebooks/validate_json.py

143 lines
3.9 KiB
Python

import marimo
__generated_with = "0.14.16"
app = marimo.App(width="medium")
@app.cell
def _():
import json
import marimo as mo
import pandas as pd
import numpy as np
return json, mo, np, pd
@app.cell
def _(mo):
mo.md(
r"""
# Validate the Json created by the pgx-pipeline
This notebook is to validate the json created by the scripts in the pgx-pipeline repo with the data from [pharmvar](https://www.pharmvar.org/).
"""
)
return
@app.cell
def _(pd):
# original file downloaded from pharmvar for NAT2
pharmvar = pd.read_csv(
"../data/NAT2-6.2.15/GRCh38/NAT2.NC_000008.11.haplotypes.tsv",
sep="\t",
comment="#",
)
pharmvar.head(10)
return (pharmvar,)
@app.cell
def _(json):
# json created with the scripts in the PGx-engine using above file
with open("../data/NAT2_translation.json", "r") as handle:
json_data = json.load(handle)
print(json_data.keys())
# parse vars into list to use later for mapping
j_var_list = [
[str(j["position"]), j["rsid"], j["ref"], j["alts"]]
for j in json_data["variants"]
]
print(j_var_list[0])
print(len(j_var_list))
return j_var_list, json_data
@app.cell
def _(j_var_list, json_data, np):
# extract the allele info for each allele and map with the variant details
allele_dict = {}
for allele in json_data["namedAlleles"]:
allele_id = allele["id"]
# if allele has no variants (i.e. wild type) give it a fixed entry
allele_dict[allele_id] = (
[
j_var_list[idx]
for idx, var in enumerate(allele["alleles"])
if var != None
]
if allele_id not in ("NAT2*1", "NAT2*1.001")
else [[".", np.nan, np.nan, [np.nan]]]
)
allele_dict["NAT2*4.002"]
return (allele_dict,)
@app.cell
def _(allele_dict, pharmvar):
# check allele names match between json and the original file
pharmvar_hap_names = pharmvar["Haplotype Name"].unique().tolist()
json_hap_names = list(allele_dict.keys())
# check allele ids are same between json and pharvar sources
assert len(pharmvar_hap_names) == len(json_hap_names)
assert set(pharmvar_hap_names) == set(json_hap_names)
return (pharmvar_hap_names,)
@app.cell
def _(pharmvar, pharmvar_hap_names):
# parse the pharmvar table entries for each allele into a list for comparison with the json data
def get_pharmvar_vars(pharmvar_df, star_id):
vals = pharmvar_df.loc[pharmvar_df["Haplotype Name"] == star_id][
["Variant Start", "rsID", "Reference Allele", "Variant Allele"]
].values
return [[v[0], v[1], v[2], [v[3]]] for v in vals]
pharmvar_dict = {}
for name in pharmvar_hap_names:
pharmvar_dict[name] = get_pharmvar_vars(pharmvar, name)
pharmvar_dict["NAT2*4.002"]
return (pharmvar_dict,)
@app.cell
def _(allele_dict, pd, pharmvar_dict):
# compare the json to what is in the pharmvar
results = []
for pharm_name, pharm_data in pharmvar_dict.items():
if sorted(pharm_data) == sorted(allele_dict[pharm_name]):
# print(f"{pharm_name}\t\tMATCH")
results.append([pharm_name, "MATCH", ".", "."])
else:
# print([pharm_name, "NO MATCH", pharm_data, allele_dict[pharm_name]])
results.append(
[pharm_name, "NO MATCH", pharm_data, allele_dict[pharm_name]]
)
df = pd.DataFrame(
results, columns=["Haplotype Name", "Match", "PhamVar Data", "Json Data"]
)
df.value_counts("Match")
return
@app.cell
def _(mo):
mo.md(
r"""
## Conclusions
- Alleles are all in the json from the Phamvar table.
- Allele definitions were correctly copied over.
- Scripts appear to make a correct form of the pharmvar data into json.
"""
)
return
if __name__ == "__main__":
app.run()