feat: add code to validate the NAT2 JSON made by the scripts in the pgx-engine
This commit is contained in:
142
notebooks/validate_json.py
Normal file
142
notebooks/validate_json.py
Normal file
@@ -0,0 +1,142 @@
|
||||
import marimo
|
||||
|
||||
# Version stamp stored in the notebook file (presumably the marimo release
# that generated it -- confirm against the marimo docs).
__generated_with = "0.14.16"
# Notebook application object; every @app.cell function below is attached to it.
app = marimo.App(width="medium")
|
||||
|
||||
|
||||
@app.cell
def _():
    # Import cell: the modules returned here are the ones the later cells
    # receive through their parameters (json, mo, np, pd).
    import json

    import marimo as mo
    import numpy as np
    import pandas as pd

    return json, mo, np, pd
|
||||
|
||||
|
||||
@app.cell
def _(mo):
    # Title and purpose of the notebook, rendered as markdown.
    mo.md(
        r"""
    # Validate the Json created by the pgx-pipeline

    This notebook is to validate the json created by the scripts in the pgx-pipeline repo with the data from [pharmvar](https://www.pharmvar.org/).
    """
    )
    return
|
||||
|
||||
|
||||
@app.cell
def _(pd):
    # Reference data: the NAT2 haplotype table downloaded from PharmVar.
    # The path is relative, so it resolves against the current working
    # directory of the process running the notebook.
    _haplotype_tsv = "../data/NAT2-6.2.15/GRCh38/NAT2.NC_000008.11.haplotypes.tsv"

    # Tab-separated file; lines starting with "#" are treated as comments.
    pharmvar = pd.read_csv(_haplotype_tsv, sep="\t", comment="#")

    # Preview of the first ten rows.
    pharmvar.head(10)
    return (pharmvar,)
|
||||
|
||||
|
||||
@app.cell
def _(json):
    # Load the translation JSON that the PGx-engine scripts created from the
    # PharmVar haplotype file.
    # The encoding is pinned to UTF-8 so the read does not depend on the
    # platform's locale default.
    with open("../data/NAT2_translation.json", "r", encoding="utf-8") as handle:
        json_data = json.load(handle)
    print(json_data.keys())

    # Flatten each variant into a [position, rsid, ref, alts] record, keeping
    # the order of json_data["variants"] so records can be looked up by index.
    # The position is converted to str (presumably so it compares equal to the
    # PharmVar "Variant Start" values -- confirm the dtype of that column).
    j_var_list = [
        [
            str(variant["position"]),
            variant["rsid"],
            variant["ref"],
            variant["alts"],
        ]
        for variant in json_data["variants"]
    ]
    print(j_var_list[0])
    print(len(j_var_list))
    return j_var_list, json_data
|
||||
|
||||
|
||||
@app.cell
def _(j_var_list, json_data, np):
    # Build allele_dict: allele id -> list of [position, rsid, ref, alts]
    # records for the variants that allele carries.

    # Alleles that get a fixed placeholder instead of a variant lookup
    # (the original comment describes them as wild type, i.e. no variants).
    _wild_type_ids = ("NAT2*1", "NAT2*1.001")

    allele_dict = {}
    for allele in json_data["namedAlleles"]:
        allele_id = allele["id"]

        if allele_id in _wild_type_ids:
            # A fresh placeholder per allele so no two entries share a list.
            # NOTE(review): the later equality check against the PharmVar rows
            # relies on how NaN compares inside lists -- confirm it behaves as
            # intended for these entries.
            allele_dict[allele_id] = [[".", np.nan, np.nan, [np.nan]]]
            continue

        # allele["alleles"] is walked in parallel with j_var_list: index i of
        # one selects record i of the other. None entries are skipped
        # (presumably meaning the allele does not carry that variant).
        allele_dict[allele_id] = [
            j_var_list[idx]
            for idx, var in enumerate(allele["alleles"])
            if var is not None
        ]

    # Show one allele as a spot check.
    allele_dict["NAT2*4.002"]
    return (allele_dict,)
|
||||
|
||||
|
||||
@app.cell
def _(allele_dict, pharmvar):
    # Check that the JSON and the PharmVar table describe the same set of
    # haplotype (allele) names.
    pharmvar_hap_names = pharmvar["Haplotype Name"].unique().tolist()
    json_hap_names = list(allele_dict.keys())

    # The messages are only built when a check fails; they report what differs
    # so a mismatch can be diagnosed without re-running by hand.
    assert len(pharmvar_hap_names) == len(json_hap_names), (
        f"haplotype count differs: PharmVar has {len(pharmvar_hap_names)}, "
        f"JSON has {len(json_hap_names)}"
    )
    assert set(pharmvar_hap_names) == set(json_hap_names), (
        "haplotype names differ; "
        f"only in PharmVar: {set(pharmvar_hap_names) - set(json_hap_names)}, "
        f"only in JSON: {set(json_hap_names) - set(pharmvar_hap_names)}"
    )
    return (pharmvar_hap_names,)
|
||||
|
||||
|
||||
@app.cell
def _(pharmvar, pharmvar_hap_names):
    # Convert the PharmVar rows of every haplotype into the same
    # [position, rsid, ref, [alt]] record layout used for the JSON data.
    def get_pharmvar_vars(pharmvar_df, star_id):
        """Return the variant records listed in pharmvar_df for star_id."""
        wanted_columns = [
            "Variant Start",
            "rsID",
            "Reference Allele",
            "Variant Allele",
        ]
        is_star = pharmvar_df["Haplotype Name"] == star_id
        rows = pharmvar_df.loc[is_star, wanted_columns].values
        # The variant allele is wrapped in a list so each record has the same
        # nesting as the records built from the JSON "alts" field.
        return [[start, rsid, ref, [alt]] for start, rsid, ref, alt in rows]

    pharmvar_dict = {
        hap_name: get_pharmvar_vars(pharmvar, hap_name)
        for hap_name in pharmvar_hap_names
    }

    # Show one allele as a spot check.
    pharmvar_dict["NAT2*4.002"]
    return (pharmvar_dict,)
|
||||
|
||||
|
||||
@app.cell
def _(allele_dict, pd, pharmvar_dict):
    # Compare, haplotype by haplotype, the PharmVar records with the JSON
    # records. Both sides are sorted first so record order does not matter.
    # Matching rows carry "." placeholders; mismatches keep both record lists
    # for inspection.
    results = [
        [pharm_name, "MATCH", ".", "."]
        if sorted(pharm_data) == sorted(allele_dict[pharm_name])
        else [pharm_name, "NO MATCH", pharm_data, allele_dict[pharm_name]]
        for pharm_name, pharm_data in pharmvar_dict.items()
    ]

    df = pd.DataFrame(
        results,
        columns=["Haplotype Name", "Match", "PhamVar Data", "Json Data"],
    )

    # Tally of MATCH vs NO MATCH rows.
    df.value_counts("Match")
    return
|
||||
|
||||
|
||||
@app.cell
def _(mo):
    # Closing summary, rendered as markdown.
    # NOTE(review): this text is static -- it is not derived from the
    # comparison results, so re-check it whenever the input data changes.
    mo.md(
        r"""
    ## Conclusions

    - Alleles are all in the json from the PharmVar table.
    - Allele definitions were correctly copied over.
    - Scripts appear to make a correct form of the pharmvar data into json.
    """
    )
    return
|
||||
|
||||
|
||||
# When the file is executed directly (not imported), run the notebook app.
if __name__ == "__main__":
    app.run()
|
||||
Reference in New Issue
Block a user