feat: add code to validate the NAT2 JSON made by scripts in the pgx-engine
This commit is contained in:
142
notebooks/validate_json.py
Normal file
142
notebooks/validate_json.py
Normal file
@@ -0,0 +1,142 @@
|
|||||||
|
import marimo
|
||||||
|
|
||||||
|
# Version string stored in the notebook header (presumably the marimo version
# that generated this file — confirm before editing by hand).
__generated_with = "0.14.16"
|
||||||
|
# Application object that every `@app.cell` function below registers with
# and that `app.run()` starts at the bottom of the file.
app = marimo.App(width="medium")
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _():
    # Shared imports: the modules are returned so the other cells can
    # receive them as parameters.

    # Standard library
    import json

    # Third-party
    import marimo as mo
    import numpy as np
    import pandas as pd

    return json, mo, np, pd
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
|
||||||
|
def _(mo):
|
||||||
|
mo.md(
|
||||||
|
r"""
|
||||||
|
# Validate the JSON created by the pgx-pipeline
|
||||||
|
|
||||||
|
This notebook is to validate the json created by the scripts in the pgx-pipeline repo with the data from [pharmvar](https://www.pharmvar.org/).
|
||||||
|
"""
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _(pd):
    # Original NAT2 haplotype table downloaded from PharmVar.
    # Underscore-prefixed helper name keeps the path private to this cell.
    _haplotype_tsv = "../data/NAT2-6.2.15/GRCh38/NAT2.NC_000008.11.haplotypes.tsv"

    # Tab-separated file; lines starting with "#" are skipped by `comment`.
    pharmvar = pd.read_csv(_haplotype_tsv, sep="\t", comment="#")

    # Bare trailing expression previewing the first 10 rows (presumably
    # rendered by marimo as the cell output).
    pharmvar.head(10)
    return (pharmvar,)
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _(json):
    # JSON created with the scripts in the PGx-engine using the PharmVar file.
    # The encoding is given explicitly so the read does not depend on the
    # platform's default locale encoding.
    with open("../data/NAT2_translation.json", "r", encoding="utf-8") as handle:
        json_data = json.load(handle)
    print(json_data.keys())

    # Flatten every variant into [position-as-string, rsid, ref, alts].
    # The list order follows json_data["variants"], so a later cell can look
    # variants up by their index.
    j_var_list = [
        [str(j["position"]), j["rsid"], j["ref"], j["alts"]]
        for j in json_data["variants"]
    ]

    # Quick sanity output: first parsed variant and the total count.
    print(j_var_list[0])
    print(len(j_var_list))
    return j_var_list, json_data
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _(j_var_list, json_data, np):
    # Build {allele id: [variant details, ...]} for every named allele.
    # Each allele carries an "alleles" list; an entry that is not None at
    # index i selects variant i of `j_var_list`.
    allele_dict = {}
    for allele in json_data["namedAlleles"]:
        allele_id = allele["id"]

        if allele_id in ("NAT2*1", "NAT2*1.001"):
            # These ids are treated as the wild type and always receive a
            # fixed placeholder entry instead of a variant lookup
            # (presumably mirroring how the PharmVar table represents
            # them — confirm against the TSV).
            allele_dict[allele_id] = [[".", np.nan, np.nan, [np.nan]]]
        else:
            allele_dict[allele_id] = [
                j_var_list[idx]
                for idx, var in enumerate(allele["alleles"])
                # Identity test: None marks "no variant at this position".
                if var is not None
            ]

    # Bare trailing expression: spot check of one allele.
    allele_dict["NAT2*4.002"]
    return (allele_dict,)
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _(allele_dict, pharmvar):
    # Check that allele names match between the JSON and the original file.
    pharmvar_hap_names = pharmvar["Haplotype Name"].unique().tolist()
    json_hap_names = list(allele_dict.keys())

    # Names present on one side only; used purely for the failure messages.
    # Underscore-prefixed so they stay private to this cell.
    _only_in_pharmvar = set(pharmvar_hap_names) - set(json_hap_names)
    _only_in_json = set(json_hap_names) - set(pharmvar_hap_names)

    # Check allele ids are the same between the JSON and PharmVar sources.
    assert len(pharmvar_hap_names) == len(json_hap_names), (
        f"allele count differs: PharmVar has {len(pharmvar_hap_names)}, "
        f"JSON has {len(json_hap_names)}"
    )
    # key=str keeps the message buildable even if a name is not a string
    # (e.g. a missing value read from the table).
    assert set(pharmvar_hap_names) == set(json_hap_names), (
        f"allele ids differ; only in PharmVar: "
        f"{sorted(_only_in_pharmvar, key=str)}; only in JSON: "
        f"{sorted(_only_in_json, key=str)}"
    )
    return (pharmvar_hap_names,)
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _(pharmvar, pharmvar_hap_names):
    # Turn the PharmVar table rows of each allele into plain lists so they
    # can be compared with the JSON data.
    def get_pharmvar_vars(pharmvar_df, star_id):
        # Returns one [start, rsID, ref, [alt]] list per row of `pharmvar_df`
        # whose "Haplotype Name" equals `star_id`.
        is_requested = pharmvar_df["Haplotype Name"] == star_id
        wanted_columns = [
            "Variant Start",
            "rsID",
            "Reference Allele",
            "Variant Allele",
        ]
        rows = pharmvar_df.loc[is_requested][wanted_columns].values
        # The variant allele is wrapped in a one-element list.
        return [[start, rsid, ref, [alt]] for start, rsid, ref, alt in rows]

    pharmvar_dict = {
        hap_name: get_pharmvar_vars(pharmvar, hap_name)
        for hap_name in pharmvar_hap_names
    }

    # Bare trailing expression: spot check of one allele.
    pharmvar_dict["NAT2*4.002"]
    return (pharmvar_dict,)
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _(allele_dict, pd, pharmvar_dict):
    # Compare the JSON allele definitions with what is in the PharmVar table.
    def _comparison_row(hap_name, pharm_entries):
        # Both sides are sorted first so that the order of variants within
        # an allele does not affect the outcome.
        if sorted(pharm_entries) == sorted(allele_dict[hap_name]):
            return [hap_name, "MATCH", ".", "."]
        # On a mismatch keep both definitions so they can be inspected.
        return [hap_name, "NO MATCH", pharm_entries, allele_dict[hap_name]]

    comparison_rows = [
        _comparison_row(hap_name, pharm_entries)
        for hap_name, pharm_entries in pharmvar_dict.items()
    ]
    comparison_df = pd.DataFrame(
        comparison_rows,
        columns=["Haplotype Name", "Match", "PhamVar Data", "Json Data"],
    )

    # Bare trailing expression: number of alleles per "Match" value.
    comparison_df.value_counts("Match")
    return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
|
||||||
|
def _(mo):
|
||||||
|
mo.md(
|
||||||
|
r"""
|
||||||
|
## Conclusions
|
||||||
|
|
||||||
|
- All alleles from the PharmVar table are present in the JSON.
|
||||||
|
- Allele definitions were correctly copied over.
|
||||||
|
- The scripts appear to convert the PharmVar data into JSON correctly.
|
||||||
|
"""
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
# Start the marimo app only when this file is executed directly.
if __name__ == "__main__":
    app.run()
|
||||||
Reference in New Issue
Block a user