feat: update notebook code

This commit is contained in:
2025-09-19 13:49:30 +02:00
parent cb80d69877
commit 877c736ce5

View File

@@ -0,0 +1,80 @@
import marimo

# Presumably the marimo version that last wrote this notebook file -- confirm.
__generated_with = "0.14.16"
# Notebook application object; every cell below registers on it via @app.cell.
app = marimo.App(width="medium")
@app.cell
def _():
    # Shared imports for the notebook; the returned names are what the other
    # cells receive as parameters.
    # Standard library
    import json
    import os

    # Third-party
    import marimo as mo
    import pandas as pd

    return json, mo, os, pd
@app.cell
def _(mo):
    # Introductory notes rendered as markdown: what was changed in the
    # pgx-engine before the validation samples were re-run.
    # Blank lines separate the headings, the list and the closing sentence so
    # that the final "See PR#3" line renders as its own paragraph rather than
    # being folded into the last bullet.
    mo.md(
        r"""
    # Analyse the NAT2 Results post-update of the JSON with the latest star allele mapping.

    ### What changes were made to the pgx-engine

    - Json updated with the latest [NAT2 allele definitions](https://www.pharmvar.org/gene/NAT2?ref=blog.clinpgx.org)
    - Used these scripts [pharmvar_to_translation_table.py](https://git.serenomica.com/serenomica/pgx-engine/src/branch/master/scripts/pharmvar_to_translation_table.py) >> [translation_to_json.py](https://git.serenomica.com/serenomica/pgx-engine/src/branch/master/scripts/translation_to_json.py) >> new json
    - Updated the diplotypes_to_phenotypes.tsv to have the corrected new allele mapping
    - [Mapping source](https://a.storyblok.com/f/70677/x/301f6834b5/nat2_look-up-table-v1-1.xlsx) PharmVar
    - Done using [update_nat2_d2p_tsv.py](https://git.serenomica.com/serenomica/pgx-engine/src/branch/fix-pharmvar-to-json-scripts/scripts/update_nat2_d2p_tsv.py)
    - All validation samples run on server with updated PGx engine

    See [PR#3](https://git.serenomica.com/serenomica/pgx-engine/pulls/3) for more details
    """
    )
    return
@app.cell
def _(pd):
    # NAT2 look-up table spreadsheet. ``skiprows=1`` drops the first row of
    # the sheet (presumably a title row above the real header -- confirm
    # against the workbook).
    # NOTE: ``nat2_mapping`` is not part of this cell's return value, so no
    # other cell receives it yet.
    nat2_mapping = pd.read_excel("data/nat2_look-up-table-v1-1.xlsx", skiprows=1)

    # Directories holding the per-sample PGx output. Judging by the names,
    # "new" is the run after the NAT2 update and "old" is the earlier full
    # validation run.
    path_new_nat2 = "data/results_nat2_update_validation/"
    path_old_nat2 = "../reproduce_setup/data/full_validation_data/"

    # Build the sample list: one path per line, reduced to its last path
    # component with ".vcf" removed.
    # Blank / whitespace-only lines are skipped; otherwise they would turn
    # into an empty sample name, inflate the count and point at a result
    # directory that does not exist.
    with open("../reproduce_setup/data/se_val_samples.txt", "r") as handle:
        samples = [
            line.rstrip().split("/")[-1].replace(".vcf", "")
            for line in handle
            if line.strip()
        ]
    print(f"Number of samples: {len(samples)}")
    return path_new_nat2, path_old_nat2, samples
@app.cell
def _(json, os, path_new_nat2, path_old_nat2, samples):
    def parse_result(results_dir, sample_name, genes):
        # Read <results_dir>/<sample_name>_pgx_result/output.json and return
        # a {gene: entry} dict taken from its "called_genotypes" section.
        # A missing file or a missing gene key raises (nothing is caught).
        output_file = os.path.join(
            results_dir, f"{sample_name}_pgx_result", "output.json"
        )
        with open(output_file, "r") as fh:
            payload = json.load(fh)

        genotypes = {}
        for gene in genes:
            genotypes[gene] = payload["called_genotypes"][gene]
        return genotypes

    # Print the old and the new NAT2 call for every sample, one line each:
    # old result first, then a tab, then the new result.
    for sample in samples:
        _old_call = parse_result(path_old_nat2, sample, ["NAT2"])
        _new_call = parse_result(path_new_nat2, sample, ["NAT2"])
        print(_old_call, "\t", _new_call)
    return
@app.cell
def _():
    # Empty cell: no code, no inputs, nothing returned.
    return
# Run the notebook app when this file is executed directly as a script.
if __name__ == "__main__":
    app.run()