feat: code added that updates the NAT2 whitelist in the pgx-engine

This commit is contained in:
2025-08-28 12:08:39 +02:00
parent 4fe9a42a56
commit cb80d69877

View File

@@ -0,0 +1,73 @@
import os
import json
import re
import logging
from datetime import datetime
import pandas as pd
# Allele whitelist JSON that main() reads and, when overwrite is enabled,
# rewrites in place.
NAT2_JSON_PATH = (
    "/home/darren/Documents/2_repos/serenomica/pgx-engine/resources/allele_whitelist.json"
)

# Excel look-up table that main() loads with pandas; the filtered TSV copy is
# saved into this file's directory.
NAT2_LOOKUP = (
    "/home/darren/Documents/3_projects/3_PGx/1_Clean_up/pgx_nat2_issue/data/nat2_look-up-table-v1-1.xlsx"
)

# Root logger: INFO and above, each line prefixed with timestamp and level.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)
def main(old_white: list[str], ref_allele: str, overwrite: bool = False) -> None:
    """Rebuild the ``"NAT2"`` entry of the allele whitelist from the look-up table.

    Loads the Excel sheet at ``NAT2_LOOKUP``, keeps the rows marked ``"yes"``
    in the transferred column whose legacy name matches one of ``old_white``,
    saves that filtered table as a dated TSV next to the sheet, and replaces
    the ``"NAT2"`` entry of the JSON whitelist at ``NAT2_JSON_PATH``.

    Args:
        old_white: Legacy allele prefixes (e.g. ``"*4"``). Each one is
            regex-escaped and searched for anywhere in the legacy-name column.
        ref_allele: Value stored under ``"reference"`` in the new NAT2 entry.
        overwrite: When true, write the updated whitelist back to
            ``NAT2_JSON_PATH``; otherwise print it to stdout.

    Raises:
        AssertionError: If the whitelist has no ``"NAT2"`` key, i.e. the
            update would add a gene instead of replacing an existing one.
    """
    logging.info("Script started...")

    logging.info(f"Loading lookup and filtering to whitelist ('{NAT2_LOOKUP}')...")
    lookup = pd.read_excel(NAT2_LOOKUP, skiprows=1)
    # re.escape handles every metacharacter in a prefix, not only the first.
    # The trailing "+" (kept from the original pattern) repeats only the last
    # character of each prefix.
    # NOTE(review): this is a substring search, so "*4" also matches names
    # such as "*41" - confirm that is intended.
    lookup_re = re.compile("|".join(f"{re.escape(o)}+" for o in old_white))
    lookup = lookup.loc[
        (lookup["Transferred\nYes/No"] == "yes")
        # na=False: rows with a missing legacy name are excluded explicitly.
        & lookup["Legacy \nname"].str.contains(lookup_re, na=False)
    ]

    # Build the path before logging it; ISO date keeps the file names sortable
    # and avoids the platform-specific "%-m" directive.
    save_path = os.path.join(
        os.path.dirname(NAT2_LOOKUP),
        f"nat2_whitelist_{datetime.now().strftime('%Y-%m-%d')}.tsv",
    )
    logging.info(f"Saving copy of filtered whitelist: '{save_path}'...")
    lookup.to_csv(save_path, sep="\t", index=False)

    logging.info(f"Reading whitelist ('{NAT2_JSON_PATH}')...")
    with open(NAT2_JSON_PATH, "r", encoding="utf-8") as handle:
        whitelist = json.load(handle)

    # Sanity check: the set of genes must not change, which holds exactly when
    # NAT2 is already present. Raised explicitly so the guard survives
    # ``python -O`` (plain ``assert`` statements are stripped there).
    if "NAT2" not in whitelist:
        raise AssertionError(
            "'NAT2' is not present in the whitelist; refusing to add a new gene"
        )

    logging.info("Updating NAT2...")
    whitelist["NAT2"] = {
        "reference": ref_allele,
        "alleles": sorted(lookup["PharmVar \nName1"].tolist(), reverse=True),
    }

    if overwrite:
        logging.info("Overwrite active, saving new white list...")
        # Serialise first so a serialisation error cannot leave a truncated
        # whitelist file behind.
        payload = json.dumps(whitelist)
        with open(NAT2_JSON_PATH, "w", encoding="utf-8") as handle:
            handle.write(payload)
    else:
        logging.info("Overwrite NOT active, printing json to screen...")
        print(json.dumps(whitelist, indent=2))

    logging.info("Script completed!")
if __name__ == "__main__":
    # Script entry: filter the look-up table on these legacy allele prefixes,
    # record "*4.001" as the NAT2 reference, and write the whitelist in place.
    main(
        old_white=["*4", "*5", "*6", "*7", "*14"],
        ref_allele="*4.001",
        overwrite=True,
    )