"""Rebuild the NAT2 entry of the pgx-engine allele whitelist.

The script reads the NAT2 look-up spreadsheet, keeps the rows that were
transferred and whose legacy name matches one of the previously whitelisted
alleles, saves that filtered table as a dated TSV next to the spreadsheet,
and replaces the ``"NAT2"`` entry of the whitelist JSON with the resulting
allele names.
"""

# Provenance: introduced by commit cb80d69877af839dcceca4d74d1dc67350bf6e33
# ("feat: code added that updates the NAT2 whitelist in the pgx-engine",
# Darren Wight, 2025-08-28) as src/create_new_nat2_whitelst.py.

import json
import logging
import os
import re
from datetime import datetime
from typing import Optional

import pandas as pd


NAT2_JSON_PATH = "/home/darren/Documents/2_repos/serenomica/pgx-engine/resources/allele_whitelist.json"
NAT2_LOOKUP = "/home/darren/Documents/3_projects/3_PGx/1_Clean_up/pgx_nat2_issue/data/nat2_look-up-table-v1-1.xlsx"

# Column headers of the look-up spreadsheet; the embedded line breaks are part
# of the header cells and must be reproduced exactly.
TRANSFERRED_COL = "Transferred\nYes/No"
LEGACY_NAME_COL = "Legacy \nname"
PHARMVAR_NAME_COL = "PharmVar \nName1"

logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s", level=logging.INFO
)


def build_allele_regex(old_white: list[str]) -> str:
    """Return a regex alternation matching any allele name in *old_white*.

    Every name is escaped with ``re.escape`` so characters such as ``*`` or
    ``.`` are matched literally.

    NOTE(review): the pattern is used for substring search, so ``"*4"`` also
    matches a legacy name such as ``"*40"`` -- confirm that is acceptable for
    the look-up table.

    Raises:
        ValueError: if *old_white* is empty; an empty pattern would match
            every row instead of none.
    """
    if not old_white:
        raise ValueError("old_white must contain at least one allele name")
    return "|".join(re.escape(allele) for allele in old_white)


def filter_lookup(lookup: pd.DataFrame, allele_regex: str) -> pd.DataFrame:
    """Keep the transferred rows whose legacy name matches *allele_regex*.

    A row is kept when its ``Transferred Yes/No`` cell equals ``"yes"`` and
    its legacy name contains a match for the pattern.
    """
    transferred = lookup[TRANSFERRED_COL] == "yes"
    # na=False: rows without a legacy name are excluded instead of putting
    # NaN into the boolean mask.
    legacy_match = lookup[LEGACY_NAME_COL].str.contains(
        allele_regex, regex=True, na=False
    )
    return lookup.loc[transferred & legacy_match]


def whitelist_copy_path(lookup_path: str, today: Optional[datetime] = None) -> str:
    """Return the dated TSV path, located in the directory of *lookup_path*.

    *today* defaults to the current local time; it is a parameter so the
    file name can be produced deterministically.
    """
    stamp = (today if today is not None else datetime.now()).strftime("%Y-%m-%d")
    return os.path.join(os.path.dirname(lookup_path), f"nat2_whitelist_{stamp}.tsv")


def build_nat2_entry(ref_allele: str, alleles: list[str]) -> dict:
    """Return the whitelist entry: reference allele plus alleles sorted descending."""
    return {
        "reference": ref_allele,
        "alleles": sorted(alleles, reverse=True),
    }


def main(
    old_white: list[str],
    ref_allele: str,
    overwrite: bool = False,
    *,
    lookup_path: str = NAT2_LOOKUP,
    json_path: str = NAT2_JSON_PATH,
) -> None:
    """Regenerate the NAT2 whitelist entry from the look-up spreadsheet.

    Args:
        old_white: Allele names of the previous whitelist; rows are selected
            by matching these against the legacy-name column.
        ref_allele: Value stored under ``"reference"`` in the new entry.
        overwrite: When true the whitelist JSON is rewritten in place,
            otherwise the updated JSON is printed to stdout.
        lookup_path: Excel look-up table to read (its first row is skipped).
        json_path: Whitelist JSON file to read and, optionally, overwrite.

    Raises:
        ValueError: if *old_white* is empty, or if the whitelist has no
            ``"NAT2"`` entry (updating it would add a gene instead of
            replacing one).
    """
    logging.info("Script started...")
    logging.info("Loading lookup and filtering to whitelist ('%s')...", lookup_path)
    lookup = pd.read_excel(lookup_path, skiprows=1)
    lookup = filter_lookup(lookup, build_allele_regex(old_white))

    save_path = whitelist_copy_path(lookup_path)
    logging.info("Saving copy of filtered whitelist ('%s')...", save_path)
    lookup.to_csv(save_path, sep="\t", index=False)

    logging.info("Reading whitelist ('%s')...", json_path)
    with open(json_path, "r", encoding="utf-8") as handle:
        whitelist = json.load(handle)

    # Sanity check: the set of genes must be the same before and after the
    # update, which holds exactly when NAT2 is already present.
    if "NAT2" not in whitelist:
        raise ValueError(f"'NAT2' is not a gene in the whitelist '{json_path}'")

    logging.info("Updating NAT2...")
    whitelist["NAT2"] = build_nat2_entry(
        ref_allele, lookup[PHARMVAR_NAME_COL].tolist()
    )

    if overwrite:
        logging.info("Overwrite active, saving new white list...")
        with open(json_path, "w", encoding="utf-8") as handle:
            json.dump(whitelist, handle)
    else:
        logging.info("Overwrite NOT active, printing json to screen...")
        print(json.dumps(whitelist, indent=2))

    logging.info("Script completed!")


if __name__ == "__main__":
    old_white = ["*4", "*5", "*6", "*7", "*14"]
    ref_allele = "*4.001"
    main(old_white, ref_allele, overwrite=True)