"""Collect per-sample PGx results into one tab-separated summary table.

For every ``*.zip`` archive in ``pgx_results`` the script:

1. extracts the archive into a sibling directory named after the archive
   (``pgx_results/<sample>``) and then deletes the archive;
2. reads ``diplotypes.tsv`` from that directory and collapses the
   ``genotype`` values of each ``name`` into one comma-separated cell.

The per-sample columns are outer-joined, transposed so that each row is a
sample, and extended with four columns listing the ``rs…`` variants found in
the sample's ``pharmgkb_annotations.json``.  The result is written to
``pgx_diplotypes_rsids.tsv``.

NOTE: archives are removed once extracted, so a second run over the same
directory finds nothing to process and stops with an explanatory message.
"""
import functools as ft
import os
import zipfile

import pandas as pd

RESULTS_DIR = "pgx_results"
PANEL_FILE = "pgx_fulgent_panel.tsv"
OUTPUT_FILE = "pgx_diplotypes_rsids.tsv"
ZIP_SUFFIX = ".zip"

# Output column names, in the order they are appended to the final table.
COL_NO_PHENOTYPE = "rsids (no phenotype)"
COL_ALL = "rsids (all)"
COL_PANEL_NO_PHENOTYPE = "rsids (fulgent/no phenotype)"
COL_PANEL_ALL = "rsids (fulgent/all)"


def _is_rsid(value):
    """Return True when *value* is a string that starts with ``rs``.

    Non-string cells (e.g. NaN produced by blank fields) are rejected instead
    of raising ``AttributeError``.
    """
    return isinstance(value, str) and value.startswith("rs")


def _joined_unique(values):
    """Return the distinct items of *values* as a sorted ``", "``-joined string.

    Sorting keeps the output identical between runs; iterating a plain
    ``set`` of strings does not guarantee a stable order.
    """
    return ", ".join(sorted(set(values)))


def _load_sample_diplotypes(archive_name):
    """Extract one archive and return its diplotypes as a one-column frame.

    Args:
        archive_name: File name (not path) of a ``.zip`` inside RESULTS_DIR.

    Returns:
        DataFrame indexed by ``name`` with a single column named after the
        sample (the archive name without its ``.zip`` suffix).
    """
    sample = archive_name[: -len(ZIP_SUFFIX)]
    archive_path = f"{RESULTS_DIR}/{archive_name}"
    sample_dir = f"{RESULTS_DIR}/{sample}"

    print("Working on sample ", archive_path)
    with zipfile.ZipFile(archive_path, "r") as zip_ref:
        zip_ref.extractall(sample_dir)
    # The archive is deleted only after the handle above has been closed.
    os.unlink(archive_path)

    table = pd.read_csv(f"{sample_dir}/diplotypes.tsv", sep="\t")
    per_name = table.groupby("name").agg({"genotype": _joined_unique})
    return per_name.rename(columns={"genotype": sample})


def _summarise_variants(sample, panel_ids):
    """Build the four rsID summary cells for one sample.

    Args:
        sample: Sample name; its annotations are read from
            ``RESULTS_DIR/<sample>/pharmgkb_annotations.json``.
        panel_ids: Set of rsIDs taken from the panel file.

    Returns:
        Dict mapping each output column name to a comma-separated string of
        sorted, unique ``Variant`` values.
    """
    annotations = pd.read_json(f"{RESULTS_DIR}/{sample}/pharmgkb_annotations.json")

    # Masks are cast to bool so that an empty table still filters by row.
    rs_mask = annotations["Variant"].apply(_is_rsid).astype(bool)
    rs_rows = annotations[rs_mask]
    # NOTE: a filter on "Level of Evidence" < "3" existed here in commented-out
    # form; it remains disabled.

    # Only the first whitespace-separated token of "Variant" is compared
    # against the panel.
    panel_mask = (
        rs_rows["Variant"].apply(lambda v: v.split()[0] in panel_ids).astype(bool)
    )
    panel_rows = rs_rows[panel_mask]

    rs_no_pheno = rs_rows[rs_rows["Phenotype(s)"].isna()]
    panel_no_pheno = panel_rows[panel_rows["Phenotype(s)"].isna()]

    return {
        COL_NO_PHENOTYPE: _joined_unique(rs_no_pheno["Variant"]),
        COL_ALL: _joined_unique(rs_rows["Variant"]),
        COL_PANEL_NO_PHENOTYPE: _joined_unique(panel_no_pheno["Variant"]),
        COL_PANEL_ALL: _joined_unique(panel_rows["Variant"]),
    }


# --- Step 1: unpack every archive and gather one diplotype column per sample.
diplotypes = [
    _load_sample_diplotypes(entry)
    for entry in sorted(os.listdir(RESULTS_DIR))
    if entry.endswith(ZIP_SUFFIX)
]

if not diplotypes:
    # reduce() on an empty list would raise an opaque TypeError.
    raise SystemExit(
        f"No {ZIP_SUFFIX} archives found in {RESULTS_DIR!r}; nothing to summarise."
    )

# --- Step 2: outer-join the columns, then put samples on the rows.
df_final = ft.reduce(lambda left, right: left.join(right, how="outer"), diplotypes)
dfx = df_final.transpose().sort_index()

# --- Step 3: rsIDs listed in the panel file (a set for O(1) membership tests).
fulgent_ids = {
    variant
    for variant in pd.read_csv(PANEL_FILE, sep="\t")["variants"].to_list()
    if _is_rsid(variant)
}

# --- Step 4: add the rsID summary columns, one value per sample row.
summaries = [_summarise_variants(sample, fulgent_ids) for sample in dfx.index]
for column in (COL_NO_PHENOTYPE, COL_ALL, COL_PANEL_NO_PHENOTYPE, COL_PANEL_ALL):
    dfx[column] = [summary[column] for summary in summaries]

dfx.to_csv(OUTPUT_FILE, sep="\t")