Files
reproduce_setup/pgx-main/create_overview.py
2025-08-18 13:09:30 +02:00

55 lines
1.6 KiB
Python

import pandas as pd
import os
import zipfile
import functools as ft
# Default locations, relative to the working directory the script is run from.
RESULTS_DIR = "pgx_results"
PANEL_FILE = "pgx_fulgent_panel.tsv"
OUTPUT_FILE = "pgx_diplotypes_rsids.tsv"

# Output columns, in the order they are appended to the overview table.
RSID_COLUMNS = (
    "rsids (no phenotype)",
    "rsids (all)",
    "rsids (fulgent/no phenotype)",
    "rsids (fulgent/all)",
)


def _extract_archives(results_dir: str) -> list:
    """Unpack every ``*.zip`` in *results_dir* and delete the archive.

    Each archive ``<name>.zip`` is extracted into the sibling directory
    ``<name>``. Returns the list of sample names that were extracted.
    """
    extracted = []
    for entry in sorted(os.listdir(results_dir)):
        if not entry.endswith(".zip"):
            continue
        archive = f"{results_dir}/{entry}"
        print("Working on sample ", archive)
        with zipfile.ZipFile(archive, "r") as zip_ref:
            zip_ref.extractall(archive[:-4])
        # Only reached when extraction succeeded, so a failed run keeps the zip.
        os.unlink(archive)
        extracted.append(entry[:-4])
    return extracted


def _existing_samples(results_dir: str) -> list:
    """Return names of sub-directories that already hold a ``diplotypes.tsv``."""
    return [
        entry
        for entry in os.listdir(results_dir)
        if os.path.isfile(f"{results_dir}/{entry}/diplotypes.tsv")
    ]


def _load_diplotypes(results_dir: str, sample: str) -> pd.DataFrame:
    """Return a one-column frame (column *sample*) indexed by the ``name`` column.

    All distinct ``genotype`` values sharing a ``name`` are joined with ", ".
    """
    table = pd.read_csv(f"{results_dir}/{sample}/diplotypes.tsv", sep="\t")
    # sorted() keeps the joined string reproducible; iterating a bare set of
    # strings can yield a different order on every interpreter run.
    joined = table.groupby("name").agg(
        {"genotype": lambda genotypes: ", ".join(sorted(set(genotypes)))}
    )
    return joined.rename(columns={"genotype": sample})


def _joined_variants(annotations: pd.DataFrame) -> str:
    """Return the distinct ``Variant`` values of *annotations*, sorted and joined."""
    return ", ".join(sorted(annotations["Variant"].unique()))


def _rsid_summary(annotations: pd.DataFrame, panel_ids: set) -> dict:
    """Summarise the ``rs...`` variants of one sample's annotation table.

    Returns a dict keyed by ``RSID_COLUMNS``. "fulgent" entries keep only
    variants whose first whitespace-separated token is in *panel_ids*;
    "no phenotype" entries keep only rows whose ``Phenotype(s)`` is missing.
    """
    # Vectorised masks keep a boolean dtype even when no row is left, so a
    # sample without any rs variants simply yields empty cells.
    rs_only = annotations[annotations["Variant"].str.startswith("rs", na=False)]
    # Evidence filter left disabled by the original author:
    # rs_only = rs_only[rs_only["Level of Evidence"] < "3"]
    on_panel = rs_only[rs_only["Variant"].str.split().str[0].isin(panel_ids)]
    return {
        "rsids (no phenotype)": _joined_variants(
            rs_only[rs_only["Phenotype(s)"].isna()]
        ),
        "rsids (all)": _joined_variants(rs_only),
        "rsids (fulgent/no phenotype)": _joined_variants(
            on_panel[on_panel["Phenotype(s)"].isna()]
        ),
        "rsids (fulgent/all)": _joined_variants(on_panel),
    }


def main(
    results_dir: str = RESULTS_DIR,
    panel_file: str = PANEL_FILE,
    output_file: str = OUTPUT_FILE,
) -> None:
    """Build the per-sample diplotype / rsID overview table.

    Args:
        results_dir: Directory holding one ``<sample>.zip`` (or an already
            extracted ``<sample>/`` directory) per sample. Archives are
            extracted in place and then deleted.
        panel_file: Tab-separated file with a ``variants`` column; entries
            starting with "rs" define the panel.
        output_file: Path of the tab-separated overview that is written.

    Raises:
        ValueError: If *results_dir* contains no sample at all.
    """
    extracted = _extract_archives(results_dir)
    # Also pick up samples unpacked by an earlier run: their archives are gone,
    # so without this a re-run would have nothing to summarise.
    samples = sorted(set(extracted) | set(_existing_samples(results_dir)))
    if not samples:
        raise ValueError(
            f"no sample archives or extracted samples found in {results_dir!r}"
        )

    diplotypes = [_load_diplotypes(results_dir, sample) for sample in samples]
    df_final = ft.reduce(
        lambda left, right: left.join(right, how="outer"), diplotypes
    )
    # One row per sample, one column per ``name`` value.
    dfx = df_final.transpose().sort_index()

    panel_variants = pd.read_csv(panel_file, sep="\t")["variants"].dropna()
    # A set gives constant-time membership tests for the per-variant lookup.
    fulgent_ids = {v for v in panel_variants.astype(str) if v.startswith("rs")}

    summaries = [
        _rsid_summary(
            pd.read_json(f"{results_dir}/{sample}/pharmgkb_annotations.json"),
            fulgent_ids,
        )
        for sample in dfx.index
    ]
    for column in RSID_COLUMNS:
        dfx[column] = [summary[column] for summary in summaries]

    dfx.to_csv(output_file, sep="\t")


if __name__ == "__main__":
    main()