# File listing header: 55 lines, 1.6 KiB, Python
import functools as ft
import os
import sys
import zipfile

import pandas as pd

# Directory holding one result archive (<sample>.zip) per sample.
# NOTE: this name shadows the builtin dir(); it is kept unchanged because
# code further down the script reads it.
dir = "pgx_results"

# One single-column frame per sample: indexed by the "name" column of that
# sample's diplotypes.tsv, with the column labelled by the sample id.
diplotypes = []


def _join_unique(values):
    """Return the distinct entries of *values* as a sorted, comma-separated string.

    Sorting keeps the output identical between runs; the iteration order of
    a plain set of strings changes with Python's per-process hash
    randomization.
    """
    return ", ".join(sorted(set(values)))


# Sorted so that archives are processed (and logged) in a stable order.
for archive_name in sorted(os.listdir(dir)):
    # Only result archives are processed; anything else in the directory
    # (for example folders extracted here by this loop) is skipped.
    if not archive_name.endswith(".zip"):
        continue

    sample = archive_name[:-4]              # archive name without ".zip"
    archive_path = f"{dir}/{archive_name}"
    extract_dir = archive_path[:-4]         # <dir>/<sample>

    print("Working on sample ", archive_path)
    with zipfile.ZipFile(archive_path, "r") as zip_ref:
        zip_ref.extractall(extract_dir)
    # The archive is deleted once its contents are on disk (after the
    # ZipFile handle has been closed).
    os.unlink(archive_path)

    df = pd.read_csv(f"{extract_dir}/diplotypes.tsv", sep="\t")
    # Collapse to one row per "name", joining its distinct genotypes.
    df = df.groupby("name").agg({"genotype": _join_unique})
    df.rename(columns={"genotype": sample}, inplace=True)
    diplotypes.append(df)

# functools.reduce raises an opaque "reduce() of empty iterable" TypeError
# when there is nothing to merge, so stop with an explicit message instead.
if not diplotypes:
    sys.exit(f"No diplotype tables were collected from {dir!r}; nothing to merge.")

# Outer-join all per-sample frames on their index so that every index label
# seen in any sample is kept; samples lacking a label get a missing value.
df_final = ft.reduce(lambda left, right: left.join(right, how="outer"), diplotypes)

# Transpose so that the former columns become the row index, then sort it.
dfx = df_final.transpose().sort_index()

# Entries of the "variants" column of pgx_fulgent_panel.tsv that start
# with "rs".
_panel = pd.read_csv("pgx_fulgent_panel.tsv", sep="\t")
fulgent_ids = [
    variant
    for variant in _panel["variants"].to_list()
    # read_csv yields float NaN for blank cells; NaN has no .startswith(),
    # so non-string entries are skipped rather than crashing the script.
    if isinstance(variant, str) and variant.startswith("rs")
]

def _joined_variants(frame):
    """Return the distinct "Variant" values of *frame*, sorted and comma-separated."""
    return ", ".join(sorted(frame["Variant"].unique()))


# Set copy of the panel ids, built once: membership is tested for every
# annotation row of every sample, and a list lookup is linear each time.
fulgent_lookup = set(fulgent_ids)

# Output column -> one entry per row of dfx, in dfx.index order.
# Key order is the order in which the columns are appended to dfx.
rsid_columns = {
    "rsids (no phenotype)": [],
    "rsids (all)": [],
    "rsids (fulgent/no phenotype)": [],
    "rsids (fulgent/all)": [],
}

for sample in dfx.index:
    annotations = pd.read_json(f"{dir}/{sample}/pharmgkb_annotations.json")

    # Keep only rows whose "Variant" starts with "rs"; na=False treats a
    # missing value as "no match" instead of raising.
    rs_only = annotations[annotations["Variant"].str.startswith("rs", na=False)]
    # Disabled filter carried over from the original script:
    # rs_only = rs_only[rs_only["Level of Evidence"] < "3"]

    # A row is on the panel when the first whitespace-separated token of
    # its "Variant" value is one of the panel ids.
    first_token = rs_only["Variant"].str.split().str[0]
    on_panel = rs_only[first_token.isin(fulgent_lookup)]

    # Subsets whose "Phenotype(s)" value is missing.
    rs_no_phenotype = rs_only[rs_only["Phenotype(s)"].isna()]
    panel_no_phenotype = on_panel[on_panel["Phenotype(s)"].isna()]

    rsid_columns["rsids (no phenotype)"].append(_joined_variants(rs_no_phenotype))
    rsid_columns["rsids (all)"].append(_joined_variants(rs_only))
    rsid_columns["rsids (fulgent/no phenotype)"].append(_joined_variants(panel_no_phenotype))
    rsid_columns["rsids (fulgent/all)"].append(_joined_variants(on_panel))

# The lists were filled in dfx.index order, so positional assignment lines
# each entry up with its row.
for column, values in rsid_columns.items():
    dfx[column] = values

dfx.to_csv("pgx_diplotypes_rsids.tsv", sep="\t")