# reproduce_setup/pgx-main/create_overview.py

import pandas as pd
import os
import zipfile
import functools as ft


# Collect one single-column DataFrame per sample archive found in `dir`.
#
# Every "<sample>.zip" in the results directory is unpacked into a sibling
# "<sample>" directory, its diplotypes.tsv is read, and the distinct
# `genotype` values for each `name` are joined into one comma-separated
# string stored in a column named after the sample.
#
# NOTE: `dir` shadows the builtin of the same name; the name is kept because
# later top-level code builds paths from it.
dir = "pgx_results"
diplotypes = []

for entry in os.listdir(dir):
    # Skip non-archives before deriving any names from the entry.
    if not entry.endswith(".zip"):
        continue

    sample = entry[:-4]  # archive name without the ".zip" suffix
    archive = f"{dir}/{entry}"
    target = f"{dir}/{sample}"

    print("Working on sample ", archive)
    with zipfile.ZipFile(archive, "r") as zip_ref:
        zip_ref.extractall(target)

    calls = pd.read_csv(f"{target}/diplotypes.tsv", sep="\t")
    # sorted() makes the joined string deterministic; plain set iteration
    # order for strings can change from one interpreter run to the next.
    per_name = calls.groupby("name").agg(
        {"genotype": lambda values: ", ".join(sorted(set(values)))}
    )
    diplotypes.append(per_name.rename(columns={"genotype": sample}))

    # Delete the archive only after its table was read successfully, so a
    # failure above leaves the archive in place for a re-run.
    os.unlink(archive)

# Fail with a clear message when there is nothing to merge: functools.reduce
# raises an opaque TypeError for an empty sequence without an initial value.
if not diplotypes:
    raise SystemExit(f"No sample archives (.zip) were processed from '{dir}'.")

# Outer-join the per-sample frames on their index so that every `name` seen in
# any sample is retained (missing combinations become NaN).
df_final = ft.reduce(lambda left, right: left.join(right, how="outer"), diplotypes)
# Flip to one row per sample and order the rows by sample name.
dfx = df_final.transpose().sort_index()

# rsIDs listed in the "variants" column of the Fulgent panel table.
# `na=False` skips empty cells instead of failing on them, and a set gives
# constant-time membership tests for the per-variant lookups further down.
panel_variants = pd.read_csv("pgx_fulgent_panel.tsv", sep="\t")["variants"]
fulgent_ids = set(panel_variants[panel_variants.str.startswith("rs", na=False)])

def _joined_rsids(frame):
    """Return the distinct ``Variant`` values of *frame*, sorted and comma-joined."""
    return ", ".join(sorted(frame["Variant"].unique()))


# One list per output column; each gets exactly one entry per row of `dfx`.
rsids_no_phenotype = []
rsids_all = []
panel_rsids_no_phenotype = []
panel_rsids_all = []

for sample in dfx.index:
    annotations = pd.read_json(f"{dir}/{sample}/pharmgkb_annotations.json")

    # Keep only annotations whose variant is an rsID. The vectorised string
    # methods yield a proper boolean mask even for empty frames and treat
    # missing values as "no match" instead of raising.
    rs_only = annotations[annotations["Variant"].str.startswith("rs", na=False)]
    # NOTE: a filter on "Level of Evidence" (< "3") used to be applied here
    # and is currently disabled.

    # Subset whose first whitespace-separated token is on the Fulgent panel.
    first_token = rs_only["Variant"].str.split().str[0]
    on_panel = rs_only[first_token.isin(fulgent_ids)].copy()

    rsids_all.append(_joined_rsids(rs_only))
    panel_rsids_all.append(_joined_rsids(on_panel))

    # Same two summaries, restricted to annotations without a phenotype.
    rsids_no_phenotype.append(
        _joined_rsids(rs_only[rs_only["Phenotype(s)"].isna()])
    )
    panel_rsids_no_phenotype.append(
        _joined_rsids(on_panel[on_panel["Phenotype(s)"].isna()])
    )

# Column order is part of the output format; keep it as is.
dfx["rsids (no phenotype)"] = rsids_no_phenotype
dfx["rsids (all)"] = rsids_all
dfx["rsids (fulgent/no phenotype)"] = panel_rsids_no_phenotype
dfx["rsids (fulgent/all)"] = panel_rsids_all

dfx.to_csv("pgx_diplotypes_rsids.tsv", sep="\t")