feat: update code to create the sample comparison without notebook

This commit is contained in:
2025-08-18 15:27:31 +02:00
parent 05401022db
commit 881d22b9b1

View File

@@ -25,7 +25,7 @@ def test_same_file(file1: str, file2: str) -> str:
raise RuntimeError
def main(test_sample_paths: str, conditions: list[str]) -> None:
def main(test_sample_paths: str, conditions: list[str], outpath: str) -> None:
# Derive sample names from the listed paths: the last "/"-separated component
# with the ".vcf" extension removed. removesuffix() drops only that exact
# suffix; str.strip(".vcf") treats its argument as a character set and would
# trim any leading/trailing '.', 'v', 'c' or 'f' (e.g. "HG002_cf.vcf" would
# become "HG002_").
with open(test_sample_paths, "r") as handle:
    samples = [
        line.strip().split("/")[-1].removesuffix(".vcf") for line in handle
    ]
# every unordered pair of conditions, compared against each other below
conditions_pairwise = list(combinations(conditions, r=2))
@@ -64,16 +64,41 @@ def main(test_sample_paths: str, conditions: list[str]) -> None:
# Collect the comparison rows into one table with a fixed column order.
# `results` is built in the part of main() that this view does not show.
df = pd.DataFrame(
results, columns=["condition1", "condition2", "sample", "file", "matching"]
)
# NOTE(review): this page looks like a diff with its +/- markers stripped;
# presumably the hard-coded path below is the removed line and the
# f"{outpath}.csv" call after it is the replacement - confirm against the commit.
df.to_csv("data/pairwise_equality.csv", sep=",", index=None)
# Write the full table as comma-separated text next to the other outputs;
# `outpath` is used as a path prefix and ".csv" is appended here.
df.to_csv(f"{outpath}.csv", sep=",", index=None)
# Samples that actually appear in the results table.
samples = df["sample"].unique()
print(samples)

# Decompose the table so it is group (condition pair) and sample focused:
# {"<condition1>__<condition2>": {sample: that sample's rows sorted by file}}
data_dict = {}
for group, group_df in df.groupby(by=["condition1", "condition2"]):
    data_dict["__".join(group)] = {
        sample: group_df.loc[group_df["sample"] == sample].sort_values(by="file")
        for sample in samples
    }

# Save the data to analyse further: one sheet per condition pair, with the
# per-sample tables stacked vertically and one blank row between them.
with pd.ExcelWriter(f"{outpath}.xlsx", engine="openpyxl") as writer:
    for sheet, per_sample in data_dict.items():
        next_row = 0
        for file_data in per_sample.values():
            file_data.to_excel(writer, sheet_name=sheet, startrow=next_row)
            # Advance by what was actually written (header row + data rows)
            # plus one spacer row. Tracking a running offset keeps tables
            # from overlapping when samples have different numbers of rows;
            # with equal lengths it yields the same layout as idx * (len + 2).
            next_row += len(file_data) + 2
# Script entry point: run the comparison with hard-coded inputs.
if __name__ == "__main__":
# text file that main() reads line by line to derive the sample names
test_sample_paths = "data/test_samples.txt"
# labels that main() combines into unordered pairs (combinations(..., r=2))
conditions = [
"validation_data",
"v1.2.6_manifest_version",
"pgxCleaner_server",
"pgxCleaner_prod_updates",
]
# NOTE(review): this page looks like a diff with its +/- markers stripped;
# presumably the two-argument call below is the removed line and the
# three-argument call at the end is its replacement - confirm against the commit.
main(test_sample_paths, conditions)
# output path prefix; main() appends ".csv" and ".xlsx" to it
outpath = "data/test"
main(test_sample_paths, conditions, outpath)