code for checking pipeline outputs added

This commit is contained in:
2025-08-18 09:38:33 +02:00
parent 334f16fe01
commit 531d178f66
3 changed files with 113 additions and 0 deletions

View File

@@ -0,0 +1,80 @@
import os
from subprocess import run
from itertools import combinations
import pandas as pd
def get_files_in_path(dir: str, sample: str) -> list[str]:
    """List the entries of ``dir`` with the sample name removed from each name.

    Args:
        dir: Directory whose entries are listed.
        sample: Sample name removed from the start and the end of every
            entry name, so listings of different samples can be compared.

    Returns:
        The entry names in ``os.listdir`` order, each without a leading or
        trailing occurrence of ``sample``.
    """
    # str.strip(sample) would treat ``sample`` as a *set of characters* and
    # eat unrelated text (e.g. "sample1.alleles".strip("sample1") == "."),
    # so remove the exact substring instead.
    return [
        entry.removeprefix(sample).removesuffix(sample)
        for entry in os.listdir(dir)
    ]
def test_same_file(file1: str, file2: str) -> str:
cmp_test = run(
f"cmp {file1} {file2}",
capture_output=True,
shell=True,
)
if cmp_test.returncode == 0:
return "match"
elif cmp_test.returncode == 1:
return "NO Match"
else:
print(
f"Unexpected return code != 0|1 ({cmp_test.returncode}\t{file1}\t{file2})"
)
raise RuntimeError
def main(
    test_sample_paths: str, conditions: list[str], data_dir: str = "data"
) -> None:
    """Compare result files of every sample between all pairs of conditions.

    Reads sample names from ``test_sample_paths``, checks the file listings of
    ``{data_dir}/{condition}/{sample}_pgx_result`` and then compares every
    file of each sample between each pair of conditions. The outcome is
    written to ``{data_dir}/pairwise_equality.csv``.

    Args:
        test_sample_paths: Text file with one path per line; the sample name
            is the last ``/``-separated component without a ``.vcf`` suffix.
        conditions: Names of the condition directories below ``data_dir``.
        data_dir: Root directory holding the condition directories and
            receiving the CSV report.

    Raises:
        AssertionError: If a sample directory contains a file name (sample
            name removed) that the first sample of the same condition lacks.
        RuntimeError: Propagated from ``test_same_file`` when a comparison
            cannot be carried out.
    """
    with open(test_sample_paths, "r") as handle:
        # removesuffix, not strip: strip(".vcf") removes any of the characters
        # ".", "v", "c", "f" from BOTH ends and mangles names such as
        # "very_fc.vcf". Blank lines are skipped so no empty sample is created.
        samples = [
            line.strip().split("/")[-1].removesuffix(".vcf")
            for line in handle
            if line.strip()
        ]

    # check all conditions and samples have same files available
    # NOTE(review): the reference listing is taken from the first sample of
    # each condition and only *extra* files are reported; listings are not
    # compared across conditions and missing files are not detected here.
    # Confirm whether a stricter check is wanted.
    for condition in conditions:
        reference_files = None
        for sample in samples:
            result_dir = f"{data_dir}/{condition}/{sample}_pgx_result"
            sample_files = get_files_in_path(result_dir, sample)
            if reference_files is None:
                reference_files = set(sample_files)
            unexpected = [
                name for name in sample_files if name not in reference_files
            ]
            # Raised explicitly so the check survives ``python -O``.
            if unexpected:
                raise AssertionError(
                    f"'{sample}' from '{condition}' has unexpected files: "
                    f"{unexpected}"
                )

    # check combos are equal
    results = []
    for pair in combinations(conditions, r=2):
        print(f"############ Testing folling condition pair: {pair} ##############")
        for sample in samples:
            dir1 = f"{data_dir}/{pair[0]}/{sample}_pgx_result"
            dir2 = f"{data_dir}/{pair[1]}/{sample}_pgx_result"
            # Sorted so the report rows come out in a reproducible order.
            for file in sorted(os.listdir(dir1)):
                results.append(
                    [
                        pair[0],
                        pair[1],
                        sample,
                        file,
                        test_same_file(f"{dir1}/{file}", f"{dir2}/{file}"),
                    ]
                )
        print("############ COMPLETED ##############\n")

    df = pd.DataFrame(
        results, columns=["condition1", "condition2", "sample", "file", "matching"]
    )
    df.to_csv(f"{data_dir}/pairwise_equality.csv", sep=",", index=False)
if __name__ == "__main__":
    # Script entry point: run the pairwise comparison for the fixed sample
    # list and the condition directories that main() looks up below data/.
    main(
        test_sample_paths="data/test_samples.txt",
        conditions=[
            "v1.2.8_git_tag",
            "validation_data",
            "v1.2.6_manifest_version",
            "v1.2.7_mainfest_version",
            "pgxCleaner_server",
        ],
    )