code for checking pipeline outputs added
This commit is contained in:
80
src/check_results_equivilence.py
Normal file
80
src/check_results_equivilence.py
Normal file
@@ -0,0 +1,80 @@
|
||||
import os
|
||||
from subprocess import run
|
||||
from itertools import combinations
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def get_files_in_path(dir: str, sample: str) -> list[str]:
    """Return the entry names of ``dir`` with the sample name removed.

    Removing the sample-specific part of each name makes the listings of
    different samples directly comparable.

    Args:
        dir: Directory to list. (The name shadows the builtin but is kept
            so existing callers keep working.)
        sample: Sample identifier to remove from every entry name.

    Returns:
        The entry names, sorted, with every occurrence of ``sample``
        removed.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``dir`` cannot be read.
    """
    # ``str.strip(sample)`` would treat ``sample`` as a *set of characters*
    # and eat any of them from both ends of the name (e.g. a trailing "1S"
    # for sample "S1"); remove the literal substring instead.
    # ``os.listdir`` order is arbitrary, so sort for a deterministic result.
    return sorted(name.replace(sample, "") for name in os.listdir(dir))
|
||||
|
||||
|
||||
def test_same_file(file1: str, file2: str) -> str:
    """Compare two files byte-for-byte with the external ``cmp`` utility.

    Args:
        file1: Path of the first file.
        file2: Path of the second file.

    Returns:
        ``"match"`` when ``cmp`` exits with status 0, ``"NO Match"`` when it
        exits with status 1.

    Raises:
        RuntimeError: If ``cmp`` exits with any other status; the message
            carries the status, both paths and whatever ``cmp`` wrote to
            stderr.
    """
    # Pass the arguments as a list (no shell) so paths containing spaces or
    # shell metacharacters reach ``cmp`` unchanged and cannot inject commands.
    cmp_test = run(["cmp", file1, file2], capture_output=True)
    if cmp_test.returncode == 0:
        return "match"
    if cmp_test.returncode == 1:
        return "NO Match"

    details = cmp_test.stderr.decode(errors="replace").strip()
    message = (
        f"Unexpected return code != 0|1 ({cmp_test.returncode}\t{file1}\t{file2})"
    )
    if details:
        message = f"{message}: {details}"
    raise RuntimeError(message)


# Despite its ``test_`` prefix this is a helper, not a test case; tell pytest
# not to collect it when the name ends up in a test module's namespace.
test_same_file.__test__ = False
|
||||
|
||||
|
||||
def main(
    test_sample_paths: str,
    conditions: list[str],
    data_dir: str = "data",
) -> None:
    """Compare result files between every pair of conditions.

    Result folders are expected at
    ``<data_dir>/<condition>/<sample>_pgx_result``. The comparison table is
    written to ``<data_dir>/pairwise_equality.csv`` with the columns
    ``condition1, condition2, sample, file, matching``.

    Args:
        test_sample_paths: Text file with one path per line; the sample name
            is the last path component with a trailing ``.vcf`` removed.
        conditions: Names of the condition folders below ``data_dir``.
        data_dir: Root folder holding the condition folders and receiving
            the CSV output. Defaults to ``"data"``.

    Raises:
        AssertionError: If a sample/condition folder does not contain the
            same (sample-normalised) file names as the reference folder.
        RuntimeError: Propagated from ``test_same_file`` when a comparison
            cannot be carried out.
    """
    with open(test_sample_paths, "r") as handle:
        # ``removesuffix`` drops only a literal trailing ".vcf";
        # ``strip(".vcf")`` would eat any leading/trailing '.', 'v', 'c', 'f'
        # characters of the sample name. Blank lines are skipped.
        samples = [
            os.path.basename(line.strip()).removesuffix(".vcf")
            for line in handle
            if line.strip()
        ]

    # check all conditions and samples have same files available
    # The first folder inspected is the reference for every other one, and
    # the comparison runs in both directions so missing files are reported
    # as well as unexpected ones.
    reference = None
    for condition in conditions:
        for sample in samples:
            result_dir = os.path.join(data_dir, condition, f"{sample}_pgx_result")
            found = set(get_files_in_path(result_dir, sample))
            if reference is None:
                reference = found
                continue
            if found != reference:
                # Raised explicitly (not via ``assert``) so the check is not
                # stripped when Python runs with ``-O``.
                raise AssertionError(
                    f"'{sample}' from '{condition}' has unexpected files: "
                    f"{sorted(found - reference)}; missing files: "
                    f"{sorted(reference - found)}"
                )

    # check combos are equal
    results = []
    for pair in combinations(conditions, r=2):
        first, second = pair
        print(f"############ Testing folling condition pair: {pair} ##############")
        for sample in samples:
            dir1 = os.path.join(data_dir, first, f"{sample}_pgx_result")
            dir2 = os.path.join(data_dir, second, f"{sample}_pgx_result")

            # Sorted so the CSV rows come out in a deterministic order.
            for file in sorted(os.listdir(dir1)):
                outcome = test_same_file(
                    os.path.join(dir1, file), os.path.join(dir2, file)
                )
                results.append([first, second, sample, file, outcome])
        print("############ COMPLETED ##############\n")

    df = pd.DataFrame(
        results, columns=["condition1", "condition2", "sample", "file", "matching"]
    )
    df.to_csv(os.path.join(data_dir, "pairwise_equality.csv"), sep=",", index=False)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Condition folders whose outputs are compared pairwise by ``main``.
    # NOTE(review): "mainfest" in the fourth entry looks like a misspelling of
    # "manifest" - presumably it matches the on-disk folder name; confirm
    # before changing it.
    pipeline_conditions = [
        "v1.2.8_git_tag",
        "validation_data",
        "v1.2.6_manifest_version",
        "v1.2.7_mainfest_version",
        "pgxCleaner_server",
    ]

    # File listing the sample paths, one per line.
    sample_list_file = "data/test_samples.txt"

    main(sample_list_file, pipeline_conditions)
|
||||
Reference in New Issue
Block a user