"""Check that pipeline outputs are byte-identical across run conditions.

For every sample named in a samples file, each file found under
``<data_dir>/<condition>/<sample>_pgx_result`` is compared between every pair
of conditions, and one row per comparison is written to
``<data_dir>/pairwise_equality.csv``.
"""

# NOTE(review): this module was reconstructed from a whitespace-collapsed git
# diff. The same diff also appended "# data directory" / "data/" to .gitignore
# and added notebooks/data_inspector.py (a marimo notebook stub); neither is
# reproduced here.

import os
from itertools import combinations
from subprocess import run

import pandas as pd


def get_files_in_path(dir: str, sample: str) -> list[str]:
    """Return the entry names of *dir* with the sample name removed.

    The literal *sample* string is dropped from the start and from the end of
    each name so that listings of different samples can be compared.

    ``str.strip(sample)`` must not be used for this: it treats its argument as
    a *set of characters*, so ``"final.vcf".strip("final")`` yields ``".vc"``.

    Args:
        dir: Directory to list.
        sample: Sample name to remove from each entry name.

    Returns:
        The normalised names, sorted so the result is deterministic.
    """
    return sorted(
        entry.removeprefix(sample).removesuffix(sample)
        for entry in os.listdir(dir)
    )


def test_same_file(file1: str, file2: str) -> str:
    """Compare two files byte-for-byte with the external ``cmp`` utility.

    Args:
        file1: Path of the first file.
        file2: Path of the second file.

    Returns:
        ``"match"`` when ``cmp`` exits with 0, ``"NO Match"`` when it exits
        with 1.

    Raises:
        RuntimeError: ``cmp`` exited with any other status.
    """
    # An argument list without a shell hands both paths to cmp verbatim, so
    # spaces or shell metacharacters in a path can no longer split or inject
    # arguments. "--" keeps a path that starts with "-" from being read as an
    # option.
    cmp_test = run(["cmp", "--", file1, file2], capture_output=True, check=False)
    if cmp_test.returncode == 0:
        return "match"
    if cmp_test.returncode == 1:
        return "NO Match"

    message = (
        f"Unexpected return code != 0|1 ({cmp_test.returncode}\t{file1}\t{file2})"
    )
    print(message)
    raise RuntimeError(message)


# The name starts with "test_", so pytest would try to collect this helper as
# a test wherever it is imported; opt it out explicitly.
test_same_file.__test__ = False


def _result_dir(data_dir: str, condition: str, sample: str) -> str:
    """Return the result directory of *sample* under *condition*."""
    return os.path.join(data_dir, condition, f"{sample}_pgx_result")


def _read_samples(test_sample_paths: str) -> list[str]:
    """Return one sample name per non-blank line of *test_sample_paths*."""
    with open(test_sample_paths, "r") as handle:
        # removesuffix, not strip(".vcf"): strip would also eat any leading or
        # trailing '.', 'v', 'c' or 'f' of the sample name itself.
        return [
            path.split("/")[-1].removesuffix(".vcf")
            for path in map(str.strip, handle)
            if path
        ]


def _assert_same_files(
    data_dir: str, conditions: list[str], samples: list[str]
) -> None:
    """Raise AssertionError unless every result directory has the same files.

    The first (condition, sample) directory provides the reference set; every
    other directory must contain exactly that set once the sample name has
    been removed from the file names.
    """
    expected = None
    for condition in conditions:
        for sample in samples:
            found = set(
                get_files_in_path(_result_dir(data_dir, condition, sample), sample)
            )
            if expected is None:
                expected = found
                continue
            if found != expected:
                # Raised explicitly rather than via `assert` so the check is
                # not stripped when Python runs with -O.
                raise AssertionError(
                    f"'{sample}' from '{condition}' has unexpected files: "
                    f"{sorted(found)} (missing: {sorted(expected - found)}, "
                    f"extra: {sorted(found - expected)})"
                )


def main(
    test_sample_paths: str, conditions: list[str], data_dir: str = "data"
) -> None:
    """Compare every result file between every pair of conditions.

    Args:
        test_sample_paths: Text file with one sample path per line. The sample
            name is the last ``/``-separated component without a trailing
            ``.vcf``. Blank lines are ignored.
        conditions: Names of the condition directories below *data_dir*.
        data_dir: Root directory holding the condition directories; the CSV
            report is written here as well.

    Raises:
        AssertionError: The result directories do not all hold the same files.
        RuntimeError: ``cmp`` failed for a pair of files.
    """
    samples = _read_samples(test_sample_paths)

    # check all conditions and samples have same files available
    _assert_same_files(data_dir, conditions, samples)

    # check combos are equal
    results = []
    for pair in combinations(conditions, r=2):
        print(f"############ Testing folling condition pair: {pair} ##############")
        for sample in samples:
            dir1 = _result_dir(data_dir, pair[0], sample)
            dir2 = _result_dir(data_dir, pair[1], sample)

            # Sorted so the report rows come out in a stable order.
            for name in sorted(os.listdir(dir1)):
                results.append(
                    [
                        pair[0],
                        pair[1],
                        sample,
                        name,
                        test_same_file(
                            os.path.join(dir1, name), os.path.join(dir2, name)
                        ),
                    ]
                )
        print("############ COMPLETED ##############\n")

    df = pd.DataFrame(
        results, columns=["condition1", "condition2", "sample", "file", "matching"]
    )
    df.to_csv(os.path.join(data_dir, "pairwise_equality.csv"), sep=",", index=False)


if __name__ == "__main__":
    test_sample_paths = "data/test_samples.txt"
    conditions = [
        "v1.2.8_git_tag",
        "validation_data",
        "v1.2.6_manifest_version",
        # NOTE(review): "mainfest" differs from "manifest" in the entry above;
        # confirm it matches the directory name on disk before changing it.
        "v1.2.7_mainfest_version",
        "pgxCleaner_server",
    ]

    main(test_sample_paths, conditions)