diff --git a/.gitignore b/.gitignore
index 0dbf2f2..e36be05 100644
--- a/.gitignore
+++ b/.gitignore
@@ -168,3 +168,5 @@ cython_debug/
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
 
+# data directory
+data/
diff --git a/notebooks/data_inspector.py b/notebooks/data_inspector.py
new file mode 100644
index 0000000..45e5f79
--- /dev/null
+++ b/notebooks/data_inspector.py
@@ -0,0 +1,31 @@
+import marimo
+
+# marimo version recorded for this notebook file (presumably written by marimo
+# itself on save — confirm before editing by hand).
+__generated_with = "0.14.16"
+# Notebook application object; the cells below register on it via `@app.cell`.
+app = marimo.App(width="medium")
+
+
+@app.cell
+def _():
+    # Setup cell: imports os, marimo (as `mo`) and pandas (as `pd`).
+    import os
+    import marimo as mo
+    import pandas as pd
+    # Only `mo` is returned, so `os` and `pd` are not exposed through this
+    # cell's return value as written.
+    # NOTE(review): presumably marimo rewrites this tuple once another cell
+    # references them — confirm.
+    return (mo,)
+
+
+@app.cell(hide_code=True)
+def _(mo):
+    # Markdown heading cell for the notebook. The fenced block under
+    # "Data structure" is currently empty (looks like a placeholder to fill in).
+    mo.md(
+        r"""
+    # Analyse the Mismatch data
+
+    Data structure
+    ```
+
+    ```
+    """
+    )
+    return
+
+
+# Run the notebook app when this file is executed directly as a script.
+if __name__ == "__main__":
+    app.run()
diff --git a/src/check_results_equivilence.py b/src/check_results_equivilence.py
new file mode 100644
index 0000000..03b91ad
--- /dev/null
+++ b/src/check_results_equivilence.py
@@ -0,0 +1,80 @@
+import os
+from subprocess import run
+from itertools import combinations
+import pandas as pd
+
+
+def get_files_in_path(dir: str, sample: str) -> list[str]:
+    """List the entries of *dir* with the leading sample name removed.
+
+    Dropping the sample name makes listings that belong to different samples
+    directly comparable.
+
+    Args:
+        dir: Directory to list.
+        sample: Sample name; removed from an entry only when the entry starts
+            with it.
+
+    Returns:
+        The entry names, sorted, each without its ``sample`` prefix.
+
+    Raises:
+        OSError: If ``dir`` cannot be listed (propagated from ``os.listdir``).
+    """
+    # str.strip(sample) treats `sample` as a *set of characters* and trims them
+    # from both ends, mangling names such as "s1_alleles" -> "_allele".
+    # removeprefix drops the exact leading substring only.
+    return sorted(name.removeprefix(sample) for name in os.listdir(dir))
+
+
+def test_same_file(file1: str, file2: str) -> str:
+    """Compare two files byte for byte with the external ``cmp`` utility.
+
+    Args:
+        file1: Path of the first file.
+        file2: Path of the second file.
+
+    Returns:
+        ``"match"`` when ``cmp`` exits with 0 (identical files) and
+        ``"NO Match"`` when it exits with 1 (files differ).
+
+    Raises:
+        RuntimeError: If ``cmp`` cannot be executed or exits with any other
+            status (e.g. one of the files is missing or unreadable).
+    """
+    # Pass the arguments as a list (no shell) so paths containing spaces or
+    # shell metacharacters reach cmp verbatim; "--" ends option parsing.
+    try:
+        cmp_test = run(["cmp", "--", file1, file2], capture_output=True)
+    except OSError as err:
+        raise RuntimeError(f"Could not execute cmp for\t{file1}\t{file2}") from err
+
+    if cmp_test.returncode == 0:
+        return "match"
+    if cmp_test.returncode == 1:
+        return "NO Match"
+
+    # Any other status is a cmp failure; surface its stderr in the exception.
+    details = cmp_test.stderr.decode(errors="replace").strip()
+    raise RuntimeError(
+        f"Unexpected return code != 0|1 ({cmp_test.returncode}\t{file1}\t{file2})"
+        f": {details}"
+    )
+
+
+def _read_sample_names(test_sample_paths: str) -> list[str]:
+    """Return one sample name per non-blank line of *test_sample_paths*."""
+    with open(test_sample_paths, "r") as handle:
+        paths = [line.strip() for line in handle]
+    # removesuffix drops the exact ".vcf" ending; str.strip(".vcf") would trim
+    # any leading/trailing '.', 'v', 'c' or 'f' characters of the name itself.
+    return [path.split("/")[-1].removesuffix(".vcf") for path in paths if path]
+
+
+def main(
+    test_sample_paths: str, conditions: list[str], data_dir: str = "data"
+) -> None:
+    """Compare every sample's result files between all pairs of conditions.
+
+    Results are read from ``{data_dir}/{condition}/{sample}_pgx_result`` and one
+    row per (condition pair, sample, file) is written to
+    ``{data_dir}/pairwise_equality.csv``.
+
+    Args:
+        test_sample_paths: Text file with one VCF path per line; the sample
+            name is the last path component without its ``.vcf`` suffix.
+            Blank lines are ignored.
+        conditions: Names of the condition sub-directories to compare.
+        data_dir: Root directory holding the condition folders and receiving
+            the CSV report.
+
+    Raises:
+        AssertionError: If a sample/condition result directory does not hold
+            the same set of files (sample name removed) as the first one
+            inspected.
+        RuntimeError: Propagated from ``test_same_file`` when a comparison
+            cannot be carried out.
+        OSError: If the sample list or a result directory cannot be read.
+    """
+    samples = _read_sample_names(test_sample_paths)
+
+    # check all conditions and samples have same files available
+    # The first directory inspected is the reference for *every* condition and
+    # the comparison is symmetric, so missing files are reported as well as
+    # unexpected ones.
+    reference = None
+    for condition in conditions:
+        for sample in samples:
+            result_dir = f"{data_dir}/{condition}/{sample}_pgx_result"
+            found = set(get_files_in_path(result_dir, sample))
+            if reference is None:
+                reference = found
+                continue
+            if found != reference:
+                # Raised explicitly: an `assert` statement vanishes under -O.
+                raise AssertionError(
+                    f"'{sample}' from '{condition}' has unexpected files: "
+                    f"{sorted(found - reference)}; "
+                    f"missing files: {sorted(reference - found)}"
+                )
+
+    # check combos are equal
+    results = []
+    for pair in combinations(conditions, r=2):
+        print(f"############ Testing following condition pair: {pair} ##############")
+        for sample in samples:
+            dir1 = f"{data_dir}/{pair[0]}/{sample}_pgx_result"
+            dir2 = f"{data_dir}/{pair[1]}/{sample}_pgx_result"
+
+            # Sorted so the report rows come out in a stable order.
+            for file in sorted(os.listdir(dir1)):
+                results.append(
+                    [
+                        pair[0],
+                        pair[1],
+                        sample,
+                        file,
+                        test_same_file(f"{dir1}/{file}", f"{dir2}/{file}"),
+                    ]
+                )
+        print("############ COMPLETED ##############\n")
+
+    df = pd.DataFrame(
+        results, columns=["condition1", "condition2", "sample", "file", "matching"]
+    )
+    df.to_csv(f"{data_dir}/pairwise_equality.csv", sep=",", index=False)
+
+
+if __name__ == "__main__":
+    # Result sets to compare pairwise; each name is a folder under data/.
+    # NOTE(review): "v1.2.7_mainfest_version" may be a misspelling of
+    # "manifest" — it has to match the directory name on disk, so confirm
+    # before changing it.
+    conditions = [
+        "v1.2.8_git_tag",
+        "validation_data",
+        "v1.2.6_manifest_version",
+        "v1.2.7_mainfest_version",
+        "pgxCleaner_server",
+    ]
+    # File listing the VCF paths whose sample names are checked.
+    test_sample_paths = "data/test_samples.txt"
+
+    main(test_sample_paths=test_sample_paths, conditions=conditions)