From add456f0e77733b890570e17497939554c2d81a4 Mon Sep 17 00:00:00 2001
From: Darren Wight <darren@serenomica.com>
Date: Mon, 18 Aug 2025 11:56:41 +0200
Subject: [PATCH] analysed the file differences

---
 notebooks/data_inspector.py      | 97 +++++++++++++++++++++++++++++++-
 src/check_files_diffs.py         | 31 ++++++++++
 src/check_results_equivilence.py |  2 +-
 3 files changed, 127 insertions(+), 3 deletions(-)
 create mode 100644 src/check_files_diffs.py

diff --git a/notebooks/data_inspector.py b/notebooks/data_inspector.py
index 45e5f79..90e1ef1 100644
--- a/notebooks/data_inspector.py
+++ b/notebooks/data_inspector.py
@@ -9,7 +9,7 @@ def _():
     import os
     import marimo as mo
     import pandas as pd
-    return (mo,)
+    return mo, pd
 
 
 @app.cell(hide_code=True)
@@ -20,12 +20,105 @@ def _(mo):
 
     Data structure
     ```
-
+    .
+    ├── data
+    │   ├── pairwise_equality.csv -- file with summary from src/check_results_equivilence.py
+    │   ├── pgxCleaner_server -- data from main branch of StellarPGx and generated on the experiment server
+    │   ├── test_samples.txt -- list of samples to be tested with paths on s3 (one per line)
+    │   ├── v1.2.6_manifest_version -- data generated on the prod server with manifest.version = 1.2.6
+    │   ├── v1.2.7_manifest_version -- data generated on the prod server with manifest.version = 1.2.7
+    │   ├── v1.2.8_git_tag -- data from tags/v1.2.8 branch of StellarPGx and generated on the experiment server
+    │   └── validation_data -- data used in the validation (from google drive)
     ```
     """
     )
     return
 
 
+@app.cell
+def _(pd):
+    pairwise_df = pd.read_csv("data/pairwise_equality.csv")
+    pairwise_df.head()  # later cells read columns: sample, condition1, condition2, file
+    return (pairwise_df,)
+
+
+@app.cell
+def _(pairwise_df):
+    # print the distinct samples and conditions; only `samples` is returned
+    samples = pairwise_df["sample"].unique()
+    print(samples)
+    cond1 = pairwise_df["condition1"].unique()
+    print(cond1)
+    cond2 = pairwise_df["condition2"].unique()
+    print(cond2)
+    return (samples,)
+
+
+@app.cell
+def _(pairwise_df):
+    gb_pairwise_df = pairwise_df.groupby(by=["condition1", "condition2"])
+    groups = list(gb_pairwise_df.groups.keys())  # (condition1, condition2) tuples
+    return gb_pairwise_df, groups
+
+
+@app.cell
+def _(gb_pairwise_df, groups, samples):
+    # nested dict: "cond1__cond2" -> sample -> that sample's rows sorted by file
+    data_dict = {}
+    for group in groups:
+        str_group = f"{'__'.join(group)}"
+        data_dict[str_group] = {}
+        group_df = gb_pairwise_df.get_group(group)
+        for sample in samples:
+            data_dict[str_group][sample] = group_df.loc[
+                group_df["sample"] == sample
+            ].sort_values(by="file")
+
+    print(data_dict.keys())
+    return (data_dict,)
+
+
+@app.cell
+def _(data_dict, pd):
+    # save data to analyse further: one sheet per condition pair, with the
+    # per-sample tables stacked vertically and one blank row between them
+    with pd.ExcelWriter(
+        "data/pairwise_comparison.xlsx", engine="openpyxl"
+    ) as writer:
+        for g, data in data_dict.items():
+            _next_row = 0  # running offset, so tables of unequal length fit
+            for file_data in data.values():
+                file_data.to_excel(writer, sheet_name=g, startrow=_next_row)
+                # header row + data rows + one spacer row
+                _next_row += len(file_data) + 2
+    return
+
+
+@app.cell
+def _(mo):
+    mo.md(
+        r"""
+    ## Results Summary
+
+    **Validation data and v1.2.6 and v1.2.7 manifest versions** from the production server are all equivalent.
+
+    - There was a strange issue with `pharmcat.vcf`, but this was because two rsIDs were reported for one variant and the order of the rsIDs in the file changed between the two pipeline runs, i.e. not a systemic issue.
+
+    **v1.2.6 and v1.2.7 manifest versions** are equivalent.
+
+    **validation and PGx experiment server using main branch of StellarPGx** do not match
+
+    - PharmCAT output is equivalent.
+    - Core issue is that on the new server StellarPGx produces different genotype predictions
+    """
+    )
+    return
+
+
+@app.cell
+def _():
+    return
+
+
 if __name__ == "__main__":
     app.run()
diff --git a/src/check_files_diffs.py b/src/check_files_diffs.py
new file mode 100644
index 0000000..31b1e4d
--- /dev/null
+++ b/src/check_files_diffs.py
@@ -0,0 +1,31 @@
+import os
+from subprocess import run
+
+SKIP_FILES = ["matcher.html", "matcher.json", "pharmcat.vcf", "output.json"]
+
+
+def main(condition1_dir: str, condition2_dir: str) -> None:
+    """Diff every non-skipped sample file between two result directories."""
+    for sample_dir in os.listdir(condition1_dir):
+        with open(
+            f"data/diff_results_{sample_dir.split('_pgx')[0]}.txt", "w"
+        ) as handle:
+            for file in os.listdir(f"{condition1_dir}/{sample_dir}"):
+                if file in SKIP_FILES:
+                    continue
+                # Argument list, no shell: paths are never shell-parsed.
+                cmd = [
+                    "diff",
+                    f"{condition1_dir}/{sample_dir}/{file}",
+                    f"{condition2_dir}/{sample_dir}/{file}",
+                ]
+                result = run(cmd, capture_output=True)
+                handle.write(f"#####{file}#####\n")
+                handle.write(result.stdout.decode("utf-8"))
+                handle.write("END############################################\n\n")
+
+
+if __name__ == "__main__":
+    condition1_dir = "data/validation_data"  # listed for sample and file names
+    condition2_dir = "data/pgxCleaner_server"  # same relative paths are diffed
+    main(condition1_dir, condition2_dir)
diff --git a/src/check_results_equivilence.py b/src/check_results_equivilence.py
index 03b91ad..6ff90be 100644
--- a/src/check_results_equivilence.py
+++ b/src/check_results_equivilence.py
@@ -73,7 +73,7 @@ if __name__ == "__main__":
         "v1.2.8_git_tag",
         "validation_data",
         "v1.2.6_manifest_version",
-        "v1.2.7_mainfest_version",
+        "v1.2.7_manifest_version",
         "pgxCleaner_server",
     ]