From add456f0e77733b890570e17497939554c2d81a4 Mon Sep 17 00:00:00 2001 From: Darren Wight Date: Mon, 18 Aug 2025 11:56:41 +0200 Subject: [PATCH] analysed the file differences --- notebooks/data_inspector.py | 97 +++++++++++++++++++++++++++++++- src/check_files_diffs.py | 31 ++++++++++ src/check_results_equivilence.py | 2 +- 3 files changed, 127 insertions(+), 3 deletions(-) create mode 100644 src/check_files_diffs.py diff --git a/notebooks/data_inspector.py b/notebooks/data_inspector.py index 45e5f79..90e1ef1 100644 --- a/notebooks/data_inspector.py +++ b/notebooks/data_inspector.py @@ -9,7 +9,7 @@ def _(): import os import marimo as mo import pandas as pd - return (mo,) + return mo, pd @app.cell(hide_code=True) @@ -20,12 +20,105 @@ def _(mo): Data structure ``` - + . + ├── data + │ ├── pairwise_equality.csv -- file with summary from src/check_results_equivilence.py + │ ├── pgxCleaner_server -- data from main branch of StellarPGx and generated on the experiment server + │ ├── test_samples.txt -- list of samples to be tested with paths on s3 (one per line) + │ ├── v1.2.6_manifest_version -- data generated on the prod server with manifest.version = 1.2.6 + │ ├── v1.2.7_manifest_version -- data generated on the prod server with manifest.version = 1.2.7 + │ ├── v1.2.8_git_tag -- data from tags/v1.2.8 branch of StellarPGx and generated on the experiment server + │ └── validation_data -- data used in the validation (from google drive) ``` """ ) return +@app.cell +def _(pd): + pairwise_df = pd.read_csv("data/pairwise_equality.csv") + pairwise_df.head() + return (pairwise_df,) + + +@app.cell +def _(pairwise_df): + # print and get samples and groups for later + samples = pairwise_df["sample"].unique() + print(samples) + cond1 = pairwise_df["condition1"].unique() + print(cond1) + cond2 = pairwise_df["condition2"].unique() + print(cond2) + return (samples,) + + +@app.cell +def _(pairwise_df): + gb_pairwise_df = pairwise_df.groupby(by=["condition1", "condition2"]) + groups = 
list(gb_pairwise_df.groups.keys()) + return gb_pairwise_df, groups + + +@app.cell +def _(gb_pairwise_df, groups, samples): + # decompose table to be samples and group focused + data_dict = {} + for group in groups: + str_group = f"{'__'.join(group)}" + data_dict[str_group] = {} + group_df = gb_pairwise_df.get_group(group) + for sample in samples: + data_dict[str_group][sample] = group_df.loc[ + group_df["sample"] == sample + ].sort_values(by="file") + + print(data_dict.keys()) + return (data_dict,) + + +@app.cell +def _(data_dict, pd): + # save data to analyse further + with pd.ExcelWriter( + "data/pairwise_comparison.xlsx", engine="openpyxl" + ) as writer: + for g, data in data_dict.items(): + for idx, (_, file_data) in enumerate(data.items()): + file_data.to_excel( + writer, + sheet_name=g, + startrow=(idx * len(file_data)) + (2 * idx), + ) + return + + +@app.cell +def _(mo): + mo.md( + r""" + ## Results Summary + + **Validation data and v1.2.6 and v1.2.7 manifest versions** from the production server are all equivalent. + + - There was a strange issue with `pharmcat.vcf`, but it was caused by two rsids being reported for one variant, with the order of the rsids in the file changing between the two pipeline runs - i.e. not a systemic issue. + + **v1.2.6 and v1.2.7 manifest versions** are equivalent. + + **validation and PGx experiment server using main branch of StellarPGx** do not match + + - PharmCAT output is equivalent. 
+ - Core issue is that on the new server StellarPGx produces different genotype predictions. + """ + ) + return + + +@app.cell +def _(): + return + + if __name__ == "__main__": app.run() diff --git a/src/check_files_diffs.py b/src/check_files_diffs.py new file mode 100644 index 0000000..31b1e4d --- /dev/null +++ b/src/check_files_diffs.py @@ -0,0 +1,31 @@ +import os +from subprocess import run + +SKIP_FILES = ["matcher.html", "matcher.json", "pharmcat.vcf", "output.json"] + + +def main(condition1_dir: str, condition2_dir: str) -> None: + sample_dirs = os.listdir(condition1_dir) + + for sample_dir in sample_dirs: + with open( + f"data/diff_results_{sample_dir.split('_pgx')[0]}.txt", "w" + ) as handle: + for file in os.listdir(f"{condition1_dir}/{sample_dir}"): + if file in SKIP_FILES: + continue + cmd = ( + f"diff {condition1_dir}/{sample_dir}/{file} " + f"{condition2_dir}/{sample_dir}/{file}" + ) + result = run(cmd, shell=True, capture_output=True) + + handle.write(f"#####{file}#####\n") + handle.writelines(result.stdout.decode("utf-8")) + handle.write("END############################################\n\n") + + +if __name__ == "__main__": + condition1_dir = "data/validation_data" + condition2_dir = "data/pgxCleaner_server" + main(condition1_dir, condition2_dir) diff --git a/src/check_results_equivilence.py b/src/check_results_equivilence.py index 03b91ad..6ff90be 100644 --- a/src/check_results_equivilence.py +++ b/src/check_results_equivilence.py @@ -73,7 +73,7 @@ if __name__ == "__main__": "v1.2.8_git_tag", "validation_data", "v1.2.6_manifest_version", - "v1.2.7_mainfest_version", + "v1.2.7_manifest_version", "pgxCleaner_server", ]