analysed the file differences

This commit is contained in:
2025-08-18 11:56:41 +02:00
parent 531d178f66
commit add456f0e7
3 changed files with 127 additions and 3 deletions

View File

@@ -9,7 +9,7 @@ def _():
import os import os
import marimo as mo import marimo as mo
import pandas as pd import pandas as pd
return (mo,) return mo, pd
@app.cell(hide_code=True) @app.cell(hide_code=True)
@@ -20,12 +20,105 @@ def _(mo):
Data structure Data structure
``` ```
.
├── data
│ ├── pairwise_equality.csv -- file with summary from src/check_results_equivilence.py
│ ├── pgxCleaner_server -- data from main branch of StellarPGx and generated on the experiment server
│ ├── test_samples.txt -- list of samples to be tested with paths on s3 (one per line)
│ ├── v1.2.6_manifest_version -- data generated on the prod server with manifest.version = 1.2.6
│ ├── v1.2.7_manifest_version -- data generated on the prod server with manifest.version = 1.2.7
│ ├── v1.2.8_git_tag -- data from tags/v1.2.8 branch of StellarPGx and generated on the experiment server
│ └── validation_data -- data used in the validation (from google drive)
``` ```
""" """
) )
return return
@app.cell
def _(pd):
    # Read the pairwise-equality summary CSV; the bare head() call stays as
    # the last expression before the return.
    _summary_csv = "data/pairwise_equality.csv"
    pairwise_df = pd.read_csv(_summary_csv)
    pairwise_df.head()
    return (pairwise_df,)
@app.cell
def _(pairwise_df):
    # Print the distinct samples, then the distinct values of each condition
    # column; only the samples array is returned for later cells.
    samples = pairwise_df["sample"].unique()
    print(samples)
    for _column in ("condition1", "condition2"):
        print(pairwise_df[_column].unique())
    return (samples,)
@app.cell
def _(pairwise_df):
    # Group the rows by (condition1, condition2) and collect the group keys.
    _pair_columns = ["condition1", "condition2"]
    gb_pairwise_df = pairwise_df.groupby(by=_pair_columns)
    groups = list(gb_pairwise_df.groups)
    return gb_pairwise_df, groups
@app.cell
def _(gb_pairwise_df, groups, samples):
    # Build a nested mapping: "<condition1>__<condition2>" -> sample -> the
    # rows of that group for that sample, ordered by the "file" column.
    data_dict = {}
    for _pair in groups:
        _pair_df = gb_pairwise_df.get_group(_pair)
        data_dict["__".join(_pair)] = {
            _sample: _pair_df.loc[_pair_df["sample"] == _sample].sort_values(
                by="file"
            )
            for _sample in samples
        }
    print(data_dict.keys())
    return (data_dict,)
@app.cell
def _(data_dict, pd):
    # Save the per-group tables to one workbook for further analysis: one
    # sheet per group key, with that group's per-sample tables stacked
    # vertically on the sheet.
    # NOTE(review): sheet names are the full group keys; spreadsheet tools may
    # reject names longer than 31 characters -- confirm the file opens.
    with pd.ExcelWriter(
        "data/pairwise_comparison.xlsx", engine="openpyxl"
    ) as _writer:
        for _sheet_name, _tables in data_dict.items():
            # Track the next free row instead of multiplying the current
            # table's length by its index, which misplaces tables as soon as
            # they do not all have the same number of rows.
            _next_row = 0
            for _table in _tables.values():
                _table.to_excel(
                    _writer,
                    sheet_name=_sheet_name,
                    startrow=_next_row,
                )
                # One header row + the data rows + one blank spacer row.
                _next_row += len(_table) + 2
    return
@app.cell
def _(mo):
mo.md(
r"""
## Results Summary
**Validation data and the v1.2.6 and v1.2.7 manifest versions** from the production server are all equivalent.
- There was a strange issue with `pharmcat.vcf`: two rsIDs were reported for one variant and their order in the file changed between the two pipeline runs, i.e. not a systemic issue.
**v1.2.6 and v1.2.7 manifest versions** are equivalent.
**Validation data and the PGx experiment server using the main branch of StellarPGx** do not match.
- PharmCAT output is equivalent.
- The core issue is that on the new server StellarPGx produces different genotype predictions.
"""
)
return
@app.cell
def _():
    # Empty cell: takes no inputs and returns nothing.
    return
if __name__ == "__main__": if __name__ == "__main__":
app.run() app.run()

31
src/check_files_diffs.py Normal file
View File

@@ -0,0 +1,31 @@
import os
from subprocess import run
# File names that are never diffed.
SKIP_FILES = ["matcher.html", "matcher.json", "pharmcat.vcf", "output.json"]


def main(
    condition1_dir: str, condition2_dir: str, output_dir: str = "data"
) -> None:
    """Diff every sample's files between two condition directories.

    For each sample directory found in ``condition1_dir`` one report named
    ``diff_results_<sample>.txt`` is written to ``output_dir``, where
    ``<sample>`` is the directory name up to the first ``_pgx``. Each file of
    the sample that is not listed in ``SKIP_FILES`` is compared with the file
    of the same name under ``condition2_dir`` using the external ``diff``
    command, and gets its own ``#####<file>#####`` ... ``END###`` section.

    Args:
        condition1_dir: Directory whose sample sub-directories drive the
            comparison.
        condition2_dir: Directory expected to contain the same samples/files.
        output_dir: Directory the reports are written to; created if missing.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Sorted because os.listdir returns entries in arbitrary order and the
    # reports should be reproducible between runs and machines.
    for sample_dir in sorted(os.listdir(condition1_dir)):
        sample_path1 = os.path.join(condition1_dir, sample_dir)
        if not os.path.isdir(sample_path1):
            # Stray files next to the sample directories cannot be listed.
            continue
        sample_path2 = os.path.join(condition2_dir, sample_dir)

        report_path = os.path.join(
            output_dir, f"diff_results_{sample_dir.split('_pgx')[0]}.txt"
        )
        with open(report_path, "w", encoding="utf-8") as handle:
            for file_name in sorted(os.listdir(sample_path1)):
                if file_name in SKIP_FILES:
                    continue
                # Argument list instead of a shell string, so paths that
                # contain spaces or shell metacharacters are passed verbatim.
                result = run(
                    [
                        "diff",
                        os.path.join(sample_path1, file_name),
                        os.path.join(sample_path2, file_name),
                    ],
                    capture_output=True,
                )
                handle.write(f"#####{file_name}#####\n")
                handle.write(result.stdout.decode("utf-8", errors="replace"))
                if result.returncode > 1:
                    # Exit status above 1 means diff itself failed (e.g. the
                    # file is missing on one side); record the message so the
                    # empty section is not mistaken for "no differences".
                    handle.write(
                        result.stderr.decode("utf-8", errors="replace")
                    )
                handle.write(
                    "END############################################\n\n"
                )
if __name__ == "__main__":
    # Compare the validation data against the pgxCleaner server output.
    main(
        condition1_dir="data/validation_data",
        condition2_dir="data/pgxCleaner_server",
    )

View File

@@ -73,7 +73,7 @@ if __name__ == "__main__":
"v1.2.8_git_tag", "v1.2.8_git_tag",
"validation_data", "validation_data",
"v1.2.6_manifest_version", "v1.2.6_manifest_version",
"v1.2.7_mainfest_version", "v1.2.7_manifest_version",
"pgxCleaner_server", "pgxCleaner_server",
] ]