analysed the file differences
This commit is contained in:
@@ -9,7 +9,7 @@ def _():
|
||||
import os
|
||||
import marimo as mo
|
||||
import pandas as pd
|
||||
return (mo,)
|
||||
return mo, pd
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
@@ -20,12 +20,105 @@ def _(mo):
|
||||
|
||||
Data structure
|
||||
```
|
||||
|
||||
.
|
||||
├── data
|
||||
│ ├── pairwise_equality.csv -- file with summary from src/check_results_equivilence.py
|
||||
│ ├── pgxCleaner_server -- data from main branch of StellarPGx and generated on the experiment server
|
||||
│ ├── test_samples.txt -- list of samples to be tested with paths on s3 (one per line)
|
||||
│ ├── v1.2.6_manifest_version -- data generated on the prod server with manifest.version = 1.2.6
|
||||
│   ├── v1.2.7_mainfest_version -- data generated on the prod server with manifest.version = 1.2.7
|
||||
│ ├── v1.2.8_git_tag -- data from tags/v1.2.8 branch of StellarPGx and generated on the experiment server
|
||||
│ └── validation_data -- data used in the validation (from google drive)
|
||||
```
|
||||
"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
def _(pd):
    # Load the pairwise equality summary table and preview its first rows.
    pairwise_df = pd.read_csv(filepath_or_buffer="data/pairwise_equality.csv")
    pairwise_df.head()
    return (pairwise_df,)
|
||||
|
||||
|
||||
@app.cell
def _(pairwise_df):
    # Collect (and echo) the distinct samples and the distinct values of
    # both condition columns; only `samples` is handed on to later cells.
    samples = pairwise_df.loc[:, "sample"].unique()
    print(samples)

    cond1 = pairwise_df.loc[:, "condition1"].unique()
    print(cond1)

    cond2 = pairwise_df.loc[:, "condition2"].unique()
    print(cond2)

    return (samples,)
|
||||
|
||||
|
||||
@app.cell
def _(pairwise_df):
    # Group the rows by condition pair and keep the list of
    # (condition1, condition2) keys that occur in the table.
    gb_pairwise_df = pairwise_df.groupby(["condition1", "condition2"])
    groups = [*gb_pairwise_df.groups]
    return gb_pairwise_df, groups
|
||||
|
||||
|
||||
@app.cell
def _(gb_pairwise_df, groups, samples):
    # Reshape the grouped table into a two-level mapping:
    #   "<condition1>__<condition2>" -> sample -> rows for that sample,
    # with each sample's rows ordered by the "file" column.
    def _rows_by_sample(frame):
        return {
            name: frame.loc[frame["sample"] == name].sort_values(by="file")
            for name in samples
        }

    data_dict = {
        "__".join(key): _rows_by_sample(gb_pairwise_df.get_group(key))
        for key in groups
    }

    print(data_dict.keys())
    return (data_dict,)
|
||||
|
||||
|
||||
@app.cell
def _(data_dict, pd):
    # Save data to analyse further: one sheet per condition pair, with the
    # per-sample tables stacked vertically on that sheet.
    with pd.ExcelWriter(
        "data/pairwise_comparison.xlsx", engine="openpyxl"
    ) as writer:
        for g, data in data_dict.items():
            # Track the next free row explicitly. Deriving the offset from
            # the *current* table's length (idx * len(table)) misplaces or
            # overlaps tables as soon as samples have different row counts.
            next_row = 0
            for file_data in data.values():
                file_data.to_excel(
                    writer,
                    sheet_name=g,
                    startrow=next_row,
                )
                # Each table takes one header row plus its data rows; one
                # extra blank row separates it from the next table.
                next_row += len(file_data) + 2
    return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""
|
||||
## Results Summary
|
||||
|
||||
**Validation data and v1.2.6 and v1.2.7 manifest versions** from the production server are all equivalent.
|
||||
|
||||
- There was a strange issue with `pharmcat.vcf`: two rsids were reported for one variant, and the order of the rsids in the file changed between the two pipeline runs, i.e. not a systemic issue.
|
||||
|
||||
**v1.2.6 and v1.2.7 manifest versions** are equivalent.
|
||||
|
||||
**validation and PGx experiment server using main branch of stellarPGx** do not match
|
||||
|
||||
- PharmCAT output is equivalent.
|
||||
- Core issue is that on the new server stellarPGx produces different genotype predictions
|
||||
"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
def _():
    # Empty placeholder cell: it computes nothing and exports nothing.
    return
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run the app only when this file is executed directly.
    app.run()
|
||||
|
||||
31
src/check_files_diffs.py
Normal file
31
src/check_files_diffs.py
Normal file
@@ -0,0 +1,31 @@
|
||||
import os
|
||||
from subprocess import run
|
||||
|
||||
# File names that are never diffed.
SKIP_FILES = ["matcher.html", "matcher.json", "pharmcat.vcf", "output.json"]


def main(condition1_dir: str, condition2_dir: str) -> None:
    """Diff every sample's files between two condition directories.

    For each sample sub-directory of ``condition1_dir``, every file not
    listed in ``SKIP_FILES`` is compared with the same-named file under
    ``condition2_dir`` using the external ``diff`` tool. The output for one
    sample is written to ``data/diff_results_<sample>.txt``, where
    ``<sample>`` is the directory name up to the first ``_pgx``.

    Args:
        condition1_dir: Directory whose sample sub-directories drive the
            comparison.
        condition2_dir: Directory expected to contain the same
            ``<sample>/<file>`` layout.
    """
    # Sorted listings keep the report order reproducible between runs.
    for sample_dir in sorted(os.listdir(condition1_dir)):
        sample_path = f"{condition1_dir}/{sample_dir}"
        # Stray files next to the sample directories cannot be listed.
        if not os.path.isdir(sample_path):
            continue

        report_path = f"data/diff_results_{sample_dir.split('_pgx')[0]}.txt"
        with open(report_path, "w", encoding="utf-8") as handle:
            for file in sorted(os.listdir(sample_path)):
                if file in SKIP_FILES:
                    continue

                # An argument list (no shell) keeps paths containing spaces
                # or shell metacharacters intact; "--" ends option parsing.
                result = run(
                    [
                        "diff",
                        "--",
                        f"{condition1_dir}/{sample_dir}/{file}",
                        f"{condition2_dir}/{sample_dir}/{file}",
                    ],
                    capture_output=True,
                )

                handle.write(f"#####{file}#####\n")
                handle.write(result.stdout.decode("utf-8", errors="replace"))
                # diff exits with 0 (identical) or 1 (different); anything
                # higher means it could not compare (e.g. a missing file).
                # Record the message so a failure is not read as "no diff".
                if result.returncode > 1:
                    handle.write(
                        result.stderr.decode("utf-8", errors="replace")
                    )
                handle.write("END############################################\n\n")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Compare the validation data against the pgxCleaner server output.
    main(
        condition1_dir="data/validation_data",
        condition2_dir="data/pgxCleaner_server",
    )
|
||||
@@ -73,7 +73,7 @@ if __name__ == "__main__":
|
||||
"v1.2.8_git_tag",
|
||||
"validation_data",
|
||||
"v1.2.6_manifest_version",
|
||||
"v1.2.7_mainfest_version",
|
||||
"v1.2.7_manifest_version",
|
||||
"pgxCleaner_server",
|
||||
]
|
||||
|
||||
|
||||
Reference in New Issue
Block a user