analysed the file differences
This commit is contained in:
@@ -9,7 +9,7 @@ def _():
|
||||
import os
|
||||
import marimo as mo
|
||||
import pandas as pd
|
||||
return (mo,)
|
||||
return mo, pd
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
@@ -20,12 +20,105 @@ def _(mo):
|
||||
|
||||
Data structure
|
||||
```
|
||||
|
||||
.
|
||||
├── data
|
||||
│ ├── pairwise_equality.csv -- file with summary from src/check_results_equivilence.py
|
||||
│ ├── pgxCleaner_server -- data from main branch of StellarPGx and generated on the experiment server
|
||||
│ ├── test_samples.txt -- list of samples to be tested with paths on s3 (one per line)
|
||||
│ ├── v1.2.6_manifest_version -- data generated on the prod server with manifest.version = 1.2.6
|
||||
│ ├── v1.2.7_mainfest_version -- data generated on the prod server with manifest.version = 1.2.7
|
||||
│ ├── v1.2.8_git_tag -- data from tags/v1.2.8 branch of StellarPGx and generated on the experiment server
|
||||
│ └── validation_data -- data used in the validation (from google drive)
|
||||
```
|
||||
"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
def _(pd):
    # Load the pairwise-equality summary CSV and preview its first rows.
    _summary_csv = "data/pairwise_equality.csv"
    pairwise_df = pd.read_csv(_summary_csv)
    pairwise_df.head()
    return (pairwise_df,)
|
||||
|
||||
|
||||
@app.cell
def _(pairwise_df):
    # Collect the distinct samples and the distinct values of both condition
    # columns, then print each array in turn (samples are reused later).
    samples = pairwise_df["sample"].unique()
    cond1 = pairwise_df["condition1"].unique()
    cond2 = pairwise_df["condition2"].unique()
    for _unique_values in (samples, cond1, cond2):
        print(_unique_values)
    return (samples,)
|
||||
|
||||
|
||||
@app.cell
def _(pairwise_df):
    # Group the comparison rows by their (condition1, condition2) pair and
    # keep the list of pair keys for later iteration.
    gb_pairwise_df = pairwise_df.groupby(["condition1", "condition2"])
    groups = [*gb_pairwise_df.groups]
    return gb_pairwise_df, groups
|
||||
|
||||
|
||||
@app.cell
def _(gb_pairwise_df, groups, samples):
    # Reshape the grouped table into a nested mapping:
    #   "<condition1>__<condition2>" -> {sample -> rows for that sample,
    #                                    sorted by the "file" column}
    data_dict = {}
    for _pair in groups:
        _pair_df = gb_pairwise_df.get_group(_pair)
        data_dict["__".join(_pair)] = {
            _sample: _pair_df.loc[_pair_df["sample"] == _sample].sort_values(
                by="file"
            )
            for _sample in samples
        }

    print(data_dict.keys())
    return (data_dict,)
|
||||
|
||||
|
||||
@app.cell
def _(data_dict, pd):
    # Save the data to analyse further: one worksheet per condition pair, with
    # the per-sample tables stacked vertically on that sheet.
    with pd.ExcelWriter(
        "data/pairwise_comparison.xlsx", engine="openpyxl"
    ) as writer:
        for g, data in data_dict.items():
            # Track the next free row explicitly. Deriving it from
            # `idx * len(file_data)` is only right when every per-sample table
            # has the same number of rows; otherwise tables overwrite each
            # other or leave erratic gaps.
            _next_row = 0
            for file_data in data.values():
                file_data.to_excel(
                    writer,
                    sheet_name=g,
                    startrow=_next_row,
                )
                # Each table occupies one header row plus its data rows;
                # one extra row is left blank as a separator.
                _next_row += len(file_data) + 2
    return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo):
|
||||
mo.md(
|
||||
r"""
|
||||
## Results Summary
|
||||
|
||||
**Validation data and v1.2.6 and v1.2.7 manifest versions** from the production server are all equivalent.
|
||||
|
||||
- There was a strange issue with `pharmcat.vcf`, but it was caused by two rsIDs being reported for one variant, with the order of the rsIDs in the file changing between the two pipeline runs, i.e. not a systemic issue.
|
||||
|
||||
**v1.2.6 and v1.2.7 manifest versions** are equivalent.
|
||||
|
||||
**validation and PGx experiment server using main branch of stellarPGx** do not match
|
||||
|
||||
- PharmCAT output is equivalent.
|
||||
- Core issue is that on the new server stellarPGx produces different genotype predictions
|
||||
"""
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
def _():
    # Empty placeholder cell: no code, nothing exported.
    return
|
||||
|
||||
|
||||
# Run the notebook app only when this file is executed as a script.
if __name__ == "__main__":
    app.run()
|
||||
|
||||
Reference in New Issue
Block a user