125 lines
3.4 KiB
Python
125 lines
3.4 KiB
Python
import marimo
|
|
|
|
# Version string of the marimo release recorded in this notebook file.
__generated_with = "0.14.16"

# Notebook application object; each cell below registers itself on it
# through the @app.cell decorator.
app = marimo.App(width="medium")
|
|
|
|
|
|
@app.cell
def _():
    # Shared imports for the notebook. Only the names returned here are
    # consumed by other cells (they receive them as `mo` and `pd` parameters).
    # The previously imported `os` module was dropped: no cell takes it as a
    # parameter and it was never part of the return tuple.
    import marimo as mo
    import pandas as pd

    return mo, pd
|
|
|
|
|
|
@app.cell(hide_code=True)
def _(mo):
    # Introductory markdown: describes the layout of the `data` directory.
    # NOTE(review): the names `check_results_equivilence.py` and
    # `v1.2.7_mainfest_version` are kept verbatim in case they match the real
    # on-disk names -- confirm before "fixing" their spelling.
    # The description of the v1.2.7 directory previously repeated
    # "manifest.version = 1.2.6" from the line above; it now says 1.2.7.
    mo.md(
        r"""
    # Analyse the Mismatch data

    Data structure
    ```
    .
    ├── data
    │   ├── pairwise_equality.csv    -- file with summary from src/check_results_equivilence.py
    │   ├── pgxCleaner_server        -- data from main branch of StellarPGx and generated on the experiment server
    │   ├── test_samples.txt         -- list of samples to be tested with paths on s3 (one per line)
    │   ├── v1.2.6_manifest_version  -- data generated on the prod server with manifest.version = 1.2.6
    │   ├── v1.2.7_mainfest_version  -- data generated on the prod server with manifest.version = 1.2.7
    │   ├── v1.2.8_git_tag           -- data from tags/v1.2.8 branch of StellarPGx and generated on the experiment server
    │   └── validation_data          -- data used in the validation (from google drive)
    ```
    """
    )
    return
|
|
|
|
|
|
@app.cell
def _(pd):
    # Load the pairwise comparison summary table from the data directory.
    _summary_csv = "data/pairwise_equality.csv"
    pairwise_df = pd.read_csv(_summary_csv)

    # Keep this preview as the final expression of the cell so the first rows
    # remain the cell's displayed value.
    pairwise_df.head()
    return (pairwise_df,)
|
|
|
|
|
|
@app.cell
def _(pairwise_df):
    # Print the distinct values of the sample and condition columns; the
    # sample list is handed on to later cells.
    _unique_values = {}
    for _column in ("sample", "condition1", "condition2"):
        # Compute, then print, one column at a time (same order as before).
        _unique_values[_column] = pairwise_df[_column].unique()
        print(_unique_values[_column])

    samples = _unique_values["sample"]
    cond1 = _unique_values["condition1"]
    cond2 = _unique_values["condition2"]
    return (samples,)
|
|
|
|
|
|
@app.cell
def _(pairwise_df):
    # Group the comparison rows by their (condition1, condition2) pair and
    # collect the pair keys for the cells that iterate over them.
    gb_pairwise_df = pairwise_df.groupby(["condition1", "condition2"])
    groups = list(gb_pairwise_df.groups)
    return gb_pairwise_df, groups
|
|
|
|
|
|
@app.cell
def _(gb_pairwise_df, groups, samples):
    # Reshape the table into a nested mapping:
    #   "<condition1>__<condition2>" -> sample -> rows for that sample,
    # with each sample's rows sorted by the "file" column.
    data_dict = {}
    for _pair in groups:
        _pair_rows = gb_pairwise_df.get_group(_pair)
        data_dict["__".join(_pair)] = {
            _sample: _pair_rows.loc[_pair_rows["sample"] == _sample].sort_values(
                by="file"
            )
            for _sample in samples
        }

    print(data_dict.keys())
    return (data_dict,)
|
|
|
|
|
|
@app.cell
def _(data_dict, pd):
    # Save the decomposed tables for further analysis: one worksheet per
    # condition pair, with that pair's per-sample tables stacked vertically.
    # NOTE(review): Excel caps sheet names at 31 characters; confirm that the
    # data_dict keys used as sheet names stay within that limit.
    with pd.ExcelWriter(
        "data/pairwise_comparison.xlsx", engine="openpyxl"
    ) as writer:
        for _sheet_name, _per_sample in data_dict.items():
            # Track the next free row explicitly. The earlier formula
            # (idx * len(current_table) + 2 * idx) assumed every sample table
            # has the same number of rows, so tables of different lengths
            # overlapped or were mis-spaced. For equal-length tables the
            # resulting positions are unchanged.
            _next_row = 0
            for _sample_table in _per_sample.values():
                _sample_table.to_excel(
                    writer,
                    sheet_name=_sheet_name,
                    startrow=_next_row,
                )
                # Header row + data rows + one blank separator row.
                _next_row += len(_sample_table) + 2
    return
|
|
|
|
|
|
@app.cell
def _(mo):
    # Written summary of the comparison results (spelling and grammar of the
    # displayed text corrected; findings unchanged).
    mo.md(
        r"""
    ## Results Summary

    **Validation data and v1.2.6 and v1.2.7 manifest versions** from the production server are all equivalent.

    - There was a strange issue with `pharmcat.vcf`, but the cause was that two rsids were reported for one variant and the order of the rsids in the file changed between the two pipeline runs, i.e. not a systemic issue.

    **v1.2.6 and v1.2.7 manifest versions** are equivalent.

    **validation and PGx experiment server using main branch of stellarPGx** do not match

    - PharmCAT output is equivalent.
    - Core issue is that on the new server stellarPGx produces different genotype predictions
    """
    )
    return
|
|
|
|
|
|
@app.cell
def _():
    # Empty cell: runs no code and defines no names.
    return
|
|
|
|
|
|
# Run the notebook's cells when this file is executed as a script.
if __name__ == "__main__":
    app.run()
|