import marimo

__generated_with = "0.14.16"
app = marimo.App(width="medium")


@app.cell
def _():
    # Shared imports; only the names returned here are visible to other cells.
    import marimo as mo
    import pandas as pd

    return mo, pd


@app.cell(hide_code=True)
def _(mo):
    # NOTE(review): the v1.2.7 entry in the tree below also says
    # "manifest.version = 1.2.6" -- looks like a copy-paste slip; confirm the
    # intended version before changing the rendered text.
    mo.md(
        r"""
    # Analyse the Mismatch data

    Data structure

    ```
    .
    ├── data
    │   ├── pairwise_equality.csv -- file with summary from src/check_results_equivilence.py
    │   ├── pgxCleaner_server -- data from main branch of StellarPGx and generated on the experiment server
    │   ├── test_samples.txt -- list of samples to be tested with paths on s3 (one per line)
    │   ├── v1.2.6_manifest_version -- data generated on the prod server with manifest.version = 1.2.6
    │   ├── v1.2.7_mainfest_version -- data generated on the prod server with manifest.version = 1.2.6
    │   ├── v1.2.8_git_tag -- data from tags/v1.2.8 branch of StellarPGx and generated on the experiment server
    │   └── validation_data -- data used in the validation (from google drive)
    ```
    """
    )
    return


@app.cell
def _(pd):
    # Load the pairwise comparison summary; the trailing expression is the
    # cell's displayed output (a preview of the first rows).
    pairwise_df = pd.read_csv("data/pairwise_equality.csv")
    pairwise_df.head()
    return (pairwise_df,)


@app.cell
def _(pairwise_df):
    # print and get samples and groups for later
    samples = pairwise_df["sample"].unique()
    print(samples)
    cond1 = pairwise_df["condition1"].unique()
    print(cond1)
    cond2 = pairwise_df["condition2"].unique()
    print(cond2)
    return (samples,)


@app.cell
def _(pairwise_df):
    # One group per (condition1, condition2) pair that occurs in the table.
    gb_pairwise_df = pairwise_df.groupby(by=["condition1", "condition2"])
    groups = list(gb_pairwise_df.groups)
    return gb_pairwise_df, groups


@app.cell
def _(gb_pairwise_df, groups, samples):
    # decompose table to be samples and group focused:
    # data_dict["<condition1>__<condition2>"][sample] -> rows for that sample
    # within that comparison, sorted by the "file" column.
    data_dict = {}
    for group in groups:
        str_group = "__".join(group)
        data_dict[str_group] = {}
        group_df = gb_pairwise_df.get_group(group)
        for sample in samples:
            data_dict[str_group][sample] = group_df.loc[
                group_df["sample"] == sample
            ].sort_values(by="file")

    print(data_dict.keys())
    return (data_dict,)


@app.cell
def _(data_dict, pd):
    # save data to analyse further: one sheet per comparison, with the
    # per-sample tables stacked vertically on that sheet.
    with pd.ExcelWriter(
        "data/pairwise_comparison.xlsx", engine="openpyxl"
    ) as writer:
        for g, data in data_dict.items():
            # Track the next free row explicitly. Deriving it from the current
            # frame's length (idx * len(file_data)) is only right when every
            # sample has the same number of rows; otherwise tables overlap.
            start_row = 0
            for file_data in data.values():
                file_data.to_excel(
                    writer,
                    sheet_name=g,
                    startrow=start_row,
                )
                # header row + data rows + one blank spacer row
                start_row += len(file_data) + 2
    return


@app.cell
def _(mo):
    mo.md(
        r"""
    ## Results Summary

    **Validation data and v1.2.6 and v1.2.7 manifest versions** from the production server are all equivilent.

    - Was a strange issue with `pharmcat.vcf` but this was that two rsids were reported for one variant and the order of the rsids in the file changes between the two pipeline runs- i.e. not a systemic issue.

    **v1.2.6 and v1.2.7 manifest versions** are equivilent.

    **validation and PGx experiment server using main brach of stellarPGx** do not match

    - PharmCAT output is equivilent.
    - Core issue is that on the new server stellarPGx prododuces different genotype predictions
    """
    )
    return


@app.cell
def _():
    return


if __name__ == "__main__":
    app.run()