# reproduce_setup/notebooks/data_inspector.py

import marimo

# Version string stamped into the notebook file -- presumably the marimo
# release that generated it.
__generated_with = "0.14.16"
# Notebook application object; each cell below registers on it via @app.cell.
app = marimo.App(width="medium")


@app.cell
def _():
    # Shared imports for the notebook; later cells receive `mo` and `pd`
    # through their parameters.
    import marimo as mo
    import pandas as pd
    return mo, pd


@app.cell(hide_code=True)
def _(mo):
    # Notebook introduction: Markdown describing the layout of ./data.
    # Directory and script names are quoted verbatim (including their
    # spelling) so they keep matching what is on disk.
    mo.md(
        r"""
    # Analyse the Mismatch data

    Data structure
    ```
    .
    ├── data
    │   ├── pairwise_equality.csv -- file with summary from src/check_results_equivilence.py
    │   ├── pgxCleaner_server -- data from main branch of StellarPGx and generated on the experiment server
    │   ├── test_samples.txt -- list of samples to be tested with paths on s3 (one per line)
    │   ├── v1.2.6_manifest_version -- data generated on the prod server with manifest.version = 1.2.6
    │   ├── v1.2.7_mainfest_version -- data generated on the prod server with manifest.version = 1.2.7
    │   ├── v1.2.8_git_tag -- data from tags/v1.2.8 branch of StellarPGx and generated on the experiment server
    │   └── validation_data -- data used in the validation (from google drive)
    ```
    """
    )
    return


@app.cell
def _(pd):
    # Load the pairwise comparison summary (path is relative to the working
    # directory the notebook is run from).
    pairwise_df = pd.read_csv("data/pairwise_equality.csv")
    # Preview the first rows; marimo shows a cell's last expression as its
    # output, so this line must stay last before the return.
    pairwise_df.head()
    return (pairwise_df,)


@app.cell
def _(pairwise_df):
    # Collect the distinct samples and the distinct values on each side of
    # the comparison, then print them; only `samples` is used by later cells.
    samples = pairwise_df["sample"].unique()
    cond1 = pairwise_df["condition1"].unique()
    cond2 = pairwise_df["condition2"].unique()
    for _values in (samples, cond1, cond2):
        print(_values)
    return (samples,)


@app.cell
def _(pairwise_df):
    # Group the comparison rows by the pair of conditions being compared and
    # keep the (condition1, condition2) keys for iteration in later cells.
    gb_pairwise_df = pairwise_df.groupby(["condition1", "condition2"])
    groups = [*gb_pairwise_df.groups]
    return gb_pairwise_df, groups


@app.cell
def _(gb_pairwise_df, groups, samples):
    # Reshape the grouped table into
    #   {"<condition1>__<condition2>": {sample: rows sorted by "file"}}
    # so every comparison/sample combination can be looked up directly.
    data_dict = {}
    for _key in groups:
        _label = "__".join(_key)
        _frame = gb_pairwise_df.get_group(_key)
        data_dict[_label] = {
            _name: _frame[_frame["sample"] == _name].sort_values(by="file")
            for _name in samples
        }

    print(data_dict.keys())
    return (data_dict,)


@app.cell
def _(data_dict, pd):
    # Save the decomposed tables for further analysis: one worksheet per
    # condition pair, with each sample's table stacked vertically.
    with pd.ExcelWriter(
        "data/pairwise_comparison.xlsx", engine="openpyxl"
    ) as writer:
        for g, data in data_dict.items():
            # Track the next free row rather than deriving it from the
            # current table's length, so tables with different numbers of
            # rows never overlap or leave uneven gaps.
            next_row = 0
            for file_data in data.values():
                file_data.to_excel(
                    writer,
                    sheet_name=g,
                    startrow=next_row,
                )
                # One header row + the data rows + one blank separator row.
                next_row += len(file_data) + 2
    return


@app.cell
def _(mo):
    # Written conclusions drawn from the comparison tables, rendered as
    # Markdown.
    mo.md(
        r"""
    ## Results Summary

    **Validation data and v1.2.6 and v1.2.7 manifest versions** from the production server are all equivalent.

    - There was a strange issue with `pharmcat.vcf`, but this was because two rsids were reported for one variant and the order of the rsids in the file changes between the two pipeline runs, i.e. not a systemic issue.

    **v1.2.6 and v1.2.7 manifest versions** are equivalent.

    **validation and PGx experiment server using main branch of StellarPGx** do not match

    - PharmCAT output is equivalent.
    - Core issue is that on the new server StellarPGx produces different genotype predictions
    """
    )
    return


@app.cell
def _():
    # Empty cell: contains no code and defines nothing for other cells.
    return


# Run the notebook app when this file is executed directly as a script.
if __name__ == "__main__":
    app.run()