import marimo

__generated_with = "0.14.16"
app = marimo.App(width="medium")


@app.cell
def _():
    import marimo as mo
    import pandas as pd
    import numpy as np
    import datetime
    return mo, np, pd


@app.cell
def _(mo):
    mo.md(
        r"""
    # Generate the Up to Date Sample List for Dr Chambers

    Email request from 29-Aug-25:

    "The Quantgene/Serenomica lab case IDs are unique identifiers for each of our samples. The samples also have our sample ID numbers, which are not unique.

    Can you make sure the lab case ID numbers are still labeled on the tubes which are being sent to us?


    Lastly, can you get us a full list of plasma samples on our research project (approx. 135, including the last 7 specimens we have been discussing) along with the lab case ID numbers and sample ID numbers? If a date of collection or lab receipt date is available that would also be helpful. I have several partial lists containing this information.   


    Thank you, Setsuko
    "
    """
    )
    return


@app.cell
def _():
    useful_cols = [
        "Lab Case ID",
        "Sample ID (on tube)",
        "Patient ID ",
        "Date Received",
        "Sample Comments",
    ]
    return (useful_cols,)


@app.cell
def _(mo):
    mo.md(r"""## (1) Samples from RDX""")
    return


@app.cell
def _(np, pd):
    samples_rdx = (
        pd.read_excel("../data/RDX to Portland Shipment - Sample List.xlsx")
        .rename(
            {"Sample ID": "Sample ID (on tube)", "Date Arrived": "Date Received"},
            axis=1,
        )
        .drop("Sort", axis=1)
    )
    samples_rdx.loc[
        samples_rdx["Sample ID (on tube)"].astype(str).str.contains("-AZ-")
    ]
    samples_rdx["Sample Comments"] = (
        samples_rdx[
            [
                "Confirmed Sample Receipt in Portland",
                "Still Need to Investigate",
                "RDX/Nivi Comments",
            ]
        ]
        .replace(np.nan, "")
        .astype(str)
        .apply("; ".join, axis=1)
    )
    samples_rdx
    return (samples_rdx,)


@app.cell
def _(samples_rdx, useful_cols):
    az_samples_rdx = (
        samples_rdx.loc[
            samples_rdx["Sample ID (on tube)"].astype(str).str.contains("-AZ-")
        ]
        .copy()
        .reset_index()
    )
    az_samples_rdx["Lab Case ID"] = [None] * len(az_samples_rdx)
    az_samples_rdx["Patient ID "] = az_samples_rdx["Sample ID (on tube)"].apply(
        lambda x: "-".join(x.split("-")[:-1])
    )
    az_samples_rdx = az_samples_rdx[useful_cols]
    return (az_samples_rdx,)


@app.cell
def _(az_samples_rdx):
    # find dups to deal with
    az_samples_rdx.loc[
        az_samples_rdx.duplicated(subset="Sample ID (on tube)", keep=False)
    ].sort_values(by="Sample ID (on tube)")
    return


@app.cell
def _(az_samples_rdx):
    drop_idx = az_samples_rdx[
        az_samples_rdx.duplicated(subset="Sample ID (on tube)", keep=False)
        & (az_samples_rdx["Sample Comments"].str.contains("No"))
    ].index
    az_samples_rdx.drop(drop_idx, inplace=True)
    return


@app.cell
def _(mo):
    mo.md(r"""## (2) Samples Recieved at Portland lab""")
    return


@app.cell
def _(pd):
    # incoming sample trackers
    df2022 = pd.read_excel(
        "../data/Incoming Sample Tracker - Commercial & Clinical Research-2022.xlsx"
    )
    df2023 = pd.read_excel(
        "../data/Incoming Sample Tracker - Commercial & Clinical Research-2023.xlsx"
    )
    df2024 = pd.read_excel(
        "../data/Incoming Sample Tracker - Commercial & Clinical Research-2024.xlsx"
    )
    df2025 = pd.read_excel("../data/Incoming Sample Tracker 2025.xlsx")
    return df2022, df2023, df2024, df2025


@app.cell
def _(df2022, df2023, df2024, df2025, useful_cols):
    # only the samples with -AZ- i.e. UoAz samples
    az_2022 = df2022.loc[df2022["Patient ID "].astype(str).str.contains("-AZ-")][
        useful_cols
    ]
    az_2023 = df2023.loc[df2023["Patient ID "].astype(str).str.contains("-AZ-")][
        useful_cols
    ]
    az_2024 = df2024.loc[df2024["Patient ID "].astype(str).str.contains("-AZ-")][
        useful_cols
    ]
    az_2025 = df2025.loc[df2025["Patient ID "].astype(str).str.contains("-AZ-")][
        useful_cols
    ]
    return az_2022, az_2023, az_2024, az_2025


@app.cell
def _(mo):
    mo.md(r"""## Join all Data and deal with Duplicates""")
    return


@app.cell
def _(az_2022, az_2023, az_2024, az_2025, az_samples_rdx, pd):
    # join all data
    all_az_samples = pd.concat(
        [az_samples_rdx, az_2022, az_2023, az_2024, az_2025],
        axis=0,
        ignore_index=True,
    )
    print(all_az_samples.shape)
    all_az_samples.head()
    return (all_az_samples,)


@app.cell
def _(all_az_samples):
    # identify duplicates
    print(sum(all_az_samples.duplicated(subset="Patient ID ", keep="first")))
    all_az_samples.loc[
        all_az_samples.duplicated(
            subset=["Patient ID ", "Sample ID (on tube)"], keep=False
        )
    ].sort_values(by="Patient ID ")

    # drop logic
    # - remove any dup that doesn't have a lab case ID
    # - remaining need to stay as these are likely repeated sent samples
    return


@app.cell
def _(all_az_samples, np):
    # remove the dups based on the logic above
    dup_idx = all_az_samples[
        (
            all_az_samples.duplicated(
                subset=["Patient ID ", "Sample ID (on tube)"], keep=False
            )
        )
        & (all_az_samples["Lab Case ID"].replace(np.nan, "") == "")
    ].index
    all_az_samples_dedup = (
        all_az_samples.drop(dup_idx)
        .sort_values(by=["Patient ID ", "Sample ID (on tube)"])
        .reset_index(drop=True)
        .copy()
    )
    print(all_az_samples_dedup.shape)
    return (all_az_samples_dedup,)


@app.cell
def _(all_az_samples_dedup):
    # mark remaining true duplicates
    # all_az_samples = all_az_samples.sort_values(
    #     by=["Patient ID ", "Sample ID (on tube)"], inplace=True
    # )
    all_az_samples_dedup["duplicated"] = all_az_samples_dedup.duplicated(
        subset="Patient ID ", keep=False
    )

    # clean the date column
    dates_cleaned = all_az_samples_dedup["Date Received"].apply(
        lambda x: "No data" if (x != x) or (x == "") else x.strftime("%Y-%m-%d")
    )
    all_az_samples_dedup["Date Received"] = dates_cleaned

    all_az_samples_dedup
    return


@app.cell
def _(all_az_samples_dedup):
    all_az_samples_dedup.to_excel("../data/az_samples_overview.xlsx", index=False)
    return


@app.cell
def _():
    return


if __name__ == "__main__":
    app.run()