feat: code added for parsing the sample list by UoAz samples

2025-09-03 13:31:19 +02:00
parent ea772ca0b0
commit c00ff64771
2 changed files with 254 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -174,3 +174,5 @@ cython_debug/
 # PyPI configuration file
 .pypirc
 data/
 __marimo__/
--- a/notebooks/sample_list_chambers_20250202.py
+++ b/notebooks/sample_list_chambers_20250202.py
@@ -0,0 +1,252 @@
 # Notebook preamble: `app` is the object every cell below registers on
 # through the `@app.cell` decorator.
 import marimo
 # presumably the marimo version that wrote this file — confirm before editing by hand
 __generated_with = "0.14.16"
 app = marimo.App(width="medium")
@app.cell
def _():
    # Shared imports for the notebook. Only the names returned here are
    # consumed by other cells (they list them as parameters).
    import marimo as mo
    import numpy as np
    import pandas as pd

    return mo, np, pd
@app.cell
 def _(mo):
    # Notebook introduction: title plus the verbatim email request (29-Aug-25)
    # that this sample list is meant to answer. The quoted text is kept as-is.
    mo.md(
        r"""
    # Generate the Up to Date Sample List for Dr Chambers
    Email request from 29-Aug-25:
    "The Quantgene/Serenomica lab case IDs are unique identifiers for each of our samples. The samples also have our sample ID numbers, which are not unique.
    Can you make sure the lab case ID numbers are still labeled on the tubes which are being sent to us?
    Lastly, can you get us a full list of plasma samples on our research project (approx. 135, including the last 7 specimens we have been discussing) along with the lab case ID numbers and sample ID numbers? If a date of collection or lab receipt date is available that would also be helpful. I have several partial lists containing this information.   
    Thank you, Setsuko
    "
    """
    )
    return
@app.cell
def _():
    # Columns every source table is reduced to before the tables are stacked.
    # NOTE: "Patient ID " carries a trailing space; the tracker sheets are
    # indexed with exactly this spelling elsewhere in the notebook.
    _identifier_cols = ["Lab Case ID", "Sample ID (on tube)", "Patient ID "]
    _detail_cols = ["Date Received", "Sample Comments"]
    useful_cols = _identifier_cols + _detail_cols
    return (useful_cols,)
@app.cell
 def _(mo):
    # Section heading for the cells that process the RDX shipment list.
    mo.md(r"""## (1) Samples from RDX""")
    return
@app.cell
def _(np, pd):
    # Load the RDX -> Portland shipment list and rename its ID/date columns
    # to the names used in `useful_cols`, so it can be stacked with the
    # incoming-sample trackers later on.
    samples_rdx = (
        pd.read_excel("../data/RDX to Portland Shipment - Sample List.xlsx")
        .rename(
            columns={
                "Sample ID": "Sample ID (on tube)",
                "Date Arrived": "Date Received",
            }
        )
        .drop(columns="Sort")
    )

    # Merge the three free-text status columns into a single comment string.
    # Missing values become "", so the "; " separators are always present
    # even when a field is blank (e.g. "; ; ").
    _comment_sources = [
        "Confirmed Sample Receipt in Portland",
        "Still Need to Investigate",
        "RDX/Nivi Comments",
    ]
    samples_rdx["Sample Comments"] = (
        samples_rdx[_comment_sources]
        .replace(np.nan, "")
        .astype(str)
        .apply("; ".join, axis=1)
    )

    # Bare expression kept as the final statement so the table is shown as
    # the cell output.
    samples_rdx
    return (samples_rdx,)
@app.cell
def _(samples_rdx, useful_cols):
    # Restrict the RDX list to tubes whose ID contains "-AZ-" and shape the
    # result like the tracker extracts (same columns, same order).
    _tube_ids = samples_rdx["Sample ID (on tube)"].astype(str)
    _az_rows = samples_rdx.loc[_tube_ids.str.contains("-AZ-")]
    az_samples_rdx = _az_rows.copy().reset_index()

    # The RDX sheet has no lab case IDs; add the column empty.
    az_samples_rdx["Lab Case ID"] = [None] * len(az_samples_rdx)

    def _without_last_segment(tube_id):
        # Everything before the final "-" of the tube ID.
        return "-".join(tube_id.split("-")[:-1])

    az_samples_rdx["Patient ID "] = az_samples_rdx["Sample ID (on tube)"].apply(
        _without_last_segment
    )

    az_samples_rdx = az_samples_rdx[useful_cols]
    return (az_samples_rdx,)
@app.cell
def _(az_samples_rdx):
    # Inspection only: list every RDX row whose tube ID occurs more than
    # once, ordered by tube ID so repeats sit next to each other.
    _repeated_tube = az_samples_rdx.duplicated(
        subset="Sample ID (on tube)", keep=False
    )
    az_samples_rdx.loc[_repeated_tube].sort_values(by="Sample ID (on tube)")
    return
@app.cell
def _(az_samples_rdx):
    # Among tube IDs that occur more than once, discard the rows whose merged
    # comment text contains "No".
    # NOTE(review): this is a case-sensitive substring test, so it also hits
    # words such as "Not" or "Notes" — confirm that is intended.
    # NOTE(review): the frame is modified in place; cells that read
    # `az_samples_rdx` only see the reduced table if they run after this one.
    _repeated_tube = az_samples_rdx.duplicated(
        subset="Sample ID (on tube)", keep=False
    )
    _mentions_no = az_samples_rdx["Sample Comments"].str.contains("No")
    drop_idx = az_samples_rdx.loc[_repeated_tube & _mentions_no].index
    az_samples_rdx.drop(index=drop_idx, inplace=True)
    return
@app.cell
def _(mo):
    # Section heading for the cells that process the incoming sample trackers.
    # (Spelling of "Received" corrected in the rendered heading.)
    mo.md(r"""## (2) Samples Received at Portland lab""")
    return
@app.cell
def _(pd):
    # Incoming sample trackers, one workbook per year (2022-2025), read in
    # chronological order.
    _tracker_files = (
        "../data/Incoming Sample Tracker - Commercial & Clinical Research-2022.xlsx",
        "../data/Incoming Sample Tracker - Commercial & Clinical Research-2023.xlsx",
        "../data/Incoming Sample Tracker - Commercial & Clinical Research-2024.xlsx",
        "../data/Incoming Sample Tracker 2025.xlsx",
    )
    df2022, df2023, df2024, df2025 = (
        pd.read_excel(_path) for _path in _tracker_files
    )
    return df2022, df2023, df2024, df2025
@app.cell
def _(df2022, df2023, df2024, df2025, useful_cols):
    def _az_subset(tracker):
        # Rows whose patient ID contains "-AZ-" (the UoAz samples), reduced
        # to the shared column set.
        _is_az = tracker["Patient ID "].astype(str).str.contains("-AZ-")
        return tracker.loc[_is_az, useful_cols]

    # Same filter applied to each yearly tracker.
    az_2022, az_2023, az_2024, az_2025 = map(
        _az_subset, (df2022, df2023, df2024, df2025)
    )
    return az_2022, az_2023, az_2024, az_2025
@app.cell
 def _(mo):
    # Section heading for the cells that combine all sources and de-duplicate.
    mo.md(r"""## Join all Data and deal with Duplicates""")
    return
@app.cell
def _(az_2022, az_2023, az_2024, az_2025, az_samples_rdx, pd):
    # Stack the RDX list on top of the yearly tracker extracts; the row index
    # is rebuilt so it is unique across sources.
    _sources = [az_samples_rdx, az_2022, az_2023, az_2024, az_2025]
    all_az_samples = pd.concat(_sources, axis=0, ignore_index=True)

    print(all_az_samples.shape)
    # Preview of the combined table (cell output).
    all_az_samples.head()
    return (all_az_samples,)
@app.cell
def _(all_az_samples):
    # Count rows that repeat an earlier patient ID.
    # NOTE(review): the count keys on patient ID only, while the table shown
    # below keys on patient ID + tube ID — confirm the difference is intended.
    _repeat_patient = all_az_samples.duplicated(subset="Patient ID ", keep="first")
    print(_repeat_patient.sum())

    # Drop logic applied in the next cell:
    # - remove any dup that doesn't have a lab case ID
    # - remaining need to stay as these are likely repeated sent samples
    _sample_key = ["Patient ID ", "Sample ID (on tube)"]
    _repeat_sample = all_az_samples.duplicated(subset=_sample_key, keep=False)
    all_az_samples.loc[_repeat_sample].sort_values(by="Patient ID ")
    return
@app.cell
def _(all_az_samples, np):
    # De-duplicate on (patient ID, tube ID).
    #
    # A duplicate row without a Lab Case ID is dropped only when another row
    # of the same sample does carry one. Previously every blank-ID duplicate
    # was dropped, so a sample listed twice with no Lab Case ID on any row
    # disappeared from the final list altogether. Such rows are now kept so
    # they can be reviewed.
    # Duplicates that each have a Lab Case ID are left untouched (likely
    # repeated shipments of the same sample).
    _sample_key = ["Patient ID ", "Sample ID (on tube)"]
    _is_dup = all_az_samples.duplicated(subset=_sample_key, keep=False)
    _no_case_id = all_az_samples["Lab Case ID"].replace(np.nan, "") == ""

    # For each row: does its (patient, tube) group contain at least one row
    # with a Lab Case ID? `dropna=False` keeps rows with a missing key in a
    # group of their own; `sort=False` avoids ordering mixed-type keys.
    _group_has_case_id = (
        (~_no_case_id)
        .groupby(
            [all_az_samples[_col] for _col in _sample_key],
            dropna=False,
            sort=False,
        )
        .transform("any")
    )

    dup_idx = all_az_samples[_is_dup & _no_case_id & _group_has_case_id].index
    all_az_samples_dedup = (
        all_az_samples.drop(dup_idx)
        .sort_values(by=["Patient ID ", "Sample ID (on tube)"])
        .reset_index(drop=True)
    )
    print(all_az_samples_dedup.shape)
    return (all_az_samples_dedup,)
@app.cell
def _(all_az_samples_dedup):
    # Flag patients that still appear on more than one row, then normalise
    # the received date to "YYYY-MM-DD" text.
    # NOTE(review): this cell modifies `all_az_samples_dedup` in place; the
    # export cell only takes that frame as input, so it relies on notebook
    # order to pick up these changes.
    all_az_samples_dedup["duplicated"] = all_az_samples_dedup.duplicated(
        subset="Patient ID ", keep=False
    )

    def _format_received(value):
        # Missing markers: None, NaN/NaT (unequal to themselves) and "".
        if value is None or value != value or value == "":
            return "No data"
        # Date-like objects are rendered as ISO dates.
        if hasattr(value, "strftime"):
            return value.strftime("%Y-%m-%d")
        # Anything else (including text produced by an earlier run of this
        # cell) is passed through as a string, so re-running is safe.
        return str(value)

    dates_cleaned = all_az_samples_dedup["Date Received"].apply(_format_received)
    all_az_samples_dedup["Date Received"] = dates_cleaned

    # Final table shown as the cell output.
    all_az_samples_dedup
    return
@app.cell
def _(all_az_samples_dedup):
    # Write the final overview next to the input workbooks; the row index is
    # left out of the sheet.
    _overview_path = "../data/az_samples_overview.xlsx"
    all_az_samples_dedup.to_excel(_overview_path, index=False)
    return
@app.cell
 def _():
    # Empty cell: contains no statements and defines nothing.
    return
 # Run the notebook's cells when this file is executed directly as a script.
 if __name__ == "__main__":
    app.run()