feat: code added for parsing the sample list by UoAz samples

2025-09-03 13:31:19 +02:00
parent ea772ca0b0
commit c00ff64771
2 changed files with 254 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -174,3 +174,5 @@ cython_debug/
 # PyPI configuration file
 .pypirc

+data/
+__marimo__/
--- a/notebooks/sample_list_chambers_20250202.py
+++ b/notebooks/sample_list_chambers_20250202.py
@@ -0,0 +1,252 @@
+import marimo
+
+__generated_with = "0.14.16"
+app = marimo.App(width="medium")
+
+
+@app.cell
+def _():
+    import marimo as mo
+    import pandas as pd
+    import numpy as np
+    import datetime
+    return mo, np, pd
+
+
+@app.cell
+def _(mo):
+    mo.md(
+        r"""
+    # Generate the Up to Date Sample List for Dr Chambers
+
+    Email request from 29-Aug-25:
+
+    "The Quantgene/Serenomica lab case IDs are unique identifiers for each of our samples. The samples also have our sample ID numbers, which are not unique.
+
+    Can you make sure the lab case ID numbers are still labeled on the tubes which are being sent to us?
+
+
+
+    Lastly, can you get us a full list of plasma samples on our research project (approx. 135, including the last 7 specimens we have been discussing) along with the lab case ID numbers and sample ID numbers? If a date of collection or lab receipt date is available that would also be helpful. I have several partial lists containing this information.   
+
+
+
+    Thank you, Setsuko
+    "
+    """
+    )
+    return
+
+
+@app.cell
+def _():
+    useful_cols = [
+        "Lab Case ID",
+        "Sample ID (on tube)",
+        "Patient ID ",
+        "Date Received",
+        "Sample Comments",
+    ]
+    return (useful_cols,)
+
+
+@app.cell
+def _(mo):
+    mo.md(r"""## (1) Samples from RDX""")
+    return
+
+
+@app.cell
+def _(np, pd):
+    samples_rdx = (
+        pd.read_excel("../data/RDX to Portland Shipment - Sample List.xlsx")
+        .rename(
+            {"Sample ID": "Sample ID (on tube)", "Date Arrived": "Date Received"},
+            axis=1,
+        )
+        .drop("Sort", axis=1)
+    )
+    samples_rdx.loc[
+        samples_rdx["Sample ID (on tube)"].astype(str).str.contains("-AZ-")
+    ]
+    samples_rdx["Sample Comments"] = (
+        samples_rdx[
+            [
+                "Confirmed Sample Receipt in Portland",
+                "Still Need to Investigate",
+                "RDX/Nivi Comments",
+            ]
+        ]
+        .replace(np.nan, "")
+        .astype(str)
+        .apply("; ".join, axis=1)
+    )
+    samples_rdx
+    return (samples_rdx,)
+
+
+@app.cell
+def _(samples_rdx, useful_cols):
+    az_samples_rdx = (
+        samples_rdx.loc[
+            samples_rdx["Sample ID (on tube)"].astype(str).str.contains("-AZ-")
+        ]
+        .copy()
+        .reset_index()
+    )
+    az_samples_rdx["Lab Case ID"] = [None] * len(az_samples_rdx)
+    az_samples_rdx["Patient ID "] = az_samples_rdx["Sample ID (on tube)"].apply(
+        lambda x: "-".join(x.split("-")[:-1])
+    )
+    az_samples_rdx = az_samples_rdx[useful_cols]
+    return (az_samples_rdx,)
+
+
+@app.cell
+def _(az_samples_rdx):
+    # find dups to deal with
+    az_samples_rdx.loc[
+        az_samples_rdx.duplicated(subset="Sample ID (on tube)", keep=False)
+    ].sort_values(by="Sample ID (on tube)")
+    return
+
+
+@app.cell
+def _(az_samples_rdx):
+    drop_idx = az_samples_rdx[
+        az_samples_rdx.duplicated(subset="Sample ID (on tube)", keep=False)
+        & (az_samples_rdx["Sample Comments"].str.contains("No"))
+    ].index
+    az_samples_rdx.drop(drop_idx, inplace=True)
+    return
+
+
+@app.cell
+def _(mo):
+    mo.md(r"""## (2) Samples Recieved at Portland lab""")
+    return
+
+
+@app.cell
+def _(pd):
+    # incoming sample trackers
+    df2022 = pd.read_excel(
+        "../data/Incoming Sample Tracker - Commercial & Clinical Research-2022.xlsx"
+    )
+    df2023 = pd.read_excel(
+        "../data/Incoming Sample Tracker - Commercial & Clinical Research-2023.xlsx"
+    )
+    df2024 = pd.read_excel(
+        "../data/Incoming Sample Tracker - Commercial & Clinical Research-2024.xlsx"
+    )
+    df2025 = pd.read_excel("../data/Incoming Sample Tracker 2025.xlsx")
+    return df2022, df2023, df2024, df2025
+
+
+@app.cell
+def _(df2022, df2023, df2024, df2025, useful_cols):
+    # only the samples with -AZ- i.e. UoAz samples
+    az_2022 = df2022.loc[df2022["Patient ID "].astype(str).str.contains("-AZ-")][
+        useful_cols
+    ]
+    az_2023 = df2023.loc[df2023["Patient ID "].astype(str).str.contains("-AZ-")][
+        useful_cols
+    ]
+    az_2024 = df2024.loc[df2024["Patient ID "].astype(str).str.contains("-AZ-")][
+        useful_cols
+    ]
+    az_2025 = df2025.loc[df2025["Patient ID "].astype(str).str.contains("-AZ-")][
+        useful_cols
+    ]
+    return az_2022, az_2023, az_2024, az_2025
+
+
+@app.cell
+def _(mo):
+    mo.md(r"""## Join all Data and deal with Duplicates""")
+    return
+
+
+@app.cell
+def _(az_2022, az_2023, az_2024, az_2025, az_samples_rdx, pd):
+    # join all data
+    all_az_samples = pd.concat(
+        [az_samples_rdx, az_2022, az_2023, az_2024, az_2025],
+        axis=0,
+        ignore_index=True,
+    )
+    print(all_az_samples.shape)
+    all_az_samples.head()
+    return (all_az_samples,)
+
+
+@app.cell
+def _(all_az_samples):
+    # identify duplicates
+    print(sum(all_az_samples.duplicated(subset="Patient ID ", keep="first")))
+    all_az_samples.loc[
+        all_az_samples.duplicated(
+            subset=["Patient ID ", "Sample ID (on tube)"], keep=False
+        )
+    ].sort_values(by="Patient ID ")
+
+    # drop logic
+    # - remove any dup that doesn't have a lab case ID
+    # - remaining need to stay as these are likely repeated sent samples
+    return
+
+
+@app.cell
+def _(all_az_samples, np):
+    # remove the dups based on the logic above
+    dup_idx = all_az_samples[
+        (
+            all_az_samples.duplicated(
+                subset=["Patient ID ", "Sample ID (on tube)"], keep=False
+            )
+        )
+        & (all_az_samples["Lab Case ID"].replace(np.nan, "") == "")
+    ].index
+    all_az_samples_dedup = (
+        all_az_samples.drop(dup_idx)
+        .sort_values(by=["Patient ID ", "Sample ID (on tube)"])
+        .reset_index(drop=True)
+        .copy()
+    )
+    print(all_az_samples_dedup.shape)
+    return (all_az_samples_dedup,)
+
+
+@app.cell
+def _(all_az_samples_dedup):
+    # mark remaining true duplicates
+    # all_az_samples = all_az_samples.sort_values(
+    #     by=["Patient ID ", "Sample ID (on tube)"], inplace=True
+    # )
+    all_az_samples_dedup["duplicated"] = all_az_samples_dedup.duplicated(
+        subset="Patient ID ", keep=False
+    )
+
+    # clean the date column
+    dates_cleaned = all_az_samples_dedup["Date Received"].apply(
+        lambda x: "No data" if (x != x) or (x == "") else x.strftime("%Y-%m-%d")
+    )
+    all_az_samples_dedup["Date Received"] = dates_cleaned
+
+    all_az_samples_dedup
+    return
+
+
+@app.cell
+def _(all_az_samples_dedup):
+    all_az_samples_dedup.to_excel("../data/az_samples_overview.xlsx", index=False)
+    return
+
+
+@app.cell
+def _():
+    return
+
+
+if __name__ == "__main__":
+    app.run()