import marimo __generated_with = "0.14.16" app = marimo.App(width="medium") @app.cell def _(): import marimo as mo import pandas as pd import numpy as np import datetime return mo, np, pd @app.cell def _(mo): mo.md( r""" # Generate the Up to Date Sample List for Dr Chambers Email request from 29-Aug-25: "The Quantgene/Serenomica lab case IDs are unique identifiers for each of our samples. The samples also have our sample ID numbers, which are not unique. Can you make sure the lab case ID numbers are still labeled on the tubes which are being sent to us? Lastly, can you get us a full list of plasma samples on our research project (approx. 135, including the last 7 specimens we have been discussing) along with the lab case ID numbers and sample ID numbers? If a date of collection or lab receipt date is available that would also be helpful. I have several partial lists containing this information. Thank you, Setsuko " """ ) return @app.cell def _(): useful_cols = [ "Lab Case ID", "Sample ID (on tube)", "Patient ID ", "Date Received", "Sample Comments", ] return (useful_cols,) @app.cell def _(mo): mo.md(r"""## (1) Samples from RDX""") return @app.cell def _(np, pd): samples_rdx = ( pd.read_excel("../data/RDX to Portland Shipment - Sample List.xlsx") .rename( {"Sample ID": "Sample ID (on tube)", "Date Arrived": "Date Received"}, axis=1, ) .drop("Sort", axis=1) ) samples_rdx.loc[ samples_rdx["Sample ID (on tube)"].astype(str).str.contains("-AZ-") ] samples_rdx["Sample Comments"] = ( samples_rdx[ [ "Confirmed Sample Receipt in Portland", "Still Need to Investigate", "RDX/Nivi Comments", ] ] .replace(np.nan, "") .astype(str) .apply("; ".join, axis=1) ) samples_rdx return (samples_rdx,) @app.cell def _(samples_rdx, useful_cols): az_samples_rdx = ( samples_rdx.loc[ samples_rdx["Sample ID (on tube)"].astype(str).str.contains("-AZ-") ] .copy() .reset_index() ) az_samples_rdx["Lab Case ID"] = [None] * len(az_samples_rdx) az_samples_rdx["Patient ID "] = az_samples_rdx["Sample ID (on tube)"].apply( lambda x: "-".join(x.split("-")[:-1]) ) az_samples_rdx = az_samples_rdx[useful_cols] return (az_samples_rdx,) @app.cell def _(az_samples_rdx): # find dups to deal with az_samples_rdx.loc[ az_samples_rdx.duplicated(subset="Sample ID (on tube)", keep=False) ].sort_values(by="Sample ID (on tube)") return @app.cell def _(az_samples_rdx): drop_idx = az_samples_rdx[ az_samples_rdx.duplicated(subset="Sample ID (on tube)", keep=False) & (az_samples_rdx["Sample Comments"].str.contains("No")) ].index az_samples_rdx.drop(drop_idx, inplace=True) return @app.cell def _(mo): mo.md(r"""## (2) Samples Recieved at Portland lab""") return @app.cell def _(pd): # incoming sample trackers df2022 = pd.read_excel( "../data/Incoming Sample Tracker - Commercial & Clinical Research-2022.xlsx" ) df2023 = pd.read_excel( "../data/Incoming Sample Tracker - Commercial & Clinical Research-2023.xlsx" ) df2024 = pd.read_excel( "../data/Incoming Sample Tracker - Commercial & Clinical Research-2024.xlsx" ) df2025 = pd.read_excel("../data/Incoming Sample Tracker 2025.xlsx") return df2022, df2023, df2024, df2025 @app.cell def _(df2022, df2023, df2024, df2025, useful_cols): # only the samples with -AZ- i.e. UoAz samples az_2022 = df2022.loc[df2022["Patient ID "].astype(str).str.contains("-AZ-")][ useful_cols ] az_2023 = df2023.loc[df2023["Patient ID "].astype(str).str.contains("-AZ-")][ useful_cols ] az_2024 = df2024.loc[df2024["Patient ID "].astype(str).str.contains("-AZ-")][ useful_cols ] az_2025 = df2025.loc[df2025["Patient ID "].astype(str).str.contains("-AZ-")][ useful_cols ] return az_2022, az_2023, az_2024, az_2025 @app.cell def _(mo): mo.md(r"""## Join all Data and deal with Duplicates""") return @app.cell def _(az_2022, az_2023, az_2024, az_2025, az_samples_rdx, pd): # join all data all_az_samples = pd.concat( [az_samples_rdx, az_2022, az_2023, az_2024, az_2025], axis=0, ignore_index=True, ) print(all_az_samples.shape) all_az_samples.head() return (all_az_samples,) @app.cell def _(all_az_samples): # identify duplicates print(sum(all_az_samples.duplicated(subset="Patient ID ", keep="first"))) all_az_samples.loc[ all_az_samples.duplicated( subset=["Patient ID ", "Sample ID (on tube)"], keep=False ) ].sort_values(by="Patient ID ") # drop logic # - remove any dup that doesn't have a lab case ID # - remaining need to stay as these are likely repeated sent samples return @app.cell def _(all_az_samples, np): # remove the dups based on the logic above dup_idx = all_az_samples[ ( all_az_samples.duplicated( subset=["Patient ID ", "Sample ID (on tube)"], keep=False ) ) & (all_az_samples["Lab Case ID"].replace(np.nan, "") == "") ].index all_az_samples_dedup = ( all_az_samples.drop(dup_idx) .sort_values(by=["Patient ID ", "Sample ID (on tube)"]) .reset_index(drop=True) .copy() ) print(all_az_samples_dedup.shape) return (all_az_samples_dedup,) @app.cell def _(all_az_samples_dedup): # mark remaining true duplicates # all_az_samples = all_az_samples.sort_values( # by=["Patient ID ", "Sample ID (on tube)"], inplace=True # ) all_az_samples_dedup["duplicated"] = all_az_samples_dedup.duplicated( subset="Patient ID ", keep=False ) # clean the date column dates_cleaned = all_az_samples_dedup["Date Received"].apply( lambda x: "No data" if (x != x) or (x == "") else x.strftime("%Y-%m-%d") ) all_az_samples_dedup["Date Received"] = dates_cleaned all_az_samples_dedup return @app.cell def _(all_az_samples_dedup): all_az_samples_dedup.to_excel("../data/az_samples_overview.xlsx", index=False) return @app.cell def _(): return if __name__ == "__main__": app.run()