diff --git a/.gitignore b/.gitignore index 36b13f1..8cd6131 100644 --- a/.gitignore +++ b/.gitignore @@ -174,3 +174,5 @@ cython_debug/ # PyPI configuration file .pypirc +data/ +__marimo__/ diff --git a/notebooks/sample_list_chambers_20250202.py b/notebooks/sample_list_chambers_20250202.py new file mode 100644 index 0000000..d35ddd7 --- /dev/null +++ b/notebooks/sample_list_chambers_20250202.py @@ -0,0 +1,252 @@ +import marimo + +__generated_with = "0.14.16" +app = marimo.App(width="medium") + + +@app.cell +def _(): + import marimo as mo + import pandas as pd + import numpy as np + import datetime + return mo, np, pd + + +@app.cell +def _(mo): + mo.md( + r""" + # Generate the Up to Date Sample List for Dr Chambers + + Email request from 29-Aug-25: + + "The Quantgene/Serenomica lab case IDs are unique identifiers for each of our samples. The samples also have our sample ID numbers, which are not unique. + + Can you make sure the lab case ID numbers are still labeled on the tubes which are being sent to us? + + + + Lastly, can you get us a full list of plasma samples on our research project (approx. 135, including the last 7 specimens we have been discussing) along with the lab case ID numbers and sample ID numbers? If a date of collection or lab receipt date is available that would also be helpful. I have several partial lists containing this information. + + + + Thank you, Setsuko + " + """ + ) + return + + +@app.cell +def _(): + useful_cols = [ + "Lab Case ID", + "Sample ID (on tube)", + "Patient ID ", + "Date Received", + "Sample Comments", + ] + return (useful_cols,) + + +@app.cell +def _(mo): + mo.md(r"""## (1) Samples from RDX""") + return + + +@app.cell +def _(np, pd): + samples_rdx = ( + pd.read_excel("../data/RDX to Portland Shipment - Sample List.xlsx") + .rename( + {"Sample ID": "Sample ID (on tube)", "Date Arrived": "Date Received"}, + axis=1, + ) + .drop("Sort", axis=1) + ) + samples_rdx.loc[ + samples_rdx["Sample ID (on tube)"].astype(str).str.contains("-AZ-") + ] + samples_rdx["Sample Comments"] = ( + samples_rdx[ + [ + "Confirmed Sample Receipt in Portland", + "Still Need to Investigate", + "RDX/Nivi Comments", + ] + ] + .replace(np.nan, "") + .astype(str) + .apply("; ".join, axis=1) + ) + samples_rdx + return (samples_rdx,) + + +@app.cell +def _(samples_rdx, useful_cols): + az_samples_rdx = ( + samples_rdx.loc[ + samples_rdx["Sample ID (on tube)"].astype(str).str.contains("-AZ-") + ] + .copy() + .reset_index() + ) + az_samples_rdx["Lab Case ID"] = [None] * len(az_samples_rdx) + az_samples_rdx["Patient ID "] = az_samples_rdx["Sample ID (on tube)"].apply( + lambda x: "-".join(x.split("-")[:-1]) + ) + az_samples_rdx = az_samples_rdx[useful_cols] + return (az_samples_rdx,) + + +@app.cell +def _(az_samples_rdx): + # find dups to deal with + az_samples_rdx.loc[ + az_samples_rdx.duplicated(subset="Sample ID (on tube)", keep=False) + ].sort_values(by="Sample ID (on tube)") + return + + +@app.cell +def _(az_samples_rdx): + drop_idx = az_samples_rdx[ + az_samples_rdx.duplicated(subset="Sample ID (on tube)", keep=False) + & (az_samples_rdx["Sample Comments"].str.contains("No")) + ].index + az_samples_rdx.drop(drop_idx, inplace=True) + return + + +@app.cell +def _(mo): + mo.md(r"""## (2) Samples Recieved at Portland lab""") + return + + +@app.cell +def _(pd): + # incoming sample trackers + df2022 = pd.read_excel( + "../data/Incoming Sample Tracker - Commercial & Clinical Research-2022.xlsx" + ) + df2023 = pd.read_excel( + "../data/Incoming Sample Tracker - Commercial & Clinical Research-2023.xlsx" + ) + df2024 = pd.read_excel( + "../data/Incoming Sample Tracker - Commercial & Clinical Research-2024.xlsx" + ) + df2025 = pd.read_excel("../data/Incoming Sample Tracker 2025.xlsx") + return df2022, df2023, df2024, df2025 + + +@app.cell +def _(df2022, df2023, df2024, df2025, useful_cols): + # only the samples with -AZ- i.e. UoAz samples + az_2022 = df2022.loc[df2022["Patient ID "].astype(str).str.contains("-AZ-")][ + useful_cols + ] + az_2023 = df2023.loc[df2023["Patient ID "].astype(str).str.contains("-AZ-")][ + useful_cols + ] + az_2024 = df2024.loc[df2024["Patient ID "].astype(str).str.contains("-AZ-")][ + useful_cols + ] + az_2025 = df2025.loc[df2025["Patient ID "].astype(str).str.contains("-AZ-")][ + useful_cols + ] + return az_2022, az_2023, az_2024, az_2025 + + +@app.cell +def _(mo): + mo.md(r"""## Join all Data and deal with Duplicates""") + return + + +@app.cell +def _(az_2022, az_2023, az_2024, az_2025, az_samples_rdx, pd): + # join all data + all_az_samples = pd.concat( + [az_samples_rdx, az_2022, az_2023, az_2024, az_2025], + axis=0, + ignore_index=True, + ) + print(all_az_samples.shape) + all_az_samples.head() + return (all_az_samples,) + + +@app.cell +def _(all_az_samples): + # identify duplicates + print(sum(all_az_samples.duplicated(subset="Patient ID ", keep="first"))) + all_az_samples.loc[ + all_az_samples.duplicated( + subset=["Patient ID ", "Sample ID (on tube)"], keep=False + ) + ].sort_values(by="Patient ID ") + + # drop logic + # - remove any dup that doesn't have a lab case ID + # - remaining need to stay as these are likely repeated sent samples + return + + +@app.cell +def _(all_az_samples, np): + # remove the dups based on the logic above + dup_idx = all_az_samples[ + ( + all_az_samples.duplicated( + subset=["Patient ID ", "Sample ID (on tube)"], keep=False + ) + ) + & (all_az_samples["Lab Case ID"].replace(np.nan, "") == "") + ].index + all_az_samples_dedup = ( + all_az_samples.drop(dup_idx) + .sort_values(by=["Patient ID ", "Sample ID (on tube)"]) + .reset_index(drop=True) + .copy() + ) + print(all_az_samples_dedup.shape) + return (all_az_samples_dedup,) + + +@app.cell +def _(all_az_samples_dedup): + # mark remaining true duplicates + # all_az_samples = all_az_samples.sort_values( + # by=["Patient ID ", "Sample ID (on tube)"], inplace=True + # ) + all_az_samples_dedup["duplicated"] = all_az_samples_dedup.duplicated( + subset="Patient ID ", keep=False + ) + + # clean the date column + dates_cleaned = all_az_samples_dedup["Date Received"].apply( + lambda x: "No data" if (x != x) or (x == "") else x.strftime("%Y-%m-%d") + ) + all_az_samples_dedup["Date Received"] = dates_cleaned + + all_az_samples_dedup + return + + +@app.cell +def _(all_az_samples_dedup): + all_az_samples_dedup.to_excel("../data/az_samples_overview.xlsx", index=False) + return + + +@app.cell +def _(): + return + + +if __name__ == "__main__": + app.run()