feat: code added for parsing the sample list by UoAz samples

This commit is contained in:
2025-09-03 13:31:19 +02:00
parent ea772ca0b0
commit c00ff64771
2 changed files with 254 additions and 0 deletions

View File

@@ -0,0 +1,252 @@
import marimo
__generated_with = "0.14.16"
app = marimo.App(width="medium")
@app.cell
def _():
import marimo as mo
import pandas as pd
import numpy as np
import datetime
return mo, np, pd
@app.cell
def _(mo):
mo.md(
r"""
# Generate the Up to Date Sample List for Dr Chambers
Email request from 29-Aug-25:
"The Quantgene/Serenomica lab case IDs are unique identifiers for each of our samples. The samples also have our sample ID numbers, which are not unique.
Can you make sure the lab case ID numbers are still labeled on the tubes which are being sent to us?
Lastly, can you get us a full list of plasma samples on our research project (approx. 135, including the last 7 specimens we have been discussing) along with the lab case ID numbers and sample ID numbers? If a date of collection or lab receipt date is available that would also be helpful. I have several partial lists containing this information.
Thank you, Setsuko
"
"""
)
return
@app.cell
def _():
useful_cols = [
"Lab Case ID",
"Sample ID (on tube)",
"Patient ID ",
"Date Received",
"Sample Comments",
]
return (useful_cols,)
@app.cell
def _(mo):
mo.md(r"""## (1) Samples from RDX""")
return
@app.cell
def _(np, pd):
samples_rdx = (
pd.read_excel("../data/RDX to Portland Shipment - Sample List.xlsx")
.rename(
{"Sample ID": "Sample ID (on tube)", "Date Arrived": "Date Received"},
axis=1,
)
.drop("Sort", axis=1)
)
samples_rdx.loc[
samples_rdx["Sample ID (on tube)"].astype(str).str.contains("-AZ-")
]
samples_rdx["Sample Comments"] = (
samples_rdx[
[
"Confirmed Sample Receipt in Portland",
"Still Need to Investigate",
"RDX/Nivi Comments",
]
]
.replace(np.nan, "")
.astype(str)
.apply("; ".join, axis=1)
)
samples_rdx
return (samples_rdx,)
@app.cell
def _(samples_rdx, useful_cols):
az_samples_rdx = (
samples_rdx.loc[
samples_rdx["Sample ID (on tube)"].astype(str).str.contains("-AZ-")
]
.copy()
.reset_index()
)
az_samples_rdx["Lab Case ID"] = [None] * len(az_samples_rdx)
az_samples_rdx["Patient ID "] = az_samples_rdx["Sample ID (on tube)"].apply(
lambda x: "-".join(x.split("-")[:-1])
)
az_samples_rdx = az_samples_rdx[useful_cols]
return (az_samples_rdx,)
@app.cell
def _(az_samples_rdx):
# find dups to deal with
az_samples_rdx.loc[
az_samples_rdx.duplicated(subset="Sample ID (on tube)", keep=False)
].sort_values(by="Sample ID (on tube)")
return
@app.cell
def _(az_samples_rdx):
drop_idx = az_samples_rdx[
az_samples_rdx.duplicated(subset="Sample ID (on tube)", keep=False)
& (az_samples_rdx["Sample Comments"].str.contains("No"))
].index
az_samples_rdx.drop(drop_idx, inplace=True)
return
@app.cell
def _(mo):
mo.md(r"""## (2) Samples Recieved at Portland lab""")
return
@app.cell
def _(pd):
# incoming sample trackers
df2022 = pd.read_excel(
"../data/Incoming Sample Tracker - Commercial & Clinical Research-2022.xlsx"
)
df2023 = pd.read_excel(
"../data/Incoming Sample Tracker - Commercial & Clinical Research-2023.xlsx"
)
df2024 = pd.read_excel(
"../data/Incoming Sample Tracker - Commercial & Clinical Research-2024.xlsx"
)
df2025 = pd.read_excel("../data/Incoming Sample Tracker 2025.xlsx")
return df2022, df2023, df2024, df2025
@app.cell
def _(df2022, df2023, df2024, df2025, useful_cols):
# only the samples with -AZ- i.e. UoAz samples
az_2022 = df2022.loc[df2022["Patient ID "].astype(str).str.contains("-AZ-")][
useful_cols
]
az_2023 = df2023.loc[df2023["Patient ID "].astype(str).str.contains("-AZ-")][
useful_cols
]
az_2024 = df2024.loc[df2024["Patient ID "].astype(str).str.contains("-AZ-")][
useful_cols
]
az_2025 = df2025.loc[df2025["Patient ID "].astype(str).str.contains("-AZ-")][
useful_cols
]
return az_2022, az_2023, az_2024, az_2025
@app.cell
def _(mo):
mo.md(r"""## Join all Data and deal with Duplicates""")
return
@app.cell
def _(az_2022, az_2023, az_2024, az_2025, az_samples_rdx, pd):
# join all data
all_az_samples = pd.concat(
[az_samples_rdx, az_2022, az_2023, az_2024, az_2025],
axis=0,
ignore_index=True,
)
print(all_az_samples.shape)
all_az_samples.head()
return (all_az_samples,)
@app.cell
def _(all_az_samples):
# identify duplicates
print(sum(all_az_samples.duplicated(subset="Patient ID ", keep="first")))
all_az_samples.loc[
all_az_samples.duplicated(
subset=["Patient ID ", "Sample ID (on tube)"], keep=False
)
].sort_values(by="Patient ID ")
# drop logic
# - remove any dup that doesn't have a lab case ID
# - remaining need to stay as these are likely repeated sent samples
return
@app.cell
def _(all_az_samples, np):
# remove the dups based on the logic above
dup_idx = all_az_samples[
(
all_az_samples.duplicated(
subset=["Patient ID ", "Sample ID (on tube)"], keep=False
)
)
& (all_az_samples["Lab Case ID"].replace(np.nan, "") == "")
].index
all_az_samples_dedup = (
all_az_samples.drop(dup_idx)
.sort_values(by=["Patient ID ", "Sample ID (on tube)"])
.reset_index(drop=True)
.copy()
)
print(all_az_samples_dedup.shape)
return (all_az_samples_dedup,)
@app.cell
def _(all_az_samples_dedup):
# mark remaining true duplicates
# all_az_samples = all_az_samples.sort_values(
# by=["Patient ID ", "Sample ID (on tube)"], inplace=True
# )
all_az_samples_dedup["duplicated"] = all_az_samples_dedup.duplicated(
subset="Patient ID ", keep=False
)
# clean the date column
dates_cleaned = all_az_samples_dedup["Date Received"].apply(
lambda x: "No data" if (x != x) or (x == "") else x.strftime("%Y-%m-%d")
)
all_az_samples_dedup["Date Received"] = dates_cleaned
all_az_samples_dedup
return
@app.cell
def _(all_az_samples_dedup):
all_az_samples_dedup.to_excel("../data/az_samples_overview.xlsx", index=False)
return
@app.cell
def _():
return
if __name__ == "__main__":
app.run()