feat: code added for parsing the sample list by UoAz samples
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -174,3 +174,5 @@ cython_debug/
|
||||
# PyPI configuration file
|
||||
.pypirc
|
||||
|
||||
data/
|
||||
__marimo__/
|
||||
|
||||
252
notebooks/sample_list_chambers_20250202.py
Normal file
252
notebooks/sample_list_chambers_20250202.py
Normal file
@@ -0,0 +1,252 @@
|
||||
import marimo
|
||||
|
||||
# Version string stored with the notebook file
# (presumably the marimo release that generated it -- TODO confirm).
__generated_with = "0.14.16"
# Application object; each `@app.cell` function below registers itself on it.
app = marimo.App(width="medium")
|
||||
|
||||
|
||||
@app.cell
def _():
    # Shared imports for the notebook; only the names in the return tuple
    # are handed on to other cells.
    import marimo as mo
    import pandas as pd
    import numpy as np

    return mo, np, pd
|
||||
|
||||
|
||||
# Markdown cell: notebook title followed by the quoted e-mail request
# (dated 29-Aug-25) that this notebook answers.
@app.cell
def _(mo):
    mo.md(
        r"""
    # Generate the Up to Date Sample List for Dr Chambers

    Email request from 29-Aug-25:

    "The Quantgene/Serenomica lab case IDs are unique identifiers for each of our samples. The samples also have our sample ID numbers, which are not unique.

    Can you make sure the lab case ID numbers are still labeled on the tubes which are being sent to us?



    Lastly, can you get us a full list of plasma samples on our research project (approx. 135, including the last 7 specimens we have been discussing) along with the lab case ID numbers and sample ID numbers? If a date of collection or lab receipt date is available that would also be helpful. I have several partial lists containing this information.



    Thank you, Setsuko
    "
    """
    )
    return
|
||||
|
||||
|
||||
@app.cell
def _():
    # Columns kept from every source sheet; every per-source frame is
    # reduced to exactly this list before the frames are concatenated.
    useful_cols = [
        "Lab Case ID",
        "Sample ID (on tube)",
        # NOTE: the trailing space is part of the key; the tracker sheets
        # are indexed with the same "Patient ID " spelling further down.
        "Patient ID ",
        "Date Received",
        "Sample Comments",
    ]
    return (useful_cols,)
|
||||
|
||||
|
||||
# Markdown cell: section heading for the RDX shipment list.
@app.cell
def _(mo):
    mo.md(r"""## (1) Samples from RDX""")
    return
|
||||
|
||||
|
||||
@app.cell
def _(np, pd):
    # Load the RDX -> Portland shipment sheet, rename its ID/date columns to
    # the names used by the rest of the notebook and drop the "Sort" column.
    samples_rdx = (
        pd.read_excel("../data/RDX to Portland Shipment - Sample List.xlsx")
        .rename(
            {"Sample ID": "Sample ID (on tube)", "Date Arrived": "Date Received"},
            axis=1,
        )
        .drop("Sort", axis=1)
    )

    # Collapse the three status/comment columns into a single text column:
    # NaN becomes "", everything is cast to str and joined with "; ".
    # (A previous "-AZ-" filter expression here was evaluated and thrown
    # away; the filtering that matters happens in the following cell.)
    samples_rdx["Sample Comments"] = (
        samples_rdx[
            [
                "Confirmed Sample Receipt in Portland",
                "Still Need to Investigate",
                "RDX/Nivi Comments",
            ]
        ]
        .replace(np.nan, "")
        .astype(str)
        .apply("; ".join, axis=1)
    )

    # Last expression = cell output: show the prepared frame.
    samples_rdx
    return (samples_rdx,)
|
||||
|
||||
|
||||
@app.cell
def _(samples_rdx, useful_cols):
    def _patient_from_tube(tube_id):
        """Drop the last "-"-separated field of a tube ID."""
        return "-".join(tube_id.split("-")[:-1])

    # Keep only rows whose tube ID contains "-AZ-".
    _has_az_tag = samples_rdx["Sample ID (on tube)"].astype(str).str.contains("-AZ-")
    az_samples_rdx = samples_rdx.loc[_has_az_tag].copy().reset_index()

    # The RDX sheet carries no lab case ID, so the column is created empty.
    az_samples_rdx["Lab Case ID"] = [None] * len(az_samples_rdx)
    # Patient ID = tube ID without its final dash-separated part.
    az_samples_rdx["Patient ID "] = az_samples_rdx["Sample ID (on tube)"].apply(
        _patient_from_tube
    )

    # Reduce to the shared column layout.
    az_samples_rdx = az_samples_rdx[useful_cols]
    return (az_samples_rdx,)
|
||||
|
||||
|
||||
@app.cell
def _(az_samples_rdx):
    # Display every row whose tube ID occurs more than once, grouped by ID,
    # so the repeats can be reviewed.
    _repeated_tube = az_samples_rdx.duplicated(
        subset="Sample ID (on tube)", keep=False
    )
    az_samples_rdx[_repeated_tube].sort_values(by="Sample ID (on tube)")
    return
|
||||
|
||||
|
||||
@app.cell
def _(az_samples_rdx):
    # Among rows sharing a tube ID, remove those whose merged comment text
    # contains "No".
    # NOTE(review): this is a case-sensitive substring match, so any comment
    # containing the letters "No" qualifies -- confirm that is intended.
    # NOTE(review): the frame is modified in place although it is defined in
    # another cell; later cells see the change only if this cell ran first.
    _repeated_tube = az_samples_rdx.duplicated(
        subset="Sample ID (on tube)", keep=False
    )
    _comment_has_no = az_samples_rdx["Sample Comments"].str.contains("No")
    _rows_to_remove = az_samples_rdx[_repeated_tube & _comment_has_no].index
    az_samples_rdx.drop(_rows_to_remove, inplace=True)
    return
|
||||
|
||||
|
||||
# Markdown cell: section heading for the Portland incoming-sample trackers.
# (Heading spelling corrected: "Recieved" -> "Received".)
@app.cell
def _(mo):
    mo.md(r"""## (2) Samples Received at Portland lab""")
    return
|
||||
|
||||
|
||||
@app.cell
def _(pd):
    # Incoming sample trackers, one workbook per year (2022 through 2025).
    _tracker_files = (
        "../data/Incoming Sample Tracker - Commercial & Clinical Research-2022.xlsx",
        "../data/Incoming Sample Tracker - Commercial & Clinical Research-2023.xlsx",
        "../data/Incoming Sample Tracker - Commercial & Clinical Research-2024.xlsx",
        "../data/Incoming Sample Tracker 2025.xlsx",
    )
    # Read the workbooks in the same order as before and bind them by year.
    df2022, df2023, df2024, df2025 = [
        pd.read_excel(_workbook) for _workbook in _tracker_files
    ]
    return df2022, df2023, df2024, df2025
|
||||
|
||||
|
||||
@app.cell
def _(df2022, df2023, df2024, df2025, useful_cols):
    def _uoaz_rows(tracker):
        """Return rows whose patient ID contains "-AZ-", limited to useful_cols."""
        is_az = tracker["Patient ID "].astype(str).str.contains("-AZ-")
        return tracker.loc[is_az][useful_cols]

    # only the samples with -AZ- i.e. UoAz samples, one extract per year
    az_2022 = _uoaz_rows(df2022)
    az_2023 = _uoaz_rows(df2023)
    az_2024 = _uoaz_rows(df2024)
    az_2025 = _uoaz_rows(df2025)
    return az_2022, az_2023, az_2024, az_2025
|
||||
|
||||
|
||||
# Markdown cell: section heading for combining the sources and de-duplicating.
@app.cell
def _(mo):
    mo.md(r"""## Join all Data and deal with Duplicates""")
    return
|
||||
|
||||
|
||||
@app.cell
def _(az_2022, az_2023, az_2024, az_2025, az_samples_rdx, pd):
    # Stack the RDX extract and the four yearly tracker extracts row-wise
    # under a fresh 0..n-1 index.
    _sources = [az_samples_rdx, az_2022, az_2023, az_2024, az_2025]
    all_az_samples = pd.concat(_sources, axis=0, ignore_index=True)

    # Report the combined size and preview the first rows as cell output.
    print(all_az_samples.shape)
    all_az_samples.head()
    return (all_az_samples,)
|
||||
|
||||
|
||||
@app.cell
def _(all_az_samples):
    # drop logic (applied in the next cell)
    # - remove any dup that doesn't have a lab case ID
    # - remaining need to stay as these are likely repeated sent samples

    # Count rows whose patient ID already appeared earlier in the frame.
    _seen_before = all_az_samples.duplicated(subset="Patient ID ", keep="first")
    print(sum(_seen_before))

    # Show every row that shares both patient ID and tube ID with another row.
    _same_patient_and_tube = all_az_samples.duplicated(
        subset=["Patient ID ", "Sample ID (on tube)"], keep=False
    )
    all_az_samples[_same_patient_and_tube].sort_values(by="Patient ID ")
    return
|
||||
|
||||
|
||||
@app.cell
def _(all_az_samples, np):
    _id_pair = ["Patient ID ", "Sample ID (on tube)"]

    # A row is dropped when it shares patient ID + tube ID with another row
    # AND has no lab case ID (NaN or empty string).
    _is_repeat = all_az_samples.duplicated(subset=_id_pair, keep=False)
    _lacks_case_id = all_az_samples["Lab Case ID"].replace(np.nan, "") == ""
    _rows_to_remove = all_az_samples[_is_repeat & _lacks_case_id].index

    # Remove those rows, order by patient then tube ID, and renumber.
    _kept = all_az_samples.drop(_rows_to_remove)
    _kept = _kept.sort_values(by=_id_pair)
    all_az_samples_dedup = _kept.reset_index(drop=True).copy()

    print(all_az_samples_dedup.shape)
    return (all_az_samples_dedup,)
|
||||
|
||||
|
||||
@app.cell
def _(all_az_samples_dedup):
    def _format_received(value):
        """Render one "Date Received" entry as text for the exported sheet.

        Missing entries (None, NaN/NaT, empty string) become "No data";
        objects with a ``strftime`` method become "YYYY-MM-DD"; anything
        else is kept as stripped text.
        """
        # NaN and NaT are the values that compare unequal to themselves.
        if value is None or value != value or value == "":
            return "No data"
        if hasattr(value, "strftime"):
            return value.strftime("%Y-%m-%d")
        # Free-text dates, or values already formatted by an earlier run of
        # this cell, used to raise AttributeError here; pass them through.
        text = str(value).strip()
        return text if text else "No data"

    # mark remaining true duplicates: every row whose patient ID still
    # occurs more than once after the de-duplication step
    all_az_samples_dedup["duplicated"] = all_az_samples_dedup.duplicated(
        subset="Patient ID ", keep=False
    )

    # clean the date column
    all_az_samples_dedup["Date Received"] = all_az_samples_dedup[
        "Date Received"
    ].apply(_format_received)

    # Last expression = cell output: show the final table.
    all_az_samples_dedup
    return
|
||||
|
||||
|
||||
@app.cell
def _(all_az_samples_dedup):
    # Export the de-duplicated overview as an Excel workbook without the
    # row index column.
    _overview_path = "../data/az_samples_overview.xlsx"
    all_az_samples_dedup.to_excel(_overview_path, index=False)
    return
|
||||
|
||||
|
||||
# Empty cell: takes no inputs, defines nothing and produces no output.
@app.cell
def _():
    return
|
||||
|
||||
|
||||
# Run the notebook's app when this file is executed directly as a script.
if __name__ == "__main__":
    app.run()
|
||||
Reference in New Issue
Block a user