feat: code added for parsing the sample list by UoAz samples
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -174,3 +174,5 @@ cython_debug/
|
|||||||
# PyPI configuration file
|
# PyPI configuration file
|
||||||
.pypirc
|
.pypirc
|
||||||
|
|
||||||
|
data/
|
||||||
|
__marimo__/
|
||||||
|
|||||||
252
notebooks/sample_list_chambers_20250202.py
Normal file
252
notebooks/sample_list_chambers_20250202.py
Normal file
@@ -0,0 +1,252 @@
|
|||||||
|
import marimo
|
||||||
|
|
||||||
|
__generated_with = "0.14.16"
|
||||||
|
app = marimo.App(width="medium")
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
|
||||||
|
def _():
|
||||||
|
import marimo as mo
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import datetime
|
||||||
|
return mo, np, pd
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
|
||||||
|
def _(mo):
|
||||||
|
mo.md(
|
||||||
|
r"""
|
||||||
|
# Generate the Up to Date Sample List for Dr Chambers
|
||||||
|
|
||||||
|
Email request from 29-Aug-25:
|
||||||
|
|
||||||
|
"The Quantgene/Serenomica lab case IDs are unique identifiers for each of our samples. The samples also have our sample ID numbers, which are not unique.
|
||||||
|
|
||||||
|
Can you make sure the lab case ID numbers are still labeled on the tubes which are being sent to us?
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Lastly, can you get us a full list of plasma samples on our research project (approx. 135, including the last 7 specimens we have been discussing) along with the lab case ID numbers and sample ID numbers? If a date of collection or lab receipt date is available that would also be helpful. I have several partial lists containing this information.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Thank you, Setsuko
|
||||||
|
"
|
||||||
|
"""
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
|
||||||
|
def _():
|
||||||
|
useful_cols = [
|
||||||
|
"Lab Case ID",
|
||||||
|
"Sample ID (on tube)",
|
||||||
|
"Patient ID ",
|
||||||
|
"Date Received",
|
||||||
|
"Sample Comments",
|
||||||
|
]
|
||||||
|
return (useful_cols,)
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
|
||||||
|
def _(mo):
|
||||||
|
mo.md(r"""## (1) Samples from RDX""")
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
|
||||||
|
def _(np, pd):
|
||||||
|
samples_rdx = (
|
||||||
|
pd.read_excel("../data/RDX to Portland Shipment - Sample List.xlsx")
|
||||||
|
.rename(
|
||||||
|
{"Sample ID": "Sample ID (on tube)", "Date Arrived": "Date Received"},
|
||||||
|
axis=1,
|
||||||
|
)
|
||||||
|
.drop("Sort", axis=1)
|
||||||
|
)
|
||||||
|
samples_rdx.loc[
|
||||||
|
samples_rdx["Sample ID (on tube)"].astype(str).str.contains("-AZ-")
|
||||||
|
]
|
||||||
|
samples_rdx["Sample Comments"] = (
|
||||||
|
samples_rdx[
|
||||||
|
[
|
||||||
|
"Confirmed Sample Receipt in Portland",
|
||||||
|
"Still Need to Investigate",
|
||||||
|
"RDX/Nivi Comments",
|
||||||
|
]
|
||||||
|
]
|
||||||
|
.replace(np.nan, "")
|
||||||
|
.astype(str)
|
||||||
|
.apply("; ".join, axis=1)
|
||||||
|
)
|
||||||
|
samples_rdx
|
||||||
|
return (samples_rdx,)
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
|
||||||
|
def _(samples_rdx, useful_cols):
|
||||||
|
az_samples_rdx = (
|
||||||
|
samples_rdx.loc[
|
||||||
|
samples_rdx["Sample ID (on tube)"].astype(str).str.contains("-AZ-")
|
||||||
|
]
|
||||||
|
.copy()
|
||||||
|
.reset_index()
|
||||||
|
)
|
||||||
|
az_samples_rdx["Lab Case ID"] = [None] * len(az_samples_rdx)
|
||||||
|
az_samples_rdx["Patient ID "] = az_samples_rdx["Sample ID (on tube)"].apply(
|
||||||
|
lambda x: "-".join(x.split("-")[:-1])
|
||||||
|
)
|
||||||
|
az_samples_rdx = az_samples_rdx[useful_cols]
|
||||||
|
return (az_samples_rdx,)
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
|
||||||
|
def _(az_samples_rdx):
|
||||||
|
# find dups to deal with
|
||||||
|
az_samples_rdx.loc[
|
||||||
|
az_samples_rdx.duplicated(subset="Sample ID (on tube)", keep=False)
|
||||||
|
].sort_values(by="Sample ID (on tube)")
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
|
||||||
|
def _(az_samples_rdx):
|
||||||
|
drop_idx = az_samples_rdx[
|
||||||
|
az_samples_rdx.duplicated(subset="Sample ID (on tube)", keep=False)
|
||||||
|
& (az_samples_rdx["Sample Comments"].str.contains("No"))
|
||||||
|
].index
|
||||||
|
az_samples_rdx.drop(drop_idx, inplace=True)
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
|
||||||
|
def _(mo):
|
||||||
|
mo.md(r"""## (2) Samples Recieved at Portland lab""")
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
|
||||||
|
def _(pd):
|
||||||
|
# incoming sample trackers
|
||||||
|
df2022 = pd.read_excel(
|
||||||
|
"../data/Incoming Sample Tracker - Commercial & Clinical Research-2022.xlsx"
|
||||||
|
)
|
||||||
|
df2023 = pd.read_excel(
|
||||||
|
"../data/Incoming Sample Tracker - Commercial & Clinical Research-2023.xlsx"
|
||||||
|
)
|
||||||
|
df2024 = pd.read_excel(
|
||||||
|
"../data/Incoming Sample Tracker - Commercial & Clinical Research-2024.xlsx"
|
||||||
|
)
|
||||||
|
df2025 = pd.read_excel("../data/Incoming Sample Tracker 2025.xlsx")
|
||||||
|
return df2022, df2023, df2024, df2025
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
|
||||||
|
def _(df2022, df2023, df2024, df2025, useful_cols):
|
||||||
|
# only the samples with -AZ- i.e. UoAz samples
|
||||||
|
az_2022 = df2022.loc[df2022["Patient ID "].astype(str).str.contains("-AZ-")][
|
||||||
|
useful_cols
|
||||||
|
]
|
||||||
|
az_2023 = df2023.loc[df2023["Patient ID "].astype(str).str.contains("-AZ-")][
|
||||||
|
useful_cols
|
||||||
|
]
|
||||||
|
az_2024 = df2024.loc[df2024["Patient ID "].astype(str).str.contains("-AZ-")][
|
||||||
|
useful_cols
|
||||||
|
]
|
||||||
|
az_2025 = df2025.loc[df2025["Patient ID "].astype(str).str.contains("-AZ-")][
|
||||||
|
useful_cols
|
||||||
|
]
|
||||||
|
return az_2022, az_2023, az_2024, az_2025
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
|
||||||
|
def _(mo):
|
||||||
|
mo.md(r"""## Join all Data and deal with Duplicates""")
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
|
||||||
|
def _(az_2022, az_2023, az_2024, az_2025, az_samples_rdx, pd):
|
||||||
|
# join all data
|
||||||
|
all_az_samples = pd.concat(
|
||||||
|
[az_samples_rdx, az_2022, az_2023, az_2024, az_2025],
|
||||||
|
axis=0,
|
||||||
|
ignore_index=True,
|
||||||
|
)
|
||||||
|
print(all_az_samples.shape)
|
||||||
|
all_az_samples.head()
|
||||||
|
return (all_az_samples,)
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
|
||||||
|
def _(all_az_samples):
|
||||||
|
# identify duplicates
|
||||||
|
print(sum(all_az_samples.duplicated(subset="Patient ID ", keep="first")))
|
||||||
|
all_az_samples.loc[
|
||||||
|
all_az_samples.duplicated(
|
||||||
|
subset=["Patient ID ", "Sample ID (on tube)"], keep=False
|
||||||
|
)
|
||||||
|
].sort_values(by="Patient ID ")
|
||||||
|
|
||||||
|
# drop logic
|
||||||
|
# - remove any dup that doesn't have a lab case ID
|
||||||
|
# - remaining need to stay as these are likely repeated sent samples
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
|
||||||
|
def _(all_az_samples, np):
|
||||||
|
# remove the dups based on the logic above
|
||||||
|
dup_idx = all_az_samples[
|
||||||
|
(
|
||||||
|
all_az_samples.duplicated(
|
||||||
|
subset=["Patient ID ", "Sample ID (on tube)"], keep=False
|
||||||
|
)
|
||||||
|
)
|
||||||
|
& (all_az_samples["Lab Case ID"].replace(np.nan, "") == "")
|
||||||
|
].index
|
||||||
|
all_az_samples_dedup = (
|
||||||
|
all_az_samples.drop(dup_idx)
|
||||||
|
.sort_values(by=["Patient ID ", "Sample ID (on tube)"])
|
||||||
|
.reset_index(drop=True)
|
||||||
|
.copy()
|
||||||
|
)
|
||||||
|
print(all_az_samples_dedup.shape)
|
||||||
|
return (all_az_samples_dedup,)
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
|
||||||
|
def _(all_az_samples_dedup):
|
||||||
|
# mark remaining true duplicates
|
||||||
|
# all_az_samples = all_az_samples.sort_values(
|
||||||
|
# by=["Patient ID ", "Sample ID (on tube)"], inplace=True
|
||||||
|
# )
|
||||||
|
all_az_samples_dedup["duplicated"] = all_az_samples_dedup.duplicated(
|
||||||
|
subset="Patient ID ", keep=False
|
||||||
|
)
|
||||||
|
|
||||||
|
# clean the date column
|
||||||
|
dates_cleaned = all_az_samples_dedup["Date Received"].apply(
|
||||||
|
lambda x: "No data" if (x != x) or (x == "") else x.strftime("%Y-%m-%d")
|
||||||
|
)
|
||||||
|
all_az_samples_dedup["Date Received"] = dates_cleaned
|
||||||
|
|
||||||
|
all_az_samples_dedup
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
|
||||||
|
def _(all_az_samples_dedup):
|
||||||
|
all_az_samples_dedup.to_excel("../data/az_samples_overview.xlsx", index=False)
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
|
||||||
|
def _():
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
app.run()
|
||||||
Reference in New Issue
Block a user