feat: eda analysis continued and samples from QG search added
This commit is contained in:
116
notebooks/find_qg_dot_mutations.py
Normal file
116
notebooks/find_qg_dot_mutations.py
Normal file
@@ -0,0 +1,116 @@
|
||||
import marimo
|
||||
|
||||
# Version string recorded in the notebook file (presumably the marimo release
# that generated it -- confirm before editing by hand).
__generated_with = "0.14.16"
# Notebook application object; every cell below registers itself on it via
# the `@app.cell` decorator.
app = marimo.App(width="medium")
|
||||
|
||||
|
||||
@app.cell
def _():
    # Shared imports for the notebook. Only the names returned below are
    # passed on to the other cells, so nothing else is imported here
    # (the previously imported `collections.Counter` was never used).
    import os

    import marimo as mo
    import pandas as pd
    return mo, os, pd
|
||||
|
||||
|
||||
@app.cell
def _(mo):
    # Introductory notes for the notebook, rendered as markdown.
    # The leading underscore keeps the text variable private to this cell.
    _intro_md = r"""
    # Search for Samples

    Not all QG samples have the `.mutations` files. It would be helpful to have some from those that were part of the historical QC service.

    Samples we have: `../data/qg_avail_dot_mutations.txt`
    """
    mo.md(_intro_md)
    return
|
||||
|
||||
|
||||
@app.cell
def _():
    # pipeline runs used in the historical data (from repo)
    #
    # Each entry is the S3 directory of one pipeline run and ends with "/".
    # The sample-matching cell compares a file's directory (with "/" appended)
    # against these strings exactly, so keep the trailing slash when adding
    # or editing entries.
    LB2_PIPELINE_RUNS = [
        "s3://quantgene-portland-lb/20221123-sp-pl34-hs2/lb-pl34-all-dev56/",
        "s3://quantgene-portland-lb/20221202-s1-pl36-hs2/lb-pl36-all-dev56/",
        "s3://quantgene-portland-lb/20221207-s2-pl37-hs2/lb-pl37-all-dev56/",
        "s3://quantgene-portland-lb/20230109-sp-pl41-hs2/lb-pl41-dev56/",
        "s3://quantgene-portland-lb/20230124-sp-pl42-hs2/lb-pl42-dev56/",
        # PL43 skipped, mostly Geneva samples with stats all over the place
        "s3://quantgene-portland-lb/20230210-sp-pl44-hs2/lb-pl44-dev59/",
        "s3://quantgene-portland-lb/20230224-s1-pl47-hs2/lb-pl47-dev59/",
        "s3://quantgene-portland-lb/20230316-s1-pl49-hs2/lb-pl49-dev59/",
        "s3://quantgene-portland-lb/20230330-sp-pl50-hs2/lb-pl50-dev59/",
        "s3://quantgene-portland-lb/20230417-s1-pl51-hs2/lb-pl51-dev59/",
        "s3://quantgene-portland-lb/20230512-sp-pl52-hs2/lb-pl52-prd-2-1-0/",
        "s3://quantgene-portland-lb/20230602-sp-pl53-hs2/lb-pl53-prd-2-1-1/",
        "s3://quantgene-portland-lb/20230615-sp-pl54-hs2/lb-pl54-prd-2-1-1/",
        "s3://quantgene-portland-lb/20230630-sp-pl55-hs2/lb-pl55-prd-2-1-1/",
        "s3://quantgene-portland-lb/20230721-sp-pl56-hs2/lb-pl56-prd212/",
        "s3://quantgene-portland-lb/20230811-sp-pl57-hs2/lb-pl57-prd212/",
        "s3://quantgene-portland-lb/20230901-sp-pl58-hs2/lb-pl58-prd212/",
        "s3://quantgene-portland-lb/20231027-sp-pl59-hs2/lb-pl59-prd212/",
        # PL60 skipped due to low amount of input reads in all samples
        "s3://quantgene-portland-lb/20240216-sp-pl61-hs2/lb-pl61-prd212/",
        # PL62 skipped due to low amount of input reads and insert size issues
        "s3://quantgene-portland-lb/20240516-s1-pl63-hs2/lb-pl63-prd212/",
        "s3://quantgene-portland-lb/20240529-sp-pl64-hs2/lb-pl64-prd212/",
    ]
    return (LB2_PIPELINE_RUNS,)
|
||||
|
||||
|
||||
@app.cell
def _(pd):
    # Load the historical QC table; the first CSV column becomes the index,
    # and its labels are what later cells treat as sample names.
    _hist_csv = "/home/darren/Documents/2_repos/serenomica/flowcell_qc/historical_data/historical_data_LB.csv"
    qg_hist_data = pd.read_csv(_hist_csv, index_col=0)

    # De-duplicate the index labels (resulting order is arbitrary).
    qc_samples = list({*qg_hist_data.index})
    qc_samples
    return (qc_samples,)
|
||||
|
||||
|
||||
@app.cell
def _(LB2_PIPELINE_RUNS, os, qc_samples):
    # Cross-reference the listing of available `.mutations` files against the
    # curated pipeline runs and the historical QC sample names, then report
    # how many historical samples can be retrieved.
    #
    # Produces `needed_samples`: the original listing lines (full paths) whose
    # run directory is in LB2_PIPELINE_RUNS and whose sample name appears in
    # `qc_samples`.

    # Build the lookup sets once so each membership test in the loop is O(1).
    _run_dirs = set(LB2_PIPELINE_RUNS)
    _qc_names = set(qc_samples)

    avail_samples, needed_samples = [], []
    with open("../data/qg_avail_dot_mutations.txt", "r") as handle:
        for line in handle:
            _entry = line.rstrip()
            fpath, samp = os.path.split(_entry)

            # Collapse a doubled slash after "-lb" and re-append the trailing
            # slash so the directory has the same form as the entries in
            # LB2_PIPELINE_RUNS.
            fpath = str(fpath).replace("-lb//", "-lb/") + "/"

            # Drop only the literal extension. `str.strip(".mutations")` would
            # treat the argument as a character set and also eat leading or
            # trailing letters of the sample name itself (e.g. a final "s",
            # "n" or "a"), so such samples would never match `qc_samples`.
            samp = samp.removesuffix(".mutations")

            # skip files from runs that are not in the curated run list
            if fpath not in _run_dirs:
                continue

            avail_samples.append(samp)
            if samp in _qc_names:
                needed_samples.append(_entry)

    n_qc = len(qc_samples)
    n_needed = len(needed_samples)
    # Guard the percentage so an empty historical list reports 0.0% instead
    # of raising ZeroDivisionError.
    _pct = round((n_needed / n_qc) * 100, 2) if n_qc else 0.0

    print(f"Number of samples available from runs = {len(avail_samples)}")
    print(f"Number of historical QC samples = {n_qc}")
    print(f"Number of historical QC samples available = {n_needed} ({_pct}%)")
    return (needed_samples,)
|
||||
|
||||
|
||||
@app.cell
def _(needed_samples):
    # Persist the matched file paths, one per line, for later retrieval.
    _payload = "".join(_path + "\n" for _path in needed_samples)
    with open("../data/qg_needed.txt", "w") as whandle:
        whandle.write(_payload)
    return
|
||||
|
||||
|
||||
@app.cell
def _():
    # Empty placeholder cell: defines nothing and produces no output.
    return
|
||||
|
||||
|
||||
# Run the notebook app when this file is executed directly as a script.
if __name__ == "__main__":
    app.run()
|
||||
Reference in New Issue
Block a user