feat: update code to create the sample comparison without notebook
This commit is contained in:
@@ -25,7 +25,7 @@ def test_same_file(file1: str, file2: str) -> str:
|
|||||||
raise RuntimeError
|
raise RuntimeError
|
||||||
|
|
||||||
|
|
||||||
def main(test_sample_paths: str, conditions: list[str]) -> None:
|
def main(test_sample_paths: str, conditions: list[str], outpath: str) -> None:
|
||||||
with open(test_sample_paths, "r") as handle:
|
with open(test_sample_paths, "r") as handle:
|
||||||
samples = [r.strip().split("/")[-1].strip(".vcf") for r in handle.readlines()]
|
samples = [r.strip().split("/")[-1].strip(".vcf") for r in handle.readlines()]
|
||||||
conditions_pairwise = list(combinations(conditions, r=2))
|
conditions_pairwise = list(combinations(conditions, r=2))
|
||||||
@@ -64,16 +64,41 @@ def main(test_sample_paths: str, conditions: list[str]) -> None:
|
|||||||
df = pd.DataFrame(
|
df = pd.DataFrame(
|
||||||
results, columns=["condition1", "condition2", "sample", "file", "matching"]
|
results, columns=["condition1", "condition2", "sample", "file", "matching"]
|
||||||
)
|
)
|
||||||
df.to_csv("data/pairwise_equality.csv", sep=",", index=None)
|
df.to_csv(f"{outpath}.csv", sep=",", index=None)
|
||||||
|
|
||||||
|
samples = df["sample"].unique()
|
||||||
|
print(samples)
|
||||||
|
|
||||||
|
gb_pairwise_df = df.groupby(by=["condition1", "condition2"])
|
||||||
|
groups = list(gb_pairwise_df.groups.keys())
|
||||||
|
|
||||||
|
# decompose table to be samples and group focused
|
||||||
|
data_dict = {}
|
||||||
|
for group in groups:
|
||||||
|
str_group = f"{'__'.join(group)}"
|
||||||
|
data_dict[str_group] = {}
|
||||||
|
group_df = gb_pairwise_df.get_group(group)
|
||||||
|
for sample in samples:
|
||||||
|
data_dict[str_group][sample] = group_df.loc[
|
||||||
|
group_df["sample"] == sample
|
||||||
|
].sort_values(by="file")
|
||||||
|
|
||||||
|
# save data to analyse further
|
||||||
|
with pd.ExcelWriter(f"{outpath}.xlsx", engine="openpyxl") as writer:
|
||||||
|
for g, data in data_dict.items():
|
||||||
|
for idx, (_, file_data) in enumerate(data.items()):
|
||||||
|
file_data.to_excel(
|
||||||
|
writer,
|
||||||
|
sheet_name=g,
|
||||||
|
startrow=(idx * len(file_data)) + (2 * idx),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
test_sample_paths = "data/test_samples.txt"
|
test_sample_paths = "data/test_samples.txt"
|
||||||
conditions = [
|
conditions = [
|
||||||
"validation_data",
|
"validation_data",
|
||||||
"v1.2.6_manifest_version",
|
|
||||||
"pgxCleaner_server",
|
|
||||||
"pgxCleaner_prod_updates",
|
"pgxCleaner_prod_updates",
|
||||||
]
|
]
|
||||||
|
outpath = "data/test"
|
||||||
main(test_sample_paths, conditions)
|
main(test_sample_paths, conditions, outpath)
|
||||||
|
|||||||
Reference in New Issue
Block a user