diff --git a/src/check_results_equivilence.py b/src/check_results_equivilence.py index 994bcbc..acd1740 100644 --- a/src/check_results_equivilence.py +++ b/src/check_results_equivilence.py @@ -25,7 +25,7 @@ def test_same_file(file1: str, file2: str) -> str: raise RuntimeError -def main(test_sample_paths: str, conditions: list[str]) -> None: +def main(test_sample_paths: str, conditions: list[str], outpath: str) -> None: with open(test_sample_paths, "r") as handle: samples = [r.strip().split("/")[-1].strip(".vcf") for r in handle.readlines()] conditions_pairwise = list(combinations(conditions, r=2)) @@ -64,16 +64,41 @@ def main(test_sample_paths: str, conditions: list[str]) -> None: df = pd.DataFrame( results, columns=["condition1", "condition2", "sample", "file", "matching"] ) - df.to_csv("data/pairwise_equality.csv", sep=",", index=None) + df.to_csv(f"{outpath}.csv", sep=",", index=None) + + samples = df["sample"].unique() + print(samples) + + gb_pairwise_df = df.groupby(by=["condition1", "condition2"]) + groups = list(gb_pairwise_df.groups.keys()) + + # decompose table to be samples and group focused + data_dict = {} + for group in groups: + str_group = f"{'__'.join(group)}" + data_dict[str_group] = {} + group_df = gb_pairwise_df.get_group(group) + for sample in samples: + data_dict[str_group][sample] = group_df.loc[ + group_df["sample"] == sample + ].sort_values(by="file") + + # save data to analyse further + with pd.ExcelWriter(f"{outpath}.xlsx", engine="openpyxl") as writer: + for g, data in data_dict.items(): + for idx, (_, file_data) in enumerate(data.items()): + file_data.to_excel( + writer, + sheet_name=g, + startrow=(idx * len(file_data)) + (2 * idx), + ) if __name__ == "__main__": test_sample_paths = "data/test_samples.txt" conditions = [ "validation_data", - "v1.2.6_manifest_version", - "pgxCleaner_server", "pgxCleaner_prod_updates", ] - - main(test_sample_paths, conditions) + outpath = "data/test" + main(test_sample_paths, conditions, outpath)