import marimo

__generated_with = "0.14.16"
app = marimo.App(width="medium")


@app.cell
def _():
    # Shared imports handed to the other cells of the notebook.
    import marimo as mo
    import pandas as pd
    return mo, pd


@app.cell
def _(mo):
    # Background notes: where the table came from and the downstream failure
    # that motivated this exploration.
    mo.md(
        r"""
    # EDA of the NAT2 Haplotypes table

    Downloaded from [PharmVar](https://www.pharmvar.org/gene/NAT2?ref=blog.clinpgx.org)

    Ajit had scripts to convert from this format into the json used by PharmCAT

    `pgx-engine/scripts/pharmvar_to_translation_table.py`

    `pgx-engine/scripts/translation_to_json`

    Where the pharmvar haplotypes table should go into the first and the translation table can be fed into the second one to get the json.

    Running these causes and error because the alts element is None

    ```python
    Traceback (most recent call last):
      File "/home/darren/Documents/2_repos/serenomica/pgx-engine/scripts/translation_to_json.py", line 100, in
        main()
      File "/home/darren/Documents/2_repos/serenomica/pgx-engine/scripts/translation_to_json.py", line 90, in main
        convert(
      File "/home/darren/Documents/2_repos/serenomica/pgx-engine/scripts/translation_to_json.py", line 40, in convert
        variants = table.apply(make_variant, axis=1).to_list()
      File "/home/darren/.venv/py310/lib/python3.10/site-packages/pandas/core/frame.py", line 10381, in apply
        return op.apply().__finalize__(self, method="apply")
      File "/home/darren/.venv/py310/lib/python3.10/site-packages/pandas/core/apply.py", line 916, in apply
        return self.apply_standard()
      File "/home/darren/.venv/py310/lib/python3.10/site-packages/pandas/core/apply.py", line 1063, in apply_standard
        results, res_index = self.apply_series_generator()
      File "/home/darren/.venv/py310/lib/python3.10/site-packages/pandas/core/apply.py", line 1081, in apply_series_generator
        results[i] = self.func(v, *self.args, **self.kwargs)
      File "/home/darren/Documents/2_repos/serenomica/pgx-engine/scripts/translation_to_json.py", line 20, in make_variant
        alts = row["alts"].split(",")
    AttributeError: 'NoneType' object has no attribute 'split'
    ```
    """
    )
    return


@app.cell
def _(pd):
    # Load the PharmVar haplotypes table (tab-separated; lines starting with
    # "#" are treated as comments), then drop fully duplicated rows.
    # The shape is printed before and after so the effect of the
    # de-duplication is visible.
    _raw = pd.read_csv(
        "data/NAT2-6.2.15/GRCh38/NAT2.NC_000008.11.haplotypes.tsv",
        sep="\t",
        comment="#",
    )
    print(_raw.shape)
    pharmvar = _raw.drop_duplicates()
    print(pharmvar.shape)
    pharmvar.head()
    return (pharmvar,)


@app.cell
def _(pharmvar):
    # Approach taken from Ajit's script, kept here for comparison.
    # Author's notes on why it is problematic:
    #   * the alt alleles are not carried through and end up missing in the
    #     output;
    #   * duplicates are dropped on the start position only, so only the first
    #     row per "Variant Start" survives;
    #   * collapsing on the start position would lose any variant that has
    #     multiple possible alts (there are multiple of these).
    _allele_cols = ["rsID", "Reference Allele", "Variant Allele"]
    _with_rsid = pharmvar[pharmvar["rsID"].notna()]
    _first_per_start = _with_rsid.drop_duplicates(subset=["Variant Start"])
    position_dict = _first_per_start.set_index("Variant Start")[
        _allele_cols
    ].to_dict(orient="index")
    position_dict
    return


@app.cell
def _(pharmvar):
    # Alternative: instead of de-duplicating, collect every distinct variant
    # allele seen for each (rsID, start, reference allele) combination and
    # join them into one comma-separated string.
    _allele_cols = ["rsID", "Reference Allele", "Variant Allele"]
    _group_keys = ["rsID", "Variant Start", "Reference Allele"]
    _with_rsid = pharmvar[pharmvar["rsID"].notna()]
    _joined_alts = (
        _with_rsid.groupby(_group_keys)["Variant Allele"]
        .agg(lambda alleles: ",".join(alleles.unique()))
        .reset_index()
    )
    correct_pos_dict = _joined_alts.set_index("Variant Start")[
        _allele_cols
    ].to_dict(orient="index")
    correct_pos_dict
    return


@app.cell
def _():
    return


if __name__ == "__main__":
    app.run()