diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 0eee3a15e..02c0f580a 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -35,10 +35,12 @@ jobs: run: | conda activate gt4sd python -m black src/gt4sd --check --diff --color - # - name: Check isort - # run: | - # conda activate gt4sd - # python -m isort src/gt4sd --check-only + - name: Remove unnecessary files (see https://stackoverflow.com/questions/75536771/github-runner-out-of-disk-space-after-building-docker-image) + run: | + sudo rm -rf /usr/share/dotnet + sudo rm -rf /opt/ghc + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf "$AGENT_TOOLSDIRECTORY" - name: Check flake8 run: | conda activate gt4sd diff --git a/conda.yml b/conda.yml index e1fcb522a..2dd1b4a36 100644 --- a/conda.yml +++ b/conda.yml @@ -9,7 +9,7 @@ dependencies: - pip>=19.1,<20.3 - pytorch>=1.0,<=1.12.1 - cpuonly - - pytorch-scatter<=2.0.9 + - pytorch-scatter<=2.0.9=*cu102* - pip: - -r requirements.txt - -r cpu_requirements.txt diff --git a/conda_gpu.yml b/conda_gpu.yml index 47c27265a..cd9d54981 100644 --- a/conda_gpu.yml +++ b/conda_gpu.yml @@ -8,7 +8,7 @@ dependencies: - python>=3.7,<3.9 - pip>=19.1,<20.3 - pytorch>=1.0,<=1.12.1=*cu* - - pytorch-scatter<=2.0.9 + - pytorch-scatter<=2.0.9=*cu102* - torchvision<=0.13.1=*cu* - torchaudio<=0.12.1=*cu* - pip: diff --git a/examples/enzeptional/README.md b/examples/enzeptional/README.md new file mode 100644 index 000000000..34f8d6fc0 --- /dev/null +++ b/examples/enzeptional/README.md @@ -0,0 +1,30 @@ +# Enzyme Optimization in Biocatalytic Reactions + +This repository provides an example on how to run the framework for the optimization of enzymes within the context of biocatalytic reactions. 
+ +## Prerequisites + +Before initiating the enzyme optimization process, execute the following command in your terminal to activate the environment: + +```console +conda activate gt4sd +``` + +## Running the example + +To run the example simply type: + +```console +python example_enzeptional.py +``` + +## Citation + +```bibtex +@inproceedings{teukam2023enzyme, + title={Enzyme optimization via a generative language modeling-based evolutionary algorithm}, + author={Teukam, Yves Gaetan Nana and Grisoni, Francesca and Manica, Matteo and Zipoli, Federico and Laino, Teodoro}, + booktitle={American Chemical Society (ACS) Spring Meeting}, + year={2023} +} +``` \ No newline at end of file diff --git a/examples/enzeptional/data.csv b/examples/enzeptional/data.csv new file mode 100644 index 000000000..66f082648 --- /dev/null +++ b/examples/enzeptional/data.csv @@ -0,0 +1,106 @@ +substrates,products,sequences,intervals +NCC(=O)O,CC(=O)C(C(=O)[O-])N,MRGEFYQQLTNDLETARAEGLFKEERIITSAQQADITVADGSHVINFCANNYLGLANHPDLIAAAKAGMDSHGFGMASVRFICGTQDSHKELEQKLAAFLGMEDAILYSSCFDANGGLFETLLGAEDAIISDALNHASIIDGVRLCKAKRYRYANNDMQELEARLKEAREAGARHVLIATDGVFSMDGVIANLKGVCDLADKYDALVMVDDSHAVGFVGENGRGSHEYCDVMGRVDIITGTLGKALGGASGGYTAARKEVVEWLRQRSRPYLFSNSLAPAIVAASIKVLEMVEAGSELRDRLWANARQFREQMSAAGFTLAGADHAIIPVMLGDAVVAQKFARELQKEGIYVTGFFYPVVPKGQARIRTQMSAAHTPEQITRAVEAFTRIGKQLGVIA,"[(50, 52), (77, 80), (92, 92), (96, 96), (107, 107), (109, 110), (113, 116), (136, 139), (141, 141), (180, 184), (187, 188), (209, 209), (214, 215), (217, 218), (223, 226), (238, 240), (245, 253), (269, 273), (276, 277), (281, 281), (285, 285), (325, 325), (327, 327), (368, 368), (370, 370)]" 
+CC(=CCC/C(=C/CC/C(=C/COP(=O)(O)OP(=O)(O)O)/C)/C)C,CC(=CCC[C@@]1([C@H]2CC[C@H](C2)C1=C)C)C,MDSSTATAMTAPFIDPTDHVNLKTDTDASENRRMGNYKPSIWNYDFLQSLATHHNIVEERHLKLAEKLKGQVKFMFGAPMEPLAKLELVDVVQRLGLNHLFETEIKEALFSIYKDGSNGWWFGHLHATSLRFRLLRQCGLFIPQDVFKTFQNKTGEFDMKLCDNVKGLLSLYEASYLGWKGENILDEAKAFTTKCLKSAWENISEKWLAKRVKHALALPLHWRVPRIEARWFIEAYEQEANMNPTLLKLAKLDFNMVQSIHQKEIGELARWWVTTGLDKLAFARNNLLQSYMWSCAIASDPKFKLARETIVEIGSVLTVVDDGYDVYGSIDELDLYTSSVERWSCVEIDKLPNTLKLIFMSMFNKTNEVGLRVQHERGYNSIPTFIKAWVEQCKSYQKEARWFHGGHTPPLEEYSLNGLVSIGFPLLLITGYVAIAENEAALDKVHPLPDLLHYSSLLSRLINDIGTSPDEMARGDNLKSIHCYMNETGASEEVAREHIKGVIEENWKILNQCCFDQSQFQEPFITFNLNSVRGSHFFYEFGDGFGVTDSWTKVDMKSVLIDPIPLGEE,"[(44, 44), (281, 283), (285, 286), (317, 320), (322, 324), (326, 327), (396, 396), (399, 399), (414, 414), (418, 419), (421, 422), (456, 459), (461, 462), (464, 466), (468, 468), (477, 477), (479, 479), (481, 481), (496, 496), (539, 539), (546, 546), (548, 548)]" +CC(=CCOP(=O)(O)OP(=O)(O)O)C,O=P([O-])([O-])OP(=O)([O-])[O-],MTADELVFFVNGKKVVEKNADPETTLLVYLRRKLGLCGTKLGCGEGGCGACTVMISKYDRLQNKIVHFSVNACLAPICSLHHVAVTTVEGIGNTQKLHPVQERIARSHGSQCGFCTPGIVMSMYTLLRNQPEPTVEEIENAFQGNLCRCTGYRPILQGFRTFAKDGGCCGGSGNNPNCCMNQTKDQTVSLSPSLFNPEDFKPLDPTQEPIFPPELLRLKDTPQKKLRFEGERVTWIQASTMEELLDLKAQHPDAKLVVGNTEIGIEMKFKNMLFPLIVCPAWIPELNSVVHGPEGISFGASCPLSLVESVLAEEIAKLPEQKTEVFRGVMEQLRWFAGKQVKSVASIGGNIITASPISDLNPVFMASGAKLTLVSRGTRRTVRMDHTFFPGYRKTLLRPEEILLSIEIPYSKEGEFFSAFKQASRREDDIAKVTSGMRVLFKPGTIEVQELSLCFGGMADRTISALKTTPKQLSKSWNEELLQSVCAGLAEELQLAPDAPGGMVEFRRTLTLSFFFKFYLTVLQKLGRADLEDMCGKLDPTFASATLLFQKDPPANVQLFQEVPKDQSEEDMVGRPLPHLAANMQASGEAVYCDDIPRYENELSLRLVTSTRAHAKITSIDTSEAKKVPGFVCFLTAEDVPNSNATGLFNDETVFAKDEVTCVGHIIGAVVADTPEHAQRAARGVKITYEDLPAIITIQDAINNNSFYGSEIKIEKGDLKKGFSEADNVVSGELYIGGQEHFYLETNCTIAVPKGEAGEMELFVSTQNTMKTQSFVAKMLGVPDNRIVVRVKRMGGGFGGKETRSTVVSTALALAAHKTGRPVRCMLDRDEDMLITGGRHPFLAKYKVGFMKTGTVVALEVAHFSNGGNTEDLSRSIMERALFHMDNAYKIPNIRGTGRICKTNLPSNTAFRGFGGPQGMLIAEYWMSEVAITCGLPAEEVRRKNMYKEGDLTHFNQKLEGFTLPRCWDECIASSQYLARKREVEKFNRENCWKKRGLCIIPTKFGISFTLPF
LNQGGALVHVYTDGSVLLTHGGTEMGQGLHTKMVQVASRALKIPTSKIHISETSTNTVPNTSPTAASASADLNGQGVYEACQTILKRLEPFKKKKPTGPWEAWVMDAYTSAVSLSATGFYKTPNLGYSFETNSGNPFHYFSYGVACSEVEIDCLTGDHKNLRTDIVMDVGSSLNPAIDIGQVEGAFVQGLGLFTMEELHYSPEGSLHTRGPSTYKIPAFGSIPIEFRVSLLRDCPNKRAIYASKAVGEPPLFLASSIFFAIKDAIRAARAQHGDNAKQLFQLDSPATPEKIRNACVDQFTTLCVTGVPENCKSWSVRI,"[(25, 25), (27, 27), (39, 42), (44, 47), (49, 50), (52, 53), (70, 72), (74, 76), (110, 111), (113, 114), (116, 119), (143, 146), (148, 148), (150, 152), (155, 155), (235, 235), (244, 244), (248, 248), (254, 255), (264, 270), (274, 274), (277, 281), (286, 286), (298, 304), (307, 307), (332, 335), (337, 338), (341, 342), (344, 345), (351, 355), (357, 358), (360, 364), (371, 373), (393, 393), (397, 397), (401, 402), (404, 406), (408, 408), (419, 420), (422, 423), (429, 431), (433, 434), (514, 514), (585, 585), (592, 592), (741, 744), (765, 766), (768, 769), (793, 797), (799, 802), (829, 829), (839, 839), (910, 911), (913, 914), (917, 917), (1007, 1009), (1036, 1036), (1038, 1039), (1075, 1078), (1080, 1081), (1193, 1194), (1197, 1198), (1201, 1201), (1224, 1224), (1229, 1230), (1260, 1261), (1264, 1264)]" +Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O)OP(=O)(O)O)[C@@H](O)[C@H]1O,O=C[C@H](O)[C@@H](O)[C@H](O)[C@H](O)COP(=O)(O)O,MLDDRARMEAAKKEKVEQILAEFQLQEEDLKKVMRRMQKEMDRGLRLETHEEASVKMLPTYVRSTPEGSEVGDFLSLDLGGTNFRVMLVKVGEGEEGQWSVKTKHQMYSIPEDAMTGTAEMLFDYISECISDFLDKHQMKHKKLPLGFTFSFPVRHEDIDKGILLNWTKGFKASGAEGNNVVGLLRDAIKRRGDFEMDVVAMVNDTVATMISCYYEDHQCEVGMIVGTGCNACYMEEMQNVELVEGDEGRMCVNTEWGAFGDSGELDEFLLEYDRLVDESSANPGQQLYEKLIGGKYMGELVRLVLLRLVDENLLFHGEASEQLRTRGAFETRFVSQVESDTGDRKQIYNILSTLGLRPSTTDCDIVRRACESVSTRAAHMCSAGLAGVINRMRESRSEDVMRITVGVDGSVYKLHPSFKERFHASVRRLTPSCEITFIESEEGSGRGAALVSAVACKKACMLGQ,"[(76, 77), (84, 85), (107, 110), (115, 115), (122, 123), (126, 126), (148, 152), (167, 167), (169, 171), (205, 205), (224, 224), (226, 227), (229, 230), (290, 290), (293, 294), (297, 300), (302, 302), (329, 331), (337, 340), (351, 351), (355, 355), (371, 371), (375, 375), 
(408, 410), (416, 417), (419, 420), (423, 424), (438, 438), (440, 442), (445, 445)]" +NCCCC[C@H](N)C(=O)O,CC(C)(N)CO,MSHEELNDQLRVRREKLKKIEELGVDPFGKRFERTHKAEELFELYGDLSKEELEEQQIEVAVAGRIMTKRGMGKAGFAHIQDVTGQIQIYVRQDDVGEQQYELFKISDLGDIVGVRGTMFKTKVGELSIKVSSYEFLTKALRPLPEKYHGLKDIEQRYRQRYLDLIMNPESKKTFITRSLIIQSMRRYLDSHGYLEVETPMMHAVAGGAAARPFITHHNALDMTLYMRIAIELHLKRLIVGGLEKVYEIGRVFRNEGISTRHNPEFTMLELYEAYADFRDIMKLTENLIAHIATEVLGTTKIQYGEHLVDLTPEWRRLHMVDAIKEYVGVDFWRQMSDEEARELAKEHGVEVAPHMTFGHIVNEFFEQKVEDKLIQPTFIYGHPVEISPLAKKNPDDPRFTDRFELFIVGREHANAFTELNDPIDQRQRFEEQLKEREQGNDEAHEMDEDFLEALEYGMPPTGGLGIGVDRLVMLLTNSPSIRDVLLFPQMRHK,"[(284, 284), (288, 288), (314, 314), (376, 378), (403, 404), (406, 408), (410, 411), (413, 414), (465, 467), (470, 471), (474, 474)]" +N,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O)O)[C@@H](O)[C@H]1O,MAKYTREDIEKLVKEENVKYIRLQFTDILGTIKNVEIPVSQLGKALDNKVMFDGSSIEGFVRIEESDMYLYPDLNTFVIFPWTAEKGKVARFICDIYNPDGTPFEGDPRNNLKRILKEMEDLGFSDFNLGPEPEFFLFKLDEKGEPTLELNDKGGYFDLAPTDLGENCRRDIVLELEEMGFEIEASHHEVAPGQHEIDFKYAGAVRSCDDIQTFKLVVKTIARKHGLHATFMPKPLFGVNGSGMHCNLSLFKNGVNAFFDENADLQLSETAKHFIAGIVKHATSFTAVTNPTVNSYKRLVPGYEAPCYVAWSAQNRSPLIRIPASRGISTRVEVRSVDPAANPYLALSVLLAAGLDGIKNKLEAPAPIDRNIYVMSKEERMENGIVDLPATLAEALEEFKSNEVMVKALGEHLFEHFIEAKEIEWDMFRTQVHPWEREQYMSQY,"[(126, 131), (133, 133), (135, 136), (154, 157), (169, 169), (182, 183), (185, 188), (190, 191), (194, 195), (197, 201), (214, 214), (230, 231), (233, 235), (238, 239), (242, 244), (246, 248), (250, 251), (256, 258), (290, 290), (294, 297), (299, 300), (302, 303), (305, 306), (311, 315), (317, 320), (322, 324), (328, 328), (330, 332), (334, 334), (336, 339), (373, 373)]" 
+Oc1ccccc1,C1=NC(=C2C(=N1)N(C=N2)[C@H]3[C@@H]([C@@H]([C@H](O3)COP(=O)(O)O)OP(=O)(O)O)O)N,MELIQDTSRPPLEYVKGVPLIKYFAEALGPLQSFQARPDDLLISTYPKSGTTWVSQILDMIYQGGDLEKCHRAPIFMRVPFLEFKAPGIPSGMETLKDTPAPRLLKTHLPLALLPQTLLDQKVKVVYVARNAKDVAVSYYHFYHMAKVHPEPGTWDSFLEKFMVGEVSYGSWYQHVQEWWELSRTHPVLYLFYEDMKENPKREIQKILEFVGRSLPEETVDFVVQHTSFKEMKKNPMTNYTTVPQEFMDHSISPFMRKGMAGDWKTTFTVAQNERFDADYAEKMAGCSLSFRSEL,"[(43, 43), (45, 47), (54, 57), (60, 60), (75, 75), (106, 106), (108, 108), (128, 129), (131, 132), (134, 137), (139, 142), (145, 145), (172, 172), (191, 192), (194, 197), (200, 200), (204, 204), (223, 226), (233, 235), (238, 240), (248, 248), (250, 250), (253, 254), (260, 261), (263, 263), (266, 266), (291, 291)]" +C1=C2C(=NC=N1)N(C=N2)[C@H]3[C@@H]([C@@H]([C@H](O3)CO)O)O,O=P([O-])([O-])O[C@H]1O[C@H](CO)[C@@H](O)[C@H]1O,MANGYTYEDYQDTAKWLLSHTEQRPQVAVICGSGLGGLVNKLTQAQTFDYSEIPNFPESTVPGHAGRLVFGILNGRACVMMQGRFHMYEGYPFWKVTFPVRVFRLLGVETLVVTNAAGGLNPNFEVGDIMLIRDHINLPGFSGENPLRGPNEERFGVRFPAMSDAYDRDMRQKAHSTWKQMGEQRELQEGTYVMLGGPNFETVAECRLLRNLGADAVGMSTVPEVIVARHCGLRVFGFSLITNKVIMDYESQGKANHEEVLEAGKQAAQKLEQFVSLLMASIPVSGHTG,"[(30, 32), (34, 35), (50, 50), (56, 57), (81, 83), (87, 87), (89, 91), (95, 96), (99, 99), (113, 115), (117, 119), (126, 126), (192, 192), (194, 200), (202, 203), (205, 206), (209, 209), (217, 218), (221, 224), (239, 242), (244, 245), (255, 256), (258, 262), (271, 271)]" +O,CCC(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)(O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)O,MAEIRKLKNYINGEWVESKTDQYEDVVNPATKEVLCQVPISTKEDIDYAAQTAAEAFKTWSKVAVPRRARILFNFQQLLSQHKEELAHLITIENGKNTKEALGEVGRGIENVEFAAGAPSLMMGDSLASIATDVEAANYRYPIGVVGGIAPFNFPMMVPCWMFPMAIALGNTFILKPSERTPLLTEKLVELFEKAGLPKGVFNVVYGAHDVVNGILEHPEIKAISFVGSKPVGEYVYKKGSENLKRVQSLTGAKNHTIVLNDANLEDTVTNIVGAAFGSAGERCMACAVVTVEEGIADEFMAKLQEKVADIKIGNGLDDGVFLGPVIREDNKKRTLSYIEKGLEEGARLVCDGRENVSDDGYFVGPTIFDNVTTEMTIWKDEIFAPVLSVIRVKNLKEAIEIANKSEFANGACLFTSNSNAIRYFRENIDAGMLGINLGVPAPMAFFPFSGWKSSFFGTLHANGKDSVDFYTRKKVVTARYPAPDFN,"[(24, 24), 
(26, 26), (38, 38), (93, 93), (148, 152), (174, 175), (181, 182), (185, 185), (204, 208), (211, 212), (215, 215), (252, 254), (283, 284), (327, 328), (331, 331), (334, 334), (338, 338), (380, 381), (383, 384), (408, 408)]" +Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O)OP(=O)(O)O)[C@@H](O)[C@H]1O,CC(C)(COP(=O)(O)OP(=O)(O)OCC1C(C(C(O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)C(C(=O)NCCC(=O)NCCSC(=O)CCCl)O,MPSTLTINGKAPIVAYAELIAARIVNALAPNSIAIKLVDDKKAPAAKLDDATEDVFNKITSKFAAIFDNGDKEQVAKWVNLAQKELVIKNFAKLSQSLETLDSQLNLRTFILGGLKYSAADVACWGALRSNGMCGSIIKNKVDVNVSRWYTLLEMDPIFGEAHDFLSKSLLELKKSANVGKKKETHKANFEIDLPDAKMGEVVTRFPPEPSGYLHIGHAKAALLNQYFAQAYKGKLIIRFDDTNPSKEKEEFQDSILEDLDLLGIKGDRITYSSDYFQEMYDYCVQMIKDGKAYCDDTPTEKMREERMDGVASARRDRSVEENLRIFTEEMKNGTEEGLKNCVRAKIDYKALNKTLRDPVIYRCNLTPHHRTGSTWKIYPTYDFCVPIVDAIEGVTHALRTIEYRDRNAQYDWMLQALRLRKVHIWDFARINFVRTLLSKRKLQWMVDKDLVGNWDDPRFPTVRGVRRRGMTVEGLRNFVLSQGPSRNVINLEWNLIWAFNKKVIDPIAPRHTAIVNPVKIHLEGSEAPQEPKIEMKPKHKKNPAVGEKKVIYYKDIVVDKDDADVINVDEEVTLMDWGNVIITKKNDDGSMVAKLNLEGDFKKTKHKLTWLADTKDVVPVDLVDFDHLITKDRLEEDESFEDFLTPQTEFHTDAIADLNVKDMKIGDIIQFERKGYYRLDALPKDGKPYVFFTIPDGKSVNKYGAKK,"[(87, 87), (132, 134)]" +C[C@]12CCCC(C1CCC34C2C[C@@H]5C(C3)C5(C4)C)(C)C,O,MKNRIPVVLLACGSFNPITNMHLRLFEVARDHLHQTGRYQVIEGIISPVNDSYGKKDLVASHHRVAMARLALQTSDWIRVDPWESEQAQWMETVKVLRHHHRELLRSSAQMDGPDPSKTPSASAALPELKLLCGADVLKTFQTPNLWKDTHIQEIVEKFGLVCVSRSGHDPERYISDSPILQQFQHNIHLAREPVLNEISATYVRKALGQGQSVKYLLPEAVITYIRDQGLYINDGSWKGKGKTG,"[(11, 13), (16, 21), (23, 26), (46, 50), (52, 55), (57, 58), (64, 64), (67, 67), (88, 89), (91, 92), (94, 97), (131, 133), (135, 135), (137, 138), (140, 145), (148, 149), (152, 152), (155, 155), (163, 165), (167, 169), (174, 174), (177, 180), (195, 196), (198, 201), (206, 209), (212, 212), (214, 214), (217, 218), (232, 232)]" +N#CC(O)c1ccccc1,N,MAPKAVLVGLPGSGKSTIGRRLAKALGVGLLDTDVAIEQRTGRSIADIFATDGEQEFRRIEEDVVRAALADHDGVLSLGGGAVTSPGVRAALAGHTVVYLEISAAEGVRRTGGNTVRPLLAGPDRAEKYRALMAKRAPLYRRVATMRVDTNRRNPGAVVRHILSRLQVPSPSEAAT,"[(7, 11), (18, 22), (30, 30), (77, 78), 
(80, 80), (100, 100), (102, 102), (107, 107), (110, 111), (115, 116), (118, 120), (122, 124), (127, 127), (129, 129), (148, 152), (154, 155), (157, 158), (161, 161)]" +C[C@@H](C(=O)N[C@H](CC(=O)[O-])C(=O)[O-])[NH3+],C[C@@H](C(=O)N[C@@H](CC(=O)O)C(=O)O)N,MKIIRIETSRIAVPLTKPFKTALRTVYTAESVIVRITYDSGAVGWGEAPPTLVITGDSMDSIESAIHHVLKPALLGKSLAGYEAILHDIQHLLTGNMSAKAAVEMALYDGWAQMCGLPLYQMLGGYRDTLETDYTVSVNSPEEMAADAENYLKQGFQTLKIKVGKDDIATDIARIQEIRKRVGSAVKLRLDANQGWRPKEAVTAIRKMEDAGLGIELVEQPVHKDDLAGLKKVTDATDTPIMADESVFTPRQAFEVLQTRSADLINIKLMKAGGISGAEKINAMAEACGVECMVGSMIETKLGITAAAHFAASKRNITRFDFDAPLMLKTDVFNGGITYSGSTISMPGKPGLGIIGAALLKGEKEQ,"[(160, 164), (189, 190), (192, 193), (217, 218), (220, 221), (242, 243), (245, 248), (266, 266), (268, 268), (271, 271), (293, 293), (321, 321)]" +C1=CN(C(=O)N=C1N)[C@H]2[C@@H]([C@@H]([C@H](O2)COP(=O)(O)OP(=O)(O)OP(=O)(O)O)O)O,O=C1CCCN1CCl,MNGDVQSVIRGYLERAQVAKTMSDAGRWNEAGDLLRQLMTDVKSCKISASNRDEHDARNTFLRALEANLKLVQQNVRDEDDLHEAMTRQSGSPEPPADPDVWSKPSPPLPSSSKFGATKKGVGAAGPRPREISKSTSSMSTNPADVKPANPTQGILPQNSAGDSFDASAYDAYIVQAVRGTMATNTENTMSLDDIIGMHDVKQVLHEAVTLPLLVPEFFQGLRSPWKAMVLAGPPGTGKTLIARAIASESSSTFFTVSSTDLSSKWRGDSEKIVRLLFELARFYAPSIIFIDEIDTLGGQRGNSGEHEASRRVKSEFLVQMDGSQNKFDSRRVFVLAATNIPWELDEALRRRFEKRIFIPLPDIDARKKLIEKSMEGTPKSDEINYDDLAARTEGFSGADVVSLCRTAAINVLRRYDTKSLRGGELTAAMESLKAELVRNIDFEAALQAVSPSAGPDTMLKCKEWCDSFGAM,"[(228, 228), (231, 232), (241, 245), (254, 254), (256, 256), (290, 290), (292, 292), (318, 318), (322, 322), (325, 325), (336, 336), (338, 340), (347, 350), (353, 354), (356, 356), (358, 361), (397, 399), (453, 454), (462, 462)]" 
+CC(C)c1ccc(CO)cc1,NC(=O)C1=CN([C@@H]2O[C@H](COP(=O)(O)OP(=O)(O)OC[C@H]3O[C@@H](n4cnc5c(N)ncnc54)[C@H](O)[C@@H]3O)[C@@H](O)[C@H]2O)C=CC1,MDFTSLETTTFEEVVIALGSNVGNRMNNFKEALRLMKDYGISVTRHSCLYETEPVHVTDQPRFLNAAIRGVTKLKPHELLNVLKKIEKEMGREENGLRYGPRPLDLDILFYGKHKIISDKLIIPHERIWERPFVLAPLVDLLGTEDIDNDKIVAYWHSLSMHSGGIFQAWERLGGESLLGKDGIIQRVIPIGDHLWDFSKKTYVMGILNLTPDSFSDGGKFQSVDTAVSRVRSMISEGVDIIDIGAQSTRPMASRISSQEEIDRLIPVLKVVRGMAEMKGKLISVDTFNSEVALEAIRNGADILNDVSGGSLDENMHKVVADSDVPYMIMHMRGDPCTMQNKENLEYNEICKDVATELYERVREAELSGIPAWRIMIDPGIGFSKGIDHNLDIVMELPKIREEMAKKSIGLSHAPILIGPSRKRFLGDICGRPEASERDAATVACVTAGILKGANIIRVHNVRDNVDAARLCDAMMTKRFKNVD,"[(204, 208), (210, 212), (230, 230), (243, 246), (284, 285), (287, 288), (303, 304), (306, 307), (328, 331), (358, 358), (376, 377), (379, 383), (417, 422), (424, 427), (438, 439), (442, 442), (446, 446), (456, 457), (461, 462), (465, 465)]" +C([C@H]([C@@H]([C@H]([C@H](C(=O)O)O)O)O)O)O,O,MTLPKIKQVRAWFTGGATAEKGAGGGDYHDQGANHWIDDHIATPMSKYRDYEQSRQSFGINVLGTLVVEVEAENGQTGFAVSTAGEMGCFIVEKHLNRFIEGKCVSDIKLIHDQMLSATLYYSGSGGLVMNTISCVDLALWDLFGKVVGLPVYKLLGGAVRDEIQFYATGARPDLAKEMGFIGGKMPTHWGPHDGDAGIRKDAAMVADMREKCGEDFWLMLDCWMSQDVNYATKLAHACAPYNLKWIEECLPPQQYESYRELKRNAPVGMMVTSGEHHGTLQSFRTLSETGIDIMQPDVGWCGGLTTLVEIAAIAKSRGQLVVPHGSSVYSHHAVITFTNTPFSEFLMTSPDCSTMRPQFDPILLNEPVPVNGRIHKSVLDKPGFGVELNRDCNLKRPYSH,"[(171, 171), (186, 188), (190, 190), (210, 210), (214, 214), (220, 221), (223, 225), (243, 244), (246, 247), (249, 251), (263, 263), (266, 267), (270, 270), (274, 275), (277, 278), (296, 298), (347, 347)]" 
+O,O=P([O-])([O-])OP(=O)([O-])[O-],MTTNYIFVTGGVVSSLGKGIAAASLAAILEARGLNVTIMKLDPYINVDPGTMSPIQHGEVFVTEDGAETDLDLGHYERFIRTKMSRRNNFTTGRIYSDVLRKERRGDYLGATVQVIPHITNAIKERVLEGGEGHDVVLVEIGGTVGDIESLPFLEAIRQMAVEIGREHTLFMHLTLVPYMAASGEVKTKPTQHSVKELLSIGIQPDILICRSDRAVPANERAKIALFCNVPEKAVISLKDVDSIYKIPGLLKSQGLDDYICKRFSLNCPEANLSEWEQVIFEEANPVSEVTIGMVGKYIELPDAYKSVIEALKHGGLKNRVSVNIKLIDSQDVETRGVEILKGLDAILVPGGFGYRGVEGMITTARFARENNIPYLGICLGMQVALIDYARHVANMENANSTEFVPDCKYPVVALITEWRDENGNVEVRSEKSDLGGTMRLGAQQCQLVDDSLVRQLYNAPTIVERHRHRYEVNNMLLKQIEDAGLRVAGRSGDDQLVEIIEVPNHPWFVACQFHPEFTSTPRDGHPLFAGFVKAASEFQKRQAK,"[(8, 13), (21, 25), (39, 41), (43, 43), (70, 71), (73, 76), (79, 79), (92, 92), (138, 139), (141, 146), (150, 153), (174, 178), (180, 180), (183, 183), (185, 186), (193, 196), (208, 208), (210, 215), (218, 222), (224, 229), (237, 238), (242, 244), (246, 247), (250, 250), (297, 298), (349, 351), (353, 355), (358, 358), (361, 361), (375, 375), (377, 379), (384, 387), (401, 402), (404, 405), (413, 413), (415, 415), (438, 440), (468, 469), (471, 473), (498, 501), (511, 513)]" +CSCCC(=O)/C(=C/O)/O,O=C[O-],MVQAWYMDESTADPRKPHRAQPDRPVSLEQLRTLGVLYWKLDADKYENDPELEKIRKMRNYSWMDIITICKDTLPNYEEKIKMFFEEHLHLDEEIRYILEGSGYFDVRDKEDKWIRISMEKGDMITLPAGIYHRFTLDEKNYVKAMRLFVGEPVWTPYNRPADHFDARVQYMSFLEGTA,"[(83, 84), (86, 87), (89, 89), (91, 93), (95, 96), (105, 107), (114, 114), (125, 127), (129, 132), (134, 135), (155, 160), (167, 167)]" +CCCCCCCCCCCCCC(=O)O,O,MNAKPGFTDYIVKDIALADFGRKEISLAETEMPGLMATREEYGPKQPLKGARIAGSLHMTIQTAVLIETLAALGADIRWVSCNIYSTQDHAAAAIAAAGIPVFAVKGETLTEYWDYTAKLFDWHGGGTPNMILDDGGDATMLVHAGYRAEQGDTAFLDKPGSEEEEIFYALVKRLLKEKPKGWFAEIAKNIKGVSEETTTGVHRLYEMANKGTLLFPAINVNDSVTKSKFDNLYGCRESLVDGIRRGTDVMLSGKVAMVAGFGDVGKGSAASLRQAGCRVMVSEVDPICALQAAMEGYEVVTMEDAAPRADIFVTATGNKDIITIEHMRAMKDRAIVCNIGHFDNEIQIASLRNLKWTNIKPQVDEIEFPDKHRIIMLSEGRLVNLGNAMGHPSFVMSASFTNQTLAQIELFANNKDSKYAKKVYVLPKTLDEKVARLHLAKIGVKLTELRKDQADYIGVKQEGPYKSDHYRY,"[(58, 58), (82, 83), (135, 136), (196, 197), (201, 204), (222, 222), (227, 228), (230, 231), 
(233, 236), (239, 240), (243, 243), (261, 264), (266, 270), (282, 283), (285, 286), (289, 290), (300, 301), (314, 318), (320, 322), (338, 339), (343, 346), (383, 384), (386, 392), (429, 429), (433, 433), (460, 460), (465, 466), (468, 470), (472, 473)]" +[C@@H]([C@@H]([C@H](C(=O)O)O)O)([C@@H](C(=O)O)O)O,C([C@@H]([C@H](C(=O)[O-])O)O)C(=O)C(=O)[O-],MALSANSDAVTYAKAANTRTAAETGDRIEWVKLSLAFLPLATPVSDAKVLTGRQKPLTEVAIIIAEIRSRDGFEGVGFSYSKRAGGQGIYAHAKEIADNLLGEDPNDIDKIYTKLLWAGASVGRSGMAVQAISPIDIALWDMKAKRAGLPLAKLLGAHRDSVQCYNTSGGFLHTPLDQVLKNVVISRENGIGGIKLKVGQPNCAEDIRRLTAVREALGDEFPLMVDANQQWDRETAIRMGRKMEQFNLIWIEEPLDAYDIEGHAQLAAALDTPIATGEMLTSFREHEQLILGNASDFVQPDAPRVGGISPFLKIMDLAAKHGRKLAPHFAMEVHLHLSAAYPLEPWLEHFEWLNPLFNEQLELRDGRMWISDRHGLGFTLSEQARRWTQLTCEFGKRP,"[(165, 165), (195, 199), (224, 225), (227, 228), (250, 251), (253, 254), (275, 277), (279, 280), (299, 299), (301, 301), (304, 304), (328, 328)]" +S,O=P([O-])([O-])[O-],MALADISGYLDVLDSVRGFSYLENAREVLRSGEARCLGNPRSEPEYVKALYVIGASRIPVGDGCSHTLEELGVFDISVPGEMVFPSPLDFFERGKPTPLVRSRLQLPNGVRVWLKLEWYNPFSLSVKDRPAVEIISRLSRRVEKGSLVADATSSNFGVALSAVARLYGYRARVYLPGAAEEFGKLLPRLLGAQVIVDPEAPSTVHLLPRVMKDSKNEGFVHVNQFYNDANFEAHMRGTAREIFVQSRRGGLALRGVAGSLGTSGHMSAAAFYLQSVDPSIRAVLVQPAQGDSIPGIRRVETGMLWINMLDISYTLAEVTLEEAMEAVVEVARSDGLVIGPSGGAAVKALAKKAAEGDLEPGDYVVVVPDTGFKYLSLVQNALEGAGDSV,"[(123, 123), (125, 127), (152, 154), (156, 160), (182, 182), (224, 225), (230, 231), (234, 234), (238, 238), (259, 260), (266, 269), (285, 285), (293, 297), (303, 306), (339, 340), (342, 345), (366, 366), (368, 369), (373, 374)]" 
+C([C@@H]1[C@H]([C@@H]([C@H]([C@H](O1)OC[C@@H]2[C@H]([C@@H]([C@H]([C@H](O2)O[C@@H]3[C@H](O[C@@H]([C@@H]([C@H]3O)O)O)CO)O)O)O[C@@H]4[C@@H]([C@H]([C@@H]([C@H](O4)CO)O)O)O)O)O)O)O,O=P(O)(O)O[C@H]1O[C@H](CO)[C@@H](O)[C@H](O)[C@H]1O,MLDIVELSRLQFALTAMYHFLFVPLTLGMAFLLAIMETVYVLSGKQIYKDMTKFWGKLFGINFALGVATGLTMEFQFGTNWSYYSHYVGDIFGAPLAIEGLMAFFLESTFVGLFFFGWDRLGKVQHMCVTWLVALGSNLSALWILVANGWMQNPIASDFNFETMRMEMVSFSELVLNPVAQVKFVHTVASGYVTGAMFILGISAWYMLKGRDFAFAKRSFAIAASFGMAAVLSVIVLGDESGYEMGDVQKTKLAAIEAEWETQPAPAAFTLFGIPDQEEETNKFAIQIPYALGIIATRSVDTPVIGLKELMVQHEERIRNGMKAYSLLEQLRSGSTDQAVRDQFNSMKKDLGYGLLLKRYTPNVADATEAQIQQATKDSIPRVAPLYFAFRIMVACGFLLLAIIALSFWSVIRNRIGEKKWLLRAALYGIPLPWIAVEAGWFVAEYGRQPWAIGEVLPTAVANSSLTAGDLIFSMVLICGLYTLFLVAELFLMFKFARLGPSSLKTGRYHFEQSSTTTQPAR,"[(14, 18), (20, 23), (65, 66), (69, 70), (73, 73), (182, 185), (187, 190), (233, 234), (237, 239), (389, 392), (394, 397), (436, 436), (439, 440)]" +Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O)OP(=O)(O)O)[C@@H](O)[C@H]1O,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O)O)[C@@H](O)[C@H]1O,MKVARFQKIPNGENETMIPVLTSKKASELPVSEVASILQADLQNGLNKCEVSHRRAFHGWNEFDISEDEPLWKKYISQFKNPLIMLLLASAVISVLMHQFDDAVSITVAILIVVTVAFVQEYRSEKSLEELSKLVPPECHCVREGKLEHTLARDLVPGDTVCLSVGDRVPADLRLFEAVDLSIDESSLTGETTPCSKVTAPQPAATNGDLASRSNIAFMGTLVRCGKAKGVVIGTGENSEFGEVFKMMQAEEAPKTPLQKSMDLLGKQLSFYSFGIIGIIMLVGWLLGKDILEMFTISVSLAVAAIPEGLPIVVTVTLALGVMRMVKKRAIVKKLPIVETLGCCNVICSDKTGTLTKNEMTVTHIFTSDGLHAEVTGVGYNQFGEVIVDGDVVHGFYNPAVSRIVEAGCVCNDAVIRNNTLMGKPTEGALIALAMKMGLDGLQQDYIRKAEYPFSSEQKWMAVKCVHRTQQDRPEICFMKGAYEQVIKYCTTYQSKGQTLTLTQQQRDVYQQEKARMGSAGLRVLALASGPELGQLTFLGLVGIIDPPRTGVKEAVTTLIASGVSIKMITGDSQETAVAIASRLGLYSKTSQSVSGEEIDAMDVQQLSQIVPKVAVFYRASPRHKMKIIKSLQKNGSVVAMTGDGVNDAVALKAADIGVAMGQTGTDVCKEAADMILVDDDFQTIMSAIEEGKGIYNNIKNFVRFQLSTSIAALTLISLATLMNFPNPLNAMQILWINIIMDGPPAQSLGVEPVDKDVIRKPPRNWKDSILTKNLILKILVSSIIIVCGTLFVFWRELRDNVITPRDTTMTFTCFVFFDMFNALSSRSQTKSVFEIGLCSNRMFCYAVLGSIMGQLLVIYFPPLQKVFQTESLSILDLLFLLGLTSSVCIVAEIIKKVERSREKIQKHVSSTSSSFLEV,"[(83, 84), (86, 87), (90, 90), (106, 
106), (109, 110), (113, 113), (266, 266), (270, 270), (273, 274), (298, 302), (305, 305), (307, 307), (309, 312), (350, 350), (352, 354), (358, 358), (456, 456), (622, 622), (625, 625), (641, 643), (645, 647), (649, 653), (661, 661), (664, 666), (669, 669), (702, 702), (706, 706), (710, 710), (713, 713), (733, 737), (739, 741), (743, 747)]" +CCC(=O)C(=O)[O-],CCC=O,MRSKRFEALAKRPVNQDGFVKEWIEEGFIAMESPNDPKPSIKIVNGAVTELDGKPVSDFDLIDHFIARYGINLNRAEEVMAMDSVKLANMLCDPNVKRSEIVPLTTAMTPAKIVEVVSHMNVVEMMMAMQKMRARRTPSQQAHVTNVKDNPVQIAADAAEGAWRGFDEQETTVAVARYAPFNAIALLVGSQVGRPGVLTQCSLEEATELKLGMLGHTCYAETISVYGTEPVFTDGDDTPWSKGFLASSYASRGLKMRFTSGSGSEVQMGYAEGKSMLYLEARCIYITKAAGVQGLQNGSVSCIGVPSAVPSGIRAVLAENLICSSLDLECASSNDQTFTHSDMRRTARLLMQFLPGTDFISSGYSAVPNYDNMFAGSNEDAEDFDDYNVIQRDLKVDGGLRPVREEDVIAIRNKAARALQAVFAGMGLPPITDEEVEAATYAHGSKDMPERNIVEDIKFAQEIINKNRNGLEVVKALAQGGFTDVAQDMLNIQKAKLTGDYLHTSAIIVGDGQVLSAVNDVNDYAGPATGYRLQGERWEEIKNIPGALDPNEID,"[(140, 145), (168, 169), (171, 172), (186, 186), (200, 200), (202, 202), (208, 208), (219, 220), (222, 223), (257, 261), (294, 295), (297, 298), (300, 300), (331, 335), (359, 361), (363, 364), (374, 374)]" +CC(=O)N[C@@H](CCC(N)=O)C(=O)O,NC(=O)CC[C@H](N)C(=O)O,MTSKGPEEEHPSVTLFRQYLRIRTVQPKPDYGAAVAFFEETARQLGLGCQKVEVAPGYVVTVLTWPGTNPTLSSILLNSHTDVVPVFKEHWSHDPFEAFKDSEGYIYARGAQDMKCVSIQYLEAVRRLKVEGHRFPRTIHMTFVPDEEVGGHQGMELFVQRPEFHALRAGFALDEGIANPTDAFTVFYSERSPWWVRVTSTGRPGHASRFMEDTAAEKLHKVVNSILAFREKEWQRLQSNPHLKEGSVTSVNLTKLEGGVAYNVIPATMSASFDFRVAPDVDFKAFEEQLQSWCQAAGEGVTLEFAQKWMHPQVTPTDDSNPWWAAFSRVCKDMNLTLEPEIMPAATDNRYIRAVGVPALGFSPMNRTPVLLHDHDERLHEAVFLRGVDIYTRLLPALASVPALPSDS,"[(19, 19), (26, 26), (78, 79), (81, 84), (110, 112), (114, 116), (118, 118), (121, 121), (144, 144), (146, 147), (149, 150), (173, 174), (176, 177), (187, 187)]" 
+[C-]#N,CC(C)(O)C#N,MASLPVSFAKPDKNGVITCKAIMLKEAKLPGMSYADTVQIIDIQVDPPQNVELRVKMLCASVCRTDILTIEGFMAPTQFPKINGHEGVGIIESMGPDTKNFKVGDVIVAPTLGECQVCSSCRSGRTNFCQNYGANESALEPDGTSRFSYIDSDGKKKLLYYKLGCSTWTQYMVVDSNYATKLNEIAPELPPPHGSILSCAFATGYGAVWLDAAVQEGDSVAIFGVGSVGISAVIAAKELKAKQIIVVDRNEYKLKMAMELGATHCINSEKLPEGVTPSQAVRKLTPKEVGVDASIESSGYDVFMNEAMKAAIHGKAKTVITGEGIYENDRIFFDFKDFLFGGNVVGNVTGRVRIHSDFPGLLRKAQEPVIRAGMDKILGYDAATMKCKYEVDIREGTPALLKALEEVENVDCVKLVIKLNDY,"[(61, 62), (64, 67), (83, 84), (86, 87), (109, 109), (113, 114), (116, 117), (119, 120), (122, 124), (126, 128), (130, 133), (162, 162), (167, 168), (195, 198), (200, 204), (349, 349), (353, 353), (414, 414)]" +C[N+](C)(C)CCOC(=O)C1=CC=CC=C1,C[N+](C)(C)CCO,MHSKVTIICIRFLFWFLLLCMLIGKSHTEDDIIIATKNGKVRGMNLTVFGGTVTAFLGIPYAQPPLGRLRFKKPQSLTKWSDIWNATKYANSCCQNIDQSFPGFHGSEMWNPNTDLSEDCLYLNVWIPAPKPKNATVLIWIYGGGFQTGTSSLHVYDGKFLARVERVIVVSMNYRVGALGFLALPGNPEAPGNMGLFDQQLALQWVQKNIAAFGGNPKSVTLFGESAGAASVSLHLLSPGSHSLFTRAILQSGSFNAPWAVTSLYEARNRTLNLAKLTGCSRENETEIIKCLRNKDPQEILLNEAFVVPYGTPLSVNFGPTVDGDFLTDMPDILLELGQFKKTQILVGVNKDEGTAFLVYGAPGFSKDNNSIITRKEFQEGLKIFFPGVSEFGKESILFHYTDWVDDQRPENYREALGDVVGDYNFICPALEFTKKFSEWGNNAFFYYFEHRSSKLPWPEWMGVMHGYEIEFVFGLPLERRDNYTKAEEILSRSIVKRWANFAKYGNPNETQNNSTSWPVFKSTEQKYLTLNTESTRIMTKLRAQQCRFWTSFFPKVLEMTGNIDEAEWEWKAGFHRWNNYMMDWKNQFNDYTSKKESCVGL,"[(108, 109), (111, 112), (133, 133), (139, 142), (193, 193), (195, 195), (461, 465), (467, 470), (475, 475), (482, 482)]" +O,CC(C)(N)CO,MVCKVCGQKAQVEMRSRGLALCREHYLDWFVKETERAIRRHRMLLPGERVLVAVSGGKDSLALWDVLSRLGYQAVGLHIELGIGEYSKRSLEVTQAFARERGLELLVVDLKEAYGFGVPELARLSGRVACSACGLSKRYIINQVAVEEGFRVVATGHNLDDEAAVLFGNLLNPQEETLSRQGPVLPEKPGLAARVKPFYRFSEREVLSYTLLRGIRYLHEECPNAKGAKSLLYKEALNLVERSMPGAKLRFLDGFLEKIRPRLDVGEEVALRECERCGYPTTGAVCAFCRMWDAVYRRAKKRKLLPEEVSFRPRVKPLRAG,"[(2, 2), (4, 5), (7, 11), (20, 21), (23, 24), (26, 29), (51, 52), (56, 58), (60, 64), (76, 78), (80, 81), (83, 83), (86, 86), (90, 90), (94, 94), (105, 105), (108, 108), (110, 110), (118, 118), (128, 129), (131, 132), (134, 137), (139, 
141), (154, 155), (157, 160), (162, 165), (198, 198), (203, 203), (206, 206), (210, 210), (212, 212), (218, 218), (220, 221), (234, 234), (272, 273), (275, 276), (278, 281), (284, 285), (287, 288), (290, 293), (311, 311), (313, 316)]" +O,Oc1ccccc1,MKIIRIETSRIAVPLTKPFKTALRTVYTAESVIVRITYDSGAVGWGEAPPTLVITGDSMDSIESAIHHVLKPALLGKSLAGYEAILHDIQHLLTGNMSAKAAVEMALYDGWAQMCGLPLYQMLGGYRDTLETDYTVSVNSPEEMAADAENYLKQGFQTLKIKVGKDDIATDIARIQEIRKRVGSAVKLRLDANQGWRPKEAVTAIRKMEDAGLGIELVEQPVHKDDLAGLKKVTDATDTPIMADESVFTPRQAFEVLQTRSADLINIKLMKAGGISGAEKINAMAEACGVECMVGSMIETKLGITAAAHFAASKRNITRFDFDAPLMLKTDVFNGGITYSGSTISMPGKPGLGIIGAALLKGEKEQ,"[(160, 164), (189, 190), (192, 193), (217, 218), (220, 221), (242, 243), (245, 248), (266, 266), (268, 268), (271, 271), (293, 293), (321, 321)]" +O,C[C@H](CCC(=O)O)[C@H]1CC[C@H]2[C@@H]3CC[C@@H]4C[C@H](O)CC[C@]4(C)[C@H]3C[C@H](O)[C@@]21C,MSSAEEKLFMKALKEKFEESPEEKYTKFYIFGGWKQSERKKEFKEWADKIVEERGVPHYNPDIGVPLGQRKLMSYQVSGTDVFVEGDDLHFVNNAAMQQMWDDIRRTVIVGMDTAHRVLERRLGKEVTPETINEYMETLNHALPGGAVVQEHMVEIHPGLTWDCYAKIITGDLELADEIDDKFLIDIEKLFPEEQAEQLIKAIGNRTYQVCRMPTIVGHVCDGATMYRWAAMQIAMSFICAYKIAAGEAAVSDFAFASKHAEVINMGEMLPARRARGENEPGGVPFGVLADCVQTMRKYPDDPAKVALEVIAAGAMLYDQIWLGSYMSGGVGFTQYATAVYTDNILDDYVYYGLEYVEDKYGIAEAEPSMDVVKDVATEVTLYGLEQYERYPAAMETHFGGSQRAAVCAAAAGCSTAFATGHAQAGLNGWYLSQILHKEGHGRLGFYGYALQDQCGAANSLSVRSDEGLPLELRGPNYPNYAMNVGHLGEYAGIVQAAHAARGDAFCVHPVIKVAFADENLVFDFTEPRKEFAKGALREFEPAGERDLIVPAE,"[(147, 149), (151, 153), (227, 228), (231, 232), (254, 258), (260, 261), (263, 263), (271, 272), (274, 275), (319, 319), (333, 335), (337, 339), (402, 403), (445, 446), (448, 450), (485, 485)]" 
+NC(=O)C1=CN([C@@H]2O[C@H](COP(=O)(O)OP(=O)(O)OC[C@H]3O[C@@H](n4cnc5c(N)ncnc54)[C@H](OP(=O)(O)O)[C@@H]3O)[C@@H](O)[C@H]2O)C=CC1,NC(=O)c1ccc[n+]([C@@H]2O[C@H](COP(=O)(O)OP(=O)(O)OC[C@H]3O[C@@H](n4cnc5c(N)ncnc54)[C@H](OP(=O)(O)O)[C@@H]3O)[C@@H](O)[C@H]2O)c1,MDSKYQCVKLNDGHFMPVLGFGTYAPAEVPKSKALEAVKLAIEAGFHHIDSAHVYNNEEQVGLAIRSKIADGSVKREDIFYTSKLWSNSHRPELVRPALERSLKNLQLDYVDLYLIHFPVSVKPGEEVIPKDENGKILFDTVDLCATWEAMEKCKDAGLAKSIGVSNFNHRLLEMILNKPGLKYKPVCNQVECHPYFNQRKLLDFCKSKDIVLVAYSALGSHREEPWVDPNSPVLLEDPVLCALAKKHKRTPALIALRYQLQRGVVVLAKSYNEQRIRQNVQVFEFQLTSEEMKAIDGLNRNVRYLTLDIFAGPPNYPFSDEY,"[(17, 19), (25, 27), (29, 29), (33, 34), (36, 38), (40, 41), (44, 44), (46, 46), (48, 49), (51, 52), (55, 55), (57, 57), (61, 61), (82, 82), (84, 84), (115, 119), (164, 165), (168, 169), (172, 172), (188, 189), (191, 195), (214, 215), (223, 224), (227, 228), (235, 236), (251, 251), (253, 254), (256, 257), (260, 260), (267, 269), (281, 283), (305, 306), (308, 308), (318, 319)]" +Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O)OP(=O)(O)O)[C@@H](O)[C@H]1O,NC(=O)c1ccc[n+]([C@@H]2O[C@H](COP(=O)(O)OP(=O)(O)OC[C@H]3O[C@@H](n4cnc5c(N)ncnc54)[C@H](O)[C@@H]3O)[C@@H](O)[C@H]2O)c1,MNFYSAYQHGFVRVAACTHHTTIGDPAANAASVLDMARACHDDGAALAVFPELTLSGYSIEDVLLQDSLLDAVEDALLDLVTESADLLPVLVVGAPLRHRHRIYNTAVVIHRGAVLGVVPKSYLPTYREFYERRQMAPGDGERGTIRIGGADVAFGTDLLFAASDLPGFVLHVEICEDMFVPMPPSAEAALAGATVLANLSGSPITIGRAEDRRLLARSASARCLAAYVYAAAGEGESTTDLAWDGQTMIWENGALLAESERFPKGVRRSVADVDTELLRSERLRMGTFDDNRRHHRELTESFRRIDFALDPPAGDIGLLREVERFPFVPADPQRLQQDCYEAYNIQVSGLEQRLRALDYPKVVIGVSGGLDSTHALIVATHAMDREGRPRSDILAFALPGFATGEHTKNNAIKLARALGVTFSEIDIGDTARLMLHTIGHPYSVGEKVYDVTFENVQAGLRTDYLFRIANQRGGIVLGTGDLSELALGWSTYGVGDQMSHYNVNAGVPKTLIQHLIRWVISAGEFGEKVGEVLQSVLDTEITPELIPTGEEELQSSEAKVGPFALQDFSLFQVLRYGFRPSKIAFLAWHAWNDAERGNWPPGFPKSERPSYSLAEIRHWLQIFVQRFYSFSQFKRSALPNGPKVSHGGALSPRGDWRAPSDMSARIWLDQIDREVPKG,"[(52, 52), (58, 58), (125, 126), (128, 130), (177, 177), (201, 202), (204, 208), (210, 214), (230, 230), (232, 232), (243, 244), (350, 353), (355, 358), (361, 
365), (374, 377), (396, 399), (410, 412), (423, 423), (452, 455), (457, 461), (466, 470), (472, 474), (476, 479), (481, 484), (486, 489), (494, 500), (502, 503), (505, 505), (510, 510), (513, 514), (517, 517), (537, 538), (541, 542), (557, 557), (560, 561), (564, 564), (627, 627), (630, 634), (636, 641), (659, 660), (662, 663)]" +CC(C)C[C@H](N)C(=O)O,CC(C)C[C@H](NC(=O)[C@@H](N)Cc1ccccc1)C(=O)O,MKSSAAKQTVLCLNRYAVVALPLAIASFAAFGASPASTLWAPTDTKAFVTPAQVEARSAAPLLELAAGETAHIVVSLKLRDEAQLKQLAQAVNQPGNAQFGKFLKRRQFLSQFAPTEAQVQAVVAHLRKNGFVNIHVVPNRLLISADGSAGAVKAAFNTPLVRYQLNGKAGYANTAPAQVPQDLGEIVGSVLGLQNVTRAHPMLKVGERSAAKTLAAGTAKGHNPTEFPTIYDASSAPTAANTTVGIITIGGVSQTLQDLQQFTSANGLASVNTQTIQTGSSNGDYSDDQQGQGEWDLDSQSIVGSAGGAVQQLLFYMADQSASGNTGLTQAFNQAVSDNVAKVINVSLGWCEADANADGTLQAEDRIFATAAAQGQTFSVSSGDEGVYECNNRGYPDGSTYSVSWPASSPNVIAVGGTTLYTTSAGAYSNETVWNEGLDSNGKLWATGGGYSVYESKPSWQSVVSGTPGRRLLPDISFDAAQGTGALIYNYGQLQQIGGTSLASPIFVGLWARLQSANSNSLGFPAASFYSAISSTPSLVHDVKSGNNGYGGYGYNAGTGWDYPTGWGSLDIAKLSAYIRSNGFGH,"[(51, 52), (63, 63), (69, 74), (106, 106), (111, 112), (115, 115), (146, 146), (149, 150), (220, 220), (222, 222), (231, 231), (268, 268)]" +C1=CC(=CC=C1[N+](=O)[O-])O[C@H]2[C@@H]([C@H]([C@@H]([C@H](O2)CO)O[C@H]3[C@@H]([C@H]([C@@H]([C@H](O3)CO)O)O)O)O)O,OC[C@H]1O[C@@H](O[C@@H]2[C@@H](CO)O[C@@H](O)[C@H](O)[C@H]2O)[C@H](O)[C@@H](O)[C@@H]1O,MEKDTKQVDIIFRSKLPDIYIPNHLPLHSYCFENISEFSSRPCLINGANKQIYTYADVELNSRKVAAGLHKQGIQPKDTIMILLPNSPEFVFAFIGASYLGAISTMANPLFTPAEVVKQAKASSAKIIVTQACHVNKVKDYAFENDVKIICIDSAPEGCLHFSVLTQANEHDIPEVEIQPDDVVALPYSSGTTGLPKGVMLTHKGLVTSVAQQVDGENPNLYIHSEDVMLCVLPLFHIYSLNSVLLCGLRVGAAILIMQKFDIVSFLELIQRYKVTIGPFVPPIVLAIAKSPMVDDYDLSSVRTVMSGAAPLGKELEDTVRAKFPNAKLGQGYGMTEAGPVLAMCLAFAKEPFEIKSGACGTVVRNAEMKIVDPKTGNSLPRNQSGEICIRGDQIMKGYLNDPEATARTIDKEGWLYTGDIGYIDDDDELFIVDRLKELIKYKGFQVAPAELEALLLNHPNISDAAVVPMKDEQAGEVPVAFVVRSNGSTITEDEVKDFISKQVIFYKRIKRVFFVDAIPKSPSGKILRKDLRAKLAAGLPN,"[(187, 188), (194, 196), (198, 199), (201, 201), (212, 213), (220, 221), (229, 229), (232, 233), (237, 238), (240, 242), (244, 
245), (247, 248), (258, 259), (261, 262), (280, 281), (283, 283), (306, 308), (310, 313), (317, 317), (329, 330), (333, 335), (337, 343), (345, 346), (348, 348), (358, 364), (386, 388), (399, 399), (402, 402), (405, 405), (408, 409), (417, 419), (421, 422), (432, 434), (436, 436), (438, 440), (442, 442), (445, 445), (447, 448), (474, 477), (507, 508), (520, 522), (524, 525), (527, 528), (533, 533)]" +O=C[C@H](O)[C@@H](O)[C@@H](O)CO,O=C(CO)[C@@H](O)[C@@H](O)CO,MEMKKSGLGTTAIHAGTLKNLYGTLAMPIYQTSTFIFDSAEQGGRRFALEEAGYIYTRLGNPTTTVLENKIAALEEGEAGIAMSSGMGAISSTLWTVLKAGDHVVTDKTLYGCTFALMNHGLTRFGVEVTFVDTSNLEEVKNAMKKNTRVVYLETPANPNLKIVDLEALSKIAHTNPNTLVIVDNTFATPYMQKPLKLGVDIVVHSATKYLNGHGDVIAGLVVTRQELADQIRFVGLKDMTGAVLGPQEAYYIIRGLKTFEIRMERHCKNARTIVDFLNKHPKVEKVYYPGLETHPGYEIAKKQMKDFGAMISFELKGGFEAGKTLLNNLKLCSLAVSLGDTETLIQHPASMTHSPYTKEEREVAGITDGLVRLSVGLENVEDIIADLEQGLEKI,"[(30, 30), (32, 35), (54, 55), (59, 61), (84, 85), (88, 92), (111, 111), (113, 114), (117, 117), (184, 184), (186, 186), (204, 205), (210, 213), (215, 215), (217, 221), (237, 239), (242, 242), (244, 245), (247, 247), (253, 253), (338, 339), (341, 341)]" +CC/C=C\C/C=C\C/C=C\CCCCCCCC(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)(O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)O,CC(C)(COP(=O)(O)OP(=O)(O)OC[C@H]1O[C@@H](n2cnc3c(N)ncnc32)[C@H](O)[C@@H]1OP(=O)(O)O)[C@@H](O)C(=O)NCCC(=O)NCCS,MMTTSLIWGIAIAACCCLWLILGIRRRQTGEPPLENGLIPYLGCALQFGANPLEFLRANQRKHGHVFTCKLMGKYVHFITNPLSYHKVLCHGKYFDWKKFHFATSAKAFGHRSIDPMDGNTTENINDTFIKTLQGHALNSLTESMMENLQRIMRPPVSSNSKTAAWVTEGMYSFCYRVMFEAGYLTIFGRDLTRRDTQKAHILNNLDNFKQFDKVFPALVAGLPIHMFRTAHNAREKLAESLRHENLQKRESISELISLRMFLNDTLSTFDDLEKAKTHLVVLWASQANTIPATFWSLFQMIRNPEAMKAATEEVKRTLENAGQKVSLEGNPICLSQAELNDLPVLDSIIKESLRLSSASLNIRTAKEDFTLHLEDGSYNIRKDDIIALYPQLMHLDPEIYPDPLTFKYDRYLDENGKTKTTFYCNGLKLKYYYMPFGSGATICPGRLFAIHEIKQFLILMLSYFELELIEGQAKCPPLDQSRAGLGILPPLNDIEFKYKFKHL,"[(134, 134), (437, 438), (442, 443), (445, 447)]" 
+CC(=O)CC(=O)SCCNC(=O)CCNC(=O)[C@H](O)C(C)(C)COP(=O)(O)OP(=O)(O)OC[C@H]1O[C@@H](n2cnc3c(N)ncnc32)[C@H](O)[C@@H]1OP(=O)(O)O,C[C@](CC(=O)O)(CC(=O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)(O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)O)O,MARSRGERTPAARRITSRNARFQQWQALLGNRNKRTRAGEFLVMGVRPISLAVEHGWPVRTLLYDGQRELSKWARELLRTVRTEQIAMAPDLLMELGEKNEAPPEVVAVVEMPADDLDRIPVREDFLGVLFDRPTSPGNIGSIIRSADALGAHGLIVAGHAADVYDPKSVRSSTGSLFSLPAVRVPSPGEVMDWVEARRAAGTPIVLVGTDEHGDCDVFDFDFTQPTLLLIGNETAGLSNAWRTLCDYTVSIPMAGSASSLNAANAATAILYEAVRQRISGRTATTP,"[(131, 132), (208, 209), (212, 214), (217, 219), (229, 231), (233, 234), (237, 238), (243, 243), (249, 251), (255, 256), (259, 261), (263, 263), (266, 266), (269, 270)]" +N[C@@H](CC(=O)[O-])C(=O)[O-],O=P([O-])([O-])OP(=O)([O-])[O-],MPSASASRKSQEKPREIMDAAEDYAKERYGISSMIQSQEKPDRVLVRVRDLTIQKADEVVWVRARVHTSRAKGKQCFLVLRQQQFNVQALVAVGDHASKQMVKFAANINKESIVDVEGVVRKVNQKIGSCTQQDVELHVQKIYVISLAEPRLPLQLDDAVRPEAEGEEEGRATVNQDTRLDNRVIDLRTSTSQAVFRLQSGICHLFRETLINKGFVEIQTPKIISAASEGGANVFTVSYFKNNAYLAQSPQLYKQMCICADFEKVFSIGPVFRAEDSNTHRHLTEFVGLDIEMAFNYHYHEVMEEIADTMVQIFKGLQERFQTEIQTVNKQFPCEPFKFLEPTLRLEYCEALAMLREAGVEMGDEDDLSTPNEKLLGHLVKEKYDTDFYILDKYPLAVRPFYTMPDPRNPKQSNSYDMFMRGEEILSGAQRIHDPQLLTERALHHGIDLEKIKAYIDSFRFGAPPHAGGGIGLERVTMLFLGLHNVRQTSMFPRDPKRLTP,"[(199, 199), (202, 203), (251, 251), (254, 255), (272, 272), (284, 289), (291, 292), (373, 373), (399, 399), (402, 404), (416, 419), (422, 423), (425, 426), (428, 430), (432, 433), (466, 471), (476, 480), (486, 486), (489, 490)]" 
+C[C@@H](C(=O)N[C@@H](CCC(=O)N[C@@H](CCC(=O)O)C(=O)O)C(=O)O)OP(=O)(O)OC[C@H]([C@H]([C@H](CN1C2=CC(=O)C=CC2=CC3=C1NC(=O)NC3=O)O)O)O,NC(=O)c1ccc[n+]([C@@H]2O[C@H](COP(=O)(O)OP(=O)(O)OC[C@H]3O[C@@H](n4cnc5c(N)ncnc54)[C@H](OP(=O)(O)O)[C@@H]3O)[C@@H](O)[C@H]2O)c1,MSTLRLLISDSYDPWFNLAVEECIFRQMPATQRVLFLWRNADTVVIGRAQNPWKECNTRRMEEDNVRLARRSSGGGAVFHDLGNTCFTFMAGKPEYDKTISTSIVLNALNALGVSAEASGRNDLVVKTVEGDRKVSGSAYRETKDRGFHHGTLLLNADLSRLANYLNPDKKKLAAKGITSVRSRVTNLTELLPGITHEQVCEAITEAFFAHYGERVEAEIISPNKTPDLPNFAETFARQSSWEWNFGQAPAFSHLLDERFTWGGVELHFDVEKGHITRAQVFTDSLNPAPLEALAGRLQGCLYRADMLQQECEALLVDFPEQEKELRELSAWMAGAVR,"[(21, 21), (39, 45), (47, 47), (49, 49), (54, 54), (65, 65), (69, 70), (72, 75), (80, 81), (83, 83), (85, 85), (123, 125), (132, 133), (135, 136), (140, 140), (147, 147), (151, 153), (157, 157), (160, 161), (164, 165), (179, 179), (185, 185), (187, 187), (243, 243)]" +O=[N+]([O-])c1ccc(Cl)c([N+](=O)[O-])c1,C1=CC(=C(C=C1[N+](=O)[O-])[N+](=O)[O-])SC[C@@H](C(=O)NCC(=O)O)NC(=O)CC[C@@H](C(=O)O)N,MPNYKLTYFNMRGRAEIIRYIFAYLDIQYEDHRIEQADWPEIKSTLPFGKIPILEVDGLTLHQSLAIARYLTKNTDLAGNTEMEQCHVDAIVDTLDDFMSCFPWAEKKQDVKEQMFNELLTYNAPHLMQDLDTYLGGREWLIGNSVTWADFYWEICSTTLLVFKPDLLDNHPRLVTLRKKVQAIPAVANWIKRRPQTKL,"[(6, 7), (9, 13), (15, 19), (32, 38), (40, 43), (46, 48), (52, 53), (61, 62), (65, 69), (96, 96), (99, 100), (104, 104), (152, 152)]" 
+O,O=P([O-])([O-])[O-],MSSSNVEVFIPVSQGNTNGFPATASNDLKAFTEGAVLSFHNICYRVKLKSGFLPCRKPVEKEILSNINGIMKPGLNAILGPTGGGKSSLLDVLAARKDPSGLSGDVLINGAPRPANFKCNSGYVVQDDVVMGTLTVRENLQFSAALRLATTMTNHEKNERINRVIQELGLDKVADSKVGTQFIRGVSGGERKRTSIGMELITDPSILFLDEPTTGLDSSTANAVLLLLKRMSKQGRTIIFSIHQPRYSIFKLFDSLTLLASGRLMFHGPAQEALGYFESAGYHCEAYNNPADFFLDIINGDSTAVALNREEDFKATEIIEPSKQDKPLIEKLAEIYVNSSFYKETKAELHQLSGGEKKKKITVFKEISYTTSFCHQLRWVSKRSFKNLLGNPQASIAQIIVTVVLGLVIGAIYFGLKNDSTGIQNRAGVLFFLTTNQCFSSVSAVELFVVEKKLFIHEYISGYYRVSSYFLGKLLSDLLPMRMLPSIIFTCIVYFMLGLKPKADAFFVMMFTLMMVAYSASSMALAIAAGQSVVSVATLLMTICFVFMMIFSGLLVNLTTIASWLSWLQYFSIPRYGFTALQHNEFLGQNFCPGLNATGNNPCNYATCTGEEYLVKQGIDLSPWGLWKNHVALACMIVIFLTIAYLKLLFLKKYS,"[(63, 64), (78, 79), (88, 91), (123, 123), (126, 126), (128, 128), (136, 136), (170, 170), (173, 173), (176, 180), (182, 183), (191, 194), (209, 210), (212, 216), (241, 242), (244, 245), (258, 263), (295, 295), (298, 299)]" +Cc1cn([C@H]2C[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O)OP(=O)(O)O)O2)c(=O)[nH]c1=O,Cc1cn([C@H]2C[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O)O)O2)c(=O)[nH]c1=O,MKAFILAAGSGERLEPITHTRPKAFVPILSKPLIEYQIEYLRKCGIRDITVIVSSKNKEYFEKKLKEISIVTQKDDIKGTGAAILSAKFNDEALIIYGDLFFSNEKEICNIITLKENAIIGVKVSNPKDYGVLVLDNQNNLSKIIEKPEIPPSNLINAGIYKLNSDIFTYLDKISISERGELELTDAINLMAKDHRVKVIEYEGYWMDIGKPWNIIDVNKWALDNLVFSQNLGNVEDNVKIKGKVIIEEDAEIKSGTYIEGPVYIGKGSEIGPNSYLRPYTILVEKNKIGASVEVKESVIMEGSKIPHLSYVGDSVIAEDVNFGAGTLIANLRFDEKEVKVNVKGKRISSGRRKLGAFIGGHVRTGINVTILPGVKIGAYARIYPGAVVNRDVGYGEFFKV,"[(5, 7), (14, 15), (18, 18), (23, 23), (37, 37), (52, 57), (71, 72), (74, 75), (77, 78), (81, 84), (86, 86), (95, 96), (98, 100), (119, 122), (129, 130), (132, 133), (144, 145), (147, 148), (155, 156), (158, 161), (181, 185), (206, 206), (208, 212), (343, 344)]" 
+N[C@@H](Cc1ccncc1)C(=O)O,N,MKTLSQAQSKTSSQQFSFTGNSSANVIIGNQKLTINDVARVARNGTLVSLTNNTDILQGIQASCDYINNAVESGEPIYGVTSGFGGMANVAISREQASELQTNLVWFLKTGAGNKLPLADVRAAMLLRANSHMRGASGIRLELIKRMEIFLNAGVTPYVYEFGSIGASGDLVPLSYITGSLIGLDPSFKVDFNGKEMDAPTALRQLNLSPLTLLPKEGLAMMNGTSVMTGIAANCVYDTQILTAIAMGVHALDIQALNGTNQSFHPFIHNSKPHPGQLWAADQMISLLANSQLVRDELDGKHDYRDHELIQDRYSLRCLPQYLGPIVDGISQIAKQIEIEINSVTDNPLIDVDNQASYHGGNFLGQYVGMGMDHLRYYIGLLAKHLDVQIALLASPEFSNGLPPSLLGNRERKVNMGLKGLQICGNSIMPLLTFYGNSIADRFPTHAEQFNQNINSQGYTSATLARRSVDIFQNYVAIALMFGVQAVDLRTYKKTGHYDARACLSPATERLYSAVRHVVGQKPTSDRPYIWNDNEQGLDEHIARISADIAAGGVIVQAVQDILPCLH,"[(128, 128), (171, 171), (174, 174), (216, 216), (218, 222), (224, 225), (263, 263), (295, 295), (310, 310), (312, 316), (318, 321), (345, 346), (348, 349), (358, 361), (363, 363), (405, 405), (416, 418), (420, 423), (446, 447), (449, 450), (452, 453)]" +CSCC[C@H](N)C(=O)O,CS,MSVHKTNDAFKVLMNSAKEPIVEDIPKKYRKQSFRDNLKVYIESPESYKNVIYYDDDVVLVRDMFPKSKMHLLLMTRDPHLTHVHPLEIMMKHRSLVEKLVSYVQGDLSGLIFDEARNCLSQQLTNEALCNYIKVGFHAGPSMNNLHLHIMTLDHVSPSLKNSAHYISFTSPFFVKIDTPTSNLPTRGTLTSLFQEDLKCWRCGETFGRHFTKLKAHLQEEYDDWLDKSVSM,"[(166, 166), (198, 199), (201, 202), (204, 205), (207, 207), (213, 216), (218, 220), (222, 226)]" +CCCCCCCC/C=C\CCCCCCCC(=O)O,CC(C)(N)CO,MRRLSSWRKMATAEKQKHDGRVKIGHYILGDTLGVGTFGKVKVGKHELTGHKVAVKILNRQKIRSLDVVGKIRREIQNLKLFRHPHIIKLYQVISTPSDIFMVMEYVSGGELFDYICKNGRLDEKESRRLFQQILSGVDYCHRHMVVHRDLKPENVLLDAHMNAKIADFGLSNMMSDGEFLRTSCGSPNYAAPEVISGRLYAGPEVDIWSSGVILYALLCGTLPFDDDHVPTLFKKICDGIFYTPQYLNPSVISLLKHMLQVDPMKRAAIKDIREHEWFKQDLPKYLFPEDPSYSSTMIDDEALKEVCEKFECSEEEVLSCLYNRNHQDPLAVAYHLIIDNRRIMNEAKDFYLATSPPDSFLDDHHLTRPHPERVPFLVAETPRARHTLDELNPQKSKHQGVRKAKWHLGIRSQSRPNDIMAEVCRAIKQLDYEWKVVNPYYLRVRRKNPVTSTFSKMSLQLYQVDSRTYLLDFRSIDDEITEAKSGTATPQRSGSISNYRSCQRSDSDAEAQGKPSDVSLTSSVTSLDSSPVDVAPRPGSHTIEFFEMCANLIKILAQ,"[(12, 12), (15, 19), (31, 32), (42, 44), (52, 55), (57, 59), (80, 80), (95, 95)]" 
+[C@H](C(=O)O)(NC(=O)N)O,O=C=O,MESLKRFLCSIALLLISLLLPSSLAQQQQHESIRTMEDFSGYPIHEPGQFGSINLASSLSVDAPGLQNQIDELSSFSDAPSPSVTRVLYTDKDVSARRYVKNLMALAGLTVREDAVGNIFGKWDGLEPNLPAVATGSHIDAIPYSGKYDGVVGVLGAIEAINVLKRSGFKPKRSLEIILFTSEEPTRFGISCLGSRLLAGSKELAEALKTTVVDGQNVSFIEAARSAGYAEDKDDDLSSVFLKKGSYFAFLELHIEQGPILEDEGLDIGVVTAIAAPASLKVEFEGNGGHAGAVLMPYRNDAGLAAAELALAVEKHVLESESIDTVGTVGILELHPGAINSIPSKSHLEIDTRDIDEARRNTVIKKIQESANTIAKKRKVKLSEFKIVNQDPPALSDKLVIKKMAEAATELNLSHKMMISRAYHDSLFMARISPMGMIFIPCYKGYSHKPEEYSSPEDMANGVKVLSLTLAKLSLD,"[(136, 137), (139, 142), (144, 148), (150, 152), (154, 154), (179, 180), (182, 183), (185, 187), (190, 192), (252, 253), (255, 257), (423, 425), (437, 440), (446, 447), (449, 450), (452, 452)]" +Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O)OP(=O)(O)O)[C@@H](O)[C@H]1O,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O)O)[C@@H](O)[C@H]1O,MVKDTYISSASKTPPMERTVRVTGMTCAMCVKSIETAVGSLEGVEEVRVNLATETAFIRFDEKRIDFETIKRVIEDLGYGVVDEQAAVSAEVEHLSRMKRKLYVAAFAGVLLLFLAHFISLPYEDFVQLLIALPAIFYSGSSIFKAAFSALRRRTLNMDVMYSMGVGAAFLASVLSTAGVLPREYSFYETSVLLLAFLLLGRTLEARAKSRTGEAIKKLVGLQAKTAVVIRDGKEIAVPVEEVAVGDIVIVRPGEKIPVDGVVVEGESYVDESMISGEPVPVLKSKGDEVFGATINNTGVLKIRATRVGGETLLAQIVKLVEDAMGSKPPIQRLADKVVAYFIPTVLLVAISAFIYWYFIAHAPLLFAFTTLIAVLVVACPCAFGLATPTALTVGMGKGAELGILIKNADALEVAEKVTAVIFDKTGTLTKGKPEVTDLVPLNGDERELLRLAAIAERRSEHPIAEAIVKKALEHGIELGEPEKVEVIAGEGVVADGILVGNKRLMEDFGVAVSNEVELALEKLEREAKTAVIVARNGRVEGIIAVSDTLKESAKPAVQELKRMGIKVGMITGDNWRSAEAISRELNLDLVIAEVLPHQKSEEVKKLQAKEVVAFVGDGINDAPALAQADLGIAVGSGSDVAVESGDIVLIRDDLRDVVAAIQLSRKTMSKIKQNIFWALIYNVILIPAAAGLLYPIFGVVFRPEFAGLAMAMSSVSVVANSLLLRNYVPPIRRGGDSVEKIVLELSGLSCHHCVARVKKALEEAGAKVEKVDLNEAVVAGNKEDVDKYIKAVEAAGYQAKLRS,"[(223, 223), (240, 240), (258, 260), (266, 267), (269, 269), (272, 274), (278, 278), (281, 289), (291, 293), (297, 298), (309, 309), (312, 312), (314, 315), (318, 318)]" 
+O=C(O)[C@@H](CO)OP(=O)(O)O,O=C(O)[C@H](O)COP(=O)(O)O,MSKKPVALIILDGFALRDETYGNAVAQANKPNFDRYWNEYPHTTLKACGEAVGLPEGQMGNSEVGHLNIGAGRIVYQSLTRINIAIREGEFDRNETFLAAMNHVKQHGTSLHLFGLLSDGGVHSHIHHLYALLRLAAKEGVKRVYIHGFLDGRDVGPQTAPQYIKELQEKIKEYGVGEIATLSGRYYSMDRDKRWDRVEKAYRAMVYGEGPTYRDPLECIEDSYKHGIYDEFVLPSVIVREDGRPVATIQDNDAIIFYNFRPDRAIQISNTFTNEDFREFDRGPKHPKHLFFVCLTHFSETVKGYVAFKPTNLDNTIGEVLSQHGLRQLRIAETEKYPHVTFFMSGGREEKFPGEDRILINSPKVPTYDLKPEMSAYEVTDALLKEIEADKYDAIILNYANPDMVGHSGKLEPTIKAVEAVDECLGKVVDAILAKGGIAIITADHGNADEVLTPDGKPQTAHTTNPVPVIVTKKGIKLRDGGILGDLAPTMLDLLGLPQPKEMTGKSLIVK,"[(10, 11), (13, 15), (47, 47), (59, 61), (63, 67), (69, 69), (122, 122), (153, 153), (231, 231), (261, 261), (336, 336), (339, 339), (343, 343), (398, 402), (404, 406), (408, 409), (442, 443), (446, 447), (459, 461), (463, 464), (467, 467), (484, 484)]" +Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O)OP(=O)(O)O)[C@@H](O)[C@H]1O,C[C@H](C(=O)OP(=O)([O-])OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)O)[NH3+],MKLLEQIEKWAAETPDQTAFVWRDAKITYKQLKEDSDALAHWISSEYPDDRSPIMVYGHMQPEMIINFLGCVKAGHAYIPVDLSIPADRVQRIAENSGAKLLLSATAVTVTDLPVRIVSEDNLKDIFFTHKGNTPNPEHAVKGDENFYIIYTSGSTGNPKGVQITYNCLVSFTKWAVEDFNLQTGQVFLNQAPFSFDLSVMDIYPSLVTGGTLWAIDKDMIARPKDLFASLEQSDIQVWTSTPSFAEMCLMEASFSESMLPNMKTFLFCGEVLPNEVARKLIERFPKATIMNTYGPTEATVAVTGIHVTEEVLDQYKSLPVGYCKSDCRLLIMKEDGTIAPDGEKGEIVIVGPSVSVGYLGSPELTEKAFTMIDGERAYKTGDAGYVENGLLFYNGRLDFQIKLHGYRMELEEIEHHLRACSYVEGAVIVPIKKGEKYDYLLAVVVPGEHSFEKEFKLTSAIKKELNERLPNYMIPRKFMYQSSIPMTPNGKVDRKKLLSEVTA,"[(89, 89), (148, 148), (150, 151), (154, 154), (159, 160), (162, 162), (164, 164), (172, 172), (175, 175), (180, 180), (195, 196), (198, 201), (242, 242), (244, 244), (268, 273), (290, 291), (298, 300), (302, 306), (308, 308), (318, 321), (345, 348), (350, 350), (359, 359), (379, 379), (381, 382), (384, 386), (392, 393), (398, 401), (410, 410), (412, 413), (486, 488), (490, 491), (493, 494)]" 
+Nc1nc2c(ncn2[C@@H]2O[C@H](COP(=O)(O)OP(=O)(O)OP(=O)(O)O)[C@@H](O)[C@H]2O)c(=O)[nH]1,O=P([O-])([O-])[O-],MELSEGELSHTSSSSSFVPVDQRQLQDAIQIIDENKHFNTGILDYINKTSPADVGNNYHIISVFGSQSTGKSTLLNRLFNTNFDVMDESNRQQTTKGIWLAYSPVVSTTLGHTTSKSNILVMDVEGTDGRERGEDQDFERKAALFALSTSEVLIINIWETQVGLYQGANMGLLKTVFEVNLSLFGKSKLETHNDHKVLLLIVIRDHVGVTPVESLAKTFTSDLQNMWSSLAKPAELEHLQFADFFDVTFHALNHKVLQPKEFGEGINRLDDRLVVSNELFKPEYHHDVPIDGWTMYAERCWEQIETNKDLDLPTQQILVAQFKCDEIVESVFQEFLAKYQHHFKEVDAAPDFEELGALFADLRQDAFEDYDASASRYNKAVYEQKRKKLRWLINDKLKEVFDVHAKNLCNTLLEKFEKDLVALKGKDFAVNVKTLSTKLVEDVNFQVSLMSLQGDLSLDEIILALTKDIDAIVAKQQVIELNSIVNKSVKKLSASLSKSIQFELGDPNEETWDNVLQQFKGVYEKFGGDFGLGTSSTQNQQAIEKFKFKSWCQFYDVTHKLISREKLLALLQDRFDDKFRYDENGLPKLYLNEQDLEKTFAVAKQHALQVLPILTFAKLADGSEIVPDYDIFDSKLREQFLGGYDDSDDEEDHCFAEIITEQEKSEVLAKFKKEVDAKYIETKRSIVQHITQIPYYIYLIILVLGWNEFMAIIRNPLFFSLSIVLGATVYVLYYLGLLRPALVVAQRTMDEVIVMAKTKLREVLIDDHEVTGRQLNKMAGSKENIELDDM,"[(63, 64), (73, 77), (83, 86), (123, 128), (156, 156), (158, 158), (169, 169), (172, 172), (204, 204)]" +O=C1OC2(c3ccc(O)cc3Oc3cc(O)ccc32)c2ccccc21,C(C(=N)C(=O)O)C(=O)O,MIYIIGSGIAGLSAGVALRRAGKKVTLISKRIDGGSTPIAKGGVAASVGSDDSPELHAQDTIRVGDGLCDVKTVNYVTSEAKNVIETFESWGFEFEEDLRLEGGHTKRRVLHRTDETGREIFNFLLKLAREEGIPIIEDRLVEIRVKDGKVTGFVTEKRGLVEDVDKLVLATGGYSYLYEYSSTQSTNIGDGMAIAFKAGTILADMEFVQFHPTVTSLDGEVFLLTETLRGEGAQIINENGERFLFNYDKRGELAPRDILSRAIYIEMLKGHKVFIDLSKIEDFERKFPVVAKYLARHGHNYKVKIPIFPAAHFVDGGIRVNIRGESNIVNLYAIGEVSDSGLHGANRLASNSLLEGLVFGINLPRYVDSSWEGISTDDGIVHSVRISGNKTLSLKEIRRINWENVGIIRNEEKLVKAINTYSSSTQNEAIISYLTALAAEIRKESRGNHFREDYPYKDPNWEKRIYFKLVV,"[(5, 6), (11, 15), (27, 28), (30, 35), (38, 41), (44, 45), (102, 102), (110, 112), (117, 119), (121, 122), (125, 125), (137, 140), (171, 176), (184, 184), (187, 190), (192, 196), (313, 315), (317, 317), (335, 336), (338, 339), (344, 344), (348, 349), (351, 352), (355, 358), (361, 361), (428, 428)]" 
+O,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O)O)[C@@H](O)[C@H]1O,MNSPGGRGKKKGSGGASNPVPPRPPPPCLAPAPPAAGPAPPPESPHKRNLYYFSYPLFVGFALLRLVAFHLGLLFVWLCQRFSRALMAAKRSSGAAPAPASASAPAPVPGGEAERVRVFHKQAFEYISIALRIDEDEKAGQKEQAVEWYKKGIEELEKGIAVIVTGQGEQCERARRLQAKMMTNLVMAKDRLQLLEKMQPVLPFSKSQTDVYNDSTNLACRNGHLQSESGAVPKRKDPLTHTSNSLPRSKTVMKTGSAGLSGHHRAPSYSGLSMVSGVKQGSGPAPTTHKGTPKTNRTNKPSTPTTATRKKKDLKNFRNVDSNLANLIMNEIVDNGTAVKFDDIAGQDLAKQALQEIVILPSLRPELFTGLRAPARGLLLFGPPGNGKTMLAKAVAAESNATFFNISAASLTSKYVGEGEKLVRALFAVARELQPSIIFIDEVDSLLCERREGEHDASRRLKTEFLIEFDGVQSAGDDRVLVMGATNRPQELDEAVLRRFIKRVYVSLPNEETRLLLLKNLLCKQGSPLTQKELAQLARMTDGYSGSDLTALAKDAALGPIRELKPEQVKNMSASEMRNIRLSDFTESLKKIKRSVSPQTLEAYIRWNKDFGDTTV,"[(114, 115), (118, 119), (121, 122), (124, 124), (150, 150), (153, 153), (155, 155), (157, 157), (189, 189), (192, 192)]" +O=P([O-])([O-])[O-],NC(=O)C1=CN([C@@H]2O[C@H](COP(=O)(O)OP(=O)(O)OC[C@H]3O[C@@H](n4cnc5c(N)ncnc54)[C@H](O)[C@@H]3O)[C@@H](O)[C@H]2O)C=CC1,MALVRALVCCLLTAWHCRSGLGLPVAPAGGRNPPPAIGQFWHVTDLHLDPTYHITDDHTKVCASSKGANASNPGPFGDVLCDSPYQLILSAFDFIKNSGQEASFMIWTGDSPPHVPVPELSTDTVINVITNMTTTIQSLFPNLQVFPALGNHDYWPQDQLPVVTSKVYNAVANLWKPWLDEEAISTLRKGGFYSQKVTTNPNLRIISLNTNLYYGPNIMTLNKTDPANQFEWLESTLNNSQQNKEKVYIIAHVPVGYLPSSQNITAMREYYNEKLIDIFQKYSDVIAGQFYGHTHRDSIMVLSDKKGSPVNSLFVAPAVTPVKSVLEKQTNNPGIRLFQYDPRDYKLLDMLQYYLNLTEANLKGESIWKLEYILTQTYDIEDLQPESLYGLAKQFTILDSKQFIKYYNYFFVSYDSSVTCDKTCKAFQICAIMNLDNISYADCLKQLYIKHNY,"[(43, 44), (46, 46), (48, 49), (82, 83), (88, 88), (108, 109), (111, 114), (148, 150), (152, 154), (157, 160), (208, 208), (210, 211), (214, 214), (250, 251), (253, 254), (266, 266), (291, 292), (294, 294), (296, 297), (316, 320), (322, 322), (331, 331), (410, 410)]" 
+NCCC[C@H](N)C(=O)O,O=P([O-])([O-])[O-],MARTVVLITGCSSGIGLHLAVRLASDPSQSFKVYATLRDLKTQGRLWEAARALACPPGSLETLQLDVRDSKSVAAARERVTEGRVDVLVCNAGLGLLGPLEALGEDAVASVLDVNVVGTVRMLQAFLPDMKRRGSGRVLVTGSVGGLMGLPFNDVYCASKFALEGLCESLAVLLLPFGVHLSLIECGPVHTAFMEKVLGSPEEVLDRTDIHTFHRFYQYLAHSKQVFREAAQNPEEVAEVFLTALRAPKPTLRYFTTERFLPLLRMRLDDPSGSNYVTAMHREVFGDVPAKAEAGAEAGGGAGPGAEDEAGRGAVGDPELGDPPAAPQ,"[(1, 9), (39, 42), (44, 45), (48, 49), (52, 55), (57, 65), (67, 68), (72, 72), (78, 81), (88, 88), (90, 91), (112, 113), (116, 117), (120, 121), (140, 140), (144, 148), (155, 159), (161, 164), (185, 185), (188, 188), (190, 190), (233, 234), (236, 238), (240, 241), (244, 245), (266, 266)]" +O,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O)O)[C@@H](O)[C@H]1O,MFLKVRAEKRLGNFRLNVDFEMGRDYCVLLGPTGAGKSVFLELIAGIVKPDRGEVRLNGADITPLPPERRGIGFVPQDYALFPHLSVYRNIAYGLRNVERVERDRRVREMAEKLGIAHLLDRKPARLSGGERQRVALARALVIQPRLLLLDEPLSAVDLKTKGVLMEELRFVQREFDVPILHVTHDLIEAAMLADEVAVMLNGRIVEKGKLKELFSAKNGEVAEFLSARNLLLKVSKILD,"[(14, 14), (16, 16), (29, 30), (39, 42), (77, 77), (151, 152), (183, 185), (200, 204), (225, 225)]" 
+Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O)OP(=O)(O)O)[C@@H](O)[C@H]1O,O=P([O-])([O-])[O-],MELEEDLKGRADKNFSKMGKKSKKEKKEKKPAVSVLTMFRYAGWLDRLYMLVGTLAAIIHGVALPLMMLIFGDMTDSFASVGNVSKNSTNMSEADKRAMFAKLEEEMTTYAYYYTGIGAGVLIVAYIQVSFWCLAAGRQIHKIRQKFFHAIMNQEIGWFDVHDVGELNTRLTDDVSKINEGIGDKIGMFFQAMATFFGGFIIGFTRGWKLTLVILAISPVLGLSAGIWAKILSSFTDKELHAYAKAGAVAEEVLAAIRTVIAFGGQKKELERYNNNLEEAKRLGIKKAITANISMGAAFLLIYASYALAFWYGTSLVISKEYSIGQVLTVFFSVLIGAFSVGQASPNIEAFANARGAAYEVFKIIDNKPSIDSFSKSGHKPDNIQGNLEFKNIHFSYPSRKEVQILKGLNLKVKSGQTVALVGNSGCGKSTTVQLMQRLYDPLDGMVSIDGQDIRTINVRYLREIIGVVSQEPVLFATTIAENIRYGREDVTMDEIEKAVKEANAYDFIMKLPHQFDTLVGERGAQLSGGQKQRIAIARALVRNPKILLLDEATSALDTESEAVVQAALDKAREGRTTIVIAHRLSTVRNADVIAGFDGGVIVEQGNHDELMREKGIYFKLVMTQTAGNEIELGNEACKSKDEIDNLDMSSKDSGSSLIRRRSTRKSICGPHDQDRKLSTKEALDEDVPPASFWRILKLNSTEWPYFVVGIFCAIINGGLQPAFSVIFSKVVGVFTNGGPPETQRQNSNLFSLLFLILGIISFITFFLQGFTFGKAGEILTKRLRYMVFKSMLRQDVSWFDDPKNTTGALTTRLANDAAQVKGATGSRLAVIFQNIANLGTGIIISLIYGWQLTLLLLAIVPIIAIAGVVEMKMLSGQALKDKKELEGSGKIATEAIENFRTVVSLTREQKFETMYAQSLQIPYRNAMKKAHVFGITFSFTQAMMYFSYAACFRFGAYLVTQQLMTFENVLLVFSAIVFGAMAVGQVSSFAPDYAKATVSASHIIRIIEKTPEIDSYSTQGLKPNMLEGNVQFSGVVFNYPTRPSIPVLQGLSLEVKKGQTLALVGSSGCGKSTVVQLLERFYDPMAGSVFLDGKEIKQLNVQWLRAQLGIVSQEPILFDCSIAENIAYGDNSRVVSYEEIVRAAKEANIHQFIDSLPDKYNTRVGDKGTQLSGGQKQRIAIARALVRQPHILLLDEATSALDTESEKVVQEALDKAREGRTCIVIAHRLSTIQNADLIVVIQNGKVKEHGTHQQLLAQKGIYFSMVSVQAGAKRS,"[(397, 397), (405, 406), (409, 409), (421, 422), (431, 435), (471, 471), (551, 552), (581, 583), (596, 601), (617, 617), (621, 621), (1040, 1040), (1048, 1049), (1052, 1052), (1064, 1065), (1074, 1078), (1196, 1197), (1226, 1228), (1241, 1245), (1262, 1262), (1266, 1266)]" 
+C1=CN(C(=O)NC1=O)[C@H]2[C@@H]([C@@H]([C@H](O2)COP(=O)(O)OP(=O)(O)OC3[C@@H]([C@H]([C@H]([C@H](O3)CO)O)O)O)O)O,O=c1ccn([C@@H]2O[C@H](COP(=O)(O)OP(=O)(O)O)[C@@H](O)[C@H]2O)c(=O)[nH]1,MIIDRLLQRSHSHLPILQATFGLERESLRIHQPTQRVAQTPHPKTLGSRNYHPYIQTDYSEPQLELITPIAKDSQEAIRFLKAISDVAGRSINHDEYLWPLSMPPKVREEDIQIAQLEDAFEYDYRKYLEKTYGKLIQSISGIHYNLGLGQELLTSLFELSQADNAIDFQNQLYMKLSQNFLRYRWLLTYLYGASPVAEEDFLDQKLNNPVRSLRNSHLGYVNHKDIRISYTSLKDYVNDLENAVKSGQLIAEKEFYSPVRLRGSKACRNYLEKGITYLEFRTFDLNPFSPIGITQETVDTVHLFLLALLWIDSSSHIDQDIKEANRLNDLIALSHPLEKLPNQAPVSDLVDAMQSVIQHFNLSPYYQDLLESVKRQIQSPELTVAGQLLEMIEGLSLETFGQRQGQIYHDYAWEAPYALKGYETMELSTQLLLFDVIQKGVNFEVLDEQDQFLKLWHNSHIEYVKNGNMTSKDNYIVPLAMANKVVTKKILDEKHFPTPFGDEFTDRKEALNYFSQIQDKPIVVKPKSTNFGLGISIFKTSANLASYEKAIDIAFTEDSAILVEEYIEGTEYRFFVLEGDCIAVLLRVAANVVGDGIHTISQLVKLKNQNPLRGYDHRSPLEVIELGEVEQLMLEQQGYTVNSIPPEGTKIELRRNSNISTGGDSIDVTNTMDPTYKQLAAEMAEAMGAWVCGVDLIIPNATQAYSKDKKNATCIELNFNPLMYMHTYCQEGPGQSITPRILAKLFPEL,"[(482, 482), (499, 508), (511, 515), (575, 576), (584, 588), (630, 630), (655, 657), (659, 659), (662, 662), (669, 669), (673, 673), (692, 695), (697, 700), (702, 702), (714, 716), (718, 718), (720, 723), (727, 727)]" +CC(C)[C@@H](N)C(=O)O,CC(C)[C@H](N)C(=O)O,MGKLDKASKLIDEENKYYARSARINYYNLVIDHAHGATLVDVDGNKYIDLLASASAINVGHTHEKVVKAIADQAQKLIHYTPAYFHHVPGMELSEKLAKIAPGNSPKMVSFGNSGSDANDAIIKFARAYTGRQYIVSYMGSYHGSTYGSQTLSGSSLNMTRKIGPMLPSVVHVPYPDSYRTYPGETEHDVSLRYFNEFKKPFESFLPADETACVLIEPIQGDGGIIKAPEEYMQLVYKFCHEHGILFAIDEVNQGLGRTGKMWAIQQFKDIEPDLMSVGKSLASGMPLSAVIGKKEVMQSLDAPAHLFTTAGNPVCSAASLATLDVIEYEGLVEKSATDGAYAKQRFLEMQQRHPMIGDVRMWGLNGGIELVKDPKTKEPDSDAATKVIYYAFAHGVVIITLAGNILRFQPPLVIPREQLDQALQVLDDAFTAVENGEVTIPKDTGKIGW,"[(54, 55), (79, 79), (81, 83), (113, 114), (117, 120), (140, 141), (143, 146), (149, 149), (153, 155), (159, 159), (215, 218), (221, 222), (248, 249), (254, 255), (258, 258), (262, 265), (276, 281), (289, 289), (307, 308), (310, 311), (366, 366), (408, 408), (410, 410)]" 
+N,N[C@@H](Cc1ccccc1)C(=O)O,MDKLRVAVVGYGNVGRYALEAVQAAPDMELVGVVRRKVLAATPPELTGVRVVTDISQLEGVQGALLCVPTRSVPEYAEAMLRRGIHTVDSYDIHGDLADLRRRLDPVAREHGAAAVISAGWDPGTDSIIRALLEFMAPKGITYTNFGPGMSMGHSVAVKAIPGVRDALSMTIPAGMGVHKRAVYVELEPGADFAEVERAIKTDPYFVRDETRVTQVESVSALMDVGHGVVMERKGVSGATHNQLFRFEMRINNPALTAQVMVAALRAAARQKPGCYTMIEIPVIDYLPGDREAWIRKLV,"[(8, 10), (15, 19), (33, 34), (38, 39), (43, 43), (45, 46), (51, 53), (65, 66), (72, 75), (77, 81), (88, 89), (93, 94), (96, 97), (100, 100), (116, 118), (124, 127), (130, 130), (144, 144), (146, 146), (155, 158), (160, 161), (164, 167), (225, 225), (227, 229), (249, 249), (251, 251), (253, 254), (256, 258), (260, 261), (264, 264), (278, 278)]" +O,O=P([O-])([O-])[O-],MSSLEDIKNETVDLEKIPIEEVFQQLKCSREGLTTQEGEDRIQIFGPNKLEEKKESKLLKFLGFMWNPLSWVMEMAAIMAIALANGDGRPPDWQDFVGIICLLVINSTISFIEENNAGNAAAALMAGLAPKTKVLRDGKWSEQEAAILVPGDIVSIKLGDIIPADARLLEGDPLKVDQSALTGESLPVTKHPGQEVFSGSTCKQGEIEAVVIATGVHTFFGKAAHLVDSTNQVGHFQKVLTAIGNFCICSIAIGMVIEIIVMYPIQRRKYRDGIDNLLVLLIGGIPIAMPTVLSVTMAIGSHRLSQQGAITKRMTAIEEMAGMDVLCSDKTGTLTLNKLSVDKNLVEVFCKGVEKDQVLLFAAMASRVENQDAIDAAMVGMLADPKEARAGIREVHFLPFNPVDKRTALTYIDGSGNWHRVSKGAPEQILELAKASNDLSKKVLSIIDKYAERGLRSLAVARQVVPEKTKESPGAPWEFVGLLPLFDPPRHDSAETIRRALNLGVNVKMITGDQLAIGKETGRRLGMGTNMYPSSALLGTHKDANLASIPVEELIEKADGFAGVFPEHKYEIVKKLQERKHIVGMTGDGVNDAPALKKADIGIAVADATDAARGASDIVLTEPGLSVIISAVLTSRAIFQRMKNYTIYAVSITIRIVFGFMLIALIWEFDFSAFMVLIIAILNDGTIMTISKDRVKPSPTPDSWKLKEIFATGVVLGGYQAIMTVIFFWAAHKTDFFSDTFGVRSIRDNNHELMGAVYLQVSIISQALIFVTRSRSWSFVERPGALLMIAFLIAQLIATLIAVYANWEFAKIRGIGWGWAGVIWLYSIVTYFPLDVFKFAIRYILSGKAWLNLFENKTAFTMKKDYGKEEREAQWALAQRTLHGLQPKEAVNIFPEKGSYRELSEIAEQAKRRAEIARLRELHTLKGHVESVVKLKGLDIETPSHYTV,"[(329, 329), (569, 569), (586, 587), (589, 591), (593, 597), (605, 605), (607, 609), (612, 612)]" 
+[Co+2],[H+],MVKSLQLAHQLKDKKILLIGGGEVGLTRLYKLIPTGCKLTLVSPDLHKSIIPKFGKFIQNEDQPDYREDAKRFINPNWDPTKNEIYEYIRSDFKDEYLDLEDENDAWYIIMTCIPDHPESARIYHLCKERFGKQQLVNVADKPDLCDFYFGANLEIGDRLQILISTNGLSPRFGALVRDEIRNLFTQMGDLALEDAVVKLGELRRGIRLLAPDDKDVKYRMDWARRCTDLFGIQHCHNIDVKRLLDLFKVMFQEQNCSLQFPPRERLLSEYCSS,"[(17, 17), (19, 22), (25, 28), (41, 42), (46, 47), (88, 92), (94, 95), (97, 98), (113, 113), (119, 119), (122, 123), (126, 126), (140, 141)]" +O,CC(C)(COP(=O)(O)OP(=O)(O)OC[C@H]1O[C@@H](n2cnc3c(N)ncnc32)[C@H](O)[C@@H]1OP(=O)(O)O)[C@@H](O)C(=O)NCCC(=O)NCCS,MDIYMSRYEEITQQLIFSPKTWLITGVAGFIGSNLLEKLLKLNQVVIGLDNFSTGHQYNLDEVKTLVSTEQWSRFCFIEGDIRDLTTCEQVMKGVDHVLHQAALGSVPRSIVDPITTNATNITGFLNILHAAKNAQVQSFTYAASSSTYGDHPALPKVEENIGNPLSPYAVTKYVNEIYAQVYARTYGFKTIGLRYFNVFGRRQDPNGAYAAVIPKWTAAMLKGDDVYINGDGETSRDFCYIDNVIQMNILSALAKDSAKDNIYNVAVGDRTTLNELSGYIYDELNLIHHIDKLSIKYREFRSGDVRHSQADVTKAIDLLKYRPNIKIREGLRLSMPWYVRFLKG,"[(21, 21), (23, 25), (32, 35), (47, 49), (56, 60), (74, 74), (78, 80), (83, 87), (98, 100), (104, 107), (113, 119), (121, 125), (128, 128), (139, 140), (165, 168), (170, 172), (174, 178), (193, 193), (196, 198), (200, 202), (207, 207), (238, 240), (242, 243), (246, 246), (332, 332), (335, 336), (339, 339)]" +N[C@@H](CCC(=O)N[C@@H](CS)C(=O)NCC(=O)O)C(=O)O,Cl,MASPPCTTEELSPPPGGSLVEYSGGSLRVPDNPVVAFIRGDGVGPEVVESALKVVDAAVKKVYGGSRRIVWWELLAGHLAREKCGELLPKATLEGIRLARVALKGPLETPVGTGYRSLNVAIRQALDLYANIRPVRYYGQPAPHKYADRVDMVIFRENTEDVYAGIEWPHDSPEAARIRRFLAEEFGISIREDAGIGVKPISRFATRRLMERALEWALRNGNTVVTIMHKGNIMKYTEGAFMRWAYEVALEKFREHVVTEQEVQEKYGGVRPEGKILVNDRIADNMLQQIITRPWDYQVIVAPNLNGDYISDAASALVGGIGMAAGMNMGDGIAVAEPVHGTAPKYAGKDLINPSAEILSASLLIGEFMGWREVKSIVEYAIRKAVQSKKVTQDLARHMPGVQPLRTSEYTETLIAYIDEADLNEVLAGKRG,"[(41, 47), (49, 51), (103, 106), (108, 108), (110, 112), (115, 116), (118, 118), (120, 122), (124, 128), (131, 132), (134, 138), (154, 155), (157, 162), (164, 167), (203, 205), (208, 208), (212, 213), (219, 219), (228, 229), (231, 231), (233, 235), (241, 242), (244, 245), (248, 249), (263, 263), (281, 283), 
(285, 287), (290, 293), (300, 304), (306, 307), (309, 313), (315, 315), (325, 333), (335, 336), (338, 339), (347, 352), (354, 356), (359, 360), (363, 363), (367, 367), (371, 371), (388, 389), (392, 393), (395, 396), (398, 402), (405, 411), (413, 413)]" +Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O)OP(=O)(O)O)[C@@H](O)[C@H]1O,C1=C(N(C=N1)C2[C@@H]([C@@H]([C@H](O2)COP(=O)(O)O)O)O)N,MASQSSVAVISSAAARGESFPDSKKPIGSVRFQQPLRLSFSYCKSGNMSSRICAMAKPNDAETLSSSVDMSLSPRVQSLKPSKTMVITDLAATLVQSGVPVIRLAAGEPDFDTPKVVAEAGINAIREGFTRYTLNAGITELREAICRKLKEENGLSYAPDQILVSNGAKQSLLQAVLAVCSPGDEVIIPAPYWVSYTEQARLADATPVVIPTKISNNFLLDPKDLESKLTEKSRLLILCSPSNPTGSVYPKSLLEEIARIIAKHPRLLVLSDEIYEHIIYAPATHTSFASLPDMYERTLTVNGFSKAFAMTGWRLGYLAGPKHIVAACSKLQGQVSSGASSIAQKAGVAALGLGKAGGETVAEMVKAYRERRDFLVKSLGDIKGVKISEPQGAFYLFIDFSAYYGSEAEGFGLINDSSSLALYFLDKFQVAMVPGDAFGDDSCIRISYATSLDVLQAAVEKIRKALEPLRATVSV,"[(84, 84), (104, 106), (108, 109), (169, 169), (172, 172), (191, 192), (194, 197), (199, 199), (239, 239), (241, 242), (244, 246), (275, 275), (306, 306), (394, 397), (432, 435), (438, 438), (443, 444), (446, 447), (449, 449)]" 
+CC(=CCC/C(=C/CC/C(=C/COP(=O)(O)OP(=O)(O)O)/C)/C)C,O=P([O-])([O-])OP(=O)([O-])[O-],MLEEYRKHVAERAAEGIAPKPLDANQMAALVELLKNPPAGEEEFLLDLLTNRVPPGVDEAAYVKAGFLAAIAKGEAKSPLLTPEKAIELLGTMQGGYNIHPLIDALDDAKLAPIAAKALSHTLLMFDNFYDVEEKAKAGNEYAKQVMQSWADAEWFLNRPALAEKLTVTVFKVTGETNTDDLSPAPDAWSRPDIPLHALAMLKNAREGIEPDQPGVVGPIKQIEALQQKGFPLAYVGDVVGTGSSRKSATNSVLWFMGDDIPHVPNKRGGGLCLGGKIAPIFFNTMEDAGALPIEVDVSNLNMGDVIDVYPYKGEVRNHETGELLATFELKTDVLIDEVRAGGRIPLIIGRGLTTKAREALGLPHSDVFRQAKDVAESDRGFSLAQKMVGRACGVKGIRPGAYCEPKMTSVGSQDTTGPMTRDELKDLACLGFSADLVMQSFCHTAAYPKPVDVNTHHTLPDFIMNRGGVSLRPGDGVIHSWLNRMLLPDTVGTGGDSHTRFPIGISFPAGSGLVAFAAATGVMPLDMPESVLVRFKGKMQPGITLRDLVHAIPLYAIKQGLLTVEKKGKKNIFSGRILEIEGLPDLKVEQAFELTDASAERSAAGCTIKLNKEPIIEYLNSNIVLLKWMIAEGYGDRRTLERRIQGMEKWLANPELLEADADAEYAAVIDIDLADIKEPILCAPNDPDDARPLSAVQGEKIDEVFIGSCMTNIGHFRAAGKLLDAHKGQLPTRLWVAPPTRMDAAQLTEEGYYSVFGKSGARIEIPGCSLCMGNQARVADGATVVSTSTRNFPNRLGTGANVFLASAELAAVAALIGKLPTPEEYQTYVAQVDKTAVDTYRYLNFNQLSQYTEKADGVIFQTAV,"[(443, 444), (478, 478), (480, 481), (499, 499), (593, 593), (597, 597), (706, 706), (708, 709), (711, 712), (740, 740), (767, 768), (770, 771), (773, 774), (789, 790), (796, 797)]" +O,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)O)[C@@H](O)[C@H]1O,MSGKPVLHYFNARGRMECIRWLLAAAGVEFEEKLIQSPEDLEKLKKDGNLMFDQVPMVEIDGMKLAQTRAILNYIATKYDLYGKDMKERALIDMYSEGILDLTEMIIQLVICPPDQREAKTALAKDRTKNRYLPAFEKVLKSHGQDYLVGNRLTRVDIHLLELLLYVEEFDASLLTSFPLLKAFKSRISSLPNVKKFLQPGSQRKPAMDAKQIEEARKVFKF,"[(7, 8), (10, 16), (19, 20), (33, 35), (41, 44), (46, 50), (52, 53), (56, 57), (65, 66), (69, 73)]" 
+Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O)OP(=O)(O)O)[C@@H](O)[C@H]1O,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O)O)[C@@H](O)[C@H]1O,MAKYTREDIEKLVKEENVKYIRLQFTDILGTIKNVEIPVSQLGKALDNKVMFDGSSIEGFVRIEESDMYLYPDLNTFVIFPWTAEKGKVARFICDIYNPDGTPFEGDPRNNLKRILKEMEDLGFSDFNLGPEPEFFLFKLDEKGEPTLELNDKGGYFDLAPTDLGENCRRDIVLELEEMGFEIEASHHEVAPGQHEIDFKYAGAVRSCDDIQTFKLVVKTIARKHGLHATFMPKPLFGVNGSGMHCNLSLFKNGVNAFFDENADLQLSETAKHFIAGIVKHATSFTAVTNPTVNSYKRLVPGYEAPCYVAWSAQNRSPLIRIPASRGISTRVEVRSVDPAANPYLALSVLLAAGLDGIKNKLEAPAPIDRNIYVMSKEERMENGIVDLPATLAEALEEFKSNEVMVKALGEHLFEHFIEAKEIEWDMFRTQVHPWEREQYMSQY,"[(126, 131), (133, 133), (135, 136), (154, 157), (169, 169), (182, 183), (185, 188), (190, 191), (194, 195), (197, 201), (214, 214), (230, 231), (233, 235), (238, 239), (242, 244), (246, 248), (250, 251), (256, 258), (290, 290), (294, 297), (299, 300), (302, 303), (305, 306), (311, 315), (317, 320), (322, 324), (328, 328), (330, 332), (334, 334), (336, 339), (373, 373)]" +N,N[C@@H](Cc1cccc(F)c1)C(=O)O,MENGNGATTNGHVNGNGMDFCMKTEDPLYWGIAAEAMTGSHLDEVKKMVAEYRKPVVKLGGETLTISQVAAISARDGSGVTVELSEAARAGVKASSDWVMDSMNKGTDSYGVTTGFGATSHRRTKQGGALQKELIRFLNAGIFGNGSDNTLPHSATRAAMLVRINTLLQGYSGIRFEILEAITKFLNQNITPCLPLRGTITASGDLVPLSYIAGLLTGRPNSKAVGPTGVILSPEEAFKLAGVEGGFFELQPKEGLALVNGTAVGSGMASMVLFEANILAVLAEVMSAIFAEVMQGKPEFTDHLTHKLKHHPGQIEAAAIMEHILDGSAYVKAAQKLHEMDPLQKPKQDRYALRTSPQWLGPQIEVIRSSTKMIEREINSVNDNPLIDVSRNKAIHGGNFQGTPIGVSMDNTRLAIAAIGKLMFAQFSELVNDFYNNGLPSNLSGGRNPSLDYGFKGAEIAMASYCSELQFLANPVTNHVQSAEQHNQDVNSLGLISSRKTSEAVEILKLMSTTFLVGLCQAIDLRHLEENLKSTVKNTVSSVAKRVLTMGVNGELHPSRFCEKDLLRVVDREYIFAYIDDPCSATYPLMQKLRQTLVEHALKNGDNERNLSTSIFQKIATFEDELKALLPKEVESARAALESGNPAIPNRIEECRSYPLYKFVRKELGTEYLTGEKVTSPGEEFEKVFIAMSKGEIIDPLLECLESWNGAPLPIC,"[(163, 163), (205, 206), (209, 209), (253, 253), (255, 259), (261, 262), (297, 297), (300, 300), (344, 347), (349, 353), (355, 358), (382, 383), (385, 386), (395, 397), (400, 400), (442, 442), (453, 455), (457, 460), (482, 483), (485, 486), (488, 489)]" 
+C(CC(=O)[O-])C(CC(=O)C(=O)[O-])O,O=CCCC(=O)[O-],MENSFKAALKAGRPQIGLWLGLSSSYSAELLAGAGFDWLLIDGEHAPNNVQTVLTQLQAIAPYPSQPVVRPSWNDPVQIKQLLDVGTQTLLVPMVQNADEAREAVRATRYPPAGIRGVGSALARASRWNRIPDYLQKANDQMCVLVQIETREAMKNLPQILDVEGVDGVFIGPADLSADMGYAGNPQHPEVQAAIEQAIVQIRESGKAPGILIANEQLAKRYLELGALFVAVGVDTTLLARAAEALAARFGAQATAVKPGVY,"[(93, 94), (96, 96), (147, 148), (150, 151), (153, 153), (172, 174), (176, 179)]" +O=[N+]([O-])c1ccc(O)cc1,C1=CC(=CC=C1[N+](=O)[O-])O[C@H]2[C@@H]([C@H]([C@@H]([C@H](O2)C(=O)O)O)O)O,MKQSHFFAHLSRLKLINRWPLMRNVRTENVSEHSLQVAMVAHALAAIKNRKFGGNVNAERIALLAMYHDASEVLTGDLPTPVKYFNSQIAQEYKAIEKIAQQKLVDMVPEELRDIFAPLIDEHAYSDEEKSLVKQADALCAYLKCLEELAAGNNEFLLAKTRLEATLEARRSQEMDYFMEIFVPSFHLSLDEISQDSPL,"[(13, 13), (18, 18), (28, 32), (34, 38), (64, 65), (67, 67), (70, 73), (119, 119), (122, 122), (125, 125), (133, 136), (138, 142)]" +C[C@@H](C(=O)N[C@@H](CC(=O)O)C(=O)O)N,C[C@@H](C(=O)N[C@H](CC(=O)[O-])C(=O)[O-])[NH3+],MKIIRIETSRIAVPLTKPFKTALRTVYTAESVIVRITYDSGAVGWGEAPPTLVITGDSMDSIESAIHHVLKPALLGKSLAGYEAILHDIQHLLTGNMSAKAAVEMALYDGWAQMCGLPLYQMLGGYRDTLETDYTVSVNSPEEMAADAENYLKQGFQTLKIKVGKDDIATDIARIQEIRKRVGSAVKLRLDANQGWRPKEAVTAIRKMEDAGLGIELVEQPVHKDDLAGLKKVTDATDTPIMADESVFTPRQAFEVLQTRSADLINIKLMKAGGISGAEKINAMAEACGVECMVGSMIETKLGITAAAHFAASKRNITRFDFDAPLMLKTDVFNGGITYSGSTISMPGKPGLGIIGAALLKGEKEQ,"[(160, 164), (189, 190), (192, 193), (217, 218), (220, 221), (242, 243), (245, 248), (266, 266), (268, 268), (271, 271), (293, 293), (321, 321)]" +Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O)OP(=O)(O)O)[C@@H](O)[C@H]1O,CC(C)(N)CO,MEFSEWYSDILEKAEIYDVRYPIKGCGVYLPYGFKIRRYTFEIIRNLLDESGHDEALFPMLIPEDLLAKEAEHIKGFEDEVYWVTHGGKTQLDVKLALRPTSETPIYYMMKLWVKVHTDLPIKIYQIVNTFRYETKHTRPLIRLREIMTFKEAHTAHSTKEEAENQVKEAISIYKKFFDTLGIPYLISKRPEWDKFPGAEYTMAFDTIFPDGRTMQIATVHNLGQNFSKTFEIIFETPTGDKDYAYQTCYGISDRVIASIIAIHGDEKGLILPPIVAPIQVVIVPLIFKGKEDIVMEKAKEIYEKLKGKFRVHIDDRDIRPGRKFNDWEIKGVPLRIEVGPKDIENKKITLFRRDTMEKFQVDETQLMEVVEKTLNNIMENIKNRAWEKFENFITILEDINPDEIKNILSEKRGVILVPFKEEIYNEELEEKVEATILGETEYKGNKYIAIAKTY,"[(41, 41), (70, 70), (77, 
77), (80, 80), (82, 82), (99, 100), (102, 102), (104, 108), (130, 131), (133, 133), (135, 137), (141, 145), (147, 147), (149, 150), (152, 152), (154, 154), (167, 167), (174, 174), (190, 190), (196, 197), (199, 199), (201, 207), (209, 209), (214, 215), (217, 218), (220, 220), (222, 223), (227, 227), (231, 231), (248, 252), (254, 254), (256, 260), (455, 455)]" +Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O)OP(=O)(O)O)[C@@H](O)[C@H]1O,O=P([O-])([O-])[O-],MVRIIVKNVSKVFKKGKVVALDNVNINIENGERFGILGPSGAGKTTFMRIIAGLDVPSTGELYFDDRLVASNGKLIVPPEDRKIGMVFQTWALYPNLTAFENIAFPLTNMKMSKEEIRKRVEEVAKILDIHHVLNHFPRELSGGQQQRVALARALVKDPSLLLLDEPFSNLDARMRDSARALVKEVQSRLGVTLLVVSHDPADIFAIADRVGVLVKGKLVQVGKPEDLYDNPVSIQVASLIGEINELEGKVTNEGVVIGSLRFPVSVSSDRAIIGIRPEDVKLSKDVIKDDSWILVGKGKVKVIGYQGGLFRITITPLDSEEEIFTYSDHPIHSGEEVLVYVRKDKIKVFEKN,"[(11, 11), (13, 13), (20, 21), (36, 39), (47, 50), (55, 55), (87, 88), (90, 92), (164, 165), (167, 170), (197, 200), (214, 214)]" +C1[C@H](C([C@@H](CC1(C(=O)O)O)O)O)O,O=C1C[C@@](O)(C(=O)O)C[C@@H](O)[C@@H]1O,MAGQHLPVPRLEGVSREQFMQHLYPQRKPLVLEGIDLGPCTSKWTVDYLSQVGGKKEVKIHVAAVAQMDFISKNFVYRTLPFDQLVQRAAEEKHKEFFVSEDEKYYLRSLGEDPRKDVADIRKQFPLLKGDIKFPEFFKEEQFFSSVFRISSPGLQLWTHYDVMDNLLIQVTGKKRVVLFSPRDAQYLYLKGTKSEVLNIDNPDLAKYPLFSKARRYECSLEAGDVLFIPALWFHNVISEEFGVGVNIFWKHLPSECYDKTDTYGNKDPTAASRAAQILDRALKTLAELPEEYRDFYARRMVLHIQDKAYSKNSE,"[(59, 61), (68, 68), (85, 85), (104, 105), (107, 108), (149, 151), (155, 155), (157, 159), (161, 161), (163, 165), (167, 168), (170, 170), (173, 174), (176, 180), (193, 194), (219, 221), (227, 229), (231, 234), (236, 240), (243, 245), (247, 250), (264, 266)]" 
+C[C@]12CC[C@@H]3c4ccc(O)cc4CC[C@H]3[C@@H]1CCC2=O,NC(=O)c1ccc[n+]([C@@H]2O[C@H](COP(=O)(O)OP(=O)(O)OC[C@H]3O[C@@H](n4cnc5c(N)ncnc54)[C@H](OP(=O)(O)O)[C@@H]3O)[C@@H](O)[C@H]2O)c1,MARTVVLITGCSSGIGLHLAVRLASDPSQSFKVYATLRDLKTQGRLWEAARALACPPGSLETLQLDVRDSKSVAAARERVTEGRVDVLVCNAGLGLLGPLEALGEDAVASVLDVNVVGTVRMLQAFLPDMKRRGSGRVLVTGSVGGLMGLPFNDVYCASKFALEGLCESLAVLLLPFGVHLSLIECGPVHTAFMEKVLGSPEEVLDRTDIHTFHRFYQYLAHSKQVFREAAQNPEEVAEVFLTALRAPKPTLRYFTTERFLPLLRMRLDDPSGSNYVTAMHREVFGDVPAKAEAGAEAGGGAGPGAEDEAGRGAVGDPELGDPPAAPQ,"[(1, 9), (39, 42), (44, 45), (48, 49), (52, 55), (57, 65), (67, 68), (72, 72), (78, 81), (88, 88), (90, 91), (112, 113), (116, 117), (120, 121), (140, 140), (144, 148), (155, 159), (161, 164), (185, 185), (188, 188), (190, 190), (233, 234), (236, 238), (240, 241), (244, 245), (266, 266)]" +Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O)OP(=O)(O)O)[C@@H](O)[C@H]1O,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O)O)[C@@H](O)[C@H]1O,MSPRVGVTLSGRYRLQRLIATGGMGQVWEAVDNRLGRRVAVKVLKSEFSSDPEFIERFRAEARTTAMLNHPGIASVHDYGESQMNGEGRTAYLVMELVNGEPLNSVLKRTGRLSLRHALDMLEQTGRALQIAHAAGLVHRDVKPGNILITPTGQVKITDFGIAKAVDAAPVTQTGMVMGTAQYIAPEQALGHDASPASDVYSLGVVGYEAVSGKRPFAGDGALTVAMKHIKEPPPPLPPDLPPNVRELIEITLVKNPAMRYRSGGPFADAVAAVRAGRRPPRPSQTPPPGRAAPAAIPSGTTARVAANSAGRTAASRRSRPATGGHRPPRRTFSSGQRALLWAAGVLGALAIIIAVLLVIKAPGDNSPQQAPTPTVTTTGNPPASNTGGTDASPRLNWTERGETRHSGLQSWVVPPTPHSRASLARYEIAQ,"[(3, 3), (17, 18), (28, 29), (40, 41), (43, 45), (48, 48), (54, 54), (58, 58), (91, 95), (97, 97), (159, 159), (162, 163), (166, 166)]" +O,C[C@H](N)C(=O)O,MDIMNEKVKKIIEFMDKNSIDAVLIAKNPNVYYISGASPLAGGYILITGESATLYVPELEYEMAKEESNIPVEKFKKMDEFYKALEGIKSLGIESSLPYGFIEELKKKANIKEFKKVDDVIRDMRIIKSEKEIKIIEKACEIADKAVMAAIEEITEGKKEREVAAKVEYLMKMNGAEKPAFDTIIASGYRSALPHGVASDKRIERGDLVVIDLGALYQHYNSDITRTIVVGSPNEKQKEIYEIVLEAQKKAVESAKPGITAKELDSIARNIIAEYGYGEYFNHSLGHGVGLEVHEWPRVSQYDETVLREGMVITIEPGIYIPKIGGVRIEDTILITKNGSKRLTKTERELI,"[(140, 140), (181, 184), (210, 211), (213, 214), (221, 222), (224, 226), (248, 248), (281, 281), (283, 286), (288, 
289), (293, 295), (297, 299), (312, 315), (317, 318), (320, 320), (328, 329), (331, 332), (344, 344)]" +Nc1nc2c(ncn2[C@@H]2O[C@H](COP(=O)(O)OP(=O)(O)OP(=O)(O)O)[C@@H](O)[C@H]2O)c(=O)[nH]1,O=P([O-])([O-])OP(=O)([O-])[O-],MRYDVVIAGAGPTGLMLACELRLAGARTLVLERLAEPVDFSKALGVHARTVELLDMRGLGEGFQAEAPKLRGGNFASLGVPLDFSSFDTRHPYALFVPQVRTEELLTGRALELGAELRRGHAVTALEQDADGVTVSVTGPEGPYEVECAYLVGCDGGGSTVRKLLGIDFPGQDPHMFAVIADARFREELPHGEGMGPMRPYGVMRHDLRAWFAAFPLEPDVYRATVAFFDRPYADRRAPVTEEDVRAALTEVAGSDFGMHDVRWLSRLTDTSRQAERYRDGRVLLAGDACHIHLPAGGQGLNLGFQDAVNLGWKLGATIAGTAPPELLDTYEAERRPIAAGVLRNTRAQAVLIDPDPRYEGLRELMIELLHVPETNRYLAGLISALDVRYPMAGEHPLLGRRVPDLPLVTEDGTRQLSTYFHAARGVLLTLGCDQPLADEAAAWKDRVDLVAAEGVADPGSAVDGLTALLVRPDGYICWTAAPETGTDGLTDALRTWFGPPAM,"[(8, 12), (14, 17), (30, 31), (34, 35), (40, 43), (45, 47), (50, 50), (95, 98), (100, 104), (117, 122), (124, 125), (135, 137), (151, 151), (154, 159), (161, 164), (169, 169), (215, 215), (217, 217), (223, 223), (286, 287), (289, 292), (298, 300), (303, 306), (308, 308), (335, 335)]" +O=O,O=Cc1ccccc1,MSFGALRQLLLIACLALPSLAATNLPTADFDYVVVGAGNAGNVVAARLTEDPDVSVLVLEAGVSDENVLGAEAPLLAPGLVPNSIFDWNYTTTAQAGYNGRSIAYPRGRMLGGSSSVHYMVMMRGSTEDFDRYAAVTGDEGWNWDNIQQFVRKNEMVVPPADNHNTSGEFIPAVHGTNGSVSISLPGFPTPLDDRVLATTQEQSEEFFFNPDMGTGHPLGISWSIASVGNGQRSSSSTAYLRPAQSRPNLSVLINAQVTKLVNSGTTNGLPAFRCVEYAEQEGAPTTTVCAKKEVVLSAGSVGTPILLQLSGIGDENDLSSVGIDTIVNNPSVGRNLSDHLLLPAAFFVNSNQTFDNIFRDSSEFNVDLDQWTNTRTGPLTALIANHLAWLRLPSNSSIFQTFPDPAAGPNSAHWETIFSNQWFHPAIPRPDTGSFMSVTNALISPVARGDIKLATSNPFDKPLINPQYLSTEFDIFTMIQAVKSNLRFLSGQAWADFVIRPFDPRLRDPTDDAAIESYIRDNANTIFHPVGTASMSPRGASWGVVDPDLKVKGVDGLRIVDGSILPFAPNAHTQGPIYLVGKQGADLIKADQ,"[(11, 12), (20, 21), (23, 23), (34, 34), (37, 38), (41, 42), (45, 45), (54, 59), (62, 63), (78, 82), (85, 87), (89, 91), (103, 104), (112, 113), (115, 117), (119, 119), (122, 125), (147, 147), (187, 187), (206, 206), (208, 210), (215, 215), (230, 230), (243, 244), (246, 246), (251, 253), (256, 257), (259, 260), (286, 286), (301, 301), (435, 435), (525, 527), (529, 531), 
(548, 548), (552, 553), (557, 562), (564, 566)]" +C([C@H](C(=O)[O-])O)O,[H+],MTWKNFGFEIFGEKYGQEELEKRIKDEHTPPPDSPVFGGLKLKLKKEKFKTLFTLGTTLKGFRRATHTVGTGGIGEITIVNDPKFPEHEFFTAGRTFPARLRHANLKYPDDAGADARSFSIKFADSDSDGPLDIVMNTGEANIFWNSPSLEDFVPVEEGDAAEEYVYKNPYYYYNLVEALRRAPDTFAHLYYYSQVTMPFKAKDGKVRYCRYRALPGDVDIKEEDESGRLTEEEQRKIWIFSRHENEKRPDDYLRKEYVERLQKGPVNYRLQIQIHEASPDDTATIFHAGILWDKETHPWFDLAKVSIKTPLSPDVLEKTAFNIANQPASLGLLEAKSPEDYNSIGELRVAVYTWVQHLRKLKIGSLVPAGQNAIYNVEVETGDREHAGTDATITIRITGAKGRTDYLKLDKWFHNDFEAGSKEQYTVQGFDVGDIQLIELHSDGGGYWSGDPDWFVNRVIIISSTQDRVYSFPCFRWVIKDMVLFPGEATLPFNEVPAIVSEQRQKELEQRKLTYQWDYVSDDMPGNIKAKTHDDLPRDVQFTDEKSRSYQESRKAALVNLGIGSLFTMFENWDSYDDYHILYRNWILGGTPNMADRWHEDRWFGYQFLNGANPVILTRCDALPSNFPVTNEHVNASLDRGKNLDEEIKDGHIYIVDFKVLVGAKSYGGPVLEDIGYKVPDHLKHDEADIRYCAAPLALFYVNKLGHLMPIAIQINQEPGPENPIWTPHEENEHDWMMAKFWLGVAESNFHQLNTHLLRTHLTTESFALSTWRNLASAHPVFKLLQPHIYGVLAIDTIGRKELIGSGGIVDQSLSLGGGGHVTFMEKCFKEVNLQDYHLPNALKKRGVDDPSKLPGFYYRDDGLALWEAIETFIGEIIAIFYKNDDDVKRDNEIQSWIYDVHKNGWRVNPGHQDHGVPASFESREQLKEVLTSLVFTFSCQHAAVNFSQKDHYGFTPNAPAVLRHPPPKKKGEATLQSILSTLPSKSQAAKAIATVYILTKFSEDERYLGNYSATAWEDKDALDAINRFQDKLEDISKKIKQRNENLEVPYIYLLPERIPNGTAI,"[(382, 383), (385, 386), (388, 388), (392, 394), (414, 414), (418, 418), (420, 422), (424, 424), (446, 446), (449, 451), (453, 453), (455, 456), (478, 480), (539, 539), (543, 543), (545, 545), (551, 551), (603, 603), (752, 756), (758, 761), (763, 766), (793, 793), (796, 797), (859, 859), (937, 942), (944, 946), (948, 951), (999, 1002), (1047, 1050), (1063, 1065)]" +C1=NC(=C2C(=N1)N(C=N2)[C@H]3[C@@H]([C@@H]([C@H](O3)COP(=O)(O)OS(=O)(=O)O)OP(=O)(O)O)O)N,C1=CC=C(C=C1)OS(=O)(=O)[O-],MALTSDLGKQIKLKEVEGTLLQPATVDNWSQIQSFEAKPDDLLICTYPKAGTTWIQEIVDMIEQNGDVEKCQRAIIQHRHPFIEWARPPQPSGVEKAKAMPSPRILKTHLSTQLLPPSFWENNCKFLYVARNAKDCMVSYYHFQRMNHMLPDPGTWEEYFETFINGKVVWGSWFDHVKGWWEMKDRHQILFLFYEDIKRDPKHEIRKVMQFMGKKVDETVLDKIVQETSFEKMKENPMTNRSTVSKSILDQSISSFMRKGTVGDWKNHFTVAQNERFDEIYRRKMEGTSINFCMEL,"[(44, 44), (46, 48), (55, 58), (61, 61), (107, 107), (109, 109), (129, 
130), (132, 133), (135, 138), (140, 143), (146, 146), (173, 173), (192, 193), (195, 198), (205, 205), (223, 227), (234, 236), (261, 262), (264, 264), (268, 268), (292, 292)]" +C(=O)(O)[O-],O=P([O-])([O-])[O-],MASSAQDGNNPLFSPYKMGKFNLSHRVVLAPMTRCRALNNIPQAALGEYYEQRATAGGFLITEGTMISPTSAGFPHVPGIFTKEQVREWKKIVDVVHAKGAVIFCQLWHVGRASHEVYQPAGAAPISSTEKPISNRWRILMPDGTHGIYPKPRAIGTYEISQVVEDYRRSALNAIEAGFDGIEIHGAHGYLIDQFLKDGINDRTDEYGGSLANRCKFITQVVQAVVSAIGADRVGVRVSPAIDHLDAMDSNPLSLGLAVVERLNKIQLHSGSKLAYLHVTQPRYVAYGQTEAGRLGSEEEEARLMRTLRNAYQGTFICSGGYTRELGIEAVAQGDADLVSYGRLFISNPDLVMRIKLNAPLNKYNRKTFYTQDPVVGYTDYPFLQGNGSNGPLSRL,"[(29, 30), (34, 35), (49, 50), (61, 63), (65, 66), (74, 74), (77, 78), (104, 105), (107, 108), (183, 188), (235, 236), (238, 239), (278, 280), (283, 284), (319, 320), (322, 323), (326, 326), (340, 341), (344, 347), (364, 364), (366, 366), (369, 370)]" +[H+],NC(=O)c1ccc[n+]([C@@H]2O[C@H](COP(=O)(O)OP(=O)(O)OC[C@H]3O[C@@H](n4cnc5c(N)ncnc54)[C@H](O)[C@@H]3O)[C@@H](O)[C@H]2O)c1,MAPKRSSDLFSQVVNSGPGSFLARQLGVPQPETLRRYRAGEPPLTGSLLIGGAGRVVEPLRAALEKDYDLVGNNLGGRWADSFGGLVFDATGITEPAGLKGLHEFFTPVLRNLGRCGRVVVVGGTPEAAASTNERIAQRALEGFTRSLGKELRRGATTALVYLSPDAKPAATGLESTMRFLLSAKSAYVDGQVFSVGADDSTPPADWEKPLDGKVAIVTGAARGIGATIAEVFARDGAHVVAIDVESAAENLAETASKVGGTALWLDVTADDAVDKISEHLRDHHGGKADILVNNAGITRDKLLANMDDARWDAVLAVNLLAPLRLTEGLVGNGSIGEGGRVIGLSSIAGIAGNRGQTNYATTKAGMIGITQALAPGLAAKGITINAVAPGFIETQMTAAIPLATREVGRRLNSLLQGGQPVDVAEAIAYFASPASNAVTGNVIRVCGQAMIGA,"[(188, 188), (218, 222), (224, 224), (226, 229), (242, 243), (245, 246), (248, 248), (251, 252), (255, 255), (264, 266), (269, 270), (273, 273), (293, 294), (296, 296), (298, 299), (301, 301), (315, 315), (318, 319), (322, 322), (326, 326), (345, 350), (357, 359), (361, 363), (365, 368), (388, 389), (391, 392), (394, 394), (419, 421), (424, 424), (446, 447)]" 
+O,O=P([O-])([O-])[O-],MQKSPLEKASFISKLFFSWTTPILRKGYRHHLELSDIYQAPSADSADHLSEKLEREWDREQASKKNPQLIHALRRCFFWRFLFYGILLYLGEVTKAVQPVLLGRIIASYDPENKVERSIAIYLGIGLCLLFIVRTLLLHPAIFGLHRIGMQMRTAMFSLIYKKTLKLSSRVLDKISIGQLVSLLSNNLNKFDEGLALAHFIWIAPLQVTLLMGLLWDLLQFSAFCGLGLLIILVIFQAILGKMMVKYRDQRAAKINERLVITSEIIDNIYSVKAYCWESAMEKMIENLREVELKMTRKAAYMRFFTSSAFFFSGFFVVFLSVLPYTVINGIVLRKIFTTISFCIVLRMSVTRQFPTAVQIWYDSFGMIRKIQDFLQKQEYKVLEYNLMTTGIIMENVTAFWEEGFGELLEKVQQSNGDRKHSSDENNVSFSHLCLVGNPVLKNINLNIEKGEMLAITGSTGSGKTSLLMLILGELEASEGIIKHSGRVSFCSQFSWIMPGTIKENIIFGVSYDEYRYKSVVKACQLQQDITKFAEQDNTVLGEGGVTLSGGQRARISLARAVYKDADLYLLDSPFGYLDVFTEEQVFESCVCKLMANKTRILVTSKMEHLRKADKILILHQGSSYFYGTFSELQSLRPDFSSKLMGYDTFDQFTEERRSSILTETLRRFSVDDSSAPWSKPKQSFRQTGEVGEKRKNSILNSFSSVRKISIVQKTPLCIDGESDDLQEKRLSLVPDSEQGEAALPRSNMIATGPTFPGRRRQSVLDLMTFTPNSGSSNLQRTRTSIRKISLVPQISLNEVDVYSRRLSQDSTLNITEEINEEDLKECFLDDVIKIPPVTTWNTYLRYFTLHKGLLLVLIWCVLVFLVEVAASLFVLWLLKNNPVNSGNNGTKISNSSYVVIITSTSFYYIFYIYVGVADTLLALSLFRGLPLVHTLITASKILHRKMLHSILHAPMSTISKLKAGGILNRFSKDIAILDDFLPLTIFDFIQLVFIVIGAIIVVSALQPYIFLATVPGLVVFILLRAYFLHTAQQLKQLESEGRSPIFTHLVTSLKGLWTLRAFRRQTYFETLFHKALNLHTANWFMYLATLRWFQMRIDMIFVLFFIVVTFISILTTGEGEGTAGIILTLAMNIMSTLQWAVNSSIDTDSLMRSVSRVFKFIDIQTEESMYTQIIKELPREGSSDVLVIKNEHVKKSDIWPSGGEMVVKDLTVKYMDDGNAVLENISFSISPGQRVGLLGRTGSGKSTLLSAFLRMLNIKGDIEIDGVSWNSVTLQEWRKAFGVITQKVFIFSGTFRQNLDPNGKWKDEEIWKVADEVGLKSVIEQFPGQLNFTLVDGGYVLSHGHKQLMCLARSVLSKAKIILLDEPSAHLDPITYQVIRRVLKQAFAGCTVILCEHRIEAMLDCQRFLVIEESNVWQYDSLQALLSEKSIFQQAISSSEKMRFFQGRHSSKHKPRTQITALKEETEEEVQETRL,"[(399, 400), (402, 403), (405, 406), (409, 409), (412, 413), (430, 430), (433, 434), (438, 441), (456, 457), (466, 469), (490, 492), (494, 495), (572, 573), (577, 577), (603, 604), (618, 623), (659, 659), (662, 663), (666, 666)]" 
+O,CC(C)(CO)[C@H](C(=O)NCCC(=O)O)O,MSTLANLTEVLFRLDFDPDTAVYHYRGQTLSRLQCRTYILSQASQLARLLKPGDRVVLALNDSPSLACLFLACIAVGAIPAVINPKSREQALADIAADCQASLVVREADAPSLSGPLAPLTLRAAAGRPLLDDFSLDALVGPADLDWSAFHRQDPAAACFLQYTSGSTGAPKGVMHSLRNTLGFCRAFATELLALQAGDRLYSIPKMFFGYGMGNSLFFPWFSGASALLDDTWPSPERVLENLVAFRPRVLFGVPAIYASLRPQARELLSSVRLAFSAGSPLPRGEFEFWAAHGLEICDGIGATEVGHVFLANRPGQARADSTGLPLPGYECRLVDREGHTIEEAGRQGVLLVRGPGLSPGYWRASEEQQARFAGGWYRTGDLFERDESGAYRHCGREDDLFKVNGRWVVPTQVEQAICRHLPEVSEAVLVPTCRLHDGLRPTLFVTLATPLDDNQILLAQRIDQHLAEQIPSHMLPSQLHVLPALPRNDNGKLARAELRHLADTLYHDNLPEERAC,"[(55, 55), (57, 57), (70, 70), (79, 83), (91, 91), (94, 95), (98, 99), (158, 160), (173, 176), (184, 184), (208, 210), (213, 214), (218, 218), (304, 305), (307, 307), (362, 365), (369, 369), (372, 372), (378, 378)]" +[C@H](C(=O)O)(N)NC(=O)N,N,MRSLYLIVFIVISLVKASKSDDGFCSAPSIVESDEKTNPIYWKATNPTLSPSHLQDLPGFTRSVYKRDHALITPESHVYSPLPDWTNTLGAYLITPATGSHFVMYLAKMKEMSSSGLPPQDIERLIFVVEGAVTLTNTSSSSKKLTVDSYAYLPPNFHHSLDCVESATLVVFERRYEYLGSHTTELIVGSTDKQPLLETPGEVFELRKLLPMSVAYDFNIHTMDFQPGEFLNVKEVHYNQHGLLLLEGQGIYRLGDNWYPVQAGDVIWMAPFVPQWYAALGKTRSRYLLYKDVNRNPL,"[(94, 94), (102, 102), (233, 234), (236, 236), (238, 240), (242, 243), (252, 254), (267, 274), (276, 277), (289, 291)]" +C([C@@H]1[C@H]([C@@H]([C@@H]([C@H](O1)OP(=O)(O)O)O)O)O)O,C([C@@H]1[C@H]([C@@H]([C@@H]([C@H](O1)O)O)O)O)OP(=O)(O)O,MAVTAQAARRKERVLCLFDVDGTLTPARQKIDPEVAAFLQKLRSRVQIGVVGGSDYCKIAEQLGDGDEVIEKFDYVFAENGTVQYKHGRLLSKQTIQNHLGEELLQDLINFCLSYMALLRLPKKRGTFIEFRNGMLNISPIGRSCTLEERIEFSELDKKEKIREKFVEALKTEFAGKGLRFSRGGMISFDVFPEGWDKRYCLDSLDQDSFDTIHFFGNETSPGGNDFEIFADPRTVGHSVVSPQDTVQRCREIFFPETAHEA,"[(17, 18), (20, 20), (22, 27), (29, 30), (51, 52), (54, 54), (58, 58), (62, 62), (79, 79), (96, 96), (126, 128), (130, 131), (133, 134), (136, 142), (144, 149), (151, 154), (157, 157), (163, 163), (180, 185), (187, 187), (189, 189), (191, 192), (198, 199), (202, 202), (213, 213), (215, 217), (219, 220), (225, 229), (231, 231), (233, 234), (236, 241)]" 
+O=C([O-])C(=O)C[C@]1(C(=O)[O-])C=C[C@@H](O)C=C1,O,MFDKHTHTLIAQRLDQAEKQREQIRAISLDYPEITIEDAYAVQREWVRLKIAEGRTLKGHKIGLTSKAMQASSQISEPDYGALLDDMFFHDGSDIPTDRFIVPRIEVELAFVLAKPLRGPNCTLFDVYNATDYVIPALELIDARCHNIDPETQRPRKVFDTISDNAANAGVILGGRPIKPDELDLRWISALMYRNGVIEETGVAAGVLNHPANGVAWLANKLAPYDVQLEAGQIILGGSFTRPVPARKGDTFHVDYGNMGSISCRFV,"[(61, 62), (83, 83), (104, 105), (107, 107), (109, 110), (135, 138), (140, 141), (168, 168), (170, 171), (236, 241), (243, 243), (256, 256)]" +[H+],O,MSKLLMIGTGPVAIQLANICYLKSDYEIDMVGRASTSEKSKRLYQAYKKEKQFEVKIQNEAHQHLEGKFEINRLYKDVKNVKGEYETVVMACTADAYYDTLQQLSLETLQSVKHVILISPTFGSQMIVEQFMSKFSQDIEVISFSTYLGDTRIVDKEAPNHVLTTGVKKKLYMGSTHSNSTMCQRISALAEQLKIQLEVVESPLHAETRNSSLYVHPPLFMNDFSLKAIFEGTDVPVYVYKLFPEGPITMTLIREMRLMWKEMMAILQAFRVPSVNLLQFMVKENYPVRPETLDEGDIEHFEILPDILQEYLLYVRYTAILIDPFSQPDENGHYFDFSAVPFKQVYKNEQDVVQIPRMPSEDYYRTAMIQHIGKMLGIKTPMIDQFLTRYEASCQAYKDMHQDQQLSSQFNTNLFEGDKALVTKFLEINRTLS,"[(7, 8), (13, 17), (30, 32), (34, 36), (41, 44), (74, 74), (76, 78), (91, 91), (96, 98), (100, 103), (118, 118), (147, 147), (151, 153), (162, 162), (349, 350)]" +C([C@H]([C@H]([C@H]([C@@H](C(=O)CO)O)O)O)O)OP(=O)(O)O,C([C@H]([C@@H]1[C@H]([C@@H]([C@@H](C(O1)O)O)O)O)O)OP(=O)(O)O,MENRELTYITNSIAEAQRVMAAMLADERLLATVRKVADACIASIAQGGKVLLAGNGGSAADAQHIAGEFVSRFAFDRPGLPAVALTTDTSILTAIGNDYGYEKLFSRQVQALGNEGDVLIGYSTSGKSPNILAAFREAKAKGMTCVGFTGNRGGEMRELCDLLLEVPSADTPKIQEGHLVLGHIVCGLVEHSIFGKQ,"[(57, 58), (60, 63), (65, 67), (69, 73), (77, 77), (124, 124), (170, 174), (176, 182), (184, 187)]" 
+Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O)OP(=O)(O)O)[C@@H](O)[C@H]1O,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O)O)[C@@H](O)[C@H]1O,MDSSTATAMTAPFIDPTDHVNLKTDTDASENRRMGNYKPSIWNYDFLQSLATHHNIVEERHLKLAEKLKGQVKFMFGAPMEPLAKLELVDVVQRLGLNHLFETEIKEALFSIYKDGSNGWWFGHLHATSLRFRLLRQCGLFIPQDVFKTFQNKTGEFDMKLCDNVKGLLSLYEASYLGWKGENILDEAKAFTTKCLKSAWENISEKWLAKRVKHALALPLHWRVPRIEARWFIEAYEQEANMNPTLLKLAKLDFNMVQSIHQKEIGELARWWVTTGLDKLAFARNNLLQSYMWSCAIASDPKFKLARETIVEIGSVLTVVDDGYDVYGSIDELDLYTSSVERWSCVEIDKLPNTLKLIFMSMFNKTNEVGLRVQHERGYNSIPTFIKAWVEQCKSYQKEARWFHGGHTPPLEEYSLNGLVSIGFPLLLITGYVAIAENEAALDKVHPLPDLLHYSSLLSRLINDIGTSPDEMARGDNLKSIHCYMNETGASEEVAREHIKGVIEENWKILNQCCFDQSQFQEPFITFNLNSVRGSHFFYEFGDGFGVTDSWTKVDMKSVLIDPIPLGEE,"[(44, 44), (281, 283), (285, 286), (317, 320), (322, 324), (326, 327), (396, 396), (399, 399), (414, 414), (418, 419), (421, 422), (456, 459), (461, 462), (464, 466), (468, 468), (477, 477), (479, 479), (481, 481), (496, 496), (539, 539), (546, 546), (548, 548)]" +CCN(CC)C(=O)/C(C#N)=C/c1cc(O)c(O)c([N+](=O)[O-])c1,O=c1ccn([C@@H]2O[C@H](COP(=O)(O)OP(=O)(O)O)[C@@H](O)[C@H]2O)c(=O)[nH]1,MSVKWTSVILLIQLSFCFSSGNCGKVLVWAAEYSHWMNIKTILDELIQRGHEVTVLASSASILFDPNNSSALKIEIYPTSLTKTELENFIMQQIKRWSDLPKDTFWLYFSQVQEIMSIFGDITRKFCKDVVSNKKFMKKVQESRFDVIFADAIFPCSELLAELFNIPFVYSLSFSPGYTFEKHSGGFIFPPSYVPVVMSELTDQMTFMERVKNMIYVLYFDFWFEIFDMKKWDQFYSEVLGRPTTLSETMGKADVWLIRNSWNFQFPYPLLPNVDFVGGLHCKPAKPLPKEMEDFVQSSGENGVVVFSLGSMVSNMTEERANVIASALAQIPQKVLWRFDGNKPDTLGLNTRLYKWIPQNDLLGHPKTRAFITHGGANGIYEAIYHGIPMVGIPLFADQPDNIAHMKARGAAVRVDFNTMSSTDLLNALKRVINDPSYKENVMKLSRIQHDQPVKPLDRAVFWIEFVMRHKGAKHLRVAAHDLTWFQYHSLDVIGFLLVCVATVIFIVTKCCLFCFWKFARKAKKGKND,"[(308, 310), (359, 359), (363, 363), (371, 372), (380, 383), (391, 397), (399, 403)]" 
+CC(=O)Oc1ccc2ccccc2c1,Oc1ccc2ccccc2c1,MAKLIALTLLGMGLALFRNHQSSYQTRLNALREVQPVELPNCNLVKGIETGSEDLEILPNGLAFISSGLKYPGIKSFNPNSPGKILLMDLNEEDPTVLELGITGSKFDVSSFNPHGISTFTDEDNAMYLLVVNHPDAKSTVELFKFQEEEKSLLHLKTIRHKLLPNLNDIVAVGPEHFYGTNDHYFLDPYLQSWEMYLGLAWSYVVYYSPSEVRVVAEGFDFANGINISPDGKYVYIAELLAHKIHVYEKHANWTLTPLKSLDFNTLVDNISVDPETGDLWVGCHPNGMKIFFYDSENPPASEVLRIQNILTEEPKVTQVYAENGTVLQGSTVASVYKGKLLIGTVFHKALYCEL,"[(51, 52), (55, 56), (64, 66), (69, 69), (85, 85), (115, 116), (118, 119), (129, 132), (166, 167), (170, 171), (179, 183), (222, 223), (225, 226), (238, 238), (240, 240), (267, 268), (271, 272), (281, 285), (331, 333), (345, 346)]" +NC(=O)[C@@H](N)Cc1ccccc1,NC(=O)[C@H](N)Cc1ccccc1,MTKALYDRDGAAIGNLQKLRFFPLAISGGRGARLIEENGRELIDLSGAWGAASLGYGHPAIVAAVSAAAANPAGATILSASNAPAVTLAERLLASFPGEGTHKIWFGHSGSDANEAAYRAIVKATGRSGVIAFAGAYHGCTVGSMAFSGHSVQADAAKADGLILLPYPDPYRPYRNDPTGDAILTLLTEKLAAVPAGSIGAAFIEPIQSDGGLIVPPDGFLRKFADICRAHGILVVCDEVKVGLARSGRLHCFEHEGFVPDILVLGKGLGGGLPLSAVIAPAEILDCASAFAMQTLHGNPISAAAGLAVLETIDRDDLPAMAERKGRLLRDGLSELAKRHPLIGDIRGRGLACGMELVCDRQSREPARAETAKLIYRAYQLGLVVYYVGMNGNVLEFTPPLTITETDIHKALDLLDRAFSELSAVSNEEIAQFAGW,"[(49, 50), (75, 76), (78, 78), (108, 109), (112, 116), (135, 136), (138, 141), (144, 144), (148, 150), (152, 153), (204, 206), (209, 210), (236, 237), (242, 244), (246, 246), (250, 253), (263, 268), (276, 276), (293, 294), (296, 297), (396, 396), (398, 398)]" +CC(=O)NCCCCNCCCN,NCCCCNCCCN,MAIGYVWNTLYGWVDTGTGSLAAANLTARMQPISHHLAHPDTKRRFHELVCASGQIEHLTPIAAVAATDADILRAHSAAHLENMKRVSNLPTGGDTGDGITMMGNGGLEIARLSAGGAVELTRRVATGELSAGYALVNPPGHHAPHNAAMGFCIFNNTSVAAGYARAVLGMERVAILDWDVHHGNGTQDIWWNDPSVLTISLHQHLCFPPDSGYSTERGAGNGHGYNINVPLPPGSGNAAYLHAMDQVVLHALRAYRPQLIIVGSGFDASMLDPLARMMVTADGFRQMARRTIDCAADICDGRIVFVQEGGYSPHYLPFCGLAVIEELTGVRSLPDPYHEFLAGMGGNTLLDAERAAIEEIVPLLADIR,"[(142, 143), (178, 179), (181, 181), (183, 184), (201, 204), (207, 208), (264, 264), (266, 267), (269, 270), (273, 273), (275, 276), (285, 285), (309, 312)]" 
+CC[C@H](/C=C/[C@@H](C)[C@H]1CC[C@@H]2[C@@]1(CC[C@H]3[C@H]2CC=C4[C@@]3(CC[C@@H](C4)O)C)C)C(C)C,OO,MTAQQHLSRRRMLGMAAFGAAALAGGTTIAAPRAAAAAKSAADNGGYVPAVVIGTGYGAAVSALRLGEAGVQTLMLEMGQLWNQPGPDGNIFCGMLNPDKRSSWFKNRTEAPLGSFLWLDVVNRNIDPYAGVLDRVNYDQMSVYVGRGVGGGSLVNGGMAVEPKRSYFEEILPRVDSSEMYDRYFPRANSMLRVNHIDTKWFEDTEWYKFARVSREQAGKAGLGTVFVPNVYDFGYMQREAAGEVPKSALATEVIYGNNHGKQSLDKTYLAAALGTGKVTIQTLHQVKTIRQTKDGGYALTVEQKDTDGKLLATKEISCRYLFLGAGSLGSTELLVRARDTGTLPNLNSEVGAGWGPNGNIMTARANHMWNPTGAHQSSIPALGIDAWDNSDSSVFAEIAPMPAGLETWVSLYLAITKNPQRGTFVYDAATDRAKLNWTRDQNAPAVNAAKALFDRINKANGTIYRYDLFGTQLKAFADDFCYHPLGGCVLGKATDDYGRVAGYKNLYVTDGSLIPGSVGVNPFVTITALAERNVERIIKQDVTAS,"[(15, 16), (51, 51), (55, 56), (59, 61), (75, 76), (78, 82), (85, 85), (123, 123), (125, 125), (130, 130), (147, 151), (153, 155), (160, 161), (191, 198), (214, 220), (222, 222), (226, 226), (230, 231), (234, 234), (253, 253), (285, 286), (288, 289), (293, 295), (297, 298), (321, 321), (325, 325), (342, 342), (351, 351), (359, 359), (365, 366), (375, 375), (379, 380), (432, 433), (464, 464), (470, 474), (481, 482), (484, 485), (487, 488)]" +N[C@@H](Cc1ccc(O)c(O)c1)C(=O)O,NCCc1ccc(O)c(O)c1,MNASEFRRRGKEMVDYMANYMEGIEGRQVYPDVEPGYLRPLIPAAAPQEPDTFEDIINDVEKIIMPGVTHWHSPYFFAYFPTASSYPAMLADMLCGAIGCIGFSWAASPACTELETVMMDWLGKMLELPKAFLNEKAGEGGGVIQGSASEATLVALLAARTKVIHRLQAASPELTQAAIMEKLVAYSSDQAHSSVERAGLIGGVKLKAIPSDGNFAMRASALQEALERDKAAGLIPFFMVATLGTTTCCSFDNLLEVGPICNKEDIWLHVDAAYAGSAFICPEFRHLLNGVEFADSFNFNPHKWLLVNFDCSAMWVKKRTDLTGAFRLDPTYLKHSHQDSGLITDYRHWQIPLGRRFRSLKMWFVFRMYGVKGLQAYIRKHVQLSHEFESLVRQDPRFEICVEVILGLVCFRLKGSNKVNEALLQRINSAKKIHLVPCHLRDKFVLRFAICSRTVESAHVQRAWEHIKELAADVLRAERE,"[(146, 147), (150, 153), (190, 190), (192, 192), (194, 194), (197, 197), (244, 245), (247, 248), (271, 271), (273, 274), (277, 277), (298, 299), (301, 303), (311, 313), (447, 447)]" 
+O,O=C[C@H](O)[C@@H](O)[C@H](O)[C@H](O)CO,MSKLFSTVNSARHSVPLGGMRDYVHIKKLEMNTVLGPDSWNQLMPQKCLLSLDMGTDFSKSAATDDLKYSLNYAVISRDLTNFVSKKKNWGSVSNLAKSVSQFVMDKYSGVECLNLEVQADTTHIRSDHISCIIQQERGNPESQEFDVVRISELKMLTLIGVFTFERLKKQYVTLDIKLPWPKKAELPPPVQSIIDNVVKFVEESNFKTVEALVESVSAVIAHNEYFQKFPDSPLVVKVLKLNAITATEGVGVSCIREPREIAMVNIPYLSSIHESSDIKFQLSSSQNTPIEGKNTWKRAFLAFGSNIGDRFKHIQMALQLLSREKTVKLRNISSIFESEPMYFKDQTPFMNGCVEVETLLTPSELLKLCKKIEYEELQRVKHFDNGPRTIDLDIVMFLNSAGEDIIVNEPDLNIPHPRMLERTFVLEPLCELISPVHLHPVTAEPIVDHLKQLYDKQHDEDTLWKLVPLPYRSGVEPRFLKFKTATKLDEFTGETNRITVSPTYIMAIFNATPDSFSDGGEHFADIESQLNDIIKLCKDALYLHESVIIDVGGCSTRPNSIQASEEEEIRRSIPLIKAIRESTELPQDKVILSIDTYRSNVAKEAIKVGVDIINDISGGLFDSNMFAVIAENPEICYILSHTRGDISTMNRLAHYENFALGDSIQQEFVHNTDIQQLDDLKDKTVLIRNVGQEIGERYIKAIDNGVKRWQILIDPGLGFAKTWKQNLQIIRHIPILKNYSFTMNSNNSQVYVNLRNMPVLLGPSRKKFIGHITKDVDAKQRDFATGAVVASCIGFGSDMVRVHDVKNCSKSIKLADAIYKGLE,"[(509, 510), (512, 513), (519, 519), (553, 553), (555, 557), (559, 561), (567, 567), (594, 595), (597, 598), (601, 603), (610, 614), (616, 619), (651, 652), (675, 676), (687, 687), (708, 710), (713, 714), (716, 719), (721, 722), (729, 729), (755, 759), (762, 766), (768, 771), (774, 774), (800, 801), (805, 807), (809, 810), (820, 820), (826, 826), (829, 830), (833, 833), (838, 838), (840, 842)]" +CC(C)=CCC/C(C)=C/COP(=O)([O-])OP(=O)([O-])[O-],CC1=CC[C@@H]2[C@H](C1)C2(C)C,MSKILVFGHQNPDSDAIGSSVAFAYLAKEAWGLDTEAVALGTPNEETAYVLDYFGVQAPRVVESAKAEGVETVILTDHNEFQQSISDIKDVTVYGVVDHHRVANFETANPLYMRLEPVGSASSIVYRMFKENGVSVPKELAGLLLSGLISDTLLLKSPTTHASDIPVAKELAELAGVNLEEYGLEMLKAGTNLSSKTAAELIDIDAKTFELNGEAVRVAQVNTVDINDILARQEEIEVAIQEAIVTEGYSDFVLMITDIVNSNSEILALGSNMAKVEAAFEFTLENNHAFLAGAVSRKKQVVPQLTESYNA,"[(7, 8), (10, 12), (14, 14), (16, 20), (40, 40), (44, 44), (47, 47), (75, 76), (78, 79), (83, 83), (97, 98), (100, 101), (115, 122), (124, 124), (146, 150), (152, 153), (157, 157), (159, 159), (205, 205), (207, 207), (298, 298)]" 
+O,O=[N+]([O-])c1ccc(O)cc1,MDIMNEKVKKIIEFMDKNSIDAVLIAKNPNVYYISGASPLAGGYILITGESATLYVPELEYEMAKEESNIPVEKFKKMDEFYKALEGIKSLGIESSLPYGFIEELKKKANIKEFKKVDDVIRDMRIIKSEKEIKIIEKACEIADKAVMAAIEEITEGKKEREVAAKVEYLMKMNGAEKPAFDTIIASGYRSALPHGVASDKRIERGDLVVIDLGALYQHYNSDITRTIVVGSPNEKQKEIYEIVLEAQKKAVESAKPGITAKELDSIARNIIAEYGYGEYFNHSLGHGVGLEVHEWPRVSQYDETVLREGMVITIEPGIYIPKIGGVRIEDTILITKNGSKRLTKTERELI,"[(140, 140), (181, 184), (210, 211), (213, 214), (221, 222), (224, 226), (248, 248), (281, 281), (283, 286), (288, 289), (293, 295), (297, 299), (312, 315), (317, 318), (320, 320), (328, 329), (331, 332), (344, 344)]" +NC(=O)CC[C@H](N)C(=O)O,N[C@@H](CCC(=O)[O-])C(=O)[O-],MNFYSAYQHGFVRVAACTHHTTIGDPAANAASVLDMARACHDDGAALAVFPELTLSGYSIEDVLLQDSLLDAVEDALLDLVTESADLLPVLVVGAPLRHRHRIYNTAVVIHRGAVLGVVPKSYLPTYREFYERRQMAPGDGERGTIRIGGADVAFGTDLLFAASDLPGFVLHVEICEDMFVPMPPSAEAALAGATVLANLSGSPITIGRAEDRRLLARSASARCLAAYVYAAAGEGESTTDLAWDGQTMIWENGALLAESERFPKGVRRSVADVDTELLRSERLRMGTFDDNRRHHRELTESFRRIDFALDPPAGDIGLLREVERFPFVPADPQRLQQDCYEAYNIQVSGLEQRLRALDYPKVVIGVSGGLDSTHALIVATHAMDREGRPRSDILAFALPGFATGEHTKNNAIKLARALGVTFSEIDIGDTARLMLHTIGHPYSVGEKVYDVTFENVQAGLRTDYLFRIANQRGGIVLGTGDLSELALGWSTYGVGDQMSHYNVNAGVPKTLIQHLIRWVISAGEFGEKVGEVLQSVLDTEITPELIPTGEEELQSSEAKVGPFALQDFSLFQVLRYGFRPSKIAFLAWHAWNDAERGNWPPGFPKSERPSYSLAEIRHWLQIFVQRFYSFSQFKRSALPNGPKVSHGGALSPRGDWRAPSDMSARIWLDQIDREVPKG,"[(52, 52), (58, 58), (125, 126), (128, 130), (177, 177), (201, 202), (204, 208), (210, 214), (230, 230), (232, 232), (243, 244), (350, 353), (355, 358), (361, 365), (374, 377), (396, 399), (410, 412), (423, 423), (452, 455), (457, 461), (466, 470), (472, 474), (476, 479), (481, 484), (486, 489), (494, 500), (502, 503), (505, 505), (510, 510), (513, 514), (517, 517), (537, 538), (541, 542), (557, 557), (560, 561), (564, 564), (627, 627), (630, 634), (636, 641), (659, 660), (662, 663)]" 
+O=O,O=C[O-],MPQLEASLELDFQSESYKDAYSRINAIVIEGEQEAFDNYNRLAEMLPDQRDELHKLAKMEQRHMKGFMACGKNLSVTPDMGFAQKFFERLHENFKAAAAEGKVVTCLLIQSLIIECFAIAAYNIYIPVADAFARKITEGVVRDEYLHRNFGEEWLKANFDASKAELEEANRQNLPLVWLMLNEVADDARELGMERESLVEDFMIAYGEALENIGFTTREIMRMSAYGLAAV,"[(28, 31), (33, 36), (39, 39), (56, 59), (61, 62), (64, 68), (109, 114), (116, 120), (122, 122), (139, 140), (143, 146), (148, 151)]" +C[C@H](CCC(=O)NCC(=O)O)[C@H]1CC[C@@H]2[C@@]1([C@H](C[C@H]3[C@H]2[C@@H](C[C@H]4[C@@]3(CC[C@H](C4)O)C)O)O)C,NCC(=O)O,MCTGLALETKDGLHLFGRNMDIEYSFNQSIIFIPRNFKCVNKSNKKELTTKYAVLGMGTIFDDYPTFADGMNEKGLGCAGLNFPVYVSYSKEDIEGKTNIPVYNFLLWVLANFSSVEEVKEALKNANIVDIPISENIPNTTLHWMISDITGKSIVVEQTKEKLNVFDNNIGVLTNSPTFDWHVANLNQYVGLRYNQVPEFKLGDQSLTALGQGTGLVGLPGDFTPASRFIRVAFLRDAMIKNDKDSIDLIEFFHILNNVAMVRGSTRTVEEKSDLTQYTSCMCLEKGIYYYNTYENNQINAIDMNKENLDGNEIKTYKYNKTLSINHVN,"[(3, 5), (16, 17), (19, 21), (68, 69), (79, 81), (83, 84), (140, 143), (173, 176), (228, 228), (252, 252), (256, 256), (278, 280)]" +C1CCNC(=O)[C@H](C1)N,C1CCNC(=O)[C@@H](C1)N,MTKALYDRDGAAIGNLQKLRFFPLAISGGRGARLIEENGRELIDLSGAWGAASLGYGHPAIVAAVSAAAANPAGATILSASNAPAVTLAERLLASFPGEGTHKIWFGHSGSDANEAAYRAIVKATGRSGVIAFAGAYHGCTVGSMAFSGHSVQADAAKADGLILLPYPDPYRPYRNDPTGDAILTLLTEKLAAVPAGSIGAAFIEPIQSDGGLIVPPDGFLRKFADICRAHGILVVCDEVKVGLARSGRLHCFEHEGFVPDILVLGKGLGGGLPLSAVIAPAEILDCASAFAMQTLHGNPISAAAGLAVLETIDRDDLPAMAERKGRLLRDGLSELAKRHPLIGDIRGRGLACGMELVCDRQSREPARAETAKLIYRAYQLGLVVYYVGMNGNVLEFTPPLTITETDIHKALDLLDRAFSELSAVSNEEIAQFAGW,"[(49, 50), (75, 76), (78, 78), (108, 109), (112, 116), (135, 136), (138, 141), (144, 144), (148, 150), (152, 153), (204, 206), (209, 210), (236, 237), (242, 244), (246, 246), (250, 253), (263, 268), (276, 276), (293, 294), (296, 297), (396, 396), (398, 398)]" 
+O,C[S+](CCC(N)C(=O)O)CC1OC(n2cnc3c(N)ncnc32)C(O)C1O,IPAAPVAAQARKLLRDLAFRPPLLAARSQVVQLTPRRWLNLQEYQSKKLMSDNGVKVQRFFVADTANEALEAAKRLNAKEIVLKAQILAGGRGKGVFSSGLKGGVHLTKDPEVVGQLAKQMIGYNLATKQTPKEGVKVNKVMVAEALDISRETYLAILMDRSCNGPVLVGSPQGGVDIEEVAASNPELIFKEQIDIIEGIKDSQAQRMAENLGFLGPLQNQAADQIKKLYNLFLKIDATQVEVNPFGETPEGQVVCFDAKINFDDNAEFRQKDIFAMDDKSENEPIENEAAKYDLKYIGLDGNIACFVNGAGLAMATCDIIFLNGGKPANFLDLGGGVKESQVYQAFKLLTADPKVEAILVNIFGGIVNCAIIANGITKACRELELKVPLVVRLEGTNVHEAQNILTNSGLPITSAVDLEDAAKKAVASVTKK,"[(4, 4), (49, 53), (56, 57), (59, 60), (62, 65), (87, 90), (94, 98), (135, 135), (142, 146), (148, 151), (241, 243), (245, 246), (250, 250), (252, 252), (256, 257), (259, 260), (273, 273), (293, 298)]" diff --git a/examples/enzeptional/example_enzeptional.py b/examples/enzeptional/example_enzeptional.py new file mode 100644 index 000000000..0033013cf --- /dev/null +++ b/examples/enzeptional/example_enzeptional.py @@ -0,0 +1,84 @@ +import logging +import pandas as pd +from gt4sd.frameworks.enzeptional.processing import HFandTAPEModelUtility +from gt4sd.frameworks.enzeptional.core import SequenceMutator, EnzymeOptimizer +from gt4sd.configuration import GT4SDConfiguration, sync_algorithm_with_s3 + + +def initialize_environment(): + """Synchronize with GT4SD S3 storage and set up the environment.""" + # NOTE: For those interested in optimizing kcat values, it is important to adjust the scorer path to reflect this focus, thereby selecting the appropriate model for kcat optimization: f"{configuration.gt4sd_local_cache_path}/properties/proteins/enzeptional/scorers/kcat/model.pkl". The specification of the scaler, located within the same directory as the `scorer.pkl`, is mandatory for accurate model performance. 
+ configuration = GT4SDConfiguration.get_instance() + sync_algorithm_with_s3("proteins/enzeptional/scorers", module="properties") + return f"{configuration.gt4sd_local_cache_path}/properties/proteins/enzeptional/scorers/feasibility/model.pkl" + + +def load_experiment_parameters(): + """Load experiment parameters from a CSV file.""" + df = pd.read_csv("data.csv").iloc[1] + return df["substrates"], df["products"], df["sequences"], eval(df["intervals"]) + + +def setup_optimizer( + substrate_smiles, product_smiles, sample_sequence, intervals, scorer_path +): + """Set up and return the optimizer with all necessary components configured.""" + model_tokenizer_paths = "facebook/esm2_t33_650M_UR50D" + chem_paths = "seyonec/ChemBERTa-zinc-base-v1" + + protein_model = HFandTAPEModelUtility( + embedding_model_path=model_tokenizer_paths, tokenizer_path=model_tokenizer_paths + ) + mutation_config = { + "type": "language-modeling", + "embedding_model_path": model_tokenizer_paths, + "tokenizer_path": model_tokenizer_paths, + "unmasking_model_path": model_tokenizer_paths, + } + + mutator = SequenceMutator(sequence=sample_sequence, mutation_config=mutation_config) + optimizer_config = { + "sequence": sample_sequence, + "protein_model": protein_model, + "substrate_smiles": substrate_smiles, + "product_smiles": product_smiles, + "chem_model_path": chem_paths, + "chem_tokenizer_path": chem_paths, + "scorer_filepath": scorer_path, + "mutator": mutator, + "intervals": intervals, + "batch_size": 5, + "top_k": 3, + "selection_ratio": 0.25, + "perform_crossover": True, + "crossover_type": "single_point", + "concat_order": ["substrate", "sequence", "product"], + } + return EnzymeOptimizer(**optimizer_config) + + +def optimize_sequences(optimizer): + """Optimize sequences using the configured optimizer.""" + return optimizer.optimize( + num_iterations=3, num_sequences=5, num_mutations=5, time_budget=3600 + ) + + +def main(): + logging.basicConfig(level=logging.INFO) + scorer_path = 
initialize_environment() + ( + substrate_smiles, + product_smiles, + sample_sequence, + intervals, + ) = load_experiment_parameters() + optimizer = setup_optimizer( + substrate_smiles, product_smiles, sample_sequence, intervals, scorer_path + ) + optimized_sequences, iteration_info = optimize_sequences(optimizer) + logging.info("Optimization completed.") + + +if __name__ == "__main__": + main() diff --git a/requirements.txt b/requirements.txt index a57352cd9..0e64fc293 100644 --- a/requirements.txt +++ b/requirements.txt @@ -32,6 +32,7 @@ scikit-learn>=1.0.0,<1.3.0 scikit-optimize>=0.8.1 scipy>=1.0.0 sentencepiece>=0.1.95 +sentence_transformers>1.0,<=2.2.2 sympy>=1.10.1 tables>=3.7.0 tape-proteins>=0.4 diff --git a/src/gt4sd/frameworks/enzeptional/__init__.py b/src/gt4sd/frameworks/enzeptional/__init__.py index 82bff22c6..48d3007ce 100644 --- a/src/gt4sd/frameworks/enzeptional/__init__.py +++ b/src/gt4sd/frameworks/enzeptional/__init__.py @@ -1,7 +1,7 @@ # # MIT License # -# Copyright (c) 2022 GT4SD team +# Copyright (c) 2024 GT4SD team # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -26,4 +26,4 @@ Module for enzyme optimization. 
""" -from .optimization import EnzymeOptimizer # noqa: F401 +from .core import EnzymeOptimizer # noqa: F401 diff --git a/src/gt4sd/frameworks/enzeptional/core.py b/src/gt4sd/frameworks/enzeptional/core.py new file mode 100644 index 000000000..e37423194 --- /dev/null +++ b/src/gt4sd/frameworks/enzeptional/core.py @@ -0,0 +1,649 @@ +# +# MIT License +# +# Copyright (c) 2024 GT4SD team +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +import pandas as pd +import numpy as np +from abc import ABC, abstractmethod +from typing import Any, Dict, List, Optional, Tuple +import random +import logging +from itertools import product as iter_product +import time +from joblib import load +from .processing import ( + HFandTAPEModelUtility, + SelectionGenerator, + CrossoverGenerator, + sanitize_intervals, + sanitize_intervals_with_padding, +) + + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +class MutationModelManager: + """ + Manages and caches mutation models for efficient reuse. + """ + + _models_cache: Dict[Any, Any] = {} + + @staticmethod + def load_model(embedding_model_path, tokenizer_path, **kwargs): + """ + Loads or retrieves a model from the cache based on the given paths. + + Args: + embedding_model_path (str): Path to the embedding model. + tokenizer_path (str): Path to the tokenizer. + **kwargs: Additional arguments for model loading. + + Returns: + An instance of the loaded model. + """ + model_key = (embedding_model_path, tokenizer_path) + + if model_key in MutationModelManager._models_cache: + return MutationModelManager._models_cache[model_key] + + model = HFandTAPEModelUtility(embedding_model_path, tokenizer_path, **kwargs) + MutationModelManager._models_cache[model_key] = model + return model + + @staticmethod + def clear_cache(): + """ + Clears the cached models. + """ + MutationModelManager._models_cache.clear() + + +class MutationStrategy(ABC): + """ + Abstract base class for defining mutation strategies. + """ + + @abstractmethod + def mutate( + self, sequence: str, num_mutations: int, intervals: List[List[int]] + ) -> List[str]: + """Abstract method for mutating a sequence. + + Args: + sequence (str): The original sequence to be mutated. + num_mutations (int): The number of mutations to apply. + + Returns: + List[str]: The mutated sequence. 
+ """ + pass + + +class LanguageModelMutationStrategy(MutationStrategy): + """ + Mutation strategy using a language model. + """ + + def __init__(self, mutation_model): + """Initializes the mutation strategy with a given model. + + Args: + mutation_model: The model to be used for mutation. + """ + self.mutation_model = mutation_model + self.top_k = 2 + + def set_top_k(self, top_k: int): + """Sets the top k mutations to consider during mutation. + + Args: + top_k (int): The number of top mutations to consider. + """ + self.top_k = top_k + + def mutate( + self, sequence: str, num_mutations: int, intervals: List[List[int]] + ) -> List[str]: + """Mutates a sequence within specified intervals using the model. + + Args: + sequence (str): The original sequence to be mutated. + num_mutations (int): The number of mutations to introduce. + intervals (List[List[int]]): Intervals within the sequence + where mutations are allowed. + + Returns: + List[str]: A list of mutated sequences. + """ + + flat_intervals = [ + i + for interval in intervals + for i in range(interval[0], interval[1] + 1) + if i < len(sequence) + ] + + num_mutations = random.randint(1, num_mutations) + + chosen_positions = random.sample( + flat_intervals, min(num_mutations, len(flat_intervals)) + ) + sequence_list = list(sequence) + + for pos in chosen_positions: + sequence_list[pos] = self.mutation_model.tokenizer.mask_token + + masked_sequence = " ".join(sequence_list) + + return self.mutation_model.unmask(masked_sequence, self.top_k) + + +class TransitionMatrixMutationStrategy(MutationStrategy): + """ + Mutation strategy based on a transition matrix. + """ + + def __init__(self, transition_matrix: str): + """Initializes the mutation strategy with a transition matrix. + + Args: + transition_matrix (str): Path to the CSV file containing + the transition matrix. 
+ """ + logger.info(" USING TRNASITION MATRIX ") + self.transition_matrix = pd.read_csv( + transition_matrix, index_col=None, header=0 + ) + self.top_k = 2 + + def set_top_k(self, top_k: int): + """Sets the top k mutations to consider during mutation. + + Args: + top_k (int): The number of top mutations to consider. + """ + + self.top_k = top_k + + def mutate( + self, sequence: str, num_mutations: int, intervals: List[List[int]] + ) -> List[str]: + """Mutates a sequence based on the transition matrix within + specified intervals. + + Args: + sequence (str): The original sequence to be mutated. + num_mutations (int): The number of mutations to introduce. + intervals (List[List[int]]): Intervals within the sequence + where mutations are allowed. + + Returns: + List[str]: A list of mutated sequences. + """ + + flat_intervals = [ + i + for interval in intervals + for i in range(interval[0], interval[1] + 1) + if i < len(sequence) + ] + + num_mutations = random.randint(1, num_mutations) + + chosen_positions = random.sample( + flat_intervals, min(num_mutations, len(flat_intervals)) + ) + + mutated_sequences = [] + + mutation_options = [] + for pos in chosen_positions: + aa_probabilities = self.transition_matrix.iloc[pos] + top_mutations = aa_probabilities.nlargest(self.top_k).index.tolist() + mutation_options.append([(pos, aa) for aa in top_mutations]) + + for mutation_combination in iter_product(*mutation_options): + temp_sequence = list(sequence) + for pos, new_aa in mutation_combination: + temp_sequence[pos] = new_aa + mutated_sequences.append("".join(temp_sequence)) + + return mutated_sequences + + +class MutationFactory: + """ + Factory class for creating mutation strategies based on configuration. + """ + + @staticmethod + def get_mutation_strategy(mutation_config: Dict[str, Any]): + """Retrieves a mutation strategy based on the provided configuration. 
+ + Args: + mutation_config (Dict[str, Any]): Configuration specifying + the type of mutation strategy and its parameters. + + Raises: + KeyError: If required configuration parameters are missing. + ValueError: If the mutation type is unsupported. + + Returns: + _type_: An instance of the specified mutation strategy + """ + if mutation_config["type"] == "language-modeling": + mutation_model = MutationModelManager.load_model( + embedding_model_path=mutation_config["embedding_model_path"], + tokenizer_path=mutation_config["tokenizer_path"], + unmasking_model_path=mutation_config.get("unmasking_model_path"), + ) + return LanguageModelMutationStrategy(mutation_model) + elif mutation_config["type"] == "transition-matrix": + transition_matrix = mutation_config.get("transition_matrix") + if transition_matrix is None: + raise KeyError( + "Transition matrix not provided in mutation configuration." + ) + return TransitionMatrixMutationStrategy(transition_matrix) + else: + raise ValueError("Unsupported mutation type") + + +class SequenceMutator: + """ + Class for mutating sequences using a specified strategy. + """ + + def __init__(self, sequence: str, mutation_config: Dict[str, Any]): + """Initializes the mutator with a sequence and a mutation strategy. + + Args: + sequence (str): The sequence to be mutated. + mutation_config (Dict[str, Any]): Configuration for + the mutation strategy. + """ + self.sequence = sequence + self.mutation_strategy = MutationFactory.get_mutation_strategy(mutation_config) + self.top_k = 2 + + def set_top_k(self, top_k: int): + """Sets the number of top mutations to consider in the mutation strategy. + + Args: + top_k (int): The number of top mutations to consider. 
+ """ + self.top_k = top_k + if isinstance( + self.mutation_strategy, + (LanguageModelMutationStrategy, TransitionMatrixMutationStrategy), + ): + self.mutation_strategy.set_top_k(top_k) + + def get_mutations( + self, + num_sequences: int, + number_of_mutations: int, + intervals: List[Tuple[int, int]], + current_population: List[str], + already_evaluated_sequences: List[str], + ) -> List[str]: + """Generates a set of mutated sequences. + + Args: + num_sequences (int): Number of mutated sequences to generate. + number_of_mutations (int): Number of mutations to apply to + each sequence. + intervals (List[Tuple[int]]): Intervals within the sequence + where mutations are allowed. + already_evaluated_sequences (List[str]): List of sequences + that have already been evaluated. + + Returns: + List[str]: A list of mutated sequences. + """ + max_mutations = min(len(self.sequence), number_of_mutations) + if len(current_population) < 1: + current_population.append(self.sequence) + + random.shuffle(current_population) + mutated_sequences_set: List[str] = [] + + while len(mutated_sequences_set) < num_sequences: + for temp_sequence in current_population: + new_mutations = self.mutation_strategy.mutate( + temp_sequence, max_mutations, intervals + ) + mutated_sequences_set.extend(new_mutations) + if len(mutated_sequences_set) >= num_sequences: + break + return random.sample(mutated_sequences_set, num_sequences) + + +class EnzymeOptimizer: + """ + Optimizes protein sequences based on interaction with + substrates and products. 
+ """ + + def __init__( + self, + sequence: str, + protein_model: HFandTAPEModelUtility, + substrate_smiles: str, + product_smiles: str, + chem_model_path: str, + chem_tokenizer_path: str, + scorer_filepath: str, + mutator: SequenceMutator, + intervals: List[Tuple[int, int]], + batch_size: int = 2, + seed: int = 123, + top_k: int = 2, + selection_ratio: float = 0.5, + perform_crossover: bool = False, + crossover_type: str = "uniform", + minimum_interval_length: int = 8, + pad_intervals: bool = False, + concat_order=["sequence", "substrate", "product"], + ): + """Initializes the optimizer with models, sequences, and + optimization parameters. + + + Args: + sequence (str): The initial protein sequence. + protein_model (HFandTAPEModelUtility): Model for protein embeddings. + substrate_smiles (str): SMILES representation of the substrate. + product_smiles (str): SMILES representation of the product. + chem_model_path (str): Path to the chemical model. + chem_tokenizer_path (str): Path to the chemical tokenizer. + scorer_filepath (str): Path to the scoring model. + mutator (SequenceMutator): The mutator for generating sequence variants. + intervals (List[Tuple[int, int]]): Intervals for mutation. + batch_size (int, optional): The number of sequences to process in one batch. Defaults to 2. + seed (int, optional): Random seed. Defaults to 123. + top_k (int, optional): Number of top mutations to consider. Defaults to 2. + selection_ratio (float, optional): Ratio of sequences to select after scoring. Defaults to 0.5. + perform_crossover (bool, optional): Flag to perform crossover operation. Defaults to False. + crossover_type (str, optional): Type of crossover operation. Defaults to "uniform". + minimum_interval_length (int, optional): Minimum length of mutation intervals. Defaults to 8. + pad_intervals (bool, optional): Flag to pad the intervals. Defaults to False. + concat_order (list, optional): Order of concatenating embeddings. 
Defaults to ["sequence", "substrate", "product"]. + """ + self.sequence = sequence + self.protein_model = protein_model + self.mutator = mutator + self.intervals = intervals + self.batch_size = batch_size + self.top_k = top_k + self.selection_ratio = selection_ratio + self.perform_crossover = perform_crossover + self.crossover_type = crossover_type + self.concat_order = concat_order + self.minimum_interval_length = minimum_interval_length + self.pad_intervals = pad_intervals + self.mutator.set_top_k(top_k) + self.concat_order = concat_order + self.scorer = load(scorer_filepath) + self.seed = seed + + self.chem_model = HFandTAPEModelUtility(chem_model_path, chem_tokenizer_path) + self.substrate_embedding = self.chem_model.embed([substrate_smiles])[0] + self.product_embedding = self.chem_model.embed([product_smiles])[0] + + self.selection_generator = SelectionGenerator() + self.crossover_generator = CrossoverGenerator() + + if intervals is None: + self.intervals = [(0, len(sequence))] + else: + self.intervals = sanitize_intervals(intervals) + if pad_intervals: + self.intervals = sanitize_intervals_with_padding( + self.intervals, minimum_interval_length, len(sequence) + ) + + random.seed(self.seed) + + def optimize( + self, + num_iterations: int, + num_sequences: int, + num_mutations: int, + time_budget: Optional[int] = 360, + ): + """Runs the optimization process over a specified number + of iterations. + + Args: + num_iterations (int): Number of iterations to run + the optimization. + num_sequences (int): Number of sequences to generate + per iteration. + num_mutations (int): Max number of mutations to apply. + time_budget (Optional[int]): Time budget for + optimizer (in seconds). Defaults to 360. + + Returns: + A tuple containing the list of all sequences and + iteration information. 
+ """ + + iteration_info = {} + + scored_original_sequence = self.score_sequence(self.sequence) + original_sequence_score_ = scored_original_sequence["score"] + + logger.info(f"Original sequence score: {original_sequence_score_}") + + all_mutated_sequences: List[str] = [scored_original_sequence["sequence"]] + current_best_score = original_sequence_score_ + + all_scored_sequences: List[Dict[str, Any]] = [] + + for iteration in range(num_iterations): + start_time = time.time() + + scored_sequences: List[Dict[str, Any]] = [scored_original_sequence] + + if iteration == 0: + current_population: List[str] = [self.sequence] + if len(current_population) < num_sequences: + while len(current_population) < num_sequences: + new_mutants = self.mutator.mutation_strategy.mutate( + self.sequence, num_mutations, self.intervals + ) + for mut in new_mutants: + if mut not in all_mutated_sequences: + current_population.append(mut) + else: + continue + if len(current_population) >= num_sequences: + break + + if len(current_population) >= num_sequences: + random.shuffle(current_population) + current_population = random.sample( + current_population, k=num_sequences + ) + + logger.info( + f"Number of sequences in current population: {len(current_population)}" + ) + + iteration_scored_sequences = [] + for _ in range(0, len(current_population), self.batch_size): + scored_sequences = self.score_sequences( + current_population[_ : _ + self.batch_size] + ) + all_mutated_sequences.extend( + current_population[_ : _ + self.batch_size] + ) + all_scored_sequences.extend(scored_sequences) + iteration_scored_sequences.extend(scored_sequences) + + if self.selection_ratio < 1.0: + + samples_with_higher_score = [ + i + for i in iteration_scored_sequences + if i["score"] > original_sequence_score_ + ] + selected_sequences = self.selection_generator.selection( + samples_with_higher_score, self.selection_ratio + ) + else: + selected_sequences = iteration_scored_sequences + + offspring_sequences = [] + if 
self.perform_crossover and len(selected_sequences) > 1: + for i in range(0, len(selected_sequences), 2): + if i + 1 < len(selected_sequences): + parent1 = selected_sequences[i]["sequence"] + parent2 = selected_sequences[i + 1]["sequence"] + if self.crossover_type == "single_point": + ( + offspring1, + offspring2, + ) = self.crossover_generator.sp_crossover(parent1, parent2) + else: + ( + offspring1, + offspring2, + ) = self.crossover_generator.uniform_crossover( + parent1, parent2 + ) + offspring_sequences.extend([offspring1, offspring2]) + + logger.info(f"Selected samples: {len(selected_sequences)}") + logger.info(f"Number Crossed-Over samples: {len(offspring_sequences)}") + + current_population = [ + seq["sequence"] for seq in selected_sequences + ] + offspring_sequences + + if len(current_population) < num_sequences: + while len(current_population) < num_sequences: + current_population.extend( + self.mutator.mutation_strategy.mutate( + self.sequence, num_mutations, self.intervals + ) + ) + if len(current_population) >= num_sequences: + break + + if len(current_population) >= num_sequences: + random.shuffle(current_population) + current_population = current_population[:num_sequences] + + higher_scoring_sequences = 0 + for temp_seq in iteration_scored_sequences: + if temp_seq["score"] > current_best_score: + current_best_score = temp_seq["score"] + higher_scoring_sequences += 1 + + end_time = time.time() + elapsed_time = end_time - start_time + iteration_info[iteration + 1] = { + "Iteration": iteration + 1, + "best_score": current_best_score, + "higher_scoring_sequences": higher_scoring_sequences, + "elapsed_time": elapsed_time, + } + logger.info( + f" Iteration {iteration + 1}: Best Score: {current_best_score}," + f" Higher Scoring Sequences: {higher_scoring_sequences}, " + f" Time: {elapsed_time} seconds," + f" Population length : {len(current_population)}" + ) + if time_budget is not None and elapsed_time > time_budget: + logger.warning(f"Used all the given 
time budget of {time_budget}s") + break + + all_scored_sequences = sorted( + all_scored_sequences, key=lambda x: x["score"], reverse=True + ) + + df = pd.DataFrame(all_scored_sequences) + df = df.drop_duplicates() + + all_scored_sequences = df.to_dict(orient="records") + + return all_scored_sequences, iteration_info + + def score_sequence(self, sequence: str) -> Dict[str, Any]: + """Scores a single protein sequence. + + Args: + sequence (str): The protein sequence to score. + + Returns: + Dict[str, Any]: The score of the sequence. + """ + sequence_embedding = self.protein_model.embed([sequence])[0] + embeddings = [ + sequence_embedding, + self.substrate_embedding, + self.product_embedding, + ] + ordered_embeddings = [ + embeddings[self.concat_order.index(item)] for item in self.concat_order + ] + combined_embedding = np.concatenate(ordered_embeddings) + combined_embedding = combined_embedding.reshape(1, -1) + + score = self.scorer.predict_proba(combined_embedding)[0][1] + return {"sequence": sequence, "score": score} + + def score_sequences(self, sequences: List[str]) -> List[Dict[str, float]]: + """Scores a list of protein sequences. + + Args: + sequences (List[str]): The list of protein sequences to score. + + Returns: + List[Dict[str, float]]: A list of dictionaries + containing sequences and their scores. 
+ """ + sequence_embeddings = self.protein_model.embed(sequences) + + output = [] + for position in range(len(sequence_embeddings)): + sequence_embedding = sequence_embeddings[position] + embeddings = [ + sequence_embedding, + self.substrate_embedding, + self.product_embedding, + ] + ordered_embeddings = [ + embeddings[self.concat_order.index(item)] for item in self.concat_order + ] + combined_embedding = np.concatenate(ordered_embeddings) + combined_embedding = combined_embedding.reshape(1, -1) + + score = self.scorer.predict_proba(combined_embedding)[0][1] + output.append({"sequence": sequences[position], "score": score}) + + return output diff --git a/src/gt4sd/frameworks/enzeptional/optimization.py b/src/gt4sd/frameworks/enzeptional/optimization.py deleted file mode 100644 index 61594d7c6..000000000 --- a/src/gt4sd/frameworks/enzeptional/optimization.py +++ /dev/null @@ -1,448 +0,0 @@ -# -# MIT License -# -# Copyright (c) 2022 GT4SD team -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# -"""Enzyme optimization.""" - -import json -import logging -import random -import time -from collections import OrderedDict -from typing import Any, Dict, List, MutableMapping, Optional, Sequence, Tuple, Union - -import numpy as np -from joblib import load - -from .processing import ( - HuggingFaceTransformerEmbedding, - StringEmbedding, - TAPEEmbedding, - reconstruct_sequence_with_mutation_range, - sanitize_intervals, -) - -logger = logging.getLogger(__name__) -logger.addHandler(logging.NullHandler()) - -#: transition matrix representation -TransitionMatrix = MutableMapping[str, MutableMapping[str, float]] -#: transition matrix configuration -TransitionConfiguration = MutableMapping[ - str, Union[MutableMapping[str, float], Sequence[str]] -] - -#: supported features -SUPPORTED_FEATURE_SET = set(["substrate", "product", "sequence"]) - -#: IUPAC code mapping -IUPAC_CODES = OrderedDict( - [ - ("Ala", "A"), - ("Asx", "B"), # Aspartate or Asparagine - ("Cys", "C"), - ("Asp", "D"), - ("Glu", "E"), - ("Phe", "F"), - ("Gly", "G"), - ("His", "H"), - ("Ile", "I"), - ("Lys", "K"), - ("Leu", "L"), - ("Met", "M"), - ("Asn", "N"), - ("Pyl", "O"), # Pyrrolysin - ("Pro", "P"), - ("Gln", "Q"), - ("Arg", "R"), - ("Ser", "S"), - ("Thr", "T"), - ("Sec", "U"), # Selenocysteine - ("Val", "V"), - ("Trp", "W"), - ("Xaa", "X"), # Any AA - ("Tyr", "Y"), - ("Glx", "Z"), # Glutamate or Glutamine - ] -) -#: IUPAC character set -IUPAC_CHARACTER_SET = set(IUPAC_CODES.values()) -#: IUPAC uniform mutation mapping, we exclude 'X' from the mapping values because it denotes a generic AA -IUPAC_MUTATION_MAPPING: TransitionConfiguration = { - iupac_character: sorted(list(IUPAC_CHARACTER_SET - {iupac_character, "X"})) - for 
iupac_character in IUPAC_CHARACTER_SET -} - - -class Mutations: - """Mutations definition class.""" - - def __init__(self, transition_configuration: TransitionConfiguration) -> None: - """Generate the mutation given the configuration for the transitions. - - Args: - transition_configuration: transition configuration. - """ - self.transition_matrix = Mutations.transition_configuration_to_matrix( - transition_configuration - ) - - @staticmethod - def transition_configuration_to_matrix( - transition_configuration: TransitionConfiguration, - ) -> TransitionMatrix: - """Transform a configuration into a valid transition matrix. - - Args: - transition_configuration: transition configuration. - - Returns: - a transition matrix. - """ - transition_matrix: TransitionMatrix = dict() - for transition_source, transition_targets in transition_configuration.items(): - if isinstance(transition_targets, dict): - total = float(sum(transition_targets.values())) - transition_matrix[transition_source] = { - transition_target: transtion_element / total - for transition_target, transtion_element in transition_targets.items() - } - else: - transition_matrix[transition_source] = { - transition_target: 1 / len(transition_targets) - for transition_target in transition_targets - } - return transition_matrix - - @staticmethod - def from_json(filepath: str) -> "Mutations": - """Parse the mutation from a JSON containing the transition configuration. - - Returns: - the mutations object. - """ - with open(filepath) as fp: - return Mutations(json.load(fp)) - - def mutate(self, source: str) -> str: - """Mutate a source string. - - Args: - source: source string. - - Returns: - the mutated target. 
- """ - targets, probabilities = zip(*self.transition_matrix[source].items()) - return np.random.choice(targets, size=1, p=probabilities).item() - - -class AASequence: - def __init__( - self, sequence: str, mutations: Mutations = Mutations(IUPAC_MUTATION_MAPPING) - ) -> None: - """Initialize an AA sequence representation. - - Args: - sequence: AA sequence. - mutations: mutations definition. Defaults to uniform sampling of IUPAC AAs. - """ - self.sequence = sequence - self.sequence_length = len(sequence) - self.mutations = mutations - - def mutate(self, maximum_number_of_mutations: int) -> str: - """Mutate the sequence in multiple positions. - - Args: - maximum_number_of_mutations: maximum number of mutations. - - Returns: - the mutated sequence. - """ - if maximum_number_of_mutations > self.sequence_length: - logger.warning( - f"resetting maximum number of mutations ({maximum_number_of_mutations}), since it is higher than sequence length: {self.sequence_length}" - ) - maximum_number_of_mutations = self.sequence_length - if maximum_number_of_mutations < 1: - logger.warning( - f"maximum number of mutations can't be lower than 1 ({maximum_number_of_mutations}), resetting to 1" - ) - maximum_number_of_mutations = 1 - number_of_mutations = random.randint(1, maximum_number_of_mutations) - positions = sorted( - random.sample(range(self.sequence_length), number_of_mutations) - ) - mutated_sequence = "" - start_position = -1 - for position in positions: - mutated_sequence += self.sequence[(start_position + 1) : position] - mutated_sequence += self.mutations.mutate(self.sequence[position]) - start_position = position - mutated_sequence += self.sequence[(start_position + 1) :] - return mutated_sequence - - -class EnzymeOptimizer: - """Optimize an enzyme to catalyze a reaction from substrate to product.""" - - def __init__( - self, - scorer_filepath: str, - substrate: str, - product: str, - sequence: str, - protein_embedding: StringEmbedding = TAPEEmbedding(), - 
molecule_embedding: StringEmbedding = HuggingFaceTransformerEmbedding(), - ordering: List[str] = ["substrate", "product", "sequence"], - ) -> None: - """Initialize the enzyme designer. - - Args: - scorer_filepath: pickled scorer filepath. - substrate: substrate SMILES. - product: product SMILES. - sequence: AA sequence representing the enzyme to optimize. - protein_embedding: protein embedding class. Defaults to TAPE bert-base. - molecule_embedding: molecule embedding class. Defaults to ChemBERTa version 1. - ordering: ordering of the features for the scorer. Defaults to ["substrate", "product", "sequence"]. - - Raises: - ValueError: ordering provided is not feasible. - - Example: - An example optimizing a specific reaction:: - - filepath = f"/path/to/model/scoring_model.pkl" - substrate = "NC1=CC=C(N)C=C1" - product = "CNC1=CC=C(NC(=O)C2=CC=C(C=C2)C(C)=O)C=C1" - sequence = ( - "MSIQIKQSTMVRPAEETPNKSLWLSNIDMILRTPYSHTGAVLIYKQPDNNEDNIHPSSSMYFDANILIEALSKA" - "LVPFYPMAGRLKINGDRYEIDCNAEGALFVEAESSHVLEDFGDFRPNDELHRVMVPTCDYSKGISSFPLLMVQLT" - "RFRCGGVSIGFAQHHHVCDGMAHFEFNNSWARIAKGLLPALEPVHDRYLHLRPRNPPQIKYSHSQFEPFVPSLPN" - "ELLDGKTNKSQTLFILSREQINTLKQKLDLSNNTTRLSTYEVVAAHVWRSVSKARGLSDHEEIKLIMPVDGRSRIN" - "NPSLPKGYCGNVVFLAVCTATVGDLSCNPLTDTAGKVQEALKGLDDDYLRSAIDHTESKPGLPVPYMGSPEKTLYPN" - "VLVNSWGRIPYQAMDFGWGSPTFFGISNIFYDGQCFLIPSRDGDGSMTLAINLFSSHLSRFKKYFYDF" - ) - # instantiate the designer - designer = EnzymeOptimizer( - scorer_filepath=filepath, substrate=substrate, product=product, sequence=sequence - ) - - - # with this sequence length every steps takes ~5s - # optimize between positions 150 and 405 allowing for a maximum of 5 mutations. 
- results = designer.optimize( - number_of_mutations=5, number_of_steps=10, number_of_samples_per_step=8, - intervals=[(150, 405)] - ) - best_score = results[0]["score"] - best_sequence = results[0]["sequence"] - """ - if len(set(ordering).intersection(SUPPORTED_FEATURE_SET)) < 3: - raise ValueError( - f"ordering={ordering} should contain only the three admissible values: {sorted(list(SUPPORTED_FEATURE_SET))}" - ) - else: - self._ordering = ordering - self.scorer_filepath = scorer_filepath - self.scorer = load(scorer_filepath) - self.substrate = substrate - self.product = product - self.protein_embedding = protein_embedding - self.molecule_embedding = molecule_embedding - self.embedded_vectors = { - "substrate": self.molecule_embedding.embed_one(self.substrate), - "product": self.molecule_embedding.embed_one(self.product), - } - self.sequence = sequence - self.sequence_length = len(sequence) - - def score_sequence(self, sequence: str) -> float: - """Score a given sequence. - - Args: - sequence: a sequence to score. - - Returns: - score for the sequence. - """ - embedded_vectors = {"sequence": self.protein_embedding.embed_one(sequence)} - embedded_vectors.update(self.embedded_vectors) - feature_vector = np.concatenate( - [embedded_vectors[feature] for feature in self._ordering], axis=1 - ) - return self.scorer.predict_proba(feature_vector)[0][1] - - def score_sequences(self, sequences: List[str]) -> List[Dict[str, Any]]: - """Score a given sequence list. - - Args: - sequences: a list of sequences to score. - - Returns: - a list of dictionaries of sequences and related scores. 
- """ - number_of_sequences = len(sequences) - embedded_matrices = { - "substrate": np.repeat( - self.embedded_vectors["substrate"], number_of_sequences, axis=0 - ), - "product": np.repeat( - self.embedded_vectors["product"], number_of_sequences, axis=0 - ), - } - embedded_matrices["sequence"] = self.protein_embedding(sequences) - feature_vector = np.concatenate( - [embedded_matrices[feature] for feature in self._ordering], axis=1 - ) - return [ - {"sequence": sequence, "score": score} - for sequence, score in zip( - sequences, self.scorer.predict_proba(feature_vector)[:, 1] - ) - ] - - def optimize( - self, - number_of_mutations: int, - intervals: Optional[List[Tuple[int, int]]] = None, - number_of_steps: int = 10, - number_of_samples_per_step: int = 32, - number_of_sequences: Optional[int] = None, - seed: int = 42, - time_budget: Optional[int] = None, - mutations: Mutations = Mutations(IUPAC_MUTATION_MAPPING), - ) -> List[Dict[str, Any]]: - """Optimize the enzyme given a number of mutations and a range. - - If the range limits are not provided the full sequence is optimized, this might be inefficient. - The sampling is performing by exploring mutations with a slightly smart random sampling. - - Args: - number_of_mutations: number of allowed mutations. - intervals: list of ranges in the sequence, zero-based. Defaults to None, a.k.a. use optimize the full sequence. - number_of_steps: number of optimization steps. Defaults to 100. - number_of_samples_per_step: number of samples sequences per optimization step. Defaults to 32. - number_of_sequences: number of optimal seuqence returned. Defaults to None, a.k.a, returns all. - seed: seed for random number generation. Defaults to 42. - time_budget: maximum allowed runtime in seconds. Defaults to None, a.k.a., no time limit, running for number_of_steps steps. - mutations: mutations definition. Defaults to uniform sampling of IUPAC AAs. - - Raises: - ValueError: in case an invalid range is provided. 
- - Returns: - a list of dictionaries containing a candidate optimal sequence and the related score. Sorted from best to worst. - Note that, when no limit on the returned number of sequences is set, the worst sequence is the original unmutated sequence. - If the optimization fails, only the original sequence is returned. - """ - random.seed(seed) - - # check if interval is None. In case it is, take as interval the whole sequence - if intervals is None: - intervals = [(0, self.sequence_length)] - else: - intervals = sanitize_intervals( - intervals - ) # here we merged and sorted the intervals - - # check that the intervals are in the range of the sequence length - if intervals[-1][1] > self.sequence_length: - raise ValueError( - "check provided intervals, at least an interval is larger than the sequence length" - ) - - # create a sequence from based on the intervals - sequence_from_intervals = "".join( - [self.sequence[start:end] for start, end in intervals] - ) - - # mutate the sequence from intervals - aa_sequence_range = AASequence(sequence_from_intervals, mutations=mutations) - maximum_number_of_mutations = number_of_mutations - - logger.info( - f"maximum number of mutations for the intervals: {maximum_number_of_mutations}" - ) - scored_original_sequence = { - "score": self.score_sequence(self.sequence), - "sequence": self.sequence, - } - original_sequence_score = scored_original_sequence["score"] - logger.info(f"original sequence score: {original_sequence_score}") - results: List[Dict[str, Any]] = [scored_original_sequence] - # slightly smart random sampling - visited_sequences = set() - start_time = time.time() - for step in range(number_of_steps): - logger.info(f"optimization step={step + 1}") - mutated_sequences = [] - - for _ in range(number_of_samples_per_step): - mutated_sequence_range = aa_sequence_range.mutate( - maximum_number_of_mutations=maximum_number_of_mutations - ) - - mutated_sequence = reconstruct_sequence_with_mutation_range( - 
sequence=self.sequence, - mutated_sequence_range=mutated_sequence_range, - intervals=intervals, - ) - - # make sure we do not revisit - if mutated_sequence not in visited_sequences: - visited_sequences.add(mutated_sequence) - mutated_sequences.append(mutated_sequence) - - # add only mutated sequences that are more optimal than the original - results += [ - scored_sequence - for scored_sequence in self.score_sequences(mutated_sequences) - if scored_sequence["score"] > original_sequence_score - ] - logger.info( - f"best score at step={step + 1}: {max([scored_sequence['score'] for scored_sequence in results])}" - ) - elapsed_time = int(time.time() - start_time) - if time_budget is not None: - if elapsed_time > time_budget: - logger.warning( - f"used all the given time budget of {time_budget}s, exting optimization loop" - ) - break - logger.info( - f"optimization completed visiting {len(visited_sequences)} mutated sequences" - ) - sorted_results = sorted( - results, key=lambda result: result["score"], reverse=True - )[:number_of_sequences] - if len(sorted_results) < 2: - logger.error( - "optimization failed, could not find a mutated sequence more optimal than the original" - ) - else: - logger.info( - f"found {len(sorted_results) - 1} optimal mutated sequences, best score: {sorted_results[0]['score']}" - ) - return sorted_results diff --git a/src/gt4sd/frameworks/enzeptional/processing.py b/src/gt4sd/frameworks/enzeptional/processing.py index c6c238019..600d26cc0 100644 --- a/src/gt4sd/frameworks/enzeptional/processing.py +++ b/src/gt4sd/frameworks/enzeptional/processing.py @@ -1,7 +1,7 @@ # # MIT License # -# Copyright (c) 2022 GT4SD team +# Copyright (c) 2024 GT4SD team # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -21,237 +21,566 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
# -"""enzeptional - data processing utilities.""" - from abc import ABC -from typing import Generic, List, Optional, Tuple, TypeVar, Union - -import numpy as np import torch +import numpy as np +from typing import Any, Dict, List, Optional, Tuple, Union from tape.datasets import pad_sequences from tape.registry import registry from tape.tokenizers import TAPETokenizer -from transformers import AutoModelWithLMHead, AutoTokenizer +from transformers import ( + AutoModel, + EsmForMaskedLM, + AutoTokenizer, + T5Tokenizer, +) +import math +import random +import logging +from itertools import product as iter_product +from gt4sd.frameworks.torch import get_device + -from ..torch import device_claim +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) -T = TypeVar("T") # used for sample embedding +# os.environ["TRANSFORMERS_CACHE"] = "~/.cache/huggingface/" +# torch.hub.set_dir("/dccstor/yna/.cache/torch/hub") -class Embedding(ABC, Generic[T]): - """Abstract embedding class.""" +class ModelCache: + """ + A simple cache mechanism for storing and retrieving models. + """ + + def __init__(self): + """ + Initializes the cache as an empty dictionary. + """ + self.cache = {} - def embed_one(self, sample: T) -> np.ndarray: - """Embed one sample. + def get(self, key): + """ + Retrieves a model from the cache using the given key. Args: - sample: sample representation. + key: The key used to store the model. Returns: - embedding vector for the sample. + The model associated with the key, or None if not found. """ - return self.__call__([sample]) + return self.cache.get(key) - def __call__(self, samples: List[T]) -> np.ndarray: - """Embed multiple samples sample. + def add(self, key, model): + """ + Adds a model to the cache with the specified key. Args: - samples: a list of sample representations. - - Returns: - embedding vectors for the samples. + key: The key to associate with the model. + model: The model to be cached. 
""" - raise NotImplementedError + self.cache[key] = model + +ENZEPTIONAL_MODEL_CACHE = ModelCache() -StringEmbedding = Embedding[str] +class StringEmbedding(ABC): + """ + Abstract base class for embedding string data. -class TAPEEmbedding(StringEmbedding): - """Embed AA sequence using TAPE.""" + Attributes: + model (Any): The embedding model. + """ + + model: Any + + def embed(self, samples: List[str]) -> np.ndarray: + """Abstract method for embedding a list of string samples. + + Args: + samples (List[str]): The list of strings to be embedded. + + Raises: + NotImplementedError: If the method is not implemented in the subclass. + """ + raise NotImplementedError + + +class HFandTAPEModelUtility(StringEmbedding): + """ + Utility class for handling both Hugging Face and TAPE models for embedding + and unmasking tasks. + """ def __init__( self, - model_type: str = "transformer", - model_dir: str = "bert-base", - aa_vocabulary: str = "iupac", + embedding_model_path: str, + tokenizer_path: str, + unmasking_model_path: Optional[str] = None, + is_tape_model: bool = False, device: Optional[Union[torch.device, str]] = None, + cache_dir: Optional[str] = None, ) -> None: - """Initialize the TAPE embedding class. + """Initializes the utility with specified model and tokenizer paths. Args: - model_type: TAPE model type. Defaults to "transformer". - model_dir: model directory. Defaults to "bert-base". - aa_vocabulary: type of vocabulary. Defaults to "iupac". - device: device where the inference - is running either as a dedicated class or a string. If not provided is inferred. + embedding_model_path (str): Path to the embedding model. + tokenizer_path (str): Path to the tokenizer. + unmasking_model_path (Optional[str], optional): Path to the unmasking model, if applicable. Defaults to None. + is_tape_model (bool, optional): Flag to indicate if a TAPE model is being used. Defaults to False. 
+ device (Optional[Union[torch.device, str]], optional): The compute device to use ('cpu' or 'cuda:0'). Defaults to None. + cache_dir (Optional[str], optional): Path to cache directory. Defaults to None. """ - # get device - self.device = device_claim(device) - # task and model definition - self.task_specification = registry.get_task_spec("embed") - self.model = registry.get_task_model( - model_type, self.task_specification.name, load_dir=model_dir - ) - self.model = self.model.to(self.device) - self.model.eval() - self.tokenizer = TAPETokenizer(vocab=aa_vocabulary) + self.device = get_device() + self.is_tape_model = is_tape_model + + embedding_cache_key = f"embedding_{embedding_model_path}" + self.embedding_model = ENZEPTIONAL_MODEL_CACHE.get(embedding_cache_key) + if not self.embedding_model: + if is_tape_model: + self.embedding_model = registry.get_task_model( + embedding_model_path, + "embed", + load_dir=embedding_model_path, + ).to(self.device) + else: + if cache_dir: + self.embedding_model = ( + AutoModel.from_pretrained( + embedding_model_path, + cache_dir=cache_dir, + ) + .to(self.device) + .eval() + ) + else: + self.embedding_model = ( + AutoModel.from_pretrained( + embedding_model_path, + ) + .to(self.device) + .eval() + ) + + ENZEPTIONAL_MODEL_CACHE.add(embedding_cache_key, self.embedding_model) + + if unmasking_model_path is not None: + unmasking_cache_key = f"unmasking_{unmasking_model_path}" + self.unmasking_model = ENZEPTIONAL_MODEL_CACHE.get(unmasking_cache_key) + if not self.unmasking_model: + if cache_dir: + self.unmasking_model = ( + EsmForMaskedLM.from_pretrained( + unmasking_model_path, + cache_dir=cache_dir, + ) + .to(self.device) + .eval() + ) + else: + self.unmasking_model = ( + EsmForMaskedLM.from_pretrained( + unmasking_model_path, + ) + .to(self.device) + .eval() + ) + ENZEPTIONAL_MODEL_CACHE.add(unmasking_cache_key, self.unmasking_model) + else: + logger.error("No Unmasking model loaded. 
Check you model inputs") - def _encode_and_mask(self, sequence: str) -> Tuple[np.ndarray, np.ndarray]: - """Encode and mask a sequence. + if is_tape_model: + self.tokenizer = TAPETokenizer(vocab="iupac") + else: + self.tokenizer = self._load_tokenizer(tokenizer_path) + + def _load_tokenizer(self, tokenizer_path: str): + """Loads a tokenizer based on the given path, caching it for future use. Args: - sequence: AA sequence. + tokenizer_path (str): Path to the tokenizer. Returns: - a tuple containing the token ids and the mask. + The loaded tokenizer """ - token_ids = self.tokenizer.encode(sequence) - return token_ids, np.ones_like(token_ids) + tokenizer_cache_key = f"tokenizer_{tokenizer_path}" + tokenizer = ENZEPTIONAL_MODEL_CACHE.get(tokenizer_cache_key) + if not tokenizer: + try: + tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) + except Exception: + tokenizer = T5Tokenizer.from_pretrained(tokenizer_path) + ENZEPTIONAL_MODEL_CACHE.add(tokenizer_cache_key, tokenizer) + return tokenizer + + def embed(self, samples: List[str]) -> np.ndarray: + """Embeds a list of samples using either TAPE or Hugging Face models. - def __call__(self, samples: List[str]) -> np.ndarray: - """Embed multiple protein sequences using TAPE. + Args: + samples (List[str]): List of strings to be embedded. + + Returns: + np.ndarray: The resulting embeddings. + """ + if self.is_tape_model: + return self._embed_tape(samples) + else: + return self._embed_huggingface(samples) + + def _embed_tape(self, samples: List[str]) -> np.ndarray: + """mbeds samples using a TAPE model. Args: - samples: a list of protein sequences. + samples (List[str]): List of strings to be embedded. Returns: - a numpy array containing the embedding vectors. + np.ndarray: The resulting embeddings. 
""" - # prepare input - token_ids, masks = zip( - *[self._encode_and_mask(sequence) for sequence in samples] - ) - input_data = { - "input_ids": torch.from_numpy(pad_sequences(token_ids)).to(self.device), - "input_mask": torch.from_numpy(pad_sequences(masks)).to(self.device), - } - sequence_lenghts = input_data["input_mask"].sum(1) - sequence_embeddings = self.model(**input_data)[0].cpu().detach().numpy() - # get average embedding + token_ids: Dict[str, Any] = {"ids": [], "mask": []} + for sequence in samples: + encoded_sequence = self.tokenizer.encode(sequence) + token_ids["ids"].append(encoded_sequence) + token_ids["mask"].append(np.ones_like(encoded_sequence)) + + input_ids = torch.from_numpy(pad_sequences(token_ids["ids"])).to(self.device) + input_mask = torch.from_numpy(pad_sequences(token_ids["mask"])).to(self.device) + + inputs = {"input_ids": input_ids, "input_mask": input_mask} + + with torch.no_grad(): + sequence_embeddings = ( + self.embedding_model(**inputs)[0].cpu().detach().numpy() + ) + + sequence_lengths = input_mask.sum(1) + return np.array( [ sequence_embedding[:sequence_length].mean(0) - for sequence_embedding, sequence_length in zip( # type:ignore - sequence_embeddings, sequence_lenghts + for sequence_embedding, sequence_length in zip( + sequence_embeddings, sequence_lengths ) ] ) + def _embed_huggingface(self, samples: List[str]) -> np.ndarray: + """Embeds samples using a Hugging Face model. -class HuggingFaceTransformerEmbedding(StringEmbedding): - """Embed a string representation of a molecule using an HF transformers model.""" + Args: + samples (List[str]): List of strings to be embedded. - def __init__( - self, - model_name: str = "seyonec/ChemBERTa-zinc-base-v1", - tokenizer_name: str = "seyonec/ChemBERTa-zinc-base-v1", - device: Optional[Union[torch.device, str]] = None, - ) -> None: - """Initialize the HF transformers embedding class. + Returns: + np.ndarray: The resulting embeddings. 
+ """ + inputs = self.tokenizer( + samples, + add_special_tokens=True, + padding=True, + return_tensors="pt", + ) + inputs = {k: v.to(self.device) for k, v in inputs.items()} + + with torch.no_grad(): + outputs = self.embedding_model(**inputs) + sequence_embeddings = outputs[0].cpu().detach().numpy() + + sequence_lengths = inputs["attention_mask"].sum(1) + + return np.array( + [ + sequence_embedding[:sequence_length].mean(0) + for sequence_embedding, sequence_length in zip( + sequence_embeddings, sequence_lengths + ) + ] + ) + + def unmask(self, sequence: str, top_k: int = 2) -> List[str]: + """Unmasks a given sequence using the model, retrieving top-k predictions. Args: - model_name: model name. Defaults to "seyonec/ChemBERTa-zinc-base-v1". - tokenizer_name: tokenizer name. Defaults to "seyonec/ChemBERTa-zinc-base-v1". - device: device where the inference - is running either as a dedicated class or a string. If not provided is inferred. + sequence (str): The sequence with masked tokens. + top_k (int, optional): Number of top predictions to retrieve. Defaults to 2. + + Raises: + NotImplementedError: If TAPE model is used. + KeyError: If the model used is not supported. + + Returns: + List[str]: List of top-k predicted sequences. 
""" - # get device - self.device = device_claim(device) - # tokenizer and model definition - self.tokenizer = AutoTokenizer.from_pretrained(model_name) - self.model = AutoModelWithLMHead.from_pretrained(tokenizer_name) - self.model = self.model.to(self.device) - self.model.eval() + if self.is_tape_model: + logger.error("Unmasking is not supported for TAPE models.") + raise NotImplementedError("Unmasking is not supported for TAPE models.") + + try: + return self._unmask_with_model(sequence, top_k) + except (KeyError, NotImplementedError) as e: + logger.warning(f"{e} Standard unmasking failed ") + raise KeyError("Check the unmasking model you want to use") - def __call__(self, samples: List[str]) -> np.ndarray: - """Embed multiple protein sequences using TAPE. + def _unmask_with_model(self, sequence: str, top_k: int) -> List[str]: + """Unmasks a sequence using the model, providing top-k predictions. Args: - samples: a list of strings representing molecules. + sequence (str): The sequence with masked tokens. + top_k (int): Number of top predictions to retrieve. + + Raises: + KeyError: If model used do not support unmasking. Returns: - a numpy array containing the embedding vectors. + List[str]: List of top-k predicted sequences. """ - # get the CLS token representation from each SMILES. 
- return ( - self.model( - **{ - key: tensor.to(self.device) - for key, tensor in self.tokenizer( - samples, return_tensors="pt", padding=True - ).items() - } - )[0][:, 0, :] - .detach() - .numpy() - ) + inputs = self.tokenizer( + sequence, + return_tensors="pt", + add_special_tokens=True, + padding=True, + ).to(self.device) + mask_token_index = torch.where( + inputs["input_ids"] == self.tokenizer.mask_token_id + )[1] + + with torch.no_grad(): + outputs = self.unmasking_model(inputs["input_ids"].to(self.device)) + + if "logits" in outputs: + logits = outputs.logits + else: + raise KeyError("Logits not available in the model's output.") + + mask_token_logits = logits[0, mask_token_index, :] + + top_tokens: List[Any] = [] + for i in range(len(mask_token_index)): + top_n_tokens = ( + torch.topk(mask_token_logits, top_k, dim=1).indices[i].tolist() + ) + top_tokens.append( + [self.tokenizer.decode([token]) for token in top_n_tokens] + ) + + mask_token_index = mask_token_index.cpu().numpy() + mutated_sequences = [] + tmp_top_tokens = [tuple(tokens) for tokens in top_tokens] + if len(set(tmp_top_tokens)) == 1: + for i in range(top_k): + temp_sequence = sequence.split(" ") + for mask_index in mask_token_index: + temp_sequence[mask_index - 1] = tmp_top_tokens[0][i] + mutated_sequences.append("".join(temp_sequence)) + else: + for combination in list(iter_product(*tmp_top_tokens)): + temp_sequence = sequence.split(" ") + for i, mask_index in enumerate(mask_token_index): + temp_sequence[mask_index - 1] = combination[i] + mutated_sequences.append("".join(temp_sequence)) + + return mutated_sequences def mutate_sequence_with_variant(sequence: str, variant: str) -> str: - """Given an AA sequence and a variant returns the mutated AA sequence. + """Applies a specified variant mutation to an amino acid sequence. Args: - sequence: an AA sequence. - variant: a variant annotation. + sequence (str): The original amino acid sequence. 
+ variant (str): The variant to apply, formatted as a string. Returns: - the mutated sequence. + str: The mutated amino acid sequence. """ - edits = [ - (int(variant_string[1:-1]), variant[0], variant_string[-1]) - for variant_string in map(str.strip, variant.split("/")) - ] mutated_sequence = list(sequence) - for index, _, aa_to in edits: - mutated_sequence[index] = aa_to + for variant_string in variant.split("/"): + index = int(variant_string[1:-1]) - 1 + mutated_sequence[index] = variant_string[-1] return "".join(mutated_sequence) def sanitize_intervals(intervals: List[Tuple[int, int]]) -> List[Tuple[int, int]]: - """Sanitize intervals merging overlapping ones and sorting them. + """Merges overlapping intervals into a single interval. Args: - intervals: intervals to sanitize. + intervals (List[Tuple[int, int]]): A list of + start and end points of intervals. Returns: - sorted and non overlapping intervals. + List[Tuple[int, int]]: A list of merged intervals. """ - sorted_intervals = sorted(intervals, key=lambda interval: interval[0]) - merged_intervals = [sorted_intervals[0]] - for current in sorted_intervals: - previous_end = merged_intervals[-1][1] - if current[0] <= previous_end: - previous_end = max(previous_end, current[1]) + intervals.sort() + merged: List[Tuple[int, int]] = [] + for start, end in intervals: + if not merged or merged[-1][1] < start: + merged.append((start, end)) else: - merged_intervals.append(current) - return merged_intervals + merged[-1] = (merged[-1][0], max(merged[-1][1], end)) + return merged + + +def round_up(number: float) -> int: + """Rounds up a floating-point number to the nearest integer. + + Args: + number (float): The number to round up. + + Returns: + int: The rounded-up integer. + """ + return math.ceil(number) + + +def sanitize_intervals_with_padding( + intervals: List[Tuple[int, int]], pad_value: int, max_value: int +) -> List[Tuple[int, int]]: + """Pads and sanitizes intervals within a given range. 
+ + Args: + intervals (List[Tuple[int, int]]): A list of intervals. + pad_value (int): The value to pad intervals with. + max_value (int): The maximum value for the range of intervals. + + Returns: + List[Tuple[int, int]]: A list of padded and sanitized intervals. + """ + + def pad_interval( + interval: Tuple[int, int], pad: int, max_val: int + ) -> Tuple[int, int]: + """Pads an individual interval within the constraints of a maximum value. + + Args: + interval (Tuple[int, int]): The interval to pad. + pad (int): The padding value. + max_val (int): The maximum value for the interval. + + Returns: + Tuple[int, int]: The padded interval. + """ + start, end = interval + interval_length = end - start + padding_needed = max(0, pad - interval_length) // 2 + + padded_start = max(0, start - padding_needed) + padded_end = min(max_val, end + padding_needed) + + if padded_end > max_val: + padded_start = max(0, padded_start - (padded_end - max_val)) + return padded_start, padded_end + + padded_intervals = [ + pad_interval(interval, pad_value, max_value) for interval in intervals + ] + return sanitize_intervals(padded_intervals) def reconstruct_sequence_with_mutation_range( - sequence: str, mutated_sequence_range: str, intervals: List[Tuple[int, int]] -): - """Reconstruct a sequence replacing in given positions sub-sequences from a mutated range. + sequence: str, + mutated_sequence_range: str, + intervals: List[Tuple[int, int]], +) -> str: + """Reconstructs a sequence by inserting a mutated sequence + range at specific intervals. Args: - sequence: original sequence. - mutated_sequence_range: mutated sequence range. - intervals: sorted and non overlapping intervals. + sequence (str): The original sequence. + mutated_sequence_range (str): The range of the sequence to be mutated. + intervals (List[Tuple[int, int]]): The intervals where + mutations are applied. Returns: - reconstructed sequence. + str: The reconstructed sequence with mutations. 
""" - # create the mutated sequence, considering sorted intervals - mutated_range_offset = 0 # offset with respect to the mutated_sequence_range - mutated_sequence_offset = 0 # offset with respect to the full mutated sequence. - mutated_sequence = "" + mutated_sequence = list(sequence) + range_index = 0 for start, end in intervals: - mutated_sequence += sequence[mutated_sequence_offset:start] - chunk_length = end - start + 1 - mutated_sequence += mutated_sequence_range[ - mutated_range_offset : mutated_range_offset + chunk_length + size_fragment = end - start + mutated_sequence[start:end] = list( + mutated_sequence_range[range_index : range_index + size_fragment] + ) + range_index += size_fragment + return "".join(mutated_sequence) + + +class SelectionGenerator: + """ + A generator for selecting top sequences based on their scores. + """ + + def selection( + self, + pool_of_sequences: List[Dict[str, Any]], + k: float = 0.8, + ) -> List[Any]: + """Selects a subset of sequences from a pool based on their scores. + + Args: + pool_of_sequences (List[Dict[str, Any]]): A list of + dictionaries, each containing a sequence and its score. + k (float): A fraction representing the proportion + of top sequences to select. Defaults to 0.8. + + Returns: + List[Any]: A list of the top k sequences based on scores. + """ + n_samples_to_select = int(len(pool_of_sequences) * k) + return list(sorted(pool_of_sequences, key=lambda d: d["score"], reverse=True))[ + :n_samples_to_select ] - mutated_range_offset += chunk_length - mutated_sequence_offset = end + 1 - mutated_sequence += sequence[end + 1 :] - return mutated_sequence + + +class CrossoverGenerator: + """ + A generator for performing crossover operations between sequences. + """ + + def __init__(self, threshold_probability: float = 0.5) -> None: + """Initializes the CrossoverGenerator with a specified + threshold probability. 
+ + Args: + threshold_probability (float, optional): The probability + threshold used in uniform crossover. Defaults to 0.5. + """ + self.threshold_probability = threshold_probability + + def sp_crossover(self, a_sequence: str, another_sequence: str) -> Tuple[str, str]: + """Performs a single point crossover between two sequences. + + Args: + a_sequence (str): The first sequence for crossover. + another_sequence (str): The second sequence for crossover. + + Returns: + Tuple[str, str]: A tuple of two new sequences resulting + from the crossover. + """ + random_point = random.randint(1, len(a_sequence) - 2) + return ( + a_sequence[:random_point] + another_sequence[random_point:], + another_sequence[:random_point] + a_sequence[random_point:], + ) + + def uniform_crossover( + self, a_sequence: str, another_sequence: str + ) -> Tuple[str, str]: + """Performs a uniform crossover between two sequences. + + Args: + a_sequence (str): The first sequence for crossover. + another_sequence (str): The second sequence for crossover. + + Returns: + Tuple[str, str]: A tuple of two new sequences resulting + from the crossover. 
+ """ + return ( + "".join( + a if random.random() > self.threshold_probability else b + for a, b in zip(a_sequence, another_sequence) + ), + "".join( + b if random.random() > self.threshold_probability else a + for a, b in zip(a_sequence, another_sequence) + ), + ) diff --git a/src/gt4sd/frameworks/enzeptional/tests/__init__.py b/src/gt4sd/frameworks/enzeptional/tests/__init__.py index fd65e8eac..c1113d761 100644 --- a/src/gt4sd/frameworks/enzeptional/tests/__init__.py +++ b/src/gt4sd/frameworks/enzeptional/tests/__init__.py @@ -1,7 +1,7 @@ # # MIT License # -# Copyright (c) 2022 GT4SD team +# Copyright (c) 2024 GT4SD team # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/src/gt4sd/frameworks/enzeptional/tests/test_core.py b/src/gt4sd/frameworks/enzeptional/tests/test_core.py new file mode 100644 index 000000000..2764b20a3 --- /dev/null +++ b/src/gt4sd/frameworks/enzeptional/tests/test_core.py @@ -0,0 +1,102 @@ +# +# MIT License +# +# Copyright (c) 2024 GT4SD team +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +import warnings +from gt4sd.frameworks.enzeptional.core import ( + SequenceMutator, + EnzymeOptimizer, +) + +from gt4sd.frameworks.enzeptional.processing import HFandTAPEModelUtility + +from gt4sd.configuration import sync_algorithm_with_s3 +from gt4sd.configuration import GT4SDConfiguration + +configuration = GT4SDConfiguration.get_instance() + + +warnings.simplefilter(action="ignore", category=FutureWarning) + +sync_algorithm_with_s3("proteins/enzeptional/scorers", module="properties") + +scorer_filepath = f"{configuration.gt4sd_local_cache_path}/properties/proteins/enzeptional/scorers/feasibility/model.pkl" + + +def test_optimize(): + language_model_path = "facebook/esm2_t33_650M_UR50D" + tokenizer_path = "facebook/esm2_t33_650M_UR50D" + unmasking_model_path = "facebook/esm2_t33_650M_UR50D" + chem_model_path = "seyonec/ChemBERTa-zinc-base-v1" + chem_tokenizer_path = "seyonec/ChemBERTa-zinc-base-v1" + + protein_model = HFandTAPEModelUtility( + embedding_model_path=language_model_path, tokenizer_path=tokenizer_path + ) + + mutation_config = { + "type": "language-modeling", + "embedding_model_path": language_model_path, + "tokenizer_path": tokenizer_path, + "unmasking_model_path": unmasking_model_path, + } + + intervals = [(5, 10), (20, 25)] + batch_size = 5 + top_k = 3 + substrate_smiles = "NC1=CC=C(N)C=C1" + product_smiles = "CNC1=CC=C(NC(=O)C2=CC=C(C=C2)C(C)=O)C=C1" + + sample_sequence = "MSKLLMIGTGPVAIDQFLTRYEASCQAYKDMHQDQQLSSQFNTNLFEGDKALVTKFLEINRTLS" + mutator = SequenceMutator(sequence=sample_sequence, mutation_config=mutation_config) + + optimizer = EnzymeOptimizer( + sequence=sample_sequence, + protein_model=protein_model, + substrate_smiles=substrate_smiles, + 
product_smiles=product_smiles, + chem_model_path=chem_model_path, + chem_tokenizer_path=chem_tokenizer_path, + scorer_filepath=scorer_filepath, + mutator=mutator, + intervals=intervals, + batch_size=batch_size, + top_k=top_k, + selection_ratio=0.25, + perform_crossover=True, + crossover_type="single_point", + concat_order=["substrate", "sequence", "product"], + ) + + num_iterations = 3 + num_sequences = 5 + num_mutations = 5 + time_budget = 50000 + + optimized_sequences, iteration_info = optimizer.optimize( + num_iterations=num_iterations, + num_sequences=num_sequences, + num_mutations=num_mutations, + time_budget=time_budget, + ) + + assert len(optimized_sequences) > 0 diff --git a/src/gt4sd/frameworks/enzeptional/tests/test_processing.py b/src/gt4sd/frameworks/enzeptional/tests/test_processing.py index 2b2b08d73..efaf1c4df 100644 --- a/src/gt4sd/frameworks/enzeptional/tests/test_processing.py +++ b/src/gt4sd/frameworks/enzeptional/tests/test_processing.py @@ -1,7 +1,7 @@ # # MIT License # -# Copyright (c) 2022 GT4SD team +# Copyright (c) 2024 GT4SD team # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -22,24 +22,44 @@ # SOFTWARE. 
# """Enzeptional processing tests.""" - from gt4sd.frameworks.enzeptional.processing import ( - reconstruct_sequence_with_mutation_range, + ModelCache, + get_device, sanitize_intervals, + sanitize_intervals_with_padding, + reconstruct_sequence_with_mutation_range, ) +import torch + + +def test_add_and_get_model(): + model_cache = ModelCache() + test_model = torch.nn.Module() + model_cache.add("test_model", test_model) + retrieved_model = model_cache.get("test_model") + assert test_model == retrieved_model + +class TestUtilityFunctions: + def test_get_device(self): + expected_device = "cuda:0" if torch.cuda.is_available() else "cpu" + assert str(get_device()) == expected_device -def test_sanitize_intervals(): - assert sanitize_intervals([(-5, 12), (13, 14), (2, 3), (-3, 4), (-2, 6)]) == [ - (-5, 12), - (13, 14), - ] + def test_sanitize_intervals(self): + intervals = [(1, 3), (2, 5), (6, 8)] + sanitized = sanitize_intervals(intervals) + assert sanitized == [(1, 5), (6, 8)] + def test_sanitize_intervals_with_padding(self): + intervals = [(1, 3), (6, 8)] + padded_intervals = sanitize_intervals_with_padding(intervals, 8, 50) + assert padded_intervals == [(0, 11)] -def test_reconstruct_sequence_with_mutation_range(): - assert ( - reconstruct_sequence_with_mutation_range( - "ABCDEFGHILMNOPQRSTUVWXYZ", "12789", [(0, 1), (6, 8)] + def test_reconstruct_sequence_with_mutation_range(self): + original_sequence = "AACCGGTT" + mutation_range = "NNNN" + intervals = [(2, 4), (6, 8)] + reconstructed = reconstruct_sequence_with_mutation_range( + original_sequence, mutation_range, intervals ) - == "12CDEF789LMNOPQRSTUVWXYZ" - ) + assert reconstructed == "AANNGGNN"