Skip to content

Commit

Permalink
feat: using enzeptional via pypi. (#252)
Browse files Browse the repository at this point in the history
Signed-off-by: nanayves <[email protected]>
  • Loading branch information
yvesnana authored Sep 6, 2024
1 parent 60fb56b commit 115a705
Show file tree
Hide file tree
Showing 10 changed files with 267 additions and 1,469 deletions.
106 changes: 0 additions & 106 deletions examples/enzeptional/data.csv

This file was deleted.

165 changes: 116 additions & 49 deletions examples/enzeptional/example_enzeptional.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,16 @@
import logging
import pandas as pd
from typing import Tuple, List, Optional
from gt4sd.frameworks.enzeptional.processing import HFandTAPEModelUtility
from gt4sd.frameworks.enzeptional.core import SequenceMutator, EnzymeOptimizer
from gt4sd.frameworks.enzeptional import (
EnzymeOptimizer,
SequenceMutator,
SequenceScorer,
CrossoverGenerator,
HuggingFaceEmbedder,
HuggingFaceModelLoader,
HuggingFaceTokenizerLoader,
SelectionGenerator,
)
from gt4sd.configuration import GT4SDConfiguration, sync_algorithm_with_s3


Expand All @@ -17,23 +25,33 @@ def initialize_environment(model = "feasibility") -> Tuple[str, Optional[str]]:
"""
configuration = GT4SDConfiguration.get_instance()
sync_algorithm_with_s3("proteins/enzeptional/scorers", module="properties")
return f"{configuration.gt4sd_local_cache_path}/properties/proteins/enzeptional/scorers/feasibility/model.pkl"


def load_experiment_parameters() -> Tuple[List, List, List, List]:
scorer = f"{configuration.gt4sd_local_cache_path}/properties/proteins/enzeptional/scorers/{model}/model.pkl"
if model == "feasibility":
return scorer, None
else:
scaler = f"{configuration.gt4sd_local_cache_path}/properties/proteins/enzeptional/scorers/{model}/scaler.pkl"
return scorer, scaler

def load_experiment_parameters(
    model="feasibility",
) -> Tuple[str, str, str, List[Tuple[int, int]], str, Optional[str]]:
    """Return the built-in example inputs together with the scorer paths.

    The substrate, product, sequence and mutation intervals are defined
    inline; no data file is read.

    Args:
        model (str): Name of the scorer to use. It is forwarded unchanged to
            ``initialize_environment``. Defaults to "feasibility".

    Returns:
        A 6-tuple ``(substrate_smiles, product_smiles, sample_sequence,
        intervals, scorer_path, scaler_path)``. ``scorer_path`` and
        ``scaler_path`` are exactly what ``initialize_environment(model)``
        returns.
    """
    substrate_smiles = "NC1=CC=C(N)C=C1"
    product_smiles = "CNC1=CC=C(NC(=O)C2=CC=C(C=C2)C(C)=O)C=C1"
    intervals = [(5, 10), (20, 25)]
    sample_sequence = "MSKLLMIGTGPVAIDQFLTRYEASCQAYKDMHQDQQLSSQFNTNLFEGDKALVTKFLEINRTLS"
    scorer_path, scaler_path = initialize_environment(model)
    return (
        substrate_smiles,
        product_smiles,
        sample_sequence,
        intervals,
        scorer_path,
        scaler_path,
    )


def setup_optimizer(
substrate_smiles: str,
product_smiles: str,
sample_sequence: str,
intervals: List[List[int]],
scorer_path: str,
scaler_path: str,
intervals: List[List[int]],
concat_order: List[str],
top_k: int,
batch_size: int,
use_xgboost_scorer: bool
):
"""Set up and return the optimizer with all necessary components configured
Expand All @@ -44,48 +62,82 @@ def setup_optimizer(
product_smiles (str): SMILES representation of the
product.
sample_sequence (str): The initial protein sequence.
intervals (List[List[int]]): Intervals for mutation.
scorer_path (str): File path to the scoring model.
scaler_path (str): Path to the scaler in case you are using the Kcat model.
intervals (List[List[int]]): Intervals for mutation.
concat_order (List[str]): Order of concatenating embeddings.
top_k (int): Number of top amino acids to use to create mutants.
batch_size (int): Batch size.
use_xgboost_scorer (bool): flag to specify if the fitness function is the Kcat.
Returns:
Initialized optimizer
"""
model_tokenizer_paths = "facebook/esm2_t33_650M_UR50D"
chem_paths = "seyonec/ChemBERTa-zinc-base-v1"
language_model_path = "facebook/esm2_t33_650M_UR50D"
tokenizer_path = "facebook/esm2_t33_650M_UR50D"
chem_model_path = "seyonec/ChemBERTa-zinc-base-v1"
chem_tokenizer_path = "seyonec/ChemBERTa-zinc-base-v1"

model_loader = HuggingFaceModelLoader()
tokenizer_loader = HuggingFaceTokenizerLoader()

protein_model = HuggingFaceEmbedder(
model_loader=model_loader,
tokenizer_loader=tokenizer_loader,
model_path=language_model_path,
tokenizer_path=tokenizer_path,
cache_dir=None,
device="cpu",
)

protein_model = HFandTAPEModelUtility(
embedding_model_path=model_tokenizer_paths, tokenizer_path=model_tokenizer_paths
chem_model = HuggingFaceEmbedder(
model_loader=model_loader,
tokenizer_loader=tokenizer_loader,
model_path=chem_model_path,
tokenizer_path=chem_tokenizer_path,
cache_dir=None,
device="cpu",
)

mutation_config = {
"type": "language-modeling",
"embedding_model_path": model_tokenizer_paths,
"tokenizer_path": model_tokenizer_paths,
"unmasking_model_path": model_tokenizer_paths,
"embedding_model_path": language_model_path,
"tokenizer_path": tokenizer_path,
"unmasking_model_path": language_model_path,
}

mutator = SequenceMutator(sequence=sample_sequence, mutation_config=mutation_config)
optimizer_config = {
"sequence": sample_sequence,
"protein_model": protein_model,
"substrate_smiles": substrate_smiles,
"product_smiles": product_smiles,
"chem_model_path": chem_paths,
"chem_tokenizer_path": chem_paths,
"scorer_filepath": scorer_path,
"mutator": mutator,
"intervals": intervals,
"batch_size": 5,
"top_k": 3,
"selection_ratio": 0.25,
"perform_crossover": True,
"crossover_type": "single_point",
"concat_order": concat_order,
"scaler_filepath": scaler_path,
"use_xgboost_scorer": use_xgboost_scorer
}
mutator.set_top_k(top_k)

scorer = SequenceScorer(
protein_model=protein_model,
scorer_filepath=scorer_path,
use_xgboost=use_xgboost_scorer,
scaler_filepath=scaler_path,
)

selection_generator = SelectionGenerator()
crossover_generator = CrossoverGenerator()

optimizer_config = dict(
sequence=sample_sequence,
mutator=mutator,
scorer=scorer,
intervals=intervals,
substrate_smiles=substrate_smiles,
product_smiles=product_smiles,
chem_model=chem_model,
selection_generator=selection_generator,
crossover_generator=crossover_generator,
concat_order=concat_order,
batch_size=batch_size,
selection_ratio=0.25,
perform_crossover=True,
crossover_type="single_point",
pad_intervals=False,
minimum_interval_length=8,
seed=42,
)
return EnzymeOptimizer(**optimizer_config)


Expand All @@ -106,37 +158,52 @@ def optimize_sequences(optimizer):
def main_kcat():
    """Run the example optimization with the Kcat scorer.

    Loads the built-in example inputs for the "kcat" model, builds the
    optimizer with the XGBoost scorer enabled and runs it.
    """
    logging.basicConfig(level=logging.INFO)
    concat_order = ["substrate", "sequence"]
    use_xgboost_scorer = True
    top_k = 2
    batch_size = 2
    (
        substrate_smiles,
        product_smiles,
        sample_sequence,
        intervals,
        scorer_path,
        scaler_path,
    ) = load_experiment_parameters("kcat")
    # Keyword arguments keep this call correct regardless of the positional
    # order of ``setup_optimizer``'s parameters.
    optimizer = setup_optimizer(
        substrate_smiles=substrate_smiles,
        product_smiles=product_smiles,
        sample_sequence=sample_sequence,
        scorer_path=scorer_path,
        scaler_path=scaler_path,
        intervals=intervals,
        concat_order=concat_order,
        top_k=top_k,
        batch_size=batch_size,
        use_xgboost_scorer=use_xgboost_scorer,
    )

    optimized_sequences, iteration_info = optimize_sequences(optimizer)
    logging.info("Optimization completed.")


def main_feasibility():
    """Run the example optimization with the Feasibility scorer.

    Loads the built-in example inputs for the "feasibility" model, builds
    the optimizer with the XGBoost scorer disabled and runs it.
    """
    logging.basicConfig(level=logging.INFO)
    concat_order = ["substrate", "sequence", "product"]
    use_xgboost_scorer = False
    top_k = 2
    batch_size = 2
    # The model name must be spelled exactly "feasibility": it is compared
    # against that literal and interpolated into the scorer file path.
    (
        substrate_smiles,
        product_smiles,
        sample_sequence,
        intervals,
        scorer_path,
        scaler_path,
    ) = load_experiment_parameters("feasibility")
    # Keyword arguments keep this call correct regardless of the positional
    # order of ``setup_optimizer``'s parameters.
    optimizer = setup_optimizer(
        substrate_smiles=substrate_smiles,
        product_smiles=product_smiles,
        sample_sequence=sample_sequence,
        scorer_path=scorer_path,
        scaler_path=scaler_path,
        intervals=intervals,
        concat_order=concat_order,
        top_k=top_k,
        batch_size=batch_size,
        use_xgboost_scorer=use_xgboost_scorer,
    )

    optimized_sequences, iteration_info = optimize_sequences(optimizer)
    logging.info("Optimization completed.")

if __name__ == "__main__":
    # Run both example optimizations in turn: feasibility first, then Kcat.
    main_feasibility()
    main_kcat()
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ setuptools==69.5.1
accelerate>=0.12,<0.20.0
datasets>=1.11.0
diffusers<=0.6.0
enzeptional>=1.0.0
importlib-metadata>=1.7.0,<5.0.0 # temporary: https://github.com/python/importlib_metadata/issues/409
importlib-resources>=5.10.0
ipaddress>=1.0.23
Expand Down
4 changes: 4 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ install_requires =
accelerate
datasets
diffusers
enzeptional
importlib_metadata
importlib_resources
ipaddress
Expand Down Expand Up @@ -293,4 +294,7 @@ ignore_missing_imports = True
ignore_missing_imports = True

[mypy-pydantic_settings.*]
ignore_missing_imports = True

[mypy-enzeptional.*]
ignore_missing_imports = True
16 changes: 14 additions & 2 deletions src/gt4sd/frameworks/enzeptional/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,5 +25,17 @@
Module for enzyme optimization.
"""

from .core import EnzymeOptimizer # noqa: F401
from enzeptional import ( # noqa: F401
EnzymeOptimizer,
SequenceMutator,
SequenceScorer,
CrossoverGenerator,
HuggingFaceEmbedder,
HuggingFaceModelLoader,
HuggingFaceTokenizerLoader,
SelectionGenerator,
mutate_sequence_with_variant,
round_up,
sanitize_intervals,
sanitize_intervals_with_padding,
)
Loading

0 comments on commit 115a705

Please sign in to comment.