From 540b16174badf506f0736234205120beabb79c8c Mon Sep 17 00:00:00 2001
From: Li Yin
Date: Thu, 2 Jan 2025 15:20:39 -0800
Subject: [PATCH] Use a random seed to control the order of training samples; add
 backward pass setup for the backward engine via the Trainer.fit function
---
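Usage sketch: a minimal example of how the two additions in this patch fit
together, modeled on use_cases/question_answering/bbh/object_count/train_new.py.
ObjectCountAdalComponent, gpt_3_model, gpt_4o_model, and load_datasets are the
existing helpers from that script (their imports are omitted here), and the
Trainer arguments are assumed to be passed through as in the existing script.

    import adalflow as adal
    from adalflow.core.generator import BackwardPassSetup

    # Task component and model configs come from the object_count use case.
    task = ObjectCountAdalComponent(
        **gpt_3_model,
        backward_engine_model_config=gpt_4o_model,
    )

    trainer = adal.Trainer(
        adaltask=task,
        train_batch_size=4,
        max_steps=12,
        strategy="constrained",
    )

    # New in this patch: fix the order of training samples. The seed is passed
    # to the DataLoader, which reseeds numpy before every (re)shuffle.
    trainer.set_random_seed(2025)

    # New in this patch: configure the backward engine through Trainer.fit.
    # The setup is attached to the backward engine and read in Generator.backward.
    backward_pass_setup = BackwardPassSetup(
        all_pred_at_once=False,
        compute_grad_for_errors_only=False,
    )

    train_dataset, val_dataset, test_dataset = load_datasets()
    ckpt, trainer_results = trainer.fit(
        train_dataset=train_dataset,
        val_dataset=val_dataset,
        test_dataset=test_dataset,
        backward_pass_setup=backward_pass_setup,
    )
    print(trainer_results.total_time)  # total training time, now saved to the checkpoint
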
adalflow/adalflow/core/generator.py | 62 +++++++-
.../adalflow/optim/text_grad/tgd_optimizer.py | 2 +-
adalflow/adalflow/optim/trainer/adal.py | 6 +-
adalflow/adalflow/optim/trainer/trainer.py | 86 ++++------
adalflow/adalflow/optim/types.py | 3 +
adalflow/adalflow/utils/data.py | 9 +-
.../bbh/object_count/task.py | 2 +-
.../bbh/object_count/train_new.py | 32 +++-
use_cases/text_grad_2.0_train.py | 150 ++++++++++--------
9 files changed, 223 insertions(+), 129 deletions(-)
diff --git a/adalflow/adalflow/core/generator.py b/adalflow/adalflow/core/generator.py
index 943fc9efc..e4e1b4f7c 100644
--- a/adalflow/adalflow/core/generator.py
+++ b/adalflow/adalflow/core/generator.py
@@ -9,6 +9,7 @@
from typing import Any, Dict, Optional, Union, Callable, Tuple, List
import logging
+from dataclasses import dataclass, field
from adalflow.core.types import (
@@ -63,6 +64,20 @@
PromptArgType = Dict[str, Union[str, Parameter]]
+@dataclass
+class BackwardPassSetup(DataClass):
+ all_pred_at_once: bool = field(
+ default=True, metadata={"desc": "Backward all predecessors at once."}
+ )
+ threshold_score_to_compute_grad_for_errors: float = field(
+ default=0.9,
+ metadata={"desc": "Threshold score to compute gradient for errors."},
+ )
+ compute_grad_for_errors_only: bool = field(
+ default=False, metadata={"desc": "Compute gradient for errors only."}
+ )
+
+
class Generator(GradComponent, CachedEngine, CallbackManager):
__doc__ = """An user-facing orchestration component for LLM prediction.
@@ -95,6 +110,10 @@ class Generator(GradComponent, CachedEngine, CallbackManager):
{}
) # to create teacher generator from student TODO: might reaccess this
+ backward_pass_setup: BackwardPassSetup = (
+ BackwardPassSetup()
+ ) # default setup for the backward pass
+
def __init__(
self,
*,
@@ -184,6 +203,9 @@ def __init__(
{}
) # used by dynamic computation graph and backpropagation
+ def update_default_backward_pass_setup(self, setup: BackwardPassSetup):
+ self.backward_pass_setup = setup
+
def set_cache_path(self, cache_path: str, model_client: object, model: str):
"""Set the cache path for the generator."""
@@ -593,6 +615,7 @@ def data_to_prompt_map_fn(data: Parameter) -> str:
log.debug(f"Backward engine: {self.backward_engine}")
# attach a function to compute gradient for predecessors
+
response.set_grad_fn(
BackwardContext(
backward_fn=self.backward,
@@ -602,7 +625,6 @@ def data_to_prompt_map_fn(data: Parameter) -> str:
template=self.template,
prompt_str=self.get_prompt(**combined_prompt_kwargs),
id=id,
- all_pred_at_once=True,
)
)
return response
@@ -615,11 +637,16 @@ def backward(
prompt_str: str,
backward_engine: Optional["Generator"] = None,
id: Optional[str] = None, # the id of the input
- all_pred_at_once: bool = True,
) -> Parameter:
log.info(f"Generator: Backward: {response.name}")
+ backward_pass_setup = backward_engine.backward_pass_setup
+ printc(
+ f"backward pass setup: {backward_pass_setup}, name: {self.name}",
+ color="red",
+ )
+
children_params = response.predecessors
is_intermediate_node = True
if response.get_gradient_and_context_text().strip() == "":
@@ -648,6 +675,9 @@ def backward(
for pred in children_params:
pred.backward_engine_disabled = True
return
+
+ all_pred_at_once = backward_pass_setup.all_pred_at_once
+
if not all_pred_at_once:
for pred in children_params:
if not pred.requires_opt or pred.param_type == ParameterType.DEMOS:
@@ -663,6 +693,7 @@ def backward(
template=template,
backward_engine=backward_engine,
prompt_str=prompt_str,
+ backward_pass_setup=backward_pass_setup,
is_intermediate_node=is_intermediate_node,
)
else:
@@ -680,6 +711,7 @@ def backward(
template=template,
backward_engine=backward_engine,
prompt_str=prompt_str,
+ backward_pass_setup=backward_pass_setup,
is_intermediate_node=is_intermediate_node,
)
else:
@@ -693,6 +725,7 @@ def _backward_through_all_predecessors(
template: str,
backward_engine: "BackwardEngine",
prompt_str: str,
+ backward_pass_setup: BackwardPassSetup,
is_intermediate_node: bool = False,
):
parser = JsonParser()
@@ -762,8 +795,13 @@ def _backward_through_all_predecessors(
gradient_output: GeneratorOutput = None
response_gradient_list = [""] * len(children_params)
- if response._score is not None and float(response._score) > 0.9:
- manual_response_1 = f"You get score: {response._score}."
+ if (
+ backward_pass_setup.compute_grad_for_errors_only
+ and response._score is not None
+ and float(response._score)
+ > backward_pass_setup.threshold_score_to_compute_grad_for_errors
+ ):
+ manual_response_1 = f"You get score: {response._score}. No noticeable error."
response_gradient_list = [manual_response_1] * len(children_params)
raw_response = str(response_gradient_list)
gradient_output = GeneratorOutput(
@@ -832,6 +870,7 @@ def _backward_through_one_predecessor(
template: str,
backward_engine: "BackwardEngine",
prompt_str: str,
+ backward_pass_setup: BackwardPassSetup,
is_intermediate_node: bool = False,
):
"""Creating gradient/textual feedback for prompt type parameters."""
@@ -840,7 +879,7 @@ def _backward_through_one_predecessor(
f"Generator: Skipping {pred} as it does not require optimization."
)
return
- log.debug(
+ printc(
f"Generator: Backward through {pred}, is_intermediate_node: {is_intermediate_node}"
)
@@ -872,8 +911,10 @@ def _backward_through_one_predecessor(
variable_dict = pred.get_param_info()
+ peers = [p.get_param_info() for p in pred.peers]
+
variable_and_peers_info = Prompt(
- prompt_kwargs={"variable": variable_dict, "peers": pred.peers},
+ prompt_kwargs={"variable": variable_dict, "peers": peers},
template=VARIABLE_AND_PEERS_INFO,
)()
@@ -914,10 +955,15 @@ def _backward_through_one_predecessor(
)
print(f"Backward engine prompt: {backward_engine_prompt_str}")
gradient_output: GeneratorOutput = None
- if response._score is not None and float(response._score) > 0.9:
+ if (
+ backward_pass_setup.compute_grad_for_errors_only
+ and response._score is not None
+ and float(response._score)
+ > backward_pass_setup.threshold_score_to_compute_grad_for_errors
+ ):
log.debug(f"EvalFnToTextLoss: Skipping {pred} as the score is high enough.")
# TODO: plus score descriptions
- manual_response = f"You get score: {response._score}."
+ manual_response = f"You get score: {response._score}. No noticeable error."
gradient_output = GeneratorOutput(
data=manual_response, raw_response=manual_response
)
diff --git a/adalflow/adalflow/optim/text_grad/tgd_optimizer.py b/adalflow/adalflow/optim/text_grad/tgd_optimizer.py
index 26bbf38eb..076a2a4e0 100644
--- a/adalflow/adalflow/optim/text_grad/tgd_optimizer.py
+++ b/adalflow/adalflow/optim/text_grad/tgd_optimizer.py
@@ -374,7 +374,7 @@ def _get_user_prompt_kwargs(self, param: Parameter) -> Dict[str, str]:
variable=param.get_param_info(), peers=peers_params
)
- variable_grad = param.get_gradients_component_schema(skip_correct_sample=True)
+ variable_grad = param.get_gradients_component_schema(skip_correct_sample=False)
user_prompt_kwargs = {
"variable_and_peers_info": variable_and_peer_info,
diff --git a/adalflow/adalflow/optim/trainer/adal.py b/adalflow/adalflow/optim/trainer/adal.py
index 9e12da69e..0609ea963 100644
--- a/adalflow/adalflow/optim/trainer/adal.py
+++ b/adalflow/adalflow/optim/trainer/adal.py
@@ -9,7 +9,7 @@
if TYPE_CHECKING:
from adalflow.core.model_client import ModelClient
- from adalflow.core.generator import Generator, BackwardEngine
+ from adalflow.core.generator import Generator, BackwardEngine, BackwardPassSetup
from adalflow.optim.parameter import Parameter
from adalflow.core.component import Component
@@ -187,6 +187,7 @@ def configure_backward_engine(self, *args, **kwargs):
self.configure_backward_engine_helper(
model_client=self.backward_engine_model_config["model_client"],
model_kwargs=self.backward_engine_model_config["model_kwargs"],
+ backward_pass_setup=kwargs.get("backward_pass_setup", None),
)
# def configure_backward_engine(self, *args, **kwargs):
@@ -594,6 +595,7 @@ def configure_backward_engine_helper(
model_client: "ModelClient",
model_kwargs: Dict[str, Any],
template: Optional[str] = None,
+ backward_pass_setup: Optional["BackwardPassSetup"] = None,
):
r"""Configure a backward engine for all generators in the task for bootstrapping examples."""
from adalflow.core.generator import BackwardEngine
@@ -603,6 +605,8 @@ def configure_backward_engine_helper(
model_kwargs=model_kwargs,
template=template,
)
+ if backward_pass_setup is not None:
+ self.backward_engine.update_default_backward_pass_setup(backward_pass_setup)
# set all generator's backward engine
diff --git a/adalflow/adalflow/optim/trainer/trainer.py b/adalflow/adalflow/optim/trainer/trainer.py
index c3a351df3..98ed97eea 100644
--- a/adalflow/adalflow/optim/trainer/trainer.py
+++ b/adalflow/adalflow/optim/trainer/trainer.py
@@ -15,6 +15,8 @@
if TYPE_CHECKING:
from adalflow.optim.parameter import Parameter
+ from adalflow.core.generator import BackwardPassSetup
+
from adalflow.optim.types import (
PromptData,
TrainerResult,
@@ -82,6 +84,7 @@ class Trainer(Component):
optimization_order: Literal["sequential", "mix"] = (
"sequential" # zero-shot first, bootstrap second
)
+ sequential_order: List[str] = ["text", "demo"]
max_steps: int
optimizer: Optimizer = None
ckpt_path: Optional[str] = None
@@ -98,7 +101,7 @@ class Trainer(Component):
max_error_samples: Optional[int] = 2
max_correct_samples: Optional[int] = 2
debug: bool = False
- sequential_order: List[str] = ["text", "demo"]
+ random_seed: Optional[int] = None
def __init__(
self,
@@ -173,6 +176,9 @@ def __init__(
)
self.sequential_order = sequential_order
+ def set_random_seed(self, seed: int):
+ self.random_seed = seed
+
# TODO: need to support checkpoint resume too!
def diagnose(self, dataset: Any, split: str = "train"):
"""Run an evaluation on the trainset to track all error response, and its raw response using AdaplComponent's default configure_callbacks
@@ -376,6 +382,7 @@ def fit(
resume_from_ckpt: Optional[
str
] = None, # TODO: have a more comprehensive ckpt loading in the future
+ backward_pass_setup: Optional["BackwardPassSetup"] = None,
) -> Tuple[str, TrainerResult]:
r"""
train_loader: An iterable or collection of iterables specifying training samples.
@@ -383,6 +390,7 @@ def fit(
Returns:
Tuple[str, TrainerResult]: Checkpoint file and the TrainerResult object
"""
+
start_time = time.time()
debug = debug or self.debug
@@ -410,6 +418,7 @@ def fit(
train_dataset,
batch_size=batch_size,
shuffle=True if not debug else False,
+ seed=self.random_seed,
)
val_dataset = val_dataset or self.val_dataset
test_dataset = test_dataset or self.test_dataset
@@ -461,7 +470,9 @@ def fit(
if len(self._get_trainable_text_params()) > 0:
if self.adaltask.backward_engine is None:
- self.adaltask.configure_backward_engine()
+ self.adaltask.configure_backward_engine(
+ backward_pass_setup=backward_pass_setup
+ )
else:
print("No trainable text params to optimize")
self.text_optimizers = []
@@ -592,23 +603,13 @@ def run_demo_optimizers(starting_step: int, trainer_results: TrainerResult):
run_demo_optimizers(starting_step, trainer_results)
starting_step += self.max_steps
run_text_optimizers(starting_step, trainer_results)
- # if len(self.text_optimizers) > 0:
- # run_text_optimizers(starting_step, trainer_results)
-
- # if len(self.demo_optimizers) > 0:
- # run_demo_optimizers(starting_step, trainer_results)
- # self.adaltask.configure_teacher_generator() # attemp to use the newest teacher as
- # self._fit_demos_random(
- # train_loader,
- # train_dataset,
- # val_dataset,
- # test_dataset,
- # trainer_results=trainer_results,
- # starting_step=starting_step,
- # )
end_time = time.time()
print(f"Training time: {end_time - start_time}s")
+ trainer_results.total_time = end_time - start_time
+ # write the results to the checkpoint file
+ save_json(trainer_results.to_dict(), self.ckpt_file)
+
print(f"ckpt_file: {self.ckpt_file}")
return self.ckpt_file, trainer_results
@@ -747,7 +748,7 @@ def _fit_demos_one_step_for_debug(
self.prep_ckpt_file_path()
debug_path = os.path.join(self.ckpt_path, "debug_demos")
os.makedirs(debug_path, exist_ok=True)
- print(f"save to {debug_path}")
+ print(f"_fit_demos_one_step_for_debug save to {debug_path}")
self.adaltask.train()
self.adaltask.trace()
@@ -832,41 +833,11 @@ def _fit_demos_one_step_for_debug(
self._demo_optimizers_add_scores(
[sample.id for sample in batch], batch_per_item_scores, is_teacher=False
)
- # for loss in losses_student:
- # loss.backward()
+
# Check the eval result
y_preds_outputs = [p.data for p in y_preds_student]
eval_result = self.adaltask.evaluate_samples(batch, y_preds_outputs)
print(f"Eval result: {eval_result.avg_score}")
- # eval_score_per_item = eval_result.per_item_scores
-
- # bootstrap a batch
- # batch_for_teacher = []
- # losses_teacher = []
-
- # for i, (sample, item_score) in enumerate(zip(batch, eval_score_per_item)):
-
- # # use teacher
- # if sample.id in pred_teacher:
- # continue
- # # if item_score < 0.5:
- # pred_teacher.add(sample.id)
- # batch_for_teacher.append(sample)
- # # run teacher, use teachers's output instead of the initial output (bootstrap)
- # if len(batch_for_teacher) > 0:
- # print(f"Using teacher for {len(batch_for_teacher)} samples")
- # self.adaltask.use_teacher()
- # y_preds_teacher = self.adaltask.train_step(
- # batch_for_teacher, batch_idx, self.num_workers
- # )
- # losses_teacher: List[Parameter] = self.adaltask.loss_step( # noqa F841
- # batch_for_teacher, y_preds_teacher, batch_idx, self.num_workers
- # )
- # self._demo_optimizers_add_scores(
- # [sample.id for sample in batch_for_teacher],
- # eval_score_per_item,
- # is_teacher=True,
- # )
# loss_students backward
for loss in losses_student:
@@ -1094,7 +1065,7 @@ def _fit_text_grad_demo_mix_constrained(
if trainer_results is None
else trainer_results
)
- print(f"save to {self.ckpt_file}")
+ print(f"_fit_text_grad_demo_mix_constrained save to {self.ckpt_file}")
if train_dataset is None:
raise ValueError("train_dataset is required")
@@ -1267,7 +1238,7 @@ def _fit_text_grad_demo_mix_random(
if train_results is None
else train_results
)
- print(f"save to {self.ckpt_file}")
+ print(f"_fit_text_grad_demo_mix_random save to {self.ckpt_file}")
if train_dataset is None:
raise ValueError("train_dataset is required")
@@ -1409,7 +1380,7 @@ def _fit_demos_random(
if trainer_results is None
else trainer_results
)
- print(f"save to {self.ckpt_file}")
+ print(f"_fit_demos_random save to {self.ckpt_file}")
print(f"Starting step: {starting_step}")
self.adaltask.train()
@@ -1602,7 +1573,7 @@ def _fit_text_grad_random(
if trainer_results is None
else trainer_results
)
- print(f"save to {self.ckpt_file}")
+ print(f"_fit_text_grad_random save to {self.ckpt_file}")
self.adaltask.train()
# self.optimizer.zero_grad()
@@ -1623,6 +1594,8 @@ def _fit_text_grad_random(
self.adaltask.train() # this will turn everything to train mode
# self.train()
try:
+ # print(f"Batch: {batch}")
+ # continue
y_preds = self.adaltask.train_step(batch, steps, self.num_workers)
except Exception as e:
print(f"Error in train step: {e}")
@@ -1659,6 +1632,8 @@ def _fit_text_grad_random(
if val_score > last_val_score:
print(f"Optimizer step: {val_score} > {last_val_score}")
+ # track the effectiveness
+ self._track_effectiveness("valset", True)
# self.optimizer.step()
self._step_text_optimizers()
self._add_history_text_optimizers(val_score) # track top performer
@@ -1680,6 +1655,7 @@ def _fit_text_grad_random(
print(f"Optimizer revert: {val_score} <= {last_val_score}")
self._revert_text_optimizers()
+ self._track_effectiveness("valset", False)
# save the score, no change
self._add_one_step_in_trainer_results(
trainer_results,
@@ -2139,13 +2115,14 @@ def _fit_text_grad_constraint(
from adalflow.optim.parameter import OutputParameter
logger.info("Fitting using Textual Gradient Descent with constraints")
+ printc("Fitting using Textual Gradient Descent with constraints")
trainer_results = (
self._pre_fit(val_dataset, test_dataset)
if trainer_results is None
else trainer_results
)
- print(f"save to {self.ckpt_file}")
+ print(f"_fit_text_grad_constraint save to {self.ckpt_file}")
self.adaltask.train()
self._zero_grad_text_optimizers()
@@ -2155,6 +2132,7 @@ def _fit_text_grad_constraint(
all_samples, all_losses = [], []
all_y_preds: List[OutputParameter] = []
for epoch in tqdm(range(num_epochs), desc="Epoch"):
+ print(f"Epoch: {epoch}")
for steps, batch in enumerate((pbar := tqdm(train_loader, position=0))):
total_steps += 1
if total_steps > self.max_steps + starting_step:
@@ -2163,6 +2141,8 @@ def _fit_text_grad_constraint(
self._zero_grad_text_optimizers()
pbar.set_description(f"Training Step: {total_steps}")
self.adaltask.train() # this will turn everything to train mode
+ # print(f"Batch: {batch}")
+ # continue
y_preds = self.adaltask.train_step(batch, steps, self.num_workers)
losses = self.adaltask.loss_step(
batch, y_preds, steps, self.num_workers
diff --git a/adalflow/adalflow/optim/types.py b/adalflow/adalflow/optim/types.py
index 6c4bb92f1..83517a5c4 100644
--- a/adalflow/adalflow/optim/types.py
+++ b/adalflow/adalflow/optim/types.py
@@ -158,3 +158,6 @@ class TrainerResult(DataClass):
trainer_state: Dict[str, Any] = field(
default=None, metadata={"desc": "Save the most detailed state of the trainer"}
)
+ total_time: float = field(
+ default=0.0, metadata={"desc": "Total time taken for training"}
+ )
diff --git a/adalflow/adalflow/utils/data.py b/adalflow/adalflow/utils/data.py
index 682453b1d..374c47b44 100644
--- a/adalflow/adalflow/utils/data.py
+++ b/adalflow/adalflow/utils/data.py
@@ -74,10 +74,13 @@ class DataLoader:
The biggest difference is not to handle tensors, but to handle any type of data."""
- def __init__(self, dataset, batch_size: int = 4, shuffle: bool = True):
+ def __init__(
+ self, dataset, batch_size: int = 4, shuffle: bool = True, seed: int = 42
+ ):
self.dataset = dataset
self.batch_size = batch_size
self.shuffle = shuffle
+ self.seed = seed
self.indices = np.arange(len(dataset))
# if self.shuffle:
@@ -91,6 +94,8 @@ def set_max_steps(self, max_steps: int):
def __iter__(self):
if self.shuffle:
+ if self.seed is not None:
+ np.random.seed(self.seed) # Use the provided seed
np.random.shuffle(self.indices)
self.current_index = 0
return self
@@ -104,6 +109,8 @@ def __next__(self) -> Union[np.ndarray, Tuple]:
if self.current_index >= len(self.dataset):
if self.shuffle:
+ if self.seed is not None:
+ np.random.seed(self.seed) # Use the same seed for reshuffle
np.random.shuffle(self.indices) # Reshuffle for the new epoch
self.current_index = 0
if self.step_index < self.max_steps:
diff --git a/use_cases/question_answering/bbh/object_count/task.py b/use_cases/question_answering/bbh/object_count/task.py
index 2f17930dd..8be247509 100644
--- a/use_cases/question_answering/bbh/object_count/task.py
+++ b/use_cases/question_answering/bbh/object_count/task.py
@@ -40,7 +40,7 @@ def __init__(self, model_client: adal.ModelClient, model_kwargs: Dict):
few_shot_demos = adal.Parameter(
data=None,
role_desc="To provide few shot demos to the language model",
- requires_opt=True,
+ requires_opt=False,
param_type=ParameterType.DEMOS,
)
diff --git a/use_cases/question_answering/bbh/object_count/train_new.py b/use_cases/question_answering/bbh/object_count/train_new.py
index 467ab7c66..b639beb3d 100644
--- a/use_cases/question_answering/bbh/object_count/train_new.py
+++ b/use_cases/question_answering/bbh/object_count/train_new.py
@@ -96,6 +96,9 @@ def train_diagnose_teacher(
# You will answer a reasoning question. Think step by step and double-check each calculation you make. Pay close attention to any numerical quantities in the text, converting written numbers into their numerical equivalents. Additionally, re-verify your final answer before concluding. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.
# 0.98 val, 0.91 test
+from adalflow.core.generator import BackwardPassSetup
+
+
def train(
train_batch_size=4, # larger batch size is not that effective, probably because of llm's lost in the middle
raw_shots: int = 0,
@@ -107,6 +110,8 @@ def train(
debug=False,
resume_from_ckpt=None,
exclude_input_fields_from_bootstrap_demos=False,
+ seed=None,
+ tg: bool = False,
):
adal_component = ObjectCountAdalComponent(
**gpt_3_model,
@@ -115,6 +120,13 @@ def train(
backward_engine_model_config=gpt_4o_model,
)
print(adal_component)
+ backward_pass_setup = None
+ if tg:
+ backward_pass_setup = BackwardPassSetup(
+ all_pred_at_once=False,
+ compute_grad_for_errors_only=False,
+ )
+
trainer = adal.Trainer(
train_batch_size=train_batch_size,
adaltask=adal_component,
@@ -124,21 +136,24 @@ def train(
raw_shots=raw_shots,
bootstrap_shots=bootstrap_shots,
debug=debug,
- weighted_sampling=True,
+ weighted_sampling=False,
optimization_order=optimization_order,
exclude_input_fields_from_bootstrap_demos=exclude_input_fields_from_bootstrap_demos,
)
+ trainer.set_random_seed(seed)
print(trainer)
train_dataset, val_dataset, test_dataset = load_datasets()
# train_dataset = train_dataset[:4]
# val_dataset = val_dataset[:4]
# test_dataset = test_dataset[:4]
+
ckpt, _ = trainer.fit(
train_dataset=train_dataset,
val_dataset=val_dataset,
test_dataset=test_dataset,
resume_from_ckpt=resume_from_ckpt,
+ backward_pass_setup=backward_pass_setup,
)
return ckpt
@@ -146,12 +161,18 @@ def train(
if __name__ == "__main__":
import json
+ import random
+
+ random.seed(2025)
+ # np.random.seed(2025) # Set NumPy random seed
+
# make the strategy configurable in the script
import argparse
parser = argparse.ArgumentParser()
- parser.add_argument("--strategy", type=str, default="random")
+ parser.add_argument("--strategy", type=str, default="constrained")
+ parser.add_argument("--use_tg", action="store_true")
parser.add_argument(
"output_path", nargs="?", help="File path to save the checkpoint"
)
@@ -160,12 +181,16 @@ def train(
set_strategy = args.strategy
set_output_path = args.output_path
+ use_tg = args.use_tg
ckpt = train(
debug=False,
max_steps=12,
strategy=set_strategy,
exclude_input_fields_from_bootstrap_demos=True,
+ seed=2025, # pass the numpy seed
+ tg=use_tg,
+ # resume_from_ckpt="/Users/liyin/.adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_18e8d_run_1.json",
)
print(f"ckpt: {ckpt}")
if set_output_path:
@@ -188,3 +213,6 @@ def train(
# /Users/liyin/.adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_1f358_run_1.json 1 val 0.96 val 955s
# 0.94 val, 0.89 test, /Users/liyin/.adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_e1bb5_run_1.json 907s, with both positive and negatives
# 92, 91 test /Users/liyin/.adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_18e8d_run_1.json 747s
+ # 96% /Users/liyin/.adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_18e8d_run_1.json
+ # (90%, 94%, 92%, 94%) 92.5 + 1.5
+ # (96%, 100%, 96%, 96% ) 97+ 1.73
diff --git a/use_cases/text_grad_2.0_train.py b/use_cases/text_grad_2.0_train.py
index 6071e36ce..e84b15fba 100644
--- a/use_cases/text_grad_2.0_train.py
+++ b/use_cases/text_grad_2.0_train.py
@@ -1,6 +1,8 @@
import subprocess
import tempfile
import json
+import numpy as np
+import argparse
num_runs = 4
# List of experiments to run
@@ -13,9 +15,25 @@
# hotpot_qa_multi_hop_rag,
]
+# set up the strategy for each experiment
+
+argparser = argparse.ArgumentParser()
+argparser.add_argument("--strategy", type=str, default="constrained")
+argparser.add_argument("--use_tg", action="store_true")
+args = argparser.parse_args()
+
+strategy = args.strategy
+use_tg = args.use_tg
+
# Optional: Arguments for each experiment (if needed)
+
+setup_str = f"--strategy {strategy}"
+
+if use_tg:
+ setup_str += " --use_tg"
+
experiment_args = {
- object_count: "--strategy constrained",
+ object_count: setup_str,
# hotpot_qa_multi_hop_rag: "",
}
ckpt_values = {}
@@ -64,65 +82,72 @@ def run_experiment(script, args):
if ckpt:
ckpt_values[ckpt_index] = ckpt
# load all json files using the ckpt paths
- highest_test_score, mean_test_score, standard_deviation = 0, 0, 0
- past_highest_scores = []
- # average pass rate, average pass prompts
- average_pass_rate_list = []
- average_pass_prompts_list = []
- average_total_prompts = []
- total_prompts = 0
- highest_test_score_json_file = None
+ highest_test_score, last_test_score, mean_test_score, standard_deviation = (
+ 0,
+ 0,
+ 0,
+ 0,
+ )
+ last_test_scores = []
+ highest_val_scores = []
+ total_passes = (
+ []
+ ) # each is the number of unique val scores in the highest val scores
+ total_prompts = [] # how many prompts tried in total
+
+ past_highest_val_scores = []
+ # # average pass rate, average pass prompts
+ # average_pass_rate_list = []
+ # average_pass_prompts_list = []
+ # average_total_prompts = []
+ # highest_test_score_json_file = None
+ total_steps = []
+ training_times = []
for experiment_index, ckpt in ckpt_values.items():
with open(ckpt, "r") as f:
data = json.load(f)
print(f"Experiment: {experiment_index}")
print(f"Data: {data}")
- _high_test_score = max(data["val_scores"])
- print(f" val score: {data["val_scores"]}")
- past_highest_scores.append(_high_test_score)
- if _high_test_score > highest_test_score:
- highest_test_score = _high_test_score
- highest_test_score_json_file = ckpt
+ _high_val_score = max(data["val_scores"])
+ _unique_val_scores = len(set(data["val_scores"])) - 1
+ _last_test_score = data["test_scores"][-1]
# read the effective measures
effective_measures = data.get("effective_measure", {})
- if not effective_measures:
- total_prompts = len(data["val_scores"]) - 1
- # count the total number of different test scores
- pass_num = len(set(data["val_scores"])) - 1
- average_pass_rate = pass_num / total_prompts
- average_pass_rate_list.append(average_pass_rate)
- average_pass_prompts_list.append(pass_num)
- average_total_prompts.append(total_prompts)
- else:
- total_prompts = (
- effective_measures["subset"]["pass"]
- + effective_measures["subset"]["fail"]
- )
-
- pass_num = effective_measures["valset"]["pass"]
- total_val_prompts = (
- effective_measures["valset"]["pass"]
- + effective_measures["valset"]["fail"]
- )
- average_pass_rate = pass_num / total_val_prompts
- average_pass_rate_list.append(average_pass_rate)
- average_pass_prompts_list.append(pass_num)
- average_total_prompts.append(total_prompts)
- # calculate the mean test score
- mean_test_score = sum(past_highest_scores) / len(past_highest_scores)
- # calculate the standard deviation
- standard_deviation = sum(
- [(x - mean_test_score) ** 2 for x in past_highest_scores]
- ) / len(past_highest_scores)
- standard_deviation = standard_deviation**0.5
- # calculate the average pass rate
- average_pass_rate = sum(average_pass_rate_list) / len(average_pass_rate_list)
- # calculate the average pass prompts
- average_pass_prompts = sum(average_pass_prompts_list) / len(
- average_pass_prompts_list
- )
- # calculate the average total prompts
- average_total_prompts = sum(average_total_prompts) / num_runs
+
+ _total_prompts = effective_measures.get("subset", {}).get(
+ "pass", 0
+ ) + effective_measures.get("subset", {}).get("fail", 0)
+ _total_steps = len(data["steps"]) - 1
+ _training_time = data.get("total_time", 0)
+ # save the results in the lists
+ past_highest_val_scores.append(_high_val_score)
+ total_passes.append(_unique_val_scores)
+ total_prompts.append(_total_prompts)
+ last_test_scores.append(_last_test_score)
+ total_steps.append(_total_steps)
+ training_times.append(_training_time)
+
+ # ensure all steps are the same
+ assert all(
+ [step == total_steps[0] for step in total_steps]
+ ), "All steps should be the same"
+
+ # compute the metrics
+ mean_test_score = np.mean(last_test_scores)
+ std_test_score = np.std(last_test_scores)
+
+ # val scores
+ mean_val_score = np.mean(past_highest_val_scores)
+ std_val_score = np.std(past_highest_val_scores)
+
+ # pass rate total_passes / steps
+ average_pass_rate = np.mean(total_passes) / total_steps[0]
+
+ # average total prompts
+ average_total_prompts = np.mean(total_prompts)
+
+ # average training time
+ average_training_time = np.mean(training_times)
# add these numbers in the ckpt_values
index = f"{experiment}_summary"
@@ -131,14 +156,15 @@ def run_experiment(script, args):
"num_runs": num_runs,
"args": args,
},
- "highest_test_score": highest_test_score,
- "mean_test_score": mean_test_score,
- "standard_deviation": standard_deviation,
- "highest_test_score_json_file": highest_test_score_json_file,
- "average_pass_rate": average_pass_rate,
- "average_pass_prompts": average_pass_prompts,
- "average_total_prompts": average_total_prompts,
- "past_highest_scores": past_highest_scores,
+ "metrics": {
+ "mean_test_score": mean_test_score,
+ "std_test_score": std_test_score,
+ "mean_val_score": mean_val_score,
+ "std_val_score": std_val_score,
+ "average_pass_rate": average_pass_rate,
+ "average_total_prompts": average_total_prompts,
+ "average_training_time": average_training_time,
+ },
}
print("\nAll Checkpoints:")
@@ -147,6 +173,6 @@ def run_experiment(script, args):
# Save the results to a file
with open(result_file, "w") as f:
- json.dump(ckpt_values, f)
+ json.dump(ckpt_values, f, indent=4)
print(f"\nResults saved to {result_file}")