From 540b16174badf506f0736234205120beabb79c8c Mon Sep 17 00:00:00 2001 From: Li Yin Date: Thu, 2 Jan 2025 15:20:39 -0800 Subject: [PATCH] use random seed to control the order of training samples, add backward pass setup for the backward engine via the trainer.fit function --- adalflow/adalflow/core/generator.py | 62 +++++++- .../adalflow/optim/text_grad/tgd_optimizer.py | 2 +- adalflow/adalflow/optim/trainer/adal.py | 6 +- adalflow/adalflow/optim/trainer/trainer.py | 86 ++++------ adalflow/adalflow/optim/types.py | 3 + adalflow/adalflow/utils/data.py | 9 +- .../bbh/object_count/task.py | 2 +- .../bbh/object_count/train_new.py | 32 +++- use_cases/text_grad_2.0_train.py | 150 ++++++++++-------- 9 files changed, 223 insertions(+), 129 deletions(-) diff --git a/adalflow/adalflow/core/generator.py b/adalflow/adalflow/core/generator.py index 943fc9efc..e4e1b4f7c 100644 --- a/adalflow/adalflow/core/generator.py +++ b/adalflow/adalflow/core/generator.py @@ -9,6 +9,7 @@ from typing import Any, Dict, Optional, Union, Callable, Tuple, List import logging +from dataclasses import dataclass, field from adalflow.core.types import ( @@ -63,6 +64,20 @@ PromptArgType = Dict[str, Union[str, Parameter]] +@dataclass +class BackwardPassSetup(DataClass): + all_pred_at_once: bool = field( + default=True, metadata={"desc": "Backward all predecessors at once."} + ) + threshold_score_to_compute_grad_for_errors: float = field( + default=0.9, + metadata={"desc": "Threshold score to compute gradient for errors."}, + ) + compute_grad_for_errors_only: bool = field( + default=False, metadata={"desc": "Compute gradient for errors only."} + ) + + class Generator(GradComponent, CachedEngine, CallbackManager): __doc__ = """An user-facing orchestration component for LLM prediction. @@ -95,6 +110,10 @@ class Generator(GradComponent, CachedEngine, CallbackManager): {} ) # to create teacher generator from student TODO: might reaccess this + backward_pass_setup: BackwardPassSetup = ( + BackwardPassSetup() + ) # default setup for the backward pass + def __init__( self, *, @@ -184,6 +203,9 @@ def __init__( {} ) # used by dynamic computation graph and backpropagation + def update_default_backward_pass_setup(self, setup: BackwardPassSetup): + self.backward_pass_setup = setup + def set_cache_path(self, cache_path: str, model_client: object, model: str): """Set the cache path for the generator.""" @@ -593,6 +615,7 @@ def data_to_prompt_map_fn(data: Parameter) -> str: log.debug(f"Backward engine: {self.backward_engine}") # attach a funtion to compute gradient for predecessors + response.set_grad_fn( BackwardContext( backward_fn=self.backward, @@ -602,7 +625,6 @@ def data_to_prompt_map_fn(data: Parameter) -> str: template=self.template, prompt_str=self.get_prompt(**combined_prompt_kwargs), id=id, - all_pred_at_once=True, ) ) return response @@ -615,11 +637,16 @@ def backward( prompt_str: str, backward_engine: Optional["Generator"] = None, id: Optional[str] = None, # the id of the input - all_pred_at_once: bool = True, ) -> Parameter: log.info(f"Generator: Backward: {response.name}") + backward_pass_setup = backward_engine.backward_pass_setup + printc( + f"backward pass setup: {backward_pass_setup}, name: {self.name}", + color="red", + ) + children_params = response.predecessors is_intermediate_node = True if response.get_gradient_and_context_text().strip() == "": @@ -648,6 +675,9 @@ def backward( for pred in children_params: pred.backward_engine_disabled = True return + + all_pred_at_once = backward_pass_setup.all_pred_at_once + if not 
all_pred_at_once: for pred in children_params: if not pred.requires_opt or pred.param_type == ParameterType.DEMOS: @@ -663,6 +693,7 @@ def backward( template=template, backward_engine=backward_engine, prompt_str=prompt_str, + backward_pass_setup=backward_pass_setup, is_intermediate_node=is_intermediate_node, ) else: @@ -680,6 +711,7 @@ def backward( template=template, backward_engine=backward_engine, prompt_str=prompt_str, + backward_pass_setup=backward_pass_setup, is_intermediate_node=is_intermediate_node, ) else: @@ -693,6 +725,7 @@ def _backward_through_all_predecessors( template: str, backward_engine: "BackwardEngine", prompt_str: str, + backward_pass_setup: BackwardPassSetup, is_intermediate_node: bool = False, ): parser = JsonParser() @@ -762,8 +795,13 @@ def _backward_through_all_predecessors( gradient_output: GeneratorOutput = None response_gradient_list = [""] * len(children_params) - if response._score is not None and float(response._score) > 0.9: - manual_response_1 = f"You get score: {response._score}." + if ( + backward_pass_setup.compute_grad_for_errors_only + and response._score is not None + and float(response._score) + > backward_pass_setup.threshold_score_to_compute_grad_for_errors + ): + manual_response_1 = f"You get score: {response._score}. No noticable error." response_gradient_list = [manual_response_1] * len(children_params) raw_response = str(response_gradient_list) gradient_output = GeneratorOutput( @@ -832,6 +870,7 @@ def _backward_through_one_predecessor( template: str, backward_engine: "BackwardEngine", prompt_str: str, + backward_pass_setup: BackwardPassSetup, is_intermediate_node: bool = False, ): """Creating gradient/textual feedback for prompt type parameters.""" @@ -840,7 +879,7 @@ def _backward_through_one_predecessor( f"Generator: Skipping {pred} as it does not require optimization." ) return - log.debug( + printc( f"Generator: Backward through {pred}, is_intermediate_node: {is_intermediate_node}" ) @@ -872,8 +911,10 @@ def _backward_through_one_predecessor( variable_dict = pred.get_param_info() + peers = [p.get_param_info() for p in pred.peers] + variable_and_peers_info = Prompt( - prompt_kwargs={"variable": variable_dict, "peers": pred.peers}, + prompt_kwargs={"variable": variable_dict, "peers": peers}, template=VARIABLE_AND_PEERS_INFO, )() @@ -914,10 +955,15 @@ def _backward_through_one_predecessor( ) print(f"Backward engine prompt: {backward_engine_prompt_str}") gradient_output: GeneratorOutput = None - if response._score is not None and float(response._score) > 0.9: + if ( + backward_pass_setup.compute_grad_for_errors_only + and response._score is not None + and float(response._score) + > backward_pass_setup.threshold_score_to_compute_grad_for_errors + ): log.debug(f"EvalFnToTextLoss: Skipping {pred} as the score is high enough.") # TODO: plus score descriptions - manual_response = f"You get score: {response._score}." + manual_response = f"You get score: {response._score}. No noticable error." 
gradient_output = GeneratorOutput( data=manual_response, raw_response=manual_response ) diff --git a/adalflow/adalflow/optim/text_grad/tgd_optimizer.py b/adalflow/adalflow/optim/text_grad/tgd_optimizer.py index 26bbf38eb..076a2a4e0 100644 --- a/adalflow/adalflow/optim/text_grad/tgd_optimizer.py +++ b/adalflow/adalflow/optim/text_grad/tgd_optimizer.py @@ -374,7 +374,7 @@ def _get_user_prompt_kwargs(self, param: Parameter) -> Dict[str, str]: variable=param.get_param_info(), peers=peers_params ) - variable_grad = param.get_gradients_component_schema(skip_correct_sample=True) + variable_grad = param.get_gradients_component_schema(skip_correct_sample=False) user_prompt_kwargs = { "variable_and_peers_info": variable_and_peer_info, diff --git a/adalflow/adalflow/optim/trainer/adal.py b/adalflow/adalflow/optim/trainer/adal.py index 9e12da69e..0609ea963 100644 --- a/adalflow/adalflow/optim/trainer/adal.py +++ b/adalflow/adalflow/optim/trainer/adal.py @@ -9,7 +9,7 @@ if TYPE_CHECKING: from adalflow.core.model_client import ModelClient - from adalflow.core.generator import Generator, BackwardEngine + from adalflow.core.generator import Generator, BackwardEngine, BackwardPassSetup from adalflow.optim.parameter import Parameter from adalflow.core.component import Component @@ -187,6 +187,7 @@ def configure_backward_engine(self, *args, **kwargs): self.configure_backward_engine_helper( model_client=self.backward_engine_model_config["model_client"], model_kwargs=self.backward_engine_model_config["model_kwargs"], + backward_pass_setup=kwargs.get("backward_pass_setup", None), ) # def configure_backward_engine(self, *args, **kwargs): @@ -594,6 +595,7 @@ def configure_backward_engine_helper( model_client: "ModelClient", model_kwargs: Dict[str, Any], template: Optional[str] = None, + backward_pass_setup: Optional["BackwardPassSetup"] = None, ): r"""Configure a backward engine for all generators in the task for bootstrapping examples.""" from adalflow.core.generator import BackwardEngine @@ -603,6 +605,8 @@ def configure_backward_engine_helper( model_kwargs=model_kwargs, template=template, ) + if backward_pass_setup is not None: + self.backward_engine.update_default_backward_pass_setup(backward_pass_setup) # set all generator's backward engine diff --git a/adalflow/adalflow/optim/trainer/trainer.py b/adalflow/adalflow/optim/trainer/trainer.py index c3a351df3..98ed97eea 100644 --- a/adalflow/adalflow/optim/trainer/trainer.py +++ b/adalflow/adalflow/optim/trainer/trainer.py @@ -15,6 +15,8 @@ if TYPE_CHECKING: from adalflow.optim.parameter import Parameter + from adalflow.core.generator import BackwardPassSetup + from adalflow.optim.types import ( PromptData, TrainerResult, @@ -82,6 +84,7 @@ class Trainer(Component): optimization_order: Literal["sequential", "mix"] = ( "sequential" # zero-shot first, bootstrap second ) + sequential_order: List[str] = ["text", "demo"] max_steps: int optimizer: Optimizer = None ckpt_path: Optional[str] = None @@ -98,7 +101,7 @@ class Trainer(Component): max_error_samples: Optional[int] = 2 max_correct_samples: Optional[int] = 2 debug: bool = False - sequential_order: List[str] = ["text", "demo"] + random_seed: int = None def __init__( self, @@ -173,6 +176,9 @@ def __init__( ) self.sequential_order = sequential_order + def set_random_seed(self, seed: int): + self.random_seed = seed + # TODO: need to support checkpoint resume too! 
def diagnose(self, dataset: Any, split: str = "train"): """Run an evaluation on the trainset to track all error response, and its raw response using AdaplComponent's default configure_callbacks @@ -376,6 +382,7 @@ def fit( resume_from_ckpt: Optional[ str ] = None, # TODO: have a more comprehensive ckpt loading in the future + backward_pass_setup: Optional["BackwardPassSetup"] = None, ) -> Tuple[str, TrainerResult]: r""" train_loader: An iterable or collection of iterables specifying training samples. @@ -383,6 +390,7 @@ def fit( Returns: Tuple[str, TrainerResult]: Checkpoint file and the TrainerResult object """ + start_time = time.time() debug = debug or self.debug @@ -410,6 +418,7 @@ def fit( train_dataset, batch_size=batch_size, shuffle=True if not debug else False, + seed=self.random_seed, ) val_dataset = val_dataset or self.val_dataset test_dataset = test_dataset or self.test_dataset @@ -461,7 +470,9 @@ def fit( if len(self._get_trainable_text_params()) > 0: if self.adaltask.backward_engine is None: - self.adaltask.configure_backward_engine() + self.adaltask.configure_backward_engine( + backward_pass_setup=backward_pass_setup + ) else: print("No trainable text params to optimize") self.text_optimizers = [] @@ -592,23 +603,13 @@ def run_demo_optimizers(starting_step: int, trainer_results: TrainerResult): run_demo_optimizers(starting_step, trainer_results) starting_step += self.max_steps run_text_optimizers(starting_step, trainer_results) - # if len(self.text_optimizers) > 0: - # run_text_optimizers(starting_step, trainer_results) - - # if len(self.demo_optimizers) > 0: - # run_demo_optimizers(starting_step, trainer_results) - # self.adaltask.configure_teacher_generator() # attemp to use the newest teacher as - # self._fit_demos_random( - # train_loader, - # train_dataset, - # val_dataset, - # test_dataset, - # trainer_results=trainer_results, - # starting_step=starting_step, - # ) end_time = time.time() print(f"Training time: {end_time - start_time}s") + trainer_results.total_time = end_time - start_time + # write the results to the checkpoint file + save_json(trainer_results.to_dict(), self.ckpt_file) + print(f"ckpt_file: {self.ckpt_file}") return self.ckpt_file, trainer_results @@ -747,7 +748,7 @@ def _fit_demos_one_step_for_debug( self.prep_ckpt_file_path() debug_path = os.path.join(self.ckpt_path, "debug_demos") os.makedirs(debug_path, exist_ok=True) - print(f"save to {debug_path}") + print(f"_fit_demos_one_step_for_debug save to {debug_path}") self.adaltask.train() self.adaltask.trace() @@ -832,41 +833,11 @@ def _fit_demos_one_step_for_debug( self._demo_optimizers_add_scores( [sample.id for sample in batch], batch_per_item_scores, is_teacher=False ) - # for loss in losses_student: - # loss.backward() + # Check the eval result y_preds_outputs = [p.data for p in y_preds_student] eval_result = self.adaltask.evaluate_samples(batch, y_preds_outputs) print(f"Eval result: {eval_result.avg_score}") - # eval_score_per_item = eval_result.per_item_scores - - # bootstrap a batch - # batch_for_teacher = [] - # losses_teacher = [] - - # for i, (sample, item_score) in enumerate(zip(batch, eval_score_per_item)): - - # # use teacher - # if sample.id in pred_teacher: - # continue - # # if item_score < 0.5: - # pred_teacher.add(sample.id) - # batch_for_teacher.append(sample) - # # run teacher, use teachers's output instead of the initial output (bootstrap) - # if len(batch_for_teacher) > 0: - # print(f"Using teacher for {len(batch_for_teacher)} samples") - # self.adaltask.use_teacher() - # 
y_preds_teacher = self.adaltask.train_step( - # batch_for_teacher, batch_idx, self.num_workers - # ) - # losses_teacher: List[Parameter] = self.adaltask.loss_step( # noqa F841 - # batch_for_teacher, y_preds_teacher, batch_idx, self.num_workers - # ) - # self._demo_optimizers_add_scores( - # [sample.id for sample in batch_for_teacher], - # eval_score_per_item, - # is_teacher=True, - # ) # loss_students backward for loss in losses_student: @@ -1094,7 +1065,7 @@ def _fit_text_grad_demo_mix_constrained( if trainer_results is None else trainer_results ) - print(f"save to {self.ckpt_file}") + print(f"_fit_text_grad_demo_mix_constrained save to {self.ckpt_file}") if train_dataset is None: raise ValueError("train_dataset is required") @@ -1267,7 +1238,7 @@ def _fit_text_grad_demo_mix_random( if train_results is None else train_results ) - print(f"save to {self.ckpt_file}") + print(f"_fit_text_grad_demo_mix_random save to {self.ckpt_file}") if train_dataset is None: raise ValueError("train_dataset is required") @@ -1409,7 +1380,7 @@ def _fit_demos_random( if trainer_results is None else trainer_results ) - print(f"save to {self.ckpt_file}") + print(f"_fit_demos_random save to {self.ckpt_file}") print(f"Starting step: {starting_step}") self.adaltask.train() @@ -1602,7 +1573,7 @@ def _fit_text_grad_random( if trainer_results is None else trainer_results ) - print(f"save to {self.ckpt_file}") + print(f"_fit_text_grad_random save to {self.ckpt_file}") self.adaltask.train() # self.optimizer.zero_grad() @@ -1623,6 +1594,8 @@ def _fit_text_grad_random( self.adaltask.train() # this will turn everything to train mode # self.train() try: + # print(f"Batch: {batch}") + # continue y_preds = self.adaltask.train_step(batch, steps, self.num_workers) except Exception as e: print(f"Error in train step: {e}") @@ -1659,6 +1632,8 @@ def _fit_text_grad_random( if val_score > last_val_score: print(f"Optimizer step: {val_score} > {last_val_score}") + # track the effectiveness + self._track_effectiveness("valset", True) # self.optimizer.step() self._step_text_optimizers() self._add_history_text_optimizers(val_score) # track top performor @@ -1680,6 +1655,7 @@ def _fit_text_grad_random( print(f"Optimizer revert: {val_score} <= {last_val_score}") self._revert_text_optimizers() + self._track_effectiveness("valset", False) # save the score, no change self._add_one_step_in_trainer_results( trainer_results, @@ -2139,13 +2115,14 @@ def _fit_text_grad_constraint( from adalflow.optim.parameter import OutputParameter logger.info("Fitting using Textual Gradient Descent with constraints") + printc("Fitting using Textual Gradient Descent with constraints") trainer_results = ( self._pre_fit(val_dataset, test_dataset) if trainer_results is None else trainer_results ) - print(f"save to {self.ckpt_file}") + print(f"_fit_text_grad_constraint save to {self.ckpt_file}") self.adaltask.train() self._zero_grad_text_optimizers() @@ -2155,6 +2132,7 @@ def _fit_text_grad_constraint( all_samples, all_losses = [], [] all_y_preds: List[OutputParameter] = [] for epoch in tqdm(range(num_epochs), desc="Epoch"): + print(f"Epoch: {epoch}") for steps, batch in enumerate((pbar := tqdm(train_loader, position=0))): total_steps += 1 if total_steps > self.max_steps + starting_step: @@ -2163,6 +2141,8 @@ def _fit_text_grad_constraint( self._zero_grad_text_optimizers() pbar.set_description(f"Training Step: {total_steps}") self.adaltask.train() # this will turn everything to train mode + # print(f"Batch: {batch}") + # continue y_preds = 
self.adaltask.train_step(batch, steps, self.num_workers) losses = self.adaltask.loss_step( batch, y_preds, steps, self.num_workers diff --git a/adalflow/adalflow/optim/types.py b/adalflow/adalflow/optim/types.py index 6c4bb92f1..83517a5c4 100644 --- a/adalflow/adalflow/optim/types.py +++ b/adalflow/adalflow/optim/types.py @@ -158,3 +158,6 @@ class TrainerResult(DataClass): trainer_state: Dict[str, Any] = field( default=None, metadata={"desc": "Save the most detailed state of the trainer"} ) + total_time: float = field( + default=0.0, metadata={"desc": "Total time taken for training"} + ) diff --git a/adalflow/adalflow/utils/data.py b/adalflow/adalflow/utils/data.py index 682453b1d..374c47b44 100644 --- a/adalflow/adalflow/utils/data.py +++ b/adalflow/adalflow/utils/data.py @@ -74,10 +74,13 @@ class DataLoader: The biggest difference is not to handle tensors, but to handle any type of data.""" - def __init__(self, dataset, batch_size: int = 4, shuffle: bool = True): + def __init__( + self, dataset, batch_size: int = 4, shuffle: bool = True, seed: int = 42 + ): self.dataset = dataset self.batch_size = batch_size self.shuffle = shuffle + self.seed = seed self.indices = np.arange(len(dataset)) # if self.shuffle: @@ -91,6 +94,8 @@ def set_max_steps(self, max_steps: int): def __iter__(self): if self.shuffle: + if self.seed is not None: + np.random.seed(self.seed) # Use the provided seed np.random.shuffle(self.indices) self.current_index = 0 return self @@ -104,6 +109,8 @@ def __next__(self) -> Union[np.ndarray, Tuple]: if self.current_index >= len(self.dataset): if self.shuffle: + if self.seed is not None: + np.random.seed(self.seed) # Use the same seed for reshuffle np.random.shuffle(self.indices) # Reshuffle for the new epoch self.current_index = 0 if self.step_index < self.max_steps: diff --git a/use_cases/question_answering/bbh/object_count/task.py b/use_cases/question_answering/bbh/object_count/task.py index 2f17930dd..8be247509 100644 --- a/use_cases/question_answering/bbh/object_count/task.py +++ b/use_cases/question_answering/bbh/object_count/task.py @@ -40,7 +40,7 @@ def __init__(self, model_client: adal.ModelClient, model_kwargs: Dict): few_shot_demos = adal.Parameter( data=None, role_desc="To provide few shot demos to the language model", - requires_opt=True, + requires_opt=False, param_type=ParameterType.DEMOS, ) diff --git a/use_cases/question_answering/bbh/object_count/train_new.py b/use_cases/question_answering/bbh/object_count/train_new.py index 467ab7c66..b639beb3d 100644 --- a/use_cases/question_answering/bbh/object_count/train_new.py +++ b/use_cases/question_answering/bbh/object_count/train_new.py @@ -96,6 +96,9 @@ def train_diagnose_teacher( # You will answer a reasoning question. Think step by step and double-check each calculation you make. Pay close attention to any numerical quantities in the text, converting written numbers into their numerical equivalents. Additionally, re-verify your final answer before concluding. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. 
# 0.98 val, 0.91 test +from adalflow.core.generator import BackwardPassSetup + + def train( train_batch_size=4, # larger batch size is not that effective, probably because of llm's lost in the middle raw_shots: int = 0, @@ -107,6 +110,8 @@ def train( debug=False, resume_from_ckpt=None, exclude_input_fields_from_bootstrap_demos=False, + seed=None, + tg: bool = False, ): adal_component = ObjectCountAdalComponent( **gpt_3_model, @@ -115,6 +120,13 @@ def train( backward_engine_model_config=gpt_4o_model, ) print(adal_component) + backward_pass_setup = None + if tg: + backward_pass_setup = BackwardPassSetup( + all_pred_at_once=False, + compute_grad_for_errors_only=False, + ) + trainer = adal.Trainer( train_batch_size=train_batch_size, adaltask=adal_component, @@ -124,21 +136,24 @@ def train( raw_shots=raw_shots, bootstrap_shots=bootstrap_shots, debug=debug, - weighted_sampling=True, + weighted_sampling=False, optimization_order=optimization_order, exclude_input_fields_from_bootstrap_demos=exclude_input_fields_from_bootstrap_demos, ) + trainer.set_random_seed(seed) print(trainer) train_dataset, val_dataset, test_dataset = load_datasets() # train_dataset = train_dataset[:4] # val_dataset = val_dataset[:4] # test_dataset = test_dataset[:4] + ckpt, _ = trainer.fit( train_dataset=train_dataset, val_dataset=val_dataset, test_dataset=test_dataset, resume_from_ckpt=resume_from_ckpt, + backward_pass_setup=backward_pass_setup, ) return ckpt @@ -146,12 +161,18 @@ def train( if __name__ == "__main__": import json + import random + + random.seed(2025) + # np.random.seed(2025) # Set NumPy random seed + # make the strategy configurable in the script import argparse parser = argparse.ArgumentParser() - parser.add_argument("--strategy", type=str, default="random") + parser.add_argument("--strategy", type=str, default="constrained") + parser.add_argument("--use_tg", action="store_true") parser.add_argument( "output_path", nargs="?", help="File path to save the checkpoint" ) @@ -160,12 +181,16 @@ def train( set_strategy = args.strategy set_output_path = args.output_path + use_tg = args.use_tg ckpt = train( debug=False, max_steps=12, strategy=set_strategy, exclude_input_fields_from_bootstrap_demos=True, + seed=2025, # pass the numpy seed + tg=use_tg, + # resume_from_ckpt="/Users/liyin/.adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_18e8d_run_1.json", ) print(f"ckpt: {ckpt}") if set_output_path: @@ -188,3 +213,6 @@ def train( # /Users/liyin/.adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_1f358_run_1.json 1 val 0.96 val 955s # 0.94 val, 0.89 test, /Users/liyin/.adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_e1bb5_run_1.json 907s, with both positive and negatives # 92, 91 test /Users/liyin/.adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_18e8d_run_1.json 747s + # 96% /Users/liyin/.adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_18e8d_run_1.json + # (90%, 94%, 92%, 94%) 92.5 + 1.5 + # (96%, 100%, 96%, 96% ) 97+ 1.73 diff --git a/use_cases/text_grad_2.0_train.py b/use_cases/text_grad_2.0_train.py index 6071e36ce..e84b15fba 100644 --- a/use_cases/text_grad_2.0_train.py +++ b/use_cases/text_grad_2.0_train.py @@ -1,6 +1,8 @@ import subprocess import tempfile import json +import numpy as np +import argparse num_runs = 4 # List of experiments to run @@ -13,9 +15,25 @@ # hotpot_qa_multi_hop_rag, ] +# set up the strategy for each experiment + +argparser = argparse.ArgumentParser() +argparser.add_argument("--strategy", type=str, 
default="constrained") +argparser.add_argument("--use_tg", action="store_true") +args = argparser.parse_args() + +strategy = args.strategy +use_tg = args.use_tg + # Optional: Arguments for each experiment (if needed) + +setup_str = f"--strategy {strategy}" + +if use_tg: + setup_str += " --use_tg" + experiment_args = { - object_count: "--strategy constrained", + object_count: setup_str, # hotpot_qa_multi_hop_rag: "", } ckpt_values = {} @@ -64,65 +82,72 @@ def run_experiment(script, args): if ckpt: ckpt_values[ckpt_index] = ckpt # load all json files using the ckpt paths - highest_test_score, mean_test_score, standard_deviation = 0, 0, 0 - past_highest_scores = [] - # average pass rate, average pass prompts - average_pass_rate_list = [] - average_pass_prompts_list = [] - average_total_prompts = [] - total_prompts = 0 - highest_test_score_json_file = None + highest_test_score, last_test_score, mean_test_score, standard_deviation = ( + 0, + 0, + 0, + 0, + ) + last_test_scores = [] + highest_val_scores = [] + total_passes = ( + [] + ) # each is the number of unique val scores in the highest val scores + total_prompts = [] # how many prompts tried in total + + past_highest_val_scores = [] + # # average pass rate, average pass prompts + # average_pass_rate_list = [] + # average_pass_prompts_list = [] + # average_total_prompts = [] + # highest_test_score_json_file = None + total_steps = [] + training_times = [] for experiment_index, ckpt in ckpt_values.items(): with open(ckpt, "r") as f: data = json.load(f) print(f"Experiment: {experiment_index}") print(f"Data: {data}") - _high_test_score = max(data["val_scores"]) - print(f" val score: {data["val_scores"]}") - past_highest_scores.append(_high_test_score) - if _high_test_score > highest_test_score: - highest_test_score = _high_test_score - highest_test_score_json_file = ckpt + _high_val_score = max(data["val_scores"]) + _unique_val_scores = len(set(data["val_scores"])) - 1 + _last_test_score = data["test_scores"][-1] # read the effective measures effective_measures = data.get("effective_measure", {}) - if not effective_measures: - total_prompts = len(data["val_scores"]) - 1 - # count the total number of different test scores - pass_num = len(set(data["val_scores"])) - 1 - average_pass_rate = pass_num / total_prompts - average_pass_rate_list.append(average_pass_rate) - average_pass_prompts_list.append(pass_num) - average_total_prompts.append(total_prompts) - else: - total_prompts = ( - effective_measures["subset"]["pass"] - + effective_measures["subset"]["fail"] - ) - - pass_num = effective_measures["valset"]["pass"] - total_val_prompts = ( - effective_measures["valset"]["pass"] - + effective_measures["valset"]["fail"] - ) - average_pass_rate = pass_num / total_val_prompts - average_pass_rate_list.append(average_pass_rate) - average_pass_prompts_list.append(pass_num) - average_total_prompts.append(total_prompts) - # calculate the mean test score - mean_test_score = sum(past_highest_scores) / len(past_highest_scores) - # calculate the standard deviation - standard_deviation = sum( - [(x - mean_test_score) ** 2 for x in past_highest_scores] - ) / len(past_highest_scores) - standard_deviation = standard_deviation**0.5 - # calculate the average pass rate - average_pass_rate = sum(average_pass_rate_list) / len(average_pass_rate_list) - # calculate the average pass prompts - average_pass_prompts = sum(average_pass_prompts_list) / len( - average_pass_prompts_list - ) - # calculate the average total prompts - average_total_prompts = 
sum(average_total_prompts) / num_runs + + _total_prompts = effective_measures.get("subset", {}).get( + "pass", 0 + ) + effective_measures.get("subset", {}).get("fail", 0) + _total_steps = len(data["steps"]) - 1 + _training_time = data.get("total_time", 0) + # save the results in the lists + past_highest_val_scores.append(_high_val_score) + total_passes.append(_unique_val_scores) + total_prompts.append(_total_prompts) + last_test_scores.append(_last_test_score) + total_steps.append(_total_steps) + training_times.append(_training_time) + + # ensure all steps are the same + assert all( + [step == total_steps[0] for step in total_steps] + ), "All steps should be the same" + + # compute the metrics + mean_test_score = np.mean(last_test_scores) + std_test_score = np.std(last_test_scores) + + # val scores + mean_val_score = np.mean(past_highest_val_scores) + std_val_score = np.std(past_highest_val_scores) + + # pass rate total_passes / steps + average_pass_rate = np.mean(total_passes) / total_steps[0] + + # average total prompts + average_total_prompts = np.mean(total_prompts) + + # average training time + average_training_time = np.mean(training_times) # add these numbers in the ckpt_values index = f"{experiment}_summary" @@ -131,14 +156,15 @@ def run_experiment(script, args): "num_runs": num_runs, "args": args, }, - "highest_test_score": highest_test_score, - "mean_test_score": mean_test_score, - "standard_deviation": standard_deviation, - "highest_test_score_json_file": highest_test_score_json_file, - "average_pass_rate": average_pass_rate, - "average_pass_prompts": average_pass_prompts, - "average_total_prompts": average_total_prompts, - "past_highest_scores": past_highest_scores, + "metrics": { + "mean_test_score": mean_test_score, + "std_test_score": std_test_score, + "mean_val_score": mean_val_score, + "std_val_score": std_val_score, + "average_pass_rate": average_pass_rate, + "average_total_prompts": average_total_prompts, + "average_training_time": average_training_time, + }, } print("\nAll Checkpoints:") @@ -147,6 +173,6 @@ def run_experiment(script, args): # Save the results to a file with open(result_file, "w") as f: - json.dump(ckpt_values, f) + json.dump(ckpt_values, f, indent=4) print(f"\nResults saved to {result_file}")
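
Usage sketch (illustrative only; `MyAdalComponent`, the model configs, and the datasets below are placeholders, while the Trainer/Generator calls mirror the APIs added in this patch):

    import adalflow as adal
    from adalflow.core.generator import BackwardPassSetup

    # Any AdalComponent subclass that provides backward_engine_model_config works here.
    adal_component = MyAdalComponent(...)  # placeholder, not part of this patch

    # Optional: mirror the TextGrad-style backward pass (one predecessor at a time,
    # gradients computed for every sample rather than only for low-scoring ones).
    backward_pass_setup = BackwardPassSetup(
        all_pred_at_once=False,
        compute_grad_for_errors_only=False,
    )

    trainer = adal.Trainer(
        adaltask=adal_component,
        train_batch_size=4,
        max_steps=12,
        strategy="constrained",
    )
    trainer.set_random_seed(2025)  # fixes the DataLoader shuffle order across runs

    ckpt, results = trainer.fit(
        train_dataset=train_dataset,    # placeholder datasets
        val_dataset=val_dataset,
        test_dataset=test_dataset,
        backward_pass_setup=backward_pass_setup,  # forwarded to configure_backward_engine
    )
    print(results.total_time)  # new TrainerResult field: wall-clock training time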