From e5bb519f84088d170dfdeae2aca6b57b5faa89cb Mon Sep 17 00:00:00 2001 From: "Joel Z. Leibo" Date: Thu, 12 Sep 2024 02:20:29 -0700 Subject: [PATCH] Make the evaluation script write a usable per-scenario JSON results file as soon as each scenario has been evaluated, so that partial results survive if the full script crashes. This makes it possible to re-run only the scenarios that crashed. Also lower the default --num_repetitions_per_scenario from 2 to 1. PiperOrigin-RevId: 673749453 Change-Id: I52aa1bd816b18f25bcf6a8a3485777c23fb48b13 --- .../launch_concordia_challenge_evaluation.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/examples/modular/launch_concordia_challenge_evaluation.py b/examples/modular/launch_concordia_challenge_evaluation.py index a73af06d..96250d26 100644 --- a/examples/modular/launch_concordia_challenge_evaluation.py +++ b/examples/modular/launch_concordia_challenge_evaluation.py @@ -107,7 +107,7 @@ '--num_repetitions_per_scenario', action='store', type=int, - default=2, + default=1, dest='num_repetitions_per_scenario', ) parser.add_argument('--api_key', @@ -224,9 +224,10 @@ def _evaluate_all_repetitions_on_one_scenario( ) with open(html_filename, 'a', encoding='utf-8') as f: f.write(text_results_log) + # Average scores over repetitions and save results for all repetitions in a # json-serializable format. - return logging_lib.ScenarioResult( + scenario_result_ = logging_lib.ScenarioResult( scenario=scenario_name, focal_agent=args.agent_name, background_agent=scenario_config.background_agent_module, @@ -245,6 +246,14 @@ def _evaluate_all_repetitions_on_one_scenario( disable_language_model=args.disable_language_model, exclude_from_elo_calculation=args.exclude_from_elo_calculation, ) + scenario_json_filename = ( + f'{args.agent_name}__{args.model_name}__' + f'{args.embedder_name}__only_{scenario_name}.json' + ).replace('/', '_') + json_str_ = scenario_result_.to_json() + with open(scenario_json_filename, 'a', encoding='utf-8') as f: + f.write(json_str_) + return scenario_result_ tasks = { name: functools.partial(