From e5bb519f84088d170dfdeae2aca6b57b5faa89cb Mon Sep 17 00:00:00 2001
From: "Joel Z. Leibo" <jzl@google.com>
Date: Thu, 12 Sep 2024 02:20:29 -0700
Subject: [PATCH] Make evaluation script write per-scenario intermediate result
 files that remain usable if the full script crashes. This makes it possible
 to re-run only the scenarios that crashed.

PiperOrigin-RevId: 673749453
Change-Id: I52aa1bd816b18f25bcf6a8a3485777c23fb48b13
---
 .../launch_concordia_challenge_evaluation.py        | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/examples/modular/launch_concordia_challenge_evaluation.py b/examples/modular/launch_concordia_challenge_evaluation.py
index a73af06d..96250d26 100644
--- a/examples/modular/launch_concordia_challenge_evaluation.py
+++ b/examples/modular/launch_concordia_challenge_evaluation.py
@@ -107,7 +107,7 @@
     '--num_repetitions_per_scenario',
     action='store',
     type=int,
-    default=2,
+    default=1,
     dest='num_repetitions_per_scenario',
 )
 parser.add_argument('--api_key',
@@ -224,9 +224,10 @@ def _evaluate_all_repetitions_on_one_scenario(
     )
     with open(html_filename, 'a', encoding='utf-8') as f:
       f.write(text_results_log)
+
   # Average scores over repetitions and save results for all repetitions in a
   # json-serializable format.
-  return logging_lib.ScenarioResult(
+  scenario_result_ = logging_lib.ScenarioResult(
       scenario=scenario_name,
       focal_agent=args.agent_name,
       background_agent=scenario_config.background_agent_module,
@@ -245,6 +246,14 @@ def _evaluate_all_repetitions_on_one_scenario(
       disable_language_model=args.disable_language_model,
       exclude_from_elo_calculation=args.exclude_from_elo_calculation,
   )
+  scenario_json_filename = (
+      f'{args.agent_name}__{args.model_name}__'
+      f'{args.embedder_name}__only_{scenario_name}.json'
+  ).replace('/', '_')
+  json_str_ = scenario_result_.to_json()
+  with open(scenario_json_filename, 'a', encoding='utf-8') as f:
+    f.write(json_str_)
+  return scenario_result_
 
 tasks = {
     name: functools.partial(