
Commit

Make evaluation script write usable intermediate files which can be used in the event that the full script crashes. This makes it possible to re-run only the scenarios that crashed.

PiperOrigin-RevId: 673749453
Change-Id: I52aa1bd816b18f25bcf6a8a3485777c23fb48b13
jzleibo authored and copybara-github committed Sep 12, 2024
1 parent d4e32c0 commit e5bb519
Showing 1 changed file with 11 additions and 2 deletions.
13 changes: 11 additions & 2 deletions examples/modular/launch_concordia_challenge_evaluation.py
@@ -107,7 +107,7 @@
     '--num_repetitions_per_scenario',
     action='store',
     type=int,
-    default=2,
+    default=1,
     dest='num_repetitions_per_scenario',
 )
 parser.add_argument('--api_key',
@@ -224,9 +224,10 @@ def _evaluate_all_repetitions_on_one_scenario(
   )
   with open(html_filename, 'a', encoding='utf-8') as f:
     f.write(text_results_log)
+
   # Average scores over repetitions and save results for all repetitions in a
   # json-serializable format.
-  return logging_lib.ScenarioResult(
+  scenario_result_ = logging_lib.ScenarioResult(
       scenario=scenario_name,
       focal_agent=args.agent_name,
       background_agent=scenario_config.background_agent_module,
@@ -245,6 +246,14 @@ def _evaluate_all_repetitions_on_one_scenario(
       disable_language_model=args.disable_language_model,
       exclude_from_elo_calculation=args.exclude_from_elo_calculation,
   )
+  scenario_json_filename = (
+      f'{args.agent_name}__{args.model_name}__'
+      f'{args.embedder_name}__only_{scenario_name}.json'
+  ).replace('/', '_')
+  json_str_ = scenario_result_.to_json()
+  with open(scenario_json_filename, 'a', encoding='utf-8') as f:
+    f.write(json_str_)
+  return scenario_result_
 
 tasks = {
     name: functools.partial(
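For context, a minimal sketch (not part of this commit) of how the per-scenario JSON files written above could be used after a crash: scenarios whose intermediate file already exists on disk are skipped, and only the remaining ones are re-run. The filename pattern is copied from the diff; the helper name and its arguments are hypothetical.

import os
from typing import Sequence


def find_remaining_scenarios(
    agent_name: str,
    model_name: str,
    embedder_name: str,
    scenario_names: Sequence[str],
) -> list[str]:
  """Returns scenarios that do not yet have a per-scenario JSON file on disk.

  Hypothetical helper; only the filename pattern comes from the commit above.
  """
  remaining = []
  for scenario_name in scenario_names:
    # Same naming scheme as in launch_concordia_challenge_evaluation.py.
    scenario_json_filename = (
        f'{agent_name}__{model_name}__'
        f'{embedder_name}__only_{scenario_name}.json'
    ).replace('/', '_')
    if not os.path.exists(scenario_json_filename):
      remaining.append(scenario_name)
  return remaining

A re-run could then pass only the returned scenario names back into the evaluation loop instead of repeating every scenario.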
