From 7cbf802eda20665b1275943c69df14d6c6ba3f14 Mon Sep 17 00:00:00 2001
From: "Joel Z. Leibo"
Date: Mon, 25 Nov 2024 16:01:18 -0800
Subject: [PATCH] Update launch and eval scripts for the eval phase of the
 contest

PiperOrigin-RevId: 700127292
Change-Id: I0eadabe74fbfed9f8a74a98703ed05beee9dd5e4
---
 .../language_model/call_limit_wrapper.py      |  2 +-
 examples/modular/calculate_ratings.py         | 31 +++++--
 .../launch_concordia_challenge_evaluation.py  | 80 +++++++++----------
 examples/modular/launch_one_scenario.py       | 58 ++++++--------
 examples/modular/utils/logging_types.py       | 24 +++---
 5 files changed, 94 insertions(+), 101 deletions(-)

diff --git a/concordia/language_model/call_limit_wrapper.py b/concordia/language_model/call_limit_wrapper.py
index 68f86d2e..90e7c048 100644
--- a/concordia/language_model/call_limit_wrapper.py
+++ b/concordia/language_model/call_limit_wrapper.py
@@ -33,7 +33,7 @@ class CallLimitLanguageModel(language_model.LanguageModel):
   def __init__(
       self,
       model: language_model.LanguageModel,
-      max_calls: int = 1000,
+      max_calls: int = 1200,
   ) -> None:
     """Wrap the underlying language model with a call limit.
 
diff --git a/examples/modular/calculate_ratings.py b/examples/modular/calculate_ratings.py
index ea6dbf3e..c158e485 100644
--- a/examples/modular/calculate_ratings.py
+++ b/examples/modular/calculate_ratings.py
@@ -59,13 +59,17 @@
 # Parse command line arguments
 args = parser.parse_args()
 
+sanitized_model_name = args.model_name.replace('/', '_')
+
 # Load data
 included = {}
 included_agent_idx = 0
 sorted_agent_names = sorted(args.agents)
+max_repetition_idx = -1
 for agent_name in sorted_agent_names:
   print(f'loading data from: {agent_name}')
-  json_filename = f'{agent_name}__{args.model_name}__{args.embedder_name}.json'
+  json_filename = (
+      f'{agent_name}__{sanitized_model_name}__{args.embedder_name}.json')
   loaded = file_utils.load_from_json_file(json_filename)
 
   scenario_results_to_include = {}
@@ -93,14 +97,23 @@
           f' {expected_background_agent}'
       )
 
-    if result.scenario in scenario_results_to_include:
-      raise RuntimeError(f'Duplicate scenario: {result.scenario}')
+    repetition_idx = int(result.repetition_idx)
+    max_repetition_idx = max(max_repetition_idx, repetition_idx)
+    scenario_with_repetition = f'{result.scenario}_{repetition_idx}'
+
+    if scenario_with_repetition in scenario_results_to_include:
+      raise RuntimeError(f'Duplicate scenario: {scenario_with_repetition}')
 
-    scenario_results_to_include[result.scenario] = result
+    scenario_results_to_include[scenario_with_repetition] = result
 
   # Check there are results for all scenarios.
+  expected_scenarios = []
+  for expected_scenario in set(scenarios_lib.SCENARIO_CONFIGS.keys()):
+    for repetition_idx in range(max_repetition_idx + 1):
+      expected_scenarios.append(f'{expected_scenario}_{repetition_idx}')
+  expected_scenarios = set(expected_scenarios)
   scenarios_found = set(scenario_results_to_include.keys())
-  if scenarios_found == set(scenarios_lib.SCENARIO_CONFIGS.keys()):
+  if scenarios_found == expected_scenarios:
     included[agent_name] = dict(
         agent_idx=included_agent_idx, results=scenario_results_to_include
     )
@@ -112,16 +125,18 @@
 # the data from the previous runs with other agent submissions.
 # We need to form a score matrix with shape [num_scenarios X num_agents]
 num_scenarios = len(scenarios_lib.SCENARIO_CONFIGS)
+num_scenarios_and_repetitions = num_scenarios * (max_repetition_idx + 1)
 agents_to_evaluate = list(included.keys())
 num_agents_to_evaluate = len(agents_to_evaluate)
-score_matrix = np.zeros((num_scenarios, num_agents_to_evaluate))
+score_matrix = np.zeros((num_scenarios_and_repetitions, num_agents_to_evaluate))
 for agent_name in agents_to_evaluate:
   results_per_scenario = included[agent_name]['results']
   num_scenarios_found = len(results_per_scenario)
   assert (
-      num_scenarios_found == num_scenarios
-  ), f'Wrong number of scenarios: {num_scenarios_found} != {num_scenarios}'
+      num_scenarios_found == num_scenarios_and_repetitions
+  ), ('Wrong number of scenarios: '
+      f'{num_scenarios_found} != {num_scenarios_and_repetitions}')
 
   names_by_scenario_vector = np.array(
       [result.scenario for result in results_per_scenario.values()]
diff --git a/examples/modular/launch_concordia_challenge_evaluation.py b/examples/modular/launch_concordia_challenge_evaluation.py
index 407f1893..419a1142 100644
--- a/examples/modular/launch_concordia_challenge_evaluation.py
+++ b/examples/modular/launch_concordia_challenge_evaluation.py
@@ -243,11 +243,6 @@ def _evaluate_all_repetitions_on_one_scenario(
   """
   print(f'Running scenario: {scenario_name}')
   # Run several simulations per scenario
-  simulation_outcomes = []
-  focal_per_capita_scores_to_average = []
-  background_per_capita_scores_to_average = []
-  ungrouped_per_capita_scores_to_average = []
-
   tasks_this_scenario = {
       str(i): functools.partial(
           _evaluate_one_repetition,
@@ -267,6 +262,7 @@ def _evaluate_all_repetitions_on_one_scenario(
         'Raised errors', list(exceptions_per_repetition.values())
     )
 
+  scenario_results = []
   for repetition_idx, outcome in outputs_per_repetition.items():
     if scenario_config.focal_is_resident:
       focal_scores = list(outcome.resident_scores.values())
@@ -279,45 +275,38 @@ def _evaluate_all_repetitions_on_one_scenario(
 
     # Calculate per capita scores.
     print(f'\nScores for repetition {repetition_idx}:')
     focal_per_capita_score = np.mean(focal_scores)
-    focal_per_capita_scores_to_average.append(focal_per_capita_score)
     print(f'  Focal per capita score: {focal_per_capita_score}')
     background_per_capita_score = np.mean(background_scores)
-    background_per_capita_scores_to_average.append(background_per_capita_score)
     print(f'  Background per capita score: {background_per_capita_score}')
     ungrouped_per_capita_score = np.mean(ungrouped_scores)
-    ungrouped_per_capita_scores_to_average.append(ungrouped_per_capita_score)
     print(f'  Ungrouped per capita score: {ungrouped_per_capita_score}')
 
-  # Average scores over repetitions and save results for all repetitions in a
-  # json-serializable format.
-  scenario_result_ = logging_lib.ScenarioResult(
-      scenario=scenario_name,
-      focal_agent=args.agent_name,
-      background_agent=scenario_config.background_agent_module,
-      focal_per_capita_score=np.mean(focal_per_capita_scores_to_average),
-      background_per_capita_score=np.mean(
-          background_per_capita_scores_to_average
-      ),
-      ungrouped_per_capita_score=np.mean(
-          ungrouped_per_capita_scores_to_average
-      ),
-      simulation_outcomes=tuple(simulation_outcomes),
-      focal_is_resident=scenario_config.focal_is_resident,
-      api_type=args.api_type,
-      model=args.model_name,
-      embedder=args.embedder_name,
-      disable_language_model=args.disable_language_model,
-      exclude_from_elo_calculation=args.exclude_from_elo_calculation,
-  )
-  scenario_json_filename = (
-      f'{args.agent_name}__{args.model_name}__'
-      f'{args.embedder_name}__only_{scenario_name}.json'
-  ).replace('/', '_')
-  scenario_json_filename = os.path.join(results_dir, scenario_json_filename)
-  json_str_ = scenario_result_.to_json()
-  with open(scenario_json_filename, 'a', encoding='utf-8') as f:
-    f.write(json_str_)
-  return scenario_result_
+    scenario_result_ = logging_lib.ScenarioResult(
+        scenario=scenario_name,
+        repetition_idx=repetition_idx,
+        focal_agent=args.agent_name,
+        background_agent=scenario_config.background_agent_module,
+        focal_per_capita_score=focal_per_capita_score,
+        background_per_capita_score=background_per_capita_score,
+        ungrouped_per_capita_score=ungrouped_per_capita_score,
+        simulation_outcome=outcome,
+        focal_is_resident=scenario_config.focal_is_resident,
+        api_type=args.api_type,
+        model=args.model_name,
+        embedder=args.embedder_name,
+        disable_language_model=args.disable_language_model,
+        exclude_from_elo_calculation=args.exclude_from_elo_calculation,
+    )
+    scenario_json_filename = (
+        f'{args.agent_name}__{args.model_name}__'
+        f'{args.embedder_name}__only__{scenario_name}__{repetition_idx}.json'
+    ).replace('/', '_')
+    scenario_json_filename = os.path.join(results_dir, scenario_json_filename)
+    json_str_ = scenario_result_.to_json()
+    with open(scenario_json_filename, 'a', encoding='utf-8') as f:
+      f.write(json_str_)
+    scenario_results.append(scenario_result_)
+  return scenario_results
 
 tasks = {
     name: functools.partial(
@@ -330,16 +319,19 @@ def _evaluate_all_repetitions_on_one_scenario(
 evaluation_results = concurrency.run_tasks(tasks)
 
 # Save evaluation results for all scenarios with this agent to one json file.
+num_expected_results = (len(scenarios_lib.SCENARIO_CONFIGS) *
+                        args.num_repetitions_per_scenario)
 json_filename = (
     f'{args.agent_name}__{args.model_name}__{args.embedder_name}.json'
 ).replace('/', '_')
 idx = 0
 with open(json_filename, 'a', encoding='utf-8') as file_handle:
   file_handle.write('[\n')
-  for scenario_name_, scenario_result in evaluation_results.items():
-    json_str = evaluation_results[scenario_name_].to_json()
-    if idx < len(scenarios_lib.SCENARIO_CONFIGS) - 1:
-      json_str += ',\n'
-    file_handle.write(json_str)
-    idx += 1
+  for scenario_name_, _ in evaluation_results.items():
+    for scenario_result in evaluation_results[scenario_name_]:
+      json_str = scenario_result.to_json()
+      if idx < num_expected_results - 1:
+        json_str += ',\n'
+      file_handle.write(json_str)
+      idx += 1
   file_handle.write('\n]')
diff --git a/examples/modular/launch_one_scenario.py b/examples/modular/launch_one_scenario.py
index 2bd813af..bafca56b 100644
--- a/examples/modular/launch_one_scenario.py
+++ b/examples/modular/launch_one_scenario.py
@@ -188,10 +188,6 @@
 print(f'Running scenario: {args.scenario_name}')
 scenario_config = scenarios_lib.SCENARIO_CONFIGS[args.scenario_name]
 # Run several simulations per scenario
-simulation_outcomes = []
-focal_per_capita_scores_to_average = []
-background_per_capita_scores_to_average = []
-ungrouped_per_capita_scores_to_average = []
 for repetition_idx in range(args.num_repetitions_per_scenario):
   measurements = measurements_lib.Measurements()
   runnable_simulation = scenarios_lib.build_simulation(
@@ -205,7 +201,6 @@
   )
   # Run the simulation
   outcome, text_results_log = runnable_simulation()
-  simulation_outcomes.append(outcome)
   if scenario_config.focal_is_resident:
     focal_scores = list(outcome.resident_scores.values())
     background_scores = list(outcome.visitor_scores.values())
@@ -217,13 +212,10 @@
   # Calculate per capita scores.
   print('\nScores:')
   focal_per_capita_score = np.mean(focal_scores)
-  focal_per_capita_scores_to_average.append(focal_per_capita_score)
   print(f'  Focal per capita score: {focal_per_capita_score}')
   background_per_capita_score = np.mean(background_scores)
-  background_per_capita_scores_to_average.append(background_per_capita_score)
   print(f'  Background per capita score: {background_per_capita_score}')
   ungrouped_per_capita_score = np.mean(ungrouped_scores)
-  ungrouped_per_capita_scores_to_average.append(ungrouped_per_capita_score)
   print(f'  Ungrouped per capita score: {ungrouped_per_capita_score}')
   # Write the full text log as an HTML file in the current working directory.
   html_filename = (
@@ -234,29 +226,27 @@
   with open(html_filename, 'a', encoding='utf-8') as f:
     f.write(text_results_log)
 
-# Average scores over repetitions and save results for all repetitions in a
-# json-serializable format.
-scenario_result = logging_lib.ScenarioResult(
-    scenario=args.scenario_name,
-    focal_agent=args.agent_name,
-    background_agent=scenario_config.background_agent_module,
-    focal_per_capita_score=np.mean(focal_per_capita_scores_to_average),
-    background_per_capita_score=np.mean(
-        background_per_capita_scores_to_average
-    ),
-    ungrouped_per_capita_score=np.mean(ungrouped_per_capita_scores_to_average),
-    simulation_outcomes=tuple(simulation_outcomes),
-    focal_is_resident=scenario_config.focal_is_resident,
-    api_type=args.api_type,
-    model=args.model_name,
-    embedder=args.embedder_name,
-    disable_language_model=args.disable_language_model,
-    exclude_from_elo_calculation=args.exclude_from_elo_calculation,
-)
-scenario_json_filename = (
-    f'{args.agent_name}__{args.model_name}__'
-    f'{args.embedder_name}__only_{args.scenario_name}.json'
-).replace('/', '_')
-json_str_ = scenario_result.to_json()
-with open(scenario_json_filename, 'a', encoding='utf-8') as f:
-  f.write(json_str_)
+  scenario_result = logging_lib.ScenarioResult(
+      scenario=args.scenario_name,
+      repetition_idx=repetition_idx,
+      focal_agent=args.agent_name,
+      background_agent=scenario_config.background_agent_module,
+      focal_per_capita_score=focal_per_capita_score,
+      background_per_capita_score=background_per_capita_score,
+      ungrouped_per_capita_score=ungrouped_per_capita_score,
+      simulation_outcome=outcome,
+      focal_is_resident=scenario_config.focal_is_resident,
+      api_type=args.api_type,
+      model=args.model_name,
+      embedder=args.embedder_name,
+      disable_language_model=args.disable_language_model,
+      exclude_from_elo_calculation=args.exclude_from_elo_calculation,
+  )
+  scenario_json_filename = (
+      f'{args.agent_name}__{args.model_name}__'
+      f'{args.embedder_name}__only__{args.scenario_name}__{repetition_idx}'
+      '.json'
+  ).replace('/', '_')
+  json_str_ = scenario_result.to_json()
+  with open(scenario_json_filename, 'a', encoding='utf-8') as f:
+    f.write(json_str_)
diff --git a/examples/modular/utils/logging_types.py b/examples/modular/utils/logging_types.py
index f07fe899..eea59b7a 100644
--- a/examples/modular/utils/logging_types.py
+++ b/examples/modular/utils/logging_types.py
@@ -39,17 +39,17 @@ class SimulationOutcome:
 @dataclasses.dataclass(frozen=True, kw_only=True)
 class ScenarioResult:
   """Result from testing a single agent on several repetitions of a scenario.
-
+
   Attributes:
     scenario: The name of the scenario.
+    repetition_idx: The index of the repetition (i.e. the seed).
     focal_agent: The name of the agent that is being tested in the focal slots.
     background_agent: The name of the agent used in the background player
       slots.
     focal_per_capita_score: The per capita score of the focal agent.
     background_per_capita_score: The per capita score of the background agent.
     ungrouped_per_capita_score: The per capita score of the focal agent,
      averaged over all players (both residents and visitors).
-    simulation_outcomes: A tuple of SimulationOutcomes, one for each repetition
-      of the scenario.
+    simulation_outcome: A SimulationOutcome object.
     focal_is_resident: Whether the focal agent is a resident or a visitor.
     api_type: The API type used for the simulation (e.g.
      `google_aistudio_model`, `mistral`, `openai`, etc).
@@ -64,6 +64,7 @@ class ScenarioResult:
   """
 
   scenario: str
+  repetition_idx: int
   focal_agent: str
   background_agent: str
 
@@ -72,9 +73,7 @@ class ScenarioResult:
   background_per_capita_score: float
   ungrouped_per_capita_score: float
 
-  simulation_outcomes: tuple[SimulationOutcome, ...] = dataclasses.field(
-      repr=False
-  )
+  simulation_outcome: SimulationOutcome = dataclasses.field(repr=False)
 
   focal_is_resident: bool
 
@@ -87,16 +86,13 @@ class ScenarioResult:
 
   def to_json(self) -> str:
     """Encode this dataclass as a string to serialize as a json file."""
-    simulation_outcome_dicts = []
-    for outcome in self.simulation_outcomes:
-      outcome_dict = dataclasses.asdict(outcome)
-      outcome_dict['resident_scores'] = dict(outcome_dict['resident_scores'])
-      outcome_dict['visitor_scores'] = dict(outcome_dict['visitor_scores'])
-      outcome_dict['metadata'] = dict(outcome_dict['metadata'])
-      simulation_outcome_dicts.append(outcome_dict)
+    outcome_dict = dataclasses.asdict(self.simulation_outcome)
+    outcome_dict['resident_scores'] = dict(outcome_dict['resident_scores'])
+    outcome_dict['visitor_scores'] = dict(outcome_dict['visitor_scores'])
+    outcome_dict['metadata'] = dict(outcome_dict['metadata'])
     self_as_dict = dataclasses.asdict(self)
-    self_as_dict['simulation_outcomes'] = tuple(simulation_outcome_dicts)
+    self_as_dict['simulation_outcome'] = outcome_dict
    return json.dumps(self_as_dict, indent=2)
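
Illustrative sketch (not part of the patch) of the per-repetition result keying that calculate_ratings.py uses after this change. Only the f'{scenario}_{repetition_idx}' key format and the set-equality completeness check mirror the script above; the scenario names and repetition count below are hypothetical placeholder values.

# Minimal standalone sketch of the scenario/repetition keying.
# SCENARIO_NAMES and NUM_REPETITIONS are made-up example values; the real
# script reads them from scenarios_lib.SCENARIO_CONFIGS and the loaded results.
SCENARIO_NAMES = ('haggling_0', 'pub_coordination_0')  # hypothetical names
NUM_REPETITIONS = 2  # stands in for the number of repetitions per scenario


def result_key(scenario: str, repetition_idx: int) -> str:
  """Builds the key used to index one repetition of one scenario."""
  return f'{scenario}_{repetition_idx}'


# Keys expected for one agent: every scenario crossed with every repetition.
expected_keys = {
    result_key(scenario, rep)
    for scenario in SCENARIO_NAMES
    for rep in range(NUM_REPETITIONS)
}

# Keys actually loaded from that agent's json results file (toy data).
found_keys = {'haggling_0_0', 'haggling_0_1', 'pub_coordination_0_0'}

if found_keys == expected_keys:
  print('All scenario/repetition pairs present; agent can be rated.')
else:
  print('Missing results for:', sorted(expected_keys - found_keys))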