From 7cbf802eda20665b1275943c69df14d6c6ba3f14 Mon Sep 17 00:00:00 2001
From: "Joel Z. Leibo"
Date: Mon, 25 Nov 2024 16:01:18 -0800
Subject: [PATCH] Update launch and eval scripts for the eval phase of the
 contest

PiperOrigin-RevId: 700127292
Change-Id: I0eadabe74fbfed9f8a74a98703ed05beee9dd5e4
---
 .../language_model/call_limit_wrapper.py      |  2 +-
 examples/modular/calculate_ratings.py         | 31 +++++--
 .../launch_concordia_challenge_evaluation.py  | 80 +++++++++----------
 examples/modular/launch_one_scenario.py       | 58 ++++++--------
 examples/modular/utils/logging_types.py       | 24 +++---
 5 files changed, 94 insertions(+), 101 deletions(-)

diff --git a/concordia/language_model/call_limit_wrapper.py b/concordia/language_model/call_limit_wrapper.py
index 68f86d2e..90e7c048 100644
--- a/concordia/language_model/call_limit_wrapper.py
+++ b/concordia/language_model/call_limit_wrapper.py
@@ -33,7 +33,7 @@ class CallLimitLanguageModel(language_model.LanguageModel):
   def __init__(
       self,
       model: language_model.LanguageModel,
-      max_calls: int = 1000,
+      max_calls: int = 1200,
   ) -> None:
     """Wrap the underlying language model with a call limit.
 
diff --git a/examples/modular/calculate_ratings.py b/examples/modular/calculate_ratings.py
index ea6dbf3e..c158e485 100644
--- a/examples/modular/calculate_ratings.py
+++ b/examples/modular/calculate_ratings.py
@@ -59,13 +59,17 @@
 # Parse command line arguments
 args = parser.parse_args()
 
+sanitized_model_name = args.model_name.replace('/', '_')
+
 # Load data
 included = {}
 included_agent_idx = 0
 sorted_agent_names = sorted(args.agents)
+max_repetition_idx = -1
 for agent_name in sorted_agent_names:
   print(f'loading data from: {agent_name}')
-  json_filename = f'{agent_name}__{args.model_name}__{args.embedder_name}.json'
+  json_filename = (
+      f'{agent_name}__{sanitized_model_name}__{args.embedder_name}.json')
   loaded = file_utils.load_from_json_file(json_filename)
 
   scenario_results_to_include = {}
@@ -93,14 +97,23 @@
           f' {expected_background_agent}'
       )
 
-    if result.scenario in scenario_results_to_include:
-      raise RuntimeError(f'Duplicate scenario: {result.scenario}')
+    repetition_idx = int(result.repetition_idx)
+    max_repetition_idx = max(max_repetition_idx, repetition_idx)
+    scenario_with_repetition = f'{result.scenario}_{repetition_idx}'
+
+    if scenario_with_repetition in scenario_results_to_include:
+      raise RuntimeError(f'Duplicate scenario: {scenario_with_repetition}')
 
-    scenario_results_to_include[result.scenario] = result
+    scenario_results_to_include[scenario_with_repetition] = result
 
   # Check there are results for all scenarios.
+  expected_scenarios = []
+  for expected_scenario in set(scenarios_lib.SCENARIO_CONFIGS.keys()):
+    for repetition_idx in range(max_repetition_idx + 1):
+      expected_scenarios.append(f'{expected_scenario}_{repetition_idx}')
+  expected_scenarios = set(expected_scenarios)
   scenarios_found = set(scenario_results_to_include.keys())
-  if scenarios_found == set(scenarios_lib.SCENARIO_CONFIGS.keys()):
+  if scenarios_found == expected_scenarios:
     included[agent_name] = dict(
         agent_idx=included_agent_idx, results=scenario_results_to_include
     )
@@ -112,16 +125,18 @@
 # the data from the previous runs with other agent submissions.
 # We need to form a score matrix with shape [num_scenarios X num_agents]
 num_scenarios = len(scenarios_lib.SCENARIO_CONFIGS)
+num_scenarios_and_repetitions = num_scenarios * (max_repetition_idx + 1)
 agents_to_evaluate = list(included.keys())
 num_agents_to_evaluate = len(agents_to_evaluate)
-score_matrix = np.zeros((num_scenarios, num_agents_to_evaluate))
+score_matrix = np.zeros((num_scenarios_and_repetitions, num_agents_to_evaluate))
 for agent_name in agents_to_evaluate:
   results_per_scenario = included[agent_name]['results']
   num_scenarios_found = len(results_per_scenario)
   assert (
-      num_scenarios_found == num_scenarios
-  ), f'Wrong number of scenarios: {num_scenarios_found} != {num_scenarios}'
+      num_scenarios_found == num_scenarios_and_repetitions
+  ), ('Wrong number of scenarios: '
+      f'{num_scenarios_found} != {num_scenarios_and_repetitions}')
 
   names_by_scenario_vector = np.array(
       [result.scenario for result in results_per_scenario.values()]
diff --git a/examples/modular/launch_concordia_challenge_evaluation.py b/examples/modular/launch_concordia_challenge_evaluation.py
index 407f1893..419a1142 100644
--- a/examples/modular/launch_concordia_challenge_evaluation.py
+++ b/examples/modular/launch_concordia_challenge_evaluation.py
@@ -243,11 +243,6 @@ def _evaluate_all_repetitions_on_one_scenario(
   """
   print(f'Running scenario: {scenario_name}')
   # Run several simulations per scenario
-  simulation_outcomes = []
-  focal_per_capita_scores_to_average = []
-  background_per_capita_scores_to_average = []
-  ungrouped_per_capita_scores_to_average = []
-
   tasks_this_scenario = {
       str(i): functools.partial(
           _evaluate_one_repetition,
@@ -267,6 +262,7 @@ def _evaluate_all_repetitions_on_one_scenario(
         'Raised errors', list(exceptions_per_repetition.values())
     )
 
+  scenario_results = []
   for repetition_idx, outcome in outputs_per_repetition.items():
     if scenario_config.focal_is_resident:
       focal_scores = list(outcome.resident_scores.values())
@@ -279,45 +275,38 @@ def _evaluate_all_repetitions_on_one_scenario(
 
     # Calculate per capita scores.
     print(f'\nScores for repetition {repetition_idx}:')
     focal_per_capita_score = np.mean(focal_scores)
-    focal_per_capita_scores_to_average.append(focal_per_capita_score)
     print(f'  Focal per capita score: {focal_per_capita_score}')
     background_per_capita_score = np.mean(background_scores)
-    background_per_capita_scores_to_average.append(background_per_capita_score)
     print(f'  Background per capita score: {background_per_capita_score}')
     ungrouped_per_capita_score = np.mean(ungrouped_scores)
-    ungrouped_per_capita_scores_to_average.append(ungrouped_per_capita_score)
     print(f'  Ungrouped per capita score: {ungrouped_per_capita_score}')
 
-  # Average scores over repetitions and save results for all repetitions in a
-  # json-serializable format.
-  scenario_result_ = logging_lib.ScenarioResult(
-      scenario=scenario_name,
-      focal_agent=args.agent_name,
-      background_agent=scenario_config.background_agent_module,
-      focal_per_capita_score=np.mean(focal_per_capita_scores_to_average),
-      background_per_capita_score=np.mean(
-          background_per_capita_scores_to_average
-      ),
-      ungrouped_per_capita_score=np.mean(
-          ungrouped_per_capita_scores_to_average
-      ),
-      simulation_outcomes=tuple(simulation_outcomes),
-      focal_is_resident=scenario_config.focal_is_resident,
-      api_type=args.api_type,
-      model=args.model_name,
-      embedder=args.embedder_name,
-      disable_language_model=args.disable_language_model,
-      exclude_from_elo_calculation=args.exclude_from_elo_calculation,
-  )
-  scenario_json_filename = (
-      f'{args.agent_name}__{args.model_name}__'
-      f'{args.embedder_name}__only_{scenario_name}.json'
-  ).replace('/', '_')
-  scenario_json_filename = os.path.join(results_dir, scenario_json_filename)
-  json_str_ = scenario_result_.to_json()
-  with open(scenario_json_filename, 'a', encoding='utf-8') as f:
-    f.write(json_str_)
-  return scenario_result_
+    scenario_result_ = logging_lib.ScenarioResult(
+        scenario=scenario_name,
+        repetition_idx=repetition_idx,
+        focal_agent=args.agent_name,
+        background_agent=scenario_config.background_agent_module,
+        focal_per_capita_score=focal_per_capita_score,
+        background_per_capita_score=background_per_capita_score,
+        ungrouped_per_capita_score=ungrouped_per_capita_score,
+        simulation_outcome=outcome,
+        focal_is_resident=scenario_config.focal_is_resident,
+        api_type=args.api_type,
+        model=args.model_name,
+        embedder=args.embedder_name,
+        disable_language_model=args.disable_language_model,
+        exclude_from_elo_calculation=args.exclude_from_elo_calculation,
+    )
+    scenario_json_filename = (
+        f'{args.agent_name}__{args.model_name}__'
+        f'{args.embedder_name}__only__{scenario_name}__{repetition_idx}.json'
+    ).replace('/', '_')
+    scenario_json_filename = os.path.join(results_dir, scenario_json_filename)
+    json_str_ = scenario_result_.to_json()
+    with open(scenario_json_filename, 'a', encoding='utf-8') as f:
+      f.write(json_str_)
+    scenario_results.append(scenario_result_)
+  return scenario_results
 
 tasks = {
     name: functools.partial(
@@ -330,16 +319,19 @@ def _evaluate_all_repetitions_on_one_scenario(
 evaluation_results = concurrency.run_tasks(tasks)
 
 # Save evaluation results for all scenarios with this agent to one json file.
+num_expected_results = (len(scenarios_lib.SCENARIO_CONFIGS) *
+                        args.num_repetitions_per_scenario)
 json_filename = (
     f'{args.agent_name}__{args.model_name}__{args.embedder_name}.json'
 ).replace('/', '_')
 idx = 0
 with open(json_filename, 'a', encoding='utf-8') as file_handle:
   file_handle.write('[\n')
-  for scenario_name_, scenario_result in evaluation_results.items():
-    json_str = evaluation_results[scenario_name_].to_json()
-    if idx < len(scenarios_lib.SCENARIO_CONFIGS) - 1:
-      json_str += ',\n'
-    file_handle.write(json_str)
-    idx += 1
+  for scenario_name_, _ in evaluation_results.items():
+    for scenario_result in evaluation_results[scenario_name_]:
+      json_str = scenario_result.to_json()
+      if idx < num_expected_results - 1:
+        json_str += ',\n'
+      file_handle.write(json_str)
+      idx += 1
   file_handle.write('\n]')
diff --git a/examples/modular/launch_one_scenario.py b/examples/modular/launch_one_scenario.py
index 2bd813af..bafca56b 100644
--- a/examples/modular/launch_one_scenario.py
+++ b/examples/modular/launch_one_scenario.py
@@ -188,10 +188,6 @@
 print(f'Running scenario: {args.scenario_name}')
 scenario_config = scenarios_lib.SCENARIO_CONFIGS[args.scenario_name]
 # Run several simulations per scenario
-simulation_outcomes = []
-focal_per_capita_scores_to_average = []
-background_per_capita_scores_to_average = []
-ungrouped_per_capita_scores_to_average = []
 for repetition_idx in range(args.num_repetitions_per_scenario):
   measurements = measurements_lib.Measurements()
   runnable_simulation = scenarios_lib.build_simulation(
@@ -205,7 +201,6 @@
   )
   # Run the simulation
   outcome, text_results_log = runnable_simulation()
-  simulation_outcomes.append(outcome)
   if scenario_config.focal_is_resident:
     focal_scores = list(outcome.resident_scores.values())
     background_scores = list(outcome.visitor_scores.values())
@@ -217,13 +212,10 @@
   # Calculate per capita scores.
   print('\nScores:')
   focal_per_capita_score = np.mean(focal_scores)
-  focal_per_capita_scores_to_average.append(focal_per_capita_score)
   print(f'  Focal per capita score: {focal_per_capita_score}')
   background_per_capita_score = np.mean(background_scores)
-  background_per_capita_scores_to_average.append(background_per_capita_score)
   print(f'  Background per capita score: {background_per_capita_score}')
   ungrouped_per_capita_score = np.mean(ungrouped_scores)
-  ungrouped_per_capita_scores_to_average.append(ungrouped_per_capita_score)
   print(f'  Ungrouped per capita score: {ungrouped_per_capita_score}')
   # Write the full text log as an HTML file in the current working directory.
   html_filename = (
@@ -234,29 +226,27 @@
   with open(html_filename, 'a', encoding='utf-8') as f:
     f.write(text_results_log)
 
-# Average scores over repetitions and save results for all repetitions in a
-# json-serializable format.
-scenario_result = logging_lib.ScenarioResult(
-    scenario=args.scenario_name,
-    focal_agent=args.agent_name,
-    background_agent=scenario_config.background_agent_module,
-    focal_per_capita_score=np.mean(focal_per_capita_scores_to_average),
-    background_per_capita_score=np.mean(
-        background_per_capita_scores_to_average
-    ),
-    ungrouped_per_capita_score=np.mean(ungrouped_per_capita_scores_to_average),
-    simulation_outcomes=tuple(simulation_outcomes),
-    focal_is_resident=scenario_config.focal_is_resident,
-    api_type=args.api_type,
-    model=args.model_name,
-    embedder=args.embedder_name,
-    disable_language_model=args.disable_language_model,
-    exclude_from_elo_calculation=args.exclude_from_elo_calculation,
-)
-scenario_json_filename = (
-    f'{args.agent_name}__{args.model_name}__'
-    f'{args.embedder_name}__only_{args.scenario_name}.json'
-).replace('/', '_')
-json_str_ = scenario_result.to_json()
-with open(scenario_json_filename, 'a', encoding='utf-8') as f:
-  f.write(json_str_)
+  scenario_result = logging_lib.ScenarioResult(
+      scenario=args.scenario_name,
+      repetition_idx=repetition_idx,
+      focal_agent=args.agent_name,
+      background_agent=scenario_config.background_agent_module,
+      focal_per_capita_score=focal_per_capita_score,
+      background_per_capita_score=background_per_capita_score,
+      ungrouped_per_capita_score=ungrouped_per_capita_score,
+      simulation_outcome=outcome,
+      focal_is_resident=scenario_config.focal_is_resident,
+      api_type=args.api_type,
+      model=args.model_name,
+      embedder=args.embedder_name,
+      disable_language_model=args.disable_language_model,
+      exclude_from_elo_calculation=args.exclude_from_elo_calculation,
+  )
+  scenario_json_filename = (
+      f'{args.agent_name}__{args.model_name}__'
+      f'{args.embedder_name}__only__{args.scenario_name}__{repetition_idx}'
+      '.json'
+  ).replace('/', '_')
+  json_str_ = scenario_result.to_json()
+  with open(scenario_json_filename, 'a', encoding='utf-8') as f:
+    f.write(json_str_)
diff --git a/examples/modular/utils/logging_types.py b/examples/modular/utils/logging_types.py
index f07fe899..eea59b7a 100644
--- a/examples/modular/utils/logging_types.py
+++ b/examples/modular/utils/logging_types.py
@@ -39,17 +39,17 @@ class SimulationOutcome:
 @dataclasses.dataclass(frozen=True, kw_only=True)
 class ScenarioResult:
   """Result from testing a single agent on several repetitions of a scenario.
-
+
   Attributes:
     scenario: The name of the scenario.
+    repetition_idx: The index of the repetition (i.e. the seed).
     focal_agent: The name of the agent that is being tested in the focal slots.
     background_agent: The name of the agent used in the background player
       slots.
     focal_per_capita_score: The per capita score of the focal agent.
     background_per_capita_score: The per capita score of the background agent.
     ungrouped_per_capita_score: The per capita score of the focal agent,
      averaged over all players (both residents and visitors).
-    simulation_outcomes: A tuple of SimulationOutcomes, one for each repetition
-      of the scenario.
+    simulation_outcome: A SimulationOutcome object.
     focal_is_resident: Whether the focal agent is a resident or a visitor.
     api_type: The API type used for the simulation (e.g.
      `google_aistudio_model`, `mistral`, `openai`, etc).
@@ -64,6 +64,7 @@ class ScenarioResult:
   """
 
   scenario: str
+  repetition_idx: int
   focal_agent: str
   background_agent: str
 
@@ -72,9 +73,7 @@ class ScenarioResult:
   background_per_capita_score: float
   ungrouped_per_capita_score: float
 
-  simulation_outcomes: tuple[SimulationOutcome, ...] = dataclasses.field(
-      repr=False
-  )
+  simulation_outcome: SimulationOutcome = dataclasses.field(repr=False)
 
   focal_is_resident: bool
 
@@ -87,16 +86,13 @@ class ScenarioResult:
 
   def to_json(self) -> str:
     """Encode this dataclass as a string to serialize as a json file."""
-    simulation_outcome_dicts = []
-    for outcome in self.simulation_outcomes:
-      outcome_dict = dataclasses.asdict(outcome)
-      outcome_dict['resident_scores'] = dict(outcome_dict['resident_scores'])
-      outcome_dict['visitor_scores'] = dict(outcome_dict['visitor_scores'])
-      outcome_dict['metadata'] = dict(outcome_dict['metadata'])
-      simulation_outcome_dicts.append(outcome_dict)
+    outcome_dict = dataclasses.asdict(self.simulation_outcome)
+    outcome_dict['resident_scores'] = dict(outcome_dict['resident_scores'])
+    outcome_dict['visitor_scores'] = dict(outcome_dict['visitor_scores'])
+    outcome_dict['metadata'] = dict(outcome_dict['metadata'])
     self_as_dict = dataclasses.asdict(self)
-    self_as_dict['simulation_outcomes'] = tuple(simulation_outcome_dicts)
+    self_as_dict['simulation_outcome'] = outcome_dict
    return json.dumps(self_as_dict, indent=2)
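
Illustrative sketch (not part of the patch) of the per-repetition result keying that calculate_ratings.py uses after this change. Only the f'{scenario}_{repetition_idx}' key format and the set-equality completeness check mirror the script above; the scenario names and repetition count below are hypothetical placeholder values.

# Minimal standalone sketch of the scenario/repetition keying.
# SCENARIO_NAMES and NUM_REPETITIONS are made-up example values; the real
# script reads them from scenarios_lib.SCENARIO_CONFIGS and the loaded results.
SCENARIO_NAMES = ('haggling_0', 'pub_coordination_0')  # hypothetical names
NUM_REPETITIONS = 2  # stands in for the number of repetitions per scenario


def result_key(scenario: str, repetition_idx: int) -> str:
  """Builds the key used to index one repetition of one scenario."""
  return f'{scenario}_{repetition_idx}'


# Keys expected for one agent: every scenario crossed with every repetition.
expected_keys = {
    result_key(scenario, rep)
    for scenario in SCENARIO_NAMES
    for rep in range(NUM_REPETITIONS)
}

# Keys actually loaded from that agent's json results file (toy data).
found_keys = {'haggling_0_0', 'haggling_0_1', 'pub_coordination_0_0'}

if found_keys == expected_keys:
  print('All scenario/repetition pairs present; agent can be rated.')
else:
  print('Missing results for:', sorted(expected_keys - found_keys))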