From 2cee86cc5bbbcbf5831fb1641d7d0eae4f492efd Mon Sep 17 00:00:00 2001
From: "Joel Z. Leibo"
Date: Thu, 12 Sep 2024 14:32:59 -0700
Subject: [PATCH] Write supplementary results files to subdirectories in main
 eval script and add a script to evaluate one scenario at a time.

PiperOrigin-RevId: 674013158
Change-Id: I9f35a42f23d4ab865d270edc95cd5ddacbfe1d42
---
 .../launch_concordia_challenge_evaluation.py |  10 +-
 examples/modular/launch_one_scenario.py      | 250 ++++++++++++++++++
 2 files changed, 259 insertions(+), 1 deletion(-)
 create mode 100644 examples/modular/launch_one_scenario.py

diff --git a/examples/modular/launch_concordia_challenge_evaluation.py b/examples/modular/launch_concordia_challenge_evaluation.py
index 96250d26..15583b73 100644
--- a/examples/modular/launch_concordia_challenge_evaluation.py
+++ b/examples/modular/launch_concordia_challenge_evaluation.py
@@ -70,6 +70,7 @@
 import datetime
 import functools
 import importlib
+import os
 
 from concordia.language_model import utils
 from concordia.utils import concurrency
@@ -167,6 +168,11 @@
 else:
   embedder = lambda x: np.ones(5)
 
+# Create evaluation results directory
+start_time = datetime.datetime.now().strftime('%Y-%m-%d__%H:%M:%S')
+results_dir = f'evaluation__{start_time}'
+os.makedirs(results_dir, exist_ok=True)
+
 
 def _evaluate_all_repetitions_on_one_scenario(
     scenario_name: str,
@@ -207,6 +213,7 @@
     # Ungrouped scores do not differentiate between focal and background.
     ungrouped_scores = focal_scores + background_scores
     # Calculate per capita scores.
+    print('\nScores:')
     focal_per_capita_score = np.mean(focal_scores)
     focal_per_capita_scores_to_average.append(focal_per_capita_score)
     print(f' Focal per capita score: {focal_per_capita_score}')
@@ -218,7 +225,7 @@
     print(f' Ungrouped per capita score: {ungrouped_per_capita_score}')
     # Write the full text log as an HTML file in the current working directory.
     html_filename = (
-        f'{scenario_name}_'
+        f'{results_dir}/{scenario_name}_'
         + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
         + '.html'
     )
@@ -250,6 +257,7 @@
       f'{args.agent_name}__{args.model_name}__'
       f'{args.embedder_name}__only_{scenario_name}.json'
   ).replace('/', '_')
+  scenario_json_filename = os.path.join(results_dir, scenario_json_filename)
   json_str_ = scenario_result_.to_json()
   with open(scenario_json_filename, 'a', encoding='utf-8') as f:
     f.write(json_str_)
diff --git a/examples/modular/launch_one_scenario.py b/examples/modular/launch_one_scenario.py
new file mode 100644
index 00000000..5968edda
--- /dev/null
+++ b/examples/modular/launch_one_scenario.py
@@ -0,0 +1,250 @@
+# Copyright 2023 DeepMind Technologies Limited.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+r"""Evaluate the submitted agent on one scenario.
+
+Usage:
+cd {concordia_root}/
+PYTHONPATH=. PYTHONSAFEPATH=1 python examples/modular/launch_one_scenario.py \
+  --agent=AGENT_NAME \
+  --scenario=SCENARIO_NAME \
+  --api_type=API_TYPE \
+  --model=MODEL_NAME \
+  --embedder=EMBEDDER_NAME \
+  --num_repetitions_per_scenario=NUM_REPETITIONS_PER_SCENARIO
+
+Where AGENT_NAME indicates a file under concordia/factory/agent,
+SCENARIO_NAME indicates a key in the SCENARIO_CONFIGS dictionary in
+concordia/examples/modular/scenario/scenarios.py,
+API_TYPE is one of the options named in concordia/language_model/utils.py,
+e.g. 'google_aistudio_model', 'openai', 'mistral', 'ollama', 'amazon_bedrock'.
+MODEL_NAME is a specific model under the chosen API_TYPE. See the corresponding
+wrapper in concordia/language_model/ for the link to the website where the
+model names are listed for each type of API.
+EMBEDDER_NAME specifies a sentence transformers embedding model listed at
+https://huggingface.co/sentence-transformers.
+NUM_REPETITIONS_PER_SCENARIO specifies how many times to repeat the scenario,
+averaging the results to produce a single score.
+
+This script will download the embedder from huggingface and cache it locally.
+
+To debug without spending money on API calls, pass the extra flag:
+  --disable_language_model
+It replaces the language model with a null model that always returns an empty
+string when asked for a free response and always selects the first option when
+asked a multiple choice question.
+
+This script will write a json file with the results of the evaluation to the
+current working directory. The file will be named
+  AGENT_NAME__MODEL_NAME__EMBEDDER_NAME__only_SCENARIO_NAME.json
+and will contain a list of json-serializable objects, each one containing
+results on the selected scenario for the selected (agent, model, embedder).
+The script also writes an html file with the scenario's full text log. The
+file will be named
+  SCENARIO_NAME__YYYY-MM-DD HH:MM:SS.html
+where SCENARIO_NAME is the name of the scenario and the date and time are the
+time when the simulation was run.
+The script also writes a text file in the current working directory with the
+name of each evaluated agent:
+  agents__MODEL_NAME__EMBEDDER_NAME.txt
+This file is used to keep track of which agents have already been evaluated
+for a given MODEL_NAME and EMBEDDER_NAME. If the selected agent is already in
+the list, the script will raise an error.
+
+After running this script you can run `calculate_ratings.py` to compute Elo
+ratings. The `calculate_ratings.py` script loads the json files written by this
+script and computes the Elo ratings for all agents that have been tested with
+the same model and embedder.
+"""
+
+import argparse
+import datetime
+import importlib
+
+from concordia.language_model import utils
+from concordia.utils import measurements as measurements_lib
+import numpy as np
+import sentence_transformers
+
+# pylint: disable=g-bad-import-order
+from examples.modular.scenario import scenarios as scenarios_lib
+from examples.modular.utils import logging_types as logging_lib
+
+# Setup for command line arguments
+parser = argparse.ArgumentParser(
+    description='Run a Concordia Challenge evaluation.'
+)
+parser.add_argument(
+    '--agent',
+    action='store',
+    default='rational_agent',
+    dest='agent_name',
+)
+parser.add_argument(
+    '--scenario',
+    action='store',
+    default='labor_collective_action__fixed_rule_boss_0',
+    dest='scenario_name',
+)
+parser.add_argument(
+    '--api_type', action='store', default='mistral', dest='api_type'
+)
+parser.add_argument(
+    '--model', action='store', default='codestral-latest', dest='model_name'
+)
+parser.add_argument(
+    '--embedder',
+    action='store',
+    default='all-mpnet-base-v2',
+    dest='embedder_name',
+)
+parser.add_argument(
+    '--num_repetitions_per_scenario',
+    action='store',
+    type=int,
+    default=1,
+    dest='num_repetitions_per_scenario',
+)
+parser.add_argument('--api_key',
+                    action='store',
+                    default=None,
+                    dest='api_key')
+parser.add_argument(
+    '--disable_language_model',
+    action='store_true',
+    help=(
+        'replace the language model with a null model. This '
+        'makes it possible to debug without spending money '
+        'on api calls.'
+    ),
+    default=False,
+    dest='disable_language_model',
+)
+parser.add_argument(
+    '--exclude_from_elo_calculation',
+    action='store_true',
+    help=(
+        'Use this option to write and analyze test data. It '
+        'will be automatically enabled when selecting '
+        'disable_language_model but can also be selected '
+        'independently of that flag using this one.'
+    ),
+    default=False,
+    dest='exclude_from_elo_calculation',
+)
+# Parse command line arguments
+args = parser.parse_args()
+
+exclude_from_elo_calculation = args.exclude_from_elo_calculation
+if args.disable_language_model:
+  exclude_from_elo_calculation = True
+
+# Load the agent config with importlib
+IMPORT_AGENT_BASE_DIR = 'concordia.factory.agent'
+agent_module = importlib.import_module(
+    f'{IMPORT_AGENT_BASE_DIR}.{args.agent_name}'
+)
+
+# Language Model setup
+model = utils.language_model_setup(
+    api_type=args.api_type,
+    model_name=args.model_name,
+    api_key=args.api_key,
+    disable_language_model=args.disable_language_model,
+)
+
+# Setup sentence encoder
+if not args.disable_language_model:
+  st_model = sentence_transformers.SentenceTransformer(
+      f'sentence-transformers/{args.embedder_name}'
+  )
+  embedder = lambda x: st_model.encode(x, show_progress_bar=False)
+else:
+  embedder = lambda x: np.ones(5)
+
+print(f'Running scenario: {args.scenario_name}')
+scenario_config = scenarios_lib.SCENARIO_CONFIGS[args.scenario_name]
+# Run several simulations per scenario
+simulation_outcomes = []
+focal_per_capita_scores_to_average = []
+background_per_capita_scores_to_average = []
+ungrouped_per_capita_scores_to_average = []
+for _ in range(args.num_repetitions_per_scenario):
+  measurements = measurements_lib.Measurements()
+  runnable_simulation = scenarios_lib.build_simulation(
+      scenario_config=scenario_config,
+      model=model,
+      focal_agent_module=agent_module,
+      embedder=embedder,
+      measurements=measurements,
+  )
+  # Run the simulation
+  outcome, text_results_log = runnable_simulation()
+  simulation_outcomes.append(outcome)
+  if scenario_config.focal_is_resident:
+    focal_scores = list(outcome.resident_scores.values())
+    background_scores = list(outcome.visitor_scores.values())
+  else:
+    focal_scores = list(outcome.visitor_scores.values())
+    background_scores = list(outcome.resident_scores.values())
+  # Ungrouped scores do not differentiate between focal and background.
+  ungrouped_scores = focal_scores + background_scores
+  # Calculate per capita scores.
+  print('\nScores:')
+  focal_per_capita_score = np.mean(focal_scores)
+  focal_per_capita_scores_to_average.append(focal_per_capita_score)
+  print(f' Focal per capita score: {focal_per_capita_score}')
+  background_per_capita_score = np.mean(background_scores)
+  background_per_capita_scores_to_average.append(background_per_capita_score)
+  print(f' Background per capita score: {background_per_capita_score}')
+  ungrouped_per_capita_score = np.mean(ungrouped_scores)
+  ungrouped_per_capita_scores_to_average.append(ungrouped_per_capita_score)
+  print(f' Ungrouped per capita score: {ungrouped_per_capita_score}')
+  # Write the full text log as an HTML file in the current working directory.
+  html_filename = (
+      f'{args.scenario_name}_'
+      + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+      + '.html'
+  )
+  with open(html_filename, 'a', encoding='utf-8') as f:
+    f.write(text_results_log)
+
+# Average scores over repetitions and save results for all repetitions in a
+# json-serializable format.
+scenario_result = logging_lib.ScenarioResult(
+    scenario=args.scenario_name,
+    focal_agent=args.agent_name,
+    background_agent=scenario_config.background_agent_module,
+    focal_per_capita_score=np.mean(focal_per_capita_scores_to_average),
+    background_per_capita_score=np.mean(
+        background_per_capita_scores_to_average
+    ),
+    ungrouped_per_capita_score=np.mean(
+        ungrouped_per_capita_scores_to_average
+    ),
+    simulation_outcomes=tuple(simulation_outcomes),
+    focal_is_resident=scenario_config.focal_is_resident,
+    api_type=args.api_type,
+    model=args.model_name,
+    embedder=args.embedder_name,
+    disable_language_model=args.disable_language_model,
+    exclude_from_elo_calculation=args.exclude_from_elo_calculation,
+)
+scenario_json_filename = (
+    f'{args.agent_name}__{args.model_name}__'
+    f'{args.embedder_name}__only_{args.scenario_name}.json'
+).replace('/', '_')
+json_str_ = scenario_result.to_json()
+with open(scenario_json_filename, 'a', encoding='utf-8') as f:
+  f.write(json_str_)
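
For reference, a minimal sketch (not part of the patch) of one way to inspect the
per-scenario JSON that launch_one_scenario.py writes. It assumes a single run, so
the file holds exactly one JSON object, and that ScenarioResult.to_json()
serializes the constructor fields shown above under the same names; the filename
is a hypothetical example assembled from the script's default flag values.

import json

# Hypothetical example filename built from the default flags
# (agent=rational_agent, model=codestral-latest, embedder=all-mpnet-base-v2,
# scenario=labor_collective_action__fixed_rule_boss_0); substitute your own run.
result_filename = (
    'rational_agent__codestral-latest__all-mpnet-base-v2__'
    'only_labor_collective_action__fixed_rule_boss_0.json'
)
# Assumes the file was written by a single run and so contains one JSON object
# produced by ScenarioResult.to_json().
with open(result_filename, encoding='utf-8') as f:
  result = json.load(f)
print(result['scenario'], result['focal_per_capita_score'])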