Write supplementary results files to subdirectories in main eval script and add a script to evaluate one scenario at a time.

PiperOrigin-RevId: 674013158
Change-Id: I9f35a42f23d4ab865d270edc95cd5ddacbfe1d42
jzleibo authored and copybara-github committed Sep 12, 2024
1 parent d644ecf commit 2cee86c
Showing 2 changed files with 259 additions and 1 deletion.
examples/modular/launch_concordia_challenge_evaluation.py: 10 changes (9 additions, 1 deletion)
@@ -70,6 +70,7 @@
 import datetime
 import functools
 import importlib
+import os
 
 from concordia.language_model import utils
 from concordia.utils import concurrency
@@ -167,6 +168,11 @@
 else:
   embedder = lambda x: np.ones(5)
 
+# Create evaluation results directory
+start_time = datetime.datetime.now().strftime('%Y-%m-%d__%H:%M:%S')
+results_dir = f'evaluation__{start_time}'
+os.makedirs(results_dir, exist_ok=True)
+
 
 def _evaluate_all_repetitions_on_one_scenario(
     scenario_name: str,
@@ -207,6 +213,7 @@ def _evaluate_all_repetitions_on_one_scenario(
     # Ungrouped scores do not differentiate between focal and background.
     ungrouped_scores = focal_scores + background_scores
     # Calculate per capita scores.
+    print('\nScores:')
     focal_per_capita_score = np.mean(focal_scores)
     focal_per_capita_scores_to_average.append(focal_per_capita_score)
     print(f'  Focal per capita score: {focal_per_capita_score}')
@@ -218,7 +225,7 @@
     print(f'  Ungrouped per capita score: {ungrouped_per_capita_score}')
     # Write the full text log as an HTML file in the current working directory.
     html_filename = (
-        f'{scenario_name}_'
+        f'{results_dir}/{scenario_name}_'
         + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
         + '.html'
     )
@@ -250,6 +257,7 @@ def _evaluate_all_repetitions_on_one_scenario(
       f'{args.agent_name}__{args.model_name}__'
       f'{args.embedder_name}__only_{scenario_name}.json'
   ).replace('/', '_')
+  scenario_json_filename = os.path.join(results_dir, scenario_json_filename)
   json_str_ = scenario_result_.to_json()
   with open(scenario_json_filename, 'a', encoding='utf-8') as f:
     f.write(json_str_)
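
Taken together, these hunks route every supplementary results file through one timestamped directory instead of the current working directory; the append-mode writes and the filename conventions are unchanged. A minimal standalone sketch of the resulting path scheme (the agent, model, embedder, and scenario names below simply reuse the defaults of the per-scenario launcher added below, purely for illustration):

import datetime
import os

# Mirror the commit's directory naming: one folder per evaluation run.
start_time = datetime.datetime.now().strftime('%Y-%m-%d__%H:%M:%S')
results_dir = f'evaluation__{start_time}'
os.makedirs(results_dir, exist_ok=True)

# Illustrative names only; any agent/model/embedder/scenario works the same way.
scenario_name = 'labor_collective_action__fixed_rule_boss_0'
html_filename = (
    f'{results_dir}/{scenario_name}_'
    + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    + '.html'
)
scenario_json_filename = os.path.join(
    results_dir,
    ('rational_agent__codestral-latest__all-mpnet-base-v2__'
     f'only_{scenario_name}.json').replace('/', '_'),
)
print(html_filename)
print(scenario_json_filename)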
examples/modular/launch_one_scenario.py: 250 changes (250 additions, 0 deletions)
@@ -0,0 +1,250 @@
# Copyright 2023 DeepMind Technologies Limited.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

r"""Evaluate the submitted agent on all scenarios.
Usage:
cd {concordia_root}/
PYTHONPATH=. PYTHONSAFEPATH=1 python examples/modular/launch_one_scenario.py \
--agent=AGENT_NAME \
--scenario=SCENARIO_NAME \
--api_type=API_TYPE \
--model=MODEL_NAME \
--embedder=EMBEDDER_NAME \
--num_repetitions_per_scenario=NUM_REPETITIONS_PER_SCENARIO

Where AGENT_NAME indicates a file under concordia/factory/agent,
SCENARIO_NAME indicates a key in the SCENARIO_CONFIGS dictionary in
concordia/examples/modular/scenario/scenarios.py,
API_TYPE is one of the options named in concordia/language_model/utils.py,
e.g. 'google_aistudio_model', 'openai', 'mistral', 'ollama', 'amazon_bedrock'.
MODEL_NAME is a specific model under the chosen API_TYPE. See the corresponding
wrapper in concordia/language_model/ for the link to the website where the
model names are listed for each type of API.
and EMBEDDER_NAME specifies a sentence transformers embedding model listed at
https://huggingface.co/sentence-transformers.
NUM_REPETITIONS_PER_SCENARIO specifies how many times to repeat the scenario,
averaging the results to produce a single score.

This script will download the embedder from huggingface and cache it locally.

To debug without spending money on API calls, pass the extra flag:
--disable_language_model
It replaces the language model with a null model that always returns an empty
string when asked for a free response and always selects the first option when
asked a multiple choice question.

This script will write a json file with the results of the evaluation to the
current working directory. The file will be named
AGENT_NAME__MODEL_NAME__EMBEDDER_NAME__only_SCENARIO_NAME.json
and will contain a list of json-serializable objects, each one containing
results for the selected (agent, scenario, model, embedder). The script also
writes an html file with the scenario's full text log. The file will be named
SCENARIO_NAME__YYYY-MM-DD HH:MM:SS.html
where SCENARIO_NAME is the name of the scenario and the date and time are the
time when the simulation was run.

The script also writes a text file in the current working directory with the
name of each evaluated agent:
agents__MODEL_NAME__EMBEDDER_NAME.txt
This file is used to keep track of which agents have already been evaluated
for a given MODEL_NAME and EMBEDDER_NAME. If the selected agent is already in
the list, the script will raise an error.

After running this script you can run `calculate_ratings.py` to compute Elo
ratings. The `calculate_ratings.py` script loads the json files written by this
script and computes the Elo ratings for all agents that have been tested with
the same model and embedder.
"""

import argparse
import datetime
import importlib

from concordia.language_model import utils
from concordia.utils import measurements as measurements_lib
import numpy as np
import sentence_transformers

# pylint: disable=g-bad-import-order
from examples.modular.scenario import scenarios as scenarios_lib
from examples.modular.utils import logging_types as logging_lib

# Setup for command line arguments
parser = argparse.ArgumentParser(
    description='Run a Concordia Challenge evaluation.'
)
parser.add_argument(
    '--agent',
    action='store',
    default='rational_agent',
    dest='agent_name',
)
parser.add_argument(
    '--scenario',
    action='store',
    default='labor_collective_action__fixed_rule_boss_0',
    dest='scenario_name',
)
parser.add_argument(
    '--api_type', action='store', default='mistral', dest='api_type'
)
parser.add_argument(
    '--model', action='store', default='codestral-latest', dest='model_name'
)
parser.add_argument(
    '--embedder',
    action='store',
    default='all-mpnet-base-v2',
    dest='embedder_name',
)
parser.add_argument(
    '--num_repetitions_per_scenario',
    action='store',
    type=int,
    default=1,
    dest='num_repetitions_per_scenario',
)
parser.add_argument('--api_key',
                    action='store',
                    default=None,
                    dest='api_key')
parser.add_argument(
    '--disable_language_model',
    action='store_true',
    help=(
        'replace the language model with a null model. This '
        'makes it possible to debug without spending money '
        'on api calls.'
    ),
    default=False,
    dest='disable_language_model',
)
parser.add_argument(
    '--exclude_from_elo_calculation',
    action='store_true',
    help=(
        'Use this option to write and analyze test data. It '
        'will be automatically enabled when selecting '
        'disable_language_model but can also be selected '
        'independently of that flag using this one.'
    ),
    default=False,
    dest='exclude_from_elo_calculation',
)
# Parse command line arguments
args = parser.parse_args()

exclude_from_elo_calculation = args.exclude_from_elo_calculation
if args.disable_language_model:
  exclude_from_elo_calculation = True

# Load the agent config with importlib
IMPORT_AGENT_BASE_DIR = 'concordia.factory.agent'
agent_module = importlib.import_module(
    f'{IMPORT_AGENT_BASE_DIR}.{args.agent_name}'
)

# Language Model setup
model = utils.language_model_setup(
    api_type=args.api_type,
    model_name=args.model_name,
    api_key=args.api_key,
    disable_language_model=args.disable_language_model,
)

# Setup sentence encoder
if not args.disable_language_model:
  st_model = sentence_transformers.SentenceTransformer(
      f'sentence-transformers/{args.embedder_name}'
  )
  embedder = lambda x: st_model.encode(x, show_progress_bar=False)
else:
  embedder = lambda x: np.ones(5)

print(f'Running scenario: {args.scenario_name}')
scenario_config = scenarios_lib.SCENARIO_CONFIGS[args.scenario_name]
# Run several simulations per scenario
simulation_outcomes = []
focal_per_capita_scores_to_average = []
background_per_capita_scores_to_average = []
ungrouped_per_capita_scores_to_average = []
for _ in range(args.num_repetitions_per_scenario):
  measurements = measurements_lib.Measurements()
  runnable_simulation = scenarios_lib.build_simulation(
      scenario_config=scenario_config,
      model=model,
      focal_agent_module=agent_module,
      embedder=embedder,
      measurements=measurements,
  )
  # Run the simulation
  outcome, text_results_log = runnable_simulation()
  simulation_outcomes.append(outcome)
  if scenario_config.focal_is_resident:
    focal_scores = list(outcome.resident_scores.values())
    background_scores = list(outcome.visitor_scores.values())
  else:
    focal_scores = list(outcome.visitor_scores.values())
    background_scores = list(outcome.resident_scores.values())
  # Ungrouped scores do not differentiate between focal and background.
  ungrouped_scores = focal_scores + background_scores
  # Calculate per capita scores.
  print('\nScores:')
  focal_per_capita_score = np.mean(focal_scores)
  focal_per_capita_scores_to_average.append(focal_per_capita_score)
  print(f'  Focal per capita score: {focal_per_capita_score}')
  background_per_capita_score = np.mean(background_scores)
  background_per_capita_scores_to_average.append(background_per_capita_score)
  print(f'  Background per capita score: {background_per_capita_score}')
  ungrouped_per_capita_score = np.mean(ungrouped_scores)
  ungrouped_per_capita_scores_to_average.append(ungrouped_per_capita_score)
  print(f'  Ungrouped per capita score: {ungrouped_per_capita_score}')
  # Write the full text log as an HTML file in the current working directory.
  html_filename = (
      f'{args.scenario_name}_'
      + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
      + '.html'
  )
  with open(html_filename, 'a', encoding='utf-8') as f:
    f.write(text_results_log)

# Average scores over repetitions and save results for all repetitions in a
# json-serializable format.
scenario_result = logging_lib.ScenarioResult(
    scenario=args.scenario_name,
    focal_agent=args.agent_name,
    background_agent=scenario_config.background_agent_module,
    focal_per_capita_score=np.mean(focal_per_capita_scores_to_average),
    background_per_capita_score=np.mean(
        background_per_capita_scores_to_average
    ),
    ungrouped_per_capita_score=np.mean(
        ungrouped_per_capita_scores_to_average
    ),
    simulation_outcomes=tuple(simulation_outcomes),
    focal_is_resident=scenario_config.focal_is_resident,
    api_type=args.api_type,
    model=args.model_name,
    embedder=args.embedder_name,
    disable_language_model=args.disable_language_model,
    exclude_from_elo_calculation=args.exclude_from_elo_calculation,
)
scenario_json_filename = (
    f'{args.agent_name}__{args.model_name}__'
    f'{args.embedder_name}__only_{args.scenario_name}.json'
).replace('/', '_')
json_str_ = scenario_result.to_json()
with open(scenario_json_filename, 'a', encoding='utf-8') as f:
  f.write(json_str_)
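
Since the new launcher handles exactly one scenario per invocation, a full evaluation can be split across scenarios simply by invoking it repeatedly. A hedged sketch of such a driver loop, assuming it is run from the concordia root; it passes --disable_language_model so the sketch costs no API calls, and only the default scenario key below is taken from the script itself (any others would come from scenarios_lib.SCENARIO_CONFIGS):

import os
import subprocess

# Hypothetical driver, not part of this commit: one subprocess per scenario.
scenario_names = [
    'labor_collective_action__fixed_rule_boss_0',
    # ...further SCENARIO_CONFIGS keys...
]
for name in scenario_names:
  subprocess.run(
      [
          'python', 'examples/modular/launch_one_scenario.py',
          '--agent=rational_agent',
          f'--scenario={name}',
          '--api_type=mistral',
          '--model=codestral-latest',
          '--embedder=all-mpnet-base-v2',
          '--num_repetitions_per_scenario=1',
          '--disable_language_model',
      ],
      check=True,
      env={**os.environ, 'PYTHONPATH': '.', 'PYTHONSAFEPATH': '1'},
  )

Per the docstring above, calculate_ratings.py can then load the resulting json files to compute Elo ratings for agents tested with the same model and embedder.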
