How to calculate MTEB score from individual files? #1721

Open
orionw opened this issue Jan 7, 2025 · 2 comments

orionw commented Jan 7, 2025

Do we have a script for this?

cc @NohTow @KennethEnevoldsen

Muennighoff commented:

I think the below may work for converting the results into a LaTeX table, and it should be easy to adjust to just averaging without the table. Let me know if you get it to work -- it would be great to add a working version under https://github.com/embeddings-benchmark/mteb/tree/main/scripts

"""
Usage: python results_to_tex.py results_folder_path
results_folder_path contains results of multiple models whose folders should be named after them
"""
import json
import os
import sys

from mteb import MTEB
import numpy as np


### GLOBAL VARIABLES ###

TASK_LIST_BITEXT = [
    "BUCC",
    "Tatoeba",
]

TASK_LIST_CLASSIFICATION = [
    "AmazonCounterfactualClassification",
    "AmazonPolarityClassification",
    "AmazonReviewsClassification",
    "Banking77Classification",
    "EmotionClassification",
    "ImdbClassification",
    "MassiveIntentClassification",
    "MassiveScenarioClassification",
    "MTOPDomainClassification",
    "MTOPIntentClassification",
    "ToxicConversationsClassification",
    "TweetSentimentExtractionClassification",
]

TASK_LIST_CLUSTERING = [
    "ArxivClusteringP2P",
    "ArxivClusteringS2S",
    "BiorxivClusteringP2P",
    "BiorxivClusteringS2S",
    "MedrxivClusteringP2P",
    "MedrxivClusteringS2S",
    "RedditClustering",
    "RedditClusteringP2P",
    "StackExchangeClustering",
    "StackExchangeClusteringP2P",
    "TwentyNewsgroupsClustering",
]

TASK_LIST_PAIR_CLASSIFICATION = [
    "SprintDuplicateQuestions",
    "TwitterSemEval2015",
    "TwitterURLCorpus",
]

TASK_LIST_RERANKING = [
    "AskUbuntuDupQuestions",
    "MindSmallReranking",
    "SciDocsRR",
    "StackOverflowDupQuestions",
]

TASK_LIST_RETRIEVAL = [
    "ArguAna",
    "ClimateFEVER",
    "CQADupstackRetrieval",
    "DBPedia",
    "FEVER",
    "FiQA2018",
    "HotpotQA",
    "MSMARCO",
    "NFCorpus",
    "NQ",
    "QuoraRetrieval",
    "SCIDOCS",
    "SciFact",
    "Touche2020",
    "TRECCOVID",
]

TASK_LIST_STS = [
    "BIOSSES",
    "SICK-R",
    "STS12",
    "STS13",
    "STS14",
    "STS15",
    "STS16",
    "STS17",
    "STS22",
    "STSBenchmark",
]

TASK_LIST_SUMMARIZATION = [
    "SummEval",
]

TASK_LIST = (
    TASK_LIST_BITEXT
    + TASK_LIST_CLASSIFICATION
    + TASK_LIST_CLUSTERING
    + TASK_LIST_PAIR_CLASSIFICATION
    + TASK_LIST_RERANKING
    + TASK_LIST_RETRIEVAL
    + TASK_LIST_STS
    + TASK_LIST_SUMMARIZATION
)

TASK_LIST_EN = (
    TASK_LIST_CLASSIFICATION
    + TASK_LIST_CLUSTERING
    + TASK_LIST_PAIR_CLASSIFICATION
    + TASK_LIST_RERANKING
    + TASK_LIST_RETRIEVAL
    + TASK_LIST_STS
    + TASK_LIST_SUMMARIZATION
)

QUICK_EVAL = [
    # Classification
    "Banking77Classification",
    "EmotionClassification",
    # Clustering
    "MedrxivClusteringS2S",
    # PairClassification
    "TwitterSemEval2015",
    # Reranking
    "AskUbuntuDupQuestions",
    # Retrieval
    "ArguAna",
    "NFCorpus",
    "SciFact",
    # STS
    "BIOSSES",
    "STS17",
    "STSBenchmark",
    # Summarization
    "SummEval",
]

TASK_LIST_NAMES = [
    ("Classification", TASK_LIST_CLASSIFICATION, ["en", "en-en"]),
    ("Clustering", TASK_LIST_CLUSTERING, ["en", "en-en"]),
    ("PairClassification", TASK_LIST_PAIR_CLASSIFICATION, ["en", "en-en"]),
    ("Reranking", TASK_LIST_RERANKING, ["en", "en-en"]),
    ("Retrieval", TASK_LIST_RETRIEVAL, ["en", "en-en"]),
    ("STS", TASK_LIST_STS, ["en", "en-en"]),
    ("all", TASK_LIST, ["en", "en-en"]),
    ("BitextMining", TASK_LIST_BITEXT, []),
]

MODELS = [
    "GritLM__GritLM-7B/13f00a0e36500c80ce12870ea513846a066004af",
]

MODEL_TO_NAME = {
    "GritLM__GritLM-7B/13f00a0e36500c80ce12870ea513846a066004af": "GritLM-7B",
}

### LOGIC ###

results_folder = sys.argv[1].rstrip("/")

all_results = {}

mteb_task_names = [t.metadata.name for t in MTEB().tasks] + ["CQADupstackRetrieval"]

for model_name in MODELS:
    model_res_folder = os.path.join(results_folder, model_name)
    if os.path.isdir(model_res_folder):
        all_results.setdefault(model_name, {})
        for file_name in os.listdir(model_res_folder):
            if file_name.split(".")[0] not in mteb_task_names:
                print(f"Skipping non-MTEB file: {file_name}")
                continue
            print(f"Parsing MTEB file: {model_name}/{file_name}")
            with open(os.path.join(model_res_folder, file_name), "r", encoding="utf-8") as f:
                results = json.load(f)['scores']
                all_results[model_name] = {**all_results[model_name], **{file_name.replace(".json", ""): results}}

# Alternatively iterate via results folder
# for model_name in os.listdir(results_folder):
#     model_res_folder = os.path.join(results_folder, model_name)
#     if os.path.isdir(model_res_folder):
#         all_results.setdefault(model_name, {})
#         for file_name in os.listdir(model_res_folder):
#             if file_name.split(".")[0] not in mteb_task_names:
#                 print(f"Skipping non-MTEB file: {file_name}")
#                 continue
#             print(f"Parsing MTEB file: {model_name}/{file_name}")
#             with open(os.path.join(model_res_folder, file_name), "r", encoding="utf-8") as f:
#                 results = json.load(f)['scores']
#                 all_results[model_name] = {**all_results[model_name], **{file_name.replace(".json", ""): results}}

def get_rows(dataset, model_name, limit_langs=[], skip_langs=[]):
    rows = []
    # CQADupstackRetrieval uses the same metric as its subsets
    tasks = MTEB(tasks=[dataset.replace("CQADupstackRetrieval", "CQADupstackTexRetrieval")]).tasks
    assert len(tasks) == 1, f"Found {len(tasks)} for {dataset}. Expected 1."
    main_metric = tasks[0].metadata.main_score
    test_result = all_results.get(model_name, {}).get(dataset, {})

    # Dev / Val set is used for MSMARCO (See BEIR paper)
    if "MSMARCO" in dataset:
        test_result = (
            test_result.get("dev") if "dev" in test_result else test_result.get("validation")
        )
    else:
        test_result = test_result.get("test")

    if test_result is None:
        rows.append([dataset, main_metric, None])
        return rows

    for res in test_result:
        lang = res['languages'][0]
        if "en-" in res['hf_subset'] and res['hf_subset'] != "en-en": continue
        if (limit_langs and lang not in limit_langs) or (skip_langs and lang in skip_langs):
            continue

        test_result_lang = res['main_score']
        if test_result_lang is None:
            rows.append([lang, main_metric, None])
            continue
        rows.append([lang, main_metric, test_result_lang])
    return rows


def get_table(models, task_list, limit_langs=[], skip_langs=[], name="table", no_lang_col=False):
    TABLE = "Dataset & Language & " + " & ".join([MODEL_TO_NAME.get(model, model) for model in models]) + " \\\\" + "\n"
    if no_lang_col:
        TABLE = TABLE.replace("Language & ", "")
    scores_all = []
    for ds in task_list:
        try:
            results = [get_rows(dataset=ds, model_name=model, limit_langs=limit_langs, skip_langs=skip_langs) for model in models]
            assert all(len(sub) == len(results[0]) for sub in results)
            for lang_idx in range(len(results[0])):
                scores = [x[lang_idx][-1] for x in results]
                scores_all.append(scores)
                lang = results[0][lang_idx][0]
                beginning = [ds, lang] if not(no_lang_col) else [ds]
                one_line = " & ".join(beginning + [str(round(x*100, 2)) if x is not None else "" for x in scores])
                TABLE += one_line + " \\\\" + "\n"
        except Exception as e:
            print(f"Skipping {ds} due to {e}")


    arr = np.array(scores_all, dtype=np.float32)
    # Boolean index of columns (models) that have any NaN value
    index = np.isnan(arr).any(axis=0)
    # Delete columns (models) with any NaN value from 2D NumPy Array
    arr = np.delete(arr, index, axis=1)
    # Average
    scores_avg = list(np.mean(arr, axis=0))
    # Insert empty string for NaN columns
    for i, val in enumerate(index):
        if val:
            scores_avg.insert(i, "")
    lang = "mix" if not(limit_langs) else limit_langs[0]
    beginning = ["Average", lang] if not(no_lang_col) else ["Average"]
    TABLE += " & ".join(beginning + [str(round(x*100, 2)) if x else "" for x in scores_avg]) + " \\\\" + "\n"

    with open(f"{name}.txt", "w") as f:
        f.write(TABLE)

get_table(MODELS, TASK_LIST_CLASSIFICATION, limit_langs=["en", "en-en", "eng-Latn"], name="mteb_clf", no_lang_col=True)
get_table(MODELS, TASK_LIST_CLUSTERING, limit_langs=["en", "en-en", "eng-Latn"], name="mteb_clu", no_lang_col=True)
get_table(MODELS, TASK_LIST_PAIR_CLASSIFICATION, limit_langs=["en", "en-en", "eng-Latn"], name="mteb_pclf", no_lang_col=True)
get_table(MODELS, TASK_LIST_RERANKING, limit_langs=["en", "en-en", "eng-Latn"], name="mteb_rrk", no_lang_col=True)
get_table(MODELS, TASK_LIST_RETRIEVAL, limit_langs=["en", "en-en", "eng-Latn"], name="mteb_rtr", no_lang_col=True)
get_table(MODELS, TASK_LIST_STS, limit_langs=["en", "en-en", "eng-Latn"], name="mteb_sts", no_lang_col=True)
get_table(MODELS, TASK_LIST_EN, limit_langs=["en", "en-en", "eng-Latn"], name="mteb_en", no_lang_col=True)
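
If you only need the averages and not the table, a minimal standalone sketch along the same lines should do it (untested; it assumes the same per-task JSON layout as above, i.e. a "scores" dict keyed by split containing per-subset entries with a "main_score"):

"""
Sketch (untested): python average_main_scores.py model_results_folder
model_results_folder contains the per-task JSON result files of a single model.
"""
import json
import os
import sys

import numpy as np

model_folder = sys.argv[1].rstrip("/")

task_means = []
for file_name in sorted(os.listdir(model_folder)):
    # model_meta.json (if present) is not a task result file
    if not file_name.endswith(".json") or file_name == "model_meta.json":
        continue
    with open(os.path.join(model_folder, file_name), "r", encoding="utf-8") as f:
        scores = json.load(f)["scores"]
    # Dev set is used for MSMARCO (see get_rows above); test otherwise
    split = "dev" if "MSMARCO" in file_name else "test"
    subset_scores = [res["main_score"] for res in scores.get(split, [])]
    if subset_scores:
        # Average across languages/subsets first, then across tasks
        task_means.append(np.mean(subset_scores))

print(f"Average main_score over {len(task_means)} tasks: {np.mean(task_means):.4f}")

Note this is a flat mean over whichever task files are present; it does not apply the leaderboard's task-type weighting.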

KennethEnevoldsen commented Jan 8, 2025

This should work:

import pandas as pd

import mteb

res: mteb.BenchmarkResults = mteb.load_results()

wide_scores = res.get_scores(format="wide")
df = pd.DataFrame.from_records(wide_scores)
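
From there, a per-model average is just a row-wise mean over the task columns. A small sketch, assuming the wide format gives one row per model and one numeric column per task (I have not checked the exact column layout, so treat the column handling as an assumption):

# Sketch: average over whatever numeric (score) columns are present per row
task_cols = df.select_dtypes("number").columns
df["mean_score"] = df[task_cols].mean(axis=1)
print(df["mean_score"])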

mteb.load_results() loads the benchmark results from the results repo. However, you can construct the mteb.BenchmarkResults object manually using:

import mteb

tasks = mteb.get_tasks(...)
bench = mteb.MTEB(tasks)
task_results = bench.run(my_model)

model_result = ModelResult(model_name=..., revision=..., task_results=task_results)
model_results = [model_result]  # add more models here
res = BenchmarkResults(model_results)

If relevant, we should add this to the docs somewhere and probably add a BenchmarkResults.to_table() that does the work.

Once you have a BenchmarkResults object you can compute the metrics using:

from mteb.task_aggregation import mean, task_category_weighted_mean, borda_count

borda: dict = borda_count(res)  # res is the BenchmarkResults object from above
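
The other aggregation functions should take the same object (an assumption based on the borda_count call above; I have not double-checked their signatures):

# Sketch, assuming mean/task_category_weighted_mean accept a BenchmarkResults object
mean_scores: dict = mean(res)
weighted_scores: dict = task_category_weighted_mean(res)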

However, these are not the actual implementations used by the leaderboard (though I believe they are compatible, @x-tabdeveloping?)

(I don't believe there are any dependencies on them, would remove them in v2)

We should probably add something like:

BenchmarkResults.compute_mean()  # mean(task)
BenchmarkResults.compute_task_type_mean()  # mean(TaskType)
BenchmarkResults.compute_borda_rank()  # Rank(Borda)

# we could also add these individually (but we can't add borda here)
BenchmarkResults.compute_mean()  # mean(task)
BenchmarkResults.compute_task_type_mean()  # mean(TaskType)

These would use the exact same implementation as the leaderboard. (@x-tabdeveloping, I would love to get your opinion here to keep the leaderboard considerations in mind.)
