
[v2] Refactor evaluators and Abstasks #1707

Open
wants to merge 16 commits into base: v2.0.0
20 changes: 14 additions & 6 deletions mteb/abstasks/AbsTask.py
@@ -11,6 +11,7 @@
import numpy as np
import torch
import tqdm
import transformers
from datasets import Dataset, DatasetDict
from sklearn.preprocessing import MultiLabelBinarizer

@@ -25,6 +26,13 @@
# ^ e.g {'main_score': 0.5, 'hf_subset': 'en-de', 'languages': ['eng-Latn', 'deu-Latn']}


def set_seed(seed: int) -> tuple[random.Random, np.random.Generator]:
torch.manual_seed(seed)
np.random.seed(seed)
transformers.set_seed(seed)
return random.Random(seed), np.random.default_rng(seed)


def _multilabel_subsampling(
dataset_dict: DatasetDict,
seed: int,
@@ -63,14 +71,14 @@ class AbsTask(ABC):
and Dataset is a datasets.Dataset object. "hf subset" is the data subset on Huggingface typically used to denote the language e.g.
datasets.load_dataset("data", "en"). If the dataset does not have a subset this is simply "default".
abstask_prompt: The potential prompt of the abstask
superseeded_by: Denotes the task that this task is superseeded by. Used to issue warning to users of outdated datasets, while maintaining
superseded_by: Denotes the task that this task is superseded by. Used to issue warning to users of outdated datasets, while maintaining
reproducibility of existing benchmarks.
"""

metadata: TaskMetadata
abstask_prompt: str | None = None
_eval_splits: list[str] | None = None
superseded_by: None | str = None
superseded_by: str | None = None
dataset: dict[HFSubset, DatasetDict] | None = None # type: ignore
data_loaded: bool = False
is_multilingual: bool = False
@@ -85,10 +93,7 @@ def __init__(self, seed: int = 42, **kwargs: Any):
self.save_suffix = kwargs.get("save_suffix", "")

self.seed = seed
random.seed(self.seed)
np.random.seed(self.seed)
torch.manual_seed(self.seed)
torch.cuda.manual_seed_all(self.seed)
self.rng_state, self.np_rng = set_seed(seed)

def check_if_dataset_is_superseded(self):
"""Check if the dataset is superseded by a newer version"""
@@ -329,6 +334,9 @@ def filter_languages(
self.hf_subsets = subsets_to_keep
return self

def _add_main_score(self, scores: dict[HFSubset, ScoresDict]) -> None:
scores["main_score"] = scores[self.metadata.main_score]

def _upload_dataset_to_hub(self, repo_name: str, fields: list[str]) -> None:
if self.is_multilingual:
for config in self.metadata.eval_langs:
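For context on the AbsTask changes above: the new set_seed helper centralizes the seeding that tasks previously repeated in __init__ and returns explicit RNG objects (self.rng_state, self.np_rng) for subclasses to use instead of the global random state. A minimal runnable sketch of that pattern; the transformers.set_seed call from the diff is omitted here so the snippet only needs numpy and torch:

import random

import numpy as np
import torch


def set_seed(seed: int) -> tuple[random.Random, np.random.Generator]:
    # Seed the global torch/numpy state once...
    torch.manual_seed(seed)
    np.random.seed(seed)
    # ...and return task-local RNGs so subclasses avoid touching global state.
    return random.Random(seed), np.random.default_rng(seed)


rng_state, np_rng = set_seed(42)
print(rng_state.sample(range(100), k=3))  # deterministic across runs
print(np_rng.integers(0, 100, size=3))    # independent of the global numpy RNG
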
11 changes: 2 additions & 9 deletions mteb/abstasks/AbsTaskBitextMining.py
@@ -61,9 +61,6 @@ class AbsTaskBitextMining(AbsTask):
parallel_subsets = False
abstask_prompt = "Retrieve parallel sentences."

def __init__(self, **kwargs):
super().__init__(**kwargs)

def evaluate(
self,
model: Encoder,
@@ -94,7 +91,7 @@ def evaluate(
else:
for hf_subet in hf_subsets:
logger.info(
f"\nTask: {self.metadata.name}, split: {split}, subset: {hf_subet}. Running..."
f"Task: {self.metadata.name}, split: {split}, subset: {hf_subet}. Running..."
)

if hf_subet not in self.dataset and hf_subet == "default":
@@ -103,8 +100,7 @@
data_split = self.dataset[hf_subet][split]
scores[hf_subet] = self._evaluate_subset(
model,
data_split, # type: ignore
subsets=["sentence1", "sentence2"],
data_split,
encode_kwargs=encode_kwargs,
**kwargs,
)
@@ -142,9 +138,6 @@ def _evaluate_subset(
self._add_main_score(metrics)
return metrics

def _add_main_score(self, scores) -> None:
scores["main_score"] = scores[self.metadata.main_score]

def _calculate_metrics_from_split(
self, split: str, hf_subset: str | None = None, compute_overall: bool = False
) -> BitextDescriptiveStatistics:
12 changes: 4 additions & 8 deletions mteb/abstasks/AbsTaskClassification.py
@@ -87,9 +87,6 @@ def __init__(
# kNN parameters
self.k = k

def _add_main_score(self, scores: dict[HFSubset, ScoresDict]) -> None:
scores["main_score"] = scores[self.metadata.main_score]

def evaluate(
self,
model,
@@ -110,7 +107,7 @@ def evaluate(

for hf_subset in hf_subsets:
logger.info(
f"\nTask: {self.metadata.name}, split: {eval_split}, subset: {hf_subset}. Running..."
f"Task: {self.metadata.name}, split: {eval_split}, subset: {hf_subset}. Running..."
)

if hf_subset not in self.dataset and hf_subset == "default":
@@ -167,7 +164,6 @@ def _evaluate_subset(
eval_split["text"], # type: ignore
eval_split["label"], # type: ignore
task_name=self.metadata.name,
encode_kwargs=encode_kwargs,
**params,
)
elif self.method == "kNN-pytorch":
@@ -177,7 +173,6 @@
eval_split["text"], # type: ignore
eval_split["label"], # type: ignore
task_name=self.metadata.name,
encode_kwargs=encode_kwargs,
**params,
)
elif self.method == "logReg":
@@ -187,13 +182,14 @@
eval_split["text"], # type: ignore
eval_split["label"], # type: ignore
task_name=self.metadata.name,
encode_kwargs=encode_kwargs,
**params,
)
else:
raise ValueError(f"Method {self.method} not supported")

scores_exp, test_cache = evaluator(model, test_cache=test_cache)
scores_exp, test_cache = evaluator(
model, encode_kwargs=encode_kwargs, test_cache=test_cache
)
scores.append(scores_exp)

avg_scores: dict[str, Any] = {
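The classification change above moves encode_kwargs out of the evaluator constructors and into the evaluator call, so encoding options travel with each invocation. A small sketch of that calling convention; the evaluator and model below are hypothetical stand-ins rather than the real mteb classes:

from typing import Any


class ToyEvaluator:
    # Stand-in for the kNN / logReg classification evaluators (hypothetical).
    def __init__(self, train_texts: list[str], train_labels: list[int], *, task_name: str, **params: Any):
        self.train_texts, self.train_labels = train_texts, train_labels
        self.task_name, self.params = task_name, params

    def __call__(self, model, *, encode_kwargs: dict[str, Any], test_cache=None):
        # encode_kwargs (e.g. batch_size) is supplied per call, matching the diff.
        embeddings = model.encode(self.train_texts, **encode_kwargs)
        return {"accuracy": 0.0, "f1": 0.0}, embeddings if test_cache is None else test_cache


class ToyModel:
    def encode(self, texts: list[str], batch_size: int = 32):
        return [[float(len(t)), float(batch_size)] for t in texts]


evaluator = ToyEvaluator(["a", "bb"], [0, 1], task_name="demo")
scores, test_cache = evaluator(ToyModel(), encode_kwargs={"batch_size": 16}, test_cache=None)
print(scores)
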
3 changes: 0 additions & 3 deletions mteb/abstasks/AbsTaskClustering.py
@@ -64,9 +64,6 @@ class AbsTaskClustering(AbsTask):

abstask_prompt = "Identify categories in user passages."

def __init__(self, **kwargs):
super().__init__(**kwargs)

def _add_main_score(self, scores) -> None:
scores["main_score"] = scores[self.metadata.main_score]

20 changes: 3 additions & 17 deletions mteb/abstasks/AbsTaskClusteringFast.py
@@ -38,8 +38,6 @@ def evaluate_clustering_bootstrapped(
The bootstrapping is done by sampling N samples from the corpus and clustering them. It is done without replacement to get a diverse set of
samples.
"""
n_embeddings = embeddings.shape[0]

v_measures = defaultdict(list)
if max_depth is not None:
max_depth = min(max_depth, max(map(len, labels)))
@@ -143,17 +141,6 @@ class AbsTaskClusteringFast(AbsTask):
max_depth = None
abstask_prompt = "Identify categories in user passages."

def __init__(self, **kwargs):
super().__init__(**kwargs)

def _add_main_score(self, scores):
if self.metadata_dict["main_score"] in scores:
scores["main_score"] = scores[self.metadata.main_score]
else:
logger.warning(
f"main score {self.metadata.main_score} not found in scores {scores.keys()}"
)

def _evaluate_subset(
self,
model: Encoder,
@@ -162,8 +149,6 @@ def _evaluate_subset(
encode_kwargs: dict[str, Any] = {},
**kwargs: Any,
) -> dict[str, float | dict[str, list[float]]]:
rng_state = random.Random(self.seed)

if (
self.max_document_to_embed is not None
and self.max_fraction_of_documents_to_embed is not None
@@ -186,7 +171,7 @@
max_documents_to_embed = self.max_document_to_embed

max_documents_to_embed = min(len(dataset), max_documents_to_embed) # type: ignore
example_indices = rng_state.sample(
example_indices = self.rng_state.sample(
range(len(dataset)), k=max_documents_to_embed
)
downsampled_dataset = dataset.select(example_indices) # type: ignore
@@ -210,7 +195,7 @@
cluster_size=self.max_documents_per_cluster,
kmean_batch_size=self.k_mean_batch_size,
max_depth=self.max_depth,
rng_state=rng_state,
rng_state=self.rng_state,
)
v_measures = list(itertools.chain.from_iterable(all_v_scores.values()))

@@ -276,6 +261,7 @@ def clustering_downsample(
dataset: DatasetDict, seed: int, max_samples_in_cluster: int = 2048
) -> DatasetDict:
"""In cases where it is not possible to convert the dataset to a fast version, we can downsample the dataset to speed up the evaluation.
Only used in ArXivHierarchicalClusteringP2P
Contributor:

we could probably just reupload it and remove this part then

Collaborator (Author):

Moved this function to ArXivHierarchicalClusteringP2P.v2, because ArXivHierarchicalClusteringP2P uses the same dataset

This might be necessary when the clusters in the dataset are not sampled from the same distribution.
"""
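In the fast-clustering change above, subset sampling now draws from self.rng_state, seeded once in AbsTask.__init__, instead of building a fresh random.Random(self.seed) inside _evaluate_subset. A short sketch of the downsampling step under that assumption, with illustrative sizes:

import random

rng_state = random.Random(42)     # stands in for self.rng_state from AbsTask
dataset_size = 10_000             # illustrative; len(dataset) in the real task
max_documents_to_embed = 2_048

k = min(dataset_size, max_documents_to_embed)
example_indices = rng_state.sample(range(dataset_size), k=k)  # without replacement
print(len(example_indices), example_indices[:5])
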
32 changes: 7 additions & 25 deletions mteb/abstasks/AbsTaskMultilabelClassification.py
@@ -27,18 +27,18 @@ def evaluate_classifier(
embeddings_test: np.ndarray,
y_test: np.ndarray,
classifier: ClassifierMixin,
):
scores = {}
) -> dict[str, float]:
classifier = clone(classifier)
classifier.fit(embeddings_train, y_train)
y_pred = classifier.predict(embeddings_test)
accuracy = classifier.score(embeddings_test, y_test)
f1 = f1_score(y_test, y_pred, average="macro")
scores["accuracy"] = accuracy
scores["f1"] = f1
lrap = label_ranking_average_precision_score(y_test, y_pred)
scores["lrap"] = lrap
return scores
return {
"accuracy": accuracy,
"f1": f1,
"lrap": lrap,
}


class MultilabelClassificationDescriptiveStatistics(DescriptiveStatistics):
@@ -97,25 +97,13 @@ class AbsTaskMultilabelClassification(AbsTask):
def __init__(
self,
n_experiments=None,
batch_size=32,
**kwargs,
):
super().__init__(**kwargs)
self.batch_size = batch_size

# Bootstrap parameters
self.n_experiments = n_experiments or getattr(self, "n_experiments", 10)

# Run metadata validation by instantiating addressing the attribute
# This is quite hacky. Ideally, this would be done in the constructor of
# each concrete task, but then we have to duplicate the __init__ method's
# interface.
if hasattr(self, "metadata"):
self.metadata

def _add_main_score(self, scores):
scores["main_score"] = scores[self.metadata.main_score]

def evaluate(
self,
model: Encoder,
@@ -137,7 +125,7 @@

for hf_subset in hf_subsets:
logger.info(
f"\nTask: {self.metadata.name}, split: {eval_split}, subset: {hf_subset}. Running..."
f"Task: {self.metadata.name}, split: {eval_split}, subset: {hf_subset}. Running..."
)

if hf_subset not in self.dataset and hf_subset == "default":
@@ -168,12 +156,6 @@ def _evaluate_subset(
) -> ScoresDict:
train_split = dataset[train_split]
eval_split = dataset[eval_split]
params = {
"classifier_type": type(self.classifier).__name__,
"classifier_params": self.classifier.get_params(),
"batch_size": self.batch_size,
}
params.update(kwargs)

scores = []
# Bootstrap sample indices from training set for each experiment
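The refactored evaluate_classifier above now returns the metric dict directly. A runnable toy example of the same computation on random multilabel data; the classifier and the data are illustrative only, not what the task ships with:

import numpy as np
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, label_ranking_average_precision_score
from sklearn.multioutput import MultiOutputClassifier


def evaluate_classifier(embeddings_train, y_train, embeddings_test, y_test, classifier):
    # Fit a clone so repeated experiments never reuse a trained estimator.
    classifier = clone(classifier)
    classifier.fit(embeddings_train, y_train)
    y_pred = classifier.predict(embeddings_test)
    return {
        "accuracy": classifier.score(embeddings_test, y_test),
        "f1": f1_score(y_test, y_pred, average="macro"),
        "lrap": label_ranking_average_precision_score(y_test, y_pred),
    }


rng = np.random.default_rng(0)
X_train, X_test = rng.normal(size=(64, 8)), rng.normal(size=(32, 8))
y_train = rng.integers(0, 2, size=(64, 3))
y_test = rng.integers(0, 2, size=(32, 3))
clf = MultiOutputClassifier(LogisticRegression(max_iter=500))
print(evaluate_classifier(X_train, y_train, X_test, y_test, clf))
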
6 changes: 0 additions & 6 deletions mteb/abstasks/AbsTaskPairClassification.py
@@ -67,12 +67,6 @@ class AbsTaskPairClassification(AbsTask):

abstask_prompt = "Retrieve text that are semantically similar to the given text."

def __init__(self, **kwargs):
super().__init__(**kwargs)

def _add_main_score(self, scores: ScoresDict) -> None:
scores["main_score"] = scores[self.metadata.main_score]

def _evaluate_subset(
self,
model: Encoder,
3 changes: 0 additions & 3 deletions mteb/abstasks/AbsTaskReranking.py
@@ -33,9 +33,6 @@
class AbsTaskReranking(AbsTaskRetrieval):
"""Abstract class for re-ranking experiments. This is mostly the same as the RetrievalEvaluator, but here to adapt the old format to the new format. TODO: update these tasks to the new format and delete this class."""

def __init__(self, **kwargs):
super(AbsTaskRetrieval, self).__init__(**kwargs)

def load_data(self, **kwargs):
if self.data_loaded:
return
19 changes: 10 additions & 9 deletions mteb/abstasks/AbsTaskSummarization.py
@@ -4,6 +4,7 @@
from typing import Any

import numpy as np
from datasets import Dataset

from mteb.encoder_interface import Encoder
from mteb.load_results.task_results import ScoresDict
@@ -75,13 +76,10 @@ class AbsTaskSummarization(AbsTask):
relevance: list[float] (the score of the machine generated summaries)
"""

evalutor = SummarizationEvaluator
abstask_prompt = (
"Given a news summary, retrieve other semantically similar summaries."
)

def __init__(self, **kwargs):
super().__init__(**kwargs)
evaluator = SummarizationEvaluator

@property
def min_score(self):
@@ -92,13 +90,19 @@
return self.metadata_dict["max_score"]

def _evaluate_subset(
self, model: Encoder, data_split, *, encode_kwargs: dict[str, Any], **kwargs
self,
model: Encoder,
data_split: Dataset,
*,
encode_kwargs: dict[str, Any],
**kwargs,
) -> ScoresDict:
normalized_scores = [
(np.array(x) - self.min_score) / (self.max_score - self.min_score)
for x in data_split["relevance"]
]
evaluator = self.evalutor(
# SummEval has DeprecatedSummarizationEvaluator
evaluator = self.evaluator(
machine_summaries=data_split["machine_summaries"],
human_summaries=data_split["human_summaries"],
texts=data_split["text"],
@@ -110,9 +114,6 @@ def _add_main_score(self, scores: ScoresDict) -> None:
self._add_main_score(scores)
return scores

def _add_main_score(self, scores: ScoresDict) -> None:
scores["main_score"] = scores[self.metadata.main_score]

def _calculate_metrics_from_split(
self, split: str, hf_subset: str | None = None, compute_overall: bool = False
) -> SummarizationDescriptiveStatistics:
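The summarization evaluation above rescales human relevance scores into [0, 1] using the task's declared min_score and max_score before passing them to the evaluator. A tiny worked example of that normalization with made-up bounds and ratings:

import numpy as np

min_score, max_score = 1, 5         # hypothetical task bounds
relevance = [[1, 3, 5], [2, 4, 4]]  # per-document machine-summary ratings

normalized_scores = [
    (np.array(x) - min_score) / (max_score - min_score) for x in relevance
]
print(normalized_scores)  # [array([0. , 0.5, 1. ]), array([0.25, 0.75, 0.75])]
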