
[v2] Refactor evaluators and Abstasks #1707

Open
wants to merge 16 commits into base: v2.0.0
20 changes: 14 additions & 6 deletions mteb/abstasks/AbsTask.py
@@ -11,6 +11,7 @@
import numpy as np
import torch
import tqdm
import transformers
from datasets import Dataset, DatasetDict
from sklearn.preprocessing import MultiLabelBinarizer

@@ -25,6 +26,13 @@
# ^ e.g {'main_score': 0.5, 'hf_subset': 'en-de', 'languages': ['eng-Latn', 'deu-Latn']}


def set_seed(seed: int) -> tuple[random.Random, np.random.Generator]:
torch.manual_seed(seed)
np.random.seed(seed)
transformers.set_seed(seed)
return random.Random(seed), np.random.default_rng(seed)


def _multilabel_subsampling(
dataset_dict: DatasetDict,
seed: int,
@@ -63,14 +71,14 @@ class AbsTask(ABC):
and Dataset is a datasets.Dataset object. "hf subset" is the data subset on Huggingface typically used to denote the language e.g.
datasets.load_dataset("data", "en"). If the dataset does not have a subset this is simply "default".
abstask_prompt: The potential prompt of the abstask
superseeded_by: Denotes the task that this task is superseeded by. Used to issue warning to users of outdated datasets, while maintaining
superseded_by: Denotes the task that this task is superseded by. Used to issue warning to users of outdated datasets, while maintaining
reproducibility of existing benchmarks.
"""

metadata: TaskMetadata
abstask_prompt: str | None = None
_eval_splits: list[str] | None = None
superseded_by: None | str = None
superseded_by: str | None = None
dataset: dict[HFSubset, DatasetDict] | None = None # type: ignore
data_loaded: bool = False
is_multilingual: bool = False
@@ -85,10 +93,7 @@ def __init__(self, seed: int = 42, **kwargs: Any):
self.save_suffix = kwargs.get("save_suffix", "")

self.seed = seed
random.seed(self.seed)
np.random.seed(self.seed)
torch.manual_seed(self.seed)
torch.cuda.manual_seed_all(self.seed)
self.rng_state, self.np_rng = set_seed(seed)

def check_if_dataset_is_superseded(self):
"""Check if the dataset is superseded by a newer version"""
@@ -329,6 +334,9 @@ def filter_languages(
self.hf_subsets = subsets_to_keep
return self

def _add_main_score(self, scores: dict[HFSubset, ScoresDict]) -> None:
scores["main_score"] = scores[self.metadata.main_score]

def _upload_dataset_to_hub(self, repo_name: str, fields: list[str]) -> None:
if self.is_multilingual:
for config in self.metadata.eval_langs:
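For context on the AbsTask changes above: the new set_seed helper centralizes the seeding that tasks previously repeated in __init__ and returns explicit RNG objects (self.rng_state, self.np_rng) for subclasses to use instead of the global random state. A minimal runnable sketch of that pattern; the transformers.set_seed call from the diff is omitted here so the snippet only needs numpy and torch:

import random

import numpy as np
import torch


def set_seed(seed: int) -> tuple[random.Random, np.random.Generator]:
    # Seed the global torch/numpy state once...
    torch.manual_seed(seed)
    np.random.seed(seed)
    # ...and return task-local RNGs so subclasses avoid touching global state.
    return random.Random(seed), np.random.default_rng(seed)


rng_state, np_rng = set_seed(42)
print(rng_state.sample(range(100), k=3))  # deterministic across runs
print(np_rng.integers(0, 100, size=3))    # independent of the global numpy RNG
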
11 changes: 2 additions & 9 deletions mteb/abstasks/AbsTaskBitextMining.py
@@ -61,9 +61,6 @@ class AbsTaskBitextMining(AbsTask):
parallel_subsets = False
abstask_prompt = "Retrieve parallel sentences."

def __init__(self, **kwargs):
super().__init__(**kwargs)

def evaluate(
self,
model: Encoder,
@@ -94,7 +91,7 @@ def evaluate(
else:
for hf_subet in hf_subsets:
logger.info(
f"\nTask: {self.metadata.name}, split: {split}, subset: {hf_subet}. Running..."
f"Task: {self.metadata.name}, split: {split}, subset: {hf_subet}. Running..."
)

if hf_subet not in self.dataset and hf_subet == "default":
@@ -103,8 +100,7 @@
data_split = self.dataset[hf_subet][split]
scores[hf_subet] = self._evaluate_subset(
model,
data_split, # type: ignore
subsets=["sentence1", "sentence2"],
data_split,
encode_kwargs=encode_kwargs,
**kwargs,
)
@@ -142,9 +138,6 @@ def _evaluate_subset(
self._add_main_score(metrics)
return metrics

def _add_main_score(self, scores) -> None:
scores["main_score"] = scores[self.metadata.main_score]

def _calculate_metrics_from_split(
self, split: str, hf_subset: str | None = None, compute_overall: bool = False
) -> BitextDescriptiveStatistics:
12 changes: 4 additions & 8 deletions mteb/abstasks/AbsTaskClassification.py
@@ -87,9 +87,6 @@ def __init__(
# kNN parameters
self.k = k

def _add_main_score(self, scores: dict[HFSubset, ScoresDict]) -> None:
scores["main_score"] = scores[self.metadata.main_score]

def evaluate(
self,
model,
@@ -110,7 +107,7 @@ def evaluate(

for hf_subset in hf_subsets:
logger.info(
f"\nTask: {self.metadata.name}, split: {eval_split}, subset: {hf_subset}. Running..."
f"Task: {self.metadata.name}, split: {eval_split}, subset: {hf_subset}. Running..."
)

if hf_subset not in self.dataset and hf_subset == "default":
@@ -167,7 +164,6 @@ def _evaluate_subset(
eval_split["text"], # type: ignore
eval_split["label"], # type: ignore
task_name=self.metadata.name,
encode_kwargs=encode_kwargs,
**params,
)
elif self.method == "kNN-pytorch":
@@ -177,7 +173,6 @@
eval_split["text"], # type: ignore
eval_split["label"], # type: ignore
task_name=self.metadata.name,
encode_kwargs=encode_kwargs,
**params,
)
elif self.method == "logReg":
@@ -187,13 +182,14 @@
eval_split["text"], # type: ignore
eval_split["label"], # type: ignore
task_name=self.metadata.name,
encode_kwargs=encode_kwargs,
**params,
)
else:
raise ValueError(f"Method {self.method} not supported")

scores_exp, test_cache = evaluator(model, test_cache=test_cache)
scores_exp, test_cache = evaluator(
model, encode_kwargs=encode_kwargs, test_cache=test_cache
)
scores.append(scores_exp)

avg_scores: dict[str, Any] = {
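The classification change above moves encode_kwargs out of the evaluator constructors and into the evaluator call, so encoding options travel with each invocation. A small sketch of that calling convention; the evaluator and model below are hypothetical stand-ins rather than the real mteb classes:

from typing import Any


class ToyEvaluator:
    # Stand-in for the kNN / logReg classification evaluators (hypothetical).
    def __init__(self, train_texts: list[str], train_labels: list[int], *, task_name: str, **params: Any):
        self.train_texts, self.train_labels = train_texts, train_labels
        self.task_name, self.params = task_name, params

    def __call__(self, model, *, encode_kwargs: dict[str, Any], test_cache=None):
        # encode_kwargs (e.g. batch_size) is supplied per call, matching the diff.
        embeddings = model.encode(self.train_texts, **encode_kwargs)
        return {"accuracy": 0.0, "f1": 0.0}, embeddings if test_cache is None else test_cache


class ToyModel:
    def encode(self, texts: list[str], batch_size: int = 32):
        return [[float(len(t)), float(batch_size)] for t in texts]


evaluator = ToyEvaluator(["a", "bb"], [0, 1], task_name="demo")
scores, test_cache = evaluator(ToyModel(), encode_kwargs={"batch_size": 16}, test_cache=None)
print(scores)
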
3 changes: 0 additions & 3 deletions mteb/abstasks/AbsTaskClustering.py
@@ -64,9 +64,6 @@ class AbsTaskClustering(AbsTask):

abstask_prompt = "Identify categories in user passages."

def __init__(self, **kwargs):
super().__init__(**kwargs)

def _add_main_score(self, scores) -> None:
scores["main_score"] = scores[self.metadata.main_score]

20 changes: 3 additions & 17 deletions mteb/abstasks/AbsTaskClusteringFast.py
@@ -38,8 +38,6 @@ def evaluate_clustering_bootstrapped(
The bootstrapping is done by sampling N samples from the corpus and clustering them. It is done without replacement to get a diverse set of
samples.
"""
n_embeddings = embeddings.shape[0]

v_measures = defaultdict(list)
if max_depth is not None:
max_depth = min(max_depth, max(map(len, labels)))
@@ -143,17 +141,6 @@ class AbsTaskClusteringFast(AbsTask):
max_depth = None
abstask_prompt = "Identify categories in user passages."

def __init__(self, **kwargs):
super().__init__(**kwargs)

def _add_main_score(self, scores):
if self.metadata_dict["main_score"] in scores:
scores["main_score"] = scores[self.metadata.main_score]
else:
logger.warning(
f"main score {self.metadata.main_score} not found in scores {scores.keys()}"
)

def _evaluate_subset(
self,
model: Encoder,
@@ -162,8 +149,6 @@ def _evaluate_subset(
encode_kwargs: dict[str, Any] = {},
**kwargs: Any,
) -> dict[str, float | dict[str, list[float]]]:
rng_state = random.Random(self.seed)

if (
self.max_document_to_embed is not None
and self.max_fraction_of_documents_to_embed is not None
@@ -186,7 +171,7 @@
max_documents_to_embed = self.max_document_to_embed

max_documents_to_embed = min(len(dataset), max_documents_to_embed) # type: ignore
example_indices = rng_state.sample(
example_indices = self.rng_state.sample(
range(len(dataset)), k=max_documents_to_embed
)
downsampled_dataset = dataset.select(example_indices) # type: ignore
@@ -210,7 +195,7 @@
cluster_size=self.max_documents_per_cluster,
kmean_batch_size=self.k_mean_batch_size,
max_depth=self.max_depth,
rng_state=rng_state,
rng_state=self.rng_state,
)
v_measures = list(itertools.chain.from_iterable(all_v_scores.values()))

@@ -276,6 +261,7 @@ def clustering_downsample(
dataset: DatasetDict, seed: int, max_samples_in_cluster: int = 2048
) -> DatasetDict:
"""In cases where it is not possible to convert the dataset to a fast version, we can downsample the dataset to speed up the evaluation.
Only used in ArXivHierarchicalClusteringP2P
Contributor:

we could probably just reupload it and remove this part then

Collaborator (Author):

Moved this function to ArXivHierarchicalClusteringP2P.v2, because ArXivHierarchicalClusteringP2P uses the same dataset

This might be necessary when the clusters in the dataset are not sampled from the same distribution.
"""
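In the fast-clustering change above, subset sampling now draws from self.rng_state, seeded once in AbsTask.__init__, instead of building a fresh random.Random(self.seed) inside _evaluate_subset. A short sketch of the downsampling step under that assumption, with illustrative sizes:

import random

rng_state = random.Random(42)     # stands in for self.rng_state from AbsTask
dataset_size = 10_000             # illustrative; len(dataset) in the real task
max_documents_to_embed = 2_048

k = min(dataset_size, max_documents_to_embed)
example_indices = rng_state.sample(range(dataset_size), k=k)  # without replacement
print(len(example_indices), example_indices[:5])
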
32 changes: 7 additions & 25 deletions mteb/abstasks/AbsTaskMultilabelClassification.py
@@ -27,18 +27,18 @@ def evaluate_classifier(
embeddings_test: np.ndarray,
y_test: np.ndarray,
classifier: ClassifierMixin,
):
scores = {}
) -> dict[str, float]:
classifier = clone(classifier)
classifier.fit(embeddings_train, y_train)
y_pred = classifier.predict(embeddings_test)
accuracy = classifier.score(embeddings_test, y_test)
f1 = f1_score(y_test, y_pred, average="macro")
scores["accuracy"] = accuracy
scores["f1"] = f1
lrap = label_ranking_average_precision_score(y_test, y_pred)
scores["lrap"] = lrap
return scores
return {
"accuracy": accuracy,
"f1": f1,
"lrap": lrap,
}


class MultilabelClassificationDescriptiveStatistics(DescriptiveStatistics):
@@ -97,25 +97,13 @@ class AbsTaskMultilabelClassification(AbsTask):
def __init__(
self,
n_experiments=None,
batch_size=32,
**kwargs,
):
super().__init__(**kwargs)
self.batch_size = batch_size

# Bootstrap parameters
self.n_experiments = n_experiments or getattr(self, "n_experiments", 10)

# Run metadata validation by instantiating addressing the attribute
# This is quite hacky. Ideally, this would be done in the constructor of
# each concrete task, but then we have to duplicate the __init__ method's
# interface.
if hasattr(self, "metadata"):
self.metadata

def _add_main_score(self, scores):
scores["main_score"] = scores[self.metadata.main_score]

def evaluate(
self,
model: Encoder,
@@ -137,7 +125,7 @@

for hf_subset in hf_subsets:
logger.info(
f"\nTask: {self.metadata.name}, split: {eval_split}, subset: {hf_subset}. Running..."
f"Task: {self.metadata.name}, split: {eval_split}, subset: {hf_subset}. Running..."
)

if hf_subset not in self.dataset and hf_subset == "default":
@@ -168,12 +156,6 @@ def _evaluate_subset(
) -> ScoresDict:
train_split = dataset[train_split]
eval_split = dataset[eval_split]
params = {
"classifier_type": type(self.classifier).__name__,
"classifier_params": self.classifier.get_params(),
"batch_size": self.batch_size,
}
params.update(kwargs)

scores = []
# Bootstrap sample indices from training set for each experiment
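The refactored evaluate_classifier above now returns the metric dict directly. A runnable toy example of the same computation on random multilabel data; the classifier and the data are illustrative only, not what the task ships with:

import numpy as np
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, label_ranking_average_precision_score
from sklearn.multioutput import MultiOutputClassifier


def evaluate_classifier(embeddings_train, y_train, embeddings_test, y_test, classifier):
    # Fit a clone so repeated experiments never reuse a trained estimator.
    classifier = clone(classifier)
    classifier.fit(embeddings_train, y_train)
    y_pred = classifier.predict(embeddings_test)
    return {
        "accuracy": classifier.score(embeddings_test, y_test),
        "f1": f1_score(y_test, y_pred, average="macro"),
        "lrap": label_ranking_average_precision_score(y_test, y_pred),
    }


rng = np.random.default_rng(0)
X_train, X_test = rng.normal(size=(64, 8)), rng.normal(size=(32, 8))
y_train = rng.integers(0, 2, size=(64, 3))
y_test = rng.integers(0, 2, size=(32, 3))
clf = MultiOutputClassifier(LogisticRegression(max_iter=500))
print(evaluate_classifier(X_train, y_train, X_test, y_test, clf))
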
6 changes: 0 additions & 6 deletions mteb/abstasks/AbsTaskPairClassification.py
@@ -67,12 +67,6 @@ class AbsTaskPairClassification(AbsTask):

abstask_prompt = "Retrieve text that are semantically similar to the given text."

def __init__(self, **kwargs):
super().__init__(**kwargs)

def _add_main_score(self, scores: ScoresDict) -> None:
scores["main_score"] = scores[self.metadata.main_score]

def _evaluate_subset(
self,
model: Encoder,
3 changes: 0 additions & 3 deletions mteb/abstasks/AbsTaskReranking.py
@@ -33,9 +33,6 @@
class AbsTaskReranking(AbsTaskRetrieval):
"""Abstract class for re-ranking experiments. This is mostly the same as the RetrievalEvaluator, but here to adapt the old format to the new format. TODO: update these tasks to the new format and delete this class."""

def __init__(self, **kwargs):
super(AbsTaskRetrieval, self).__init__(**kwargs)

def load_data(self, **kwargs):
if self.data_loaded:
return
19 changes: 10 additions & 9 deletions mteb/abstasks/AbsTaskSummarization.py
@@ -4,6 +4,7 @@
from typing import Any

import numpy as np
from datasets import Dataset

from mteb.encoder_interface import Encoder
from mteb.load_results.task_results import ScoresDict
@@ -75,13 +76,10 @@ class AbsTaskSummarization(AbsTask):
relevance: list[float] (the score of the machine generated summaries)
"""

evalutor = SummarizationEvaluator
abstask_prompt = (
"Given a news summary, retrieve other semantically similar summaries."
)

def __init__(self, **kwargs):
super().__init__(**kwargs)
evaluator = SummarizationEvaluator

@property
def min_score(self):
@@ -92,13 +90,19 @@
return self.metadata_dict["max_score"]

def _evaluate_subset(
self, model: Encoder, data_split, *, encode_kwargs: dict[str, Any], **kwargs
self,
model: Encoder,
data_split: Dataset,
*,
encode_kwargs: dict[str, Any],
**kwargs,
) -> ScoresDict:
normalized_scores = [
(np.array(x) - self.min_score) / (self.max_score - self.min_score)
for x in data_split["relevance"]
]
evaluator = self.evalutor(
# SummEval has DeprecatedSummarizationEvaluator
evaluator = self.evaluator(
machine_summaries=data_split["machine_summaries"],
human_summaries=data_split["human_summaries"],
texts=data_split["text"],
@@ -110,9 +114,6 @@ def _add_main_score(self, scores: ScoresDict) -> None:
self._add_main_score(scores)
return scores

def _add_main_score(self, scores: ScoresDict) -> None:
scores["main_score"] = scores[self.metadata.main_score]

def _calculate_metrics_from_split(
self, split: str, hf_subset: str | None = None, compute_overall: bool = False
) -> SummarizationDescriptiveStatistics:
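The summarization evaluation above rescales human relevance scores into [0, 1] using the task's declared min_score and max_score before passing them to the evaluator. A tiny worked example of that normalization with made-up bounds and ratings:

import numpy as np

min_score, max_score = 1, 5         # hypothetical task bounds
relevance = [[1, 3, 5], [2, 4, 4]]  # per-document machine-summary ratings

normalized_scores = [
    (np.array(x) - min_score) / (max_score - min_score) for x in relevance
]
print(normalized_scores)  # [array([0. , 0.5, 1. ]), array([0.25, 0.75, 0.75])]
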