Add giga embeddings #1741

Open
wants to merge 6 commits into main
65 changes: 65 additions & 0 deletions mteb/models/instruct_wrapper.py
@@ -6,7 +6,9 @@

import numpy as np
import torch
from sentence_transformers import SentenceTransformer

import mteb
from mteb.encoder_interface import PromptType

from .wrapper import Wrapper
@@ -78,3 +80,66 @@ def encode(
            return embeddings

    return InstructWrapper(model_name_or_path, mode, instruction_template, **kwargs)


class InstructSentenceTransformerWrapper(Wrapper):
    def __init__(
        self,
        model_name: str,
        revision: str,
        instruction_template: str | Callable[[str], str] | None = None,
        max_seq_length: int | None = None,
        apply_instruction_to_passages: bool = True,
        **kwargs: Any,
    ):
        if (
            isinstance(instruction_template, str)
            and "{instruction}" not in instruction_template
        ):
            raise ValueError(
                "Instruction template must contain the string '{instruction}'."
            )
        if instruction_template is None:
            logger.warning(
                "No instruction template provided. Instructions will be used as-is."
            )

        self.model_name = model_name
        self.model = SentenceTransformer(model_name, revision=revision, **kwargs)
        self.instruction_template = instruction_template
        self.apply_instruction_to_passages = apply_instruction_to_passages
        if max_seq_length is not None:
            self.model.max_seq_length = max_seq_length

    def encode(
        self,
        sentences: Sequence[str],
        *,
        task_name: str,
        prompt_type: PromptType | None = None,
        **kwargs: Any,
    ) -> np.ndarray:
        task = mteb.get_task(task_name=task_name)
        instruction = self.get_instruction(task_name, prompt_type)
        if self.instruction_template:
            instruction = self.format_instruction(instruction, prompt_type)

        # skip the instruction for passages of s2p tasks when the model does not use passage prompts
        if (
            not self.apply_instruction_to_passages
            and prompt_type == PromptType.passage
            and task.metadata.category == "s2p"
        ):
            instruction = None
Comment on lines +127 to +133
Contributor:
Why?

Collaborator (Author):
Similar to jasper and nv-embed, this model doesn't use a prompt for passages. I think it could be helpful to add this check to the base class.
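
A minimal sketch of what such a shared check might look like if hoisted into the base Wrapper class, as suggested above (hypothetical: the function name, signature, and placement are assumptions and not part of this diff):

from __future__ import annotations

from mteb.encoder_interface import PromptType


# Hypothetical shared helper (name and signature are assumptions, not part of this PR).
def maybe_drop_passage_instruction(
    instruction: str | None,
    prompt_type: PromptType | None,
    task_category: str,
    apply_instruction_to_passages: bool,
) -> str | None:
    # Return None for passages of s2p tasks when the model does not use passage prompts.
    if (
        not apply_instruction_to_passages
        and prompt_type == PromptType.passage
        and task_category == "s2p"
    ):
        return None
    return instruction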


        logger.info(f"Using instruction: '{instruction}' for task: '{task_name}'")
        embeddings = self.model.encode(
            sentences,
            prompt=instruction,
            **kwargs,
        )

        if isinstance(embeddings, torch.Tensor):
            # kwargs may request tensor output (e.g. convert_to_tensor=True); convert back to a numpy array
            embeddings = embeddings.cpu().detach().float().numpy()
        return embeddings
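
For reference, a hedged sketch of how the new wrapper might be used directly. The constructor arguments mirror the giga_embeddings ModelMeta added later in this PR; the example sentence and task name are illustrative only:

import torch

from mteb.encoder_interface import PromptType
from mteb.models.instruct_wrapper import InstructSentenceTransformerWrapper

# Arguments mirror the ModelMeta defined in mteb/models/ru_sentence_models.py below.
model = InstructSentenceTransformerWrapper(
    model_name="ai-sage/Giga-Embeddings-instruct",
    revision="646f5ff3587e74a18141c8d6b60d1cffd5897b92",
    instruction_template="Instruct: {instruction}\nQuery: ",
    apply_instruction_to_passages=False,
    trust_remote_code=True,
    model_kwargs={"torch_dtype": torch.float16},
)

# encode() resolves the task instruction via the Wrapper base class and passes it
# to SentenceTransformer.encode as the prompt.
embeddings = model.encode(
    ["What are text embeddings?"],  # example query only
    task_name="RuBQRetrieval",  # example task name only
    prompt_type=PromptType.query,
)
print(embeddings.shape)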
2 changes: 1 addition & 1 deletion mteb/models/jasper_models.py
@@ -44,7 +44,7 @@ def encode(
        instruction = self.get_task_instruction(task_name, prompt_type)

        # to passage prompts won't be applied to passages
-       if prompt_type == PromptType.passage and task.metadata.type == "s2p":
+       if prompt_type == PromptType.passage and task.metadata.category == "s2p":
            instruction = None

        embeddings = self.model.encode(
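
Context for this one-line change: in mteb's TaskMetadata, type holds the task type (e.g. "Retrieval"), while category encodes the input format ("s2s", "s2p", "p2p"), so a comparison against "s2p" only matches on category. A quick way to check, using an example task name:

import mteb

task = mteb.get_task(task_name="RuBQRetrieval")  # example s2p retrieval task
print(task.metadata.type)      # expected: "Retrieval"
print(task.metadata.category)  # expected: "s2p"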
31 changes: 31 additions & 0 deletions mteb/models/ru_sentence_models.py
@@ -4,7 +4,10 @@

from functools import partial

import torch

from mteb.model_meta import ModelMeta, sentence_transformers_loader
from mteb.models.instruct_wrapper import InstructSentenceTransformerWrapper

rubert_tiny2 = ModelMeta(
    name="cointegrated/rubert-tiny2",
@@ -236,3 +239,31 @@
    release_date="2024-07-29",
    use_instructions=True,
)

giga_embeddings = ModelMeta(
    loader=partial(
        InstructSentenceTransformerWrapper,
        model_name="ai-sage/Giga-Embeddings-instruct",
        revision="646f5ff3587e74a18141c8d6b60d1cffd5897b92",
        trust_remote_code=True,
        instruction_template="Instruct: {instruction}\nQuery: ",
        apply_instruction_to_passages=False,
        model_kwargs={
            "torch_dtype": torch.float16,
        },
    ),
    name="ai-sage/Giga-Embeddings-instruct",
    languages=["eng_Latn", "rus_Cyrl"],
    open_weights=True,
    revision="646f5ff3587e74a18141c8d6b60d1cffd5897b92",
    release_date="2024-12-13",
    n_parameters=2_530_000_000,
    memory_usage=None,
    embed_dim=2048,
    license="mit",
    max_tokens=32768,
    reference="https://huggingface.co/ai-sage/Giga-Embeddings-instruct",
    similarity_fn_name="cosine",
    framework=["Sentence Transformers", "PyTorch"],
    use_instructions=True,
)
Comment on lines +268 to +269
Contributor:
Can we add the training data annotation as well? (We are going through models and adding that.)

See #1561.

Collaborator (Author):
They haven't published a report yet, so I don't know anything about the training dataset.
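
Once the ModelMeta above is registered, the model should be loadable through the usual mteb entry points. A minimal sketch, assuming the standard mteb evaluation API (the task selection is only an example):

import mteb

# Load the model via its registered ModelMeta (downloads the weights from the Hub).
model = mteb.get_model("ai-sage/Giga-Embeddings-instruct")

# Evaluate on an example Russian retrieval task; any MTEB task selection works.
tasks = mteb.get_tasks(tasks=["RuBQRetrieval"])
evaluation = mteb.MTEB(tasks=tasks)
results = evaluation.run(model, output_folder="results/giga-embeddings")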
