Add giga embeddings #1741

Open
wants to merge 6 commits into main
65 changes: 65 additions & 0 deletions mteb/models/instruct_wrapper.py
@@ -6,7 +6,9 @@

import numpy as np
import torch
from sentence_transformers import SentenceTransformer

import mteb
from mteb.encoder_interface import PromptType

from .wrapper import Wrapper
@@ -78,3 +80,66 @@ def encode(
            return embeddings

    return InstructWrapper(model_name_or_path, mode, instruction_template, **kwargs)


class InstructSentenceTransformerWrapper(Wrapper):
    def __init__(
        self,
        model_name: str,
        revision: str,
        instruction_template: str | Callable[[str], str] | None = None,
        max_seq_length: int | None = None,
        apply_instruction_to_passages: bool = True,
        **kwargs: Any,
    ):
        if (
            isinstance(instruction_template, str)
            and "{instruction}" not in instruction_template
        ):
            raise ValueError(
                "Instruction template must contain the string '{instruction}'."
            )
        if instruction_template is None:
            logger.warning(
                "No instruction template provided. Instructions will be used as-is."
            )

        self.model_name = model_name
        self.model = SentenceTransformer(model_name, revision=revision, **kwargs)
        self.instruction_template = instruction_template
        self.apply_instruction_to_passages = apply_instruction_to_passages
        if max_seq_length is not None:
            self.model.max_seq_length = max_seq_length

    def encode(
        self,
        sentences: Sequence[str],
        *,
        task_name: str,
        prompt_type: PromptType | None = None,
        **kwargs: Any,
    ) -> np.ndarray:
        task = mteb.get_task(task_name=task_name)
        instruction = self.get_instruction(task_name, prompt_type)
        if self.instruction_template:
            instruction = self.format_instruction(instruction, prompt_type)

        # skip the instruction for passages of s2p tasks when the model does not use passage prompts
        if (
            not self.apply_instruction_to_passages
            and prompt_type == PromptType.passage
            and task.metadata.category == "s2p"
        ):
            instruction = None
Comment on lines +127 to +133
Contributor:
Why?

Collaborator (Author):
Similar to jasper and nv-embed, this model doesn't use a prompt for passages. I think it could be helpful to add this check to the base class.
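
A minimal sketch of what such a shared check might look like if hoisted into the base Wrapper class, as suggested above (hypothetical: the function name, signature, and placement are assumptions and not part of this diff):

from __future__ import annotations

from mteb.encoder_interface import PromptType


# Hypothetical shared helper (name and signature are assumptions, not part of this PR).
def maybe_drop_passage_instruction(
    instruction: str | None,
    prompt_type: PromptType | None,
    task_category: str,
    apply_instruction_to_passages: bool,
) -> str | None:
    # Return None for passages of s2p tasks when the model does not use passage prompts.
    if (
        not apply_instruction_to_passages
        and prompt_type == PromptType.passage
        and task_category == "s2p"
    ):
        return None
    return instruction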


        logger.info(f"Using instruction: '{instruction}' for task: '{task_name}'")
        embeddings = self.model.encode(
            sentences,
            prompt=instruction,
            **kwargs,
        )

        if isinstance(embeddings, torch.Tensor):
            # kwargs may request tensor output (e.g. convert_to_tensor=True); convert back to a numpy array
            embeddings = embeddings.cpu().detach().float().numpy()
        return embeddings
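
For reference, a hedged sketch of how the new wrapper might be used directly. The constructor arguments mirror the giga_embeddings ModelMeta added later in this PR; the example sentence and task name are illustrative only:

import torch

from mteb.encoder_interface import PromptType
from mteb.models.instruct_wrapper import InstructSentenceTransformerWrapper

# Arguments mirror the ModelMeta defined in mteb/models/ru_sentence_models.py below.
model = InstructSentenceTransformerWrapper(
    model_name="ai-sage/Giga-Embeddings-instruct",
    revision="646f5ff3587e74a18141c8d6b60d1cffd5897b92",
    instruction_template="Instruct: {instruction}\nQuery: ",
    apply_instruction_to_passages=False,
    trust_remote_code=True,
    model_kwargs={"torch_dtype": torch.float16},
)

# encode() resolves the task instruction via the Wrapper base class and passes it
# to SentenceTransformer.encode as the prompt.
embeddings = model.encode(
    ["What are text embeddings?"],  # example query only
    task_name="RuBQRetrieval",  # example task name only
    prompt_type=PromptType.query,
)
print(embeddings.shape)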
2 changes: 1 addition & 1 deletion mteb/models/jasper_models.py
@@ -44,7 +44,7 @@ def encode(
        instruction = self.get_task_instruction(task_name, prompt_type)

        # to passage prompts won't be applied to passages
-       if prompt_type == PromptType.passage and task.metadata.type == "s2p":
+       if prompt_type == PromptType.passage and task.metadata.category == "s2p":
            instruction = None

        embeddings = self.model.encode(
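
Context for this one-line change: in mteb's TaskMetadata, type holds the task type (e.g. "Retrieval"), while category encodes the input format ("s2s", "s2p", "p2p"), so a comparison against "s2p" only matches on category. A quick way to check, using an example task name:

import mteb

task = mteb.get_task(task_name="RuBQRetrieval")  # example s2p retrieval task
print(task.metadata.type)      # expected: "Retrieval"
print(task.metadata.category)  # expected: "s2p"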
31 changes: 31 additions & 0 deletions mteb/models/ru_sentence_models.py
@@ -4,7 +4,10 @@

from functools import partial

import torch

from mteb.model_meta import ModelMeta, sentence_transformers_loader
from mteb.models.instruct_wrapper import InstructSentenceTransformerWrapper

rubert_tiny2 = ModelMeta(
    name="cointegrated/rubert-tiny2",
@@ -236,3 +239,31 @@
    release_date="2024-07-29",
    use_instructions=True,
)

giga_embeddings = ModelMeta(
    loader=partial(
        InstructSentenceTransformerWrapper,
        model_name="ai-sage/Giga-Embeddings-instruct",
        revision="646f5ff3587e74a18141c8d6b60d1cffd5897b92",
        trust_remote_code=True,
        instruction_template="Instruct: {instruction}\nQuery: ",
        apply_instruction_to_passages=False,
        model_kwargs={
            "torch_dtype": torch.float16,
        },
    ),
    name="ai-sage/Giga-Embeddings-instruct",
    languages=["eng_Latn", "rus_Cyrl"],
    open_weights=True,
    revision="646f5ff3587e74a18141c8d6b60d1cffd5897b92",
    release_date="2024-12-13",
    n_parameters=2_530_000_000,
    memory_usage=None,
    embed_dim=2048,
    license="mit",
    max_tokens=32768,
    reference="https://huggingface.co/ai-sage/Giga-Embeddings-instruct",
    similarity_fn_name="cosine",
    framework=["Sentence Transformers", "PyTorch"],
    use_instructions=True,
)
Comment on lines +268 to +269
Contributor:
Can we add the training data annotation as well? (We are going through models and adding that.)

See #1561.

Collaborator (Author):
They haven't published a report yet, so I don't know anything about the training dataset.
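
Once the ModelMeta above is registered, the model should be loadable through the usual mteb entry points. A minimal sketch, assuming the standard mteb evaluation API (the task selection is only an example):

import mteb

# Load the model via its registered ModelMeta (downloads the weights from the Hub).
model = mteb.get_model("ai-sage/Giga-Embeddings-instruct")

# Evaluate on an example Russian retrieval task; any MTEB task selection works.
tasks = mteb.get_tasks(tasks=["RuBQRetrieval"])
evaluation = mteb.MTEB(tasks=tasks)
results = evaluation.run(model, output_folder="results/giga-embeddings")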
