jina-ai · bwanglzu · Mar 22, 2024 · Mar 22, 2024 · Mar 22, 2024
diff --git a/colbert/infra/config/settings.py b/colbert/infra/config/settings.py
@@ -1,5 +1,6 @@
 import os
 import torch
+from typing import Iterable
 
 import __main__
 from dataclasses import dataclass
@@ -155,6 +156,14 @@ class TrainingSettings:
 
     model_name: str = DefaultVal(None) # DefaultVal('bert-base-uncased')
 
+    mrl: bool = DefaultVal(False)  # when true, training sums one weighted loss per entry of mrl_dims
+
+    mrl_dims: Iterable[int] = (16, 32, 64, 128, 256, 512, 768)  # slice sizes; paired with mrl_weights via zip()
+
+    mrl_weights: Iterable[int] = (1, 1, 1, 1, 1, 1, 1)  # multiplier applied to the loss of the matching mrl_dims entry
+
+    wandb_project: str = DefaultVal('jina-colbert')  # project name handed to wandb.init(project=...)
+
 @dataclass
 class IndexingSettings:
     index_path: str = DefaultVal(None)

diff --git a/colbert/modeling/colbert.py b/colbert/modeling/colbert.py
@@ -28,6 +28,9 @@ def __init__(self, name='bert-base-uncased', colbert_config=None):
                              for symbol in string.punctuation
                              for w in [symbol, self.raw_tokenizer.encode(symbol, add_special_tokens=False)[0]]}
         self.pad_token = self.raw_tokenizer.pad_token_id
+        # Always define both attributes (None when MRL is off) so compute_ib_loss can test them safely.
+        self.mrl_dims = colbert_config.mrl_dims if colbert_config.mrl else None
+        self.mrl_weights = colbert_config.mrl_weights if colbert_config.mrl else None
 
 
     @classmethod
@@ -66,6 +69,7 @@ def forward(self, Q, D):
 
     def compute_ib_loss(self, Q, D, D_mask):
         # TODO: Organize the code below! Quite messy.
+        loss = 0.0
         scores = (D.unsqueeze(0) @ Q.permute(0, 2, 1).unsqueeze(1)).flatten(0, 1)  # query-major unsqueeze
 
         scores = colbert_score_reduce(scores, D_mask.repeat(Q.size(0), 1, 1), self.colbert_config)
@@ -80,7 +84,15 @@ def compute_ib_loss(self, Q, D, D_mask):
 
         labels = torch.arange(0, Q.size(0), device=scores.device) * (self.colbert_config.nway)
 
-        return torch.nn.CrossEntropyLoss()(scores, labels)
+        # getattr with a default: __init__ only assigns mrl_dims / mrl_weights when colbert_config.mrl is set.
+        if getattr(self, 'mrl_dims', None) and getattr(self, 'mrl_weights', None):
+            for dim, weight in zip(self.mrl_dims, self.mrl_weights):
+                # NOTE(review): [:dim] slices the leading axis of scores/labels — confirm this is the intended MRL truncation.
+                current_scores, current_labels = scores[:dim], labels[:dim]
+                loss += torch.nn.CrossEntropyLoss()(current_scores, current_labels) * weight
+        else:
+            loss = torch.nn.CrossEntropyLoss()(scores, labels)
+        return loss
 
     def query(self, input_ids, attention_mask):
         input_ids, attention_mask = input_ids.to(self.device), attention_mask.to(self.device)

diff --git a/colbert/training/training.py b/colbert/training/training.py
@@ -1,4 +1,3 @@
-import time
 import torch
 import random
 import torch.nn as nn
@@ -30,7 +29,7 @@ def train(config: ColBERTConfig, triples, queries=None, collection=None):
         wandb.login()
         run = wandb.init(
             # Set the project where this run will be logged
-            project="jina-colbert",
+            project=config.wandb_project,
             # Track hyperparameters and run metadata
             config=asdict(config),
             group="DDP",
@@ -55,7 +54,7 @@ def train(config: ColBERTConfig, triples, queries=None, collection=None):
         raise NotImplementedError()
 
     if not config.reranker:
-        colbert = ColBERT(name=config.checkpoint, colbert_config=config)
+        colbert = ColBERT(name=config.checkpoint, colbert_config=config) # ib loss in forward func
     else:
         colbert = ElectraReranker.from_pretrained(config.checkpoint)
 
@@ -82,7 +81,6 @@ def train(config: ColBERTConfig, triples, queries=None, collection=None):
     amp = MixedPrecisionManager(config.amp)
     labels = torch.zeros(config.bsize, dtype=torch.long, device=DEVICE)
 
-    start_time = time.time()
     train_loss = None
     train_loss_mu = 0.999
 
@@ -123,9 +121,24 @@ def train(config: ColBERTConfig, triples, queries=None, collection=None):
                     target_scores = torch.nn.functional.log_softmax(target_scores, dim=-1)
 
                     log_scores = torch.nn.functional.log_softmax(scores, dim=-1)
-                    loss = torch.nn.KLDivLoss(reduction='batchmean', log_target=True)(log_scores, target_scores)
+                    if config.mrl:
+                        loss = 0.0  # start each batch from zero; the `+=` below needs a bound value
+                        for dim, weight in zip(config.mrl_dims, config.mrl_weights):
+                            current_log_scores = log_scores[:dim]  # NOTE(review): slices the leading axis — confirm intent
+                            current_target_scores = target_scores[:dim]
+                            loss += torch.nn.KLDivLoss(reduction='batchmean', log_target=True)(current_log_scores, current_target_scores) * weight
+                    else:
+                        loss = torch.nn.KLDivLoss(reduction='batchmean', log_target=True)(log_scores, target_scores)
                 else:
-                    loss = nn.CrossEntropyLoss()(scores, labels[:scores.size(0)])
+                    label_scores = labels[:scores.size(0)]
+                    if config.mrl:
+                        loss = 0.0  # start each batch from zero; the `+=` below needs a bound value
+                        for dim, weight in zip(config.mrl_dims, config.mrl_weights):
+                            current_scores = scores[:dim]
+                            current_labels = label_scores[:dim]
+                            loss += nn.CrossEntropyLoss()(current_scores, current_labels) * weight
+                    else:
+                        loss = nn.CrossEntropyLoss()(scores, label_scores)
 
                 if config.use_ib_negatives:
                     if config.rank < 1: