From 4e518bf1ab8f05fa30bf925525790954af66856e Mon Sep 17 00:00:00 2001
From: Caglar Demir
Date: Mon, 25 Nov 2024 20:13:27 +0100
Subject: [PATCH 1/5] increment factor is the first batch size

---
 dicee/trainer/model_parallelism.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/dicee/trainer/model_parallelism.py b/dicee/trainer/model_parallelism.py
index 38ebd39d..5159b948 100644
--- a/dicee/trainer/model_parallelism.py
+++ b/dicee/trainer/model_parallelism.py
@@ -23,11 +23,13 @@ def extract_input_outputs(z: list, device=None):
     else:
         raise ValueError('Unexpected batch shape..')
 
-def find_good_batch_size(train_loader,ensemble_model,max_available_gpu_memory:float=0.05):
+def find_good_batch_size(train_loader,ensemble_model,max_available_gpu_memory:float=0.1):
     # () Initial batch size
     batch_size=train_loader.batch_size
+    first_batch_size = train_loader.batch_size
+
     print("Automatic batch size finding")
-    for n in range(200):
+    while True:
         # () Initialize a dataloader with a current batch_size
         train_dataloaders = torch.utils.data.DataLoader(train_loader.dataset,
                                                         batch_size=batch_size,
@@ -50,7 +52,7 @@ def find_good_batch_size(train_loader,ensemble_model,max_available_gpu_memory:fl
         # () Stepping criterion
         if available_gpu_memory > max_available_gpu_memory and batch_size < len(train_loader.dataset) :
             # Increment the current batch size
-            batch_size+=batch_size
+            batch_size+=first_batch_size
         else:
             if batch_size >= len(train_loader.dataset):
                 print("Batch size equals to the training dataset size")

From 4b1a8762d18c99fa26469b21c0ae5839a73512d9 Mon Sep 17 00:00:00 2001
From: Caglar Demir
Date: Mon, 25 Nov 2024 20:51:03 +0100
Subject: [PATCH 2/5] avg gpu usage measured over the last three batches

---
 dicee/trainer/model_parallelism.py | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/dicee/trainer/model_parallelism.py b/dicee/trainer/model_parallelism.py
index 5159b948..f331c3f0 100644
--- a/dicee/trainer/model_parallelism.py
+++ b/dicee/trainer/model_parallelism.py
@@ -23,7 +23,7 @@ def extract_input_outputs(z: list, device=None):
     else:
         raise ValueError('Unexpected batch shape..')
 
-def find_good_batch_size(train_loader,ensemble_model,max_available_gpu_memory:float=0.1):
+def find_good_batch_size(train_loader,ensemble_model, max_available_gpu_memory:float=0.1):
     # () Initial batch size
     batch_size=train_loader.batch_size
     first_batch_size = train_loader.batch_size
@@ -43,21 +43,24 @@ def find_good_batch_size(train_loader,ensemble_model,max_available_gpu_memory:fl
                                                         worker_init_fn=None,
                                                         persistent_workers=False)
         loss=None
+        avg_global_free_memory=[]
         for i, z in enumerate(train_dataloaders):
             loss = forward_backward_update_loss(z,ensemble_model)
-            break
-        global_free_memory, total_memory = torch.cuda.mem_get_info()
-        available_gpu_memory = global_free_memory / total_memory
-        print(f"Random Batch Loss: {loss}\tFree/Total GPU Memory: {available_gpu_memory}\tBatch Size:{batch_size}")
+            if i==3:
+                global_free_memory, total_memory = torch.cuda.mem_get_info()
+                avg_global_free_memory.append(global_free_memory / total_memory)
+                break
+        avg_global_free_memory=sum(avg_global_free_memory)/len(avg_global_free_memory)
+        print(f"Random Batch Loss: {loss}\tFree/Total GPU Memory: {avg_global_free_memory}\tBatch Size:{batch_size}")
         # () Stepping criterion
-        if available_gpu_memory > max_available_gpu_memory and batch_size < len(train_loader.dataset) :
+        if avg_global_free_memory > max_available_gpu_memory and batch_size < len(train_loader.dataset) :
             # Increment the current batch size
             batch_size+=first_batch_size
         else:
             if batch_size >= len(train_loader.dataset):
                 print("Batch size equals to the training dataset size")
             else:
-                print(f"Max GPU memory used\tFree/Total GPU Memory:{available_gpu_memory}")
+                print(f"Max GPU memory used\tFree/Total GPU Memory:{avg_global_free_memory}")
         return batch_size
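Note on PATCH 1: replacing the doubling step batch_size+=batch_size with batch_size+=first_batch_size turns the search from exponential into linear growth. A hypothetical illustration of the difference, assuming an initial batch size of 1024 (the number itself is not from the diff):

first_batch_size = 1024  # hypothetical initial batch size

# Old behaviour: batch_size += batch_size doubles every step.
exponential = [first_batch_size * 2 ** n for n in range(6)]
# New behaviour: batch_size += first_batch_size grows by a fixed step.
linear = [first_batch_size * (n + 1) for n in range(6)]

print(exponential)  # [1024, 2048, 4096, 8192, 16384, 32768]
print(linear)       # [1024, 2048, 3072, 4096, 5120, 6144]

Doubling overshoots the memory budget within a few trial steps, while the fixed increment probes the remaining headroom at a finer granularity, at the cost of more trial iterations.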
From a6e15b7110d5dab87ddc11143b0e04c773e65840 Mon Sep 17 00:00:00 2001
From: Caglar Demir
Date: Mon, 25 Nov 2024 20:51:26 +0100
Subject: [PATCH 3/5] dynamo import removed

---
 dicee/models/ensemble.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/dicee/models/ensemble.py b/dicee/models/ensemble.py
index e6c1e5de..83df1a3e 100644
--- a/dicee/models/ensemble.py
+++ b/dicee/models/ensemble.py
@@ -1,11 +1,6 @@
 import torch
 import copy
-import torch._dynamo
-
-torch._dynamo.config.suppress_errors = True
-
-
 class EnsembleKGE:
     def __init__(self, seed_model):
         self.models = []

From 4e89b1d9f3dc78a83614d2339aa6de24c120e4cc Mon Sep 17 00:00:00 2001
From: Caglar Demir
Date: Tue, 26 Nov 2024 09:55:23 +0100
Subject: [PATCH 4/5] exponential batch size increment is reduced to linear

---
 dicee/models/ensemble.py           |  3 +++
 dicee/trainer/model_parallelism.py | 10 +++++-----
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/dicee/models/ensemble.py b/dicee/models/ensemble.py
index 83df1a3e..072f91bd 100644
--- a/dicee/models/ensemble.py
+++ b/dicee/models/ensemble.py
@@ -82,6 +82,9 @@ def __call__(self,x_batch):
     def step(self):
         for opt in self.optimizers:
             opt.step()
+
+    def get_embeddings(self):
+        raise NotImplementedError("Not yet Implemented")
 
     """
     def __getattr__(self, name):

diff --git a/dicee/trainer/model_parallelism.py b/dicee/trainer/model_parallelism.py
index f331c3f0..ffea1a46 100644
--- a/dicee/trainer/model_parallelism.py
+++ b/dicee/trainer/model_parallelism.py
@@ -26,6 +26,8 @@ def extract_input_outputs(z: list, device=None):
 def find_good_batch_size(train_loader,ensemble_model, max_available_gpu_memory:float=0.1):
     # () Initial batch size
     batch_size=train_loader.batch_size
+    if batch_size >= len(train_loader.dataset):
+        return batch_size
     first_batch_size = train_loader.batch_size
 
     print("Automatic batch size finding")
@@ -46,10 +48,11 @@ def find_good_batch_size(train_loader,ensemble_model, max_available_gpu_memory:f
         avg_global_free_memory=[]
         for i, z in enumerate(train_dataloaders):
             loss = forward_backward_update_loss(z,ensemble_model)
+            global_free_memory, total_memory = torch.cuda.mem_get_info()
+            avg_global_free_memory.append(global_free_memory / total_memory)
             if i==3:
-                global_free_memory, total_memory = torch.cuda.mem_get_info()
-                avg_global_free_memory.append(global_free_memory / total_memory)
                 break
+
         avg_global_free_memory=sum(avg_global_free_memory)/len(avg_global_free_memory)
         print(f"Random Batch Loss: {loss}\tFree/Total GPU Memory: {avg_global_free_memory}\tBatch Size:{batch_size}")
         # () Stepping criterion
@@ -61,11 +64,8 @@ def find_good_batch_size(train_loader,ensemble_model, max_available_gpu_memory:f
                 print("Batch size equals to the training dataset size")
             else:
                 print(f"Max GPU memory used\tFree/Total GPU Memory:{avg_global_free_memory}")
         return batch_size
-
-    raise RuntimeError("The computation should be here!")
-
 def forward_backward_update_loss(z:Tuple, ensemble_model):
     # () Get the i-th batch of data points.
     x_batch, y_batch = extract_input_outputs(z)
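After PATCH 4, find_good_batch_size records the free/total GPU memory ratio after each of the first four trial batches, averages them, and keeps growing the batch size linearly until either the memory budget or the dataset size is reached. A minimal standalone sketch of that final control flow; the step_fn callback stands in for forward_backward_update_loss, and the function below is a simplified reading of the diffs, not code from the repository:

import torch

def find_good_batch_size_sketch(dataset, first_batch_size, step_fn,
                                max_available_gpu_memory: float = 0.1):
    # Early exit mirrors PATCH 4's guard: nothing to search if one batch
    # already covers the whole dataset.
    batch_size = first_batch_size
    if batch_size >= len(dataset):
        return batch_size
    while True:
        loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                             shuffle=True)
        free_ratios = []
        for i, z in enumerate(loader):
            step_fn(z)  # one forward/backward/update on a trial batch
            free, total = torch.cuda.mem_get_info()
            free_ratios.append(free / total)
            if i == 3:  # average over the first few trial batches (PATCH 2/4)
                break
        avg_free = sum(free_ratios) / len(free_ratios)
        if avg_free > max_available_gpu_memory and batch_size < len(dataset):
            batch_size += first_batch_size  # linear growth (PATCH 1)
        else:
            return batch_size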
From 0417f1924b77c1eaec864aa3f9b1f143fa70b361 Mon Sep 17 00:00:00 2001
From: Caglar Demir
Date: Tue, 26 Nov 2024 10:42:37 +0100
Subject: [PATCH 5/5] embeddings can be concatenated horizontally for csv

---
 dicee/models/ensemble.py | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/dicee/models/ensemble.py b/dicee/models/ensemble.py
index 072f91bd..34f41059 100644
--- a/dicee/models/ensemble.py
+++ b/dicee/models/ensemble.py
@@ -8,13 +8,13 @@ def __init__(self, seed_model):
         self.loss_history = []
         for i in range(torch.cuda.device_count()):
             i_model=copy.deepcopy(seed_model)
-            i_model.to(torch.device(f"cuda:{i}"))
             # TODO: Why we cant send the compile model to cpu ?
-            # i_model = torch.compile(i_model)
+            i_model = torch.compile(i_model)
+            i_model.to(torch.device(f"cuda:{i}"))
             self.optimizers.append(i_model.configure_optimizers())
             self.models.append(i_model)
         # Maybe use the original model's name ?
-        self.name="TP_"+self.models[0].name
+        self.name=self.models[0].name
         self.train_mode=True
 
     def named_children(self):
@@ -84,8 +84,23 @@ def step(self):
             opt.step()
 
     def get_embeddings(self):
-        raise NotImplementedError("Not yet Implemented")
-
+        entity_embeddings=[]
+        relation_embeddings=[]
+        # () Iterate
+        for trained_model in self.models:
+            entity_emb, relation_ebm = trained_model.get_embeddings()
+            entity_embeddings.append(entity_emb)
+            if relation_ebm is not None:
+                relation_embeddings.append(relation_ebm)
+        # () Concat the embedding vectors horizontally.
+        entity_embeddings=torch.cat(entity_embeddings,dim=1)
+        if relation_embeddings:
+            relation_embeddings=torch.cat(relation_embeddings,dim=1)
+        else:
+            relation_embeddings=None
+
+        return entity_embeddings, relation_embeddings
+
     """
    def __getattr__(self, name):
        # Create a function that will call the same attribute/method on each model
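The horizontal concatenation in PATCH 5's get_embeddings means an ensemble trained on N GPUs yields a single entity matrix of shape (num_entities, N*d) rather than N separate matrices, which is what makes a one-row-per-entity CSV export possible. A toy shape check (random values; the pandas line is a hypothetical usage, not part of the diff):

import torch

# Three per-GPU entity embedding matrices of width d, concatenated along
# dim=1 as in EnsembleKGE.get_embeddings, give one (num_entities, 3*d) matrix.
num_entities, d, num_gpus = 5, 4, 3
per_gpu = [torch.randn(num_entities, d) for _ in range(num_gpus)]
entity_embeddings = torch.cat(per_gpu, dim=1)
assert entity_embeddings.shape == (num_entities, num_gpus * d)
# A row-per-entity layout writes naturally to CSV, e.g. (hypothetical):
# pandas.DataFrame(entity_embeddings.numpy()).to_csv("entity_embeddings.csv")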