Adapt continuation under mp

ratschlab · Nov 6, 2024 · a50ae9e · a50ae9e
1 parent 922f6f1
commit a50ae9e
Showing 1 changed file with 6 additions and 2 deletions.
diff --git a/src/nanotron/data/petagraph_dataset.py b/src/nanotron/data/petagraph_dataset.py
@@ -618,15 +618,19 @@ def __init__(self,
             restart_consumed_files_set = set(restart_consumed_files)
             for f in restart_consumed_files_set:
                 url_list.remove(f)
-            url_list.extend(restart_consumed_files)
+
+            # For now we don't append the consumed files to the end of the url_list,
+            # because in the multiprocessing setting we index into arbitrary positions
+            # and we don't want to index into the consumed files.
+            # url_list.extend(restart_consumed_files)
 
             # Add the consumed files to the consumed files set
             self.consumed_files = set(restart_consumed_files)
 
             # Set the current epoch to the restart epoch
             self.current_epoch = restart_epoch
 
-            log_msg = f"[PetaGAdd lockaphStreamDataset:{self.rank}] Restarting from epoch {self.current_epoch} with {len(self.consumed_files)} files"
+            log_msg = f"[PetaGraphStreamDataset:{self.rank}] Restarting from epoch {self.current_epoch} with {len(self.consumed_files)} files, {len(url_list)} files left"
             log_rank(log_msg, logger=logger, level=logging.INFO, rank=self.rank)
         else:
             self.consumed_files = set()