Skip to content

Commit

Permalink
Adapt continuation under mp
Browse files Browse the repository at this point in the history
  • Loading branch information
manuelburger committed Nov 6, 2024
1 parent 922f6f1 commit a50ae9e
Showing 1 changed file with 6 additions and 2 deletions.
8 changes: 6 additions & 2 deletions src/nanotron/data/petagraph_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -618,15 +618,19 @@ def __init__(self,
restart_consumed_files_set = set(restart_consumed_files)
for f in restart_consumed_files_set:
url_list.remove(f)
url_list.extend(restart_consumed_files)

# For now we don't append the consumed files to the end of the url_list,
# because in the multiprocessing setting we index into arbitrary positions
# and we don't want to index into the consumed files
# url_list.extend(restart_consumed_files)

# Add the consumed files to the consumed files set
self.consumed_files = set(restart_consumed_files)

# Set the current epoch to the restart epoch
self.current_epoch = restart_epoch

log_msg = f"[PetaGAdd lockaphStreamDataset:{self.rank}] Restarting from epoch {self.current_epoch} with {len(self.consumed_files)} files"
log_msg = f"[PetaGraphStreamDataset:{self.rank}] Restarting from epoch {self.current_epoch} with {len(self.consumed_files)} files, {len(url_list)} files left"
log_rank(log_msg, logger=logger, level=logging.INFO, rank=self.rank)
else:
self.consumed_files = set()
Expand Down

0 comments on commit a50ae9e

Please sign in to comment.