From a9d4dbd85639d32ca385a871f86781949de446b7 Mon Sep 17 00:00:00 2001 From: Manuel Burger Date: Wed, 6 Nov 2024 09:35:42 +0100 Subject: [PATCH] Fix empty and shuffle --- src/nanotron/data/petagraph_dataset.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/nanotron/data/petagraph_dataset.py b/src/nanotron/data/petagraph_dataset.py index e9522e09..df4bc824 100644 --- a/src/nanotron/data/petagraph_dataset.py +++ b/src/nanotron/data/petagraph_dataset.py @@ -792,7 +792,7 @@ def fasta_parsing_func(self, input_data: Tuple[str, bytes]): """ path, data = input_data if data is None: - return [[]] + return [("", "")] sequences = [] decoded_lines = data.decode() @@ -815,12 +815,15 @@ def fasta_parsing_func(self, input_data: Tuple[str, bytes]): # Test outputs if len(keep_sequences) == 0: - return [[]] + return [("", "")] assert isinstance(keep_sequences, list) assert isinstance(keep_sequences[0], tuple) and len(keep_sequences[0]) == 2 assert isinstance(keep_sequences[0][0], str) and isinstance(keep_sequences[0][1], str) + # Shuffle the sequences + random.shuffle(keep_sequences) + return keep_sequences def crop_maxlen(self, input_sequence: str, maxlen: int = None):