Skip to content

Commit

Permalink
Fix empty and shuffle
Browse files Browse the repository at this point in the history
  • Loading branch information
manuelburger committed Nov 6, 2024
1 parent 729a8d7 commit a9d4dbd
Showing 1 changed file with 5 additions and 2 deletions.
7 changes: 5 additions & 2 deletions src/nanotron/data/petagraph_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -792,7 +792,7 @@ def fasta_parsing_func(self, input_data: Tuple[str, bytes]):
"""
path, data = input_data
if data is None:
- return [[]]
+ return [("", "")]

sequences = []
decoded_lines = data.decode()
Expand All @@ -815,12 +815,15 @@ def fasta_parsing_func(self, input_data: Tuple[str, bytes]):

# Test outputs
if len(keep_sequences) == 0:
- return [[]]
+ return [("", "")]

assert isinstance(keep_sequences, list)
assert isinstance(keep_sequences[0], tuple) and len(keep_sequences[0]) == 2
assert isinstance(keep_sequences[0][0], str) and isinstance(keep_sequences[0][1], str)

+ # Shuffle the sequences
+ random.shuffle(keep_sequences)

return keep_sequences

def crop_maxlen(self, input_sequence: str, maxlen: int = None):
Expand Down

0 comments on commit a9d4dbd

Please sign in to comment.