diff --git a/src/nanotron/data/petagraph_dataset.py b/src/nanotron/data/petagraph_dataset.py index 23bfae2c..630d6b91 100644 --- a/src/nanotron/data/petagraph_dataset.py +++ b/src/nanotron/data/petagraph_dataset.py @@ -378,7 +378,7 @@ def fasta_parsing_func(self, input_data: Tuple[str, bytes]): decoded_lines = data.decode() sequences = [str(s.seq) for s in SeqIO.parse(StringIO(decoded_lines), "fasta")] - # make sure only ALPHABET + # make sure only ALPHABET; TODO: align with training vocabulary and allow "N" to pass through sequences = ["".join([c for c in s if c in ALPHABET]) for s in sequences] # Chop sequences in preparation for graph traversal