Support chunking of paired-end FASTA

See marcelm/cutadapt#746
marcelm · Dec 11, 2023 · c8df0fb · c8df0fb
1 parent a3147cc
commit c8df0fb
Show file tree

Hide file tree

Showing 2 changed files with 129 additions and 7 deletions.
diff --git a/src/dnaio/chunks.py b/src/dnaio/chunks.py
@@ -34,6 +34,35 @@ def _fasta_head(buf: bytes, end: Optional[int] = None) -> int:
     )
 
 
+def _paired_fasta_heads(
+    buf1: bytes, buf2: bytes, end1: int, end2: int
+) -> Tuple[int, int]:
+    """
+    Return positions pos1, pos2 where right1 <= end1 and right2 <= end2
+    such that buf1[:pos1] and buf2[:pos2] contain the same number of complete FASTA
+    records.
+    """
+    if end1 == 0 or end2 == 0:
+        return (0, 0)
+
+    if (end1 > 0 and buf1[:1] != b">") or (end2 > 0 and buf2[:1] != b">"):
+        raise FastaFormatError("FASTA file expected to start with '>'", line=None)
+
+    # Count complete records
+    n_records1 = buf1.count(b"\n>", 0, end1)
+    n_records2 = buf2.count(b"\n>", 0, end2)
+    print(f"{n_records1=} {n_records2=}")
+    n_records = min(n_records1, n_records2)
+
+    pos1 = pos2 = 0
+    while n_records > 0:
+        pos1 = buf1.find(b"\n>", pos1, end1) + 1
+        pos2 = buf2.find(b"\n>", pos2, end2) + 1
+        n_records -= 1
+
+    return (pos1, pos2)
+
+
 def _fastq_head(buf: bytes, end: Optional[int] = None) -> int:
     """
     Search for the end of the last complete *two* FASTQ records in buf[:end].
@@ -160,27 +189,51 @@ def read_paired_chunks(
     # Read one byte to make sure we are processing FASTQ
     start1 = f.readinto(memoryview(buf1)[0:1])
     start2 = f2.readinto(memoryview(buf2)[0:1])
-    if (start1 == 1 and buf1[0:1] != b"@") or (start2 == 1 and buf2[0:1] != b"@"):
+
+    if start1 == 0 and start2 == 0:
+        return memoryview(b""), memoryview(b"")
+
+    if (start1 == 0) != (start2 == 0):
+        i = 2 if start1 == 0 else 1
+        raise FileFormatError(
+            f"Paired-end reads not in sync: File with R{i} reads is empty and the other is not",
+            line=None,
+        )
+
+    if buf1[:1] == b"@" != buf2[:1] == b"@":
         raise FileFormatError(
             "Paired-end data must be in FASTQ format when using multiple cores",
             line=None,
         )
 
+    if buf1[:1] == b"@":
+        file_format = "FASTQ"
+        paired_heads = _paired_fastq_heads
+    elif buf1[:1] == b">":
+        file_format = "FASTA"
+        paired_heads = _paired_fasta_heads
+    else:
+        raise FileFormatError(
+            "First character in input file must be '@' (FASTQ) or '>' (FASTA), "
+            f"but found {buf1[:1]}",
+            line=None,
+        )
+
     while True:
         if start1 == len(buf1) and start2 == len(buf2):
             raise ValueError(
-                f"FASTQ records do not fit into buffer of size {buffer_size}"
+                f"FASTA/FASTQ records do not fit into buffer of size {buffer_size}"
             )
         bufend1 = f.readinto(memoryview(buf1)[start1:]) + start1  # type: ignore
         bufend2 = f2.readinto(memoryview(buf2)[start2:]) + start2  # type: ignore
         if start1 == bufend1 and start2 == bufend2:
             break
 
-        end1, end2 = _paired_fastq_heads(buf1, buf2, bufend1, bufend2)
+        end1, end2 = paired_heads(buf1, buf2, bufend1, bufend2)
         assert end1 <= bufend1
         assert end2 <= bufend2
 
-        if end1 > 0 or end2 > 0:
+        if end1 > 0 or end2 > 0 or file_format == "FASTA":
             yield (memoryview(buf1)[0:end1], memoryview(buf2)[0:end2])
         else:
             assert end1 == 0 and end2 == 0
@@ -189,7 +242,7 @@ def read_paired_chunks(
                 i = 1 if bufend1 == 0 else 2
                 extra = f". File {i} ended, but more data found in the other file"
             raise FileFormatError(
-                f"Premature end of paired FASTQ input{extra}.", line=None
+                f"Premature end of paired-end input{extra}.", line=None
             )
         start1 = bufend1 - end1
         assert start1 >= 0

diff --git a/tests/test_chunks.py b/tests/test_chunks.py
@@ -1,9 +1,18 @@
+import textwrap
+
 from pytest import raises
 from io import BytesIO
 
+import dnaio
 from dnaio import UnknownFileFormat, FileFormatError
 from dnaio._core import paired_fastq_heads
-from dnaio.chunks import _fastq_head, _fasta_head, read_chunks, read_paired_chunks
+from dnaio.chunks import (
+    _fastq_head,
+    _fasta_head,
+    read_chunks,
+    read_paired_chunks,
+    _paired_fasta_heads,
+)
 
 
 def test_fasta_head():
@@ -32,6 +41,52 @@ def test_fasta_head_with_comment():
     assert _fasta_head(b"#\n>3\n5\n>") == 7
 
 
+def test_paired_fasta_heads():
+    def pheads(buf1, buf2):
+        return _paired_fasta_heads(buf1, buf2, len(buf1), len(buf2))
+
+    assert pheads(b"", b"") == (0, 0)
+    assert pheads(b">r", b">r") == (0, 0)
+    assert pheads(b">r\nA\n>s", b">r") == (0, 0)
+    assert pheads(b">r\nA\n>s", b">r\nCT\n>s") == (5, 6)
+    assert pheads(b">r\nA\n>s\nG\n>t\n", b">r\nCT\n>s") == (5, 6)
+
+    buf1 = (
+        textwrap.dedent(
+            """
+        >1
+        a
+        b
+        >2
+        c
+        >3
+        uv
+        """
+        )
+        .strip()
+        .encode()
+    )
+    buf2 = (
+        textwrap.dedent(
+            """
+        >1
+        def
+        >2
+        gh
+        i
+        >3
+        """
+        )
+        .strip()
+        .encode()
+    )
+
+    assert pheads(buf1, buf2) == (
+        len(b">1\na\nb\n>2\nc\n"),
+        len(b">1\ndef\n>2\ngh\ni\n"),
+    )
+
+
 def test_paired_fastq_heads():
     buf1 = b"first\nsecond\nthird\nfourth\nfifth"
     buf2 = b"a\nb\nc\nd\ne\nf\ng"
@@ -67,13 +122,27 @@ def test_fastq_head():
     assert _fastq_head(b"A\nB\nC\nD\nE\nF\nG\nH\nI\n") == 16
 
 
-def test_read_paired_chunks():
+def test_read_paired_chunks_fastq():
     with open("tests/data/paired.1.fastq", "rb") as f1:
         with open("tests/data/paired.2.fastq", "rb") as f2:
             for c1, c2 in read_paired_chunks(f1, f2, buffer_size=128):
                 print(c1, c2)
 
 
+def test_paired_chunks_fasta(tmp_path):
+    for i in (1, 2):
+        with dnaio.open(f"tests/data/paired.{i}.fastq") as infile:
+            with dnaio.open(tmp_path / f"{i}.fasta", mode="w") as outfile:
+                for record in infile:
+                    record.qualities = None
+                    outfile.write(record)
+
+    with open(tmp_path / "1.fasta", "rb") as r1:
+        with open(tmp_path / "2.fasta", "rb") as r2:
+            for c1, c2 in read_paired_chunks(r1, r2, buffer_size=128):
+                print(c1.tobytes(), c2.tobytes())
+
+
 def test_paired_chunks_different_number_of_records():
     record = b"@r\nAA\n+\n##\n"
     buf1 = record