Skip to content

Commit

Permalink
Support chunking of paired-end FASTA
Browse files Browse the repository at this point in the history
  • Loading branch information
marcelm committed Dec 11, 2023
1 parent a3147cc commit c8df0fb
Show file tree
Hide file tree
Showing 2 changed files with 129 additions and 7 deletions.
63 changes: 58 additions & 5 deletions src/dnaio/chunks.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,35 @@ def _fasta_head(buf: bytes, end: Optional[int] = None) -> int:
)


def _paired_fasta_heads(
buf1: bytes, buf2: bytes, end1: int, end2: int
) -> Tuple[int, int]:
"""
Return positions pos1, pos2 where right1 <= end1 and right2 <= end2
such that buf1[:pos1] and buf2[:pos2] contain the same number of complete FASTA
records.
"""
if end1 == 0 or end2 == 0:
return (0, 0)

if (end1 > 0 and buf1[:1] != b">") or (end2 > 0 and buf2[:1] != b">"):
raise FastaFormatError("FASTA file expected to start with '>'", line=None)

# Count complete records
n_records1 = buf1.count(b"\n>", 0, end1)
n_records2 = buf2.count(b"\n>", 0, end2)
print(f"{n_records1=} {n_records2=}")
n_records = min(n_records1, n_records2)

pos1 = pos2 = 0
while n_records > 0:
pos1 = buf1.find(b"\n>", pos1, end1) + 1
pos2 = buf2.find(b"\n>", pos2, end2) + 1
n_records -= 1

return (pos1, pos2)


def _fastq_head(buf: bytes, end: Optional[int] = None) -> int:
"""
Search for the end of the last complete *two* FASTQ records in buf[:end].
Expand Down Expand Up @@ -160,27 +189,51 @@ def read_paired_chunks(
# Read one byte to make sure we are processing FASTQ
start1 = f.readinto(memoryview(buf1)[0:1])
start2 = f2.readinto(memoryview(buf2)[0:1])
if (start1 == 1 and buf1[0:1] != b"@") or (start2 == 1 and buf2[0:1] != b"@"):

if start1 == 0 and start2 == 0:
return memoryview(b""), memoryview(b"")

if (start1 == 0) != (start2 == 0):
i = 2 if start1 == 0 else 1
raise FileFormatError(
f"Paired-end reads not in sync: File with R{i} reads is empty and the other is not",
line=None,
)

if buf1[:1] == b"@" != buf2[:1] == b"@":
raise FileFormatError(
"Paired-end data must be in FASTQ format when using multiple cores",
line=None,
)

if buf1[:1] == b"@":
file_format = "FASTQ"
paired_heads = _paired_fastq_heads
elif buf1[:1] == b">":
file_format = "FASTA"
paired_heads = _paired_fasta_heads
else:
raise FileFormatError(
"First character in input file must be '@' (FASTQ) or '>' (FASTA), "
f"but found {buf1[:1]}",
line=None,
)

while True:
if start1 == len(buf1) and start2 == len(buf2):
raise ValueError(
f"FASTQ records do not fit into buffer of size {buffer_size}"
f"FASTA/FASTQ records do not fit into buffer of size {buffer_size}"
)
bufend1 = f.readinto(memoryview(buf1)[start1:]) + start1 # type: ignore
bufend2 = f2.readinto(memoryview(buf2)[start2:]) + start2 # type: ignore
if start1 == bufend1 and start2 == bufend2:
break

end1, end2 = _paired_fastq_heads(buf1, buf2, bufend1, bufend2)
end1, end2 = paired_heads(buf1, buf2, bufend1, bufend2)
assert end1 <= bufend1
assert end2 <= bufend2

if end1 > 0 or end2 > 0:
if end1 > 0 or end2 > 0 or file_format == "FASTA":
yield (memoryview(buf1)[0:end1], memoryview(buf2)[0:end2])
else:
assert end1 == 0 and end2 == 0
Expand All @@ -189,7 +242,7 @@ def read_paired_chunks(
i = 1 if bufend1 == 0 else 2
extra = f". File {i} ended, but more data found in the other file"
raise FileFormatError(
f"Premature end of paired FASTQ input{extra}.", line=None
f"Premature end of paired-end input{extra}.", line=None
)
start1 = bufend1 - end1
assert start1 >= 0
Expand Down
73 changes: 71 additions & 2 deletions tests/test_chunks.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,18 @@
import textwrap

from pytest import raises
from io import BytesIO

import dnaio
from dnaio import UnknownFileFormat, FileFormatError
from dnaio._core import paired_fastq_heads
from dnaio.chunks import _fastq_head, _fasta_head, read_chunks, read_paired_chunks
from dnaio.chunks import (
_fastq_head,
_fasta_head,
read_chunks,
read_paired_chunks,
_paired_fasta_heads,
)


def test_fasta_head():
Expand Down Expand Up @@ -32,6 +41,52 @@ def test_fasta_head_with_comment():
assert _fasta_head(b"#\n>3\n5\n>") == 7


def test_paired_fasta_heads():
def pheads(buf1, buf2):
return _paired_fasta_heads(buf1, buf2, len(buf1), len(buf2))

assert pheads(b"", b"") == (0, 0)
assert pheads(b">r", b">r") == (0, 0)
assert pheads(b">r\nA\n>s", b">r") == (0, 0)
assert pheads(b">r\nA\n>s", b">r\nCT\n>s") == (5, 6)
assert pheads(b">r\nA\n>s\nG\n>t\n", b">r\nCT\n>s") == (5, 6)

buf1 = (
textwrap.dedent(
"""
>1
a
b
>2
c
>3
uv
"""
)
.strip()
.encode()
)
buf2 = (
textwrap.dedent(
"""
>1
def
>2
gh
i
>3
"""
)
.strip()
.encode()
)

assert pheads(buf1, buf2) == (
len(b">1\na\nb\n>2\nc\n"),
len(b">1\ndef\n>2\ngh\ni\n"),
)


def test_paired_fastq_heads():
buf1 = b"first\nsecond\nthird\nfourth\nfifth"
buf2 = b"a\nb\nc\nd\ne\nf\ng"
Expand Down Expand Up @@ -67,13 +122,27 @@ def test_fastq_head():
assert _fastq_head(b"A\nB\nC\nD\nE\nF\nG\nH\nI\n") == 16


def test_read_paired_chunks():
def test_read_paired_chunks_fastq():
with open("tests/data/paired.1.fastq", "rb") as f1:
with open("tests/data/paired.2.fastq", "rb") as f2:
for c1, c2 in read_paired_chunks(f1, f2, buffer_size=128):
print(c1, c2)


def test_paired_chunks_fasta(tmp_path):
for i in (1, 2):
with dnaio.open(f"tests/data/paired.{i}.fastq") as infile:
with dnaio.open(tmp_path / f"{i}.fasta", mode="w") as outfile:
for record in infile:
record.qualities = None
outfile.write(record)

with open(tmp_path / "1.fasta", "rb") as r1:
with open(tmp_path / "2.fasta", "rb") as r2:
for c1, c2 in read_paired_chunks(r1, r2, buffer_size=128):
print(c1.tobytes(), c2.tobytes())


def test_paired_chunks_different_number_of_records():
record = b"@r\nAA\n+\n##\n"
buf1 = record
Expand Down

0 comments on commit c8df0fb

Please sign in to comment.