Skip to content

Commit

Permalink
Only read number of bytes indicated by string data item (#518)
Browse files Browse the repository at this point in the history
The string parsing now no longer reads more bytes than indicated by the data item. For any incomplete UTF-8 code points an exception with a corresponding message is thrown.
  • Loading branch information
knutwannheden authored Oct 19, 2024
1 parent 0f75408 commit 265b3c4
Show file tree
Hide file tree
Showing 3 changed files with 72 additions and 57 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2281,18 +2281,14 @@ protected void _finishToken() throws IOException
}
return;
}
// 29-Jan-2021, tatu: as per [dataformats-binary#238] must keep in mind that
// the longest individual unit is 4 bytes (surrogate pair) so we
// actually need len+3 bytes to avoid bounds checks
// 18-Jan-2024, tatu: For malicious input / Fuzzers, need to worry about overflow
// like Integer.MAX_VALUE
final int needed = Math.max(len, len + 3);
final int available = _inputEnd - _inputPtr;

if ((available >= needed)
if ((available >= len)
// if not, could we read? NOTE: we do not require it, just attempt to read
|| ((_inputBuffer.length >= needed)
&& _tryToLoadToHaveAtLeast(needed))) {
|| ((_inputBuffer.length >= len)
&& _tryToLoadToHaveAtLeast(len))) {
_finishShortText(len);
return;
}
Expand Down Expand Up @@ -2326,22 +2322,18 @@ protected String _finishTextToken(int ch) throws IOException
_finishChunkedText();
return _textBuffer.contentsAsString();
}
// 29-Jan-2021, tatu: as per [dataformats-binary#238] must keep in mind that
// the longest individual unit is 4 bytes (surrogate pair) so we
// actually need len+3 bytes to avoid bounds checks

// 19-Mar-2021, tatu: [dataformats-binary#259] shows the case where length
// we get is Integer.MAX_VALUE, leading to overflow. Could change values
// to longs but simpler to truncate "needed" (will never pass following test
// due to inputBuffer never being even close to that big).

final int needed = Math.max(len + 3, len);
final int available = _inputEnd - _inputPtr;

if ((available >= needed)
if ((available >= len)
// if not, could we read? NOTE: we do not require it, just attempt to read
|| ((_inputBuffer.length >= needed)
&& _tryToLoadToHaveAtLeast(needed))) {
|| ((_inputBuffer.length >= len)
&& _tryToLoadToHaveAtLeast(len))) {
return _finishShortText(len);
}
// If not enough space, need handling similar to chunked
Expand Down Expand Up @@ -2369,7 +2361,7 @@ private final String _finishShortText(int len) throws IOException
final byte[] inputBuf = _inputBuffer;

// Let's actually do a tight loop for ASCII first:
final int end = inPtr + len;
final int end = _inputPtr;

int i;
while ((i = inputBuf[inPtr]) >= 0) {
Expand All @@ -2386,44 +2378,50 @@ private final String _finishShortText(int len) throws IOException
final int[] codes = UTF8_UNIT_CODES;
do {
i = inputBuf[inPtr++] & 0xFF;
switch (codes[i]) {
case 0:
break;
case 1:
{
final int c2 = inputBuf[inPtr++];
if ((c2 & 0xC0) != 0x080) {
_reportInvalidOther(c2 & 0xFF, inPtr);
}
i = ((i & 0x1F) << 6) | (c2 & 0x3F);
int code = codes[i];
if (code != 0) {
// 05-Jul-2021, tatu: As per [dataformats-binary#289] need to
// be careful wrt end-of-buffer truncated codepoints
if ((inPtr + code) > end) {
final int firstCharOffset = len - (end - inPtr) - 1;
_reportTruncatedUTF8InString(len, firstCharOffset, i, code);
}
break;
case 2:
{
final int c2 = inputBuf[inPtr++];
if ((c2 & 0xC0) != 0x080) {
_reportInvalidOther(c2 & 0xFF, inPtr);

switch (code) {
case 1: {
final int c2 = inputBuf[inPtr++];
if ((c2 & 0xC0) != 0x080) {
_reportInvalidOther(c2 & 0xFF, inPtr);
}
i = ((i & 0x1F) << 6) | (c2 & 0x3F);
}
final int c3 = inputBuf[inPtr++];
if ((c3 & 0xC0) != 0x080) {
_reportInvalidOther(c3 & 0xFF, inPtr);
break;
case 2: {
final int c2 = inputBuf[inPtr++];
if ((c2 & 0xC0) != 0x080) {
_reportInvalidOther(c2 & 0xFF, inPtr);
}
final int c3 = inputBuf[inPtr++];
if ((c3 & 0xC0) != 0x080) {
_reportInvalidOther(c3 & 0xFF, inPtr);
}
i = ((i & 0x0F) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F);
}
i = ((i & 0x0F) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F);
break;
case 3:
// 30-Jan-2021, tatu: TODO - validate these too?
i = ((i & 0x07) << 18)
| ((inputBuf[inPtr++] & 0x3F) << 12)
| ((inputBuf[inPtr++] & 0x3F) << 6)
| (inputBuf[inPtr++] & 0x3F);
// note: this is the codepoint value; need to split, too
i -= 0x10000;
outBuf[outPtr++] = (char) (0xD800 | (i >> 10));
i = 0xDC00 | (i & 0x3FF);
break;
default: // invalid
_reportInvalidInitial(i);
}
break;
case 3:
// 30-Jan-2021, tatu: TODO - validate these too?
i = ((i & 0x07) << 18)
| ((inputBuf[inPtr++] & 0x3F) << 12)
| ((inputBuf[inPtr++] & 0x3F) << 6)
| (inputBuf[inPtr++] & 0x3F);
// note: this is the codepoint value; need to split, too
i -= 0x10000;
outBuf[outPtr++] = (char) (0xD800 | (i >> 10));
i = 0xDC00 | (i & 0x3FF);
break;
default: // invalid
_reportInvalidInitial(i);
}
outBuf[outPtr++] = (char) i;
} while (inPtr < end);
Expand Down Expand Up @@ -3850,18 +3848,16 @@ protected void _reportIncompleteBinaryRead(int expLen, int actLen) throws IOExce
expLen, actLen), _currToken);
}

// @since 2.13
/*
// @since 2.19
private String _reportTruncatedUTF8InString(int strLenBytes, int truncatedCharOffset,
int firstUTFByteValue, int bytesExpected)
throws IOException
{
throw _constructError(String.format(
"Truncated UTF-8 character in Chunked Unicode String value (%d bytes): "
"Truncated UTF-8 character in Unicode String value (%d bytes): "
+"byte 0x%02X at offset #%d indicated %d more bytes needed",
strLenBytes, firstUTFByteValue, truncatedCharOffset, bytesExpected));
}
*/

// @since 2.13
private String _reportTruncatedUTF8InName(int strLenBytes, int truncatedCharOffset,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ public void testInvalidTextValueWithBrokenUTF8() throws Exception
p.getText();
fail("Should not pass");
} catch (StreamReadException e) {
verifyException(e, "Malformed UTF-8 character at the end of a");
verifyException(e, "Truncated UTF-8 character in Unicode String value (256 bytes)");
}

}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
import com.fasterxml.jackson.dataformat.cbor.CBORParser;
import com.fasterxml.jackson.dataformat.cbor.CBORTestBase;

import java.nio.charset.StandardCharsets;
import java.util.Arrays;

public class ParseInvalidUTF8String236Test extends CBORTestBase
{
// [dataformats-binary#236]: Original version; broken UTF-8 all around.
Expand All @@ -24,8 +27,8 @@ public void testShortString236Original() throws Exception
}

// Variant where the length would be valid, but the last byte is partial UTF-8
// code point
public void testShortString236EndsWithPartialUTF8() throws Exception
// code point and no more bytes are available due to end-of-stream
public void testShortString236EndsWithPartialUTF8AtEndOfStream() throws Exception
{
final byte[] input = {0x63, 0x41, 0x2d, (byte) 0xda};
try (CBORParser p = cborParser(input)) {
Expand All @@ -34,7 +37,23 @@ public void testShortString236EndsWithPartialUTF8() throws Exception
String str = p.getText();
fail("Should have failed, did not, String = '"+str+"'");
} catch (StreamReadException e) {
verifyException(e, "Malformed UTF-8 character at the end of");
verifyException(e, "Truncated UTF-8 character in Unicode String value (3 bytes)");
}
}
}

// Variant where the length would be valid, but the last byte is partial UTF-8
// code point and the subsequent byte would be a valid continuation byte, but belongs to next data item
public void testShortString236EndsWithPartialUTF8() throws Exception
{
final byte[] input = {0x62, 0x33, (byte) 0xdb, (byte) 0xa0};
try (CBORParser p = cborParser(input)) {
assertToken(JsonToken.VALUE_STRING, p.nextToken());
try {
String str = p.getText();
fail("Should have failed, did not, String = '"+str+"'");
} catch (StreamReadException e) {
verifyException(e, "Truncated UTF-8 character in Unicode String value (2 bytes)");
}
}
}
Expand Down

0 comments on commit 265b3c4

Please sign in to comment.