diff --git a/cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORParser.java b/cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORParser.java index 5e97e819b..8d3984aba 100644 --- a/cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORParser.java +++ b/cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORParser.java @@ -2281,18 +2281,14 @@ protected void _finishToken() throws IOException } return; } - // 29-Jan-2021, tatu: as per [dataformats-binary#238] must keep in mind that - // the longest individual unit is 4 bytes (surrogate pair) so we - // actually need len+3 bytes to avoid bounds checks // 18-Jan-2024, tatu: For malicious input / Fuzzers, need to worry about overflow // like Integer.MAX_VALUE - final int needed = Math.max(len, len + 3); final int available = _inputEnd - _inputPtr; - if ((available >= needed) + if ((available >= len) // if not, could we read? NOTE: we do not require it, just attempt to read - || ((_inputBuffer.length >= needed) - && _tryToLoadToHaveAtLeast(needed))) { + || ((_inputBuffer.length >= len) + && _tryToLoadToHaveAtLeast(len))) { _finishShortText(len); return; } @@ -2326,22 +2322,18 @@ protected String _finishTextToken(int ch) throws IOException _finishChunkedText(); return _textBuffer.contentsAsString(); } - // 29-Jan-2021, tatu: as per [dataformats-binary#238] must keep in mind that - // the longest individual unit is 4 bytes (surrogate pair) so we - // actually need len+3 bytes to avoid bounds checks // 19-Mar-2021, tatu: [dataformats-binary#259] shows the case where length // we get is Integer.MAX_VALUE, leading to overflow. Could change values // to longs but simpler to truncate "needed" (will never pass following test // due to inputBuffer never being even close to that big). - final int needed = Math.max(len + 3, len); final int available = _inputEnd - _inputPtr; - if ((available >= needed) + if ((available >= len) // if not, could we read? NOTE: we do not require it, just attempt to read - || ((_inputBuffer.length >= needed) - && _tryToLoadToHaveAtLeast(needed))) { + || ((_inputBuffer.length >= len) + && _tryToLoadToHaveAtLeast(len))) { return _finishShortText(len); } // If not enough space, need handling similar to chunked @@ -2369,7 +2361,7 @@ private final String _finishShortText(int len) throws IOException final byte[] inputBuf = _inputBuffer; // Let's actually do a tight loop for ASCII first: - final int end = inPtr + len; + final int end = _inputPtr; int i; while ((i = inputBuf[inPtr]) >= 0) { @@ -2386,44 +2378,50 @@ private final String _finishShortText(int len) throws IOException final int[] codes = UTF8_UNIT_CODES; do { i = inputBuf[inPtr++] & 0xFF; - switch (codes[i]) { - case 0: - break; - case 1: - { - final int c2 = inputBuf[inPtr++]; - if ((c2 & 0xC0) != 0x080) { - _reportInvalidOther(c2 & 0xFF, inPtr); - } - i = ((i & 0x1F) << 6) | (c2 & 0x3F); + int code = codes[i]; + if (code != 0) { + // 05-Jul-2021, tatu: As per [dataformats-binary#289] need to + // be careful wrt end-of-buffer truncated codepoints + if ((inPtr + code) > end) { + final int firstCharOffset = len - (end - inPtr) - 1; + _reportTruncatedUTF8InString(len, firstCharOffset, i, code); } - break; - case 2: - { - final int c2 = inputBuf[inPtr++]; - if ((c2 & 0xC0) != 0x080) { - _reportInvalidOther(c2 & 0xFF, inPtr); + + switch (code) { + case 1: { + final int c2 = inputBuf[inPtr++]; + if ((c2 & 0xC0) != 0x080) { + _reportInvalidOther(c2 & 0xFF, inPtr); + } + i = ((i & 0x1F) << 6) | (c2 & 0x3F); } - final int c3 = inputBuf[inPtr++]; - if ((c3 & 0xC0) != 0x080) { - _reportInvalidOther(c3 & 0xFF, inPtr); + break; + case 2: { + final int c2 = inputBuf[inPtr++]; + if ((c2 & 0xC0) != 0x080) { + _reportInvalidOther(c2 & 0xFF, inPtr); + } + final int c3 = inputBuf[inPtr++]; + if ((c3 & 0xC0) != 0x080) { + _reportInvalidOther(c3 & 0xFF, inPtr); + } + i = ((i & 0x0F) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F); } - i = ((i & 0x0F) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F); + break; + case 3: + // 30-Jan-2021, tatu: TODO - validate these too? + i = ((i & 0x07) << 18) + | ((inputBuf[inPtr++] & 0x3F) << 12) + | ((inputBuf[inPtr++] & 0x3F) << 6) + | (inputBuf[inPtr++] & 0x3F); + // note: this is the codepoint value; need to split, too + i -= 0x10000; + outBuf[outPtr++] = (char) (0xD800 | (i >> 10)); + i = 0xDC00 | (i & 0x3FF); + break; + default: // invalid + _reportInvalidInitial(i); } - break; - case 3: - // 30-Jan-2021, tatu: TODO - validate these too? - i = ((i & 0x07) << 18) - | ((inputBuf[inPtr++] & 0x3F) << 12) - | ((inputBuf[inPtr++] & 0x3F) << 6) - | (inputBuf[inPtr++] & 0x3F); - // note: this is the codepoint value; need to split, too - i -= 0x10000; - outBuf[outPtr++] = (char) (0xD800 | (i >> 10)); - i = 0xDC00 | (i & 0x3FF); - break; - default: // invalid - _reportInvalidInitial(i); } outBuf[outPtr++] = (char) i; } while (inPtr < end); @@ -3850,18 +3848,16 @@ protected void _reportIncompleteBinaryRead(int expLen, int actLen) throws IOExce expLen, actLen), _currToken); } - // @since 2.13 - /* + // @since 2.19 private String _reportTruncatedUTF8InString(int strLenBytes, int truncatedCharOffset, int firstUTFByteValue, int bytesExpected) throws IOException { throw _constructError(String.format( -"Truncated UTF-8 character in Chunked Unicode String value (%d bytes): " +"Truncated UTF-8 character in Unicode String value (%d bytes): " +"byte 0x%02X at offset #%d indicated %d more bytes needed", strLenBytes, firstUTFByteValue, truncatedCharOffset, bytesExpected)); } - */ // @since 2.13 private String _reportTruncatedUTF8InName(int strLenBytes, int truncatedCharOffset, diff --git a/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/fuzz/CBORFuzz_35979_StringValueTest.java b/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/fuzz/CBORFuzz_35979_StringValueTest.java index 472d37d5f..cd75ad9cf 100644 --- a/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/fuzz/CBORFuzz_35979_StringValueTest.java +++ b/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/fuzz/CBORFuzz_35979_StringValueTest.java @@ -27,7 +27,7 @@ public void testInvalidTextValueWithBrokenUTF8() throws Exception p.getText(); fail("Should not pass"); } catch (StreamReadException e) { - verifyException(e, "Malformed UTF-8 character at the end of a"); + verifyException(e, "Truncated UTF-8 character in Unicode String value (256 bytes)"); } } diff --git a/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/parse/ParseInvalidUTF8String236Test.java b/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/parse/ParseInvalidUTF8String236Test.java index 095fd1de8..8e3e8f97e 100644 --- a/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/parse/ParseInvalidUTF8String236Test.java +++ b/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/parse/ParseInvalidUTF8String236Test.java @@ -5,6 +5,9 @@ import com.fasterxml.jackson.dataformat.cbor.CBORParser; import com.fasterxml.jackson.dataformat.cbor.CBORTestBase; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; + public class ParseInvalidUTF8String236Test extends CBORTestBase { // [dataformats-binary#236]: Original version; broken UTF-8 all around. @@ -24,8 +27,8 @@ public void testShortString236Original() throws Exception } // Variant where the length would be valid, but the last byte is partial UTF-8 - // code point - public void testShortString236EndsWithPartialUTF8() throws Exception + // code point and no more bytes are available due to end-of-stream + public void testShortString236EndsWithPartialUTF8AtEndOfStream() throws Exception { final byte[] input = {0x63, 0x41, 0x2d, (byte) 0xda}; try (CBORParser p = cborParser(input)) { @@ -34,7 +37,23 @@ public void testShortString236EndsWithPartialUTF8() throws Exception String str = p.getText(); fail("Should have failed, did not, String = '"+str+"'"); } catch (StreamReadException e) { - verifyException(e, "Malformed UTF-8 character at the end of"); + verifyException(e, "Truncated UTF-8 character in Unicode String value (3 bytes)"); + } + } + } + + // Variant where the length would be valid, but the last byte is partial UTF-8 + // code point and the subsequent byte would be a valid continuation byte, but belongs to next data item + public void testShortString236EndsWithPartialUTF8() throws Exception + { + final byte[] input = {0x62, 0x33, (byte) 0xdb, (byte) 0xa0}; + try (CBORParser p = cborParser(input)) { + assertToken(JsonToken.VALUE_STRING, p.nextToken()); + try { + String str = p.getText(); + fail("Should have failed, did not, String = '"+str+"'"); + } catch (StreamReadException e) { + verifyException(e, "Truncated UTF-8 character in Unicode String value (2 bytes)"); } } }