Skip to content

Commit 5bfa87a

Browse files
committed
Only read number of bytes indicated by string data item
The string parsing now no longer reads more bytes than indicated by the data item. For any incomplete UTF-8 code points an exception with a corresponding message is thrown.
1 parent 0f75408 commit 5bfa87a

File tree

3 files changed

+72
-57
lines changed

3 files changed

+72
-57
lines changed

cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORParser.java

Lines changed: 49 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -2281,18 +2281,14 @@ protected void _finishToken() throws IOException
22812281
}
22822282
return;
22832283
}
2284-
// 29-Jan-2021, tatu: as per [dataformats-binary#238] must keep in mind that
2285-
// the longest individual unit is 4 bytes (surrogate pair) so we
2286-
// actually need len+3 bytes to avoid bounds checks
22872284
// 18-Jan-2024, tatu: For malicious input / Fuzzers, need to worry about overflow
22882285
// like Integer.MAX_VALUE
2289-
final int needed = Math.max(len, len + 3);
22902286
final int available = _inputEnd - _inputPtr;
22912287

2292-
if ((available >= needed)
2288+
if ((available >= len)
22932289
// if not, could we read? NOTE: we do not require it, just attempt to read
2294-
|| ((_inputBuffer.length >= needed)
2295-
&& _tryToLoadToHaveAtLeast(needed))) {
2290+
|| ((_inputBuffer.length >= len)
2291+
&& _tryToLoadToHaveAtLeast(len))) {
22962292
_finishShortText(len);
22972293
return;
22982294
}
@@ -2326,22 +2322,18 @@ protected String _finishTextToken(int ch) throws IOException
23262322
_finishChunkedText();
23272323
return _textBuffer.contentsAsString();
23282324
}
2329-
// 29-Jan-2021, tatu: as per [dataformats-binary#238] must keep in mind that
2330-
// the longest individual unit is 4 bytes (surrogate pair) so we
2331-
// actually need len+3 bytes to avoid bounds checks
23322325

23332326
// 19-Mar-2021, tatu: [dataformats-binary#259] shows the case where length
23342327
// we get is Integer.MAX_VALUE, leading to overflow. Could change values
23352328
// to longs but simpler to truncate "needed" (will never pass following test
23362329
// due to inputBuffer never being even close to that big).
23372330

2338-
final int needed = Math.max(len + 3, len);
23392331
final int available = _inputEnd - _inputPtr;
23402332

2341-
if ((available >= needed)
2333+
if ((available >= len)
23422334
// if not, could we read? NOTE: we do not require it, just attempt to read
2343-
|| ((_inputBuffer.length >= needed)
2344-
&& _tryToLoadToHaveAtLeast(needed))) {
2335+
|| ((_inputBuffer.length >= len)
2336+
&& _tryToLoadToHaveAtLeast(len))) {
23452337
return _finishShortText(len);
23462338
}
23472339
// If not enough space, need handling similar to chunked
@@ -2369,7 +2361,7 @@ private final String _finishShortText(int len) throws IOException
23692361
final byte[] inputBuf = _inputBuffer;
23702362

23712363
// Let's actually do a tight loop for ASCII first:
2372-
final int end = inPtr + len;
2364+
final int end = _inputPtr;
23732365

23742366
int i;
23752367
while ((i = inputBuf[inPtr]) >= 0) {
@@ -2386,44 +2378,50 @@ private final String _finishShortText(int len) throws IOException
23862378
final int[] codes = UTF8_UNIT_CODES;
23872379
do {
23882380
i = inputBuf[inPtr++] & 0xFF;
2389-
switch (codes[i]) {
2390-
case 0:
2391-
break;
2392-
case 1:
2393-
{
2394-
final int c2 = inputBuf[inPtr++];
2395-
if ((c2 & 0xC0) != 0x080) {
2396-
_reportInvalidOther(c2 & 0xFF, inPtr);
2397-
}
2398-
i = ((i & 0x1F) << 6) | (c2 & 0x3F);
2381+
int code = codes[i];
2382+
if (code != 0) {
2383+
// 05-Jul-2021, tatu: As per [dataformats-binary#289] need to
2384+
// be careful wrt end-of-buffer truncated codepoints
2385+
if ((inPtr + code) > end) {
2386+
final int firstCharOffset = len - (end - inPtr) - 1;
2387+
_reportTruncatedUTF8InString(len, firstCharOffset, i, code);
23992388
}
2400-
break;
2401-
case 2:
2402-
{
2403-
final int c2 = inputBuf[inPtr++];
2404-
if ((c2 & 0xC0) != 0x080) {
2405-
_reportInvalidOther(c2 & 0xFF, inPtr);
2389+
2390+
switch (code) {
2391+
case 1: {
2392+
final int c2 = inputBuf[inPtr++];
2393+
if ((c2 & 0xC0) != 0x080) {
2394+
_reportInvalidOther(c2 & 0xFF, inPtr);
2395+
}
2396+
i = ((i & 0x1F) << 6) | (c2 & 0x3F);
24062397
}
2407-
final int c3 = inputBuf[inPtr++];
2408-
if ((c3 & 0xC0) != 0x080) {
2409-
_reportInvalidOther(c3 & 0xFF, inPtr);
2398+
break;
2399+
case 2: {
2400+
final int c2 = inputBuf[inPtr++];
2401+
if ((c2 & 0xC0) != 0x080) {
2402+
_reportInvalidOther(c2 & 0xFF, inPtr);
2403+
}
2404+
final int c3 = inputBuf[inPtr++];
2405+
if ((c3 & 0xC0) != 0x080) {
2406+
_reportInvalidOther(c3 & 0xFF, inPtr);
2407+
}
2408+
i = ((i & 0x0F) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F);
24102409
}
2411-
i = ((i & 0x0F) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F);
2410+
break;
2411+
case 3:
2412+
// 30-Jan-2021, tatu: TODO - validate these too?
2413+
i = ((i & 0x07) << 18)
2414+
| ((inputBuf[inPtr++] & 0x3F) << 12)
2415+
| ((inputBuf[inPtr++] & 0x3F) << 6)
2416+
| (inputBuf[inPtr++] & 0x3F);
2417+
// note: this is the codepoint value; need to split, too
2418+
i -= 0x10000;
2419+
outBuf[outPtr++] = (char) (0xD800 | (i >> 10));
2420+
i = 0xDC00 | (i & 0x3FF);
2421+
break;
2422+
default: // invalid
2423+
_reportInvalidInitial(i);
24122424
}
2413-
break;
2414-
case 3:
2415-
// 30-Jan-2021, tatu: TODO - validate these too?
2416-
i = ((i & 0x07) << 18)
2417-
| ((inputBuf[inPtr++] & 0x3F) << 12)
2418-
| ((inputBuf[inPtr++] & 0x3F) << 6)
2419-
| (inputBuf[inPtr++] & 0x3F);
2420-
// note: this is the codepoint value; need to split, too
2421-
i -= 0x10000;
2422-
outBuf[outPtr++] = (char) (0xD800 | (i >> 10));
2423-
i = 0xDC00 | (i & 0x3FF);
2424-
break;
2425-
default: // invalid
2426-
_reportInvalidInitial(i);
24272425
}
24282426
outBuf[outPtr++] = (char) i;
24292427
} while (inPtr < end);
@@ -3850,18 +3848,16 @@ protected void _reportIncompleteBinaryRead(int expLen, int actLen) throws IOExce
38503848
expLen, actLen), _currToken);
38513849
}
38523850

3853-
// @since 2.13
3854-
/*
3851+
// @since 2.19
38553852
private String _reportTruncatedUTF8InString(int strLenBytes, int truncatedCharOffset,
38563853
int firstUTFByteValue, int bytesExpected)
38573854
throws IOException
38583855
{
38593856
throw _constructError(String.format(
3860-
"Truncated UTF-8 character in Chunked Unicode String value (%d bytes): "
3857+
"Truncated UTF-8 character in Unicode String value (%d bytes): "
38613858
+"byte 0x%02X at offset #%d indicated %d more bytes needed",
38623859
strLenBytes, firstUTFByteValue, truncatedCharOffset, bytesExpected));
38633860
}
3864-
*/
38653861

38663862
// @since 2.13
38673863
private String _reportTruncatedUTF8InName(int strLenBytes, int truncatedCharOffset,

cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/fuzz/CBORFuzz_35979_StringValueTest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ public void testInvalidTextValueWithBrokenUTF8() throws Exception
2727
p.getText();
2828
fail("Should not pass");
2929
} catch (StreamReadException e) {
30-
verifyException(e, "Malformed UTF-8 character at the end of a");
30+
verifyException(e, "Truncated UTF-8 character in Unicode String value (256 bytes)");
3131
}
3232

3333
}

cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/parse/ParseInvalidUTF8String236Test.java

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@
55
import com.fasterxml.jackson.dataformat.cbor.CBORParser;
66
import com.fasterxml.jackson.dataformat.cbor.CBORTestBase;
77

8+
import java.nio.charset.StandardCharsets;
9+
import java.util.Arrays;
10+
811
public class ParseInvalidUTF8String236Test extends CBORTestBase
912
{
1013
// [dataformats-binary#236]: Original version; broken UTF-8 all around.
@@ -24,8 +27,8 @@ public void testShortString236Original() throws Exception
2427
}
2528

2629
// Variant where the length would be valid, but the last byte is partial UTF-8
27-
// code point
28-
public void testShortString236EndsWithPartialUTF8() throws Exception
30+
// code point and no more bytes are available due to end-of-stream
31+
public void testShortString236EndsWithPartialUTF8AtEndOfStream() throws Exception
2932
{
3033
final byte[] input = {0x63, 0x41, 0x2d, (byte) 0xda};
3134
try (CBORParser p = cborParser(input)) {
@@ -34,7 +37,23 @@ public void testShortString236EndsWithPartialUTF8() throws Exception
3437
String str = p.getText();
3538
fail("Should have failed, did not, String = '"+str+"'");
3639
} catch (StreamReadException e) {
37-
verifyException(e, "Malformed UTF-8 character at the end of");
40+
verifyException(e, "Truncated UTF-8 character in Unicode String value (3 bytes)");
41+
}
42+
}
43+
}
44+
45+
// Variant where the length would be valid, but the last byte is partial UTF-8
46+
// code point and the subsequent byte would be a valid continuation byte, but belongs to next data item
47+
public void testShortString236EndsWithPartialUTF8() throws Exception
48+
{
49+
final byte[] input = {0x62, 0x33, (byte) 0xdb, (byte) 0xa0};
50+
try (CBORParser p = cborParser(input)) {
51+
assertToken(JsonToken.VALUE_STRING, p.nextToken());
52+
try {
53+
String str = p.getText();
54+
fail("Should have failed, did not, String = '"+str+"'");
55+
} catch (StreamReadException e) {
56+
verifyException(e, "Truncated UTF-8 character in Unicode String value (2 bytes)");
3857
}
3958
}
4059
}

0 commit comments

Comments
 (0)