Only read number of bytes indicated by string data item

knutwannheden · knutwannheden · commit 5bfa87a2e409 · 2024-10-17T15:10:37.000+02:00
The string parsing now no longer reads more bytes than indicated by the data item. For any incomplete UTF-8 code points an exception with a corresponding message is thrown.
diff --git a/cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORParser.java b/cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORParser.java
@@ -2281,18 +2281,14 @@ protected void _finishToken() throws IOException
             }
             return;
         }
-        // 29-Jan-2021, tatu: as per [dataformats-binary#238] must keep in mind that
-        //    the longest individual unit is 4 bytes (surrogate pair) so we
-        //    actually need len+3 bytes to avoid bounds checks
         // 18-Jan-2024, tatu: For malicious input / Fuzzers, need to worry about overflow
         //    like Integer.MAX_VALUE
-        final int needed = Math.max(len, len + 3);
         final int available = _inputEnd - _inputPtr;
 
-        if ((available >= needed)
+        if ((available >= len)
                 // if not, could we read? NOTE: we do not require it, just attempt to read
-                    || ((_inputBuffer.length >= needed)
-                            && _tryToLoadToHaveAtLeast(needed))) {
+                    || ((_inputBuffer.length >= len)
+                            && _tryToLoadToHaveAtLeast(len))) {
                 _finishShortText(len);
                 return;
         }
@@ -2326,22 +2322,18 @@ protected String _finishTextToken(int ch) throws IOException
             _finishChunkedText();
             return _textBuffer.contentsAsString();
         }
-        // 29-Jan-2021, tatu: as per [dataformats-binary#238] must keep in mind that
-        //    the longest individual unit is 4 bytes (surrogate pair) so we
-        //    actually need len+3 bytes to avoid bounds checks
 
         // 19-Mar-2021, tatu: [dataformats-binary#259] shows the case where length
         //    we get is Integer.MAX_VALUE, leading to overflow. Could change values
         //    to longs but simpler to truncate "needed" (will never pass following test
         //    due to inputBuffer never being even close to that big).
 
-        final int needed = Math.max(len + 3, len);
         final int available = _inputEnd - _inputPtr;
 
-        if ((available >= needed)
+        if ((available >= len)
             // if not, could we read? NOTE: we do not require it, just attempt to read
-                || ((_inputBuffer.length >= needed)
-                        && _tryToLoadToHaveAtLeast(needed))) {
+                || ((_inputBuffer.length >= len)
+                        && _tryToLoadToHaveAtLeast(len))) {
             return _finishShortText(len);
         }
         // If not enough space, need handling similar to chunked
@@ -2369,7 +2361,7 @@ private final String _finishShortText(int len) throws IOException
         final byte[] inputBuf = _inputBuffer;
 
         // Let's actually do a tight loop for ASCII first:
-        final int end = inPtr + len;
+        final int end = _inputPtr;
 
         int i;
         while ((i = inputBuf[inPtr]) >= 0) {
@@ -2386,44 +2378,50 @@ private final String _finishShortText(int len) throws IOException
         final int[] codes = UTF8_UNIT_CODES;
         do {
             i = inputBuf[inPtr++] & 0xFF;
-            switch (codes[i]) {
-            case 0:
-                break;
-            case 1:
-                {
-                    final int c2 = inputBuf[inPtr++];
-                    if ((c2 & 0xC0) != 0x080) {
-                        _reportInvalidOther(c2 & 0xFF, inPtr);
-                    }
-                    i = ((i & 0x1F) << 6) | (c2 & 0x3F);
+            int code = codes[i];
+            if (code != 0) {
+                // 05-Jul-2021, tatu: As per [dataformats-binary#289] need to
+                //     be careful wrt end-of-buffer truncated codepoints
+                if ((inPtr + code) > end) {
+                    final int firstCharOffset = len - (end - inPtr) - 1;
+                    _reportTruncatedUTF8InString(len, firstCharOffset, i, code);
                 }
-                break;
-            case 2:
-                {
-                    final int c2 = inputBuf[inPtr++];
-                    if ((c2 & 0xC0) != 0x080) {
-                        _reportInvalidOther(c2 & 0xFF, inPtr);
+
+                switch (code) {
+                    case 1: {
+                        final int c2 = inputBuf[inPtr++];
+                        if ((c2 & 0xC0) != 0x080) {
+                            _reportInvalidOther(c2 & 0xFF, inPtr);
+                        }
+                        i = ((i & 0x1F) << 6) | (c2 & 0x3F);
                     }
-                    final int c3 = inputBuf[inPtr++];
-                    if ((c3 & 0xC0) != 0x080) {
-                        _reportInvalidOther(c3 & 0xFF, inPtr);
+                    break;
+                    case 2: {
+                        final int c2 = inputBuf[inPtr++];
+                        if ((c2 & 0xC0) != 0x080) {
+                            _reportInvalidOther(c2 & 0xFF, inPtr);
+                        }
+                        final int c3 = inputBuf[inPtr++];
+                        if ((c3 & 0xC0) != 0x080) {
+                            _reportInvalidOther(c3 & 0xFF, inPtr);
+                        }
+                        i = ((i & 0x0F) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F);
                     }
-                    i = ((i & 0x0F) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F);
+                    break;
+                    case 3:
+                        // 30-Jan-2021, tatu: TODO - validate these too?
+                        i = ((i & 0x07) << 18)
+                            | ((inputBuf[inPtr++] & 0x3F) << 12)
+                            | ((inputBuf[inPtr++] & 0x3F) << 6)
+                            | (inputBuf[inPtr++] & 0x3F);
+                        // note: this is the codepoint value; need to split, too
+                        i -= 0x10000;
+                        outBuf[outPtr++] = (char) (0xD800 | (i >> 10));
+                        i = 0xDC00 | (i & 0x3FF);
+                        break;
+                    default: // invalid
+                        _reportInvalidInitial(i);
                 }
-                break;
-            case 3:
-                // 30-Jan-2021, tatu: TODO - validate these too?
-                i = ((i & 0x07) << 18)
-                    | ((inputBuf[inPtr++] & 0x3F) << 12)
-                    | ((inputBuf[inPtr++] & 0x3F) << 6)
-                    | (inputBuf[inPtr++] & 0x3F);
-                // note: this is the codepoint value; need to split, too
-                i -= 0x10000;
-                outBuf[outPtr++] = (char) (0xD800 | (i >> 10));
-                i = 0xDC00 | (i & 0x3FF);
-                break;
-            default: // invalid
-                _reportInvalidInitial(i);
             }
             outBuf[outPtr++] = (char) i;
         } while (inPtr < end);
@@ -3850,18 +3848,16 @@ protected void _reportIncompleteBinaryRead(int expLen, int actLen) throws IOExce
                 expLen, actLen), _currToken);
     }
 
-    // @since 2.13
-    /*
+    // @since 2.19
     private String _reportTruncatedUTF8InString(int strLenBytes, int truncatedCharOffset,
             int firstUTFByteValue, int bytesExpected)
         throws IOException
     {
         throw _constructError(String.format(
-"Truncated UTF-8 character in Chunked Unicode String value (%d bytes): "
+"Truncated UTF-8 character in Unicode String value (%d bytes): "
 +"byte 0x%02X at offset #%d indicated %d more bytes needed",
 strLenBytes, firstUTFByteValue, truncatedCharOffset, bytesExpected));
     }
-    */
 
     // @since 2.13
     private String _reportTruncatedUTF8InName(int strLenBytes, int truncatedCharOffset,
diff --git a/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/fuzz/CBORFuzz_35979_StringValueTest.java b/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/fuzz/CBORFuzz_35979_StringValueTest.java
@@ -27,7 +27,7 @@ public void testInvalidTextValueWithBrokenUTF8() throws Exception
             p.getText();
             fail("Should not pass");
         } catch (StreamReadException e) {
-            verifyException(e, "Malformed UTF-8 character at the end of a");
+            verifyException(e, "Truncated UTF-8 character in Unicode String value (256 bytes)");
         }
 
     }
diff --git a/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/parse/ParseInvalidUTF8String236Test.java b/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/parse/ParseInvalidUTF8String236Test.java
@@ -5,6 +5,9 @@
 import com.fasterxml.jackson.dataformat.cbor.CBORParser;
 import com.fasterxml.jackson.dataformat.cbor.CBORTestBase;
 
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+
 public class ParseInvalidUTF8String236Test extends CBORTestBase
 {
     // [dataformats-binary#236]: Original version; broken UTF-8 all around.
@@ -24,8 +27,8 @@ public void testShortString236Original() throws Exception
     }
 
     // Variant where the length would be valid, but the last byte is partial UTF-8
-    // code point
-    public void testShortString236EndsWithPartialUTF8() throws Exception
+    // code point and no more bytes are available due to end-of-stream
+    public void testShortString236EndsWithPartialUTF8AtEndOfStream() throws Exception
     {
         final byte[] input = {0x63, 0x41, 0x2d, (byte) 0xda};
         try (CBORParser p = cborParser(input)) {
@@ -34,7 +37,23 @@ public void testShortString236EndsWithPartialUTF8() throws Exception
                 String str = p.getText();
                 fail("Should have failed, did not, String = '"+str+"'");
             } catch (StreamReadException e) {
-                verifyException(e, "Malformed UTF-8 character at the end of");
+                verifyException(e, "Truncated UTF-8 character in Unicode String value (3 bytes)");
+            }
+        }
+    }
+
+    // Variant where the length would be valid, but the last byte is partial UTF-8
+    // code point and the subsequent byte would be a valid continuation byte, but belongs to next data item
+    public void testShortString236EndsWithPartialUTF8() throws Exception
+    {
+        final byte[] input = {0x62, 0x33, (byte) 0xdb, (byte) 0xa0};
+        try (CBORParser p = cborParser(input)) {
+            assertToken(JsonToken.VALUE_STRING, p.nextToken());
+            try {
+                String str = p.getText();
+                fail("Should have failed, did not, String = '"+str+"'");
+            } catch (StreamReadException e) {
+                verifyException(e, "Truncated UTF-8 character in Unicode String value (2 bytes)");
             }
         }
     }

Original file line number	Diff line number	Diff line change
`@@ -27,7 +27,7 @@ public void testInvalidTextValueWithBrokenUTF8() throws Exception`
`27`	`27`	`p.getText();`
`28`	`28`	`fail("Should not pass");`
`29`	`29`	`} catch (StreamReadException e) {`
`30`		`- verifyException(e, "Malformed UTF-8 character at the end of a");`
	`30`	`+ verifyException(e, "Truncated UTF-8 character in Unicode String value (256 bytes)");`
`31`	`31`	`}`
`32`	`32`
`33`	`33`	`}`