Fix #276

cowtowncoder · cowtowncoder · commit d2080f65371a · 2021-06-26T14:08:01.000-07:00
diff --git a/cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORGenerator.java b/cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORGenerator.java
@@ -26,15 +26,15 @@ public class CBORGenerator extends GeneratorBase
      * Let's ensure that we have big enough output buffer because of safety
      * margins we need for UTF-8 encoding.
      */
-    final static int BYTE_BUFFER_FOR_OUTPUT = 16000;
+    protected final static int BYTE_BUFFER_FOR_OUTPUT = 16000;
 
     /**
      * The replacement character to use to fix invalid Unicode sequences
      * (mismatched surrogate pair).
      *
      * @since 2.12
      */
-    final static int REPLACEMENT_CHAR = 0xfffd;
+    protected final static int REPLACEMENT_CHAR = 0xfffd;
 
     /**
      * Longest char chunk we will output is chosen so that it is guaranteed to
@@ -80,7 +80,7 @@ public enum Feature implements FormatFeature {
          * an exception will be thrown to indicate invalid content.
          *<p>
          * Default value is {@code false} (for backwards compatibility) meaning that
-         * an invalide surrogate will result in exception ({@link IllegalArgumentException}
+         * an invalid surrogate will result in exception ({@link IllegalArgumentException}).
          *
          * @since 2.12
          */
diff --git a/release-notes/VERSION-2.x b/release-notes/VERSION-2.x
@@ -20,6 +20,9 @@ Modules:
  (actual fix in `jackson-databind`)
 #272: (cbor) Uncaught exception in CBORParser._nextChunkedByte2 (by ossfuzzer)
  (reported by Fabian M)
+#276: (smile) Add `SmileGenerator.Feature.LENIENT_UTF_ENCODING` for lenient handling
+  of broken Unicode surrogate pairs on writing
+ (requested by kireet@github)
 - `Ion-java` dep 1.4.0 -> 1.8.0
 - Minor change to Ion module registration names (fully-qualified)
 
diff --git a/smile/src/main/java/com/fasterxml/jackson/dataformat/smile/SmileGenerator.java b/smile/src/main/java/com/fasterxml/jackson/dataformat/smile/SmileGenerator.java
@@ -82,7 +82,20 @@ public enum Feature
          * this option is disabled by default, and should only be enabled if it is likely that
          * same values repeat relatively often.
          */
-        CHECK_SHARED_STRING_VALUES(false)
+        CHECK_SHARED_STRING_VALUES(false),
+
+        /**
+         * Feature that determines how an invalid surrogate encoding found in the
+         * incoming String is handled: if enabled, it is silently output
+         * as the Unicode 'REPLACEMENT CHARACTER' (U+FFFD); if disabled,
+         * an exception will be thrown to indicate invalid content.
+         *<p>
+         * Default value is {@code false} (for backwards compatibility) meaning that
+         * an invalid surrogate will result in exception ({@link IllegalArgumentException}).
+         *
+         * @since 2.13
+         */
+        LENIENT_UTF_ENCODING(false),
         ;
 
         protected final boolean _defaultState;
@@ -156,6 +169,14 @@ public SharedStringNode(String value, int index, SharedStringNode next)
     protected final static long MIN_INT_AS_LONG = (long) Integer.MIN_VALUE;
     protected final static long MAX_INT_AS_LONG = (long) Integer.MAX_VALUE;
 
+    /**
+     * The replacement character to use to fix invalid Unicode sequences
+     * (mismatched surrogate pair).
+     *
+     * @since 2.13
+     */
+    protected final static int REPLACEMENT_CHAR = 0xfffd;
+
     /*
     /**********************************************************
     /* Configuration
@@ -1929,22 +1950,19 @@ private final int _shortUTF8Encode2(char[] str, int i, int end, int outputPtr)
                 outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
                 continue;
             }
-            // Yup, a surrogate pair
-            if (c > SURR1_LAST) { // must be from first range; second won't do
-                _throwIllegalSurrogate(c);
-            }
-            // ... meaning it must have a pair
-            if (i >= end) {
-                _throwIllegalSurrogate(c);
-            }
-            c = _convertSurrogate(c, str[i++]);
-            if (c > 0x10FFFF) { // illegal in JSON as well as in XML
-                _throwIllegalSurrogate(c);
+            // Yup, looks like a surrogate pair... but is it?
+            if ((c <= SURR1_LAST) && (i < end)) { // must be from first range and have another char
+                final int d = str[i];
+                if ((d <= SURR2_LAST) && (d >= SURR2_FIRST)) {
+                    ++i;
+                    outputPtr = _decodeAndWriteSurrogate(c, d, outBuf, outputPtr);
+                    continue;
+                }
+                outputPtr = _invalidSurrogateEnd(c, d, outBuf, outputPtr);
+                continue;
             }
-            outBuf[outputPtr++] = (byte) (0xf0 | (c >> 18));
-            outBuf[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
-            outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
-            outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
+            // Nah, something wrong
+            outputPtr = _invalidSurrogateStart(c, outBuf, outputPtr);
         }
         int codedLen = outputPtr - _outputTail;
         _outputTail = outputPtr;
@@ -1993,22 +2011,19 @@ private final int _shortUTF8Encode2(String str, int i, int end, int outputPtr)
                 outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
                 continue;
             }
-            // Yup, a surrogate pair
-            if (c > SURR1_LAST) { // must be from first range; second won't do
-                _throwIllegalSurrogate(c);
-            }
-            // ... meaning it must have a pair
-            if (i >= end) {
-                _throwIllegalSurrogate(c);
-            }
-            c = _convertSurrogate(c, str.charAt(i++));
-            if (c > 0x10FFFF) { // illegal in JSON as well as in XML
-                _throwIllegalSurrogate(c);
+            // Yup, looks like a surrogate pair... but is it?
+            if ((c <= SURR1_LAST) && (i < end)) { // must be from first range and have another char
+                final int d = str.charAt(i);
+                if ((d <= SURR2_LAST) && (d >= SURR2_FIRST)) {
+                    ++i;
+                    outputPtr = _decodeAndWriteSurrogate(c, d, outBuf, outputPtr);
+                    continue;
+                }
+                outputPtr = _invalidSurrogateEnd(c, d, outBuf, outputPtr);
+                continue;
             }
-            outBuf[outputPtr++] = (byte) (0xf0 | (c >> 18));
-            outBuf[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
-            outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
-            outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
+            // Nah, something wrong
+            outputPtr = _invalidSurrogateStart(c, outBuf, outputPtr);
         }
         int codedLen = outputPtr - _outputTail;
         _outputTail = outputPtr;
@@ -2021,9 +2036,8 @@ private void _mediumUTF8Encode(char[] str, int inputPtr, int inputEnd) throws IO
         
         output_loop:
         while (inputPtr < inputEnd) {
-            /* First, let's ensure we can output at least 4 bytes
-             * (longest UTF-8 encoded codepoint):
-             */
+            // First, let's ensure we can output at least 4 bytes
+            // (longest UTF-8 encoded codepoint):
             if (_outputTail >= bufferEnd) {
                 _flushBuffer();
             }
@@ -2064,22 +2078,19 @@ private void _mediumUTF8Encode(char[] str, int inputPtr, int inputEnd) throws IO
                     _outputBuffer[_outputTail++] = (byte) (0x80 | (c & 0x3f));
                     continue;
                 }
-                // Yup, a surrogate:
-                if (c > SURR1_LAST) { // must be from first range
-                    _throwIllegalSurrogate(c);
-                }
-                // and if so, followed by another from next range
-                if (inputPtr >= inputEnd) {
-                    _throwIllegalSurrogate(c);
-                }
-                c = _convertSurrogate(c, str[inputPtr++]);
-                if (c > 0x10FFFF) { // illegal, as per RFC 4627
-                    _throwIllegalSurrogate(c);
+                // Yup, looks like a surrogate pair... but is it?
+                if ((c <= SURR1_LAST) && (inputPtr < inputEnd)) { // must be from first range and have another char
+                    final int d = str[inputPtr];
+                    if ((d <= SURR2_LAST) && (d >= SURR2_FIRST)) {
+                        ++inputPtr;
+                        _outputTail = _decodeAndWriteSurrogate(c, d, _outputBuffer, _outputTail);
+                        continue;
+                    }
+                    _outputTail = _invalidSurrogateEnd(c, d, _outputBuffer, _outputTail);
+                    continue;
                 }
-                _outputBuffer[_outputTail++] = (byte) (0xf0 | (c >> 18));
-                _outputBuffer[_outputTail++] = (byte) (0x80 | ((c >> 12) & 0x3f));
-                _outputBuffer[_outputTail++] = (byte) (0x80 | ((c >> 6) & 0x3f));
-                _outputBuffer[_outputTail++] = (byte) (0x80 | (c & 0x3f));
+                // Nah, something wrong
+                _outputTail = _invalidSurrogateStart(c, _outputBuffer, _outputTail);
             }
         }
     }
@@ -2090,9 +2101,8 @@ private void _mediumUTF8Encode(String str, int inputPtr, int inputEnd) throws IO
         
         output_loop:
         while (inputPtr < inputEnd) {
-            /* First, let's ensure we can output at least 4 bytes
-             * (longest UTF-8 encoded codepoint):
-             */
+            // First, let's ensure we can output at least 4 bytes
+            // (longest UTF-8 encoded codepoint):
             if (_outputTail >= bufferEnd) {
                 _flushBuffer();
             }
@@ -2133,63 +2143,88 @@ private void _mediumUTF8Encode(String str, int inputPtr, int inputEnd) throws IO
                     _outputBuffer[_outputTail++] = (byte) (0x80 | (c & 0x3f));
                     continue;
                 }
-                // Yup, a surrogate:
-                if (c > SURR1_LAST) { // must be from first range
-                    _throwIllegalSurrogate(c);
-                }
-                // and if so, followed by another from next range
-                if (inputPtr >= inputEnd) {
-                    _throwIllegalSurrogate(c);
-                }
-                c = _convertSurrogate(c, str.charAt(inputPtr++));
-                if (c > 0x10FFFF) { // illegal, as per RFC 4627
-                    _throwIllegalSurrogate(c);
+                // Yup, looks like a surrogate pair... but is it?
+                if ((c <= SURR1_LAST) && (inputPtr < inputEnd)) { // must be from first range and have another char
+                    final int d = str.charAt(inputPtr);
+                    if ((d <= SURR2_LAST) && (d >= SURR2_FIRST)) {
+                        ++inputPtr;
+                        _outputTail = _decodeAndWriteSurrogate(c, d, _outputBuffer, _outputTail);
+                        continue;
+                    }
+                    _outputTail = _invalidSurrogateEnd(c, d, _outputBuffer, _outputTail);
+                    continue;
                 }
-                _outputBuffer[_outputTail++] = (byte) (0xf0 | (c >> 18));
-                _outputBuffer[_outputTail++] = (byte) (0x80 | ((c >> 12) & 0x3f));
-                _outputBuffer[_outputTail++] = (byte) (0x80 | ((c >> 6) & 0x3f));
-                _outputBuffer[_outputTail++] = (byte) (0x80 | (c & 0x3f));
+                // Nah, something wrong
+                _outputTail = _invalidSurrogateStart(c, _outputBuffer, _outputTail);
             }
         }
     }
-    
-    /**
-     * Method called to calculate UTF codepoint, from a surrogate pair.
+
+    /*
+    /**********************************************************************
+    /* Internal methods, surrogate pair handling
+    /**********************************************************************
      */
-    private int _convertSurrogate(int firstPart, int secondPart) throws IOException
+
+    private int _invalidSurrogateStart(int code, byte[] outBuf, int outputPtr)
+        throws IOException
     {
-        // Ok, then, is the second part valid?
-        if (secondPart < SURR2_FIRST || secondPart > SURR2_LAST) {
-            String msg = String.format("Broken surrogate pair: first char 0x%04X, second 0x%04X; illegal combination",
-                    firstPart, secondPart);
-            _reportError(msg);
+        if (isEnabled(Feature.LENIENT_UTF_ENCODING)) {
+            return _appendReplacementChar(outBuf, outputPtr);
+        }
+        // Will be called in two distinct cases: either first character is
+        // invalid (code range of second part), or first character is valid
+        // but there is no second part to encode
+        if (code <= SURR1_LAST) {
+            // Unmatched first part (closing without second part?)
+            _reportError(String.format(
+"Unmatched surrogate pair, starts with valid high surrogate (0x%04X) but ends without low surrogate",
+code));
         }
-        return 0x10000 + ((firstPart - SURR1_FIRST) << 10) + (secondPart - SURR2_FIRST);
+        _reportError(String.format(
+"Invalid surrogate pair, starts with invalid high surrogate (0x%04X), not in valid range [0xD800, 0xDBFF]",
+code));
+        return 0; // never gets here
     }
 
-    private void _throwIllegalSurrogate(int code) throws IOException
+    private int _invalidSurrogateEnd(int surr1, int surr2,
+            byte[] outBuf, int outputPtr)
+        throws IOException
     {
-        if (code > 0x10FFFF) { // over max?
-            _reportError(String.format(
-                    "Illegal character point (0x%X) to output; max is 0x10FFFF as per RFC 4627", code));
-        }
-        if (code >= SURR1_FIRST) {
-            if (code <= SURR1_LAST) { // Unmatched first part (closing without second part?)
-                _reportError(String.format(
-                    "Unmatched first part of surrogate pair (0x%04X)", code));
-            }
-            _reportError(String.format(
-                    "Unmatched second part of surrogate pair (0x%04X)", code));
+        if (isEnabled(Feature.LENIENT_UTF_ENCODING)) {
+            return _appendReplacementChar(outBuf, outputPtr);
         }
-        // should we ever get this?
-        _reportError(String.format("Illegal character point (0x%X) to output", code));
+        _reportError(String.format(
+"Invalid surrogate pair, starts with valid high surrogate (0x%04X)"
++" but ends with invalid low surrogate (0x%04X), not in valid range [0xDC00, 0xDFFF]",
+surr1, surr2));
+        return 0; // never gets here
+    }
+
+    private int _appendReplacementChar(byte[] outBuf, int outputPtr) {
+        outBuf[outputPtr++] = (byte) (0xe0 | (REPLACEMENT_CHAR >> 12));
+        outBuf[outputPtr++] = (byte) (0x80 | ((REPLACEMENT_CHAR >> 6) & 0x3f));
+        outBuf[outputPtr++] = (byte) (0x80 | (REPLACEMENT_CHAR & 0x3f));
+        return outputPtr;
+    }
+
+    private int _decodeAndWriteSurrogate(int surr1, int surr2,
+            byte[] outBuf, int outputPtr)
+    {
+        final int c = 0x10000 + ((surr1 - SURR1_FIRST) << 10)
+                + (surr2 - SURR2_FIRST);
+        outBuf[outputPtr++] = (byte) (0xf0 | (c >> 18));
+        outBuf[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
+        outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
+        outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
+        return outputPtr;
     }
 
     /*
     /**********************************************************
     /* Internal methods, writing bytes
     /**********************************************************
-    */
+     */
 
     private final void _ensureRoomForOutput(int needed) throws IOException
     {
diff --git a/smile/src/test/java/com/fasterxml/jackson/dataformat/smile/gen/LenientUnicodeSmileGenerationTest.java b/smile/src/test/java/com/fasterxml/jackson/dataformat/smile/gen/LenientUnicodeSmileGenerationTest.java