Skip to content

Commit d2080f6

Browse files
committed
Fix #276
1 parent 2ddb1e6 commit d2080f6

File tree

4 files changed

+249
-95
lines changed

4 files changed

+249
-95
lines changed

cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORGenerator.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,15 +26,15 @@ public class CBORGenerator extends GeneratorBase
2626
* Let's ensure that we have big enough output buffer because of safety
2727
* margins we need for UTF-8 encoding.
2828
*/
29-
final static int BYTE_BUFFER_FOR_OUTPUT = 16000;
29+
protected final static int BYTE_BUFFER_FOR_OUTPUT = 16000;
3030

3131
/**
3232
* The replacement character to use to fix invalid Unicode sequences
3333
* (mismatched surrogate pair).
3434
*
3535
* @since 2.12
3636
*/
37-
final static int REPLACEMENT_CHAR = 0xfffd;
37+
protected final static int REPLACEMENT_CHAR = 0xfffd;
3838

3939
/**
4040
* Longest char chunk we will output is chosen so that it is guaranteed to
@@ -80,7 +80,7 @@ public enum Feature implements FormatFeature {
8080
* an exception will be thrown to indicate invalid content.
8181
*<p>
8282
* Default value is {@code false} (for backwards compatibility) meaning that
83-
* an invalide surrogate will result in exception ({@link IllegalArgumentException}
83+
* an invalid surrogate will result in an exception ({@link IllegalArgumentException}).
8484
*
8585
* @since 2.12
8686
*/

release-notes/VERSION-2.x

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@ Modules:
2020
(actual fix in `jackson-databind`)
2121
#272: (cbor) Uncaught exception in CBORParser._nextChunkedByte2 (by ossfuzzer)
2222
(reported by Fabian M)
23+
#276: (smile) Add `SmileGenerator.Feature.LENIENT_UTF_ENCODING` for lenient handling
24+
of broken Unicode surrogate pairs on writing
25+
(requested by kireet@github)
2326
- `Ion-java` dep 1.4.0 -> 1.8.0
2427
- Minor change to Ion module registration names (fully-qualified)
2528

smile/src/main/java/com/fasterxml/jackson/dataformat/smile/SmileGenerator.java

Lines changed: 127 additions & 92 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,20 @@ public enum Feature
8282
* this option is disabled by default, and should only be enabled if it is likely that
8383
* same values repeat relatively often.
8484
*/
85-
CHECK_SHARED_STRING_VALUES(false)
85+
CHECK_SHARED_STRING_VALUES(false),
86+
87+
/**
88+
* Feature that determines how an invalid surrogate encoding found in the
89+
* incoming String is handled: when enabled, it is silently output as the
90+
* Unicode 'REPLACEMENT CHARACTER' (U+FFFD); when disabled,
91+
* an exception will be thrown to indicate invalid content.
92+
*<p>
93+
* Default value is {@code false} (for backwards compatibility) meaning that
94+
* an invalid surrogate will result in an exception ({@link IllegalArgumentException}).
95+
*
96+
* @since 2.13
97+
*/
98+
LENIENT_UTF_ENCODING(false),
8699
;
87100

88101
protected final boolean _defaultState;
@@ -156,6 +169,14 @@ public SharedStringNode(String value, int index, SharedStringNode next)
156169
protected final static long MIN_INT_AS_LONG = (long) Integer.MIN_VALUE;
157170
protected final static long MAX_INT_AS_LONG = (long) Integer.MAX_VALUE;
158171

172+
/**
173+
* The replacement character to use to fix invalid Unicode sequences
174+
* (mismatched surrogate pair).
175+
*
176+
* @since 2.13
177+
*/
178+
protected final static int REPLACEMENT_CHAR = 0xfffd;
179+
159180
/*
160181
/**********************************************************
161182
/* Configuration
@@ -1929,22 +1950,19 @@ private final int _shortUTF8Encode2(char[] str, int i, int end, int outputPtr)
19291950
outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
19301951
continue;
19311952
}
1932-
// Yup, a surrogate pair
1933-
if (c > SURR1_LAST) { // must be from first range; second won't do
1934-
_throwIllegalSurrogate(c);
1935-
}
1936-
// ... meaning it must have a pair
1937-
if (i >= end) {
1938-
_throwIllegalSurrogate(c);
1939-
}
1940-
c = _convertSurrogate(c, str[i++]);
1941-
if (c > 0x10FFFF) { // illegal in JSON as well as in XML
1942-
_throwIllegalSurrogate(c);
1953+
// Yup, looks like a surrogate pair... but is it?
1954+
if ((c <= SURR1_LAST) && (i < end)) { // must be from first range and have another char
1955+
final int d = str[i];
1956+
if ((d <= SURR2_LAST) && (d >= SURR2_FIRST)) {
1957+
++i;
1958+
outputPtr = _decodeAndWriteSurrogate(c, d, outBuf, outputPtr);
1959+
continue;
1960+
}
1961+
outputPtr = _invalidSurrogateEnd(c, d, outBuf, outputPtr);
1962+
continue;
19431963
}
1944-
outBuf[outputPtr++] = (byte) (0xf0 | (c >> 18));
1945-
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
1946-
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
1947-
outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
1964+
// Nah, something wrong
1965+
outputPtr = _invalidSurrogateStart(c, outBuf, outputPtr);
19481966
}
19491967
int codedLen = outputPtr - _outputTail;
19501968
_outputTail = outputPtr;
@@ -1993,22 +2011,19 @@ private final int _shortUTF8Encode2(String str, int i, int end, int outputPtr)
19932011
outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
19942012
continue;
19952013
}
1996-
// Yup, a surrogate pair
1997-
if (c > SURR1_LAST) { // must be from first range; second won't do
1998-
_throwIllegalSurrogate(c);
1999-
}
2000-
// ... meaning it must have a pair
2001-
if (i >= end) {
2002-
_throwIllegalSurrogate(c);
2003-
}
2004-
c = _convertSurrogate(c, str.charAt(i++));
2005-
if (c > 0x10FFFF) { // illegal in JSON as well as in XML
2006-
_throwIllegalSurrogate(c);
2014+
// Yup, looks like a surrogate pair... but is it?
2015+
if ((c <= SURR1_LAST) && (i < end)) { // must be from first range and have another char
2016+
final int d = str.charAt(i);
2017+
if ((d <= SURR2_LAST) && (d >= SURR2_FIRST)) {
2018+
++i;
2019+
outputPtr = _decodeAndWriteSurrogate(c, d, outBuf, outputPtr);
2020+
continue;
2021+
}
2022+
outputPtr = _invalidSurrogateEnd(c, d, outBuf, outputPtr);
2023+
continue;
20072024
}
2008-
outBuf[outputPtr++] = (byte) (0xf0 | (c >> 18));
2009-
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
2010-
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
2011-
outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
2025+
// Nah, something wrong
2026+
outputPtr = _invalidSurrogateStart(c, outBuf, outputPtr);
20122027
}
20132028
int codedLen = outputPtr - _outputTail;
20142029
_outputTail = outputPtr;
@@ -2021,9 +2036,8 @@ private void _mediumUTF8Encode(char[] str, int inputPtr, int inputEnd) throws IO
20212036

20222037
output_loop:
20232038
while (inputPtr < inputEnd) {
2024-
/* First, let's ensure we can output at least 4 bytes
2025-
* (longest UTF-8 encoded codepoint):
2026-
*/
2039+
// First, let's ensure we can output at least 4 bytes
2040+
// (longest UTF-8 encoded codepoint):
20272041
if (_outputTail >= bufferEnd) {
20282042
_flushBuffer();
20292043
}
@@ -2064,22 +2078,19 @@ private void _mediumUTF8Encode(char[] str, int inputPtr, int inputEnd) throws IO
20642078
_outputBuffer[_outputTail++] = (byte) (0x80 | (c & 0x3f));
20652079
continue;
20662080
}
2067-
// Yup, a surrogate:
2068-
if (c > SURR1_LAST) { // must be from first range
2069-
_throwIllegalSurrogate(c);
2070-
}
2071-
// and if so, followed by another from next range
2072-
if (inputPtr >= inputEnd) {
2073-
_throwIllegalSurrogate(c);
2074-
}
2075-
c = _convertSurrogate(c, str[inputPtr++]);
2076-
if (c > 0x10FFFF) { // illegal, as per RFC 4627
2077-
_throwIllegalSurrogate(c);
2081+
// Yup, looks like a surrogate pair... but is it?
2082+
if ((c <= SURR1_LAST) && (inputPtr < inputEnd)) { // must be from first range and have another char
2083+
final int d = str[inputPtr];
2084+
if ((d <= SURR2_LAST) && (d >= SURR2_FIRST)) {
2085+
++inputPtr;
2086+
_outputTail = _decodeAndWriteSurrogate(c, d, _outputBuffer, _outputTail);
2087+
continue;
2088+
}
2089+
_outputTail = _invalidSurrogateEnd(c, d, _outputBuffer, _outputTail);
2090+
continue;
20782091
}
2079-
_outputBuffer[_outputTail++] = (byte) (0xf0 | (c >> 18));
2080-
_outputBuffer[_outputTail++] = (byte) (0x80 | ((c >> 12) & 0x3f));
2081-
_outputBuffer[_outputTail++] = (byte) (0x80 | ((c >> 6) & 0x3f));
2082-
_outputBuffer[_outputTail++] = (byte) (0x80 | (c & 0x3f));
2092+
// Nah, something wrong
2093+
_outputTail = _invalidSurrogateStart(c, _outputBuffer, _outputTail);
20832094
}
20842095
}
20852096
}
@@ -2090,9 +2101,8 @@ private void _mediumUTF8Encode(String str, int inputPtr, int inputEnd) throws IO
20902101

20912102
output_loop:
20922103
while (inputPtr < inputEnd) {
2093-
/* First, let's ensure we can output at least 4 bytes
2094-
* (longest UTF-8 encoded codepoint):
2095-
*/
2104+
// First, let's ensure we can output at least 4 bytes
2105+
// (longest UTF-8 encoded codepoint):
20962106
if (_outputTail >= bufferEnd) {
20972107
_flushBuffer();
20982108
}
@@ -2133,63 +2143,88 @@ private void _mediumUTF8Encode(String str, int inputPtr, int inputEnd) throws IO
21332143
_outputBuffer[_outputTail++] = (byte) (0x80 | (c & 0x3f));
21342144
continue;
21352145
}
2136-
// Yup, a surrogate:
2137-
if (c > SURR1_LAST) { // must be from first range
2138-
_throwIllegalSurrogate(c);
2139-
}
2140-
// and if so, followed by another from next range
2141-
if (inputPtr >= inputEnd) {
2142-
_throwIllegalSurrogate(c);
2143-
}
2144-
c = _convertSurrogate(c, str.charAt(inputPtr++));
2145-
if (c > 0x10FFFF) { // illegal, as per RFC 4627
2146-
_throwIllegalSurrogate(c);
2146+
// Yup, looks like a surrogate pair... but is it?
2147+
if ((c <= SURR1_LAST) && (inputPtr < inputEnd)) { // must be from first range and have another char
2148+
final int d = str.charAt(inputPtr);
2149+
if ((d <= SURR2_LAST) && (d >= SURR2_FIRST)) {
2150+
++inputPtr;
2151+
_outputTail = _decodeAndWriteSurrogate(c, d, _outputBuffer, _outputTail);
2152+
continue;
2153+
}
2154+
_outputTail = _invalidSurrogateEnd(c, d, _outputBuffer, _outputTail);
2155+
continue;
21472156
}
2148-
_outputBuffer[_outputTail++] = (byte) (0xf0 | (c >> 18));
2149-
_outputBuffer[_outputTail++] = (byte) (0x80 | ((c >> 12) & 0x3f));
2150-
_outputBuffer[_outputTail++] = (byte) (0x80 | ((c >> 6) & 0x3f));
2151-
_outputBuffer[_outputTail++] = (byte) (0x80 | (c & 0x3f));
2157+
// Nah, something wrong
2158+
_outputTail = _invalidSurrogateStart(c, _outputBuffer, _outputTail);
21522159
}
21532160
}
21542161
}
2155-
2156-
/**
2157-
* Method called to calculate UTF codepoint, from a surrogate pair.
2162+
2163+
/*
2164+
/**********************************************************************
2165+
/* Internal methods, surrogate pair handling
2166+
/**********************************************************************
21582167
*/
2159-
private int _convertSurrogate(int firstPart, int secondPart) throws IOException
2168+
2169+
private int _invalidSurrogateStart(int code, byte[] outBuf, int outputPtr)
2170+
throws IOException
21602171
{
2161-
// Ok, then, is the second part valid?
2162-
if (secondPart < SURR2_FIRST || secondPart > SURR2_LAST) {
2163-
String msg = String.format("Broken surrogate pair: first char 0x%04X, second 0x%04X; illegal combination",
2164-
firstPart, secondPart);
2165-
_reportError(msg);
2172+
if (isEnabled(Feature.LENIENT_UTF_ENCODING)) {
2173+
return _appendReplacementChar(outBuf, outputPtr);
2174+
}
2175+
// Will be called in two distinct cases: either first character is
2176+
// invalid (code range of second part), or first character is valid
2177+
// but there is no second part to encode
2178+
if (code <= SURR1_LAST) {
2179+
// Unmatched first part (closing without second part?)
2180+
_reportError(String.format(
2181+
"Unmatched surrogate pair, starts with valid high surrogate (0x%04X) but ends without low surrogate",
2182+
code));
21662183
}
2167-
return 0x10000 + ((firstPart - SURR1_FIRST) << 10) + (secondPart - SURR2_FIRST);
2184+
_reportError(String.format(
2185+
"Invalid surrogate pair, starts with invalid high surrogate (0x%04X), not in valid range [0xD800, 0xDBFF]",
2186+
code));
2187+
return 0; // never gets here
21682188
}
21692189

2170-
private void _throwIllegalSurrogate(int code) throws IOException
2190+
private int _invalidSurrogateEnd(int surr1, int surr2,
2191+
byte[] outBuf, int outputPtr)
2192+
throws IOException
21712193
{
2172-
if (code > 0x10FFFF) { // over max?
2173-
_reportError(String.format(
2174-
"Illegal character point (0x%X) to output; max is 0x10FFFF as per RFC 4627", code));
2175-
}
2176-
if (code >= SURR1_FIRST) {
2177-
if (code <= SURR1_LAST) { // Unmatched first part (closing without second part?)
2178-
_reportError(String.format(
2179-
"Unmatched first part of surrogate pair (0x%04X)", code));
2180-
}
2181-
_reportError(String.format(
2182-
"Unmatched second part of surrogate pair (0x%04X)", code));
2194+
if (isEnabled(Feature.LENIENT_UTF_ENCODING)) {
2195+
return _appendReplacementChar(outBuf, outputPtr);
21832196
}
2184-
// should we ever get this?
2185-
_reportError(String.format("Illegal character point (0x%X) to output", code));
2197+
_reportError(String.format(
2198+
"Invalid surrogate pair, starts with valid high surrogate (0x%04X)"
2199+
+" but ends with invalid low surrogate (0x%04X), not in valid range [0xDC00, 0xDFFF]",
2200+
surr1, surr2));
2201+
return 0; // never gets here
2202+
}
2203+
2204+
private int _appendReplacementChar(byte[] outBuf, int outputPtr) {
2205+
outBuf[outputPtr++] = (byte) (0xe0 | (REPLACEMENT_CHAR >> 12));
2206+
outBuf[outputPtr++] = (byte) (0x80 | ((REPLACEMENT_CHAR >> 6) & 0x3f));
2207+
outBuf[outputPtr++] = (byte) (0x80 | (REPLACEMENT_CHAR & 0x3f));
2208+
return outputPtr;
2209+
}
2210+
2211+
private int _decodeAndWriteSurrogate(int surr1, int surr2,
2212+
byte[] outBuf, int outputPtr)
2213+
{
2214+
final int c = 0x10000 + ((surr1 - SURR1_FIRST) << 10)
2215+
+ (surr2 - SURR2_FIRST);
2216+
outBuf[outputPtr++] = (byte) (0xf0 | (c >> 18));
2217+
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
2218+
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
2219+
outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
2220+
return outputPtr;
21862221
}
21872222

21882223
/*
21892224
/**********************************************************
21902225
/* Internal methods, writing bytes
21912226
/**********************************************************
2192-
*/
2227+
*/
21932228

21942229
private final void _ensureRoomForOutput(int needed) throws IOException
21952230
{

0 commit comments

Comments
 (0)