@@ -82,7 +82,20 @@ public enum Feature
82
82
* this option is disabled by default, and should only be enabled if it is likely that
83
83
* same values repeat relatively often.
84
84
*/
85
- CHECK_SHARED_STRING_VALUES (false )
85
+ CHECK_SHARED_STRING_VALUES (false ),
86
+
87
+ /**
88
+ * Feature that determines how an invalid surrogate encoding found in the
89
+ * incoming String is handled: if enabled, it is silently output
90
+ * as the Unicode 'REPLACEMENT CHARACTER' (U+FFFD); if disabled,
91
+ * an exception will be thrown to indicate invalid content.
92
+ *<p>
93
+ * Default value is {@code false} (for backwards compatibility) meaning that
94
+ * an invalid surrogate will result in an exception ({@link IllegalArgumentException}).
95
+ *
96
+ * @since 2.13
97
+ */
98
+ LENIENT_UTF_ENCODING (false ),
86
99
;
87
100
88
101
protected final boolean _defaultState ;
@@ -156,6 +169,14 @@ public SharedStringNode(String value, int index, SharedStringNode next)
156
169
protected final static long MIN_INT_AS_LONG = (long ) Integer .MIN_VALUE ;
157
170
protected final static long MAX_INT_AS_LONG = (long ) Integer .MAX_VALUE ;
158
171
172
+ /**
173
+ * The replacement character to use to fix invalid Unicode sequences
174
+ * (mismatched surrogate pair).
175
+ *
176
+ * @since 2.13
177
+ */
178
+ protected final static int REPLACEMENT_CHAR = 0xfffd ;
179
+
159
180
/*
160
181
/**********************************************************
161
182
/* Configuration
@@ -1929,22 +1950,19 @@ private final int _shortUTF8Encode2(char[] str, int i, int end, int outputPtr)
1929
1950
outBuf [outputPtr ++] = (byte ) (0x80 | (c & 0x3f ));
1930
1951
continue ;
1931
1952
}
1932
- // Yup, a surrogate pair
1933
- if (c > SURR1_LAST ) { // must be from first range; second won't do
1934
- _throwIllegalSurrogate (c );
1935
- }
1936
- // ... meaning it must have a pair
1937
- if (i >= end ) {
1938
- _throwIllegalSurrogate (c );
1939
- }
1940
- c = _convertSurrogate (c , str [i ++]);
1941
- if (c > 0x10FFFF ) { // illegal in JSON as well as in XML
1942
- _throwIllegalSurrogate (c );
1953
+ // Yup, looks like a surrogate pair... but is it?
1954
+ if ((c <= SURR1_LAST ) && (i < end )) { // must be from first range and have another char
1955
+ final int d = str [i ];
1956
+ if ((d <= SURR2_LAST ) && (d >= SURR2_FIRST )) {
1957
+ ++i ;
1958
+ outputPtr = _decodeAndWriteSurrogate (c , d , outBuf , outputPtr );
1959
+ continue ;
1960
+ }
1961
+ outputPtr = _invalidSurrogateEnd (c , d , outBuf , outputPtr );
1962
+ continue ;
1943
1963
}
1944
- outBuf [outputPtr ++] = (byte ) (0xf0 | (c >> 18 ));
1945
- outBuf [outputPtr ++] = (byte ) (0x80 | ((c >> 12 ) & 0x3f ));
1946
- outBuf [outputPtr ++] = (byte ) (0x80 | ((c >> 6 ) & 0x3f ));
1947
- outBuf [outputPtr ++] = (byte ) (0x80 | (c & 0x3f ));
1964
+ // Nah, something wrong
1965
+ outputPtr = _invalidSurrogateStart (c , outBuf , outputPtr );
1948
1966
}
1949
1967
int codedLen = outputPtr - _outputTail ;
1950
1968
_outputTail = outputPtr ;
@@ -1993,22 +2011,19 @@ private final int _shortUTF8Encode2(String str, int i, int end, int outputPtr)
1993
2011
outBuf [outputPtr ++] = (byte ) (0x80 | (c & 0x3f ));
1994
2012
continue ;
1995
2013
}
1996
- // Yup, a surrogate pair
1997
- if (c > SURR1_LAST ) { // must be from first range; second won't do
1998
- _throwIllegalSurrogate (c );
1999
- }
2000
- // ... meaning it must have a pair
2001
- if (i >= end ) {
2002
- _throwIllegalSurrogate (c );
2003
- }
2004
- c = _convertSurrogate (c , str .charAt (i ++));
2005
- if (c > 0x10FFFF ) { // illegal in JSON as well as in XML
2006
- _throwIllegalSurrogate (c );
2014
+ // Yup, looks like a surrogate pair... but is it?
2015
+ if ((c <= SURR1_LAST ) && (i < end )) { // must be from first range and have another char
2016
+ final int d = str .charAt (i );
2017
+ if ((d <= SURR2_LAST ) && (d >= SURR2_FIRST )) {
2018
+ ++i ;
2019
+ outputPtr = _decodeAndWriteSurrogate (c , d , outBuf , outputPtr );
2020
+ continue ;
2021
+ }
2022
+ outputPtr = _invalidSurrogateEnd (c , d , outBuf , outputPtr );
2023
+ continue ;
2007
2024
}
2008
- outBuf [outputPtr ++] = (byte ) (0xf0 | (c >> 18 ));
2009
- outBuf [outputPtr ++] = (byte ) (0x80 | ((c >> 12 ) & 0x3f ));
2010
- outBuf [outputPtr ++] = (byte ) (0x80 | ((c >> 6 ) & 0x3f ));
2011
- outBuf [outputPtr ++] = (byte ) (0x80 | (c & 0x3f ));
2025
+ // Nah, something wrong
2026
+ outputPtr = _invalidSurrogateStart (c , outBuf , outputPtr );
2012
2027
}
2013
2028
int codedLen = outputPtr - _outputTail ;
2014
2029
_outputTail = outputPtr ;
@@ -2021,9 +2036,8 @@ private void _mediumUTF8Encode(char[] str, int inputPtr, int inputEnd) throws IO
2021
2036
2022
2037
output_loop :
2023
2038
while (inputPtr < inputEnd ) {
2024
- /* First, let's ensure we can output at least 4 bytes
2025
- * (longest UTF-8 encoded codepoint):
2026
- */
2039
+ // First, let's ensure we can output at least 4 bytes
2040
+ // (longest UTF-8 encoded codepoint):
2027
2041
if (_outputTail >= bufferEnd ) {
2028
2042
_flushBuffer ();
2029
2043
}
@@ -2064,22 +2078,19 @@ private void _mediumUTF8Encode(char[] str, int inputPtr, int inputEnd) throws IO
2064
2078
_outputBuffer [_outputTail ++] = (byte ) (0x80 | (c & 0x3f ));
2065
2079
continue ;
2066
2080
}
2067
- // Yup, a surrogate:
2068
- if (c > SURR1_LAST ) { // must be from first range
2069
- _throwIllegalSurrogate (c );
2070
- }
2071
- // and if so, followed by another from next range
2072
- if (inputPtr >= inputEnd ) {
2073
- _throwIllegalSurrogate (c );
2074
- }
2075
- c = _convertSurrogate (c , str [inputPtr ++]);
2076
- if (c > 0x10FFFF ) { // illegal, as per RFC 4627
2077
- _throwIllegalSurrogate (c );
2081
+ // Yup, looks like a surrogate pair... but is it?
2082
+ if ((c <= SURR1_LAST ) && (inputPtr < inputEnd )) { // must be from first range and have another char
2083
+ final int d = str [inputPtr ];
2084
+ if ((d <= SURR2_LAST ) && (d >= SURR2_FIRST )) {
2085
+ ++inputPtr ;
2086
+ _outputTail = _decodeAndWriteSurrogate (c , d , _outputBuffer , _outputTail );
2087
+ continue ;
2088
+ }
2089
+ _outputTail = _invalidSurrogateEnd (c , d , _outputBuffer , _outputTail );
2090
+ continue ;
2078
2091
}
2079
- _outputBuffer [_outputTail ++] = (byte ) (0xf0 | (c >> 18 ));
2080
- _outputBuffer [_outputTail ++] = (byte ) (0x80 | ((c >> 12 ) & 0x3f ));
2081
- _outputBuffer [_outputTail ++] = (byte ) (0x80 | ((c >> 6 ) & 0x3f ));
2082
- _outputBuffer [_outputTail ++] = (byte ) (0x80 | (c & 0x3f ));
2092
+ // Nah, something wrong
2093
+ _outputTail = _invalidSurrogateStart (c , _outputBuffer , _outputTail );
2083
2094
}
2084
2095
}
2085
2096
}
@@ -2090,9 +2101,8 @@ private void _mediumUTF8Encode(String str, int inputPtr, int inputEnd) throws IO
2090
2101
2091
2102
output_loop :
2092
2103
while (inputPtr < inputEnd ) {
2093
- /* First, let's ensure we can output at least 4 bytes
2094
- * (longest UTF-8 encoded codepoint):
2095
- */
2104
+ // First, let's ensure we can output at least 4 bytes
2105
+ // (longest UTF-8 encoded codepoint):
2096
2106
if (_outputTail >= bufferEnd ) {
2097
2107
_flushBuffer ();
2098
2108
}
@@ -2133,63 +2143,88 @@ private void _mediumUTF8Encode(String str, int inputPtr, int inputEnd) throws IO
2133
2143
_outputBuffer [_outputTail ++] = (byte ) (0x80 | (c & 0x3f ));
2134
2144
continue ;
2135
2145
}
2136
- // Yup, a surrogate:
2137
- if (c > SURR1_LAST ) { // must be from first range
2138
- _throwIllegalSurrogate (c );
2139
- }
2140
- // and if so, followed by another from next range
2141
- if (inputPtr >= inputEnd ) {
2142
- _throwIllegalSurrogate (c );
2143
- }
2144
- c = _convertSurrogate (c , str .charAt (inputPtr ++));
2145
- if (c > 0x10FFFF ) { // illegal, as per RFC 4627
2146
- _throwIllegalSurrogate (c );
2146
+ // Yup, looks like a surrogate pair... but is it?
2147
+ if ((c <= SURR1_LAST ) && (inputPtr < inputEnd )) { // must be from first range and have another char
2148
+ final int d = str .charAt (inputPtr );
2149
+ if ((d <= SURR2_LAST ) && (d >= SURR2_FIRST )) {
2150
+ ++inputPtr ;
2151
+ _outputTail = _decodeAndWriteSurrogate (c , d , _outputBuffer , _outputTail );
2152
+ continue ;
2153
+ }
2154
+ _outputTail = _invalidSurrogateEnd (c , d , _outputBuffer , _outputTail );
2155
+ continue ;
2147
2156
}
2148
- _outputBuffer [_outputTail ++] = (byte ) (0xf0 | (c >> 18 ));
2149
- _outputBuffer [_outputTail ++] = (byte ) (0x80 | ((c >> 12 ) & 0x3f ));
2150
- _outputBuffer [_outputTail ++] = (byte ) (0x80 | ((c >> 6 ) & 0x3f ));
2151
- _outputBuffer [_outputTail ++] = (byte ) (0x80 | (c & 0x3f ));
2157
+ // Nah, something wrong
2158
+ _outputTail = _invalidSurrogateStart (c , _outputBuffer , _outputTail );
2152
2159
}
2153
2160
}
2154
2161
}
2155
-
2156
- /**
2157
- * Method called to calculate UTF codepoint, from a surrogate pair.
2162
+
2163
+ /*
2164
+ /**********************************************************************
2165
+ /* Internal methods, surrogate pair handling
2166
+ /**********************************************************************
2158
2167
*/
2159
- private int _convertSurrogate (int firstPart , int secondPart ) throws IOException
2168
+
2169
+ private int _invalidSurrogateStart (int code , byte [] outBuf , int outputPtr )
2170
+ throws IOException
2160
2171
{
2161
- // Ok, then, is the second part valid?
2162
- if (secondPart < SURR2_FIRST || secondPart > SURR2_LAST ) {
2163
- String msg = String .format ("Broken surrogate pair: first char 0x%04X, second 0x%04X; illegal combination" ,
2164
- firstPart , secondPart );
2165
- _reportError (msg );
2172
+ if (isEnabled (Feature .LENIENT_UTF_ENCODING )) {
2173
+ return _appendReplacementChar (outBuf , outputPtr );
2174
+ }
2175
+ // Will be called in two distinct cases: either first character is
2176
+ // invalid (code range of second part), or first character is valid
2177
+ // but there is no second part to encode
2178
+ if (code <= SURR1_LAST ) {
2179
+ // Unmatched first part (closing without second part?)
2180
+ _reportError (String .format (
2181
+ "Unmatched surrogate pair, starts with valid high surrogate (0x%04X) but ends without low surrogate" ,
2182
+ code ));
2166
2183
}
2167
- return 0x10000 + ((firstPart - SURR1_FIRST ) << 10 ) + (secondPart - SURR2_FIRST );
2184
+ _reportError (String .format (
2185
+ "Invalid surrogate pair, starts with invalid high surrogate (0x%04X), not in valid range [0xD800, 0xDBFF]" ,
2186
+ code ));
2187
+ return 0 ; // never gets here
2168
2188
}
2169
2189
2170
- private void _throwIllegalSurrogate (int code ) throws IOException
2190
+ private int _invalidSurrogateEnd (int surr1 , int surr2 ,
2191
+ byte [] outBuf , int outputPtr )
2192
+ throws IOException
2171
2193
{
2172
- if (code > 0x10FFFF ) { // over max?
2173
- _reportError (String .format (
2174
- "Illegal character point (0x%X) to output; max is 0x10FFFF as per RFC 4627" , code ));
2175
- }
2176
- if (code >= SURR1_FIRST ) {
2177
- if (code <= SURR1_LAST ) { // Unmatched first part (closing without second part?)
2178
- _reportError (String .format (
2179
- "Unmatched first part of surrogate pair (0x%04X)" , code ));
2180
- }
2181
- _reportError (String .format (
2182
- "Unmatched second part of surrogate pair (0x%04X)" , code ));
2194
+ if (isEnabled (Feature .LENIENT_UTF_ENCODING )) {
2195
+ return _appendReplacementChar (outBuf , outputPtr );
2183
2196
}
2184
- // should we ever get this?
2185
- _reportError (String .format ("Illegal character point (0x%X) to output" , code ));
2197
+ _reportError (String .format (
2198
+ "Invalid surrogate pair, starts with valid high surrogate (0x%04X)"
2199
+ +" but ends with invalid low surrogate (0x%04X), not in valid range [0xDC00, 0xDFFF]" ,
2200
+ surr1 , surr2 ));
2201
+ return 0 ; // never gets here
2202
+ }
2203
+
2204
+ private int _appendReplacementChar (byte [] outBuf , int outputPtr ) { // writes REPLACEMENT_CHAR as a 3-byte UTF-8 sequence starting at outputPtr; caller must have ensured room
2205
+ outBuf [outputPtr ++] = (byte ) (0xe0 | (REPLACEMENT_CHAR >> 12 )); // lead byte 1110xxxx: top 4 bits (0xEF for 0xFFFD)
2206
+ outBuf [outputPtr ++] = (byte ) (0x80 | ((REPLACEMENT_CHAR >> 6 ) & 0x3f )); // continuation 10xxxxxx: middle 6 bits (0xBF)
2207
+ outBuf [outputPtr ++] = (byte ) (0x80 | (REPLACEMENT_CHAR & 0x3f )); // continuation 10xxxxxx: low 6 bits (0xBD)
2208
+ return outputPtr ; // index just past the last byte written
2209
+ }
2210
+
2211
+ private int _decodeAndWriteSurrogate (int surr1 , int surr2 , // surr1: first (high) surrogate, surr2: second (low) surrogate; callers range-check both before calling
2212
+ byte [] outBuf , int outputPtr ) // outBuf/outputPtr: destination buffer and write position; no bounds check is done here
2213
+ {
2214
+ final int c = 0x10000 + ((surr1 - SURR1_FIRST ) << 10 ) // combine the pair: offset of the first part shifted up by 10 bits ...
2215
+ + (surr2 - SURR2_FIRST ); // ... plus offset of the second part, on top of the 0x10000 base
2216
+ outBuf [outputPtr ++] = (byte ) (0xf0 | (c >> 18 )); // lead byte 11110xxx of a 4-byte UTF-8 sequence
2217
+ outBuf [outputPtr ++] = (byte ) (0x80 | ((c >> 12 ) & 0x3f )); // continuation: bits 12-17
2218
+ outBuf [outputPtr ++] = (byte ) (0x80 | ((c >> 6 ) & 0x3f )); // continuation: bits 6-11
2219
+ outBuf [outputPtr ++] = (byte ) (0x80 | (c & 0x3f )); // continuation: bits 0-5
2220
+ return outputPtr ; // index just past the 4 bytes written
2186
2221
}
2187
2222
2188
2223
/*
2189
2224
/**********************************************************
2190
2225
/* Internal methods, writing bytes
2191
2226
/**********************************************************
2192
- */
2227
+ */
2193
2228
2194
2229
private final void _ensureRoomForOutput (int needed ) throws IOException
2195
2230
{
0 commit comments