Added ICU charset conversion implementation #64

Open · wants to merge 10 commits into master

12 changes: 12 additions & 0 deletions .github/workflows/build.yml
@@ -16,6 +16,13 @@ jobs:
- '98'
- '11'
- '20'
encoding:
- ICONV
- ICU
exclude:
# Exclude ICU on C++98, as modern ICU library requires at least C++11
- cpp-standard: '98'
encoding: ICU
steps:
- uses: actions/checkout@v4
- name: Install GoogleTest
@@ -26,15 +33,20 @@
run: sudo apt-get update
- name: Install packages
run: sudo apt-get install -y iwyu valgrind
- name: Install ICU
run: sudo apt-get install -y libicu-dev
if: matrix.encoding == 'ICU'
- name: Build
env:
CPP_STANDARD: ${{ matrix.cpp-standard }}
ENCODING_TYPE: ${{ matrix.encoding }}
# This tells the C++ compiler to produce debugging info that Valgrind needs to report line numbers.
# See also https://valgrind.org/docs/manual/manual-core.html#manual-core.started
CMAKE_BUILD_TYPE: Debug
run: |
.build/build \
-DCMAKE_BUILD_TYPE="$CMAKE_BUILD_TYPE" \
-DSTRING_ENCODING_TYPE="$ENCODING_TYPE" \
-DCMAKE_CXX_STANDARD="$CPP_STANDARD" -DCMAKE_CXX_STANDARD_REQUIRED=ON -DCMAKE_CXX_EXTENSIONS=OFF \
-DCMAKE_CXX_INCLUDE_WHAT_YOU_USE='include-what-you-use;-Xiwyu;--verbose=3'
- name: Run tests
14 changes: 13 additions & 1 deletion CMakeLists.txt
@@ -10,6 +10,13 @@ set (CMAKE_INCLUDE_CURRENT_DIR ON)
find_package(ZLIB)
find_package(Iconv)

find_package(ICU COMPONENTS uc io)

set(ICU_FOUND FALSE)
if(ICU_INCLUDE_DIRS AND ICU_LIBRARIES)
SET(ICU_FOUND TRUE)
endif()

set (HEADERS
kaitai/kaitaistream.h
kaitai/kaitaistruct.h
@@ -20,7 +27,7 @@ set (SOURCES
kaitai/kaitaistream.cpp
)

set(STRING_ENCODING_TYPE "ICONV" CACHE STRING "Set the way strings have to be encoded (ICONV|WIN32API|NONE|...)")
set(STRING_ENCODING_TYPE "ICONV" CACHE STRING "Set the way strings have to be encoded (ICONV|WIN32API|ICU|NONE|...)")

set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)

@@ -37,6 +44,11 @@ if(Iconv_FOUND)
target_link_libraries(${PROJECT_NAME} PRIVATE Iconv::Iconv)
endif()

if(ICU_FOUND)
target_include_directories(${PROJECT_NAME} PRIVATE ${ICU_INCLUDE_DIRS})
target_link_libraries(${PROJECT_NAME} PRIVATE ${ICU_LIBRARIES})
endif()

include(Common.cmake)

install(TARGETS ${PROJECT_NAME}
2 changes: 2 additions & 0 deletions Common.cmake
@@ -2,6 +2,8 @@ if (STRING_ENCODING_TYPE STREQUAL "ICONV")
target_compile_definitions(${PROJECT_NAME} PRIVATE -DKS_STR_ENCODING_ICONV)
elseif (STRING_ENCODING_TYPE STREQUAL "WIN32API")
target_compile_definitions(${PROJECT_NAME} PRIVATE -DKS_STR_ENCODING_WIN32API)
elseif (STRING_ENCODING_TYPE STREQUAL "ICU")
target_compile_definitions(${PROJECT_NAME} PRIVATE -DKS_STR_ENCODING_ICU)
elseif (STRING_ENCODING_TYPE STREQUAL "NONE")
target_compile_definitions(${PROJECT_NAME} PRIVATE -DKS_STR_ENCODING_NONE)
else()
108 changes: 107 additions & 1 deletion kaitai/kaitaistream.cpp
@@ -1197,6 +1197,112 @@ std::string kaitai::kstream::bytes_to_str(const std::string src, int codepage) {
    return utf8;
}

#elif defined(KS_STR_ENCODING_ICU)
#include <unicode/ucnv.h>
#include <unicode/ustring.h>

std::string kaitai::kstream::bytes_to_str(const std::string src, const char *src_enc) {
    UErrorCode err = U_ZERO_ERROR;

    // Open the source converter
    UConverter* conv = ucnv_open(src_enc, &err);
    if (U_FAILURE(err)) {
        if (err == U_FILE_ACCESS_ERROR) {
            throw unknown_encoding(src_enc);
        }
        throw bytes_to_str_error(u_errorName(err));
    }

    // Open UTF-8 converter
    UConverter* utf8Conv = ucnv_open("UTF-8", &err);
    if (U_FAILURE(err)) {
        ucnv_close(conv);
        throw bytes_to_str_error(u_errorName(err));
    }

    // Configure source converter to stop on illegal sequences
    err = U_ZERO_ERROR;
@generalmimon (Member), Apr 14, 2025:

I don't think these err = U_ZERO_ERROR assignments are necessary, because we check the value of err after every call to an ICU function and throw an exception if (U_FAILURE(err)). Therefore, if we get past any if (U_FAILURE(err)) { ... } block, we know that err indicates success.

Strictly speaking, it might not be equal to U_ZERO_ERROR = 0, because negative err values are used for warnings and U_ZERO_ERROR is only used when there are no warnings. But warnings don't represent errors, so ICU functions will not bail out when they see one; only errors do. See the UErrorCode API docs:

Note: By convention, ICU functions that take a reference (C++) or a pointer (C) to a UErrorCode first test:

if (U_FAILURE(errorCode)) { return immediately; }

so that in a chain of such functions the first one that sets an error code causes the following ones to not perform any operations.

Note that U_FAILURE is defined like this - icu4c/source/common/unicode/utypes.h:713-717:

    /**
     * Does the error code indicate a failure?
     * @stable ICU 2.0
     */
#   define U_FAILURE(x) ((x)>U_ZERO_ERROR)

Bottom line, I'd remove them because they are unnecessary (to be clear, we still need to initialize the variable as UErrorCode err = U_ZERO_ERROR; at the beginning, but then we don't have to worry about it anymore).
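
To illustrate the convention described in this comment, here is a minimal sketch (not part of this diff; the converter name and the error handling are placeholders):

    #include <unicode/ucnv.h>

    UErrorCode err = U_ZERO_ERROR;
    UConverter* conv = ucnv_open("EUC-JP", &err);
    if (U_FAILURE(err)) {
        // handle the failure (throw, free resources, ...)
    }
    // No reset is needed here: even if err now holds a warning (a negative
    // value), U_FAILURE(err) is false, so the next ICU call runs normally.
    ucnv_setToUCallBack(conv, UCNV_TO_U_CALLBACK_STOP, nullptr, nullptr, nullptr, &err);
    if (U_FAILURE(err)) {
        ucnv_close(conv);
        // handle the failure
    }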

    ucnv_setToUCallBack(
        conv,
        UCNV_TO_U_CALLBACK_STOP,
        nullptr,
        nullptr,
        nullptr,
        &err);
    if (U_FAILURE(err)) {
        ucnv_close(conv);
        ucnv_close(utf8Conv);
        throw illegal_seq_in_encoding(u_errorName(err));
    }

    // Allocate buffer for UTF-16 intermediate representation
    const int32_t uniStrCapacity = UCNV_GET_MAX_BYTES_FOR_STRING(src.length(), ucnv_getMaxCharSize(conv));
@generalmimon (Member), Apr 6, 2025:

Is ucnv_getMaxCharSize(conv) guaranteed to be sufficient here? I'm not sure, because the documentation of ucnv_getMaxCharSize() says the following:

Returns the maximum number of bytes that are output per UChar in conversion from Unicode using this converter.

The returned number can be used with UCNV_GET_MAX_BYTES_FOR_STRING to calculate the size of a target buffer for conversion from Unicode.

Whereas here we use it for a target buffer for conversion to Unicode (UTF-16), which is the opposite.

The examples of returned values also illustrate the possible problem nicely:

Examples for returned values:

  • SBCS charsets: 1
  • Shift-JIS: 2
  • (...)

SBCS apparently stands for single-byte character set (I've never heard this acronym before). This makes sense: if we take ISO-8859-1 as an example, then any UChar (UTF-16 character, also an unsigned 16-bit integer) valid in this encoding will be converted to just a single byte. However, no character in this charset maps to just 1 byte in UTF-16; each one occupies a full 16-bit UChar.

It seems that this mismatch in meaning wasn't caught by our tests because you're also using UCNV_GET_MAX_BYTES_FOR_STRING incorrectly (in a way that doesn't agree with the documentation):

Calculates the size of a buffer for conversion from Unicode to a charset.

The calculated size is guaranteed to be sufficient for this conversion.

Parameters:
  length: Number of UChars to be converted.
  maxCharSize: Return value from ucnv_getMaxCharSize() for the converter that will be used.

So the first parameter of the macro is length, which should be the number of UChars, i.e. UTF-16 code units or unsigned 16-bit integers. However, you pass src.length(), which is the number of bytes.

A member replied:

According to the docs for ucnv_toUChars():

The maximum output buffer capacity required (barring output from callbacks) will be 2*srcLength (each char may be converted into a surrogate pair).

So we should probably use this instead.

@generalmimon (Member), Apr 13, 2025:

Hmm, after thinking about it for a while, I've convinced myself that this calculated capacity should (coincidentally) always be enough, even if the calculation is logically incorrect and should definitely be fixed.

In theory, the only scenario in which it could underallocate memory would be for a SBCS (single-byte character set) with some character outside the BMP (Basic Multilingual Plane) - then src.length() would be 1 (assuming that a 1-character string with that one character is converted) and ucnv_getMaxCharSize(conv) would also be 1 (since it is a SBCS), so uniStrCapacity would be 1 and thus the uniStr array would only have space for 1 UChar (16-bit code unit). But since all Unicode characters outside the BMP (i.e. supplementary characters, U+10000 and greater) require 2 UChars to be represented in UTF-16 (see https://en.wikipedia.org/wiki/UTF-16#Code_points_from_U+010000_to_U+10FFFF), this would lead to an error.

However, such SBCS seemingly doesn't exist, because BMP pretty much covers all the common characters that any sane (real-world) SBCS would use. The most popular supplementary characters (U+10000 and above) are emojis, but of course there is no real-world SBCS with emojis, since all SBCSs predate emojis by a lot.

So underallocation isn't a problem here, but overallocation by a certain factor is quite likely. For example, if we're decoding a UTF-8 string, then ucnv_getMaxCharSize(conv) returns 3 - see ucnv_getMaxCharSize() docs:

Examples for returned values:

  • (...)
  • UTF-8: 3 (3 per BMP, 4 per surrogate pair)

So uniStr will be allocated with a capacity of at least 3 * src.length(), while 2 * src.length() is already guaranteed to be enough. As mentioned above, each character of any encoding could be converted into a surrogate pair (a pair of two 16-bit code units, i.e. 2 UChars) in the worst case, but it doesn't get any worse.
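
A minimal sketch of the allocation this thread suggests (illustrative only; it is not the code in this diff):

    // Per the ucnv_toUChars() docs, a capacity of 2 * srcLength UChars is
    // always sufficient: each input character may become a surrogate pair,
    // and every character occupies at least one input byte.
    const int32_t uniStrCapacity = static_cast<int32_t>(src.length()) * 2;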

    UChar* uniStr = new UChar[uniStrCapacity];

    // Convert from source encoding to UTF-16
    err = U_ZERO_ERROR;
    int32_t uniLength = ucnv_toUChars(
        conv,
        uniStr,
        uniStrCapacity,
        src.c_str(),
        src.length(),
        &err);
    if (U_FAILURE(err)) {
        delete[] uniStr;
        ucnv_close(conv);
        ucnv_close(utf8Conv);
        throw illegal_seq_in_encoding(u_errorName(err));
A member commented:

This is the only place where illegal_seq_in_encoding is thrown when running our test suite, and the only place where it actually makes sense to throw it. It makes no sense when handling potential errors from ucnv_setToUCallBack, ucnv_setFromUCallBack or ucnv_fromUChars - if these fail, it doesn't indicate an illegal sequence in the input string (instead, it would indicate a bug in our bytes_to_str implementation, as none of these should normally fail regardless of the user input).

Therefore, throw illegal_seq_in_encoding should only remain here and throw bytes_to_str_error should be used everywhere else.
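
A sketch of that suggestion applied to one of the setup error paths (illustrative; compared to the diff, only the exception type changes):

    ucnv_setFromUCallBack(utf8Conv, UCNV_FROM_U_CALLBACK_STOP, nullptr, nullptr, nullptr, &err);
    if (U_FAILURE(err)) {
        delete[] uniStr;
        ucnv_close(conv);
        ucnv_close(utf8Conv);
        // A failure here would point to a bug in bytes_to_str itself, not to
        // an illegal sequence in the user's input, so report a generic error:
        throw bytes_to_str_error(u_errorName(err));
    }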

    }

    // Configure target converter to stop on illegal sequences
    err = U_ZERO_ERROR;
    ucnv_setFromUCallBack(
        utf8Conv,
        UCNV_FROM_U_CALLBACK_STOP,
        nullptr,
        nullptr,
        nullptr,
        &err);
    if (U_FAILURE(err)) {
        delete[] uniStr;
        ucnv_close(conv);
        ucnv_close(utf8Conv);
        throw illegal_seq_in_encoding(u_errorName(err));
    }

    // Allocate buffer for UTF-8 output
    const int32_t dstCapacity = UCNV_GET_MAX_BYTES_FOR_STRING(uniLength, ucnv_getMaxCharSize(utf8Conv));
    char* dst = new char[dstCapacity];

    // Convert from UTF-16 to UTF-8
    err = U_ZERO_ERROR;
    int32_t outputLength = ucnv_fromUChars(
        utf8Conv,
        dst,
        dstCapacity,
        uniStr,
        uniLength,
        &err);
    if (U_FAILURE(err)) {
        delete[] uniStr;
        delete[] dst;
        ucnv_close(conv);
        ucnv_close(utf8Conv);
        throw illegal_seq_in_encoding(u_errorName(err));
    }

    // Create result string
    std::string result(dst, outputLength);

    // Clean up
    delete[] uniStr;
    delete[] dst;
    ucnv_close(conv);
    ucnv_close(utf8Conv);

    return result;
}
#else
#error Need to decide how to handle strings: please define one of: KS_STR_ENCODING_ICONV, KS_STR_ENCODING_WIN32API, KS_STR_ENCODING_NONE
#error Need to decide how to handle strings: please define one of: KS_STR_ENCODING_ICONV, KS_STR_ENCODING_WIN32API, KS_STR_ENCODING_ICU, KS_STR_ENCODING_NONE
#endif
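
For reference, a hypothetical call of the new code path, assuming the runtime was configured with -DSTRING_ENCODING_TYPE=ICU (the sample bytes are the Shift_JIS encoding of U+3042):

    #include <kaitai/kaitaistream.h>
    #include <string>

    // Decodes two Shift_JIS bytes into the UTF-8 string "\xE3\x81\x82".
    std::string utf8 = kaitai::kstream::bytes_to_str(std::string("\x82\xA0", 2), "SJIS");
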
10 changes: 10 additions & 0 deletions tests/unittest.cpp
@@ -540,6 +540,8 @@ TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_euc_jp_too_short)
EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: EINVAL"));
#elif defined(KS_STR_ENCODING_WIN32API)
EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: MultiByteToWideChar"));
#elif defined(KS_STR_ENCODING_ICU)
EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: U_TRUNCATED_CHAR_FOUND"));
#else
#error Unknown KS_STR_ENCODING
#endif
@@ -556,6 +558,8 @@ TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_gb2312_too_short)
EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: EINVAL"));
#elif defined(KS_STR_ENCODING_WIN32API)
EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: MultiByteToWideChar"));
#elif defined(KS_STR_ENCODING_ICU)
EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: U_TRUNCATED_CHAR_FOUND"));
#else
#error Unknown KS_STR_ENCODING
#endif
@@ -581,6 +585,8 @@ TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_gb2312_two_bytes)
EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: EILSEQ"));
#elif defined(KS_STR_ENCODING_WIN32API)
EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: MultiByteToWideChar"));
#elif defined(KS_STR_ENCODING_ICU)
EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: U_ILLEGAL_CHAR_FOUND"));
#else
#error Unknown KS_STR_ENCODING
#endif
@@ -598,6 +604,8 @@ TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_utf16le_odd_bytes)
EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: EINVAL"));
#elif defined(KS_STR_ENCODING_WIN32API)
EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: incomplete"));
#elif defined(KS_STR_ENCODING_ICU)
EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: U_TRUNCATED_CHAR_FOUND"));
#else
#error Unknown KS_STR_ENCODING
#endif
@@ -616,6 +624,8 @@ TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_utf16le_incomplete_high_surrogat
EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: EINVAL"));
#elif defined(KS_STR_ENCODING_WIN32API)
EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: WideCharToMultiByte"));
#elif defined(KS_STR_ENCODING_ICU)
EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: U_TRUNCATED_CHAR_FOUND"));
#else
#error Unknown KS_STR_ENCODING
#endif