From 8809cd511a445379ced4d0b67ea6c38290e3c661 Mon Sep 17 00:00:00 2001 From: Mikhail Yakshin Date: Mon, 24 Jul 2023 12:11:53 +0100 Subject: [PATCH 1/8] Added ICU charset conversion implementation --- .build/build | 2 +- CMakeLists.txt | 16 +++++++++++++-- Common.cmake | 2 ++ kaitai/kaitaistream.cpp | 44 ++++++++++++++++++++++++++++++++++++++++- tests/unittest.cpp | 12 ++++++++++- 5 files changed, 71 insertions(+), 5 deletions(-) diff --git a/.build/build b/.build/build index e1109b4..b832c5c 100755 --- a/.build/build +++ b/.build/build @@ -4,5 +4,5 @@ cd "$(dirname "$0")"/.. mkdir -p build cd build -cmake .. +cmake -DSTRING_ENCODING_TYPE="$ENCODING_TYPE" .. cmake --build . diff --git a/CMakeLists.txt b/CMakeLists.txt index 4c054fc..dc4c535 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,6 +7,13 @@ set (CMAKE_INCLUDE_CURRENT_DIR ON) find_package(ZLIB) find_package(Iconv) +find_package(ICU COMPONENTS uc io) + +set(ICU_FOUND FALSE) +if(ICU_INCLUDE_DIRS AND ICU_LIBRARIES) + SET(ICU_FOUND TRUE) +endif() + set (HEADERS kaitai/kaitaistream.h kaitai/kaitaistruct.h @@ -17,11 +24,11 @@ set (SOURCES kaitai/kaitaistream.cpp ) -set(STRING_ENCODING_TYPE "ICONV" CACHE STRING "Set the way strings have to be encoded (ICONV|WIN32API|NONE|...)") +set(STRING_ENCODING_TYPE "ICONV" CACHE STRING "Set the way strings have to be encoded (ICONV|WIN32API|ICU|NONE|...)") set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) -add_library (${PROJECT_NAME} SHARED ${HEADERS} ${SOURCES}) +add_library(${PROJECT_NAME} SHARED ${HEADERS} ${SOURCES}) set_property(TARGET ${PROJECT_NAME} PROPERTY PUBLIC_HEADER ${HEADERS}) if (ZLIB_FOUND) @@ -33,6 +40,11 @@ if(Iconv_FOUND) target_link_libraries(${PROJECT_NAME} PRIVATE Iconv::Iconv) endif() +if(ICU_FOUND) + target_include_directories(${PROJECT_NAME} PRIVATE ${ICU_INCLUDE_DIRS}) + target_link_libraries(${PROJECT_NAME} PRIVATE ${ICU_LIBRARIES}) +endif() + include(Common.cmake) install(TARGETS ${PROJECT_NAME} diff --git a/Common.cmake b/Common.cmake index 9c280ee..3c32b11 100644 --- a/Common.cmake +++ b/Common.cmake @@ -2,6 +2,8 @@ if (STRING_ENCODING_TYPE STREQUAL "ICONV") target_compile_definitions(${PROJECT_NAME} PRIVATE -DKS_STR_ENCODING_ICONV) elseif (STRING_ENCODING_TYPE STREQUAL "WIN32API") target_compile_definitions(${PROJECT_NAME} PRIVATE -DKS_STR_ENCODING_WIN32API) +elseif (STRING_ENCODING_TYPE STREQUAL "ICU") + target_compile_definitions(${PROJECT_NAME} PRIVATE -DKS_STR_ENCODING_ICU) elseif (STRING_ENCODING_TYPE STREQUAL "NONE") target_compile_definitions(${PROJECT_NAME} PRIVATE -DKS_STR_ENCODING_NONE) else() diff --git a/kaitai/kaitaistream.cpp b/kaitai/kaitaistream.cpp index f3d95eb..94098c5 100644 --- a/kaitai/kaitaistream.cpp +++ b/kaitai/kaitaistream.cpp @@ -872,6 +872,48 @@ std::string kaitai::kstream::bytes_to_str(const std::string src, int codepage) { return utf8; } +#elif defined(KS_STR_ENCODING_ICU) +#include +#include + +std::string kaitai::kstream::bytes_to_str(const std::string src, const char *src_enc) { + // Start with a buffer length of double the source length. + size_t init_dst_len = src.length() * 2; + std::string dst(init_dst_len, ' '); + + UErrorCode err = U_ZERO_ERROR; + int32_t dst_len = ucnv_convert(KS_STR_DEFAULT_ENCODING, src_enc, &dst[0], init_dst_len, src.c_str(), src.length(), &err); + + if (err == U_BUFFER_OVERFLOW_ERROR) { + // We need a bigger buffer, but at least we know how much space exactly we need now + dst.resize(dst_len, ' '); + + // Try again with the new buffer + err = U_ZERO_ERROR; + dst_len = ucnv_convert(KS_STR_DEFAULT_ENCODING, src_enc, &dst[0], dst_len, src.c_str(), src.length(), &err); + } else if (!U_FAILURE(err)) { + // Conversion succeed from the first try, shrink the buffer to fit + dst.resize(dst_len); + } + + std::cout << "err = " << err << std::endl; + // Dump all bytes of result + for (int i = 0; i < dst_len; i++) { + std::cout << std::hex << (int)(uint8_t)dst[i] << " "; + } + std::cout << "\n"; + + if (U_FAILURE(err)) { + // Conversion failed + if (err == U_FILE_ACCESS_ERROR) { + throw unknown_encoding(src_enc); + } else { + throw bytes_to_str_error(u_errorName(err)); + } + } + + return dst; +} #else -#error Need to decide how to handle strings: please define one of: KS_STR_ENCODING_ICONV, KS_STR_ENCODING_WIN32API, KS_STR_ENCODING_NONE +#error Need to decide how to handle strings: please define one of: KS_STR_ENCODING_ICONV, KS_STR_ENCODING_WIN32API, KS_STR_ENCODING_ICU, KS_STR_ENCODING_NONE #endif diff --git a/tests/unittest.cpp b/tests/unittest.cpp index 0209ae5..85045dd 100644 --- a/tests/unittest.cpp +++ b/tests/unittest.cpp @@ -239,7 +239,7 @@ TEST(KaitaiStreamTest, bytes_to_str_big_dest) { // Prepare a string in IBM437 that is reasonably big, fill it with U+2248 ALMOST EQUAL TO character, // which is just 1 byte 0xFB in IBM437. - const int len = 10000000; + const int len = 10; std::string src(len, '\xF7'); std::string res = kaitai::kstream::bytes_to_str(src, "IBM437"); @@ -274,6 +274,8 @@ TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_euc_jp_too_short) EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: EINVAL")); #elif defined(KS_STR_ENCODING_WIN32API) EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: MultiByteToWideChar")); +#elif defined(KS_STR_ENCODING_ICU) + EXPECT_EQ(e.what(), std::string("xxx")); #else #error Unknown KS_STR_ENCODING #endif @@ -291,6 +293,8 @@ TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_gb2312_too_short) EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: EINVAL")); #elif defined(KS_STR_ENCODING_WIN32API) EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: MultiByteToWideChar")); +#elif defined(KS_STR_ENCODING_ICU) + EXPECT_EQ(e.what(), std::string("xxx")); #else #error Unknown KS_STR_ENCODING #endif @@ -307,6 +311,8 @@ TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_gb2312_two_bytes) EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: EILSEQ")); #elif defined(KS_STR_ENCODING_WIN32API) EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: MultiByteToWideChar")); +#elif defined(KS_STR_ENCODING_ICU) + EXPECT_EQ(e.what(), std::string("xxx")); #else #error Unknown KS_STR_ENCODING #endif @@ -324,6 +330,8 @@ TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_utf_16le_odd_bytes) EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: EINVAL")); #elif defined(KS_STR_ENCODING_WIN32API) EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: incomplete")); +#elif defined(KS_STR_ENCODING_ICU) + EXPECT_EQ(e.what(), std::string("xxx")); #else #error Unknown KS_STR_ENCODING #endif @@ -342,6 +350,8 @@ TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_utf_16le_incomplete_high_surroga EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: EINVAL")); #elif defined(KS_STR_ENCODING_WIN32API) EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: WideCharToMultiByte")); +#elif defined(KS_STR_ENCODING_ICU) + EXPECT_EQ(e.what(), std::string("xxx")); #else #error Unknown KS_STR_ENCODING #endif From 10defe71cd6ecbf3120272e3a7062a0ae0951a3d Mon Sep 17 00:00:00 2001 From: Mikhail Yakshin Date: Thu, 27 Jul 2023 16:59:13 +0100 Subject: [PATCH 2/8] Added ICU to build matrix on linux --- .github/workflows/build.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 159a404..f3ec90c 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -9,11 +9,22 @@ on: jobs: linux: runs-on: ubuntu-latest + strategy: + matrix: + encoding: + - ICONV + - ICU + env: + ENCODING_TYPE: ${{matrix.encoding}} steps: - uses: actions/checkout@v3 - name: restore run: | sudo apt-get install -y libgtest-dev + - name: restore ICU + run: | + sudo apt-get install -y libicu-dev + if: matrix.encoding == 'ICU' - name: build run: .build/build - name: unittest From 827c164a3c2de4dac5ba7bcae3b3a092c542ed07 Mon Sep 17 00:00:00 2001 From: Mikhail Yakshin Date: Sun, 6 Apr 2025 12:54:47 +0100 Subject: [PATCH 3/8] Better implementation of ICU with proper error reporting --- kaitai/exceptions.h | 64 +++++++++------------- kaitai/kaitaistream.cpp | 114 ++++++++++++++++++++++++++++++---------- tests/unittest.cpp | 11 ++-- 3 files changed, 117 insertions(+), 72 deletions(-) diff --git a/kaitai/exceptions.h b/kaitai/exceptions.h index 1c1e414..0bcdadc 100644 --- a/kaitai/exceptions.h +++ b/kaitai/exceptions.h @@ -49,7 +49,8 @@ class unknown_encoding: public bytes_to_str_error { class illegal_seq_in_encoding: public bytes_to_str_error { public: illegal_seq_in_encoding(const std::string what): - bytes_to_str_error("illegal sequence: " + what) {} + bytes_to_str_error("illegal sequence: " + what) + {} virtual ~illegal_seq_in_encoding() KS_NOEXCEPT {}; }; @@ -115,20 +116,17 @@ class validation_failed_error: public kstruct_error { template class validation_not_equal_error: public validation_failed_error { public: - validation_not_equal_error(const T& expected, const T& actual, kstream* io, const std::string src_path): - validation_failed_error("not equal", io, src_path), + validation_not_equal_error(const T& expected, const T& actual, kstream* io, const std::string src_path): + validation_failed_error("not equal", src_path, io), m_expected(expected), m_actual(actual) { } - // "not equal, expected #{expected.inspect}, but got #{actual.inspect}" + virtual ~validation_not_equal_error() KS_NOEXCEPT {}; - virtual ~validation_not_equal_error() KS_NOEXCEPT {}; - -protected: - const T& m_expected; - const T& m_actual; + const T m_expected; + const T m_actual; }; /** @@ -138,20 +136,17 @@ class validation_not_equal_error: public validation_failed_error { template class validation_less_than_error: public validation_failed_error { public: - validation_less_than_error(const T& min, const T& actual, kstream* io, const std::string src_path): - validation_failed_error("not in range", io, src_path), + validation_less_than_error(const T& min, const T& actual, kstream* io, const std::string src_path): + validation_failed_error("less than", src_path, io), m_min(min), m_actual(actual) { } - // "not in range, min #{min.inspect}, but got #{actual.inspect}" - - virtual ~validation_less_than_error() KS_NOEXCEPT {}; + virtual ~validation_less_than_error() KS_NOEXCEPT {}; -protected: - const T& m_min; - const T& m_actual; + const T m_min; + const T m_actual; }; /** @@ -161,20 +156,17 @@ class validation_less_than_error: public validation_failed_error { template class validation_greater_than_error: public validation_failed_error { public: - validation_greater_than_error(const T& max, const T& actual, kstream* io, const std::string src_path): - validation_failed_error("not in range", io, src_path), + validation_greater_than_error(const T& max, const T& actual, kstream* io, const std::string src_path): + validation_failed_error("greater than", src_path, io), m_max(max), m_actual(actual) { } - // "not in range, max #{max.inspect}, but got #{actual.inspect}" - - virtual ~validation_greater_than_error() KS_NOEXCEPT {}; + virtual ~validation_greater_than_error() KS_NOEXCEPT {}; -protected: - const T& m_max; - const T& m_actual; + const T m_max; + const T m_actual; }; /** @@ -184,18 +176,15 @@ class validation_greater_than_error: public validation_failed_error { template class validation_not_any_of_error: public validation_failed_error { public: - validation_not_any_of_error(const T& actual, kstream* io, const std::string src_path): - validation_failed_error("not any of the list", io, src_path), + validation_not_any_of_error(const T& actual, kstream* io, const std::string src_path): + validation_failed_error("not any of", src_path, io), m_actual(actual) { } - // "not any of the list, got #{actual.inspect}" - - virtual ~validation_not_any_of_error() KS_NOEXCEPT {}; + virtual ~validation_not_any_of_error() KS_NOEXCEPT {}; -protected: - const T& m_actual; + const T m_actual; }; /** @@ -205,18 +194,15 @@ class validation_not_any_of_error: public validation_failed_error { template class validation_expr_error: public validation_failed_error { public: - validation_expr_error(const T& actual, kstream* io, const std::string src_path): - validation_failed_error("not matching the expression", io, src_path), + validation_expr_error(const T& actual, kstream* io, const std::string src_path): + validation_failed_error("expr", src_path, io), m_actual(actual) { } - // "not matching the expression, got #{actual.inspect}" + virtual ~validation_expr_error() KS_NOEXCEPT {}; - virtual ~validation_expr_error() KS_NOEXCEPT {}; - -protected: - const T& m_actual; + const T m_actual; }; } diff --git a/kaitai/kaitaistream.cpp b/kaitai/kaitaistream.cpp index 94098c5..d2ee93a 100644 --- a/kaitai/kaitaistream.cpp +++ b/kaitai/kaitaistream.cpp @@ -874,45 +874,105 @@ std::string kaitai::kstream::bytes_to_str(const std::string src, int codepage) { #elif defined(KS_STR_ENCODING_ICU) #include -#include +#include std::string kaitai::kstream::bytes_to_str(const std::string src, const char *src_enc) { - // Start with a buffer length of double the source length. - size_t init_dst_len = src.length() * 2; - std::string dst(init_dst_len, ' '); - UErrorCode err = U_ZERO_ERROR; - int32_t dst_len = ucnv_convert(KS_STR_DEFAULT_ENCODING, src_enc, &dst[0], init_dst_len, src.c_str(), src.length(), &err); + + // Open the source converter + UConverter* conv = ucnv_open(src_enc, &err); + if (U_FAILURE(err)) { + if (err == U_FILE_ACCESS_ERROR) { + throw unknown_encoding(src_enc); + } + throw bytes_to_str_error(u_errorName(err)); + } - if (err == U_BUFFER_OVERFLOW_ERROR) { - // We need a bigger buffer, but at least we know how much space exactly we need now - dst.resize(dst_len, ' '); + // Open UTF-8 converter + UConverter* utf8Conv = ucnv_open("UTF-8", &err); + if (U_FAILURE(err)) { + ucnv_close(conv); + throw bytes_to_str_error(u_errorName(err)); + } - // Try again with the new buffer - err = U_ZERO_ERROR; - dst_len = ucnv_convert(KS_STR_DEFAULT_ENCODING, src_enc, &dst[0], dst_len, src.c_str(), src.length(), &err); - } else if (!U_FAILURE(err)) { - // Conversion succeed from the first try, shrink the buffer to fit - dst.resize(dst_len); + // Configure source converter to stop on illegal sequences + err = U_ZERO_ERROR; + ucnv_setToUCallBack(conv, + UCNV_TO_U_CALLBACK_STOP, + NULL, + NULL, + NULL, + &err); + if (U_FAILURE(err)) { + ucnv_close(conv); + ucnv_close(utf8Conv); + throw illegal_seq_in_encoding(u_errorName(err)); } - std::cout << "err = " << err << std::endl; - // Dump all bytes of result - for (int i = 0; i < dst_len; i++) { - std::cout << std::hex << (int)(uint8_t)dst[i] << " "; + // Allocate buffer for UTF-16 intermediate representation + const int32_t uniStrCapacity = UCNV_GET_MAX_BYTES_FOR_STRING(src.length(), ucnv_getMaxCharSize(conv)); + UChar* uniStr = new UChar[uniStrCapacity]; + + // Convert from source encoding to UTF-16 + err = U_ZERO_ERROR; + int32_t uniLength = ucnv_toUChars(conv, + uniStr, + uniStrCapacity, + src.c_str(), + src.length(), + &err); + if (U_FAILURE(err)) { + delete[] uniStr; + ucnv_close(conv); + ucnv_close(utf8Conv); + throw illegal_seq_in_encoding(u_errorName(err)); } - std::cout << "\n"; + // Configure target converter to stop on illegal sequences + err = U_ZERO_ERROR; + ucnv_setFromUCallBack(utf8Conv, + UCNV_FROM_U_CALLBACK_STOP, + NULL, + NULL, + NULL, + &err); if (U_FAILURE(err)) { - // Conversion failed - if (err == U_FILE_ACCESS_ERROR) { - throw unknown_encoding(src_enc); - } else { - throw bytes_to_str_error(u_errorName(err)); - } + delete[] uniStr; + ucnv_close(conv); + ucnv_close(utf8Conv); + throw illegal_seq_in_encoding(u_errorName(err)); } - return dst; + // Allocate buffer for UTF-8 output + const int32_t dstCapacity = UCNV_GET_MAX_BYTES_FOR_STRING(uniLength, ucnv_getMaxCharSize(utf8Conv)); + char* dst = new char[dstCapacity]; + + // Convert from UTF-16 to UTF-8 + err = U_ZERO_ERROR; + int32_t outputLength = ucnv_fromUChars(utf8Conv, + dst, + dstCapacity, + uniStr, + uniLength, + &err); + if (U_FAILURE(err)) { + delete[] uniStr; + delete[] dst; + ucnv_close(conv); + ucnv_close(utf8Conv); + throw illegal_seq_in_encoding(u_errorName(err)); + } + + // Create result string + std::string result(dst, outputLength); + + // Clean up + delete[] uniStr; + delete[] dst; + ucnv_close(conv); + ucnv_close(utf8Conv); + + return result; } #else #error Need to decide how to handle strings: please define one of: KS_STR_ENCODING_ICONV, KS_STR_ENCODING_WIN32API, KS_STR_ENCODING_ICU, KS_STR_ENCODING_NONE diff --git a/tests/unittest.cpp b/tests/unittest.cpp index 85045dd..9f7af44 100644 --- a/tests/unittest.cpp +++ b/tests/unittest.cpp @@ -275,14 +275,13 @@ TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_euc_jp_too_short) #elif defined(KS_STR_ENCODING_WIN32API) EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: MultiByteToWideChar")); #elif defined(KS_STR_ENCODING_ICU) - EXPECT_EQ(e.what(), std::string("xxx")); + EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: U_TRUNCATED_CHAR_FOUND")); #else #error Unknown KS_STR_ENCODING #endif } } - TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_gb2312_too_short) { try { @@ -294,7 +293,7 @@ TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_gb2312_too_short) #elif defined(KS_STR_ENCODING_WIN32API) EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: MultiByteToWideChar")); #elif defined(KS_STR_ENCODING_ICU) - EXPECT_EQ(e.what(), std::string("xxx")); + EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: U_TRUNCATED_CHAR_FOUND")); #else #error Unknown KS_STR_ENCODING #endif @@ -312,7 +311,7 @@ TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_gb2312_two_bytes) #elif defined(KS_STR_ENCODING_WIN32API) EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: MultiByteToWideChar")); #elif defined(KS_STR_ENCODING_ICU) - EXPECT_EQ(e.what(), std::string("xxx")); + EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: U_ILLEGAL_CHAR_FOUND")); #else #error Unknown KS_STR_ENCODING #endif @@ -331,7 +330,7 @@ TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_utf_16le_odd_bytes) #elif defined(KS_STR_ENCODING_WIN32API) EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: incomplete")); #elif defined(KS_STR_ENCODING_ICU) - EXPECT_EQ(e.what(), std::string("xxx")); + EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: U_TRUNCATED_CHAR_FOUND")); #else #error Unknown KS_STR_ENCODING #endif @@ -351,7 +350,7 @@ TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_utf_16le_incomplete_high_surroga #elif defined(KS_STR_ENCODING_WIN32API) EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: WideCharToMultiByte")); #elif defined(KS_STR_ENCODING_ICU) - EXPECT_EQ(e.what(), std::string("xxx")); + EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: U_TRUNCATED_CHAR_FOUND")); #else #error Unknown KS_STR_ENCODING #endif From 8b638bb35269848b263fcb6d8c1c2103fc8ac3a2 Mon Sep 17 00:00:00 2001 From: Mikhail Yakshin Date: Sun, 6 Apr 2025 14:56:29 +0100 Subject: [PATCH 4/8] Exclude ICU+98 --- .github/workflows/build.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 7fbd6c4..58c9886 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -19,6 +19,10 @@ jobs: encoding: - ICONV - ICU + exclude: + # Exclude ICU on C++98, as modern ICU library requires at least C++11 + - cpp-standard: '98' + encoding: ICU steps: - uses: actions/checkout@v4 - name: Install GoogleTest @@ -30,8 +34,7 @@ jobs: - name: Install packages run: sudo apt-get install -y iwyu valgrind - name: Install ICU - run: | - sudo apt-get install -y libicu-dev + run: sudo apt-get install -y libicu-dev if: matrix.encoding == 'ICU' - name: Build env: From 4e4cbfdc6b47a6e5350b070d510724fa31000fe6 Mon Sep 17 00:00:00 2001 From: Mikhail Yakshin Date: Sun, 6 Apr 2025 15:06:25 +0100 Subject: [PATCH 5/8] Minor formatting fixes --- kaitai/kaitaistream.cpp | 54 ++++++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/kaitai/kaitaistream.cpp b/kaitai/kaitaistream.cpp index fbac05f..6efe77e 100644 --- a/kaitai/kaitaistream.cpp +++ b/kaitai/kaitaistream.cpp @@ -1203,7 +1203,7 @@ std::string kaitai::kstream::bytes_to_str(const std::string src, int codepage) { std::string kaitai::kstream::bytes_to_str(const std::string src, const char *src_enc) { UErrorCode err = U_ZERO_ERROR; - + // Open the source converter UConverter* conv = ucnv_open(src_enc, &err); if (U_FAILURE(err)) { @@ -1222,12 +1222,13 @@ std::string kaitai::kstream::bytes_to_str(const std::string src, const char *src // Configure source converter to stop on illegal sequences err = U_ZERO_ERROR; - ucnv_setToUCallBack(conv, - UCNV_TO_U_CALLBACK_STOP, - NULL, - NULL, - NULL, - &err); + ucnv_setToUCallBack( + conv, + UCNV_TO_U_CALLBACK_STOP, + nullptr, + nullptr, + nullptr, + &err); if (U_FAILURE(err)) { ucnv_close(conv); ucnv_close(utf8Conv); @@ -1240,12 +1241,13 @@ std::string kaitai::kstream::bytes_to_str(const std::string src, const char *src // Convert from source encoding to UTF-16 err = U_ZERO_ERROR; - int32_t uniLength = ucnv_toUChars(conv, - uniStr, - uniStrCapacity, - src.c_str(), - src.length(), - &err); + int32_t uniLength = ucnv_toUChars( + conv, + uniStr, + uniStrCapacity, + src.c_str(), + src.length(), + &err); if (U_FAILURE(err)) { delete[] uniStr; ucnv_close(conv); @@ -1255,12 +1257,13 @@ std::string kaitai::kstream::bytes_to_str(const std::string src, const char *src // Configure target converter to stop on illegal sequences err = U_ZERO_ERROR; - ucnv_setFromUCallBack(utf8Conv, - UCNV_FROM_U_CALLBACK_STOP, - NULL, - NULL, - NULL, - &err); + ucnv_setFromUCallBack( + utf8Conv, + UCNV_FROM_U_CALLBACK_STOP, + nullptr, + nullptr, + nullptr, + &err); if (U_FAILURE(err)) { delete[] uniStr; ucnv_close(conv); @@ -1274,12 +1277,13 @@ std::string kaitai::kstream::bytes_to_str(const std::string src, const char *src // Convert from UTF-16 to UTF-8 err = U_ZERO_ERROR; - int32_t outputLength = ucnv_fromUChars(utf8Conv, - dst, - dstCapacity, - uniStr, - uniLength, - &err); + int32_t outputLength = ucnv_fromUChars( + utf8Conv, + dst, + dstCapacity, + uniStr, + uniLength, + &err); if (U_FAILURE(err)) { delete[] uniStr; delete[] dst; From 8abbee40ce48ce33040979606688218233acdbda Mon Sep 17 00:00:00 2001 From: Petr Pucil Date: Wed, 16 Apr 2025 22:43:06 +0200 Subject: [PATCH 6/8] CMakeLists.txt: drop unused ICU component `io` See https://github.com/kaitai-io/kaitai_struct_cpp_stl_runtime/pull/64#discussion_r2047746464 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ea4896e..6be6370 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,7 +10,7 @@ set (CMAKE_INCLUDE_CURRENT_DIR ON) find_package(ZLIB) find_package(Iconv) -find_package(ICU COMPONENTS uc io) +find_package(ICU COMPONENTS uc) set(ICU_FOUND FALSE) if(ICU_INCLUDE_DIRS AND ICU_LIBRARIES) From 9b06bf1e91bd1c288a4a128c96840ab542c1f2f2 Mon Sep 17 00:00:00 2001 From: Petr Pucil Date: Wed, 16 Apr 2025 22:45:55 +0200 Subject: [PATCH 7/8] CMakeLists.txt: link ICU via imported targets (not variables) See https://github.com/kaitai-io/kaitai_struct_cpp_stl_runtime/pull/64#discussion_r2047746464 --- CMakeLists.txt | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6be6370..fae5b89 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,11 +12,6 @@ find_package(Iconv) find_package(ICU COMPONENTS uc) -set(ICU_FOUND FALSE) -if(ICU_INCLUDE_DIRS AND ICU_LIBRARIES) - SET(ICU_FOUND TRUE) -endif() - set (HEADERS kaitai/kaitaistream.h kaitai/kaitaistruct.h @@ -45,8 +40,7 @@ if(Iconv_FOUND) endif() if(ICU_FOUND) - target_include_directories(${PROJECT_NAME} PRIVATE ${ICU_INCLUDE_DIRS}) - target_link_libraries(${PROJECT_NAME} PRIVATE ${ICU_LIBRARIES}) + target_link_libraries(${PROJECT_NAME} PRIVATE ICU::uc) endif() include(Common.cmake) From 340d545bc75e1666a163d7a050ad94b594525eef Mon Sep 17 00:00:00 2001 From: Petr Pucil Date: Sun, 27 Apr 2025 00:11:29 +0200 Subject: [PATCH 8/8] Fix ICU memory leaks detected by Valgrind See the code comment. --- tests/CMakeLists.txt | 4 ++++ tests/unittest.cpp | 17 ++++++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 39e3c17..f91e7fd 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -23,4 +23,8 @@ target_compile_options(unittest PRIVATE # Link the test executable with the main library and the test framework/library target_link_libraries(unittest PRIVATE kaitai_struct_cpp_stl_runtime GTest::GTest GTest::Main) +if(ICU_FOUND) + target_link_libraries(unittest PRIVATE ICU::uc) +endif() + add_test(NAME unittest COMMAND unittest) diff --git a/tests/unittest.cpp b/tests/unittest.cpp index 14d51a6..2ed1454 100644 --- a/tests/unittest.cpp +++ b/tests/unittest.cpp @@ -633,8 +633,23 @@ TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_utf16le_incomplete_high_surrogat } #endif +#if defined(KS_STR_ENCODING_ICU) +#include +#endif + int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); + const int ret = RUN_ALL_TESTS(); +#if defined(KS_STR_ENCODING_ICU) + // See : + // + // > When an application is terminating it should call the function `u_cleanup()`, + // > which frees all heap storage and other system resources that are held internally + // > by the ICU library. While the use of `u_cleanup()` is not strictly required, + // > failure to call it will cause memory leak checking tools to report problems for + // > resources being held by ICU library. + u_cleanup(); +#endif + return ret; }