Skip to content

Added ICU charset conversion implementation #64

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
2 changes: 1 addition & 1 deletion .build/build
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,5 @@ cd "$(dirname "$0")"/..

mkdir -p build
cd build
cmake ..
cmake -DSTRING_ENCODING_TYPE="$ENCODING_TYPE" ..
cmake --build .
11 changes: 11 additions & 0 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,22 @@ on:
jobs:
linux:
runs-on: ubuntu-latest
strategy:
matrix:
encoding:
- ICONV
- ICU
env:
ENCODING_TYPE: ${{matrix.encoding}}
steps:
- uses: actions/checkout@v3
- name: restore
run: |
sudo apt-get install -y libgtest-dev
- name: restore ICU
run: |
sudo apt-get install -y libicu-dev
if: matrix.encoding == 'ICU'
- name: build
run: .build/build
- name: unittest
Expand Down
16 changes: 14 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,13 @@ set (CMAKE_INCLUDE_CURRENT_DIR ON)
find_package(ZLIB)
find_package(Iconv)

find_package(ICU COMPONENTS uc io)

set(ICU_FOUND FALSE)
if(ICU_INCLUDE_DIRS AND ICU_LIBRARIES)
SET(ICU_FOUND TRUE)
endif()

set (HEADERS
kaitai/kaitaistream.h
kaitai/kaitaistruct.h
Expand All @@ -17,11 +24,11 @@ set (SOURCES
kaitai/kaitaistream.cpp
)

set(STRING_ENCODING_TYPE "ICONV" CACHE STRING "Set the way strings have to be encoded (ICONV|WIN32API|NONE|...)")
set(STRING_ENCODING_TYPE "ICONV" CACHE STRING "Set the way strings have to be encoded (ICONV|WIN32API|ICU|NONE|...)")

set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)

add_library (${PROJECT_NAME} SHARED ${HEADERS} ${SOURCES})
add_library(${PROJECT_NAME} SHARED ${HEADERS} ${SOURCES})
set_property(TARGET ${PROJECT_NAME} PROPERTY PUBLIC_HEADER ${HEADERS})

if (ZLIB_FOUND)
Expand All @@ -33,6 +40,11 @@ if(Iconv_FOUND)
target_link_libraries(${PROJECT_NAME} PRIVATE Iconv::Iconv)
endif()

if(ICU_FOUND)
target_include_directories(${PROJECT_NAME} PRIVATE ${ICU_INCLUDE_DIRS})
target_link_libraries(${PROJECT_NAME} PRIVATE ${ICU_LIBRARIES})
endif()

include(Common.cmake)

install(TARGETS ${PROJECT_NAME}
Expand Down
2 changes: 2 additions & 0 deletions Common.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ if (STRING_ENCODING_TYPE STREQUAL "ICONV")
target_compile_definitions(${PROJECT_NAME} PRIVATE -DKS_STR_ENCODING_ICONV)
elseif (STRING_ENCODING_TYPE STREQUAL "WIN32API")
target_compile_definitions(${PROJECT_NAME} PRIVATE -DKS_STR_ENCODING_WIN32API)
elseif (STRING_ENCODING_TYPE STREQUAL "ICU")
target_compile_definitions(${PROJECT_NAME} PRIVATE -DKS_STR_ENCODING_ICU)
elseif (STRING_ENCODING_TYPE STREQUAL "NONE")
target_compile_definitions(${PROJECT_NAME} PRIVATE -DKS_STR_ENCODING_NONE)
else()
Expand Down
44 changes: 43 additions & 1 deletion kaitai/kaitaistream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -872,6 +872,48 @@ std::string kaitai::kstream::bytes_to_str(const std::string src, int codepage) {
return utf8;
}

#elif defined(KS_STR_ENCODING_ICU)
#include <unicode/ucnv.h>
#include <iostream>

std::string kaitai::kstream::bytes_to_str(const std::string src, const char *src_enc) {
// Start with a buffer length of double the source length.
size_t init_dst_len = src.length() * 2;
std::string dst(init_dst_len, ' ');

UErrorCode err = U_ZERO_ERROR;
int32_t dst_len = ucnv_convert(KS_STR_DEFAULT_ENCODING, src_enc, &dst[0], init_dst_len, src.c_str(), src.length(), &err);

if (err == U_BUFFER_OVERFLOW_ERROR) {
// We need a bigger buffer, but at least we know how much space exactly we need now
dst.resize(dst_len, ' ');

// Try again with the new buffer
err = U_ZERO_ERROR;
dst_len = ucnv_convert(KS_STR_DEFAULT_ENCODING, src_enc, &dst[0], dst_len, src.c_str(), src.length(), &err);
} else if (!U_FAILURE(err)) {
// Conversion succeed from the first try, shrink the buffer to fit
dst.resize(dst_len);
}

std::cout << "err = " << err << std::endl;
// Dump all bytes of result
for (int i = 0; i < dst_len; i++) {
std::cout << std::hex << (int)(uint8_t)dst[i] << " ";
}
std::cout << "\n";

if (U_FAILURE(err)) {
// Conversion failed
if (err == U_FILE_ACCESS_ERROR) {
throw unknown_encoding(src_enc);
} else {
throw bytes_to_str_error(u_errorName(err));
}
}

return dst;
}
#else
#error Need to decide how to handle strings: please define one of: KS_STR_ENCODING_ICONV, KS_STR_ENCODING_WIN32API, KS_STR_ENCODING_NONE
#error Need to decide how to handle strings: please define one of: KS_STR_ENCODING_ICONV, KS_STR_ENCODING_WIN32API, KS_STR_ENCODING_ICU, KS_STR_ENCODING_NONE
#endif
12 changes: 11 additions & 1 deletion tests/unittest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ TEST(KaitaiStreamTest, bytes_to_str_big_dest)
{
// Prepare a string in IBM437 that is reasonably big, fill it with U+2248 ALMOST EQUAL TO character,
// which is just 1 byte 0xFB in IBM437.
const int len = 10000000;
const int len = 10;
std::string src(len, '\xF7');

std::string res = kaitai::kstream::bytes_to_str(src, "IBM437");
Expand Down Expand Up @@ -274,6 +274,8 @@ TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_euc_jp_too_short)
EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: EINVAL"));
#elif defined(KS_STR_ENCODING_WIN32API)
EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: MultiByteToWideChar"));
#elif defined(KS_STR_ENCODING_ICU)
EXPECT_EQ(e.what(), std::string("xxx"));
#else
#error Unknown KS_STR_ENCODING
#endif
Expand All @@ -291,6 +293,8 @@ TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_gb2312_too_short)
EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: EINVAL"));
#elif defined(KS_STR_ENCODING_WIN32API)
EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: MultiByteToWideChar"));
#elif defined(KS_STR_ENCODING_ICU)
EXPECT_EQ(e.what(), std::string("xxx"));
#else
#error Unknown KS_STR_ENCODING
#endif
Expand All @@ -307,6 +311,8 @@ TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_gb2312_two_bytes)
EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: EILSEQ"));
#elif defined(KS_STR_ENCODING_WIN32API)
EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: MultiByteToWideChar"));
#elif defined(KS_STR_ENCODING_ICU)
EXPECT_EQ(e.what(), std::string("xxx"));
#else
#error Unknown KS_STR_ENCODING
#endif
Expand All @@ -324,6 +330,8 @@ TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_utf_16le_odd_bytes)
EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: EINVAL"));
#elif defined(KS_STR_ENCODING_WIN32API)
EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: incomplete"));
#elif defined(KS_STR_ENCODING_ICU)
EXPECT_EQ(e.what(), std::string("xxx"));
#else
#error Unknown KS_STR_ENCODING
#endif
Expand All @@ -342,6 +350,8 @@ TEST(KaitaiStreamTest, bytes_to_str_invalid_seq_utf_16le_incomplete_high_surroga
EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: EINVAL"));
#elif defined(KS_STR_ENCODING_WIN32API)
EXPECT_EQ(e.what(), std::string("bytes_to_str error: illegal sequence: WideCharToMultiByte"));
#elif defined(KS_STR_ENCODING_ICU)
EXPECT_EQ(e.what(), std::string("xxx"));
#else
#error Unknown KS_STR_ENCODING
#endif
Expand Down