Skip to content

Commit ecd7d1b

Browse files
committed
Optimize table layout
1 parent adb0f44 commit ecd7d1b

File tree

4 files changed

+809
-971
lines changed

4 files changed

+809
-971
lines changed

data/data_generator.jl

+14-2
Original file line numberDiff line numberDiff line change
@@ -469,12 +469,24 @@ function print_c_data_tables(io, sequences, prop_page_indices, prop_pages, dedup
469469
end
470470
print(io, "};\n\n")
471471

472-
print(io, "static const utf8proc_uint32_t utf8proc_combinations[][2] = {\n")
472+
print(io, "static const utf8proc_uint32_t utf8proc_combinations_second[] = {\n")
473473
for dm0 in sort!(collect(keys(comb_mapping)))
474+
print(io, " ");
475+
for dm1 in sort!(collect(keys(comb_mapping[dm0])))
476+
print(io, " ", dm1, ",")
477+
end
478+
print(io, "\n");
479+
end
480+
print(io, "};\n\n")
481+
482+
print(io, "static const utf8proc_uint32_t utf8proc_combinations_combined[] = {\n")
483+
for dm0 in sort!(collect(keys(comb_mapping)))
484+
print(io, " ");
474485
for dm1 in sort!(collect(keys(comb_mapping[dm0])))
475486
code = comb_mapping[dm0][dm1]
476-
print(io, " {", dm1, ", ", code, "},\n")
487+
print(io, " ", code, ",")
477488
end
489+
print(io, "\n");
478490
end
479491
print(io, "};\n\n")
480492
end

utf8proc.c

+3-3
Original file line numberDiff line numberDiff line change
@@ -687,18 +687,18 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *b
687687
int idx = starter_property->comb_index;
688688
if (idx < 0x3FF && current_property->comb_issecond) {
689689
int len = starter_property->comb_length;
690-
utf8proc_uint32_t max_second = utf8proc_combinations[idx + len - 1][0];
690+
utf8proc_uint32_t max_second = utf8proc_combinations_second[idx + len - 1];
691691
if (current_char <= max_second) {
692692
// TODO: binary search? arithmetic search?
693693
for (int off = 0; off < len; ++off) {
694-
utf8proc_uint32_t second = utf8proc_combinations[idx + off][0];
694+
utf8proc_uint32_t second = utf8proc_combinations_second[idx + off];
695695
if (current_char < second) {
696696
/* not found */
697697
break;
698698
}
699699
if (current_char == second) {
700700
/* found */
701-
utf8proc_uint32_t composition = utf8proc_combinations[idx + off][1];
701+
utf8proc_uint32_t composition = utf8proc_combinations_combined[idx + off];
702702
*starter = composition;
703703
starter_property = NULL;
704704
break;

utf8proc.h

+6-4
Original file line numberDiff line numberDiff line change
@@ -266,10 +266,12 @@ typedef struct utf8proc_property_struct {
266266
* combining pair, and for most, there are only a handful of
267267
* possible second characters.
268268
*
269-
* The combining table is stored as `utf8proc_uint32_t
270-
* utf8proc_combinations[][2]`. That is, it contains a pair `(second
271-
* combining character, combined character)` for every character
272-
* that can be a first combining character.
269+
* The combining table is stored as a sparse matrix in the CSR
270+
* (compressed sparse row) format. That is, it is stored as two
271+
* arrays, `utf8proc_uint32_t utf8proc_combinations_second[]` and
272+
* `utf8proc_uint32_t utf8proc_combinations_combined[]`. These
273+
* contain, respectively, the second combining characters and the
274+
* combined characters of every combining pair.
273275
*
274276
* - `comb_index`: Index into the combining table if this character
275277
* is the first character in a combining pair, else 0x3ff

0 commit comments

Comments
 (0)