Skip to content

Commit c8d815a

Browse files
authored
Merge pull request #277 from eschnett/eschnett/unicode16
Redesign combining table
2 parents 53177fb + ecd7d1b commit c8d815a

File tree

10 files changed

+14062
-14006
lines changed

10 files changed

+14062
-14006
lines changed

.github/workflows/ci-fuzz.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -20,4 +20,4 @@ jobs:
2020
if: failure()
2121
with:
2222
name: artifacts
23-
path: ./out/artifacts
23+
path: ./out/artifacts

CMakeLists.txt

+3-3
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,13 @@ include (utils.cmake)
44

55
disallow_intree_builds()
66

7-
project (utf8proc VERSION 2.9.0 LANGUAGES C)
7+
project (utf8proc VERSION 2.10.0 LANGUAGES C)
88

99
# This is the ABI version number, which may differ from the
1010
# API version number (defined in utf8proc.h and above).
1111
# Be sure to also update these in Makefile and MANIFEST!
1212
set(SO_MAJOR 3)
13-
set(SO_MINOR 0)
13+
set(SO_MINOR 1)
1414
set(SO_PATCH 0)
1515

1616
option(UTF8PROC_INSTALL "Enable installation of utf8proc" On)
@@ -65,7 +65,7 @@ endif()
6565
if(UTF8PROC_ENABLE_TESTING)
6666
enable_testing()
6767
file(MAKE_DIRECTORY data)
68-
set(UNICODE_VERSION 15.1.0)
68+
set(UNICODE_VERSION 16.0.0)
6969
file(DOWNLOAD https://www.unicode.org/Public/${UNICODE_VERSION}/ucd/NormalizationTest.txt ${CMAKE_BINARY_DIR}/data/NormalizationTest.txt SHOW_PROGRESS)
7070
file(DOWNLOAD https://www.unicode.org/Public/${UNICODE_VERSION}/ucd/auxiliary/GraphemeBreakTest.txt ${CMAKE_BINARY_DIR}/data/GraphemeBreakTest.txt SHOW_PROGRESS)
7171
add_executable(case test/tests.h test/tests.c utf8proc.h test/case.c)

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ The C library is found in this directory after successful compilation
5959
and is named `libutf8proc.a` (for the static library) and
6060
`libutf8proc.so` (for the dynamic library).
6161

62-
The Unicode version supported is 15.1.0.
62+
The Unicode version supported is 16.0.0.
6363

6464
For Unicode normalizations, the following options are used:
6565

data/Makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ utf8proc_data.c.new: data_generator.jl $(RAWDATA)
2121
$(JULIA) --project=. data_generator.jl > $@
2222

2323
# Unicode data version (must also update utf8proc_unicode_version function)
24-
UNICODE_VERSION=15.1.0
24+
UNICODE_VERSION=16.0.0
2525

2626
UnicodeData.txt:
2727
$(CURL) $(CURLFLAGS) -o $@ https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/UnicodeData.txt

data/Manifest.toml

+7-60
Original file line numberDiff line numberDiff line change
@@ -1,69 +1,16 @@
11
# This file is machine-generated - editing it directly is not advised
22

3-
julia_version = "1.9.3"
3+
julia_version = "1.11.2"
44
manifest_format = "2.0"
55
project_hash = "bc0740aa2247b17bd49ba693fb87f41bbbddead6"
66

7-
[[deps.Adapt]]
8-
deps = ["LinearAlgebra", "Requires"]
9-
git-tree-sha1 = "cde29ddf7e5726c9fb511f340244ea3481267608"
10-
uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
11-
version = "3.7.2"
12-
13-
[deps.Adapt.extensions]
14-
AdaptStaticArraysExt = "StaticArrays"
15-
16-
[deps.Adapt.weakdeps]
17-
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
18-
19-
[[deps.Artifacts]]
20-
uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
21-
22-
[[deps.CompilerSupportLibraries_jll]]
23-
deps = ["Artifacts", "Libdl"]
24-
uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
25-
version = "1.0.5+0"
26-
27-
[[deps.Libdl]]
28-
uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
29-
30-
[[deps.LinearAlgebra]]
31-
deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"]
32-
uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
33-
347
[[deps.OffsetArrays]]
35-
deps = ["Adapt"]
36-
git-tree-sha1 = "2ac17d29c523ce1cd38e27785a7d23024853a4bb"
8+
git-tree-sha1 = "5e1897147d1ff8d98883cda2be2187dcf57d8f0c"
379
uuid = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
38-
version = "1.12.10"
39-
40-
[[deps.OpenBLAS_jll]]
41-
deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"]
42-
uuid = "4536629a-c528-5b80-bd46-f80d51c5b363"
43-
version = "0.3.21+4"
44-
45-
[[deps.Random]]
46-
deps = ["SHA", "Serialization"]
47-
uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
48-
49-
[[deps.Requires]]
50-
deps = ["UUIDs"]
51-
git-tree-sha1 = "838a3a4188e2ded87a4f9f184b4b0d78a1e91cb7"
52-
uuid = "ae029012-a4dd-5104-9daa-d747884805df"
53-
version = "1.3.0"
54-
55-
[[deps.SHA]]
56-
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
57-
version = "0.7.0"
58-
59-
[[deps.Serialization]]
60-
uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
10+
version = "1.15.0"
6111

62-
[[deps.UUIDs]]
63-
deps = ["Random", "SHA"]
64-
uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
12+
[deps.OffsetArrays.extensions]
13+
OffsetArraysAdaptExt = "Adapt"
6514

66-
[[deps.libblastrampoline_jll]]
67-
deps = ["Artifacts", "Libdl"]
68-
uuid = "8e850b90-86db-534c-a0d3-1478176c7d93"
69-
version = "5.8.0+0"
15+
[deps.OffsetArrays.weakdeps]
16+
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"

data/data_generator.jl

+43-98
Original file line numberDiff line numberDiff line change
@@ -236,8 +236,8 @@ let ea_widths = read_east_asian_widths("EastAsianWidth.txt")
236236
width = 1
237237
elseif code == 0x2028 || code == 0x2029
238238
#By definition, should have zero width (on the same line)
239-
#0x002028 '' category: Zl name: LINE SEPARATOR/
240-
#0x002029 '' category: Zp name: PARAGRAPH SEPARATOR/
239+
#0x002028 '\u2028' category: Zl name: LINE SEPARATOR/
240+
#0x002029 '\u2029' category: Zp name: PARAGRAPH SEPARATOR/
241241
width = 0
242242
end
243243

@@ -256,79 +256,33 @@ end
256256
# decompressed on the C side at runtime.
257257

258258
# Inverse decomposition mapping tables for combining two characters into a single one.
259-
comb1st_indices = Dict{UInt32,Int}()
260-
comb1st_indices_sorted_keys = Origin(0)(UInt32[])
261-
comb2nd_indices = Dict{UInt32,Int}()
262-
comb2nd_indices_sorted_keys = Origin(0)(UInt32[])
263-
comb2nd_indices_nonbasic = Set{UInt32}()
264-
comb_array = Origin(0)(Vector{Dict{Int,UInt32}}())
259+
comb_mapping = Dict{UInt32, Dict{UInt32, UInt32}}()
260+
comb_issecond = Set{UInt32}()
265261
for char in char_props
262+
# What happens with decompositions that are longer than 2?
266263
if isnothing(char.decomp_type) && !isnothing(char.decomp_mapping) &&
267264
length(char.decomp_mapping) == 2 && !isnothing(char_hash[char.decomp_mapping[1]]) &&
268265
char_hash[char.decomp_mapping[1]].combining_class == 0 &&
269-
char.code ∉ exclusions
266+
(char.code ∉ exclusions && char.code ∉ excl_version)
270267
dm0 = char.decomp_mapping[1]
271268
dm1 = char.decomp_mapping[2]
272-
if !haskey(comb1st_indices, dm0)
273-
comb1st_indices[dm0] = length(comb1st_indices)
274-
push!(comb1st_indices_sorted_keys, dm0)
275-
push!(comb_array, Dict{Int,UInt32}())
276-
@assert length(comb1st_indices) == length(comb_array)
277-
end
278-
if !haskey(comb2nd_indices, dm1)
279-
push!(comb2nd_indices_sorted_keys, dm1)
280-
comb2nd_indices[dm1] = length(comb2nd_indices)
281-
end
282-
@assert !haskey(comb_array[comb1st_indices[dm0]], comb2nd_indices[dm1])
283-
comb_array[comb1st_indices[dm0]][comb2nd_indices[dm1]] = char.code
284-
if char.code > 0xFFFF
285-
push!(comb2nd_indices_nonbasic, dm1)
269+
if !haskey(comb_mapping, dm0)
270+
comb_mapping[dm0] = Dict{UInt32, UInt32}()
286271
end
272+
comb_mapping[dm0][dm1] = char.code
273+
push!(comb_issecond, dm1)
287274
end
288275
end
289276

290-
comb_indices = Dict{UInt32,Int}()
291-
comb1st_indices_lastoffsets = Origin(0)(zeros(Int, length(comb1st_indices)))
292-
comb1st_indices_firstoffsets = Origin(0)(zeros(Int, length(comb1st_indices)))
277+
comb_index = Dict{UInt32, UInt32}()
278+
comb_length = Dict{UInt32, UInt32}()
293279
let
294-
cumoffset = 0
295-
for dm0 in comb1st_indices_sorted_keys
296-
index = comb1st_indices[dm0]
297-
first = nothing
298-
last = nothing
299-
offset = 0
300-
for b in eachindex(comb2nd_indices_sorted_keys)
301-
dm1 = comb2nd_indices_sorted_keys[b]
302-
if haskey(comb_array[index], b)
303-
if isnothing(first)
304-
first = offset
305-
end
306-
last = offset
307-
if dm1 in comb2nd_indices_nonbasic
308-
last += 1
309-
end
310-
end
311-
offset += 1
312-
if dm1 in comb2nd_indices_nonbasic
313-
offset += 1
314-
end
315-
end
316-
comb1st_indices_firstoffsets[index] = first
317-
comb1st_indices_lastoffsets[index] = last
318-
@assert !haskey(comb_indices, dm0)
319-
comb_indices[dm0] = cumoffset
320-
cumoffset += last - first + 1 + 2
321-
end
322-
323-
offset = 0
324-
for dm1 in comb2nd_indices_sorted_keys
325-
@assert !haskey(comb_indices, dm1)
326-
comb_indices[dm1] = 0x8000 | (comb2nd_indices[dm1] + offset)
327-
@assert comb2nd_indices[dm1] + offset <= 0x4000
328-
if dm1 in comb2nd_indices_nonbasic
329-
comb_indices[dm1] |= 0x4000
330-
offset += 1
331-
end
280+
ind = 0
281+
for dm0 in sort!(collect(keys(comb_mapping)))
282+
comb_index[dm0] = ind
283+
len = length(comb_mapping[dm0])
284+
comb_length[dm0] = len
285+
ind += len
332286
end
333287
end
334288

@@ -391,7 +345,9 @@ function char_table_properties!(sequences, char)
391345
uppercase_seqindex = encode_sequence!(sequences, char.uppercase_mapping),
392346
lowercase_seqindex = encode_sequence!(sequences, char.lowercase_mapping),
393347
titlecase_seqindex = encode_sequence!(sequences, char.titlecase_mapping),
394-
comb_index = get(comb_indices, code, typemax(UInt16)),
348+
comb_index = get(comb_index, code, 0x3FF), # see utf8proc_property_struct::comb_index
349+
comb_length = get(comb_length, code, 0),
350+
comb_issecond = code in comb_issecond,
395351
bidi_mirrored = char.bidi_mirrored,
396352
comp_exclusion = code in exclusions || code in excl_version,
397353
ignorable = code in ignorable,
@@ -473,8 +429,7 @@ function c_uint16(seqindex)
473429
end
474430

475431
function print_c_data_tables(io, sequences, prop_page_indices, prop_pages, deduplicated_props,
476-
comb1st_indices_firstoffsets, comb1st_indices_lastoffsets,
477-
comb2nd_indices_sorted_keys, comb_array, comb2nd_indices_nonbasic)
432+
comb_index, comb_length, comb_issecond)
478433
print(io, "static const utf8proc_uint16_t utf8proc_sequences[] = ")
479434
write_c_index_array(io, sequences.storage, 8)
480435
print(io, "static const utf8proc_uint16_t utf8proc_stage1table[] = ")
@@ -484,7 +439,7 @@ function print_c_data_tables(io, sequences, prop_page_indices, prop_pages, dedup
484439

485440
print(io, """
486441
static const utf8proc_property_t utf8proc_properties[] = {
487-
{0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false,false,false,false, 1, 0, 0, UTF8PROC_BOUNDCLASS_OTHER, UTF8PROC_INDIC_CONJUNCT_BREAK_NONE},
442+
{0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, 0x3FF,0,false, false,false,false,false, 1, 0, 0, UTF8PROC_BOUNDCLASS_OTHER, UTF8PROC_INDIC_CONJUNCT_BREAK_NONE},
488443
""")
489444
for prop in deduplicated_props
490445
print(io, " {",
@@ -498,6 +453,8 @@ function print_c_data_tables(io, sequences, prop_page_indices, prop_pages, dedup
498453
c_uint16(prop.lowercase_seqindex), ", ",
499454
c_uint16(prop.titlecase_seqindex), ", ",
500455
c_uint16(prop.comb_index), ", ",
456+
c_uint16(prop.comb_length), ", ",
457+
prop.comb_issecond, ", ",
501458
prop.bidi_mirrored, ", ",
502459
prop.comp_exclusion, ", ",
503460
prop.ignorable, ", ",
@@ -512,42 +469,30 @@ function print_c_data_tables(io, sequences, prop_page_indices, prop_pages, dedup
512469
end
513470
print(io, "};\n\n")
514471

515-
print(io, "static const utf8proc_uint16_t utf8proc_combinations[] = {\n ")
516-
i = 0
517-
for a in eachindex(comb1st_indices_firstoffsets)
518-
offset = 0
519-
print(io, comb1st_indices_firstoffsets[a], ", ", comb1st_indices_lastoffsets[a], ", ")
520-
for b in eachindex(comb2nd_indices_sorted_keys)
521-
dm1 = comb2nd_indices_sorted_keys[b]
522-
if offset > comb1st_indices_lastoffsets[a]
523-
break
524-
end
525-
if offset >= comb1st_indices_firstoffsets[a]
526-
i += 1
527-
if i == 8
528-
i = 0
529-
print(io, "\n ")
530-
end
531-
v = get(comb_array[a], b, 0)
532-
if dm1 in comb2nd_indices_nonbasic
533-
print(io, (v & 0xFFFF0000) >> 16, ", ")
534-
end
535-
print(io, v & 0xFFFF, ", ")
536-
end
537-
offset += 1
538-
if dm1 in comb2nd_indices_nonbasic
539-
offset += 1
540-
end
472+
print(io, "static const utf8proc_uint32_t utf8proc_combinations_second[] = {\n")
473+
for dm0 in sort!(collect(keys(comb_mapping)))
474+
print(io, " ");
475+
for dm1 in sort!(collect(keys(comb_mapping[dm0])))
476+
print(io, " ", dm1, ",")
477+
end
478+
print(io, "\n");
479+
end
480+
print(io, "};\n\n")
481+
482+
print(io, "static const utf8proc_uint32_t utf8proc_combinations_combined[] = {\n")
483+
for dm0 in sort!(collect(keys(comb_mapping)))
484+
print(io, " ");
485+
for dm1 in sort!(collect(keys(comb_mapping[dm0])))
486+
code = comb_mapping[dm0][dm1]
487+
print(io, " ", code, ",")
541488
end
542-
print(io, "\n")
489+
print(io, "\n");
543490
end
544491
print(io, "};\n\n")
545492
end
546493

547494

548495
if !isinteractive()
549496
print_c_data_tables(stdout, sequences, prop_page_indices, prop_pages, deduplicated_props,
550-
comb1st_indices_firstoffsets, comb1st_indices_lastoffsets,
551-
comb2nd_indices_sorted_keys, comb_array, comb2nd_indices_nonbasic)
497+
comb_index, comb_length, comb_issecond)
552498
end
553-

test/printproperty.c

+4
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ int main(int argc, char **argv)
3434
" titlecase_mapping = %04x (seqindex %04x)\n"
3535
" casefold = %s\n"
3636
" comb_index = %d\n"
37+
" comb_length = %d\n"
38+
" comb_issecond = %d\n"
3739
" bidi_mirrored = %d\n"
3840
" comp_exclusion = %d\n"
3941
" ignorable = %d\n"
@@ -51,6 +53,8 @@ int main(int argc, char **argv)
5153
utf8proc_totitle(c), p->titlecase_seqindex,
5254
(char *) map,
5355
p->comb_index,
56+
p->comb_length,
57+
p->comb_issecond,
5458
p->bidi_mirrored,
5559
p->comp_exclusion,
5660
p->ignorable,

0 commit comments

Comments
 (0)