Merge pull request #277 from eschnett/eschnett/unicode16

eschnett · web-flow · commit c8d815aa83c7 · 2024-12-29T15:15:14.000-05:00
Redesign combining table
diff --git a/.github/workflows/ci-fuzz.yml b/.github/workflows/ci-fuzz.yml
@@ -20,4 +20,4 @@ jobs:
       if: failure()
       with:
         name: artifacts
-        path: ./out/artifacts
+        path: ./out/artifacts
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -4,13 +4,13 @@ include (utils.cmake)
 
 disallow_intree_builds()
 
-project (utf8proc VERSION 2.9.0 LANGUAGES C)
+project (utf8proc VERSION 2.10.0 LANGUAGES C)
 
 # This is the ABI version number, which may differ from the
 # API version number (defined in utf8proc.h and above).
 # Be sure to also update these in Makefile and MANIFEST!
 set(SO_MAJOR 3)
-set(SO_MINOR 0)
+set(SO_MINOR 1)
 set(SO_PATCH 0)
 
 option(UTF8PROC_INSTALL "Enable installation of utf8proc" On)
@@ -65,7 +65,7 @@ endif()
 if(UTF8PROC_ENABLE_TESTING)
   enable_testing()
   file(MAKE_DIRECTORY data)
-  set(UNICODE_VERSION 15.1.0)
+  set(UNICODE_VERSION 16.0.0)
   file(DOWNLOAD https://www.unicode.org/Public/${UNICODE_VERSION}/ucd/NormalizationTest.txt ${CMAKE_BINARY_DIR}/data/NormalizationTest.txt SHOW_PROGRESS)
   file(DOWNLOAD https://www.unicode.org/Public/${UNICODE_VERSION}/ucd/auxiliary/GraphemeBreakTest.txt ${CMAKE_BINARY_DIR}/data/GraphemeBreakTest.txt SHOW_PROGRESS)
   add_executable(case test/tests.h test/tests.c utf8proc.h test/case.c)
diff --git a/README.md b/README.md
@@ -59,7 +59,7 @@ The C library is found in this directory after successful compilation
 and is named `libutf8proc.a` (for the static library) and
 `libutf8proc.so` (for the dynamic library).
 
-The Unicode version supported is 15.1.0.
+The Unicode version supported is 16.0.0.
 
 For Unicode normalizations, the following options are used:
 
diff --git a/data/Makefile b/data/Makefile
@@ -21,7 +21,7 @@ utf8proc_data.c.new: data_generator.jl $(RAWDATA)
 	$(JULIA) --project=. data_generator.jl > $@
 
 # Unicode data version (must also update utf8proc_unicode_version function)
-UNICODE_VERSION=15.1.0
+UNICODE_VERSION=16.0.0
 
 UnicodeData.txt:
 	$(CURL) $(CURLFLAGS) -o $@ https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/UnicodeData.txt
diff --git a/data/Manifest.toml b/data/Manifest.toml
@@ -1,69 +1,16 @@
 # This file is machine-generated - editing it directly is not advised
 
-julia_version = "1.9.3"
+julia_version = "1.11.2"
 manifest_format = "2.0"
 project_hash = "bc0740aa2247b17bd49ba693fb87f41bbbddead6"
 
-[[deps.Adapt]]
-deps = ["LinearAlgebra", "Requires"]
-git-tree-sha1 = "cde29ddf7e5726c9fb511f340244ea3481267608"
-uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
-version = "3.7.2"
-
-    [deps.Adapt.extensions]
-    AdaptStaticArraysExt = "StaticArrays"
-
-    [deps.Adapt.weakdeps]
-    StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
-
-[[deps.Artifacts]]
-uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
-
-[[deps.CompilerSupportLibraries_jll]]
-deps = ["Artifacts", "Libdl"]
-uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
-version = "1.0.5+0"
-
-[[deps.Libdl]]
-uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
-
-[[deps.LinearAlgebra]]
-deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"]
-uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
-
 [[deps.OffsetArrays]]
-deps = ["Adapt"]
-git-tree-sha1 = "2ac17d29c523ce1cd38e27785a7d23024853a4bb"
+git-tree-sha1 = "5e1897147d1ff8d98883cda2be2187dcf57d8f0c"
 uuid = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
-version = "1.12.10"
-
-[[deps.OpenBLAS_jll]]
-deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"]
-uuid = "4536629a-c528-5b80-bd46-f80d51c5b363"
-version = "0.3.21+4"
-
-[[deps.Random]]
-deps = ["SHA", "Serialization"]
-uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
-
-[[deps.Requires]]
-deps = ["UUIDs"]
-git-tree-sha1 = "838a3a4188e2ded87a4f9f184b4b0d78a1e91cb7"
-uuid = "ae029012-a4dd-5104-9daa-d747884805df"
-version = "1.3.0"
-
-[[deps.SHA]]
-uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
-version = "0.7.0"
-
-[[deps.Serialization]]
-uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
+version = "1.15.0"
 
-[[deps.UUIDs]]
-deps = ["Random", "SHA"]
-uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
+    [deps.OffsetArrays.extensions]
+    OffsetArraysAdaptExt = "Adapt"
 
-[[deps.libblastrampoline_jll]]
-deps = ["Artifacts", "Libdl"]
-uuid = "8e850b90-86db-534c-a0d3-1478176c7d93"
-version = "5.8.0+0"
+    [deps.OffsetArrays.weakdeps]
+    Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
diff --git a/data/data_generator.jl b/data/data_generator.jl
@@ -236,8 +236,8 @@ let ea_widths = read_east_asian_widths("EastAsianWidth.txt")
             width = 1
         elseif code == 0x2028 || code == 0x2029
             #By definition, should have zero width (on the same line)
-            #0x002028 ' ' category: Zl name: LINE SEPARATOR/
-            #0x002029 ' ' category: Zp name: PARAGRAPH SEPARATOR/
+            #0x002028 '\u2028' category: Zl name: LINE SEPARATOR/
+            #0x002029 '\u2029' category: Zp name: PARAGRAPH SEPARATOR/
             width = 0
         end
 
@@ -256,79 +256,33 @@ end
 # decompressed on the C side at runtime.
 
 # Inverse decomposition mapping tables for combining two characters into a single one.
-comb1st_indices = Dict{UInt32,Int}()
-comb1st_indices_sorted_keys = Origin(0)(UInt32[])
-comb2nd_indices = Dict{UInt32,Int}()
-comb2nd_indices_sorted_keys = Origin(0)(UInt32[])
-comb2nd_indices_nonbasic = Set{UInt32}()
-comb_array = Origin(0)(Vector{Dict{Int,UInt32}}())
+comb_mapping = Dict{UInt32, Dict{UInt32, UInt32}}()
+comb_issecond = Set{UInt32}()
 for char in char_props
+    # Only decompositions of exactly two code points are recorded below; what should happen with longer ones is still an open question.
     if isnothing(char.decomp_type) && !isnothing(char.decomp_mapping) &&
             length(char.decomp_mapping) == 2 && !isnothing(char_hash[char.decomp_mapping[1]]) &&
             char_hash[char.decomp_mapping[1]].combining_class == 0 &&
-            char.code ∉ exclusions
+            (char.code ∉ exclusions && char.code ∉ excl_version)
         dm0 = char.decomp_mapping[1]
         dm1 = char.decomp_mapping[2]
-        if !haskey(comb1st_indices, dm0)
-            comb1st_indices[dm0] = length(comb1st_indices)
-            push!(comb1st_indices_sorted_keys, dm0)
-            push!(comb_array, Dict{Int,UInt32}())
-            @assert length(comb1st_indices) == length(comb_array)
-        end
-        if !haskey(comb2nd_indices, dm1)
-            push!(comb2nd_indices_sorted_keys, dm1)
-            comb2nd_indices[dm1] = length(comb2nd_indices)
-        end
-        @assert !haskey(comb_array[comb1st_indices[dm0]], comb2nd_indices[dm1])
-        comb_array[comb1st_indices[dm0]][comb2nd_indices[dm1]] = char.code
-        if char.code > 0xFFFF
-            push!(comb2nd_indices_nonbasic, dm1)
+        if !haskey(comb_mapping, dm0)
+            comb_mapping[dm0] = Dict{UInt32, UInt32}()
         end
+        comb_mapping[dm0][dm1] = char.code
+        push!(comb_issecond, dm1)
     end
 end
 
-comb_indices = Dict{UInt32,Int}()
-comb1st_indices_lastoffsets = Origin(0)(zeros(Int, length(comb1st_indices)))
-comb1st_indices_firstoffsets = Origin(0)(zeros(Int, length(comb1st_indices)))
+comb_index = Dict{UInt32, UInt32}()
+comb_length = Dict{UInt32, UInt32}()
 let
-    cumoffset = 0
-    for dm0 in comb1st_indices_sorted_keys
-        index = comb1st_indices[dm0]
-        first = nothing
-        last = nothing
-        offset = 0
-        for b in eachindex(comb2nd_indices_sorted_keys)
-            dm1 = comb2nd_indices_sorted_keys[b]
-            if haskey(comb_array[index], b)
-                if isnothing(first)
-                    first = offset
-                end
-                last = offset
-                if dm1 in comb2nd_indices_nonbasic
-                    last += 1
-                end
-            end
-            offset += 1
-            if dm1 in comb2nd_indices_nonbasic
-                offset += 1 
-            end
-        end
-        comb1st_indices_firstoffsets[index] = first
-        comb1st_indices_lastoffsets[index] = last
-        @assert !haskey(comb_indices, dm0)
-        comb_indices[dm0] = cumoffset
-        cumoffset += last - first + 1 + 2
-    end
-
-    offset = 0
-    for dm1 in comb2nd_indices_sorted_keys
-        @assert !haskey(comb_indices, dm1)
-        comb_indices[dm1] = 0x8000 | (comb2nd_indices[dm1] + offset)
-        @assert comb2nd_indices[dm1] + offset <= 0x4000
-        if dm1 in comb2nd_indices_nonbasic
-            comb_indices[dm1] |= 0x4000
-            offset += 1
-        end
+    ind = 0
+    for dm0 in sort!(collect(keys(comb_mapping)))
+        comb_index[dm0] = ind
+        len = length(comb_mapping[dm0])
+        comb_length[dm0] = len
+        ind += len
     end
 end
 
@@ -391,7 +345,9 @@ function char_table_properties!(sequences, char)
         uppercase_seqindex   = encode_sequence!(sequences, char.uppercase_mapping),
         lowercase_seqindex   = encode_sequence!(sequences, char.lowercase_mapping),
         titlecase_seqindex   = encode_sequence!(sequences, char.titlecase_mapping),
-        comb_index           = get(comb_indices, code, typemax(UInt16)),
+        comb_index           = get(comb_index, code, 0x3FF), # see utf8proc_property_struct::comb_index
+        comb_length          = get(comb_length, code, 0),
+        comb_issecond        = code in comb_issecond,
         bidi_mirrored        = char.bidi_mirrored,
         comp_exclusion       = code in exclusions || code in excl_version,
         ignorable            = code in ignorable,
@@ -473,8 +429,7 @@ function c_uint16(seqindex)
 end
 
 function print_c_data_tables(io, sequences, prop_page_indices, prop_pages, deduplicated_props,
-                             comb1st_indices_firstoffsets, comb1st_indices_lastoffsets,
-                             comb2nd_indices_sorted_keys, comb_array, comb2nd_indices_nonbasic)
+                             comb_index, comb_length, comb_issecond)
     print(io, "static const utf8proc_uint16_t utf8proc_sequences[] = ")
     write_c_index_array(io, sequences.storage, 8)
     print(io, "static const utf8proc_uint16_t utf8proc_stage1table[] = ")
@@ -484,7 +439,7 @@ function print_c_data_tables(io, sequences, prop_page_indices, prop_pages, dedup
 
     print(io, """
         static const utf8proc_property_t utf8proc_properties[] = {
-          {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX,  false,false,false,false, 1, 0, 0, UTF8PROC_BOUNDCLASS_OTHER, UTF8PROC_INDIC_CONJUNCT_BREAK_NONE},
+          {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX,  0x3FF,0,false,  false,false,false,false, 1, 0, 0, UTF8PROC_BOUNDCLASS_OTHER, UTF8PROC_INDIC_CONJUNCT_BREAK_NONE},
         """)
     for prop in deduplicated_props
         print(io, "  {",
@@ -498,6 +453,8 @@ function print_c_data_tables(io, sequences, prop_page_indices, prop_pages, dedup
               c_uint16(prop.lowercase_seqindex), ", ",
               c_uint16(prop.titlecase_seqindex), ", ",
               c_uint16(prop.comb_index), ", ",
+              c_uint16(prop.comb_length), ", ",
+              prop.comb_issecond, ", ",
               prop.bidi_mirrored, ", ",
               prop.comp_exclusion, ", ",
               prop.ignorable, ", ",
@@ -512,42 +469,30 @@ function print_c_data_tables(io, sequences, prop_page_indices, prop_pages, dedup
     end
     print(io, "};\n\n")
 
-    print(io, "static const utf8proc_uint16_t utf8proc_combinations[] = {\n  ")
-    i = 0
-    for a in eachindex(comb1st_indices_firstoffsets)
-        offset = 0
-        print(io, comb1st_indices_firstoffsets[a], ", ", comb1st_indices_lastoffsets[a], ", ")
-        for b in eachindex(comb2nd_indices_sorted_keys)
-            dm1 = comb2nd_indices_sorted_keys[b]
-            if offset > comb1st_indices_lastoffsets[a]
-                break
-            end
-            if offset >= comb1st_indices_firstoffsets[a]
-                i += 1
-                if i == 8
-                    i = 0
-                    print(io, "\n  ")
-                end
-                v = get(comb_array[a], b, 0)
-                if dm1 in comb2nd_indices_nonbasic
-                    print(io, (v & 0xFFFF0000) >> 16, ", ")
-                end
-                print(io, v & 0xFFFF, ", ")
-            end
-            offset += 1
-            if dm1 in comb2nd_indices_nonbasic
-                offset += 1
-            end
+    print(io, "static const utf8proc_uint32_t utf8proc_combinations_second[] = {\n")
+    for dm0 in sort!(collect(keys(comb_mapping)))
+        print(io, " ");
+        for dm1 in sort!(collect(keys(comb_mapping[dm0])))
+            print(io, " ", dm1, ",")
+        end
+        print(io, "\n");
+    end
+    print(io, "};\n\n")
+
+    print(io, "static const utf8proc_uint32_t utf8proc_combinations_combined[] = {\n")
+    for dm0 in sort!(collect(keys(comb_mapping)))
+        print(io, " ");
+        for dm1 in sort!(collect(keys(comb_mapping[dm0])))
+            code = comb_mapping[dm0][dm1]
+            print(io, " ", code, ",")
         end
-        print(io, "\n")
+        print(io, "\n");
     end
     print(io, "};\n\n")
 end
 
 
 if !isinteractive()
     print_c_data_tables(stdout, sequences, prop_page_indices, prop_pages, deduplicated_props,
-                        comb1st_indices_firstoffsets, comb1st_indices_lastoffsets,
-                        comb2nd_indices_sorted_keys, comb_array, comb2nd_indices_nonbasic)
+                        comb_index, comb_length, comb_issecond)
 end
-
diff --git a/test/printproperty.c b/test/printproperty.c
@@ -34,6 +34,8 @@ int main(int argc, char **argv)
             "  titlecase_mapping = %04x (seqindex %04x)\n"
             "  casefold = %s\n"
             "  comb_index = %d\n"
+            "  comb_length = %d\n"
+            "  comb_issecond = %d\n"
             "  bidi_mirrored = %d\n"
             "  comp_exclusion = %d\n"
             "  ignorable = %d\n"
@@ -51,6 +53,8 @@ int main(int argc, char **argv)
         utf8proc_totitle(c), p->titlecase_seqindex,
         (char *) map,
         p->comb_index,
+        p->comb_length,
+        p->comb_issecond,
         p->bidi_mirrored,
         p->comp_exclusion,
         p->ignorable,
diff --git a/utf8proc.c b/utf8proc.c
diff --git a/utf8proc.h b/utf8proc.h
diff --git a/utf8proc_data.c b/utf8proc_data.c