@@ -236,8 +236,8 @@ let ea_widths = read_east_asian_widths("EastAsianWidth.txt")
236
236
width = 1
237
237
elseif code == 0x2028 || code == 0x2029
238
238
# By definition, should have zero width (on the same line)
239
- # 0x002028 '[U+2028]' category: Zl name: LINE SEPARATOR/
240
- # 0x002029 '[U+2029]' category: Zp name: PARAGRAPH SEPARATOR/
239
+ # 0x002028 '\u2028 ' category: Zl name: LINE SEPARATOR/
240
+ # 0x002029 '\u2029 ' category: Zp name: PARAGRAPH SEPARATOR/
241
241
width = 0
242
242
end
243
243
@@ -256,79 +256,33 @@ end
256
256
# decompressed on the C side at runtime.
257
257
258
258
# Inverse decomposition mapping tables for combining two characters into a single one.
259
- comb1st_indices = Dict {UInt32,Int} ()
260
- comb1st_indices_sorted_keys = Origin (0 )(UInt32[])
261
- comb2nd_indices = Dict {UInt32,Int} ()
262
- comb2nd_indices_sorted_keys = Origin (0 )(UInt32[])
263
- comb2nd_indices_nonbasic = Set {UInt32} ()
264
- comb_array = Origin (0 )(Vector {Dict{Int,UInt32}} ())
259
+ comb_mapping = Dict {UInt32, Dict{UInt32, UInt32}} ()
260
+ comb_issecond = Set {UInt32} ()
265
261
for char in char_props
262
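+ # NOTE: decompositions longer than 2 code points fail the `length(char.decomp_mapping) == 2` check below and get no entry in comb_mapping. TODO: confirm that skipping them is intended.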
266
263
if isnothing (char. decomp_type) && ! isnothing (char. decomp_mapping) &&
267
264
length (char. decomp_mapping) == 2 && ! isnothing (char_hash[char. decomp_mapping[1 ]]) &&
268
265
char_hash[char. decomp_mapping[1 ]]. combining_class == 0 &&
269
- char. code ∉ exclusions
266
+ ( char. code ∉ exclusions && char . code ∉ excl_version)
270
267
dm0 = char. decomp_mapping[1 ]
271
268
dm1 = char. decomp_mapping[2 ]
272
- if ! haskey (comb1st_indices, dm0)
273
- comb1st_indices[dm0] = length (comb1st_indices)
274
- push! (comb1st_indices_sorted_keys, dm0)
275
- push! (comb_array, Dict {Int,UInt32} ())
276
- @assert length (comb1st_indices) == length (comb_array)
277
- end
278
- if ! haskey (comb2nd_indices, dm1)
279
- push! (comb2nd_indices_sorted_keys, dm1)
280
- comb2nd_indices[dm1] = length (comb2nd_indices)
281
- end
282
- @assert ! haskey (comb_array[comb1st_indices[dm0]], comb2nd_indices[dm1])
283
- comb_array[comb1st_indices[dm0]][comb2nd_indices[dm1]] = char. code
284
- if char. code > 0xFFFF
285
- push! (comb2nd_indices_nonbasic, dm1)
269
+ if ! haskey (comb_mapping, dm0)
270
+ comb_mapping[dm0] = Dict {UInt32, UInt32} ()
286
271
end
272
+ comb_mapping[dm0][dm1] = char. code
273
+ push! (comb_issecond, dm1)
287
274
end
288
275
end
289
276
290
- comb_indices = Dict {UInt32,Int} ()
291
- comb1st_indices_lastoffsets = Origin (0 )(zeros (Int, length (comb1st_indices)))
292
- comb1st_indices_firstoffsets = Origin (0 )(zeros (Int, length (comb1st_indices)))
277
+ comb_index = Dict {UInt32, UInt32} ()
278
+ comb_length = Dict {UInt32, UInt32} ()
293
279
let
294
- cumoffset = 0
295
- for dm0 in comb1st_indices_sorted_keys
296
- index = comb1st_indices[dm0]
297
- first = nothing
298
- last = nothing
299
- offset = 0
300
- for b in eachindex (comb2nd_indices_sorted_keys)
301
- dm1 = comb2nd_indices_sorted_keys[b]
302
- if haskey (comb_array[index], b)
303
- if isnothing (first)
304
- first = offset
305
- end
306
- last = offset
307
- if dm1 in comb2nd_indices_nonbasic
308
- last += 1
309
- end
310
- end
311
- offset += 1
312
- if dm1 in comb2nd_indices_nonbasic
313
- offset += 1
314
- end
315
- end
316
- comb1st_indices_firstoffsets[index] = first
317
- comb1st_indices_lastoffsets[index] = last
318
- @assert ! haskey (comb_indices, dm0)
319
- comb_indices[dm0] = cumoffset
320
- cumoffset += last - first + 1 + 2
321
- end
322
-
323
- offset = 0
324
- for dm1 in comb2nd_indices_sorted_keys
325
- @assert ! haskey (comb_indices, dm1)
326
- comb_indices[dm1] = 0x8000 | (comb2nd_indices[dm1] + offset)
327
- @assert comb2nd_indices[dm1] + offset <= 0x4000
328
- if dm1 in comb2nd_indices_nonbasic
329
- comb_indices[dm1] |= 0x4000
330
- offset += 1
331
- end
280
+ ind = 0
281
+ for dm0 in sort! (collect (keys (comb_mapping)))
282
+ comb_index[dm0] = ind
283
+ len = length (comb_mapping[dm0])
284
+ comb_length[dm0] = len
285
+ ind += len
332
286
end
333
287
end
334
288
@@ -391,7 +345,9 @@ function char_table_properties!(sequences, char)
391
345
uppercase_seqindex = encode_sequence! (sequences, char. uppercase_mapping),
392
346
lowercase_seqindex = encode_sequence! (sequences, char. lowercase_mapping),
393
347
titlecase_seqindex = encode_sequence! (sequences, char. titlecase_mapping),
394
- comb_index = get (comb_indices, code, typemax (UInt16)),
348
+ comb_index = get (comb_index, code, 0x3FF ), # see utf8proc_property_struct::comb_index
349
+ comb_length = get (comb_length, code, 0 ),
350
+ comb_issecond = code in comb_issecond,
395
351
bidi_mirrored = char. bidi_mirrored,
396
352
comp_exclusion = code in exclusions || code in excl_version,
397
353
ignorable = code in ignorable,
@@ -473,8 +429,7 @@ function c_uint16(seqindex)
473
429
end
474
430
475
431
function print_c_data_tables (io, sequences, prop_page_indices, prop_pages, deduplicated_props,
476
- comb1st_indices_firstoffsets, comb1st_indices_lastoffsets,
477
- comb2nd_indices_sorted_keys, comb_array, comb2nd_indices_nonbasic)
432
+ comb_index, comb_length, comb_issecond)
478
433
print (io, " static const utf8proc_uint16_t utf8proc_sequences[] = " )
479
434
write_c_index_array (io, sequences. storage, 8 )
480
435
print (io, " static const utf8proc_uint16_t utf8proc_stage1table[] = " )
@@ -484,7 +439,7 @@ function print_c_data_tables(io, sequences, prop_page_indices, prop_pages, dedup
484
439
485
440
print (io, """
486
441
static const utf8proc_property_t utf8proc_properties[] = {
487
- {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX , false,false,false,false, 1, 0, 0, UTF8PROC_BOUNDCLASS_OTHER, UTF8PROC_INDIC_CONJUNCT_BREAK_NONE},
442
+ {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, 0x3FF,0,false , false,false,false,false, 1, 0, 0, UTF8PROC_BOUNDCLASS_OTHER, UTF8PROC_INDIC_CONJUNCT_BREAK_NONE},
488
443
""" )
489
444
for prop in deduplicated_props
490
445
print (io, " {" ,
@@ -498,6 +453,8 @@ function print_c_data_tables(io, sequences, prop_page_indices, prop_pages, dedup
498
453
c_uint16 (prop. lowercase_seqindex), " , " ,
499
454
c_uint16 (prop. titlecase_seqindex), " , " ,
500
455
c_uint16 (prop. comb_index), " , " ,
456
+ c_uint16 (prop. comb_length), " , " ,
457
+ prop. comb_issecond, " , " ,
501
458
prop. bidi_mirrored, " , " ,
502
459
prop. comp_exclusion, " , " ,
503
460
prop. ignorable, " , " ,
@@ -512,42 +469,30 @@ function print_c_data_tables(io, sequences, prop_page_indices, prop_pages, dedup
512
469
end
513
470
print (io, " };\n\n " )
514
471
515
- print (io, " static const utf8proc_uint16_t utf8proc_combinations[] = {\n " )
516
- i = 0
517
- for a in eachindex (comb1st_indices_firstoffsets)
518
- offset = 0
519
- print (io, comb1st_indices_firstoffsets[a], " , " , comb1st_indices_lastoffsets[a], " , " )
520
- for b in eachindex (comb2nd_indices_sorted_keys)
521
- dm1 = comb2nd_indices_sorted_keys[b]
522
- if offset > comb1st_indices_lastoffsets[a]
523
- break
524
- end
525
- if offset >= comb1st_indices_firstoffsets[a]
526
- i += 1
527
- if i == 8
528
- i = 0
529
- print (io, " \n " )
530
- end
531
- v = get (comb_array[a], b, 0 )
532
- if dm1 in comb2nd_indices_nonbasic
533
- print (io, (v & 0xFFFF0000 ) >> 16 , " , " )
534
- end
535
- print (io, v & 0xFFFF , " , " )
536
- end
537
- offset += 1
538
- if dm1 in comb2nd_indices_nonbasic
539
- offset += 1
540
- end
472
+ print (io, " static const utf8proc_uint32_t utf8proc_combinations_second[] = {\n " )
473
+ for dm0 in sort! (collect (keys (comb_mapping)))
474
+ print (io, " " );
475
+ for dm1 in sort! (collect (keys (comb_mapping[dm0])))
476
+ print (io, " " , dm1, " ," )
477
+ end
478
+ print (io, " \n " );
479
+ end
480
+ print (io, " };\n\n " )
481
+
482
+ print (io, " static const utf8proc_uint32_t utf8proc_combinations_combined[] = {\n " )
483
+ for dm0 in sort! (collect (keys (comb_mapping)))
484
+ print (io, " " );
485
+ for dm1 in sort! (collect (keys (comb_mapping[dm0])))
486
+ code = comb_mapping[dm0][dm1]
487
+ print (io, " " , code, " ," )
541
488
end
542
- print (io, " \n " )
489
+ print (io, " \n " );
543
490
end
544
491
print (io, " };\n\n " )
545
492
end
546
493
547
494
548
495
if ! isinteractive ()
549
496
print_c_data_tables (stdout , sequences, prop_page_indices, prop_pages, deduplicated_props,
550
- comb1st_indices_firstoffsets, comb1st_indices_lastoffsets,
551
- comb2nd_indices_sorted_keys, comb_array, comb2nd_indices_nonbasic)
497
+ comb_index, comb_length, comb_issecond)
552
498
end
553
-
0 commit comments