@@ -255,6 +255,38 @@ typedef struct utf8proc_property_struct {
255
255
utf8proc_uint16_t uppercase_seqindex ;
256
256
utf8proc_uint16_t lowercase_seqindex ;
257
257
utf8proc_uint16_t titlecase_seqindex ;
258
+ /**
259
+ * Character combining table.
260
+ *
261
+ * The character combining table is formally indexed by two
262
+ * characters, the first and second character that might form a
263
+ * combining pair. The table entry then contains the combined
264
+ * character. Most character pairs cannot be combined. There are
265
+ * about 1,000 characters that can be the first character in a
266
+ * combining pair, and for most, there are only a handful of
267
+ * possible second characters.
268
+ *
269
+ * The combining table is stored as `utf8proc_uint32_t
270
+ * utf8proc_combinations[][2]`. That is, it contains a pair `(second
271
+ * combining character, combined character)` for every character
272
+ * that can be a first combining character.
273
+ *
274
+ * - `comb_index`: Index into the combining table if this character
275
+ * is the first character in a combining pair, else 0x3ff
276
+ *
277
+ * - `comb_length`: Number of table entries for this first character
278
+ *
279
+ * - `comb_issecond`: As an optimization we also record whether this
280
+ * character is the second combining character in any pair. If
281
+ * not, we can skip the table lookup.
282
+ *
283
+ * A table lookup starts from a given character pair. It first
284
+ * checks whether the first character is stored in the table
285
+ * (checking that the index is not 0x3ff) and whether the second
286
+ * character is stored in the table (looking at `comb_issecond`). If
287
+ * so, the `comb_length` table entries will be checked sequentially
288
+ * for a match.
289
+ */
258
290
utf8proc_uint16_t comb_index :10 ;
259
291
utf8proc_uint16_t comb_length :5 ;
260
292
utf8proc_uint16_t comb_issecond :1 ;
0 commit comments