diff options
| author | Bruce Hill <bruce@bruce-hill.com> | 2024-03-03 19:12:53 -0500 |
|---|---|---|
| committer | Bruce Hill <bruce@bruce-hill.com> | 2024-03-03 19:12:53 -0500 |
| commit | dc04286e3a75d8f94a69c204cb4fbb7b22e2d6a9 (patch) | |
| tree | a7e82524f7f29e1b0ce95e4629dc121bc3baf4f2 /builtins/text.c | |
| parent | d8a533cdf09a9dc7c302913cad514fbeee357b18 (diff) | |
Unicode normalization for equality, hashing, tests, and printing
Diffstat (limited to 'builtins/text.c')
| -rw-r--r-- | builtins/text.c | 21 |
1 files changed, 16 insertions, 5 deletions
```diff
diff --git a/builtins/text.c b/builtins/text.c
index a17ce23b..e280dcc6 100644
--- a/builtins/text.c
+++ b/builtins/text.c
@@ -10,11 +10,13 @@
 #include <sys/param.h>
 #include <unistr.h>
 #include <unicase.h>
+#include <uninorm.h>
 
 #include "../SipHash/halfsiphash.h"
-#include "types.h"
 #include "array.h"
+#include "functions.h"
 #include "text.h"
+#include "types.h"
 
 #define CLAMP(x, lo, hi) MIN(hi, MAX(x,lo))
 
@@ -86,12 +88,17 @@ public CORD Text__quoted(CORD str, bool colorize)
 
 public int Text__compare(CORD *x, CORD *y)
 {
-    return CORD_cmp(*x, *y);
+    uint8_t *xx = (uint8_t*)CORD_to_const_char_star(*x);
+    uint8_t *yy = (uint8_t*)CORD_to_const_char_star(*y);
+    int result = 0;
+    if (u8_normcmp(xx, strlen((char*)xx), yy, strlen((char*)yy), UNINORM_NFD, &result))
+        fail("Something went wrong while comparing text");
+    return result;
 }
 
 public bool Text__equal(CORD *x, CORD *y)
 {
-    return CORD_cmp(*x, *y) == 0;
+    return Text__compare(x, y) == 0;
 }
 
 public uint32_t Text__hash(CORD *cord)
@@ -99,10 +106,14 @@ public uint32_t Text__hash(CORD *cord)
     if (!*cord) return 0;
 
     const char *str = CORD_to_const_char_star(*cord);
-    *cord = str;
+    size_t len = strlen(str);
+    uint8_t buf[128] = {0};
+    size_t norm_len = sizeof(buf)-1;
+    uint8_t *normalized = u8_normalize(UNINORM_NFD, (uint8_t*)str, len, buf, &norm_len);
 
     uint32_t hash;
-    halfsiphash(str, strlen(str)+1, SSS_HASH_VECTOR, (uint8_t*)&hash, sizeof(hash));
+    halfsiphash(normalized, norm_len, SSS_HASH_VECTOR, (uint8_t*)&hash, sizeof(hash));
+    if (normalized != buf) free(normalized);
     return hash;
 }
 
```
