diff options
| author | Bruce Hill <bruce@bruce-hill.com> | 2024-03-03 19:12:53 -0500 |
|---|---|---|
| committer | Bruce Hill <bruce@bruce-hill.com> | 2024-03-03 19:12:53 -0500 |
| commit | dc04286e3a75d8f94a69c204cb4fbb7b22e2d6a9 (patch) | |
| tree | a7e82524f7f29e1b0ce95e4629dc121bc3baf4f2 /builtins | |
| parent | d8a533cdf09a9dc7c302913cad514fbeee357b18 (diff) | |
Unicode normalization for equality, hashing, tests, and printing
Diffstat (limited to 'builtins')
| -rw-r--r-- | builtins/functions.c | 37 | ||||
| -rw-r--r-- | builtins/functions.h | 3 | ||||
| -rw-r--r-- | builtins/text.c | 21 |
3 files changed, 49 insertions, 12 deletions
diff --git a/builtins/functions.c b/builtins/functions.c index 69fe5934..c5514ffe 100644 --- a/builtins/functions.c +++ b/builtins/functions.c @@ -5,6 +5,7 @@ #include <errno.h> #include <stdlib.h> #include <sys/param.h> +#include <uninorm.h> #include "../SipHash/halfsiphash.h" #include "../files.h" @@ -12,6 +13,7 @@ #include "functions.h" #include "array.h" #include "table.h" +#include "text.h" #include "pointer.h" #include "string.h" #include "types.h" @@ -158,25 +160,48 @@ public void __doctest(void *expr, const TypeInfo *type, CORD expected, const cha CORD_fprintf(stderr, USE_COLOR ? "\x1b[33;1m>> \x1b[0m%.*s\x1b[m\n" : ">> %.*s\n", (end - start), file->text + start); if (expr) { - CORD expr_str = generic_as_text(expr, USE_COLOR, type); + CORD expr_cord = generic_as_text(expr, USE_COLOR, type); CORD type_name = generic_as_text(NULL, false, type); - CORD_fprintf(stderr, USE_COLOR ? "\x1b[2m=\x1b[0m %r \x1b[2m: %r\x1b[m\n" : "= %r : %r\n", expr_str, type_name); + uint8_t buf[512] = {0}; + size_t buf_len = sizeof(buf)-1; + const char *expr_str = CORD_to_const_char_star(expr_cord); + uint8_t *normalized_str = u8_normalize(UNINORM_NFD, (uint8_t*)expr_str, strlen(expr_str), buf, &buf_len); + if (!normalized_str) errx(1, "Couldn't normalize unicode string!"); + CORD expr_normalized = CORD_from_char_star((char*)buf); + if (normalized_str != buf) + free(normalized_str); + + CORD_fprintf(stderr, USE_COLOR ? "\x1b[2m=\x1b[0m %r \x1b[2m: %r\x1b[m\n" : "= %r : %r\n", expr_normalized, type_name); if (expected) { - CORD expr_plain = USE_COLOR ? generic_as_text(expr, false, type) : expr_str; - bool success = (CORD_cmp(expr_plain, expected) == 0); + CORD expr_plain = USE_COLOR ? generic_as_text(expr, false, type) : expr_normalized; + bool success = Text__equal(&expr_plain, &expected); if (!success && CORD_chr(expected, 0, ':')) { - success = (CORD_cmp(CORD_catn(3, expr_plain, " : ", type_name), expected) == 0); + CORD with_type = CORD_catn(3, expr_plain, " : ", type_name); + success = Text__equal(&with_type, &expected); } if (!success) { fail_source(filename, start, end, USE_COLOR ? "\x1b[31;1mDoctest failure:\nExpected: \x1b[32;1m%s\x1b[0m\n\x1b[31;1m But got: \x1b[31;7m%s\x1b[0m\n" : "Doctest failure:\nExpected: %s\n But got: %s\n", - CORD_to_const_char_star(expected), CORD_to_const_char_star(expr_str)); + CORD_to_const_char_star(expected), CORD_to_const_char_star(expr_normalized)); } } } } +public void say(CORD text) +{ + uint8_t buf[512] = {0}; + size_t buf_len = sizeof(buf)-1; + const char *str = CORD_to_const_char_star(text); + uint8_t *normalized = u8_normalize(UNINORM_NFD, (uint8_t*)str, strlen(str), buf, &buf_len); + if (normalized) { + puts((char*)normalized); + if (normalized != buf) + free(normalized); + } +} + // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/functions.h b/builtins/functions.h index 8a6026a1..e476f104 100644 --- a/builtins/functions.h +++ b/builtins/functions.h @@ -11,7 +11,8 @@ extern const char *SSS_HASH_VECTOR; void fail(CORD fmt, ...); void fail_source(const char *filename, int64_t start, int64_t end, CORD fmt, ...); CORD builtin_last_err(); -public void __doctest(void *expr, const TypeInfo *type, CORD expected, const char *filename, int64_t start, int64_t end); +void __doctest(void *expr, const TypeInfo *type, CORD expected, const char *filename, int64_t start, int64_t end); +void say(CORD text); uint32_t generic_hash(const void *obj, const TypeInfo *type); int32_t generic_compare(const void *x, const void *y, const TypeInfo *type); diff --git a/builtins/text.c b/builtins/text.c index a17ce23b..e280dcc6 100644 --- a/builtins/text.c +++ b/builtins/text.c @@ -10,11 +10,13 @@ #include <sys/param.h> #include <unistr.h> #include <unicase.h> +#include <uninorm.h> #include "../SipHash/halfsiphash.h" -#include "types.h" #include "array.h" +#include "functions.h" #include "text.h" +#include "types.h" #define CLAMP(x, lo, hi) MIN(hi, MAX(x,lo)) @@ -86,12 +88,17 @@ public CORD Text__quoted(CORD str, bool colorize) public int Text__compare(CORD *x, CORD *y) { - return CORD_cmp(*x, *y); + uint8_t *xx = (uint8_t*)CORD_to_const_char_star(*x); + uint8_t *yy = (uint8_t*)CORD_to_const_char_star(*y); + int result = 0; + if (u8_normcmp(xx, strlen((char*)xx), yy, strlen((char*)yy), UNINORM_NFD, &result)) + fail("Something went wrong while comparing text"); + return result; } public bool Text__equal(CORD *x, CORD *y) { - return CORD_cmp(*x, *y) == 0; + return Text__compare(x, y) == 0; } public uint32_t Text__hash(CORD *cord) @@ -99,10 +106,14 @@ public uint32_t Text__hash(CORD *cord) if (!*cord) return 0; const char *str = CORD_to_const_char_star(*cord); - *cord = str; + size_t len = strlen(str); + uint8_t buf[128] = {0}; + size_t norm_len = sizeof(buf)-1; + uint8_t *normalized = u8_normalize(UNINORM_NFD, (uint8_t*)str, len, buf, &norm_len); uint32_t hash; - halfsiphash(str, strlen(str)+1, SSS_HASH_VECTOR, (uint8_t*)&hash, sizeof(hash)); + halfsiphash(normalized, norm_len, SSS_HASH_VECTOR, (uint8_t*)&hash, sizeof(hash)); + if (normalized != buf) free(normalized); return hash; } |
