From dc04286e3a75d8f94a69c204cb4fbb7b22e2d6a9 Mon Sep 17 00:00:00 2001 From: Bruce Hill Date: Sun, 3 Mar 2024 19:12:53 -0500 Subject: Unicode normalization for equality, hashing, tests, and printing --- builtins/functions.c | 37 +++++++++++++++++++++++++++++++------ builtins/functions.h | 3 ++- builtins/text.c | 21 ++++++++++++++++----- compile.c | 17 ++++++++++++++--- test/text.tm | 8 +++++++- tomo.h | 11 ----------- 6 files changed, 70 insertions(+), 27 deletions(-) diff --git a/builtins/functions.c b/builtins/functions.c index 69fe5934..c5514ffe 100644 --- a/builtins/functions.c +++ b/builtins/functions.c @@ -5,6 +5,7 @@ #include #include #include +#include #include "../SipHash/halfsiphash.h" #include "../files.h" @@ -12,6 +13,7 @@ #include "functions.h" #include "array.h" #include "table.h" +#include "text.h" #include "pointer.h" #include "string.h" #include "types.h" @@ -158,25 +160,48 @@ public void __doctest(void *expr, const TypeInfo *type, CORD expected, const cha CORD_fprintf(stderr, USE_COLOR ? "\x1b[33;1m>> \x1b[0m%.*s\x1b[m\n" : ">> %.*s\n", (end - start), file->text + start); if (expr) { - CORD expr_str = generic_as_text(expr, USE_COLOR, type); + CORD expr_cord = generic_as_text(expr, USE_COLOR, type); CORD type_name = generic_as_text(NULL, false, type); - CORD_fprintf(stderr, USE_COLOR ? "\x1b[2m=\x1b[0m %r \x1b[2m: %r\x1b[m\n" : "= %r : %r\n", expr_str, type_name); + uint8_t buf[512] = {0}; + size_t buf_len = sizeof(buf)-1; + const char *expr_str = CORD_to_const_char_star(expr_cord); + uint8_t *normalized_str = u8_normalize(UNINORM_NFD, (uint8_t*)expr_str, strlen(expr_str), buf, &buf_len); + if (!normalized_str) errx(1, "Couldn't normalize unicode string!"); + CORD expr_normalized = CORD_from_char_star((char*)buf); + if (normalized_str != buf) + free(normalized_str); + + CORD_fprintf(stderr, USE_COLOR ? "\x1b[2m=\x1b[0m %r \x1b[2m: %r\x1b[m\n" : "= %r : %r\n", expr_normalized, type_name); if (expected) { - CORD expr_plain = USE_COLOR ? generic_as_text(expr, false, type) : expr_str; - bool success = (CORD_cmp(expr_plain, expected) == 0); + CORD expr_plain = USE_COLOR ? generic_as_text(expr, false, type) : expr_normalized; + bool success = Text__equal(&expr_plain, &expected); if (!success && CORD_chr(expected, 0, ':')) { - success = (CORD_cmp(CORD_catn(3, expr_plain, " : ", type_name), expected) == 0); + CORD with_type = CORD_catn(3, expr_plain, " : ", type_name); + success = Text__equal(&with_type, &expected); } if (!success) { fail_source(filename, start, end, USE_COLOR ? "\x1b[31;1mDoctest failure:\nExpected: \x1b[32;1m%s\x1b[0m\n\x1b[31;1m But got: \x1b[31;7m%s\x1b[0m\n" : "Doctest failure:\nExpected: %s\n But got: %s\n", - CORD_to_const_char_star(expected), CORD_to_const_char_star(expr_str)); + CORD_to_const_char_star(expected), CORD_to_const_char_star(expr_normalized)); } } } } +public void say(CORD text) +{ + uint8_t buf[512] = {0}; + size_t buf_len = sizeof(buf)-1; + const char *str = CORD_to_const_char_star(text); + uint8_t *normalized = u8_normalize(UNINORM_NFD, (uint8_t*)str, strlen(str), buf, &buf_len); + if (normalized) { + puts((char*)normalized); + if (normalized != buf) + free(normalized); + } +} + // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/functions.h b/builtins/functions.h index 8a6026a1..e476f104 100644 --- a/builtins/functions.h +++ b/builtins/functions.h @@ -11,7 +11,8 @@ extern const char *SSS_HASH_VECTOR; void fail(CORD fmt, ...); void fail_source(const char *filename, int64_t start, int64_t end, CORD fmt, ...); CORD builtin_last_err(); -public void __doctest(void *expr, const TypeInfo *type, CORD expected, const char *filename, int64_t start, int64_t end); +void __doctest(void *expr, const TypeInfo *type, CORD expected, const char *filename, int64_t start, int64_t end); +void say(CORD text); uint32_t generic_hash(const void *obj, const TypeInfo *type); int32_t generic_compare(const void *x, const void *y, const TypeInfo *type); diff --git a/builtins/text.c b/builtins/text.c index a17ce23b..e280dcc6 100644 --- a/builtins/text.c +++ b/builtins/text.c @@ -10,11 +10,13 @@ #include #include #include +#include #include "../SipHash/halfsiphash.h" -#include "types.h" #include "array.h" +#include "functions.h" #include "text.h" +#include "types.h" #define CLAMP(x, lo, hi) MIN(hi, MAX(x,lo)) @@ -86,12 +88,17 @@ public CORD Text__quoted(CORD str, bool colorize) public int Text__compare(CORD *x, CORD *y) { - return CORD_cmp(*x, *y); + uint8_t *xx = (uint8_t*)CORD_to_const_char_star(*x); + uint8_t *yy = (uint8_t*)CORD_to_const_char_star(*y); + int result = 0; + if (u8_normcmp(xx, strlen((char*)xx), yy, strlen((char*)yy), UNINORM_NFD, &result)) + fail("Something went wrong while comparing text"); + return result; } public bool Text__equal(CORD *x, CORD *y) { - return CORD_cmp(*x, *y) == 0; + return Text__compare(x, y) == 0; } public uint32_t Text__hash(CORD *cord) @@ -99,10 +106,14 @@ public uint32_t Text__hash(CORD *cord) if (!*cord) return 0; const char *str = CORD_to_const_char_star(*cord); - *cord = str; + size_t len = strlen(str); + uint8_t buf[128] = {0}; + size_t norm_len = sizeof(buf)-1; + uint8_t *normalized = u8_normalize(UNINORM_NFD, (uint8_t*)str, len, buf, &norm_len); uint32_t hash; - halfsiphash(str, strlen(str)+1, SSS_HASH_VECTOR, (uint8_t*)&hash, sizeof(hash)); + halfsiphash(normalized, norm_len, SSS_HASH_VECTOR, (uint8_t*)&hash, sizeof(hash)); + if (normalized != buf) free(normalized); return hash; } diff --git a/compile.c b/compile.c index ae1a66bf..79a92e0d 100644 --- a/compile.c +++ b/compile.c @@ -3,6 +3,7 @@ #include #include #include +#include #include "ast.h" #include "builtins/text.h" @@ -855,6 +856,16 @@ CORD compile(env_t *env, ast_t *ast) if (!expr_t) code_err(test->expr, "I couldn't figure out the type of this expression"); + CORD output = NULL; + if (test->output) { + const uint8_t *raw = (const uint8_t*)CORD_to_const_char_star(test->output); + uint8_t buf[128] = {0}; + size_t norm_len = sizeof(buf)-1; + uint8_t *norm = u8_normalize(UNINORM_NFD, (uint8_t*)raw, strlen((char*)raw), buf, &norm_len); + output = CORD_from_char_star((char*)norm); + if (norm && norm != buf) free(norm); + } + if (test->expr->tag == Declare) { auto decl = Match(test->expr, Declare); return CORD_asprintf( @@ -863,7 +874,7 @@ CORD compile(env_t *env, ast_t *ast) compile(env, test->expr), compile(env, decl->var), compile_type_info(env, get_type(env, decl->value)), - compile(env, WrapAST(test->expr, TextLiteral, .cord=test->output)), + compile(env, WrapAST(test->expr, TextLiteral, .cord=output)), compile(env, WrapAST(test->expr, TextLiteral, .cord=test->expr->file->filename)), (int64_t)(test->expr->start - test->expr->file->text), (int64_t)(test->expr->end - test->expr->file->text)); @@ -890,7 +901,7 @@ CORD compile(env_t *env, ast_t *ast) CORD_appendf(&code, "$test(%r, %r, %r);", compile(env, WrapAST(test->expr, TextLiteral, .cord=src)), expr_cord, - compile(env, WrapAST(test->expr, TextLiteral, .cord=test->output))); + compile(env, WrapAST(test->expr, TextLiteral, .cord=output))); return CORD_cat(code, "\n}"); } else if (expr_t->tag == VoidType || expr_t->tag == AbortType) { return CORD_asprintf( @@ -908,7 +919,7 @@ CORD compile(env_t *env, ast_t *ast) compile_type(expr_t), compile(env, test->expr), compile_type_info(env, expr_t), - compile(env, WrapAST(test->expr, TextLiteral, .cord=test->output)), + compile(env, WrapAST(test->expr, TextLiteral, .cord=output)), compile(env, WrapAST(test->expr, TextLiteral, .cord=test->expr->file->filename)), (int64_t)(test->expr->start - test->expr->file->text), (int64_t)(test->expr->end - test->expr->file->text)); diff --git a/test/text.tm b/test/text.tm index 7d54b811..33582a1c 100644 --- a/test/text.tm +++ b/test/text.tm @@ -6,5 +6,11 @@ >> str:lower():title() = "Hello Amélie!" ->> \U00E9 +>> \UE9 = "é" + +>> \U65\U301 += "é" + +>> \UE9 == \U65\U301 += yes diff --git a/tomo.h b/tomo.h index a3bde24a..2a0ba1f8 100644 --- a/tomo.h +++ b/tomo.h @@ -73,15 +73,4 @@ CORD as_cord(void *x, bool use_color, const char *fmt, ...); #define min(x, y) ({ $var($min_lhs, x); $var($min_rhs, y); $le($min_lhs, $min_rhs) ? $min_lhs : $min_rhs; }) #define max(x, y) ({ $var($min_lhs, x); $var($min_rhs, y); $ge($min_lhs, $min_rhs) ? $min_lhs : $min_rhs; }) -#define say(str) CORD_put(CORD_cat(str, "\n"), stdout) -#define $test(src, expr, expected) do { \ - CORD $result = $cord(expr); \ - CORD $output = CORD_catn(5, USE_COLOR ? "\x1b[33;1m>>\x1b[0m " : ">> ", src, USE_COLOR ? "\n\x1b[0;2m=\x1b[m " : "\n= ", $result, "\x1b[m"); \ - puts(CORD_to_const_char_star($output)); \ - if (expected && CORD_cmp($result, expected)) { \ - fprintf(stderr, USE_COLOR ? "\x1b[31;1;7mTEST FAILURE!\x1b[27m\nI expected:\n\t\x1b[0;1m%s\x1b[1;31m\nbut got:\n\t%s\x1b[m\n" : "TEST FAILURE!\nI expected:\n\t%s\nbut got:\n\t%s\n", CORD_to_const_char_star(expected), CORD_to_const_char_star($result)); \ - raise(SIGABRT); \ - } \ - } while (0) - // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 -- cgit v1.2.3