aboutsummaryrefslogtreecommitdiff
path: root/builtins
diff options
context:
space:
mode:
author Bruce Hill <bruce@bruce-hill.com> 2024-03-03 19:12:53 -0500
committer Bruce Hill <bruce@bruce-hill.com> 2024-03-03 19:12:53 -0500
commit dc04286e3a75d8f94a69c204cb4fbb7b22e2d6a9 (patch)
tree a7e82524f7f29e1b0ce95e4629dc121bc3baf4f2 /builtins
parent d8a533cdf09a9dc7c302913cad514fbeee357b18 (diff)
Unicode normalization for equality, hashing, tests, and printing
Diffstat (limited to 'builtins')
-rw-r--r-- builtins/functions.c 37
-rw-r--r-- builtins/functions.h 3
-rw-r--r-- builtins/text.c 21
3 files changed, 49 insertions, 12 deletions
diff --git a/builtins/functions.c b/builtins/functions.c
index 69fe5934..c5514ffe 100644
--- a/builtins/functions.c
+++ b/builtins/functions.c
@@ -5,6 +5,7 @@
#include <errno.h>
#include <stdlib.h>
#include <sys/param.h>
+#include <uninorm.h>
#include "../SipHash/halfsiphash.h"
#include "../files.h"
@@ -12,6 +13,7 @@
#include "functions.h"
#include "array.h"
#include "table.h"
+#include "text.h"
#include "pointer.h"
#include "string.h"
#include "types.h"
@@ -158,25 +160,48 @@ public void __doctest(void *expr, const TypeInfo *type, CORD expected, const cha
CORD_fprintf(stderr, USE_COLOR ? "\x1b[33;1m>> \x1b[0m%.*s\x1b[m\n" : ">> %.*s\n", (end - start), file->text + start);
if (expr) {
- CORD expr_str = generic_as_text(expr, USE_COLOR, type);
+ CORD expr_cord = generic_as_text(expr, USE_COLOR, type);
CORD type_name = generic_as_text(NULL, false, type);
- CORD_fprintf(stderr, USE_COLOR ? "\x1b[2m=\x1b[0m %r \x1b[2m: %r\x1b[m\n" : "= %r : %r\n", expr_str, type_name);
+ uint8_t buf[512] = {0};
+ size_t buf_len = sizeof(buf)-1;
+ const char *expr_str = CORD_to_const_char_star(expr_cord);
+ uint8_t *normalized_str = u8_normalize(UNINORM_NFD, (uint8_t*)expr_str, strlen(expr_str), buf, &buf_len);
+ if (!normalized_str) errx(1, "Couldn't normalize unicode string!");
+ CORD expr_normalized = CORD_from_char_star((char*)buf);
+ if (normalized_str != buf)
+ free(normalized_str);
+
+ CORD_fprintf(stderr, USE_COLOR ? "\x1b[2m=\x1b[0m %r \x1b[2m: %r\x1b[m\n" : "= %r : %r\n", expr_normalized, type_name);
if (expected) {
- CORD expr_plain = USE_COLOR ? generic_as_text(expr, false, type) : expr_str;
- bool success = (CORD_cmp(expr_plain, expected) == 0);
+ CORD expr_plain = USE_COLOR ? generic_as_text(expr, false, type) : expr_normalized;
+ bool success = Text__equal(&expr_plain, &expected);
if (!success && CORD_chr(expected, 0, ':')) {
- success = (CORD_cmp(CORD_catn(3, expr_plain, " : ", type_name), expected) == 0);
+ CORD with_type = CORD_catn(3, expr_plain, " : ", type_name);
+ success = Text__equal(&with_type, &expected);
}
if (!success) {
fail_source(filename, start, end,
USE_COLOR ? "\x1b[31;1mDoctest failure:\nExpected: \x1b[32;1m%s\x1b[0m\n\x1b[31;1m But got: \x1b[31;7m%s\x1b[0m\n"
: "Doctest failure:\nExpected: %s\n But got: %s\n",
- CORD_to_const_char_star(expected), CORD_to_const_char_star(expr_str));
+ CORD_to_const_char_star(expected), CORD_to_const_char_star(expr_normalized));
}
}
}
}
+// Print `text` to stdout followed by a newline, after converting it to
+// Unicode normalization form NFD.
+public void say(CORD text)
+{
+    uint8_t buf[512] = {0};
+    // On input: usable capacity of `buf`. On return from u8_normalize():
+    // the byte length of the normalized result.
+    size_t buf_len = sizeof(buf)-1;
+    const char *str = CORD_to_const_char_star(text);
+    size_t str_len = strlen(str);
+    uint8_t *normalized = u8_normalize(UNINORM_NFD, (const uint8_t*)str, str_len, buf, &buf_len);
+    if (normalized) {
+        // When the result does not fit in `buf`, u8_normalize() returns a
+        // heap-allocated array that is NOT NUL-terminated, so puts() would
+        // read past its end. Write exactly `buf_len` bytes instead.
+        fwrite(normalized, 1, buf_len, stdout);
+        if (normalized != buf)
+            free(normalized);
+    } else {
+        // Normalization failed; rather than silently dropping the output,
+        // fall back to printing the text without normalization.
+        fwrite(str, 1, str_len, stdout);
+    }
+    fputc('\n', stdout);
+}
+
// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0
diff --git a/builtins/functions.h b/builtins/functions.h
index 8a6026a1..e476f104 100644
--- a/builtins/functions.h
+++ b/builtins/functions.h
@@ -11,7 +11,8 @@ extern const char *SSS_HASH_VECTOR;
void fail(CORD fmt, ...);
void fail_source(const char *filename, int64_t start, int64_t end, CORD fmt, ...);
CORD builtin_last_err();
-public void __doctest(void *expr, const TypeInfo *type, CORD expected, const char *filename, int64_t start, int64_t end);
+void __doctest(void *expr, const TypeInfo *type, CORD expected, const char *filename, int64_t start, int64_t end);
+void say(CORD text);
uint32_t generic_hash(const void *obj, const TypeInfo *type);
int32_t generic_compare(const void *x, const void *y, const TypeInfo *type);
diff --git a/builtins/text.c b/builtins/text.c
index a17ce23b..e280dcc6 100644
--- a/builtins/text.c
+++ b/builtins/text.c
@@ -10,11 +10,13 @@
#include <sys/param.h>
#include <unistr.h>
#include <unicase.h>
+#include <uninorm.h>
#include "../SipHash/halfsiphash.h"
-#include "types.h"
#include "array.h"
+#include "functions.h"
#include "text.h"
+#include "types.h"
#define CLAMP(x, lo, hi) MIN(hi, MAX(x,lo))
@@ -86,12 +88,17 @@ public CORD Text__quoted(CORD str, bool colorize)
+// Three-way comparison of two texts. Both sides are flattened to C strings
+// and compared with u8_normcmp() under NFD, so the ordering is computed on
+// the normalized forms rather than on the raw bytes.
public int Text__compare(CORD *x, CORD *y)
{
- return CORD_cmp(*x, *y);
+    const char *lhs = CORD_to_const_char_star(*x);
+    const char *rhs = CORD_to_const_char_star(*y);
+    size_t lhs_len = strlen(lhs);
+    size_t rhs_len = strlen(rhs);
+    int ordering = 0;
+    // A nonzero status from u8_normcmp() is treated as a failure.
+    int status = u8_normcmp((uint8_t*)lhs, lhs_len, (uint8_t*)rhs, rhs_len, UNINORM_NFD, &ordering);
+    if (status != 0)
+        fail("Something went wrong while comparing text");
+    return ordering;
}
+// Two texts are equal exactly when Text__compare() reports no ordering
+// difference between them.
public bool Text__equal(CORD *x, CORD *y)
{
- return CORD_cmp(*x, *y) == 0;
+    int ordering = Text__compare(x, y);
+    return ordering == 0;
}
public uint32_t Text__hash(CORD *cord)
@@ -99,10 +106,14 @@ public uint32_t Text__hash(CORD *cord)
if (!*cord) return 0;
const char *str = CORD_to_const_char_star(*cord);
- *cord = str;
+    // Hash the NFD-normalized bytes rather than the raw ones, matching the
+    // normalization form that Text__compare() uses for equality.
+    size_t len = strlen(str);
+    uint8_t buf[128] = {0};
+    // On input: usable capacity of `buf`. On return from u8_normalize():
+    // the byte length of the normalized text.
+    size_t norm_len = sizeof(buf)-1;
+    uint8_t *normalized = u8_normalize(UNINORM_NFD, (uint8_t*)str, len, buf, &norm_len);
+    if (!normalized) {
+        // Without this check a failed normalization would hand a NULL
+        // pointer (and a stale length) to halfsiphash().
+        fail("Couldn't normalize unicode string!");
+        // NOTE(review): fail() presumably does not return; this keeps the
+        // function safe if it does.
+        return 0;
+    }
uint32_t hash;
- halfsiphash(str, strlen(str)+1, SSS_HASH_VECTOR, (uint8_t*)&hash, sizeof(hash));
+    halfsiphash(normalized, norm_len, SSS_HASH_VECTOR, (uint8_t*)&hash, sizeof(hash));
+    // u8_normalize() only allocates when the result did not fit in `buf`.
+    if (normalized != buf) free(normalized);
return hash;
}