aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Bruce Hill <bruce@bruce-hill.com> 2024-03-03 19:12:53 -0500
committer Bruce Hill <bruce@bruce-hill.com> 2024-03-03 19:12:53 -0500
commit dc04286e3a75d8f94a69c204cb4fbb7b22e2d6a9 (patch)
tree a7e82524f7f29e1b0ce95e4629dc121bc3baf4f2
parent d8a533cdf09a9dc7c302913cad514fbeee357b18 (diff)
Unicode normalization for equality, hashing, tests, and printing
-rw-r--r-- builtins/functions.c 37
-rw-r--r-- builtins/functions.h 3
-rw-r--r-- builtins/text.c 21
-rw-r--r-- compile.c 17
-rw-r--r-- test/text.tm 8
-rw-r--r-- tomo.h 11
6 files changed, 70 insertions, 27 deletions
diff --git a/builtins/functions.c b/builtins/functions.c
index 69fe5934..c5514ffe 100644
--- a/builtins/functions.c
+++ b/builtins/functions.c
@@ -5,6 +5,7 @@
#include <errno.h>
#include <stdlib.h>
#include <sys/param.h>
+#include <uninorm.h>
#include "../SipHash/halfsiphash.h"
#include "../files.h"
@@ -12,6 +13,7 @@
#include "functions.h"
#include "array.h"
#include "table.h"
+#include "text.h"
#include "pointer.h"
#include "string.h"
#include "types.h"
@@ -158,25 +160,48 @@ public void __doctest(void *expr, const TypeInfo *type, CORD expected, const cha
CORD_fprintf(stderr, USE_COLOR ? "\x1b[33;1m>> \x1b[0m%.*s\x1b[m\n" : ">> %.*s\n", (end - start), file->text + start);
if (expr) {
- CORD expr_str = generic_as_text(expr, USE_COLOR, type);
+ CORD expr_cord = generic_as_text(expr, USE_COLOR, type);
CORD type_name = generic_as_text(NULL, false, type);
- CORD_fprintf(stderr, USE_COLOR ? "\x1b[2m=\x1b[0m %r \x1b[2m: %r\x1b[m\n" : "= %r : %r\n", expr_str, type_name);
+ uint8_t buf[512] = {0};
+ size_t buf_len = sizeof(buf)-1;
+ const char *expr_str = CORD_to_const_char_star(expr_cord);
+ uint8_t *normalized_str = u8_normalize(UNINORM_NFD, (uint8_t*)expr_str, strlen(expr_str), buf, &buf_len);
+ if (!normalized_str) errx(1, "Couldn't normalize unicode string!");
+ CORD expr_normalized = CORD_from_char_star((char*)buf);
+ if (normalized_str != buf)
+ free(normalized_str);
+
+ CORD_fprintf(stderr, USE_COLOR ? "\x1b[2m=\x1b[0m %r \x1b[2m: %r\x1b[m\n" : "= %r : %r\n", expr_normalized, type_name);
if (expected) {
- CORD expr_plain = USE_COLOR ? generic_as_text(expr, false, type) : expr_str;
- bool success = (CORD_cmp(expr_plain, expected) == 0);
+ CORD expr_plain = USE_COLOR ? generic_as_text(expr, false, type) : expr_normalized;
+ bool success = Text__equal(&expr_plain, &expected);
if (!success && CORD_chr(expected, 0, ':')) {
- success = (CORD_cmp(CORD_catn(3, expr_plain, " : ", type_name), expected) == 0);
+ CORD with_type = CORD_catn(3, expr_plain, " : ", type_name);
+ success = Text__equal(&with_type, &expected);
}
if (!success) {
fail_source(filename, start, end,
USE_COLOR ? "\x1b[31;1mDoctest failure:\nExpected: \x1b[32;1m%s\x1b[0m\n\x1b[31;1m But got: \x1b[31;7m%s\x1b[0m\n"
: "Doctest failure:\nExpected: %s\n But got: %s\n",
- CORD_to_const_char_star(expected), CORD_to_const_char_star(expr_str));
+ CORD_to_const_char_star(expected), CORD_to_const_char_star(expr_normalized));
}
}
}
}
+// Print `text` to stdout in Unicode NFD-normalized form, followed by a newline.
+//
+// The text is flattened to a C string and passed through u8_normalize(). A
+// 512-byte stack buffer is offered for the result; when the normalized text
+// does not fit, u8_normalize() hands back a heap allocation instead, which is
+// released here before returning.
+public void say(CORD text)
+{
+    uint8_t buf[512] = {0};
+    size_t buf_len = sizeof(buf)-1;
+    const char *str = CORD_to_const_char_star(text);
+    uint8_t *normalized = u8_normalize(UNINORM_NFD, (uint8_t*)str, strlen(str), buf, &buf_len);
+    if (!normalized) {
+        // Normalization failed: print the text as-is rather than silently
+        // dropping the output.
+        puts(str);
+        return;
+    }
+    // The result is length-counted (buf_len), not NUL-terminated, so it must
+    // not be handed to puts(): a heap-allocated result would be read past its
+    // end. Write exactly buf_len bytes and add the newline explicitly.
+    fwrite(normalized, sizeof(uint8_t), buf_len, stdout);
+    fputc('\n', stdout);
+    if (normalized != buf)
+        free(normalized);
+}
+
// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0
diff --git a/builtins/functions.h b/builtins/functions.h
index 8a6026a1..e476f104 100644
--- a/builtins/functions.h
+++ b/builtins/functions.h
@@ -11,7 +11,8 @@ extern const char *SSS_HASH_VECTOR;
void fail(CORD fmt, ...);
void fail_source(const char *filename, int64_t start, int64_t end, CORD fmt, ...);
CORD builtin_last_err();
-public void __doctest(void *expr, const TypeInfo *type, CORD expected, const char *filename, int64_t start, int64_t end);
+void __doctest(void *expr, const TypeInfo *type, CORD expected, const char *filename, int64_t start, int64_t end);
+void say(CORD text);
uint32_t generic_hash(const void *obj, const TypeInfo *type);
int32_t generic_compare(const void *x, const void *y, const TypeInfo *type);
diff --git a/builtins/text.c b/builtins/text.c
index a17ce23b..e280dcc6 100644
--- a/builtins/text.c
+++ b/builtins/text.c
@@ -10,11 +10,13 @@
#include <sys/param.h>
#include <unistr.h>
#include <unicase.h>
+#include <uninorm.h>
#include "../SipHash/halfsiphash.h"
-#include "types.h"
#include "array.h"
+#include "functions.h"
#include "text.h"
+#include "types.h"
#define CLAMP(x, lo, hi) MIN(hi, MAX(x,lo))
@@ -86,12 +88,17 @@ public CORD Text__quoted(CORD str, bool colorize)
// Three-way comparison of two texts.
// The removed implementation compared the cords directly with CORD_cmp();
// the added implementation flattens both cords to C strings and compares them
// with u8_normcmp() using UNINORM_NFD, returning the ordering value that
// u8_normcmp() stores in `result`. A nonzero status from u8_normcmp() is
// reported through fail().
public int Text__compare(CORD *x, CORD *y)
{
- return CORD_cmp(*x, *y);
+ uint8_t *xx = (uint8_t*)CORD_to_const_char_star(*x);
+ uint8_t *yy = (uint8_t*)CORD_to_const_char_star(*y);
+ int result = 0; // out-parameter filled in by u8_normcmp()
+ if (u8_normcmp(xx, strlen((char*)xx), yy, strlen((char*)yy), UNINORM_NFD, &result)) // nonzero status is treated as an error
+ fail("Something went wrong while comparing text");
+ return result;
}
// Equality test for two texts.
// The removed implementation tested CORD_cmp() == 0; the added implementation
// delegates to Text__compare() and reports whether it returned 0.
public bool Text__equal(CORD *x, CORD *y)
{
- return CORD_cmp(*x, *y) == 0;
+ return Text__compare(x, y) == 0;
}
public uint32_t Text__hash(CORD *cord)
@@ -99,10 +106,14 @@ public uint32_t Text__hash(CORD *cord)
if (!*cord) return 0;
const char *str = CORD_to_const_char_star(*cord);
- *cord = str;
+ size_t len = strlen(str);
+ uint8_t buf[128] = {0};
+ size_t norm_len = sizeof(buf)-1;
+ uint8_t *normalized = u8_normalize(UNINORM_NFD, (uint8_t*)str, len, buf, &norm_len);
uint32_t hash;
- halfsiphash(str, strlen(str)+1, SSS_HASH_VECTOR, (uint8_t*)&hash, sizeof(hash));
+ halfsiphash(normalized, norm_len, SSS_HASH_VECTOR, (uint8_t*)&hash, sizeof(hash));
+ if (normalized != buf) free(normalized);
return hash;
}
diff --git a/compile.c b/compile.c
index ae1a66bf..79a92e0d 100644
--- a/compile.c
+++ b/compile.c
@@ -3,6 +3,7 @@
#include <gc/cord.h>
#include <gc.h>
#include <stdio.h>
+#include <uninorm.h>
#include "ast.h"
#include "builtins/text.h"
@@ -855,6 +856,16 @@ CORD compile(env_t *env, ast_t *ast)
if (!expr_t)
code_err(test->expr, "I couldn't figure out the type of this expression");
+ CORD output = NULL;
+ if (test->output) {
+ const uint8_t *raw = (const uint8_t*)CORD_to_const_char_star(test->output);
+ uint8_t buf[128] = {0};
+ size_t norm_len = sizeof(buf)-1;
+ uint8_t *norm = u8_normalize(UNINORM_NFD, (uint8_t*)raw, strlen((char*)raw), buf, &norm_len);
+ output = CORD_from_char_star((char*)norm);
+ if (norm && norm != buf) free(norm);
+ }
+
if (test->expr->tag == Declare) {
auto decl = Match(test->expr, Declare);
return CORD_asprintf(
@@ -863,7 +874,7 @@ CORD compile(env_t *env, ast_t *ast)
compile(env, test->expr),
compile(env, decl->var),
compile_type_info(env, get_type(env, decl->value)),
- compile(env, WrapAST(test->expr, TextLiteral, .cord=test->output)),
+ compile(env, WrapAST(test->expr, TextLiteral, .cord=output)),
compile(env, WrapAST(test->expr, TextLiteral, .cord=test->expr->file->filename)),
(int64_t)(test->expr->start - test->expr->file->text),
(int64_t)(test->expr->end - test->expr->file->text));
@@ -890,7 +901,7 @@ CORD compile(env_t *env, ast_t *ast)
CORD_appendf(&code, "$test(%r, %r, %r);",
compile(env, WrapAST(test->expr, TextLiteral, .cord=src)),
expr_cord,
- compile(env, WrapAST(test->expr, TextLiteral, .cord=test->output)));
+ compile(env, WrapAST(test->expr, TextLiteral, .cord=output)));
return CORD_cat(code, "\n}");
} else if (expr_t->tag == VoidType || expr_t->tag == AbortType) {
return CORD_asprintf(
@@ -908,7 +919,7 @@ CORD compile(env_t *env, ast_t *ast)
compile_type(expr_t),
compile(env, test->expr),
compile_type_info(env, expr_t),
- compile(env, WrapAST(test->expr, TextLiteral, .cord=test->output)),
+ compile(env, WrapAST(test->expr, TextLiteral, .cord=output)),
compile(env, WrapAST(test->expr, TextLiteral, .cord=test->expr->file->filename)),
(int64_t)(test->expr->start - test->expr->file->text),
(int64_t)(test->expr->end - test->expr->file->text));
diff --git a/test/text.tm b/test/text.tm
index 7d54b811..33582a1c 100644
--- a/test/text.tm
+++ b/test/text.tm
@@ -6,5 +6,11 @@
>> str:lower():title()
= "Hello Amélie!"
->> \U00E9
+>> \UE9
= "é"
+
+>> \U65\U301
+= "é"
+
+>> \UE9 == \U65\U301
+= yes
diff --git a/tomo.h b/tomo.h
index a3bde24a..2a0ba1f8 100644
--- a/tomo.h
+++ b/tomo.h
@@ -73,15 +73,4 @@ CORD as_cord(void *x, bool use_color, const char *fmt, ...);
#define min(x, y) ({ $var($min_lhs, x); $var($min_rhs, y); $le($min_lhs, $min_rhs) ? $min_lhs : $min_rhs; })
#define max(x, y) ({ $var($min_lhs, x); $var($min_rhs, y); $ge($min_lhs, $min_rhs) ? $min_lhs : $min_rhs; })
-#define say(str) CORD_put(CORD_cat(str, "\n"), stdout)
-#define $test(src, expr, expected) do { \
- CORD $result = $cord(expr); \
- CORD $output = CORD_catn(5, USE_COLOR ? "\x1b[33;1m>>\x1b[0m " : ">> ", src, USE_COLOR ? "\n\x1b[0;2m=\x1b[m " : "\n= ", $result, "\x1b[m"); \
- puts(CORD_to_const_char_star($output)); \
- if (expected && CORD_cmp($result, expected)) { \
- fprintf(stderr, USE_COLOR ? "\x1b[31;1;7mTEST FAILURE!\x1b[27m\nI expected:\n\t\x1b[0;1m%s\x1b[1;31m\nbut got:\n\t%s\x1b[m\n" : "TEST FAILURE!\nI expected:\n\t%s\nbut got:\n\t%s\n", CORD_to_const_char_star(expected), CORD_to_const_char_star($result)); \
- raise(SIGABRT); \
- } \
- } while (0)
-
// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0