Add unicode name escapes: \U[penguin]

author: Bruce Hill <bruce@bruce-hill.com> 2024-09-07 01:43:36 -0400
committer: Bruce Hill <bruce@bruce-hill.com> 2024-09-07 01:43:36 -0400
commit: b0d806a1f06d89785f47ab72ee60a42495200995 (patch)
tree: fdeadcd79cb39c0f7a6f36a001cdd92911c6ce34
parent: 0a4d18c8bd120802bd5267f1ad117f6ff440478e (diff)
2 files changed, 23 insertions, 2 deletions
diff --git a/parse.c b/parse.c
index 74be5abd..d4f3fbdd 100644
--- a/parse.c
+++ b/parse.c
@@ -8,6 +8,7 @@
 #include <string.h>
 #include <unistr.h>
 #include <unictype.h>
+#include <uniname.h>
 #include <signal.h>
 
 #include "ast.h"
@@ -159,7 +160,7 @@ static void parser_err(parse_ctx_t *ctx, const char *start, const char *end, con
 //
 // Convert an escape sequence like \n to a string
 //
-const char *unescape(const char **out) {
+static const char *unescape(parse_ctx_t *ctx, const char **out) {
     const char **endpos = out;
     const char *escape = *out;
     static const char *unescapes[256] = {['a']="\a",['b']="\b",['e']="\e",['f']="\f",['n']="\n",['r']="\r",['t']="\t",['v']="\v",['_']=" "};
@@ -167,6 +168,23 @@ const char *unescape(const char **out) {
     if (unescapes[(int)escape[1]]) {
         *endpos = escape + 2;
         return GC_strdup(unescapes[(int)escape[1]]);
+    } else if (escape[1] == 'U' && escape[2] == '[') {
+        size_t len = strcspn(&escape[3], "\r\n]");
+        if (escape[3+len] != ']')
+            parser_err(ctx, escape, escape + 3 + len, "Missing closing ']'");
+        char name[len+1] = {};
+        memcpy(name, &escape[3], len);
+        name[len] = '\0';
+        uint32_t codepoint = unicode_name_character(name);
+        if (codepoint == UNINAME_INVALID)
+            parser_err(ctx, escape, escape + 4 + len,
+                       "Invalid unicode codepoint name: \"%s\"", name);
+        *endpos = escape + 4 + len;
+        char *str = GC_MALLOC_ATOMIC(16);
+        size_t u8_len = 16;
+        (void)u32_to_u8(&codepoint, 1, (uint8_t*)str, &u8_len);
+        str[u8_len] = '\0';
+        return str;
     } else if (escape[1] == 'U' && escape[2]) {
         char *endptr = NULL;
         long codepoint = strtol(escape+2, &endptr, 16);
@@ -1167,7 +1185,7 @@ PARSER(parse_text) {
     if (*pos == '\\') {
         CORD cord = CORD_EMPTY;
         do {
-            const char *c = unescape(&pos);
+            const char *c = unescape(ctx, &pos);
             cord = CORD_cat(cord, c);
             // cord = CORD_cat_char(cord, c);
         } while (*pos == '\\');
diff --git a/test/text.tm b/test/text.tm
index afaabb64..bf7b77cd 100644
--- a/test/text.tm
+++ b/test/text.tm
@@ -16,6 +16,9 @@ func main():
 	>> \U65\U301
 	= "é"
 
+	>> \U[Penguin]:codepoint_names()
+	= ["PENGUIN"]
+
 	>> \UE9 == \U65\U301
 	= yes
author	Bruce Hill <bruce@bruce-hill.com>	2024-09-07 01:43:36 -0400
committer	Bruce Hill <bruce@bruce-hill.com>	2024-09-07 01:43:36 -0400
commit	b0d806a1f06d89785f47ab72ee60a42495200995 (patch)
tree	fdeadcd79cb39c0f7a6f36a001cdd92911c6ce34
parent	0a4d18c8bd120802bd5267f1ad117f6ff440478e (diff)