Add methods for text:clusters(), text:codepoints(), text:bytes()

author: Bruce Hill <bruce@bruce-hill.com> 2024-03-03 19:40:01 -0500
committer: Bruce Hill <bruce@bruce-hill.com> 2024-03-03 19:40:01 -0500
commit: 1f6aa4cac7af0e00c826d5348b06aed70cbcb3e3 (patch)
tree: a53f2264f98d329b01278161c7ff32b5c1506593
parent: dc04286e3a75d8f94a69c204cb4fbb7b22e2d6a9 (diff)
4 files changed, 94 insertions, 12 deletions
diff --git a/builtins/text.c b/builtins/text.c
index e280dcc6..d4fbb32e 100644
--- a/builtins/text.c
+++ b/builtins/text.c
@@ -8,9 +8,10 @@
 #include <stdint.h>
 #include <stdlib.h>
 #include <sys/param.h>
-#include <unistr.h>
 #include <unicase.h>
+#include <unigbrk.h>
 #include <uninorm.h>
+#include <unistr.h>
 
 #include "../SipHash/halfsiphash.h"
 #include "array.h"
@@ -110,6 +111,7 @@ public uint32_t Text__hash(CORD *cord)
     uint8_t buf[128] = {0};
     size_t norm_len = sizeof(buf)-1;
     uint8_t *normalized = u8_normalize(UNINORM_NFD, (uint8_t*)str, len, buf, &norm_len);
+    if (!normalized) errx(1, "Unicode normalization error!");
 
     uint32_t hash;
     halfsiphash(normalized, norm_len, SSS_HASH_VECTOR, (uint8_t*)&hash, sizeof(hash));
@@ -259,6 +261,72 @@ public CORD Text__join(CORD glue, array_t pieces)
     return ret;
 }
 
+// Split `text` into grapheme clusters and return them, in order, as an array
+// of Text values. The text is NFD-normalized (UNINORM_NFD) before splitting,
+// so each returned cluster holds the decomposed form of its characters.
+// Exits via errx() if normalization fails.
+public array_t Text__clusters(CORD text)
+{
+    array_t clusters = {.atomic=1};
+    const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(text);
+    uint8_t buf[128] = {0};
+    size_t norm_len = sizeof(buf)-1;
+    uint8_t *normalized = u8_normalize(UNINORM_NFD, ustr, strlen((char*)ustr), buf, &norm_len);
+    if (!normalized) errx(1, "Unicode normalization error!");
+
+    // u8_normalize() reports the result length through norm_len and does not
+    // NUL-terminate it. When the result does not fit in `buf` it comes back in
+    // a freshly malloc()ed buffer, so the end must be derived from norm_len
+    // rather than strlen().
+    const uint8_t *end = normalized + norm_len;
+    for (const uint8_t *pos = normalized; pos < end; ) {
+        const uint8_t *next = u8_grapheme_next(pos, end);
+        // Defensive: treat a NULL result as "rest of the text" so that pos
+        // can never become NULL and be dereferenced on the next iteration.
+        if (!next) next = end;
+        size_t len = (size_t)(next - pos);
+        // Copy exactly `len` bytes and terminate explicitly; the source is
+        // not guaranteed to be NUL-terminated.
+        char cluster_buf[len+1];
+        memcpy(cluster_buf, pos, len);
+        cluster_buf[len] = '\0';
+        CORD cluster = CORD_from_char_star(cluster_buf);
+        Array__insert(&clusters, &cluster, 0, $ArrayInfo(&Text));
+        pos = next;
+    }
+
+    // Only free the result if u8_normalize() had to allocate it.
+    if (normalized != buf) free(normalized);
+    return clusters;
+}
+
+// Return the Unicode codepoints of `text`, after NFD normalization
+// (UNINORM_NFD), as an atomic array of 32-bit integers.
+// Exits via errx() if normalization or the UTF-8 -> UTF-32 conversion fails.
+public array_t Text__codepoints(CORD text)
+{
+    const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(text);
+    uint8_t norm_buf[128] = {0};
+    size_t norm_len = sizeof(norm_buf)-1;
+    uint8_t *normalized = u8_normalize(UNINORM_NFD, ustr, strlen((char*)ustr), norm_buf, &norm_len);
+    if (!normalized) errx(1, "Unicode normalization error!");
+
+    uint32_t codepoint_buf[128] = {0};
+    // u8_to_u32() takes the buffer capacity in uint32_t units, not bytes.
+    // Passing sizeof(codepoint_buf) would let it write past the end of the
+    // stack buffer for inputs longer than 128 codepoints.
+    size_t codepoint_len = sizeof(codepoint_buf)/sizeof(codepoint_buf[0]);
+    uint32_t *codepoints = u8_to_u32(normalized, norm_len, codepoint_buf, &codepoint_len);
+    if (!codepoints) errx(1, "Unicode conversion error!");
+
+    // Copy into GC-managed memory so the returned array does not point at the
+    // stack buffer or at malloc()ed memory that is freed below.
+    array_t ret = {
+        .length=codepoint_len,
+        .data=memcpy(GC_MALLOC_ATOMIC(sizeof(int32_t)*codepoint_len), codepoints, sizeof(int32_t)*codepoint_len),
+        .stride=sizeof(int32_t),
+        .atomic=1,
+    };
+
+    // Both libunistring calls only allocate when their result did not fit in
+    // the caller-supplied buffer.
+    if (normalized != norm_buf) free(normalized);
+    if (codepoints != codepoint_buf) free(codepoints);
+    return ret;
+}
+
+// Return the UTF-8 bytes of `text`, after NFD normalization (UNINORM_NFD),
+// as an atomic array with one byte per element.
+// Exits via errx() if normalization fails.
+public array_t Text__bytes(CORD text)
+{
+    const char *raw = CORD_to_const_char_star(text);
+    uint8_t stack_buf[128] = {0};
+    size_t num_bytes = sizeof(stack_buf)-1;
+    uint8_t *nfd = u8_normalize(UNINORM_NFD, (const uint8_t*)raw, strlen(raw), stack_buf, &num_bytes);
+    if (nfd == NULL)
+        errx(1, "Unicode normalization error!");
+
+    // Move the bytes into GC-managed memory before releasing the
+    // normalization result.
+    void *copy = GC_MALLOC_ATOMIC(sizeof(uint8_t)*num_bytes);
+    memcpy(copy, nfd, sizeof(uint8_t)*num_bytes);
+
+    // u8_normalize() hands back stack_buf itself when the result fit there;
+    // anything else is a separate allocation that must be freed.
+    if (nfd != stack_buf)
+        free(nfd);
+
+    array_t ret = {
+        .length=num_bytes,
+        .data=copy,
+        .stride=sizeof(uint8_t),
+        .atomic=1,
+    };
+    return ret;
+}
+
 public const TypeInfo Text = {
     .size=sizeof(CORD),
     .align=__alignof__(CORD),
diff --git a/builtins/text.h b/builtins/text.h
index ecb6e603..44e2c270 100644
--- a/builtins/text.h
+++ b/builtins/text.h
@@ -29,6 +29,9 @@ find_result_t Text__find(CORD str, CORD pat);
 CORD Text__replace(CORD text, CORD pat, CORD replacement, int64_t limit);
 array_t Text__split(CORD str, CORD split);
 CORD Text__join(CORD glue, array_t pieces);
+array_t Text__clusters(CORD text);
+array_t Text__codepoints(CORD text);
+array_t Text__bytes(CORD text);
 
 extern const TypeInfo Text;
 
diff --git a/environment.c b/environment.c
index 5dd92beb..78c80c4e 100644
--- a/environment.c
+++ b/environment.c
@@ -147,18 +147,21 @@ env_t *new_compilation_unit(void)
 #undef F
 #undef C
         {"Text", Type(TextType), "Text_t", "Text", $TypedArray(ns_entry_t,
-            {"quoted", "Text__quoted", "func(s:Text, color=no)->Text"},
-            {"upper", "Text__upper", "func(s:Text)->Text"},
-            {"lower", "Text__lower", "func(s:Text)->Text"},
-            {"title", "Text__title", "func(s:Text)->Text"},
-            // {"has", "Text__has", "func(s:Text, target:Text, where=ANYWHERE)->Bool"},
-            // {"without", "Text__without", "func(s:Text, target:Text, where=ANYWHERE)->Text"},
-            // {"trimmed", "Text__without", "func(s:Text, skip:Text, where=ANYWHERE)->Text"},
-            {"title", "Text__title", "func(s:Text)->Text"},
-            // {"find", "Text__find", "func(s:Text, pattern:Text)->FindResult"},
-            {"replace", "Text__replace", "func(s:Text, pattern:Text, replacement:Text, limit=Int.max)->Text"},
-            {"split", "Text__split", "func(s:Text, split:Text)->[Text]"},
+            {"quoted", "Text__quoted", "func(text:Text, color=no)->Text"},
+            {"upper", "Text__upper", "func(text:Text)->Text"},
+            {"lower", "Text__lower", "func(text:Text)->Text"},
+            {"title", "Text__title", "func(text:Text)->Text"},
+            // {"has", "Text__has", "func(text:Text, target:Text, where=ANYWHERE)->Bool"},
+            // {"without", "Text__without", "func(text:Text, target:Text, where=ANYWHERE)->Text"},
+            // {"trimmed", "Text__without", "func(text:Text, skip:Text, where=ANYWHERE)->Text"},
+            {"title", "Text__title", "func(text:Text)->Text"},
+            // {"find", "Text__find", "func(text:Text, pattern:Text)->FindResult"},
+            {"replace", "Text__replace", "func(text:Text, pattern:Text, replacement:Text, limit=Int.max)->Text"},
+            {"split", "Text__split", "func(text:Text, split:Text)->[Text]"},
             {"join", "Text__join", "func(glue:Text, pieces:[Text])->Text"},
+            {"clusters", "Text__clusters", "func(text:Text)->[Text]"},
+            {"codepoints", "Text__codepoints", "func(text:Text)->[Int32]"},
+            {"bytes", "Text__bytes", "func(text:Text)->[Int8]"},
         )},
     };
 
diff --git a/test/text.tm b/test/text.tm
index 33582a1c..690bb57a 100644
--- a/test/text.tm
+++ b/test/text.tm
@@ -14,3 +14,11 @@
 
 >> \UE9 == \U65\U301
 = yes
+
+>> amelie := "Amélie"
+>> amelie:clusters()
+= ["A", "m", "é", "l", "i", "e"] : [Text]
+>> amelie:codepoints()
+= [65_i32, 109_i32, 101_i32, 769_i32, 108_i32, 105_i32, 101_i32] : [Int32]
+>> amelie:bytes()
+= [65_i8, 109_i8, 101_i8, -52_i8, -127_i8, 108_i8, 105_i8, 101_i8] : [Int8]
author	Bruce Hill <bruce@bruce-hill.com>	2024-03-03 19:40:01 -0500
committer	Bruce Hill <bruce@bruce-hill.com>	2024-03-03 19:40:01 -0500
commit	1f6aa4cac7af0e00c826d5348b06aed70cbcb3e3 (patch)
tree	a53f2264f98d329b01278161c7ff32b5c1506593
parent	dc04286e3a75d8f94a69c204cb4fbb7b22e2d6a9 (diff)