aboutsummaryrefslogtreecommitdiff
path: root/builtins
diff options
context:
space:
mode:
authorBruce Hill <bruce@bruce-hill.com>2024-03-03 19:40:01 -0500
committerBruce Hill <bruce@bruce-hill.com>2024-03-03 19:40:01 -0500
commit1f6aa4cac7af0e00c826d5348b06aed70cbcb3e3 (patch)
treea53f2264f98d329b01278161c7ff32b5c1506593 /builtins
parentdc04286e3a75d8f94a69c204cb4fbb7b22e2d6a9 (diff)
Add methods for text:clusters(), text:codepoints(), text:bytes()
Diffstat (limited to 'builtins')
-rw-r--r--builtins/text.c70
-rw-r--r--builtins/text.h3
2 files changed, 72 insertions, 1 deletions
diff --git a/builtins/text.c b/builtins/text.c
index e280dcc6..d4fbb32e 100644
--- a/builtins/text.c
+++ b/builtins/text.c
@@ -8,9 +8,10 @@
#include <stdint.h>
#include <stdlib.h>
#include <sys/param.h>
-#include <unistr.h>
#include <unicase.h>
+#include <unigbrk.h>
#include <uninorm.h>
+#include <unistr.h>
#include "../SipHash/halfsiphash.h"
#include "array.h"
@@ -110,6 +111,7 @@ public uint32_t Text__hash(CORD *cord)
uint8_t buf[128] = {0};
size_t norm_len = sizeof(buf)-1;
uint8_t *normalized = u8_normalize(UNINORM_NFD, (uint8_t*)str, len, buf, &norm_len);
+ if (!normalized) errx(1, "Unicode normalization error!");
uint32_t hash;
halfsiphash(normalized, norm_len, SSS_HASH_VECTOR, (uint8_t*)&hash, sizeof(hash));
@@ -259,6 +261,72 @@ public CORD Text__join(CORD glue, array_t pieces)
return ret;
}
+// Split `text` into grapheme clusters.
+// The text is normalized to NFD, then each cluster found by u8_grapheme_next()
+// is copied into its own CORD and passed to Array__insert() with index 0
+// (presumably "append" -- confirm against Array__insert's contract).
+// Returns the resulting array; exits via errx() if normalization fails.
+public array_t Text__clusters(CORD text)
+{
+    array_t clusters = {.atomic=1};
+    const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(text);
+    uint8_t buf[128] = {0};
+    size_t norm_len = sizeof(buf)-1;
+    uint8_t *normalized = u8_normalize(UNINORM_NFD, ustr, strlen((char*)ustr), buf, &norm_len);
+    if (!normalized) errx(1, "Unicode normalization error!");
+
+    // u8_normalize() reports the result length through norm_len and does not
+    // NUL-terminate its output (a heap-allocated result has no trailing 0),
+    // so the end of the text must be derived from norm_len, never strlen().
+    const uint8_t *end = normalized + norm_len;
+    for (const uint8_t *pos = normalized; pos != end; ) {
+        const uint8_t *next = u8_grapheme_next(pos, end);
+        // Defensive: treat "no further break" as "the rest is one cluster"
+        // so the loop always terminates at `end`.
+        if (!next) next = end;
+        size_t len = (size_t)(next - pos);
+        char cluster_buf[len+1];
+        // Copy exactly `len` bytes; strlcpy() would scan the source for a NUL
+        // that is not guaranteed to exist.
+        memcpy(cluster_buf, pos, len);
+        cluster_buf[len] = '\0';
+        CORD cluster = CORD_from_char_star(cluster_buf);
+        Array__insert(&clusters, &cluster, 0, $ArrayInfo(&Text));
+        pos = next;
+    }
+
+    // Only free when u8_normalize() had to allocate instead of using `buf`.
+    if (normalized != buf) free(normalized);
+    return clusters;
+}
+
+// Return the code points of `text` (after NFD normalization) as an array of
+// 32-bit values copied into memory obtained from GC_MALLOC_ATOMIC.
+// Exits via errx() if normalization or the UTF-8 -> UTF-32 conversion fails.
+public array_t Text__codepoints(CORD text)
+{
+    const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(text);
+    uint8_t norm_buf[128] = {0};
+    size_t norm_len = sizeof(norm_buf)-1;
+    uint8_t *normalized = u8_normalize(UNINORM_NFD, ustr, strlen((char*)ustr), norm_buf, &norm_len);
+    if (!normalized) errx(1, "Unicode normalization error!");
+
+    uint32_t codepoint_buf[128] = {0};
+    // u8_to_u32() expects the buffer capacity in uint32_t units, not bytes;
+    // passing sizeof(codepoint_buf) would let it write past the array.
+    size_t codepoint_len = sizeof(codepoint_buf)/sizeof(codepoint_buf[0]);
+    uint32_t *codepoints = u8_to_u32(normalized, norm_len, codepoint_buf, &codepoint_len);
+    // NULL signals a conversion failure; do not memcpy() from it.
+    if (!codepoints) errx(1, "Unicode conversion error!");
+
+    array_t ret = {
+        .length=codepoint_len,
+        .data=memcpy(GC_MALLOC_ATOMIC(sizeof(int32_t)*codepoint_len), codepoints, sizeof(int32_t)*codepoint_len),
+        .stride=sizeof(int32_t),
+        .atomic=1,
+    };
+
+    // Release only the buffers libunistring had to allocate itself.
+    if (normalized != norm_buf) free(normalized);
+    if (codepoints != codepoint_buf) free(codepoints);
+    return ret;
+}
+
+// Return the UTF-8 bytes of `text` after NFD normalization, copied into a
+// buffer obtained from GC_MALLOC_ATOMIC (one array element per byte).
+// Exits via errx() if normalization fails.
+public array_t Text__bytes(CORD text)
+{
+    const uint8_t *utf8 = (const uint8_t*)CORD_to_const_char_star(text);
+    uint8_t stack_buf[128] = {0};
+    size_t byte_count = sizeof(stack_buf)-1;
+    uint8_t *nfd = u8_normalize(UNINORM_NFD, utf8, strlen((char*)utf8), stack_buf, &byte_count);
+    if (!nfd) errx(1, "Unicode normalization error!");
+
+    // Copy the normalized bytes out before the temporary is released.
+    const size_t copy_size = sizeof(uint8_t)*byte_count;
+    void *copy = GC_MALLOC_ATOMIC(copy_size);
+    memcpy(copy, nfd, copy_size);
+
+    // u8_normalize() hands back `stack_buf` when the result fits; otherwise it
+    // allocated and the allocation must be freed here.
+    if (nfd != stack_buf) free(nfd);
+
+    return (array_t){
+        .length=byte_count,
+        .data=copy,
+        .stride=sizeof(uint8_t),
+        .atomic=1,
+    };
+}
+
public const TypeInfo Text = {
.size=sizeof(CORD),
.align=__alignof__(CORD),
diff --git a/builtins/text.h b/builtins/text.h
index ecb6e603..44e2c270 100644
--- a/builtins/text.h
+++ b/builtins/text.h
@@ -29,6 +29,9 @@ find_result_t Text__find(CORD str, CORD pat);
CORD Text__replace(CORD text, CORD pat, CORD replacement, int64_t limit);
array_t Text__split(CORD str, CORD split);
CORD Text__join(CORD glue, array_t pieces);
+array_t Text__clusters(CORD text);
+array_t Text__codepoints(CORD text);
+array_t Text__bytes(CORD text);
extern const TypeInfo Text;