diff --git a/builtins/text.c b/builtins/text.c
index e280dcc..d4fbb32 100644
--- a/builtins/text.c
+++ b/builtins/text.c
@@ -8,9 +8,10 @@
 #include
 #include
 #include
-#include
 #include
+#include
 #include
+#include
 
 #include "../SipHash/halfsiphash.h"
 #include "array.h"
@@ -110,6 +111,7 @@ public uint32_t Text__hash(CORD *cord)
     uint8_t buf[128] = {0};
     size_t norm_len = sizeof(buf)-1;
     uint8_t *normalized = u8_normalize(UNINORM_NFD, (uint8_t*)str, len, buf, &norm_len);
+    if (!normalized) errx(1, "Unicode normalization error!");
 
     uint32_t hash;
     halfsiphash(normalized, norm_len, SSS_HASH_VECTOR, (uint8_t*)&hash, sizeof(hash));
@@ -259,6 +261,83 @@ public CORD Text__join(CORD glue, array_t pieces)
     return ret;
 }
 
+// Split text into grapheme clusters (computed on its NFD-normalized form) and
+// return them, in order, as an array of Text values.
+public array_t Text__clusters(CORD text)
+{
+    // NOTE(review): .atomic=1 on an array holding CORD pointers -- confirm the GC still scans the array's data.
+    array_t clusters = {.atomic=1};
+    const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(text);
+    uint8_t buf[128] = {0};
+    size_t norm_len = sizeof(buf)-1;
+    uint8_t *normalized = u8_normalize(UNINORM_NFD, ustr, strlen((char*)ustr), buf, &norm_len);
+    if (!normalized) errx(1, "Unicode normalization error!");
+
+    // The normalized string is norm_len bytes long and is not NUL-terminated
+    // when u8_normalize() had to allocate it, so never strlen() it.
+    const uint8_t *end = normalized + norm_len;
+    for (const uint8_t *pos = normalized; pos < end; ) {
+        const uint8_t *next = u8_grapheme_next(pos, end);
+        if (!next) next = end; // Treat any remainder as the final cluster
+        size_t len = (size_t)(next - pos);
+        char cluster_buf[len+1];
+        memcpy(cluster_buf, pos, len);
+        cluster_buf[len] = '\0';
+        CORD cluster = CORD_from_char_star(cluster_buf);
+        Array__insert(&clusters, &cluster, 0, $ArrayInfo(&Text));
+        pos = next;
+    }
+
+    if (normalized != buf) free(normalized);
+    return clusters;
+}
+
+// Return the Unicode codepoints of the NFD-normalized text as an array of 32-bit integers.
+public array_t Text__codepoints(CORD text)
+{
+    const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(text);
+    uint8_t norm_buf[128] = {0};
+    size_t norm_len = sizeof(norm_buf)-1;
+    uint8_t *normalized = u8_normalize(UNINORM_NFD, ustr, strlen((char*)ustr), norm_buf, &norm_len);
+    if (!normalized) errx(1, "Unicode normalization error!");
+
+    uint32_t codepoint_buf[128] = {0};
+    // u8_to_u32() takes the buffer capacity in uint32_t units, not in bytes
+    size_t codepoint_len = sizeof(codepoint_buf)/sizeof(codepoint_buf[0]);
+    uint32_t *codepoints = u8_to_u32(normalized, norm_len, codepoint_buf, &codepoint_len);
+    if (!codepoints) errx(1, "Unicode conversion error!");
+    array_t ret = {
+        .length=codepoint_len,
+        .data=memcpy(GC_MALLOC_ATOMIC(sizeof(int32_t)*codepoint_len), codepoints, sizeof(int32_t)*codepoint_len),
+        .stride=sizeof(int32_t),
+        .atomic=1,
+    };
+
+    if (normalized != norm_buf) free(normalized);
+    if (codepoints != codepoint_buf) free(codepoints);
+    return ret;
+}
+
+// Return the UTF-8 bytes of the NFD-normalized text as an array of bytes.
+public array_t Text__bytes(CORD text)
+{
+    const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(text);
+    uint8_t norm_buf[128] = {0};
+    size_t norm_len = sizeof(norm_buf)-1;
+    uint8_t *normalized = u8_normalize(UNINORM_NFD, ustr, strlen((char*)ustr), norm_buf, &norm_len);
+    if (!normalized) errx(1, "Unicode normalization error!");
+
+    array_t ret = {
+        .length=norm_len,
+        .data=memcpy(GC_MALLOC_ATOMIC(sizeof(uint8_t)*norm_len), normalized, sizeof(uint8_t)*norm_len),
+        .stride=sizeof(uint8_t),
+        .atomic=1,
+    };
+
+    if (normalized != norm_buf) free(normalized);
+    return ret;
+}
+
 public const TypeInfo Text = {
     .size=sizeof(CORD),
     .align=__alignof__(CORD),
diff --git a/builtins/text.h b/builtins/text.h
index ecb6e60..44e2c27 100644
--- a/builtins/text.h
+++ b/builtins/text.h
@@ -29,6 +29,9 @@ find_result_t Text__find(CORD str, CORD pat);
 CORD Text__replace(CORD text, CORD pat, CORD replacement, int64_t limit);
 array_t Text__split(CORD str, CORD split);
 CORD Text__join(CORD glue, array_t pieces);
+array_t Text__clusters(CORD text);
+array_t Text__codepoints(CORD text);
+array_t Text__bytes(CORD text);
 
 extern const TypeInfo Text;
 
diff --git a/environment.c b/environment.c
index 5dd92be..78c80c4 100644
--- a/environment.c
+++ b/environment.c
@@ -147,18 +147,21 @@ env_t *new_compilation_unit(void)
 #undef F
 #undef C
         {"Text", Type(TextType), "Text_t", "Text", $TypedArray(ns_entry_t,
-            {"quoted", "Text__quoted", "func(s:Text, color=no)->Text"},
-            {"upper", "Text__upper", "func(s:Text)->Text"},
-            {"lower", "Text__lower", "func(s:Text)->Text"},
-            {"title", "Text__title", "func(s:Text)->Text"},
-            // {"has", "Text__has", "func(s:Text, target:Text, where=ANYWHERE)->Bool"},
-            // {"without", "Text__without", "func(s:Text, target:Text, where=ANYWHERE)->Text"},
-            // {"trimmed", "Text__without", "func(s:Text, skip:Text, where=ANYWHERE)->Text"},
-            {"title", "Text__title", "func(s:Text)->Text"},
-            // {"find", "Text__find", "func(s:Text, pattern:Text)->FindResult"},
-            {"replace", "Text__replace", "func(s:Text, pattern:Text, replacement:Text, limit=Int.max)->Text"},
-            {"split", "Text__split", "func(s:Text, split:Text)->[Text]"},
+            {"quoted", "Text__quoted", "func(text:Text, color=no)->Text"},
+            {"upper", "Text__upper", "func(text:Text)->Text"},
+            {"lower", "Text__lower", "func(text:Text)->Text"},
+            {"title", "Text__title", "func(text:Text)->Text"},
+            // {"has", "Text__has", "func(text:Text, target:Text, where=ANYWHERE)->Bool"},
+            // {"without", "Text__without", "func(text:Text, target:Text, where=ANYWHERE)->Text"},
+            // {"trimmed", "Text__without", "func(text:Text, skip:Text, where=ANYWHERE)->Text"},
+            {"title", "Text__title", "func(text:Text)->Text"},
+            // {"find", "Text__find", "func(text:Text, pattern:Text)->FindResult"},
+            {"replace", "Text__replace", "func(text:Text, pattern:Text, replacement:Text, limit=Int.max)->Text"},
+            {"split", "Text__split", "func(text:Text, split:Text)->[Text]"},
             {"join", "Text__join", "func(glue:Text, pieces:[Text])->Text"},
+            {"clusters", "Text__clusters", "func(text:Text)->[Text]"},
+            {"codepoints", "Text__codepoints", "func(text:Text)->[Int32]"},
+            {"bytes", "Text__bytes", "func(text:Text)->[Int8]"},
         )},
     };
 
diff --git a/test/text.tm b/test/text.tm
index 33582a1..690bb57 100644
--- a/test/text.tm
+++ b/test/text.tm
@@ -14,3 +14,11 @@
 
 >> \UE9 == \U65\U301
 = yes
+
+>> amelie := "Amélie"
+>> amelie:clusters()
+= ["A", "m", "é", "l", "i", "e"] : [Text]
+>> amelie:codepoints()
+= [65_i32, 109_i32, 101_i32, 769_i32, 108_i32, 105_i32, 101_i32] : [Int32]
+>> amelie:bytes()
+= [65_i8, 109_i8, 101_i8, -52_i8, -127_i8, 108_i8, 105_i8, 101_i8] : [Int8]