about summary refs log tree commit diff
diff options
context:
space:
mode:
author Bruce Hill <bruce@bruce-hill.com> 2024-09-02 22:30:19 -0400
committer Bruce Hill <bruce@bruce-hill.com> 2024-09-02 22:30:19 -0400
commit 708acda54e56a74096ec780777596e570db5b627 (patch)
tree 6a4ecb5020f2259fb2ca50467cf97a8fe0045d88
parent 5a78eb61c876f73f6ed6bb3d631e0edc065026db (diff)
WIP fixes for synthetic graphemes and adding some text conversion methods
-rw-r--r-- builtins/text.c 89
-rw-r--r-- builtins/text.h 3
-rw-r--r-- environment.c 5
-rw-r--r-- test/text.tm 15
4 files changed, 101 insertions, 11 deletions
diff --git a/builtins/text.c b/builtins/text.c
index 5e8b4625..d4b3e378 100644
--- a/builtins/text.c
+++ b/builtins/text.c
@@ -32,11 +32,14 @@
#include "siphash.c"
-static struct {
+typedef struct {
int64_t num_codepoints;
const uint32_t *codepoints;
const uint8_t *utf8;
-} synthetic_graphemes[1024] = {};
+} synthetic_grapheme_t;
+
+#define MAX_SYNTHETIC_GRAPHEMES 1024
+static synthetic_grapheme_t synthetic_graphemes[MAX_SYNTHETIC_GRAPHEMES] = {};
static int32_t num_synthetic_graphemes = 0;
@@ -69,14 +72,36 @@ int32_t find_synthetic_grapheme(const uint32_t *codepoints, int64_t len)
int32_t get_synthetic_grapheme(const uint32_t *codepoints, int64_t len)
{
+ // for (int i = 0; i < (int)num_synthetic_graphemes; i++) {
+ // if (!synthetic_graphemes[i].utf8)
+ // errx(1, "Beforehand, missing grapheme utf8 at index %d", i);
+ // }
int32_t index = find_synthetic_grapheme(codepoints, len);
- if (index < num_synthetic_graphemes
+ if (index >= 0
+ && index < num_synthetic_graphemes
&& synthetic_graphemes[index].num_codepoints == len
&& memcmp(synthetic_graphemes[index].codepoints, codepoints, len) == 0) {
return -(index+1);
} else {
- if (num_synthetic_graphemes > 0)
- memmove(&synthetic_graphemes[index], &synthetic_graphemes[index + 1], num_synthetic_graphemes - index);
+ if (index < 0) index = 0;
+
+ if (num_synthetic_graphemes >= MAX_SYNTHETIC_GRAPHEMES)
+ fail("Too many synthetic graphemes!");
+
+ if (num_synthetic_graphemes > 0 && index != num_synthetic_graphemes) {
+ // printf("I have %d graphemes and I need to shift %d of them to open a space at %d\n",
+ // num_synthetic_graphemes, num_synthetic_graphemes - index, index);
+ memmove(&synthetic_graphemes[index + 1], &synthetic_graphemes[index],
+ sizeof(synthetic_grapheme_t[num_synthetic_graphemes - index]));
+ }
+ // for (int i = 0; i < index; i++) {
+ // if (!synthetic_graphemes[i].utf8)
+ // errx(1, "Missing pre-grapheme utf8 at index %d", i);
+ // }
+ // for (int i = index+1; i < num_synthetic_graphemes+1; i++) {
+ // if (!synthetic_graphemes[i].utf8)
+ // errx(1, "Missing post-grapheme utf8 at index %d", i);
+ // }
uint32_t *buf = GC_MALLOC_ATOMIC(sizeof(uint32_t[len]));
memcpy(buf, codepoints, sizeof(uint32_t[len]));
@@ -89,9 +114,16 @@ int32_t get_synthetic_grapheme(const uint32_t *codepoints, int64_t len)
memcpy(gc_u8, u8, u8_len);
gc_u8[u8_len] = '\0';
synthetic_graphemes[index].utf8 = gc_u8;
+ assert(gc_u8);
free(u8);
++num_synthetic_graphemes;
+
+ // for (int i = 0; i < (int)num_synthetic_graphemes; i++) {
+ // if (!synthetic_graphemes[i].utf8)
+ // errx(1, "Afterwards, missing grapheme utf8 at index %d", i);
+ // }
+
return -(index+1);
}
}
@@ -137,6 +169,8 @@ int text_visualize(FILE *stream, Text_t t)
public int Text$print(FILE *stream, Text_t t)
{
+ if (t.length == 0) return 0;
+
switch (t.tag) {
case TEXT_SHORT_ASCII: return fwrite(t.short_ascii, sizeof(char), t.length, stream);
case TEXT_ASCII: return fwrite(t.ascii, sizeof(char), t.length, stream);
@@ -153,6 +187,11 @@ public int Text$print(FILE *stream, Text_t t)
if (u8 != buf) free(u8);
} else {
const uint8_t *u8 = synthetic_graphemes[-grapheme-1].utf8;
+ if (!u8) {
+ printf("No synthetic grapheme: %d\n", grapheme);
+ printf("Num = %d\n", num_synthetic_graphemes);
+ }
+ assert(u8);
written += fwrite(u8, sizeof(uint8_t), strlen((char*)u8), stream);
}
}
@@ -252,7 +291,7 @@ public Text_t Text$slice(Text_t text, Int_t first_int, Int_t last_int)
{
int64_t first = Int_to_Int64(first_int, false);
int64_t last = Int_to_Int64(last_int, false);
- if (first == 0) errx(1, "Invalid index: 0");
+ if (first == 0) fail("Invalid index: 0");
if (last == 0) return (Text_t){.length=0};
if (first < 0) first = text.length + first + 1;
@@ -1044,7 +1083,7 @@ int64_t match(Text_t text, Text_t pattern, int64_t text_index, int64_t pattern_i
skip_whitespace(pattern, &pattern_index);
if (!match_grapheme(pattern, &pattern_index, ']'))
- errx(1, "Missing closing ']' in pattern: \"%T\"", &pattern);
+ fail("Missing closing ']' in pattern: %k", &pattern);
int64_t before_group = text_index;
bool any = false;
@@ -1139,7 +1178,7 @@ int64_t match(Text_t text, Text_t pattern, int64_t text_index, int64_t pattern_i
if (!uc_property_is_valid(prop)) {
specific_codepoint = unicode_name_character(prop_name);
if (specific_codepoint == UNINAME_INVALID)
- errx(1, "Not a valid property or character name: %s", prop_name);
+ fail("Not a valid property or character name: %s", prop_name);
}
} else {
any = true;
@@ -1193,7 +1232,7 @@ int64_t match(Text_t text, Text_t pattern, int64_t text_index, int64_t pattern_i
int32_t close = open;
uc_mirror_char(open, (uint32_t*)&close);
if (!match_grapheme(pattern, &pattern_index, close))
- errx(1, "I expected a closing brace");
+ fail("Pattern's closing brace is missing: %k", &pattern);
while (text_index < text.length) {
int32_t c = _next_grapheme(text, &text_state, text_index);
if (c == close)
@@ -1214,7 +1253,7 @@ int64_t match(Text_t text, Text_t pattern, int64_t text_index, int64_t pattern_i
int32_t close = open;
uc_mirror_char(open, (uint32_t*)&close);
if (!match_grapheme(pattern, &pattern_index, close))
- errx(1, "I expected a closing brace");
+ fail("Pattern's closing brace is missing: %k", &pattern);
int64_t depth = 1;
for (; depth > 0 && text_index < text.length; ++text_index) {
int32_t c = _next_grapheme(text, &text_state, text_index);
@@ -1485,6 +1524,36 @@ public array_t Text$codepoint_names(Text_t text)
return names;
}
+public Text_t Text$from_codepoints(array_t codepoints)
+{
+ if (codepoints.stride != sizeof(int32_t))
+ Array$compact(&codepoints, sizeof(int32_t));
+
+ return text_from_u32(codepoints.data, codepoints.length, true);
+}
+
+public Text_t Text$from_codepoint_names(array_t codepoint_names)
+{
+ array_t codepoints = {};
+ for (int64_t i = 0; i < codepoint_names.length; i++) {
+ Text_t *name = ((Text_t*)(codepoint_names.data + i*codepoint_names.stride));
+ const char *name_str = Text$as_c_string(*name);
+ uint32_t codepoint = unicode_name_character(name_str);
+ Array$insert(&codepoints, &codepoint, I_small(0), sizeof(uint32_t));
+ }
+ return Text$from_codepoints(codepoints);
+}
+
+public Text_t Text$from_bytes(array_t bytes)
+{
+ if (bytes.stride != sizeof(int8_t))
+ Array$compact(&bytes, sizeof(int8_t));
+
+ int8_t nul = 0;
+ Array$insert(&bytes, &nul, I_small(0), sizeof(int8_t));
+ return Text$from_str(bytes.data);
+}
+
public const TypeInfo $Text = {
.size=sizeof(Text_t),
.align=__alignof__(Text_t),
diff --git a/builtins/text.h b/builtins/text.h
index 4fd3d5ac..7cef834d 100644
--- a/builtins/text.h
+++ b/builtins/text.h
@@ -68,6 +68,9 @@ array_t Text$clusters(Text_t text);
array_t Text$utf32_codepoints(Text_t text);
array_t Text$utf8_bytes(Text_t text);
array_t Text$codepoint_names(Text_t text);
+Text_t Text$from_codepoints(array_t codepoints);
+Text_t Text$from_codepoint_names(array_t codepoint_names);
+Text_t Text$from_bytes(array_t bytes);
extern const TypeInfo $Text;
diff --git a/environment.c b/environment.c
index db01f4d7..100cfcc2 100644
--- a/environment.c
+++ b/environment.c
@@ -251,7 +251,10 @@ env_t *new_compilation_unit(CORD *libname)
{"as_c_string", "CORD_to_char_star", "func(text:Text)->CString"},
{"clusters", "Text$clusters", "func(text:Text)->[Text]"},
{"codepoint_names", "Text$codepoint_names", "func(text:Text)->[Text]"},
- {"from_c_string", "CORD_from_char_star", "func(str:CString)->Text"},
+ {"from_bytes", "Text$from_bytes", "func(bytes:[Int8])->Text"},
+ {"from_c_string", "Text$from_str", "func(str:CString)->Text"},
+ {"from_codepoint_names", "Text$from_codepoint_names", "func(codepoint_names:[Text])->Text"},
+ {"from_codepoints", "Text$from_codepoints", "func(codepoints:[Int32])->Text"},
{"has", "Text$has", "func(text:Text, pattern:Text)->Bool"},
{"join", "Text$join", "func(glue:Text, pieces:[Text])->Text"},
{"lower", "Text$lower", "func(text:Text)->Text"},
diff --git a/test/text.tm b/test/text.tm
index 150bc31f..05763723 100644
--- a/test/text.tm
+++ b/test/text.tm
@@ -103,3 +103,18 @@ func main():
= "A 3"
>> $(one (nested) two $(1+2))
= "one (nested) two 3"
+
+
+ >> "one two three":replace("[..alpha]", "")
+
+ >> c := "É̩"
+ >> c:codepoint_names()
+ = ["LATIN CAPITAL LETTER E WITH ACUTE", "COMBINING VERTICAL LINE BELOW"]
+ >> c == Text.from_codepoint_names(c:codepoint_names())
+ = yes
+ >> c == Text.from_codepoints(c:utf32_codepoints())
+ = yes
+ >> c == Text.from_bytes(c:utf8_bytes())
+ = yes
+
+