WIP fixes for synthetic graphemes and adding some text conversion

methods
author: Bruce Hill <bruce@bruce-hill.com> 2024-09-02 22:30:19 -0400
committer: Bruce Hill <bruce@bruce-hill.com> 2024-09-02 22:30:19 -0400
commit: 708acda54e56a74096ec780777596e570db5b627 (patch)
tree: 6a4ecb5020f2259fb2ca50467cf97a8fe0045d88
parent: 5a78eb61c876f73f6ed6bb3d631e0edc065026db (diff)
4 files changed, 101 insertions, 11 deletions
diff --git a/builtins/text.c b/builtins/text.c
index 5e8b4625..d4b3e378 100644
--- a/builtins/text.c
+++ b/builtins/text.c
@@ -32,11 +32,14 @@
 
 #include "siphash.c"
 
-static struct {
+typedef struct {
     int64_t num_codepoints;
     const uint32_t *codepoints;
     const uint8_t *utf8;
-} synthetic_graphemes[1024] = {};
+} synthetic_grapheme_t;
+
+#define MAX_SYNTHETIC_GRAPHEMES 1024
+static synthetic_grapheme_t synthetic_graphemes[MAX_SYNTHETIC_GRAPHEMES] = {};
 
 static int32_t num_synthetic_graphemes = 0;
 
@@ -69,14 +72,36 @@ int32_t find_synthetic_grapheme(const uint32_t *codepoints, int64_t len)
 
 int32_t get_synthetic_grapheme(const uint32_t *codepoints, int64_t len)
 {
+    // for (int i = 0; i < (int)num_synthetic_graphemes; i++) {
+    //     if (!synthetic_graphemes[i].utf8)
+    //         errx(1, "Beforehand, missing grapheme utf8 at index %d", i);
+    // }
     int32_t index = find_synthetic_grapheme(codepoints, len);
-    if (index < num_synthetic_graphemes
+    if (index >= 0 
+        && index < num_synthetic_graphemes
         && synthetic_graphemes[index].num_codepoints == len
         && memcmp(synthetic_graphemes[index].codepoints, codepoints, len) == 0) {
         return -(index+1);
     } else {
-        if (num_synthetic_graphemes > 0)
-            memmove(&synthetic_graphemes[index], &synthetic_graphemes[index + 1], num_synthetic_graphemes - index);
+        if (index < 0) index = 0;
+
+        if (num_synthetic_graphemes >= MAX_SYNTHETIC_GRAPHEMES)
+            fail("Too many synthetic graphemes!");
+
+        if (num_synthetic_graphemes > 0 && index != num_synthetic_graphemes) {
+            // printf("I have %d graphemes and I need to shift %d of them to open a space at %d\n",
+            //        num_synthetic_graphemes, num_synthetic_graphemes - index, index);
+            memmove(&synthetic_graphemes[index + 1], &synthetic_graphemes[index],
+                    sizeof(synthetic_grapheme_t[num_synthetic_graphemes - index]));
+        }
+        // for (int i = 0; i < index; i++) {
+        //     if (!synthetic_graphemes[i].utf8)
+        //         errx(1, "Missing pre-grapheme utf8 at index %d", i);
+        // }
+        // for (int i = index+1; i < num_synthetic_graphemes+1; i++) {
+        //     if (!synthetic_graphemes[i].utf8)
+        //         errx(1, "Missing post-grapheme utf8 at index %d", i);
+        // }
 
         uint32_t *buf = GC_MALLOC_ATOMIC(sizeof(uint32_t[len]));
         memcpy(buf, codepoints, sizeof(uint32_t[len]));
@@ -89,9 +114,16 @@ int32_t get_synthetic_grapheme(const uint32_t *codepoints, int64_t len)
         memcpy(gc_u8, u8, u8_len);
         gc_u8[u8_len] = '\0';
         synthetic_graphemes[index].utf8 = gc_u8;
+        assert(gc_u8);
         free(u8);
 
         ++num_synthetic_graphemes;
+
+        // for (int i = 0; i < (int)num_synthetic_graphemes; i++) {
+        //     if (!synthetic_graphemes[i].utf8)
+        //         errx(1, "Afterwards, missing grapheme utf8 at index %d", i);
+        // }
+
         return -(index+1);
     }
 }
@@ -137,6 +169,8 @@ int text_visualize(FILE *stream, Text_t t)
 
 public int Text$print(FILE *stream, Text_t t)
 {
+    if (t.length == 0) return 0;
+
     switch (t.tag) {
     case TEXT_SHORT_ASCII: return fwrite(t.short_ascii, sizeof(char), t.length, stream);
     case TEXT_ASCII: return fwrite(t.ascii, sizeof(char), t.length, stream);
@@ -153,6 +187,11 @@ public int Text$print(FILE *stream, Text_t t)
                 if (u8 != buf) free(u8);
             } else {
                 const uint8_t *u8 = synthetic_graphemes[-grapheme-1].utf8;
+                if (!u8) {
+                    printf("No synthetic grapheme: %d\n", grapheme);
+                    printf("Num = %d\n", num_synthetic_graphemes);
+                }
+                assert(u8);
                 written += fwrite(u8, sizeof(uint8_t), strlen((char*)u8), stream);
             }
         }
@@ -252,7 +291,7 @@ public Text_t Text$slice(Text_t text, Int_t first_int, Int_t last_int)
 {
     int64_t first = Int_to_Int64(first_int, false);
     int64_t last = Int_to_Int64(last_int, false);
-    if (first == 0) errx(1, "Invalid index: 0");
+    if (first == 0) fail("Invalid index: 0");
     if (last == 0) return (Text_t){.length=0};
 
     if (first < 0) first = text.length + first + 1;
@@ -1044,7 +1083,7 @@ int64_t match(Text_t text, Text_t pattern, int64_t text_index, int64_t pattern_i
 
             skip_whitespace(pattern, &pattern_index);
             if (!match_grapheme(pattern, &pattern_index, ']'))
-                errx(1, "Missing closing ']' in pattern: \"%T\"", &pattern);
+                fail("Missing closing ']' in pattern: %k", &pattern);
 
             int64_t before_group = text_index;
             bool any = false;
@@ -1139,7 +1178,7 @@ int64_t match(Text_t text, Text_t pattern, int64_t text_index, int64_t pattern_i
                 if (!uc_property_is_valid(prop)) {
                     specific_codepoint = unicode_name_character(prop_name);
                     if (specific_codepoint == UNINAME_INVALID)
-                        errx(1, "Not a valid property or character name: %s", prop_name);
+                        fail("Not a valid property or character name: %s", prop_name);
                 }
             } else {
                 any = true;
@@ -1193,7 +1232,7 @@ int64_t match(Text_t text, Text_t pattern, int64_t text_index, int64_t pattern_i
             int32_t close = open;
             uc_mirror_char(open, (uint32_t*)&close);
             if (!match_grapheme(pattern, &pattern_index, close))
-                errx(1, "I expected a closing brace");
+                fail("Pattern's closing brace is missing: %k", &pattern);
             while (text_index < text.length) {
                 int32_t c = _next_grapheme(text, &text_state, text_index);
                 if (c == close)
@@ -1214,7 +1253,7 @@ int64_t match(Text_t text, Text_t pattern, int64_t text_index, int64_t pattern_i
             int32_t close = open;
             uc_mirror_char(open, (uint32_t*)&close);
             if (!match_grapheme(pattern, &pattern_index, close))
-                errx(1, "I expected a closing brace");
+                fail("Pattern's closing brace is missing: %k", &pattern);
             int64_t depth = 1;
             for (; depth > 0 && text_index < text.length; ++text_index) {
                 int32_t c = _next_grapheme(text, &text_state, text_index);
@@ -1485,6 +1524,36 @@ public array_t Text$codepoint_names(Text_t text)
     return names;
 }
 
+public Text_t Text$from_codepoints(array_t codepoints)
+{
+    if (codepoints.stride != sizeof(int32_t))
+        Array$compact(&codepoints, sizeof(int32_t));
+
+    return text_from_u32(codepoints.data, codepoints.length, true);
+}
+
+public Text_t Text$from_codepoint_names(array_t codepoint_names)
+{
+    array_t codepoints = {};
+    for (int64_t i = 0; i < codepoint_names.length; i++) {
+        Text_t *name = ((Text_t*)(codepoint_names.data + i*codepoint_names.stride));
+        const char *name_str = Text$as_c_string(*name);
+        uint32_t codepoint = unicode_name_character(name_str);
+        Array$insert(&codepoints, &codepoint, I_small(0), sizeof(uint32_t));
+    }
+    return Text$from_codepoints(codepoints);
+}
+
+public Text_t Text$from_bytes(array_t bytes)
+{
+    if (bytes.stride != sizeof(int8_t))
+        Array$compact(&bytes, sizeof(int8_t));
+
+    int8_t nul = 0;
+    Array$insert(&bytes, &nul, I_small(0), sizeof(int8_t));
+    return Text$from_str(bytes.data);
+}
+
 public const TypeInfo $Text = {
     .size=sizeof(Text_t),
     .align=__alignof__(Text_t),
diff --git a/builtins/text.h b/builtins/text.h
index 4fd3d5ac..7cef834d 100644
--- a/builtins/text.h
+++ b/builtins/text.h
@@ -68,6 +68,9 @@ array_t Text$clusters(Text_t text);
 array_t Text$utf32_codepoints(Text_t text);
 array_t Text$utf8_bytes(Text_t text);
 array_t Text$codepoint_names(Text_t text);
+Text_t Text$from_codepoints(array_t codepoints);
+Text_t Text$from_codepoint_names(array_t codepoint_names);
+Text_t Text$from_bytes(array_t bytes);
 
 extern const TypeInfo $Text;
 
diff --git a/environment.c b/environment.c
index db01f4d7..100cfcc2 100644
--- a/environment.c
+++ b/environment.c
@@ -251,7 +251,10 @@ env_t *new_compilation_unit(CORD *libname)
             {"as_c_string", "CORD_to_char_star", "func(text:Text)->CString"},
             {"clusters", "Text$clusters", "func(text:Text)->[Text]"},
             {"codepoint_names", "Text$codepoint_names", "func(text:Text)->[Text]"},
-            {"from_c_string", "CORD_from_char_star", "func(str:CString)->Text"},
+            {"from_bytes", "Text$from_bytes", "func(bytes:[Int8])->Text"},
+            {"from_c_string", "Text$from_str", "func(str:CString)->Text"},
+            {"from_codepoint_names", "Text$from_codepoint_names", "func(codepoint_names:[Text])->Text"},
+            {"from_codepoints", "Text$from_codepoints", "func(codepoints:[Int32])->Text"},
             {"has", "Text$has", "func(text:Text, pattern:Text)->Bool"},
             {"join", "Text$join", "func(glue:Text, pieces:[Text])->Text"},
             {"lower", "Text$lower", "func(text:Text)->Text"},
diff --git a/test/text.tm b/test/text.tm
index 150bc31f..05763723 100644
--- a/test/text.tm
+++ b/test/text.tm
@@ -103,3 +103,18 @@ func main():
 	= "A 3"
 	>> $(one (nested) two $(1+2))
 	= "one (nested) two 3"
+
+
+	>> "one two three":replace("[..alpha]", "")
+
+	>> c := "É̩"
+	>> c:codepoint_names()
+	= ["LATIN CAPITAL LETTER E WITH ACUTE", "COMBINING VERTICAL LINE BELOW"]
+	>> c == Text.from_codepoint_names(c:codepoint_names())
+	= yes
+	>> c == Text.from_codepoints(c:utf32_codepoints())
+	= yes
+	>> c == Text.from_bytes(c:utf8_bytes())
+	= yes
+
+
author	Bruce Hill <bruce@bruce-hill.com>	2024-09-02 22:30:19 -0400
committer	Bruce Hill <bruce@bruce-hill.com>	2024-09-02 22:30:19 -0400
commit	708acda54e56a74096ec780777596e570db5b627 (patch)
tree	6a4ecb5020f2259fb2ca50467cf97a8fe0045d88
parent	5a78eb61c876f73f6ed6bb3d631e0edc065026db (diff)