WIP fixes for synthetic graphemes and adding some text conversion methods
2024-09-02 22:30:19 -04:00 · 2024-09-02 22:30:19 -04:00 · 708acda54e
commit 708acda54e
parent 5a78eb61c8
4 changed files with 101 additions and 11 deletions
--- a/builtins/text.c
+++ b/builtins/text.c
@@ -32,11 +32,14 @@

 #include "siphash.c"

-static struct {
+typedef struct {
    int64_t num_codepoints;
    const uint32_t *codepoints;
    const uint8_t *utf8;
-} synthetic_graphemes[1024] = {};
+} synthetic_grapheme_t;
+
+#define MAX_SYNTHETIC_GRAPHEMES 1024
+static synthetic_grapheme_t synthetic_graphemes[MAX_SYNTHETIC_GRAPHEMES] = {};

 static int32_t num_synthetic_graphemes = 0;

@@ -69,14 +72,36 @@ int32_t find_synthetic_grapheme(const uint32_t *codepoints, int64_t len)

 int32_t get_synthetic_grapheme(const uint32_t *codepoints, int64_t len)
 {
+    // for (int i = 0; i < (int)num_synthetic_graphemes; i++) {
+    //     if (!synthetic_graphemes[i].utf8)
+    //         errx(1, "Beforehand, missing grapheme utf8 at index %d", i);
+    // }
    int32_t index = find_synthetic_grapheme(codepoints, len);
-    if (index < num_synthetic_graphemes
+    if (index >= 0 
+        && index < num_synthetic_graphemes
        && synthetic_graphemes[index].num_codepoints == len
        && memcmp(synthetic_graphemes[index].codepoints, codepoints, len) == 0) {
        return -(index+1);
    } else {
-        if (num_synthetic_graphemes > 0)
-            memmove(&synthetic_graphemes[index], &synthetic_graphemes[index + 1], num_synthetic_graphemes - index);
+        if (index < 0) index = 0;
+
+        if (num_synthetic_graphemes >= MAX_SYNTHETIC_GRAPHEMES)
+            fail("Too many synthetic graphemes!");
+
+        if (num_synthetic_graphemes > 0 && index != num_synthetic_graphemes) {
+            // printf("I have %d graphemes and I need to shift %d of them to open a space at %d\n",
+            //        num_synthetic_graphemes, num_synthetic_graphemes - index, index);
+            memmove(&synthetic_graphemes[index + 1], &synthetic_graphemes[index],
+                    sizeof(synthetic_grapheme_t[num_synthetic_graphemes - index]));
+        }
+        // for (int i = 0; i < index; i++) {
+        //     if (!synthetic_graphemes[i].utf8)
+        //         errx(1, "Missing pre-grapheme utf8 at index %d", i);
+        // }
+        // for (int i = index+1; i < num_synthetic_graphemes+1; i++) {
+        //     if (!synthetic_graphemes[i].utf8)
+        //         errx(1, "Missing post-grapheme utf8 at index %d", i);
+        // }

        uint32_t *buf = GC_MALLOC_ATOMIC(sizeof(uint32_t[len]));
        memcpy(buf, codepoints, sizeof(uint32_t[len]));
@@ -89,9 +114,16 @@ int32_t get_synthetic_grapheme(const uint32_t *codepoints, int64_t len)
        memcpy(gc_u8, u8, u8_len);
        gc_u8[u8_len] = '\0';
        synthetic_graphemes[index].utf8 = gc_u8;
+        assert(gc_u8);
        free(u8);

        ++num_synthetic_graphemes;
+
+        // for (int i = 0; i < (int)num_synthetic_graphemes; i++) {
+        //     if (!synthetic_graphemes[i].utf8)
+        //         errx(1, "Afterwards, missing grapheme utf8 at index %d", i);
+        // }
+
        return -(index+1);
    }
 }
@@ -137,6 +169,8 @@ int text_visualize(FILE *stream, Text_t t)

 public int Text$print(FILE *stream, Text_t t)
 {
+    if (t.length == 0) return 0;
+
    switch (t.tag) {
    case TEXT_SHORT_ASCII: return fwrite(t.short_ascii, sizeof(char), t.length, stream);
    case TEXT_ASCII: return fwrite(t.ascii, sizeof(char), t.length, stream);
@@ -153,6 +187,11 @@ public int Text$print(FILE *stream, Text_t t)
                if (u8 != buf) free(u8);
            } else {
                const uint8_t *u8 = synthetic_graphemes[-grapheme-1].utf8;
+                if (!u8) {
+                    printf("No synthetic grapheme: %d\n", grapheme);
+                    printf("Num = %d\n", num_synthetic_graphemes);
+                }
+                assert(u8);
                written += fwrite(u8, sizeof(uint8_t), strlen((char*)u8), stream);
            }
        }
@@ -252,7 +291,7 @@ public Text_t Text$slice(Text_t text, Int_t first_int, Int_t last_int)
 {
    int64_t first = Int_to_Int64(first_int, false);
    int64_t last = Int_to_Int64(last_int, false);
-    if (first == 0) errx(1, "Invalid index: 0");
+    if (first == 0) fail("Invalid index: 0");
    if (last == 0) return (Text_t){.length=0};

    if (first < 0) first = text.length + first + 1;
@@ -1044,7 +1083,7 @@ int64_t match(Text_t text, Text_t pattern, int64_t text_index, int64_t pattern_i

            skip_whitespace(pattern, &pattern_index);
            if (!match_grapheme(pattern, &pattern_index, ']'))
-                errx(1, "Missing closing ']' in pattern: \"%T\"", &pattern);
+                fail("Missing closing ']' in pattern: %k", &pattern);

            int64_t before_group = text_index;
            bool any = false;
@@ -1139,7 +1178,7 @@ int64_t match(Text_t text, Text_t pattern, int64_t text_index, int64_t pattern_i
                if (!uc_property_is_valid(prop)) {
                    specific_codepoint = unicode_name_character(prop_name);
                    if (specific_codepoint == UNINAME_INVALID)
-                        errx(1, "Not a valid property or character name: %s", prop_name);
+                        fail("Not a valid property or character name: %s", prop_name);
                }
            } else {
                any = true;
@@ -1193,7 +1232,7 @@ int64_t match(Text_t text, Text_t pattern, int64_t text_index, int64_t pattern_i
            int32_t close = open;
            uc_mirror_char(open, (uint32_t*)&close);
            if (!match_grapheme(pattern, &pattern_index, close))
-                errx(1, "I expected a closing brace");
+                fail("Pattern's closing brace is missing: %k", &pattern);
            while (text_index < text.length) {
                int32_t c = _next_grapheme(text, &text_state, text_index);
                if (c == close)
@@ -1214,7 +1253,7 @@ int64_t match(Text_t text, Text_t pattern, int64_t text_index, int64_t pattern_i
            int32_t close = open;
            uc_mirror_char(open, (uint32_t*)&close);
            if (!match_grapheme(pattern, &pattern_index, close))
-                errx(1, "I expected a closing brace");
+                fail("Pattern's closing brace is missing: %k", &pattern);
            int64_t depth = 1;
            for (; depth > 0 && text_index < text.length; ++text_index) {
                int32_t c = _next_grapheme(text, &text_state, text_index);
@@ -1485,6 +1524,36 @@ public array_t Text$codepoint_names(Text_t text)
    return names;
 }

+public Text_t Text$from_codepoints(array_t codepoints)
+{
+    if (codepoints.stride != sizeof(int32_t))
+        Array$compact(&codepoints, sizeof(int32_t));
+
+    return text_from_u32(codepoints.data, codepoints.length, true);
+}
+
+public Text_t Text$from_codepoint_names(array_t codepoint_names)
+{
+    array_t codepoints = {};
+    for (int64_t i = 0; i < codepoint_names.length; i++) {
+        Text_t *name = ((Text_t*)(codepoint_names.data + i*codepoint_names.stride));
+        const char *name_str = Text$as_c_string(*name);
+        uint32_t codepoint = unicode_name_character(name_str);
+        Array$insert(&codepoints, &codepoint, I_small(0), sizeof(uint32_t));
+    }
+    return Text$from_codepoints(codepoints);
+}
+
+public Text_t Text$from_bytes(array_t bytes)
+{
+    if (bytes.stride != sizeof(int8_t))
+        Array$compact(&bytes, sizeof(int8_t));
+
+    int8_t nul = 0;
+    Array$insert(&bytes, &nul, I_small(0), sizeof(int8_t));
+    return Text$from_str(bytes.data);
+}
+
 public const TypeInfo $Text = {
    .size=sizeof(Text_t),
    .align=__alignof__(Text_t),
--- a/builtins/text.h
+++ b/builtins/text.h
@@ -68,6 +68,9 @@ array_t Text$clusters(Text_t text);
 array_t Text$utf32_codepoints(Text_t text);
 array_t Text$utf8_bytes(Text_t text);
 array_t Text$codepoint_names(Text_t text);
+Text_t Text$from_codepoints(array_t codepoints);
+Text_t Text$from_codepoint_names(array_t codepoint_names);
+Text_t Text$from_bytes(array_t bytes);

 extern const TypeInfo $Text;

--- a/environment.c
+++ b/environment.c
@@ -251,7 +251,10 @@ env_t *new_compilation_unit(CORD *libname)
            {"as_c_string", "CORD_to_char_star", "func(text:Text)->CString"},
            {"clusters", "Text$clusters", "func(text:Text)->[Text]"},
            {"codepoint_names", "Text$codepoint_names", "func(text:Text)->[Text]"},
-            {"from_c_string", "CORD_from_char_star", "func(str:CString)->Text"},
+            {"from_bytes", "Text$from_bytes", "func(bytes:[Int8])->Text"},
+            {"from_c_string", "Text$from_str", "func(str:CString)->Text"},
+            {"from_codepoint_names", "Text$from_codepoint_names", "func(codepoint_names:[Text])->Text"},
+            {"from_codepoints", "Text$from_codepoints", "func(codepoints:[Int32])->Text"},
            {"has", "Text$has", "func(text:Text, pattern:Text)->Bool"},
            {"join", "Text$join", "func(glue:Text, pieces:[Text])->Text"},
            {"lower", "Text$lower", "func(text:Text)->Text"},
--- a/test/text.tm
+++ b/test/text.tm
@@ -103,3 +103,18 @@ func main():
 	= "A 3"
 	>> $(one (nested) two $(1+2))
 	= "one (nested) two 3"
+
+
+	>> "one two three":replace("[..alpha]", "")
+
+	>> c := "É̩"
+	>> c:codepoint_names()
+	= ["LATIN CAPITAL LETTER E WITH ACUTE", "COMBINING VERTICAL LINE BELOW"]
+	>> c == Text.from_codepoint_names(c:codepoint_names())
+	= yes
+	>> c == Text.from_codepoints(c:utf32_codepoints())
+	= yes
+	>> c == Text.from_bytes(c:utf8_bytes())
+	= yes
+
+