Text fixes

2024-09-02 21:18:15 -04:00 · 2024-09-02 21:18:15 -04:00 · cac560fdc3
commit cac560fdc3
parent 55d758c3c4
3 changed files with 17 additions and 19 deletions
--- a/builtins/functions.c
+++ b/builtins/functions.c
@@ -218,13 +218,10 @@ public void end_test(void *expr, const TypeInfo *type, const char *expected, con
        Text_t expr_plain = USE_COLOR ? generic_as_text(expr, false, type) : expr_text;
        bool success = Text$equal(&expr_plain, &expected_text);
        if (!success) {
-            printf("Not equal: %k vs %k\n", &expr_plain, &expected_text);
            Int_t colon = Text$find(expected_text, Text$from_str(":"), I_small(0), NULL);
            if (colon.small != I_small(0).small) {
                Text_t with_type = Text$concat(expr_plain, Text$from_str(" : "), type_name);
                success = Text$equal(&with_type, &expected_text);
-                if (!success)
-                    printf("Not equal: %k vs %k\n", &with_type, &expected_text);
            }
        }

--- a/builtins/text.c
+++ b/builtins/text.c
@@ -268,10 +268,11 @@ public Text_t Text$slice(Text_t text, Int_t first_int, Int_t last_int)

    switch (text.tag) {
    case TEXT_SHORT_ASCII: {
-        Text_t ret = text;
-        ret.length = last - first + 1;
-        if (first > 1)
-            memcpy(ret.short_ascii, text.short_ascii + (first-1), ret.length);
+        Text_t ret = (Text_t) {
+            .tag=TEXT_SHORT_ASCII,
+            .length=last - first + 1,
+        };
+        memcpy(ret.short_ascii, text.short_ascii + (first-1), ret.length);
        return ret;
    }
    case TEXT_ASCII: {
@@ -317,7 +318,7 @@ public Text_t Text$slice(Text_t text, Int_t first_int, Int_t last_int)
            num_subtexts += 1;
        }
        if (num_subtexts == 1)
-            return Text$slice(subtexts[0], Int64_to_Int(first+1), Int64_to_Int(last+1));
+            return Text$slice(subtexts[0], Int64_to_Int(first), Int64_to_Int(last));

        Text_t ret = {
            .length=needed_len,
@@ -325,7 +326,7 @@ public Text_t Text$slice(Text_t text, Int_t first_int, Int_t last_int)
            .subtexts=GC_MALLOC(sizeof(Text_t[num_subtexts])),
        };
        for (int64_t i = 0; i < num_subtexts; i++) {
-            ret.subtexts[i] = Text$slice(subtexts[i], Int64_to_Int(first+1), Int64_to_Int(last+1));
+            ret.subtexts[i] = Text$slice(subtexts[i], Int64_to_Int(first), Int64_to_Int(last));
            first = 1;
            needed_len -= ret.subtexts[i].length;
            last = first + needed_len - 1;
@@ -410,7 +411,7 @@ static void u8_buf_append(Text_t text, char **buf, int64_t *capacity, int64_t *i
    switch (text.tag) {
    case TEXT_ASCII: case TEXT_SHORT_ASCII: {
        if (*i + text.length > (int64_t)*capacity) {
-            *capacity = *i + text.length;
+            *capacity = *i + text.length + 1;
            *buf = GC_REALLOC(*buf, *capacity);
        }

@@ -421,7 +422,7 @@ static void u8_buf_append(Text_t text, char **buf, int64_t *capacity, int64_t *i
    }
    case TEXT_GRAPHEMES: case TEXT_SHORT_GRAPHEMES: {
        const int32_t *graphemes = text.tag == TEXT_GRAPHEMES ? text.graphemes : text.short_graphemes;
-        for (int64_t g = 0; g + 1 < text.length; g++) {
+        for (int64_t g = 0; g < text.length; g++) {
            const uint32_t *codepoints = graphemes[g] < 0 ? synthetic_graphemes[-graphemes[g]-1].codepoints : (uint32_t*)&graphemes[g];
            int64_t num_codepoints = graphemes[g] < 0 ? synthetic_graphemes[-graphemes[g]-1].num_codepoints : 1;
            uint8_t u8_buf[64];
@@ -429,7 +430,7 @@ static void u8_buf_append(Text_t text, char **buf, int64_t *capacity, int64_t *i
            uint8_t *u8 = u32_to_u8(codepoints, num_codepoints, u8_buf, &u8_len);

            if (*i + (int64_t)u8_len > (int64_t)*capacity) {
-                *capacity = *i + u8_len;
+                *capacity = *i + u8_len + 1;
                *buf = GC_REALLOC(*buf, *capacity);
            }

@@ -1432,7 +1433,7 @@ public Text_t Text$format(const char *fmt, ...)
 public array_t Text$clusters(Text_t text)
 {
    array_t clusters = {.atomic=1};
-    for (int64_t i = 0; i < text.length; i++) {
+    for (int64_t i = 1; i <= text.length; i++) {
        Text_t cluster = Text$slice(text, Int64_to_Int(i), Int64_to_Int(i));
        Array$insert(&clusters, &cluster, I_small(0), sizeof(Text_t));
    }
--- a/test/text.tm
+++ b/test/text.tm
@@ -23,22 +23,22 @@ func main():
 	>> amelie:clusters()
 	= ["A", "m", "é", "l", "i", "e"] : [Text]
 	>> amelie:utf32_codepoints()
-	= [65_i32, 109_i32, 101_i32, 769_i32, 108_i32, 105_i32, 101_i32] : [Int32]
+	= [65_i32, 109_i32, 233_i32, 108_i32, 105_i32, 101_i32] : [Int32]
 	>> amelie:utf8_bytes()
-	= [65_i8, 109_i8, 101_i8, -52_i8, -127_i8, 108_i8, 105_i8, 101_i8] : [Int8]
+	= [65_i8, 109_i8, -61_i8, -87_i8, 108_i8, 105_i8, 101_i8] : [Int8]

 	>> amelie2 := "Am$(\U65\U301)lie"
 	>> amelie2:clusters()
 	= ["A", "m", "é", "l", "i", "e"] : [Text]
 	>> amelie2:utf32_codepoints()
-	= [65_i32, 109_i32, 101_i32, 769_i32, 108_i32, 105_i32, 101_i32] : [Int32]
+	= [65_i32, 109_i32, 233_i32, 108_i32, 105_i32, 101_i32] : [Int32]
 	>> amelie2:utf8_bytes()
-	= [65_i8, 109_i8, 101_i8, -52_i8, -127_i8, 108_i8, 105_i8, 101_i8] : [Int8]
+	= [65_i8, 109_i8, -61_i8, -87_i8, 108_i8, 105_i8, 101_i8] : [Int8]

 	>> amelie:codepoint_names()
-	= ["LATIN CAPITAL LETTER A", "LATIN SMALL LETTER M", "LATIN SMALL LETTER E", "COMBINING ACUTE ACCENT", "LATIN SMALL LETTER L", "LATIN SMALL LETTER I", "LATIN SMALL LETTER E"]
+	= ["LATIN CAPITAL LETTER A", "LATIN SMALL LETTER M", "LATIN SMALL LETTER E WITH ACUTE", "LATIN SMALL LETTER L", "LATIN SMALL LETTER I", "LATIN SMALL LETTER E"]
 	>> amelie2:codepoint_names()
-	= ["LATIN CAPITAL LETTER A", "LATIN SMALL LETTER M", "LATIN SMALL LETTER E", "COMBINING ACUTE ACCENT", "LATIN SMALL LETTER L", "LATIN SMALL LETTER I", "LATIN SMALL LETTER E"]
+	= ["LATIN CAPITAL LETTER A", "LATIN SMALL LETTER M", "LATIN SMALL LETTER E WITH ACUTE", "LATIN SMALL LETTER L", "LATIN SMALL LETTER I", "LATIN SMALL LETTER E"]

 	>> "Hello":replace("e", "X")
 	= "HXllo"