Add Text.at(i) for getting a single cluster

author: Bruce Hill <bruce@bruce-hill.com> 2024-11-19 13:27:27 -0500
committer: Bruce Hill <bruce@bruce-hill.com> 2024-11-19 13:27:27 -0500
commit: ccada385c4fdc2dc320b0ab468c7413ff27bcc7d (patch)
tree: a9a70ea9b014212952c8d61a0bb5a0ff58a45f88
parent: e97c3850b817f6bda6f7ea3fff5c345c5f48bcd8 (diff)
4 files changed, 83 insertions, 0 deletions
diff --git a/docs/text.md b/docs/text.md
index 6278da2b..d0179bc4 100644
--- a/docs/text.md
+++ b/docs/text.md
@@ -426,6 +426,35 @@ A C-style string (`CString`) representing the text.
 
 ---
 
+## `at`
+
+**Description:**  
+Get the grapheme cluster at a given index. This is similar to `str[i]` with
+ASCII text, but behaves correctly for Unicode text.
+
+**Signature:**  
+```tomo
+func at(text: Text, index: Int -> Text)
+```
+
+**Parameters:**
+
+- `text`: The text from which to get a cluster.
+- `index`: The index of the grapheme cluster (1-indexed).
+
+**Returns:**  
+A `Text` with the single grapheme cluster at the given index. Negative indices
+count from the back of the text, so `-1` means the last cluster, `-2` means the
+second-to-last, and so on. An index of `0` or beyond the text's length fails.
+
+**Example:**  
+```tomo
+>> "Amélie":at(3)
+= "é"
+```
+
+---
+
 ## `utf8_bytes`
 
 **Description:**  
diff --git a/environment.c b/environment.c
index 3781b4d4..3ded0af3 100644
--- a/environment.c
+++ b/environment.c
@@ -385,6 +385,7 @@ env_t *new_compilation_unit(CORD libname)
         )},
         {"Text", TEXT_TYPE, "Text_t", "Text$info", TypedArray(ns_entry_t,
             {"as_c_string", "Text$as_c_string", "func(text:Text -> CString)"},
+            {"at", "Text$cluster", "func(text:Text, index:Int -> Text)"},
             {"codepoint_names", "Text$codepoint_names", "func(text:Text -> [Text])"},
             {"ends_with", "Text$ends_with", "func(text,suffix:Text -> Bool)"},
             {"each", "Text$each", "func(text:Text, pattern:Pattern, fn:func(match:Match))"},
diff --git a/stdlib/text.c b/stdlib/text.c
index 44179fa7..69e54ff6 100644
--- a/stdlib/text.c
+++ b/stdlib/text.c
@@ -563,6 +563,58 @@ public Text_t Text$slice(Text_t text, Int_t first_int, Int_t last_int)
     }
 }
 
+public Text_t Text$cluster(Text_t text, Int_t index_int) // Return a length-1 Text holding the cluster at 1-based position `index_int`.
+{ // Negative positions count from the end of `text`; position 0 and out-of-range positions call fail().
+    int64_t index = Int_to_Int64(index_int, false); // NOTE(review): meaning of the `false` flag is not visible here - confirm
+    if (index == 0) fail("Invalid index: 0"); // positions are 1-based, so 0 is never valid
+
+    if (index < 0) index = text.length + index + 1; // map a negative position to its 1-based equivalent (-1 -> text.length)
+
+    if (index > text.length || index < 1) // also catches negative positions that reach before the first cluster
+        fail("Invalid index: %ld is beyond the length of the text (length = %ld)",
+             Int_to_Int64(index_int, false), text.length); // reports the caller's original index, not the adjusted one
+
+    switch (text.tag) { // the tag selects which representation field of `text` is read below
+    case TEXT_SHORT_ASCII: {
+        return (Text_t) {
+            .tag=TEXT_SHORT_ASCII,
+            .length=1,
+            .short_ascii={text.short_ascii[index-1]}, // index-1: 1-based position -> 0-based array offset
+        };
+    }
+    case TEXT_ASCII: {
+        return (Text_t) {
+            .tag=TEXT_SHORT_ASCII, // the single character is returned in the short ASCII form
+            .length=1,
+            .short_ascii={text.ascii[index-1]},
+        };
+    }
+    case TEXT_SHORT_GRAPHEMES: {
+        return (Text_t) {
+            .tag=TEXT_SHORT_GRAPHEMES,
+            .length=1,
+            .short_graphemes={text.short_graphemes[index-1]},
+        };
+    }
+    case TEXT_GRAPHEMES: {
+        return (Text_t) {
+            .tag=TEXT_SHORT_GRAPHEMES, // the single cluster is returned in the short graphemes form
+            .length=1,
+            .short_graphemes={text.graphemes[index-1]},
+        };
+    }
+    case TEXT_SUBTEXT: {
+        Text_t *subtext = text.subtexts; // cursor over the pieces that make up this text
+        while (index > subtext[0].length) { // NOTE(review): presumably piece lengths sum to text.length, so this stops in bounds - confirm
+            index -= subtext[0].length; // make the position relative to the next piece
+            ++subtext;
+        }
+        return Text$cluster(*subtext, I(index)); // recurse into the piece; I() presumably converts back to Int_t - confirm
+    }
+    default: errx(1, "Invalid tag"); // unrecognized tag: print the message and exit with status 1
+    }
+}
+
 Text_t text_from_u32(ucs4_t *codepoints, int64_t num_codepoints, bool normalize)
 {
     // Normalization is apparently guaranteed to never exceed 3x in the input length
diff --git a/stdlib/text.h b/stdlib/text.h
index 45aa00ca..4f23834f 100644
--- a/stdlib/text.h
+++ b/stdlib/text.h
@@ -28,6 +28,7 @@ Text_t Text$_concat(int n, Text_t items[n]);
 #define Text$concat(...) Text$_concat(sizeof((Text_t[]){__VA_ARGS__})/sizeof(Text_t), (Text_t[]){__VA_ARGS__})
 #define Texts(...) Text$concat(__VA_ARGS__)
 Text_t Text$slice(Text_t text, Int_t first_int, Int_t last_int);
+Text_t Text$cluster(Text_t text, Int_t index_int);
 OptionalText_t Text$from_str(const char *str);
 OptionalText_t Text$from_strn(const char *str, size_t len);
 PUREFUNC uint64_t Text$hash(Text_t *text);
author	Bruce Hill <bruce@bruce-hill.com>	2024-11-19 13:27:27 -0500
committer	Bruce Hill <bruce@bruce-hill.com>	2024-11-19 13:27:27 -0500
commit	ccada385c4fdc2dc320b0ab468c7413ff27bcc7d (patch)
tree	a9a70ea9b014212952c8d61a0bb5a0ff58a45f88
parent	e97c3850b817f6bda6f7ea3fff5c345c5f48bcd8 (diff)