diff options
| author | Bruce Hill <bruce@bruce-hill.com> | 2024-11-19 13:27:27 -0500 |
|---|---|---|
| committer | Bruce Hill <bruce@bruce-hill.com> | 2024-11-19 13:27:27 -0500 |
| commit | ccada385c4fdc2dc320b0ab468c7413ff27bcc7d (patch) | |
| tree | a9a70ea9b014212952c8d61a0bb5a0ff58a45f88 | |
| parent | e97c3850b817f6bda6f7ea3fff5c345c5f48bcd8 (diff) | |
Add Text.at(i) for getting a single cluster
| -rw-r--r-- | docs/text.md | 29 | ||||
| -rw-r--r-- | environment.c | 1 | ||||
| -rw-r--r-- | stdlib/text.c | 52 | ||||
| -rw-r--r-- | stdlib/text.h | 1 |
4 files changed, 83 insertions, 0 deletions
````diff
diff --git a/docs/text.md b/docs/text.md
index 6278da2b..d0179bc4 100644
--- a/docs/text.md
+++ b/docs/text.md
@@ -426,6 +426,35 @@ A C-style string (`CString`) representing the text.
 
 ---
 
+## `at`
+
+**Description:**
+Get the graphical cluster at a given index. This is similar to `str[i]` with
+ASCII text, but has more correct behavior for unicode text.
+
+**Signature:**
+```tomo
+func at(text: Text, index: Int -> Text)
+```
+
+**Parameters:**
+
+- `text`: The text from which to get a cluster.
+- `index`: The index of the graphical cluster (1-indexed).
+
+**Returns:**
+A `Text` with the single graphical cluster at the given index. Note: negative
+indices are counted from the back of the text, so `-1` means the last cluster,
+`-2` means the second-to-last, and so on.
+
+**Example:**
+```tomo
+>> "Amélie":at(3)
+= "é"
+```
+
+---
+
 ## `utf8_bytes`
 
 **Description:**
diff --git a/environment.c b/environment.c
index 3781b4d4..3ded0af3 100644
--- a/environment.c
+++ b/environment.c
@@ -385,6 +385,7 @@ env_t *new_compilation_unit(CORD libname)
         )},
         {"Text", TEXT_TYPE, "Text_t", "Text$info", TypedArray(ns_entry_t,
             {"as_c_string", "Text$as_c_string", "func(text:Text -> CString)"},
+            {"at", "Text$cluster", "func(text:Text, index:Int -> Text)"},
             {"codepoint_names", "Text$codepoint_names", "func(text:Text -> [Text])"},
             {"ends_with", "Text$ends_with", "func(text,suffix:Text -> Bool)"},
             {"each", "Text$each", "func(text:Text, pattern:Pattern, fn:func(match:Match))"},
diff --git a/stdlib/text.c b/stdlib/text.c
index 44179fa7..69e54ff6 100644
--- a/stdlib/text.c
+++ b/stdlib/text.c
@@ -563,6 +563,58 @@ public Text_t Text$slice(Text_t text, Int_t first_int, Int_t last_int)
     }
 }
 
+public Text_t Text$cluster(Text_t text, Int_t index_int)
+{
+    int64_t index = Int_to_Int64(index_int, false);
+    if (index == 0) fail("Invalid index: 0");
+
+    if (index < 0) index = text.length + index + 1;
+
+    if (index > text.length || index < 1)
+        fail("Invalid index: %ld is beyond the length of the text (length = %ld)",
+             Int_to_Int64(index_int, false), text.length);
+
+    switch (text.tag) {
+    case TEXT_SHORT_ASCII: {
+        return (Text_t) {
+            .tag=TEXT_SHORT_ASCII,
+            .length=1,
+            .short_ascii={text.short_ascii[index-1]},
+        };
+    }
+    case TEXT_ASCII: {
+        return (Text_t) {
+            .tag=TEXT_SHORT_ASCII,
+            .length=1,
+            .short_ascii={text.ascii[index-1]},
+        };
+    }
+    case TEXT_SHORT_GRAPHEMES: {
+        return (Text_t) {
+            .tag=TEXT_SHORT_GRAPHEMES,
+            .length=1,
+            .short_graphemes={text.short_graphemes[index-1]},
+        };
+    }
+    case TEXT_GRAPHEMES: {
+        return (Text_t) {
+            .tag=TEXT_SHORT_GRAPHEMES,
+            .length=1,
+            .short_graphemes={text.graphemes[index-1]},
+        };
+    }
+    case TEXT_SUBTEXT: {
+        Text_t *subtext = text.subtexts;
+        while (index > subtext[0].length) {
+            index -= subtext[0].length;
+            ++subtext;
+        }
+        return Text$cluster(*subtext, I(index));
+    }
+    default: errx(1, "Invalid tag");
+    }
+}
+
 Text_t text_from_u32(ucs4_t *codepoints, int64_t num_codepoints, bool normalize)
 {
     // Normalization is apparently guaranteed to never exceed 3x in the input length
diff --git a/stdlib/text.h b/stdlib/text.h
index 45aa00ca..4f23834f 100644
--- a/stdlib/text.h
+++ b/stdlib/text.h
@@ -28,6 +28,7 @@ Text_t Text$_concat(int n, Text_t items[n]);
 #define Text$concat(...) Text$_concat(sizeof((Text_t[]){__VA_ARGS__})/sizeof(Text_t), (Text_t[]){__VA_ARGS__})
 #define Texts(...) Text$concat(__VA_ARGS__)
 Text_t Text$slice(Text_t text, Int_t first_int, Int_t last_int);
+Text_t Text$cluster(Text_t text, Int_t index_int);
 OptionalText_t Text$from_str(const char *str);
 OptionalText_t Text$from_strn(const char *str, size_t len);
 PUREFUNC uint64_t Text$hash(Text_t *text);
````
