Add Text.at(i) for getting a single cluster
This commit is contained in:
parent
e97c3850b8
commit
ccada385c4
29
docs/text.md
29
docs/text.md
@ -426,6 +426,35 @@ A C-style string (`CString`) representing the text.
|
||||
|
||||
---
|
||||
|
||||
## `at`
|
||||
|
||||
**Description:**
|
||||
Get the grapheme cluster at a given index. This is similar to `str[i]` with
|
||||
ASCII text, but behaves correctly for Unicode text.
|
||||
|
||||
**Signature:**
|
||||
```tomo
|
||||
func at(text: Text, index: Int -> Text)
|
||||
```
|
||||
|
||||
**Parameters:**
|
||||
|
||||
- `text`: The text from which to get a cluster.
|
||||
- `index`: The index of the grapheme cluster (1-indexed).
|
||||
|
||||
**Returns:**
|
||||
A `Text` with the single grapheme cluster at the given index. Note: negative
|
||||
indices are counted from the back of the text, so `-1` means the last cluster,
|
||||
`-2` means the second-to-last, and so on. An index of `0`, or an index beyond
the length of the text, is an error.
|
||||
|
||||
**Example:**
|
||||
```tomo
|
||||
>> "Amélie":at(3)
|
||||
= "é"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## `utf8_bytes`
|
||||
|
||||
**Description:**
|
||||
|
@ -385,6 +385,7 @@ env_t *new_compilation_unit(CORD libname)
|
||||
)},
|
||||
{"Text", TEXT_TYPE, "Text_t", "Text$info", TypedArray(ns_entry_t,
|
||||
{"as_c_string", "Text$as_c_string", "func(text:Text -> CString)"},
|
||||
{"at", "Text$cluster", "func(text:Text, index:Int -> Text)"},
|
||||
{"codepoint_names", "Text$codepoint_names", "func(text:Text -> [Text])"},
|
||||
{"ends_with", "Text$ends_with", "func(text,suffix:Text -> Bool)"},
|
||||
{"each", "Text$each", "func(text:Text, pattern:Pattern, fn:func(match:Match))"},
|
||||
|
@ -563,6 +563,58 @@ public Text_t Text$slice(Text_t text, Int_t first_int, Int_t last_int)
|
||||
}
|
||||
}
|
||||
|
||||
// Return a one-cluster Text holding the cluster found at a 1-based position
// in `text`.
//
//   text      - the text to index into.
//   index_int - 1-based position; a negative value counts back from the end
//               of the text (-1 is the last cluster).
//
// Calls fail() when the index is 0 or, after normalising a negative value,
// lies outside 1..text.length. Calls errx() if the text carries a tag this
// function does not handle.
public Text_t Text$cluster(Text_t text, Int_t index_int)
{
    int64_t requested = Int_to_Int64(index_int, false);
    if (requested == 0)
        fail("Invalid index: 0");

    // Translate a from-the-end position into an ordinary 1-based position.
    int64_t position = (requested < 0) ? text.length + requested + 1 : requested;

    // The message reports the index exactly as the caller supplied it.
    if (position < 1 || position > text.length)
        fail("Invalid index: %ld is beyond the length of the text (length = %ld)",
             requested, text.length);

    switch (text.tag) {
    case TEXT_SHORT_ASCII:
        return (Text_t){
            .tag = TEXT_SHORT_ASCII,
            .length = 1,
            .short_ascii = {text.short_ascii[position - 1]},
        };
    case TEXT_ASCII:
        // The single character is returned in the inline short-ASCII form.
        return (Text_t){
            .tag = TEXT_SHORT_ASCII,
            .length = 1,
            .short_ascii = {text.ascii[position - 1]},
        };
    case TEXT_SHORT_GRAPHEMES:
        return (Text_t){
            .tag = TEXT_SHORT_GRAPHEMES,
            .length = 1,
            .short_graphemes = {text.short_graphemes[position - 1]},
        };
    case TEXT_GRAPHEMES:
        // The single cluster is returned in the inline short-graphemes form.
        return (Text_t){
            .tag = TEXT_SHORT_GRAPHEMES,
            .length = 1,
            .short_graphemes = {text.graphemes[position - 1]},
        };
    case TEXT_SUBTEXT: {
        // Step over whole pieces until the position falls inside one, then
        // look the cluster up within that piece.
        Text_t *piece = text.subtexts;
        for (; position > piece->length; ++piece)
            position -= piece->length;
        return Text$cluster(*piece, I(position));
    }
    default:
        errx(1, "Invalid tag");
    }
}
|
||||
|
||||
Text_t text_from_u32(ucs4_t *codepoints, int64_t num_codepoints, bool normalize)
|
||||
{
|
||||
// Normalization is apparently guaranteed to never exceed 3x in the input length
|
||||
|
@ -28,6 +28,7 @@ Text_t Text$_concat(int n, Text_t items[n]);
|
||||
#define Text$concat(...) Text$_concat(sizeof((Text_t[]){__VA_ARGS__})/sizeof(Text_t), (Text_t[]){__VA_ARGS__})
|
||||
#define Texts(...) Text$concat(__VA_ARGS__)
|
||||
Text_t Text$slice(Text_t text, Int_t first_int, Int_t last_int);
|
||||
Text_t Text$cluster(Text_t text, Int_t index_int);
|
||||
OptionalText_t Text$from_str(const char *str);
|
||||
OptionalText_t Text$from_strn(const char *str, size_t len);
|
||||
PUREFUNC uint64_t Text$hash(Text_t *text);
|
||||
|
Loading…
Reference in New Issue
Block a user