Add proper language support for case operations on text

This commit is contained in:
Bruce Hill 2025-03-07 16:19:12 -05:00
parent d590b9d386
commit 7eb1fbf5d2
6 changed files with 96 additions and 33 deletions

View File

@ -276,6 +276,7 @@ pattern documentation](patterns.md) for more details.
- [`func by_match(text: Text, pattern: Pattern -> func(->Match?))`](#by_match)
- [`func by_split(text: Text, pattern: Pattern = $// -> func(->Text?))`](#by_split)
- [`func bytes(text: Text -> [Byte])`](#bytes)
- [`func caseless_equals(a: Text, b: Text, language: Text = "C" -> Bool)`](#caseless_equals)
- [`func codepoint_names(text: Text -> [Text])`](#codepoint_names)
- [`func each(text: Text, pattern: Pattern, fn: func(m: Match), recursive: Bool = yes -> Int?)`](#each)
- [`func ends_with(text: Text, suffix: Text -> Bool)`](#ends_with)
@ -289,7 +290,7 @@ pattern documentation](patterns.md) for more details.
- [`func has(text: Text, pattern: Pattern -> Bool)`](#has)
- [`func join(glue: Text, pieces: [Text] -> Text)`](#join)
- [`func lines(text: Text -> [Text])`](#lines)
- [`func lower(text: Text -> Text)`](#lower)
- [`func lower(text: Text, language: Text = "C" -> Text)`](#lower)
- [`func map(text: Text, pattern: Pattern, fn: func(text: Match -> Text), recursive: Bool = yes -> Text)`](#map)
- [`func matches(text: Text, pattern: Pattern -> [Text]?)`](#matches)
- [`func quoted(text: Text, color: Bool = no -> Text)`](#quoted)
@ -300,10 +301,10 @@ pattern documentation](patterns.md) for more details.
- [`func slice(text: Text, from: Int = 1, to: Int = -1 -> Text)`](#slice)
- [`func split(text: Text, pattern: Pattern = "" -> [Text])`](#split)
- [`func starts_with(text: Text, prefix: Text -> Bool)`](#starts_with)
- [`func title(text: Text -> Text)`](#title)
- [`func title(text: Text, language: Text = "C" -> Text)`](#title)
- [`func to(text: Text, last: Int -> Text)`](#to)
- [`func trim(text: Text, pattern: Pattern = $/{whitespace}/, trim_left: Bool = yes, trim_right: Bool = yes -> Text)`](#trim)
- [`func upper(text: Text -> Text)`](#upper)
- [`func upper(text: Text, language: Text = "C" -> Text)`](#upper)
- [`func utf32_codepoints(text: Text -> [Int32])`](#utf32_codepoints)
### `as_c_string`
@ -451,6 +452,33 @@ An array of bytes (`[Byte]`) representing the text in UTF8 encoding.
---
### `caseless_equals`
Checks whether two texts are equal, ignoring the casing of the letters (i.e.
case-insensitive comparison).
```tomo
func caseless_equals(a: Text, b: Text, language: Text = "C" -> Bool)
```
- `a`: The first text to compare case-insensitively.
- `b`: The second text to compare case-insensitively.
- `language`: The ISO 639 language code for which casing rules to use.
**Returns:**
`yes` if `a` and `b` are equal to each other, ignoring casing, otherwise `no`.
**Example:**
```tomo
>> "A":caseless_equals("a")
= yes
# Turkish lowercase "I" is "ı" (dotless I), not "i"
>> "I":caseless_equals("i", language="tr_TR")
= no
```
---
### `codepoint_names`
Returns an array of the names of each codepoint in the text.
@ -782,10 +810,11 @@ An array of substrings resulting from the split.
Converts all characters in the text to lowercase.
```tomo
func lower(text: Text -> Text)
func lower(text: Text, language: Text = "C" -> Text)
```
- `text`: The text to be converted to lowercase.
- `language`: The ISO 639 language code for which casing rules to use.
**Returns:**
The lowercase version of the text.
@ -794,6 +823,9 @@ The lowercase version of the text.
```tomo
>> "AMÉLIE":lower()
= "amélie"
>> "I":lower(language="tr_TR")
= "ı"
```
---
@ -1107,10 +1139,11 @@ func starts_with(text: Text, prefix: Text -> Bool)
Converts the text to title case (capitalizing the first letter of each word).
```tomo
func title(text: Text -> Text)
func title(text: Text, language: Text = "C" -> Text)
```
- `text`: The text to be converted to title case.
- `language`: The ISO 639 language code for which casing rules to use.
**Returns:**
The text in title case.
@ -1119,6 +1152,10 @@ The text in title case.
```tomo
>> "amélie":title()
= "Amélie"
# In Turkish, uppercase "i" is "İ"
>> "i":title(language="tr_TR")
= "İ"
```
---
@ -1183,10 +1220,11 @@ The text without the trim pattern at either end.
Converts all characters in the text to uppercase.
```tomo
func upper(text: Text -> Text)
func upper(text: Text, language: Text = "C" -> Text)
```
- `text`: The text to be converted to uppercase.
- `language`: The ISO 639 language code for which casing rules to use.
**Returns:**
The uppercase version of the text.
@ -1195,6 +1233,10 @@ The uppercase version of the text.
```tomo
>> "amélie":upper()
= "AMÉLIE"
# In Turkish, uppercase "i" is "İ"
>> "i":upper(language="tr_TR")
= "İ"
```
---

View File

@ -395,6 +395,7 @@ env_t *new_compilation_unit(CORD libname)
{"by_match", "Text$by_match", "func(text:Text, pattern:Pattern -> func(->Match?))"},
{"by_split", "Text$by_split", "func(text:Text, pattern=$Pattern'' -> func(->Text?))"},
{"bytes", "Text$utf8_bytes", "func(text:Text -> [Byte])"},
{"caseless_equals", "Text$equal_ignoring_case", "func(a,b:Text, language=\"C\" -> Bool)"},
{"codepoint_names", "Text$codepoint_names", "func(text:Text -> [Text])"},
{"ends_with", "Text$ends_with", "func(text,suffix:Text -> Bool)"},
{"each", "Text$each", "func(text:Text, pattern:Pattern, fn:func(match:Match), recursive=yes)"},
@ -409,7 +410,7 @@ env_t *new_compilation_unit(CORD libname)
{"has", "Text$has", "func(text:Text, pattern:Pattern -> Bool)"},
{"join", "Text$join", "func(glue:Text, pieces:[Text] -> Text)"},
{"lines", "Text$lines", "func(text:Text -> [Text])"},
{"lower", "Text$lower", "func(text:Text -> Text)"},
{"lower", "Text$lower", "func(text:Text, language=\"C\" -> Text)"},
{"map", "Text$map", "func(text:Text, pattern:Pattern, fn:func(match:Match -> Text), recursive=yes -> Text)"},
{"matches", "Text$matches", "func(text:Text, pattern:Pattern -> [Text]?)"},
{"quoted", "Text$quoted", "func(text:Text, color=no -> Text)"},
@ -420,10 +421,10 @@ env_t *new_compilation_unit(CORD libname)
{"slice", "Text$slice", "func(text:Text, from=1, to=-1 -> Text)"},
{"split", "Text$split", "func(text:Text, pattern=$Pattern'' -> [Text])"},
{"starts_with", "Text$starts_with", "func(text,prefix:Text -> Bool)"},
{"title", "Text$title", "func(text:Text -> Text)"},
{"title", "Text$title", "func(text:Text, language=\"C\" -> Text)"},
{"to", "Text$to", "func(text:Text, last:Int -> Text)"},
{"trim", "Text$trim", "func(text:Text, pattern=$/{whitespace}/, trim_left=yes, trim_right=yes -> Text)"},
{"upper", "Text$upper", "func(text:Text -> Text)"},
{"upper", "Text$upper", "func(text:Text, language=\"C\" -> Text)"},
{"utf32_codepoints", "Text$utf32_codepoints", "func(text:Text -> [Int32])"},
)},
{"Thread", THREAD_TYPE, "Thread_t", "Thread", TypedArray(ns_entry_t,

View File

@ -23,15 +23,15 @@ PUREFUNC public Text_t Bool$as_text(const void *b, bool colorize, const TypeInfo
PUREFUNC public OptionalBool_t Bool$parse(Text_t text)
{
if (Text$equal_ignoring_case(text, Text("yes"))
|| Text$equal_ignoring_case(text, Text("on"))
|| Text$equal_ignoring_case(text, Text("true"))
|| Text$equal_ignoring_case(text, Text("1"))) {
if (Text$equal_ignoring_case(text, Text("yes"), NONE_TEXT)
|| Text$equal_ignoring_case(text, Text("on"), NONE_TEXT)
|| Text$equal_ignoring_case(text, Text("true"), NONE_TEXT)
|| Text$equal_ignoring_case(text, Text("1"), NONE_TEXT)) {
return yes;
} else if (Text$equal_ignoring_case(text, Text("no"))
|| Text$equal_ignoring_case(text, Text("off"))
|| Text$equal_ignoring_case(text, Text("false"))
|| Text$equal_ignoring_case(text, Text("0"))) {
} else if (Text$equal_ignoring_case(text, Text("no"), NONE_TEXT)
|| Text$equal_ignoring_case(text, Text("off"), NONE_TEXT)
|| Text$equal_ignoring_case(text, Text("false"), NONE_TEXT)
|| Text$equal_ignoring_case(text, Text("0"), NONE_TEXT)) {
return no;
} else {
return NONE_BOOL;

View File

@ -968,13 +968,13 @@ PUREFUNC public bool Text$equal(const void *a, const void *b, const TypeInfo_t*)
return Text$equal_values(*(Text_t*)a, *(Text_t*)b);
}
PUREFUNC public bool Text$equal_ignoring_case(Text_t a, Text_t b)
PUREFUNC public bool Text$equal_ignoring_case(Text_t a, Text_t b, Text_t language)
{
if (a.length != b.length)
return false;
int64_t len = a.length;
TextIter_t a_state = NEW_TEXT_ITER_STATE(a), b_state = NEW_TEXT_ITER_STATE(b);
const char *language = uc_locale_language();
const char *uc_language = Text$as_c_string(language);
for (int64_t i = 0; i < len; i++) {
int32_t ai = Text$get_grapheme_fast(&a_state, i);
int32_t bi = Text$get_grapheme_fast(&b_state, i);
@ -986,7 +986,7 @@ PUREFUNC public bool Text$equal_ignoring_case(Text_t a, Text_t b)
int64_t b_len = bi >= 0 ? 1 : NUM_GRAPHEME_CODEPOINTS(bi);
int cmp = 0;
(void)u32_casecmp(a_codepoints, (size_t)a_len, b_codepoints, (size_t)b_len, language, UNINORM_NFC, &cmp);
(void)u32_casecmp(a_codepoints, (size_t)a_len, b_codepoints, (size_t)b_len, uc_language, UNINORM_NFC, &cmp);
if (cmp != 0)
return false;
}
@ -994,40 +994,40 @@ PUREFUNC public bool Text$equal_ignoring_case(Text_t a, Text_t b)
return true;
}
public Text_t Text$upper(Text_t text)
public Text_t Text$upper(Text_t text, Text_t language)
{
if (text.length == 0) return text;
Array_t codepoints = Text$utf32_codepoints(text);
const char *language = uc_locale_language();
const char *uc_language = Text$as_c_string(language);
ucs4_t buf[128];
size_t out_len = sizeof(buf)/sizeof(buf[0]);
ucs4_t *upper = u32_toupper(codepoints.data, (size_t)codepoints.length, language, UNINORM_NFC, buf, &out_len);
ucs4_t *upper = u32_toupper(codepoints.data, (size_t)codepoints.length, uc_language, UNINORM_NFC, buf, &out_len);
Text_t ret = text_from_u32(upper, (int64_t)out_len, false);
if (upper != buf) free(upper);
return ret;
}
public Text_t Text$lower(Text_t text)
public Text_t Text$lower(Text_t text, Text_t language)
{
if (text.length == 0) return text;
Array_t codepoints = Text$utf32_codepoints(text);
const char *language = uc_locale_language();
const char *uc_language = Text$as_c_string(language);
ucs4_t buf[128];
size_t out_len = sizeof(buf)/sizeof(buf[0]);
ucs4_t *lower = u32_tolower(codepoints.data, (size_t)codepoints.length, language, UNINORM_NFC, buf, &out_len);
ucs4_t *lower = u32_tolower(codepoints.data, (size_t)codepoints.length, uc_language, UNINORM_NFC, buf, &out_len);
Text_t ret = text_from_u32(lower, (int64_t)out_len, false);
if (lower != buf) free(lower);
return ret;
}
public Text_t Text$title(Text_t text)
public Text_t Text$title(Text_t text, Text_t language)
{
if (text.length == 0) return text;
Array_t codepoints = Text$utf32_codepoints(text);
const char *language = uc_locale_language();
const char *uc_language = Text$as_c_string(language);
ucs4_t buf[128];
size_t out_len = sizeof(buf)/sizeof(buf[0]);
ucs4_t *title = u32_totitle(codepoints.data, (size_t)codepoints.length, language, UNINORM_NFC, buf, &out_len);
ucs4_t *title = u32_totitle(codepoints.data, (size_t)codepoints.length, uc_language, UNINORM_NFC, buf, &out_len);
Text_t ret = text_from_u32(title, (int64_t)out_len, false);
if (title != buf) free(title);
return ret;

View File

@ -45,11 +45,11 @@ PUREFUNC uint64_t Text$hash(const void *text, const TypeInfo_t*);
PUREFUNC int32_t Text$compare(const void *va, const void *vb, const TypeInfo_t*);
PUREFUNC bool Text$equal(const void *a, const void *b, const TypeInfo_t*);
PUREFUNC bool Text$equal_values(Text_t a, Text_t b);
PUREFUNC bool Text$equal_ignoring_case(Text_t a, Text_t b);
PUREFUNC bool Text$equal_ignoring_case(Text_t a, Text_t b, Text_t language);
PUREFUNC bool Text$is_none(const void *t, const TypeInfo_t*);
Text_t Text$upper(Text_t text);
Text_t Text$lower(Text_t text);
Text_t Text$title(Text_t text);
Text_t Text$upper(Text_t text, Text_t language);
Text_t Text$lower(Text_t text, Text_t language);
Text_t Text$title(Text_t text, Text_t language);
Text_t Text$as_text(const void *text, bool colorize, const TypeInfo_t *info);
Text_t Text$quoted(Text_t str, bool colorize);
PUREFUNC bool Text$starts_with(Text_t text, Text_t prefix);

View File

@ -11,6 +11,26 @@ func main():
>> str[1]
= "H"
>> "I":lower()
= "i"
>> "I":lower(language="tr_TR")
= "ı"
>> "i":upper()
= "I"
>> "i":upper(language="tr_TR")
= "İ"
>> "ian":title()
= "Ian"
>> "ian":title(language="tr_TR")
= "İan"
>> "I":caseless_equals("ı")
= no
>> "I":caseless_equals("ı", language="tr_TR")
= yes
>> str[9]
= "é"