From 7eb1fbf5d2d632bbd98fc3fa7cff777197a63d34 Mon Sep 17 00:00:00 2001 From: Bruce Hill Date: Fri, 7 Mar 2025 16:19:12 -0500 Subject: Add proper language support for case operations on text --- docs/text.md | 54 ++++++++++++++++++++++++++++++++++++++++++++++++------ environment.c | 7 ++++--- stdlib/bools.c | 16 ++++++++-------- stdlib/text.c | 24 ++++++++++++------------ stdlib/text.h | 8 ++++---- test/text.tm | 20 ++++++++++++++++++++ 6 files changed, 96 insertions(+), 33 deletions(-) diff --git a/docs/text.md b/docs/text.md index df759c13..cedc8d53 100644 --- a/docs/text.md +++ b/docs/text.md @@ -276,6 +276,7 @@ pattern documentation](patterns.md) for more details. - [`func by_match(text: Text, pattern: Pattern -> func(->Match?))`](#by_match) - [`func by_split(text: Text, pattern: Pattern = $// -> func(->Text?))`](#by_split) - [`func bytes(text: Text -> [Byte])`](#bytes) +- [`func caseless_equals(a: Text, b:Text, language:Text = "C" -> Bool)`](#caseless_equals) - [`func codepoint_names(text: Text -> [Text])`](#codepoint_names) - [`func each(text: Text, pattern: Pattern, fn: func(m: Match), recursive: Bool = yes -> Int?)`](#each) - [`func ends_with(text: Text, suffix: Text -> Bool)`](#ends_with) @@ -289,7 +290,7 @@ pattern documentation](patterns.md) for more details. - [`func has(text: Text, pattern: Pattern -> Bool)`](#has) - [`func join(glue: Text, pieces: [Text] -> Text)`](#join) - [`func split(text: Text -> [Text])`](#lines) -- [`func lower(text: Text -> Text)`](#lower) +- [`func lower(text: Text, language: Text = "C" -> Text)`](#lower) - [`func map(text: Text, pattern: Pattern, fn: func(text:Match)->Text -> Text, recursive: Bool = yes)`](#map) - [`func matches(text: Text, pattern: Pattern -> [Text])`](#matches) - [`func quoted(text: Text, color: Bool = no -> Text)`](#quoted) @@ -300,10 +301,10 @@ pattern documentation](patterns.md) for more details. 
- [`func slice(text: Text, from: Int = 1, to: Int = -1 -> Text)`](#slice) - [`func split(text: Text, pattern: Pattern = "" -> [Text])`](#split) - [`func starts_with(text: Text, prefix: Text -> Bool)`](#starts_with) -- [`func title(text: Text -> Text)`](#title) +- [`func title(text: Text, language: Text = "C" -> Text)`](#title) - [`func to(text: Text, last: Int -> Text)`](#to) - [`func trim(text: Text, pattern: Pattern = $/{whitespace/, trim_left: Bool = yes, trim_right: Bool = yes -> Text)`](#trim) -- [`func upper(text: Text -> Text)`](#upper) +- [`func upper(text: Text, language: Text = "C" -> Text)`](#upper) - [`func utf32_codepoints(text: Text -> [Int32])`](#utf32_codepoints) ### `as_c_string` @@ -451,6 +452,33 @@ An array of bytes (`[Byte]`) representing the text in UTF8 encoding. --- +### `caseless_equals` +Checks whether two texts are equal, ignoring the casing of the letters (i.e. +case-insensitive comparison). + +```tomo +func caseless_equals(a: Text, b:Text, language:Text = "C" -> Bool) +``` + +- `a`: The first text to compare case-insensitively. +- `b`: The second text to compare case-insensitively. +- `language`: The ISO 639 language code for which casing rules to use. + +**Returns:** +`yes` if `a` and `b` are equal to each other, ignoring casing, otherwise `no`. + +**Example:** +```tomo +>> "A":caseless_equals("a") += yes + +# Turkish lowercase "I" is "ı" (dotless I), not "i" +>> "I":caseless_equals("i", language="tr_TR") += no +``` + +--- + ### `codepoint_names` Returns an array of the names of each codepoint in the text. @@ -782,10 +810,11 @@ An array of substrings resulting from the split. Converts all characters in the text to lowercase. ```tomo -func lower(text: Text -> Text) +func lower(text: Text, language: Text = "C" -> Text) ``` - `text`: The text to be converted to lowercase. +- `language`: The ISO 639 language code for which casing rules to use. **Returns:** The lowercase version of the text. 
@@ -794,6 +823,9 @@ The lowercase version of the text. ```tomo >> "AMÉLIE":lower() = "amélie" + +>> "I":lower(language="tr_TR") += "ı" ``` --- @@ -1107,10 +1139,11 @@ func starts_with(text: Text, prefix: Text -> Bool) Converts the text to title case (capitalizing the first letter of each word). ```tomo -func title(text: Text -> Text) +func title(text: Text, language: Text = "C" -> Text) ``` - `text`: The text to be converted to title case. +- `language`: The ISO 639 language code for which casing rules to use. **Returns:** The text in title case. @@ -1119,6 +1152,10 @@ The text in title case. ```tomo >> "amélie":title() = "Amélie" + +# In Turkish, uppercase "i" is "İ" +>> "i":title(language="tr_TR") += "İ" ``` --- @@ -1183,10 +1220,11 @@ The text without the trim pattern at either end. Converts all characters in the text to uppercase. ```tomo -func upper(text: Text -> Text) +func upper(text: Text, language: Text = "C" -> Text) ``` - `text`: The text to be converted to uppercase. +- `language`: The ISO 639 language code for which casing rules to use. **Returns:** The uppercase version of the text. @@ -1195,6 +1233,10 @@ The uppercase version of the text. 
```tomo >> "amélie":upper() = "AMÉLIE" + +# In Turkish, uppercase "i" is "İ" +>> "i":upper(language="tr_TR") += "İ" ``` --- diff --git a/environment.c b/environment.c index d37d19ef..f283e693 100644 --- a/environment.c +++ b/environment.c @@ -395,6 +395,7 @@ env_t *new_compilation_unit(CORD libname) {"by_match", "Text$by_match", "func(text:Text, pattern:Pattern -> func(->Match?))"}, {"by_split", "Text$by_split", "func(text:Text, pattern=$Pattern'' -> func(->Text?))"}, {"bytes", "Text$utf8_bytes", "func(text:Text -> [Byte])"}, + {"caseless_equals", "Text$equal_ignoring_case", "func(a,b:Text, language=\"C\" -> Bool)"}, {"codepoint_names", "Text$codepoint_names", "func(text:Text -> [Text])"}, {"ends_with", "Text$ends_with", "func(text,suffix:Text -> Bool)"}, {"each", "Text$each", "func(text:Text, pattern:Pattern, fn:func(match:Match), recursive=yes)"}, @@ -409,7 +410,7 @@ env_t *new_compilation_unit(CORD libname) {"has", "Text$has", "func(text:Text, pattern:Pattern -> Bool)"}, {"join", "Text$join", "func(glue:Text, pieces:[Text] -> Text)"}, {"lines", "Text$lines", "func(text:Text -> [Text])"}, - {"lower", "Text$lower", "func(text:Text -> Text)"}, + {"lower", "Text$lower", "func(text:Text, language=\"C\" -> Text)"}, {"map", "Text$map", "func(text:Text, pattern:Pattern, fn:func(match:Match -> Text), recursive=yes -> Text)"}, {"matches", "Text$matches", "func(text:Text, pattern:Pattern -> [Text]?)"}, {"quoted", "Text$quoted", "func(text:Text, color=no -> Text)"}, @@ -420,10 +421,10 @@ env_t *new_compilation_unit(CORD libname) {"slice", "Text$slice", "func(text:Text, from=1, to=-1 -> Text)"}, {"split", "Text$split", "func(text:Text, pattern=$Pattern'' -> [Text])"}, {"starts_with", "Text$starts_with", "func(text,prefix:Text -> Bool)"}, - {"title", "Text$title", "func(text:Text -> Text)"}, + {"title", "Text$title", "func(text:Text, language=\"C\" -> Text)"}, {"to", "Text$to", "func(text:Text, last:Int -> Text)"}, {"trim", "Text$trim", "func(text:Text, 
pattern=$/{whitespace}/, trim_left=yes, trim_right=yes -> Text)"}, - {"upper", "Text$upper", "func(text:Text -> Text)"}, + {"upper", "Text$upper", "func(text:Text, language=\"C\" -> Text)"}, {"utf32_codepoints", "Text$utf32_codepoints", "func(text:Text -> [Int32])"}, )}, {"Thread", THREAD_TYPE, "Thread_t", "Thread", TypedArray(ns_entry_t, diff --git a/stdlib/bools.c b/stdlib/bools.c index 5e0ade37..bf820664 100644 --- a/stdlib/bools.c +++ b/stdlib/bools.c @@ -23,15 +23,15 @@ PUREFUNC public Text_t Bool$as_text(const void *b, bool colorize, const TypeInfo PUREFUNC public OptionalBool_t Bool$parse(Text_t text) { - if (Text$equal_ignoring_case(text, Text("yes")) - || Text$equal_ignoring_case(text, Text("on")) - || Text$equal_ignoring_case(text, Text("true")) - || Text$equal_ignoring_case(text, Text("1"))) { + if (Text$equal_ignoring_case(text, Text("yes"), NONE_TEXT) + || Text$equal_ignoring_case(text, Text("on"), NONE_TEXT) + || Text$equal_ignoring_case(text, Text("true"), NONE_TEXT) + || Text$equal_ignoring_case(text, Text("1"), NONE_TEXT)) { return yes; - } else if (Text$equal_ignoring_case(text, Text("no")) - || Text$equal_ignoring_case(text, Text("off")) - || Text$equal_ignoring_case(text, Text("false")) - || Text$equal_ignoring_case(text, Text("0"))) { + } else if (Text$equal_ignoring_case(text, Text("no"), NONE_TEXT) + || Text$equal_ignoring_case(text, Text("off"), NONE_TEXT) + || Text$equal_ignoring_case(text, Text("false"), NONE_TEXT) + || Text$equal_ignoring_case(text, Text("0"), NONE_TEXT)) { return no; } else { return NONE_BOOL; diff --git a/stdlib/text.c b/stdlib/text.c index 4ee21601..c8700739 100644 --- a/stdlib/text.c +++ b/stdlib/text.c @@ -968,13 +968,13 @@ PUREFUNC public bool Text$equal(const void *a, const void *b, const TypeInfo_t*) return Text$equal_values(*(Text_t*)a, *(Text_t*)b); } -PUREFUNC public bool Text$equal_ignoring_case(Text_t a, Text_t b) +PUREFUNC public bool Text$equal_ignoring_case(Text_t a, Text_t b, Text_t language) { if 
(a.length != b.length) return false; int64_t len = a.length; TextIter_t a_state = NEW_TEXT_ITER_STATE(a), b_state = NEW_TEXT_ITER_STATE(b); - const char *language = uc_locale_language(); + const char *uc_language = Text$as_c_string(language); for (int64_t i = 0; i < len; i++) { int32_t ai = Text$get_grapheme_fast(&a_state, i); int32_t bi = Text$get_grapheme_fast(&b_state, i); @@ -986,7 +986,7 @@ PUREFUNC public bool Text$equal_ignoring_case(Text_t a, Text_t b) int64_t b_len = bi >= 0 ? 1 : NUM_GRAPHEME_CODEPOINTS(bi); int cmp = 0; - (void)u32_casecmp(a_codepoints, (size_t)a_len, b_codepoints, (size_t)b_len, language, UNINORM_NFC, &cmp); + (void)u32_casecmp(a_codepoints, (size_t)a_len, b_codepoints, (size_t)b_len, uc_language, UNINORM_NFC, &cmp); if (cmp != 0) return false; } @@ -994,40 +994,40 @@ PUREFUNC public bool Text$equal_ignoring_case(Text_t a, Text_t b) return true; } -public Text_t Text$upper(Text_t text) +public Text_t Text$upper(Text_t text, Text_t language) { if (text.length == 0) return text; Array_t codepoints = Text$utf32_codepoints(text); - const char *language = uc_locale_language(); + const char *uc_language = Text$as_c_string(language); ucs4_t buf[128]; size_t out_len = sizeof(buf)/sizeof(buf[0]); - ucs4_t *upper = u32_toupper(codepoints.data, (size_t)codepoints.length, language, UNINORM_NFC, buf, &out_len); + ucs4_t *upper = u32_toupper(codepoints.data, (size_t)codepoints.length, uc_language, UNINORM_NFC, buf, &out_len); Text_t ret = text_from_u32(upper, (int64_t)out_len, false); if (upper != buf) free(upper); return ret; } -public Text_t Text$lower(Text_t text) +public Text_t Text$lower(Text_t text, Text_t language) { if (text.length == 0) return text; Array_t codepoints = Text$utf32_codepoints(text); - const char *language = uc_locale_language(); + const char *uc_language = Text$as_c_string(language); ucs4_t buf[128]; size_t out_len = sizeof(buf)/sizeof(buf[0]); - ucs4_t *lower = u32_tolower(codepoints.data, (size_t)codepoints.length, 
language, UNINORM_NFC, buf, &out_len); + ucs4_t *lower = u32_tolower(codepoints.data, (size_t)codepoints.length, uc_language, UNINORM_NFC, buf, &out_len); Text_t ret = text_from_u32(lower, (int64_t)out_len, false); if (lower != buf) free(lower); return ret; } -public Text_t Text$title(Text_t text) +public Text_t Text$title(Text_t text, Text_t language) { if (text.length == 0) return text; Array_t codepoints = Text$utf32_codepoints(text); - const char *language = uc_locale_language(); + const char *uc_language = Text$as_c_string(language); ucs4_t buf[128]; size_t out_len = sizeof(buf)/sizeof(buf[0]); - ucs4_t *title = u32_totitle(codepoints.data, (size_t)codepoints.length, language, UNINORM_NFC, buf, &out_len); + ucs4_t *title = u32_totitle(codepoints.data, (size_t)codepoints.length, uc_language, UNINORM_NFC, buf, &out_len); Text_t ret = text_from_u32(title, (int64_t)out_len, false); if (title != buf) free(title); return ret; diff --git a/stdlib/text.h b/stdlib/text.h index 64cf86f5..d3aba3f3 100644 --- a/stdlib/text.h +++ b/stdlib/text.h @@ -45,11 +45,11 @@ PUREFUNC uint64_t Text$hash(const void *text, const TypeInfo_t*); PUREFUNC int32_t Text$compare(const void *va, const void *vb, const TypeInfo_t*); PUREFUNC bool Text$equal(const void *a, const void *b, const TypeInfo_t*); PUREFUNC bool Text$equal_values(Text_t a, Text_t b); -PUREFUNC bool Text$equal_ignoring_case(Text_t a, Text_t b); +PUREFUNC bool Text$equal_ignoring_case(Text_t a, Text_t b, Text_t language); PUREFUNC bool Text$is_none(const void *t, const TypeInfo_t*); -Text_t Text$upper(Text_t text); -Text_t Text$lower(Text_t text); -Text_t Text$title(Text_t text); +Text_t Text$upper(Text_t text, Text_t language); +Text_t Text$lower(Text_t text, Text_t language); +Text_t Text$title(Text_t text, Text_t language); Text_t Text$as_text(const void *text, bool colorize, const TypeInfo_t *info); Text_t Text$quoted(Text_t str, bool colorize); PUREFUNC bool Text$starts_with(Text_t text, Text_t prefix); diff --git 
a/test/text.tm b/test/text.tm index 7d6ce831..a1b36d97 100644 --- a/test/text.tm +++ b/test/text.tm @@ -11,6 +11,26 @@ func main(): >> str[1] = "H" + >> "I":lower() + = "i" + >> "I":lower(language="tr_TR") + = "ı" + + >> "i":upper() + = "I" + >> "i":upper(language="tr_TR") + = "İ" + + >> "ian":title() + = "Ian" + >> "ian":title(language="tr_TR") + = "İan" + + >> "I":caseless_equals("ı") + = no + >> "I":caseless_equals("ı", language="tr_TR") + = yes + >> str[9] = "é" -- cgit v1.2.3