From 7eb1fbf5d2d632bbd98fc3fa7cff777197a63d34 Mon Sep 17 00:00:00 2001
From: Bruce Hill <bruce@bruce-hill.com>
Date: Fri, 7 Mar 2025 16:19:12 -0500
Subject: Add proper language support for case operations on text

---
 docs/text.md   | 54 ++++++++++++++++++++++++++++++++++++++++++++++++------
 environment.c  |  7 ++++---
 stdlib/bools.c | 16 ++++++++--------
 stdlib/text.c  | 24 ++++++++++++------------
 stdlib/text.h  |  8 ++++----
 test/text.tm   | 20 ++++++++++++++++++++
 6 files changed, 96 insertions(+), 33 deletions(-)

diff --git a/docs/text.md b/docs/text.md
index df759c13..cedc8d53 100644
--- a/docs/text.md
+++ b/docs/text.md
@@ -276,6 +276,7 @@ pattern documentation](patterns.md) for more details.
 - [`func by_match(text: Text, pattern: Pattern -> func(->Match?))`](#by_match)
 - [`func by_split(text: Text, pattern: Pattern = $// -> func(->Text?))`](#by_split)
 - [`func bytes(text: Text -> [Byte])`](#bytes)
+- [`func caseless_equals(a: Text, b: Text, language: Text = "C" -> Bool)`](#caseless_equals)
 - [`func codepoint_names(text: Text -> [Text])`](#codepoint_names)
 - [`func each(text: Text, pattern: Pattern, fn: func(m: Match), recursive: Bool = yes -> Int?)`](#each)
 - [`func ends_with(text: Text, suffix: Text -> Bool)`](#ends_with)
@@ -289,7 +290,7 @@ pattern documentation](patterns.md) for more details.
 - [`func has(text: Text, pattern: Pattern -> Bool)`](#has)
 - [`func join(glue: Text, pieces: [Text] -> Text)`](#join)
 - [`func split(text: Text -> [Text])`](#lines)
-- [`func lower(text: Text -> Text)`](#lower)
+- [`func lower(text: Text, language: Text = "C" -> Text)`](#lower)
 - [`func map(text: Text, pattern: Pattern, fn: func(text:Match)->Text -> Text, recursive: Bool = yes)`](#map)
 - [`func matches(text: Text, pattern: Pattern -> [Text])`](#matches)
 - [`func quoted(text: Text, color: Bool = no -> Text)`](#quoted)
@@ -300,10 +301,10 @@ pattern documentation](patterns.md) for more details.
 - [`func slice(text: Text, from: Int = 1, to: Int = -1 -> Text)`](#slice)
 - [`func split(text: Text, pattern: Pattern = "" -> [Text])`](#split)
 - [`func starts_with(text: Text, prefix: Text -> Bool)`](#starts_with)
-- [`func title(text: Text -> Text)`](#title)
+- [`func title(text: Text, language: Text = "C" -> Text)`](#title)
 - [`func to(text: Text, last: Int -> Text)`](#to)
 - [`func trim(text: Text, pattern: Pattern = $/{whitespace/, trim_left: Bool = yes, trim_right: Bool = yes -> Text)`](#trim)
-- [`func upper(text: Text -> Text)`](#upper)
+- [`func upper(text: Text, language: Text = "C" -> Text)`](#upper)
 - [`func utf32_codepoints(text: Text -> [Int32])`](#utf32_codepoints)
 
 ### `as_c_string`
@@ -451,6 +452,33 @@ An array of bytes (`[Byte]`) representing the text in UTF8 encoding.
 
 ---
 
+### `caseless_equals`
+Checks whether two texts are equal, ignoring the casing of the letters (i.e.
+case-insensitive comparison).
+
+```tomo
+func caseless_equals(a: Text, b: Text, language: Text = "C" -> Bool)
+```
+
+- `a`: The first text to compare case-insensitively.
+- `b`: The second text to compare case-insensitively.
+- `language`: The ISO 639 code of the language whose casing rules should be used.
+
+**Returns:**  
+`yes` if `a` and `b` are equal to each other, ignoring casing, otherwise `no`.
+
+**Example:**  
+```tomo
+>> "A":caseless_equals("a")
+= yes
+
+# Turkish lowercase "I" is "ı" (dotless I), not "i"
+>> "I":caseless_equals("i", language="tr_TR")
+= no
+```
+
+---
+
 ### `codepoint_names`
 Returns an array of the names of each codepoint in the text.
 
@@ -782,10 +810,11 @@ An array of substrings resulting from the split.
 Converts all characters in the text to lowercase.
 
 ```tomo
-func lower(text: Text -> Text)
+func lower(text: Text, language: Text = "C" -> Text)
 ```
 
 - `text`: The text to be converted to lowercase.
+- `language`: The ISO 639 code of the language whose casing rules should be used.
 
 **Returns:**  
 The lowercase version of the text.
@@ -794,6 +823,9 @@ The lowercase version of the text.
 ```tomo
 >> "AMÉLIE":lower()
 = "amélie"
+
+>> "I":lower(language="tr_TR")
+= "ı"
 ```
 
 ---
@@ -1107,10 +1139,11 @@ func starts_with(text: Text, prefix: Text -> Bool)
 Converts the text to title case (capitalizing the first letter of each word).
 
 ```tomo
-func title(text: Text -> Text)
+func title(text: Text, language: Text = "C" -> Text)
 ```
 
 - `text`: The text to be converted to title case.
+- `language`: The ISO 639 code of the language whose casing rules should be used.
 
 **Returns:**  
 The text in title case.
@@ -1119,6 +1152,10 @@ The text in title case.
 ```tomo
 >> "amélie":title()
 = "Amélie"
+
+# In Turkish, uppercase "i" is "İ"
+>> "i":title(language="tr_TR")
+= "İ"
 ```
 
 ---
@@ -1183,10 +1220,11 @@ The text without the trim pattern at either end.
 Converts all characters in the text to uppercase.
 
 ```tomo
-func upper(text: Text -> Text)
+func upper(text: Text, language: Text = "C" -> Text)
 ```
 
 - `text`: The text to be converted to uppercase.
+- `language`: The ISO 639 code of the language whose casing rules should be used.
 
 **Returns:**  
 The uppercase version of the text.
@@ -1195,6 +1233,10 @@ The uppercase version of the text.
 ```tomo
 >> "amélie":upper()
 = "AMÉLIE"
+
+# In Turkish, uppercase "i" is "İ"
+>> "i":upper(language="tr_TR")
+= "İ"
 ```
 
 ---
diff --git a/environment.c b/environment.c
index d37d19ef..f283e693 100644
--- a/environment.c
+++ b/environment.c
@@ -395,6 +395,7 @@ env_t *new_compilation_unit(CORD libname)
             {"by_match", "Text$by_match", "func(text:Text, pattern:Pattern -> func(->Match?))"},
             {"by_split", "Text$by_split", "func(text:Text, pattern=$Pattern'' -> func(->Text?))"},
             {"bytes", "Text$utf8_bytes", "func(text:Text -> [Byte])"},
+            {"caseless_equals", "Text$equal_ignoring_case", "func(a,b:Text, language=\"C\" -> Bool)"},
             {"codepoint_names", "Text$codepoint_names", "func(text:Text -> [Text])"},
             {"ends_with", "Text$ends_with", "func(text,suffix:Text -> Bool)"},
             {"each", "Text$each", "func(text:Text, pattern:Pattern, fn:func(match:Match), recursive=yes)"},
@@ -409,7 +410,7 @@ env_t *new_compilation_unit(CORD libname)
             {"has", "Text$has", "func(text:Text, pattern:Pattern -> Bool)"},
             {"join", "Text$join", "func(glue:Text, pieces:[Text] -> Text)"},
             {"lines", "Text$lines", "func(text:Text -> [Text])"},
-            {"lower", "Text$lower", "func(text:Text -> Text)"},
+            {"lower", "Text$lower", "func(text:Text, language=\"C\" -> Text)"},
             {"map", "Text$map", "func(text:Text, pattern:Pattern, fn:func(match:Match -> Text), recursive=yes -> Text)"},
             {"matches", "Text$matches", "func(text:Text, pattern:Pattern -> [Text]?)"},
             {"quoted", "Text$quoted", "func(text:Text, color=no -> Text)"},
@@ -420,10 +421,10 @@ env_t *new_compilation_unit(CORD libname)
             {"slice", "Text$slice", "func(text:Text, from=1, to=-1 -> Text)"},
             {"split", "Text$split", "func(text:Text, pattern=$Pattern'' -> [Text])"},
             {"starts_with", "Text$starts_with", "func(text,prefix:Text -> Bool)"},
-            {"title", "Text$title", "func(text:Text -> Text)"},
+            {"title", "Text$title", "func(text:Text, language=\"C\" -> Text)"},
             {"to", "Text$to", "func(text:Text, last:Int -> Text)"},
             {"trim", "Text$trim", "func(text:Text, pattern=$/{whitespace}/, trim_left=yes, trim_right=yes -> Text)"},
-            {"upper", "Text$upper", "func(text:Text -> Text)"},
+            {"upper", "Text$upper", "func(text:Text, language=\"C\" -> Text)"},
             {"utf32_codepoints", "Text$utf32_codepoints", "func(text:Text -> [Int32])"},
         )},
         {"Thread", THREAD_TYPE, "Thread_t", "Thread", TypedArray(ns_entry_t,
diff --git a/stdlib/bools.c b/stdlib/bools.c
index 5e0ade37..bf820664 100644
--- a/stdlib/bools.c
+++ b/stdlib/bools.c
@@ -23,15 +23,15 @@ PUREFUNC public Text_t Bool$as_text(const void *b, bool colorize, const TypeInfo
 
 PUREFUNC public OptionalBool_t Bool$parse(Text_t text)
 {
-    if (Text$equal_ignoring_case(text, Text("yes"))
-        || Text$equal_ignoring_case(text, Text("on"))
-        || Text$equal_ignoring_case(text, Text("true"))
-        || Text$equal_ignoring_case(text, Text("1"))) {
+    if (Text$equal_ignoring_case(text, Text("yes"), NONE_TEXT)
+        || Text$equal_ignoring_case(text, Text("on"), NONE_TEXT)
+        || Text$equal_ignoring_case(text, Text("true"), NONE_TEXT)
+        || Text$equal_ignoring_case(text, Text("1"), NONE_TEXT)) {
         return yes;
-    } else if (Text$equal_ignoring_case(text, Text("no"))
-        || Text$equal_ignoring_case(text, Text("off"))
-        || Text$equal_ignoring_case(text, Text("false"))
-        || Text$equal_ignoring_case(text, Text("0"))) {
+    } else if (Text$equal_ignoring_case(text, Text("no"), NONE_TEXT)
+        || Text$equal_ignoring_case(text, Text("off"), NONE_TEXT)
+        || Text$equal_ignoring_case(text, Text("false"), NONE_TEXT)
+        || Text$equal_ignoring_case(text, Text("0"), NONE_TEXT)) {
         return no;
     } else {
         return NONE_BOOL;
diff --git a/stdlib/text.c b/stdlib/text.c
index 4ee21601..c8700739 100644
--- a/stdlib/text.c
+++ b/stdlib/text.c
@@ -968,13 +968,13 @@ PUREFUNC public bool Text$equal(const void *a, const void *b, const TypeInfo_t*)
     return Text$equal_values(*(Text_t*)a, *(Text_t*)b);
 }
 
-PUREFUNC public bool Text$equal_ignoring_case(Text_t a, Text_t b)
+PUREFUNC public bool Text$equal_ignoring_case(Text_t a, Text_t b, Text_t language)
 {
     if (a.length != b.length)
         return false;
     int64_t len = a.length;
     TextIter_t a_state = NEW_TEXT_ITER_STATE(a), b_state = NEW_TEXT_ITER_STATE(b);
-    const char *language = uc_locale_language();
+    const char *uc_language = Text$as_c_string(language);
     for (int64_t i = 0; i < len; i++) {
         int32_t ai = Text$get_grapheme_fast(&a_state, i);
         int32_t bi = Text$get_grapheme_fast(&b_state, i);
@@ -986,7 +986,7 @@ PUREFUNC public bool Text$equal_ignoring_case(Text_t a, Text_t b)
             int64_t b_len = bi >= 0 ? 1 : NUM_GRAPHEME_CODEPOINTS(bi);
 
             int cmp = 0;
-            (void)u32_casecmp(a_codepoints, (size_t)a_len, b_codepoints, (size_t)b_len, language, UNINORM_NFC, &cmp);
+            (void)u32_casecmp(a_codepoints, (size_t)a_len, b_codepoints, (size_t)b_len, uc_language, UNINORM_NFC, &cmp);
             if (cmp != 0)
                 return false;
         }
@@ -994,40 +994,40 @@ PUREFUNC public bool Text$equal_ignoring_case(Text_t a, Text_t b)
     return true;
 }
 
-public Text_t Text$upper(Text_t text)
+public Text_t Text$upper(Text_t text, Text_t language)
 {
     if (text.length == 0) return text;
     Array_t codepoints = Text$utf32_codepoints(text);
-    const char *language = uc_locale_language();
+    const char *uc_language = Text$as_c_string(language);
     ucs4_t buf[128]; 
     size_t out_len = sizeof(buf)/sizeof(buf[0]);
-    ucs4_t *upper = u32_toupper(codepoints.data, (size_t)codepoints.length, language, UNINORM_NFC, buf, &out_len);
+    ucs4_t *upper = u32_toupper(codepoints.data, (size_t)codepoints.length, uc_language, UNINORM_NFC, buf, &out_len);
     Text_t ret = text_from_u32(upper, (int64_t)out_len, false);
     if (upper != buf) free(upper);
     return ret;
 }
 
-public Text_t Text$lower(Text_t text)
+public Text_t Text$lower(Text_t text, Text_t language)
 {
     if (text.length == 0) return text;
     Array_t codepoints = Text$utf32_codepoints(text);
-    const char *language = uc_locale_language();
+    const char *uc_language = Text$as_c_string(language);
     ucs4_t buf[128]; 
     size_t out_len = sizeof(buf)/sizeof(buf[0]);
-    ucs4_t *lower = u32_tolower(codepoints.data, (size_t)codepoints.length, language, UNINORM_NFC, buf, &out_len);
+    ucs4_t *lower = u32_tolower(codepoints.data, (size_t)codepoints.length, uc_language, UNINORM_NFC, buf, &out_len);
     Text_t ret = text_from_u32(lower, (int64_t)out_len, false);
     if (lower != buf) free(lower);
     return ret;
 }
 
-public Text_t Text$title(Text_t text)
+public Text_t Text$title(Text_t text, Text_t language)
 {
     if (text.length == 0) return text;
     Array_t codepoints = Text$utf32_codepoints(text);
-    const char *language = uc_locale_language();
+    const char *uc_language = Text$as_c_string(language);
     ucs4_t buf[128]; 
     size_t out_len = sizeof(buf)/sizeof(buf[0]);
-    ucs4_t *title = u32_totitle(codepoints.data, (size_t)codepoints.length, language, UNINORM_NFC, buf, &out_len);
+    ucs4_t *title = u32_totitle(codepoints.data, (size_t)codepoints.length, uc_language, UNINORM_NFC, buf, &out_len);
     Text_t ret = text_from_u32(title, (int64_t)out_len, false);
     if (title != buf) free(title);
     return ret;
diff --git a/stdlib/text.h b/stdlib/text.h
index 64cf86f5..d3aba3f3 100644
--- a/stdlib/text.h
+++ b/stdlib/text.h
@@ -45,11 +45,11 @@ PUREFUNC uint64_t Text$hash(const void *text, const TypeInfo_t*);
 PUREFUNC int32_t Text$compare(const void *va, const void *vb, const TypeInfo_t*);
 PUREFUNC bool Text$equal(const void *a, const void *b, const TypeInfo_t*);
 PUREFUNC bool Text$equal_values(Text_t a, Text_t b);
-PUREFUNC bool Text$equal_ignoring_case(Text_t a, Text_t b);
+PUREFUNC bool Text$equal_ignoring_case(Text_t a, Text_t b, Text_t language);
 PUREFUNC bool Text$is_none(const void *t, const TypeInfo_t*);
-Text_t Text$upper(Text_t text);
-Text_t Text$lower(Text_t text);
-Text_t Text$title(Text_t text);
+Text_t Text$upper(Text_t text, Text_t language);
+Text_t Text$lower(Text_t text, Text_t language);
+Text_t Text$title(Text_t text, Text_t language);
 Text_t Text$as_text(const void *text, bool colorize, const TypeInfo_t *info);
 Text_t Text$quoted(Text_t str, bool colorize);
 PUREFUNC bool Text$starts_with(Text_t text, Text_t prefix);
diff --git a/test/text.tm b/test/text.tm
index 7d6ce831..a1b36d97 100644
--- a/test/text.tm
+++ b/test/text.tm
@@ -11,6 +11,26 @@ func main():
 	>> str[1]
 	= "H"
 
+	>> "I":lower()
+	= "i"
+	>> "I":lower(language="tr_TR")
+	= "ı"
+
+	>> "i":upper()
+	= "I"
+	>> "i":upper(language="tr_TR")
+	= "İ"
+
+	>> "ian":title()
+	= "Ian"
+	>> "ian":title(language="tr_TR")
+	= "İan"
+
+	>> "I":caseless_equals("ı")
+	= no
+	>> "I":caseless_equals("ı", language="tr_TR")
+	= yes
+
 	>> str[9]
 	= "é"
 
-- 
cgit v1.2.3