Add proper language support for case operations on text
This commit is contained in:
parent
d590b9d386
commit
7eb1fbf5d2
54
docs/text.md
54
docs/text.md
@ -276,6 +276,7 @@ pattern documentation](patterns.md) for more details.
|
||||
- [`func by_match(text: Text, pattern: Pattern -> func(->Match?))`](#by_match)
|
||||
- [`func by_split(text: Text, pattern: Pattern = $// -> func(->Text?))`](#by_split)
|
||||
- [`func bytes(text: Text -> [Byte])`](#bytes)
|
||||
- [`func caseless_equals(a: Text, b:Text, language:Text = "C" -> Bool)`](#caseless_equals)
|
||||
- [`func codepoint_names(text: Text -> [Text])`](#codepoint_names)
|
||||
- [`func each(text: Text, pattern: Pattern, fn: func(m: Match), recursive: Bool = yes -> Int?)`](#each)
|
||||
- [`func ends_with(text: Text, suffix: Text -> Bool)`](#ends_with)
|
||||
@ -289,7 +290,7 @@ pattern documentation](patterns.md) for more details.
|
||||
- [`func has(text: Text, pattern: Pattern -> Bool)`](#has)
|
||||
- [`func join(glue: Text, pieces: [Text] -> Text)`](#join)
|
||||
- [`func lines(text: Text -> [Text])`](#lines)
|
||||
- [`func lower(text: Text -> Text)`](#lower)
|
||||
- [`func lower(text: Text, language: Text = "C" -> Text)`](#lower)
|
||||
- [`func map(text: Text, pattern: Pattern, fn: func(text:Match -> Text), recursive: Bool = yes -> Text)`](#map)
|
||||
- [`func matches(text: Text, pattern: Pattern -> [Text]?)`](#matches)
|
||||
- [`func quoted(text: Text, color: Bool = no -> Text)`](#quoted)
|
||||
@ -300,10 +301,10 @@ pattern documentation](patterns.md) for more details.
|
||||
- [`func slice(text: Text, from: Int = 1, to: Int = -1 -> Text)`](#slice)
|
||||
- [`func split(text: Text, pattern: Pattern = "" -> [Text])`](#split)
|
||||
- [`func starts_with(text: Text, prefix: Text -> Bool)`](#starts_with)
|
||||
- [`func title(text: Text -> Text)`](#title)
|
||||
- [`func title(text: Text, language: Text = "C" -> Text)`](#title)
|
||||
- [`func to(text: Text, last: Int -> Text)`](#to)
|
||||
- [`func trim(text: Text, pattern: Pattern = $/{whitespace}/, trim_left: Bool = yes, trim_right: Bool = yes -> Text)`](#trim)
|
||||
- [`func upper(text: Text -> Text)`](#upper)
|
||||
- [`func upper(text: Text, language: Text = "C" -> Text)`](#upper)
|
||||
- [`func utf32_codepoints(text: Text -> [Int32])`](#utf32_codepoints)
|
||||
|
||||
### `as_c_string`
|
||||
@ -451,6 +452,33 @@ An array of bytes (`[Byte]`) representing the text in UTF8 encoding.
|
||||
|
||||
---
|
||||
|
||||
### `caseless_equals`
|
||||
Checks whether two texts are equal, ignoring the casing of the letters (i.e.
|
||||
case-insensitive comparison).
|
||||
|
||||
```tomo
|
||||
func caseless_equals(a: Text, b:Text, language:Text = "C" -> Bool)
|
||||
```
|
||||
|
||||
- `a`: The first text to compare case-insensitively.
|
||||
- `b`: The second text to compare case-insensitively.
|
||||
- `language`: The ISO 639 language code for which casing rules to use.
|
||||
|
||||
**Returns:**
|
||||
`yes` if `a` and `b` are equal to each other, ignoring casing, otherwise `no`.
|
||||
|
||||
**Example:**
|
||||
```tomo
|
||||
>> "A":caseless_equals("a")
|
||||
= yes
|
||||
|
||||
# Turkish lowercase "I" is "ı" (dotless I), not "i"
|
||||
>> "I":caseless_equals("i", language="tr_TR")
|
||||
= no
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### `codepoint_names`
|
||||
Returns an array of the names of each codepoint in the text.
|
||||
|
||||
@ -782,10 +810,11 @@ An array of substrings resulting from the split.
|
||||
Converts all characters in the text to lowercase.
|
||||
|
||||
```tomo
|
||||
func lower(text: Text -> Text)
|
||||
func lower(text: Text, language: Text = "C" -> Text)
|
||||
```
|
||||
|
||||
- `text`: The text to be converted to lowercase.
|
||||
- `language`: The ISO 639 language code for which casing rules to use.
|
||||
|
||||
**Returns:**
|
||||
The lowercase version of the text.
|
||||
@ -794,6 +823,9 @@ The lowercase version of the text.
|
||||
```tomo
|
||||
>> "AMÉLIE":lower()
|
||||
= "amélie"
|
||||
|
||||
>> "I":lower(language="tr_TR")
|
||||
= "ı"
|
||||
```
|
||||
|
||||
---
|
||||
@ -1107,10 +1139,11 @@ func starts_with(text: Text, prefix: Text -> Bool)
|
||||
Converts the text to title case (capitalizing the first letter of each word).
|
||||
|
||||
```tomo
|
||||
func title(text: Text -> Text)
|
||||
func title(text: Text, language: Text = "C" -> Text)
|
||||
```
|
||||
|
||||
- `text`: The text to be converted to title case.
|
||||
- `language`: The ISO 639 language code for which casing rules to use.
|
||||
|
||||
**Returns:**
|
||||
The text in title case.
|
||||
@ -1119,6 +1152,10 @@ The text in title case.
|
||||
```tomo
|
||||
>> "amélie":title()
|
||||
= "Amélie"
|
||||
|
||||
# In Turkish, uppercase "i" is "İ"
|
||||
>> "i":title(language="tr_TR")
|
||||
= "İ"
|
||||
```
|
||||
|
||||
---
|
||||
@ -1183,10 +1220,11 @@ The text without the trim pattern at either end.
|
||||
Converts all characters in the text to uppercase.
|
||||
|
||||
```tomo
|
||||
func upper(text: Text -> Text)
|
||||
func upper(text: Text, language: Text = "C" -> Text)
|
||||
```
|
||||
|
||||
- `text`: The text to be converted to uppercase.
|
||||
- `language`: The ISO 639 language code for which casing rules to use.
|
||||
|
||||
**Returns:**
|
||||
The uppercase version of the text.
|
||||
@ -1195,6 +1233,10 @@ The uppercase version of the text.
|
||||
```tomo
|
||||
>> "amélie":upper()
|
||||
= "AMÉLIE"
|
||||
|
||||
# In Turkish, uppercase "i" is "İ"
|
||||
>> "i":upper(language="tr_TR")
|
||||
= "İ"
|
||||
```
|
||||
|
||||
---
|
||||
|
@ -395,6 +395,7 @@ env_t *new_compilation_unit(CORD libname)
|
||||
{"by_match", "Text$by_match", "func(text:Text, pattern:Pattern -> func(->Match?))"},
|
||||
{"by_split", "Text$by_split", "func(text:Text, pattern=$Pattern'' -> func(->Text?))"},
|
||||
{"bytes", "Text$utf8_bytes", "func(text:Text -> [Byte])"},
|
||||
{"caseless_equals", "Text$equal_ignoring_case", "func(a,b:Text, language=\"C\" -> Bool)"},
|
||||
{"codepoint_names", "Text$codepoint_names", "func(text:Text -> [Text])"},
|
||||
{"ends_with", "Text$ends_with", "func(text,suffix:Text -> Bool)"},
|
||||
{"each", "Text$each", "func(text:Text, pattern:Pattern, fn:func(match:Match), recursive=yes)"},
|
||||
@ -409,7 +410,7 @@ env_t *new_compilation_unit(CORD libname)
|
||||
{"has", "Text$has", "func(text:Text, pattern:Pattern -> Bool)"},
|
||||
{"join", "Text$join", "func(glue:Text, pieces:[Text] -> Text)"},
|
||||
{"lines", "Text$lines", "func(text:Text -> [Text])"},
|
||||
{"lower", "Text$lower", "func(text:Text -> Text)"},
|
||||
{"lower", "Text$lower", "func(text:Text, language=\"C\" -> Text)"},
|
||||
{"map", "Text$map", "func(text:Text, pattern:Pattern, fn:func(match:Match -> Text), recursive=yes -> Text)"},
|
||||
{"matches", "Text$matches", "func(text:Text, pattern:Pattern -> [Text]?)"},
|
||||
{"quoted", "Text$quoted", "func(text:Text, color=no -> Text)"},
|
||||
@ -420,10 +421,10 @@ env_t *new_compilation_unit(CORD libname)
|
||||
{"slice", "Text$slice", "func(text:Text, from=1, to=-1 -> Text)"},
|
||||
{"split", "Text$split", "func(text:Text, pattern=$Pattern'' -> [Text])"},
|
||||
{"starts_with", "Text$starts_with", "func(text,prefix:Text -> Bool)"},
|
||||
{"title", "Text$title", "func(text:Text -> Text)"},
|
||||
{"title", "Text$title", "func(text:Text, language=\"C\" -> Text)"},
|
||||
{"to", "Text$to", "func(text:Text, last:Int -> Text)"},
|
||||
{"trim", "Text$trim", "func(text:Text, pattern=$/{whitespace}/, trim_left=yes, trim_right=yes -> Text)"},
|
||||
{"upper", "Text$upper", "func(text:Text -> Text)"},
|
||||
{"upper", "Text$upper", "func(text:Text, language=\"C\" -> Text)"},
|
||||
{"utf32_codepoints", "Text$utf32_codepoints", "func(text:Text -> [Int32])"},
|
||||
)},
|
||||
{"Thread", THREAD_TYPE, "Thread_t", "Thread", TypedArray(ns_entry_t,
|
||||
|
@ -23,15 +23,15 @@ PUREFUNC public Text_t Bool$as_text(const void *b, bool colorize, const TypeInfo
|
||||
|
||||
PUREFUNC public OptionalBool_t Bool$parse(Text_t text)
|
||||
{
|
||||
if (Text$equal_ignoring_case(text, Text("yes"))
|
||||
|| Text$equal_ignoring_case(text, Text("on"))
|
||||
|| Text$equal_ignoring_case(text, Text("true"))
|
||||
|| Text$equal_ignoring_case(text, Text("1"))) {
|
||||
if (Text$equal_ignoring_case(text, Text("yes"), NONE_TEXT)
|
||||
|| Text$equal_ignoring_case(text, Text("on"), NONE_TEXT)
|
||||
|| Text$equal_ignoring_case(text, Text("true"), NONE_TEXT)
|
||||
|| Text$equal_ignoring_case(text, Text("1"), NONE_TEXT)) {
|
||||
return yes;
|
||||
} else if (Text$equal_ignoring_case(text, Text("no"))
|
||||
|| Text$equal_ignoring_case(text, Text("off"))
|
||||
|| Text$equal_ignoring_case(text, Text("false"))
|
||||
|| Text$equal_ignoring_case(text, Text("0"))) {
|
||||
} else if (Text$equal_ignoring_case(text, Text("no"), NONE_TEXT)
|
||||
|| Text$equal_ignoring_case(text, Text("off"), NONE_TEXT)
|
||||
|| Text$equal_ignoring_case(text, Text("false"), NONE_TEXT)
|
||||
|| Text$equal_ignoring_case(text, Text("0"), NONE_TEXT)) {
|
||||
return no;
|
||||
} else {
|
||||
return NONE_BOOL;
|
||||
|
@ -968,13 +968,13 @@ PUREFUNC public bool Text$equal(const void *a, const void *b, const TypeInfo_t*)
|
||||
return Text$equal_values(*(Text_t*)a, *(Text_t*)b);
|
||||
}
|
||||
|
||||
PUREFUNC public bool Text$equal_ignoring_case(Text_t a, Text_t b)
|
||||
PUREFUNC public bool Text$equal_ignoring_case(Text_t a, Text_t b, Text_t language)
|
||||
{
|
||||
if (a.length != b.length)
|
||||
return false;
|
||||
int64_t len = a.length;
|
||||
TextIter_t a_state = NEW_TEXT_ITER_STATE(a), b_state = NEW_TEXT_ITER_STATE(b);
|
||||
const char *language = uc_locale_language();
|
||||
const char *uc_language = Text$as_c_string(language);
|
||||
for (int64_t i = 0; i < len; i++) {
|
||||
int32_t ai = Text$get_grapheme_fast(&a_state, i);
|
||||
int32_t bi = Text$get_grapheme_fast(&b_state, i);
|
||||
@ -986,7 +986,7 @@ PUREFUNC public bool Text$equal_ignoring_case(Text_t a, Text_t b)
|
||||
int64_t b_len = bi >= 0 ? 1 : NUM_GRAPHEME_CODEPOINTS(bi);
|
||||
|
||||
int cmp = 0;
|
||||
(void)u32_casecmp(a_codepoints, (size_t)a_len, b_codepoints, (size_t)b_len, language, UNINORM_NFC, &cmp);
|
||||
(void)u32_casecmp(a_codepoints, (size_t)a_len, b_codepoints, (size_t)b_len, uc_language, UNINORM_NFC, &cmp);
|
||||
if (cmp != 0)
|
||||
return false;
|
||||
}
|
||||
@ -994,40 +994,40 @@ PUREFUNC public bool Text$equal_ignoring_case(Text_t a, Text_t b)
|
||||
return true;
|
||||
}
|
||||
|
||||
public Text_t Text$upper(Text_t text)
|
||||
public Text_t Text$upper(Text_t text, Text_t language)
|
||||
{
|
||||
if (text.length == 0) return text;
|
||||
Array_t codepoints = Text$utf32_codepoints(text);
|
||||
const char *language = uc_locale_language();
|
||||
const char *uc_language = Text$as_c_string(language);
|
||||
ucs4_t buf[128];
|
||||
size_t out_len = sizeof(buf)/sizeof(buf[0]);
|
||||
ucs4_t *upper = u32_toupper(codepoints.data, (size_t)codepoints.length, language, UNINORM_NFC, buf, &out_len);
|
||||
ucs4_t *upper = u32_toupper(codepoints.data, (size_t)codepoints.length, uc_language, UNINORM_NFC, buf, &out_len);
|
||||
Text_t ret = text_from_u32(upper, (int64_t)out_len, false);
|
||||
if (upper != buf) free(upper);
|
||||
return ret;
|
||||
}
|
||||
|
||||
public Text_t Text$lower(Text_t text)
|
||||
public Text_t Text$lower(Text_t text, Text_t language)
|
||||
{
|
||||
if (text.length == 0) return text;
|
||||
Array_t codepoints = Text$utf32_codepoints(text);
|
||||
const char *language = uc_locale_language();
|
||||
const char *uc_language = Text$as_c_string(language);
|
||||
ucs4_t buf[128];
|
||||
size_t out_len = sizeof(buf)/sizeof(buf[0]);
|
||||
ucs4_t *lower = u32_tolower(codepoints.data, (size_t)codepoints.length, language, UNINORM_NFC, buf, &out_len);
|
||||
ucs4_t *lower = u32_tolower(codepoints.data, (size_t)codepoints.length, uc_language, UNINORM_NFC, buf, &out_len);
|
||||
Text_t ret = text_from_u32(lower, (int64_t)out_len, false);
|
||||
if (lower != buf) free(lower);
|
||||
return ret;
|
||||
}
|
||||
|
||||
public Text_t Text$title(Text_t text)
|
||||
public Text_t Text$title(Text_t text, Text_t language)
|
||||
{
|
||||
if (text.length == 0) return text;
|
||||
Array_t codepoints = Text$utf32_codepoints(text);
|
||||
const char *language = uc_locale_language();
|
||||
const char *uc_language = Text$as_c_string(language);
|
||||
ucs4_t buf[128];
|
||||
size_t out_len = sizeof(buf)/sizeof(buf[0]);
|
||||
ucs4_t *title = u32_totitle(codepoints.data, (size_t)codepoints.length, language, UNINORM_NFC, buf, &out_len);
|
||||
ucs4_t *title = u32_totitle(codepoints.data, (size_t)codepoints.length, uc_language, UNINORM_NFC, buf, &out_len);
|
||||
Text_t ret = text_from_u32(title, (int64_t)out_len, false);
|
||||
if (title != buf) free(title);
|
||||
return ret;
|
||||
|
@ -45,11 +45,11 @@ PUREFUNC uint64_t Text$hash(const void *text, const TypeInfo_t*);
|
||||
PUREFUNC int32_t Text$compare(const void *va, const void *vb, const TypeInfo_t*);
|
||||
PUREFUNC bool Text$equal(const void *a, const void *b, const TypeInfo_t*);
|
||||
PUREFUNC bool Text$equal_values(Text_t a, Text_t b);
|
||||
PUREFUNC bool Text$equal_ignoring_case(Text_t a, Text_t b);
|
||||
PUREFUNC bool Text$equal_ignoring_case(Text_t a, Text_t b, Text_t language);
|
||||
PUREFUNC bool Text$is_none(const void *t, const TypeInfo_t*);
|
||||
Text_t Text$upper(Text_t text);
|
||||
Text_t Text$lower(Text_t text);
|
||||
Text_t Text$title(Text_t text);
|
||||
Text_t Text$upper(Text_t text, Text_t language);
|
||||
Text_t Text$lower(Text_t text, Text_t language);
|
||||
Text_t Text$title(Text_t text, Text_t language);
|
||||
Text_t Text$as_text(const void *text, bool colorize, const TypeInfo_t *info);
|
||||
Text_t Text$quoted(Text_t str, bool colorize);
|
||||
PUREFUNC bool Text$starts_with(Text_t text, Text_t prefix);
|
||||
|
20
test/text.tm
20
test/text.tm
@ -11,6 +11,26 @@ func main():
|
||||
>> str[1]
|
||||
= "H"
|
||||
|
||||
>> "I":lower()
|
||||
= "i"
|
||||
>> "I":lower(language="tr_TR")
|
||||
= "ı"
|
||||
|
||||
>> "i":upper()
|
||||
= "I"
|
||||
>> "i":upper(language="tr_TR")
|
||||
= "İ"
|
||||
|
||||
>> "ian":title()
|
||||
= "Ian"
|
||||
>> "ian":title(language="tr_TR")
|
||||
= "İan"
|
||||
|
||||
>> "I":caseless_equals("ı")
|
||||
= no
|
||||
>> "I":caseless_equals("ı", language="tr_TR")
|
||||
= yes
|
||||
|
||||
>> str[9]
|
||||
= "é"
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user