From df844946f7d45fa39acf040eb8922ab658f9f5dd Mon Sep 17 00:00:00 2001 From: Bruce Hill Date: Wed, 4 Sep 2024 00:34:27 -0400 Subject: Add Text:matches() for convenience and performance --- builtins/text.c | 50 +++++++++++++++++++++++++++++++------------------- builtins/text.h | 1 + docs/text.md | 41 ++++++++++++++++++++++++++++++++++++----- environment.c | 1 + test/text.tm | 8 ++++++++ 5 files changed, 77 insertions(+), 24 deletions(-) diff --git a/builtins/text.c b/builtins/text.c index 2864f3a8..a96426f1 100644 --- a/builtins/text.c +++ b/builtins/text.c @@ -1386,14 +1386,8 @@ int64_t match(Text_t text, Pattern_t pattern, int64_t text_index, int64_t patter #undef EAT2 #undef EAT_MANY -public Int_t Text$find(Text_t text, Pattern_t pattern, Int_t from_index, int64_t *match_length) +static int64_t _find(Text_t text, Pattern_t pattern, int64_t first, int64_t last, int64_t *match_length) { - int64_t first = Int_to_Int64(from_index, false); - if (first == 0) fail("Invalid index: 0"); - if (first < 0) first = text.length + first + 1; - if (first > text.length || first < 1) - return I_small(0); - int32_t first_grapheme = get_grapheme(pattern, 0); bool find_first = (first_grapheme != '{' && !uc_is_property(first_grapheme, UC_PROPERTY_QUOTATION_MARK) @@ -1401,7 +1395,7 @@ public Int_t Text$find(Text_t text, Pattern_t pattern, Int_t from_index, int64_t iteration_state_t text_state = {0, 0}; - for (int64_t i = first-1; i < text.length; i++) { + for (int64_t i = first; i <= last; i++) { // Optimization: quickly skip ahead to first char in pattern: if (find_first) { while (i < text.length && _next_grapheme(text, &text_state, i) != first_grapheme) @@ -1412,17 +1406,36 @@ public Int_t Text$find(Text_t text, Pattern_t pattern, Int_t from_index, int64_t if (m >= 0) { if (match_length) *match_length = m; - return I(i+1); + return i; } } if (match_length) *match_length = -1; - return I(0); + return -1; +} + +public Int_t Text$find(Text_t text, Pattern_t pattern, Int_t from_index, 
int64_t *match_length) +{ + int64_t first = Int_to_Int64(from_index, false); + if (first == 0) fail("Invalid index: 0"); + if (first < 0) first = text.length + first + 1; + if (first > text.length || first < 1) + return I(0); + int64_t found = _find(text, pattern, first-1, text.length-1, match_length); + return I(found+1); } public bool Text$has(Text_t text, Pattern_t pattern) { - return !I_is_zero(Text$find(text, pattern, I_small(1), NULL)); + int64_t found = _find(text, pattern, 0, text.length-1, NULL); + return (found >= 0); +} + +public bool Text$matches(Text_t text, Pattern_t pattern) +{ + int64_t len; + int64_t found = _find(text, pattern, 0, 0, &len); + return (found >= 0) && len == text.length; } public int printf_text_size(const struct printf_info *info, size_t n, int argtypes[n], int sizes[n]) @@ -1527,14 +1540,13 @@ public array_t Text$find_all(Text_t text, Pattern_t pattern) array_t matches = {}; - Int_t i = I_small(1); - for (;;) { + for (int64_t i = 0; ; ) { int64_t len; - Int_t found = Text$find(text, pattern, i, &len); - if (I_is_zero(found)) break; - Text_t match = Text$slice(text, found, Int$plus(found, I(len-1))); + int64_t found = _find(text, pattern, i, text.length-1, &len); + if (found < 0) break; + Text_t match = Text$slice(text, I(found+1), I(found + len)); Array$insert(&matches, &match, I_small(0), sizeof(Text_t)); - i = Int$plus(found, I(len <= 0 ? 1 : len)); + i = found + MAX(len, 1); } return matches; @@ -1673,7 +1685,7 @@ public Text_t Text$replace_all(Text_t text, table_t replacements, Text_t backref Text_t replacement = *(Text_t*)(replacements.entries.data + i*replacements.entries.stride + sizeof(Text_t)); Text_t replacement_text = apply_backrefs(text, recursive ? pattern : Text(""), replacement, backref_pat, captures); ret = concat2(ret, replacement_text); - pos += len > 0 ? 
len : 1; + pos += MAX(len, 1); nonmatch_pos = pos; goto next_pos; } @@ -1707,7 +1719,7 @@ public array_t Text$split(Text_t text, Pattern_t pattern) if (I_is_zero(found)) break; Text_t chunk = Text$slice(text, i, Int$minus(found, I_small(1))); Array$insert(&chunks, &chunk, I_small(0), sizeof(Text_t)); - i = Int$plus(found, I(len <= 0 ? 1 : len)); + i = Int$plus(found, I(MAX(len, 1))); } Text_t last_chunk = Text$slice(text, i, I(text.length)); diff --git a/builtins/text.h b/builtins/text.h index ddc91afc..bcfe8055 100644 --- a/builtins/text.h +++ b/builtins/text.h @@ -38,6 +38,7 @@ array_t Text$split(Text_t text, Pattern_t pattern); Int_t Text$find(Text_t text, Pattern_t pattern, Int_t i, int64_t *match_length); array_t Text$find_all(Text_t text, Pattern_t pattern); bool Text$has(Text_t text, Pattern_t pattern); +bool Text$matches(Text_t text, Pattern_t pattern); const char *Text$as_c_string(Text_t text); public Text_t Text$format(const char *fmt, ...); array_t Text$clusters(Text_t text); diff --git a/docs/text.md b/docs/text.md index adf12dd1..b585676b 100644 --- a/docs/text.md +++ b/docs/text.md @@ -274,9 +274,10 @@ functions that would normally be handled by a more extensive API: Text.has(pattern:Pattern)->Bool Text.find(pattern:Pattern, start=1, length=!&Int64?)->Int Text.find_all(pattern:Pattern)->[Text] -Text.split(pattern:Pattern)->[Text] +Text.matches(pattern:Pattern)->Bool Text.replace(pattern:Pattern, replacement:Text, placeholder:Pattern=$//)->[Text] Text.replace_all(replacements:{Pattern:Text}, placeholder:Pattern=$//)->[Text] +Text.split(pattern:Pattern)->[Text] ``` See [Text Functions](#Text-Functions) for the full API documentation. 
@@ -708,13 +709,13 @@ has(text: Text, pattern: Text) -> Bool **Example:** ```tomo ->> "hello world":has("wo") +>> "hello world":has($/wo/) = yes ->> "hello world":has("{alpha}") +>> "hello world":has($/{alpha}/) = yes ->> "hello world":has("{digit}") +>> "hello world":has($/{digit}/) = no ->> "hello world":has("{start}he") +>> "hello world":has($/{start}he/) = yes ``` @@ -805,6 +806,36 @@ The lowercase version of the text. --- +## `matches` + +**Description:** +Checks if the entire `Text` matches the target pattern (see: [Patterns](#Patterns)). + +**Usage:** +```tomo +matches(text: Text, pattern: Pattern) -> Bool +``` + +**Parameters:** + +- `text`: The text to be checked. +- `pattern`: The pattern that the whole text must match. + +**Returns:** +`yes` if the whole text matches the target pattern from start to end, `no` otherwise. + +**Example:** +```tomo +>> "Hello":matches($/{id}/) += yes +>> "Hello":matches($/{upper}/) += no +>> "Hello":matches($/{lower}/) += no +``` + +--- + ## `quoted` **Description:** diff --git a/environment.c b/environment.c index d105d2f4..6477039c 100644 --- a/environment.c +++ b/environment.c @@ -247,6 +247,7 @@ env_t *new_compilation_unit(CORD *libname) {"join", "Text$join", "func(glue:Text, pieces:[Text])->Text"}, {"lines", "Text$lines", "func(text:Text)->[Text]"}, {"lower", "Text$lower", "func(text:Text)->Text"}, + {"matches", "Text$matches", "func(text:Text, pattern:Pattern)->Bool"}, {"quoted", "Text$quoted", "func(text:Text, color=no)->Text"}, {"replace", "Text$replace", "func(text:Text, pattern:Pattern, replacement:Text, backref=$/\\/, recursive=yes)->Text"}, {"replace_all", "Text$replace_all", "func(text:Text, replacements:{Pattern:Text}, backref=$/\\/, recursive=yes)->Text"}, diff --git a/test/text.tm b/test/text.tm index e3848a08..af03c3c7 100644 --- a/test/text.tm +++ b/test/text.tm @@ -242,3 +242,11 @@ func main(): >> " BAD(x, fn(y), BAD(z), w) ":replace($/BAD(?)/, "good(\1)", recursive=no) = " good(x, fn(y), BAD(z), w) " + >> "Hello":matches($/{id}/) + = yes + >> 
"Hello":matches($/{lower}/) + = no + >> "Hello":matches($/{upper}/) + = no + >> "Hello...":matches($/{id}/) + = no -- cgit v1.2.3