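Fix Text:find() start-index handling (fail on 0, wrap negative indices, return 0 when out of range), advance past zero-length matches in find_all/replace/split, and expose find()'s start/length parameters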

author: Bruce Hill <bruce@bruce-hill.com> 2024-09-02 23:47:16 -0400
committer: Bruce Hill <bruce@bruce-hill.com> 2024-09-02 23:47:16 -0400
commit: 5aa5a5e99b322586eed9997a14b3d616540bef07 (patch)
tree: c8eede952aa9ff49cea8f981d48e0b5faad291b2
parent: 5d6fa135b1eadbceac04e5456fabb7e53feedc10 (diff)
4 files changed, 36 insertions, 11 deletions
diff --git a/builtins/functions.c b/builtins/functions.c
index 06636cba..4aa699a5 100644
--- a/builtins/functions.c
+++ b/builtins/functions.c
@@ -218,7 +218,7 @@ public void end_test(void *expr, const TypeInfo *type, const char *expected, con
         Text_t expr_plain = USE_COLOR ? generic_as_text(expr, false, type) : expr_text;
         bool success = Text$equal(&expr_plain, &expected_text);
         if (!success) {
-            Int_t colon = Text$find(expected_text, Text$from_str(":"), I_small(0), NULL);
+            Int_t colon = Text$find(expected_text, Text$from_str(":"), I_small(1), NULL);
             if (colon.small != I_small(0).small) {
                 Text_t with_type = Text$concat(expr_plain, Text$from_str(" : "), type_name);
                 success = Text$equal(&with_type, &expected_text);
diff --git a/builtins/text.c b/builtins/text.c
index 2f3fbb46..32eefd32 100644
--- a/builtins/text.c
+++ b/builtins/text.c
@@ -1291,16 +1291,23 @@ int64_t match(Text_t text, Text_t pattern, int64_t text_index, int64_t pattern_i
 
 public Int_t Text$find(Text_t text, Text_t pattern, Int_t from_index, int64_t *match_length)
 {
-    int32_t first = get_grapheme(pattern, 0);
-    bool find_first = (first != '['
-                       && !uc_is_property(first, UC_PROPERTY_QUOTATION_MARK)
-                       && !uc_is_property(first, UC_PROPERTY_PAIRED_PUNCTUATION));
+    int64_t first = Int_to_Int64(from_index, false);
+    if (first == 0) fail("Invalid index: 0");
+    if (first < 0) first = text.length + first + 1;
+    if (first > text.length || first < 1)
+        return I_small(0);
+
+    int32_t first_grapheme = get_grapheme(pattern, 0);
+    bool find_first = (first_grapheme != '['
+                       && !uc_is_property(first_grapheme, UC_PROPERTY_QUOTATION_MARK)
+                       && !uc_is_property(first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
 
     iteration_state_t text_state = {0, 0};
-    for (int64_t i = Int_to_Int64(from_index, false)-1; i < text.length; i++) {
+
+    for (int64_t i = first-1; i < text.length; i++) {
         // Optimization: quickly skip ahead to first char in pattern:
         if (find_first) {
-            while (i < text.length && _next_grapheme(text, &text_state, i) != first)
+            while (i < text.length && _next_grapheme(text, &text_state, i) != first_grapheme)
                 ++i;
         }
 
@@ -1416,7 +1423,7 @@ public array_t Text$find_all(Text_t text, Text_t pattern)
         if (I_is_zero(found)) break;
         Text_t match = Text$slice(text, found, Int$plus(found, Int64_to_Int(len-1)));
         Array$insert(&matches, &match, I_small(0), sizeof(Text_t));
-        i = Int$plus(found, Int64_to_Int(len));
+        i = Int$plus(found, Int64_to_Int(len <= 0 ? 1 : len));
     }
 
     return matches;
@@ -1437,7 +1444,7 @@ public Text_t Text$replace(Text_t text, Text_t pattern, Text_t replacement)
         } else {
             ret = concat2(ret, replacement);
         }
-        i = Int$plus(found, Int64_to_Int(len));
+        i = Int$plus(found, Int64_to_Int(len <= 0 ? 1 : len));
     }
     if (Int_to_Int64(i, false) <= text.length) {
         Text_t last_slice = Text$slice(text, i, Int64_to_Int(text.length));
@@ -1463,7 +1470,7 @@ public array_t Text$split(Text_t text, Text_t pattern)
         if (I_is_zero(found)) break;
         Text_t chunk = Text$slice(text, i, Int$minus(found, I_small(1)));
         Array$insert(&chunks, &chunk, I_small(0), sizeof(Text_t));
-        i = Int$plus(found, Int64_to_Int(len));
+        i = Int$plus(found, Int64_to_Int(len <= 0 ? 1 : len));
     }
 
     Text_t last_chunk = Text$slice(text, i, Int64_to_Int(text.length));
diff --git a/environment.c b/environment.c
index d4ed6c8d..ee277d2a 100644
--- a/environment.c
+++ b/environment.c
@@ -247,7 +247,7 @@ env_t *new_compilation_unit(CORD *libname)
             {"by", "Range$by", "func(range:Range, step:Int)->Range"},
         )},
         {"Text", TEXT_TYPE, "Text_t", "$Text", TypedArray(ns_entry_t,
-            {"find", "Text$find", "func(text:Text, pattern:Text)->Int"},
+            {"find", "Text$find", "func(text:Text, pattern:Text, start=1, length=!&Int64)->Int"},
             {"find_all", "Text$find_all", "func(text:Text, pattern:Text)->[Text]"},
             {"as_c_string", "CORD_to_char_star", "func(text:Text)->CString"},
             {"codepoint_names", "Text$codepoint_names", "func(text:Text)->[Text]"},
diff --git a/test/text.tm b/test/text.tm
index 0dd5f2ec..39e8a6e1 100644
--- a/test/text.tm
+++ b/test/text.tm
@@ -129,6 +129,7 @@ func main():
 	>> "one$(\r\n)two$(\r\n)three$(\r\n)":lines()
 	= ["one", "two", "three"]
 
+	//! Test splitting and joining text:
 	>> "one two three":split(" ")
 	= ["one", "two", "three"]
 
@@ -156,6 +157,7 @@ func main():
 	>> "":split()
 	= []
 
+	//! Test text:find_all()
 	>> " one  two three   ":find_all("[..alpha]")
 	= ["one", "two", "three"]
 
@@ -173,3 +175,19 @@ func main():
 
 	>> "Hello":find_all("")
 	= []
+
+	//! Test text:find()
+	>> " one   two  three   ":find("[..id]", start=-999)
+	= 0
+	>> " one   two  three   ":find("[..id]", start=999)
+	= 0
+	>> " one   two  three   ":find("[..id]")
+	= 2
+	>> " one   two  three   ":find("[..id]", start=5)
+	= 8
+
+	>> len := 0_i64
+	>> "   one  ":find("[..id]", length=&len)
+	= 4
+	>> len
+	= 3_i64
author	Bruce Hill <bruce@bruce-hill.com>	2024-09-02 23:47:16 -0400
committer	Bruce Hill <bruce@bruce-hill.com>	2024-09-02 23:47:16 -0400
commit	5aa5a5e99b322586eed9997a14b3d616540bef07 (patch)
tree	c8eede952aa9ff49cea8f981d48e0b5faad291b2
parent	5d6fa135b1eadbceac04e5456fabb7e53feedc10 (diff)