Introduce a Match struct to represent pattern matching results, which
improves the usability of a lot of the APIs. Also fix some bugs
with ranges.
This commit is contained in:
Bruce Hill 2024-11-09 16:27:54 -05:00
parent 7a4f2e73ad
commit 898bee1581
12 changed files with 165 additions and 84 deletions

View File

@ -217,6 +217,7 @@ CORD compile_type(type_t *t)
if (t == THREAD_TYPE) return "Thread_t";
else if (t == RANGE_TYPE) return "Range_t";
else if (t == RNG_TYPE) return "RNG_t";
else if (t == MATCH_TYPE) return "Match_t";
switch (t->tag) {
case ReturnType: errx(1, "Shouldn't be compiling ReturnType to a type");
@ -3396,12 +3397,13 @@ CORD compile(env_t *env, ast_t *ast)
case StructType: {
for (arg_t *field = Match(value_t, StructType)->fields; field; field = field->next) {
if (streq(field->name, f->field)) {
const char *prefix = (value_t == RANGE_TYPE || value_t == MATCH_TYPE) ? "" : "$";
if (fielded_t->tag == PointerType) {
CORD fielded = compile_to_pointer_depth(env, f->fielded, 1, false);
return CORD_asprintf("(%r)->$%s", fielded, f->field);
return CORD_asprintf("(%r)->%s%s", fielded, prefix, f->field);
} else {
CORD fielded = compile(env, f->fielded);
return CORD_asprintf("(%r).$%s", fielded, f->field);
return CORD_asprintf("(%r).%s%s", fielded, prefix, f->field);
}
}
}
@ -3602,6 +3604,7 @@ CORD compile_type_info(env_t *env, type_t *t)
{
if (t == THREAD_TYPE) return "&Thread$info";
else if (t == RANGE_TYPE) return "&Range$info";
else if (t == MATCH_TYPE) return "&Match$info";
else if (t == RNG_TYPE) return "&RNG$info";
switch (t->tag) {

View File

@ -274,16 +274,22 @@ functions that would normally be handled by a more extensive API:
```
Text.has(pattern:Pattern -> Bool)
Text.find(pattern:Pattern, start=1 -> Int?)
Text.find_all(pattern:Pattern -> [Text])
Text.find(pattern:Pattern, start=1 -> Match?)
Text.find_all(pattern:Pattern -> [Match])
Text.matches(pattern:Pattern -> [Text]?)
Text.map(pattern:Pattern, fn:func(t:Text -> Text) -> Text)
Text.map(pattern:Pattern, fn:func(m:Match -> Text) -> Text)
Text.replace(pattern:Pattern, replacement:Text, placeholder:Pattern=$// -> Text)
Text.replace_all(replacements:{Pattern:Text}, placeholder:Pattern=$// -> Text)
Text.split(pattern:Pattern -> [Text])
Text.trim(pattern=$/{whitespace}/, trim_left=yes, trim_right=yes -> Text)
```
Pattern matching functions work with a type called `Match` that has three fields:
- `text`: The full text of the match.
- `index`: The 1-based index in the text where the match begins.
- `captures`: An array containing the matching text of each non-literal pattern group.
See [Text Functions](#Text-Functions) for the full API documentation.
## Syntax
@ -652,19 +658,19 @@ func find(text: Text, pattern: Pattern, start: Int = 1 -> Int?)
- `start`: The index to start the search.
**Returns:**
`!Int` if the target pattern is not found, otherwise the index where the match
was found.
`!Match` if the target pattern is not found or `start` is outside the text,
otherwise a `Match` struct containing information about the match.
**Example:**
```tomo
>> " one two three ":find("{id}", start=-999)
= !Int
>> " one two three ":find("{id}", start=999)
= !Int
>> " one two three ":find("{id}")
= 2?
>> " one two three ":find("{id}", start=5)
= 8?
>> " #one #two #three ":find($/#{id}/, start=-999)
= !Match
>> " #one #two #three ":find($/#{id}/, start=999)
= !Match
>> " #one #two #three ":find($/#{id}/)
= Match(text="#one", index=2, captures=["one"])?
>> " #one #two #three ":find($/#{id}/, start=6)
= Match(text="#two", index=9, captures=["two"])?
```
---
@ -677,7 +683,7 @@ See: [Patterns](#Patterns) for more information on patterns.
**Signature:**
```tomo
func find_all(text: Text, pattern: Pattern -> [Text])
func find_all(text: Text, pattern: Pattern -> [Match])
```
**Parameters:**
@ -691,22 +697,19 @@ Note: if `text` or `pattern` is empty, an empty array will be returned.
**Example:**
```tomo
>> " one two three ":find_all("{alpha}")
= ["one", "two", "three"]
>> " one two three ":find_all("{!space}")
= ["one", "two", "three"]
>> " #one #two #three ":find_all($/#{alpha}/)
= [Match(text="#one", index=2, captures=["one"]), Match(text="#two", index=8, captures=["two"]), Match(text="#three", index=13, captures=["three"])]
>> " ":find_all($/{alpha}/)
= []
>> " foo(baz(), 1) doop() ":find_all($/{id}(?)/)
= ["foo(baz(), 1)", "doop()"]
= [Match(text="foo(baz(), 1)", index=2, captures=["foo", "baz(), 1"]), Match(text="doop()", index=17, captures=["doop", ""])]
>> "":find_all("")
>> "":find_all($//)
= []
>> "Hello":find_all("")
>> "Hello":find_all($//)
= []
```
@ -833,8 +836,8 @@ The lowercase version of the text.
**Description:**
Checks if the `Text` matches target pattern (see: [Patterns](#Patterns)) and
returns an array of the matching texts or a null value if the entire text
doesn't match the pattern.
returns an array of the matching text captures or a null value if the entire
text doesn't match the pattern.
**Signature:**
```tomo
@ -847,8 +850,8 @@ func matches(text: Text, pattern: Pattern -> [Text])
- `pattern`: The pattern to search for.
**Returns:**
An array of the matching text groups if the entire text matches the pattern, or
a null value otherwise.
An array of the matching text captures if the entire text matches the pattern,
or a null value otherwise.
**Example:**
```tomo
@ -865,11 +868,11 @@ a null value otherwise.
**Description:**
For each occurrence of the given pattern, replace the text with the result of
calling the given function on that text.
calling the given function on that match.
**Signature:**
```tomo
func map(text: Text, pattern: Pattern, fn: func(text:Text)->Text -> Text)
func map(text: Text, pattern: Pattern, fn: func(match: Match -> Text) -> Text)
```
**Parameters:**
@ -884,9 +887,9 @@ function to each.
**Example:**
```tomo
>> "hello world":map($/world/, Text.upper)
>> "hello world":map($/world/, func(m:Match): m.text:upper())
= "hello WORLD"
>> "Some nums: 1 2 3 4":map($/{int}/, func(i:Text): "$(Int.parse(i)! + 10)")
>> "Some nums: 1 2 3 4":map($/{int}/, func(m:Match): "$(Int.parse(m.text)! + 10)")
= "Some nums: 11 12 13 14"
```
@ -1081,16 +1084,16 @@ An array of substrings resulting from the split.
**Example:**
```tomo
>> "one,two,three":split(",")
>> "one,two,three":split($/,/)
= ["one", "two", "three"]
>> "abc":split()
= ["a", "b", "c"]
>> "a b c":split("{space}")
>> "a b c":split($/{space}/)
= ["a", "b", "c"]
>> "a,b,c,":split(",")
>> "a,b,c,":split($/,/)
= ["a", "b", "c", ""]
```

View File

@ -12,6 +12,7 @@
#include "typecheck.h"
type_t *TEXT_TYPE = NULL;
type_t *MATCH_TYPE = NULL;
type_t *RANGE_TYPE = NULL;
type_t *RNG_TYPE = NULL;
public type_t *THREAD_TYPE = NULL;
@ -75,6 +76,15 @@ env_t *new_compilation_unit(CORD libname)
.next=new(arg_t, .name="step", .type=INT_TYPE, .default_val=FakeAST(Int, .str="1")))));
}
{
env_t *match_env = namespace_env(env, "Match");
MATCH_TYPE = Type(
StructType, .name="Match", .env=match_env,
.fields=new(arg_t, .name="text", .type=TEXT_TYPE,
.next=new(arg_t, .name="index", .type=INT_TYPE,
.next=new(arg_t, .name="captures", .type=Type(ArrayType, .item_type=TEXT_TYPE)))));
}
{
env_t *thread_env = namespace_env(env, "Thread");
THREAD_TYPE = Type(StructType, .name="Thread", .env=thread_env, .opaque=true);
@ -270,6 +280,9 @@ env_t *new_compilation_unit(CORD libname)
{"reversed", "Range$reversed", "func(range:Range -> Range)"},
{"by", "Range$by", "func(range:Range, step:Int -> Range)"},
)},
{"Match", MATCH_TYPE, "Match_t", "Match", TypedArray(ns_entry_t,
// No methods
)},
{"Pattern", Type(TextType, .lang="Pattern", .env=namespace_env(env, "Pattern")), "Pattern_t", "Pattern$info", TypedArray(ns_entry_t,
{"escape_int", "Int$value_as_text", "func(i:Int -> Pattern)"},
{"escape_text", "Pattern$escape_text", "func(text:Text -> Pattern)"},
@ -374,8 +387,8 @@ env_t *new_compilation_unit(CORD libname)
{"as_c_string", "Text$as_c_string", "func(text:Text -> CString)"},
{"codepoint_names", "Text$codepoint_names", "func(text:Text -> [Text])"},
{"ends_with", "Text$ends_with", "func(text,suffix:Text -> Bool)"},
{"find", "Text$find", "func(text:Text, pattern:Pattern, start=1 -> Int?)"},
{"find_all", "Text$find_all", "func(text:Text, pattern:Pattern -> [Text])"},
{"find", "Text$find", "func(text:Text, pattern:Pattern, start=1 -> Match?)"},
{"find_all", "Text$find_all", "func(text:Text, pattern:Pattern -> [Match])"},
{"from_bytes", "Text$from_bytes", "func(bytes:[Byte] -> Text?)"},
{"from_c_string", "Text$from_str", "func(str:CString -> Text?)"},
{"from_codepoint_names", "Text$from_codepoint_names", "func(codepoint_names:[Text] -> Text?)"},
@ -385,7 +398,7 @@ env_t *new_compilation_unit(CORD libname)
{"join", "Text$join", "func(glue:Text, pieces:[Text] -> Text)"},
{"lines", "Text$lines", "func(text:Text -> [Text])"},
{"lower", "Text$lower", "func(text:Text -> Text)"},
{"map", "Text$map", "func(text:Text, pattern:Pattern, fn:func(text:Text -> Text) -> Text)"},
{"map", "Text$map", "func(text:Text, pattern:Pattern, fn:func(match:Match -> Text) -> Text)"},
{"matches", "Text$matches", "func(text:Text, pattern:Pattern -> [Text]?)"},
{"quoted", "Text$quoted", "func(text:Text, color=no -> Text)"},
{"repeat", "Text$repeat", "func(text:Text, count:Int -> Text)"},

View File

@ -80,6 +80,7 @@ void set_binding(env_t *env, const char *name, binding_t *binding);
binding_t *get_namespace_binding(env_t *env, ast_t *self, const char *name);
#define code_err(ast, ...) compiler_err((ast)->file, (ast)->start, (ast)->end, __VA_ARGS__)
extern type_t *TEXT_TYPE;
extern type_t *MATCH_TYPE;
extern type_t *RANGE_TYPE;
extern type_t *RNG_TYPE;
extern type_t *THREAD_TYPE;

View File

@ -8,6 +8,7 @@
#include "datetime.h"
#include "integers.h"
#include "metamethods.h"
#include "patterns.h"
#include "text.h"
#include "threads.h"
#include "util.h"
@ -34,6 +35,8 @@ public PUREFUNC bool is_null(const void *obj, const TypeInfo_t *non_optional_typ
return *(pthread_t**)obj == NULL;
else if (non_optional_type == &DateTime$info)
return ((OptionalDateTime_t*)obj)->tv_usec < 0;
else if (non_optional_type == &Match$info)
return ((OptionalMatch_t*)obj)->index.small == 0;
switch (non_optional_type->tag) {
case ChannelInfo: return *(Channel_t**)obj == NULL;

View File

@ -380,7 +380,7 @@ static int64_t match_id(TextIter_t *state, int64_t index)
static int64_t match_int(TextIter_t *state, int64_t index)
{
int64_t len = EAT_MANY(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT));
return len >= 0 ? len : -1;
return len > 0 ? len : -1;
}
static int64_t match_alphanumeric(TextIter_t *state, int64_t index)
@ -769,7 +769,7 @@ static int64_t match(Text_t text, int64_t text_index, Pattern_t pattern, int64_t
#undef EAT2
#undef EAT_MANY
static int64_t _find(Text_t text, Pattern_t pattern, int64_t first, int64_t last, int64_t *match_length)
static int64_t _find(Text_t text, Pattern_t pattern, int64_t first, int64_t last, int64_t *match_length, capture_t *captures)
{
int32_t first_grapheme = Text$get_grapheme(pattern, 0);
bool find_first = (first_grapheme != '{'
@ -784,7 +784,7 @@ static int64_t _find(Text_t text, Pattern_t pattern, int64_t first, int64_t last
++i;
}
int64_t m = match(text, i, pattern, 0, NULL, 0);
int64_t m = match(text, i, pattern, 0, captures, 0);
if (m >= 0) {
if (match_length)
*match_length = m;
@ -796,15 +796,30 @@ static int64_t _find(Text_t text, Pattern_t pattern, int64_t first, int64_t last
return -1;
}
public OptionalInt_t Text$find(Text_t text, Pattern_t pattern, Int_t from_index)
public OptionalMatch_t Text$find(Text_t text, Pattern_t pattern, Int_t from_index)
{
int64_t first = Int_to_Int64(from_index, false);
if (first == 0) fail("Invalid index: 0");
if (first < 0) first = text.length + first + 1;
if (first > text.length || first < 1)
return NULL_INT;
int64_t found = _find(text, pattern, first-1, text.length-1, NULL);
return found == -1 ? NULL_INT : I(found+1);
return NULL_MATCH;
capture_t captures[MAX_BACKREFS] = {};
int64_t len = 0;
int64_t found = _find(text, pattern, first-1, text.length-1, &len, captures);
if (found == -1)
return NULL_MATCH;
Array_t capture_array = {};
for (int i = 0; captures[i].occupied; i++) {
Text_t capture = Text$slice(text, I(captures[i].index+1), I(captures[i].index+captures[i].length));
Array$insert(&capture_array, &capture, I(0), sizeof(Text_t));
}
return (OptionalMatch_t){
.text=Text$slice(text, I(found+1), I(found+len)),
.index=I(found+1),
.captures=capture_array,
};
}
PUREFUNC public bool Text$has(Text_t text, Pattern_t pattern)
@ -820,7 +835,7 @@ PUREFUNC public bool Text$has(Text_t text, Pattern_t pattern)
}
return false;
} else {
int64_t found = _find(text, pattern, 0, text.length-1, NULL);
int64_t found = _find(text, pattern, 0, text.length-1, NULL, NULL);
return (found >= 0);
}
}
@ -846,16 +861,13 @@ public Array_t Text$find_all(Text_t text, Pattern_t pattern)
return (Array_t){.length=0};
Array_t matches = {};
for (int64_t i = 0; ; ) {
int64_t len = 0;
int64_t found = _find(text, pattern, i, text.length-1, &len);
if (found < 0) break;
Text_t match = Text$slice(text, I(found+1), I(found + len));
Array$insert(&matches, &match, I_small(0), sizeof(Text_t));
i = found + MAX(len, 1);
for (int64_t i = 1; ; ) {
OptionalMatch_t m = Text$find(text, pattern, I(i));
if (!m.index.small)
break;
i = Int_to_Int64(m.index, false) + m.text.length;
Array$insert(&matches, &m, I_small(0), sizeof(Match_t));
}
return matches;
}
@ -999,7 +1011,7 @@ public Text_t Text$map(Text_t text, Pattern_t pattern, Closure_t fn)
TextIter_t text_state = {text, 0, 0};
int64_t nonmatching_pos = 0;
Text_t (*text_mapper)(Text_t, void*) = fn.fn;
Text_t (*text_mapper)(Match_t, void*) = fn.fn;
for (int64_t pos = 0; pos < text.length; pos++) {
// Optimization: quickly skip ahead to first char in pattern:
if (find_first) {
@ -1007,10 +1019,21 @@ public Text_t Text$map(Text_t text, Pattern_t pattern, Closure_t fn)
++pos;
}
int64_t match_len = match(text, pos, pattern, 0, NULL, 0);
capture_t captures[MAX_BACKREFS] = {};
int64_t match_len = match(text, pos, pattern, 0, captures, 0);
if (match_len < 0) continue;
Text_t replacement = text_mapper(Text$slice(text, I(pos+1), I(pos+match_len)), fn.userdata);
Match_t m = {
.text=Text$slice(text, I(pos+1), I(pos+match_len)),
.index=I(pos+1),
.captures={},
};
for (int i = 0; captures[i].occupied; i++) {
Text_t capture = Text$slice(text, I(captures[i].index+1), I(captures[i].index+captures[i].length));
Array$insert(&m.captures, &capture, I(0), sizeof(Text_t));
}
Text_t replacement = text_mapper(m, fn.userdata);
if (pos > nonmatching_pos) {
Text_t before_slice = Text$slice(text, I(nonmatching_pos+1), I(pos));
ret = Text$concat(ret, before_slice, replacement);
@ -1084,7 +1107,7 @@ public Array_t Text$split(Text_t text, Pattern_t pattern)
int64_t i = 0;
for (;;) {
int64_t len = 0;
int64_t found = _find(text, pattern, i, text.length-1, &len);
int64_t found = _find(text, pattern, i, text.length-1, &len, NULL);
if (found < 0) break;
Text_t chunk = Text$slice(text, I(i+1), I(found));
Array$insert(&chunks, &chunk, I_small(0), sizeof(Text_t));
@ -1097,5 +1120,27 @@ public Array_t Text$split(Text_t text, Pattern_t pattern)
return chunks;
}
public const TypeInfo_t Pattern$info = {
.size=sizeof(Pattern_t),
.align=__alignof__(Pattern_t),
.tag=TextInfo,
.TextInfo={.lang="Pattern"},
};
static NamedType_t _match_fields[3] = {
{"text", &Text$info},
{"index", &Int$info},
{"captures", Array$info(&Text$info)},
};
public const TypeInfo_t Match$info = {
.size=sizeof(Match_t),
.align=__alignof__(Match_t),
.tag=StructInfo,
.StructInfo={
.name="Match",
.num_fields=3,
.fields=_match_fields,
},
};
// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0

View File

@ -14,12 +14,21 @@
#define Pattern(text) ((Pattern_t)Text(text))
#define Patterns(...) ((Pattern_t)Texts(__VA_ARGS__))
typedef struct {
Text_t text;
Int_t index;
Array_t captures;
} Match_t;
typedef Match_t OptionalMatch_t;
#define NULL_MATCH ((OptionalMatch_t){.index=NULL_INT})
Text_t Text$replace(Text_t str, Pattern_t pat, Text_t replacement, Pattern_t backref_pat, bool recursive);
Pattern_t Pattern$escape_text(Text_t text);
Text_t Text$replace_all(Text_t text, Table_t replacements, Pattern_t backref_pat, bool recursive);
Array_t Text$split(Text_t text, Pattern_t pattern);
Text_t Text$trim(Text_t text, Pattern_t pattern, bool trim_left, bool trim_right);
OptionalInt_t Text$find(Text_t text, Pattern_t pattern, Int_t i);
OptionalMatch_t Text$find(Text_t text, Pattern_t pattern, Int_t i);
Array_t Text$find_all(Text_t text, Pattern_t pattern);
PUREFUNC bool Text$has(Text_t text, Pattern_t pattern);
OptionalArray_t Text$matches(Text_t text, Pattern_t pattern);
@ -29,6 +38,7 @@ Text_t Text$map(Text_t text, Pattern_t pattern, Closure_t fn);
#define Pattern$compare Text$compare
#define Pattern$equal Text$equal
extern const TypeInfo_t Match$info;
extern const TypeInfo_t Pattern$info;
// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0

View File

@ -38,10 +38,11 @@ static Text_t Range$as_text(const Range_t *r, bool use_color, const TypeInfo_t *
(void)type;
if (!r) return Text("Range");
return Text$format(use_color ? "\x1b[0;1mRange\x1b[m(first=%r, last=%r, step=%r)"
: "Range(first=%r, last=%r, step=%r)",
Int$as_text(&r->first, use_color, &Int$info), Int$as_text(&r->last, use_color, &Int$info),
Int$as_text(&r->step, use_color, &Int$info));
Text_t first = Int$as_text(&r->first, use_color, &Int$info);
Text_t last = Int$as_text(&r->last, use_color, &Int$info);
Text_t step = Int$as_text(&r->step, use_color, &Int$info);
return Text$format(use_color ? "\x1b[0;1mRange\x1b[m(first=%k, last=%k, step=%k)"
: "Range(first=%k, last=%k, step=%k)", &first, &last, &step);
}
PUREFUNC public Range_t Range$reversed(Range_t r)

View File

@ -451,8 +451,8 @@ public void end_test(const void *expr, const TypeInfo_t *type, const char *expec
Text_t expr_plain = USE_COLOR ? generic_as_text(expr, false, type) : expr_text;
bool success = Text$equal(&expr_plain, &expected_text);
if (!success) {
Int_t colon = Text$find(expected_text, Text(":"), I_small(1));
if (colon.small != I_small(0).small) {
OptionalMatch_t colon = Text$find(expected_text, Text(":"), I_small(1));
if (colon.index.small) {
Text_t with_type = Text$concat(expr_plain, Text(" : "), type_name);
success = Text$equal(&with_type, &expected_text);
}

View File

@ -1354,11 +1354,4 @@ public Pattern_t Pattern$escape_text(Text_t text)
#undef add_escaped
}
public const TypeInfo_t Pattern$info = {
.size=sizeof(Pattern_t),
.align=__alignof__(Pattern_t),
.tag=TextInfo,
.TextInfo={.lang="Pattern"},
};
// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0

View File

@ -1,5 +1,14 @@
func main():
>> r := 5:to(10):by(2)
= Range(first=5, last=10, step=2)
>> r.first
= 5
>> r.last
= 10
>> r.step
= 2
>> Range(1, 5) == 1:to(5)
= yes

View File

@ -169,17 +169,17 @@ func main():
= []
!! Test text:find_all()
>> " one two three ":find_all($/{alpha}/)
= ["one", "two", "three"]
>> " #one #two #three ":find_all($/#{alpha}/)
= [Match(text="#one", index=2, captures=["one"]), Match(text="#two", index=8, captures=["two"]), Match(text="#three", index=13, captures=["three"])]
>> " one two three ":find_all($/{!space}/)
= ["one", "two", "three"]
>> " #one #two #three ":find_all($/#{!space}/)
= [Match(text="#one", index=2, captures=["one"]), Match(text="#two", index=8, captures=["two"]), Match(text="#three", index=13, captures=["three"])]
>> " ":find_all($/{alpha}/)
= []
>> " foo(baz(), 1) doop() ":find_all($/{id}(?)/)
= ["foo(baz(), 1)", "doop()"]
= [Match(text="foo(baz(), 1)", index=2, captures=["foo", "baz(), 1"]), Match(text="doop()", index=17, captures=["doop", ""])]
>> "":find_all($Pattern'')
= []
@ -189,13 +189,13 @@ func main():
!! Test text:find()
>> " one two three ":find($/{id}/, start=-999)
= !Int
= !Match
>> " one two three ":find($/{id}/, start=999)
= !Int
= !Match
>> " one two three ":find($/{id}/)
= 2?
= Match(text="one", index=2, captures=["one"])?
>> " one two three ":find($/{id}/, start=5)
= 8?
= Match(text="two", index=8, captures=["two"])?
!! Test text slicing:
>> "abcdef":slice()
@ -225,7 +225,7 @@ func main():
= !Text
>> "one two; three four":find_all($/; {..}/)
= ["; three four"]
= [Match(text="; three four", index=8, captures=["three four"])]
>> malicious := "{xxx}"
>> $/$malicious/
@ -261,7 +261,7 @@ func main():
else:
fail("Failed to match")
>> "hello world":map($/world/, Text.upper)
>> "hello world":map($/world/, func(m:Match): m.text:upper())
= "hello WORLD"
>> "Abc":repeat(3)