Add text:by_line()/:by_split()/:by_match()

This commit is contained in:
Bruce Hill 2024-12-21 16:32:22 -05:00
parent 325b367a13
commit 46b61d3ed2
7 changed files with 218 additions and 2 deletions

View File

@ -308,6 +308,8 @@ CORD compile_type(type_t *t)
case StructType: {
if (nonnull == THREAD_TYPE)
return "Thread_t";
if (nonnull == MATCH_TYPE)
return "OptionalMatch_t";
auto s = Match(nonnull, StructType);
return CORD_all(namespace_prefix(s->env, s->env->namespace->parent), "$Optional", s->name, "_t");
}
@ -422,7 +424,7 @@ CORD optional_into_nonnone(type_t *t, CORD value)
case IntType:
return CORD_all(value, ".i");
case StructType:
if (t == THREAD_TYPE)
if (t == THREAD_TYPE || t == MATCH_TYPE)
return value;
return CORD_all(value, ".value");
default:
@ -436,6 +438,8 @@ CORD check_none(type_t *t, CORD value)
if (t->tag == PointerType || t->tag == FunctionType || t->tag == CStringType
|| t->tag == ChannelType || t == THREAD_TYPE)
return CORD_all("(", value, " == NULL)");
else if (t == MATCH_TYPE)
return CORD_all("((", value, ").index.small == 0)");
else if (t->tag == BigIntType)
return CORD_all("((", value, ").small == 0)");
else if (t->tag == ClosureType)

View File

@ -457,6 +457,101 @@ indices are counted from the back of the text, so `-1` means the last cluster,
---
## `by_line`
**Description:**
Returns an iterator function that can be used to iterate over the lines in a
text.
**Signature:**
```tomo
func by_line(text: Text -> func(->Text?))
```
**Parameters:**
- `text`: The text to be iterated over, line by line.
**Returns:**
An iterator function that returns one line at a time, until it runs out and
returns `none`. **Note:** this function ignores a trailing newline if there is
one. If you don't want this behavior, use `text:by_split($/{1 nl}/)` instead.
**Example:**
```tomo
text := "
line one
line two
"
for line in text:by_line():
# Prints: "line one" then "line two":
say(line)
```
---
## `by_match`
**Description:**
Returns an iterator function that can be used to iterate over the occurrences
of a pattern in a text.
**Signature:**
```tomo
func by_match(text: Text, pattern: Pattern -> func(->Match?))
```
**Parameters:**
- `text`: The text to be iterated over looking for matches.
- `pattern`: The pattern to look for.
**Returns:**
An iterator function that returns one match result at a time, until it runs out
and returns `none`. **Note:** if a zero-length match is found, the iterator
will return it exactly once. Matches are always non-overlapping.
**Example:**
```tomo
text := "one two three"
for match in text:by_match($/{alpha}/):
# Prints: "one" then "two" then "three":
say(match.text)
```
---
## `by_split`
**Description:**
Returns an iterator function that can be used to iterate over text separated by
a pattern.
**Signature:**
```tomo
func by_split(text: Text, pattern: Pattern = $// -> func(->Text?))
```
**Parameters:**
- `text`: The text to be iterated over in pattern-delimited chunks.
- `pattern`: The pattern to split the text on.
**Returns:**
An iterator function that returns one chunk of text at a time, separated by the
given pattern, until it runs out and returns `none`. **Note:** using an empty
pattern (the default) will iterate over single grapheme clusters in the text.
**Example:**
```tomo
text := "one,two,three"
for chunk in text:by_split($/,/):
# Prints: "one" then "two" then "three":
say(chunk)
```
---
## `bytes`
**Description:**

View File

@ -393,6 +393,9 @@ env_t *new_compilation_unit(CORD libname)
{"Text", TEXT_TYPE, "Text_t", "Text$info", TypedArray(ns_entry_t,
{"as_c_string", "Text$as_c_string", "func(text:Text -> CString)"},
{"at", "Text$cluster", "func(text:Text, index:Int -> Text)"},
{"by_line", "Text$by_line", "func(text:Text -> func(->Text?))"},
{"by_match", "Text$by_match", "func(text:Text, pattern:Pattern -> func(->Match?))"},
{"by_split", "Text$by_split", "func(text:Text, pattern=$Pattern'' -> func(->Text?))"},
{"bytes", "Text$utf8_bytes", "func(text:Text -> [Byte])"},
{"codepoint_names", "Text$codepoint_names", "func(text:Text -> [Text])"},
{"ends_with", "Text$ends_with", "func(text,suffix:Text -> Bool)"},

View File

@ -873,6 +873,33 @@ public Array_t Text$find_all(Text_t text, Pattern_t pattern)
return matches;
}
// Heap-allocated state captured by the closure that Text$by_match() returns.
typedef struct {
TextIter_t state; // Text iteration state; its `.text` member is the text being searched
Int_t i; // Position handed to Text$find() as the place to start the next search (starts at 1)
Pattern_t pattern; // The pattern being searched for on every call
} match_iter_state_t;
// Closure body for Text$by_match(): looks for the next occurrence of the
// stored pattern starting at the stored position, moves the position forward,
// and returns whatever Text$find() produced (a match, or its "no match" value).
// Once the position has moved beyond the end of the text, NONE_MATCH is
// returned on every call.
static OptionalMatch_t next_match(match_iter_state_t *state)
{
    // Iterator already ran off the end of the text: nothing more to yield.
    if (Int_to_Int64(state->i, false) > state->state.text.length)
        return NONE_MATCH;

    OptionalMatch_t result = Text$find(state->state.text, state->pattern, state->i);
    if (result.index.small != 0) {
        // Found a match: resume after it. Always advance by at least one
        // position so that a zero-length match is not found again.
        state->i = Int$plus(result.index, I(MAX(1, result.text.length)));
    } else {
        // No match: park the position past the end so later calls stop early.
        state->i = I(state->state.text.length + 1);
    }
    return result;
}
// Build an iterator closure over the occurrences of `pattern` in `text`.
// The closure's function is next_match() and its userdata is a freshly
// allocated match_iter_state_t whose search position starts at 1.
public Closure_t Text$by_match(Text_t text, Pattern_t pattern)
{
    match_iter_state_t *iter = new(match_iter_state_t, .state={text, 0, 0}, .i=I_small(1), .pattern=pattern);
    Closure_t closure = {
        .fn=(void*)next_match,
        .userdata=iter,
    };
    return closure;
}
static Text_t apply_backrefs(Text_t text, Pattern_t original_pattern, Text_t replacement, Pattern_t backref_pat, capture_t *captures)
{
if (backref_pat.length == 0)
@ -1145,10 +1172,12 @@ public Array_t Text$split(Text_t text, Pattern_t pattern)
for (;;) {
int64_t len = 0;
int64_t found = _find(text, pattern, i, text.length-1, &len, NULL);
if (found == i && len == 0)
found = _find(text, pattern, i + 1, text.length-1, &len, NULL);
if (found < 0) break;
Text_t chunk = Text$slice(text, I(i+1), I(found));
Array$insert(&chunks, &chunk, I_small(0), sizeof(Text_t));
i = found + MAX(len, 1);
i = MAX(found + len, i + 1);
}
Text_t last_chunk = Text$slice(text, I(i+1), I(text.length));
@ -1157,6 +1186,53 @@ public Array_t Text$split(Text_t text, Pattern_t pattern)
return chunks;
}
// Heap-allocated state captured by the closure that Text$by_split() returns.
typedef struct {
TextIter_t state; // Text iteration state; its `.text` member is the text being split
int64_t i; // 0-based offset where the next chunk begins (starts at 0)
Pattern_t pattern; // Delimiter pattern; a zero-length pattern means "split into single clusters"
} split_iter_state_t;
// Closure body for Text$by_split(): returns the next chunk of text that lies
// between occurrences of the stored delimiter pattern, or NONE_TEXT once the
// text has been used up. With a zero-length pattern, each call yields one
// cluster of the text instead.
static OptionalText_t next_split(split_iter_state_t *state)
{
    Text_t text = state->state.text;
    int64_t cursor = state->i;

    if (cursor >= text.length) {
        // Sitting exactly at the end with a non-empty pattern yields a single
        // final empty chunk; anything else means the iterator is exhausted.
        if (cursor != text.length || state->pattern.length <= 0)
            return NONE_TEXT;
        state->i = text.length + 1;
        return (Text_t){.length=0};
    }

    if (state->pattern.length == 0) {
        // Empty pattern: hand back one cluster per call.
        Text_t cluster = Text$cluster(text, I(cursor+1));
        state->i = cursor + 1;
        return cluster;
    }

    int64_t match_len = 0;
    int64_t match_pos = _find(text, state->pattern, cursor, text.length-1, &match_len, NULL);
    // A zero-length match right at the cursor is skipped; search again one
    // position further along.
    if (match_pos == cursor && match_len == 0)
        match_pos = _find(text, state->pattern, cursor + 1, text.length-1, &match_len, NULL);

    if (match_pos < 0) {
        // No further delimiter: the remainder of the text is the last chunk.
        state->i = text.length + 1;
        return Text$slice(text, I(cursor+1), I(text.length));
    }

    // Continue after the delimiter, always moving forward by at least one.
    state->i = MAX(match_pos + match_len, cursor + 1);
    return Text$slice(text, I(cursor+1), I(match_pos));
}
// Build an iterator closure over the chunks of `text` delimited by `pattern`.
// The closure's function is next_split() and its userdata is a freshly
// allocated split_iter_state_t positioned at offset 0.
public Closure_t Text$by_split(Text_t text, Pattern_t pattern)
{
    split_iter_state_t *iter = new(split_iter_state_t, .state={text, 0, 0}, .i=0, .pattern=pattern);
    Closure_t closure = {
        .fn=(void*)next_split,
        .userdata=iter,
    };
    return closure;
}
public const TypeInfo_t Pattern$info = {
.size=sizeof(Pattern_t),
.align=__alignof__(Pattern_t),

View File

@ -27,9 +27,11 @@ Text_t Text$replace(Text_t str, Pattern_t pat, Text_t replacement, Pattern_t bac
Pattern_t Pattern$escape_text(Text_t text);
Text_t Text$replace_all(Text_t text, Table_t replacements, Pattern_t backref_pat, bool recursive);
Array_t Text$split(Text_t text, Pattern_t pattern);
Closure_t Text$by_split(Text_t text, Pattern_t pattern);
Text_t Text$trim(Text_t text, Pattern_t pattern, bool trim_left, bool trim_right);
OptionalMatch_t Text$find(Text_t text, Pattern_t pattern, Int_t i);
Array_t Text$find_all(Text_t text, Pattern_t pattern);
Closure_t Text$by_match(Text_t text, Pattern_t pattern);
PUREFUNC bool Text$has(Text_t text, Pattern_t pattern);
OptionalArray_t Text$matches(Text_t text, Pattern_t pattern);
Text_t Text$map(Text_t text, Pattern_t pattern, Closure_t fn);

View File

@ -1422,6 +1422,41 @@ public Array_t Text$lines(Text_t text)
return lines;
}
// Heap-allocated state captured by the closure that Text$by_line() returns.
typedef struct {
TextIter_t state; // Text iteration state passed to Text$get_grapheme_fast(); `.text` is the text being read
int64_t i; // 0-based offset where the next line begins (starts at 0)
} line_iter_state_t;
// Closure body for Text$by_line(): returns the next line of the text, without
// its line terminator, and advances the stored offset past that terminator.
// Both "\n" and "\r\n" end a line. Returns NONE_TEXT when no text remains, so
// a terminator at the very end of the text does not produce an extra empty
// line.
//
// Fixes relative to the previous version:
//  - A final unterminated line consisting of a single grapheme (e.g. the "b"
//    in "a\nb", or the whole text "x") is now returned instead of being
//    silently dropped.
//  - The CRLF lookahead no longer reads the grapheme at index `i + 1` when `i`
//    is already the last index of the text.
static OptionalText_t next_line(line_iter_state_t *state)
{
    Text_t text = state->state.text;
    int64_t line_start = state->i;
    int64_t last = text.length - 1;
    for (int64_t i = line_start; i < text.length; i++) {
        int32_t grapheme = Text$get_grapheme_fast(&state->state, i);
        if (grapheme == '\r' && i < last && Text$get_grapheme_fast(&state->state, i + 1) == '\n') { // CRLF
            Text_t line = Text$slice(text, I(line_start+1), I(i));
            state->i = i + 2; // skip one extra for the LF that follows the CR
            return line;
        }
        if (grapheme == '\n') { // newline
            Text_t line = Text$slice(text, I(line_start+1), I(i));
            state->i = i + 1;
            return line;
        }
        if (i == last) { // last line, not newline-terminated
            Text_t line = Text$slice(text, I(line_start+1), I(i+1));
            state->i = i + 1;
            return line;
        }
    }
    return NONE_TEXT;
}
// Build an iterator closure over the lines of `text`. The closure's function
// is next_line() and its userdata is a freshly allocated line_iter_state_t
// positioned at offset 0.
public Closure_t Text$by_line(Text_t text)
{
    line_iter_state_t *iter = new(line_iter_state_t, .state={text, 0, 0}, .i=0);
    Closure_t closure = {
        .fn=(void*)next_line,
        .userdata=iter,
    };
    return closure;
}
PUREFUNC public bool Text$is_none(const void *t, const TypeInfo_t*)
{
return ((Text_t*)t)->length < 0;

View File

@ -59,6 +59,7 @@ Text_t Text$from_codepoints(Array_t codepoints);
OptionalText_t Text$from_codepoint_names(Array_t codepoint_names);
OptionalText_t Text$from_bytes(Array_t bytes);
Array_t Text$lines(Text_t text);
Closure_t Text$by_line(Text_t text);
Text_t Text$join(Text_t glue, Array_t pieces);
Text_t Text$repeat(Text_t text, Int_t count);
int32_t Text$get_grapheme_fast(TextIter_t *state, int64_t index);