diff --git a/compile.c b/compile.c index 87c872b..38b6f75 100644 --- a/compile.c +++ b/compile.c @@ -308,6 +308,8 @@ CORD compile_type(type_t *t) case StructType: { if (nonnull == THREAD_TYPE) return "Thread_t"; + if (nonnull == MATCH_TYPE) + return "OptionalMatch_t"; auto s = Match(nonnull, StructType); return CORD_all(namespace_prefix(s->env, s->env->namespace->parent), "$Optional", s->name, "_t"); } @@ -422,7 +424,7 @@ CORD optional_into_nonnone(type_t *t, CORD value) case IntType: return CORD_all(value, ".i"); case StructType: - if (t == THREAD_TYPE) + if (t == THREAD_TYPE || t == MATCH_TYPE) return value; return CORD_all(value, ".value"); default: @@ -436,6 +438,8 @@ CORD check_none(type_t *t, CORD value) if (t->tag == PointerType || t->tag == FunctionType || t->tag == CStringType || t->tag == ChannelType || t == THREAD_TYPE) return CORD_all("(", value, " == NULL)"); + else if (t == MATCH_TYPE) + return CORD_all("((", value, ").index.small == 0)"); else if (t->tag == BigIntType) return CORD_all("((", value, ").small == 0)"); else if (t->tag == ClosureType) diff --git a/docs/text.md b/docs/text.md index 96d0f7d..fd37d4d 100644 --- a/docs/text.md +++ b/docs/text.md @@ -457,6 +457,101 @@ indices are counted from the back of the text, so `-1` means the last cluster, --- +## `by_line` + +**Description:** +Returns an iterator function that can be used to iterate over the lines in a +text. + +**Signature:** +```tomo +func by_line(text: Text -> func(->Text?)) +``` + +**Parameters:** + +- `text`: The text to be iterated over, line by line. + +**Returns:** +An iterator function that returns one line at a time, until it runs out and +returns `none`. **Note:** this function ignores a trailing newline if there is +one. If you don't want this behavior, use `text:by_split($/{1 nl}/)` instead. 
+ +**Example:** +```tomo +text := " + line one + line two +" +for line in text:by_line(): + # Prints: "line one" then "line two": + say(line) +``` + +--- + +## `by_match` + +**Description:** +Returns an iterator function that can be used to iterate over the occurrences +of a pattern in a text. + +**Signature:** +```tomo +func by_match(text: Text, pattern: Pattern -> func(->Match?)) +``` + +**Parameters:** + +- `text`: The text to be iterated over looking for matches. +- `pattern`: The pattern to look for. + +**Returns:** +An iterator function that returns one match result at a time, until it runs out +and returns `none`. **Note:** if a zero-length match is found, the iterator +will return it exactly once. Matches are always non-overlapping. + +**Example:** +```tomo +text := "one two three" +for match in text:by_match($/{alpha}/): + # Prints: "one" then "two" then "three": + say(match.text) +``` + +--- + +## `by_split` + +**Description:** +Returns an iterator function that can be used to iterate over text separated by +a pattern. + +**Signature:** +```tomo +func by_split(text: Text, pattern: Pattern = $// -> func(->Text?)) +``` + +**Parameters:** + +- `text`: The text to be iterated over in pattern-delimited chunks. +- `pattern`: The pattern to split the text on. + +**Returns:** +An iterator function that returns one chunk of text at a time, separated by the +given pattern, until it runs out and returns `none`. **Note:** using an empty +pattern (the default) will iterate over single grapheme clusters in the text. 
+
+**Example:**
+```tomo
+text := "one,two,three"
+for chunk in text:by_split($/,/):
+    # Prints: "one" then "two" then "three":
+    say(chunk)
+```
+
+---
+
 ## `bytes`
 
 **Description:**
diff --git a/environment.c b/environment.c
index 3153a89..8c207d1 100644
--- a/environment.c
+++ b/environment.c
@@ -393,6 +393,9 @@ env_t *new_compilation_unit(CORD libname)
         {"Text", TEXT_TYPE, "Text_t", "Text$info", TypedArray(ns_entry_t,
             {"as_c_string", "Text$as_c_string", "func(text:Text -> CString)"},
             {"at", "Text$cluster", "func(text:Text, index:Int -> Text)"},
+            {"by_line", "Text$by_line", "func(text:Text -> func(->Text?))"},
+            {"by_match", "Text$by_match", "func(text:Text, pattern:Pattern -> func(->Match?))"},
+            {"by_split", "Text$by_split", "func(text:Text, pattern=$Pattern'' -> func(->Text?))"},
             {"bytes", "Text$utf8_bytes", "func(text:Text -> [Byte])"},
             {"codepoint_names", "Text$codepoint_names", "func(text:Text -> [Text])"},
             {"ends_with", "Text$ends_with", "func(text,suffix:Text -> Bool)"},
diff --git a/stdlib/patterns.c b/stdlib/patterns.c
index 66855d2..48f43ae 100644
--- a/stdlib/patterns.c
+++ b/stdlib/patterns.c
@@ -873,6 +873,33 @@ public Array_t Text$find_all(Text_t text, Pattern_t pattern)
     return matches;
 }
 
+typedef struct { // State captured by the closure that Text$by_match returns
+    TextIter_t state; // Seeded with the text; next_match reads it back via state.text
+    Int_t i; // Position passed to Text$find for the next search (begins at 1)
+    Pattern_t pattern; // Pattern being searched for
+} match_iter_state_t;
+// Iterator step: return the next Text$find result and move the position past it.
+static OptionalMatch_t next_match(match_iter_state_t *state)
+{
+    if (Int_to_Int64(state->i, false) > state->state.text.length) // position is past the end: exhausted
+        return NONE_MATCH;
+
+    OptionalMatch_t m = Text$find(state->state.text, state->pattern, state->i);
+    if (m.index.small == 0) // No match: jump past the end so every later call returns NONE_MATCH
+        state->i = I(state->state.text.length + 1);
+    else // Advance by at least one position so a zero-length match is not returned twice
+        state->i = Int$plus(m.index, I(MAX(1, m.text.length)));
+    return m;
+}
+// Build the iterator closure: next_match is the function and a fresh match_iter_state_t (position 1) is the userdata.
+public Closure_t Text$by_match(Text_t text, Pattern_t pattern)
+{
+    return (Closure_t){
+        .fn=(void*)next_match,
+        .userdata=new(match_iter_state_t, .state={text, 0, 0}, .i=I_small(1), .pattern=pattern),
+    };
+}
+
 static Text_t apply_backrefs(Text_t text, Pattern_t
original_pattern, Text_t replacement, Pattern_t backref_pat, capture_t *captures)
 {
     if (backref_pat.length == 0)
@@ -1145,10 +1172,12 @@ public Array_t Text$split(Text_t text, Pattern_t pattern)
     for (;;) {
         int64_t len = 0;
         int64_t found = _find(text, pattern, i, text.length-1, &len, NULL);
+        if (found == i && len == 0)
+            found = _find(text, pattern, i + 1, text.length-1, &len, NULL);
         if (found < 0) break;
         Text_t chunk = Text$slice(text, I(i+1), I(found));
         Array$insert(&chunks, &chunk, I_small(0), sizeof(Text_t));
-        i = found + MAX(len, 1);
+        i = MAX(found + len, i + 1);
     }
 
     Text_t last_chunk = Text$slice(text, I(i+1), I(text.length));
@@ -1157,6 +1186,53 @@ public Array_t Text$split(Text_t text, Pattern_t pattern)
     return chunks;
 }
 
+typedef struct { // State captured by the closure that Text$by_split returns
+    TextIter_t state; // Seeded with the text; next_split reads it back via state.text
+    int64_t i; // Offset where the next chunk starts (begins at 0)
+    Pattern_t pattern; // Delimiter pattern; length 0 selects the one-cluster-per-call mode
+} split_iter_state_t;
+// Iterator step: return the next chunk of the text, or NONE_TEXT when there is nothing left.
+static OptionalText_t next_split(split_iter_state_t *state)
+{
+    Text_t text = state->state.text;
+    if (state->i >= text.length) {
+        if (state->pattern.length > 0 && state->i == text.length) { // special case: cursor sits exactly at the end, so emit one empty chunk
+            state->i = text.length + 1; // step past the end so the next call returns NONE_TEXT
+            return (Text_t){.length=0};
+        }
+        return NONE_TEXT;
+    }
+
+    if (state->pattern.length == 0) { // special case: empty pattern yields one Text$cluster result per call
+        Text_t ret = Text$cluster(text, I(state->i+1));
+        state->i += 1;
+        return ret;
+    }
+
+    int64_t start = state->i;
+    int64_t len = 0;
+    int64_t found = _find(text, state->pattern, start, text.length-1, &len, NULL);
+    // A zero-length match exactly at the chunk start is skipped: search again from one position later.
+    if (found == start && len == 0)
+        found = _find(text, state->pattern, start + 1, text.length-1, &len, NULL);
+    // On a match, return the slice up to it and resume after it (always at least one position forward); otherwise return the remainder and finish.
+    if (found >= 0) {
+        state->i = MAX(found + len, state->i + 1);
+        return Text$slice(text, I(start+1), I(found));
+    } else {
+        state->i = state->state.text.length + 1;
+        return Text$slice(text, I(start+1), I(text.length));
+    }
+}
+// Build the iterator closure: next_split is the function and a fresh split_iter_state_t (offset 0) is the userdata.
+public Closure_t Text$by_split(Text_t text, Pattern_t pattern)
+{
+    return (Closure_t){
+        .fn=(void*)next_split,
+        .userdata=new(split_iter_state_t, .state={text, 0, 0}, .i=0, .pattern=pattern),
+    };
+}
+
 public const TypeInfo_t Pattern$info = {
.size=sizeof(Pattern_t),
     .align=__alignof__(Pattern_t),
diff --git a/stdlib/patterns.h b/stdlib/patterns.h
index 641009a..494c833 100644
--- a/stdlib/patterns.h
+++ b/stdlib/patterns.h
@@ -27,9 +27,11 @@ Text_t Text$replace(Text_t str, Pattern_t pat, Text_t replacement, Pattern_t bac
 Pattern_t Pattern$escape_text(Text_t text);
 Text_t Text$replace_all(Text_t text, Table_t replacements, Pattern_t backref_pat, bool recursive);
 Array_t Text$split(Text_t text, Pattern_t pattern);
+Closure_t Text$by_split(Text_t text, Pattern_t pattern);
 Text_t Text$trim(Text_t text, Pattern_t pattern, bool trim_left, bool trim_right);
 OptionalMatch_t Text$find(Text_t text, Pattern_t pattern, Int_t i);
 Array_t Text$find_all(Text_t text, Pattern_t pattern);
+Closure_t Text$by_match(Text_t text, Pattern_t pattern);
 PUREFUNC bool Text$has(Text_t text, Pattern_t pattern);
 OptionalArray_t Text$matches(Text_t text, Pattern_t pattern);
 Text_t Text$map(Text_t text, Pattern_t pattern, Closure_t fn);
diff --git a/stdlib/text.c b/stdlib/text.c
index 57a98e8..65d4150 100644
--- a/stdlib/text.c
+++ b/stdlib/text.c
@@ -1422,6 +1422,54 @@ public Array_t Text$lines(Text_t text)
     return lines;
 }
 
+// Iteration state for the closure returned by Text$by_line.
+typedef struct {
+    TextIter_t state; // Seeded with the text; also the cursor passed to Text$get_grapheme_fast
+    int64_t i; // Offset where the next line starts (begins at 0)
+} line_iter_state_t;
+
+// Iterator step: return the next line without its "\n" or "\r\n" terminator, or
+// NONE_TEXT once the text is used up. After a final newline has been consumed the
+// loop below has nothing left to scan, so no extra empty line is produced.
+// NOTE(review): Text$lines is only partly visible in this patch; confirm it treats a
+// one-grapheme last line the same way as this iterator does.
+static OptionalText_t next_line(line_iter_state_t *state)
+{
+    Text_t text = state->state.text;
+    for (int64_t i = state->i; i < text.length; i++) {
+        int32_t grapheme = Text$get_grapheme_fast(&state->state, i);
+        // Look ahead for the LF of a CRLF pair only while i + 1 is still inside the text;
+        // what Text$get_grapheme_fast does with an index past the end is not visible here.
+        if (grapheme == '\r' && i + 1 < text.length
+            && Text$get_grapheme_fast(&state->state, i + 1) == '\n') { // CRLF
+            Text_t line = Text$slice(text, I(state->i+1), I(i));
+            state->i = i + 2; // skip one extra for CR
+            return line;
+        } else if (grapheme == '\n') { // newline
+            Text_t line = Text$slice(text, I(state->i+1), I(i));
+            state->i = i + 1;
+            return line;
+        } else if (i == text.length-1) { // last line, not newline-terminated
+            // Deliberately no `state->i != i` condition: a last line made of a single
+            // grapheme (the "b" in "a\nb", or the whole text "a") must still be returned.
+            Text_t line = Text$slice(text, I(state->i+1), I(i+1));
+            state->i = i + 1;
+            return line;
+        }
+    }
+    return NONE_TEXT;
+}
+
+// Build the iterator closure: next_line is the function and a fresh
+// line_iter_state_t (starting at offset 0) is the userdata.
+public Closure_t Text$by_line(Text_t text)
+{
+    return (Closure_t){
+        .fn=(void*)next_line,
+        .userdata=new(line_iter_state_t, .state={text, 0, 0}, .i=0),
+    };
+}
+
 PUREFUNC public bool Text$is_none(const void *t, const TypeInfo_t*)
 {
     return ((Text_t*)t)->length < 0;
diff --git a/stdlib/text.h b/stdlib/text.h
index f7785dd..79c094a 100644
--- a/stdlib/text.h
+++ b/stdlib/text.h
@@ -59,6 +59,7 @@ Text_t Text$from_codepoints(Array_t codepoints);
 OptionalText_t Text$from_codepoint_names(Array_t codepoint_names);
 OptionalText_t Text$from_bytes(Array_t bytes);
 Array_t Text$lines(Text_t text);
+Closure_t Text$by_line(Text_t text);
 Text_t Text$join(Text_t glue, Array_t pieces);
 Text_t Text$repeat(Text_t text, Int_t count);
 int32_t Text$get_grapheme_fast(TextIter_t *state, int64_t index);