diff options
| author | Bruce Hill <bruce@bruce-hill.com> | 2025-04-01 14:05:10 -0400 |
|---|---|---|
| committer | Bruce Hill <bruce@bruce-hill.com> | 2025-04-01 14:05:10 -0400 |
| commit | 4d59fc2987e52da0274e6b204a9d2885613f74b7 (patch) | |
| tree | 8c262f99cb6ae9b550b9f8abf0ab0477044d087a /examples | |
| parent | 7a2c99de74f5870e1dea5b59d049678ad0ef8e44 (diff) | |
Move patterns into a module
Diffstat (limited to 'examples')
| -rw-r--r-- | examples/colorful/README.md | 29 | ||||
| -rw-r--r-- | examples/colorful/colorful.tm | 2 | ||||
| -rw-r--r-- | examples/game/world.tm | 4 | ||||
| -rw-r--r-- | examples/patterns/README.md | 152 | ||||
| -rw-r--r-- | examples/patterns/match_type.h | 8 | ||||
| -rw-r--r-- | examples/patterns/patterns.c | 1291 | ||||
| -rw-r--r-- | examples/patterns/patterns.tm | 58 |
7 files changed, 1512 insertions, 32 deletions
diff --git a/examples/colorful/README.md b/examples/colorful/README.md index 5f6ef8a4..b504a730 100644 --- a/examples/colorful/README.md +++ b/examples/colorful/README.md @@ -62,32 +62,3 @@ $Colorful" We have @(green,bold:colors)! ":print() ``` - -You can very easily introduce your own syntax highlighting for a custom DSL: - -```tomo -lang Markdown: - func Colorful(md:Markdown -> Colorful): - text := md.text:replace_all({ - $/@/="@(at)", - $/(/="@(lparen)", - $/)/="@(rparen)", - $/**{..}**/="@(b:\1)", - $/*{..}*/="@(i:\1)", - $/[?](?)/="@(blue,underline:\1)", - }) - return Colorful.from_text(text) - - func colorful(md:Markdown -> Colorful): - return $Colorful"$md" -... - -md := $Markdown" - This is [a link with **bold** inside](example.com)! -" ->> colorful := md:colorful() -= $Colorful"This is @(blue,underline:a link with @(b:bold) inside)!" ->> colorful:for_terminal() -= "$\e[mThis is $\e[4;34ma link with $\e[1mbold$\e[22m inside$\e[24;39m!" -colorful:print() -``` diff --git a/examples/colorful/colorful.tm b/examples/colorful/colorful.tm index 929c2409..e138e037 100644 --- a/examples/colorful/colorful.tm +++ b/examples/colorful/colorful.tm @@ -8,7 +8,7 @@ CSI := "$\033[" lang Colorful: convert(text:Text -> Colorful): - text = text:replace_all({$/@/="@(at)", $/(/="@(lparen)", $/)/="@(rparen)"}) + text = text:translate({"@"="@(at)", "("="@(lparen)", ")"="@(rparen)"}) return Colorful.from_text(text) convert(i:Int -> Colorful): return Colorful.from_text("$i") diff --git a/examples/game/world.tm b/examples/game/world.tm index 58fcd4fa..809f1f80 100644 --- a/examples/game/world.tm +++ b/examples/game/world.tm @@ -68,8 +68,8 @@ struct World(player:@Player, goal:@Box, boxes:@[@Box], dt_accum=Num32(0.0), won= DrawText(CString("WINNER"), GetScreenWidth()/Int32(2)-Int32(48*3), GetScreenHeight()/Int32(2)-Int32(24), 48, Color(0,0,0)) func load_map(w:@World, map:Text): - if map:has($/[]/): - map = map:replace_all({$/[]/="#", $/@{1..}/="@", $/ /=" "}) + if map:has("[]"): + map = map:translate({"[]"="#", "@ "="@", " "=" "}) w.boxes = @[:@Box] box_size := Vector2(50., 50.) for y,line in map:lines(): diff --git a/examples/patterns/README.md b/examples/patterns/README.md new file mode 100644 index 00000000..728b978e --- /dev/null +++ b/examples/patterns/README.md @@ -0,0 +1,152 @@ +# Text Pattern Matching + +As an alternative to full regular expressions, Tomo provides a limited string +matching pattern syntax that is intended to solve 80% of use cases in under 1% +of the code size (PCRE's codebase is roughly 150k lines of code, and Tomo's +pattern matching code is a bit under 1k lines of code). Tomo's pattern matching +syntax is highly readable and works well for matching literal text without +getting [leaning toothpick syndrome](https://en.wikipedia.org/wiki/Leaning_toothpick_syndrome). + +For more advanced use cases, consider linking against a C library for regular +expressions or pattern matching. + +`Pattern` is a [domain-specific language](docs/langs.md), in other words, it's +like a `Text`, but it has a distinct type. As a convenience, you can use +`$/.../` to write pattern literals instead of using the general-purpose DSL +syntax of `$Pattern"..."`. + +Patterns are used in a small, but very powerful API that handles many text +functions that would normally be handled by a more extensive API: + +``` +Text.has(pattern:Pattern -> Bool) +Text.each(pattern:Pattern, fn:func(m:Match), recursive=yes -> Text) +Text.find(pattern:Pattern, start=1 -> Match?) +Text.find_all(pattern:Pattern -> [Match]) +Text.matches(pattern:Pattern -> [Text]?) +Text.map(pattern:Pattern, fn:func(m:Match -> Text), recursive=yes -> Text) +Text.replace(pattern:Pattern, replacement:Text, placeholder:Pattern=$//, recursive=yes -> [Text]) +Text.replace_all(replacements:{Pattern,Text}, placeholder:Pattern=$//, recursive=yes -> [Text]) +Text.split(pattern:Pattern -> [Text]) +Text.trim(pattern=$/{whitespace}/, trim_left=yes, trim_right=yes -> [Text]) +``` + +## Matches + +Pattern matching functions work with a type called `Match` that has three fields: + +- `text`: The full text of the match. +- `index`: The index in the text where the match was found. +- `captures`: An array containing the matching text of each non-literal pattern group. + +See [Text Functions](text.md#Text-Functions) for the full API documentation. + +## Syntax + +Patterns have three types of syntax: + +- `{` followed by an optional count (`n`, `n-m`, or `n+`), followed by an + optional `!` to negate the pattern, followed by an optional pattern name or + Unicode character name, followed by a required `}`. + +- Any matching pair of quotes or parentheses or braces with a `?` in the middle + (e.g. `"?"` or `(?)`). + +- Any other character is treated as a literal to be matched exactly. + +## Named Patterns + +Named patterns match certain pre-defined patterns that are commonly useful. To +use a named pattern, use the syntax `{name}`. Names are case-insensitive and +mostly ignore spaces, underscores, and dashes. + +- `..` - Any character (note that a single `.` would mean the literal period + character). +- `digit` - A unicode digit +- `email` - an email address +- `emoji` - an emoji +- `end` - the very end of the text +- `id` - A unicode identifier +- `int` - One or more digits with an optional `-` (minus sign) in front +- `ip` - an IP address (IPv4 or IPv6) +- `ipv4` - an IPv4 address +- `ipv6` - an IPv6 address +- `nl`/`newline`/`crlf` - A line break (either `\r\n` or `\n`) +- `num` - One or more digits with an optional `-` (minus sign) in front and an optional `.` and more digits after +- `start` - the very start of the text +- `uri` - a URI +- `url` - a URL (URI that specifically starts with `http://`, `https://`, `ws://`, `wss://`, or `ftp://`) +- `word` - A unicode identifier (same as `id`) + +For non-alphabetic characters, any single character is treated as matching +exactly that character. For example, `{1{}` matches exactly one `{` +character. Or, `{1.}` matches exactly one `.` character. + +Patterns can also use any Unicode property name. Some helpful ones are: + +- `hex` - Hexidecimal digits +- `lower` - Lowercase letters +- `space` - The space character +- `upper` - Uppercase letters +- `whitespace` - Whitespace characters + +Patterns may also use exact Unicode codepoint names. For example: `{1 latin +small letter A}` matches `a`. + +## Negating Patterns + +If an exclamation mark (`!`) is placed before a pattern's name, then characters +are matched only when they _don't_ match the pattern. For example, `{!alpha}` +will match all characters _except_ alphabetic ones. + +## Interpolating Text and Escaping + +To escape a character in a pattern (e.g. if you want to match the literal +character `?`), you can use the syntax `{1 ?}`. This is almost never necessary +unless you have text that looks like a Tomo text pattern and has something like +`{` or `(?)` inside it. + +However, if you're trying to do an exact match of arbitrary text values, you'll +want to have the text automatically escaped. Fortunately, Tomo's injection-safe +DSL text interpolation supports automatic text escaping. This means that if you +use text interpolation with the `$` sign to insert a text value, the value will +be automatically escaped using the `{1 ?}` rule described above: + +```tomo +# Risk of code injection (would cause an error because 'xxx' is not a valid +# pattern name: +>> user_input := get_user_input() += "{xxx}" + +# Interpolation automatically escapes: +>> $/$user_input/ += $/{1{}..xxx}/ + +# This is: `{ 1{ }` (one open brace) followed by the literal text "..xxx}" + +# No error: +>> some_text:find($/$user_input/) += 0 +``` + +If you prefer, you can also use this to insert literal characters: + +```tomo +>> $/literal $"{..}"/ += $/literal {1{}..}/ +``` + +## Repetitions + +By default, named patterns match 1 or more repetitions, but you can specify how +many repetitions you want by putting a number or range of numbers first using +`n` (exactly `n` repetitions), `n-m` (between `n` and `m` repetitions), or `n+` +(`n` or more repetitions): + +``` +{4-5 alpha} +0x{hex} +{4 digit}-{2 digit}-{2 digit} +{2+ space} +{0-1 question mark} +``` diff --git a/examples/patterns/match_type.h b/examples/patterns/match_type.h new file mode 100644 index 00000000..5d063431 --- /dev/null +++ b/examples/patterns/match_type.h @@ -0,0 +1,8 @@ +#pragma once + +typedef struct { + Text_t text; + Int_t index; + Array_t captures; +} XMatch; + diff --git a/examples/patterns/patterns.c b/examples/patterns/patterns.c new file mode 100644 index 00000000..ade68e04 --- /dev/null +++ b/examples/patterns/patterns.c @@ -0,0 +1,1291 @@ +// Logic for text pattern matching + +#include <ctype.h> +#include <sys/param.h> +#include <tomo/tomo.h> +#include <unictype.h> +#include <uniname.h> +#include <unistring/version.h> + +#define MAX_BACKREFS 100 + +typedef struct { + Text_t text; + Int_t index; + Array_t captures; +} PatternMatch; + +typedef struct { + Text_t text; + Int_t index; + Array_t captures; + bool is_none:1; +} OptionalPatternMatch; + +#define NONE_MATCH ((OptionalPatternMatch){.is_none=true}) + +typedef struct { + int64_t index, length; + bool occupied, recursive; +} capture_t; + +typedef struct { + enum { PAT_START, PAT_END, PAT_ANY, PAT_GRAPHEME, PAT_PROPERTY, PAT_QUOTE, PAT_PAIR, PAT_FUNCTION } tag; + bool negated, non_capturing; + int64_t min, max; + union { + int32_t grapheme; + uc_property_t property; + int64_t (*fn)(TextIter_t *, int64_t); + int32_t quote_graphemes[2]; + int32_t pair_graphemes[2]; + }; +} pat_t; + +static Text_t replace_array(Text_t text, Array_t replacements, Text_t backref_pat, bool recursive); + +static INLINE void skip_whitespace(TextIter_t *state, int64_t *i) +{ + while (*i < state->stack[0].text.length) { + int32_t grapheme = Text$get_grapheme_fast(state, *i); + if (grapheme > 0 && !uc_is_property_white_space((ucs4_t)grapheme)) + return; + *i += 1; + } +} + +static INLINE bool match_grapheme(TextIter_t *state, int64_t *i, int32_t grapheme) +{ + if (*i < state->stack[0].text.length && Text$get_grapheme_fast(state, *i) == grapheme) { + *i += 1; + return true; + } + return false; +} + +static INLINE bool match_str(TextIter_t *state, int64_t *i, const char *str) +{ + int64_t matched = 0; + while (matched[str]) { + if (*i + matched >= state->stack[0].text.length || Text$get_grapheme_fast(state, *i + matched) != str[matched]) + return false; + matched += 1; + } + *i += matched; + return true; +} + +static int64_t parse_int(TextIter_t *state, int64_t *i) +{ + int64_t value = 0; + for (;; *i += 1) { + uint32_t grapheme = Text$get_main_grapheme_fast(state, *i); + int digit = uc_digit_value(grapheme); + if (digit < 0) break; + if (value >= INT64_MAX/10) break; + value = 10*value + digit; + } + return value; +} + +static const char *get_property_name(TextIter_t *state, int64_t *i) +{ + skip_whitespace(state, i); + char *name = GC_MALLOC_ATOMIC(UNINAME_MAX); + char *dest = name; + while (*i < state->stack[0].text.length) { + int32_t grapheme = Text$get_grapheme_fast(state, *i); + if (!(grapheme & ~0xFF) && (isalnum(grapheme) || grapheme == ' ' || grapheme == '_' || grapheme == '-')) { + *dest = (char)grapheme; + ++dest; + if (dest >= name + UNINAME_MAX - 1) + break; + } else { + break; + } + *i += 1; + } + + while (dest > name && dest[-1] == ' ') + *(dest--) = '\0'; + + if (dest == name) return NULL; + *dest = '\0'; + return name; +} + +#define EAT1(state, index, cond) ({\ + int32_t grapheme = Text$get_grapheme_fast(state, index); \ + bool success = (cond); \ + if (success) index += 1; \ + success; }) + +#define EAT2(state, index, cond1, cond2) ({\ + int32_t grapheme = Text$get_grapheme_fast(state, index); \ + bool success = (cond1); \ + if (success) { \ + grapheme = Text$get_grapheme_fast(state, index + 1); \ + success = (cond2); \ + if (success) \ + index += 2; \ + } \ + success; }) + + +#define EAT_MANY(state, index, cond) ({ int64_t _n = 0; while (EAT1(state, index, cond)) { _n += 1; } _n; }) + +static int64_t match_email(TextIter_t *state, int64_t index) +{ + // email = local "@" domain + // local = 1-64 ([a-zA-Z0-9!#$%&‘*+–/=?^_`.{|}~] | non-ascii) + // domain = dns-label ("." dns-label)* + // dns-label = 1-63 ([a-zA-Z0-9-] | non-ascii) + + if (index > 0) { + uint32_t prev_codepoint = Text$get_main_grapheme_fast(state, index - 1); + if (uc_is_property_alphabetic(prev_codepoint)) + return -1; + } + + int64_t start_index = index; + + // Local part: + int64_t local_len = 0; + static const char *allowed_local = "!#$%&‘*+–/=?^_`.{|}~"; + while (EAT1(state, index, + (grapheme & ~0x7F) || isalnum((char)grapheme) || strchr(allowed_local, (char)grapheme))) { + local_len += 1; + if (local_len > 64) return -1; + } + + if (!EAT1(state, index, grapheme == '@')) + return -1; + + // Host + int64_t host_len = 0; + do { + int64_t label_len = 0; + while (EAT1(state, index, + (grapheme & ~0x7F) || isalnum((char)grapheme) || grapheme == '-')) { + label_len += 1; + if (label_len > 63) return -1; + } + + if (label_len == 0) + return -1; + + host_len += label_len; + if (host_len > 255) + return -1; + host_len += 1; + } while (EAT1(state, index, grapheme == '.')); + + return index - start_index; +} + +static int64_t match_ipv6(TextIter_t *state, int64_t index) +{ + if (index > 0) { + int32_t prev_codepoint = Text$get_grapheme_fast(state, index - 1); + if ((prev_codepoint & ~0x7F) && (isxdigit(prev_codepoint) || prev_codepoint == ':')) + return -1; + } + int64_t start_index = index; + const int NUM_CLUSTERS = 8; + bool double_colon_used = false; + for (int cluster = 0; cluster < NUM_CLUSTERS; cluster++) { + for (int digits = 0; digits < 4; digits++) { + if (!EAT1(state, index, ~(grapheme & ~0x7F) && isxdigit((char)grapheme))) + break; + } + if (EAT1(state, index, ~(grapheme & ~0x7F) && isxdigit((char)grapheme))) + return -1; // Too many digits + + if (cluster == NUM_CLUSTERS-1) { + break; + } else if (!EAT1(state, index, grapheme == ':')) { + if (double_colon_used) + break; + return -1; + } + + if (EAT1(state, index, grapheme == ':')) { + if (double_colon_used) + return -1; + double_colon_used = true; + } + } + return index - start_index; +} + +static int64_t match_ipv4(TextIter_t *state, int64_t index) +{ + if (index > 0) { + int32_t prev_codepoint = Text$get_grapheme_fast(state, index - 1); + if ((prev_codepoint & ~0x7F) && (isdigit(prev_codepoint) || prev_codepoint == '.')) + return -1; + } + int64_t start_index = index; + + const int NUM_CLUSTERS = 4; + for (int cluster = 0; cluster < NUM_CLUSTERS; cluster++) { + for (int digits = 0; digits < 3; digits++) { + if (!EAT1(state, index, ~(grapheme & ~0x7F) && isdigit((char)grapheme))) { + if (digits == 0) return -1; + break; + } + } + + if (EAT1(state, index, ~(grapheme & ~0x7F) && isdigit((char)grapheme))) + return -1; // Too many digits + + if (cluster == NUM_CLUSTERS-1) + break; + else if (!EAT1(state, index, grapheme == '.')) + return -1; + } + return (index - start_index); +} + +static int64_t match_ip(TextIter_t *state, int64_t index) +{ + int64_t len = match_ipv6(state, index); + if (len >= 0) return len; + len = match_ipv4(state, index); + return (len >= 0) ? len : -1; +} + +static int64_t match_host(TextIter_t *state, int64_t index) +{ + int64_t ip_len = match_ip(state, index); + if (ip_len > 0) return ip_len; + + int64_t start_index = index; + if (match_grapheme(state, &index, '[')) { + ip_len = match_ip(state, index); + if (ip_len <= 0) return -1; + index += ip_len; + if (match_grapheme(state, &index, ']')) + return (index - start_index); + return -1; + } + + if (!EAT1(state, index, isalpha(grapheme))) + return -1; + + static const char *non_host_chars = "/#?:@ \t\r\n<>[]{}\\^|\"`"; + EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_host_chars, (char)grapheme)); + return (index - start_index); +} + +static int64_t match_authority(TextIter_t *state, int64_t index) +{ + int64_t authority_start = index; + static const char *non_segment_chars = "/#?:@ \t\r\n<>[]{}\\^|\"`."; + + // Optional user@ prefix: + int64_t username_len = EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_segment_chars, (char)grapheme)); + if (username_len < 1 || !EAT1(state, index, grapheme == '@')) + index = authority_start; // No user@ part + + // Host: + int64_t host_len = match_host(state, index); + if (host_len <= 0) return -1; + index += host_len; + + // Port: + if (EAT1(state, index, grapheme == ':')) { + if (EAT_MANY(state, index, !(grapheme & ~0x7F) && isdigit(grapheme)) == 0) + return -1; + } + return (index - authority_start); +} + +static int64_t match_uri(TextIter_t *state, int64_t index) +{ + // URI = scheme ":" ["//" authority] path ["?" query] ["#" fragment] + // scheme = [a-zA-Z] [a-zA-Z0-9+.-] + // authority = [userinfo "@"] host [":" port] + + if (index > 0) { + // Don't match if we're not at a word edge: + uint32_t prev_codepoint = Text$get_main_grapheme_fast(state, index - 1); + if (uc_is_property_alphabetic(prev_codepoint)) + return -1; + } + + int64_t start_index = index; + + // Scheme: + if (!EAT1(state, index, isalpha(grapheme))) + return -1; + EAT_MANY(state, index, !(grapheme & ~0x7F) && (isalnum(grapheme) || grapheme == '+' || grapheme == '.' || grapheme == '-')); + if (!match_grapheme(state, &index, ':')) + return -1; + + // Authority: + int64_t authority_len; + if (match_str(state, &index, "//")) { + authority_len = match_authority(state, index); + if (authority_len > 0) + index += authority_len; + } else { + authority_len = 0; + } + + // Path: + int64_t path_start = index; + if (EAT1(state, index, grapheme == '/') || authority_len <= 0) { + static const char *non_path = " \"#?<>[]{}\\^`|"; + EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_path, (char)grapheme)); + + if (EAT1(state, index, grapheme == '?')) { // Query + static const char *non_query = " \"#<>[]{}\\^`|"; + EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_query, (char)grapheme)); + } + + if (EAT1(state, index, grapheme == '#')) { // Fragment + static const char *non_fragment = " \"#<>[]{}\\^`|"; + EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_fragment, (char)grapheme)); + } + } + + if (authority_len <= 0 && index == path_start) + return -1; + + return index - start_index; +} + +static int64_t match_url(TextIter_t *state, int64_t index) +{ + int64_t lookahead = index; + if (!(match_str(state, &lookahead, "https:") + || match_str(state, &lookahead, "http:") + || match_str(state, &lookahead, "ftp:") + || match_str(state, &lookahead, "wss:") + || match_str(state, &lookahead, "ws:"))) + return -1; + + return match_uri(state, index); +} + +static int64_t match_id(TextIter_t *state, int64_t index) +{ + if (!EAT1(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_XID_START))) + return -1; + return 1 + EAT_MANY(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_XID_CONTINUE)); +} + +static int64_t match_int(TextIter_t *state, int64_t index) +{ + int64_t negative = EAT1(state, index, grapheme == '-') ? 1 : 0; + int64_t len = EAT_MANY(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT)); + return len > 0 ? negative + len : -1; +} + +static int64_t match_alphanumeric(TextIter_t *state, int64_t index) +{ + return EAT1(state, index, uc_is_property_alphabetic((ucs4_t)grapheme) || uc_is_property_numeric((ucs4_t)grapheme)) + ? 1 : -1; +} + +static int64_t match_num(TextIter_t *state, int64_t index) +{ + bool negative = EAT1(state, index, grapheme == '-') ? 1 : 0; + int64_t pre_decimal = EAT_MANY(state, index, + uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT)); + bool decimal = (EAT1(state, index, grapheme == '.') == 1); + int64_t post_decimal = decimal ? EAT_MANY(state, index, + uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT)) : 0; + if (pre_decimal == 0 && post_decimal == 0) + return -1; + return negative + pre_decimal + decimal + post_decimal; +} + +static int64_t match_newline(TextIter_t *state, int64_t index) +{ + if (index >= state->stack[0].text.length) + return -1; + + uint32_t grapheme = index >= state->stack[0].text.length ? 0 : Text$get_main_grapheme_fast(state, index); + if (grapheme == '\n') + return 1; + if (grapheme == '\r' && Text$get_grapheme_fast(state, index + 1) == '\n') + return 2; + return -1; +} + +static int64_t match_pat(TextIter_t *state, int64_t index, pat_t pat) +{ + Text_t text = state->stack[0].text; + int32_t grapheme = index >= text.length ? 0 : Text$get_grapheme_fast(state, index); + + switch (pat.tag) { + case PAT_START: { + if (index == 0) + return pat.negated ? -1 : 0; + return pat.negated ? 0 : -1; + } + case PAT_END: { + if (index >= text.length) + return pat.negated ? -1 : 0; + return pat.negated ? 0 : -1; + } + case PAT_ANY: { + assert(!pat.negated); + return (index < text.length) ? 1 : -1; + } + case PAT_GRAPHEME: { + if (index >= text.length) + return -1; + else if (grapheme == pat.grapheme) + return pat.negated ? -1 : 1; + return pat.negated ? 1 : -1; + } + case PAT_PROPERTY: { + if (index >= text.length) + return -1; + else if (uc_is_property((ucs4_t)grapheme, pat.property)) + return pat.negated ? -1 : 1; + return pat.negated ? 1 : -1; + } + case PAT_PAIR: { + // Nested punctuation: (?), [?], etc + if (index >= text.length) + return -1; + + int32_t open = pat.pair_graphemes[0]; + if (grapheme != open) + return pat.negated ? 1 : -1; + + int32_t close = pat.pair_graphemes[1]; + int64_t depth = 1; + int64_t match_len = 1; + for (; depth > 0; match_len++) { + if (index + match_len >= text.length) + return pat.negated ? 1 : -1; + + int32_t c = Text$get_grapheme_fast(state, index + match_len); + if (c == open) + depth += 1; + else if (c == close) + depth -= 1; + } + return pat.negated ? -1 : match_len; + } + case PAT_QUOTE: { + // Nested quotes: "?", '?', etc + if (index >= text.length) + return -1; + + int32_t open = pat.quote_graphemes[0]; + if (grapheme != open) + return pat.negated ? 1 : -1; + + int32_t close = pat.quote_graphemes[1]; + for (int64_t i = index + 1; i < text.length; i++) { + int32_t c = Text$get_grapheme_fast(state, i); + if (c == close) { + return pat.negated ? -1 : (i - index) + 1; + } else if (c == '\\' && index + 1 < text.length) { + i += 1; // Skip ahead an extra step + } + } + return pat.negated ? 1 : -1; + } + case PAT_FUNCTION: { + int64_t match_len = pat.fn(state, index); + if (match_len >= 0) + return pat.negated ? -1 : match_len; + return pat.negated ? 1 : -1; + } + default: errx(1, "Invalid pattern"); + } + errx(1, "Unreachable"); +} + +static pat_t parse_next_pat(TextIter_t *state, int64_t *index) +{ + if (EAT2(state, *index, + uc_is_property((ucs4_t)grapheme, UC_PROPERTY_QUOTATION_MARK), + grapheme == '?')) { + // Quotations: "?", '?', etc + int32_t open = Text$get_grapheme_fast(state, *index-2); + int32_t close = open; + uc_mirror_char((ucs4_t)open, (ucs4_t*)&close); + if (!match_grapheme(state, index, close)) + fail("Pattern's closing quote is missing: ", state->stack[0].text); + + return (pat_t){ + .tag=PAT_QUOTE, + .min=1, .max=1, + .quote_graphemes={open, close}, + }; + } else if (EAT2(state, *index, + uc_is_property((ucs4_t)grapheme, UC_PROPERTY_PAIRED_PUNCTUATION), + grapheme == '?')) { + // Nested punctuation: (?), [?], etc + int32_t open = Text$get_grapheme_fast(state, *index-2); + int32_t close = open; + uc_mirror_char((ucs4_t)open, (ucs4_t*)&close); + if (!match_grapheme(state, index, close)) + fail("Pattern's closing brace is missing: ", state->stack[0].text); + + return (pat_t){ + .tag=PAT_PAIR, + .min=1, .max=1, + .pair_graphemes={open, close}, + }; + } else if (EAT1(state, *index, grapheme == '{')) { // named patterns {id}, {2-3 hex}, etc. + skip_whitespace(state, index); + int64_t min, max; + if (uc_is_digit((ucs4_t)Text$get_grapheme_fast(state, *index))) { + min = parse_int(state, index); + skip_whitespace(state, index); + if (match_grapheme(state, index, '+')) { + max = INT64_MAX; + } else if (match_grapheme(state, index, '-')) { + max = parse_int(state, index); + } else { + max = min; + } + if (min > max) fail("Minimum repetitions (", min, ") is less than the maximum (", max, ")"); + } else { + min = -1, max = -1; + } + + skip_whitespace(state, index); + + bool negated = match_grapheme(state, index, '!'); +#define PAT(_tag, ...) ((pat_t){.min=min, .max=max, .negated=negated, .tag=_tag, __VA_ARGS__}) + const char *prop_name; + if (match_str(state, index, "..")) + prop_name = ".."; + else + prop_name = get_property_name(state, index); + + if (!prop_name) { + // Literal character, e.g. {1?} + skip_whitespace(state, index); + int32_t grapheme = Text$get_grapheme_fast(state, (*index)++); + if (!match_grapheme(state, index, '}')) + fail("Missing closing '}' in pattern: ", state->stack[0].text); + return PAT(PAT_GRAPHEME, .grapheme=grapheme); + } else if (strlen(prop_name) == 1) { + // Single letter names: {1+ A} + skip_whitespace(state, index); + if (!match_grapheme(state, index, '}')) + fail("Missing closing '}' in pattern: ", state->stack[0].text); + return PAT(PAT_GRAPHEME, .grapheme=prop_name[0]); + } + + skip_whitespace(state, index); + if (!match_grapheme(state, index, '}')) + fail("Missing closing '}' in pattern: ", state->stack[0].text); + + switch (tolower(prop_name[0])) { + case '.': + if (prop_name[1] == '.') { + if (negated) + return ((pat_t){.tag=PAT_END, .min=min, .max=max, .non_capturing=true}); + else + return PAT(PAT_ANY); + } + break; + case 'a': + if (strcasecmp(prop_name, "authority") == 0) { + return PAT(PAT_FUNCTION, .fn=match_authority); + } else if (strcasecmp(prop_name, "alphanum") == 0 || strcasecmp(prop_name, "anum") == 0 + || strcasecmp(prop_name, "alphanumeric") == 0) { + return PAT(PAT_FUNCTION, .fn=match_alphanumeric); + } + break; + case 'c': + if (strcasecmp(prop_name, "crlf") == 0) + return PAT(PAT_FUNCTION, .fn=match_newline); + break; + case 'd': + if (strcasecmp(prop_name, "digit") == 0) { + return PAT(PAT_PROPERTY, .property=UC_PROPERTY_DECIMAL_DIGIT); + } + break; + case 'e': + if (strcasecmp(prop_name, "end") == 0) { + return PAT(PAT_END, .non_capturing=!negated); + } else if (strcasecmp(prop_name, "email") == 0) { + return PAT(PAT_FUNCTION, .fn=match_email); + } +#if _LIBUNISTRING_VERSION >= 0x0100000 + else if (strcasecmp(prop_name, "emoji") == 0) { + return PAT(PAT_PROPERTY, .property=UC_PROPERTY_EMOJI); + } +#endif + break; + case 'h': + if (strcasecmp(prop_name, "host") == 0) { + return PAT(PAT_FUNCTION, .fn=match_host); + } + break; + case 'i': + if (strcasecmp(prop_name, "id") == 0) { + return PAT(PAT_FUNCTION, .fn=match_id); + } else if (strcasecmp(prop_name, "int") == 0) { + return PAT(PAT_FUNCTION, .fn=match_int); + } else if (strcasecmp(prop_name, "ipv4") == 0) { + return PAT(PAT_FUNCTION, .fn=match_ipv4); + } else if (strcasecmp(prop_name, "ipv6") == 0) { + return PAT(PAT_FUNCTION, .fn=match_ipv6); + } else if (strcasecmp(prop_name, "ip") == 0) { + return PAT(PAT_FUNCTION, .fn=match_ip); + } + break; + case 'n': + if (strcasecmp(prop_name, "nl") == 0 || strcasecmp(prop_name, "newline") == 0) { + return PAT(PAT_FUNCTION, .fn=match_newline); + } else if (strcasecmp(prop_name, "num") == 0) { + return PAT(PAT_FUNCTION, .fn=match_num); + } + break; + case 's': + if (strcasecmp(prop_name, "start") == 0) { + return PAT(PAT_START, .non_capturing=!negated); + } + break; + case 'u': + if (strcasecmp(prop_name, "uri") == 0) { + return PAT(PAT_FUNCTION, .fn=match_uri); + } else if (strcasecmp(prop_name, "url") == 0) { + return PAT(PAT_FUNCTION, .fn=match_url); + } + break; + case 'w': + if (strcasecmp(prop_name, "word") == 0) { + return PAT(PAT_FUNCTION, .fn=match_id); + } + break; + default: break; + } + + uc_property_t prop = uc_property_byname(prop_name); + if (uc_property_is_valid(prop)) + return PAT(PAT_PROPERTY, .property=prop); + + ucs4_t grapheme = unicode_name_character(prop_name); + if (grapheme == UNINAME_INVALID) + fail("Not a valid property or character name: ", prop_name); + return PAT(PAT_GRAPHEME, .grapheme=(int32_t)grapheme); +#undef PAT + } else { + return (pat_t){.tag=PAT_GRAPHEME, .non_capturing=true, .min=1, .max=1, .grapheme=Text$get_grapheme_fast(state, (*index)++)}; + } +} + +static int64_t match(Text_t text, int64_t text_index, Text_t pattern, int64_t pattern_index, capture_t *captures, int64_t capture_index) +{ + if (pattern_index >= pattern.length) // End of the pattern + return 0; + + int64_t start_index = text_index; + TextIter_t pattern_state = NEW_TEXT_ITER_STATE(pattern), text_state = NEW_TEXT_ITER_STATE(text); + pat_t pat = parse_next_pat(&pattern_state, &pattern_index); + + if (pat.min == -1 && pat.max == -1) { + if (pat.tag == PAT_ANY && pattern_index >= pattern.length) { + pat.min = pat.max = MAX(1, text.length - text_index); + } else { + pat.min = 1; + pat.max = INT64_MAX; + } + } + + int64_t capture_start = text_index; + int64_t count = 0, capture_len = 0, next_match_len = 0; + + if (pat.tag == PAT_ANY && pattern_index >= pattern.length) { + int64_t remaining = text.length - text_index; + capture_len = remaining >= pat.min ? MIN(remaining, pat.max) : -1; + text_index += capture_len; + goto success; + } + + if (pat.min == 0 && pattern_index < pattern.length) { + next_match_len = match(text, text_index, pattern, pattern_index, captures, capture_index + (pat.non_capturing ? 0 : 1)); + if (next_match_len >= 0) { + capture_len = 0; + goto success; + } + } + + while (count < pat.max) { + int64_t match_len = match_pat(&text_state, text_index, pat); + if (match_len < 0) + break; + capture_len += match_len; + text_index += match_len; + count += 1; + + if (pattern_index < pattern.length) { // More stuff after this + if (count < pat.min) + next_match_len = -1; + else + next_match_len = match(text, text_index, pattern, pattern_index, captures, capture_index + (pat.non_capturing ? 0 : 1)); + } else { + next_match_len = 0; + } + + if (match_len == 0) { + if (next_match_len >= 0) { + // If we're good to go, no need to keep re-matching zero-length + // matches till we hit max: + count = pat.max; + break; + } else { + return -1; + } + } + + if (pattern_index < pattern.length && next_match_len >= 0) + break; // Next guy exists and wants to stop here + + if (text_index >= text.length) + break; + } + + if (count < pat.min || next_match_len < 0) + return -1; + + success: + if (captures && capture_index < MAX_BACKREFS && !pat.non_capturing) { + if (pat.tag == PAT_PAIR || pat.tag == PAT_QUOTE) { + assert(capture_len > 0); + captures[capture_index] = (capture_t){ + .index=capture_start + 1, // Skip leading quote/paren + .length=capture_len - 2, // Skip open/close + .occupied=true, + .recursive=(pat.tag == PAT_PAIR), + }; + } else { + captures[capture_index] = (capture_t){ + .index=capture_start, + .length=capture_len, + .occupied=true, + .recursive=false, + }; + } + } + return (text_index - start_index) + next_match_len; +} + +#undef EAT1 +#undef EAT2 +#undef EAT_MANY + +static int64_t _find(Text_t text, Text_t pattern, int64_t first, int64_t last, int64_t *match_length, capture_t *captures) +{ + int32_t first_grapheme = Text$get_grapheme(pattern, 0); + bool find_first = (first_grapheme != '{' + && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK) + && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); + + TextIter_t text_state = NEW_TEXT_ITER_STATE(text); + for (int64_t i = first; i <= last; i++) { + // Optimization: quickly skip ahead to first char in pattern: + if (find_first) { + while (i < text.length && Text$get_grapheme_fast(&text_state, i) != first_grapheme) + ++i; + } + + int64_t m = match(text, i, pattern, 0, captures, 0); + if (m >= 0) { + if (match_length) + *match_length = m; + return i; + } + } + if (match_length) + *match_length = -1; + return -1; +} + +static OptionalPatternMatch find(Text_t text, Text_t pattern, Int_t from_index) +{ + int64_t first = Int64$from_int(from_index, false); + if (first == 0) fail("Invalid index: 0"); + if (first < 0) first = text.length + first + 1; + if (first > text.length || first < 1) + return NONE_MATCH; + + capture_t captures[MAX_BACKREFS] = {}; + int64_t len = 0; + int64_t found = _find(text, pattern, first-1, text.length-1, &len, captures); + if (found == -1) + return NONE_MATCH; + + Array_t capture_array = {}; + for (int i = 0; captures[i].occupied; i++) { + Text_t capture = Text$slice(text, I(captures[i].index+1), I(captures[i].index+captures[i].length)); + Array$insert(&capture_array, &capture, I(0), sizeof(Text_t)); + } + return (OptionalPatternMatch){ + .text=Text$slice(text, I(found+1), I(found+len)), + .index=I(found+1), + .captures=capture_array, + }; +} + +PUREFUNC static bool Pattern$has(Text_t text, Text_t pattern) +{ + if (Text$starts_with(pattern, Text("{start}"))) { + int64_t m = match(text, 0, pattern, 0, NULL, 0); + return m >= 0; + } else if (Text$ends_with(text, Text("{end}"))) { + for (int64_t i = text.length-1; i >= 0; i--) { + int64_t match_len = match(text, i, pattern, 0, NULL, 0); + if (match_len >= 0 && i + match_len == text.length) + return true; + } + return false; + } else { + int64_t found = _find(text, pattern, 0, text.length-1, NULL, NULL); + return (found >= 0); + } +} + +static OptionalArray_t Pattern$matches(Text_t text, Text_t pattern) +{ + capture_t captures[MAX_BACKREFS] = {}; + int64_t match_len = match(text, 0, pattern, 0, captures, 0); + if (match_len != text.length) + return NONE_ARRAY; + + Array_t capture_array = {}; + for (int i = 0; captures[i].occupied; i++) { + Text_t capture = Text$slice(text, I(captures[i].index+1), I(captures[i].index+captures[i].length)); + Array$insert(&capture_array, &capture, I(0), sizeof(Text_t)); + } + return capture_array; +} + +static Array_t Pattern$find_all(Text_t text, Text_t pattern) +{ + if (pattern.length == 0) // special case + return (Array_t){.length=0}; + + Array_t matches = {}; + for (int64_t i = 1; ; ) { + OptionalPatternMatch m = find(text, pattern, I(i)); + if (m.is_none) + break; + i = Int64$from_int(m.index, false) + m.text.length; + Array$insert(&matches, &m, I_small(0), sizeof(PatternMatch)); + } + return matches; +} + +typedef struct { + TextIter_t state; + Int_t i; + Text_t pattern; +} match_iter_state_t; + +static OptionalPatternMatch next_match(match_iter_state_t *state) +{ + if (Int64$from_int(state->i, false) > state->state.stack[0].text.length) + return NONE_MATCH; + + OptionalPatternMatch m = find(state->state.stack[0].text, state->pattern, state->i); + if (m.is_none) // No match + state->i = I(state->state.stack[0].text.length + 1); + else + state->i = Int$plus(m.index, I(MAX(1, m.text.length))); + return m; +} + +static Closure_t Pattern$by_match(Text_t text, Text_t pattern) +{ + return (Closure_t){ + .fn=(void*)next_match, + .userdata=new(match_iter_state_t, .state=NEW_TEXT_ITER_STATE(text), .i=I_small(1), .pattern=pattern), + }; +} + +static Text_t apply_backrefs(Text_t text, Array_t recursive_replacements, Text_t replacement, Text_t backref_pat, capture_t *captures) +{ + if (backref_pat.length == 0) + return replacement; + + int32_t first_grapheme = Text$get_grapheme(backref_pat, 0); + bool find_first = (first_grapheme != '{' + && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK) + && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); + + Text_t ret = Text(""); + TextIter_t replacement_state = NEW_TEXT_ITER_STATE(replacement); + int64_t nonmatching_pos = 0; + for (int64_t pos = 0; pos < replacement.length; ) { + // Optimization: quickly skip ahead to first char in the backref pattern: + if (find_first) { + while (pos < replacement.length && Text$get_grapheme_fast(&replacement_state, pos) != first_grapheme) + ++pos; + } + + int64_t backref_len = match(replacement, pos, backref_pat, 0, NULL, 0); + if (backref_len < 0) { + pos += 1; + continue; + } + + int64_t after_backref = pos + backref_len; + int64_t backref = parse_int(&replacement_state, &after_backref); + if (after_backref == pos + backref_len) { // Not actually a backref if there's no number + pos += 1; + continue; + } + if (backref < 0 || backref > 9) fail("Invalid backref index: ", backref, " (only 0-", MAX_BACKREFS-1, " are allowed)"); + backref_len = (after_backref - pos); + + if (Text$get_grapheme_fast(&replacement_state, pos + backref_len) == ';') + backref_len += 1; // skip optional semicolon + + if (!captures[backref].occupied) + fail("There is no capture number ", backref, "!"); + + Text_t backref_text = Text$slice(text, I(captures[backref].index+1), I(captures[backref].index + captures[backref].length)); + + if (captures[backref].recursive && recursive_replacements.length > 0) + backref_text = replace_array(backref_text, recursive_replacements, backref_pat, true); + + if (pos > nonmatching_pos) { + Text_t before_slice = Text$slice(replacement, I(nonmatching_pos+1), I(pos)); + ret = Text$concat(ret, before_slice, backref_text); + } else { + ret = Text$concat(ret, backref_text); + } + + pos += backref_len; + nonmatching_pos = pos; + } + if (nonmatching_pos < replacement.length) { + Text_t last_slice = Text$slice(replacement, I(nonmatching_pos+1), I(replacement.length)); + ret = Text$concat(ret, last_slice); + } + return ret; +} + +static Text_t Pattern$replace(Text_t text, Text_t pattern, Text_t replacement, Text_t backref_pat, bool recursive) +{ + Text_t ret = EMPTY_TEXT; + + int32_t first_grapheme = Text$get_grapheme(pattern, 0); + bool find_first = (first_grapheme != '{' + && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK) + && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); + + Text_t entries[2] = {pattern, replacement}; + Array_t replacements = { + .data=entries, + .length=1, + .stride=sizeof(entries), + }; + + TextIter_t text_state = NEW_TEXT_ITER_STATE(text); + int64_t nonmatching_pos = 0; + for (int64_t pos = 0; pos < text.length; ) { + // Optimization: quickly skip ahead to first char in pattern: + if (find_first) { + while (pos < text.length && Text$get_grapheme_fast(&text_state, pos) != first_grapheme) + ++pos; + } + + capture_t captures[MAX_BACKREFS] = {}; + int64_t match_len = match(text, pos, pattern, 0, captures, 1); + if (match_len < 0) { + pos += 1; + continue; + } + captures[0] = (capture_t){ + .index = pos, .length = match_len, + .occupied = true, .recursive = false, + }; + + Text_t replacement_text = apply_backrefs(text, recursive ? replacements : (Array_t){}, replacement, backref_pat, captures); + if (pos > nonmatching_pos) { + Text_t before_slice = Text$slice(text, I(nonmatching_pos+1), I(pos)); + ret = Text$concat(ret, before_slice, replacement_text); + } else { + ret = Text$concat(ret, replacement_text); + } + nonmatching_pos = pos + match_len; + pos += MAX(match_len, 1); + } + if (nonmatching_pos < text.length) { + Text_t last_slice = Text$slice(text, I(nonmatching_pos+1), I(text.length)); + ret = Text$concat(ret, last_slice); + } + return ret; +} + +static Text_t Pattern$trim(Text_t text, Text_t pattern, bool trim_left, bool trim_right) +{ + int64_t first = 0, last = text.length-1; + if (trim_left) { + int64_t match_len = match(text, 0, pattern, 0, NULL, 0); + if (match_len > 0) + first = match_len; + } + + if (trim_right) { + for (int64_t i = text.length-1; i >= first; i--) { + int64_t match_len = match(text, i, pattern, 0, NULL, 0); + if (match_len > 0 && i + match_len == text.length) + last = i-1; + } + } + return Text$slice(text, I(first+1), I(last+1)); +} + +static Text_t Pattern$map(Text_t text, Text_t pattern, Closure_t fn, bool recursive) +{ + Text_t ret = EMPTY_TEXT; + + int32_t first_grapheme = Text$get_grapheme(pattern, 0); + bool find_first = (first_grapheme != '{' + && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK) + && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); + + TextIter_t text_state = NEW_TEXT_ITER_STATE(text); + int64_t nonmatching_pos = 0; + + Text_t (*text_mapper)(PatternMatch, void*) = fn.fn; + for (int64_t pos = 0; pos < text.length; pos++) { + // Optimization: quickly skip ahead to first char in pattern: + if (find_first) { + while (pos < text.length && Text$get_grapheme_fast(&text_state, pos) != first_grapheme) + ++pos; + } + + capture_t captures[MAX_BACKREFS] = {}; + int64_t match_len = match(text, pos, pattern, 0, captures, 0); + if (match_len < 0) continue; + + PatternMatch m = { + .text=Text$slice(text, I(pos+1), I(pos+match_len)), + .index=I(pos+1), + .captures={}, + }; + for (int i = 0; captures[i].occupied; i++) { + Text_t capture = Text$slice(text, I(captures[i].index+1), I(captures[i].index+captures[i].length)); + if (recursive) + capture = Pattern$map(capture, pattern, fn, recursive); + Array$insert(&m.captures, &capture, I(0), sizeof(Text_t)); + } + + Text_t replacement = text_mapper(m, fn.userdata); + if (pos > nonmatching_pos) { + Text_t before_slice = Text$slice(text, I(nonmatching_pos+1), I(pos)); + ret = Text$concat(ret, before_slice, replacement); + } else { + ret = Text$concat(ret, replacement); + } + nonmatching_pos = pos + match_len; + pos += (match_len - 1); + } + if (nonmatching_pos < text.length) { + Text_t last_slice = Text$slice(text, I(nonmatching_pos+1), I(text.length)); + ret = Text$concat(ret, last_slice); + } + return ret; +} + +static void Pattern$each(Text_t text, Text_t pattern, Closure_t fn, bool recursive) +{ + int32_t first_grapheme = Text$get_grapheme(pattern, 0); + bool find_first = (first_grapheme != '{' + && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK) + && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); + + TextIter_t text_state = NEW_TEXT_ITER_STATE(text); + void (*action)(PatternMatch, void*) = fn.fn; + for (int64_t pos = 0; pos < text.length; pos++) { + // Optimization: quickly skip ahead to first char in pattern: + if (find_first) { + while (pos < text.length && Text$get_grapheme_fast(&text_state, pos) != first_grapheme) + ++pos; + } + + capture_t captures[MAX_BACKREFS] = {}; + int64_t match_len = match(text, pos, pattern, 0, captures, 0); + if (match_len < 0) continue; + + PatternMatch m = { + .text=Text$slice(text, I(pos+1), I(pos+match_len)), + .index=I(pos+1), + .captures={}, + }; + for (int i = 0; captures[i].occupied; i++) { + Text_t capture = Text$slice(text, I(captures[i].index+1), I(captures[i].index+captures[i].length)); + if (recursive) + Pattern$each(capture, pattern, fn, recursive); + Array$insert(&m.captures, &capture, I(0), sizeof(Text_t)); + } + + action(m, fn.userdata); + pos += (match_len - 1); + } +} + +Text_t replace_array(Text_t text, Array_t replacements, Text_t backref_pat, bool recursive) +{ + if (replacements.length == 0) return text; + + Text_t ret = EMPTY_TEXT; + + int64_t nonmatch_pos = 0; + for (int64_t pos = 0; pos < text.length; ) { + // Find the first matching pattern at this position: + for (int64_t i = 0; i < replacements.length; i++) { + Text_t pattern = *(Text_t*)(replacements.data + i*replacements.stride); + capture_t captures[MAX_BACKREFS] = {}; + int64_t len = match(text, pos, pattern, 0, captures, 1); + if (len < 0) continue; + captures[0].index = pos; + captures[0].length = len; + + // If we skipped over some non-matching text before finding a match, insert it here: + if (pos > nonmatch_pos) { + Text_t before_slice = Text$slice(text, I(nonmatch_pos+1), I(pos)); + ret = Text$concat(ret, before_slice); + } + + // Concatenate the replacement: + Text_t replacement = *(Text_t*)(replacements.data + i*replacements.stride + sizeof(Text_t)); + Text_t replacement_text = apply_backrefs(text, recursive ? replacements : (Array_t){}, replacement, backref_pat, captures); + ret = Text$concat(ret, replacement_text); + pos += MAX(len, 1); + nonmatch_pos = pos; + goto next_pos; + } + + pos += 1; + next_pos: + continue; + } + + if (nonmatch_pos <= text.length) { + Text_t last_slice = Text$slice(text, I(nonmatch_pos+1), I(text.length)); + ret = Text$concat(ret, last_slice); + } + return ret; +} + +static Text_t Pattern$replace_all(Text_t text, Table_t replacements, Text_t backref_pat, bool recursive) +{ + return replace_array(text, replacements.entries, backref_pat, recursive); +} + +static Array_t Pattern$split(Text_t text, Text_t pattern) +{ + if (text.length == 0) // special case + return (Array_t){.length=0}; + + if (pattern.length == 0) // special case + return Text$clusters(text); + + Array_t chunks = {}; + + int64_t i = 0; + for (;;) { + int64_t len = 0; + int64_t found = _find(text, pattern, i, text.length-1, &len, NULL); + if (found == i && len == 0) + found = _find(text, pattern, i + 1, text.length-1, &len, NULL); + if (found < 0) break; + Text_t chunk = Text$slice(text, I(i+1), I(found)); + Array$insert(&chunks, &chunk, I_small(0), sizeof(Text_t)); + i = MAX(found + len, i + 1); + } + + Text_t last_chunk = Text$slice(text, I(i+1), I(text.length)); + Array$insert(&chunks, &last_chunk, I_small(0), sizeof(Text_t)); + + return chunks; +} + +typedef struct { + TextIter_t state; + int64_t i; + Text_t pattern; +} split_iter_state_t; + +static OptionalText_t next_split(split_iter_state_t *state) +{ + Text_t text = state->state.stack[0].text; + if (state->i >= text.length) { + if (state->pattern.length > 0 && state->i == text.length) { // special case + state->i = text.length + 1; + return EMPTY_TEXT; + } + return NONE_TEXT; + } + + if (state->pattern.length == 0) { // special case + Text_t ret = Text$cluster(text, I(state->i+1)); + state->i += 1; + return ret; + } + + int64_t start = state->i; + int64_t len = 0; + int64_t found = _find(text, state->pattern, start, text.length-1, &len, NULL); + + if (found == start && len == 0) + found = _find(text, state->pattern, start + 1, text.length-1, &len, NULL); + + if (found >= 0) { + state->i = MAX(found + len, state->i + 1); + return Text$slice(text, I(start+1), I(found)); + } else { + state->i = state->state.stack[0].text.length + 1; + return Text$slice(text, I(start+1), I(text.length)); + } +} + +static Closure_t Pattern$by_split(Text_t text, Text_t pattern) +{ + return (Closure_t){ + .fn=(void*)next_split, + .userdata=new(split_iter_state_t, .state=NEW_TEXT_ITER_STATE(text), .i=0, .pattern=pattern), + }; +} + +static Text_t Pattern$escape_text(Text_t text) +{ + // TODO: optimize for spans of non-escaped text + Text_t ret = EMPTY_TEXT; + TextIter_t state = NEW_TEXT_ITER_STATE(text); + for (int64_t i = 0; i < text.length; i++) { + uint32_t g = Text$get_main_grapheme_fast(&state, i); + if (g == '{') { + ret = Text$concat(ret, Text("{1{}")); + } else if (g == '?' + || uc_is_property_quotation_mark(g) + || (uc_is_property_paired_punctuation(g) && uc_is_property_left_of_pair(g))) { + ret = Text$concat(ret, Text("{1"), Text$slice(text, I(i+1), I(i+1)), Text("}")); + } else { + ret = Text$concat(ret, Text$slice(text, I(i+1), I(i+1))); + } + } + return ret; +} + +static Text_t Pattern$as_text(const void *obj, bool colorize, const TypeInfo_t *info) +{ + (void)info; + if (!obj) return Text("Pattern"); + + Text_t pat = *(Text_t*)obj; + Text_t quote = Pattern$has(pat, Text("/")) && !Pattern$has(pat, Text("|")) ? Text("|") : Text("/"); + return Text$concat(colorize ? Text("\x1b[1m$\033[m") : Text("$"), Text$quoted(pat, colorize, quote)); +} + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/examples/patterns/patterns.tm b/examples/patterns/patterns.tm new file mode 100644 index 00000000..fa6c789f --- /dev/null +++ b/examples/patterns/patterns.tm @@ -0,0 +1,58 @@ +use ./patterns.c + +struct PatternMatch(text:Text, index:Int, captures:[Text]) + +lang P: + convert(text:Text -> P): + return inline C : P { Pattern$escape_text(_$text); } + + convert(n:Int -> P): + return P.from_text("$n") + +extend Text: + func matches(text:Text, pattern:P -> [Text]?): + return inline C : [Text]? { Pattern$matches(_$text, _$pattern); } + + func pat_replace(text:Text, pattern:P, replacement:Text, backref="@", recursive=yes -> Text): + return inline C : Text { Pattern$replace(_$text, _$pattern, _$replacement, _$backref, _$recursive); } + + func pat_replace_all(text:Text, replacements:{P,Text}, backref="@", recursive=yes -> Text): + return inline C : Text { Pattern$replace_all(_$text, _$replacements, _$backref, _$recursive); } + + func has(text:Text, pattern:P -> Bool): + return inline C : Bool { Pattern$has(_$text, _$pattern); } + + func find_all(text:Text, pattern:P -> [PatternMatch]): + return inline C : [PatternMatch] { Pattern$find_all(_$text, _$pattern); } + + func by_match(text:Text, pattern:P -> func(->PatternMatch?)): + return inline C : func(->PatternMatch?) { Pattern$by_match(_$text, _$pattern); } + + func each(text:Text, pattern:P, fn:func(m:PatternMatch), recursive=yes): + inline C { Pattern$each(_$text, _$pattern, _$fn, _$recursive); } + + func map(text:Text, pattern:P, fn:func(m:PatternMatch -> Text), recursive=yes -> Text): + return inline C : Text { Pattern$map(_$text, _$pattern, _$fn, _$recursive); } + + func split(text:Text, pattern:P -> [Text]): + return inline C : [Text] { Pattern$split(_$text, _$pattern); } + + func by_split(text:Text, pattern:P -> func(->Text?)): + return inline C : func(->Text?) { Pattern$by_split(_$text, _$pattern); } + + func trim(text:Text, pattern:P, trim_left=yes, trim_right=yes -> Text): + return inline C : Text { Pattern$trim(_$text, _$pattern, _$trim_left, _$trim_right); } + + func trim_left(text:Text, pattern:P -> Text): + return text:trim(pattern, trim_left=yes, trim_right=no) + + func trim_right(text:Text, pattern:P -> Text): + return text:trim(pattern, trim_left=no, trim_right=yes) + +func main(): + >> "hello world":pat_replace($P/{id}/, "XXX") + >> "hello world":find_all($P/l/) + + for m in "hello one two three":by_match($P/{id}/): + >> m + |
