From 9447ba8c4aff985f9238b3f4e138afd4526799b0 Mon Sep 17 00:00:00 2001 From: Bruce Hill Date: Fri, 13 Sep 2024 13:34:04 -0400 Subject: [PATCH] Split pattern code into its own file --- Makefile | 2 +- builtins/functions.c | 1 + builtins/path.c | 1 + builtins/pattern.c | 1065 +++++++++++++++++++++++++++++++++++++++ builtins/pattern.h | 33 ++ builtins/shell.c | 1 + builtins/text.c | 1127 ++---------------------------------------- builtins/text.h | 32 +- builtins/tomo.h | 1 + docs/text.md | 8 +- 10 files changed, 1157 insertions(+), 1114 deletions(-) create mode 100644 builtins/pattern.c create mode 100644 builtins/pattern.h diff --git a/Makefile b/Makefile index 1fd0dd4..1ec338e 100644 --- a/Makefile +++ b/Makefile @@ -31,7 +31,7 @@ LDLIBS=-lgc -lcord -lm -lunistring -lgmp -ldl BUILTIN_OBJS=builtins/siphash.o builtins/array.o builtins/bool.o builtins/channel.o builtins/nums.o builtins/functions.o builtins/integers.o \ builtins/pointer.o builtins/memory.o builtins/text.o builtins/thread.o builtins/c_string.o builtins/table.o \ builtins/types.o builtins/util.o builtins/files.o builtins/range.o builtins/shell.o builtins/path.o \ - builtins/optionals.o + builtins/optionals.o builtins/pattern.o TESTS=$(patsubst %.tm,%.tm.testresult,$(wildcard test/*.tm)) all: libtomo.so tomo diff --git a/builtins/functions.c b/builtins/functions.c index edbea33..08b04c9 100644 --- a/builtins/functions.c +++ b/builtins/functions.c @@ -18,6 +18,7 @@ #include "functions.h" #include "integers.h" #include "optionals.h" +#include "pattern.h" #include "pointer.h" #include "siphash.h" #include "string.h" diff --git a/builtins/path.c b/builtins/path.c index 8864b7d..09cdc4f 100644 --- a/builtins/path.c +++ b/builtins/path.c @@ -18,6 +18,7 @@ #include "integers.h" #include "optionals.h" #include "path.h" +#include "pattern.h" #include "text.h" #include "types.h" #include "util.h" diff --git a/builtins/pattern.c b/builtins/pattern.c new file mode 100644 index 0000000..6f46000 --- /dev/null +++ 
b/builtins/pattern.c @@ -0,0 +1,1065 @@ +// Logic for text pattern matching + +#include +#include +#include +#include + +#include "array.h" +#include "functions.h" +#include "integers.h" +#include "pattern.h" +#include "table.h" +#include "text.h" +#include "types.h" + +#define MAX_BACKREFS 100 + +static inline void skip_whitespace(Text_t text, int64_t *i) +{ + TextIter_t state = {0, 0}; + while (*i < text.length) { + int32_t grapheme = Text$get_grapheme_fast(text, &state, *i); + if (grapheme > 0 && !uc_is_property_white_space((ucs4_t)grapheme)) + return; + *i += 1; + } +} + +static inline bool match_grapheme(Text_t text, int64_t *i, int32_t grapheme) +{ + if (*i < text.length && Text$get_grapheme(text, *i) == grapheme) { + *i += 1; + return true; + } + return false; +} + +static inline bool match_str(Text_t text, int64_t *i, const char *str) +{ + TextIter_t state = {0, 0}; + int64_t matched = 0; + while (matched[str]) { + if (*i + matched >= text.length || Text$get_grapheme_fast(text, &state, *i + matched) != str[matched]) + return false; + matched += 1; + } + *i += matched; + return true; +} + +static inline bool match_property(Text_t text, int64_t *i, uc_property_t prop) +{ + if (*i >= text.length) return false; + TextIter_t state = {}; + ucs4_t grapheme = Text$get_main_grapheme_fast(text, &state, *i); + // TODO: check every codepoint in the cluster? 
+ if (uc_is_property(grapheme, prop)) { + *i += 1; + return true; + } + return false; +} + +static int64_t parse_int(Text_t text, int64_t *i) +{ + TextIter_t state = {0, 0}; + int64_t value = 0; + for (;; *i += 1) { + ucs4_t grapheme = Text$get_main_grapheme_fast(text, &state, *i); + int digit = uc_digit_value((ucs4_t)grapheme); + if (digit < 0) break; + if (value >= INT64_MAX/10) break; + value = 10*value + digit; + } + return value; +} + +const char *get_property_name(Text_t text, int64_t *i) +{ + skip_whitespace(text, i); + char *name = GC_MALLOC_ATOMIC(UNINAME_MAX); + char *dest = name; + TextIter_t state = {0, 0}; + while (*i < text.length) { + int32_t grapheme = Text$get_grapheme_fast(text, &state, *i); + if (!(grapheme & ~0xFF) && (isalnum(grapheme) || grapheme == ' ' || grapheme == '_' || grapheme == '-')) { + *dest = (char)grapheme; + ++dest; + if (dest >= name + UNINAME_MAX - 1) + break; + } else { + break; + } + *i += 1; + } + + while (dest > name && dest[-1] == ' ') + *(dest--) = '\0'; + + if (dest == name) return NULL; + *dest = '\0'; + return name; +} + +#define EAT1(text, state, index, cond) ({\ + int32_t grapheme = Text$get_grapheme_fast(text, state, index); \ + bool success = (cond); \ + if (success) index += 1; \ + success; }) + +#define EAT2(text, state, index, cond1, cond2) ({\ + int32_t grapheme = Text$get_grapheme_fast(text, state, index); \ + bool success = (cond1); \ + if (success) { \ + grapheme = Text$get_grapheme_fast(text, state, index + 1); \ + success = (cond2); \ + if (success) \ + index += 2; \ + } \ + success; }) + + +#define EAT_MANY(text, state, index, cond) ({ int64_t _n = 0; while (EAT1(text, state, index, cond)) { _n += 1; } _n; }) + +int64_t match_email(Text_t text, int64_t index) +{ + // email = local "@" domain + // local = 1-64 ([a-zA-Z0-9!#$%&‘*+–/=?^_`.{|}~] | non-ascii) + // domain = dns-label ("." 
dns-label)* + // dns-label = 1-63 ([a-zA-Z0-9-] | non-ascii) + + TextIter_t state = {0, 0}; + if (index > 0) { + ucs4_t prev_codepoint = Text$get_main_grapheme_fast(text, &state, index - 1); + if (uc_is_property_alphabetic((ucs4_t)prev_codepoint)) + return -1; + } + + int64_t start_index = index; + + // Local part: + int64_t local_len = 0; + static const char *allowed_local = "!#$%&‘*+–/=?^_`.{|}~"; + while (EAT1(text, &state, index, + (grapheme & ~0x7F) || isalnum((char)grapheme) || strchr(allowed_local, (char)grapheme))) { + local_len += 1; + if (local_len > 64) return -1; + } + + if (!EAT1(text, &state, index, grapheme == '@')) + return -1; + + // Host + int64_t host_len = 0; + do { + int64_t label_len = 0; + while (EAT1(text, &state, index, + (grapheme & ~0x7F) || isalnum((char)grapheme) || grapheme == '-')) { + label_len += 1; + if (label_len > 63) return -1; + } + + if (label_len == 0) + return -1; + + host_len += label_len; + if (host_len > 255) + return -1; + host_len += 1; + } while (EAT1(text, &state, index, grapheme == '.')); + + return index - start_index; +} + +int64_t match_ipv6(Text_t text, int64_t index) +{ // Returns the length of an IPv6 address starting at index, or -1 if there is none + TextIter_t state = {0, 0}; + if (index > 0) { + int32_t prev_codepoint = Text$get_grapheme_fast(text, &state, index - 1); + if (!(prev_codepoint & ~0x7F) && (isxdigit(prev_codepoint) || prev_codepoint == ':')) // ASCII only: do not start in the middle of a longer hex/colon run + return -1; + } + int64_t start_index = index; + const int NUM_CLUSTERS = 8; + bool double_colon_used = false; + for (int cluster = 0; cluster < NUM_CLUSTERS; cluster++) { + for (int digits = 0; digits < 4; digits++) { + if (!EAT1(text, &state, index, !(grapheme & ~0x7F) && isxdigit((char)grapheme))) // logical NOT: the grapheme must be ASCII before it is narrowed to char + break; + } + if (EAT1(text, &state, index, !(grapheme & ~0x7F) && isxdigit((char)grapheme))) + return -1; // Too many digits + + if (cluster == NUM_CLUSTERS-1) { + break; + } else if (!EAT1(text, &state, index, grapheme == ':')) { + if (double_colon_used) + break; + return -1; + } + + if (EAT1(text, &state, index, grapheme == ':')) { + if 
(double_colon_used) + return -1; + double_colon_used = true; + } + } + return index - start_index; +} + +static int64_t match_ipv4(Text_t text, int64_t index) +{ // Returns the length of a dotted-quad IPv4 address starting at index, or -1 if there is none + TextIter_t state = {0, 0}; + if (index > 0) { + int32_t prev_codepoint = Text$get_grapheme_fast(text, &state, index - 1); + if (!(prev_codepoint & ~0x7F) && (isdigit(prev_codepoint) || prev_codepoint == '.')) // ASCII only: do not start in the middle of a longer digit/dot run + return -1; + } + int64_t start_index = index; + + const int NUM_CLUSTERS = 4; + for (int cluster = 0; cluster < NUM_CLUSTERS; cluster++) { + for (int digits = 0; digits < 3; digits++) { + if (!EAT1(text, &state, index, !(grapheme & ~0x7F) && isdigit((char)grapheme))) { // logical NOT: the grapheme must be ASCII before it is narrowed to char + if (digits == 0) return -1; + break; + } + } + + if (EAT1(text, &state, index, !(grapheme & ~0x7F) && isdigit((char)grapheme))) + return -1; // Too many digits + + if (cluster == NUM_CLUSTERS-1) + break; + else if (!EAT1(text, &state, index, grapheme == '.')) + return -1; + } + return (index - start_index); +} + +int64_t match_ip(Text_t text, int64_t index) +{ + int64_t len = match_ipv6(text, index); + if (len >= 0) return len; + len = match_ipv4(text, index); + return (len >= 0) ? len : -1; +} + +int64_t match_uri(Text_t text, int64_t index) +{ + // URI = scheme ":" ["//" authority] path ["?" query] ["#" fragment] + // scheme = [a-zA-Z] [a-zA-Z0-9+.-] + // authority = [userinfo "@"] host [":" port] + + TextIter_t state = {0, 0}; + if (index > 0) { + ucs4_t prev_codepoint = Text$get_main_grapheme_fast(text, &state, index - 1); + if (uc_is_property_alphabetic(prev_codepoint)) + return -1; + } + + int64_t start_index = index; + + // Scheme: + if (!EAT1(text, &state, index, !(grapheme & ~0x7F) && isalpha(grapheme))) // ASCII guard keeps isalpha() inside its defined argument range + return -1; + + EAT_MANY(text, &state, index, + !(grapheme & ~0x7F) && (isalnum(grapheme) || grapheme == '+' || grapheme == '.' 
|| grapheme == '-')); + + if (index == start_index) + return -1; + + if (!match_grapheme(text, &index, ':')) + return -1; + + // Authority: + if (match_str(text, &index, "//")) { + int64_t authority_start = index; + // Username or host: + static const char *forbidden = "#?:@ \t\r\n<>[]{}\\^|\"`/"; + if (EAT_MANY(text, &state, index, (grapheme & ~0x7F) || !strchr(forbidden, (char)grapheme)) == 0) + return -1; + + if (EAT1(text, &state, index, grapheme == '@')) { + // Found a username, now get a host: + if (EAT_MANY(text, &state, index, (grapheme & ~0x7F) || !strchr(forbidden, (char)grapheme)) == 0) + return -1; + } else { + int64_t ip = authority_start; + int64_t ipv4_len = match_ipv4(text, ip); + if (ipv4_len > 0) { + ip += ipv4_len; + } else if (match_grapheme(text, &ip, '[')) { + ip += match_ipv6(text, ip); + if (ip > authority_start + 1 && match_grapheme(text, &ip, ']')) + index = ip; + } + } + + // Port: + if (EAT1(text, &state, index, grapheme == ':')) { + if (EAT_MANY(text, &state, index, !(grapheme & ~0x7F) && isdigit(grapheme)) == 0) + return -1; + } + if (!EAT1(text, &state, index, grapheme == '/')) + return (index - start_index); // No path + } else { + // Optional path root: + EAT1(text, &state, index, grapheme == '/'); + } + + // Path: + static const char *non_path = " \"#?<>[]{}\\^`|"; + EAT_MANY(text, &state, index, (grapheme & ~0x7F) || !strchr(non_path, (char)grapheme)); + + if (EAT1(text, &state, index, grapheme == '?')) { // Query + static const char *non_query = " \"#<>[]{}\\^`|"; + EAT_MANY(text, &state, index, (grapheme & ~0x7F) || !strchr(non_query, (char)grapheme)); + } + + if (EAT1(text, &state, index, grapheme == '#')) { // Fragment + static const char *non_fragment = " \"#<>[]{}\\^`|"; + EAT_MANY(text, &state, index, (grapheme & ~0x7F) || !strchr(non_fragment, (char)grapheme)); + } + return index - start_index; +} + +int64_t match_url(Text_t text, int64_t index) +{ + int64_t lookahead = index; + if (!(match_str(text, &lookahead, "https:") 
+ || match_str(text, &lookahead, "http:") + || match_str(text, &lookahead, "ftp:") + || match_str(text, &lookahead, "wss:") + || match_str(text, &lookahead, "ws:"))) + return -1; + + return match_uri(text, index); +} + +int64_t match_id(Text_t text, int64_t index) +{ + TextIter_t state = {0, 0}; + if (!EAT1(text, &state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_XID_START))) + return -1; + return 1 + EAT_MANY(text, &state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_XID_CONTINUE)); +} + +int64_t match_int(Text_t text, int64_t index) +{ // Returns the number of consecutive decimal digits at index, or -1 when there are none + TextIter_t state = {0, 0}; + int64_t len = EAT_MANY(text, &state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT)); + return len > 0 ? len : -1; // EAT_MANY is never negative, so an empty run must be rejected explicitly +} + +int64_t match_num(Text_t text, int64_t index) +{ + TextIter_t state = {0, 0}; + bool negative = EAT1(text, &state, index, grapheme == '-') ? 1 : 0; + int64_t pre_decimal = EAT_MANY(text, &state, index, + uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT)); + bool decimal = (EAT1(text, &state, index, grapheme == '.') == 1); + int64_t post_decimal = decimal ? EAT_MANY(text, &state, index, + uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT)) : 0; + if (pre_decimal == 0 && post_decimal == 0) + return -1; + return negative + pre_decimal + decimal + post_decimal; +} + +int64_t match_newline(Text_t text, int64_t index) +{ + if (index >= text.length) + return -1; + + TextIter_t state = {0, 0}; + ucs4_t grapheme = index >= text.length ? 
0 : Text$get_main_grapheme_fast(text, &state, index); + if (grapheme == '\n') + return 1; + if (grapheme == '\r' && Text$get_grapheme_fast(text, &state, index + 1) == '\n') + return 2; + return -1; +} + +typedef struct { + int64_t index, length; + bool occupied, recursive; +} capture_t; + +typedef struct { + enum { PAT_START, PAT_END, PAT_ANY, PAT_GRAPHEME, PAT_PROPERTY, PAT_QUOTE, PAT_PAIR, PAT_FUNCTION } tag; + bool negated, non_capturing; + int64_t min, max; + union { + int32_t grapheme; + uc_property_t property; + int64_t (*fn)(Text_t, int64_t); + int32_t quote_graphemes[2]; + int32_t pair_graphemes[2]; + }; +} pat_t; + +int64_t match_pat(Text_t text, TextIter_t *state, int64_t index, pat_t pat) +{ + int32_t grapheme = index >= text.length ? 0 : Text$get_grapheme_fast(text, state, index); + + switch (pat.tag) { + case PAT_START: { + if (index == 0) + return pat.negated ? -1 : 0; + return pat.negated ? 0 : -1; + } + case PAT_END: { + if (index >= text.length) + return pat.negated ? -1 : 0; + return pat.negated ? 0 : -1; + } + case PAT_ANY: { + assert(!pat.negated); + return (index < text.length) ? 1 : -1; + } + case PAT_GRAPHEME: { + if (index >= text.length) + return -1; + else if (grapheme == pat.grapheme) + return pat.negated ? -1 : 1; + return pat.negated ? 1 : -1; + } + case PAT_PROPERTY: { + if (index >= text.length) + return -1; + else if (uc_is_property((ucs4_t)grapheme, pat.property)) + return pat.negated ? -1 : 1; + return pat.negated ? 1 : -1; + } + case PAT_PAIR: { + // Nested punctuation: (?), [?], etc + if (index >= text.length) + return -1; + + int32_t open = pat.pair_graphemes[0]; + if (grapheme != open) + return pat.negated ? 1 : -1; + + int32_t close = pat.pair_graphemes[1]; + int64_t depth = 1; + int64_t match_len = 1; + for (; depth > 0; match_len++) { + if (index + match_len >= text.length) + return pat.negated ? 
1 : -1; + + int32_t c = Text$get_grapheme_fast(text, state, index + match_len); + if (c == open) + depth += 1; + else if (c == close) + depth -= 1; + } + return pat.negated ? -1 : match_len; + } + case PAT_QUOTE: { + // Nested quotes: "?", '?', etc + if (index >= text.length) + return -1; + + int32_t open = pat.quote_graphemes[0]; + if (grapheme != open) + return pat.negated ? 1 : -1; + + int32_t close = pat.quote_graphemes[1]; + for (int64_t i = index + 1; i < text.length; i++) { + int32_t c = Text$get_grapheme_fast(text, state, i); + if (c == close) { + return pat.negated ? -1 : (i - index) + 1; + } else if (c == '\\' && index + 1 < text.length) { + i += 1; // Skip ahead an extra step + } + } + return pat.negated ? 1 : -1; + } + case PAT_FUNCTION: { + int64_t match_len = pat.fn(text, index); + if (match_len >= 0) + return pat.negated ? -1 : match_len; + return pat.negated ? 1 : -1; + } + default: errx(1, "Invalid pattern"); + } + errx(1, "Unreachable"); +} + +pat_t parse_next_pat(Text_t pattern, TextIter_t *state, int64_t *index) +{ + if (EAT2(pattern, state, *index, + uc_is_property((ucs4_t)grapheme, UC_PROPERTY_QUOTATION_MARK), + grapheme == '?')) { + // Quotations: "?", '?', etc + int32_t open = Text$get_grapheme_fast(pattern, state, *index-2); + int32_t close = open; + uc_mirror_char((ucs4_t)open, (ucs4_t*)&close); + if (!match_grapheme(pattern, index, close)) + fail("Pattern's closing quote is missing: %k", &pattern); + + return (pat_t){ + .tag=PAT_QUOTE, + .min=1, .max=1, + .quote_graphemes={open, close}, + }; + } else if (EAT2(pattern, state, *index, + uc_is_property((ucs4_t)grapheme, UC_PROPERTY_PAIRED_PUNCTUATION), + grapheme == '?')) { + // Nested punctuation: (?), [?], etc + int32_t open = Text$get_grapheme_fast(pattern, state, *index-2); + int32_t close = open; + uc_mirror_char((ucs4_t)open, (ucs4_t*)&close); + if (!match_grapheme(pattern, index, close)) + fail("Pattern's closing brace is missing: %k", &pattern); + + return (pat_t){ + .tag=PAT_PAIR, + 
.min=1, .max=1, + .pair_graphemes={open, close}, + }; + } else if (EAT1(pattern, state, *index, + grapheme == '{')) { // named patterns {id}, {2-3 hex}, etc. + skip_whitespace(pattern, index); + int64_t min, max; + if (uc_is_digit((ucs4_t)Text$get_grapheme_fast(pattern, state, *index))) { + min = parse_int(pattern, index); + skip_whitespace(pattern, index); + if (match_grapheme(pattern, index, '+')) { + max = INT64_MAX; + } else if (match_grapheme(pattern, index, '-')) { + max = parse_int(pattern, index); + } else { + max = min; + } + if (min > max) fail("Minimum repetitions (%ld) is greater than the maximum (%ld)", min, max); + } else { + min = -1, max = -1; + } + + skip_whitespace(pattern, index); + + bool negated = match_grapheme(pattern, index, '!'); +#define PAT(_tag, ...) ((pat_t){.min=min, .max=max, .negated=negated, .tag=_tag, __VA_ARGS__}) + const char *prop_name; + if (match_str(pattern, index, "..")) + prop_name = ".."; + else + prop_name = get_property_name(pattern, index); + + if (!prop_name) { + // Literal character, e.g. 
{1?} + skip_whitespace(pattern, index); + int32_t grapheme = Text$get_grapheme_fast(pattern, state, (*index)++); + if (!match_grapheme(pattern, index, '}')) + fail("Missing closing '}' in pattern: %k", &pattern); + return PAT(PAT_GRAPHEME, .grapheme=grapheme); + } else if (strlen(prop_name) == 1) { + // Single letter names: {1+ A} + skip_whitespace(pattern, index); + if (!match_grapheme(pattern, index, '}')) + fail("Missing closing '}' in pattern: %k", &pattern); + return PAT(PAT_GRAPHEME, .grapheme=prop_name[0]); + } + + skip_whitespace(pattern, index); + if (!match_grapheme(pattern, index, '}')) + fail("Missing closing '}' in pattern: %k", &pattern); + + switch (tolower(prop_name[0])) { + case '.': + if (prop_name[1] == '.') { + if (negated) + return ((pat_t){.tag=PAT_END, .min=min, .max=max, .non_capturing=true}); + else + return PAT(PAT_ANY); + } + break; + case 'c': if (strcasecmp(prop_name, "crlf") == 0) return PAT(PAT_FUNCTION, .fn=match_newline); break; // alias of newline; it could never be reached from case 'n' + case 'd': + if (strcasecmp(prop_name, "digit") == 0) { + return PAT(PAT_PROPERTY, .property=UC_PROPERTY_DECIMAL_DIGIT); + } + break; + case 'e': + if (strcasecmp(prop_name, "end") == 0) { + return PAT(PAT_END, .non_capturing=!negated); + } else if (strcasecmp(prop_name, "email") == 0) { + return PAT(PAT_FUNCTION, .fn=match_email); + } else if (strcasecmp(prop_name, "emoji") == 0) { + return PAT(PAT_PROPERTY, .property=UC_PROPERTY_EMOJI); + } + break; + case 'i': + if (strcasecmp(prop_name, "id") == 0) { + return PAT(PAT_FUNCTION, .fn=match_id); + } else if (strcasecmp(prop_name, "int") == 0) { + return PAT(PAT_FUNCTION, .fn=match_int); + } else if (strcasecmp(prop_name, "ipv4") == 0) { + return PAT(PAT_FUNCTION, .fn=match_ipv4); + } else if (strcasecmp(prop_name, "ipv6") == 0) { + return PAT(PAT_FUNCTION, .fn=match_ipv6); + } else if (strcasecmp(prop_name, "ip") == 0) { + return PAT(PAT_FUNCTION, .fn=match_ip); + } + break; + case 'n': + if (strcasecmp(prop_name, "nl") == 0 || strcasecmp(prop_name, "newline") == 0) { // strcasecmp returns 0 on equality; every comparison needs the explicit test + return PAT(PAT_FUNCTION, .fn=match_newline); + } else 
if (strcasecmp(prop_name, "num") == 0) { + return PAT(PAT_FUNCTION, .fn=match_num); + } + break; + case 's': + if (strcasecmp(prop_name, "start") == 0) { + return PAT(PAT_START, .non_capturing=!negated); + } + break; + case 'u': + if (strcasecmp(prop_name, "uri") == 0) { + return PAT(PAT_FUNCTION, .fn=match_uri); + } else if (strcasecmp(prop_name, "url") == 0) { + return PAT(PAT_FUNCTION, .fn=match_url); + } + break; + default: break; + } + + uc_property_t prop = uc_property_byname(prop_name); + if (uc_property_is_valid(prop)) + return PAT(PAT_PROPERTY, .property=prop); + + ucs4_t grapheme = unicode_name_character(prop_name); + if (grapheme == UNINAME_INVALID) + fail("Not a valid property or character name: %s", prop_name); + return PAT(PAT_GRAPHEME, .grapheme=(int32_t)grapheme); +#undef PAT + } else { + return (pat_t){.tag=PAT_GRAPHEME, .non_capturing=true, .min=1, .max=1, .grapheme=Text$get_grapheme_fast(pattern, state, (*index)++)}; + } +} + +int64_t match(Text_t text, int64_t text_index, Pattern_t pattern, int64_t pattern_index, capture_t *captures, int64_t capture_index) +{ + if (pattern_index >= pattern.length) // End of the pattern + return 0; + + int64_t start_index = text_index; + TextIter_t pattern_state = {0, 0}, text_state = {0, 0}; + pat_t pat = parse_next_pat(pattern, &pattern_state, &pattern_index); + + if (pat.min == -1 && pat.max == -1) { + if (pat.tag == PAT_ANY && pattern_index >= pattern.length) { + pat.min = pat.max = MAX(1, text.length - text_index); + } else { + pat.min = 1; + pat.max = INT64_MAX; + } + } + + int64_t capture_start = text_index; + int64_t count = 0, capture_len = 0, next_match_len = 0; + + if (pat.tag == PAT_ANY && pattern_index >= pattern.length) { + int64_t remaining = text.length - text_index; + capture_len = remaining >= pat.min ? 
MIN(remaining, pat.max) : -1; + text_index += capture_len; + goto success; + } + + if (pat.min == 0 && pattern_index < pattern.length) { + next_match_len = match(text, text_index, pattern, pattern_index, captures, capture_index + (pat.non_capturing ? 0 : 1)); + if (next_match_len >= 0) { + capture_len = 0; + goto success; + } + } + + while (count < pat.max) { + int64_t match_len = match_pat(text, &text_state, text_index, pat); + if (match_len < 0) + break; + capture_len += match_len; + text_index += match_len; + count += 1; + + if (pattern_index < pattern.length) { // More stuff after this + if (count < pat.min) + next_match_len = -1; + else + next_match_len = match(text, text_index, pattern, pattern_index, captures, capture_index + (pat.non_capturing ? 0 : 1)); + } else { + next_match_len = 0; + } + + if (match_len == 0) { + if (next_match_len >= 0) { + // If we're good to go, no need to keep re-matching zero-length + // matches till we hit max: + count = pat.max; + break; + } else { + return -1; + } + } + + if (pattern_index < pattern.length && next_match_len >= 0) + break; // Next guy exists and wants to stop here + + if (text_index >= text.length) + break; + } + + if (count < pat.min || next_match_len < 0) + return -1; + + success: + if (captures && capture_index < MAX_BACKREFS && !pat.non_capturing) { + if (pat.tag == PAT_PAIR || pat.tag == PAT_QUOTE) { + assert(capture_len > 0); + captures[capture_index] = (capture_t){ + .index=capture_start + 1, // Skip leading quote/paren + .length=capture_len - 2, // Skip open/close + .occupied=true, + .recursive=(pat.tag == PAT_PAIR), + }; + } else { + captures[capture_index] = (capture_t){ + .index=capture_start, + .length=capture_len, + .occupied=true, + .recursive=false, + }; + } + } + return (text_index - start_index) + next_match_len; +} + +#undef EAT1 +#undef EAT2 +#undef EAT_MANY + +static int64_t _find(Text_t text, Pattern_t pattern, int64_t first, int64_t last, int64_t *match_length) +{ + int32_t first_grapheme = 
Text$get_grapheme(pattern, 0); + bool find_first = (first_grapheme != '{' + && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK) + && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); + + TextIter_t text_state = {0, 0}; + + for (int64_t i = first; i <= last; i++) { + // Optimization: quickly skip ahead to first char in pattern: + if (find_first) { + while (i < text.length && Text$get_grapheme_fast(text, &text_state, i) != first_grapheme) + ++i; + } + + int64_t m = match(text, i, pattern, 0, NULL, 0); + if (m >= 0) { + if (match_length) + *match_length = m; + return i; + } + } + if (match_length) + *match_length = -1; + return -1; +} + +public Int_t Text$find(Text_t text, Pattern_t pattern, Int_t from_index, int64_t *match_length) +{ + int64_t first = Int_to_Int64(from_index, false); + if (first == 0) fail("Invalid index: 0"); + if (first < 0) first = text.length + first + 1; + if (first > text.length || first < 1) + return I(0); + int64_t found = _find(text, pattern, first-1, text.length-1, match_length); + return I(found+1); +} + +PUREFUNC public bool Text$has(Text_t text, Pattern_t pattern) +{ // True when the pattern matches somewhere in text; patterns anchored with {start} or {end} take a dedicated path + if (Text$starts_with(pattern, Text("{start}"))) { + int64_t m = match(text, 0, pattern, 0, NULL, 0); + return m >= 0; + } else if (Text$ends_with(pattern, Text("{end}"))) { // the anchor is part of the pattern, not of the searched text + for (int64_t i = text.length-1; i >= 0; i--) { + int64_t match_len = match(text, i, pattern, 0, NULL, 0); + if (match_len >= 0 && i + match_len == text.length) + return true; + } + return false; + } else { + int64_t found = _find(text, pattern, 0, text.length-1, NULL); + return (found >= 0); + } +} + +PUREFUNC public bool Text$matches(Text_t text, Pattern_t pattern) +{ + int64_t m = match(text, 0, pattern, 0, NULL, 0); + return m == text.length; +} + +public Array_t Text$find_all(Text_t text, Pattern_t pattern) +{ + if (pattern.length == 0) // special case + return (Array_t){.length=0}; + + Array_t matches = {}; + + for (int64_t i = 0; ; ) { + int64_t len = 0; + 
int64_t found = _find(text, pattern, i, text.length-1, &len); + if (found < 0) break; + Text_t match = Text$slice(text, I(found+1), I(found + len)); + Array$insert(&matches, &match, I_small(0), sizeof(Text_t)); + i = found + MAX(len, 1); + } + + return matches; +} + +static Text_t apply_backrefs(Text_t text, Pattern_t original_pattern, Text_t replacement, Pattern_t backref_pat, capture_t *captures) +{ // Expands every backref_pat followed by a capture number inside replacement, using slices of text described by captures + if (backref_pat.length == 0) + return replacement; + + int32_t first_grapheme = Text$get_grapheme(backref_pat, 0); + bool find_first = (first_grapheme != '{' + && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK) + && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); + + Text_t ret = Text(""); + TextIter_t state = {0, 0}; + int64_t nonmatching_pos = 0; + for (int64_t pos = 0; pos < replacement.length; ) { + // Optimization: quickly skip ahead to first char in the backref pattern: + if (find_first) { + while (pos < replacement.length && Text$get_grapheme_fast(replacement, &state, pos) != first_grapheme) + ++pos; + } + + int64_t backref_len = match(replacement, pos, backref_pat, 0, NULL, 0); + if (backref_len < 0) { + pos += 1; + continue; + } + + int64_t after_backref = pos + backref_len; + int64_t backref = parse_int(replacement, &after_backref); + if (after_backref == pos + backref_len) { // Not actually a backref if there's no number + pos += 1; + continue; + } + if (backref < 0 || backref >= MAX_BACKREFS) fail("Invalid backref index: %ld (only 0-%d are allowed)", backref, MAX_BACKREFS-1); // bound now agrees with the message and with the size of the captures array + backref_len = (after_backref - pos); + + if (Text$get_grapheme_fast(replacement, &state, pos + backref_len) == ';') + backref_len += 1; // skip optional semicolon + + if (!captures[backref].occupied) + fail("There is no capture number %ld!", backref); + + Text_t backref_text = Text$slice(text, I(captures[backref].index+1), I(captures[backref].index + captures[backref].length)); + + if (captures[backref].recursive && original_pattern.length > 0) + 
backref_text = Text$replace(backref_text, original_pattern, replacement, backref_pat, true); + + if (pos > nonmatching_pos) { + Text_t before_slice = Text$slice(replacement, I(nonmatching_pos+1), I(pos)); + ret = Text$concat(ret, before_slice, backref_text); + } else { + ret = Text$concat(ret, backref_text); + } + + pos += backref_len; + nonmatching_pos = pos; + } + if (nonmatching_pos < replacement.length) { + Text_t last_slice = Text$slice(replacement, I(nonmatching_pos+1), I(replacement.length)); + ret = Text$concat(ret, last_slice); + } + return ret; +} + +public Text_t Text$replace(Text_t text, Pattern_t pattern, Text_t replacement, Pattern_t backref_pat, bool recursive) +{ + Text_t ret = {.length=0}; + + int32_t first_grapheme = Text$get_grapheme(pattern, 0); + bool find_first = (first_grapheme != '{' + && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK) + && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); + + TextIter_t text_state = {0, 0}; + int64_t nonmatching_pos = 0; + for (int64_t pos = 0; pos < text.length; ) { + // Optimization: quickly skip ahead to first char in pattern: + if (find_first) { + while (pos < text.length && Text$get_grapheme_fast(text, &text_state, pos) != first_grapheme) + ++pos; + } + + capture_t captures[MAX_BACKREFS] = {}; + int64_t match_len = match(text, pos, pattern, 0, captures, 1); + if (match_len < 0) { + pos += 1; + continue; + } + captures[0] = (capture_t){ + .index = pos, .length = match_len, + .occupied = true, .recursive = false, + }; + + Text_t replacement_text = apply_backrefs(text, recursive ? 
pattern : Text(""), replacement, backref_pat, captures);
+        if (pos > nonmatching_pos) {
+            Text_t before_slice = Text$slice(text, I(nonmatching_pos+1), I(pos));
+            ret = Text$concat(ret, before_slice, replacement_text);
+        } else {
+            ret = Text$concat(ret, replacement_text);
+        }
+        nonmatching_pos = pos + match_len;
+        pos += MAX(match_len, 1);
+    }
+    if (nonmatching_pos < text.length) {
+        Text_t last_slice = Text$slice(text, I(nonmatching_pos+1), I(text.length));
+        ret = Text$concat(ret, last_slice);
+    }
+    return ret;
+}
+
+// Strip text matching `pattern` from the start and/or the end of `text`.
+//   trim_left:  if the pattern matches (non-empty) at offset 0, drop that match.
+//   trim_right: drop the longest suffix that the pattern matches exactly up to
+//               the end of the text.
+// Returns the remaining slice of `text`.
+public Text_t Text$trim(Text_t text, Pattern_t pattern, bool trim_left, bool trim_right)
+{
+    // `first` and `last` are 0-based, inclusive bounds of the text to keep
+    int64_t first = 0, last = text.length-1;
+    if (trim_left) {
+        int64_t match_len = match(text, 0, pattern, 0, NULL, 0);
+        if (match_len > 0)
+            first = match_len;
+    }
+
+    if (trim_right) {
+        // Scan backwards over every candidate start; the loop does not stop at
+        // the first hit, so `last` ends up just before the leftmost start
+        // whose match reaches the end of the text.
+        for (int64_t i = text.length-1; i >= first; i--) {
+            int64_t match_len = match(text, i, pattern, 0, NULL, 0);
+            if (match_len > 0 && i + match_len == text.length)
+                last = i-1;
+        }
+    }
+    // Text$slice is called with the 0-based bounds shifted up by one
+    return Text$slice(text, I(first+1), I(last+1));
+}
+
+// Replace each match of `pattern` in `text` with the text returned by calling
+// the closure `fn` on the matched slice. Text between matches is copied
+// through unchanged.
+//   fn.fn is called as Text_t (*)(Text_t match, void *userdata) with fn.userdata.
+// Returns the newly built text.
+public Text_t Text$map(Text_t text, Pattern_t pattern, Closure_t fn)
+{
+    Text_t ret = {.length=0};
+
+    // A pattern that begins with '{', a quotation mark, or paired punctuation
+    // is not treated as starting with a literal grapheme, so the skip-ahead
+    // optimization below is only enabled for other first graphemes.
+    int32_t first_grapheme = Text$get_grapheme(pattern, 0);
+    bool find_first = (first_grapheme != '{'
+                       && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK)
+                       && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
+
+    TextIter_t text_state = {0, 0};
+    int64_t nonmatching_pos = 0; // 0-based start of text not yet copied into `ret`
+
+    Text_t (*text_mapper)(Text_t, void*) = fn.fn;
+    for (int64_t pos = 0; pos < text.length; ) {
+        // Optimization: quickly skip ahead to first char in pattern:
+        if (find_first) {
+            while (pos < text.length && Text$get_grapheme_fast(text, &text_state, pos) != first_grapheme)
+                ++pos;
+        }
+
+        int64_t match_len = match(text, pos, pattern, 0, NULL, 0);
+        if (match_len < 0) {
+            pos += 1;
+            continue;
+        }
+
+        Text_t replacement = text_mapper(Text$slice(text, I(pos+1), I(pos+match_len)), fn.userdata);
+        if (pos > nonmatching_pos) {
+            Text_t before_slice = Text$slice(text, I(nonmatching_pos+1), I(pos));
+            ret = Text$concat(ret, before_slice, replacement);
+        } else {
+            ret = Text$concat(ret, replacement);
+        }
+        nonmatching_pos = pos + match_len;
+        // Always advance by at least one grapheme. A zero-length match used to
+        // leave `pos` unchanged (`pos += match_len - 1` followed by `pos++`),
+        // which looped forever. This mirrors the advance in Text$replace.
+        pos += MAX(match_len, 1);
+    }
+    if (nonmatching_pos < text.length) {
+        Text_t last_slice = Text$slice(text, I(nonmatching_pos+1), I(text.length));
+        ret = Text$concat(ret, last_slice);
+    }
+    return ret;
+}
+
+// Apply a table of pattern -> replacement pairs to `text` in a single pass.
+// At each position the table entries are tried in order and the first pattern
+// that matches wins. Its replacement is passed through apply_backrefs() using
+// `backref_pat` and the captures from the match. Text that no pattern matches
+// is copied through unchanged.
+//   replacements: each entry holds a Pattern_t key followed, sizeof(Text_t)
+//                 bytes later, by the Text_t replacement.
+//   recursive:    when true, the matching pattern is handed to apply_backrefs();
+//                 otherwise an empty text is passed instead.
+// Returns `text` itself when the table is empty, otherwise the rebuilt text.
+public Text_t Text$replace_all(Text_t text, Table_t replacements, Text_t backref_pat, bool recursive)
+{
+    if (replacements.entries.length == 0) return text;
+
+    Text_t ret = {.length=0};
+
+    int64_t nonmatch_pos = 0; // 0-based start of text not yet copied into `ret`
+    for (int64_t pos = 0; pos < text.length; ) {
+        bool replaced = false;
+        // Find the first matching pattern at this position:
+        for (int64_t i = 0; i < replacements.entries.length; i++) {
+            Pattern_t pattern = *(Pattern_t*)(replacements.entries.data + i*replacements.entries.stride);
+            capture_t captures[MAX_BACKREFS] = {};
+            int64_t len = match(text, pos, pattern, 0, captures, 1);
+            if (len < 0) continue;
+            // Capture 0 is the whole match
+            captures[0].index = pos;
+            captures[0].length = len;
+
+            // If we skipped over some non-matching text before finding a match, insert it here:
+            if (pos > nonmatch_pos) {
+                Text_t before_slice = Text$slice(text, I(nonmatch_pos+1), I(pos));
+                ret = Text$concat(ret, before_slice);
+            }
+
+            // Concatenate the replacement:
+            Text_t replacement = *(Text_t*)(replacements.entries.data + i*replacements.entries.stride + sizeof(Text_t));
+            Text_t replacement_text = apply_backrefs(text, recursive ? pattern : Text(""), replacement, backref_pat, captures);
+            ret = Text$concat(ret, replacement_text);
+            // Resume copying right after the matched text, then step forward
+            // by at least one grapheme. Setting `nonmatch_pos` after the
+            // MAX(len, 1) advance dropped the grapheme at `pos` from the
+            // output whenever the match was zero-length.
+            nonmatch_pos = pos + len;
+            pos += MAX(len, 1);
+            replaced = true;
+            break;
+        }
+
+        if (!replaced)
+            pos += 1;
+    }
+
+    if (nonmatch_pos < text.length) {
+        Text_t last_slice = Text$slice(text, I(nonmatch_pos+1), I(text.length));
+        ret = Text$concat(ret, last_slice);
+    }
+    return ret;
+}
+
+// Split `text` into the chunks that lie between matches of `pattern`.
+//   - Empty text gives an empty array.
+//   - An empty pattern gives Text$clusters(text).
+// Otherwise every chunk before a match is added, followed by whatever remains
+// after the last match (which may be an empty text).
+public Array_t Text$split(Text_t text, Pattern_t pattern)
+{
+    if (text.length == 0) // special case
+        return (Array_t){.length=0};
+
+    if (pattern.length == 0) // special case
+        return Text$clusters(text);
+
+    Array_t chunks = {};
+
+    // `i` is the index handed to Text$find/Text$slice for the start of the
+    // next chunk; it starts at 1.
+    Int_t i = I_small(1);
+    for (;;) {
+        int64_t len = 0;
+        Int_t found = Text$find(text, pattern, i, &len);
+        if (I_is_zero(found)) break; // no further match
+        Text_t chunk = Text$slice(text, i, Int$minus(found, I_small(1)));
+        // presumably an index of 0 means "append" -- confirm against Array$insert
+        Array$insert(&chunks, &chunk, I_small(0), sizeof(Text_t));
+        // Step past the match, and by at least one so the search always advances.
+        // NOTE(review): for a zero-length match this skips the grapheme at
+        // `found` without adding it to any chunk -- confirm that is intended.
+        i = Int$plus(found, I(MAX(len, 1)));
+    }
+
+    Text_t last_chunk = Text$slice(text, i, I(text.length));
+    Array$insert(&chunks, &last_chunk, I_small(0), sizeof(Text_t));
+
+    return chunks;
+}
+
+
+// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0
diff --git a/builtins/pattern.h b/builtins/pattern.h
new file mode 100644
index 0000000..804fb28
--- /dev/null
+++ b/builtins/pattern.h
@@ -0,0 +1,33 @@
+#pragma once
+
+// The type representing text patterns for pattern matching.
+
+#include
+#include
+#include
+
+#include "datatypes.h"
+#include "integers.h"
+#include "types.h"
+
+#define Pattern(text) ((Pattern_t)Text(text))
+#define Patterns(...) 
((Pattern_t)Texts(__VA_ARGS__)) + +Text_t Text$replace(Text_t str, Pattern_t pat, Text_t replacement, Pattern_t backref_pat, bool recursive); +Pattern_t Pattern$escape_text(Text_t text); +Text_t Text$replace_all(Text_t text, Table_t replacements, Pattern_t backref_pat, bool recursive); +Array_t Text$split(Text_t text, Pattern_t pattern); +Text_t Text$trim(Text_t text, Pattern_t pattern, bool trim_left, bool trim_right); +Int_t Text$find(Text_t text, Pattern_t pattern, Int_t i, int64_t *match_length); +Array_t Text$find_all(Text_t text, Pattern_t pattern); +PUREFUNC bool Text$has(Text_t text, Pattern_t pattern); +PUREFUNC bool Text$matches(Text_t text, Pattern_t pattern); +Text_t Text$map(Text_t text, Pattern_t pattern, Closure_t fn); + +#define Pattern$hash Text$hash +#define Pattern$compare Text$compare +#define Pattern$equal Text$equal + +extern const TypeInfo Pattern$info; + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/shell.c b/builtins/shell.c index 5bb34e8..d880718 100644 --- a/builtins/shell.c +++ b/builtins/shell.c @@ -5,6 +5,7 @@ #include "array.h" #include "functions.h" #include "integers.h" +#include "pattern.h" #include "shell.h" #include "text.h" #include "types.h" diff --git a/builtins/text.c b/builtins/text.c index 3902863..795f55d 100644 --- a/builtins/text.c +++ b/builtins/text.c @@ -84,12 +84,6 @@ typedef struct { const uint8_t *utf8; } synthetic_grapheme_t; -typedef struct { - int64_t subtext, sum_of_previous_subtexts; -} text_iter_t; - -#define MAX_BACKREFS 100 - // Synthetic grapheme clusters (clusters of more than one codepoint): static Table_t grapheme_ids_by_codepoints = {}; // ucs4_t* length-prefixed codepoints -> int32_t ID @@ -103,9 +97,6 @@ static int32_t num_synthetic_graphemes = 0; #define GRAPHEME_CODEPOINTS(id) (&synthetic_graphemes[-(id)-1].utf32_cluster[1]) #define GRAPHEME_UTF8(id) (synthetic_graphemes[-(id)-1].utf8) -static int32_t get_grapheme(Text_t text, int64_t index); -static int32_t 
_get_grapheme(Text_t text, text_iter_t *state, int64_t index); -#define _get_main_grapheme(...) MAIN_GRAPHEME_CODEPOINT(_get_grapheme(__VA_ARGS__)) static Text_t text_from_u32(ucs4_t *codepoints, int64_t num_codepoints, bool normalize); PUREFUNC static bool graphemes_equal(ucs4_t **a, ucs4_t **b) { @@ -133,7 +124,7 @@ static const TypeInfo GraphemeIDLookupTableInfo = { }; #pragma GCC diagnostic ignored "-Wstack-protector" -int32_t get_synthetic_grapheme(const ucs4_t *codepoints, int64_t utf32_len) +public int32_t get_synthetic_grapheme(const ucs4_t *codepoints, int64_t utf32_len) { ucs4_t length_prefixed[1+utf32_len] = {}; length_prefixed[0] = (ucs4_t)utf32_len; @@ -298,8 +289,8 @@ static bool is_concat_stable(Text_t a, Text_t b) if (a.length == 0 || b.length == 0) return true; - int32_t last_a = get_grapheme(a, a.length-1); - int32_t first_b = get_grapheme(b, 0); + int32_t last_a = Text$get_grapheme(a, a.length-1); + int32_t first_b = Text$get_grapheme(b, 0); // Synthetic graphemes are weird and probably need to check with normalization: if (last_a < 0 || first_b < 0) @@ -385,8 +376,8 @@ static Text_t concat2(Text_t a, Text_t b) return concat2_assuming_safe(a, b); // Do full normalization of the last/first characters - int32_t last_a = get_grapheme(a, a.length-1); - int32_t first_b = get_grapheme(b, 0); + int32_t last_a = Text$get_grapheme(a, a.length-1); + int32_t first_b = Text$get_grapheme(b, 0); size_t utf32_len = (last_a >= 0 ? 1 : NUM_GRAPHEME_CODEPOINTS(last_a)) + (first_b >= 0 ? 1 : NUM_GRAPHEME_CODEPOINTS(first_b)); ucs4_t join_graphemes[utf32_len] = {}; @@ -833,7 +824,7 @@ PUREFUNC public uint64_t Text$hash(Text_t *text) return text->hash; } -int32_t _get_grapheme(Text_t text, text_iter_t *state, int64_t index) +public int32_t Text$get_grapheme_fast(Text_t text, TextIter_t *state, int64_t index) { switch (text.tag) { case TEXT_ASCII: return index < text.length ? 
(int32_t)text.ascii[index] : 0; @@ -841,7 +832,7 @@ int32_t _get_grapheme(Text_t text, text_iter_t *state, int64_t index) case TEXT_GRAPHEMES: return index < text.length ? text.graphemes[index] : 0; case TEXT_SHORT_GRAPHEMES: return index < text.length ? text.short_graphemes[index] : 0; case TEXT_SUBTEXT: { - text_iter_t backup_state = {0, 0}; + TextIter_t backup_state = {0, 0}; if (!state) state = &backup_state; if (index < 0 || index >= text.length) @@ -853,7 +844,7 @@ int32_t _get_grapheme(Text_t text, text_iter_t *state, int64_t index) } for (;;) { if (index < state->sum_of_previous_subtexts + text.subtexts[state->subtext].length) - return _get_grapheme(text.subtexts[state->subtext], NULL, index - state->sum_of_previous_subtexts); + return Text$get_grapheme_fast(text.subtexts[state->subtext], NULL, index - state->sum_of_previous_subtexts); state->sum_of_previous_subtexts += text.subtexts[state->subtext].length; state->subtext += 1; } @@ -864,10 +855,9 @@ int32_t _get_grapheme(Text_t text, text_iter_t *state, int64_t index) return 0; } -int32_t get_grapheme(Text_t text, int64_t index) +public ucs4_t Text$get_main_grapheme_fast(Text_t text, TextIter_t *state, int64_t index) { - text_iter_t state = {0, 0}; - return _get_grapheme(text, &state, index); + return MAIN_GRAPHEME_CODEPOINT(Text$get_grapheme_fast(text, state, index)); } PUREFUNC public int32_t Text$compare(const Text_t *a, const Text_t *b) @@ -875,10 +865,10 @@ PUREFUNC public int32_t Text$compare(const Text_t *a, const Text_t *b) if (a == b) return 0; int64_t len = MAX(a->length, b->length); - text_iter_t a_state = {0, 0}, b_state = {0, 0}; + TextIter_t a_state = {0, 0}, b_state = {0, 0}; for (int64_t i = 0; i < len; i++) { - int32_t ai = _get_grapheme(*a, &a_state, i); - int32_t bi = _get_grapheme(*b, &b_state, i); + int32_t ai = Text$get_grapheme_fast(*a, &a_state, i); + int32_t bi = Text$get_grapheme_fast(*b, &b_state, i); if (ai == bi) continue; int32_t cmp; if (ai > 0 && bi > 0) { @@ -909,10 +899,10 
@@ PUREFUNC public bool Text$starts_with(Text_t text, Text_t prefix) { if (text.length < prefix.length) return false; - text_iter_t text_state = {0, 0}, prefix_state = {0, 0}; + TextIter_t text_state = {0, 0}, prefix_state = {0, 0}; for (int64_t i = 0; i < prefix.length; i++) { - int32_t text_i = _get_grapheme(text, &text_state, i); - int32_t prefix_i = _get_grapheme(prefix, &prefix_state, i); + int32_t text_i = Text$get_grapheme_fast(text, &text_state, i); + int32_t prefix_i = Text$get_grapheme_fast(prefix, &prefix_state, i); if (text_i != prefix_i) return false; } return true; @@ -922,10 +912,10 @@ PUREFUNC public bool Text$ends_with(Text_t text, Text_t suffix) { if (text.length < suffix.length) return false; - text_iter_t text_state = {0, 0}, prefix_state = {0, 0}; + TextIter_t text_state = {0, 0}, prefix_state = {0, 0}; for (int64_t i = 0; i < suffix.length; i++) { - int32_t text_i = _get_grapheme(text, &text_state, text.length - suffix.length + i); - int32_t suffix_i = _get_grapheme(suffix, &prefix_state, i); + int32_t text_i = Text$get_grapheme_fast(text, &text_state, text.length - suffix.length + i); + int32_t suffix_i = Text$get_grapheme_fast(suffix, &prefix_state, i); if (text_i != suffix_i) return false; } return true; @@ -936,10 +926,10 @@ PUREFUNC public bool Text$equal_values(Text_t a, Text_t b) if (a.length != b.length || (a.hash != 0 && b.hash != 0 && a.hash != b.hash)) return false; int64_t len = a.length; - text_iter_t a_state = {0, 0}, b_state = {0, 0}; + TextIter_t a_state = {0, 0}, b_state = {0, 0}; for (int64_t i = 0; i < len; i++) { - int32_t ai = _get_grapheme(a, &a_state, i); - int32_t bi = _get_grapheme(b, &b_state, i); + int32_t ai = Text$get_grapheme_fast(a, &a_state, i); + int32_t bi = Text$get_grapheme_fast(b, &b_state, i); if (ai != bi) return false; } return true; @@ -956,11 +946,11 @@ PUREFUNC public bool Text$equal_ignoring_case(Text_t a, Text_t b) if (a.length != b.length) return false; int64_t len = a.length; - text_iter_t a_state 
= {0, 0}, b_state = {0, 0}; + TextIter_t a_state = {0, 0}, b_state = {0, 0}; const char *language = uc_locale_language(); for (int64_t i = 0; i < len; i++) { - int32_t ai = _get_grapheme(a, &a_state, i); - int32_t bi = _get_grapheme(b, &b_state, i); + int32_t ai = Text$get_grapheme_fast(a, &a_state, i); + int32_t bi = Text$get_grapheme_fast(b, &b_state, i); if (ai != bi) { const ucs4_t *a_codepoints = ai >= 0 ? (ucs4_t*)&ai : GRAPHEME_CODEPOINTS(ai); int64_t a_len = ai >= 0 ? 1 : NUM_GRAPHEME_CODEPOINTS(ai); @@ -1016,794 +1006,6 @@ public Text_t Text$title(Text_t text) return ret; } -static inline void skip_whitespace(Text_t text, int64_t *i) -{ - text_iter_t state = {0, 0}; - while (*i < text.length) { - int32_t grapheme = _get_grapheme(text, &state, *i); - if (grapheme > 0 && !uc_is_property_white_space((ucs4_t)grapheme)) - return; - *i += 1; - } -} - -static inline bool match_grapheme(Text_t text, int64_t *i, int32_t grapheme) -{ - if (*i < text.length && get_grapheme(text, *i) == grapheme) { - *i += 1; - return true; - } - return false; -} - -static inline bool match_str(Text_t text, int64_t *i, const char *str) -{ - text_iter_t state = {0, 0}; - int64_t matched = 0; - while (matched[str]) { - if (*i + matched >= text.length || _get_grapheme(text, &state, *i + matched) != str[matched]) - return false; - matched += 1; - } - *i += matched; - return true; -} - -static inline bool match_property(Text_t text, int64_t *i, uc_property_t prop) -{ - if (*i >= text.length) return false; - int32_t grapheme = get_grapheme(text, *i); - // TODO: check every codepoint in the cluster? 
- if (uc_is_property(MAIN_GRAPHEME_CODEPOINT(grapheme), prop)) { - *i += 1; - return true; - } - return false; -} - -static int64_t parse_int(Text_t text, int64_t *i) -{ - text_iter_t state = {0, 0}; - int64_t value = 0; - for (;; *i += 1) { - ucs4_t grapheme = _get_main_grapheme(text, &state, *i); - int digit = uc_digit_value((ucs4_t)grapheme); - if (digit < 0) break; - if (value >= INT64_MAX/10) break; - value = 10*value + digit; - } - return value; -} - -const char *get_property_name(Text_t text, int64_t *i) -{ - skip_whitespace(text, i); - char *name = GC_MALLOC_ATOMIC(UNINAME_MAX); - char *dest = name; - text_iter_t state = {0, 0}; - while (*i < text.length) { - int32_t grapheme = _get_grapheme(text, &state, *i); - if (!(grapheme & ~0xFF) && (isalnum(grapheme) || grapheme == ' ' || grapheme == '_' || grapheme == '-')) { - *dest = (char)grapheme; - ++dest; - if (dest >= name + UNINAME_MAX - 1) - break; - } else { - break; - } - *i += 1; - } - - while (dest > name && dest[-1] == ' ') - *(dest--) = '\0'; - - if (dest == name) return NULL; - *dest = '\0'; - return name; -} - -#define EAT1(text, state, index, cond) ({\ - int32_t grapheme = _get_grapheme(text, state, index); \ - bool success = (cond); \ - if (success) index += 1; \ - success; }) - -#define EAT2(text, state, index, cond1, cond2) ({\ - int32_t grapheme = _get_grapheme(text, state, index); \ - bool success = (cond1); \ - if (success) { \ - grapheme = _get_grapheme(text, state, index + 1); \ - success = (cond2); \ - if (success) \ - index += 2; \ - } \ - success; }) - - -#define EAT_MANY(text, state, index, cond) ({ int64_t _n = 0; while (EAT1(text, state, index, cond)) { _n += 1; } _n; }) - -int64_t match_email(Text_t text, int64_t index) -{ - // email = local "@" domain - // local = 1-64 ([a-zA-Z0-9!#$%&‘*+–/=?^_`.{|}~] | non-ascii) - // domain = dns-label ("." 
dns-label)* - // dns-label = 1-63 ([a-zA-Z0-9-] | non-ascii) - - text_iter_t state = {0, 0}; - if (index > 0) { - ucs4_t prev_codepoint = _get_main_grapheme(text, &state, index - 1); - if (uc_is_property_alphabetic((ucs4_t)prev_codepoint)) - return -1; - } - - int64_t start_index = index; - - // Local part: - int64_t local_len = 0; - static const char *allowed_local = "!#$%&‘*+–/=?^_`.{|}~"; - while (EAT1(text, &state, index, - (grapheme & ~0x7F) || isalnum((char)grapheme) || strchr(allowed_local, (char)grapheme))) { - local_len += 1; - if (local_len > 64) return -1; - } - - if (!EAT1(text, &state, index, grapheme == '@')) - return -1; - - // Host - int64_t host_len = 0; - do { - int64_t label_len = 0; - while (EAT1(text, &state, index, - (grapheme & ~0x7F) || isalnum((char)grapheme) || grapheme == '-')) { - label_len += 1; - if (label_len > 63) return -1; - } - - if (label_len == 0) - return -1; - - host_len += label_len; - if (host_len > 255) - return -1; - host_len += 1; - } while (EAT1(text, &state, index, grapheme == '.')); - - return index - start_index; -} - -int64_t match_ipv6(Text_t text, int64_t index) -{ - text_iter_t state = {0, 0}; - if (index > 0) { - int32_t prev_codepoint = _get_grapheme(text, &state, index - 1); - if ((prev_codepoint & ~0x7F) && (isxdigit(prev_codepoint) || prev_codepoint == ':')) - return -1; - } - int64_t start_index = index; - const int NUM_CLUSTERS = 8; - bool double_colon_used = false; - for (int cluster = 0; cluster < NUM_CLUSTERS; cluster++) { - for (int digits = 0; digits < 4; digits++) { - if (!EAT1(text, &state, index, ~(grapheme & ~0x7F) && isxdigit((char)grapheme))) - break; - } - if (EAT1(text, &state, index, ~(grapheme & ~0x7F) && isxdigit((char)grapheme))) - return -1; // Too many digits - - if (cluster == NUM_CLUSTERS-1) { - break; - } else if (!EAT1(text, &state, index, grapheme == ':')) { - if (double_colon_used) - break; - return -1; - } - - if (EAT1(text, &state, index, grapheme == ':')) { - if 
(double_colon_used) - return -1; - double_colon_used = true; - } - } - return index - start_index; -} - -static int64_t match_ipv4(Text_t text, int64_t index) -{ - text_iter_t state = {0, 0}; - if (index > 0) { - int32_t prev_codepoint = _get_grapheme(text, &state, index - 1); - if ((prev_codepoint & ~0x7F) && (isdigit(prev_codepoint) || prev_codepoint == '.')) - return -1; - } - int64_t start_index = index; - - const int NUM_CLUSTERS = 4; - for (int cluster = 0; cluster < NUM_CLUSTERS; cluster++) { - for (int digits = 0; digits < 3; digits++) { - if (!EAT1(text, &state, index, ~(grapheme & ~0x7F) && isdigit((char)grapheme))) { - if (digits == 0) return -1; - break; - } - } - - if (EAT1(text, &state, index, ~(grapheme & ~0x7F) && isdigit((char)grapheme))) - return -1; // Too many digits - - if (cluster == NUM_CLUSTERS-1) - break; - else if (!EAT1(text, &state, index, grapheme == '.')) - return -1; - } - return (index - start_index); -} - -int64_t match_ip(Text_t text, int64_t index) -{ - int64_t len = match_ipv6(text, index); - if (len >= 0) return len; - len = match_ipv4(text, index); - return (len >= 0) ? len : -1; -} - -int64_t match_uri(Text_t text, int64_t index) -{ - // URI = scheme ":" ["//" authority] path ["?" query] ["#" fragment] - // scheme = [a-zA-Z] [a-zA-Z0-9+.-] - // authority = [userinfo "@"] host [":" port] - - text_iter_t state = {0, 0}; - if (index > 0) { - int32_t prev_codepoint = _get_grapheme(text, &state, index - 1); - if (uc_is_property_alphabetic(MAIN_GRAPHEME_CODEPOINT(prev_codepoint))) - return -1; - } - - int64_t start_index = index; - - // Scheme: - if (!EAT1(text, &state, index, isalpha(grapheme))) - return -1; - - EAT_MANY(text, &state, index, - !(grapheme & ~0x7F) && (isalnum(grapheme) || grapheme == '+' || grapheme == '.' 
|| grapheme == '-')); - - if (index == start_index) - return -1; - - if (!match_grapheme(text, &index, ':')) - return -1; - - // Authority: - if (match_str(text, &index, "//")) { - int64_t authority_start = index; - // Username or host: - static const char *forbidden = "#?:@ \t\r\n<>[]{}\\^|\"`/"; - if (EAT_MANY(text, &state, index, (grapheme & ~0x7F) || !strchr(forbidden, (char)grapheme)) == 0) - return -1; - - if (EAT1(text, &state, index, grapheme == '@')) { - // Found a username, now get a host: - if (EAT_MANY(text, &state, index, (grapheme & ~0x7F) || !strchr(forbidden, (char)grapheme)) == 0) - return -1; - } else { - int64_t ip = authority_start; - int64_t ipv4_len = match_ipv4(text, ip); - if (ipv4_len > 0) { - ip += ipv4_len; - } else if (match_grapheme(text, &ip, '[')) { - ip += match_ipv6(text, ip); - if (ip > authority_start + 1 && match_grapheme(text, &ip, ']')) - index = ip; - } - } - - // Port: - if (EAT1(text, &state, index, grapheme == ':')) { - if (EAT_MANY(text, &state, index, !(grapheme & ~0x7F) && isdigit(grapheme)) == 0) - return -1; - } - if (!EAT1(text, &state, index, grapheme == '/')) - return (index - start_index); // No path - } else { - // Optional path root: - EAT1(text, &state, index, grapheme == '/'); - } - - // Path: - static const char *non_path = " \"#?<>[]{}\\^`|"; - EAT_MANY(text, &state, index, (grapheme & ~0x7F) || !strchr(non_path, (char)grapheme)); - - if (EAT1(text, &state, index, grapheme == '?')) { // Query - static const char *non_query = " \"#<>[]{}\\^`|"; - EAT_MANY(text, &state, index, (grapheme & ~0x7F) || !strchr(non_query, (char)grapheme)); - } - - if (EAT1(text, &state, index, grapheme == '#')) { // Fragment - static const char *non_fragment = " \"#<>[]{}\\^`|"; - EAT_MANY(text, &state, index, (grapheme & ~0x7F) || !strchr(non_fragment, (char)grapheme)); - } - return index - start_index; -} - -int64_t match_url(Text_t text, int64_t index) -{ - int64_t lookahead = index; - if (!(match_str(text, &lookahead, "https:") 
- || match_str(text, &lookahead, "http:") - || match_str(text, &lookahead, "ftp:") - || match_str(text, &lookahead, "wss:") - || match_str(text, &lookahead, "ws:"))) - return -1; - - return match_uri(text, index); -} - -int64_t match_id(Text_t text, int64_t index) -{ - text_iter_t state = {0, 0}; - if (!EAT1(text, &state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_XID_START))) - return -1; - return 1 + EAT_MANY(text, &state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_XID_CONTINUE)); -} - -int64_t match_int(Text_t text, int64_t index) -{ - text_iter_t state = {0, 0}; - int64_t len = EAT_MANY(text, &state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT)); - return len >= 0 ? len : -1; -} - -int64_t match_num(Text_t text, int64_t index) -{ - text_iter_t state = {0, 0}; - bool negative = EAT1(text, &state, index, grapheme == '-') ? 1 : 0; - int64_t pre_decimal = EAT_MANY(text, &state, index, - uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT)); - bool decimal = (EAT1(text, &state, index, grapheme == '.') == 1); - int64_t post_decimal = decimal ? EAT_MANY(text, &state, index, - uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT)) : 0; - if (pre_decimal == 0 && post_decimal == 0) - return -1; - return negative + pre_decimal + decimal + post_decimal; -} - -int64_t match_newline(Text_t text, int64_t index) -{ - if (index >= text.length) - return -1; - - text_iter_t state = {0, 0}; - ucs4_t grapheme = index >= text.length ? 
0 : _get_main_grapheme(text, &state, index); - if (grapheme == '\n') - return 1; - if (grapheme == '\r' && _get_grapheme(text, &state, index + 1) == '\n') - return 2; - return -1; -} - -typedef struct { - int64_t index, length; - bool occupied, recursive; -} capture_t; - -typedef struct { - enum { PAT_START, PAT_END, PAT_ANY, PAT_GRAPHEME, PAT_PROPERTY, PAT_QUOTE, PAT_PAIR, PAT_FUNCTION } tag; - bool negated, non_capturing; - int64_t min, max; - union { - int32_t grapheme; - uc_property_t property; - int64_t (*fn)(Text_t, int64_t); - int32_t quote_graphemes[2]; - int32_t pair_graphemes[2]; - }; -} pat_t; - -int64_t match_pat(Text_t text, text_iter_t *state, int64_t index, pat_t pat) -{ - int32_t grapheme = index >= text.length ? 0 : _get_grapheme(text, state, index); - - switch (pat.tag) { - case PAT_START: { - if (index == 0) - return pat.negated ? -1 : 0; - return pat.negated ? 0 : -1; - } - case PAT_END: { - if (index >= text.length) - return pat.negated ? -1 : 0; - return pat.negated ? 0 : -1; - } - case PAT_ANY: { - assert(!pat.negated); - return (index < text.length) ? 1 : -1; - } - case PAT_GRAPHEME: { - if (index >= text.length) - return -1; - else if (grapheme == pat.grapheme) - return pat.negated ? -1 : 1; - return pat.negated ? 1 : -1; - } - case PAT_PROPERTY: { - if (index >= text.length) - return -1; - else if (uc_is_property((ucs4_t)grapheme, pat.property)) - return pat.negated ? -1 : 1; - return pat.negated ? 1 : -1; - } - case PAT_PAIR: { - // Nested punctuation: (?), [?], etc - if (index >= text.length) - return -1; - - int32_t open = pat.pair_graphemes[0]; - if (grapheme != open) - return pat.negated ? 1 : -1; - - int32_t close = pat.pair_graphemes[1]; - int64_t depth = 1; - int64_t match_len = 1; - for (; depth > 0; match_len++) { - if (index + match_len >= text.length) - return pat.negated ? 
1 : -1; - - int32_t c = _get_grapheme(text, state, index + match_len); - if (c == open) - depth += 1; - else if (c == close) - depth -= 1; - } - return pat.negated ? -1 : match_len; - } - case PAT_QUOTE: { - // Nested quotes: "?", '?', etc - if (index >= text.length) - return -1; - - int32_t open = pat.quote_graphemes[0]; - if (grapheme != open) - return pat.negated ? 1 : -1; - - int32_t close = pat.quote_graphemes[1]; - for (int64_t i = index + 1; i < text.length; i++) { - int32_t c = _get_grapheme(text, state, i); - if (c == close) { - return pat.negated ? -1 : (i - index) + 1; - } else if (c == '\\' && index + 1 < text.length) { - i += 1; // Skip ahead an extra step - } - } - return pat.negated ? 1 : -1; - } - case PAT_FUNCTION: { - int64_t match_len = pat.fn(text, index); - if (match_len >= 0) - return pat.negated ? -1 : match_len; - return pat.negated ? 1 : -1; - } - default: errx(1, "Invalid pattern"); - } - errx(1, "Unreachable"); -} - -pat_t parse_next_pat(Text_t pattern, text_iter_t *state, int64_t *index) -{ - if (EAT2(pattern, state, *index, - uc_is_property((ucs4_t)grapheme, UC_PROPERTY_QUOTATION_MARK), - grapheme == '?')) { - // Quotations: "?", '?', etc - int32_t open = _get_grapheme(pattern, state, *index-2); - int32_t close = open; - uc_mirror_char((ucs4_t)open, (ucs4_t*)&close); - if (!match_grapheme(pattern, index, close)) - fail("Pattern's closing quote is missing: %k", &pattern); - - return (pat_t){ - .tag=PAT_QUOTE, - .min=1, .max=1, - .quote_graphemes={open, close}, - }; - } else if (EAT2(pattern, state, *index, - uc_is_property((ucs4_t)grapheme, UC_PROPERTY_PAIRED_PUNCTUATION), - grapheme == '?')) { - // Nested punctuation: (?), [?], etc - int32_t open = _get_grapheme(pattern, state, *index-2); - int32_t close = open; - uc_mirror_char((ucs4_t)open, (ucs4_t*)&close); - if (!match_grapheme(pattern, index, close)) - fail("Pattern's closing brace is missing: %k", &pattern); - - return (pat_t){ - .tag=PAT_PAIR, - .min=1, .max=1, - 
.pair_graphemes={open, close}, - }; - } else if (EAT1(pattern, state, *index, - grapheme == '{')) { // named patterns {id}, {2-3 hex}, etc. - skip_whitespace(pattern, index); - int64_t min, max; - if (uc_is_digit((ucs4_t)_get_grapheme(pattern, state, *index))) { - min = parse_int(pattern, index); - skip_whitespace(pattern, index); - if (match_grapheme(pattern, index, '+')) { - max = INT64_MAX; - } else if (match_grapheme(pattern, index, '-')) { - max = parse_int(pattern, index); - } else { - max = min; - } - if (min > max) fail("Minimum repetitions (%ld) is less than the maximum (%ld)", min, max); - } else { - min = -1, max = -1; - } - - skip_whitespace(pattern, index); - - bool negated = match_grapheme(pattern, index, '!'); -#define PAT(_tag, ...) ((pat_t){.min=min, .max=max, .negated=negated, .tag=_tag, __VA_ARGS__}) - const char *prop_name; - if (match_str(pattern, index, "..")) - prop_name = ".."; - else - prop_name = get_property_name(pattern, index); - - if (!prop_name) { - // Literal character, e.g. 
{1?} - skip_whitespace(pattern, index); - int32_t grapheme = _get_grapheme(pattern, state, (*index)++); - if (!match_grapheme(pattern, index, '}')) - fail("Missing closing '}' in pattern: %k", &pattern); - return PAT(PAT_GRAPHEME, .grapheme=grapheme); - } else if (strlen(prop_name) == 1) { - // Single letter names: {1+ A} - skip_whitespace(pattern, index); - if (!match_grapheme(pattern, index, '}')) - fail("Missing closing '}' in pattern: %k", &pattern); - return PAT(PAT_GRAPHEME, .grapheme=prop_name[0]); - } - - skip_whitespace(pattern, index); - if (!match_grapheme(pattern, index, '}')) - fail("Missing closing '}' in pattern: %k", &pattern); - - switch (tolower(prop_name[0])) { - case '.': - if (prop_name[1] == '.') { - if (negated) - return ((pat_t){.tag=PAT_END, .min=min, .max=max, .non_capturing=true}); - else - return PAT(PAT_ANY); - } - break; - case 'd': - if (strcasecmp(prop_name, "digit") == 0) { - return PAT(PAT_PROPERTY, .property=UC_PROPERTY_DECIMAL_DIGIT); - } - break; - case 'e': - if (strcasecmp(prop_name, "end") == 0) { - return PAT(PAT_END, .non_capturing=!negated); - } else if (strcasecmp(prop_name, "email") == 0) { - return PAT(PAT_FUNCTION, .fn=match_email); - } else if (strcasecmp(prop_name, "emoji") == 0) { - return PAT(PAT_PROPERTY, .property=UC_PROPERTY_EMOJI); - } - break; - case 'i': - if (strcasecmp(prop_name, "id") == 0) { - return PAT(PAT_FUNCTION, .fn=match_id); - } else if (strcasecmp(prop_name, "int") == 0) { - return PAT(PAT_FUNCTION, .fn=match_int); - } else if (strcasecmp(prop_name, "ipv4") == 0) { - return PAT(PAT_FUNCTION, .fn=match_ipv4); - } else if (strcasecmp(prop_name, "ipv6") == 0) { - return PAT(PAT_FUNCTION, .fn=match_ipv6); - } else if (strcasecmp(prop_name, "ip") == 0) { - return PAT(PAT_FUNCTION, .fn=match_ip); - } - break; - case 'n': - if (strcasecmp(prop_name, "nl") == 0 || strcasecmp(prop_name, "newline") == 0 - || strcasecmp(prop_name, "crlf")) { - return PAT(PAT_FUNCTION, .fn=match_newline); - } else if 
(strcasecmp(prop_name, "num") == 0) { - return PAT(PAT_FUNCTION, .fn=match_num); - } - break; - case 's': - if (strcasecmp(prop_name, "start") == 0) { - return PAT(PAT_START, .non_capturing=!negated); - } - break; - case 'u': - if (strcasecmp(prop_name, "uri") == 0) { - return PAT(PAT_FUNCTION, .fn=match_uri); - } else if (strcasecmp(prop_name, "url") == 0) { - return PAT(PAT_FUNCTION, .fn=match_url); - } - break; - default: break; - } - - uc_property_t prop = uc_property_byname(prop_name); - if (uc_property_is_valid(prop)) - return PAT(PAT_PROPERTY, .property=prop); - - ucs4_t grapheme = unicode_name_character(prop_name); - if (grapheme == UNINAME_INVALID) - fail("Not a valid property or character name: %s", prop_name); - return PAT(PAT_GRAPHEME, .grapheme=(int32_t)grapheme); -#undef PAT - } else { - return (pat_t){.tag=PAT_GRAPHEME, .non_capturing=true, .min=1, .max=1, .grapheme=_get_grapheme(pattern, state, (*index)++)}; - } -} - -int64_t match(Text_t text, int64_t text_index, Pattern_t pattern, int64_t pattern_index, capture_t *captures, int64_t capture_index) -{ - if (pattern_index >= pattern.length) // End of the pattern - return 0; - - int64_t start_index = text_index; - text_iter_t pattern_state = {0, 0}, text_state = {0, 0}; - pat_t pat = parse_next_pat(pattern, &pattern_state, &pattern_index); - - if (pat.min == -1 && pat.max == -1) { - if (pat.tag == PAT_ANY && pattern_index >= pattern.length) { - pat.min = pat.max = MAX(1, text.length - text_index); - } else { - pat.min = 1; - pat.max = INT64_MAX; - } - } - - int64_t capture_start = text_index; - int64_t count = 0, capture_len = 0, next_match_len = 0; - - if (pat.tag == PAT_ANY && pattern_index >= pattern.length) { - int64_t remaining = text.length - text_index; - capture_len = remaining >= pat.min ? 
MIN(remaining, pat.max) : -1; - text_index += capture_len; - goto success; - } - - if (pat.min == 0 && pattern_index < pattern.length) { - next_match_len = match(text, text_index, pattern, pattern_index, captures, capture_index + (pat.non_capturing ? 0 : 1)); - if (next_match_len >= 0) { - capture_len = 0; - goto success; - } - } - - while (count < pat.max) { - int64_t match_len = match_pat(text, &text_state, text_index, pat); - if (match_len < 0) - break; - capture_len += match_len; - text_index += match_len; - count += 1; - - if (pattern_index < pattern.length) { // More stuff after this - if (count < pat.min) - next_match_len = -1; - else - next_match_len = match(text, text_index, pattern, pattern_index, captures, capture_index + (pat.non_capturing ? 0 : 1)); - } else { - next_match_len = 0; - } - - if (match_len == 0) { - if (next_match_len >= 0) { - // If we're good to go, no need to keep re-matching zero-length - // matches till we hit max: - count = pat.max; - break; - } else { - return -1; - } - } - - if (pattern_index < pattern.length && next_match_len >= 0) - break; // Next guy exists and wants to stop here - - if (text_index >= text.length) - break; - } - - if (count < pat.min || next_match_len < 0) - return -1; - - success: - if (captures && capture_index < MAX_BACKREFS && !pat.non_capturing) { - if (pat.tag == PAT_PAIR || pat.tag == PAT_QUOTE) { - assert(capture_len > 0); - captures[capture_index] = (capture_t){ - .index=capture_start + 1, // Skip leading quote/paren - .length=capture_len - 2, // Skip open/close - .occupied=true, - .recursive=(pat.tag == PAT_PAIR), - }; - } else { - captures[capture_index] = (capture_t){ - .index=capture_start, - .length=capture_len, - .occupied=true, - .recursive=false, - }; - } - } - return (text_index - start_index) + next_match_len; -} - -#undef EAT1 -#undef EAT2 -#undef EAT_MANY - -static int64_t _find(Text_t text, Pattern_t pattern, int64_t first, int64_t last, int64_t *match_length) -{ - int32_t first_grapheme = 
get_grapheme(pattern, 0); - bool find_first = (first_grapheme != '{' - && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK) - && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); - - text_iter_t text_state = {0, 0}; - - for (int64_t i = first; i <= last; i++) { - // Optimization: quickly skip ahead to first char in pattern: - if (find_first) { - while (i < text.length && _get_grapheme(text, &text_state, i) != first_grapheme) - ++i; - } - - int64_t m = match(text, i, pattern, 0, NULL, 0); - if (m >= 0) { - if (match_length) - *match_length = m; - return i; - } - } - if (match_length) - *match_length = -1; - return -1; -} - -public Int_t Text$find(Text_t text, Pattern_t pattern, Int_t from_index, int64_t *match_length) -{ - int64_t first = Int_to_Int64(from_index, false); - if (first == 0) fail("Invalid index: 0"); - if (first < 0) first = text.length + first + 1; - if (first > text.length || first < 1) - return I(0); - int64_t found = _find(text, pattern, first-1, text.length-1, match_length); - return I(found+1); -} - -PUREFUNC public bool Text$has(Text_t text, Pattern_t pattern) -{ - if (Text$starts_with(pattern, Text("{start}"))) { - int64_t m = match(text, 0, pattern, 0, NULL, 0); - return m >= 0; - } else if (Text$ends_with(text, Text("{end}"))) { - for (int64_t i = text.length-1; i >= 0; i--) { - int64_t match_len = match(text, i, pattern, 0, NULL, 0); - if (match_len >= 0 && i + match_len == text.length) - return true; - } - return false; - } else { - int64_t found = _find(text, pattern, 0, text.length-1, NULL); - return (found >= 0); - } -} - -PUREFUNC public bool Text$matches(Text_t text, Pattern_t pattern) -{ - int64_t m = match(text, 0, pattern, 0, NULL, 0); - return m == text.length; -} - public int printf_text_size(const struct printf_info *info, size_t n, int argtypes[n], int sizes[n]) { if (n < 1) return -1; @@ -1835,9 +1037,9 @@ static inline Text_t _quoted(Text_t text, bool colorize, char quote_char) 
add_char(quote_char); #define add_escaped(str) ({ if (colorize) add_str("\x1b[34;1m"); add_char('\\'); add_str(str); if (colorize) add_str("\x1b[0;35m"); }) - text_iter_t state = {0, 0}; + TextIter_t state = {0, 0}; for (int64_t i = 0; i < text.length; i++) { - int32_t g = _get_grapheme(text, &state, i); + int32_t g = Text$get_grapheme_fast(text, &state, i); switch (g) { case '\a': add_escaped("a"); break; case '\b': add_escaped("b"); break; @@ -1904,263 +1106,6 @@ public Text_t Text$quoted(Text_t text, bool colorize) return _quoted(text, colorize, '"'); } -public Array_t Text$find_all(Text_t text, Pattern_t pattern) -{ - if (pattern.length == 0) // special case - return (Array_t){.length=0}; - - Array_t matches = {}; - - for (int64_t i = 0; ; ) { - int64_t len = 0; - int64_t found = _find(text, pattern, i, text.length-1, &len); - if (found < 0) break; - Text_t match = Text$slice(text, I(found+1), I(found + len)); - Array$insert(&matches, &match, I_small(0), sizeof(Text_t)); - i = found + MAX(len, 1); - } - - return matches; -} - -static Text_t apply_backrefs(Text_t text, Pattern_t original_pattern, Text_t replacement, Pattern_t backref_pat, capture_t *captures) -{ - if (backref_pat.length == 0) - return replacement; - - int32_t first_grapheme = get_grapheme(backref_pat, 0); - bool find_first = (first_grapheme != '{' - && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK) - && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); - - Text_t ret = Text(""); - text_iter_t state = {0, 0}; - int64_t nonmatching_pos = 0; - for (int64_t pos = 0; pos < replacement.length; ) { - // Optimization: quickly skip ahead to first char in the backref pattern: - if (find_first) { - while (pos < replacement.length && _get_grapheme(replacement, &state, pos) != first_grapheme) - ++pos; - } - - int64_t backref_len = match(replacement, pos, backref_pat, 0, NULL, 0); - if (backref_len < 0) { - pos += 1; - continue; - } - - int64_t after_backref = pos 
+ backref_len; - int64_t backref = parse_int(replacement, &after_backref); - if (after_backref == pos + backref_len) { // Not actually a backref if there's no number - pos += 1; - continue; - } - if (backref < 0 || backref > 9) fail("Invalid backref index: %ld (only 0-%d are allowed)", backref, MAX_BACKREFS-1); - backref_len = (after_backref - pos); - - if (_get_grapheme(replacement, &state, pos + backref_len) == ';') - backref_len += 1; // skip optional semicolon - - if (!captures[backref].occupied) - fail("There is no capture number %ld!", backref); - - Text_t backref_text = Text$slice(text, I(captures[backref].index+1), I(captures[backref].index + captures[backref].length)); - - if (captures[backref].recursive && original_pattern.length > 0) - backref_text = Text$replace(backref_text, original_pattern, replacement, backref_pat, true); - - if (pos > nonmatching_pos) { - Text_t before_slice = Text$slice(replacement, I(nonmatching_pos+1), I(pos)); - ret = Text$concat(ret, before_slice, backref_text); - } else { - ret = concat2(ret, backref_text); - } - - pos += backref_len; - nonmatching_pos = pos; - } - if (nonmatching_pos < replacement.length) { - Text_t last_slice = Text$slice(replacement, I(nonmatching_pos+1), I(replacement.length)); - ret = concat2(ret, last_slice); - } - return ret; -} - -public Text_t Text$replace(Text_t text, Pattern_t pattern, Text_t replacement, Pattern_t backref_pat, bool recursive) -{ - Text_t ret = {.length=0}; - - int32_t first_grapheme = get_grapheme(pattern, 0); - bool find_first = (first_grapheme != '{' - && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK) - && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); - - text_iter_t text_state = {0, 0}; - int64_t nonmatching_pos = 0; - for (int64_t pos = 0; pos < text.length; ) { - // Optimization: quickly skip ahead to first char in pattern: - if (find_first) { - while (pos < text.length && _get_grapheme(text, &text_state, pos) != 
first_grapheme) - ++pos; - } - - capture_t captures[MAX_BACKREFS] = {}; - int64_t match_len = match(text, pos, pattern, 0, captures, 1); - if (match_len < 0) { - pos += 1; - continue; - } - captures[0] = (capture_t){ - .index = pos, .length = match_len, - .occupied = true, .recursive = false, - }; - - Text_t replacement_text = apply_backrefs(text, recursive ? pattern : Text(""), replacement, backref_pat, captures); - if (pos > nonmatching_pos) { - Text_t before_slice = Text$slice(text, I(nonmatching_pos+1), I(pos)); - ret = Text$concat(ret, before_slice, replacement_text); - } else { - ret = concat2(ret, replacement_text); - } - nonmatching_pos = pos + match_len; - pos += MAX(match_len, 1); - } - if (nonmatching_pos < text.length) { - Text_t last_slice = Text$slice(text, I(nonmatching_pos+1), I(text.length)); - ret = concat2(ret, last_slice); - } - return ret; -} - -public Text_t Text$trim(Text_t text, Pattern_t pattern, bool trim_left, bool trim_right) -{ - int64_t first = 0, last = text.length-1; - if (trim_left) { - int64_t match_len = match(text, 0, pattern, 0, NULL, 0); - if (match_len > 0) - first = match_len; - } - - if (trim_right) { - for (int64_t i = text.length-1; i >= first; i--) { - int64_t match_len = match(text, i, pattern, 0, NULL, 0); - if (match_len > 0 && i + match_len == text.length) - last = i-1; - } - } - return Text$slice(text, I(first+1), I(last+1)); -} - -public Text_t Text$map(Text_t text, Pattern_t pattern, Closure_t fn) -{ - Text_t ret = {.length=0}; - - int32_t first_grapheme = get_grapheme(pattern, 0); - bool find_first = (first_grapheme != '{' - && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK) - && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); - - text_iter_t text_state = {0, 0}; - int64_t nonmatching_pos = 0; - - Text_t (*text_mapper)(Text_t, void*) = fn.fn; - for (int64_t pos = 0; pos < text.length; pos++) { - // Optimization: quickly skip ahead to first char in pattern: - if 
(find_first) { - while (pos < text.length && _get_grapheme(text, &text_state, pos) != first_grapheme) - ++pos; - } - - int64_t match_len = match(text, pos, pattern, 0, NULL, 0); - if (match_len < 0) continue; - - Text_t replacement = text_mapper(Text$slice(text, I(pos+1), I(pos+match_len)), fn.userdata); - if (pos > nonmatching_pos) { - Text_t before_slice = Text$slice(text, I(nonmatching_pos+1), I(pos)); - ret = Text$concat(ret, before_slice, replacement); - } else { - ret = concat2(ret, replacement); - } - nonmatching_pos = pos + match_len; - pos += (match_len - 1); - } - if (nonmatching_pos < text.length) { - Text_t last_slice = Text$slice(text, I(nonmatching_pos+1), I(text.length)); - ret = concat2(ret, last_slice); - } - return ret; -} - -public Text_t Text$replace_all(Text_t text, Table_t replacements, Text_t backref_pat, bool recursive) -{ - if (replacements.entries.length == 0) return text; - - Text_t ret = {.length=0}; - - int64_t nonmatch_pos = 0; - for (int64_t pos = 0; pos < text.length; ) { - // Find the first matching pattern at this position: - for (int64_t i = 0; i < replacements.entries.length; i++) { - Pattern_t pattern = *(Pattern_t*)(replacements.entries.data + i*replacements.entries.stride); - capture_t captures[MAX_BACKREFS] = {}; - int64_t len = match(text, pos, pattern, 0, captures, 1); - if (len < 0) continue; - captures[0].index = pos; - captures[0].length = len; - - // If we skipped over some non-matching text before finding a match, insert it here: - if (pos > nonmatch_pos) { - Text_t before_slice = Text$slice(text, I(nonmatch_pos+1), I(pos)); - ret = concat2(ret, before_slice); - } - - // Concatenate the replacement: - Text_t replacement = *(Text_t*)(replacements.entries.data + i*replacements.entries.stride + sizeof(Text_t)); - Text_t replacement_text = apply_backrefs(text, recursive ? 
pattern : Text(""), replacement, backref_pat, captures); - ret = concat2(ret, replacement_text); - pos += MAX(len, 1); - nonmatch_pos = pos; - goto next_pos; - } - - pos += 1; - next_pos: - continue; - } - - if (nonmatch_pos <= text.length) { - Text_t last_slice = Text$slice(text, I(nonmatch_pos+1), I(text.length)); - ret = concat2(ret, last_slice); - } - return ret; -} - -public Array_t Text$split(Text_t text, Pattern_t pattern) -{ - if (text.length == 0) // special case - return (Array_t){.length=0}; - - if (pattern.length == 0) // special case - return Text$clusters(text); - - Array_t chunks = {}; - - Int_t i = I_small(1); - for (;;) { - int64_t len = 0; - Int_t found = Text$find(text, pattern, i, &len); - if (I_is_zero(found)) break; - Text_t chunk = Text$slice(text, i, Int$minus(found, I_small(1))); - Array$insert(&chunks, &chunk, I_small(0), sizeof(Text_t)); - i = Int$plus(found, I(MAX(len, 1))); - } - - Text_t last_chunk = Text$slice(text, i, I(text.length)); - Array$insert(&chunks, &last_chunk, I_small(0), sizeof(Text_t)); - - return chunks; -} - public Text_t Text$join(Text_t glue, Array_t pieces) { if (pieces.length == 0) return (Text_t){.length=0}; @@ -2210,9 +1155,9 @@ public Array_t Text$clusters(Text_t text) public Array_t Text$utf32_codepoints(Text_t text) { Array_t codepoints = {.atomic=1}; - text_iter_t state = {0, 0}; + TextIter_t state = {0, 0}; for (int64_t i = 0; i < text.length; i++) { - int32_t grapheme = _get_grapheme(text, &state, i); + int32_t grapheme = Text$get_grapheme_fast(text, &state, i); if (grapheme < 0) { for (int64_t c = 0; c < NUM_GRAPHEME_CODEPOINTS(grapheme); c++) { ucs4_t subg = GRAPHEME_CODEPOINTS(grapheme)[c]; @@ -2245,9 +1190,9 @@ static inline const char *codepoint_name(ucs4_t c) public Array_t Text$codepoint_names(Text_t text) { Array_t names = {}; - text_iter_t state = {0, 0}; + TextIter_t state = {0, 0}; for (int64_t i = 0; i < text.length; i++) { - int32_t grapheme = _get_grapheme(text, &state, i); + int32_t grapheme 
= Text$get_grapheme_fast(text, &state, i); if (grapheme < 0) { for (int64_t c = 0; c < NUM_GRAPHEME_CODEPOINTS(grapheme); c++) { const char *name = codepoint_name(GRAPHEME_CODEPOINTS(grapheme)[c]); @@ -2297,10 +1242,10 @@ public Text_t Text$from_bytes(Array_t bytes) public Array_t Text$lines(Text_t text) { Array_t lines = {}; - text_iter_t state = {0, 0}; + TextIter_t state = {0, 0}; for (int64_t i = 0, line_start = 0; i < text.length; i++) { - int32_t grapheme = _get_grapheme(text, &state, i); - if (grapheme == '\r' && _get_grapheme(text, &state, i + 1) == '\n') { // CRLF + int32_t grapheme = Text$get_grapheme_fast(text, &state, i); + if (grapheme == '\r' && Text$get_grapheme_fast(text, &state, i + 1) == '\n') { // CRLF Text_t line = Text$slice(text, I(line_start+1), I(i)); Array$insert(&lines, &line, I_small(0), sizeof(Text_t)); i += 1; // skip one extra for CR @@ -2330,9 +1275,9 @@ public Pattern_t Pattern$escape_text(Text_t text) Array_t graphemes = {.atomic=1}; #define add_char(c) Array$insert_value(&graphemes, (ucs4_t)c, I_small(0), sizeof(ucs4_t)) #define add_str(s) ({ for (const char *_c = s; *_c; ++_c) Array$insert_value(&graphemes, (ucs4_t)*_c, I_small(0), sizeof(ucs4_t)); }) - text_iter_t state = {0, 0}; + TextIter_t state = {0, 0}; for (int64_t i = 0; i < text.length; i++) { - int32_t g = _get_grapheme(text, &state, i); + int32_t g = Text$get_grapheme_fast(text, &state, i); ucs4_t g0 = g < 0 ? 
GRAPHEME_CODEPOINTS(g)[0] : (ucs4_t)g; if (g == '{') { diff --git a/builtins/text.h b/builtins/text.h index 2e58ad6..e5a7b70 100644 --- a/builtins/text.h +++ b/builtins/text.h @@ -6,11 +6,16 @@ #include #include #include +#include #include "datatypes.h" #include "integers.h" #include "types.h" +typedef struct { + int64_t subtext, sum_of_previous_subtexts; +} TextIter_t; + int printf_text(FILE *stream, const struct printf_info *info, const void *const args[]); int printf_text_size(const struct printf_info *info, size_t n, int argtypes[n], int sizes[n]); @@ -34,16 +39,8 @@ Text_t Text$lower(Text_t text); Text_t Text$title(Text_t text); Text_t Text$as_text(const void *text, bool colorize, const TypeInfo *info); Text_t Text$quoted(Text_t str, bool colorize); -Text_t Text$replace(Text_t str, Pattern_t pat, Text_t replacement, Pattern_t backref_pat, bool recursive); -Text_t Text$replace_all(Text_t text, Table_t replacements, Pattern_t backref_pat, bool recursive); -Array_t Text$split(Text_t text, Pattern_t pattern); -Text_t Text$trim(Text_t text, Pattern_t pattern, bool trim_left, bool trim_right); -Int_t Text$find(Text_t text, Pattern_t pattern, Int_t i, int64_t *match_length); -Array_t Text$find_all(Text_t text, Pattern_t pattern); -PUREFUNC bool Text$has(Text_t text, Pattern_t pattern); PUREFUNC bool Text$starts_with(Text_t text, Text_t prefix); PUREFUNC bool Text$ends_with(Text_t text, Text_t suffix); -PUREFUNC bool Text$matches(Text_t text, Pattern_t pattern); char *Text$as_c_string(Text_t text); __attribute__((format(printf, 1, 2))) public Text_t Text$format(const char *fmt, ...); @@ -56,19 +53,16 @@ Text_t Text$from_codepoint_names(Array_t codepoint_names); Text_t Text$from_bytes(Array_t bytes); Array_t Text$lines(Text_t text); Text_t Text$join(Text_t glue, Array_t pieces); -Text_t Text$map(Text_t text, Pattern_t pattern, Closure_t fn); Text_t Text$repeat(Text_t text, Int_t count); +int32_t Text$get_grapheme_fast(Text_t text, TextIter_t *state, int64_t index); 
+ucs4_t Text$get_main_grapheme_fast(Text_t text, TextIter_t *state, int64_t index); + +static inline int32_t Text$get_grapheme(Text_t text, int64_t index) +{ + TextIter_t state = {0, 0}; + return Text$get_grapheme_fast(text, &state, index); +} extern const TypeInfo Text$info; -#define Pattern(text) ((Pattern_t)Text(text)) -#define Patterns(...) ((Pattern_t)Texts(__VA_ARGS__)) -Pattern_t Pattern$escape_text(Text_t text); - -#define Pattern$hash Text$hash -#define Pattern$compare Text$compare -#define Pattern$equal Text$equal - -extern const TypeInfo Pattern$info; - // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/tomo.h b/builtins/tomo.h index 9354cca..c674ffe 100644 --- a/builtins/tomo.h +++ b/builtins/tomo.h @@ -22,6 +22,7 @@ #include "nums.h" #include "optionals.h" #include "path.h" +#include "pattern.h" #include "pointer.h" #include "range.h" #include "shell.h" diff --git a/docs/text.md b/docs/text.md index 8a9f641..18960b0 100644 --- a/docs/text.md +++ b/docs/text.md @@ -255,9 +255,11 @@ finding the value because the two texts are equivalent under normalization. # Patterns As an alternative to full regular expressions, Tomo provides a limited string -matching pattern syntax that is intended to solve 80% of use cases in 2% of the -code size (PCRE's codebase is roughly 150k lines of code, and Tomo's entire -Text codebase is around 1.8K lines of code). +matching pattern syntax that is intended to solve 80% of use cases in under 1% +of the code size (PCRE's codebase is roughly 150k lines of code, and Tomo's +pattern matching code is a bit under 1k lines of code). Tomo's pattern matching +syntax is highly readable and works well for matching literal text without +getting [leaning toothpick syndrome](https://en.wikipedia.org/wiki/Leaning_toothpick_syndrome). For more advanced use cases, consider linking against a C library for regular expressions or pattern matching.