about | summary | refs | log | tree | commit | diff
path: root/stdlib/patterns.c
diff options
context:
space:
mode:
author: Bruce Hill <bruce@bruce-hill.com> 2024-09-14 00:12:52 -0400
committer: Bruce Hill <bruce@bruce-hill.com> 2024-09-14 00:12:52 -0400
commit: 6012a00763afdd467e71b1657bd9a39a4cba4493 (patch)
tree: 6994a7074b4182211c3beec8553c02f638976b7f /stdlib/patterns.c
parent: 2b0556084919ace0700e4480f7fa2886cf31b3e4 (diff)
Clean up pattern code to make better use of TextIter_t and fix up
URI/URL patterns
Diffstat (limited to 'stdlib/patterns.c')
-rw-r--r-- stdlib/patterns.c 418
1 files changed, 218 insertions, 200 deletions
diff --git a/stdlib/patterns.c b/stdlib/patterns.c
index 81beaffe..2de7fe3a 100644
--- a/stdlib/patterns.c
+++ b/stdlib/patterns.c
@@ -14,32 +14,48 @@
#define MAX_BACKREFS 100
-static inline void skip_whitespace(Text_t text, int64_t *i)
+typedef struct {
+ int64_t index, length;
+ bool occupied, recursive;
+} capture_t;
+
+typedef struct {
+ enum { PAT_START, PAT_END, PAT_ANY, PAT_GRAPHEME, PAT_PROPERTY, PAT_QUOTE, PAT_PAIR, PAT_FUNCTION } tag;
+ bool negated, non_capturing;
+ int64_t min, max;
+ union {
+ int32_t grapheme;
+ uc_property_t property;
+ int64_t (*fn)(TextIter_t *, int64_t);
+ int32_t quote_graphemes[2];
+ int32_t pair_graphemes[2];
+ };
+} pat_t;
+
+static inline void skip_whitespace(TextIter_t *state, int64_t *i)
{
- TextIter_t state = {0, 0};
- while (*i < text.length) {
- int32_t grapheme = Text$get_grapheme_fast(text, &state, *i);
+ while (*i < state->text.length) {
+ int32_t grapheme = Text$get_grapheme_fast(state, *i);
if (grapheme > 0 && !uc_is_property_white_space((ucs4_t)grapheme))
return;
*i += 1;
}
}
-static inline bool match_grapheme(Text_t text, int64_t *i, int32_t grapheme)
+static inline bool match_grapheme(TextIter_t *state, int64_t *i, int32_t grapheme)
{
- if (*i < text.length && Text$get_grapheme(text, *i) == grapheme) {
+ if (*i < state->text.length && Text$get_grapheme_fast(state, *i) == grapheme) {
*i += 1;
return true;
}
return false;
}
-static inline bool match_str(Text_t text, int64_t *i, const char *str)
+static inline bool match_str(TextIter_t *state, int64_t *i, const char *str)
{
- TextIter_t state = {0, 0};
int64_t matched = 0;
while (matched[str]) {
- if (*i + matched >= text.length || Text$get_grapheme_fast(text, &state, *i + matched) != str[matched])
+ if (*i + matched >= state->text.length || Text$get_grapheme_fast(state, *i + matched) != str[matched])
return false;
matched += 1;
}
@@ -47,11 +63,10 @@ static inline bool match_str(Text_t text, int64_t *i, const char *str)
return true;
}
-static inline bool match_property(Text_t text, int64_t *i, uc_property_t prop)
+static inline bool match_property(TextIter_t *state, int64_t *i, uc_property_t prop)
{
- if (*i >= text.length) return false;
- TextIter_t state = {};
- ucs4_t grapheme = Text$get_main_grapheme_fast(text, &state, *i);
+ if (*i >= state->text.length) return false;
+ ucs4_t grapheme = Text$get_main_grapheme_fast(state, *i);
// TODO: check every codepoint in the cluster?
if (uc_is_property(grapheme, prop)) {
*i += 1;
@@ -60,12 +75,11 @@ static inline bool match_property(Text_t text, int64_t *i, uc_property_t prop)
return false;
}
-static int64_t parse_int(Text_t text, int64_t *i)
+static int64_t parse_int(TextIter_t *state, int64_t *i)
{
- TextIter_t state = {0, 0};
int64_t value = 0;
for (;; *i += 1) {
- ucs4_t grapheme = Text$get_main_grapheme_fast(text, &state, *i);
+ ucs4_t grapheme = Text$get_main_grapheme_fast(state, *i);
int digit = uc_digit_value((ucs4_t)grapheme);
if (digit < 0) break;
if (value >= INT64_MAX/10) break;
@@ -74,14 +88,13 @@ static int64_t parse_int(Text_t text, int64_t *i)
return value;
}
-const char *get_property_name(Text_t text, int64_t *i)
+static const char *get_property_name(TextIter_t *state, int64_t *i)
{
- skip_whitespace(text, i);
+ skip_whitespace(state, i);
char *name = GC_MALLOC_ATOMIC(UNINAME_MAX);
char *dest = name;
- TextIter_t state = {0, 0};
- while (*i < text.length) {
- int32_t grapheme = Text$get_grapheme_fast(text, &state, *i);
+ while (*i < state->text.length) {
+ int32_t grapheme = Text$get_grapheme_fast(state, *i);
if (!(grapheme & ~0xFF) && (isalnum(grapheme) || grapheme == ' ' || grapheme == '_' || grapheme == '-')) {
*dest = (char)grapheme;
++dest;
@@ -101,17 +114,17 @@ const char *get_property_name(Text_t text, int64_t *i)
return name;
}
-#define EAT1(text, state, index, cond) ({\
- int32_t grapheme = Text$get_grapheme_fast(text, state, index); \
+#define EAT1(state, index, cond) ({\
+ int32_t grapheme = Text$get_grapheme_fast(state, index); \
bool success = (cond); \
if (success) index += 1; \
success; })
-#define EAT2(text, state, index, cond1, cond2) ({\
- int32_t grapheme = Text$get_grapheme_fast(text, state, index); \
+#define EAT2(state, index, cond1, cond2) ({\
+ int32_t grapheme = Text$get_grapheme_fast(state, index); \
bool success = (cond1); \
if (success) { \
- grapheme = Text$get_grapheme_fast(text, state, index + 1); \
+ grapheme = Text$get_grapheme_fast(state, index + 1); \
success = (cond2); \
if (success) \
index += 2; \
@@ -119,18 +132,17 @@ const char *get_property_name(Text_t text, int64_t *i)
success; })
-#define EAT_MANY(text, state, index, cond) ({ int64_t _n = 0; while (EAT1(text, state, index, cond)) { _n += 1; } _n; })
+#define EAT_MANY(state, index, cond) ({ int64_t _n = 0; while (EAT1(state, index, cond)) { _n += 1; } _n; })
-int64_t match_email(Text_t text, int64_t index)
+static int64_t match_email(TextIter_t *state, int64_t index)
{
// email = local "@" domain
// local = 1-64 ([a-zA-Z0-9!#$%&‘*+–/=?^_`.{|}~] | non-ascii)
// domain = dns-label ("." dns-label)*
// dns-label = 1-63 ([a-zA-Z0-9-] | non-ascii)
- TextIter_t state = {0, 0};
if (index > 0) {
- ucs4_t prev_codepoint = Text$get_main_grapheme_fast(text, &state, index - 1);
+ ucs4_t prev_codepoint = Text$get_main_grapheme_fast(state, index - 1);
if (uc_is_property_alphabetic((ucs4_t)prev_codepoint))
return -1;
}
@@ -140,20 +152,20 @@ int64_t match_email(Text_t text, int64_t index)
// Local part:
int64_t local_len = 0;
static const char *allowed_local = "!#$%&‘*+–/=?^_`.{|}~";
- while (EAT1(text, &state, index,
+ while (EAT1(state, index,
(grapheme & ~0x7F) || isalnum((char)grapheme) || strchr(allowed_local, (char)grapheme))) {
local_len += 1;
if (local_len > 64) return -1;
}
- if (!EAT1(text, &state, index, grapheme == '@'))
+ if (!EAT1(state, index, grapheme == '@'))
return -1;
// Host
int64_t host_len = 0;
do {
int64_t label_len = 0;
- while (EAT1(text, &state, index,
+ while (EAT1(state, index,
(grapheme & ~0x7F) || isalnum((char)grapheme) || grapheme == '-')) {
label_len += 1;
if (label_len > 63) return -1;
@@ -166,16 +178,15 @@ int64_t match_email(Text_t text, int64_t index)
if (host_len > 255)
return -1;
host_len += 1;
- } while (EAT1(text, &state, index, grapheme == '.'));
+ } while (EAT1(state, index, grapheme == '.'));
return index - start_index;
}
-int64_t match_ipv6(Text_t text, int64_t index)
+static int64_t match_ipv6(TextIter_t *state, int64_t index)
{
- TextIter_t state = {0, 0};
if (index > 0) {
- int32_t prev_codepoint = Text$get_grapheme_fast(text, &state, index - 1);
+ int32_t prev_codepoint = Text$get_grapheme_fast(state, index - 1);
if ((prev_codepoint & ~0x7F) && (isxdigit(prev_codepoint) || prev_codepoint == ':'))
return -1;
}
@@ -184,21 +195,21 @@ int64_t match_ipv6(Text_t text, int64_t index)
bool double_colon_used = false;
for (int cluster = 0; cluster < NUM_CLUSTERS; cluster++) {
for (int digits = 0; digits < 4; digits++) {
- if (!EAT1(text, &state, index, ~(grapheme & ~0x7F) && isxdigit((char)grapheme)))
+ if (!EAT1(state, index, ~(grapheme & ~0x7F) && isxdigit((char)grapheme)))
break;
}
- if (EAT1(text, &state, index, ~(grapheme & ~0x7F) && isxdigit((char)grapheme)))
+ if (EAT1(state, index, ~(grapheme & ~0x7F) && isxdigit((char)grapheme)))
return -1; // Too many digits
if (cluster == NUM_CLUSTERS-1) {
break;
- } else if (!EAT1(text, &state, index, grapheme == ':')) {
+ } else if (!EAT1(state, index, grapheme == ':')) {
if (double_colon_used)
break;
return -1;
}
- if (EAT1(text, &state, index, grapheme == ':')) {
+ if (EAT1(state, index, grapheme == ':')) {
if (double_colon_used)
return -1;
double_colon_used = true;
@@ -207,11 +218,10 @@ int64_t match_ipv6(Text_t text, int64_t index)
return index - start_index;
}
-static int64_t match_ipv4(Text_t text, int64_t index)
+static int64_t match_ipv4(TextIter_t *state, int64_t index)
{
- TextIter_t state = {0, 0};
if (index > 0) {
- int32_t prev_codepoint = Text$get_grapheme_fast(text, &state, index - 1);
+ int32_t prev_codepoint = Text$get_grapheme_fast(state, index - 1);
if ((prev_codepoint & ~0x7F) && (isdigit(prev_codepoint) || prev_codepoint == '.'))
return -1;
}
@@ -220,40 +230,86 @@ static int64_t match_ipv4(Text_t text, int64_t index)
const int NUM_CLUSTERS = 4;
for (int cluster = 0; cluster < NUM_CLUSTERS; cluster++) {
for (int digits = 0; digits < 3; digits++) {
- if (!EAT1(text, &state, index, ~(grapheme & ~0x7F) && isdigit((char)grapheme))) {
+ if (!EAT1(state, index, ~(grapheme & ~0x7F) && isdigit((char)grapheme))) {
if (digits == 0) return -1;
break;
}
}
- if (EAT1(text, &state, index, ~(grapheme & ~0x7F) && isdigit((char)grapheme)))
+ if (EAT1(state, index, ~(grapheme & ~0x7F) && isdigit((char)grapheme)))
return -1; // Too many digits
if (cluster == NUM_CLUSTERS-1)
break;
- else if (!EAT1(text, &state, index, grapheme == '.'))
+ else if (!EAT1(state, index, grapheme == '.'))
return -1;
}
return (index - start_index);
}
-int64_t match_ip(Text_t text, int64_t index)
+static int64_t match_ip(TextIter_t *state, int64_t index)
{
- int64_t len = match_ipv6(text, index);
+ int64_t len = match_ipv6(state, index);
if (len >= 0) return len;
- len = match_ipv4(text, index);
+ len = match_ipv4(state, index);
return (len >= 0) ? len : -1;
}
-int64_t match_uri(Text_t text, int64_t index)
+static int64_t match_host(TextIter_t *state, int64_t index)
+{
+ int64_t ip_len = match_ip(state, index);
+ if (ip_len > 0) return ip_len;
+
+ int64_t start_index = index;
+ if (match_grapheme(state, &index, '[')) {
+ ip_len = match_ip(state, index);
+ if (ip_len <= 0) return -1;
+ index += ip_len;
+ if (match_grapheme(state, &index, ']'))
+ return (index - start_index);
+ return -1;
+ }
+
+ if (!EAT1(state, index, isalpha(grapheme)))
+ return -1;
+
+ static const char *non_host_chars = "/#?:@ \t\r\n<>[]{}\\^|\"`";
+ EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_host_chars, (char)grapheme));
+ return (index - start_index);
+}
+
+static int64_t match_authority(TextIter_t *state, int64_t index)
+{
+ int64_t authority_start = index;
+ static const char *non_segment_chars = "/#?:@ \t\r\n<>[]{}\\^|\"`.";
+
+ // Optional user@ prefix:
+ int64_t username_len = EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_segment_chars, (char)grapheme));
+ if (username_len < 1 || !EAT1(state, index, grapheme == '@'))
+ index = authority_start; // No user@ part
+
+ // Host:
+ int64_t host_len = match_host(state, index);
+ if (host_len <= 0) return -1;
+ index += host_len;
+
+ // Port:
+ if (EAT1(state, index, grapheme == ':')) {
+ if (EAT_MANY(state, index, !(grapheme & ~0x7F) && isdigit(grapheme)) == 0)
+ return -1;
+ }
+ return (index - authority_start);
+}
+
+static int64_t match_uri(TextIter_t *state, int64_t index)
{
// URI = scheme ":" ["//" authority] path ["?" query] ["#" fragment]
// scheme = [a-zA-Z] [a-zA-Z0-9+.-]
// authority = [userinfo "@"] host [":" port]
- TextIter_t state = {0, 0};
if (index > 0) {
- ucs4_t prev_codepoint = Text$get_main_grapheme_fast(text, &state, index - 1);
+ // Don't match if we're not at a word edge:
+ ucs4_t prev_codepoint = Text$get_main_grapheme_fast(state, index - 1);
if (uc_is_property_alphabetic(prev_codepoint))
return -1;
}
@@ -261,147 +317,101 @@ int64_t match_uri(Text_t text, int64_t index)
int64_t start_index = index;
// Scheme:
- if (!EAT1(text, &state, index, isalpha(grapheme)))
+ if (!EAT1(state, index, isalpha(grapheme)))
return -1;
-
- EAT_MANY(text, &state, index,
- !(grapheme & ~0x7F) && (isalnum(grapheme) || grapheme == '+' || grapheme == '.' || grapheme == '-'));
-
- if (index == start_index)
- return -1;
-
- if (!match_grapheme(text, &index, ':'))
+ EAT_MANY(state, index, !(grapheme & ~0x7F) && (isalnum(grapheme) || grapheme == '+' || grapheme == '.' || grapheme == '-'));
+ if (!match_grapheme(state, &index, ':'))
return -1;
// Authority:
- if (match_str(text, &index, "//")) {
- int64_t authority_start = index;
- // Username or host:
- static const char *forbidden = "#?:@ \t\r\n<>[]{}\\^|\"`/";
- if (EAT_MANY(text, &state, index, (grapheme & ~0x7F) || !strchr(forbidden, (char)grapheme)) == 0)
- return -1;
+ int64_t authority_len;
+ if (match_str(state, &index, "//")) {
+ authority_len = match_authority(state, index);
+ if (authority_len > 0)
+ index += authority_len;
+ } else {
+ authority_len = 0;
+ }
- if (EAT1(text, &state, index, grapheme == '@')) {
- // Found a username, now get a host:
- if (EAT_MANY(text, &state, index, (grapheme & ~0x7F) || !strchr(forbidden, (char)grapheme)) == 0)
- return -1;
- } else {
- int64_t ip = authority_start;
- int64_t ipv4_len = match_ipv4(text, ip);
- if (ipv4_len > 0) {
- ip += ipv4_len;
- } else if (match_grapheme(text, &ip, '[')) {
- ip += match_ipv6(text, ip);
- if (ip > authority_start + 1 && match_grapheme(text, &ip, ']'))
- index = ip;
- }
+ // Path:
+ int64_t path_start = index;
+ if (EAT1(state, index, grapheme == '/') || authority_len <= 0) {
+ static const char *non_path = " \"#?<>[]{}\\^`|";
+ EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_path, (char)grapheme));
+
+ if (EAT1(state, index, grapheme == '?')) { // Query
+ static const char *non_query = " \"#<>[]{}\\^`|";
+ EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_query, (char)grapheme));
}
-
- // Port:
- if (EAT1(text, &state, index, grapheme == ':')) {
- if (EAT_MANY(text, &state, index, !(grapheme & ~0x7F) && isdigit(grapheme)) == 0)
- return -1;
+
+ if (EAT1(state, index, grapheme == '#')) { // Fragment
+ static const char *non_fragment = " \"#<>[]{}\\^`|";
+ EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_fragment, (char)grapheme));
}
- if (!EAT1(text, &state, index, grapheme == '/'))
- return (index - start_index); // No path
- } else {
- // Optional path root:
- EAT1(text, &state, index, grapheme == '/');
}
- // Path:
- static const char *non_path = " \"#?<>[]{}\\^`|";
- EAT_MANY(text, &state, index, (grapheme & ~0x7F) || !strchr(non_path, (char)grapheme));
+ if (authority_len <= 0 && index == path_start)
+ return -1;
- if (EAT1(text, &state, index, grapheme == '?')) { // Query
- static const char *non_query = " \"#<>[]{}\\^`|";
- EAT_MANY(text, &state, index, (grapheme & ~0x7F) || !strchr(non_query, (char)grapheme));
- }
-
- if (EAT1(text, &state, index, grapheme == '#')) { // Fragment
- static const char *non_fragment = " \"#<>[]{}\\^`|";
- EAT_MANY(text, &state, index, (grapheme & ~0x7F) || !strchr(non_fragment, (char)grapheme));
- }
return index - start_index;
}
-int64_t match_url(Text_t text, int64_t index)
+static int64_t match_url(TextIter_t *state, int64_t index)
{
int64_t lookahead = index;
- if (!(match_str(text, &lookahead, "https:")
- || match_str(text, &lookahead, "http:")
- || match_str(text, &lookahead, "ftp:")
- || match_str(text, &lookahead, "wss:")
- || match_str(text, &lookahead, "ws:")))
+ if (!(match_str(state, &lookahead, "https:")
+ || match_str(state, &lookahead, "http:")
+ || match_str(state, &lookahead, "ftp:")
+ || match_str(state, &lookahead, "wss:")
+ || match_str(state, &lookahead, "ws:")))
return -1;
- return match_uri(text, index);
+ return match_uri(state, index);
}
-int64_t match_id(Text_t text, int64_t index)
+static int64_t match_id(TextIter_t *state, int64_t index)
{
- TextIter_t state = {0, 0};
- if (!EAT1(text, &state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_XID_START)))
+ if (!EAT1(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_XID_START)))
return -1;
- return 1 + EAT_MANY(text, &state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_XID_CONTINUE));
+ return 1 + EAT_MANY(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_XID_CONTINUE));
}
-int64_t match_int(Text_t text, int64_t index)
+static int64_t match_int(TextIter_t *state, int64_t index)
{
- TextIter_t state = {0, 0};
- int64_t len = EAT_MANY(text, &state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT));
+ int64_t len = EAT_MANY(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT));
return len >= 0 ? len : -1;
}
-int64_t match_num(Text_t text, int64_t index)
+static int64_t match_num(TextIter_t *state, int64_t index)
{
- TextIter_t state = {0, 0};
- bool negative = EAT1(text, &state, index, grapheme == '-') ? 1 : 0;
- int64_t pre_decimal = EAT_MANY(text, &state, index,
+ bool negative = EAT1(state, index, grapheme == '-') ? 1 : 0;
+ int64_t pre_decimal = EAT_MANY(state, index,
uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT));
- bool decimal = (EAT1(text, &state, index, grapheme == '.') == 1);
- int64_t post_decimal = decimal ? EAT_MANY(text, &state, index,
+ bool decimal = (EAT1(state, index, grapheme == '.') == 1);
+ int64_t post_decimal = decimal ? EAT_MANY(state, index,
uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT)) : 0;
if (pre_decimal == 0 && post_decimal == 0)
return -1;
return negative + pre_decimal + decimal + post_decimal;
}
-int64_t match_newline(Text_t text, int64_t index)
+static int64_t match_newline(TextIter_t *state, int64_t index)
{
- if (index >= text.length)
+ if (index >= state->text.length)
return -1;
- TextIter_t state = {0, 0};
- ucs4_t grapheme = index >= text.length ? 0 : Text$get_main_grapheme_fast(text, &state, index);
+ ucs4_t grapheme = index >= state->text.length ? 0 : Text$get_main_grapheme_fast(state, index);
if (grapheme == '\n')
return 1;
- if (grapheme == '\r' && Text$get_grapheme_fast(text, &state, index + 1) == '\n')
+ if (grapheme == '\r' && Text$get_grapheme_fast(state, index + 1) == '\n')
return 2;
return -1;
}
-typedef struct {
- int64_t index, length;
- bool occupied, recursive;
-} capture_t;
-
-typedef struct {
- enum { PAT_START, PAT_END, PAT_ANY, PAT_GRAPHEME, PAT_PROPERTY, PAT_QUOTE, PAT_PAIR, PAT_FUNCTION } tag;
- bool negated, non_capturing;
- int64_t min, max;
- union {
- int32_t grapheme;
- uc_property_t property;
- int64_t (*fn)(Text_t, int64_t);
- int32_t quote_graphemes[2];
- int32_t pair_graphemes[2];
- };
-} pat_t;
-
-int64_t match_pat(Text_t text, TextIter_t *state, int64_t index, pat_t pat)
+static int64_t match_pat(TextIter_t *state, int64_t index, pat_t pat)
{
- int32_t grapheme = index >= text.length ? 0 : Text$get_grapheme_fast(text, state, index);
+ Text_t text = state->text;
+ int32_t grapheme = index >= text.length ? 0 : Text$get_grapheme_fast(state, index);
switch (pat.tag) {
case PAT_START: {
@@ -448,7 +458,7 @@ int64_t match_pat(Text_t text, TextIter_t *state, int64_t index, pat_t pat)
if (index + match_len >= text.length)
return pat.negated ? 1 : -1;
- int32_t c = Text$get_grapheme_fast(text, state, index + match_len);
+ int32_t c = Text$get_grapheme_fast(state, index + match_len);
if (c == open)
depth += 1;
else if (c == close)
@@ -467,7 +477,7 @@ int64_t match_pat(Text_t text, TextIter_t *state, int64_t index, pat_t pat)
int32_t close = pat.quote_graphemes[1];
for (int64_t i = index + 1; i < text.length; i++) {
- int32_t c = Text$get_grapheme_fast(text, state, i);
+ int32_t c = Text$get_grapheme_fast(state, i);
if (c == close) {
return pat.negated ? -1 : (i - index) + 1;
} else if (c == '\\' && index + 1 < text.length) {
@@ -477,7 +487,7 @@ int64_t match_pat(Text_t text, TextIter_t *state, int64_t index, pat_t pat)
return pat.negated ? 1 : -1;
}
case PAT_FUNCTION: {
- int64_t match_len = pat.fn(text, index);
+ int64_t match_len = pat.fn(state, index);
if (match_len >= 0)
return pat.negated ? -1 : match_len;
return pat.negated ? 1 : -1;
@@ -487,49 +497,48 @@ int64_t match_pat(Text_t text, TextIter_t *state, int64_t index, pat_t pat)
errx(1, "Unreachable");
}
-pat_t parse_next_pat(Text_t pattern, TextIter_t *state, int64_t *index)
+static pat_t parse_next_pat(TextIter_t *state, int64_t *index)
{
- if (EAT2(pattern, state, *index,
+ if (EAT2(state, *index,
uc_is_property((ucs4_t)grapheme, UC_PROPERTY_QUOTATION_MARK),
grapheme == '?')) {
// Quotations: "?", '?', etc
- int32_t open = Text$get_grapheme_fast(pattern, state, *index-2);
+ int32_t open = Text$get_grapheme_fast(state, *index-2);
int32_t close = open;
uc_mirror_char((ucs4_t)open, (ucs4_t*)&close);
- if (!match_grapheme(pattern, index, close))
- fail("Pattern's closing quote is missing: %k", &pattern);
+ if (!match_grapheme(state, index, close))
+ fail("Pattern's closing quote is missing: %k", &state->text);
return (pat_t){
.tag=PAT_QUOTE,
.min=1, .max=1,
.quote_graphemes={open, close},
};
- } else if (EAT2(pattern, state, *index,
+ } else if (EAT2(state, *index,
uc_is_property((ucs4_t)grapheme, UC_PROPERTY_PAIRED_PUNCTUATION),
grapheme == '?')) {
// Nested punctuation: (?), [?], etc
- int32_t open = Text$get_grapheme_fast(pattern, state, *index-2);
+ int32_t open = Text$get_grapheme_fast(state, *index-2);
int32_t close = open;
uc_mirror_char((ucs4_t)open, (ucs4_t*)&close);
- if (!match_grapheme(pattern, index, close))
- fail("Pattern's closing brace is missing: %k", &pattern);
+ if (!match_grapheme(state, index, close))
+ fail("Pattern's closing brace is missing: %k", &state->text);
return (pat_t){
.tag=PAT_PAIR,
.min=1, .max=1,
.pair_graphemes={open, close},
};
- } else if (EAT1(pattern, state, *index,
- grapheme == '{')) { // named patterns {id}, {2-3 hex}, etc.
- skip_whitespace(pattern, index);
+ } else if (EAT1(state, *index, grapheme == '{')) { // named patterns {id}, {2-3 hex}, etc.
+ skip_whitespace(state, index);
int64_t min, max;
- if (uc_is_digit((ucs4_t)Text$get_grapheme_fast(pattern, state, *index))) {
- min = parse_int(pattern, index);
- skip_whitespace(pattern, index);
- if (match_grapheme(pattern, index, '+')) {
+ if (uc_is_digit((ucs4_t)Text$get_grapheme_fast(state, *index))) {
+ min = parse_int(state, index);
+ skip_whitespace(state, index);
+ if (match_grapheme(state, index, '+')) {
max = INT64_MAX;
- } else if (match_grapheme(pattern, index, '-')) {
- max = parse_int(pattern, index);
+ } else if (match_grapheme(state, index, '-')) {
+ max = parse_int(state, index);
} else {
max = min;
}
@@ -538,34 +547,34 @@ pat_t parse_next_pat(Text_t pattern, TextIter_t *state, int64_t *index)
min = -1, max = -1;
}
- skip_whitespace(pattern, index);
+ skip_whitespace(state, index);
- bool negated = match_grapheme(pattern, index, '!');
+ bool negated = match_grapheme(state, index, '!');
#define PAT(_tag, ...) ((pat_t){.min=min, .max=max, .negated=negated, .tag=_tag, __VA_ARGS__})
const char *prop_name;
- if (match_str(pattern, index, ".."))
+ if (match_str(state, index, ".."))
prop_name = "..";
else
- prop_name = get_property_name(pattern, index);
+ prop_name = get_property_name(state, index);
if (!prop_name) {
// Literal character, e.g. {1?}
- skip_whitespace(pattern, index);
- int32_t grapheme = Text$get_grapheme_fast(pattern, state, (*index)++);
- if (!match_grapheme(pattern, index, '}'))
- fail("Missing closing '}' in pattern: %k", &pattern);
+ skip_whitespace(state, index);
+ int32_t grapheme = Text$get_grapheme_fast(state, (*index)++);
+ if (!match_grapheme(state, index, '}'))
+ fail("Missing closing '}' in pattern: %k", &state->text);
return PAT(PAT_GRAPHEME, .grapheme=grapheme);
} else if (strlen(prop_name) == 1) {
// Single letter names: {1+ A}
- skip_whitespace(pattern, index);
- if (!match_grapheme(pattern, index, '}'))
- fail("Missing closing '}' in pattern: %k", &pattern);
+ skip_whitespace(state, index);
+ if (!match_grapheme(state, index, '}'))
+ fail("Missing closing '}' in pattern: %k", &state->text);
return PAT(PAT_GRAPHEME, .grapheme=prop_name[0]);
}
- skip_whitespace(pattern, index);
- if (!match_grapheme(pattern, index, '}'))
- fail("Missing closing '}' in pattern: %k", &pattern);
+ skip_whitespace(state, index);
+ if (!match_grapheme(state, index, '}'))
+ fail("Missing closing '}' in pattern: %k", &state->text);
switch (tolower(prop_name[0])) {
case '.':
@@ -576,6 +585,11 @@ pat_t parse_next_pat(Text_t pattern, TextIter_t *state, int64_t *index)
return PAT(PAT_ANY);
}
break;
+ case 'a':
+ if (strcasecmp(prop_name, "authority") == 0) {
+ return PAT(PAT_FUNCTION, .fn=match_authority);
+ }
+ break;
case 'd':
if (strcasecmp(prop_name, "digit") == 0) {
return PAT(PAT_PROPERTY, .property=UC_PROPERTY_DECIMAL_DIGIT);
@@ -590,6 +604,11 @@ pat_t parse_next_pat(Text_t pattern, TextIter_t *state, int64_t *index)
return PAT(PAT_PROPERTY, .property=UC_PROPERTY_EMOJI);
}
break;
+ case 'h':
+ if (strcasecmp(prop_name, "host") == 0) {
+ return PAT(PAT_FUNCTION, .fn=match_host);
+ }
+ break;
case 'i':
if (strcasecmp(prop_name, "id") == 0) {
return PAT(PAT_FUNCTION, .fn=match_id);
@@ -636,18 +655,18 @@ pat_t parse_next_pat(Text_t pattern, TextIter_t *state, int64_t *index)
return PAT(PAT_GRAPHEME, .grapheme=(int32_t)grapheme);
#undef PAT
} else {
- return (pat_t){.tag=PAT_GRAPHEME, .non_capturing=true, .min=1, .max=1, .grapheme=Text$get_grapheme_fast(pattern, state, (*index)++)};
+ return (pat_t){.tag=PAT_GRAPHEME, .non_capturing=true, .min=1, .max=1, .grapheme=Text$get_grapheme_fast(state, (*index)++)};
}
}
-int64_t match(Text_t text, int64_t text_index, Pattern_t pattern, int64_t pattern_index, capture_t *captures, int64_t capture_index)
+static int64_t match(Text_t text, int64_t text_index, Pattern_t pattern, int64_t pattern_index, capture_t *captures, int64_t capture_index)
{
if (pattern_index >= pattern.length) // End of the pattern
return 0;
int64_t start_index = text_index;
- TextIter_t pattern_state = {0, 0}, text_state = {0, 0};
- pat_t pat = parse_next_pat(pattern, &pattern_state, &pattern_index);
+ TextIter_t pattern_state = {pattern, 0, 0}, text_state = {text, 0, 0};
+ pat_t pat = parse_next_pat(&pattern_state, &pattern_index);
if (pat.min == -1 && pat.max == -1) {
if (pat.tag == PAT_ANY && pattern_index >= pattern.length) {
@@ -677,7 +696,7 @@ int64_t match(Text_t text, int64_t text_index, Pattern_t pattern, int64_t patter
}
while (count < pat.max) {
- int64_t match_len = match_pat(text, &text_state, text_index, pat);
+ int64_t match_len = match_pat(&text_state, text_index, pat);
if (match_len < 0)
break;
capture_len += match_len;
@@ -747,12 +766,11 @@ static int64_t _find(Text_t text, Pattern_t pattern, int64_t first, int64_t last
&& !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK)
&& !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
- TextIter_t text_state = {0, 0};
-
+ TextIter_t text_state = {text, 0, 0};
for (int64_t i = first; i <= last; i++) {
// Optimization: quickly skip ahead to first char in pattern:
if (find_first) {
- while (i < text.length && Text$get_grapheme_fast(text, &text_state, i) != first_grapheme)
+ while (i < text.length && Text$get_grapheme_fast(&text_state, i) != first_grapheme)
++i;
}
@@ -833,12 +851,12 @@ static Text_t apply_backrefs(Text_t text, Pattern_t original_pattern, Text_t rep
&& !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
Text_t ret = Text("");
- TextIter_t state = {0, 0};
+ TextIter_t replacement_state = {replacement, 0, 0};
int64_t nonmatching_pos = 0;
for (int64_t pos = 0; pos < replacement.length; ) {
// Optimization: quickly skip ahead to first char in the backref pattern:
if (find_first) {
- while (pos < replacement.length && Text$get_grapheme_fast(replacement, &state, pos) != first_grapheme)
+ while (pos < replacement.length && Text$get_grapheme_fast(&replacement_state, pos) != first_grapheme)
++pos;
}
@@ -849,7 +867,7 @@ static Text_t apply_backrefs(Text_t text, Pattern_t original_pattern, Text_t rep
}
int64_t after_backref = pos + backref_len;
- int64_t backref = parse_int(replacement, &after_backref);
+ int64_t backref = parse_int(&replacement_state, &after_backref);
if (after_backref == pos + backref_len) { // Not actually a backref if there's no number
pos += 1;
continue;
@@ -857,7 +875,7 @@ static Text_t apply_backrefs(Text_t text, Pattern_t original_pattern, Text_t rep
if (backref < 0 || backref > 9) fail("Invalid backref index: %ld (only 0-%d are allowed)", backref, MAX_BACKREFS-1);
backref_len = (after_backref - pos);
- if (Text$get_grapheme_fast(replacement, &state, pos + backref_len) == ';')
+ if (Text$get_grapheme_fast(&replacement_state, pos + backref_len) == ';')
backref_len += 1; // skip optional semicolon
if (!captures[backref].occupied)
@@ -894,12 +912,12 @@ public Text_t Text$replace(Text_t text, Pattern_t pattern, Text_t replacement, P
&& !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK)
&& !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
- TextIter_t text_state = {0, 0};
+ TextIter_t text_state = {text, 0, 0};
int64_t nonmatching_pos = 0;
for (int64_t pos = 0; pos < text.length; ) {
// Optimization: quickly skip ahead to first char in pattern:
if (find_first) {
- while (pos < text.length && Text$get_grapheme_fast(text, &text_state, pos) != first_grapheme)
+ while (pos < text.length && Text$get_grapheme_fast(&text_state, pos) != first_grapheme)
++pos;
}
@@ -959,14 +977,14 @@ public Text_t Text$map(Text_t text, Pattern_t pattern, Closure_t fn)
&& !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK)
&& !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
- TextIter_t text_state = {0, 0};
+ TextIter_t text_state = {text, 0, 0};
int64_t nonmatching_pos = 0;
Text_t (*text_mapper)(Text_t, void*) = fn.fn;
for (int64_t pos = 0; pos < text.length; pos++) {
// Optimization: quickly skip ahead to first char in pattern:
if (find_first) {
- while (pos < text.length && Text$get_grapheme_fast(text, &text_state, pos) != first_grapheme)
+ while (pos < text.length && Text$get_grapheme_fast(&text_state, pos) != first_grapheme)
++pos;
}