about summary refs log tree commit diff
path: root/stdlib
diff options
context:
space:
mode:
author Bruce Hill <bruce@bruce-hill.com> 2024-09-14 00:12:52 -0400
committer Bruce Hill <bruce@bruce-hill.com> 2024-09-14 00:12:52 -0400
commit 6012a00763afdd467e71b1657bd9a39a4cba4493 (patch)
tree 6994a7074b4182211c3beec8553c02f638976b7f /stdlib
parent 2b0556084919ace0700e4480f7fa2886cf31b3e4 (diff)
Clean up pattern code to make better use of TextIter_t and fix up
URI/URL patterns
Diffstat (limited to 'stdlib')
-rw-r--r-- stdlib/patterns.c 418
-rw-r--r-- stdlib/text.c 64
-rw-r--r-- stdlib/text.h 9
3 files changed, 254 insertions, 237 deletions
diff --git a/stdlib/patterns.c b/stdlib/patterns.c
index 81beaffe..2de7fe3a 100644
--- a/stdlib/patterns.c
+++ b/stdlib/patterns.c
@@ -14,32 +14,48 @@
#define MAX_BACKREFS 100
-static inline void skip_whitespace(Text_t text, int64_t *i)
+typedef struct {
+ int64_t index, length;
+ bool occupied, recursive;
+} capture_t;
+
+typedef struct {
+ enum { PAT_START, PAT_END, PAT_ANY, PAT_GRAPHEME, PAT_PROPERTY, PAT_QUOTE, PAT_PAIR, PAT_FUNCTION } tag;
+ bool negated, non_capturing;
+ int64_t min, max;
+ union {
+ int32_t grapheme;
+ uc_property_t property;
+ int64_t (*fn)(TextIter_t *, int64_t);
+ int32_t quote_graphemes[2];
+ int32_t pair_graphemes[2];
+ };
+} pat_t;
+
+static inline void skip_whitespace(TextIter_t *state, int64_t *i)
{
- TextIter_t state = {0, 0};
- while (*i < text.length) {
- int32_t grapheme = Text$get_grapheme_fast(text, &state, *i);
+ while (*i < state->text.length) {
+ int32_t grapheme = Text$get_grapheme_fast(state, *i);
if (grapheme > 0 && !uc_is_property_white_space((ucs4_t)grapheme))
return;
*i += 1;
}
}
-static inline bool match_grapheme(Text_t text, int64_t *i, int32_t grapheme)
+static inline bool match_grapheme(TextIter_t *state, int64_t *i, int32_t grapheme)
{
- if (*i < text.length && Text$get_grapheme(text, *i) == grapheme) {
+ if (*i < state->text.length && Text$get_grapheme_fast(state, *i) == grapheme) {
*i += 1;
return true;
}
return false;
}
-static inline bool match_str(Text_t text, int64_t *i, const char *str)
+static inline bool match_str(TextIter_t *state, int64_t *i, const char *str)
{
- TextIter_t state = {0, 0};
int64_t matched = 0;
while (matched[str]) {
- if (*i + matched >= text.length || Text$get_grapheme_fast(text, &state, *i + matched) != str[matched])
+ if (*i + matched >= state->text.length || Text$get_grapheme_fast(state, *i + matched) != str[matched])
return false;
matched += 1;
}
@@ -47,11 +63,10 @@ static inline bool match_str(Text_t text, int64_t *i, const char *str)
return true;
}
-static inline bool match_property(Text_t text, int64_t *i, uc_property_t prop)
+static inline bool match_property(TextIter_t *state, int64_t *i, uc_property_t prop)
{
- if (*i >= text.length) return false;
- TextIter_t state = {};
- ucs4_t grapheme = Text$get_main_grapheme_fast(text, &state, *i);
+ if (*i >= state->text.length) return false;
+ ucs4_t grapheme = Text$get_main_grapheme_fast(state, *i);
// TODO: check every codepoint in the cluster?
if (uc_is_property(grapheme, prop)) {
*i += 1;
@@ -60,12 +75,11 @@ static inline bool match_property(Text_t text, int64_t *i, uc_property_t prop)
return false;
}
-static int64_t parse_int(Text_t text, int64_t *i)
+static int64_t parse_int(TextIter_t *state, int64_t *i)
{
- TextIter_t state = {0, 0};
int64_t value = 0;
for (;; *i += 1) {
- ucs4_t grapheme = Text$get_main_grapheme_fast(text, &state, *i);
+ ucs4_t grapheme = Text$get_main_grapheme_fast(state, *i);
int digit = uc_digit_value((ucs4_t)grapheme);
if (digit < 0) break;
if (value >= INT64_MAX/10) break;
@@ -74,14 +88,13 @@ static int64_t parse_int(Text_t text, int64_t *i)
return value;
}
-const char *get_property_name(Text_t text, int64_t *i)
+static const char *get_property_name(TextIter_t *state, int64_t *i)
{
- skip_whitespace(text, i);
+ skip_whitespace(state, i);
char *name = GC_MALLOC_ATOMIC(UNINAME_MAX);
char *dest = name;
- TextIter_t state = {0, 0};
- while (*i < text.length) {
- int32_t grapheme = Text$get_grapheme_fast(text, &state, *i);
+ while (*i < state->text.length) {
+ int32_t grapheme = Text$get_grapheme_fast(state, *i);
if (!(grapheme & ~0xFF) && (isalnum(grapheme) || grapheme == ' ' || grapheme == '_' || grapheme == '-')) {
*dest = (char)grapheme;
++dest;
@@ -101,17 +114,17 @@ const char *get_property_name(Text_t text, int64_t *i)
return name;
}
-#define EAT1(text, state, index, cond) ({\
- int32_t grapheme = Text$get_grapheme_fast(text, state, index); \
+#define EAT1(state, index, cond) ({\
+ int32_t grapheme = Text$get_grapheme_fast(state, index); \
bool success = (cond); \
if (success) index += 1; \
success; })
-#define EAT2(text, state, index, cond1, cond2) ({\
- int32_t grapheme = Text$get_grapheme_fast(text, state, index); \
+#define EAT2(state, index, cond1, cond2) ({\
+ int32_t grapheme = Text$get_grapheme_fast(state, index); \
bool success = (cond1); \
if (success) { \
- grapheme = Text$get_grapheme_fast(text, state, index + 1); \
+ grapheme = Text$get_grapheme_fast(state, index + 1); \
success = (cond2); \
if (success) \
index += 2; \
@@ -119,18 +132,17 @@ const char *get_property_name(Text_t text, int64_t *i)
success; })
-#define EAT_MANY(text, state, index, cond) ({ int64_t _n = 0; while (EAT1(text, state, index, cond)) { _n += 1; } _n; })
+#define EAT_MANY(state, index, cond) ({ int64_t _n = 0; while (EAT1(state, index, cond)) { _n += 1; } _n; })
-int64_t match_email(Text_t text, int64_t index)
+static int64_t match_email(TextIter_t *state, int64_t index)
{
// email = local "@" domain
// local = 1-64 ([a-zA-Z0-9!#$%&‘*+–/=?^_`.{|}~] | non-ascii)
// domain = dns-label ("." dns-label)*
// dns-label = 1-63 ([a-zA-Z0-9-] | non-ascii)
- TextIter_t state = {0, 0};
if (index > 0) {
- ucs4_t prev_codepoint = Text$get_main_grapheme_fast(text, &state, index - 1);
+ ucs4_t prev_codepoint = Text$get_main_grapheme_fast(state, index - 1);
if (uc_is_property_alphabetic((ucs4_t)prev_codepoint))
return -1;
}
@@ -140,20 +152,20 @@ int64_t match_email(Text_t text, int64_t index)
// Local part:
int64_t local_len = 0;
static const char *allowed_local = "!#$%&‘*+–/=?^_`.{|}~";
- while (EAT1(text, &state, index,
+ while (EAT1(state, index,
(grapheme & ~0x7F) || isalnum((char)grapheme) || strchr(allowed_local, (char)grapheme))) {
local_len += 1;
if (local_len > 64) return -1;
}
- if (!EAT1(text, &state, index, grapheme == '@'))
+ if (!EAT1(state, index, grapheme == '@'))
return -1;
// Host
int64_t host_len = 0;
do {
int64_t label_len = 0;
- while (EAT1(text, &state, index,
+ while (EAT1(state, index,
(grapheme & ~0x7F) || isalnum((char)grapheme) || grapheme == '-')) {
label_len += 1;
if (label_len > 63) return -1;
@@ -166,16 +178,15 @@ int64_t match_email(Text_t text, int64_t index)
if (host_len > 255)
return -1;
host_len += 1;
- } while (EAT1(text, &state, index, grapheme == '.'));
+ } while (EAT1(state, index, grapheme == '.'));
return index - start_index;
}
-int64_t match_ipv6(Text_t text, int64_t index)
+static int64_t match_ipv6(TextIter_t *state, int64_t index)
{
- TextIter_t state = {0, 0};
if (index > 0) {
- int32_t prev_codepoint = Text$get_grapheme_fast(text, &state, index - 1);
+ int32_t prev_codepoint = Text$get_grapheme_fast(state, index - 1);
if ((prev_codepoint & ~0x7F) && (isxdigit(prev_codepoint) || prev_codepoint == ':'))
return -1;
}
@@ -184,21 +195,21 @@ int64_t match_ipv6(Text_t text, int64_t index)
bool double_colon_used = false;
for (int cluster = 0; cluster < NUM_CLUSTERS; cluster++) {
for (int digits = 0; digits < 4; digits++) {
- if (!EAT1(text, &state, index, ~(grapheme & ~0x7F) && isxdigit((char)grapheme)))
+ if (!EAT1(state, index, ~(grapheme & ~0x7F) && isxdigit((char)grapheme)))
break;
}
- if (EAT1(text, &state, index, ~(grapheme & ~0x7F) && isxdigit((char)grapheme)))
+ if (EAT1(state, index, ~(grapheme & ~0x7F) && isxdigit((char)grapheme)))
return -1; // Too many digits
if (cluster == NUM_CLUSTERS-1) {
break;
- } else if (!EAT1(text, &state, index, grapheme == ':')) {
+ } else if (!EAT1(state, index, grapheme == ':')) {
if (double_colon_used)
break;
return -1;
}
- if (EAT1(text, &state, index, grapheme == ':')) {
+ if (EAT1(state, index, grapheme == ':')) {
if (double_colon_used)
return -1;
double_colon_used = true;
@@ -207,11 +218,10 @@ int64_t match_ipv6(Text_t text, int64_t index)
return index - start_index;
}
-static int64_t match_ipv4(Text_t text, int64_t index)
+static int64_t match_ipv4(TextIter_t *state, int64_t index)
{
- TextIter_t state = {0, 0};
if (index > 0) {
- int32_t prev_codepoint = Text$get_grapheme_fast(text, &state, index - 1);
+ int32_t prev_codepoint = Text$get_grapheme_fast(state, index - 1);
if ((prev_codepoint & ~0x7F) && (isdigit(prev_codepoint) || prev_codepoint == '.'))
return -1;
}
@@ -220,40 +230,86 @@ static int64_t match_ipv4(Text_t text, int64_t index)
const int NUM_CLUSTERS = 4;
for (int cluster = 0; cluster < NUM_CLUSTERS; cluster++) {
for (int digits = 0; digits < 3; digits++) {
- if (!EAT1(text, &state, index, ~(grapheme & ~0x7F) && isdigit((char)grapheme))) {
+ if (!EAT1(state, index, ~(grapheme & ~0x7F) && isdigit((char)grapheme))) {
if (digits == 0) return -1;
break;
}
}
- if (EAT1(text, &state, index, ~(grapheme & ~0x7F) && isdigit((char)grapheme)))
+ if (EAT1(state, index, ~(grapheme & ~0x7F) && isdigit((char)grapheme)))
return -1; // Too many digits
if (cluster == NUM_CLUSTERS-1)
break;
- else if (!EAT1(text, &state, index, grapheme == '.'))
+ else if (!EAT1(state, index, grapheme == '.'))
return -1;
}
return (index - start_index);
}
-int64_t match_ip(Text_t text, int64_t index)
+static int64_t match_ip(TextIter_t *state, int64_t index)
{
- int64_t len = match_ipv6(text, index);
+ int64_t len = match_ipv6(state, index);
if (len >= 0) return len;
- len = match_ipv4(text, index);
+ len = match_ipv4(state, index);
return (len >= 0) ? len : -1;
}
-int64_t match_uri(Text_t text, int64_t index)
+static int64_t match_host(TextIter_t *state, int64_t index)
+{
+ int64_t ip_len = match_ip(state, index);
+ if (ip_len > 0) return ip_len;
+
+ int64_t start_index = index;
+ if (match_grapheme(state, &index, '[')) {
+ ip_len = match_ip(state, index);
+ if (ip_len <= 0) return -1;
+ index += ip_len;
+ if (match_grapheme(state, &index, ']'))
+ return (index - start_index);
+ return -1;
+ }
+
+ if (!EAT1(state, index, isalpha(grapheme)))
+ return -1;
+
+ static const char *non_host_chars = "/#?:@ \t\r\n<>[]{}\\^|\"`";
+ EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_host_chars, (char)grapheme));
+ return (index - start_index);
+}
+
+static int64_t match_authority(TextIter_t *state, int64_t index)
+{
+ int64_t authority_start = index;
+ static const char *non_segment_chars = "/#?:@ \t\r\n<>[]{}\\^|\"`.";
+
+ // Optional user@ prefix:
+ int64_t username_len = EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_segment_chars, (char)grapheme));
+ if (username_len < 1 || !EAT1(state, index, grapheme == '@'))
+ index = authority_start; // No user@ part
+
+ // Host:
+ int64_t host_len = match_host(state, index);
+ if (host_len <= 0) return -1;
+ index += host_len;
+
+ // Port:
+ if (EAT1(state, index, grapheme == ':')) {
+ if (EAT_MANY(state, index, !(grapheme & ~0x7F) && isdigit(grapheme)) == 0)
+ return -1;
+ }
+ return (index - authority_start);
+}
+
+static int64_t match_uri(TextIter_t *state, int64_t index)
{
// URI = scheme ":" ["//" authority] path ["?" query] ["#" fragment]
// scheme = [a-zA-Z] [a-zA-Z0-9+.-]
// authority = [userinfo "@"] host [":" port]
- TextIter_t state = {0, 0};
if (index > 0) {
- ucs4_t prev_codepoint = Text$get_main_grapheme_fast(text, &state, index - 1);
+ // Don't match if we're not at a word edge:
+ ucs4_t prev_codepoint = Text$get_main_grapheme_fast(state, index - 1);
if (uc_is_property_alphabetic(prev_codepoint))
return -1;
}
@@ -261,147 +317,101 @@ int64_t match_uri(Text_t text, int64_t index)
int64_t start_index = index;
// Scheme:
- if (!EAT1(text, &state, index, isalpha(grapheme)))
+ if (!EAT1(state, index, isalpha(grapheme)))
return -1;
-
- EAT_MANY(text, &state, index,
- !(grapheme & ~0x7F) && (isalnum(grapheme) || grapheme == '+' || grapheme == '.' || grapheme == '-'));
-
- if (index == start_index)
- return -1;
-
- if (!match_grapheme(text, &index, ':'))
+ EAT_MANY(state, index, !(grapheme & ~0x7F) && (isalnum(grapheme) || grapheme == '+' || grapheme == '.' || grapheme == '-'));
+ if (!match_grapheme(state, &index, ':'))
return -1;
// Authority:
- if (match_str(text, &index, "//")) {
- int64_t authority_start = index;
- // Username or host:
- static const char *forbidden = "#?:@ \t\r\n<>[]{}\\^|\"`/";
- if (EAT_MANY(text, &state, index, (grapheme & ~0x7F) || !strchr(forbidden, (char)grapheme)) == 0)
- return -1;
+ int64_t authority_len;
+ if (match_str(state, &index, "//")) {
+ authority_len = match_authority(state, index);
+ if (authority_len > 0)
+ index += authority_len;
+ } else {
+ authority_len = 0;
+ }
- if (EAT1(text, &state, index, grapheme == '@')) {
- // Found a username, now get a host:
- if (EAT_MANY(text, &state, index, (grapheme & ~0x7F) || !strchr(forbidden, (char)grapheme)) == 0)
- return -1;
- } else {
- int64_t ip = authority_start;
- int64_t ipv4_len = match_ipv4(text, ip);
- if (ipv4_len > 0) {
- ip += ipv4_len;
- } else if (match_grapheme(text, &ip, '[')) {
- ip += match_ipv6(text, ip);
- if (ip > authority_start + 1 && match_grapheme(text, &ip, ']'))
- index = ip;
- }
+ // Path:
+ int64_t path_start = index;
+ if (EAT1(state, index, grapheme == '/') || authority_len <= 0) {
+ static const char *non_path = " \"#?<>[]{}\\^`|";
+ EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_path, (char)grapheme));
+
+ if (EAT1(state, index, grapheme == '?')) { // Query
+ static const char *non_query = " \"#<>[]{}\\^`|";
+ EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_query, (char)grapheme));
}
-
- // Port:
- if (EAT1(text, &state, index, grapheme == ':')) {
- if (EAT_MANY(text, &state, index, !(grapheme & ~0x7F) && isdigit(grapheme)) == 0)
- return -1;
+
+ if (EAT1(state, index, grapheme == '#')) { // Fragment
+ static const char *non_fragment = " \"#<>[]{}\\^`|";
+ EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_fragment, (char)grapheme));
}
- if (!EAT1(text, &state, index, grapheme == '/'))
- return (index - start_index); // No path
- } else {
- // Optional path root:
- EAT1(text, &state, index, grapheme == '/');
}
- // Path:
- static const char *non_path = " \"#?<>[]{}\\^`|";
- EAT_MANY(text, &state, index, (grapheme & ~0x7F) || !strchr(non_path, (char)grapheme));
+ if (authority_len <= 0 && index == path_start)
+ return -1;
- if (EAT1(text, &state, index, grapheme == '?')) { // Query
- static const char *non_query = " \"#<>[]{}\\^`|";
- EAT_MANY(text, &state, index, (grapheme & ~0x7F) || !strchr(non_query, (char)grapheme));
- }
-
- if (EAT1(text, &state, index, grapheme == '#')) { // Fragment
- static const char *non_fragment = " \"#<>[]{}\\^`|";
- EAT_MANY(text, &state, index, (grapheme & ~0x7F) || !strchr(non_fragment, (char)grapheme));
- }
return index - start_index;
}
-int64_t match_url(Text_t text, int64_t index)
+static int64_t match_url(TextIter_t *state, int64_t index)
{
int64_t lookahead = index;
- if (!(match_str(text, &lookahead, "https:")
- || match_str(text, &lookahead, "http:")
- || match_str(text, &lookahead, "ftp:")
- || match_str(text, &lookahead, "wss:")
- || match_str(text, &lookahead, "ws:")))
+ if (!(match_str(state, &lookahead, "https:")
+ || match_str(state, &lookahead, "http:")
+ || match_str(state, &lookahead, "ftp:")
+ || match_str(state, &lookahead, "wss:")
+ || match_str(state, &lookahead, "ws:")))
return -1;
- return match_uri(text, index);
+ return match_uri(state, index);
}
-int64_t match_id(Text_t text, int64_t index)
+static int64_t match_id(TextIter_t *state, int64_t index)
{
- TextIter_t state = {0, 0};
- if (!EAT1(text, &state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_XID_START)))
+ if (!EAT1(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_XID_START)))
return -1;
- return 1 + EAT_MANY(text, &state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_XID_CONTINUE));
+ return 1 + EAT_MANY(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_XID_CONTINUE));
}
-int64_t match_int(Text_t text, int64_t index)
+static int64_t match_int(TextIter_t *state, int64_t index)
{
- TextIter_t state = {0, 0};
- int64_t len = EAT_MANY(text, &state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT));
+ int64_t len = EAT_MANY(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT));
return len >= 0 ? len : -1;
}
-int64_t match_num(Text_t text, int64_t index)
+static int64_t match_num(TextIter_t *state, int64_t index)
{
- TextIter_t state = {0, 0};
- bool negative = EAT1(text, &state, index, grapheme == '-') ? 1 : 0;
- int64_t pre_decimal = EAT_MANY(text, &state, index,
+ bool negative = EAT1(state, index, grapheme == '-') ? 1 : 0;
+ int64_t pre_decimal = EAT_MANY(state, index,
uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT));
- bool decimal = (EAT1(text, &state, index, grapheme == '.') == 1);
- int64_t post_decimal = decimal ? EAT_MANY(text, &state, index,
+ bool decimal = (EAT1(state, index, grapheme == '.') == 1);
+ int64_t post_decimal = decimal ? EAT_MANY(state, index,
uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT)) : 0;
if (pre_decimal == 0 && post_decimal == 0)
return -1;
return negative + pre_decimal + decimal + post_decimal;
}
-int64_t match_newline(Text_t text, int64_t index)
+static int64_t match_newline(TextIter_t *state, int64_t index)
{
- if (index >= text.length)
+ if (index >= state->text.length)
return -1;
- TextIter_t state = {0, 0};
- ucs4_t grapheme = index >= text.length ? 0 : Text$get_main_grapheme_fast(text, &state, index);
+ ucs4_t grapheme = index >= state->text.length ? 0 : Text$get_main_grapheme_fast(state, index);
if (grapheme == '\n')
return 1;
- if (grapheme == '\r' && Text$get_grapheme_fast(text, &state, index + 1) == '\n')
+ if (grapheme == '\r' && Text$get_grapheme_fast(state, index + 1) == '\n')
return 2;
return -1;
}
-typedef struct {
- int64_t index, length;
- bool occupied, recursive;
-} capture_t;
-
-typedef struct {
- enum { PAT_START, PAT_END, PAT_ANY, PAT_GRAPHEME, PAT_PROPERTY, PAT_QUOTE, PAT_PAIR, PAT_FUNCTION } tag;
- bool negated, non_capturing;
- int64_t min, max;
- union {
- int32_t grapheme;
- uc_property_t property;
- int64_t (*fn)(Text_t, int64_t);
- int32_t quote_graphemes[2];
- int32_t pair_graphemes[2];
- };
-} pat_t;
-
-int64_t match_pat(Text_t text, TextIter_t *state, int64_t index, pat_t pat)
+static int64_t match_pat(TextIter_t *state, int64_t index, pat_t pat)
{
- int32_t grapheme = index >= text.length ? 0 : Text$get_grapheme_fast(text, state, index);
+ Text_t text = state->text;
+ int32_t grapheme = index >= text.length ? 0 : Text$get_grapheme_fast(state, index);
switch (pat.tag) {
case PAT_START: {
@@ -448,7 +458,7 @@ int64_t match_pat(Text_t text, TextIter_t *state, int64_t index, pat_t pat)
if (index + match_len >= text.length)
return pat.negated ? 1 : -1;
- int32_t c = Text$get_grapheme_fast(text, state, index + match_len);
+ int32_t c = Text$get_grapheme_fast(state, index + match_len);
if (c == open)
depth += 1;
else if (c == close)
@@ -467,7 +477,7 @@ int64_t match_pat(Text_t text, TextIter_t *state, int64_t index, pat_t pat)
int32_t close = pat.quote_graphemes[1];
for (int64_t i = index + 1; i < text.length; i++) {
- int32_t c = Text$get_grapheme_fast(text, state, i);
+ int32_t c = Text$get_grapheme_fast(state, i);
if (c == close) {
return pat.negated ? -1 : (i - index) + 1;
} else if (c == '\\' && index + 1 < text.length) {
@@ -477,7 +487,7 @@ int64_t match_pat(Text_t text, TextIter_t *state, int64_t index, pat_t pat)
return pat.negated ? 1 : -1;
}
case PAT_FUNCTION: {
- int64_t match_len = pat.fn(text, index);
+ int64_t match_len = pat.fn(state, index);
if (match_len >= 0)
return pat.negated ? -1 : match_len;
return pat.negated ? 1 : -1;
@@ -487,49 +497,48 @@ int64_t match_pat(Text_t text, TextIter_t *state, int64_t index, pat_t pat)
errx(1, "Unreachable");
}
-pat_t parse_next_pat(Text_t pattern, TextIter_t *state, int64_t *index)
+static pat_t parse_next_pat(TextIter_t *state, int64_t *index)
{
- if (EAT2(pattern, state, *index,
+ if (EAT2(state, *index,
uc_is_property((ucs4_t)grapheme, UC_PROPERTY_QUOTATION_MARK),
grapheme == '?')) {
// Quotations: "?", '?', etc
- int32_t open = Text$get_grapheme_fast(pattern, state, *index-2);
+ int32_t open = Text$get_grapheme_fast(state, *index-2);
int32_t close = open;
uc_mirror_char((ucs4_t)open, (ucs4_t*)&close);
- if (!match_grapheme(pattern, index, close))
- fail("Pattern's closing quote is missing: %k", &pattern);
+ if (!match_grapheme(state, index, close))
+ fail("Pattern's closing quote is missing: %k", &state->text);
return (pat_t){
.tag=PAT_QUOTE,
.min=1, .max=1,
.quote_graphemes={open, close},
};
- } else if (EAT2(pattern, state, *index,
+ } else if (EAT2(state, *index,
uc_is_property((ucs4_t)grapheme, UC_PROPERTY_PAIRED_PUNCTUATION),
grapheme == '?')) {
// Nested punctuation: (?), [?], etc
- int32_t open = Text$get_grapheme_fast(pattern, state, *index-2);
+ int32_t open = Text$get_grapheme_fast(state, *index-2);
int32_t close = open;
uc_mirror_char((ucs4_t)open, (ucs4_t*)&close);
- if (!match_grapheme(pattern, index, close))
- fail("Pattern's closing brace is missing: %k", &pattern);
+ if (!match_grapheme(state, index, close))
+ fail("Pattern's closing brace is missing: %k", &state->text);
return (pat_t){
.tag=PAT_PAIR,
.min=1, .max=1,
.pair_graphemes={open, close},
};
- } else if (EAT1(pattern, state, *index,
- grapheme == '{')) { // named patterns {id}, {2-3 hex}, etc.
- skip_whitespace(pattern, index);
+ } else if (EAT1(state, *index, grapheme == '{')) { // named patterns {id}, {2-3 hex}, etc.
+ skip_whitespace(state, index);
int64_t min, max;
- if (uc_is_digit((ucs4_t)Text$get_grapheme_fast(pattern, state, *index))) {
- min = parse_int(pattern, index);
- skip_whitespace(pattern, index);
- if (match_grapheme(pattern, index, '+')) {
+ if (uc_is_digit((ucs4_t)Text$get_grapheme_fast(state, *index))) {
+ min = parse_int(state, index);
+ skip_whitespace(state, index);
+ if (match_grapheme(state, index, '+')) {
max = INT64_MAX;
- } else if (match_grapheme(pattern, index, '-')) {
- max = parse_int(pattern, index);
+ } else if (match_grapheme(state, index, '-')) {
+ max = parse_int(state, index);
} else {
max = min;
}
@@ -538,34 +547,34 @@ pat_t parse_next_pat(Text_t pattern, TextIter_t *state, int64_t *index)
min = -1, max = -1;
}
- skip_whitespace(pattern, index);
+ skip_whitespace(state, index);
- bool negated = match_grapheme(pattern, index, '!');
+ bool negated = match_grapheme(state, index, '!');
#define PAT(_tag, ...) ((pat_t){.min=min, .max=max, .negated=negated, .tag=_tag, __VA_ARGS__})
const char *prop_name;
- if (match_str(pattern, index, ".."))
+ if (match_str(state, index, ".."))
prop_name = "..";
else
- prop_name = get_property_name(pattern, index);
+ prop_name = get_property_name(state, index);
if (!prop_name) {
// Literal character, e.g. {1?}
- skip_whitespace(pattern, index);
- int32_t grapheme = Text$get_grapheme_fast(pattern, state, (*index)++);
- if (!match_grapheme(pattern, index, '}'))
- fail("Missing closing '}' in pattern: %k", &pattern);
+ skip_whitespace(state, index);
+ int32_t grapheme = Text$get_grapheme_fast(state, (*index)++);
+ if (!match_grapheme(state, index, '}'))
+ fail("Missing closing '}' in pattern: %k", &state->text);
return PAT(PAT_GRAPHEME, .grapheme=grapheme);
} else if (strlen(prop_name) == 1) {
// Single letter names: {1+ A}
- skip_whitespace(pattern, index);
- if (!match_grapheme(pattern, index, '}'))
- fail("Missing closing '}' in pattern: %k", &pattern);
+ skip_whitespace(state, index);
+ if (!match_grapheme(state, index, '}'))
+ fail("Missing closing '}' in pattern: %k", &state->text);
return PAT(PAT_GRAPHEME, .grapheme=prop_name[0]);
}
- skip_whitespace(pattern, index);
- if (!match_grapheme(pattern, index, '}'))
- fail("Missing closing '}' in pattern: %k", &pattern);
+ skip_whitespace(state, index);
+ if (!match_grapheme(state, index, '}'))
+ fail("Missing closing '}' in pattern: %k", &state->text);
switch (tolower(prop_name[0])) {
case '.':
@@ -576,6 +585,11 @@ pat_t parse_next_pat(Text_t pattern, TextIter_t *state, int64_t *index)
return PAT(PAT_ANY);
}
break;
+ case 'a':
+ if (strcasecmp(prop_name, "authority") == 0) {
+ return PAT(PAT_FUNCTION, .fn=match_authority);
+ }
+ break;
case 'd':
if (strcasecmp(prop_name, "digit") == 0) {
return PAT(PAT_PROPERTY, .property=UC_PROPERTY_DECIMAL_DIGIT);
@@ -590,6 +604,11 @@ pat_t parse_next_pat(Text_t pattern, TextIter_t *state, int64_t *index)
return PAT(PAT_PROPERTY, .property=UC_PROPERTY_EMOJI);
}
break;
+ case 'h':
+ if (strcasecmp(prop_name, "host") == 0) {
+ return PAT(PAT_FUNCTION, .fn=match_host);
+ }
+ break;
case 'i':
if (strcasecmp(prop_name, "id") == 0) {
return PAT(PAT_FUNCTION, .fn=match_id);
@@ -636,18 +655,18 @@ pat_t parse_next_pat(Text_t pattern, TextIter_t *state, int64_t *index)
return PAT(PAT_GRAPHEME, .grapheme=(int32_t)grapheme);
#undef PAT
} else {
- return (pat_t){.tag=PAT_GRAPHEME, .non_capturing=true, .min=1, .max=1, .grapheme=Text$get_grapheme_fast(pattern, state, (*index)++)};
+ return (pat_t){.tag=PAT_GRAPHEME, .non_capturing=true, .min=1, .max=1, .grapheme=Text$get_grapheme_fast(state, (*index)++)};
}
}
-int64_t match(Text_t text, int64_t text_index, Pattern_t pattern, int64_t pattern_index, capture_t *captures, int64_t capture_index)
+static int64_t match(Text_t text, int64_t text_index, Pattern_t pattern, int64_t pattern_index, capture_t *captures, int64_t capture_index)
{
if (pattern_index >= pattern.length) // End of the pattern
return 0;
int64_t start_index = text_index;
- TextIter_t pattern_state = {0, 0}, text_state = {0, 0};
- pat_t pat = parse_next_pat(pattern, &pattern_state, &pattern_index);
+ TextIter_t pattern_state = {pattern, 0, 0}, text_state = {text, 0, 0};
+ pat_t pat = parse_next_pat(&pattern_state, &pattern_index);
if (pat.min == -1 && pat.max == -1) {
if (pat.tag == PAT_ANY && pattern_index >= pattern.length) {
@@ -677,7 +696,7 @@ int64_t match(Text_t text, int64_t text_index, Pattern_t pattern, int64_t patter
}
while (count < pat.max) {
- int64_t match_len = match_pat(text, &text_state, text_index, pat);
+ int64_t match_len = match_pat(&text_state, text_index, pat);
if (match_len < 0)
break;
capture_len += match_len;
@@ -747,12 +766,11 @@ static int64_t _find(Text_t text, Pattern_t pattern, int64_t first, int64_t last
&& !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK)
&& !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
- TextIter_t text_state = {0, 0};
-
+ TextIter_t text_state = {text, 0, 0};
for (int64_t i = first; i <= last; i++) {
// Optimization: quickly skip ahead to first char in pattern:
if (find_first) {
- while (i < text.length && Text$get_grapheme_fast(text, &text_state, i) != first_grapheme)
+ while (i < text.length && Text$get_grapheme_fast(&text_state, i) != first_grapheme)
++i;
}
@@ -833,12 +851,12 @@ static Text_t apply_backrefs(Text_t text, Pattern_t original_pattern, Text_t rep
&& !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
Text_t ret = Text("");
- TextIter_t state = {0, 0};
+ TextIter_t replacement_state = {replacement, 0, 0};
int64_t nonmatching_pos = 0;
for (int64_t pos = 0; pos < replacement.length; ) {
// Optimization: quickly skip ahead to first char in the backref pattern:
if (find_first) {
- while (pos < replacement.length && Text$get_grapheme_fast(replacement, &state, pos) != first_grapheme)
+ while (pos < replacement.length && Text$get_grapheme_fast(&replacement_state, pos) != first_grapheme)
++pos;
}
@@ -849,7 +867,7 @@ static Text_t apply_backrefs(Text_t text, Pattern_t original_pattern, Text_t rep
}
int64_t after_backref = pos + backref_len;
- int64_t backref = parse_int(replacement, &after_backref);
+ int64_t backref = parse_int(&replacement_state, &after_backref);
if (after_backref == pos + backref_len) { // Not actually a backref if there's no number
pos += 1;
continue;
@@ -857,7 +875,7 @@ static Text_t apply_backrefs(Text_t text, Pattern_t original_pattern, Text_t rep
if (backref < 0 || backref > 9) fail("Invalid backref index: %ld (only 0-%d are allowed)", backref, MAX_BACKREFS-1);
backref_len = (after_backref - pos);
- if (Text$get_grapheme_fast(replacement, &state, pos + backref_len) == ';')
+ if (Text$get_grapheme_fast(&replacement_state, pos + backref_len) == ';')
backref_len += 1; // skip optional semicolon
if (!captures[backref].occupied)
@@ -894,12 +912,12 @@ public Text_t Text$replace(Text_t text, Pattern_t pattern, Text_t replacement, P
&& !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK)
&& !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
- TextIter_t text_state = {0, 0};
+ TextIter_t text_state = {text, 0, 0};
int64_t nonmatching_pos = 0;
for (int64_t pos = 0; pos < text.length; ) {
// Optimization: quickly skip ahead to first char in pattern:
if (find_first) {
- while (pos < text.length && Text$get_grapheme_fast(text, &text_state, pos) != first_grapheme)
+ while (pos < text.length && Text$get_grapheme_fast(&text_state, pos) != first_grapheme)
++pos;
}
@@ -959,14 +977,14 @@ public Text_t Text$map(Text_t text, Pattern_t pattern, Closure_t fn)
&& !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK)
&& !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
- TextIter_t text_state = {0, 0};
+ TextIter_t text_state = {text, 0, 0};
int64_t nonmatching_pos = 0;
Text_t (*text_mapper)(Text_t, void*) = fn.fn;
for (int64_t pos = 0; pos < text.length; pos++) {
// Optimization: quickly skip ahead to first char in pattern:
if (find_first) {
- while (pos < text.length && Text$get_grapheme_fast(text, &text_state, pos) != first_grapheme)
+ while (pos < text.length && Text$get_grapheme_fast(&text_state, pos) != first_grapheme)
++pos;
}
diff --git a/stdlib/text.c b/stdlib/text.c
index 283dfb01..60b51962 100644
--- a/stdlib/text.c
+++ b/stdlib/text.c
@@ -817,17 +817,15 @@ PUREFUNC public uint64_t Text$hash(Text_t *text)
return text->hash;
}
-public int32_t Text$get_grapheme_fast(Text_t text, TextIter_t *state, int64_t index)
+public int32_t Text$get_grapheme_fast(TextIter_t *state, int64_t index)
{
+ Text_t text = state->text;
switch (text.tag) {
case TEXT_ASCII: return index < text.length ? (int32_t)text.ascii[index] : 0;
case TEXT_SHORT_ASCII: return index < text.length ? (int32_t)text.short_ascii[index] : 0;
case TEXT_GRAPHEMES: return index < text.length ? text.graphemes[index] : 0;
case TEXT_SHORT_GRAPHEMES: return index < text.length ? text.short_graphemes[index] : 0;
case TEXT_SUBTEXT: {
- TextIter_t backup_state = {0, 0};
- if (!state) state = &backup_state;
-
if (index < 0 || index >= text.length)
return 0;
@@ -837,7 +835,7 @@ public int32_t Text$get_grapheme_fast(Text_t text, TextIter_t *state, int64_t in
}
for (;;) {
if (index < state->sum_of_previous_subtexts + text.subtexts[state->subtext].length)
- return Text$get_grapheme_fast(text.subtexts[state->subtext], NULL, index - state->sum_of_previous_subtexts);
+ return Text$get_grapheme(text.subtexts[state->subtext], index - state->sum_of_previous_subtexts);
state->sum_of_previous_subtexts += text.subtexts[state->subtext].length;
state->subtext += 1;
}
@@ -848,9 +846,9 @@ public int32_t Text$get_grapheme_fast(Text_t text, TextIter_t *state, int64_t in
return 0;
}
-public ucs4_t Text$get_main_grapheme_fast(Text_t text, TextIter_t *state, int64_t index)
+public ucs4_t Text$get_main_grapheme_fast(TextIter_t *state, int64_t index)
{
- return MAIN_GRAPHEME_CODEPOINT(Text$get_grapheme_fast(text, state, index));
+ return MAIN_GRAPHEME_CODEPOINT(Text$get_grapheme_fast(state, index));
}
PUREFUNC public int32_t Text$compare(const Text_t *a, const Text_t *b)
@@ -858,10 +856,10 @@ PUREFUNC public int32_t Text$compare(const Text_t *a, const Text_t *b)
if (a == b) return 0;
int64_t len = MAX(a->length, b->length);
- TextIter_t a_state = {0, 0}, b_state = {0, 0};
+ TextIter_t a_state = {*a, 0, 0}, b_state = {*b, 0, 0};
for (int64_t i = 0; i < len; i++) {
- int32_t ai = Text$get_grapheme_fast(*a, &a_state, i);
- int32_t bi = Text$get_grapheme_fast(*b, &b_state, i);
+ int32_t ai = Text$get_grapheme_fast(&a_state, i);
+ int32_t bi = Text$get_grapheme_fast(&b_state, i);
if (ai == bi) continue;
int32_t cmp;
if (ai > 0 && bi > 0) {
@@ -892,10 +890,10 @@ PUREFUNC public bool Text$starts_with(Text_t text, Text_t prefix)
{
if (text.length < prefix.length)
return false;
- TextIter_t text_state = {0, 0}, prefix_state = {0, 0};
+ TextIter_t text_state = {text, 0, 0}, prefix_state = {prefix, 0, 0};
for (int64_t i = 0; i < prefix.length; i++) {
- int32_t text_i = Text$get_grapheme_fast(text, &text_state, i);
- int32_t prefix_i = Text$get_grapheme_fast(prefix, &prefix_state, i);
+ int32_t text_i = Text$get_grapheme_fast(&text_state, i);
+ int32_t prefix_i = Text$get_grapheme_fast(&prefix_state, i);
if (text_i != prefix_i) return false;
}
return true;
@@ -905,10 +903,10 @@ PUREFUNC public bool Text$ends_with(Text_t text, Text_t suffix)
{
if (text.length < suffix.length)
return false;
- TextIter_t text_state = {0, 0}, prefix_state = {0, 0};
+ TextIter_t text_state = {text, 0, 0}, suffix_state = {suffix, 0, 0};
for (int64_t i = 0; i < suffix.length; i++) {
- int32_t text_i = Text$get_grapheme_fast(text, &text_state, text.length - suffix.length + i);
- int32_t suffix_i = Text$get_grapheme_fast(suffix, &prefix_state, i);
+ int32_t text_i = Text$get_grapheme_fast(&text_state, text.length - suffix.length + i);
+ int32_t suffix_i = Text$get_grapheme_fast(&suffix_state, i);
if (text_i != suffix_i) return false;
}
return true;
@@ -919,10 +917,10 @@ PUREFUNC public bool Text$equal_values(Text_t a, Text_t b)
if (a.length != b.length || (a.hash != 0 && b.hash != 0 && a.hash != b.hash))
return false;
int64_t len = a.length;
- TextIter_t a_state = {0, 0}, b_state = {0, 0};
+ TextIter_t a_state = {a, 0, 0}, b_state = {b, 0, 0};
for (int64_t i = 0; i < len; i++) {
- int32_t ai = Text$get_grapheme_fast(a, &a_state, i);
- int32_t bi = Text$get_grapheme_fast(b, &b_state, i);
+ int32_t ai = Text$get_grapheme_fast(&a_state, i);
+ int32_t bi = Text$get_grapheme_fast(&b_state, i);
if (ai != bi) return false;
}
return true;
@@ -939,11 +937,11 @@ PUREFUNC public bool Text$equal_ignoring_case(Text_t a, Text_t b)
if (a.length != b.length)
return false;
int64_t len = a.length;
- TextIter_t a_state = {0, 0}, b_state = {0, 0};
+ TextIter_t a_state = {a, 0, 0}, b_state = {b, 0, 0};
const char *language = uc_locale_language();
for (int64_t i = 0; i < len; i++) {
- int32_t ai = Text$get_grapheme_fast(a, &a_state, i);
- int32_t bi = Text$get_grapheme_fast(b, &b_state, i);
+ int32_t ai = Text$get_grapheme_fast(&a_state, i);
+ int32_t bi = Text$get_grapheme_fast(&b_state, i);
if (ai != bi) {
const ucs4_t *a_codepoints = ai >= 0 ? (ucs4_t*)&ai : GRAPHEME_CODEPOINTS(ai);
int64_t a_len = ai >= 0 ? 1 : NUM_GRAPHEME_CODEPOINTS(ai);
@@ -1030,9 +1028,9 @@ static inline Text_t _quoted(Text_t text, bool colorize, char quote_char)
add_char(quote_char);
#define add_escaped(str) ({ if (colorize) add_str("\x1b[34;1m"); add_char('\\'); add_str(str); if (colorize) add_str("\x1b[0;35m"); })
- TextIter_t state = {0, 0};
+ TextIter_t state = {text, 0, 0};
for (int64_t i = 0; i < text.length; i++) {
- int32_t g = Text$get_grapheme_fast(text, &state, i);
+ int32_t g = Text$get_grapheme_fast(&state, i);
switch (g) {
case '\a': add_escaped("a"); break;
case '\b': add_escaped("b"); break;
@@ -1148,9 +1146,9 @@ public Array_t Text$clusters(Text_t text)
public Array_t Text$utf32_codepoints(Text_t text)
{
Array_t codepoints = {.atomic=1};
- TextIter_t state = {0, 0};
+ TextIter_t state = {text, 0, 0};
for (int64_t i = 0; i < text.length; i++) {
- int32_t grapheme = Text$get_grapheme_fast(text, &state, i);
+ int32_t grapheme = Text$get_grapheme_fast(&state, i);
if (grapheme < 0) {
for (int64_t c = 0; c < NUM_GRAPHEME_CODEPOINTS(grapheme); c++) {
ucs4_t subg = GRAPHEME_CODEPOINTS(grapheme)[c];
@@ -1183,9 +1181,9 @@ static inline const char *codepoint_name(ucs4_t c)
public Array_t Text$codepoint_names(Text_t text)
{
Array_t names = {};
- TextIter_t state = {0, 0};
+ TextIter_t state = {text, 0, 0};
for (int64_t i = 0; i < text.length; i++) {
- int32_t grapheme = Text$get_grapheme_fast(text, &state, i);
+ int32_t grapheme = Text$get_grapheme_fast(&state, i);
if (grapheme < 0) {
for (int64_t c = 0; c < NUM_GRAPHEME_CODEPOINTS(grapheme); c++) {
const char *name = codepoint_name(GRAPHEME_CODEPOINTS(grapheme)[c]);
@@ -1235,10 +1233,10 @@ public Text_t Text$from_bytes(Array_t bytes)
public Array_t Text$lines(Text_t text)
{
Array_t lines = {};
- TextIter_t state = {0, 0};
+ TextIter_t state = {text, 0, 0};
for (int64_t i = 0, line_start = 0; i < text.length; i++) {
- int32_t grapheme = Text$get_grapheme_fast(text, &state, i);
- if (grapheme == '\r' && Text$get_grapheme_fast(text, &state, i + 1) == '\n') { // CRLF
+ int32_t grapheme = Text$get_grapheme_fast(&state, i);
+ if (grapheme == '\r' && Text$get_grapheme_fast(&state, i + 1) == '\n') { // CRLF
Text_t line = Text$slice(text, I(line_start+1), I(i));
Array$insert(&lines, &line, I_small(0), sizeof(Text_t));
i += 1; // skip one extra for CR
@@ -1268,9 +1266,9 @@ public Pattern_t Pattern$escape_text(Text_t text)
Array_t graphemes = {.atomic=1};
#define add_char(c) Array$insert_value(&graphemes, (ucs4_t)c, I_small(0), sizeof(ucs4_t))
#define add_str(s) ({ for (const char *_c = s; *_c; ++_c) Array$insert_value(&graphemes, (ucs4_t)*_c, I_small(0), sizeof(ucs4_t)); })
- TextIter_t state = {0, 0};
+ TextIter_t state = {text, 0, 0};
for (int64_t i = 0; i < text.length; i++) {
- int32_t g = Text$get_grapheme_fast(text, &state, i);
+ int32_t g = Text$get_grapheme_fast(&state, i);
ucs4_t g0 = g < 0 ? GRAPHEME_CODEPOINTS(g)[0] : (ucs4_t)g;
if (g == '{') {
diff --git a/stdlib/text.h b/stdlib/text.h
index 841d51fe..bad0187b 100644
--- a/stdlib/text.h
+++ b/stdlib/text.h
@@ -12,6 +12,7 @@
#include "integers.h"
typedef struct {
+ Text_t text;
int64_t subtext, sum_of_previous_subtexts;
} TextIter_t;
@@ -53,13 +54,13 @@ Text_t Text$from_bytes(Array_t bytes);
Array_t Text$lines(Text_t text);
Text_t Text$join(Text_t glue, Array_t pieces);
Text_t Text$repeat(Text_t text, Int_t count);
-int32_t Text$get_grapheme_fast(Text_t text, TextIter_t *state, int64_t index);
-ucs4_t Text$get_main_grapheme_fast(Text_t text, TextIter_t *state, int64_t index);
+int32_t Text$get_grapheme_fast(TextIter_t *state, int64_t index);
+ucs4_t Text$get_main_grapheme_fast(TextIter_t *state, int64_t index);
static inline int32_t Text$get_grapheme(Text_t text, int64_t index)
{
- TextIter_t state = {0, 0};
- return Text$get_grapheme_fast(text, &state, index);
+ TextIter_t state = {text, 0, 0};
+ return Text$get_grapheme_fast(&state, index);
}
extern const TypeInfo Text$info;