tomo/examples/patterns/patterns.c

1299 lines
44 KiB
C
Raw Normal View History

2024-09-13 10:34:04 -07:00
// Logic for text pattern matching
#include <ctype.h>
#include <sys/param.h>
2025-04-01 11:05:10 -07:00
#include <tomo/tomo.h>
2024-09-13 10:34:04 -07:00
#include <unictype.h>
#include <uniname.h>
#include <unistring/version.h>
2024-09-13 10:34:04 -07:00
#define MAX_BACKREFS 100
2025-04-01 11:05:10 -07:00
typedef struct {
Text_t text;
Int_t index;
Array_t captures;
} PatternMatch;
typedef struct {
Text_t text;
Int_t index;
Array_t captures;
bool is_none:1;
} OptionalPatternMatch;
#define NONE_MATCH ((OptionalPatternMatch){.is_none=true})
typedef struct {
int64_t index, length;
bool occupied, recursive;
} capture_t;
typedef struct {
enum { PAT_START, PAT_END, PAT_ANY, PAT_GRAPHEME, PAT_PROPERTY, PAT_QUOTE, PAT_PAIR, PAT_FUNCTION } tag;
bool negated, non_capturing;
int64_t min, max;
union {
int32_t grapheme;
uc_property_t property;
int64_t (*fn)(TextIter_t *, int64_t);
int32_t quote_graphemes[2];
int32_t pair_graphemes[2];
};
} pat_t;
2025-04-01 11:05:10 -07:00
static Text_t replace_array(Text_t text, Array_t replacements, Text_t backref_pat, bool recursive);
static INLINE void skip_whitespace(TextIter_t *state, int64_t *i)
2024-09-13 10:34:04 -07:00
{
while (*i < state->stack[0].text.length) {
int32_t grapheme = Text$get_grapheme_fast(state, *i);
2024-09-13 10:34:04 -07:00
if (grapheme > 0 && !uc_is_property_white_space((ucs4_t)grapheme))
return;
*i += 1;
}
}
static INLINE bool match_grapheme(TextIter_t *state, int64_t *i, int32_t grapheme)
2024-09-13 10:34:04 -07:00
{
if (*i < state->stack[0].text.length && Text$get_grapheme_fast(state, *i) == grapheme) {
2024-09-13 10:34:04 -07:00
*i += 1;
return true;
}
return false;
}
static INLINE bool match_str(TextIter_t *state, int64_t *i, const char *str)
2024-09-13 10:34:04 -07:00
{
int64_t matched = 0;
while (matched[str]) {
if (*i + matched >= state->stack[0].text.length || Text$get_grapheme_fast(state, *i + matched) != str[matched])
2024-09-13 10:34:04 -07:00
return false;
matched += 1;
}
*i += matched;
return true;
}
static int64_t parse_int(TextIter_t *state, int64_t *i)
2024-09-13 10:34:04 -07:00
{
int64_t value = 0;
for (;; *i += 1) {
2024-10-25 11:05:07 -07:00
uint32_t grapheme = Text$get_main_grapheme_fast(state, *i);
int digit = uc_digit_value(grapheme);
2024-09-13 10:34:04 -07:00
if (digit < 0) break;
if (value >= INT64_MAX/10) break;
value = 10*value + digit;
}
return value;
}
static const char *get_property_name(TextIter_t *state, int64_t *i)
2024-09-13 10:34:04 -07:00
{
skip_whitespace(state, i);
2024-09-13 10:34:04 -07:00
char *name = GC_MALLOC_ATOMIC(UNINAME_MAX);
char *dest = name;
while (*i < state->stack[0].text.length) {
int32_t grapheme = Text$get_grapheme_fast(state, *i);
2024-09-13 10:34:04 -07:00
if (!(grapheme & ~0xFF) && (isalnum(grapheme) || grapheme == ' ' || grapheme == '_' || grapheme == '-')) {
*dest = (char)grapheme;
++dest;
if (dest >= name + UNINAME_MAX - 1)
break;
} else {
break;
}
*i += 1;
}
while (dest > name && dest[-1] == ' ')
*(dest--) = '\0';
if (dest == name) return NULL;
*dest = '\0';
return name;
}
#define EAT1(state, index, cond) ({\
int32_t grapheme = Text$get_grapheme_fast(state, index); \
2024-09-13 10:34:04 -07:00
bool success = (cond); \
if (success) index += 1; \
success; })
#define EAT2(state, index, cond1, cond2) ({\
int32_t grapheme = Text$get_grapheme_fast(state, index); \
2024-09-13 10:34:04 -07:00
bool success = (cond1); \
if (success) { \
grapheme = Text$get_grapheme_fast(state, index + 1); \
2024-09-13 10:34:04 -07:00
success = (cond2); \
if (success) \
index += 2; \
} \
success; })
#define EAT_MANY(state, index, cond) ({ int64_t _n = 0; while (EAT1(state, index, cond)) { _n += 1; } _n; })
2024-09-13 10:34:04 -07:00
static int64_t match_email(TextIter_t *state, int64_t index)
2024-09-13 10:34:04 -07:00
{
// email = local "@" domain
// local = 1-64 ([a-zA-Z0-9!#$%&*+/=?^_`.{|}~] | non-ascii)
// domain = dns-label ("." dns-label)*
// dns-label = 1-63 ([a-zA-Z0-9-] | non-ascii)
if (index > 0) {
2024-10-25 11:05:07 -07:00
uint32_t prev_codepoint = Text$get_main_grapheme_fast(state, index - 1);
if (uc_is_property_alphabetic(prev_codepoint))
2024-09-13 10:34:04 -07:00
return -1;
}
int64_t start_index = index;
// Local part:
int64_t local_len = 0;
static const char *allowed_local = "!#$%&*+/=?^_`.{|}~";
while (EAT1(state, index,
2024-09-13 10:34:04 -07:00
(grapheme & ~0x7F) || isalnum((char)grapheme) || strchr(allowed_local, (char)grapheme))) {
local_len += 1;
if (local_len > 64) return -1;
}
if (!EAT1(state, index, grapheme == '@'))
2024-09-13 10:34:04 -07:00
return -1;
// Host
int64_t host_len = 0;
do {
int64_t label_len = 0;
while (EAT1(state, index,
2024-09-13 10:34:04 -07:00
(grapheme & ~0x7F) || isalnum((char)grapheme) || grapheme == '-')) {
label_len += 1;
if (label_len > 63) return -1;
}
if (label_len == 0)
return -1;
host_len += label_len;
if (host_len > 255)
return -1;
host_len += 1;
} while (EAT1(state, index, grapheme == '.'));
2024-09-13 10:34:04 -07:00
return index - start_index;
}
static int64_t match_ipv6(TextIter_t *state, int64_t index)
2024-09-13 10:34:04 -07:00
{
if (index > 0) {
int32_t prev_codepoint = Text$get_grapheme_fast(state, index - 1);
2024-09-13 10:34:04 -07:00
if ((prev_codepoint & ~0x7F) && (isxdigit(prev_codepoint) || prev_codepoint == ':'))
return -1;
}
int64_t start_index = index;
const int NUM_CLUSTERS = 8;
bool double_colon_used = false;
for (int cluster = 0; cluster < NUM_CLUSTERS; cluster++) {
for (int digits = 0; digits < 4; digits++) {
if (!EAT1(state, index, ~(grapheme & ~0x7F) && isxdigit((char)grapheme)))
2024-09-13 10:34:04 -07:00
break;
}
if (EAT1(state, index, ~(grapheme & ~0x7F) && isxdigit((char)grapheme)))
2024-09-13 10:34:04 -07:00
return -1; // Too many digits
if (cluster == NUM_CLUSTERS-1) {
break;
} else if (!EAT1(state, index, grapheme == ':')) {
2024-09-13 10:34:04 -07:00
if (double_colon_used)
break;
return -1;
}
if (EAT1(state, index, grapheme == ':')) {
2024-09-13 10:34:04 -07:00
if (double_colon_used)
return -1;
double_colon_used = true;
}
}
return index - start_index;
}
static int64_t match_ipv4(TextIter_t *state, int64_t index)
2024-09-13 10:34:04 -07:00
{
if (index > 0) {
int32_t prev_codepoint = Text$get_grapheme_fast(state, index - 1);
2024-09-13 10:34:04 -07:00
if ((prev_codepoint & ~0x7F) && (isdigit(prev_codepoint) || prev_codepoint == '.'))
return -1;
}
int64_t start_index = index;
const int NUM_CLUSTERS = 4;
for (int cluster = 0; cluster < NUM_CLUSTERS; cluster++) {
for (int digits = 0; digits < 3; digits++) {
if (!EAT1(state, index, ~(grapheme & ~0x7F) && isdigit((char)grapheme))) {
2024-09-13 10:34:04 -07:00
if (digits == 0) return -1;
break;
}
}
if (EAT1(state, index, ~(grapheme & ~0x7F) && isdigit((char)grapheme)))
2024-09-13 10:34:04 -07:00
return -1; // Too many digits
if (cluster == NUM_CLUSTERS-1)
break;
else if (!EAT1(state, index, grapheme == '.'))
2024-09-13 10:34:04 -07:00
return -1;
}
return (index - start_index);
}
static int64_t match_ip(TextIter_t *state, int64_t index)
2024-09-13 10:34:04 -07:00
{
int64_t len = match_ipv6(state, index);
2024-09-13 10:34:04 -07:00
if (len >= 0) return len;
len = match_ipv4(state, index);
2024-09-13 10:34:04 -07:00
return (len >= 0) ? len : -1;
}
static int64_t match_host(TextIter_t *state, int64_t index)
{
int64_t ip_len = match_ip(state, index);
if (ip_len > 0) return ip_len;
int64_t start_index = index;
if (match_grapheme(state, &index, '[')) {
ip_len = match_ip(state, index);
if (ip_len <= 0) return -1;
index += ip_len;
if (match_grapheme(state, &index, ']'))
return (index - start_index);
return -1;
}
if (!EAT1(state, index, isalpha(grapheme)))
return -1;
static const char *non_host_chars = "/#?:@ \t\r\n<>[]{}\\^|\"`";
EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_host_chars, (char)grapheme));
return (index - start_index);
}
static int64_t match_authority(TextIter_t *state, int64_t index)
{
int64_t authority_start = index;
static const char *non_segment_chars = "/#?:@ \t\r\n<>[]{}\\^|\"`.";
// Optional user@ prefix:
int64_t username_len = EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_segment_chars, (char)grapheme));
if (username_len < 1 || !EAT1(state, index, grapheme == '@'))
index = authority_start; // No user@ part
// Host:
int64_t host_len = match_host(state, index);
if (host_len <= 0) return -1;
index += host_len;
// Port:
if (EAT1(state, index, grapheme == ':')) {
if (EAT_MANY(state, index, !(grapheme & ~0x7F) && isdigit(grapheme)) == 0)
return -1;
}
return (index - authority_start);
}
static int64_t match_uri(TextIter_t *state, int64_t index)
2024-09-13 10:34:04 -07:00
{
// URI = scheme ":" ["//" authority] path ["?" query] ["#" fragment]
// scheme = [a-zA-Z] [a-zA-Z0-9+.-]
// authority = [userinfo "@"] host [":" port]
if (index > 0) {
// Don't match if we're not at a word edge:
2024-10-25 11:05:07 -07:00
uint32_t prev_codepoint = Text$get_main_grapheme_fast(state, index - 1);
2024-09-13 10:34:04 -07:00
if (uc_is_property_alphabetic(prev_codepoint))
return -1;
}
int64_t start_index = index;
// Scheme:
if (!EAT1(state, index, isalpha(grapheme)))
2024-09-13 10:34:04 -07:00
return -1;
EAT_MANY(state, index, !(grapheme & ~0x7F) && (isalnum(grapheme) || grapheme == '+' || grapheme == '.' || grapheme == '-'));
if (!match_grapheme(state, &index, ':'))
2024-09-13 10:34:04 -07:00
return -1;
// Authority:
int64_t authority_len;
if (match_str(state, &index, "//")) {
authority_len = match_authority(state, index);
if (authority_len > 0)
index += authority_len;
} else {
authority_len = 0;
}
2024-09-13 10:34:04 -07:00
// Path:
int64_t path_start = index;
if (EAT1(state, index, grapheme == '/') || authority_len <= 0) {
static const char *non_path = " \"#?<>[]{}\\^`|";
EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_path, (char)grapheme));
if (EAT1(state, index, grapheme == '?')) { // Query
static const char *non_query = " \"#<>[]{}\\^`|";
EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_query, (char)grapheme));
2024-09-13 10:34:04 -07:00
}
if (EAT1(state, index, grapheme == '#')) { // Fragment
static const char *non_fragment = " \"#<>[]{}\\^`|";
EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_fragment, (char)grapheme));
2024-09-13 10:34:04 -07:00
}
}
if (authority_len <= 0 && index == path_start)
return -1;
2024-09-13 10:34:04 -07:00
return index - start_index;
}
static int64_t match_url(TextIter_t *state, int64_t index)
2024-09-13 10:34:04 -07:00
{
int64_t lookahead = index;
if (!(match_str(state, &lookahead, "https:")
|| match_str(state, &lookahead, "http:")
|| match_str(state, &lookahead, "ftp:")
|| match_str(state, &lookahead, "wss:")
|| match_str(state, &lookahead, "ws:")))
2024-09-13 10:34:04 -07:00
return -1;
return match_uri(state, index);
2024-09-13 10:34:04 -07:00
}
static int64_t match_id(TextIter_t *state, int64_t index)
2024-09-13 10:34:04 -07:00
{
if (!EAT1(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_XID_START)))
2024-09-13 10:34:04 -07:00
return -1;
return 1 + EAT_MANY(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_XID_CONTINUE));
2024-09-13 10:34:04 -07:00
}
static int64_t match_int(TextIter_t *state, int64_t index)
2024-09-13 10:34:04 -07:00
{
2024-12-15 13:53:07 -08:00
int64_t negative = EAT1(state, index, grapheme == '-') ? 1 : 0;
int64_t len = EAT_MANY(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT));
2024-12-15 13:53:07 -08:00
return len > 0 ? negative + len : -1;
2024-09-13 10:34:04 -07:00
}
2024-09-15 15:41:37 -07:00
static int64_t match_alphanumeric(TextIter_t *state, int64_t index)
{
return EAT1(state, index, uc_is_property_alphabetic((ucs4_t)grapheme) || uc_is_property_numeric((ucs4_t)grapheme))
? 1 : -1;
}
static int64_t match_num(TextIter_t *state, int64_t index)
2024-09-13 10:34:04 -07:00
{
bool negative = EAT1(state, index, grapheme == '-') ? 1 : 0;
int64_t pre_decimal = EAT_MANY(state, index,
2024-09-13 10:34:04 -07:00
uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT));
bool decimal = (EAT1(state, index, grapheme == '.') == 1);
int64_t post_decimal = decimal ? EAT_MANY(state, index,
2024-09-13 10:34:04 -07:00
uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT)) : 0;
if (pre_decimal == 0 && post_decimal == 0)
return -1;
return negative + pre_decimal + decimal + post_decimal;
}
static int64_t match_newline(TextIter_t *state, int64_t index)
2024-09-13 10:34:04 -07:00
{
if (index >= state->stack[0].text.length)
2024-09-13 10:34:04 -07:00
return -1;
uint32_t grapheme = index >= state->stack[0].text.length ? 0 : Text$get_main_grapheme_fast(state, index);
2024-09-13 10:34:04 -07:00
if (grapheme == '\n')
return 1;
if (grapheme == '\r' && Text$get_grapheme_fast(state, index + 1) == '\n')
2024-09-13 10:34:04 -07:00
return 2;
return -1;
}
static int64_t match_pat(TextIter_t *state, int64_t index, pat_t pat)
2024-09-13 10:34:04 -07:00
{
Text_t text = state->stack[0].text;
int32_t grapheme = index >= text.length ? 0 : Text$get_grapheme_fast(state, index);
2024-09-13 10:34:04 -07:00
switch (pat.tag) {
case PAT_START: {
if (index == 0)
return pat.negated ? -1 : 0;
return pat.negated ? 0 : -1;
}
case PAT_END: {
if (index >= text.length)
return pat.negated ? -1 : 0;
return pat.negated ? 0 : -1;
}
case PAT_ANY: {
assert(!pat.negated);
return (index < text.length) ? 1 : -1;
}
case PAT_GRAPHEME: {
if (index >= text.length)
return -1;
else if (grapheme == pat.grapheme)
return pat.negated ? -1 : 1;
return pat.negated ? 1 : -1;
}
case PAT_PROPERTY: {
if (index >= text.length)
return -1;
else if (uc_is_property((ucs4_t)grapheme, pat.property))
return pat.negated ? -1 : 1;
return pat.negated ? 1 : -1;
}
case PAT_PAIR: {
// Nested punctuation: (?), [?], etc
if (index >= text.length)
return -1;
int32_t open = pat.pair_graphemes[0];
if (grapheme != open)
return pat.negated ? 1 : -1;
int32_t close = pat.pair_graphemes[1];
int64_t depth = 1;
int64_t match_len = 1;
for (; depth > 0; match_len++) {
if (index + match_len >= text.length)
return pat.negated ? 1 : -1;
int32_t c = Text$get_grapheme_fast(state, index + match_len);
2024-09-13 10:34:04 -07:00
if (c == open)
depth += 1;
else if (c == close)
depth -= 1;
}
return pat.negated ? -1 : match_len;
}
case PAT_QUOTE: {
// Nested quotes: "?", '?', etc
if (index >= text.length)
return -1;
int32_t open = pat.quote_graphemes[0];
if (grapheme != open)
return pat.negated ? 1 : -1;
int32_t close = pat.quote_graphemes[1];
for (int64_t i = index + 1; i < text.length; i++) {
int32_t c = Text$get_grapheme_fast(state, i);
2024-09-13 10:34:04 -07:00
if (c == close) {
return pat.negated ? -1 : (i - index) + 1;
} else if (c == '\\' && index + 1 < text.length) {
i += 1; // Skip ahead an extra step
}
}
return pat.negated ? 1 : -1;
}
case PAT_FUNCTION: {
int64_t match_len = pat.fn(state, index);
2024-09-13 10:34:04 -07:00
if (match_len >= 0)
return pat.negated ? -1 : match_len;
return pat.negated ? 1 : -1;
}
default: errx(1, "Invalid pattern");
}
errx(1, "Unreachable");
}
static pat_t parse_next_pat(TextIter_t *state, int64_t *index)
2024-09-13 10:34:04 -07:00
{
if (EAT2(state, *index,
2024-09-13 10:34:04 -07:00
uc_is_property((ucs4_t)grapheme, UC_PROPERTY_QUOTATION_MARK),
grapheme == '?')) {
// Quotations: "?", '?', etc
int32_t open = Text$get_grapheme_fast(state, *index-2);
2024-09-13 10:34:04 -07:00
int32_t close = open;
uc_mirror_char((ucs4_t)open, (ucs4_t*)&close);
if (!match_grapheme(state, index, close))
fail("Pattern's closing quote is missing: ", state->stack[0].text);
2024-09-13 10:34:04 -07:00
return (pat_t){
.tag=PAT_QUOTE,
.min=1, .max=1,
.quote_graphemes={open, close},
};
} else if (EAT2(state, *index,
2024-09-13 10:34:04 -07:00
uc_is_property((ucs4_t)grapheme, UC_PROPERTY_PAIRED_PUNCTUATION),
grapheme == '?')) {
// Nested punctuation: (?), [?], etc
int32_t open = Text$get_grapheme_fast(state, *index-2);
2024-09-13 10:34:04 -07:00
int32_t close = open;
uc_mirror_char((ucs4_t)open, (ucs4_t*)&close);
if (!match_grapheme(state, index, close))
fail("Pattern's closing brace is missing: ", state->stack[0].text);
2024-09-13 10:34:04 -07:00
return (pat_t){
.tag=PAT_PAIR,
.min=1, .max=1,
.pair_graphemes={open, close},
};
} else if (EAT1(state, *index, grapheme == '{')) { // named patterns {id}, {2-3 hex}, etc.
skip_whitespace(state, index);
2024-09-13 10:34:04 -07:00
int64_t min, max;
if (uc_is_digit((ucs4_t)Text$get_grapheme_fast(state, *index))) {
min = parse_int(state, index);
skip_whitespace(state, index);
if (match_grapheme(state, index, '+')) {
2024-09-13 10:34:04 -07:00
max = INT64_MAX;
} else if (match_grapheme(state, index, '-')) {
max = parse_int(state, index);
2024-09-13 10:34:04 -07:00
} else {
max = min;
}
if (min > max) fail("Minimum repetitions (", min, ") is less than the maximum (", max, ")");
2024-09-13 10:34:04 -07:00
} else {
min = -1, max = -1;
}
skip_whitespace(state, index);
2024-09-13 10:34:04 -07:00
bool negated = match_grapheme(state, index, '!');
2024-09-13 10:34:04 -07:00
#define PAT(_tag, ...) ((pat_t){.min=min, .max=max, .negated=negated, .tag=_tag, __VA_ARGS__})
const char *prop_name;
if (match_str(state, index, ".."))
2024-09-13 10:34:04 -07:00
prop_name = "..";
else
prop_name = get_property_name(state, index);
2024-09-13 10:34:04 -07:00
if (!prop_name) {
// Literal character, e.g. {1?}
skip_whitespace(state, index);
int32_t grapheme = Text$get_grapheme_fast(state, (*index)++);
if (!match_grapheme(state, index, '}'))
fail("Missing closing '}' in pattern: ", state->stack[0].text);
2024-09-13 10:34:04 -07:00
return PAT(PAT_GRAPHEME, .grapheme=grapheme);
} else if (strlen(prop_name) == 1) {
// Single letter names: {1+ A}
skip_whitespace(state, index);
if (!match_grapheme(state, index, '}'))
fail("Missing closing '}' in pattern: ", state->stack[0].text);
2024-09-13 10:34:04 -07:00
return PAT(PAT_GRAPHEME, .grapheme=prop_name[0]);
}
skip_whitespace(state, index);
if (!match_grapheme(state, index, '}'))
fail("Missing closing '}' in pattern: ", state->stack[0].text);
2024-09-13 10:34:04 -07:00
switch (tolower(prop_name[0])) {
case '.':
if (prop_name[1] == '.') {
if (negated)
return ((pat_t){.tag=PAT_END, .min=min, .max=max, .non_capturing=true});
else
return PAT(PAT_ANY);
}
break;
case 'a':
if (strcasecmp(prop_name, "authority") == 0) {
return PAT(PAT_FUNCTION, .fn=match_authority);
2024-09-15 15:41:37 -07:00
} else if (strcasecmp(prop_name, "alphanum") == 0 || strcasecmp(prop_name, "anum") == 0
|| strcasecmp(prop_name, "alphanumeric") == 0) {
return PAT(PAT_FUNCTION, .fn=match_alphanumeric);
}
break;
2025-03-12 21:14:58 -07:00
case 'c':
if (strcasecmp(prop_name, "crlf") == 0)
return PAT(PAT_FUNCTION, .fn=match_newline);
break;
2024-09-13 10:34:04 -07:00
case 'd':
if (strcasecmp(prop_name, "digit") == 0) {
return PAT(PAT_PROPERTY, .property=UC_PROPERTY_DECIMAL_DIGIT);
}
break;
case 'e':
if (strcasecmp(prop_name, "end") == 0) {
return PAT(PAT_END, .non_capturing=!negated);
} else if (strcasecmp(prop_name, "email") == 0) {
return PAT(PAT_FUNCTION, .fn=match_email);
}
#if _LIBUNISTRING_VERSION >= 0x0100000
else if (strcasecmp(prop_name, "emoji") == 0) {
2024-09-13 10:34:04 -07:00
return PAT(PAT_PROPERTY, .property=UC_PROPERTY_EMOJI);
}
#endif
2024-09-13 10:34:04 -07:00
break;
case 'h':
if (strcasecmp(prop_name, "host") == 0) {
return PAT(PAT_FUNCTION, .fn=match_host);
}
break;
2024-09-13 10:34:04 -07:00
case 'i':
if (strcasecmp(prop_name, "id") == 0) {
return PAT(PAT_FUNCTION, .fn=match_id);
} else if (strcasecmp(prop_name, "int") == 0) {
return PAT(PAT_FUNCTION, .fn=match_int);
} else if (strcasecmp(prop_name, "ipv4") == 0) {
return PAT(PAT_FUNCTION, .fn=match_ipv4);
} else if (strcasecmp(prop_name, "ipv6") == 0) {
return PAT(PAT_FUNCTION, .fn=match_ipv6);
} else if (strcasecmp(prop_name, "ip") == 0) {
return PAT(PAT_FUNCTION, .fn=match_ip);
}
break;
case 'n':
2025-03-12 21:14:58 -07:00
if (strcasecmp(prop_name, "nl") == 0 || strcasecmp(prop_name, "newline") == 0) {
2024-09-13 10:34:04 -07:00
return PAT(PAT_FUNCTION, .fn=match_newline);
} else if (strcasecmp(prop_name, "num") == 0) {
return PAT(PAT_FUNCTION, .fn=match_num);
}
break;
case 's':
if (strcasecmp(prop_name, "start") == 0) {
return PAT(PAT_START, .non_capturing=!negated);
}
break;
case 'u':
if (strcasecmp(prop_name, "uri") == 0) {
return PAT(PAT_FUNCTION, .fn=match_uri);
} else if (strcasecmp(prop_name, "url") == 0) {
return PAT(PAT_FUNCTION, .fn=match_url);
}
break;
2025-02-04 10:44:38 -08:00
case 'w':
if (strcasecmp(prop_name, "word") == 0) {
return PAT(PAT_FUNCTION, .fn=match_id);
}
break;
2024-09-13 10:34:04 -07:00
default: break;
}
uc_property_t prop = uc_property_byname(prop_name);
if (uc_property_is_valid(prop))
return PAT(PAT_PROPERTY, .property=prop);
ucs4_t grapheme = unicode_name_character(prop_name);
if (grapheme == UNINAME_INVALID)
fail("Not a valid property or character name: ", prop_name);
2024-09-13 10:34:04 -07:00
return PAT(PAT_GRAPHEME, .grapheme=(int32_t)grapheme);
#undef PAT
} else {
return (pat_t){.tag=PAT_GRAPHEME, .non_capturing=true, .min=1, .max=1, .grapheme=Text$get_grapheme_fast(state, (*index)++)};
2024-09-13 10:34:04 -07:00
}
}
2025-04-01 11:05:10 -07:00
static int64_t match(Text_t text, int64_t text_index, Text_t pattern, int64_t pattern_index, capture_t *captures, int64_t capture_index)
2024-09-13 10:34:04 -07:00
{
if (pattern_index >= pattern.length) // End of the pattern
return 0;
int64_t start_index = text_index;
TextIter_t pattern_state = NEW_TEXT_ITER_STATE(pattern), text_state = NEW_TEXT_ITER_STATE(text);
pat_t pat = parse_next_pat(&pattern_state, &pattern_index);
2024-09-13 10:34:04 -07:00
if (pat.min == -1 && pat.max == -1) {
if (pat.tag == PAT_ANY && pattern_index >= pattern.length) {
pat.min = pat.max = MAX(1, text.length - text_index);
} else {
pat.min = 1;
pat.max = INT64_MAX;
}
}
int64_t capture_start = text_index;
int64_t count = 0, capture_len = 0, next_match_len = 0;
if (pat.tag == PAT_ANY && pattern_index >= pattern.length) {
int64_t remaining = text.length - text_index;
capture_len = remaining >= pat.min ? MIN(remaining, pat.max) : -1;
text_index += capture_len;
goto success;
}
if (pat.min == 0 && pattern_index < pattern.length) {
next_match_len = match(text, text_index, pattern, pattern_index, captures, capture_index + (pat.non_capturing ? 0 : 1));
if (next_match_len >= 0) {
capture_len = 0;
goto success;
}
}
while (count < pat.max) {
int64_t match_len = match_pat(&text_state, text_index, pat);
2024-09-13 10:34:04 -07:00
if (match_len < 0)
break;
capture_len += match_len;
text_index += match_len;
count += 1;
if (pattern_index < pattern.length) { // More stuff after this
if (count < pat.min)
next_match_len = -1;
else
next_match_len = match(text, text_index, pattern, pattern_index, captures, capture_index + (pat.non_capturing ? 0 : 1));
} else {
next_match_len = 0;
}
if (match_len == 0) {
if (next_match_len >= 0) {
// If we're good to go, no need to keep re-matching zero-length
// matches till we hit max:
count = pat.max;
break;
} else {
return -1;
}
}
if (pattern_index < pattern.length && next_match_len >= 0)
break; // Next guy exists and wants to stop here
if (text_index >= text.length)
break;
}
if (count < pat.min || next_match_len < 0)
return -1;
success:
if (captures && capture_index < MAX_BACKREFS && !pat.non_capturing) {
if (pat.tag == PAT_PAIR || pat.tag == PAT_QUOTE) {
assert(capture_len > 0);
captures[capture_index] = (capture_t){
.index=capture_start + 1, // Skip leading quote/paren
.length=capture_len - 2, // Skip open/close
.occupied=true,
.recursive=(pat.tag == PAT_PAIR),
};
} else {
captures[capture_index] = (capture_t){
.index=capture_start,
.length=capture_len,
.occupied=true,
.recursive=false,
};
}
}
return (text_index - start_index) + next_match_len;
}
#undef EAT1
#undef EAT2
#undef EAT_MANY
2025-04-01 11:05:10 -07:00
static int64_t _find(Text_t text, Text_t pattern, int64_t first, int64_t last, int64_t *match_length, capture_t *captures)
2024-09-13 10:34:04 -07:00
{
int32_t first_grapheme = Text$get_grapheme(pattern, 0);
bool find_first = (first_grapheme != '{'
&& !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK)
&& !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
TextIter_t text_state = NEW_TEXT_ITER_STATE(text);
2024-09-13 10:34:04 -07:00
for (int64_t i = first; i <= last; i++) {
// Optimization: quickly skip ahead to first char in pattern:
if (find_first) {
while (i < text.length && Text$get_grapheme_fast(&text_state, i) != first_grapheme)
2024-09-13 10:34:04 -07:00
++i;
}
int64_t m = match(text, i, pattern, 0, captures, 0);
2024-09-13 10:34:04 -07:00
if (m >= 0) {
if (match_length)
*match_length = m;
return i;
}
}
if (match_length)
*match_length = -1;
return -1;
}
2025-04-01 11:05:10 -07:00
static OptionalPatternMatch find(Text_t text, Text_t pattern, Int_t from_index)
2024-09-13 10:34:04 -07:00
{
int64_t first = Int64$from_int(from_index, false);
2024-09-13 10:34:04 -07:00
if (first == 0) fail("Invalid index: 0");
if (first < 0) first = text.length + first + 1;
if (first > text.length || first < 1)
2024-11-24 13:18:21 -08:00
return NONE_MATCH;
capture_t captures[MAX_BACKREFS] = {};
int64_t len = 0;
int64_t found = _find(text, pattern, first-1, text.length-1, &len, captures);
if (found == -1)
2024-11-24 13:18:21 -08:00
return NONE_MATCH;
Array_t capture_array = {};
for (int i = 0; captures[i].occupied; i++) {
Text_t capture = Text$slice(text, I(captures[i].index+1), I(captures[i].index+captures[i].length));
Array$insert(&capture_array, &capture, I(0), sizeof(Text_t));
}
2025-04-01 11:05:10 -07:00
return (OptionalPatternMatch){
.text=Text$slice(text, I(found+1), I(found+len)),
.index=I(found+1),
.captures=capture_array,
};
2024-09-13 10:34:04 -07:00
}
2025-04-01 11:05:10 -07:00
PUREFUNC static bool Pattern$has(Text_t text, Text_t pattern)
2024-09-13 10:34:04 -07:00
{
if (Text$starts_with(pattern, Text("{start}"))) {
int64_t m = match(text, 0, pattern, 0, NULL, 0);
return m >= 0;
} else if (Text$ends_with(text, Text("{end}"))) {
for (int64_t i = text.length-1; i >= 0; i--) {
int64_t match_len = match(text, i, pattern, 0, NULL, 0);
if (match_len >= 0 && i + match_len == text.length)
return true;
}
return false;
} else {
int64_t found = _find(text, pattern, 0, text.length-1, NULL, NULL);
2024-09-13 10:34:04 -07:00
return (found >= 0);
}
}
2025-04-01 17:08:36 -07:00
static bool Pattern$matches(Text_t text, Text_t pattern)
{
capture_t captures[MAX_BACKREFS] = {};
int64_t match_len = match(text, 0, pattern, 0, NULL, 0);
return (match_len == text.length);
}
static OptionalArray_t Pattern$captures(Text_t text, Text_t pattern)
2024-09-13 10:34:04 -07:00
{
capture_t captures[MAX_BACKREFS] = {};
int64_t match_len = match(text, 0, pattern, 0, captures, 0);
if (match_len != text.length)
2024-11-24 13:18:21 -08:00
return NONE_ARRAY;
Array_t capture_array = {};
for (int i = 0; captures[i].occupied; i++) {
Text_t capture = Text$slice(text, I(captures[i].index+1), I(captures[i].index+captures[i].length));
Array$insert(&capture_array, &capture, I(0), sizeof(Text_t));
}
return capture_array;
2024-09-13 10:34:04 -07:00
}
2025-04-01 11:05:10 -07:00
static Array_t Pattern$find_all(Text_t text, Text_t pattern)
2024-09-13 10:34:04 -07:00
{
if (pattern.length == 0) // special case
return (Array_t){.length=0};
Array_t matches = {};
for (int64_t i = 1; ; ) {
2025-04-01 11:05:10 -07:00
OptionalPatternMatch m = find(text, pattern, I(i));
if (m.is_none)
break;
i = Int64$from_int(m.index, false) + m.text.length;
2025-04-01 11:05:10 -07:00
Array$insert(&matches, &m, I_small(0), sizeof(PatternMatch));
2024-09-13 10:34:04 -07:00
}
return matches;
}
typedef struct {
TextIter_t state;
Int_t i;
2025-04-01 11:05:10 -07:00
Text_t pattern;
} match_iter_state_t;
2025-04-01 11:05:10 -07:00
static OptionalPatternMatch next_match(match_iter_state_t *state)
{
if (Int64$from_int(state->i, false) > state->state.stack[0].text.length)
return NONE_MATCH;
2025-04-01 11:05:10 -07:00
OptionalPatternMatch m = find(state->state.stack[0].text, state->pattern, state->i);
if (m.is_none) // No match
state->i = I(state->state.stack[0].text.length + 1);
else
state->i = Int$plus(m.index, I(MAX(1, m.text.length)));
return m;
}
2025-04-01 11:05:10 -07:00
static Closure_t Pattern$by_match(Text_t text, Text_t pattern)
{
return (Closure_t){
.fn=(void*)next_match,
.userdata=new(match_iter_state_t, .state=NEW_TEXT_ITER_STATE(text), .i=I_small(1), .pattern=pattern),
};
}
2025-04-01 11:05:10 -07:00
static Text_t apply_backrefs(Text_t text, Array_t recursive_replacements, Text_t replacement, Text_t backref_pat, capture_t *captures)
2024-09-13 10:34:04 -07:00
{
if (backref_pat.length == 0)
return replacement;
int32_t first_grapheme = Text$get_grapheme(backref_pat, 0);
bool find_first = (first_grapheme != '{'
&& !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK)
&& !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
Text_t ret = Text("");
TextIter_t replacement_state = NEW_TEXT_ITER_STATE(replacement);
2024-09-13 10:34:04 -07:00
int64_t nonmatching_pos = 0;
for (int64_t pos = 0; pos < replacement.length; ) {
// Optimization: quickly skip ahead to first char in the backref pattern:
if (find_first) {
while (pos < replacement.length && Text$get_grapheme_fast(&replacement_state, pos) != first_grapheme)
2024-09-13 10:34:04 -07:00
++pos;
}
int64_t backref_len = match(replacement, pos, backref_pat, 0, NULL, 0);
if (backref_len < 0) {
pos += 1;
continue;
}
int64_t after_backref = pos + backref_len;
int64_t backref = parse_int(&replacement_state, &after_backref);
2024-09-13 10:34:04 -07:00
if (after_backref == pos + backref_len) { // Not actually a backref if there's no number
pos += 1;
continue;
}
if (backref < 0 || backref > 9) fail("Invalid backref index: ", backref, " (only 0-", MAX_BACKREFS-1, " are allowed)");
2024-09-13 10:34:04 -07:00
backref_len = (after_backref - pos);
if (Text$get_grapheme_fast(&replacement_state, pos + backref_len) == ';')
2024-09-13 10:34:04 -07:00
backref_len += 1; // skip optional semicolon
if (!captures[backref].occupied)
fail("There is no capture number ", backref, "!");
2024-09-13 10:34:04 -07:00
Text_t backref_text = Text$slice(text, I(captures[backref].index+1), I(captures[backref].index + captures[backref].length));
if (captures[backref].recursive && recursive_replacements.length > 0)
2025-04-01 11:05:10 -07:00
backref_text = replace_array(backref_text, recursive_replacements, backref_pat, true);
2024-09-13 10:34:04 -07:00
if (pos > nonmatching_pos) {
Text_t before_slice = Text$slice(replacement, I(nonmatching_pos+1), I(pos));
ret = Text$concat(ret, before_slice, backref_text);
} else {
ret = Text$concat(ret, backref_text);
}
pos += backref_len;
nonmatching_pos = pos;
}
if (nonmatching_pos < replacement.length) {
Text_t last_slice = Text$slice(replacement, I(nonmatching_pos+1), I(replacement.length));
ret = Text$concat(ret, last_slice);
}
return ret;
}
2025-04-01 11:05:10 -07:00
static Text_t Pattern$replace(Text_t text, Text_t pattern, Text_t replacement, Text_t backref_pat, bool recursive)
2024-09-13 10:34:04 -07:00
{
Text_t ret = EMPTY_TEXT;
2024-09-13 10:34:04 -07:00
int32_t first_grapheme = Text$get_grapheme(pattern, 0);
bool find_first = (first_grapheme != '{'
&& !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK)
&& !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
Text_t entries[2] = {pattern, replacement};
Array_t replacements = {
.data=entries,
.length=1,
.stride=sizeof(entries),
};
TextIter_t text_state = NEW_TEXT_ITER_STATE(text);
2024-09-13 10:34:04 -07:00
int64_t nonmatching_pos = 0;
for (int64_t pos = 0; pos < text.length; ) {
// Optimization: quickly skip ahead to first char in pattern:
if (find_first) {
while (pos < text.length && Text$get_grapheme_fast(&text_state, pos) != first_grapheme)
2024-09-13 10:34:04 -07:00
++pos;
}
capture_t captures[MAX_BACKREFS] = {};
int64_t match_len = match(text, pos, pattern, 0, captures, 1);
if (match_len < 0) {
pos += 1;
continue;
}
captures[0] = (capture_t){
.index = pos, .length = match_len,
.occupied = true, .recursive = false,
};
Text_t replacement_text = apply_backrefs(text, recursive ? replacements : (Array_t){}, replacement, backref_pat, captures);
2024-09-13 10:34:04 -07:00
if (pos > nonmatching_pos) {
Text_t before_slice = Text$slice(text, I(nonmatching_pos+1), I(pos));
ret = Text$concat(ret, before_slice, replacement_text);
} else {
ret = Text$concat(ret, replacement_text);
}
nonmatching_pos = pos + match_len;
pos += MAX(match_len, 1);
}
if (nonmatching_pos < text.length) {
Text_t last_slice = Text$slice(text, I(nonmatching_pos+1), I(text.length));
ret = Text$concat(ret, last_slice);
}
return ret;
}
2025-04-01 11:05:10 -07:00
static Text_t Pattern$trim(Text_t text, Text_t pattern, bool trim_left, bool trim_right)
2024-09-13 10:34:04 -07:00
{
int64_t first = 0, last = text.length-1;
if (trim_left) {
int64_t match_len = match(text, 0, pattern, 0, NULL, 0);
if (match_len > 0)
first = match_len;
}
if (trim_right) {
for (int64_t i = text.length-1; i >= first; i--) {
int64_t match_len = match(text, i, pattern, 0, NULL, 0);
if (match_len > 0 && i + match_len == text.length)
last = i-1;
}
}
return Text$slice(text, I(first+1), I(last+1));
}
2025-04-01 11:05:10 -07:00
static Text_t Pattern$map(Text_t text, Text_t pattern, Closure_t fn, bool recursive)
2024-09-13 10:34:04 -07:00
{
Text_t ret = EMPTY_TEXT;
2024-09-13 10:34:04 -07:00
int32_t first_grapheme = Text$get_grapheme(pattern, 0);
bool find_first = (first_grapheme != '{'
&& !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK)
&& !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
TextIter_t text_state = NEW_TEXT_ITER_STATE(text);
2024-09-13 10:34:04 -07:00
int64_t nonmatching_pos = 0;
2025-04-01 11:05:10 -07:00
Text_t (*text_mapper)(PatternMatch, void*) = fn.fn;
2024-09-13 10:34:04 -07:00
for (int64_t pos = 0; pos < text.length; pos++) {
// Optimization: quickly skip ahead to first char in pattern:
if (find_first) {
while (pos < text.length && Text$get_grapheme_fast(&text_state, pos) != first_grapheme)
2024-09-13 10:34:04 -07:00
++pos;
}
capture_t captures[MAX_BACKREFS] = {};
int64_t match_len = match(text, pos, pattern, 0, captures, 0);
2024-09-13 10:34:04 -07:00
if (match_len < 0) continue;
2025-04-01 11:05:10 -07:00
PatternMatch m = {
.text=Text$slice(text, I(pos+1), I(pos+match_len)),
.index=I(pos+1),
.captures={},
};
for (int i = 0; captures[i].occupied; i++) {
Text_t capture = Text$slice(text, I(captures[i].index+1), I(captures[i].index+captures[i].length));
if (recursive)
2025-04-01 11:05:10 -07:00
capture = Pattern$map(capture, pattern, fn, recursive);
Array$insert(&m.captures, &capture, I(0), sizeof(Text_t));
}
Text_t replacement = text_mapper(m, fn.userdata);
2024-09-13 10:34:04 -07:00
if (pos > nonmatching_pos) {
Text_t before_slice = Text$slice(text, I(nonmatching_pos+1), I(pos));
ret = Text$concat(ret, before_slice, replacement);
} else {
ret = Text$concat(ret, replacement);
}
nonmatching_pos = pos + match_len;
pos += (match_len - 1);
}
if (nonmatching_pos < text.length) {
Text_t last_slice = Text$slice(text, I(nonmatching_pos+1), I(text.length));
ret = Text$concat(ret, last_slice);
}
return ret;
}
2025-04-01 11:05:10 -07:00
static void Pattern$each(Text_t text, Text_t pattern, Closure_t fn, bool recursive)
2024-11-09 13:55:44 -08:00
{
int32_t first_grapheme = Text$get_grapheme(pattern, 0);
bool find_first = (first_grapheme != '{'
&& !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK)
&& !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
TextIter_t text_state = NEW_TEXT_ITER_STATE(text);
2025-04-01 11:05:10 -07:00
void (*action)(PatternMatch, void*) = fn.fn;
2024-11-09 13:55:44 -08:00
for (int64_t pos = 0; pos < text.length; pos++) {
// Optimization: quickly skip ahead to first char in pattern:
if (find_first) {
while (pos < text.length && Text$get_grapheme_fast(&text_state, pos) != first_grapheme)
++pos;
}
capture_t captures[MAX_BACKREFS] = {};
int64_t match_len = match(text, pos, pattern, 0, captures, 0);
if (match_len < 0) continue;
2025-04-01 11:05:10 -07:00
PatternMatch m = {
2024-11-09 13:55:44 -08:00
.text=Text$slice(text, I(pos+1), I(pos+match_len)),
.index=I(pos+1),
.captures={},
};
for (int i = 0; captures[i].occupied; i++) {
Text_t capture = Text$slice(text, I(captures[i].index+1), I(captures[i].index+captures[i].length));
if (recursive)
2025-04-01 11:05:10 -07:00
Pattern$each(capture, pattern, fn, recursive);
2024-11-09 13:55:44 -08:00
Array$insert(&m.captures, &capture, I(0), sizeof(Text_t));
}
action(m, fn.userdata);
pos += (match_len - 1);
}
}
2025-04-01 11:05:10 -07:00
Text_t replace_array(Text_t text, Array_t replacements, Text_t backref_pat, bool recursive)
2024-09-13 10:34:04 -07:00
{
if (replacements.length == 0) return text;
2024-09-13 10:34:04 -07:00
Text_t ret = EMPTY_TEXT;
2024-09-13 10:34:04 -07:00
int64_t nonmatch_pos = 0;
for (int64_t pos = 0; pos < text.length; ) {
// Find the first matching pattern at this position:
for (int64_t i = 0; i < replacements.length; i++) {
2025-04-01 11:05:10 -07:00
Text_t pattern = *(Text_t*)(replacements.data + i*replacements.stride);
2024-09-13 10:34:04 -07:00
capture_t captures[MAX_BACKREFS] = {};
int64_t len = match(text, pos, pattern, 0, captures, 1);
if (len < 0) continue;
captures[0].index = pos;
captures[0].length = len;
// If we skipped over some non-matching text before finding a match, insert it here:
if (pos > nonmatch_pos) {
Text_t before_slice = Text$slice(text, I(nonmatch_pos+1), I(pos));
ret = Text$concat(ret, before_slice);
}
// Concatenate the replacement:
Text_t replacement = *(Text_t*)(replacements.data + i*replacements.stride + sizeof(Text_t));
Text_t replacement_text = apply_backrefs(text, recursive ? replacements : (Array_t){}, replacement, backref_pat, captures);
2024-09-13 10:34:04 -07:00
ret = Text$concat(ret, replacement_text);
pos += MAX(len, 1);
nonmatch_pos = pos;
goto next_pos;
}
pos += 1;
next_pos:
continue;
}
if (nonmatch_pos <= text.length) {
Text_t last_slice = Text$slice(text, I(nonmatch_pos+1), I(text.length));
ret = Text$concat(ret, last_slice);
}
return ret;
}
2025-04-01 11:05:10 -07:00
static Text_t Pattern$replace_all(Text_t text, Table_t replacements, Text_t backref_pat, bool recursive)
{
2025-04-01 11:05:10 -07:00
return replace_array(text, replacements.entries, backref_pat, recursive);
}
2025-04-01 11:05:10 -07:00
static Array_t Pattern$split(Text_t text, Text_t pattern)
2024-09-13 10:34:04 -07:00
{
if (text.length == 0) // special case
return (Array_t){.length=0};
if (pattern.length == 0) // special case
return Text$clusters(text);
Array_t chunks = {};
2024-10-27 17:18:30 -07:00
int64_t i = 0;
2024-09-13 10:34:04 -07:00
for (;;) {
int64_t len = 0;
int64_t found = _find(text, pattern, i, text.length-1, &len, NULL);
if (found == i && len == 0)
found = _find(text, pattern, i + 1, text.length-1, &len, NULL);
2024-10-27 17:18:30 -07:00
if (found < 0) break;
Text_t chunk = Text$slice(text, I(i+1), I(found));
2024-09-13 10:34:04 -07:00
Array$insert(&chunks, &chunk, I_small(0), sizeof(Text_t));
i = MAX(found + len, i + 1);
2024-09-13 10:34:04 -07:00
}
2024-10-27 17:18:30 -07:00
Text_t last_chunk = Text$slice(text, I(i+1), I(text.length));
2024-09-13 10:34:04 -07:00
Array$insert(&chunks, &last_chunk, I_small(0), sizeof(Text_t));
return chunks;
}
typedef struct {
TextIter_t state;
int64_t i;
2025-04-01 11:05:10 -07:00
Text_t pattern;
} split_iter_state_t;
static OptionalText_t next_split(split_iter_state_t *state)
{
Text_t text = state->state.stack[0].text;
if (state->i >= text.length) {
if (state->pattern.length > 0 && state->i == text.length) { // special case
state->i = text.length + 1;
return EMPTY_TEXT;
}
return NONE_TEXT;
}
if (state->pattern.length == 0) { // special case
Text_t ret = Text$cluster(text, I(state->i+1));
state->i += 1;
return ret;
}
int64_t start = state->i;
int64_t len = 0;
int64_t found = _find(text, state->pattern, start, text.length-1, &len, NULL);
if (found == start && len == 0)
found = _find(text, state->pattern, start + 1, text.length-1, &len, NULL);
if (found >= 0) {
state->i = MAX(found + len, state->i + 1);
return Text$slice(text, I(start+1), I(found));
} else {
state->i = state->state.stack[0].text.length + 1;
return Text$slice(text, I(start+1), I(text.length));
}
}
2025-04-01 11:05:10 -07:00
static Closure_t Pattern$by_split(Text_t text, Text_t pattern)
{
return (Closure_t){
.fn=(void*)next_split,
.userdata=new(split_iter_state_t, .state=NEW_TEXT_ITER_STATE(text), .i=0, .pattern=pattern),
};
}
2025-04-01 11:05:10 -07:00
static Text_t Pattern$escape_text(Text_t text)
2025-03-30 23:17:47 -07:00
{
// TODO: optimize for spans of non-escaped text
Text_t ret = EMPTY_TEXT;
TextIter_t state = NEW_TEXT_ITER_STATE(text);
for (int64_t i = 0; i < text.length; i++) {
uint32_t g = Text$get_main_grapheme_fast(&state, i);
if (g == '{') {
ret = Text$concat(ret, Text("{1{}"));
} else if (g == '?'
|| uc_is_property_quotation_mark(g)
|| (uc_is_property_paired_punctuation(g) && uc_is_property_left_of_pair(g))) {
ret = Text$concat(ret, Text("{1"), Text$slice(text, I(i+1), I(i+1)), Text("}"));
} else {
ret = Text$concat(ret, Text$slice(text, I(i+1), I(i+1)));
}
}
return ret;
}
static Text_t Pattern$as_text(const void *obj, bool colorize, const TypeInfo_t *info)
{
(void)info;
if (!obj) return Text("Pattern");
2025-04-01 11:05:10 -07:00
Text_t pat = *(Text_t*)obj;
Text_t quote = Pattern$has(pat, Text("/")) && !Pattern$has(pat, Text("|")) ? Text("|") : Text("/");
return Text$concat(colorize ? Text("\x1b[1m$\033[m") : Text("$"), Text$quoted(pat, colorize, quote));
}
2024-09-13 10:34:04 -07:00
// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0