about summary refs log tree commit diff
path: root/pattern.c
diff options
context:
space:
mode:
author Bruce Hill <bruce@bruce-hill.com> 2023-05-06 13:43:32 -0400
committer Bruce Hill <bruce@bruce-hill.com> 2023-05-06 13:43:32 -0400
commit 6f5bb02b923c1402eba04ce78033317da401971e (patch)
tree 7b43203ca28667b8d3526445b1ee2aff8bf43495 /pattern.c
parent 0050a6fc064f7700d9940bf4fd381c7a072652a6 (diff)
Use tagged union style for extra safety and concision
Diffstat (limited to 'pattern.c')
-rw-r--r-- pattern.c 238
1 files changed, 87 insertions, 151 deletions
diff --git a/pattern.c b/pattern.c
index 9eee45a..75b21b3 100644
--- a/pattern.c
+++ b/pattern.c
@@ -46,24 +46,17 @@ static inline void parse_err(const char *start, const char *end, const char *msg
// Allocate a new pattern for this file (ensuring it will be automatically
// freed when the file is freed)
//
-__attribute__((returns_nonnull, nonnull(2)))
-static pat_t *new_pat(enum pattype_e type, const char *start, const char *end, size_t minlen, ssize_t maxlen)
+pat_t *allocate_pat(pat_t pat)
{
static size_t next_pat_id = 1;
- pat_t *pat = new(pat_t);
- *pat = (pat_t){
- .home = &allocated_pats,
- .next = allocated_pats,
- .type = type,
- .start = start,
- .end = end,
- .min_matchlen = minlen,
- .max_matchlen = maxlen,
- .id = next_pat_id++,
- };
- if (allocated_pats) allocated_pats->home = &pat->next;
- allocated_pats = pat;
- return pat;
+ pat_t *allocated = new(pat_t);
+ *allocated = pat;
+ allocated->home = &allocated_pats;
+ allocated->next = allocated_pats;
+ allocated->id = next_pat_id++;
+ if (allocated_pats) allocated_pats->home = &allocated->next;
+ allocated_pats = allocated;
+ return allocated;
}
//
@@ -75,12 +68,8 @@ static pat_t *new_range(const char *start, const char *end, size_t min, ssize_t
size_t minlen = min*repeating->min_matchlen + (min > 0 ? min-1 : 0)*(sep ? sep->min_matchlen : 0);
ssize_t maxlen = (max == -1 || UNBOUNDED(repeating) || (max != 0 && max != 1 && sep && UNBOUNDED(sep))) ? (ssize_t)-1
: max*repeating->max_matchlen + (ssize_t)(max > 0 ? min-1 : 0)*(ssize_t)(sep ? sep->min_matchlen : 0);
- pat_t *range = new_pat(BP_REPEAT, start, end, minlen, maxlen);
- range->args.repetitions.min = min;
- range->args.repetitions.max = max;
- range->args.repetitions.repeat_pat = repeating;
- range->args.repetitions.sep = sep;
- return range;
+ return Pattern(BP_REPEAT, start, end, minlen, maxlen,
+ .min=min, .max=max, .repeat_pat=repeating, .sep=sep);
}
//
@@ -126,12 +115,9 @@ static pat_t *expand_replacements(pat_t *replace_pat, const char *end, bool allo
replen = 0;
}
- pat_t *pat = new_pat(BP_REPLACE, replace_pat->start, str,
- replace_pat->min_matchlen, replace_pat->max_matchlen);
- pat->args.replace.pat = replace_pat;
- pat->args.replace.text = repstr;
- pat->args.replace.len = replen;
- replace_pat = pat;
+ replace_pat = Pattern(BP_REPLACE, replace_pat->start, str,
+ replace_pat->min_matchlen, replace_pat->max_matchlen,
+ .pat=replace_pat, .text=repstr, .len=replen);
}
return replace_pat;
}
@@ -152,7 +138,7 @@ static pat_t *expand_choices(pat_t *first, const char *end, bool allow_nl)
pat_t *second = bp_simplepattern(str, end);
if (second) str = second->end;
if (matchstr(&str, "=>", allow_nl, end))
- second = expand_replacements(second ? second : new_pat(BP_STRING, str-2, str-2, 0, 0), end, allow_nl);
+ second = expand_replacements(second ? second : Pattern(BP_STRING, str-2, str-2, 0, 0), end, allow_nl);
if (!second)
parse_err(str, str, "There should be a pattern here after a '/'");
second = expand_choices(second, end, allow_nl);
@@ -169,18 +155,12 @@ pat_t *chain_together(pat_t *first, pat_t *second)
if (second == NULL) return first;
if (first->type == BP_DEFINITIONS && second->type == BP_DEFINITIONS) {
- pat_t *chain = new_pat(BP_CHAIN, first->start, second->end, second->min_matchlen, second->max_matchlen);
- chain->args.multiple.first = first;
- chain->args.multiple.second = second;
- return chain;
+ return Pattern(BP_CHAIN, first->start, second->end, second->min_matchlen, second->max_matchlen, .first=first, .second=second);
}
size_t minlen = first->min_matchlen + second->min_matchlen;
ssize_t maxlen = (UNBOUNDED(first) || UNBOUNDED(second)) ? (ssize_t)-1 : first->max_matchlen + second->max_matchlen;
- pat_t *chain = new_pat(BP_CHAIN, first->start, second->end, minlen, maxlen);
- chain->args.multiple.first = first;
- chain->args.multiple.second = second;
- return chain;
+ return Pattern(BP_CHAIN, first->start, second->end, minlen, maxlen, .first=first, .second=second);
}
//
@@ -194,10 +174,7 @@ pat_t *either_pat(pat_t *first, pat_t *second)
size_t minlen = first->min_matchlen < second->min_matchlen ? first->min_matchlen : second->min_matchlen;
ssize_t maxlen = (UNBOUNDED(first) || UNBOUNDED(second)) ? (ssize_t)-1 :
(first->max_matchlen > second->max_matchlen ? first->max_matchlen : second->max_matchlen);
- pat_t *either = new_pat(BP_OTHERWISE, first->start, second->end, minlen, maxlen);
- either->args.multiple.first = first;
- either->args.multiple.second = second;
- return either;
+ return Pattern(BP_OTHERWISE, first->start, second->end, minlen, maxlen, .first=first, .second=second);
}
//
@@ -215,21 +192,13 @@ static pat_t *_bp_definition(const char *start, const char *end)
if (!def) parse_err(str, end, "Could not parse this definition.");
str = def->end;
(void)matchchar(&str, ';', false, end); // Optional semicolon
- pat_t *ret = new_pat(BP_DEFINITIONS, start, str, 0, -1);
- ret->args.def.name = start;
- ret->args.def.namelen = namelen;
if (is_tagged) { // `id:: foo` means define a rule named `id` that gives captures an `id` tag
- pat_t *capture = new_pat(BP_TAGGED, def->start, def->end, def->min_matchlen, def->max_matchlen);
- capture->args.capture.capture_pat = def;
- capture->args.capture.name = start;
- capture->args.capture.namelen = namelen;
- def = capture;
+ def = Pattern(BP_TAGGED, def->start, def->end, def->min_matchlen, def->max_matchlen,
+ .pat=def, .name=start, .namelen=namelen);
}
- ret->args.def.meaning = def;
- ret->args.def.next_def = _bp_definition(after_spaces(str, true, end), end);
- if (ret->args.def.next_def)
- ret->end = ret->args.def.next_def->end;
- return ret;
+ pat_t *next_def = _bp_definition(after_spaces(str, true, end), end);
+ return Pattern(BP_DEFINITIONS, start, next_def ? next_def->end : str, 0, -1,
+ .name=start, .namelen=namelen, .meaning=def, .next_def=next_def);
}
//
@@ -252,25 +221,30 @@ static pat_t *_bp_simplepattern(const char *str, const char *end, bool inside_st
pat_t *extra_arg = NULL;
if (matchchar(&str, '%', false, end)) {
extra_arg = bp_simplepattern(str, end);
- if (!extra_arg)
+ if (extra_arg)
+ str = extra_arg->end;
+ else
parse_err(str, str, "There should be a pattern to skip here after the '%'");
} else if (matchchar(&str, '=', false, end)) {
extra_arg = bp_simplepattern(str, end);
- if (!extra_arg)
+ if (extra_arg)
+ str = extra_arg->end;
+ else
parse_err(str, str, "There should be a pattern here after the '='");
type = BP_UPTO_STRICT;
}
- pat_t *upto = new_pat(type, start, extra_arg ? extra_arg->end : str, 0, -1);
- upto->args.multiple.second = extra_arg;
+ pat_t *target;
if (inside_stringpattern) {
- maybe_pat_t target = bp_stringpattern(upto->end, end);
- upto->args.multiple.first = target.success ? target.value.pat : NULL;
+ maybe_pat_t maybe_target = bp_stringpattern(str, end);
+ target = maybe_target.success ? maybe_target.value.pat : NULL;
} else {
- upto->args.multiple.first = bp_simplepattern(upto->end, end);
+ target = bp_simplepattern(str, end);
}
- return upto;
+ return type == BP_UPTO ?
+ Pattern(BP_UPTO, start, str, 0, -1, .target=target, .skip=extra_arg)
+ : Pattern(BP_UPTO_STRICT, start, str, 0, -1, .target=target, .skip=extra_arg);
} else {
- return new_pat(BP_ANYCHAR, start, str, 1, UTF8_MAXCHARLEN);
+ return Pattern(BP_ANYCHAR, start, str, 1, UTF8_MAXCHARLEN);
}
}
// Char literals
@@ -295,14 +269,11 @@ static pat_t *_bp_simplepattern(const char *str, const char *end, bool inside_st
c2 = tmp;
}
str = next_char(c2_loc, end);
- pat_t *pat = new_pat(BP_RANGE, start == c1_loc - 1 ? start : c1_loc, str, 1, 1);
- pat->args.range.low = (unsigned char)c1;
- pat->args.range.high = (unsigned char)c2;
+ pat_t *pat = Pattern(BP_RANGE, start == c1_loc - 1 ? start : c1_loc, str, 1, 1, .low=c1, .high=c2);
all = either_pat(all, pat);
} else {
size_t len = (size_t)(str - c1_loc);
- pat_t *pat = new_pat(BP_STRING, start, str, len, (ssize_t)len);
- pat->args.string = c1_loc;
+ pat_t *pat = Pattern(BP_STRING, start, str, len, (ssize_t)len, .string=c1_loc);
all = either_pat(all, pat);
}
} while (*str++ == ',');
@@ -318,19 +289,19 @@ static pat_t *_bp_simplepattern(const char *str, const char *end, bool inside_st
do { // Comma-separated items:
const char *itemstart = str-1;
if (*str == 'N') { // \N (nodent)
- all = either_pat(all, new_pat(BP_NODENT, itemstart, ++str, 1, -1));
+ all = either_pat(all, Pattern(BP_NODENT, itemstart, ++str, 1, -1));
continue;
} else if (*str == 'C') { // \C (current indent)
- all = either_pat(all, new_pat(BP_CURDENT, itemstart, ++str, 1, -1));
+ all = either_pat(all, Pattern(BP_CURDENT, itemstart, ++str, 1, -1));
continue;
} else if (*str == 'i') { // \i (identifier char)
- all = either_pat(all, new_pat(BP_ID_CONTINUE, itemstart, ++str, 1, -1));
+ all = either_pat(all, Pattern(BP_ID_CONTINUE, itemstart, ++str, 1, -1));
continue;
} else if (*str == 'I') { // \I (identifier char, not including numbers)
- all = either_pat(all, new_pat(BP_ID_START, itemstart, ++str, 1, -1));
+ all = either_pat(all, Pattern(BP_ID_START, itemstart, ++str, 1, -1));
continue;
} else if (*str == 'b') { // \b word boundary
- all = either_pat(all, new_pat(BP_WORD_BOUNDARY, itemstart, ++str, 0, 0));
+ all = either_pat(all, Pattern(BP_WORD_BOUNDARY, itemstart, ++str, 0, 0));
continue;
}
@@ -350,9 +321,7 @@ static pat_t *_bp_simplepattern(const char *str, const char *end, bool inside_st
if (e_high < e_low)
parse_err(start, str, "Escape ranges should be low-to-high, but this is high-to-low.");
}
- pat_t *esc = new_pat(BP_RANGE, start, str, 1, 1);
- esc->args.range.low = e_low;
- esc->args.range.high = e_high;
+ pat_t *esc = Pattern(BP_RANGE, start, str, 1, 1, .low=e_low, .high=e_high);
all = either_pat(all, esc);
} while (*str == ',' && str++ < end);
@@ -360,7 +329,7 @@ static pat_t *_bp_simplepattern(const char *str, const char *end, bool inside_st
}
// Word boundary
case '|': {
- return new_pat(BP_WORD_BOUNDARY, start, str, 0, 0);
+ return Pattern(BP_WORD_BOUNDARY, start, str, 0, 0);
}
// String literal
case '"': case '\'': case '\002': case '{': {
@@ -370,18 +339,13 @@ static pat_t *_bp_simplepattern(const char *str, const char *end, bool inside_st
str = next_char(str, end);
size_t len = (size_t)(str - litstart);
str = next_char(str, end);
-
- pat_t *pat = new_pat(BP_STRING, start, str, len, (ssize_t)len);
- pat->args.string = litstart;
- return pat;
+ return Pattern(BP_STRING, start, str, len, (ssize_t)len, .string=litstart);
}
// Not <pat>
case '!': {
pat_t *p = bp_simplepattern(str, end);
if (!p) parse_err(str, str, "There should be a pattern after this '!'");
- pat_t *not = new_pat(BP_NOT, start, p->end, 0, 0);
- not->args.pat = p;
- return not;
+ return Pattern(BP_NOT, start, p->end, 0, 0, .pat=p);
}
// Number of repetitions: <N>(-<N> / - / + / "")
case '0': case '1': case '2': case '3': case '4': case '5':
@@ -421,21 +385,14 @@ static pat_t *_bp_simplepattern(const char *str, const char *end, bool inside_st
pat_t *behind = bp_simplepattern(str, end);
if (!behind)
parse_err(str, str, "There should be a pattern after this '<'");
- str = behind->end;
- str = behind->end;
- pat_t *pat = new_pat(BP_AFTER, start, str, 0, 0);
- pat->args.pat = behind;
- return pat;
+ return Pattern(BP_AFTER, start, behind->end, 0, 0, .pat=behind);
}
// Lookahead
case '>': {
pat_t *ahead = bp_simplepattern(str, end);
if (!ahead)
parse_err(str, str, "There should be a pattern after this '>'");
- str = ahead->end;
- pat_t *pat = new_pat(BP_BEFORE, start, str, 0, 0);
- pat->args.pat = ahead;
- return pat;
+ return Pattern(BP_BEFORE, start, ahead->end, 0, 0, .pat=ahead);
}
// Parentheses
case '(': {
@@ -486,11 +443,8 @@ static pat_t *_bp_simplepattern(const char *str, const char *end, bool inside_st
p = bp_simplepattern(str, end);
if (p) str = p->end;
}
- pat_t *tagged = new_pat(BP_TAGGED, start, str, p ? p->min_matchlen : 0, p ? p->max_matchlen : 0);
- tagged->args.capture.capture_pat = p;
- tagged->args.capture.name = name;
- tagged->args.capture.namelen = namelen;
- return tagged;
+ return Pattern(BP_TAGGED, start, str, p ? p->min_matchlen : 0, p ? p->max_matchlen : 0,
+ .pat=p, .name=name, .namelen=namelen);
}
const char *name = NULL;
@@ -512,24 +466,20 @@ static pat_t *_bp_simplepattern(const char *str, const char *end, bool inside_st
if (!pat)
parse_err(str, str, "There should be a valid pattern here to capture after the '@'");
- pat_t *capture = new_pat(BP_CAPTURE, start, pat->end, pat->min_matchlen, pat->max_matchlen);
- capture->args.capture.capture_pat = pat;
- capture->args.capture.name = name;
- capture->args.capture.namelen = namelen;
- capture->args.capture.backreffable = backreffable;
- return capture;
+ return Pattern(BP_CAPTURE, start, pat->end, pat->min_matchlen, pat->max_matchlen,
+ .pat = pat, .name = name, .namelen = namelen, .backreffable = backreffable);
}
// Start of file/line
case '^': {
if (*str == '^')
- return new_pat(BP_START_OF_FILE, start, ++str, 0, 0);
- return new_pat(BP_START_OF_LINE, start, str, 0, 0);
+ return Pattern(BP_START_OF_FILE, start, ++str, 0, 0);
+ return Pattern(BP_START_OF_LINE, start, str, 0, 0);
}
// End of file/line:
case '$': {
if (*str == '$')
- return new_pat(BP_END_OF_FILE, start, ++str, 0, 0);
- return new_pat(BP_END_OF_LINE, start, str, 0, 0);
+ return Pattern(BP_END_OF_FILE, start, ++str, 0, 0);
+ return Pattern(BP_END_OF_LINE, start, str, 0, 0);
}
default: {
pat_t *def = _bp_definition(start, end);
@@ -538,10 +488,7 @@ static pat_t *_bp_simplepattern(const char *str, const char *end, bool inside_st
if (!isalpha(c) && c != '_') return NULL;
str = after_name(start, end);
size_t namelen = (size_t)(str - start);
- pat_t *ref = new_pat(BP_REF, start, str, 0, -1);
- ref->args.ref.name = start;
- ref->args.ref.len = namelen;
- return ref;
+ return Pattern(BP_REF, start, str, 0, -1, .name=start, .len=namelen);
}
}
}
@@ -570,8 +517,7 @@ maybe_pat_t bp_stringpattern(const char *str, const char *end)
// End of string
size_t len = (size_t)(str - start);
if (len > 0) {
- pat_t *str_chunk = new_pat(BP_STRING, start, str, len, (ssize_t)len);
- str_chunk->args.string = start;
+ pat_t *str_chunk = Pattern(BP_STRING, start, str, len, (ssize_t)len, .string=start);
ret = chain_together(ret, str_chunk);
}
if (interp) {
@@ -581,7 +527,7 @@ maybe_pat_t bp_stringpattern(const char *str, const char *end)
(void)matchchar(&str, ';', false, end);
}
}
- if (!ret) ret = new_pat(BP_STRING, str, str, 0, 0);
+ if (!ret) ret = Pattern(BP_STRING, str, str, 0, 0);
__END_TRY_PATTERN__
return (maybe_pat_t){.success = true, .value.pat = ret};
}
@@ -610,9 +556,9 @@ static pat_t *bp_simplepattern(const char *str, const char *end)
if (!second)
parse_err(str, str, "There should be a valid pattern here");
- pat = new_pat(type, start, second->end, first->min_matchlen, first->max_matchlen);
- pat->args.multiple.first = first;
- pat->args.multiple.second = second;
+ pat = type == BP_MATCH ?
+ Pattern(BP_MATCH, start, second->end, first->min_matchlen, first->max_matchlen, .pat=first, .must_match=second)
+ : Pattern(BP_NOT_MATCH, start, second->end, first->min_matchlen, first->max_matchlen, .pat=first, .must_not_match=second);
str = pat->end;
}
@@ -625,8 +571,6 @@ static pat_t *bp_simplepattern(const char *str, const char *end)
//
maybe_pat_t bp_replacement(pat_t *replacepat, const char *replacement, const char *end)
{
- pat_t *pat = new_pat(BP_REPLACE, replacepat->start, replacepat->end, replacepat->min_matchlen, replacepat->max_matchlen);
- pat->args.replace.pat = replacepat;
const char *p = replacement;
if (!end) end = replacement + strlen(replacement);
__TRY_PATTERN__
@@ -641,8 +585,8 @@ maybe_pat_t bp_replacement(pat_t *replacepat, const char *replacement, const cha
size_t rlen = (size_t)(p-replacement);
char *rcpy = new(char[rlen + 1]);
memcpy(rcpy, replacement, rlen);
- pat->args.replace.text = rcpy;
- pat->args.replace.len = rlen;
+ pat_t *pat = Pattern(BP_REPLACE, replacepat->start, replacepat->end, replacepat->min_matchlen, replacepat->max_matchlen,
+ .pat=replacepat, .text=rcpy, .len=rlen);
return (maybe_pat_t){.success = true, .value.pat = pat};
}
@@ -652,7 +596,7 @@ static pat_t *bp_pattern_nl(const char *str, const char *end, bool allow_nl)
pat_t *pat = bp_simplepattern(str, end);
if (pat != NULL) pat = expand_choices(pat, end, allow_nl);
if (matchstr(&str, "=>", allow_nl, end))
- pat = expand_replacements(pat ? pat : new_pat(BP_STRING, str-2, str-2, 0, 0), end, allow_nl);
+ pat = expand_replacements(pat ? pat : Pattern(BP_STRING, str-2, str-2, 0, 0), end, allow_nl);
return pat;
}
@@ -661,9 +605,7 @@ static pat_t *bp_pattern_nl(const char *str, const char *end, bool allow_nl)
//
pat_t *bp_raw_literal(const char *str, size_t len)
{
- pat_t *lit = new_pat(BP_STRING, str, &str[len], len, (ssize_t)len);
- lit->args.string = str;
- return lit;
+ return Pattern(BP_STRING, str, &str[len], len, (ssize_t)len, .string=str);
}
//
@@ -698,36 +640,30 @@ void delete_pat(pat_t **at_pat, bool recursive)
pat_t *pat = *at_pat;
if (!pat) return;
+#define T(tag, ...) case tag: { auto _data = Match(pat, tag); __VA_ARGS__; break; }
+#define F(field) delete_pat(&_data->field, true)
if (recursive) {
switch (pat->type) {
- case BP_DEFINITIONS:
- delete_pat(&pat->args.def.meaning, true);
- delete_pat(&pat->args.def.next_def, true);
- break;
- case BP_REPEAT:
- delete_pat(&pat->args.repetitions.sep, true);
- delete_pat(&pat->args.repetitions.repeat_pat, true);
- break;
- case BP_CHAIN: case BP_UPTO: case BP_UPTO_STRICT:
- case BP_OTHERWISE: case BP_NOT_MATCH: case BP_MATCH:
- delete_pat(&pat->args.multiple.first, true);
- delete_pat(&pat->args.multiple.second, true);
- break;
- case BP_REPLACE:
- delete_pat(&pat->args.replace.pat, true);
- break;
- case BP_CAPTURE: case BP_TAGGED:
- delete_pat(&pat->args.capture.capture_pat, true);
- break;
- case BP_NOT: case BP_AFTER: case BP_BEFORE:
- delete_pat(&pat->args.pat, true);
- break;
- case BP_LEFTRECURSION:
- delete_pat(&pat->args.leftrec->fallback, true);
- break;
+ T(BP_DEFINITIONS, F(meaning), F(next_def))
+ T(BP_REPEAT, F(sep), F(repeat_pat))
+ T(BP_CHAIN, F(first), F(second))
+ T(BP_UPTO, F(target), F(skip))
+ T(BP_UPTO_STRICT, F(target), F(skip))
+ T(BP_OTHERWISE, F(first), F(second))
+ T(BP_MATCH, F(pat), F(must_match))
+ T(BP_NOT_MATCH, F(pat), F(must_not_match))
+ T(BP_REPLACE, F(pat))
+ T(BP_CAPTURE, F(pat))
+ T(BP_TAGGED, F(pat))
+ T(BP_NOT, F(pat))
+ T(BP_AFTER, F(pat))
+ T(BP_BEFORE, F(pat))
+ T(BP_LEFTRECURSION, F(fallback))
default: break;
}
}
+#undef F
+#undef T
if (pat->home) *(pat->home) = pat->next;
if (pat->next) pat->next->home = pat->home;