Converted ^/^^/$/$$ into pattern types instead of builtin definitions

This commit is contained in:
Bruce Hill 2021-05-11 11:39:42 -07:00
parent b09bd4c746
commit 3359a804c8
5 changed files with 68 additions and 113 deletions

View File

@ -17,7 +17,7 @@ String-pattern: ..%(\n / Nodent / Escape / `\ pat [`;])$$
pat: simple-pat !(__("!="/"==")) / suffixed-pat
simple-pat: Upto-and / Dot / String / Chars / Nodent / Escape-range
/ Escape / Repeat / Optional / No / After / Before / Capture
/ Ref / parens
/ Start-of-File / Start-of-Line / End-of-File / End-of-Line / Ref / parens
suffixed-pat: (
Eq-pat
@ -63,6 +63,10 @@ Replace: (
@replace-pat=(Replace / Chain / pat) __ "=>" (__ @replacement=String / @!=(''=> "Expected replacement string"))
)
Ref: @name=id !(__`:)
Start-of-File: "^^"
Start-of-Line: "^"
End-of-File: "$$"
End-of-Line: "$"
parens: `( __ extended-pat (__ `) / @!=(''=> "Expected closing parenthesis here"))
@ -73,11 +77,7 @@ extended-pat: Otherwise / Replace / Chain / pat
# Special-symbol rules:
_: *(` / \t)
__: *(` / \t / \r / \n / comment)
$$: !(./\n)
$: !.
^^: !<(./\n)
^: !<.
id: "^^" / "^" / "__" / "_" / "$$" / "$" / "|" / `a-z,A-Z *`a-z,A-Z,0-9,-
id: "__" / "_" / "|" / `a-z,A-Z *`a-z,A-Z,0-9,-
comment: `# .. $

View File

@ -38,9 +38,5 @@ esc: \e
tab: \t
nl: \n; lf: \n
comment: !''; # No default definition, can be overridden
$$: !(./\n)
$: !.
^^: !<(./\n)
^: !<.
__: *(` /\t/\n/\r/comment)
_: *(` /\t)

127
match.c
View File

@ -36,7 +36,7 @@ static match_t *in_use_matches = NULL;
#endif
__attribute__((returns_nonnull))
static match_t *new_match(void);
static match_t *new_match(pat_t *pat, const char *start, const char *end, match_t *child);
__attribute__((nonnull, pure))
static inline const char *next_char(file_t *f, const char *str);
__attribute__((nonnull))
@ -172,34 +172,32 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
}
}
case BP_ANYCHAR: {
if (str >= f->end || *str == '\n')
return NULL;
match_t *m = new_match();
m->pat = pat;
m->start = str;
m->end = next_char(f, str);
return m;
return (str < f->end && *str != '\n') ? new_match(pat, str, next_char(f, str), NULL) : NULL;
}
case BP_START_OF_FILE: {
return (str == f->contents) ? new_match(pat, str, str, NULL) : NULL;
}
case BP_START_OF_LINE: {
return (str == f->contents || str[-1] == '\n') ? new_match(pat, str, str, NULL) : NULL;
}
case BP_END_OF_FILE: {
return (str == f->end) ? new_match(pat, str, str, NULL) : NULL;
}
case BP_END_OF_LINE: {
return (str == f->end || *str == '\n') ? new_match(pat, str, str, NULL) : NULL;
}
case BP_STRING: {
if (&str[pat->len] > f->end) return NULL;
if (ignorecase ? memicmp(str, pat->args.string, (size_t)pat->len) != 0
: memcmp(str, pat->args.string, (size_t)pat->len) != 0)
return NULL;
match_t *m = new_match();
m->pat = pat;
m->start = str;
m->end = str + pat->len;
return m;
return new_match(pat, str, str + pat->len, NULL);
}
case BP_RANGE: {
if (str >= f->end) return NULL;
if ((unsigned char)*str < pat->args.range.low || (unsigned char)*str > pat->args.range.high)
return NULL;
match_t *m = new_match();
m->pat = pat;
m->start = str;
m->end = str + 1;
return m;
return new_match(pat, str, str+1, NULL);
}
case BP_NOT: {
match_t *m = match(defs, f, str, pat->args.pat, ignorecase);
@ -207,17 +205,10 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
recycle_if_unused(&m);
return NULL;
}
m = new_match();
m->pat = pat;
m->start = str;
m->end = str;
return m;
return new_match(pat, str, str, NULL);
}
case BP_UPTO: {
match_t *m = new_match();
m->start = str;
m->pat = pat;
match_t *m = new_match(pat, str, str, NULL);
pat_t *target = pat->args.multiple.first, *skip = pat->args.multiple.second;
if (!target && !skip) {
while (str < f->end && *str != '\n') ++str;
@ -258,11 +249,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
return NULL;
}
case BP_REPEAT: {
match_t *m = new_match();
m->start = str;
m->end = str;
m->pat = pat;
match_t *m = new_match(pat, str, str, NULL);
match_t **dest = &m->child;
size_t reps = 0;
ssize_t max = pat->args.repetitions.max;
@ -319,32 +306,17 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
if (str - backtrack < f->contents) return NULL;
match_t *before = match(defs, f, str - backtrack, pat->args.pat, ignorecase);
if (before == NULL) return NULL;
match_t *m = new_match();
m->start = str;
m->end = str;
m->pat = pat;
ADD_OWNER(m->child, before);
return m;
return new_match(pat, str, str, before);
}
case BP_BEFORE: {
match_t *after = match(defs, f, str, pat->args.pat, ignorecase);
if (after == NULL) return NULL;
match_t *m = new_match();
m->start = str;
m->end = str;
m->pat = pat;
ADD_OWNER(m->child, after);
return m;
return new_match(pat, str, str, after);
}
case BP_CAPTURE: {
match_t *p = match(defs, f, str, pat->args.pat, ignorecase);
if (p == NULL) return NULL;
match_t *m = new_match();
m->start = str;
m->end = p->end;
m->pat = pat;
ADD_OWNER(m->child, p);
return m;
return new_match(pat, str, p->end, p);
}
case BP_OTHERWISE: {
match_t *m = match(defs, f, str, pat->args.multiple.first, ignorecase);
@ -369,13 +341,8 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
recycle_if_unused(&m1);
return NULL;
}
match_t *m = new_match();
m->start = str;
m->end = m2->end;
m->pat = pat;
ADD_OWNER(m->child, m1);
ADD_OWNER(m1->nextsibling, m2);
return m;
return new_match(pat, str, m2->end, m1);
}
case BP_EQUAL: case BP_NOT_EQUAL: {
match_t *m1 = match(defs, f, str, pat->args.multiple.first, ignorecase);
@ -397,17 +364,12 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
if (m2 != NULL) recycle_if_unused(&m2);
return NULL;
}
match_t *m = new_match();
m->start = m1->start;
m->end = m1->end;
m->pat = pat;
ADD_OWNER(m->child, m1);
if (pat->type == BP_EQUAL) {
ADD_OWNER(m1->nextsibling, m2);
} else {
recycle_if_unused(&m2);
}
return m;
return new_match(pat, m1->start, m1->end, m1);
}
case BP_REPLACE: {
match_t *p = NULL;
@ -415,21 +377,12 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
p = match(defs, f, str, pat->args.replace.pat, ignorecase);
if (p == NULL) return NULL;
}
match_t *m = new_match();
m->start = str;
m->pat = pat;
if (p) {
ADD_OWNER(m->child, p);
m->end = p->end;
} else {
m->end = m->start;
}
return m;
return new_match(pat, str, p ? p->end : str, p);
}
case BP_REF: {
def_t *def = lookup(defs, pat->args.name.len, pat->args.name.name);
def_t *def = lookup(defs, pat->args.ref.len, pat->args.ref.name);
if (def == NULL)
errx(EXIT_FAILURE, "Unknown identifier: '%.*s'", (int)pat->args.name.len, pat->args.name.name);
errx(EXIT_FAILURE, "Unknown identifier: '%.*s'", (int)pat->args.ref.len, pat->args.ref.name);
pat_t *ref = def->pat;
pat_t rec_op = {
@ -483,21 +436,11 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
// does not affect correctness. It also helps with visualization of
// match results.
// OPTIMIZE: remove this if necessary
match_t *m2 = new_match();
m2->pat = pat;
m2->start = m->start;
m2->end = m->end;
ADD_OWNER(m2->child, m);
return m2;
return new_match(pat, m->start, m->end, m);
}
case BP_BACKREF: {
const char *end = match_backref(str, pat->args.backref, ignorecase);
if (end == NULL) return NULL;
match_t *m = new_match();
m->pat = pat;
m->start = str;
m->end = end;
return m;
return end ? new_match(pat, str, end, NULL) : NULL;
}
case BP_NODENT: {
if (*str != '\n') return NULL;
@ -520,11 +463,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
if (str[i] != denter || &str[i] >= f->end) return NULL;
}
match_t *m = new_match();
m->start = start;
m->end = &str[dents];
m->pat = pat;
return m;
return new_match(pat, start, &str[dents], NULL);
}
default: {
errx(EXIT_FAILURE, "Unknown pattern type: %d", pat->type);
@ -587,7 +526,7 @@ match_t *get_capture(match_t *m, const char **id)
//
// Return a match object which can be used (may be allocated or recycled).
//
static match_t *new_match(void)
static match_t *new_match(pat_t *pat, const char *start, const char *end, match_t *child)
{
match_t *m;
@ -611,6 +550,10 @@ static match_t *new_match(void)
}
#endif
m->pat = pat;
m->start = start;
m->end = end;
if (child) ADD_OWNER(m->child, child);
return m;
}

View File

@ -305,12 +305,12 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
if (c == '{') { // Surround with `|` word boundaries
pat_t *left = new_pat(f, start, start+1, -1, BP_REF);
left->args.name.name = "|";
left->args.name.len = 1;
left->args.ref.name = "|";
left->args.ref.len = 1;
pat_t *right = new_pat(f, str, str+1, -1, BP_REF);
right->args.name.name = "|";
right->args.name.len = 1;
right->args.ref.name = "|";
right->args.ref.len = 1;
pat = chain_together(f, left, chain_together(f, pat, right));
}
@ -442,15 +442,27 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
capture->args.capture.namelen = namelen;
return capture;
}
// Start of file/line:
case '^': {
if (matchchar(&str, '^'))
return new_pat(f, start, str, 0, BP_START_OF_FILE);
return new_pat(f, start, str, 0, BP_START_OF_LINE);
}
// End of file/line:
case '$': {
if (matchchar(&str, '$'))
return new_pat(f, start, str, 0, BP_END_OF_FILE);
return new_pat(f, start, str, 0, BP_END_OF_LINE);
}
// Special rules:
case '_': case '^': case '$': case '|': {
case '_': case '|': {
size_t namelen = 1;
if (matchchar(&str, c)) // double __, ^^, $$
if (c == '_' && matchchar(&str, c)) // double __, ^^, $$
++namelen;
if (matchchar(&str, ':')) return NULL; // Don't match definitions
pat_t *ref = new_pat(f, start, str, -1, BP_REF);
ref->args.name.name = start;
ref->args.name.len = namelen;
ref->args.ref.name = start;
ref->args.ref.len = namelen;
return ref;
}
default: {
@ -462,8 +474,8 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
if (matchchar(&str, ':')) // Don't match definitions
return NULL;
pat_t *ref = new_pat(f, start, str, -1, BP_REF);
ref->args.name.name = refname;
ref->args.name.len = (size_t)(str - refname);
ref->args.ref.name = refname;
ref->args.ref.len = (size_t)(str - refname);
return ref;
}
}

View File

@ -28,6 +28,10 @@ enum pattype_e {
BP_REF,
BP_BACKREF,
BP_NODENT,
BP_START_OF_FILE,
BP_START_OF_LINE,
BP_END_OF_FILE,
BP_END_OF_LINE,
BP_LEFTRECURSION,
};
@ -46,7 +50,7 @@ typedef struct pat_s {
struct {
const char *name;
size_t len;
} name;
} ref;
struct {
unsigned char low, high;
} range;