Mostly working version
This commit is contained in:
parent
a0028e9605
commit
655ed12128
2
Makefile
2
Makefile
@ -14,7 +14,7 @@ OBJFILES=$(CFILES:.c=.o)
|
||||
|
||||
all: $(NAME) bp.1
|
||||
|
||||
%.o: %.c %.h types.h
|
||||
%.o: %.c %.h types.h utf8.h
|
||||
$(CC) -c $(ALL_FLAGS) -o $@ $<
|
||||
|
||||
$(NAME): $(OBJFILES) bp.c
|
||||
|
@ -71,7 +71,7 @@ def_t *lookup(def_t *defs, size_t namelen, const char *name)
|
||||
def_t *with_backref(def_t *defs, file_t *f, size_t namelen, const char *name, match_t *m)
|
||||
{
|
||||
// TODO: maybe calculate length? (nontrivial because of replacements)
|
||||
pat_t *backref = new_pat(f, m->start, m->end, -1, BP_BACKREF);
|
||||
pat_t *backref = new_pat(f, m->start, m->end, 0, -1, BP_BACKREF);
|
||||
backref->args.backref = m;
|
||||
return with_def(defs, namelen, name, backref);
|
||||
}
|
||||
|
@ -24,8 +24,7 @@ id: left-id-edge !`0-9 !(keyword left-id-edge) +id-char
|
||||
id-char: `a-z,A-Z,_,0-9
|
||||
var: id
|
||||
keyword: !"" # No keywords defined by default
|
||||
left-word-edge: ^ / <(\x00-x7f!~(^^word-char)) / <((\xc0-xdf \x80-xbf)!~(^^word-char))
|
||||
/ <((\xe0-xef 2\x80-xbf)!~(^^word-char)) / <((\xf0-xf7 3\x80-xbf)!~(^^word-char))
|
||||
left-word-edge: !<word-char
|
||||
right-word-edge: !word-char
|
||||
word-char: `a-z,A-Z,_,0-9,-,'
|
||||
word: left-word-edge +word-char
|
||||
|
56
match.c
56
match.c
@ -13,6 +13,7 @@
|
||||
#include "match.h"
|
||||
#include "types.h"
|
||||
#include "utils.h"
|
||||
#include "utf8.h"
|
||||
|
||||
#ifdef DEBUG_HEAP
|
||||
// Doubly-linked list operations:
|
||||
@ -38,8 +39,6 @@ static match_t *in_use_matches = NULL;
|
||||
static inline pat_t *deref(def_t *defs, pat_t *pat);
|
||||
__attribute__((returns_nonnull))
|
||||
static match_t *new_match(pat_t *pat, const char *start, const char *end, match_t *child);
|
||||
__attribute__((nonnull, pure))
|
||||
static inline const char *next_char(file_t *f, const char *str);
|
||||
__attribute__((nonnull))
|
||||
static const char *match_backref(const char *str, match_t *cap, bool ignorecase);
|
||||
__attribute__((nonnull))
|
||||
@ -61,25 +60,6 @@ static inline pat_t *deref(def_t *defs, pat_t *pat)
|
||||
}
|
||||
return pat;
|
||||
}
|
||||
//
|
||||
// Return the location of the next character or UTF8 codepoint.
|
||||
// (i.e. skip forward one codepoint at a time, not one byte at a time)
|
||||
//
|
||||
static inline const char *next_char(file_t *f, const char *str)
|
||||
{
|
||||
char c = *str;
|
||||
++str;
|
||||
if (__builtin_expect(!(c & 0x80), 1))
|
||||
return str;
|
||||
|
||||
if (__builtin_expect(str < f->end && !!(*str & 0x80), 1))
|
||||
++str;
|
||||
if (c > '\xDF' && __builtin_expect(str < f->end && !!(*str & 0x80), 1))
|
||||
++str;
|
||||
if (c > '\xEF' && __builtin_expect(str < f->end && !!(*str & 0x80), 1))
|
||||
++str;
|
||||
return str;
|
||||
}
|
||||
|
||||
//
|
||||
// Attempt to match text against a previously captured value.
|
||||
@ -200,10 +180,10 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
|
||||
return (str == f->end || *str == '\n') ? new_match(pat, str, str, NULL) : NULL;
|
||||
}
|
||||
case BP_STRING: {
|
||||
if (&str[pat->len] > f->end) return NULL;
|
||||
if (pat->len > 0 && (ignorecase ? memicmp : memcmp)(str, pat->args.string, (size_t)pat->len) != 0)
|
||||
if (&str[pat->min_matchlen] > f->end) return NULL;
|
||||
if (pat->min_matchlen > 0 && (ignorecase ? memicmp : memcmp)(str, pat->args.string, (size_t)pat->min_matchlen) != 0)
|
||||
return NULL;
|
||||
return new_match(pat, str, str + pat->len, NULL);
|
||||
return new_match(pat, str, str + pat->min_matchlen, NULL);
|
||||
}
|
||||
case BP_RANGE: {
|
||||
if (str >= f->end) return NULL;
|
||||
@ -315,28 +295,27 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
|
||||
return m;
|
||||
}
|
||||
case BP_AFTER: {
|
||||
ssize_t backtrack = pat->args.pat->len;
|
||||
if (backtrack == -1)
|
||||
errx(EXIT_FAILURE, "'<' is only allowed for fixed-length operations");
|
||||
if (str - backtrack < f->contents) return NULL;
|
||||
match_t *before = match(defs, f, str - backtrack, pat->args.pat, ignorecase);
|
||||
if (before == NULL) return NULL;
|
||||
return new_match(pat, str, str, before);
|
||||
pat_t *back = deref(defs, pat->args.pat);
|
||||
for (const char *pos = &str[-back->min_matchlen];
|
||||
pos >= f->contents && (back->max_matchlen == -1 || pos >= &str[-back->max_matchlen]);
|
||||
pos = prev_char(f, pos)) {
|
||||
match_t *m = match(defs, f, pos, back, ignorecase);
|
||||
if (m) return new_match(pat, str, str, m);
|
||||
if (pos == f->contents) break;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
case BP_BEFORE: {
|
||||
match_t *after = match(defs, f, str, pat->args.pat, ignorecase);
|
||||
if (after == NULL) return NULL;
|
||||
return new_match(pat, str, str, after);
|
||||
return after ? new_match(pat, str, str, after) : NULL;
|
||||
}
|
||||
case BP_CAPTURE: {
|
||||
match_t *p = match(defs, f, str, pat->args.pat, ignorecase);
|
||||
if (p == NULL) return NULL;
|
||||
return new_match(pat, str, p->end, p);
|
||||
return p ? new_match(pat, str, p->end, p) : NULL;
|
||||
}
|
||||
case BP_OTHERWISE: {
|
||||
match_t *m = match(defs, f, str, pat->args.multiple.first, ignorecase);
|
||||
if (m == NULL) m = match(defs, f, str, pat->args.multiple.second, ignorecase);
|
||||
return m;
|
||||
return m ? match(defs, f, str, pat->args.multiple.second, ignorecase) : NULL;
|
||||
}
|
||||
case BP_CHAIN: {
|
||||
match_t *m1 = match(defs, f, str, pat->args.multiple.first, ignorecase);
|
||||
@ -400,7 +379,8 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
|
||||
.type = BP_LEFTRECURSION,
|
||||
.start = ref->start,
|
||||
.end = ref->end,
|
||||
.len = 0,
|
||||
.min_matchlen = 0,
|
||||
.max_matchlen = -1,
|
||||
.args.leftrec = {
|
||||
.match = NULL,
|
||||
.visits = 0,
|
||||
|
130
pattern.c
130
pattern.c
@ -13,6 +13,7 @@
|
||||
#include "files.h"
|
||||
#include "pattern.h"
|
||||
#include "utils.h"
|
||||
#include "utf8.h"
|
||||
|
||||
__attribute__((nonnull(1,2)))
|
||||
static pat_t *expand_replacements(file_t *f, const char *str, pat_t *replace_pat);
|
||||
@ -31,7 +32,7 @@ static pat_t *bp_simplepattern(file_t *f, const char *str);
|
||||
// Allocate a new pattern for this file (ensuring it will be automatically
|
||||
// freed when the file is freed)
|
||||
//
|
||||
pat_t *new_pat(file_t *f, const char *start, const char *end, ssize_t len, enum pattype_e type)
|
||||
pat_t *new_pat(file_t *f, const char *start, const char *end, size_t minlen, ssize_t maxlen, enum pattype_e type)
|
||||
{
|
||||
allocated_pat_t *tracker = new(allocated_pat_t);
|
||||
tracker->next = f->pats;
|
||||
@ -39,7 +40,8 @@ pat_t *new_pat(file_t *f, const char *start, const char *end, ssize_t len, enum
|
||||
tracker->pat.type = type;
|
||||
tracker->pat.start = start;
|
||||
tracker->pat.end = end;
|
||||
tracker->pat.len = len;
|
||||
tracker->pat.min_matchlen = minlen;
|
||||
tracker->pat.max_matchlen = maxlen;
|
||||
return &tracker->pat;
|
||||
}
|
||||
|
||||
@ -48,13 +50,10 @@ pat_t *new_pat(file_t *f, const char *start, const char *end, ssize_t len, enum
|
||||
//
|
||||
static pat_t *new_range(file_t *f, const char *start, const char *end, size_t min, ssize_t max, pat_t *repeating, pat_t *sep)
|
||||
{
|
||||
pat_t *range = new_pat(f, start, end, -1, BP_REPEAT);
|
||||
if ((ssize_t)min == max && repeating->len >= 0) {
|
||||
if (sep == NULL || max == 0)
|
||||
range->len = repeating->len * max;
|
||||
else if (sep->len >= 0)
|
||||
range->len = repeating->len * max + sep->len * (max - 1);
|
||||
}
|
||||
size_t minlen = min*repeating->min_matchlen + (min > 0 ? min-1 : 0)*(sep ? sep->min_matchlen : 0);
|
||||
ssize_t maxlen = (max == -1 || UNBOUNDED(repeating) || (max != 0 && max != 1 && sep && UNBOUNDED(sep))) ? -1
|
||||
: max*repeating->max_matchlen + (ssize_t)(max > 0 ? min-1 : 0)*(ssize_t)(sep ? sep->min_matchlen : 0);
|
||||
pat_t *range = new_pat(f, start, end, minlen, maxlen, BP_REPEAT);
|
||||
range->args.repetitions.min = min;
|
||||
range->args.repetitions.max = max;
|
||||
range->args.repetitions.repeat_pat = repeating;
|
||||
@ -93,13 +92,14 @@ static pat_t *expand_replacements(file_t *f, const char *str, pat_t *replace_pat
|
||||
if (!str[1] || str[1] == '\n')
|
||||
file_err(f, str, str+1,
|
||||
"There should be an escape sequence after this backslash.");
|
||||
++str;
|
||||
str = next_char(f, str);
|
||||
}
|
||||
}
|
||||
(void)matchchar(&str, quote);
|
||||
|
||||
if (replace_pat == NULL) replace_pat = new_pat(f, start, start, 0, BP_STRING);
|
||||
pat_t *pat = new_pat(f, replace_pat->start, str, replace_pat->len, BP_REPLACE);
|
||||
if (replace_pat == NULL) replace_pat = new_pat(f, start, start, 0, 0, BP_STRING);
|
||||
pat_t *pat = new_pat(f, replace_pat->start, str, replace_pat->min_matchlen,
|
||||
replace_pat->max_matchlen, BP_REPLACE);
|
||||
pat->args.replace.pat = replace_pat;
|
||||
pat->args.replace.text = repstr;
|
||||
pat->args.replace.len = (size_t)(str-repstr-1);
|
||||
@ -134,8 +134,9 @@ pat_t *chain_together(file_t *f, pat_t *first, pat_t *second)
|
||||
{
|
||||
if (first == NULL) return second;
|
||||
if (second == NULL) return first;
|
||||
ssize_t len = (first->len >= 0 && second->len >= 0) ? first->len + second->len : -1;
|
||||
pat_t *chain = new_pat(f, first->start, second->end, len, BP_CHAIN);
|
||||
size_t minlen = first->min_matchlen + second->min_matchlen;
|
||||
ssize_t maxlen = (UNBOUNDED(first) || UNBOUNDED(second)) ? -1 : first->max_matchlen + second->max_matchlen;
|
||||
pat_t *chain = new_pat(f, first->start, second->end, minlen, maxlen, BP_CHAIN);
|
||||
chain->args.multiple.first = first;
|
||||
chain->args.multiple.second = second;
|
||||
|
||||
@ -144,6 +145,7 @@ pat_t *chain_together(file_t *f, pat_t *first, pat_t *second)
|
||||
for (pat_t *p = first; p; ) {
|
||||
if (p->type == BP_UPTO) {
|
||||
p->args.multiple.first = second;
|
||||
p->min_matchlen = second->min_matchlen;
|
||||
break;
|
||||
} else if (p->type == BP_CAPTURE) {
|
||||
p = p->args.capture.capture_pat;
|
||||
@ -164,8 +166,10 @@ pat_t *either_pat(file_t *f, pat_t *first, pat_t *second)
|
||||
{
|
||||
if (first == NULL) return second;
|
||||
if (second == NULL) return first;
|
||||
ssize_t len = first->len == second->len ? first->len : -1;
|
||||
pat_t *either = new_pat(f, first->start, second->end, len, BP_OTHERWISE);
|
||||
size_t minlen = first->min_matchlen < second->min_matchlen ? first->min_matchlen : second->min_matchlen;
|
||||
ssize_t maxlen = (UNBOUNDED(first) || UNBOUNDED(second)) ? -1 :
|
||||
(first->max_matchlen > second->max_matchlen ? first->max_matchlen : second->max_matchlen);
|
||||
pat_t *either = new_pat(f, first->start, second->end, minlen, maxlen, BP_OTHERWISE);
|
||||
either->args.multiple.first = first;
|
||||
either->args.multiple.second = second;
|
||||
return either;
|
||||
@ -197,7 +201,7 @@ static pat_t *bp_simplepattern(file_t *f, const char *str)
|
||||
if (!second)
|
||||
file_err(f, str, str, "The '%s' operator expects a pattern before and after.", type == BP_MATCH ? "~" : "!~");
|
||||
|
||||
pat = new_pat(f, str, second->end, first->len, type);
|
||||
pat = new_pat(f, str, second->end, first->min_matchlen, first->max_matchlen, type);
|
||||
pat->args.multiple.first = first;
|
||||
pat->args.multiple.second = second;
|
||||
str = pat->end;
|
||||
@ -216,54 +220,56 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
|
||||
if (!*str) return NULL;
|
||||
const char *start = str;
|
||||
char c = *str;
|
||||
++str;
|
||||
str = next_char(f, str);
|
||||
switch (c) {
|
||||
// Any char (dot)
|
||||
case '.': {
|
||||
if (*str == '.') { // ".."
|
||||
pat_t *skip = NULL;
|
||||
++str;
|
||||
str = next_char(f, str);
|
||||
if (matchchar(&str, '%')) {
|
||||
skip = bp_simplepattern(f, str);
|
||||
if (!skip)
|
||||
file_err(f, str, str, "There should be a pattern to skip here after the '%%'");
|
||||
str = skip->end;
|
||||
}
|
||||
pat_t *upto = new_pat(f, start, str, -1, BP_UPTO);
|
||||
pat_t *upto = new_pat(f, start, str, 0, -1, BP_UPTO);
|
||||
upto->args.multiple.second = skip;
|
||||
return upto;
|
||||
} else {
|
||||
return new_pat(f, start, str, 1, BP_ANYCHAR);
|
||||
return new_pat(f, start, str, 1, UTF8_MAXCHARLEN, BP_ANYCHAR);
|
||||
}
|
||||
}
|
||||
// Char literals
|
||||
case '`': {
|
||||
pat_t *all = NULL;
|
||||
do {
|
||||
const char *charloc = str;
|
||||
c = *str;
|
||||
if (!c || c == '\n')
|
||||
if (str >= f->end || !*str || *str == '\n')
|
||||
file_err(f, str, str, "There should be a character here after the '`'");
|
||||
const char *opstart = str-1;
|
||||
|
||||
++str;
|
||||
const char *c1_loc = str;
|
||||
str = next_char(f, c1_loc);
|
||||
if (matchchar(&str, '-')) { // Range
|
||||
if (&str[-1] - c1_loc > 1 || next_char(f, str) > str+1)
|
||||
file_err(f, start, next_char(f, str), "Sorry, UTF-8 character ranges are not yet supported.");
|
||||
char c1 = *c1_loc;
|
||||
char c2 = *str;
|
||||
if (!c2 || c2 == '\n')
|
||||
file_err(f, str, str, "There should be a character here to complete the character range.");
|
||||
if (c > c2) { // Swap order
|
||||
char tmp = c;
|
||||
c = c2;
|
||||
if (c1 > c2) { // Swap order
|
||||
char tmp = c1;
|
||||
c1 = c2;
|
||||
c2 = tmp;
|
||||
}
|
||||
++str;
|
||||
pat_t *pat = new_pat(f, opstart, str, 1, BP_RANGE);
|
||||
pat->args.range.low = (unsigned char)c;
|
||||
str = next_char(f, str);
|
||||
pat_t *pat = new_pat(f, start, str, 1, 1, BP_RANGE);
|
||||
pat->args.range.low = (unsigned char)c1;
|
||||
pat->args.range.high = (unsigned char)c2;
|
||||
all = either_pat(f, all, pat);
|
||||
} else {
|
||||
pat_t *pat = new_pat(f, opstart, str, 1, BP_STRING);
|
||||
pat->args.string = charloc;
|
||||
size_t len = (size_t)(str - start - 1);
|
||||
pat_t *pat = new_pat(f, start, str, len, (ssize_t)len, BP_STRING);
|
||||
pat->args.string = c1_loc;
|
||||
all = either_pat(f, all, pat);
|
||||
}
|
||||
} while (matchchar(&str, ','));
|
||||
@ -276,24 +282,26 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
|
||||
file_err(f, str, str, "There should be an escape sequence here after this backslash.");
|
||||
|
||||
if (matchchar(&str, 'N')) // \N (nodent)
|
||||
return new_pat(f, start, str, -1, BP_NODENT);
|
||||
return new_pat(f, start, str, 1, -1, BP_NODENT);
|
||||
|
||||
const char *opstart = str;
|
||||
unsigned char e = (unsigned char)unescapechar(str, &str);
|
||||
if (*str == '-') { // Escape range (e.g. \x00-\xFF)
|
||||
++str;
|
||||
if (next_char(f, str) != str+1)
|
||||
file_err(f, start, next_char(f, str), "Sorry, UTF8 escape sequences are not supported.");
|
||||
str = next_char(f, str);
|
||||
const char *seqstart = str;
|
||||
unsigned char e2 = (unsigned char)unescapechar(str, &str);
|
||||
if (str == seqstart)
|
||||
file_err(f, seqstart, str+1, "This value isn't a valid escape sequence");
|
||||
if (e2 < e)
|
||||
file_err(f, start, str, "Escape ranges should be low-to-high, but this is high-to-low.");
|
||||
pat_t *esc = new_pat(f, opstart, str, 1, BP_RANGE);
|
||||
pat_t *esc = new_pat(f, opstart, str, 1, 1, BP_RANGE);
|
||||
esc->args.range.low = e;
|
||||
esc->args.range.high = e2;
|
||||
return esc;
|
||||
} else {
|
||||
pat_t *esc = new_pat(f, opstart, str, 1, BP_STRING);
|
||||
pat_t *esc = new_pat(f, opstart, str, 1, 1, BP_STRING);
|
||||
char *s = xcalloc(sizeof(char), 2);
|
||||
s[0] = (char)e;
|
||||
esc->args.string = s;
|
||||
@ -305,19 +313,19 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
|
||||
char endquote = c == '{' ? '}' : (c == '\002' ? '\003' : c);
|
||||
char *litstart = (char*)str;
|
||||
while (str < f->end && *str != endquote)
|
||||
++str;
|
||||
ssize_t len = (ssize_t)(str - litstart);
|
||||
++str;
|
||||
str = next_char(f, str);
|
||||
size_t len = (size_t)(str - litstart);
|
||||
str = next_char(f, str);
|
||||
|
||||
pat_t *pat = new_pat(f, start, str, len, BP_STRING);
|
||||
pat_t *pat = new_pat(f, start, str, len, (ssize_t)len, BP_STRING);
|
||||
pat->args.string = litstart;
|
||||
|
||||
if (c == '{') { // Surround with `|` word boundaries
|
||||
pat_t *left = new_pat(f, start, start+1, -1, BP_REF);
|
||||
pat_t *left = new_pat(f, start, start+1, 0, -1, BP_REF);
|
||||
left->args.ref.name = "left-word-edge";
|
||||
left->args.ref.len = strlen(left->args.ref.name);
|
||||
|
||||
pat_t *right = new_pat(f, str-1, str, -1, BP_REF);
|
||||
pat_t *right = new_pat(f, str-1, str, 0, -1, BP_REF);
|
||||
right->args.ref.name = "right-word-edge";
|
||||
right->args.ref.len = strlen(right->args.ref.name);
|
||||
|
||||
@ -329,7 +337,7 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
|
||||
case '!': {
|
||||
pat_t *p = bp_simplepattern(f, str);
|
||||
if (!p) file_err(f, str, str, "There should be a pattern after this '!'");
|
||||
pat_t *not = new_pat(f, start, p->end, 0, BP_NOT);
|
||||
pat_t *not = new_pat(f, start, p->end, 0, 0, BP_NOT);
|
||||
not->args.pat = p;
|
||||
return not;
|
||||
}
|
||||
@ -372,12 +380,8 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
|
||||
if (!behind)
|
||||
file_err(f, str, str, "There should be a pattern after this '<'");
|
||||
str = behind->end;
|
||||
if (behind->len == -1)
|
||||
file_err(f, start, behind->end,
|
||||
"Sorry, variable-length lookbehind patterns like this are not supported.\n"
|
||||
"Please use a fixed-length lookbehind pattern instead.");
|
||||
str = behind->end;
|
||||
pat_t *pat = new_pat(f, start, str, 0, BP_AFTER);
|
||||
pat_t *pat = new_pat(f, start, str, 0, 0, BP_AFTER);
|
||||
pat->args.pat = behind;
|
||||
return pat;
|
||||
}
|
||||
@ -387,7 +391,7 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
|
||||
if (!ahead)
|
||||
file_err(f, str, str, "There should be a pattern after this '>'");
|
||||
str = ahead->end;
|
||||
pat_t *pat = new_pat(f, start, str, 0, BP_BEFORE);
|
||||
pat_t *pat = new_pat(f, start, str, 0, 0, BP_BEFORE);
|
||||
pat->args.pat = ahead;
|
||||
return pat;
|
||||
}
|
||||
@ -395,10 +399,10 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
|
||||
case '(': {
|
||||
if (matchstr(&str, "!)")) {
|
||||
pat_t *pat = bp_simplepattern(f, str);
|
||||
if (!pat) pat = new_pat(f, str, str, 0, BP_STRING);
|
||||
if (!pat) pat = new_pat(f, str, str, 0, 0, BP_STRING);
|
||||
pat = expand_replacements(f, pat->end, pat);
|
||||
|
||||
pat_t *error = new_pat(f, start, pat->end, pat->len, BP_ERROR);
|
||||
pat_t *error = new_pat(f, start, pat->end, pat->min_matchlen, pat->max_matchlen, BP_ERROR);
|
||||
error->args.pat = pat;
|
||||
return error;
|
||||
}
|
||||
@ -453,7 +457,7 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
|
||||
if (!pat)
|
||||
file_err(f, str, str, "There should be a valid pattern here to capture after the '@'");
|
||||
|
||||
pat_t *capture = new_pat(f, start, pat->end, pat->len, BP_CAPTURE);
|
||||
pat_t *capture = new_pat(f, start, pat->end, pat->min_matchlen, pat->max_matchlen, BP_CAPTURE);
|
||||
capture->args.capture.capture_pat = pat;
|
||||
capture->args.capture.name = name;
|
||||
capture->args.capture.namelen = namelen;
|
||||
@ -466,14 +470,14 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
|
||||
// Start of file/line:
|
||||
case '^': {
|
||||
if (matchchar(&str, '^'))
|
||||
return new_pat(f, start, str, 0, BP_START_OF_FILE);
|
||||
return new_pat(f, start, str, 0, BP_START_OF_LINE);
|
||||
return new_pat(f, start, str, 0, 0, BP_START_OF_FILE);
|
||||
return new_pat(f, start, str, 0, 0, BP_START_OF_LINE);
|
||||
}
|
||||
// End of file/line:
|
||||
case '$': {
|
||||
if (matchchar(&str, '$'))
|
||||
return new_pat(f, start, str, 0, BP_END_OF_FILE);
|
||||
return new_pat(f, start, str, 0, BP_END_OF_LINE);
|
||||
return new_pat(f, start, str, 0, 0, BP_END_OF_FILE);
|
||||
return new_pat(f, start, str, 0, 0, BP_END_OF_LINE);
|
||||
}
|
||||
// Whitespace:
|
||||
case '_': {
|
||||
@ -481,7 +485,7 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
|
||||
if (matchchar(&str, '_')) // double __ (whitespace with newlines)
|
||||
++namelen;
|
||||
if (matchchar(&str, ':')) return NULL; // Don't match definitions
|
||||
pat_t *ref = new_pat(f, start, str, -1, BP_REF);
|
||||
pat_t *ref = new_pat(f, start, str, 0, -1, BP_REF);
|
||||
ref->args.ref.name = start;
|
||||
ref->args.ref.len = namelen;
|
||||
return ref;
|
||||
@ -494,7 +498,7 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
|
||||
str = after_name(str);
|
||||
if (matchchar(&str, ':')) // Don't match definitions
|
||||
return NULL;
|
||||
pat_t *ref = new_pat(f, start, str, -1, BP_REF);
|
||||
pat_t *ref = new_pat(f, start, str, 0, -1, BP_REF);
|
||||
ref->args.ref.name = refname;
|
||||
ref->args.ref.len = (size_t)(str - refname);
|
||||
return ref;
|
||||
@ -519,9 +523,9 @@ pat_t *bp_stringpattern(file_t *f, const char *str)
|
||||
}
|
||||
}
|
||||
// End of string
|
||||
ssize_t len = (ssize_t)(str - start);
|
||||
size_t len = (size_t)(str - start);
|
||||
if (len > 0) {
|
||||
pat_t *str_chunk = new_pat(f, start, str, len, BP_STRING);
|
||||
pat_t *str_chunk = new_pat(f, start, str, len, (ssize_t)len, BP_STRING);
|
||||
str_chunk->args.string = start;
|
||||
ret = chain_together(f, ret, str_chunk);
|
||||
}
|
||||
@ -541,7 +545,7 @@ pat_t *bp_stringpattern(file_t *f, const char *str)
|
||||
//
|
||||
pat_t *bp_replacement(file_t *f, pat_t *replacepat, const char *replacement)
|
||||
{
|
||||
pat_t *pat = new_pat(f, replacepat->start, replacepat->end, replacepat->len, BP_REPLACE);
|
||||
pat_t *pat = new_pat(f, replacepat->start, replacepat->end, replacepat->min_matchlen, replacepat->max_matchlen, BP_REPLACE);
|
||||
pat->args.replace.pat = replacepat;
|
||||
const char *p = replacement;
|
||||
for (; *p; p++) {
|
||||
|
@ -8,7 +8,7 @@
|
||||
#include "types.h"
|
||||
|
||||
__attribute__((returns_nonnull, nonnull(1,2)))
|
||||
pat_t *new_pat(file_t *f, const char *start, const char *end, ssize_t len, enum pattype_e type);
|
||||
pat_t *new_pat(file_t *f, const char *start, const char *end, size_t minlen, ssize_t maxlen, enum pattype_e type);
|
||||
__attribute__((nonnull(1,2)))
|
||||
pat_t *bp_stringpattern(file_t *f, const char *str);
|
||||
__attribute__((nonnull(1,2)))
|
||||
|
7
types.h
7
types.h
@ -9,6 +9,8 @@
|
||||
|
||||
#include "files.h"
|
||||
|
||||
#define UNBOUNDED(pat) ((pat)->max_matchlen == -1)
|
||||
|
||||
// BP virtual machine pattern types
|
||||
enum pattype_e {
|
||||
BP_ANYCHAR = 1,
|
||||
@ -44,8 +46,9 @@ struct match_s; // forward declared to resolve circular struct defs
|
||||
typedef struct pat_s {
|
||||
enum pattype_e type;
|
||||
const char *start, *end;
|
||||
// Length of the match, if constant, otherwise -1
|
||||
ssize_t len;
|
||||
// The bounds of the match length (used for backtracking)
|
||||
size_t min_matchlen;
|
||||
ssize_t max_matchlen; // -1 means unbounded length
|
||||
union {
|
||||
const char *string;
|
||||
struct {
|
||||
|
42
utf8.h
Normal file
42
utf8.h
Normal file
@ -0,0 +1,42 @@
|
||||
// UTF8 helper functions
|
||||
#ifndef UTF8__H
|
||||
#define UTF8__H
|
||||
|
||||
#define UTF8_MAXCHARLEN 4
|
||||
//
|
||||
// Return the location of the next character or UTF8 codepoint.
|
||||
// (i.e. skip forward one codepoint at a time, not one byte at a time)
|
||||
//
|
||||
__attribute__((nonnull, pure))
|
||||
static inline const char *next_char(file_t *f, const char *str)
|
||||
{
|
||||
if (__builtin_expect(str+1 <= f->end && (str[0] & 0x80) == 0x0, 1))
|
||||
return str+1;
|
||||
if (__builtin_expect(str+2 <= f->end && (str[0] & 0xe0) == 0xc0, 1))
|
||||
return str+2;
|
||||
if (__builtin_expect(str+3 <= f->end && (str[0] & 0xf0) == 0xe0, 1))
|
||||
return str+3;
|
||||
if (__builtin_expect(str+4 <= f->end && (str[0] & 0xf8) == 0xf0, 1))
|
||||
return str+4;
|
||||
return __builtin_expect(str+1 <= f->end, 1) ? str+1 : f->end;
|
||||
}
|
||||
|
||||
//
|
||||
// Return the location of the previous character or UTF8 codepoint.
|
||||
// (i.e. skip backwards one codepoint at a time, not one byte at a time)
|
||||
//
|
||||
__attribute__((nonnull, pure))
|
||||
static inline const char *prev_char(file_t *f, const char *str)
|
||||
{
|
||||
if (__builtin_expect(str-1 >= f->contents && (str[-1] & 0x80) == 0x0, 1))
|
||||
return str-1;
|
||||
if (__builtin_expect(str-2 >= f->contents && (str[-2] & 0xe0) == 0xc0, 1))
|
||||
return str-2;
|
||||
if (__builtin_expect(str-3 >= f->contents && (str[-3] & 0xf0) == 0xe0, 1))
|
||||
return str-3;
|
||||
if (__builtin_expect(str-4 >= f->contents && (str[-4] & 0xf8) == 0xf0, 1))
|
||||
return str-4;
|
||||
return __builtin_expect(str-1 >= f->contents, 1) ? str-1 : f->contents;
|
||||
}
|
||||
#endif
|
||||
// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1
|
Loading…
Reference in New Issue
Block a user