diff options
| -rw-r--r-- | Lua/Makefile | 2 | ||||
| -rw-r--r-- | Lua/lbp.c | 60 | ||||
| -rw-r--r-- | bp.c | 77 | ||||
| -rw-r--r-- | definitions.c | 14 | ||||
| -rw-r--r-- | definitions.h | 3 | ||||
| -rw-r--r-- | files.c | 4 | ||||
| -rw-r--r-- | match.c | 319 | ||||
| -rw-r--r-- | match.h | 7 |
8 files changed, 290 insertions, 196 deletions
diff --git a/Lua/Makefile b/Lua/Makefile index 4569ce6..d67289e 100644 --- a/Lua/Makefile +++ b/Lua/Makefile @@ -39,7 +39,7 @@ clean: lbp.o: lbp.c $(CC) -c $(ALL_FLAGS) -o $@ $< -bp.so: lbp.o ../print.o ../files.o ../pattern.o ../utils.o ../utf8.o ../match.o ../definitions.o +bp.so: lbp.o ../pattern.o ../utils.o ../utf8.o ../match.o ../definitions.o $(MAKESO) -o $@ $^ test: bp.so @@ -11,8 +11,6 @@ #include "lua.h" #include "lauxlib.h" -#include "../print.h" -#include "../files.h" #include "../pattern.h" #include "../match.h" @@ -33,19 +31,12 @@ static inline void push_parse_error(lua_State *L, maybe_pat_t m) free(buf); } -static void push_matchstring(lua_State *L, file_t *f, match_t *m) +static void push_matchstring(lua_State *L, match_t *m) { char *buf = NULL; size_t size = 0; FILE *out = open_memstream(&buf, &size); - printer_t pr = { - .file = f, - .context_before = NO_CONTEXT, - .context_after = NO_CONTEXT, - .use_color = 0, - .lineformat = "", - }; - print_match(out, &pr, m); + fprint_match(out, m->start, m); fflush(out); lua_pushlstring(L, buf, size); fclose(out); @@ -54,7 +45,7 @@ static void push_matchstring(lua_State *L, file_t *f, match_t *m) static int Ltostring(lua_State *L) { match_t **m = (match_t**)lua_touserdata(L, 1); - push_matchstring(L, NULL, *m); + push_matchstring(L, *m); return 1; } @@ -80,7 +71,7 @@ static int Lindex(lua_State *L) if (type == LUA_TNUMBER) { int n = luaL_checkinteger(L, 2); if (n == 0) { - push_matchstring(L, NULL, *m); + push_matchstring(L, *m); return 1; } else if (n > 0) { ret = get_numbered_capture(*m, n); @@ -133,18 +124,15 @@ static int Lmatch(lua_State *L) if (index > (lua_Integer)strlen(text)+1) return 0; - file_t *pat_file = spoof_file(NULL, "<pattern argument>", pat_text, patlen); - maybe_pat_t maybe_pat = bp_pattern(pat_file->start, pat_file->end); + maybe_pat_t maybe_pat = bp_pattern(pat_text, pat_text + patlen); if (!maybe_pat.success) { push_parse_error(L, maybe_pat); - destroy_file(&pat_file); return 0; } - file_t *text_file = spoof_file(NULL, "<text argument>", text+(index-1), textlen); match_t *m = NULL; int ret = 0; - if (next_match(&m, NULL, text_file, maybe_pat.value.pat, NULL, false)) { + if (next_match(&m, NULL, text, &text[textlen], maybe_pat.value.pat, NULL, false)) { // lua_createtable(L, 0, 1); @@ -158,17 +146,13 @@ static int Lmatch(lua_State *L) // int n = 1; // assign_captures(L, text_file, m, &n); - lua_pushinteger(L, (int)(m->start - text_file->start) + index); + lua_pushinteger(L, (int)(m->start - text) + index); lua_pushinteger(L, (int)(m->end - m->start)); // stop_matching(&m); ret = 3; } - // destroy_file(&pat_file); - // destroy_file(&text_file); - - return ret; } @@ -182,48 +166,34 @@ static int Lreplace(lua_State *L) if (index > (lua_Integer)strlen(text)+1) index = (lua_Integer)strlen(text)+1; - file_t *pat_file = spoof_file(NULL, "<pattern argument>", pat_text, patlen); - maybe_pat_t maybe_pat = bp_pattern(pat_file->start, pat_file->end); + maybe_pat_t maybe_pat = bp_pattern(pat_text, pat_text + patlen); if (!maybe_pat.success) { push_parse_error(L, maybe_pat); - destroy_file(&pat_file); return 0; } - file_t *rep_file = spoof_file(NULL, "<replacement argument>", rep_text, replen); - maybe_pat = bp_replacement(maybe_pat.value.pat, rep_file->start, rep_file->end); + maybe_pat = bp_replacement(maybe_pat.value.pat, rep_text, rep_text + replen); if (!maybe_pat.success) { push_parse_error(L, maybe_pat); - destroy_file(&pat_file); - destroy_file(&rep_file); return 0; } - file_t *text_file = spoof_file(NULL, "<text argument>", text+(index-1), textlen); char *buf = NULL; size_t size = 0; FILE *out = open_memstream(&buf, &size); - printer_t pr = { - .file = text_file, - .context_before = ALL_CONTEXT, - .context_after = ALL_CONTEXT, - .use_color = 0, - .lineformat = "", - }; int replacements = 0; - for (match_t *m = NULL; next_match(&m, NULL, text_file, maybe_pat.value.pat, NULL, false); ) { - print_match(out, &pr, m); + const char *prev = text; + for (match_t *m = NULL; next_match(&m, NULL, text, &text[textlen], maybe_pat.value.pat, NULL, false); ) { + fwrite(prev, sizeof(char), (size_t)(m->start - prev), out); + fprint_match(out, text, m); + prev = m->end; ++replacements; } - print_match(out, &pr, NULL); + fwrite(prev, sizeof(char), (size_t)(&text[textlen] - prev), out); fflush(out); lua_pushlstring(L, buf, size); lua_pushinteger(L, replacements); fclose(out); - // destroy_file(&pat_file); - // destroy_file(&rep_file); - // destroy_file(&text_file); - return 2; } @@ -180,7 +180,7 @@ static int is_text_file(const char *filename) static int print_matches_as_json(def_t *defs, file_t *f, pat_t *pattern) { int nmatches = 0; - for (match_t *m = NULL; next_match(&m, defs, f, pattern, options.skip, options.ignorecase); ) { + for (match_t *m = NULL; next_match(&m, defs, f->start, f->end, pattern, options.skip, options.ignorecase); ) { if (++nmatches > 1) printf(",\n"); printf("{\"filename\":\"%s\",\"match\":", f->filename); @@ -196,7 +196,7 @@ static int print_matches_as_json(def_t *defs, file_t *f, pat_t *pattern) static int explain_matches(def_t *defs, file_t *f, pat_t *pattern) { int nmatches = 0; - for (match_t *m = NULL; next_match(&m, defs, f, pattern, options.skip, options.ignorecase); ) { + for (match_t *m = NULL; next_match(&m, defs, f->start, f->end, pattern, options.skip, options.ignorecase); ) { if (++nmatches == 1) { if (options.print_filenames) fprint_filename(stdout, f->filename); @@ -234,6 +234,36 @@ static void sig_handler(int sig) if (kill(0, sig)) _exit(EXIT_FAILURE); } +void fprint_between(FILE *out, file_t *f, const char *prev, const char *next) +{ + // if (options.context_before == NO_CONTEXT && options.context_after == NO_CONTEXT) + // return; + if (options.context_before == ALL_CONTEXT || options.context_after == ALL_CONTEXT) { + fwrite(prev ? prev : f->start, sizeof(char), (size_t)((next ? next : f->end) - (prev ? prev : f->start)), out); + return; + } + const char *after_prev = prev, *before_next = next; + if (prev && options.context_after >= 0) { + size_t after_prev_line = get_line_number(f, prev) + (size_t)options.context_after + 1; + after_prev = get_line(f, after_prev_line > f->nlines ? f->nlines : after_prev_line); + } + if (next && options.context_before >= 0) { + size_t before_next_line = get_line_number(f, next); + before_next_line = options.context_before >= (int)before_next_line ? 1 : before_next_line - (size_t)options.context_before; + before_next = get_line(f, before_next_line); + } + if (!prev) { + fwrite(before_next, sizeof(char), (size_t)(next - before_next), out); + } else if (!next) { + fwrite(prev, sizeof(char), (size_t)(after_prev - prev), out); + } else if (after_prev >= before_next) { + fwrite(prev, sizeof(char), (size_t)(next - prev), out); + } else { + fwrite(prev, sizeof(char), (size_t)(after_prev - prev), out); + fwrite(before_next, sizeof(char), (size_t)(next - before_next), out); + } +} + // // Print all the matches in a file. // @@ -241,29 +271,18 @@ static int print_matches(FILE *out, def_t *defs, file_t *f, pat_t *pattern) { static int printed_filenames = 0; int matches = 0; - printer_t pr = { - .file = f, - .context_before = options.context_before, - .context_after = options.context_after, - .use_color = options.format == FORMAT_FANCY, - .lineformat = LINE_FORMATS[options.format], - }; - - for (match_t *m = NULL; next_match(&m, defs, f, pattern, options.skip, options.ignorecase); ) { + const char *prev = NULL; + for (match_t *m = NULL; next_match(&m, defs, f->start, f->end, pattern, options.skip, options.ignorecase); ) { if (++matches == 1 && options.print_filenames) { if (printed_filenames++ > 0) printf("\n"); fprint_filename(out, f->filename); } - print_match(out, &pr, m); - } - - if (matches > 0 || (f->filename[0] == '\0' && options.context_before == ALL_CONTEXT)) { - // Print trailing context lines: - print_match(out, &pr, NULL); - // Guarantee trailing newline: - if (!pr.needs_line_number) fprintf(out, "\n"); + fprint_between(out, f, prev, m->start); + fprint_match(out, f->start, m); + prev = m->end; } - + if (matches > 0) + fprint_between(out, f, prev, NULL); return matches; } @@ -285,7 +304,7 @@ static int process_file(def_t *defs, const char *filename, pat_t *pattern) matches += explain_matches(defs, f, pattern); } else if (options.mode == MODE_LISTFILES) { match_t *m = NULL; - if (next_match(&m, defs, f, pattern, options.skip, options.ignorecase)) { + if (next_match(&m, defs, f->start, f->end, pattern, options.skip, options.ignorecase)) { printf("%s\n", f->filename); matches += 1; } @@ -294,7 +313,7 @@ static int process_file(def_t *defs, const char *filename, pat_t *pattern) matches += print_matches_as_json(defs, f, pattern); } else if (options.mode == MODE_INPLACE) { match_t *m = NULL; - bool found = next_match(&m, defs, f, pattern, options.skip, options.ignorecase); + bool found = next_match(&m, defs, f->start, f->end, pattern, options.skip, options.ignorecase); stop_matching(&m); if (!found) return 0; @@ -394,6 +413,20 @@ static int process_git_files(def_t *defs, pat_t *pattern, int argc, char *argv[] } // +// Load the given grammar (semicolon-separated definitions) +// and return the first rule defined. +// +static def_t *load_grammar(def_t *defs, file_t *f) +{ + maybe_pat_t maybe_pat = bp_pattern(f->start, f->end); + if (!maybe_pat.success) + file_err(f, maybe_pat.value.error.start, maybe_pat.value.error.end, maybe_pat.value.error.msg); + for (pat_t *p = maybe_pat.value.pat; p && p->type == BP_DEFINITION; p = p->args.def.pat) + defs = with_def(defs, p->args.def.namelen, p->args.def.name, p->args.def.def); + return defs; +} + +// // Convert a context string to an integer // static int context_from_flag(const char *flag) diff --git a/definitions.c b/definitions.c index 4df5dc4..3a0fa9e 100644 --- a/definitions.c +++ b/definitions.c @@ -28,20 +28,6 @@ def_t *with_def(def_t *defs, size_t namelen, const char *name, pat_t *pat) } // -// Load the given grammar (semicolon-separated definitions) -// and return the first rule defined. -// -def_t *load_grammar(def_t *defs, file_t *f) -{ - maybe_pat_t maybe_pat = bp_pattern(f->start, f->end); - if (!maybe_pat.success) - file_err(f, maybe_pat.value.error.start, maybe_pat.value.error.end, maybe_pat.value.error.msg); - for (pat_t *p = maybe_pat.value.pat; p && p->type == BP_DEFINITION; p = p->args.def.pat) - defs = with_def(defs, p->args.def.namelen, p->args.def.name, p->args.def.def); - return defs; -} - -// // Look up a backreference or grammar definition by name // def_t *lookup(def_t *defs, size_t namelen, const char *name) diff --git a/definitions.h b/definitions.h index a171de5..34e4344 100644 --- a/definitions.h +++ b/definitions.h @@ -4,7 +4,6 @@ #ifndef DEFINITIONS__H #define DEFINITIONS__H -#include "files.h" #include "pattern.h" // @@ -20,8 +19,6 @@ typedef struct def_s { __attribute__((nonnull(3,4), returns_nonnull)) def_t *with_def(def_t *defs, size_t namelen, const char *name, pat_t *pat); -__attribute__((nonnull(2))) -def_t *load_grammar(def_t *defs, file_t *f); __attribute__((pure, nonnull(3))) def_t *lookup(def_t *defs, size_t namelen, const char *name); __attribute__((nonnull(1))) @@ -36,7 +36,7 @@ static void populate_lines(file_t *f) f->lines = grow(f->lines, linecap *= 2); f->lines[n] = p; do { - char *nl = strchr(p, '\n'); + char *nl = memchr(p, '\n', (size_t)(f->end - p)); if (nl) { p = nl+1; break; @@ -234,7 +234,7 @@ void fprint_line(FILE *dest, file_t *f, const char *start, const char *end, cons va_end(args); (void)fputc('\n', dest); - const char *eol = linenum == f->nlines ? strchr(line, '\0') : strchr(line, '\n'); + const char *eol = linenum == f->nlines ? f->end : strchr(line, '\n'); if (end == NULL || end > eol) end = eol; fprintf(dest, "\033[2m%5lu\033(0\x78\033(B\033[m%.*s\033[41;30m%.*s\033[m%.*s\n", linenum, @@ -2,6 +2,7 @@ // match.c - Code for the BP virtual machine that performs the matching. // +#include <ctype.h> #include <err.h> #include <limits.h> #include <stdbool.h> @@ -17,11 +18,20 @@ #define MAX_CACHE_SIZE (1<<14) +// Cache datastructure typedef struct { size_t size, occupancy; match_t **matches; } cache_t; +// Data structure for various ambient state for matching +typedef struct { + def_t *defs; + cache_t *cache; + const char *start, *end; + bool ignorecase; +} match_context_t; + // New match objects are either recycled from unused match objects or allocated // from the heap. While it is in use, the match object is stored in the // `in_use_matches` linked list. Once it is no longer needed, it is moved to @@ -33,8 +43,8 @@ static match_t *in_use_matches = NULL; #define MATCHES(...) (match_t*[]){__VA_ARGS__, NULL} -__attribute__((hot, nonnull(2,3,4,5))) -static match_t *match(def_t *defs, cache_t *cache, file_t *f, const char *str, pat_t *pat, bool ignorecase); +__attribute__((hot, nonnull(1,2,3))) +static match_t *match(match_context_t *ctx, const char *str, pat_t *pat); // Store a value and update its refcount static inline void add_owner(match_t** owner, match_t* owned) @@ -239,22 +249,22 @@ static pat_t *first_pat(def_t *defs, pat_t *pat) // // Find the next match after prev (or the first match if prev is NULL) // -__attribute__((nonnull(3,5))) -static match_t *_next_match(def_t *defs, cache_t *cache, file_t *f, const char *str, pat_t *pat, pat_t *skip, bool ignorecase) +__attribute__((nonnull(1,2,3))) +static match_t *_next_match(match_context_t *ctx, const char *str, pat_t *pat, pat_t *skip) { // Prune the unnecessary entries from the cache (those not between start/end) - if (cache->matches) { - for (size_t i = 0; i < cache->size; i++) { - for (match_t *m = cache->matches[i], *next = NULL; m; m = next) { + if (ctx->cache->matches) { + for (size_t i = 0; i < ctx->cache->size; i++) { + for (match_t *m = ctx->cache->matches[i], *next = NULL; m; m = next) { next = m->cache.next; - if (m->start < f->start || (m->end ? m->end : m->start) > f->end) - cache_remove(cache, m); + if (m->start < ctx->start || (m->end ? m->end : m->start) > ctx->end) + cache_remove(ctx->cache, m); } } } - pat = deref(defs, pat); - pat_t *first = first_pat(defs, pat); + pat = deref(ctx->defs, pat); + pat_t *first = first_pat(ctx->defs, pat); // Performance optimization: if the pattern starts with a string literal, // we can just rely on the highly optimized strstr()/strcasestr() @@ -264,27 +274,27 @@ static match_t *_next_match(def_t *defs, cache_t *cache, file_t *f, const char * if (first->args.string[i] == '\0') goto pattern_search; char *tmp = strndup(first->args.string, first->min_matchlen); - char *found = (ignorecase ? strcasestr : strstr)(str, tmp); + char *found = (ctx->ignorecase ? strcasestr : strstr)(str, tmp); if (found) str = found; else - str += strlen(str); // Use += strlen here instead of f->end to handle files with NULL bytes + str += strlen(str); // Use += strlen here instead of ctx->end to handle files with NULL bytes free(tmp); } pattern_search: - if (str > f->end) return NULL; + if (str > ctx->end) return NULL; do { - match_t *m = match(defs, cache, f, str, pat, ignorecase); + match_t *m = match(ctx, str, pat); if (m) return m; if (first->type == BP_START_OF_FILE) return NULL; match_t *s; - if (skip && (s = match(defs, cache, f, str, skip, ignorecase))) { + if (skip && (s = match(ctx, str, skip))) { str = s->end > str ? s->end : str + 1; recycle_if_unused(&s); - } else str = next_char(str, f->end); - } while (str < f->end); + } else str = next_char(str, ctx->end); + } while (str < ctx->end); return NULL; } @@ -293,13 +303,14 @@ static match_t *_next_match(def_t *defs, cache_t *cache, file_t *f, const char * // match object, or NULL if no match is found. // The returned value should be free()'d to avoid memory leaking. // -static match_t *match(def_t *defs, cache_t *cache, file_t *f, const char *str, pat_t *pat, bool ignorecase) +static match_t *match(match_context_t *ctx, const char *str, pat_t *pat) { switch (pat->type) { case BP_DEFINITION: { - def_t *defs2 = with_def(defs, pat->args.def.namelen, pat->args.def.name, pat->args.def.def); - match_t *m = match(defs2, cache, f, str, pat->args.def.pat ? pat->args.def.pat : pat->args.def.def, ignorecase); - defs = free_defs(defs2, defs); + match_context_t ctx2 = *ctx; + ctx2.defs = with_def(ctx->defs, pat->args.def.namelen, pat->args.def.name, pat->args.def.def); + match_t *m = match(&ctx2, str, pat->args.def.pat ? pat->args.def.pat : pat->args.def.def); + free_defs(ctx2.defs, ctx->defs); return m; } case BP_LEFTRECURSION: { @@ -312,59 +323,60 @@ static match_t *match(def_t *defs, cache_t *cache, file_t *f, const char *str, p ++pat->args.leftrec.visits; return pat->args.leftrec.match; } else { - return match(defs, cache, f, str, pat->args.leftrec.fallback, ignorecase); + return match(ctx, str, pat->args.leftrec.fallback); } } case BP_ANYCHAR: { - return (str < f->end && *str != '\n') ? new_match(defs, pat, str, next_char(str, f->end), NULL) : NULL; + return (str < ctx->end && *str != '\n') ? new_match(ctx->defs, pat, str, next_char(str, ctx->end), NULL) : NULL; } case BP_ID_START: { - return (str < f->end && isidstart(str, f->end)) ? new_match(defs, pat, str, next_char(str, f->end), NULL) : NULL; + return (str < ctx->end && isidstart(str, ctx->end)) ? new_match(ctx->defs, pat, str, next_char(str, ctx->end), NULL) : NULL; } case BP_ID_CONTINUE: { - return (str < f->end && isidcontinue(str, f->end)) ? new_match(defs, pat, str, next_char(str, f->end), NULL) : NULL; + return (str < ctx->end && isidcontinue(str, ctx->end)) ? new_match(ctx->defs, pat, str, next_char(str, ctx->end), NULL) : NULL; } case BP_START_OF_FILE: { - return (str == f->start) ? new_match(defs, pat, str, str, NULL) : NULL; + return (str == ctx->start) ? new_match(ctx->defs, pat, str, str, NULL) : NULL; } case BP_START_OF_LINE: { - return (str == f->start || str[-1] == '\n') ? new_match(defs, pat, str, str, NULL) : NULL; + return (str == ctx->start || str[-1] == '\n') ? new_match(ctx->defs, pat, str, str, NULL) : NULL; } case BP_END_OF_FILE: { - return (str == f->end || (str == f->end-1 && *str == '\n')) ? new_match(defs, pat, str, str, NULL) : NULL; + return (str == ctx->end || (str == ctx->end-1 && *str == '\n')) ? new_match(ctx->defs, pat, str, str, NULL) : NULL; } case BP_END_OF_LINE: { - return (str == f->end || *str == '\n') ? new_match(defs, pat, str, str, NULL) : NULL; + return (str == ctx->end || *str == '\n') ? new_match(ctx->defs, pat, str, str, NULL) : NULL; } case BP_WORD_BOUNDARY: { - return (str == f->start || isidcontinue(str, f->end) != isidcontinue(prev_char(f->start, str), f->end)) ? new_match(defs, pat, str, str, NULL) : NULL; + return (str == ctx->start || isidcontinue(str, ctx->end) != isidcontinue(prev_char(ctx->start, str), ctx->end)) ? + new_match(ctx->defs, pat, str, str, NULL) : NULL; } case BP_STRING: { - if (&str[pat->min_matchlen] > f->end) return NULL; - if (pat->min_matchlen > 0 && (ignorecase ? memicmp : memcmp)(str, pat->args.string, pat->min_matchlen) != 0) + if (&str[pat->min_matchlen] > ctx->end) return NULL; + if (pat->min_matchlen > 0 && (ctx->ignorecase ? memicmp : memcmp)(str, pat->args.string, pat->min_matchlen) != 0) return NULL; - return new_match(defs, pat, str, str + pat->min_matchlen, NULL); + return new_match(ctx->defs, pat, str, str + pat->min_matchlen, NULL); } case BP_RANGE: { - if (str >= f->end) return NULL; + if (str >= ctx->end) return NULL; if ((unsigned char)*str < pat->args.range.low || (unsigned char)*str > pat->args.range.high) return NULL; - return new_match(defs, pat, str, str+1, NULL); + return new_match(ctx->defs, pat, str, str+1, NULL); } case BP_NOT: { - match_t *m = match(defs, cache, f, str, pat->args.pat, ignorecase); + match_t *m = match(ctx, str, pat->args.pat); if (m != NULL) { recycle_if_unused(&m); return NULL; } - return new_match(defs, pat, str, str, NULL); + return new_match(ctx->defs, pat, str, str, NULL); } case BP_UPTO: case BP_UPTO_STRICT: { - match_t *m = new_match(defs, pat, str, str, NULL); - pat_t *target = deref(defs, pat->args.multiple.first), - *skip = deref(defs, pat->args.multiple.second); + match_t *m = new_match(ctx->defs, pat, str, str, NULL); + pat_t *target = deref(ctx->defs, pat->args.multiple.first), + *skip = deref(ctx->defs, pat->args.multiple.second); if (!target && !skip) { - while (str < f->end && *str != '\n') ++str; + while (str < ctx->end && *str != '\n') ++str; m->end = str; return m; } @@ -373,18 +385,18 @@ static match_t *match(def_t *defs, cache_t *cache, file_t *f, const char *str, p for (const char *prev = NULL; prev < str; ) { prev = str; if (target) { - match_t *p = match(defs, cache, f, str, target, ignorecase); + match_t *p = match(ctx, str, target); if (p != NULL) { recycle_if_unused(&p); m->end = str; return m; } - } else if (str == f->end) { + } else if (str == ctx->end) { m->end = str; return m; } if (skip) { - match_t *s = match(defs, cache, f, str, skip, ignorecase); + match_t *s = match(ctx, str, skip); if (s != NULL) { str = s->end; if (nchildren+2 >= child_cap) { @@ -398,29 +410,29 @@ static match_t *match(def_t *defs, cache_t *cache, file_t *f, const char *str, p // This isn't in the for() structure because there needs to // be at least once chance to match the pattern, even if // we're at the end of the string already (e.g. "..$"). - if (str < f->end && *str != '\n' && pat->type != BP_UPTO_STRICT) - str = next_char(str, f->end); + if (str < ctx->end && *str != '\n' && pat->type != BP_UPTO_STRICT) + str = next_char(str, ctx->end); } recycle_if_unused(&m); return NULL; } case BP_REPEAT: { - match_t *m = new_match(defs, pat, str, str, NULL); + match_t *m = new_match(ctx->defs, pat, str, str, NULL); size_t reps = 0; ssize_t max = pat->args.repetitions.max; - pat_t *repeating = deref(defs, pat->args.repetitions.repeat_pat); - pat_t *sep = deref(defs, pat->args.repetitions.sep); + pat_t *repeating = deref(ctx->defs, pat->args.repetitions.repeat_pat); + pat_t *sep = deref(ctx->defs, pat->args.repetitions.sep); size_t child_cap = 0, nchildren = 0; for (reps = 0; max == -1 || reps < (size_t)max; ++reps) { const char *start = str; // Separator match_t *msep = NULL; if (sep != NULL && reps > 0) { - msep = match(defs, cache, f, str, sep, ignorecase); + msep = match(ctx, str, sep); if (msep == NULL) break; str = msep->end; } - match_t *mp = match(defs, cache, f, str, repeating, ignorecase); + match_t *mp = match(ctx, str, repeating); if (mp == NULL) { str = start; if (msep) recycle_if_unused(&msep); @@ -465,51 +477,52 @@ static match_t *match(def_t *defs, cache_t *cache, file_t *f, const char *str, p return m; } case BP_AFTER: { - pat_t *back = deref(defs, pat->args.pat); + pat_t *back = deref(ctx->defs, pat->args.pat); if (!back) return NULL; // We only care about the region from the backtrack pos up to the // current pos, so mock it out as a file slice. // TODO: this breaks ^/^^/$/$$, but that can probably be ignored // because you rarely need to check those in a backtrack. - cache_t slice_cache = {0}; - file_t slice; - slice_file(&slice, f, f->start, str); + match_context_t slice_ctx = *ctx; + slice_ctx.cache = &(cache_t){0}; + slice_ctx.start = ctx->start; + slice_ctx.end = str; for (const char *pos = &str[-(long)back->min_matchlen]; - pos >= f->start && (back->max_matchlen == -1 || pos >= &str[-(int)back->max_matchlen]); - pos = prev_char(f->start, pos)) { - cache_destroy(&slice_cache); - slice.start = (char*)pos; - match_t *m = match(defs, &slice_cache, &slice, pos, back, ignorecase); + pos >= ctx->start && (back->max_matchlen == -1 || pos >= &str[-(int)back->max_matchlen]); + pos = prev_char(ctx->start, pos)) { + cache_destroy(slice_ctx.cache); + slice_ctx.start = (char*)pos; + match_t *m = match(&slice_ctx, pos, back); // Match should not go past str (i.e. (<"AB" "B") should match "ABB", but not "AB") if (m && m->end != str) recycle_if_unused(&m); else if (m) { - cache_destroy(&slice_cache); - return new_match(defs, pat, str, str, MATCHES(m)); + cache_destroy(slice_ctx.cache); + return new_match(ctx->defs, pat, str, str, MATCHES(m)); } - if (pos == f->start) break; + if (pos == ctx->start) break; // To prevent extreme performance degradation, don't keep // walking backwards endlessly over newlines. if (back->max_matchlen == -1 && *pos == '\n') break; } - cache_destroy(&slice_cache); + cache_destroy(slice_ctx.cache); return NULL; } case BP_BEFORE: { - match_t *after = match(defs, cache, f, str, pat->args.pat, ignorecase); - return after ? new_match(defs, pat, str, str, MATCHES(after)) : NULL; + match_t *after = match(ctx, str, pat->args.pat); + return after ? new_match(ctx->defs, pat, str, str, MATCHES(after)) : NULL; } case BP_CAPTURE: { - match_t *p = match(defs, cache, f, str, pat->args.pat, ignorecase); - return p ? new_match(defs, pat, str, p->end, MATCHES(p)) : NULL; + match_t *p = match(ctx, str, pat->args.pat); + return p ? new_match(ctx->defs, pat, str, p->end, MATCHES(p)) : NULL; } case BP_OTHERWISE: { - match_t *m = match(defs, cache, f, str, pat->args.multiple.first, ignorecase); - return m ? m : match(defs, cache, f, str, pat->args.multiple.second, ignorecase); + match_t *m = match(ctx, str, pat->args.multiple.first); + return m ? m : match(ctx, str, pat->args.multiple.second); } case BP_CHAIN: { - match_t *m1 = match(defs, cache, f, str, pat->args.multiple.first, ignorecase); + match_t *m1 = match(ctx, str, pat->args.multiple.first); if (m1 == NULL) return NULL; match_t *m2; @@ -518,17 +531,18 @@ static match_t *match(def_t *defs, cache_t *cache, file_t *f, const char *str, p // Temporarily add a rule that the backref name matches the // exact string of the original match (no replacements) pat_t *backref = bp_raw_literal(m1->start, (size_t)(m1->end - m1->start)); - def_t *defs2 = with_def(defs, m1->pat->args.capture.namelen, m1->pat->args.capture.name, backref); + match_context_t ctx2 = *ctx; + ctx2.defs = with_def(ctx->defs, m1->pat->args.capture.namelen, m1->pat->args.capture.name, backref); ++m1->refcount; { - m2 = match(defs2, cache, f, m1->end, pat->args.multiple.second, ignorecase); + m2 = match(&ctx2, m1->end, pat->args.multiple.second); if (!m2) { // No need to keep the backref in memory if it didn't match free_pat(backref); backref = NULL; } - defs = free_defs(defs2, defs); + free_defs(ctx2.defs, ctx->defs); } --m1->refcount; } else { - m2 = match(defs, cache, f, m1->end, pat->args.multiple.second, ignorecase); + m2 = match(ctx, m1->end, pat->args.multiple.second); } if (m2 == NULL) { @@ -536,42 +550,43 @@ static match_t *match(def_t *defs, cache_t *cache, file_t *f, const char *str, p return NULL; } - return new_match(defs, pat, str, m2->end, MATCHES(m1, m2)); + return new_match(ctx->defs, pat, str, m2->end, MATCHES(m1, m2)); } case BP_MATCH: case BP_NOT_MATCH: { - match_t *m1 = match(defs, cache, f, str, pat->args.multiple.first, ignorecase); + match_t *m1 = match(ctx, str, pat->args.multiple.first); if (m1 == NULL) return NULL; // <p1>~<p2> matches iff the text of <p1> matches <p2> // <p1>!~<p2> matches iff the text of <p1> does not match <p2> - cache_t slice_cache = {0}; - file_t slice; - slice_file(&slice, f, m1->start, m1->end); - match_t *m2 = _next_match(defs, &slice_cache, &slice, slice.start, pat->args.multiple.second, NULL, ignorecase); + match_context_t slice_ctx = *ctx; + slice_ctx.cache = &(cache_t){0}; + slice_ctx.start = m1->start; + slice_ctx.end = m1->end; + match_t *m2 = _next_match(&slice_ctx, slice_ctx.start, pat->args.multiple.second, NULL); if ((!m2 && pat->type == BP_MATCH) || (m2 && pat->type == BP_NOT_MATCH)) { - cache_destroy(&slice_cache); + cache_destroy(slice_ctx.cache); if (m2) recycle_if_unused(&m2); recycle_if_unused(&m1); return NULL; } - match_t *ret = new_match(defs, pat, m1->start, m1->end, (pat->type == BP_MATCH) ? MATCHES(m1, m2) : MATCHES(m1)); - cache_destroy(&slice_cache); + match_t *ret = new_match(ctx->defs, pat, m1->start, m1->end, (pat->type == BP_MATCH) ? MATCHES(m1, m2) : MATCHES(m1)); + cache_destroy(slice_ctx.cache); return ret; } case BP_REPLACE: { match_t *p = NULL; if (pat->args.replace.pat) { - p = match(defs, cache, f, str, pat->args.replace.pat, ignorecase); + p = match(ctx, str, pat->args.replace.pat); if (p == NULL) return NULL; } - return new_match(defs, pat, str, p ? p->end : str, MATCHES(p)); + return new_match(ctx->defs, pat, str, p ? p->end : str, MATCHES(p)); } case BP_REF: { match_t *cached; - if (cache_get(cache, defs, str, pat, &cached)) + if (cache_get(ctx->cache, ctx->defs, str, pat, &cached)) return cached; - def_t *def = lookup(defs, pat->args.ref.len, pat->args.ref.name); + def_t *def = lookup(ctx->defs, pat->args.ref.len, pat->args.ref.name); if (def == NULL) errx(EXIT_FAILURE, "Unknown identifier: '%.*s'", (int)pat->args.ref.len, pat->args.ref.name); pat_t *ref = def->pat; @@ -589,17 +604,18 @@ static match_t *match(def_t *defs, cache_t *cache, file_t *f, const char *str, p .fallback = ref, }, }; - def_t defs2 = { + match_context_t ctx2 = *ctx; + ctx2.defs = &(def_t){ .namelen = def->namelen, .name = def->name, .pat = &rec_op, - .next = defs, + .next = ctx->defs, }; const char *prev = str; - match_t *m = match(&defs2, cache, f, str, ref, ignorecase); + match_t *m = match(&ctx2, str, ref); if (m == NULL) { - cache_save(cache, defs, str, pat, NULL); + cache_save(ctx->cache, ctx->defs, str, pat, NULL); return NULL; } @@ -608,7 +624,7 @@ static match_t *match(def_t *defs, cache_t *cache, file_t *f, const char *str, p remove_ownership(&rec_op.args.leftrec.match); add_owner(&rec_op.args.leftrec.match, m); prev = m->end; - match_t *m2 = match(&defs2, cache, f, str, ref, ignorecase); + match_t *m2 = match(&ctx2, str, ref); if (m2 == NULL) break; if (m2->end <= prev) { recycle_if_unused(&m2); @@ -622,8 +638,8 @@ static match_t *match(def_t *defs, cache_t *cache, file_t *f, const char *str, p // leftrec.match is GC'd. It also helps with visualization of match // results. // OPTIMIZE: remove this if necessary - match_t *wrap = new_match(defs, pat, m->start, m->end, MATCHES(m)); - cache_save(cache, defs, str, pat, wrap); + match_t *wrap = new_match(ctx->defs, pat, m->start, m->end, MATCHES(m)); + cache_save(ctx->cache, ctx->defs, str, pat, wrap); if (rec_op.args.leftrec.match) remove_ownership(&rec_op.args.leftrec.match); @@ -635,21 +651,21 @@ static match_t *match(def_t *defs, cache_t *cache, file_t *f, const char *str, p const char *start = str; const char *p = str; - while (p > f->start && p[-1] != '\n') --p; + while (p > ctx->start && p[-1] != '\n') --p; // Current indentation: char denter = *p; int dents = 0; if (denter == ' ' || denter == '\t') { - for (; *p == denter && p < f->end; ++p) ++dents; + for (; *p == denter && p < ctx->end; ++p) ++dents; } // Subsequent indentation: while (*str == '\n' || *str == '\n') ++str; for (int i = 0; i < dents; i++) - if (&str[i] >= f->end || str[i] != denter) return NULL; + if (&str[i] >= ctx->end || str[i] != denter) return NULL; - return new_match(defs, pat, start, &str[dents], NULL); + return new_match(ctx->defs, pat, start, &str[dents], NULL); } default: { errx(EXIT_FAILURE, "Unknown pattern type: %u", pat->type); @@ -750,26 +766,33 @@ size_t free_all_matches(void) // Iterate over matches. // Usage: for (match_t *m = NULL; next_match(&m, ...); ) {...} // -bool next_match(match_t **m, def_t *defs, file_t *f, pat_t *pat, pat_t *skip, bool ignorecase) +bool next_match(match_t **m, def_t *defs, const char *start, const char *end, pat_t *pat, pat_t *skip, bool ignorecase) { static cache_t cache = {0}; - if (!f || !pat) { // Cleanup for stop_matching() + if (!pat) { // Cleanup for stop_matching() recycle_if_unused(m); cache_destroy(&cache); return false; } - const char *start; + const char *pos; if (*m) { // Make sure forward progress is occurring, even after zero-width matches: - start = ((*m)->end > (*m)->start) ? (*m)->end : (*m)->end+1; + pos = ((*m)->end > (*m)->start) ? (*m)->end : (*m)->end+1; recycle_if_unused(m); } else { - start = f->start; + pos = start; cache_destroy(&cache); } - *m = (start <= f->end) ? _next_match(defs, &cache, f, start, pat, skip, ignorecase) : NULL; + match_context_t ctx = { + .defs = defs, + .cache = &cache, + .start = start, + .end = end, + .ignorecase = ignorecase, + }; + *m = (pos <= end) ? _next_match(&ctx, pos, pat, skip) : NULL; if (!*m) cache_destroy(&cache); return *m != NULL; } @@ -820,4 +843,88 @@ match_t *get_named_capture(match_t *m, const char *name, size_t namelen) return NULL; } +void fprint_match(FILE *out, const char *file_start, match_t *m) +{ + if (m->pat->type == BP_REPLACE) { + const char *text = m->pat->args.replace.text; + const char *end = &text[m->pat->args.replace.len]; + + // TODO: clean up the line numbering code + for (const char *r = text; r < end; ) { + // Capture substitution + if (*r == '@' && r+1 < end && r[1] != '@') { + ++r; + + // Retrieve the capture value: + match_t *cap = NULL; + if (isdigit(*r)) { + int n = (int)strtol(r, (char**)&r, 10); + cap = get_numbered_capture(m->children[0], n); + } else { + const char *name = r, *end = after_name(r); + if (end > name) { + cap = get_named_capture(m->children[0], name, (size_t)(end - name)); + r = end; + if (r < m->end && *r == ';') ++r; + } + } + + if (cap != NULL) { + fprint_match(out, file_start, cap); + continue; + } else { + --r; + } + } + + if (*r == '\\') { + ++r; + if (*r == 'N') { // \N (nodent) + ++r; + // Mildly hacky: nodents here are based on the *first line* + // of the match. If the match spans multiple lines, or if + // the replacement text contains newlines, this may get weird. + const char *line_start = m->start; + while (line_start > file_start && line_start[-1] != '\n') --line_start; + char denter = *line_start == '\t' ? '\t' : ' '; + fputc('\n', out); + if (denter == ' ' || denter == '\t') { + for (const char *p = line_start; p && *p == denter && p < m->start; ++p) + fputc(denter, out); + } + continue; + } + const char *start = r; + char c = unescapechar(r, &r); + if (r > start) (void)fputc(c, out); + else (void)fputc('\\', out); + continue; + } else if (*r == '\n') { + (void)fputc('\n', out); + ++r; + continue; + } else { + (void)fputc(*r, out); + ++r; + continue; + } + } + } else { + const char *prev = m->start; + for (int i = 0; m->children && m->children[i]; i++) { + match_t *child = m->children[i]; + // Skip children from e.g. zero-width matches like >@foo + if (!(prev <= child->start && child->start <= m->end && + prev <= child->end && child->end <= m->end)) + continue; + if (child->start > prev) + fwrite(prev, sizeof(char), (size_t)(child->start - prev), out); + fprint_match(out, file_start, child); + prev = child->end; + } + if (m->end > prev) + fwrite(prev, sizeof(char), (size_t)(m->end - prev), out); + } +} + // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 @@ -7,7 +7,6 @@ #include <stdbool.h> #include <stdio.h> -#include "files.h" #include "pattern.h" #include "definitions.h" @@ -40,12 +39,14 @@ __attribute__((nonnull)) void recycle_if_unused(match_t **at_m); size_t free_all_matches(void); size_t recycle_all_matches(void); -bool next_match(match_t **m, def_t *defs, file_t *f, pat_t *pat, pat_t *skip, bool ignorecase); -#define stop_matching(m) next_match(m, NULL, NULL, NULL, NULL, 0) +bool next_match(match_t **m, def_t *defs, const char *start, const char *end, pat_t *pat, pat_t *skip, bool ignorecase); +#define stop_matching(m) next_match(m, NULL, NULL, NULL, NULL, NULL, 0) __attribute__((nonnull)) match_t *get_numbered_capture(match_t *m, int n); __attribute__((nonnull, pure)) match_t *get_named_capture(match_t *m, const char *name, size_t namelen); +__attribute__((nonnull)) +void fprint_match(FILE *out, const char *file_start, match_t *m); #endif // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 |
