diff options
| -rw-r--r-- | Lua/Makefile | 2 | ||||
| -rw-r--r-- | Lua/lbp.c | 12 | ||||
| -rw-r--r-- | Makefile | 2 | ||||
| -rw-r--r-- | bp.c | 66 | ||||
| -rw-r--r-- | definitions.c | 55 | ||||
| -rw-r--r-- | definitions.h | 28 | ||||
| -rw-r--r-- | match.c | 229 | ||||
| -rw-r--r-- | match.h | 8 | ||||
| -rw-r--r-- | pattern.c | 57 | ||||
| -rw-r--r-- | pattern.h | 4 |
10 files changed, 219 insertions, 244 deletions
diff --git a/Lua/Makefile b/Lua/Makefile index c729821..5be2cb4 100644 --- a/Lua/Makefile +++ b/Lua/Makefile @@ -39,7 +39,7 @@ clean: lbp.o: lbp.c builtins.h $(CC) -c $(ALL_FLAGS) -o $@ $< -bp.so: lbp.o ../pattern.o ../utils.o ../utf8.o ../match.o ../definitions.o +bp.so: lbp.o ../pattern.o ../utils.o ../utf8.o ../match.o $(MAKESO) -o $@ $^ builtins.h: ../grammars/builtins.bp @@ -26,7 +26,7 @@ static const char *builtins_source = ( #include "builtins.h" ); static int MATCH_METATABLE = 0, PAT_METATABLE = 0; -static def_t *builtins; +static pat_t *builtins; static void push_match(lua_State *L, match_t *m, const char *start); @@ -133,11 +133,13 @@ static int Lmatch(lua_State *L) match_t *m = NULL; int ret = 0; - if (next_match(&m, builtins, text+index-1, &text[textlen], pat, NULL, false)) { + pat_t *def_pat = chain_together(builtins, pat); + if (next_match(&m, text+index-1, &text[textlen], def_pat, NULL, false)) { push_match(L, m, text); stop_matching(&m); ret = 1; } + delete_pat(&def_pat, false); return ret; } @@ -170,7 +172,8 @@ static int Lreplace(lua_State *L) FILE *out = open_memstream(&buf, &size); int replacements = 0; const char *prev = text; - for (match_t *m = NULL; next_match(&m, builtins, text, &text[textlen], maybe_replacement.value.pat, NULL, false); ) { + pat_t *rep_pat = chain_together(builtins, maybe_replacement.value.pat); + for (match_t *m = NULL; next_match(&m, text, &text[textlen], rep_pat, NULL, false); ) { fwrite(prev, sizeof(char), (size_t)(m->start - prev), out); fprint_match(out, text, m, NULL); prev = m->end; @@ -329,8 +332,7 @@ LUALIB_API int luaopen_bp(lua_State *L) raise_parse_error(L, maybe_pat); return 0; } - for (pat_t *p = maybe_pat.value.pat; p && p->type == BP_DEFINITION; p = p->args.def.pat) - builtins = with_def(builtins, p->args.def.namelen, p->args.def.name, p->args.def.def); + builtins = maybe_pat.value.pat; lua_pushlightuserdata(L, (void*)&PAT_METATABLE); luaL_newlib(L, pat_metamethods); @@ -22,7 +22,7 @@ G= O=-O3 ALL_FLAGS=$(CFLAGS) $(OSFLAGS) -DBP_NAME="\"$(NAME)\"" $(EXTRA) $(CWARN) $(G) $(O) -CFILES=pattern.c definitions.c utils.c match.c files.c explain.c json.c utf8.c +CFILES=pattern.c utils.c match.c files.c explain.c json.c utf8.c OBJFILES=$(CFILES:.c=.o) all: $(NAME) bp.1 @@ -20,7 +20,6 @@ #include <sys/wait.h> #include <unistd.h> -#include "definitions.h" #include "explain.h" #include "files.h" #include "json.h" @@ -207,10 +206,10 @@ static int is_text_file(const char *filename) // // Print matches in JSON format. // -static int print_matches_as_json(def_t *defs, file_t *f, pat_t *pattern) +static int print_matches_as_json(file_t *f, pat_t *pattern) { int nmatches = 0; - for (match_t *m = NULL; next_match(&m, defs, f->start, f->end, pattern, options.skip, options.ignorecase); ) { + for (match_t *m = NULL; next_match(&m, f->start, f->end, pattern, options.skip, options.ignorecase); ) { if (++nmatches > 1) printf(",\n"); printf("{\"filename\":\"%s\",\"match\":", f->filename); @@ -223,10 +222,10 @@ static int print_matches_as_json(def_t *defs, file_t *f, pat_t *pattern) // // Print matches in a visual explanation style // -static int explain_matches(def_t *defs, file_t *f, pat_t *pattern) +static int explain_matches(file_t *f, pat_t *pattern) { int nmatches = 0; - for (match_t *m = NULL; next_match(&m, defs, f->start, f->end, pattern, options.skip, options.ignorecase); ) { + for (match_t *m = NULL; next_match(&m, f->start, f->end, pattern, options.skip, options.ignorecase); ) { if (++nmatches == 1) { if (options.print_filenames) fprint_filename(stdout, f->filename); @@ -349,7 +348,7 @@ static void on_nl(FILE *out) // // Print all the matches in a file. // -static int print_matches(FILE *out, def_t *defs, file_t *f, pat_t *pattern) +static int print_matches(FILE *out, file_t *f, pat_t *pattern) { static int printed_filenames = 0; int matches = 0; @@ -364,7 +363,7 @@ static int print_matches(FILE *out, def_t *defs, file_t *f, pat_t *pattern) print_opts.replace_color = "\033[0;34;1m"; print_opts.normal_color = "\033[m"; } - for (match_t *m = NULL; next_match(&m, defs, f->start, f->end, pattern, options.skip, options.ignorecase); ) { + for (match_t *m = NULL; next_match(&m, f->start, f->end, pattern, options.skip, options.ignorecase); ) { if (++matches == 1 && options.print_filenames) { if (printed_filenames++ > 0) printf("\n"); fprint_filename(out, f->filename); @@ -388,8 +387,8 @@ static int print_matches(FILE *out, def_t *defs, file_t *f, pat_t *pattern) // For a given filename, open the file and attempt to match the given pattern // against it, printing any results according to the flags. // -__attribute__((nonnull(2,3))) -static int process_file(def_t *defs, const char *filename, pat_t *pattern) +__attribute__((nonnull)) +static int process_file(const char *filename, pat_t *pattern) { file_t *f = load_file(NULL, filename); if (f == NULL) { @@ -399,19 +398,19 @@ static int process_file(def_t *defs, const char *filename, pat_t *pattern) int matches = 0; if (options.mode == MODE_EXPLAIN) { - matches += explain_matches(defs, f, pattern); + matches += explain_matches(f, pattern); } else if (options.mode == MODE_LISTFILES) { match_t *m = NULL; - if (next_match(&m, defs, f->start, f->end, pattern, options.skip, options.ignorecase)) { + if (next_match(&m, f->start, f->end, pattern, options.skip, options.ignorecase)) { printf("%s\n", f->filename); matches += 1; } stop_matching(&m); } else if (options.mode == MODE_JSON) { - matches += print_matches_as_json(defs, f, pattern); + matches += print_matches_as_json(f, pattern); } else if (options.mode == MODE_INPLACE) { match_t *m = NULL; - bool found = next_match(&m, defs, f->start, f->end, pattern, options.skip, options.ignorecase); + bool found = next_match(&m, f->start, f->end, pattern, options.skip, options.ignorecase); stop_matching(&m); if (!found) return 0; @@ -427,12 +426,12 @@ static int process_file(def_t *defs, const char *filename, pat_t *pattern) // are used to restore the original file contents. modifying_file = out; backup_file = f; { - matches += print_matches(out, defs, f, pattern); + matches += print_matches(out, f, pattern); } modifying_file = NULL; backup_file = NULL; fclose(out); } else { - matches += print_matches(stdout, defs, f, pattern); + matches += print_matches(stdout, f, pattern); } fflush(stdout); @@ -446,8 +445,8 @@ static int process_file(def_t *defs, const char *filename, pat_t *pattern) // // Recursively process all non-dotfile files in the given directory. // -__attribute__((nonnull(2,3))) -static int process_dir(def_t *defs, const char *dirname, pat_t *pattern) +__attribute__((nonnull)) +static int process_dir(const char *dirname, pat_t *pattern) { int matches = 0; glob_t globbuf; @@ -464,9 +463,9 @@ static int process_dir(def_t *defs, const char *dirname, pat_t *pattern) if (S_ISLNK(statbuf.st_mode)) continue; // Skip symbolic links else if (S_ISDIR(statbuf.st_mode)) - matches += process_dir(defs, globbuf.gl_pathv[i], pattern); + matches += process_dir(globbuf.gl_pathv[i], pattern); else if (is_text_file(globbuf.gl_pathv[i])) - matches += process_file(defs, globbuf.gl_pathv[i], pattern); + matches += process_file(globbuf.gl_pathv[i], pattern); } } globfree(&globbuf); @@ -476,8 +475,8 @@ static int process_dir(def_t *defs, const char *dirname, pat_t *pattern) // // Process git files using `git ls-files ...` // -__attribute__((nonnull(2))) -static int process_git_files(def_t *defs, pat_t *pattern, int argc, char *argv[]) +__attribute__((nonnull(1))) +static int process_git_files(pat_t *pattern, int argc, char *argv[]) { int fds[2]; require(pipe(fds), "Failed to create pipe"); @@ -500,7 +499,7 @@ static int process_git_files(def_t *defs, pat_t *pattern, int argc, char *argv[] size_t path_size = 0; int found = 0; while (getdelim(&path, &path_size, '\0', fp) > 0) - found += process_file(defs, path, pattern); + found += process_file(path, pattern); if (path) delete(&path); require(fclose(fp), "Failed to close read end of pipe"); int status; @@ -514,12 +513,9 @@ static int process_git_files(def_t *defs, pat_t *pattern, int argc, char *argv[] // Load the given grammar (semicolon-separated definitions) // and return the first rule defined. // -static def_t *load_grammar(def_t *defs, file_t *f) +static pat_t *load_grammar(pat_t *defs, file_t *f) { - pat_t *pat = assert_pat(f->start, bp_pattern(f->start, f->end)); - for (pat_t *p = pat; p && p->type == BP_DEFINITION; p = p->args.def.pat) - defs = with_def(defs, p->args.def.namelen, p->args.def.name, p->args.def.def); - return defs; + return chain_together(defs, assert_pat(f->start, bp_pattern(f->start, f->end))); } // @@ -539,7 +535,7 @@ int main(int argc, char *argv[]) { char *flag = NULL; - def_t *defs = NULL; + pat_t *defs = NULL; file_t *loaded_files = NULL; pat_t *pattern = NULL; @@ -634,6 +630,9 @@ int main(int argc, char *argv[]) if (pattern == NULL) errx(EXIT_FAILURE, "No pattern provided.\n\n%s", usage); + // Hook up definitions: + pattern = chain_together(defs, pattern); + for (argc = 0; argv[argc]; ++argc) ; // update argc if (options.context_before == USE_DEFAULT_CONTEXT) options.context_before = 0; @@ -655,7 +654,7 @@ int main(int argc, char *argv[]) int found = 0; if (options.mode == MODE_JSON) printf("["); if (options.git_mode) { // Get the list of files from `git --ls-files ...` - found = process_git_files(defs, pattern, argc, argv); + found = process_git_files(pattern, argc, argv); } else if (argv[0]) { // Files pass in as command line args: struct stat statbuf; @@ -663,17 +662,17 @@ int main(int argc, char *argv[]) options.print_filenames = false; for ( ; argv[0]; argv++) { if (stat(argv[0], &statbuf) == 0 && S_ISDIR(statbuf.st_mode)) // Symlinks are okay if manually specified - found += process_dir(defs, argv[0], pattern); + found += process_dir(argv[0], pattern); else - found += process_file(defs, argv[0], pattern); + found += process_file(argv[0], pattern); } } else if (isatty(STDIN_FILENO)) { // No files, no piped in input, so use files in current dir, recursively - found += process_dir(defs, ".", pattern); + found += process_dir(".", pattern); } else { // Piped in input: options.print_filenames = false; // Don't print filename on stdin - found += process_file(defs, "", pattern); + found += process_file("", pattern); } if (options.mode == MODE_JSON) printf("]\n"); @@ -681,7 +680,6 @@ int main(int argc, char *argv[]) // is about to exit, this step is unnecessary. However, it is useful for // tracking down memory leaks. free_all_matches(); - defs = free_defs(defs, NULL); free_all_pats(); while (loaded_files) { file_t *next = loaded_files->next; diff --git a/definitions.c b/definitions.c deleted file mode 100644 index 590a801..0000000 --- a/definitions.c +++ /dev/null @@ -1,55 +0,0 @@ -// -// definitions.c - Code for defining named pattern rules -// - -#include <err.h> -#include <stdlib.h> -#include <string.h> - -#include "definitions.h" -#include "pattern.h" -#include "utils.h" - -static size_t next_id = 0; - -// -// Return a new list of definitions with one added to the front -// -def_t *with_def(def_t *defs, size_t namelen, const char *name, pat_t *pat) -{ - def_t *def = new(def_t); - def->id = next_id++; - def->next = defs; - def->namelen = namelen; - def->name = name; - def->pat = pat; - return def; -} - -// -// Look up a backreference or grammar definition by name -// -def_t *lookup(def_t *defs, size_t namelen, const char *name) -{ - for ( ; defs; defs = defs->next) { - if (namelen == defs->namelen && strncmp(defs->name, name, namelen) == 0) - return defs; - } - return NULL; -} - -// -// Free all the given definitions up till (but not including) `stop` -// -def_t *free_defs(def_t *defs, def_t *stop) -{ - while (defs != stop && defs != NULL) { - def_t *next = defs->next; - defs->next = NULL; - delete(&defs); - defs = next; - } - return defs; -} - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/definitions.h b/definitions.h deleted file mode 100644 index 34e4344..0000000 --- a/definitions.h +++ /dev/null @@ -1,28 +0,0 @@ -// -// definitions.h - Header file defining pattern rules -// -#ifndef DEFINITIONS__H -#define DEFINITIONS__H - -#include "pattern.h" - -// -// Pattern matching rule definition(s) -// -typedef struct def_s { - size_t id; - size_t namelen; - const char *name; - pat_t *pat; - struct def_s *next; -} def_t; - -__attribute__((nonnull(3,4), returns_nonnull)) -def_t *with_def(def_t *defs, size_t namelen, const char *name, pat_t *pat); -__attribute__((pure, nonnull(3))) -def_t *lookup(def_t *defs, size_t namelen, const char *name); -__attribute__((nonnull(1))) -def_t *free_defs(def_t *defs, def_t *stop); - -#endif -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 @@ -10,7 +10,6 @@ #include <stdlib.h> #include <string.h> -#include "definitions.h" #include "match.h" #include "pattern.h" #include "utils.h" @@ -25,8 +24,9 @@ typedef struct { } cache_t; // Data structure for various ambient state for matching -typedef struct { - def_t *defs; +typedef struct match_ctx_s { + struct match_ctx_s *parent_ctx; + pat_t *defs; cache_t *cache; const char *start, *end; bool ignorecase; @@ -45,6 +45,8 @@ static match_t *in_use_matches = NULL; __attribute__((hot, nonnull(1,2,3))) static match_t *match(match_ctx_t *ctx, const char *str, pat_t *pat); +__attribute__((returns_nonnull)) +static match_t *new_match(pat_t *pat, const char *start, const char *end, match_t *children[]); // Store a value and update its refcount static inline void add_owner(match_t** owner, match_t* owned) @@ -107,12 +109,12 @@ static inline size_t hash(const char *str, pat_t *pat) // given definitions. If a result has been memoized, set *result to the // memoized value and return true, otherwise return false. // -static bool cache_get(cache_t *cache, def_t *defs, const char *str, pat_t *pat, match_t **result) +static bool cache_get(match_ctx_t *ctx, const char *str, pat_t *pat, match_t **result) { - if (!cache->matches) return false; - size_t h = hash(str, pat) & (cache->size-1); - for (match_t *c = cache->matches[h]; c; c = c->cache.next) { - if (c->pat == pat && c->defs_id == (defs?defs->id:0) && c->start == str) { + if (!ctx->cache->matches) return false; + size_t h = hash(str, pat) & (ctx->cache->size-1); + for (match_t *c = ctx->cache->matches[h]; c; c = c->cache.next) { + if (c->pat == pat && c->start == str) { // If c->end == NULL, that means no match occurs here *result = c->end == NULL ? NULL : c; return true; @@ -124,7 +126,7 @@ static bool cache_get(cache_t *cache, def_t *defs, const char *str, pat_t *pat, // // Remove an item from the cache. // -static void cache_remove(cache_t *cache, match_t *m) +static void cache_remove(match_ctx_t *ctx, match_t *m) { if (!m->cache.home) return; *m->cache.home = m->cache.next; @@ -132,25 +134,26 @@ static void cache_remove(cache_t *cache, match_t *m) m->cache.next = NULL; m->cache.home = NULL; if (--m->refcount == 0) recycle_if_unused(&m); - --cache->occupancy; + --ctx->cache->occupancy; } // // Save a match in the cache. // -static void cache_save(cache_t *cache, def_t *defs, const char *str, pat_t *pat, match_t *m) +static void cache_save(match_ctx_t *ctx, const char *str, pat_t *pat, match_t *m) { // As a convention, a match with {.pat=pat, .start=str, .end==NULL} is used // to memoize the fact that `pat` will *not* match at `str`. - if (m == NULL) m = new_match(defs, pat, str, NULL, NULL); + if (m == NULL) m = new_match(pat, str, NULL, NULL); + cache_t *cache = ctx->cache; if (cache->occupancy+1 > 3*cache->size) { if (cache->size == MAX_CACHE_SIZE) { size_t h = hash(m->start, m->pat) & (cache->size-1); for (int quota = 2; cache->matches[h] && quota > 0; quota--) { match_t *last = cache->matches[h]; while (last->cache.next) last = last->cache.next; - cache_remove(cache, last); + cache_remove(ctx, last); } } else { match_t **old_matches = cache->matches; @@ -186,11 +189,12 @@ static void cache_save(cache_t *cache, def_t *defs, const char *str, pat_t *pat, // // Clear and deallocate the cache. // -void cache_destroy(cache_t *cache) +void cache_destroy(match_ctx_t *ctx) { + cache_t *cache = ctx->cache; for (size_t i = 0; i < cache->size; i++) { while (cache->matches[i]) - cache_remove(cache, cache->matches[i]); + cache_remove(ctx, cache->matches[i]); } cache->occupancy = 0; if (cache->matches) delete(&cache->matches); @@ -198,15 +202,29 @@ void cache_destroy(cache_t *cache) } // +// Look up a pattern definition by name. +// +__attribute__((nonnull)) +pat_t *lookup(match_ctx_t *ctx, const char *name, size_t namelen) +{ + for (pat_t *def = ctx->defs; def; def = def->args.def.next_def) { + if (namelen == def->args.def.namelen && strncmp(def->args.def.name, name, namelen) == 0) + return def->args.def.meaning; + } + if (ctx->parent_ctx) return lookup(ctx->parent_ctx, name, namelen); + return NULL; +} + +// // If the given pattern is a reference, look it up and return the referenced // pattern. This is used for an optimization to avoid repeated lookups. // __attribute__((nonnull(1))) -static inline pat_t *deref(def_t *defs, pat_t *pat) +static inline pat_t *deref(match_ctx_t *ctx, pat_t *pat) { if (pat && pat->type == BP_REF) { - def_t *def = lookup(defs, pat->args.ref.len, pat->args.ref.name); - if (def) pat = def->pat; + pat_t *def = lookup(ctx, pat->args.ref.name, pat->args.ref.len); + if (def) return def; } return pat; } @@ -215,7 +233,7 @@ static inline pat_t *deref(def_t *defs, pat_t *pat) // Find and return the first and simplest pattern that will definitely have to // match for the whole pattern to match (if any) // -static pat_t *first_pat(def_t *defs, pat_t *pat) +static pat_t *first_pat(match_ctx_t *ctx, pat_t *pat) { for (pat_t *p = pat; p; ) { switch (p->type) { @@ -232,7 +250,7 @@ static pat_t *first_pat(def_t *defs, pat_t *pat) case BP_REPLACE: p = p->args.replace.pat; break; case BP_REF: { - pat_t *p2 = deref(defs, p); + pat_t *p2 = deref(ctx, p); if (p2 == p) return p2; p = p2; break; @@ -249,12 +267,14 @@ static pat_t *first_pat(def_t *defs, pat_t *pat) __attribute__((nonnull(1,2,3))) static match_t *_next_match(match_ctx_t *ctx, const char *str, pat_t *pat, pat_t *skip) { - // Definitions can be done once, ahead of time, instead of at each string position: - if (pat->type == BP_DEFINITION) { + if (pat->type == BP_DEFINITIONS || (pat->type == BP_CHAIN && pat->args.multiple.first->type == BP_DEFINITIONS)) { match_ctx_t ctx2 = *ctx; - ctx2.defs = with_def(ctx->defs, pat->args.def.namelen, pat->args.def.name, pat->args.def.def); - match_t *m = _next_match(&ctx2, str, pat->args.def.pat ? pat->args.def.pat : pat->args.def.def, skip); - free_defs(ctx2.defs, ctx->defs); + ctx2.cache = &(cache_t){0}; + ctx2.parent_ctx = ctx; + ctx2.defs = pat->type == BP_DEFINITIONS ? pat : pat->args.multiple.first; + pat_t *match_pat = pat->type == BP_DEFINITIONS ? pat->args.def.meaning : pat->args.multiple.second; + match_t *m = _next_match(&ctx2, str, match_pat, skip); + cache_destroy(&ctx2); return m; } @@ -263,12 +283,12 @@ static match_t *_next_match(match_ctx_t *ctx, const char *str, pat_t *pat, pat_t for (match_t *m = ctx->cache->matches[i], *next = NULL; m; m = next) { next = m->cache.next; if (m->start < ctx->start || (m->end ? m->end : m->start) > ctx->end) - cache_remove(ctx->cache, m); + cache_remove(ctx, m); } } - pat = deref(ctx->defs, pat); // Avoid repeated lookups - pat_t *first = first_pat(ctx->defs, pat); + pat = deref(ctx, pat); // Avoid repeated lookups + pat_t *first = first_pat(ctx, pat); // Don't bother looping if this can only match at the start: if (first->type == BP_START_OF_FILE) @@ -302,11 +322,13 @@ static match_t *_next_match(match_ctx_t *ctx, const char *str, pat_t *pat, pat_t static match_t *match(match_ctx_t *ctx, const char *str, pat_t *pat) { switch (pat->type) { - case BP_DEFINITION: { + case BP_DEFINITIONS: { match_ctx_t ctx2 = *ctx; - ctx2.defs = with_def(ctx->defs, pat->args.def.namelen, pat->args.def.name, pat->args.def.def); - match_t *m = match(&ctx2, str, pat->args.def.pat ? pat->args.def.pat : pat->args.def.def); - free_defs(ctx2.defs, ctx->defs); + ctx2.cache = &(cache_t){0}; + ctx2.parent_ctx = ctx; + ctx2.defs = pat; + match_t *m = match(&ctx2, str, pat->args.def.meaning); + cache_destroy(&ctx2); return m; } case BP_LEFTRECURSION: { @@ -323,41 +345,41 @@ static match_t *match(match_ctx_t *ctx, const char *str, pat_t *pat) } } case BP_ANYCHAR: { - return (str < ctx->end && *str != '\n') ? new_match(ctx->defs, pat, str, next_char(str, ctx->end), NULL) : NULL; + return (str < ctx->end && *str != '\n') ? new_match(pat, str, next_char(str, ctx->end), NULL) : NULL; } case BP_ID_START: { - return (str < ctx->end && isidstart(str, ctx->end)) ? new_match(ctx->defs, pat, str, next_char(str, ctx->end), NULL) : NULL; + return (str < ctx->end && isidstart(str, ctx->end)) ? new_match(pat, str, next_char(str, ctx->end), NULL) : NULL; } case BP_ID_CONTINUE: { - return (str < ctx->end && isidcontinue(str, ctx->end)) ? new_match(ctx->defs, pat, str, next_char(str, ctx->end), NULL) : NULL; + return (str < ctx->end && isidcontinue(str, ctx->end)) ? new_match(pat, str, next_char(str, ctx->end), NULL) : NULL; } case BP_START_OF_FILE: { - return (str == ctx->start) ? new_match(ctx->defs, pat, str, str, NULL) : NULL; + return (str == ctx->start) ? new_match(pat, str, str, NULL) : NULL; } case BP_START_OF_LINE: { - return (str == ctx->start || str[-1] == '\n') ? new_match(ctx->defs, pat, str, str, NULL) : NULL; + return (str == ctx->start || str[-1] == '\n') ? new_match(pat, str, str, NULL) : NULL; } case BP_END_OF_FILE: { - return (str == ctx->end || (str == ctx->end-1 && *str == '\n')) ? new_match(ctx->defs, pat, str, str, NULL) : NULL; + return (str == ctx->end || (str == ctx->end-1 && *str == '\n')) ? new_match(pat, str, str, NULL) : NULL; } case BP_END_OF_LINE: { - return (str == ctx->end || *str == '\n') ? new_match(ctx->defs, pat, str, str, NULL) : NULL; + return (str == ctx->end || *str == '\n') ? new_match(pat, str, str, NULL) : NULL; } case BP_WORD_BOUNDARY: { return (str == ctx->start || isidcontinue(str, ctx->end) != isidcontinue(prev_char(ctx->start, str), ctx->end)) ? - new_match(ctx->defs, pat, str, str, NULL) : NULL; + new_match(pat, str, str, NULL) : NULL; } case BP_STRING: { if (&str[pat->min_matchlen] > ctx->end) return NULL; if (pat->min_matchlen > 0 && (ctx->ignorecase ? strncasecmp : strncmp)(str, pat->args.string, pat->min_matchlen) != 0) return NULL; - return new_match(ctx->defs, pat, str, str + pat->min_matchlen, NULL); + return new_match(pat, str, str + pat->min_matchlen, NULL); } case BP_RANGE: { if (str >= ctx->end) return NULL; if ((unsigned char)*str < pat->args.range.low || (unsigned char)*str > pat->args.range.high) return NULL; - return new_match(ctx->defs, pat, str, str+1, NULL); + return new_match(pat, str, str+1, NULL); } case BP_NOT: { match_t *m = match(ctx, str, pat->args.pat); @@ -365,12 +387,12 @@ static match_t *match(match_ctx_t *ctx, const char *str, pat_t *pat) recycle_if_unused(&m); return NULL; } - return new_match(ctx->defs, pat, str, str, NULL); + return new_match(pat, str, str, NULL); } case BP_UPTO: case BP_UPTO_STRICT: { - match_t *m = new_match(ctx->defs, pat, str, str, NULL); - pat_t *target = deref(ctx->defs, pat->args.multiple.first), - *skip = deref(ctx->defs, pat->args.multiple.second); + match_t *m = new_match(pat, str, str, NULL); + pat_t *target = deref(ctx, pat->args.multiple.first), + *skip = deref(ctx, pat->args.multiple.second); if (!target && !skip) { while (str < ctx->end && *str != '\n') ++str; m->end = str; @@ -413,11 +435,11 @@ static match_t *match(match_ctx_t *ctx, const char *str, pat_t *pat) return NULL; } case BP_REPEAT: { - match_t *m = new_match(ctx->defs, pat, str, str, NULL); + match_t *m = new_match(pat, str, str, NULL); size_t reps = 0; ssize_t max = pat->args.repetitions.max; - pat_t *repeating = deref(ctx->defs, pat->args.repetitions.repeat_pat); - pat_t *sep = deref(ctx->defs, pat->args.repetitions.sep); + pat_t *repeating = deref(ctx, pat->args.repetitions.repeat_pat); + pat_t *sep = deref(ctx, pat->args.repetitions.sep); size_t child_cap = 0, nchildren = 0; for (reps = 0; max == -1 || reps < (size_t)max; ++reps) { const char *start = str; @@ -473,7 +495,7 @@ static match_t *match(match_ctx_t *ctx, const char *str, pat_t *pat) return m; } case BP_AFTER: { - pat_t *back = deref(ctx->defs, pat->args.pat); + pat_t *back = deref(ctx, pat->args.pat); if (!back) return NULL; // We only care about the region from the backtrack pos up to the @@ -487,37 +509,47 @@ static match_t *match(match_ctx_t *ctx, const char *str, pat_t *pat) for (const char *pos = &str[-(long)back->min_matchlen]; pos >= ctx->start && (back->max_matchlen == -1 || pos >= &str[-(int)back->max_matchlen]); pos = prev_char(ctx->start, pos)) { - cache_destroy(slice_ctx.cache); + cache_destroy(&slice_ctx); slice_ctx.start = (char*)pos; match_t *m = match(&slice_ctx, pos, back); // Match should not go past str (i.e. (<"AB" "B") should match "ABB", but not "AB") if (m && m->end != str) recycle_if_unused(&m); else if (m) { - cache_destroy(slice_ctx.cache); - return new_match(ctx->defs, pat, str, str, MATCHES(m)); + cache_destroy(&slice_ctx); + return new_match(pat, str, str, MATCHES(m)); } if (pos == ctx->start) break; // To prevent extreme performance degradation, don't keep // walking backwards endlessly over newlines. if (back->max_matchlen == -1 && *pos == '\n') break; } - cache_destroy(slice_ctx.cache); + cache_destroy(&slice_ctx); return NULL; } case BP_BEFORE: { match_t *after = match(ctx, str, pat->args.pat); - return after ? new_match(ctx->defs, pat, str, str, MATCHES(after)) : NULL; + return after ? new_match(pat, str, str, MATCHES(after)) : NULL; } case BP_CAPTURE: { match_t *p = match(ctx, str, pat->args.pat); - return p ? new_match(ctx->defs, pat, str, p->end, MATCHES(p)) : NULL; + return p ? new_match(pat, str, p->end, MATCHES(p)) : NULL; } case BP_OTHERWISE: { match_t *m = match(ctx, str, pat->args.multiple.first); return m ? m : match(ctx, str, pat->args.multiple.second); } case BP_CHAIN: { + if (pat->args.multiple.first->type == BP_DEFINITIONS) { + match_ctx_t ctx2 = *ctx; + ctx2.cache = &(cache_t){0}; + ctx2.parent_ctx = ctx; + ctx2.defs = pat->args.multiple.first; + match_t *m = match(&ctx2, str, pat->args.multiple.second); + cache_destroy(&ctx2); + return m; + } + match_t *m1 = match(ctx, str, pat->args.multiple.first); if (m1 == NULL) return NULL; @@ -528,13 +560,25 @@ static match_t *match(match_ctx_t *ctx, const char *str, pat_t *pat) // exact string of the original match (no replacements) pat_t *backref = bp_raw_literal(m1->start, (size_t)(m1->end - m1->start)); match_ctx_t ctx2 = *ctx; - ctx2.defs = with_def(ctx->defs, m1->pat->args.capture.namelen, m1->pat->args.capture.name, backref); + ctx2.cache = &(cache_t){0}; + ctx2.parent_ctx = ctx; + ctx2.defs = &(pat_t){ + .type = BP_DEFINITIONS, + .start = m1->pat->start, .end = m1->pat->end, + .args = { + .def = { + .name = m1->pat->args.capture.name, + .namelen = m1->pat->args.capture.namelen, + .meaning = backref, + } + }, + }; ++m1->refcount; { m2 = match(&ctx2, m1->end, pat->args.multiple.second); if (!m2) // No need to keep the backref in memory if it didn't match delete_pat(&backref, false); - free_defs(ctx2.defs, ctx->defs); } --m1->refcount; + cache_destroy(&ctx2); } else { m2 = match(ctx, m1->end, pat->args.multiple.second); } @@ -544,7 +588,7 @@ static match_t *match(match_ctx_t *ctx, const char *str, pat_t *pat) return NULL; } - return new_match(ctx->defs, pat, str, m2->end, MATCHES(m1, m2)); + return new_match(pat, str, m2->end, MATCHES(m1, m2)); } case BP_MATCH: case BP_NOT_MATCH: { match_t *m1 = match(ctx, str, pat->args.multiple.first); @@ -558,13 +602,13 @@ static match_t *match(match_ctx_t *ctx, const char *str, pat_t *pat) slice_ctx.end = m1->end; match_t *m2 = _next_match(&slice_ctx, slice_ctx.start, pat->args.multiple.second, NULL); if ((!m2 && pat->type == BP_MATCH) || (m2 && pat->type == BP_NOT_MATCH)) { - cache_destroy(slice_ctx.cache); + cache_destroy(&slice_ctx); if (m2) recycle_if_unused(&m2); recycle_if_unused(&m1); return NULL; } - match_t *ret = new_match(ctx->defs, pat, m1->start, m1->end, (pat->type == BP_MATCH) ? MATCHES(m1, m2) : MATCHES(m1)); - cache_destroy(slice_ctx.cache); + match_t *ret = new_match(pat, m1->start, m1->end, (pat->type == BP_MATCH) ? MATCHES(m1, m2) : MATCHES(m1)); + cache_destroy(&slice_ctx); return ret; } case BP_REPLACE: { @@ -573,17 +617,16 @@ static match_t *match(match_ctx_t *ctx, const char *str, pat_t *pat) p = match(ctx, str, pat->args.replace.pat); if (p == NULL) return NULL; } - return new_match(ctx->defs, pat, str, p ? p->end : str, MATCHES(p)); + return new_match(pat, str, p ? p->end : str, MATCHES(p)); } case BP_REF: { match_t *cached; - if (cache_get(ctx->cache, ctx->defs, str, pat, &cached)) + if (cache_get(ctx, str, pat, &cached)) return cached; - def_t *def = lookup(ctx->defs, pat->args.ref.len, pat->args.ref.name); - if (def == NULL) + pat_t *ref = lookup(ctx, pat->args.ref.name, pat->args.ref.len); + if (ref == NULL) errx(EXIT_FAILURE, "Unknown identifier: '%.*s'", (int)pat->args.ref.len, pat->args.ref.name); - pat_t *ref = def->pat; pat_t rec_op = { .type = BP_LEFTRECURSION, @@ -599,17 +642,23 @@ static match_t *match(match_ctx_t *ctx, const char *str, pat_t *pat) }, }; match_ctx_t ctx2 = *ctx; - ctx2.defs = &(def_t){ - .namelen = def->namelen, - .name = def->name, - .pat = &rec_op, - .next = ctx->defs, + ctx2.parent_ctx = ctx; + ctx2.defs = &(pat_t){ + .type = BP_DEFINITIONS, + .start = pat->start, .end = pat->end, + .args = { + .def = { + .name = pat->args.ref.name, + .namelen = pat->args.ref.len, + .meaning = &rec_op, + } + }, }; const char *prev = str; match_t *m = match(&ctx2, str, ref); if (m == NULL) { - cache_save(ctx->cache, ctx->defs, str, pat, NULL); + cache_save(ctx, str, pat, NULL); return NULL; } @@ -632,8 +681,8 @@ static match_t *match(match_ctx_t *ctx, const char *str, pat_t *pat) // leftrec.match is GC'd. It also helps with visualization of match // results. // OPTIMIZE: remove this if necessary - match_t *wrap = new_match(ctx->defs, pat, m->start, m->end, MATCHES(m)); - cache_save(ctx->cache, ctx->defs, str, pat, wrap); + match_t *wrap = new_match(pat, m->start, m->end, MATCHES(m)); + cache_save(ctx, str, pat, wrap); if (rec_op.args.leftrec.match) remove_ownership(&rec_op.args.leftrec.match); @@ -659,7 +708,7 @@ static match_t *match(match_ctx_t *ctx, const char *str, pat_t *pat) for (int i = 0; i < dents; i++) if (&str[i] >= ctx->end || str[i] != denter) return NULL; - return new_match(ctx->defs, pat, start, &str[dents], NULL); + return new_match(pat, start, &str[dents], NULL); } default: { errx(EXIT_FAILURE, "Unknown pattern type: %u", pat->type); @@ -671,7 +720,7 @@ static match_t *match(match_ctx_t *ctx, const char *str, pat_t *pat) // // Return a match object which can be used (may be allocated or recycled). // -match_t *new_match(def_t *defs, pat_t *pat, const char *start, const char *end, match_t *children[]) +match_t *new_match(pat_t *pat, const char *start, const char *end, match_t *children[]) { match_t *m; if (unused_matches) { @@ -687,7 +736,6 @@ match_t *new_match(def_t *defs, pat_t *pat, const char *start, const char *end, m->pat = pat; m->start = start; m->end = end; - m->defs_id = (defs?defs->id:0); if (children) { for (int i = 0; children[i]; i++) @@ -760,14 +808,15 @@ size_t free_all_matches(void) // Iterate over matches. // Usage: for (match_t *m = NULL; next_match(&m, ...); ) {...} // -bool next_match(match_t **m, def_t *defs, const char *start, const char *end, pat_t *pat, pat_t *skip, bool ignorecase) +bool next_match(match_t **m, const char *start, const char *end, pat_t *pat, pat_t *skip, bool ignorecase) { static cache_t cache = {0}; - if (!pat) { // Cleanup for stop_matching() - recycle_if_unused(m); - cache_destroy(&cache); - return false; - } + match_ctx_t ctx = { + .cache = &cache, + .start = start, + .end = end, + .ignorecase = ignorecase, + }; const char *pos; if (*m) { @@ -776,18 +825,12 @@ bool next_match(match_t **m, def_t *defs, const char *start, const char *end, pa recycle_if_unused(m); } else { pos = start; - cache_destroy(&cache); + cache_destroy(&ctx); } - match_ctx_t ctx = { - .defs = defs, - .cache = &cache, - .start = start, - .end = end, - .ignorecase = ignorecase, - }; + if (!pat) return false; *m = (pos <= end) ? _next_match(&ctx, pos, pat, skip) : NULL; - if (!*m) cache_destroy(&cache); + if (!*m) cache_destroy(&ctx); return *m != NULL; } @@ -8,7 +8,6 @@ #include <stdio.h> #include "pattern.h" -#include "definitions.h" struct match_s; // forward declared to resolve circular struct defs @@ -25,7 +24,6 @@ typedef struct match_s { pat_t *pat; // Intrusive linked list nodes for garbage collection and cache buckets: match_dll_t gc, cache; - size_t defs_id; int refcount; struct match_s **children; struct match_s *_children[3]; @@ -37,14 +35,12 @@ typedef struct { void (*on_nl)(FILE *out); } print_options_t; -__attribute__((returns_nonnull)) -match_t *new_match(def_t *defs, pat_t *pat, const char *start, const char *end, match_t *children[]); __attribute__((nonnull)) void recycle_if_unused(match_t **at_m); size_t free_all_matches(void); size_t recycle_all_matches(void); -bool next_match(match_t **m, def_t *defs, const char *start, const char *end, pat_t *pat, pat_t *skip, bool ignorecase); -#define stop_matching(m) next_match(m, NULL, NULL, NULL, NULL, NULL, 0) +bool next_match(match_t **m, const char *start, const char *end, pat_t *pat, pat_t *skip, bool ignorecase); +#define stop_matching(m) next_match(m, NULL, NULL, NULL, NULL, 0) __attribute__((nonnull)) match_t *get_numbered_capture(match_t *m, int n); __attribute__((nonnull, pure)) @@ -167,6 +167,15 @@ pat_t *chain_together(pat_t *first, pat_t *second) { if (first == NULL) return second; if (second == NULL) return first; + + if (first->type == BP_DEFINITIONS && second->type == BP_DEFINITIONS) { + pat_t *first_end = first; + while (first_end->args.def.next_def != NULL) + first_end = first_end->args.def.next_def; + first_end->args.def.next_def = second; + return first; + } + size_t minlen = first->min_matchlen + second->min_matchlen; ssize_t maxlen = (UNBOUNDED(first) || UNBOUNDED(second)) ? (ssize_t)-1 : first->max_matchlen + second->max_matchlen; pat_t *chain = new_pat(BP_CHAIN, first->start, second->end, minlen, maxlen); @@ -210,6 +219,30 @@ pat_t *either_pat(pat_t *first, pat_t *second) } // +// Parse a definition +// +__attribute__((nonnull)) +static pat_t *_bp_definition(const char *start, const char *end) +{ + if (start >= end || !(isalpha(*start) || *start == '_')) return NULL; + const char *str = after_name(start); + size_t namelen = (size_t)(str - start); + if (!matchchar(&str, ':', false)) return NULL; + pat_t *def = bp_pattern_nl(str, end, false); + if (!def) parse_err(str, end, "Could not parse this definition."); + str = def->end; + (void)matchchar(&str, ';', false); // Optional semicolon + pat_t *ret = new_pat(BP_DEFINITIONS, start, str, 0, -1); + ret->args.def.name = start; + ret->args.def.namelen = namelen; + ret->args.def.meaning = def; + ret->args.def.next_def = _bp_definition(after_spaces(str, true), end); + if (ret->args.def.next_def) + ret->end = ret->args.def.next_def->end; + return ret; +} + +// // Compile a string of BP code into a BP pattern object. // __attribute__((nonnull)) @@ -475,26 +508,12 @@ static pat_t *_bp_simplepattern(const char *str, const char *end) return new_pat(BP_END_OF_LINE, start, str, 0, 0); } default: { + pat_t *def = _bp_definition(start, end); + if (def) return def; // Reference if (!isalpha(c) && c != '_') return NULL; str = after_name(start); size_t namelen = (size_t)(str - start); - if (matchchar(&str, ':', false)) { // Definitions - pat_t *def = bp_pattern_nl(str, end, false); - if (!def) parse_err(str, end, "Could not parse this definition."); - str = def->end; - (void)matchchar(&str, ';', false); // Optional semicolon - str = after_spaces(str, true); - pat_t *pat = bp_pattern_nl(str, end, false); - if (pat) str = pat->end; - else pat = def; - pat_t *ret = new_pat(BP_DEFINITION, start, str, pat->min_matchlen, pat->max_matchlen); - ret->args.def.name = start; - ret->args.def.namelen = namelen; - ret->args.def.def = def; - ret->args.def.pat = pat; - return ret; - } pat_t *ref = new_pat(BP_REF, start, str, 0, -1); ref->args.ref.name = start; ref->args.ref.len = namelen; @@ -653,9 +672,9 @@ void delete_pat(pat_t **at_pat, bool recursive) if (recursive) { switch (pat->type) { - case BP_DEFINITION: - delete_pat(&pat->args.def.def, true); - delete_pat(&pat->args.def.pat, true); + case BP_DEFINITIONS: + delete_pat(&pat->args.def.meaning, true); + delete_pat(&pat->args.def.next_def, true); break; case BP_REPEAT: delete_pat(&pat->args.repetitions.sep, true); @@ -34,7 +34,7 @@ enum pattype_e { BP_END_OF_FILE = 22, BP_END_OF_LINE = 23, BP_WORD_BOUNDARY = 24, - BP_DEFINITION = 25, + BP_DEFINITIONS = 25, BP_LEFTRECURSION = 26, }; @@ -57,7 +57,7 @@ typedef struct pat_s { struct { const char *name; size_t namelen; - struct pat_s *def, *pat; + struct pat_s *meaning, *next_def; } def; struct { unsigned char low, high; |
