aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBruce Hill <bruce@bruce-hill.com>2021-09-27 20:36:10 -0700
committerBruce Hill <bruce@bruce-hill.com>2021-09-27 20:36:10 -0700
commit911827fac3a53734c9a4a99c1b8ec2a689bc59d8 (patch)
tree5ac9e27f065b66ad613fbcac21c95f8b64706310
parenta96284615b27226f4d34de8dfa7235f0c14ac1d4 (diff)
Removed definitions as a separate type and instead encode that value in
the patterns themselves. This simplifies memory management a lot and speeds up performance.
-rw-r--r--Lua/Makefile2
-rw-r--r--Lua/lbp.c12
-rw-r--r--Makefile2
-rw-r--r--bp.c66
-rw-r--r--definitions.c55
-rw-r--r--definitions.h28
-rw-r--r--match.c229
-rw-r--r--match.h8
-rw-r--r--pattern.c57
-rw-r--r--pattern.h4
10 files changed, 219 insertions, 244 deletions
diff --git a/Lua/Makefile b/Lua/Makefile
index c729821..5be2cb4 100644
--- a/Lua/Makefile
+++ b/Lua/Makefile
@@ -39,7 +39,7 @@ clean:
lbp.o: lbp.c builtins.h
$(CC) -c $(ALL_FLAGS) -o $@ $<
-bp.so: lbp.o ../pattern.o ../utils.o ../utf8.o ../match.o ../definitions.o
+bp.so: lbp.o ../pattern.o ../utils.o ../utf8.o ../match.o
$(MAKESO) -o $@ $^
builtins.h: ../grammars/builtins.bp
diff --git a/Lua/lbp.c b/Lua/lbp.c
index 269e851..341aa81 100644
--- a/Lua/lbp.c
+++ b/Lua/lbp.c
@@ -26,7 +26,7 @@ static const char *builtins_source = (
#include "builtins.h"
);
static int MATCH_METATABLE = 0, PAT_METATABLE = 0;
-static def_t *builtins;
+static pat_t *builtins;
static void push_match(lua_State *L, match_t *m, const char *start);
@@ -133,11 +133,13 @@ static int Lmatch(lua_State *L)
match_t *m = NULL;
int ret = 0;
- if (next_match(&m, builtins, text+index-1, &text[textlen], pat, NULL, false)) {
+ pat_t *def_pat = chain_together(builtins, pat);
+ if (next_match(&m, text+index-1, &text[textlen], def_pat, NULL, false)) {
push_match(L, m, text);
stop_matching(&m);
ret = 1;
}
+ delete_pat(&def_pat, false);
return ret;
}
@@ -170,7 +172,8 @@ static int Lreplace(lua_State *L)
FILE *out = open_memstream(&buf, &size);
int replacements = 0;
const char *prev = text;
- for (match_t *m = NULL; next_match(&m, builtins, text, &text[textlen], maybe_replacement.value.pat, NULL, false); ) {
+ pat_t *rep_pat = chain_together(builtins, maybe_replacement.value.pat);
+ for (match_t *m = NULL; next_match(&m, text, &text[textlen], rep_pat, NULL, false); ) {
fwrite(prev, sizeof(char), (size_t)(m->start - prev), out);
fprint_match(out, text, m, NULL);
prev = m->end;
@@ -329,8 +332,7 @@ LUALIB_API int luaopen_bp(lua_State *L)
raise_parse_error(L, maybe_pat);
return 0;
}
- for (pat_t *p = maybe_pat.value.pat; p && p->type == BP_DEFINITION; p = p->args.def.pat)
- builtins = with_def(builtins, p->args.def.namelen, p->args.def.name, p->args.def.def);
+ builtins = maybe_pat.value.pat;
lua_pushlightuserdata(L, (void*)&PAT_METATABLE);
luaL_newlib(L, pat_metamethods);
diff --git a/Makefile b/Makefile
index 7cbe0b9..f54e8af 100644
--- a/Makefile
+++ b/Makefile
@@ -22,7 +22,7 @@ G=
O=-O3
ALL_FLAGS=$(CFLAGS) $(OSFLAGS) -DBP_NAME="\"$(NAME)\"" $(EXTRA) $(CWARN) $(G) $(O)
-CFILES=pattern.c definitions.c utils.c match.c files.c explain.c json.c utf8.c
+CFILES=pattern.c utils.c match.c files.c explain.c json.c utf8.c
OBJFILES=$(CFILES:.c=.o)
all: $(NAME) bp.1
diff --git a/bp.c b/bp.c
index 73b0469..7df5ada 100644
--- a/bp.c
+++ b/bp.c
@@ -20,7 +20,6 @@
#include <sys/wait.h>
#include <unistd.h>
-#include "definitions.h"
#include "explain.h"
#include "files.h"
#include "json.h"
@@ -207,10 +206,10 @@ static int is_text_file(const char *filename)
//
// Print matches in JSON format.
//
-static int print_matches_as_json(def_t *defs, file_t *f, pat_t *pattern)
+static int print_matches_as_json(file_t *f, pat_t *pattern)
{
int nmatches = 0;
- for (match_t *m = NULL; next_match(&m, defs, f->start, f->end, pattern, options.skip, options.ignorecase); ) {
+ for (match_t *m = NULL; next_match(&m, f->start, f->end, pattern, options.skip, options.ignorecase); ) {
if (++nmatches > 1)
printf(",\n");
printf("{\"filename\":\"%s\",\"match\":", f->filename);
@@ -223,10 +222,10 @@ static int print_matches_as_json(def_t *defs, file_t *f, pat_t *pattern)
//
// Print matches in a visual explanation style
//
-static int explain_matches(def_t *defs, file_t *f, pat_t *pattern)
+static int explain_matches(file_t *f, pat_t *pattern)
{
int nmatches = 0;
- for (match_t *m = NULL; next_match(&m, defs, f->start, f->end, pattern, options.skip, options.ignorecase); ) {
+ for (match_t *m = NULL; next_match(&m, f->start, f->end, pattern, options.skip, options.ignorecase); ) {
if (++nmatches == 1) {
if (options.print_filenames)
fprint_filename(stdout, f->filename);
@@ -349,7 +348,7 @@ static void on_nl(FILE *out)
//
// Print all the matches in a file.
//
-static int print_matches(FILE *out, def_t *defs, file_t *f, pat_t *pattern)
+static int print_matches(FILE *out, file_t *f, pat_t *pattern)
{
static int printed_filenames = 0;
int matches = 0;
@@ -364,7 +363,7 @@ static int print_matches(FILE *out, def_t *defs, file_t *f, pat_t *pattern)
print_opts.replace_color = "\033[0;34;1m";
print_opts.normal_color = "\033[m";
}
- for (match_t *m = NULL; next_match(&m, defs, f->start, f->end, pattern, options.skip, options.ignorecase); ) {
+ for (match_t *m = NULL; next_match(&m, f->start, f->end, pattern, options.skip, options.ignorecase); ) {
if (++matches == 1 && options.print_filenames) {
if (printed_filenames++ > 0) printf("\n");
fprint_filename(out, f->filename);
@@ -388,8 +387,8 @@ static int print_matches(FILE *out, def_t *defs, file_t *f, pat_t *pattern)
// For a given filename, open the file and attempt to match the given pattern
// against it, printing any results according to the flags.
//
-__attribute__((nonnull(2,3)))
-static int process_file(def_t *defs, const char *filename, pat_t *pattern)
+__attribute__((nonnull))
+static int process_file(const char *filename, pat_t *pattern)
{
file_t *f = load_file(NULL, filename);
if (f == NULL) {
@@ -399,19 +398,19 @@ static int process_file(def_t *defs, const char *filename, pat_t *pattern)
int matches = 0;
if (options.mode == MODE_EXPLAIN) {
- matches += explain_matches(defs, f, pattern);
+ matches += explain_matches(f, pattern);
} else if (options.mode == MODE_LISTFILES) {
match_t *m = NULL;
- if (next_match(&m, defs, f->start, f->end, pattern, options.skip, options.ignorecase)) {
+ if (next_match(&m, f->start, f->end, pattern, options.skip, options.ignorecase)) {
printf("%s\n", f->filename);
matches += 1;
}
stop_matching(&m);
} else if (options.mode == MODE_JSON) {
- matches += print_matches_as_json(defs, f, pattern);
+ matches += print_matches_as_json(f, pattern);
} else if (options.mode == MODE_INPLACE) {
match_t *m = NULL;
- bool found = next_match(&m, defs, f->start, f->end, pattern, options.skip, options.ignorecase);
+ bool found = next_match(&m, f->start, f->end, pattern, options.skip, options.ignorecase);
stop_matching(&m);
if (!found) return 0;
@@ -427,12 +426,12 @@ static int process_file(def_t *defs, const char *filename, pat_t *pattern)
// are used to restore the original file contents.
modifying_file = out; backup_file = f;
{
- matches += print_matches(out, defs, f, pattern);
+ matches += print_matches(out, f, pattern);
}
modifying_file = NULL; backup_file = NULL;
fclose(out);
} else {
- matches += print_matches(stdout, defs, f, pattern);
+ matches += print_matches(stdout, f, pattern);
}
fflush(stdout);
@@ -446,8 +445,8 @@ static int process_file(def_t *defs, const char *filename, pat_t *pattern)
//
// Recursively process all non-dotfile files in the given directory.
//
-__attribute__((nonnull(2,3)))
-static int process_dir(def_t *defs, const char *dirname, pat_t *pattern)
+__attribute__((nonnull))
+static int process_dir(const char *dirname, pat_t *pattern)
{
int matches = 0;
glob_t globbuf;
@@ -464,9 +463,9 @@ static int process_dir(def_t *defs, const char *dirname, pat_t *pattern)
if (S_ISLNK(statbuf.st_mode))
continue; // Skip symbolic links
else if (S_ISDIR(statbuf.st_mode))
- matches += process_dir(defs, globbuf.gl_pathv[i], pattern);
+ matches += process_dir(globbuf.gl_pathv[i], pattern);
else if (is_text_file(globbuf.gl_pathv[i]))
- matches += process_file(defs, globbuf.gl_pathv[i], pattern);
+ matches += process_file(globbuf.gl_pathv[i], pattern);
}
}
globfree(&globbuf);
@@ -476,8 +475,8 @@ static int process_dir(def_t *defs, const char *dirname, pat_t *pattern)
//
// Process git files using `git ls-files ...`
//
-__attribute__((nonnull(2)))
-static int process_git_files(def_t *defs, pat_t *pattern, int argc, char *argv[])
+__attribute__((nonnull(1)))
+static int process_git_files(pat_t *pattern, int argc, char *argv[])
{
int fds[2];
require(pipe(fds), "Failed to create pipe");
@@ -500,7 +499,7 @@ static int process_git_files(def_t *defs, pat_t *pattern, int argc, char *argv[]
size_t path_size = 0;
int found = 0;
while (getdelim(&path, &path_size, '\0', fp) > 0)
- found += process_file(defs, path, pattern);
+ found += process_file(path, pattern);
if (path) delete(&path);
require(fclose(fp), "Failed to close read end of pipe");
int status;
@@ -514,12 +513,9 @@ static int process_git_files(def_t *defs, pat_t *pattern, int argc, char *argv[]
// Load the given grammar (semicolon-separated definitions)
// and return the first rule defined.
//
-static def_t *load_grammar(def_t *defs, file_t *f)
+static pat_t *load_grammar(pat_t *defs, file_t *f)
{
- pat_t *pat = assert_pat(f->start, bp_pattern(f->start, f->end));
- for (pat_t *p = pat; p && p->type == BP_DEFINITION; p = p->args.def.pat)
- defs = with_def(defs, p->args.def.namelen, p->args.def.name, p->args.def.def);
- return defs;
+ return chain_together(defs, assert_pat(f->start, bp_pattern(f->start, f->end)));
}
//
@@ -539,7 +535,7 @@ int main(int argc, char *argv[])
{
char *flag = NULL;
- def_t *defs = NULL;
+ pat_t *defs = NULL;
file_t *loaded_files = NULL;
pat_t *pattern = NULL;
@@ -634,6 +630,9 @@ int main(int argc, char *argv[])
if (pattern == NULL)
errx(EXIT_FAILURE, "No pattern provided.\n\n%s", usage);
+ // Hook up definitions:
+ pattern = chain_together(defs, pattern);
+
for (argc = 0; argv[argc]; ++argc) ; // update argc
if (options.context_before == USE_DEFAULT_CONTEXT) options.context_before = 0;
@@ -655,7 +654,7 @@ int main(int argc, char *argv[])
int found = 0;
if (options.mode == MODE_JSON) printf("[");
if (options.git_mode) { // Get the list of files from `git --ls-files ...`
- found = process_git_files(defs, pattern, argc, argv);
+ found = process_git_files(pattern, argc, argv);
} else if (argv[0]) {
// Files pass in as command line args:
struct stat statbuf;
@@ -663,17 +662,17 @@ int main(int argc, char *argv[])
options.print_filenames = false;
for ( ; argv[0]; argv++) {
if (stat(argv[0], &statbuf) == 0 && S_ISDIR(statbuf.st_mode)) // Symlinks are okay if manually specified
- found += process_dir(defs, argv[0], pattern);
+ found += process_dir(argv[0], pattern);
else
- found += process_file(defs, argv[0], pattern);
+ found += process_file(argv[0], pattern);
}
} else if (isatty(STDIN_FILENO)) {
// No files, no piped in input, so use files in current dir, recursively
- found += process_dir(defs, ".", pattern);
+ found += process_dir(".", pattern);
} else {
// Piped in input:
options.print_filenames = false; // Don't print filename on stdin
- found += process_file(defs, "", pattern);
+ found += process_file("", pattern);
}
if (options.mode == MODE_JSON) printf("]\n");
@@ -681,7 +680,6 @@ int main(int argc, char *argv[])
// is about to exit, this step is unnecessary. However, it is useful for
// tracking down memory leaks.
free_all_matches();
- defs = free_defs(defs, NULL);
free_all_pats();
while (loaded_files) {
file_t *next = loaded_files->next;
diff --git a/definitions.c b/definitions.c
deleted file mode 100644
index 590a801..0000000
--- a/definitions.c
+++ /dev/null
@@ -1,55 +0,0 @@
-//
-// definitions.c - Code for defining named pattern rules
-//
-
-#include <err.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "definitions.h"
-#include "pattern.h"
-#include "utils.h"
-
-static size_t next_id = 0;
-
-//
-// Return a new list of definitions with one added to the front
-//
-def_t *with_def(def_t *defs, size_t namelen, const char *name, pat_t *pat)
-{
- def_t *def = new(def_t);
- def->id = next_id++;
- def->next = defs;
- def->namelen = namelen;
- def->name = name;
- def->pat = pat;
- return def;
-}
-
-//
-// Look up a backreference or grammar definition by name
-//
-def_t *lookup(def_t *defs, size_t namelen, const char *name)
-{
- for ( ; defs; defs = defs->next) {
- if (namelen == defs->namelen && strncmp(defs->name, name, namelen) == 0)
- return defs;
- }
- return NULL;
-}
-
-//
-// Free all the given definitions up till (but not including) `stop`
-//
-def_t *free_defs(def_t *defs, def_t *stop)
-{
- while (defs != stop && defs != NULL) {
- def_t *next = defs->next;
- defs->next = NULL;
- delete(&defs);
- defs = next;
- }
- return defs;
-}
-
-// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0
diff --git a/definitions.h b/definitions.h
deleted file mode 100644
index 34e4344..0000000
--- a/definitions.h
+++ /dev/null
@@ -1,28 +0,0 @@
-//
-// definitions.h - Header file defining pattern rules
-//
-#ifndef DEFINITIONS__H
-#define DEFINITIONS__H
-
-#include "pattern.h"
-
-//
-// Pattern matching rule definition(s)
-//
-typedef struct def_s {
- size_t id;
- size_t namelen;
- const char *name;
- pat_t *pat;
- struct def_s *next;
-} def_t;
-
-__attribute__((nonnull(3,4), returns_nonnull))
-def_t *with_def(def_t *defs, size_t namelen, const char *name, pat_t *pat);
-__attribute__((pure, nonnull(3)))
-def_t *lookup(def_t *defs, size_t namelen, const char *name);
-__attribute__((nonnull(1)))
-def_t *free_defs(def_t *defs, def_t *stop);
-
-#endif
-// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0
diff --git a/match.c b/match.c
index 5ae9f0f..b61208a 100644
--- a/match.c
+++ b/match.c
@@ -10,7 +10,6 @@
#include <stdlib.h>
#include <string.h>
-#include "definitions.h"
#include "match.h"
#include "pattern.h"
#include "utils.h"
@@ -25,8 +24,9 @@ typedef struct {
} cache_t;
// Data structure for various ambient state for matching
-typedef struct {
- def_t *defs;
+typedef struct match_ctx_s {
+ struct match_ctx_s *parent_ctx;
+ pat_t *defs;
cache_t *cache;
const char *start, *end;
bool ignorecase;
@@ -45,6 +45,8 @@ static match_t *in_use_matches = NULL;
__attribute__((hot, nonnull(1,2,3)))
static match_t *match(match_ctx_t *ctx, const char *str, pat_t *pat);
+__attribute__((returns_nonnull))
+static match_t *new_match(pat_t *pat, const char *start, const char *end, match_t *children[]);
// Store a value and update its refcount
static inline void add_owner(match_t** owner, match_t* owned)
@@ -107,12 +109,12 @@ static inline size_t hash(const char *str, pat_t *pat)
// given definitions. If a result has been memoized, set *result to the
// memoized value and return true, otherwise return false.
//
-static bool cache_get(cache_t *cache, def_t *defs, const char *str, pat_t *pat, match_t **result)
+static bool cache_get(match_ctx_t *ctx, const char *str, pat_t *pat, match_t **result)
{
- if (!cache->matches) return false;
- size_t h = hash(str, pat) & (cache->size-1);
- for (match_t *c = cache->matches[h]; c; c = c->cache.next) {
- if (c->pat == pat && c->defs_id == (defs?defs->id:0) && c->start == str) {
+ if (!ctx->cache->matches) return false;
+ size_t h = hash(str, pat) & (ctx->cache->size-1);
+ for (match_t *c = ctx->cache->matches[h]; c; c = c->cache.next) {
+ if (c->pat == pat && c->start == str) {
// If c->end == NULL, that means no match occurs here
*result = c->end == NULL ? NULL : c;
return true;
@@ -124,7 +126,7 @@ static bool cache_get(cache_t *cache, def_t *defs, const char *str, pat_t *pat,
//
// Remove an item from the cache.
//
-static void cache_remove(cache_t *cache, match_t *m)
+static void cache_remove(match_ctx_t *ctx, match_t *m)
{
if (!m->cache.home) return;
*m->cache.home = m->cache.next;
@@ -132,25 +134,26 @@ static void cache_remove(cache_t *cache, match_t *m)
m->cache.next = NULL;
m->cache.home = NULL;
if (--m->refcount == 0) recycle_if_unused(&m);
- --cache->occupancy;
+ --ctx->cache->occupancy;
}
//
// Save a match in the cache.
//
-static void cache_save(cache_t *cache, def_t *defs, const char *str, pat_t *pat, match_t *m)
+static void cache_save(match_ctx_t *ctx, const char *str, pat_t *pat, match_t *m)
{
// As a convention, a match with {.pat=pat, .start=str, .end==NULL} is used
// to memoize the fact that `pat` will *not* match at `str`.
- if (m == NULL) m = new_match(defs, pat, str, NULL, NULL);
+ if (m == NULL) m = new_match(pat, str, NULL, NULL);
+ cache_t *cache = ctx->cache;
if (cache->occupancy+1 > 3*cache->size) {
if (cache->size == MAX_CACHE_SIZE) {
size_t h = hash(m->start, m->pat) & (cache->size-1);
for (int quota = 2; cache->matches[h] && quota > 0; quota--) {
match_t *last = cache->matches[h];
while (last->cache.next) last = last->cache.next;
- cache_remove(cache, last);
+ cache_remove(ctx, last);
}
} else {
match_t **old_matches = cache->matches;
@@ -186,11 +189,12 @@ static void cache_save(cache_t *cache, def_t *defs, const char *str, pat_t *pat,
//
// Clear and deallocate the cache.
//
-void cache_destroy(cache_t *cache)
+void cache_destroy(match_ctx_t *ctx)
{
+ cache_t *cache = ctx->cache;
for (size_t i = 0; i < cache->size; i++) {
while (cache->matches[i])
- cache_remove(cache, cache->matches[i]);
+ cache_remove(ctx, cache->matches[i]);
}
cache->occupancy = 0;
if (cache->matches) delete(&cache->matches);
@@ -198,15 +202,29 @@ void cache_destroy(cache_t *cache)
}
//
+// Look up a pattern definition by name.
+//
+__attribute__((nonnull))
+pat_t *lookup(match_ctx_t *ctx, const char *name, size_t namelen)
+{
+ for (pat_t *def = ctx->defs; def; def = def->args.def.next_def) {
+ if (namelen == def->args.def.namelen && strncmp(def->args.def.name, name, namelen) == 0)
+ return def->args.def.meaning;
+ }
+ if (ctx->parent_ctx) return lookup(ctx->parent_ctx, name, namelen);
+ return NULL;
+}
+
+//
// If the given pattern is a reference, look it up and return the referenced
// pattern. This is used for an optimization to avoid repeated lookups.
//
__attribute__((nonnull(1)))
-static inline pat_t *deref(def_t *defs, pat_t *pat)
+static inline pat_t *deref(match_ctx_t *ctx, pat_t *pat)
{
if (pat && pat->type == BP_REF) {
- def_t *def = lookup(defs, pat->args.ref.len, pat->args.ref.name);
- if (def) pat = def->pat;
+ pat_t *def = lookup(ctx, pat->args.ref.name, pat->args.ref.len);
+ if (def) return def;
}
return pat;
}
@@ -215,7 +233,7 @@ static inline pat_t *deref(def_t *defs, pat_t *pat)
// Find and return the first and simplest pattern that will definitely have to
// match for the whole pattern to match (if any)
//
-static pat_t *first_pat(def_t *defs, pat_t *pat)
+static pat_t *first_pat(match_ctx_t *ctx, pat_t *pat)
{
for (pat_t *p = pat; p; ) {
switch (p->type) {
@@ -232,7 +250,7 @@ static pat_t *first_pat(def_t *defs, pat_t *pat)
case BP_REPLACE:
p = p->args.replace.pat; break;
case BP_REF: {
- pat_t *p2 = deref(defs, p);
+ pat_t *p2 = deref(ctx, p);
if (p2 == p) return p2;
p = p2;
break;
@@ -249,12 +267,14 @@ static pat_t *first_pat(def_t *defs, pat_t *pat)
__attribute__((nonnull(1,2,3)))
static match_t *_next_match(match_ctx_t *ctx, const char *str, pat_t *pat, pat_t *skip)
{
- // Definitions can be done once, ahead of time, instead of at each string position:
- if (pat->type == BP_DEFINITION) {
+ if (pat->type == BP_DEFINITIONS || (pat->type == BP_CHAIN && pat->args.multiple.first->type == BP_DEFINITIONS)) {
match_ctx_t ctx2 = *ctx;
- ctx2.defs = with_def(ctx->defs, pat->args.def.namelen, pat->args.def.name, pat->args.def.def);
- match_t *m = _next_match(&ctx2, str, pat->args.def.pat ? pat->args.def.pat : pat->args.def.def, skip);
- free_defs(ctx2.defs, ctx->defs);
+ ctx2.cache = &(cache_t){0};
+ ctx2.parent_ctx = ctx;
+ ctx2.defs = pat->type == BP_DEFINITIONS ? pat : pat->args.multiple.first;
+ pat_t *match_pat = pat->type == BP_DEFINITIONS ? pat->args.def.meaning : pat->args.multiple.second;
+ match_t *m = _next_match(&ctx2, str, match_pat, skip);
+ cache_destroy(&ctx2);
return m;
}
@@ -263,12 +283,12 @@ static match_t *_next_match(match_ctx_t *ctx, const char *str, pat_t *pat, pat_t
for (match_t *m = ctx->cache->matches[i], *next = NULL; m; m = next) {
next = m->cache.next;
if (m->start < ctx->start || (m->end ? m->end : m->start) > ctx->end)
- cache_remove(ctx->cache, m);
+ cache_remove(ctx, m);
}
}
- pat = deref(ctx->defs, pat); // Avoid repeated lookups
- pat_t *first = first_pat(ctx->defs, pat);
+ pat = deref(ctx, pat); // Avoid repeated lookups
+ pat_t *first = first_pat(ctx, pat);
// Don't bother looping if this can only match at the start:
if (first->type == BP_START_OF_FILE)
@@ -302,11 +322,13 @@ static match_t *_next_match(match_ctx_t *ctx, const char *str, pat_t *pat, pat_t
static match_t *match(match_ctx_t *ctx, const char *str, pat_t *pat)
{
switch (pat->type) {
- case BP_DEFINITION: {
+ case BP_DEFINITIONS: {
match_ctx_t ctx2 = *ctx;
- ctx2.defs = with_def(ctx->defs, pat->args.def.namelen, pat->args.def.name, pat->args.def.def);
- match_t *m = match(&ctx2, str, pat->args.def.pat ? pat->args.def.pat : pat->args.def.def);
- free_defs(ctx2.defs, ctx->defs);
+ ctx2.cache = &(cache_t){0};
+ ctx2.parent_ctx = ctx;
+ ctx2.defs = pat;
+ match_t *m = match(&ctx2, str, pat->args.def.meaning);
+ cache_destroy(&ctx2);
return m;
}
case BP_LEFTRECURSION: {
@@ -323,41 +345,41 @@ static match_t *match(match_ctx_t *ctx, const char *str, pat_t *pat)
}
}
case BP_ANYCHAR: {
- return (str < ctx->end && *str != '\n') ? new_match(ctx->defs, pat, str, next_char(str, ctx->end), NULL) : NULL;
+ return (str < ctx->end && *str != '\n') ? new_match(pat, str, next_char(str, ctx->end), NULL) : NULL;
}
case BP_ID_START: {
- return (str < ctx->end && isidstart(str, ctx->end)) ? new_match(ctx->defs, pat, str, next_char(str, ctx->end), NULL) : NULL;
+ return (str < ctx->end && isidstart(str, ctx->end)) ? new_match(pat, str, next_char(str, ctx->end), NULL) : NULL;
}
case BP_ID_CONTINUE: {
- return (str < ctx->end && isidcontinue(str, ctx->end)) ? new_match(ctx->defs, pat, str, next_char(str, ctx->end), NULL) : NULL;
+ return (str < ctx->end && isidcontinue(str, ctx->end)) ? new_match(pat, str, next_char(str, ctx->end), NULL) : NULL;
}
case BP_START_OF_FILE: {
- return (str == ctx->start) ? new_match(ctx->defs, pat, str, str, NULL) : NULL;
+ return (str == ctx->start) ? new_match(pat, str, str, NULL) : NULL;
}
case BP_START_OF_LINE: {
- return (str == ctx->start || str[-1] == '\n') ? new_match(ctx->defs, pat, str, str, NULL) : NULL;
+ return (str == ctx->start || str[-1] == '\n') ? new_match(pat, str, str, NULL) : NULL;
}
case BP_END_OF_FILE: {
- return (str == ctx->end || (str == ctx->end-1 && *str == '\n')) ? new_match(ctx->defs, pat, str, str, NULL) : NULL;
+ return (str == ctx->end || (str == ctx->end-1 && *str == '\n')) ? new_match(pat, str, str, NULL) : NULL;
}
case BP_END_OF_LINE: {
- return (str == ctx->end || *str == '\n') ? new_match(ctx->defs, pat, str, str, NULL) : NULL;
+ return (str == ctx->end || *str == '\n') ? new_match(pat, str, str, NULL) : NULL;
}
case BP_WORD_BOUNDARY: {
return (str == ctx->start || isidcontinue(str, ctx->end) != isidcontinue(prev_char(ctx->start, str), ctx->end)) ?
- new_match(ctx->defs, pat, str, str, NULL) : NULL;
+ new_match(pat, str, str, NULL) : NULL;
}
case BP_STRING: {
if (&str[pat->min_matchlen] > ctx->end) return NULL;
if (pat->min_matchlen > 0 && (ctx->ignorecase ? strncasecmp : strncmp)(str, pat->args.string, pat->min_matchlen) != 0)
return NULL;
- return new_match(ctx->defs, pat, str, str + pat->min_matchlen, NULL);
+ return new_match(pat, str, str + pat->min_matchlen, NULL);
}
case BP_RANGE: {
if (str >= ctx->end) return NULL;
if ((unsigned char)*str < pat->args.range.low || (unsigned char)*str > pat->args.range.high)
return NULL;
- return new_match(ctx->defs, pat, str, str+1, NULL);
+ return new_match(pat, str, str+1, NULL);
}
case BP_NOT: {
match_t *m = match(ctx, str, pat->args.pat);
@@ -365,12 +387,12 @@ static match_t *match(match_ctx_t *ctx, const char *str, pat_t *pat)
recycle_if_unused(&m);
return NULL;
}
- return new_match(ctx->defs, pat, str, str, NULL);
+ return new_match(pat, str, str, NULL);
}
case BP_UPTO: case BP_UPTO_STRICT: {
- match_t *m = new_match(ctx->defs, pat, str, str, NULL);
- pat_t *target = deref(ctx->defs, pat->args.multiple.first),
- *skip = deref(ctx->defs, pat->args.multiple.second);
+ match_t *m = new_match(pat, str, str, NULL);
+ pat_t *target = deref(ctx, pat->args.multiple.first),
+ *skip = deref(ctx, pat->args.multiple.second);
if (!target && !skip) {
while (str < ctx->end && *str != '\n') ++str;
m->end = str;
@@ -413,11 +435,11 @@ static match_t *match(match_ctx_t *ctx, const char *str, pat_t *pat)
return NULL;
}
case BP_REPEAT: {
- match_t *m = new_match(ctx->defs, pat, str, str, NULL);
+ match_t *m = new_match(pat, str, str, NULL);
size_t reps = 0;
ssize_t max = pat->args.repetitions.max;
- pat_t *repeating = deref(ctx->defs, pat->args.repetitions.repeat_pat);
- pat_t *sep = deref(ctx->defs, pat->args.repetitions.sep);
+ pat_t *repeating = deref(ctx, pat->args.repetitions.repeat_pat);
+ pat_t *sep = deref(ctx, pat->args.repetitions.sep);
size_t child_cap = 0, nchildren = 0;
for (reps = 0; max == -1 || reps < (size_t)max; ++reps) {
const char *start = str;
@@ -473,7 +495,7 @@ static match_t *match(match_ctx_t *ctx, const char *str, pat_t *pat)
return m;
}
case BP_AFTER: {
- pat_t *back = deref(ctx->defs, pat->args.pat);
+ pat_t *back = deref(ctx, pat->args.pat);
if (!back) return NULL;
// We only care about the region from the backtrack pos up to the
@@ -487,37 +509,47 @@ static match_t *match(match_ctx_t *ctx, const char *str, pat_t *pat)
for (const char *pos = &str[-(long)back->min_matchlen];
pos >= ctx->start && (back->max_matchlen == -1 || pos >= &str[-(int)back->max_matchlen]);
pos = prev_char(ctx->start, pos)) {
- cache_destroy(slice_ctx.cache);
+ cache_destroy(&slice_ctx);
slice_ctx.start = (char*)pos;
match_t *m = match(&slice_ctx, pos, back);
// Match should not go past str (i.e. (<"AB" "B") should match "ABB", but not "AB")
if (m && m->end != str)
recycle_if_unused(&m);
else if (m) {
- cache_destroy(slice_ctx.cache);
- return new_match(ctx->defs, pat, str, str, MATCHES(m));
+ cache_destroy(&slice_ctx);
+ return new_match(pat, str, str, MATCHES(m));
}
if (pos == ctx->start) break;
// To prevent extreme performance degradation, don't keep
// walking backwards endlessly over newlines.
if (back->max_matchlen == -1 && *pos == '\n') break;
}
- cache_destroy(slice_ctx.cache);
+ cache_destroy(&slice_ctx);
return NULL;
}
case BP_BEFORE: {
match_t *after = match(ctx, str, pat->args.pat);
- return after ? new_match(ctx->defs, pat, str, str, MATCHES(after)) : NULL;
+ return after ? new_match(pat, str, str, MATCHES(after)) : NULL;
}
case BP_CAPTURE: {
match_t *p = match(ctx, str, pat->args.pat);
- return p ? new_match(ctx->defs, pat, str, p->end, MATCHES(p)) : NULL;
+ return p ? new_match(pat, str, p->end, MATCHES(p)) : NULL;
}
case BP_OTHERWISE: {
match_t *m = match(ctx, str, pat->args.multiple.first);
return m ? m : match(ctx, str, pat->args.multiple.second);
}
case BP_CHAIN: {
+ if (pat->args.multiple.first->type == BP_DEFINITIONS) {
+ match_ctx_t ctx2 = *ctx;
+ ctx2.cache = &(cache_t){0};
+ ctx2.parent_ctx = ctx;
+ ctx2.defs = pat->args.multiple.first;
+ match_t *m = match(&ctx2, str, pat->args.multiple.second);
+ cache_destroy(&ctx2);
+ return m;
+ }
+
match_t *m1 = match(ctx, str, pat->args.multiple.first);
if (m1 == NULL) return NULL;
@@ -528,13 +560,25 @@ static match_t *match(match_ctx_t *ctx, const char *str, pat_t *pat)
// exact string of the original match (no replacements)
pat_t *backref = bp_raw_literal(m1->start, (size_t)(m1->end - m1->start));
match_ctx_t ctx2 = *ctx;
- ctx2.defs = with_def(ctx->defs, m1->pat->args.capture.namelen, m1->pat->args.capture.name, backref);
+ ctx2.cache = &(cache_t){0};
+ ctx2.parent_ctx = ctx;
+ ctx2.defs = &(pat_t){
+ .type = BP_DEFINITIONS,
+ .start = m1->pat->start, .end = m1->pat->end,
+ .args = {
+ .def = {
+ .name = m1->pat->args.capture.name,
+ .namelen = m1->pat->args.capture.namelen,
+ .meaning = backref,
+ }
+ },
+ };
++m1->refcount; {
m2 = match(&ctx2, m1->end, pat->args.multiple.second);
if (!m2) // No need to keep the backref in memory if it didn't match
delete_pat(&backref, false);
- free_defs(ctx2.defs, ctx->defs);
} --m1->refcount;
+ cache_destroy(&ctx2);
} else {
m2 = match(ctx, m1->end, pat->args.multiple.second);
}
@@ -544,7 +588,7 @@ static match_t *match(match_ctx_t *ctx, const char *str, pat_t *pat)
return NULL;
}
- return new_match(ctx->defs, pat, str, m2->end, MATCHES(m1, m2));
+ return new_match(pat, str, m2->end, MATCHES(m1, m2));
}
case BP_MATCH: case BP_NOT_MATCH: {
match_t *m1 = match(ctx, str, pat->args.multiple.first);
@@ -558,13 +602,13 @@ static match_t *match(match_ctx_t *ctx, const char *str, pat_t *pat)
slice_ctx.end = m1->end;
match_t *m2 = _next_match(&slice_ctx, slice_ctx.start, pat->args.multiple.second, NULL);
if ((!m2 && pat->type == BP_MATCH) || (m2 && pat->type == BP_NOT_MATCH)) {
- cache_destroy(slice_ctx.cache);
+ cache_destroy(&slice_ctx);
if (m2) recycle_if_unused(&m2);
recycle_if_unused(&m1);
return NULL;
}
- match_t *ret = new_match(ctx->defs, pat, m1->start, m1->end, (pat->type == BP_MATCH) ? MATCHES(m1, m2) : MATCHES(m1));
- cache_destroy(slice_ctx.cache);
+ match_t *ret = new_match(pat, m1->start, m1->end, (pat->type == BP_MATCH) ? MATCHES(m1, m2) : MATCHES(m1));
+ cache_destroy(&slice_ctx);
return ret;
}
case BP_REPLACE: {
@@ -573,17 +617,16 @@ static match_t *match(match_ctx_t *ctx, const char *str, pat_t *pat)
p = match(ctx, str, pat->args.replace.pat);
if (p == NULL) return NULL;
}
- return new_match(ctx->defs, pat, str, p ? p->end : str, MATCHES(p));
+ return new_match(pat, str, p ? p->end : str, MATCHES(p));
}
case BP_REF: {
match_t *cached;
- if (cache_get(ctx->cache, ctx->defs, str, pat, &cached))
+ if (cache_get(ctx, str, pat, &cached))
return cached;
- def_t *def = lookup(ctx->defs, pat->args.ref.len, pat->args.ref.name);
- if (def == NULL)
+ pat_t *ref = lookup(ctx, pat->args.ref.name, pat->args.ref.len);
+ if (ref == NULL)
errx(EXIT_FAILURE, "Unknown identifier: '%.*s'", (int)pat->args.ref.len, pat->args.ref.name);
- pat_t *ref = def->pat;
pat_t rec_op = {
.type = BP_LEFTRECURSION,
@@ -599,17 +642,23 @@ static match_t *match(match_ctx_t *ctx, const char *str, pat_t *pat)
},
};
match_ctx_t ctx2 = *ctx;
- ctx2.defs = &(def_t){
- .namelen = def->namelen,
- .name = def->name,
- .pat = &rec_op,
- .next = ctx->defs,
+ ctx2.parent_ctx = ctx;
+ ctx2.defs = &(pat_t){
+ .type = BP_DEFINITIONS,
+ .start = pat->start, .end = pat->end,
+ .args = {
+ .def = {
+ .name = pat->args.ref.name,
+ .namelen = pat->args.ref.len,
+ .meaning = &rec_op,
+ }
+ },
};
const char *prev = str;
match_t *m = match(&ctx2, str, ref);
if (m == NULL) {
- cache_save(ctx->cache, ctx->defs, str, pat, NULL);
+ cache_save(ctx, str, pat, NULL);
return NULL;
}
@@ -632,8 +681,8 @@ static match_t *match(match_ctx_t *ctx, const char *str, pat_t *pat)
// leftrec.match is GC'd. It also helps with visualization of match
// results.
// OPTIMIZE: remove this if necessary
- match_t *wrap = new_match(ctx->defs, pat, m->start, m->end, MATCHES(m));
- cache_save(ctx->cache, ctx->defs, str, pat, wrap);
+ match_t *wrap = new_match(pat, m->start, m->end, MATCHES(m));
+ cache_save(ctx, str, pat, wrap);
if (rec_op.args.leftrec.match)
remove_ownership(&rec_op.args.leftrec.match);
@@ -659,7 +708,7 @@ static match_t *match(match_ctx_t *ctx, const char *str, pat_t *pat)
for (int i = 0; i < dents; i++)
if (&str[i] >= ctx->end || str[i] != denter) return NULL;
- return new_match(ctx->defs, pat, start, &str[dents], NULL);
+ return new_match(pat, start, &str[dents], NULL);
}
default: {
errx(EXIT_FAILURE, "Unknown pattern type: %u", pat->type);
@@ -671,7 +720,7 @@ static match_t *match(match_ctx_t *ctx, const char *str, pat_t *pat)
//
// Return a match object which can be used (may be allocated or recycled).
//
-match_t *new_match(def_t *defs, pat_t *pat, const char *start, const char *end, match_t *children[])
+match_t *new_match(pat_t *pat, const char *start, const char *end, match_t *children[])
{
match_t *m;
if (unused_matches) {
@@ -687,7 +736,6 @@ match_t *new_match(def_t *defs, pat_t *pat, const char *start, const char *end,
m->pat = pat;
m->start = start;
m->end = end;
- m->defs_id = (defs?defs->id:0);
if (children) {
for (int i = 0; children[i]; i++)
@@ -760,14 +808,15 @@ size_t free_all_matches(void)
// Iterate over matches.
// Usage: for (match_t *m = NULL; next_match(&m, ...); ) {...}
//
-bool next_match(match_t **m, def_t *defs, const char *start, const char *end, pat_t *pat, pat_t *skip, bool ignorecase)
+bool next_match(match_t **m, const char *start, const char *end, pat_t *pat, pat_t *skip, bool ignorecase)
{
static cache_t cache = {0};
- if (!pat) { // Cleanup for stop_matching()
- recycle_if_unused(m);
- cache_destroy(&cache);
- return false;
- }
+ match_ctx_t ctx = {
+ .cache = &cache,
+ .start = start,
+ .end = end,
+ .ignorecase = ignorecase,
+ };
const char *pos;
if (*m) {
@@ -776,18 +825,12 @@ bool next_match(match_t **m, def_t *defs, const char *start, const char *end, pa
recycle_if_unused(m);
} else {
pos = start;
- cache_destroy(&cache);
+ cache_destroy(&ctx);
}
- match_ctx_t ctx = {
- .defs = defs,
- .cache = &cache,
- .start = start,
- .end = end,
- .ignorecase = ignorecase,
- };
+ if (!pat) return false;
*m = (pos <= end) ? _next_match(&ctx, pos, pat, skip) : NULL;
- if (!*m) cache_destroy(&cache);
+ if (!*m) cache_destroy(&ctx);
return *m != NULL;
}
diff --git a/match.h b/match.h
index 0a22e36..487b833 100644
--- a/match.h
+++ b/match.h
@@ -8,7 +8,6 @@
#include <stdio.h>
#include "pattern.h"
-#include "definitions.h"
struct match_s; // forward declared to resolve circular struct defs
@@ -25,7 +24,6 @@ typedef struct match_s {
pat_t *pat;
// Intrusive linked list nodes for garbage collection and cache buckets:
match_dll_t gc, cache;
- size_t defs_id;
int refcount;
struct match_s **children;
struct match_s *_children[3];
@@ -37,14 +35,12 @@ typedef struct {
void (*on_nl)(FILE *out);
} print_options_t;
-__attribute__((returns_nonnull))
-match_t *new_match(def_t *defs, pat_t *pat, const char *start, const char *end, match_t *children[]);
__attribute__((nonnull))
void recycle_if_unused(match_t **at_m);
size_t free_all_matches(void);
size_t recycle_all_matches(void);
-bool next_match(match_t **m, def_t *defs, const char *start, const char *end, pat_t *pat, pat_t *skip, bool ignorecase);
-#define stop_matching(m) next_match(m, NULL, NULL, NULL, NULL, NULL, 0)
+bool next_match(match_t **m, const char *start, const char *end, pat_t *pat, pat_t *skip, bool ignorecase);
+#define stop_matching(m) next_match(m, NULL, NULL, NULL, NULL, 0)
__attribute__((nonnull))
match_t *get_numbered_capture(match_t *m, int n);
__attribute__((nonnull, pure))
diff --git a/pattern.c b/pattern.c
index 9865c0f..7bd1124 100644
--- a/pattern.c
+++ b/pattern.c
@@ -167,6 +167,15 @@ pat_t *chain_together(pat_t *first, pat_t *second)
{
if (first == NULL) return second;
if (second == NULL) return first;
+
+ if (first->type == BP_DEFINITIONS && second->type == BP_DEFINITIONS) {
+ pat_t *first_end = first;
+ while (first_end->args.def.next_def != NULL)
+ first_end = first_end->args.def.next_def;
+ first_end->args.def.next_def = second;
+ return first;
+ }
+
size_t minlen = first->min_matchlen + second->min_matchlen;
ssize_t maxlen = (UNBOUNDED(first) || UNBOUNDED(second)) ? (ssize_t)-1 : first->max_matchlen + second->max_matchlen;
pat_t *chain = new_pat(BP_CHAIN, first->start, second->end, minlen, maxlen);
@@ -210,6 +219,30 @@ pat_t *either_pat(pat_t *first, pat_t *second)
}
//
+// Parse a definition
+//
+__attribute__((nonnull))
+static pat_t *_bp_definition(const char *start, const char *end)
+{
+ if (start >= end || !(isalpha(*start) || *start == '_')) return NULL;
+ const char *str = after_name(start);
+ size_t namelen = (size_t)(str - start);
+ if (!matchchar(&str, ':', false)) return NULL;
+ pat_t *def = bp_pattern_nl(str, end, false);
+ if (!def) parse_err(str, end, "Could not parse this definition.");
+ str = def->end;
+ (void)matchchar(&str, ';', false); // Optional semicolon
+ pat_t *ret = new_pat(BP_DEFINITIONS, start, str, 0, -1);
+ ret->args.def.name = start;
+ ret->args.def.namelen = namelen;
+ ret->args.def.meaning = def;
+ ret->args.def.next_def = _bp_definition(after_spaces(str, true), end);
+ if (ret->args.def.next_def)
+ ret->end = ret->args.def.next_def->end;
+ return ret;
+}
+
+//
// Compile a string of BP code into a BP pattern object.
//
__attribute__((nonnull))
@@ -475,26 +508,12 @@ static pat_t *_bp_simplepattern(const char *str, const char *end)
return new_pat(BP_END_OF_LINE, start, str, 0, 0);
}
default: {
+ pat_t *def = _bp_definition(start, end);
+ if (def) return def;
// Reference
if (!isalpha(c) && c != '_') return NULL;
str = after_name(start);
size_t namelen = (size_t)(str - start);
- if (matchchar(&str, ':', false)) { // Definitions
- pat_t *def = bp_pattern_nl(str, end, false);
- if (!def) parse_err(str, end, "Could not parse this definition.");
- str = def->end;
- (void)matchchar(&str, ';', false); // Optional semicolon
- str = after_spaces(str, true);
- pat_t *pat = bp_pattern_nl(str, end, false);
- if (pat) str = pat->end;
- else pat = def;
- pat_t *ret = new_pat(BP_DEFINITION, start, str, pat->min_matchlen, pat->max_matchlen);
- ret->args.def.name = start;
- ret->args.def.namelen = namelen;
- ret->args.def.def = def;
- ret->args.def.pat = pat;
- return ret;
- }
pat_t *ref = new_pat(BP_REF, start, str, 0, -1);
ref->args.ref.name = start;
ref->args.ref.len = namelen;
@@ -653,9 +672,9 @@ void delete_pat(pat_t **at_pat, bool recursive)
if (recursive) {
switch (pat->type) {
- case BP_DEFINITION:
- delete_pat(&pat->args.def.def, true);
- delete_pat(&pat->args.def.pat, true);
+ case BP_DEFINITIONS:
+ delete_pat(&pat->args.def.meaning, true);
+ delete_pat(&pat->args.def.next_def, true);
break;
case BP_REPEAT:
delete_pat(&pat->args.repetitions.sep, true);
diff --git a/pattern.h b/pattern.h
index 3a78f13..a1f7043 100644
--- a/pattern.h
+++ b/pattern.h
@@ -34,7 +34,7 @@ enum pattype_e {
BP_END_OF_FILE = 22,
BP_END_OF_LINE = 23,
BP_WORD_BOUNDARY = 24,
- BP_DEFINITION = 25,
+ BP_DEFINITIONS = 25,
BP_LEFTRECURSION = 26,
};
@@ -57,7 +57,7 @@ typedef struct pat_s {
struct {
const char *name;
size_t namelen;
- struct pat_s *def, *pat;
+ struct pat_s *meaning, *next_def;
} def;
struct {
unsigned char low, high;