diff --git a/Makefile b/Makefile index 2eb7f56..e914372 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ CWARN=-Wall -Wpedantic -Wextra -Wno-unknown-pragmas -Wno-missing-field-initializ G ?= O ?= -O3 -CFILES=compiler.c grammar.c utils.c vm.c +CFILES=compiler.c grammar.c utils.c vm.c file_loader.c OBJFILES=$(CFILES:.c=.o) all: $(NAME) diff --git a/bpeg.c b/bpeg.c index 7e89dba..b8b4c0d 100644 --- a/bpeg.c +++ b/bpeg.c @@ -12,6 +12,7 @@ #include #include "compiler.h" +#include "file_loader.h" #include "grammar.h" #include "utils.h" #include "vm.h" @@ -50,25 +51,18 @@ static char *getflag(const char *flag, char *argv[], int *i) static int run_match(grammar_t *g, const char *filename, vm_op_t *pattern, unsigned int flags) { - char *input; - if (filename == NULL || streq(filename, "-")) { - input = readfile(STDIN_FILENO); - } else { - int fd = open(filename, O_RDONLY); - check(fd >= 0, "Couldn't open file: %s", filename); - input = readfile(fd); - } - match_t *m = match(g, input, pattern, flags); + file_t *f = load_file(filename); + match_t *m = match(g, f, f->contents, pattern, flags); if (m != NULL && m->end > m->start + 1) { if (filename != NULL) { if (isatty(STDOUT_FILENO)) printf("\033[1;4;33m%s\033[0m\n", filename); else printf("%s\n", filename); } print_match(m, isatty(STDOUT_FILENO) ? "\033[0m" : NULL, (flags & BPEG_VERBOSE) != 0); - freefile(input); + destroy_file(&f); return 0; } else { - freefile(input); + destroy_file(&f); return 1; } } @@ -85,12 +79,11 @@ int main(int argc, char *argv[]) grammar_t *g = new_grammar(); // Load builtins: - int fd; - if ((fd=open("/etc/xdg/bpeg/builtins.bpeg", O_RDONLY)) >= 0) - load_grammar(g, readfile(fd)); // Keep in memory for debugging output + if (access("/etc/xdg/bpeg/builtins.bpeg", R_OK) != -1) + load_grammar(g, load_file("/etc/xdg/bpeg/builtins.bpeg")); // Keep in memory for debugging output sprintf(path, "%s/.config/bpeg/builtins.bpeg", getenv("HOME")); - if ((fd=open(path, O_RDONLY)) >= 0) - load_grammar(g, readfile(fd)); // Keep in memory for debugging output + if (access(path, R_OK) != -1) + load_grammar(g, load_file(path)); // Keep in memory for debugging output int i, npatterns = 0; check(argc > 1, "%s", usage); @@ -106,63 +99,58 @@ int main(int argc, char *argv[]) } else if (streq(argv[i], "--ignore-case") || streq(argv[i], "-i")) { flags |= BPEG_IGNORECASE; } else if (FLAG("--replace") || FLAG("-r")) { - vm_op_t *p = bpeg_replacement(bpeg_pattern("pattern"), flag); + vm_op_t *p = bpeg_replacement(bpeg_pattern(NULL, "pattern"), flag); check(p, "Replacement failed to compile"); - add_def(g, flag, "replacement", p); + add_def(g, NULL, flag, "replacement", p); rule = "replace-all"; } else if (FLAG("--grammar") || FLAG("-g")) { - int fd; - if (streq(flag, "-")) { - fd = STDIN_FILENO; - } else { - fd = open(flag, O_RDONLY); - if (fd < 0) { - sprintf(path, "%s/.config/bpeg/%s.bpeg", getenv("HOME"), flag); - fd = open(path, O_RDONLY); - } - if (fd < 0) { - sprintf(path, "/etc/xdg/bpeg/%s.bpeg", flag); - fd = open(path, O_RDONLY); - } - check(fd >= 0, "Couldn't find grammar: %s", flag); + file_t *f = load_file(flag); + if (f == NULL) { + sprintf(path, "%s/.config/bpeg/%s.bpeg", getenv("HOME"), flag); + f = load_file(path); } - load_grammar(g, readfile(fd)); // Keep in memory for debug output + if (f == NULL) { + sprintf(path, "/etc/xdg/bpeg/%s.bpeg", flag); + f = load_file(path); + } + check(f != NULL, "Couldn't find grammar: %s", flag); + load_grammar(g, f); // Keep in memory for debug output } else if (FLAG("--define") || FLAG("-d")) { char *def = flag; char *eq = strchr(def, '='); check(eq, "Rule definitions must include an '='\n\n%s", usage); *eq = '\0'; char *src = ++eq; - vm_op_t *pat = bpeg_pattern(src); + vm_op_t *pat = bpeg_pattern(NULL, src); check(pat, "Failed to compile pattern"); - add_def(g, src, def, pat); + add_def(g, NULL, src, def, pat); } else if (FLAG("--define-string") || FLAG("-D")) { char *def = flag; char *eq = strchr(def, '='); check(eq, "Rule definitions must include an '='\n\n%s", usage); *eq = '\0'; char *src = ++eq; - vm_op_t *pat = bpeg_stringpattern(src); + vm_op_t *pat = bpeg_stringpattern(NULL, src); check(pat, "Failed to compile pattern"); - add_def(g, src, def, pat); + add_def(g, NULL, src, def, pat); } else if (FLAG("--pattern") || FLAG("-p")) { check(npatterns == 0, "Cannot define multiple patterns"); - vm_op_t *p = bpeg_pattern(flag); + vm_op_t *p = bpeg_pattern(NULL, flag); check(p, "Pattern failed to compile: '%s'", flag); - add_def(g, flag, "pattern", p); + add_def(g, NULL, flag, "pattern", p); ++npatterns; } else if (FLAG("--pattern-string") || FLAG("-P")) { - vm_op_t *p = bpeg_stringpattern(flag); + vm_op_t *p = bpeg_stringpattern(NULL, flag); check(p, "Pattern failed to compile"); - add_def(g, flag, "pattern", p); + add_def(g, NULL, flag, "pattern", p); ++npatterns; } else if (FLAG("--mode") || FLAG("-m")) { rule = flag; } else if (argv[i][0] != '-') { if (npatterns > 0) break; - vm_op_t *p = bpeg_stringpattern(argv[i]); + vm_op_t *p = bpeg_stringpattern(NULL, argv[i]); check(p, "Pattern failed to compile"); - add_def(g, argv[i], "pattern", p); + add_def(g, NULL, argv[i], "pattern", p); ++npatterns; } else { printf("Unrecognized flag: %s\n\n%s\n", argv[i], usage); diff --git a/compiler.c b/compiler.c index b27717b..fe7ad3f 100644 --- a/compiler.c +++ b/compiler.c @@ -5,8 +5,8 @@ #include "compiler.h" #include "utils.h" -static vm_op_t *expand_chain(vm_op_t *first); -static vm_op_t *expand_choices(vm_op_t *first); +static vm_op_t *expand_chain(file_t *f, vm_op_t *first); +static vm_op_t *expand_choices(file_t *f, vm_op_t *first); static vm_op_t *chain_together(vm_op_t *first, vm_op_t *second); static void set_range(vm_op_t *op, ssize_t min, ssize_t max, vm_op_t *pat, vm_op_t *sep); @@ -31,11 +31,11 @@ static void set_range(vm_op_t *op, ssize_t min, ssize_t max, vm_op_t *pat, vm_op * followed by any patterns (e.g. "`x `y"), otherwise return * the original input. */ -static vm_op_t *expand_chain(vm_op_t *first) +static vm_op_t *expand_chain(file_t *f, vm_op_t *first) { - vm_op_t *second = bpeg_simplepattern(first->end); + vm_op_t *second = bpeg_simplepattern(f, first->end); if (second == NULL) return first; - second = expand_chain(second); + second = expand_chain(f, second); check(second->end > first->end, "No forward progress in chain!"); return chain_together(first, second); } @@ -45,14 +45,14 @@ static vm_op_t *expand_chain(vm_op_t *first) * followed by any "/"-separated patterns (e.g. "`x/`y"), otherwise * return the original input. */ -static vm_op_t *expand_choices(vm_op_t *first) +static vm_op_t *expand_choices(file_t *f, vm_op_t *first) { - first = expand_chain(first); + first = expand_chain(f, first); const char *str = first->end; if (!matchchar(&str, '/')) return first; - vm_op_t *second = bpeg_simplepattern(str); + vm_op_t *second = bpeg_simplepattern(f, str); check(second, "Expected pattern after '/'"); - second = expand_choices(second); + second = expand_choices(f, second); vm_op_t *choice = calloc(sizeof(vm_op_t), 1); choice->op = VM_OTHERWISE; choice->start = first->start; @@ -84,7 +84,7 @@ static vm_op_t *chain_together(vm_op_t *first, vm_op_t *second) /* * Compile a string of BPEG code into virtual machine opcodes */ -vm_op_t *bpeg_simplepattern(const char *str) +vm_op_t *bpeg_simplepattern(file_t *f, const char *str) { if (!*str) return NULL; str = after_spaces(str); @@ -100,7 +100,7 @@ vm_op_t *bpeg_simplepattern(const char *str) if (matchchar(&str, '.')) { // ".." if (matchchar(&str, '.')) // "..." op->multiline = 1; - vm_op_t *till = bpeg_simplepattern(str); + vm_op_t *till = bpeg_simplepattern(f, str); op->op = VM_UPTO_AND; op->len = -1; op->args.pat = till; @@ -177,7 +177,7 @@ vm_op_t *bpeg_simplepattern(const char *str) } // Not case '!': { - vm_op_t *p = bpeg_simplepattern(str); + vm_op_t *p = bpeg_simplepattern(f, str); check(p, "Expected pattern after '!'\n"); str = p->end; op->op = VM_NOT; @@ -202,13 +202,13 @@ vm_op_t *bpeg_simplepattern(const char *str) } else { min = n1, max = n1; } - vm_op_t *pat = bpeg_simplepattern(str); + vm_op_t *pat = bpeg_simplepattern(f, str); check(pat, "Expected pattern after repetition count"); str = pat->end; str = after_spaces(str); vm_op_t *sep = NULL; if (matchchar(&str, '%')) { - sep = bpeg_simplepattern(str); + sep = bpeg_simplepattern(f, str); check(sep, "Expected pattern for separator after '%%'"); str = sep->end; } else { @@ -219,7 +219,7 @@ vm_op_t *bpeg_simplepattern(const char *str) } // Lookbehind case '<': { - vm_op_t *pat = bpeg_simplepattern(str); + vm_op_t *pat = bpeg_simplepattern(f, str); check(pat, "Expected pattern after <"); str = pat->end; check(pat->len != -1, "Lookbehind patterns must have a fixed length"); @@ -231,7 +231,7 @@ vm_op_t *bpeg_simplepattern(const char *str) } // Lookahead case '>': { - vm_op_t *pat = bpeg_simplepattern(str); + vm_op_t *pat = bpeg_simplepattern(f, str); check(pat, "Expected pattern after >"); str = pat->end; op->op = VM_BEFORE; @@ -242,9 +242,9 @@ vm_op_t *bpeg_simplepattern(const char *str) // Parentheses case '(': { free(op); - op = bpeg_simplepattern(str); + op = bpeg_simplepattern(f, str); check(op, "Expected pattern inside parentheses"); - op = expand_choices(op); + op = expand_choices(f, op); str = op->end; str = after_spaces(str); check(matchchar(&str, ')'), "Expected closing ')' instead of \"%s\"", str); @@ -261,7 +261,7 @@ vm_op_t *bpeg_simplepattern(const char *str) str = closing; check(matchchar(&str, ']'), "Expected closing ']'"); } - vm_op_t *pat = bpeg_simplepattern(str); + vm_op_t *pat = bpeg_simplepattern(f, str); check(pat, "Expected pattern after @"); str = pat->end; op->args.capture.capture_pat = pat; @@ -275,9 +275,9 @@ vm_op_t *bpeg_simplepattern(const char *str) if (strncmp(str, "=>", 2) == 0) { str += strlen("=>"); } else { - pat = bpeg_simplepattern(str); + pat = bpeg_simplepattern(f, str); check(pat, "Invalid pattern after '{'"); - pat = expand_choices(pat); + pat = expand_choices(f, pat); str = pat->end; str = after_spaces(str); check(matchchar(&str, '=') && matchchar(&str, '>'), @@ -360,7 +360,7 @@ vm_op_t *bpeg_simplepattern(const char *str) if (strncmp(after_spaces(str), "==", 2) == 0) { str = after_spaces(str)+2; vm_op_t *first = op; - vm_op_t *second = bpeg_simplepattern(str); + vm_op_t *second = bpeg_simplepattern(f, str); check(second, "Expected pattern after '=='"); check(first->len == -1 || second->len == -1 || first->len == second->len, "Two patterns cannot possibly match the same (different lengths: %ld != %ld)", @@ -382,7 +382,7 @@ vm_op_t *bpeg_simplepattern(const char *str) /* * Similar to bpeg_simplepattern, except that the pattern begins with an implicit, unclosable quote. */ -vm_op_t *bpeg_stringpattern(const char *str) +vm_op_t *bpeg_stringpattern(file_t *f, const char *str) { vm_op_t *ret = NULL; while (*str) { @@ -395,7 +395,7 @@ vm_op_t *bpeg_stringpattern(const char *str) for (; *str; str++) { if (*str == '\\') { check(str[1], "Expected more string contents after backslash"); - interp = bpeg_simplepattern(str + 1); + interp = bpeg_simplepattern(f, str + 1); check(interp != NULL, "No valid BPEG pattern detected after backslash"); break; } @@ -448,10 +448,10 @@ vm_op_t *bpeg_replacement(vm_op_t *pat, const char *replacement) return op; } -vm_op_t *bpeg_pattern(const char *str) +vm_op_t *bpeg_pattern(file_t *f, const char *str) { - vm_op_t *op = bpeg_simplepattern(str); - if (op != NULL) op = expand_choices(op); + vm_op_t *op = bpeg_simplepattern(f, str); + if (op != NULL) op = expand_choices(f, op); return op; } diff --git a/compiler.h b/compiler.h index 855b05d..005eacc 100644 --- a/compiler.h +++ b/compiler.h @@ -7,11 +7,12 @@ #include #include "types.h" +#include "file_loader.h" -vm_op_t *bpeg_simplepattern(const char *str); -vm_op_t *bpeg_stringpattern(const char *str); +vm_op_t *bpeg_simplepattern(file_t *f, const char *str); +vm_op_t *bpeg_stringpattern(file_t *f, const char *str); vm_op_t *bpeg_replacement(vm_op_t *pat, const char *replacement); -vm_op_t *bpeg_pattern(const char *str); +vm_op_t *bpeg_pattern(file_t *f, const char *str); #endif // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1 diff --git a/file_loader.c b/file_loader.c new file mode 100644 index 0000000..b7c3817 --- /dev/null +++ b/file_loader.c @@ -0,0 +1,102 @@ +/* + * file_loader.c - Implementation of some file loading functionality. + */ + +#include +#include +#include +#include +#include +#include + +#include "file_loader.h" + +/* + * Read an entire file into memory. + */ +file_t *load_file(const char *filename) +{ + if (filename == NULL) filename = "-"; + int fd = strcmp(filename, "-") != 0 ? open(filename, O_RDONLY) : STDIN_FILENO; + if (fd < 0) return NULL; + file_t *f = calloc(sizeof(file_t), 1); + f->filename = strdup(filename); + // TODO: use mmap when possible + f->mmapped = 0; + size_t capacity = 1000; + f->length = 0; + f->contents = calloc(sizeof(char), capacity+1); + ssize_t just_read; + while ((just_read=read(fd, &f->contents[f->length], capacity - f->length)) > 0) { + f->length += (size_t)just_read; + if (f->length >= capacity) + f->contents = realloc(f->contents, sizeof(char)*(capacity *= 2) + 1); + } + f->contents[f->length] = '\0'; + close(fd); + + // Calculate line numbers: + size_t linecap = 10; + f->lines = calloc(sizeof(const char*), linecap); + f->nlines = 1; + char *p = f->contents; + for (size_t n = 0; p && *p; ++n) { + if (n >= linecap) + f->lines = realloc(f->lines, sizeof(const char*)*(linecap *= 2)); + f->lines[n] = p; + p = strchr(p, '\n'); + if (p) ++p; + } + + return f; +} + +void destroy_file(file_t **f) +{ + if ((*f)->filename) { + free((char*)(*f)->filename); + (*f)->filename = NULL; + } + if ((*f)->lines) { + free((*f)->lines); + (*f)->lines = NULL; + } + if ((*f)->contents) { + free((*f)->contents); + (*f)->contents = NULL; + } + free(*f); + *f = NULL; +} + +size_t get_line_number(file_t *f, const char *p) +{ + // TODO: binary search + for (size_t n = 1; n < f->nlines; n++) { + if (f->lines[n] > p) + return n; + } + return 0; +} + +const char *get_line(file_t *f, size_t line_number) +{ + if (line_number == 0 || line_number > f->nlines) return NULL; + return f->lines[line_number - 1]; +} + +void fprint_line(FILE *dest, file_t *f, const char *start, const char *end, const char *msg) +{ + size_t linenum = get_line_number(f, start); + const char *line = get_line(f, linenum); + size_t charnum = 1 + (size_t)(start - line); + fprintf(dest, "\033[1m%s:%ld:%ld:\033[0m %s\n", + f->filename, linenum, charnum, msg); + const char *eol = linenum == f->nlines ? strchr(line, '\0') : strchr(line, '\n'); + if (end == NULL || end > eol) end = eol; + fprintf(dest, "\033[2m% 5ld |\033[0m %.*s\033[31;4;1m%.*s\033[0m%.*s\n", + linenum, + (int)charnum - 1, line, + (int)(end - &line[charnum-1]), &line[charnum-1], + (int)(eol - end), end); +} diff --git a/file_loader.h b/file_loader.h new file mode 100644 index 0000000..cb49373 --- /dev/null +++ b/file_loader.h @@ -0,0 +1,22 @@ +/* + * file_loader.h - Definitions of an API for loading files. + */ +#ifndef FILE_LOADER__H +#define FILE_LOADER__H + +#include + +typedef struct { + const char *filename; + char *contents, **lines; + size_t length, nlines; + unsigned int mmapped:1; +} file_t; + +file_t *load_file(const char *filename); +void destroy_file(file_t **f); +size_t get_line_number(file_t *f, const char *p); +const char *get_line(file_t *f, size_t line_number); +void fprint_line(FILE *dest, file_t *f, const char *start, const char *end, const char *msg); + +#endif diff --git a/grammar.c b/grammar.c index 61c82c9..b4621ee 100644 --- a/grammar.c +++ b/grammar.c @@ -4,6 +4,7 @@ #include "grammar.h" #include "compiler.h" +#include "file_loader.h" #include "utils.h" grammar_t *new_grammar(void) @@ -13,12 +14,13 @@ grammar_t *new_grammar(void) return g; } -void add_def(grammar_t *g, const char *src, const char *name, vm_op_t *op) +void add_def(grammar_t *g, file_t *f, const char *src, const char *name, vm_op_t *op) { if (g->defcount >= g->defcapacity) { g->definitions = realloc(g->definitions, sizeof(&g->definitions[0])*(g->defcapacity += 32)); } int i = g->defcount; + g->definitions[i].file = f; g->definitions[i].source = src; g->definitions[i].name = name; g->definitions[i].op = op; @@ -29,9 +31,10 @@ void add_def(grammar_t *g, const char *src, const char *name, vm_op_t *op) * Load the given grammar (semicolon-separated definitions) * and return the first rule defined. */ -vm_op_t *load_grammar(grammar_t *g, const char *src) +vm_op_t *load_grammar(grammar_t *g, file_t *f) { vm_op_t *ret = NULL; + const char *src = f->contents; do { src = after_spaces(src); if (!*src) break; @@ -41,9 +44,9 @@ vm_op_t *load_grammar(grammar_t *g, const char *src) name = strndup(name, (size_t)(name_end-name)); src = after_spaces(name_end); check(matchchar(&src, '='), "Expected '=' in definition"); - vm_op_t *op = bpeg_pattern(src); + vm_op_t *op = bpeg_pattern(f, src); check(op, "Couldn't load definition"); - add_def(g, src, name, op); + add_def(g, f, src, name, op); if (ret == NULL) { ret = op; } diff --git a/grammar.h b/grammar.h index f75b111..0f57616 100644 --- a/grammar.h +++ b/grammar.h @@ -7,13 +7,14 @@ #include #include +#include "file_loader.h" #include "types.h" grammar_t *new_grammar(void); -void add_def(grammar_t *g, const char *src, const char *name, vm_op_t *op); +void add_def(grammar_t *g, file_t *f, const char *src, const char *name, vm_op_t *op); void push_backref(grammar_t *g, const char *name, match_t *capture); void pop_backrefs(grammar_t *g, size_t count); -vm_op_t *load_grammar(grammar_t *g, const char *source); +vm_op_t *load_grammar(grammar_t *g, file_t *f); vm_op_t *lookup(grammar_t *g, const char *name); #endif diff --git a/types.h b/types.h index 3749156..de408cc 100644 --- a/types.h +++ b/types.h @@ -6,6 +6,8 @@ #include +#include "file_loader.h" + enum BPEGFlag { BPEG_VERBOSE = 1 << 0, BPEG_IGNORECASE = 1 << 1, @@ -85,6 +87,7 @@ typedef struct match_s { typedef struct { const char *name; const char *source; + file_t *file; vm_op_t *op; } def_t; diff --git a/utils.c b/utils.c index 7844934..6b9af5c 100644 --- a/utils.c +++ b/utils.c @@ -153,28 +153,5 @@ size_t unescape_string(char *dest, const char *src, size_t bufsize) #undef PUT } -/* - * Read an entire file into memory. (Guaranteeing that ret[-1] == '\0') - */ -char *readfile(int fd) -{ - size_t capacity = 1000, len = 0; - char *buf = calloc(sizeof(char), capacity+1); - buf[len++] = '\0'; - ssize_t just_read; - while ((just_read=read(fd, &buf[len], capacity-len)) > 0) { - len += (size_t)just_read; - if (len >= capacity) - buf = realloc(buf, sizeof(char)*(capacity *= 2)); - } - buf[len] = '\0'; - close(fd); - return &buf[1]; -} - -void freefile(char *f) -{ - free(&f[-1]); -} // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1 diff --git a/utils.h b/utils.h index 8588d24..28bfe05 100644 --- a/utils.h +++ b/utils.h @@ -17,8 +17,6 @@ #define check(cond, ...) do { if (!(cond)) { fprintf(stderr, __VA_ARGS__); fwrite("\n", 1, 1, stderr); _exit(1); } } while(0) #define debug(...) do { if (verbose) fprintf(stderr, __VA_ARGS__); } while(0) -char *readfile(int fd); -void freefile(char *f); char unescapechar(const char *escaped, const char **end); const char *after_name(const char *str); const char *after_spaces(const char *str); diff --git a/vm.c b/vm.c index 7b6a04b..637c4b8 100644 --- a/vm.c +++ b/vm.c @@ -77,7 +77,7 @@ typedef struct recursive_ref_s { * a match struct, or NULL if no match is found. * The returned value should be free()'d to avoid memory leaking. */ -static match_t *_match(grammar_t *g, const char *str, vm_op_t *op, unsigned int flags, recursive_ref_t *rec) +static match_t *_match(grammar_t *g, file_t *f, const char *str, vm_op_t *op, unsigned int flags, recursive_ref_t *rec) { switch (op->op) { case VM_EMPTY: { @@ -116,7 +116,7 @@ static match_t *_match(grammar_t *g, const char *str, vm_op_t *op, unsigned int return m; } case VM_NOT: { - match_t *m = _match(g, str, op->args.pat, flags, rec); + match_t *m = _match(g, f, str, op->args.pat, flags, rec); if (m != NULL) { destroy_match(&m); return NULL; @@ -134,7 +134,7 @@ static match_t *_match(grammar_t *g, const char *str, vm_op_t *op, unsigned int if (op->args.pat) { for (const char *prev = NULL; prev < str; ) { prev = str; - match_t *p = _match(g, str, op->args.pat, flags, rec); + match_t *p = _match(g, f, str, op->args.pat, flags, rec); if (p) { m->child = p; m->end = p->end; @@ -168,11 +168,11 @@ static match_t *_match(grammar_t *g, const char *str, vm_op_t *op, unsigned int // Separator match_t *sep = NULL; if (op->args.repetitions.sep != NULL && reps > 0) { - sep = _match(g, str, op->args.repetitions.sep, flags, rec); + sep = _match(g, f, str, op->args.repetitions.sep, flags, rec); if (sep == NULL) break; str = sep->end; } - match_t *p = _match(g, str, op->args.repetitions.repeat_pat, flags, rec); + match_t *p = _match(g, f, str, op->args.repetitions.repeat_pat, flags, rec); if (p == NULL || (p->end == prev && reps > 0)) { // Prevent infinite loops destroy_match(&sep); destroy_match(&p); @@ -204,11 +204,8 @@ static match_t *_match(grammar_t *g, const char *str, vm_op_t *op, unsigned int case VM_AFTER: { ssize_t backtrack = op->args.pat->len; check(backtrack != -1, "'<' is only allowed for fixed-length operations"); - // Check for necessary space: - for (int i = 0; i < backtrack; i++) { - if (str[-i] == '\0') return NULL; - } - match_t *before = _match(g, str - backtrack, op->args.pat, flags, rec); + if (str - backtrack < f->contents) return NULL; + match_t *before = _match(g, f, str - backtrack, op->args.pat, flags, rec); if (before == NULL) return NULL; match_t *m = calloc(sizeof(match_t), 1); m->start = str; @@ -218,7 +215,7 @@ static match_t *_match(grammar_t *g, const char *str, vm_op_t *op, unsigned int return m; } case VM_BEFORE: { - match_t *after = _match(g, str, op->args.pat, flags, rec); + match_t *after = _match(g, f, str, op->args.pat, flags, rec); if (after == NULL) return NULL; match_t *m = calloc(sizeof(match_t), 1); m->start = str; @@ -228,7 +225,7 @@ static match_t *_match(grammar_t *g, const char *str, vm_op_t *op, unsigned int return m; } case VM_CAPTURE: { - match_t *p = _match(g, str, op->args.pat, flags, rec); + match_t *p = _match(g, f, str, op->args.pat, flags, rec); if (p == NULL) return NULL; match_t *m = calloc(sizeof(match_t), 1); m->start = str; @@ -241,16 +238,16 @@ static match_t *_match(grammar_t *g, const char *str, vm_op_t *op, unsigned int return m; } case VM_OTHERWISE: { - match_t *m = _match(g, str, op->args.multiple.first, flags, rec); - if (m == NULL) m = _match(g, str, op->args.multiple.second, flags, rec); + match_t *m = _match(g, f, str, op->args.multiple.first, flags, rec); + if (m == NULL) m = _match(g, f, str, op->args.multiple.second, flags, rec); return m; } case VM_CHAIN: { - match_t *m1 = _match(g, str, op->args.multiple.first, flags, rec); + match_t *m1 = _match(g, f, str, op->args.multiple.first, flags, rec); if (m1 == NULL) return NULL; size_t nbackrefs = push_backrefs(g, m1); - match_t *m2 = _match(g, m1->end, op->args.multiple.second, flags, rec); + match_t *m2 = _match(g, f, m1->end, op->args.multiple.second, flags, rec); pop_backrefs(g, nbackrefs); if (m2 == NULL) { destroy_match(&m1); @@ -265,11 +262,11 @@ static match_t *_match(grammar_t *g, const char *str, vm_op_t *op, unsigned int return m; } case VM_EQUAL: { - match_t *m1 = _match(g, str, op->args.multiple.first, flags, rec); + match_t *m1 = _match(g, f, str, op->args.multiple.first, flags, rec); if (m1 == NULL) return NULL; // == matches iff both have the same start and end point: - match_t *m2 = _match(g, str, op->args.multiple.second, flags, rec); + match_t *m2 = _match(g, f, str, op->args.multiple.second, flags, rec); if (m2 == NULL || m2->end != m1->end) { destroy_match(&m1); destroy_match(&m2); @@ -288,7 +285,7 @@ static match_t *_match(grammar_t *g, const char *str, vm_op_t *op, unsigned int m->start = str; m->op = op; if (op->args.replace.replace_pat) { - match_t *p = _match(g, str, op->args.replace.replace_pat, flags, rec); + match_t *p = _match(g, f, str, op->args.replace.replace_pat, flags, rec); if (p == NULL) return NULL; m->child = p; m->end = p->end; @@ -320,7 +317,7 @@ static match_t *_match(grammar_t *g, const char *str, vm_op_t *op, unsigned int }; match_t *best = NULL; left_recursive:; - match_t *p = _match(g, str, r, flags, &wrap); + match_t *p = _match(g, f, str, r, flags, &wrap); if (p == NULL) return best; if (wrap.hit && (best == NULL || p->end > best->end)) { best = p; @@ -343,7 +340,10 @@ static match_t *_match(grammar_t *g, const char *str, vm_op_t *op, unsigned int return match_backref(str, op, (match_t*)op->args.backref, flags); } case VM_NODENT: { - if (str[-1] == '\0') { // First line + size_t linenum = get_line_number(f, str); + if (linenum == 1) { // First line + if (str > f->contents) + return NULL; match_t *m = calloc(sizeof(match_t), 1); m->start = str; m->end = str; @@ -352,9 +352,7 @@ static match_t *_match(grammar_t *g, const char *str, vm_op_t *op, unsigned int } else if (str[-1] != '\n') { return NULL; // Not at beginning of line } - const char *p = &str[-1]; - while (*p == '\n') --p; // Skip blank lines - while (p[-1] && p[-1] != '\n') --p; // Backtrack to start of last (nonblank) line + const char *p = get_line(f, linenum - 1); // Count indentation: char denter = *p; int dents = 0; @@ -686,9 +684,9 @@ static match_t *match_backref(const char *str, vm_op_t *op, match_t *cap, unsign return ret; } -match_t *match(grammar_t *g, const char *str, vm_op_t *op, unsigned int flags) +match_t *match(grammar_t *g, file_t *f, const char *str, vm_op_t *op, unsigned int flags) { - return _match(g, str, op, flags, NULL); + return _match(g, f, str, op, flags, NULL); } // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1 diff --git a/vm.h b/vm.h index ac0269f..feb8eeb 100644 --- a/vm.h +++ b/vm.h @@ -12,7 +12,7 @@ #include "types.h" const char *opcode_name(enum VMOpcode o); -match_t *match(grammar_t *g, const char *str, vm_op_t *op, unsigned int flags); +match_t *match(grammar_t *g, file_t *f, const char *str, vm_op_t *op, unsigned int flags); void destroy_match(match_t **m); void print_pattern(vm_op_t *op); void print_match(match_t *m, const char *color, int verbose);