diff --git a/bpeg.c b/bpeg.c index 3af2b78..ea31412 100644 --- a/bpeg.c +++ b/bpeg.c @@ -258,12 +258,13 @@ static void set_range(vm_op_t *op, ssize_t min, ssize_t max, vm_op_t *pat, vm_op * followed by any patterns (e.g. "`x `y"), otherwise return * the original input. */ -static vm_op_t *expand_chain(vm_op_t *first) +static vm_op_t *expand_chain(const char *source, vm_op_t *first) { - vm_op_t *second = compile_bpeg(first->end); + vm_op_t *second = compile_bpeg(source, first->end); if (second == NULL) return first; check(second->end > first->end, "No forward progress in chain!"); - second = expand_chain(second); + visualize(source, first->end, "Expanding chain..."); + second = expand_chain(source, second); vm_op_t *chain = calloc(sizeof(vm_op_t), 1); chain->op = VM_CHAIN; chain->start = first->start; @@ -273,6 +274,7 @@ static vm_op_t *expand_chain(vm_op_t *first) chain->end = second->end; chain->args.multiple.first = first; chain->args.multiple.second = second; + visualize(source, chain->end, "Got chained pair."); return chain; } @@ -281,15 +283,16 @@ static vm_op_t *expand_chain(vm_op_t *first) * followed by any "/"-separated patterns (e.g. "`x/`y"), otherwise * return the original input. */ -static vm_op_t *expand_choices(vm_op_t *first) +static vm_op_t *expand_choices(const char *source, vm_op_t *first) { - first = expand_chain(first); + first = expand_chain(source, first); const char *str = first->end; if (!matchchar(&str, '/')) return first; - debug("Otherwise:\n"); - vm_op_t *second = compile_bpeg(str); + visualize(source, str, "Expanding choices..."); + //debug("Otherwise:\n"); + vm_op_t *second = compile_bpeg(source, str); check(second, "Expected pattern after '/'"); - second = expand_choices(second); + second = expand_choices(source, second); vm_op_t *choice = calloc(sizeof(vm_op_t), 1); choice->op = VM_OTHERWISE; choice->start = first->start; @@ -299,6 +302,7 @@ static vm_op_t *expand_choices(vm_op_t *first) choice->end = second->end; choice->args.multiple.first = first; choice->args.multiple.second = second; + visualize(source, choice->end, "Got two choices"); return choice; } @@ -344,10 +348,11 @@ static char escapechar(const char *escaped, const char **end) /* * Compile a string of BPEG code into virtual machine opcodes */ -static vm_op_t *compile_bpeg(const char *str) +static vm_op_t *compile_bpeg(const char *source, const char *str) { if (!*str) return NULL; - debug("Parsing \"%s\"...\n", str); + visualize(source, str, "Compiling..."); + //debug("Parsing \"%s\"...\n", str); str = after_spaces(str); check(*str, "Expected a pattern"); vm_op_t *op = calloc(sizeof(vm_op_t), 1); @@ -358,7 +363,8 @@ static vm_op_t *compile_bpeg(const char *str) switch (c) { // Any char (dot) case '.': { - debug("Dot\n"); + visualize(source, str, "Dot"); + //debug("Dot\n"); op->op = VM_ANYCHAR; op->len = 1; break; @@ -367,10 +373,12 @@ static vm_op_t *compile_bpeg(const char *str) case '`': { char literal[2] = {*str, '\0'}; ++str; + visualize(source, str, "Char literal"); check(literal[0], "Expected character after '`'\n"); op->len = 1; if (matchchar(&str, ',')) { // Range - debug("Char range\n"); + visualize(source, str, "Char range"); + //debug("Char range\n"); char c2 = *str; check(c2, "Expected character after ','"); op->op = VM_RANGE; @@ -378,7 +386,7 @@ static vm_op_t *compile_bpeg(const char *str) op->args.range.high = c2; ++str; } else { - debug("Char literal\n"); + //debug("Char literal\n"); op->op = VM_STRING; op->args.s = strdup(literal); } @@ -386,7 +394,8 @@ static vm_op_t *compile_bpeg(const char *str) } // Escapes case '\\': { - debug("Escape sequence\n"); + //debug("Escape sequence\n"); + visualize(source, str, "Escape sequence"); check(*str, "Expected escape after '\\'"); op->op = VM_STRING; op->len = 1; @@ -396,6 +405,7 @@ static vm_op_t *compile_bpeg(const char *str) } // String literal case '"': case '\'': { + visualize(source, str, "String literal"); char quote = c; const char *literal = str; for (; *str && *str != quote; str++) { @@ -403,19 +413,21 @@ static vm_op_t *compile_bpeg(const char *str) check(str[1], "Expected more string contents after backslash"); ++str; } + visualize(source, str, "String literal"); } op->op = VM_STRING; op->len = (ssize_t)(str - literal); op->args.s = strndup(literal, (size_t)op->len); // TODO: handle escape chars like \n - debug("String literal: %c%s%c\n", quote, op->args.s, quote); + //debug("String literal: %c%s%c\n", quote, op->args.s, quote); check(matchchar(&str, quote), "Missing closing quote"); break; } // Not case '!': { - debug("Not pattern\n"); - vm_op_t *p = compile_bpeg(str); + // debug("Not pattern\n"); + visualize(source, str, "Not "); + vm_op_t *p = compile_bpeg(source, str); check(p, "Expected pattern after '!'\n"); str = p->end; op->op = VM_NOT; @@ -425,8 +437,9 @@ static vm_op_t *compile_bpeg(const char *str) } // Upto case '^': { - debug("Upto pattern\n"); - vm_op_t *p = compile_bpeg(str); + visualize(source, str, "Upto "); + //debug("Upto pattern\n"); + vm_op_t *p = compile_bpeg(source, str); check(p, "Expected pattern after '^'\n"); str = p->end; op->op = VM_UPTO; @@ -436,8 +449,9 @@ static vm_op_t *compile_bpeg(const char *str) } // Upto and including case '&': { - debug("Upto-and pattern\n"); - vm_op_t *p = compile_bpeg(str); + //debug("Upto-and pattern\n"); + visualize(source, str, "Upto and including "); + vm_op_t *p = compile_bpeg(source, str); check(p, "Expected pattern after '&'\n"); str = p->end; op->op = VM_UPTO_AND; @@ -448,7 +462,7 @@ static vm_op_t *compile_bpeg(const char *str) // Number of repetitions: (- / - / + / "") case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': { - debug("Repetitions\n"); + visualize(source, str, "Repeat "); ssize_t min = -1, max = -1; --str; long n1 = strtol(str, (char**)&str, 10); @@ -463,49 +477,56 @@ static vm_op_t *compile_bpeg(const char *str) } else { min = n1, max = n1; } - vm_op_t *pat = compile_bpeg(str); + visualize(source, str, NULL); + vm_op_t *pat = compile_bpeg(source, str); check(pat, "Expected pattern after repetition count"); str = pat->end; str = after_spaces(str); if (matchchar(&str, '%')) { - vm_op_t *sep = compile_bpeg(str); + visualize(source, str, "Repeat with separator"); + vm_op_t *sep = compile_bpeg(source, str); check(sep, "Expected pattern for separator after '%%'"); str = sep->end; set_range(op, min, max, pat, sep); } else { set_range(op, min, max, pat, NULL); } - debug("min = %lld max = %lld\n", (long long)op->args.repetitions.min, (long long)op->args.repetitions.max); + visualize(source, str, NULL); + //debug("min = %lld max = %lld\n", (long long)op->args.repetitions.min, (long long)op->args.repetitions.max); break; } // Special repetitions: case '+': case '*': case '?': { - debug("Special repetitions\n"); + //debug("Special repetitions\n"); + visualize(source, str, "Repeat "); ssize_t min = -1, max = -1; switch (c) { case '+': min = 1, max = -1; break; case '*': min = 0, max = -1; break; case '?': min = 0, max = 1; break; } - vm_op_t *pat = compile_bpeg(str); + vm_op_t *pat = compile_bpeg(source, str); check(pat, "Expected pattern after +"); str = pat->end; str = after_spaces(str); if (matchchar(&str, '%')) { - vm_op_t *sep = compile_bpeg(str); + visualize(source, str, "Repeat with separator"); + vm_op_t *sep = compile_bpeg(source, str); check(sep, "Expected pattern for separator after '%%'"); str = sep->end; set_range(op, min, max, pat, sep); } else { set_range(op, min, max, pat, NULL); } - debug("min = %lld max = %lld\n", (long long)op->args.repetitions.min, (long long)op->args.repetitions.max); + visualize(source, str, NULL); + //debug("min = %lld max = %lld\n", (long long)op->args.repetitions.min, (long long)op->args.repetitions.max); break; } // Lookbehind case '<': { - debug("Lookbehind\n"); - vm_op_t *pat = compile_bpeg(str); + visualize(source, str, "After "); + //debug("Lookbehind\n"); + vm_op_t *pat = compile_bpeg(source, str); check(pat, "Expected pattern after <"); str = pat->end; check(pat->len != -1, "Lookbehind patterns must have a fixed length"); @@ -517,8 +538,9 @@ static vm_op_t *compile_bpeg(const char *str) } // Lookahead case '>': { - debug("Lookahead\n"); - vm_op_t *pat = compile_bpeg(str); + visualize(source, str, "Before "); + //debug("Lookahead\n"); + vm_op_t *pat = compile_bpeg(source, str); check(pat, "Expected pattern after >"); str = pat->end; op->op = VM_BEFORE; @@ -528,50 +550,57 @@ static vm_op_t *compile_bpeg(const char *str) } // Parentheses case '(': { - debug("Open paren (\n"); + visualize(source, str, NULL); + // debug("Open paren (\n"); free(op); - op = compile_bpeg(str); + op = compile_bpeg(source, str); check(op, "Expected pattern inside parentheses"); - op = expand_choices(op); + op = expand_choices(source, op); str = op->end; str = after_spaces(str); check(matchchar(&str, ')'), "Expected closing parenthesis"); - debug(")\n"); + visualize(source, str, NULL); + // debug(")\n"); break; } // Capture case '@': { - debug("Capture\n"); + //debug("Capture\n"); + visualize(source, str, "Capture"); op->op = VM_CAPTURE; str = after_spaces(str); if (matchchar(&str, '[')) { char *closing = strchr(str, ']'); check(closing, "Expected closing ']'"); op->args.capture.name = strndup(str, (size_t)(closing-str)); - debug("named \"%s\"\n", op->args.capture.name); + visualize(source, str, "Named capture"); + //debug("named \"%s\"\n", op->args.capture.name); str = closing; check(matchchar(&str, ']'), "Expected closing ']'"); } - vm_op_t *pat = compile_bpeg(str); + vm_op_t *pat = compile_bpeg(source, str); check(pat, "Expected pattern after @"); str = pat->end; op->args.capture.capture_pat = pat; op->len = pat->len; + visualize(source, str, NULL); break; } // Replacement case '{': { - debug("Replacement {\n"); + //debug("Replacement {\n"); + visualize(source, str, "Replacement"); str = after_spaces(str); vm_op_t *pat = NULL; if (!matchchar(&str, '~')) { - pat = compile_bpeg(str); + pat = compile_bpeg(source, str); check(pat, "Expected pattern after '{'"); - pat = expand_choices(pat); + pat = expand_choices(source, pat); str = pat->end; str = after_spaces(str); check(matchchar(&str, '~'), "Expected '~' after pattern in replacement"); } + visualize(source, str, NULL); str = after_spaces(str); char quote = *str; @@ -587,22 +616,26 @@ static vm_op_t *compile_bpeg(const char *str) check(str[1], "Expected more string contents after backslash"); ++str; } + visualize(source, str, NULL); } replacement = strndup(replacement, (size_t)(str-replacement)); check(matchchar(&str, quote), "Expected closing quote"); } + visualize(source, str, NULL); check(matchchar(&str, '}'), "Expected a closing '}'"); op->op = VM_REPLACE; op->args.replace.replace_pat = pat; op->args.replace.replacement = replacement; - debug(" rep = \"%s\"\n", replacement); - debug("}\n"); + //debug(" rep = \"%s\"\n", replacement); + //debug("}\n"); if (pat != NULL) op->len = pat->len; + visualize(source, str, NULL); break; } // Whitespace case '_': { - debug("Whitespace\n"); + //debug("Whitespace\n"); + visualize(source, str, NULL); op->op = VM_REF; op->args.s = strdup("_"); break; @@ -610,16 +643,20 @@ static vm_op_t *compile_bpeg(const char *str) default: { // Reference if (isalpha(c)) { + visualize(source, str, "Ref"); --str; const char *refname = str; size_t len = 1; - for (++str; isalnum(*str); ++str) + for (++str; isalnum(*str); ++str) { ++len; + visualize(source, str, NULL); + } op->op = VM_REF; - debug("Ref: %s\n", refname); + //debug("Ref: %s\n", refname); op->args.s = strndup(refname, len); break; } else { + visualize(source, str, "No match"); free(op); return NULL; } @@ -629,11 +666,12 @@ static vm_op_t *compile_bpeg(const char *str) return op; } -static vm_op_t *load_def(const char *name, const char *def) +static vm_op_t *load_def(const char *name, const char *source) { defs[ndefs].name = name; - vm_op_t *op = compile_bpeg(def); - op = expand_choices(op); + defs[ndefs].source = source; + vm_op_t *op = compile_bpeg(source, source); + op = expand_choices(source, op); defs[ndefs].op = op; ++ndefs; return op; @@ -726,7 +764,7 @@ static void print_match(match_t *m, const char *color) } } } else { - if (m->is_capture) printf("\033[0;33m{"); + if (m->is_capture) printf("\033[0;2;33m{"); const char *prev = m->start; for (match_t *child = m->child; child; child = child->nextsibling) { if (child->start > prev) @@ -736,7 +774,7 @@ static void print_match(match_t *m, const char *color) } if (m->end > prev) printf("%s%.*s", color, (int)(m->end - prev), prev); - if (m->is_capture) printf("\033[0;33m}"); + if (m->is_capture) printf("\033[0;2;33m}"); } } @@ -756,18 +794,123 @@ static char *readfile(int fd) return buf; } +static void print_grammar(vm_op_t *op) +{ + switch (op->op) { + case VM_REF: fprintf(stderr, "a $%s", op->args.s); break; + case VM_EMPTY: fprintf(stderr, "empty"); break; + case VM_ANYCHAR: fprintf(stderr, "any char"); break; + case VM_STRING: fprintf(stderr, "string \"%s\"", op->args.s); break; + case VM_RANGE: { + fprintf(stderr, "char from %c-%c", op->args.range.low, op->args.range.high); + break; + } + case VM_REPEAT: { + if (op->args.repetitions.max == -1) + fprintf(stderr, "%ld or more ", op->args.repetitions.min); + else + fprintf(stderr, "%ld-%ld of (", + op->args.repetitions.min, + op->args.repetitions.max); + print_grammar(op->args.repetitions.repeat_pat); + fprintf(stderr, ")"); + if (op->args.repetitions.sep) { + fprintf(stderr, " separated by ("); + print_grammar(op->args.repetitions.sep); + fprintf(stderr, ")"); + } + break; + } + case VM_NOT: { + fprintf(stderr, "not ("); + print_grammar(op->args.pat); + fprintf(stderr, ")"); + break; + } + case VM_UPTO: { + fprintf(stderr, "text up to ("); + print_grammar(op->args.pat); + fprintf(stderr, ")"); + break; + } + case VM_UPTO_AND: { + fprintf(stderr, "text up to and including ("); + print_grammar(op->args.pat); + fprintf(stderr, ")"); + break; + } + case VM_AFTER: { + fprintf(stderr, "after ("); + print_grammar(op->args.pat); + fprintf(stderr, ")"); + break; + } + case VM_BEFORE: { + fprintf(stderr, "before ("); + print_grammar(op->args.pat); + fprintf(stderr, ")"); + break; + } + case VM_CAPTURE: { + fprintf(stderr, "capture ("); + print_grammar(op->args.pat); + fprintf(stderr, ")"); + if (op->args.capture.name) + fprintf(stderr, " and call it %s", op->args.capture.name); + break; + } + case VM_OTHERWISE: { + fprintf(stderr, "("); + print_grammar(op->args.multiple.first); + fprintf(stderr, ") or else "); + if (op->args.multiple.second->op != VM_OTHERWISE) + fprintf(stderr, "("); + print_grammar(op->args.multiple.second); + if (op->args.multiple.second->op != VM_OTHERWISE) + fprintf(stderr, ")"); + break; + } + case VM_CHAIN: { + fprintf(stderr, "("); + print_grammar(op->args.multiple.first); + fprintf(stderr, ") then "); + if (op->args.multiple.second->op != VM_CHAIN) + fprintf(stderr, "("); + print_grammar(op->args.multiple.second); + if (op->args.multiple.second->op != VM_CHAIN) + fprintf(stderr, ")"); + break; + } + case VM_REPLACE: { + fprintf(stderr, "replace "); + if (op->args.replace.replace_pat) { + fprintf(stderr, "("); + print_grammar(op->args.replace.replace_pat); + fprintf(stderr, ")"); + } else + fprintf(stderr, "\"\""); + fprintf(stderr, " with \"%s\"", op->args.replace.replacement); + break; + } + default: break; + } +} + int main(int argc, char *argv[]) { check(argc >= 2, "Usage: bpeg []"); + fprintf(stderr, "========== Compiling ===========\n\n\n\n"); load_defs(); const char *lang = argv[1]; - vm_op_t *op = compile_bpeg(lang); + visualize_delay = 100000; + vm_op_t *op = compile_bpeg(lang, lang); check(op, "Failed to compile_bpeg input"); - op = expand_choices(op); + op = expand_choices(lang, op); const char *defs = op->end; while (matchchar(&defs, ';')) { + fprintf(stderr, "\n"); defs = after_spaces(defs); const char *name = defs; check(isalpha(*name), "Definition must begin with a name"); @@ -780,6 +923,10 @@ int main(int argc, char *argv[]) defs = def->end; } + fprintf(stderr, "\n\n"); + print_grammar(op); + fprintf(stderr, "\n\n"); + char *input; if (argc >= 3) { int fd = open(argv[2], O_RDONLY); diff --git a/bpeg.h b/bpeg.h index fea8fd5..bd86329 100644 --- a/bpeg.h +++ b/bpeg.h @@ -78,19 +78,20 @@ typedef struct vm_op_s { static inline const char *after_spaces(const char *str); static match_t *free_match(match_t *m); static match_t *match(const char *str, vm_op_t *op); -static vm_op_t *compile_bpeg(const char *str); -static vm_op_t *expand_chain(vm_op_t *first); -static vm_op_t *expand_choices(vm_op_t *op); +static vm_op_t *compile_bpeg(const char *source, const char *str); +static vm_op_t *expand_chain(const char *source, vm_op_t *first); +static vm_op_t *expand_choices(const char *source, vm_op_t *op); static void print_match(match_t *m, const char *color); static void set_range(vm_op_t *op, ssize_t min, ssize_t max, vm_op_t *pat, vm_op_t *sep); typedef struct { const char *name; + const char *source; vm_op_t *op; } def_t; -static def_t defs[1024] = {{NULL, NULL}}; +static def_t defs[1024] = {{NULL, NULL, NULL}}; size_t ndefs = 0; -static int verbose = 1; +//static int verbose = 1; diff --git a/utils.h b/utils.h index 96c6de2..58cfdc4 100644 --- a/utils.h +++ b/utils.h @@ -6,6 +6,8 @@ #define check(cond, ...) do { if (!(cond)) { fprintf(stderr, __VA_ARGS__); fwrite("\n", 1, 1, stderr); _exit(1); } } while(0) #define debug(...) do { if (verbose) fprintf(stderr, __VA_ARGS__); } while(0) +static int visualize_delay = -1; + /* * Helper function to skip past all spaces (and comments) * Returns a pointer to the first non-space character. @@ -37,3 +39,16 @@ static inline int matchchar(const char **str, char c) return 0; } } + +static void visualize(const char *source, const char *ptr, const char *msg) +{ + if (visualize_delay < 0) return; + fprintf(stderr, "\033[0;1m\r\033[2A\033[K%.*s\033[0;2m%s\033[0m\n", + (int)(ptr-source), source, ptr); + fprintf(stderr, "\033[0;1m"); + for (--ptr ; ptr > source; --ptr) putc(' ', stderr); + fprintf(stderr, "^\033[K\n"); + if (msg) + fprintf(stderr, "\033[K\033[33;1m%s\033[0m", msg); + usleep(visualize_delay); +}