Major overhaul refactor restructuring

This commit is contained in:
Bruce Hill 2020-09-11 01:28:06 -07:00
parent 8a846230f7
commit 2a11acc667
12 changed files with 1303 additions and 1249 deletions

View File

@ -1,14 +1,21 @@
PREFIX= PREFIX=/usr/local
CFLAGS=-Wall -Wextra -pedantic -Wmissing-prototypes -Wstrict-prototypes CFLAGS=-Wall -Wextra -pedantic -Wmissing-prototypes -Wstrict-prototypes
LDFLAGS=
G ?= G ?=
O ?= -O3 O ?= -O3
CFILES=compiler.c grammar.c utils.c vm.c
OBJFILES=$(CFILES:.c=.o)
all: bpeg all: bpeg
clean: .c.o:
rm -f bpeg cc -c $(CFLAGS) $(G) $(O) -o $@ $<
bpeg: bpeg.c bpeg.h utils.h bpeg: $(OBJFILES) bpeg.c
cc $(CFLAGS) $(G) $(O) $< -o $@ cc $(CFLAGS) $(G) $(O) -o $@ $^
clean:
rm -f bpeg $(OBJFILES)
.PHONY: all clean .PHONY: all clean

View File

@ -1,7 +1,6 @@
# This is a file defining the BPEG grammar using BPEG syntax # This is a file defining the BPEG grammar using BPEG syntax
Grammar; Grammar = __ *Def%(__`;__) ?(`;__);
Grammar = __ @[main-pattern]extended-pat __ *((__`;__) Def) ?(`;__);
Def = @[name]Ref __ `= __ @[definition]extended-pat; Def = @[name]Ref __ `= __ @[definition]extended-pat;
# This is used for command line arguments: # This is used for command line arguments:

1132
bpeg.c

File diff suppressed because it is too large Load Diff

461
compiler.c Normal file
View File

@ -0,0 +1,461 @@
/*
* compiler.c - Compile strings into BPEG virtual machine code.
*/
#include "compiler.h"
#include "utils.h"
static vm_op_t *expand_chain(vm_op_t *first);
static vm_op_t *expand_choices(vm_op_t *first);
static vm_op_t *chain_together(vm_op_t *first, vm_op_t *second);
static void set_range(vm_op_t *op, ssize_t min, ssize_t max, vm_op_t *pat, vm_op_t *sep);
/*
 * Fill in `op` as a VM_REPEAT of `pat` occurring between `min` and `max`
 * times, optionally separated by `sep` (which may be NULL).
 * The opcode gets a fixed length only when the repetition count is exact
 * (min == max >= 0) and both the pattern and the separator have fixed
 * lengths; otherwise its length is -1.
 */
static void set_range(vm_op_t *op, ssize_t min, ssize_t max, vm_op_t *pat, vm_op_t *sep)
{
    int exact_count = (min >= 0) && (min == max);
    int fixed_pieces = (pat->len >= 0) && (sep == NULL || sep->len >= 0);
    ssize_t total = -1;
    if (exact_count && fixed_pieces) {
        total = pat->len * min;
        // N repetitions have N-1 separators between them
        if (sep != NULL && min != 0)
            total += sep->len * (min - 1);
    }

    op->op = VM_REPEAT;
    op->len = total;
    op->args.repetitions.min = min;
    op->args.repetitions.max = max;
    op->args.repetitions.repeat_pat = pat;
    op->args.repetitions.sep = sep;
}
/*
 * Starting from an already-parsed opcode, keep parsing any patterns that
 * directly follow it (e.g. "`x `y") and link them into a chain.
 * Returns the original opcode unchanged when nothing follows it.
 */
static vm_op_t *expand_chain(vm_op_t *first)
{
    vm_op_t *rest = bpeg_simplepattern(first->end);
    if (rest != NULL) {
        // Build the tail of the chain first, then prepend `first`
        rest = expand_chain(rest);
        check(rest->end > first->end, "No forward progress in chain!");
        return chain_together(first, rest);
    }
    return first;
}
/*
 * Starting from an already-parsed opcode, absorb any chained patterns and
 * then any "/"-separated alternatives (e.g. "`x/`y") that follow it.
 * Returns a VM_OTHERWISE opcode when an alternative is present, otherwise
 * the (possibly chained) input.
 */
static vm_op_t *expand_choices(vm_op_t *first)
{
    first = expand_chain(first);
    const char *cursor = first->end;
    if (matchchar(&cursor, '/')) {
        vm_op_t *alternative = bpeg_simplepattern(cursor);
        check(alternative, "Expected pattern after '/'");
        alternative = expand_choices(alternative);

        vm_op_t *either = calloc(1, sizeof(vm_op_t));
        either->op = VM_OTHERWISE;
        either->start = first->start;
        either->end = alternative->end;
        // The choice has a known length only if both branches agree on it
        either->len = (first->len == alternative->len) ? first->len : -1;
        either->args.multiple.first = first;
        either->args.multiple.second = alternative;
        return either;
    }
    return first;
}
/*
 * Join two opcodes into a VM_CHAIN that matches `first` and then `second`.
 * If either argument is NULL the other one is returned as-is.
 */
static vm_op_t *chain_together(vm_op_t *first, vm_op_t *second)
{
    if (!first) return second;
    if (!second) return first;

    vm_op_t *seq = calloc(1, sizeof(vm_op_t));
    seq->op = VM_CHAIN;
    seq->args.multiple.first = first;
    seq->args.multiple.second = second;
    seq->start = first->start;
    seq->end = second->end;
    // A chain is fixed-length only when both of its halves are
    int both_fixed = (first->len >= 0) && (second->len >= 0);
    seq->len = both_fixed ? first->len + second->len : -1;
    return seq;
}
/*
 * Compile one "simple" BPEG pattern (a single term, without following
 * chained patterns or "/" alternatives) from the start of `str` into a
 * virtual machine opcode.
 *
 * Returns a newly allocated opcode whose `start`/`end` delimit the source
 * text that was consumed, or NULL if `str` holds no pattern (end of input,
 * or a character that cannot begin a pattern).  Malformed patterns abort
 * the program via check().
 */
vm_op_t *bpeg_simplepattern(const char *str)
{
    // Skip whitespace/comments before testing for end of input, so that
    // trailing blanks after the last pattern end the parse cleanly instead
    // of being reported as a missing pattern.
    str = after_spaces(str);
    if (!*str) return NULL;
    vm_op_t *op = calloc(sizeof(vm_op_t), 1);
    op->start = str;
    op->len = -1; // unknown length unless a case below says otherwise
    char c = *str;
    ++str;
    switch (c) {
        // Any char (dot) ($. is multiline anychar)
        case '.': {
          anychar:
            op->op = VM_ANYCHAR;
            op->len = 1;
            break;
        }
        // Char literals
        case '`': {
            char literal[2] = {*str, '\0'};
            ++str;
            check(literal[0], "Expected character after '`'\n");
            op->len = 1;
            if (matchchar(&str, '-')) { // Range
                char c2 = *str;
                check(c2, "Expected character after '-'");
                check(c2 >= literal[0], "Character range must be low-to-high");
                op->op = VM_RANGE;
                op->args.range.low = literal[0];
                op->args.range.high = c2;
                ++str;
            } else {
                op->op = VM_STRING;
                op->args.s = strdup(literal);
            }
            break;
        }
        // Escapes
        case '\\': {
            check(*str, "Expected escape after '\\'");
            op->len = 1;
            char e = unescapechar(str, &str);
            if (*str == '-') { // Escape range (e.g. \x00-\xFF)
                ++str;
                char e2 = unescapechar(str, &str);
                check(e2, "Expected character after '-'");
                check(e2 >= e, "Character range must be low-to-high");
                op->op = VM_RANGE;
                op->args.range.low = e;
                op->args.range.high = e2;
            } else {
                char literal[2] = {e, '\0'};
                op->op = VM_STRING;
                op->args.s = strdup(literal);
            }
            break;
        }
        // String literal
        case '"': case '\'': case '\002': {
            char endquote = c == '\002' ? '\003' : c;
            char *literal = (char*)str;
            // Find the closing quote, stepping over backslash-escaped chars
            for (; *str && *str != endquote; str++) {
                if (*str == '\\') {
                    check(str[1], "Expected more string contents after backslash");
                    ++str;
                }
            }
            size_t len = (size_t)(str - literal);
            literal = strndup(literal, len);
            // Unescape in place; the result is never longer than the input
            len = unescape_string(literal, literal, len);
            op->op = VM_STRING;
            op->len = (ssize_t)len;
            op->args.s = literal;
            check(matchchar(&str, endquote), "Missing closing quote");
            break;
        }
        // Not <pat>
        case '!': {
            vm_op_t *p = bpeg_simplepattern(str);
            check(p, "Expected pattern after '!'\n");
            str = p->end;
            op->op = VM_NOT;
            op->len = 0;
            op->args.pat = p;
            break;
        }
        // Anything but <pat>
        case '~': {
            if (matchchar(&str, '~')) op->multiline = 1;
            vm_op_t *p = bpeg_simplepattern(str);
            check(p, "Expected pattern after '~'\n");
            str = p->end;
            op->op = VM_ANYTHING_BUT;
            op->len = -1;
            op->args.pat = p;
            break;
        }
        // Upto and including <pat>
        case '&': {
            if (matchchar(&str, '&')) op->multiline = 1;
            vm_op_t *p = bpeg_simplepattern(str);
            check(p, "Expected pattern after '&'\n");
            str = p->end;
            op->op = VM_UPTO_AND;
            op->len = -1;
            op->args.pat = p;
            break;
        }
        // Number of repetitions: <N>(-<N> / - / + / "")
        case '0': case '1': case '2': case '3': case '4': case '5':
        case '6': case '7': case '8': case '9': {
            ssize_t min = -1, max = -1;
            --str; // re-read the first digit as part of the number
            long n1 = strtol(str, (char**)&str, 10);
            if (matchchar(&str, '-')) {
                str = after_spaces(str);
                const char *start = str;
                long n2 = strtol(str, (char**)&str, 10);
                // "N-" with no upper bound given means "zero up to N"
                if (str == start) min = 0, max = n1;
                else min = n1, max = n2;
            } else if (matchchar(&str, '+')) {
                min = n1, max = -1;
            } else {
                min = n1, max = n1;
            }
            vm_op_t *pat = bpeg_simplepattern(str);
            check(pat, "Expected pattern after repetition count");
            str = pat->end;
            str = after_spaces(str);
            if (matchchar(&str, '%')) {
                vm_op_t *sep = bpeg_simplepattern(str);
                check(sep, "Expected pattern for separator after '%%'");
                str = sep->end;
                set_range(op, min, max, pat, sep);
            } else {
                set_range(op, min, max, pat, NULL);
            }
            break;
        }
        // Special repetitions:
        case '+': case '*': case '?': {
            ssize_t min = -1, max = -1;
            switch (c) {
                case '+': min = 1, max = -1; break;
                case '*': min = 0, max = -1; break;
                case '?': min = 0, max = 1; break;
            }
            vm_op_t *pat = bpeg_simplepattern(str);
            check(pat, "Expected pattern after +");
            str = pat->end;
            str = after_spaces(str);
            if (matchchar(&str, '%')) {
                vm_op_t *sep = bpeg_simplepattern(str);
                check(sep, "Expected pattern for separator after '%%'");
                str = sep->end;
                set_range(op, min, max, pat, sep);
            } else {
                set_range(op, min, max, pat, NULL);
            }
            break;
        }
        // Lookbehind
        case '<': {
            vm_op_t *pat = bpeg_simplepattern(str);
            check(pat, "Expected pattern after <");
            check(pat->len != -1, "Lookbehind patterns must have a fixed length");
            str = pat->end;
            op->op = VM_AFTER;
            op->len = 0;
            op->args.pat = pat;
            break;
        }
        // Lookahead
        case '>': {
            vm_op_t *pat = bpeg_simplepattern(str);
            check(pat, "Expected pattern after >");
            str = pat->end;
            op->op = VM_BEFORE;
            op->len = 0;
            op->args.pat = pat;
            break;
        }
        // Parentheses
        case '(': {
            // The parenthesized pattern itself becomes the result
            free(op);
            op = bpeg_simplepattern(str);
            check(op, "Expected pattern inside parentheses");
            op = expand_choices(op);
            str = op->end;
            str = after_spaces(str);
            check(matchchar(&str, ')'), "Expected closing parenthesis");
            break;
        }
        // Capture
        case '@': {
            op->op = VM_CAPTURE;
            str = after_spaces(str);
            if (matchchar(&str, '[')) {
                char *closing = strchr(str, ']');
                check(closing, "Expected closing ']'");
                op->args.capture.name = strndup(str, (size_t)(closing-str));
                str = closing;
                check(matchchar(&str, ']'), "Expected closing ']'");
            }
            vm_op_t *pat = bpeg_simplepattern(str);
            check(pat, "Expected pattern after @");
            str = pat->end;
            op->args.capture.capture_pat = pat;
            op->len = pat->len;
            break;
        }
        // Replacement
        case '{': {
            str = after_spaces(str);
            vm_op_t *pat = NULL;
            if (strncmp(str, "=>", 2) == 0) {
                // "{=>...}" has no pattern: it replaces the empty string
                str += strlen("=>");
            } else {
                pat = bpeg_simplepattern(str);
                check(pat, "Invalid pattern after '{'");
                pat = expand_choices(pat);
                str = pat->end;
                str = after_spaces(str);
                check(matchchar(&str, '=') && matchchar(&str, '>'),
                      "Expected '=>' after pattern in replacement");
            }
            str = after_spaces(str);
            char quote = *str;
            const char *replacement;
            if (matchchar(&str, '}')) {
                replacement = strdup("");
            } else {
                check(matchchar(&str, '"') || matchchar(&str, '\''),
                      "Expected string literal for replacement");
                replacement = str;
                for (; *str && *str != quote; str++) {
                    if (*str == '\\') {
                        check(str[1], "Expected more string contents after backslash");
                        ++str;
                    }
                }
                replacement = strndup(replacement, (size_t)(str-replacement));
                check(matchchar(&str, quote), "Expected closing quote");
                check(matchchar(&str, '}'), "Expected a closing '}'");
            }
            op->op = VM_REPLACE;
            op->args.replace.replace_pat = pat;
            op->args.replace.replacement = replacement;
            if (pat != NULL) op->len = pat->len;
            break;
        }
        // Special rules:
        case '_': case '^': case '$': {
            if (matchchar(&str, c)) { // double __, ^^, $$
                char tmp[3] = {c, c, '\0'};
                op->args.s = strdup(tmp);
            } else if (c == '$' && matchchar(&str, '.')) { // $. (multi-line anychar)
                op->multiline = 1;
                goto anychar;
            } else {
                op->args.s = strndup(&c, 1);
            }
            op->op = VM_REF;
            break;
        }
        // Empty choice (/) or {/}
        case '/': {
            str = after_spaces(str);
            if (*str == ')' || *str == '}') {
                op->op = VM_EMPTY;
            } else {
                free(op);
                return NULL;
            }
            break;
        }
        default: {
            // Reference
            if (isalpha((unsigned char)c)) {
                --str;
                const char *refname = str;
                str = after_name(str);
                op->op = VM_REF;
                // op->len stays -1: the length of the *name* says nothing
                // about how much text the referenced rule will match.
                op->args.s = strndup(refname, (size_t)(str - refname));
                break;
            } else {
                free(op);
                return NULL;
            }
        }
    }
    op->end = str;
    return op;
}
/*
 * Similar to bpeg_simplepattern, except that the pattern begins with an
 * implicit, unclosable quote: the text is matched literally, and a
 * backslash introduces an interpolated BPEG pattern, optionally terminated
 * by a ';' placed directly after it.
 * Returns the chain of literal/interpolated opcodes, or NULL for an empty
 * string.
 */
vm_op_t *bpeg_stringpattern(const char *str)
{
    vm_op_t *ret = NULL;
    while (*str) {
        // Scan the literal text up to the next backslash (or end of input)
        const char *literal_start = str;
        vm_op_t *interp = NULL;
        for (; *str; str++) {
            if (*str == '\\') {
                check(str[1], "Expected more string contents after backslash");
                interp = bpeg_simplepattern(str + 1);
                check(interp != NULL, "No valid BPEG pattern detected after backslash");
                break;
            }
        }
        // Only build a string opcode (and its copy of the text) when there
        // is literal text, so nothing is allocated just to be thrown away.
        size_t len = (size_t)(str - literal_start);
        if (len > 0) {
            char *literal = strndup(literal_start, len);
            vm_op_t *strop = calloc(sizeof(vm_op_t), 1);
            strop->op = VM_STRING;
            strop->start = literal_start;
            strop->end = str;
            strop->len = (ssize_t)unescape_string(literal, literal, len);
            strop->args.s = literal;
            ret = chain_together(ret, strop);
        }
        if (interp) {
            ret = chain_together(ret, interp);
            str = interp->end;
            // Allow a terminating ';', but only directly after the
            // interpolation: whitespace here is literal text to be matched
            // and must not be skipped.
            if (*str == ';') ++str;
        }
    }
    return ret;
}
/*
 * Given a pattern and a replacement string, compile the two into a
 * replacement VM opcode.  The opcode covers the same source span as `pat`
 * and stores its own copy of `replacement`.  A replacement that ends in a
 * lone backslash aborts the program via check().
 */
vm_op_t *bpeg_replacement(vm_op_t *pat, const char *replacement)
{
    vm_op_t *op = calloc(sizeof(vm_op_t), 1);
    op->op = VM_REPLACE;
    op->start = pat->start;
    // Record where the wrapped pattern ends, as the other composite
    // opcodes built in this file do, rather than leaving `end` NULL.
    op->end = pat->end;
    op->len = pat->len;
    op->args.replace.replace_pat = pat;
    // Walk to the end of the string, stepping over escaped characters so
    // that a dangling backslash is detected.
    const char *p = replacement;
    for (; *p; p++) {
        if (*p == '\\') {
            check(p[1], "Expected more string contents after backslash");
            ++p;
        }
    }
    replacement = strndup(replacement, (size_t)(p-replacement));
    op->args.replace.replacement = replacement;
    return op;
}
/*
 * Compile a full BPEG pattern: one simple pattern plus any chained
 * patterns and "/" alternatives that follow it.
 * Returns NULL when `str` does not start with a pattern.
 */
vm_op_t *bpeg_pattern(const char *str)
{
    vm_op_t *first = bpeg_simplepattern(str);
    if (first == NULL)
        return NULL;
    return expand_choices(first);
}

16
compiler.h Normal file
View File

@ -0,0 +1,16 @@
/*
* compiler.h - Header file for BPEG compiler.
*/
#ifndef COMPILER__H
#define COMPILER__H
#include <stdlib.h>
#include "types.h"
vm_op_t *bpeg_simplepattern(const char *str);
vm_op_t *bpeg_stringpattern(const char *str);
vm_op_t *bpeg_replacement(vm_op_t *pat, const char *replacement);
vm_op_t *bpeg_pattern(const char *str);
#endif

93
grammar.c Normal file
View File

@ -0,0 +1,93 @@
/*
* grammar.c - Code for defining grammars (sets of rules)
*/
#include "grammar.h"
#include "compiler.h"
#include "utils.h"
/*
 * Source text of the built-in rule definitions, written in BPEG syntax as
 * a sequence of "name=pattern;" entries.  new_grammar() loads this string
 * into every grammar it creates.  The special names "$", "$$", "^", "^^",
 * "_" and "__" are defined here as ordinary rules.
 */
const char *BPEG_BUILTIN_GRAMMAR = (
"crlf=\\r\\n;\n"
"cr=\\r;\n" "r=\\r;\n"
"anglebraces=`< *(anglebraces / ~~`>) `>;\n"
"brackets=`[ *(brackets / ~~`]) `];\n"
"braces=`{ *(braces / ~~`}) `};\n"
"parens=`( *(parens / ~~`)) `);\n"
"id=(`a-z/`A-Z/`_) *(`a-z/`A-Z/`_/`0-9);\n"
"HEX=`0-9/`A-F;\n"
"Hex=`0-9/`a-f/`A-F;\n"
"hex=`0-9/`a-f;\n"
"number=+`0-9 ?(`. *`0-9) / `. +`0-9;\n"
"int=+`0-9;\n"
"digit=`0-9;\n"
"Abc=`a-z/`A-Z;\n"
"ABC=`A-Z;\n"
"abc=`a-z;\n"
"esc=\\e;\n" "e=\\e;\n"
"tab=\\t;\n" "t=\\t;\n"
"nl=\\n;\n" "lf=\\n;\n" "n=\\n;\n"
"c-block-comment='/*' &&'*/';\n"
"c-line-comment='//' &$;\n"
"c-comment=c-line-comment / c-block-comment;\n"
"hash-comment=`# &$;\n"
"comment=!(/);\n" // No default definition, can be overridden
"WS=` /\\t/\\n/\\r/comment;\n"
"ws=` /\\t;\n"
// "$." is the multi-line "any char", so "$$" and "^^" treat newlines as
// ordinary characters, while "$" and "^" use the single-line "."
"$$=!$.;\n"
"$=!.;\n"
"^^=!<$.;\n"
"^=!<.;\n"
"__=*(` /\\t/\\n/\\r/comment);\n"
"_=*(` /\\t);\n"
);
grammar_t *new_grammar(void)
{
grammar_t *g = calloc(sizeof(grammar_t), 1);
g->definitions = calloc(sizeof(def_t), (g->capacity = 128));
load_grammar(g, BPEG_BUILTIN_GRAMMAR);
return g;
}
/*
 * Append a definition (its source text, name and compiled opcode) to the
 * grammar, growing the definitions array by 32 slots whenever it is full.
 * The grammar stores the given pointers; it does not copy the strings.
 * Aborts the program via check() if the array cannot be grown.
 */
void add_def(grammar_t *g, const char *src, const char *name, vm_op_t *op)
{
    if (g->size >= g->capacity) {
        // realloc() takes a size in bytes, so scale the slot count by the
        // element size; keep the old pointer until the call has succeeded.
        size_t new_capacity = g->capacity + 32;
        def_t *grown = realloc(g->definitions, new_capacity * sizeof(def_t));
        check(grown != NULL, "Out of memory while adding a definition");
        g->definitions = grown;
        g->capacity = new_capacity;
    }
    def_t *slot = &g->definitions[g->size];
    slot->source = src;
    slot->name = name;
    slot->op = op;
    ++g->size;
}
/*
 * Parse `src` as a sequence of ';'-separated "name = pattern" definitions
 * and add each one to the grammar.  The first definition parsed by this
 * call also becomes the grammar's main pattern.  Parsing stops at the end
 * of the text or at a definition that is not followed by ';'.
 */
void load_grammar(grammar_t *g, const char *src)
{
    int is_first = 1;
    for (;;) {
        src = after_spaces(src);
        if (*src == '\0') break;

        // Definition name
        const char *name_start = src;
        const char *name_end = after_name(name_start);
        check(name_end > name_start, "Invalid name for definition");
        const char *name = strndup(name_start, (size_t)(name_end-name_start));

        // '=' followed by the pattern
        src = after_spaces(name_end);
        check(matchchar(&src, '='), "Expected '=' in definition");
        vm_op_t *op = bpeg_pattern(src);
        check(op, "Couldn't load definition");
        add_def(g, src, name, op);
        if (is_first) {
            g->pattern = op;
            is_first = 0;
        }

        src = op->end;
        if (*src == '\0' || !matchchar(&src, ';')) break;
    }
}
/*
 * Print a human-readable description of the grammar's main pattern, if it
 * has one.
 */
void print_grammar(grammar_t *g)
{
    vm_op_t *top = g->pattern;
    if (top != NULL)
        print_pattern(top);
}

18
grammar.h Normal file
View File

@ -0,0 +1,18 @@
/*
* grammar.h - Header file defining grammars (sets of rule definitions)
*/
#ifndef GRAMMAR__H
#define GRAMMAR__H
#include <stdlib.h>
#include <string.h>
#include "types.h"
grammar_t *new_grammar(void);
void add_def(grammar_t *g, const char *src, const char *name, vm_op_t *op);
void load_grammar(grammar_t *g, const char *source);
void print_grammar(grammar_t *g);
#endif

View File

@ -1,24 +1,10 @@
/* /*
* bpeg.h - Header file for the bpeg parser * types.h - Datatypes used by BPEG
*/ */
#include <ctype.h> #ifndef TYPES__H
#include <fcntl.h> #define TYPES__H
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "utils.h" #include <sys/types.h>
const char *usage = (
"Usage:\n"
" bpeg [flags] <pattern> [<input files>...]\n\n"
"Flags:\n"
" -h --help\t print the usage and quit\n"
" -v --verbose\t print verbose debugging info\n"
" -s --slow\t run in slow mode for debugging\n"
" -r --replace <replacement> replace the input pattern with the given replacement\n"
" -g --grammar <grammar file> use the specified file as a grammar\n");
/* /*
* BPEG virtual machine opcodes * BPEG virtual machine opcodes
@ -87,20 +73,6 @@ typedef struct match_s {
vm_op_t *op; vm_op_t *op;
} match_t; } match_t;
static inline const char *after_spaces(const char *str);
static match_t *free_match(match_t *m);
static match_t *match(const char *str, vm_op_t *op);
static vm_op_t *compile_bpeg(const char *source, const char *str);
static vm_op_t *load_grammar(const char *grammar);
static vm_op_t *add_def(const char *name, const char *source, vm_op_t *op);
static vm_op_t *load_def(const char *name, const char *source);
static vm_op_t *chain_together(vm_op_t *first, vm_op_t *second);
static vm_op_t *compile_bpeg_string(const char *source, const char *str);
static vm_op_t *expand_chain(const char *source, vm_op_t *first);
static vm_op_t *expand_choices(const char *source, vm_op_t *op);
static void print_match(match_t *m, const char *color);
static void set_range(vm_op_t *op, ssize_t min, ssize_t max, vm_op_t *pat, vm_op_t *sep);
typedef struct { typedef struct {
const char *name; const char *name;
@ -108,5 +80,10 @@ typedef struct {
vm_op_t *op; vm_op_t *op;
} def_t; } def_t;
static def_t defs[1024] = {{NULL, NULL, NULL}}; typedef struct {
size_t ndefs = 0; vm_op_t *pattern;
def_t *definitions;
size_t size, capacity;
} grammar_t;
#endif

171
utils.c Normal file
View File

@ -0,0 +1,171 @@
/*
* utils.c - Some helper code for debugging and error logging.
*/
#include "utils.h"
/*
 * Skip over whitespace (space, tab, CR, LF) and '#' comments, which run to
 * the end of their line.
 * Returns a pointer to the first character that is neither.
 */
const char *after_spaces(const char *str)
{
    for (;;) {
        char ch = *str;
        if (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n') {
            ++str;
        } else if (ch == '#') {
            // Advance to the newline (or end of text) that ends the
            // comment; the newline itself is skipped on the next pass.
            do {
                ++str;
            } while (*str && *str != '\n');
        } else {
            return str;
        }
    }
}
/*
 * Return a pointer to the first character after a valid BPEG name at the
 * start of `str`, or NULL if `str` does not start with a name.
 * A name is either one of the special symbols '^', '_' or '$' (optionally
 * doubled), or a letter followed by any number of letters, digits and '-'.
 */
const char *after_name(const char *str)
{
    if (*str == '^' || *str == '_' || *str == '$') {
        return (str[1] == *str) ? &str[2] : &str[1];
    }
    // The <ctype.h> functions require an unsigned char value (or EOF), so
    // convert before calling them: plain char may be negative.
    if (!isalpha((unsigned char)*str)) return NULL;
    for (++str; *str; ++str) {
        if (!(isalnum((unsigned char)*str) || *str == '-'))
            break;
    }
    return str;
}
/*
 * Skip whitespace and comments at *str, then test whether the next
 * character is `c`.  On a match, *str is moved past that character and 1
 * is returned; otherwise 0 is returned and *str is left pointing just
 * after the skipped whitespace.
 */
int matchchar(const char **str, char c)
{
    const char *here = after_spaces(*str);
    int found = (*here == c);
    *str = found ? here + 1 : here;
    return found;
}
/*
 * Process a string escape sequence for a character and return the
 * character that was escaped.  `escaped` points at the character just
 * *after* the backslash.
 * Recognized: \a \b \n \r \t \v \e, \xHH (exactly two hex digits) and
 * one to three octal digits (the third digit is only consumed when the
 * value stays below 256).  Anything else, including "x" without two hex
 * digits, stands for itself.
 * Set *end = the first character past the end of the escape sequence.
 */
char unescapechar(const char *escaped, const char **end)
{
    size_t len = 1;
    char ret = *escaped;
    switch (*escaped) {
        case 'a': ret = '\a'; break; case 'b': ret = '\b'; break;
        case 'n': ret = '\n'; break; case 'r': ret = '\r'; break;
        case 't': ret = '\t'; break; case 'v': ret = '\v'; break;
        case 'e': ret = '\033'; break;
        case 'x': { // Hex
            // One entry per possible byte value.  Non-hex characters map
            // to 0; '0' maps to 0x10 so that it is still "truthy", and
            // only the low nibble is used when building the result.
            static const char hextable[256] = {
                ['0']=0x10, ['1']=0x1, ['2']=0x2, ['3']=0x3, ['4']=0x4,
                ['5']=0x5, ['6']=0x6, ['7']=0x7, ['8']=0x8, ['9']=0x9,
                ['a']=0xa, ['b']=0xb, ['c']=0xc, ['d']=0xd, ['e']=0xe, ['f']=0xf,
                ['A']=0xa, ['B']=0xb, ['C']=0xc, ['D']=0xd, ['E']=0xe, ['F']=0xf,
            };
            // Index with unsigned char so bytes >= 0x80 cannot produce a
            // negative index.  The && also stops before escaped[2] is
            // read when escaped[1] is the terminator.
            if (hextable[(unsigned char)escaped[1]] && hextable[(unsigned char)escaped[2]]) {
                int hi = hextable[(unsigned char)escaped[1]] & 0xF;
                int lo = hextable[(unsigned char)escaped[2]] & 0xF;
                ret = (char)((hi << 4) | lo);
                len = 3;
            }
            break;
        }
        case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': { // Octal
            // Accumulate in an int so a three-digit value cannot overflow
            int value = escaped[0] - '0';
            if ('0' <= escaped[1] && escaped[1] <= '7') {
                ++len;
                value = (value << 3) | (escaped[1] - '0');
                if ('0' <= escaped[2] && escaped[2] <= '7' && (value << 3) < 256) {
                    ++len;
                    value = (value << 3) | (escaped[2] - '0');
                }
            }
            ret = (char)value;
            break;
        }
        default: break;
    }
    *end = &escaped[len];
    return ret;
}
/*
 * Write an unescaped version of `src` to `dest`: at most `bufsize`
 * characters followed by a terminating null byte, so `dest` must have
 * room for bufsize+1 bytes.  `dest` may be the same buffer as `src`,
 * because the output is never longer than the input consumed so far.
 * Recognized escapes: \a \b \n \r \t \v \e, \xHH and up to three octal
 * digits.  Any other escaped character (including "x" without two hex
 * digits) stands for itself, and a lone backslash at the very end of the
 * string is kept literally.
 * Returns the number of characters written, not counting the null byte.
 */
size_t unescape_string(char *dest, const char *src, size_t bufsize)
{
    // One entry per possible byte value.  Non-hex characters map to 0;
    // '0' maps to 0x10 so that it is still "truthy", and only the low
    // nibble is used when building the result.
    static const char hextable[256] = {
        ['0']=0x10, ['1']=0x1, ['2']=0x2, ['3']=0x3, ['4']=0x4,
        ['5']=0x5, ['6']=0x6, ['7']=0x7, ['8']=0x8, ['9']=0x9,
        ['a']=0xa, ['b']=0xb, ['c']=0xc, ['d']=0xd, ['e']=0xe, ['f']=0xf,
        ['A']=0xa, ['B']=0xb, ['C']=0xc, ['D']=0xd, ['E']=0xe, ['F']=0xf,
    };
    size_t len = 0;
#define PUT(c) do { *(dest++) = (char)(c); ++len; } while (0)
    for ( ; *src && len < bufsize; ++src) {
        if (*src != '\\') {
            PUT(*src);
            continue;
        }
        if (src[1] == '\0') {
            // Nothing follows the backslash: emit it as-is and stop here
            // rather than stepping past the end of the string.
            PUT('\\');
            break;
        }
        ++src;
        switch (*src) {
            case 'a': PUT('\a'); break; case 'b': PUT('\b'); break;
            case 'n': PUT('\n'); break; case 'r': PUT('\r'); break;
            case 't': PUT('\t'); break; case 'v': PUT('\v'); break;
            case 'e': PUT('\033'); break;
            case 'x': { // Hex
                // Index with unsigned char so bytes >= 0x80 cannot give a
                // negative index.  The && also stops before src[2] is
                // read when src[1] is the terminator.
                if (hextable[(unsigned char)src[1]] && hextable[(unsigned char)src[2]]) {
                    int hi = hextable[(unsigned char)src[1]] & 0xF;
                    int lo = hextable[(unsigned char)src[2]] & 0xF;
                    PUT((hi << 4) | lo);
                    src += 2;
                } else {
                    // Not a valid \xHH: treat it like any other unknown
                    // escape, as unescapechar() does.
                    PUT('x');
                }
                break;
            }
            case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': { // Octal
                int c = *src - '0';
                if ('0' <= src[1] && src[1] <= '7') {
                    ++src;
                    c = (c << 3) | (*src - '0');
                    // Only take a third digit if the value still fits a byte
                    if ('0' <= src[1] && src[1] <= '7' && (c << 3) < 256) {
                        ++src;
                        c = (c << 3) | (*src - '0');
                    }
                }
                PUT(c);
                break;
            }
            default: PUT(*src); break;
        }
    }
    *dest = '\0';
    return len;
#undef PUT
}
/*
 * Read everything available from a file descriptor into memory.
 * Returns a newly allocated, null-terminated buffer that the caller must
 * free(), or NULL if memory could not be allocated.  If read() reports an
 * error, whatever was read up to that point is returned.
 */
char *readfile(int fd)
{
    size_t capacity = 1000, len = 0;
    // Always keep one byte beyond `capacity` for the terminator
    char *buf = calloc(capacity + 1, sizeof(char));
    if (buf == NULL) return NULL;
    ssize_t just_read;
    while ((just_read = read(fd, &buf[len], capacity - len)) > 0) {
        len += (size_t)just_read;
        if (len >= capacity) {
            capacity *= 2;
            char *bigger = realloc(buf, capacity + 1);
            if (bigger == NULL) {
                free(buf);
                return NULL;
            }
            buf = bigger;
        }
    }
    // realloc() does not zero the new space, so terminate explicitly
    buf[len] = '\0';
    return buf;
}

129
utils.h
View File

@ -1,120 +1,27 @@
/* /*
* utils.h - Some helper code for debugging and error logging. * utils.h - Some utility and printing functions.
*/ */
#ifndef UTILS__H
#define UTILS__H
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "vm.h"
#define streq(a, b) (strcmp(a, b) == 0) #define streq(a, b) (strcmp(a, b) == 0)
// TODO: better error reporting // TODO: better error reporting
#define check(cond, ...) do { if (!(cond)) { fprintf(stderr, __VA_ARGS__); fwrite("\n", 1, 1, stderr); _exit(1); } } while(0) #define check(cond, ...) do { if (!(cond)) { fprintf(stderr, __VA_ARGS__); fwrite("\n", 1, 1, stderr); _exit(1); } } while(0)
#define debug(...) do { if (verbose) fprintf(stderr, __VA_ARGS__); } while(0) #define debug(...) do { if (verbose) fprintf(stderr, __VA_ARGS__); } while(0)
static int verbose = 0; char *readfile(int fd);
static int visualize_delay = -1; char unescapechar(const char *escaped, const char **end);
const char *after_name(const char *str);
const char *after_spaces(const char *str);
int matchchar(const char **str, char c);
size_t unescape_string(char *dest, const char *src, size_t bufsize);
/* #endif
* Helper function to skip past all spaces (and comments)
* Returns a pointer to the first non-space character.
*/
static inline const char *after_spaces(const char *str)
{
// Skip whitespace and comments:
skip_whitespace:
switch (*str) {
case ' ': case '\r': case '\n': case '\t': {
++str;
goto skip_whitespace;
}
case '#': {
while (*str && *str != '\n') ++str;
goto skip_whitespace;
}
}
return str;
}
static inline const char *after_name(const char *str)
{
if (!isalpha(*str)) return NULL;
for (++str; *str; ++str) {
if (!(isalnum(*str) || *str == '-'))
break;
}
return str;
}
static inline int matchchar(const char **str, char c)
{
*str = after_spaces(*str);
if (**str == c) {
++(*str);
return 1;
} else {
return 0;
}
}
static void visualize(const char *source, const char *ptr, const char *msg)
{
if (!verbose) return;
fprintf(stderr, "\033[0;1m\r\033[2A\033[K%.*s\033[0;2m%s\033[0m\n",
(int)(ptr-source), source, ptr);
fprintf(stderr, "\033[0;1m");
for (--ptr ; ptr > source; --ptr) putc(' ', stderr);
fprintf(stderr, "^\033[K\n");
if (msg)
fprintf(stderr, "\033[K\033[33;1m%s\033[0m", msg);
if (visualize_delay > 0)
usleep(visualize_delay);
}
/*
* Write an unescaped version of `src` to `dest` (at most bufsize-1 chars,
* terminated by a null byte)
*/
static size_t unescape_string(char *dest, const char *src, size_t bufsize)
{
size_t len = 0;
#define PUT(c) do { *(dest++) = (char)(c); ++len; } while (0)
for ( ; *src && len < bufsize; ++src) {
if (*src != '\\') {
PUT(*src);
continue;
}
++src;
switch (*src) {
case 'a': PUT('\a'); break; case 'b': PUT('\b'); break;
case 'n': PUT('\n'); break; case 'r': PUT('\r'); break;
case 't': PUT('\t'); break; case 'v': PUT('\v'); break;
case 'e': PUT('\033'); break;
case 'x': { // Hex
static const char hextable[255] = {
['0']=0x10, ['1']=0x1, ['2']=0x2, ['3']=0x3, ['4']=0x4,
['5']=0x5, ['6']=0x6, ['7']=0x7, ['8']=0x8, ['9']=0x9,
['a']=0xa, ['b']=0xb, ['c']=0xc, ['d']=0xd, ['e']=0xe, ['f']=0xf,
['A']=0xa, ['B']=0xb, ['C']=0xc, ['D']=0xd, ['E']=0xe, ['F']=0xf,
};
if (hextable[(int)src[1]] && hextable[(int)src[2]]) {
PUT((hextable[(int)src[1]] << 4) | (hextable[(int)src[2]] & 0xF));
src += 2;
}
break;
}
case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': { // Octal
int c = *src - '0';
if ('0' <= src[1] && src[1] <= '7') {
++src;
c = (c << 3) | (*src - '0');
if ('0' <= src[1] && src[1] <= '7' && (c << 3) < 256) {
++src;
c = (c << 3) | (*src - '0');
}
}
PUT(c);
break;
}
default: PUT(*src); break;
}
}
*dest = '\0';
return len;
#undef PUT
}

449
vm.c Normal file
View File

@ -0,0 +1,449 @@
/*
* vm.c - Code for the BPEG virtual machine that performs the matching.
*/
#include "vm.h"
#include "utils.h"
/*
 * Recursively deallocate a match object, together with its children and
 * following siblings, and set the caller's pointer to NULL.
 * Passing NULL, or a pointer to NULL, is a no-op.
 */
void destroy_match(match_t **m)
{
    if (!m || !*m) return;
    destroy_match(&((*m)->child));
    destroy_match(&((*m)->nextsibling));
    // Release the node itself; without this every match object leaks
    free(*m);
    *m = NULL;
}
/*
 * Run virtual machine operation against a string and return
 * a match struct, or NULL if no match is found.
 * `g` supplies the rule definitions used to resolve VM_REF opcodes.
 * The returned match tree should be released with destroy_match() to
 * avoid memory leaking.  An unknown opcode or an undefined rule name
 * aborts the program.
 */
match_t *match(grammar_t *g, const char *str, vm_op_t *op)
{
    switch (op->op) {
        case VM_EMPTY: {
            match_t *m = calloc(sizeof(match_t), 1);
            m->op = op;
            m->start = str;
            m->end = str;
            return m;
        }
        case VM_ANYCHAR: {
            // A newline only counts as "any char" in multiline mode
            if (!*str || (!op->multiline && *str == '\n'))
                return NULL;
            match_t *m = calloc(sizeof(match_t), 1);
            m->op = op;
            m->start = str;
            m->end = str+1;
            return m;
        }
        case VM_STRING: {
            if (strncmp(str, op->args.s, op->len) != 0)
                return NULL;
            match_t *m = calloc(sizeof(match_t), 1);
            m->op = op;
            m->start = str;
            m->end = str + op->len;
            return m;
        }
        case VM_RANGE: {
            if (*str < op->args.range.low || *str > op->args.range.high)
                return NULL;
            match_t *m = calloc(sizeof(match_t), 1);
            m->op = op;
            m->start = str;
            m->end = str + 1;
            return m;
        }
        case VM_NOT: case VM_ANYTHING_BUT: {
            // "Anything but" consumes one character, so one must exist
            if (op->op == VM_ANYTHING_BUT)
                if (!*str || (!op->multiline && *str == '\n'))
                    return NULL;
            match_t *m = match(g, str, op->args.pat);
            if (m != NULL) {
                destroy_match(&m);
                return NULL;
            }
            m = calloc(sizeof(match_t), 1);
            m->op = op;
            m->start = str;
            if (op->op == VM_ANYTHING_BUT) ++str;
            m->end = str;
            return m;
        }
        case VM_UPTO_AND: {
            match_t *m = calloc(sizeof(match_t), 1);
            m->start = str;
            m->op = op;
            match_t *p = NULL;
            // Try the pattern at each successive position until it matches
            // or the scan can no longer advance (end of text, or end of
            // line when not in multiline mode).
            for (const char *prev = NULL; p == NULL && prev < str; ) {
                prev = str;
                p = match(g, str, op->args.pat);
                if (*str && (op->multiline || *str != '\n'))
                    ++str;
            }
            if (p) {
                m->end = p->end;
                m->child = p;
                return m;
            }
            destroy_match(&m);
            return NULL;
        }
        case VM_REPEAT: {
            match_t *m = calloc(sizeof(match_t), 1);
            m->start = str;
            m->end = str;
            m->op = op;
            if (op->args.repetitions.max == 0) return m;

            match_t **dest = &m->child;

            // `prev` is the end of the last repetition that was accepted
            const char *prev = str;
            size_t reps;
            // A max of -1 ("unbounded") becomes the largest size_t here
            for (reps = 0; reps < (size_t)op->args.repetitions.max; ++reps) {
                // Separator
                match_t *sep = NULL;
                if (op->args.repetitions.sep != NULL && reps > 0) {
                    sep = match(g, str, op->args.repetitions.sep);
                    if (sep == NULL) break;
                    str = sep->end;
                }
                match_t *p = match(g, str, op->args.repetitions.repeat_pat);
                if (p == NULL || (p->end == prev && reps > 0)) { // Prevent infinite loops
                    destroy_match(&sep);
                    destroy_match(&p);
                    // Rewind to the end of the last accepted repetition so
                    // that a separator with no pattern after it is not
                    // consumed as part of this match.
                    str = prev;
                    break;
                }
                if (sep) {
                    *dest = sep;
                    dest = &sep->nextsibling;
                }
                *dest = p;
                dest = &p->nextsibling;
                str = p->end;
                prev = str;
            }

            if ((ssize_t)reps < op->args.repetitions.min) {
                destroy_match(&m);
                return NULL;
            }
            m->end = str;
            return m;
        }
        case VM_AFTER: {
            ssize_t backtrack = op->args.pat->len;
            check(backtrack != -1, "'<' is only allowed for fixed-length operations");
            // Check for necessary space: the `backtrack` characters before
            // `str` must all be real text.  (The character at `str` itself
            // is irrelevant, so lookbehind also works at end of input.)
            // NOTE(review): like the match below, this reads before `str`
            // and relies on a '\0' preceding the text -- confirm that the
            // caller provides one.
            for (ssize_t i = 1; i <= backtrack; i++) {
                if (str[-i] == '\0') return NULL;
            }
            match_t *before = match(g, str - backtrack, op->args.pat);
            if (before == NULL) return NULL;
            destroy_match(&before);
            match_t *m = calloc(sizeof(match_t), 1);
            m->start = str;
            m->end = str;
            m->op = op;
            return m;
        }
        case VM_BEFORE: {
            match_t *after = match(g, str, op->args.pat);
            if (after == NULL) return NULL;
            destroy_match(&after);
            match_t *m = calloc(sizeof(match_t), 1);
            m->start = str;
            m->end = str;
            m->op = op;
            return m;
        }
        case VM_CAPTURE: {
            // Read the same field the compiler stores the pattern in
            match_t *p = match(g, str, op->args.capture.capture_pat);
            if (p == NULL) return NULL;
            match_t *m = calloc(sizeof(match_t), 1);
            m->start = str;
            m->end = p->end;
            m->op = op;
            m->child = p;
            m->is_capture = 1;
            if (op->args.capture.name)
                m->name_or_replacement = op->args.capture.name;
            return m;
        }
        case VM_OTHERWISE: {
            match_t *m = match(g, str, op->args.multiple.first);
            if (m == NULL) m = match(g, str, op->args.multiple.second);
            return m;
        }
        case VM_CHAIN: {
            match_t *m1 = match(g, str, op->args.multiple.first);
            if (m1 == NULL) return NULL;
            match_t *m2 = match(g, m1->end, op->args.multiple.second);
            if (m2 == NULL) {
                destroy_match(&m1);
                return NULL;
            }
            match_t *m = calloc(sizeof(match_t), 1);
            m->start = str;
            m->end = m2->end;
            m->op = op;
            m->child = m1;
            m1->nextsibling = m2;
            return m;
        }
        case VM_REPLACE: {
            // Match the pattern (if any) before allocating the result, so
            // that nothing is leaked when the pattern fails.
            match_t *p = NULL;
            if (op->args.replace.replace_pat) {
                p = match(g, str, op->args.replace.replace_pat);
                if (p == NULL) return NULL;
            }
            match_t *m = calloc(sizeof(match_t), 1);
            m->start = str;
            m->op = op;
            if (p) {
                m->child = p;
                m->end = p->end;
            } else {
                m->end = m->start;
            }
            m->is_replacement = 1;
            m->name_or_replacement = op->args.replace.replacement;
            return m;
        }
        case VM_REF: {
            // Search backwards so newer defs take precedence
            for (int i = g->size-1; i >= 0; i--) {
                if (streq(g->definitions[i].name, op->args.s)) {
                    // Bingo!
                    match_t *p = match(g, str, g->definitions[i].op);
                    if (p == NULL) return NULL;
                    match_t *m = calloc(sizeof(match_t), 1);
                    m->start = p->start;
                    m->end = p->end;
                    m->op = op;
                    m->child = p;
                    m->name_or_replacement = g->definitions[i].name;
                    m->is_ref = 1;
                    return m;
                }
            }
            check(0, "Unknown identifier: '%s'", op->args.s);
            return NULL;
        }
        default: {
            fprintf(stderr, "Unknown opcode: %d", op->op);
            _exit(1);
            return NULL;
        }
    }
}
/*
 * Write a human-readable, English description of a compiled pattern (and,
 * recursively, its sub-patterns) to stderr.  Opcodes without a case below
 * print nothing.
 */
void print_pattern(vm_op_t *op)
{
    switch (op->op) {
        case VM_REF: fprintf(stderr, "a $%s", op->args.s); break;
        case VM_EMPTY: fprintf(stderr, "the empty string"); break;
        case VM_ANYCHAR: fprintf(stderr, "any char"); break;
        case VM_STRING: fprintf(stderr, "string \"%s\"", op->args.s); break;
        case VM_RANGE: {
            fprintf(stderr, "char from %c-%c", op->args.range.low, op->args.range.high);
            break;
        }
        case VM_REPEAT: {
            // Cast the counts to long so the arguments always match "%ld",
            // whatever integer type the counts are stored as.
            if (op->args.repetitions.max == -1)
                fprintf(stderr, "%ld or more (", (long)op->args.repetitions.min);
            else
                fprintf(stderr, "%ld-%ld of (",
                        (long)op->args.repetitions.min,
                        (long)op->args.repetitions.max);
            print_pattern(op->args.repetitions.repeat_pat);
            fprintf(stderr, ")");
            if (op->args.repetitions.sep) {
                fprintf(stderr, " separated by (");
                print_pattern(op->args.repetitions.sep);
                fprintf(stderr, ")");
            }
            break;
        }
        case VM_NOT: {
            fprintf(stderr, "not (");
            print_pattern(op->args.pat);
            fprintf(stderr, ")");
            break;
        }
        case VM_UPTO_AND: {
            fprintf(stderr, "text up to and including (");
            print_pattern(op->args.pat);
            fprintf(stderr, ")");
            break;
        }
        case VM_ANYTHING_BUT: {
            fprintf(stderr, "anything but (");
            print_pattern(op->args.pat);
            fprintf(stderr, ")");
            break;
        }
        case VM_AFTER: {
            fprintf(stderr, "after (");
            print_pattern(op->args.pat);
            fprintf(stderr, ")");
            break;
        }
        case VM_BEFORE: {
            fprintf(stderr, "before (");
            print_pattern(op->args.pat);
            fprintf(stderr, ")");
            break;
        }
        case VM_CAPTURE: {
            fprintf(stderr, "capture (");
            // Read the same field the compiler stores the pattern in
            print_pattern(op->args.capture.capture_pat);
            fprintf(stderr, ")");
            if (op->args.capture.name)
                fprintf(stderr, " and call it %s", op->args.capture.name);
            break;
        }
        case VM_OTHERWISE: {
            fprintf(stderr, "(");
            print_pattern(op->args.multiple.first);
            fprintf(stderr, ") or else ");
            // Nested choices read as a flat list, so skip the parentheses
            if (op->args.multiple.second->op != VM_OTHERWISE)
                fprintf(stderr, "(");
            print_pattern(op->args.multiple.second);
            if (op->args.multiple.second->op != VM_OTHERWISE)
                fprintf(stderr, ")");
            break;
        }
        case VM_CHAIN: {
            fprintf(stderr, "(");
            print_pattern(op->args.multiple.first);
            fprintf(stderr, ") then ");
            // Nested chains read as a flat list, so skip the parentheses
            if (op->args.multiple.second->op != VM_CHAIN)
                fprintf(stderr, "(");
            print_pattern(op->args.multiple.second);
            if (op->args.multiple.second->op != VM_CHAIN)
                fprintf(stderr, ")");
            break;
        }
        case VM_REPLACE: {
            fprintf(stderr, "replace ");
            if (op->args.replace.replace_pat) {
                fprintf(stderr, "(");
                print_pattern(op->args.replace.replace_pat);
                fprintf(stderr, ")");
            } else
                fprintf(stderr, "\"\"");
            fprintf(stderr, " with \"%s\"", op->args.replace.replacement);
            break;
        }
        default: break;
    }
}
/*
 * Find a numbered capture in a match tree, searching depth-first.
 * *n is the number of captures still to be counted: 0 selects `m` itself
 * and 1 selects the next capture encountered.  *n is decremented for each
 * capture that is passed over.  Returns NULL if there is no such capture.
 */
static match_t *get_capture_n(match_t *m, int *n)
{
    if (m == NULL) return NULL;
    if (*n == 0) return m;
    if (m->is_capture) {
        if (*n == 1) return m;
        *n -= 1;
    }
    match_t *kid = m->child;
    while (kid != NULL) {
        match_t *found = get_capture_n(kid, n);
        if (found != NULL) return found;
        kid = kid->nextsibling;
    }
    return NULL;
}
/*
 * Find the first capture called `name` in a match tree, searching
 * depth-first and starting with `m` itself.
 * Returns NULL if no capture has that name.
 */
static match_t *get_capture_named(match_t *m, const char *name)
{
    int has_name = m->is_capture && m->name_or_replacement;
    if (has_name && streq(m->name_or_replacement, name))
        return m;
    match_t *kid = m->child;
    while (kid != NULL) {
        match_t *found = get_capture_named(kid, name);
        if (found != NULL) return found;
        kid = kid->nextsibling;
    }
    return NULL;
}
/*
 * Print a match to stdout with replacements and ANSI color highlighting.
 * A replacement match prints its replacement text, expanding backslash
 * escapes and substituting "@N" / "@[name]" with the corresponding
 * capture.  Any other match prints the matched text, recursing into its
 * children.  `color` is the escape sequence used for text that belongs
 * to this match; when `verbose` is set, matches of rules whose names
 * start with an uppercase letter are wrapped in "{Name:...}".
 */
void print_match(match_t *m, const char *color, int verbose)
{
    if (m->is_replacement) {
        printf("\033[0;34m");
        for (const char *r = m->name_or_replacement; *r; ) {
            if (*r == '\\') {
                if (r[1] == '\0') {
                    // Lone trailing backslash: nothing to unescape
                    fputc('\\', stdout);
                    ++r;
                    continue;
                }
                // unescapechar() expects the character after the backslash
                fputc(unescapechar(r + 1, &r), stdout);
                continue;
            } else if (*r != '@') {
                fputc(*r, stdout);
                ++r;
                continue;
            }

            ++r;
            match_t *cap = NULL;
            switch (*r) {
                case '0': case '1': case '2': case '3': case '4':
                case '5': case '6': case '7': case '8': case '9': {
                    int n = (int)strtol(r, (char**)&r, 10);
                    cap = get_capture_n(m->child, &n);
                    break;
                }
                case '[': {
                    char *closing = strchr(r+1, ']');
                    if (!closing) {
                        // Unterminated "@[": print the '@' literally
                        fputc('@', stdout);
                        break;
                    }
                    ++r;
                    char *name = strndup(r, (size_t)(closing-r));
                    cap = get_capture_named(m, name);
                    free(name);
                    r = closing + 1;
                    break;
                }
                default: {
                    // A '@' that introduces no capture is printed as-is
                    fputc('@', stdout);
                    break;
                }
            }
            if (cap != NULL) {
                print_match(cap, "\033[0;35m", verbose);
                printf("\033[0;34m");
            }
        }
    } else {
        const char *name = m->name_or_replacement;
        // <ctype.h> functions need an unsigned char value
        int show_name = verbose && m->is_ref && name && isupper((unsigned char)name[0]);
        if (show_name)
            printf("\033[0;2;35m{%s:", name);
        // Print the text between children in this match's color, and let
        // each child print its own span.
        const char *prev = m->start;
        for (match_t *child = m->child; child; child = child->nextsibling) {
            if (child->start > prev)
                printf("%s%.*s", color, (int)(child->start - prev), prev);
            print_match(child, m->is_capture ? "\033[0;1m" : color, verbose);
            prev = child->end;
        }
        if (m->end > prev)
            printf("%s%.*s", color, (int)(m->end - prev), prev);
        if (show_name)
            printf("\033[0;2;35m}");
    }
}

18
vm.h Normal file
View File

@ -0,0 +1,18 @@
/*
* vm.h - Header file for BPEG virtual machine.
*/
#ifndef VM__H
#define VM__H
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include "types.h"
match_t *match(grammar_t *g, const char *str, vm_op_t *op);
void destroy_match(match_t **m);
void print_pattern(vm_op_t *op);
void print_match(match_t *m, const char *color, int verbose);
#endif