Some bug fixes, syntax tweaks, added ^^/$$/__, and bpeg meta grammar.

This commit is contained in:
Bruce Hill 2020-09-10 02:02:40 -07:00
parent 384fb6293a
commit 8126489f81
2 changed files with 119 additions and 25 deletions

59
bpeg.bpeg Normal file
View File

@ -0,0 +1,59 @@
# This is a file defining the BPEG grammar using BPEG syntax
# The file has the shape described by the `grammar` rule below: a main pattern
# (here, a reference to `grammar`) followed by `;`-separated definitions.
grammar;
# A grammar is a main pattern (captured as "mainPattern"), then zero or more
# definitions separated by `;`, then an optional trailing `;`. Whitespace
# matched by `__` is allowed around each part.
grammar = __ @[mainPattern]extendedPat __ (*def % (__`;__)) ?(__ `;) __;
# A definition is a name, an `=`, and the pattern that the name stands for.
def = @[name]ref __ `= __ @[definition]extendedPat;
# This is used for command line arguments:
# zero or more of either a backslash followed by a pattern and an optional `;`,
# or any single character.
stringGrammar = *(`\ pat ?`; / .);
# A single pattern: exactly one of the forms below. `/` means "otherwise", so
# the alternatives are listed in the order they are attempted.
pat = empty / dot / string / charRange / char / escape / no / anythingBut
/ uptoAnd / repeat / after / before / capture / replace / ref / parens;
# The empty pattern: a `/` with nothing after it except optional whitespace and
# a closing `)` or `}`, as written in `(/)` or `{/}`. The closing character is
# only checked with the `>` lookahead; it is not consumed here.
empty = `/ >(__ (`)/`}));
# The dot pattern is written as a literal `.` character.
dot = `.;
# A string literal is delimited by double or single quotes. Its contents
# (captured as "s") are a run of escape sequences or characters that are not
# the closing quote.
string = (
`" @[s]*(escape / ~`") `"
/ `' @[s]*(escape / ~`') `'
);
# A character range is a backtick, a low character, a `-`, and a high character.
# It is listed before `char` in `pat`, so it is attempted first.
charRange = `` @[low]. `- @[high].;
# A single character literal is a backtick followed by any one character.
char = `` @[s].;
# An escape sequence is a backslash followed by one of (captured as "s"):
#   - one to three octal digits,
#   - `x` and two hexadecimal digits,
#   - one of the letters a, b, e, n, r, t, v,
#   - any other single character, or a newline.
escape = `\ @[s](
1-3 `0-7
/ `x 2 (`0-9/`a-f/`A-F)
/`a/`b/`e/`n/`r/`t/`v / . / \n
);
# `!` followed by a pattern: the pattern must not match here.
no = `! _ @pat;
# `~` followed by a pattern: any character, as long as the pattern does not match.
anythingBut = `~ _ @pat;
# `&` followed by a pattern: everything up to and including the pattern.
uptoAnd = `& _ @pat;
# A repetition: a count specifier, then the repeated pattern (captured as
# "repeatPat"), then an optional `%` followed by a separator pattern (captured
# as "sep"). The count specifier always yields a "min" and a "max" capture and
# is one of, in the order attempted:
#   - an integer, `-`, and an integer: both bounds given explicitly
#   - an integer followed by `-`: min is replaced with "0", max is the integer
#   - an integer followed by `+`: min is the integer, max is the empty pattern
#   - a bare integer: the same integer is captured as both min and max
#   - `+`: min is replaced with "1", max is the empty pattern
#   - `*`: min is replaced with "0", max is the empty pattern
#   - `?`: min is replaced with "0", max is replaced with "1"
# Default bounds use the pattern-less replacement form `{=>"..."}`, matching the
# `=>` arrow required by the `replace` rule in this grammar.
# NOTE(review): `int` is referenced here but not defined in this file; confirm
# it is supplied elsewhere (e.g. as a built-in definition).
repeat = (
@[min]int _ `- _ @[max]int
/ @[min]{=>"0"} @[max]int _ `-
/ @[min]int _ `+ @[max](/)
/ @[min] @[max]int
/ `+ @[min]{=>"1"} @[max](/)
/ `* @[min]{=>"0"} @[max](/)
/ `? @[min]{=>"0"} @[max]{=>"1"}
) _ @[repeatPat]pat ?( __ `% __ @[sep]pat);
# `<` followed by a pattern: "after <pat>".
after = `< _ pat;
# `>` followed by a pattern: "before <pat>".
before = `> _ pat;
# `@` followed by an optional bracketed name (captured as "captureName") and
# the pattern to capture.
capture = `@ ?(_ `[ @[captureName]ref `]) _ @[capture]pat;
# A replacement is written in braces: an optional pattern (captured as
# "replacePat"), the arrow "=>", and an optional replacement string (captured
# as "replacement").
replace = `{ __ (
?(@[replacePat]extendedPat __) "=>" ?(__ @[replacement]string)
) __ `};
# A reference to a named rule. The name is either one of the special names
# ^^, ^, __, _, $$, $ (two-character forms are listed before their
# one-character prefixes) or a letter followed by any number of letters or digits.
ref = @[name](
"^^" / "^" / "__" / "_" / "$$" / "$" /
(`a-z/`A-Z) *(`a-z/`A-Z/`0-9));
# A parenthesized pattern.
parens = `( __ extendedPat __ `);
# A chain is one or more patterns in sequence, separated by whitespace.
chain = +@pat % (__);
# A choice is one or more chains or patterns separated by `/` ("otherwise").
otherwise = +@(chain/pat) % (__`/__);
# Any pattern, from the most compound form (choice) down to a single pattern.
extendedPat = otherwise / chain / pat;
# `_` is zero or more spaces or tabs.
_ = *(` / \t);
# `__` is zero or more spaces, tabs, carriage returns, newlines, or comments.
__ = *(` / \t / \r / \n / comment);
# A hash comment is a `#` followed by zero or more of any character.
hashComment = `# *.;
# Note: comments are undefined by default in regular BPEG
# so this grammar defines `comment` itself, as a hash comment.
comment = hashComment;

85
bpeg.c
View File

@ -1,5 +1,5 @@
/*
* bpeg.h - Source code for the bpeg parser
* bpeg.c - Source code for the bpeg parser
*
* Grammar:
* # <comment> comment
@ -9,10 +9,7 @@
* _ 0 or more spaces or tabs
* `<c> character <c>
* `<a>-<z> character between <a> and <z>
* `<a>,<b>,... character <a> or <b> (WIP)
* \<e> escape sequence (e.g. \n, \033)
* \<e1>-<e2> escape sequence range (WIP)
* \<e1>,<e2>,... one of multiple escape sequences (WIP)
* ! <pat> no <pat>
* ~ <pat> any character as long as it doesn't match <pat>
* & <pat> upto and including <pat> (aka *~<pat> <pat>)
@ -23,14 +20,15 @@
* <N> - <M> <pat> <N> to <M> (inclusive) <pat>s
* < <pat> after <pat>, ...
* > <pat> before <pat>, ...
* <pat> / <alt> <pat> otherwise <alt>
* ( <pat> ) <pat>
* @ <pat> capture <pat>
* @ [ <name> ] <pat> <pat> named <name>
* { <pat> => <str> } <pat> replaced with <str>
* "@1" or "@[1]" first capture
* "@foo" or "@[foo]" capture named "foo"
* <pat1> <pat2> <pat1> followed by <pat2>
* <pat> / <alt> <pat> otherwise <alt>
* ; <name> = <pat> <name> is defined to be <pat>
* { <pat> -> <str> } <pat> replaced with <str>
* "@1" or "@{1}" first capture
* "@foo" or "@{foo}" capture named "foo"
*/
#include "bpeg.h"
@ -103,9 +101,11 @@ static match_t *match(const char *str, vm_op_t *op)
case VM_UPTO_AND: {
match_t *m = calloc(sizeof(match_t), 1);
m->start = str;
while (*str && (multiline_dot || (*str != '\n' && *str != '\r'))) {
while (*str) {
match_t *p = match(str, op->args.pat);
if (p == NULL) {
if (!multiline_dot && (*str == '\n' || *str == '\r'))
break;
++str;
} else {
m->end = p->end;
@ -135,7 +135,7 @@ static match_t *match(const char *str, vm_op_t *op)
str = sep->end;
}
match_t *p = match(str, op->args.repetitions.repeat_pat);
if (p == NULL || p->end == prev) { // Prevent infinite loops
if (p == NULL || (p->end == prev && reps > 0)) { // Prevent infinite loops
if (sep) sep = free_match(sep);
if (p) p = free_match(p);
break;
@ -612,16 +612,17 @@ static vm_op_t *compile_bpeg(const char *source, const char *str)
visualize(source, str, "Replacement");
str = after_spaces(str);
vm_op_t *pat = NULL;
if (strncmp(str, "->", 2) == 0) {
str += strlen("->");
if (strncmp(str, "=>", 2) == 0) {
str += strlen("=>");
} else {
pat = compile_bpeg(source, str);
check(pat, "Invalid pattern after '{'");
pat = expand_choices(source, pat);
str = pat->end;
str = after_spaces(str);
check(matchchar(&str, '-') && matchchar(&str, '>'),
"Expected '->' after pattern in replacement");
printf("at: '%s'\n", str);
check(matchchar(&str, '=') && matchchar(&str, '>'),
"Expected '=>' after pattern in replacement");
}
visualize(source, str, NULL);
str = after_spaces(str);
@ -630,6 +631,7 @@ static vm_op_t *compile_bpeg(const char *source, const char *str)
const char *replacement;
if (matchchar(&str, '}')) {
replacement = strdup("");
visualize(source, str, NULL);
} else {
check(matchchar(&str, '"') || matchchar(&str, '\''),
"Expected string literal for replacement");
@ -643,9 +645,8 @@ static vm_op_t *compile_bpeg(const char *source, const char *str)
}
replacement = strndup(replacement, (size_t)(str-replacement));
check(matchchar(&str, quote), "Expected closing quote");
check(matchchar(&str, '}'), "Expected a closing '}'");
}
visualize(source, str, NULL);
check(matchchar(&str, '}'), "Expected a closing '}'");
op->op = VM_REPLACE;
op->args.replace.replace_pat = pat;
op->args.replace.replacement = replacement;
@ -657,9 +658,24 @@ static vm_op_t *compile_bpeg(const char *source, const char *str)
}
// Special rules:
case '_': case '^': case '$': {
visualize(source, str, NULL);
op->op = VM_REF;
op->args.s = strndup(&c, 1);
if (matchchar(&str, c)) { // double __, ^^, $$
char tmp[3] = {c, c, '\0'};
op->args.s = strdup(tmp);
} else
op->args.s = strndup(&c, 1);
visualize(source, str, op->args.s);
break;
}
// Empty choice (/) or {/}
case '/': {
str = after_spaces(str);
if (*str == ')' || *str == '}') {
op->op = VM_EMPTY;
} else {
free(op);
return NULL;
}
break;
}
default: {
@ -795,10 +811,19 @@ static void load_defs(void)
load_def("esc", "\\e"); load_def("e", "\\e");
load_def("tab", "\\t"); load_def("t", "\\t");
load_def("nl", "\\n"); load_def("lf", "\\n"); load_def("n", "\\n");
load_def("ws", "` /\\t/\\n/\\r");
load_def("_", "*(` /\\t/\\n/\\r)");
load_def("$", ">\\n / !. / >\\r\\n");
load_def("cBlockComment", "'/*' &'*/'");
load_def("cLineComment", "'//' &$");
load_def("cComment", "cLineComment / cBlockComment");
load_def("hashComment", "`# &$");
load_def("comment", "!(/)"); // undefined by default
load_def("WS", "` /\\t/\\n/\\r/comment");
load_def("ws", "` /\\t");
load_def("$$", "!(. / \\n)");
load_def("$", "!. / >\\n");
load_def("^^", "!<(. / \\n)");
load_def("^", "<\\n / !<.");
load_def("__", "*(` /\\t/\\n/\\r/comment)");
load_def("_", "*(` /\\t)");
}
static match_t *get_capture_n(match_t *m, int *n)
@ -905,7 +930,7 @@ static void print_grammar(vm_op_t *op)
{
switch (op->op) {
case VM_REF: fprintf(stderr, "a $%s", op->args.s); break;
case VM_EMPTY: fprintf(stderr, "empty"); break;
case VM_EMPTY: fprintf(stderr, "the empty string"); break;
case VM_ANYCHAR: fprintf(stderr, "any char"); break;
case VM_STRING: fprintf(stderr, "string \"%s\"", op->args.s); break;
case VM_RANGE: {
@ -1014,9 +1039,19 @@ static vm_op_t *load_grammar(const char *grammar)
if (verbose) fprintf(stderr, "\n");
defs = after_spaces(defs);
const char *name = defs;
check(isalpha(*name), "Definition must begin with a name");
while (isalpha(*defs)) ++defs;
name = strndup(name, (size_t)(defs-name));
if (strncmp(name, "^^", 2) == 0 ||
strncmp(name, "__", 2) == 0 ||
strncmp(name, "$$", 2) == 0) {
name = strndup(name, 2);
defs += 2;
} else if (*name == '^' || *name == '_' || *name == '$') {
name = strndup(name, 1);
defs += 1;
} else {
check(isalpha(*name), "Definition must begin with a name");
while (isalnum(*defs)) ++defs;
name = strndup(name, (size_t)(defs-name));
}
defs = after_spaces(defs);
check(matchchar(&defs, '='), "Expected '=' in definition");
vm_op_t *def = load_def(name, defs);