Spruced up a bunch of stuff, tweaked the grammar, added docs

This commit is contained in:
Bruce Hill 2020-09-13 23:31:38 -07:00
parent 1570dd55e8
commit 4135115229
8 changed files with 175 additions and 95 deletions

162
bpeg.1
View File

@ -7,43 +7,164 @@ bpeg \- Bruce's Parsing Expression Grammar tool
.B bpeg
[\fI-h\fR|\fI--help\fR]
[\fI-v\fR|\fI--verbose\fR]
[\fI-p\fR|\fI--pattern\fR \fI<pattern>\fR]
[\fI-P\fR|\fI--pattern-string\fR \fI<string-pattern>\fR]
[\fI-d\fR|\fI--define\fR \fI<name>\fR=\fI<pattern>\fR]
[\fI-D\fR|\fI--define-string\fR \fI<name>\fR=\fI<string-pattern>\fR]
[\fI-r\fR|\fI--replace\fR \fI<replacement>\fR]
[\fI-g\fR|\fI--grammar\fR \fI<grammar file>\fR]
[\fI-m\fR|\fI--mode\fR \fI<mode>\fR]
\fI<pattern>\fR
[[--] \fI<input file>\fR]
[[--] \fI<input files...>\fR]
.SH DESCRIPTION
\fBbpeg\fR is a tool that matches parsing expression grammars using a custom syntax.
.SH OPTIONS
.B \--verbose
.B \-v\fR, \fB--verbose
Print debugging information.
.B \--define <name>=<pattern>
Define a grammar rule.
.B \-d\fR, \fB--define \fI<name>\fR=\fI<pattern>\fR
Define a grammar rule using a bpeg pattern.
.B \--replace <replacement>
.B \-D\fR, \fB--define-string \fI<name>\fR=\fI<string-pattern>\fR
Define a grammar rule using a bpeg string pattern.
.B \-r\fR, \fB--replace \fI<replacement>\fR
Replace all occurrences of the main pattern with the given string.
.B \--grammar <grammar file>
.B \-g\fR, \fB--grammar \fI<grammar file>\fR
Load the grammar from the given file.
.B \-m\fR, \fB--mode \fI<mode>\fR
The mode to operate in. Options are: \fIfind-all\fR (the default),
\fIonly-matches\fR, \fIpattern\fR, \fIreplacement\fR, \fIreplace-all\fR
(implied by \fB--replace\fR), or any other grammar rule name.
.B \--help
Print the usage and exit.
.B <pattern>
The main pattern for bpeg to match. By default, this pattern
is in "string literal" mode (i.e. a backslash is required for
non-literal patterns). The default mode is to find \fBall\fR
occurrences of the pattern and highlight them.
.B <string-pattern>
The main pattern for bpeg to match. By default, this pattern is a string
pattern (see the \fBSTRING PATTERNS\fR section below).
.B <input file>
The input file to search (default: stdin).
.B <input files...>
The input files to search. If no input files are provided and data was
piped in, that data will be used instead. If neither is provided,
\fBbpeg\fR will search through all files in the current directory and
its subdirectories (recursively).
.SH PATTERNS
Bpeg patterns are based off of a combination of Parsing Expression Grammars
and regular expression syntax. The syntax is designed to map closely to
verbal descriptions of the patterns, and prefix operators are preferred over
the suffix operators that are common in regex syntax.
Some patterns additionally have "multi-line" variants, which means that they
include the newline character.
.I <pat1> <pat2>
A chain of patterns, pronounced \fI<pat1>\fB-then-\fI<pat2>\fR
.I <pat1> \fB/\fI <pat2>\fR
A series of ordered choices (if one pattern matches, the following patterns
will not be attempted), pronounced \fI<pat1>\fB-or-\fI<pat2>\fR
.B ..
Any text \fBup-to\fR the following pattern, if any (multiline: \fB...\fR)
.B .
\fBAny\fR character (multiline: \fB$.\fR)
.B ^
\fBStart-of-a-line\fR
.B ^^
\fBStart-of-the-text\fR
.B $
\fBEnd-of-a-line\fR (does not include newline character)
.B $$
\fBEnd-of-the-text\fR
.B _
Zero or more \fBwhitespace\fR characters (specifically, spaces and tabs)
.B __
Zero or more \fBwhitespace-or-newline\fR characters
.B `\fI<c>\fR
The literal \fBcharacter-\fI<c>\fR
.B `\fI<c1>\fB-\fI<c2>\fR
The \fBcharacter-range-\fI<c1>\fB-to-\fI<c2>\fR
.B \\\fI<esc>\fR
The \fBescape-sequence-\fI<esc>\fR (\fB\\n\fR, \fB\\x1F\fR, \fB\\033\fR, etc.)
.B \\\fI<esc1>\fB-\fI<esc2>\fR
The \fBescape-sequence-range-\fI<esc1>\fB-to-\fI<esc2>\fR
.B !\fI<pat>\fR
\fBNot-\fI<pat>\fR
.B \fI<N> <pat>\fR
.B \fI<MIN>\fB-\fI<MAX> <pat>\fR
.B \fI<MIN>\fB+ \fI<pat>\fR
.B \fI<MAX>\fB- \fI<pat>\fR
\fI<MIN>\fB-to-\fI<MAX>\fB-\fI<pat>\fBs\fR (repetitions of a pattern)
.B *\fI<pat>\fR
\fBAny-\fI<pat>\fBs\fR (zero or more)
.B +\fI<pat>\fR
\fBSome-\fI<pat>\fBs\fR (one or more)
.B \fI<repeating-pat>\fR \fB%\fI <sep>\fR
\fI<repeating-pat>\fB-separated-by-\fI<sep>\fR (equivalent to \fI<pat>
\fB*(\fI<sep><pat>\fB)\fR)
.B <\fI<pat>\fR
\fBJust-after-\fI<pat>\fR (lookbehind)
.B >\fI<pat>\fR
\fBJust-before-\fI<pat>\fR (lookahead)
.B @\fI<pat>\fR
\fBCapture-\fI<pat>\fR
.B @[\fI<name>\fB]\fI<pat>\fR
\fBLet-\fI<name>\fB-equal-\fI<pat>\fR (named capture)
.B {\fI<pat>\fB => "\fI<replacement>\fB"}
\fBReplace-\fI<pat>\fB-with-\fI<replacement>\fR. Note: \fI<replacement>\fR should
be a string, and it may contain references to captured values: \fB@0\fR
(the whole of \fI<pat>\fR), \fB@1\fR (the first capture in \fI<pat>\fR),
\fB@[\fIfoo\fB]\fR (the capture named \fIfoo\fR in \fI<pat>\fR), etc.
.B \fI<pat1>\fB == \fI<pat2>\fR
Will match only if \fI<pat1>\fR and \fI<pat2>\fR both match and have the exact
same length. Pronounced \fI<pat1>\fB-assuming-it-equals-\fI<pat2>\fR
.B (/)
The empty string (a pattern that always matches).
.B # \fI<comment>\fR
A comment
.SH STRING PATTERNS
One of the most common use cases for pattern matching tools is matching plain,
literal strings, or strings that are primarily plain strings, with one or two
patterns. \fBbpeg\fR is designed around this fact. The default mode for bpeg
patterns is "string pattern mode". In string pattern mode, all characters
are interpreted literally except for the backslash (\fB\\\fR), which may be
followed by a bpeg pattern (see the \fBPATTERNS\fR section above). Optionally,
the bpeg pattern may be terminated by a semicolon (\fB;\fR).
.SH EXAMPLES
.TP
.B
ls | bpeg foo
Find files containing the string "foo"
Find files containing the string "foo" (a string pattern)
.TP
.B
@ -52,9 +173,16 @@ Find files ending with ".c" and replace the extension with ".h"
.TP
.B
bpeg -g grammar.bpeg '\\myThing' my_file.txt
Find occurrences of the grammar rule "myThing" in the file \fBmy_file.txt\fR
using the grammar rules defined in \fBgrammar.bpeg\fR
bpeg -p '"foobar"==id parens' my_file.py
Find the literal string \fB"foobar"\fR, assuming it's a complete identifier,
followed by a pair of matching parentheses in the file \fImy_file.py\fR
.TP
.B
bpeg -g html -p html-element -D matching-tag=a foo.html
Using the \fIhtml\fR grammar, find all \fIhtml-element\fRs matching
the tag \fIa\fR in the file \fIfoo.html\fR
.SH AUTHOR
Bruce Hill (bruce@bruce-hill.com)

View File

@ -16,8 +16,8 @@ Eq-pat = @[first]simple-pat "==" @[second]pat;
Empty = `/ >(__ (`)/`}));
Dot = `. !`.;
String = (
`" @[s]*(Escape / ~`") `"
/ `' @[s]*(Escape / ~`') `'
`" @[s]*(Escape / !`"$.) `"
/ `' @[s]*(Escape / !`'$.) `'
);
Char-range = `` @[low]. `- @[high].;
Char = `` @[s].;
@ -29,7 +29,6 @@ escape-sequence = (
/`a/`b/`e/`n/`r/`t/`v / . / \n
);
No = `! _ @pat;
Anything-but = `~ ?`~ _ @pat;
Upto = 2-3`. ?>(_@pat);
Repeat = (
@[min]int _ `- _ @[max]int

46
bpeg.c
View File

@ -1,36 +1,7 @@
/*
* bpeg.c - Source code for the bpeg parser
*
* Grammar:
* # <comment> comment
* .. any text up to the following pattern (if any); (multiline: ...)
* . any character (multiline: $.)
* ^ beginning of a line (^^: beginning of file)
* $ end of a line ($$: end of file)
* _ 0 or more spaces or tabs (__: include newlines and comments)
* `<c> character <c>
* `<a>-<z> character between <a> and <z>
* \<e> escape sequence (e.g. \n, \033)
* \<e1>-<e2> escape sequence range (e.g. \x00-\xF0)
* ! <pat> no <pat>
* ~ <pat> any character as long as it doesn't match <pat> (multiline: ~~<pat>)
* <N=1> + <pat> [% <sep="">] <N> or more <pat>s (separated by <sep>)
* * <pat> [% <sep="">] sugar for "0+ <pat> [% <sep>]"
* <N=1> - <pat> [% <sep="">] <N> or fewer <pat>s (separated by <sep>)
* ? <pat> sugar for "1- <pat>"
* <N> - <M> <pat> <N> to <M> (inclusive) <pat>s
* < <pat> after <pat>, ...
* > <pat> before <pat>, ...
* ( <pat> ) <pat>
* @ <pat> capture <pat>
* @ [ <name> ] <pat> <pat> named <name>
* { <pat> => <str> } <pat> replaced with <str>
* <pat1> == <pat2> <pat1> iff <pat2> matches at the same spot for the same length
* "@1" or "@[1]" first capture
* "@foo" or "@[foo]" capture named "foo"
* <pat1> <pat2> <pat1> followed by <pat2>
* <pat> / <alt> <pat> otherwise <alt>
* ; <name> = <pat> <name> is defined to be <pat>
* See `man ./bpeg.1` for more details
*/
#include <fcntl.h>
#include <glob.h>
@ -53,8 +24,8 @@ static const char *usage = (
" -v --verbose\t print verbose debugging info\n"
" -d --define <name>=<def>\t define a grammar rule\n"
" -D --define-string <name>=<def>\t define a grammar rule (string-pattern)\n"
" -e --escaped <pat>\t provide an escaped pattern (equivalent to bpeg '\\(<pat>)')\n"
" -s --string <pat>\t provide a string pattern (equivalent to bpeg '<pat>', but may be useful if '<pat>' begins with a '-')\n"
" -p --pattern <pat>\t provide a pattern (equivalent to bpeg '\\(<pat>)')\n"
" -P --pattern-string <pat>\t provide a string pattern (equivalent to bpeg '<pat>', but may be useful if '<pat>' begins with a '-')\n"
" -r --replace <replacement> replace the input pattern with the given replacement\n"
" -m --mode <mode>\t set the behavior mode (defult: find-all)\n"
" -g --grammar <grammar file> use the specified file as a grammar\n");
@ -104,14 +75,15 @@ int main(int argc, char *argv[])
{
int verbose = 0;
char *flag = NULL;
char path[PATH_MAX] = {0};
const char *rule = "find-all";
grammar_t *g = new_grammar();
// Load builtins:
int fd;
if ((fd=open("/etc/xdg/bpeg/builtins.bpeg", O_RDONLY)) >= 0)
load_grammar(g, readfile(fd)); // Keep in memory for debugging output
char path[PATH_MAX] = {0};
sprintf(path, "%s/.config/bpeg/builtins.bpeg", getenv("HOME"));
if ((fd=open(path, O_RDONLY)) >= 0)
load_grammar(g, readfile(fd)); // Keep in memory for debugging output
@ -121,10 +93,10 @@ int main(int argc, char *argv[])
if (streq(argv[i], "--")) {
++i;
break;
} else if (FLAG("--help") || FLAG("-h")) {
} else if (streq(argv[i], "--help") || streq(argv[i], "-h")) {
printf("%s\n", usage);
return 0;
} else if (FLAG("--verbose") || FLAG("-v")) {
} else if (streq(argv[i], "--verbose") || streq(argv[i], "-v")) {
verbose = 1;
} else if (FLAG("--replace") || FLAG("-r")) {
vm_op_t *p = bpeg_replacement(bpeg_pattern("pattern"), flag);
@ -166,13 +138,13 @@ int main(int argc, char *argv[])
vm_op_t *pat = bpeg_stringpattern(src);
check(pat, "Failed to compile pattern");
add_def(g, src, def, pat);
} else if (FLAG("--escaped") || FLAG("-e")) {
} else if (FLAG("--pattern") || FLAG("-p")) {
check(npatterns == 0, "Cannot define multiple patterns");
vm_op_t *p = bpeg_pattern(flag);
check(p, "Pattern failed to compile: '%s'", flag);
add_def(g, flag, "pattern", p);
++npatterns;
} else if (FLAG("--string") || FLAG("-s")) {
} else if (FLAG("--pattern-string") || FLAG("-P")) {
vm_op_t *p = bpeg_stringpattern(flag);
check(p, "Pattern failed to compile");
add_def(g, flag, "pattern", p);

View File

@ -184,17 +184,6 @@ vm_op_t *bpeg_simplepattern(const char *str)
op->args.pat = p;
break;
}
// Anything but <pat>
case '~': {
if (matchchar(&str, '~')) op->multiline = 1;
vm_op_t *p = bpeg_simplepattern(str);
check(p, "Expected pattern after '~'\n");
str = p->end;
op->op = VM_ANYTHING_BUT;
op->len = -1;
op->args.pat = p;
break;
}
// Number of repetitions: <N>(-<N> / - / + / "")
case '0': case '1': case '2': case '3': case '4': case '5':
case '6': case '7': case '8': case '9': {

View File

@ -1,6 +1,6 @@
# Meta-rules for acting on everything
pattern = !(/); # Not defined by default
replacement = {!(/)=>}; # Not defined by default
replacement = !(/); # Not defined by default
replace-all = +(...@replacement) ...;
find-all = {... >matching-line =>} +(matching-line/non-matching-line) ?{!<\n => "\n"};
only-matches = +{...@pattern=>'@1\n'};
@ -10,10 +10,10 @@ non-matching-line = {..$=>};
# Helper definitions (commonly used)
crlf = \r\n;
cr = \r; r = \r;
anglebraces = `< *(anglebraces / ~~`>) `>;
brackets = `[ *(brackets / ~~`]) `];
braces = `{ *(braces / ~~`}) `};
parens = `( *(parens / ~~`)) `);
anglebraces = `< *(anglebraces / !`>.) `>;
brackets = `[ *(brackets / !`].) `];
braces = `{ *(braces / !`}.) `};
parens = `( *(parens / !`).) `);
id = !<(`a-z/`A-Z/`_/`0-9) (`a-z/`A-Z/`_) *(`a-z/`A-Z/`_/`0-9);
word = !<(`a-z/`A-Z/`_/`0-9) +(`a-z/`A-Z) !>(`0-9/`_);
HEX = `0-9/`A-F;

View File

@ -11,17 +11,17 @@ html-element = (
void-element = `< @[tag](id==match-tag) __attributes__ `/? __ `>;
template-element = `< @[tag](id==match-tag) __`> __ >match-body @[body]0+(~~`< / comment / html-element / ~~("</"tag__`>)) ("</"tag__`>);
template-element = `< @[tag](id==match-tag) __`> __ >match-body @[body]0+(!`<$. / comment / html-element / !("</"tag__`>)$.) ("</"tag__`>);
raw-element = `< @[tag](id==match-tag) __attributes__ `> >match-body @[body]*~~("</"tag__`>) ("</"tag__`>);
raw-element = `< @[tag](id==match-tag) __attributes__ `> >match-body @[body].. ("</"tag__`>);
normal-element = `< @[tag](id==match-tag) __attributes__ `> >match-body @[body]*(~~`< / comment / html-element / ~~("</"tag__`>)) "</"tag__`>;
normal-element = `< @[tag](id==match-tag) __attributes__ `> >match-body @[body]*(!`<$. / comment / html-element / !("</"tag__`>)$.) "</"tag__`>;
comment = "<!--" &&"-->";
attributes = *(!(attribute==match-attribute))%__ __(attribute==match-attribute)__ *attribute%__;
attribute = (+id%`:)__`=__ (id / `" &`" / `' &`');
attribute = (+id%`:)__`=__ (id / `" *~`" `" / `' *~`' `');
attribute = (+id%`:)__`=__ (id / `" .. `" / `' .. `');
match-attribute = attribute;
match-tag = id;
match-body = (/);

View File

@ -12,7 +12,6 @@
enum VMOpcode {
VM_EMPTY = 0,
VM_ANYCHAR = 1,
VM_ANYTHING_BUT,
VM_STRING,
VM_RANGE,
VM_NOT,
@ -34,7 +33,7 @@ enum VMOpcode {
*/
typedef struct vm_op_s {
enum VMOpcode op;
unsigned int multiline:1;
unsigned int multiline:1, negate:1;
const char *start, *end;
// Length of the match, if constant, otherwise -1
ssize_t len;

25
vm.c
View File

@ -16,7 +16,6 @@ static match_t *get_capture_named(match_t *m, const char *name);
static const char *opcode_names[] = {
[VM_EMPTY] = "EMPTY",
[VM_ANYCHAR] = "ANYCHAR",
[VM_ANYTHING_BUT] = "ANYTHING_BUT",
[VM_STRING] = "STRING",
[VM_RANGE] = "RANGE",
[VM_NOT] = "NOT",
@ -114,10 +113,7 @@ static match_t *_match(grammar_t *g, const char *str, vm_op_t *op, recursive_ref
m->end = str + 1;
return m;
}
case VM_NOT: case VM_ANYTHING_BUT: {
if (op->op == VM_ANYTHING_BUT)
if (!*str || (!op->multiline && *str == '\n'))
return NULL;
case VM_NOT: {
match_t *m = _match(g, str, op->args.pat, rec);
if (m != NULL) {
destroy_match(&m);
@ -126,7 +122,6 @@ static match_t *_match(grammar_t *g, const char *str, vm_op_t *op, recursive_ref
m = calloc(sizeof(match_t), 1);
m->op = op;
m->start = str;
if (op->op == VM_ANYTHING_BUT) ++str;
m->end = str;
return m;
}
@ -160,13 +155,11 @@ static match_t *_match(grammar_t *g, const char *str, vm_op_t *op, recursive_ref
m->start = str;
m->end = str;
m->op = op;
if (op->args.repetitions.max == 0) return m;
match_t **dest = &m->child;
const char *prev = str;
size_t reps;
for (reps = 0; reps < (size_t)op->args.repetitions.max; ++reps) {
size_t reps = 0;
for (;;) {
// Separator
match_t *sep = NULL;
if (op->args.repetitions.sep != NULL && reps > 0) {
@ -188,6 +181,12 @@ static match_t *_match(grammar_t *g, const char *str, vm_op_t *op, recursive_ref
dest = &p->nextsibling;
str = p->end;
prev = str;
++reps;
if (op->args.repetitions.max != -1 && reps > (size_t)op->args.repetitions.max) {
destroy_match(&m);
return NULL;
}
}
if ((ssize_t)reps < op->args.repetitions.min) {
@ -386,12 +385,6 @@ void print_pattern(vm_op_t *op)
fprintf(stderr, ")");
break;
}
case VM_ANYTHING_BUT: {
fprintf(stderr, "anything but (");
print_pattern(op->args.pat);
fprintf(stderr, ")");
break;
}
case VM_AFTER: {
fprintf(stderr, "after (");
print_pattern(op->args.pat);