aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Lua/README.md44
-rw-r--r--Lua/lbp.c259
-rw-r--r--Lua/test.lua36
3 files changed, 236 insertions, 103 deletions
diff --git a/Lua/README.md b/Lua/README.md
index 25ccf5f..1288696 100644
--- a/Lua/README.md
+++ b/Lua/README.md
@@ -9,17 +9,30 @@ roughly equivalent in usefulness to LPEG, but with a smaller codebase (roughly
The Lua `bp` bindings provide the following methods:
```lua
-bp.match(text, pattern, [start_index]) --> match, match_start, match_length / nil
-bp.replace(text, pattern, replacement, [start_index]) --> text_with_replacements, num_replacements
+bp.match(pattern, text, [start_index]) --> match / nil
+bp.replace(pattern, replacement, text, [start_index]) --> text_with_replacements, num_replacements
+bp.compile(pattern) --> pattern_object
+for m in bp.eachmatch(pattern, text, [start_index]) do ... end
+
+pattern_object:match(text, [start_index]) --> match / nil
+pattern_object:replace(replacement, text, [start_index]) --> text_with_replacements, num_replacements
+for m in pattern_object:eachmatch(text, [start_index]) do ... end
```
Match objects returned by `bp.match()` are tables whose `__tostring` will
return the text of the match. Additionally, match objects store the text of the
-match at index `0`, and any captures stored as match objects with a key
-corresponding to the capture's identifier (e.g. `@"a" @foo="bc"` will be
-encoded as `{[0]="abc", [1]="a", foo={[0]="bc"}}`. If multiple captures within
-a match share the same identifier, it is unspecified which captured match will
-be stored at that key, so it's best to be unambiguous.
+match at index `0`, the match's starting index in the source string as
+`.start`, the first index after the match as `.after`, and any captures stored
+as match objects with a key corresponding to the capture's identifier (e.g.
+`@"a" @foo="bc"` will be encoded as `{[0]="abc", [1]="a", foo={[0]="bc"}}`). If
+multiple captures within a match share the same identifier, it is unspecified
+which captured match will be stored at that key, so it's best to be
+unambiguous.
+
+Pattern objects returned by `bp.compile()` are pre-compiled patterns that are
+slightly faster to reuse than just calling `bp.match()` repeatedly. They have a
+`.source` attribute that holds the original text used to compile them and have
+`:match()`, `:replace()`, and `:eachmatch()` methods as described above.
All methods will raise an error with a descriptive message if the given pattern
has a syntax error.
@@ -28,9 +41,20 @@ has a syntax error.
```lua
local bp = require("bp")
-local m, i, len = bp.match("like finding a needle in a haystack", '"n" @Es=+`e "dle"')
---> {[0]="needle", Es={[0]="ee"}}, 16, 6
+local m = bp.match('"n" @Es=+`e "dle"', "like finding a needle in a haystack")
+--> {[0]="needle", Es={[0]="ee", start=17, after=19}, start=16, after=22}
--> tostring(m) == "needle", tostring(m.Es) == "ee"
-local replaced, nreplacements = bp.match("like finding a needle in a haystack", '"n" +`e "dle"', "cat")
+local replaced, nreplacements = bp.replace('"n" +`e "dle"', "cat", "like finding a needle in a haystack")
--> "like finding a cat in a haystack", 1
+
+for word in bp.eachmatch("+`A-Z,a-z", "one, two three... four!") do
+ print(word) --> prints "one" "two" "three" "four"
+end
+
+local pat = bp.compile("word parens")
+for _,s in ipairs(my_strings) do
+ for fncall in pat:eachmatch(s) do
+ print(fncall)
+ end
+end
```
diff --git a/Lua/lbp.c b/Lua/lbp.c
index 592094b..050d0e8 100644
--- a/Lua/lbp.c
+++ b/Lua/lbp.c
@@ -1,8 +1,13 @@
/*
* lbp.c - bp library for lua
* API:
-* bp.match(str, pat[, start_index]) -> nil or (match_table, start_index, input_consumed)
-* bp.replace(str, pat, replacement, start_index) -> str with replacements, num_replacements
+* bp.match(pat, str, [start_index]) -> nil or match_table
+* bp.replace(pat, replacement, str, [start_index]) -> str with replacements, num_replacements
+* for match_table in bp.eachmatch(pat, str, [start_index]) do ... end
+* bp.compile(pat) -> pattern object
+* pat:match(str, [start_index])
+* pat:replace(replacement, str, [start_index])
+* for match in pat:eachmatch(str, [start_index]) do ... end
*/
#include <fcntl.h>
@@ -26,11 +31,11 @@
static const char *builtins_source = (
#include "builtins.h"
);
-
-static int MATCH_METATABLE = 0;
-
+static int MATCH_METATABLE = 0, PAT_METATABLE = 0;
static def_t *builtins;
+static void push_match(lua_State *L, match_t *m, const char *start);
+
static inline void raise_parse_error(lua_State *L, maybe_pat_t m)
{
size_t err_len = (size_t)(m.value.error.end - m.value.error.start);
@@ -40,6 +45,26 @@ static inline void raise_parse_error(lua_State *L, maybe_pat_t m)
free(buf);
}
+/*
+ * bp.compile(pat) -> pattern object
+ * Parses the pattern source given as argument 1 and pushes a pattern object:
+ * a table carrying the PAT_METATABLE metatable, with the original text stored
+ * in `source` and the parsed pat_t* (as a light userdata) stored in `pattern`.
+ * Parse failures are reported through raise_parse_error().
+ * Lmatch, Lreplace and Leachmatch also call this directly and test for a
+ * return value of 1 to know that a pattern object is on top of the stack.
+ */
+static int Lcompile(lua_State *L)
+{
+    size_t patlen;
+    const char *pat_text = luaL_checklstring(L, 1, &patlen);
+    maybe_pat_t maybe_pat = bp_pattern(pat_text, pat_text + patlen);
+    if (!maybe_pat.success) {
+        raise_parse_error(L, maybe_pat);
+        return 0;
+    }
+    // Both entries are string-keyed, so reserve two hash slots and no array slots:
+    lua_createtable(L, 0, 2);
+    lua_pushlightuserdata(L, (void*)&PAT_METATABLE);
+    lua_gettable(L, LUA_REGISTRYINDEX);
+    lua_setmetatable(L, -2);
+    lua_pushvalue(L, 1);
+    lua_setfield(L, -2, "source");
+    lua_pushlightuserdata(L, maybe_pat.value.pat);
+    lua_setfield(L, -2, "pattern");
+    return 1;
+}
+
static void push_matchstring(lua_State *L, match_t *m)
{
char *buf = NULL;
@@ -51,79 +76,71 @@ static void push_matchstring(lua_State *L, match_t *m)
fclose(out);
}
-static void push_match(lua_State *L, match_t *m);
-
-static void set_capture_fields(lua_State *L, match_t *m, int *n)
+// Walk the match tree rooted at `m` and store every capture found in the table
+// on top of the Lua stack: named captures under their name, unnamed captures
+// at successive integer keys taken from the shared counter `*n`.
+// `start` is forwarded unchanged to push_match().
+static void set_capture_fields(lua_State *L, match_t *m, int *n, const char *start)
{
if (m->pat->type == BP_CAPTURE) {
if (m->pat->args.capture.namelen > 0) {
lua_pushlstring(L, m->pat->args.capture.name, m->pat->args.capture.namelen);
- push_match(L, m->children[0]);
+ push_match(L, m->children[0], start);
lua_settable(L, -3);
} else {
- push_match(L, m->children[0]);
+ push_match(L, m->children[0], start);
- lua_seti(L, -2, *(n++));
+ // Increment the caller's counter, not the local pointer; otherwise every
+ // unnamed capture is stored at the same key and overwrites the previous one:
+ lua_seti(L, -2, (*n)++);
}
} else if (m->children) {
for (int i = 0; m->children[i]; i++)
- set_capture_fields(L, m->children[i], n);
+ set_capture_fields(L, m->children[i], n, start);
}
}
-void push_match(lua_State *L, match_t *m)
+// Push a table describing match `m` onto the Lua stack. The table gets the
+// MATCH_METATABLE metatable, the matched text at index 0, one entry per capture
+// (see set_capture_fields), and two 1-based positions measured from `start`:
+// `start` (where the match begins) and `after` (the first index past it).
+static void push_match(lua_State *L, match_t *m, const char *start)
{
- lua_createtable(L, 1, 0);
+ lua_createtable(L, 1, 2);
lua_pushlightuserdata(L, (void*)&MATCH_METATABLE);
lua_gettable(L, LUA_REGISTRYINDEX);
lua_setmetatable(L, -2);
push_matchstring(L, m);
lua_seti(L, -2, 0);
+
int capture_num = 1;
for (int i = 0; m->children && m->children[i]; i++)
- set_capture_fields(L, m->children[i], &capture_num);
-}
+ set_capture_fields(L, m->children[i], &capture_num, start);
-static int Ltostring(lua_State *L)
-{
- lua_geti(L, 1, 0);
- return 1;
+ // Keep the offsets in lua_Integer: narrowing the pointer difference to int
+ // first would truncate positions in very large strings.
+ lua_pushinteger(L, 1 + (lua_Integer)(m->start - start));
+ lua_setfield(L, -2, "start");
+ lua_pushinteger(L, 1 + (lua_Integer)(m->end - start));
+ lua_setfield(L, -2, "after");
}
-static const luaL_Reg Rinstance_metamethods[] =
-{
- {"__tostring", Ltostring},
- {NULL, NULL}
-};
-
-static void recursive_free_pat(pat_t *pat)
+static void recursively_free_pat(pat_t *pat)
{
// Do a depth-first traversal, freeing everything along the way:
if (!pat) return;
switch (pat->type) {
case BP_DEFINITION:
- recursive_free_pat(pat->args.def.def);
- recursive_free_pat(pat->args.def.pat);
+ recursively_free_pat(pat->args.def.def);
+ recursively_free_pat(pat->args.def.pat);
break;
case BP_REPEAT:
- recursive_free_pat(pat->args.repetitions.sep);
- recursive_free_pat(pat->args.repetitions.repeat_pat);
+ recursively_free_pat(pat->args.repetitions.sep);
+ recursively_free_pat(pat->args.repetitions.repeat_pat);
break;
case BP_CHAIN: case BP_UPTO: case BP_UPTO_STRICT:
case BP_OTHERWISE: case BP_NOT_MATCH: case BP_MATCH:
- recursive_free_pat(pat->args.multiple.first);
- recursive_free_pat(pat->args.multiple.second);
+ recursively_free_pat(pat->args.multiple.first);
+ recursively_free_pat(pat->args.multiple.second);
break;
case BP_REPLACE:
- recursive_free_pat(pat->args.replace.pat);
+ recursively_free_pat(pat->args.replace.pat);
break;
case BP_CAPTURE:
- recursive_free_pat(pat->args.capture.capture_pat);
+ recursively_free_pat(pat->args.capture.capture_pat);
break;
case BP_NOT: case BP_AFTER: case BP_BEFORE:
- recursive_free_pat(pat->args.pat);
+ recursively_free_pat(pat->args.pat);
break;
case BP_LEFTRECURSION:
- recursive_free_pat(pat->args.leftrec.fallback);
+ recursively_free_pat(pat->args.leftrec.fallback);
break;
default: break;
}
@@ -132,52 +149,62 @@ static void recursive_free_pat(pat_t *pat)
+/*
+ * bp.match(pat, str, [start_index]) / pat:match(str, [start_index])
+ * Argument 1 is either a pattern object or pattern source text, which is
+ * compiled on the fly through Lcompile. Argument 3 may be an integer start
+ * index or a match table (this is what the eachmatch iterator passes); for a
+ * table, matching resumes at its `after` index, advanced by one when `start`
+ * and `after` are equal (an empty match) so that iteration makes progress.
+ * Returns a match table, or nothing when there is no match.
+ */
static int Lmatch(lua_State *L)
{
- size_t textlen, patlen;
- const char *text = luaL_checklstring(L, 1, &textlen);
- const char *pat_text = luaL_checklstring(L, 2, &patlen);
- lua_Integer index = luaL_optinteger(L, 3, 1);
- if (index > (lua_Integer)strlen(text)+1)
- return 0;
+ if (lua_isstring(L, 1)) {
+ if (Lcompile(L) != 1)
+ return 0;
+ lua_replace(L, 1);
+ }
+ lua_getfield(L, 1, "pattern");
+ pat_t *pat = lua_touserdata(L, -1);
+ lua_pop(L, 1);
+ if (!pat) luaL_error(L, "Not a valid pattern");
- maybe_pat_t maybe_pat = bp_pattern(pat_text, pat_text + patlen);
- if (!maybe_pat.success) {
- raise_parse_error(L, maybe_pat);
- return 0;
+ size_t textlen;
+ const char *text = luaL_checklstring(L, 2, &textlen);
+ lua_Integer index;
+ if (lua_istable(L, 3)) {
+ lua_getfield(L, 3, "start");
+ lua_getfield(L, 3, "after");
+ index = luaL_optinteger(L, -1, 1);
+ if (lua_rawequal(L, -1, -2))
+ ++index;
+ } else {
+ index = luaL_optinteger(L, 3, 1);
}
+ // A start index below 1 would make text+index-1 point before the string:
+ if (index < 1)
+ index = 1;
+ // Compare against the length reported by Lua; strlen() stops at the first
+ // embedded NUL and would reject valid positions in binary strings:
+ if (index > (lua_Integer)textlen+1)
+ return 0;
match_t *m = NULL;
int ret = 0;
- if (next_match(&m, builtins, text+index-1, &text[textlen], maybe_pat.value.pat, NULL, false)) {
- push_match(L, m);
- lua_pushinteger(L, (int)(m->start - text) + 1);
- lua_pushinteger(L, (int)(m->end - m->start));
+ if (next_match(&m, builtins, text+index-1, &text[textlen], pat, NULL, false)) {
+ push_match(L, m, text);
stop_matching(&m);
- ret = 3;
+ ret = 1;
}
-
- recursive_free_pat(maybe_pat.value.pat);
-
return ret;
}
static int Lreplace(lua_State *L)
{
- size_t textlen, patlen, replen;
- const char *text = luaL_checklstring(L, 1, &textlen);
- const char *pat_text = luaL_checklstring(L, 2, &patlen);
- const char *rep_text = luaL_checklstring(L, 3, &replen);
+ if (lua_isstring(L, 1)) {
+ if (Lcompile(L) != 1)
+ return 0;
+ lua_replace(L, 1);
+ }
+ lua_getfield(L, 1, "pattern");
+ pat_t *pat = lua_touserdata(L, -1);
+ lua_pop(L, 1);
+ if (!pat) luaL_error(L, "Not a valid pattern");
+
+ size_t replen, textlen;
+ const char *rep_text = luaL_checklstring(L, 2, &replen);
+ const char *text = luaL_checklstring(L, 3, &textlen);
lua_Integer index = luaL_optinteger(L, 4, 1);
if (index > (lua_Integer)strlen(text)+1)
index = (lua_Integer)strlen(text)+1;
- maybe_pat_t maybe_pat = bp_pattern(pat_text, pat_text + patlen);
- if (!maybe_pat.success) {
- raise_parse_error(L, maybe_pat);
- return 0;
- }
- maybe_pat_t maybe_replacement = bp_replacement(maybe_pat.value.pat, rep_text, rep_text + replen);
+ maybe_pat_t maybe_replacement = bp_replacement(pat, rep_text, rep_text + replen);
if (!maybe_replacement.success) {
- recursive_free_pat(maybe_pat.value.pat);
raise_parse_error(L, maybe_replacement);
return 0;
}
@@ -199,12 +226,93 @@ static int Lreplace(lua_State *L)
lua_pushinteger(L, replacements);
fclose(out);
- // maybe_pat will get freed by this:
- recursive_free_pat(maybe_replacement.value.pat);
+ free_pat(maybe_replacement.value.pat);
return 2;
}
+// Iterator function handed out by Leachmatch for use in a generic `for`.
+// It is called with (state, control), where `state` is the {pattern, text}
+// table built by Leachmatch. The stack is rearranged into
+// (pattern, text, control) and the call is delegated to Lmatch.
+static int iter(lua_State *L)
+{
+ lua_geti(L, 1, 1); // stack: state, control, pattern
+ lua_geti(L, 1, 2); // stack: state, control, pattern, text
+ lua_replace(L, 1); // stack: text, control, pattern
+ lua_rotate(L, 1, 1); // stack: pattern, text, control
+ return Lmatch(L);
+}
+
+/*
+ * bp.eachmatch(pat, str, [start_index]) / pat:eachmatch(str, [start_index])
+ * Returns the three values a generic `for` expects: the `iter` function, a
+ * state table {pattern object, text}, and the initial control value (the
+ * start index, or nil when none was given). A pattern given as source text is
+ * compiled once here through Lcompile.
+ */
+static int Leachmatch(lua_State *L)
+{
+    // Validate the text before anything is pushed: with fewer than two
+    // arguments, stack index 2 would otherwise refer to the iterator function
+    // pushed below and the mistake would only surface inside the loop.
+    luaL_checkstring(L, 2);
+    int nargs = lua_gettop(L);
+    lua_pushcfunction(L, iter); // iter
+    lua_createtable(L, 2, 0); // state: {pat, str}
+    if (lua_isstring(L, 1)) {
+        if (Lcompile(L) != 1)
+            return 0;
+    } else {
+        lua_pushvalue(L, 1);
+    }
+    lua_seti(L, -2, 1);
+    lua_pushvalue(L, 2);
+    lua_seti(L, -2, 2);
+    if (nargs >= 3) // start index
+        lua_pushvalue(L, 3);
+    else
+        lua_pushnil(L);
+    return 3;
+}
+
+// __tostring for match tables: yields whatever is stored at index 0 of the
+// match (push_match stores the matched text there).
+static int Lmatch_tostring(lua_State *L)
+{
+    lua_pushinteger(L, 0);
+    lua_gettable(L, 1);
+    return 1;
+}
+
+// __tostring for pattern objects: wraps the object's `source` field as
+// "Pattern [[<source>]]".
+static int Lpat_tostring(lua_State *L)
+{
+    static const char prefix[] = "Pattern [[";
+    static const char suffix[] = "]]";
+    luaL_Buffer buf;
+    luaL_buffinit(L, &buf);
+    luaL_addlstring(&buf, prefix, sizeof(prefix) - 1);
+    lua_getfield(L, 1, "source");
+    luaL_addvalue(&buf);
+    luaL_addlstring(&buf, suffix, sizeof(suffix) - 1);
+    luaL_pushresult(&buf);
+    return 1;
+}
+
+/*
+ * __gc metamethod for pattern objects.
+ * A pattern object is a table (built by Lcompile) that keeps its parsed
+ * pat_t* as a light userdata in the `pattern` field, so the pointer must be
+ * read from that field: lua_touserdata() applied to the table itself returns
+ * NULL, which would leave every compiled pattern unfreed.
+ */
+static int Lpat_gc(lua_State *L)
+{
+    if (!lua_istable(L, 1))
+        return 0;
+    lua_pushliteral(L, "pattern");
+    lua_rawget(L, 1);
+    pat_t *pat = lua_touserdata(L, -1);
+    lua_pop(L, 1);
+    if (pat) {
+        recursively_free_pat(pat);
+        // Clear the field so that a second invocation cannot free it again:
+        lua_pushliteral(L, "pattern");
+        lua_pushnil(L);
+        lua_rawset(L, 1);
+    }
+    return 0;
+}
+
+// Metamethods for match tables; luaopen_bp stores the resulting table in the
+// registry under the address of MATCH_METATABLE.
+static const luaL_Reg match_metamethods[] = {
+ {"__tostring", Lmatch_tostring},
+ {NULL, NULL}
+};
+
+// Methods callable on a pattern object (pat:match(), pat:replace(),
+// pat:eachmatch()); luaopen_bp installs this table as the `__index` of the
+// pattern metatable.
+static const luaL_Reg pat_methods[] = {
+ {"match", Lmatch},
+ {"replace", Lreplace},
+ {"eachmatch", Leachmatch},
+ {NULL, NULL}
+};
+
+// Metamethods for pattern objects; luaopen_bp stores the resulting table in
+// the registry under the address of PAT_METATABLE.
+static const luaL_Reg pat_metamethods[] = {
+ {"__gc", Lpat_gc},
+ {"__tostring", Lpat_tostring},
+ {"__index", NULL}, // placeholder for pat_methods
+ {NULL, NULL}
+};
+
+// Functions exported by the `bp` module table that luaopen_bp returns.
+static const luaL_Reg bp_methods[] = {
+ {"match", Lmatch},
+ {"replace", Lreplace},
+ {"compile", Lcompile},
+ {"eachmatch", Leachmatch},
+ {NULL, NULL}
+};
+
LUALIB_API int luaopen_bp(lua_State *L)
{
maybe_pat_t maybe_pat = bp_pattern(builtins_source, builtins_source+strlen(builtins_source));
@@ -215,17 +323,16 @@ LUALIB_API int luaopen_bp(lua_State *L)
for (pat_t *p = maybe_pat.value.pat; p && p->type == BP_DEFINITION; p = p->args.def.pat)
builtins = with_def(builtins, p->args.def.namelen, p->args.def.name, p->args.def.def);
+ lua_pushlightuserdata(L, (void*)&PAT_METATABLE);
+ luaL_newlib(L, pat_metamethods);
+ luaL_newlib(L, pat_methods);
+ lua_setfield(L, -2, "__index");
+ lua_settable(L, LUA_REGISTRYINDEX);
+
lua_pushlightuserdata(L, (void*)&MATCH_METATABLE);
- lua_createtable(L, 0, 4);
- luaL_register(L, NULL, Rinstance_metamethods);
+ luaL_newlib(L, match_metamethods);
lua_settable(L, LUA_REGISTRYINDEX);
- lua_createtable(L, 0, 2);
- lua_pushcfunction(L, Lmatch);
- lua_setfield(L, -2, "match");
- lua_pushcfunction(L, Lreplace);
- lua_setfield(L, -2, "replace");
- // lua_pushcfunction(L, Leach);
- // lua_setfield(L, -1, "each");
+ luaL_newlib(L, bp_methods);
return 1;
}
diff --git a/Lua/test.lua b/Lua/test.lua
index c2fbb3c..3e13f55 100644
--- a/Lua/test.lua
+++ b/Lua/test.lua
@@ -1,15 +1,5 @@
local bp = require 'bp'
-local function iter(state, _)
- local m, start, len = bp.match(state[1], state[2], state[3])
- state[3] = m and start+math.max(len,1)
- return m, start, len
-end
-
-bp.each = function(s, pat, index)
- return iter, {s, pat, index}, index
-end
-
local function repr(obj)
if type(obj) == 'table' then
local ret = {}
@@ -23,26 +13,38 @@ local function repr(obj)
end
print("Matching:")
-for m, i,j in bp.each("one two three", "(*`a-z) => '(@0)'") do
- print(("%s @%d len=%d"):format(repr(m),i,j))
+for m in bp.eachmatch("(*`a-z) => '(@0)'", "one two three") do
+ print(repr(m))
end
-print(("Replacing: %q (%d replacements)"):format(bp.replace("one two three", "+`a-z", "(@0)")))
+print(("Replacing: %q (%d replacements)"):format(bp.replace("+`a-z", "(@0)", "one two three")))
print("Captures:")
-local m = bp.match("one two three four", "@first=+`a-z _ @second=(+`a-z => 'XX@0XX') _ @+`a-z _ @last=+`a-z")
+local m = bp.match("@first=+`a-z _ @second=(+`a-z => 'XX@0XX') _ @+`a-z _ @last=+`a-z", "one two three four")
print(repr(m))
-local m = bp.match("one two three four", "@dup=+`a-z _ @dup=+`a-z _ @two=(@a=+`a-z _ @b=+`a-z)")
+local m = bp.match("@dup=+`a-z _ @dup=+`a-z _ @two=(@a=+`a-z _ @b=+`a-z)", "one two three four")
print(repr(m))
print("Testing parse errors:")
local ok, msg = pcall(function()
- bp.match("xxx", ".;//;;; wtf")
+ bp.match(".;//;;; wtf", "xxx")
end)
if not ok then print(("\x1B[41;30mParse error:\x1B[0;1;31m %s\x1B[m\n"):format(msg)) end
print("Testing builtins:")
-print(bp.match("...(foo())...", "parens"))
+print(bp.match("parens", "...(foo())..."))
+
+
+print("Testing pat objects")
+local pat = bp.compile("+`a-z")
+print(pat)
+print(pat:match("...foo..."))
+print(pat:match("...baz..."))
+print(pat:replace("{@0}", "...baz..."))
+
+for m in pat:eachmatch("hello world") do
+ print(m)
+end