diff options
| author | Bruce Hill <bruce@bruce-hill.com> | 2021-09-25 19:03:56 -0700 |
|---|---|---|
| committer | Bruce Hill <bruce@bruce-hill.com> | 2021-09-25 19:03:56 -0700 |
| commit | fb1840e07a6643a92206abb8a64a33b003ec06a8 (patch) | |
| tree | 3aa4c2440b3e6222c77bd63445bf3c90c86dcb3d /Lua | |
| parent | 270a0a47c4561507fd34d750a59bb1e64acb822c (diff) | |
Improved lua API, including bp.compile() and bp.eachmatch()
Diffstat (limited to 'Lua')
| -rw-r--r-- | Lua/README.md | 44 | ||||
| -rw-r--r-- | Lua/lbp.c | 259 | ||||
| -rw-r--r-- | Lua/test.lua | 36 |
3 files changed, 236 insertions, 103 deletions
diff --git a/Lua/README.md b/Lua/README.md index 25ccf5f..1288696 100644 --- a/Lua/README.md +++ b/Lua/README.md @@ -9,17 +9,30 @@ roughly equivalent in usefulness to LPEG, but with a smaller codebase (roughly The Lua `bp` bindings provide the following methods: ```lua -bp.match(text, pattern, [start_index]) --> match, match_start, match_length / nil -bp.replace(text, pattern, replacement, [start_index]) --> text_with_replacements, num_replacements +bp.match(pattern, text, [start_index]) --> match / nil +bp.replace(pattern, replacement, text, [start_index]) --> text_with_replacements, num_replacements +bp.compile(pattern) --> pattern_object +for m in bp.eachmatch(pattern, text, [start_index]) do ... end + +pattern_object:match(text, [start_index]) --> match / nil +pattern_object:replace(replacement, text, [start_index]) --> text_with_replacements, num_replacements +for m in pattern_object:eachmatch(text, [start_index]) do ... end ``` Match objects returned by `bp.match()` are tables whose `__tostring` will return the text of the match. Additionally, match objects store the text of the -match at index `0`, and any captures stored as match objects with a key -corresponding to the capture's identifier (e.g. `@"a" @foo="bc"` will be -encoded as `{[0]="abc", [1]="a", foo={[0]="bc"}}`. If multiple captures within -a match share the same identifier, it is unspecified which captured match will -be stored at that key, so it's best to be unambiguous. +match at index `0`, the match's starting index in the source string as +`.start`, the first index after the match as `.after`, and any captures stored +as match objects with a key corresponding to the capture's identifier (e.g. +`@"a" @foo="bc"` will be encoded as `{[0]="abc", [1]="a", foo={[0]="bc"}}`. If +multiple captures within a match share the same identifier, it is unspecified +which captured match will be stored at that key, so it's best to be +unambiguous. 
+ +Pattern objects returned by `bp.compile()` are pre-compiled patterns that are +slightly faster to reuse than just calling `bp.match()` repeatedly. They have a +`.source` attribute that holds the original text used to compile them and have +`:match()`, `:replace()`, and `:eachmatch()` methods as described above. All methods will raise an error with a descriptive message if the given pattern has a syntax error. @@ -28,9 +41,20 @@ has a syntax error. ```lua local bp = require("bp") -local m, i, len = bp.match("like finding a needle in a haystack", '"n" @Es=+`e "dle"') ---> {[0]="needle", Es={[0]="ee"}}, 16, 6 +local m = bp.match('"n" @Es=+`e "dle"', "like finding a needle in a haystack") +--> {[0]="needle", Es={[0]="ee", start=17, after=19}, start=16, after=22} --> tostring(m) == "needle", tostring(m.Es) == "ee" -local replaced, nreplacements = bp.match("like finding a needle in a haystack", '"n" +`e "dle"', "cat") +local replaced, nreplacements = bp.replace('"n" +`e "dle"', "cat", "like finding a needle in a haystack") --> "like finding a cat in a haystack", 1 + +for word in bp.eachmatch("+`A-Z,a-z", "one, two three... four!") do + print(word) --> prints "one" "two" "three" "four" +end + +local pat = bp.compile("word parens") +for _,s in ipairs(my_strings) do + for fncall in pat:eachmatch(s) do + print(fncall) + end +end ``` @@ -1,8 +1,13 @@ /* * lbp.c - bp library for lua * API: -* bp.match(str, pat[, start_index]) -> nil or (match_table, start_index, input_consumed) -* bp.replace(str, pat, replacement, start_index) -> str with replacements, num_replacements +* bp.match(pat, str, [start_index]) -> nil or match_table +* bp.replace(pat, replacement, str, [start_index]) -> str with replacements, num_replacements +* for match_table in bp.eachmatch(pat, str, [start_index]) do ... end +* bp.compile(pat) -> pattern object +* pat:match(str, [start_index]) +* pat:replace(replacement, str, [start_index]) +* for match in pat:eachmatch(str, [start_index]) do ... 
end */ #include <fcntl.h> @@ -26,11 +31,11 @@ static const char *builtins_source = ( #include "builtins.h" ); - -static int MATCH_METATABLE = 0; - +static int MATCH_METATABLE = 0, PAT_METATABLE = 0; static def_t *builtins; +static void push_match(lua_State *L, match_t *m, const char *start); + static inline void raise_parse_error(lua_State *L, maybe_pat_t m) { size_t err_len = (size_t)(m.value.error.end - m.value.error.start); @@ -40,6 +45,26 @@ static inline void raise_parse_error(lua_State *L, maybe_pat_t m) free(buf); } +static int Lcompile(lua_State *L) +{ + size_t patlen; + const char *pat_text = luaL_checklstring(L, 1, &patlen); + maybe_pat_t maybe_pat = bp_pattern(pat_text, pat_text + patlen); + if (!maybe_pat.success) { + raise_parse_error(L, maybe_pat); + return 0; + } + lua_createtable(L, 2, 0); + lua_pushlightuserdata(L, (void*)&PAT_METATABLE); + lua_gettable(L, LUA_REGISTRYINDEX); + lua_setmetatable(L, -2); + lua_pushvalue(L, 1); + lua_setfield(L, -2, "source"); + lua_pushlightuserdata(L, maybe_pat.value.pat); + lua_setfield(L, -2, "pattern"); + return 1; +} + static void push_matchstring(lua_State *L, match_t *m) { char *buf = NULL; @@ -51,79 +76,71 @@ static void push_matchstring(lua_State *L, match_t *m) fclose(out); } -static void push_match(lua_State *L, match_t *m); - -static void set_capture_fields(lua_State *L, match_t *m, int *n) +static void set_capture_fields(lua_State *L, match_t *m, int *n, const char *start) { if (m->pat->type == BP_CAPTURE) { if (m->pat->args.capture.namelen > 0) { lua_pushlstring(L, m->pat->args.capture.name, m->pat->args.capture.namelen); - push_match(L, m->children[0]); + push_match(L, m->children[0], start); lua_settable(L, -3); } else { - push_match(L, m->children[0]); + push_match(L, m->children[0], start); lua_seti(L, -2, *(n++)); } } else if (m->children) { for (int i = 0; m->children[i]; i++) - set_capture_fields(L, m->children[i], n); + set_capture_fields(L, m->children[i], n, start); } } -void 
push_match(lua_State *L, match_t *m) +static void push_match(lua_State *L, match_t *m, const char *start) { - lua_createtable(L, 1, 0); + lua_createtable(L, 1, 2); lua_pushlightuserdata(L, (void*)&MATCH_METATABLE); lua_gettable(L, LUA_REGISTRYINDEX); lua_setmetatable(L, -2); push_matchstring(L, m); lua_seti(L, -2, 0); + int capture_num = 1; for (int i = 0; m->children && m->children[i]; i++) - set_capture_fields(L, m->children[i], &capture_num); -} + set_capture_fields(L, m->children[i], &capture_num, start); -static int Ltostring(lua_State *L) -{ - lua_geti(L, 1, 0); - return 1; + lua_pushinteger(L, 1 + (int)(m->start - start)); + lua_setfield(L, -2, "start"); + lua_pushinteger(L, 1 + (int)(m->end - start)); + lua_setfield(L, -2, "after"); } -static const luaL_Reg Rinstance_metamethods[] = -{ - {"__tostring", Ltostring}, - {NULL, NULL} -}; - -static void recursive_free_pat(pat_t *pat) +static void recursively_free_pat(pat_t *pat) { // Do a depth-first traversal, freeing everyting along the way: if (!pat) return; switch (pat->type) { case BP_DEFINITION: - recursive_free_pat(pat->args.def.def); - recursive_free_pat(pat->args.def.pat); + recursively_free_pat(pat->args.def.def); + recursively_free_pat(pat->args.def.pat); break; case BP_REPEAT: - recursive_free_pat(pat->args.repetitions.sep); - recursive_free_pat(pat->args.repetitions.repeat_pat); + recursively_free_pat(pat->args.repetitions.sep); + recursively_free_pat(pat->args.repetitions.repeat_pat); break; case BP_CHAIN: case BP_UPTO: case BP_UPTO_STRICT: case BP_OTHERWISE: case BP_NOT_MATCH: case BP_MATCH: - recursive_free_pat(pat->args.multiple.first); - recursive_free_pat(pat->args.multiple.second); + recursively_free_pat(pat->args.multiple.first); + recursively_free_pat(pat->args.multiple.second); break; case BP_REPLACE: - recursive_free_pat(pat->args.replace.pat); + recursively_free_pat(pat->args.replace.pat); break; case BP_CAPTURE: - recursive_free_pat(pat->args.capture.capture_pat); + 
recursively_free_pat(pat->args.capture.capture_pat); break; case BP_NOT: case BP_AFTER: case BP_BEFORE: - recursive_free_pat(pat->args.pat); + recursively_free_pat(pat->args.pat); break; case BP_LEFTRECURSION: - recursive_free_pat(pat->args.leftrec.fallback); + recursively_free_pat(pat->args.leftrec.fallback); break; default: break; } @@ -132,52 +149,62 @@ static void recursive_free_pat(pat_t *pat) static int Lmatch(lua_State *L) { - size_t textlen, patlen; - const char *text = luaL_checklstring(L, 1, &textlen); - const char *pat_text = luaL_checklstring(L, 2, &patlen); - lua_Integer index = luaL_optinteger(L, 3, 1); - if (index > (lua_Integer)strlen(text)+1) - return 0; + if (lua_isstring(L, 1)) { + if (Lcompile(L) != 1) + return 0; + lua_replace(L, 1); + } + lua_getfield(L, 1, "pattern"); + pat_t *pat = lua_touserdata(L, -1); + lua_pop(L, 1); + if (!pat) luaL_error(L, "Not a valid pattern"); - maybe_pat_t maybe_pat = bp_pattern(pat_text, pat_text + patlen); - if (!maybe_pat.success) { - raise_parse_error(L, maybe_pat); - return 0; + size_t textlen; + const char *text = luaL_checklstring(L, 2, &textlen); + lua_Integer index; + if (lua_istable(L, 3)) { + lua_getfield(L, 3, "start"); + lua_getfield(L, 3, "after"); + index = luaL_optinteger(L, -1, 1); + if (lua_rawequal(L, -1, -2)) + ++index; + } else { + index = luaL_optinteger(L, 3, 1); } + if (index > (lua_Integer)strlen(text)+1) + return 0; match_t *m = NULL; int ret = 0; - if (next_match(&m, builtins, text+index-1, &text[textlen], maybe_pat.value.pat, NULL, false)) { - push_match(L, m); - lua_pushinteger(L, (int)(m->start - text) + 1); - lua_pushinteger(L, (int)(m->end - m->start)); + if (next_match(&m, builtins, text+index-1, &text[textlen], pat, NULL, false)) { + push_match(L, m, text); stop_matching(&m); - ret = 3; + ret = 1; } - - recursive_free_pat(maybe_pat.value.pat); - return ret; } static int Lreplace(lua_State *L) { - size_t textlen, patlen, replen; - const char *text = luaL_checklstring(L, 1, 
&textlen); - const char *pat_text = luaL_checklstring(L, 2, &patlen); - const char *rep_text = luaL_checklstring(L, 3, &replen); + if (lua_isstring(L, 1)) { + if (Lcompile(L) != 1) + return 0; + lua_replace(L, 1); + } + lua_getfield(L, 1, "pattern"); + pat_t *pat = lua_touserdata(L, -1); + lua_pop(L, 1); + if (!pat) luaL_error(L, "Not a valid pattern"); + + size_t replen, textlen; + const char *rep_text = luaL_checklstring(L, 2, &replen); + const char *text = luaL_checklstring(L, 3, &textlen); lua_Integer index = luaL_optinteger(L, 4, 1); if (index > (lua_Integer)strlen(text)+1) index = (lua_Integer)strlen(text)+1; - maybe_pat_t maybe_pat = bp_pattern(pat_text, pat_text + patlen); - if (!maybe_pat.success) { - raise_parse_error(L, maybe_pat); - return 0; - } - maybe_pat_t maybe_replacement = bp_replacement(maybe_pat.value.pat, rep_text, rep_text + replen); + maybe_pat_t maybe_replacement = bp_replacement(pat, rep_text, rep_text + replen); if (!maybe_replacement.success) { - recursive_free_pat(maybe_pat.value.pat); raise_parse_error(L, maybe_replacement); return 0; } @@ -199,12 +226,93 @@ static int Lreplace(lua_State *L) lua_pushinteger(L, replacements); fclose(out); - // maybe_pat will get freed by this: - recursive_free_pat(maybe_replacement.value.pat); + free_pat(maybe_replacement.value.pat); return 2; } +static int iter(lua_State *L) +{ + lua_geti(L, 1, 1); + lua_geti(L, 1, 2); + lua_replace(L, 1); + lua_rotate(L, 1, 1); + return Lmatch(L); +} + +static int Leachmatch(lua_State *L) +{ + int nargs = lua_gettop(L); + lua_pushcfunction(L, iter); // iter + lua_createtable(L, 2, 0); // state: {pat, str} + if (lua_isstring(L, 1)) { + if (Lcompile(L) != 1) + return 0; + } else { + lua_pushvalue(L, 1); + } + lua_seti(L, -2, 1); + lua_pushvalue(L, 2); + lua_seti(L, -2, 2); + if (nargs >= 3) // start index + lua_pushvalue(L, 3); + else lua_pushnil(L); + return 3; +} + +static int Lmatch_tostring(lua_State *L) +{ + lua_geti(L, 1, 0); + return 1; +} + +static int 
Lpat_tostring(lua_State *L) +{ + luaL_Buffer b; + luaL_buffinit(L, &b); + luaL_addstring(&b, "Pattern [["); + lua_getfield(L, 1, "source"); + luaL_addvalue(&b); + luaL_addstring(&b, "]]"); + luaL_pushresult(&b); + return 1; +} + +static int Lpat_gc(lua_State *L) +{ + pat_t *pat = lua_touserdata(L, 1); + if (pat) + recursively_free_pat(pat); + return 0; +} + +static const luaL_Reg match_metamethods[] = { + {"__tostring", Lmatch_tostring}, + {NULL, NULL} +}; + +static const luaL_Reg pat_methods[] = { + {"match", Lmatch}, + {"replace", Lreplace}, + {"eachmatch", Leachmatch}, + {NULL, NULL} +}; + +static const luaL_Reg pat_metamethods[] = { + {"__gc", Lpat_gc}, + {"__tostring", Lpat_tostring}, + {"__index", NULL}, // placeholder for pat_methods + {NULL, NULL} +}; + +static const luaL_Reg bp_methods[] = { + {"match", Lmatch}, + {"replace", Lreplace}, + {"compile", Lcompile}, + {"eachmatch", Leachmatch}, + {NULL, NULL} +}; + LUALIB_API int luaopen_bp(lua_State *L) { maybe_pat_t maybe_pat = bp_pattern(builtins_source, builtins_source+strlen(builtins_source)); @@ -215,17 +323,16 @@ LUALIB_API int luaopen_bp(lua_State *L) for (pat_t *p = maybe_pat.value.pat; p && p->type == BP_DEFINITION; p = p->args.def.pat) builtins = with_def(builtins, p->args.def.namelen, p->args.def.name, p->args.def.def); + lua_pushlightuserdata(L, (void*)&PAT_METATABLE); + luaL_newlib(L, pat_metamethods); + luaL_newlib(L, pat_methods); + lua_setfield(L, -2, "__index"); + lua_settable(L, LUA_REGISTRYINDEX); + lua_pushlightuserdata(L, (void*)&MATCH_METATABLE); - lua_createtable(L, 0, 4); - luaL_register(L, NULL, Rinstance_metamethods); + luaL_newlib(L, match_metamethods); lua_settable(L, LUA_REGISTRYINDEX); - lua_createtable(L, 0, 2); - lua_pushcfunction(L, Lmatch); - lua_setfield(L, -2, "match"); - lua_pushcfunction(L, Lreplace); - lua_setfield(L, -2, "replace"); - // lua_pushcfunction(L, Leach); - // lua_setfield(L, -1, "each"); + luaL_newlib(L, bp_methods); return 1; } diff --git a/Lua/test.lua 
b/Lua/test.lua index c2fbb3c..3e13f55 100644 --- a/Lua/test.lua +++ b/Lua/test.lua @@ -1,15 +1,5 @@ local bp = require 'bp' -local function iter(state, _) - local m, start, len = bp.match(state[1], state[2], state[3]) - state[3] = m and start+math.max(len,1) - return m, start, len -end - -bp.each = function(s, pat, index) - return iter, {s, pat, index}, index -end - local function repr(obj) if type(obj) == 'table' then local ret = {} @@ -23,26 +13,38 @@ local function repr(obj) end print("Matching:") -for m, i,j in bp.each("one two three", "(*`a-z) => '(@0)'") do - print(("%s @%d len=%d"):format(repr(m),i,j)) +for m in bp.eachmatch("(*`a-z) => '(@0)'", "one two three") do + print(repr(m)) end -print(("Replacing: %q (%d replacements)"):format(bp.replace("one two three", "+`a-z", "(@0)"))) +print(("Replacing: %q (%d replacements)"):format(bp.replace("+`a-z", "(@0)", "one two three"))) print("Captures:") -local m = bp.match("one two three four", "@first=+`a-z _ @second=(+`a-z => 'XX@0XX') _ @+`a-z _ @last=+`a-z") +local m = bp.match("@first=+`a-z _ @second=(+`a-z => 'XX@0XX') _ @+`a-z _ @last=+`a-z", "one two three four") print(repr(m)) -local m = bp.match("one two three four", "@dup=+`a-z _ @dup=+`a-z _ @two=(@a=+`a-z _ @b=+`a-z)") +local m = bp.match("@dup=+`a-z _ @dup=+`a-z _ @two=(@a=+`a-z _ @b=+`a-z)", "one two three four") print(repr(m)) print("Testing parse errors:") local ok, msg = pcall(function() - bp.match("xxx", ".;//;;; wtf") + bp.match(".;//;;; wtf", "xxx") end) if not ok then print(("\x1B[41;30mParse error:\x1B[0;1;31m %s\x1B[m\n"):format(msg)) end print("Testing builtins:") -print(bp.match("...(foo())...", "parens")) +print(bp.match("parens", "...(foo())...")) + + +print("Testing pat objects") +local pat = bp.compile("+`a-z") +print(pat) +print(pat:match("...foo...")) +print(pat:match("...baz...")) +print(pat:replace("{@0}", "...baz...")) + +for m in pat:eachmatch("hello world") do + print(m) +end |
