-- This file contains the parser, which converts Nomsu text into abstract syntax trees
lpeg = require 'lpeg'
re = require 're'
-- Deeply nested Nomsu code can exceed lpeg's default backtrack stack
lpeg.setmaxstack 10000
{:P,:R,:S,:C,:Cmt,:Carg} = lpeg
{:match, :sub} = string
{:insert, :remove} = table
files = require 'files'
{:NomsuCode, :LuaCode, :Source} = require "code_obj"
AST = require "nomsu_tree"
-- Named patterns and callback functions made available to the .peg grammar
-- files (referenced there as %nl, %ws, %error, etc.).
NOMSU_DEFS = with {}
    -- Newline supports either windows-style CR+LF or unix-style LF
    .nl = P("\r")^-1 * P("\n")
    .ws = S(" \t")
    .tonumber = tonumber
    .table = -> {}
    .unpack = unpack or table.unpack
    string_escapes = n:"\n", t:"\t", b:"\b", a:"\a", v:"\v", f:"\f", r:"\r"
    digit, hex = R('09'), R('09','af','AF')
    -- \xNN / \XNN hexadecimal escapes
    .escaped_char = (P("\\")*S("xX")*C(hex*hex)) / => string.char(tonumber(@, 16))
    -- \N, \NN, or \NNN decimal escapes
    .escaped_char += (P("\\")*C(digit*(digit^-2))) / => string.char(tonumber @)
    -- Single-letter escapes like \n, \t
    .escaped_char += (P("\\")*C(S("ntbavfr"))) / string_escapes
    .operator_char = S("'`~!@$^&*-+=|<>?/")
    -- Multi-byte UTF-8 sequences (2-, 3-, and 4-byte forms)
    .utf8_char = (
        R("\194\223")*R("\128\191") +
        R("\224\239")*R("\128\191")*R("\128\191") +
        R("\240\244")*R("\128\191")*R("\128\191")*R("\128\191"))
    .ident_char = R("az","AZ","09") + P("_") + .utf8_char

    -- The per-parse userdata table ({errors, source, comments}) is passed to
    -- lpeg as extra argument #1 (see Parser.parse below)
    .userdata = Carg(1)

    -- Grammar callback: record a comment's text keyed by its start position.
    -- Returns true so the enclosing Cmt match succeeds.
    .add_comment = (src,end_pos,start_pos,comment,userdata)->
        userdata.comments[start_pos] = comment
        return true

    -- Grammar callback: record a colorized, multi-line error message for the
    -- given position (at most one per position), then let parsing continue.
    -- After 10 distinct errors, returns #src+1 to skip to the end of input
    -- and abort further matching.
    -- NOTE(review): `colored` is assumed to be a global provided elsewhere
    -- (it is not required in this file) -- confirm.
    .error = (src,end_pos,start_pos,err_msg,userdata)->
        seen_errors = userdata.errors
        if seen_errors[start_pos]
            return true
        num_errors = 0
        for _ in pairs(seen_errors) do num_errors += 1
        if num_errors >= 10
            -- Stored at start_pos+1 so it sorts just after the real error at
            -- this position when messages are collected
            seen_errors[start_pos+1] = colored.bright colored.yellow colored.onred "Too many errors, canceling parsing..."
            return #src+1
        err_pos = start_pos
        line_no = files.get_line_number(src, err_pos)
        --src = files.read(userdata.source.filename)
        prev_line = line_no == 1 and "" or files.get_line(src, line_no-1)
        err_line = files.get_line(src, line_no)
        next_line = files.get_line(src, line_no+1)
        -- Column offset of the error within its line
        i = err_pos-files.get_line_starts(src)[line_no]
        pointer = ("-")\rep(i) .. "^"
        err_msg = colored.bright colored.yellow colored.onred (err_msg or "Parse error").." at #{userdata.source.filename}:#{line_no}:"
        if #prev_line > 0 then err_msg ..= "\n"..colored.dim(prev_line)
        -- Highlight the exact offending character in red
        err_line = colored.white(err_line\sub(1, i))..colored.bright(colored.red(err_line\sub(i+1,i+1)))..colored.dim(err_line\sub(i+2,-1))
        err_msg ..= "\n#{err_line}\n#{colored.red pointer}"
        if #next_line > 0 then err_msg ..= "\n"..colored.dim(next_line)
        seen_errors[start_pos] = err_msg
        return true
-- Any name the grammar asks for that isn't defined above is treated as an AST
-- node type: lazily build (and cache) a constructor that tags the captured
-- table with its source span and AST metatable.
setmetatable(NOMSU_DEFS, {__index:(key)=>
    make_node = (start, value, stop, userdata)->
        if userdata.source
            with userdata.source
                -- Positions from lpeg are relative to the parsed string; shift
                -- them to be absolute within the original source
                value.source = Source(.filename, .start + start-1, .start + stop-1)
        setmetatable(value, AST[key])
        if value.__init then value\__init!
        return value
    -- Cache so subsequent lookups skip __index entirely
    self[key] = make_node
    return make_node
})
-- Public module table: current syntax version plus one compiled lpeg pattern
-- per supported version.
Parser = {version:2, patterns:{}}
do
    -- Just for cleanliness, I put the language spec in its own file using a slightly modified
    -- version of the lpeg.re syntax. This grammar rewrites that spec into
    -- standard lpeg.re syntax (adding capture/userdata plumbing).
    peg_tidier = re.compile [[
    file <- %nl* {~ (def/comment) (%nl+ (def/comment))* %nl* ~}
    def <- anon_def / captured_def
    anon_def <- ({ident} (" "*) ":"
        {~ ((%nl " "+ def_line?)+) / def_line ~}) -> "%1 <- %2"
    captured_def <- ({ident} (" "*) "(" {ident} ")" (" "*) ":"
        {~ ((%nl " "+ def_line?)+) / def_line ~}) -> "%1 <- (({} {| %3 |} {} %%userdata) -> %2)"
    def_line <- (err / [^%nl])+
    err <- ("(!!" { (!("!!)") .)* } "!!)") -> "(({} (%1) %%userdata) => error)"
    ident <- [a-zA-Z_][a-zA-Z0-9_]*
    comment <- "--" [^%nl]*
    ]]

    -- Load and compile the grammar for every supported syntax version,
    -- looking first in the working directory, then along package.nomsupath
    for version=1,Parser.version
        peg_file = io.open("nomsu.#{version}.peg")
        if not peg_file and package.nomsupath
            for path in package.nomsupath\gmatch("[^;]+")
                peg_file = io.open(path.."/nomsu.#{version}.peg")
                break if peg_file
        assert(peg_file, "could not find nomsu .peg file")
        nomsu_peg = peg_tidier\match(peg_file\read('*a'))
        peg_file\close!
        Parser.patterns[version] = re.compile(nomsu_peg, NOMSU_DEFS)
-- Parse Nomsu code into a syntax tree.
--   nomsu_code: a string or NomsuCode object
--   source: optional Source describing where the code came from
--           (defaults to nomsu_code.source)
--   version: optional syntax version string; otherwise read from a
--            "#!...nomsu -V N" shebang, falling back to Parser.version
-- Returns the tree (with .comments attached to nodes), nil if parsing was
-- aborted, or raises an error describing any parse failures.
Parser.parse = (nomsu_code, source=nil, version=nil)->
    source or= nomsu_code.source
    nomsu_code = tostring(nomsu_code)
    version or= nomsu_code\match("^#![^\n]*nomsu[ ]+-V[ ]*([0-9.]+)")
    -- Only the major version selects a grammar (e.g. "2.3" -> 2)
    syntax_version = version and tonumber(version\match("^[0-9]+")) or Parser.version
    userdata = {
        errors: {}, :source, comments: {}
    }
    tree = Parser.patterns[syntax_version]\match(nomsu_code, nil, userdata)
    unless tree
        error "In file #{colored.blue tostring(source or "<unknown>")} failed to parse:\n#{colored.onyellow colored.black nomsu_code}"
    -- A numeric result means the %error handler bailed out (too many errors)
    if type(tree) == 'number'
        return nil

    if next(userdata.errors)
        -- Report all recorded errors in source order
        keys = [k for k,v in pairs(userdata.errors)]
        table.sort(keys)
        errors = [userdata.errors[k] for k in *keys]
        error("Errors occurred while parsing (v#{syntax_version}):\n\n"..table.concat(errors, "\n\n"), 0)

    comments = [{comment:c, pos:p} for p,c in pairs(userdata.comments)]
    -- Sort in descending order so we can pop the first comments off the end one at a time
    table.sort comments, (a,b)-> a.pos > b.pos
    comment_i = 1
    -- Attach each comment to the deepest tree node whose source span covers it
    walk_tree = (t)->
        export comment_i
        comment_buff = {}
        while comments[#comments] and comments[#comments].pos <= t.source.start
            table.insert(comment_buff, table.remove(comments))
        for x in *t
            if AST.is_syntax_tree x
                walk_tree x
        while comments[#comments] and comments[#comments].pos <= t.source.stop
            table.insert(comment_buff, table.remove(comments))
        t.comments = comment_buff if #comment_buff > 0
    walk_tree tree

    return tree
-- Returns true if s is entirely made of operator characters, false otherwise.
Parser.is_operator = (s)->
    return not not (NOMSU_DEFS.operator_char^1 * -1)\match(s)
-- Returns true if s is entirely made of identifier characters, false otherwise.
Parser.is_identifier = (s)->
    return not not (NOMSU_DEFS.ident_char^1 * -1)\match(s)
-- Escaper for inline (single-line, double-quoted) Nomsu strings: backslash-
-- escapes quotes, backslashes, and control characters; passes UTF-8 through.
inline_escaper = re.compile "{~ (%utf8_char / ('\"' -> '\\\"') / ('\n' -> '\\n') / ('\t' -> '\\t') / ('\b' -> '\\b') / ('\a' -> '\\a') / ('\v' -> '\\v') / ('\f' -> '\\f') / ('\r' -> '\\r') / ('\\' -> '\\\\') / ([^ -~] -> escape) / .)* ~}", {utf8_char: NOMSU_DEFS.utf8_char, escape:(=> ("\\%03d")\format(@byte!))}

-- Escape a string for use in an inline Nomsu string literal
Parser.inline_escape = (s)->
    return inline_escaper\match(s)

-- Escaper for multi-line strings: only backslashes and non-printable,
-- non-whitespace bytes need escaping; newlines/tabs survive literally.
escaper = re.compile "{~ (%utf8_char / ('\\' -> '\\\\') / [\n\r\t -~] / (. -> escape))* ~}", {utf8_char: NOMSU_DEFS.utf8_char, escape:(=> ("\\%03d")\format(@byte!))}

-- Escape a string for use in an indented (multi-line) Nomsu string literal
Parser.escape = (s)->
    return escaper\match(s)

return Parser