Better string parsing.
This commit is contained in:
parent
930c09f46d
commit
4e545c6798
3
ast.c
3
ast.c
@ -92,8 +92,7 @@ CORD ast_to_cord(ast_t *ast)
|
||||
T(Char, "(\x1b[35m'%c'\x1b[m)", data.c)
|
||||
T(StringLiteral, "\x1b[35m\"%s\"\x1b[m", data.str)
|
||||
T(StringJoin, "(%r)", ast_list_to_cord(data.children))
|
||||
T(Interp, "(%r)", ast_to_cord(data.value))
|
||||
T(Declare, "(var=%s, value=%s)", ast_to_cord(data.var), ast_to_cord(data.value))
|
||||
T(Declare, "(var=%s, value=%r)", ast_to_cord(data.var), ast_to_cord(data.value))
|
||||
T(Assign, "(targets=%r, values=%r)", ast_list_to_cord(data.targets), ast_list_to_cord(data.values))
|
||||
T(BinaryOp, "(%r, %s, %r)", ast_to_cord(data.lhs), OP_NAMES[data.op], ast_to_cord(data.rhs))
|
||||
T(UpdateAssign, "(%r, %s, %r)", ast_to_cord(data.lhs), OP_NAMES[data.op], ast_to_cord(data.rhs))
|
||||
|
6
ast.h
6
ast.h
@ -92,7 +92,7 @@ typedef enum {
|
||||
Unknown = 0,
|
||||
Nil, Bool, Var,
|
||||
Int, Num, Char,
|
||||
StringLiteral, StringJoin, Interp,
|
||||
StringLiteral, StringJoin,
|
||||
Declare, Assign,
|
||||
BinaryOp, UnaryOp, UpdateAssign,
|
||||
Min, Max,
|
||||
@ -145,10 +145,6 @@ struct ast_s {
|
||||
struct {
|
||||
ast_list_t *children;
|
||||
} StringJoin;
|
||||
struct {
|
||||
ast_t *value;
|
||||
bool labelled:1, colorize:1, quote_string:1;
|
||||
} Interp;
|
||||
struct {
|
||||
ast_t *var;
|
||||
ast_t *value;
|
||||
|
@ -130,9 +130,6 @@ CORD compile(ast_t *ast)
|
||||
}
|
||||
return code;
|
||||
}
|
||||
case Interp: {
|
||||
return CORD_asprintf("__cord(%r)", compile(Match(ast, Interp)->value));
|
||||
}
|
||||
case Block: {
|
||||
ast_list_t *stmts = Match(ast, Block)->statements;
|
||||
if (stmts && !stmts->next)
|
||||
|
@ -35,6 +35,7 @@
|
||||
int32_t: CORD_asprintf("%d", x), int64_t: CORD_asprintf("%ld", x), \
|
||||
double: CORD_asprintf("%g", x), float: CORD_asprintf("%g", x), \
|
||||
CORD: x, \
|
||||
char*: x, \
|
||||
default: "???")
|
||||
#define __heap(x) (__typeof(x)*)memcpy(GC_MALLOC(sizeof(x)), (__typeof(x)[1]){x}, sizeof(x))
|
||||
#define __stack(x) (&(__typeof(x)){x})
|
||||
@ -56,3 +57,5 @@
|
||||
#define mod1(x, n) (((x) % (n)) + (__typeof(x))1)
|
||||
|
||||
#define say(str) puts(CORD_to_const_char_star(str))
|
||||
|
||||
// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0
|
||||
|
223
parse.c
223
parse.c
@ -14,6 +14,7 @@
|
||||
#include "ast.h"
|
||||
#include "util.h"
|
||||
|
||||
static const char closing[128] = {['(']=')', ['[']=']', ['<']='>', ['{']='}', ['|']='|', ['/']='/'};
|
||||
|
||||
typedef struct {
|
||||
sss_file_t *file;
|
||||
@ -80,6 +81,7 @@ static PARSER(parse_opt_indented_block);
|
||||
static PARSER(parse_var);
|
||||
static PARSER(parse_enum_def);
|
||||
static PARSER(parse_struct_def);
|
||||
static PARSER(parse_string);
|
||||
static PARSER(parse_func_def);
|
||||
static PARSER(parse_extern);
|
||||
static PARSER(parse_declaration);
|
||||
@ -166,7 +168,10 @@ size_t some_not(const char **pos, const char *forbid) {
|
||||
return len;
|
||||
}
|
||||
|
||||
size_t spaces(const char **pos) { return some_of(pos, " \t"); }
|
||||
size_t spaces(const char **pos) {
|
||||
return some_of(pos, " \t");
|
||||
}
|
||||
|
||||
size_t whitespace(const char **pos) {
|
||||
const char *p0 = *pos;
|
||||
while (some_of(pos, " \t\r\n") || comment(pos))
|
||||
@ -222,7 +227,7 @@ static void expect_closing(
|
||||
|
||||
const char *eol = strchr(*pos, '\n');
|
||||
const char *next = strstr(*pos, closing);
|
||||
|
||||
|
||||
const char *end = eol < next ? eol : next;
|
||||
|
||||
if (isatty(STDERR_FILENO) && !getenv("NO_COLOR"))
|
||||
@ -868,154 +873,100 @@ PARSER(parse_char) {
|
||||
}
|
||||
}
|
||||
|
||||
PARSER(parse_interpolation) {
|
||||
const char *start = pos;
|
||||
++pos; // ignore the initial character, typically a '$', but might be other stuff like '@' in different contexts
|
||||
bool labelled = match(&pos, ":");
|
||||
ast_t *value = optional(ctx, &pos, parse_parens);
|
||||
if (!value) value = optional(ctx, &pos, parse_term);
|
||||
if (!value) {
|
||||
match_group(&pos, '(');
|
||||
parser_err(ctx, start, pos, "This interpolation didn't parse");
|
||||
}
|
||||
return NewAST(ctx->file, start, pos, Interp, .value=value, .labelled=labelled);
|
||||
}
|
||||
|
||||
PARSER(parse_string) {
|
||||
static const char closing[128] = {['(']=')', ['[']=']', ['<']='>', ['{']='}'};
|
||||
static const bool escapes[128] = {['\'']='\x1B', ['(']='\x1B', ['>']='\x1B', ['/']='\x1B'};
|
||||
static const char interps[128] = {['>']='@', ['/']='@', ['\'']='\x1A', ['(']='\x1A'};
|
||||
|
||||
const char *string_start = pos;
|
||||
char open, close;
|
||||
if (match(&pos, "$")) {
|
||||
open = *pos;
|
||||
close = closing[(int)open] ? closing[(int)open] : open;
|
||||
++pos;
|
||||
// ["$" [interp-char [closing-interp-char]]] ('"' ... '"' / "'" ... "'")
|
||||
const char *start = pos;
|
||||
char close_quote, start_interp = '\x03', close_interp = '\x02';
|
||||
if (match(&pos, "\"")) {
|
||||
close_quote = '"', start_interp = '{', close_interp = '}';
|
||||
} else if (match(&pos, "'")) {
|
||||
close_quote = '\'';
|
||||
} else if (match(&pos, "$")) {
|
||||
if (*pos == '"' || *pos == '\'' || *pos == '/' || *pos == ';') {
|
||||
close_quote = *pos;
|
||||
} else {
|
||||
start_interp = *(pos++);
|
||||
close_interp = closing[(int)start_interp];
|
||||
if (*pos == close_interp) ++pos;
|
||||
close_quote = closing[(int)*pos] ? closing[(int)*pos] : *pos;
|
||||
++pos;
|
||||
}
|
||||
} else {
|
||||
if (*pos != '\'' && *pos != '"')
|
||||
return NULL;
|
||||
open = *pos;
|
||||
close = *pos;
|
||||
++pos;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
char interp_char = interps[(int)open] ? interps[(int)open] : '$';
|
||||
char escape_char = escapes[(int)open] ? escapes[(int)open] : '\\';
|
||||
// printf("Parsing string: '%c' .. '%c' interp: '%c%c'\n", *start, close_quote, start_interp, close_interp);
|
||||
|
||||
if (open == ':' || open == '>')
|
||||
spaces(&pos);
|
||||
int64_t starting_indent = sss_get_indent(ctx->file, pos);
|
||||
int64_t string_indent;
|
||||
if (*pos == '\r' || *pos == '\n') {
|
||||
const char *first_line = pos;
|
||||
whitespace(&first_line);
|
||||
string_indent = sss_get_indent(ctx->file, first_line);
|
||||
if (string_indent <= starting_indent)
|
||||
parser_err(ctx, start, first_line, "Multi-line strings must be indented on their first line");
|
||||
} else {
|
||||
string_indent = starting_indent + 4;
|
||||
}
|
||||
|
||||
ast_list_t *chunks = NULL;
|
||||
if (*pos == '\r' || *pos == '\n') { // Multiline string
|
||||
char special[] = {'\n','\r',interp_char,escape_char,'\0'};
|
||||
int64_t starting_indent = sss_get_indent(ctx->file, pos);
|
||||
// indentation-delimited string
|
||||
match(&pos, "\r");
|
||||
match(&pos, "\n");
|
||||
int64_t first_line = sss_get_line_number(ctx->file, pos);
|
||||
int64_t indented = sss_get_indent(ctx->file, pos);
|
||||
pos = sss_get_line(ctx->file, first_line);
|
||||
while (pos < ctx->file->text + ctx->file->len) {
|
||||
const char *eol = strchrnul(pos, '\n');
|
||||
if (eol == pos + strspn(pos, " \t\r")) { // Empty line
|
||||
ast_t *ast = NewAST(ctx->file, pos, eol, StringLiteral, .str="\n");
|
||||
chunks = new(ast_list_t, .ast=ast, .next=chunks);
|
||||
pos = eol + 1;
|
||||
CORD chunk = NULL;
|
||||
const char *chunk_start = pos;
|
||||
for (; pos < ctx->file->text + ctx->file->len && *pos != close_quote; ++pos) {
|
||||
if (*pos == start_interp) {
|
||||
if (chunk) {
|
||||
ast_t *literal = NewAST(ctx->file, chunk_start, pos, StringLiteral, .str=CORD_to_const_char_star(chunk));
|
||||
chunks = new(ast_list_t, .ast=literal, .next=chunks);
|
||||
chunk = NULL;
|
||||
}
|
||||
++pos;
|
||||
spaces(&pos);
|
||||
for (ast_t *interp; (interp=optional(ctx, &pos, parse_expr)); spaces(&pos)) {
|
||||
interp = WrapAST(interp, FunctionCall, .fn=WrapAST(interp, Var, .name="__cord"), .args=new(ast_list_t, .ast=interp));
|
||||
chunks = new(ast_list_t, .ast=interp, .next=chunks);
|
||||
chunk_start = pos;
|
||||
}
|
||||
if (close_interp) {
|
||||
const char *closing = pos;
|
||||
spaces(&closing);
|
||||
if (*closing == close_interp) {
|
||||
pos = closing;
|
||||
}
|
||||
}
|
||||
} else if (*pos == '\r' || *pos == '\n') {
|
||||
// Newline handling
|
||||
match(&pos, "\r");
|
||||
match(&pos, "\n");
|
||||
if (match_indentation(&pos, string_indent)) {
|
||||
if (chunk || chunks)
|
||||
chunk = CORD_cat_char(chunk, '\n');
|
||||
--pos;
|
||||
continue;
|
||||
}
|
||||
if (!match_indentation(&pos, starting_indent))
|
||||
parser_err(ctx, pos, strchrnul(pos, '\n'), "This isn't a valid indentation level for this unterminated string");
|
||||
|
||||
if (*pos == close) {
|
||||
++pos;
|
||||
goto finished;
|
||||
}
|
||||
|
||||
if (!match_indentation(&pos, (indented - starting_indent)))
|
||||
parser_err(ctx, pos, strchrnul(pos, '\n'), "I was expecting this to have %lu extra indentation beyond %lu",
|
||||
(indented - starting_indent), starting_indent);
|
||||
|
||||
while (pos < eol+1) {
|
||||
size_t len = strcspn(pos, special);
|
||||
if (pos[len] == '\r') ++len;
|
||||
if (pos[len] == '\n') ++len;
|
||||
|
||||
if (len > 0) {
|
||||
ast_t *chunk = NewAST(ctx->file, pos, pos+len-1, StringLiteral, .str=heap_strn(pos, len));
|
||||
chunks = new(ast_list_t, .ast=chunk, .next=chunks);
|
||||
}
|
||||
|
||||
pos += len;
|
||||
|
||||
if (*pos == escape_char) {
|
||||
const char *start = pos;
|
||||
const char* unescaped = unescape(&pos);
|
||||
ast_t *chunk = NewAST(ctx->file, start, pos, StringLiteral, .str=unescaped);
|
||||
chunks = new(ast_list_t, .ast=chunk, .next=chunks);
|
||||
++pos;
|
||||
} else if (*pos == interp_char) {
|
||||
ast_t *chunk = parse_interpolation(ctx, pos);
|
||||
chunks = new(ast_list_t, .ast=chunk, .next=chunks);
|
||||
pos = chunk->end;
|
||||
if (sss_get_indent(ctx->file, pos) == starting_indent) {
|
||||
if (*pos == close_quote) {
|
||||
break;
|
||||
} else if (some_of(&pos, ".") >= 2) {
|
||||
// Multi-line split
|
||||
--pos;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
finished:;
|
||||
// Strip trailing newline:
|
||||
if (chunks) {
|
||||
ast_t *last_chunk = chunks->ast;
|
||||
if (last_chunk->tag == StringLiteral) {
|
||||
auto str = Match(last_chunk, StringLiteral);
|
||||
const char* trimmed = heap_strn(str->str, strlen(str->str)-1);
|
||||
chunks->ast = NewAST(ctx->file, last_chunk->start, last_chunk->end-1, StringLiteral, .str=trimmed);
|
||||
}
|
||||
}
|
||||
} else { // Inline string
|
||||
char special[] = {'\n','\r',open,close,interp_char,escape_char,'\0'};
|
||||
int depth = 1;
|
||||
while (depth > 0 && *pos) {
|
||||
size_t len = strcspn(pos, special);
|
||||
if (len > 0) {
|
||||
ast_t *chunk = NewAST(ctx->file, pos, pos+len-1, StringLiteral, .str=heap_strn(pos, len));
|
||||
chunks = new(ast_list_t, .ast=chunk, .next=chunks);
|
||||
pos += len;
|
||||
}
|
||||
|
||||
if (*pos == interp_char) {
|
||||
ast_t *chunk = parse_interpolation(ctx, pos);
|
||||
chunks = new(ast_list_t, .ast=chunk, .next=chunks);
|
||||
pos = chunk->end;
|
||||
} else if (*pos == escape_char) {
|
||||
const char *start = pos;
|
||||
const char* unescaped = unescape(&pos);
|
||||
ast_t *chunk = NewAST(ctx->file, start, pos, StringLiteral, .str=unescaped);
|
||||
chunks = new(ast_list_t, .ast=chunk, .next=chunks);
|
||||
} else if (*pos == '\r' || *pos == '\n') {
|
||||
if (open == ' ' || open == ':' || open == '>') goto string_finished;
|
||||
parser_err(ctx, string_start, pos, "This line ended without closing the string");
|
||||
} else if (*pos == close) { // if open == close, then don't do nesting (i.e. check 'close' first)
|
||||
--depth;
|
||||
if (depth > 0) {
|
||||
ast_t *chunk = NewAST(ctx->file, pos, pos+1, StringLiteral, .str=heap_strn(pos, 1));
|
||||
chunks = new(ast_list_t, .ast=chunk, .next=chunks);
|
||||
}
|
||||
++pos;
|
||||
} else if (*pos == open) {
|
||||
++depth;
|
||||
ast_t *chunk = NewAST(ctx->file, pos, pos+1, StringLiteral, .str=heap_strn(pos, 1));
|
||||
chunks = new(ast_list_t, .ast=chunk, .next=chunks);
|
||||
++pos;
|
||||
} else {
|
||||
ast_t *chunk = NewAST(ctx->file, pos, pos+1, StringLiteral, .str=heap_strn(pos, 1));
|
||||
++pos;
|
||||
chunks = new(ast_list_t, .ast=chunk, .next=chunks);
|
||||
}
|
||||
parser_err(ctx, pos, strchrnul(pos, '\n'), "This string line isn't correctly indented");
|
||||
} else {
|
||||
chunk = CORD_cat_char(chunk, *pos);
|
||||
}
|
||||
}
|
||||
string_finished:;
|
||||
|
||||
if (chunk) {
|
||||
ast_t *literal = NewAST(ctx->file, chunk_start, pos, StringLiteral, .str=CORD_to_const_char_star(chunk));
|
||||
chunks = new(ast_list_t, .ast=literal, .next=chunks);
|
||||
chunk = NULL;
|
||||
}
|
||||
|
||||
REVERSE_LIST(chunks);
|
||||
return NewAST(ctx->file, string_start, pos, StringJoin, .children=chunks);
|
||||
expect_closing(ctx, &pos, (char[]){close_quote, 0}, "I was expecting a '%c' to finish this string", close_quote);
|
||||
return NewAST(ctx->file, start, pos, StringJoin, .children=chunks);
|
||||
}
|
||||
|
||||
PARSER(parse_skip) {
|
||||
|
Loading…
Reference in New Issue
Block a user