Better string parsing.

This commit is contained in:
Bruce Hill 2024-02-10 15:23:06 -05:00
parent 930c09f46d
commit 4e545c6798
5 changed files with 92 additions and 146 deletions

3
ast.c
View File

@ -92,8 +92,7 @@ CORD ast_to_cord(ast_t *ast)
T(Char, "(\x1b[35m'%c'\x1b[m)", data.c)
T(StringLiteral, "\x1b[35m\"%s\"\x1b[m", data.str)
T(StringJoin, "(%r)", ast_list_to_cord(data.children))
T(Interp, "(%r)", ast_to_cord(data.value))
T(Declare, "(var=%s, value=%s)", ast_to_cord(data.var), ast_to_cord(data.value))
T(Declare, "(var=%s, value=%r)", ast_to_cord(data.var), ast_to_cord(data.value))
T(Assign, "(targets=%r, values=%r)", ast_list_to_cord(data.targets), ast_list_to_cord(data.values))
T(BinaryOp, "(%r, %s, %r)", ast_to_cord(data.lhs), OP_NAMES[data.op], ast_to_cord(data.rhs))
T(UpdateAssign, "(%r, %s, %r)", ast_to_cord(data.lhs), OP_NAMES[data.op], ast_to_cord(data.rhs))

6
ast.h
View File

@ -92,7 +92,7 @@ typedef enum {
Unknown = 0,
Nil, Bool, Var,
Int, Num, Char,
StringLiteral, StringJoin, Interp,
StringLiteral, StringJoin,
Declare, Assign,
BinaryOp, UnaryOp, UpdateAssign,
Min, Max,
@ -145,10 +145,6 @@ struct ast_s {
struct {
ast_list_t *children;
} StringJoin;
struct {
ast_t *value;
bool labelled:1, colorize:1, quote_string:1;
} Interp;
struct {
ast_t *var;
ast_t *value;

View File

@ -130,9 +130,6 @@ CORD compile(ast_t *ast)
}
return code;
}
case Interp: {
return CORD_asprintf("__cord(%r)", compile(Match(ast, Interp)->value));
}
case Block: {
ast_list_t *stmts = Match(ast, Block)->statements;
if (stmts && !stmts->next)

View File

@ -35,6 +35,7 @@
int32_t: CORD_asprintf("%d", x), int64_t: CORD_asprintf("%ld", x), \
double: CORD_asprintf("%g", x), float: CORD_asprintf("%g", x), \
CORD: x, \
char*: x, \
default: "???")
// __heap(x): copy the value `x` into a freshly GC_MALLOC'd buffer of sizeof(x)
// bytes and yield a pointer to that copy, typed as pointer-to-typeof(x).
// `x` appears three more times, but only inside sizeof/__typeof, so for
// non-variably-modified types it is evaluated once (in the compound literal).
// The whole expansion is parenthesized so that postfix operators bind to the
// cast result: without the outer parentheses, `__heap(v)->field` would apply
// `->` to memcpy's `void *` return value before the cast.
#define __heap(x) ((__typeof(x)*)memcpy(GC_MALLOC(sizeof(x)), (__typeof(x)[1]){x}, sizeof(x)))
// __stack(x): yield the address of a compound literal initialized from `x`.
// Inside a function body a compound literal has automatic storage tied to the
// enclosing block, so the pointer must not outlive that block.
#define __stack(x) (&(__typeof(x)){x})
@ -56,3 +57,5 @@
// mod1(x, n): the remainder of `x` by `n`, plus one (the 1 is cast to x's
// type). The second mention of `x` is inside __typeof, so `x` is evaluated
// once for non-variably-modified types.
// NOTE(review): presumably intended for 1-based wrap-around indexing; with C's
// `%`, a negative `x` gives a result <= 1 -- confirm callers never pass one.
#define mod1(x, n) (((x) % (n)) + (__typeof(x))1)
// say(str): convert `str` with CORD_to_const_char_star and print it via puts,
// which appends a trailing newline.
#define say(str) puts(CORD_to_const_char_star(str))
// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0

223
parse.c
View File

@ -14,6 +14,7 @@
#include "ast.h"
#include "util.h"
// Lookup table indexed by an opening delimiter's character code, giving the
// character that closes it: brackets map to their mirrored partner, while '|'
// and '/' close themselves. Every other entry is 0 ("no known closer").
// The table has 128 entries, so the index must be in the range 0..127.
static const char closing[128] = {['(']=')', ['[']=']', ['<']='>', ['{']='}', ['|']='|', ['/']='/'};
typedef struct {
sss_file_t *file;
@ -80,6 +81,7 @@ static PARSER(parse_opt_indented_block);
static PARSER(parse_var);
static PARSER(parse_enum_def);
static PARSER(parse_struct_def);
static PARSER(parse_string);
static PARSER(parse_func_def);
static PARSER(parse_extern);
static PARSER(parse_declaration);
@ -166,7 +168,10 @@ size_t some_not(const char **pos, const char *forbid) {
return len;
}
// spaces(pos): run some_of() over `*pos` with the set " \t" (space and tab) and
// return its result. Other code in this file treats some_of()'s result as a
// count and expects `*pos` to have been advanced -- presumably past the matched
// characters; confirm against some_of().
// The one-line and three-line forms below are the same definition: this diff
// rendering shows the version before and after reformatting.
size_t spaces(const char **pos) { return some_of(pos, " \t"); }
size_t spaces(const char **pos) {
return some_of(pos, " \t");
}
size_t whitespace(const char **pos) {
const char *p0 = *pos;
while (some_of(pos, " \t\r\n") || comment(pos))
@ -222,7 +227,7 @@ static void expect_closing(
const char *eol = strchr(*pos, '\n');
const char *next = strstr(*pos, closing);
const char *end = eol < next ? eol : next;
if (isatty(STDERR_FILENO) && !getenv("NO_COLOR"))
@ -868,154 +873,100 @@ PARSER(parse_char) {
}
}
// Parse one interpolation starting at `pos` (`ctx` and `pos` are presumably
// supplied by the PARSER macro -- its definition is not visible here).
// Shape accepted: <sigil> [":"] ( parenthesized-expr | term )
// Returns an Interp AST node spanning from the sigil to the end of the parsed
// value, with only `.value` and `.labelled` set explicitly.
// If neither a parenthesized expression nor a term parses, parser_err is
// called with the message "This interpolation didn't parse".
PARSER(parse_interpolation) {
// Remember where the sigil is so the node and any error span start there.
const char *start = pos;
++pos; // ignore the initial character, typically a '$', but might be other stuff like '@' in different contexts
// An optional ':' right after the sigil is recorded as the `labelled` flag.
bool labelled = match(&pos, ":");
// Prefer a parenthesized expression; fall back to a bare term.
ast_t *value = optional(ctx, &pos, parse_parens);
if (!value) value = optional(ctx, &pos, parse_term);
if (!value) {
// Presumably skips a balanced '(' group so the error span covers all of it
// -- confirm against match_group().
match_group(&pos, '(');
parser_err(ctx, start, pos, "This interpolation didn't parse");
}
return NewAST(ctx->file, start, pos, Interp, .value=value, .labelled=labelled);
}
PARSER(parse_string) {
static const char closing[128] = {['(']=')', ['[']=']', ['<']='>', ['{']='}'};
static const bool escapes[128] = {['\'']='\x1B', ['(']='\x1B', ['>']='\x1B', ['/']='\x1B'};
static const char interps[128] = {['>']='@', ['/']='@', ['\'']='\x1A', ['(']='\x1A'};
const char *string_start = pos;
char open, close;
if (match(&pos, "$")) {
open = *pos;
close = closing[(int)open] ? closing[(int)open] : open;
++pos;
// ["$" [interp-char [closing-interp-char]]] ('"' ... '"' / "'" ... "'")
const char *start = pos;
char close_quote, start_interp = '\x03', close_interp = '\x02';
if (match(&pos, "\"")) {
close_quote = '"', start_interp = '{', close_interp = '}';
} else if (match(&pos, "'")) {
close_quote = '\'';
} else if (match(&pos, "$")) {
if (*pos == '"' || *pos == '\'' || *pos == '/' || *pos == ';') {
close_quote = *pos;
} else {
start_interp = *(pos++);
close_interp = closing[(int)start_interp];
if (*pos == close_interp) ++pos;
close_quote = closing[(int)*pos] ? closing[(int)*pos] : *pos;
++pos;
}
} else {
if (*pos != '\'' && *pos != '"')
return NULL;
open = *pos;
close = *pos;
++pos;
return NULL;
}
char interp_char = interps[(int)open] ? interps[(int)open] : '$';
char escape_char = escapes[(int)open] ? escapes[(int)open] : '\\';
// printf("Parsing string: '%c' .. '%c' interp: '%c%c'\n", *start, close_quote, start_interp, close_interp);
if (open == ':' || open == '>')
spaces(&pos);
int64_t starting_indent = sss_get_indent(ctx->file, pos);
int64_t string_indent;
if (*pos == '\r' || *pos == '\n') {
const char *first_line = pos;
whitespace(&first_line);
string_indent = sss_get_indent(ctx->file, first_line);
if (string_indent <= starting_indent)
parser_err(ctx, start, first_line, "Multi-line strings must be indented on their first line");
} else {
string_indent = starting_indent + 4;
}
ast_list_t *chunks = NULL;
if (*pos == '\r' || *pos == '\n') { // Multiline string
char special[] = {'\n','\r',interp_char,escape_char,'\0'};
int64_t starting_indent = sss_get_indent(ctx->file, pos);
// indentation-delimited string
match(&pos, "\r");
match(&pos, "\n");
int64_t first_line = sss_get_line_number(ctx->file, pos);
int64_t indented = sss_get_indent(ctx->file, pos);
pos = sss_get_line(ctx->file, first_line);
while (pos < ctx->file->text + ctx->file->len) {
const char *eol = strchrnul(pos, '\n');
if (eol == pos + strspn(pos, " \t\r")) { // Empty line
ast_t *ast = NewAST(ctx->file, pos, eol, StringLiteral, .str="\n");
chunks = new(ast_list_t, .ast=ast, .next=chunks);
pos = eol + 1;
CORD chunk = NULL;
const char *chunk_start = pos;
for (; pos < ctx->file->text + ctx->file->len && *pos != close_quote; ++pos) {
if (*pos == start_interp) {
if (chunk) {
ast_t *literal = NewAST(ctx->file, chunk_start, pos, StringLiteral, .str=CORD_to_const_char_star(chunk));
chunks = new(ast_list_t, .ast=literal, .next=chunks);
chunk = NULL;
}
++pos;
spaces(&pos);
for (ast_t *interp; (interp=optional(ctx, &pos, parse_expr)); spaces(&pos)) {
interp = WrapAST(interp, FunctionCall, .fn=WrapAST(interp, Var, .name="__cord"), .args=new(ast_list_t, .ast=interp));
chunks = new(ast_list_t, .ast=interp, .next=chunks);
chunk_start = pos;
}
if (close_interp) {
const char *closing = pos;
spaces(&closing);
if (*closing == close_interp) {
pos = closing;
}
}
} else if (*pos == '\r' || *pos == '\n') {
// Newline handling
match(&pos, "\r");
match(&pos, "\n");
if (match_indentation(&pos, string_indent)) {
if (chunk || chunks)
chunk = CORD_cat_char(chunk, '\n');
--pos;
continue;
}
if (!match_indentation(&pos, starting_indent))
parser_err(ctx, pos, strchrnul(pos, '\n'), "This isn't a valid indentation level for this unterminated string");
if (*pos == close) {
++pos;
goto finished;
}
if (!match_indentation(&pos, (indented - starting_indent)))
parser_err(ctx, pos, strchrnul(pos, '\n'), "I was expecting this to have %lu extra indentation beyond %lu",
(indented - starting_indent), starting_indent);
while (pos < eol+1) {
size_t len = strcspn(pos, special);
if (pos[len] == '\r') ++len;
if (pos[len] == '\n') ++len;
if (len > 0) {
ast_t *chunk = NewAST(ctx->file, pos, pos+len-1, StringLiteral, .str=heap_strn(pos, len));
chunks = new(ast_list_t, .ast=chunk, .next=chunks);
}
pos += len;
if (*pos == escape_char) {
const char *start = pos;
const char* unescaped = unescape(&pos);
ast_t *chunk = NewAST(ctx->file, start, pos, StringLiteral, .str=unescaped);
chunks = new(ast_list_t, .ast=chunk, .next=chunks);
++pos;
} else if (*pos == interp_char) {
ast_t *chunk = parse_interpolation(ctx, pos);
chunks = new(ast_list_t, .ast=chunk, .next=chunks);
pos = chunk->end;
if (sss_get_indent(ctx->file, pos) == starting_indent) {
if (*pos == close_quote) {
break;
} else if (some_of(&pos, ".") >= 2) {
// Multi-line split
--pos;
continue;
}
}
}
finished:;
// Strip trailing newline:
if (chunks) {
ast_t *last_chunk = chunks->ast;
if (last_chunk->tag == StringLiteral) {
auto str = Match(last_chunk, StringLiteral);
const char* trimmed = heap_strn(str->str, strlen(str->str)-1);
chunks->ast = NewAST(ctx->file, last_chunk->start, last_chunk->end-1, StringLiteral, .str=trimmed);
}
}
} else { // Inline string
char special[] = {'\n','\r',open,close,interp_char,escape_char,'\0'};
int depth = 1;
while (depth > 0 && *pos) {
size_t len = strcspn(pos, special);
if (len > 0) {
ast_t *chunk = NewAST(ctx->file, pos, pos+len-1, StringLiteral, .str=heap_strn(pos, len));
chunks = new(ast_list_t, .ast=chunk, .next=chunks);
pos += len;
}
if (*pos == interp_char) {
ast_t *chunk = parse_interpolation(ctx, pos);
chunks = new(ast_list_t, .ast=chunk, .next=chunks);
pos = chunk->end;
} else if (*pos == escape_char) {
const char *start = pos;
const char* unescaped = unescape(&pos);
ast_t *chunk = NewAST(ctx->file, start, pos, StringLiteral, .str=unescaped);
chunks = new(ast_list_t, .ast=chunk, .next=chunks);
} else if (*pos == '\r' || *pos == '\n') {
if (open == ' ' || open == ':' || open == '>') goto string_finished;
parser_err(ctx, string_start, pos, "This line ended without closing the string");
} else if (*pos == close) { // if open == close, then don't do nesting (i.e. check 'close' first)
--depth;
if (depth > 0) {
ast_t *chunk = NewAST(ctx->file, pos, pos+1, StringLiteral, .str=heap_strn(pos, 1));
chunks = new(ast_list_t, .ast=chunk, .next=chunks);
}
++pos;
} else if (*pos == open) {
++depth;
ast_t *chunk = NewAST(ctx->file, pos, pos+1, StringLiteral, .str=heap_strn(pos, 1));
chunks = new(ast_list_t, .ast=chunk, .next=chunks);
++pos;
} else {
ast_t *chunk = NewAST(ctx->file, pos, pos+1, StringLiteral, .str=heap_strn(pos, 1));
++pos;
chunks = new(ast_list_t, .ast=chunk, .next=chunks);
}
parser_err(ctx, pos, strchrnul(pos, '\n'), "This string line isn't correctly indented");
} else {
chunk = CORD_cat_char(chunk, *pos);
}
}
string_finished:;
if (chunk) {
ast_t *literal = NewAST(ctx->file, chunk_start, pos, StringLiteral, .str=CORD_to_const_char_star(chunk));
chunks = new(ast_list_t, .ast=literal, .next=chunks);
chunk = NULL;
}
REVERSE_LIST(chunks);
return NewAST(ctx->file, string_start, pos, StringJoin, .children=chunks);
expect_closing(ctx, &pos, (char[]){close_quote, 0}, "I was expecting a '%c' to finish this string", close_quote);
return NewAST(ctx->file, start, pos, StringJoin, .children=chunks);
}
PARSER(parse_skip) {