aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
author Bruce Hill <bruce@bruce-hill.com> 2025-08-25 00:30:08 -0400
committer Bruce Hill <bruce@bruce-hill.com> 2025-08-25 00:30:08 -0400
commit b378f7359a6b8bd2e47bafdba496d115825adcd7 (patch)
tree 52deff3a66e2265b84bd20a7e81b20111d29f7dd /src
parent 80755477735baaea66f865c316aff036bebd8e2f (diff)
Split out some parser functionality.
Diffstat (limited to 'src')
-rw-r--r-- src/parse/errors.h 86
-rw-r--r-- src/parse/parse.c 344
-rw-r--r-- src/parse/parse.h 2
-rw-r--r-- src/parse/utils.c 272
-rw-r--r-- src/parse/utils.h 24
-rw-r--r-- src/typecheck.c 1
6 files changed, 389 insertions, 340 deletions
diff --git a/src/parse/errors.h b/src/parse/errors.h
new file mode 100644
index 00000000..479a785f
--- /dev/null
+++ b/src/parse/errors.h
@@ -0,0 +1,86 @@
+
+#include <ctype.h> // IWYU pragma: export
+#include <stdio.h> // IWYU pragma: export
+#include <stdlib.h> // IWYU pragma: export
+#include <string.h> // IWYU pragma: export
+
+#include "../stdlib/files.h" // IWYU pragma: export
+#include "../stdlib/print.h" // IWYU pragma: export
+#include "../stdlib/stacktrace.h" // IWYU pragma: export
+#include "../stdlib/stdlib.h" // IWYU pragma: export
+#include "utils.h" // IWYU pragma: export
+
+//
+// Print a parse error and exit (or use the on_err longjmp)
+//
+// Writes to stderr, in order:
+//   1. "<relative filename>:<line>.<column>: " followed by the message arguments in
+//      __VA_ARGS__ (formatted by fprint_inline), wrapped in color escape codes when
+//      USE_COLOR is true,
+//   2. the source excerpt for the span start..end, via highlight_error(),
+//   3. a stack trace, but only when the TOMO_STACKTRACE environment variable is set.
+// Control never continues past the macro: if (ctx)->on_err is non-NULL it longjmp()s
+// there with the value 1; otherwise it raises SIGABRT and, should that return, calls
+// exit(1).
+//
+// `ctx` and `start` are expanded more than once, so pass expressions without side
+// effects. The body is a GNU statement expression.
+//
+#define parser_err(ctx, start, end, ...) \
+ ({ \
+ if (USE_COLOR) fputs("\x1b[31;1;7m", stderr); \
+ fprint_inline(stderr, (ctx)->file->relative_filename, ":", get_line_number((ctx)->file, (start)), ".", \
+ get_line_column((ctx)->file, (start)), ": ", __VA_ARGS__); \
+ if (USE_COLOR) fputs(" \x1b[m", stderr); \
+ fputs("\n\n", stderr); \
+ highlight_error((ctx)->file, (start), (end), "\x1b[31;1;7m", 2, USE_COLOR); \
+ fputs("\n", stderr); \
+ if (getenv("TOMO_STACKTRACE")) print_stacktrace(stderr, 1); \
+ if ((ctx)->on_err) longjmp(*((ctx)->on_err), 1); \
+ raise(SIGABRT); \
+ exit(1); \
+ })
+
+//
+// Expect a string (potentially after whitespace) and emit a parser error if it's not there
+//
+// Skips spaces/tabs at *pos, then requires the text there to start with `target` and
+// advances *pos past it. When `target` ends in a letter, digit or underscore (i.e. it is
+// a word such as a keyword), the character that follows must not be an XID_Continue
+// character, so a keyword is not accepted as the prefix of a longer identifier. On
+// failure the message in __VA_ARGS__ is reported with parser_err() for the span
+// start..*pos, which does not return.
+//
+// `pos` and `target` are evaluated exactly once. The last character of `target` is
+// converted to unsigned char before it is classified, because the <ctype.h> functions
+// have undefined behavior for negative values other than EOF.
+//
+#define expect_str(ctx, start, pos, target, ...) \
+    ({ \
+        const char **_es_pos = (pos); \
+        const char *_es_target = (target); \
+        spaces(_es_pos); \
+        if (!match(_es_pos, _es_target)) { \
+            if (USE_COLOR) fputs("\x1b[31;1;7m", stderr); \
+            parser_err(ctx, start, *_es_pos, __VA_ARGS__); \
+        } \
+        unsigned char _lastchar = (unsigned char)_es_target[strlen(_es_target) - 1]; \
+        if (isalnum(_lastchar) || _lastchar == '_') { \
+            if (is_xid_continue_next(*_es_pos)) { \
+                if (USE_COLOR) fputs("\x1b[31;1;7m", stderr); \
+                parser_err(ctx, start, *_es_pos, __VA_ARGS__); \
+            } \
+        } \
+    })
+
+//
+// Helper for matching closing parens with good error messages
+//
+// Skips spaces/tabs at *pos and requires `close_str` to follow, advancing *pos past it.
+// If it is missing, the message in __VA_ARGS__ is reported with parser_err(), which does
+// not return. The highlighted span runs from the original *pos to whichever comes first:
+// the next newline or the next occurrence of `close_str`. strchr()/strstr() return NULL
+// when nothing is found, so the two results are only compared when both exist; if
+// neither exists the span ends at the end of the text.
+//
+#define expect_closing(ctx, pos, close_str, ...) \
+    ({ \
+        const char *_start = *(pos); \
+        spaces(pos); \
+        if (!match(pos, (close_str))) { \
+            const char *_eol = strchr(*(pos), '\n'); \
+            const char *_next = strstr(*(pos), (close_str)); \
+            const char *_end; \
+            if (_eol && _next) _end = _eol < _next ? _eol : _next; \
+            else if (_eol) _end = _eol; \
+            else if (_next) _end = _next; \
+            else _end = *(pos) + strlen(*(pos)); \
+            if (USE_COLOR) fputs("\x1b[31;1;7m", stderr); \
+            parser_err(ctx, _start, _end, __VA_ARGS__); \
+        } \
+    })
+
+//
+// Run a required sub-parser and emit a parser error if it does not match
+//
+// Skips spaces/tabs at *pos, then calls parser(ctx, *pos). A NULL result is reported with
+// parser_err() for the span start..*pos using the message in __VA_ARGS__ (this does not
+// return). Otherwise *pos is advanced to the result's `end` field and the macro
+// evaluates to the result. The __typeof() operand only supplies the result type; the
+// parser is called once.
+//
+// The macro declares a local named `_pos`, so the `pos` argument must not itself be an
+// expression that mentions a variable called `_pos`.
+//
+#define expect(ctx, start, pos, parser, ...) \
+ ({ \
+ const char **_pos = pos; \
+ spaces(_pos); \
+ __typeof(parser(ctx, *_pos)) _result = parser(ctx, *_pos); \
+ if (!_result) { \
+ if (USE_COLOR) fputs("\x1b[31;1;7m", stderr); \
+ parser_err(ctx, start, *_pos, __VA_ARGS__); \
+ } \
+ *_pos = _result->end; \
+ _result; \
+ })
+
+//
+// Run an optional sub-parser
+//
+// Skips spaces/tabs at *pos, then calls parser(ctx, *pos). If the result is non-NULL,
+// *pos is advanced to the result's `end` field. The macro evaluates to the result, which
+// may be NULL; no error is reported. Note that the spaces/tabs stay skipped even when
+// the parser does not match.
+//
+#define optional(ctx, pos, parser) \
+ ({ \
+ const char **_pos = pos; \
+ spaces(_pos); \
+ __typeof(parser(ctx, *_pos)) _result = parser(ctx, *_pos); \
+ if (_result) *_pos = _result->end; \
+ _result; \
+ })
diff --git a/src/parse/parse.c b/src/parse/parse.c
index 40e2a766..ba88843d 100644
--- a/src/parse/parse.c
+++ b/src/parse/parse.c
@@ -2,23 +2,24 @@
#include <ctype.h>
#include <gc.h>
#include <setjmp.h>
+#include <signal.h>
#include <stdarg.h>
#include <stdbool.h>
#include <string.h>
-#include <signal.h>
+#include "../unistr-fixed.h"
#include <unictype.h>
#include <uniname.h>
#include "../ast.h"
#include "../stdlib/print.h"
-#include "../stdlib/stacktrace.h"
#include "../stdlib/stdlib.h"
#include "../stdlib/tables.h"
#include "../stdlib/text.h"
#include "../stdlib/util.h"
-#include "../unistr-fixed.h"
+#include "errors.h"
#include "parse.h"
+#include "utils.h"
// The cache of {filename -> parsed AST} will hold at most this many entries:
#ifndef PARSE_CACHE_SIZE
@@ -28,8 +29,6 @@
static const double RADIANS_PER_DEGREE = 0.0174532925199432957692369076848861271344287188854172545609719144;
static const char closing[128] = {['('] = ')', ['['] = ']', ['<'] = '>', ['{'] = '}'};
-#define SPACES_PER_INDENT 4
-
int op_tightness[] = {
[Power] = 9,
[Multiply] = 8,
@@ -57,25 +56,6 @@ int op_tightness[] = {
[Xor] = 1,
};
-static const char *keywords[] = {
- "C_code", "_max_", "_min_", "and", "assert", "break", "continue", "defer", "deserialize", "do",
- "else", "enum", "extend", "extern", "for", "func", "if", "in", "lang", "mod",
- "mod1", "no", "none", "not", "or", "pass", "return", "skip", "skip", "stop",
- "struct", "then", "unless", "use", "when", "while", "xor", "yes",
-};
-
-enum { NORMAL_FUNCTION = 0, EXTERN_FUNCTION = 1 };
-
-static INLINE size_t some_of(const char **pos, const char *allow);
-static INLINE size_t some_not(const char **pos, const char *forbid);
-static INLINE size_t spaces(const char **pos);
-static INLINE void whitespace(const char **pos);
-static INLINE size_t match(const char **pos, const char *target);
-static INLINE size_t match_word(const char **pos, const char *word);
-static INLINE const char *get_word(const char **pos);
-static INLINE const char *get_id(const char **pos);
-static INLINE bool comment(const char **pos);
-static INLINE bool indent(parse_ctx_t *ctx, const char **pos);
static INLINE ast_e match_binary_operator(const char **pos);
static ast_t *parse_comprehension_suffix(parse_ctx_t *ctx, ast_t *expr);
static ast_t *parse_field_suffix(parse_ctx_t *ctx, ast_t *lhs);
@@ -148,307 +128,6 @@ static ast_t *parse_deserialize(parse_ctx_t *ctx, const char *pos);
static ast_list_t *_parse_text_helper(parse_ctx_t *ctx, const char **out_pos, char open_quote, char close_quote,
char open_interp, bool allow_escapes);
-//
-// Print a parse error and exit (or use the on_err longjmp)
-//
-#define parser_err(ctx, start, end, ...) \
- ({ \
- if (USE_COLOR) fputs("\x1b[31;1;7m", stderr); \
- fprint_inline(stderr, (ctx)->file->relative_filename, ":", get_line_number((ctx)->file, (start)), ".", \
- get_line_column((ctx)->file, (start)), ": ", __VA_ARGS__); \
- if (USE_COLOR) fputs(" \x1b[m", stderr); \
- fputs("\n\n", stderr); \
- highlight_error((ctx)->file, (start), (end), "\x1b[31;1;7m", 2, USE_COLOR); \
- fputs("\n", stderr); \
- if (getenv("TOMO_STACKTRACE")) print_stacktrace(stderr, 1); \
- if ((ctx)->on_err) longjmp(*((ctx)->on_err), 1); \
- raise(SIGABRT); \
- exit(1); \
- })
-
-//
-// Expect a string (potentially after whitespace) and emit a parser error if it's not there
-//
-#define expect_str(ctx, start, pos, target, ...) \
- ({ \
- spaces(pos); \
- if (!match(pos, target)) { \
- if (USE_COLOR) fputs("\x1b[31;1;7m", stderr); \
- parser_err(ctx, start, *pos, __VA_ARGS__); \
- } \
- char _lastchar = target[strlen(target) - 1]; \
- if (isalpha(_lastchar) || isdigit(_lastchar) || _lastchar == '_') { \
- if (is_xid_continue_next(*pos)) { \
- if (USE_COLOR) fputs("\x1b[31;1;7m", stderr); \
- parser_err(ctx, start, *pos, __VA_ARGS__); \
- } \
- } \
- })
-
-//
-// Helper for matching closing parens with good error messages
-//
-#define expect_closing(ctx, pos, close_str, ...) \
- ({ \
- const char *_start = *pos; \
- spaces(pos); \
- if (!match(pos, (close_str))) { \
- const char *_eol = strchr(*pos, '\n'); \
- const char *_next = strstr(*pos, (close_str)); \
- const char *_end = _eol < _next ? _eol : _next; \
- if (USE_COLOR) fputs("\x1b[31;1;7m", stderr); \
- parser_err(ctx, _start, _end, __VA_ARGS__); \
- } \
- })
-
-#define expect(ctx, start, pos, parser, ...) \
- ({ \
- const char **_pos = pos; \
- spaces(_pos); \
- __typeof(parser(ctx, *_pos)) _result = parser(ctx, *_pos); \
- if (!_result) { \
- if (USE_COLOR) fputs("\x1b[31;1;7m", stderr); \
- parser_err(ctx, start, *_pos, __VA_ARGS__); \
- } \
- *_pos = _result->end; \
- _result; \
- })
-
-#define optional(ctx, pos, parser) \
- ({ \
- const char **_pos = pos; \
- spaces(_pos); \
- __typeof(parser(ctx, *_pos)) _result = parser(ctx, *_pos); \
- if (_result) *_pos = _result->end; \
- _result; \
- })
-
-//
-// Convert an escape sequence like \n to a string
-//
-#ifdef __GNUC__
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wstack-protector"
-#endif
-static const char *unescape(parse_ctx_t *ctx, const char **out) {
- const char **endpos = out;
- const char *escape = *out;
- static const char *unescapes[256] = {['a'] = "\a", ['b'] = "\b", ['e'] = "\x1b", ['f'] = "\f", ['n'] = "\n",
- ['r'] = "\r", ['t'] = "\t", ['v'] = "\v", ['_'] = " "};
- assert(*escape == '\\');
- if (unescapes[(int)escape[1]]) {
- *endpos = escape + 2;
- return GC_strdup(unescapes[(int)escape[1]]);
- } else if (escape[1] == '[') {
- // ANSI Control Sequence Indicator: \033 [ ... m
- size_t len = strcspn(&escape[2], "\r\n]");
- if (escape[2 + len] != ']') parser_err(ctx, escape, escape + 2 + len, "Missing closing ']'");
- *endpos = escape + 3 + len;
- return String("\033[", string_slice(&escape[2], len), "m");
- } else if (escape[1] == '{') {
- // Unicode codepoints by name
- size_t len = strcspn(&escape[2], "\r\n}");
- if (escape[2 + len] != '}') parser_err(ctx, escape, escape + 2 + len, "Missing closing '}'");
- char name[len + 1];
- memcpy(name, &escape[2], len);
- name[len] = '\0';
-
- if (name[0] == 'U') {
- for (char *p = &name[1]; *p; p++) {
- if (!isxdigit(*p)) goto look_up_unicode_name;
- }
- // Unicode codepoints by hex
- char *endptr = NULL;
- long codepoint = strtol(name + 1, &endptr, 16);
- uint32_t ustr[2] = {codepoint, 0};
- size_t bufsize = 8;
- uint8_t buf[bufsize];
- (void)u32_to_u8(ustr, bufsize, buf, &bufsize);
- *endpos = escape + 3 + len;
- return GC_strndup((char *)buf, bufsize);
- }
-
- look_up_unicode_name:;
-
- uint32_t codepoint = unicode_name_character(name);
- if (codepoint == UNINAME_INVALID)
- parser_err(ctx, escape, escape + 3 + len, "Invalid unicode codepoint name: ", quoted(name));
- *endpos = escape + 3 + len;
- char *str = GC_MALLOC_ATOMIC(16);
- size_t u8_len = 16;
- (void)u32_to_u8(&codepoint, 1, (uint8_t *)str, &u8_len);
- str[u8_len] = '\0';
- return str;
- } else if (escape[1] == 'x' && escape[2] && escape[3]) {
- // ASCII 2-digit hex
- char buf[] = {escape[2], escape[3], 0};
- char c = (char)strtol(buf, NULL, 16);
- *endpos = escape + 4;
- return GC_strndup(&c, 1);
- } else if ('0' <= escape[1] && escape[1] <= '7' && '0' <= escape[2] && escape[2] <= '7' && '0' <= escape[3]
- && escape[3] <= '7') {
- char buf[] = {escape[1], escape[2], escape[3], 0};
- char c = (char)strtol(buf, NULL, 8);
- *endpos = escape + 4;
- return GC_strndup(&c, 1);
- } else {
- *endpos = escape + 2;
- return GC_strndup(escape + 1, 1);
- }
-}
-#ifdef __GNUC__
-#pragma GCC diagnostic pop
-#endif
-
-// Indent is in number of spaces (assuming that \t is 4 spaces)
-PUREFUNC static INLINE int64_t get_indent(parse_ctx_t *ctx, const char *pos) {
- int64_t line_num = get_line_number(ctx->file, pos);
- const char *line = get_line(ctx->file, line_num);
- if (line == NULL) {
- return 0;
- } else if (*line == ' ') {
- int64_t spaces = (int64_t)strspn(line, " ");
- if (line[spaces] == '\t')
- parser_err(ctx, line + spaces, line + spaces + 1,
- "This is a tab following spaces, and you can't mix tabs and spaces");
- return spaces;
- } else if (*line == '\t') {
- int64_t indent = (int64_t)strspn(line, "\t");
- if (line[indent] == ' ')
- parser_err(ctx, line + indent, line + indent + 1,
- "This is a space following tabs, and you can't mix tabs and spaces");
- return indent * SPACES_PER_INDENT;
- } else {
- return 0;
- }
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////
-///////////////////////////// Text-based parsing primitives ///////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////////////////////////////
-size_t some_of(const char **pos, const char *allow) {
- size_t len = strspn(*pos, allow);
- *pos += len;
- return len;
-}
-
-size_t some_not(const char **pos, const char *forbid) {
- size_t len = strcspn(*pos, forbid);
- *pos += len;
- return len;
-}
-
-size_t spaces(const char **pos) { return some_of(pos, " \t"); }
-
-void whitespace(const char **pos) {
- while (some_of(pos, " \t\r\n") || comment(pos))
- continue;
-}
-
-size_t match(const char **pos, const char *target) {
- size_t len = strlen(target);
- if (strncmp(*pos, target, len) != 0) return 0;
- *pos += len;
- return len;
-}
-
-static INLINE bool is_xid_continue_next(const char *pos) {
- ucs4_t point = 0;
- u8_next(&point, (const uint8_t *)pos);
- return uc_is_property_xid_continue(point);
-}
-
-size_t match_word(const char **out, const char *word) {
- const char *pos = *out;
- spaces(&pos);
- if (!match(&pos, word) || is_xid_continue_next(pos)) return 0;
-
- *out = pos;
- return strlen(word);
-}
-
-const char *get_word(const char **inout) {
- const char *word = *inout;
- spaces(&word);
- const uint8_t *pos = (const uint8_t *)word;
- ucs4_t point;
- pos = u8_next(&point, pos);
- if (!uc_is_property_xid_start(point) && point != '_') return NULL;
-
- for (const uint8_t *next; (next = u8_next(&point, pos)); pos = next) {
- if (!uc_is_property_xid_continue(point)) break;
- }
- *inout = (const char *)pos;
- return GC_strndup(word, (size_t)((const char *)pos - word));
-}
-
-static CONSTFUNC bool is_keyword(const char *word) {
- int64_t lo = 0, hi = sizeof(keywords) / sizeof(keywords[0]) - 1;
- while (lo <= hi) {
- int64_t mid = (lo + hi) / 2;
- int32_t cmp = strcmp(word, keywords[mid]);
- if (cmp == 0) return true;
- else if (cmp > 0) lo = mid + 1;
- else if (cmp < 0) hi = mid - 1;
- }
- return false;
-}
-
-const char *get_id(const char **inout) {
- const char *pos = *inout;
- const char *word = get_word(&pos);
- if (!word || is_keyword(word)) return NULL;
- *inout = pos;
- return word;
-}
-
-static const char *eol(const char *str) { return str + strcspn(str, "\r\n"); }
-
-bool comment(const char **pos) {
- if ((*pos)[0] == '#') {
- *pos += strcspn(*pos, "\r\n");
- return true;
- } else {
- return false;
- }
-}
-
-bool indent(parse_ctx_t *ctx, const char **out) {
- const char *pos = *out;
- int64_t starting_indent = get_indent(ctx, pos);
- whitespace(&pos);
- const char *next_line = get_line(ctx->file, get_line_number(ctx->file, pos));
- if (next_line <= *out) return false;
-
- if (get_indent(ctx, next_line) != starting_indent + SPACES_PER_INDENT) return false;
-
- *out = next_line + strspn(next_line, " \t");
- return true;
-}
-
-bool newline_with_indentation(const char **out, int64_t target) {
- const char *pos = *out;
- if (*pos == '\r') ++pos;
- if (*pos != '\n') return false;
- ++pos;
- if (*pos == '\r' || *pos == '\n' || *pos == '\0') {
- // Empty line
- *out = pos;
- return true;
- }
-
- if (*pos == ' ') {
- if ((int64_t)strspn(pos, " ") >= target) {
- *out = pos + target;
- return true;
- }
- } else if ((int64_t)strspn(pos, "\t") * SPACES_PER_INDENT >= target) {
- *out = pos + target / SPACES_PER_INDENT;
- return true;
- }
- return false;
-}
-
///////////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////// AST-based parsers /////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -661,21 +340,6 @@ ast_t *parse_num(parse_ctx_t *ctx, const char *pos) {
return NewAST(ctx->file, start, pos, Num, .n = d);
}
-static INLINE bool match_separator(const char **pos) { // Either comma or newline
- const char *p = *pos;
- int separators = 0;
- for (;;) {
- if (some_of(&p, "\r\n,")) ++separators;
- else if (!comment(&p) && !some_of(&p, " \t")) break;
- }
- if (separators > 0) {
- *pos = p;
- return true;
- } else {
- return false;
- }
-}
-
ast_t *parse_list(parse_ctx_t *ctx, const char *pos) {
const char *start = pos;
if (!match(&pos, "[")) return NULL;
diff --git a/src/parse/parse.h b/src/parse/parse.h
index 2d81170c..24e173a5 100644
--- a/src/parse/parse.h
+++ b/src/parse/parse.h
@@ -8,6 +8,8 @@
#include "../ast.h"
#include "../stdlib/files.h"
+// Width of one indentation level, in columns. get_indent() counts each leading tab as
+// this many columns, and indent() requires a nested line to be exactly this much deeper.
+#define SPACES_PER_INDENT 4
+
typedef struct {
file_t *file;
jmp_buf *on_err;
diff --git a/src/parse/utils.c b/src/parse/utils.c
new file mode 100644
index 00000000..d745dec8
--- /dev/null
+++ b/src/parse/utils.c
@@ -0,0 +1,272 @@
+// Some common parsing utilities
+
+#include <stdint.h>
+
+#include "../unistr-fixed.h"
+#include <unictype.h>
+#include <uniname.h>
+
+#include "../stdlib/util.h"
+#include "errors.h"
+#include "parse.h"
+#include "utils.h"
+
+// Reserved words that may not be used as identifiers (see get_id()).
+// is_keyword() binary-searches this table with strcmp(), so the entries must stay in
+// ascending strcmp() order (plain byte order: uppercase < '_' < lowercase) and should be
+// unique.
+static const char *keywords[] = {
+    "C_code", "_max_", "_min_", "and",  "assert", "break", "continue", "defer", "deserialize", "do",
+    "else",   "enum",  "extend", "extern", "for",  "func",  "if",       "in",    "lang",        "mod",
+    "mod1",   "no",    "none",  "not",  "or",     "pass",  "return",   "skip",  "stop",        "struct",
+    "then",   "unless", "use",  "when", "while",  "xor",   "yes",
+};
+
+// Report whether `word` is exactly one of the entries of the sorted `keywords` table.
+// NOTE(review): CONSTFUNC is defined outside this file; if it expands to GCC's `const`
+// attribute, that attribute is not intended for functions that read memory through a
+// pointer argument (`pure` would be the fit) -- confirm.
+public
+CONSTFUNC bool is_keyword(const char *word) {
+    // Binary search over the half-open window [first, last)
+    int64_t first = 0;
+    int64_t last = (int64_t)(sizeof(keywords) / sizeof(keywords[0]));
+    while (first < last) {
+        int64_t middle = first + (last - first) / 2;
+        int order = strcmp(word, keywords[middle]);
+        if (order == 0) return true;
+        if (order > 0) first = middle + 1;
+        else last = middle;
+    }
+    return false;
+}
+
+// Advance *pos over the leading run of characters that appear in `allow`.
+// Returns the number of characters skipped (0 if the first character is not allowed).
+public
+size_t some_of(const char **pos, const char *allow) {
+    const size_t consumed = strspn(*pos, allow);
+    *pos = *pos + consumed;
+    return consumed;
+}
+
+// Advance *pos over the leading run of characters that do NOT appear in `forbid`
+// (stopping at the terminating NUL at the latest). Returns the number skipped.
+public
+size_t some_not(const char **pos, const char *forbid) {
+    const size_t consumed = strcspn(*pos, forbid);
+    *pos = *pos + consumed;
+    return consumed;
+}
+
+// Skip spaces and tabs (but not line breaks) at *pos. Returns how many were skipped.
+public
+size_t spaces(const char **pos) {
+    const size_t skipped = strspn(*pos, " \t");
+    *pos += skipped;
+    return skipped;
+}
+
+// Skip any mix of spaces, tabs, carriage returns, newlines and `#` comments at *pos,
+// stopping at the first character that is none of those.
+public
+void whitespace(const char **pos) {
+    for (;;) {
+        if (some_of(pos, " \t\r\n")) continue;
+        if (comment(pos)) continue;
+        break;
+    }
+}
+
+// If the text at *pos begins with `target`, advance *pos past it and return the length
+// of `target`. Otherwise leave *pos alone and return 0. (An empty `target` therefore
+// also yields 0.)
+public
+size_t match(const char **pos, const char *target) {
+    const size_t target_len = strlen(target);
+    if (strncmp(*pos, target, target_len) == 0) {
+        *pos += target_len;
+        return target_len;
+    }
+    return 0;
+}
+
+// Decode the UTF-8 code point at `pos` and report whether it has the Unicode
+// XID_Continue property (i.e. whether it could continue an identifier).
+public
+bool is_xid_continue_next(const char *pos) {
+    ucs4_t codepoint = 0;
+    (void)u8_next(&codepoint, (const uint8_t *)pos);
+    return uc_is_property_xid_continue(codepoint);
+}
+
+// Match `word` as a whole word: skip leading spaces/tabs, require `word` to follow, and
+// require that the next character cannot continue an identifier. On success *out is
+// moved past the word and the word's length is returned; otherwise *out is untouched
+// and 0 is returned.
+public
+size_t match_word(const char **out, const char *word) {
+    const char *cursor = *out;
+    spaces(&cursor);
+    const size_t matched = match(&cursor, word);
+    if (matched == 0) return 0;
+    if (is_xid_continue_next(cursor)) return 0;
+
+    *out = cursor;
+    return matched;
+}
+
+// Read one identifier-shaped word: after optional spaces/tabs, a first code point that
+// is XID_Start or '_', followed by any number of XID_Continue code points. On success
+// *inout is moved past the word and a GC_strndup() copy of the word is returned. If no
+// word starts here, NULL is returned and *inout is left unchanged.
+public
+const char *get_word(const char **inout) {
+    const char *begin = *inout;
+    spaces(&begin);
+
+    ucs4_t cp;
+    const uint8_t *cursor = u8_next(&cp, (const uint8_t *)begin);
+    const bool starts_word = uc_is_property_xid_start(cp) || cp == '_';
+    if (!starts_word) return NULL;
+
+    // Extend the word while the following code points can continue an identifier
+    for (;;) {
+        const uint8_t *after = u8_next(&cp, cursor);
+        if (after == NULL) break;
+        if (!uc_is_property_xid_continue(cp)) break;
+        cursor = after;
+    }
+
+    *inout = (const char *)cursor;
+    return GC_strndup(begin, (size_t)((const char *)cursor - begin));
+}
+
+// Read an identifier: like get_word(), except that keywords are rejected. Returns the
+// word and advances *inout on success; returns NULL and leaves *inout unchanged if
+// there is no word here or the word is a keyword.
+public
+const char *get_id(const char **inout) {
+    const char *cursor = *inout;
+    const char *candidate = get_word(&cursor);
+    if (candidate == NULL) return NULL;
+    if (is_keyword(candidate)) return NULL;
+
+    *inout = cursor;
+    return candidate;
+}
+
+// Return a pointer to the first '\r' or '\n' at or after `str`, or to the terminating
+// NUL if the rest of the text contains neither.
+public
+const char *eol(const char *str) {
+    const size_t line_len = strcspn(str, "\r\n");
+    return &str[line_len];
+}
+
+// If *pos is at a '#', skip the comment: advance *pos to the end of the line (stopping
+// before the '\r' or '\n', or at the terminating NUL) and return true. Otherwise
+// return false without moving *pos.
+public
+bool comment(const char **pos) {
+    if (**pos != '#') return false;
+
+    *pos += strcspn(*pos, "\r\n");
+    return true;
+}
+
+// Measure the indentation, in columns, of the line that contains `pos`. Leading spaces
+// count one column each and leading tabs count SPACES_PER_INDENT columns each. A line
+// that starts with spaces immediately followed by a tab (or tabs immediately followed
+// by a space) is reported through parser_err(). Returns 0 for an unindented line or
+// when the line cannot be looked up.
+// NOTE(review): PUREFUNC is defined outside this file; if it expands to GCC's `pure`
+// attribute, the parser_err() path (prints, then longjmp/abort) is an observable side
+// effect -- confirm the attribute is appropriate here.
+public
+PUREFUNC int64_t get_indent(parse_ctx_t *ctx, const char *pos) {
+    const char *line = get_line(ctx->file, get_line_number(ctx->file, pos));
+    if (line == NULL) return 0;
+
+    switch (*line) {
+    case ' ': {
+        int64_t num_spaces = (int64_t)strspn(line, " ");
+        if (line[num_spaces] == '\t')
+            parser_err(ctx, line + num_spaces, line + num_spaces + 1,
+                       "This is a tab following spaces, and you can't mix tabs and spaces");
+        return num_spaces;
+    }
+    case '\t': {
+        int64_t num_tabs = (int64_t)strspn(line, "\t");
+        if (line[num_tabs] == ' ')
+            parser_err(ctx, line + num_tabs, line + num_tabs + 1,
+                       "This is a space following tabs, and you can't mix tabs and spaces");
+        return num_tabs * SPACES_PER_INDENT;
+    }
+    default: return 0;
+    }
+}
+
+// Try to enter an indented block. Starting from *out, skip whitespace and comments and
+// look at the line where the next content sits. If that line begins after *out and its
+// indentation is exactly one level (SPACES_PER_INDENT columns) deeper than the line
+// containing *out, move *out to that line's first character after its leading
+// spaces/tabs and return true. Otherwise return false and leave *out unchanged.
+public
+bool indent(parse_ctx_t *ctx, const char **out) {
+    const char *scan = *out;
+    const int64_t required_indent = get_indent(ctx, scan) + SPACES_PER_INDENT;
+    whitespace(&scan);
+
+    const int64_t line_number = get_line_number(ctx->file, scan);
+    const char *next_line = get_line(ctx->file, line_number);
+    if (next_line <= *out) return false;
+    if (get_indent(ctx, next_line) != required_indent) return false;
+
+    *out = next_line + strspn(next_line, " \t");
+    return true;
+}
+
+// Consume a line break ("\n" or "\r\n") at *out together with `target` columns of
+// indentation on the following line. An empty following line (one that starts with
+// '\r', '\n' or the end of the text) is accepted with no indentation. For a
+// space-indented line, `target` spaces are consumed; otherwise each tab counts as
+// SPACES_PER_INDENT columns and target / SPACES_PER_INDENT characters are consumed.
+// Returns false, leaving *out unchanged, if there is no line break here or the next
+// line is indented less than `target`.
+public
+bool newline_with_indentation(const char **out, int64_t target) {
+    const char *cursor = *out;
+    if (*cursor == '\r') cursor++;
+    if (*cursor != '\n') return false;
+    cursor++;
+
+    switch (*cursor) {
+    case '\r':
+    case '\n':
+    case '\0':
+        // Empty line
+        *out = cursor;
+        return true;
+    case ' ':
+        if ((int64_t)strspn(cursor, " ") < target) return false;
+        *out = cursor + target;
+        return true;
+    default:
+        if ((int64_t)strspn(cursor, "\t") * SPACES_PER_INDENT < target) return false;
+        *out = cursor + target / SPACES_PER_INDENT;
+        return true;
+    }
+}
+
+//
+// Convert an escape sequence like \n to a string
+//
+// *out must point at the backslash. On return *out points just past the escape sequence
+// and the decoded text is returned as a garbage-collected string. Supported forms:
+//   \a \b \e \f \n \r \t \v   the usual control characters (\e is ESC)
+//   \_                        a space
+//   \[...]                    ANSI control sequence: ESC "[" ... "m"
+//   \{U<hex>}                 Unicode code point given in hexadecimal
+//   \{<name>}                 Unicode code point given by its character name
+//   \xHH                      a byte given as two hex digits
+//   \OOO                      a byte given as three octal digits
+//   \<anything else>          that character itself
+// A missing closing ']' or '}', an unknown character name, or a hex value that is not
+// an encodable code point is reported through parser_err().
+// NOTE(review): the \x form does not check that the two characters are hex digits;
+// non-hex input decodes to a NUL byte and therefore an empty string.
+//
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstack-protector"
+#endif
+public
+const char *unescape(parse_ctx_t *ctx, const char **out) {
+    const char **endpos = out;
+    const char *escape = *out;
+    static const char *unescapes[256] = {['a'] = "\a", ['b'] = "\b", ['e'] = "\x1b", ['f'] = "\f", ['n'] = "\n",
+                                         ['r'] = "\r", ['t'] = "\t", ['v'] = "\v", ['_'] = " "};
+    assert(*escape == '\\');
+    // Index the table with an unsigned value: where plain char is signed, a non-ASCII
+    // byte after the backslash would otherwise be a negative (out-of-bounds) index.
+    const unsigned char kind = (unsigned char)escape[1];
+    if (unescapes[kind]) {
+        *endpos = escape + 2;
+        return GC_strdup(unescapes[kind]);
+    } else if (escape[1] == '[') {
+        // ANSI Control Sequence Indicator: \033 [ ... m
+        size_t len = strcspn(&escape[2], "\r\n]");
+        if (escape[2 + len] != ']') parser_err(ctx, escape, escape + 2 + len, "Missing closing ']'");
+        *endpos = escape + 3 + len;
+        return String("\033[", string_slice(&escape[2], len), "m");
+    } else if (escape[1] == '{') {
+        // Unicode codepoints by name
+        size_t len = strcspn(&escape[2], "\r\n}");
+        if (escape[2 + len] != '}') parser_err(ctx, escape, escape + 2 + len, "Missing closing '}'");
+        char name[len + 1];
+        memcpy(name, &escape[2], len);
+        name[len] = '\0';
+
+        if (name[0] == 'U') {
+            for (char *p = &name[1]; *p; p++) {
+                // <ctype.h> functions require a value representable as unsigned char
+                if (!isxdigit((unsigned char)*p)) goto look_up_unicode_name;
+            }
+            // Unicode codepoints by hex
+            long hex_value = strtol(name + 1, NULL, 16);
+            uint32_t ustr[2] = {(uint32_t)hex_value, 0};
+            size_t bufsize = 8;
+            uint8_t buf[bufsize];
+            // Convert exactly one code point: the second argument is the number of
+            // units to read from `ustr`, not the capacity of `buf`. A failed
+            // conversion (surrogate or out-of-range value) leaves `buf` unset, so it
+            // must be reported instead of copied.
+            if (hex_value > 0x10FFFF || u32_to_u8(ustr, 1, buf, &bufsize) == NULL)
+                parser_err(ctx, escape, escape + 3 + len, "Invalid unicode codepoint: ", quoted(name));
+            *endpos = escape + 3 + len;
+            return GC_strndup((char *)buf, bufsize);
+        }
+
+    look_up_unicode_name:;
+
+        uint32_t codepoint = unicode_name_character(name);
+        if (codepoint == UNINAME_INVALID)
+            parser_err(ctx, escape, escape + 3 + len, "Invalid unicode codepoint name: ", quoted(name));
+        *endpos = escape + 3 + len;
+        char *str = GC_MALLOC_ATOMIC(16);
+        size_t u8_len = 16;
+        (void)u32_to_u8(&codepoint, 1, (uint8_t *)str, &u8_len);
+        str[u8_len] = '\0';
+        return str;
+    } else if (escape[1] == 'x' && escape[2] && escape[3]) {
+        // ASCII 2-digit hex
+        char buf[] = {escape[2], escape[3], 0};
+        char c = (char)strtol(buf, NULL, 16);
+        *endpos = escape + 4;
+        return GC_strndup(&c, 1);
+    } else if ('0' <= escape[1] && escape[1] <= '7' && '0' <= escape[2] && escape[2] <= '7' && '0' <= escape[3]
+               && escape[3] <= '7') {
+        // 3-digit octal
+        char buf[] = {escape[1], escape[2], escape[3], 0};
+        char c = (char)strtol(buf, NULL, 8);
+        *endpos = escape + 4;
+        return GC_strndup(&c, 1);
+    } else if (escape[1] == '\0') {
+        // Lone backslash at the very end of the text: consume only the backslash so
+        // the caller's position never moves past the terminating NUL.
+        *endpos = escape + 1;
+        return GC_strdup("");
+    } else {
+        // Any other character stands for itself
+        *endpos = escape + 2;
+        return GC_strndup(escape + 1, 1);
+    }
+}
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
+
+// Consume a separator between items: either comma or newline. Any run of commas and
+// line breaks counts, together with the spaces, tabs and comments mixed in with it.
+// Returns true and advances *pos if at least one comma or line break was consumed;
+// otherwise returns false and leaves *pos unchanged.
+public
+bool match_separator(const char **pos) {
+    const char *cursor = *pos;
+    bool found_separator = false;
+    for (;;) {
+        if (some_of(&cursor, "\r\n,")) {
+            found_separator = true;
+            continue;
+        }
+        if (comment(&cursor)) continue;
+        if (some_of(&cursor, " \t")) continue;
+        break;
+    }
+
+    if (!found_separator) return false;
+    *pos = cursor;
+    return true;
+}
diff --git a/src/parse/utils.h b/src/parse/utils.h
new file mode 100644
index 00000000..cec48f31
--- /dev/null
+++ b/src/parse/utils.h
@@ -0,0 +1,24 @@
+// Some common parsing utilities
+
+#include <stdbool.h>
+
+#include "../stdlib/util.h"
+#include "parse.h"
+
+// True if `word` is exactly one of the reserved keywords.
+CONSTFUNC bool is_keyword(const char *word);
+// Advance *pos past the leading run of characters found in `allow`; returns the count skipped.
+size_t some_of(const char **pos, const char *allow);
+// Advance *pos past the leading run of characters NOT found in `forbid`; returns the count skipped.
+size_t some_not(const char **pos, const char *forbid);
+// Skip spaces and tabs at *pos; returns the count skipped.
+size_t spaces(const char **pos);
+// Skip spaces, tabs, carriage returns, newlines and `#` comments at *pos.
+void whitespace(const char **pos);
+// If the text at *pos starts with `target`, advance past it and return its length; otherwise return 0.
+size_t match(const char **pos, const char *target);
+// Like match(), but skips leading spaces/tabs and fails if an XID_Continue character follows the word.
+size_t match_word(const char **pos, const char *word);
+// Skip spaces/tabs and read a word (XID_Start or '_', then XID_Continue characters).
+// Returns a GC-allocated copy, or NULL (with *pos unchanged) if no word starts here.
+const char *get_word(const char **pos);
+// Like get_word(), but returns NULL (with *pos unchanged) when the word is a keyword.
+const char *get_id(const char **pos);
+// If *pos is at '#', advance to the end of that line (before the CR/LF) and return true.
+bool comment(const char **pos);
+// Try to move *pos onto a following line that is indented exactly SPACES_PER_INDENT
+// columns deeper than the line containing *pos; returns whether it succeeded.
+bool indent(parse_ctx_t *ctx, const char **pos);
+// Pointer to the first CR or LF at or after `str`, or to the terminating NUL if there is none.
+const char *eol(const char *str);
+// Indentation of the line containing `pos`, in columns (a tab counts as SPACES_PER_INDENT).
+// Reports a parser error when leading spaces are directly followed by a tab or vice versa.
+PUREFUNC int64_t get_indent(parse_ctx_t *ctx, const char *pos);
+// Decode the backslash escape sequence at *out, advance *out past it, and return the decoded text.
+const char *unescape(parse_ctx_t *ctx, const char **out);
+// True if the code point decoded at `pos` has the Unicode XID_Continue property.
+bool is_xid_continue_next(const char *pos);
+// Consume a line break ("\n" or "\r\n") at *out plus `target` columns of indentation on the
+// next line (an empty next line is accepted as-is); returns false if that is not possible.
+bool newline_with_indentation(const char **out, int64_t target);
+// Consume a run of commas and/or line breaks plus surrounding spaces, tabs and comments.
+// Returns false (with *pos unchanged) if no comma or line break was found.
+bool match_separator(const char **pos);
diff --git a/src/typecheck.c b/src/typecheck.c
index 09c5fc62..08966c31 100644
--- a/src/typecheck.c
+++ b/src/typecheck.c
@@ -13,6 +13,7 @@
#include "modules.h"
#include "naming.h"
#include "parse/parse.h"
+#include "parse/types.h"
#include "stdlib/paths.h"
#include "stdlib/tables.h"
#include "stdlib/text.h"