Split out some parser functionality.

author: Bruce Hill <bruce@bruce-hill.com> 2025-08-25 00:30:08 -0400
committer: Bruce Hill <bruce@bruce-hill.com> 2025-08-25 00:30:08 -0400
commit: b378f7359a6b8bd2e47bafdba496d115825adcd7 (patch)
tree: 52deff3a66e2265b84bd20a7e81b20111d29f7dd /src
parent: 80755477735baaea66f865c316aff036bebd8e2f (diff)
6 files changed, 389 insertions, 340 deletions
diff --git a/src/parse/errors.h b/src/parse/errors.h
new file mode 100644
index 00000000..479a785f
--- /dev/null
+++ b/src/parse/errors.h
@@ -0,0 +1,86 @@
+
+#include <ctype.h> // IWYU pragma: export
+#include <stdio.h> // IWYU pragma: export
+#include <stdlib.h> // IWYU pragma: export
+#include <string.h> // IWYU pragma: export
+
+#include "../stdlib/files.h" // IWYU pragma: export
+#include "../stdlib/print.h" // IWYU pragma: export
+#include "../stdlib/stacktrace.h" // IWYU pragma: export
+#include "../stdlib/stdlib.h" // IWYU pragma: export
+#include "utils.h" // IWYU pragma: export
+
+// Report a parse error for the source span (start, end): print "file:line.column: message" and the highlighted
+// span to stderr, plus a stack trace if $TOMO_STACKTRACE is set. Then longjmp to (ctx)->on_err if it is non-NULL;
+// otherwise raise SIGABRT and exit(1). Execution never continues past this macro.
+#define parser_err(ctx, start, end, ...)                                                                               \
+    ({                                                                                                                 \
+        if (USE_COLOR) fputs("\x1b[31;1;7m", stderr);                                                                  \
+        fprint_inline(stderr, (ctx)->file->relative_filename, ":", get_line_number((ctx)->file, (start)), ".",         \
+                      get_line_column((ctx)->file, (start)), ": ", __VA_ARGS__);                                       \
+        if (USE_COLOR) fputs(" \x1b[m", stderr);                                                                       \
+        fputs("\n\n", stderr);                                                                                         \
+        highlight_error((ctx)->file, (start), (end), "\x1b[31;1;7m", 2, USE_COLOR);                                    \
+        fputs("\n", stderr);                                                                                           \
+        if (getenv("TOMO_STACKTRACE")) print_stacktrace(stderr, 1);                                                    \
+        if ((ctx)->on_err) longjmp(*((ctx)->on_err), 1);                                                               \
+        raise(SIGABRT);                                                                                                \
+        exit(1);                                                                                                       \
+    })
+
+// Skip blanks, then require the literal string `target` at *pos; otherwise report a parse error via parser_err.
+// If `target` ends in a letter, digit or '_', it must also not be followed by an XID_Continue character, so that
+// a keyword does not match the front of a longer word. (The last char is cast to unsigned char for <ctype.h>.)
+#define expect_str(ctx, start, pos, target, ...)                                                                       \
+    ({                                                                                                                 \
+        spaces(pos);                                                                                                   \
+        if (!match(pos, target)) {                                                                                     \
+            if (USE_COLOR) fputs("\x1b[31;1;7m", stderr);                                                              \
+            parser_err(ctx, start, *pos, __VA_ARGS__);                                                                 \
+        }                                                                                                              \
+        char _lastchar = target[strlen(target) - 1];                                                                   \
+        if (isalnum((unsigned char)_lastchar) || _lastchar == '_') {                                                   \
+            if (is_xid_continue_next(*pos)) {                                                                          \
+                if (USE_COLOR) fputs("\x1b[31;1;7m", stderr);                                                          \
+                parser_err(ctx, start, *pos, __VA_ARGS__);                                                             \
+            }                                                                                                          \
+        }                                                                                                              \
+    })
+
+// Skip blanks, then require the closing delimiter `close_str` at *pos. On failure, report a parse error whose
+// highlighted span runs from the original *pos to the next `close_str` or the end of the current line, whichever
+// comes first (the end of the line is also used when `close_str` never occurs later in the text).
+#define expect_closing(ctx, pos, close_str, ...)                                                                       \
+    ({                                                                                                                 \
+        const char *_start = *pos;                                                                                     \
+        spaces(pos);                                                                                                   \
+        if (!match(pos, (close_str))) {                                                                                \
+            const char *_eol = *pos + strcspn(*pos, "\n");                                                             \
+            const char *_next = strstr(*pos, (close_str));                                                             \
+            const char *_end = (_next && _next < _eol) ? _next : _eol;                                                 \
+            if (USE_COLOR) fputs("\x1b[31;1;7m", stderr);                                                              \
+            parser_err(ctx, _start, _end, __VA_ARGS__);                                                                \
+        }                                                                                                              \
+    })
+
+#define expect(ctx, start, pos, parser, ...) /* Required parse: skip blanks, then run parser(ctx, *pos). */            \
+    ({ /* A NULL result is a parser_err (no return); otherwise set *pos = result->end and yield the result. */         \
+        const char **_pos = pos;                                                                                       \
+        spaces(_pos);                                                                                                  \
+        __typeof(parser(ctx, *_pos)) _result = parser(ctx, *_pos);                                                     \
+        if (!_result) {                                                                                                \
+            if (USE_COLOR) fputs("\x1b[31;1;7m", stderr);                                                              \
+            parser_err(ctx, start, *_pos, __VA_ARGS__);                                                                \
+        }                                                                                                              \
+        *_pos = _result->end;                                                                                          \
+        _result;                                                                                                       \
+    })
+
+#define optional(ctx, pos, parser) /* Optional parse: skip blanks, then run parser(ctx, *pos). */                      \
+    ({ /* On success set *pos = result->end; a NULL result is yielded as-is, without raising an error. */              \
+        const char **_pos = pos;                                                                                       \
+        spaces(_pos);                                                                                                  \
+        __typeof(parser(ctx, *_pos)) _result = parser(ctx, *_pos);                                                     \
+        if (_result) *_pos = _result->end;                                                                             \
+        _result;                                                                                                       \
+    })
diff --git a/src/parse/parse.c b/src/parse/parse.c
index 40e2a766..ba88843d 100644
--- a/src/parse/parse.c
+++ b/src/parse/parse.c
@@ -2,23 +2,24 @@
 #include <ctype.h>
 #include <gc.h>
 #include <setjmp.h>
+#include <signal.h>
 #include <stdarg.h>
 #include <stdbool.h>
 #include <string.h>
 
-#include <signal.h>
+#include "../unistr-fixed.h"
 #include <unictype.h>
 #include <uniname.h>
 
 #include "../ast.h"
 #include "../stdlib/print.h"
-#include "../stdlib/stacktrace.h"
 #include "../stdlib/stdlib.h"
 #include "../stdlib/tables.h"
 #include "../stdlib/text.h"
 #include "../stdlib/util.h"
-#include "../unistr-fixed.h"
+#include "errors.h"
 #include "parse.h"
+#include "utils.h"
 
 // The cache of {filename -> parsed AST} will hold at most this many entries:
 #ifndef PARSE_CACHE_SIZE
@@ -28,8 +29,6 @@
 static const double RADIANS_PER_DEGREE = 0.0174532925199432957692369076848861271344287188854172545609719144;
 static const char closing[128] = {['('] = ')', ['['] = ']', ['<'] = '>', ['{'] = '}'};
 
-#define SPACES_PER_INDENT 4
-
 int op_tightness[] = {
     [Power] = 9,
     [Multiply] = 8,
@@ -57,25 +56,6 @@ int op_tightness[] = {
     [Xor] = 1,
 };
 
-static const char *keywords[] = {
-    "C_code", "_max_", "_min_",  "and",    "assert", "break", "continue", "defer", "deserialize", "do",
-    "else",   "enum",  "extend", "extern", "for",    "func",  "if",       "in",    "lang",        "mod",
-    "mod1",   "no",    "none",   "not",    "or",     "pass",  "return",   "skip",  "skip",        "stop",
-    "struct", "then",  "unless", "use",    "when",   "while", "xor",      "yes",
-};
-
-enum { NORMAL_FUNCTION = 0, EXTERN_FUNCTION = 1 };
-
-static INLINE size_t some_of(const char **pos, const char *allow);
-static INLINE size_t some_not(const char **pos, const char *forbid);
-static INLINE size_t spaces(const char **pos);
-static INLINE void whitespace(const char **pos);
-static INLINE size_t match(const char **pos, const char *target);
-static INLINE size_t match_word(const char **pos, const char *word);
-static INLINE const char *get_word(const char **pos);
-static INLINE const char *get_id(const char **pos);
-static INLINE bool comment(const char **pos);
-static INLINE bool indent(parse_ctx_t *ctx, const char **pos);
 static INLINE ast_e match_binary_operator(const char **pos);
 static ast_t *parse_comprehension_suffix(parse_ctx_t *ctx, ast_t *expr);
 static ast_t *parse_field_suffix(parse_ctx_t *ctx, ast_t *lhs);
@@ -148,307 +128,6 @@ static ast_t *parse_deserialize(parse_ctx_t *ctx, const char *pos);
 static ast_list_t *_parse_text_helper(parse_ctx_t *ctx, const char **out_pos, char open_quote, char close_quote,
                                       char open_interp, bool allow_escapes);
 
-//
-// Print a parse error and exit (or use the on_err longjmp)
-//
-#define parser_err(ctx, start, end, ...)                                                                               \
-    ({                                                                                                                 \
-        if (USE_COLOR) fputs("\x1b[31;1;7m", stderr);                                                                  \
-        fprint_inline(stderr, (ctx)->file->relative_filename, ":", get_line_number((ctx)->file, (start)), ".",         \
-                      get_line_column((ctx)->file, (start)), ": ", __VA_ARGS__);                                       \
-        if (USE_COLOR) fputs(" \x1b[m", stderr);                                                                       \
-        fputs("\n\n", stderr);                                                                                         \
-        highlight_error((ctx)->file, (start), (end), "\x1b[31;1;7m", 2, USE_COLOR);                                    \
-        fputs("\n", stderr);                                                                                           \
-        if (getenv("TOMO_STACKTRACE")) print_stacktrace(stderr, 1);                                                    \
-        if ((ctx)->on_err) longjmp(*((ctx)->on_err), 1);                                                               \
-        raise(SIGABRT);                                                                                                \
-        exit(1);                                                                                                       \
-    })
-
-//
-// Expect a string (potentially after whitespace) and emit a parser error if it's not there
-//
-#define expect_str(ctx, start, pos, target, ...)                                                                       \
-    ({                                                                                                                 \
-        spaces(pos);                                                                                                   \
-        if (!match(pos, target)) {                                                                                     \
-            if (USE_COLOR) fputs("\x1b[31;1;7m", stderr);                                                              \
-            parser_err(ctx, start, *pos, __VA_ARGS__);                                                                 \
-        }                                                                                                              \
-        char _lastchar = target[strlen(target) - 1];                                                                   \
-        if (isalpha(_lastchar) || isdigit(_lastchar) || _lastchar == '_') {                                            \
-            if (is_xid_continue_next(*pos)) {                                                                          \
-                if (USE_COLOR) fputs("\x1b[31;1;7m", stderr);                                                          \
-                parser_err(ctx, start, *pos, __VA_ARGS__);                                                             \
-            }                                                                                                          \
-        }                                                                                                              \
-    })
-
-//
-// Helper for matching closing parens with good error messages
-//
-#define expect_closing(ctx, pos, close_str, ...)                                                                       \
-    ({                                                                                                                 \
-        const char *_start = *pos;                                                                                     \
-        spaces(pos);                                                                                                   \
-        if (!match(pos, (close_str))) {                                                                                \
-            const char *_eol = strchr(*pos, '\n');                                                                     \
-            const char *_next = strstr(*pos, (close_str));                                                             \
-            const char *_end = _eol < _next ? _eol : _next;                                                            \
-            if (USE_COLOR) fputs("\x1b[31;1;7m", stderr);                                                              \
-            parser_err(ctx, _start, _end, __VA_ARGS__);                                                                \
-        }                                                                                                              \
-    })
-
-#define expect(ctx, start, pos, parser, ...)                                                                           \
-    ({                                                                                                                 \
-        const char **_pos = pos;                                                                                       \
-        spaces(_pos);                                                                                                  \
-        __typeof(parser(ctx, *_pos)) _result = parser(ctx, *_pos);                                                     \
-        if (!_result) {                                                                                                \
-            if (USE_COLOR) fputs("\x1b[31;1;7m", stderr);                                                              \
-            parser_err(ctx, start, *_pos, __VA_ARGS__);                                                                \
-        }                                                                                                              \
-        *_pos = _result->end;                                                                                          \
-        _result;                                                                                                       \
-    })
-
-#define optional(ctx, pos, parser)                                                                                     \
-    ({                                                                                                                 \
-        const char **_pos = pos;                                                                                       \
-        spaces(_pos);                                                                                                  \
-        __typeof(parser(ctx, *_pos)) _result = parser(ctx, *_pos);                                                     \
-        if (_result) *_pos = _result->end;                                                                             \
-        _result;                                                                                                       \
-    })
-
-//
-// Convert an escape sequence like \n to a string
-//
-#ifdef __GNUC__
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wstack-protector"
-#endif
-static const char *unescape(parse_ctx_t *ctx, const char **out) {
-    const char **endpos = out;
-    const char *escape = *out;
-    static const char *unescapes[256] = {['a'] = "\a", ['b'] = "\b", ['e'] = "\x1b", ['f'] = "\f", ['n'] = "\n",
-                                         ['r'] = "\r", ['t'] = "\t", ['v'] = "\v",   ['_'] = " "};
-    assert(*escape == '\\');
-    if (unescapes[(int)escape[1]]) {
-        *endpos = escape + 2;
-        return GC_strdup(unescapes[(int)escape[1]]);
-    } else if (escape[1] == '[') {
-        // ANSI Control Sequence Indicator: \033 [ ... m
-        size_t len = strcspn(&escape[2], "\r\n]");
-        if (escape[2 + len] != ']') parser_err(ctx, escape, escape + 2 + len, "Missing closing ']'");
-        *endpos = escape + 3 + len;
-        return String("\033[", string_slice(&escape[2], len), "m");
-    } else if (escape[1] == '{') {
-        // Unicode codepoints by name
-        size_t len = strcspn(&escape[2], "\r\n}");
-        if (escape[2 + len] != '}') parser_err(ctx, escape, escape + 2 + len, "Missing closing '}'");
-        char name[len + 1];
-        memcpy(name, &escape[2], len);
-        name[len] = '\0';
-
-        if (name[0] == 'U') {
-            for (char *p = &name[1]; *p; p++) {
-                if (!isxdigit(*p)) goto look_up_unicode_name;
-            }
-            // Unicode codepoints by hex
-            char *endptr = NULL;
-            long codepoint = strtol(name + 1, &endptr, 16);
-            uint32_t ustr[2] = {codepoint, 0};
-            size_t bufsize = 8;
-            uint8_t buf[bufsize];
-            (void)u32_to_u8(ustr, bufsize, buf, &bufsize);
-            *endpos = escape + 3 + len;
-            return GC_strndup((char *)buf, bufsize);
-        }
-
-    look_up_unicode_name:;
-
-        uint32_t codepoint = unicode_name_character(name);
-        if (codepoint == UNINAME_INVALID)
-            parser_err(ctx, escape, escape + 3 + len, "Invalid unicode codepoint name: ", quoted(name));
-        *endpos = escape + 3 + len;
-        char *str = GC_MALLOC_ATOMIC(16);
-        size_t u8_len = 16;
-        (void)u32_to_u8(&codepoint, 1, (uint8_t *)str, &u8_len);
-        str[u8_len] = '\0';
-        return str;
-    } else if (escape[1] == 'x' && escape[2] && escape[3]) {
-        // ASCII 2-digit hex
-        char buf[] = {escape[2], escape[3], 0};
-        char c = (char)strtol(buf, NULL, 16);
-        *endpos = escape + 4;
-        return GC_strndup(&c, 1);
-    } else if ('0' <= escape[1] && escape[1] <= '7' && '0' <= escape[2] && escape[2] <= '7' && '0' <= escape[3]
-               && escape[3] <= '7') {
-        char buf[] = {escape[1], escape[2], escape[3], 0};
-        char c = (char)strtol(buf, NULL, 8);
-        *endpos = escape + 4;
-        return GC_strndup(&c, 1);
-    } else {
-        *endpos = escape + 2;
-        return GC_strndup(escape + 1, 1);
-    }
-}
-#ifdef __GNUC__
-#pragma GCC diagnostic pop
-#endif
-
-// Indent is in number of spaces (assuming that \t is 4 spaces)
-PUREFUNC static INLINE int64_t get_indent(parse_ctx_t *ctx, const char *pos) {
-    int64_t line_num = get_line_number(ctx->file, pos);
-    const char *line = get_line(ctx->file, line_num);
-    if (line == NULL) {
-        return 0;
-    } else if (*line == ' ') {
-        int64_t spaces = (int64_t)strspn(line, " ");
-        if (line[spaces] == '\t')
-            parser_err(ctx, line + spaces, line + spaces + 1,
-                       "This is a tab following spaces, and you can't mix tabs and spaces");
-        return spaces;
-    } else if (*line == '\t') {
-        int64_t indent = (int64_t)strspn(line, "\t");
-        if (line[indent] == ' ')
-            parser_err(ctx, line + indent, line + indent + 1,
-                       "This is a space following tabs, and you can't mix tabs and spaces");
-        return indent * SPACES_PER_INDENT;
-    } else {
-        return 0;
-    }
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////     Text-based parsing primitives     ///////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////////////////////////////
-size_t some_of(const char **pos, const char *allow) {
-    size_t len = strspn(*pos, allow);
-    *pos += len;
-    return len;
-}
-
-size_t some_not(const char **pos, const char *forbid) {
-    size_t len = strcspn(*pos, forbid);
-    *pos += len;
-    return len;
-}
-
-size_t spaces(const char **pos) { return some_of(pos, " \t"); }
-
-void whitespace(const char **pos) {
-    while (some_of(pos, " \t\r\n") || comment(pos))
-        continue;
-}
-
-size_t match(const char **pos, const char *target) {
-    size_t len = strlen(target);
-    if (strncmp(*pos, target, len) != 0) return 0;
-    *pos += len;
-    return len;
-}
-
-static INLINE bool is_xid_continue_next(const char *pos) {
-    ucs4_t point = 0;
-    u8_next(&point, (const uint8_t *)pos);
-    return uc_is_property_xid_continue(point);
-}
-
-size_t match_word(const char **out, const char *word) {
-    const char *pos = *out;
-    spaces(&pos);
-    if (!match(&pos, word) || is_xid_continue_next(pos)) return 0;
-
-    *out = pos;
-    return strlen(word);
-}
-
-const char *get_word(const char **inout) {
-    const char *word = *inout;
-    spaces(&word);
-    const uint8_t *pos = (const uint8_t *)word;
-    ucs4_t point;
-    pos = u8_next(&point, pos);
-    if (!uc_is_property_xid_start(point) && point != '_') return NULL;
-
-    for (const uint8_t *next; (next = u8_next(&point, pos)); pos = next) {
-        if (!uc_is_property_xid_continue(point)) break;
-    }
-    *inout = (const char *)pos;
-    return GC_strndup(word, (size_t)((const char *)pos - word));
-}
-
-static CONSTFUNC bool is_keyword(const char *word) {
-    int64_t lo = 0, hi = sizeof(keywords) / sizeof(keywords[0]) - 1;
-    while (lo <= hi) {
-        int64_t mid = (lo + hi) / 2;
-        int32_t cmp = strcmp(word, keywords[mid]);
-        if (cmp == 0) return true;
-        else if (cmp > 0) lo = mid + 1;
-        else if (cmp < 0) hi = mid - 1;
-    }
-    return false;
-}
-
-const char *get_id(const char **inout) {
-    const char *pos = *inout;
-    const char *word = get_word(&pos);
-    if (!word || is_keyword(word)) return NULL;
-    *inout = pos;
-    return word;
-}
-
-static const char *eol(const char *str) { return str + strcspn(str, "\r\n"); }
-
-bool comment(const char **pos) {
-    if ((*pos)[0] == '#') {
-        *pos += strcspn(*pos, "\r\n");
-        return true;
-    } else {
-        return false;
-    }
-}
-
-bool indent(parse_ctx_t *ctx, const char **out) {
-    const char *pos = *out;
-    int64_t starting_indent = get_indent(ctx, pos);
-    whitespace(&pos);
-    const char *next_line = get_line(ctx->file, get_line_number(ctx->file, pos));
-    if (next_line <= *out) return false;
-
-    if (get_indent(ctx, next_line) != starting_indent + SPACES_PER_INDENT) return false;
-
-    *out = next_line + strspn(next_line, " \t");
-    return true;
-}
-
-bool newline_with_indentation(const char **out, int64_t target) {
-    const char *pos = *out;
-    if (*pos == '\r') ++pos;
-    if (*pos != '\n') return false;
-    ++pos;
-    if (*pos == '\r' || *pos == '\n' || *pos == '\0') {
-        // Empty line
-        *out = pos;
-        return true;
-    }
-
-    if (*pos == ' ') {
-        if ((int64_t)strspn(pos, " ") >= target) {
-            *out = pos + target;
-            return true;
-        }
-    } else if ((int64_t)strspn(pos, "\t") * SPACES_PER_INDENT >= target) {
-        *out = pos + target / SPACES_PER_INDENT;
-        return true;
-    }
-    return false;
-}
-
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////     AST-based parsers    /////////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -661,21 +340,6 @@ ast_t *parse_num(parse_ctx_t *ctx, const char *pos) {
     return NewAST(ctx->file, start, pos, Num, .n = d);
 }
 
-static INLINE bool match_separator(const char **pos) { // Either comma or newline
-    const char *p = *pos;
-    int separators = 0;
-    for (;;) {
-        if (some_of(&p, "\r\n,")) ++separators;
-        else if (!comment(&p) && !some_of(&p, " \t")) break;
-    }
-    if (separators > 0) {
-        *pos = p;
-        return true;
-    } else {
-        return false;
-    }
-}
-
 ast_t *parse_list(parse_ctx_t *ctx, const char *pos) {
     const char *start = pos;
     if (!match(&pos, "[")) return NULL;
diff --git a/src/parse/parse.h b/src/parse/parse.h
index 2d81170c..24e173a5 100644
--- a/src/parse/parse.h
+++ b/src/parse/parse.h
@@ -8,6 +8,8 @@
 #include "../ast.h"
 #include "../stdlib/files.h"
 
+#define SPACES_PER_INDENT 4 // Columns per indentation level; a leading tab counts as this many spaces
+
 typedef struct {
     file_t *file;
     jmp_buf *on_err;
diff --git a/src/parse/utils.c b/src/parse/utils.c
new file mode 100644
index 00000000..d745dec8
--- /dev/null
+++ b/src/parse/utils.c
@@ -0,0 +1,272 @@
+// Some common parsing utilities
+
+#include <stdint.h>
+
+#include "../unistr-fixed.h"
+#include <unictype.h>
+#include <uniname.h>
+
+#include "../stdlib/util.h"
+#include "errors.h"
+#include "parse.h"
+#include "utils.h"
+
+static const char *keywords[] = { // Must stay in strcmp() order (is_keyword() binary-searches it); no duplicates
+    "C_code", "_max_", "_min_",  "and",    "assert", "break", "continue", "defer", "deserialize", "do",
+    "else",   "enum",  "extend", "extern", "for",    "func",  "if",       "in",    "lang",        "mod",
+    "mod1",   "no",    "none",   "not",    "or",     "pass",  "return",   "skip",  "stop",
+    "struct", "then",  "unless", "use",    "when",   "while", "xor",      "yes",
+};
+
+public
+CONSTFUNC bool is_keyword(const char *word) {
+    // Binary search of the sorted `keywords` table over the half-open index range [first, last)
+    for (size_t first = 0, last = sizeof(keywords) / sizeof(keywords[0]); first < last;) {
+        size_t middle = first + (last - first) / 2;
+        int order = strcmp(word, keywords[middle]);
+        if (order == 0) return true;
+        if (order > 0) first = middle + 1;
+        else last = middle;
+    }
+    return false;
+}
+
+public
+size_t some_of(const char **pos, const char *allow) { // Skip the leading run of characters that are in `allow`
+    const char *run_start = *pos;
+    *pos = run_start + strspn(run_start, allow);
+    return (size_t)(*pos - run_start); // number of bytes consumed (0 if nothing matched)
+}
+
+public
+size_t some_not(const char **pos, const char *forbid) { // Skip the leading run of characters NOT in `forbid`
+    const char *run_start = *pos;
+    *pos = run_start + strcspn(run_start, forbid);
+    return (size_t)(*pos - run_start); // number of bytes consumed (0 if the first char is forbidden)
+}
+
+public // Horizontal whitespace only: consumes ' ' and '\t', never line breaks; returns the bytes skipped
+size_t spaces(const char **pos) { const char *blanks = " \t"; return some_of(pos, blanks); }
+
+public
+void whitespace(const char **pos) { // Skip any mix of blanks, line breaks, and `#` comments
+    for (bool progressed = true; progressed;)
+        progressed = some_of(pos, " \t\r\n") > 0 || comment(pos);
+}
+
+public
+size_t match(const char **pos, const char *target) { // Consume `target` if the text at *pos starts with it
+    const size_t target_len = strlen(target);
+    const bool has_prefix = strncmp(*pos, target, target_len) == 0;
+    if (has_prefix) *pos += target_len;
+    return has_prefix ? target_len : 0; // 0 means "no match" (and is also the result for an empty target)
+}
+
+public
+bool is_xid_continue_next(const char *pos) { // Does the code point decoded at `pos` have the XID_Continue property?
+    ucs4_t codepoint = 0;
+    (void)u8_next(&codepoint, (const uint8_t *)pos); // only the decoded value is needed, not the new position
+    return uc_is_property_xid_continue(codepoint);
+}
+
+public
+size_t match_word(const char **out, const char *word) { // Match `word` as a whole word, after optional blanks
+    const char *cursor = *out;
+    (void)spaces(&cursor);
+    size_t matched = match(&cursor, word);
+    if (matched == 0 || is_xid_continue_next(cursor)) return 0; // absent, or only a prefix of a longer word
+    *out = cursor; // the caller's position moves only on success
+    return matched;
+}
+
+public
+const char *get_word(const char **inout) { // Read one word: (XID_Start | '_') followed by XID_Continue characters
+    const char *begin = *inout;
+    (void)spaces(&begin);
+    ucs4_t codepoint;
+    const uint8_t *cursor = u8_next(&codepoint, (const uint8_t *)begin);
+    const bool can_start_word = uc_is_property_xid_start(codepoint) || codepoint == '_';
+    if (!can_start_word) return NULL;
+    // Extend over continuation characters; `cursor` stops at the first character that is not part of the word
+    const uint8_t *after;
+    while ((after = u8_next(&codepoint, cursor)) != NULL && uc_is_property_xid_continue(codepoint))
+        cursor = after;
+    *inout = (const char *)cursor;
+    return GC_strndup(begin, (size_t)(*inout - begin));
+}
+
+public
+const char *get_id(const char **inout) { // Like get_word(), except that reserved keywords are rejected
+    const char *cursor = *inout;
+    const char *candidate = get_word(&cursor);
+    const bool usable = candidate != NULL && !is_keyword(candidate);
+    if (usable) *inout = cursor; // commit the new position only for a real identifier
+    return usable ? candidate : NULL;
+}
+
+public // Pointer to the first '\r' or '\n' in `str`, or to its terminating NUL if there is none
+const char *eol(const char *str) { return &str[strcspn(str, "\r\n")]; }
+
+public
+bool comment(const char **pos) {
+    // A comment starts with '#' and runs up to (not including) the next '\r' or '\n'
+    const char *text = *pos;
+    if (*text != '#') return false;
+    // Leave *pos on the line terminator (or the final NUL), so callers still see the line break
+    *pos = text + strcspn(text, "\r\n");
+    return true;
+}
+
+public
+PUREFUNC int64_t get_indent(parse_ctx_t *ctx, const char *pos) {
+    // Indentation of the line containing `pos`, in columns: each leading tab counts as SPACES_PER_INDENT
+    int64_t line_num = get_line_number(ctx->file, pos);
+    const char *line = get_line(ctx->file, line_num);
+    if (line == NULL) return 0;
+    if (*line == ' ') {
+        int64_t num_spaces = (int64_t)strspn(line, " ");
+        const char *offender = line + num_spaces;
+        if (*offender == '\t')
+            parser_err(ctx, offender, offender + 1, "This is a tab following spaces, and you can't mix tabs and spaces");
+        return num_spaces;
+    }
+    if (*line == '\t') {
+        int64_t num_tabs = (int64_t)strspn(line, "\t");
+        const char *offender = line + num_tabs;
+        if (*offender == ' ')
+            parser_err(ctx, offender, offender + 1, "This is a space following tabs, and you can't mix tabs and spaces");
+        return num_tabs * SPACES_PER_INDENT;
+    }
+    return 0; // the line does not start with a space or a tab
+}
+
+public
+bool indent(parse_ctx_t *ctx, const char **out) {
+    // Succeeds when the next non-blank, non-comment text sits on a later line indented exactly one level deeper
+    const char *cursor = *out;
+    const int64_t wanted_indent = get_indent(ctx, cursor) + SPACES_PER_INDENT;
+    whitespace(&cursor);
+    const char *target_line = get_line(ctx->file, get_line_number(ctx->file, cursor));
+    // That line must begin after the current position; only then is its indentation inspected
+    const bool is_deeper = target_line > *out && get_indent(ctx, target_line) == wanted_indent;
+    if (!is_deeper) return false;
+    *out = target_line + strspn(target_line, " \t"); // land just past the line's leading blanks
+    return true;
+}
+
+public
+bool newline_with_indentation(const char **out, int64_t target) {
+    // Consume one line break ("\n" or "\r\n") plus `target` columns of indentation on the following line
+    const char *cursor = *out;
+    if (*cursor == '\r') cursor += 1;
+    if (*cursor != '\n') return false;
+    cursor += 1;
+    switch (*cursor) {
+    case '\r':
+    case '\n':
+    case '\0': // Empty line: accepted regardless of `target`
+        *out = cursor;
+        return true;
+    case ' ': // Space-indented: needs at least `target` spaces, and exactly `target` of them are skipped
+        if ((int64_t)strspn(cursor, " ") < target) return false;
+        *out = cursor + target;
+        return true;
+    default: // Otherwise count leading tabs, each worth SPACES_PER_INDENT columns
+        if ((int64_t)strspn(cursor, "\t") * SPACES_PER_INDENT < target) return false;
+        *out = cursor + target / SPACES_PER_INDENT;
+        return true;
+    }
+}
+
+//
+// Convert an escape sequence like \n to a string
+//
+// `*out` must point at the backslash that starts the escape. On return it has
+// been advanced past the whole escape sequence. Supported forms:
+//   \a \b \e \f \n \r \t \v \_   single-character escapes (\_ is a space)
+//   \[...]                       ANSI sequence: ESC '[' ... 'm'
+//   \{NAME}                      Unicode codepoint looked up by name
+//   \{UHEX}                      Unicode codepoint given in hexadecimal
+//   \xHH                         one byte given as two hex digits
+//   \OOO                         one byte given as three octal digits
+//   \<anything else>             that byte, literally
+// Malformed \[...] and \{...} escapes are reported through parser_err().
+// The returned string comes from the GC_*/String() helpers used below.
+//
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstack-protector"
+#endif
+const char *unescape(parse_ctx_t *ctx, const char **out) {
+    const char **endpos = out;
+    const char *escape = *out;
+    static const char *unescapes[256] = {['a'] = "\a", ['b'] = "\b", ['e'] = "\x1b", ['f'] = "\f", ['n'] = "\n",
+                                         ['r'] = "\r", ['t'] = "\t", ['v'] = "\v",   ['_'] = " "};
+    assert(*escape == '\\');
+    if (escape[1] == '\0') {
+        // Backslash at the very end of the input: consume only the backslash so
+        // the caller's cursor stops on the NUL terminator instead of past it.
+        *endpos = escape + 1;
+        return GC_strdup("");
+    } else if (unescapes[(unsigned char)escape[1]]) {
+        // Index as an unsigned byte: plain `char` may be signed, and bytes >= 0x80
+        // would otherwise produce a negative (out-of-bounds) table index.
+        *endpos = escape + 2;
+        return GC_strdup(unescapes[(unsigned char)escape[1]]);
+    } else if (escape[1] == '[') {
+        // ANSI Control Sequence Indicator: \033 [ ... m
+        size_t len = strcspn(&escape[2], "\r\n]");
+        if (escape[2 + len] != ']') parser_err(ctx, escape, escape + 2 + len, "Missing closing ']'");
+        *endpos = escape + 3 + len;
+        return String("\033[", string_slice(&escape[2], len), "m");
+    } else if (escape[1] == '{') {
+        // Unicode codepoints by name
+        size_t len = strcspn(&escape[2], "\r\n}");
+        if (escape[2 + len] != '}') parser_err(ctx, escape, escape + 2 + len, "Missing closing '}'");
+        char name[len + 1];
+        memcpy(name, &escape[2], len);
+        name[len] = '\0';
+
+        // "U" followed by at least one character, all of them hex digits, is a
+        // numeric codepoint; anything else is treated as a character name.
+        if (name[0] == 'U' && name[1] != '\0') {
+            for (char *p = &name[1]; *p; p++) {
+                if (!isxdigit((unsigned char)*p)) goto look_up_unicode_name;
+            }
+            // Unicode codepoints by hex
+            long codepoint = strtol(name + 1, NULL, 16);
+            // strtol() saturates at LONG_MAX on overflow, which this range check also rejects
+            if (codepoint > 0x10FFFF)
+                parser_err(ctx, escape, escape + 3 + len, "Invalid unicode codepoint: ", quoted(name));
+            uint32_t ustr[1] = {(uint32_t)codepoint};
+            uint8_t buf[8];
+            size_t buf_len = sizeof(buf);
+            // Convert exactly one UTF-32 unit; a NULL result means the value
+            // could not be encoded (e.g. a surrogate), so `buf` holds nothing usable.
+            if (u32_to_u8(ustr, 1, buf, &buf_len) == NULL)
+                parser_err(ctx, escape, escape + 3 + len, "Invalid unicode codepoint: ", quoted(name));
+            *endpos = escape + 3 + len;
+            return GC_strndup((char *)buf, buf_len);
+        }
+
+    look_up_unicode_name:;
+
+        uint32_t codepoint = unicode_name_character(name);
+        if (codepoint == UNINAME_INVALID)
+            parser_err(ctx, escape, escape + 3 + len, "Invalid unicode codepoint name: ", quoted(name));
+        *endpos = escape + 3 + len;
+        char *str = GC_MALLOC_ATOMIC(16);
+        size_t u8_len = 16;
+        (void)u32_to_u8(&codepoint, 1, (uint8_t *)str, &u8_len);
+        str[u8_len] = '\0';
+        return str;
+    } else if (escape[1] == 'x' && isxdigit((unsigned char)escape[2]) && isxdigit((unsigned char)escape[3])) {
+        // ASCII 2-digit hex. Both digits are validated so that a short escape such
+        // as "\x4" cannot swallow the character that follows it (e.g. a closing
+        // quote); invalid forms fall through to the literal case below.
+        char buf[] = {escape[2], escape[3], 0};
+        char c = (char)strtol(buf, NULL, 16);
+        *endpos = escape + 4;
+        return GC_strndup(&c, 1);
+    } else if ('0' <= escape[1] && escape[1] <= '7' && '0' <= escape[2] && escape[2] <= '7' && '0' <= escape[3]
+               && escape[3] <= '7') {
+        // 3-digit octal
+        char buf[] = {escape[1], escape[2], escape[3], 0};
+        char c = (char)strtol(buf, NULL, 8);
+        *endpos = escape + 4;
+        return GC_strndup(&c, 1);
+    } else {
+        // Unknown escape: the escaped byte stands for itself
+        *endpos = escape + 2;
+        return GC_strndup(escape + 1, 1);
+    }
+}
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
+
+//
+// Match an item separator: either comma or newline.
+// Consumes any run of separator characters ('\r', '\n', ','), `#` comments,
+// spaces and tabs starting at `*pos`. If at least one separator character was
+// seen, `*pos` is advanced past the whole run and `true` is returned;
+// otherwise `*pos` is left unchanged and `false` is returned.
+//
+public
+bool match_separator(const char **pos) {
+    const char *cursor = *pos;
+    bool found_separator = false;
+    while (true) {
+        if (some_of(&cursor, "\r\n,")) {
+            found_separator = true;
+            continue;
+        }
+        if (comment(&cursor)) continue;
+        if (some_of(&cursor, " \t")) continue;
+        break;
+    }
+
+    if (!found_separator) return false;
+    *pos = cursor;
+    return true;
+}
diff --git a/src/parse/utils.h b/src/parse/utils.h
new file mode 100644
index 00000000..cec48f31
--- /dev/null
+++ b/src/parse/utils.h
@@ -0,0 +1,24 @@
+// Some common parsing utilities
+//
+// Unless noted otherwise, the matchers below take a cursor (`const char **`)
+// into the source text and advance it only when they match.
+#pragma once
+
+#include <stdbool.h>
+#include <stddef.h> // size_t
+#include <stdint.h> // int64_t
+
+#include "../stdlib/util.h"
+#include "parse.h"
+
+// NOTE(review): the definitions of the next eight functions are not documented
+// here; the names suggest keyword lookup and character/word matchers -- see
+// utils.c for their exact contracts.
+CONSTFUNC bool is_keyword(const char *word);
+size_t some_of(const char **pos, const char *allow);
+size_t some_not(const char **pos, const char *forbid);
+size_t spaces(const char **pos);
+void whitespace(const char **pos);
+size_t match(const char **pos, const char *target);
+size_t match_word(const char **pos, const char *word);
+const char *get_word(const char **pos);
+// Presumably like get_word(), but rejects keywords (returns NULL without advancing) -- confirm in utils.c
+const char *get_id(const char **pos);
+// Skip a `#` comment up to (not including) the end of the line; true if one was skipped
+bool comment(const char **pos);
+// Step into a block indented exactly one level deeper than the current line; true on success
+bool indent(parse_ctx_t *ctx, const char **pos);
+// Pointer to the first '\r', '\n' or NUL at or after `str`
+const char *eol(const char *str);
+// Indentation of the line containing `pos`, in spaces (a tab counts as SPACES_PER_INDENT)
+PUREFUNC int64_t get_indent(parse_ctx_t *ctx, const char *pos);
+// Decode the backslash escape at `*out` into a string and advance `*out` past it
+const char *unescape(parse_ctx_t *ctx, const char **out);
+bool is_xid_continue_next(const char *pos);
+// Match a line break followed by at least `target` columns of indentation (or an empty line)
+bool newline_with_indentation(const char **out, int64_t target);
+// Match a run of commas/newlines (plus interleaved blanks and comments); true if any separator was seen
+bool match_separator(const char **pos);
diff --git a/src/typecheck.c b/src/typecheck.c
index 09c5fc62..08966c31 100644
--- a/src/typecheck.c
+++ b/src/typecheck.c
@@ -13,6 +13,7 @@
 #include "modules.h"
 #include "naming.h"
 #include "parse/parse.h"
+#include "parse/types.h"
 #include "stdlib/paths.h"
 #include "stdlib/tables.h"
 #include "stdlib/text.h"
author	Bruce Hill <bruce@bruce-hill.com>	2025-08-25 00:30:08 -0400
committer	Bruce Hill <bruce@bruce-hill.com>	2025-08-25 00:30:08 -0400
commit	b378f7359a6b8bd2e47bafdba496d115825adcd7 (patch)
tree	52deff3a66e2265b84bd20a7e81b20111d29f7dd /src
parent	80755477735baaea66f865c316aff036bebd8e2f (diff)