diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/parse/errors.h | 86 | ||||
| -rw-r--r-- | src/parse/parse.c | 344 | ||||
| -rw-r--r-- | src/parse/parse.h | 2 | ||||
| -rw-r--r-- | src/parse/utils.c | 272 | ||||
| -rw-r--r-- | src/parse/utils.h | 24 | ||||
| -rw-r--r-- | src/typecheck.c | 1 |
6 files changed, 389 insertions, 340 deletions
diff --git a/src/parse/errors.h b/src/parse/errors.h new file mode 100644 index 00000000..479a785f --- /dev/null +++ b/src/parse/errors.h @@ -0,0 +1,86 @@ + +#include <ctype.h> // IWYU pragma: export +#include <stdio.h> // IWYU pragma: export +#include <stdlib.h> // IWYU pragma: export +#include <string.h> // IWYU pragma: export + +#include "../stdlib/files.h" // IWYU pragma: export +#include "../stdlib/print.h" // IWYU pragma: export +#include "../stdlib/stacktrace.h" // IWYU pragma: export +#include "../stdlib/stdlib.h" // IWYU pragma: export +#include "utils.h" // IWYU pragma: export + +// +// Print a parse error and exit (or use the on_err longjmp) +// +#define parser_err(ctx, start, end, ...) \ + ({ \ + if (USE_COLOR) fputs("\x1b[31;1;7m", stderr); \ + fprint_inline(stderr, (ctx)->file->relative_filename, ":", get_line_number((ctx)->file, (start)), ".", \ + get_line_column((ctx)->file, (start)), ": ", __VA_ARGS__); \ + if (USE_COLOR) fputs(" \x1b[m", stderr); \ + fputs("\n\n", stderr); \ + highlight_error((ctx)->file, (start), (end), "\x1b[31;1;7m", 2, USE_COLOR); \ + fputs("\n", stderr); \ + if (getenv("TOMO_STACKTRACE")) print_stacktrace(stderr, 1); \ + if ((ctx)->on_err) longjmp(*((ctx)->on_err), 1); \ + raise(SIGABRT); \ + exit(1); \ + }) + +// +// Expect a string (potentially after whitespace) and emit a parser error if it's not there +// +#define expect_str(ctx, start, pos, target, ...) \ + ({ \ + spaces(pos); \ + if (!match(pos, target)) { \ + if (USE_COLOR) fputs("\x1b[31;1;7m", stderr); \ + parser_err(ctx, start, *pos, __VA_ARGS__); \ + } \ + char _lastchar = target[strlen(target) - 1]; \ + if (isalpha(_lastchar) || isdigit(_lastchar) || _lastchar == '_') { \ + if (is_xid_continue_next(*pos)) { \ + if (USE_COLOR) fputs("\x1b[31;1;7m", stderr); \ + parser_err(ctx, start, *pos, __VA_ARGS__); \ + } \ + } \ + }) + +// +// Helper for matching closing parens with good error messages +// +#define expect_closing(ctx, pos, close_str, ...) \ + ({ \ + const char *_start = *pos; \ + spaces(pos); \ + if (!match(pos, (close_str))) { \ + const char *_eol = strchr(*pos, '\n'); \ + const char *_next = strstr(*pos, (close_str)); \ + const char *_end = _eol < _next ? _eol : _next; \ + if (USE_COLOR) fputs("\x1b[31;1;7m", stderr); \ + parser_err(ctx, _start, _end, __VA_ARGS__); \ + } \ + }) + +#define expect(ctx, start, pos, parser, ...) \ + ({ \ + const char **_pos = pos; \ + spaces(_pos); \ + __typeof(parser(ctx, *_pos)) _result = parser(ctx, *_pos); \ + if (!_result) { \ + if (USE_COLOR) fputs("\x1b[31;1;7m", stderr); \ + parser_err(ctx, start, *_pos, __VA_ARGS__); \ + } \ + *_pos = _result->end; \ + _result; \ + }) + +#define optional(ctx, pos, parser) \ + ({ \ + const char **_pos = pos; \ + spaces(_pos); \ + __typeof(parser(ctx, *_pos)) _result = parser(ctx, *_pos); \ + if (_result) *_pos = _result->end; \ + _result; \ + }) diff --git a/src/parse/parse.c b/src/parse/parse.c index 40e2a766..ba88843d 100644 --- a/src/parse/parse.c +++ b/src/parse/parse.c @@ -2,23 +2,24 @@ #include <ctype.h> #include <gc.h> #include <setjmp.h> +#include <signal.h> #include <stdarg.h> #include <stdbool.h> #include <string.h> -#include <signal.h> +#include "../unistr-fixed.h" #include <unictype.h> #include <uniname.h> #include "../ast.h" #include "../stdlib/print.h" -#include "../stdlib/stacktrace.h" #include "../stdlib/stdlib.h" #include "../stdlib/tables.h" #include "../stdlib/text.h" #include "../stdlib/util.h" -#include "../unistr-fixed.h" +#include "errors.h" #include "parse.h" +#include "utils.h" // The cache of {filename -> parsed AST} will hold at most this many entries: #ifndef PARSE_CACHE_SIZE @@ -28,8 +29,6 @@ static const double RADIANS_PER_DEGREE = 0.0174532925199432957692369076848861271344287188854172545609719144; static const char closing[128] = {['('] = ')', ['['] = ']', ['<'] = '>', ['{'] = '}'}; -#define SPACES_PER_INDENT 4 - int op_tightness[] = { [Power] = 9, [Multiply] = 8, @@ -57,25 +56,6 @@ int op_tightness[] = { [Xor] = 1, }; -static const char *keywords[] = { - "C_code", "_max_", "_min_", "and", "assert", "break", "continue", "defer", "deserialize", "do", - "else", "enum", "extend", "extern", "for", "func", "if", "in", "lang", "mod", - "mod1", "no", "none", "not", "or", "pass", "return", "skip", "skip", "stop", - "struct", "then", "unless", "use", "when", "while", "xor", "yes", -}; - -enum { NORMAL_FUNCTION = 0, EXTERN_FUNCTION = 1 }; - -static INLINE size_t some_of(const char **pos, const char *allow); -static INLINE size_t some_not(const char **pos, const char *forbid); -static INLINE size_t spaces(const char **pos); -static INLINE void whitespace(const char **pos); -static INLINE size_t match(const char **pos, const char *target); -static INLINE size_t match_word(const char **pos, const char *word); -static INLINE const char *get_word(const char **pos); -static INLINE const char *get_id(const char **pos); -static INLINE bool comment(const char **pos); -static INLINE bool indent(parse_ctx_t *ctx, const char **pos); static INLINE ast_e match_binary_operator(const char **pos); static ast_t *parse_comprehension_suffix(parse_ctx_t *ctx, ast_t *expr); static ast_t *parse_field_suffix(parse_ctx_t *ctx, ast_t *lhs); @@ -148,307 +128,6 @@ static ast_t *parse_deserialize(parse_ctx_t *ctx, const char *pos); static ast_list_t *_parse_text_helper(parse_ctx_t *ctx, const char **out_pos, char open_quote, char close_quote, char open_interp, bool allow_escapes); -// -// Print a parse error and exit (or use the on_err longjmp) -// -#define parser_err(ctx, start, end, ...) \ - ({ \ - if (USE_COLOR) fputs("\x1b[31;1;7m", stderr); \ - fprint_inline(stderr, (ctx)->file->relative_filename, ":", get_line_number((ctx)->file, (start)), ".", \ - get_line_column((ctx)->file, (start)), ": ", __VA_ARGS__); \ - if (USE_COLOR) fputs(" \x1b[m", stderr); \ - fputs("\n\n", stderr); \ - highlight_error((ctx)->file, (start), (end), "\x1b[31;1;7m", 2, USE_COLOR); \ - fputs("\n", stderr); \ - if (getenv("TOMO_STACKTRACE")) print_stacktrace(stderr, 1); \ - if ((ctx)->on_err) longjmp(*((ctx)->on_err), 1); \ - raise(SIGABRT); \ - exit(1); \ - }) - -// -// Expect a string (potentially after whitespace) and emit a parser error if it's not there -// -#define expect_str(ctx, start, pos, target, ...) \ - ({ \ - spaces(pos); \ - if (!match(pos, target)) { \ - if (USE_COLOR) fputs("\x1b[31;1;7m", stderr); \ - parser_err(ctx, start, *pos, __VA_ARGS__); \ - } \ - char _lastchar = target[strlen(target) - 1]; \ - if (isalpha(_lastchar) || isdigit(_lastchar) || _lastchar == '_') { \ - if (is_xid_continue_next(*pos)) { \ - if (USE_COLOR) fputs("\x1b[31;1;7m", stderr); \ - parser_err(ctx, start, *pos, __VA_ARGS__); \ - } \ - } \ - }) - -// -// Helper for matching closing parens with good error messages -// -#define expect_closing(ctx, pos, close_str, ...) \ - ({ \ - const char *_start = *pos; \ - spaces(pos); \ - if (!match(pos, (close_str))) { \ - const char *_eol = strchr(*pos, '\n'); \ - const char *_next = strstr(*pos, (close_str)); \ - const char *_end = _eol < _next ? _eol : _next; \ - if (USE_COLOR) fputs("\x1b[31;1;7m", stderr); \ - parser_err(ctx, _start, _end, __VA_ARGS__); \ - } \ - }) - -#define expect(ctx, start, pos, parser, ...) \ - ({ \ - const char **_pos = pos; \ - spaces(_pos); \ - __typeof(parser(ctx, *_pos)) _result = parser(ctx, *_pos); \ - if (!_result) { \ - if (USE_COLOR) fputs("\x1b[31;1;7m", stderr); \ - parser_err(ctx, start, *_pos, __VA_ARGS__); \ - } \ - *_pos = _result->end; \ - _result; \ - }) - -#define optional(ctx, pos, parser) \ - ({ \ - const char **_pos = pos; \ - spaces(_pos); \ - __typeof(parser(ctx, *_pos)) _result = parser(ctx, *_pos); \ - if (_result) *_pos = _result->end; \ - _result; \ - }) - -// -// Convert an escape sequence like \n to a string -// -#ifdef __GNUC__ -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wstack-protector" -#endif -static const char *unescape(parse_ctx_t *ctx, const char **out) { - const char **endpos = out; - const char *escape = *out; - static const char *unescapes[256] = {['a'] = "\a", ['b'] = "\b", ['e'] = "\x1b", ['f'] = "\f", ['n'] = "\n", - ['r'] = "\r", ['t'] = "\t", ['v'] = "\v", ['_'] = " "}; - assert(*escape == '\\'); - if (unescapes[(int)escape[1]]) { - *endpos = escape + 2; - return GC_strdup(unescapes[(int)escape[1]]); - } else if (escape[1] == '[') { - // ANSI Control Sequence Indicator: \033 [ ... m - size_t len = strcspn(&escape[2], "\r\n]"); - if (escape[2 + len] != ']') parser_err(ctx, escape, escape + 2 + len, "Missing closing ']'"); - *endpos = escape + 3 + len; - return String("\033[", string_slice(&escape[2], len), "m"); - } else if (escape[1] == '{') { - // Unicode codepoints by name - size_t len = strcspn(&escape[2], "\r\n}"); - if (escape[2 + len] != '}') parser_err(ctx, escape, escape + 2 + len, "Missing closing '}'"); - char name[len + 1]; - memcpy(name, &escape[2], len); - name[len] = '\0'; - - if (name[0] == 'U') { - for (char *p = &name[1]; *p; p++) { - if (!isxdigit(*p)) goto look_up_unicode_name; - } - // Unicode codepoints by hex - char *endptr = NULL; - long codepoint = strtol(name + 1, &endptr, 16); - uint32_t ustr[2] = {codepoint, 0}; - size_t bufsize = 8; - uint8_t buf[bufsize]; - (void)u32_to_u8(ustr, bufsize, buf, &bufsize); - *endpos = escape + 3 + len; - return GC_strndup((char *)buf, bufsize); - } - - look_up_unicode_name:; - - uint32_t codepoint = unicode_name_character(name); - if (codepoint == UNINAME_INVALID) - parser_err(ctx, escape, escape + 3 + len, "Invalid unicode codepoint name: ", quoted(name)); - *endpos = escape + 3 + len; - char *str = GC_MALLOC_ATOMIC(16); - size_t u8_len = 16; - (void)u32_to_u8(&codepoint, 1, (uint8_t *)str, &u8_len); - str[u8_len] = '\0'; - return str; - } else if (escape[1] == 'x' && escape[2] && escape[3]) { - // ASCII 2-digit hex - char buf[] = {escape[2], escape[3], 0}; - char c = (char)strtol(buf, NULL, 16); - *endpos = escape + 4; - return GC_strndup(&c, 1); - } else if ('0' <= escape[1] && escape[1] <= '7' && '0' <= escape[2] && escape[2] <= '7' && '0' <= escape[3] - && escape[3] <= '7') { - char buf[] = {escape[1], escape[2], escape[3], 0}; - char c = (char)strtol(buf, NULL, 8); - *endpos = escape + 4; - return GC_strndup(&c, 1); - } else { - *endpos = escape + 2; - return GC_strndup(escape + 1, 1); - } -} -#ifdef __GNUC__ -#pragma GCC diagnostic pop -#endif - -// Indent is in number of spaces (assuming that \t is 4 spaces) -PUREFUNC static INLINE int64_t get_indent(parse_ctx_t *ctx, const char *pos) { - int64_t line_num = get_line_number(ctx->file, pos); - const char *line = get_line(ctx->file, line_num); - if (line == NULL) { - return 0; - } else if (*line == ' ') { - int64_t spaces = (int64_t)strspn(line, " "); - if (line[spaces] == '\t') - parser_err(ctx, line + spaces, line + spaces + 1, - "This is a tab following spaces, and you can't mix tabs and spaces"); - return spaces; - } else if (*line == '\t') { - int64_t indent = (int64_t)strspn(line, "\t"); - if (line[indent] == ' ') - parser_err(ctx, line + indent, line + indent + 1, - "This is a space following tabs, and you can't mix tabs and spaces"); - return indent * SPACES_PER_INDENT; - } else { - return 0; - } -} - -/////////////////////////////////////////////////////////////////////////////////////////////////////////// -///////////////////////////// Text-based parsing primitives /////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////////////// -size_t some_of(const char **pos, const char *allow) { - size_t len = strspn(*pos, allow); - *pos += len; - return len; -} - -size_t some_not(const char **pos, const char *forbid) { - size_t len = strcspn(*pos, forbid); - *pos += len; - return len; -} - -size_t spaces(const char **pos) { return some_of(pos, " \t"); } - -void whitespace(const char **pos) { - while (some_of(pos, " \t\r\n") || comment(pos)) - continue; -} - -size_t match(const char **pos, const char *target) { - size_t len = strlen(target); - if (strncmp(*pos, target, len) != 0) return 0; - *pos += len; - return len; -} - -static INLINE bool is_xid_continue_next(const char *pos) { - ucs4_t point = 0; - u8_next(&point, (const uint8_t *)pos); - return uc_is_property_xid_continue(point); -} - -size_t match_word(const char **out, const char *word) { - const char *pos = *out; - spaces(&pos); - if (!match(&pos, word) || is_xid_continue_next(pos)) return 0; - - *out = pos; - return strlen(word); -} - -const char *get_word(const char **inout) { - const char *word = *inout; - spaces(&word); - const uint8_t *pos = (const uint8_t *)word; - ucs4_t point; - pos = u8_next(&point, pos); - if (!uc_is_property_xid_start(point) && point != '_') return NULL; - - for (const uint8_t *next; (next = u8_next(&point, pos)); pos = next) { - if (!uc_is_property_xid_continue(point)) break; - } - *inout = (const char *)pos; - return GC_strndup(word, (size_t)((const char *)pos - word)); -} - -static CONSTFUNC bool is_keyword(const char *word) { - int64_t lo = 0, hi = sizeof(keywords) / sizeof(keywords[0]) - 1; - while (lo <= hi) { - int64_t mid = (lo + hi) / 2; - int32_t cmp = strcmp(word, keywords[mid]); - if (cmp == 0) return true; - else if (cmp > 0) lo = mid + 1; - else if (cmp < 0) hi = mid - 1; - } - return false; -} - -const char *get_id(const char **inout) { - const char *pos = *inout; - const char *word = get_word(&pos); - if (!word || is_keyword(word)) return NULL; - *inout = pos; - return word; -} - -static const char *eol(const char *str) { return str + strcspn(str, "\r\n"); } - -bool comment(const char **pos) { - if ((*pos)[0] == '#') { - *pos += strcspn(*pos, "\r\n"); - return true; - } else { - return false; - } -} - -bool indent(parse_ctx_t *ctx, const char **out) { - const char *pos = *out; - int64_t starting_indent = get_indent(ctx, pos); - whitespace(&pos); - const char *next_line = get_line(ctx->file, get_line_number(ctx->file, pos)); - if (next_line <= *out) return false; - - if (get_indent(ctx, next_line) != starting_indent + SPACES_PER_INDENT) return false; - - *out = next_line + strspn(next_line, " \t"); - return true; -} - -bool newline_with_indentation(const char **out, int64_t target) { - const char *pos = *out; - if (*pos == '\r') ++pos; - if (*pos != '\n') return false; - ++pos; - if (*pos == '\r' || *pos == '\n' || *pos == '\0') { - // Empty line - *out = pos; - return true; - } - - if (*pos == ' ') { - if ((int64_t)strspn(pos, " ") >= target) { - *out = pos + target; - return true; - } - } else if ((int64_t)strspn(pos, "\t") * SPACES_PER_INDENT >= target) { - *out = pos + target / SPACES_PER_INDENT; - return true; - } - return false; -} - /////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////// AST-based parsers ///////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -661,21 +340,6 @@ ast_t *parse_num(parse_ctx_t *ctx, const char *pos) { return NewAST(ctx->file, start, pos, Num, .n = d); } -static INLINE bool match_separator(const char **pos) { // Either comma or newline - const char *p = *pos; - int separators = 0; - for (;;) { - if (some_of(&p, "\r\n,")) ++separators; - else if (!comment(&p) && !some_of(&p, " \t")) break; - } - if (separators > 0) { - *pos = p; - return true; - } else { - return false; - } -} - ast_t *parse_list(parse_ctx_t *ctx, const char *pos) { const char *start = pos; if (!match(&pos, "[")) return NULL; diff --git a/src/parse/parse.h b/src/parse/parse.h index 2d81170c..24e173a5 100644 --- a/src/parse/parse.h +++ b/src/parse/parse.h @@ -8,6 +8,8 @@ #include "../ast.h" #include "../stdlib/files.h" +#define SPACES_PER_INDENT 4 + typedef struct { file_t *file; jmp_buf *on_err; diff --git a/src/parse/utils.c b/src/parse/utils.c new file mode 100644 index 00000000..d745dec8 --- /dev/null +++ b/src/parse/utils.c @@ -0,0 +1,272 @@ +// Some common parsing utilities + +#include <stdint.h> + +#include "../unistr-fixed.h" +#include <unictype.h> +#include <uniname.h> + +#include "../stdlib/util.h" +#include "errors.h" +#include "parse.h" +#include "utils.h" + +static const char *keywords[] = { + "C_code", "_max_", "_min_", "and", "assert", "break", "continue", "defer", "deserialize", "do", + "else", "enum", "extend", "extern", "for", "func", "if", "in", "lang", "mod", + "mod1", "no", "none", "not", "or", "pass", "return", "skip", "skip", "stop", + "struct", "then", "unless", "use", "when", "while", "xor", "yes", +}; + +public +CONSTFUNC bool is_keyword(const char *word) { + int64_t lo = 0, hi = sizeof(keywords) / sizeof(keywords[0]) - 1; + while (lo <= hi) { + int64_t mid = (lo + hi) / 2; + int32_t cmp = strcmp(word, keywords[mid]); + if (cmp == 0) return true; + else if (cmp > 0) lo = mid + 1; + else if (cmp < 0) hi = mid - 1; + } + return false; +} + +public +size_t some_of(const char **pos, const char *allow) { + size_t len = strspn(*pos, allow); + *pos += len; + return len; +} + +public +size_t some_not(const char **pos, const char *forbid) { + size_t len = strcspn(*pos, forbid); + *pos += len; + return len; +} + +public +size_t spaces(const char **pos) { return some_of(pos, " \t"); } + +public +void whitespace(const char **pos) { + while (some_of(pos, " \t\r\n") || comment(pos)) + continue; +} + +public +size_t match(const char **pos, const char *target) { + size_t len = strlen(target); + if (strncmp(*pos, target, len) != 0) return 0; + *pos += len; + return len; +} + +public +bool is_xid_continue_next(const char *pos) { + ucs4_t point = 0; + u8_next(&point, (const uint8_t *)pos); + return uc_is_property_xid_continue(point); +} + +public +size_t match_word(const char **out, const char *word) { + const char *pos = *out; + spaces(&pos); + if (!match(&pos, word) || is_xid_continue_next(pos)) return 0; + + *out = pos; + return strlen(word); +} + +public +const char *get_word(const char **inout) { + const char *word = *inout; + spaces(&word); + const uint8_t *pos = (const uint8_t *)word; + ucs4_t point; + pos = u8_next(&point, pos); + if (!uc_is_property_xid_start(point) && point != '_') return NULL; + + for (const uint8_t *next; (next = u8_next(&point, pos)); pos = next) { + if (!uc_is_property_xid_continue(point)) break; + } + *inout = (const char *)pos; + return GC_strndup(word, (size_t)((const char *)pos - word)); +} + +public +const char *get_id(const char **inout) { + const char *pos = *inout; + const char *word = get_word(&pos); + if (!word || is_keyword(word)) return NULL; + *inout = pos; + return word; +} + +public +const char *eol(const char *str) { return str + strcspn(str, "\r\n"); } + +public +bool comment(const char **pos) { + if ((*pos)[0] == '#') { + *pos += strcspn(*pos, "\r\n"); + return true; + } else { + return false; + } +} + +public +PUREFUNC int64_t get_indent(parse_ctx_t *ctx, const char *pos) { + int64_t line_num = get_line_number(ctx->file, pos); + const char *line = get_line(ctx->file, line_num); + if (line == NULL) { + return 0; + } else if (*line == ' ') { + int64_t spaces = (int64_t)strspn(line, " "); + if (line[spaces] == '\t') + parser_err(ctx, line + spaces, line + spaces + 1, + "This is a tab following spaces, and you can't mix tabs and spaces"); + return spaces; + } else if (*line == '\t') { + int64_t indent = (int64_t)strspn(line, "\t"); + if (line[indent] == ' ') + parser_err(ctx, line + indent, line + indent + 1, + "This is a space following tabs, and you can't mix tabs and spaces"); + return indent * SPACES_PER_INDENT; + } else { + return 0; + } +} + +public +bool indent(parse_ctx_t *ctx, const char **out) { + const char *pos = *out; + int64_t starting_indent = get_indent(ctx, pos); + whitespace(&pos); + const char *next_line = get_line(ctx->file, get_line_number(ctx->file, pos)); + if (next_line <= *out) return false; + + if (get_indent(ctx, next_line) != starting_indent + SPACES_PER_INDENT) return false; + + *out = next_line + strspn(next_line, " \t"); + return true; +} + +public +bool newline_with_indentation(const char **out, int64_t target) { + const char *pos = *out; + if (*pos == '\r') ++pos; + if (*pos != '\n') return false; + ++pos; + if (*pos == '\r' || *pos == '\n' || *pos == '\0') { + // Empty line + *out = pos; + return true; + } + + if (*pos == ' ') { + if ((int64_t)strspn(pos, " ") >= target) { + *out = pos + target; + return true; + } + } else if ((int64_t)strspn(pos, "\t") * SPACES_PER_INDENT >= target) { + *out = pos + target / SPACES_PER_INDENT; + return true; + } + return false; +} + +// +// Convert an escape sequence like \n to a string +// +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstack-protector" +#endif +const char *unescape(parse_ctx_t *ctx, const char **out) { + const char **endpos = out; + const char *escape = *out; + static const char *unescapes[256] = {['a'] = "\a", ['b'] = "\b", ['e'] = "\x1b", ['f'] = "\f", ['n'] = "\n", + ['r'] = "\r", ['t'] = "\t", ['v'] = "\v", ['_'] = " "}; + assert(*escape == '\\'); + if (unescapes[(int)escape[1]]) { + *endpos = escape + 2; + return GC_strdup(unescapes[(int)escape[1]]); + } else if (escape[1] == '[') { + // ANSI Control Sequence Indicator: \033 [ ... m + size_t len = strcspn(&escape[2], "\r\n]"); + if (escape[2 + len] != ']') parser_err(ctx, escape, escape + 2 + len, "Missing closing ']'"); + *endpos = escape + 3 + len; + return String("\033[", string_slice(&escape[2], len), "m"); + } else if (escape[1] == '{') { + // Unicode codepoints by name + size_t len = strcspn(&escape[2], "\r\n}"); + if (escape[2 + len] != '}') parser_err(ctx, escape, escape + 2 + len, "Missing closing '}'"); + char name[len + 1]; + memcpy(name, &escape[2], len); + name[len] = '\0'; + + if (name[0] == 'U') { + for (char *p = &name[1]; *p; p++) { + if (!isxdigit(*p)) goto look_up_unicode_name; + } + // Unicode codepoints by hex + char *endptr = NULL; + long codepoint = strtol(name + 1, &endptr, 16); + uint32_t ustr[2] = {codepoint, 0}; + size_t bufsize = 8; + uint8_t buf[bufsize]; + (void)u32_to_u8(ustr, bufsize, buf, &bufsize); + *endpos = escape + 3 + len; + return GC_strndup((char *)buf, bufsize); + } + + look_up_unicode_name:; + + uint32_t codepoint = unicode_name_character(name); + if (codepoint == UNINAME_INVALID) + parser_err(ctx, escape, escape + 3 + len, "Invalid unicode codepoint name: ", quoted(name)); + *endpos = escape + 3 + len; + char *str = GC_MALLOC_ATOMIC(16); + size_t u8_len = 16; + (void)u32_to_u8(&codepoint, 1, (uint8_t *)str, &u8_len); + str[u8_len] = '\0'; + return str; + } else if (escape[1] == 'x' && escape[2] && escape[3]) { + // ASCII 2-digit hex + char buf[] = {escape[2], escape[3], 0}; + char c = (char)strtol(buf, NULL, 16); + *endpos = escape + 4; + return GC_strndup(&c, 1); + } else if ('0' <= escape[1] && escape[1] <= '7' && '0' <= escape[2] && escape[2] <= '7' && '0' <= escape[3] + && escape[3] <= '7') { + char buf[] = {escape[1], escape[2], escape[3], 0}; + char c = (char)strtol(buf, NULL, 8); + *endpos = escape + 4; + return GC_strndup(&c, 1); + } else { + *endpos = escape + 2; + return GC_strndup(escape + 1, 1); + } +} +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif + +public +bool match_separator(const char **pos) { // Either comma or newline + const char *p = *pos; + int separators = 0; + for (;;) { + if (some_of(&p, "\r\n,")) ++separators; + else if (!comment(&p) && !some_of(&p, " \t")) break; + } + if (separators > 0) { + *pos = p; + return true; + } else { + return false; + } +} diff --git a/src/parse/utils.h b/src/parse/utils.h new file mode 100644 index 00000000..cec48f31 --- /dev/null +++ b/src/parse/utils.h @@ -0,0 +1,24 @@ +// Some common parsing utilities + +#include <stdbool.h> + +#include "../stdlib/util.h" +#include "parse.h" + +CONSTFUNC bool is_keyword(const char *word); +size_t some_of(const char **pos, const char *allow); +size_t some_not(const char **pos, const char *forbid); +size_t spaces(const char **pos); +void whitespace(const char **pos); +size_t match(const char **pos, const char *target); +size_t match_word(const char **pos, const char *word); +const char *get_word(const char **pos); +const char *get_id(const char **pos); +bool comment(const char **pos); +bool indent(parse_ctx_t *ctx, const char **pos); +const char *eol(const char *str); +PUREFUNC int64_t get_indent(parse_ctx_t *ctx, const char *pos); +const char *unescape(parse_ctx_t *ctx, const char **out); +bool is_xid_continue_next(const char *pos); +bool newline_with_indentation(const char **out, int64_t target); +bool match_separator(const char **pos); diff --git a/src/typecheck.c b/src/typecheck.c index 09c5fc62..08966c31 100644 --- a/src/typecheck.c +++ b/src/typecheck.c @@ -13,6 +13,7 @@ #include "modules.h" #include "naming.h" #include "parse/parse.h" +#include "parse/types.h" #include "stdlib/paths.h" #include "stdlib/tables.h" #include "stdlib/text.h" |
