// Some common parsing utilities #include #include "../unistr-fixed.h" #include #include #include "../stdlib/tables.h" #include "../util.h" #include "errors.h" #include "utils.h" static const char *keywords[] = { "C_code", "_max_", "_min_", "and", "assert", "break", "continue", "defer", "do", "else", "enum", "for", "func", "if", "in", "lang", "mod", "mod1", "no", "none", "not", "or", "pass", "return", "skip", "skip", "stop", "struct", "then", "unless", "use", "when", "while", "xor", "yes", }; CONSTFUNC bool is_keyword(const char *word) { int64_t lo = 0, hi = sizeof(keywords) / sizeof(keywords[0]) - 1; while (lo <= hi) { int64_t mid = (lo + hi) / 2; int32_t cmp = strcmp(word, keywords[mid]); if (cmp == 0) return true; else if (cmp > 0) lo = mid + 1; else if (cmp < 0) hi = mid - 1; } return false; } size_t some_of(const char **pos, const char *allow) { size_t len = strspn(*pos, allow); *pos += len; return len; } size_t some_not(const char **pos, const char *forbid) { size_t len = strcspn(*pos, forbid); *pos += len; return len; } size_t spaces(const char **pos) { return some_of(pos, " \t"); } void whitespace(parse_ctx_t *ctx, const char **pos) { while (some_of(pos, " \t\r\n") || comment(ctx, pos)) continue; } size_t match(const char **pos, const char *target) { size_t len = strlen(target); if (strncmp(*pos, target, len) != 0) return 0; *pos += len; return len; } bool is_xid_continue_next(const char *pos) { ucs4_t point = 0; u8_next(&point, (const uint8_t *)pos); return uc_is_property_xid_continue(point); } size_t match_word(const char **out, const char *word) { const char *pos = *out; spaces(&pos); if (!match(&pos, word) || is_xid_continue_next(pos)) return 0; *out = pos; return strlen(word); } const char *get_word(const char **inout) { const char *word = *inout; spaces(&word); const uint8_t *pos = (const uint8_t *)word; ucs4_t point; pos = u8_next(&point, pos); if (!uc_is_property_xid_start(point) && point != '_') return NULL; for (const uint8_t *next; (next = u8_next(&point, pos)); pos = next) { if (!uc_is_property_xid_continue(point)) break; } *inout = (const char *)pos; return GC_strndup(word, (size_t)((const char *)pos - word)); } const char *get_id(const char **inout) { const char *pos = *inout; const char *word = get_word(&pos); if (!word || is_keyword(word)) return NULL; *inout = pos; return word; } PUREFUNC const char *eol(const char *str) { return str + strcspn(str, "\r\n"); } bool comment(parse_ctx_t *ctx, const char **pos) { if ((*pos)[0] == '#') { const char *start = *pos; *pos += strcspn(*pos, "\r\n"); const char *end = *pos; Table$set(&ctx->comments, &start, &end, parse_comments_info); return true; } else { return false; } } PUREFUNC int64_t get_indent(parse_ctx_t *ctx, const char *pos) { int64_t line_num = get_line_number(ctx->file, pos); const char *line = get_line(ctx->file, line_num); if (line == NULL) { return 0; } else if (*line == ' ') { int64_t spaces = (int64_t)strspn(line, " "); if (line[spaces] == '\t') parser_err(ctx, line + spaces, line + spaces + 1, "This is a tab following spaces, and you can't mix tabs and spaces"); return spaces; } else if (*line == '\t') { int64_t indent = (int64_t)strspn(line, "\t"); if (line[indent] == ' ') parser_err(ctx, line + indent, line + indent + 1, "This is a space following tabs, and you can't mix tabs and spaces"); return indent * SPACES_PER_INDENT; } else { return 0; } } bool indent(parse_ctx_t *ctx, const char **out) { const char *pos = *out; int64_t starting_indent = get_indent(ctx, pos); whitespace(ctx, &pos); const char *next_line = get_line(ctx->file, get_line_number(ctx->file, pos)); if (next_line <= *out) return false; if (get_indent(ctx, next_line) != starting_indent + SPACES_PER_INDENT) return false; *out = next_line + strspn(next_line, " \t"); return true; } bool newline_with_indentation(const char **out, int64_t target) { const char *pos = *out; if (*pos == '\r') ++pos; if (*pos != '\n') return false; ++pos; if (*pos == '\r' || *pos == '\n' || *pos == '\0') { // Empty line *out = pos; return true; } if (*pos == ' ') { if ((int64_t)strspn(pos, " ") >= target) { *out = pos + target; return true; } } else if ((int64_t)strspn(pos, "\t") * SPACES_PER_INDENT >= target) { *out = pos + target / SPACES_PER_INDENT; return true; } return false; } // // Convert an escape sequence like \n to a string // #ifdef __GNUC__ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wstack-protector" #endif const char *unescape(parse_ctx_t *ctx, const char **out) { const char **endpos = out; const char *escape = *out; static const char *unescapes[256] = {['a'] = "\a", ['b'] = "\b", ['e'] = "\x1b", ['f'] = "\f", ['n'] = "\n", ['r'] = "\r", ['t'] = "\t", ['v'] = "\v", ['_'] = " "}; assert(*escape == '\\'); if (unescapes[(int)escape[1]]) { *endpos = escape + 2; return GC_strdup(unescapes[(int)escape[1]]); } else if (escape[1] == '[') { // ANSI Control Sequence Indicator: \033 [ ... m size_t len = strcspn(&escape[2], "\r\n]"); if (escape[2 + len] != ']') parser_err(ctx, escape, escape + 2 + len, "Missing closing ']'"); *endpos = escape + 3 + len; return String("\033[", string_slice(&escape[2], len), "m"); } else if (escape[1] == '{') { // Unicode codepoints by name size_t len = strcspn(&escape[2], "\r\n}"); if (escape[2 + len] != '}') parser_err(ctx, escape, escape + 2 + len, "Missing closing '}'"); char name[len + 1]; memcpy(name, &escape[2], len); name[len] = '\0'; if (name[0] == 'U') { for (char *p = &name[1]; *p; p++) { if (!isxdigit(*p)) goto look_up_unicode_name; } // Unicode codepoints by hex char *endptr = NULL; long codepoint = strtol(name + 1, &endptr, 16); uint32_t ustr[2] = {codepoint, 0}; size_t bufsize = 8; uint8_t buf[bufsize]; (void)u32_to_u8(ustr, bufsize, buf, &bufsize); *endpos = escape + 3 + len; return GC_strndup((char *)buf, bufsize); } look_up_unicode_name:; uint32_t codepoint = unicode_name_character(name); if (codepoint == UNINAME_INVALID) parser_err(ctx, escape, escape + 3 + len, "Invalid unicode codepoint name: ", quoted(name)); *endpos = escape + 3 + len; char *str = GC_MALLOC_ATOMIC(16); size_t u8_len = 16; (void)u32_to_u8(&codepoint, 1, (uint8_t *)str, &u8_len); str[u8_len] = '\0'; return str; } else if (escape[1] == 'x' && escape[2] && escape[3]) { // ASCII 2-digit hex char buf[] = {escape[2], escape[3], 0}; char c = (char)strtol(buf, NULL, 16); *endpos = escape + 4; return GC_strndup(&c, 1); } else if ('0' <= escape[1] && escape[1] <= '7' && '0' <= escape[2] && escape[2] <= '7' && '0' <= escape[3] && escape[3] <= '7') { char buf[] = {escape[1], escape[2], escape[3], 0}; char c = (char)strtol(buf, NULL, 8); *endpos = escape + 4; return GC_strndup(&c, 1); } else { *endpos = escape + 2; return GC_strndup(escape + 1, 1); } } #ifdef __GNUC__ #pragma GCC diagnostic pop #endif bool match_separator(parse_ctx_t *ctx, const char **pos) { // Either comma or newline const char *p = *pos; int separators = 0; for (;;) { if (some_of(&p, "\r\n,")) ++separators; else if (!comment(ctx, &p) && !some_of(&p, " \t")) break; } if (separators > 0) { *pos = p; return true; } else { return false; } }