aboutsummaryrefslogtreecommitdiff
path: root/src/parse/utils.c
diff options
context:
space:
mode:
authorBruce Hill <bruce@bruce-hill.com>2025-08-25 00:30:08 -0400
committerBruce Hill <bruce@bruce-hill.com>2025-08-25 00:30:08 -0400
commitb378f7359a6b8bd2e47bafdba496d115825adcd7 (patch)
tree52deff3a66e2265b84bd20a7e81b20111d29f7dd /src/parse/utils.c
parent80755477735baaea66f865c316aff036bebd8e2f (diff)
Split out some parser functionality.
Diffstat (limited to 'src/parse/utils.c')
-rw-r--r--src/parse/utils.c272
1 files changed, 272 insertions, 0 deletions
diff --git a/src/parse/utils.c b/src/parse/utils.c
new file mode 100644
index 00000000..d745dec8
--- /dev/null
+++ b/src/parse/utils.c
@@ -0,0 +1,272 @@
+// Some common parsing utilities
+
+#include <stdint.h>
+
+#include "../unistr-fixed.h"
+#include <unictype.h>
+#include <uniname.h>
+
+#include "../stdlib/util.h"
+#include "errors.h"
+#include "parse.h"
+#include "utils.h"
+
+static const char *keywords[] = {
+ "C_code", "_max_", "_min_", "and", "assert", "break", "continue", "defer", "deserialize", "do",
+ "else", "enum", "extend", "extern", "for", "func", "if", "in", "lang", "mod",
+ "mod1", "no", "none", "not", "or", "pass", "return", "skip", "skip", "stop",
+ "struct", "then", "unless", "use", "when", "while", "xor", "yes",
+};
+
+public
+CONSTFUNC bool is_keyword(const char *word) {
+ int64_t lo = 0, hi = sizeof(keywords) / sizeof(keywords[0]) - 1;
+ while (lo <= hi) {
+ int64_t mid = (lo + hi) / 2;
+ int32_t cmp = strcmp(word, keywords[mid]);
+ if (cmp == 0) return true;
+ else if (cmp > 0) lo = mid + 1;
+ else if (cmp < 0) hi = mid - 1;
+ }
+ return false;
+}
+
+public
+size_t some_of(const char **pos, const char *allow) {
+ size_t len = strspn(*pos, allow);
+ *pos += len;
+ return len;
+}
+
+public
+size_t some_not(const char **pos, const char *forbid) {
+ size_t len = strcspn(*pos, forbid);
+ *pos += len;
+ return len;
+}
+
+public
+size_t spaces(const char **pos) { return some_of(pos, " \t"); }
+
+public
+void whitespace(const char **pos) {
+ while (some_of(pos, " \t\r\n") || comment(pos))
+ continue;
+}
+
+public
+size_t match(const char **pos, const char *target) {
+ size_t len = strlen(target);
+ if (strncmp(*pos, target, len) != 0) return 0;
+ *pos += len;
+ return len;
+}
+
+public
+bool is_xid_continue_next(const char *pos) {
+ ucs4_t point = 0;
+ u8_next(&point, (const uint8_t *)pos);
+ return uc_is_property_xid_continue(point);
+}
+
+public
+size_t match_word(const char **out, const char *word) {
+ const char *pos = *out;
+ spaces(&pos);
+ if (!match(&pos, word) || is_xid_continue_next(pos)) return 0;
+
+ *out = pos;
+ return strlen(word);
+}
+
+public
+const char *get_word(const char **inout) {
+ const char *word = *inout;
+ spaces(&word);
+ const uint8_t *pos = (const uint8_t *)word;
+ ucs4_t point;
+ pos = u8_next(&point, pos);
+ if (!uc_is_property_xid_start(point) && point != '_') return NULL;
+
+ for (const uint8_t *next; (next = u8_next(&point, pos)); pos = next) {
+ if (!uc_is_property_xid_continue(point)) break;
+ }
+ *inout = (const char *)pos;
+ return GC_strndup(word, (size_t)((const char *)pos - word));
+}
+
+public
+const char *get_id(const char **inout) {
+ const char *pos = *inout;
+ const char *word = get_word(&pos);
+ if (!word || is_keyword(word)) return NULL;
+ *inout = pos;
+ return word;
+}
+
+public
+const char *eol(const char *str) { return str + strcspn(str, "\r\n"); }
+
+public
+bool comment(const char **pos) {
+ if ((*pos)[0] == '#') {
+ *pos += strcspn(*pos, "\r\n");
+ return true;
+ } else {
+ return false;
+ }
+}
+
+public
+PUREFUNC int64_t get_indent(parse_ctx_t *ctx, const char *pos) {
+ int64_t line_num = get_line_number(ctx->file, pos);
+ const char *line = get_line(ctx->file, line_num);
+ if (line == NULL) {
+ return 0;
+ } else if (*line == ' ') {
+ int64_t spaces = (int64_t)strspn(line, " ");
+ if (line[spaces] == '\t')
+ parser_err(ctx, line + spaces, line + spaces + 1,
+ "This is a tab following spaces, and you can't mix tabs and spaces");
+ return spaces;
+ } else if (*line == '\t') {
+ int64_t indent = (int64_t)strspn(line, "\t");
+ if (line[indent] == ' ')
+ parser_err(ctx, line + indent, line + indent + 1,
+ "This is a space following tabs, and you can't mix tabs and spaces");
+ return indent * SPACES_PER_INDENT;
+ } else {
+ return 0;
+ }
+}
+
+public
+bool indent(parse_ctx_t *ctx, const char **out) {
+ const char *pos = *out;
+ int64_t starting_indent = get_indent(ctx, pos);
+ whitespace(&pos);
+ const char *next_line = get_line(ctx->file, get_line_number(ctx->file, pos));
+ if (next_line <= *out) return false;
+
+ if (get_indent(ctx, next_line) != starting_indent + SPACES_PER_INDENT) return false;
+
+ *out = next_line + strspn(next_line, " \t");
+ return true;
+}
+
+public
+bool newline_with_indentation(const char **out, int64_t target) {
+ const char *pos = *out;
+ if (*pos == '\r') ++pos;
+ if (*pos != '\n') return false;
+ ++pos;
+ if (*pos == '\r' || *pos == '\n' || *pos == '\0') {
+ // Empty line
+ *out = pos;
+ return true;
+ }
+
+ if (*pos == ' ') {
+ if ((int64_t)strspn(pos, " ") >= target) {
+ *out = pos + target;
+ return true;
+ }
+ } else if ((int64_t)strspn(pos, "\t") * SPACES_PER_INDENT >= target) {
+ *out = pos + target / SPACES_PER_INDENT;
+ return true;
+ }
+ return false;
+}
+
+//
+// Convert an escape sequence like \n to a string
+//
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstack-protector"
+#endif
+const char *unescape(parse_ctx_t *ctx, const char **out) {
+ const char **endpos = out;
+ const char *escape = *out;
+ static const char *unescapes[256] = {['a'] = "\a", ['b'] = "\b", ['e'] = "\x1b", ['f'] = "\f", ['n'] = "\n",
+ ['r'] = "\r", ['t'] = "\t", ['v'] = "\v", ['_'] = " "};
+ assert(*escape == '\\');
+ if (unescapes[(int)escape[1]]) {
+ *endpos = escape + 2;
+ return GC_strdup(unescapes[(int)escape[1]]);
+ } else if (escape[1] == '[') {
+ // ANSI Control Sequence Indicator: \033 [ ... m
+ size_t len = strcspn(&escape[2], "\r\n]");
+ if (escape[2 + len] != ']') parser_err(ctx, escape, escape + 2 + len, "Missing closing ']'");
+ *endpos = escape + 3 + len;
+ return String("\033[", string_slice(&escape[2], len), "m");
+ } else if (escape[1] == '{') {
+ // Unicode codepoints by name
+ size_t len = strcspn(&escape[2], "\r\n}");
+ if (escape[2 + len] != '}') parser_err(ctx, escape, escape + 2 + len, "Missing closing '}'");
+ char name[len + 1];
+ memcpy(name, &escape[2], len);
+ name[len] = '\0';
+
+ if (name[0] == 'U') {
+ for (char *p = &name[1]; *p; p++) {
+ if (!isxdigit(*p)) goto look_up_unicode_name;
+ }
+ // Unicode codepoints by hex
+ char *endptr = NULL;
+ long codepoint = strtol(name + 1, &endptr, 16);
+ uint32_t ustr[2] = {codepoint, 0};
+ size_t bufsize = 8;
+ uint8_t buf[bufsize];
+ (void)u32_to_u8(ustr, bufsize, buf, &bufsize);
+ *endpos = escape + 3 + len;
+ return GC_strndup((char *)buf, bufsize);
+ }
+
+ look_up_unicode_name:;
+
+ uint32_t codepoint = unicode_name_character(name);
+ if (codepoint == UNINAME_INVALID)
+ parser_err(ctx, escape, escape + 3 + len, "Invalid unicode codepoint name: ", quoted(name));
+ *endpos = escape + 3 + len;
+ char *str = GC_MALLOC_ATOMIC(16);
+ size_t u8_len = 16;
+ (void)u32_to_u8(&codepoint, 1, (uint8_t *)str, &u8_len);
+ str[u8_len] = '\0';
+ return str;
+ } else if (escape[1] == 'x' && escape[2] && escape[3]) {
+ // ASCII 2-digit hex
+ char buf[] = {escape[2], escape[3], 0};
+ char c = (char)strtol(buf, NULL, 16);
+ *endpos = escape + 4;
+ return GC_strndup(&c, 1);
+ } else if ('0' <= escape[1] && escape[1] <= '7' && '0' <= escape[2] && escape[2] <= '7' && '0' <= escape[3]
+ && escape[3] <= '7') {
+ char buf[] = {escape[1], escape[2], escape[3], 0};
+ char c = (char)strtol(buf, NULL, 8);
+ *endpos = escape + 4;
+ return GC_strndup(&c, 1);
+ } else {
+ *endpos = escape + 2;
+ return GC_strndup(escape + 1, 1);
+ }
+}
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
+
+public
+bool match_separator(const char **pos) { // Either comma or newline
+ const char *p = *pos;
+ int separators = 0;
+ for (;;) {
+ if (some_of(&p, "\r\n,")) ++separators;
+ else if (!comment(&p) && !some_of(&p, " \t")) break;
+ }
+ if (separators > 0) {
+ *pos = p;
+ return true;
+ } else {
+ return false;
+ }
+}