code / tomo

Lines41.3K C23.7K Markdown9.7K YAML5.0K Tomo2.3K
7 others 763
Python231 Shell230 make212 INI47 Text21 SVG16 Lua6
(268 lines)
1 // Some common parsing utilities
3 #include <stdint.h>
5 #include "../unistr-fixed.h"
6 #include <unictype.h>
7 #include <uniname.h>
9 #include "../stdlib/tables.h"
10 #include "../stdlib/util.h"
11 #include "errors.h"
12 #include "utils.h"
// Reserved words of the language.
// The entries must stay unique and in strcmp() order, because is_keyword()
// looks words up in this table with a binary search.
static const char *keywords[] = {
    "C_code", "_max_",  "_min_", "and",  "assert", "break",  "continue", "defer", "do",   "else",  "enum", "for",
    "func",   "if",     "in",    "lang", "mod",    "mod1",   "no",       "none",  "not",  "or",    "pass", "return",
    "skip",   "stop",   "struct", "then", "unless", "use",   "when",     "while", "xor",  "yes",
};
20 CONSTFUNC bool is_keyword(const char *word) {
21 int64_t lo = 0, hi = sizeof(keywords) / sizeof(keywords[0]) - 1;
22 while (lo <= hi) {
23 int64_t mid = (lo + hi) / 2;
24 int32_t cmp = strcmp(word, keywords[mid]);
25 if (cmp == 0) return true;
26 else if (cmp > 0) lo = mid + 1;
27 else if (cmp < 0) hi = mid - 1;
29 return false;
// Advance `*pos` over the longest leading run of bytes that all appear in
// `allow`, and return how many bytes were skipped (0 if none matched).
size_t some_of(const char **pos, const char *allow) {
    const char *start = *pos;
    const char *stop = start + strspn(start, allow);
    *pos = stop;
    return (size_t)(stop - start);
}
// Advance `*pos` over the longest leading run of bytes that do NOT appear in
// `forbid` (stopping at the NUL terminator at the latest), and return how
// many bytes were skipped.
size_t some_not(const char **pos, const char *forbid) {
    const char *start = *pos;
    const char *stop = start + strcspn(start, forbid);
    *pos = stop;
    return (size_t)(stop - start);
}
// Skip horizontal whitespace (spaces and tabs, but not line breaks) at `*pos`
// and return the number of bytes skipped.
size_t spaces(const char **pos) {
    size_t skipped = strspn(*pos, " \t");
    *pos += skipped;
    return skipped;
}
48 void whitespace(parse_ctx_t *ctx, const char **pos) {
49 while (some_of(pos, " \t\r\n") || comment(ctx, pos))
50 continue;
// If the text at `*pos` begins with `target`, advance `*pos` past it and
// return the length of `target`. Otherwise leave `*pos` alone and return 0.
size_t match(const char **pos, const char *target) {
    size_t target_len = strlen(target);
    if (strncmp(*pos, target, target_len) == 0) {
        *pos += target_len;
        return target_len;
    }
    return 0;
}
60 bool is_xid_start_next(const char *pos) {
61 ucs4_t point = 0;
62 u8_next(&point, (const uint8_t *)pos);
63 return uc_is_property_xid_start(point);
66 bool is_xid_continue_next(const char *pos) {
67 ucs4_t point = 0;
68 u8_next(&point, (const uint8_t *)pos);
69 return uc_is_property_xid_continue(point);
// Match `word` as a whole word: optional leading spaces/tabs, then `word`,
// and the character right after it must not be an identifier-continuation
// (XID_Continue) character. On success `*out` is moved past the word and the
// length of `word` is returned; on failure `*out` is unchanged and 0 is returned.
size_t match_word(const char **out, const char *word) {
    const char *cursor = *out;
    spaces(&cursor);
    if (match(&cursor, word) == 0) return 0;
    if (is_xid_continue_next(cursor)) return 0; // `word` is only a prefix of a longer name
    *out = cursor;
    return strlen(word);
}
81 const char *get_word(const char **inout) {
82 const char *word = *inout;
83 spaces(&word);
84 const uint8_t *pos = (const uint8_t *)word;
85 ucs4_t point;
86 pos = u8_next(&point, pos);
87 if (!uc_is_property_xid_start(point) && point != '_') return NULL;
89 for (const uint8_t *next; (next = u8_next(&point, pos)); pos = next) {
90 if (!uc_is_property_xid_continue(point)) break;
92 *inout = (const char *)pos;
93 return GC_strndup(word, (size_t)((const char *)pos - word));
// Read an identifier: a word as accepted by get_word() that is not a keyword.
// On success the word is returned and `*inout` is moved past it; otherwise
// NULL is returned and `*inout` is left unchanged.
const char *get_id(const char **inout) {
    const char *cursor = *inout;
    const char *candidate = get_word(&cursor);
    if (candidate == NULL) return NULL;
    if (is_keyword(candidate)) return NULL;
    *inout = cursor;
    return candidate;
}
104 PUREFUNC const char *eol(const char *str) {
105 return str + strcspn(str, "\r\n");
108 bool comment(parse_ctx_t *ctx, const char **pos) {
109 if ((*pos)[0] == '#') {
110 const char *start = *pos;
111 *pos += strcspn(*pos, "\r\n");
112 const char *end = *pos;
113 Table$set(&ctx->comments, &start, &end, parse_comments_info);
114 return true;
115 } else {
116 return false;
120 PUREFUNC int64_t get_indent(parse_ctx_t *ctx, const char *pos) {
121 int64_t line_num = get_line_number(ctx->file, pos);
122 const char *line = get_line(ctx->file, line_num);
123 if (line == NULL) {
124 return 0;
125 } else if (*line == ' ') {
126 int64_t spaces = (int64_t)strspn(line, " ");
127 if (line[spaces] == '\t')
128 parser_err(ctx, line + spaces, line + spaces + 1,
129 "This is a tab following spaces, and you can't mix tabs and spaces");
130 return spaces;
131 } else if (*line == '\t') {
132 int64_t indent = (int64_t)strspn(line, "\t");
133 if (line[indent] == ' ')
134 parser_err(ctx, line + indent, line + indent + 1,
135 "This is a space following tabs, and you can't mix tabs and spaces");
136 return indent * SPACES_PER_INDENT;
137 } else {
138 return 0;
142 bool indent(parse_ctx_t *ctx, const char **out) {
143 const char *pos = *out;
144 int64_t starting_indent = get_indent(ctx, pos);
145 whitespace(ctx, &pos);
146 const char *next_line = get_line(ctx->file, get_line_number(ctx->file, pos));
147 if (next_line <= *out) return false;
149 if (get_indent(ctx, next_line) != starting_indent + SPACES_PER_INDENT) return false;
151 *out = next_line + strspn(next_line, " \t");
152 return true;
155 bool newline_with_indentation(const char **out, int64_t target) {
156 const char *pos = *out;
157 if (*pos == '\r') ++pos;
158 if (*pos != '\n') return false;
159 ++pos;
160 if (*pos == '\r' || *pos == '\n' || *pos == '\0') {
161 // Empty line
162 *out = pos;
163 return true;
166 if (*pos == ' ') {
167 if ((int64_t)strspn(pos, " ") >= target) {
168 *out = pos + target;
169 return true;
171 } else if ((int64_t)strspn(pos, "\t") * SPACES_PER_INDENT >= target) {
172 *out = pos + target / SPACES_PER_INDENT;
173 return true;
175 return false;
179 // Convert an escape sequence like \n to a string
181 #ifdef __GNUC__
182 #pragma GCC diagnostic push
183 #pragma GCC diagnostic ignored "-Wstack-protector"
184 #endif
185 const char *unescape(parse_ctx_t *ctx, const char **out) {
186 const char **endpos = out;
187 const char *escape = *out;
188 static const char *unescapes[256] = {['a'] = "\a", ['b'] = "\b", ['e'] = "\x1b", ['f'] = "\f", ['n'] = "\n",
189 ['r'] = "\r", ['t'] = "\t", ['v'] = "\v", ['_'] = " "};
190 assert(*escape == '\\');
191 if (unescapes[(int)escape[1]]) {
192 *endpos = escape + 2;
193 return GC_strdup(unescapes[(int)escape[1]]);
194 } else if (escape[1] == '[') {
195 // ANSI Control Sequence Indicator: \033 [ ... m
196 size_t len = strcspn(&escape[2], "\r\n]");
197 if (escape[2 + len] != ']') parser_err(ctx, escape, escape + 2 + len, "Missing closing ']'");
198 *endpos = escape + 3 + len;
199 return String("\033[", string_slice(&escape[2], len), "m");
200 } else if (escape[1] == '{') {
201 // Unicode codepoints by name
202 size_t len = strcspn(&escape[2], "\r\n}");
203 if (escape[2 + len] != '}') parser_err(ctx, escape, escape + 2 + len, "Missing closing '}'");
204 char name[len + 1];
205 memcpy(name, &escape[2], len);
206 name[len] = '\0';
208 if (name[0] == 'U') {
209 for (char *p = &name[1]; *p; p++) {
210 if (!isxdigit(*p)) goto look_up_unicode_name;
212 // Unicode codepoints by hex
213 char *endptr = NULL;
214 long codepoint = strtol(name + 1, &endptr, 16);
215 uint32_t ustr[2] = {codepoint, 0};
216 size_t bufsize = 8;
217 uint8_t buf[bufsize];
218 (void)u32_to_u8(ustr, bufsize, buf, &bufsize);
219 *endpos = escape + 3 + len;
220 return GC_strndup((char *)buf, bufsize);
223 look_up_unicode_name:;
225 uint32_t codepoint = unicode_name_character(name);
226 if (codepoint == UNINAME_INVALID)
227 parser_err(ctx, escape, escape + 3 + len, "Invalid unicode codepoint name: ", quoted(name));
228 *endpos = escape + 3 + len;
229 char *str = GC_MALLOC_ATOMIC(16);
230 size_t u8_len = 16;
231 (void)u32_to_u8(&codepoint, 1, (uint8_t *)str, &u8_len);
232 str[u8_len] = '\0';
233 return str;
234 } else if (escape[1] == 'x' && escape[2] && escape[3]) {
235 // ASCII 2-digit hex
236 char buf[] = {escape[2], escape[3], 0};
237 char c = (char)strtol(buf, NULL, 16);
238 *endpos = escape + 4;
239 return GC_strndup(&c, 1);
240 } else if ('0' <= escape[1] && escape[1] <= '7' && '0' <= escape[2] && escape[2] <= '7' && '0' <= escape[3]
241 && escape[3] <= '7') {
242 char buf[] = {escape[1], escape[2], escape[3], 0};
243 char c = (char)strtol(buf, NULL, 8);
244 *endpos = escape + 4;
245 return GC_strndup(&c, 1);
246 } else {
247 *endpos = escape + 2;
248 return GC_strndup(escape + 1, 1);
251 #ifdef __GNUC__
252 #pragma GCC diagnostic pop
253 #endif
255 bool match_separator(parse_ctx_t *ctx, const char **pos) { // Either comma or newline
256 const char *p = *pos;
257 int separators = 0;
258 for (;;) {
259 if (some_of(&p, "\r\n,")) ++separators;
260 else if (!comment(ctx, &p) && !some_of(&p, " \t")) break;
262 if (separators > 0) {
263 *pos = p;
264 return true;
265 } else {
266 return false;