utils.c - tomo

(268 lines)
   1 // Some common parsing utilities
   2 
   3 #include <stdint.h>
   4 
   5 #include "../unistr-fixed.h"
   6 #include <unictype.h>
   7 #include <uniname.h>
   8 
   9 #include "../stdlib/tables.h"
  10 #include "../stdlib/util.h"
  11 #include "errors.h"
  12 #include "utils.h"
  13 
  14 static const char *keywords[] = {
  15     "C_code", "_max_", "_min_", "and",    "assert", "break",  "continue", "defer", "do",    "else", "enum", "for",
  16     "func",   "if",    "in",    "lang",   "mod",    "mod1",   "no",       "none",  "not",   "or",   "pass", "return",
  17     "skip",   "skip",  "stop",  "struct", "then",   "unless", "use",      "when",  "while", "xor",  "yes",
  18 };
  19 
  20 CONSTFUNC bool is_keyword(const char *word) {
  21     int64_t lo = 0, hi = sizeof(keywords) / sizeof(keywords[0]) - 1;
  22     while (lo <= hi) {
  23         int64_t mid = (lo + hi) / 2;
  24         int32_t cmp = strcmp(word, keywords[mid]);
  25         if (cmp == 0) return true;
  26         else if (cmp > 0) lo = mid + 1;
  27         else if (cmp < 0) hi = mid - 1;
  28     }
  29     return false;
  30 }
  31 
  32 size_t some_of(const char **pos, const char *allow) {
  33     size_t len = strspn(*pos, allow);
  34     *pos += len;
  35     return len;
  36 }
  37 
  38 size_t some_not(const char **pos, const char *forbid) {
  39     size_t len = strcspn(*pos, forbid);
  40     *pos += len;
  41     return len;
  42 }
  43 
  44 size_t spaces(const char **pos) {
  45     return some_of(pos, " \t");
  46 }
  47 
  48 void whitespace(parse_ctx_t *ctx, const char **pos) {
  49     while (some_of(pos, " \t\r\n") || comment(ctx, pos))
  50         continue;
  51 }
  52 
  53 size_t match(const char **pos, const char *target) {
  54     size_t len = strlen(target);
  55     if (strncmp(*pos, target, len) != 0) return 0;
  56     *pos += len;
  57     return len;
  58 }
  59 
  60 bool is_xid_start_next(const char *pos) {
  61     ucs4_t point = 0;
  62     u8_next(&point, (const uint8_t *)pos);
  63     return uc_is_property_xid_start(point);
  64 }
  65 
  66 bool is_xid_continue_next(const char *pos) {
  67     ucs4_t point = 0;
  68     u8_next(&point, (const uint8_t *)pos);
  69     return uc_is_property_xid_continue(point);
  70 }
  71 
  72 size_t match_word(const char **out, const char *word) {
  73     const char *pos = *out;
  74     spaces(&pos);
  75     if (!match(&pos, word) || is_xid_continue_next(pos)) return 0;
  76 
  77     *out = pos;
  78     return strlen(word);
  79 }
  80 
  81 const char *get_word(const char **inout) {
  82     const char *word = *inout;
  83     spaces(&word);
  84     const uint8_t *pos = (const uint8_t *)word;
  85     ucs4_t point;
  86     pos = u8_next(&point, pos);
  87     if (!uc_is_property_xid_start(point) && point != '_') return NULL;
  88 
  89     for (const uint8_t *next; (next = u8_next(&point, pos)); pos = next) {
  90         if (!uc_is_property_xid_continue(point)) break;
  91     }
  92     *inout = (const char *)pos;
  93     return GC_strndup(word, (size_t)((const char *)pos - word));
  94 }
  95 
  96 const char *get_id(const char **inout) {
  97     const char *pos = *inout;
  98     const char *word = get_word(&pos);
  99     if (!word || is_keyword(word)) return NULL;
 100     *inout = pos;
 101     return word;
 102 }
 103 
 104 PUREFUNC const char *eol(const char *str) {
 105     return str + strcspn(str, "\r\n");
 106 }
 107 
 108 bool comment(parse_ctx_t *ctx, const char **pos) {
 109     if ((*pos)[0] == '#') {
 110         const char *start = *pos;
 111         *pos += strcspn(*pos, "\r\n");
 112         const char *end = *pos;
 113         Table$set(&ctx->comments, &start, &end, parse_comments_info);
 114         return true;
 115     } else {
 116         return false;
 117     }
 118 }
 119 
 120 PUREFUNC int64_t get_indent(parse_ctx_t *ctx, const char *pos) {
 121     int64_t line_num = get_line_number(ctx->file, pos);
 122     const char *line = get_line(ctx->file, line_num);
 123     if (line == NULL) {
 124         return 0;
 125     } else if (*line == ' ') {
 126         int64_t spaces = (int64_t)strspn(line, " ");
 127         if (line[spaces] == '\t')
 128             parser_err(ctx, line + spaces, line + spaces + 1,
 129                        "This is a tab following spaces, and you can't mix tabs and spaces");
 130         return spaces;
 131     } else if (*line == '\t') {
 132         int64_t indent = (int64_t)strspn(line, "\t");
 133         if (line[indent] == ' ')
 134             parser_err(ctx, line + indent, line + indent + 1,
 135                        "This is a space following tabs, and you can't mix tabs and spaces");
 136         return indent * SPACES_PER_INDENT;
 137     } else {
 138         return 0;
 139     }
 140 }
 141 
 142 bool indent(parse_ctx_t *ctx, const char **out) {
 143     const char *pos = *out;
 144     int64_t starting_indent = get_indent(ctx, pos);
 145     whitespace(ctx, &pos);
 146     const char *next_line = get_line(ctx->file, get_line_number(ctx->file, pos));
 147     if (next_line <= *out) return false;
 148 
 149     if (get_indent(ctx, next_line) != starting_indent + SPACES_PER_INDENT) return false;
 150 
 151     *out = next_line + strspn(next_line, " \t");
 152     return true;
 153 }
 154 
 155 bool newline_with_indentation(const char **out, int64_t target) {
 156     const char *pos = *out;
 157     if (*pos == '\r') ++pos;
 158     if (*pos != '\n') return false;
 159     ++pos;
 160     if (*pos == '\r' || *pos == '\n' || *pos == '\0') {
 161         // Empty line
 162         *out = pos;
 163         return true;
 164     }
 165 
 166     if (*pos == ' ') {
 167         if ((int64_t)strspn(pos, " ") >= target) {
 168             *out = pos + target;
 169             return true;
 170         }
 171     } else if ((int64_t)strspn(pos, "\t") * SPACES_PER_INDENT >= target) {
 172         *out = pos + target / SPACES_PER_INDENT;
 173         return true;
 174     }
 175     return false;
 176 }
 177 
 178 //
 179 // Convert an escape sequence like \n to a string
 180 //
 181 #ifdef __GNUC__
 182 #pragma GCC diagnostic push
 183 #pragma GCC diagnostic ignored "-Wstack-protector"
 184 #endif
 185 const char *unescape(parse_ctx_t *ctx, const char **out) {
 186     const char **endpos = out;
 187     const char *escape = *out;
 188     static const char *unescapes[256] = {['a'] = "\a", ['b'] = "\b", ['e'] = "\x1b", ['f'] = "\f", ['n'] = "\n",
 189                                          ['r'] = "\r", ['t'] = "\t", ['v'] = "\v",   ['_'] = " "};
 190     assert(*escape == '\\');
 191     if (unescapes[(int)escape[1]]) {
 192         *endpos = escape + 2;
 193         return GC_strdup(unescapes[(int)escape[1]]);
 194     } else if (escape[1] == '[') {
 195         // ANSI Control Sequence Indicator: \033 [ ... m
 196         size_t len = strcspn(&escape[2], "\r\n]");
 197         if (escape[2 + len] != ']') parser_err(ctx, escape, escape + 2 + len, "Missing closing ']'");
 198         *endpos = escape + 3 + len;
 199         return String("\033[", string_slice(&escape[2], len), "m");
 200     } else if (escape[1] == '{') {
 201         // Unicode codepoints by name
 202         size_t len = strcspn(&escape[2], "\r\n}");
 203         if (escape[2 + len] != '}') parser_err(ctx, escape, escape + 2 + len, "Missing closing '}'");
 204         char name[len + 1];
 205         memcpy(name, &escape[2], len);
 206         name[len] = '\0';
 207 
 208         if (name[0] == 'U') {
 209             for (char *p = &name[1]; *p; p++) {
 210                 if (!isxdigit(*p)) goto look_up_unicode_name;
 211             }
 212             // Unicode codepoints by hex
 213             char *endptr = NULL;
 214             long codepoint = strtol(name + 1, &endptr, 16);
 215             uint32_t ustr[2] = {codepoint, 0};
 216             size_t bufsize = 8;
 217             uint8_t buf[bufsize];
 218             (void)u32_to_u8(ustr, bufsize, buf, &bufsize);
 219             *endpos = escape + 3 + len;
 220             return GC_strndup((char *)buf, bufsize);
 221         }
 222 
 223     look_up_unicode_name:;
 224 
 225         uint32_t codepoint = unicode_name_character(name);
 226         if (codepoint == UNINAME_INVALID)
 227             parser_err(ctx, escape, escape + 3 + len, "Invalid unicode codepoint name: ", quoted(name));
 228         *endpos = escape + 3 + len;
 229         char *str = GC_MALLOC_ATOMIC(16);
 230         size_t u8_len = 16;
 231         (void)u32_to_u8(&codepoint, 1, (uint8_t *)str, &u8_len);
 232         str[u8_len] = '\0';
 233         return str;
 234     } else if (escape[1] == 'x' && escape[2] && escape[3]) {
 235         // ASCII 2-digit hex
 236         char buf[] = {escape[2], escape[3], 0};
 237         char c = (char)strtol(buf, NULL, 16);
 238         *endpos = escape + 4;
 239         return GC_strndup(&c, 1);
 240     } else if ('0' <= escape[1] && escape[1] <= '7' && '0' <= escape[2] && escape[2] <= '7' && '0' <= escape[3]
 241                && escape[3] <= '7') {
 242         char buf[] = {escape[1], escape[2], escape[3], 0};
 243         char c = (char)strtol(buf, NULL, 8);
 244         *endpos = escape + 4;
 245         return GC_strndup(&c, 1);
 246     } else {
 247         *endpos = escape + 2;
 248         return GC_strndup(escape + 1, 1);
 249     }
 250 }
 251 #ifdef __GNUC__
 252 #pragma GCC diagnostic pop
 253 #endif
 254 
 255 bool match_separator(parse_ctx_t *ctx, const char **pos) { // Either comma or newline
 256     const char *p = *pos;
 257     int separators = 0;
 258     for (;;) {
 259         if (some_of(&p, "\r\n,")) ++separators;
 260         else if (!comment(ctx, &p) && !some_of(&p, " \t")) break;
 261     }
 262     if (separators > 0) {
 263         *pos = p;
 264         return true;
 265     } else {
 266         return false;
 267     }
 268 }