diff options
| author | Bruce Hill <bruce@bruce-hill.com> | 2020-09-25 08:41:29 -0700 |
|---|---|---|
| committer | Bruce Hill <bruce@bruce-hill.com> | 2020-09-25 08:41:29 -0700 |
| commit | 49c05d01b76787c43d6e5b1b11e1a949c1a0d625 (patch) | |
| tree | 2966344b570ace8eb9e160bff17ac11824625174 /vm.c | |
| parent | 437425e3483cdb7fc5145232853a27a1e66cfeb2 (diff) | |
Some light unicode support
Diffstat (limited to 'vm.c')
| -rw-r--r-- | vm.c | 21 |
1 file changed, 19 insertions, 2 deletions
```diff
@@ -35,6 +35,22 @@ static const char *opcode_names[] = {
     [VM_NODENT] = "NODENT",
 };
 
+static inline const char *next_char(file_t *f, const char *str)
+{
+    char c = *str;
+    ++str;
+    if (__builtin_expect(!(c & 0x80), 1))
+        return str;
+
+    if (__builtin_expect(str < f->end && !!(*str & 0x80), 1))
+        ++str;
+    if (c > '\xDF' && __builtin_expect(str < f->end && !!(*str & 0x80), 1))
+        ++str;
+    if (c > '\xEF' && __builtin_expect(str < f->end && !!(*str & 0x80), 1))
+        ++str;
+    return str;
+}
+
 const char *opcode_name(enum VMOpcode o)
 {
     return opcode_names[o];
@@ -88,7 +104,7 @@ static match_t *_match(grammar_t *g, file_t *f, const char *str, vm_op_t *op, un
             match_t *m = calloc(sizeof(match_t), 1);
             m->op = op;
             m->start = str;
-            m->end = str+1;
+            m->end = next_char(f, str);
             return m;
         }
         case VM_STRING: {
@@ -138,7 +154,8 @@ static match_t *_match(grammar_t *g, file_t *f, const char *str, vm_op_t *op, un
                 // This isn't in the for() structure because there needs to
                 // be at least once chance to match the pattern, even if
                 // we're at the end of the string already (e.g. "..$").
-                if (str < f->end && (op->multiline || *str != '\n')) ++str;
+                if (str < f->end && (op->multiline || *str != '\n'))
+                    str = next_char(f, str);
             }
             destroy_match(&m);
             return NULL;
```
