diff --git a/bp.c b/bp.c index b1ae071..3adc5f2 100644 --- a/bp.c +++ b/bp.c @@ -562,4 +562,4 @@ int main(int argc, char *argv[]) exit(found > 0 ? EXIT_SUCCESS : EXIT_FAILURE); } -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1 +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/definitions.c b/definitions.c index 717e1a5..5309c8b 100644 --- a/definitions.c +++ b/definitions.c @@ -68,4 +68,4 @@ def_t *free_defs(def_t *defs, def_t *stop) return defs; } -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1 +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/definitions.h b/definitions.h index 5788ac0..6dd0920 100644 --- a/definitions.h +++ b/definitions.h @@ -17,4 +17,4 @@ __attribute__((nonnull(1))) def_t *free_defs(def_t *defs, def_t *stop); #endif -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1 +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/explain.c b/explain.c index 2d54234..79ef8c7 100644 --- a/explain.c +++ b/explain.c @@ -65,9 +65,9 @@ static void _explain_matches(match_node_t *firstmatch, int depth, const char *te for (size_t i = 0; i < viz_typelen; i++) { switch (viz_type[i]) { - case '\n': printf("↵"); break; - case '\t': printf("⇥"); break; - default: printf("%c", viz_type[i]); break; + case '\n': printf("↵"); break; + case '\t': printf("⇥"); break; + default: printf("%c", viz_type[i]); break; } } @@ -169,3 +169,5 @@ void explain_match(match_t *m) _explain_matches(&first, 0, m->start, (size_t)(m->end - m->start)); printf("\033[?7h"); // Re-enable line wrapping } + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/explain.h b/explain.h index 75cf184..dab52b1 100644 --- a/explain.h +++ b/explain.h @@ -10,4 +10,4 @@ __attribute__((nonnull)) void explain_match(match_t *m); #endif -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1 +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/files.c b/files.c index 79d2a03..7e55d27 100644 --- a/files.c +++ b/files.c @@ -383,4 +383,4 @@ void cache_destroy(file_t *f) f->cache.size = 0; } -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1 +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/files.h b/files.h index 375e142..840412b 100644 --- a/files.h +++ b/files.h @@ -53,4 +53,4 @@ __attribute__((nonnull)) void cache_destroy(file_t *f); #endif -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1 +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/json.c b/json.c index 23079a7..e102cfe 100644 --- a/json.c +++ b/json.c @@ -29,11 +29,11 @@ static int _json_match(const char *text, match_t *m, int comma, bool verbose) printf("{\"rule\":\""); for (const char *c = m->pat->start; c < m->pat->end; c++) { switch (*c) { - case '"': printf("\\\""); break; - case '\\': printf("\\\\"); break; - case '\t': printf("\\t"); break; - case '\n': printf("↵"); break; - default: printf("%c", *c); break; + case '"': printf("\\\""); break; + case '\\': printf("\\\\"); break; + case '\t': printf("\\t"); break; + case '\n': printf("↵"); break; + default: printf("%c", *c); break; } } printf("\",\"start\":%ld,\"end\":%ld,\"children\":[", @@ -52,4 +52,4 @@ void json_match(const char *text, match_t *m, bool verbose) (void)_json_match(text, m, 0, verbose); } -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1 +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/json.h b/json.h index 554c88b..4450387 100644 --- a/json.h +++ b/json.h @@ -12,4 +12,4 @@ __attribute__((nonnull)) void json_match(const char *text, match_t *m, bool verbose); #endif -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1 +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/match.c b/match.c index ed26a62..c1b5f49 100644 --- a/match.c +++ b/match.c @@ -106,25 +106,25 @@ static pat_t *first_pat(def_t *defs, pat_t *pat) { for (pat_t *p = pat; p; ) { switch (p->type) { - case BP_BEFORE: - p = p->args.pat; break; - case BP_REPEAT: - if (p->args.repetitions.min == 0) - return p; - p = p->args.repetitions.repeat_pat; break; - case BP_CAPTURE: - p = p->args.capture.capture_pat; break; - case BP_CHAIN: case BP_MATCH: case BP_NOT_MATCH: - p = p->args.multiple.first; break; - case BP_REPLACE: - p = p->args.replace.pat; break; - case BP_REF: { - pat_t *p2 = deref(defs, p); - if (p2 == p) return p2; - p = p2; - break; - } - default: return p; + case BP_BEFORE: + p = p->args.pat; break; + case BP_REPEAT: + if (p->args.repetitions.min == 0) + return p; + p = p->args.repetitions.repeat_pat; break; + case BP_CAPTURE: + p = p->args.capture.capture_pat; break; + case BP_CHAIN: case BP_MATCH: case BP_NOT_MATCH: + p = p->args.multiple.first; break; + case BP_REPLACE: + p = p->args.replace.pat; break; + case BP_REF: { + pat_t *p2 = deref(defs, p); + if (p2 == p) return p2; + p = p2; + break; + } + default: return p; } } return pat; @@ -187,376 +187,376 @@ match_t *next_match(def_t *defs, file_t *f, match_t *prev, pat_t *pat, pat_t *sk static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool ignorecase) { switch (pat->type) { - case BP_DEFINITION: { - def_t *defs2 = with_def(defs, pat->args.def.namelen, pat->args.def.name, pat->args.def.def); - match_t *m = match(defs2, f, str, pat->args.def.pat ? pat->args.def.pat : pat->args.def.def, ignorecase); - defs = free_defs(defs2, defs); - return m; + case BP_DEFINITION: { + def_t *defs2 = with_def(defs, pat->args.def.namelen, pat->args.def.name, pat->args.def.def); + match_t *m = match(defs2, f, str, pat->args.def.pat ? pat->args.def.pat : pat->args.def.def, ignorecase); + defs = free_defs(defs2, defs); + return m; + } + case BP_LEFTRECURSION: { + // Left recursion occurs when a pattern directly or indirectly + // invokes itself at the same position in the text. It's handled as + // a special case, but if a pattern invokes itself at a later + // point, it can be handled with normal recursion. + // See: left-recursion.md for more details. + if (str == pat->args.leftrec.at) { + ++pat->args.leftrec.visits; + return pat->args.leftrec.match; + } else { + return match(defs, f, str, pat->args.leftrec.fallback, ignorecase); } - case BP_LEFTRECURSION: { - // Left recursion occurs when a pattern directly or indirectly - // invokes itself at the same position in the text. It's handled as - // a special case, but if a pattern invokes itself at a later - // point, it can be handled with normal recursion. - // See: left-recursion.md for more details. - if (str == pat->args.leftrec.at) { - ++pat->args.leftrec.visits; - return pat->args.leftrec.match; - } else { - return match(defs, f, str, pat->args.leftrec.fallback, ignorecase); - } - } - case BP_ANYCHAR: { - return (str < f->end && *str != '\n') ? new_match(defs, pat, str, next_char(f, str), NULL) : NULL; - } - case BP_ID_START: { - return (str < f->end && isidstart(f, str)) ? new_match(defs, pat, str, next_char(f, str), NULL) : NULL; - } - case BP_ID_CONTINUE: { - return (str < f->end && isidcontinue(f, str)) ? new_match(defs, pat, str, next_char(f, str), NULL) : NULL; - } - case BP_START_OF_FILE: { - return (str == f->start) ? new_match(defs, pat, str, str, NULL) : NULL; - } - case BP_START_OF_LINE: { - return (str == f->start || str[-1] == '\n') ? new_match(defs, pat, str, str, NULL) : NULL; - } - case BP_END_OF_FILE: { - return (str == f->end) ? new_match(defs, pat, str, str, NULL) : NULL; - } - case BP_END_OF_LINE: { - return (str == f->end || *str == '\n') ? new_match(defs, pat, str, str, NULL) : NULL; - } - case BP_WORD_BOUNDARY: { - return (str == f->start || isidcontinue(f, str) != isidcontinue(f, prev_char(f, str))) ? new_match(defs, pat, str, str, NULL) : NULL; - } - case BP_STRING: { - if (&str[pat->min_matchlen] > f->end) return NULL; - if (pat->min_matchlen > 0 && (ignorecase ? memicmp : memcmp)(str, pat->args.string, pat->min_matchlen) != 0) - return NULL; - return new_match(defs, pat, str, str + pat->min_matchlen, NULL); - } - case BP_RANGE: { - if (str >= f->end) return NULL; - if ((unsigned char)*str < pat->args.range.low || (unsigned char)*str > pat->args.range.high) - return NULL; - return new_match(defs, pat, str, str+1, NULL); - } - case BP_NOT: { - match_t *m = match(defs, f, str, pat->args.pat, ignorecase); - if (m != NULL) { - recycle_if_unused(&m); - return NULL; - } - return new_match(defs, pat, str, str, NULL); - } - case BP_UPTO: case BP_UPTO_STRICT: { - match_t *m = new_match(defs, pat, str, str, NULL); - pat_t *target = deref(defs, pat->args.multiple.first), - *skip = deref(defs, pat->args.multiple.second); - if (!target && !skip) { - while (str < f->end && *str != '\n') ++str; - m->end = str; - return m; - } - - size_t child_cap = 0, nchildren = 0; - for (const char *prev = NULL; prev < str; ) { - prev = str; - if (target) { - match_t *p = match(defs, f, str, target, ignorecase); - if (p != NULL) { - recycle_if_unused(&p); - m->end = str; - return m; - } - } else if (str == f->end) { - m->end = str; - return m; - } - if (skip) { - match_t *s = match(defs, f, str, skip, ignorecase); - if (s != NULL) { - str = s->end; - if (nchildren+2 >= child_cap) { - m->children = grow(m->children, child_cap += 5); - for (size_t i = nchildren; i < child_cap; i++) m->children[i] = NULL; - } - add_owner(&m->children[nchildren++], s); - continue; - } - } - // This isn't in the for() structure because there needs to - // be at least once chance to match the pattern, even if - // we're at the end of the string already (e.g. "..$"). - if (str < f->end && *str != '\n' && pat->type != BP_UPTO_STRICT) - str = next_char(f, str); - } + } + case BP_ANYCHAR: { + return (str < f->end && *str != '\n') ? new_match(defs, pat, str, next_char(f, str), NULL) : NULL; + } + case BP_ID_START: { + return (str < f->end && isidstart(f, str)) ? new_match(defs, pat, str, next_char(f, str), NULL) : NULL; + } + case BP_ID_CONTINUE: { + return (str < f->end && isidcontinue(f, str)) ? new_match(defs, pat, str, next_char(f, str), NULL) : NULL; + } + case BP_START_OF_FILE: { + return (str == f->start) ? new_match(defs, pat, str, str, NULL) : NULL; + } + case BP_START_OF_LINE: { + return (str == f->start || str[-1] == '\n') ? new_match(defs, pat, str, str, NULL) : NULL; + } + case BP_END_OF_FILE: { + return (str == f->end || (str == f->end-1 && *str == '\n')) ? new_match(defs, pat, str, str, NULL) : NULL; + } + case BP_END_OF_LINE: { + return (str == f->end || *str == '\n') ? new_match(defs, pat, str, str, NULL) : NULL; + } + case BP_WORD_BOUNDARY: { + return (str == f->start || isidcontinue(f, str) != isidcontinue(f, prev_char(f, str))) ? new_match(defs, pat, str, str, NULL) : NULL; + } + case BP_STRING: { + if (&str[pat->min_matchlen] > f->end) return NULL; + if (pat->min_matchlen > 0 && (ignorecase ? memicmp : memcmp)(str, pat->args.string, pat->min_matchlen) != 0) + return NULL; + return new_match(defs, pat, str, str + pat->min_matchlen, NULL); + } + case BP_RANGE: { + if (str >= f->end) return NULL; + if ((unsigned char)*str < pat->args.range.low || (unsigned char)*str > pat->args.range.high) + return NULL; + return new_match(defs, pat, str, str+1, NULL); + } + case BP_NOT: { + match_t *m = match(defs, f, str, pat->args.pat, ignorecase); + if (m != NULL) { recycle_if_unused(&m); return NULL; } - case BP_REPEAT: { - match_t *m = new_match(defs, pat, str, str, NULL); - size_t reps = 0; - ssize_t max = pat->args.repetitions.max; - pat_t *repeating = deref(defs, pat->args.repetitions.repeat_pat); - pat_t *sep = deref(defs, pat->args.repetitions.sep); - size_t child_cap = 0, nchildren = 0; - for (reps = 0; max == -1 || reps < (size_t)max; ++reps) { - const char *start = str; - // Separator - match_t *msep = NULL; - if (sep != NULL && reps > 0) { - msep = match(defs, f, str, sep, ignorecase); - if (msep == NULL) break; - str = msep->end; + return new_match(defs, pat, str, str, NULL); + } + case BP_UPTO: case BP_UPTO_STRICT: { + match_t *m = new_match(defs, pat, str, str, NULL); + pat_t *target = deref(defs, pat->args.multiple.first), + *skip = deref(defs, pat->args.multiple.second); + if (!target && !skip) { + while (str < f->end && *str != '\n') ++str; + m->end = str; + return m; + } + + size_t child_cap = 0, nchildren = 0; + for (const char *prev = NULL; prev < str; ) { + prev = str; + if (target) { + match_t *p = match(defs, f, str, target, ignorecase); + if (p != NULL) { + recycle_if_unused(&p); + m->end = str; + return m; } - match_t *mp = match(defs, f, str, repeating, ignorecase); - if (mp == NULL) { - str = start; - if (msep) recycle_if_unused(&msep); - break; - } - if (mp->end == start && reps > 0) { - // Since no forward progress was made on either `repeating` - // or `sep` and BP does not have mutable state, it's - // guaranteed that no progress will be made on the next - // loop either. We know that this will continue to loop - // until reps==max, so let's just cut to the chase instead - // of looping infinitely. - if (msep) recycle_if_unused(&msep); - recycle_if_unused(&mp); - if (pat->args.repetitions.max == -1) - reps = ~(size_t)0; - else - reps = (size_t)pat->args.repetitions.max; - break; - } - if (msep) { + } else if (str == f->end) { + m->end = str; + return m; + } + if (skip) { + match_t *s = match(defs, f, str, skip, ignorecase); + if (s != NULL) { + str = s->end; if (nchildren+2 >= child_cap) { m->children = grow(m->children, child_cap += 5); for (size_t i = nchildren; i < child_cap; i++) m->children[i] = NULL; } - add_owner(&m->children[nchildren++], msep); + add_owner(&m->children[nchildren++], s); + continue; } - + } + // This isn't in the for() structure because there needs to + // be at least once chance to match the pattern, even if + // we're at the end of the string already (e.g. "..$"). + if (str < f->end && *str != '\n' && pat->type != BP_UPTO_STRICT) + str = next_char(f, str); + } + recycle_if_unused(&m); + return NULL; + } + case BP_REPEAT: { + match_t *m = new_match(defs, pat, str, str, NULL); + size_t reps = 0; + ssize_t max = pat->args.repetitions.max; + pat_t *repeating = deref(defs, pat->args.repetitions.repeat_pat); + pat_t *sep = deref(defs, pat->args.repetitions.sep); + size_t child_cap = 0, nchildren = 0; + for (reps = 0; max == -1 || reps < (size_t)max; ++reps) { + const char *start = str; + // Separator + match_t *msep = NULL; + if (sep != NULL && reps > 0) { + msep = match(defs, f, str, sep, ignorecase); + if (msep == NULL) break; + str = msep->end; + } + match_t *mp = match(defs, f, str, repeating, ignorecase); + if (mp == NULL) { + str = start; + if (msep) recycle_if_unused(&msep); + break; + } + if (mp->end == start && reps > 0) { + // Since no forward progress was made on either `repeating` + // or `sep` and BP does not have mutable state, it's + // guaranteed that no progress will be made on the next + // loop either. We know that this will continue to loop + // until reps==max, so let's just cut to the chase instead + // of looping infinitely. + if (msep) recycle_if_unused(&msep); + recycle_if_unused(&mp); + if (pat->args.repetitions.max == -1) + reps = ~(size_t)0; + else + reps = (size_t)pat->args.repetitions.max; + break; + } + if (msep) { if (nchildren+2 >= child_cap) { m->children = grow(m->children, child_cap += 5); for (size_t i = nchildren; i < child_cap; i++) m->children[i] = NULL; } - add_owner(&m->children[nchildren++], mp); - str = mp->end; + add_owner(&m->children[nchildren++], msep); } - if (reps < (size_t)pat->args.repetitions.min) { - recycle_if_unused(&m); - return NULL; + if (nchildren+2 >= child_cap) { + m->children = grow(m->children, child_cap += 5); + for (size_t i = nchildren; i < child_cap; i++) m->children[i] = NULL; } - m->end = str; - return m; + add_owner(&m->children[nchildren++], mp); + str = mp->end; } - case BP_AFTER: { - pat_t *back = deref(defs, pat->args.pat); - if (!back) return NULL; - // We only care about the region from the backtrack pos up to the - // current pos, so mock it out as a file slice. - // TODO: this breaks ^/^^/$/$$, but that can probably be ignored - // because you rarely need to check those in a backtrack. - file_t slice; - slice_file(&slice, f, f->start, str); - for (const char *pos = &str[-(long)back->min_matchlen]; - pos >= f->start && (back->max_matchlen == -1 || pos >= &str[-(int)back->max_matchlen]); - pos = prev_char(f, pos)) { - cache_destroy(&slice); - slice.start = (char*)pos; - match_t *m = match(defs, &slice, pos, back, ignorecase); - // Match should not go past str (i.e. (<"AB" "B") should match "ABB", but not "AB") - if (m && m->end != str) - recycle_if_unused(&m); - else if (m) { - cache_destroy(&slice); - return new_match(defs, pat, str, str, MATCHES(m)); - } - if (pos == f->start) break; - // To prevent extreme performance degradation, don't keep - // walking backwards endlessly over newlines. - if (back->max_matchlen == -1 && *pos == '\n') break; - } - cache_destroy(&slice); + if (reps < (size_t)pat->args.repetitions.min) { + recycle_if_unused(&m); return NULL; } - case BP_BEFORE: { - match_t *after = match(defs, f, str, pat->args.pat, ignorecase); - return after ? new_match(defs, pat, str, str, MATCHES(after)) : NULL; - } - case BP_CAPTURE: { - match_t *p = match(defs, f, str, pat->args.pat, ignorecase); - return p ? new_match(defs, pat, str, p->end, MATCHES(p)) : NULL; - } - case BP_OTHERWISE: { - match_t *m = match(defs, f, str, pat->args.multiple.first, ignorecase); - return m ? m : match(defs, f, str, pat->args.multiple.second, ignorecase); - } - case BP_CHAIN: { - match_t *m1 = match(defs, f, str, pat->args.multiple.first, ignorecase); - if (m1 == NULL) return NULL; + m->end = str; + return m; + } + case BP_AFTER: { + pat_t *back = deref(defs, pat->args.pat); + if (!back) return NULL; - match_t *m2; - // Push backrefs and run matching, then cleanup - if (m1->pat->type == BP_CAPTURE && m1->pat->args.capture.name) { - // Temporarily add a rule that the backref name matches the - // exact string of the original match (no replacements) - size_t len = (size_t)(m1->end - m1->start); - pat_t *backref = new_pat(f, m1->start, m1->end, len, (ssize_t)len, BP_STRING); - backref->args.string = m1->start; + // We only care about the region from the backtrack pos up to the + // current pos, so mock it out as a file slice. + // TODO: this breaks ^/^^/$/$$, but that can probably be ignored + // because you rarely need to check those in a backtrack. + file_t slice; + slice_file(&slice, f, f->start, str); + for (const char *pos = &str[-(long)back->min_matchlen]; + pos >= f->start && (back->max_matchlen == -1 || pos >= &str[-(int)back->max_matchlen]); + pos = prev_char(f, pos)) { + cache_destroy(&slice); + slice.start = (char*)pos; + match_t *m = match(defs, &slice, pos, back, ignorecase); + // Match should not go past str (i.e. (<"AB" "B") should match "ABB", but not "AB") + if (m && m->end != str) + recycle_if_unused(&m); + else if (m) { + cache_destroy(&slice); + return new_match(defs, pat, str, str, MATCHES(m)); + } + if (pos == f->start) break; + // To prevent extreme performance degradation, don't keep + // walking backwards endlessly over newlines. + if (back->max_matchlen == -1 && *pos == '\n') break; + } + cache_destroy(&slice); + return NULL; + } + case BP_BEFORE: { + match_t *after = match(defs, f, str, pat->args.pat, ignorecase); + return after ? new_match(defs, pat, str, str, MATCHES(after)) : NULL; + } + case BP_CAPTURE: { + match_t *p = match(defs, f, str, pat->args.pat, ignorecase); + return p ? new_match(defs, pat, str, p->end, MATCHES(p)) : NULL; + } + case BP_OTHERWISE: { + match_t *m = match(defs, f, str, pat->args.multiple.first, ignorecase); + return m ? m : match(defs, f, str, pat->args.multiple.second, ignorecase); + } + case BP_CHAIN: { + match_t *m1 = match(defs, f, str, pat->args.multiple.first, ignorecase); + if (m1 == NULL) return NULL; - def_t *defs2 = with_def(defs, m1->pat->args.capture.namelen, m1->pat->args.capture.name, backref); - ++m1->refcount; { - m2 = match(defs2, f, m1->end, pat->args.multiple.second, ignorecase); - if (!m2) { // No need to keep the backref in memory if it didn't match - for (pat_t **rem = &f->pats; *rem; rem = &(*rem)->next) { - if ((*rem) == backref) { - pat_t *tmp = *rem; - *rem = (*rem)->next; - free(tmp); - break; - } + match_t *m2; + // Push backrefs and run matching, then cleanup + if (m1->pat->type == BP_CAPTURE && m1->pat->args.capture.name) { + // Temporarily add a rule that the backref name matches the + // exact string of the original match (no replacements) + size_t len = (size_t)(m1->end - m1->start); + pat_t *backref = new_pat(f, m1->start, m1->end, len, (ssize_t)len, BP_STRING); + backref->args.string = m1->start; + + def_t *defs2 = with_def(defs, m1->pat->args.capture.namelen, m1->pat->args.capture.name, backref); + ++m1->refcount; { + m2 = match(defs2, f, m1->end, pat->args.multiple.second, ignorecase); + if (!m2) { // No need to keep the backref in memory if it didn't match + for (pat_t **rem = &f->pats; *rem; rem = &(*rem)->next) { + if ((*rem) == backref) { + pat_t *tmp = *rem; + *rem = (*rem)->next; + free(tmp); + break; } } - defs = free_defs(defs2, defs); - } --m1->refcount; - } else { - m2 = match(defs, f, m1->end, pat->args.multiple.second, ignorecase); - } - - if (m2 == NULL) { - recycle_if_unused(&m1); - return NULL; - } - - return new_match(defs, pat, str, m2->end, MATCHES(m1, m2)); - } - case BP_MATCH: case BP_NOT_MATCH: { - match_t *m1 = match(defs, f, str, pat->args.multiple.first, ignorecase); - if (m1 == NULL) return NULL; - - // ~ matches iff the text of matches - // !~ matches iff the text of does not match - file_t slice; - slice_file(&slice, f, m1->start, m1->end); - match_t *m2 = next_match(defs, &slice, NULL, pat->args.multiple.second, NULL, ignorecase); - if ((!m2 && pat->type == BP_MATCH) || (m2 && pat->type == BP_NOT_MATCH)) { - if (m2) recycle_if_unused(&m2); - cache_destroy(&slice); - recycle_if_unused(&m1); - return NULL; - } - cache_destroy(&slice); - return new_match(defs, pat, m1->start, m1->end, (pat->type == BP_MATCH) ? MATCHES(m1, m2) : NULL); - } - case BP_REPLACE: { - match_t *p = NULL; - if (pat->args.replace.pat) { - p = match(defs, f, str, pat->args.replace.pat, ignorecase); - if (p == NULL) return NULL; - } - return new_match(defs, pat, str, p ? p->end : str, MATCHES(p)); - } - case BP_REF: { - match_t *cached; - if (cache_get(f, defs, str, pat, &cached)) - return cached; - - def_t *def = lookup(defs, pat->args.ref.len, pat->args.ref.name); - if (def == NULL) - errx(EXIT_FAILURE, "Unknown identifier: '%.*s'", (int)pat->args.ref.len, pat->args.ref.name); - pat_t *ref = def->pat; - - pat_t rec_op = { - .type = BP_LEFTRECURSION, - .start = ref->start, - .end = ref->end, - .min_matchlen = 0, - .max_matchlen = -1, - .args.leftrec = { - .match = NULL, - .visits = 0, - .at = str, - .fallback = ref, - }, - }; - def_t defs2 = { - .namelen = def->namelen, - .name = def->name, - .pat = &rec_op, - .next = defs, - }; - - const char *prev = str; - match_t *m = match(&defs2, f, str, ref, ignorecase); - if (m == NULL) { - cache_save(f, defs, str, pat, NULL); - return NULL; - } - - while (rec_op.args.leftrec.visits > 0) { - rec_op.args.leftrec.visits = 0; - remove_ownership(&rec_op.args.leftrec.match); - add_owner(&rec_op.args.leftrec.match, m); - prev = m->end; - match_t *m2 = match(&defs2, f, str, ref, ignorecase); - if (m2 == NULL) break; - if (m2->end <= prev) { - recycle_if_unused(&m2); - break; } - m = m2; - } - - // This match wrapper mainly exists for record-keeping purposes. - // However, it also keeps `m` from getting garbage collected with - // leftrec.match is GC'd. It also helps with visualization of match - // results. - // OPTIMIZE: remove this if necessary - match_t *wrap = new_match(defs, pat, m->start, m->end, MATCHES(m)); - cache_save(f, defs, str, pat, wrap); - - if (rec_op.args.leftrec.match) - remove_ownership(&rec_op.args.leftrec.match); - - return wrap; + defs = free_defs(defs2, defs); + } --m1->refcount; + } else { + m2 = match(defs, f, m1->end, pat->args.multiple.second, ignorecase); } - case BP_NODENT: { - if (*str != '\n') return NULL; - const char *start = str; - size_t linenum = get_line_number(f, str); - const char *p = get_line(f, linenum); - if (p < f->start) p = f->start; // Can happen with recursive matching - - // Current indentation: - char denter = *p; - int dents = 0; - if (denter == ' ' || denter == '\t') { - for (; *p == denter && p < f->end; ++p) ++dents; - } - - // Subsequent indentation: - while (*str == '\n' || *str == '\n') ++str; - for (int i = 0; i < dents; i++) - if (&str[i] >= f->end || str[i] != denter) return NULL; - - return new_match(defs, pat, start, &str[dents], NULL); - } - case BP_ERROR: { - match_t *p = pat->args.pat ? match(defs, f, str, pat->args.pat, ignorecase) : NULL; - return p ? new_match(defs, pat, str, p->end, MATCHES(p)) : NULL; - } - default: { - errx(EXIT_FAILURE, "Unknown pattern type: %u", pat->type); + if (m2 == NULL) { + recycle_if_unused(&m1); return NULL; } + + return new_match(defs, pat, str, m2->end, MATCHES(m1, m2)); + } + case BP_MATCH: case BP_NOT_MATCH: { + match_t *m1 = match(defs, f, str, pat->args.multiple.first, ignorecase); + if (m1 == NULL) return NULL; + + // ~ matches iff the text of matches + // !~ matches iff the text of does not match + file_t slice; + slice_file(&slice, f, m1->start, m1->end); + match_t *m2 = next_match(defs, &slice, NULL, pat->args.multiple.second, NULL, ignorecase); + if ((!m2 && pat->type == BP_MATCH) || (m2 && pat->type == BP_NOT_MATCH)) { + if (m2) recycle_if_unused(&m2); + cache_destroy(&slice); + recycle_if_unused(&m1); + return NULL; + } + cache_destroy(&slice); + return new_match(defs, pat, m1->start, m1->end, (pat->type == BP_MATCH) ? MATCHES(m1, m2) : NULL); + } + case BP_REPLACE: { + match_t *p = NULL; + if (pat->args.replace.pat) { + p = match(defs, f, str, pat->args.replace.pat, ignorecase); + if (p == NULL) return NULL; + } + return new_match(defs, pat, str, p ? p->end : str, MATCHES(p)); + } + case BP_REF: { + match_t *cached; + if (cache_get(f, defs, str, pat, &cached)) + return cached; + + def_t *def = lookup(defs, pat->args.ref.len, pat->args.ref.name); + if (def == NULL) + errx(EXIT_FAILURE, "Unknown identifier: '%.*s'", (int)pat->args.ref.len, pat->args.ref.name); + pat_t *ref = def->pat; + + pat_t rec_op = { + .type = BP_LEFTRECURSION, + .start = ref->start, + .end = ref->end, + .min_matchlen = 0, + .max_matchlen = -1, + .args.leftrec = { + .match = NULL, + .visits = 0, + .at = str, + .fallback = ref, + }, + }; + def_t defs2 = { + .namelen = def->namelen, + .name = def->name, + .pat = &rec_op, + .next = defs, + }; + + const char *prev = str; + match_t *m = match(&defs2, f, str, ref, ignorecase); + if (m == NULL) { + cache_save(f, defs, str, pat, NULL); + return NULL; + } + + while (rec_op.args.leftrec.visits > 0) { + rec_op.args.leftrec.visits = 0; + remove_ownership(&rec_op.args.leftrec.match); + add_owner(&rec_op.args.leftrec.match, m); + prev = m->end; + match_t *m2 = match(&defs2, f, str, ref, ignorecase); + if (m2 == NULL) break; + if (m2->end <= prev) { + recycle_if_unused(&m2); + break; + } + m = m2; + } + + // This match wrapper mainly exists for record-keeping purposes. + // However, it also keeps `m` from getting garbage collected with + // leftrec.match is GC'd. It also helps with visualization of match + // results. + // OPTIMIZE: remove this if necessary + match_t *wrap = new_match(defs, pat, m->start, m->end, MATCHES(m)); + cache_save(f, defs, str, pat, wrap); + + if (rec_op.args.leftrec.match) + remove_ownership(&rec_op.args.leftrec.match); + + return wrap; + } + case BP_NODENT: { + if (*str != '\n') return NULL; + const char *start = str; + + size_t linenum = get_line_number(f, str); + const char *p = get_line(f, linenum); + if (p < f->start) p = f->start; // Can happen with recursive matching + + // Current indentation: + char denter = *p; + int dents = 0; + if (denter == ' ' || denter == '\t') { + for (; *p == denter && p < f->end; ++p) ++dents; + } + + // Subsequent indentation: + while (*str == '\n' || *str == '\n') ++str; + for (int i = 0; i < dents; i++) + if (&str[i] >= f->end || str[i] != denter) return NULL; + + return new_match(defs, pat, start, &str[dents], NULL); + } + case BP_ERROR: { + match_t *p = pat->args.pat ? match(defs, f, str, pat->args.pat, ignorecase) : NULL; + return p ? new_match(defs, pat, str, p->end, MATCHES(p)) : NULL; + } + default: { + errx(EXIT_FAILURE, "Unknown pattern type: %u", pat->type); + return NULL; + } } } @@ -703,4 +703,4 @@ size_t free_all_matches(void) return count; } -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1 +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/match.h b/match.h index 5d4bd26..74ae5ca 100644 --- a/match.h +++ b/match.h @@ -22,4 +22,4 @@ size_t free_all_matches(void); size_t recycle_all_matches(void); #endif -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1 +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/pattern.c b/pattern.c index 5e4fb80..7e31bfc 100644 --- a/pattern.c +++ b/pattern.c @@ -234,291 +234,291 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str) char c = *str; str = next_char(f, str); switch (c) { - // Any char (dot) - case '.': { - if (*str == '.') { // ".." - pat_t *skip = NULL; - str = next_char(f, str); - char skipper = *str; - if (matchchar(&str, '%', false) || matchchar(&str, '=', false)) { - skip = bp_simplepattern(f, str); - if (!skip) - file_err(f, str, str, "There should be a pattern to skip here after the '%c'", skipper); - str = skip->end; - } - pat_t *upto = new_pat(f, start, str, 0, -1, skipper == '=' ? BP_UPTO_STRICT : BP_UPTO); - upto->args.multiple.second = skip; - return upto; - } else { - return new_pat(f, start, str, 1, UTF8_MAXCHARLEN, BP_ANYCHAR); - } - } - // Char literals - case '`': { - pat_t *all = NULL; - do { // Comma-separated items: - if (str >= f->end || !*str || *str == '\n') - file_err(f, str, str, "There should be a character here after the '`'"); - - const char *c1_loc = str; - str = next_char(f, c1_loc); - if (*str == '-') { // Range - const char *c2_loc = ++str; - if (next_char(f, c1_loc) > c1_loc+1 || next_char(f, c2_loc) > c2_loc+1) - file_err(f, start, next_char(f, c2_loc), "Sorry, UTF-8 character ranges are not yet supported."); - char c1 = *c1_loc, c2 = *c2_loc; - if (!c2 || c2 == '\n') - file_err(f, str, str, "There should be a character here to complete the character range."); - if (c1 > c2) { // Swap order - char tmp = c1; - c1 = c2; - c2 = tmp; - } - str = next_char(f, c2_loc); - pat_t *pat = new_pat(f, start == c1_loc - 1 ? start : c1_loc, str, 1, 1, BP_RANGE); - pat->args.range.low = (unsigned char)c1; - pat->args.range.high = (unsigned char)c2; - all = either_pat(f, all, pat); - } else { - size_t len = (size_t)(str - c1_loc); - pat_t *pat = new_pat(f, start, str, len, (ssize_t)len, BP_STRING); - pat->args.string = c1_loc; - all = either_pat(f, all, pat); - } - } while (*str++ == ','); - - return all; - } - // Escapes - case '\\': { - if (!*str || *str == '\n') - file_err(f, str, str, "There should be an escape sequence here after this backslash."); - - pat_t *all = NULL; - do { // Comma-separated items: - const char *itemstart = str-1; - if (*str == 'N') { // \N (nodent) - all = either_pat(f, all, new_pat(f, itemstart, ++str, 1, -1, BP_NODENT)); - continue; - } else if (*str == 'i') { // \i (identifier char) - all = either_pat(f, all, new_pat(f, itemstart, ++str, 1, -1, BP_ID_CONTINUE)); - continue; - } else if (*str == 'I') { // \I (identifier char, not including numbers) - all = either_pat(f, all, new_pat(f, itemstart, ++str, 1, -1, BP_ID_START)); - continue; - } else if (*str == 'b') { // \b word boundary - all = either_pat(f, all, new_pat(f, itemstart, ++str, 0, 0, BP_WORD_BOUNDARY)); - continue; - } - - const char *opstart = str; - unsigned char e_low = (unsigned char)unescapechar(str, &str); - if (str == opstart) - file_err(f, start, str+1, "This isn't a valid escape sequence."); - unsigned char e_high = e_low; - if (*str == '-') { // Escape range (e.g. \x00-\xFF) - ++str; - if (next_char(f, str) != str+1) - file_err(f, start, next_char(f, str), "Sorry, UTF8 escape sequences are not supported in ranges."); - const char *seqstart = str; - e_high = (unsigned char)unescapechar(str, &str); - if (str == seqstart) - file_err(f, seqstart, str+1, "This value isn't a valid escape sequence"); - if (e_high < e_low) - file_err(f, start, str, "Escape ranges should be low-to-high, but this is high-to-low."); - } - pat_t *esc = new_pat(f, start, str, 1, 1, BP_RANGE); - esc->args.range.low = e_low; - esc->args.range.high = e_high; - all = either_pat(f, all, esc); - } while (*str++ == ','); - - return all; - } - // Word boundary - case '|': { - return new_pat(f, start, str, 0, 0, BP_WORD_BOUNDARY); - } - // String literal - case '"': case '\'': case '\002': case '{': { - char endquote = c == '\002' ? '\003' : (c == '{' ? '}' : c); - char *litstart = (char*)str; - while (str < f->end && *str != endquote) - str = next_char(f, str); - size_t len = (size_t)(str - litstart); + // Any char (dot) + case '.': { + if (*str == '.') { // ".." + pat_t *skip = NULL; str = next_char(f, str); + char skipper = *str; + if (matchchar(&str, '%', false) || matchchar(&str, '=', false)) { + skip = bp_simplepattern(f, str); + if (!skip) + file_err(f, str, str, "There should be a pattern to skip here after the '%c'", skipper); + str = skip->end; + } + pat_t *upto = new_pat(f, start, str, 0, -1, skipper == '=' ? BP_UPTO_STRICT : BP_UPTO); + upto->args.multiple.second = skip; + return upto; + } else { + return new_pat(f, start, str, 1, UTF8_MAXCHARLEN, BP_ANYCHAR); + } + } + // Char literals + case '`': { + pat_t *all = NULL; + do { // Comma-separated items: + if (str >= f->end || !*str || *str == '\n') + file_err(f, str, str, "There should be a character here after the '`'"); - pat_t *pat = new_pat(f, start, str, len, (ssize_t)len, BP_STRING); - pat->args.string = litstart; - return pat; - } - // Not - case '!': { - pat_t *p = bp_simplepattern(f, str); - if (!p) file_err(f, str, str, "There should be a pattern after this '!'"); - pat_t *not = new_pat(f, start, p->end, 0, 0, BP_NOT); - not->args.pat = p; - return not; - } - // Number of repetitions: (- / - / + / "") - case '0': case '1': case '2': case '3': case '4': case '5': - case '6': case '7': case '8': case '9': { - size_t min = 0; - ssize_t max = -1; - --str; - long n1 = strtol(str, (char**)&str, 10); - if (matchchar(&str, '-', false)) { - str = after_spaces(str, false); - const char *numstart = str; - long n2 = strtol(str, (char**)&str, 10); - if (str == numstart) min = 0, max = (ssize_t)n1; - else min = (size_t)n1, max = (ssize_t)n2; - } else if (matchchar(&str, '+', false)) { - min = (size_t)n1, max = -1; + const char *c1_loc = str; + str = next_char(f, c1_loc); + if (*str == '-') { // Range + const char *c2_loc = ++str; + if (next_char(f, c1_loc) > c1_loc+1 || next_char(f, c2_loc) > c2_loc+1) + file_err(f, start, next_char(f, c2_loc), "Sorry, UTF-8 character ranges are not yet supported."); + char c1 = *c1_loc, c2 = *c2_loc; + if (!c2 || c2 == '\n') + file_err(f, str, str, "There should be a character here to complete the character range."); + if (c1 > c2) { // Swap order + char tmp = c1; + c1 = c2; + c2 = tmp; + } + str = next_char(f, c2_loc); + pat_t *pat = new_pat(f, start == c1_loc - 1 ? start : c1_loc, str, 1, 1, BP_RANGE); + pat->args.range.low = (unsigned char)c1; + pat->args.range.high = (unsigned char)c2; + all = either_pat(f, all, pat); } else { - min = (size_t)n1, max = (ssize_t)n1; + size_t len = (size_t)(str - c1_loc); + pat_t *pat = new_pat(f, start, str, len, (ssize_t)len, BP_STRING); + pat->args.string = c1_loc; + all = either_pat(f, all, pat); } - pat_t *repeating = bp_simplepattern(f, str); - if (!repeating) - file_err(f, str, str, "There should be a pattern after this repetition count."); - str = repeating->end; - pat_t *sep = NULL; - if (matchchar(&str, '%', false)) { - sep = bp_simplepattern(f, str); - if (!sep) - file_err(f, str, str, "There should be a separator pattern after this '%%'"); - str = sep->end; - } else { - str = repeating->end; - } - return new_range(f, start, str, min, max, repeating, sep); - } - // Lookbehind - case '<': { - pat_t *behind = bp_simplepattern(f, str); - if (!behind) - file_err(f, str, str, "There should be a pattern after this '<'"); - str = behind->end; - str = behind->end; - pat_t *pat = new_pat(f, start, str, 0, 0, BP_AFTER); - pat->args.pat = behind; - return pat; - } - // Lookahead - case '>': { - pat_t *ahead = bp_simplepattern(f, str); - if (!ahead) - file_err(f, str, str, "There should be a pattern after this '>'"); - str = ahead->end; - pat_t *pat = new_pat(f, start, str, 0, 0, BP_BEFORE); - pat->args.pat = ahead; - return pat; - } - // Parentheses - case '(': { - if (start + 2 < f->end && strncmp(start, "(!)", 3) == 0) { // (!) errors - str = start + 3; - pat_t *pat = bp_simplepattern(f, str); - if (!pat) pat = new_pat(f, str, str, 0, 0, BP_STRING); - pat = expand_replacements(f, pat, false); - pat_t *error = new_pat(f, start, pat->end, pat->min_matchlen, pat->max_matchlen, BP_ERROR); - error->args.pat = pat; - return error; + } while (*str++ == ','); + + return all; + } + // Escapes + case '\\': { + if (!*str || *str == '\n') + file_err(f, str, str, "There should be an escape sequence here after this backslash."); + + pat_t *all = NULL; + do { // Comma-separated items: + const char *itemstart = str-1; + if (*str == 'N') { // \N (nodent) + all = either_pat(f, all, new_pat(f, itemstart, ++str, 1, -1, BP_NODENT)); + continue; + } else if (*str == 'i') { // \i (identifier char) + all = either_pat(f, all, new_pat(f, itemstart, ++str, 1, -1, BP_ID_CONTINUE)); + continue; + } else if (*str == 'I') { // \I (identifier char, not including numbers) + all = either_pat(f, all, new_pat(f, itemstart, ++str, 1, -1, BP_ID_START)); + continue; + } else if (*str == 'b') { // \b word boundary + all = either_pat(f, all, new_pat(f, itemstart, ++str, 0, 0, BP_WORD_BOUNDARY)); + continue; } - pat_t *pat = bp_pattern_nl(f, str, true); - if (!pat) - file_err(f, str, str, "There should be a valid pattern after this parenthesis."); - str = pat->end; - if (!matchchar(&str, ')', true)) file_err(f, str, str, "Missing paren: )"); - pat->start = start; - pat->end = str; - return pat; + const char *opstart = str; + unsigned char e_low = (unsigned char)unescapechar(str, &str); + if (str == opstart) + file_err(f, start, str+1, "This isn't a valid escape sequence."); + unsigned char e_high = e_low; + if (*str == '-') { // Escape range (e.g. \x00-\xFF) + ++str; + if (next_char(f, str) != str+1) + file_err(f, start, next_char(f, str), "Sorry, UTF8 escape sequences are not supported in ranges."); + const char *seqstart = str; + e_high = (unsigned char)unescapechar(str, &str); + if (str == seqstart) + file_err(f, seqstart, str+1, "This value isn't a valid escape sequence"); + if (e_high < e_low) + file_err(f, start, str, "Escape ranges should be low-to-high, but this is high-to-low."); + } + pat_t *esc = new_pat(f, start, str, 1, 1, BP_RANGE); + esc->args.range.low = e_low; + esc->args.range.high = e_high; + all = either_pat(f, all, esc); + } while (*str++ == ','); + + return all; + } + // Word boundary + case '|': { + return new_pat(f, start, str, 0, 0, BP_WORD_BOUNDARY); + } + // String literal + case '"': case '\'': case '\002': case '{': { + char endquote = c == '\002' ? '\003' : (c == '{' ? '}' : c); + char *litstart = (char*)str; + while (str < f->end && *str != endquote) + str = next_char(f, str); + size_t len = (size_t)(str - litstart); + str = next_char(f, str); + + pat_t *pat = new_pat(f, start, str, len, (ssize_t)len, BP_STRING); + pat->args.string = litstart; + return pat; + } + // Not + case '!': { + pat_t *p = bp_simplepattern(f, str); + if (!p) file_err(f, str, str, "There should be a pattern after this '!'"); + pat_t *not = new_pat(f, start, p->end, 0, 0, BP_NOT); + not->args.pat = p; + return not; + } + // Number of repetitions: (- / - / + / "") + case '0': case '1': case '2': case '3': case '4': case '5': + case '6': case '7': case '8': case '9': { + size_t min = 0; + ssize_t max = -1; + --str; + long n1 = strtol(str, (char**)&str, 10); + if (matchchar(&str, '-', false)) { + str = after_spaces(str, false); + const char *numstart = str; + long n2 = strtol(str, (char**)&str, 10); + if (str == numstart) min = 0, max = (ssize_t)n1; + else min = (size_t)n1, max = (ssize_t)n2; + } else if (matchchar(&str, '+', false)) { + min = (size_t)n1, max = -1; + } else { + min = (size_t)n1, max = (ssize_t)n1; } - // Square brackets - case '[': { - pat_t *maybe = bp_pattern_nl(f, str, true); - if (!maybe) - file_err(f, str, str, "There should be a valid pattern after this square bracket."); - str = maybe->end; - (void)matchchar(&str, ']', true); - return new_range(f, start, str, 0, 1, maybe, NULL); - } - // Repeating - case '*': case '+': { - size_t min = (size_t)(c == '*' ? 0 : 1); - pat_t *repeating = bp_simplepattern(f, str); - if (!repeating) - file_err(f, str, str, "There should be a valid pattern here after the '%c'", c); + pat_t *repeating = bp_simplepattern(f, str); + if (!repeating) + file_err(f, str, str, "There should be a pattern after this repetition count."); + str = repeating->end; + pat_t *sep = NULL; + if (matchchar(&str, '%', false)) { + sep = bp_simplepattern(f, str); + if (!sep) + file_err(f, str, str, "There should be a separator pattern after this '%%'"); + str = sep->end; + } else { str = repeating->end; - pat_t *sep = NULL; - if (matchchar(&str, '%', false)) { - sep = bp_simplepattern(f, str); - if (!sep) - file_err(f, str, str, "There should be a separator pattern after the '%%' here."); - str = sep->end; - } - return new_range(f, start, str, min, -1, repeating, sep); } - // Capture - case '@': { - const char *name = NULL; - size_t namelen = 0; - const char *a = after_name(str); - const char *eq = a; - if (a > str && !matchstr(&eq, "=>", false) && matchchar(&eq, '=', false)) { - name = str; - namelen = (size_t)(a-str); - str = eq; - } + return new_range(f, start, str, min, max, repeating, sep); + } + // Lookbehind + case '<': { + pat_t *behind = bp_simplepattern(f, str); + if (!behind) + file_err(f, str, str, "There should be a pattern after this '<'"); + str = behind->end; + str = behind->end; + pat_t *pat = new_pat(f, start, str, 0, 0, BP_AFTER); + pat->args.pat = behind; + return pat; + } + // Lookahead + case '>': { + pat_t *ahead = bp_simplepattern(f, str); + if (!ahead) + file_err(f, str, str, "There should be a pattern after this '>'"); + str = ahead->end; + pat_t *pat = new_pat(f, start, str, 0, 0, BP_BEFORE); + pat->args.pat = ahead; + return pat; + } + // Parentheses + case '(': { + if (start + 2 < f->end && strncmp(start, "(!)", 3) == 0) { // (!) errors + str = start + 3; pat_t *pat = bp_simplepattern(f, str); - if (!pat) - file_err(f, str, str, "There should be a valid pattern here to capture after the '@'"); + if (!pat) pat = new_pat(f, str, str, 0, 0, BP_STRING); + pat = expand_replacements(f, pat, false); + pat_t *error = new_pat(f, start, pat->end, pat->min_matchlen, pat->max_matchlen, BP_ERROR); + error->args.pat = pat; + return error; + } - pat_t *capture = new_pat(f, start, pat->end, pat->min_matchlen, pat->max_matchlen, BP_CAPTURE); - capture->args.capture.capture_pat = pat; - capture->args.capture.name = name; - capture->args.capture.namelen = namelen; - return capture; + pat_t *pat = bp_pattern_nl(f, str, true); + if (!pat) + file_err(f, str, str, "There should be a valid pattern after this parenthesis."); + str = pat->end; + if (!matchchar(&str, ')', true)) file_err(f, str, str, "Missing paren: )"); + pat->start = start; + pat->end = str; + return pat; + } + // Square brackets + case '[': { + pat_t *maybe = bp_pattern_nl(f, str, true); + if (!maybe) + file_err(f, str, str, "There should be a valid pattern after this square bracket."); + str = maybe->end; + (void)matchchar(&str, ']', true); + return new_range(f, start, str, 0, 1, maybe, NULL); + } + // Repeating + case '*': case '+': { + size_t min = (size_t)(c == '*' ? 0 : 1); + pat_t *repeating = bp_simplepattern(f, str); + if (!repeating) + file_err(f, str, str, "There should be a valid pattern here after the '%c'", c); + str = repeating->end; + pat_t *sep = NULL; + if (matchchar(&str, '%', false)) { + sep = bp_simplepattern(f, str); + if (!sep) + file_err(f, str, str, "There should be a separator pattern after the '%%' here."); + str = sep->end; } - // Start of file/line - case '^': { - if (*str == '^') - return new_pat(f, start, ++str, 0, 0, BP_START_OF_FILE); - return new_pat(f, start, str, 0, 0, BP_START_OF_LINE); + return new_range(f, start, str, min, -1, repeating, sep); + } + // Capture + case '@': { + const char *name = NULL; + size_t namelen = 0; + const char *a = after_name(str); + const char *eq = a; + if (a > str && !matchstr(&eq, "=>", false) && matchchar(&eq, '=', false)) { + name = str; + namelen = (size_t)(a-str); + str = eq; } - // End of file/line: - case '$': { - if (*str == '$') - return new_pat(f, start, ++str, 0, 0, BP_END_OF_FILE); - return new_pat(f, start, str, 0, 0, BP_END_OF_LINE); - } - default: { - // Reference - if (!isalpha(c) && c != '_') return NULL; - str = after_name(start); - size_t namelen = (size_t)(str - start); - if (matchchar(&str, ':', false)) { // Definitions - pat_t *def = bp_pattern_nl(f, str, false); - if (!def) file_err(f, str, f->end, "Could not parse this definition."); - str = def->end; - (void)matchchar(&str, ';', false); // Optional semicolon - str = after_spaces(str, true); - pat_t *pat = bp_pattern_nl(f, str, false); - if (pat) str = pat->end; - else pat = def; - pat_t *ret = new_pat(f, start, str, pat->min_matchlen, pat->max_matchlen, BP_DEFINITION); - ret->args.def.name = start; - ret->args.def.namelen = namelen; - ret->args.def.def = def; - ret->args.def.pat = pat; - return ret; - } - pat_t *ref = new_pat(f, start, str, 0, -1, BP_REF); - ref->args.ref.name = start; - ref->args.ref.len = namelen; - return ref; + pat_t *pat = bp_simplepattern(f, str); + if (!pat) + file_err(f, str, str, "There should be a valid pattern here to capture after the '@'"); + + pat_t *capture = new_pat(f, start, pat->end, pat->min_matchlen, pat->max_matchlen, BP_CAPTURE); + capture->args.capture.capture_pat = pat; + capture->args.capture.name = name; + capture->args.capture.namelen = namelen; + return capture; + } + // Start of file/line + case '^': { + if (*str == '^') + return new_pat(f, start, ++str, 0, 0, BP_START_OF_FILE); + return new_pat(f, start, str, 0, 0, BP_START_OF_LINE); + } + // End of file/line: + case '$': { + if (*str == '$') + return new_pat(f, start, ++str, 0, 0, BP_END_OF_FILE); + return new_pat(f, start, str, 0, 0, BP_END_OF_LINE); + } + default: { + // Reference + if (!isalpha(c) && c != '_') return NULL; + str = after_name(start); + size_t namelen = (size_t)(str - start); + if (matchchar(&str, ':', false)) { // Definitions + pat_t *def = bp_pattern_nl(f, str, false); + if (!def) file_err(f, str, f->end, "Could not parse this definition."); + str = def->end; + (void)matchchar(&str, ';', false); // Optional semicolon + str = after_spaces(str, true); + pat_t *pat = bp_pattern_nl(f, str, false); + if (pat) str = pat->end; + else pat = def; + pat_t *ret = new_pat(f, start, str, pat->min_matchlen, pat->max_matchlen, BP_DEFINITION); + ret->args.def.name = start; + ret->args.def.namelen = namelen; + ret->args.def.def = def; + ret->args.def.pat = pat; + return ret; } + pat_t *ref = new_pat(f, start, str, 0, -1, BP_REF); + ref->args.ref.name = start; + ref->args.ref.len = namelen; + return ref; + } } } @@ -600,4 +600,4 @@ pat_t *bp_pattern(file_t *f, const char *str) return bp_pattern_nl(f, str, false); } -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1 +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/pattern.h b/pattern.h index 47d0c63..39aba63 100644 --- a/pattern.h +++ b/pattern.h @@ -21,4 +21,4 @@ __attribute__((nonnull)) pat_t *bp_pattern(file_t *f, const char *str); #endif -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1 +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/print.c b/print.c index b5eb219..f3d5278 100644 --- a/print.c +++ b/print.c @@ -266,4 +266,4 @@ int print_errors(file_t *f, match_t *m) return ret; } -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1 +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/print.h b/print.h index 2bc2948..56dcae6 100644 --- a/print.h +++ b/print.h @@ -28,4 +28,4 @@ __attribute__((nonnull)) int print_errors(file_t *f, match_t *m); #endif -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1 +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/types.h b/types.h index dd7de77..eb22e69 100644 --- a/types.h +++ b/types.h @@ -130,4 +130,4 @@ typedef struct def_s { } def_t; #endif -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1 +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/utf8.c b/utf8.c index ad807bd..6180ffe 100644 --- a/utf8.c +++ b/utf8.c @@ -280,4 +280,4 @@ bool isidcontinue(file_t *f, const char *str) || find_in_ranges(codepoint, XID_Continue_only, ARRAY_LEN(XID_Continue_only))); } -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1 +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/utf8.h b/utf8.h index 9c43f13..97e259e 100644 --- a/utf8.h +++ b/utf8.h @@ -18,4 +18,4 @@ __attribute__((nonnull, pure)) bool isidcontinue(file_t *f, const char *str); #endif -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1 +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/utils.c b/utils.c index cb0719c..47958f7 100644 --- a/utils.c +++ b/utils.c @@ -20,18 +20,18 @@ const char *after_spaces(const char *str, bool skip_nl) // Skip whitespace and comments: skip_whitespace: switch (*str) { - case '\r': case '\n': - if (!skip_nl) break; - __attribute__ ((fallthrough)); - case ' ': case '\t': { - ++str; - goto skip_whitespace; - } - case '#': { - while (*str && *str != '\n') ++str; - goto skip_whitespace; - } - default: break; + case '\r': case '\n': + if (!skip_nl) break; + __attribute__ ((fallthrough)); + case ' ': case '\t': { + ++str; + goto skip_whitespace; + } + case '#': { + while (*str && *str != '\n') ++str; + goto skip_whitespace; + } + default: break; } return str; } @@ -90,39 +90,38 @@ char unescapechar(const char *escaped, const char **end) size_t len = 1; unsigned char ret = (unsigned char)*escaped; switch (*escaped) { - case 'a': ret = '\a'; break; case 'b': ret = '\b'; break; - case 'n': ret = '\n'; break; case 'r': ret = '\r'; break; - case 't': ret = '\t'; break; case 'v': ret = '\v'; break; - case 'e': ret = '\033'; break; case '\\': ret = '\\'; break; - case 'x': { // Hex - static const unsigned char hextable[255] = { - ['0']=0x10, ['1']=0x1, ['2']=0x2, ['3']=0x3, ['4']=0x4, - ['5']=0x5, ['6']=0x6, ['7']=0x7, ['8']=0x8, ['9']=0x9, - ['a']=0xa, ['b']=0xb, ['c']=0xc, ['d']=0xd, ['e']=0xe, ['f']=0xf, - ['A']=0xa, ['B']=0xb, ['C']=0xc, ['D']=0xd, ['E']=0xe, ['F']=0xf, - }; - if (hextable[(int)escaped[1]] && hextable[(int)escaped[2]]) { - ret = (hextable[(int)escaped[1]] << 4) | (hextable[(int)escaped[2]] & 0xF); - len = 3; - } - break; + case 'a': ret = '\a'; break; case 'b': ret = '\b'; break; + case 'n': ret = '\n'; break; case 'r': ret = '\r'; break; + case 't': ret = '\t'; break; case 'v': ret = '\v'; break; + case 'e': ret = '\033'; break; case '\\': ret = '\\'; break; + case 'x': { // Hex + static const unsigned char hextable[255] = { + ['0']=0x10, ['1']=0x1, ['2']=0x2, ['3']=0x3, ['4']=0x4, + ['5']=0x5, ['6']=0x6, ['7']=0x7, ['8']=0x8, ['9']=0x9, + ['a']=0xa, ['b']=0xb, ['c']=0xc, ['d']=0xd, ['e']=0xe, ['f']=0xf, + ['A']=0xa, ['B']=0xb, ['C']=0xc, ['D']=0xd, ['E']=0xe, ['F']=0xf, + }; + if (hextable[(int)escaped[1]] && hextable[(int)escaped[2]]) { + ret = (hextable[(int)escaped[1]] << 4) | (hextable[(int)escaped[2]] & 0xF); + len = 3; } - case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': { // Octal - ret = (unsigned char)(escaped[0] - '0'); - if ('0' <= escaped[1] && escaped[1] <= '7') { + break; + } + case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': { // Octal + ret = (unsigned char)(escaped[0] - '0'); + if ('0' <= escaped[1] && escaped[1] <= '7') { + ++len; + ret = (ret << 3) | (escaped[1] - '0'); + if ('0' <= escaped[2] && escaped[2] <= '7') { ++len; - ret = (ret << 3) | (escaped[1] - '0'); - if ('0' <= escaped[2] && escaped[2] <= '7') { - ++len; - ret = (ret << 3) | (escaped[2] - '0'); - } + ret = (ret << 3) | (escaped[2] - '0'); } - break; - } - default: { - if (end) *end = escaped; - return (char)0; } + break; + } + default: + if (end) *end = escaped; + return (char)0; } if (end) *end = &escaped[len]; return (char)ret; @@ -151,4 +150,4 @@ void delete(void *p) *((void**)p) = NULL; } -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1 +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/utils.h b/utils.h index 04df47a..59fedf7 100644 --- a/utils.h +++ b/utils.h @@ -63,4 +63,4 @@ __attribute__((nonnull)) void delete(void *p); #endif -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1 +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0