diff options
| -rw-r--r-- | grammars/html.bp | 4 | ||||
| -rw-r--r-- | match.c | 43 |
2 files changed, 45 insertions, 2 deletions
diff --git a/grammars/html.bp b/grammars/html.bp index 457a183..29e4566 100644 --- a/grammars/html.bp +++ b/grammars/html.bp @@ -14,9 +14,9 @@ void-element: `< ("area"/"base"/"br"/"col"/"embed"/"hr"/"img"/"input"/"link"/"me template-element: "<template>" ..%(\n / comment / element) "</template>" -raw-element: `< @tag=("script"/"style"/"textarea"/"title") __attributes__ `> ..%\n ("</"tag__`>) +raw-element: `< @tag=("script"/"style"/"textarea"/"title") __attributes__ `> ..%\n "</"tag__`> -normal-element: `< @tag=id __attributes__ `> ..%(\n / comment / element) ("</"tag`>) +normal-element: `< @tag=id __attributes__ `> ..%(\n / comment / element) "</"tag__`> comment: "<!--" ..%\n "-->" @@ -62,10 +62,34 @@ static inline pat_t *deref(def_t *defs, pat_t *pat) } // +// Find and return the first string literal to be matched (if any) +// +static pat_t *first_literal(def_t *defs, pat_t *pat) +{ + for (pat_t *p = pat; p; ) { + if (p->type == BP_STRING) + return p; + else if (p->type == BP_BEFORE) + p = p->args.pat; + else if (p->type == BP_CAPTURE) + p = p->args.capture.capture_pat; + else if (p->type == BP_CHAIN) + p = p->args.multiple.first; + else if (p->type == BP_REPLACE) + p = p->args.replace.pat; + else if (p->type == BP_REF) + p = deref(defs, p); + else break; + } + return NULL; +} + +// // Find the next match after prev (or the first match if prev is NULL) // match_t *next_match(def_t *defs, file_t *f, match_t *prev, pat_t *pat, pat_t *skip, bool ignorecase) { + pat = deref(defs, pat); const char *str; if (prev) { str = prev->end > prev->start ? prev->end : prev->end + 1; @@ -74,6 +98,25 @@ match_t *next_match(def_t *defs, file_t *f, match_t *prev, pat_t *pat, pat_t *sk str = f->start; } bool only_start = pat->type == BP_START_OF_FILE || (pat->type == BP_CHAIN && pat->args.multiple.first->type == BP_START_OF_FILE); + + // Performance optimization: if the pattern starts with a string literal, + // we can just rely on the highly optimized strstr()/strcasestr() + // implementations to skip past areas where we know we won't find a match. + pat_t *first_str = first_literal(defs, pat); + if (first_str) { + for (size_t i = 0; i < first_str->min_matchlen; i++) + if (first_str->args.string[i] == '\0') + goto pattern_search; + char *tmp = strndup(first_str->args.string, first_str->min_matchlen); + char *found = (ignorecase ? strcasestr : strstr)(str, tmp); + if (found) + str = found; + else if (&str[strlen(str)] == f->end) + str = f->end+1; + free(tmp); + } + + pattern_search: while (str <= f->end) { match_t *m = match(defs, f, str, pat, ignorecase); if (m) return m; |
