Moving cache logic into match, cleaner next_match() API, and slightly
less tightly coupled UTF8 API
This commit is contained in:
parent
9401facbe7
commit
90c3c13a02
46
bp.c
46
bp.c
@ -169,17 +169,15 @@ static int is_text_file(const char *filename)
|
||||
//
|
||||
static int print_matches_as_json(def_t *defs, file_t *f, pat_t *pattern)
|
||||
{
|
||||
static int matches = 0;
|
||||
match_t *m = NULL;
|
||||
while ((m = next_match(defs, f, m, pattern, options.skip, options.ignorecase))) {
|
||||
if (++matches > 1)
|
||||
int nmatches = 0;
|
||||
for (match_t *m = NULL; next_match(&m, defs, f, pattern, options.skip, options.ignorecase); ) {
|
||||
if (++nmatches > 1)
|
||||
printf(",\n");
|
||||
printf("{\"filename\":\"%s\",\"match\":", f->filename);
|
||||
json_match(f->start, m, options.verbose);
|
||||
printf("}");
|
||||
}
|
||||
if (m) recycle_if_unused(&m);
|
||||
return matches;
|
||||
return nmatches;
|
||||
}
|
||||
|
||||
//
|
||||
@ -187,18 +185,16 @@ static int print_matches_as_json(def_t *defs, file_t *f, pat_t *pattern)
|
||||
//
|
||||
static int explain_matches(def_t *defs, file_t *f, pat_t *pattern)
|
||||
{
|
||||
int matches = 0;
|
||||
match_t *m = NULL;
|
||||
while ((m = next_match(defs, f, m, pattern, options.skip, options.ignorecase))) {
|
||||
if (++matches == 1) {
|
||||
int nmatches = 0;
|
||||
for (match_t *m = NULL; next_match(&m, defs, f, pattern, options.skip, options.ignorecase); ) {
|
||||
if (++nmatches == 1) {
|
||||
if (options.print_filenames)
|
||||
fprint_filename(stdout, f->filename);
|
||||
} else
|
||||
printf("\n\n");
|
||||
explain_match(m);
|
||||
}
|
||||
if (m) recycle_if_unused(&m);
|
||||
return matches;
|
||||
return nmatches;
|
||||
}
|
||||
|
||||
//
|
||||
@ -243,8 +239,7 @@ static int print_matches(FILE *out, def_t *defs, file_t *f, pat_t *pattern)
|
||||
.lineformat = LINE_FORMATS[options.format],
|
||||
};
|
||||
|
||||
match_t *m = NULL;
|
||||
while ((m = next_match(defs, f, m, pattern, options.skip, options.ignorecase))) {
|
||||
for (match_t *m = NULL; next_match(&m, defs, f, pattern, options.skip, options.ignorecase); ) {
|
||||
if (print_errors(f, m) > 0)
|
||||
exit(EXIT_FAILURE);
|
||||
|
||||
@ -254,7 +249,6 @@ static int print_matches(FILE *out, def_t *defs, file_t *f, pat_t *pattern)
|
||||
}
|
||||
print_match(out, &pr, m);
|
||||
}
|
||||
if (m) recycle_if_unused(&m);
|
||||
|
||||
if (matches > 0 || (f->filename[0] == '\0' && options.context_before == ALL_CONTEXT)) {
|
||||
// Print trailing context lines:
|
||||
@ -281,18 +275,19 @@ static int process_file(def_t *defs, const char *filename, pat_t *pattern)
|
||||
if (options.mode == MODE_EXPLAIN) {
|
||||
matches += explain_matches(defs, f, pattern);
|
||||
} else if (options.mode == MODE_LISTFILES) {
|
||||
match_t *m = next_match(defs, f, NULL, pattern, options.skip, options.ignorecase);
|
||||
if (m) {
|
||||
recycle_if_unused(&m);
|
||||
match_t *m = NULL;
|
||||
if (next_match(&m, defs, f, pattern, options.skip, options.ignorecase)) {
|
||||
printf("%s\n", f->filename);
|
||||
matches += 1;
|
||||
}
|
||||
stop_matching(&m);
|
||||
} else if (options.mode == MODE_JSON) {
|
||||
matches += print_matches_as_json(defs, f, pattern);
|
||||
} else if (options.mode == MODE_INPLACE) {
|
||||
match_t *m = next_match(defs, f, NULL, pattern, options.skip, options.ignorecase);
|
||||
if (m) recycle_if_unused(&m);
|
||||
else return 0;
|
||||
match_t *m = NULL;
|
||||
bool found = next_match(&m, defs, f, pattern, options.skip, options.ignorecase);
|
||||
stop_matching(&m);
|
||||
if (!found) return 0;
|
||||
|
||||
// Ensure the file is resident in memory:
|
||||
if (f->mmapped) {
|
||||
@ -315,7 +310,6 @@ static int process_file(def_t *defs, const char *filename, pat_t *pattern)
|
||||
}
|
||||
fflush(stdout);
|
||||
|
||||
cache_destroy(f);
|
||||
if (recycle_all_matches() != 0)
|
||||
fprintf(stderr, "\033[33;1mMemory leak: there should no longer be any matches in use at this point.\033[m\n");
|
||||
destroy_file(&f);
|
||||
@ -480,10 +474,10 @@ int main(int argc, char *argv[])
|
||||
file_t *arg_file = spoof_file(&loaded_files, "<skip argument>", flag, -1);
|
||||
pat_t *s = bp_pattern(arg_file, arg_file->start);
|
||||
if (!s) {
|
||||
fprint_line(stdout, arg_file, arg_file->start, arg_file->end,
|
||||
file_err(arg_file, arg_file->start, arg_file->end,
|
||||
"Failed to compile the skip argument");
|
||||
} else if (after_spaces(s->end, true) < arg_file->end) {
|
||||
fprint_line(stdout, arg_file, s->end, arg_file->end,
|
||||
file_err(arg_file, s->end, arg_file->end,
|
||||
"Failed to compile part of the skip argument");
|
||||
}
|
||||
options.skip = either_pat(arg_file, options.skip, s);
|
||||
@ -537,10 +531,6 @@ int main(int argc, char *argv[])
|
||||
// Handle exit() calls gracefully:
|
||||
require(atexit(&cleanup), "Failed to set cleanup handler at exit");
|
||||
|
||||
// No need for these caches anymore:
|
||||
for (file_t *f = loaded_files; f; f = f->next)
|
||||
cache_destroy(f);
|
||||
|
||||
int found = 0;
|
||||
if (options.mode == MODE_JSON) printf("[");
|
||||
if (options.git_mode) { // Get the list of files from `git --ls-files ...`
|
||||
|
124
files.c
124
files.c
@ -182,8 +182,6 @@ void destroy_file(file_t **at_f)
|
||||
f->mmapped = NULL;
|
||||
}
|
||||
|
||||
cache_destroy(f);
|
||||
|
||||
for (pat_t *next; f->pats; f->pats = next) {
|
||||
next = f->pats->next;
|
||||
delete(&f->pats);
|
||||
@ -261,126 +259,4 @@ void fprint_line(FILE *dest, file_t *f, const char *start, const char *end, cons
|
||||
fprintf(dest, "\033[m\n");
|
||||
}
|
||||
|
||||
//
|
||||
// Hash a string position/pattern.
|
||||
//
|
||||
static inline size_t hash(const char *str, pat_t *pat)
|
||||
{
|
||||
return (size_t)str + 2*pat->id;
|
||||
}
|
||||
|
||||
//
|
||||
// Check if we have memoized a pattern match at the given position for the
|
||||
// given definitions. If a result has been memoized, set *result to the
|
||||
// memoized value and return true, otherwise return false.
|
||||
//
|
||||
bool cache_get(file_t *f, def_t *defs, const char *str, pat_t *pat, match_t **result)
|
||||
{
|
||||
if (!f->cache.matches) return NULL;
|
||||
size_t h = hash(str, pat) & (f->cache.size-1);
|
||||
for (match_t *c = f->cache.matches[h]; c; c = c->cache.next) {
|
||||
if (c->pat == pat && c->defs_id == (defs?defs->id:0) && c->start == str) {
|
||||
// If c->end == NULL, that means no match occurs here
|
||||
*result = c->end == NULL ? NULL : c;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
//
|
||||
// Remove an item from the cache.
|
||||
//
|
||||
static void cache_remove(file_t *f, match_t *m)
|
||||
{
|
||||
if (!m->cache.home) return;
|
||||
*m->cache.home = m->cache.next;
|
||||
if (m->cache.next) m->cache.next->cache.home = m->cache.home;
|
||||
m->cache.next = NULL;
|
||||
m->cache.home = NULL;
|
||||
if (--m->refcount == 0) recycle_if_unused(&m);
|
||||
--f->cache.occupancy;
|
||||
}
|
||||
|
||||
//
|
||||
// Save a match in the cache.
|
||||
//
|
||||
void cache_save(file_t *f, def_t *defs, const char *str, pat_t *pat, match_t *m)
|
||||
{
|
||||
// As a convention, a match with {.pat=pat, .start=str, .end==NULL} is used
|
||||
// to memoize the fact that `pat` will *not* match at `str`.
|
||||
if (m == NULL) m = new_match(defs, pat, str, NULL, NULL);
|
||||
|
||||
if (f->cache.occupancy+1 > 3*f->cache.size) {
|
||||
if (f->cache.size == MAX_CACHE_SIZE) {
|
||||
size_t h = hash(m->start, m->pat) & (f->cache.size-1);
|
||||
for (int quota = 2; f->cache.matches[h] && quota > 0; quota--) {
|
||||
match_t *last = f->cache.matches[h];
|
||||
while (last->cache.next) last = last->cache.next;
|
||||
cache_remove(f, last);
|
||||
}
|
||||
} else {
|
||||
match_t **old_matches = f->cache.matches;
|
||||
size_t old_size = f->cache.size;
|
||||
f->cache.size = old_size == 0 ? 16 : 2*old_size;
|
||||
f->cache.matches = new(match_t*[f->cache.size]);
|
||||
|
||||
// Rehash:
|
||||
if (old_matches) {
|
||||
for (size_t i = 0; i < old_size; i++) {
|
||||
for (match_t *o; (o = old_matches[i]); ) {
|
||||
*o->cache.home = o->cache.next;
|
||||
if (o->cache.next) o->cache.next->cache.home = o->cache.home;
|
||||
size_t h = hash(o->start, o->pat) & (f->cache.size-1);
|
||||
o->cache.home = &(f->cache.matches[h]);
|
||||
o->cache.next = f->cache.matches[h];
|
||||
if (f->cache.matches[h]) f->cache.matches[h]->cache.home = &o->cache.next;
|
||||
f->cache.matches[h] = o;
|
||||
}
|
||||
}
|
||||
free(old_matches);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
size_t h = hash(m->start, m->pat) & (f->cache.size-1);
|
||||
m->cache.home = &(f->cache.matches[h]);
|
||||
m->cache.next = f->cache.matches[h];
|
||||
if (f->cache.matches[h]) f->cache.matches[h]->cache.home = &m->cache.next;
|
||||
f->cache.matches[h] = m;
|
||||
++m->refcount;
|
||||
++f->cache.occupancy;
|
||||
}
|
||||
|
||||
//
|
||||
// Remove all items from the cache that do not overlap `start` and `end`.
|
||||
// (This is used to remove useless items from the cache)
|
||||
//
|
||||
void cache_prune(file_t *f, const char *start, const char *end)
|
||||
{
|
||||
if (!f->cache.matches) return;
|
||||
for (size_t i = 0; i < f->cache.size; i++) {
|
||||
for (match_t *m = f->cache.matches[i], *next = NULL; m; m = next) {
|
||||
next = m->cache.next;
|
||||
if (m->start < start || (m->end ? m->end : m->start) > end)
|
||||
cache_remove(f, m);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Clear and deallocate the cache.
|
||||
//
|
||||
void cache_destroy(file_t *f)
|
||||
{
|
||||
if (!f->cache.matches) return;
|
||||
for (size_t i = 0; i < f->cache.size; i++) {
|
||||
while (f->cache.matches[i])
|
||||
cache_remove(f, f->cache.matches[i]);
|
||||
}
|
||||
f->cache.occupancy = 0;
|
||||
delete(&f->cache.matches);
|
||||
f->cache.size = 0;
|
||||
}
|
||||
|
||||
// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0
|
||||
|
11
files.h
11
files.h
@ -6,14 +6,11 @@
|
||||
|
||||
#include "types.h"
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <stdio.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#define file_err(f, ...) do { fprint_line(stderr, f, __VA_ARGS__); exit(EXIT_FAILURE); } while(false)
|
||||
|
||||
#define MAX_CACHE_SIZE (1<<14)
|
||||
|
||||
typedef struct file_s {
|
||||
struct file_s *next;
|
||||
const char *filename;
|
||||
@ -43,14 +40,6 @@ __attribute__((pure, nonnull))
|
||||
const char *get_line(file_t *f, size_t line_number);
|
||||
__attribute__((nonnull(1,2,3), format(printf,5,6)))
|
||||
void fprint_line(FILE *dest, file_t *f, const char *start, const char *end, const char *fmt, ...);
|
||||
__attribute__((nonnull(1,3,4,5)))
|
||||
bool cache_get(file_t *f, def_t *defs, const char *str, pat_t *pat, match_t **result);
|
||||
__attribute__((nonnull(1,3,4)))
|
||||
void cache_save(file_t *f, def_t *defs, const char *str, pat_t *pat, match_t *m);
|
||||
__attribute__((nonnull))
|
||||
void cache_prune(file_t *f, const char *start, const char *end);
|
||||
__attribute__((nonnull))
|
||||
void cache_destroy(file_t *f);
|
||||
|
||||
#endif
|
||||
// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0
|
||||
|
252
match.c
252
match.c
@ -16,6 +16,13 @@
|
||||
#include "utils.h"
|
||||
#include "utf8.h"
|
||||
|
||||
#define MAX_CACHE_SIZE (1<<14)
|
||||
|
||||
typedef struct {
|
||||
size_t size, occupancy;
|
||||
match_t **matches;
|
||||
} cache_t;
|
||||
|
||||
// New match objects are either recycled from unused match objects or allocated
|
||||
// from the heap. While it is in use, the match object is stored in the
|
||||
// `in_use_matches` linked list. Once it is no longer needed, it is moved to
|
||||
@ -27,10 +34,8 @@ static match_t *in_use_matches = NULL;
|
||||
|
||||
#define MATCHES(...) (match_t*[]){__VA_ARGS__, NULL}
|
||||
|
||||
__attribute__((nonnull(1)))
|
||||
static inline pat_t *deref(def_t *defs, pat_t *pat);
|
||||
__attribute__((hot, nonnull(2,3,4)))
|
||||
static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool ignorecase);
|
||||
__attribute__((hot, nonnull(2,3,4,5)))
|
||||
static match_t *match(def_t *defs, cache_t *cache, file_t *f, const char *str, pat_t *pat, bool ignorecase);
|
||||
|
||||
// Store a value and update its refcount
|
||||
static inline void add_owner(match_t** owner, match_t* owned)
|
||||
@ -80,10 +85,117 @@ static inline void list_remove(match_t *m, match_dll_t *node)
|
||||
node->next = NULL;
|
||||
}
|
||||
|
||||
//
|
||||
// Hash a string position/pattern.
|
||||
//
|
||||
static inline size_t hash(const char *str, pat_t *pat)
|
||||
{
|
||||
return (size_t)str + 2*pat->id;
|
||||
}
|
||||
|
||||
//
|
||||
// Check if we have memoized a pattern match at the given position for the
|
||||
// given definitions. If a result has been memoized, set *result to the
|
||||
// memoized value and return true, otherwise return false.
|
||||
//
|
||||
static bool cache_get(cache_t *cache, def_t *defs, const char *str, pat_t *pat, match_t **result)
|
||||
{
|
||||
if (!cache->matches) return NULL;
|
||||
size_t h = hash(str, pat) & (cache->size-1);
|
||||
for (match_t *c = cache->matches[h]; c; c = c->cache.next) {
|
||||
if (c->pat == pat && c->defs_id == (defs?defs->id:0) && c->start == str) {
|
||||
// If c->end == NULL, that means no match occurs here
|
||||
*result = c->end == NULL ? NULL : c;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
//
|
||||
// Remove an item from the cache.
|
||||
//
|
||||
static void cache_remove(cache_t *cache, match_t *m)
|
||||
{
|
||||
if (!m->cache.home) return;
|
||||
*m->cache.home = m->cache.next;
|
||||
if (m->cache.next) m->cache.next->cache.home = m->cache.home;
|
||||
m->cache.next = NULL;
|
||||
m->cache.home = NULL;
|
||||
if (--m->refcount == 0) recycle_if_unused(&m);
|
||||
--cache->occupancy;
|
||||
}
|
||||
|
||||
//
|
||||
// Save a match in the cache.
|
||||
//
|
||||
static void cache_save(cache_t *cache, def_t *defs, const char *str, pat_t *pat, match_t *m)
|
||||
{
|
||||
// As a convention, a match with {.pat=pat, .start=str, .end==NULL} is used
|
||||
// to memoize the fact that `pat` will *not* match at `str`.
|
||||
if (m == NULL) m = new_match(defs, pat, str, NULL, NULL);
|
||||
|
||||
if (cache->occupancy+1 > 3*cache->size) {
|
||||
if (cache->size == MAX_CACHE_SIZE) {
|
||||
size_t h = hash(m->start, m->pat) & (cache->size-1);
|
||||
for (int quota = 2; cache->matches[h] && quota > 0; quota--) {
|
||||
match_t *last = cache->matches[h];
|
||||
while (last->cache.next) last = last->cache.next;
|
||||
cache_remove(cache, last);
|
||||
}
|
||||
} else {
|
||||
match_t **old_matches = cache->matches;
|
||||
size_t old_size = cache->size;
|
||||
cache->size = old_size == 0 ? 16 : 2*old_size;
|
||||
cache->matches = new(match_t*[cache->size]);
|
||||
|
||||
// Rehash:
|
||||
if (old_matches) {
|
||||
for (size_t i = 0; i < old_size; i++) {
|
||||
for (match_t *o; (o = old_matches[i]); ) {
|
||||
*o->cache.home = o->cache.next;
|
||||
if (o->cache.next) o->cache.next->cache.home = o->cache.home;
|
||||
size_t h = hash(o->start, o->pat) & (cache->size-1);
|
||||
o->cache.home = &(cache->matches[h]);
|
||||
o->cache.next = cache->matches[h];
|
||||
if (cache->matches[h]) cache->matches[h]->cache.home = &o->cache.next;
|
||||
cache->matches[h] = o;
|
||||
}
|
||||
}
|
||||
free(old_matches);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
size_t h = hash(m->start, m->pat) & (cache->size-1);
|
||||
m->cache.home = &(cache->matches[h]);
|
||||
m->cache.next = cache->matches[h];
|
||||
if (cache->matches[h]) cache->matches[h]->cache.home = &m->cache.next;
|
||||
cache->matches[h] = m;
|
||||
++m->refcount;
|
||||
++cache->occupancy;
|
||||
}
|
||||
|
||||
//
|
||||
// Clear and deallocate the cache.
|
||||
//
|
||||
void cache_destroy(cache_t *cache)
|
||||
{
|
||||
if (!cache->matches) return;
|
||||
for (size_t i = 0; i < cache->size; i++) {
|
||||
while (cache->matches[i])
|
||||
cache_remove(cache, cache->matches[i]);
|
||||
}
|
||||
cache->occupancy = 0;
|
||||
delete(&cache->matches);
|
||||
cache->size = 0;
|
||||
}
|
||||
|
||||
//
|
||||
// If the given pattern is a reference, look it up and return the referenced
|
||||
// pattern. This is used for an optimization to avoid repeated lookups.
|
||||
//
|
||||
__attribute__((nonnull(1)))
|
||||
static inline pat_t *deref(def_t *defs, pat_t *pat)
|
||||
{
|
||||
if (pat && pat->type == BP_REF) {
|
||||
@ -128,15 +240,18 @@ static pat_t *first_pat(def_t *defs, pat_t *pat)
|
||||
//
|
||||
// Find the next match after prev (or the first match if prev is NULL)
|
||||
//
|
||||
match_t *next_match(def_t *defs, file_t *f, match_t *prev, pat_t *pat, pat_t *skip, bool ignorecase)
|
||||
__attribute__((nonnull(3,5)))
|
||||
static match_t *_next_match(def_t *defs, cache_t *cache, file_t *f, const char *str, pat_t *pat, pat_t *skip, bool ignorecase)
|
||||
{
|
||||
const char *str;
|
||||
if (prev) {
|
||||
str = prev->end > prev->start ? prev->end : prev->end + 1;
|
||||
if (prev->refcount == 0) recycle_if_unused(&prev);
|
||||
cache_prune(f, str, f->end);
|
||||
} else {
|
||||
str = f->start;
|
||||
// Prune the unnecessary entries from the cache (those not between start/end)
|
||||
if (cache->matches) {
|
||||
for (size_t i = 0; i < cache->size; i++) {
|
||||
for (match_t *m = cache->matches[i], *next = NULL; m; m = next) {
|
||||
next = m->cache.next;
|
||||
if (m->start < f->start || (m->end ? m->end : m->start) > f->end)
|
||||
cache_remove(cache, m);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pat = deref(defs, pat);
|
||||
@ -162,14 +277,14 @@ match_t *next_match(def_t *defs, file_t *f, match_t *prev, pat_t *pat, pat_t *sk
|
||||
if (str > f->end) return NULL;
|
||||
|
||||
do {
|
||||
match_t *m = match(defs, f, str, pat, ignorecase);
|
||||
match_t *m = match(defs, cache, f, str, pat, ignorecase);
|
||||
if (m) return m;
|
||||
if (first->type == BP_START_OF_FILE) return NULL;
|
||||
match_t *s;
|
||||
if (skip && (s = match(defs, f, str, skip, ignorecase))) {
|
||||
if (skip && (s = match(defs, cache, f, str, skip, ignorecase))) {
|
||||
str = s->end > str ? s->end : str + 1;
|
||||
recycle_if_unused(&s);
|
||||
} else str = next_char(f, str);
|
||||
} else str = next_char(str, f->end);
|
||||
} while (str < f->end);
|
||||
return NULL;
|
||||
}
|
||||
@ -179,12 +294,12 @@ match_t *next_match(def_t *defs, file_t *f, match_t *prev, pat_t *pat, pat_t *sk
|
||||
// match object, or NULL if no match is found.
|
||||
// The returned value should be free()'d to avoid memory leaking.
|
||||
//
|
||||
static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool ignorecase)
|
||||
static match_t *match(def_t *defs, cache_t *cache, file_t *f, const char *str, pat_t *pat, bool ignorecase)
|
||||
{
|
||||
switch (pat->type) {
|
||||
case BP_DEFINITION: {
|
||||
def_t *defs2 = with_def(defs, pat->args.def.namelen, pat->args.def.name, pat->args.def.def);
|
||||
match_t *m = match(defs2, f, str, pat->args.def.pat ? pat->args.def.pat : pat->args.def.def, ignorecase);
|
||||
match_t *m = match(defs2, cache, f, str, pat->args.def.pat ? pat->args.def.pat : pat->args.def.def, ignorecase);
|
||||
defs = free_defs(defs2, defs);
|
||||
return m;
|
||||
}
|
||||
@ -198,17 +313,17 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
|
||||
++pat->args.leftrec.visits;
|
||||
return pat->args.leftrec.match;
|
||||
} else {
|
||||
return match(defs, f, str, pat->args.leftrec.fallback, ignorecase);
|
||||
return match(defs, cache, f, str, pat->args.leftrec.fallback, ignorecase);
|
||||
}
|
||||
}
|
||||
case BP_ANYCHAR: {
|
||||
return (str < f->end && *str != '\n') ? new_match(defs, pat, str, next_char(f, str), NULL) : NULL;
|
||||
return (str < f->end && *str != '\n') ? new_match(defs, pat, str, next_char(str, f->end), NULL) : NULL;
|
||||
}
|
||||
case BP_ID_START: {
|
||||
return (str < f->end && isidstart(f, str)) ? new_match(defs, pat, str, next_char(f, str), NULL) : NULL;
|
||||
return (str < f->end && isidstart(str, f->end)) ? new_match(defs, pat, str, next_char(str, f->end), NULL) : NULL;
|
||||
}
|
||||
case BP_ID_CONTINUE: {
|
||||
return (str < f->end && isidcontinue(f, str)) ? new_match(defs, pat, str, next_char(f, str), NULL) : NULL;
|
||||
return (str < f->end && isidcontinue(str, f->end)) ? new_match(defs, pat, str, next_char(str, f->end), NULL) : NULL;
|
||||
}
|
||||
case BP_START_OF_FILE: {
|
||||
return (str == f->start) ? new_match(defs, pat, str, str, NULL) : NULL;
|
||||
@ -223,7 +338,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
|
||||
return (str == f->end || *str == '\n') ? new_match(defs, pat, str, str, NULL) : NULL;
|
||||
}
|
||||
case BP_WORD_BOUNDARY: {
|
||||
return (str == f->start || isidcontinue(f, str) != isidcontinue(f, prev_char(f, str))) ? new_match(defs, pat, str, str, NULL) : NULL;
|
||||
return (str == f->start || isidcontinue(str, f->end) != isidcontinue(prev_char(f->start, str), f->end)) ? new_match(defs, pat, str, str, NULL) : NULL;
|
||||
}
|
||||
case BP_STRING: {
|
||||
if (&str[pat->min_matchlen] > f->end) return NULL;
|
||||
@ -238,7 +353,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
|
||||
return new_match(defs, pat, str, str+1, NULL);
|
||||
}
|
||||
case BP_NOT: {
|
||||
match_t *m = match(defs, f, str, pat->args.pat, ignorecase);
|
||||
match_t *m = match(defs, cache, f, str, pat->args.pat, ignorecase);
|
||||
if (m != NULL) {
|
||||
recycle_if_unused(&m);
|
||||
return NULL;
|
||||
@ -259,7 +374,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
|
||||
for (const char *prev = NULL; prev < str; ) {
|
||||
prev = str;
|
||||
if (target) {
|
||||
match_t *p = match(defs, f, str, target, ignorecase);
|
||||
match_t *p = match(defs, cache, f, str, target, ignorecase);
|
||||
if (p != NULL) {
|
||||
recycle_if_unused(&p);
|
||||
m->end = str;
|
||||
@ -270,7 +385,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
|
||||
return m;
|
||||
}
|
||||
if (skip) {
|
||||
match_t *s = match(defs, f, str, skip, ignorecase);
|
||||
match_t *s = match(defs, cache, f, str, skip, ignorecase);
|
||||
if (s != NULL) {
|
||||
str = s->end;
|
||||
if (nchildren+2 >= child_cap) {
|
||||
@ -285,7 +400,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
|
||||
// be at least one chance to match the pattern, even if
|
||||
// we're at the end of the string already (e.g. "..$").
|
||||
if (str < f->end && *str != '\n' && pat->type != BP_UPTO_STRICT)
|
||||
str = next_char(f, str);
|
||||
str = next_char(str, f->end);
|
||||
}
|
||||
recycle_if_unused(&m);
|
||||
return NULL;
|
||||
@ -302,11 +417,11 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
|
||||
// Separator
|
||||
match_t *msep = NULL;
|
||||
if (sep != NULL && reps > 0) {
|
||||
msep = match(defs, f, str, sep, ignorecase);
|
||||
msep = match(defs, cache, f, str, sep, ignorecase);
|
||||
if (msep == NULL) break;
|
||||
str = msep->end;
|
||||
}
|
||||
match_t *mp = match(defs, f, str, repeating, ignorecase);
|
||||
match_t *mp = match(defs, cache, f, str, repeating, ignorecase);
|
||||
if (mp == NULL) {
|
||||
str = start;
|
||||
if (msep) recycle_if_unused(&msep);
|
||||
@ -358,19 +473,20 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
|
||||
// current pos, so mock it out as a file slice.
|
||||
// TODO: this breaks ^/^^/$/$$, but that can probably be ignored
|
||||
// because you rarely need to check those in a backtrack.
|
||||
cache_t slice_cache = {0};
|
||||
file_t slice;
|
||||
slice_file(&slice, f, f->start, str);
|
||||
for (const char *pos = &str[-(long)back->min_matchlen];
|
||||
pos >= f->start && (back->max_matchlen == -1 || pos >= &str[-(int)back->max_matchlen]);
|
||||
pos = prev_char(f, pos)) {
|
||||
cache_destroy(&slice);
|
||||
pos = prev_char(f->start, pos)) {
|
||||
cache_destroy(&slice_cache);
|
||||
slice.start = (char*)pos;
|
||||
match_t *m = match(defs, &slice, pos, back, ignorecase);
|
||||
match_t *m = match(defs, &slice_cache, &slice, pos, back, ignorecase);
|
||||
// Match should not go past str (i.e. (<"AB" "B") should match "ABB", but not "AB")
|
||||
if (m && m->end != str)
|
||||
recycle_if_unused(&m);
|
||||
else if (m) {
|
||||
cache_destroy(&slice);
|
||||
cache_destroy(&slice_cache);
|
||||
return new_match(defs, pat, str, str, MATCHES(m));
|
||||
}
|
||||
if (pos == f->start) break;
|
||||
@ -378,23 +494,23 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
|
||||
// walking backwards endlessly over newlines.
|
||||
if (back->max_matchlen == -1 && *pos == '\n') break;
|
||||
}
|
||||
cache_destroy(&slice);
|
||||
cache_destroy(&slice_cache);
|
||||
return NULL;
|
||||
}
|
||||
case BP_BEFORE: {
|
||||
match_t *after = match(defs, f, str, pat->args.pat, ignorecase);
|
||||
match_t *after = match(defs, cache, f, str, pat->args.pat, ignorecase);
|
||||
return after ? new_match(defs, pat, str, str, MATCHES(after)) : NULL;
|
||||
}
|
||||
case BP_CAPTURE: {
|
||||
match_t *p = match(defs, f, str, pat->args.pat, ignorecase);
|
||||
match_t *p = match(defs, cache, f, str, pat->args.pat, ignorecase);
|
||||
return p ? new_match(defs, pat, str, p->end, MATCHES(p)) : NULL;
|
||||
}
|
||||
case BP_OTHERWISE: {
|
||||
match_t *m = match(defs, f, str, pat->args.multiple.first, ignorecase);
|
||||
return m ? m : match(defs, f, str, pat->args.multiple.second, ignorecase);
|
||||
match_t *m = match(defs, cache, f, str, pat->args.multiple.first, ignorecase);
|
||||
return m ? m : match(defs, cache, f, str, pat->args.multiple.second, ignorecase);
|
||||
}
|
||||
case BP_CHAIN: {
|
||||
match_t *m1 = match(defs, f, str, pat->args.multiple.first, ignorecase);
|
||||
match_t *m1 = match(defs, cache, f, str, pat->args.multiple.first, ignorecase);
|
||||
if (m1 == NULL) return NULL;
|
||||
|
||||
match_t *m2;
|
||||
@ -408,7 +524,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
|
||||
|
||||
def_t *defs2 = with_def(defs, m1->pat->args.capture.namelen, m1->pat->args.capture.name, backref);
|
||||
++m1->refcount; {
|
||||
m2 = match(defs2, f, m1->end, pat->args.multiple.second, ignorecase);
|
||||
m2 = match(defs2, cache, f, m1->end, pat->args.multiple.second, ignorecase);
|
||||
if (!m2) { // No need to keep the backref in memory if it didn't match
|
||||
for (pat_t **rem = &f->pats; *rem; rem = &(*rem)->next) {
|
||||
if ((*rem) == backref) {
|
||||
@ -422,7 +538,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
|
||||
defs = free_defs(defs2, defs);
|
||||
} --m1->refcount;
|
||||
} else {
|
||||
m2 = match(defs, f, m1->end, pat->args.multiple.second, ignorecase);
|
||||
m2 = match(defs, cache, f, m1->end, pat->args.multiple.second, ignorecase);
|
||||
}
|
||||
|
||||
if (m2 == NULL) {
|
||||
@ -433,35 +549,36 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
|
||||
return new_match(defs, pat, str, m2->end, MATCHES(m1, m2));
|
||||
}
|
||||
case BP_MATCH: case BP_NOT_MATCH: {
|
||||
match_t *m1 = match(defs, f, str, pat->args.multiple.first, ignorecase);
|
||||
match_t *m1 = match(defs, cache, f, str, pat->args.multiple.first, ignorecase);
|
||||
if (m1 == NULL) return NULL;
|
||||
|
||||
// <p1>~<p2> matches iff the text of <p1> matches <p2>
|
||||
// <p1>!~<p2> matches iff the text of <p1> does not match <p2>
|
||||
cache_t slice_cache = {0};
|
||||
file_t slice;
|
||||
slice_file(&slice, f, m1->start, m1->end);
|
||||
match_t *m2 = next_match(defs, &slice, NULL, pat->args.multiple.second, NULL, ignorecase);
|
||||
match_t *m2 = _next_match(defs, &slice_cache, &slice, slice.start, pat->args.multiple.second, NULL, ignorecase);
|
||||
if ((!m2 && pat->type == BP_MATCH) || (m2 && pat->type == BP_NOT_MATCH)) {
|
||||
cache_destroy(&slice);
|
||||
cache_destroy(&slice_cache);
|
||||
if (m2) recycle_if_unused(&m2);
|
||||
recycle_if_unused(&m1);
|
||||
return NULL;
|
||||
}
|
||||
match_t *ret = new_match(defs, pat, m1->start, m1->end, (pat->type == BP_MATCH) ? MATCHES(m1, m2) : MATCHES(m1));
|
||||
cache_destroy(&slice);
|
||||
cache_destroy(&slice_cache);
|
||||
return ret;
|
||||
}
|
||||
case BP_REPLACE: {
|
||||
match_t *p = NULL;
|
||||
if (pat->args.replace.pat) {
|
||||
p = match(defs, f, str, pat->args.replace.pat, ignorecase);
|
||||
p = match(defs, cache, f, str, pat->args.replace.pat, ignorecase);
|
||||
if (p == NULL) return NULL;
|
||||
}
|
||||
return new_match(defs, pat, str, p ? p->end : str, MATCHES(p));
|
||||
}
|
||||
case BP_REF: {
|
||||
match_t *cached;
|
||||
if (cache_get(f, defs, str, pat, &cached))
|
||||
if (cache_get(cache, defs, str, pat, &cached))
|
||||
return cached;
|
||||
|
||||
def_t *def = lookup(defs, pat->args.ref.len, pat->args.ref.name);
|
||||
@ -490,9 +607,9 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
|
||||
};
|
||||
|
||||
const char *prev = str;
|
||||
match_t *m = match(&defs2, f, str, ref, ignorecase);
|
||||
match_t *m = match(&defs2, cache, f, str, ref, ignorecase);
|
||||
if (m == NULL) {
|
||||
cache_save(f, defs, str, pat, NULL);
|
||||
cache_save(cache, defs, str, pat, NULL);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -501,7 +618,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
|
||||
remove_ownership(&rec_op.args.leftrec.match);
|
||||
add_owner(&rec_op.args.leftrec.match, m);
|
||||
prev = m->end;
|
||||
match_t *m2 = match(&defs2, f, str, ref, ignorecase);
|
||||
match_t *m2 = match(&defs2, cache, f, str, ref, ignorecase);
|
||||
if (m2 == NULL) break;
|
||||
if (m2->end <= prev) {
|
||||
recycle_if_unused(&m2);
|
||||
@ -516,7 +633,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
|
||||
// results.
|
||||
// OPTIMIZE: remove this if necessary
|
||||
match_t *wrap = new_match(defs, pat, m->start, m->end, MATCHES(m));
|
||||
cache_save(f, defs, str, pat, wrap);
|
||||
cache_save(cache, defs, str, pat, wrap);
|
||||
|
||||
if (rec_op.args.leftrec.match)
|
||||
remove_ownership(&rec_op.args.leftrec.match);
|
||||
@ -527,9 +644,8 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
|
||||
if (*str != '\n') return NULL;
|
||||
const char *start = str;
|
||||
|
||||
size_t linenum = get_line_number(f, str);
|
||||
const char *p = get_line(f, linenum);
|
||||
if (p < f->start) p = f->start; // Can happen with recursive matching
|
||||
const char *p = str;
|
||||
while (p > f->start && p[-1] != '\n') --p;
|
||||
|
||||
// Current indentation:
|
||||
char denter = *p;
|
||||
@ -546,7 +662,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
|
||||
return new_match(defs, pat, start, &str[dents], NULL);
|
||||
}
|
||||
case BP_ERROR: {
|
||||
match_t *p = pat->args.pat ? match(defs, f, str, pat->args.pat, ignorecase) : NULL;
|
||||
match_t *p = pat->args.pat ? match(defs, cache, f, str, pat->args.pat, ignorecase) : NULL;
|
||||
return p ? new_match(defs, pat, str, p->end, MATCHES(p)) : NULL;
|
||||
}
|
||||
default: {
|
||||
@ -644,4 +760,32 @@ size_t free_all_matches(void)
|
||||
return count;
|
||||
}
|
||||
|
||||
//
|
||||
// Iterate over matches.
// Usage: for (match_t *m = NULL; next_match(&m, ...); ) {...}
//
// `*m` is both input and output: on entry it holds the previous match (or
// NULL on the first call), on return it holds the next match (or NULL when
// there are no more). The return value is true iff `*m` is non-NULL.
//
// Calling with a NULL `f` or NULL `pat` performs cleanup only (this is what
// the stop_matching() macro does): `m` is handed to recycle_if_unused(), the
// cache is destroyed, and false is returned.
//
// The match cache is a function-local static, so its contents persist between
// calls and are shared by every caller of this function.
|
||||
// Usage: for (match_t *m = NULL; next_match(&m, ...); ) {...}
|
||||
//
|
||||
bool next_match(match_t **m, def_t *defs, file_t *f, pat_t *pat, pat_t *skip, bool ignorecase)
|
||||
{
|
||||
// Persists across calls; destroyed on first call, on exhaustion, and on cleanup.
static cache_t cache = {0};
|
||||
if (!f || !pat) { // Cleanup for stop_matching()
|
||||
recycle_if_unused(m);
|
||||
cache_destroy(&cache);
|
||||
return false;
|
||||
}
|
||||
|
||||
const char *start;
|
||||
if (*m) {
|
||||
// Make sure forward progress is occurring, even after zero-width matches:
|
||||
// (a zero-width match resumes one byte past its end, not one codepoint)
start = ((*m)->end > (*m)->start) ? (*m)->end : (*m)->end+1;
|
||||
// `start` was computed above, so the previous match is no longer read here.
recycle_if_unused(m);
|
||||
} else {
|
||||
// First call of an iteration: begin at the start of the file and drop
// whatever the static cache still holds.
start = f->start;
|
||||
cache_destroy(&cache);
|
||||
}
|
||||
|
||||
// `start` can be f->end+1 after a zero-width match at the end of the file;
// in that case no search is attempted.
*m = (start <= f->end) ? _next_match(defs, &cache, f, start, pat, skip, ignorecase) : NULL;
|
||||
// No further matches: the cache is destroyed here as well.
if (!*m) cache_destroy(&cache);
|
||||
return *m != NULL;
|
||||
}
|
||||
|
||||
// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0
|
||||
|
5
match.h
5
match.h
@ -12,12 +12,13 @@
|
||||
|
||||
__attribute__((returns_nonnull))
|
||||
match_t *new_match(def_t *defs, pat_t *pat, const char *start, const char *end, match_t *children[]);
|
||||
__attribute__((nonnull(2,4)))
|
||||
match_t *next_match(def_t *defs, file_t *f, match_t *prev, pat_t *pat, pat_t *skip, bool ignorecase);
|
||||
__attribute__((nonnull))
|
||||
void recycle_if_unused(match_t **at_m);
|
||||
size_t free_all_matches(void);
|
||||
size_t recycle_all_matches(void);
|
||||
|
||||
bool next_match(match_t **m, def_t *defs, file_t *f, pat_t *pat, pat_t *skip, bool ignorecase);
|
||||
#define stop_matching(m) next_match(m, NULL, NULL, NULL, NULL, 0)
|
||||
|
||||
#endif
|
||||
// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0
|
||||
|
111
pattern.c
111
pattern.c
@ -16,16 +16,6 @@
|
||||
__attribute__((nonnull))
|
||||
static pat_t *bp_pattern_nl(file_t *f, const char *str, bool allow_nl);
|
||||
__attribute__((nonnull))
|
||||
static pat_t *expand_replacements(file_t *f, pat_t *replace_pat, bool allow_nl);
|
||||
__attribute__((nonnull))
|
||||
static pat_t *expand_chain(file_t *f, pat_t *first, bool allow_nl);
|
||||
__attribute__((nonnull))
|
||||
static pat_t *expand_choices(file_t *f, pat_t *first, bool allow_nl);
|
||||
__attribute__((nonnull))
|
||||
static pat_t *_bp_simplepattern(file_t *f, const char *str);
|
||||
__attribute__((nonnull(1,2,3,6)))
|
||||
static pat_t *new_range(file_t *f, const char *start, const char *end, size_t min, ssize_t max, pat_t *repeating, pat_t *sep);
|
||||
__attribute__((nonnull(1,2)))
|
||||
static pat_t *bp_simplepattern(file_t *f, const char *str);
|
||||
|
||||
//
|
||||
@ -52,6 +42,7 @@ pat_t *new_pat(file_t *f, const char *start, const char *end, size_t minlen, ssi
|
||||
//
|
||||
// Helper function to initialize a range object.
|
||||
//
|
||||
__attribute__((nonnull(1,2,3,6)))
|
||||
static pat_t *new_range(file_t *f, const char *start, const char *end, size_t min, ssize_t max, pat_t *repeating, pat_t *sep)
|
||||
{
|
||||
size_t minlen = min*repeating->min_matchlen + (min > 0 ? min-1 : 0)*(sep ? sep->min_matchlen : 0);
|
||||
@ -69,6 +60,7 @@ static pat_t *new_range(file_t *f, const char *start, const char *end, size_t mi
|
||||
// Take a pattern and expand it into a chain of patterns if it's followed by
|
||||
// any patterns (e.g. "`x `y"), otherwise return the original input.
|
||||
//
|
||||
__attribute__((nonnull))
|
||||
static pat_t *expand_chain(file_t *f, pat_t *first, bool allow_nl)
|
||||
{
|
||||
const char *str = after_spaces(first->end, allow_nl);
|
||||
@ -84,6 +76,7 @@ static pat_t *expand_chain(file_t *f, pat_t *first, bool allow_nl)
|
||||
//
|
||||
// Match trailing => replacements (with optional pattern beforehand)
|
||||
//
|
||||
__attribute__((nonnull))
|
||||
static pat_t *expand_replacements(file_t *f, pat_t *replace_pat, bool allow_nl)
|
||||
{
|
||||
const char *str = replace_pat->end;
|
||||
@ -94,12 +87,12 @@ static pat_t *expand_replacements(file_t *f, pat_t *replace_pat, bool allow_nl)
|
||||
|| matchchar(&str, '{', allow_nl) || matchchar(&str, '\002', allow_nl)) {
|
||||
char closequote = str[-1] == '{' ? '}' : (str[-1] == '\002' ? '\003' : str[-1]);
|
||||
repstr = str;
|
||||
for (; *str && *str != closequote; str = next_char(f, str)) {
|
||||
for (; str < f->end && *str != closequote; str = next_char(str, f->end)) {
|
||||
if (*str == '\\') {
|
||||
if (!str[1] || str[1] == '\n')
|
||||
file_err(f, str, str+1,
|
||||
"There should be an escape sequence after this backslash.");
|
||||
str = next_char(f, str);
|
||||
str = next_char(str, f->end);
|
||||
}
|
||||
}
|
||||
replen = (size_t)(str-repstr);
|
||||
@ -124,6 +117,7 @@ static pat_t *expand_replacements(file_t *f, pat_t *replace_pat, bool allow_nl)
|
||||
// chain of choices if it's followed by any "/"-separated patterns (e.g.
|
||||
// "`x/`y"), otherwise return the original input.
|
||||
//
|
||||
__attribute__((nonnull))
|
||||
static pat_t *expand_choices(file_t *f, pat_t *first, bool allow_nl)
|
||||
{
|
||||
first = expand_chain(f, first, allow_nl);
|
||||
@ -191,54 +185,23 @@ pat_t *either_pat(file_t *f, pat_t *first, pat_t *second)
|
||||
return either;
|
||||
}
|
||||
|
||||
//
|
||||
// Wrapper for _bp_simplepattern() that expands any postfix operators (~, !~)
|
||||
//
|
||||
static pat_t *bp_simplepattern(file_t *f, const char *str)
|
||||
{
|
||||
pat_t *pat = _bp_simplepattern(f, str);
|
||||
if (pat == NULL) return pat;
|
||||
str = pat->end;
|
||||
|
||||
// Expand postfix operators (if any)
|
||||
while (str < f->end) {
|
||||
enum pattype_e type;
|
||||
if (matchchar(&str, '~', false))
|
||||
type = BP_MATCH;
|
||||
else if (matchstr(&str, "!~", false))
|
||||
type = BP_NOT_MATCH;
|
||||
else break;
|
||||
|
||||
pat_t *first = pat;
|
||||
pat_t *second = bp_simplepattern(f, str);
|
||||
if (!second)
|
||||
file_err(f, str, str, "The '%s' operator expects a pattern before and after.", type == BP_MATCH ? "~" : "!~");
|
||||
|
||||
pat = new_pat(f, str, second->end, first->min_matchlen, first->max_matchlen, type);
|
||||
pat->args.multiple.first = first;
|
||||
pat->args.multiple.second = second;
|
||||
str = pat->end;
|
||||
}
|
||||
|
||||
return pat;
|
||||
}
|
||||
|
||||
//
|
||||
// Compile a string of BP code into a BP pattern object.
|
||||
//
|
||||
__attribute__((nonnull))
|
||||
static pat_t *_bp_simplepattern(file_t *f, const char *str)
|
||||
{
|
||||
str = after_spaces(str, false);
|
||||
if (!*str) return NULL;
|
||||
const char *start = str;
|
||||
char c = *str;
|
||||
str = next_char(f, str);
|
||||
str = next_char(str, f->end);
|
||||
switch (c) {
|
||||
// Any char (dot)
|
||||
case '.': {
|
||||
if (*str == '.') { // ".."
|
||||
pat_t *skip = NULL;
|
||||
str = next_char(f, str);
|
||||
str = next_char(str, f->end);
|
||||
char skipper = *str;
|
||||
if (matchchar(&str, '%', false) || matchchar(&str, '=', false)) {
|
||||
skip = bp_simplepattern(f, str);
|
||||
@ -261,11 +224,11 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
|
||||
file_err(f, str, str, "There should be a character here after the '`'");
|
||||
|
||||
const char *c1_loc = str;
|
||||
str = next_char(f, c1_loc);
|
||||
str = next_char(c1_loc, f->end);
|
||||
if (*str == '-') { // Range
|
||||
const char *c2_loc = ++str;
|
||||
if (next_char(f, c1_loc) > c1_loc+1 || next_char(f, c2_loc) > c2_loc+1)
|
||||
file_err(f, start, next_char(f, c2_loc), "Sorry, UTF-8 character ranges are not yet supported.");
|
||||
if (next_char(c1_loc, f->end) > c1_loc+1 || next_char(c2_loc, f->end) > c2_loc+1)
|
||||
file_err(f, start, next_char(c2_loc, f->end), "Sorry, UTF-8 character ranges are not yet supported.");
|
||||
char c1 = *c1_loc, c2 = *c2_loc;
|
||||
if (!c2 || c2 == '\n')
|
||||
file_err(f, str, str, "There should be a character here to complete the character range.");
|
||||
@ -274,7 +237,7 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
|
||||
c1 = c2;
|
||||
c2 = tmp;
|
||||
}
|
||||
str = next_char(f, c2_loc);
|
||||
str = next_char(c2_loc, f->end);
|
||||
pat_t *pat = new_pat(f, start == c1_loc - 1 ? start : c1_loc, str, 1, 1, BP_RANGE);
|
||||
pat->args.range.low = (unsigned char)c1;
|
||||
pat->args.range.high = (unsigned char)c2;
|
||||
@ -318,8 +281,8 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
|
||||
unsigned char e_high = e_low;
|
||||
if (*str == '-') { // Escape range (e.g. \x00-\xFF)
|
||||
++str;
|
||||
if (next_char(f, str) != str+1)
|
||||
file_err(f, start, next_char(f, str), "Sorry, UTF8 escape sequences are not supported in ranges.");
|
||||
if (next_char(str, f->end) != str+1)
|
||||
file_err(f, start, next_char(str, f->end), "Sorry, UTF8 escape sequences are not supported in ranges.");
|
||||
const char *seqstart = str;
|
||||
e_high = (unsigned char)unescapechar(str, &str);
|
||||
if (str == seqstart)
|
||||
@ -331,7 +294,7 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
|
||||
esc->args.range.low = e_low;
|
||||
esc->args.range.high = e_high;
|
||||
all = either_pat(f, all, esc);
|
||||
} while (*str++ == ',');
|
||||
} while (*str == ',' && str++ < f->end);
|
||||
|
||||
return all;
|
||||
}
|
||||
@ -344,9 +307,9 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
|
||||
char endquote = c == '\002' ? '\003' : (c == '{' ? '}' : c);
|
||||
char *litstart = (char*)str;
|
||||
while (str < f->end && *str != endquote)
|
||||
str = next_char(f, str);
|
||||
str = next_char(str, f->end);
|
||||
size_t len = (size_t)(str - litstart);
|
||||
str = next_char(f, str);
|
||||
str = next_char(str, f->end);
|
||||
|
||||
pat_t *pat = new_pat(f, start, str, len, (ssize_t)len, BP_STRING);
|
||||
pat->args.string = litstart;
|
||||
@ -528,10 +491,10 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
|
||||
pat_t *bp_stringpattern(file_t *f, const char *str)
|
||||
{
|
||||
pat_t *ret = NULL;
|
||||
while (*str) {
|
||||
while (str < f->end) {
|
||||
char *start = (char*)str;
|
||||
pat_t *interp = NULL;
|
||||
for (; str < f->end; str = next_char(f, str)) {
|
||||
for (; str < f->end; str = next_char(str, f->end)) {
|
||||
if (*str == '\\' && str+1 < f->end) {
|
||||
if (str[1] == '\\' || isalnum(str[1]))
|
||||
interp = bp_simplepattern(f, str);
|
||||
@ -558,6 +521,38 @@ pat_t *bp_stringpattern(file_t *f, const char *str)
|
||||
return ret;
|
||||
}
|
||||
|
||||
//
|
||||
// Wrapper for _bp_simplepattern() that expands any postfix operators (~, !~)
// Returns NULL when _bp_simplepattern() finds no pattern at `str`. Otherwise
// returns that pattern, wrapped in a BP_MATCH / BP_NOT_MATCH node for each
// `~` / `!~` operator that follows it.
|
||||
//
|
||||
static pat_t *bp_simplepattern(file_t *f, const char *str)
|
||||
{
|
||||
pat_t *pat = _bp_simplepattern(f, str);
|
||||
if (pat == NULL) return pat;
|
||||
str = pat->end;
|
||||
|
||||
// Expand postfix operators (if any)
|
||||
while (str < f->end) {
|
||||
enum pattype_e type;
|
||||
// `~` is tested before `!~`; anything else ends the operator chain.
if (matchchar(&str, '~', false))
|
||||
type = BP_MATCH;
|
||||
else if (matchstr(&str, "!~", false))
|
||||
type = BP_NOT_MATCH;
|
||||
else break;
|
||||
|
||||
pat_t *first = pat;
|
||||
// The right-hand operand is parsed by a recursive call, which consumes any
// further `~` / `!~` operators itself, so chains nest to the right.
pat_t *second = bp_simplepattern(f, str);
|
||||
if (!second)
|
||||
// NOTE(review): `second` is dereferenced right after this, so file_err()
// presumably does not return -- confirm.
file_err(f, str, str, "The '%s' operator expects a pattern before and after.", type == BP_MATCH ? "~" : "!~");
|
||||
|
||||
// The combined pattern takes its min/max match lengths from the left operand.
// NOTE(review): its start is `str` (presumably just past the operator, if
// matchchar()/matchstr() advance it), not first->start -- confirm intended.
pat = new_pat(f, str, second->end, first->min_matchlen, first->max_matchlen, type);
|
||||
pat->args.multiple.first = first;
|
||||
pat->args.multiple.second = second;
|
||||
str = pat->end;
|
||||
}
|
||||
|
||||
return pat;
|
||||
}
|
||||
|
||||
//
|
||||
// Given a pattern and a replacement string, compile the two into a BP
|
||||
// replace pattern.
|
||||
@ -567,7 +562,7 @@ pat_t *bp_replacement(file_t *f, pat_t *replacepat, const char *replacement)
|
||||
pat_t *pat = new_pat(f, replacepat->start, replacepat->end, replacepat->min_matchlen, replacepat->max_matchlen, BP_REPLACE);
|
||||
pat->args.replace.pat = replacepat;
|
||||
const char *p = replacement;
|
||||
for (; *p; p++) {
|
||||
for (; p < f->end; p++) {
|
||||
if (*p == '\\') {
|
||||
if (!p[1] || p[1] == '\n')
|
||||
file_err(f, p, p, "There should be an escape sequence or pattern here after this backslash.");
|
||||
|
@ -9,7 +9,7 @@
|
||||
|
||||
__attribute__((returns_nonnull, nonnull(1,2)))
|
||||
pat_t *new_pat(file_t *f, const char *start, const char *end, size_t minlen, ssize_t maxlen, enum pattype_e type);
|
||||
__attribute__((nonnull(1,2)))
|
||||
__attribute__((nonnull))
|
||||
pat_t *bp_stringpattern(file_t *f, const char *str);
|
||||
__attribute__((nonnull(1,2)))
|
||||
pat_t *bp_replacement(file_t *f, pat_t *replacepat, const char *replacement);
|
||||
|
45
utf8.c
45
utf8.c
@ -3,8 +3,9 @@
|
||||
//
|
||||
#include <ctype.h>
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "files.h"
|
||||
#include "utf8.h"
|
||||
|
||||
#define ARRAY_LEN(a) (sizeof(a)/sizeof((a)[0]))
|
||||
@ -181,39 +182,39 @@ static const uint32_t XID_Continue_only[][2] = {
|
||||
// Return the location of the next character or UTF8 codepoint.
|
||||
// (i.e. skip forward one codepoint at a time, not one byte at a time)
// The step size is chosen from the lead byte str[0] alone; the bytes that
// follow it are not inspected. A multi-byte step is taken only when the whole
// sequence fits before the end pointer. The result never exceeds the end
// pointer.
|
||||
//
|
||||
const char *next_char(file_t *f, const char *str)
|
||||
const char *next_char(const char *str, const char *end)
|
||||
{
|
||||
// High bit clear: single-byte character.
if (likely(str+1 <= f->end) && likely((str[0] & 0x80) == 0x0))
|
||||
if (likely(str+1 <= end) && likely((str[0] & 0x80) == 0x0))
|
||||
return str+1;
|
||||
// Lead byte 110xxxxx: two-byte sequence.
if (likely(str+2 <= f->end) && (str[0] & 0xe0) == 0xc0)
|
||||
if (likely(str+2 <= end) && (str[0] & 0xe0) == 0xc0)
|
||||
return str+2;
|
||||
// Lead byte 1110xxxx: three-byte sequence.
if (likely(str+3 <= f->end) && (str[0] & 0xf0) == 0xe0)
|
||||
if (likely(str+3 <= end) && (str[0] & 0xf0) == 0xe0)
|
||||
return str+3;
|
||||
// Lead byte 11110xxx: four-byte sequence.
if (likely(str+4 <= f->end) && (str[0] & 0xf8) == 0xf0)
|
||||
if (likely(str+4 <= end) && (str[0] & 0xf8) == 0xf0)
|
||||
return str+4;
|
||||
// No pattern matched, or the sequence would run past the end: step a single
// byte, clamped to the end pointer.
return likely(str+1 <= f->end) ? str+1 : f->end;
|
||||
return likely(str+1 <= end) ? str+1 : end;
|
||||
}
|
||||
|
||||
//
|
||||
// Return the location of the previous character or UTF8 codepoint.
|
||||
// (i.e. skip backwards one codepoint at a time, not one byte at a time)
// Looks back 1 to 4 bytes for a byte that has the bit pattern of a lead byte
// for a sequence of exactly that length; the bytes in between are not
// inspected. The result is never before the start pointer.
// Note the argument order in the two-pointer form: the lower bound comes
// first here, whereas next_char() takes its bound second.
|
||||
//
|
||||
const char *prev_char(file_t *f, const char *str)
|
||||
const char *prev_char(const char *start, const char *str)
|
||||
{
|
||||
// Previous byte has its high bit clear: single-byte character.
if (likely(str-1 >= f->start) && likely((str[-1] & 0x80) == 0x0))
|
||||
if (likely(str-1 >= start) && likely((str[-1] & 0x80) == 0x0))
|
||||
return str-1;
|
||||
// Two bytes back is a 110xxxxx lead byte: two-byte sequence.
if (likely(str-2 >= f->start) && (str[-2] & 0xe0) == 0xc0)
|
||||
if (likely(str-2 >= start) && (str[-2] & 0xe0) == 0xc0)
|
||||
return str-2;
|
||||
// Three bytes back is a 1110xxxx lead byte: three-byte sequence.
if (likely(str-3 >= f->start) && (str[-3] & 0xf0) == 0xe0)
|
||||
if (likely(str-3 >= start) && (str[-3] & 0xf0) == 0xe0)
|
||||
return str-3;
|
||||
// Four bytes back is a 11110xxx lead byte: four-byte sequence.
if (likely(str-4 >= f->start) && (str[-4] & 0xf8) == 0xf0)
|
||||
if (likely(str-4 >= start) && (str[-4] & 0xf8) == 0xf0)
|
||||
return str-4;
|
||||
// Nothing matched: step back a single byte, clamped to the start pointer.
return likely(str-1 >= f->start) ? str-1 : f->start;
|
||||
return likely(str-1 >= start) ? str-1 : start;
|
||||
}
|
||||
|
||||
static uint32_t get_codepoint(file_t *f, const char *str)
|
||||
static uint32_t get_codepoint(const char *str, const char *end)
|
||||
{
|
||||
if (str >= f->end)
|
||||
if (unlikely(str >= end))
|
||||
return (uint32_t)-1;
|
||||
|
||||
unsigned char c1 = (unsigned char)str[0];
|
||||
@ -235,7 +236,7 @@ static uint32_t get_codepoint(file_t *f, const char *str)
|
||||
}
|
||||
|
||||
for (int i = 1; i < seqlen; ++i) {
|
||||
if (unlikely(&str[i] >= f->end || (str[i] & 0xC0) != 0x80))
|
||||
if (unlikely((&str[i] >= end) || (str[i] & 0xC0) != 0x80))
|
||||
return (uint32_t)-1;
|
||||
codepoint = ((codepoint << 6) | (uint32_t)(str[i] & 0x3F));
|
||||
}
|
||||
@ -259,22 +260,22 @@ static bool find_in_ranges(uint32_t codepoint, const uint32_t ranges[][2], size_
|
||||
return false;
|
||||
}
|
||||
|
||||
// Return whether the character at `str` may begin an identifier: an ASCII
// letter or '_', or a non-ASCII codepoint found in the XID_Start table.
// Returns false at or past the end pointer and for undecodable sequences.
bool isidstart(file_t *f, const char *str)
|
||||
bool isidstart(const char *str, const char *end)
|
||||
{
|
||||
if (unlikely(str >= f->end)) return false;
|
||||
if (unlikely(str >= end)) return false;
|
||||
// NOTE(review): isalpha() receives a plain `char` here, before the high-bit
// test on the next line. Where `char` is signed, bytes >= 0x80 arrive as
// negative values, which <ctype.h> does not permit; consider casting to
// unsigned char.
else if (isalpha(*str) || *str == '_') return true;
|
||||
// Any other byte with the high bit clear is ASCII that cannot start an identifier.
else if (likely((*str & 0x80) == 0)) return false;
|
||||
uint32_t codepoint = get_codepoint(f, str);
|
||||
uint32_t codepoint = get_codepoint(str, end);
|
||||
// (uint32_t)-1 is the value get_codepoint() returns when it cannot decode.
return codepoint != (uint32_t)-1
|
||||
&& find_in_ranges(codepoint, XID_Start, ARRAY_LEN(XID_Start));
|
||||
}
|
||||
|
||||
bool isidcontinue(file_t *f, const char *str)
|
||||
bool isidcontinue(const char *str, const char *end)
|
||||
{
|
||||
if (unlikely(str >= f->end)) return false;
|
||||
if (unlikely(str >= end)) return false;
|
||||
else if (isalnum(*str) || *str == '_') return true;
|
||||
else if (likely((*str & 0x80) == 0)) return false;
|
||||
uint32_t codepoint = get_codepoint(f, str);
|
||||
uint32_t codepoint = get_codepoint(str, end);
|
||||
return codepoint != (uint32_t)-1
|
||||
&& (find_in_ranges(codepoint, XID_Start, ARRAY_LEN(XID_Start))
|
||||
|| find_in_ranges(codepoint, XID_Continue_only, ARRAY_LEN(XID_Continue_only)));
|
||||
|
12
utf8.h
12
utf8.h
@ -1,21 +1,21 @@
|
||||
//
|
||||
// utf8.h - UTF8 helper functions
|
||||
//
|
||||
#include "files.h"
|
||||
|
||||
#ifndef UTF8__H
|
||||
#define UTF8__H
|
||||
|
||||
#include <stdbool.h>
|
||||
|
||||
#define UTF8_MAXCHARLEN 4
|
||||
|
||||
__attribute__((nonnull, pure))
|
||||
const char *next_char(file_t *f, const char *str);
|
||||
const char *next_char(const char *str, const char *end);
|
||||
__attribute__((nonnull, pure))
|
||||
const char *prev_char(file_t *f, const char *str);
|
||||
const char *prev_char(const char *start, const char *str);
|
||||
__attribute__((nonnull, pure))
|
||||
bool isidstart(file_t *f, const char *str);
|
||||
bool isidstart(const char *str, const char *end);
|
||||
__attribute__((nonnull, pure))
|
||||
bool isidcontinue(file_t *f, const char *str);
|
||||
bool isidcontinue(const char *str, const char *end);
|
||||
|
||||
#endif
|
||||
// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0
|
||||
|
Loading…
Reference in New Issue
Block a user