From da6c8857d6bad131635a846e8177e7c00a4c224e Mon Sep 17 00:00:00 2001 From: Bruce Hill Date: Mon, 31 May 2021 12:38:42 -0700 Subject: [PATCH] Moved utf8 code into a C file, fixed some potential null deref issues --- Makefile | 4 ++-- match.c | 5 +++-- print.c | 4 ++-- utf8.c | 40 ++++++++++++++++++++++++++++++++++++++++ utf8.h | 41 ++++++----------------------------------- 5 files changed, 53 insertions(+), 41 deletions(-) create mode 100644 utf8.c diff --git a/Makefile b/Makefile index a107318..c033b25 100644 --- a/Makefile +++ b/Makefile @@ -2,14 +2,14 @@ NAME=bp CC=cc PREFIX=/usr/local SYSCONFDIR=/etc -CFLAGS=-std=c99 -Werror -D_XOPEN_SOURCE=700 -D_GNU_SOURCE -D_POSIX_C_SOURCE=200809L +CFLAGS=-std=c99 -Werror -D_XOPEN_SOURCE=700 -D_GNU_SOURCE -D_POSIX_C_SOURCE=200809L -flto CWARN=-Wall -Wpedantic -Wextra -Wsign-conversion -Wtype-limits -Wunused-result -Wnull-dereference EXTRA= G= O=-O3 ALL_FLAGS=$(CFLAGS) -DBP_NAME="\"$(NAME)\"" $(EXTRA) $(CWARN) $(G) $(O) -CFILES=pattern.c definitions.c utils.c match.c files.c print.c json.c +CFILES=pattern.c definitions.c utils.c match.c files.c print.c json.c utf8.c OBJFILES=$(CFILES:.c=.o) all: $(NAME) bp.1 diff --git a/match.c b/match.c index 25a08a7..bdf13df 100644 --- a/match.c +++ b/match.c @@ -36,6 +36,7 @@ static match_t *unused_matches = NULL; static match_t *in_use_matches = NULL; #endif +__attribute__((nonnull(1))) static inline pat_t *deref(def_t *defs, pat_t *pat); __attribute__((returns_nonnull)) static match_t *new_match(pat_t *pat, const char *start, const char *end, match_t *child); @@ -52,10 +53,9 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool // If the given pattern is a reference, look it up and return the referenced // pattern. This is used for an optimization to avoid repeated lookups. // -__attribute__((nonnull, returns_nonnull)) static inline pat_t *deref(def_t *defs, pat_t *pat) { - if (pat->type == BP_REF) { + if (pat && pat->type == BP_REF) { def_t *def = lookup(defs, pat->args.ref.len, pat->args.ref.name); if (def) pat = def->pat; } @@ -297,6 +297,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool } case BP_AFTER: { pat_t *back = deref(defs, pat->args.pat); + if (!back) return NULL; // We only care about the region from the backtrack pos up to the // current pos, so mock it out as a file slice. diff --git a/print.c b/print.c index 20be733..6dfef7b 100644 --- a/print.c +++ b/print.c @@ -314,13 +314,13 @@ static void _print_match(FILE *out, printer_t *pr, match_t *m) // the replacement text contains newlines, this may get weird. const char *line_start = get_line( pr->file, get_line_number(pr->file, m->start)); - char denter = *line_start; + char denter = line_start ? *line_start : '\t'; fputc('\n', out); ++line; pr->needs_line_number = 1; print_line_number(out, pr, 0, pr->use_color ? color_replace : NULL); if (denter == ' ' || denter == '\t') { - for (const char *p = line_start; *p == denter && p < m->start; ++p) + for (const char *p = line_start; p && *p == denter && p < m->start; ++p) fputc(denter, out); } continue; diff --git a/utf8.c b/utf8.c new file mode 100644 index 0000000..25e0048 --- /dev/null +++ b/utf8.c @@ -0,0 +1,40 @@ +// +// utf8.c - UTF8 helper functions +// +#include "files.h" +#include "utf8.h" + +// +// Return the location of the next character or UTF8 codepoint. +// (i.e. skip forward one codepoint at a time, not one byte at a time) +// +const char *next_char(file_t *f, const char *str) +{ + if (__builtin_expect(str+1 <= f->end && (str[0] & 0x80) == 0x0, 1)) + return str+1; + if (__builtin_expect(str+2 <= f->end && (str[0] & 0xe0) == 0xc0, 1)) + return str+2; + if (__builtin_expect(str+3 <= f->end && (str[0] & 0xf0) == 0xe0, 1)) + return str+3; + if (__builtin_expect(str+4 <= f->end && (str[0] & 0xf8) == 0xf0, 1)) + return str+4; + return __builtin_expect(str+1 <= f->end, 1) ? str+1 : f->end; +} + +// +// Return the location of the previous character or UTF8 codepoint. +// (i.e. skip backwards one codepoint at a time, not one byte at a time) +// +const char *prev_char(file_t *f, const char *str) +{ + if (__builtin_expect(str-1 >= f->start && (str[-1] & 0x80) == 0x0, 1)) + return str-1; + if (__builtin_expect(str-2 >= f->start && (str[-2] & 0xe0) == 0xc0, 1)) + return str-2; + if (__builtin_expect(str-3 >= f->start && (str[-3] & 0xf0) == 0xe0, 1)) + return str-3; + if (__builtin_expect(str-4 >= f->start && (str[-4] & 0xf8) == 0xf0, 1)) + return str-4; + return __builtin_expect(str-1 >= f->start, 1) ? str-1 : f->start; +} +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1 diff --git a/utf8.h b/utf8.h index 8d7d969..ae2df2a 100644 --- a/utf8.h +++ b/utf8.h @@ -1,46 +1,17 @@ // // utf8.h - UTF8 helper functions // +#include "files.h" + #ifndef UTF8__H #define UTF8__H -#include "files.h" - #define UTF8_MAXCHARLEN 4 -// -// Return the location of the next character or UTF8 codepoint. -// (i.e. skip forward one codepoint at a time, not one byte at a time) -// -__attribute__((nonnull, pure)) -inline const char *next_char(file_t *f, const char *str) -{ - if (__builtin_expect(str+1 <= f->end && (str[0] & 0x80) == 0x0, 1)) - return str+1; - if (__builtin_expect(str+2 <= f->end && (str[0] & 0xe0) == 0xc0, 1)) - return str+2; - if (__builtin_expect(str+3 <= f->end && (str[0] & 0xf0) == 0xe0, 1)) - return str+3; - if (__builtin_expect(str+4 <= f->end && (str[0] & 0xf8) == 0xf0, 1)) - return str+4; - return __builtin_expect(str+1 <= f->end, 1) ? str+1 : f->end; -} -// -// Return the location of the previous character or UTF8 codepoint. -// (i.e. skip backwards one codepoint at a time, not one byte at a time) -// __attribute__((nonnull, pure)) -inline const char *prev_char(file_t *f, const char *str) -{ - if (__builtin_expect(str-1 >= f->start && (str[-1] & 0x80) == 0x0, 1)) - return str-1; - if (__builtin_expect(str-2 >= f->start && (str[-2] & 0xe0) == 0xc0, 1)) - return str-2; - if (__builtin_expect(str-3 >= f->start && (str[-3] & 0xf0) == 0xe0, 1)) - return str-3; - if (__builtin_expect(str-4 >= f->start && (str[-4] & 0xf8) == 0xf0, 1)) - return str-4; - return __builtin_expect(str-1 >= f->start, 1) ? str-1 : f->start; -} +const char *next_char(file_t *f, const char *str); +__attribute__((nonnull, pure)) +const char *prev_char(file_t *f, const char *str); + #endif // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1