Moved utf8 code into a C file, fixed some potential null deref issues

author: Bruce Hill <bruce@bruce-hill.com> 2021-05-31 12:38:42 -0700
committer: Bruce Hill <bruce@bruce-hill.com> 2021-05-31 12:38:42 -0700
commit: da6c8857d6bad131635a846e8177e7c00a4c224e (patch)
tree: 4adf3115388cdee07cdb169642133ef3a7ab2630
parent: 0443fbb06387138fc88be80104bef102246fdd25 (diff)
5 files changed, 53 insertions, 41 deletions
diff --git a/Makefile b/Makefile
index a107318..c033b25 100644
--- a/Makefile
+++ b/Makefile
@@ -2,14 +2,14 @@ NAME=bp
 CC=cc
 PREFIX=/usr/local
 SYSCONFDIR=/etc
-CFLAGS=-std=c99 -Werror -D_XOPEN_SOURCE=700 -D_GNU_SOURCE -D_POSIX_C_SOURCE=200809L
+CFLAGS=-std=c99 -Werror -D_XOPEN_SOURCE=700 -D_GNU_SOURCE -D_POSIX_C_SOURCE=200809L -flto
 CWARN=-Wall -Wpedantic -Wextra -Wsign-conversion -Wtype-limits -Wunused-result -Wnull-dereference
 EXTRA=
 G=
 O=-O3
 ALL_FLAGS=$(CFLAGS) -DBP_NAME="\"$(NAME)\"" $(EXTRA) $(CWARN) $(G) $(O)
 
-CFILES=pattern.c definitions.c utils.c match.c files.c print.c json.c
+CFILES=pattern.c definitions.c utils.c match.c files.c print.c json.c utf8.c
 OBJFILES=$(CFILES:.c=.o)
 
 all: $(NAME) bp.1
diff --git a/match.c b/match.c
index 25a08a7..bdf13df 100644
--- a/match.c
+++ b/match.c
@@ -36,6 +36,7 @@ static match_t *unused_matches = NULL;
 static match_t *in_use_matches = NULL;
 #endif
 
+__attribute__((nonnull(1)))
 static inline pat_t *deref(def_t *defs, pat_t *pat);
 __attribute__((returns_nonnull))
 static match_t *new_match(pat_t *pat, const char *start, const char *end, match_t *child);
@@ -52,10 +53,9 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
 // If the given pattern is a reference, look it up and return the referenced
 // pattern. This is used for an optimization to avoid repeated lookups.
 //
-__attribute__((nonnull, returns_nonnull))
 static inline pat_t *deref(def_t *defs, pat_t *pat)
 {
-    if (pat->type == BP_REF) {
+    if (pat && pat->type == BP_REF) {
         def_t *def = lookup(defs, pat->args.ref.len, pat->args.ref.name);
         if (def) pat = def->pat;
     }
@@ -297,6 +297,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
         }
         case BP_AFTER: {
             pat_t *back = deref(defs, pat->args.pat);
+            if (!back) return NULL;
 
             // We only care about the region from the backtrack pos up to the
             // current pos, so mock it out as a file slice.
diff --git a/print.c b/print.c
index 20be733..6dfef7b 100644
--- a/print.c
+++ b/print.c
@@ -314,13 +314,13 @@ static void _print_match(FILE *out, printer_t *pr, match_t *m)
                     // the replacement text contains newlines, this may get weird.
                     const char *line_start = get_line(
                         pr->file, get_line_number(pr->file, m->start));
-                    char denter = *line_start;
+                    char denter = line_start ? *line_start : '\t';
                     fputc('\n', out);
                     ++line;
                     pr->needs_line_number = 1;
                     print_line_number(out, pr, 0, pr->use_color ? color_replace : NULL);
                     if (denter == ' ' || denter == '\t') {
-                        for (const char *p = line_start; *p == denter && p < m->start; ++p)
+                        for (const char *p = line_start; p && *p == denter && p < m->start; ++p)
                             fputc(denter, out);
                     }
                     continue;
diff --git a/utf8.c b/utf8.c
new file mode 100644
index 0000000..25e0048
--- /dev/null
+++ b/utf8.c
@@ -0,0 +1,40 @@
+//
+// utf8.c - UTF8 helper functions
+//
+#include "files.h"
+#include "utf8.h"
+
+//
+// Return the location of the next character or UTF8 codepoint.
+// (i.e. skip forward one codepoint at a time, not one byte at a time)
+//
+const char *next_char(file_t *f, const char *str)
+{
+    if (__builtin_expect(str+1 <= f->end && (str[0] & 0x80) == 0x0, 1))
+        return str+1;
+    if (__builtin_expect(str+2 <= f->end && (str[0] & 0xe0) == 0xc0, 1))
+        return str+2;
+    if (__builtin_expect(str+3 <= f->end && (str[0] & 0xf0) == 0xe0, 1))
+        return str+3;
+    if (__builtin_expect(str+4 <= f->end && (str[0] & 0xf8) == 0xf0, 1))
+        return str+4;
+    return __builtin_expect(str+1 <= f->end, 1) ? str+1 : f->end;
+}
+
+//
+// Return the location of the previous character or UTF8 codepoint.
+// (i.e. skip backwards one codepoint at a time, not one byte at a time)
+//
+const char *prev_char(file_t *f, const char *str)
+{
+    if (__builtin_expect(str-1 >= f->start && (str[-1] & 0x80) == 0x0, 1))
+        return str-1;
+    if (__builtin_expect(str-2 >= f->start && (str[-2] & 0xe0) == 0xc0, 1))
+        return str-2;
+    if (__builtin_expect(str-3 >= f->start && (str[-3] & 0xf0) == 0xe0, 1))
+        return str-3;
+    if (__builtin_expect(str-4 >= f->start && (str[-4] & 0xf8) == 0xf0, 1))
+        return str-4;
+    return __builtin_expect(str-1 >= f->start, 1) ? str-1 : f->start;
+}
+// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1
diff --git a/utf8.h b/utf8.h
index 8d7d969..ae2df2a 100644
--- a/utf8.h
+++ b/utf8.h
@@ -1,46 +1,17 @@
 //
 // utf8.h - UTF8 helper functions
 //
+#include "files.h"
+
 #ifndef UTF8__H
 #define UTF8__H
 
-#include "files.h"
-
 #define UTF8_MAXCHARLEN 4
-//
-// Return the location of the next character or UTF8 codepoint.
-// (i.e. skip forward one codepoint at a time, not one byte at a time)
-//
-__attribute__((nonnull, pure))
-inline const char *next_char(file_t *f, const char *str)
-{
-    if (__builtin_expect(str+1 <= f->end && (str[0] & 0x80) == 0x0, 1))
-        return str+1;
-    if (__builtin_expect(str+2 <= f->end && (str[0] & 0xe0) == 0xc0, 1))
-        return str+2;
-    if (__builtin_expect(str+3 <= f->end && (str[0] & 0xf0) == 0xe0, 1))
-        return str+3;
-    if (__builtin_expect(str+4 <= f->end && (str[0] & 0xf8) == 0xf0, 1))
-        return str+4;
-    return __builtin_expect(str+1 <= f->end, 1) ? str+1 : f->end;
-}
 
-//
-// Return the location of the previous character or UTF8 codepoint.
-// (i.e. skip backwards one codepoint at a time, not one byte at a time)
-//
 __attribute__((nonnull, pure))
-inline const char *prev_char(file_t *f, const char *str)
-{
-    if (__builtin_expect(str-1 >= f->start && (str[-1] & 0x80) == 0x0, 1))
-        return str-1;
-    if (__builtin_expect(str-2 >= f->start && (str[-2] & 0xe0) == 0xc0, 1))
-        return str-2;
-    if (__builtin_expect(str-3 >= f->start && (str[-3] & 0xf0) == 0xe0, 1))
-        return str-3;
-    if (__builtin_expect(str-4 >= f->start && (str[-4] & 0xf8) == 0xf0, 1))
-        return str-4;
-    return __builtin_expect(str-1 >= f->start, 1) ? str-1 : f->start;
-}
+const char *next_char(file_t *f, const char *str);
+__attribute__((nonnull, pure))
+const char *prev_char(file_t *f, const char *str);
+
 #endif
 // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1
author	Bruce Hill <bruce@bruce-hill.com>	2021-05-31 12:38:42 -0700
committer	Bruce Hill <bruce@bruce-hill.com>	2021-05-31 12:38:42 -0700
commit	da6c8857d6bad131635a846e8177e7c00a4c224e (patch)
tree	4adf3115388cdee07cdb169642133ef3a7ab2630
parent	0443fbb06387138fc88be80104bef102246fdd25 (diff)