From d3c1526f71a083b98b7ac8833434cd0347d8fc7f Mon Sep 17 00:00:00 2001
From: Bruce Hill <bruce@bruce-hill.com>
Date: Sun, 1 Aug 2021 13:41:13 -0700
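Subject: Cleanup of space/line skipping code

Replace the SKIP_NL_SPACES macro with a `skip_nl` flag on after_spaces(),
matchchar() and matchstr(), so that each caller states explicitly whether
newlines may be skipped along with spaces and tabs.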

---
 bp.c          |   2 +-
 definitions.c |   3 +-
 pattern.c     | 112 ++++++++++++++++++++++++++--------------------------------
 utils.c       |  17 +++++----
 utils.h       |   6 ++--
 5 files changed, 66 insertions(+), 74 deletions(-)

diff --git a/bp.c b/bp.c
index b909c2b..6dec579 100644
--- a/bp.c
+++ b/bp.c
@@ -562,7 +562,7 @@ int main(int argc, char *argv[])
             if (!s) {
                 fprint_line(stdout, arg_file, arg_file->start, arg_file->end,
                             "Failed to compile the skip argument");
-            } else if (after_spaces(s->end) < arg_file->end) {
+            } else if (after_spaces(s->end, true) < arg_file->end) {
                 fprint_line(stdout, arg_file, s->end, arg_file->end,
                             "Failed to compile part of the skip argument");
             }
diff --git a/definitions.c b/definitions.c
index 8216e1d..43bfe3c 100644
--- a/definitions.c
+++ b/definitions.c
@@ -33,8 +33,7 @@ def_t *with_def(def_t *defs, size_t namelen, const char *name, pat_t *pat)
 //
 def_t *load_grammar(def_t *defs, file_t *f)
 {
-    const char *str = after_spaces(f->start);
-    while (*str == '\r' || *str == '\n') str = after_spaces(++str);
+    const char *str = after_spaces(f->start, true);
     pat_t *pat = bp_pattern(f, str);
     if (!pat) file_err(f, str, f->end, "Could not parse this file");
     if (pat->end < f->end) file_err(f, pat->end, f->end, "Could not parse this part of the file");
diff --git a/pattern.c b/pattern.c
index 48a45a4..a521389 100644
--- a/pattern.c
+++ b/pattern.c
@@ -28,8 +28,6 @@ static pat_t *new_range(file_t *f, const char *start, const char *end, size_t mi
 __attribute__((nonnull(1,2)))
 static pat_t *bp_simplepattern(file_t *f, const char *str);
 
-#define SKIP_NL_SPACES(str) for (str = after_spaces(str); *str == '\n' || *str == '\r'; ) str = after_spaces(++str)
-
 //
 // Allocate a new pattern for this file (ensuring it will be automatically
 // freed when the file is freed)
@@ -73,8 +71,7 @@ static pat_t *new_range(file_t *f, const char *start, const char *end, size_t mi
 //
 static pat_t *expand_chain(file_t *f, pat_t *first, bool allow_nl)
 {
-    const char *str = first->end;
-    if (allow_nl) SKIP_NL_SPACES(str);
+    const char *str = after_spaces(first->end, allow_nl);
     pat_t *second = bp_simplepattern(f, str);
     if (second == NULL) return first;
     second = expand_chain(f, second, allow_nl);
@@ -90,12 +87,10 @@ static pat_t *expand_chain(file_t *f, pat_t *first, bool allow_nl)
 static pat_t *expand_replacements(file_t *f, pat_t *replace_pat, bool allow_nl)
 {
     const char *str = replace_pat->end;
-    if (allow_nl) SKIP_NL_SPACES(str);
-    while (matchstr(&str, "=>")) {
-        if (allow_nl) SKIP_NL_SPACES(str);
+    while (matchstr(&str, "=>", allow_nl)) {
         const char *repstr;
         size_t replen;
-        if (matchchar(&str, '"') || matchchar(&str, '\'')) {
+        if (matchchar(&str, '"', allow_nl) || matchchar(&str, '\'', allow_nl)) {
             char quote = str[-1];
             repstr = str;
             for (; *str && *str != quote; str = next_char(f, str)) {
@@ -107,7 +102,7 @@ static pat_t *expand_replacements(file_t *f, pat_t *replace_pat, bool allow_nl)
                 }
             }
             replen = (size_t)(str-repstr);
-            (void)matchchar(&str, quote);
+            (void)matchchar(&str, quote, true);
         } else {
             repstr = "";
             replen = 0;
@@ -133,15 +128,11 @@ static pat_t *expand_choices(file_t *f, pat_t *first, bool allow_nl)
     first = expand_chain(f, first, allow_nl);
     first = expand_replacements(f, first, allow_nl);
     const char *str = first->end;
-    if (allow_nl) SKIP_NL_SPACES(str);
-    if (!matchchar(&str, '/')) return first;
-    if (allow_nl) SKIP_NL_SPACES(str);
+    if (!matchchar(&str, '/', allow_nl)) return first;
+    str = after_spaces(str, allow_nl);
     pat_t *second = bp_simplepattern(f, str);
-    if (second) {
-        str = second->end;
-        if (allow_nl) SKIP_NL_SPACES(str);
-    }
-    if (matchstr(&str, "=>"))
+    if (second) str = second->end;
+    if (matchstr(&str, "=>", allow_nl))
         second = expand_replacements(f, second ? second : new_pat(f, str-2, str-2, 0, 0, BP_STRING), allow_nl);
     if (!second)
         file_err(f, str, str, "There should be a pattern here after a '/'");
@@ -211,12 +202,11 @@ static pat_t *bp_simplepattern(file_t *f, const char *str)
         errx(EXIT_FAILURE, "pat->end is uninitialized!");
 
     // Expand postfix operators (if any)
-    str = after_spaces(pat->end);
-    while (str+2 < f->end) {
+    while (str < f->end) {
         enum pattype_e type;
-        if (matchchar(&str, '~'))
+        if (matchchar(&str, '~', false))
             type = BP_MATCH;
-        else if (matchstr(&str, "!~"))
+        else if (matchstr(&str, "!~", false))
             type = BP_NOT_MATCH;
         else break;
 
@@ -229,7 +219,6 @@ static pat_t *bp_simplepattern(file_t *f, const char *str)
         pat->args.multiple.first = first;
         pat->args.multiple.second = second;
         str = pat->end;
-        str = after_spaces(str);
     }
 
     return pat;
@@ -240,7 +229,7 @@ static pat_t *bp_simplepattern(file_t *f, const char *str)
 //
 static pat_t *_bp_simplepattern(file_t *f, const char *str)
 {
-    str = after_spaces(str);
+    str = after_spaces(str, false);
     if (!*str) return NULL;
     const char *start = str;
     char c = *str;
@@ -252,7 +241,7 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
                 pat_t *skip = NULL;
                 str = next_char(f, str);
                 char skipper = *str;
-                if (matchchar(&str, '%') || matchchar(&str, '=')) {
+                if (matchchar(&str, '%', false) || matchchar(&str, '=', false)) {
                     skip = bp_simplepattern(f, str);
                     if (!skip)
                         file_err(f, str, str, "There should be a pattern to skip here after the '%c'", skipper);
@@ -274,8 +263,8 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
 
                 const char *c1_loc = str;
                 str = next_char(f, c1_loc);
-                if (matchchar(&str, '-')) { // Range
-                    const char *c2_loc = str;
+                if (*str == '-') { // Range
+                    const char *c2_loc = ++str;
                     if (next_char(f, c1_loc) > c1_loc+1 || next_char(f, c2_loc) > c2_loc+1)
                         file_err(f, start, next_char(f, c2_loc), "Sorry, UTF-8 character ranges are not yet supported.");
                     char c1 = *c1_loc, c2 = *c2_loc;
@@ -297,7 +286,7 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
                     pat->args.string = c1_loc;
                     all = either_pat(f, all, pat);
                 }
-            } while (matchchar(&str, ','));
+            } while (*str++ == ',');
 
             return all;
         }
@@ -308,17 +297,18 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
 
             pat_t *all = NULL;
             do { // Comma-separated items:
-                if (matchchar(&str, 'N')) { // \N (nodent)
-                    all = either_pat(f, all, new_pat(f, start, str, 1, -1, BP_NODENT));
+                const char *itemstart = str-1;
+                if (*str == 'N') { // \N (nodent)
+                    all = either_pat(f, all, new_pat(f, itemstart, ++str, 1, -1, BP_NODENT));
                     continue;
-                } else if (matchchar(&str, 'i')) { // \i (identifier char)
-                    all = either_pat(f, all, new_pat(f, start, str, 1, -1, BP_ID_CONTINUE));
+                } else if (*str == 'i') { // \i (identifier char)
+                    all = either_pat(f, all, new_pat(f, itemstart, ++str, 1, -1, BP_ID_CONTINUE));
                     continue;
-                } else if (matchchar(&str, 'I')) { // \I (identifier char, not including numbers)
-                    all = either_pat(f, all, new_pat(f, start, str, 1, -1, BP_ID_START));
+                } else if (*str == 'I') { // \I (identifier char, not including numbers)
+                    all = either_pat(f, all, new_pat(f, itemstart, ++str, 1, -1, BP_ID_START));
                     continue;
-                } else if (matchchar(&str, 'b')) { // \b word boundary
-                    all = either_pat(f, all, new_pat(f, start, str, 0, 0, BP_WORD_BOUNDARY));
+                } else if (*str == 'b') { // \b word boundary
+                    all = either_pat(f, all, new_pat(f, itemstart, ++str, 0, 0, BP_WORD_BOUNDARY));
                     continue;
                 }
 
@@ -327,7 +317,8 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
                 if (str == opstart)
                     file_err(f, start, str+1, "This isn't a valid escape sequence.");
                 unsigned char e_high = e_low;
-                if (matchchar(&str, '-')) { // Escape range (e.g. \x00-\xFF)
+                if (*str == '-') { // Escape range (e.g. \x00-\xFF)
+                    ++str;
                     if (next_char(f, str) != str+1)
                         file_err(f, start, next_char(f, str), "Sorry, UTF8 escape sequences are not supported in ranges.");
                     const char *seqstart = str;
@@ -341,7 +332,7 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
                 esc->args.range.low = e_low;
                 esc->args.range.high = e_high;
                 all = either_pat(f, all, esc);
-            } while (matchchar(&str, ','));
+            } while (*str++ == ',');
 
             return all;
         }
@@ -377,13 +368,13 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
             ssize_t max = -1;
             --str;
             long n1 = strtol(str, (char**)&str, 10);
-            if (matchchar(&str, '-')) {
-                str = after_spaces(str);
+            if (matchchar(&str, '-', false)) {
+                str = after_spaces(str, false);
                 const char *numstart = str;
                 long n2 = strtol(str, (char**)&str, 10);
                 if (str == numstart) min = 0, max = (ssize_t)n1;
                 else min = (size_t)n1, max = (ssize_t)n2;
-            } else if (matchchar(&str, '+')) {
+            } else if (matchchar(&str, '+', false)) {
                 min = (size_t)n1, max = -1;
             } else {
                 min = (size_t)n1, max = (ssize_t)n1;
@@ -393,7 +384,7 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
                 file_err(f, str, str, "There should be a pattern after this repetition count.");
             str = repeating->end;
             pat_t *sep = NULL;
-            if (matchchar(&str, '%')) {
+            if (matchchar(&str, '%', false)) {
                 sep = bp_simplepattern(f, str);
                 if (!sep)
                     file_err(f, str, str, "There should be a separator pattern after this '%%'");
@@ -426,7 +417,8 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
         }
         // Parentheses
         case '(': {
-            if (matchstr(&str, "!)")) { // (!) errors
+            if (start + 2 < f->end && strncmp(start, "(!)", 3) == 0) { // (!) errors
+                str = start + 3;
                 pat_t *pat = bp_simplepattern(f, str);
                 if (!pat) pat = new_pat(f, str, str, 0, 0, BP_STRING);
                 pat = expand_replacements(f, pat, false);
@@ -439,8 +431,7 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
             if (!pat)
                 file_err(f, str, str, "There should be a valid pattern after this parenthesis.");
             str = pat->end;
-            SKIP_NL_SPACES(str);
-            if (!matchchar(&str, ')')) file_err(f, str, str, "Missing paren: )");
+            if (!matchchar(&str, ')', true)) file_err(f, str, str, "Missing paren: )");
             pat->start = start;
             pat->end = str;
             return pat;
@@ -451,8 +442,7 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
             if (!maybe)
                 file_err(f, str, str, "There should be a valid pattern after this square bracket.");
             str = maybe->end;
-            SKIP_NL_SPACES(str);
-            (void)matchchar(&str, ']');
+            (void)matchchar(&str, ']', true);
             return new_range(f, start, str, 0, 1, maybe, NULL);
         }
         // Repeating
@@ -463,7 +453,7 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
                 file_err(f, str, str, "There should be a valid pattern here after the '%c'", c);
             str = repeating->end;
             pat_t *sep = NULL;
-            if (matchchar(&str, '%')) {
+            if (matchchar(&str, '%', false)) {
                 sep = bp_simplepattern(f, str);
                 if (!sep)
                     file_err(f, str, str, "There should be a separator pattern after the '%%' here.");
@@ -476,10 +466,11 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
             const char *name = NULL;
             size_t namelen = 0;
             const char *a = after_name(str);
-            if (a > str && after_spaces(a)[0] == '=' && after_spaces(a)[1] != '>') {
+            const char *eq = a;
+            if (a > str && !matchstr(&eq, "=>", false) && matchchar(&eq, '=', false)) {
                 name = str;
                 namelen = (size_t)(a-str);
-                str = after_spaces(a) + 1;
+                str = eq;
             }
             pat_t *pat = bp_simplepattern(f, str);
             if (!pat)
@@ -491,16 +482,16 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
             capture->args.capture.namelen = namelen;
             return capture;
         }
-        // Start of file/line:
+        // Start of file/line
         case '^': {
-            if (matchchar(&str, '^'))
-                return new_pat(f, start, str, 0, 0, BP_START_OF_FILE);
+            if (*str == '^')
+                return new_pat(f, start, ++str, 0, 0, BP_START_OF_FILE);
             return new_pat(f, start, str, 0, 0, BP_START_OF_LINE);
         }
         // End of file/line:
         case '$': {
-            if (matchchar(&str, '$'))
-                return new_pat(f, start, str, 0, 0, BP_END_OF_FILE);
+            if (*str == '$')
+                return new_pat(f, start, ++str, 0, 0, BP_END_OF_FILE);
             return new_pat(f, start, str, 0, 0, BP_END_OF_LINE);
         }
         default: {
@@ -508,12 +499,12 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
             if (!isalpha(c) && c != '_') return NULL;
             str = after_name(start);
             size_t namelen = (size_t)(str - start);
-            if (matchchar(&str, ':')) { // Definitions
+            if (matchchar(&str, ':', false)) { // Definitions
                 pat_t *def = bp_pattern_nl(f, str, false);
                 if (!def) file_err(f, str, f->end, "Could not parse this definition.");
                 str = def->end;
-                (void)matchchar(&str, ';'); // Optional semicolon
-                SKIP_NL_SPACES(str);
+                (void)matchchar(&str, ';', false); // Optional semicolon
+                str = after_spaces(str, true);
                 pat_t *pat = bp_pattern_nl(f, str, false);
                 if (pat) str = pat->end;
                 else pat = def;
@@ -562,7 +553,7 @@ pat_t *bp_stringpattern(file_t *f, const char *str)
             ret = chain_together(f, ret, interp);
             str = interp->end;
             // allow terminating seq
-            (void)matchchar(&str, ';');
+            (void)matchchar(&str, ';', false);
         }
     }
     return ret;
@@ -594,11 +585,10 @@ pat_t *bp_replacement(file_t *f, pat_t *replacepat, const char *replacement)
 
 static pat_t *bp_pattern_nl(file_t *f, const char *str, bool allow_nl)
 {
-    SKIP_NL_SPACES(str);
+    str = after_spaces(str, allow_nl);
     pat_t *pat = bp_simplepattern(f, str);
     if (pat != NULL) pat = expand_choices(f, pat, allow_nl);
-    SKIP_NL_SPACES(str);
-    if (matchstr(&str, "=>"))
+    if (matchstr(&str, "=>", allow_nl))
         pat = expand_replacements(f, pat ? pat : new_pat(f, str-2, str-2, 0, 0, BP_STRING), allow_nl);
     return pat;
 }
diff --git a/utils.c b/utils.c
index 98eb8b3..5ecbcf1 100644
--- a/utils.c
+++ b/utils.c
@@ -5,6 +5,7 @@
 #include <ctype.h>
 #include <err.h>
 #include <stdarg.h>
+#include <stdbool.h>
 #include <stdlib.h>
 #include <unistd.h>
 
@@ -14,12 +15,14 @@
 // Helper function to skip past all spaces (and comments)
 // Returns a pointer to the first non-space character.
 //
-const char *after_spaces(const char *str)
+const char *after_spaces(const char *str, bool skip_nl)
 {
     // Skip whitespace and comments:
   skip_whitespace:
     switch (*str) {
-        // case ' ': case '\r': case '\n': case '\t': {
+        case '\r': case '\n':
+            if (!skip_nl) break;
+            __attribute__ ((fallthrough));
         case ' ': case '\t': {
             ++str;
             goto skip_whitespace;
@@ -54,11 +57,11 @@ const char *after_name(const char *str)
 //
 // Check if a character is found and if so, move past it.
 //
-bool matchchar(const char **str, char c)
+bool matchchar(const char **str, char c, bool skip_nl)
 {
-    const char *next = after_spaces(*str);
+    const char *next = after_spaces(*str, skip_nl);
     if (*next == c) {
-        *str = &next[1];
+        *str = next + 1;
         return true;
     }
     return false;
@@ -67,9 +70,9 @@ bool matchchar(const char **str, char c)
 //
 // Check if a string is found and if so, move past it.
 //
-bool matchstr(const char **str, const char *target)
+bool matchstr(const char **str, const char *target, bool skip_nl)
 {
-    const char *next = after_spaces(*str);
+    const char *next = after_spaces(*str, skip_nl);
     if (strncmp(next, target, strlen(target)) == 0) {
         *str = &next[strlen(target)];
         return true;
diff --git a/utils.h b/utils.h
index 550fa0d..b645817 100644
--- a/utils.h
+++ b/utils.h
@@ -26,11 +26,11 @@ char unescapechar(const char *escaped, const char **end);
 __attribute__((pure, nonnull))
 const char *after_name(const char *str);
 __attribute__((pure, nonnull, returns_nonnull))
-const char *after_spaces(const char *str);
+const char *after_spaces(const char *str, bool skip_nl);
 __attribute__((nonnull))
-bool matchchar(const char **str, char c);
+bool matchchar(const char **str, char c, bool skip_nl);
 __attribute__((nonnull))
-bool matchstr(const char **str, const char *target);
+bool matchstr(const char **str, const char *target, bool skip_nl);
 __attribute__((returns_nonnull))
 void *check_nonnull(void *p, const char *err_msg, ...);
 int check_nonnegative(int i, const char *err_msg, ...);
-- 
cgit v1.2.3