Split out text parsing logic

author: Bruce Hill <bruce@bruce-hill.com> 2025-08-25 01:00:13 -0400
committer: Bruce Hill <bruce@bruce-hill.com> 2025-08-25 01:00:13 -0400
commit: 3cf0d5f0bee3787a3d46126d5c0e1310e35a7cb9 (patch)
tree: 035e97eaaf2d722daa6ffa0d447d0a8c34a5f0b9 /src/parse
parent: c859ed479227cee2cecedb83d74a40acf9758051 (diff)
5 files changed, 195 insertions, 175 deletions
diff --git a/src/parse/files.c b/src/parse/files.c
index def740dd..416adc92 100644
--- a/src/parse/files.c
+++ b/src/parse/files.c
@@ -14,6 +14,7 @@
 #include "files.h"
 #include "functions.h"
 #include "parse.h"
+#include "text.h"
 #include "utils.h"
 
 // The cache of {filename -> parsed AST} will hold at most this many entries:
diff --git a/src/parse/parse.c b/src/parse/parse.c
index 52399e93..d25f0817 100644
--- a/src/parse/parse.c
+++ b/src/parse/parse.c
@@ -1,17 +1,11 @@
 // Recursive descent parser for parsing code
 
 #include <gc.h>
-#include <stdarg.h>
 #include <stdbool.h>
 #include <string.h>
 
-#include "../unistr-fixed.h"
-#include <unictype.h>
-#include <uniname.h>
-
 #include "../ast.h"
 #include "../stdlib/print.h"
-#include "../stdlib/text.h"
 #include "../stdlib/util.h"
 #include "containers.h"
 #include "context.h"
@@ -20,11 +14,10 @@
 #include "functions.h"
 #include "numbers.h"
 #include "parse.h"
+#include "text.h"
 #include "types.h"
 #include "utils.h"
 
-static const char closing[128] = {['('] = ')', ['['] = ']', ['<'] = '>', ['{'] = '}'};
-
 int op_tightness[] = {
     [Power] = 9,
     [Multiply] = 8,
@@ -423,143 +416,6 @@ ast_t *parse_bool(parse_ctx_t *ctx, const char *pos) {
     else return NULL;
 }
 
-ast_list_t *_parse_text_helper(parse_ctx_t *ctx, const char **out_pos, char open_quote, char close_quote,
-                               char open_interp, bool allow_escapes) {
-    const char *pos = *out_pos;
-    int64_t starting_indent = get_indent(ctx, pos);
-    int64_t string_indent = starting_indent + SPACES_PER_INDENT;
-    ast_list_t *chunks = NULL;
-    Text_t chunk = EMPTY_TEXT;
-    const char *chunk_start = pos;
-    int depth = 1;
-    bool leading_newline = false;
-    int64_t plain_span_len = 0;
-#define FLUSH_PLAIN_SPAN()                                                                                             \
-    do {                                                                                                               \
-        if (plain_span_len > 0) {                                                                                      \
-            chunk = Texts(chunk, Text$from_strn(pos - plain_span_len, (size_t)plain_span_len));                        \
-            plain_span_len = 0;                                                                                        \
-        }                                                                                                              \
-    } while (0)
-    for (const char *end = ctx->file->text + ctx->file->len; pos < end && depth > 0;) {
-        const char *after_indentation = pos;
-        if (*pos == open_interp) { // Interpolation
-            FLUSH_PLAIN_SPAN();
-            const char *interp_start = pos;
-            if (chunk.length > 0) {
-                ast_t *literal = NewAST(ctx->file, chunk_start, pos, TextLiteral, .text = chunk);
-                chunks = new (ast_list_t, .ast = literal, .next = chunks);
-                chunk = EMPTY_TEXT;
-            }
-            ++pos;
-            ast_t *interp;
-            if (*pos == ' ' || *pos == '\t')
-                parser_err(ctx, pos, pos + 1, "Whitespace is not allowed before an interpolation here");
-            interp = expect(ctx, interp_start, &pos, parse_term_no_suffix, "I expected an interpolation term here");
-            chunks = new (ast_list_t, .ast = interp, .next = chunks);
-            chunk_start = pos;
-        } else if (allow_escapes && *pos == '\\') {
-            FLUSH_PLAIN_SPAN();
-            const char *c = unescape(ctx, &pos);
-            chunk = Texts(chunk, Text$from_str(c));
-        } else if (!leading_newline && *pos == open_quote && closing[(int)open_quote]) { // Nested pair begin
-            if (get_indent(ctx, pos) == starting_indent) {
-                ++depth;
-            }
-            plain_span_len += 1;
-            ++pos;
-        } else if (!leading_newline && *pos == close_quote) { // Nested pair end
-            if (get_indent(ctx, pos) == starting_indent) {
-                --depth;
-                if (depth == 0) break;
-            }
-            plain_span_len += 1;
-            ++pos;
-        } else if (newline_with_indentation(&after_indentation, string_indent)) { // Newline
-            FLUSH_PLAIN_SPAN();
-            pos = after_indentation;
-            if (!leading_newline && !(chunk.length > 0 || chunks)) {
-                leading_newline = true;
-            } else {
-                chunk = Texts(chunk, Text("\n"));
-            }
-        } else if (newline_with_indentation(&after_indentation, starting_indent)) { // Line continuation (..)
-            FLUSH_PLAIN_SPAN();
-            pos = after_indentation;
-            if (*pos == close_quote) {
-                break;
-            } else if (some_of(&pos, ".") >= 2) {
-                // Multi-line split
-                continue;
-            } else {
-                parser_err(ctx, pos, eol(pos),
-                           "This multi-line string should be either indented or have '..' at the front");
-            }
-        } else { // Plain character
-            ucs4_t codepoint;
-            const char *next = (const char *)u8_next(&codepoint, (const uint8_t *)pos);
-            plain_span_len += (int64_t)(next - pos);
-            if (next == NULL) break;
-            pos = next;
-        }
-    }
-
-    FLUSH_PLAIN_SPAN();
-#undef FLUSH_PLAIN_SPAN
-
-    if (chunk.length > 0) {
-        ast_t *literal = NewAST(ctx->file, chunk_start, pos, TextLiteral, .text = chunk);
-        chunks = new (ast_list_t, .ast = literal, .next = chunks);
-        chunk = EMPTY_TEXT;
-    }
-
-    REVERSE_LIST(chunks);
-    char close_str[2] = {close_quote, 0};
-    expect_closing(ctx, &pos, close_str, "I was expecting a ", close_quote, " to finish this string");
-    *out_pos = pos;
-    return chunks;
-}
-
-ast_t *parse_text(parse_ctx_t *ctx, const char *pos) {
-    // ('"' ... '"' / "'" ... "'" / "`" ... "`")
-    // "$" [name] [interp-char] quote-char ... close-quote
-    const char *start = pos;
-    const char *lang = NULL;
-
-    char open_quote, close_quote, open_interp = '$';
-    if (match(&pos, "\"")) { // Double quote
-        open_quote = '"', close_quote = '"', open_interp = '$';
-    } else if (match(&pos, "`")) { // Backtick
-        open_quote = '`', close_quote = '`', open_interp = '$';
-    } else if (match(&pos, "'")) { // Single quote
-        open_quote = '\'', close_quote = '\'', open_interp = '$';
-    } else if (match(&pos, "$")) { // Customized strings
-        lang = get_id(&pos);
-        // $"..." or $@"...."
-        static const char *interp_chars = "~!@#$%^&*+=\\?";
-        if (match(&pos, "$")) { // Disable interpolation with $$
-            open_interp = '\x03';
-        } else if (strchr(interp_chars, *pos)) {
-            open_interp = *pos;
-            ++pos;
-        }
-        static const char *quote_chars = "\"'`|/;([{<";
-        if (!strchr(quote_chars, *pos))
-            parser_err(ctx, pos, pos + 1,
-                       "This is not a valid string quotation character. Valid characters are: \"'`|/;([{<");
-        open_quote = *pos;
-        ++pos;
-        close_quote = closing[(int)open_quote] ? closing[(int)open_quote] : open_quote;
-    } else {
-        return NULL;
-    }
-
-    bool allow_escapes = (open_quote != '`');
-    ast_list_t *chunks = _parse_text_helper(ctx, &pos, open_quote, close_quote, open_interp, allow_escapes);
-    bool colorize = match(&pos, "~") && match_word(&pos, "colorized");
-    return NewAST(ctx->file, start, pos, TextJoin, .lang = lang, .children = chunks, .colorize = colorize);
-}
-
 ast_t *parse_path(parse_ctx_t *ctx, const char *pos) {
     // "(" ("~/" / "./" / "../" / "/") ... ")"
     const char *start = pos;
@@ -1260,32 +1116,6 @@ ast_t *parse_extern(parse_ctx_t *ctx, const char *pos) {
     return NewAST(ctx->file, start, pos, Extern, .name = name, .type = type);
 }
 
-ast_t *parse_inline_c(parse_ctx_t *ctx, const char *pos) {
-    const char *start = pos;
-    if (!match_word(&pos, "C_code")) return NULL;
-
-    spaces(&pos);
-    type_ast_t *type = NULL;
-    ast_list_t *chunks;
-    if (match(&pos, ":")) {
-        type = expect(ctx, start, &pos, parse_type, "I couldn't parse the type for this C_code code");
-        spaces(&pos);
-        if (!match(&pos, "(")) parser_err(ctx, start, pos, "I expected a '(' here");
-        chunks = new (ast_list_t, .ast = NewAST(ctx->file, pos, pos, TextLiteral, Text("({")),
-                      .next = _parse_text_helper(ctx, &pos, '(', ')', '@', false));
-        if (type) {
-            REVERSE_LIST(chunks);
-            chunks = new (ast_list_t, .ast = NewAST(ctx->file, pos, pos, TextLiteral, Text("; })")), .next = chunks);
-            REVERSE_LIST(chunks);
-        }
-    } else {
-        if (!match(&pos, "{")) parser_err(ctx, start, pos, "I expected a '{' here");
-        chunks = _parse_text_helper(ctx, &pos, '{', '}', '@', false);
-    }
-
-    return NewAST(ctx->file, start, pos, InlineCCode, .chunks = chunks, .type_ast = type);
-}
-
 ast_t *parse_doctest(parse_ctx_t *ctx, const char *pos) {
     const char *start = pos;
     if (!match(&pos, ">>")) return NULL;
diff --git a/src/parse/parse.h b/src/parse/parse.h
index 83f824db..b3a36f72 100644
--- a/src/parse/parse.h
+++ b/src/parse/parse.h
@@ -32,7 +32,6 @@ ast_t *parse_extern(parse_ctx_t *ctx, const char *pos);
 ast_t *parse_for(parse_ctx_t *ctx, const char *pos);
 ast_t *parse_heap_alloc(parse_ctx_t *ctx, const char *pos);
 ast_t *parse_if(parse_ctx_t *ctx, const char *pos);
-ast_t *parse_inline_c(parse_ctx_t *ctx, const char *pos);
 ast_t *parse_lang_def(parse_ctx_t *ctx, const char *pos);
 ast_t *parse_extend(parse_ctx_t *ctx, const char *pos);
 ast_t *parse_namespace(parse_ctx_t *ctx, const char *pos);
@@ -52,12 +51,9 @@ ast_t *parse_stop(parse_ctx_t *ctx, const char *pos);
 ast_t *parse_struct_def(parse_ctx_t *ctx, const char *pos);
 ast_t *parse_term(parse_ctx_t *ctx, const char *pos);
 ast_t *parse_term_no_suffix(parse_ctx_t *ctx, const char *pos);
-ast_t *parse_text(parse_ctx_t *ctx, const char *pos);
 ast_t *parse_update(parse_ctx_t *ctx, const char *pos);
 ast_t *parse_use(parse_ctx_t *ctx, const char *pos);
 ast_t *parse_var(parse_ctx_t *ctx, const char *pos);
 ast_t *parse_when(parse_ctx_t *ctx, const char *pos);
 ast_t *parse_while(parse_ctx_t *ctx, const char *pos);
 ast_t *parse_deserialize(parse_ctx_t *ctx, const char *pos);
-ast_list_t *_parse_text_helper(parse_ctx_t *ctx, const char **out_pos, char open_quote, char close_quote,
-                               char open_interp, bool allow_escapes);
diff --git a/src/parse/text.c b/src/parse/text.c
new file mode 100644
index 00000000..b9827644
--- /dev/null
+++ b/src/parse/text.c
@@ -0,0 +1,185 @@
+// Logic for parsing text literals
+
+#include <stdarg.h>
+#include <stdbool.h>
+#include <string.h>
+
+#include "../unistr-fixed.h"
+#include <unictype.h>
+#include <uniname.h>
+
+#include "../ast.h"
+#include "../stdlib/text.h"
+#include "../stdlib/util.h"
+#include "context.h"
+#include "errors.h"
+#include "parse.h"
+#include "types.h"
+#include "utils.h"
+
+static const char closing[128] = {['('] = ')', ['['] = ']', ['<'] = '>', ['{'] = '}'}; // opener -> closer, else 0
+
+// Parse the body of a text literal. `*out_pos` starts just past the opening quote and is advanced past the
+// closing quote (via expect_closing()). Returns literal chunks and interpolated terms in source order.
+// `open_interp` starts an interpolation; `allow_escapes` enables backslash escapes. If `open_quote` has an entry
+// in `closing`, nested open/close pairs at the starting indent are counted in `depth` so they stay balanced.
+static ast_list_t *_parse_text_helper(parse_ctx_t *ctx, const char **out_pos, char open_quote, char close_quote,
+                                      char open_interp, bool allow_escapes) {
+    const char *pos = *out_pos;
+    int64_t starting_indent = get_indent(ctx, pos);
+    int64_t string_indent = starting_indent + SPACES_PER_INDENT;
+    ast_list_t *chunks = NULL; // Prepended to while parsing, reversed once at the end
+    Text_t chunk = EMPTY_TEXT;
+    const char *chunk_start = pos;
+    int depth = 1;
+    bool leading_newline = false;
+    int64_t plain_span_len = 0; // Bytes immediately before `pos` that are not yet appended to `chunk`
+#define FLUSH_PLAIN_SPAN()                                                                                             \
+    do {                                                                                                               \
+        if (plain_span_len > 0) {                                                                                      \
+            chunk = Texts(chunk, Text$from_strn(pos - plain_span_len, (size_t)plain_span_len));                        \
+            plain_span_len = 0;                                                                                        \
+        }                                                                                                              \
+    } while (0)
+    for (const char *end = ctx->file->text + ctx->file->len; pos < end && depth > 0;) {
+        const char *after_indentation = pos;
+        if (*pos == open_interp) { // Interpolation
+            FLUSH_PLAIN_SPAN();
+            const char *interp_start = pos;
+            if (chunk.length > 0) {
+                ast_t *literal = NewAST(ctx->file, chunk_start, pos, TextLiteral, .text = chunk);
+                chunks = new (ast_list_t, .ast = literal, .next = chunks);
+                chunk = EMPTY_TEXT;
+            }
+            ++pos;
+            ast_t *interp;
+            if (*pos == ' ' || *pos == '\t')
+                parser_err(ctx, pos, pos + 1, "Whitespace is not allowed before an interpolation here");
+            interp = expect(ctx, interp_start, &pos, parse_term_no_suffix, "I expected an interpolation term here");
+            chunks = new (ast_list_t, .ast = interp, .next = chunks);
+            chunk_start = pos;
+        } else if (allow_escapes && *pos == '\\') {
+            FLUSH_PLAIN_SPAN();
+            const char *c = unescape(ctx, &pos);
+            chunk = Texts(chunk, Text$from_str(c));
+        } else if (!leading_newline && *pos == open_quote && closing[(int)open_quote]) { // Nested pair begin
+            if (get_indent(ctx, pos) == starting_indent) ++depth;
+            plain_span_len += 1;
+            ++pos;
+        } else if (!leading_newline && *pos == close_quote) { // Nested pair end
+            if (get_indent(ctx, pos) == starting_indent) {
+                --depth;
+                if (depth == 0) break;
+            }
+            plain_span_len += 1;
+            ++pos;
+        } else if (newline_with_indentation(&after_indentation, string_indent)) { // Newline
+            FLUSH_PLAIN_SPAN();
+            pos = after_indentation;
+            // The first newline, seen before any content has been parsed, is dropped instead of added to the text
+            if (!leading_newline && !(chunk.length > 0 || chunks)) {
+                leading_newline = true;
+            } else {
+                chunk = Texts(chunk, Text("\n"));
+            }
+        } else if (newline_with_indentation(&after_indentation, starting_indent)) { // Line continuation (..)
+            FLUSH_PLAIN_SPAN();
+            pos = after_indentation;
+            if (*pos == close_quote) break;
+            // Otherwise the line has to begin with '..' (a multi-line split)
+            if (some_of(&pos, ".") < 2)
+                parser_err(ctx, pos, eol(pos),
+                           "This multi-line string should be either indented or have '..' at the front");
+        } else { // Plain character
+            ucs4_t codepoint;
+            const char *next = (const char *)u8_next(&codepoint, (const uint8_t *)pos);
+            // u8_next() returns NULL at a NUL byte. Stop *before* using `next` in pointer arithmetic, which
+            // would be undefined behavior and would corrupt the pending span length used by the final flush.
+            if (next == NULL) break;
+            plain_span_len += (int64_t)(next - pos);
+            pos = next;
+        }
+    }
+
+    FLUSH_PLAIN_SPAN();
+#undef FLUSH_PLAIN_SPAN
+
+    if (chunk.length > 0) {
+        ast_t *literal = NewAST(ctx->file, chunk_start, pos, TextLiteral, .text = chunk);
+        chunks = new (ast_list_t, .ast = literal, .next = chunks);
+    }
+
+    REVERSE_LIST(chunks);
+    char close_str[2] = {close_quote, 0};
+    expect_closing(ctx, &pos, close_str, "I was expecting a ", close_quote, " to finish this string");
+    *out_pos = pos;
+    return chunks;
+}
+
+// Parse a quoted text literal or a "$"-prefixed custom one into a TextJoin AST (NULL if none starts at `pos`).
+public
+ast_t *parse_text(parse_ctx_t *ctx, const char *pos) {
+    // ('"' ... '"' / "'" ... "'" / "`" ... "`")
+    // "$" [name] [interp-char] quote-char ... close-quote
+    const char *start = pos;
+    const char *lang = NULL;
+
+    char open_quote, close_quote, open_interp = '$';
+    if (match(&pos, "\"")) { // Double quote
+        open_quote = '"', close_quote = '"', open_interp = '$';
+    } else if (match(&pos, "`")) { // Backtick
+        open_quote = '`', close_quote = '`', open_interp = '$';
+    } else if (match(&pos, "'")) { // Single quote
+        open_quote = '\'', close_quote = '\'', open_interp = '$';
+    } else if (match(&pos, "$")) { // Customized strings
+        lang = get_id(&pos);
+        // $"..." or $@"...."
+        // strchr() also matches the terminating NUL, so '\0' (end of input) is ruled out before each lookup
+        static const char *interp_chars = "~!@#$%^&*+=\\?";
+        if (match(&pos, "$")) { // Disable interpolation with $$
+            open_interp = '\x03';
+        } else if (*pos != '\0' && strchr(interp_chars, *pos)) {
+            open_interp = *pos++;
+        }
+        static const char *quote_chars = "\"'`|/;([{<";
+        if (*pos == '\0' || !strchr(quote_chars, *pos))
+            parser_err(ctx, pos, pos + 1,
+                       "This is not a valid string quotation character. Valid characters are: \"'`|/;([{<");
+        open_quote = *pos++;
+        close_quote = closing[(int)open_quote] ? closing[(int)open_quote] : open_quote;
+    } else {
+        return NULL;
+    }
+
+    bool allow_escapes = (open_quote != '`');
+    ast_list_t *chunks = _parse_text_helper(ctx, &pos, open_quote, close_quote, open_interp, allow_escapes);
+    bool colorize = match(&pos, "~") && match_word(&pos, "colorized");
+    return NewAST(ctx->file, start, pos, TextJoin, .lang = lang, .children = chunks, .colorize = colorize);
+}
+
+// Parse `C_code {...}` or `C_code:Type(...)` into an InlineCCode AST; the typed body is wrapped in "({" ... "; })".
+public
+ast_t *parse_inline_c(parse_ctx_t *ctx, const char *pos) {
+    const char *start = pos;
+    if (!match_word(&pos, "C_code")) return NULL;
+
+    spaces(&pos);
+    type_ast_t *type = NULL;
+    ast_list_t *chunks;
+    if (match(&pos, ":")) {
+        type = expect(ctx, start, &pos, parse_type, "I couldn't parse the type for this C_code code");
+        spaces(&pos);
+        if (!match(&pos, "(")) parser_err(ctx, start, pos, "I expected a '(' here");
+        // Build the "({" node up front: initializer expressions are indeterminately sequenced with the body parse
+        ast_t *prefix = NewAST(ctx->file, pos, pos, TextLiteral, Text("({"));
+        chunks = new (ast_list_t, .ast = prefix, .next = _parse_text_helper(ctx, &pos, '(', ')', '@', false));
+        ast_list_t **tail = &chunks->next;
+        while (*tail) tail = &(*tail)->next;
+        if (type) *tail = new (ast_list_t, .ast = NewAST(ctx->file, pos, pos, TextLiteral, Text("; })")), .next = NULL);
+    } else {
+        if (!match(&pos, "{")) parser_err(ctx, start, pos, "I expected a '{' here");
+        chunks = _parse_text_helper(ctx, &pos, '{', '}', '@', false);
+    }
+
+    return NewAST(ctx->file, start, pos, InlineCCode, .chunks = chunks, .type_ast = type);
+}
diff --git a/src/parse/text.h b/src/parse/text.h
new file mode 100644
index 00000000..cd07e0e1
--- /dev/null
+++ b/src/parse/text.h
@@ -0,0 +1,8 @@
+// Logic for parsing text literals
+#pragma once
+
+#include "../ast.h"
+#include "context.h"
+
+ast_t *parse_text(parse_ctx_t *ctx, const char *pos);
+ast_t *parse_inline_c(parse_ctx_t *ctx, const char *pos);
author	Bruce Hill <bruce@bruce-hill.com>	2025-08-25 01:00:13 -0400
committer	Bruce Hill <bruce@bruce-hill.com>	2025-08-25 01:00:13 -0400
commit	3cf0d5f0bee3787a3d46126d5c0e1310e35a7cb9 (patch)
tree	035e97eaaf2d722daa6ffa0d447d0a8c34a5f0b9 /src/parse
parent	c859ed479227cee2cecedb83d74a40acf9758051 (diff)