diff options
| author | Bruce Hill <bruce@bruce-hill.com> | 2025-08-25 01:00:13 -0400 |
|---|---|---|
| committer | Bruce Hill <bruce@bruce-hill.com> | 2025-08-25 01:00:13 -0400 |
| commit | 3cf0d5f0bee3787a3d46126d5c0e1310e35a7cb9 (patch) | |
| tree | 035e97eaaf2d722daa6ffa0d447d0a8c34a5f0b9 /src/parse | |
| parent | c859ed479227cee2cecedb83d74a40acf9758051 (diff) | |
Split out text parsing logic
Diffstat (limited to 'src/parse')
| -rw-r--r-- | src/parse/files.c | 1 | ||||
| -rw-r--r-- | src/parse/parse.c | 172 | ||||
| -rw-r--r-- | src/parse/parse.h | 4 | ||||
| -rw-r--r-- | src/parse/text.c | 185 | ||||
| -rw-r--r-- | src/parse/text.h | 8 |
5 files changed, 195 insertions, 175 deletions
diff --git a/src/parse/files.c b/src/parse/files.c index def740dd..416adc92 100644 --- a/src/parse/files.c +++ b/src/parse/files.c @@ -14,6 +14,7 @@ #include "files.h" #include "functions.h" #include "parse.h" +#include "text.h" #include "utils.h" // The cache of {filename -> parsed AST} will hold at most this many entries: diff --git a/src/parse/parse.c b/src/parse/parse.c index 52399e93..d25f0817 100644 --- a/src/parse/parse.c +++ b/src/parse/parse.c @@ -1,17 +1,11 @@ // Recursive descent parser for parsing code #include <gc.h> -#include <stdarg.h> #include <stdbool.h> #include <string.h> -#include "../unistr-fixed.h" -#include <unictype.h> -#include <uniname.h> - #include "../ast.h" #include "../stdlib/print.h" -#include "../stdlib/text.h" #include "../stdlib/util.h" #include "containers.h" #include "context.h" @@ -20,11 +14,10 @@ #include "functions.h" #include "numbers.h" #include "parse.h" +#include "text.h" #include "types.h" #include "utils.h" -static const char closing[128] = {['('] = ')', ['['] = ']', ['<'] = '>', ['{'] = '}'}; - int op_tightness[] = { [Power] = 9, [Multiply] = 8, @@ -423,143 +416,6 @@ ast_t *parse_bool(parse_ctx_t *ctx, const char *pos) { else return NULL; } -ast_list_t *_parse_text_helper(parse_ctx_t *ctx, const char **out_pos, char open_quote, char close_quote, - char open_interp, bool allow_escapes) { - const char *pos = *out_pos; - int64_t starting_indent = get_indent(ctx, pos); - int64_t string_indent = starting_indent + SPACES_PER_INDENT; - ast_list_t *chunks = NULL; - Text_t chunk = EMPTY_TEXT; - const char *chunk_start = pos; - int depth = 1; - bool leading_newline = false; - int64_t plain_span_len = 0; -#define FLUSH_PLAIN_SPAN() \ - do { \ - if (plain_span_len > 0) { \ - chunk = Texts(chunk, Text$from_strn(pos - plain_span_len, (size_t)plain_span_len)); \ - plain_span_len = 0; \ - } \ - } while (0) - for (const char *end = ctx->file->text + ctx->file->len; pos < end && depth > 0;) { - const char *after_indentation = pos; - if (*pos == open_interp) { // Interpolation - FLUSH_PLAIN_SPAN(); - const char *interp_start = pos; - if (chunk.length > 0) { - ast_t *literal = NewAST(ctx->file, chunk_start, pos, TextLiteral, .text = chunk); - chunks = new (ast_list_t, .ast = literal, .next = chunks); - chunk = EMPTY_TEXT; - } - ++pos; - ast_t *interp; - if (*pos == ' ' || *pos == '\t') - parser_err(ctx, pos, pos + 1, "Whitespace is not allowed before an interpolation here"); - interp = expect(ctx, interp_start, &pos, parse_term_no_suffix, "I expected an interpolation term here"); - chunks = new (ast_list_t, .ast = interp, .next = chunks); - chunk_start = pos; - } else if (allow_escapes && *pos == '\\') { - FLUSH_PLAIN_SPAN(); - const char *c = unescape(ctx, &pos); - chunk = Texts(chunk, Text$from_str(c)); - } else if (!leading_newline && *pos == open_quote && closing[(int)open_quote]) { // Nested pair begin - if (get_indent(ctx, pos) == starting_indent) { - ++depth; - } - plain_span_len += 1; - ++pos; - } else if (!leading_newline && *pos == close_quote) { // Nested pair end - if (get_indent(ctx, pos) == starting_indent) { - --depth; - if (depth == 0) break; - } - plain_span_len += 1; - ++pos; - } else if (newline_with_indentation(&after_indentation, string_indent)) { // Newline - FLUSH_PLAIN_SPAN(); - pos = after_indentation; - if (!leading_newline && !(chunk.length > 0 || chunks)) { - leading_newline = true; - } else { - chunk = Texts(chunk, Text("\n")); - } - } else if (newline_with_indentation(&after_indentation, starting_indent)) { // Line continuation (..) - FLUSH_PLAIN_SPAN(); - pos = after_indentation; - if (*pos == close_quote) { - break; - } else if (some_of(&pos, ".") >= 2) { - // Multi-line split - continue; - } else { - parser_err(ctx, pos, eol(pos), - "This multi-line string should be either indented or have '..' at the front"); - } - } else { // Plain character - ucs4_t codepoint; - const char *next = (const char *)u8_next(&codepoint, (const uint8_t *)pos); - plain_span_len += (int64_t)(next - pos); - if (next == NULL) break; - pos = next; - } - } - - FLUSH_PLAIN_SPAN(); -#undef FLUSH_PLAIN_SPAN - - if (chunk.length > 0) { - ast_t *literal = NewAST(ctx->file, chunk_start, pos, TextLiteral, .text = chunk); - chunks = new (ast_list_t, .ast = literal, .next = chunks); - chunk = EMPTY_TEXT; - } - - REVERSE_LIST(chunks); - char close_str[2] = {close_quote, 0}; - expect_closing(ctx, &pos, close_str, "I was expecting a ", close_quote, " to finish this string"); - *out_pos = pos; - return chunks; -} - -ast_t *parse_text(parse_ctx_t *ctx, const char *pos) { - // ('"' ... '"' / "'" ... "'" / "`" ... "`") - // "$" [name] [interp-char] quote-char ... close-quote - const char *start = pos; - const char *lang = NULL; - - char open_quote, close_quote, open_interp = '$'; - if (match(&pos, "\"")) { // Double quote - open_quote = '"', close_quote = '"', open_interp = '$'; - } else if (match(&pos, "`")) { // Backtick - open_quote = '`', close_quote = '`', open_interp = '$'; - } else if (match(&pos, "'")) { // Single quote - open_quote = '\'', close_quote = '\'', open_interp = '$'; - } else if (match(&pos, "$")) { // Customized strings - lang = get_id(&pos); - // $"..." or $@"...." - static const char *interp_chars = "~!@#$%^&*+=\\?"; - if (match(&pos, "$")) { // Disable interpolation with $$ - open_interp = '\x03'; - } else if (strchr(interp_chars, *pos)) { - open_interp = *pos; - ++pos; - } - static const char *quote_chars = "\"'`|/;([{<"; - if (!strchr(quote_chars, *pos)) - parser_err(ctx, pos, pos + 1, - "This is not a valid string quotation character. Valid characters are: \"'`|/;([{<"); - open_quote = *pos; - ++pos; - close_quote = closing[(int)open_quote] ? closing[(int)open_quote] : open_quote; - } else { - return NULL; - } - - bool allow_escapes = (open_quote != '`'); - ast_list_t *chunks = _parse_text_helper(ctx, &pos, open_quote, close_quote, open_interp, allow_escapes); - bool colorize = match(&pos, "~") && match_word(&pos, "colorized"); - return NewAST(ctx->file, start, pos, TextJoin, .lang = lang, .children = chunks, .colorize = colorize); -} - ast_t *parse_path(parse_ctx_t *ctx, const char *pos) { // "(" ("~/" / "./" / "../" / "/") ... ")" const char *start = pos; @@ -1260,32 +1116,6 @@ ast_t *parse_extern(parse_ctx_t *ctx, const char *pos) { return NewAST(ctx->file, start, pos, Extern, .name = name, .type = type); } -ast_t *parse_inline_c(parse_ctx_t *ctx, const char *pos) { - const char *start = pos; - if (!match_word(&pos, "C_code")) return NULL; - - spaces(&pos); - type_ast_t *type = NULL; - ast_list_t *chunks; - if (match(&pos, ":")) { - type = expect(ctx, start, &pos, parse_type, "I couldn't parse the type for this C_code code"); - spaces(&pos); - if (!match(&pos, "(")) parser_err(ctx, start, pos, "I expected a '(' here"); - chunks = new (ast_list_t, .ast = NewAST(ctx->file, pos, pos, TextLiteral, Text("({")), - .next = _parse_text_helper(ctx, &pos, '(', ')', '@', false)); - if (type) { - REVERSE_LIST(chunks); - chunks = new (ast_list_t, .ast = NewAST(ctx->file, pos, pos, TextLiteral, Text("; })")), .next = chunks); - REVERSE_LIST(chunks); - } - } else { - if (!match(&pos, "{")) parser_err(ctx, start, pos, "I expected a '{' here"); - chunks = _parse_text_helper(ctx, &pos, '{', '}', '@', false); - } - - return NewAST(ctx->file, start, pos, InlineCCode, .chunks = chunks, .type_ast = type); -} - ast_t *parse_doctest(parse_ctx_t *ctx, const char *pos) { const char *start = pos; if (!match(&pos, ">>")) return NULL; diff --git a/src/parse/parse.h b/src/parse/parse.h index 83f824db..b3a36f72 100644 --- a/src/parse/parse.h +++ b/src/parse/parse.h @@ -32,7 +32,6 @@ ast_t *parse_extern(parse_ctx_t *ctx, const char *pos); ast_t *parse_for(parse_ctx_t *ctx, const char *pos); ast_t *parse_heap_alloc(parse_ctx_t *ctx, const char *pos); ast_t *parse_if(parse_ctx_t *ctx, const char *pos); -ast_t *parse_inline_c(parse_ctx_t *ctx, const char *pos); ast_t *parse_lang_def(parse_ctx_t *ctx, const char *pos); ast_t *parse_extend(parse_ctx_t *ctx, const char *pos); ast_t *parse_namespace(parse_ctx_t *ctx, const char *pos); @@ -52,12 +51,9 @@ ast_t *parse_stop(parse_ctx_t *ctx, const char *pos); ast_t *parse_struct_def(parse_ctx_t *ctx, const char *pos); ast_t *parse_term(parse_ctx_t *ctx, const char *pos); ast_t *parse_term_no_suffix(parse_ctx_t *ctx, const char *pos); -ast_t *parse_text(parse_ctx_t *ctx, const char *pos); ast_t *parse_update(parse_ctx_t *ctx, const char *pos); ast_t *parse_use(parse_ctx_t *ctx, const char *pos); ast_t *parse_var(parse_ctx_t *ctx, const char *pos); ast_t *parse_when(parse_ctx_t *ctx, const char *pos); ast_t *parse_while(parse_ctx_t *ctx, const char *pos); ast_t *parse_deserialize(parse_ctx_t *ctx, const char *pos); -ast_list_t *_parse_text_helper(parse_ctx_t *ctx, const char **out_pos, char open_quote, char close_quote, - char open_interp, bool allow_escapes); diff --git a/src/parse/text.c b/src/parse/text.c new file mode 100644 index 00000000..b9827644 --- /dev/null +++ b/src/parse/text.c @@ -0,0 +1,185 @@ +// Logic for parsing text literals + +#include <stdarg.h> +#include <stdbool.h> +#include <string.h> + +#include "../unistr-fixed.h" +#include <unictype.h> +#include <uniname.h> + +#include "../ast.h" +#include "../stdlib/text.h" +#include "../stdlib/util.h" +#include "context.h" +#include "errors.h" +#include "parse.h" +#include "types.h" +#include "utils.h" + +static const char closing[128] = {['('] = ')', ['['] = ']', ['<'] = '>', ['{'] = '}'}; + +static ast_list_t *_parse_text_helper(parse_ctx_t *ctx, const char **out_pos, char open_quote, char close_quote, + char open_interp, bool allow_escapes) { + const char *pos = *out_pos; + int64_t starting_indent = get_indent(ctx, pos); + int64_t string_indent = starting_indent + SPACES_PER_INDENT; + ast_list_t *chunks = NULL; + Text_t chunk = EMPTY_TEXT; + const char *chunk_start = pos; + int depth = 1; + bool leading_newline = false; + int64_t plain_span_len = 0; +#define FLUSH_PLAIN_SPAN() \ + do { \ + if (plain_span_len > 0) { \ + chunk = Texts(chunk, Text$from_strn(pos - plain_span_len, (size_t)plain_span_len)); \ + plain_span_len = 0; \ + } \ + } while (0) + for (const char *end = ctx->file->text + ctx->file->len; pos < end && depth > 0;) { + const char *after_indentation = pos; + if (*pos == open_interp) { // Interpolation + FLUSH_PLAIN_SPAN(); + const char *interp_start = pos; + if (chunk.length > 0) { + ast_t *literal = NewAST(ctx->file, chunk_start, pos, TextLiteral, .text = chunk); + chunks = new (ast_list_t, .ast = literal, .next = chunks); + chunk = EMPTY_TEXT; + } + ++pos; + ast_t *interp; + if (*pos == ' ' || *pos == '\t') + parser_err(ctx, pos, pos + 1, "Whitespace is not allowed before an interpolation here"); + interp = expect(ctx, interp_start, &pos, parse_term_no_suffix, "I expected an interpolation term here"); + chunks = new (ast_list_t, .ast = interp, .next = chunks); + chunk_start = pos; + } else if (allow_escapes && *pos == '\\') { + FLUSH_PLAIN_SPAN(); + const char *c = unescape(ctx, &pos); + chunk = Texts(chunk, Text$from_str(c)); + } else if (!leading_newline && *pos == open_quote && closing[(int)open_quote]) { // Nested pair begin + if (get_indent(ctx, pos) == starting_indent) { + ++depth; + } + plain_span_len += 1; + ++pos; + } else if (!leading_newline && *pos == close_quote) { // Nested pair end + if (get_indent(ctx, pos) == starting_indent) { + --depth; + if (depth == 0) break; + } + plain_span_len += 1; + ++pos; + } else if (newline_with_indentation(&after_indentation, string_indent)) { // Newline + FLUSH_PLAIN_SPAN(); + pos = after_indentation; + if (!leading_newline && !(chunk.length > 0 || chunks)) { + leading_newline = true; + } else { + chunk = Texts(chunk, Text("\n")); + } + } else if (newline_with_indentation(&after_indentation, starting_indent)) { // Line continuation (..) + FLUSH_PLAIN_SPAN(); + pos = after_indentation; + if (*pos == close_quote) { + break; + } else if (some_of(&pos, ".") >= 2) { + // Multi-line split + continue; + } else { + parser_err(ctx, pos, eol(pos), + "This multi-line string should be either indented or have '..' at the front"); + } + } else { // Plain character + ucs4_t codepoint; + const char *next = (const char *)u8_next(&codepoint, (const uint8_t *)pos); + plain_span_len += (int64_t)(next - pos); + if (next == NULL) break; + pos = next; + } + } + + FLUSH_PLAIN_SPAN(); +#undef FLUSH_PLAIN_SPAN + + if (chunk.length > 0) { + ast_t *literal = NewAST(ctx->file, chunk_start, pos, TextLiteral, .text = chunk); + chunks = new (ast_list_t, .ast = literal, .next = chunks); + chunk = EMPTY_TEXT; + } + + REVERSE_LIST(chunks); + char close_str[2] = {close_quote, 0}; + expect_closing(ctx, &pos, close_str, "I was expecting a ", close_quote, " to finish this string"); + *out_pos = pos; + return chunks; +} + +public +ast_t *parse_text(parse_ctx_t *ctx, const char *pos) { + // ('"' ... '"' / "'" ... "'" / "`" ... "`") + // "$" [name] [interp-char] quote-char ... close-quote + const char *start = pos; + const char *lang = NULL; + + char open_quote, close_quote, open_interp = '$'; + if (match(&pos, "\"")) { // Double quote + open_quote = '"', close_quote = '"', open_interp = '$'; + } else if (match(&pos, "`")) { // Backtick + open_quote = '`', close_quote = '`', open_interp = '$'; + } else if (match(&pos, "'")) { // Single quote + open_quote = '\'', close_quote = '\'', open_interp = '$'; + } else if (match(&pos, "$")) { // Customized strings + lang = get_id(&pos); + // $"..." or $@"...." + static const char *interp_chars = "~!@#$%^&*+=\\?"; + if (match(&pos, "$")) { // Disable interpolation with $$ + open_interp = '\x03'; + } else if (strchr(interp_chars, *pos)) { + open_interp = *pos; + ++pos; + } + static const char *quote_chars = "\"'`|/;([{<"; + if (!strchr(quote_chars, *pos)) + parser_err(ctx, pos, pos + 1, + "This is not a valid string quotation character. Valid characters are: \"'`|/;([{<"); + open_quote = *pos; + ++pos; + close_quote = closing[(int)open_quote] ? closing[(int)open_quote] : open_quote; + } else { + return NULL; + } + + bool allow_escapes = (open_quote != '`'); + ast_list_t *chunks = _parse_text_helper(ctx, &pos, open_quote, close_quote, open_interp, allow_escapes); + bool colorize = match(&pos, "~") && match_word(&pos, "colorized"); + return NewAST(ctx->file, start, pos, TextJoin, .lang = lang, .children = chunks, .colorize = colorize); +} + +public +ast_t *parse_inline_c(parse_ctx_t *ctx, const char *pos) { + const char *start = pos; + if (!match_word(&pos, "C_code")) return NULL; + + spaces(&pos); + type_ast_t *type = NULL; + ast_list_t *chunks; + if (match(&pos, ":")) { + type = expect(ctx, start, &pos, parse_type, "I couldn't parse the type for this C_code code"); + spaces(&pos); + if (!match(&pos, "(")) parser_err(ctx, start, pos, "I expected a '(' here"); + chunks = new (ast_list_t, .ast = NewAST(ctx->file, pos, pos, TextLiteral, Text("({")), + .next = _parse_text_helper(ctx, &pos, '(', ')', '@', false)); + if (type) { + REVERSE_LIST(chunks); + chunks = new (ast_list_t, .ast = NewAST(ctx->file, pos, pos, TextLiteral, Text("; })")), .next = chunks); + REVERSE_LIST(chunks); + } + } else { + if (!match(&pos, "{")) parser_err(ctx, start, pos, "I expected a '{' here"); + chunks = _parse_text_helper(ctx, &pos, '{', '}', '@', false); + } + + return NewAST(ctx->file, start, pos, InlineCCode, .chunks = chunks, .type_ast = type); +} diff --git a/src/parse/text.h b/src/parse/text.h new file mode 100644 index 00000000..cd07e0e1 --- /dev/null +++ b/src/parse/text.h @@ -0,0 +1,8 @@ +// Logic for parsing text literals +#pragma once + +#include "../ast.h" +#include "context.h" + +ast_t *parse_text(parse_ctx_t *ctx, const char *pos); +ast_t *parse_inline_c(parse_ctx_t *ctx, const char *pos); |
