diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/environment.c | 4 | ||||
| -rw-r--r-- | src/parse.c | 61 | ||||
| -rw-r--r-- | src/stdlib/text.c | 37 |
3 files changed, 42 insertions, 60 deletions
diff --git a/src/environment.c b/src/environment.c index 21d3db9b..8084758e 100644 --- a/src/environment.c +++ b/src/environment.c @@ -327,7 +327,7 @@ env_t *global_env(void) {"at", "Text$cluster", "func(text:Text, index:Int -> Text)"}, {"by_line", "Text$by_line", "func(text:Text -> func(->Text?))"}, {"by_split", "Text$by_split", "func(text:Text, delimiter='' -> func(->Text?))"}, - {"by_split_any", "Text$by_split_any", "func(text:Text, delimiters=\" $\\t\\r\\n\" -> func(->Text?))"}, + {"by_split_any", "Text$by_split_any", "func(text:Text, delimiters=' \\t\\r\\n' -> func(->Text?))"}, {"bytes", "Text$utf8_bytes", "func(text:Text -> [Byte])"}, {"caseless_equals", "Text$equal_ignoring_case", "func(a,b:Text, language='C' -> Bool)"}, {"codepoint_names", "Text$codepoint_names", "func(text:Text -> [Text])"}, @@ -351,7 +351,7 @@ env_t *global_env(void) {"right_pad", "Text$right_pad", "func(text:Text, count:Int, pad=' ', language='C' -> Text)"}, {"slice", "Text$slice", "func(text:Text, from=1, to=-1 -> Text)"}, {"split", "Text$split", "func(text:Text, delimiter='' -> [Text])"}, - {"split_any", "Text$split_any", "func(text:Text, delimiters=\" $\\t\\r\\n\" -> [Text])"}, + {"split_any", "Text$split_any", "func(text:Text, delimiters=' \\t\\r\\n' -> [Text])"}, {"starts_with", "Text$starts_with", "func(text,prefix:Text -> Bool)"}, {"title", "Text$title", "func(text:Text, language='C' -> Text)"}, {"to", "Text$to", "func(text:Text, last:Int -> Text)"}, diff --git a/src/parse.c b/src/parse.c index c31bea64..b9a695b9 100644 --- a/src/parse.c +++ b/src/parse.c @@ -147,7 +147,7 @@ static PARSER(parse_var); static PARSER(parse_when); static PARSER(parse_while); static PARSER(parse_deserialize); -static ast_list_t *_parse_text_helper(parse_ctx_t *ctx, const char **out_pos, char open_quote, char close_quote, char open_interp); +static ast_list_t *_parse_text_helper(parse_ctx_t *ctx, const char **out_pos, char open_quote, char close_quote, char open_interp, bool allow_escapes); // // Print a parse error and exit (or use the on_err longjmp) @@ -255,6 +255,24 @@ static const char *unescape(parse_ctx_t *ctx, const char **out) { char name[len+1]; memcpy(name, &escape[2], len); name[len] = '\0'; + + if (name[0] == 'U') { + for (char *p = &name[1]; *p; p++) { + if (!isxdigit(*p)) goto look_up_unicode_name; + } + // Unicode codepoints by hex + char *endptr = NULL; + long codepoint = strtol(name+1, &endptr, 16); + uint32_t ustr[2] = {codepoint, 0}; + size_t bufsize = 8; + uint8_t buf[bufsize]; + (void)u32_to_u8(ustr, bufsize, buf, &bufsize); + *endpos = escape + 3 + len; + return GC_strndup((char*)buf, bufsize); + } + + look_up_unicode_name:; + uint32_t codepoint = unicode_name_character(name); if (codepoint == UNINAME_INVALID) parser_err(ctx, escape, escape + 3 + len, @@ -265,16 +283,6 @@ static const char *unescape(parse_ctx_t *ctx, const char **out) { (void)u32_to_u8(&codepoint, 1, (uint8_t*)str, &u8_len); str[u8_len] = '\0'; return str; - } else if (escape[1] == 'U' && escape[2]) { - // Unicode codepoints by hex - char *endptr = NULL; - long codepoint = strtol(escape+2, &endptr, 16); - uint32_t ustr[2] = {codepoint, 0}; - size_t bufsize = 8; - uint8_t buf[bufsize]; - (void)u32_to_u8(ustr, bufsize, buf, &bufsize); - *endpos = endptr; - return GC_strndup((char*)buf, bufsize); } else if (escape[1] == 'x' && escape[2] && escape[3]) { // ASCII 2-digit hex char buf[] = {escape[2], escape[3], 0}; @@ -1187,7 +1195,7 @@ PARSER(parse_bool) { return NULL; } -ast_list_t *_parse_text_helper(parse_ctx_t *ctx, const char **out_pos, char open_quote, char close_quote, char open_interp) +ast_list_t *_parse_text_helper(parse_ctx_t *ctx, const char **out_pos, char open_quote, char close_quote, char open_interp, bool allow_escapes) { const char *pos = *out_pos; int64_t starting_indent = get_indent(ctx, pos); @@ -1203,7 +1211,7 @@ ast_list_t *_parse_text_helper(parse_ctx_t *ctx, const char **out_pos, char open if (chunk) { ast_t *literal = NewAST(ctx->file, chunk_start, pos, TextLiteral, .cord=chunk); chunks = new(ast_list_t, .ast=literal, .next=chunks); - chunk = NULL; + chunk = CORD_EMPTY; } ++pos; ast_t *interp; @@ -1212,6 +1220,9 @@ ast_list_t *_parse_text_helper(parse_ctx_t *ctx, const char **out_pos, char open interp = expect(ctx, interp_start, &pos, parse_term_no_suffix, "I expected an interpolation term here"); chunks = new(ast_list_t, .ast=interp, .next=chunks); chunk_start = pos; + } else if (allow_escapes && *pos == '\\') { + const char *c = unescape(ctx, &pos); + chunk = CORD_cat(chunk, c); } else if (!leading_newline && *pos == open_quote && closing[(int)open_quote]) { // Nested pair begin if (get_indent(ctx, pos) == starting_indent) { ++depth; @@ -1266,35 +1277,22 @@ PARSER(parse_text) { const char *start = pos; const char *lang = NULL; - // Escape sequence, e.g. \r\n - if (*pos == '\\') { - CORD cord = CORD_EMPTY; - do { - const char *c = unescape(ctx, &pos); - cord = CORD_cat(cord, c); - // cord = CORD_cat_char(cord, c); - } while (*pos == '\\'); - return NewAST(ctx->file, start, pos, TextLiteral, .cord=cord); - } - char open_quote, close_quote, open_interp = '$'; if (match(&pos, "\"")) { // Double quote open_quote = '"', close_quote = '"', open_interp = '$'; } else if (match(&pos, "`")) { // Backtick open_quote = '`', close_quote = '`', open_interp = '$'; } else if (match(&pos, "'")) { // Single quote - open_quote = '\'', close_quote = '\'', open_interp = '\x03'; + open_quote = '\'', close_quote = '\'', open_interp = '$'; } else if (match(&pos, "$")) { // Customized strings lang = get_id(&pos); // $"..." or $@"...." static const char *interp_chars = "~!@#$%^&*+=\\?"; - if (match(&pos, "$")) { // Disable interpolation with $ + if (match(&pos, "$")) { // Disable interpolation with $$ open_interp = '\x03'; } else if (strchr(interp_chars, *pos)) { open_interp = *pos; ++pos; - } else if (*pos == '(') { - open_interp = '@'; // For shell commands } static const char *quote_chars = "\"'`|/;([{<"; if (!strchr(quote_chars, *pos)) @@ -1306,7 +1304,8 @@ PARSER(parse_text) { return NULL; } - ast_list_t *chunks = _parse_text_helper(ctx, &pos, open_quote, close_quote, open_interp); + bool allow_escapes = (open_quote != '`'); + ast_list_t *chunks = _parse_text_helper(ctx, &pos, open_quote, close_quote, open_interp, allow_escapes); return NewAST(ctx->file, start, pos, TextJoin, .lang=lang, .children=chunks); } @@ -2277,7 +2276,7 @@ PARSER(parse_inline_c) { if (!match(&pos, "(")) parser_err(ctx, start, pos, "I expected a '(' here"); chunks = new(ast_list_t, .ast=NewAST(ctx->file, pos, pos, TextLiteral, "({"), - .next=_parse_text_helper(ctx, &pos, '(', ')', '@')); + .next=_parse_text_helper(ctx, &pos, '(', ')', '@', false)); if (type) { REVERSE_LIST(chunks); chunks = new(ast_list_t, .ast=NewAST(ctx->file, pos, pos, TextLiteral, "; })"), .next=chunks); @@ -2286,7 +2285,7 @@ PARSER(parse_inline_c) { } else { if (!match(&pos, "{")) parser_err(ctx, start, pos, "I expected a '{' here"); - chunks = _parse_text_helper(ctx, &pos, '{', '}', '@'); + chunks = _parse_text_helper(ctx, &pos, '{', '}', '@', false); } return NewAST(ctx->file, start, pos, InlineCCode, .chunks=chunks, .type_ast=type); diff --git a/src/stdlib/text.c b/src/stdlib/text.c index 621de942..b3e9cebb 100644 --- a/src/stdlib/text.c +++ b/src/stdlib/text.c @@ -1352,12 +1352,9 @@ public Text_t Text$quoted(Text_t text, bool colorize, Text_t quotation_mark) int32_t quote_char = Text$get_grapheme(quotation_mark, 0); #define add_escaped(str) ({ if (colorize) ret = concat2_assuming_safe(ret, Text("\x1b[34;1m")); \ - if (!just_escaped) ret = concat2_assuming_safe(ret, Text("$")); \ ret = concat2_assuming_safe(ret, Text("\\" str)); \ - just_escaped = true; \ if (colorize) ret = concat2_assuming_safe(ret, Text("\x1b[0;35m")); }) TextIter_t state = NEW_TEXT_ITER_STATE(text); - bool just_escaped = false; // TODO: optimize for spans of non-escaped text for (int64_t i = 0; i < text.length; i++) { int32_t g = Text$get_grapheme_fast(&state, i); @@ -1371,21 +1368,11 @@ public Text_t Text$quoted(Text_t text, bool colorize, Text_t quotation_mark) case '\t': add_escaped("t"); break; case '\v': add_escaped("v"); break; case '\\': { - if (just_escaped) { - add_escaped("\\"); - } else { - ret = concat2_assuming_safe(ret, Text("\\")); - just_escaped = false; - } + add_escaped("\\"); break; } case '$': { - if (quote_char == '\'') { - ret = concat2_assuming_safe(ret, Text("$")); - just_escaped = false; - } else { - add_escaped("$"); - } + add_escaped("$"); break; } case '\x00' ... '\x06': case '\x0E' ... '\x1A': @@ -1397,7 +1384,6 @@ public Text_t Text$quoted(Text_t text, bool colorize, Text_t quotation_mark) ret = concat2_assuming_safe(ret, Text$from_strn(tmp, 2)); if (colorize) ret = concat2_assuming_safe(ret, Text("\x1b[0;35m")); - just_escaped = true; break; } default: { @@ -1405,7 +1391,6 @@ public Text_t Text$quoted(Text_t text, bool colorize, Text_t quotation_mark) ret = concat2_assuming_safe(ret, quotation_mark); } else { ret = concat2_assuming_safe(ret, Text$slice(text, I(i+1), I(i+1))); - just_escaped = false; } break; } @@ -1427,14 +1412,12 @@ public Text_t Text$as_text(const void *vtext, bool colorize, const TypeInfo_t *i Text_t text = *(Text_t*)vtext; // Figure out the best quotation mark to use: - bool has_dollar = false, has_double_quote = false, has_backtick = false, + bool has_double_quote = false, has_backtick = false, has_single_quote = false, needs_escapes = false; TextIter_t state = NEW_TEXT_ITER_STATE(text); for (int64_t i = 0; i < text.length; i++) { int32_t g = Text$get_grapheme_fast(&state, i); - if (g == '$') { - has_dollar = true; - } else if (g == '"') { + if (g == '"') { has_double_quote = true; } else if (g == '`') { has_backtick = true; @@ -1444,15 +1427,15 @@ public Text_t Text$as_text(const void *vtext, bool colorize, const TypeInfo_t *i } Text_t quote; - // If there's dollar signs and/or double quotes in the string, it would - // be nice to avoid needing to escape them by using single quotes, but - // only if we don't have single quotes or need to escape anything else - // (because single quotes don't have interpolation): - if ((has_dollar || has_double_quote) && !has_single_quote && !needs_escapes) + // If there's double quotes in the string, it would be nice to avoid + // needing to escape them by using single quotes, but only if we don't have + // single quotes or need to escape anything else (because single quotes + // don't have interpolation): + if (has_double_quote && !has_single_quote) quote = Text("'"); // If there is a double quote, but no backtick, we can save a bit of // escaping by using backtick instead of double quote: - else if (has_double_quote && !has_backtick) + else if (has_double_quote && has_single_quote && !has_backtick && !needs_escapes) quote = Text("`"); // Otherwise fall back to double quotes as the default quoting style: else |
