From 05724a659f914cee31dc8bc7c96a98115fd6325e Mon Sep 17 00:00:00 2001 From: Bruce Hill Date: Thu, 12 Sep 2024 01:43:00 -0400 Subject: [PATCH] Change unicode escape to \{name} and add escape for ANSI CSI sequences: \[...] --- parse.c | 24 +++++++++++++++++------- test/text.tm | 5 ++++- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/parse.c b/parse.c index d88ef04..1aff4bc 100644 --- a/parse.c +++ b/parse.c @@ -168,24 +168,33 @@ static const char *unescape(parse_ctx_t *ctx, const char **out) { if (unescapes[(int)escape[1]]) { *endpos = escape + 2; return GC_strdup(unescapes[(int)escape[1]]); - } else if (escape[1] == 'U' && escape[2] == '[') { - size_t len = strcspn(&escape[3], "\r\n]"); - if (escape[3+len] != ']') - parser_err(ctx, escape, escape + 3 + len, "Missing closing ']'"); + } else if (escape[1] == '[') { + // ANSI Control Sequence Introducer: \033 [ ... m + size_t len = strcspn(&escape[2], "\r\n]"); + if (escape[2+len] != ']') + parser_err(ctx, escape, escape + 2 + len, "Missing closing ']'"); + *endpos = escape + 3 + len; + return heap_strf("\033[%.*sm", len, &escape[2]); + } else if (escape[1] == '{') { + // Unicode codepoints by name + size_t len = strcspn(&escape[2], "\r\n}"); + if (escape[2+len] != '}') + parser_err(ctx, escape, escape + 2 + len, "Missing closing '}'"); char name[len+1] = {}; - memcpy(name, &escape[3], len); + memcpy(name, &escape[2], len); name[len] = '\0'; uint32_t codepoint = unicode_name_character(name); if (codepoint == UNINAME_INVALID) - parser_err(ctx, escape, escape + 4 + len, + parser_err(ctx, escape, escape + 3 + len, "Invalid unicode codepoint name: \"%s\"", name); - *endpos = escape + 4 + len; + *endpos = escape + 3 + len; char *str = GC_MALLOC_ATOMIC(16); size_t u8_len = 16; (void)u32_to_u8(&codepoint, 1, (uint8_t*)str, &u8_len); str[u8_len] = '\0'; return str; } else if (escape[1] == 'U' && escape[2]) { + // Unicode codepoints by hex char *endptr = NULL; long codepoint = strtol(escape+2, &endptr, 16); 
uint32_t ustr[2] = {codepoint, 0}; @@ -195,6 +204,7 @@ static const char *unescape(parse_ctx_t *ctx, const char **out) { *endpos = endptr; return GC_strndup((char*)buf, bufsize); } else if (escape[1] == 'x' && escape[2] && escape[3]) { + // ASCII 2-digit hex char *endptr = NULL; char c = (char)strtol(escape+2, &endptr, 16); *endpos = escape + 4; diff --git a/test/text.tm b/test/text.tm index 216e5aa..44e42d3 100644 --- a/test/text.tm +++ b/test/text.tm @@ -16,9 +16,12 @@ func main(): >> \U65\U301 = "é" - >> \U[Penguin]:codepoint_names() + >> \{Penguin}:codepoint_names() = ["PENGUIN"] + >> \[31;1] + = "\e[31;1m" + >> \UE9 == \U65\U301 = yes