diff options
| author | Bruce Hill <bruce@bruce-hill.com> | 2024-09-02 18:47:39 -0400 |
|---|---|---|
| committer | Bruce Hill <bruce@bruce-hill.com> | 2024-09-02 18:47:39 -0400 |
| commit | 61e482f6f36aee6f72392a6188f2ec5c858b88fd (patch) | |
| tree | bea4123fcc62dd834405ae89ce9fe260e90a0023 | |
| parent | f0f8f218703ebb4512b3cd3f9e06b86a7d9861b0 (diff) | |
Initial WIP first pass
38 files changed, 1857 insertions, 697 deletions
@@ -35,9 +35,9 @@ static CORD optional_tagged_type(const char *tag, type_ast_t *ast); CORD xml_escape(CORD text) { - text = Text$replace(text, "&", "&", I(-1)); - text = Text$replace(text, "<", "<", I(-1)); - text = Text$replace(text, ">", ">", I(-1)); + text = CORD_replace(text, "&", "&"); + text = CORD_replace(text, "<", "<"); + text = CORD_replace(text, ">", ">"); return text; } diff --git a/builtins/array.c b/builtins/array.c index bf1fe4d8..63539559 100644 --- a/builtins/array.c +++ b/builtins/array.c @@ -12,12 +12,14 @@ #include "array.h" #include "functions.h" -#include "halfsiphash.h" #include "integers.h" #include "table.h" +#include "text.h" #include "types.h" #include "util.h" +#include "siphash.c" + static inline int64_t get_padded_item_size(const TypeInfo *info) { int64_t size = info->ArrayInfo.item->size; @@ -532,67 +534,38 @@ public bool Array$equal(const array_t *x, const array_t *y, const TypeInfo *type return (Array$compare(x, y, type) == 0); } -public CORD Array$as_text(const array_t *arr, bool colorize, const TypeInfo *type) +public Text_t Array$as_text(const array_t *arr, bool colorize, const TypeInfo *type) { if (!arr) - return CORD_all("[", generic_as_text(NULL, false, type->ArrayInfo.item), "]"); + return Text$concat(Text$from_str("["), generic_as_text(NULL, false, type->ArrayInfo.item), Text$from_str("]")); const TypeInfo *item_type = type->ArrayInfo.item; - CORD c = "["; + Text_t text = Text$from_str("["); for (int64_t i = 0; i < arr->length; i++) { if (i > 0) - c = CORD_cat(c, ", "); - CORD item_cord = generic_as_text(arr->data + i*arr->stride, colorize, item_type); - c = CORD_cat(c, item_cord); + text = Text$concat(text, Text$from_str(", ")); + Text_t item_text = generic_as_text(arr->data + i*arr->stride, colorize, item_type); + text = Text$concat(text, item_text); } - c = CORD_cat(c, "]"); - return c; + text = Text$concat(text, Text$from_str("]")); + return text; } -public uint32_t Array$hash(const array_t *arr, const TypeInfo *type) 
+public uint64_t Array$hash(const array_t *arr, const TypeInfo *type) { - // Array hash is calculated as a rolling, compacting hash of the length of the array, followed by - // the hashes of its items (or the items themselves if they're small plain data) - // In other words, it reads in a chunk of items or item hashes, then when it fills up the chunk, - // hashes it down to a single item to start the next chunk. This repeats until the end, when it - // hashes the last chunk down to a uint32_t. const TypeInfo *item = type->ArrayInfo.item; - if (item->tag == PointerInfo || (item->tag == CustomInfo && item->CustomInfo.hash == NULL)) { // Raw data hash - uint8_t hash_batch[4 + 8*item->size]; - memset(hash_batch, 0, sizeof(hash_batch)); - uint8_t *p = hash_batch, *end = hash_batch + sizeof(hash_batch); - int64_t length = arr->length; - *p = (uint32_t)length; - p += sizeof(uint32_t); - for (int64_t i = 0; i < arr->length; i++) { - if (p >= end) { - uint32_t chunk_hash; - halfsiphash(&hash_batch, sizeof(hash_batch), TOMO_HASH_KEY, (uint8_t*)&chunk_hash, sizeof(chunk_hash)); - p = hash_batch; - *(uint32_t*)p = chunk_hash; - p += sizeof(uint32_t); - } - memcpy((p += item->size), arr->data + i*arr->stride, item->size); - } - uint32_t hash; - halfsiphash(&hash_batch, ((int64_t)p) - ((int64_t)hash_batch), TOMO_HASH_KEY, (uint8_t*)&hash, sizeof(hash)); - return hash; + siphash sh; + siphashinit(&sh, sizeof(uint64_t[arr->length]), (uint64_t*)TOMO_HASH_KEY); + if (item->tag == PointerInfo || (item->tag == CustomInfo && item->CustomInfo.hash == NULL && item->size == sizeof(void*))) { // Raw data hash + for (int64_t i = 0; i < arr->length; i++) + siphashadd64bits(&sh, (uint64_t)(arr->data + i*arr->stride)); } else { - uint32_t hash_batch[16] = {(uint32_t)arr->length}; - uint32_t *p = &hash_batch[1], *end = hash_batch + sizeof(hash_batch)/sizeof(hash_batch[0]); for (int64_t i = 0; i < arr->length; i++) { - if (p >= end) { - uint64_t chunk_hash; - halfsiphash(&hash_batch, 
sizeof(hash_batch), TOMO_HASH_KEY, (uint8_t*)&chunk_hash, sizeof(chunk_hash)); - p = hash_batch; - *(p++) = chunk_hash; - } - *(p++) = generic_hash(arr->data + i*arr->stride, item); + uint64_t item_hash = generic_hash(arr->data + i*arr->stride, item); + siphashadd64bits(&sh, item_hash); } - uint32_t hash; - halfsiphash(&hash_batch, ((int64_t)p) - ((int64_t)hash_batch), TOMO_HASH_KEY, (uint8_t*)&hash, sizeof(hash)); - return hash; } + return siphashfinish_last_part(&sh, 0); } static void siftdown(array_t *heap, int64_t startpos, int64_t pos, closure_t comparison, int64_t padded_item_size) diff --git a/builtins/array.h b/builtins/array.h index 47d10fd1..9dcdca6f 100644 --- a/builtins/array.h +++ b/builtins/array.h @@ -16,7 +16,7 @@ const array_t arr = arr_expr; int64_t index = index_expr; \ int64_t off = index + (index < 0) * (arr.length + 1) - 1; \ if (__builtin_expect(off < 0 || off >= arr.length, 0)) \ - fail_source(filename, start, end, "Invalid array index: %r (array has length %ld)\n", Int64$as_text(&index, no, NULL), arr.length); \ + fail_source(filename, start, end, "Invalid array index: %s (array has length %ld)\n", Text$as_c_string(Int64$as_text(&index, no, NULL)), arr.length); \ (item_type*)(arr.data + arr.stride * off);}) #define Array_get_unchecked(type, x, i) *({ const array_t arr = x; int64_t index = i; \ int64_t off = index + (index < 0) * (arr.length + 1) - 1; \ @@ -25,7 +25,7 @@ array_t *arr = arr_expr; int64_t index = index_expr; \ int64_t off = index + (index < 0) * (arr->length + 1) - 1; \ if (__builtin_expect(off < 0 || off >= arr->length, 0)) \ - fail_source(filename, start, end, "Invalid array index: %r (array has length %ld)\n", Int64$as_text(&index, no, NULL), arr->length); \ + fail_source(filename, start, end, "Invalid array index: %s (array has length %ld)\n", Text$as_c_string(Int64$as_text(&index, no, NULL)), arr->length); \ if (arr->data_refcount > 0) \ Array$compact(arr, padded_item_size); \ (item_type*)(arr->data + arr->stride * off); 
}) @@ -87,10 +87,10 @@ array_t Array$to(array_t array, Int_t last); array_t Array$by(array_t array, Int_t stride, int64_t padded_item_size); array_t Array$reversed(array_t array, int64_t padded_item_size); array_t Array$concat(array_t x, array_t y, int64_t padded_item_size); -uint32_t Array$hash(const array_t *arr, const TypeInfo *type); +uint64_t Array$hash(const array_t *arr, const TypeInfo *type); int32_t Array$compare(const array_t *x, const array_t *y, const TypeInfo *type); bool Array$equal(const array_t *x, const array_t *y, const TypeInfo *type); -CORD Array$as_text(const array_t *arr, bool colorize, const TypeInfo *type); +Text_t Array$as_text(const array_t *arr, bool colorize, const TypeInfo *type); void Array$heapify(array_t *heap, closure_t comparison, int64_t padded_item_size); void Array$heap_push(array_t *heap, const void *item, closure_t comparison, int64_t padded_item_size); #define Array$heap_push_value(heap, _value, comparison, padded_item_size) ({ __typeof(_value) value = _value; Array$heap_push(heap, &value, comparison, padded_item_size); }) diff --git a/builtins/bool.c b/builtins/bool.c index af2f0ac7..488c6ddc 100644 --- a/builtins/bool.c +++ b/builtins/bool.c @@ -13,25 +13,28 @@ #include "types.h" #include "util.h" -public CORD Bool$as_text(const bool *b, bool colorize, const TypeInfo *type) +public Text_t Bool$as_text(const bool *b, bool colorize, const TypeInfo *type) { (void)type; - if (!b) return "Bool"; + if (!b) return Text$from_str("Bool"); if (colorize) - return *b ? "\x1b[35myes\x1b[m" : "\x1b[35mno\x1b[m"; + return *b ? Text$from_str("\x1b[35myes\x1b[m") : Text$from_str("\x1b[35mno\x1b[m"); else - return *b ? "yes" : "no"; + return *b ? 
Text$from_str("yes") : Text$from_str("no"); } -public Bool_t Bool$from_text(CORD text, bool *success) +public Bool_t Bool$from_text(Text_t text, bool *success) { - CORD lower = Text$lower(text); - if (CORD_cmp(lower, "yes") == 0 || CORD_cmp(lower, "on") == 0 - || CORD_cmp(lower, "true") == 0 || CORD_cmp(lower, "1") == 0) { + if (Text$equal_ignoring_case(text, Text$from_str("yes")) + || Text$equal_ignoring_case(text, Text$from_str("on")) + || Text$equal_ignoring_case(text, Text$from_str("true")) + || Text$equal_ignoring_case(text, Text$from_str("1"))) { if (success) *success = yes; return yes; - } else if (CORD_cmp(lower, "no") == 0 || CORD_cmp(lower, "off") == 0 - || CORD_cmp(lower, "false") == 0 || CORD_cmp(lower, "0") == 0) { + } else if (Text$equal_ignoring_case(text, Text$from_str("no")) + || Text$equal_ignoring_case(text, Text$from_str("off")) + || Text$equal_ignoring_case(text, Text$from_str("false")) + || Text$equal_ignoring_case(text, Text$from_str("0"))) { if (success) *success = yes; return no; } else { diff --git a/builtins/bool.h b/builtins/bool.h index 716ddd5b..578cad6c 100644 --- a/builtins/bool.h +++ b/builtins/bool.h @@ -12,8 +12,8 @@ #define yes (Bool_t)true #define no (Bool_t)false -CORD Bool$as_text(const bool *b, bool colorize, const TypeInfo *type); -bool Bool$from_text(CORD text, bool *success); +Text_t Bool$as_text(const bool *b, bool colorize, const TypeInfo *type); +bool Bool$from_text(Text_t text, bool *success); Bool_t Bool$random(double p); extern const TypeInfo $Bool; diff --git a/builtins/c_string.c b/builtins/c_string.c index 3b258aad..8abb2b9f 100644 --- a/builtins/c_string.c +++ b/builtins/c_string.c @@ -13,12 +13,12 @@ #include "types.h" #include "util.h" -public CORD CString$as_text(const void *c_string, bool colorize, const TypeInfo *info) +public Text_t CString$as_text(const void *c_string, bool colorize, const TypeInfo *info) { (void)info; - if (!c_string) return "CString"; - CORD text = CORD_from_char_star(*(char**)c_string); 
- return CORD_all(colorize ? "\x1b[34mCString\x1b[m(" : "CString(", Text$quoted(text, colorize), ")"); + if (!c_string) return Text$from_str("CString"); + Text_t text = Text$from_str(*(char**)c_string); + return Text$concat(Text$from_str(colorize ? "\x1b[34mCString\x1b[m(" : "CString("), Text$quoted(text, colorize), Text$from_str(")")); } public int CString$compare(const char **x, const char **y) diff --git a/builtins/c_string.h b/builtins/c_string.h index 6b4b0aad..d909083d 100644 --- a/builtins/c_string.h +++ b/builtins/c_string.h @@ -8,10 +8,10 @@ #include "types.h" -CORD CString$as_text(const void *str, bool colorize, const TypeInfo *info); +Text_t CString$as_text(const void *str, bool colorize, const TypeInfo *info); int CString$compare(const char **x, const char **y); bool CString$equal(const char **x, const char **y); -uint32_t CString$hash(const char **str); +uint64_t CString$hash(const char **str); extern const TypeInfo $CString; diff --git a/builtins/channel.c b/builtins/channel.c index c2e2cf82..a0a0ddc5 100644 --- a/builtins/channel.c +++ b/builtins/channel.c @@ -15,6 +15,7 @@ #include "functions.h" #include "halfsiphash.h" #include "integers.h" +#include "text.h" #include "types.h" #include "util.h" #include "where.h" @@ -120,15 +121,21 @@ bool Channel$equal(const channel_t **x, const channel_t **y, const TypeInfo *typ return (*x == *y); } -CORD Channel$as_text(const channel_t **channel, bool colorize, const TypeInfo *type) +Text_t Channel$as_text(const channel_t **channel, bool colorize, const TypeInfo *type) { const TypeInfo *item_type = type->ChannelInfo.item; if (!channel) { - CORD typename = generic_as_text(NULL, false, item_type); - return colorize ? CORD_asprintf("\x1b[34;1m|:%s|\x1b[m", typename) : CORD_all("|:", typename, "|"); + Text_t typename = generic_as_text(NULL, false, item_type); + return Text$concat(Text$from_str(colorize ? "\x1b[34;1m|:" : "|:"), typename, Text$from_str(colorize ? 
"|\x1b[m" : "|")); } - CORD typename = generic_as_text(NULL, false, item_type); - return CORD_asprintf(colorize ? "\x1b[34;1m|:%s|<%p>\x1b[m" : "|:%s|<%p>", typename, *channel); + Text_t typename = generic_as_text(NULL, false, item_type); + return Text$concat( + Text$from_str(colorize ? "\x1b[34;1m|:" : "|:"), + typename, + Text$from_str("|<"), + Int64$hex((int64_t)(void*)*channel, I_small(0), true, true), + Text$from_str(colorize ? ">\x1b[m" : ">") + ); } // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/channel.h b/builtins/channel.h index 241e7ec9..bf24f806 100644 --- a/builtins/channel.h +++ b/builtins/channel.h @@ -22,9 +22,9 @@ void Channel$peek(channel_t *channel, void *out, Where_t where, int64_t item_siz #define Channel$peek_value(channel, where, t) ({ t _val; Channel$peek(channel, &_val, where, sizeof(t)); _val; }) void Channel$clear(channel_t *channel); array_t Channel$view(channel_t *channel); -uint32_t Channel$hash(const channel_t **channel, const TypeInfo *type); +uint64_t Channel$hash(const channel_t **channel, const TypeInfo *type); int32_t Channel$compare(const channel_t **x, const channel_t **y, const TypeInfo *type); bool Channel$equal(const channel_t **x, const channel_t **y, const TypeInfo *type); -CORD Channel$as_text(const channel_t **channel, bool colorize, const TypeInfo *type); +Text_t Channel$as_text(const channel_t **channel, bool colorize, const TypeInfo *type); // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/datatypes.h b/builtins/datatypes.h index 699c40e0..433e1dd9 100644 --- a/builtins/datatypes.h +++ b/builtins/datatypes.h @@ -71,4 +71,19 @@ typedef struct { int64_t max_size; } channel_t; +enum text_type { TEXT_SHORT_ASCII, TEXT_ASCII, TEXT_SHORT_GRAPHEMES, TEXT_GRAPHEMES, TEXT_SUBTEXT }; + +typedef struct Text_s { + int64_t length; // Number of grapheme clusters + uint64_t hash:61; + uint8_t tag:3; + union { + char short_ascii[8]; + const char *ascii; + int32_t short_graphemes[2]; + int32_t 
*graphemes; + struct Text_s *subtexts; + }; +} Text_t; + // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/functions.c b/builtins/functions.c index 3eea3c89..06636cba 100644 --- a/builtins/functions.c +++ b/builtins/functions.c @@ -2,7 +2,6 @@ #include <errno.h> #include <execinfo.h> #include <gc.h> -#include <gc/cord.h> #include <stdbool.h> #include <stdint.h> #include <stdlib.h> @@ -16,7 +15,6 @@ #include "channel.h" #include "files.h" #include "functions.h" -#include "halfsiphash.h" #include "integers.h" #include "pointer.h" #include "string.h" @@ -25,7 +23,9 @@ #include "types.h" #include "util.h" -public uint8_t TOMO_HASH_KEY[8] = {0}; +#include "siphash.c" + +public uint8_t TOMO_HASH_KEY[16] = {0}; public void tomo_init(void) { @@ -37,6 +37,9 @@ public void tomo_init(void) srand(seed); srand48(seed); Int$init_random(seed); + + if (register_printf_specifier('k', printf_text, printf_text_size)) + errx(1, "Couldn't set printf specifier"); } static void print_stack_trace(FILE *out) @@ -60,13 +63,13 @@ static void print_stack_trace(FILE *out) fprintf(out, "\x1b[m"); } -public void fail(CORD fmt, ...) +public void fail(const char *fmt, ...) { if (USE_COLOR) fputs("\x1b[31;7m ==================== ERROR ==================== \n\n\x1b[0;1m", stderr); else fputs("==================== ERROR ====================\n\n", stderr); va_list args; va_start(args, fmt); - CORD_vfprintf(stderr, fmt, args); + vfprintf(stderr, fmt, args); if (USE_COLOR) fputs("\x1b[m", stderr); fputs("\n\n", stderr); va_end(args); @@ -75,14 +78,14 @@ public void fail(CORD fmt, ...) raise(SIGABRT); } -public void fail_source(const char *filename, int64_t start, int64_t end, CORD fmt, ...) +public void fail_source(const char *filename, int64_t start, int64_t end, const char *fmt, ...) 
{ if (USE_COLOR) fputs("\n\x1b[31;7m ==================== ERROR ==================== \n\n\x1b[0;1m", stderr); else fputs("\n==================== ERROR ====================\n\n", stderr); va_list args; va_start(args, fmt); - CORD_vfprintf(stderr, fmt, args); + vfprintf(stderr, fmt, args); va_end(args); file_t *file = filename ? load_file(filename) : NULL; @@ -98,11 +101,10 @@ public void fail_source(const char *filename, int64_t start, int64_t end, CORD f raise(SIGABRT); } -public uint32_t generic_hash(const void *obj, const TypeInfo *type) +public uint64_t generic_hash(const void *obj, const TypeInfo *type) { switch (type->tag) { - case PointerInfo: case FunctionInfo: return Pointer$hash(obj, type); - case TextInfo: return Text$hash(obj); + case TextInfo: return Text$hash((void*)obj); case ArrayInfo: return Array$hash(obj, type); case ChannelInfo: return Channel$hash((const channel_t**)obj, type); case TableInfo: return Table$hash(obj, type); @@ -113,9 +115,7 @@ public uint32_t generic_hash(const void *obj, const TypeInfo *type) return type->CustomInfo.hash(obj, type); default: { hash_data:; - uint32_t hash; - halfsiphash((void*)obj, type->size, TOMO_HASH_KEY, (uint8_t*)&hash, sizeof(hash)); - return hash; + return siphash24((void*)obj, type->size, (uint64_t*)TOMO_HASH_KEY); } } } @@ -158,7 +158,7 @@ public bool generic_equal(const void *x, const void *y, const TypeInfo *type) } } -public CORD generic_as_text(const void *obj, bool colorize, const TypeInfo *type) +public Text_t generic_as_text(const void *obj, bool colorize, const TypeInfo *type) { switch (type->tag) { case PointerInfo: return Pointer$as_text(obj, colorize, type); @@ -168,19 +168,21 @@ public CORD generic_as_text(const void *obj, bool colorize, const TypeInfo *type case ChannelInfo: return Channel$as_text((const channel_t**)obj, colorize, type); case TableInfo: return Table$as_text(obj, colorize, type); case TypeInfoInfo: return Type$as_text(obj, colorize, type); - case EmptyStruct: return colorize 
? CORD_all("\x1b[0;1m", type->EmptyStruct.name, "\x1b[m()") : CORD_all(type->EmptyStruct.name, "()"); + case EmptyStruct: return colorize ? + Text$concat(Text$from_str("\x1b[0;1m"), Text$from_str(type->EmptyStruct.name), Text$from_str("\x1b[m()")) + : Text$concat(Text$from_str(type->EmptyStruct.name), Text$from_str("()")); case CustomInfo: if (!type->CustomInfo.as_text) - fail("No cord function provided for type!\n"); + fail("No text function provided for type!\n"); return type->CustomInfo.as_text(obj, colorize, type); default: errx(1, "Invalid type tag: %d", type->tag); } } -public CORD builtin_last_err() +public Text_t builtin_last_err() { - return CORD_from_char_star(strerror(errno)); + return Text$from_str(strerror(errno)); } static int TEST_DEPTH = 0; @@ -193,12 +195,12 @@ public void start_test(const char *filename, int64_t start, int64_t end) if (filename && file) { for (int i = 0; i < 3*TEST_DEPTH; i++) fputc(' ', stderr); - CORD_fprintf(stderr, USE_COLOR ? "\x1b[33;1m>> \x1b[0m%.*s\x1b[m\n" : ">> %.*s\n", (end - start), file->text + start); + fprintf(stderr, USE_COLOR ? "\x1b[33;1m>> \x1b[0m%.*s\x1b[m\n" : ">> %.*s\n", (end - start), file->text + start); } ++TEST_DEPTH; } -public void end_test(void *expr, const TypeInfo *type, CORD expected, const char *filename, int64_t start, int64_t end) +public void end_test(void *expr, const TypeInfo *type, const char *expected, const char *filename, int64_t start, int64_t end) { (void)filename; (void)start; @@ -206,25 +208,29 @@ public void end_test(void *expr, const TypeInfo *type, CORD expected, const char --TEST_DEPTH; if (!expr) return; - CORD expr_cord = generic_as_text(expr, USE_COLOR, type); - CORD type_name = generic_as_text(NULL, false, type); + Text_t expr_text = generic_as_text(expr, USE_COLOR, type); + Text_t type_name = generic_as_text(NULL, false, type); for (int i = 0; i < 3*TEST_DEPTH; i++) fputc(' ', stderr); - CORD_fprintf(stderr, USE_COLOR ? 
"\x1b[2m=\x1b[0m %r \x1b[2m: %r\x1b[m\n" : "= %r : %r\n", expr_cord, type_name); - if (expected) { - CORD expr_plain = USE_COLOR ? generic_as_text(expr, false, type) : expr_cord; - bool success = Text$equal(&expr_plain, &expected); - if (!success && CORD_chr(expected, 0, ':')) { - CORD with_type = CORD_catn(3, expr_plain, " : ", type_name); - success = Text$equal(&with_type, &expected); + fprintf(stderr, USE_COLOR ? "\x1b[2m=\x1b[0m %k \x1b[2m: %k\x1b[m\n" : "= %k : %k\n", &expr_text, &type_name); + if (expected && expected[0]) { + Text_t expected_text = Text$from_str(expected); + Text_t expr_plain = USE_COLOR ? generic_as_text(expr, false, type) : expr_text; + bool success = Text$equal(&expr_plain, &expected_text); + if (!success) { + Int_t colon = Text$find(expected_text, Text$from_str(":"), I_small(0), NULL); + if (colon.small != I_small(0).small) { + Text_t with_type = Text$concat(expr_plain, Text$from_str(" : "), type_name); + success = Text$equal(&with_type, &expected_text); + } } if (!success) { fprintf(stderr, USE_COLOR - ? "\n\x1b[31;7m ==================== TEST FAILED ==================== \x1b[0;1m\n\nExpected: \x1b[1;32m%s\x1b[0m\n\x1b[1m But got:\x1b[m %s\n\n" - : "\n==================== TEST FAILED ====================\nExpected: %s\n\n But got: %s\n\n", - CORD_to_const_char_star(expected), CORD_to_const_char_star(expr_cord)); + ? 
"\n\x1b[31;7m ==================== TEST FAILED ==================== \x1b[0;1m\n\nExpected: \x1b[1;32m%s\x1b[0m\n\x1b[1m But got:\x1b[m %k\n\n" + : "\n==================== TEST FAILED ====================\nExpected: %s\n\n But got: %k\n\n", + expected, &expr_text); print_stack_trace(stderr); fflush(stderr); @@ -233,37 +239,29 @@ public void end_test(void *expr, const TypeInfo *type, CORD expected, const char } } -public void say(CORD text, bool newline) +public void say(Text_t text, bool newline) { - uint8_t buf[512] = {0}; - size_t buf_len = sizeof(buf)-1; - const char *str = CORD_to_const_char_star(text); - uint8_t *normalized = u8_normalize(UNINORM_NFD, (uint8_t*)str, strlen(str), buf, &buf_len); - if (normalized) { - write(STDOUT_FILENO, normalized, buf_len); - if (newline) - write(STDOUT_FILENO, "\n", 1); - if (normalized != buf) - free(normalized); - } + Text$print(stdout, text); + if (newline) + fputc('\n', stdout); } -public bool pop_flag(char **argv, int *i, const char *flag, CORD *result) +public bool pop_flag(char **argv, int *i, const char *flag, Text_t *result) { if (argv[*i][0] != '-' || argv[*i][1] != '-') { return false; } else if (streq(argv[*i] + 2, flag)) { - *result = CORD_EMPTY; + *result = (Text_t){.length=0}; argv[*i] = NULL; *i += 1; return true; } else if (strncmp(argv[*i] + 2, "no-", 3) == 0 && streq(argv[*i] + 5, flag)) { - *result = "no"; + *result = Text$from_str("no"); argv[*i] = NULL; *i += 1; return true; } else if (strncmp(argv[*i] + 2, flag, strlen(flag)) == 0 && argv[*i][2 + strlen(flag)] == '=') { - *result = CORD_from_char_star(argv[*i] + 2 + strlen(flag) + 1); + *result = Text$from_str(argv[*i] + 2 + strlen(flag) + 1); argv[*i] = NULL; *i += 1; return true; diff --git a/builtins/functions.h b/builtins/functions.h index 70266ba6..96837249 100644 --- a/builtins/functions.h +++ b/builtins/functions.h @@ -9,25 +9,25 @@ #include "datatypes.h" #include "types.h" -extern uint8_t TOMO_HASH_KEY[8]; +extern uint8_t TOMO_HASH_KEY[16]; void 
tomo_init(void); -void fail(CORD fmt, ...); -void fail_source(const char *filename, int64_t start, int64_t end, CORD fmt, ...); -CORD builtin_last_err(); +void fail(const char *fmt, ...); +void fail_source(const char *filename, int64_t start, int64_t end, const char *fmt, ...); +Text_t builtin_last_err(); void start_test(const char *filename, int64_t start, int64_t end); -void end_test(void *expr, const TypeInfo *type, CORD expected, const char *filename, int64_t start, int64_t end); +void end_test(void *expr, const TypeInfo *type, const char *expected, const char *filename, int64_t start, int64_t end); #define test(expr, type, expected, filename, start, end) {\ start_test(filename, start, end); \ end_test(expr, type, expected, filename, start, end); } -void say(CORD text, bool newline); +void say(Text_t text, bool newline); -uint32_t generic_hash(const void *obj, const TypeInfo *type); +uint64_t generic_hash(const void *obj, const TypeInfo *type); int32_t generic_compare(const void *x, const void *y, const TypeInfo *type); bool generic_equal(const void *x, const void *y, const TypeInfo *type); -CORD generic_as_text(const void *obj, bool colorize, const TypeInfo *type); +Text_t generic_as_text(const void *obj, bool colorize, const TypeInfo *type); closure_t spawn(closure_t fn); -bool pop_flag(char **argv, int *i, const char *flag, CORD *result); +bool pop_flag(char **argv, int *i, const char *flag, Text_t *result); // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/integers.c b/builtins/integers.c index 4a7b5c3a..45db160d 100644 --- a/builtins/integers.c +++ b/builtins/integers.c @@ -1,4 +1,5 @@ // Integer type infos and methods +#include <ctype.h> #include <gc.h> #include <gc/cord.h> #include <gmp.h> @@ -11,7 +12,8 @@ #include "integers.h" #include "text.h" #include "types.h" -#include "SipHash/halfsiphash.h" + +#include "siphash.c" static gmp_randstate_t Int_rng = {}; @@ -21,15 +23,17 @@ public void Int$init_random(long seed) 
gmp_randseed_ui(Int_rng, (unsigned long)seed); } -public CORD Int$as_text(const Int_t *i, bool colorize, const TypeInfo *type) { +public Text_t Int$as_text(const Int_t *i, bool colorize, const TypeInfo *type) { (void)type; - if (!i) return "Int"; + if (!i) return Text$from_str("Int"); if (__builtin_expect(i->small & 1, 1)) { - return CORD_asprintf(colorize ? "\x1b[35m%ld\x1b[33;2m\x1b[m" : "%ld", (i->small)>>2); + return Text$format(colorize ? "\x1b[35m%ld\x1b[m" : "%ld", (i->small)>>2); } else { char *str = mpz_get_str(NULL, 10, *i->big); - return CORD_asprintf(colorize ? "\x1b[35m%s\x1b[33;2m\x1b[m" : "%s", str); + Text_t text = Text$from_str(str); + if (colorize) text = Text$concat(Text$from_str("\x1b[35m"), text, Text$from_str("\x1b[m")); + return text; } } @@ -55,62 +59,86 @@ public bool Int$equal_value(const Int_t x, const Int_t y) { return x.small == y.small || (__builtin_expect(((x.small | y.small) & 1) == 0, 0) && mpz_cmp(*x.big, *y.big) == 0); } -public uint32_t Int$hash(const Int_t *x, const TypeInfo *type) { +public uint64_t Int$hash(const Int_t *x, const TypeInfo *type) { (void)type; - uint32_t hash; if (__builtin_expect(x->small & 1, 1)) { - halfsiphash(&x->small, sizeof(x->small), TOMO_HASH_KEY, (uint8_t*)&hash, sizeof(hash)); + int64_t i = (x->small>>2); + return siphash24((void*)&i, sizeof(i), (uint64_t*)TOMO_HASH_KEY); } else { char *str = mpz_get_str(NULL, 16, *x->big); - halfsiphash(str, strlen(str), TOMO_HASH_KEY, (uint8_t*)&hash, sizeof(hash)); + return siphash24((void*)str, strlen(str), (uint64_t*)TOMO_HASH_KEY); } - return hash; } -public CORD Int$format(Int_t i, Int_t digits_int) +public Text_t Int$format(Int_t i, Int_t digits_int) { int64_t digits = Int_to_Int64(digits_int, false); if (__builtin_expect(i.small & 1, 1)) { - return CORD_asprintf("%0.*ld", digits, (i.small)>>2); + return Text$format("%0.*ld", digits, (i.small)>>2); } else { - CORD str = mpz_get_str(NULL, 10, *i.big); + char *str = mpz_get_str(NULL, 10, *i.big); bool negative 
= (str[0] == '-'); - if (digits > (int64_t)CORD_len(str)) { - if (negative) - str = CORD_all("-", CORD_chars('0', digits - CORD_len(str)), CORD_substr(str, 1, ~0)); - else - str = CORD_all(CORD_chars('0', digits - CORD_len(str)), str); - } - return str; + int64_t needed_zeroes = digits - (int64_t)strlen(str); + if (needed_zeroes <= 0) + return Text$from_str(str); + + char *zeroes = GC_MALLOC_ATOMIC(needed_zeroes); + memset(zeroes, '0', needed_zeroes); + if (negative) + return Text$concat(Text$from_str("-"), Text$from_str(zeroes), Text$from_str(str + 1)); + else + return Text$concat(Text$from_str(zeroes), Text$from_str(str)); } } -public CORD Int$hex(Int_t i, Int_t digits_int, bool uppercase, bool prefix) { +public Text_t Int$hex(Int_t i, Int_t digits_int, bool uppercase, bool prefix) { + if (Int$compare(&i, (Int_t[1]){I_small(0)}, &$Int) < 0) + return Text$concat(Text$from_str("-"), Int$hex(Int$negative(i), digits_int, uppercase, prefix)); + int64_t digits = Int_to_Int64(digits_int, false); - const char *hex_fmt = uppercase ? (prefix ? "0x%0.*lX" : "%0.*lX") : (prefix ? "0x%0.*lx" : "%0.*lx"); if (__builtin_expect(i.small & 1, 1)) { - return CORD_asprintf(hex_fmt, digits, (i.small)>>2); + const char *hex_fmt = uppercase ? (prefix ? "0x%0.*lX" : "%0.*lX") : (prefix ? "0x%0.*lx" : "%0.*lx"); + return Text$format(hex_fmt, digits, (i.small)>>2); } else { - CORD str = mpz_get_str(NULL, 16, *i.big); - if (uppercase) str = Text$upper(str); - if (digits > (int64_t)CORD_len(str)) - str = CORD_cat(CORD_chars('0', digits - CORD_len(str)), str); - if (prefix) str = CORD_cat("0x", str); - return str; + char *str = mpz_get_str(NULL, 16, *i.big); + if (uppercase) { + for (char *c = str; *c; c++) + *c = (char)toupper(*c); + } + int64_t needed_zeroes = digits - (int64_t)strlen(str); + if (needed_zeroes <= 0) + return prefix ? 
Text$concat(Text$from_str("0x"), Text$from_str(str)) : Text$from_str(str); + + char *zeroes = GC_MALLOC_ATOMIC(needed_zeroes); + memset(zeroes, '0', needed_zeroes); + if (prefix) + return Text$concat(Text$from_str("0x"), Text$from_str(zeroes), Text$from_str(str)); + else + return Text$concat(Text$from_str(zeroes), Text$from_str(str)); } } -public CORD Int$octal(Int_t i, Int_t digits_int, bool prefix) { +public Text_t Int$octal(Int_t i, Int_t digits_int, bool prefix) { + Int_t zero = I_small(0); + if (Int$compare(&i, &zero, &$Int) < 0) + return Text$concat(Text$from_str("-"), Int$octal(Int$negative(i), digits_int, prefix)); + int64_t digits = Int_to_Int64(digits_int, false); - const char *octal_fmt = prefix ? "0o%0.*lo" : "%0.*lo"; if (__builtin_expect(i.small & 1, 1)) { - return CORD_asprintf(octal_fmt, (int)digits, (uint64_t)(i.small >> 2)); + const char *octal_fmt = prefix ? "0o%0.*lo" : "%0.*lo"; + return Text$format(octal_fmt, digits, (i.small)>>2); } else { - CORD str = mpz_get_str(NULL, 8, *i.big); - if (digits > (int64_t)CORD_len(str)) - str = CORD_cat(CORD_chars('0', digits - CORD_len(str)), str); - if (prefix) str = CORD_cat("0o", str); - return str; + char *str = mpz_get_str(NULL, 8, *i.big); + int64_t needed_zeroes = digits - (int64_t)strlen(str); + if (needed_zeroes <= 0) + return prefix ? 
Text$concat(Text$from_str("0o"), Text$from_str(str)) : Text$from_str(str); + + char *zeroes = GC_MALLOC_ATOMIC(needed_zeroes); + memset(zeroes, '0', needed_zeroes); + if (prefix) + return Text$concat(Text$from_str("0o"), Text$from_str(zeroes), Text$from_str(str)); + else + return Text$concat(Text$from_str(zeroes), Text$from_str(str)); } } @@ -290,9 +318,11 @@ public Int_t Int$sqrt(Int_t i) public Int_t Int$random(Int_t min, Int_t max) { int32_t cmp = Int$compare(&min, &max, &$Int); - if (cmp > 0) - fail("Random minimum value (%r) is larger than the maximum value (%r)", - Int$as_text(&min, false, &$Int), Int$as_text(&max, false, &$Int)); + if (cmp > 0) { + Text_t min_text = Int$as_text(&min, false, &$Int), max_text = Int$as_text(&max, false, &$Int); + fail("Random minimum value (%k) is larger than the maximum value (%k)", + &min_text, &max_text); + } if (cmp == 0) return min; mpz_t range_size; @@ -315,8 +345,8 @@ public Range_t Int$to(Int_t from, Int_t to) { return (Range_t){from, to, Int$compare(&to, &from, &$Int) >= 0 ? 
(Int_t){.small=(1<<2)|1} : (Int_t){.small=(-1>>2)|1}}; } -public Int_t Int$from_text(CORD text, bool *success) { - const char *str = CORD_to_const_char_star(text); +public Int_t Int$from_text(Text_t text, bool *success) { + const char *str = Text$as_c_string(text); mpz_t i; int result; if (strncmp(str, "0x", 2) == 0) { @@ -355,7 +385,7 @@ public Int_t Int$prev_prime(Int_t x) mpz_t p; mpz_init_set_int(p, x); if (mpz_prevprime(p, p) == 0) - fail("There is no prime number before %r", Int$as_text(&x, false, &$Int)); + fail("There is no prime number before %k", (Text_t[1]){Int$as_text(&x, false, &$Int)}); return Int$from_mpz(p); } @@ -373,13 +403,11 @@ public const TypeInfo $Int = { #define DEFINE_INT_TYPE(c_type, KindOfInt, fmt, min_val, max_val)\ - public CORD KindOfInt ## $as_text(const c_type *i, bool colorize, const TypeInfo *type) { \ + public Text_t KindOfInt ## $as_text(const c_type *i, bool colorize, const TypeInfo *type) { \ (void)type; \ - if (!i) return #KindOfInt; \ - CORD c; \ - if (colorize) CORD_sprintf(&c, "\x1b[35m%"fmt"\x1b[33;2m\x1b[m", *i); \ - else CORD_sprintf(&c, "%"fmt, *i); \ - return c; \ + if (!i) return Text$from_str(#KindOfInt); \ + Int_t as_int = KindOfInt##_to_Int(*i); \ + return Int$as_text(&as_int, colorize, type); \ } \ public int32_t KindOfInt ## $compare(const c_type *x, const c_type *y, const TypeInfo *type) { \ (void)type; \ @@ -389,19 +417,17 @@ public const TypeInfo $Int = { (void)type; \ return *x == *y; \ } \ - public CORD KindOfInt ## $format(c_type i, Int_t digits_int) { \ - int64_t digits = Int_to_Int64(digits_int, false); \ - return CORD_asprintf("%0*ld", (int)digits, (int64_t)i); \ + public Text_t KindOfInt ## $format(c_type i, Int_t digits_int) { \ + Int_t as_int = KindOfInt##_to_Int(i); \ + return Int$format(as_int, digits_int); \ } \ - public CORD KindOfInt ## $hex(c_type i, Int_t digits_int, bool uppercase, bool prefix) { \ - int64_t digits = Int_to_Int64(digits_int, false); \ - const char *hex_fmt = uppercase ? 
(prefix ? "0x%0.*lX" : "%0.*lX") : (prefix ? "0x%0.*lx" : "%0.*lx"); \ - return CORD_asprintf(hex_fmt, (int)digits, (uint64_t)i); \ + public Text_t KindOfInt ## $hex(c_type i, Int_t digits_int, bool uppercase, bool prefix) { \ + Int_t as_int = KindOfInt##_to_Int(i); \ + return Int$hex(as_int, digits_int, uppercase, prefix); \ } \ - public CORD KindOfInt ## $octal(c_type i, Int_t digits_int, bool prefix) { \ - int64_t digits = Int_to_Int64(digits_int, false); \ - const char *octal_fmt = prefix ? "0o%0.*lo" : "%0.*lo"; \ - return CORD_asprintf(octal_fmt, (int)digits, (uint64_t)i); \ + public Text_t KindOfInt ## $octal(c_type i, Int_t digits_int, bool prefix) { \ + Int_t as_int = KindOfInt##_to_Int(i); \ + return Int$octal(as_int, digits_int, prefix); \ } \ public array_t KindOfInt ## $bits(c_type x) { \ array_t bit_array = (array_t){.data=GC_MALLOC_ATOMIC(sizeof(bool[8*sizeof(c_type)])), .atomic=1, .stride=sizeof(bool), .length=8*sizeof(c_type)}; \ @@ -432,8 +458,8 @@ public const TypeInfo $Int = { public Range_t KindOfInt ## $to(c_type from, c_type to) { \ return (Range_t){Int64_to_Int(from), Int64_to_Int(to), to >= from ? 
(Int_t){.small=(1<<2)&1} : (Int_t){.small=(1<<2)&1}}; \ } \ - public c_type KindOfInt ## $from_text(CORD text, CORD *the_rest) { \ - const char *str = CORD_to_const_char_star(text); \ + public c_type KindOfInt ## $from_text(Text_t text, Text_t *the_rest) { \ + const char *str = Text$as_c_string(text); \ long i; \ char *end_ptr = NULL; \ if (strncmp(str, "0x", 2) == 0) { \ @@ -445,7 +471,7 @@ public const TypeInfo $Int = { } else { \ i = strtol(str, &end_ptr, 10); \ } \ - if (the_rest) *the_rest = CORD_from_char_star(end_ptr); \ + if (the_rest) *the_rest = Text$from_str(end_ptr); \ if (i < min_val) i = min_val; \ else if (i > max_val) i = min_val; \ return (c_type)i; \ diff --git a/builtins/integers.h b/builtins/integers.h index e5a662cc..359b1d57 100644 --- a/builtins/integers.h +++ b/builtins/integers.h @@ -24,16 +24,16 @@ #define I8(x) ((int8_t)x) #define DEFINE_INT_TYPE(c_type, type_name) \ - CORD type_name ## $as_text(const c_type *i, bool colorize, const TypeInfo *type); \ + Text_t type_name ## $as_text(const c_type *i, bool colorize, const TypeInfo *type); \ int32_t type_name ## $compare(const c_type *x, const c_type *y, const TypeInfo *type); \ bool type_name ## $equal(const c_type *x, const c_type *y, const TypeInfo *type); \ - CORD type_name ## $format(c_type i, Int_t digits); \ - CORD type_name ## $hex(c_type i, Int_t digits, bool uppercase, bool prefix); \ - CORD type_name ## $octal(c_type i, Int_t digits, bool prefix); \ + Text_t type_name ## $format(c_type i, Int_t digits); \ + Text_t type_name ## $hex(c_type i, Int_t digits, bool uppercase, bool prefix); \ + Text_t type_name ## $octal(c_type i, Int_t digits, bool prefix); \ array_t type_name ## $bits(c_type x); \ c_type type_name ## $random(c_type min, c_type max); \ Range_t type_name ## $to(c_type from, c_type to); \ - c_type type_name ## $from_text(CORD text, CORD *the_rest); \ + c_type type_name ## $from_text(Text_t text, Text_t *the_rest); \ static inline c_type type_name ## $clamped(c_type x, 
c_type min, c_type max) { \ return x < min ? min : (x > max ? max : x); \ } \ @@ -70,19 +70,19 @@ DEFINE_INT_TYPE(int8_t, Int8); #define Int16$abs(...) I16(abs(__VA_ARGS__)) #define Int8$abs(...) I8(abs(__VA_ARGS__)) -CORD Int$as_text(const Int_t *i, bool colorize, const TypeInfo *type); -uint32_t Int$hash(const Int_t *x, const TypeInfo *type); +Text_t Int$as_text(const Int_t *i, bool colorize, const TypeInfo *type); +uint64_t Int$hash(const Int_t *x, const TypeInfo *type); int32_t Int$compare(const Int_t *x, const Int_t *y, const TypeInfo *type); int32_t Int$compare_value(const Int_t x, const Int_t y); bool Int$equal(const Int_t *x, const Int_t *y, const TypeInfo *type); bool Int$equal_value(const Int_t x, const Int_t y); -CORD Int$format(Int_t i, Int_t digits); -CORD Int$hex(Int_t i, Int_t digits, bool uppercase, bool prefix); -CORD Int$octal(Int_t i, Int_t digits, bool prefix); +Text_t Int$format(Int_t i, Int_t digits); +Text_t Int$hex(Int_t i, Int_t digits, bool uppercase, bool prefix); +Text_t Int$octal(Int_t i, Int_t digits, bool prefix); void Int$init_random(long seed); Int_t Int$random(Int_t min, Int_t max); Range_t Int$to(Int_t from, Int_t to); -Int_t Int$from_text(CORD text, bool *success); +Int_t Int$from_text(Text_t text, bool *success); Int_t Int$abs(Int_t x); Int_t Int$power(Int_t base, Int_t exponent); Int_t Int$sqrt(Int_t i); diff --git a/builtins/memory.c b/builtins/memory.c index 5b9f39ad..4e8e4c50 100644 --- a/builtins/memory.c +++ b/builtins/memory.c @@ -1,6 +1,5 @@ // Type info and methods for "Memory" opaque type #include <gc.h> -#include <gc/cord.h> #include <stdbool.h> #include <stdint.h> #include <stdlib.h> @@ -8,17 +7,16 @@ #include <sys/param.h> #include <err.h> -#include "util.h" #include "halfsiphash.h" #include "memory.h" +#include "text.h" #include "types.h" +#include "util.h" -public CORD Memory__as_text(const void *p, bool colorize, const TypeInfo *type) { +public Text_t Memory__as_text(const void *p, bool colorize, const TypeInfo 
*type) { (void)type; - if (!p) return "Memory"; - CORD cord; - CORD_sprintf(&cord, colorize ? "\x1b[0;34;1mMemory<%p>\x1b[m" : "Memory<%p>", p); - return cord; + if (!p) return Text$from_str("Memory"); + return Text$format(colorize ? "\x1b[0;34;1mMemory<%p>\x1b[m" : "Memory<%p>", p); } public const TypeInfo $Memory = { diff --git a/builtins/memory.h b/builtins/memory.h index 48a2dafd..e3cb2983 100644 --- a/builtins/memory.h +++ b/builtins/memory.h @@ -9,6 +9,6 @@ #include "types.h" extern const TypeInfo $Memory; -CORD Memory$as_text(const void *p, bool colorize, const TypeInfo *type); +Text_t Memory$as_text(const void *p, bool colorize, const TypeInfo *type); // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/nums.c b/builtins/nums.c index 6b4f6a8a..5848a589 100644 --- a/builtins/nums.c +++ b/builtins/nums.c @@ -11,15 +11,13 @@ #include "array.h" #include "nums.h" #include "string.h" +#include "text.h" #include "types.h" -public CORD Num$as_text(const double *f, bool colorize, const TypeInfo *type) { +public Text_t Num$as_text(const double *f, bool colorize, const TypeInfo *type) { (void)type; - if (!f) return "Num"; - CORD c; - if (colorize) CORD_sprintf(&c, "\x1b[35m%.16g\x1b[33;2m\x1b[m", *f); - else CORD_sprintf(&c, "%.16g", *f); - return c; + if (!f) return Text$from_str("Num"); + return Text$format(colorize ? 
"\x1b[35m%.16g\x1b[33;2m\x1b[m" : "%.16g", *f); } public int32_t Num$compare(const double *x, const double *y, const TypeInfo *type) { @@ -47,12 +45,12 @@ public bool Num$near(double a, double b, double ratio, double absolute) { return (diff < epsilon); } -public CORD Num$format(double f, Int_t precision) { - return CORD_asprintf("%.*f", (int)Int_to_Int64(precision, false), f); +public Text_t Num$format(double f, Int_t precision) { + return Text$format("%.*f", (int)Int_to_Int64(precision, false), f); } -public CORD Num$scientific(double f, Int_t precision) { - return CORD_asprintf("%.*e", (int)Int_to_Int64(precision, false), f); +public Text_t Num$scientific(double f, Int_t precision) { + return Text$format("%.*e", (int)Int_to_Int64(precision, false), f); } public double Num$mod(double num, double modulus) { @@ -68,16 +66,16 @@ public double Num$mix(double amount, double x, double y) { return (1.0-amount)*x + amount*y; } -public double Num$from_text(CORD text, CORD *the_rest) { - const char *str = CORD_to_const_char_star(text); +public double Num$from_text(Text_t text, Text_t *the_rest) { + const char *str = Text$as_c_string(text); char *end = NULL; double d = strtod(str, &end); - if (the_rest) *the_rest = CORD_from_char_star(end); + if (the_rest) *the_rest = Text$from_str(end); return d; } -public double Num$nan(CORD tag) { - return nan(CORD_to_const_char_star(tag)); +public double Num$nan(Text_t tag) { + return nan(Text$as_c_string(tag)); } public bool Num$isinf(double n) { return !!isinf(n); } @@ -95,13 +93,10 @@ public const TypeInfo $Num = { }, }; -public CORD Num32$as_text(const float *f, bool colorize, const TypeInfo *type) { +public Text_t Num32$as_text(const float *f, bool colorize, const TypeInfo *type) { (void)type; - if (!f) return "Num32"; - CORD c; - if (colorize) CORD_sprintf(&c, "\x1b[35m%.8g_f32\x1b[m", *f); - else CORD_sprintf(&c, "%.8g_f32", *f); - return c; + if (!f) return Text$from_str("Num32"); + return Text$format(colorize ? 
"\x1b[35m%.8g_f32\x1b[33;2m\x1b[m" : "%.8g_f32", *f); } public int32_t Num32$compare(const float *x, const float *y, const TypeInfo *type) { @@ -129,12 +124,12 @@ public bool Num32$near(float a, float b, float ratio, float absolute) { return (diff < epsilon); } -public CORD Num32$format(float f, Int_t precision) { - return CORD_asprintf("%.*f", (int)Int_to_Int64(precision, false), f); +public Text_t Num32$format(float f, Int_t precision) { + return Text$format("%.*f", (int)Int_to_Int64(precision, false), f); } -public CORD Num32$scientific(float f, Int_t precision) { - return CORD_asprintf("%.*e", (int)Int_to_Int64(precision, false), f); +public Text_t Num32$scientific(float f, Int_t precision) { + return Text$format("%.*e", (int)Int_to_Int64(precision, false), f); } public float Num32$mod(float num, float modulus) { @@ -150,16 +145,16 @@ public float Num32$mix(float amount, float x, float y) { return (1.0-amount)*x + amount*y; } -public float Num32$from_text(CORD text, CORD *the_rest) { - const char *str = CORD_to_const_char_star(text); +public float Num32$from_text(Text_t text, Text_t *the_rest) { + const char *str = Text$as_c_string(text); char *end = NULL; double d = strtod(str, &end); - if (the_rest) *the_rest = CORD_from_char_star(end); + if (the_rest) *the_rest = Text$from_str(end); return (float)d; } -public float Num32$nan(CORD tag) { - return nanf(CORD_to_const_char_star(tag)); +public float Num32$nan(Text_t tag) { + return nanf(Text$as_c_string(tag)); } public bool Num32$isinf(float n) { return isinf(n); } diff --git a/builtins/nums.h b/builtins/nums.h index 94b11055..c5562f0a 100644 --- a/builtins/nums.h +++ b/builtins/nums.h @@ -14,39 +14,39 @@ #define N32(n) ((float)n) #define N64(n) ((double)n) -CORD Num$as_text(const double *f, bool colorize, const TypeInfo *type); +Text_t Num$as_text(const double *f, bool colorize, const TypeInfo *type); int32_t Num$compare(const double *x, const double *y, const TypeInfo *type); bool Num$equal(const double *x, 
const double *y, const TypeInfo *type); bool Num$near(double a, double b, double ratio, double absolute); -CORD Num$format(double f, Int_t precision); -CORD Num$scientific(double f, Int_t precision); +Text_t Num$format(double f, Int_t precision); +Text_t Num$scientific(double f, Int_t precision); double Num$mod(double num, double modulus); bool Num$isinf(double n); bool Num$finite(double n); bool Num$isnan(double n); -double Num$nan(CORD tag); +double Num$nan(Text_t tag); double Num$random(void); double Num$mix(double amount, double x, double y); -double Num$from_text(CORD text, CORD *the_rest); +double Num$from_text(Text_t text, Text_t *the_rest); static inline double Num$clamped(double x, double low, double high) { return (x <= low) ? low : (x >= high ? high : x); } extern const TypeInfo $Num; -CORD Num32$as_text(const float *f, bool colorize, const TypeInfo *type); +Text_t Num32$as_text(const float *f, bool colorize, const TypeInfo *type); int32_t Num32$compare(const float *x, const float *y, const TypeInfo *type); bool Num32$equal(const float *x, const float *y, const TypeInfo *type); bool Num32$near(float a, float b, float ratio, float absolute); -CORD Num32$format(float f, Int_t precision); -CORD Num32$scientific(float f, Int_t precision); +Text_t Num32$format(float f, Int_t precision); +Text_t Num32$scientific(float f, Int_t precision); float Num32$mod(float num, float modulus); bool Num32$isinf(float n); bool Num32$finite(float n); bool Num32$isnan(float n); float Num32$random(void); float Num32$mix(float amount, float x, float y); -float Num32$from_text(CORD text, CORD *the_rest); -float Num32$nan(CORD tag); +float Num32$from_text(Text_t text, Text_t *the_rest); +float Num32$nan(Text_t tag); static inline float Num32$clamped(float x, float low, float high) { return (x <= low) ? low : (x >= high ? 
high : x); } diff --git a/builtins/pointer.c b/builtins/pointer.c index 73bd41be..41f4a2a1 100644 --- a/builtins/pointer.c +++ b/builtins/pointer.c @@ -8,27 +8,39 @@ #include <stdlib.h> #include <sys/param.h> -#include "util.h" #include "functions.h" #include "halfsiphash.h" +#include "text.h" #include "types.h" +#include "util.h" typedef struct recursion_s { const void *ptr; struct recursion_s *next; } recursion_t; -public CORD Pointer$as_text(const void *x, bool colorize, const TypeInfo *type) { +public Text_t Pointer$as_text(const void *x, bool colorize, const TypeInfo *type) { auto ptr_info = type->PointerInfo; if (!x) { - CORD typename = generic_as_text(NULL, false, ptr_info.pointed); - CORD c = colorize ? CORD_asprintf("\x1b[34;1m%s%s\x1b[m", ptr_info.sigil, typename) : CORD_cat(ptr_info.sigil, typename); - return ptr_info.is_optional ? CORD_cat(c, "?") : c; + Text_t typename = generic_as_text(NULL, false, ptr_info.pointed); + Text_t text; + if (colorize) + text = Text$concat(Text$from_str("\x1b[34;1m"), Text$from_str(ptr_info.sigil), typename, Text$from_str("\x1b[m")); + else + text = Text$concat(Text$from_str(ptr_info.sigil), typename); + + if (ptr_info.is_optional) + text = Text$concat(text, Text$from_str("?")); + + return text; } const void *ptr = *(const void**)x; if (!ptr) { - CORD typename = generic_as_text(NULL, false, ptr_info.pointed); - return colorize ? CORD_asprintf("\x1b[34;1m!%s\x1b[m", typename) : CORD_cat("!", typename); + Text_t typename = generic_as_text(NULL, false, ptr_info.pointed); + if (colorize) + return Text$concat(Text$from_str("\x1b[34;1m!"), typename, Text$from_str("\x1b[m")); + else + return Text$concat(Text$from_str("!"), typename); } // Check for recursive references, so if `x.foo = x`, then it prints as @@ -38,22 +50,34 @@ public CORD Pointer$as_text(const void *x, bool colorize, const TypeInfo *type) for (recursion_t *r = recursion; r; r = r->next) { ++depth; if (r->ptr == ptr) { - CORD c = CORD_asprintf(colorize ? 
"\x1b[34;1m%s..%d\x1b[m" : "%s..%d", ptr_info.sigil, depth); - if (ptr_info.is_optional) c = CORD_cat(c, colorize ? "\x1b[34;1m?\x1b[m" : "?"); - return c; + Text_t text = Text$concat( + Text$from_str(colorize ? "\x1b[34;1m" : ""), + Text$from_str(ptr_info.sigil), + Text$from_str(".."), + Int32$as_text(&depth, false, &$Int32), + Text$from_str(colorize ? "\x1b[m" : "")); + if (ptr_info.is_optional) + text = Text$concat(text, Text$from_str(colorize ? "\x1b[34;1m?\x1b[m" : "?")); + return text; } } - CORD pointed; + Text_t pointed; { // Stringify with this pointer flagged as a recursive one: recursion_t my_recursion = {.ptr=ptr, .next=recursion}; recursion = &my_recursion; pointed = generic_as_text(ptr, colorize, ptr_info.pointed); recursion = recursion->next; } - CORD c = colorize ? CORD_asprintf("\x1b[34;1m%s\x1b[m%r", ptr_info.sigil, pointed) : CORD_cat(ptr_info.sigil, pointed); - if (ptr_info.is_optional) c = CORD_cat(c, colorize ? "\x1b[34;1m?\x1b[m" : "?"); - return c; + Text_t text; + if (colorize) + text = Text$concat(Text$from_str("\x1b[34;1m"), Text$from_str(ptr_info.sigil), Text$from_str("\x1b[m"), pointed); + else + text = Text$concat(Text$from_str(ptr_info.sigil), pointed); + + if (ptr_info.is_optional) + text = Text$concat(text, Text$from_str("?")); + return text; } public int32_t Pointer$compare(const void *x, const void *y, const TypeInfo *type) { @@ -68,11 +92,4 @@ public bool Pointer$equal(const void *x, const void *y, const TypeInfo *type) { return xp == yp; } -public uint32_t Pointer$hash(const void *x, const TypeInfo *type) { - (void)type; - uint32_t hash; - halfsiphash(x, sizeof(void*), TOMO_HASH_KEY, (uint8_t*)&hash, sizeof(hash)); - return hash; -} - // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/pointer.h b/builtins/pointer.h index 538960b3..7748da4b 100644 --- a/builtins/pointer.h +++ b/builtins/pointer.h @@ -8,10 +8,9 @@ #include "types.h" -CORD Pointer$as_text(const void *x, bool colorize, const TypeInfo *type); +Text_t 
Pointer$as_text(const void *x, bool colorize, const TypeInfo *type); int32_t Pointer$compare(const void *x, const void *y, const TypeInfo *type); bool Pointer$equal(const void *x, const void *y, const TypeInfo *type); -uint32_t Pointer$hash(const void *x, const TypeInfo *type); #define Null(t) (t*)NULL #define POINTER_TYPE(_sigil, _pointed) (&(TypeInfo){\ diff --git a/builtins/range.c b/builtins/range.c index 840397b9..9b5af8cd 100644 --- a/builtins/range.c +++ b/builtins/range.c @@ -4,15 +4,15 @@ #include <err.h> #include <gmp.h> #include <gc.h> -#include <gc/cord.h> #include <math.h> #include <stdbool.h> #include <stdint.h> #include <stdlib.h> #include <sys/param.h> -#include "types.h" #include "integers.h" +#include "text.h" +#include "types.h" #include "util.h" @@ -32,15 +32,15 @@ static bool Range$equal(const Range_t *x, const Range_t *y, const TypeInfo *type return Int$equal(&x->first, &y->first, &$Int) && Int$equal(&x->last, &y->last, &$Int) && Int$equal(&x->step, &y->step, &$Int); } -static CORD Range$as_text(const Range_t *r, bool use_color, const TypeInfo *type) +static Text_t Range$as_text(const Range_t *r, bool use_color, const TypeInfo *type) { (void)type; - if (!r) return "Range"; + if (!r) return Text$from_str("Range"); - return CORD_asprintf(use_color ? "\x1b[0;1mRange\x1b[m(first=%r, last=%r, step=%r)" - : "Range(first=%r, last=%r, step=%r)", - Int$as_text(&r->first, use_color, &$Int), Int$as_text(&r->last, use_color, &$Int), - Int$as_text(&r->step, use_color, &$Int)); + return Text$format(use_color ? 
"\x1b[0;1mRange\x1b[m(first=%r, last=%r, step=%r)" + : "Range(first=%r, last=%r, step=%r)", + Int$as_text(&r->first, use_color, &$Int), Int$as_text(&r->last, use_color, &$Int), + Int$as_text(&r->step, use_color, &$Int)); } public Range_t Range$reversed(Range_t r) diff --git a/builtins/table.c b/builtins/table.c index 8de6532c..9bc3ded1 100644 --- a/builtins/table.c +++ b/builtins/table.c @@ -16,14 +16,15 @@ #include <string.h> #include <sys/param.h> -#include "util.h" #include "array.h" +#include "c_string.h" #include "datatypes.h" #include "halfsiphash.h" #include "memory.h" #include "table.h" #include "text.h" #include "types.h" +#include "util.h" // #define DEBUG_TABLES @@ -51,11 +52,11 @@ static const TypeInfo MemoryPointer = { }, }; -const TypeInfo StrToVoidStarTable = { +const TypeInfo CStrToVoidStarTable = { .size=sizeof(table_t), .align=__alignof__(table_t), .tag=TableInfo, - .TableInfo={.key=&$Text, .value=&MemoryPointer}, + .TableInfo={.key=&$CString, .value=&MemoryPointer}, }; static inline size_t entry_size(const TypeInfo *info) @@ -450,36 +451,43 @@ public uint32_t Table$hash(const table_t *t, const TypeInfo *type) return hash; } -public CORD Table$as_text(const table_t *t, bool colorize, const TypeInfo *type) +public Text_t Table$as_text(const table_t *t, bool colorize, const TypeInfo *type) { assert(type->tag == TableInfo); auto table = type->TableInfo; if (!t) { if (table.value != &$Void) - return CORD_all("{", generic_as_text(NULL, false, table.key), ":", generic_as_text(NULL, false, table.value), "}"); + return Text$concat( + Text$from_str("{"), + generic_as_text(NULL, false, table.key), + Text$from_str(":"), + generic_as_text(NULL, false, table.value), + Text$from_str("}")); else - return CORD_all("{", generic_as_text(NULL, false, table.key), "}"); + return Text$concat( + Text$from_str("{"), + generic_as_text(NULL, false, table.key), + Text$from_str("}")); } int64_t val_off = value_offset(type); - CORD c = "{"; + Text_t text = Text$from_str("{"); 
for (int64_t i = 0, length = Table$length(*t); i < length; i++) { if (i > 0) - c = CORD_cat(c, ", "); + text = Text$concat(text, Text$from_str(", ")); void *entry = GET_ENTRY(*t, i); - c = CORD_cat(c, generic_as_text(entry, colorize, table.key)); + text = Text$concat(text, generic_as_text(entry, colorize, table.key)); if (table.value != &$Void) - c = CORD_all(c, ":", generic_as_text(entry + val_off, colorize, table.value)); + text = Text$concat(text, Text$from_str(":"), generic_as_text(entry + val_off, colorize, table.value)); } if (t->fallback) { - c = CORD_cat(c, "; fallback="); - c = CORD_cat(c, Table$as_text(t->fallback, colorize, type)); + text = Text$concat(text, Text$from_str("; fallback="), Table$as_text(t->fallback, colorize, type)); } - c = CORD_cat(c, "}"); - return c; + text = Text$concat(text, Text$from_str("}")); + return text; } public table_t Table$from_entries(array_t entries, const TypeInfo *type) @@ -592,29 +600,29 @@ public bool Table$is_superset_of(table_t a, table_t b, bool strict, const TypeIn public void *Table$str_get(table_t t, const char *key) { - void **ret = Table$get(t, &key, &StrToVoidStarTable); + void **ret = Table$get(t, &key, &CStrToVoidStarTable); return ret ? *ret : NULL; } public void *Table$str_get_raw(table_t t, const char *key) { - void **ret = Table$get_raw(t, &key, &StrToVoidStarTable); + void **ret = Table$get_raw(t, &key, &CStrToVoidStarTable); return ret ? 
*ret : NULL; } public void *Table$str_reserve(table_t *t, const char *key, const void *value) { - return Table$reserve(t, &key, &value, &StrToVoidStarTable); + return Table$reserve(t, &key, &value, &CStrToVoidStarTable); } public void Table$str_set(table_t *t, const char *key, const void *value) { - Table$set(t, &key, &value, &StrToVoidStarTable); + Table$set(t, &key, &value, &CStrToVoidStarTable); } public void Table$str_remove(table_t *t, const char *key) { - return Table$remove(t, &key, &StrToVoidStarTable); + return Table$remove(t, &key, &CStrToVoidStarTable); } public void *Table$str_entry(table_t t, int64_t n) diff --git a/builtins/table.h b/builtins/table.h index 0ff4cb91..da60b3be 100644 --- a/builtins/table.h +++ b/builtins/table.h @@ -74,7 +74,7 @@ void Table$mark_copy_on_write(table_t *t); int32_t Table$compare(const table_t *x, const table_t *y, const TypeInfo *type); bool Table$equal(const table_t *x, const table_t *y, const TypeInfo *type); uint32_t Table$hash(const table_t *t, const TypeInfo *type); -CORD Table$as_text(const table_t *t, bool colorize, const TypeInfo *type); +Text_t Table$as_text(const table_t *t, bool colorize, const TypeInfo *type); void *Table$str_entry(table_t t, int64_t n); void *Table$str_get(table_t t, const char *key); @@ -85,6 +85,6 @@ void Table$str_remove(table_t *t, const char *key); #define Table$length(t) ((t).entries.length) -extern const TypeInfo StrToVoidStarTable; +extern const TypeInfo CStrToVoidStarTable; // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1 diff --git a/builtins/text.c b/builtins/text.c index 966018f1..ff709e02 100644 --- a/builtins/text.c +++ b/builtins/text.c @@ -1,417 +1,1454 @@ // Type info and methods for Text datatype, which uses the Boehm "cord" library // and libunistr + #include <assert.h> #include <ctype.h> #include <err.h> #include <gc.h> -#include <gc/cord.h> #include <gmp.h> #include <limits.h> +#include <printf.h> #include <readline/history.h> #include <readline/readline.h> #include <stdbool.h> 
#include <stdint.h> #include <stdlib.h> #include <sys/param.h> + #include <unicase.h> +#include <unictype.h> #include <unigbrk.h> #include <uniname.h> #include <uninorm.h> +#include <unistd.h> +#include <unistdio.h> #include <unistr.h> #include "array.h" #include "functions.h" -#include "halfsiphash.h" #include "integers.h" #include "text.h" #include "types.h" -#define CLAMP(x, lo, hi) MIN(hi, MAX(x,lo)) +static struct { + size_t num_codepoints; + const uint32_t *codepoints; +} synthetic_graphemes[1024] = {}; + +static int32_t num_synthetic_graphemes = 0; + +static int32_t get_grapheme(Text_t text, int64_t index); + +typedef struct { + int64_t subtext, sum_of_previous_subtexts; +} iteration_state_t; -static inline uint8_t *_normalize(CORD str, uint8_t *buf, size_t *len) +static int32_t _next_grapheme(Text_t text, iteration_state_t *state, int64_t index); + +int32_t find_synthetic_grapheme(const uint32_t *codepoints, size_t len) { - const uint8_t *str_u8 = (const uint8_t*)CORD_to_const_char_star(str); - uint8_t *normalized = u8_normalize(UNINORM_NFD, str_u8, strlen((char*)str_u8)+1, buf, len); - if (!normalized) errx(1, "Unicode normalization error!"); - return normalized; + int32_t lo = 0, hi = num_synthetic_graphemes; + while (lo <= hi) { + int32_t mid = (lo + hi) / 2; + int32_t cmp = (synthetic_graphemes[mid].num_codepoints > len) - (synthetic_graphemes[mid].num_codepoints < len); + if (cmp == 0) + cmp = memcmp(synthetic_graphemes[mid].codepoints, codepoints, sizeof(uint32_t[len])); + + if (cmp == 0) + return mid; + else if (cmp < 0) + lo = mid + 1; + else if (cmp > 0) + hi = mid - 1; + } + return hi; } -public CORD Text$as_text(const void *text, bool colorize, const TypeInfo *info) +int32_t get_synthetic_grapheme(const uint32_t *codepoints, size_t len) { - if (!text) return info->TextInfo.lang; - CORD ret = Text$quoted(*(CORD*)text, colorize); - if (!streq(info->TextInfo.lang, "Text")) - ret = colorize ? 
CORD_all("\x1b[1m$", info->TextInfo.lang, "\x1b[m", ret) : CORD_all("$", info->TextInfo.lang, ret); - return ret; + int32_t index = find_synthetic_grapheme(codepoints, len); + if (index < num_synthetic_graphemes + && synthetic_graphemes[index].num_codepoints == len + && memcmp(synthetic_graphemes[index].codepoints, codepoints, len) == 0) { + return -(index+1); + } else { + if (num_synthetic_graphemes > 0) + memmove(&synthetic_graphemes[index], &synthetic_graphemes[index + 1], num_synthetic_graphemes - index); + + uint32_t *buf = GC_MALLOC_ATOMIC(sizeof(uint32_t[len])); + memcpy(buf, codepoints, sizeof(uint32_t[len])); + synthetic_graphemes[index].codepoints = buf; + synthetic_graphemes[index].num_codepoints = len; + + ++num_synthetic_graphemes; + return -(index+1); + } } -public CORD Text$quoted(CORD str, bool colorize) -{ - // Note: it's important to have unicode strings not get broken up with - // escapes, otherwise they won't print right. - if (colorize) { - CORD quoted = "\x1b[35m\""; - CORD_pos i; - CORD_FOR(i, str) { - char c = CORD_pos_fetch(i); - switch (c) { -#define BACKSLASHED(esc) "\x1b[34m\\\x1b[1m" esc "\x1b[0;35m" - case '\a': quoted = CORD_cat(quoted, BACKSLASHED("a")); break; - case '\b': quoted = CORD_cat(quoted, BACKSLASHED("b")); break; - case '\x1b': quoted = CORD_cat(quoted, BACKSLASHED("e")); break; - case '\f': quoted = CORD_cat(quoted, BACKSLASHED("f")); break; - case '\n': quoted = CORD_cat(quoted, BACKSLASHED("n")); break; - case '\r': quoted = CORD_cat(quoted, BACKSLASHED("r")); break; - case '\t': quoted = CORD_cat(quoted, BACKSLASHED("t")); break; - case '\v': quoted = CORD_cat(quoted, BACKSLASHED("v")); break; - case '"': quoted = CORD_cat(quoted, BACKSLASHED("\"")); break; - case '\\': quoted = CORD_cat(quoted, BACKSLASHED("\\")); break; - case '\x00' ... '\x06': case '\x0E' ... '\x1A': - case '\x1C' ... '\x1F': case '\x7F' ... 
'\x7F': - CORD_sprintf(&quoted, "%r" BACKSLASHED("x%02X"), quoted, c); - break; - default: quoted = CORD_cat_char(quoted, c); break; -#undef BACKSLASHED +static inline size_t num_subtexts(Text_t t) +{ + if (t.tag != TEXT_SUBTEXT) return 1; + size_t len = t.length; + size_t n = 0; + while (len > 0) { + len -= t.subtexts[n].length; + ++n; + } + return n; +} + +int text_visualize(FILE *stream, Text_t t) +{ + switch (t.tag) { + case TEXT_SHORT_ASCII: return fprintf(stream, "<ascii length=%ld>%.*s</ascii>", t.length, t.length, t.short_ascii); + case TEXT_ASCII: return fprintf(stream, "<ascii length=%ld>%.*s</ascii>", t.length, t.length, t.ascii); + case TEXT_GRAPHEMES: case TEXT_SHORT_GRAPHEMES: { + int printed = fprintf(stream, "<graphemes length=%ld>", t.length); + printed += Text$print(stream, t); + printed += fprintf(stream, "</graphemes>"); + return printed; + } + case TEXT_SUBTEXT: { + int printed = fprintf(stream, "<text length=%ld>", t.length); + size_t to_print = t.length; + for (int i = 0; to_print > 0; ++i) { + printed += fprintf(stream, "\n "); + printed += text_visualize(stream, t.subtexts[i]); + to_print -= t.subtexts[i].length; + if (t.subtexts[i].length == 0) break; + } + printed += fprintf(stream, "\n</text>"); + return printed; + } + default: return 0; + } +} + +public int Text$print(FILE *stream, Text_t t) +{ + switch (t.tag) { + case TEXT_SHORT_ASCII: return fwrite(t.short_ascii, sizeof(char), t.length, stream); + case TEXT_ASCII: return fwrite(t.ascii, sizeof(char), t.length, stream); + case TEXT_GRAPHEMES: case TEXT_SHORT_GRAPHEMES: { + int32_t *graphemes = t.tag == TEXT_SHORT_GRAPHEMES ? 
t.short_graphemes : t.graphemes; + int written = 0; + for (int64_t i = 0; i < t.length; i++) { + int32_t grapheme = graphemes[i]; + if (grapheme >= 0) { + written += ulc_fprintf(stream, "%.*llU", 1, &grapheme); + } else { + written += ulc_fprintf( + stream, "%.*llU", + synthetic_graphemes[-grapheme-1].num_codepoints, + synthetic_graphemes[-grapheme-1].codepoints); } } - quoted = CORD_cat(quoted, "\"\x1b[m"); - return quoted; + return written; + } + case TEXT_SUBTEXT: { + int written = 0; + int i = 0; + for (size_t to_print = t.length; to_print > 0; to_print -= t.subtexts[i].length, ++i) + written += Text$print(stream, t.subtexts[i]); + return written; + } + default: return 0; + } +} + +static Text_t concat2(Text_t a, Text_t b) +{ + if (a.length == 0) return b; + if (b.length == 0) return a; + + if (a.tag == TEXT_SUBTEXT && b.tag == TEXT_SUBTEXT) { + size_t na = num_subtexts(a); + size_t nb = num_subtexts(b); + Text_t ret = { + .length=a.length + b.length, + .tag=TEXT_SUBTEXT, + .subtexts=GC_MALLOC(sizeof(Text_t[na + nb])), + }; + memcpy(&ret.subtexts[0], a.subtexts, sizeof(Text_t[na])); + memcpy(&ret.subtexts[na], b.subtexts, sizeof(Text_t[nb])); + return ret; + } else if (a.tag == TEXT_SUBTEXT) { + size_t n = num_subtexts(a); + Text_t ret = { + .length=a.length + b.length, + .tag=TEXT_SUBTEXT, + .subtexts=GC_MALLOC(sizeof(Text_t[n + 1])), + }; + memcpy(ret.subtexts, a.subtexts, sizeof(Text_t[n])); + ret.subtexts[n] = b; + return ret; + } else if (b.tag == TEXT_SUBTEXT) { + size_t n = num_subtexts(b); + Text_t ret = { + .length=a.length + b.length, + .tag=TEXT_SUBTEXT, + .subtexts=GC_MALLOC(sizeof(Text_t[n + 1])), + }; + ret.subtexts[0] = a; + memcpy(&ret.subtexts[1], b.subtexts, sizeof(Text_t[n])); + return ret; } else { - CORD quoted = "\""; - CORD_pos i; - CORD_FOR(i, str) { - char c = CORD_pos_fetch(i); - switch (c) { - case '\a': quoted = CORD_cat(quoted, "\\a"); break; - case '\b': quoted = CORD_cat(quoted, "\\b"); break; - case '\x1b': quoted = 
CORD_cat(quoted, "\\e"); break; - case '\f': quoted = CORD_cat(quoted, "\\f"); break; - case '\n': quoted = CORD_cat(quoted, "\\n"); break; - case '\r': quoted = CORD_cat(quoted, "\\r"); break; - case '\t': quoted = CORD_cat(quoted, "\\t"); break; - case '\v': quoted = CORD_cat(quoted, "\\v"); break; - case '"': quoted = CORD_cat(quoted, "\\\""); break; - case '\\': quoted = CORD_cat(quoted, "\\\\"); break; - case '\x00' ... '\x06': case '\x0E' ... '\x1A': - case '\x1C' ... '\x1F': case '\x7F' ... '\x7F': - CORD_sprintf(&quoted, "%r\\x%02X", quoted, c); - break; - default: quoted = CORD_cat_char(quoted, c); break; + Text_t ret = { + .length=a.length + b.length, + .tag=TEXT_SUBTEXT, + .subtexts=GC_MALLOC(sizeof(Text_t[2])), + }; + ret.subtexts[0] = a; + ret.subtexts[1] = b; + return ret; + } +} + +public Text_t Text$_concat(int n, Text_t items[n]) +{ + if (n == 0) return (Text_t){.length=0}; + if (n == 1) return items[0]; + if (n == 2) return concat2(items[0], items[1]); + + int64_t len = 0, subtexts = 0; + for (int i = 0; i < n; i++) { + len += items[i].length; + subtexts += num_subtexts(items[i]); + } + + Text_t ret = { + .length=len, + .tag=TEXT_SUBTEXT, + .subtexts=GC_MALLOC(sizeof(Text_t[len])), + }; + int64_t sub_i = 0; + for (int i = 0; i < n; i++) { + if (items[i].tag == TEXT_SUBTEXT) { + for (int64_t j = 0, remainder = items[i].length; remainder > 0; j++) { + ret.subtexts[sub_i++] = items[i].subtexts[j]; + remainder -= items[i].subtexts[j].length; } + } else { + ret.subtexts[sub_i++] = items[i]; } - quoted = CORD_cat_char(quoted, '"'); - return quoted; } + return ret; } -public int Text$compare(const CORD *x, const CORD *y) +public Text_t Text$slice(Text_t text, Int_t first_int, Int_t last_int) { - uint8_t *xx = (uint8_t*)CORD_to_const_char_star(*x); - uint8_t *yy = (uint8_t*)CORD_to_const_char_star(*y); - int result = 0; - if (u8_normcmp(xx, strlen((char*)xx), yy, strlen((char*)yy), UNINORM_NFD, &result)) - fail("Something went wrong while comparing text"); - 
return result; + int64_t first = Int_to_Int64(first_int, false)-1; + int64_t last = Int_to_Int64(last_int, false)-1; + if (first == 0) errx(1, "Invalid index: 0"); + if (last == 0) return (Text_t){.length=0}; + + if (first < 0) first = text.length + first + 1; + if (last < 0) last = text.length + last + 1; + + if (last > text.length) last = text.length; + + if (first > text.length || last < first) + return (Text_t){.length=0}; + + if (first == 1 && last == text.length) + return text; + + switch (text.tag) { + case TEXT_SHORT_ASCII: { + Text_t ret = text; + ret.length = last - first + 1; + if (first > 1) + memcpy(ret.short_ascii, text.short_ascii + (first-1), ret.length); + return ret; + } + case TEXT_ASCII: { + Text_t ret = { + .tag=TEXT_ASCII, + .length=last - first + 1, + .ascii=text.ascii + (first-1), + }; + return ret; + } + case TEXT_SHORT_GRAPHEMES: { + assert((first == 1 && last == 1) || (first == 2 && last == 2)); + Text_t ret = { + .tag=TEXT_SHORT_GRAPHEMES, + .length=1, + .short_graphemes={text.short_graphemes[first-1]}, + }; + return ret; + } + case TEXT_GRAPHEMES: { + Text_t ret = { + .tag=TEXT_GRAPHEMES, + .length=last - first + 1, + .graphemes=text.graphemes + (first-1), + }; + return ret; + } + case TEXT_SUBTEXT: { + Text_t *subtexts = text.subtexts; + while (first > subtexts[0].length) { + first -= subtexts[0].length; + last -= subtexts[0].length; + ++subtexts; + } + + int64_t needed_len = (last - first) + 1; + int64_t num_subtexts = 0; + for (int64_t included = 0; included < needed_len; ) { + if (included == 0) + included += subtexts[num_subtexts].length - first + 1; + else + included += subtexts[num_subtexts].length; + num_subtexts += 1; + } + if (num_subtexts == 1) + return Text$slice(subtexts[0], Int64_to_Int(first+1), Int64_to_Int(last+1)); + + Text_t ret = { + .length=needed_len, + .tag=TEXT_SUBTEXT, + .subtexts=GC_MALLOC(sizeof(Text_t[num_subtexts])), + }; + for (int64_t i = 0; i < num_subtexts; i++) { + ret.subtexts[i] = 
Text$slice(subtexts[i], Int64_to_Int(first+1), Int64_to_Int(last+1)); + first = 1; + needed_len -= ret.subtexts[i].length; + last = first + needed_len - 1; + } + return ret; + } + default: errx(1, "Invalid tag"); + } } -public bool Text$equal(const CORD *x, const CORD *y) +Text_t text_from_u32(uint32_t *codepoints, size_t num_codepoints, bool normalize) { - return Text$compare(x, y) == 0; + uint32_t norm_buf[128]; + if (normalize) { + size_t norm_length = sizeof(norm_buf)/sizeof(norm_buf[0]); + uint32_t *normalized = u32_normalize(UNINORM_NFC, codepoints, num_codepoints, norm_buf, &norm_length); + codepoints = normalized; + num_codepoints = norm_length; + } + + char breaks[num_codepoints]; + u32_grapheme_breaks(codepoints, num_codepoints, breaks); + + Text_t ret = { + .length=0, + .tag=TEXT_SHORT_GRAPHEMES, + }; + const uint32_t *src = codepoints; + int32_t *dest = &ret.short_graphemes[0]; + while (src != &codepoints[num_codepoints]) { + ++ret.length; + + if (ret.tag == TEXT_SHORT_GRAPHEMES && ret.length > 2) { + int32_t *graphemes = GC_MALLOC_ATOMIC(sizeof(int32_t[num_codepoints])); // May be a slight overallocation + graphemes[0] = ret.short_graphemes[0]; + graphemes[1] = ret.short_graphemes[1]; + ret.tag = TEXT_GRAPHEMES; + ret.graphemes = graphemes; + dest = &graphemes[2]; + } + + const uint32_t *next = u32_grapheme_next(src, &codepoints[num_codepoints]); + if (next == &src[1]) { + *dest = (int32_t)*src; + } else { + // Synthetic grapheme + *dest = get_synthetic_grapheme(src, next-src); + } + ++dest; + src = next; + } + if (normalize && codepoints != norm_buf) free(codepoints); + return ret; } -public uint32_t Text$hash(const CORD *cord) +public Text_t Text$from_str(const char *str) { - if (!*cord) return 0; + size_t ascii_span = 0; + while (str[ascii_span] && isascii(str[ascii_span])) + ascii_span++; - uint8_t buf[128] = {0}; size_t norm_len = sizeof(buf); - uint8_t *normalized = _normalize(*cord, buf, &norm_len); + if (str[ascii_span] == '\0') { // All ASCII 
+ Text_t ret = {.length=ascii_span}; + if (ascii_span <= 8) { + ret.tag = TEXT_SHORT_ASCII; + for (size_t i = 0; i < ascii_span; i++) + ret.short_ascii[i] = str[i]; + } else { + ret.tag = TEXT_ASCII; + ret.ascii = str; + } + return ret; + } else { + uint32_t buf[128]; + size_t length = sizeof(buf)/sizeof(buf[0]); + uint32_t *codepoints = u8_to_u32((uint8_t*)str, ascii_span + strlen(str + ascii_span), buf, &length); + Text_t ret = text_from_u32(codepoints, length, true); + if (codepoints != buf) free(codepoints); + return ret; + } +} + +static void u8_buf_append(Text_t text, char **buf, size_t *capacity, int64_t *i) +{ + switch (text.tag) { + case TEXT_ASCII: case TEXT_SHORT_ASCII: { + if (*i + text.length > (int64_t)*capacity) { + *capacity = *i + text.length; + *buf = GC_REALLOC(*buf, *capacity); + } + + const char *bytes = text.tag == TEXT_ASCII ? text.ascii : text.short_ascii; + memcpy(*buf + *i, bytes, text.length); + *i += text.length; + break; + } + case TEXT_GRAPHEMES: case TEXT_SHORT_GRAPHEMES: { + const int32_t *graphemes = text.tag == TEXT_GRAPHEMES ? text.graphemes : text.short_graphemes; + for (int64_t g = 0; g + 1 < text.length; g++) { + const uint32_t *codepoints = graphemes[g] < 0 ? synthetic_graphemes[-graphemes[g]-1].codepoints : (uint32_t*)&graphemes[g]; + size_t num_codepoints = graphemes[g] < 0 ? 
synthetic_graphemes[-graphemes[g]-1].num_codepoints : 1; + uint8_t u8_buf[64]; + size_t u8_len = sizeof(u8_buf); + uint8_t *u8 = u32_to_u8(codepoints, num_codepoints, u8_buf, &u8_len); + + if (*i + (int64_t)u8_len > (int64_t)*capacity) { + *capacity = *i + u8_len; + *buf = GC_REALLOC(*buf, *capacity); + } - uint32_t hash; - halfsiphash(normalized, norm_len, TOMO_HASH_KEY, (uint8_t*)&hash, sizeof(hash)); - if (normalized != buf) free(normalized); - return hash; + memcpy(*buf + *i, u8, u8_len); + *i += u8_len; + if (u8 != u8_buf) free(u8); + } + break; + } + case TEXT_SUBTEXT: { + for (int64_t s = 0, remaining = text.length; remaining > 0; s++) { + u8_buf_append(text.subtexts[s], buf, capacity, i); + remaining -= text.subtexts[s].length; + } + break; + } + default: break; + } } -public CORD Text$upper(CORD str) +public const char *Text$as_c_string(Text_t text) { - if (!str) return str; - size_t len = strlen(str) + 1; - uint8_t *dest = GC_MALLOC_ATOMIC(len); - dest[len-1] = 0; - return (CORD)u8_toupper((const uint8_t*)str, len-1, uc_locale_language(), NULL, dest, &len); + size_t capacity = text.length; + char *buf = GC_MALLOC_ATOMIC(capacity); + int64_t i = 0; + u8_buf_append(text, &buf, &capacity, &i); + return buf; } -public CORD Text$lower(CORD str) +uint32_t *text_to_u32(Text_t text, size_t *length) { - if (!str) return str; - size_t len = strlen(str) + 1; - uint8_t *dest = GC_MALLOC_ATOMIC(len); - dest[len-1] = 0; - return (CORD)u8_tolower((const uint8_t*)str, len-1, uc_locale_language(), NULL, dest, &len); + // Precalculate size: + size_t len = 0; + if (text.tag == TEXT_ASCII) { + len = text.length; + } else { + iteration_state_t state = {0, 0}; + for (int64_t i = 0; i < text.length; i++) { + int32_t grapheme = _next_grapheme(text, &state, i); + if (grapheme < 0) + len += synthetic_graphemes[-grapheme-1].num_codepoints; + else + len += 1; + } + } + assert(length); + *length = len; + + // Copy over codepoints one grapheme cluster at a time: + uint32_t *ret = 
GC_MALLOC_ATOMIC(sizeof(uint32_t[len])); + uint32_t *dest = ret; + iteration_state_t state = {0, 0}; + for (int64_t i = 0; i < text.length; i++) { + int32_t grapheme = _next_grapheme(text, &state, i); + if (grapheme < 0) { + const uint32_t *codepoints = synthetic_graphemes[-grapheme-1].codepoints; + size_t num_codepoints = synthetic_graphemes[-grapheme-1].num_codepoints; + for (size_t j = 0; j < num_codepoints; j++) + *(dest++) = codepoints[j]; + } else { + *(dest++) = (uint32_t)grapheme; + } + } + return ret; } -public CORD Text$title(CORD str) +#include "siphash.c" + +public uint64_t Text$hash(Text_t *text) { - if (!str) return str; - size_t len = strlen(str) + 1; - uint8_t *dest = GC_MALLOC_ATOMIC(len); - dest[len-1] = 0; - return (CORD)u8_totitle((const uint8_t*)str, len-1, uc_locale_language(), NULL, dest, &len); + if (text->hash != 0) return text->hash; + siphash sh; + siphashinit(&sh, sizeof(int32_t[text->length]), (uint64_t*)TOMO_HASH_KEY); + + union { + int32_t chunks[2]; + uint64_t whole; + } tmp; + switch (text->tag) { + case TEXT_ASCII: case TEXT_SHORT_ASCII: { + const char *bytes = text->tag == TEXT_ASCII ? text->ascii : text->short_ascii; + for (int64_t i = 0; i + 1 < text->length; i++) { + tmp.chunks[0] = (int32_t)bytes[i]; + tmp.chunks[1] = (int32_t)bytes[i+1]; + siphashadd64bits(&sh, tmp.whole); + } + int32_t last = text->length & 0x1 ? (int32_t)bytes[text->length-1] : 0; // Odd number of graphemes + text->hash = siphashfinish_last_part(&sh, (uint64_t)last); + break; + } + case TEXT_GRAPHEMES: { + const int32_t *graphemes = text->graphemes; + for (int64_t i = 0; i + 1 < text->length; i++) { + tmp.chunks[0] = graphemes[i]; + tmp.chunks[1] = graphemes[i]; + siphashadd64bits(&sh, tmp.whole); + } + int32_t last = text->length & 0x1 ? 
graphemes[text->length-1] : 0; // Odd number of graphemes + text->hash = siphashfinish_last_part(&sh, (uint64_t)last); + break; + } + case TEXT_SHORT_GRAPHEMES: { + tmp.chunks[0] = text->short_graphemes[0]; + if (text->length > 1) + tmp.chunks[1] = text->short_graphemes[1]; + text->hash = siphashfinish_last_part(&sh, (uint64_t)tmp.whole); + break; + } + case TEXT_SUBTEXT: { + int32_t leftover = 0; + for (int64_t sub_i = 0, to_hash = text->length; to_hash > 0; ) { + Text_t subtext = text->subtexts[sub_i]; + if (subtext.tag == TEXT_ASCII || subtext.tag == TEXT_SHORT_ASCII) { + const char *bytes = subtext.tag == TEXT_ASCII ? subtext.ascii : subtext.short_ascii; + int64_t grapheme = 0; + if (leftover) { + tmp.chunks[0] = leftover; + tmp.chunks[1] = (int32_t)bytes[0]; + siphashadd64bits(&sh, tmp.whole); + grapheme += 1; + } + for (; grapheme + 1 < subtext.length; grapheme += 2) { + tmp.chunks[0] = (int32_t)bytes[grapheme]; + tmp.chunks[1] = (int32_t)bytes[grapheme+1]; + siphashadd64bits(&sh, tmp.whole); + } + leftover = grapheme < subtext.length ? (int32_t)bytes[grapheme] : 0; + } else if (subtext.tag == TEXT_SHORT_GRAPHEMES) { + if (leftover) { + tmp.chunks[0] = leftover; + tmp.chunks[1] = subtext.short_graphemes[0]; + siphashadd64bits(&sh, tmp.whole); + leftover = subtext.length > 1 ? 
subtext.short_graphemes[1] : 0; + } else if (subtext.length == 1) { + leftover = subtext.short_graphemes[0]; + } else { + tmp.chunks[0] = subtext.short_graphemes[0]; + tmp.chunks[1] = subtext.short_graphemes[1]; + siphashadd64bits(&sh, tmp.whole); + } + } else if (subtext.tag == TEXT_GRAPHEMES) { + int32_t *graphemes = subtext.graphemes; + int64_t grapheme = 0; + if (leftover) { + tmp.chunks[0] = leftover; + tmp.chunks[1] = graphemes[0]; + siphashadd64bits(&sh, tmp.whole); + grapheme += 1; + } + for (; grapheme + 1 < subtext.length; grapheme += 2) { + tmp.chunks[0] = graphemes[grapheme]; + tmp.chunks[1] = graphemes[grapheme+1]; + siphashadd64bits(&sh, tmp.whole); + } + leftover = grapheme < subtext.length ? graphemes[grapheme] : 0; + } + + to_hash -= text->subtexts[sub_i].length; + + ++sub_i; + } + + text->hash = siphashfinish_last_part(&sh, leftover); + break; + } + default: errx(1, "Invalid text"); + } + + if (text->hash == 0) + text->hash = 1; + + return text->hash; } -public bool Text$has(CORD str, CORD target, Where_t where) +int32_t _next_grapheme(Text_t text, iteration_state_t *state, int64_t index) { - if (!target) return true; - if (!str) return false; + switch (text.tag) { + case TEXT_ASCII: return index < text.length ? (int32_t)text.ascii[index] : 0; + case TEXT_SHORT_ASCII: return index < text.length ? (int32_t)text.short_ascii[index] : 0; + case TEXT_GRAPHEMES: return index < text.length ? text.graphemes[index] : 0; + case TEXT_SHORT_GRAPHEMES: return index < text.length ? 
text.short_graphemes[index] : 0; + case TEXT_SUBTEXT: { + iteration_state_t backup_state = {0, 0}; + if (!state) state = &backup_state; - uint8_t str_buf[128] = {0}; size_t str_norm_len = sizeof(str_buf); - uint8_t *str_normalized = _normalize(str, str_buf, &str_norm_len); + if (index < 0 || index >= text.length) + return 0; - uint8_t target_buf[128] = {0}; size_t target_norm_len = sizeof(target_buf); - uint8_t *target_normalized = _normalize(target, target_buf, &target_norm_len); + while (index < state->sum_of_previous_subtexts && state->subtext > 0) { + state->sum_of_previous_subtexts -= text.subtexts[state->subtext].length; + state->subtext -= 1; + } + for (;;) { + if (index < state->sum_of_previous_subtexts + text.subtexts[state->subtext].length) + return _next_grapheme(text.subtexts[state->subtext], NULL, index); + state->sum_of_previous_subtexts += text.subtexts[state->subtext].length; + state->subtext += 1; + } + return 0; + } + default: errx(1, "Invalid text"); + } + return 0; +} - if (target_norm_len > str_norm_len) return false; +int32_t get_grapheme(Text_t text, int64_t index) +{ + iteration_state_t state = {0, 0}; + return _next_grapheme(text, &state, index); +} - bool ret; - if (where.tag == $tag$Where$Start) { - ret = (u8_strncmp(str_normalized, target_normalized, target_norm_len-1) == 0); - } else if (where.tag == $tag$Where$End) { - ret = (u8_strcmp(str_normalized + str_norm_len - target_norm_len, target_normalized) == 0); - } else { - assert(where.tag == $tag$Where$Anywhere); - ret = (u8_strstr(str_normalized, target_normalized) != NULL); +int32_t Text$compare(const Text_t *a, const Text_t *b) +{ + int64_t len = MAX(a->length, b->length); + iteration_state_t a_state = {0, 0}, b_state = {0, 0}; + for (int64_t i = 0; i < len; i++) { + int32_t ai = _next_grapheme(*a, &a_state, i); + int32_t bi = _next_grapheme(*b, &b_state, i); + if (ai == bi) continue; + int32_t cmp; + if (ai > 0 && bi > 0) { + cmp = u32_cmp((uint32_t*)&ai, (uint32_t*)&bi, 1); + } 
else if (ai > 0) { + cmp = u32_cmp2( + (uint32_t*)&ai, 1, + synthetic_graphemes[-bi-1].codepoints, + synthetic_graphemes[-bi-1].num_codepoints); + } else if (bi > 0) { + cmp = u32_cmp2( + synthetic_graphemes[-ai-1].codepoints, + synthetic_graphemes[-ai-1].num_codepoints, + (uint32_t*)&bi, 1); + } else { + cmp = u32_cmp2( + synthetic_graphemes[-ai-1].codepoints, + synthetic_graphemes[-ai-1].num_codepoints, + synthetic_graphemes[-bi-1].codepoints, + synthetic_graphemes[-bi-1].num_codepoints); + } + if (cmp != 0) return cmp; } + return 0; +} - if (str_normalized != str_buf) free(str_normalized); - if (target_normalized != target_buf) free(target_normalized); - return ret; +public bool Text$equal(const Text_t *a, const Text_t *b) +{ + if (a->length != b->length || (a->hash != 0 && b->hash != 0 && a->hash != b->hash)) + return false; + int64_t len = a->length; + iteration_state_t a_state = {0, 0}, b_state = {0, 0}; + for (int64_t i = 0; i < len; i++) { + int32_t ai = _next_grapheme(*a, &a_state, i); + int32_t bi = _next_grapheme(*b, &b_state, i); + if (ai != bi) return false; + } + return true; } -public CORD Text$without(CORD str, CORD target, Where_t where) +public bool Text$equal_ignoring_case(Text_t a, Text_t b) { - if (!str || !target) return str; + if (a.length != b.length) + return false; + int64_t len = a.length; + iteration_state_t a_state = {0, 0}, b_state = {0, 0}; + const char *language = uc_locale_language(); + for (int64_t i = 0; i < len; i++) { + int32_t ai = _next_grapheme(a, &a_state, i); + int32_t bi = _next_grapheme(b, &b_state, i); + if (ai != bi) { + const uint32_t *a_codepoints = ai >= 0 ? (uint32_t*)&ai : synthetic_graphemes[-ai-1].codepoints; + size_t a_len = ai >= 0 ? 
1 : synthetic_graphemes[-ai-1].num_codepoints; - size_t target_len = CORD_len(target); - size_t str_len = CORD_len(str); - if (where.tag == $tag$Where$Start) { - if (CORD_ncmp(str, 0, target, 0, target_len) == 0) - return CORD_substr(str, target_len, str_len - target_len); - return str; - } else if (where.tag == $tag$Where$End) { - if (CORD_ncmp(str, str_len-target_len, target, 0, target_len) == 0) - return CORD_substr(str, 0, str_len - target_len); - return str; - } else { - CORD ret = CORD_EMPTY; - size_t i = 0; - for (;;) { - size_t match = CORD_str(str, i, target); - if (match == CORD_NOT_FOUND) { - if (i == 0) return str; // No matches! - ret = CORD_cat(ret, CORD_substr(str, i, str_len)); - break; - } - ret = CORD_cat(ret, CORD_substr(str, i, (match-i))); - i = match + target_len; + const uint32_t *b_codepoints = bi >= 0 ? (uint32_t*)&bi : synthetic_graphemes[-bi-1].codepoints; + size_t b_len = bi >= 0 ? 1 : synthetic_graphemes[-bi-1].num_codepoints; + + int cmp; + (void)u32_casecmp(a_codepoints, a_len, b_codepoints, b_len, language, UNINORM_NFC, &cmp); + if (cmp != 0) + return false; } - return ret; } + return true; } -public CORD Text$trimmed(CORD str, CORD skip, Where_t where) +public Text_t Text$upper(Text_t text) { - if (!str || !skip) return str; - const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(str); - const uint8_t *uskip = (const uint8_t*)CORD_to_const_char_star(skip); - // TODO: implement proper reverse iteration with u8_prev() - if (where.tag == $tag$Where$Start) { - size_t span = u8_strspn(ustr, uskip); - return (CORD)ustr + span; - } else if (where.tag == $tag$Where$End) { - size_t len = u8_strlen(ustr); - const uint8_t *back = ustr + len; - size_t back_span = 0; - while (back - back_span > ustr && u8_strspn(back-back_span-1, uskip) > back_span) - ++back_span; - return CORD_substr((CORD)ustr, 0, len - back_span); - } else { - size_t span = u8_strspn(ustr, uskip); - size_t len = u8_strlen(ustr); - const uint8_t *back = ustr + len; - 
size_t back_span = 0; - while (back - back_span > ustr + span && u8_strspn(back-back_span-1, uskip) > back_span) - ++back_span; - return CORD_substr((CORD)(ustr + span), 0, len - span - back_span); + size_t length; + uint32_t *codepoints = text_to_u32(text, &length); + const char *language = uc_locale_language(); + uint32_t buf[128]; + size_t out_len; + uint32_t *upper = u32_toupper(codepoints, length, language, UNINORM_NFC, buf, &out_len); + Text_t ret = text_from_u32(upper, out_len, false); + if (upper != buf) free(upper); + return ret; +} + +public Text_t Text$lower(Text_t text) +{ + size_t length; + uint32_t *codepoints = text_to_u32(text, &length); + const char *language = uc_locale_language(); + uint32_t buf[128]; + size_t out_len; + uint32_t *lower = u32_tolower(codepoints, length, language, UNINORM_NFC, buf, &out_len); + Text_t ret = text_from_u32(lower, out_len, false); + if (lower != codepoints) free(lower); + return ret; +} + +public Text_t Text$title(Text_t text) +{ + size_t length; + uint32_t *codepoints = text_to_u32(text, &length); + const char *language = uc_locale_language(); + uint32_t buf[128]; + size_t out_len; + uint32_t *title = u32_totitle(codepoints, length, language, UNINORM_NFC, buf, &out_len); + Text_t ret = text_from_u32(title, out_len, false); + if (title != codepoints) free(title); + return ret; +} + +static inline void skip_whitespace(Text_t text, int64_t *i) +{ + iteration_state_t state = {0, 0}; + while (*i < text.length) { + int32_t grapheme = _next_grapheme(text, &state, *i); + if (grapheme > 0 && !uc_is_property_white_space(grapheme)) + return; + *i += 1; } } -public find_result_t Text$find(CORD str, CORD pat) +static inline bool match_grapheme(Text_t text, int64_t *i, int32_t grapheme) { - if (!pat) return (find_result_t){.status=FIND_SUCCESS, .index=1}; - size_t pos = CORD_str(str, 0, pat); - return (pos == CORD_NOT_FOUND) ? 
(find_result_t){.status=FIND_FAILURE} : (find_result_t){.status=FIND_SUCCESS, .index=(int32_t)pos}; + if (*i < text.length && get_grapheme(text, *i) == grapheme) { + *i += 1; + return true; + } + return false; } -public CORD Text$replace(CORD text, CORD pat, CORD replacement, Int_t int_limit) +static inline bool match_str(Text_t text, int64_t *i, const char *str) { - if (!text || !pat) return text; - CORD ret = CORD_EMPTY; - size_t pos = 0, pat_len = CORD_len(pat); - int64_t limit = Int_to_Int64(int_limit, false); - for (size_t found; limit != 0 && (found=CORD_str(text, pos, pat)) != CORD_NOT_FOUND; --limit) { - ret = CORD_all(ret, CORD_substr(text, pos, found - pos), replacement); - pos = found + pat_len; + iteration_state_t state = {0, 0}; + int64_t matched = 0; + while (matched[str]) { + if (*i + matched >= text.length || _next_grapheme(text, &state, *i + matched) != str[matched]) + return false; + matched += 1; } - size_t str_len = CORD_len(text); - return CORD_cat(ret, CORD_substr(text, pos, str_len - pos)); + *i += matched; + return true; } -public array_t Text$split(CORD str, CORD split) +static inline bool match_property(Text_t text, int64_t *i, uc_property_t prop) { - if (!str) return (array_t){.data=GC_MALLOC(sizeof(CORD)), .atomic=1, .length=1, .stride=sizeof(CORD)}; - array_t strings = {.stride=sizeof(CORD), .atomic=1}; + if (*i >= text.length) return false; + int32_t grapheme = get_grapheme(text, *i); + if (grapheme < 0) // TODO: check every codepoint in the cluster? 
+ grapheme = synthetic_graphemes[-grapheme-1].codepoints[0]; - const uint8_t *ustr = (uint8_t*)CORD_to_const_char_star(str); - const uint8_t *usplit = (uint8_t*)CORD_to_const_char_star(split); - for (int64_t i = 0; ; ) { - size_t non_split = u8_strcspn(ustr + i, usplit); - CORD chunk = CORD_substr((CORD)ustr, i, non_split); - Array$insert(&strings, &chunk, I(0), sizeof(CORD)); + if (uc_is_property(grapheme, prop)) { + *i += 1; + return true; + } + return false; +} - i += non_split; +static int64_t parse_int(Text_t text, int64_t *i) +{ + iteration_state_t state = {0, 0}; + int64_t value = 0; + for (;; *i += 1) { + int32_t grapheme = _next_grapheme(text, &state, *i); + if (grapheme < 0) + grapheme = synthetic_graphemes[-grapheme-1].codepoints[0]; + int digit = uc_digit_value(grapheme); + if (digit < 0) break; + if (value >= INT64_MAX/10) break; + value = 10*value + digit; + } + return value; +} - size_t split_span = u8_strspn(ustr + i, usplit); - if (split_span == 0) break; - i += split_span; +const char *get_property_name(Text_t text, int64_t *i) +{ + skip_whitespace(text, i); + char *name = GC_MALLOC_ATOMIC(UNINAME_MAX); + char *dest = name; + iteration_state_t state = {0, 0}; + while (*i < text.length) { + int32_t grapheme = _next_grapheme(text, &state, *i); + if (!(grapheme & ~0xFF) && (isalnum(grapheme) || grapheme == ' ' || grapheme == '_' || grapheme == '-')) { + *dest = (char)grapheme; + ++dest; + if (dest >= name + UNINAME_MAX - 1) + break; + } else if (dest == name && grapheme >= 0 && grapheme != ']') { + // Literal character escape: [..[] --> "LEFT SQUARE BRACKET" + name = unicode_character_name(grapheme, name); + *i += 1; + return name; + } else { + break; + } + *i += 1; } - return strings; + if (dest == name) return NULL; + *dest = '\0'; + return name; } -public CORD Text$join(CORD glue, array_t pieces) +#define EAT1(state, cond) ({\ + int32_t grapheme = _next_grapheme(text, state, text_index); \ + bool success = (cond); \ + if (success) text_index += 1; 
\ + success; }) + +#define EAT_MANY(state, cond) ({ int64_t n = 0; while (EAT1(state, cond)) { n += 1; } n; }) + +int64_t match_email(Text_t text, int64_t text_index) { - if (pieces.length == 0) return CORD_EMPTY; + // email = local "@" domain + // local = 1-64 ([a-zA-Z0-9!#$%&‘*+–/=?^_`.{|}~] | non-ascii) + // domain = dns-label ("." dns-label)* + // dns-label = 1-63 ([a-zA-Z0-9-] | non-ascii) - CORD ret = CORD_EMPTY; - for (int64_t i = 0; i < pieces.length; i++) { - if (i > 0) ret = CORD_cat(ret, glue); - ret = CORD_cat(ret, *(CORD*)((void*)pieces.data + i*pieces.stride)); + iteration_state_t state = {0, 0}; + if (text_index > 0) { + int32_t prev_codepoint = _next_grapheme(text, &state, text_index - 1); + if (prev_codepoint < 0) + prev_codepoint = synthetic_graphemes[-prev_codepoint-1].codepoints[0]; + if (uc_is_property_alphabetic(prev_codepoint)) + return -1; } - return ret; + + int64_t start_index = text_index; + + // Local part: + int64_t local_len = 0; + static const char *allowed_local = "!#$%&‘*+–/=?^_`.{|}~"; + while (EAT1(&state, (grapheme & ~0x7F) || isalnum((char)grapheme) || strchr(allowed_local, (char)grapheme))) { + local_len += 1; + if (local_len > 64) return -1; + } + + if (!EAT1(&state, grapheme == '@')) + return -1; + + // Host + int64_t host_len = 0; + do { + int64_t label_len = 0; + while (EAT1(&state, (grapheme & ~0x7F) || isalnum((char)grapheme) || grapheme == '-')) { + label_len += 1; + if (label_len > 63) return -1; + } + + if (label_len == 0) + return -1; + + host_len += label_len; + if (host_len > 255) + return -1; + host_len += 1; + } while (EAT1(&state, grapheme == '.')); + + return text_index - start_index; } -public array_t Text$clusters(CORD text) +int64_t match_ipv6(Text_t text, int64_t text_index) { - array_t clusters = {.atomic=1}; - uint8_t buf[128] = {0}; size_t norm_len = sizeof(buf); - uint8_t *normalized = _normalize(text, buf, &norm_len); - const uint8_t *end = normalized + strlen((char*)normalized); - for (const uint8_t 
*pos = normalized; pos != end; ) { - const uint8_t *next = u8_grapheme_next(pos, end); - size_t len = (size_t)(next - pos); - char cluster_buf[len+1]; - strlcpy(cluster_buf, (char*)pos, len+1); - CORD cluster = CORD_from_char_star(cluster_buf); - Array$insert(&clusters, &cluster, I(0), sizeof(CORD)); - pos = next; + iteration_state_t state = {0, 0}; + if (text_index > 0) { + int32_t prev_codepoint = _next_grapheme(text, &state, text_index - 1); + if ((prev_codepoint & ~0x7F) && (isxdigit(prev_codepoint) || prev_codepoint == ':')) + return -1; } + int64_t start_index = text_index; + const int NUM_CLUSTERS = 8; + bool double_colon_used = false; + for (int cluster = 0; cluster < NUM_CLUSTERS; cluster++) { + for (int digits = 0; digits < 4; digits++) { + if (!EAT1(&state, ~(grapheme & ~0x7F) && isxdigit((char)grapheme))) + break; + } + if (EAT1(&state, ~(grapheme & ~0x7F) && isxdigit((char)grapheme))) + return -1; // Too many digits + + if (cluster == NUM_CLUSTERS-1) { + break; + } else if (!EAT1(&state, grapheme == ':')) { + if (double_colon_used) + break; + return -1; + } - if (normalized != buf) free(normalized); - return clusters; + if (EAT1(&state, grapheme == ':')) { + if (double_colon_used) + return -1; + double_colon_used = true; + } + } + return text_index - start_index; } -public array_t Text$codepoints(CORD text) +static int64_t match_ipv4(Text_t text, int64_t text_index) { - uint8_t norm_buf[128] = {0}; size_t norm_len = sizeof(norm_buf); - uint8_t *normalized = _normalize(text, norm_buf, &norm_len); + iteration_state_t state = {0, 0}; + if (text_index > 0) { + int32_t prev_codepoint = _next_grapheme(text, &state, text_index - 1); + if ((prev_codepoint & ~0x7F) && (isdigit(prev_codepoint) || prev_codepoint == '.')) + return -1; + } + int64_t start_index = text_index; - uint32_t codepoint_buf[128] = {0}; - size_t codepoint_len = sizeof(codepoint_buf); - uint32_t *codepoints = u8_to_u32(normalized, norm_len-1, codepoint_buf, &codepoint_len); - array_t ret = { 
- .length=codepoint_len, - .data=memcpy(GC_MALLOC_ATOMIC(sizeof(int32_t[codepoint_len])), codepoints, sizeof(int32_t[codepoint_len])), - .stride=sizeof(int32_t), - .atomic=1, - }; + const int NUM_CLUSTERS = 4; + for (int cluster = 0; cluster < NUM_CLUSTERS; cluster++) { + for (int digits = 0; digits < 3; digits++) { + if (!EAT1(&state, ~(grapheme & ~0x7F) && isdigit((char)grapheme))) { + if (digits == 0) return -1; + break; + } + } - if (normalized != norm_buf) free(normalized); - if (codepoints != codepoint_buf) free(codepoints); - return ret; + if (EAT1(&state, ~(grapheme & ~0x7F) && isdigit((char)grapheme))) + return -1; // Too many digits + + if (cluster == NUM_CLUSTERS-1) + break; + else if (!EAT1(&state, grapheme == '.')) + return -1; + } + return (text_index - start_index); } -public array_t Text$bytes(CORD text) +int64_t match_uri(Text_t text, int64_t text_index) { - uint8_t norm_buf[128] = {0}; size_t norm_len = sizeof(norm_buf); - uint8_t *normalized = _normalize(text, norm_buf, &norm_len); + // URI = scheme ":" ["//" authority] path ["?" query] ["#" fragment] + // scheme = [a-zA-Z] [a-zA-Z0-9+.-] + // authority = [userinfo "@"] host [":" port] - --norm_len; // NUL byte - array_t ret = { - .length=norm_len, - .data=memcpy(GC_MALLOC_ATOMIC(sizeof(uint8_t[norm_len])), normalized, sizeof(uint8_t[norm_len])), - .stride=sizeof(uint8_t), - .atomic=1, - }; + iteration_state_t state = {0, 0}; + if (text_index > 0) { + int32_t prev_codepoint = _next_grapheme(text, &state, text_index - 1); + if (prev_codepoint < 0) + prev_codepoint = synthetic_graphemes[-prev_codepoint-1].codepoints[0]; + if (uc_is_property_alphabetic(prev_codepoint)) + return -1; + } - if (normalized != norm_buf) free(normalized); - return ret; + int64_t start_index = text_index; + + // Scheme: + if (!EAT1(&state, isalpha(grapheme))) + return -1; + + EAT_MANY(&state, !(grapheme & ~0x7F) && (isalnum(grapheme) || grapheme == '+' || grapheme == '.' 
|| grapheme == '-')); + + if (text_index == start_index) + return -1; + + if (!match_grapheme(text, &text_index, ':')) + return -1; + + // Authority: + if (match_str(text, &text_index, "//")) { + int64_t authority_start = text_index; + // Username or host: + static const char *forbidden = "#?:@ \t\r\n<>[]{}\\^|\"`/"; + if (EAT_MANY(&state, (grapheme & ~0x7F) || !strchr(forbidden, (char)grapheme)) == 0) + return -1; + + if (EAT1(&state, grapheme == '@')) { + // Found a username, now get a host: + if (EAT_MANY(&state, (grapheme & ~0x7F) || !strchr(forbidden, (char)grapheme)) == 0) + return -1; + } else { + int64_t ip = authority_start; + int64_t ipv4_len = match_ipv4(text, ip); + if (ipv4_len > 0) { + ip += ipv4_len; + } else if (match_grapheme(text, &ip, '[')) { + ip += match_ipv6(text, ip); + if (ip > authority_start + 1 && match_grapheme(text, &ip, ']')) + text_index = ip; + } + } + + // Port: + if (EAT1(&state, grapheme == ':')) { + if (EAT_MANY(&state, !(grapheme & ~0x7F) && isdigit(grapheme)) == 0) + return -1; + } + if (!EAT1(&state, grapheme == '/')) + return (text_index - start_index); // No path + } else { + // Optional path root: + EAT1(&state, grapheme == '/'); + } + + // Path: + static const char *non_path = " \"#?<>[]{}\\^`|"; + EAT_MANY(&state, (grapheme & ~0x7F) || !strchr(non_path, (char)grapheme)); + + if (EAT1(&state, grapheme == '?')) { // Query + static const char *non_query = " \"#<>[]{}\\^`|"; + EAT_MANY(&state, (grapheme & ~0x7F) || !strchr(non_query, (char)grapheme)); + } + + if (EAT1(&state, grapheme == '#')) { // Fragment + static const char *non_fragment = " \"#<>[]{}\\^`|"; + EAT_MANY(&state, (grapheme & ~0x7F) || !strchr(non_fragment, (char)grapheme)); + } + return text_index - start_index; } -public Int_t Text$num_clusters(CORD text) +int64_t match(Text_t text, Text_t pattern, int64_t text_index, int64_t pattern_index) { - const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(text); - int64_t num_clusters = 0; - const uint8_t 
*end = ustr + u8_strlen(ustr); - for (const uint8_t *pos = ustr; pos != end; ) { - const uint8_t *next = u8_grapheme_next(pos, end); - ++num_clusters; - pos = next; + if (pattern_index >= pattern.length) return 0; + int64_t start_index = text_index; + iteration_state_t pattern_state = {0, 0}, text_state = {0, 0}; + while (pattern_index < pattern.length) { + int64_t old_pat_index = pattern_index; + if (match_str(pattern, &pattern_index, "[..")) { + skip_whitespace(pattern, &pattern_index); + int64_t min, max; + if (uc_is_digit(_next_grapheme(pattern, &pattern_state, pattern_index))) { + min = parse_int(pattern, &pattern_index); + skip_whitespace(pattern, &pattern_index); + if (match_grapheme(pattern, &pattern_index, '+')) { + max = INT64_MAX; + } else if (match_grapheme(pattern, &pattern_index, '-')) { + max = parse_int(pattern, &pattern_index); + } else { + max = min; + } + } else { + min = 1, max = INT64_MAX; + } + + skip_whitespace(pattern, &pattern_index); + bool want_to_match = !match_grapheme(pattern, &pattern_index, '!'); + const char *prop_name = get_property_name(pattern, &pattern_index); + + skip_whitespace(pattern, &pattern_index); + if (!match_grapheme(pattern, &pattern_index, ']')) + errx(1, "Missing closing ']' in pattern: \"%T\"", &pattern); + + int64_t before_group = text_index; + bool any = false; + uc_property_t prop; + int32_t specific_codepoint = UNINAME_INVALID; + +#define FAIL() ({ if (min < 1) { text_index = before_group; continue; } else { return -1; } }) + if (prop_name) { + switch (tolower(prop_name[0])) { + case 'd': + if (strcasecmp(prop_name, "digit") == 0) { + prop = UC_PROPERTY_DECIMAL_DIGIT; + goto got_prop; + } + break; + case 'e': + if (strcasecmp(prop_name, "end") == 0) { + if (text_index != text.length) + FAIL(); + continue; + } else if (prop_name && strcasecmp(prop_name, "email") == 0) { + int64_t len = match_email(text, text_index); + if (len < 0) + FAIL(); + text_index += len; + continue; + } else if (prop_name && 
strcasecmp(prop_name, "emoji") == 0) { + prop = UC_PROPERTY_EMOJI; + goto got_prop; + } + break; + case 'i': + if (prop_name && strcasecmp(prop_name, "id") == 0) { + if (!EAT1(&text_state, uc_is_property(grapheme, UC_PROPERTY_XID_START))) + FAIL(); + EAT_MANY(&text_state, uc_is_property(grapheme, UC_PROPERTY_XID_CONTINUE)); + continue; + } else if (prop_name && strcasecmp(prop_name, "ipv4") == 0) { + int64_t len = match_ipv4(text, text_index); + if (len < 0) + FAIL(); + text_index += len; + continue; + } else if (prop_name && strcasecmp(prop_name, "ipv6") == 0) { + int64_t len = match_ipv6(text, text_index); + if (len < 0) + FAIL(); + text_index += len; + continue; + } else if (prop_name && strcasecmp(prop_name, "ip") == 0) { + int64_t len = match_ipv6(text, text_index); + if (len < 0) + len = match_ipv4(text, text_index); + if (len < 0) + FAIL(); + text_index += len; + continue; + } + break; + case 's': + if (strcasecmp(prop_name, "start") == 0) { + if (text_index != 0) return -1; + continue; + } + break; + case 'u': + if (prop_name && strcasecmp(prop_name, "uri") == 0) { + int64_t len = match_uri(text, text_index); + if (len < 0) + FAIL(); + text_index += len; + continue; + } else if (prop_name && strcasecmp(prop_name, "url") == 0) { + int64_t lookahead = text_index; + if (!(match_str(text, &lookahead, "https:") + || match_str(text, &lookahead, "http:") + || match_str(text, &lookahead, "ftp:") + || match_str(text, &lookahead, "wss:") + || match_str(text, &lookahead, "ws:"))) + FAIL(); + + int64_t len = match_uri(text, text_index); + if (len < 0) + FAIL(); + text_index += len; + continue; + } + break; + } + + prop = uc_property_byname(prop_name); + if (!uc_property_is_valid(prop)) { + specific_codepoint = unicode_name_character(prop_name); + if (specific_codepoint == UNINAME_INVALID) + errx(1, "Not a valid property or character name: %s", prop_name); + } + } else { + any = true; + prop = UC_PROPERTY_PRIVATE_USE; + } + got_prop:; + + if (min == 0 && pattern_index < 
pattern.length) { + int64_t match_len = match(text, pattern, text_index, pattern_index); + if (match_len >= 0) + return (text_index - start_index) + match_len; + } + + for (int64_t count = 0; count < max; ) { + int32_t grapheme = _next_grapheme(text, &text_state, text_index); + if (grapheme < 0) + grapheme = synthetic_graphemes[-grapheme-1].codepoints[0]; + + bool success; + if (any) + success = true; + else if (specific_codepoint != UNINAME_INVALID) + success = (grapheme == specific_codepoint); + else + success = uc_is_property(grapheme, prop); + + if (success != want_to_match) { + if (count < min) return -1; + else break; + } + + text_index += 1; + count += 1; + + if (count >= min) { + if (pattern_index < pattern.length) { + int64_t match_len = match(text, pattern, text_index, pattern_index); + if (match_len >= 0) { + return (text_index - start_index) + match_len; + } + } else if (text_index >= text.length) { + break; + } + } + } + } else if (uc_is_property(_next_grapheme(pattern, &pattern_state, pattern_index), UC_PROPERTY_QUOTATION_MARK) + && (pattern_index += 1, match_grapheme(pattern, &pattern_index, '?'))) { + // Quotation: "?", '?', etc + int32_t open = _next_grapheme(pattern, &pattern_state, pattern_index-2); + if (!match_grapheme(text, &text_index, open)) return -1; + int32_t close = open; + uc_mirror_char(open, (uint32_t*)&close); + if (!match_grapheme(pattern, &pattern_index, close)) + errx(1, "I expected a closing brace"); + while (text_index < text.length) { + int32_t c = _next_grapheme(text, &text_state, text_index); + if (c == close) + return (text_index - start_index); + + if (c == '\\' && text_index < text.length) { + text_index += 2; + } else { + text_index += 1; + } + } + return -1; + } else if (uc_is_property(_next_grapheme(pattern, &pattern_state, pattern_index), UC_PROPERTY_PAIRED_PUNCTUATION) + && (pattern_index += 1, match_grapheme(pattern, &pattern_index, '?'))) { + // Nested punctuation: (?), [?], etc + int32_t open = 
_next_grapheme(pattern, &pattern_state, pattern_index-2); + if (!match_grapheme(text, &text_index, open)) return -1; + int32_t close = open; + uc_mirror_char(open, (uint32_t*)&close); + if (!match_grapheme(pattern, &pattern_index, close)) + errx(1, "I expected a closing brace"); + int64_t depth = 1; + for (; depth > 0 && text_index < text.length; ++text_index) { + int32_t c = _next_grapheme(text, &text_state, text_index); + if (c == open) + depth += 1; + else if (c == close) + depth -= 1; + } + if (depth > 0) return -1; + } else { + // Plain character: + pattern_index = old_pat_index; + int32_t pat_grapheme = _next_grapheme(pattern, &pattern_state, pattern_index); + + if (pattern_index == 0 && text_index > 0) { + int32_t pat_codepoint = pat_grapheme; + if (pat_codepoint < 0) + pat_codepoint = synthetic_graphemes[-pat_codepoint-1].codepoints[0]; + + int32_t prev_codepoint = _next_grapheme(text, &text_state, text_index - 1); + if (prev_codepoint < 0) + prev_codepoint = synthetic_graphemes[-prev_codepoint-1].codepoints[0]; + if (uc_is_property_alphabetic(pat_codepoint) && uc_is_property_alphabetic(prev_codepoint)) + return -1; + } + + int32_t text_grapheme = _next_grapheme(text, &text_state, text_index); + if (pat_grapheme != text_grapheme) + return -1; + + pattern_index += 1; + text_index += 1; + + if (pattern_index == pattern.length && text_index < text.length) { + int32_t pat_codepoint = pat_grapheme; + if (pat_codepoint < 0) + pat_codepoint = synthetic_graphemes[-pat_codepoint-1].codepoints[0]; + + int32_t next_codepoint = _next_grapheme(text, &text_state, text_index); + if (next_codepoint < 0) + next_codepoint = synthetic_graphemes[-next_codepoint-1].codepoints[0]; + if (uc_is_property_alphabetic(pat_codepoint) && uc_is_property_alphabetic(next_codepoint)) + return -1; + } + } + } + if (text_index >= text.length && pattern_index < pattern.length) + return -1; + return (text_index - start_index); +} + +#undef EAT1 +#undef EAT_MANY + +public Int_t Text$find(Text_t 
text, Text_t pattern, Int_t from_index, int64_t *match_length) +{ + int32_t first = get_grapheme(pattern, 0); + bool find_first = (first != '[' + && !uc_is_property(first, UC_PROPERTY_QUOTATION_MARK) + && !uc_is_property(first, UC_PROPERTY_PAIRED_PUNCTUATION)); + + iteration_state_t text_state = {0, 0}; + for (int64_t i = Int_to_Int64(from_index, false)-1; i < text.length; i++) { + // Optimization: quickly skip ahead to first char in pattern: + if (find_first) { + while (i < text.length && _next_grapheme(text, &text_state, i) != first) + ++i; + } + + int64_t m = match(text, pattern, i, 0); + if (m >= 0) { + if (match_length) + *match_length = m; + return I(i+1); + } } - return I(num_clusters); + if (match_length) + *match_length = -1; + return I(0); +} + +public int printf_text_size(const struct printf_info *info, size_t n, int argtypes[n], int sizes[n]) +{ + if (n < 1) return -1; + (void)info; + argtypes[0] = PA_POINTER; + sizes[0] = sizeof(Text_t*); + return 1; +} + +public int printf_text(FILE *stream, const struct printf_info *info, const void *const args[]) +{ + (void)info; + Text_t t = **(Text_t**)args[0]; + return Text$print(stream, t); } -public Int_t Text$num_codepoints(CORD text) +public Text_t Text$as_text(const void *text, bool colorize, const TypeInfo *info) { - uint8_t buf[128] = {0}; size_t norm_len = sizeof(buf); - uint8_t *normalized = _normalize(text, buf, &norm_len); - int64_t num_codepoints = u8_mbsnlen(normalized, norm_len-1); - if (normalized != buf) free(normalized); - return I(num_codepoints); + (void)info; + if (!text) return Text$from_str("Text"); + return Text$quoted(*(Text_t*)text, colorize); } -public Int_t Text$num_bytes(CORD text) +public Text_t Text$quoted(Text_t text, bool colorize) { - uint8_t norm_buf[128] = {0}; size_t norm_len = sizeof(norm_buf); - uint8_t *normalized = _normalize(text, norm_buf, &norm_len); - --norm_len; // NUL byte - if (!normalized) errx(1, "Unicode normalization error!"); - if (normalized != norm_buf) 
free(normalized); - return I(norm_len); + // TODO: optimize for ASCII and short strings + array_t graphemes = {.atomic=1}; +#define add_char(c) Array$insert_value(&graphemes, (uint32_t)c, I_small(0), sizeof(uint32_t)) +#define add_str(s) ({ for (char *_c = s; *_c; ++_c) Array$insert_value(&graphemes, (uint32_t)*_c, I_small(0), sizeof(uint32_t)); }) + if (colorize) + add_str("\x1b[35m\""); + else + add_char('"'); + +#define add_escaped(str) ({ if (colorize) add_str("\x1b[34;1m"); add_char('\\'); add_str(str); if (colorize) add_str("\x1b[0;35m"); }) + iteration_state_t state = {0, 0}; + for (int64_t i = 0; i < text.length; i++) { + int32_t g = _next_grapheme(text, &state, i); + switch (g) { + case '\a': add_escaped("a"); break; + case '\b': add_escaped("b"); break; + case '\x1b': add_escaped("e"); break; + case '\f': add_escaped("f"); break; + case '\n': add_escaped("n"); break; + case '\r': add_escaped("r"); break; + case '\t': add_escaped("t"); break; + case '\v': add_escaped("v"); break; + case '"': add_escaped("\""); break; + case '\\': add_escaped("\\"); break; + case '\x00' ... '\x06': case '\x0E' ... '\x1A': + case '\x1C' ... '\x1F': case '\x7F' ... 
'\x7F': { + if (colorize) add_str("\x1b[34;1m"); + add_char('\\'); + add_char('x'); + char tmp[4]; + sprintf(tmp, "%02X", g); + add_str(tmp); + if (colorize) + add_str("\x1b[0;35m"); + break; + } + default: add_char(g); break; + } + } + + if (colorize) + add_str("\"\x1b[m"); + else + add_char('"'); + + return (Text_t){.length=graphemes.length, .tag=TEXT_GRAPHEMES, .graphemes=graphemes.data}; +#undef add_str +#undef add_char +#undef add_escaped } -public array_t Text$character_names(CORD text) +public Text_t Text$replace(Text_t text, Text_t pattern, Text_t replacement) { - array_t codepoints = Text$codepoints(text); - array_t ret = {.length=codepoints.length, .stride=sizeof(CORD), .data=GC_MALLOC(sizeof(CORD[codepoints.length]))}; - for (int64_t i = 0; i < codepoints.length; i++) { - char buf[UNINAME_MAX]; - unicode_character_name(*(ucs4_t*)(codepoints.data + codepoints.stride*i), buf); - *(CORD*)(ret.data + ret.stride*i) = CORD_from_char_star(buf); + Text_t ret = {.length=0}; + + Int_t i = I_small(0); + for (;;) { + int64_t len; + Int_t found = Text$find(text, pattern, i, &len); + if (found.small == I_small(0).small) break; + if (Int$compare(&found, &i, &$Text) > 0) { + ret = Text$concat( + ret, + Text$slice(text, i, Int$minus(found, I_small(1))), + replacement + ); + } else { + ret = concat2(ret, replacement); + } + } + if (Int_to_Int64(i, false) <= text.length) { + ret = concat2(ret, Text$slice(text, i, Int64_to_Int(text.length))); } return ret; } -public CORD Text$read_line(CORD prompt) +public Text_t Text$format(const char *fmt, ...) 
{ - char *line = readline(CORD_to_const_char_star(prompt)); - if (!line) return CORD_EMPTY; - CORD ret = CORD_from_char_star(line); - free(line); + va_list args; + va_start(args, fmt); + + char buf[8]; + int len = vsnprintf(buf, sizeof(buf), fmt, args); + Text_t ret; + if (len <= (int)sizeof(buf)) { + ret = (Text_t){ + .length=len, + .tag = TEXT_SHORT_ASCII, + }; + for (int i = 0; i < len; i++) + ret.short_ascii[i] = buf[i]; + } else { + char *str = GC_MALLOC_ATOMIC(len); + vsnprintf(str, len, fmt, args); + ret = Text$from_str(str); + } + va_end(args); return ret; } public const TypeInfo $Text = { - .size=sizeof(CORD), - .align=__alignof__(CORD), + .size=sizeof(Text_t), + .align=__alignof__(Text_t), .tag=TextInfo, .TextInfo={.lang="Text"}, }; diff --git a/builtins/text.h b/builtins/text.h index 017a2804..1e671695 100644 --- a/builtins/text.h +++ b/builtins/text.h @@ -1,10 +1,10 @@ #pragma once -// Type info and methods for Text datatype, which uses the Boehm "cord" library -// and libunistr +// Type info and methods for Text datatype, which uses a struct inspired by +// Raku's string representation and libunistr -#include <gc/cord.h> #include <stdbool.h> +#include <printf.h> #include <stdint.h> #include "datatypes.h" @@ -12,36 +12,57 @@ #include "types.h" #include "where.h" -#define Text_t CORD - typedef struct { enum { FIND_FAILURE, FIND_SUCCESS } status; int32_t index; } find_result_t; -CORD Text$as_text(const void *str, bool colorize, const TypeInfo *info); -CORD Text$quoted(CORD str, bool colorize); -int Text$compare(const CORD *x, const CORD *y); -bool Text$equal(const CORD *x, const CORD *y); -uint32_t Text$hash(const CORD *cord); -CORD Text$upper(CORD str); -CORD Text$lower(CORD str); -CORD Text$title(CORD str); -bool Text$has(CORD str, CORD target, Where_t where); -CORD Text$without(CORD str, CORD target, Where_t where); -CORD Text$trimmed(CORD str, CORD skip, Where_t where); -find_result_t Text$find(CORD str, CORD pat); -CORD Text$replace(CORD text, CORD 
pat, CORD replacement, Int_t limit); -array_t Text$split(CORD str, CORD split); -CORD Text$join(CORD glue, array_t pieces); -array_t Text$clusters(CORD text); -array_t Text$codepoints(CORD text); -array_t Text$bytes(CORD text); -Int_t Text$num_clusters(CORD text); -Int_t Text$num_codepoints(CORD text); -Int_t Text$num_bytes(CORD text); -array_t Text$character_names(CORD text); -CORD Text$read_line(CORD prompt); +// CORD Text$as_text(const void *str, bool colorize, const TypeInfo *info); +// CORD Text$quoted(CORD str, bool colorize); +// // int Text$compare(const CORD *x, const CORD *y); +// // bool Text$equal(const CORD *x, const CORD *y); +// // uint32_t Text$hash(const CORD *cord); +// // CORD Text$upper(CORD str); +// // CORD Text$lower(CORD str); +// // CORD Text$title(CORD str); +// bool Text$has(CORD str, CORD target, Where_t where); +// CORD Text$without(CORD str, CORD target, Where_t where); +// CORD Text$trimmed(CORD str, CORD skip, Where_t where); +// find_result_t Text$find(CORD str, CORD pat); +// CORD Text$replace(CORD text, CORD pat, CORD replacement, Int_t limit); +// array_t Text$split(CORD str, CORD split); +// CORD Text$join(CORD glue, array_t pieces); +// array_t Text$clusters(CORD text); +// array_t Text$codepoints(CORD text); +// array_t Text$bytes(CORD text); +// Int_t Text$num_clusters(CORD text); +// Int_t Text$num_codepoints(CORD text); +// Int_t Text$num_bytes(CORD text); +// array_t Text$character_names(CORD text); +// CORD Text$read_line(CORD prompt); + +int printf_text(FILE *stream, const struct printf_info *info, const void *const args[]); +int printf_text_size(const struct printf_info *info, size_t n, int argtypes[n], int sizes[n]); + +int Text$print(FILE *stream, Text_t t); +void Text$visualize(Text_t t); +Text_t Text$_concat(int n, Text_t items[n]); +#define Text$concat(...) 
Text$_concat(sizeof((Text_t[]){__VA_ARGS__})/sizeof(Text_t), (Text_t[]){__VA_ARGS__}) +Text_t Text$slice(Text_t text, Int_t first_int, Int_t last_int); +Text_t Text$from_str(const char *str); +uint64_t Text$hash(Text_t *text); +int32_t Text$compare(const Text_t *a, const Text_t *b); +bool Text$equal(const Text_t *a, const Text_t *b); +bool Text$equal_ignoring_case(Text_t a, Text_t b); +Text_t Text$upper(Text_t text); +Text_t Text$lower(Text_t text); +Text_t Text$title(Text_t text); +Text_t Text$as_text(const void *text, bool colorize, const TypeInfo *info); +Text_t Text$quoted(Text_t str, bool colorize); +Text_t Text$replace(Text_t str, Text_t pat, Text_t replacement); +Int_t Text$find(Text_t text, Text_t pattern, Int_t i, int64_t *match_length); +const char *Text$as_c_string(Text_t text); +public Text_t Text$format(const char *fmt, ...); extern const TypeInfo $Text; diff --git a/builtins/thread.c b/builtins/thread.c index b9586917..793a0101 100644 --- a/builtins/thread.c +++ b/builtins/thread.c @@ -3,7 +3,6 @@ #include <ctype.h> #include <err.h> #include <gc.h> -#include <gc/cord.h> #include <math.h> #include <stdbool.h> #include <stdint.h> @@ -14,6 +13,7 @@ #include "array.h" #include "functions.h" #include "halfsiphash.h" +#include "text.h" #include "types.h" #include "util.h" @@ -39,13 +39,13 @@ public void Thread$detach(pthread_t *thread) pthread_detach(*thread); } -CORD Thread$as_text(const pthread_t **thread, bool colorize, const TypeInfo *type) +Text_t Thread$as_text(const pthread_t **thread, bool colorize, const TypeInfo *type) { (void)type; if (!thread) { - return colorize ? "\x1b[34;1mThread\x1b[m" : "Thread"; + return Text$from_str(colorize ? "\x1b[34;1mThread\x1b[m" : "Thread"); } - return CORD_asprintf(colorize ? "\x1b[34;1mThread(%p)\x1b[m" : "Thread(%p)", *thread); + return Text$format(colorize ? 
"\x1b[34;1mThread(%p)\x1b[m" : "Thread(%p)", *thread); } public const TypeInfo Thread = { diff --git a/builtins/thread.h b/builtins/thread.h index efccae33..2956dda6 100644 --- a/builtins/thread.h +++ b/builtins/thread.h @@ -14,7 +14,7 @@ pthread_t *Thread$new(closure_t fn); void Thread$cancel(pthread_t *thread); void Thread$join(pthread_t *thread); void Thread$detach(pthread_t *thread); -CORD Thread$as_text(const pthread_t **thread, bool colorize, const TypeInfo *type); +Text_t Thread$as_text(const pthread_t **thread, bool colorize, const TypeInfo *type); extern TypeInfo Thread; diff --git a/builtins/types.c b/builtins/types.c index 4fb2c523..ab1b8013 100644 --- a/builtins/types.c +++ b/builtins/types.c @@ -9,17 +9,20 @@ #include "array.h" #include "pointer.h" #include "table.h" +#include "text.h" #include "types.h" -public CORD Type$as_text(const void *typeinfo, bool colorize, const TypeInfo *type) +public Text_t Type$as_text(const void *typeinfo, bool colorize, const TypeInfo *type) { - if (!typeinfo) return "TypeInfo"; + if (!typeinfo) return Text$from_str("TypeInfo"); - if (!colorize) - return type->TypeInfoInfo.type_str; - CORD c; - CORD_sprintf(&c, "\x1b[36;1m%s\x1b[m", type->TypeInfoInfo.type_str); - return c; + if (colorize) + return Text$concat( + Text$from_str("\x1b[36;1m"), + Text$from_str(type->TypeInfoInfo.type_str), + Text$from_str("\x1b[m")); + else + return Text$from_str(type->TypeInfoInfo.type_str); } public const TypeInfo $TypeInfo = { @@ -32,13 +35,13 @@ public const TypeInfo $TypeInfo = { public const TypeInfo $Void = {.size=0, .align=0, .tag=EmptyStruct}; public const TypeInfo $Abort = {.size=0, .align=0, .tag=EmptyStruct}; -public CORD Func$as_text(const void *fn, bool colorize, const TypeInfo *type) +public Text_t Func$as_text(const void *fn, bool colorize, const TypeInfo *type) { (void)fn; - CORD c = type->FunctionInfo.type_str; + Text_t text = Text$from_str(type->FunctionInfo.type_str); if (fn && colorize) - CORD_sprintf(&c, 
"\x1b[32;1m%r\x1b[m", c); - return c; + text = Text$concat(Text$from_str("\x1b[32;1m"), text, Text$from_str("\x1b[m")); + return text; } // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/types.h b/builtins/types.h index 70f8dc00..2647ef92 100644 --- a/builtins/types.h +++ b/builtins/types.h @@ -2,7 +2,6 @@ // Type information and methods for TypeInfos (i.e. runtime representations of types) -#include <gc/cord.h> #include <stdbool.h> #include <stdint.h> @@ -13,7 +12,7 @@ struct TypeInfo; typedef uint32_t (*hash_fn_t)(const void*, const struct TypeInfo*); typedef int32_t (*compare_fn_t)(const void*, const void*, const struct TypeInfo*); typedef bool (*equal_fn_t)(const void*, const void*, const struct TypeInfo*); -typedef CORD (*str_fn_t)(const void*, bool, const struct TypeInfo*); +typedef Text_t (*text_fn_t)(const void*, bool, const struct TypeInfo*); typedef struct TypeInfo { int64_t size, align; @@ -24,7 +23,7 @@ typedef struct TypeInfo { equal_fn_t equal; compare_fn_t compare; hash_fn_t hash; - str_fn_t as_text; + text_fn_t as_text; } CustomInfo; struct { const char *sigil; @@ -76,7 +75,7 @@ extern const TypeInfo $Void; extern const TypeInfo $Abort; #define Void_t void -CORD Type$as_text(const void *typeinfo, bool colorize, const TypeInfo *type); -CORD Func$as_text(const void *fn, bool colorize, const TypeInfo *type); +Text_t Type$as_text(const void *typeinfo, bool colorize, const TypeInfo *type); +Text_t Func$as_text(const void *fn, bool colorize, const TypeInfo *type); // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/util.c b/builtins/util.c index d4f3cd31..7fca15e3 100644 --- a/builtins/util.c +++ b/builtins/util.c @@ -7,6 +7,7 @@ #include <stdlib.h> #include <string.h> +#include "text.h" #include "util.h" public bool USE_COLOR; @@ -67,4 +68,54 @@ public CORD CORD_asprintf(CORD fmt, ...) 
return c; } +public CORD CORD_quoted(CORD str) +{ + CORD quoted = "\""; + CORD_pos i; + CORD_FOR(i, str) { + char c = CORD_pos_fetch(i); + switch (c) { + case '\a': quoted = CORD_cat(quoted, "\\a"); break; + case '\b': quoted = CORD_cat(quoted, "\\b"); break; + case '\x1b': quoted = CORD_cat(quoted, "\\e"); break; + case '\f': quoted = CORD_cat(quoted, "\\f"); break; + case '\n': quoted = CORD_cat(quoted, "\\n"); break; + case '\r': quoted = CORD_cat(quoted, "\\r"); break; + case '\t': quoted = CORD_cat(quoted, "\\t"); break; + case '\v': quoted = CORD_cat(quoted, "\\v"); break; + case '"': quoted = CORD_cat(quoted, "\\\""); break; + case '\\': quoted = CORD_cat(quoted, "\\\\"); break; + case '\x00' ... '\x06': case '\x0E' ... '\x1A': + case '\x1C' ... '\x1F': case '\x7F' ... '\x7F': + CORD_sprintf(&quoted, "%r\\x%02X", quoted, c); + break; + default: quoted = CORD_cat_char(quoted, c); break; + } + } + quoted = CORD_cat_char(quoted, '"'); + return quoted; +} + +public CORD CORD_replace(CORD c, CORD to_replace, CORD replacement) +{ + size_t len = CORD_len(c); + size_t replaced_len = CORD_len(to_replace); + size_t pos = 0; + CORD ret = CORD_EMPTY; + while (pos < len) { + size_t found = CORD_str(c, pos, to_replace); + if (found == CORD_NOT_FOUND) { + if (pos < len-1) + ret = CORD_cat(ret, CORD_substr(c, pos, len)); + return ret; + } + if (found > pos) + ret = CORD_cat(ret, CORD_substr(c, pos, found-pos)); + ret = CORD_cat(ret, replacement); + pos = found + replaced_len; + } + return ret; +} + + // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/util.h b/builtins/util.h index 271403ff..a3f5f2b4 100644 --- a/builtins/util.h +++ b/builtins/util.h @@ -31,6 +31,8 @@ extern bool USE_COLOR; char *heap_strf(const char *fmt, ...); CORD CORD_asprintf(CORD fmt, ...); +CORD CORD_quoted(CORD str); +CORD CORD_replace(CORD c, CORD to_replace, CORD replacement); #define CORD_appendf(cord, fmt, ...) 
CORD_sprintf(cord, "%r" fmt, *(cord) __VA_OPT__(,) __VA_ARGS__) #define CORD_all(...) CORD_catn(sizeof((CORD[]){__VA_ARGS__})/sizeof(CORD), __VA_ARGS__) diff --git a/builtins/where.c b/builtins/where.c index f7db2db3..d57f532e 100644 --- a/builtins/where.c +++ b/builtins/where.c @@ -1,27 +1,27 @@ // A type called "Where" that is an enum for "Anywhere", "Start", or "End" // Mainly used for text methods -#include <gc/cord.h> #include <stdbool.h> #include <stdint.h> +#include "text.h" #include "types.h" -#include "where.h" #include "util.h" +#include "where.h" -static CORD Where$as_text(Where_t *obj, bool use_color) +static Text_t Where$as_text(Where_t *obj, bool use_color) { if (!obj) - return "Where"; + return Text$from_str("Where"); switch (obj->tag) { case $tag$Where$Anywhere: - return use_color ? "\x1b[36;1mWhere.Anywhere\x1b[m" : "Where.Anywhere"; + return Text$from_str(use_color ? "\x1b[36;1mWhere.Anywhere\x1b[m" : "Where.Anywhere"); case $tag$Where$Start: - return use_color ? "\x1b[36;1mWhere.Start\x1b[m" : "Where.Start"; + return Text$from_str(use_color ? "\x1b[36;1mWhere.Start\x1b[m" : "Where.Start"); case $tag$Where$End: - return use_color ? "\x1b[36;1mWhere.End\x1b[m" : "Where.End"; + return Text$from_str(use_color ? 
"\x1b[36;1mWhere.End\x1b[m" : "Where.End"); default: - return CORD_EMPTY; + return (Text_t){.length=0}; } } @@ -224,7 +224,7 @@ static CORD compile_lvalue(env_t *env, ast_t *ast) return CORD_all("Array_lvalue(", compile_type(item_type), ", ", target_code, ", ", compile_int_to_type(env, index->index, Type(IntType, .bits=TYPE_IBITS64)), ", ", CORD_asprintf("%ld", padded_type_size(item_type)), - ", ", Text$quoted(ast->file->filename, false), ", ", heap_strf("%ld", ast->start - ast->file->text), + ", ", CORD_quoted(ast->file->filename), ", ", heap_strf("%ld", ast->start - ast->file->text), ", ", heap_strf("%ld", ast->end - ast->file->text), ")"); } } else { @@ -320,7 +320,7 @@ CORD compile_statement(env_t *env, ast_t *ast) if (!expr_t) code_err(test->expr, "I couldn't figure out the type of this expression"); - CORD output = NULL; + CORD output = CORD_EMPTY; if (test->output) { const uint8_t *raw = (const uint8_t*)CORD_to_const_char_star(test->output); uint8_t buf[128] = {0}; @@ -328,6 +328,7 @@ CORD compile_statement(env_t *env, ast_t *ast) uint8_t *norm = u8_normalize(UNINORM_NFD, (uint8_t*)raw, strlen((char*)raw)+1, buf, &norm_len); assert(norm[norm_len-1] == 0); output = CORD_from_char_star((char*)norm); + CORD_printf("OUTPUT: %r\n", output); if (norm && norm != buf) free(norm); } @@ -337,8 +338,8 @@ CORD compile_statement(env_t *env, ast_t *ast) assert(compile_statement(env, test->expr) == CORD_EMPTY); return CORD_asprintf( "test(NULL, NULL, %r, %r, %ld, %ld);", - compile(env, WrapAST(test->expr, TextLiteral, .cord=output)), - compile(env, WrapAST(test->expr, TextLiteral, .cord=test->expr->file->filename)), + CORD_quoted(output), + CORD_quoted(test->expr->file->filename), (int64_t)(test->expr->start - test->expr->file->text), (int64_t)(test->expr->end - test->expr->file->text)); } else { @@ -355,8 +356,8 @@ CORD compile_statement(env_t *env, ast_t *ast) compile_declaration(t, var), var, val_code, var, compile_type_info(env, get_type(env, decl->value)), - 
compile(env, WrapAST(test->expr, TextLiteral, .cord=output)), - compile(env, WrapAST(test->expr, TextLiteral, .cord=test->expr->file->filename)), + CORD_quoted(output), + CORD_quoted(test->expr->file->filename), (int64_t)(test->expr->start - test->expr->file->text), (int64_t)(test->expr->end - test->expr->file->text)); } @@ -382,8 +383,8 @@ CORD compile_statement(env_t *env, ast_t *ast) compile_assignment(env, assign->targets->ast, value), compile(env, assign->targets->ast), compile_type_info(env, lhs_t), - compile(env, WrapAST(test->expr, TextLiteral, .cord=test->output)), - compile(env, WrapAST(test->expr, TextLiteral, .cord=test->expr->file->filename)), + CORD_quoted(test->output), + CORD_quoted(test->expr->file->filename), (int64_t)(test->expr->start - test->expr->file->text), (int64_t)(test->expr->end - test->expr->file->text)); } else { @@ -415,8 +416,8 @@ CORD compile_statement(env_t *env, ast_t *ast) CORD_appendf(&code, "&$1; }), %r, %r, %r, %ld, %ld);", compile_type_info(env, get_type(env, assign->targets->ast)), - compile(env, WrapAST(test->expr, TextLiteral, .cord=test->output)), - compile(env, WrapAST(test->expr, TextLiteral, .cord=test->expr->file->filename)), + CORD_quoted(test->output), + CORD_quoted(test->expr->file->filename), (int64_t)(test->expr->start - test->expr->file->text), (int64_t)(test->expr->end - test->expr->file->text)); return code; @@ -427,25 +428,25 @@ CORD compile_statement(env_t *env, ast_t *ast) compile_statement(env, test->expr), compile_lvalue(env, Match(test->expr, UpdateAssign)->lhs), compile_type_info(env, get_type(env, Match(test->expr, UpdateAssign)->lhs)), - compile(env, WrapAST(test->expr, TextLiteral, .cord=test->output)), - compile(env, WrapAST(test->expr, TextLiteral, .cord=test->expr->file->filename)), + CORD_quoted(test->output), + CORD_quoted(test->expr->file->filename), (int64_t)(test->expr->start - test->expr->file->text), (int64_t)(test->expr->end - test->expr->file->text)); } else if (expr_t->tag == VoidType || 
expr_t->tag == AbortType || expr_t->tag == ReturnType) { return CORD_asprintf( "test(({ %r; NULL; }), NULL, NULL, %r, %ld, %ld);", compile_statement(env, test->expr), - compile(env, WrapAST(test->expr, TextLiteral, .cord=test->expr->file->filename)), + CORD_quoted(test->expr->file->filename), (int64_t)(test->expr->start - test->expr->file->text), (int64_t)(test->expr->end - test->expr->file->text)); } else { return CORD_asprintf( "test(%r, %r, %r, %r, %ld, %ld);", test->expr->tag == Var ? CORD_all("&", compile(env, test->expr)) - : CORD_all("(", compile_type(expr_t), "[1]){", compile(env, test->expr), "}"), + : CORD_all("(", compile_type(expr_t), "[1]){", compile(env, test->expr), "}"), compile_type_info(env, expr_t), - compile(env, WrapAST(test->expr, TextLiteral, .cord=output)), - compile(env, WrapAST(test->expr, TextLiteral, .cord=test->expr->file->filename)), + CORD_quoted(output), + CORD_quoted(test->expr->file->filename), (int64_t)(test->expr->start - test->expr->file->text), (int64_t)(test->expr->end - test->expr->file->text)); } @@ -629,7 +630,7 @@ CORD compile_statement(env_t *env, ast_t *ast) auto def = Match(ast, LangDef); CORD_appendf(&env->code->typeinfos, "public const TypeInfo %r%s = {%zu, %zu, {.tag=TextInfo, .TextInfo={%r}}};\n", namespace_prefix(env->libname, env->namespace), def->name, sizeof(CORD), __alignof__(CORD), - Text$quoted(def->name, false)); + CORD_quoted(def->name)); compile_namespace(env, def->name, def->namespace); return CORD_EMPTY; } @@ -703,7 +704,7 @@ CORD compile_statement(env_t *env, ast_t *ast) "}\n"); env->code->funcs = CORD_cat(env->code->funcs, wrapper); } else if (fndef->cache && fndef->cache->tag == Int) { - int64_t cache_size = Int64$from_text(Match(fndef->cache, Int)->str, NULL); + int64_t cache_size = Int64$from_text(Text$from_str(Match(fndef->cache, Int)->str), NULL); const char *arg_type_name = heap_strf("%s$args", Match(fndef->name, Var)->name); ast_t *args_def = FakeAST(StructDef, .name=arg_type_name, 
.fields=fndef->args); prebind_statement(env, args_def); @@ -1314,7 +1315,7 @@ CORD compile_int_to_type(env_t *env, ast_t *ast, type_t *target) } int64_t target_bits = (int64_t)Match(target, IntType)->bits; - Int_t int_val = Int$from_text(Match(ast, Int)->str, NULL); + Int_t int_val = Int$from_text(Text$from_str(Match(ast, Int)->str), NULL); mpz_t i; mpz_init_set_int(i, int_val); @@ -1354,7 +1355,7 @@ CORD compile_arguments(env_t *env, ast_t *call_ast, arg_t *spec_args, arg_ast_t if (spec_arg->type->tag == IntType && call_arg->value->tag == Int) { value = compile_int_to_type(env, call_arg->value, spec_arg->type); } else if (spec_arg->type->tag == NumType && call_arg->value->tag == Int) { - Int_t int_val = Int$from_text(Match(call_arg->value, Int)->str, NULL); + Int_t int_val = Int$from_text(Text$from_str(Match(call_arg->value, Int)->str), NULL); double n = Int_to_Num(int_val); value = CORD_asprintf(Match(spec_arg->type, NumType)->bits == TYPE_NBITS64 ? "N64(%.20g)" : "N32(%.10g)", n); @@ -1382,7 +1383,7 @@ CORD compile_arguments(env_t *env, ast_t *call_ast, arg_t *spec_args, arg_ast_t if (spec_arg->type->tag == IntType && call_arg->value->tag == Int) { value = compile_int_to_type(env, call_arg->value, spec_arg->type); } else if (spec_arg->type->tag == NumType && call_arg->value->tag == Int) { - Int_t int_val = Int$from_text(Match(call_arg->value, Int)->str, NULL); + Int_t int_val = Int$from_text(Text$from_str(Match(call_arg->value, Int)->str), NULL); double n = Int_to_Num(int_val); value = CORD_asprintf(Match(spec_arg->type, NumType)->bits == TYPE_NBITS64 ? 
"N64(%.20g)" : "N32(%.10g)", n); @@ -1513,7 +1514,7 @@ CORD compile(env_t *env, ast_t *ast) } case Int: { const char *str = Match(ast, Int)->str; - Int_t int_val = Int$from_text(str, NULL); + Int_t int_val = Int$from_text(Text$from_str(str), NULL); mpz_t i; mpz_init_set_int(i, int_val); @@ -1780,8 +1781,8 @@ CORD compile(env_t *env, ast_t *ast) case TextLiteral: { CORD literal = Match(ast, TextLiteral)->cord; if (literal == CORD_EMPTY) - return "(CORD)CORD_EMPTY"; - CORD code = "(CORD)\""; + return "((Text_t){.length=0})"; + CORD code = "Text$from_str(\""; CORD_pos i; CORD_FOR(i, literal) { char c = CORD_pos_fetch(i); @@ -1803,7 +1804,7 @@ CORD compile(env_t *env, ast_t *ast) } } } - return CORD_cat_char(code, '"'); + return CORD_cat(code, "\")"); } case TextJoin: { const char *lang = Match(ast, TextJoin)->lang; @@ -1812,7 +1813,7 @@ CORD compile(env_t *env, ast_t *ast) code_err(ast, "%s is not a valid text language name", lang); ast_list_t *chunks = Match(ast, TextJoin)->children; if (!chunks) { - return "(CORD)CORD_EMPTY"; + return "((Text_t){.length=0})"; } else if (!chunks->next && chunks->ast->tag == TextLiteral) { return compile(env, chunks->ast); } else { @@ -1839,7 +1840,7 @@ CORD compile(env_t *env, ast_t *ast) if (chunk->next) code = CORD_cat(code, ", "); } if (chunks->next) - return CORD_all("CORD_all(", code, ")"); + return CORD_all("Text$concat(", code, ")"); else return code; } @@ -2447,7 +2448,8 @@ CORD compile(env_t *env, ast_t *ast) file_t *f = ast->file; return CORD_all("Table$get_value_or_fail(", self, ", ", compile_type(table->key_type), ", ", compile_type(table->value_type), ", ", compile_arguments(env, ast, arg_spec, call->args), ", ", compile_type_info(env, self_value_t), ", ", - Text$quoted(f->filename, false), ", ", CORD_asprintf("%ld", (int64_t)(ast->start - f->text)), ", ", + CORD_quoted(f->filename), ", ", + CORD_asprintf("%ld", (int64_t)(ast->start - f->text)), ", ", CORD_asprintf("%ld", (int64_t)(ast->end - f->text)), ")"); } @@ 
-2630,8 +2632,9 @@ CORD compile(env_t *env, ast_t *ast) } else { empty = FakeAST( InlineCCode, - CORD_asprintf("fail_source(%r, %ld, %ld, \"This collection was empty!\");\n", - Text$quoted(ast->file->filename, false), (long)(reduction->iter->start - reduction->iter->file->text), + CORD_asprintf("fail_source(%s, %ld, %ld, \"This collection was empty!\");\n", + CORD_quoted(ast->file->filename), + (long)(reduction->iter->start - reduction->iter->file->text), (long)(reduction->iter->end - reduction->iter->file->text))); } ast_t *item = FakeAST(Var, "$iter_value"); @@ -2785,7 +2788,8 @@ CORD compile(env_t *env, ast_t *ast) else return CORD_all("Array_get(", compile_type(item_type), ", ", arr, ", ", compile_int_to_type(env, indexing->index, Type(IntType, .bits=TYPE_IBITS64)), ", ", - Text$quoted(f->filename, false), ", ", CORD_asprintf("%ld", (int64_t)(indexing->index->start - f->text)), ", ", + CORD_quoted(f->filename), ", ", + CORD_asprintf("%ld", (int64_t)(indexing->index->start - f->text)), ", ", CORD_asprintf("%ld", (int64_t)(indexing->index->end - f->text)), ")"); } else { @@ -2935,15 +2939,15 @@ CORD compile_type_info(env_t *env, type_t *t) CORD sigil = ptr->is_stack ? "&" : "@"; if (ptr->is_readonly) sigil = CORD_cat(sigil, "%"); return CORD_asprintf("$PointerInfo(%r, %r, %s)", - Text$quoted(sigil, false), + CORD_quoted(sigil), compile_type_info(env, ptr->pointed), ptr->is_optional ? 
"yes" : "no"); } case FunctionType: { - return CORD_asprintf("$FunctionInfo(%r)", Text$quoted(type_to_cord(t), false)); + return CORD_asprintf("$FunctionInfo(%r)", CORD_quoted(type_to_cord(t))); } case ClosureType: { - return CORD_asprintf("$ClosureInfo(%r)", Text$quoted(type_to_cord(t), false)); + return CORD_asprintf("$ClosureInfo(%r)", CORD_quoted(type_to_cord(t))); } case TypeInfoType: return "&$TypeInfo"; case MemoryType: return "&$Memory"; @@ -2968,7 +2972,7 @@ CORD compile_cli_arg_call(env_t *env, CORD fn_name, type_t *fn_type) for (arg_t *arg = fn_info->args; arg; arg = arg->next) { usage = CORD_cat(usage, " "); type_t *t = get_arg_type(main_env, arg); - CORD flag = Text$replace(arg->name, "_", "-", I(-1)); + CORD flag = CORD_replace(arg->name, "_", "-"); if (arg->default_val) { if (t->tag == BoolType) usage = CORD_all(usage, "[--", flag, "]"); @@ -2983,7 +2987,7 @@ CORD compile_cli_arg_call(env_t *env, CORD fn_name, type_t *fn_type) usage = CORD_all(usage, "<", flag, ">"); } } - CORD code = CORD_all("CORD usage = CORD_all(\"Usage: \", argv[0], ", usage ? Text$quoted(usage, false) : "CORD_EMPTY", ");\n", + CORD code = CORD_all("CORD usage = CORD_all(\"Usage: \", argv[0], ", usage ? CORD_quoted(usage) : "CORD_EMPTY", ");\n", "#define USAGE_ERR(...) 
errx(1, CORD_to_const_char_star(CORD_all(__VA_ARGS__)))\n" "#define IS_FLAG(str, flag) (strncmp(str, flag, strlen(flag) == 0 && (str[strlen(flag)] == 0 || str[strlen(flag)] == '=')) == 0)\n"); @@ -3006,7 +3010,7 @@ CORD compile_cli_arg_call(env_t *env, CORD fn_name, type_t *fn_type) "if (strncmp(argv[i], \"--\", 2) != 0) {\n++i;\ncontinue;\n}\n"); for (arg_t *arg = fn_info->args; arg; arg = arg->next) { type_t *t = get_arg_type(main_env, arg); - CORD flag = Text$replace(arg->name, "_", "-", I(-1)); + CORD flag = CORD_replace(arg->name, "_", "-"); switch (t->tag) { case BoolType: { code = CORD_all(code, "else if (pop_flag(argv, &i, \"", flag, "\", &flag)) {\n" @@ -15,6 +15,7 @@ #include "ast.h" #include "builtins/integers.h" +#include "builtins/text.h" #include "builtins/table.h" #include "builtins/util.h" @@ -1894,7 +1895,7 @@ ast_t *parse_enum_def(parse_ctx_t *ctx, const char *pos) { spaces(&pos); if (match(&pos, "=")) { ast_t *val = expect(ctx, tag_start, &pos, parse_int, "I expected an integer literal after this '='"); - Int_t i = Int$from_text(Match(val, Int)->str, NULL); + Int_t i = Int$from_text(Text$from_str(Match(val, Int)->str), NULL); // TODO check for overflow next_value = (i.small >> 2); } @@ -208,7 +208,7 @@ static double ast_to_num(env_t *env, ast_t *ast) } } -static CORD obj_to_text(type_t *t, const void *obj, bool use_color) +static Text_t obj_to_text(type_t *t, const void *obj, bool use_color) { const TypeInfo *info = type_to_type_info(t); return generic_as_text(obj, use_color, info); @@ -272,8 +272,8 @@ void run(env_t *env, ast_t *ast) } else { void *value = GC_MALLOC(size); eval(env, doctest->expr, value); - CORD c = obj_to_text(t, value, true); - printf("= %s \x1b[2m: %T\x1b[m\n", CORD_to_const_char_star(c), t); + Text_t text = obj_to_text(t, value, true); + printf("= %k \x1b[2m: %T\x1b[m\n", &text, t); fflush(stdout); } break; @@ -353,11 +353,11 @@ void eval(env_t *env, ast_t *ast, void *dest) case Int: { if (!dest) return; switch (Match(ast, 
Int)->bits) { - case 0: *(Int_t*)dest = Int$from_text(Match(ast, Int)->str, NULL); break; - case 64: *(int64_t*)dest = Int64$from_text(Match(ast, Int)->str, NULL); break; - case 32: *(int32_t*)dest = Int32$from_text(Match(ast, Int)->str, NULL); break; - case 16: *(int16_t*)dest = Int16$from_text(Match(ast, Int)->str, NULL); break; - case 8: *(int8_t*)dest = Int8$from_text(Match(ast, Int)->str, NULL); break; + case 0: *(Int_t*)dest = Int$from_text(Text$from_str(Match(ast, Int)->str), NULL); break; + case 64: *(int64_t*)dest = Int64$from_text(Text$from_str(Match(ast, Int)->str), NULL); break; + case 32: *(int32_t*)dest = Int32$from_text(Text$from_str(Match(ast, Int)->str), NULL); break; + case 16: *(int16_t*)dest = Int16$from_text(Text$from_str(Match(ast, Int)->str), NULL); break; + case 8: *(int8_t*)dest = Int8$from_text(Text$from_str(Match(ast, Int)->str), NULL); break; default: errx(1, "Invalid int bits: %ld", Match(ast, Int)->bits); } break; @@ -386,7 +386,7 @@ void eval(env_t *env, ast_t *ast, void *dest) size_t chunk_size = type_size(chunk_t); char buf[chunk_size]; eval(env, chunk->ast, buf); - ret = CORD_cat(ret, obj_to_text(chunk_t, buf, false)); + ret = CORD_cat(ret, Text$as_c_string(obj_to_text(chunk_t, buf, false))); } } if (dest) *(CORD*)dest = ret; @@ -166,7 +166,7 @@ void compile_struct_def(env_t *env, ast_t *ast) } else { // If there are no fields, we can use an EmptyStruct typeinfo, which generates less code: CORD typeinfo = CORD_asprintf("public const TypeInfo %r = {%zu, %zu, {.tag=EmptyStruct, .EmptyStruct.name=%r}};\n", - full_name, type_size(t), type_align(t), Text$quoted(def->name, false)); + full_name, type_size(t), type_align(t), Text$quoted(Text$from_str(def->name), false)); env->code->typeinfos = CORD_all(env->code->typeinfos, typeinfo); } @@ -84,6 +84,8 @@ int main(int argc, char *argv[]) errx(1, "Couldn't set printf specifier"); if (register_printf_specifier('W', printf_ast, printf_pointer_size)) errx(1, "Couldn't set printf specifier"); + 
if (register_printf_specifier('k', printf_text, printf_text_size)) + errx(1, "Couldn't set printf specifier"); setenv("TOMO_IMPORT_PATH", "~/.local/src/tomo:.", 0); setenv("TOMO_LIB_PATH", "~/.local/lib/tomo:.", 0); diff --git a/typecheck.c b/typecheck.c index bb40666f..4d4c080e 100644 --- a/typecheck.c +++ b/typecheck.c @@ -9,11 +9,12 @@ #include <sys/stat.h> #include "ast.h" +#include "builtins/text.h" +#include "builtins/util.h" #include "environment.h" #include "parse.h" #include "typecheck.h" #include "types.h" -#include "builtins/util.h" type_t *parse_type_ast(env_t *env, type_ast_t *ast) { @@ -1367,7 +1368,7 @@ bool is_constant(env_t *env, ast_t *ast) case Int: { auto info = Match(ast, Int); if (info->bits == IBITS_UNSPECIFIED) { - Int_t int_val = Int$from_text(info->str, NULL); + Int_t int_val = Int$from_text(Text$from_str(info->str), NULL); mpz_t i; mpz_init_set_int(i, int_val); return (mpz_cmpabs_ui(i, BIGGEST_SMALL_INT) <= 0); |
