From c455e7b67d2e55e6ed03e3449203d4e307f5a7dd Mon Sep 17 00:00:00 2001 From: Bruce Hill Date: Fri, 13 Sep 2024 20:18:08 -0400 Subject: Rename builtins/ -> stdlib/ --- Makefile | 12 +- ast.c | 4 +- ast.h | 4 +- builtins/arrays.c | 684 ---------------------- builtins/arrays.h | 103 ---- builtins/bools.c | 54 -- builtins/bools.h | 22 - builtins/c_strings.c | 55 -- builtins/c_strings.h | 18 - builtins/channels.c | 137 ----- builtins/channels.h | 28 - builtins/datatypes.h | 91 --- builtins/files.c | 322 ----------- builtins/files.h | 35 -- builtins/functiontype.c | 35 -- builtins/functiontype.h | 9 - builtins/integers.c | 490 ---------------- builtins/integers.h | 375 ------------ builtins/memory.c | 28 - builtins/memory.h | 13 - builtins/metamethods.c | 124 ---- builtins/metamethods.h | 15 - builtins/nums.c | 178 ------ builtins/nums.h | 60 -- builtins/optionals.c | 73 --- builtins/optionals.h | 23 - builtins/paths.c | 481 ---------------- builtins/paths.h | 50 -- builtins/patterns.c | 1064 ---------------------------------- builtins/patterns.h | 33 -- builtins/pointers.c | 84 --- builtins/pointers.h | 19 - builtins/ranges.c | 63 -- builtins/ranges.h | 10 - builtins/shell.c | 67 --- builtins/shell.h | 25 - builtins/siphash-internals.h | 127 ---- builtins/siphash.c | 77 --- builtins/siphash.h | 13 - builtins/stdlib.c | 274 --------- builtins/stdlib.h | 34 -- builtins/tables.c | 636 --------------------- builtins/tables.h | 84 --- builtins/text.c | 1302 ------------------------------------------ builtins/text.h | 67 --- builtins/threads.c | 55 -- builtins/threads.h | 20 - builtins/tomo.h | 34 -- builtins/types.c | 38 -- builtins/types.h | 86 --- builtins/util.c | 28 - builtins/util.h | 66 --- compile.c | 6 +- compile.h | 2 +- cordhelpers.c | 2 +- enums.c | 4 +- environment.c | 6 +- environment.h | 2 +- parse.c | 8 +- repl.c | 4 +- stdlib/arrays.c | 684 ++++++++++++++++++++++ stdlib/arrays.h | 103 ++++ stdlib/bools.c | 54 ++ stdlib/bools.h | 22 + stdlib/c_strings.c | 55 
++ stdlib/c_strings.h | 18 + stdlib/channels.c | 137 +++++ stdlib/channels.h | 28 + stdlib/datatypes.h | 91 +++ stdlib/files.c | 322 +++++++++++ stdlib/files.h | 35 ++ stdlib/functiontype.c | 35 ++ stdlib/functiontype.h | 9 + stdlib/integers.c | 490 ++++++++++++++++ stdlib/integers.h | 375 ++++++++++++ stdlib/memory.c | 28 + stdlib/memory.h | 13 + stdlib/metamethods.c | 124 ++++ stdlib/metamethods.h | 15 + stdlib/nums.c | 178 ++++++ stdlib/nums.h | 60 ++ stdlib/optionals.c | 73 +++ stdlib/optionals.h | 23 + stdlib/paths.c | 481 ++++++++++++++++ stdlib/paths.h | 50 ++ stdlib/patterns.c | 1064 ++++++++++++++++++++++++++++++++++ stdlib/patterns.h | 33 ++ stdlib/pointers.c | 84 +++ stdlib/pointers.h | 19 + stdlib/ranges.c | 63 ++ stdlib/ranges.h | 10 + stdlib/shell.c | 67 +++ stdlib/shell.h | 25 + stdlib/siphash-internals.h | 127 ++++ stdlib/siphash.c | 77 +++ stdlib/siphash.h | 13 + stdlib/stdlib.c | 274 +++++++++ stdlib/stdlib.h | 34 ++ stdlib/tables.c | 636 +++++++++++++++++++++ stdlib/tables.h | 84 +++ stdlib/text.c | 1302 ++++++++++++++++++++++++++++++++++++++++++ stdlib/text.h | 67 +++ stdlib/threads.c | 55 ++ stdlib/threads.h | 20 + stdlib/tomo.h | 34 ++ stdlib/types.c | 38 ++ stdlib/types.h | 86 +++ stdlib/util.c | 28 + stdlib/util.h | 66 +++ structs.c | 4 +- tomo.c | 6 +- typecheck.c | 4 +- types.c | 6 +- types.h | 2 +- 114 files changed, 7847 insertions(+), 7847 deletions(-) delete mode 100644 builtins/arrays.c delete mode 100644 builtins/arrays.h delete mode 100644 builtins/bools.c delete mode 100644 builtins/bools.h delete mode 100644 builtins/c_strings.c delete mode 100644 builtins/c_strings.h delete mode 100644 builtins/channels.c delete mode 100644 builtins/channels.h delete mode 100644 builtins/datatypes.h delete mode 100644 builtins/files.c delete mode 100644 builtins/files.h delete mode 100644 builtins/functiontype.c delete mode 100644 builtins/functiontype.h delete mode 100644 builtins/integers.c delete mode 100644 builtins/integers.h delete mode 
100644 builtins/memory.c delete mode 100644 builtins/memory.h delete mode 100644 builtins/metamethods.c delete mode 100644 builtins/metamethods.h delete mode 100644 builtins/nums.c delete mode 100644 builtins/nums.h delete mode 100644 builtins/optionals.c delete mode 100644 builtins/optionals.h delete mode 100644 builtins/paths.c delete mode 100644 builtins/paths.h delete mode 100644 builtins/patterns.c delete mode 100644 builtins/patterns.h delete mode 100644 builtins/pointers.c delete mode 100644 builtins/pointers.h delete mode 100644 builtins/ranges.c delete mode 100644 builtins/ranges.h delete mode 100644 builtins/shell.c delete mode 100644 builtins/shell.h delete mode 100644 builtins/siphash-internals.h delete mode 100644 builtins/siphash.c delete mode 100644 builtins/siphash.h delete mode 100644 builtins/stdlib.c delete mode 100644 builtins/stdlib.h delete mode 100644 builtins/tables.c delete mode 100644 builtins/tables.h delete mode 100644 builtins/text.c delete mode 100644 builtins/text.h delete mode 100644 builtins/threads.c delete mode 100644 builtins/threads.h delete mode 100644 builtins/tomo.h delete mode 100644 builtins/types.c delete mode 100644 builtins/types.h delete mode 100644 builtins/util.c delete mode 100644 builtins/util.h create mode 100644 stdlib/arrays.c create mode 100644 stdlib/arrays.h create mode 100644 stdlib/bools.c create mode 100644 stdlib/bools.h create mode 100644 stdlib/c_strings.c create mode 100644 stdlib/c_strings.h create mode 100644 stdlib/channels.c create mode 100644 stdlib/channels.h create mode 100644 stdlib/datatypes.h create mode 100644 stdlib/files.c create mode 100644 stdlib/files.h create mode 100644 stdlib/functiontype.c create mode 100644 stdlib/functiontype.h create mode 100644 stdlib/integers.c create mode 100644 stdlib/integers.h create mode 100644 stdlib/memory.c create mode 100644 stdlib/memory.h create mode 100644 stdlib/metamethods.c create mode 100644 stdlib/metamethods.h create mode 100644 stdlib/nums.c 
create mode 100644 stdlib/nums.h create mode 100644 stdlib/optionals.c create mode 100644 stdlib/optionals.h create mode 100644 stdlib/paths.c create mode 100644 stdlib/paths.h create mode 100644 stdlib/patterns.c create mode 100644 stdlib/patterns.h create mode 100644 stdlib/pointers.c create mode 100644 stdlib/pointers.h create mode 100644 stdlib/ranges.c create mode 100644 stdlib/ranges.h create mode 100644 stdlib/shell.c create mode 100644 stdlib/shell.h create mode 100644 stdlib/siphash-internals.h create mode 100644 stdlib/siphash.c create mode 100644 stdlib/siphash.h create mode 100644 stdlib/stdlib.c create mode 100644 stdlib/stdlib.h create mode 100644 stdlib/tables.c create mode 100644 stdlib/tables.h create mode 100644 stdlib/text.c create mode 100644 stdlib/text.h create mode 100644 stdlib/threads.c create mode 100644 stdlib/threads.h create mode 100644 stdlib/tomo.h create mode 100644 stdlib/types.c create mode 100644 stdlib/types.h create mode 100644 stdlib/util.c create mode 100644 stdlib/util.h diff --git a/Makefile b/Makefile index 8cbfef32..d4694d53 100644 --- a/Makefile +++ b/Makefile @@ -28,10 +28,10 @@ O=-Og CFLAGS=$(CCONFIG) $(EXTRA) $(CWARN) $(G) $(O) $(OSFLAGS) CFLAGS_PLACEHOLDER="$$(echo -e '\033[2m\033[m')" LDLIBS=-lgc -lcord -lm -lunistring -lgmp -ldl -BUILTIN_OBJS=builtins/siphash.o builtins/arrays.o builtins/bools.o builtins/channels.o builtins/nums.o builtins/integers.o \ - builtins/pointers.o builtins/memory.o builtins/text.o builtins/threads.o builtins/c_strings.o builtins/tables.o \ - builtins/types.o builtins/util.o builtins/files.o builtins/ranges.o builtins/shell.o builtins/paths.o \ - builtins/optionals.o builtins/patterns.o builtins/metamethods.o builtins/functiontype.o builtins/stdlib.o +BUILTIN_OBJS=stdlib/siphash.o stdlib/arrays.o stdlib/bools.o stdlib/channels.o stdlib/nums.o stdlib/integers.o \ + stdlib/pointers.o stdlib/memory.o stdlib/text.o stdlib/threads.o stdlib/c_strings.o stdlib/tables.o \ + stdlib/types.o 
stdlib/util.o stdlib/files.o stdlib/ranges.o stdlib/shell.o stdlib/paths.o \ + stdlib/optionals.o stdlib/patterns.o stdlib/metamethods.o stdlib/functiontype.o stdlib/stdlib.o TESTS=$(patsubst %.tm,%.tm.testresult,$(wildcard test/*.tm)) all: libtomo.so tomo @@ -63,14 +63,14 @@ test: $(TESTS) @echo -e '\x1b[32;7m ALL TESTS PASSED! \x1b[m' clean: - rm -f tomo *.o builtins/*.o libtomo.so test/*.tm.{c,h,o,testresult} examples/*.tm.*{c,h,o} + rm -f tomo *.o stdlib/*.o libtomo.so test/*.tm.{c,h,o,testresult} examples/*.tm.*{c,h,o} %: %.md pandoc --lua-filter=.pandoc/bold-code.lua -s $< -t man -o $@ install: tomo libtomo.so tomo.1 mkdir -p -m 755 "$(PREFIX)/man/man1" "$(PREFIX)/bin" "$(PREFIX)/include/tomo" "$(PREFIX)/lib" "$(PREFIX)/share/tomo/modules" - cp -v builtins/*.h "$(PREFIX)/include/tomo/" + cp -v stdlib/*.h "$(PREFIX)/include/tomo/" cp -v libtomo.so "$(PREFIX)/lib/" rm -f "$(PREFIX)/bin/tomo" cp -v tomo "$(PREFIX)/bin/" diff --git a/ast.c b/ast.c index aa8b1b50..f00ae5c1 100644 --- a/ast.c +++ b/ast.c @@ -5,8 +5,8 @@ #include #include "ast.h" -#include "builtins/integers.h" -#include "builtins/text.h" +#include "stdlib/integers.h" +#include "stdlib/text.h" #include "cordhelpers.h" static const char *OP_NAMES[] = { diff --git a/ast.h b/ast.h index 198e3d80..55ac28fb 100644 --- a/ast.h +++ b/ast.h @@ -8,8 +8,8 @@ #include #include -#include "builtins/files.h" -#include "builtins/util.h" +#include "stdlib/files.h" +#include "stdlib/util.h" #define NewAST(_file, _start, _end, ast_tag, ...) 
(new(ast_t, .file=_file, .start=_start, .end=_end,\ .tag=ast_tag, .__data.ast_tag={__VA_ARGS__})) diff --git a/builtins/arrays.c b/builtins/arrays.c deleted file mode 100644 index 58a33754..00000000 --- a/builtins/arrays.c +++ /dev/null @@ -1,684 +0,0 @@ -// Functions that operate on arrays - -#include -#include -#include -#include - -#include "arrays.h" -#include "metamethods.h" -#include "optionals.h" -#include "tables.h" -#include "text.h" -#include "util.h" - -// Use inline version of siphash code: -#include "siphash.h" -#include "siphash-internals.h" - -PUREFUNC static inline int64_t get_padded_item_size(const TypeInfo *info) -{ - int64_t size = info->ArrayInfo.item->size; - if (info->ArrayInfo.item->align > 1 && size % info->ArrayInfo.item->align) - size += info->ArrayInfo.item->align - (size % info->ArrayInfo.item->align); // padding - return size; -} - -// Replace the array's .data pointer with a new pointer to a copy of the -// data that is compacted and has a stride of exactly `padded_item_size` -public void Array$compact(Array_t *arr, int64_t padded_item_size) -{ - void *copy = NULL; - if (arr->length > 0) { - copy = arr->atomic ? 
GC_MALLOC_ATOMIC((size_t)arr->length * (size_t)padded_item_size) - : GC_MALLOC((size_t)arr->length * (size_t)padded_item_size); - if ((int64_t)arr->stride == padded_item_size) { - memcpy(copy, arr->data, (size_t)arr->length * (size_t)padded_item_size); - } else { - for (int64_t i = 0; i < arr->length; i++) - memcpy(copy + i*padded_item_size, arr->data + arr->stride*i, (size_t)padded_item_size); - } - } - *arr = (Array_t){ - .data=copy, - .length=arr->length, - .stride=padded_item_size, - .atomic=arr->atomic, - }; -} - -public void Array$insert(Array_t *arr, const void *item, Int_t int_index, int64_t padded_item_size) -{ - int64_t index = Int_to_Int64(int_index, false); - if (index <= 0) index = arr->length + index + 1; - - if (index < 1) index = 1; - else if (index > (int64_t)arr->length + 1) - fail("Invalid insertion index %ld for an array with length %ld", index, arr->length); - - if (!arr->data) { - arr->free = 4; - arr->data = arr->atomic ? GC_MALLOC_ATOMIC((size_t)arr->free * (size_t)padded_item_size) - : GC_MALLOC((size_t)arr->free * (size_t)padded_item_size); - arr->stride = padded_item_size; - } else if (arr->free < 1 || arr->data_refcount != 0 || (int64_t)arr->stride != padded_item_size) { - arr->free = MIN(ARRAY_MAX_FREE_ENTRIES, MAX(8, arr->length/4)); - void *copy = arr->atomic ? 
GC_MALLOC_ATOMIC((size_t)(arr->length + arr->free) * (size_t)padded_item_size) - : GC_MALLOC((size_t)(arr->length + arr->free) * (size_t)padded_item_size); - for (int64_t i = 0; i < index-1; i++) - memcpy(copy + i*padded_item_size, arr->data + arr->stride*i, (size_t)padded_item_size); - for (int64_t i = index-1; i < (int64_t)arr->length; i++) - memcpy(copy + (i+1)*padded_item_size, arr->data + arr->stride*i, (size_t)padded_item_size); - arr->data = copy; - arr->data_refcount = 0; - arr->stride = padded_item_size; - } else { - if (index != arr->length+1) - memmove( - arr->data + index*padded_item_size, - arr->data + (index-1)*padded_item_size, - (size_t)((arr->length - index + 1)*padded_item_size)); - } - assert(arr->free > 0); - --arr->free; - ++arr->length; - memcpy((void*)arr->data + (index-1)*padded_item_size, item, (size_t)padded_item_size); -} - -public void Array$insert_all(Array_t *arr, Array_t to_insert, Int_t int_index, int64_t padded_item_size) -{ - int64_t index = Int_to_Int64(int_index, false); - if (to_insert.length == 0) - return; - - if (!arr->data) { - *arr = to_insert; - ARRAY_INCREF(*arr); - return; - } - - if (index < 1) index = arr->length + index + 1; - - if (index < 1) index = 1; - else if (index > (int64_t)arr->length + 1) - fail("Invalid insertion index %ld for an array with length %ld", index, arr->length); - - if ((int64_t)arr->free >= (int64_t)to_insert.length // Adequate free space - && arr->data_refcount == 0 // Not aliased memory - && (int64_t)arr->stride == padded_item_size) { // Contiguous array - // If we can fit this within the array's preallocated free space, do that: - arr->free -= to_insert.length; - arr->length += to_insert.length; - if (index != arr->length+1) - memmove((void*)arr->data + index*padded_item_size, - arr->data + (index-1)*padded_item_size, - (size_t)((arr->length - index + to_insert.length-1)*padded_item_size)); - for (int64_t i = 0; i < to_insert.length; i++) - memcpy((void*)arr->data + (index-1 + 
i)*padded_item_size, - to_insert.data + i*to_insert.stride, (size_t)padded_item_size); - } else { - // Otherwise, allocate a new chunk of memory for the array and populate it: - int64_t new_len = arr->length + to_insert.length; - arr->free = MIN(ARRAY_MAX_FREE_ENTRIES, MAX(8, new_len/4)); - void *data = arr->atomic ? GC_MALLOC_ATOMIC((size_t)((new_len + arr->free) * padded_item_size)) - : GC_MALLOC((size_t)((new_len + arr->free) * padded_item_size)); - void *p = data; - - // Copy first chunk of `arr` if needed: - if (index > 1) { - if (arr->stride == padded_item_size) { - p = mempcpy(p, arr->data, (size_t)((index-1)*padded_item_size)); - } else { - for (int64_t i = 0; i < index-1; i++) - p = mempcpy(p, arr->data + arr->stride*i, (size_t)padded_item_size); - } - } - - // Copy `to_insert` - if (to_insert.stride == padded_item_size) { - p = mempcpy(p, to_insert.data, (size_t)(to_insert.length*padded_item_size)); - } else { - for (int64_t i = 0; i < index-1; i++) - p = mempcpy(p, to_insert.data + to_insert.stride*i, (size_t)padded_item_size); - } - - // Copy last chunk of `arr` if needed: - if (index < arr->length + 1) { - if (arr->stride == padded_item_size) { - p = mempcpy(p, arr->data + padded_item_size*(index-1), (size_t)((arr->length - index + 1)*padded_item_size)); - } else { - for (int64_t i = index-1; i < arr->length-1; i++) - p = mempcpy(p, arr->data + arr->stride*i, (size_t)padded_item_size); - } - } - arr->length = new_len; - arr->stride = padded_item_size; - arr->data = data; - arr->data_refcount = 0; - } -} - -public void Array$remove_at(Array_t *arr, Int_t int_index, Int_t int_count, int64_t padded_item_size) -{ - int64_t index = Int_to_Int64(int_index, false); - if (index < 1) index = arr->length + index + 1; - - int64_t count = Int_to_Int64(int_count, false); - if (index < 1 || index > (int64_t)arr->length || count < 1) return; - - if (count > arr->length - index + 1) - count = (arr->length - index) + 1; - - if (index == 1) { - arr->data += arr->stride 
* count; - } else if (index + count > arr->length) { - if (arr->free >= 0) - arr->free += count; - } else if (arr->data_refcount != 0 || (int64_t)arr->stride != padded_item_size) { - void *copy = arr->atomic ? GC_MALLOC_ATOMIC((size_t)((arr->length-1) * padded_item_size)) - : GC_MALLOC((size_t)((arr->length-1) * padded_item_size)); - for (int64_t src = 1, dest = 1; src <= (int64_t)arr->length; src++) { - if (src < index || src >= index + count) { - memcpy(copy + (dest - 1)*padded_item_size, arr->data + arr->stride*(src - 1), (size_t)padded_item_size); - ++dest; - } - } - arr->data = copy; - arr->free = 0; - arr->data_refcount = 0; - } else { - memmove((void*)arr->data + (index-1)*padded_item_size, arr->data + (index-1 + count)*padded_item_size, - (size_t)((arr->length - index + count - 1)*padded_item_size)); - arr->free += count; - } - arr->length -= count; - if (arr->length == 0) arr->data = NULL; -} - -public void Array$remove_item(Array_t *arr, void *item, Int_t max_removals, const TypeInfo *type) -{ - int64_t padded_item_size = get_padded_item_size(type); - const Int_t ZERO = (Int_t){.small=(0<<2)|1}; - const Int_t ONE = (Int_t){.small=(1<<2)|1}; - const TypeInfo *item_type = type->ArrayInfo.item; - for (int64_t i = 0; i < arr->length; ) { - if (max_removals.small == ZERO.small) // zero - break; - - if (generic_equal(item, arr->data + i*arr->stride, item_type)) { - Array$remove_at(arr, I(i+1), ONE, padded_item_size); - max_removals = Int$minus(max_removals, ONE); - } else { - i++; - } - } -} - -public Int_t Array$find(Array_t arr, void *item, const TypeInfo *type) -{ - const TypeInfo *item_type = type->ArrayInfo.item; - for (int64_t i = 0; i < arr.length; i++) { - if (generic_equal(item, arr.data + i*arr.stride, item_type)) - return I(i+1); - } - return NULL_INT; -} - -public Int_t Array$first(Array_t arr, Closure_t predicate) -{ - bool (*is_good)(void*, void*) = (void*)predicate.fn; - for (int64_t i = 0; i < arr.length; i++) { - if (is_good(arr.data + 
i*arr.stride, predicate.userdata)) - return I(i+1); - } - return NULL_INT; -} - -public void Array$sort(Array_t *arr, Closure_t comparison, int64_t padded_item_size) -{ - if (arr->data_refcount != 0 || (int64_t)arr->stride != padded_item_size) - Array$compact(arr, padded_item_size); - - qsort_r(arr->data, (size_t)arr->length, (size_t)padded_item_size, comparison.fn, comparison.userdata); -} - -public Array_t Array$sorted(Array_t arr, Closure_t comparison, int64_t padded_item_size) -{ - Array$compact(&arr, padded_item_size); - qsort_r(arr.data, (size_t)arr.length, (size_t)padded_item_size, comparison.fn, comparison.userdata); - return arr; -} - -#pragma GCC diagnostic ignored "-Wstack-protector" -public void Array$shuffle(Array_t *arr, int64_t padded_item_size) -{ - if (arr->data_refcount != 0 || (int64_t)arr->stride != padded_item_size) - Array$compact(arr, padded_item_size); - - char tmp[padded_item_size]; - for (int64_t i = arr->length-1; i > 1; i--) { - int64_t j = arc4random_uniform(i+1); - memcpy(tmp, arr->data + i*padded_item_size, (size_t)padded_item_size); - memcpy((void*)arr->data + i*padded_item_size, arr->data + j*padded_item_size, (size_t)padded_item_size); - memcpy((void*)arr->data + j*padded_item_size, tmp, (size_t)padded_item_size); - } -} - -public Array_t Array$shuffled(Array_t arr, int64_t padded_item_size) -{ - Array$compact(&arr, padded_item_size); - Array$shuffle(&arr, padded_item_size); - return arr; -} - -public void *Array$random(Array_t arr) -{ - if (arr.length == 0) - return NULL; // fail("Cannot get a random item from an empty array!"); - int64_t index = arc4random_uniform(arr.length); - return arr.data + arr.stride*index; -} - -public Table_t Array$counts(Array_t arr, const TypeInfo *type) -{ - Table_t counts = {}; - const TypeInfo count_type = {.size=sizeof(Table_t), .align=__alignof__(Table_t), - .tag=TableInfo, .TableInfo.key=type->ArrayInfo.item, .TableInfo.value=&Int$info}; - for (int64_t i = 0; i < arr.length; i++) { - void *key = 
arr.data + i*arr.stride; - int64_t *count = Table$get(counts, key, &count_type); - int64_t val = count ? *count + 1 : 1; - Table$set(&counts, key, &val, &count_type); - } - return counts; -} - -public Array_t Array$sample(Array_t arr, Int_t int_n, Array_t weights, int64_t padded_item_size) -{ - int64_t n = Int_to_Int64(int_n, false); - if (arr.length == 0 || n <= 0) - return (Array_t){}; - - Array_t selected = { - .data=arr.atomic ? GC_MALLOC_ATOMIC((size_t)(n * padded_item_size)) : GC_MALLOC((size_t)(n * padded_item_size)), - .length=n, - .stride=padded_item_size, .atomic=arr.atomic}; - - double total = 0.0; - for (int64_t i = 0; i < weights.length && i < arr.length; i++) { - double weight = *(double*)(weights.data + weights.stride*i); - if (isinf(weight)) - fail("Infinite weight!"); - else if (isnan(weight)) - fail("NaN weight!"); - else if (weight < 0.0) - fail("Negative weight!"); - else - total += weight; - } - - if (isinf(total)) - fail("Sample weights have overflowed to infinity"); - - if (total == 0.0) { - for (int64_t i = 0; i < n; i++) { - int64_t index = arc4random_uniform(arr.length); - memcpy(selected.data + i*padded_item_size, arr.data + arr.stride*index, (size_t)padded_item_size); - } - } else { - double inverse_average = (double)arr.length / total; - - struct { - int64_t alias; - double odds; - } aliases[arr.length] = {}; - - for (int64_t i = 0; i < arr.length; i++) { - double weight = i >= weights.length ? 
0.0 : *(double*)(weights.data + weights.stride*i); - aliases[i].odds = weight * inverse_average; - aliases[i].alias = -1; - } - - int64_t small = 0; - for (int64_t big = 0; big < arr.length; big++) { - while (aliases[big].odds >= 1.0) { - while (small < arr.length && (aliases[small].odds >= 1.0 || aliases[small].alias != -1)) - ++small; - - if (small >= arr.length) { - aliases[big].odds = 1.0; - aliases[big].alias = big; - break; - } - - aliases[small].alias = big; - aliases[big].odds = (aliases[small].odds + aliases[big].odds) - 1.0; - } - if (big < small) small = big; - } - - for (int64_t i = small; i < arr.length; i++) - if (aliases[i].alias == -1) - aliases[i].alias = i; - - for (int64_t i = 0; i < n; i++) { - double r = drand48() * arr.length; - int64_t index = (int64_t)r; - if ((r - (double)index) > aliases[index].odds) - index = aliases[index].alias; - memcpy(selected.data + i*selected.stride, arr.data + index*arr.stride, (size_t)padded_item_size); - } - } - return selected; -} - -public Array_t Array$from(Array_t array, Int_t int_first) -{ - int64_t first = Int_to_Int64(int_first, false); - if (first < 0) - first = array.length + first + 1; - - if (first < 1 || first > array.length) - return (Array_t){.atomic=array.atomic}; - - return (Array_t){ - .atomic=array.atomic, - .data=array.data + array.stride*(first-1), - .length=array.length - first + 1, - .stride=array.stride, - .data_refcount=array.data_refcount, - }; -} - -public Array_t Array$to(Array_t array, Int_t int_last) -{ - int64_t last = Int_to_Int64(int_last, false); - if (last < 0) - last = array.length + last + 1; - - if (last > array.length) - last = array.length; - - if (last == 0) - return (Array_t){.atomic=array.atomic}; - - return (Array_t){ - .atomic=array.atomic, - .data=array.data, - .length=last, - .stride=array.stride, - .data_refcount=array.data_refcount, - }; -} - -public Array_t Array$by(Array_t array, Int_t int_stride, int64_t padded_item_size) -{ - int64_t stride = 
Int_to_Int64(int_stride, false); - // In the unlikely event that the stride value would be too large to fit in - // a 15-bit integer, fall back to creating a copy of the array: - if (__builtin_expect(array.stride*stride < ARRAY_MIN_STRIDE || array.stride*stride > ARRAY_MAX_STRIDE, 0)) { - void *copy = NULL; - int64_t len = (stride < 0 ? array.length / -stride : array.length / stride) + ((array.length % stride) != 0); - if (len > 0) { - copy = array.atomic ? GC_MALLOC_ATOMIC((size_t)(len * padded_item_size)) : GC_MALLOC((size_t)(len * padded_item_size)); - void *start = (stride < 0 ? array.data + (array.stride * (array.length - 1)) : array.data); - for (int64_t i = 0; i < len; i++) - memcpy(copy + i*padded_item_size, start + array.stride*stride*i, (size_t)padded_item_size); - } - return (Array_t){ - .data=copy, - .length=len, - .stride=padded_item_size, - .atomic=array.atomic, - }; - } - - if (stride == 0) - return (Array_t){.atomic=array.atomic}; - - return (Array_t){ - .atomic=array.atomic, - .data=(stride < 0 ? array.data + (array.stride * (array.length - 1)) : array.data), - .length=(stride < 0 ? array.length / -stride : array.length / stride) + ((array.length % stride) != 0), - .stride=array.stride * stride, - .data_refcount=array.data_refcount, - }; -} - -public Array_t Array$reversed(Array_t array, int64_t padded_item_size) -{ - // Just in case negating the stride gives a value that doesn't fit into a - // 15-bit integer, fall back to Array$by()'s more general method of copying - // the array. This should only happen if array.stride is MIN_STRIDE to - // begin with (very unlikely). 
- if (__builtin_expect(-array.stride < ARRAY_MIN_STRIDE || -array.stride > ARRAY_MAX_STRIDE, 0)) - return Array$by(array, I(-1), padded_item_size); - - Array_t reversed = array; - reversed.stride = -array.stride; - reversed.data = array.data + (array.length-1)*array.stride; - return reversed; -} - -public Array_t Array$concat(Array_t x, Array_t y, int64_t padded_item_size) -{ - void *data = x.atomic ? GC_MALLOC_ATOMIC((size_t)(padded_item_size*(x.length + y.length))) - : GC_MALLOC((size_t)(padded_item_size*(x.length + y.length))); - if (x.stride == padded_item_size) { - memcpy(data, x.data, (size_t)(padded_item_size*x.length)); - } else { - for (int64_t i = 0; i < x.length; i++) - memcpy(data + i*padded_item_size, x.data + i*padded_item_size, (size_t)padded_item_size); - } - - if (y.stride == padded_item_size) { - memcpy(data + padded_item_size*x.length, y.data, (size_t)(padded_item_size*y.length)); - } else { - for (int64_t i = 0; i < x.length; i++) - memcpy(data + (x.length + i)*padded_item_size, y.data + i*padded_item_size, (size_t)padded_item_size); - } - - return (Array_t){ - .data=data, - .length=x.length + y.length, - .stride=padded_item_size, - .atomic=x.atomic, - }; -} - -public bool Array$has(Array_t array, void *item, const TypeInfo *type) -{ - const TypeInfo *item_type = type->ArrayInfo.item; - for (int64_t i = 0; i < array.length; i++) { - if (generic_equal(array.data + i*array.stride, item, item_type)) - return true; - } - return false; -} - -public void Array$clear(Array_t *array) -{ - *array = (Array_t){.data=0, .length=0}; -} - -public int32_t Array$compare(const Array_t *x, const Array_t *y, const TypeInfo *type) -{ - // Early out for arrays with the same data, e.g. 
two copies of the same array: - if (x->data == y->data && x->stride == y->stride) - return (x->length > y->length) - (x->length < y->length); - - const TypeInfo *item = type->ArrayInfo.item; - if (item->tag == PointerInfo || (item->tag == CustomInfo && item->CustomInfo.compare == NULL)) { // data comparison - int64_t item_padded_size = type->ArrayInfo.item->size; - if (type->ArrayInfo.item->align > 1 && item_padded_size % type->ArrayInfo.item->align) - item_padded_size += type->ArrayInfo.item->align - (item_padded_size % type->ArrayInfo.item->align); // padding - - if ((int64_t)x->stride == item_padded_size && (int64_t)y->stride == item_padded_size && item->size == item_padded_size) { - int32_t cmp = (int32_t)memcmp(x->data, y->data, (size_t)(MIN(x->length, y->length)*item_padded_size)); - if (cmp != 0) return cmp; - } else { - for (int32_t i = 0, len = MIN(x->length, y->length); i < len; i++) { - int32_t cmp = (int32_t)memcmp(x->data+ x->stride*i, y->data + y->stride*i, (size_t)(item->size)); - if (cmp != 0) return cmp; - } - } - } else { - for (int32_t i = 0, len = MIN(x->length, y->length); i < len; i++) { - int32_t cmp = generic_compare(x->data + x->stride*i, y->data + y->stride*i, item); - if (cmp != 0) return cmp; - } - } - return (x->length > y->length) - (x->length < y->length); -} - -public bool Array$equal(const Array_t *x, const Array_t *y, const TypeInfo *type) -{ - return x == y || (x->length == y->length && Array$compare(x, y, type) == 0); -} - -public Text_t Array$as_text(const Array_t *arr, bool colorize, const TypeInfo *type) -{ - if (!arr) - return Text$concat(Text("["), generic_as_text(NULL, false, type->ArrayInfo.item), Text("]")); - - const TypeInfo *item_type = type->ArrayInfo.item; - Text_t text = Text("["); - for (int64_t i = 0; i < arr->length; i++) { - if (i > 0) - text = Text$concat(text, Text(", ")); - Text_t item_text = generic_as_text(arr->data + i*arr->stride, colorize, item_type); - text = Text$concat(text, item_text); - } - text = 
Text$concat(text, Text("]")); - return text; -} - -public uint64_t Array$hash(const Array_t *arr, const TypeInfo *type) -{ - const TypeInfo *item = type->ArrayInfo.item; - siphash sh; - siphashinit(&sh, sizeof(uint64_t[arr->length])); - if (item->tag == PointerInfo || (item->tag == CustomInfo && item->CustomInfo.hash == NULL && item->size == sizeof(void*))) { // Raw data hash - for (int64_t i = 0; i < arr->length; i++) - siphashadd64bits(&sh, (uint64_t)(arr->data + i*arr->stride)); - } else { - for (int64_t i = 0; i < arr->length; i++) { - uint64_t item_hash = generic_hash(arr->data + i*arr->stride, item); - siphashadd64bits(&sh, item_hash); - } - } - return siphashfinish_last_part(&sh, 0); -} - -#pragma GCC diagnostic ignored "-Wstack-protector" -static void siftdown(Array_t *heap, int64_t startpos, int64_t pos, Closure_t comparison, int64_t padded_item_size) -{ - assert(pos > 0 && pos < heap->length); - char newitem[padded_item_size]; - memcpy(newitem, heap->data + heap->stride*pos, (size_t)(padded_item_size)); - while (pos > startpos) { - int64_t parentpos = (pos - 1) >> 1; - typedef int32_t (*cmp_fn_t)(void*, void*, void*); - int32_t cmp = ((cmp_fn_t)comparison.fn)(newitem, heap->data + heap->stride*parentpos, comparison.userdata); - if (cmp >= 0) - break; - - memcpy(heap->data + heap->stride*pos, heap->data + heap->stride*parentpos, (size_t)(padded_item_size)); - pos = parentpos; - } - memcpy(heap->data + heap->stride*pos, newitem, (size_t)(padded_item_size)); -} - -static void siftup(Array_t *heap, int64_t pos, Closure_t comparison, int64_t padded_item_size) -{ - int64_t endpos = heap->length; - int64_t startpos = pos; - assert(pos < endpos); - - char old_top[padded_item_size]; - memcpy(old_top, heap->data + heap->stride*pos, (size_t)(padded_item_size)); - // Bubble up the smallest leaf node - int64_t limit = endpos >> 1; - while (pos < limit) { - int64_t childpos = 2*pos + 1; // Smaller of the two child nodes - if (childpos + 1 < endpos) { - typedef int32_t 
(*cmp_fn_t)(void*, void*, void*); - int32_t cmp = ((cmp_fn_t)comparison.fn)( - heap->data + heap->stride*childpos, - heap->data + heap->stride*(childpos + 1), - comparison.userdata); - childpos += (cmp >= 0); - } - - // Move the child node up: - memcpy(heap->data + heap->stride*pos, heap->data + heap->stride*childpos, (size_t)(padded_item_size)); - pos = childpos; - } - memcpy(heap->data + heap->stride*pos, old_top, (size_t)(padded_item_size)); - // Shift the node's parents down: - siftdown(heap, startpos, pos, comparison, padded_item_size); -} - -public void Array$heap_push(Array_t *heap, const void *item, Closure_t comparison, int64_t padded_item_size) -{ - Array$insert(heap, item, I(0), padded_item_size); - - if (heap->length > 1) { - if (heap->data_refcount != 0) - Array$compact(heap, padded_item_size); - siftdown(heap, 0, heap->length-1, comparison, padded_item_size); - } -} - -public void Array$heap_pop(Array_t *heap, Closure_t comparison, int64_t padded_item_size) -{ - if (heap->length == 0) - fail("Attempt to pop from an empty array"); - - if (heap->length == 1) { - *heap = (Array_t){}; - } else if (heap->length == 2) { - heap->data += heap->stride; - --heap->length; - } else { - if (heap->data_refcount != 0) - Array$compact(heap, padded_item_size); - memcpy(heap->data, heap->data + heap->stride*(heap->length-1), (size_t)(padded_item_size)); - --heap->length; - siftup(heap, 0, comparison, padded_item_size); - } -} - -public void Array$heapify(Array_t *heap, Closure_t comparison, int64_t padded_item_size) -{ - if (heap->data_refcount != 0) - Array$compact(heap, padded_item_size); - - // It's necessary to bump the refcount because the user's comparison - // function could do stuff that modifies the heap's data. 
- ARRAY_INCREF(*heap); - int64_t i, n = heap->length; - for (i = (n >> 1) - 1 ; i >= 0 ; i--) - siftup(heap, i, comparison, padded_item_size); - ARRAY_DECREF(*heap); -} - -public Int_t Array$binary_search(Array_t array, void *target, Closure_t comparison) -{ - typedef int32_t (*cmp_fn_t)(void*, void*, void*); - int64_t lo = 0, hi = array.length-1; - while (lo <= hi) { - int64_t mid = (lo + hi) / 2; - int32_t cmp = ((cmp_fn_t)comparison.fn)( - array.data + array.stride*mid, target, comparison.userdata); - if (cmp == 0) - return I(mid+1); - else if (cmp < 0) - lo = mid + 1; - else if (cmp > 0) - hi = mid - 1; - } - return I(lo+1); // Return the index where the target would be inserted -} - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/arrays.h b/builtins/arrays.h deleted file mode 100644 index 1e945e5e..00000000 --- a/builtins/arrays.h +++ /dev/null @@ -1,103 +0,0 @@ -#pragma once - -// Functions that operate on arrays - -#include - -#include "datatypes.h" -#include "integers.h" -#include "types.h" -#include "util.h" - -// Convert negative indices to back-indexed without branching: index0 = index + (index < 0)*(len+1)) - 1 -#define Array_get(item_type, arr_expr, index_expr, start, end) *({ \ - const Array_t arr = arr_expr; int64_t index = index_expr; \ - int64_t off = index + (index < 0) * (arr.length + 1) - 1; \ - if (__builtin_expect(off < 0 || off >= arr.length, 0)) \ - fail_source(__SOURCE_FILE__, start, end, "Invalid array index: %s (array has length %ld)\n", Text$as_c_string(Int64$as_text(&index, no, NULL)), arr.length); \ - (item_type*)(arr.data + arr.stride * off);}) -#define Array_get_unchecked(type, x, i) *({ const Array_t arr = x; int64_t index = i; \ - int64_t off = index + (index < 0) * (arr.length + 1) - 1; \ - (type*)(arr.data + arr.stride * off);}) -#define Array_lvalue(item_type, arr_expr, index_expr, padded_item_size, start, end) *({ \ - Array_t *arr = arr_expr; int64_t index = index_expr; \ - int64_t off = index + (index < 0) 
* (arr->length + 1) - 1; \ - if (__builtin_expect(off < 0 || off >= arr->length, 0)) \ - fail_source(__SOURCE_FILE__, start, end, "Invalid array index: %s (array has length %ld)\n", Text$as_c_string(Int64$as_text(&index, no, NULL)), arr->length); \ - if (arr->data_refcount > 0) \ - Array$compact(arr, padded_item_size); \ - (item_type*)(arr->data + arr->stride * off); }) -#define Array_lvalue_unchecked(item_type, arr_expr, index_expr, padded_item_size) *({ \ - Array_t *arr = arr_expr; int64_t index = index_expr; \ - int64_t off = index + (index < 0) * (arr->length + 1) - 1; \ - if (arr->data_refcount > 0) \ - Array$compact(arr, padded_item_size); \ - (item_type*)(arr->data + arr->stride * off); }) -#define Array_set(item_type, arr, index, value, padded_item_size, start, end) \ - Array_lvalue(item_type, arr_expr, index, padded_item_size, start, end) = value -#define is_atomic(x) _Generic(x, bool: true, int8_t: true, int16_t: true, int32_t: true, int64_t: true, float: true, double: true, default: false) -#define TypedArray(t, ...) ({ t items[] = {__VA_ARGS__}; \ - (Array_t){.length=sizeof(items)/sizeof(items[0]), \ - .stride=(int64_t)&items[1] - (int64_t)&items[0], \ - .data=memcpy(GC_MALLOC(sizeof(items)), items, sizeof(items)), \ - .atomic=0, \ - .data_refcount=0}; }) -#define TypedArrayN(t, N, ...) ({ t items[N] = {__VA_ARGS__}; \ - (Array_t){.length=N, \ - .stride=(int64_t)&items[1] - (int64_t)&items[0], \ - .data=memcpy(GC_MALLOC(sizeof(items)), items, sizeof(items)), \ - .atomic=0, \ - .data_refcount=0}; }) -#define Array(x, ...) ({ __typeof(x) items[] = {x, __VA_ARGS__}; \ - (Array_t){.length=sizeof(items)/sizeof(items[0]), \ - .stride=(int64_t)&items[1] - (int64_t)&items[0], \ - .data=memcpy(is_atomic(x) ? GC_MALLOC_ATOMIC(sizeof(items)) : GC_MALLOC(sizeof(items)), items, sizeof(items)), \ - .atomic=is_atomic(x), \ - .data_refcount=0}; }) -// Array refcounts use a saturating add, where once it's at the max value, it stays there. 
-#define ARRAY_INCREF(arr) (arr).data_refcount += ((arr).data_refcount < ARRAY_MAX_DATA_REFCOUNT) -#define ARRAY_DECREF(arr) (arr).data_refcount -= ((arr).data_refcount < ARRAY_MAX_DATA_REFCOUNT) -#define ARRAY_COPY(arr) ({ ARRAY_INCREF(arr); arr; }) - -#define Array$insert_value(arr, item_expr, index, padded_item_size) ({ __typeof(item_expr) item = item_expr; Array$insert(arr, &item, index, padded_item_size); }) -void Array$insert(Array_t *arr, const void *item, Int_t index, int64_t padded_item_size); -void Array$insert_all(Array_t *arr, Array_t to_insert, Int_t index, int64_t padded_item_size); -void Array$remove_at(Array_t *arr, Int_t index, Int_t count, int64_t padded_item_size); -void Array$remove_item(Array_t *arr, void *item, Int_t max_removals, const TypeInfo *type); -#define Array$remove_item_value(arr, item_expr, max, type) ({ __typeof(item_expr) item = item_expr; Array$remove_item(arr, &item, max, type); }) -Int_t Array$find(Array_t arr, void *item, const TypeInfo *type); -#define Array$find_value(arr, item_expr, type) ({ __typeof(item_expr) item = item_expr; Array$find(arr, &item, type); }) -Int_t Array$first(Array_t arr, Closure_t predicate); -void Array$sort(Array_t *arr, Closure_t comparison, int64_t padded_item_size); -Array_t Array$sorted(Array_t arr, Closure_t comparison, int64_t padded_item_size); -void Array$shuffle(Array_t *arr, int64_t padded_item_size); -Array_t Array$shuffled(Array_t arr, int64_t padded_item_size); -void *Array$random(Array_t arr); -#define Array$random_value(arr, t) ({ Array_t _arr = arr; if (_arr.length == 0) fail("Cannot get a random value from an empty array!"); *(t*)Array$random(_arr); }) -Array_t Array$sample(Array_t arr, Int_t n, Array_t weights, int64_t padded_item_size); -Table_t Array$counts(Array_t arr, const TypeInfo *type); -void Array$clear(Array_t *array); -void Array$compact(Array_t *arr, int64_t padded_item_size); -PUREFUNC bool Array$has(Array_t array, void *item, const TypeInfo *type); -#define 
Array$has_value(arr, item_expr, type) ({ __typeof(item_expr) item = item_expr; Array$has(arr, &item, type); }) -PUREFUNC Array_t Array$from(Array_t array, Int_t first); -PUREFUNC Array_t Array$to(Array_t array, Int_t last); -PUREFUNC Array_t Array$by(Array_t array, Int_t stride, int64_t padded_item_size); -PUREFUNC Array_t Array$reversed(Array_t array, int64_t padded_item_size); -Array_t Array$concat(Array_t x, Array_t y, int64_t padded_item_size); -PUREFUNC uint64_t Array$hash(const Array_t *arr, const TypeInfo *type); -PUREFUNC int32_t Array$compare(const Array_t *x, const Array_t *y, const TypeInfo *type); -PUREFUNC bool Array$equal(const Array_t *x, const Array_t *y, const TypeInfo *type); -Text_t Array$as_text(const Array_t *arr, bool colorize, const TypeInfo *type); -void Array$heapify(Array_t *heap, Closure_t comparison, int64_t padded_item_size); -void Array$heap_push(Array_t *heap, const void *item, Closure_t comparison, int64_t padded_item_size); -#define Array$heap_push_value(heap, _value, comparison, padded_item_size) ({ __typeof(_value) value = _value; Array$heap_push(heap, &value, comparison, padded_item_size); }) -void Array$heap_pop(Array_t *heap, Closure_t comparison, int64_t padded_item_size); -#define Array$heap_pop_value(heap, comparison, padded_item_size, type) \ - ({ Array_t *_heap = heap; if (_heap->length == 0) fail("Attempt to pop from an empty array"); \ - type value = *(type*)_heap->data; Array$heap_pop(_heap, comparison, padded_item_size); value; }) -Int_t Array$binary_search(Array_t array, void *target, Closure_t comparison); -#define Array$binary_search_value(array, target, comparison) \ - ({ __typeof(target) _target = target; Array$binary_search(array, &_target, comparison); }) - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/bools.c b/builtins/bools.c deleted file mode 100644 index d7b3718f..00000000 --- a/builtins/bools.c +++ /dev/null @@ -1,54 +0,0 @@ -// Boolean methods/type info -#include -#include -#include 
-#include -#include -#include -#include - -#include "bools.h" -#include "optionals.h" -#include "text.h" -#include "util.h" - -PUREFUNC public Text_t Bool$as_text(const bool *b, bool colorize, const TypeInfo *type) -{ - (void)type; - if (!b) return Text("Bool"); - if (colorize) - return *b ? Text("\x1b[35myes\x1b[m") : Text("\x1b[35mno\x1b[m"); - else - return *b ? Text("yes") : Text("no"); -} - -PUREFUNC public OptionalBool_t Bool$from_text(Text_t text) -{ - if (Text$equal_ignoring_case(text, Text("yes")) - || Text$equal_ignoring_case(text, Text("on")) - || Text$equal_ignoring_case(text, Text("true")) - || Text$equal_ignoring_case(text, Text("1"))) { - return yes; - } else if (Text$equal_ignoring_case(text, Text("no")) - || Text$equal_ignoring_case(text, Text("off")) - || Text$equal_ignoring_case(text, Text("false")) - || Text$equal_ignoring_case(text, Text("0"))) { - return no; - } else { - return NULL_BOOL; - } -} - -public Bool_t Bool$random(double p) -{ - return (drand48() < p); -} - -public const TypeInfo Bool$info = { - .size=sizeof(bool), - .align=__alignof__(bool), - .tag=CustomInfo, - .CustomInfo={.as_text=(void*)Bool$as_text}, -}; - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/bools.h b/builtins/bools.h deleted file mode 100644 index 98b2ac06..00000000 --- a/builtins/bools.h +++ /dev/null @@ -1,22 +0,0 @@ -#pragma once - -// Boolean functions/type info - -#include -#include - -#include "types.h" -#include "optionals.h" -#include "util.h" - -#define Bool_t bool -#define yes (Bool_t)true -#define no (Bool_t)false - -PUREFUNC Text_t Bool$as_text(const bool *b, bool colorize, const TypeInfo *type); -OptionalBool_t Bool$from_text(Text_t text); -Bool_t Bool$random(double p); - -extern const TypeInfo Bool$info; - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/c_strings.c b/builtins/c_strings.c deleted file mode 100644 index 392565ab..00000000 --- a/builtins/c_strings.c +++ /dev/null @@ -1,55 +0,0 @@ -// Type info and 
methods for CString datatype (char*) -#include -#include -#include -#include -#include -#include - -#include "text.h" -#include "siphash.h" -#include "util.h" - -public Text_t CString$as_text(const char **c_string, bool colorize, const TypeInfo *info) -{ - (void)info; - if (!c_string) return Text("CString"); - Text_t text = Text$from_str(*c_string); - return Text$concat(colorize ? Text("\x1b[34mCString\x1b[m(") : Text("CString("), Text$quoted(text, colorize), Text(")")); -} - -public Text_t CString$as_text_simple(const char *str) -{ - return Text$format("%s", str); -} - -PUREFUNC public int32_t CString$compare(const char **x, const char **y) -{ - if (x == y) - return 0; - - if (!*x != !*y) - return (!*y) - (!*x); - - return strcmp(*x, *y); -} - -PUREFUNC public bool CString$equal(const char **x, const char **y) -{ - return CString$compare(x, y) == 0; -} - -PUREFUNC public uint64_t CString$hash(const char **c_str) -{ - if (!*c_str) return 0; - return siphash24((void*)*c_str, strlen(*c_str)); -} - -public const TypeInfo CString$info = { - .size=sizeof(char*), - .align=__alignof__(char*), - .tag=CStringInfo, - .CustomInfo={.as_text=(void*)CString$as_text, .compare=(void*)CString$compare, .equal=(void*)CString$equal, .hash=(void*)CString$hash}, -}; - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/c_strings.h b/builtins/c_strings.h deleted file mode 100644 index d4c1caa7..00000000 --- a/builtins/c_strings.h +++ /dev/null @@ -1,18 +0,0 @@ -#pragma once - -// Type info and methods for CString datatype, which represents C's `char*` - -#include -#include - -#include "types.h" - -Text_t CString$as_text(char **str, bool colorize, const TypeInfo *info); -Text_t CString$as_text_simple(const char *str); -PUREFUNC int CString$compare(const char **x, const char **y); -PUREFUNC bool CString$equal(const char **x, const char **y); -PUREFUNC uint64_t CString$hash(const char **str); - -extern const TypeInfo CString$info; - -// vim: ts=4 sw=0 et 
cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/channels.c b/builtins/channels.c deleted file mode 100644 index 3681b0b8..00000000 --- a/builtins/channels.c +++ /dev/null @@ -1,137 +0,0 @@ -// Functions that operate on channels - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "arrays.h" -#include "metamethods.h" -#include "integers.h" -#include "siphash.h" -#include "text.h" -#include "types.h" -#include "util.h" - -public Channel_t *Channel$new(Int_t max_size) -{ - if (Int$compare_value(max_size, I_small(0)) <= 0) - fail("Cannot create a channel with a size less than one: %ld", max_size); - Channel_t *channel = new(Channel_t); - channel->items = (Array_t){}; - channel->mutex = (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER; - channel->cond = (pthread_cond_t)PTHREAD_COND_INITIALIZER; - channel->max_size = Int_to_Int64(max_size, false); - return channel; -} - -public void Channel$give(Channel_t *channel, const void *item, bool front, int64_t padded_item_size) -{ - (void)pthread_mutex_lock(&channel->mutex); - while (channel->items.length >= channel->max_size) - pthread_cond_wait(&channel->cond, &channel->mutex); - Int_t index = front ? I_small(1) : I_small(0); - Array$insert(&channel->items, item, index, padded_item_size); - (void)pthread_mutex_unlock(&channel->mutex); - (void)pthread_cond_signal(&channel->cond); -} - -public void Channel$give_all(Channel_t *channel, Array_t to_give, bool front, int64_t padded_item_size) -{ - if (to_give.length == 0) return; - (void)pthread_mutex_lock(&channel->mutex); - Int_t index = front ? 
I_small(1) : I_small(0); - if (channel->items.length + to_give.length >= channel->max_size) { - for (int64_t i = 0; i < to_give.length; i++) { - while (channel->items.length >= channel->max_size) - pthread_cond_wait(&channel->cond, &channel->mutex); - Array$insert(&channel->items, to_give.data + i*to_give.stride, index, padded_item_size); - } - } else { - Array$insert_all(&channel->items, to_give, index, padded_item_size); - } - (void)pthread_mutex_unlock(&channel->mutex); - (void)pthread_cond_signal(&channel->cond); -} - -public void Channel$get(Channel_t *channel, void *out, bool front, int64_t item_size, int64_t padded_item_size) -{ - (void)pthread_mutex_lock(&channel->mutex); - while (channel->items.length == 0) - pthread_cond_wait(&channel->cond, &channel->mutex); - memcpy(out, channel->items.data + channel->items.stride * (front ? 0 : channel->items.length-1), (size_t)(item_size)); - Int_t index = front ? I_small(1) : Int64_to_Int(channel->items.length); - Array$remove_at(&channel->items, index, I_small(1), padded_item_size); - (void)pthread_mutex_unlock(&channel->mutex); - (void)pthread_cond_signal(&channel->cond); -} - -public void Channel$peek(Channel_t *channel, void *out, bool front, int64_t item_size) -{ - (void)pthread_mutex_lock(&channel->mutex); - while (channel->items.length == 0) - pthread_cond_wait(&channel->cond, &channel->mutex); - int64_t index = front ? 
0 : channel->items.length-1; - memcpy(out, channel->items.data + channel->items.stride*index, (size_t)(item_size)); - (void)pthread_mutex_unlock(&channel->mutex); - (void)pthread_cond_signal(&channel->cond); -} - -public Array_t Channel$view(Channel_t *channel) -{ - (void)pthread_mutex_lock(&channel->mutex); - ARRAY_INCREF(channel->items); - Array_t ret = channel->items; - (void)pthread_mutex_unlock(&channel->mutex); - return ret; -} - -public void Channel$clear(Channel_t *channel) -{ - (void)pthread_mutex_lock(&channel->mutex); - Array$clear(&channel->items); - (void)pthread_mutex_unlock(&channel->mutex); - (void)pthread_cond_signal(&channel->cond); -} - -PUREFUNC public uint64_t Channel$hash(Channel_t **channel, const TypeInfo *type) -{ - (void)type; - return siphash24((void*)*channel, sizeof(Channel_t*)); -} - -PUREFUNC public int32_t Channel$compare(Channel_t **x, Channel_t **y, const TypeInfo *type) -{ - (void)type; - return (*x > *y) - (*x < *y); -} - -PUREFUNC public bool Channel$equal(Channel_t **x, Channel_t **y, const TypeInfo *type) -{ - (void)type; - return (*x == *y); -} - -public Text_t Channel$as_text(Channel_t **channel, bool colorize, const TypeInfo *type) -{ - const TypeInfo *item_type = type->ChannelInfo.item; - if (!channel) { - Text_t typename = generic_as_text(NULL, false, item_type); - return Text$concat(colorize ? Text("\x1b[34;1m|:") : Text("|:"), typename, colorize ? Text("|\x1b[m") : Text("|")); - } - Text_t typename = generic_as_text(NULL, false, item_type); - return Text$concat( - colorize ? Text("\x1b[34;1m|:") : Text("|:"), - typename, - Text("|<"), - Int64$hex((int64_t)(void*)*channel, I_small(0), true, true), - colorize ? 
Text(">\x1b[m") : Text(">") - ); -} - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/channels.h b/builtins/channels.h deleted file mode 100644 index 8deb0569..00000000 --- a/builtins/channels.h +++ /dev/null @@ -1,28 +0,0 @@ -#pragma once - -// Functions that operate on channels (thread-safe arrays) - -#include - -#include "datatypes.h" -#include "types.h" -#include "util.h" - -Channel_t *Channel$new(Int_t max_size); -void Channel$give(Channel_t *channel, const void *item, bool front, int64_t padded_item_size); -#define Channel$give_value(channel, item, front, padded_item_size) \ - ({ __typeof(item) _item = item; Channel$give(channel, &_item, front, padded_item_size); }) -void Channel$give_all(Channel_t *channel, Array_t to_give, bool front, int64_t padded_item_size); -void Channel$get(Channel_t *channel, void *out, bool front, int64_t item_size, int64_t padded_item_size); -#define Channel$get_value(channel, front, t, padded_item_size) \ - ({ t _val; Channel$get(channel, &_val, front, sizeof(t), padded_item_size); _val; }) -void Channel$peek(Channel_t *channel, void *out, bool front, int64_t item_size); -#define Channel$peek_value(channel, front, t) ({ t _val; Channel$peek(channel, &_val, front, sizeof(t)); _val; }) -void Channel$clear(Channel_t *channel); -Array_t Channel$view(Channel_t *channel); -PUREFUNC uint64_t Channel$hash(Channel_t **channel, const TypeInfo *type); -PUREFUNC int32_t Channel$compare(Channel_t **x, Channel_t **y, const TypeInfo *type); -PUREFUNC bool Channel$equal(Channel_t **x, Channel_t **y, const TypeInfo *type); -Text_t Channel$as_text(Channel_t **channel, bool colorize, const TypeInfo *type); - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/datatypes.h b/builtins/datatypes.h deleted file mode 100644 index 8c13d3c4..00000000 --- a/builtins/datatypes.h +++ /dev/null @@ -1,91 +0,0 @@ -#pragma once - -// Common datastructures (arrays, tables, closures) - -#include -#include -#include -#include - 
-#define ARRAY_LENGTH_BITS 42 -#define ARRAY_FREE_BITS 6 -#define ARRAY_REFCOUNT_BITS 3 -#define ARRAY_STRIDE_BITS 12 - -#define MAX_FOR_N_BITS(N) ((1<<(N))-1) -#define ARRAY_MAX_STRIDE MAX_FOR_N_BITS(ARRAY_STRIDE_BITS-1) -#define ARRAY_MIN_STRIDE (~MAX_FOR_N_BITS(ARRAY_STRIDE_BITS-1)) -#define ARRAY_MAX_DATA_REFCOUNT MAX_FOR_N_BITS(ARRAY_REFCOUNT_BITS) -#define ARRAY_MAX_FREE_ENTRIES MAX_FOR_N_BITS(ARRAY_FREE_BITS) - -typedef union { - int64_t small; - mpz_t *big; -} Int_t; - -typedef struct { - void *data; - // All of the following fields add up to 64 bits, which means that array - // structs can be passed in two 64-bit registers. C will handle doing the - // bit arithmetic to extract the necessary values, which is cheaper than - // spilling onto the stack and needing to retrieve data from the stack. - int64_t length:ARRAY_LENGTH_BITS; - uint8_t free:ARRAY_FREE_BITS; - bool atomic:1; - uint8_t data_refcount:ARRAY_REFCOUNT_BITS; - int16_t stride:ARRAY_STRIDE_BITS; -} Array_t; - -typedef struct { - uint32_t occupied:1, index:31; - uint32_t next_bucket; -} bucket_t; - -#define TABLE_MAX_BUCKETS 0x7fffffff -#define TABLE_MAX_DATA_REFCOUNT 3 - -typedef struct { - uint32_t count:31, last_free:31; - uint8_t data_refcount:2; - bucket_t buckets[]; -} bucket_info_t; - -typedef struct table_s { - Array_t entries; - bucket_info_t *bucket_info; - struct table_s *fallback; -} Table_t; - -typedef struct { - void *fn, *userdata; -} Closure_t; - -typedef struct Range_s { - Int_t first, last, step; -} Range_t; - -typedef struct { - Array_t items; - pthread_mutex_t mutex; - pthread_cond_t cond; - int64_t max_size; -} Channel_t; - -enum text_type { TEXT_SHORT_ASCII, TEXT_ASCII, TEXT_SHORT_GRAPHEMES, TEXT_GRAPHEMES, TEXT_SUBTEXT }; - -typedef struct Text_s { - int64_t length; // Number of grapheme clusters - uint64_t hash:61; - uint8_t tag:3; - union { - char short_ascii[8]; - const char *ascii; - int32_t short_graphemes[2]; - const int32_t *graphemes; - struct Text_s *subtexts; - }; 
-} Text_t; - -#define Pattern_t Text_t - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/files.c b/builtins/files.c deleted file mode 100644 index 4a4220e7..00000000 --- a/builtins/files.c +++ /dev/null @@ -1,322 +0,0 @@ -// -// files.c - Implementation of some file loading functionality. -// - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "files.h" -#include "util.h" - -static const int tabstop = 4; - -public char *resolve_path(const char *path, const char *relative_to, const char *system_path) -{ - if (!relative_to || streq(relative_to, "/dev/stdin")) relative_to = "."; - if (!path || strlen(path) == 0) return NULL; - - // Resolve the path to an absolute path, assuming it's relative to the file - // it was found in: - char buf[PATH_MAX] = {0}; - if (streq(path, "~") || starts_with(path, "~/")) { - char *resolved = realpath(heap_strf("%s%s", getenv("HOME"), path+1), buf); - if (resolved) return GC_strdup(resolved); - } else if (streq(path, ".") || starts_with(path, "./") || starts_with(path, "../")) { - char *relative_dir = dirname(GC_strdup(relative_to)); - char *resolved = realpath(heap_strf("%s/%s", relative_dir, path), buf); - if (resolved) return GC_strdup(resolved); - } else if (path[0] == '/') { - // Absolute path: - char *resolved = realpath(path, buf); - if (resolved) return GC_strdup(resolved); - } else { - // Relative path: - char *relative_dir = dirname(GC_strdup(relative_to)); - if (!system_path) system_path = "."; - char *copy = GC_strdup(system_path); - for (char *dir, *pos = copy; (dir = strsep(&pos, ":")); ) { - if (dir[0] == '/') { - char *resolved = realpath(heap_strf("%s/%s", dir, path), buf); - if (resolved) return GC_strdup(resolved); - } else if (dir[0] == '~' && (dir[1] == '\0' || dir[1] == '/')) { - char *resolved = realpath(heap_strf("%s%s/%s", getenv("HOME"), dir+1, path), buf); - if (resolved) return GC_strdup(resolved); - } else if 
(streq(dir, ".") || strncmp(dir, "./", 2) == 0) { - char *resolved = realpath(heap_strf("%s/%s", relative_dir, path), buf); - if (resolved) return GC_strdup(resolved); - } else if (streq(dir, ".") || streq(dir, "..") || strncmp(dir, "./", 2) == 0 || strncmp(dir, "../", 3) == 0) { - char *resolved = realpath(heap_strf("%s/%s/%s", relative_dir, dir, path), buf); - if (resolved) return GC_strdup(resolved); - } else { - char *resolved = realpath(heap_strf("%s/%s", dir, path), buf); - if (resolved) return GC_strdup(resolved); - } - } - } - return NULL; -} - -public char *file_base_name(const char *path) -{ - const char *slash = strrchr(path, '/'); - if (slash) path = slash + 1; - assert(!isdigit(*path)); - const char *end = strchrnul(path, '.'); - size_t len = (size_t)(end - path); - char *buf = GC_MALLOC_ATOMIC(len+1); - strncpy(buf, path, len); - buf[len] = '\0'; - for (char *p = buf; *p; p++) { - if (!isalnum(*p)) - *p = '_'; - } - return buf; -} - -static file_t *_load_file(const char* filename, FILE *file) -{ - if (!file) return NULL; - - file_t *ret = new(file_t, .filename=filename); - - size_t file_size = 0, line_cap = 0; - char *file_buf = NULL, *line_buf = NULL; - FILE *mem = open_memstream(&file_buf, &file_size); - int64_t line_len = 0; - while ((line_len = getline(&line_buf, &line_cap, file)) >= 0) { - if (ret->line_capacity <= ret->num_lines) - ret->line_offsets = GC_REALLOC(ret->line_offsets, sizeof(int64_t[ret->line_capacity += 32])); - ret->line_offsets[ret->num_lines++] = (int64_t)file_size; - fwrite(line_buf, sizeof(char), (size_t)line_len, mem); - fflush(mem); - } - fclose(file); - - char *copy = GC_MALLOC_ATOMIC(file_size+1); - memcpy(copy, file_buf, file_size); - copy[file_size] = '\0'; - ret->text = copy; - ret->len = (int64_t)file_size; - fclose(mem); - - free(file_buf); - ret->relative_filename = filename; - if (filename && filename[0] != '<' && !streq(filename, "/dev/stdin")) { - filename = resolve_path(filename, ".", "."); - // Convert to 
relative path (if applicable) - char buf[PATH_MAX]; - char *cwd = getcwd(buf, sizeof(buf)); - size_t cwd_len = strlen(cwd); - if (strncmp(cwd, filename, cwd_len) == 0 && filename[cwd_len] == '/') - ret->relative_filename = &filename[cwd_len+1]; - } - return ret; -} - -// -// Read an entire file into memory. -// -public file_t *load_file(const char* filename) -{ - FILE *file = filename[0] ? fopen(filename, "r") : stdin; - return _load_file(filename, file); -} - -// -// Create a virtual file from a string. -// -public file_t *spoof_file(const char* filename, const char *text) -{ - FILE *file = fmemopen((char*)text, strlen(text)+1, "r"); - return _load_file(filename, file); -} - -// -// Given a pointer, determine which line number it points to (1-indexed) -// -public int64_t get_line_number(file_t *f, const char *p) -{ - // Binary search: - int64_t lo = 0, hi = (int64_t)f->num_lines-1; - if (p < f->text) return 0; - int64_t offset = (int64_t)(p - f->text); - while (lo <= hi) { - int64_t mid = (lo + hi) / 2; - int64_t line_offset = f->line_offsets[mid]; - if (line_offset == offset) - return mid + 1; - else if (line_offset < offset) - lo = mid + 1; - else if (line_offset > offset) - hi = mid - 1; - } - return lo; // Return the line number whose line starts closest before p -} - -// -// Given a pointer, determine which line column it points to. 
-// -public int64_t get_line_column(file_t *f, const char *p) -{ - int64_t line_no = get_line_number(f, p); - int64_t line_offset = f->line_offsets[line_no-1]; - return 1 + (int64_t)(p - (f->text + line_offset)); -} - -// -// Return a pointer to the line with the specified line number (1-indexed) -// -public const char *get_line(file_t *f, int64_t line_number) -{ - if (line_number == 0 || line_number > (int64_t)f->num_lines) return NULL; - int64_t line_offset = f->line_offsets[line_number-1]; - return f->text + line_offset; -} - -// -// Return a value like /foo:line:col -// -public const char *get_file_pos(file_t *f, const char *p) -{ - return heap_strf("%s:%ld:%ld", f->filename, get_line_number(f, p), get_line_column(f, p)); -} - -static int fputc_column(FILE *out, char c, char print_char, int *column) -{ - int printed = 0; - if (print_char == '\t') print_char = ' '; - if (c == '\t') { - for (int to_fill = tabstop - (*column % tabstop); to_fill > 0; --to_fill) { - printed += fputc(print_char, out); - ++*column; - } - } else { - printed += fputc(print_char, out); - ++*column; - } - return printed; -} - -// -// Print a span from a file -// -public int highlight_error(file_t *file, const char *start, const char *end, const char *hl_color, int64_t context_lines, bool use_color) -{ - if (!file) return 0; - - // Handle spans that come from multiple files: - if (start < file->text || start > file->text + file->len) - start = end; - if (end < file->text || end > file->text + file->len) - end = start; - // Just in case neither end of the span came from this file: - if (end < file->text || end > file->text + file->len) - start = end = file->text; - - const char *lineno_fmt, *normal_color, *empty_marker; - bool print_carets = false; - int printed = 0; - if (use_color) { - lineno_fmt = "\x1b[0;2m%*lu\x1b(0\x78\x1b(B\x1b[m "; - normal_color = "\x1b[m"; - empty_marker = "\x1b(0\x61\x1b(B"; - printed += fprintf(stderr, "\x1b[33;4;1m%s\x1b[m\n", file->relative_filename); - } else 
{ - lineno_fmt = "%*lu| "; - hl_color = ""; - normal_color = ""; - empty_marker = " "; - print_carets = true; - printed += fprintf(stderr, "%s\n", file->relative_filename); - } - - if (context_lines == 0) - return fprintf(stderr, "%s%.*s%s", hl_color, (int)(end - start), start, normal_color); - - int64_t start_line = get_line_number(file, start), - end_line = get_line_number(file, end); - - int64_t first_line = start_line - (context_lines - 1), - last_line = end_line + (context_lines - 1); - - if (first_line < 1) first_line = 1; - if (last_line > file->num_lines) last_line = file->num_lines; - - int digits = 1; - for (int64_t i = last_line; i > 0; i /= 10) ++digits; - - for (int64_t line_no = first_line; line_no <= last_line; ++line_no) { - if (line_no > first_line + 5 && line_no < last_line - 5) { - if (use_color) - printed += fprintf(stderr, "\x1b[0;2;3;4m ... %ld lines omitted ... \x1b[m\n", (last_line - first_line) - 11); - else - printed += fprintf(stderr, " ... %ld lines omitted ...\n", (last_line - first_line) - 11); - line_no = last_line - 6; - continue; - } - - printed += fprintf(stderr, lineno_fmt, digits, line_no); - const char *line = get_line(file, line_no); - if (!line) break; - - int column = 0; - const char *p = line; - // Before match - for (; *p && *p != '\r' && *p != '\n' && p < start; ++p) - printed += fputc_column(stderr, *p, *p, &column); - - // Zero-width matches - if (p == start && start == end) { - printed += fprintf(stderr, "%s%s%s", hl_color, empty_marker, normal_color); - column += 1; - } - - // Inside match - if (start <= p && p < end) { - printed += fputs(hl_color, stderr); - for (; *p && *p != '\r' && *p != '\n' && p < end; ++p) - printed += fputc_column(stderr, *p, *p, &column); - printed += fputs(normal_color, stderr); - } - - // After match - for (; *p && *p != '\r' && *p != '\n'; ++p) - printed += fputc_column(stderr, *p, *p, &column); - - printed += fprintf(stderr, "\n"); - - const char *eol = strchrnul(line, '\n'); - if 
(print_carets && start >= line && start < eol && line <= start) { - for (int num = 0; num < digits; num++) - printed += fputc(' ', stderr); - printed += fputs(": ", stderr); - int col = 0; - for (const char *sp = line; *sp && *sp != '\n'; ++sp) { - char print_char; - if (sp < start) - print_char = ' '; - else if (sp == start && sp == end) - print_char = '^'; - else if (sp >= start && sp < end) - print_char = '-'; - else - print_char = ' '; - printed += fputc_column(stderr, *sp, print_char, &col); - } - printed += fputs("\n", stderr); - } - } - fflush(stderr); - return printed; -} - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/files.h b/builtins/files.h deleted file mode 100644 index f650f78e..00000000 --- a/builtins/files.h +++ /dev/null @@ -1,35 +0,0 @@ -// -// files.h - Definitions of an API for loading files. -// -#pragma once - -#include -#include -#include - -typedef struct { - const char *filename, *relative_filename; - const char *text; - int64_t len; - int64_t num_lines, line_capacity; - int64_t *line_offsets; -} file_t; - -char *resolve_path(const char *path, const char *relative_to, const char *system_path); -__attribute__((pure, nonnull)) -char *file_base_name(const char *path); -__attribute__((nonnull)) -file_t *load_file(const char *filename); -__attribute__((nonnull, returns_nonnull)) -file_t *spoof_file(const char *filename, const char *text); -__attribute__((pure, nonnull)) -int64_t get_line_number(file_t *f, const char *p); -__attribute__((pure, nonnull)) -int64_t get_line_column(file_t *f, const char *p); -__attribute__((pure, nonnull)) -const char *get_line(file_t *f, int64_t line_number); -__attribute__((pure, nonnull)) -const char *get_file_pos(file_t *f, const char *p); -int highlight_error(file_t *file, const char *start, const char *end, const char *hl_color, int64_t context_lines, bool use_color); - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/functiontype.c b/builtins/functiontype.c deleted 
file mode 100644 index 251a01ed..00000000 --- a/builtins/functiontype.c +++ /dev/null @@ -1,35 +0,0 @@ -// Logic for handling function type values - -#include "datatypes.h" -#include "tables.h" -#include "text.h" -#include "types.h" -#include "util.h" - -static Table_t function_names = {}; - -public void register_function(void *fn, Text_t name) -{ - Table$set(&function_names, &fn, &name, Table$info(Function$info("???"), &Text$info)); -} - -public Text_t *get_function_name(void *fn) -{ - return Table$get(function_names, &fn, Table$info(Function$info("???"), &Text$info)); -} - -public Text_t Func$as_text(const void *fn, bool colorize, const TypeInfo *type) -{ - (void)fn; - Text_t text = Text$from_str(type->FunctionInfo.type_str); - if (fn) { - Text_t *name = get_function_name(*(void**)fn); - if (name) - text = *name; - } - if (fn && colorize) - text = Text$concat(Text("\x1b[32;1m"), text, Text("\x1b[m")); - return text; -} - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/functiontype.h b/builtins/functiontype.h deleted file mode 100644 index e3feb03e..00000000 --- a/builtins/functiontype.h +++ /dev/null @@ -1,9 +0,0 @@ -#pragma once - -// Logic for handling function type values - -void register_function(void *fn, Text_t name); -Text_t *get_function_name(void *fn); -Text_t Func$as_text(const void *fn, bool colorize, const TypeInfo *type); - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/integers.c b/builtins/integers.c deleted file mode 100644 index ef588984..00000000 --- a/builtins/integers.c +++ /dev/null @@ -1,490 +0,0 @@ -// Integer type infos and methods -#include -#include -#include -#include -#include -#include - -#include "arrays.h" -#include "datatypes.h" -#include "integers.h" -#include "optionals.h" -#include "siphash.h" -#include "text.h" -#include "types.h" - -static gmp_randstate_t Int_rng = {}; - -public void Int$init_random(long seed) { - gmp_randinit_default(Int_rng); - gmp_randseed_ui(Int_rng, (unsigned 
long)seed); -} - -public Text_t Int$value_as_text(Int_t i) { - if (__builtin_expect(i.small & 1, 1)) { - return Text$format("%ld", (i.small)>>2); - } else { - char *str = mpz_get_str(NULL, 10, *i.big); - return Text$from_str(str); - } -} - -public Text_t Int$as_text(const Int_t *i, bool colorize, const TypeInfo *type) { - (void)type; - if (!i) return Text("Int"); - - Text_t text = Int$value_as_text(*i); - if (colorize) text = Text$concat(Text("\x1b[35m"), text, Text("\x1b[m")); - return text; -} - -public PUREFUNC int32_t Int$compare(const Int_t *x, const Int_t *y, const TypeInfo *type) { - (void)type; - if (__builtin_expect(((x->small | y->small) & 1) == 0, 0)) - return x->big == y->big ? 0 : mpz_cmp(*x->big, *y->big); - return (x->small > y->small) - (x->small < y->small); -} - -public PUREFUNC int32_t Int$compare_value(const Int_t x, const Int_t y) { - if (__builtin_expect(((x.small | y.small) & 1) == 0, 0)) - return x.big == y.big ? 0 : mpz_cmp(*x.big, *y.big); - return (x.small > y.small) - (x.small < y.small); -} - -public PUREFUNC bool Int$equal(const Int_t *x, const Int_t *y, const TypeInfo *type) { - (void)type; - return x->small == y->small || (__builtin_expect(((x->small | y->small) & 1) == 0, 0) && mpz_cmp(*x->big, *y->big) == 0); -} - -public PUREFUNC bool Int$equal_value(const Int_t x, const Int_t y) { - return x.small == y.small || (__builtin_expect(((x.small | y.small) & 1) == 0, 0) && mpz_cmp(*x.big, *y.big) == 0); -} - -public PUREFUNC uint64_t Int$hash(const Int_t *x, const TypeInfo *type) { - (void)type; - if (__builtin_expect(x->small & 1, 1)) { - int64_t i = (x->small>>2); - return siphash24((void*)&i, sizeof(i)); - } else { - char *str = mpz_get_str(NULL, 16, *x->big); - return siphash24((void*)str, strlen(str)); - } -} - -public Text_t Int$format(Int_t i, Int_t digits_int) { - int64_t digits = Int_to_Int64(digits_int, false); - if (__builtin_expect(i.small & 1, 1)) { - return Text$format("%0.*ld", digits, (i.small)>>2); - } else { - char 
*str = mpz_get_str(NULL, 10, *i.big); - bool negative = (str[0] == '-'); - int64_t needed_zeroes = digits - (int64_t)strlen(str); - if (needed_zeroes <= 0) - return Text$from_str(str); - - char *zeroes = GC_MALLOC_ATOMIC((size_t)(needed_zeroes)); - memset(zeroes, '0', (size_t)(needed_zeroes)); - if (negative) - return Text$concat(Text("-"), Text$from_str(zeroes), Text$from_str(str + 1)); - else - return Text$concat(Text$from_str(zeroes), Text$from_str(str)); - } -} - -public Text_t Int$hex(Int_t i, Int_t digits_int, bool uppercase, bool prefix) { - if (Int$is_negative(i)) - return Text$concat(Text("-"), Int$hex(Int$negative(i), digits_int, uppercase, prefix)); - - int64_t digits = Int_to_Int64(digits_int, false); - if (__builtin_expect(i.small & 1, 1)) { - const char *hex_fmt = uppercase ? (prefix ? "0x%0.*lX" : "%0.*lX") : (prefix ? "0x%0.*lx" : "%0.*lx"); - return Text$format(hex_fmt, digits, (i.small)>>2); - } else { - char *str = mpz_get_str(NULL, 16, *i.big); - if (uppercase) { - for (char *c = str; *c; c++) - *c = (char)toupper(*c); - } - int64_t needed_zeroes = digits - (int64_t)strlen(str); - if (needed_zeroes <= 0) - return prefix ? Text$concat(Text("0x"), Text$from_str(str)) : Text$from_str(str); - - char *zeroes = GC_MALLOC_ATOMIC((size_t)(needed_zeroes)); - memset(zeroes, '0', (size_t)(needed_zeroes)); - if (prefix) - return Text$concat(Text("0x"), Text$from_str(zeroes), Text$from_str(str)); - else - return Text$concat(Text$from_str(zeroes), Text$from_str(str)); - } -} - -public Text_t Int$octal(Int_t i, Int_t digits_int, bool prefix) { - if (Int$is_negative(i)) - return Text$concat(Text("-"), Int$octal(Int$negative(i), digits_int, prefix)); - - int64_t digits = Int_to_Int64(digits_int, false); - if (__builtin_expect(i.small & 1, 1)) { - const char *octal_fmt = prefix ? 
"0o%0.*lo" : "%0.*lo"; - return Text$format(octal_fmt, digits, (i.small)>>2); - } else { - char *str = mpz_get_str(NULL, 8, *i.big); - int64_t needed_zeroes = digits - (int64_t)strlen(str); - if (needed_zeroes <= 0) - return prefix ? Text$concat(Text("0o"), Text$from_str(str)) : Text$from_str(str); - - char *zeroes = GC_MALLOC_ATOMIC((size_t)(needed_zeroes)); - memset(zeroes, '0', (size_t)(needed_zeroes)); - if (prefix) - return Text$concat(Text("0o"), Text$from_str(zeroes), Text$from_str(str)); - else - return Text$concat(Text$from_str(zeroes), Text$from_str(str)); - } -} - -public Int_t Int$slow_plus(Int_t x, Int_t y) { - mpz_t result; - mpz_init_set_int(result, x); - if (y.small & 1) { - if (y.small < 0) - mpz_sub_ui(result, result, (uint64_t)(-(y.small >> 2))); - else - mpz_add_ui(result, result, (uint64_t)(y.small >> 2)); - } else { - mpz_add(result, result, *y.big); - } - return Int$from_mpz(result); -} - -public Int_t Int$slow_minus(Int_t x, Int_t y) { - mpz_t result; - mpz_init_set_int(result, x); - if (y.small & 1) { - if (y.small < 0) - mpz_add_ui(result, result, (uint64_t)(-(y.small >> 2))); - else - mpz_sub_ui(result, result, (uint64_t)(y.small >> 2)); - } else { - mpz_sub(result, result, *y.big); - } - return Int$from_mpz(result); -} - -public Int_t Int$slow_times(Int_t x, Int_t y) { - mpz_t result; - mpz_init_set_int(result, x); - if (y.small & 1) - mpz_mul_si(result, result, y.small >> 2); - else - mpz_mul(result, result, *y.big); - return Int$from_mpz(result); -} - -public Int_t Int$slow_divided_by(Int_t dividend, Int_t divisor) { - // Euclidean division, see: https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/divmodnote-letter.pdf - mpz_t quotient, remainder; - mpz_init_set_int(quotient, dividend); - mpz_init_set_int(remainder, divisor); - mpz_tdiv_qr(quotient, remainder, quotient, remainder); - if (mpz_sgn(remainder) < 0) { - bool d_positive = __builtin_expect(divisor.small & 1, 1) ? 
divisor.small > 0x1 : mpz_sgn(*divisor.big) > 0; - if (d_positive) - mpz_sub_ui(quotient, quotient, 1); - else - mpz_add_ui(quotient, quotient, 1); - } - return Int$from_mpz(quotient); -} - -public Int_t Int$slow_modulo(Int_t x, Int_t modulus) -{ - mpz_t result; - mpz_init_set_int(result, x); - mpz_t divisor; - mpz_init_set_int(divisor, modulus); - mpz_mod(result, result, divisor); - return Int$from_mpz(result); -} - -public Int_t Int$slow_modulo1(Int_t x, Int_t modulus) -{ - mpz_t result; - mpz_init_set_int(result, x); - mpz_sub_ui(result, result, 1); - mpz_t divisor; - mpz_init_set_int(divisor, modulus); - mpz_mod(result, result, divisor); - mpz_add_ui(result, result, 1); - return Int$from_mpz(result); -} - -public Int_t Int$slow_left_shifted(Int_t x, Int_t y) -{ - mp_bitcnt_t bits = (mp_bitcnt_t)Int_to_Int64(y, false); - mpz_t result; - mpz_init_set_int(result, x); - mpz_mul_2exp(result, result, bits); - return Int$from_mpz(result); -} - -public Int_t Int$slow_right_shifted(Int_t x, Int_t y) -{ - mp_bitcnt_t bits = (mp_bitcnt_t)Int_to_Int64(y, false); - mpz_t result; - mpz_init_set_int(result, x); - mpz_tdiv_q_2exp(result, result, bits); - return Int$from_mpz(result); -} - -public Int_t Int$slow_bit_and(Int_t x, Int_t y) -{ - mpz_t result; - mpz_init_set_int(result, x); - mpz_t y_mpz; - mpz_init_set_int(y_mpz, y); - mpz_and(result, result, y_mpz); - return Int$from_mpz(result); -} - -public Int_t Int$slow_bit_or(Int_t x, Int_t y) -{ - mpz_t result; - mpz_init_set_int(result, x); - mpz_t y_mpz; - mpz_init_set_int(y_mpz, y); - mpz_ior(result, result, y_mpz); - return Int$from_mpz(result); -} - -public Int_t Int$slow_bit_xor(Int_t x, Int_t y) -{ - mpz_t result; - mpz_init_set_int(result, x); - mpz_t y_mpz; - mpz_init_set_int(y_mpz, y); - mpz_xor(result, result, y_mpz); - return Int$from_mpz(result); -} - -public Int_t Int$slow_negated(Int_t x) -{ - mpz_t result; - mpz_init_set_int(result, x); - mpz_neg(result, result); - mpz_sub_ui(result, result, 1); - return 
Int$from_mpz(result); -} - -public Int_t Int$slow_negative(Int_t x) -{ - if (__builtin_expect((x.small & 1), 1)) - return (Int_t){.small=4*-((x.small)>>2) + 1}; - - mpz_t result; - mpz_init_set_int(result, x); - mpz_neg(result, result); - return Int$from_mpz(result); -} - -public Int_t Int$abs(Int_t x) -{ - if (__builtin_expect((x.small & 1), 1)) - return (Int_t){.small=4*labs((x.small)>>2) + 1}; - - mpz_t result; - mpz_init_set_int(result, x); - mpz_abs(result, result); - return Int$from_mpz(result); -} - -public Int_t Int$power(Int_t base, Int_t exponent) -{ - int64_t exp = Int_to_Int64(exponent, false); - if (__builtin_expect(exp < 0, 0)) - fail("Cannot take a negative power of an integer!"); - mpz_t result; - mpz_init_set_int(result, base); - mpz_pow_ui(result, result, (uint64_t)exp); - return Int$from_mpz(result); -} - -public Int_t Int$sqrt(Int_t i) -{ - mpz_t result; - mpz_init_set_int(result, i); - mpz_sqrt(result, result); - return Int$from_mpz(result); -} - -public Int_t Int$random(Int_t min, Int_t max) { - int32_t cmp = Int$compare_value(min, max); - if (cmp > 0) { - Text_t min_text = Int$as_text(&min, false, &Int$info), max_text = Int$as_text(&max, false, &Int$info); - fail("Random minimum value (%k) is larger than the maximum value (%k)", - &min_text, &max_text); - } - if (cmp == 0) return min; - - mpz_t range_size; - mpz_init_set_int(range_size, max); - if (min.small & 1) { - mpz_t min_mpz; - mpz_init_set_si(min_mpz, min.small >> 2); - mpz_sub(range_size, range_size, min_mpz); - } else { - mpz_sub(range_size, range_size, *min.big); - } - - mpz_t r; - mpz_init(r); - mpz_urandomm(r, Int_rng, range_size); - return Int$plus(min, Int$from_mpz(r)); -} - -public PUREFUNC Range_t Int$to(Int_t from, Int_t to) { - return (Range_t){from, to, Int$compare_value(to, from) >= 0 ? 
(Int_t){.small=(1<<2)|1} : (Int_t){.small=(-1>>2)|1}}; -} - -public Int_t Int$from_str(const char *str) { - mpz_t i; - int result; - if (strncmp(str, "0x", 2) == 0) { - result = mpz_init_set_str(i, str + 2, 16); - } else if (strncmp(str, "0o", 2) == 0) { - result = mpz_init_set_str(i, str + 2, 8); - } else if (strncmp(str, "0b", 2) == 0) { - result = mpz_init_set_str(i, str + 2, 2); - } else { - result = mpz_init_set_str(i, str, 10); - } - if (result != 0) - return NULL_INT; - return Int$from_mpz(i); -} - -public OptionalInt_t Int$from_text(Text_t text) { - return Int$from_str(Text$as_c_string(text)); -} - -public bool Int$is_prime(Int_t x, Int_t reps) -{ - mpz_t p; - mpz_init_set_int(p, x); - if (Int$compare_value(reps, I(9999)) > 0) - fail("Number of prime-test repetitions should not be above 9999"); - int reps_int = Int_to_Int32(reps, false); - return (mpz_probab_prime_p(p, reps_int) != 0); -} - -public Int_t Int$next_prime(Int_t x) -{ - mpz_t p; - mpz_init_set_int(p, x); - mpz_nextprime(p, p); - return Int$from_mpz(p); -} - -public Int_t Int$prev_prime(Int_t x) -{ - mpz_t p; - mpz_init_set_int(p, x); - if (mpz_prevprime(p, p) == 0) - fail("There is no prime number before %k", (Text_t[1]){Int$as_text(&x, false, &Int$info)}); - return Int$from_mpz(p); -} - -public const TypeInfo Int$info = { - .size=sizeof(Int_t), - .align=__alignof__(Int_t), - .tag=CustomInfo, - .CustomInfo={ - .compare=(void*)Int$compare, - .equal=(void*)Int$equal, - .hash=(void*)Int$hash, - .as_text=(void*)Int$as_text, - }, -}; - - -#define DEFINE_INT_TYPE(c_type, KindOfInt, fmt, min_val, max_val)\ - public Text_t KindOfInt ## $as_text(const c_type *i, bool colorize, const TypeInfo *type) { \ - (void)type; \ - if (!i) return Text(#KindOfInt); \ - return Text$format(colorize ? 
"\x1b[35m%" fmt "\x1b[m" : "%" fmt, *i); \ - } \ - public PUREFUNC int32_t KindOfInt ## $compare(const c_type *x, const c_type *y, const TypeInfo *type) { \ - (void)type; \ - return (*x > *y) - (*x < *y); \ - } \ - public PUREFUNC bool KindOfInt ## $equal(const c_type *x, const c_type *y, const TypeInfo *type) { \ - (void)type; \ - return *x == *y; \ - } \ - public Text_t KindOfInt ## $format(c_type i, Int_t digits_int) { \ - Int_t as_int = KindOfInt##_to_Int(i); \ - return Int$format(as_int, digits_int); \ - } \ - public Text_t KindOfInt ## $hex(c_type i, Int_t digits_int, bool uppercase, bool prefix) { \ - Int_t as_int = KindOfInt##_to_Int(i); \ - return Int$hex(as_int, digits_int, uppercase, prefix); \ - } \ - public Text_t KindOfInt ## $octal(c_type i, Int_t digits_int, bool prefix) { \ - Int_t as_int = KindOfInt##_to_Int(i); \ - return Int$octal(as_int, digits_int, prefix); \ - } \ - public Array_t KindOfInt ## $bits(c_type x) { \ - Array_t bit_array = (Array_t){.data=GC_MALLOC_ATOMIC(sizeof(bool[8*sizeof(c_type)])), .atomic=1, .stride=sizeof(bool), .length=8*sizeof(c_type)}; \ - bool *bits = bit_array.data + sizeof(c_type)*8; \ - for (size_t i = 0; i < 8*sizeof(c_type); i++) { \ - *(bits--) = x & 1; \ - x >>= 1; \ - } \ - return bit_array; \ - } \ - public c_type KindOfInt ## $random(c_type min, c_type max) { \ - if (min > max) fail("Random minimum value (%ld) is larger than the maximum value (%ld)", min, max); \ - if (min == max) return min; \ - if (min == min_val && max == max_val) { \ - c_type r; \ - arc4random_buf(&r, sizeof(r)); \ - return r; \ - } \ - uint64_t range = (uint64_t)max - (uint64_t)min + 1; \ - uint64_t min_r = -range % range; \ - uint64_t r; \ - for (;;) { \ - arc4random_buf(&r, sizeof(r)); \ - if (r >= min_r) break; \ - } \ - return (c_type)((uint64_t)min + (r % range)); \ - } \ - public Range_t KindOfInt ## $to(c_type from, c_type to) { \ - return (Range_t){Int64_to_Int(from), Int64_to_Int(to), to >= from ? 
(Int_t){.small=(1<<2)&1} : (Int_t){.small=(1<<2)&1}}; \ - } \ - public PUREFUNC Optional ## KindOfInt ## _t KindOfInt ## $from_text(Text_t text) { \ - OptionalInt_t full_int = Int$from_text(text); \ - if (full_int.small == 0) return (Optional ## KindOfInt ## _t){.is_null=true}; \ - if (Int$compare_value(full_int, I(min_val)) < 0) { \ - return (Optional ## KindOfInt ## _t){.is_null=true}; \ - } \ - if (Int$compare_value(full_int, I(max_val)) > 0) { \ - return (Optional ## KindOfInt ## _t){.is_null=true}; \ - } \ - return (Optional ## KindOfInt ## _t){.i=Int_to_ ## KindOfInt(full_int, true)}; \ - } \ - public const c_type KindOfInt##$min = min_val; \ - public const c_type KindOfInt##$max = max_val; \ - public const TypeInfo KindOfInt##$info = { \ - .size=sizeof(c_type), \ - .align=__alignof__(c_type), \ - .tag=CustomInfo, \ - .CustomInfo={.compare=(void*)KindOfInt##$compare, .as_text=(void*)KindOfInt##$as_text}, \ - }; - -DEFINE_INT_TYPE(int64_t, Int64, "ld_i64", INT64_MIN, INT64_MAX) -DEFINE_INT_TYPE(int32_t, Int32, "d_i32", INT32_MIN, INT32_MAX) -DEFINE_INT_TYPE(int16_t, Int16, "d_i16", INT16_MIN, INT16_MAX) -DEFINE_INT_TYPE(int8_t, Int8, "d_i8", INT8_MIN, INT8_MAX) -#undef DEFINE_INT_TYPE - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/integers.h b/builtins/integers.h deleted file mode 100644 index 1c0ab1cd..00000000 --- a/builtins/integers.h +++ /dev/null @@ -1,375 +0,0 @@ -#pragma once - -// Integer type infos and methods - -#include -#include -#include -#include - -#include "datatypes.h" -#include "nums.h" -#include "stdlib.h" -#include "types.h" -#include "util.h" - -#define Int64_t int64_t -#define Int32_t int32_t -#define Int16_t int16_t -#define Int8_t int8_t -#define I64(x) ((int64_t)x) -#define I32(x) ((int32_t)x) -#define I16(x) ((int16_t)x) -#define I8(x) ((int8_t)x) - -#define DEFINE_INT_TYPE(c_type, type_name) \ - typedef struct { \ - c_type i; \ - bool is_null:1; \ - } Optional ## type_name ## _t; \ - Text_t type_name ## 
$as_text(const c_type *i, bool colorize, const TypeInfo *type); \ - PUREFUNC int32_t type_name ## $compare(const c_type *x, const c_type *y, const TypeInfo *type); \ - PUREFUNC bool type_name ## $equal(const c_type *x, const c_type *y, const TypeInfo *type); \ - Text_t type_name ## $format(c_type i, Int_t digits); \ - Text_t type_name ## $hex(c_type i, Int_t digits, bool uppercase, bool prefix); \ - Text_t type_name ## $octal(c_type i, Int_t digits, bool prefix); \ - Array_t type_name ## $bits(c_type x); \ - c_type type_name ## $random(c_type min, c_type max); \ - Range_t type_name ## $to(c_type from, c_type to); \ - PUREFUNC Optional ## type_name ## _t type_name ## $from_text(Text_t text); \ - PUREFUNC static inline c_type type_name ## $clamped(c_type x, c_type min, c_type max) { \ - return x < min ? min : (x > max ? max : x); \ - } \ - extern const c_type type_name ## $min, type_name##$max; \ - extern const TypeInfo type_name ## $info; \ - static inline c_type type_name ## $divided_by(c_type D, c_type d) { \ - c_type q = D/d, r = D%d; \ - if (r < 0) { \ - if (d > 0) q = q-1; \ - else q = q+1; \ - } \ - return q; \ - } \ - static inline c_type type_name ## $modulo(c_type D, c_type d) { \ - c_type r = D%d; \ - if (r < 0) { \ - if (d > 0) r = r + d; \ - else r = r - d; \ - } \ - return r; \ - } \ - static inline c_type type_name ## $modulo1(c_type D, c_type d) { \ - return type_name ## $modulo(D-1, d) + 1; \ - } - -DEFINE_INT_TYPE(int64_t, Int64) -DEFINE_INT_TYPE(int32_t, Int32) -DEFINE_INT_TYPE(int16_t, Int16) -DEFINE_INT_TYPE(int8_t, Int8) -#undef DEFINE_INT_TYPE - -#define NULL_INT64 ((OptionalInt64_t){.is_null=true}) -#define NULL_INT32 ((OptionalInt32_t){.is_null=true}) -#define NULL_INT16 ((OptionalInt16_t){.is_null=true}) -#define NULL_INT8 ((OptionalInt8_t){.is_null=true}) - -#define Int64$abs(...) I64(labs(__VA_ARGS__)) -#define Int32$abs(...) I32(abs(__VA_ARGS__)) -#define Int16$abs(...) I16(abs(__VA_ARGS__)) -#define Int8$abs(...) 
I8(abs(__VA_ARGS__)) - -#define OptionalInt_t Int_t - -Text_t Int$as_text(const Int_t *i, bool colorize, const TypeInfo *type); -Text_t Int$value_as_text(Int_t i); -PUREFUNC uint64_t Int$hash(const Int_t *x, const TypeInfo *type); -PUREFUNC int32_t Int$compare(const Int_t *x, const Int_t *y, const TypeInfo *type); -PUREFUNC int32_t Int$compare_value(const Int_t x, const Int_t y); -PUREFUNC bool Int$equal(const Int_t *x, const Int_t *y, const TypeInfo *type); -PUREFUNC bool Int$equal_value(const Int_t x, const Int_t y); -Text_t Int$format(Int_t i, Int_t digits); -Text_t Int$hex(Int_t i, Int_t digits, bool uppercase, bool prefix); -Text_t Int$octal(Int_t i, Int_t digits, bool prefix); -void Int$init_random(long seed); -Int_t Int$random(Int_t min, Int_t max); -PUREFUNC Range_t Int$to(Int_t from, Int_t to); -OptionalInt_t Int$from_str(const char *str); -OptionalInt_t Int$from_text(Text_t text); -Int_t Int$abs(Int_t x); -Int_t Int$power(Int_t base, Int_t exponent); -Int_t Int$sqrt(Int_t i); - -#define BIGGEST_SMALL_INT ((1<<29)-1) - -#define Int$from_mpz(mpz) (\ - mpz_cmpabs_ui(mpz, BIGGEST_SMALL_INT) <= 0 ? ( \ - (Int_t){.small=(mpz_get_si(mpz)<<2)|1} \ - ) : ( \ - (Int_t){.big=memcpy(new(mpz_t), &mpz, sizeof(mpz_t))} \ - )) - -#define mpz_init_set_int(mpz, i) do { \ - if (__builtin_expect((i).small & 1, 1)) mpz_init_set_si(mpz, (i).small >> 2); \ - else mpz_init_set(mpz, *(i).big); \ -} while (0) - -#define I(i) ((int64_t)(i) == (int32_t)(i) ? 
((Int_t){.small=(int64_t)((uint64_t)(i)<<2)|1}) : Int64_to_Int(i)) -#define I_small(i) ((Int_t){.small=((uint64_t)(i)<<2)|1}) -#define I_is_zero(i) ((i).small == 1) - -Int_t Int$slow_plus(Int_t x, Int_t y); -Int_t Int$slow_minus(Int_t x, Int_t y); -Int_t Int$slow_times(Int_t x, Int_t y); -Int_t Int$slow_divided_by(Int_t x, Int_t y); -Int_t Int$slow_modulo(Int_t x, Int_t y); -Int_t Int$slow_modulo1(Int_t x, Int_t y); -Int_t Int$slow_left_shifted(Int_t x, Int_t y); -Int_t Int$slow_right_shifted(Int_t x, Int_t y); -Int_t Int$slow_bit_and(Int_t x, Int_t y); -Int_t Int$slow_bit_or(Int_t x, Int_t y); -Int_t Int$slow_bit_xor(Int_t x, Int_t y); -Int_t Int$slow_negative(Int_t x); -Int_t Int$slow_negated(Int_t x); -bool Int$is_prime(Int_t x, Int_t reps); -Int_t Int$next_prime(Int_t x); -Int_t Int$prev_prime(Int_t x); - -extern const TypeInfo Int$info; - -static inline Int_t Int$clamped(Int_t x, Int_t low, Int_t high) -{ - return (Int$compare(&x, &low, &Int$info) <= 0) ? low : (Int$compare(&x, &high, &Int$info) >= 0 ? high : x); -} - -// Fast-path inline versions for the common case where integer arithmetic is -// between two small ints. 
- -static inline Int_t Int$plus(Int_t x, Int_t y) { - const int64_t z = (int64_t)((uint64_t)x.small + (uint64_t)y.small); - if (__builtin_expect(((z|2) == (int32_t)z), 1)) - return (Int_t){.small=(z-1)}; - return Int$slow_plus(x, y); -} - -static inline Int_t Int$minus(Int_t x, Int_t y) { - const int64_t z = (int64_t)(((uint64_t)x.small ^ 3) - (uint64_t)y.small); - if (__builtin_expect(((z & ~2) == (int32_t)z), 1)) - return (Int_t){.small=z}; - return Int$slow_minus(x, y); -} - -static inline Int_t Int$times(Int_t x, Int_t y) { - if (__builtin_expect(((x.small & y.small) & 1) != 0, 1)) { - const int64_t z = (x.small>>1) * (y.small>>1); - if (__builtin_expect(z == (int32_t)z, 1)) - return (Int_t){.small=z+1}; - } - return Int$slow_times(x, y); -} - -static inline Int_t Int$divided_by(Int_t x, Int_t y) { - if (__builtin_expect(((x.small & y.small) & 1) != 0, 1)) { - // Euclidean division, see: https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/divmodnote-letter.pdf - const int64_t D = (x.small>>2); - const int64_t d = (y.small>>2); - int64_t q = D/d; - int64_t r = D%d; - if (r < 0) { - if (d > 0) q = q-1; - else q = q+1; - } - if (__builtin_expect(q == (int32_t)q, 1)) - return (Int_t){.small=(q<<2)|1}; - } - return Int$slow_divided_by(x, y); -} - -static inline Int_t Int$modulo(Int_t x, Int_t y) { - if (__builtin_expect(((x.small & y.small) & 1) != 0, 1)) { - // Euclidean modulus, see: https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/divmodnote-letter.pdf - const int64_t D = (x.small>>2); - const int64_t d = (y.small>>2); - int64_t r = D%d; - if (r < 0) { - if (d > 0) r = r + d; - else r = r - d; - } - return (Int_t){.small=(r<<2)|1}; - } - return Int$slow_modulo(x, y); -} - -static inline Int_t Int$modulo1(Int_t x, Int_t y) { - if (__builtin_expect(((x.small & y.small) & 1) != 0, 1)) { - // Euclidean modulus, see: https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/divmodnote-letter.pdf - const int64_t D = 
(x.small>>2)-1; - const int64_t d = (y.small>>2); - int64_t r = D%d; - if (r < 0) { - if (d > 0) r = r + d; - else r = r - d; - } - return (Int_t){.small=((r+1)<<2)|1}; - } - return Int$slow_modulo1(x, y); -} - -static inline Int_t Int$left_shifted(Int_t x, Int_t y) { - if (__builtin_expect(((x.small & y.small) & 1) != 0, 1)) { - const int64_t z = ((x.small>>2) << (y.small>>2))<<2; - if (__builtin_expect(z == (int32_t)z, 1)) - return (Int_t){.small=z+1}; - } - return Int$slow_left_shifted(x, y); -} - -static inline Int_t Int$right_shifted(Int_t x, Int_t y) { - if (__builtin_expect(((x.small & y.small) & 1) != 0, 1)) { - const int64_t z = ((x.small>>2) >> (y.small>>2))<<2; - if (__builtin_expect(z == (int32_t)z, 1)) - return (Int_t){.small=z+1}; - } - return Int$slow_right_shifted(x, y); -} - -static inline Int_t Int$bit_and(Int_t x, Int_t y) { - const int64_t z = x.small & y.small; - if (__builtin_expect((z & 1) == 1, 1)) - return (Int_t){.small=z}; - return Int$slow_bit_and(x, y); -} - -static inline Int_t Int$bit_or(Int_t x, Int_t y) { - if (__builtin_expect(((x.small & y.small) & 1) == 1, 1)) - return (Int_t){.small=(x.small | y.small)}; - return Int$slow_bit_or(x, y); -} - -static inline Int_t Int$bit_xor(Int_t x, Int_t y) { - if (__builtin_expect(((x.small & y.small) & 1) == 1, 1)) - return (Int_t){.small=(x.small ^ y.small) | 1}; - return Int$slow_bit_xor(x, y); -} - -static inline Int_t Int$negated(Int_t x) -{ - if (__builtin_expect((x.small & 1), 1)) - return (Int_t){.small=(~x.small) ^ 3}; - return Int$slow_negated(x); -} - -static inline Int_t Int$negative(Int_t x) -{ - if (__builtin_expect((x.small & 1), 1)) - return (Int_t){.small=((-((x.small)>>2))<<2) | 1}; - return Int$slow_negative(x); -} - -static inline bool Int$is_negative(Int_t x) -{ - if (__builtin_expect((x.small & 1), 1)) - return x.small < 0; - return Int$compare_value(x, I_small(0)) < 0; -} - -// Conversion functions: - -static inline Int_t Int64_to_Int(int64_t i) -{ - int64_t z = i<<2; - 
if (__builtin_expect(z == (int32_t)z, 1)) - return (Int_t){.small=z+1}; - mpz_t result; - mpz_init_set_si(result, i); - return Int$from_mpz(result); -} - -#define Int32_to_Int(i) Int64_to_Int(i) -#define Int16_to_Int(i) Int64_to_Int(i) -#define Int8_to_Int(i) Int64_to_Int(i) - -#pragma GCC diagnostic ignored "-Winline" -PUREFUNC static inline Int64_t Int_to_Int64(Int_t i, bool truncate) { - if (__builtin_expect(i.small & 1, 1)) - return (int64_t)(i.small >> 2); - if (__builtin_expect(!truncate && !mpz_fits_slong_p(*i.big), 0)) - fail("Integer is too big to fit in a 64-bit integer!"); - return mpz_get_si(*i.big); -} - -PUREFUNC static inline Int32_t Int_to_Int32(Int_t i, bool truncate) { - int64_t i64 = Int_to_Int64(i, truncate); - int32_t i32 = (int32_t)i64; - if (__builtin_expect(i64 != i32 && !truncate, 0)) - fail("Integer is too big to fit in a 32-bit integer!"); - return i32; -} - -PUREFUNC static inline Int16_t Int_to_Int16(Int_t i, bool truncate) { - int64_t i64 = Int_to_Int64(i, truncate); - int16_t i16 = (int16_t)i64; - if (__builtin_expect(i64 != i16 && !truncate, 0)) - fail("Integer is too big to fit in a 16-bit integer!"); - return i16; -} - -PUREFUNC static inline Int8_t Int_to_Int8(Int_t i, bool truncate) { - int64_t i64 = Int_to_Int64(i, truncate); - int8_t i8 = (int8_t)i64; - if (__builtin_expect(i64 != i8 && !truncate, 0)) - fail("Integer is too big to fit in an 8-bit integer!"); - return i8; -} - -PUREFUNC static inline Int_t Num_to_Int(double n) -{ - mpz_t result; - mpz_init_set_d(result, n); - return Int$from_mpz(result); -} - -PUREFUNC static inline double Int_to_Num(Int_t i) -{ - if (__builtin_expect(i.small & 1, 1)) - return (double)(i.small >> 2); - - return mpz_get_d(*i.big); -} - -#define Int_to_Num32(i) (Num32_t)Int_to_Num(i) - -#define CONVERSION_FUNC(hi, lo) \ - PUREFUNC static inline int##lo##_t Int##hi##_to_Int##lo(int##hi##_t i, bool truncate) { \ - if (__builtin_expect(!truncate && (i != (int##lo##_t)i), 0)) \ - fail("Cannot truncate 
the Int" #hi " %ld to an Int" #lo, (int64_t)i); \ - return (int##lo##_t)i; \ - } - -CONVERSION_FUNC(64, 32) -CONVERSION_FUNC(64, 16) -CONVERSION_FUNC(64, 8) -CONVERSION_FUNC(32, 16) -CONVERSION_FUNC(32, 8) -CONVERSION_FUNC(16, 8) -#undef CONVERSION_FUNC - -#pragma GCC diagnostic ignored "-Wfloat-equal" -#define CONVERSION_FUNC(num, int_type) \ - PUREFUNC static inline int_type##_t num##_to_##int_type(num##_t n, bool truncate) { \ - num##_t rounded = (num##_t)round((double)n); \ - if (__builtin_expect(!truncate && (num##_t)(int_type##_t)rounded != rounded, 0)) \ - fail("Cannot truncate the " #num " %g to an " #int_type, (double)rounded); \ - return (int_type##_t)rounded; \ - } - -CONVERSION_FUNC(Num, Int64) -CONVERSION_FUNC(Num, Int32) -CONVERSION_FUNC(Num, Int16) -CONVERSION_FUNC(Num, Int8) -CONVERSION_FUNC(Num32, Int64) -CONVERSION_FUNC(Num32, Int32) -CONVERSION_FUNC(Num32, Int16) -CONVERSION_FUNC(Num32, Int8) -#undef CONVERSION_FUNC - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/memory.c b/builtins/memory.c deleted file mode 100644 index 9d7dbc80..00000000 --- a/builtins/memory.c +++ /dev/null @@ -1,28 +0,0 @@ -// Type info and methods for "Memory" opaque type -#include -#include -#include -#include -#include -#include -#include - -#include "memory.h" -#include "text.h" -#include "types.h" -#include "util.h" - -public Text_t Memory__as_text(const void *p, bool colorize, const TypeInfo *type) { - (void)type; - if (!p) return Text("Memory"); - return Text$format(colorize ? 
"\x1b[0;34;1mMemory<%p>\x1b[m" : "Memory<%p>", p); -} - -public const TypeInfo Memory$info = { - .size=0, - .align=0, - .tag=CustomInfo, - .CustomInfo={.as_text=(void*)Memory__as_text}, -}; - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/memory.h b/builtins/memory.h deleted file mode 100644 index 701ea68a..00000000 --- a/builtins/memory.h +++ /dev/null @@ -1,13 +0,0 @@ -#pragma once - -// Type info and methods for "Memory" opaque type - -#include -#include - -#include "types.h" - -extern const TypeInfo Memory$info; -Text_t Memory$as_text(const void *p, bool colorize, const TypeInfo *type); - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/metamethods.c b/builtins/metamethods.c deleted file mode 100644 index 9b0560ab..00000000 --- a/builtins/metamethods.c +++ /dev/null @@ -1,124 +0,0 @@ -// Metamethods are methods that all types share for hashing, equality, comparison, and textifying - -#include -#include - -#include "arrays.h" -#include "channels.h" -#include "functiontype.h" -#include "metamethods.h" -#include "optionals.h" -#include "pointers.h" -#include "siphash.h" -#include "tables.h" -#include "text.h" -#include "util.h" - - -PUREFUNC public uint64_t generic_hash(const void *obj, const TypeInfo *type) -{ - switch (type->tag) { - case TextInfo: return Text$hash((void*)obj); - case ArrayInfo: return Array$hash(obj, type); - case ChannelInfo: return Channel$hash((Channel_t**)obj, type); - case TableInfo: return Table$hash(obj, type); - case OptionalInfo: return is_null(obj, type->OptionalInfo.type) ? 
0 : generic_hash(obj, type->OptionalInfo.type); - case EmptyStructInfo: return 0; - case CustomInfo: case StructInfo: case EnumInfo: case CStringInfo: // These all share the same info - if (!type->CustomInfo.hash) - goto hash_data; - return type->CustomInfo.hash(obj, type); - case PointerInfo: case FunctionInfo: case TypeInfoInfo: case OpaqueInfo: default: { - hash_data:; - return siphash24((void*)obj, (size_t)(type->size)); - } - } -} - -PUREFUNC public int32_t generic_compare(const void *x, const void *y, const TypeInfo *type) -{ - if (x == y) return 0; - - switch (type->tag) { - case PointerInfo: case FunctionInfo: return Pointer$compare(x, y, type); - case TextInfo: return Text$compare(x, y); - case ArrayInfo: return Array$compare(x, y, type); - case ChannelInfo: return Channel$compare((Channel_t**)x, (Channel_t**)y, type); - case TableInfo: return Table$compare(x, y, type); - case OptionalInfo: { - bool x_is_null = is_null(x, type->OptionalInfo.type); - bool y_is_null = is_null(y, type->OptionalInfo.type); - if (x_is_null && y_is_null) return 0; - else if (x_is_null != y_is_null) return (int32_t)y_is_null - (int32_t)x_is_null; - else return generic_compare(x, y, type->OptionalInfo.type); - } - case EmptyStructInfo: return 0; - case CustomInfo: case StructInfo: case EnumInfo: case CStringInfo: // These all share the same info - if (!type->CustomInfo.compare) - goto compare_data; - return type->CustomInfo.compare(x, y, type); - case TypeInfoInfo: case OpaqueInfo: default: - compare_data: - return (int32_t)memcmp((void*)x, (void*)y, (size_t)(type->size)); - } -} - -PUREFUNC public bool generic_equal(const void *x, const void *y, const TypeInfo *type) -{ - if (x == y) return true; - - switch (type->tag) { - case PointerInfo: case FunctionInfo: return Pointer$equal(x, y, type); - case TextInfo: return Text$equal(x, y); - case ArrayInfo: return Array$equal(x, y, type); - case ChannelInfo: return Channel$equal((Channel_t**)x, (Channel_t**)y, type); - case TableInfo: 
return Table$equal(x, y, type); - case EmptyStructInfo: return true; - case OptionalInfo: { - bool x_is_null = is_null(x, type->OptionalInfo.type); - bool y_is_null = is_null(y, type->OptionalInfo.type); - if (x_is_null && y_is_null) return true; - else if (x_is_null != y_is_null) return false; - else return generic_equal(x, y, type->OptionalInfo.type); - } - case CustomInfo: case StructInfo: case EnumInfo: case CStringInfo: // These all share the same info - if (!type->CustomInfo.equal) - goto use_generic_compare; - return type->CustomInfo.equal(x, y, type); - case TypeInfoInfo: case OpaqueInfo: default: - use_generic_compare: - return (generic_compare(x, y, type) == 0); - } -} - -public Text_t generic_as_text(const void *obj, bool colorize, const TypeInfo *type) -{ - switch (type->tag) { - case PointerInfo: return Pointer$as_text(obj, colorize, type); - case FunctionInfo: return Func$as_text(obj, colorize, type); - case TextInfo: return Text$as_text(obj, colorize, type); - case ArrayInfo: return Array$as_text(obj, colorize, type); - case ChannelInfo: return Channel$as_text((Channel_t**)obj, colorize, type); - case TableInfo: return Table$as_text(obj, colorize, type); - case TypeInfoInfo: return Type$as_text(obj, colorize, type); - case OptionalInfo: return Optional$as_text(obj, colorize, type); - case EmptyStructInfo: return colorize ? 
- Text$concat(Text("\x1b[0;1m"), Text$from_str(type->EmptyStructInfo.name), Text("\x1b[m()")) - : Text$concat(Text$from_str(type->EmptyStructInfo.name), Text("()")); - case CustomInfo: case StructInfo: case EnumInfo: case CStringInfo: // These all share the same info - if (!type->CustomInfo.as_text) - fail("No text function provided for type!\n"); - return type->CustomInfo.as_text(obj, colorize, type); - case OpaqueInfo: return Text("???"); - default: errx(1, "Invalid type tag: %d", type->tag); - } -} - -public int generic_print(const void *obj, bool colorize, const TypeInfo *type) -{ - Text_t text = generic_as_text(obj, colorize, type); - return Text$print(stdout, text) + printf("\n"); -} - - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/metamethods.h b/builtins/metamethods.h deleted file mode 100644 index be712a61..00000000 --- a/builtins/metamethods.h +++ /dev/null @@ -1,15 +0,0 @@ -#pragma once -// Metamethods are methods that all types share: - -#include - -#include "types.h" -#include "util.h" - -PUREFUNC uint64_t generic_hash(const void *obj, const TypeInfo *type); -PUREFUNC int32_t generic_compare(const void *x, const void *y, const TypeInfo *type); -PUREFUNC bool generic_equal(const void *x, const void *y, const TypeInfo *type); -Text_t generic_as_text(const void *obj, bool colorize, const TypeInfo *type); -int generic_print(const void *obj, bool colorize, const TypeInfo *type); - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/nums.c b/builtins/nums.c deleted file mode 100644 index 1956140a..00000000 --- a/builtins/nums.c +++ /dev/null @@ -1,178 +0,0 @@ -// Type infos and methods for Nums (floating point) - -#include -#include -#include -#include -#include -#include - -#include "arrays.h" -#include "nums.h" -#include "string.h" -#include "text.h" -#include "types.h" - -public PUREFUNC Text_t Num$as_text(const double *f, bool colorize, const TypeInfo *type) { - (void)type; - if (!f) return Text("Num"); - return 
Text$format(colorize ? "\x1b[35m%.16g\x1b[33;2m\x1b[m" : "%.16g", *f); -} - -public PUREFUNC int32_t Num$compare(const double *x, const double *y, const TypeInfo *type) { - (void)type; - return (*x > *y) - (*x < *y); -} - -public PUREFUNC bool Num$equal(const double *x, const double *y, const TypeInfo *type) { - (void)type; - return *x == *y; -} - -public CONSTFUNC bool Num$near(double a, double b, double ratio, double absolute) { - if (ratio < 0) ratio = 0; - else if (ratio > 1) ratio = 1; - - if (a == b) return true; - - double diff = fabs(a - b); - if (diff < absolute) return true; - else if (isnan(diff)) return false; - - double epsilon = fabs(a * ratio) + fabs(b * ratio); - if (isinf(epsilon)) epsilon = DBL_MAX; - return (diff < epsilon); -} - -public Text_t Num$format(double f, Int_t precision) { - return Text$format("%.*f", (int)Int_to_Int64(precision, false), f); -} - -public Text_t Num$scientific(double f, Int_t precision) { - return Text$format("%.*e", (int)Int_to_Int64(precision, false), f); -} - -public double Num$mod(double num, double modulus) { - double result = fmod(num, modulus); - return (result < 0) != (modulus < 0) ? 
result + modulus : result; -} - -public double Num$random(void) { - return drand48(); -} - -public CONSTFUNC double Num$mix(double amount, double x, double y) { - return (1.0-amount)*x + amount*y; -} - -public OptionalNum_t Num$from_text(Text_t text) { - const char *str = Text$as_c_string(text); - char *end = NULL; - double d = strtod(str, &end); - if (end > str && end[0] == '\0') - return d; - else - return nan("null"); -} - -public double Num$nan(Text_t tag) { - return nan(Text$as_c_string(tag)); -} - -public CONSTFUNC bool Num$isinf(double n) { return !!isinf(n); } -public CONSTFUNC bool Num$finite(double n) { return !!finite(n); } -public CONSTFUNC bool Num$isnan(double n) { return !!isnan(n); } - -public const TypeInfo Num$info = { - .size=sizeof(double), - .align=__alignof__(double), - .tag=CustomInfo, - .CustomInfo={ - .compare=(void*)Num$compare, - .equal=(void*)Num$equal, - .as_text=(void*)Num$as_text, - }, -}; - -public PUREFUNC Text_t Num32$as_text(const float *f, bool colorize, const TypeInfo *type) { - (void)type; - if (!f) return Text("Num32"); - return Text$format(colorize ? 
"\x1b[35m%.8g_f32\x1b[33;2m\x1b[m" : "%.8g_f32", (double)*f); -} - -public PUREFUNC int32_t Num32$compare(const float *x, const float *y, const TypeInfo *type) { - (void)type; - return (*x > *y) - (*x < *y); -} - -public PUREFUNC bool Num32$equal(const float *x, const float *y, const TypeInfo *type) { - (void)type; - return *x == *y; -} - -public CONSTFUNC bool Num32$near(float a, float b, float ratio, float absolute) { - if (ratio < 0) ratio = 0; - else if (ratio > 1) ratio = 1; - - if (a == b) return true; - - float diff = fabs(a - b); - if (diff < absolute) return true; - else if (isnan(diff)) return false; - - float epsilon = fabs(a * ratio) + fabs(b * ratio); - if (isinf(epsilon)) epsilon = FLT_MAX; - return (diff < epsilon); -} - -public Text_t Num32$format(float f, Int_t precision) { - return Text$format("%.*f", (int)Int_to_Int64(precision, false), (double)f); -} - -public Text_t Num32$scientific(float f, Int_t precision) { - return Text$format("%.*e", (int)Int_to_Int64(precision, false), (double)f); -} - -public float Num32$mod(float num, float modulus) { - float result = fmodf(num, modulus); - return (result < 0) != (modulus < 0) ? 
result + modulus : result; -} - -public float Num32$random(void) { - return (float)drand48(); -} - -public CONSTFUNC float Num32$mix(float amount, float x, float y) { - return (1.0f-amount)*x + amount*y; -} - -public OptionalNum32_t Num32$from_text(Text_t text) { - const char *str = Text$as_c_string(text); - char *end = NULL; - double d = strtod(str, &end); - if (end > str && end[0] == '\0') - return d; - else - return nan("null"); -} - -public float Num32$nan(Text_t tag) { - return nanf(Text$as_c_string(tag)); -} - -public CONSTFUNC bool Num32$isinf(float n) { return isinf(n); } -public CONSTFUNC bool Num32$finite(float n) { return finite(n); } -public CONSTFUNC bool Num32$isnan(float n) { return isnan(n); } - -public const TypeInfo Num32$info = { - .size=sizeof(float), - .align=__alignof__(float), - .tag=CustomInfo, - .CustomInfo={ - .compare=(void*)Num32$compare, - .equal=(void*)Num32$equal, - .as_text=(void*)Num32$as_text, - }, -}; - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/nums.h b/builtins/nums.h deleted file mode 100644 index 78f32c1e..00000000 --- a/builtins/nums.h +++ /dev/null @@ -1,60 +0,0 @@ -#pragma once - -// Type infos and methods for Nums (floating point) - -#include -#include -#include - -#include "types.h" -#include "util.h" - -#define Num_t double -#define Num32_t float -#define OptionalNum_t double -#define OptionalNum32_t float -#define N32(n) ((float)n) -#define N64(n) ((double)n) - -Text_t Num$as_text(const double *f, bool colorize, const TypeInfo *type); -PUREFUNC int32_t Num$compare(const double *x, const double *y, const TypeInfo *type); -PUREFUNC bool Num$equal(const double *x, const double *y, const TypeInfo *type); -CONSTFUNC bool Num$near(double a, double b, double ratio, double absolute); -Text_t Num$format(double f, Int_t precision); -Text_t Num$scientific(double f, Int_t precision); -double Num$mod(double num, double modulus); -CONSTFUNC bool Num$isinf(double n); -CONSTFUNC bool Num$finite(double n); 
-CONSTFUNC bool Num$isnan(double n); -double Num$nan(Text_t tag); -double Num$random(void); -CONSTFUNC double Num$mix(double amount, double x, double y); -OptionalNum_t Num$from_text(Text_t text); -CONSTFUNC static inline double Num$clamped(double x, double low, double high) { - return (x <= low) ? low : (x >= high ? high : x); -} -extern const TypeInfo Num$info; - -Text_t Num32$as_text(const float *f, bool colorize, const TypeInfo *type); -PUREFUNC int32_t Num32$compare(const float *x, const float *y, const TypeInfo *type); -PUREFUNC bool Num32$equal(const float *x, const float *y, const TypeInfo *type); -CONSTFUNC bool Num32$near(float a, float b, float ratio, float absolute); -Text_t Num32$format(float f, Int_t precision); -Text_t Num32$scientific(float f, Int_t precision); -float Num32$mod(float num, float modulus); -CONSTFUNC bool Num32$isinf(float n); -CONSTFUNC bool Num32$finite(float n); -CONSTFUNC bool Num32$isnan(float n); -float Num32$random(void); -CONSTFUNC float Num32$mix(float amount, float x, float y); -OptionalNum32_t Num32$from_text(Text_t text); -float Num32$nan(Text_t tag); -CONSTFUNC static inline float Num32$clamped(float x, float low, float high) { - return (x <= low) ? low : (x >= high ? 
high : x); -} -extern const TypeInfo Num32$info; - -#define Num_to_Num32(n) ((Num32_t)(n)) -#define Num32_to_Num(n) ((Num_t)(n)) - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/optionals.c b/builtins/optionals.c deleted file mode 100644 index b6ca8dfb..00000000 --- a/builtins/optionals.c +++ /dev/null @@ -1,73 +0,0 @@ -// Optional types - -#include - -#include "bools.h" -#include "datatypes.h" -#include "integers.h" -#include "metamethods.h" -#include "threads.h" -#include "text.h" -#include "util.h" - -public const Array_t NULL_ARRAY = {.length=-1}; -public const OptionalBool_t NULL_BOOL = 2; -public const Int_t NULL_INT = {.small=0}; -public const Table_t NULL_TABLE = {.entries.length=-1}; -public const Closure_t NULL_CLOSURE = {.fn=NULL}; -public const Text_t NULL_TEXT = {.length=-1}; - -public PUREFUNC bool is_null(const void *obj, const TypeInfo *non_optional_type) -{ - if (non_optional_type == &Int$info) - return ((Int_t*)obj)->small == 0; - else if (non_optional_type == &Bool$info) - return *((OptionalBool_t*)obj) == NULL_BOOL; - else if (non_optional_type == &Num$info) - return isnan(*((Num_t*)obj)); - else if (non_optional_type == &Int64$info) - return ((OptionalInt64_t*)obj)->is_null; - else if (non_optional_type == &Int32$info) - return ((OptionalInt32_t*)obj)->is_null; - else if (non_optional_type == &Int16$info) - return ((OptionalInt16_t*)obj)->is_null; - else if (non_optional_type == &Int8$info) - return ((OptionalInt8_t*)obj)->is_null; - else if (non_optional_type == &Thread) - return *(pthread_t**)obj == NULL; - - switch (non_optional_type->tag) { - case ChannelInfo: return *(Channel_t**)obj == NULL; - case PointerInfo: return *(void**)obj == NULL; - case TextInfo: return ((Text_t*)obj)->length < 0; - case ArrayInfo: return ((Array_t*)obj)->length < 0; - case TableInfo: return ((Table_t*)obj)->entries.length < 0; - case FunctionInfo: return *(void**)obj == NULL; - case StructInfo: { - int64_t offset = non_optional_type->size; 
- if (offset % non_optional_type->align) - offset += non_optional_type->align - (offset % non_optional_type->align); - return *(bool*)(obj + offset); - } - case EnumInfo: return (*(int*)obj) == 0; // NULL tag - case CStringInfo: return (*(char**)obj) == NULL; - default: { - Text_t t = generic_as_text(NULL, false, non_optional_type); - errx(1, "is_null() not implemented for: %k", &t); - } - } -} - -#pragma GCC diagnostic ignored "-Wstack-protector" -public Text_t Optional$as_text(const void *obj, bool colorize, const TypeInfo *type) -{ - if (!obj) - return Text$concat(generic_as_text(obj, colorize, type->OptionalInfo.type), Text("?")); - - if (is_null(obj, type->OptionalInfo.type)) - return Text$concat(colorize ? Text("\x1b[31m!") : Text("!"), generic_as_text(NULL, false, type->OptionalInfo.type), - colorize ? Text("\x1b[m") : Text("")); - return Text$concat(generic_as_text(obj, colorize, type->OptionalInfo.type), colorize ? Text("\x1b[33m?\x1b[m") : Text("?")); -} - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1 diff --git a/builtins/optionals.h b/builtins/optionals.h deleted file mode 100644 index e37d5345..00000000 --- a/builtins/optionals.h +++ /dev/null @@ -1,23 +0,0 @@ -#pragma once - -// Optional types - -#include -#include - -#include "types.h" -#include "util.h" - -#define OptionalBool_t uint8_t - -extern const OptionalBool_t NULL_BOOL; -extern const Table_t NULL_TABLE; -extern const Array_t NULL_ARRAY; -extern const Int_t NULL_INT; -extern const Closure_t NULL_CLOSURE; -extern const Text_t NULL_TEXT; - -PUREFUNC bool is_null(const void *obj, const TypeInfo *non_optional_type); -Text_t Optional$as_text(const void *obj, bool colorize, const TypeInfo *type); - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/paths.c b/builtins/paths.c deleted file mode 100644 index 231a7c23..00000000 --- a/builtins/paths.c +++ /dev/null @@ -1,481 +0,0 @@ -// A lang for filesystem paths -#include -#include -#include -#include -#include -#include -#include 
-#include -#include -#include -#include -#include - -#include "arrays.h" -#include "files.h" -#include "integers.h" -#include "optionals.h" -#include "paths.h" -#include "patterns.h" -#include "text.h" -#include "types.h" -#include "util.h" - -PUREFUNC public Path_t Path$escape_text(Text_t text) -{ - if (Text$has(text, Pattern("/"))) - fail("Path interpolations cannot contain slashes: %k", &text); - else if (Text$has(text, Pattern(";"))) - fail("Path interpolations cannot contain semicolons: %k", &text); - else if (Text$equal_values(text, Path(".")) || Text$equal_values(text, Path(".."))) - fail("Path interpolation is \"%k\" which is disallowed to prevent security vulnerabilities", &text); - return (Path_t)text; -} - -PUREFUNC public Path_t Path$escape_path(Path_t path) -{ - if (Text$starts_with(path, Path("~/")) || Text$starts_with(path, Path("/"))) - fail("Invalid path component: %k", &path); - return path; -} - -public Path_t Path$cleanup(Path_t path) -{ - if (!Text$starts_with(path, Path("/")) && !Text$starts_with(path, Path("./")) - && !Text$starts_with(path, Path("../")) && !Text$starts_with(path, Path("~/"))) - path = Text$concat(Text("./"), path); - - // Not fully resolved, but at least get rid of some of the cruft like "/./" - // and "/foo/../" and "//" - bool trailing_slash = Text$ends_with(path, Path("/")); - Array_t components = Text$split(path, Pattern("/")); - if (components.length == 0) return Path("/"); - Path_t root = *(Path_t*)components.data; - Array$remove_at(&components, I(1), I(1), sizeof(Path_t)); - - for (int64_t i = 0; i < components.length; ) { - Path_t component = *(Path_t*)(components.data + i*components.stride); - if (component.length == 0 || Text$equal_values(component, Path("."))) { // Skip (//) and (/./) - Array$remove_at(&components, I(i+1), I(1), sizeof(Path_t)); - } else if (Text$equal_values(component, Path(".."))) { - if (i == 0) { - if (root.length == 0) { // (/..) 
-> (/) - Array$remove_at(&components, I(i+1), I(1), sizeof(Path_t)); - i += 1; - } else if (Text$equal_values(root, Path("."))) { // (./..) -> (..) - root = Path(".."); - Array$remove_at(&components, I(i+1), I(1), sizeof(Path_t)); - i += 1; - } else if (Text$equal_values(root, Path("~"))) { - root = Path(""); // Convert $HOME to absolute path: - - Array$remove_at(&components, I(i+1), I(1), sizeof(Path_t)); - // `i` is pointing to where the `..` lived - - const char *home = getenv("HOME"); - if (!home) fail("Could not get $HOME directory!"); - - // Insert all but the last component: - for (const char *p = home + 1; *p; ) { - const char *next_slash = strchr(p, '/'); - if (!next_slash) break; // Skip last component - Path_t home_component = Text$format("%.*s", (int)(next_slash - p), p); - Array$insert(&components, &home_component, I(i+1), sizeof(Path_t)); - i += 1; - p = next_slash + 1; - } - } else { // (../..) -> (../..) - i += 1; - } - } else if (Text$equal(&component, (Path_t*)(components.data + (i-1)*components.stride))) { // (___/../..) -> (____/../..) - i += 1; - } else { // (___/foo/..) 
-> (___) - Array$remove_at(&components, I(i), I(2), sizeof(Path_t)); - i -= 1; - } - } else { // (___/foo/baz) -> (___/foo/baz) - i++; - } - } - - Text_t cleaned_up = Text$concat(root, Text("/"), Text$join(Text("/"), components)); - if (trailing_slash && !Text$ends_with(cleaned_up, Text("/"))) - cleaned_up = Text$concat(cleaned_up, Text("/")); - return cleaned_up; -} - -static inline Path_t Path$_expand_home(Path_t path) -{ - if (Text$starts_with(path, Path("~/"))) { - Path_t after_tilde = Text$slice(path, I(2), I(-1)); - return Text$format("%s%k", getenv("HOME"), &after_tilde); - } else { - return path; - } -} - -public Path_t Path$_concat(int n, Path_t items[n]) -{ - Path_t cleaned_up = Path$cleanup(Text$_concat(n, items)); - if (cleaned_up.length > PATH_MAX) - fail("Path exceeds the maximum path length: %k", &cleaned_up); - return cleaned_up; -} - -public Text_t Path$resolved(Path_t path, Path_t relative_to) -{ - path = Path$cleanup(path); - - const char *path_str = Text$as_c_string(path); - const char *relative_to_str = Text$as_c_string(relative_to); - const char *resolved_path = resolve_path(path_str, relative_to_str, relative_to_str); - if (resolved_path) { - return (Path_t)(Text$from_str(resolved_path)); - } else if (path_str[0] == '/') { - return path; - } else if (path_str[0] == '~' && path_str[1] == '/') { - return (Path_t)Text$format("%s%s", getenv("HOME"), path_str + 1); - } else { - return Text$concat(Path$resolved(relative_to, Path(".")), Path("/"), path); - } -} - -public Text_t Path$relative(Path_t path, Path_t relative_to) -{ - path = Path$resolved(path, relative_to); - relative_to = Path$resolved(relative_to, Path(".")); - if (Text$matches(path, Patterns(Pattern("{start}"), relative_to, Pattern("{0+..}")))) - return Text$slice(path, I(relative_to.length + 2), I(-1)); - return path; -} - -public bool Path$exists(Path_t path) -{ - path = Path$_expand_home(path); - struct stat sb; - return (stat(Text$as_c_string(path), &sb) == 0); -} - -public bool 
Path$is_file(Path_t path, bool follow_symlinks) -{ - path = Path$_expand_home(path); - struct stat sb; - const char *path_str = Text$as_c_string(path); - int status = follow_symlinks ? stat(path_str, &sb) : lstat(path_str, &sb); - if (status != 0) return false; - return (sb.st_mode & S_IFMT) == S_IFREG; -} - -public bool Path$is_directory(Path_t path, bool follow_symlinks) -{ - path = Path$_expand_home(path); - struct stat sb; - const char *path_str = Text$as_c_string(path); - int status = follow_symlinks ? stat(path_str, &sb) : lstat(path_str, &sb); - if (status != 0) return false; - return (sb.st_mode & S_IFMT) == S_IFDIR; -} - -public bool Path$is_pipe(Path_t path, bool follow_symlinks) -{ - path = Path$_expand_home(path); - struct stat sb; - const char *path_str = Text$as_c_string(path); - int status = follow_symlinks ? stat(path_str, &sb) : lstat(path_str, &sb); - if (status != 0) return false; - return (sb.st_mode & S_IFMT) == S_IFIFO; -} - -public bool Path$is_socket(Path_t path, bool follow_symlinks) -{ - path = Path$_expand_home(path); - struct stat sb; - const char *path_str = Text$as_c_string(path); - int status = follow_symlinks ? 
stat(path_str, &sb) : lstat(path_str, &sb); - if (status != 0) return false; - return (sb.st_mode & S_IFMT) == S_IFSOCK; -} - -public bool Path$is_symlink(Path_t path) -{ - path = Path$_expand_home(path); - struct stat sb; - const char *path_str = Text$as_c_string(path); - int status = stat(path_str, &sb); - if (status != 0) return false; - return (sb.st_mode & S_IFMT) == S_IFLNK; -} - -static void _write(Path_t path, Text_t text, int mode, int permissions) -{ - path = Path$_expand_home(path); - const char *path_str = Text$as_c_string(path); - int fd = open(path_str, mode, permissions); - if (fd == -1) - fail("Could not write to file: %s\n%s", path_str, strerror(errno)); - - const char *str = Text$as_c_string(text); - size_t len = strlen(str); - ssize_t written = write(fd, str, len); - if (written != (ssize_t)len) - fail("Could not write to file: %s\n%s", path_str, strerror(errno)); -} - -public void Path$write(Path_t path, Text_t text, int permissions) -{ - _write(path, text, O_WRONLY | O_CREAT, permissions); -} - -public void Path$append(Path_t path, Text_t text, int permissions) -{ - _write(path, text, O_WRONLY | O_APPEND | O_CREAT, permissions); -} - -public Text_t Path$read(Path_t path) -{ - path = Path$_expand_home(path); - int fd = open(Text$as_c_string(path), O_RDONLY); - if (fd == -1) - fail("Could not read file: %k (%s)", &path, strerror(errno)); - - struct stat sb; - if (fstat(fd, &sb) != 0) - fail("Could not read file: %k (%s)", &path, strerror(errno)); - - if ((sb.st_mode & S_IFMT) == S_IFREG) { // Use memory mapping if it's a real file: - const char *mem = mmap(NULL, (size_t)sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0); - char *gc_mem = GC_MALLOC_ATOMIC((size_t)sb.st_size+1); - memcpy(gc_mem, mem, (size_t)sb.st_size); - gc_mem[sb.st_size] = '\0'; - close(fd); - return Text$from_strn(gc_mem, (size_t)sb.st_size); - } else { - size_t capacity = 256, len = 0; - char *content = GC_MALLOC_ATOMIC(capacity); - for (;;) { - char chunk[256]; - ssize_t just_read = 
read(fd, chunk, sizeof(chunk)); - if (just_read < 0) - fail("Failed while reading file: %k (%s)", &path, strerror(errno)); - else if (just_read == 0) { - if (errno == EAGAIN || errno == EINTR) - continue; - break; - } - - if (len + (size_t)just_read >= capacity) { - content = GC_REALLOC(content, (capacity *= 2)); - } - - memcpy(&content[len], chunk, (size_t)just_read); - len += (size_t)just_read; - - if ((size_t)just_read < sizeof(chunk)) - break; - } - close(fd); - - if (u8_check((uint8_t*)content, len) != NULL) - fail("File does not contain valid UTF8 data!"); - - return Text$from_strn(content, len); - } -} - -public void Path$remove(Path_t path, bool ignore_missing) -{ - path = Path$_expand_home(path); - const char *path_str = Text$as_c_string(path); - struct stat sb; - if (lstat(path_str, &sb) != 0) { - if (!ignore_missing) - fail("Could not remove file: %s (%s)", path_str, strerror(errno)); - } - - if ((sb.st_mode & S_IFMT) == S_IFREG || (sb.st_mode & S_IFMT) == S_IFLNK) { - if (unlink(path_str) != 0 && !ignore_missing) - fail("Could not remove file: %s (%s)", path_str, strerror(errno)); - } else if ((sb.st_mode & S_IFMT) == S_IFDIR) { - if (rmdir(path_str) != 0 && !ignore_missing) - fail("Could not remove directory: %s (%s)", path_str, strerror(errno)); - } else { - fail("Could not remove path: %s (not a file or directory)", path_str, strerror(errno)); - } -} - -public void Path$create_directory(Path_t path, int permissions) -{ - path = Path$_expand_home(path); - if (mkdir(Text$as_c_string(path), (mode_t)permissions) != 0) - fail("Could not create directory: %k (%s)", &path, strerror(errno)); -} - -static Array_t _filtered_children(Path_t path, bool include_hidden, mode_t filter) -{ - path = Path$_expand_home(path); - struct dirent *dir; - Array_t children = {}; - const char *path_str = Text$as_c_string(path); - size_t path_len = strlen(path_str); - DIR *d = opendir(path_str); - if (!d) - fail("Could not open directory: %k (%s)", &path, strerror(errno)); - - 
if (path_str[path_len-1] == '/') - --path_len; - - while ((dir = readdir(d)) != NULL) { - if (!include_hidden && dir->d_name[0] == '.') - continue; - if (streq(dir->d_name, ".") || streq(dir->d_name, "..")) - continue; - - const char *child_str = heap_strf("%.*s/%s", path_len, path_str, dir->d_name); - struct stat sb; - if (stat(child_str, &sb) != 0) - continue; - if (!((sb.st_mode & S_IFMT) & filter)) - continue; - - Path_t child = Text$format("%s%s", child_str, ((sb.st_mode & S_IFMT) == S_IFDIR) ? "/" : ""); // Trailing slash for dirs - Array$insert(&children, &child, I(0), sizeof(Path_t)); - } - closedir(d); - return children; -} - -public Array_t Path$children(Path_t path, bool include_hidden) -{ - return _filtered_children(path, include_hidden, (mode_t)-1); -} - -public Array_t Path$files(Path_t path, bool include_hidden) -{ - return _filtered_children(path, include_hidden, S_IFREG); -} - -public Array_t Path$subdirectories(Path_t path, bool include_hidden) -{ - return _filtered_children(path, include_hidden, S_IFDIR); -} - -public Path_t Path$unique_directory(Path_t path) -{ - path = Path$_expand_home(path); - const char *path_str = Text$as_c_string(path); - size_t len = strlen(path_str); - if (len >= PATH_MAX) fail("Path is too long: %s", path_str); - char buf[PATH_MAX] = {}; - strcpy(buf, path_str); - if (buf[len-1] == '/') - buf[--len] = '\0'; - char *created = mkdtemp(buf); - if (!created) fail("Failed to create temporary directory: %s (%s)", path_str, strerror(errno)); - return Text$format("%s/", created); -} - -public Text_t Path$write_unique(Path_t path, Text_t text) -{ - path = Path$_expand_home(path); - const char *path_str = Text$as_c_string(path); - size_t len = strlen(path_str); - if (len >= PATH_MAX) fail("Path is too long: %s", path_str); - char buf[PATH_MAX] = {}; - strcpy(buf, path_str); - - int64_t suffixlen = 0; - (void)Text$find(path, Pattern("{0+!X}{end}"), I(1), &suffixlen); - if (suffixlen < 0) suffixlen = 0; - - int fd = mkstemps(buf, 
suffixlen); - if (fd == -1) - fail("Could not write to unique file: %s\n%s", buf, strerror(errno)); - - const char *str = Text$as_c_string(text); - size_t write_len = strlen(str); - ssize_t written = write(fd, str, write_len); - if (written != (ssize_t)write_len) - fail("Could not write to file: %s\n%s", buf, strerror(errno)); - return Text$format("%s", buf); -} - -public Path_t Path$parent(Path_t path) -{ - return Path$cleanup(Text$concat(path, Path("/../"))); -} - -public Text_t Path$base_name(Path_t path) -{ - path = Path$cleanup(path); - if (Text$ends_with(path, Path("/"))) - return Text$replace(path, Pattern("{0+..}/{!/}/{end}"), Text("@2"), Text("@"), false); - else - return Text$replace(path, Pattern("{0+..}/{!/}{end}"), Text("@2"), Text("@"), false); -} - -public Text_t Path$extension(Path_t path, bool full) -{ - Text_t base = Path$base_name(path); - if (Text$matches(base, Pattern(".{!.}.{..}"))) - return Text$replace(base, full ? Pattern(".{!.}.{..}") : Pattern(".{..}.{!.}{end}"), Text("@2"), Text("@"), false); - else if (Text$matches(base, Pattern("{!.}.{..}"))) - return Text$replace(base, full ? 
Pattern("{!.}.{..}") : Pattern("{..}.{!.}{end}"), Text("@2"), Text("@"), false); - else - return Text(""); -} - -static void _line_reader_cleanup(FILE **f) -{ - if (f && *f) { - fclose(*f); - *f = NULL; - } -} - -static Text_t _next_line(FILE **f) -{ - if (!f || !*f) return NULL_TEXT; - - char *line = NULL; - size_t size = 0; - ssize_t len = getline(&line, &size, *f); - if (len <= 0) { - _line_reader_cleanup(f); - return NULL_TEXT; - } - - while (len > 0 && (line[len-1] == '\r' || line[len-1] == '\n')) - --len; - - if (u8_check((uint8_t*)line, (size_t)len) != NULL) - fail("Invalid UTF8!"); - - Text_t line_text = Text$format("%.*s", len, line); - free(line); - return line_text; -} - -public Closure_t Path$by_line(Path_t path) -{ - path = Path$_expand_home(path); - - FILE *f = fopen(Text$as_c_string(path), "r"); - if (f == NULL) - fail("Could not read file: %k (%s)", &path, strerror(errno)); - - FILE **wrapper = GC_MALLOC(sizeof(FILE*)); - *wrapper = f; - GC_register_finalizer(wrapper, (void*)_line_reader_cleanup, NULL, NULL, NULL); - return (Closure_t){.fn=(void*)_next_line, .userdata=wrapper}; -} - -public const TypeInfo Path$info = { - .size=sizeof(Path_t), - .align=__alignof__(Path_t), - .tag=TextInfo, - .TextInfo={.lang="Path"}, -}; - - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/paths.h b/builtins/paths.h deleted file mode 100644 index e0d85258..00000000 --- a/builtins/paths.h +++ /dev/null @@ -1,50 +0,0 @@ -#pragma once - -// A lang for filesystem paths - -#include -#include - -#include "types.h" -#include "datatypes.h" - -#define Path_t Text_t -#define Path(text) ((Path_t)Text(text)) -#define Paths(...) 
Path$_concat(sizeof((Path_t[]){__VA_ARGS__})/sizeof(Path_t), (Path_t[]){__VA_ARGS__}) - -Path_t Path$cleanup(Path_t path); -Path_t Path$_concat(int n, Path_t items[n]); -#define Path$concat(a, b) Paths(a, Path("/"), b) -PUREFUNC Path_t Path$escape_text(Text_t text); -PUREFUNC Path_t Path$escape_path(Text_t path); -Path_t Path$resolved(Path_t path, Path_t relative_to); -Path_t Path$relative(Path_t path, Path_t relative_to); -bool Path$exists(Path_t path); -bool Path$is_file(Path_t path, bool follow_symlinks); -bool Path$is_directory(Path_t path, bool follow_symlinks); -bool Path$is_pipe(Path_t path, bool follow_symlinks); -bool Path$is_socket(Path_t path, bool follow_symlinks); -bool Path$is_symlink(Path_t path); -void Path$write(Path_t path, Text_t text, int permissions); -void Path$append(Path_t path, Text_t text, int permissions); -Text_t Path$read(Path_t path); -void Path$remove(Path_t path, bool ignore_missing); -void Path$create_directory(Path_t path, int permissions); -Array_t Path$children(Path_t path, bool include_hidden); -Array_t Path$files(Path_t path, bool include_hidden); -Array_t Path$subdirectories(Path_t path, bool include_hidden); -Path_t Path$unique_directory(Path_t path); -Text_t Path$write_unique(Path_t path, Text_t text); -Path_t Path$parent(Path_t path); -Text_t Path$base_name(Path_t path); -Text_t Path$extension(Path_t path, bool full); -Closure_t Path$by_line(Path_t path); - -#define Path$hash Text$hash -#define Path$compare Text$compare -#define Path$equal Text$equal - -extern const TypeInfo Path$info; - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 - diff --git a/builtins/patterns.c b/builtins/patterns.c deleted file mode 100644 index 81beaffe..00000000 --- a/builtins/patterns.c +++ /dev/null @@ -1,1064 +0,0 @@ -// Logic for text pattern matching - -#include -#include -#include -#include - -#include "arrays.h" -#include "integers.h" -#include "patterns.h" -#include "tables.h" -#include "text.h" -#include "types.h" - -#define MAX_BACKREFS 
100 - -static inline void skip_whitespace(Text_t text, int64_t *i) -{ - TextIter_t state = {0, 0}; - while (*i < text.length) { - int32_t grapheme = Text$get_grapheme_fast(text, &state, *i); - if (grapheme > 0 && !uc_is_property_white_space((ucs4_t)grapheme)) - return; - *i += 1; - } -} - -static inline bool match_grapheme(Text_t text, int64_t *i, int32_t grapheme) -{ - if (*i < text.length && Text$get_grapheme(text, *i) == grapheme) { - *i += 1; - return true; - } - return false; -} - -static inline bool match_str(Text_t text, int64_t *i, const char *str) -{ - TextIter_t state = {0, 0}; - int64_t matched = 0; - while (matched[str]) { - if (*i + matched >= text.length || Text$get_grapheme_fast(text, &state, *i + matched) != str[matched]) - return false; - matched += 1; - } - *i += matched; - return true; -} - -static inline bool match_property(Text_t text, int64_t *i, uc_property_t prop) -{ - if (*i >= text.length) return false; - TextIter_t state = {}; - ucs4_t grapheme = Text$get_main_grapheme_fast(text, &state, *i); - // TODO: check every codepoint in the cluster? 
- if (uc_is_property(grapheme, prop)) { - *i += 1; - return true; - } - return false; -} - -static int64_t parse_int(Text_t text, int64_t *i) -{ - TextIter_t state = {0, 0}; - int64_t value = 0; - for (;; *i += 1) { - ucs4_t grapheme = Text$get_main_grapheme_fast(text, &state, *i); - int digit = uc_digit_value((ucs4_t)grapheme); - if (digit < 0) break; - if (value >= INT64_MAX/10) break; - value = 10*value + digit; - } - return value; -} - -const char *get_property_name(Text_t text, int64_t *i) -{ - skip_whitespace(text, i); - char *name = GC_MALLOC_ATOMIC(UNINAME_MAX); - char *dest = name; - TextIter_t state = {0, 0}; - while (*i < text.length) { - int32_t grapheme = Text$get_grapheme_fast(text, &state, *i); - if (!(grapheme & ~0xFF) && (isalnum(grapheme) || grapheme == ' ' || grapheme == '_' || grapheme == '-')) { - *dest = (char)grapheme; - ++dest; - if (dest >= name + UNINAME_MAX - 1) - break; - } else { - break; - } - *i += 1; - } - - while (dest > name && dest[-1] == ' ') - *(dest--) = '\0'; - - if (dest == name) return NULL; - *dest = '\0'; - return name; -} - -#define EAT1(text, state, index, cond) ({\ - int32_t grapheme = Text$get_grapheme_fast(text, state, index); \ - bool success = (cond); \ - if (success) index += 1; \ - success; }) - -#define EAT2(text, state, index, cond1, cond2) ({\ - int32_t grapheme = Text$get_grapheme_fast(text, state, index); \ - bool success = (cond1); \ - if (success) { \ - grapheme = Text$get_grapheme_fast(text, state, index + 1); \ - success = (cond2); \ - if (success) \ - index += 2; \ - } \ - success; }) - - -#define EAT_MANY(text, state, index, cond) ({ int64_t _n = 0; while (EAT1(text, state, index, cond)) { _n += 1; } _n; }) - -int64_t match_email(Text_t text, int64_t index) -{ - // email = local "@" domain - // local = 1-64 ([a-zA-Z0-9!#$%&‘*+–/=?^_`.{|}~] | non-ascii) - // domain = dns-label ("." 
dns-label)* - // dns-label = 1-63 ([a-zA-Z0-9-] | non-ascii) - - TextIter_t state = {0, 0}; - if (index > 0) { - ucs4_t prev_codepoint = Text$get_main_grapheme_fast(text, &state, index - 1); - if (uc_is_property_alphabetic((ucs4_t)prev_codepoint)) - return -1; - } - - int64_t start_index = index; - - // Local part: - int64_t local_len = 0; - static const char *allowed_local = "!#$%&‘*+–/=?^_`.{|}~"; - while (EAT1(text, &state, index, - (grapheme & ~0x7F) || isalnum((char)grapheme) || strchr(allowed_local, (char)grapheme))) { - local_len += 1; - if (local_len > 64) return -1; - } - - if (!EAT1(text, &state, index, grapheme == '@')) - return -1; - - // Host - int64_t host_len = 0; - do { - int64_t label_len = 0; - while (EAT1(text, &state, index, - (grapheme & ~0x7F) || isalnum((char)grapheme) || grapheme == '-')) { - label_len += 1; - if (label_len > 63) return -1; - } - - if (label_len == 0) - return -1; - - host_len += label_len; - if (host_len > 255) - return -1; - host_len += 1; - } while (EAT1(text, &state, index, grapheme == '.')); - - return index - start_index; -} - -int64_t match_ipv6(Text_t text, int64_t index) -{ - TextIter_t state = {0, 0}; - if (index > 0) { - int32_t prev_codepoint = Text$get_grapheme_fast(text, &state, index - 1); - if ((prev_codepoint & ~0x7F) && (isxdigit(prev_codepoint) || prev_codepoint == ':')) - return -1; - } - int64_t start_index = index; - const int NUM_CLUSTERS = 8; - bool double_colon_used = false; - for (int cluster = 0; cluster < NUM_CLUSTERS; cluster++) { - for (int digits = 0; digits < 4; digits++) { - if (!EAT1(text, &state, index, ~(grapheme & ~0x7F) && isxdigit((char)grapheme))) - break; - } - if (EAT1(text, &state, index, ~(grapheme & ~0x7F) && isxdigit((char)grapheme))) - return -1; // Too many digits - - if (cluster == NUM_CLUSTERS-1) { - break; - } else if (!EAT1(text, &state, index, grapheme == ':')) { - if (double_colon_used) - break; - return -1; - } - - if (EAT1(text, &state, index, grapheme == ':')) { - if 
(double_colon_used) - return -1; - double_colon_used = true; - } - } - return index - start_index; -} - -static int64_t match_ipv4(Text_t text, int64_t index) -{ - TextIter_t state = {0, 0}; - if (index > 0) { - int32_t prev_codepoint = Text$get_grapheme_fast(text, &state, index - 1); - if ((prev_codepoint & ~0x7F) && (isdigit(prev_codepoint) || prev_codepoint == '.')) - return -1; - } - int64_t start_index = index; - - const int NUM_CLUSTERS = 4; - for (int cluster = 0; cluster < NUM_CLUSTERS; cluster++) { - for (int digits = 0; digits < 3; digits++) { - if (!EAT1(text, &state, index, ~(grapheme & ~0x7F) && isdigit((char)grapheme))) { - if (digits == 0) return -1; - break; - } - } - - if (EAT1(text, &state, index, ~(grapheme & ~0x7F) && isdigit((char)grapheme))) - return -1; // Too many digits - - if (cluster == NUM_CLUSTERS-1) - break; - else if (!EAT1(text, &state, index, grapheme == '.')) - return -1; - } - return (index - start_index); -} - -int64_t match_ip(Text_t text, int64_t index) -{ - int64_t len = match_ipv6(text, index); - if (len >= 0) return len; - len = match_ipv4(text, index); - return (len >= 0) ? len : -1; -} - -int64_t match_uri(Text_t text, int64_t index) -{ - // URI = scheme ":" ["//" authority] path ["?" query] ["#" fragment] - // scheme = [a-zA-Z] [a-zA-Z0-9+.-] - // authority = [userinfo "@"] host [":" port] - - TextIter_t state = {0, 0}; - if (index > 0) { - ucs4_t prev_codepoint = Text$get_main_grapheme_fast(text, &state, index - 1); - if (uc_is_property_alphabetic(prev_codepoint)) - return -1; - } - - int64_t start_index = index; - - // Scheme: - if (!EAT1(text, &state, index, isalpha(grapheme))) - return -1; - - EAT_MANY(text, &state, index, - !(grapheme & ~0x7F) && (isalnum(grapheme) || grapheme == '+' || grapheme == '.' 
|| grapheme == '-')); - - if (index == start_index) - return -1; - - if (!match_grapheme(text, &index, ':')) - return -1; - - // Authority: - if (match_str(text, &index, "//")) { - int64_t authority_start = index; - // Username or host: - static const char *forbidden = "#?:@ \t\r\n<>[]{}\\^|\"`/"; - if (EAT_MANY(text, &state, index, (grapheme & ~0x7F) || !strchr(forbidden, (char)grapheme)) == 0) - return -1; - - if (EAT1(text, &state, index, grapheme == '@')) { - // Found a username, now get a host: - if (EAT_MANY(text, &state, index, (grapheme & ~0x7F) || !strchr(forbidden, (char)grapheme)) == 0) - return -1; - } else { - int64_t ip = authority_start; - int64_t ipv4_len = match_ipv4(text, ip); - if (ipv4_len > 0) { - ip += ipv4_len; - } else if (match_grapheme(text, &ip, '[')) { - ip += match_ipv6(text, ip); - if (ip > authority_start + 1 && match_grapheme(text, &ip, ']')) - index = ip; - } - } - - // Port: - if (EAT1(text, &state, index, grapheme == ':')) { - if (EAT_MANY(text, &state, index, !(grapheme & ~0x7F) && isdigit(grapheme)) == 0) - return -1; - } - if (!EAT1(text, &state, index, grapheme == '/')) - return (index - start_index); // No path - } else { - // Optional path root: - EAT1(text, &state, index, grapheme == '/'); - } - - // Path: - static const char *non_path = " \"#?<>[]{}\\^`|"; - EAT_MANY(text, &state, index, (grapheme & ~0x7F) || !strchr(non_path, (char)grapheme)); - - if (EAT1(text, &state, index, grapheme == '?')) { // Query - static const char *non_query = " \"#<>[]{}\\^`|"; - EAT_MANY(text, &state, index, (grapheme & ~0x7F) || !strchr(non_query, (char)grapheme)); - } - - if (EAT1(text, &state, index, grapheme == '#')) { // Fragment - static const char *non_fragment = " \"#<>[]{}\\^`|"; - EAT_MANY(text, &state, index, (grapheme & ~0x7F) || !strchr(non_fragment, (char)grapheme)); - } - return index - start_index; -} - -int64_t match_url(Text_t text, int64_t index) -{ - int64_t lookahead = index; - if (!(match_str(text, &lookahead, "https:") 
- || match_str(text, &lookahead, "http:") - || match_str(text, &lookahead, "ftp:") - || match_str(text, &lookahead, "wss:") - || match_str(text, &lookahead, "ws:"))) - return -1; - - return match_uri(text, index); -} - -int64_t match_id(Text_t text, int64_t index) -{ - TextIter_t state = {0, 0}; - if (!EAT1(text, &state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_XID_START))) - return -1; - return 1 + EAT_MANY(text, &state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_XID_CONTINUE)); -} - -int64_t match_int(Text_t text, int64_t index) -{ - TextIter_t state = {0, 0}; - int64_t len = EAT_MANY(text, &state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT)); - return len >= 0 ? len : -1; -} - -int64_t match_num(Text_t text, int64_t index) -{ - TextIter_t state = {0, 0}; - bool negative = EAT1(text, &state, index, grapheme == '-') ? 1 : 0; - int64_t pre_decimal = EAT_MANY(text, &state, index, - uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT)); - bool decimal = (EAT1(text, &state, index, grapheme == '.') == 1); - int64_t post_decimal = decimal ? EAT_MANY(text, &state, index, - uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT)) : 0; - if (pre_decimal == 0 && post_decimal == 0) - return -1; - return negative + pre_decimal + decimal + post_decimal; -} - -int64_t match_newline(Text_t text, int64_t index) -{ - if (index >= text.length) - return -1; - - TextIter_t state = {0, 0}; - ucs4_t grapheme = index >= text.length ? 
0 : Text$get_main_grapheme_fast(text, &state, index); - if (grapheme == '\n') - return 1; - if (grapheme == '\r' && Text$get_grapheme_fast(text, &state, index + 1) == '\n') - return 2; - return -1; -} - -typedef struct { - int64_t index, length; - bool occupied, recursive; -} capture_t; - -typedef struct { - enum { PAT_START, PAT_END, PAT_ANY, PAT_GRAPHEME, PAT_PROPERTY, PAT_QUOTE, PAT_PAIR, PAT_FUNCTION } tag; - bool negated, non_capturing; - int64_t min, max; - union { - int32_t grapheme; - uc_property_t property; - int64_t (*fn)(Text_t, int64_t); - int32_t quote_graphemes[2]; - int32_t pair_graphemes[2]; - }; -} pat_t; - -int64_t match_pat(Text_t text, TextIter_t *state, int64_t index, pat_t pat) -{ - int32_t grapheme = index >= text.length ? 0 : Text$get_grapheme_fast(text, state, index); - - switch (pat.tag) { - case PAT_START: { - if (index == 0) - return pat.negated ? -1 : 0; - return pat.negated ? 0 : -1; - } - case PAT_END: { - if (index >= text.length) - return pat.negated ? -1 : 0; - return pat.negated ? 0 : -1; - } - case PAT_ANY: { - assert(!pat.negated); - return (index < text.length) ? 1 : -1; - } - case PAT_GRAPHEME: { - if (index >= text.length) - return -1; - else if (grapheme == pat.grapheme) - return pat.negated ? -1 : 1; - return pat.negated ? 1 : -1; - } - case PAT_PROPERTY: { - if (index >= text.length) - return -1; - else if (uc_is_property((ucs4_t)grapheme, pat.property)) - return pat.negated ? -1 : 1; - return pat.negated ? 1 : -1; - } - case PAT_PAIR: { - // Nested punctuation: (?), [?], etc - if (index >= text.length) - return -1; - - int32_t open = pat.pair_graphemes[0]; - if (grapheme != open) - return pat.negated ? 1 : -1; - - int32_t close = pat.pair_graphemes[1]; - int64_t depth = 1; - int64_t match_len = 1; - for (; depth > 0; match_len++) { - if (index + match_len >= text.length) - return pat.negated ? 
1 : -1; - - int32_t c = Text$get_grapheme_fast(text, state, index + match_len); - if (c == open) - depth += 1; - else if (c == close) - depth -= 1; - } - return pat.negated ? -1 : match_len; - } - case PAT_QUOTE: { - // Nested quotes: "?", '?', etc - if (index >= text.length) - return -1; - - int32_t open = pat.quote_graphemes[0]; - if (grapheme != open) - return pat.negated ? 1 : -1; - - int32_t close = pat.quote_graphemes[1]; - for (int64_t i = index + 1; i < text.length; i++) { - int32_t c = Text$get_grapheme_fast(text, state, i); - if (c == close) { - return pat.negated ? -1 : (i - index) + 1; - } else if (c == '\\' && index + 1 < text.length) { - i += 1; // Skip ahead an extra step - } - } - return pat.negated ? 1 : -1; - } - case PAT_FUNCTION: { - int64_t match_len = pat.fn(text, index); - if (match_len >= 0) - return pat.negated ? -1 : match_len; - return pat.negated ? 1 : -1; - } - default: errx(1, "Invalid pattern"); - } - errx(1, "Unreachable"); -} - -pat_t parse_next_pat(Text_t pattern, TextIter_t *state, int64_t *index) -{ - if (EAT2(pattern, state, *index, - uc_is_property((ucs4_t)grapheme, UC_PROPERTY_QUOTATION_MARK), - grapheme == '?')) { - // Quotations: "?", '?', etc - int32_t open = Text$get_grapheme_fast(pattern, state, *index-2); - int32_t close = open; - uc_mirror_char((ucs4_t)open, (ucs4_t*)&close); - if (!match_grapheme(pattern, index, close)) - fail("Pattern's closing quote is missing: %k", &pattern); - - return (pat_t){ - .tag=PAT_QUOTE, - .min=1, .max=1, - .quote_graphemes={open, close}, - }; - } else if (EAT2(pattern, state, *index, - uc_is_property((ucs4_t)grapheme, UC_PROPERTY_PAIRED_PUNCTUATION), - grapheme == '?')) { - // Nested punctuation: (?), [?], etc - int32_t open = Text$get_grapheme_fast(pattern, state, *index-2); - int32_t close = open; - uc_mirror_char((ucs4_t)open, (ucs4_t*)&close); - if (!match_grapheme(pattern, index, close)) - fail("Pattern's closing brace is missing: %k", &pattern); - - return (pat_t){ - .tag=PAT_PAIR, - 
.min=1, .max=1, - .pair_graphemes={open, close}, - }; - } else if (EAT1(pattern, state, *index, - grapheme == '{')) { // named patterns {id}, {2-3 hex}, etc. - skip_whitespace(pattern, index); - int64_t min, max; - if (uc_is_digit((ucs4_t)Text$get_grapheme_fast(pattern, state, *index))) { - min = parse_int(pattern, index); - skip_whitespace(pattern, index); - if (match_grapheme(pattern, index, '+')) { - max = INT64_MAX; - } else if (match_grapheme(pattern, index, '-')) { - max = parse_int(pattern, index); - } else { - max = min; - } - if (min > max) fail("Minimum repetitions (%ld) is less than the maximum (%ld)", min, max); - } else { - min = -1, max = -1; - } - - skip_whitespace(pattern, index); - - bool negated = match_grapheme(pattern, index, '!'); -#define PAT(_tag, ...) ((pat_t){.min=min, .max=max, .negated=negated, .tag=_tag, __VA_ARGS__}) - const char *prop_name; - if (match_str(pattern, index, "..")) - prop_name = ".."; - else - prop_name = get_property_name(pattern, index); - - if (!prop_name) { - // Literal character, e.g. 
{1?} - skip_whitespace(pattern, index); - int32_t grapheme = Text$get_grapheme_fast(pattern, state, (*index)++); - if (!match_grapheme(pattern, index, '}')) - fail("Missing closing '}' in pattern: %k", &pattern); - return PAT(PAT_GRAPHEME, .grapheme=grapheme); - } else if (strlen(prop_name) == 1) { - // Single letter names: {1+ A} - skip_whitespace(pattern, index); - if (!match_grapheme(pattern, index, '}')) - fail("Missing closing '}' in pattern: %k", &pattern); - return PAT(PAT_GRAPHEME, .grapheme=prop_name[0]); - } - - skip_whitespace(pattern, index); - if (!match_grapheme(pattern, index, '}')) - fail("Missing closing '}' in pattern: %k", &pattern); - - switch (tolower(prop_name[0])) { - case '.': - if (prop_name[1] == '.') { - if (negated) - return ((pat_t){.tag=PAT_END, .min=min, .max=max, .non_capturing=true}); - else - return PAT(PAT_ANY); - } - break; - case 'd': - if (strcasecmp(prop_name, "digit") == 0) { - return PAT(PAT_PROPERTY, .property=UC_PROPERTY_DECIMAL_DIGIT); - } - break; - case 'e': - if (strcasecmp(prop_name, "end") == 0) { - return PAT(PAT_END, .non_capturing=!negated); - } else if (strcasecmp(prop_name, "email") == 0) { - return PAT(PAT_FUNCTION, .fn=match_email); - } else if (strcasecmp(prop_name, "emoji") == 0) { - return PAT(PAT_PROPERTY, .property=UC_PROPERTY_EMOJI); - } - break; - case 'i': - if (strcasecmp(prop_name, "id") == 0) { - return PAT(PAT_FUNCTION, .fn=match_id); - } else if (strcasecmp(prop_name, "int") == 0) { - return PAT(PAT_FUNCTION, .fn=match_int); - } else if (strcasecmp(prop_name, "ipv4") == 0) { - return PAT(PAT_FUNCTION, .fn=match_ipv4); - } else if (strcasecmp(prop_name, "ipv6") == 0) { - return PAT(PAT_FUNCTION, .fn=match_ipv6); - } else if (strcasecmp(prop_name, "ip") == 0) { - return PAT(PAT_FUNCTION, .fn=match_ip); - } - break; - case 'n': - if (strcasecmp(prop_name, "nl") == 0 || strcasecmp(prop_name, "newline") == 0 - || strcasecmp(prop_name, "crlf")) { - return PAT(PAT_FUNCTION, .fn=match_newline); - } else 
if (strcasecmp(prop_name, "num") == 0) { - return PAT(PAT_FUNCTION, .fn=match_num); - } - break; - case 's': - if (strcasecmp(prop_name, "start") == 0) { - return PAT(PAT_START, .non_capturing=!negated); - } - break; - case 'u': - if (strcasecmp(prop_name, "uri") == 0) { - return PAT(PAT_FUNCTION, .fn=match_uri); - } else if (strcasecmp(prop_name, "url") == 0) { - return PAT(PAT_FUNCTION, .fn=match_url); - } - break; - default: break; - } - - uc_property_t prop = uc_property_byname(prop_name); - if (uc_property_is_valid(prop)) - return PAT(PAT_PROPERTY, .property=prop); - - ucs4_t grapheme = unicode_name_character(prop_name); - if (grapheme == UNINAME_INVALID) - fail("Not a valid property or character name: %s", prop_name); - return PAT(PAT_GRAPHEME, .grapheme=(int32_t)grapheme); -#undef PAT - } else { - return (pat_t){.tag=PAT_GRAPHEME, .non_capturing=true, .min=1, .max=1, .grapheme=Text$get_grapheme_fast(pattern, state, (*index)++)}; - } -} - -int64_t match(Text_t text, int64_t text_index, Pattern_t pattern, int64_t pattern_index, capture_t *captures, int64_t capture_index) -{ - if (pattern_index >= pattern.length) // End of the pattern - return 0; - - int64_t start_index = text_index; - TextIter_t pattern_state = {0, 0}, text_state = {0, 0}; - pat_t pat = parse_next_pat(pattern, &pattern_state, &pattern_index); - - if (pat.min == -1 && pat.max == -1) { - if (pat.tag == PAT_ANY && pattern_index >= pattern.length) { - pat.min = pat.max = MAX(1, text.length - text_index); - } else { - pat.min = 1; - pat.max = INT64_MAX; - } - } - - int64_t capture_start = text_index; - int64_t count = 0, capture_len = 0, next_match_len = 0; - - if (pat.tag == PAT_ANY && pattern_index >= pattern.length) { - int64_t remaining = text.length - text_index; - capture_len = remaining >= pat.min ? 
MIN(remaining, pat.max) : -1; - text_index += capture_len; - goto success; - } - - if (pat.min == 0 && pattern_index < pattern.length) { - next_match_len = match(text, text_index, pattern, pattern_index, captures, capture_index + (pat.non_capturing ? 0 : 1)); - if (next_match_len >= 0) { - capture_len = 0; - goto success; - } - } - - while (count < pat.max) { - int64_t match_len = match_pat(text, &text_state, text_index, pat); - if (match_len < 0) - break; - capture_len += match_len; - text_index += match_len; - count += 1; - - if (pattern_index < pattern.length) { // More stuff after this - if (count < pat.min) - next_match_len = -1; - else - next_match_len = match(text, text_index, pattern, pattern_index, captures, capture_index + (pat.non_capturing ? 0 : 1)); - } else { - next_match_len = 0; - } - - if (match_len == 0) { - if (next_match_len >= 0) { - // If we're good to go, no need to keep re-matching zero-length - // matches till we hit max: - count = pat.max; - break; - } else { - return -1; - } - } - - if (pattern_index < pattern.length && next_match_len >= 0) - break; // Next guy exists and wants to stop here - - if (text_index >= text.length) - break; - } - - if (count < pat.min || next_match_len < 0) - return -1; - - success: - if (captures && capture_index < MAX_BACKREFS && !pat.non_capturing) { - if (pat.tag == PAT_PAIR || pat.tag == PAT_QUOTE) { - assert(capture_len > 0); - captures[capture_index] = (capture_t){ - .index=capture_start + 1, // Skip leading quote/paren - .length=capture_len - 2, // Skip open/close - .occupied=true, - .recursive=(pat.tag == PAT_PAIR), - }; - } else { - captures[capture_index] = (capture_t){ - .index=capture_start, - .length=capture_len, - .occupied=true, - .recursive=false, - }; - } - } - return (text_index - start_index) + next_match_len; -} - -#undef EAT1 -#undef EAT2 -#undef EAT_MANY - -static int64_t _find(Text_t text, Pattern_t pattern, int64_t first, int64_t last, int64_t *match_length) -{ - int32_t first_grapheme = 
Text$get_grapheme(pattern, 0); - bool find_first = (first_grapheme != '{' - && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK) - && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); - - TextIter_t text_state = {0, 0}; - - for (int64_t i = first; i <= last; i++) { - // Optimization: quickly skip ahead to first char in pattern: - if (find_first) { - while (i < text.length && Text$get_grapheme_fast(text, &text_state, i) != first_grapheme) - ++i; - } - - int64_t m = match(text, i, pattern, 0, NULL, 0); - if (m >= 0) { - if (match_length) - *match_length = m; - return i; - } - } - if (match_length) - *match_length = -1; - return -1; -} - -public Int_t Text$find(Text_t text, Pattern_t pattern, Int_t from_index, int64_t *match_length) -{ - int64_t first = Int_to_Int64(from_index, false); - if (first == 0) fail("Invalid index: 0"); - if (first < 0) first = text.length + first + 1; - if (first > text.length || first < 1) - return I(0); - int64_t found = _find(text, pattern, first-1, text.length-1, match_length); - return I(found+1); -} - -PUREFUNC public bool Text$has(Text_t text, Pattern_t pattern) -{ - if (Text$starts_with(pattern, Text("{start}"))) { - int64_t m = match(text, 0, pattern, 0, NULL, 0); - return m >= 0; - } else if (Text$ends_with(text, Text("{end}"))) { - for (int64_t i = text.length-1; i >= 0; i--) { - int64_t match_len = match(text, i, pattern, 0, NULL, 0); - if (match_len >= 0 && i + match_len == text.length) - return true; - } - return false; - } else { - int64_t found = _find(text, pattern, 0, text.length-1, NULL); - return (found >= 0); - } -} - -PUREFUNC public bool Text$matches(Text_t text, Pattern_t pattern) -{ - int64_t m = match(text, 0, pattern, 0, NULL, 0); - return m == text.length; -} - -public Array_t Text$find_all(Text_t text, Pattern_t pattern) -{ - if (pattern.length == 0) // special case - return (Array_t){.length=0}; - - Array_t matches = {}; - - for (int64_t i = 0; ; ) { - int64_t len = 0; - 
int64_t found = _find(text, pattern, i, text.length-1, &len); - if (found < 0) break; - Text_t match = Text$slice(text, I(found+1), I(found + len)); - Array$insert(&matches, &match, I_small(0), sizeof(Text_t)); - i = found + MAX(len, 1); - } - - return matches; -} - -static Text_t apply_backrefs(Text_t text, Pattern_t original_pattern, Text_t replacement, Pattern_t backref_pat, capture_t *captures) -{ - if (backref_pat.length == 0) - return replacement; - - int32_t first_grapheme = Text$get_grapheme(backref_pat, 0); - bool find_first = (first_grapheme != '{' - && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK) - && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); - - Text_t ret = Text(""); - TextIter_t state = {0, 0}; - int64_t nonmatching_pos = 0; - for (int64_t pos = 0; pos < replacement.length; ) { - // Optimization: quickly skip ahead to first char in the backref pattern: - if (find_first) { - while (pos < replacement.length && Text$get_grapheme_fast(replacement, &state, pos) != first_grapheme) - ++pos; - } - - int64_t backref_len = match(replacement, pos, backref_pat, 0, NULL, 0); - if (backref_len < 0) { - pos += 1; - continue; - } - - int64_t after_backref = pos + backref_len; - int64_t backref = parse_int(replacement, &after_backref); - if (after_backref == pos + backref_len) { // Not actually a backref if there's no number - pos += 1; - continue; - } - if (backref < 0 || backref > 9) fail("Invalid backref index: %ld (only 0-%d are allowed)", backref, MAX_BACKREFS-1); - backref_len = (after_backref - pos); - - if (Text$get_grapheme_fast(replacement, &state, pos + backref_len) == ';') - backref_len += 1; // skip optional semicolon - - if (!captures[backref].occupied) - fail("There is no capture number %ld!", backref); - - Text_t backref_text = Text$slice(text, I(captures[backref].index+1), I(captures[backref].index + captures[backref].length)); - - if (captures[backref].recursive && original_pattern.length > 0) - 
backref_text = Text$replace(backref_text, original_pattern, replacement, backref_pat, true); - - if (pos > nonmatching_pos) { - Text_t before_slice = Text$slice(replacement, I(nonmatching_pos+1), I(pos)); - ret = Text$concat(ret, before_slice, backref_text); - } else { - ret = Text$concat(ret, backref_text); - } - - pos += backref_len; - nonmatching_pos = pos; - } - if (nonmatching_pos < replacement.length) { - Text_t last_slice = Text$slice(replacement, I(nonmatching_pos+1), I(replacement.length)); - ret = Text$concat(ret, last_slice); - } - return ret; -} - -public Text_t Text$replace(Text_t text, Pattern_t pattern, Text_t replacement, Pattern_t backref_pat, bool recursive) -{ - Text_t ret = {.length=0}; - - int32_t first_grapheme = Text$get_grapheme(pattern, 0); - bool find_first = (first_grapheme != '{' - && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK) - && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); - - TextIter_t text_state = {0, 0}; - int64_t nonmatching_pos = 0; - for (int64_t pos = 0; pos < text.length; ) { - // Optimization: quickly skip ahead to first char in pattern: - if (find_first) { - while (pos < text.length && Text$get_grapheme_fast(text, &text_state, pos) != first_grapheme) - ++pos; - } - - capture_t captures[MAX_BACKREFS] = {}; - int64_t match_len = match(text, pos, pattern, 0, captures, 1); - if (match_len < 0) { - pos += 1; - continue; - } - captures[0] = (capture_t){ - .index = pos, .length = match_len, - .occupied = true, .recursive = false, - }; - - Text_t replacement_text = apply_backrefs(text, recursive ? 
pattern : Text(""), replacement, backref_pat, captures); - if (pos > nonmatching_pos) { - Text_t before_slice = Text$slice(text, I(nonmatching_pos+1), I(pos)); - ret = Text$concat(ret, before_slice, replacement_text); - } else { - ret = Text$concat(ret, replacement_text); - } - nonmatching_pos = pos + match_len; - pos += MAX(match_len, 1); - } - if (nonmatching_pos < text.length) { - Text_t last_slice = Text$slice(text, I(nonmatching_pos+1), I(text.length)); - ret = Text$concat(ret, last_slice); - } - return ret; -} - -public Text_t Text$trim(Text_t text, Pattern_t pattern, bool trim_left, bool trim_right) -{ - int64_t first = 0, last = text.length-1; - if (trim_left) { - int64_t match_len = match(text, 0, pattern, 0, NULL, 0); - if (match_len > 0) - first = match_len; - } - - if (trim_right) { - for (int64_t i = text.length-1; i >= first; i--) { - int64_t match_len = match(text, i, pattern, 0, NULL, 0); - if (match_len > 0 && i + match_len == text.length) - last = i-1; - } - } - return Text$slice(text, I(first+1), I(last+1)); -} - -public Text_t Text$map(Text_t text, Pattern_t pattern, Closure_t fn) -{ - Text_t ret = {.length=0}; - - int32_t first_grapheme = Text$get_grapheme(pattern, 0); - bool find_first = (first_grapheme != '{' - && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK) - && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); - - TextIter_t text_state = {0, 0}; - int64_t nonmatching_pos = 0; - - Text_t (*text_mapper)(Text_t, void*) = fn.fn; - for (int64_t pos = 0; pos < text.length; pos++) { - // Optimization: quickly skip ahead to first char in pattern: - if (find_first) { - while (pos < text.length && Text$get_grapheme_fast(text, &text_state, pos) != first_grapheme) - ++pos; - } - - int64_t match_len = match(text, pos, pattern, 0, NULL, 0); - if (match_len < 0) continue; - - Text_t replacement = text_mapper(Text$slice(text, I(pos+1), I(pos+match_len)), fn.userdata); - if (pos > nonmatching_pos) { - Text_t 
before_slice = Text$slice(text, I(nonmatching_pos+1), I(pos)); - ret = Text$concat(ret, before_slice, replacement); - } else { - ret = Text$concat(ret, replacement); - } - nonmatching_pos = pos + match_len; - pos += (match_len - 1); - } - if (nonmatching_pos < text.length) { - Text_t last_slice = Text$slice(text, I(nonmatching_pos+1), I(text.length)); - ret = Text$concat(ret, last_slice); - } - return ret; -} - -public Text_t Text$replace_all(Text_t text, Table_t replacements, Text_t backref_pat, bool recursive) -{ - if (replacements.entries.length == 0) return text; - - Text_t ret = {.length=0}; - - int64_t nonmatch_pos = 0; - for (int64_t pos = 0; pos < text.length; ) { - // Find the first matching pattern at this position: - for (int64_t i = 0; i < replacements.entries.length; i++) { - Pattern_t pattern = *(Pattern_t*)(replacements.entries.data + i*replacements.entries.stride); - capture_t captures[MAX_BACKREFS] = {}; - int64_t len = match(text, pos, pattern, 0, captures, 1); - if (len < 0) continue; - captures[0].index = pos; - captures[0].length = len; - - // If we skipped over some non-matching text before finding a match, insert it here: - if (pos > nonmatch_pos) { - Text_t before_slice = Text$slice(text, I(nonmatch_pos+1), I(pos)); - ret = Text$concat(ret, before_slice); - } - - // Concatenate the replacement: - Text_t replacement = *(Text_t*)(replacements.entries.data + i*replacements.entries.stride + sizeof(Text_t)); - Text_t replacement_text = apply_backrefs(text, recursive ? 
pattern : Text(""), replacement, backref_pat, captures); - ret = Text$concat(ret, replacement_text); - pos += MAX(len, 1); - nonmatch_pos = pos; - goto next_pos; - } - - pos += 1; - next_pos: - continue; - } - - if (nonmatch_pos <= text.length) { - Text_t last_slice = Text$slice(text, I(nonmatch_pos+1), I(text.length)); - ret = Text$concat(ret, last_slice); - } - return ret; -} - -public Array_t Text$split(Text_t text, Pattern_t pattern) -{ - if (text.length == 0) // special case - return (Array_t){.length=0}; - - if (pattern.length == 0) // special case - return Text$clusters(text); - - Array_t chunks = {}; - - Int_t i = I_small(1); - for (;;) { - int64_t len = 0; - Int_t found = Text$find(text, pattern, i, &len); - if (I_is_zero(found)) break; - Text_t chunk = Text$slice(text, i, Int$minus(found, I_small(1))); - Array$insert(&chunks, &chunk, I_small(0), sizeof(Text_t)); - i = Int$plus(found, I(MAX(len, 1))); - } - - Text_t last_chunk = Text$slice(text, i, I(text.length)); - Array$insert(&chunks, &last_chunk, I_small(0), sizeof(Text_t)); - - return chunks; -} - - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/patterns.h b/builtins/patterns.h deleted file mode 100644 index 804fb286..00000000 --- a/builtins/patterns.h +++ /dev/null @@ -1,33 +0,0 @@ -#pragma once - -// The type representing text patterns for pattern matching. - -#include -#include -#include - -#include "datatypes.h" -#include "integers.h" -#include "types.h" - -#define Pattern(text) ((Pattern_t)Text(text)) -#define Patterns(...) 
((Pattern_t)Texts(__VA_ARGS__)) - -Text_t Text$replace(Text_t str, Pattern_t pat, Text_t replacement, Pattern_t backref_pat, bool recursive); -Pattern_t Pattern$escape_text(Text_t text); -Text_t Text$replace_all(Text_t text, Table_t replacements, Pattern_t backref_pat, bool recursive); -Array_t Text$split(Text_t text, Pattern_t pattern); -Text_t Text$trim(Text_t text, Pattern_t pattern, bool trim_left, bool trim_right); -Int_t Text$find(Text_t text, Pattern_t pattern, Int_t i, int64_t *match_length); -Array_t Text$find_all(Text_t text, Pattern_t pattern); -PUREFUNC bool Text$has(Text_t text, Pattern_t pattern); -PUREFUNC bool Text$matches(Text_t text, Pattern_t pattern); -Text_t Text$map(Text_t text, Pattern_t pattern, Closure_t fn); - -#define Pattern$hash Text$hash -#define Pattern$compare Text$compare -#define Pattern$equal Text$equal - -extern const TypeInfo Pattern$info; - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/pointers.c b/builtins/pointers.c deleted file mode 100644 index 1ad9f407..00000000 --- a/builtins/pointers.c +++ /dev/null @@ -1,84 +0,0 @@ -// Type infos and methods for Pointer types -#include -#include -#include -#include -#include -#include -#include - -#include "metamethods.h" -#include "text.h" -#include "types.h" -#include "util.h" - -typedef struct recursion_s { - const void *ptr; - struct recursion_s *next; -} recursion_t; - -public Text_t Pointer$as_text(const void *x, bool colorize, const TypeInfo *type) { - auto ptr_info = type->PointerInfo; - if (!x) { - Text_t typename = generic_as_text(NULL, false, ptr_info.pointed); - Text_t text; - if (colorize) - text = Text$concat(Text("\x1b[34;1m"), Text$from_str(ptr_info.sigil), typename, Text("\x1b[m")); - else - text = Text$concat(Text$from_str(ptr_info.sigil), typename); - return text; - } - const void *ptr = *(const void**)x; - if (!ptr) { - Text_t typename = generic_as_text(NULL, false, ptr_info.pointed); - if (colorize) - return Text$concat(Text("\x1b[34;1m!"), 
typename, Text("\x1b[m")); - else - return Text$concat(Text("!"), typename); - } - - // Check for recursive references, so if `x.foo = x`, then it prints as - // `@Foo{foo=@..1}` instead of overflowing the stack: - static recursion_t *recursion = NULL; - int32_t depth = 0; - for (recursion_t *r = recursion; r; r = r->next) { - ++depth; - if (r->ptr == ptr) { - Text_t text = Text$concat( - colorize ? Text("\x1b[34;1m") : Text(""), - Text$from_str(ptr_info.sigil), - Text(".."), - Int32$as_text(&depth, false, &Int32$info), - colorize ? Text("\x1b[m") : Text("")); - return text; - } - } - - Text_t pointed; - { // Stringify with this pointer flagged as a recursive one: - recursion_t my_recursion = {.ptr=ptr, .next=recursion}; - recursion = &my_recursion; - pointed = generic_as_text(ptr, colorize, ptr_info.pointed); - recursion = recursion->next; - } - Text_t text; - if (colorize) - text = Text$concat(Text("\x1b[34;1m"), Text$from_str(ptr_info.sigil), Text("\x1b[m"), pointed); - else - text = Text$concat(Text$from_str(ptr_info.sigil), pointed); - return text; -} - -PUREFUNC public int32_t Pointer$compare(const void *x, const void *y, const TypeInfo *type) { - (void)type; - const void *xp = *(const void**)x, *yp = *(const void**)y; - return (xp > yp) - (xp < yp); -} - -PUREFUNC public bool Pointer$equal(const void *x, const void *y, const TypeInfo *type) { - (void)type; - const void *xp = *(const void**)x, *yp = *(const void**)y; - return xp == yp; -} - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/pointers.h b/builtins/pointers.h deleted file mode 100644 index faa95316..00000000 --- a/builtins/pointers.h +++ /dev/null @@ -1,19 +0,0 @@ -#pragma once - -// Type infos and methods for Pointer types - -#include -#include - -#include "types.h" -#include "util.h" - -Text_t Pointer$as_text(const void *x, bool colorize, const TypeInfo *type); -PUREFUNC int32_t Pointer$compare(const void *x, const void *y, const TypeInfo *type); -PUREFUNC bool 
Pointer$equal(const void *x, const void *y, const TypeInfo *type); - -#define Null(t) (t*)NULL -#define POINTER_TYPE(_sigil, _pointed) (&(TypeInfo){\ - .size=sizeof(void*), .align=alignof(void*), .tag=PointerInfo, .PointerInfo.sigil=_sigil, .PointerInfo.pointed=_pointed}) - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/ranges.c b/builtins/ranges.c deleted file mode 100644 index 9dfd1efe..00000000 --- a/builtins/ranges.c +++ /dev/null @@ -1,63 +0,0 @@ -// Functions that operate on numeric ranges - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "integers.h" -#include "text.h" -#include "types.h" -#include "util.h" - - -PUREFUNC static int32_t Range$compare(const Range_t *x, const Range_t *y, const TypeInfo *type) -{ - (void)type; - if (x == y) return 0; - int32_t diff = Int$compare(&x->first, &y->first, &Int$info); - if (diff != 0) return diff; - diff = Int$compare(&x->last, &y->last, &Int$info); - if (diff != 0) return diff; - return Int$compare(&x->step, &y->step, &Int$info); -} - -PUREFUNC static bool Range$equal(const Range_t *x, const Range_t *y, const TypeInfo *type) -{ - (void)type; - if (x == y) return true; - return Int$equal(&x->first, &y->first, &Int$info) && Int$equal(&x->last, &y->last, &Int$info) && Int$equal(&x->step, &y->step, &Int$info); -} - -static Text_t Range$as_text(const Range_t *r, bool use_color, const TypeInfo *type) -{ - (void)type; - if (!r) return Text("Range"); - - return Text$format(use_color ? 
"\x1b[0;1mRange\x1b[m(first=%r, last=%r, step=%r)" - : "Range(first=%r, last=%r, step=%r)", - Int$as_text(&r->first, use_color, &Int$info), Int$as_text(&r->last, use_color, &Int$info), - Int$as_text(&r->step, use_color, &Int$info)); -} - -PUREFUNC public Range_t Range$reversed(Range_t r) -{ - return (Range_t){r.last, r.first, Int$negative(r.step)}; -} - -PUREFUNC public Range_t Range$by(Range_t r, Int_t step) -{ - return (Range_t){r.first, r.last, Int$times(step, r.step)}; -} - -public const TypeInfo Range = {sizeof(Range_t), __alignof(Range_t), {.tag=CustomInfo, .CustomInfo={ - .as_text=(void*)Range$as_text, - .compare=(void*)Range$compare, - .equal=(void*)Range$equal, -}}}; - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/ranges.h b/builtins/ranges.h deleted file mode 100644 index 2a4f1d68..00000000 --- a/builtins/ranges.h +++ /dev/null @@ -1,10 +0,0 @@ -#pragma once - -// Ranges represent numeric ranges - -PUREFUNC Range_t Range$reversed(Range_t r); -PUREFUNC Range_t Range$by(Range_t r, Int_t step); - -extern const TypeInfo Range; - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1 diff --git a/builtins/shell.c b/builtins/shell.c deleted file mode 100644 index 36b6a9ad..00000000 --- a/builtins/shell.c +++ /dev/null @@ -1,67 +0,0 @@ -// A lang for Shell Command Language -#include -#include - -#include "arrays.h" -#include "integers.h" -#include "patterns.h" -#include "shell.h" -#include "text.h" -#include "types.h" -#include "util.h" - -public Shell_t Shell$escape_text(Text_t text) -{ - // TODO: optimize for ASCII and short strings - Array_t shell_graphemes = {.atomic=1}; -#define add_char(c) Array$insert(&shell_graphemes, (uint32_t[1]){c}, I_small(0), sizeof(uint32_t)) - add_char('\''); - const char *text_utf8 = Text$as_c_string(text); - for (const char *p = text_utf8; *p; p++) { - if (*p == '\'') { - add_char('\''); - add_char('"'); - add_char('\''); - add_char('"'); - add_char('\''); - } else - add_char((uint8_t)*p); - } - add_char('\''); -#undef 
add_char - return (Text_t){.length=shell_graphemes.length, .tag=TEXT_GRAPHEMES, .graphemes=shell_graphemes.data}; -} - -public Text_t Shell$run(Shell_t command, int32_t *status) -{ - const char *cmd_str = Text$as_c_string(command); - FILE *prog = popen(cmd_str, "r"); - - const int chunk_size = 256; - char *buf = GC_MALLOC_ATOMIC(chunk_size); - Text_t output = Text(""); - size_t just_read; - do { - just_read = fread(buf, sizeof(char), chunk_size, prog); - if (just_read > 0) { - output = Texts(output, Text$from_strn(buf, just_read)); - buf = GC_MALLOC_ATOMIC(chunk_size); - } - } while (just_read > 0); - - if (status) - *status = WEXITSTATUS(pclose(prog)); - else - pclose(prog); - - return Text$trim(output, Pattern("{1 nl}"), false, true); -} - -public const TypeInfo Shell$info = { - .size=sizeof(Shell_t), - .align=__alignof__(Shell_t), - .tag=TextInfo, - .TextInfo={.lang="Shell"}, -}; - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/shell.h b/builtins/shell.h deleted file mode 100644 index 48c59abc..00000000 --- a/builtins/shell.h +++ /dev/null @@ -1,25 +0,0 @@ -#pragma once - -// A lang for Shell Command Language - -#include -#include - -#include "types.h" -#include "datatypes.h" - -#define Shell_t Text_t -#define Shell(text) ((Shell_t)Text(text)) -#define Shells(...) ((Shell_t)Texts(__VA_ARGS__)) - -Text_t Shell$run(Shell_t command, int32_t *status); -Shell_t Shell$escape_text(Text_t text); - -#define Shell$hash Text$hash -#define Shell$compare Text$compare -#define Shell$equal Text$equal - -extern const TypeInfo Shell$info; - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 - diff --git a/builtins/siphash-internals.h b/builtins/siphash-internals.h deleted file mode 100644 index d1906be4..00000000 --- a/builtins/siphash-internals.h +++ /dev/null @@ -1,127 +0,0 @@ -#pragma once - -// This file holds the internals for the SipHash implementation. For a few -// cases, we want to include this for incrementally computing hashes. 
-// Otherwise, it suffices to just use the siphash24() function from siphash.h - -#include -#include -#include - -#include "siphash.h" - -/* - Copyright (c) 2013 Marek Majkowski - Copyright (c) 2018 Samantha McVey - Copyright (c) 2024 Bruce Hill - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. 
- - - Original location: - https://github.com/majek/csiphash/ - - Original solution inspired by code from: - Samuel Neves (supercop/crypto_auth/siphash24/little) - djb (supercop/crypto_auth/siphash24/little2) - Jean-Philippe Aumasson (https://131002.net/siphash/siphash24.c) - - Extensive modifications for MoarVM by Samantha McVey - - Further modifications for Tomo by Bruce Hill -*/ -struct siphash { - uint64_t v0; - uint64_t v1; - uint64_t v2; - uint64_t v3; - uint64_t b; -}; -typedef struct siphash siphash; -#define ROTATE(x, b) (uint64_t)( ((x) << (b)) | ( (x) >> (64 - (b))) ) - -#define HALF_ROUND(a,b,c,d,s,t) \ - a += b; c += d; \ - b = ROTATE(b, s) ^ a; \ - d = ROTATE(d, t) ^ c; \ - a = ROTATE(a, 32); - -#define DOUBLE_ROUND(v0,v1,v2,v3) \ - HALF_ROUND(v0,v1,v2,v3,13,16); \ - HALF_ROUND(v2,v1,v0,v3,17,21); \ - HALF_ROUND(v0,v1,v2,v3,13,16); \ - HALF_ROUND(v2,v1,v0,v3,17,21); - -static inline void siphashinit (siphash *sh, size_t src_sz) { - const uint64_t k0 = TOMO_HASH_KEY[0]; - const uint64_t k1 = TOMO_HASH_KEY[1]; - sh->b = (uint64_t)src_sz << 56; - sh->v0 = k0 ^ 0x736f6d6570736575ULL; - sh->v1 = k1 ^ 0x646f72616e646f6dULL; - sh->v2 = k0 ^ 0x6c7967656e657261ULL; - sh->v3 = k1 ^ 0x7465646279746573ULL; -} -static inline void siphashadd64bits (siphash *sh, const uint64_t in) { - const uint64_t mi = in; - sh->v3 ^= mi; - DOUBLE_ROUND(sh->v0,sh->v1,sh->v2,sh->v3); - sh->v0 ^= mi; -} -#pragma GCC diagnostic ignored "-Winline" -static inline uint64_t siphashfinish_last_part (siphash *sh, uint64_t t) { - sh->b |= t; - sh->v3 ^= sh->b; - DOUBLE_ROUND(sh->v0,sh->v1,sh->v2,sh->v3); - sh->v0 ^= sh->b; - sh->v2 ^= 0xff; - DOUBLE_ROUND(sh->v0,sh->v1,sh->v2,sh->v3); - DOUBLE_ROUND(sh->v0,sh->v1,sh->v2,sh->v3); - return (sh->v0 ^ sh->v1) ^ (sh->v2 ^ sh->v3); -} -/* This union helps us avoid doing weird things with pointers that can cause old - * compilers like GCC 4 to generate bad code. 
In addition it is nicely more C - * standards compliant to keep type punning to a minimum. */ -union SipHash64_union { - uint64_t u64; - uint32_t u32; - uint8_t u8[8]; -}; -static inline uint64_t siphashfinish (siphash *sh, const uint8_t *src, size_t src_sz) { - union SipHash64_union t = { 0 }; - switch (src_sz) { - /* Falls through */ - case 7: t.u8[6] = src[6]; - /* Falls through */ - case 6: t.u8[5] = src[5]; - /* Falls through */ - case 5: t.u8[4] = src[4]; - /* Falls through */ - case 4: t.u8[3] = src[3]; - /* Falls through */ - case 3: t.u8[2] = src[2]; - /* Falls through */ - case 2: t.u8[1] = src[1]; - /* Falls through */ - case 1: t.u8[0] = src[0]; - default: break; - } - return siphashfinish_last_part(sh, t.u64); -} - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/siphash.c b/builtins/siphash.c deleted file mode 100644 index 671fbad6..00000000 --- a/builtins/siphash.c +++ /dev/null @@ -1,77 +0,0 @@ -#include -#include -#include - -#include "siphash.h" -#include "util.h" - -public uint64_t TOMO_HASH_KEY[2] = {23, 42}; // Randomized in tomo_init() - -/* - Copyright (c) 2013 Marek Majkowski - Copyright (c) 2018 Samantha McVey - Copyright (c) 2024 Bruce Hill - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. - - - Original location: - https://github.com/majek/csiphash/ - - Original solution inspired by code from: - Samuel Neves (supercop/crypto_auth/siphash24/little) - djb (supercop/crypto_auth/siphash24/little2) - Jean-Philippe Aumasson (https://131002.net/siphash/siphash24.c) - - Extensive modifications for MoarVM by Samantha McVey - - Further modifications for Tomo by Bruce Hill -*/ - -#include "siphash-internals.h" - -public uint64_t siphash24(const uint8_t *src, size_t src_sz) { - siphash sh; - if ((uint64_t)src % __alignof__(uint64_t) == 0) { -#pragma GCC diagnostic ignored "-Wcast-align" - const uint64_t *in = (uint64_t*)src; - /* Find largest src_sz evenly divisible by 8 bytes. */ - const ptrdiff_t src_sz_nearest_8bits = ((ptrdiff_t)src_sz >> 3) << 3; - const uint64_t *goal = (uint64_t*)(src + src_sz_nearest_8bits); - siphashinit(&sh, src_sz); - src_sz -= (size_t)src_sz_nearest_8bits; - while (in < goal) { - siphashadd64bits(&sh, *in); - in++; - } - return siphashfinish(&sh, (uint8_t *)in, src_sz); - } else { - const uint8_t *in = src; - siphashinit(&sh, src_sz); - while (src_sz >= 8) { - uint64_t in_64; - memcpy(&in_64, in, sizeof(uint64_t)); - siphashadd64bits(&sh, in_64); - in += 8; src_sz -= 8; - } - return siphashfinish(&sh, (uint8_t *)in, src_sz); - } -} - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/siphash.h b/builtins/siphash.h deleted file mode 100644 index 8104a306..00000000 --- a/builtins/siphash.h +++ /dev/null @@ -1,13 +0,0 @@ -#pragma once - -// An implementation of the SipHash algorithm. 
- -#include -#include - -// This value will be randomized on startup in tomo_init(): -extern uint64_t TOMO_HASH_KEY[2]; - -uint64_t siphash24(const uint8_t *src, size_t src_sz); - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/stdlib.c b/builtins/stdlib.c deleted file mode 100644 index b8e40a54..00000000 --- a/builtins/stdlib.c +++ /dev/null @@ -1,274 +0,0 @@ -// Built-in functions - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "files.h" -#include "integers.h" -#include "metamethods.h" -#include "patterns.h" -#include "siphash.h" -#include "tables.h" -#include "text.h" -#include "util.h" - -public void tomo_init(void) -{ - GC_INIT(); - USE_COLOR = getenv("COLOR") ? strcmp(getenv("COLOR"), "1") == 0 : isatty(STDOUT_FILENO); - getrandom(TOMO_HASH_KEY, sizeof(TOMO_HASH_KEY), 0); - unsigned int seed; - getrandom(&seed, sizeof(seed), 0); - srand(seed); - srand48(seed); - Int$init_random(seed); - - if (register_printf_specifier('k', printf_text, printf_text_size)) - errx(1, "Couldn't set printf specifier"); -} - -void print_stack_trace(FILE *out, int start, int stop) -{ - // Print stack trace: - fprintf(out, "\x1b[34m"); - fflush(out); - void *array[1024]; - int64_t size = (int64_t)backtrace(array, sizeof(array)/sizeof(array[0])); - char **strings = strings = backtrace_symbols(array, size); - for (int64_t i = start; i < size - stop; i++) { - char *filename = strings[i]; - const char *cmd = heap_strf("addr2line -e %.*s -fisp | sed 's/\\$/./g;s/ at /() at /' >&2", strcspn(filename, "("), filename); - FILE *fp = popen(cmd, "w"); - if (fp) { - char *paren = strchrnul(strings[i], '('); - fprintf(fp, "%.*s\n", strcspn(paren + 1, ")"), paren + 1); - } - pclose(fp); - } - fprintf(out, "\x1b[m"); - fflush(out); -} - -__attribute__((format(printf, 1, 2))) -public _Noreturn void fail(const char *fmt, ...) 
-{ - fflush(stdout); - if (USE_COLOR) fputs("\x1b[31;7m ==================== ERROR ==================== \n\n\x1b[0;1m", stderr); - else fputs("==================== ERROR ====================\n\n", stderr); - va_list args; - va_start(args, fmt); - vfprintf(stderr, fmt, args); - if (USE_COLOR) fputs("\x1b[m", stderr); - fputs("\n\n", stderr); - va_end(args); - print_stack_trace(stderr, 2, 4); - fflush(stderr); - raise(SIGABRT); - _exit(1); -} - -__attribute__((format(printf, 4, 5))) -public _Noreturn void fail_source(const char *filename, int64_t start, int64_t end, const char *fmt, ...) -{ - if (USE_COLOR) fputs("\n\x1b[31;7m ==================== ERROR ==================== \n\n\x1b[0;1m", stderr); - else fputs("\n==================== ERROR ====================\n\n", stderr); - - va_list args; - va_start(args, fmt); - vfprintf(stderr, fmt, args); - va_end(args); - - file_t *file = filename ? load_file(filename) : NULL; - if (filename && file) { - fputs("\n", stderr); - highlight_error(file, file->text+start, file->text+end, "\x1b[31;1m", 2, USE_COLOR); - fputs("\n", stderr); - } - if (USE_COLOR) fputs("\x1b[m", stderr); - - print_stack_trace(stderr, 2, 4); - fflush(stderr); - raise(SIGABRT); - _exit(1); -} - -public Text_t builtin_last_err() -{ - return Text$from_str(strerror(errno)); -} - -static int TEST_DEPTH = 0; -static file_t *file = NULL; - -public void start_test(const char *filename, int64_t start, int64_t end) -{ - if (filename && (file == NULL || strcmp(file->filename, filename) != 0)) - file = load_file(filename); - - if (filename && file) { - for (int i = 0; i < 3*TEST_DEPTH; i++) fputc(' ', stderr); - - int64_t first_line_len = (int64_t)strcspn(file->text + start, "\r\n"); - fprintf(stderr, USE_COLOR ? "\x1b[33;1m>> \x1b[m%.*s\n" : ">> %.*s\n", first_line_len, file->text + start); - - // For multi-line expressions, dedent each and print it on a new line with ".. 
" in front: - if (end > start + first_line_len) { - int64_t line_num = get_line_number(file, file->text + start); - const char *line_start = get_line(file, line_num); - int64_t indent_len = (int64_t)strspn(line_start, " \t"); - for (const char *line = file->text + start + first_line_len; line < file->text + end; line += strcspn(line, "\r\n")) { - line += strspn(line, "\r\n"); - if ((int64_t)strspn(line, " \t") >= indent_len) - line += indent_len; - fprintf(stderr, USE_COLOR ? "\x1b[33m.. \x1b[m%.*s\n" : ".. %.*s\n", strcspn(line, "\r\n"), line); - } - } - } - ++TEST_DEPTH; -} - -public void end_test(const void *expr, const TypeInfo *type, const char *expected, const char *filename, int64_t start, int64_t end) -{ - (void)filename; - (void)start; - (void)end; - --TEST_DEPTH; - if (!expr || !type) return; - - Text_t expr_text = generic_as_text(expr, USE_COLOR, type); - Text_t type_name = generic_as_text(NULL, false, type); - - for (int i = 0; i < 3*TEST_DEPTH; i++) fputc(' ', stderr); - fprintf(stderr, USE_COLOR ? "\x1b[2m=\x1b[0m %k \x1b[2m: %k\x1b[m\n" : "= %k : %k\n", &expr_text, &type_name); - if (expected && expected[0]) { - Text_t expected_text = Text$from_str(expected); - Text_t expr_plain = USE_COLOR ? generic_as_text(expr, false, type) : expr_text; - bool success = Text$equal(&expr_plain, &expected_text); - if (!success) { - Int_t colon = Text$find(expected_text, Text(":"), I_small(1), NULL); - if (colon.small != I_small(0).small) { - Text_t with_type = Text$concat(expr_plain, Text(" : "), type_name); - success = Text$equal(&with_type, &expected_text); - } - } - - if (!success) { - fprintf(stderr, - USE_COLOR - ? 
"\n\x1b[31;7m ==================== TEST FAILED ==================== \x1b[0;1m\n\nExpected: \x1b[1;32m%s\x1b[0m\n\x1b[1m But got:\x1b[m %k\n\n" - : "\n==================== TEST FAILED ====================\nExpected: %s\n\n But got: %k\n\n", - expected, &expr_text); - - print_stack_trace(stderr, 2, 4); - fflush(stderr); - raise(SIGABRT); - } - } -} - -public void say(Text_t text, bool newline) -{ - Text$print(stdout, text); - if (newline) - fputc('\n', stdout); - fflush(stdout); -} - -public _Noreturn void tomo_exit(Text_t text, int32_t status) -{ - if (text.length > 0) - say(text, true); - _exit(status); -} - -public Text_t ask(Text_t prompt, bool bold, bool force_tty) -{ - Text_t ret = Text(""); - FILE *out = stdout; - FILE *in = stdin; - - char *line = NULL; - size_t bufsize = 0; - ssize_t length = 0; - char *gc_input = NULL; - - if (force_tty && !isatty(STDOUT_FILENO)) { - out = fopen("/dev/tty", "w"); - if (!out) goto cleanup; - } - - if (bold) fputs("\x1b[1m", out); - Text$print(out, prompt); - if (bold) fputs("\x1b[m", out); - fflush(out); - - if (force_tty && !isatty(STDIN_FILENO)) { - in = fopen("/dev/tty", "r"); - if (!in) { - fputs("\n", out); // finish the line, since the user can't - goto cleanup; - } - } - - length = getline(&line, &bufsize, in); - if (length == -1) { - fputs("\n", out); // finish the line, since we didn't get any input - goto cleanup; - } - - if (length > 0 && line[length-1] == '\n') { - line[length-1] = '\0'; - --length; - } - - gc_input = GC_MALLOC_ATOMIC((size_t)(length + 1)); - memcpy(gc_input, line, (size_t)(length + 1)); - - ret = Text$from_strn(gc_input, (size_t)(length)); - - cleanup: - if (out && out != stdout) fclose(out); - if (in && in != stdin) fclose(in); - return ret; -} - -public bool pop_flag(char **argv, int *i, const char *flag, Text_t *result) -{ - if (argv[*i][0] != '-' || argv[*i][1] != '-') { - return false; - } else if (streq(argv[*i] + 2, flag)) { - *result = (Text_t){.length=0}; - argv[*i] = NULL; - *i += 1; - 
return true; - } else if (strncmp(argv[*i] + 2, "no-", 3) == 0 && streq(argv[*i] + 5, flag)) { - *result = Text("no"); - argv[*i] = NULL; - *i += 1; - return true; - } else if (strncmp(argv[*i] + 2, flag, strlen(flag)) == 0 && argv[*i][2 + strlen(flag)] == '=') { - *result = Text$from_str(argv[*i] + 2 + strlen(flag) + 1); - argv[*i] = NULL; - *i += 1; - return true; - } else { - return false; - } -} - -public void sleep_num(double seconds) -{ - struct timespec ts; - ts.tv_sec = (time_t)seconds; - ts.tv_nsec = (long)((seconds - (double)ts.tv_sec) * 1e9); - nanosleep(&ts, NULL); -} - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/stdlib.h b/builtins/stdlib.h deleted file mode 100644 index da3ddbf7..00000000 --- a/builtins/stdlib.h +++ /dev/null @@ -1,34 +0,0 @@ -#pragma once - -// Built-in functions - -#include -#include -#include - -#include "datatypes.h" -#include "types.h" -#include "util.h" - -void tomo_init(void); -__attribute__((format(printf, 1, 2))) -_Noreturn void fail(const char *fmt, ...); -__attribute__((format(printf, 4, 5))) -_Noreturn void fail_source(const char *filename, int64_t start, int64_t end, const char *fmt, ...); -Text_t builtin_last_err(); -void start_test(const char *filename, int64_t start, int64_t end); -void end_test(const void *expr, const TypeInfo *type, const char *expected, const char *filename, int64_t start, int64_t end); -#define test(expr, typeinfo, expected, start, end) {\ - start_test(__SOURCE_FILE__, start, end); \ - auto _expr = expr; \ - end_test(&_expr, typeinfo, expected, __SOURCE_FILE__, start, end); } -void say(Text_t text, bool newline); -Text_t ask(Text_t prompt, bool bold, bool force_tty); -_Noreturn void tomo_exit(Text_t text, int32_t status); - -Closure_t spawn(Closure_t fn); -bool pop_flag(char **argv, int *i, const char *flag, Text_t *result); -void print_stack_trace(FILE *out, int start, int stop); -void sleep_num(double seconds); - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git 
a/builtins/tables.c b/builtins/tables.c deleted file mode 100644 index e644fd23..00000000 --- a/builtins/tables.c +++ /dev/null @@ -1,636 +0,0 @@ -// table.c - C Hash table implementation -// Copyright 2024 Bruce Hill -// Provided under the MIT license with the Commons Clause -// See included LICENSE for details. - -// Hash table (aka Dictionary) Implementation -// Hash keys and values are stored *by value* -// The hash insertion/lookup implementation is based on Lua's tables, -// which use a chained scatter with Brent's variation. - -#include -#include -#include -#include -#include -#include -#include - -#include "arrays.h" -#include "c_strings.h" -#include "datatypes.h" -#include "memory.h" -#include "metamethods.h" -#include "siphash.h" -#include "tables.h" -#include "text.h" -#include "types.h" -#include "util.h" - -// #define DEBUG_TABLES - -#ifdef DEBUG_TABLES -#define hdebug(fmt, ...) printf("\x1b[2m" fmt "\x1b[m" __VA_OPT__(,) __VA_ARGS__) -#else -#define hdebug(...) (void)0 -#endif - -// Helper accessors for type functions/values: -#define HASH_KEY(t, k) (generic_hash((k), type->TableInfo.key) % ((t).bucket_info->count)) -#define EQUAL_KEYS(x, y) (generic_equal((x), (y), type->TableInfo.key)) -#define END_OF_CHAIN UINT32_MAX - -#define GET_ENTRY(t, i) ((t).entries.data + (t).entries.stride*(i)) - -static const TypeInfo MemoryPointer = { - .size=sizeof(void*), - .align=__alignof__(void*), - .tag=PointerInfo, - .PointerInfo={ - .sigil="@", - .pointed=&Memory$info, - }, -}; - -const TypeInfo CStrToVoidStarTable = { - .size=sizeof(Table_t), - .align=__alignof__(Table_t), - .tag=TableInfo, - .TableInfo={.key=&CString$info, .value=&MemoryPointer}, -}; - -PUREFUNC static inline size_t entry_size(const TypeInfo *info) -{ - size_t size = (size_t)info->TableInfo.key->size; - if (info->TableInfo.value->align > 1 && size % (size_t)info->TableInfo.value->align) - size += (size_t)info->TableInfo.value->align - (size % (size_t)info->TableInfo.value->align); // padding - 
size += (size_t)info->TableInfo.value->size; - if (info->TableInfo.key->align > 1 && size % (size_t)info->TableInfo.key->align) - size += (size_t)info->TableInfo.key->align - (size % (size_t)info->TableInfo.key->align); // padding - return size; -} - -PUREFUNC static inline size_t entry_align(const TypeInfo *info) -{ - return (size_t)MAX(info->TableInfo.key->align, info->TableInfo.value->align); -} - -PUREFUNC static inline size_t value_offset(const TypeInfo *info) -{ - size_t offset = (size_t)info->TableInfo.key->size; - if ((size_t)info->TableInfo.value->align > 1 && offset % (size_t)info->TableInfo.value->align) - offset += (size_t)info->TableInfo.value->align - (offset % (size_t)info->TableInfo.value->align); // padding - return offset; -} - -static inline void hshow(const Table_t *t) -{ - hdebug("{"); - for (uint32_t i = 0; t->bucket_info && i < t->bucket_info->count; i++) { - if (i > 0) hdebug(" "); - if (t->bucket_info->buckets[i].occupied) - hdebug("[%d]=%d(%d)", i, t->bucket_info->buckets[i].index, t->bucket_info->buckets[i].next_bucket); - else - hdebug("[%d]=_", i); - } - hdebug("}\n"); -} - -static void maybe_copy_on_write(Table_t *t, const TypeInfo *type) -{ - if (t->entries.data_refcount != 0) - Array$compact(&t->entries, (int64_t)entry_size(type)); - - if (t->bucket_info && t->bucket_info->data_refcount != 0) { - size_t size = sizeof(bucket_info_t) + sizeof(bucket_t[t->bucket_info->count]); - t->bucket_info = memcpy(GC_MALLOC(size), t->bucket_info, size); - t->bucket_info->data_refcount = 0; - } -} - -// Return address of value or NULL -PUREFUNC public void *Table$get_raw(Table_t t, const void *key, const TypeInfo *type) -{ - assert(type->tag == TableInfo); - if (!key || !t.bucket_info) return NULL; - - uint64_t hash = HASH_KEY(t, key); - hshow(&t); - hdebug("Getting value with initial probe at %u\n", hash); - bucket_t *buckets = t.bucket_info->buckets; - for (uint64_t i = hash; buckets[i].occupied; i = buckets[i].next_bucket) { - hdebug("Checking 
against key in bucket %u\n", i); - void *entry = GET_ENTRY(t, buckets[i].index); - if (EQUAL_KEYS(entry, key)) { - hdebug("Found key!\n"); - return entry + value_offset(type); - } - if (buckets[i].next_bucket == END_OF_CHAIN) - break; - } - return NULL; -} - -PUREFUNC public void *Table$get(Table_t t, const void *key, const TypeInfo *type) -{ - assert(type->tag == TableInfo); - for (const Table_t *iter = &t; iter; iter = iter->fallback) { - void *ret = Table$get_raw(*iter, key, type); - if (ret) return ret; - } - return NULL; -} - -static void Table$set_bucket(Table_t *t, const void *entry, int32_t index, const TypeInfo *type) -{ - assert(t->bucket_info); - hshow(t); - const void *key = entry; - bucket_t *buckets = t->bucket_info->buckets; - uint64_t hash = HASH_KEY(*t, key); - hdebug("Hash value (mod %u) = %u\n", t->bucket_info->count, hash); - bucket_t *bucket = &buckets[hash]; - if (!bucket->occupied) { - hdebug("Got an empty space\n"); - // Empty space: - bucket->occupied = 1; - bucket->index = index; - bucket->next_bucket = END_OF_CHAIN; - hshow(t); - return; - } - - hdebug("Collision detected in bucket %u (entry %u)\n", hash, bucket->index); - - while (buckets[t->bucket_info->last_free].occupied) { - assert(t->bucket_info->last_free > 0); - --t->bucket_info->last_free; - } - - uint64_t collided_hash = HASH_KEY(*t, GET_ENTRY(*t, bucket->index)); - if (collided_hash != hash) { // Collided with a mid-chain entry - hdebug("Hit a mid-chain entry at bucket %u (chain starting at %u)\n", hash, collided_hash); - // Find chain predecessor - uint64_t predecessor = collided_hash; - while (buckets[predecessor].next_bucket != hash) - predecessor = buckets[predecessor].next_bucket; - - // Move mid-chain entry to free space and update predecessor - buckets[predecessor].next_bucket = t->bucket_info->last_free; - buckets[t->bucket_info->last_free] = *bucket; - } else { // Collided with the start of a chain - hdebug("Hit start of a chain\n"); - uint64_t end_of_chain = hash; - 
while (buckets[end_of_chain].next_bucket != END_OF_CHAIN) - end_of_chain = buckets[end_of_chain].next_bucket; - hdebug("Appending to chain\n"); - // Chain now ends on the free space: - buckets[end_of_chain].next_bucket = t->bucket_info->last_free; - bucket = &buckets[t->bucket_info->last_free]; - } - - bucket->occupied = 1; - bucket->index = index; - bucket->next_bucket = END_OF_CHAIN; - hshow(t); -} - -static void hashmap_resize_buckets(Table_t *t, uint32_t new_capacity, const TypeInfo *type) -{ - if (__builtin_expect(new_capacity > TABLE_MAX_BUCKETS, 0)) - fail("Table has exceeded the maximum table size (2^31) and cannot grow further!"); - hdebug("About to resize from %u to %u\n", t->bucket_info ? t->bucket_info->count : 0, new_capacity); - hshow(t); - size_t alloc_size = sizeof(bucket_info_t) + sizeof(bucket_t[new_capacity]); - t->bucket_info = GC_MALLOC_ATOMIC(alloc_size); - memset(t->bucket_info->buckets, 0, sizeof(bucket_t[new_capacity])); - t->bucket_info->count = new_capacity; - t->bucket_info->last_free = new_capacity-1; - // Rehash: - for (int64_t i = 0; i < Table$length(*t); i++) { - hdebug("Rehashing %u\n", i); - Table$set_bucket(t, GET_ENTRY(*t, i), i, type); - } - - hshow(t); - hdebug("Finished resizing\n"); -} - -// Return address of value -#pragma GCC diagnostic ignored "-Wstack-protector" -public void *Table$reserve(Table_t *t, const void *key, const void *value, const TypeInfo *type) -{ - assert(type->tag == TableInfo); - if (!t || !key) return NULL; - hshow(t); - - int64_t key_size = type->TableInfo.key->size, - value_size = type->TableInfo.value->size; - if (!t->bucket_info || t->bucket_info->count == 0) { - hashmap_resize_buckets(t, 4, type); - } else { - // Check if we are clobbering a value: - void *value_home = Table$get_raw(*t, key, type); - if (value_home) { // Update existing slot - // Ensure that `value_home` is still inside t->entries, even if COW occurs - ptrdiff_t offset = value_home - t->entries.data; - maybe_copy_on_write(t, type); 
- value_home = t->entries.data + offset; - - if (value && value_size > 0) - memcpy(value_home, value, (size_t)value_size); - - return value_home; - } - } - // Otherwise add a new entry: - - // Resize buckets if necessary - if (t->entries.length >= (int64_t)t->bucket_info->count) { - uint32_t newsize = (uint32_t)t->bucket_info->count + MIN((uint32_t)t->bucket_info->count, 64); - if (__builtin_expect(newsize > TABLE_MAX_BUCKETS, 0)) - newsize = t->entries.length + 1; - hashmap_resize_buckets(t, newsize, type); - } - - if (!value && value_size > 0) { - for (Table_t *iter = t->fallback; iter; iter = iter->fallback) { - value = Table$get_raw(*iter, key, type); - if (value) break; - } - } - - maybe_copy_on_write(t, type); - - char buf[entry_size(type)]; - memset(buf, 0, sizeof(buf)); - memcpy(buf, key, (size_t)key_size); - if (value && value_size > 0) - memcpy(buf + value_offset(type), value, (size_t)value_size); - else - memset(buf + value_offset(type), 0, (size_t)value_size); - Array$insert(&t->entries, buf, I(0), (int64_t)entry_size(type)); - - int64_t entry_index = t->entries.length-1; - void *entry = GET_ENTRY(*t, entry_index); - Table$set_bucket(t, entry, entry_index, type); - return entry + value_offset(type); -} - -public void Table$set(Table_t *t, const void *key, const void *value, const TypeInfo *type) -{ - assert(type->tag == TableInfo); - (void)Table$reserve(t, key, value, type); -} - -public void Table$remove(Table_t *t, const void *key, const TypeInfo *type) -{ - assert(type->tag == TableInfo); - if (!t || Table$length(*t) == 0) return; - - // TODO: this work doesn't need to be done if the key is already missing - maybe_copy_on_write(t, type); - - // If unspecified, pop the last key: - if (!key) - key = GET_ENTRY(*t, t->entries.length-1); - - // Steps: look up the bucket for the removed key - // If missing, then return immediately - // Swap last key/value into the removed bucket's index1 - // Zero out the last key/value and decrement the count - // Find 
the last key/value's bucket and update its index1 - // Look up the bucket for the removed key - // If bucket is first in chain: - // Move bucket->next to bucket's spot - // zero out bucket->next's old spot - // maybe update lastfree_index1 to second-in-chain's index - // Else: - // set prev->next = bucket->next - // zero out bucket - // maybe update lastfree_index1 to removed bucket's index - - uint64_t hash = HASH_KEY(*t, key); - hdebug("Removing key with hash %u\n", hash); - bucket_t *bucket, *prev = NULL; - for (uint64_t i = hash; t->bucket_info->buckets[i].occupied; i = t->bucket_info->buckets[i].next_bucket) { - if (EQUAL_KEYS(GET_ENTRY(*t, t->bucket_info->buckets[i].index), key)) { - bucket = &t->bucket_info->buckets[i]; - hdebug("Found key to delete in bucket %u\n", i); - goto found_it; - } - if (t->bucket_info->buckets[i].next_bucket == END_OF_CHAIN) - return; - prev = &t->bucket_info->buckets[i]; - } - return; - - found_it:; - assert(bucket->occupied); - - // Always remove the last entry. If we need to remove some other entry, - // swap the other entry into the last position and then remove the last - // entry. 
This disturbs the ordering of the table, but keeps removal O(1) - // instead of O(N) - int64_t last_entry = t->entries.length-1; - if (bucket->index != last_entry) { - hdebug("Removing key/value from the middle of the entries array\n"); - - // Find the bucket that points to the last entry's index: - uint64_t i = HASH_KEY(*t, GET_ENTRY(*t, last_entry)); - while (t->bucket_info->buckets[i].index != last_entry) - i = t->bucket_info->buckets[i].next_bucket; - // Update the bucket to point to the last entry's new home (the space - // where the removed entry currently sits): - t->bucket_info->buckets[i].index = bucket->index; - - // Clobber the entry being removed (in the middle of the array) with - // the last entry: - memcpy(GET_ENTRY(*t, bucket->index), GET_ENTRY(*t, last_entry), entry_size(type)); - } - - // Last entry is being removed, so clear it out to be safe: - memset(GET_ENTRY(*t, last_entry), 0, entry_size(type)); - - Array$remove_at(&t->entries, I(t->entries.length), I(1), (int64_t)entry_size(type)); - - int64_t bucket_to_clear; - if (prev) { // Middle (or end) of a chain - hdebug("Removing from middle of a chain\n"); - bucket_to_clear = (bucket - t->bucket_info->buckets); - prev->next_bucket = bucket->next_bucket; - } else if (bucket->next_bucket != END_OF_CHAIN) { // Start of a chain - hdebug("Removing from start of a chain\n"); - bucket_to_clear = bucket->next_bucket; - *bucket = t->bucket_info->buckets[bucket_to_clear]; - } else { // Empty chain - hdebug("Removing from empty chain\n"); - bucket_to_clear = (bucket - t->bucket_info->buckets); - } - - t->bucket_info->buckets[bucket_to_clear] = (bucket_t){0}; - if (bucket_to_clear > t->bucket_info->last_free) - t->bucket_info->last_free = bucket_to_clear; - - hshow(t); -} - -CONSTFUNC public void *Table$entry(Table_t t, int64_t n) -{ - if (n < 1 || n > Table$length(t)) - return NULL; - return GET_ENTRY(t, n-1); -} - -public void Table$clear(Table_t *t) -{ - memset(t, 0, sizeof(Table_t)); -} - -public Table_t 
Table$sorted(Table_t t, const TypeInfo *type) -{ - Closure_t cmp = (Closure_t){.fn=generic_compare, .userdata=(void*)type->TableInfo.key}; - Array_t entries = Array$sorted(t.entries, cmp, (int64_t)entry_size(type)); - return Table$from_entries(entries, type); -} - -PUREFUNC public bool Table$equal(const Table_t *x, const Table_t *y, const TypeInfo *type) -{ - if (x == y) return true; - - assert(type->tag == TableInfo); - if (Table$length(*x) != Table$length(*y)) - return false; - - if ((x->fallback != NULL) != (y->fallback != NULL)) - return false; - - return (Table$compare(x, y, type) == 0); -} - -PUREFUNC public int32_t Table$compare(const Table_t *x, const Table_t *y, const TypeInfo *type) -{ - if (x == y) return 0; - - assert(type->tag == TableInfo); - auto table = type->TableInfo; - if (x->entries.length == 0) - return 0; - else if (x->entries.length != y->entries.length) - return (x->entries.length > y->entries.length) - (x->entries.length < y->entries.length); - - for (int64_t i = 0; i < x->entries.length; i++) { - void *x_key = x->entries.data + x->entries.stride * i; - void *y_key = y->entries.data + y->entries.stride * i; - int32_t diff = generic_compare(x_key, y_key, table.key); - if (diff != 0) return diff; - void *x_value = x_key + value_offset(type); - void *y_value = y_key + value_offset(type); - diff = generic_compare(x_value, y_value, table.value); - if (diff != 0) return diff; - } - - if (!x->fallback != !y->fallback) { - return (!x->fallback) - (!y->fallback); - } else if (x->fallback && y->fallback) { - return generic_compare(x->fallback, y->fallback, type); - } - - return 0; -} - -PUREFUNC public uint64_t Table$hash(const Table_t *t, const TypeInfo *type) -{ - assert(type->tag == TableInfo); - // Table hashes are computed as: - // hash(hash(t.keys), hash(t.values), hash(t.fallback), hash(t.default)) - // Where fallback and default hash to zero if absent - auto table = type->TableInfo; - uint64_t components[] = { - Array$hash(&t->entries, 
Array$info(table.key)), - Array$hash(&t->entries + value_offset(type), Array$info(table.value)), - t->fallback ? Table$hash(t->fallback, type) : 0, - }; - return siphash24((void*)&components, sizeof(components)); -} - -public Text_t Table$as_text(const Table_t *t, bool colorize, const TypeInfo *type) -{ - assert(type->tag == TableInfo); - auto table = type->TableInfo; - - if (!t) { - if (table.value != &Void$info) - return Text$concat( - Text("{"), - generic_as_text(NULL, false, table.key), - Text(":"), - generic_as_text(NULL, false, table.value), - Text("}")); - else - return Text$concat( - Text("{"), - generic_as_text(NULL, false, table.key), - Text("}")); - } - - int64_t val_off = (int64_t)value_offset(type); - Text_t text = Text("{"); - for (int64_t i = 0, length = Table$length(*t); i < length; i++) { - if (i > 0) - text = Text$concat(text, Text(", ")); - void *entry = GET_ENTRY(*t, i); - text = Text$concat(text, generic_as_text(entry, colorize, table.key)); - if (table.value != &Void$info) - text = Text$concat(text, Text(":"), generic_as_text(entry + val_off, colorize, table.value)); - } - - if (t->fallback) { - text = Text$concat(text, Text("; fallback="), Table$as_text(t->fallback, colorize, type)); - } - - text = Text$concat(text, Text("}")); - return text; -} - -public Table_t Table$from_entries(Array_t entries, const TypeInfo *type) -{ - assert(type->tag == TableInfo); - if (entries.length == 0) - return (Table_t){}; - - Table_t t = {}; - int64_t length = entries.length + entries.length / 4; - size_t alloc_size = sizeof(bucket_info_t) + sizeof(bucket_t[length]); - t.bucket_info = GC_MALLOC_ATOMIC(alloc_size); - memset(t.bucket_info->buckets, 0, sizeof(bucket_t[length])); - t.bucket_info->count = length; - t.bucket_info->last_free = length-1; - - size_t offset = value_offset(type); - for (int64_t i = 0; i < entries.length; i++) { - void *key = entries.data + i*entries.stride; - Table$set(&t, key, key + offset, type); - } - return t; -} - -// Overlap is 
"set intersection" in formal terms -public Table_t Table$overlap(Table_t a, Table_t b, const TypeInfo *type) -{ - // Return a table such that t[k]==a[k] for all k such that a:has(k), b:has(k), and a[k]==b[k] - Table_t result = {}; - const size_t offset = value_offset(type); - for (int64_t i = 0; i < Table$length(a); i++) { - void *key = GET_ENTRY(a, i); - void *a_value = key + offset; - void *b_value = Table$get(b, key, type); - if (b_value && generic_equal(a_value, b_value, type->TableInfo.value)) - Table$set(&result, key, a_value, type); - } - - if (a.fallback) { - result.fallback = new(Table_t); - *result.fallback = Table$overlap(*a.fallback, b, type); - } - - return result; -} - -// With is "set union" in formal terms -public Table_t Table$with(Table_t a, Table_t b, const TypeInfo *type) -{ - // return a table such that t[k]==b[k] for all k such that b:has(k), and t[k]==a[k] for all k such that a:has(k) and not b:has(k) - Table_t result = {}; - const size_t offset = value_offset(type); - for (int64_t i = 0; i < Table$length(a); i++) { - void *key = GET_ENTRY(a, i); - Table$set(&result, key, key + offset, type); - } - for (int64_t i = 0; i < Table$length(b); i++) { - void *key = GET_ENTRY(b, i); - Table$set(&result, key, key + offset, type); - } - - if (a.fallback && b.fallback) { - result.fallback = new(Table_t); - *result.fallback = Table$with(*a.fallback, *b.fallback, type); - } else { - result.fallback = a.fallback ? 
a.fallback : b.fallback; - } - - return result; -} - -// Without is "set difference" in formal terms -public Table_t Table$without(Table_t a, Table_t b, const TypeInfo *type) -{ - // Return a table such that t[k]==a[k] for all k such that not b:has(k) or b[k] != a[k] - Table_t result = {}; - const size_t offset = value_offset(type); - for (int64_t i = 0; i < Table$length(a); i++) { - void *key = GET_ENTRY(a, i); - void *a_value = key + offset; - void *b_value = Table$get(b, key, type); - if (!b_value || !generic_equal(a_value, b_value, type->TableInfo.value)) - Table$set(&result, key, a_value, type); - } - - if (a.fallback) { - result.fallback = new(Table_t); - *result.fallback = Table$without(*a.fallback, b, type); - } - - return result; -} - -PUREFUNC public bool Table$is_subset_of(Table_t a, Table_t b, bool strict, const TypeInfo *type) -{ - if (a.entries.length > b.entries.length || (strict && a.entries.length == b.entries.length)) - return false; - - for (int64_t i = 0; i < Table$length(a); i++) { - void *found = Table$get_raw(b, GET_ENTRY(a, i), type); - if (!found) return false; - } - return true; -} - -PUREFUNC public bool Table$is_superset_of(Table_t a, Table_t b, bool strict, const TypeInfo *type) -{ - return Table$is_subset_of(b, a, strict, type); -} - -PUREFUNC public void *Table$str_get(Table_t t, const char *key) -{ - void **ret = Table$get(t, &key, &CStrToVoidStarTable); - return ret ? *ret : NULL; -} - -PUREFUNC public void *Table$str_get_raw(Table_t t, const char *key) -{ - void **ret = Table$get_raw(t, &key, &CStrToVoidStarTable); - return ret ? 
*ret : NULL; -} - -public void *Table$str_reserve(Table_t *t, const char *key, const void *value) -{ - return Table$reserve(t, &key, &value, &CStrToVoidStarTable); -} - -public void Table$str_set(Table_t *t, const char *key, const void *value) -{ - Table$set(t, &key, &value, &CStrToVoidStarTable); -} - -public void Table$str_remove(Table_t *t, const char *key) -{ - return Table$remove(t, &key, &CStrToVoidStarTable); -} - -CONSTFUNC public void *Table$str_entry(Table_t t, int64_t n) -{ - return Table$entry(t, n); -} - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1 diff --git a/builtins/tables.h b/builtins/tables.h deleted file mode 100644 index 53e0c583..00000000 --- a/builtins/tables.h +++ /dev/null @@ -1,84 +0,0 @@ -#pragma once - -// Hash table datastructure with methods and type information - -#include -#include -#include - -#include "arrays.h" -#include "datatypes.h" -#include "types.h" -#include "util.h" - -#define Table(key_t, val_t, key_info, value_info, fb, N, ...) ({ \ - struct { key_t k; val_t v; } ents[N] = {__VA_ARGS__}; \ - Table_t table = Table$from_entries((Array_t){ \ - .data=memcpy(GC_MALLOC(sizeof(ents)), ents, sizeof(ents)), \ - .length=sizeof(ents)/sizeof(ents[0]), \ - .stride=(void*)&ents[1] - (void*)&ents[0], \ - }, Table$info(key_info, value_info)); \ - table.fallback = fb; \ - table; }) -#define Set(item_t, item_info, N, ...) 
({ \ - item_t ents[N] = {__VA_ARGS__}; \ - Table_t set = Table$from_entries((Array_t){ \ - .data=memcpy(GC_MALLOC(sizeof(ents)), ents, sizeof(ents)), \ - .length=sizeof(ents)/sizeof(ents[0]), \ - .stride=(void*)&ents[1] - (void*)&ents[0], \ - }, Set$info(item_info)); \ - set; }) - -Table_t Table$from_entries(Array_t entries, const TypeInfo *type); -void *Table$get(Table_t t, const void *key, const TypeInfo *type); -#define Table$get_optional(table_expr, key_t, val_t, key_expr, nonnull_var, nonnull_expr, null_expr, info_expr) ({ \ - const Table_t t = table_expr; const key_t k = key_expr; \ - val_t *nonnull_var = Table$get(t, &k, info_expr); \ - nonnull_var ? nonnull_expr : null_expr; }) -#define Table$has_value(table_expr, key_expr, info_expr) ({ \ - const Table_t t = table_expr; __typeof(key_expr) k = key_expr; \ - (Table$get(t, &k, info_expr) != NULL); }) -PUREFUNC void *Table$get_raw(Table_t t, const void *key, const TypeInfo *type); -CONSTFUNC void *Table$entry(Table_t t, int64_t n); -void *Table$reserve(Table_t *t, const void *key, const void *value, const TypeInfo *type); -void Table$set(Table_t *t, const void *key, const void *value, const TypeInfo *type); -#define Table$set_value(t, key_expr, value_expr, type) ({ __typeof(key_expr) k = key_expr; __typeof(value_expr) v = value_expr; \ - Table$set(t, &k, &v, type); }) -#define Table$reserve_value(t, key_expr, type) ({ __typeof(key_expr) k = key_expr; Table$reserve(t, &k, NULL, type); }) -#define Table$bump(t_expr, key_expr, amount_expr, type) ({ __typeof(key_expr) key = key_expr; \ - Table_t *t = t_expr; \ - __typeof(amount_expr) *val = Table$get_raw(*t, &key, type); \ - if (val) *val += amount_expr; \ - else { __typeof(amount_expr) init = amount_expr; Table$set(t, &key, &init, type); } (void)0; }) - -void Table$remove(Table_t *t, const void *key, const TypeInfo *type); -#define Table$remove_value(t, key_expr, type) ({ __typeof(key_expr) k = key_expr; Table$remove(t, &k, type); }) - -Table_t 
Table$overlap(Table_t a, Table_t b, const TypeInfo *type); -Table_t Table$with(Table_t a, Table_t b, const TypeInfo *type); -Table_t Table$without(Table_t a, Table_t b, const TypeInfo *type); -PUREFUNC bool Table$is_subset_of(Table_t a, Table_t b, bool strict, const TypeInfo *type); -PUREFUNC bool Table$is_superset_of(Table_t a, Table_t b, bool strict, const TypeInfo *type); - -void Table$clear(Table_t *t); -Table_t Table$sorted(Table_t t, const TypeInfo *type); -void Table$mark_copy_on_write(Table_t *t); -#define TABLE_INCREF(t) ({ ARRAY_INCREF((t).entries); if ((t).bucket_info) (t).bucket_info->data_refcount += ((t).bucket_info->data_refcount < TABLE_MAX_DATA_REFCOUNT); }) -#define TABLE_COPY(t) ({ TABLE_INCREF(t); t; }) -PUREFUNC int32_t Table$compare(const Table_t *x, const Table_t *y, const TypeInfo *type); -PUREFUNC bool Table$equal(const Table_t *x, const Table_t *y, const TypeInfo *type); -PUREFUNC uint64_t Table$hash(const Table_t *t, const TypeInfo *type); -Text_t Table$as_text(const Table_t *t, bool colorize, const TypeInfo *type); - -CONSTFUNC void *Table$str_entry(Table_t t, int64_t n); -PUREFUNC void *Table$str_get(Table_t t, const char *key); -PUREFUNC void *Table$str_get_raw(Table_t t, const char *key); -void Table$str_set(Table_t *t, const char *key, const void *value); -void *Table$str_reserve(Table_t *t, const char *key, const void *value); -void Table$str_remove(Table_t *t, const char *key); - -#define Table$length(t) ((t).entries.length) - -extern const TypeInfo CStrToVoidStarTable; - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1 diff --git a/builtins/text.c b/builtins/text.c deleted file mode 100644 index 283dfb01..00000000 --- a/builtins/text.c +++ /dev/null @@ -1,1302 +0,0 @@ -// Type info and methods for Text datatype, which uses libunistr for Unicode -// support and implements a datastructure based on Raku/MoarVM's strings to -// efficiently store arbitrary unicode data using a mix of densely packed plain -// ASCII, 32-bit integers 
representing grapheme clusters (see below), and ropes -// that represent text that is a composite of multiple subtexts. Subtexts are -// only nested one level deep, not arbitrarily deep trees. -// -// A note on grapheme clusters: In Unicode, codepoints can be represented using -// a 32-bit integer. Most codepoints correspond to the intuitive notion of a -// "letter", which is more formally known as a "grapheme cluster". A grapheme -// cluster is roughly speaking the amount of text that your cursor moves over -// when you press the arrow key once. However, some codepoints act as modifiers -// on other codepoints. For example, U+0301 (COMBINING ACUTE ACCENT) can modify -// a letter like "e" to form "é". During normalization, this frequently -// resolves down to a single unicode codepoint, in this case, "é" resolves to -// the single codepoint U+00E9 (LATIN SMALL LETTER E WITH ACUTE). However, in -// some cases, multiple codepoints make up a grapheme cluster but *don't* -// normalize to a single codepoint. For example, LATIN SMALL LETTER E (U+0065) -// + COMBINING VERTICAL LINE BELOW (U+0329) combine to form an unusual glyph -// that is not used frequently enough to warrant its own unique codepoint (this -// is basically what Zalgo text is). -// -// There are a lot of benefits to storing text with one grapheme cluster per -// index in a densely packed array. It lets us have one canonical length for -// the text that can be precomputed and is meaningful to users. It lets us -// quickly get the Nth "letter" in the text. Substring slicing is fast. -// However, since not all grapheme clusters take up the same number of -// codepoints, we're faced with the problem of how to jam multiple codepoints -// into a single 32-bit slot. Inspired by Raku and MoarVM's approach, this -// implementation uses "synthetic graphemes" (in Raku's terms, Normal Form -// Graphemes, aka NFG). 
A synthetic grapheme is a negative 32-bit signed -// integer that represents a multi-codepoint grapheme cluster that has been -// encountered during the program's runtime. These clusters are stored in a -// lookup array and hash map so that we can rapidly convert between the -// synthetic grapheme integer ID and the unicode codepoints associated with it. -// Essentially, it's like we create a supplement to the unicode standard with -// things that would be nice if they had their own codepoint so things worked -// out nicely because we're using them right now, and we'll give them a -// negative number so it doesn't overlap with any real codepoints. -// -// Example 1: U+0048, U+00E9 -// AKA: LATIN CAPITAL LETTER H, LATIN SMALL LETTER E WITH ACUTE -// This would be stored as: (int32_t[]){0x48, 0xE9} -// Example 2: U+0048, U+0065, U+0309 -// AKA: LATIN CAPITAL LETTER H, LATIN SMALL LETTER E, COMBINING VERTICAL LINE BELOW -// This would be stored as: (int32_t[]){0x48, -2} -// Where -2 is used as a lookup in an array that holds the actual unicode codepoints: -// (ucs4_t[]){0x65, 0x0309} - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "arrays.h" -#include "integers.h" -#include "patterns.h" -#include "tables.h" -#include "text.h" - -// Use inline version of the siphash code for performance: -#include "siphash.h" -#include "siphash-internals.h" - -typedef struct { - ucs4_t main_codepoint; - ucs4_t *utf32_cluster; // length-prefixed - const uint8_t *utf8; -} synthetic_grapheme_t; - -// Synthetic grapheme clusters (clusters of more than one codepoint): -static Table_t grapheme_ids_by_codepoints = {}; // ucs4_t* length-prefixed codepoints -> int32_t ID - -// This will hold a dynamically growing array of synthetic graphemes: -static synthetic_grapheme_t *synthetic_graphemes = NULL; -static int32_t synthetic_grapheme_capacity = 0; -static int32_t num_synthetic_graphemes = 0; - -#define 
MAIN_GRAPHEME_CODEPOINT(_g) ({ int32_t g = _g; (g) >= 0 ? (ucs4_t)(g) : synthetic_graphemes[-(g)-1].main_codepoint; }) -#define NUM_GRAPHEME_CODEPOINTS(id) (synthetic_graphemes[-(id)-1].utf32_cluster[0]) -#define GRAPHEME_CODEPOINTS(id) (&synthetic_graphemes[-(id)-1].utf32_cluster[1]) -#define GRAPHEME_UTF8(id) (synthetic_graphemes[-(id)-1].utf8) - -static Text_t text_from_u32(ucs4_t *codepoints, int64_t num_codepoints, bool normalize); - -PUREFUNC static bool graphemes_equal(ucs4_t **a, ucs4_t **b) { - if ((*a)[0] != (*b)[0]) return false; - for (int i = 0; i < (int)(*a)[0]; i++) - if ((*a)[i] != (*b)[i]) return false; - return true; -} - -PUREFUNC static uint64_t grapheme_hash(ucs4_t **g) { - ucs4_t *cluster = *g; - return siphash24((void*)&cluster[1], sizeof(ucs4_t[cluster[0]])); -} - -static const TypeInfo GraphemeClusterInfo = { - .size=sizeof(ucs4_t*), - .align=__alignof__(ucs4_t*), - .tag=CustomInfo, - .CustomInfo={.equal=(void*)graphemes_equal, .hash=(void*)grapheme_hash}, -}; - -static const TypeInfo GraphemeIDLookupTableInfo = { - .size=sizeof(Table_t), .align=__alignof__(Table_t), - .tag=TableInfo, .TableInfo={.key=&GraphemeClusterInfo, .value=&Int32$info}, -}; - -#pragma GCC diagnostic ignored "-Wstack-protector" -public int32_t get_synthetic_grapheme(const ucs4_t *codepoints, int64_t utf32_len) -{ - ucs4_t length_prefixed[1+utf32_len] = {}; - length_prefixed[0] = (ucs4_t)utf32_len; - for (int i = 0; i < utf32_len; i++) - length_prefixed[i+1] = codepoints[i]; - ucs4_t *ptr = &length_prefixed[0]; - - // Optimization for common case of one frequently used synthetic grapheme: - static int32_t last_grapheme = 0; - if (last_grapheme != 0 && graphemes_equal(&ptr, &synthetic_graphemes[-last_grapheme-1].utf32_cluster)) - return last_grapheme; - - int32_t *found = Table$get(grapheme_ids_by_codepoints, &ptr, &GraphemeIDLookupTableInfo); - if (found) return *found; - - // New synthetic grapheme: - if (num_synthetic_graphemes >= synthetic_grapheme_capacity) { - // 
If we don't have space, allocate more: - synthetic_grapheme_capacity = MAX(128, synthetic_grapheme_capacity * 2); - synthetic_grapheme_t *new = GC_MALLOC(sizeof(synthetic_grapheme_t[synthetic_grapheme_capacity])); - memcpy(new, synthetic_graphemes, sizeof(synthetic_grapheme_t[num_synthetic_graphemes])); - synthetic_graphemes = new; - } - - int32_t grapheme_id = -(num_synthetic_graphemes+1); - num_synthetic_graphemes += 1; - - // Get UTF8 representation: - uint8_t u8_buf[64]; - size_t u8_len = sizeof(u8_buf)/sizeof(u8_buf[0]); - uint8_t *u8 = u32_to_u8(codepoints, (size_t)utf32_len, u8_buf, &u8_len); - - // For performance reasons, use an arena allocator here to ensure that - // synthetic graphemes store all of their information in a densely packed - // area with good cache locality: - static void *arena = NULL, *arena_end = NULL; - // Eat up any space needed to make arena 32-bit aligned: - if ((size_t)arena % __alignof__(ucs4_t) != 0) - arena += __alignof__(ucs4_t) - ((size_t)arena % __alignof__(ucs4_t)); - - // If we have filled up this arena, allocate a new one: - size_t needed_memory = sizeof(ucs4_t[1+utf32_len]) + sizeof(uint8_t[u8_len + 1]); - if (arena + needed_memory > arena_end) { - // Do reasonably big chunks at a time, so most synthetic codepoints are - // nearby each other in memory and cache locality is good. 
This is a - // rough guess at a good size: - size_t chunk_size = MAX(needed_memory, 512); - arena = GC_MALLOC_ATOMIC(chunk_size); - arena_end = arena + chunk_size; - } - - // Copy length-prefixed UTF32 codepoints into the arena and store where they live: - ucs4_t *codepoint_copy = arena; - mempcpy(codepoint_copy, length_prefixed, sizeof(ucs4_t[1+utf32_len])); - synthetic_graphemes[-grapheme_id-1].utf32_cluster = codepoint_copy; - arena += sizeof(ucs4_t[1+utf32_len]); - - // Copy UTF8 bytes into the arena and store where they live: - uint8_t *utf8_final = arena; - memcpy(utf8_final, u8, sizeof(uint8_t[u8_len])); - utf8_final[u8_len] = '\0'; // Add a terminating NUL byte - synthetic_graphemes[-grapheme_id-1].utf8 = utf8_final; - arena += sizeof(uint8_t[u8_len + 1]); - - // Sickos at the unicode consortium decreed that you can have grapheme clusters - // that begin with *prefix* modifiers, so we gotta check for that case: - synthetic_graphemes[-grapheme_id-1].main_codepoint = length_prefixed[1]; - for (ucs4_t i = 0; i < utf32_len; i++) { - if (!__builtin_expect(uc_is_property_prepended_concatenation_mark(length_prefixed[1+i]), 0)) { - synthetic_graphemes[-grapheme_id-1].main_codepoint = length_prefixed[1+i]; - break; - } - } - - // Cleanup from unicode API: - if (u8 != u8_buf) free(u8); - - Table$set(&grapheme_ids_by_codepoints, &codepoint_copy, &grapheme_id, &GraphemeIDLookupTableInfo); - - last_grapheme = grapheme_id; - return grapheme_id; -} - -PUREFUNC static inline int64_t num_subtexts(Text_t t) -{ - if (t.tag != TEXT_SUBTEXT) return 1; - int64_t len = t.length; - int64_t n = 0; - while (len > 0) { - len -= t.subtexts[n].length; - ++n; - } - return n; -} - -int text_visualize(FILE *stream, Text_t t) -{ - switch (t.tag) { - case TEXT_SHORT_ASCII: return fprintf(stream, "%.*s", t.length, t.length, t.short_ascii); - case TEXT_ASCII: return fprintf(stream, "%.*s", t.length, t.length, t.ascii); - case TEXT_GRAPHEMES: case TEXT_SHORT_GRAPHEMES: { - int printed = 
fprintf(stream, "", t.length); - printed += Text$print(stream, t); - printed += fprintf(stream, ""); - return printed; - } - case TEXT_SUBTEXT: { - int printed = fprintf(stream, "", t.length); - int64_t to_print = t.length; - for (int i = 0; to_print > 0; ++i) { - printed += fprintf(stream, "\n "); - printed += text_visualize(stream, t.subtexts[i]); - to_print -= t.subtexts[i].length; - if (t.subtexts[i].length == 0) break; - } - printed += fprintf(stream, "\n"); - return printed; - } - default: return 0; - } -} - -public int Text$print(FILE *stream, Text_t t) -{ - if (t.length == 0) return 0; - - switch (t.tag) { - case TEXT_SHORT_ASCII: return fwrite(t.short_ascii, sizeof(char), (size_t)t.length, stream); - case TEXT_ASCII: return fwrite(t.ascii, sizeof(char), (size_t)t.length, stream); - case TEXT_GRAPHEMES: case TEXT_SHORT_GRAPHEMES: { - const int32_t *graphemes = t.tag == TEXT_SHORT_GRAPHEMES ? t.short_graphemes : t.graphemes; - int written = 0; - for (int64_t i = 0; i < t.length; i++) { - int32_t grapheme = graphemes[i]; - if (grapheme >= 0) { - uint8_t buf[8]; - size_t len = sizeof(buf); - uint8_t *u8 = u32_to_u8((ucs4_t*)&grapheme, 1, buf, &len); - written += (int)fwrite(u8, sizeof(char), len, stream); - if (u8 != buf) free(u8); - } else { - const uint8_t *u8 = GRAPHEME_UTF8(grapheme); - assert(u8); - written += (int)fwrite(u8, sizeof(uint8_t), strlen((char*)u8), stream); - } - } - return written; - } - case TEXT_SUBTEXT: { - int written = 0; - int i = 0; - for (int64_t to_print = t.length; to_print > 0; to_print -= t.subtexts[i].length, ++i) - written += Text$print(stream, t.subtexts[i]); - return written; - } - default: return 0; - } -} - -static bool is_concat_stable(Text_t a, Text_t b) -{ - if (a.length == 0 || b.length == 0) - return true; - - int32_t last_a = Text$get_grapheme(a, a.length-1); - int32_t first_b = Text$get_grapheme(b, 0); - - // Synthetic graphemes are weird and probably need to check with normalization: - if (last_a < 0 || first_b < 0) 
- return 0; - - // Magic number, we know that no codepoints below here trigger instability: - static const int32_t LOWEST_CODEPOINT_TO_CHECK = 0x300; - if (last_a < LOWEST_CODEPOINT_TO_CHECK && first_b < LOWEST_CODEPOINT_TO_CHECK) - return true; - - // Do a normalization run for these two codepoints and see if it looks different: - ucs4_t codepoints[2] = {(ucs4_t)last_a, (ucs4_t)first_b}; - ucs4_t norm_buf[3*2]; // Normalization should not exceed 3x in the input length - size_t norm_length = sizeof(norm_buf)/sizeof(norm_buf[0]); - ucs4_t *normalized = u32_normalize(UNINORM_NFC, codepoints, 2, norm_buf, &norm_length); - if (norm_length != 2) { - // Looks like these two codepoints merged into one (or maybe had a child, who knows?) - if (normalized != norm_buf) free(normalized); - return false; - } - - // If there's still two codepoints, we might end up with a single grapheme - // cluster which will need to turn into a synthetic grapheme: - const void *second_grapheme = u32_grapheme_next(normalized, &normalized[2]); - if (normalized != norm_buf) free(normalized); - return (second_grapheme == &normalized[1]); -} - -static Text_t concat2_assuming_safe(Text_t a, Text_t b) -{ - if (a.length == 0) return b; - if (b.length == 0) return a; - - if (a.tag == TEXT_SUBTEXT && b.tag == TEXT_SUBTEXT) { - int64_t na = num_subtexts(a); - int64_t nb = num_subtexts(b); - Text_t ret = { - .length=a.length + b.length, - .tag=TEXT_SUBTEXT, - .subtexts=GC_MALLOC(sizeof(Text_t[na + nb])), - }; - memcpy(&ret.subtexts[0], a.subtexts, sizeof(Text_t[na])); - memcpy(&ret.subtexts[na], b.subtexts, sizeof(Text_t[nb])); - return ret; - } else if (a.tag == TEXT_SUBTEXT) { - int64_t n = num_subtexts(a); - Text_t ret = { - .length=a.length + b.length, - .tag=TEXT_SUBTEXT, - .subtexts=GC_MALLOC(sizeof(Text_t[n + 1])), - }; - memcpy(ret.subtexts, a.subtexts, sizeof(Text_t[n])); - ret.subtexts[n] = b; - return ret; - } else if (b.tag == TEXT_SUBTEXT) { - int64_t n = num_subtexts(b); - Text_t ret = { - 
.length=a.length + b.length, - .tag=TEXT_SUBTEXT, - .subtexts=GC_MALLOC(sizeof(Text_t[n + 1])), - }; - ret.subtexts[0] = a; - memcpy(&ret.subtexts[1], b.subtexts, sizeof(Text_t[n])); - return ret; - } else { - Text_t ret = { - .length=a.length + b.length, - .tag=TEXT_SUBTEXT, - .subtexts=GC_MALLOC(sizeof(Text_t[2])), - }; - ret.subtexts[0] = a; - ret.subtexts[1] = b; - return ret; - } -} - -static Text_t concat2(Text_t a, Text_t b) -{ - if (a.length == 0) return b; - if (b.length == 0) return a; - - if (__builtin_expect(is_concat_stable(a, b), 1)) - return concat2_assuming_safe(a, b); - - // Do full normalization of the last/first characters - int32_t last_a = Text$get_grapheme(a, a.length-1); - int32_t first_b = Text$get_grapheme(b, 0); - - size_t utf32_len = (last_a >= 0 ? 1 : NUM_GRAPHEME_CODEPOINTS(last_a)) + (first_b >= 0 ? 1 : NUM_GRAPHEME_CODEPOINTS(first_b)); - ucs4_t join_graphemes[utf32_len] = {}; - ucs4_t *p = &join_graphemes[0]; - if (last_a < 0) p = mempcpy(p, GRAPHEME_CODEPOINTS(last_a), NUM_GRAPHEME_CODEPOINTS(last_a)); - else *(p++) = (ucs4_t)last_a; - if (first_b < 0) p = mempcpy(p, GRAPHEME_CODEPOINTS(first_b), NUM_GRAPHEME_CODEPOINTS(first_b)); - else *(p++) = (ucs4_t)first_b; - - Text_t glue = text_from_u32(join_graphemes, (int64_t)utf32_len, true); - - if (a.length == 1 && b.length == 1) - return glue; - else if (a.length == 1) - return concat2_assuming_safe(glue, Text$slice(b, I(2), I(b.length))); - else if (b.length == 1) - return concat2_assuming_safe(Text$slice(a, I(1), I(a.length-1)), glue); - else - return concat2_assuming_safe( - concat2_assuming_safe(Text$slice(a, I(1), I(a.length-1)), glue), - b); -} - -public Text_t Text$_concat(int n, Text_t items[n]) -{ - if (n == 0) return (Text_t){.length=0}; - if (n == 1) return items[0]; - if (n == 2) return concat2(items[0], items[1]); - - int64_t len = 0, subtexts = 0; - for (int i = 0; i < n; i++) { - len += items[i].length; - if (items[i].length > 0) - subtexts += num_subtexts(items[i]); - } 
- - Text_t ret = { - .length=0, - .tag=TEXT_SUBTEXT, - .subtexts=GC_MALLOC(sizeof(Text_t[len])), - }; - int64_t sub_i = 0; - for (int i = 0; i < n; i++) { - if (items[i].length == 0) - continue; - - if (i > 0 && !__builtin_expect(is_concat_stable(items[i-1], items[i]), 1)) { - // Oops, guess this wasn't stable for concatenation, let's break it - // up into subtasks: - return concat2(ret, Text$_concat(n-i, &items[i])); - } - - if (items[i].tag == TEXT_SUBTEXT) { - for (int64_t j = 0, remainder = items[i].length; remainder > 0; j++) { - ret.subtexts[sub_i++] = items[i].subtexts[j]; - remainder -= items[i].subtexts[j].length; - } - } else { - ret.subtexts[sub_i++] = items[i]; - } - ret.length += items[i].length; - } - return ret; -} - -public Text_t Text$repeat(Text_t text, Int_t count) -{ - if (text.length == 0 || Int$is_negative(count)) - return Text(""); - - Int_t result_len = Int$times(count, I(text.length)); - if (Int$compare_value(result_len, I(1l<<40)) > 0) - fail("Text repeating would produce too big of an result!"); - - int64_t count64 = Int_to_Int64(count, false); - if (text.tag == TEXT_SUBTEXT) { - int64_t subtexts = num_subtexts(text); - Text_t ret = { - .length=text.length * count64, - .tag=TEXT_SUBTEXT, - .subtexts=GC_MALLOC(sizeof(Text_t[subtexts * count64])), - }; - for (int64_t c = 0; c < count64; c++) { - for (int64_t i = 0; i < subtexts; i++) { - if (text.subtexts[i].length > 0) - ret.subtexts[c*subtexts + i] = text.subtexts[i]; - } - } - return ret; - } else { - Text_t ret = { - .length=text.length * count64, - .tag=TEXT_SUBTEXT, - .subtexts=GC_MALLOC(sizeof(Text_t[count64])), - }; - for (int64_t i = 0; i < count64; i++) - ret.subtexts[i] = text; - return ret; - } -} - -public Text_t Text$slice(Text_t text, Int_t first_int, Int_t last_int) -{ - int64_t first = Int_to_Int64(first_int, false); - int64_t last = Int_to_Int64(last_int, false); - if (first == 0) fail("Invalid index: 0"); - if (last == 0) return (Text_t){.length=0}; - - if (first < 0) 
first = text.length + first + 1; - if (last < 0) last = text.length + last + 1; - - if (last > text.length) last = text.length; - - if (first > text.length || last < first) - return (Text_t){.length=0}; - - if (first == 1 && last == text.length) - return text; - - switch (text.tag) { - case TEXT_SHORT_ASCII: { - Text_t ret = (Text_t) { - .tag=TEXT_SHORT_ASCII, - .length=last - first + 1, - }; - memcpy(ret.short_ascii, text.short_ascii + (first-1), (size_t)ret.length); - return ret; - } - case TEXT_ASCII: { - Text_t ret = { - .tag=TEXT_ASCII, - .length=last - first + 1, - .ascii=text.ascii + (first-1), - }; - return ret; - } - case TEXT_SHORT_GRAPHEMES: { - assert((first == 1 && last == 1) || (first == 2 && last == 2)); - Text_t ret = { - .tag=TEXT_SHORT_GRAPHEMES, - .length=1, - .short_graphemes={text.short_graphemes[first-1]}, - }; - return ret; - } - case TEXT_GRAPHEMES: { - Text_t ret = { - .tag=TEXT_GRAPHEMES, - .length=last - first + 1, - .graphemes=text.graphemes + (first-1), - }; - return ret; - } - case TEXT_SUBTEXT: { - Text_t *subtexts = text.subtexts; - while (first > subtexts[0].length) { - first -= subtexts[0].length; - last -= subtexts[0].length; - ++subtexts; - } - - int64_t needed_len = (last - first) + 1; - int64_t num_subtexts = 0; - for (int64_t included = 0; included < needed_len; ) { - if (included == 0) - included += subtexts[num_subtexts].length - first + 1; - else - included += subtexts[num_subtexts].length; - num_subtexts += 1; - } - if (num_subtexts == 1) - return Text$slice(subtexts[0], I(first), I(last)); - - Text_t ret = { - .length=needed_len, - .tag=TEXT_SUBTEXT, - .subtexts=GC_MALLOC(sizeof(Text_t[num_subtexts])), - }; - for (int64_t i = 0; i < num_subtexts; i++) { - ret.subtexts[i] = Text$slice(subtexts[i], I(first), I(last)); - first = 1; - needed_len -= ret.subtexts[i].length; - last = first + needed_len - 1; - } - return ret; - } - default: errx(1, "Invalid tag"); - } -} - -Text_t text_from_u32(ucs4_t *codepoints, int64_t 
num_codepoints, bool normalize) -{ - // Normalization is apparently guaranteed to never exceed 3x in the input length - ucs4_t norm_buf[MIN(256, 3*num_codepoints)]; - if (normalize) { - size_t norm_length = sizeof(norm_buf)/sizeof(norm_buf[0]); - ucs4_t *normalized = u32_normalize(UNINORM_NFC, codepoints, (size_t)num_codepoints, norm_buf, &norm_length); - codepoints = normalized; - num_codepoints = (int64_t)norm_length; - } - - // char breaks[num_codepoints]; - // u32_grapheme_breaks(codepoints, num_codepoints, breaks); - - Text_t ret = { - .length=0, - .tag=TEXT_SHORT_GRAPHEMES, - }; - const ucs4_t *src = codepoints; - int32_t *graphemes = ret.short_graphemes; - while (src < &codepoints[num_codepoints]) { - if (ret.tag == TEXT_SHORT_GRAPHEMES && ret.length + 1 > 2) { - graphemes = GC_MALLOC_ATOMIC(sizeof(int32_t[num_codepoints])); // May be a slight overallocation - graphemes[0] = ret.short_graphemes[0]; - graphemes[1] = ret.short_graphemes[1]; - ret.tag = TEXT_GRAPHEMES; - ret.graphemes = graphemes; - } - - // TODO: use grapheme breaks instead of u32_grapheme_next() - const ucs4_t *next = u32_grapheme_next(src, &codepoints[num_codepoints]); - if (next == &src[1]) { - graphemes[ret.length] = (int32_t)*src; - } else { - // Synthetic grapheme - graphemes[ret.length] = get_synthetic_grapheme(src, next-src); - } - ++ret.length; - src = next; - } - if (normalize && codepoints != norm_buf) free(codepoints); - return ret; -} - -public Text_t Text$from_strn(const char *str, size_t len) -{ - int64_t ascii_span = 0; - for (size_t i = 0; i < len && isascii(str[i]); i++) - ascii_span++; - - if (ascii_span == (int64_t)len) { // All ASCII - Text_t ret = {.length=ascii_span}; - if (ascii_span <= 8) { - ret.tag = TEXT_SHORT_ASCII; - for (int64_t i = 0; i < ascii_span; i++) - ret.short_ascii[i] = str[i]; - } else { - ret.tag = TEXT_ASCII; - ret.ascii = str; - } - return ret; - } else { - if (u8_check((uint8_t*)str, len) != NULL) - return Text(""); - - ucs4_t buf[128]; - size_t 
length = sizeof(buf)/sizeof(buf[0]); - - ucs4_t *codepoints = u8_to_u32((uint8_t*)str, (size_t)ascii_span + strlen(str + ascii_span), buf, &length); - Text_t ret = text_from_u32(codepoints, (int64_t)length, true); - if (codepoints != buf) free(codepoints); - return ret; - } -} - -public Text_t Text$from_str(const char *str) -{ - return str ? Text$from_strn(str, strlen(str)) : Text(""); -} - -static void u8_buf_append(Text_t text, char **buf, int64_t *capacity, int64_t *i) -{ - switch (text.tag) { - case TEXT_ASCII: case TEXT_SHORT_ASCII: { - if (*i + text.length > (int64_t)*capacity) { - *capacity = *i + text.length + 1; - *buf = GC_REALLOC(*buf, (size_t)*capacity); - } - - const char *bytes = text.tag == TEXT_ASCII ? text.ascii : text.short_ascii; - memcpy(*buf + *i, bytes, (size_t)text.length); - *i += text.length; - break; - } - case TEXT_GRAPHEMES: case TEXT_SHORT_GRAPHEMES: { - const int32_t *graphemes = text.tag == TEXT_GRAPHEMES ? text.graphemes : text.short_graphemes; - for (int64_t g = 0; g < text.length; g++) { - if (graphemes[g] >= 0) { - uint8_t u8_buf[64]; - size_t u8_len = sizeof(u8_buf); - uint8_t *u8 = u32_to_u8((ucs4_t*)&graphemes[g], 1, u8_buf, &u8_len); - - if (*i + (int64_t)u8_len > (int64_t)*capacity) { - *capacity = *i + (int64_t)u8_len + 1; - *buf = GC_REALLOC(*buf, (size_t)*capacity); - } - - memcpy(*buf + *i, u8, u8_len); - *i += (int64_t)u8_len; - if (u8 != u8_buf) free(u8); - } else { - const uint8_t *u8 = GRAPHEME_UTF8(graphemes[g]); - size_t u8_len = u8_strlen(u8); - if (*i + (int64_t)u8_len > (int64_t)*capacity) { - *capacity = *i + (int64_t)u8_len + 1; - *buf = GC_REALLOC(*buf, (size_t)*capacity); - } - - memcpy(*buf + *i, u8, u8_len); - *i += (int64_t)u8_len; - } - } - break; - } - case TEXT_SUBTEXT: { - for (int64_t s = 0, remaining = text.length; remaining > 0; s++) { - u8_buf_append(text.subtexts[s], buf, capacity, i); - remaining -= text.subtexts[s].length; - } - break; - } - default: break; - } -} - -public char 
*Text$as_c_string(Text_t text) -{ - int64_t capacity = text.length + 1; - char *buf = GC_MALLOC_ATOMIC((size_t)capacity); - int64_t i = 0; - u8_buf_append(text, &buf, &capacity, &i); - - if (i + 1 > (int64_t)capacity) { - capacity = i + 1; - buf = GC_REALLOC(buf, (size_t)capacity); - } - buf[i] = '\0'; - return buf; -} - -PUREFUNC public uint64_t Text$hash(Text_t *text) -{ - if (text->hash != 0) return text->hash; - siphash sh; - siphashinit(&sh, sizeof(int32_t[text->length])); - - union { - int32_t chunks[2]; - uint64_t whole; - } tmp; - switch (text->tag) { - case TEXT_ASCII: case TEXT_SHORT_ASCII: { - const char *bytes = text->tag == TEXT_ASCII ? text->ascii : text->short_ascii; - for (int64_t i = 0; i + 1 < text->length; i++) { - tmp.chunks[0] = (int32_t)bytes[i]; - tmp.chunks[1] = (int32_t)bytes[i+1]; - siphashadd64bits(&sh, tmp.whole); - } - int32_t last = text->length & 0x1 ? (int32_t)bytes[text->length-1] : 0; // Odd number of graphemes - text->hash = siphashfinish_last_part(&sh, (uint64_t)last); - break; - } - case TEXT_GRAPHEMES: { - const int32_t *graphemes = text->graphemes; - for (int64_t i = 0; i + 1 < text->length; i++) { - tmp.chunks[0] = graphemes[i]; - tmp.chunks[1] = graphemes[i]; - siphashadd64bits(&sh, tmp.whole); - } - int32_t last = text->length & 0x1 ? graphemes[text->length-1] : 0; // Odd number of graphemes - text->hash = siphashfinish_last_part(&sh, (uint64_t)last); - break; - } - case TEXT_SHORT_GRAPHEMES: { - tmp.chunks[0] = text->short_graphemes[0]; - if (text->length > 1) - tmp.chunks[1] = text->short_graphemes[1]; - text->hash = siphashfinish_last_part(&sh, (uint64_t)tmp.whole); - break; - } - case TEXT_SUBTEXT: { - int32_t leftover = 0; - for (int64_t sub_i = 0, to_hash = text->length; to_hash > 0; ) { - Text_t subtext = text->subtexts[sub_i]; - if (subtext.tag == TEXT_ASCII || subtext.tag == TEXT_SHORT_ASCII) { - const char *bytes = subtext.tag == TEXT_ASCII ? 
subtext.ascii : subtext.short_ascii; - int64_t grapheme = 0; - if (leftover) { - tmp.chunks[0] = leftover; - tmp.chunks[1] = (int32_t)bytes[0]; - siphashadd64bits(&sh, tmp.whole); - grapheme += 1; - } - for (; grapheme + 1 < subtext.length; grapheme += 2) { - tmp.chunks[0] = (int32_t)bytes[grapheme]; - tmp.chunks[1] = (int32_t)bytes[grapheme+1]; - siphashadd64bits(&sh, tmp.whole); - } - leftover = grapheme < subtext.length ? (int32_t)bytes[grapheme] : 0; - } else if (subtext.tag == TEXT_SHORT_GRAPHEMES) { - if (leftover) { - tmp.chunks[0] = leftover; - tmp.chunks[1] = subtext.short_graphemes[0]; - siphashadd64bits(&sh, tmp.whole); - leftover = subtext.length > 1 ? subtext.short_graphemes[1] : 0; - } else if (subtext.length == 1) { - leftover = subtext.short_graphemes[0]; - } else { - tmp.chunks[0] = subtext.short_graphemes[0]; - tmp.chunks[1] = subtext.short_graphemes[1]; - siphashadd64bits(&sh, tmp.whole); - } - } else if (subtext.tag == TEXT_GRAPHEMES) { - const int32_t *graphemes = subtext.graphemes; - int64_t grapheme = 0; - if (leftover) { - tmp.chunks[0] = leftover; - tmp.chunks[1] = graphemes[0]; - siphashadd64bits(&sh, tmp.whole); - grapheme += 1; - } - for (; grapheme + 1 < subtext.length; grapheme += 2) { - tmp.chunks[0] = graphemes[grapheme]; - tmp.chunks[1] = graphemes[grapheme+1]; - siphashadd64bits(&sh, tmp.whole); - } - leftover = grapheme < subtext.length ? graphemes[grapheme] : 0; - } - - to_hash -= text->subtexts[sub_i].length; - - ++sub_i; - } - - text->hash = siphashfinish_last_part(&sh, (uint64_t)leftover); - break; - } - default: errx(1, "Invalid text"); - } - - if (text->hash == 0) - text->hash = 1; - - return text->hash; -} - -public int32_t Text$get_grapheme_fast(Text_t text, TextIter_t *state, int64_t index) -{ - switch (text.tag) { - case TEXT_ASCII: return index < text.length ? (int32_t)text.ascii[index] : 0; - case TEXT_SHORT_ASCII: return index < text.length ? 
(int32_t)text.short_ascii[index] : 0; - case TEXT_GRAPHEMES: return index < text.length ? text.graphemes[index] : 0; - case TEXT_SHORT_GRAPHEMES: return index < text.length ? text.short_graphemes[index] : 0; - case TEXT_SUBTEXT: { - TextIter_t backup_state = {0, 0}; - if (!state) state = &backup_state; - - if (index < 0 || index >= text.length) - return 0; - - while (index < state->sum_of_previous_subtexts && state->subtext > 0) { - state->sum_of_previous_subtexts -= text.subtexts[state->subtext].length; - state->subtext -= 1; - } - for (;;) { - if (index < state->sum_of_previous_subtexts + text.subtexts[state->subtext].length) - return Text$get_grapheme_fast(text.subtexts[state->subtext], NULL, index - state->sum_of_previous_subtexts); - state->sum_of_previous_subtexts += text.subtexts[state->subtext].length; - state->subtext += 1; - } - return 0; - } - default: errx(1, "Invalid text"); - } - return 0; -} - -public ucs4_t Text$get_main_grapheme_fast(Text_t text, TextIter_t *state, int64_t index) -{ - return MAIN_GRAPHEME_CODEPOINT(Text$get_grapheme_fast(text, state, index)); -} - -PUREFUNC public int32_t Text$compare(const Text_t *a, const Text_t *b) -{ - if (a == b) return 0; - - int64_t len = MAX(a->length, b->length); - TextIter_t a_state = {0, 0}, b_state = {0, 0}; - for (int64_t i = 0; i < len; i++) { - int32_t ai = Text$get_grapheme_fast(*a, &a_state, i); - int32_t bi = Text$get_grapheme_fast(*b, &b_state, i); - if (ai == bi) continue; - int32_t cmp; - if (ai > 0 && bi > 0) { - cmp = u32_cmp((ucs4_t*)&ai, (ucs4_t*)&bi, 1); - } else if (ai > 0) { - cmp = u32_cmp2( - (ucs4_t*)&ai, 1, - GRAPHEME_CODEPOINTS(bi), - NUM_GRAPHEME_CODEPOINTS(bi)); - } else if (bi > 0) { - cmp = u32_cmp2( - GRAPHEME_CODEPOINTS(ai), - NUM_GRAPHEME_CODEPOINTS(ai), - (ucs4_t*)&bi, 1); - } else { - cmp = u32_cmp2( - GRAPHEME_CODEPOINTS(ai), - NUM_GRAPHEME_CODEPOINTS(ai), - GRAPHEME_CODEPOINTS(bi), - NUM_GRAPHEME_CODEPOINTS(bi)); - } - if (cmp != 0) return cmp; - } - return 0; -} - 
-PUREFUNC public bool Text$starts_with(Text_t text, Text_t prefix) -{ - if (text.length < prefix.length) - return false; - TextIter_t text_state = {0, 0}, prefix_state = {0, 0}; - for (int64_t i = 0; i < prefix.length; i++) { - int32_t text_i = Text$get_grapheme_fast(text, &text_state, i); - int32_t prefix_i = Text$get_grapheme_fast(prefix, &prefix_state, i); - if (text_i != prefix_i) return false; - } - return true; -} - -PUREFUNC public bool Text$ends_with(Text_t text, Text_t suffix) -{ - if (text.length < suffix.length) - return false; - TextIter_t text_state = {0, 0}, prefix_state = {0, 0}; - for (int64_t i = 0; i < suffix.length; i++) { - int32_t text_i = Text$get_grapheme_fast(text, &text_state, text.length - suffix.length + i); - int32_t suffix_i = Text$get_grapheme_fast(suffix, &prefix_state, i); - if (text_i != suffix_i) return false; - } - return true; -} - -PUREFUNC public bool Text$equal_values(Text_t a, Text_t b) -{ - if (a.length != b.length || (a.hash != 0 && b.hash != 0 && a.hash != b.hash)) - return false; - int64_t len = a.length; - TextIter_t a_state = {0, 0}, b_state = {0, 0}; - for (int64_t i = 0; i < len; i++) { - int32_t ai = Text$get_grapheme_fast(a, &a_state, i); - int32_t bi = Text$get_grapheme_fast(b, &b_state, i); - if (ai != bi) return false; - } - return true; -} - -PUREFUNC public bool Text$equal(const Text_t *a, const Text_t *b) -{ - if (a == b) return true; - return Text$equal_values(*a, *b); -} - -PUREFUNC public bool Text$equal_ignoring_case(Text_t a, Text_t b) -{ - if (a.length != b.length) - return false; - int64_t len = a.length; - TextIter_t a_state = {0, 0}, b_state = {0, 0}; - const char *language = uc_locale_language(); - for (int64_t i = 0; i < len; i++) { - int32_t ai = Text$get_grapheme_fast(a, &a_state, i); - int32_t bi = Text$get_grapheme_fast(b, &b_state, i); - if (ai != bi) { - const ucs4_t *a_codepoints = ai >= 0 ? (ucs4_t*)&ai : GRAPHEME_CODEPOINTS(ai); - int64_t a_len = ai >= 0 ? 
1 : NUM_GRAPHEME_CODEPOINTS(ai); - - const ucs4_t *b_codepoints = bi >= 0 ? (ucs4_t*)&bi : GRAPHEME_CODEPOINTS(bi); - int64_t b_len = bi >= 0 ? 1 : NUM_GRAPHEME_CODEPOINTS(bi); - - int cmp = 0; - (void)u32_casecmp(a_codepoints, (size_t)a_len, b_codepoints, (size_t)b_len, language, UNINORM_NFC, &cmp); - if (cmp != 0) - return false; - } - } - return true; -} - -public Text_t Text$upper(Text_t text) -{ - if (text.length == 0) return text; - Array_t codepoints = Text$utf32_codepoints(text); - const char *language = uc_locale_language(); - ucs4_t buf[128]; - size_t out_len = sizeof(buf)/sizeof(buf[0]); - ucs4_t *upper = u32_toupper(codepoints.data, (size_t)codepoints.length, language, UNINORM_NFC, buf, &out_len); - Text_t ret = text_from_u32(upper, (int64_t)out_len, false); - if (upper != buf) free(upper); - return ret; -} - -public Text_t Text$lower(Text_t text) -{ - if (text.length == 0) return text; - Array_t codepoints = Text$utf32_codepoints(text); - const char *language = uc_locale_language(); - ucs4_t buf[128]; - size_t out_len = sizeof(buf)/sizeof(buf[0]); - ucs4_t *lower = u32_tolower(codepoints.data, (size_t)codepoints.length, language, UNINORM_NFC, buf, &out_len); - Text_t ret = text_from_u32(lower, (int64_t)out_len, false); - if (lower != buf) free(lower); - return ret; -} - -public Text_t Text$title(Text_t text) -{ - if (text.length == 0) return text; - Array_t codepoints = Text$utf32_codepoints(text); - const char *language = uc_locale_language(); - ucs4_t buf[128]; - size_t out_len = sizeof(buf)/sizeof(buf[0]); - ucs4_t *title = u32_totitle(codepoints.data, (size_t)codepoints.length, language, UNINORM_NFC, buf, &out_len); - Text_t ret = text_from_u32(title, (int64_t)out_len, false); - if (title != buf) free(title); - return ret; -} - -public int printf_text_size(const struct printf_info *info, size_t n, int argtypes[n], int sizes[n]) -{ - if (n < 1) return -1; - (void)info; - argtypes[0] = PA_POINTER; - sizes[0] = sizeof(Text_t*); - return 1; -} - 
-public int printf_text(FILE *stream, const struct printf_info *info, const void *const args[]) -{ - Text_t t = **(Text_t**)args[0]; - if (info->alt) - return text_visualize(stream, t); - else - return Text$print(stream, t); -} - -static inline Text_t _quoted(Text_t text, bool colorize, char quote_char) -{ - // TODO: optimize for ASCII and short strings - Array_t graphemes = {.atomic=1}; -#define add_char(c) Array$insert_value(&graphemes, (ucs4_t)c, I_small(0), sizeof(ucs4_t)) -#define add_str(s) ({ for (const char *_c = s; *_c; ++_c) Array$insert_value(&graphemes, (ucs4_t)*_c, I_small(0), sizeof(ucs4_t)); }) - if (colorize) - add_str("\x1b[35m"); - if (quote_char != '"' && quote_char != '\'' && quote_char != '`') - add_char('$'); - add_char(quote_char); - -#define add_escaped(str) ({ if (colorize) add_str("\x1b[34;1m"); add_char('\\'); add_str(str); if (colorize) add_str("\x1b[0;35m"); }) - TextIter_t state = {0, 0}; - for (int64_t i = 0; i < text.length; i++) { - int32_t g = Text$get_grapheme_fast(text, &state, i); - switch (g) { - case '\a': add_escaped("a"); break; - case '\b': add_escaped("b"); break; - case '\x1b': add_escaped("e"); break; - case '\f': add_escaped("f"); break; - case '\n': add_escaped("n"); break; - case '\r': add_escaped("r"); break; - case '\t': add_escaped("t"); break; - case '\v': add_escaped("v"); break; - case '\\': add_escaped("\\"); break; - case '\x00' ... '\x06': case '\x0E' ... '\x1A': - case '\x1C' ... '\x1F': case '\x7F' ... 
'\x7F': { - if (colorize) add_str("\x1b[34;1m"); - add_char('\\'); - add_char('x'); - char tmp[4]; - sprintf(tmp, "%02X", g); - add_str(tmp); - if (colorize) - add_str("\x1b[0;35m"); - break; - } - default: { - if (g == quote_char) - add_escaped(((char[2]){quote_char, 0})); - else - add_char(g); - break; - } - } - } - - add_char(quote_char); - if (colorize) - add_str("\x1b[m"); - - return (Text_t){.length=graphemes.length, .tag=TEXT_GRAPHEMES, .graphemes=graphemes.data}; -#undef add_str -#undef add_char -#undef add_escaped -} - -public Text_t Text$as_text(const void *text, bool colorize, const TypeInfo *info) -{ - (void)info; - if (info->TextInfo.lang && streq(info->TextInfo.lang, "Path")) { - if (!text) return Text("Path"); - return Text$format("(%s%k%s)", colorize ? "\x1b[35m" : "", text, colorize ? "\x1b[m" : ""); - } - - if (!text) return info && info->TextInfo.lang ? Text$from_str(info->TextInfo.lang) : Text("Text"); - Text_t as_text = _quoted(*(Text_t*)text, colorize, info == &Pattern$info ? '/' : '"'); - if (info && info->TextInfo.lang && info != &Text$info && info != &Pattern$info) - as_text = Text$concat( - colorize ? Text("\x1b[1m$") : Text("$"), - Text$from_str(info->TextInfo.lang), - colorize ? Text("\x1b[0m") : Text(""), - as_text); - return as_text; -} - -public Text_t Text$quoted(Text_t text, bool colorize) -{ - return _quoted(text, colorize, '"'); -} - -public Text_t Text$join(Text_t glue, Array_t pieces) -{ - if (pieces.length == 0) return (Text_t){.length=0}; - - Text_t result = *(Text_t*)pieces.data; - for (int64_t i = 1; i < pieces.length; i++) { - result = Text$concat(result, glue, *(Text_t*)(pieces.data + i*pieces.stride)); - } - return result; -} - -__attribute__((format(printf, 1, 2))) -public Text_t Text$format(const char *fmt, ...) 
-{ - va_list args; - va_start(args, fmt); - - char buf[9]; - int len = vsnprintf(buf, sizeof(buf), fmt, args); - Text_t ret; - if (len <= 8) { - ret = (Text_t){ - .length=len, - .tag=TEXT_SHORT_ASCII, - }; - for (int i = 0; i < len; i++) - ret.short_ascii[i] = buf[i]; - } else { - char *str = GC_MALLOC_ATOMIC((size_t)(len+1)); - vsnprintf(str, (size_t)(len+1), fmt, args); - ret = Text$from_str(str); - } - va_end(args); - return ret; -} - -public Array_t Text$clusters(Text_t text) -{ - Array_t clusters = {.atomic=1}; - for (int64_t i = 1; i <= text.length; i++) { - Text_t cluster = Text$slice(text, I(i), I(i)); - Array$insert(&clusters, &cluster, I_small(0), sizeof(Text_t)); - } - return clusters; -} - -public Array_t Text$utf32_codepoints(Text_t text) -{ - Array_t codepoints = {.atomic=1}; - TextIter_t state = {0, 0}; - for (int64_t i = 0; i < text.length; i++) { - int32_t grapheme = Text$get_grapheme_fast(text, &state, i); - if (grapheme < 0) { - for (int64_t c = 0; c < NUM_GRAPHEME_CODEPOINTS(grapheme); c++) { - ucs4_t subg = GRAPHEME_CODEPOINTS(grapheme)[c]; - Array$insert(&codepoints, &subg, I_small(0), sizeof(ucs4_t)); - } - } else { - Array$insert(&codepoints, &grapheme, I_small(0), sizeof(ucs4_t)); - } - } - return codepoints; -} - -public Array_t Text$utf8_bytes(Text_t text) -{ - const char *str = Text$as_c_string(text); - return (Array_t){.length=strlen(str), .stride=1, .atomic=1, .data=(void*)str}; -} - -static inline const char *codepoint_name(ucs4_t c) -{ - char *name = GC_MALLOC_ATOMIC(UNINAME_MAX); - char *found_name = unicode_character_name(c, name); - if (found_name) return found_name; - const uc_block_t *block = uc_block(c); - assert(block); - snprintf(name, UNINAME_MAX, "%s-%X", block->name, c); - return name; -} - -public Array_t Text$codepoint_names(Text_t text) -{ - Array_t names = {}; - TextIter_t state = {0, 0}; - for (int64_t i = 0; i < text.length; i++) { - int32_t grapheme = Text$get_grapheme_fast(text, &state, i); - if (grapheme < 0) { - 
for (int64_t c = 0; c < NUM_GRAPHEME_CODEPOINTS(grapheme); c++) { - const char *name = codepoint_name(GRAPHEME_CODEPOINTS(grapheme)[c]); - Text_t name_text = (Text_t){.tag=TEXT_ASCII, .length=(int64_t)strlen(name), .ascii=name}; - Array$insert(&names, &name_text, I_small(0), sizeof(Text_t)); - } - } else { - const char *name = codepoint_name((ucs4_t)grapheme); - Text_t name_text = (Text_t){.tag=TEXT_ASCII, .length=(int64_t)strlen(name), .ascii=name}; - Array$insert(&names, &name_text, I_small(0), sizeof(Text_t)); - } - } - return names; -} - -public Text_t Text$from_codepoints(Array_t codepoints) -{ - if (codepoints.stride != sizeof(int32_t)) - Array$compact(&codepoints, sizeof(int32_t)); - - return text_from_u32(codepoints.data, codepoints.length, true); -} - -public Text_t Text$from_codepoint_names(Array_t codepoint_names) -{ - Array_t codepoints = {}; - for (int64_t i = 0; i < codepoint_names.length; i++) { - Text_t *name = ((Text_t*)(codepoint_names.data + i*codepoint_names.stride)); - const char *name_str = Text$as_c_string(*name); - ucs4_t codepoint = unicode_name_character(name_str); - if (codepoint != UNINAME_INVALID) - Array$insert(&codepoints, &codepoint, I_small(0), sizeof(ucs4_t)); - } - return Text$from_codepoints(codepoints); -} - -public Text_t Text$from_bytes(Array_t bytes) -{ - if (bytes.stride != sizeof(int8_t)) - Array$compact(&bytes, sizeof(int8_t)); - - int8_t nul = 0; - Array$insert(&bytes, &nul, I_small(0), sizeof(int8_t)); - return Text$from_str(bytes.data); -} - -public Array_t Text$lines(Text_t text) -{ - Array_t lines = {}; - TextIter_t state = {0, 0}; - for (int64_t i = 0, line_start = 0; i < text.length; i++) { - int32_t grapheme = Text$get_grapheme_fast(text, &state, i); - if (grapheme == '\r' && Text$get_grapheme_fast(text, &state, i + 1) == '\n') { // CRLF - Text_t line = Text$slice(text, I(line_start+1), I(i)); - Array$insert(&lines, &line, I_small(0), sizeof(Text_t)); - i += 1; // skip one extra for CR - line_start = i + 1; - } 
else if (grapheme == '\n') { // newline - Text_t line = Text$slice(text, I(line_start+1), I(i)); - Array$insert(&lines, &line, I_small(0), sizeof(Text_t)); - line_start = i + 1; - } else if (i == text.length-1 && line_start != i) { // last line - Text_t line = Text$slice(text, I(line_start+1), I(i+1)); - Array$insert(&lines, &line, I_small(0), sizeof(Text_t)); - } - } - return lines; -} - -public const TypeInfo Text$info = { - .size=sizeof(Text_t), - .align=__alignof__(Text_t), - .tag=TextInfo, - .TextInfo={.lang="Text"}, -}; - -public Pattern_t Pattern$escape_text(Text_t text) -{ - // TODO: optimize for ASCII and short strings - Array_t graphemes = {.atomic=1}; -#define add_char(c) Array$insert_value(&graphemes, (ucs4_t)c, I_small(0), sizeof(ucs4_t)) -#define add_str(s) ({ for (const char *_c = s; *_c; ++_c) Array$insert_value(&graphemes, (ucs4_t)*_c, I_small(0), sizeof(ucs4_t)); }) - TextIter_t state = {0, 0}; - for (int64_t i = 0; i < text.length; i++) { - int32_t g = Text$get_grapheme_fast(text, &state, i); - ucs4_t g0 = g < 0 ? GRAPHEME_CODEPOINTS(g)[0] : (ucs4_t)g; - - if (g == '{') { - add_str("{1{}"); - } else if (g0 == '?' 
- || uc_is_property_quotation_mark(g0) - || (uc_is_property_paired_punctuation(g0) && uc_is_property_left_of_pair(g0))) { - add_char('{'); - add_char('1'); - add_char(g); - add_char('}'); - } else { - add_char(g); - } - } - return (Text_t){.length=graphemes.length, .tag=TEXT_GRAPHEMES, .graphemes=graphemes.data}; -#undef add_str -#undef add_char -#undef add_escaped -} - -public const TypeInfo Pattern$info = { - .size=sizeof(Pattern_t), - .align=__alignof__(Pattern_t), - .tag=TextInfo, - .TextInfo={.lang="Pattern"}, -}; - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/text.h b/builtins/text.h deleted file mode 100644 index 841d51fe..00000000 --- a/builtins/text.h +++ /dev/null @@ -1,67 +0,0 @@ -#pragma once - -// Type info and methods for Text datatype, which uses a struct inspired by -// Raku's string representation and libunistr - -#include -#include -#include -#include - -#include "datatypes.h" -#include "integers.h" - -typedef struct { - int64_t subtext, sum_of_previous_subtexts; -} TextIter_t; - -int printf_text(FILE *stream, const struct printf_info *info, const void *const args[]); -int printf_text_size(const struct printf_info *info, size_t n, int argtypes[n], int sizes[n]); - -#define Text(str) ((Text_t){.length=sizeof(str)-1, .tag=TEXT_ASCII, .ascii="" str}) - -int Text$print(FILE *stream, Text_t t); -void Text$visualize(Text_t t); -Text_t Text$_concat(int n, Text_t items[n]); -#define Text$concat(...) Text$_concat(sizeof((Text_t[]){__VA_ARGS__})/sizeof(Text_t), (Text_t[]){__VA_ARGS__}) -#define Texts(...) 
Text$concat(__VA_ARGS__) -Text_t Text$slice(Text_t text, Int_t first_int, Int_t last_int); -Text_t Text$from_str(const char *str); -Text_t Text$from_strn(const char *str, size_t len); -PUREFUNC uint64_t Text$hash(Text_t *text); -PUREFUNC int32_t Text$compare(const Text_t *a, const Text_t *b); -PUREFUNC bool Text$equal(const Text_t *a, const Text_t *b); -PUREFUNC bool Text$equal_values(Text_t a, Text_t b); -PUREFUNC bool Text$equal_ignoring_case(Text_t a, Text_t b); -Text_t Text$upper(Text_t text); -Text_t Text$lower(Text_t text); -Text_t Text$title(Text_t text); -Text_t Text$as_text(const void *text, bool colorize, const TypeInfo *info); -Text_t Text$quoted(Text_t str, bool colorize); -PUREFUNC bool Text$starts_with(Text_t text, Text_t prefix); -PUREFUNC bool Text$ends_with(Text_t text, Text_t suffix); -char *Text$as_c_string(Text_t text); -__attribute__((format(printf, 1, 2))) -public Text_t Text$format(const char *fmt, ...); -Array_t Text$clusters(Text_t text); -Array_t Text$utf32_codepoints(Text_t text); -Array_t Text$utf8_bytes(Text_t text); -Array_t Text$codepoint_names(Text_t text); -Text_t Text$from_codepoints(Array_t codepoints); -Text_t Text$from_codepoint_names(Array_t codepoint_names); -Text_t Text$from_bytes(Array_t bytes); -Array_t Text$lines(Text_t text); -Text_t Text$join(Text_t glue, Array_t pieces); -Text_t Text$repeat(Text_t text, Int_t count); -int32_t Text$get_grapheme_fast(Text_t text, TextIter_t *state, int64_t index); -ucs4_t Text$get_main_grapheme_fast(Text_t text, TextIter_t *state, int64_t index); - -static inline int32_t Text$get_grapheme(Text_t text, int64_t index) -{ - TextIter_t state = {0, 0}; - return Text$get_grapheme_fast(text, &state, index); -} - -extern const TypeInfo Text$info; - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/threads.c b/builtins/threads.c deleted file mode 100644 index 74e73832..00000000 --- a/builtins/threads.c +++ /dev/null @@ -1,55 +0,0 @@ -// Logic for the Thread type, representing a 
pthread - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "arrays.h" -#include "text.h" -#include "types.h" -#include "util.h" - -public pthread_t *Thread$new(Closure_t fn) -{ - pthread_t *thread = new(pthread_t); - pthread_create(thread, NULL, fn.fn, fn.userdata); - return thread; -} - -public void Thread$join(pthread_t *thread) -{ - pthread_join(*thread, NULL); -} - -public void Thread$cancel(pthread_t *thread) -{ - pthread_cancel(*thread); -} - -public void Thread$detach(pthread_t *thread) -{ - pthread_detach(*thread); -} - -Text_t Thread$as_text(const pthread_t **thread, bool colorize, const TypeInfo *type) -{ - (void)type; - if (!thread) { - return colorize ? Text("\x1b[34;1mThread\x1b[m") : Text("Thread"); - } - return Text$format(colorize ? "\x1b[34;1mThread(%p)\x1b[m" : "Thread(%p)", *thread); -} - -public const TypeInfo Thread = { - .size=sizeof(pthread_t*), .align=__alignof(pthread_t*), - .tag=CustomInfo, - .CustomInfo={.as_text=(void*)Thread$as_text}, -}; - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/threads.h b/builtins/threads.h deleted file mode 100644 index 52091677..00000000 --- a/builtins/threads.h +++ /dev/null @@ -1,20 +0,0 @@ -#pragma once - -// Logic for the Thread type, representing a pthread - -#include -#include - -#include "datatypes.h" -#include "types.h" -#include "util.h" - -pthread_t *Thread$new(Closure_t fn); -void Thread$cancel(pthread_t *thread); -void Thread$join(pthread_t *thread); -void Thread$detach(pthread_t *thread); -Text_t Thread$as_text(const pthread_t **thread, bool colorize, const TypeInfo *type); - -extern TypeInfo Thread; - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/tomo.h b/builtins/tomo.h deleted file mode 100644 index 7db0f490..00000000 --- a/builtins/tomo.h +++ /dev/null @@ -1,34 +0,0 @@ -#pragma once - -// All of the different builtin modules can be included by including this one -// import - -#include -#include 
-#include -#include -#include - -#include "arrays.h" -#include "bools.h" -#include "c_strings.h" -#include "channels.h" -#include "datatypes.h" -#include "functiontype.h" -#include "integers.h" -#include "memory.h" -#include "metamethods.h" -#include "nums.h" -#include "optionals.h" -#include "paths.h" -#include "patterns.h" -#include "pointers.h" -#include "ranges.h" -#include "shell.h" -#include "siphash.h" -#include "tables.h" -#include "text.h" -#include "threads.h" -#include "types.h" - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/types.c b/builtins/types.c deleted file mode 100644 index c9f2578f..00000000 --- a/builtins/types.c +++ /dev/null @@ -1,38 +0,0 @@ -// Type information and methods for TypeInfos (i.e. runtime representations of types) -#include -#include -#include -#include -#include - -#include "util.h" -#include "arrays.h" -#include "pointers.h" -#include "tables.h" -#include "text.h" -#include "types.h" - -public Text_t Type$as_text(const void *typeinfo, bool colorize, const TypeInfo *type) -{ - if (!typeinfo) return Text("TypeInfo"); - - if (colorize) - return Text$concat( - Text("\x1b[36;1m"), - Text$from_str(type->TypeInfoInfo.type_str), - Text("\x1b[m")); - else - return Text$from_str(type->TypeInfoInfo.type_str); -} - -public const TypeInfo TypeInfo$info = { - .size=sizeof(TypeInfo), - .align=__alignof__(TypeInfo), - .tag=CustomInfo, - .TypeInfoInfo.type_str="TypeInfo", -}; - -public const TypeInfo Void$info = {.size=0, .align=0, .tag=EmptyStructInfo}; -public const TypeInfo Abort$info = {.size=0, .align=0, .tag=EmptyStructInfo}; - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/types.h b/builtins/types.h deleted file mode 100644 index bcdafad2..00000000 --- a/builtins/types.h +++ /dev/null @@ -1,86 +0,0 @@ -#pragma once - -// Type information and methods for TypeInfos (i.e. 
runtime representations of types) - -#include -#include - -#include "datatypes.h" - -struct TypeInfo; - -typedef uint64_t (*hash_fn_t)(const void*, const struct TypeInfo*); -typedef int32_t (*compare_fn_t)(const void*, const void*, const struct TypeInfo*); -typedef bool (*equal_fn_t)(const void*, const void*, const struct TypeInfo*); -typedef Text_t (*text_fn_t)(const void*, bool, const struct TypeInfo*); - -typedef struct TypeInfo { - int64_t size, align; - struct { // Anonymous tagged union for convenience - enum { CustomInfo, StructInfo, EnumInfo, PointerInfo, TextInfo, ArrayInfo, ChannelInfo, TableInfo, FunctionInfo, - OptionalInfo, TypeInfoInfo, OpaqueInfo, EmptyStructInfo, CStringInfo } tag; - union { - struct { - equal_fn_t equal; - compare_fn_t compare; - hash_fn_t hash; - text_fn_t as_text; - } CustomInfo; - struct { - const char *sigil; - const struct TypeInfo *pointed; - } PointerInfo; - struct { - const char *lang; - } TextInfo; - struct { - const struct TypeInfo *item; - } ArrayInfo, ChannelInfo; - struct { - const struct TypeInfo *key, *value; - } TableInfo; - struct { - const char *type_str; - } FunctionInfo; - struct { - const char *type_str; - } TypeInfoInfo; - struct { - const struct TypeInfo *type; - } OptionalInfo; -#pragma GCC diagnostic ignored "-Wpedantic" - struct {} OpaqueInfo; - struct { - const char *name; - } EmptyStructInfo; - }; - }; -} TypeInfo; - -#define Pointer$info(sigil_expr, pointed_info) &((TypeInfo){.size=sizeof(void*), .align=__alignof__(void*), \ - .tag=PointerInfo, .PointerInfo={.sigil=sigil_expr, .pointed=pointed_info}}) -#define Array$info(item_info) &((TypeInfo){.size=sizeof(Array_t), .align=__alignof__(Array_t), \ - .tag=ArrayInfo, .ArrayInfo.item=item_info}) -#define Set$info(item_info) &((TypeInfo){.size=sizeof(Table_t), .align=__alignof__(Table_t), \ - .tag=TableInfo, .TableInfo.key=item_info, .TableInfo.value=&Void$info}) -#define Channel$info(item_info) &((TypeInfo){.size=sizeof(Channel_t), 
.align=__alignof__(Channel_t), \ - .tag=ChannelInfo, .ChannelInfo.item=item_info}) -#define Table$info(key_expr, value_expr) &((TypeInfo){.size=sizeof(Table_t), .align=__alignof__(Table_t), \ - .tag=TableInfo, .TableInfo.key=key_expr, .TableInfo.value=value_expr}) -#define Function$info(typestr) &((TypeInfo){.size=sizeof(void*), .align=__alignof__(void*), \ - .tag=FunctionInfo, .FunctionInfo.type_str=typestr}) -#define Closure$info(typestr) &((TypeInfo){.size=sizeof(void*[2]), .align=__alignof__(void*), \ - .tag=FunctionInfo, .FunctionInfo.type_str=typestr}) -#define TypeInfo$info(typestr) &((TypeInfo){.size=sizeof(TypeInfo), .align=__alignof__(TypeInfo), \ - .tag=TypeInfoInfo, .TypeInfoInfo.type_str=typestr}) -#define Optional$info(t) &((TypeInfo){.size=(t)->size, .align=(t)->align, \ - .tag=OptionalInfo, .OptionalInfo.type=t}) - -extern const TypeInfo TypeInfo$info; -extern const TypeInfo Void$info; -extern const TypeInfo Abort$info; -#define Void_t void - -Text_t Type$as_text(const void *typeinfo, bool colorize, const TypeInfo *type); - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/util.c b/builtins/util.c deleted file mode 100644 index 7749b22c..00000000 --- a/builtins/util.c +++ /dev/null @@ -1,28 +0,0 @@ -// Built-in utility functions -#include -#include -#include -#include -#include -#include - -#include "text.h" -#include "util.h" - -public bool USE_COLOR; - -__attribute__((format(printf, 1, 2))) -public char *heap_strf(const char *fmt, ...) 
-{ - va_list args; - va_start(args, fmt); - char *tmp = NULL; - int len = vasprintf(&tmp, fmt, args); - if (len < 0) return NULL; - va_end(args); - char *ret = GC_strndup(tmp, (size_t)len); - free(tmp); - return ret; -} - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/util.h b/builtins/util.h deleted file mode 100644 index a24264cd..00000000 --- a/builtins/util.h +++ /dev/null @@ -1,66 +0,0 @@ -#pragma once - -// Built-in utility functions - -#include -#include -#include -#include -#include - -#define streq(a, b) (((a) == NULL && (b) == NULL) || (((a) == NULL) == ((b) == NULL) && strcmp(a, b) == 0)) -#define starts_with(line, prefix) (strncmp(line, prefix, strlen(prefix)) == 0) -#define ends_with(line, suffix) (strlen(line) >= strlen(suffix) && strcmp(line + strlen(line) - strlen(suffix), suffix) == 0) -#define new(t, ...) ((t*)memcpy(GC_MALLOC(sizeof(t)), &(t){__VA_ARGS__}, sizeof(t))) -#define heap(x) (__typeof(x)*)memcpy(GC_MALLOC(sizeof(x)), (__typeof(x)[1]){x}, sizeof(x)) -#define stack(x) (__typeof(x)*)((__typeof(x)[1]){x}) -#define Match(x, _tag) ((x)->tag == _tag ? 
&(x)->__data._tag : (errx(1, __FILE__ ":%d This was supposed to be a " # _tag "\n", __LINE__), &(x)->__data._tag)) -#define check_initialized(var, name) *({ if (!var ## $initialized) fail("The variable " name " is being accessed before it has been initialized!"); \ - &var; }) - -#ifndef auto -#define auto __auto_type -#endif - -#ifndef public -#define public __attribute__ ((visibility ("default"))) -#endif - -#ifndef PUREFUNC -#define PUREFUNC __attribute__ ((pure)) -#endif - -#ifndef CONSTFUNC -#define CONSTFUNC __attribute__ ((const)) -#endif - -extern bool USE_COLOR; - -#define REVERSE_LIST(list) do { \ - __typeof(list) _prev = NULL; \ - __typeof(list) _next = NULL; \ - auto _current = list; \ - while (_current != NULL) { \ - _next = _current->next; \ - _current->next = _prev; \ - _prev = _current; \ - _current = _next; \ - } \ - list = _prev; \ -} while(0) - -#define LIST_MAP(src, var, ...) ({\ - __typeof(src) mapped = NULL; \ - __typeof(src) *next = &mapped; \ - for (__typeof(src) var = src; var; var = var->next) { \ - *next = GC_MALLOC(sizeof(__typeof(*(src)))); \ - **next = *var; \ - **next = (__typeof(*(src))){__VA_ARGS__}; \ - next = &((*next)->next); \ - } \ - mapped; }) - -__attribute__((format(printf, 1, 2))) -char *heap_strf(const char *fmt, ...); - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/compile.c b/compile.c index c9e08c92..2802519a 100644 --- a/compile.c +++ b/compile.c @@ -7,15 +7,15 @@ #include #include "ast.h" -#include "builtins/integers.h" -#include "builtins/text.h" +#include "stdlib/integers.h" +#include "stdlib/text.h" #include "compile.h" #include "cordhelpers.h" #include "enums.h" #include "structs.h" #include "environment.h" #include "typecheck.h" -#include "builtins/util.h" +#include "stdlib/util.h" typedef ast_t* (*comprehension_body_t)(ast_t*, ast_t*); diff --git a/compile.h b/compile.h index fa18ee89..d53262da 100644 --- a/compile.h +++ b/compile.h @@ -6,7 +6,7 @@ #include #include -#include "builtins/util.h" 
+#include "stdlib/util.h" #include "environment.h" CORD expr_as_text(env_t *env, CORD expr, type_t *t, CORD color); diff --git a/cordhelpers.c b/cordhelpers.c index f0e1ab38..5a0066f7 100644 --- a/cordhelpers.c +++ b/cordhelpers.c @@ -3,7 +3,7 @@ #include #include -#include "builtins/util.h" +#include "stdlib/util.h" __attribute__((format(printf, 1, 2))) public CORD CORD_asprintf(CORD fmt, ...) diff --git a/enums.c b/enums.c index 7268d565..e06ca0e0 100644 --- a/enums.c +++ b/enums.c @@ -5,13 +5,13 @@ #include #include "ast.h" -#include "builtins/text.h" +#include "stdlib/text.h" #include "compile.h" #include "cordhelpers.h" #include "structs.h" #include "environment.h" #include "typecheck.h" -#include "builtins/util.h" +#include "stdlib/util.h" PUREFUNC static bool has_extra_data(tag_ast_t *tags) { diff --git a/environment.c b/environment.c index 5170fa95..dc31eee1 100644 --- a/environment.c +++ b/environment.c @@ -3,9 +3,9 @@ #include #include -#include "builtins/tables.h" -#include "builtins/text.h" -#include "builtins/util.h" +#include "stdlib/tables.h" +#include "stdlib/text.h" +#include "stdlib/util.h" #include "cordhelpers.h" #include "environment.h" #include "typecheck.h" diff --git a/environment.h b/environment.h index 005fda7b..59af36da 100644 --- a/environment.h +++ b/environment.h @@ -5,7 +5,7 @@ #include #include "types.h" -#include "builtins/tables.h" +#include "stdlib/tables.h" typedef struct { CORD local_typedefs; diff --git a/parse.c b/parse.c index 4e57cabf..a5ad136b 100644 --- a/parse.c +++ b/parse.c @@ -12,10 +12,10 @@ #include #include "ast.h" -#include "builtins/integers.h" -#include "builtins/text.h" -#include "builtins/tables.h" -#include "builtins/util.h" +#include "stdlib/integers.h" +#include "stdlib/text.h" +#include "stdlib/tables.h" +#include "stdlib/util.h" #include "cordhelpers.h" // The cache of {filename -> parsed AST} will hold at most this many entries: diff --git a/repl.c b/repl.c index 06806c3c..1dbc01c0 100644 --- a/repl.c +++ 
b/repl.c @@ -8,8 +8,8 @@ #include #include -#include "builtins/tomo.h" -#include "builtins/util.h" +#include "stdlib/tomo.h" +#include "stdlib/util.h" #include "typecheck.h" #include "parse.h" diff --git a/stdlib/arrays.c b/stdlib/arrays.c new file mode 100644 index 00000000..58a33754 --- /dev/null +++ b/stdlib/arrays.c @@ -0,0 +1,684 @@ +// Functions that operate on arrays + +#include +#include +#include +#include + +#include "arrays.h" +#include "metamethods.h" +#include "optionals.h" +#include "tables.h" +#include "text.h" +#include "util.h" + +// Use inline version of siphash code: +#include "siphash.h" +#include "siphash-internals.h" + +PUREFUNC static inline int64_t get_padded_item_size(const TypeInfo *info) +{ + int64_t size = info->ArrayInfo.item->size; + if (info->ArrayInfo.item->align > 1 && size % info->ArrayInfo.item->align) + size += info->ArrayInfo.item->align - (size % info->ArrayInfo.item->align); // padding + return size; +} + +// Replace the array's .data pointer with a new pointer to a copy of the +// data that is compacted and has a stride of exactly `padded_item_size` +public void Array$compact(Array_t *arr, int64_t padded_item_size) +{ + void *copy = NULL; + if (arr->length > 0) { + copy = arr->atomic ? 
GC_MALLOC_ATOMIC((size_t)arr->length * (size_t)padded_item_size) + : GC_MALLOC((size_t)arr->length * (size_t)padded_item_size); + if ((int64_t)arr->stride == padded_item_size) { + memcpy(copy, arr->data, (size_t)arr->length * (size_t)padded_item_size); + } else { + for (int64_t i = 0; i < arr->length; i++) + memcpy(copy + i*padded_item_size, arr->data + arr->stride*i, (size_t)padded_item_size); + } + } + *arr = (Array_t){ + .data=copy, + .length=arr->length, + .stride=padded_item_size, + .atomic=arr->atomic, + }; +} + +public void Array$insert(Array_t *arr, const void *item, Int_t int_index, int64_t padded_item_size) +{ + int64_t index = Int_to_Int64(int_index, false); + if (index <= 0) index = arr->length + index + 1; + + if (index < 1) index = 1; + else if (index > (int64_t)arr->length + 1) + fail("Invalid insertion index %ld for an array with length %ld", index, arr->length); + + if (!arr->data) { + arr->free = 4; + arr->data = arr->atomic ? GC_MALLOC_ATOMIC((size_t)arr->free * (size_t)padded_item_size) + : GC_MALLOC((size_t)arr->free * (size_t)padded_item_size); + arr->stride = padded_item_size; + } else if (arr->free < 1 || arr->data_refcount != 0 || (int64_t)arr->stride != padded_item_size) { + arr->free = MIN(ARRAY_MAX_FREE_ENTRIES, MAX(8, arr->length/4)); + void *copy = arr->atomic ? 
GC_MALLOC_ATOMIC((size_t)(arr->length + arr->free) * (size_t)padded_item_size) + : GC_MALLOC((size_t)(arr->length + arr->free) * (size_t)padded_item_size); + for (int64_t i = 0; i < index-1; i++) + memcpy(copy + i*padded_item_size, arr->data + arr->stride*i, (size_t)padded_item_size); + for (int64_t i = index-1; i < (int64_t)arr->length; i++) + memcpy(copy + (i+1)*padded_item_size, arr->data + arr->stride*i, (size_t)padded_item_size); + arr->data = copy; + arr->data_refcount = 0; + arr->stride = padded_item_size; + } else { + if (index != arr->length+1) + memmove( + arr->data + index*padded_item_size, + arr->data + (index-1)*padded_item_size, + (size_t)((arr->length - index + 1)*padded_item_size)); + } + assert(arr->free > 0); + --arr->free; + ++arr->length; + memcpy((void*)arr->data + (index-1)*padded_item_size, item, (size_t)padded_item_size); +} + +public void Array$insert_all(Array_t *arr, Array_t to_insert, Int_t int_index, int64_t padded_item_size) +{ + int64_t index = Int_to_Int64(int_index, false); + if (to_insert.length == 0) + return; + + if (!arr->data) { + *arr = to_insert; + ARRAY_INCREF(*arr); + return; + } + + if (index < 1) index = arr->length + index + 1; + + if (index < 1) index = 1; + else if (index > (int64_t)arr->length + 1) + fail("Invalid insertion index %ld for an array with length %ld", index, arr->length); + + if ((int64_t)arr->free >= (int64_t)to_insert.length // Adequate free space + && arr->data_refcount == 0 // Not aliased memory + && (int64_t)arr->stride == padded_item_size) { // Contiguous array + // If we can fit this within the array's preallocated free space, do that: + arr->free -= to_insert.length; + if (index != arr->length+1) // Shift the tail right by the number of inserted items + memmove((void*)arr->data + (index-1 + to_insert.length)*padded_item_size, + arr->data + (index-1)*padded_item_size, + (size_t)((arr->length - index + 1)*padded_item_size)); + arr->length += to_insert.length; // Grow only after the tail was moved using the old length + for (int64_t i = 0; i < to_insert.length; i++) + memcpy((void*)arr->data + (index-1 +
i)*padded_item_size, + to_insert.data + i*to_insert.stride, (size_t)padded_item_size); + } else { + // Otherwise, allocate a new chunk of memory for the array and populate it: + int64_t new_len = arr->length + to_insert.length; + arr->free = MIN(ARRAY_MAX_FREE_ENTRIES, MAX(8, new_len/4)); + void *data = arr->atomic ? GC_MALLOC_ATOMIC((size_t)((new_len + arr->free) * padded_item_size)) + : GC_MALLOC((size_t)((new_len + arr->free) * padded_item_size)); + void *p = data; + + // Copy first chunk of `arr` if needed: + if (index > 1) { + if (arr->stride == padded_item_size) { + p = mempcpy(p, arr->data, (size_t)((index-1)*padded_item_size)); + } else { + for (int64_t i = 0; i < index-1; i++) + p = mempcpy(p, arr->data + arr->stride*i, (size_t)padded_item_size); + } + } + + // Copy `to_insert` + if (to_insert.stride == padded_item_size) { + p = mempcpy(p, to_insert.data, (size_t)(to_insert.length*padded_item_size)); + } else { + for (int64_t i = 0; i < to_insert.length; i++) // Every inserted item, independent of the insertion index + p = mempcpy(p, to_insert.data + to_insert.stride*i, (size_t)padded_item_size); + } + + // Copy last chunk of `arr` if needed: + if (index < arr->length + 1) { + if (arr->stride == padded_item_size) { + p = mempcpy(p, arr->data + padded_item_size*(index-1), (size_t)((arr->length - index + 1)*padded_item_size)); + } else { + for (int64_t i = index-1; i < arr->length; i++) // Same item range as the contiguous branch above + p = mempcpy(p, arr->data + arr->stride*i, (size_t)padded_item_size); + } + } + arr->length = new_len; + arr->stride = padded_item_size; + arr->data = data; + arr->data_refcount = 0; + } +} + +public void Array$remove_at(Array_t *arr, Int_t int_index, Int_t int_count, int64_t padded_item_size) +{ + int64_t index = Int_to_Int64(int_index, false); + if (index < 1) index = arr->length + index + 1; + + int64_t count = Int_to_Int64(int_count, false); + if (index < 1 || index > (int64_t)arr->length || count < 1) return; + + if (count > arr->length - index + 1) + count = (arr->length - index) + 1; + + if (index == 1) { + arr->data += arr->stride
* count; + } else if (index + count > arr->length) { + if (arr->free >= 0) + arr->free += count; + } else if (arr->data_refcount != 0 || (int64_t)arr->stride != padded_item_size) { + void *copy = arr->atomic ? GC_MALLOC_ATOMIC((size_t)((arr->length-count) * padded_item_size)) + : GC_MALLOC((size_t)((arr->length-count) * padded_item_size)); + for (int64_t src = 1, dest = 1; src <= (int64_t)arr->length; src++) { + if (src < index || src >= index + count) { + memcpy(copy + (dest - 1)*padded_item_size, arr->data + arr->stride*(src - 1), (size_t)padded_item_size); + ++dest; + } + } + arr->data = copy; arr->stride = padded_item_size; // The copy is densely packed + arr->free = 0; + arr->data_refcount = 0; + } else { + memmove((void*)arr->data + (index-1)*padded_item_size, arr->data + (index-1 + count)*padded_item_size, + (size_t)((arr->length - index - count + 1)*padded_item_size)); // Only the items after the removed range + arr->free += count; + } + arr->length -= count; + if (arr->length == 0) arr->data = NULL; +} + +public void Array$remove_item(Array_t *arr, void *item, Int_t max_removals, const TypeInfo *type) +{ + int64_t padded_item_size = get_padded_item_size(type); + const Int_t ZERO = (Int_t){.small=(0<<2)|1}; + const Int_t ONE = (Int_t){.small=(1<<2)|1}; + const TypeInfo *item_type = type->ArrayInfo.item; + for (int64_t i = 0; i < arr->length; ) { + if (max_removals.small == ZERO.small) // zero + break; + + if (generic_equal(item, arr->data + i*arr->stride, item_type)) { + Array$remove_at(arr, I(i+1), ONE, padded_item_size); + max_removals = Int$minus(max_removals, ONE); + } else { + i++; + } + } +} + +public Int_t Array$find(Array_t arr, void *item, const TypeInfo *type) +{ + const TypeInfo *item_type = type->ArrayInfo.item; + for (int64_t i = 0; i < arr.length; i++) { + if (generic_equal(item, arr.data + i*arr.stride, item_type)) + return I(i+1); + } + return NULL_INT; +} + +public Int_t Array$first(Array_t arr, Closure_t predicate) +{ + bool (*is_good)(void*, void*) = (void*)predicate.fn; + for (int64_t i = 0; i < arr.length; i++) { + if (is_good(arr.data +
i*arr.stride, predicate.userdata)) + return I(i+1); + } + return NULL_INT; +} + +public void Array$sort(Array_t *arr, Closure_t comparison, int64_t padded_item_size) +{ + if (arr->data_refcount != 0 || (int64_t)arr->stride != padded_item_size) + Array$compact(arr, padded_item_size); + + qsort_r(arr->data, (size_t)arr->length, (size_t)padded_item_size, comparison.fn, comparison.userdata); +} + +public Array_t Array$sorted(Array_t arr, Closure_t comparison, int64_t padded_item_size) +{ + Array$compact(&arr, padded_item_size); + qsort_r(arr.data, (size_t)arr.length, (size_t)padded_item_size, comparison.fn, comparison.userdata); + return arr; +} + +#pragma GCC diagnostic ignored "-Wstack-protector" +public void Array$shuffle(Array_t *arr, int64_t padded_item_size) +{ + if (arr->data_refcount != 0 || (int64_t)arr->stride != padded_item_size) + Array$compact(arr, padded_item_size); + + char tmp[padded_item_size]; + for (int64_t i = arr->length-1; i > 0; i--) { // Must include i == 1 so items 0 and 1 can trade places + int64_t j = arc4random_uniform(i+1); + memcpy(tmp, arr->data + i*padded_item_size, (size_t)padded_item_size); + memcpy((void*)arr->data + i*padded_item_size, arr->data + j*padded_item_size, (size_t)padded_item_size); + memcpy((void*)arr->data + j*padded_item_size, tmp, (size_t)padded_item_size); + } +} + +public Array_t Array$shuffled(Array_t arr, int64_t padded_item_size) +{ + Array$compact(&arr, padded_item_size); + Array$shuffle(&arr, padded_item_size); + return arr; +} + +public void *Array$random(Array_t arr) +{ + if (arr.length == 0) + return NULL; // fail("Cannot get a random item from an empty array!"); + int64_t index = arc4random_uniform(arr.length); + return arr.data + arr.stride*index; +} + +public Table_t Array$counts(Array_t arr, const TypeInfo *type) +{ + Table_t counts = {}; + const TypeInfo count_type = {.size=sizeof(Table_t), .align=__alignof__(Table_t), + .tag=TableInfo, .TableInfo.key=type->ArrayInfo.item, .TableInfo.value=&Int$info}; + for (int64_t i = 0; i < arr.length; i++) { + void *key =
arr.data + i*arr.stride; + int64_t *count = Table$get(counts, key, &count_type); + int64_t val = count ? *count + 1 : 1; + Table$set(&counts, key, &val, &count_type); + } + return counts; +} + +public Array_t Array$sample(Array_t arr, Int_t int_n, Array_t weights, int64_t padded_item_size) +{ + int64_t n = Int_to_Int64(int_n, false); + if (arr.length == 0 || n <= 0) + return (Array_t){}; + + Array_t selected = { + .data=arr.atomic ? GC_MALLOC_ATOMIC((size_t)(n * padded_item_size)) : GC_MALLOC((size_t)(n * padded_item_size)), + .length=n, + .stride=padded_item_size, .atomic=arr.atomic}; + + double total = 0.0; + for (int64_t i = 0; i < weights.length && i < arr.length; i++) { + double weight = *(double*)(weights.data + weights.stride*i); + if (isinf(weight)) + fail("Infinite weight!"); + else if (isnan(weight)) + fail("NaN weight!"); + else if (weight < 0.0) + fail("Negative weight!"); + else + total += weight; + } + + if (isinf(total)) + fail("Sample weights have overflowed to infinity"); + + if (total == 0.0) { + for (int64_t i = 0; i < n; i++) { + int64_t index = arc4random_uniform(arr.length); + memcpy(selected.data + i*padded_item_size, arr.data + arr.stride*index, (size_t)padded_item_size); + } + } else { + double inverse_average = (double)arr.length / total; + + struct { + int64_t alias; + double odds; + } aliases[arr.length] = {}; + + for (int64_t i = 0; i < arr.length; i++) { + double weight = i >= weights.length ? 
0.0 : *(double*)(weights.data + weights.stride*i); + aliases[i].odds = weight * inverse_average; + aliases[i].alias = -1; + } + + int64_t small = 0; + for (int64_t big = 0; big < arr.length; big++) { + while (aliases[big].odds >= 1.0) { + while (small < arr.length && (aliases[small].odds >= 1.0 || aliases[small].alias != -1)) + ++small; + + if (small >= arr.length) { + aliases[big].odds = 1.0; + aliases[big].alias = big; + break; + } + + aliases[small].alias = big; + aliases[big].odds = (aliases[small].odds + aliases[big].odds) - 1.0; + } + if (big < small) small = big; + } + + for (int64_t i = small; i < arr.length; i++) + if (aliases[i].alias == -1) + aliases[i].alias = i; + + for (int64_t i = 0; i < n; i++) { + double r = drand48() * arr.length; + int64_t index = (int64_t)r; + if ((r - (double)index) > aliases[index].odds) + index = aliases[index].alias; + memcpy(selected.data + i*selected.stride, arr.data + index*arr.stride, (size_t)padded_item_size); + } + } + return selected; +} + +public Array_t Array$from(Array_t array, Int_t int_first) +{ + int64_t first = Int_to_Int64(int_first, false); + if (first < 0) + first = array.length + first + 1; + + if (first < 1 || first > array.length) + return (Array_t){.atomic=array.atomic}; + + return (Array_t){ + .atomic=array.atomic, + .data=array.data + array.stride*(first-1), + .length=array.length - first + 1, + .stride=array.stride, + .data_refcount=array.data_refcount, + }; +} + +public Array_t Array$to(Array_t array, Int_t int_last) +{ + int64_t last = Int_to_Int64(int_last, false); + if (last < 0) + last = array.length + last + 1; + + if (last > array.length) + last = array.length; + + if (last == 0) + return (Array_t){.atomic=array.atomic}; + + return (Array_t){ + .atomic=array.atomic, + .data=array.data, + .length=last, + .stride=array.stride, + .data_refcount=array.data_refcount, + }; +} + +public Array_t Array$by(Array_t array, Int_t int_stride, int64_t padded_item_size) +{ + int64_t stride = 
Int_to_Int64(int_stride, false); + // In the unlikely event that the stride value would be too large to fit in + // a 15-bit integer, fall back to creating a copy of the array: + if (__builtin_expect(array.stride*stride < ARRAY_MIN_STRIDE || array.stride*stride > ARRAY_MAX_STRIDE, 0)) { + void *copy = NULL; + int64_t len = (stride < 0 ? array.length / -stride : array.length / stride) + ((array.length % stride) != 0); + if (len > 0) { + copy = array.atomic ? GC_MALLOC_ATOMIC((size_t)(len * padded_item_size)) : GC_MALLOC((size_t)(len * padded_item_size)); + void *start = (stride < 0 ? array.data + (array.stride * (array.length - 1)) : array.data); + for (int64_t i = 0; i < len; i++) + memcpy(copy + i*padded_item_size, start + array.stride*stride*i, (size_t)padded_item_size); + } + return (Array_t){ + .data=copy, + .length=len, + .stride=padded_item_size, + .atomic=array.atomic, + }; + } + + if (stride == 0) + return (Array_t){.atomic=array.atomic}; + + return (Array_t){ + .atomic=array.atomic, + .data=(stride < 0 ? array.data + (array.stride * (array.length - 1)) : array.data), + .length=(stride < 0 ? array.length / -stride : array.length / stride) + ((array.length % stride) != 0), + .stride=array.stride * stride, + .data_refcount=array.data_refcount, + }; +} + +public Array_t Array$reversed(Array_t array, int64_t padded_item_size) +{ + // Just in case negating the stride gives a value that doesn't fit into a + // 15-bit integer, fall back to Array$by()'s more general method of copying + // the array. This should only happen if array.stride is MIN_STRIDE to + // begin with (very unlikely). 
+ if (__builtin_expect(-array.stride < ARRAY_MIN_STRIDE || -array.stride > ARRAY_MAX_STRIDE, 0)) + return Array$by(array, I(-1), padded_item_size); + + Array_t reversed = array; + reversed.stride = -array.stride; + reversed.data = array.data + (array.length-1)*array.stride; + return reversed; +} + +public Array_t Array$concat(Array_t x, Array_t y, int64_t padded_item_size) +{ + void *data = x.atomic ? GC_MALLOC_ATOMIC((size_t)(padded_item_size*(x.length + y.length))) + : GC_MALLOC((size_t)(padded_item_size*(x.length + y.length))); + if (x.stride == padded_item_size) { + memcpy(data, x.data, (size_t)(padded_item_size*x.length)); + } else { + for (int64_t i = 0; i < x.length; i++) + memcpy(data + i*padded_item_size, x.data + i*x.stride, (size_t)padded_item_size); // Read with x's own stride + } + + if (y.stride == padded_item_size) { + memcpy(data + padded_item_size*x.length, y.data, (size_t)(padded_item_size*y.length)); + } else { + for (int64_t i = 0; i < y.length; i++) // Iterate over y's items, not x's + memcpy(data + (x.length + i)*padded_item_size, y.data + i*y.stride, (size_t)padded_item_size); // Read with y's own stride + } + + return (Array_t){ + .data=data, + .length=x.length + y.length, + .stride=padded_item_size, + .atomic=x.atomic, + }; +} + +public bool Array$has(Array_t array, void *item, const TypeInfo *type) +{ + const TypeInfo *item_type = type->ArrayInfo.item; + for (int64_t i = 0; i < array.length; i++) { + if (generic_equal(array.data + i*array.stride, item, item_type)) + return true; + } + return false; +} + +public void Array$clear(Array_t *array) +{ + *array = (Array_t){.data=0, .length=0}; +} + +public int32_t Array$compare(const Array_t *x, const Array_t *y, const TypeInfo *type) +{ + // Early out for arrays with the same data, e.g.
two copies of the same array: + if (x->data == y->data && x->stride == y->stride) + return (x->length > y->length) - (x->length < y->length); + + const TypeInfo *item = type->ArrayInfo.item; + if (item->tag == PointerInfo || (item->tag == CustomInfo && item->CustomInfo.compare == NULL)) { // data comparison + int64_t item_padded_size = type->ArrayInfo.item->size; + if (type->ArrayInfo.item->align > 1 && item_padded_size % type->ArrayInfo.item->align) + item_padded_size += type->ArrayInfo.item->align - (item_padded_size % type->ArrayInfo.item->align); // padding + + if ((int64_t)x->stride == item_padded_size && (int64_t)y->stride == item_padded_size && item->size == item_padded_size) { + int32_t cmp = (int32_t)memcmp(x->data, y->data, (size_t)(MIN(x->length, y->length)*item_padded_size)); + if (cmp != 0) return cmp; + } else { + for (int32_t i = 0, len = MIN(x->length, y->length); i < len; i++) { + int32_t cmp = (int32_t)memcmp(x->data+ x->stride*i, y->data + y->stride*i, (size_t)(item->size)); + if (cmp != 0) return cmp; + } + } + } else { + for (int32_t i = 0, len = MIN(x->length, y->length); i < len; i++) { + int32_t cmp = generic_compare(x->data + x->stride*i, y->data + y->stride*i, item); + if (cmp != 0) return cmp; + } + } + return (x->length > y->length) - (x->length < y->length); +} + +public bool Array$equal(const Array_t *x, const Array_t *y, const TypeInfo *type) +{ + return x == y || (x->length == y->length && Array$compare(x, y, type) == 0); +} + +public Text_t Array$as_text(const Array_t *arr, bool colorize, const TypeInfo *type) +{ + if (!arr) + return Text$concat(Text("["), generic_as_text(NULL, false, type->ArrayInfo.item), Text("]")); + + const TypeInfo *item_type = type->ArrayInfo.item; + Text_t text = Text("["); + for (int64_t i = 0; i < arr->length; i++) { + if (i > 0) + text = Text$concat(text, Text(", ")); + Text_t item_text = generic_as_text(arr->data + i*arr->stride, colorize, item_type); + text = Text$concat(text, item_text); + } + text = 
Text$concat(text, Text("]")); + return text; +} + +public uint64_t Array$hash(const Array_t *arr, const TypeInfo *type) +{ + const TypeInfo *item = type->ArrayInfo.item; + siphash sh; + siphashinit(&sh, sizeof(uint64_t[arr->length])); + if (item->tag == PointerInfo || (item->tag == CustomInfo && item->CustomInfo.hash == NULL && item->size == sizeof(void*))) { // Raw data hash + for (int64_t i = 0; i < arr->length; i++) + siphashadd64bits(&sh, (uint64_t)(arr->data + i*arr->stride)); + } else { + for (int64_t i = 0; i < arr->length; i++) { + uint64_t item_hash = generic_hash(arr->data + i*arr->stride, item); + siphashadd64bits(&sh, item_hash); + } + } + return siphashfinish_last_part(&sh, 0); +} + +#pragma GCC diagnostic ignored "-Wstack-protector" +static void siftdown(Array_t *heap, int64_t startpos, int64_t pos, Closure_t comparison, int64_t padded_item_size) +{ + assert(pos > 0 && pos < heap->length); + char newitem[padded_item_size]; + memcpy(newitem, heap->data + heap->stride*pos, (size_t)(padded_item_size)); + while (pos > startpos) { + int64_t parentpos = (pos - 1) >> 1; + typedef int32_t (*cmp_fn_t)(void*, void*, void*); + int32_t cmp = ((cmp_fn_t)comparison.fn)(newitem, heap->data + heap->stride*parentpos, comparison.userdata); + if (cmp >= 0) + break; + + memcpy(heap->data + heap->stride*pos, heap->data + heap->stride*parentpos, (size_t)(padded_item_size)); + pos = parentpos; + } + memcpy(heap->data + heap->stride*pos, newitem, (size_t)(padded_item_size)); +} + +static void siftup(Array_t *heap, int64_t pos, Closure_t comparison, int64_t padded_item_size) +{ + int64_t endpos = heap->length; + int64_t startpos = pos; + assert(pos < endpos); + + char old_top[padded_item_size]; + memcpy(old_top, heap->data + heap->stride*pos, (size_t)(padded_item_size)); + // Bubble up the smallest leaf node + int64_t limit = endpos >> 1; + while (pos < limit) { + int64_t childpos = 2*pos + 1; // Smaller of the two child nodes + if (childpos + 1 < endpos) { + typedef int32_t 
(*cmp_fn_t)(void*, void*, void*); + int32_t cmp = ((cmp_fn_t)comparison.fn)( + heap->data + heap->stride*childpos, + heap->data + heap->stride*(childpos + 1), + comparison.userdata); + childpos += (cmp >= 0); + } + + // Move the child node up: + memcpy(heap->data + heap->stride*pos, heap->data + heap->stride*childpos, (size_t)(padded_item_size)); + pos = childpos; + } + memcpy(heap->data + heap->stride*pos, old_top, (size_t)(padded_item_size)); + // Shift the node's parents down: + siftdown(heap, startpos, pos, comparison, padded_item_size); +} + +public void Array$heap_push(Array_t *heap, const void *item, Closure_t comparison, int64_t padded_item_size) +{ + Array$insert(heap, item, I(0), padded_item_size); + + if (heap->length > 1) { + if (heap->data_refcount != 0) + Array$compact(heap, padded_item_size); + siftdown(heap, 0, heap->length-1, comparison, padded_item_size); + } +} + +public void Array$heap_pop(Array_t *heap, Closure_t comparison, int64_t padded_item_size) +{ + if (heap->length == 0) + fail("Attempt to pop from an empty array"); + + if (heap->length == 1) { + *heap = (Array_t){}; + } else if (heap->length == 2) { + heap->data += heap->stride; + --heap->length; + } else { + if (heap->data_refcount != 0) + Array$compact(heap, padded_item_size); + memcpy(heap->data, heap->data + heap->stride*(heap->length-1), (size_t)(padded_item_size)); + --heap->length; + siftup(heap, 0, comparison, padded_item_size); + } +} + +public void Array$heapify(Array_t *heap, Closure_t comparison, int64_t padded_item_size) +{ + if (heap->data_refcount != 0) + Array$compact(heap, padded_item_size); + + // It's necessary to bump the refcount because the user's comparison + // function could do stuff that modifies the heap's data. 
+ ARRAY_INCREF(*heap); + int64_t i, n = heap->length; + for (i = (n >> 1) - 1 ; i >= 0 ; i--) + siftup(heap, i, comparison, padded_item_size); + ARRAY_DECREF(*heap); +} + +public Int_t Array$binary_search(Array_t array, void *target, Closure_t comparison) +{ + typedef int32_t (*cmp_fn_t)(void*, void*, void*); + int64_t lo = 0, hi = array.length-1; + while (lo <= hi) { + int64_t mid = (lo + hi) / 2; + int32_t cmp = ((cmp_fn_t)comparison.fn)( + array.data + array.stride*mid, target, comparison.userdata); + if (cmp == 0) + return I(mid+1); + else if (cmp < 0) + lo = mid + 1; + else if (cmp > 0) + hi = mid - 1; + } + return I(lo+1); // Return the index where the target would be inserted +} + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/stdlib/arrays.h b/stdlib/arrays.h new file mode 100644 index 00000000..1e945e5e --- /dev/null +++ b/stdlib/arrays.h @@ -0,0 +1,103 @@ +#pragma once + +// Functions that operate on arrays + +#include + +#include "datatypes.h" +#include "integers.h" +#include "types.h" +#include "util.h" + +// Convert negative indices to back-indexed without branching: index0 = index + (index < 0)*(len+1)) - 1 +#define Array_get(item_type, arr_expr, index_expr, start, end) *({ \ + const Array_t arr = arr_expr; int64_t index = index_expr; \ + int64_t off = index + (index < 0) * (arr.length + 1) - 1; \ + if (__builtin_expect(off < 0 || off >= arr.length, 0)) \ + fail_source(__SOURCE_FILE__, start, end, "Invalid array index: %s (array has length %ld)\n", Text$as_c_string(Int64$as_text(&index, no, NULL)), arr.length); \ + (item_type*)(arr.data + arr.stride * off);}) +#define Array_get_unchecked(type, x, i) *({ const Array_t arr = x; int64_t index = i; \ + int64_t off = index + (index < 0) * (arr.length + 1) - 1; \ + (type*)(arr.data + arr.stride * off);}) +#define Array_lvalue(item_type, arr_expr, index_expr, padded_item_size, start, end) *({ \ + Array_t *arr = arr_expr; int64_t index = index_expr; \ + int64_t off = index + (index < 0) * 
(arr->length + 1) - 1; \ + if (__builtin_expect(off < 0 || off >= arr->length, 0)) \ + fail_source(__SOURCE_FILE__, start, end, "Invalid array index: %s (array has length %ld)\n", Text$as_c_string(Int64$as_text(&index, no, NULL)), arr->length); \ + if (arr->data_refcount > 0) \ + Array$compact(arr, padded_item_size); \ + (item_type*)(arr->data + arr->stride * off); }) +#define Array_lvalue_unchecked(item_type, arr_expr, index_expr, padded_item_size) *({ \ + Array_t *arr = arr_expr; int64_t index = index_expr; \ + int64_t off = index + (index < 0) * (arr->length + 1) - 1; \ + if (arr->data_refcount > 0) \ + Array$compact(arr, padded_item_size); \ + (item_type*)(arr->data + arr->stride * off); }) +#define Array_set(item_type, arr, index, value, padded_item_size, start, end) \ + Array_lvalue(item_type, arr, index, padded_item_size, start, end) = value // Forwards this macro's own `arr` parameter +#define is_atomic(x) _Generic(x, bool: true, int8_t: true, int16_t: true, int32_t: true, int64_t: true, float: true, double: true, default: false) +#define TypedArray(t, ...) ({ t items[] = {__VA_ARGS__}; \ + (Array_t){.length=sizeof(items)/sizeof(items[0]), \ + .stride=(int64_t)&items[1] - (int64_t)&items[0], \ + .data=memcpy(GC_MALLOC(sizeof(items)), items, sizeof(items)), \ + .atomic=0, \ + .data_refcount=0}; }) +#define TypedArrayN(t, N, ...) ({ t items[N] = {__VA_ARGS__}; \ + (Array_t){.length=N, \ + .stride=(int64_t)&items[1] - (int64_t)&items[0], \ + .data=memcpy(GC_MALLOC(sizeof(items)), items, sizeof(items)), \ + .atomic=0, \ + .data_refcount=0}; }) +#define Array(x, ...) ({ __typeof(x) items[] = {x, __VA_ARGS__}; \ + (Array_t){.length=sizeof(items)/sizeof(items[0]), \ + .stride=(int64_t)&items[1] - (int64_t)&items[0], \ + .data=memcpy(is_atomic(x) ? GC_MALLOC_ATOMIC(sizeof(items)) : GC_MALLOC(sizeof(items)), items, sizeof(items)), \ + .atomic=is_atomic(x), \ + .data_refcount=0}; }) +// Array refcounts use a saturating add, where once it's at the max value, it stays there.
+#define ARRAY_INCREF(arr) (arr).data_refcount += ((arr).data_refcount < ARRAY_MAX_DATA_REFCOUNT) +#define ARRAY_DECREF(arr) (arr).data_refcount -= ((arr).data_refcount < ARRAY_MAX_DATA_REFCOUNT) +#define ARRAY_COPY(arr) ({ ARRAY_INCREF(arr); arr; }) + +#define Array$insert_value(arr, item_expr, index, padded_item_size) ({ __typeof(item_expr) item = item_expr; Array$insert(arr, &item, index, padded_item_size); }) +void Array$insert(Array_t *arr, const void *item, Int_t index, int64_t padded_item_size); +void Array$insert_all(Array_t *arr, Array_t to_insert, Int_t index, int64_t padded_item_size); +void Array$remove_at(Array_t *arr, Int_t index, Int_t count, int64_t padded_item_size); +void Array$remove_item(Array_t *arr, void *item, Int_t max_removals, const TypeInfo *type); +#define Array$remove_item_value(arr, item_expr, max, type) ({ __typeof(item_expr) item = item_expr; Array$remove_item(arr, &item, max, type); }) +Int_t Array$find(Array_t arr, void *item, const TypeInfo *type); +#define Array$find_value(arr, item_expr, type) ({ __typeof(item_expr) item = item_expr; Array$find(arr, &item, type); }) +Int_t Array$first(Array_t arr, Closure_t predicate); +void Array$sort(Array_t *arr, Closure_t comparison, int64_t padded_item_size); +Array_t Array$sorted(Array_t arr, Closure_t comparison, int64_t padded_item_size); +void Array$shuffle(Array_t *arr, int64_t padded_item_size); +Array_t Array$shuffled(Array_t arr, int64_t padded_item_size); +void *Array$random(Array_t arr); +#define Array$random_value(arr, t) ({ Array_t _arr = arr; if (_arr.length == 0) fail("Cannot get a random value from an empty array!"); *(t*)Array$random(_arr); }) +Array_t Array$sample(Array_t arr, Int_t n, Array_t weights, int64_t padded_item_size); +Table_t Array$counts(Array_t arr, const TypeInfo *type); +void Array$clear(Array_t *array); +void Array$compact(Array_t *arr, int64_t padded_item_size); +PUREFUNC bool Array$has(Array_t array, void *item, const TypeInfo *type); +#define 
Array$has_value(arr, item_expr, type) ({ __typeof(item_expr) item = item_expr; Array$has(arr, &item, type); }) +PUREFUNC Array_t Array$from(Array_t array, Int_t first); +PUREFUNC Array_t Array$to(Array_t array, Int_t last); +PUREFUNC Array_t Array$by(Array_t array, Int_t stride, int64_t padded_item_size); +PUREFUNC Array_t Array$reversed(Array_t array, int64_t padded_item_size); +Array_t Array$concat(Array_t x, Array_t y, int64_t padded_item_size); +PUREFUNC uint64_t Array$hash(const Array_t *arr, const TypeInfo *type); +PUREFUNC int32_t Array$compare(const Array_t *x, const Array_t *y, const TypeInfo *type); +PUREFUNC bool Array$equal(const Array_t *x, const Array_t *y, const TypeInfo *type); +Text_t Array$as_text(const Array_t *arr, bool colorize, const TypeInfo *type); +void Array$heapify(Array_t *heap, Closure_t comparison, int64_t padded_item_size); +void Array$heap_push(Array_t *heap, const void *item, Closure_t comparison, int64_t padded_item_size); +#define Array$heap_push_value(heap, _value, comparison, padded_item_size) ({ __typeof(_value) value = _value; Array$heap_push(heap, &value, comparison, padded_item_size); }) +void Array$heap_pop(Array_t *heap, Closure_t comparison, int64_t padded_item_size); +#define Array$heap_pop_value(heap, comparison, padded_item_size, type) \ + ({ Array_t *_heap = heap; if (_heap->length == 0) fail("Attempt to pop from an empty array"); \ + type value = *(type*)_heap->data; Array$heap_pop(_heap, comparison, padded_item_size); value; }) +Int_t Array$binary_search(Array_t array, void *target, Closure_t comparison); +#define Array$binary_search_value(array, target, comparison) \ + ({ __typeof(target) _target = target; Array$binary_search(array, &_target, comparison); }) + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/stdlib/bools.c b/stdlib/bools.c new file mode 100644 index 00000000..d7b3718f --- /dev/null +++ b/stdlib/bools.c @@ -0,0 +1,54 @@ +// Boolean methods/type info +#include +#include +#include +#include 
+#include +#include +#include + +#include "bools.h" +#include "optionals.h" +#include "text.h" +#include "util.h" + +PUREFUNC public Text_t Bool$as_text(const bool *b, bool colorize, const TypeInfo *type) +{ + (void)type; + if (!b) return Text("Bool"); + if (colorize) + return *b ? Text("\x1b[35myes\x1b[m") : Text("\x1b[35mno\x1b[m"); + else + return *b ? Text("yes") : Text("no"); +} + +PUREFUNC public OptionalBool_t Bool$from_text(Text_t text) +{ + if (Text$equal_ignoring_case(text, Text("yes")) + || Text$equal_ignoring_case(text, Text("on")) + || Text$equal_ignoring_case(text, Text("true")) + || Text$equal_ignoring_case(text, Text("1"))) { + return yes; + } else if (Text$equal_ignoring_case(text, Text("no")) + || Text$equal_ignoring_case(text, Text("off")) + || Text$equal_ignoring_case(text, Text("false")) + || Text$equal_ignoring_case(text, Text("0"))) { + return no; + } else { + return NULL_BOOL; + } +} + +public Bool_t Bool$random(double p) +{ + return (drand48() < p); +} + +public const TypeInfo Bool$info = { + .size=sizeof(bool), + .align=__alignof__(bool), + .tag=CustomInfo, + .CustomInfo={.as_text=(void*)Bool$as_text}, +}; + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/stdlib/bools.h b/stdlib/bools.h new file mode 100644 index 00000000..98b2ac06 --- /dev/null +++ b/stdlib/bools.h @@ -0,0 +1,22 @@ +#pragma once + +// Boolean functions/type info + +#include +#include + +#include "types.h" +#include "optionals.h" +#include "util.h" + +#define Bool_t bool +#define yes (Bool_t)true +#define no (Bool_t)false + +PUREFUNC Text_t Bool$as_text(const bool *b, bool colorize, const TypeInfo *type); +OptionalBool_t Bool$from_text(Text_t text); +Bool_t Bool$random(double p); + +extern const TypeInfo Bool$info; + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/stdlib/c_strings.c b/stdlib/c_strings.c new file mode 100644 index 00000000..392565ab --- /dev/null +++ b/stdlib/c_strings.c @@ -0,0 +1,55 @@ +// Type info and methods for CString datatype 
(char*) +#include +#include +#include +#include +#include +#include + +#include "text.h" +#include "siphash.h" +#include "util.h" + +public Text_t CString$as_text(const char **c_string, bool colorize, const TypeInfo *info) +{ + (void)info; + if (!c_string) return Text("CString"); + Text_t text = Text$from_str(*c_string); + return Text$concat(colorize ? Text("\x1b[34mCString\x1b[m(") : Text("CString("), Text$quoted(text, colorize), Text(")")); +} + +public Text_t CString$as_text_simple(const char *str) +{ + return Text$format("%s", str); +} + +PUREFUNC public int32_t CString$compare(const char **x, const char **y) +{ + if (x == y) + return 0; + + if (!*x != !*y) + return (!*y) - (!*x); + + return strcmp(*x, *y); +} + +PUREFUNC public bool CString$equal(const char **x, const char **y) +{ + return CString$compare(x, y) == 0; +} + +PUREFUNC public uint64_t CString$hash(const char **c_str) +{ + if (!*c_str) return 0; + return siphash24((void*)*c_str, strlen(*c_str)); +} + +public const TypeInfo CString$info = { + .size=sizeof(char*), + .align=__alignof__(char*), + .tag=CStringInfo, + .CustomInfo={.as_text=(void*)CString$as_text, .compare=(void*)CString$compare, .equal=(void*)CString$equal, .hash=(void*)CString$hash}, +}; + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/stdlib/c_strings.h b/stdlib/c_strings.h new file mode 100644 index 00000000..d4c1caa7 --- /dev/null +++ b/stdlib/c_strings.h @@ -0,0 +1,18 @@ +#pragma once + +// Type info and methods for CString datatype, which represents C's `char*` + +#include +#include + +#include "types.h" + +Text_t CString$as_text(char **str, bool colorize, const TypeInfo *info); +Text_t CString$as_text_simple(const char *str); +PUREFUNC int CString$compare(const char **x, const char **y); +PUREFUNC bool CString$equal(const char **x, const char **y); +PUREFUNC uint64_t CString$hash(const char **str); + +extern const TypeInfo CString$info; + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/stdlib/channels.c 
b/stdlib/channels.c new file mode 100644 index 00000000..3681b0b8 --- /dev/null +++ b/stdlib/channels.c @@ -0,0 +1,137 @@ +// Functions that operate on channels + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arrays.h" +#include "metamethods.h" +#include "integers.h" +#include "siphash.h" +#include "text.h" +#include "types.h" +#include "util.h" + +public Channel_t *Channel$new(Int_t max_size) +{ + if (Int$compare_value(max_size, I_small(0)) <= 0) + fail("Cannot create a channel with a size less than one: %ld", max_size); + Channel_t *channel = new(Channel_t); + channel->items = (Array_t){}; + channel->mutex = (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER; + channel->cond = (pthread_cond_t)PTHREAD_COND_INITIALIZER; + channel->max_size = Int_to_Int64(max_size, false); + return channel; +} + +public void Channel$give(Channel_t *channel, const void *item, bool front, int64_t padded_item_size) +{ + (void)pthread_mutex_lock(&channel->mutex); + while (channel->items.length >= channel->max_size) + pthread_cond_wait(&channel->cond, &channel->mutex); + Int_t index = front ? I_small(1) : I_small(0); + Array$insert(&channel->items, item, index, padded_item_size); + (void)pthread_mutex_unlock(&channel->mutex); + (void)pthread_cond_signal(&channel->cond); +} + +public void Channel$give_all(Channel_t *channel, Array_t to_give, bool front, int64_t padded_item_size) +{ + if (to_give.length == 0) return; + (void)pthread_mutex_lock(&channel->mutex); + Int_t index = front ? 
I_small(1) : I_small(0); + if (channel->items.length + to_give.length >= channel->max_size) { + for (int64_t i = 0; i < to_give.length; i++) { + while (channel->items.length >= channel->max_size) + pthread_cond_wait(&channel->cond, &channel->mutex); + Array$insert(&channel->items, to_give.data + i*to_give.stride, index, padded_item_size); + } + } else { + Array$insert_all(&channel->items, to_give, index, padded_item_size); + } + (void)pthread_mutex_unlock(&channel->mutex); + (void)pthread_cond_signal(&channel->cond); +} + +public void Channel$get(Channel_t *channel, void *out, bool front, int64_t item_size, int64_t padded_item_size) +{ + (void)pthread_mutex_lock(&channel->mutex); + while (channel->items.length == 0) + pthread_cond_wait(&channel->cond, &channel->mutex); + memcpy(out, channel->items.data + channel->items.stride * (front ? 0 : channel->items.length-1), (size_t)(item_size)); + Int_t index = front ? I_small(1) : Int64_to_Int(channel->items.length); + Array$remove_at(&channel->items, index, I_small(1), padded_item_size); + (void)pthread_mutex_unlock(&channel->mutex); + (void)pthread_cond_signal(&channel->cond); +} + +public void Channel$peek(Channel_t *channel, void *out, bool front, int64_t item_size) +{ + (void)pthread_mutex_lock(&channel->mutex); + while (channel->items.length == 0) + pthread_cond_wait(&channel->cond, &channel->mutex); + int64_t index = front ? 
0 : channel->items.length-1;
+    memcpy(out, channel->items.data + channel->items.stride*index, (size_t)(item_size));
+    (void)pthread_mutex_unlock(&channel->mutex);
+    (void)pthread_cond_signal(&channel->cond);
+}
+
+// Return a copy of the channel's item array struct, taken while holding the
+// channel mutex. The array's data refcount is incremented before the copy.
+public Array_t Channel$view(Channel_t *channel)
+{
+    (void)pthread_mutex_lock(&channel->mutex);
+    ARRAY_INCREF(channel->items);
+    Array_t ret = channel->items;
+    (void)pthread_mutex_unlock(&channel->mutex);
+    return ret;
+}
+
+// Clear the channel's items under the mutex, then signal the condition
+// variable so that a thread waiting on the channel is woken up.
+public void Channel$clear(Channel_t *channel)
+{
+    (void)pthread_mutex_lock(&channel->mutex);
+    Array$clear(&channel->items);
+    (void)pthread_mutex_unlock(&channel->mutex);
+    (void)pthread_cond_signal(&channel->cond);
+}
+
+// Hash a channel by identity. Channel$equal() and Channel$compare() look only
+// at the pointer value, so the hash must cover the pointer itself (the bytes
+// stored at `channel`), not the first bytes of the Channel_t it points to,
+// which change as the channel's item array is modified.
+PUREFUNC public uint64_t Channel$hash(Channel_t **channel, const TypeInfo *type)
+{
+    (void)type;
+    return siphash24((void*)channel, sizeof(Channel_t*));
+}
+
+// Order channels by the address of the Channel_t they point to.
+PUREFUNC public int32_t Channel$compare(Channel_t **x, Channel_t **y, const TypeInfo *type)
+{
+    (void)type;
+    return (*x > *y) - (*x < *y);
+}
+
+// Two channels are equal only if they are the same Channel_t object.
+PUREFUNC public bool Channel$equal(Channel_t **x, Channel_t **y, const TypeInfo *type)
+{
+    (void)type;
+    return (*x == *y);
+}
+
+// Text form of a channel. With a NULL `channel` only the type name `|:T|` is
+// produced; otherwise the channel's address is appended in hex as `|:T|<0x...>`.
+public Text_t Channel$as_text(Channel_t **channel, bool colorize, const TypeInfo *type)
+{
+    const TypeInfo *item_type = type->ChannelInfo.item;
+    if (!channel) {
+        Text_t typename = generic_as_text(NULL, false, item_type);
+        return Text$concat(colorize ? Text("\x1b[34;1m|:") : Text("|:"), typename, colorize ? Text("|\x1b[m") : Text("|"));
+    }
+    Text_t typename = generic_as_text(NULL, false, item_type);
+    return Text$concat(
+        colorize ? Text("\x1b[34;1m|:") : Text("|:"),
+        typename,
+        Text("|<"),
+        Int64$hex((int64_t)(void*)*channel, I_small(0), true, true),
+        colorize ? 
Text(">\x1b[m") : Text(">") + ); +} + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/stdlib/channels.h b/stdlib/channels.h new file mode 100644 index 00000000..8deb0569 --- /dev/null +++ b/stdlib/channels.h @@ -0,0 +1,28 @@ +#pragma once + +// Functions that operate on channels (thread-safe arrays) + +#include + +#include "datatypes.h" +#include "types.h" +#include "util.h" + +Channel_t *Channel$new(Int_t max_size); +void Channel$give(Channel_t *channel, const void *item, bool front, int64_t padded_item_size); +#define Channel$give_value(channel, item, front, padded_item_size) \ + ({ __typeof(item) _item = item; Channel$give(channel, &_item, front, padded_item_size); }) +void Channel$give_all(Channel_t *channel, Array_t to_give, bool front, int64_t padded_item_size); +void Channel$get(Channel_t *channel, void *out, bool front, int64_t item_size, int64_t padded_item_size); +#define Channel$get_value(channel, front, t, padded_item_size) \ + ({ t _val; Channel$get(channel, &_val, front, sizeof(t), padded_item_size); _val; }) +void Channel$peek(Channel_t *channel, void *out, bool front, int64_t item_size); +#define Channel$peek_value(channel, front, t) ({ t _val; Channel$peek(channel, &_val, front, sizeof(t)); _val; }) +void Channel$clear(Channel_t *channel); +Array_t Channel$view(Channel_t *channel); +PUREFUNC uint64_t Channel$hash(Channel_t **channel, const TypeInfo *type); +PUREFUNC int32_t Channel$compare(Channel_t **x, Channel_t **y, const TypeInfo *type); +PUREFUNC bool Channel$equal(Channel_t **x, Channel_t **y, const TypeInfo *type); +Text_t Channel$as_text(Channel_t **channel, bool colorize, const TypeInfo *type); + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/stdlib/datatypes.h b/stdlib/datatypes.h new file mode 100644 index 00000000..8c13d3c4 --- /dev/null +++ b/stdlib/datatypes.h @@ -0,0 +1,91 @@ +#pragma once + +// Common datastructures (arrays, tables, closures) + +#include +#include +#include +#include + +#define 
ARRAY_LENGTH_BITS 42 +#define ARRAY_FREE_BITS 6 +#define ARRAY_REFCOUNT_BITS 3 +#define ARRAY_STRIDE_BITS 12 + +#define MAX_FOR_N_BITS(N) ((1<<(N))-1) +#define ARRAY_MAX_STRIDE MAX_FOR_N_BITS(ARRAY_STRIDE_BITS-1) +#define ARRAY_MIN_STRIDE (~MAX_FOR_N_BITS(ARRAY_STRIDE_BITS-1)) +#define ARRAY_MAX_DATA_REFCOUNT MAX_FOR_N_BITS(ARRAY_REFCOUNT_BITS) +#define ARRAY_MAX_FREE_ENTRIES MAX_FOR_N_BITS(ARRAY_FREE_BITS) + +typedef union { + int64_t small; + mpz_t *big; +} Int_t; + +typedef struct { + void *data; + // All of the following fields add up to 64 bits, which means that array + // structs can be passed in two 64-bit registers. C will handle doing the + // bit arithmetic to extract the necessary values, which is cheaper than + // spilling onto the stack and needing to retrieve data from the stack. + int64_t length:ARRAY_LENGTH_BITS; + uint8_t free:ARRAY_FREE_BITS; + bool atomic:1; + uint8_t data_refcount:ARRAY_REFCOUNT_BITS; + int16_t stride:ARRAY_STRIDE_BITS; +} Array_t; + +typedef struct { + uint32_t occupied:1, index:31; + uint32_t next_bucket; +} bucket_t; + +#define TABLE_MAX_BUCKETS 0x7fffffff +#define TABLE_MAX_DATA_REFCOUNT 3 + +typedef struct { + uint32_t count:31, last_free:31; + uint8_t data_refcount:2; + bucket_t buckets[]; +} bucket_info_t; + +typedef struct table_s { + Array_t entries; + bucket_info_t *bucket_info; + struct table_s *fallback; +} Table_t; + +typedef struct { + void *fn, *userdata; +} Closure_t; + +typedef struct Range_s { + Int_t first, last, step; +} Range_t; + +typedef struct { + Array_t items; + pthread_mutex_t mutex; + pthread_cond_t cond; + int64_t max_size; +} Channel_t; + +enum text_type { TEXT_SHORT_ASCII, TEXT_ASCII, TEXT_SHORT_GRAPHEMES, TEXT_GRAPHEMES, TEXT_SUBTEXT }; + +typedef struct Text_s { + int64_t length; // Number of grapheme clusters + uint64_t hash:61; + uint8_t tag:3; + union { + char short_ascii[8]; + const char *ascii; + int32_t short_graphemes[2]; + const int32_t *graphemes; + struct Text_s *subtexts; + }; +} 
Text_t; + +#define Pattern_t Text_t + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/stdlib/files.c b/stdlib/files.c new file mode 100644 index 00000000..4a4220e7 --- /dev/null +++ b/stdlib/files.c @@ -0,0 +1,322 @@ +// +// files.c - Implementation of some file loading functionality. +// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "files.h" +#include "util.h" + +static const int tabstop = 4; + +public char *resolve_path(const char *path, const char *relative_to, const char *system_path) +{ + if (!relative_to || streq(relative_to, "/dev/stdin")) relative_to = "."; + if (!path || strlen(path) == 0) return NULL; + + // Resolve the path to an absolute path, assuming it's relative to the file + // it was found in: + char buf[PATH_MAX] = {0}; + if (streq(path, "~") || starts_with(path, "~/")) { + char *resolved = realpath(heap_strf("%s%s", getenv("HOME"), path+1), buf); + if (resolved) return GC_strdup(resolved); + } else if (streq(path, ".") || starts_with(path, "./") || starts_with(path, "../")) { + char *relative_dir = dirname(GC_strdup(relative_to)); + char *resolved = realpath(heap_strf("%s/%s", relative_dir, path), buf); + if (resolved) return GC_strdup(resolved); + } else if (path[0] == '/') { + // Absolute path: + char *resolved = realpath(path, buf); + if (resolved) return GC_strdup(resolved); + } else { + // Relative path: + char *relative_dir = dirname(GC_strdup(relative_to)); + if (!system_path) system_path = "."; + char *copy = GC_strdup(system_path); + for (char *dir, *pos = copy; (dir = strsep(&pos, ":")); ) { + if (dir[0] == '/') { + char *resolved = realpath(heap_strf("%s/%s", dir, path), buf); + if (resolved) return GC_strdup(resolved); + } else if (dir[0] == '~' && (dir[1] == '\0' || dir[1] == '/')) { + char *resolved = realpath(heap_strf("%s%s/%s", getenv("HOME"), dir+1, path), buf); + if (resolved) return GC_strdup(resolved); + } else if (streq(dir, ".") || 
strncmp(dir, "./", 2) == 0) { + char *resolved = realpath(heap_strf("%s/%s", relative_dir, path), buf); + if (resolved) return GC_strdup(resolved); + } else if (streq(dir, ".") || streq(dir, "..") || strncmp(dir, "./", 2) == 0 || strncmp(dir, "../", 3) == 0) { + char *resolved = realpath(heap_strf("%s/%s/%s", relative_dir, dir, path), buf); + if (resolved) return GC_strdup(resolved); + } else { + char *resolved = realpath(heap_strf("%s/%s", dir, path), buf); + if (resolved) return GC_strdup(resolved); + } + } + } + return NULL; +} + +public char *file_base_name(const char *path) +{ + const char *slash = strrchr(path, '/'); + if (slash) path = slash + 1; + assert(!isdigit(*path)); + const char *end = strchrnul(path, '.'); + size_t len = (size_t)(end - path); + char *buf = GC_MALLOC_ATOMIC(len+1); + strncpy(buf, path, len); + buf[len] = '\0'; + for (char *p = buf; *p; p++) { + if (!isalnum(*p)) + *p = '_'; + } + return buf; +} + +static file_t *_load_file(const char* filename, FILE *file) +{ + if (!file) return NULL; + + file_t *ret = new(file_t, .filename=filename); + + size_t file_size = 0, line_cap = 0; + char *file_buf = NULL, *line_buf = NULL; + FILE *mem = open_memstream(&file_buf, &file_size); + int64_t line_len = 0; + while ((line_len = getline(&line_buf, &line_cap, file)) >= 0) { + if (ret->line_capacity <= ret->num_lines) + ret->line_offsets = GC_REALLOC(ret->line_offsets, sizeof(int64_t[ret->line_capacity += 32])); + ret->line_offsets[ret->num_lines++] = (int64_t)file_size; + fwrite(line_buf, sizeof(char), (size_t)line_len, mem); + fflush(mem); + } + fclose(file); + + char *copy = GC_MALLOC_ATOMIC(file_size+1); + memcpy(copy, file_buf, file_size); + copy[file_size] = '\0'; + ret->text = copy; + ret->len = (int64_t)file_size; + fclose(mem); + + free(file_buf); + ret->relative_filename = filename; + if (filename && filename[0] != '<' && !streq(filename, "/dev/stdin")) { + filename = resolve_path(filename, ".", "."); + // Convert to relative path (if 
applicable)
+        char buf[PATH_MAX];
+        char *cwd = getcwd(buf, sizeof(buf));
+        // resolve_path() returns NULL when the path cannot be resolved, and
+        // getcwd() returns NULL on failure. In either case keep the filename
+        // exactly as it was given instead of passing NULL to strlen()/strncmp().
+        if (cwd && filename) {
+            size_t cwd_len = strlen(cwd);
+            if (strncmp(cwd, filename, cwd_len) == 0 && filename[cwd_len] == '/')
+                ret->relative_filename = &filename[cwd_len+1];
+        }
+    }
+    return ret;
+}
+
+//
+// Read an entire file into memory.
+// An empty filename means "read from stdin".
+//
+public file_t *load_file(const char* filename)
+{
+    FILE *file = filename[0] ? fopen(filename, "r") : stdin;
+    return _load_file(filename, file);
+}
+
+//
+// Create a virtual file from a string.
+//
+public file_t *spoof_file(const char* filename, const char *text)
+{
+    FILE *file = fmemopen((char*)text, strlen(text)+1, "r");
+    return _load_file(filename, file);
+}
+
+//
+// Given a pointer, determine which line number it points to (1-indexed)
+// Returns 0 if the pointer is before the start of the file's text.
+//
+public int64_t get_line_number(file_t *f, const char *p)
+{
+    // Binary search:
+    int64_t lo = 0, hi = (int64_t)f->num_lines-1;
+    if (p < f->text) return 0;
+    int64_t offset = (int64_t)(p - f->text);
+    while (lo <= hi) {
+        int64_t mid = (lo + hi) / 2;
+        int64_t line_offset = f->line_offsets[mid];
+        if (line_offset == offset)
+            return mid + 1;
+        else if (line_offset < offset)
+            lo = mid + 1;
+        else if (line_offset > offset)
+            hi = mid - 1;
+    }
+    return lo; // Return the line number whose line starts closest before p
+}
+
+//
+// Given a pointer, determine which line column it points to.
+// +public int64_t get_line_column(file_t *f, const char *p) +{ + int64_t line_no = get_line_number(f, p); + int64_t line_offset = f->line_offsets[line_no-1]; + return 1 + (int64_t)(p - (f->text + line_offset)); +} + +// +// Return a pointer to the line with the specified line number (1-indexed) +// +public const char *get_line(file_t *f, int64_t line_number) +{ + if (line_number == 0 || line_number > (int64_t)f->num_lines) return NULL; + int64_t line_offset = f->line_offsets[line_number-1]; + return f->text + line_offset; +} + +// +// Return a value like /foo:line:col +// +public const char *get_file_pos(file_t *f, const char *p) +{ + return heap_strf("%s:%ld:%ld", f->filename, get_line_number(f, p), get_line_column(f, p)); +} + +static int fputc_column(FILE *out, char c, char print_char, int *column) +{ + int printed = 0; + if (print_char == '\t') print_char = ' '; + if (c == '\t') { + for (int to_fill = tabstop - (*column % tabstop); to_fill > 0; --to_fill) { + printed += fputc(print_char, out); + ++*column; + } + } else { + printed += fputc(print_char, out); + ++*column; + } + return printed; +} + +// +// Print a span from a file +// +public int highlight_error(file_t *file, const char *start, const char *end, const char *hl_color, int64_t context_lines, bool use_color) +{ + if (!file) return 0; + + // Handle spans that come from multiple files: + if (start < file->text || start > file->text + file->len) + start = end; + if (end < file->text || end > file->text + file->len) + end = start; + // Just in case neither end of the span came from this file: + if (end < file->text || end > file->text + file->len) + start = end = file->text; + + const char *lineno_fmt, *normal_color, *empty_marker; + bool print_carets = false; + int printed = 0; + if (use_color) { + lineno_fmt = "\x1b[0;2m%*lu\x1b(0\x78\x1b(B\x1b[m "; + normal_color = "\x1b[m"; + empty_marker = "\x1b(0\x61\x1b(B"; + printed += fprintf(stderr, "\x1b[33;4;1m%s\x1b[m\n", file->relative_filename); + } else 
{ + lineno_fmt = "%*lu| "; + hl_color = ""; + normal_color = ""; + empty_marker = " "; + print_carets = true; + printed += fprintf(stderr, "%s\n", file->relative_filename); + } + + if (context_lines == 0) + return fprintf(stderr, "%s%.*s%s", hl_color, (int)(end - start), start, normal_color); + + int64_t start_line = get_line_number(file, start), + end_line = get_line_number(file, end); + + int64_t first_line = start_line - (context_lines - 1), + last_line = end_line + (context_lines - 1); + + if (first_line < 1) first_line = 1; + if (last_line > file->num_lines) last_line = file->num_lines; + + int digits = 1; + for (int64_t i = last_line; i > 0; i /= 10) ++digits; + + for (int64_t line_no = first_line; line_no <= last_line; ++line_no) { + if (line_no > first_line + 5 && line_no < last_line - 5) { + if (use_color) + printed += fprintf(stderr, "\x1b[0;2;3;4m ... %ld lines omitted ... \x1b[m\n", (last_line - first_line) - 11); + else + printed += fprintf(stderr, " ... %ld lines omitted ...\n", (last_line - first_line) - 11); + line_no = last_line - 6; + continue; + } + + printed += fprintf(stderr, lineno_fmt, digits, line_no); + const char *line = get_line(file, line_no); + if (!line) break; + + int column = 0; + const char *p = line; + // Before match + for (; *p && *p != '\r' && *p != '\n' && p < start; ++p) + printed += fputc_column(stderr, *p, *p, &column); + + // Zero-width matches + if (p == start && start == end) { + printed += fprintf(stderr, "%s%s%s", hl_color, empty_marker, normal_color); + column += 1; + } + + // Inside match + if (start <= p && p < end) { + printed += fputs(hl_color, stderr); + for (; *p && *p != '\r' && *p != '\n' && p < end; ++p) + printed += fputc_column(stderr, *p, *p, &column); + printed += fputs(normal_color, stderr); + } + + // After match + for (; *p && *p != '\r' && *p != '\n'; ++p) + printed += fputc_column(stderr, *p, *p, &column); + + printed += fprintf(stderr, "\n"); + + const char *eol = strchrnul(line, '\n'); + if 
(print_carets && start >= line && start < eol && line <= start) { + for (int num = 0; num < digits; num++) + printed += fputc(' ', stderr); + printed += fputs(": ", stderr); + int col = 0; + for (const char *sp = line; *sp && *sp != '\n'; ++sp) { + char print_char; + if (sp < start) + print_char = ' '; + else if (sp == start && sp == end) + print_char = '^'; + else if (sp >= start && sp < end) + print_char = '-'; + else + print_char = ' '; + printed += fputc_column(stderr, *sp, print_char, &col); + } + printed += fputs("\n", stderr); + } + } + fflush(stderr); + return printed; +} + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/stdlib/files.h b/stdlib/files.h new file mode 100644 index 00000000..f650f78e --- /dev/null +++ b/stdlib/files.h @@ -0,0 +1,35 @@ +// +// files.h - Definitions of an API for loading files. +// +#pragma once + +#include +#include +#include + +typedef struct { + const char *filename, *relative_filename; + const char *text; + int64_t len; + int64_t num_lines, line_capacity; + int64_t *line_offsets; +} file_t; + +char *resolve_path(const char *path, const char *relative_to, const char *system_path); +__attribute__((pure, nonnull)) +char *file_base_name(const char *path); +__attribute__((nonnull)) +file_t *load_file(const char *filename); +__attribute__((nonnull, returns_nonnull)) +file_t *spoof_file(const char *filename, const char *text); +__attribute__((pure, nonnull)) +int64_t get_line_number(file_t *f, const char *p); +__attribute__((pure, nonnull)) +int64_t get_line_column(file_t *f, const char *p); +__attribute__((pure, nonnull)) +const char *get_line(file_t *f, int64_t line_number); +__attribute__((pure, nonnull)) +const char *get_file_pos(file_t *f, const char *p); +int highlight_error(file_t *file, const char *start, const char *end, const char *hl_color, int64_t context_lines, bool use_color); + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/stdlib/functiontype.c b/stdlib/functiontype.c new file mode 100644 
index 00000000..251a01ed --- /dev/null +++ b/stdlib/functiontype.c @@ -0,0 +1,35 @@ +// Logic for handling function type values + +#include "datatypes.h" +#include "tables.h" +#include "text.h" +#include "types.h" +#include "util.h" + +static Table_t function_names = {}; + +public void register_function(void *fn, Text_t name) +{ + Table$set(&function_names, &fn, &name, Table$info(Function$info("???"), &Text$info)); +} + +public Text_t *get_function_name(void *fn) +{ + return Table$get(function_names, &fn, Table$info(Function$info("???"), &Text$info)); +} + +public Text_t Func$as_text(const void *fn, bool colorize, const TypeInfo *type) +{ + (void)fn; + Text_t text = Text$from_str(type->FunctionInfo.type_str); + if (fn) { + Text_t *name = get_function_name(*(void**)fn); + if (name) + text = *name; + } + if (fn && colorize) + text = Text$concat(Text("\x1b[32;1m"), text, Text("\x1b[m")); + return text; +} + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/stdlib/functiontype.h b/stdlib/functiontype.h new file mode 100644 index 00000000..e3feb03e --- /dev/null +++ b/stdlib/functiontype.h @@ -0,0 +1,9 @@ +#pragma once + +// Logic for handling function type values + +void register_function(void *fn, Text_t name); +Text_t *get_function_name(void *fn); +Text_t Func$as_text(const void *fn, bool colorize, const TypeInfo *type); + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/stdlib/integers.c b/stdlib/integers.c new file mode 100644 index 00000000..ef588984 --- /dev/null +++ b/stdlib/integers.c @@ -0,0 +1,490 @@ +// Integer type infos and methods +#include +#include +#include +#include +#include +#include + +#include "arrays.h" +#include "datatypes.h" +#include "integers.h" +#include "optionals.h" +#include "siphash.h" +#include "text.h" +#include "types.h" + +static gmp_randstate_t Int_rng = {}; + +public void Int$init_random(long seed) { + gmp_randinit_default(Int_rng); + gmp_randseed_ui(Int_rng, (unsigned long)seed); +} + +public Text_t 
Int$value_as_text(Int_t i) { + if (__builtin_expect(i.small & 1, 1)) { + return Text$format("%ld", (i.small)>>2); + } else { + char *str = mpz_get_str(NULL, 10, *i.big); + return Text$from_str(str); + } +} + +public Text_t Int$as_text(const Int_t *i, bool colorize, const TypeInfo *type) { + (void)type; + if (!i) return Text("Int"); + + Text_t text = Int$value_as_text(*i); + if (colorize) text = Text$concat(Text("\x1b[35m"), text, Text("\x1b[m")); + return text; +} + +public PUREFUNC int32_t Int$compare(const Int_t *x, const Int_t *y, const TypeInfo *type) { + (void)type; + if (__builtin_expect(((x->small | y->small) & 1) == 0, 0)) + return x->big == y->big ? 0 : mpz_cmp(*x->big, *y->big); + return (x->small > y->small) - (x->small < y->small); +} + +public PUREFUNC int32_t Int$compare_value(const Int_t x, const Int_t y) { + if (__builtin_expect(((x.small | y.small) & 1) == 0, 0)) + return x.big == y.big ? 0 : mpz_cmp(*x.big, *y.big); + return (x.small > y.small) - (x.small < y.small); +} + +public PUREFUNC bool Int$equal(const Int_t *x, const Int_t *y, const TypeInfo *type) { + (void)type; + return x->small == y->small || (__builtin_expect(((x->small | y->small) & 1) == 0, 0) && mpz_cmp(*x->big, *y->big) == 0); +} + +public PUREFUNC bool Int$equal_value(const Int_t x, const Int_t y) { + return x.small == y.small || (__builtin_expect(((x.small | y.small) & 1) == 0, 0) && mpz_cmp(*x.big, *y.big) == 0); +} + +public PUREFUNC uint64_t Int$hash(const Int_t *x, const TypeInfo *type) { + (void)type; + if (__builtin_expect(x->small & 1, 1)) { + int64_t i = (x->small>>2); + return siphash24((void*)&i, sizeof(i)); + } else { + char *str = mpz_get_str(NULL, 16, *x->big); + return siphash24((void*)str, strlen(str)); + } +} + +public Text_t Int$format(Int_t i, Int_t digits_int) { + int64_t digits = Int_to_Int64(digits_int, false); + if (__builtin_expect(i.small & 1, 1)) { + return Text$format("%0.*ld", digits, (i.small)>>2); + } else { + char *str = mpz_get_str(NULL, 10, 
*i.big);
+        bool negative = (str[0] == '-');
+        // The padding width counts digits only (as the "%0.*ld" precision does
+        // for small ints), so the leading '-' is not counted as a digit.
+        int64_t needed_zeroes = digits - (int64_t)strlen(str + (negative ? 1 : 0));
+        if (needed_zeroes <= 0)
+            return Text$from_str(str);
+
+        // GC_MALLOC_ATOMIC() memory is not cleared, so reserve room for and
+        // write the NUL terminator that Text$from_str() needs.
+        char *zeroes = GC_MALLOC_ATOMIC((size_t)(needed_zeroes) + 1);
+        memset(zeroes, '0', (size_t)(needed_zeroes));
+        zeroes[needed_zeroes] = '\0';
+        if (negative)
+            return Text$concat(Text("-"), Text$from_str(zeroes), Text$from_str(str + 1));
+        else
+            return Text$concat(Text$from_str(zeroes), Text$from_str(str));
+    }
+}
+
+// Format an integer as hexadecimal text, zero-padded to at least `digits_int`
+// digits, optionally in uppercase and/or with a "0x" prefix. A negative value
+// is rendered as "-" followed by the formatting of its negation.
+public Text_t Int$hex(Int_t i, Int_t digits_int, bool uppercase, bool prefix) {
+    if (Int$is_negative(i))
+        return Text$concat(Text("-"), Int$hex(Int$negative(i), digits_int, uppercase, prefix));
+
+    int64_t digits = Int_to_Int64(digits_int, false);
+    if (__builtin_expect(i.small & 1, 1)) {
+        const char *hex_fmt = uppercase ? (prefix ? "0x%0.*lX" : "%0.*lX") : (prefix ? "0x%0.*lx" : "%0.*lx");
+        // A '*' precision consumes an `int` argument, not an int64_t.
+        return Text$format(hex_fmt, (int)digits, (i.small)>>2);
+    } else {
+        char *str = mpz_get_str(NULL, 16, *i.big);
+        if (uppercase) {
+            for (char *c = str; *c; c++)
+                *c = (char)toupper(*c);
+        }
+        int64_t needed_zeroes = digits - (int64_t)strlen(str);
+        if (needed_zeroes <= 0)
+            return prefix ? Text$concat(Text("0x"), Text$from_str(str)) : Text$from_str(str);
+
+        // GC_MALLOC_ATOMIC() memory is not cleared: terminate the padding explicitly.
+        char *zeroes = GC_MALLOC_ATOMIC((size_t)(needed_zeroes) + 1);
+        memset(zeroes, '0', (size_t)(needed_zeroes));
+        zeroes[needed_zeroes] = '\0';
+        if (prefix)
+            return Text$concat(Text("0x"), Text$from_str(zeroes), Text$from_str(str));
+        else
+            return Text$concat(Text$from_str(zeroes), Text$from_str(str));
+    }
+}
+
+// Format an integer as octal text, zero-padded to at least `digits_int`
+// digits, optionally with a "0o" prefix. A negative value is rendered as "-"
+// followed by the formatting of its negation.
+public Text_t Int$octal(Int_t i, Int_t digits_int, bool prefix) {
+    if (Int$is_negative(i))
+        return Text$concat(Text("-"), Int$octal(Int$negative(i), digits_int, prefix));
+
+    int64_t digits = Int_to_Int64(digits_int, false);
+    if (__builtin_expect(i.small & 1, 1)) {
+        const char *octal_fmt = prefix ? "0o%0.*lo" : "%0.*lo";
+        // A '*' precision consumes an `int` argument, not an int64_t.
+        return Text$format(octal_fmt, (int)digits, (i.small)>>2);
+    } else {
+        char *str = mpz_get_str(NULL, 8, *i.big);
+        int64_t needed_zeroes = digits - (int64_t)strlen(str);
+        if (needed_zeroes <= 0)
+            return prefix ? Text$concat(Text("0o"), Text$from_str(str)) : Text$from_str(str);
+
+        // GC_MALLOC_ATOMIC() memory is not cleared: terminate the padding explicitly.
+        char *zeroes = GC_MALLOC_ATOMIC((size_t)(needed_zeroes) + 1);
+        memset(zeroes, '0', (size_t)(needed_zeroes));
+        zeroes[needed_zeroes] = '\0';
+        if (prefix)
+            return Text$concat(Text("0o"), Text$from_str(zeroes), Text$from_str(str));
+        else
+            return Text$concat(Text$from_str(zeroes), Text$from_str(str));
+    }
+}
+
+// Addition through GMP. `x` is loaded into an mpz_t; a small `y` (low bit of
+// `small` set, value held in `small >> 2`) is added or subtracted by its
+// magnitude, and a big `y` is added with mpz_add().
+public Int_t Int$slow_plus(Int_t x, Int_t y) {
+    mpz_t result;
+    mpz_init_set_int(result, x);
+    if (y.small & 1) {
+        if (y.small < 0)
+            mpz_sub_ui(result, result, (uint64_t)(-(y.small >> 2)));
+        else
+            mpz_add_ui(result, result, (uint64_t)(y.small >> 2));
+    } else {
+        mpz_add(result, result, *y.big);
+    }
+    return Int$from_mpz(result);
+}
+
+// Subtraction through GMP; mirrors Int$slow_plus() with the operations swapped.
+public Int_t Int$slow_minus(Int_t x, Int_t y) {
+    mpz_t result;
+    mpz_init_set_int(result, x);
+    if (y.small & 1) {
+        if (y.small < 0)
+            mpz_add_ui(result, result, (uint64_t)(-(y.small >> 2)));
+        else
+            mpz_sub_ui(result, result, (uint64_t)(y.small >> 2));
+    } else {
+        mpz_sub(result, result, *y.big);
+    }
+    return Int$from_mpz(result);
+}
+
+// Multiplication through GMP, using mpz_mul_si() when `y` is a small int.
+public Int_t Int$slow_times(Int_t x, Int_t y) {
+    mpz_t result;
+    mpz_init_set_int(result, x);
+    if (y.small & 1)
+        mpz_mul_si(result, result, y.small >> 2);
+    else
+        mpz_mul(result, result, *y.big);
+    return Int$from_mpz(result);
+}
+
+public Int_t Int$slow_divided_by(Int_t dividend, Int_t divisor) {
+    // Euclidean division, see: https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/divmodnote-letter.pdf
+    mpz_t quotient, remainder;
+    mpz_init_set_int(quotient, dividend);
+    mpz_init_set_int(remainder, divisor);
+    mpz_tdiv_qr(quotient, remainder, quotient, remainder);
+    if (mpz_sgn(remainder) < 0) {
+        bool d_positive = __builtin_expect(divisor.small & 1, 1) ? 
divisor.small > 0x1 : mpz_sgn(*divisor.big) > 0; + if (d_positive) + mpz_sub_ui(quotient, quotient, 1); + else + mpz_add_ui(quotient, quotient, 1); + } + return Int$from_mpz(quotient); +} + +public Int_t Int$slow_modulo(Int_t x, Int_t modulus) +{ + mpz_t result; + mpz_init_set_int(result, x); + mpz_t divisor; + mpz_init_set_int(divisor, modulus); + mpz_mod(result, result, divisor); + return Int$from_mpz(result); +} + +public Int_t Int$slow_modulo1(Int_t x, Int_t modulus) +{ + mpz_t result; + mpz_init_set_int(result, x); + mpz_sub_ui(result, result, 1); + mpz_t divisor; + mpz_init_set_int(divisor, modulus); + mpz_mod(result, result, divisor); + mpz_add_ui(result, result, 1); + return Int$from_mpz(result); +} + +public Int_t Int$slow_left_shifted(Int_t x, Int_t y) +{ + mp_bitcnt_t bits = (mp_bitcnt_t)Int_to_Int64(y, false); + mpz_t result; + mpz_init_set_int(result, x); + mpz_mul_2exp(result, result, bits); + return Int$from_mpz(result); +} + +public Int_t Int$slow_right_shifted(Int_t x, Int_t y) +{ + mp_bitcnt_t bits = (mp_bitcnt_t)Int_to_Int64(y, false); + mpz_t result; + mpz_init_set_int(result, x); + mpz_tdiv_q_2exp(result, result, bits); + return Int$from_mpz(result); +} + +public Int_t Int$slow_bit_and(Int_t x, Int_t y) +{ + mpz_t result; + mpz_init_set_int(result, x); + mpz_t y_mpz; + mpz_init_set_int(y_mpz, y); + mpz_and(result, result, y_mpz); + return Int$from_mpz(result); +} + +public Int_t Int$slow_bit_or(Int_t x, Int_t y) +{ + mpz_t result; + mpz_init_set_int(result, x); + mpz_t y_mpz; + mpz_init_set_int(y_mpz, y); + mpz_ior(result, result, y_mpz); + return Int$from_mpz(result); +} + +public Int_t Int$slow_bit_xor(Int_t x, Int_t y) +{ + mpz_t result; + mpz_init_set_int(result, x); + mpz_t y_mpz; + mpz_init_set_int(y_mpz, y); + mpz_xor(result, result, y_mpz); + return Int$from_mpz(result); +} + +public Int_t Int$slow_negated(Int_t x) +{ + mpz_t result; + mpz_init_set_int(result, x); + mpz_neg(result, result); + mpz_sub_ui(result, result, 1); + return 
Int$from_mpz(result);
+}
+
+// Arithmetic negation (-x): small ints are re-tagged inline, big ints go through GMP.
+public Int_t Int$slow_negative(Int_t x)
+{
+    if (__builtin_expect((x.small & 1), 1))
+        return (Int_t){.small=4*-((x.small)>>2) + 1};
+
+    mpz_t result;
+    mpz_init_set_int(result, x);
+    mpz_neg(result, result);
+    return Int$from_mpz(result);
+}
+
+// Absolute value of x.
+public Int_t Int$abs(Int_t x)
+{
+    if (__builtin_expect((x.small & 1), 1))
+        return (Int_t){.small=4*labs((x.small)>>2) + 1};
+
+    mpz_t result;
+    mpz_init_set_int(result, x);
+    mpz_abs(result, result);
+    return Int$from_mpz(result);
+}
+
+// base raised to a non-negative exponent; a negative exponent is a failure.
+public Int_t Int$power(Int_t base, Int_t exponent)
+{
+    int64_t exp = Int_to_Int64(exponent, false);
+    if (__builtin_expect(exp < 0, 0))
+        fail("Cannot take a negative power of an integer!");
+    mpz_t result;
+    mpz_init_set_int(result, base);
+    mpz_pow_ui(result, result, (uint64_t)exp);
+    return Int$from_mpz(result);
+}
+
+// Truncated integer square root, computed by mpz_sqrt.
+public Int_t Int$sqrt(Int_t i)
+{
+    mpz_t result;
+    mpz_init_set_int(result, i);
+    mpz_sqrt(result, result);
+    return Int$from_mpz(result);
+}
+
+// Uniform random Int in the inclusive range [min, max]; fails when min > max.
+public Int_t Int$random(Int_t min, Int_t max) {
+    int32_t cmp = Int$compare_value(min, max);
+    if (cmp > 0) {
+        Text_t min_text = Int$as_text(&min, false, &Int$info), max_text = Int$as_text(&max, false, &Int$info);
+        fail("Random minimum value (%k) is larger than the maximum value (%k)",
+             &min_text, &max_text);
+    }
+    if (cmp == 0) return min;
+
+    mpz_t range_size;
+    mpz_init_set_int(range_size, max);
+    if (min.small & 1) {
+        mpz_t min_mpz;
+        mpz_init_set_si(min_mpz, min.small >> 2);
+        mpz_sub(range_size, range_size, min_mpz);
+    } else {
+        mpz_sub(range_size, range_size, *min.big);
+    }
+    // mpz_urandomm() draws from [0, n-1], so n must be (max - min + 1) for
+    // `max` itself to be reachable (matching the fixed-width random functions)
+    mpz_add_ui(range_size, range_size, 1);
+
+    mpz_t r;
+    mpz_init(r);
+    mpz_urandomm(r, Int_rng, range_size);
+    return Int$plus(min, Int$from_mpz(r));
+}
+
+// Range from `from` to `to` with a step of +1 when ascending (or equal) and -1 when descending.
+public PUREFUNC Range_t Int$to(Int_t from, Int_t to) {
+    // I_small() gives the canonical tagged encoding; the former `(-1>>2)|1` set
+    // both low bits, which defeats the inline fast paths and word equality
+    return (Range_t){from, to, Int$compare_value(to, from) >= 0 ? I_small(1) : I_small(-1)};
+}
+
+// Parse an integer from a C string; "0x", "0o" and "0b" prefixes select base 16, 8 and 2.
+// Returns NULL_INT when GMP rejects the text.
+public Int_t Int$from_str(const char *str) {
+    mpz_t i;
+    int result;
+    if (strncmp(str, "0x", 2) == 0) {
+        result = mpz_init_set_str(i, str + 2, 16);
+    } else if (strncmp(str, "0o", 2) == 0) {
+        result = mpz_init_set_str(i, str + 2, 8);
+    } else if (strncmp(str, "0b", 2) == 0) {
+        result = mpz_init_set_str(i, str + 2, 2);
+    } else {
+        result = mpz_init_set_str(i, str, 10);
+    }
+    if (result != 0)
+        return NULL_INT;
+    return Int$from_mpz(i);
+}
+
+// Text_t front end for Int$from_str.
+public OptionalInt_t Int$from_text(Text_t text) {
+    return Int$from_str(Text$as_c_string(text));
+}
+
+// Probabilistic primality test with `reps` repetitions (at most 9999).
+public bool Int$is_prime(Int_t x, Int_t reps)
+{
+    mpz_t p;
+    mpz_init_set_int(p, x);
+    if (Int$compare_value(reps, I(9999)) > 0)
+        fail("Number of prime-test repetitions should not be above 9999");
+    int reps_int = Int_to_Int32(reps, false);
+    return (mpz_probab_prime_p(p, reps_int) != 0);
+}
+
+// Next prime after x, as found by mpz_nextprime.
+public Int_t Int$next_prime(Int_t x)
+{
+    mpz_t p;
+    mpz_init_set_int(p, x);
+    mpz_nextprime(p, p);
+    return Int$from_mpz(p);
+}
+
+// Previous prime before x; fails when mpz_prevprime reports there is none.
+public Int_t Int$prev_prime(Int_t x)
+{
+    mpz_t p;
+    mpz_init_set_int(p, x);
+    if (mpz_prevprime(p, p) == 0)
+        fail("There is no prime number before %k", (Text_t[1]){Int$as_text(&x, false, &Int$info)});
+    return Int$from_mpz(p);
+}
+
+// Metamethod table for Int
+public const TypeInfo Int$info = {
+    .size=sizeof(Int_t),
+    .align=__alignof__(Int_t),
+    .tag=CustomInfo,
+    .CustomInfo={
+        .compare=(void*)Int$compare,
+        .equal=(void*)Int$equal,
+        .hash=(void*)Int$hash,
+        .as_text=(void*)Int$as_text,
+    },
+};
+
+
+#define DEFINE_INT_TYPE(c_type, KindOfInt, fmt, min_val, max_val)\
+    public Text_t KindOfInt ## $as_text(const c_type *i, bool colorize, const TypeInfo *type) { \
+        (void)type; \
+        if (!i) return Text(#KindOfInt); \
+        return Text$format(colorize ? 
"\x1b[35m%" fmt "\x1b[m" : "%" fmt, *i); \
+    } \
+    /* Three-way comparison of two fixed-width values */ \
+    public PUREFUNC int32_t KindOfInt ## $compare(const c_type *x, const c_type *y, const TypeInfo *type) { \
+        (void)type; \
+        return (*x > *y) - (*x < *y); \
+    } \
+    public PUREFUNC bool KindOfInt ## $equal(const c_type *x, const c_type *y, const TypeInfo *type) { \
+        (void)type; \
+        return *x == *y; \
+    } \
+    /* format/hex/octal widen to Int and reuse the Int implementations */ \
+    public Text_t KindOfInt ## $format(c_type i, Int_t digits_int) { \
+        Int_t as_int = KindOfInt##_to_Int(i); \
+        return Int$format(as_int, digits_int); \
+    } \
+    public Text_t KindOfInt ## $hex(c_type i, Int_t digits_int, bool uppercase, bool prefix) { \
+        Int_t as_int = KindOfInt##_to_Int(i); \
+        return Int$hex(as_int, digits_int, uppercase, prefix); \
+    } \
+    public Text_t KindOfInt ## $octal(c_type i, Int_t digits_int, bool prefix) { \
+        Int_t as_int = KindOfInt##_to_Int(i); \
+        return Int$octal(as_int, digits_int, prefix); \
+    } \
+    /* Array of the value's bits as bools, most significant bit first */ \
+    public Array_t KindOfInt ## $bits(c_type x) { \
+        Array_t bit_array = (Array_t){.data=GC_MALLOC_ATOMIC(sizeof(bool[8*sizeof(c_type)])), .atomic=1, .stride=sizeof(bool), .length=8*sizeof(c_type)}; \
+        /* Start at the LAST element: the loop fills from the low bit backwards, */ \
+        /* so starting one past the end would overrun and leave index 0 unset */ \
+        bool *bits = (bool*)bit_array.data + sizeof(c_type)*8 - 1; \
+        for (size_t i = 0; i < 8*sizeof(c_type); i++) { \
+            *(bits--) = x & 1; \
+            x >>= 1; \
+        } \
+        return bit_array; \
+    } \
+    /* Uniform random value in [min, max], using rejection sampling to avoid modulo bias */ \
+    public c_type KindOfInt ## $random(c_type min, c_type max) { \
+        /* Widen explicitly: the narrower types would otherwise be passed as `int` to %ld */ \
+        if (min > max) fail("Random minimum value (%ld) is larger than the maximum value (%ld)", (int64_t)min, (int64_t)max); \
+        if (min == max) return min; \
+        if (min == min_val && max == max_val) { \
+            c_type r; \
+            arc4random_buf(&r, sizeof(r)); \
+            return r; \
+        } \
+        uint64_t range = (uint64_t)max - (uint64_t)min + 1; \
+        uint64_t min_r = -range % range; \
+        uint64_t r; \
+        for (;;) { \
+            arc4random_buf(&r, sizeof(r)); \
+            if (r >= min_r) break; \
+        } \
+        return (c_type)((uint64_t)min + (r % range)); \
+    } \
+    /* Range from `from` to `to`, stepping +1 when ascending (or equal) and -1 when descending */ \
+    public Range_t KindOfInt ## $to(c_type from, c_type to) { \
+        /* The step used to be `(1<<2)&1`, i.e. 0, for both directions */ \
+        return (Range_t){Int64_to_Int(from), Int64_to_Int(to), to >= from ? I_small(1) : I_small(-1)}; \
+    } \
+    /* Parse text as an Int, yielding null when it is unparseable or out of this type's range */ \
+    public PUREFUNC Optional ## KindOfInt ## _t KindOfInt ## $from_text(Text_t text) { \
+        OptionalInt_t full_int = Int$from_text(text); \
+        if (full_int.small == 0) return (Optional ## KindOfInt ## _t){.is_null=true}; \
+        if (Int$compare_value(full_int, I(min_val)) < 0) { \
+            return (Optional ## KindOfInt ## _t){.is_null=true}; \
+        } \
+        if (Int$compare_value(full_int, I(max_val)) > 0) { \
+            return (Optional ## KindOfInt ## _t){.is_null=true}; \
+        } \
+        return (Optional ## KindOfInt ## _t){.i=Int_to_ ## KindOfInt(full_int, true)}; \
+    } \
+    public const c_type KindOfInt##$min = min_val; \
+    public const c_type KindOfInt##$max = max_val; \
+    public const TypeInfo KindOfInt##$info = { \
+        .size=sizeof(c_type), \
+        .align=__alignof__(c_type), \
+        .tag=CustomInfo, \
+        .CustomInfo={.compare=(void*)KindOfInt##$compare, .as_text=(void*)KindOfInt##$as_text}, \
+    };
+
+DEFINE_INT_TYPE(int64_t, Int64, "ld_i64", INT64_MIN, INT64_MAX)
+DEFINE_INT_TYPE(int32_t, Int32, "d_i32", INT32_MIN, INT32_MAX)
+DEFINE_INT_TYPE(int16_t, Int16, "d_i16", INT16_MIN, INT16_MAX)
+DEFINE_INT_TYPE(int8_t, Int8, "d_i8", INT8_MIN, INT8_MAX)
+#undef DEFINE_INT_TYPE
+
+// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0
diff --git a/stdlib/integers.h b/stdlib/integers.h
new file mode 100644
index 00000000..1c0ab1cd
--- /dev/null
+++ b/stdlib/integers.h
@@ -0,0 +1,375 @@
+#pragma once
+
+// Integer type infos and methods
+
+#include
+#include
+#include
+#include
+
+#include "datatypes.h"
+#include "nums.h"
+#include "stdlib.h"
+#include "types.h"
+#include "util.h"
+
+#define Int64_t int64_t
+#define Int32_t int32_t
+#define Int16_t int16_t
+#define Int8_t int8_t
+#define I64(x) ((int64_t)x)
+#define I32(x) ((int32_t)x)
+#define I16(x) ((int16_t)x)
+#define I8(x) ((int8_t)x)
+
+#define DEFINE_INT_TYPE(c_type, type_name) \
+    typedef struct { \
+        c_type i; \
+        bool is_null:1; \
+    } Optional ## type_name ## _t; \
+    Text_t type_name ## $as_text(const 
c_type *i, bool colorize, const TypeInfo *type); \ + PUREFUNC int32_t type_name ## $compare(const c_type *x, const c_type *y, const TypeInfo *type); \ + PUREFUNC bool type_name ## $equal(const c_type *x, const c_type *y, const TypeInfo *type); \ + Text_t type_name ## $format(c_type i, Int_t digits); \ + Text_t type_name ## $hex(c_type i, Int_t digits, bool uppercase, bool prefix); \ + Text_t type_name ## $octal(c_type i, Int_t digits, bool prefix); \ + Array_t type_name ## $bits(c_type x); \ + c_type type_name ## $random(c_type min, c_type max); \ + Range_t type_name ## $to(c_type from, c_type to); \ + PUREFUNC Optional ## type_name ## _t type_name ## $from_text(Text_t text); \ + PUREFUNC static inline c_type type_name ## $clamped(c_type x, c_type min, c_type max) { \ + return x < min ? min : (x > max ? max : x); \ + } \ + extern const c_type type_name ## $min, type_name##$max; \ + extern const TypeInfo type_name ## $info; \ + static inline c_type type_name ## $divided_by(c_type D, c_type d) { \ + c_type q = D/d, r = D%d; \ + if (r < 0) { \ + if (d > 0) q = q-1; \ + else q = q+1; \ + } \ + return q; \ + } \ + static inline c_type type_name ## $modulo(c_type D, c_type d) { \ + c_type r = D%d; \ + if (r < 0) { \ + if (d > 0) r = r + d; \ + else r = r - d; \ + } \ + return r; \ + } \ + static inline c_type type_name ## $modulo1(c_type D, c_type d) { \ + return type_name ## $modulo(D-1, d) + 1; \ + } + +DEFINE_INT_TYPE(int64_t, Int64) +DEFINE_INT_TYPE(int32_t, Int32) +DEFINE_INT_TYPE(int16_t, Int16) +DEFINE_INT_TYPE(int8_t, Int8) +#undef DEFINE_INT_TYPE + +#define NULL_INT64 ((OptionalInt64_t){.is_null=true}) +#define NULL_INT32 ((OptionalInt32_t){.is_null=true}) +#define NULL_INT16 ((OptionalInt16_t){.is_null=true}) +#define NULL_INT8 ((OptionalInt8_t){.is_null=true}) + +#define Int64$abs(...) I64(labs(__VA_ARGS__)) +#define Int32$abs(...) I32(abs(__VA_ARGS__)) +#define Int16$abs(...) I16(abs(__VA_ARGS__)) +#define Int8$abs(...) 
I8(abs(__VA_ARGS__)) + +#define OptionalInt_t Int_t + +Text_t Int$as_text(const Int_t *i, bool colorize, const TypeInfo *type); +Text_t Int$value_as_text(Int_t i); +PUREFUNC uint64_t Int$hash(const Int_t *x, const TypeInfo *type); +PUREFUNC int32_t Int$compare(const Int_t *x, const Int_t *y, const TypeInfo *type); +PUREFUNC int32_t Int$compare_value(const Int_t x, const Int_t y); +PUREFUNC bool Int$equal(const Int_t *x, const Int_t *y, const TypeInfo *type); +PUREFUNC bool Int$equal_value(const Int_t x, const Int_t y); +Text_t Int$format(Int_t i, Int_t digits); +Text_t Int$hex(Int_t i, Int_t digits, bool uppercase, bool prefix); +Text_t Int$octal(Int_t i, Int_t digits, bool prefix); +void Int$init_random(long seed); +Int_t Int$random(Int_t min, Int_t max); +PUREFUNC Range_t Int$to(Int_t from, Int_t to); +OptionalInt_t Int$from_str(const char *str); +OptionalInt_t Int$from_text(Text_t text); +Int_t Int$abs(Int_t x); +Int_t Int$power(Int_t base, Int_t exponent); +Int_t Int$sqrt(Int_t i); + +#define BIGGEST_SMALL_INT ((1<<29)-1) + +#define Int$from_mpz(mpz) (\ + mpz_cmpabs_ui(mpz, BIGGEST_SMALL_INT) <= 0 ? ( \ + (Int_t){.small=(mpz_get_si(mpz)<<2)|1} \ + ) : ( \ + (Int_t){.big=memcpy(new(mpz_t), &mpz, sizeof(mpz_t))} \ + )) + +#define mpz_init_set_int(mpz, i) do { \ + if (__builtin_expect((i).small & 1, 1)) mpz_init_set_si(mpz, (i).small >> 2); \ + else mpz_init_set(mpz, *(i).big); \ +} while (0) + +#define I(i) ((int64_t)(i) == (int32_t)(i) ? 
((Int_t){.small=(int64_t)((uint64_t)(i)<<2)|1}) : Int64_to_Int(i)) +#define I_small(i) ((Int_t){.small=((uint64_t)(i)<<2)|1}) +#define I_is_zero(i) ((i).small == 1) + +Int_t Int$slow_plus(Int_t x, Int_t y); +Int_t Int$slow_minus(Int_t x, Int_t y); +Int_t Int$slow_times(Int_t x, Int_t y); +Int_t Int$slow_divided_by(Int_t x, Int_t y); +Int_t Int$slow_modulo(Int_t x, Int_t y); +Int_t Int$slow_modulo1(Int_t x, Int_t y); +Int_t Int$slow_left_shifted(Int_t x, Int_t y); +Int_t Int$slow_right_shifted(Int_t x, Int_t y); +Int_t Int$slow_bit_and(Int_t x, Int_t y); +Int_t Int$slow_bit_or(Int_t x, Int_t y); +Int_t Int$slow_bit_xor(Int_t x, Int_t y); +Int_t Int$slow_negative(Int_t x); +Int_t Int$slow_negated(Int_t x); +bool Int$is_prime(Int_t x, Int_t reps); +Int_t Int$next_prime(Int_t x); +Int_t Int$prev_prime(Int_t x); + +extern const TypeInfo Int$info; + +static inline Int_t Int$clamped(Int_t x, Int_t low, Int_t high) +{ + return (Int$compare(&x, &low, &Int$info) <= 0) ? low : (Int$compare(&x, &high, &Int$info) >= 0 ? high : x); +} + +// Fast-path inline versions for the common case where integer arithmetic is +// between two small ints. 
+ +static inline Int_t Int$plus(Int_t x, Int_t y) { + const int64_t z = (int64_t)((uint64_t)x.small + (uint64_t)y.small); + if (__builtin_expect(((z|2) == (int32_t)z), 1)) + return (Int_t){.small=(z-1)}; + return Int$slow_plus(x, y); +} + +static inline Int_t Int$minus(Int_t x, Int_t y) { + const int64_t z = (int64_t)(((uint64_t)x.small ^ 3) - (uint64_t)y.small); + if (__builtin_expect(((z & ~2) == (int32_t)z), 1)) + return (Int_t){.small=z}; + return Int$slow_minus(x, y); +} + +static inline Int_t Int$times(Int_t x, Int_t y) { + if (__builtin_expect(((x.small & y.small) & 1) != 0, 1)) { + const int64_t z = (x.small>>1) * (y.small>>1); + if (__builtin_expect(z == (int32_t)z, 1)) + return (Int_t){.small=z+1}; + } + return Int$slow_times(x, y); +} + +static inline Int_t Int$divided_by(Int_t x, Int_t y) { + if (__builtin_expect(((x.small & y.small) & 1) != 0, 1)) { + // Euclidean division, see: https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/divmodnote-letter.pdf + const int64_t D = (x.small>>2); + const int64_t d = (y.small>>2); + int64_t q = D/d; + int64_t r = D%d; + if (r < 0) { + if (d > 0) q = q-1; + else q = q+1; + } + if (__builtin_expect(q == (int32_t)q, 1)) + return (Int_t){.small=(q<<2)|1}; + } + return Int$slow_divided_by(x, y); +} + +static inline Int_t Int$modulo(Int_t x, Int_t y) { + if (__builtin_expect(((x.small & y.small) & 1) != 0, 1)) { + // Euclidean modulus, see: https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/divmodnote-letter.pdf + const int64_t D = (x.small>>2); + const int64_t d = (y.small>>2); + int64_t r = D%d; + if (r < 0) { + if (d > 0) r = r + d; + else r = r - d; + } + return (Int_t){.small=(r<<2)|1}; + } + return Int$slow_modulo(x, y); +} + +static inline Int_t Int$modulo1(Int_t x, Int_t y) { + if (__builtin_expect(((x.small & y.small) & 1) != 0, 1)) { + // Euclidean modulus, see: https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/divmodnote-letter.pdf + const int64_t D = 
(x.small>>2)-1;
+        const int64_t d = (y.small>>2);
+        int64_t r = D%d;
+        if (r < 0) {
+            if (d > 0) r = r + d;
+            else r = r - d;
+        }
+        return (Int_t){.small=((r+1)<<2)|1};
+    }
+    return Int$slow_modulo1(x, y);
+}
+
+// x << y. The inline path is only taken when the shift provably stays inside
+// 64-bit arithmetic; everything else is delegated to the GMP implementation.
+static inline Int_t Int$left_shifted(Int_t x, Int_t y) {
+    if (__builtin_expect(((x.small & y.small) & 1) != 0, 1)) {
+        const int64_t value = (x.small>>2);
+        const int64_t shift = (y.small>>2);
+        // A 32-bit value shifted by fewer than 30 bits (and then re-tagged) cannot
+        // overflow int64_t; larger or negative shift counts are undefined in C
+        if (__builtin_expect(value == (int32_t)value && shift >= 0 && shift < 30, 1)) {
+            const int64_t z = (value * ((int64_t)1 << shift)) * 4;
+            if (__builtin_expect(z == (int32_t)z, 1))
+                return (Int_t){.small=z+1};
+        }
+    }
+    return Int$slow_left_shifted(x, y);
+}
+
+// x >> y, with the same guard against out-of-range shift counts.
+static inline Int_t Int$right_shifted(Int_t x, Int_t y) {
+    if (__builtin_expect(((x.small & y.small) & 1) != 0, 1)) {
+        const int64_t shift = (y.small>>2);
+        if (__builtin_expect(shift >= 0 && shift < 64, 1)) {
+            const int64_t z = ((x.small>>2) >> shift) * 4;
+            if (__builtin_expect(z == (int32_t)z, 1))
+                return (Int_t){.small=z+1};
+        }
+    }
+    return Int$slow_right_shifted(x, y);
+}
+
+// Bitwise AND: the tag bit survives the AND only when both operands are small ints.
+static inline Int_t Int$bit_and(Int_t x, Int_t y) {
+    const int64_t z = x.small & y.small;
+    if (__builtin_expect((z & 1) == 1, 1))
+        return (Int_t){.small=z};
+    return Int$slow_bit_and(x, y);
+}
+
+// Bitwise OR
+static inline Int_t Int$bit_or(Int_t x, Int_t y) {
+    if (__builtin_expect(((x.small & y.small) & 1) == 1, 1))
+        return (Int_t){.small=(x.small | y.small)};
+    return Int$slow_bit_or(x, y);
+}
+
+// Bitwise XOR: the two tag bits cancel, so the tag is set again afterwards.
+static inline Int_t Int$bit_xor(Int_t x, Int_t y) {
+    if (__builtin_expect(((x.small & y.small) & 1) == 1, 1))
+        return (Int_t){.small=(x.small ^ y.small) | 1};
+    return Int$slow_bit_xor(x, y);
+}
+
+// Bitwise NOT: flipping the two low bits back restores the small-int tag.
+static inline Int_t Int$negated(Int_t x)
+{
+    if (__builtin_expect((x.small & 1), 1))
+        return (Int_t){.small=(~x.small) ^ 3};
+    return Int$slow_negated(x);
+}
+
+// Arithmetic negation (-x)
+static inline Int_t Int$negative(Int_t x)
+{
+    if (__builtin_expect((x.small & 1), 1))
+        return (Int_t){.small=((-((x.small)>>2))<<2) | 1};
+    return Int$slow_negative(x);
+}
+
+// True when x is below zero.
+static inline bool Int$is_negative(Int_t x)
+{
+    if (__builtin_expect((x.small & 1), 1))
+        return x.small < 0;
+    return Int$compare_value(x, I_small(0)) < 0;
+}
+
+// Conversion functions:
+
+// Convert a 64-bit integer to an Int, using the inline small encoding when the
+// tagged word fits in 32 bits and a GMP integer otherwise.
+static inline Int_t Int64_to_Int(int64_t i)
+{
+    // Test the range of `i` itself: checking `i<<2` afterwards misses values whose
+    // shift overflows and wraps back into range (e.g. 1<<62 wrapped to 0)
+    if (__builtin_expect(i >= INT32_MIN/4 && i <= INT32_MAX/4, 1))
+        return (Int_t){.small=(i * 4) + 1};
+    mpz_t result;
+    mpz_init_set_si(result, i);
+    return Int$from_mpz(result);
+}
+
+#define Int32_to_Int(i) Int64_to_Int(i)
+#define Int16_to_Int(i) Int64_to_Int(i)
+#define Int8_to_Int(i) Int64_to_Int(i)
+
+#pragma GCC diagnostic ignored "-Winline"
+// Narrowing conversions from Int: unless `truncate` is set, a value that does
+// not fit the target type is a failure.
+PUREFUNC static inline Int64_t Int_to_Int64(Int_t i, bool truncate) {
+    if (__builtin_expect(i.small & 1, 1))
+        return (int64_t)(i.small >> 2);
+    if (__builtin_expect(!truncate && !mpz_fits_slong_p(*i.big), 0))
+        fail("Integer is too big to fit in a 64-bit integer!");
+    return mpz_get_si(*i.big);
+}
+
+PUREFUNC static inline Int32_t Int_to_Int32(Int_t i, bool truncate) {
+    int64_t i64 = Int_to_Int64(i, truncate);
+    int32_t i32 = (int32_t)i64;
+    if (__builtin_expect(i64 != i32 && !truncate, 0))
+        fail("Integer is too big to fit in a 32-bit integer!");
+    return i32;
+}
+
+PUREFUNC static inline Int16_t Int_to_Int16(Int_t i, bool truncate) {
+    int64_t i64 = Int_to_Int64(i, truncate);
+    int16_t i16 = (int16_t)i64;
+    if (__builtin_expect(i64 != i16 && !truncate, 0))
+        fail("Integer is too big to fit in a 16-bit integer!");
+    return i16;
+}
+
+PUREFUNC static inline Int8_t Int_to_Int8(Int_t i, bool truncate) {
+    int64_t i64 = Int_to_Int64(i, truncate);
+    int8_t i8 = (int8_t)i64;
+    if (__builtin_expect(i64 != i8 && !truncate, 0))
+        fail("Integer is too big to fit in an 8-bit integer!");
+    return i8;
+}
+
+// Convert a double to an Int via mpz_init_set_d.
+PUREFUNC static inline Int_t Num_to_Int(double n)
+{
+    mpz_t result;
+    mpz_init_set_d(result, n);
+    return Int$from_mpz(result);
+}
+
+// Convert an Int to a double.
+PUREFUNC static inline double Int_to_Num(Int_t i)
+{
+    if (__builtin_expect(i.small & 1, 1))
+        return (double)(i.small >> 2);
+
+    return mpz_get_d(*i.big);
+}
+
+#define Int_to_Num32(i) (Num32_t)Int_to_Num(i)
+
+#define CONVERSION_FUNC(hi, lo) \
+    PUREFUNC static inline int##lo##_t Int##hi##_to_Int##lo(int##hi##_t i, bool truncate) { \
+        if (__builtin_expect(!truncate && (i != (int##lo##_t)i), 0)) \
+            fail("Cannot truncate 
the Int" #hi " %ld to an Int" #lo, (int64_t)i); \ + return (int##lo##_t)i; \ + } + +CONVERSION_FUNC(64, 32) +CONVERSION_FUNC(64, 16) +CONVERSION_FUNC(64, 8) +CONVERSION_FUNC(32, 16) +CONVERSION_FUNC(32, 8) +CONVERSION_FUNC(16, 8) +#undef CONVERSION_FUNC + +#pragma GCC diagnostic ignored "-Wfloat-equal" +#define CONVERSION_FUNC(num, int_type) \ + PUREFUNC static inline int_type##_t num##_to_##int_type(num##_t n, bool truncate) { \ + num##_t rounded = (num##_t)round((double)n); \ + if (__builtin_expect(!truncate && (num##_t)(int_type##_t)rounded != rounded, 0)) \ + fail("Cannot truncate the " #num " %g to an " #int_type, (double)rounded); \ + return (int_type##_t)rounded; \ + } + +CONVERSION_FUNC(Num, Int64) +CONVERSION_FUNC(Num, Int32) +CONVERSION_FUNC(Num, Int16) +CONVERSION_FUNC(Num, Int8) +CONVERSION_FUNC(Num32, Int64) +CONVERSION_FUNC(Num32, Int32) +CONVERSION_FUNC(Num32, Int16) +CONVERSION_FUNC(Num32, Int8) +#undef CONVERSION_FUNC + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/stdlib/memory.c b/stdlib/memory.c new file mode 100644 index 00000000..9d7dbc80 --- /dev/null +++ b/stdlib/memory.c @@ -0,0 +1,28 @@ +// Type info and methods for "Memory" opaque type +#include +#include +#include +#include +#include +#include +#include + +#include "memory.h" +#include "text.h" +#include "types.h" +#include "util.h" + +public Text_t Memory__as_text(const void *p, bool colorize, const TypeInfo *type) { + (void)type; + if (!p) return Text("Memory"); + return Text$format(colorize ? 
"\x1b[0;34;1mMemory<%p>\x1b[m" : "Memory<%p>", p); +} + +public const TypeInfo Memory$info = { + .size=0, + .align=0, + .tag=CustomInfo, + .CustomInfo={.as_text=(void*)Memory__as_text}, +}; + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/stdlib/memory.h b/stdlib/memory.h new file mode 100644 index 00000000..701ea68a --- /dev/null +++ b/stdlib/memory.h @@ -0,0 +1,13 @@ +#pragma once + +// Type info and methods for "Memory" opaque type + +#include +#include + +#include "types.h" + +extern const TypeInfo Memory$info; +Text_t Memory$as_text(const void *p, bool colorize, const TypeInfo *type); + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/stdlib/metamethods.c b/stdlib/metamethods.c new file mode 100644 index 00000000..9b0560ab --- /dev/null +++ b/stdlib/metamethods.c @@ -0,0 +1,124 @@ +// Metamethods are methods that all types share for hashing, equality, comparison, and textifying + +#include +#include + +#include "arrays.h" +#include "channels.h" +#include "functiontype.h" +#include "metamethods.h" +#include "optionals.h" +#include "pointers.h" +#include "siphash.h" +#include "tables.h" +#include "text.h" +#include "util.h" + + +PUREFUNC public uint64_t generic_hash(const void *obj, const TypeInfo *type) +{ + switch (type->tag) { + case TextInfo: return Text$hash((void*)obj); + case ArrayInfo: return Array$hash(obj, type); + case ChannelInfo: return Channel$hash((Channel_t**)obj, type); + case TableInfo: return Table$hash(obj, type); + case OptionalInfo: return is_null(obj, type->OptionalInfo.type) ? 
0 : generic_hash(obj, type->OptionalInfo.type); + case EmptyStructInfo: return 0; + case CustomInfo: case StructInfo: case EnumInfo: case CStringInfo: // These all share the same info + if (!type->CustomInfo.hash) + goto hash_data; + return type->CustomInfo.hash(obj, type); + case PointerInfo: case FunctionInfo: case TypeInfoInfo: case OpaqueInfo: default: { + hash_data:; + return siphash24((void*)obj, (size_t)(type->size)); + } + } +} + +PUREFUNC public int32_t generic_compare(const void *x, const void *y, const TypeInfo *type) +{ + if (x == y) return 0; + + switch (type->tag) { + case PointerInfo: case FunctionInfo: return Pointer$compare(x, y, type); + case TextInfo: return Text$compare(x, y); + case ArrayInfo: return Array$compare(x, y, type); + case ChannelInfo: return Channel$compare((Channel_t**)x, (Channel_t**)y, type); + case TableInfo: return Table$compare(x, y, type); + case OptionalInfo: { + bool x_is_null = is_null(x, type->OptionalInfo.type); + bool y_is_null = is_null(y, type->OptionalInfo.type); + if (x_is_null && y_is_null) return 0; + else if (x_is_null != y_is_null) return (int32_t)y_is_null - (int32_t)x_is_null; + else return generic_compare(x, y, type->OptionalInfo.type); + } + case EmptyStructInfo: return 0; + case CustomInfo: case StructInfo: case EnumInfo: case CStringInfo: // These all share the same info + if (!type->CustomInfo.compare) + goto compare_data; + return type->CustomInfo.compare(x, y, type); + case TypeInfoInfo: case OpaqueInfo: default: + compare_data: + return (int32_t)memcmp((void*)x, (void*)y, (size_t)(type->size)); + } +} + +PUREFUNC public bool generic_equal(const void *x, const void *y, const TypeInfo *type) +{ + if (x == y) return true; + + switch (type->tag) { + case PointerInfo: case FunctionInfo: return Pointer$equal(x, y, type); + case TextInfo: return Text$equal(x, y); + case ArrayInfo: return Array$equal(x, y, type); + case ChannelInfo: return Channel$equal((Channel_t**)x, (Channel_t**)y, type); + case TableInfo: 
return Table$equal(x, y, type); + case EmptyStructInfo: return true; + case OptionalInfo: { + bool x_is_null = is_null(x, type->OptionalInfo.type); + bool y_is_null = is_null(y, type->OptionalInfo.type); + if (x_is_null && y_is_null) return true; + else if (x_is_null != y_is_null) return false; + else return generic_equal(x, y, type->OptionalInfo.type); + } + case CustomInfo: case StructInfo: case EnumInfo: case CStringInfo: // These all share the same info + if (!type->CustomInfo.equal) + goto use_generic_compare; + return type->CustomInfo.equal(x, y, type); + case TypeInfoInfo: case OpaqueInfo: default: + use_generic_compare: + return (generic_compare(x, y, type) == 0); + } +} + +public Text_t generic_as_text(const void *obj, bool colorize, const TypeInfo *type) +{ + switch (type->tag) { + case PointerInfo: return Pointer$as_text(obj, colorize, type); + case FunctionInfo: return Func$as_text(obj, colorize, type); + case TextInfo: return Text$as_text(obj, colorize, type); + case ArrayInfo: return Array$as_text(obj, colorize, type); + case ChannelInfo: return Channel$as_text((Channel_t**)obj, colorize, type); + case TableInfo: return Table$as_text(obj, colorize, type); + case TypeInfoInfo: return Type$as_text(obj, colorize, type); + case OptionalInfo: return Optional$as_text(obj, colorize, type); + case EmptyStructInfo: return colorize ? 
+ Text$concat(Text("\x1b[0;1m"), Text$from_str(type->EmptyStructInfo.name), Text("\x1b[m()")) + : Text$concat(Text$from_str(type->EmptyStructInfo.name), Text("()")); + case CustomInfo: case StructInfo: case EnumInfo: case CStringInfo: // These all share the same info + if (!type->CustomInfo.as_text) + fail("No text function provided for type!\n"); + return type->CustomInfo.as_text(obj, colorize, type); + case OpaqueInfo: return Text("???"); + default: errx(1, "Invalid type tag: %d", type->tag); + } +} + +public int generic_print(const void *obj, bool colorize, const TypeInfo *type) +{ + Text_t text = generic_as_text(obj, colorize, type); + return Text$print(stdout, text) + printf("\n"); +} + + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/stdlib/metamethods.h b/stdlib/metamethods.h new file mode 100644 index 00000000..be712a61 --- /dev/null +++ b/stdlib/metamethods.h @@ -0,0 +1,15 @@ +#pragma once +// Metamethods are methods that all types share: + +#include + +#include "types.h" +#include "util.h" + +PUREFUNC uint64_t generic_hash(const void *obj, const TypeInfo *type); +PUREFUNC int32_t generic_compare(const void *x, const void *y, const TypeInfo *type); +PUREFUNC bool generic_equal(const void *x, const void *y, const TypeInfo *type); +Text_t generic_as_text(const void *obj, bool colorize, const TypeInfo *type); +int generic_print(const void *obj, bool colorize, const TypeInfo *type); + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/stdlib/nums.c b/stdlib/nums.c new file mode 100644 index 00000000..1956140a --- /dev/null +++ b/stdlib/nums.c @@ -0,0 +1,178 @@ +// Type infos and methods for Nums (floating point) + +#include +#include +#include +#include +#include +#include + +#include "arrays.h" +#include "nums.h" +#include "string.h" +#include "text.h" +#include "types.h" + +public PUREFUNC Text_t Num$as_text(const double *f, bool colorize, const TypeInfo *type) { + (void)type; + if (!f) return Text("Num"); + return Text$format(colorize ? 
"\x1b[35m%.16g\x1b[33;2m\x1b[m" : "%.16g", *f);
+}
+
+// Three-way comparison of two doubles.
+public PUREFUNC int32_t Num$compare(const double *x, const double *y, const TypeInfo *type) {
+    (void)type;
+    return (*x > *y) - (*x < *y);
+}
+
+public PUREFUNC bool Num$equal(const double *x, const double *y, const TypeInfo *type) {
+    (void)type;
+    return *x == *y;
+}
+
+// Approximate equality: true when a and b differ by less than `absolute`, or by
+// less than `ratio` (clamped to [0, 1]) times their combined magnitude.
+public CONSTFUNC bool Num$near(double a, double b, double ratio, double absolute) {
+    if (ratio < 0) ratio = 0;
+    else if (ratio > 1) ratio = 1;
+
+    if (a == b) return true;
+
+    double diff = fabs(a - b);
+    if (diff < absolute) return true;
+    else if (isnan(diff)) return false;
+
+    double epsilon = fabs(a * ratio) + fabs(b * ratio);
+    if (isinf(epsilon)) epsilon = DBL_MAX;
+    return (diff < epsilon);
+}
+
+// Fixed-point text with `precision` digits after the decimal point.
+public Text_t Num$format(double f, Int_t precision) {
+    return Text$format("%.*f", (int)Int_to_Int64(precision, false), f);
+}
+
+// Scientific-notation text with `precision` digits after the decimal point.
+public Text_t Num$scientific(double f, Int_t precision) {
+    return Text$format("%.*e", (int)Int_to_Int64(precision, false), f);
+}
+
+// Remainder of num / modulus carrying the sign of `modulus`.
+public double Num$mod(double num, double modulus) {
+    double result = fmod(num, modulus);
+    // An exact multiple must stay zero: without the `result != 0` test a zero
+    // remainder with a negative modulus was shifted to `modulus` (mod(4, -2) gave -2)
+    return (result != 0 && (result < 0) != (modulus < 0)) ? 
result + modulus : result;
+}
+
+// Random double from drand48().
+public double Num$random(void) {
+    return drand48();
+}
+
+// Linear interpolation: x when amount is 0, y when amount is 1.
+public CONSTFUNC double Num$mix(double amount, double x, double y) {
+    return (1.0-amount)*x + amount*y;
+}
+
+// Parse a whole text as a double; anything unparseable or followed by trailing
+// characters yields the NaN that stands for a null value.
+public OptionalNum_t Num$from_text(Text_t text) {
+    const char *str = Text$as_c_string(text);
+    char *end = NULL;
+    double d = strtod(str, &end);
+    if (end > str && end[0] == '\0')
+        return d;
+    else
+        return nan("null");
+}
+
+public double Num$nan(Text_t tag) {
+    return nan(Text$as_c_string(tag));
+}
+
+public CONSTFUNC bool Num$isinf(double n) { return !!isinf(n); }
+// isfinite() is the standard C99 classification macro; finite() is an obsolete BSD function
+public CONSTFUNC bool Num$finite(double n) { return !!isfinite(n); }
+public CONSTFUNC bool Num$isnan(double n) { return !!isnan(n); }
+
+// Metamethod table for Num
+public const TypeInfo Num$info = {
+    .size=sizeof(double),
+    .align=__alignof__(double),
+    .tag=CustomInfo,
+    .CustomInfo={
+        .compare=(void*)Num$compare,
+        .equal=(void*)Num$equal,
+        .as_text=(void*)Num$as_text,
+    },
+};
+
+public PUREFUNC Text_t Num32$as_text(const float *f, bool colorize, const TypeInfo *type) {
+    (void)type;
+    if (!f) return Text("Num32");
+    return Text$format(colorize ? 
"\x1b[35m%.8g_f32\x1b[33;2m\x1b[m" : "%.8g_f32", (double)*f); +} + +public PUREFUNC int32_t Num32$compare(const float *x, const float *y, const TypeInfo *type) { + (void)type; + return (*x > *y) - (*x < *y); +} + +public PUREFUNC bool Num32$equal(const float *x, const float *y, const TypeInfo *type) { + (void)type; + return *x == *y; +} + +public CONSTFUNC bool Num32$near(float a, float b, float ratio, float absolute) { + if (ratio < 0) ratio = 0; + else if (ratio > 1) ratio = 1; + + if (a == b) return true; + + float diff = fabs(a - b); + if (diff < absolute) return true; + else if (isnan(diff)) return false; + + float epsilon = fabs(a * ratio) + fabs(b * ratio); + if (isinf(epsilon)) epsilon = FLT_MAX; + return (diff < epsilon); +} + +public Text_t Num32$format(float f, Int_t precision) { + return Text$format("%.*f", (int)Int_to_Int64(precision, false), (double)f); +} + +public Text_t Num32$scientific(float f, Int_t precision) { + return Text$format("%.*e", (int)Int_to_Int64(precision, false), (double)f); +} + +public float Num32$mod(float num, float modulus) { + float result = fmodf(num, modulus); + return (result < 0) != (modulus < 0) ? 
result + modulus : result; +} + +public float Num32$random(void) { + return (float)drand48(); +} + +public CONSTFUNC float Num32$mix(float amount, float x, float y) { + return (1.0f-amount)*x + amount*y; +} + +public OptionalNum32_t Num32$from_text(Text_t text) { + const char *str = Text$as_c_string(text); + char *end = NULL; + double d = strtod(str, &end); + if (end > str && end[0] == '\0') + return d; + else + return nan("null"); +} + +public float Num32$nan(Text_t tag) { + return nanf(Text$as_c_string(tag)); +} + +public CONSTFUNC bool Num32$isinf(float n) { return isinf(n); } +public CONSTFUNC bool Num32$finite(float n) { return finite(n); } +public CONSTFUNC bool Num32$isnan(float n) { return isnan(n); } + +public const TypeInfo Num32$info = { + .size=sizeof(float), + .align=__alignof__(float), + .tag=CustomInfo, + .CustomInfo={ + .compare=(void*)Num32$compare, + .equal=(void*)Num32$equal, + .as_text=(void*)Num32$as_text, + }, +}; + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/stdlib/nums.h b/stdlib/nums.h new file mode 100644 index 00000000..78f32c1e --- /dev/null +++ b/stdlib/nums.h @@ -0,0 +1,60 @@ +#pragma once + +// Type infos and methods for Nums (floating point) + +#include +#include +#include + +#include "types.h" +#include "util.h" + +#define Num_t double +#define Num32_t float +#define OptionalNum_t double +#define OptionalNum32_t float +#define N32(n) ((float)n) +#define N64(n) ((double)n) + +Text_t Num$as_text(const double *f, bool colorize, const TypeInfo *type); +PUREFUNC int32_t Num$compare(const double *x, const double *y, const TypeInfo *type); +PUREFUNC bool Num$equal(const double *x, const double *y, const TypeInfo *type); +CONSTFUNC bool Num$near(double a, double b, double ratio, double absolute); +Text_t Num$format(double f, Int_t precision); +Text_t Num$scientific(double f, Int_t precision); +double Num$mod(double num, double modulus); +CONSTFUNC bool Num$isinf(double n); +CONSTFUNC bool Num$finite(double n); +CONSTFUNC bool 
Num$isnan(double n); +double Num$nan(Text_t tag); +double Num$random(void); +CONSTFUNC double Num$mix(double amount, double x, double y); +OptionalNum_t Num$from_text(Text_t text); +CONSTFUNC static inline double Num$clamped(double x, double low, double high) { + return (x <= low) ? low : (x >= high ? high : x); +} +extern const TypeInfo Num$info; + +Text_t Num32$as_text(const float *f, bool colorize, const TypeInfo *type); +PUREFUNC int32_t Num32$compare(const float *x, const float *y, const TypeInfo *type); +PUREFUNC bool Num32$equal(const float *x, const float *y, const TypeInfo *type); +CONSTFUNC bool Num32$near(float a, float b, float ratio, float absolute); +Text_t Num32$format(float f, Int_t precision); +Text_t Num32$scientific(float f, Int_t precision); +float Num32$mod(float num, float modulus); +CONSTFUNC bool Num32$isinf(float n); +CONSTFUNC bool Num32$finite(float n); +CONSTFUNC bool Num32$isnan(float n); +float Num32$random(void); +CONSTFUNC float Num32$mix(float amount, float x, float y); +OptionalNum32_t Num32$from_text(Text_t text); +float Num32$nan(Text_t tag); +CONSTFUNC static inline float Num32$clamped(float x, float low, float high) { + return (x <= low) ? low : (x >= high ? 
high : x); +} +extern const TypeInfo Num32$info; + +#define Num_to_Num32(n) ((Num32_t)(n)) +#define Num32_to_Num(n) ((Num_t)(n)) + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/stdlib/optionals.c b/stdlib/optionals.c new file mode 100644 index 00000000..b6ca8dfb --- /dev/null +++ b/stdlib/optionals.c @@ -0,0 +1,73 @@ +// Optional types + +#include + +#include "bools.h" +#include "datatypes.h" +#include "integers.h" +#include "metamethods.h" +#include "threads.h" +#include "text.h" +#include "util.h" + +public const Array_t NULL_ARRAY = {.length=-1}; +public const OptionalBool_t NULL_BOOL = 2; +public const Int_t NULL_INT = {.small=0}; +public const Table_t NULL_TABLE = {.entries.length=-1}; +public const Closure_t NULL_CLOSURE = {.fn=NULL}; +public const Text_t NULL_TEXT = {.length=-1}; + +public PUREFUNC bool is_null(const void *obj, const TypeInfo *non_optional_type) +{ + if (non_optional_type == &Int$info) + return ((Int_t*)obj)->small == 0; + else if (non_optional_type == &Bool$info) + return *((OptionalBool_t*)obj) == NULL_BOOL; + else if (non_optional_type == &Num$info) + return isnan(*((Num_t*)obj)); + else if (non_optional_type == &Int64$info) + return ((OptionalInt64_t*)obj)->is_null; + else if (non_optional_type == &Int32$info) + return ((OptionalInt32_t*)obj)->is_null; + else if (non_optional_type == &Int16$info) + return ((OptionalInt16_t*)obj)->is_null; + else if (non_optional_type == &Int8$info) + return ((OptionalInt8_t*)obj)->is_null; + else if (non_optional_type == &Thread) + return *(pthread_t**)obj == NULL; + + switch (non_optional_type->tag) { + case ChannelInfo: return *(Channel_t**)obj == NULL; + case PointerInfo: return *(void**)obj == NULL; + case TextInfo: return ((Text_t*)obj)->length < 0; + case ArrayInfo: return ((Array_t*)obj)->length < 0; + case TableInfo: return ((Table_t*)obj)->entries.length < 0; + case FunctionInfo: return *(void**)obj == NULL; + case StructInfo: { + int64_t offset = non_optional_type->size; + if 
(offset % non_optional_type->align) + offset += non_optional_type->align - (offset % non_optional_type->align); + return *(bool*)(obj + offset); + } + case EnumInfo: return (*(int*)obj) == 0; // NULL tag + case CStringInfo: return (*(char**)obj) == NULL; + default: { + Text_t t = generic_as_text(NULL, false, non_optional_type); + errx(1, "is_null() not implemented for: %k", &t); + } + } +} + +#pragma GCC diagnostic ignored "-Wstack-protector" +public Text_t Optional$as_text(const void *obj, bool colorize, const TypeInfo *type) +{ + if (!obj) + return Text$concat(generic_as_text(obj, colorize, type->OptionalInfo.type), Text("?")); + + if (is_null(obj, type->OptionalInfo.type)) + return Text$concat(colorize ? Text("\x1b[31m!") : Text("!"), generic_as_text(NULL, false, type->OptionalInfo.type), + colorize ? Text("\x1b[m") : Text("")); + return Text$concat(generic_as_text(obj, colorize, type->OptionalInfo.type), colorize ? Text("\x1b[33m?\x1b[m") : Text("?")); +} + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1 diff --git a/stdlib/optionals.h b/stdlib/optionals.h new file mode 100644 index 00000000..e37d5345 --- /dev/null +++ b/stdlib/optionals.h @@ -0,0 +1,23 @@ +#pragma once + +// Optional types + +#include +#include + +#include "types.h" +#include "util.h" + +#define OptionalBool_t uint8_t + +extern const OptionalBool_t NULL_BOOL; +extern const Table_t NULL_TABLE; +extern const Array_t NULL_ARRAY; +extern const Int_t NULL_INT; +extern const Closure_t NULL_CLOSURE; +extern const Text_t NULL_TEXT; + +PUREFUNC bool is_null(const void *obj, const TypeInfo *non_optional_type); +Text_t Optional$as_text(const void *obj, bool colorize, const TypeInfo *type); + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/stdlib/paths.c b/stdlib/paths.c new file mode 100644 index 00000000..231a7c23 --- /dev/null +++ b/stdlib/paths.c @@ -0,0 +1,481 @@ +// A lang for filesystem paths +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include 
+#include +#include + +#include "arrays.h" +#include "files.h" +#include "integers.h" +#include "optionals.h" +#include "paths.h" +#include "patterns.h" +#include "text.h" +#include "types.h" +#include "util.h" + +PUREFUNC public Path_t Path$escape_text(Text_t text) +{ + if (Text$has(text, Pattern("/"))) + fail("Path interpolations cannot contain slashes: %k", &text); + else if (Text$has(text, Pattern(";"))) + fail("Path interpolations cannot contain semicolons: %k", &text); + else if (Text$equal_values(text, Path(".")) || Text$equal_values(text, Path(".."))) + fail("Path interpolation is \"%k\" which is disallowed to prevent security vulnerabilities", &text); + return (Path_t)text; +} + +PUREFUNC public Path_t Path$escape_path(Path_t path) +{ + if (Text$starts_with(path, Path("~/")) || Text$starts_with(path, Path("/"))) + fail("Invalid path component: %k", &path); + return path; +} + +public Path_t Path$cleanup(Path_t path) +{ + if (!Text$starts_with(path, Path("/")) && !Text$starts_with(path, Path("./")) + && !Text$starts_with(path, Path("../")) && !Text$starts_with(path, Path("~/"))) + path = Text$concat(Text("./"), path); + + // Not fully resolved, but at least get rid of some of the cruft like "/./" + // and "/foo/../" and "//" + bool trailing_slash = Text$ends_with(path, Path("/")); + Array_t components = Text$split(path, Pattern("/")); + if (components.length == 0) return Path("/"); + Path_t root = *(Path_t*)components.data; + Array$remove_at(&components, I(1), I(1), sizeof(Path_t)); + + for (int64_t i = 0; i < components.length; ) { + Path_t component = *(Path_t*)(components.data + i*components.stride); + if (component.length == 0 || Text$equal_values(component, Path("."))) { // Skip (//) and (/./) + Array$remove_at(&components, I(i+1), I(1), sizeof(Path_t)); + } else if (Text$equal_values(component, Path(".."))) { + if (i == 0) { + if (root.length == 0) { // (/..) 
-> (/) + Array$remove_at(&components, I(i+1), I(1), sizeof(Path_t)); + i += 1; + } else if (Text$equal_values(root, Path("."))) { // (./..) -> (..) + root = Path(".."); + Array$remove_at(&components, I(i+1), I(1), sizeof(Path_t)); + i += 1; + } else if (Text$equal_values(root, Path("~"))) { + root = Path(""); // Convert $HOME to absolute path: + + Array$remove_at(&components, I(i+1), I(1), sizeof(Path_t)); + // `i` is pointing to where the `..` lived + + const char *home = getenv("HOME"); + if (!home) fail("Could not get $HOME directory!"); + + // Insert all but the last component: + for (const char *p = home + 1; *p; ) { + const char *next_slash = strchr(p, '/'); + if (!next_slash) break; // Skip last component + Path_t home_component = Text$format("%.*s", (int)(next_slash - p), p); + Array$insert(&components, &home_component, I(i+1), sizeof(Path_t)); + i += 1; + p = next_slash + 1; + } + } else { // (../..) -> (../..) + i += 1; + } + } else if (Text$equal(&component, (Path_t*)(components.data + (i-1)*components.stride))) { // (___/../..) -> (____/../..) + i += 1; + } else { // (___/foo/..) 
-> (___) + Array$remove_at(&components, I(i), I(2), sizeof(Path_t)); + i -= 1; + } + } else { // (___/foo/baz) -> (___/foo/baz) + i++; + } + } + + Text_t cleaned_up = Text$concat(root, Text("/"), Text$join(Text("/"), components)); + if (trailing_slash && !Text$ends_with(cleaned_up, Text("/"))) + cleaned_up = Text$concat(cleaned_up, Text("/")); + return cleaned_up; +} + +static inline Path_t Path$_expand_home(Path_t path) +{ + if (Text$starts_with(path, Path("~/"))) { + Path_t after_tilde = Text$slice(path, I(2), I(-1)); + return Text$format("%s%k", getenv("HOME"), &after_tilde); + } else { + return path; + } +} + +public Path_t Path$_concat(int n, Path_t items[n]) +{ + Path_t cleaned_up = Path$cleanup(Text$_concat(n, items)); + if (cleaned_up.length > PATH_MAX) + fail("Path exceeds the maximum path length: %k", &cleaned_up); + return cleaned_up; +} + +public Text_t Path$resolved(Path_t path, Path_t relative_to) +{ + path = Path$cleanup(path); + + const char *path_str = Text$as_c_string(path); + const char *relative_to_str = Text$as_c_string(relative_to); + const char *resolved_path = resolve_path(path_str, relative_to_str, relative_to_str); + if (resolved_path) { + return (Path_t)(Text$from_str(resolved_path)); + } else if (path_str[0] == '/') { + return path; + } else if (path_str[0] == '~' && path_str[1] == '/') { + return (Path_t)Text$format("%s%s", getenv("HOME"), path_str + 1); + } else { + return Text$concat(Path$resolved(relative_to, Path(".")), Path("/"), path); + } +} + +public Text_t Path$relative(Path_t path, Path_t relative_to) +{ + path = Path$resolved(path, relative_to); + relative_to = Path$resolved(relative_to, Path(".")); + if (Text$matches(path, Patterns(Pattern("{start}"), relative_to, Pattern("{0+..}")))) + return Text$slice(path, I(relative_to.length + 2), I(-1)); + return path; +} + +public bool Path$exists(Path_t path) +{ + path = Path$_expand_home(path); + struct stat sb; + return (stat(Text$as_c_string(path), &sb) == 0); +} + +public bool 
Path$is_file(Path_t path, bool follow_symlinks) +{ + path = Path$_expand_home(path); + struct stat sb; + const char *path_str = Text$as_c_string(path); + int status = follow_symlinks ? stat(path_str, &sb) : lstat(path_str, &sb); + if (status != 0) return false; + return (sb.st_mode & S_IFMT) == S_IFREG; +} + +public bool Path$is_directory(Path_t path, bool follow_symlinks) +{ + path = Path$_expand_home(path); + struct stat sb; + const char *path_str = Text$as_c_string(path); + int status = follow_symlinks ? stat(path_str, &sb) : lstat(path_str, &sb); + if (status != 0) return false; + return (sb.st_mode & S_IFMT) == S_IFDIR; +} + +public bool Path$is_pipe(Path_t path, bool follow_symlinks) +{ + path = Path$_expand_home(path); + struct stat sb; + const char *path_str = Text$as_c_string(path); + int status = follow_symlinks ? stat(path_str, &sb) : lstat(path_str, &sb); + if (status != 0) return false; + return (sb.st_mode & S_IFMT) == S_IFIFO; +} + +public bool Path$is_socket(Path_t path, bool follow_symlinks) +{ + path = Path$_expand_home(path); + struct stat sb; + const char *path_str = Text$as_c_string(path); + int status = follow_symlinks ? 
stat(path_str, &sb) : lstat(path_str, &sb); + if (status != 0) return false; + return (sb.st_mode & S_IFMT) == S_IFSOCK; +} + +public bool Path$is_symlink(Path_t path) +{ + path = Path$_expand_home(path); + struct stat sb; + const char *path_str = Text$as_c_string(path); + int status = stat(path_str, &sb); + if (status != 0) return false; + return (sb.st_mode & S_IFMT) == S_IFLNK; +} + +static void _write(Path_t path, Text_t text, int mode, int permissions) +{ + path = Path$_expand_home(path); + const char *path_str = Text$as_c_string(path); + int fd = open(path_str, mode, permissions); + if (fd == -1) + fail("Could not write to file: %s\n%s", path_str, strerror(errno)); + + const char *str = Text$as_c_string(text); + size_t len = strlen(str); + ssize_t written = write(fd, str, len); + if (written != (ssize_t)len) + fail("Could not write to file: %s\n%s", path_str, strerror(errno)); +} + +public void Path$write(Path_t path, Text_t text, int permissions) +{ + _write(path, text, O_WRONLY | O_CREAT, permissions); +} + +public void Path$append(Path_t path, Text_t text, int permissions) +{ + _write(path, text, O_WRONLY | O_APPEND | O_CREAT, permissions); +} + +public Text_t Path$read(Path_t path) +{ + path = Path$_expand_home(path); + int fd = open(Text$as_c_string(path), O_RDONLY); + if (fd == -1) + fail("Could not read file: %k (%s)", &path, strerror(errno)); + + struct stat sb; + if (fstat(fd, &sb) != 0) + fail("Could not read file: %k (%s)", &path, strerror(errno)); + + if ((sb.st_mode & S_IFMT) == S_IFREG) { // Use memory mapping if it's a real file: + const char *mem = mmap(NULL, (size_t)sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0); + char *gc_mem = GC_MALLOC_ATOMIC((size_t)sb.st_size+1); + memcpy(gc_mem, mem, (size_t)sb.st_size); + gc_mem[sb.st_size] = '\0'; + close(fd); + return Text$from_strn(gc_mem, (size_t)sb.st_size); + } else { + size_t capacity = 256, len = 0; + char *content = GC_MALLOC_ATOMIC(capacity); + for (;;) { + char chunk[256]; + ssize_t just_read = 
read(fd, chunk, sizeof(chunk)); + if (just_read < 0) + fail("Failed while reading file: %k (%s)", &path, strerror(errno)); + else if (just_read == 0) { + if (errno == EAGAIN || errno == EINTR) + continue; + break; + } + + if (len + (size_t)just_read >= capacity) { + content = GC_REALLOC(content, (capacity *= 2)); + } + + memcpy(&content[len], chunk, (size_t)just_read); + len += (size_t)just_read; + + if ((size_t)just_read < sizeof(chunk)) + break; + } + close(fd); + + if (u8_check((uint8_t*)content, len) != NULL) + fail("File does not contain valid UTF8 data!"); + + return Text$from_strn(content, len); + } +} + +public void Path$remove(Path_t path, bool ignore_missing) +{ + path = Path$_expand_home(path); + const char *path_str = Text$as_c_string(path); + struct stat sb; + if (lstat(path_str, &sb) != 0) { + if (!ignore_missing) + fail("Could not remove file: %s (%s)", path_str, strerror(errno)); + } + + if ((sb.st_mode & S_IFMT) == S_IFREG || (sb.st_mode & S_IFMT) == S_IFLNK) { + if (unlink(path_str) != 0 && !ignore_missing) + fail("Could not remove file: %s (%s)", path_str, strerror(errno)); + } else if ((sb.st_mode & S_IFMT) == S_IFDIR) { + if (rmdir(path_str) != 0 && !ignore_missing) + fail("Could not remove directory: %s (%s)", path_str, strerror(errno)); + } else { + fail("Could not remove path: %s (not a file or directory)", path_str, strerror(errno)); + } +} + +public void Path$create_directory(Path_t path, int permissions) +{ + path = Path$_expand_home(path); + if (mkdir(Text$as_c_string(path), (mode_t)permissions) != 0) + fail("Could not create directory: %k (%s)", &path, strerror(errno)); +} + +static Array_t _filtered_children(Path_t path, bool include_hidden, mode_t filter) +{ + path = Path$_expand_home(path); + struct dirent *dir; + Array_t children = {}; + const char *path_str = Text$as_c_string(path); + size_t path_len = strlen(path_str); + DIR *d = opendir(path_str); + if (!d) + fail("Could not open directory: %k (%s)", &path, strerror(errno)); + + 
if (path_str[path_len-1] == '/') + --path_len; + + while ((dir = readdir(d)) != NULL) { + if (!include_hidden && dir->d_name[0] == '.') + continue; + if (streq(dir->d_name, ".") || streq(dir->d_name, "..")) + continue; + + const char *child_str = heap_strf("%.*s/%s", path_len, path_str, dir->d_name); + struct stat sb; + if (stat(child_str, &sb) != 0) + continue; + if (!((sb.st_mode & S_IFMT) & filter)) + continue; + + Path_t child = Text$format("%s%s", child_str, ((sb.st_mode & S_IFMT) == S_IFDIR) ? "/" : ""); // Trailing slash for dirs + Array$insert(&children, &child, I(0), sizeof(Path_t)); + } + closedir(d); + return children; +} + +public Array_t Path$children(Path_t path, bool include_hidden) +{ + return _filtered_children(path, include_hidden, (mode_t)-1); +} + +public Array_t Path$files(Path_t path, bool include_hidden) +{ + return _filtered_children(path, include_hidden, S_IFREG); +} + +public Array_t Path$subdirectories(Path_t path, bool include_hidden) +{ + return _filtered_children(path, include_hidden, S_IFDIR); +} + +public Path_t Path$unique_directory(Path_t path) +{ + path = Path$_expand_home(path); + const char *path_str = Text$as_c_string(path); + size_t len = strlen(path_str); + if (len >= PATH_MAX) fail("Path is too long: %s", path_str); + char buf[PATH_MAX] = {}; + strcpy(buf, path_str); + if (buf[len-1] == '/') + buf[--len] = '\0'; + char *created = mkdtemp(buf); + if (!created) fail("Failed to create temporary directory: %s (%s)", path_str, strerror(errno)); + return Text$format("%s/", created); +} + +public Text_t Path$write_unique(Path_t path, Text_t text) +{ + path = Path$_expand_home(path); + const char *path_str = Text$as_c_string(path); + size_t len = strlen(path_str); + if (len >= PATH_MAX) fail("Path is too long: %s", path_str); + char buf[PATH_MAX] = {}; + strcpy(buf, path_str); + + int64_t suffixlen = 0; + (void)Text$find(path, Pattern("{0+!X}{end}"), I(1), &suffixlen); + if (suffixlen < 0) suffixlen = 0; + + int fd = mkstemps(buf, 
suffixlen); + if (fd == -1) + fail("Could not write to unique file: %s\n%s", buf, strerror(errno)); + + const char *str = Text$as_c_string(text); + size_t write_len = strlen(str); + ssize_t written = write(fd, str, write_len); + if (written != (ssize_t)write_len) + fail("Could not write to file: %s\n%s", buf, strerror(errno)); + return Text$format("%s", buf); +} + +public Path_t Path$parent(Path_t path) +{ + return Path$cleanup(Text$concat(path, Path("/../"))); +} + +public Text_t Path$base_name(Path_t path) +{ + path = Path$cleanup(path); + if (Text$ends_with(path, Path("/"))) + return Text$replace(path, Pattern("{0+..}/{!/}/{end}"), Text("@2"), Text("@"), false); + else + return Text$replace(path, Pattern("{0+..}/{!/}{end}"), Text("@2"), Text("@"), false); +} + +public Text_t Path$extension(Path_t path, bool full) +{ + Text_t base = Path$base_name(path); + if (Text$matches(base, Pattern(".{!.}.{..}"))) + return Text$replace(base, full ? Pattern(".{!.}.{..}") : Pattern(".{..}.{!.}{end}"), Text("@2"), Text("@"), false); + else if (Text$matches(base, Pattern("{!.}.{..}"))) + return Text$replace(base, full ? 
Pattern("{!.}.{..}") : Pattern("{..}.{!.}{end}"), Text("@2"), Text("@"), false); + else + return Text(""); +} + +static void _line_reader_cleanup(FILE **f) +{ + if (f && *f) { + fclose(*f); + *f = NULL; + } +} + +static Text_t _next_line(FILE **f) +{ + if (!f || !*f) return NULL_TEXT; + + char *line = NULL; + size_t size = 0; + ssize_t len = getline(&line, &size, *f); + if (len <= 0) { + _line_reader_cleanup(f); + return NULL_TEXT; + } + + while (len > 0 && (line[len-1] == '\r' || line[len-1] == '\n')) + --len; + + if (u8_check((uint8_t*)line, (size_t)len) != NULL) + fail("Invalid UTF8!"); + + Text_t line_text = Text$format("%.*s", len, line); + free(line); + return line_text; +} + +public Closure_t Path$by_line(Path_t path) +{ + path = Path$_expand_home(path); + + FILE *f = fopen(Text$as_c_string(path), "r"); + if (f == NULL) + fail("Could not read file: %k (%s)", &path, strerror(errno)); + + FILE **wrapper = GC_MALLOC(sizeof(FILE*)); + *wrapper = f; + GC_register_finalizer(wrapper, (void*)_line_reader_cleanup, NULL, NULL, NULL); + return (Closure_t){.fn=(void*)_next_line, .userdata=wrapper}; +} + +public const TypeInfo Path$info = { + .size=sizeof(Path_t), + .align=__alignof__(Path_t), + .tag=TextInfo, + .TextInfo={.lang="Path"}, +}; + + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/stdlib/paths.h b/stdlib/paths.h new file mode 100644 index 00000000..e0d85258 --- /dev/null +++ b/stdlib/paths.h @@ -0,0 +1,50 @@ +#pragma once + +// A lang for filesystem paths + +#include +#include + +#include "types.h" +#include "datatypes.h" + +#define Path_t Text_t +#define Path(text) ((Path_t)Text(text)) +#define Paths(...) 
Path$_concat(sizeof((Path_t[]){__VA_ARGS__})/sizeof(Path_t), (Path_t[]){__VA_ARGS__}) + +Path_t Path$cleanup(Path_t path); +Path_t Path$_concat(int n, Path_t items[n]); +#define Path$concat(a, b) Paths(a, Path("/"), b) +PUREFUNC Path_t Path$escape_text(Text_t text); +PUREFUNC Path_t Path$escape_path(Text_t path); +Path_t Path$resolved(Path_t path, Path_t relative_to); +Path_t Path$relative(Path_t path, Path_t relative_to); +bool Path$exists(Path_t path); +bool Path$is_file(Path_t path, bool follow_symlinks); +bool Path$is_directory(Path_t path, bool follow_symlinks); +bool Path$is_pipe(Path_t path, bool follow_symlinks); +bool Path$is_socket(Path_t path, bool follow_symlinks); +bool Path$is_symlink(Path_t path); +void Path$write(Path_t path, Text_t text, int permissions); +void Path$append(Path_t path, Text_t text, int permissions); +Text_t Path$read(Path_t path); +void Path$remove(Path_t path, bool ignore_missing); +void Path$create_directory(Path_t path, int permissions); +Array_t Path$children(Path_t path, bool include_hidden); +Array_t Path$files(Path_t path, bool include_hidden); +Array_t Path$subdirectories(Path_t path, bool include_hidden); +Path_t Path$unique_directory(Path_t path); +Text_t Path$write_unique(Path_t path, Text_t text); +Path_t Path$parent(Path_t path); +Text_t Path$base_name(Path_t path); +Text_t Path$extension(Path_t path, bool full); +Closure_t Path$by_line(Path_t path); + +#define Path$hash Text$hash +#define Path$compare Text$compare +#define Path$equal Text$equal + +extern const TypeInfo Path$info; + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 + diff --git a/stdlib/patterns.c b/stdlib/patterns.c new file mode 100644 index 00000000..81beaffe --- /dev/null +++ b/stdlib/patterns.c @@ -0,0 +1,1064 @@ +// Logic for text pattern matching + +#include +#include +#include +#include + +#include "arrays.h" +#include "integers.h" +#include "patterns.h" +#include "tables.h" +#include "text.h" +#include "types.h" + +#define MAX_BACKREFS 100 + 
+static inline void skip_whitespace(Text_t text, int64_t *i) +{ + TextIter_t state = {0, 0}; + while (*i < text.length) { + int32_t grapheme = Text$get_grapheme_fast(text, &state, *i); + if (grapheme > 0 && !uc_is_property_white_space((ucs4_t)grapheme)) + return; + *i += 1; + } +} + +static inline bool match_grapheme(Text_t text, int64_t *i, int32_t grapheme) +{ + if (*i < text.length && Text$get_grapheme(text, *i) == grapheme) { + *i += 1; + return true; + } + return false; +} + +static inline bool match_str(Text_t text, int64_t *i, const char *str) +{ + TextIter_t state = {0, 0}; + int64_t matched = 0; + while (matched[str]) { + if (*i + matched >= text.length || Text$get_grapheme_fast(text, &state, *i + matched) != str[matched]) + return false; + matched += 1; + } + *i += matched; + return true; +} + +static inline bool match_property(Text_t text, int64_t *i, uc_property_t prop) +{ + if (*i >= text.length) return false; + TextIter_t state = {}; + ucs4_t grapheme = Text$get_main_grapheme_fast(text, &state, *i); + // TODO: check every codepoint in the cluster? 
+ if (uc_is_property(grapheme, prop)) { + *i += 1; + return true; + } + return false; +} + +static int64_t parse_int(Text_t text, int64_t *i) +{ + TextIter_t state = {0, 0}; + int64_t value = 0; + for (;; *i += 1) { + ucs4_t grapheme = Text$get_main_grapheme_fast(text, &state, *i); + int digit = uc_digit_value((ucs4_t)grapheme); + if (digit < 0) break; + if (value >= INT64_MAX/10) break; + value = 10*value + digit; + } + return value; +} + +const char *get_property_name(Text_t text, int64_t *i) +{ + skip_whitespace(text, i); + char *name = GC_MALLOC_ATOMIC(UNINAME_MAX); + char *dest = name; + TextIter_t state = {0, 0}; + while (*i < text.length) { + int32_t grapheme = Text$get_grapheme_fast(text, &state, *i); + if (!(grapheme & ~0xFF) && (isalnum(grapheme) || grapheme == ' ' || grapheme == '_' || grapheme == '-')) { + *dest = (char)grapheme; + ++dest; + if (dest >= name + UNINAME_MAX - 1) + break; + } else { + break; + } + *i += 1; + } + + while (dest > name && dest[-1] == ' ') + *(dest--) = '\0'; + + if (dest == name) return NULL; + *dest = '\0'; + return name; +} + +#define EAT1(text, state, index, cond) ({\ + int32_t grapheme = Text$get_grapheme_fast(text, state, index); \ + bool success = (cond); \ + if (success) index += 1; \ + success; }) + +#define EAT2(text, state, index, cond1, cond2) ({\ + int32_t grapheme = Text$get_grapheme_fast(text, state, index); \ + bool success = (cond1); \ + if (success) { \ + grapheme = Text$get_grapheme_fast(text, state, index + 1); \ + success = (cond2); \ + if (success) \ + index += 2; \ + } \ + success; }) + + +#define EAT_MANY(text, state, index, cond) ({ int64_t _n = 0; while (EAT1(text, state, index, cond)) { _n += 1; } _n; }) + +int64_t match_email(Text_t text, int64_t index) +{ + // email = local "@" domain + // local = 1-64 ([a-zA-Z0-9!#$%&‘*+–/=?^_`.{|}~] | non-ascii) + // domain = dns-label ("." 
dns-label)* + // dns-label = 1-63 ([a-zA-Z0-9-] | non-ascii) + + TextIter_t state = {0, 0}; + if (index > 0) { + ucs4_t prev_codepoint = Text$get_main_grapheme_fast(text, &state, index - 1); + if (uc_is_property_alphabetic((ucs4_t)prev_codepoint)) + return -1; + } + + int64_t start_index = index; + + // Local part: + int64_t local_len = 0; + static const char *allowed_local = "!#$%&‘*+–/=?^_`.{|}~"; + while (EAT1(text, &state, index, + (grapheme & ~0x7F) || isalnum((char)grapheme) || strchr(allowed_local, (char)grapheme))) { + local_len += 1; + if (local_len > 64) return -1; + } + + if (!EAT1(text, &state, index, grapheme == '@')) + return -1; + + // Host + int64_t host_len = 0; + do { + int64_t label_len = 0; + while (EAT1(text, &state, index, + (grapheme & ~0x7F) || isalnum((char)grapheme) || grapheme == '-')) { + label_len += 1; + if (label_len > 63) return -1; + } + + if (label_len == 0) + return -1; + + host_len += label_len; + if (host_len > 255) + return -1; + host_len += 1; + } while (EAT1(text, &state, index, grapheme == '.')); + + return index - start_index; +} + +int64_t match_ipv6(Text_t text, int64_t index) +{ + TextIter_t state = {0, 0}; + if (index > 0) { + int32_t prev_codepoint = Text$get_grapheme_fast(text, &state, index - 1); + if ((prev_codepoint & ~0x7F) && (isxdigit(prev_codepoint) || prev_codepoint == ':')) + return -1; + } + int64_t start_index = index; + const int NUM_CLUSTERS = 8; + bool double_colon_used = false; + for (int cluster = 0; cluster < NUM_CLUSTERS; cluster++) { + for (int digits = 0; digits < 4; digits++) { + if (!EAT1(text, &state, index, ~(grapheme & ~0x7F) && isxdigit((char)grapheme))) + break; + } + if (EAT1(text, &state, index, ~(grapheme & ~0x7F) && isxdigit((char)grapheme))) + return -1; // Too many digits + + if (cluster == NUM_CLUSTERS-1) { + break; + } else if (!EAT1(text, &state, index, grapheme == ':')) { + if (double_colon_used) + break; + return -1; + } + + if (EAT1(text, &state, index, grapheme == ':')) { + if 
(double_colon_used) + return -1; + double_colon_used = true; + } + } + return index - start_index; +} + +static int64_t match_ipv4(Text_t text, int64_t index) +{ + TextIter_t state = {0, 0}; + if (index > 0) { + int32_t prev_codepoint = Text$get_grapheme_fast(text, &state, index - 1); + if ((prev_codepoint & ~0x7F) && (isdigit(prev_codepoint) || prev_codepoint == '.')) + return -1; + } + int64_t start_index = index; + + const int NUM_CLUSTERS = 4; + for (int cluster = 0; cluster < NUM_CLUSTERS; cluster++) { + for (int digits = 0; digits < 3; digits++) { + if (!EAT1(text, &state, index, ~(grapheme & ~0x7F) && isdigit((char)grapheme))) { + if (digits == 0) return -1; + break; + } + } + + if (EAT1(text, &state, index, ~(grapheme & ~0x7F) && isdigit((char)grapheme))) + return -1; // Too many digits + + if (cluster == NUM_CLUSTERS-1) + break; + else if (!EAT1(text, &state, index, grapheme == '.')) + return -1; + } + return (index - start_index); +} + +int64_t match_ip(Text_t text, int64_t index) +{ + int64_t len = match_ipv6(text, index); + if (len >= 0) return len; + len = match_ipv4(text, index); + return (len >= 0) ? len : -1; +} + +int64_t match_uri(Text_t text, int64_t index) +{ + // URI = scheme ":" ["//" authority] path ["?" query] ["#" fragment] + // scheme = [a-zA-Z] [a-zA-Z0-9+.-] + // authority = [userinfo "@"] host [":" port] + + TextIter_t state = {0, 0}; + if (index > 0) { + ucs4_t prev_codepoint = Text$get_main_grapheme_fast(text, &state, index - 1); + if (uc_is_property_alphabetic(prev_codepoint)) + return -1; + } + + int64_t start_index = index; + + // Scheme: + if (!EAT1(text, &state, index, isalpha(grapheme))) + return -1; + + EAT_MANY(text, &state, index, + !(grapheme & ~0x7F) && (isalnum(grapheme) || grapheme == '+' || grapheme == '.' 
|| grapheme == '-')); + + if (index == start_index) + return -1; + + if (!match_grapheme(text, &index, ':')) + return -1; + + // Authority: + if (match_str(text, &index, "//")) { + int64_t authority_start = index; + // Username or host: + static const char *forbidden = "#?:@ \t\r\n<>[]{}\\^|\"`/"; + if (EAT_MANY(text, &state, index, (grapheme & ~0x7F) || !strchr(forbidden, (char)grapheme)) == 0) + return -1; + + if (EAT1(text, &state, index, grapheme == '@')) { + // Found a username, now get a host: + if (EAT_MANY(text, &state, index, (grapheme & ~0x7F) || !strchr(forbidden, (char)grapheme)) == 0) + return -1; + } else { + int64_t ip = authority_start; + int64_t ipv4_len = match_ipv4(text, ip); + if (ipv4_len > 0) { + ip += ipv4_len; + } else if (match_grapheme(text, &ip, '[')) { + ip += match_ipv6(text, ip); + if (ip > authority_start + 1 && match_grapheme(text, &ip, ']')) + index = ip; + } + } + + // Port: + if (EAT1(text, &state, index, grapheme == ':')) { + if (EAT_MANY(text, &state, index, !(grapheme & ~0x7F) && isdigit(grapheme)) == 0) + return -1; + } + if (!EAT1(text, &state, index, grapheme == '/')) + return (index - start_index); // No path + } else { + // Optional path root: + EAT1(text, &state, index, grapheme == '/'); + } + + // Path: + static const char *non_path = " \"#?<>[]{}\\^`|"; + EAT_MANY(text, &state, index, (grapheme & ~0x7F) || !strchr(non_path, (char)grapheme)); + + if (EAT1(text, &state, index, grapheme == '?')) { // Query + static const char *non_query = " \"#<>[]{}\\^`|"; + EAT_MANY(text, &state, index, (grapheme & ~0x7F) || !strchr(non_query, (char)grapheme)); + } + + if (EAT1(text, &state, index, grapheme == '#')) { // Fragment + static const char *non_fragment = " \"#<>[]{}\\^`|"; + EAT_MANY(text, &state, index, (grapheme & ~0x7F) || !strchr(non_fragment, (char)grapheme)); + } + return index - start_index; +} + +int64_t match_url(Text_t text, int64_t index) +{ + int64_t lookahead = index; + if (!(match_str(text, &lookahead, "https:") 
+ || match_str(text, &lookahead, "http:") + || match_str(text, &lookahead, "ftp:") + || match_str(text, &lookahead, "wss:") + || match_str(text, &lookahead, "ws:"))) + return -1; + + return match_uri(text, index); +} + +int64_t match_id(Text_t text, int64_t index) +{ + TextIter_t state = {0, 0}; + if (!EAT1(text, &state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_XID_START))) + return -1; + return 1 + EAT_MANY(text, &state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_XID_CONTINUE)); +} + +int64_t match_int(Text_t text, int64_t index) +{ + TextIter_t state = {0, 0}; + int64_t len = EAT_MANY(text, &state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT)); + return len >= 0 ? len : -1; +} + +int64_t match_num(Text_t text, int64_t index) +{ + TextIter_t state = {0, 0}; + bool negative = EAT1(text, &state, index, grapheme == '-') ? 1 : 0; + int64_t pre_decimal = EAT_MANY(text, &state, index, + uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT)); + bool decimal = (EAT1(text, &state, index, grapheme == '.') == 1); + int64_t post_decimal = decimal ? EAT_MANY(text, &state, index, + uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT)) : 0; + if (pre_decimal == 0 && post_decimal == 0) + return -1; + return negative + pre_decimal + decimal + post_decimal; +} + +int64_t match_newline(Text_t text, int64_t index) +{ + if (index >= text.length) + return -1; + + TextIter_t state = {0, 0}; + ucs4_t grapheme = index >= text.length ? 
0 : Text$get_main_grapheme_fast(text, &state, index); + if (grapheme == '\n') + return 1; + if (grapheme == '\r' && Text$get_grapheme_fast(text, &state, index + 1) == '\n') + return 2; + return -1; +} + +typedef struct { + int64_t index, length; + bool occupied, recursive; +} capture_t; + +typedef struct { + enum { PAT_START, PAT_END, PAT_ANY, PAT_GRAPHEME, PAT_PROPERTY, PAT_QUOTE, PAT_PAIR, PAT_FUNCTION } tag; + bool negated, non_capturing; + int64_t min, max; + union { + int32_t grapheme; + uc_property_t property; + int64_t (*fn)(Text_t, int64_t); + int32_t quote_graphemes[2]; + int32_t pair_graphemes[2]; + }; +} pat_t; + +int64_t match_pat(Text_t text, TextIter_t *state, int64_t index, pat_t pat) +{ + int32_t grapheme = index >= text.length ? 0 : Text$get_grapheme_fast(text, state, index); + + switch (pat.tag) { + case PAT_START: { + if (index == 0) + return pat.negated ? -1 : 0; + return pat.negated ? 0 : -1; + } + case PAT_END: { + if (index >= text.length) + return pat.negated ? -1 : 0; + return pat.negated ? 0 : -1; + } + case PAT_ANY: { + assert(!pat.negated); + return (index < text.length) ? 1 : -1; + } + case PAT_GRAPHEME: { + if (index >= text.length) + return -1; + else if (grapheme == pat.grapheme) + return pat.negated ? -1 : 1; + return pat.negated ? 1 : -1; + } + case PAT_PROPERTY: { + if (index >= text.length) + return -1; + else if (uc_is_property((ucs4_t)grapheme, pat.property)) + return pat.negated ? -1 : 1; + return pat.negated ? 1 : -1; + } + case PAT_PAIR: { + // Nested punctuation: (?), [?], etc + if (index >= text.length) + return -1; + + int32_t open = pat.pair_graphemes[0]; + if (grapheme != open) + return pat.negated ? 1 : -1; + + int32_t close = pat.pair_graphemes[1]; + int64_t depth = 1; + int64_t match_len = 1; + for (; depth > 0; match_len++) { + if (index + match_len >= text.length) + return pat.negated ? 
1 : -1; + + int32_t c = Text$get_grapheme_fast(text, state, index + match_len); + if (c == open) + depth += 1; + else if (c == close) + depth -= 1; + } + return pat.negated ? -1 : match_len; + } + case PAT_QUOTE: { + // Nested quotes: "?", '?', etc + if (index >= text.length) + return -1; + + int32_t open = pat.quote_graphemes[0]; + if (grapheme != open) + return pat.negated ? 1 : -1; + + int32_t close = pat.quote_graphemes[1]; + for (int64_t i = index + 1; i < text.length; i++) { + int32_t c = Text$get_grapheme_fast(text, state, i); + if (c == close) { + return pat.negated ? -1 : (i - index) + 1; + } else if (c == '\\' && index + 1 < text.length) { + i += 1; // Skip ahead an extra step + } + } + return pat.negated ? 1 : -1; + } + case PAT_FUNCTION: { + int64_t match_len = pat.fn(text, index); + if (match_len >= 0) + return pat.negated ? -1 : match_len; + return pat.negated ? 1 : -1; + } + default: errx(1, "Invalid pattern"); + } + errx(1, "Unreachable"); +} + +pat_t parse_next_pat(Text_t pattern, TextIter_t *state, int64_t *index) +{ + if (EAT2(pattern, state, *index, + uc_is_property((ucs4_t)grapheme, UC_PROPERTY_QUOTATION_MARK), + grapheme == '?')) { + // Quotations: "?", '?', etc + int32_t open = Text$get_grapheme_fast(pattern, state, *index-2); + int32_t close = open; + uc_mirror_char((ucs4_t)open, (ucs4_t*)&close); + if (!match_grapheme(pattern, index, close)) + fail("Pattern's closing quote is missing: %k", &pattern); + + return (pat_t){ + .tag=PAT_QUOTE, + .min=1, .max=1, + .quote_graphemes={open, close}, + }; + } else if (EAT2(pattern, state, *index, + uc_is_property((ucs4_t)grapheme, UC_PROPERTY_PAIRED_PUNCTUATION), + grapheme == '?')) { + // Nested punctuation: (?), [?], etc + int32_t open = Text$get_grapheme_fast(pattern, state, *index-2); + int32_t close = open; + uc_mirror_char((ucs4_t)open, (ucs4_t*)&close); + if (!match_grapheme(pattern, index, close)) + fail("Pattern's closing brace is missing: %k", &pattern); + + return (pat_t){ + .tag=PAT_PAIR, + 
.min=1, .max=1, + .pair_graphemes={open, close}, + }; + } else if (EAT1(pattern, state, *index, + grapheme == '{')) { // named patterns {id}, {2-3 hex}, etc. + skip_whitespace(pattern, index); + int64_t min, max; + if (uc_is_digit((ucs4_t)Text$get_grapheme_fast(pattern, state, *index))) { + min = parse_int(pattern, index); + skip_whitespace(pattern, index); + if (match_grapheme(pattern, index, '+')) { + max = INT64_MAX; + } else if (match_grapheme(pattern, index, '-')) { + max = parse_int(pattern, index); + } else { + max = min; + } + if (min > max) fail("Minimum repetitions (%ld) is less than the maximum (%ld)", min, max); + } else { + min = -1, max = -1; + } + + skip_whitespace(pattern, index); + + bool negated = match_grapheme(pattern, index, '!'); +#define PAT(_tag, ...) ((pat_t){.min=min, .max=max, .negated=negated, .tag=_tag, __VA_ARGS__}) + const char *prop_name; + if (match_str(pattern, index, "..")) + prop_name = ".."; + else + prop_name = get_property_name(pattern, index); + + if (!prop_name) { + // Literal character, e.g. 
{1?} + skip_whitespace(pattern, index); + int32_t grapheme = Text$get_grapheme_fast(pattern, state, (*index)++); + if (!match_grapheme(pattern, index, '}')) + fail("Missing closing '}' in pattern: %k", &pattern); + return PAT(PAT_GRAPHEME, .grapheme=grapheme); + } else if (strlen(prop_name) == 1) { + // Single letter names: {1+ A} + skip_whitespace(pattern, index); + if (!match_grapheme(pattern, index, '}')) + fail("Missing closing '}' in pattern: %k", &pattern); + return PAT(PAT_GRAPHEME, .grapheme=prop_name[0]); + } + + skip_whitespace(pattern, index); + if (!match_grapheme(pattern, index, '}')) + fail("Missing closing '}' in pattern: %k", &pattern); + + switch (tolower(prop_name[0])) { + case '.': + if (prop_name[1] == '.') { + if (negated) + return ((pat_t){.tag=PAT_END, .min=min, .max=max, .non_capturing=true}); + else + return PAT(PAT_ANY); + } + break; + case 'd': + if (strcasecmp(prop_name, "digit") == 0) { + return PAT(PAT_PROPERTY, .property=UC_PROPERTY_DECIMAL_DIGIT); + } + break; + case 'e': + if (strcasecmp(prop_name, "end") == 0) { + return PAT(PAT_END, .non_capturing=!negated); + } else if (strcasecmp(prop_name, "email") == 0) { + return PAT(PAT_FUNCTION, .fn=match_email); + } else if (strcasecmp(prop_name, "emoji") == 0) { + return PAT(PAT_PROPERTY, .property=UC_PROPERTY_EMOJI); + } + break; + case 'i': + if (strcasecmp(prop_name, "id") == 0) { + return PAT(PAT_FUNCTION, .fn=match_id); + } else if (strcasecmp(prop_name, "int") == 0) { + return PAT(PAT_FUNCTION, .fn=match_int); + } else if (strcasecmp(prop_name, "ipv4") == 0) { + return PAT(PAT_FUNCTION, .fn=match_ipv4); + } else if (strcasecmp(prop_name, "ipv6") == 0) { + return PAT(PAT_FUNCTION, .fn=match_ipv6); + } else if (strcasecmp(prop_name, "ip") == 0) { + return PAT(PAT_FUNCTION, .fn=match_ip); + } + break; + case 'n': + if (strcasecmp(prop_name, "nl") == 0 || strcasecmp(prop_name, "newline") == 0 + || strcasecmp(prop_name, "crlf")) { + return PAT(PAT_FUNCTION, .fn=match_newline); + } else 
if (strcasecmp(prop_name, "num") == 0) { + return PAT(PAT_FUNCTION, .fn=match_num); + } + break; + case 's': + if (strcasecmp(prop_name, "start") == 0) { + return PAT(PAT_START, .non_capturing=!negated); + } + break; + case 'u': + if (strcasecmp(prop_name, "uri") == 0) { + return PAT(PAT_FUNCTION, .fn=match_uri); + } else if (strcasecmp(prop_name, "url") == 0) { + return PAT(PAT_FUNCTION, .fn=match_url); + } + break; + default: break; + } + + uc_property_t prop = uc_property_byname(prop_name); + if (uc_property_is_valid(prop)) + return PAT(PAT_PROPERTY, .property=prop); + + ucs4_t grapheme = unicode_name_character(prop_name); + if (grapheme == UNINAME_INVALID) + fail("Not a valid property or character name: %s", prop_name); + return PAT(PAT_GRAPHEME, .grapheme=(int32_t)grapheme); +#undef PAT + } else { + return (pat_t){.tag=PAT_GRAPHEME, .non_capturing=true, .min=1, .max=1, .grapheme=Text$get_grapheme_fast(pattern, state, (*index)++)}; + } +} + +int64_t match(Text_t text, int64_t text_index, Pattern_t pattern, int64_t pattern_index, capture_t *captures, int64_t capture_index) +{ + if (pattern_index >= pattern.length) // End of the pattern + return 0; + + int64_t start_index = text_index; + TextIter_t pattern_state = {0, 0}, text_state = {0, 0}; + pat_t pat = parse_next_pat(pattern, &pattern_state, &pattern_index); + + if (pat.min == -1 && pat.max == -1) { + if (pat.tag == PAT_ANY && pattern_index >= pattern.length) { + pat.min = pat.max = MAX(1, text.length - text_index); + } else { + pat.min = 1; + pat.max = INT64_MAX; + } + } + + int64_t capture_start = text_index; + int64_t count = 0, capture_len = 0, next_match_len = 0; + + if (pat.tag == PAT_ANY && pattern_index >= pattern.length) { + int64_t remaining = text.length - text_index; + capture_len = remaining >= pat.min ? 
MIN(remaining, pat.max) : -1; + text_index += capture_len; + goto success; + } + + if (pat.min == 0 && pattern_index < pattern.length) { + next_match_len = match(text, text_index, pattern, pattern_index, captures, capture_index + (pat.non_capturing ? 0 : 1)); + if (next_match_len >= 0) { + capture_len = 0; + goto success; + } + } + + while (count < pat.max) { + int64_t match_len = match_pat(text, &text_state, text_index, pat); + if (match_len < 0) + break; + capture_len += match_len; + text_index += match_len; + count += 1; + + if (pattern_index < pattern.length) { // More stuff after this + if (count < pat.min) + next_match_len = -1; + else + next_match_len = match(text, text_index, pattern, pattern_index, captures, capture_index + (pat.non_capturing ? 0 : 1)); + } else { + next_match_len = 0; + } + + if (match_len == 0) { + if (next_match_len >= 0) { + // If we're good to go, no need to keep re-matching zero-length + // matches till we hit max: + count = pat.max; + break; + } else { + return -1; + } + } + + if (pattern_index < pattern.length && next_match_len >= 0) + break; // Next guy exists and wants to stop here + + if (text_index >= text.length) + break; + } + + if (count < pat.min || next_match_len < 0) + return -1; + + success: + if (captures && capture_index < MAX_BACKREFS && !pat.non_capturing) { + if (pat.tag == PAT_PAIR || pat.tag == PAT_QUOTE) { + assert(capture_len > 0); + captures[capture_index] = (capture_t){ + .index=capture_start + 1, // Skip leading quote/paren + .length=capture_len - 2, // Skip open/close + .occupied=true, + .recursive=(pat.tag == PAT_PAIR), + }; + } else { + captures[capture_index] = (capture_t){ + .index=capture_start, + .length=capture_len, + .occupied=true, + .recursive=false, + }; + } + } + return (text_index - start_index) + next_match_len; +} + +#undef EAT1 +#undef EAT2 +#undef EAT_MANY + +static int64_t _find(Text_t text, Pattern_t pattern, int64_t first, int64_t last, int64_t *match_length) +{ + int32_t first_grapheme = 
Text$get_grapheme(pattern, 0); + bool find_first = (first_grapheme != '{' + && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK) + && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); + + TextIter_t text_state = {0, 0}; + + for (int64_t i = first; i <= last; i++) { + // Optimization: quickly skip ahead to first char in pattern: + if (find_first) { + while (i < text.length && Text$get_grapheme_fast(text, &text_state, i) != first_grapheme) + ++i; + } + + int64_t m = match(text, i, pattern, 0, NULL, 0); + if (m >= 0) { + if (match_length) + *match_length = m; + return i; + } + } + if (match_length) + *match_length = -1; + return -1; +} + +public Int_t Text$find(Text_t text, Pattern_t pattern, Int_t from_index, int64_t *match_length) +{ + int64_t first = Int_to_Int64(from_index, false); + if (first == 0) fail("Invalid index: 0"); + if (first < 0) first = text.length + first + 1; + if (first > text.length || first < 1) + return I(0); + int64_t found = _find(text, pattern, first-1, text.length-1, match_length); + return I(found+1); +} + +PUREFUNC public bool Text$has(Text_t text, Pattern_t pattern) +{ + if (Text$starts_with(pattern, Text("{start}"))) { + int64_t m = match(text, 0, pattern, 0, NULL, 0); + return m >= 0; + } else if (Text$ends_with(text, Text("{end}"))) { + for (int64_t i = text.length-1; i >= 0; i--) { + int64_t match_len = match(text, i, pattern, 0, NULL, 0); + if (match_len >= 0 && i + match_len == text.length) + return true; + } + return false; + } else { + int64_t found = _find(text, pattern, 0, text.length-1, NULL); + return (found >= 0); + } +} + +PUREFUNC public bool Text$matches(Text_t text, Pattern_t pattern) +{ + int64_t m = match(text, 0, pattern, 0, NULL, 0); + return m == text.length; +} + +public Array_t Text$find_all(Text_t text, Pattern_t pattern) +{ + if (pattern.length == 0) // special case + return (Array_t){.length=0}; + + Array_t matches = {}; + + for (int64_t i = 0; ; ) { + int64_t len = 0; + 
int64_t found = _find(text, pattern, i, text.length-1, &len); + if (found < 0) break; + Text_t match = Text$slice(text, I(found+1), I(found + len)); + Array$insert(&matches, &match, I_small(0), sizeof(Text_t)); + i = found + MAX(len, 1); + } + + return matches; +} + +static Text_t apply_backrefs(Text_t text, Pattern_t original_pattern, Text_t replacement, Pattern_t backref_pat, capture_t *captures) +{ + if (backref_pat.length == 0) + return replacement; + + int32_t first_grapheme = Text$get_grapheme(backref_pat, 0); + bool find_first = (first_grapheme != '{' + && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK) + && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); + + Text_t ret = Text(""); + TextIter_t state = {0, 0}; + int64_t nonmatching_pos = 0; + for (int64_t pos = 0; pos < replacement.length; ) { + // Optimization: quickly skip ahead to first char in the backref pattern: + if (find_first) { + while (pos < replacement.length && Text$get_grapheme_fast(replacement, &state, pos) != first_grapheme) + ++pos; + } + + int64_t backref_len = match(replacement, pos, backref_pat, 0, NULL, 0); + if (backref_len < 0) { + pos += 1; + continue; + } + + int64_t after_backref = pos + backref_len; + int64_t backref = parse_int(replacement, &after_backref); + if (after_backref == pos + backref_len) { // Not actually a backref if there's no number + pos += 1; + continue; + } + if (backref < 0 || backref > 9) fail("Invalid backref index: %ld (only 0-%d are allowed)", backref, MAX_BACKREFS-1); + backref_len = (after_backref - pos); + + if (Text$get_grapheme_fast(replacement, &state, pos + backref_len) == ';') + backref_len += 1; // skip optional semicolon + + if (!captures[backref].occupied) + fail("There is no capture number %ld!", backref); + + Text_t backref_text = Text$slice(text, I(captures[backref].index+1), I(captures[backref].index + captures[backref].length)); + + if (captures[backref].recursive && original_pattern.length > 0) + 
backref_text = Text$replace(backref_text, original_pattern, replacement, backref_pat, true); + + if (pos > nonmatching_pos) { + Text_t before_slice = Text$slice(replacement, I(nonmatching_pos+1), I(pos)); + ret = Text$concat(ret, before_slice, backref_text); + } else { + ret = Text$concat(ret, backref_text); + } + + pos += backref_len; + nonmatching_pos = pos; + } + if (nonmatching_pos < replacement.length) { + Text_t last_slice = Text$slice(replacement, I(nonmatching_pos+1), I(replacement.length)); + ret = Text$concat(ret, last_slice); + } + return ret; +} + +public Text_t Text$replace(Text_t text, Pattern_t pattern, Text_t replacement, Pattern_t backref_pat, bool recursive) +{ + Text_t ret = {.length=0}; + + int32_t first_grapheme = Text$get_grapheme(pattern, 0); + bool find_first = (first_grapheme != '{' + && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK) + && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); + + TextIter_t text_state = {0, 0}; + int64_t nonmatching_pos = 0; + for (int64_t pos = 0; pos < text.length; ) { + // Optimization: quickly skip ahead to first char in pattern: + if (find_first) { + while (pos < text.length && Text$get_grapheme_fast(text, &text_state, pos) != first_grapheme) + ++pos; + } + + capture_t captures[MAX_BACKREFS] = {}; + int64_t match_len = match(text, pos, pattern, 0, captures, 1); + if (match_len < 0) { + pos += 1; + continue; + } + captures[0] = (capture_t){ + .index = pos, .length = match_len, + .occupied = true, .recursive = false, + }; + + Text_t replacement_text = apply_backrefs(text, recursive ? 
pattern : Text(""), replacement, backref_pat, captures); + if (pos > nonmatching_pos) { + Text_t before_slice = Text$slice(text, I(nonmatching_pos+1), I(pos)); + ret = Text$concat(ret, before_slice, replacement_text); + } else { + ret = Text$concat(ret, replacement_text); + } + nonmatching_pos = pos + match_len; + pos += MAX(match_len, 1); + } + if (nonmatching_pos < text.length) { + Text_t last_slice = Text$slice(text, I(nonmatching_pos+1), I(text.length)); + ret = Text$concat(ret, last_slice); + } + return ret; +} + +public Text_t Text$trim(Text_t text, Pattern_t pattern, bool trim_left, bool trim_right) +{ + int64_t first = 0, last = text.length-1; + if (trim_left) { + int64_t match_len = match(text, 0, pattern, 0, NULL, 0); + if (match_len > 0) + first = match_len; + } + + if (trim_right) { + for (int64_t i = text.length-1; i >= first; i--) { + int64_t match_len = match(text, i, pattern, 0, NULL, 0); + if (match_len > 0 && i + match_len == text.length) + last = i-1; + } + } + return Text$slice(text, I(first+1), I(last+1)); +} + +public Text_t Text$map(Text_t text, Pattern_t pattern, Closure_t fn) +{ + Text_t ret = {.length=0}; + + int32_t first_grapheme = Text$get_grapheme(pattern, 0); + bool find_first = (first_grapheme != '{' + && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK) + && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); + + TextIter_t text_state = {0, 0}; + int64_t nonmatching_pos = 0; + + Text_t (*text_mapper)(Text_t, void*) = fn.fn; + for (int64_t pos = 0; pos < text.length; pos++) { + // Optimization: quickly skip ahead to first char in pattern: + if (find_first) { + while (pos < text.length && Text$get_grapheme_fast(text, &text_state, pos) != first_grapheme) + ++pos; + } + + int64_t match_len = match(text, pos, pattern, 0, NULL, 0); + if (match_len < 0) continue; + + Text_t replacement = text_mapper(Text$slice(text, I(pos+1), I(pos+match_len)), fn.userdata); + if (pos > nonmatching_pos) { + Text_t 
before_slice = Text$slice(text, I(nonmatching_pos+1), I(pos)); + ret = Text$concat(ret, before_slice, replacement); + } else { + ret = Text$concat(ret, replacement); + } + nonmatching_pos = pos + match_len; + pos += (match_len - 1); + } + if (nonmatching_pos < text.length) { + Text_t last_slice = Text$slice(text, I(nonmatching_pos+1), I(text.length)); + ret = Text$concat(ret, last_slice); + } + return ret; +} + +public Text_t Text$replace_all(Text_t text, Table_t replacements, Text_t backref_pat, bool recursive) +{ + if (replacements.entries.length == 0) return text; + + Text_t ret = {.length=0}; + + int64_t nonmatch_pos = 0; + for (int64_t pos = 0; pos < text.length; ) { + // Find the first matching pattern at this position: + for (int64_t i = 0; i < replacements.entries.length; i++) { + Pattern_t pattern = *(Pattern_t*)(replacements.entries.data + i*replacements.entries.stride); + capture_t captures[MAX_BACKREFS] = {}; + int64_t len = match(text, pos, pattern, 0, captures, 1); + if (len < 0) continue; + captures[0].index = pos; + captures[0].length = len; + + // If we skipped over some non-matching text before finding a match, insert it here: + if (pos > nonmatch_pos) { + Text_t before_slice = Text$slice(text, I(nonmatch_pos+1), I(pos)); + ret = Text$concat(ret, before_slice); + } + + // Concatenate the replacement: + Text_t replacement = *(Text_t*)(replacements.entries.data + i*replacements.entries.stride + sizeof(Text_t)); + Text_t replacement_text = apply_backrefs(text, recursive ? 
pattern : Text(""), replacement, backref_pat, captures); + ret = Text$concat(ret, replacement_text); + pos += MAX(len, 1); + nonmatch_pos = pos; + goto next_pos; + } + + pos += 1; + next_pos: + continue; + } + + if (nonmatch_pos <= text.length) { + Text_t last_slice = Text$slice(text, I(nonmatch_pos+1), I(text.length)); + ret = Text$concat(ret, last_slice); + } + return ret; +} + +public Array_t Text$split(Text_t text, Pattern_t pattern) +{ + if (text.length == 0) // special case + return (Array_t){.length=0}; + + if (pattern.length == 0) // special case + return Text$clusters(text); + + Array_t chunks = {}; + + Int_t i = I_small(1); + for (;;) { + int64_t len = 0; + Int_t found = Text$find(text, pattern, i, &len); + if (I_is_zero(found)) break; + Text_t chunk = Text$slice(text, i, Int$minus(found, I_small(1))); + Array$insert(&chunks, &chunk, I_small(0), sizeof(Text_t)); + i = Int$plus(found, I(MAX(len, 1))); + } + + Text_t last_chunk = Text$slice(text, i, I(text.length)); + Array$insert(&chunks, &last_chunk, I_small(0), sizeof(Text_t)); + + return chunks; +} + + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/stdlib/patterns.h b/stdlib/patterns.h new file mode 100644 index 00000000..804fb286 --- /dev/null +++ b/stdlib/patterns.h @@ -0,0 +1,33 @@ +#pragma once + +// The type representing text patterns for pattern matching. + +#include +#include +#include + +#include "datatypes.h" +#include "integers.h" +#include "types.h" + +#define Pattern(text) ((Pattern_t)Text(text)) +#define Patterns(...) 
((Pattern_t)Texts(__VA_ARGS__)) + +Text_t Text$replace(Text_t str, Pattern_t pat, Text_t replacement, Pattern_t backref_pat, bool recursive); +Pattern_t Pattern$escape_text(Text_t text); +Text_t Text$replace_all(Text_t text, Table_t replacements, Pattern_t backref_pat, bool recursive); +Array_t Text$split(Text_t text, Pattern_t pattern); +Text_t Text$trim(Text_t text, Pattern_t pattern, bool trim_left, bool trim_right); +Int_t Text$find(Text_t text, Pattern_t pattern, Int_t i, int64_t *match_length); +Array_t Text$find_all(Text_t text, Pattern_t pattern); +PUREFUNC bool Text$has(Text_t text, Pattern_t pattern); +PUREFUNC bool Text$matches(Text_t text, Pattern_t pattern); +Text_t Text$map(Text_t text, Pattern_t pattern, Closure_t fn); + +#define Pattern$hash Text$hash +#define Pattern$compare Text$compare +#define Pattern$equal Text$equal + +extern const TypeInfo Pattern$info; + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/stdlib/pointers.c b/stdlib/pointers.c new file mode 100644 index 00000000..1ad9f407 --- /dev/null +++ b/stdlib/pointers.c @@ -0,0 +1,84 @@ +// Type infos and methods for Pointer types +#include +#include +#include +#include +#include +#include +#include + +#include "metamethods.h" +#include "text.h" +#include "types.h" +#include "util.h" + +typedef struct recursion_s { + const void *ptr; + struct recursion_s *next; +} recursion_t; + +public Text_t Pointer$as_text(const void *x, bool colorize, const TypeInfo *type) { + auto ptr_info = type->PointerInfo; + if (!x) { + Text_t typename = generic_as_text(NULL, false, ptr_info.pointed); + Text_t text; + if (colorize) + text = Text$concat(Text("\x1b[34;1m"), Text$from_str(ptr_info.sigil), typename, Text("\x1b[m")); + else + text = Text$concat(Text$from_str(ptr_info.sigil), typename); + return text; + } + const void *ptr = *(const void**)x; + if (!ptr) { + Text_t typename = generic_as_text(NULL, false, ptr_info.pointed); + if (colorize) + return Text$concat(Text("\x1b[34;1m!"), typename, 
Text("\x1b[m")); + else + return Text$concat(Text("!"), typename); + } + + // Check for recursive references, so if `x.foo = x`, then it prints as + // `@Foo{foo=@..1}` instead of overflowing the stack: + static recursion_t *recursion = NULL; + int32_t depth = 0; + for (recursion_t *r = recursion; r; r = r->next) { + ++depth; + if (r->ptr == ptr) { + Text_t text = Text$concat( + colorize ? Text("\x1b[34;1m") : Text(""), + Text$from_str(ptr_info.sigil), + Text(".."), + Int32$as_text(&depth, false, &Int32$info), + colorize ? Text("\x1b[m") : Text("")); + return text; + } + } + + Text_t pointed; + { // Stringify with this pointer flagged as a recursive one: + recursion_t my_recursion = {.ptr=ptr, .next=recursion}; + recursion = &my_recursion; + pointed = generic_as_text(ptr, colorize, ptr_info.pointed); + recursion = recursion->next; + } + Text_t text; + if (colorize) + text = Text$concat(Text("\x1b[34;1m"), Text$from_str(ptr_info.sigil), Text("\x1b[m"), pointed); + else + text = Text$concat(Text$from_str(ptr_info.sigil), pointed); + return text; +} + +PUREFUNC public int32_t Pointer$compare(const void *x, const void *y, const TypeInfo *type) { + (void)type; + const void *xp = *(const void**)x, *yp = *(const void**)y; + return (xp > yp) - (xp < yp); +} + +PUREFUNC public bool Pointer$equal(const void *x, const void *y, const TypeInfo *type) { + (void)type; + const void *xp = *(const void**)x, *yp = *(const void**)y; + return xp == yp; +} + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/stdlib/pointers.h b/stdlib/pointers.h new file mode 100644 index 00000000..faa95316 --- /dev/null +++ b/stdlib/pointers.h @@ -0,0 +1,19 @@ +#pragma once + +// Type infos and methods for Pointer types + +#include +#include + +#include "types.h" +#include "util.h" + +Text_t Pointer$as_text(const void *x, bool colorize, const TypeInfo *type); +PUREFUNC int32_t Pointer$compare(const void *x, const void *y, const TypeInfo *type); +PUREFUNC bool Pointer$equal(const void *x, const 
void *y, const TypeInfo *type); + +#define Null(t) (t*)NULL +#define POINTER_TYPE(_sigil, _pointed) (&(TypeInfo){\ + .size=sizeof(void*), .align=alignof(void*), .tag=PointerInfo, .PointerInfo.sigil=_sigil, .PointerInfo.pointed=_pointed}) + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/stdlib/ranges.c b/stdlib/ranges.c new file mode 100644 index 00000000..9dfd1efe --- /dev/null +++ b/stdlib/ranges.c @@ -0,0 +1,63 @@ +// Functions that operate on numeric ranges + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "integers.h" +#include "text.h" +#include "types.h" +#include "util.h" + + +PUREFUNC static int32_t Range$compare(const Range_t *x, const Range_t *y, const TypeInfo *type) +{ + (void)type; + if (x == y) return 0; + int32_t diff = Int$compare(&x->first, &y->first, &Int$info); + if (diff != 0) return diff; + diff = Int$compare(&x->last, &y->last, &Int$info); + if (diff != 0) return diff; + return Int$compare(&x->step, &y->step, &Int$info); +} + +PUREFUNC static bool Range$equal(const Range_t *x, const Range_t *y, const TypeInfo *type) +{ + (void)type; + if (x == y) return true; + return Int$equal(&x->first, &y->first, &Int$info) && Int$equal(&x->last, &y->last, &Int$info) && Int$equal(&x->step, &y->step, &Int$info); +} + +static Text_t Range$as_text(const Range_t *r, bool use_color, const TypeInfo *type) +{ + (void)type; + if (!r) return Text("Range"); + + return Text$format(use_color ? 
"\x1b[0;1mRange\x1b[m(first=%r, last=%r, step=%r)" + : "Range(first=%r, last=%r, step=%r)", + Int$as_text(&r->first, use_color, &Int$info), Int$as_text(&r->last, use_color, &Int$info), + Int$as_text(&r->step, use_color, &Int$info)); +} + +PUREFUNC public Range_t Range$reversed(Range_t r) +{ + return (Range_t){r.last, r.first, Int$negative(r.step)}; +} + +PUREFUNC public Range_t Range$by(Range_t r, Int_t step) +{ + return (Range_t){r.first, r.last, Int$times(step, r.step)}; +} + +public const TypeInfo Range = {sizeof(Range_t), __alignof(Range_t), {.tag=CustomInfo, .CustomInfo={ + .as_text=(void*)Range$as_text, + .compare=(void*)Range$compare, + .equal=(void*)Range$equal, +}}}; + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/stdlib/ranges.h b/stdlib/ranges.h new file mode 100644 index 00000000..2a4f1d68 --- /dev/null +++ b/stdlib/ranges.h @@ -0,0 +1,10 @@ +#pragma once + +// Ranges represent numeric ranges + +PUREFUNC Range_t Range$reversed(Range_t r); +PUREFUNC Range_t Range$by(Range_t r, Int_t step); + +extern const TypeInfo Range; + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1 diff --git a/stdlib/shell.c b/stdlib/shell.c new file mode 100644 index 00000000..36b6a9ad --- /dev/null +++ b/stdlib/shell.c @@ -0,0 +1,67 @@ +// A lang for Shell Command Language +#include +#include + +#include "arrays.h" +#include "integers.h" +#include "patterns.h" +#include "shell.h" +#include "text.h" +#include "types.h" +#include "util.h" + +public Shell_t Shell$escape_text(Text_t text) +{ + // TODO: optimize for ASCII and short strings + Array_t shell_graphemes = {.atomic=1}; +#define add_char(c) Array$insert(&shell_graphemes, (uint32_t[1]){c}, I_small(0), sizeof(uint32_t)) + add_char('\''); + const char *text_utf8 = Text$as_c_string(text); + for (const char *p = text_utf8; *p; p++) { + if (*p == '\'') { + add_char('\''); + add_char('"'); + add_char('\''); + add_char('"'); + add_char('\''); + } else + add_char((uint8_t)*p); + } + add_char('\''); +#undef add_char + return 
(Text_t){.length=shell_graphemes.length, .tag=TEXT_GRAPHEMES, .graphemes=shell_graphemes.data}; +} + +public Text_t Shell$run(Shell_t command, int32_t *status) +{ + const char *cmd_str = Text$as_c_string(command); + FILE *prog = popen(cmd_str, "r"); + + const int chunk_size = 256; + char *buf = GC_MALLOC_ATOMIC(chunk_size); + Text_t output = Text(""); + size_t just_read; + do { + just_read = fread(buf, sizeof(char), chunk_size, prog); + if (just_read > 0) { + output = Texts(output, Text$from_strn(buf, just_read)); + buf = GC_MALLOC_ATOMIC(chunk_size); + } + } while (just_read > 0); + + if (status) + *status = WEXITSTATUS(pclose(prog)); + else + pclose(prog); + + return Text$trim(output, Pattern("{1 nl}"), false, true); +} + +public const TypeInfo Shell$info = { + .size=sizeof(Shell_t), + .align=__alignof__(Shell_t), + .tag=TextInfo, + .TextInfo={.lang="Shell"}, +}; + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/stdlib/shell.h b/stdlib/shell.h new file mode 100644 index 00000000..48c59abc --- /dev/null +++ b/stdlib/shell.h @@ -0,0 +1,25 @@ +#pragma once + +// A lang for Shell Command Language + +#include +#include + +#include "types.h" +#include "datatypes.h" + +#define Shell_t Text_t +#define Shell(text) ((Shell_t)Text(text)) +#define Shells(...) ((Shell_t)Texts(__VA_ARGS__)) + +Text_t Shell$run(Shell_t command, int32_t *status); +Shell_t Shell$escape_text(Text_t text); + +#define Shell$hash Text$hash +#define Shell$compare Text$compare +#define Shell$equal Text$equal + +extern const TypeInfo Shell$info; + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 + diff --git a/stdlib/siphash-internals.h b/stdlib/siphash-internals.h new file mode 100644 index 00000000..d1906be4 --- /dev/null +++ b/stdlib/siphash-internals.h @@ -0,0 +1,127 @@ +#pragma once + +// This file holds the internals for the SipHash implementation. For a few +// cases, we want to include this for incrementally computing hashes. 
+// Otherwise, it suffices to just use the siphash24() function from siphash.h + +#include +#include +#include + +#include "siphash.h" + +/* + Copyright (c) 2013 Marek Majkowski + Copyright (c) 2018 Samantha McVey + Copyright (c) 2024 Bruce Hill + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
+ + + Original location: + https://github.com/majek/csiphash/ + + Original solution inspired by code from: + Samuel Neves (supercop/crypto_auth/siphash24/little) + djb (supercop/crypto_auth/siphash24/little2) + Jean-Philippe Aumasson (https://131002.net/siphash/siphash24.c) + + Extensive modifications for MoarVM by Samantha McVey + + Further modifications for Tomo by Bruce Hill +*/ +struct siphash { + uint64_t v0; + uint64_t v1; + uint64_t v2; + uint64_t v3; + uint64_t b; +}; +typedef struct siphash siphash; +#define ROTATE(x, b) (uint64_t)( ((x) << (b)) | ( (x) >> (64 - (b))) ) + +#define HALF_ROUND(a,b,c,d,s,t) \ + a += b; c += d; \ + b = ROTATE(b, s) ^ a; \ + d = ROTATE(d, t) ^ c; \ + a = ROTATE(a, 32); + +#define DOUBLE_ROUND(v0,v1,v2,v3) \ + HALF_ROUND(v0,v1,v2,v3,13,16); \ + HALF_ROUND(v2,v1,v0,v3,17,21); \ + HALF_ROUND(v0,v1,v2,v3,13,16); \ + HALF_ROUND(v2,v1,v0,v3,17,21); + +static inline void siphashinit (siphash *sh, size_t src_sz) { + const uint64_t k0 = TOMO_HASH_KEY[0]; + const uint64_t k1 = TOMO_HASH_KEY[1]; + sh->b = (uint64_t)src_sz << 56; + sh->v0 = k0 ^ 0x736f6d6570736575ULL; + sh->v1 = k1 ^ 0x646f72616e646f6dULL; + sh->v2 = k0 ^ 0x6c7967656e657261ULL; + sh->v3 = k1 ^ 0x7465646279746573ULL; +} +static inline void siphashadd64bits (siphash *sh, const uint64_t in) { + const uint64_t mi = in; + sh->v3 ^= mi; + DOUBLE_ROUND(sh->v0,sh->v1,sh->v2,sh->v3); + sh->v0 ^= mi; +} +#pragma GCC diagnostic ignored "-Winline" +static inline uint64_t siphashfinish_last_part (siphash *sh, uint64_t t) { + sh->b |= t; + sh->v3 ^= sh->b; + DOUBLE_ROUND(sh->v0,sh->v1,sh->v2,sh->v3); + sh->v0 ^= sh->b; + sh->v2 ^= 0xff; + DOUBLE_ROUND(sh->v0,sh->v1,sh->v2,sh->v3); + DOUBLE_ROUND(sh->v0,sh->v1,sh->v2,sh->v3); + return (sh->v0 ^ sh->v1) ^ (sh->v2 ^ sh->v3); +} +/* This union helps us avoid doing weird things with pointers that can cause old + * compilers like GCC 4 to generate bad code. 
In addition it is nicely more C + * standards compliant to keep type punning to a minimum. */ +union SipHash64_union { + uint64_t u64; + uint32_t u32; + uint8_t u8[8]; +}; +static inline uint64_t siphashfinish (siphash *sh, const uint8_t *src, size_t src_sz) { + union SipHash64_union t = { 0 }; + switch (src_sz) { + /* Falls through */ + case 7: t.u8[6] = src[6]; + /* Falls through */ + case 6: t.u8[5] = src[5]; + /* Falls through */ + case 5: t.u8[4] = src[4]; + /* Falls through */ + case 4: t.u8[3] = src[3]; + /* Falls through */ + case 3: t.u8[2] = src[2]; + /* Falls through */ + case 2: t.u8[1] = src[1]; + /* Falls through */ + case 1: t.u8[0] = src[0]; + default: break; + } + return siphashfinish_last_part(sh, t.u64); +} + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/stdlib/siphash.c b/stdlib/siphash.c new file mode 100644 index 00000000..671fbad6 --- /dev/null +++ b/stdlib/siphash.c @@ -0,0 +1,77 @@ +#include +#include +#include + +#include "siphash.h" +#include "util.h" + +public uint64_t TOMO_HASH_KEY[2] = {23, 42}; // Randomized in tomo_init() + +/* + Copyright (c) 2013 Marek Majkowski + Copyright (c) 2018 Samantha McVey + Copyright (c) 2024 Bruce Hill + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + + + Original location: + https://github.com/majek/csiphash/ + + Original solution inspired by code from: + Samuel Neves (supercop/crypto_auth/siphash24/little) + djb (supercop/crypto_auth/siphash24/little2) + Jean-Philippe Aumasson (https://131002.net/siphash/siphash24.c) + + Extensive modifications for MoarVM by Samantha McVey + + Further modifications for Tomo by Bruce Hill +*/ + +#include "siphash-internals.h" + +public uint64_t siphash24(const uint8_t *src, size_t src_sz) { + siphash sh; + if ((uint64_t)src % __alignof__(uint64_t) == 0) { +#pragma GCC diagnostic ignored "-Wcast-align" + const uint64_t *in = (uint64_t*)src; + /* Find largest src_sz evenly divisible by 8 bytes. */ + const ptrdiff_t src_sz_nearest_8bits = ((ptrdiff_t)src_sz >> 3) << 3; + const uint64_t *goal = (uint64_t*)(src + src_sz_nearest_8bits); + siphashinit(&sh, src_sz); + src_sz -= (size_t)src_sz_nearest_8bits; + while (in < goal) { + siphashadd64bits(&sh, *in); + in++; + } + return siphashfinish(&sh, (uint8_t *)in, src_sz); + } else { + const uint8_t *in = src; + siphashinit(&sh, src_sz); + while (src_sz >= 8) { + uint64_t in_64; + memcpy(&in_64, in, sizeof(uint64_t)); + siphashadd64bits(&sh, in_64); + in += 8; src_sz -= 8; + } + return siphashfinish(&sh, (uint8_t *)in, src_sz); + } +} + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/stdlib/siphash.h b/stdlib/siphash.h new file mode 100644 index 00000000..8104a306 --- /dev/null +++ b/stdlib/siphash.h @@ -0,0 +1,13 @@ +#pragma once + +// An implementation of the SipHash algorithm. 
+ +#include +#include + +// This value will be randomized on startup in tomo_init(): +extern uint64_t TOMO_HASH_KEY[2]; + +uint64_t siphash24(const uint8_t *src, size_t src_sz); + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/stdlib/stdlib.c b/stdlib/stdlib.c new file mode 100644 index 00000000..b8e40a54 --- /dev/null +++ b/stdlib/stdlib.c @@ -0,0 +1,274 @@ +// Built-in functions + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "files.h" +#include "integers.h" +#include "metamethods.h" +#include "patterns.h" +#include "siphash.h" +#include "tables.h" +#include "text.h" +#include "util.h" + +public void tomo_init(void) +{ + GC_INIT(); + USE_COLOR = getenv("COLOR") ? strcmp(getenv("COLOR"), "1") == 0 : isatty(STDOUT_FILENO); + getrandom(TOMO_HASH_KEY, sizeof(TOMO_HASH_KEY), 0); + unsigned int seed; + getrandom(&seed, sizeof(seed), 0); + srand(seed); + srand48(seed); + Int$init_random(seed); + + if (register_printf_specifier('k', printf_text, printf_text_size)) + errx(1, "Couldn't set printf specifier"); +} + +void print_stack_trace(FILE *out, int start, int stop) +{ + // Print stack trace: + fprintf(out, "\x1b[34m"); + fflush(out); + void *array[1024]; + int64_t size = (int64_t)backtrace(array, sizeof(array)/sizeof(array[0])); + char **strings = strings = backtrace_symbols(array, size); + for (int64_t i = start; i < size - stop; i++) { + char *filename = strings[i]; + const char *cmd = heap_strf("addr2line -e %.*s -fisp | sed 's/\\$/./g;s/ at /() at /' >&2", strcspn(filename, "("), filename); + FILE *fp = popen(cmd, "w"); + if (fp) { + char *paren = strchrnul(strings[i], '('); + fprintf(fp, "%.*s\n", strcspn(paren + 1, ")"), paren + 1); + } + pclose(fp); + } + fprintf(out, "\x1b[m"); + fflush(out); +} + +__attribute__((format(printf, 1, 2))) +public _Noreturn void fail(const char *fmt, ...) 
+{ + fflush(stdout); + if (USE_COLOR) fputs("\x1b[31;7m ==================== ERROR ==================== \n\n\x1b[0;1m", stderr); + else fputs("==================== ERROR ====================\n\n", stderr); + va_list args; + va_start(args, fmt); + vfprintf(stderr, fmt, args); + if (USE_COLOR) fputs("\x1b[m", stderr); + fputs("\n\n", stderr); + va_end(args); + print_stack_trace(stderr, 2, 4); + fflush(stderr); + raise(SIGABRT); + _exit(1); +} + +__attribute__((format(printf, 4, 5))) +public _Noreturn void fail_source(const char *filename, int64_t start, int64_t end, const char *fmt, ...) +{ + if (USE_COLOR) fputs("\n\x1b[31;7m ==================== ERROR ==================== \n\n\x1b[0;1m", stderr); + else fputs("\n==================== ERROR ====================\n\n", stderr); + + va_list args; + va_start(args, fmt); + vfprintf(stderr, fmt, args); + va_end(args); + + file_t *file = filename ? load_file(filename) : NULL; + if (filename && file) { + fputs("\n", stderr); + highlight_error(file, file->text+start, file->text+end, "\x1b[31;1m", 2, USE_COLOR); + fputs("\n", stderr); + } + if (USE_COLOR) fputs("\x1b[m", stderr); + + print_stack_trace(stderr, 2, 4); + fflush(stderr); + raise(SIGABRT); + _exit(1); +} + +public Text_t builtin_last_err() +{ + return Text$from_str(strerror(errno)); +} + +static int TEST_DEPTH = 0; +static file_t *file = NULL; + +public void start_test(const char *filename, int64_t start, int64_t end) +{ + if (filename && (file == NULL || strcmp(file->filename, filename) != 0)) + file = load_file(filename); + + if (filename && file) { + for (int i = 0; i < 3*TEST_DEPTH; i++) fputc(' ', stderr); + + int64_t first_line_len = (int64_t)strcspn(file->text + start, "\r\n"); + fprintf(stderr, USE_COLOR ? "\x1b[33;1m>> \x1b[m%.*s\n" : ">> %.*s\n", first_line_len, file->text + start); + + // For multi-line expressions, dedent each and print it on a new line with ".. 
" in front: + if (end > start + first_line_len) { + int64_t line_num = get_line_number(file, file->text + start); + const char *line_start = get_line(file, line_num); + int64_t indent_len = (int64_t)strspn(line_start, " \t"); + for (const char *line = file->text + start + first_line_len; line < file->text + end; line += strcspn(line, "\r\n")) { + line += strspn(line, "\r\n"); + if ((int64_t)strspn(line, " \t") >= indent_len) + line += indent_len; + fprintf(stderr, USE_COLOR ? "\x1b[33m.. \x1b[m%.*s\n" : ".. %.*s\n", strcspn(line, "\r\n"), line); + } + } + } + ++TEST_DEPTH; +} + +public void end_test(const void *expr, const TypeInfo *type, const char *expected, const char *filename, int64_t start, int64_t end) +{ + (void)filename; + (void)start; + (void)end; + --TEST_DEPTH; + if (!expr || !type) return; + + Text_t expr_text = generic_as_text(expr, USE_COLOR, type); + Text_t type_name = generic_as_text(NULL, false, type); + + for (int i = 0; i < 3*TEST_DEPTH; i++) fputc(' ', stderr); + fprintf(stderr, USE_COLOR ? "\x1b[2m=\x1b[0m %k \x1b[2m: %k\x1b[m\n" : "= %k : %k\n", &expr_text, &type_name); + if (expected && expected[0]) { + Text_t expected_text = Text$from_str(expected); + Text_t expr_plain = USE_COLOR ? generic_as_text(expr, false, type) : expr_text; + bool success = Text$equal(&expr_plain, &expected_text); + if (!success) { + Int_t colon = Text$find(expected_text, Text(":"), I_small(1), NULL); + if (colon.small != I_small(0).small) { + Text_t with_type = Text$concat(expr_plain, Text(" : "), type_name); + success = Text$equal(&with_type, &expected_text); + } + } + + if (!success) { + fprintf(stderr, + USE_COLOR + ? 
"\n\x1b[31;7m ==================== TEST FAILED ==================== \x1b[0;1m\n\nExpected: \x1b[1;32m%s\x1b[0m\n\x1b[1m But got:\x1b[m %k\n\n" + : "\n==================== TEST FAILED ====================\nExpected: %s\n\n But got: %k\n\n", + expected, &expr_text); + + print_stack_trace(stderr, 2, 4); + fflush(stderr); + raise(SIGABRT); + } + } +} + +public void say(Text_t text, bool newline) +{ + Text$print(stdout, text); + if (newline) + fputc('\n', stdout); + fflush(stdout); +} + +public _Noreturn void tomo_exit(Text_t text, int32_t status) +{ + if (text.length > 0) + say(text, true); + _exit(status); +} + +public Text_t ask(Text_t prompt, bool bold, bool force_tty) +{ + Text_t ret = Text(""); + FILE *out = stdout; + FILE *in = stdin; + + char *line = NULL; + size_t bufsize = 0; + ssize_t length = 0; + char *gc_input = NULL; + + if (force_tty && !isatty(STDOUT_FILENO)) { + out = fopen("/dev/tty", "w"); + if (!out) goto cleanup; + } + + if (bold) fputs("\x1b[1m", out); + Text$print(out, prompt); + if (bold) fputs("\x1b[m", out); + fflush(out); + + if (force_tty && !isatty(STDIN_FILENO)) { + in = fopen("/dev/tty", "r"); + if (!in) { + fputs("\n", out); // finish the line, since the user can't + goto cleanup; + } + } + + length = getline(&line, &bufsize, in); + if (length == -1) { + fputs("\n", out); // finish the line, since we didn't get any input + goto cleanup; + } + + if (length > 0 && line[length-1] == '\n') { + line[length-1] = '\0'; + --length; + } + + gc_input = GC_MALLOC_ATOMIC((size_t)(length + 1)); + memcpy(gc_input, line, (size_t)(length + 1)); + + ret = Text$from_strn(gc_input, (size_t)(length)); + + cleanup: + if (out && out != stdout) fclose(out); + if (in && in != stdin) fclose(in); + return ret; +} + +public bool pop_flag(char **argv, int *i, const char *flag, Text_t *result) +{ + if (argv[*i][0] != '-' || argv[*i][1] != '-') { + return false; + } else if (streq(argv[*i] + 2, flag)) { + *result = (Text_t){.length=0}; + argv[*i] = NULL; + *i += 1; + 
return true; + } else if (strncmp(argv[*i] + 2, "no-", 3) == 0 && streq(argv[*i] + 5, flag)) { + *result = Text("no"); + argv[*i] = NULL; + *i += 1; + return true; + } else if (strncmp(argv[*i] + 2, flag, strlen(flag)) == 0 && argv[*i][2 + strlen(flag)] == '=') { + *result = Text$from_str(argv[*i] + 2 + strlen(flag) + 1); + argv[*i] = NULL; + *i += 1; + return true; + } else { + return false; + } +} + +public void sleep_num(double seconds) +{ + struct timespec ts; + ts.tv_sec = (time_t)seconds; + ts.tv_nsec = (long)((seconds - (double)ts.tv_sec) * 1e9); + nanosleep(&ts, NULL); +} + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/stdlib/stdlib.h b/stdlib/stdlib.h new file mode 100644 index 00000000..da3ddbf7 --- /dev/null +++ b/stdlib/stdlib.h @@ -0,0 +1,34 @@ +#pragma once + +// Built-in functions + +#include +#include +#include + +#include "datatypes.h" +#include "types.h" +#include "util.h" + +void tomo_init(void); +__attribute__((format(printf, 1, 2))) +_Noreturn void fail(const char *fmt, ...); +__attribute__((format(printf, 4, 5))) +_Noreturn void fail_source(const char *filename, int64_t start, int64_t end, const char *fmt, ...); +Text_t builtin_last_err(); +void start_test(const char *filename, int64_t start, int64_t end); +void end_test(const void *expr, const TypeInfo *type, const char *expected, const char *filename, int64_t start, int64_t end); +#define test(expr, typeinfo, expected, start, end) {\ + start_test(__SOURCE_FILE__, start, end); \ + auto _expr = expr; \ + end_test(&_expr, typeinfo, expected, __SOURCE_FILE__, start, end); } +void say(Text_t text, bool newline); +Text_t ask(Text_t prompt, bool bold, bool force_tty); +_Noreturn void tomo_exit(Text_t text, int32_t status); + +Closure_t spawn(Closure_t fn); +bool pop_flag(char **argv, int *i, const char *flag, Text_t *result); +void print_stack_trace(FILE *out, int start, int stop); +void sleep_num(double seconds); + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git 
a/stdlib/tables.c b/stdlib/tables.c new file mode 100644 index 00000000..e644fd23 --- /dev/null +++ b/stdlib/tables.c @@ -0,0 +1,636 @@ +// table.c - C Hash table implementation +// Copyright 2024 Bruce Hill +// Provided under the MIT license with the Commons Clause +// See included LICENSE for details. + +// Hash table (aka Dictionary) Implementation +// Hash keys and values are stored *by value* +// The hash insertion/lookup implementation is based on Lua's tables, +// which use a chained scatter with Brent's variation. + +#include +#include +#include +#include +#include +#include +#include + +#include "arrays.h" +#include "c_strings.h" +#include "datatypes.h" +#include "memory.h" +#include "metamethods.h" +#include "siphash.h" +#include "tables.h" +#include "text.h" +#include "types.h" +#include "util.h" + +// #define DEBUG_TABLES + +#ifdef DEBUG_TABLES +#define hdebug(fmt, ...) printf("\x1b[2m" fmt "\x1b[m" __VA_OPT__(,) __VA_ARGS__) +#else +#define hdebug(...) (void)0 +#endif + +// Helper accessors for type functions/values: +#define HASH_KEY(t, k) (generic_hash((k), type->TableInfo.key) % ((t).bucket_info->count)) +#define EQUAL_KEYS(x, y) (generic_equal((x), (y), type->TableInfo.key)) +#define END_OF_CHAIN UINT32_MAX + +#define GET_ENTRY(t, i) ((t).entries.data + (t).entries.stride*(i)) + +static const TypeInfo MemoryPointer = { + .size=sizeof(void*), + .align=__alignof__(void*), + .tag=PointerInfo, + .PointerInfo={ + .sigil="@", + .pointed=&Memory$info, + }, +}; + +const TypeInfo CStrToVoidStarTable = { + .size=sizeof(Table_t), + .align=__alignof__(Table_t), + .tag=TableInfo, + .TableInfo={.key=&CString$info, .value=&MemoryPointer}, +}; + +PUREFUNC static inline size_t entry_size(const TypeInfo *info) +{ + size_t size = (size_t)info->TableInfo.key->size; + if (info->TableInfo.value->align > 1 && size % (size_t)info->TableInfo.value->align) + size += (size_t)info->TableInfo.value->align - (size % (size_t)info->TableInfo.value->align); // padding + size += 
(size_t)info->TableInfo.value->size; + if (info->TableInfo.key->align > 1 && size % (size_t)info->TableInfo.key->align) + size += (size_t)info->TableInfo.key->align - (size % (size_t)info->TableInfo.key->align); // padding + return size; +} + +PUREFUNC static inline size_t entry_align(const TypeInfo *info) +{ + return (size_t)MAX(info->TableInfo.key->align, info->TableInfo.value->align); +} + +PUREFUNC static inline size_t value_offset(const TypeInfo *info) +{ + size_t offset = (size_t)info->TableInfo.key->size; + if ((size_t)info->TableInfo.value->align > 1 && offset % (size_t)info->TableInfo.value->align) + offset += (size_t)info->TableInfo.value->align - (offset % (size_t)info->TableInfo.value->align); // padding + return offset; +} + +static inline void hshow(const Table_t *t) +{ + hdebug("{"); + for (uint32_t i = 0; t->bucket_info && i < t->bucket_info->count; i++) { + if (i > 0) hdebug(" "); + if (t->bucket_info->buckets[i].occupied) + hdebug("[%d]=%d(%d)", i, t->bucket_info->buckets[i].index, t->bucket_info->buckets[i].next_bucket); + else + hdebug("[%d]=_", i); + } + hdebug("}\n"); +} + +static void maybe_copy_on_write(Table_t *t, const TypeInfo *type) +{ + if (t->entries.data_refcount != 0) + Array$compact(&t->entries, (int64_t)entry_size(type)); + + if (t->bucket_info && t->bucket_info->data_refcount != 0) { + size_t size = sizeof(bucket_info_t) + sizeof(bucket_t[t->bucket_info->count]); + t->bucket_info = memcpy(GC_MALLOC(size), t->bucket_info, size); + t->bucket_info->data_refcount = 0; + } +} + +// Return address of value or NULL +PUREFUNC public void *Table$get_raw(Table_t t, const void *key, const TypeInfo *type) +{ + assert(type->tag == TableInfo); + if (!key || !t.bucket_info) return NULL; + + uint64_t hash = HASH_KEY(t, key); + hshow(&t); + hdebug("Getting value with initial probe at %u\n", hash); + bucket_t *buckets = t.bucket_info->buckets; + for (uint64_t i = hash; buckets[i].occupied; i = buckets[i].next_bucket) { + hdebug("Checking against 
key in bucket %u\n", i); + void *entry = GET_ENTRY(t, buckets[i].index); + if (EQUAL_KEYS(entry, key)) { + hdebug("Found key!\n"); + return entry + value_offset(type); + } + if (buckets[i].next_bucket == END_OF_CHAIN) + break; + } + return NULL; +} + +PUREFUNC public void *Table$get(Table_t t, const void *key, const TypeInfo *type) +{ + assert(type->tag == TableInfo); + for (const Table_t *iter = &t; iter; iter = iter->fallback) { + void *ret = Table$get_raw(*iter, key, type); + if (ret) return ret; + } + return NULL; +} + +static void Table$set_bucket(Table_t *t, const void *entry, int32_t index, const TypeInfo *type) +{ + assert(t->bucket_info); + hshow(t); + const void *key = entry; + bucket_t *buckets = t->bucket_info->buckets; + uint64_t hash = HASH_KEY(*t, key); + hdebug("Hash value (mod %u) = %u\n", t->bucket_info->count, hash); + bucket_t *bucket = &buckets[hash]; + if (!bucket->occupied) { + hdebug("Got an empty space\n"); + // Empty space: + bucket->occupied = 1; + bucket->index = index; + bucket->next_bucket = END_OF_CHAIN; + hshow(t); + return; + } + + hdebug("Collision detected in bucket %u (entry %u)\n", hash, bucket->index); + + while (buckets[t->bucket_info->last_free].occupied) { + assert(t->bucket_info->last_free > 0); + --t->bucket_info->last_free; + } + + uint64_t collided_hash = HASH_KEY(*t, GET_ENTRY(*t, bucket->index)); + if (collided_hash != hash) { // Collided with a mid-chain entry + hdebug("Hit a mid-chain entry at bucket %u (chain starting at %u)\n", hash, collided_hash); + // Find chain predecessor + uint64_t predecessor = collided_hash; + while (buckets[predecessor].next_bucket != hash) + predecessor = buckets[predecessor].next_bucket; + + // Move mid-chain entry to free space and update predecessor + buckets[predecessor].next_bucket = t->bucket_info->last_free; + buckets[t->bucket_info->last_free] = *bucket; + } else { // Collided with the start of a chain + hdebug("Hit start of a chain\n"); + uint64_t end_of_chain = hash; + while 
(buckets[end_of_chain].next_bucket != END_OF_CHAIN) + end_of_chain = buckets[end_of_chain].next_bucket; + hdebug("Appending to chain\n"); + // Chain now ends on the free space: + buckets[end_of_chain].next_bucket = t->bucket_info->last_free; + bucket = &buckets[t->bucket_info->last_free]; + } + + bucket->occupied = 1; + bucket->index = index; + bucket->next_bucket = END_OF_CHAIN; + hshow(t); +} + +static void hashmap_resize_buckets(Table_t *t, uint32_t new_capacity, const TypeInfo *type) +{ + if (__builtin_expect(new_capacity > TABLE_MAX_BUCKETS, 0)) + fail("Table has exceeded the maximum table size (2^31) and cannot grow further!"); + hdebug("About to resize from %u to %u\n", t->bucket_info ? t->bucket_info->count : 0, new_capacity); + hshow(t); + size_t alloc_size = sizeof(bucket_info_t) + sizeof(bucket_t[new_capacity]); + t->bucket_info = GC_MALLOC_ATOMIC(alloc_size); + memset(t->bucket_info->buckets, 0, sizeof(bucket_t[new_capacity])); + t->bucket_info->count = new_capacity; + t->bucket_info->last_free = new_capacity-1; + // Rehash: + for (int64_t i = 0; i < Table$length(*t); i++) { + hdebug("Rehashing %u\n", i); + Table$set_bucket(t, GET_ENTRY(*t, i), i, type); + } + + hshow(t); + hdebug("Finished resizing\n"); +} + +// Return address of value +#pragma GCC diagnostic ignored "-Wstack-protector" +public void *Table$reserve(Table_t *t, const void *key, const void *value, const TypeInfo *type) +{ + assert(type->tag == TableInfo); + if (!t || !key) return NULL; + hshow(t); + + int64_t key_size = type->TableInfo.key->size, + value_size = type->TableInfo.value->size; + if (!t->bucket_info || t->bucket_info->count == 0) { + hashmap_resize_buckets(t, 4, type); + } else { + // Check if we are clobbering a value: + void *value_home = Table$get_raw(*t, key, type); + if (value_home) { // Update existing slot + // Ensure that `value_home` is still inside t->entries, even if COW occurs + ptrdiff_t offset = value_home - t->entries.data; + maybe_copy_on_write(t, type); + 
value_home = t->entries.data + offset; + + if (value && value_size > 0) + memcpy(value_home, value, (size_t)value_size); + + return value_home; + } + } + // Otherwise add a new entry: + + // Resize buckets if necessary + if (t->entries.length >= (int64_t)t->bucket_info->count) { + uint32_t newsize = (uint32_t)t->bucket_info->count + MIN((uint32_t)t->bucket_info->count, 64); + if (__builtin_expect(newsize > TABLE_MAX_BUCKETS, 0)) + newsize = t->entries.length + 1; + hashmap_resize_buckets(t, newsize, type); + } + + if (!value && value_size > 0) { + for (Table_t *iter = t->fallback; iter; iter = iter->fallback) { + value = Table$get_raw(*iter, key, type); + if (value) break; + } + } + + maybe_copy_on_write(t, type); + + char buf[entry_size(type)]; + memset(buf, 0, sizeof(buf)); + memcpy(buf, key, (size_t)key_size); + if (value && value_size > 0) + memcpy(buf + value_offset(type), value, (size_t)value_size); + else + memset(buf + value_offset(type), 0, (size_t)value_size); + Array$insert(&t->entries, buf, I(0), (int64_t)entry_size(type)); + + int64_t entry_index = t->entries.length-1; + void *entry = GET_ENTRY(*t, entry_index); + Table$set_bucket(t, entry, entry_index, type); + return entry + value_offset(type); +} + +public void Table$set(Table_t *t, const void *key, const void *value, const TypeInfo *type) +{ + assert(type->tag == TableInfo); + (void)Table$reserve(t, key, value, type); +} + +public void Table$remove(Table_t *t, const void *key, const TypeInfo *type) +{ + assert(type->tag == TableInfo); + if (!t || Table$length(*t) == 0) return; + + // TODO: this work doesn't need to be done if the key is already missing + maybe_copy_on_write(t, type); + + // If unspecified, pop the last key: + if (!key) + key = GET_ENTRY(*t, t->entries.length-1); + + // Steps: look up the bucket for the removed key + // If missing, then return immediately + // Swap last key/value into the removed bucket's index1 + // Zero out the last key/value and decrement the count + // Find the 
last key/value's bucket and update its index1 + // Look up the bucket for the removed key + // If bucket is first in chain: + // Move bucket->next to bucket's spot + // zero out bucket->next's old spot + // maybe update lastfree_index1 to second-in-chain's index + // Else: + // set prev->next = bucket->next + // zero out bucket + // maybe update lastfree_index1 to removed bucket's index + + uint64_t hash = HASH_KEY(*t, key); + hdebug("Removing key with hash %u\n", hash); + bucket_t *bucket, *prev = NULL; + for (uint64_t i = hash; t->bucket_info->buckets[i].occupied; i = t->bucket_info->buckets[i].next_bucket) { + if (EQUAL_KEYS(GET_ENTRY(*t, t->bucket_info->buckets[i].index), key)) { + bucket = &t->bucket_info->buckets[i]; + hdebug("Found key to delete in bucket %u\n", i); + goto found_it; + } + if (t->bucket_info->buckets[i].next_bucket == END_OF_CHAIN) + return; + prev = &t->bucket_info->buckets[i]; + } + return; + + found_it:; + assert(bucket->occupied); + + // Always remove the last entry. If we need to remove some other entry, + // swap the other entry into the last position and then remove the last + // entry. 
This disturbs the ordering of the table, but keeps removal O(1) + // instead of O(N) + int64_t last_entry = t->entries.length-1; + if (bucket->index != last_entry) { + hdebug("Removing key/value from the middle of the entries array\n"); + + // Find the bucket that points to the last entry's index: + uint64_t i = HASH_KEY(*t, GET_ENTRY(*t, last_entry)); + while (t->bucket_info->buckets[i].index != last_entry) + i = t->bucket_info->buckets[i].next_bucket; + // Update the bucket to point to the last entry's new home (the space + // where the removed entry currently sits): + t->bucket_info->buckets[i].index = bucket->index; + + // Clobber the entry being removed (in the middle of the array) with + // the last entry: + memcpy(GET_ENTRY(*t, bucket->index), GET_ENTRY(*t, last_entry), entry_size(type)); + } + + // Last entry is being removed, so clear it out to be safe: + memset(GET_ENTRY(*t, last_entry), 0, entry_size(type)); + + Array$remove_at(&t->entries, I(t->entries.length), I(1), (int64_t)entry_size(type)); + + int64_t bucket_to_clear; + if (prev) { // Middle (or end) of a chain + hdebug("Removing from middle of a chain\n"); + bucket_to_clear = (bucket - t->bucket_info->buckets); + prev->next_bucket = bucket->next_bucket; + } else if (bucket->next_bucket != END_OF_CHAIN) { // Start of a chain + hdebug("Removing from start of a chain\n"); + bucket_to_clear = bucket->next_bucket; + *bucket = t->bucket_info->buckets[bucket_to_clear]; + } else { // Empty chain + hdebug("Removing from empty chain\n"); + bucket_to_clear = (bucket - t->bucket_info->buckets); + } + + t->bucket_info->buckets[bucket_to_clear] = (bucket_t){0}; + if (bucket_to_clear > t->bucket_info->last_free) + t->bucket_info->last_free = bucket_to_clear; + + hshow(t); +} + +CONSTFUNC public void *Table$entry(Table_t t, int64_t n) +{ + if (n < 1 || n > Table$length(t)) + return NULL; + return GET_ENTRY(t, n-1); +} + +public void Table$clear(Table_t *t) +{ + memset(t, 0, sizeof(Table_t)); +} + +public Table_t 
Table$sorted(Table_t t, const TypeInfo *type) +{ + Closure_t cmp = (Closure_t){.fn=generic_compare, .userdata=(void*)type->TableInfo.key}; + Array_t entries = Array$sorted(t.entries, cmp, (int64_t)entry_size(type)); + return Table$from_entries(entries, type); +} + +PUREFUNC public bool Table$equal(const Table_t *x, const Table_t *y, const TypeInfo *type) +{ + if (x == y) return true; + + assert(type->tag == TableInfo); + if (Table$length(*x) != Table$length(*y)) + return false; + + if ((x->fallback != NULL) != (y->fallback != NULL)) + return false; + + return (Table$compare(x, y, type) == 0); +} + +PUREFUNC public int32_t Table$compare(const Table_t *x, const Table_t *y, const TypeInfo *type) +{ + if (x == y) return 0; + + assert(type->tag == TableInfo); + auto table = type->TableInfo; + if (x->entries.length == 0) + return 0; + else if (x->entries.length != y->entries.length) + return (x->entries.length > y->entries.length) - (x->entries.length < y->entries.length); + + for (int64_t i = 0; i < x->entries.length; i++) { + void *x_key = x->entries.data + x->entries.stride * i; + void *y_key = y->entries.data + y->entries.stride * i; + int32_t diff = generic_compare(x_key, y_key, table.key); + if (diff != 0) return diff; + void *x_value = x_key + value_offset(type); + void *y_value = y_key + value_offset(type); + diff = generic_compare(x_value, y_value, table.value); + if (diff != 0) return diff; + } + + if (!x->fallback != !y->fallback) { + return (!x->fallback) - (!y->fallback); + } else if (x->fallback && y->fallback) { + return generic_compare(x->fallback, y->fallback, type); + } + + return 0; +} + +PUREFUNC public uint64_t Table$hash(const Table_t *t, const TypeInfo *type) +{ + assert(type->tag == TableInfo); + // Table hashes are computed as: + // hash(hash(t.keys), hash(t.values), hash(t.fallback), hash(t.default)) + // Where fallback and default hash to zero if absent + auto table = type->TableInfo; + uint64_t components[] = { + Array$hash(&t->entries, 
Array$info(table.key)), + Array$hash(&t->entries + value_offset(type), Array$info(table.value)), + t->fallback ? Table$hash(t->fallback, type) : 0, + }; + return siphash24((void*)&components, sizeof(components)); +} + +public Text_t Table$as_text(const Table_t *t, bool colorize, const TypeInfo *type) +{ + assert(type->tag == TableInfo); + auto table = type->TableInfo; + + if (!t) { + if (table.value != &Void$info) + return Text$concat( + Text("{"), + generic_as_text(NULL, false, table.key), + Text(":"), + generic_as_text(NULL, false, table.value), + Text("}")); + else + return Text$concat( + Text("{"), + generic_as_text(NULL, false, table.key), + Text("}")); + } + + int64_t val_off = (int64_t)value_offset(type); + Text_t text = Text("{"); + for (int64_t i = 0, length = Table$length(*t); i < length; i++) { + if (i > 0) + text = Text$concat(text, Text(", ")); + void *entry = GET_ENTRY(*t, i); + text = Text$concat(text, generic_as_text(entry, colorize, table.key)); + if (table.value != &Void$info) + text = Text$concat(text, Text(":"), generic_as_text(entry + val_off, colorize, table.value)); + } + + if (t->fallback) { + text = Text$concat(text, Text("; fallback="), Table$as_text(t->fallback, colorize, type)); + } + + text = Text$concat(text, Text("}")); + return text; +} + +public Table_t Table$from_entries(Array_t entries, const TypeInfo *type) +{ + assert(type->tag == TableInfo); + if (entries.length == 0) + return (Table_t){}; + + Table_t t = {}; + int64_t length = entries.length + entries.length / 4; + size_t alloc_size = sizeof(bucket_info_t) + sizeof(bucket_t[length]); + t.bucket_info = GC_MALLOC_ATOMIC(alloc_size); + memset(t.bucket_info->buckets, 0, sizeof(bucket_t[length])); + t.bucket_info->count = length; + t.bucket_info->last_free = length-1; + + size_t offset = value_offset(type); + for (int64_t i = 0; i < entries.length; i++) { + void *key = entries.data + i*entries.stride; + Table$set(&t, key, key + offset, type); + } + return t; +} + +// Overlap is 
"set intersection" in formal terms +public Table_t Table$overlap(Table_t a, Table_t b, const TypeInfo *type) +{ + // Return a table such that t[k]==a[k] for all k such that a:has(k), b:has(k), and a[k]==b[k] + Table_t result = {}; + const size_t offset = value_offset(type); + for (int64_t i = 0; i < Table$length(a); i++) { + void *key = GET_ENTRY(a, i); + void *a_value = key + offset; + void *b_value = Table$get(b, key, type); + if (b_value && generic_equal(a_value, b_value, type->TableInfo.value)) + Table$set(&result, key, a_value, type); + } + + if (a.fallback) { + result.fallback = new(Table_t); + *result.fallback = Table$overlap(*a.fallback, b, type); + } + + return result; +} + +// With is "set union" in formal terms +public Table_t Table$with(Table_t a, Table_t b, const TypeInfo *type) +{ + // return a table such that t[k]==b[k] for all k such that b:has(k), and t[k]==a[k] for all k such that a:has(k) and not b:has(k) + Table_t result = {}; + const size_t offset = value_offset(type); + for (int64_t i = 0; i < Table$length(a); i++) { + void *key = GET_ENTRY(a, i); + Table$set(&result, key, key + offset, type); + } + for (int64_t i = 0; i < Table$length(b); i++) { + void *key = GET_ENTRY(b, i); + Table$set(&result, key, key + offset, type); + } + + if (a.fallback && b.fallback) { + result.fallback = new(Table_t); + *result.fallback = Table$with(*a.fallback, *b.fallback, type); + } else { + result.fallback = a.fallback ? 
a.fallback : b.fallback; + } + + return result; +} + +// Without is "set difference" in formal terms +public Table_t Table$without(Table_t a, Table_t b, const TypeInfo *type) +{ + // Return a table such that t[k]==a[k] for all k such that not b:has(k) or b[k] != a[k] + Table_t result = {}; + const size_t offset = value_offset(type); + for (int64_t i = 0; i < Table$length(a); i++) { + void *key = GET_ENTRY(a, i); + void *a_value = key + offset; + void *b_value = Table$get(b, key, type); + if (!b_value || !generic_equal(a_value, b_value, type->TableInfo.value)) + Table$set(&result, key, a_value, type); + } + + if (a.fallback) { + result.fallback = new(Table_t); + *result.fallback = Table$without(*a.fallback, b, type); + } + + return result; +} + +PUREFUNC public bool Table$is_subset_of(Table_t a, Table_t b, bool strict, const TypeInfo *type) +{ + if (a.entries.length > b.entries.length || (strict && a.entries.length == b.entries.length)) + return false; + + for (int64_t i = 0; i < Table$length(a); i++) { + void *found = Table$get_raw(b, GET_ENTRY(a, i), type); + if (!found) return false; + } + return true; +} + +PUREFUNC public bool Table$is_superset_of(Table_t a, Table_t b, bool strict, const TypeInfo *type) +{ + return Table$is_subset_of(b, a, strict, type); +} + +PUREFUNC public void *Table$str_get(Table_t t, const char *key) +{ + void **ret = Table$get(t, &key, &CStrToVoidStarTable); + return ret ? *ret : NULL; +} + +PUREFUNC public void *Table$str_get_raw(Table_t t, const char *key) +{ + void **ret = Table$get_raw(t, &key, &CStrToVoidStarTable); + return ret ? 
*ret : NULL; +} + +public void *Table$str_reserve(Table_t *t, const char *key, const void *value) +{ + return Table$reserve(t, &key, &value, &CStrToVoidStarTable); +} + +public void Table$str_set(Table_t *t, const char *key, const void *value) +{ + Table$set(t, &key, &value, &CStrToVoidStarTable); +} + +public void Table$str_remove(Table_t *t, const char *key) +{ + return Table$remove(t, &key, &CStrToVoidStarTable); +} + +CONSTFUNC public void *Table$str_entry(Table_t t, int64_t n) +{ + return Table$entry(t, n); +} + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1 diff --git a/stdlib/tables.h b/stdlib/tables.h new file mode 100644 index 00000000..53e0c583 --- /dev/null +++ b/stdlib/tables.h @@ -0,0 +1,84 @@ +#pragma once + +// Hash table datastructure with methods and type information + +#include +#include +#include + +#include "arrays.h" +#include "datatypes.h" +#include "types.h" +#include "util.h" + +#define Table(key_t, val_t, key_info, value_info, fb, N, ...) ({ \ + struct { key_t k; val_t v; } ents[N] = {__VA_ARGS__}; \ + Table_t table = Table$from_entries((Array_t){ \ + .data=memcpy(GC_MALLOC(sizeof(ents)), ents, sizeof(ents)), \ + .length=sizeof(ents)/sizeof(ents[0]), \ + .stride=(void*)&ents[1] - (void*)&ents[0], \ + }, Table$info(key_info, value_info)); \ + table.fallback = fb; \ + table; }) +#define Set(item_t, item_info, N, ...) 
({ \ + item_t ents[N] = {__VA_ARGS__}; \ + Table_t set = Table$from_entries((Array_t){ \ + .data=memcpy(GC_MALLOC(sizeof(ents)), ents, sizeof(ents)), \ + .length=sizeof(ents)/sizeof(ents[0]), \ + .stride=(void*)&ents[1] - (void*)&ents[0], \ + }, Set$info(item_info)); \ + set; }) + +Table_t Table$from_entries(Array_t entries, const TypeInfo *type); +void *Table$get(Table_t t, const void *key, const TypeInfo *type); +#define Table$get_optional(table_expr, key_t, val_t, key_expr, nonnull_var, nonnull_expr, null_expr, info_expr) ({ \ + const Table_t t = table_expr; const key_t k = key_expr; \ + val_t *nonnull_var = Table$get(t, &k, info_expr); \ + nonnull_var ? nonnull_expr : null_expr; }) +#define Table$has_value(table_expr, key_expr, info_expr) ({ \ + const Table_t t = table_expr; __typeof(key_expr) k = key_expr; \ + (Table$get(t, &k, info_expr) != NULL); }) +PUREFUNC void *Table$get_raw(Table_t t, const void *key, const TypeInfo *type); +CONSTFUNC void *Table$entry(Table_t t, int64_t n); +void *Table$reserve(Table_t *t, const void *key, const void *value, const TypeInfo *type); +void Table$set(Table_t *t, const void *key, const void *value, const TypeInfo *type); +#define Table$set_value(t, key_expr, value_expr, type) ({ __typeof(key_expr) k = key_expr; __typeof(value_expr) v = value_expr; \ + Table$set(t, &k, &v, type); }) +#define Table$reserve_value(t, key_expr, type) ({ __typeof(key_expr) k = key_expr; Table$reserve(t, &k, NULL, type); }) +#define Table$bump(t_expr, key_expr, amount_expr, type) ({ __typeof(key_expr) key = key_expr; \ + Table_t *t = t_expr; \ + __typeof(amount_expr) *val = Table$get_raw(*t, &key, type); \ + if (val) *val += amount_expr; \ + else { __typeof(amount_expr) init = amount_expr; Table$set(t, &key, &init, type); } (void)0; }) + +void Table$remove(Table_t *t, const void *key, const TypeInfo *type); +#define Table$remove_value(t, key_expr, type) ({ __typeof(key_expr) k = key_expr; Table$remove(t, &k, type); }) + +Table_t 
Table$overlap(Table_t a, Table_t b, const TypeInfo *type); +Table_t Table$with(Table_t a, Table_t b, const TypeInfo *type); +Table_t Table$without(Table_t a, Table_t b, const TypeInfo *type); +PUREFUNC bool Table$is_subset_of(Table_t a, Table_t b, bool strict, const TypeInfo *type); +PUREFUNC bool Table$is_superset_of(Table_t a, Table_t b, bool strict, const TypeInfo *type); + +void Table$clear(Table_t *t); +Table_t Table$sorted(Table_t t, const TypeInfo *type); +void Table$mark_copy_on_write(Table_t *t); +#define TABLE_INCREF(t) ({ ARRAY_INCREF((t).entries); if ((t).bucket_info) (t).bucket_info->data_refcount += ((t).bucket_info->data_refcount < TABLE_MAX_DATA_REFCOUNT); }) +#define TABLE_COPY(t) ({ TABLE_INCREF(t); t; }) +PUREFUNC int32_t Table$compare(const Table_t *x, const Table_t *y, const TypeInfo *type); +PUREFUNC bool Table$equal(const Table_t *x, const Table_t *y, const TypeInfo *type); +PUREFUNC uint64_t Table$hash(const Table_t *t, const TypeInfo *type); +Text_t Table$as_text(const Table_t *t, bool colorize, const TypeInfo *type); + +CONSTFUNC void *Table$str_entry(Table_t t, int64_t n); +PUREFUNC void *Table$str_get(Table_t t, const char *key); +PUREFUNC void *Table$str_get_raw(Table_t t, const char *key); +void Table$str_set(Table_t *t, const char *key, const void *value); +void *Table$str_reserve(Table_t *t, const char *key, const void *value); +void Table$str_remove(Table_t *t, const char *key); + +#define Table$length(t) ((t).entries.length) + +extern const TypeInfo CStrToVoidStarTable; + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1 diff --git a/stdlib/text.c b/stdlib/text.c new file mode 100644 index 00000000..283dfb01 --- /dev/null +++ b/stdlib/text.c @@ -0,0 +1,1302 @@ +// Type info and methods for Text datatype, which uses libunistr for Unicode +// support and implements a datastructure based on Raku/MoarVM's strings to +// efficiently store arbitrary unicode data using a mix of densely packed plain +// ASCII, 32-bit integers representing grapheme 
clusters (see below), and ropes +// that represent text that is a composite of multiple subtexts. Subtexts are +// only nested one level deep, not arbitrarily deep trees. +// +// A note on grapheme clusters: In Unicode, codepoints can be represented using +// a 32-bit integer. Most codepoints correspond to the intuitive notion of a +// "letter", which is more formally known as a "grapheme cluster". A grapheme +// cluster is roughly speaking the amount of text that your cursor moves over +// when you press the arrow key once. However, some codepoints act as modifiers +// on other codepoints. For example, U+0301 (COMBINING ACUTE ACCENT) can modify +// a letter like "e" to form "é". During normalization, this frequently +// resolves down to a single unicode codepoint, in this case, "é" resolves to +// the single codepoint U+00E9 (LATIN SMALL LETTER E WITH ACUTE). However, in +// some cases, multiple codepoints make up a grapheme cluster but *don't* +// normalize to a single codepoint. For example, LATIN SMALL LETTER E (U+0065) +// + COMBINING VERTICAL LINE BELOW (U+0329) combine to form an unusual glyph +// that is not used frequently enough to warrant its own unique codepoint (this +// is basically what Zalgo text is). +// +// There are a lot of benefits to storing text with one grapheme cluster per +// index in a densely packed array. It lets us have one canonical length for +// the text that can be precomputed and is meaningful to users. It lets us +// quickly get the Nth "letter" in the text. Substring slicing is fast. +// However, since not all grapheme clusters take up the same number of +// codepoints, we're faced with the problem of how to jam multiple codepoints +// into a single 32-bit slot. Inspired by Raku and MoarVM's approach, this +// implementation uses "synthetic graphemes" (in Raku's terms, Normal Form +// Graphemes, aka NFG). 
A synthetic grapheme is a negative 32-bit signed +// integer that represents a multi-codepoint grapheme cluster that has been +// encountered during the program's runtime. These clusters are stored in a +// lookup array and hash map so that we can rapidly convert between the +// synthetic grapheme integer ID and the unicode codepoints associated with it. +// Essentially, it's like we create a supplement to the unicode standard with +// things that would be nice if they had their own codepoint so things worked +// out nicely because we're using them right now, and we'll give them a +// negative number so it doesn't overlap with any real codepoints. +// +// Example 1: U+0048, U+00E9 +// AKA: LATIN CAPITAL LETTER H, LATIN SMALL LETTER E WITH ACUTE +// This would be stored as: (int32_t[]){0x48, 0xE9} +// Example 2: U+0048, U+0065, U+0309 +// AKA: LATIN CAPITAL LETTER H, LATIN SMALL LETTER E, COMBINING VERTICAL LINE BELOW +// This would be stored as: (int32_t[]){0x48, -2} +// Where -2 is used as a lookup in an array that holds the actual unicode codepoints: +// (ucs4_t[]){0x65, 0x0309} + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "arrays.h" +#include "integers.h" +#include "patterns.h" +#include "tables.h" +#include "text.h" + +// Use inline version of the siphash code for performance: +#include "siphash.h" +#include "siphash-internals.h" + +typedef struct { + ucs4_t main_codepoint; + ucs4_t *utf32_cluster; // length-prefixed + const uint8_t *utf8; +} synthetic_grapheme_t; + +// Synthetic grapheme clusters (clusters of more than one codepoint): +static Table_t grapheme_ids_by_codepoints = {}; // ucs4_t* length-prefixed codepoints -> int32_t ID + +// This will hold a dynamically growing array of synthetic graphemes: +static synthetic_grapheme_t *synthetic_graphemes = NULL; +static int32_t synthetic_grapheme_capacity = 0; +static int32_t num_synthetic_graphemes = 0; + +#define 
MAIN_GRAPHEME_CODEPOINT(_g) ({ int32_t g = _g; (g) >= 0 ? (ucs4_t)(g) : synthetic_graphemes[-(g)-1].main_codepoint; }) +#define NUM_GRAPHEME_CODEPOINTS(id) (synthetic_graphemes[-(id)-1].utf32_cluster[0]) +#define GRAPHEME_CODEPOINTS(id) (&synthetic_graphemes[-(id)-1].utf32_cluster[1]) +#define GRAPHEME_UTF8(id) (synthetic_graphemes[-(id)-1].utf8) + +static Text_t text_from_u32(ucs4_t *codepoints, int64_t num_codepoints, bool normalize); + +PUREFUNC static bool graphemes_equal(ucs4_t **a, ucs4_t **b) { + if ((*a)[0] != (*b)[0]) return false; + for (int i = 0; i < (int)(*a)[0]; i++) + if ((*a)[i] != (*b)[i]) return false; + return true; +} + +PUREFUNC static uint64_t grapheme_hash(ucs4_t **g) { + ucs4_t *cluster = *g; + return siphash24((void*)&cluster[1], sizeof(ucs4_t[cluster[0]])); +} + +static const TypeInfo GraphemeClusterInfo = { + .size=sizeof(ucs4_t*), + .align=__alignof__(ucs4_t*), + .tag=CustomInfo, + .CustomInfo={.equal=(void*)graphemes_equal, .hash=(void*)grapheme_hash}, +}; + +static const TypeInfo GraphemeIDLookupTableInfo = { + .size=sizeof(Table_t), .align=__alignof__(Table_t), + .tag=TableInfo, .TableInfo={.key=&GraphemeClusterInfo, .value=&Int32$info}, +}; + +#pragma GCC diagnostic ignored "-Wstack-protector" +public int32_t get_synthetic_grapheme(const ucs4_t *codepoints, int64_t utf32_len) +{ + ucs4_t length_prefixed[1+utf32_len] = {}; + length_prefixed[0] = (ucs4_t)utf32_len; + for (int i = 0; i < utf32_len; i++) + length_prefixed[i+1] = codepoints[i]; + ucs4_t *ptr = &length_prefixed[0]; + + // Optimization for common case of one frequently used synthetic grapheme: + static int32_t last_grapheme = 0; + if (last_grapheme != 0 && graphemes_equal(&ptr, &synthetic_graphemes[-last_grapheme-1].utf32_cluster)) + return last_grapheme; + + int32_t *found = Table$get(grapheme_ids_by_codepoints, &ptr, &GraphemeIDLookupTableInfo); + if (found) return *found; + + // New synthetic grapheme: + if (num_synthetic_graphemes >= synthetic_grapheme_capacity) { + // 
If we don't have space, allocate more: + synthetic_grapheme_capacity = MAX(128, synthetic_grapheme_capacity * 2); + synthetic_grapheme_t *new = GC_MALLOC(sizeof(synthetic_grapheme_t[synthetic_grapheme_capacity])); + memcpy(new, synthetic_graphemes, sizeof(synthetic_grapheme_t[num_synthetic_graphemes])); + synthetic_graphemes = new; + } + + int32_t grapheme_id = -(num_synthetic_graphemes+1); + num_synthetic_graphemes += 1; + + // Get UTF8 representation: + uint8_t u8_buf[64]; + size_t u8_len = sizeof(u8_buf)/sizeof(u8_buf[0]); + uint8_t *u8 = u32_to_u8(codepoints, (size_t)utf32_len, u8_buf, &u8_len); + + // For performance reasons, use an arena allocator here to ensure that + // synthetic graphemes store all of their information in a densely packed + // area with good cache locality: + static void *arena = NULL, *arena_end = NULL; + // Eat up any space needed to make arena 32-bit aligned: + if ((size_t)arena % __alignof__(ucs4_t) != 0) + arena += __alignof__(ucs4_t) - ((size_t)arena % __alignof__(ucs4_t)); + + // If we have filled up this arena, allocate a new one: + size_t needed_memory = sizeof(ucs4_t[1+utf32_len]) + sizeof(uint8_t[u8_len + 1]); + if (arena + needed_memory > arena_end) { + // Do reasonably big chunks at a time, so most synthetic codepoints are + // nearby each other in memory and cache locality is good. 
This is a + // rough guess at a good size: + size_t chunk_size = MAX(needed_memory, 512); + arena = GC_MALLOC_ATOMIC(chunk_size); + arena_end = arena + chunk_size; + } + + // Copy length-prefixed UTF32 codepoints into the arena and store where they live: + ucs4_t *codepoint_copy = arena; + mempcpy(codepoint_copy, length_prefixed, sizeof(ucs4_t[1+utf32_len])); + synthetic_graphemes[-grapheme_id-1].utf32_cluster = codepoint_copy; + arena += sizeof(ucs4_t[1+utf32_len]); + + // Copy UTF8 bytes into the arena and store where they live: + uint8_t *utf8_final = arena; + memcpy(utf8_final, u8, sizeof(uint8_t[u8_len])); + utf8_final[u8_len] = '\0'; // Add a terminating NUL byte + synthetic_graphemes[-grapheme_id-1].utf8 = utf8_final; + arena += sizeof(uint8_t[u8_len + 1]); + + // Sickos at the unicode consortium decreed that you can have grapheme clusters + // that begin with *prefix* modifiers, so we gotta check for that case: + synthetic_graphemes[-grapheme_id-1].main_codepoint = length_prefixed[1]; + for (ucs4_t i = 0; i < utf32_len; i++) { + if (!__builtin_expect(uc_is_property_prepended_concatenation_mark(length_prefixed[1+i]), 0)) { + synthetic_graphemes[-grapheme_id-1].main_codepoint = length_prefixed[1+i]; + break; + } + } + + // Cleanup from unicode API: + if (u8 != u8_buf) free(u8); + + Table$set(&grapheme_ids_by_codepoints, &codepoint_copy, &grapheme_id, &GraphemeIDLookupTableInfo); + + last_grapheme = grapheme_id; + return grapheme_id; +} + +PUREFUNC static inline int64_t num_subtexts(Text_t t) +{ + if (t.tag != TEXT_SUBTEXT) return 1; + int64_t len = t.length; + int64_t n = 0; + while (len > 0) { + len -= t.subtexts[n].length; + ++n; + } + return n; +} + +int text_visualize(FILE *stream, Text_t t) +{ + switch (t.tag) { + case TEXT_SHORT_ASCII: return fprintf(stream, "%.*s", t.length, t.length, t.short_ascii); + case TEXT_ASCII: return fprintf(stream, "%.*s", t.length, t.length, t.ascii); + case TEXT_GRAPHEMES: case TEXT_SHORT_GRAPHEMES: { + int printed = 
fprintf(stream, "", t.length); + printed += Text$print(stream, t); + printed += fprintf(stream, ""); + return printed; + } + case TEXT_SUBTEXT: { + int printed = fprintf(stream, "", t.length); + int64_t to_print = t.length; + for (int i = 0; to_print > 0; ++i) { + printed += fprintf(stream, "\n "); + printed += text_visualize(stream, t.subtexts[i]); + to_print -= t.subtexts[i].length; + if (t.subtexts[i].length == 0) break; + } + printed += fprintf(stream, "\n"); + return printed; + } + default: return 0; + } +} + +public int Text$print(FILE *stream, Text_t t) +{ + if (t.length == 0) return 0; + + switch (t.tag) { + case TEXT_SHORT_ASCII: return fwrite(t.short_ascii, sizeof(char), (size_t)t.length, stream); + case TEXT_ASCII: return fwrite(t.ascii, sizeof(char), (size_t)t.length, stream); + case TEXT_GRAPHEMES: case TEXT_SHORT_GRAPHEMES: { + const int32_t *graphemes = t.tag == TEXT_SHORT_GRAPHEMES ? t.short_graphemes : t.graphemes; + int written = 0; + for (int64_t i = 0; i < t.length; i++) { + int32_t grapheme = graphemes[i]; + if (grapheme >= 0) { + uint8_t buf[8]; + size_t len = sizeof(buf); + uint8_t *u8 = u32_to_u8((ucs4_t*)&grapheme, 1, buf, &len); + written += (int)fwrite(u8, sizeof(char), len, stream); + if (u8 != buf) free(u8); + } else { + const uint8_t *u8 = GRAPHEME_UTF8(grapheme); + assert(u8); + written += (int)fwrite(u8, sizeof(uint8_t), strlen((char*)u8), stream); + } + } + return written; + } + case TEXT_SUBTEXT: { + int written = 0; + int i = 0; + for (int64_t to_print = t.length; to_print > 0; to_print -= t.subtexts[i].length, ++i) + written += Text$print(stream, t.subtexts[i]); + return written; + } + default: return 0; + } +} + +static bool is_concat_stable(Text_t a, Text_t b) +{ + if (a.length == 0 || b.length == 0) + return true; + + int32_t last_a = Text$get_grapheme(a, a.length-1); + int32_t first_b = Text$get_grapheme(b, 0); + + // Synthetic graphemes are weird and probably need to check with normalization: + if (last_a < 0 || first_b < 0) 
+ return 0; + + // Magic number, we know that no codepoints below here trigger instability: + static const int32_t LOWEST_CODEPOINT_TO_CHECK = 0x300; + if (last_a < LOWEST_CODEPOINT_TO_CHECK && first_b < LOWEST_CODEPOINT_TO_CHECK) + return true; + + // Do a normalization run for these two codepoints and see if it looks different: + ucs4_t codepoints[2] = {(ucs4_t)last_a, (ucs4_t)first_b}; + ucs4_t norm_buf[3*2]; // Normalization should not exceed 3x in the input length + size_t norm_length = sizeof(norm_buf)/sizeof(norm_buf[0]); + ucs4_t *normalized = u32_normalize(UNINORM_NFC, codepoints, 2, norm_buf, &norm_length); + if (norm_length != 2) { + // Looks like these two codepoints merged into one (or maybe had a child, who knows?) + if (normalized != norm_buf) free(normalized); + return false; + } + + // If there's still two codepoints, we might end up with a single grapheme + // cluster which will need to turn into a synthetic grapheme: + const void *second_grapheme = u32_grapheme_next(normalized, &normalized[2]); + if (normalized != norm_buf) free(normalized); + return (second_grapheme == &normalized[1]); +} + +static Text_t concat2_assuming_safe(Text_t a, Text_t b) +{ + if (a.length == 0) return b; + if (b.length == 0) return a; + + if (a.tag == TEXT_SUBTEXT && b.tag == TEXT_SUBTEXT) { + int64_t na = num_subtexts(a); + int64_t nb = num_subtexts(b); + Text_t ret = { + .length=a.length + b.length, + .tag=TEXT_SUBTEXT, + .subtexts=GC_MALLOC(sizeof(Text_t[na + nb])), + }; + memcpy(&ret.subtexts[0], a.subtexts, sizeof(Text_t[na])); + memcpy(&ret.subtexts[na], b.subtexts, sizeof(Text_t[nb])); + return ret; + } else if (a.tag == TEXT_SUBTEXT) { + int64_t n = num_subtexts(a); + Text_t ret = { + .length=a.length + b.length, + .tag=TEXT_SUBTEXT, + .subtexts=GC_MALLOC(sizeof(Text_t[n + 1])), + }; + memcpy(ret.subtexts, a.subtexts, sizeof(Text_t[n])); + ret.subtexts[n] = b; + return ret; + } else if (b.tag == TEXT_SUBTEXT) { + int64_t n = num_subtexts(b); + Text_t ret = { + 
.length=a.length + b.length, + .tag=TEXT_SUBTEXT, + .subtexts=GC_MALLOC(sizeof(Text_t[n + 1])), + }; + ret.subtexts[0] = a; + memcpy(&ret.subtexts[1], b.subtexts, sizeof(Text_t[n])); + return ret; + } else { + Text_t ret = { + .length=a.length + b.length, + .tag=TEXT_SUBTEXT, + .subtexts=GC_MALLOC(sizeof(Text_t[2])), + }; + ret.subtexts[0] = a; + ret.subtexts[1] = b; + return ret; + } +} + +static Text_t concat2(Text_t a, Text_t b) +{ + if (a.length == 0) return b; + if (b.length == 0) return a; + + if (__builtin_expect(is_concat_stable(a, b), 1)) + return concat2_assuming_safe(a, b); + + // Do full normalization of the last/first characters + int32_t last_a = Text$get_grapheme(a, a.length-1); + int32_t first_b = Text$get_grapheme(b, 0); + + size_t utf32_len = (last_a >= 0 ? 1 : NUM_GRAPHEME_CODEPOINTS(last_a)) + (first_b >= 0 ? 1 : NUM_GRAPHEME_CODEPOINTS(first_b)); + ucs4_t join_graphemes[utf32_len] = {}; + ucs4_t *p = &join_graphemes[0]; + if (last_a < 0) p = mempcpy(p, GRAPHEME_CODEPOINTS(last_a), NUM_GRAPHEME_CODEPOINTS(last_a)); + else *(p++) = (ucs4_t)last_a; + if (first_b < 0) p = mempcpy(p, GRAPHEME_CODEPOINTS(first_b), NUM_GRAPHEME_CODEPOINTS(first_b)); + else *(p++) = (ucs4_t)first_b; + + Text_t glue = text_from_u32(join_graphemes, (int64_t)utf32_len, true); + + if (a.length == 1 && b.length == 1) + return glue; + else if (a.length == 1) + return concat2_assuming_safe(glue, Text$slice(b, I(2), I(b.length))); + else if (b.length == 1) + return concat2_assuming_safe(Text$slice(a, I(1), I(a.length-1)), glue); + else + return concat2_assuming_safe( + concat2_assuming_safe(Text$slice(a, I(1), I(a.length-1)), glue), + b); +} + +public Text_t Text$_concat(int n, Text_t items[n]) +{ + if (n == 0) return (Text_t){.length=0}; + if (n == 1) return items[0]; + if (n == 2) return concat2(items[0], items[1]); + + int64_t len = 0, subtexts = 0; + for (int i = 0; i < n; i++) { + len += items[i].length; + if (items[i].length > 0) + subtexts += num_subtexts(items[i]); + } 
+ + Text_t ret = { + .length=0, + .tag=TEXT_SUBTEXT, + .subtexts=GC_MALLOC(sizeof(Text_t[len])), + }; + int64_t sub_i = 0; + for (int i = 0; i < n; i++) { + if (items[i].length == 0) + continue; + + if (i > 0 && !__builtin_expect(is_concat_stable(items[i-1], items[i]), 1)) { + // Oops, guess this wasn't stable for concatenation, let's break it + // up into subtasks: + return concat2(ret, Text$_concat(n-i, &items[i])); + } + + if (items[i].tag == TEXT_SUBTEXT) { + for (int64_t j = 0, remainder = items[i].length; remainder > 0; j++) { + ret.subtexts[sub_i++] = items[i].subtexts[j]; + remainder -= items[i].subtexts[j].length; + } + } else { + ret.subtexts[sub_i++] = items[i]; + } + ret.length += items[i].length; + } + return ret; +} + +public Text_t Text$repeat(Text_t text, Int_t count) +{ + if (text.length == 0 || Int$is_negative(count)) + return Text(""); + + Int_t result_len = Int$times(count, I(text.length)); + if (Int$compare_value(result_len, I(1l<<40)) > 0) + fail("Text repeating would produce too big of an result!"); + + int64_t count64 = Int_to_Int64(count, false); + if (text.tag == TEXT_SUBTEXT) { + int64_t subtexts = num_subtexts(text); + Text_t ret = { + .length=text.length * count64, + .tag=TEXT_SUBTEXT, + .subtexts=GC_MALLOC(sizeof(Text_t[subtexts * count64])), + }; + for (int64_t c = 0; c < count64; c++) { + for (int64_t i = 0; i < subtexts; i++) { + if (text.subtexts[i].length > 0) + ret.subtexts[c*subtexts + i] = text.subtexts[i]; + } + } + return ret; + } else { + Text_t ret = { + .length=text.length * count64, + .tag=TEXT_SUBTEXT, + .subtexts=GC_MALLOC(sizeof(Text_t[count64])), + }; + for (int64_t i = 0; i < count64; i++) + ret.subtexts[i] = text; + return ret; + } +} + +public Text_t Text$slice(Text_t text, Int_t first_int, Int_t last_int) +{ + int64_t first = Int_to_Int64(first_int, false); + int64_t last = Int_to_Int64(last_int, false); + if (first == 0) fail("Invalid index: 0"); + if (last == 0) return (Text_t){.length=0}; + + if (first < 0) 
first = text.length + first + 1; + if (last < 0) last = text.length + last + 1; + + if (last > text.length) last = text.length; + + if (first > text.length || last < first) + return (Text_t){.length=0}; + + if (first == 1 && last == text.length) + return text; + + switch (text.tag) { + case TEXT_SHORT_ASCII: { + Text_t ret = (Text_t) { + .tag=TEXT_SHORT_ASCII, + .length=last - first + 1, + }; + memcpy(ret.short_ascii, text.short_ascii + (first-1), (size_t)ret.length); + return ret; + } + case TEXT_ASCII: { + Text_t ret = { + .tag=TEXT_ASCII, + .length=last - first + 1, + .ascii=text.ascii + (first-1), + }; + return ret; + } + case TEXT_SHORT_GRAPHEMES: { + assert((first == 1 && last == 1) || (first == 2 && last == 2)); + Text_t ret = { + .tag=TEXT_SHORT_GRAPHEMES, + .length=1, + .short_graphemes={text.short_graphemes[first-1]}, + }; + return ret; + } + case TEXT_GRAPHEMES: { + Text_t ret = { + .tag=TEXT_GRAPHEMES, + .length=last - first + 1, + .graphemes=text.graphemes + (first-1), + }; + return ret; + } + case TEXT_SUBTEXT: { + Text_t *subtexts = text.subtexts; + while (first > subtexts[0].length) { + first -= subtexts[0].length; + last -= subtexts[0].length; + ++subtexts; + } + + int64_t needed_len = (last - first) + 1; + int64_t num_subtexts = 0; + for (int64_t included = 0; included < needed_len; ) { + if (included == 0) + included += subtexts[num_subtexts].length - first + 1; + else + included += subtexts[num_subtexts].length; + num_subtexts += 1; + } + if (num_subtexts == 1) + return Text$slice(subtexts[0], I(first), I(last)); + + Text_t ret = { + .length=needed_len, + .tag=TEXT_SUBTEXT, + .subtexts=GC_MALLOC(sizeof(Text_t[num_subtexts])), + }; + for (int64_t i = 0; i < num_subtexts; i++) { + ret.subtexts[i] = Text$slice(subtexts[i], I(first), I(last)); + first = 1; + needed_len -= ret.subtexts[i].length; + last = first + needed_len - 1; + } + return ret; + } + default: errx(1, "Invalid tag"); + } +} + +Text_t text_from_u32(ucs4_t *codepoints, int64_t 
num_codepoints, bool normalize) +{ + // Normalization is apparently guaranteed to never exceed 3x in the input length + ucs4_t norm_buf[MIN(256, 3*num_codepoints)]; + if (normalize) { + size_t norm_length = sizeof(norm_buf)/sizeof(norm_buf[0]); + ucs4_t *normalized = u32_normalize(UNINORM_NFC, codepoints, (size_t)num_codepoints, norm_buf, &norm_length); + codepoints = normalized; + num_codepoints = (int64_t)norm_length; + } + + // char breaks[num_codepoints]; + // u32_grapheme_breaks(codepoints, num_codepoints, breaks); + + Text_t ret = { + .length=0, + .tag=TEXT_SHORT_GRAPHEMES, + }; + const ucs4_t *src = codepoints; + int32_t *graphemes = ret.short_graphemes; + while (src < &codepoints[num_codepoints]) { + if (ret.tag == TEXT_SHORT_GRAPHEMES && ret.length + 1 > 2) { + graphemes = GC_MALLOC_ATOMIC(sizeof(int32_t[num_codepoints])); // May be a slight overallocation + graphemes[0] = ret.short_graphemes[0]; + graphemes[1] = ret.short_graphemes[1]; + ret.tag = TEXT_GRAPHEMES; + ret.graphemes = graphemes; + } + + // TODO: use grapheme breaks instead of u32_grapheme_next() + const ucs4_t *next = u32_grapheme_next(src, &codepoints[num_codepoints]); + if (next == &src[1]) { + graphemes[ret.length] = (int32_t)*src; + } else { + // Synthetic grapheme + graphemes[ret.length] = get_synthetic_grapheme(src, next-src); + } + ++ret.length; + src = next; + } + if (normalize && codepoints != norm_buf) free(codepoints); + return ret; +} + +public Text_t Text$from_strn(const char *str, size_t len) +{ + int64_t ascii_span = 0; + for (size_t i = 0; i < len && isascii(str[i]); i++) + ascii_span++; + + if (ascii_span == (int64_t)len) { // All ASCII + Text_t ret = {.length=ascii_span}; + if (ascii_span <= 8) { + ret.tag = TEXT_SHORT_ASCII; + for (int64_t i = 0; i < ascii_span; i++) + ret.short_ascii[i] = str[i]; + } else { + ret.tag = TEXT_ASCII; + ret.ascii = str; + } + return ret; + } else { + if (u8_check((uint8_t*)str, len) != NULL) + return Text(""); + + ucs4_t buf[128]; + size_t 
length = sizeof(buf)/sizeof(buf[0]); + + ucs4_t *codepoints = u8_to_u32((uint8_t*)str, (size_t)ascii_span + strlen(str + ascii_span), buf, &length); + Text_t ret = text_from_u32(codepoints, (int64_t)length, true); + if (codepoints != buf) free(codepoints); + return ret; + } +} + +public Text_t Text$from_str(const char *str) +{ + return str ? Text$from_strn(str, strlen(str)) : Text(""); +} + +static void u8_buf_append(Text_t text, char **buf, int64_t *capacity, int64_t *i) +{ + switch (text.tag) { + case TEXT_ASCII: case TEXT_SHORT_ASCII: { + if (*i + text.length > (int64_t)*capacity) { + *capacity = *i + text.length + 1; + *buf = GC_REALLOC(*buf, (size_t)*capacity); + } + + const char *bytes = text.tag == TEXT_ASCII ? text.ascii : text.short_ascii; + memcpy(*buf + *i, bytes, (size_t)text.length); + *i += text.length; + break; + } + case TEXT_GRAPHEMES: case TEXT_SHORT_GRAPHEMES: { + const int32_t *graphemes = text.tag == TEXT_GRAPHEMES ? text.graphemes : text.short_graphemes; + for (int64_t g = 0; g < text.length; g++) { + if (graphemes[g] >= 0) { + uint8_t u8_buf[64]; + size_t u8_len = sizeof(u8_buf); + uint8_t *u8 = u32_to_u8((ucs4_t*)&graphemes[g], 1, u8_buf, &u8_len); + + if (*i + (int64_t)u8_len > (int64_t)*capacity) { + *capacity = *i + (int64_t)u8_len + 1; + *buf = GC_REALLOC(*buf, (size_t)*capacity); + } + + memcpy(*buf + *i, u8, u8_len); + *i += (int64_t)u8_len; + if (u8 != u8_buf) free(u8); + } else { + const uint8_t *u8 = GRAPHEME_UTF8(graphemes[g]); + size_t u8_len = u8_strlen(u8); + if (*i + (int64_t)u8_len > (int64_t)*capacity) { + *capacity = *i + (int64_t)u8_len + 1; + *buf = GC_REALLOC(*buf, (size_t)*capacity); + } + + memcpy(*buf + *i, u8, u8_len); + *i += (int64_t)u8_len; + } + } + break; + } + case TEXT_SUBTEXT: { + for (int64_t s = 0, remaining = text.length; remaining > 0; s++) { + u8_buf_append(text.subtexts[s], buf, capacity, i); + remaining -= text.subtexts[s].length; + } + break; + } + default: break; + } +} + +public char 
*Text$as_c_string(Text_t text) +{ + int64_t capacity = text.length + 1; + char *buf = GC_MALLOC_ATOMIC((size_t)capacity); + int64_t i = 0; + u8_buf_append(text, &buf, &capacity, &i); + + if (i + 1 > (int64_t)capacity) { + capacity = i + 1; + buf = GC_REALLOC(buf, (size_t)capacity); + } + buf[i] = '\0'; + return buf; +} + +PUREFUNC public uint64_t Text$hash(Text_t *text) +{ + if (text->hash != 0) return text->hash; + siphash sh; + siphashinit(&sh, sizeof(int32_t[text->length])); + + union { + int32_t chunks[2]; + uint64_t whole; + } tmp; + switch (text->tag) { + case TEXT_ASCII: case TEXT_SHORT_ASCII: { + const char *bytes = text->tag == TEXT_ASCII ? text->ascii : text->short_ascii; + for (int64_t i = 0; i + 1 < text->length; i++) { + tmp.chunks[0] = (int32_t)bytes[i]; + tmp.chunks[1] = (int32_t)bytes[i+1]; + siphashadd64bits(&sh, tmp.whole); + } + int32_t last = text->length & 0x1 ? (int32_t)bytes[text->length-1] : 0; // Odd number of graphemes + text->hash = siphashfinish_last_part(&sh, (uint64_t)last); + break; + } + case TEXT_GRAPHEMES: { + const int32_t *graphemes = text->graphemes; + for (int64_t i = 0; i + 1 < text->length; i++) { + tmp.chunks[0] = graphemes[i]; + tmp.chunks[1] = graphemes[i]; + siphashadd64bits(&sh, tmp.whole); + } + int32_t last = text->length & 0x1 ? graphemes[text->length-1] : 0; // Odd number of graphemes + text->hash = siphashfinish_last_part(&sh, (uint64_t)last); + break; + } + case TEXT_SHORT_GRAPHEMES: { + tmp.chunks[0] = text->short_graphemes[0]; + if (text->length > 1) + tmp.chunks[1] = text->short_graphemes[1]; + text->hash = siphashfinish_last_part(&sh, (uint64_t)tmp.whole); + break; + } + case TEXT_SUBTEXT: { + int32_t leftover = 0; + for (int64_t sub_i = 0, to_hash = text->length; to_hash > 0; ) { + Text_t subtext = text->subtexts[sub_i]; + if (subtext.tag == TEXT_ASCII || subtext.tag == TEXT_SHORT_ASCII) { + const char *bytes = subtext.tag == TEXT_ASCII ? 
subtext.ascii : subtext.short_ascii; + int64_t grapheme = 0; + if (leftover) { + tmp.chunks[0] = leftover; + tmp.chunks[1] = (int32_t)bytes[0]; + siphashadd64bits(&sh, tmp.whole); + grapheme += 1; + } + for (; grapheme + 1 < subtext.length; grapheme += 2) { + tmp.chunks[0] = (int32_t)bytes[grapheme]; + tmp.chunks[1] = (int32_t)bytes[grapheme+1]; + siphashadd64bits(&sh, tmp.whole); + } + leftover = grapheme < subtext.length ? (int32_t)bytes[grapheme] : 0; + } else if (subtext.tag == TEXT_SHORT_GRAPHEMES) { + if (leftover) { + tmp.chunks[0] = leftover; + tmp.chunks[1] = subtext.short_graphemes[0]; + siphashadd64bits(&sh, tmp.whole); + leftover = subtext.length > 1 ? subtext.short_graphemes[1] : 0; + } else if (subtext.length == 1) { + leftover = subtext.short_graphemes[0]; + } else { + tmp.chunks[0] = subtext.short_graphemes[0]; + tmp.chunks[1] = subtext.short_graphemes[1]; + siphashadd64bits(&sh, tmp.whole); + } + } else if (subtext.tag == TEXT_GRAPHEMES) { + const int32_t *graphemes = subtext.graphemes; + int64_t grapheme = 0; + if (leftover) { + tmp.chunks[0] = leftover; + tmp.chunks[1] = graphemes[0]; + siphashadd64bits(&sh, tmp.whole); + grapheme += 1; + } + for (; grapheme + 1 < subtext.length; grapheme += 2) { + tmp.chunks[0] = graphemes[grapheme]; + tmp.chunks[1] = graphemes[grapheme+1]; + siphashadd64bits(&sh, tmp.whole); + } + leftover = grapheme < subtext.length ? graphemes[grapheme] : 0; + } + + to_hash -= text->subtexts[sub_i].length; + + ++sub_i; + } + + text->hash = siphashfinish_last_part(&sh, (uint64_t)leftover); + break; + } + default: errx(1, "Invalid text"); + } + + if (text->hash == 0) + text->hash = 1; + + return text->hash; +} + +public int32_t Text$get_grapheme_fast(Text_t text, TextIter_t *state, int64_t index) +{ + switch (text.tag) { + case TEXT_ASCII: return index < text.length ? (int32_t)text.ascii[index] : 0; + case TEXT_SHORT_ASCII: return index < text.length ? 
(int32_t)text.short_ascii[index] : 0; + case TEXT_GRAPHEMES: return index < text.length ? text.graphemes[index] : 0; + case TEXT_SHORT_GRAPHEMES: return index < text.length ? text.short_graphemes[index] : 0; + case TEXT_SUBTEXT: { + TextIter_t backup_state = {0, 0}; + if (!state) state = &backup_state; + + if (index < 0 || index >= text.length) + return 0; + + while (index < state->sum_of_previous_subtexts && state->subtext > 0) { + state->sum_of_previous_subtexts -= text.subtexts[state->subtext].length; + state->subtext -= 1; + } + for (;;) { + if (index < state->sum_of_previous_subtexts + text.subtexts[state->subtext].length) + return Text$get_grapheme_fast(text.subtexts[state->subtext], NULL, index - state->sum_of_previous_subtexts); + state->sum_of_previous_subtexts += text.subtexts[state->subtext].length; + state->subtext += 1; + } + return 0; + } + default: errx(1, "Invalid text"); + } + return 0; +} + +public ucs4_t Text$get_main_grapheme_fast(Text_t text, TextIter_t *state, int64_t index) +{ + return MAIN_GRAPHEME_CODEPOINT(Text$get_grapheme_fast(text, state, index)); +} + +PUREFUNC public int32_t Text$compare(const Text_t *a, const Text_t *b) +{ + if (a == b) return 0; + + int64_t len = MAX(a->length, b->length); + TextIter_t a_state = {0, 0}, b_state = {0, 0}; + for (int64_t i = 0; i < len; i++) { + int32_t ai = Text$get_grapheme_fast(*a, &a_state, i); + int32_t bi = Text$get_grapheme_fast(*b, &b_state, i); + if (ai == bi) continue; + int32_t cmp; + if (ai > 0 && bi > 0) { + cmp = u32_cmp((ucs4_t*)&ai, (ucs4_t*)&bi, 1); + } else if (ai > 0) { + cmp = u32_cmp2( + (ucs4_t*)&ai, 1, + GRAPHEME_CODEPOINTS(bi), + NUM_GRAPHEME_CODEPOINTS(bi)); + } else if (bi > 0) { + cmp = u32_cmp2( + GRAPHEME_CODEPOINTS(ai), + NUM_GRAPHEME_CODEPOINTS(ai), + (ucs4_t*)&bi, 1); + } else { + cmp = u32_cmp2( + GRAPHEME_CODEPOINTS(ai), + NUM_GRAPHEME_CODEPOINTS(ai), + GRAPHEME_CODEPOINTS(bi), + NUM_GRAPHEME_CODEPOINTS(bi)); + } + if (cmp != 0) return cmp; + } + return 0; +} + 
+PUREFUNC public bool Text$starts_with(Text_t text, Text_t prefix) +{ + if (text.length < prefix.length) + return false; + TextIter_t text_state = {0, 0}, prefix_state = {0, 0}; + for (int64_t i = 0; i < prefix.length; i++) { + int32_t text_i = Text$get_grapheme_fast(text, &text_state, i); + int32_t prefix_i = Text$get_grapheme_fast(prefix, &prefix_state, i); + if (text_i != prefix_i) return false; + } + return true; +} + +PUREFUNC public bool Text$ends_with(Text_t text, Text_t suffix) +{ + if (text.length < suffix.length) + return false; + TextIter_t text_state = {0, 0}, prefix_state = {0, 0}; + for (int64_t i = 0; i < suffix.length; i++) { + int32_t text_i = Text$get_grapheme_fast(text, &text_state, text.length - suffix.length + i); + int32_t suffix_i = Text$get_grapheme_fast(suffix, &prefix_state, i); + if (text_i != suffix_i) return false; + } + return true; +} + +PUREFUNC public bool Text$equal_values(Text_t a, Text_t b) +{ + if (a.length != b.length || (a.hash != 0 && b.hash != 0 && a.hash != b.hash)) + return false; + int64_t len = a.length; + TextIter_t a_state = {0, 0}, b_state = {0, 0}; + for (int64_t i = 0; i < len; i++) { + int32_t ai = Text$get_grapheme_fast(a, &a_state, i); + int32_t bi = Text$get_grapheme_fast(b, &b_state, i); + if (ai != bi) return false; + } + return true; +} + +PUREFUNC public bool Text$equal(const Text_t *a, const Text_t *b) +{ + if (a == b) return true; + return Text$equal_values(*a, *b); +} + +PUREFUNC public bool Text$equal_ignoring_case(Text_t a, Text_t b) +{ + if (a.length != b.length) + return false; + int64_t len = a.length; + TextIter_t a_state = {0, 0}, b_state = {0, 0}; + const char *language = uc_locale_language(); + for (int64_t i = 0; i < len; i++) { + int32_t ai = Text$get_grapheme_fast(a, &a_state, i); + int32_t bi = Text$get_grapheme_fast(b, &b_state, i); + if (ai != bi) { + const ucs4_t *a_codepoints = ai >= 0 ? (ucs4_t*)&ai : GRAPHEME_CODEPOINTS(ai); + int64_t a_len = ai >= 0 ? 
1 : NUM_GRAPHEME_CODEPOINTS(ai); + + const ucs4_t *b_codepoints = bi >= 0 ? (ucs4_t*)&bi : GRAPHEME_CODEPOINTS(bi); + int64_t b_len = bi >= 0 ? 1 : NUM_GRAPHEME_CODEPOINTS(bi); + + int cmp = 0; + (void)u32_casecmp(a_codepoints, (size_t)a_len, b_codepoints, (size_t)b_len, language, UNINORM_NFC, &cmp); + if (cmp != 0) + return false; + } + } + return true; +} + +public Text_t Text$upper(Text_t text) +{ + if (text.length == 0) return text; + Array_t codepoints = Text$utf32_codepoints(text); + const char *language = uc_locale_language(); + ucs4_t buf[128]; + size_t out_len = sizeof(buf)/sizeof(buf[0]); + ucs4_t *upper = u32_toupper(codepoints.data, (size_t)codepoints.length, language, UNINORM_NFC, buf, &out_len); + Text_t ret = text_from_u32(upper, (int64_t)out_len, false); + if (upper != buf) free(upper); + return ret; +} + +public Text_t Text$lower(Text_t text) +{ + if (text.length == 0) return text; + Array_t codepoints = Text$utf32_codepoints(text); + const char *language = uc_locale_language(); + ucs4_t buf[128]; + size_t out_len = sizeof(buf)/sizeof(buf[0]); + ucs4_t *lower = u32_tolower(codepoints.data, (size_t)codepoints.length, language, UNINORM_NFC, buf, &out_len); + Text_t ret = text_from_u32(lower, (int64_t)out_len, false); + if (lower != buf) free(lower); + return ret; +} + +public Text_t Text$title(Text_t text) +{ + if (text.length == 0) return text; + Array_t codepoints = Text$utf32_codepoints(text); + const char *language = uc_locale_language(); + ucs4_t buf[128]; + size_t out_len = sizeof(buf)/sizeof(buf[0]); + ucs4_t *title = u32_totitle(codepoints.data, (size_t)codepoints.length, language, UNINORM_NFC, buf, &out_len); + Text_t ret = text_from_u32(title, (int64_t)out_len, false); + if (title != buf) free(title); + return ret; +} + +public int printf_text_size(const struct printf_info *info, size_t n, int argtypes[n], int sizes[n]) +{ + if (n < 1) return -1; + (void)info; + argtypes[0] = PA_POINTER; + sizes[0] = sizeof(Text_t*); + return 1; +} + 
+public int printf_text(FILE *stream, const struct printf_info *info, const void *const args[]) +{ + Text_t t = **(Text_t**)args[0]; + if (info->alt) + return text_visualize(stream, t); + else + return Text$print(stream, t); +} + +static inline Text_t _quoted(Text_t text, bool colorize, char quote_char) +{ + // TODO: optimize for ASCII and short strings + Array_t graphemes = {.atomic=1}; +#define add_char(c) Array$insert_value(&graphemes, (ucs4_t)c, I_small(0), sizeof(ucs4_t)) +#define add_str(s) ({ for (const char *_c = s; *_c; ++_c) Array$insert_value(&graphemes, (ucs4_t)*_c, I_small(0), sizeof(ucs4_t)); }) + if (colorize) + add_str("\x1b[35m"); + if (quote_char != '"' && quote_char != '\'' && quote_char != '`') + add_char('$'); + add_char(quote_char); + +#define add_escaped(str) ({ if (colorize) add_str("\x1b[34;1m"); add_char('\\'); add_str(str); if (colorize) add_str("\x1b[0;35m"); }) + TextIter_t state = {0, 0}; + for (int64_t i = 0; i < text.length; i++) { + int32_t g = Text$get_grapheme_fast(text, &state, i); + switch (g) { + case '\a': add_escaped("a"); break; + case '\b': add_escaped("b"); break; + case '\x1b': add_escaped("e"); break; + case '\f': add_escaped("f"); break; + case '\n': add_escaped("n"); break; + case '\r': add_escaped("r"); break; + case '\t': add_escaped("t"); break; + case '\v': add_escaped("v"); break; + case '\\': add_escaped("\\"); break; + case '\x00' ... '\x06': case '\x0E' ... '\x1A': + case '\x1C' ... '\x1F': case '\x7F' ... 
'\x7F': { + if (colorize) add_str("\x1b[34;1m"); + add_char('\\'); + add_char('x'); + char tmp[4]; + sprintf(tmp, "%02X", g); + add_str(tmp); + if (colorize) + add_str("\x1b[0;35m"); + break; + } + default: { + if (g == quote_char) + add_escaped(((char[2]){quote_char, 0})); + else + add_char(g); + break; + } + } + } + + add_char(quote_char); + if (colorize) + add_str("\x1b[m"); + + return (Text_t){.length=graphemes.length, .tag=TEXT_GRAPHEMES, .graphemes=graphemes.data}; +#undef add_str +#undef add_char +#undef add_escaped +} + +public Text_t Text$as_text(const void *text, bool colorize, const TypeInfo *info) +{ + (void)info; + if (info->TextInfo.lang && streq(info->TextInfo.lang, "Path")) { + if (!text) return Text("Path"); + return Text$format("(%s%k%s)", colorize ? "\x1b[35m" : "", text, colorize ? "\x1b[m" : ""); + } + + if (!text) return info && info->TextInfo.lang ? Text$from_str(info->TextInfo.lang) : Text("Text"); + Text_t as_text = _quoted(*(Text_t*)text, colorize, info == &Pattern$info ? '/' : '"'); + if (info && info->TextInfo.lang && info != &Text$info && info != &Pattern$info) + as_text = Text$concat( + colorize ? Text("\x1b[1m$") : Text("$"), + Text$from_str(info->TextInfo.lang), + colorize ? Text("\x1b[0m") : Text(""), + as_text); + return as_text; +} + +public Text_t Text$quoted(Text_t text, bool colorize) +{ + return _quoted(text, colorize, '"'); +} + +public Text_t Text$join(Text_t glue, Array_t pieces) +{ + if (pieces.length == 0) return (Text_t){.length=0}; + + Text_t result = *(Text_t*)pieces.data; + for (int64_t i = 1; i < pieces.length; i++) { + result = Text$concat(result, glue, *(Text_t*)(pieces.data + i*pieces.stride)); + } + return result; +} + +__attribute__((format(printf, 1, 2))) +public Text_t Text$format(const char *fmt, ...) 
+{ + va_list args; + va_start(args, fmt); + + char buf[9]; + int len = vsnprintf(buf, sizeof(buf), fmt, args); + Text_t ret; + if (len <= 8) { + ret = (Text_t){ + .length=len, + .tag=TEXT_SHORT_ASCII, + }; + for (int i = 0; i < len; i++) + ret.short_ascii[i] = buf[i]; + } else { + char *str = GC_MALLOC_ATOMIC((size_t)(len+1)); + vsnprintf(str, (size_t)(len+1), fmt, args); + ret = Text$from_str(str); + } + va_end(args); + return ret; +} + +public Array_t Text$clusters(Text_t text) +{ + Array_t clusters = {.atomic=1}; + for (int64_t i = 1; i <= text.length; i++) { + Text_t cluster = Text$slice(text, I(i), I(i)); + Array$insert(&clusters, &cluster, I_small(0), sizeof(Text_t)); + } + return clusters; +} + +public Array_t Text$utf32_codepoints(Text_t text) +{ + Array_t codepoints = {.atomic=1}; + TextIter_t state = {0, 0}; + for (int64_t i = 0; i < text.length; i++) { + int32_t grapheme = Text$get_grapheme_fast(text, &state, i); + if (grapheme < 0) { + for (int64_t c = 0; c < NUM_GRAPHEME_CODEPOINTS(grapheme); c++) { + ucs4_t subg = GRAPHEME_CODEPOINTS(grapheme)[c]; + Array$insert(&codepoints, &subg, I_small(0), sizeof(ucs4_t)); + } + } else { + Array$insert(&codepoints, &grapheme, I_small(0), sizeof(ucs4_t)); + } + } + return codepoints; +} + +public Array_t Text$utf8_bytes(Text_t text) +{ + const char *str = Text$as_c_string(text); + return (Array_t){.length=strlen(str), .stride=1, .atomic=1, .data=(void*)str}; +} + +static inline const char *codepoint_name(ucs4_t c) +{ + char *name = GC_MALLOC_ATOMIC(UNINAME_MAX); + char *found_name = unicode_character_name(c, name); + if (found_name) return found_name; + const uc_block_t *block = uc_block(c); + assert(block); + snprintf(name, UNINAME_MAX, "%s-%X", block->name, c); + return name; +} + +public Array_t Text$codepoint_names(Text_t text) +{ + Array_t names = {}; + TextIter_t state = {0, 0}; + for (int64_t i = 0; i < text.length; i++) { + int32_t grapheme = Text$get_grapheme_fast(text, &state, i); + if (grapheme < 0) { + 
for (int64_t c = 0; c < NUM_GRAPHEME_CODEPOINTS(grapheme); c++) { + const char *name = codepoint_name(GRAPHEME_CODEPOINTS(grapheme)[c]); + Text_t name_text = (Text_t){.tag=TEXT_ASCII, .length=(int64_t)strlen(name), .ascii=name}; + Array$insert(&names, &name_text, I_small(0), sizeof(Text_t)); + } + } else { + const char *name = codepoint_name((ucs4_t)grapheme); + Text_t name_text = (Text_t){.tag=TEXT_ASCII, .length=(int64_t)strlen(name), .ascii=name}; + Array$insert(&names, &name_text, I_small(0), sizeof(Text_t)); + } + } + return names; +} + +public Text_t Text$from_codepoints(Array_t codepoints) +{ + if (codepoints.stride != sizeof(int32_t)) + Array$compact(&codepoints, sizeof(int32_t)); + + return text_from_u32(codepoints.data, codepoints.length, true); +} + +public Text_t Text$from_codepoint_names(Array_t codepoint_names) +{ + Array_t codepoints = {}; + for (int64_t i = 0; i < codepoint_names.length; i++) { + Text_t *name = ((Text_t*)(codepoint_names.data + i*codepoint_names.stride)); + const char *name_str = Text$as_c_string(*name); + ucs4_t codepoint = unicode_name_character(name_str); + if (codepoint != UNINAME_INVALID) + Array$insert(&codepoints, &codepoint, I_small(0), sizeof(ucs4_t)); + } + return Text$from_codepoints(codepoints); +} + +public Text_t Text$from_bytes(Array_t bytes) +{ + if (bytes.stride != sizeof(int8_t)) + Array$compact(&bytes, sizeof(int8_t)); + + int8_t nul = 0; + Array$insert(&bytes, &nul, I_small(0), sizeof(int8_t)); + return Text$from_str(bytes.data); +} + +public Array_t Text$lines(Text_t text) +{ + Array_t lines = {}; + TextIter_t state = {0, 0}; + for (int64_t i = 0, line_start = 0; i < text.length; i++) { + int32_t grapheme = Text$get_grapheme_fast(text, &state, i); + if (grapheme == '\r' && Text$get_grapheme_fast(text, &state, i + 1) == '\n') { // CRLF + Text_t line = Text$slice(text, I(line_start+1), I(i)); + Array$insert(&lines, &line, I_small(0), sizeof(Text_t)); + i += 1; // skip one extra for CR + line_start = i + 1; + } 
else if (grapheme == '\n') { // newline + Text_t line = Text$slice(text, I(line_start+1), I(i)); + Array$insert(&lines, &line, I_small(0), sizeof(Text_t)); + line_start = i + 1; + } else if (i == text.length-1 && line_start != i) { // last line + Text_t line = Text$slice(text, I(line_start+1), I(i+1)); + Array$insert(&lines, &line, I_small(0), sizeof(Text_t)); + } + } + return lines; +} + +public const TypeInfo Text$info = { + .size=sizeof(Text_t), + .align=__alignof__(Text_t), + .tag=TextInfo, + .TextInfo={.lang="Text"}, +}; + +public Pattern_t Pattern$escape_text(Text_t text) +{ + // TODO: optimize for ASCII and short strings + Array_t graphemes = {.atomic=1}; +#define add_char(c) Array$insert_value(&graphemes, (ucs4_t)c, I_small(0), sizeof(ucs4_t)) +#define add_str(s) ({ for (const char *_c = s; *_c; ++_c) Array$insert_value(&graphemes, (ucs4_t)*_c, I_small(0), sizeof(ucs4_t)); }) + TextIter_t state = {0, 0}; + for (int64_t i = 0; i < text.length; i++) { + int32_t g = Text$get_grapheme_fast(text, &state, i); + ucs4_t g0 = g < 0 ? GRAPHEME_CODEPOINTS(g)[0] : (ucs4_t)g; + + if (g == '{') { + add_str("{1{}"); + } else if (g0 == '?' 
+ || uc_is_property_quotation_mark(g0) + || (uc_is_property_paired_punctuation(g0) && uc_is_property_left_of_pair(g0))) { + add_char('{'); + add_char('1'); + add_char(g); + add_char('}'); + } else { + add_char(g); + } + } + return (Text_t){.length=graphemes.length, .tag=TEXT_GRAPHEMES, .graphemes=graphemes.data}; +#undef add_str +#undef add_char +#undef add_escaped +} + +public const TypeInfo Pattern$info = { + .size=sizeof(Pattern_t), + .align=__alignof__(Pattern_t), + .tag=TextInfo, + .TextInfo={.lang="Pattern"}, +}; + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/stdlib/text.h b/stdlib/text.h new file mode 100644 index 00000000..841d51fe --- /dev/null +++ b/stdlib/text.h @@ -0,0 +1,67 @@ +#pragma once + +// Type info and methods for Text datatype, which uses a struct inspired by +// Raku's string representation and libunistr + +#include +#include +#include +#include + +#include "datatypes.h" +#include "integers.h" + +typedef struct { + int64_t subtext, sum_of_previous_subtexts; +} TextIter_t; + +int printf_text(FILE *stream, const struct printf_info *info, const void *const args[]); +int printf_text_size(const struct printf_info *info, size_t n, int argtypes[n], int sizes[n]); + +#define Text(str) ((Text_t){.length=sizeof(str)-1, .tag=TEXT_ASCII, .ascii="" str}) + +int Text$print(FILE *stream, Text_t t); +void Text$visualize(Text_t t); +Text_t Text$_concat(int n, Text_t items[n]); +#define Text$concat(...) Text$_concat(sizeof((Text_t[]){__VA_ARGS__})/sizeof(Text_t), (Text_t[]){__VA_ARGS__}) +#define Texts(...) 
Text$concat(__VA_ARGS__) +Text_t Text$slice(Text_t text, Int_t first_int, Int_t last_int); +Text_t Text$from_str(const char *str); +Text_t Text$from_strn(const char *str, size_t len); +PUREFUNC uint64_t Text$hash(Text_t *text); +PUREFUNC int32_t Text$compare(const Text_t *a, const Text_t *b); +PUREFUNC bool Text$equal(const Text_t *a, const Text_t *b); +PUREFUNC bool Text$equal_values(Text_t a, Text_t b); +PUREFUNC bool Text$equal_ignoring_case(Text_t a, Text_t b); +Text_t Text$upper(Text_t text); +Text_t Text$lower(Text_t text); +Text_t Text$title(Text_t text); +Text_t Text$as_text(const void *text, bool colorize, const TypeInfo *info); +Text_t Text$quoted(Text_t str, bool colorize); +PUREFUNC bool Text$starts_with(Text_t text, Text_t prefix); +PUREFUNC bool Text$ends_with(Text_t text, Text_t suffix); +char *Text$as_c_string(Text_t text); +__attribute__((format(printf, 1, 2))) +public Text_t Text$format(const char *fmt, ...); +Array_t Text$clusters(Text_t text); +Array_t Text$utf32_codepoints(Text_t text); +Array_t Text$utf8_bytes(Text_t text); +Array_t Text$codepoint_names(Text_t text); +Text_t Text$from_codepoints(Array_t codepoints); +Text_t Text$from_codepoint_names(Array_t codepoint_names); +Text_t Text$from_bytes(Array_t bytes); +Array_t Text$lines(Text_t text); +Text_t Text$join(Text_t glue, Array_t pieces); +Text_t Text$repeat(Text_t text, Int_t count); +int32_t Text$get_grapheme_fast(Text_t text, TextIter_t *state, int64_t index); +ucs4_t Text$get_main_grapheme_fast(Text_t text, TextIter_t *state, int64_t index); + +static inline int32_t Text$get_grapheme(Text_t text, int64_t index) +{ + TextIter_t state = {0, 0}; + return Text$get_grapheme_fast(text, &state, index); +} + +extern const TypeInfo Text$info; + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/stdlib/threads.c b/stdlib/threads.c new file mode 100644 index 00000000..74e73832 --- /dev/null +++ b/stdlib/threads.c @@ -0,0 +1,55 @@ +// Logic for the Thread type, representing a pthread + 
+#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arrays.h" +#include "text.h" +#include "types.h" +#include "util.h" + +public pthread_t *Thread$new(Closure_t fn) +{ + pthread_t *thread = new(pthread_t); + pthread_create(thread, NULL, fn.fn, fn.userdata); + return thread; +} + +public void Thread$join(pthread_t *thread) +{ + pthread_join(*thread, NULL); +} + +public void Thread$cancel(pthread_t *thread) +{ + pthread_cancel(*thread); +} + +public void Thread$detach(pthread_t *thread) +{ + pthread_detach(*thread); +} + +Text_t Thread$as_text(const pthread_t **thread, bool colorize, const TypeInfo *type) +{ + (void)type; + if (!thread) { + return colorize ? Text("\x1b[34;1mThread\x1b[m") : Text("Thread"); + } + return Text$format(colorize ? "\x1b[34;1mThread(%p)\x1b[m" : "Thread(%p)", *thread); +} + +public const TypeInfo Thread = { + .size=sizeof(pthread_t*), .align=__alignof(pthread_t*), + .tag=CustomInfo, + .CustomInfo={.as_text=(void*)Thread$as_text}, +}; + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/stdlib/threads.h b/stdlib/threads.h new file mode 100644 index 00000000..52091677 --- /dev/null +++ b/stdlib/threads.h @@ -0,0 +1,20 @@ +#pragma once + +// Logic for the Thread type, representing a pthread + +#include +#include + +#include "datatypes.h" +#include "types.h" +#include "util.h" + +pthread_t *Thread$new(Closure_t fn); +void Thread$cancel(pthread_t *thread); +void Thread$join(pthread_t *thread); +void Thread$detach(pthread_t *thread); +Text_t Thread$as_text(const pthread_t **thread, bool colorize, const TypeInfo *type); + +extern TypeInfo Thread; + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/stdlib/tomo.h b/stdlib/tomo.h new file mode 100644 index 00000000..7db0f490 --- /dev/null +++ b/stdlib/tomo.h @@ -0,0 +1,34 @@ +#pragma once + +// All of the different builtin modules can be included by including this one +// import + +#include +#include +#include +#include +#include + 
+#include "arrays.h" +#include "bools.h" +#include "c_strings.h" +#include "channels.h" +#include "datatypes.h" +#include "functiontype.h" +#include "integers.h" +#include "memory.h" +#include "metamethods.h" +#include "nums.h" +#include "optionals.h" +#include "paths.h" +#include "patterns.h" +#include "pointers.h" +#include "ranges.h" +#include "shell.h" +#include "siphash.h" +#include "tables.h" +#include "text.h" +#include "threads.h" +#include "types.h" + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/stdlib/types.c b/stdlib/types.c new file mode 100644 index 00000000..c9f2578f --- /dev/null +++ b/stdlib/types.c @@ -0,0 +1,38 @@ +// Type information and methods for TypeInfos (i.e. runtime representations of types) +#include +#include +#include +#include +#include + +#include "util.h" +#include "arrays.h" +#include "pointers.h" +#include "tables.h" +#include "text.h" +#include "types.h" + +public Text_t Type$as_text(const void *typeinfo, bool colorize, const TypeInfo *type) +{ + if (!typeinfo) return Text("TypeInfo"); + + if (colorize) + return Text$concat( + Text("\x1b[36;1m"), + Text$from_str(type->TypeInfoInfo.type_str), + Text("\x1b[m")); + else + return Text$from_str(type->TypeInfoInfo.type_str); +} + +public const TypeInfo TypeInfo$info = { + .size=sizeof(TypeInfo), + .align=__alignof__(TypeInfo), + .tag=CustomInfo, + .TypeInfoInfo.type_str="TypeInfo", +}; + +public const TypeInfo Void$info = {.size=0, .align=0, .tag=EmptyStructInfo}; +public const TypeInfo Abort$info = {.size=0, .align=0, .tag=EmptyStructInfo}; + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/stdlib/types.h b/stdlib/types.h new file mode 100644 index 00000000..bcdafad2 --- /dev/null +++ b/stdlib/types.h @@ -0,0 +1,86 @@ +#pragma once + +// Type information and methods for TypeInfos (i.e. 
runtime representations of types) + +#include +#include + +#include "datatypes.h" + +struct TypeInfo; + +typedef uint64_t (*hash_fn_t)(const void*, const struct TypeInfo*); +typedef int32_t (*compare_fn_t)(const void*, const void*, const struct TypeInfo*); +typedef bool (*equal_fn_t)(const void*, const void*, const struct TypeInfo*); +typedef Text_t (*text_fn_t)(const void*, bool, const struct TypeInfo*); + +typedef struct TypeInfo { + int64_t size, align; + struct { // Anonymous tagged union for convenience + enum { CustomInfo, StructInfo, EnumInfo, PointerInfo, TextInfo, ArrayInfo, ChannelInfo, TableInfo, FunctionInfo, + OptionalInfo, TypeInfoInfo, OpaqueInfo, EmptyStructInfo, CStringInfo } tag; + union { + struct { + equal_fn_t equal; + compare_fn_t compare; + hash_fn_t hash; + text_fn_t as_text; + } CustomInfo; + struct { + const char *sigil; + const struct TypeInfo *pointed; + } PointerInfo; + struct { + const char *lang; + } TextInfo; + struct { + const struct TypeInfo *item; + } ArrayInfo, ChannelInfo; + struct { + const struct TypeInfo *key, *value; + } TableInfo; + struct { + const char *type_str; + } FunctionInfo; + struct { + const char *type_str; + } TypeInfoInfo; + struct { + const struct TypeInfo *type; + } OptionalInfo; +#pragma GCC diagnostic ignored "-Wpedantic" + struct {} OpaqueInfo; + struct { + const char *name; + } EmptyStructInfo; + }; + }; +} TypeInfo; + +#define Pointer$info(sigil_expr, pointed_info) &((TypeInfo){.size=sizeof(void*), .align=__alignof__(void*), \ + .tag=PointerInfo, .PointerInfo={.sigil=sigil_expr, .pointed=pointed_info}}) +#define Array$info(item_info) &((TypeInfo){.size=sizeof(Array_t), .align=__alignof__(Array_t), \ + .tag=ArrayInfo, .ArrayInfo.item=item_info}) +#define Set$info(item_info) &((TypeInfo){.size=sizeof(Table_t), .align=__alignof__(Table_t), \ + .tag=TableInfo, .TableInfo.key=item_info, .TableInfo.value=&Void$info}) +#define Channel$info(item_info) &((TypeInfo){.size=sizeof(Channel_t), 
.align=__alignof__(Channel_t), \ + .tag=ChannelInfo, .ChannelInfo.item=item_info}) +#define Table$info(key_expr, value_expr) &((TypeInfo){.size=sizeof(Table_t), .align=__alignof__(Table_t), \ + .tag=TableInfo, .TableInfo.key=key_expr, .TableInfo.value=value_expr}) +#define Function$info(typestr) &((TypeInfo){.size=sizeof(void*), .align=__alignof__(void*), \ + .tag=FunctionInfo, .FunctionInfo.type_str=typestr}) +#define Closure$info(typestr) &((TypeInfo){.size=sizeof(void*[2]), .align=__alignof__(void*), \ + .tag=FunctionInfo, .FunctionInfo.type_str=typestr}) +#define TypeInfo$info(typestr) &((TypeInfo){.size=sizeof(TypeInfo), .align=__alignof__(TypeInfo), \ + .tag=TypeInfoInfo, .TypeInfoInfo.type_str=typestr}) +#define Optional$info(t) &((TypeInfo){.size=(t)->size, .align=(t)->align, \ + .tag=OptionalInfo, .OptionalInfo.type=t}) + +extern const TypeInfo TypeInfo$info; +extern const TypeInfo Void$info; +extern const TypeInfo Abort$info; +#define Void_t void + +Text_t Type$as_text(const void *typeinfo, bool colorize, const TypeInfo *type); + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/stdlib/util.c b/stdlib/util.c new file mode 100644 index 00000000..7749b22c --- /dev/null +++ b/stdlib/util.c @@ -0,0 +1,28 @@ +// Built-in utility functions +#include +#include +#include +#include +#include +#include + +#include "text.h" +#include "util.h" + +public bool USE_COLOR; + +__attribute__((format(printf, 1, 2))) +public char *heap_strf(const char *fmt, ...) 
+{ + va_list args; + va_start(args, fmt); + char *tmp = NULL; + int len = vasprintf(&tmp, fmt, args); + if (len < 0) return NULL; + va_end(args); + char *ret = GC_strndup(tmp, (size_t)len); + free(tmp); + return ret; +} + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/stdlib/util.h b/stdlib/util.h new file mode 100644 index 00000000..a24264cd --- /dev/null +++ b/stdlib/util.h @@ -0,0 +1,66 @@ +#pragma once + +// Built-in utility functions + +#include +#include +#include +#include +#include + +#define streq(a, b) (((a) == NULL && (b) == NULL) || (((a) == NULL) == ((b) == NULL) && strcmp(a, b) == 0)) +#define starts_with(line, prefix) (strncmp(line, prefix, strlen(prefix)) == 0) +#define ends_with(line, suffix) (strlen(line) >= strlen(suffix) && strcmp(line + strlen(line) - strlen(suffix), suffix) == 0) +#define new(t, ...) ((t*)memcpy(GC_MALLOC(sizeof(t)), &(t){__VA_ARGS__}, sizeof(t))) +#define heap(x) (__typeof(x)*)memcpy(GC_MALLOC(sizeof(x)), (__typeof(x)[1]){x}, sizeof(x)) +#define stack(x) (__typeof(x)*)((__typeof(x)[1]){x}) +#define Match(x, _tag) ((x)->tag == _tag ? 
&(x)->__data._tag : (errx(1, __FILE__ ":%d This was supposed to be a " # _tag "\n", __LINE__), &(x)->__data._tag)) +#define check_initialized(var, name) *({ if (!var ## $initialized) fail("The variable " name " is being accessed before it has been initialized!"); \ + &var; }) + +#ifndef auto +#define auto __auto_type +#endif + +#ifndef public +#define public __attribute__ ((visibility ("default"))) +#endif + +#ifndef PUREFUNC +#define PUREFUNC __attribute__ ((pure)) +#endif + +#ifndef CONSTFUNC +#define CONSTFUNC __attribute__ ((const)) +#endif + +extern bool USE_COLOR; + +#define REVERSE_LIST(list) do { \ + __typeof(list) _prev = NULL; \ + __typeof(list) _next = NULL; \ + auto _current = list; \ + while (_current != NULL) { \ + _next = _current->next; \ + _current->next = _prev; \ + _prev = _current; \ + _current = _next; \ + } \ + list = _prev; \ +} while(0) + +#define LIST_MAP(src, var, ...) ({\ + __typeof(src) mapped = NULL; \ + __typeof(src) *next = &mapped; \ + for (__typeof(src) var = src; var; var = var->next) { \ + *next = GC_MALLOC(sizeof(__typeof(*(src)))); \ + **next = *var; \ + **next = (__typeof(*(src))){__VA_ARGS__}; \ + next = &((*next)->next); \ + } \ + mapped; }) + +__attribute__((format(printf, 1, 2))) +char *heap_strf(const char *fmt, ...); + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/structs.c b/structs.c index 328fa3be..6584eac8 100644 --- a/structs.c +++ b/structs.c @@ -5,12 +5,12 @@ #include #include "ast.h" -#include "builtins/text.h" +#include "stdlib/text.h" #include "compile.h" #include "cordhelpers.h" #include "environment.h" #include "typecheck.h" -#include "builtins/util.h" +#include "stdlib/util.h" static CORD compile_str_method(env_t *env, ast_t *ast) { diff --git a/tomo.c b/tomo.c index ec4f1c35..4d1692a8 100644 --- a/tomo.c +++ b/tomo.c @@ -10,9 +10,9 @@ #include #include "ast.h" -#include "builtins/arrays.h" -#include "builtins/datatypes.h" -#include "builtins/text.h" +#include "stdlib/arrays.h" +#include 
"stdlib/datatypes.h" +#include "stdlib/text.h" #include "compile.h" #include "cordhelpers.h" #include "parse.h" diff --git a/typecheck.c b/typecheck.c index 65597a8a..86c204dc 100644 --- a/typecheck.c +++ b/typecheck.c @@ -8,8 +8,8 @@ #include #include "ast.h" -#include "builtins/text.h" -#include "builtins/util.h" +#include "stdlib/text.h" +#include "stdlib/util.h" #include "cordhelpers.h" #include "environment.h" #include "parse.h" diff --git a/types.c b/types.c index 3f546432..bc54015c 100644 --- a/types.c +++ b/types.c @@ -6,9 +6,9 @@ #include #include -#include "builtins/integers.h" -#include "builtins/tables.h" -#include "builtins/util.h" +#include "stdlib/integers.h" +#include "stdlib/tables.h" +#include "stdlib/util.h" #include "cordhelpers.h" #include "types.h" diff --git a/types.h b/types.h index 94d74022..8b1ded4a 100644 --- a/types.h +++ b/types.h @@ -6,7 +6,7 @@ #include #include "ast.h" -#include "builtins/arrays.h" +#include "stdlib/arrays.h" typedef struct type_s type_t; -- cgit v1.2.3