diff options
Diffstat (limited to 'lib')
35 files changed, 0 insertions, 3853 deletions
diff --git a/lib/README.md b/lib/README.md deleted file mode 100644 index 31de5b92..00000000 --- a/lib/README.md +++ /dev/null @@ -1,19 +0,0 @@ -# Core Libraries - -This folder contains some libraries that are installed by default when -installing Tomo. - -## Libraries - -Libraries can be installed with `tomo -IL ./library-folder` - -- [base64](base64/): A base64 encoding/decoding library. -- [commands](commands/): A library for running commands. -- [core](core/): Bundling up commonly used libraries into a single library. -- [json](json/): JSON parsing and encoding. -- [patterns](patterns/): Pattern matching for text. -- [pthreads](pthreads/): A POSIX threads library. -- [random](random/): Pseudorandom number generators. -- [shell](shell/): A DSL for running shell commands. -- [time](time/): A module for working with dates and times. -- [uuid](uuid/): A universally unique identifier library. diff --git a/lib/base64/CHANGES.md b/lib/base64/CHANGES.md deleted file mode 100644 index 42ae752c..00000000 --- a/lib/base64/CHANGES.md +++ /dev/null @@ -1,5 +0,0 @@ -# Version History - -## v1.0 - -Initial version diff --git a/lib/base64/README.md b/lib/base64/README.md deleted file mode 100644 index 8db0f8ec..00000000 --- a/lib/base64/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# Base64 - -This is a library for encoding/decoding Base64 values. diff --git a/lib/base64/base64.tm b/lib/base64/base64.tm deleted file mode 100644 index bf512a83..00000000 --- a/lib/base64/base64.tm +++ /dev/null @@ -1,96 +0,0 @@ -# Base 64 encoding and decoding - -_enc := "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/".bytes() - -_EQUAL_BYTE := Byte(0x3D) - -_dec : [Byte] = [ - 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 62, 255, 255, 255, 63, - 52, 53, 54, 55, 56, 57, 58, 59, - 60, 61, 255, 255, 255, 255, 255, 255, - 255, 0, 1, 2, 3, 4, 5, 6, - 7, 8, 9, 10, 11, 12, 13, 14, - 15, 16, 17, 18, 19, 20, 21, 22, - 23, 24, 25, 255, 255, 255, 255, 255, - 255, 26, 27, 28, 29, 30, 31, 32, - 33, 34, 35, 36, 37, 38, 39, 40, - 41, 42, 43, 44, 45, 46, 47, 48, - 49, 50, 51, 255, 255, 255, 255, 255, -] - -lang Base64 - func parse(text:Text -> Base64?) - return Base64.from_bytes(text.bytes()) - - func from_bytes(bytes:[Byte] -> Base64?) - output := &[Byte(0) for _ in bytes.length * 4 / 3 + 4] - src := Int64(1) - dest := Int64(1) - while src + 2 <= Int64(bytes.length) - chunk24 := ( - (Int32(bytes[src]) <<< 16) or (Int32(bytes[src+1]) <<< 8) or Int32(bytes[src+2]) - ) - src += 3 - - output[dest] = _enc[1 + ((chunk24 >>> 18) and 0b111111)] - output[dest+1] = _enc[1 + ((chunk24 >>> 12) and 0b111111)] - output[dest+2] = _enc[1 + ((chunk24 >>> 6) and 0b111111)] - output[dest+3] = _enc[1 + (chunk24 and 0b111111)] - dest += 4 - - if src + 1 == bytes.length - chunk16 := ( - (Int32(bytes[src]) <<< 8) or Int32(bytes[src+1]) - ) - output[dest] = _enc[1 + ((chunk16 >>> 10) and 0b11111)] - output[dest+1] = _enc[1 + ((chunk16 >>> 4) and 0b111111)] - output[dest+2] = _enc[1 + ((chunk16 <<< 2)and 0b111111)] - output[dest+3] = _EQUAL_BYTE - else if src == bytes.length - chunk8 := Int32(bytes[src]) - output[dest] = _enc[1 + ((chunk8 >>> 2) and 0b111111)] - output[dest+1] = _enc[1 + ((chunk8 <<< 4) and 0b111111)] - output[dest+2] = _EQUAL_BYTE - output[dest+3] = _EQUAL_BYTE - - return Base64.from_text(Text.from_bytes(output[]) or return none) - - func decode_text(b64:Base64 -> Text?) - return Text.from_bytes(b64.decode_bytes() or return none) - - func decode_bytes(b64:Base64 -> [Byte]?) - bytes := b64.text.bytes() - output := &[Byte(0) for _ in bytes.length/4 * 3] - src := Int64(1) - dest := Int64(1) - while src + 3 <= Int64(bytes.length) - chunk24 := ( - (Int32(_dec[1+bytes[src]]) <<< 18) or - (Int32(_dec[1+bytes[src+1]]) <<< 12) or - (Int32(_dec[1+bytes[src+2]]) <<< 6) or - Int32(_dec[1+bytes[src+3]]) - ) - src += 4 - - output[dest] = Byte((chunk24 >>> 16) and 0xFF) - output[dest+1] = Byte((chunk24 >>> 8) and 0xFF) - output[dest+2] = Byte(chunk24 and 0xFF) - dest += 3 - - while output[-1] == 0xFF - output[] = output.to(-2) - - return output[] - -func main(input=(/dev/stdin), decode=no) - if decode - b := Base64.from_text(input.read()!) - say(b.decode_text()!) - else - text := input.read()! - say(Base64.parse(text)!.text) diff --git a/lib/commands/CHANGES.md b/lib/commands/CHANGES.md deleted file mode 100644 index 42ae752c..00000000 --- a/lib/commands/CHANGES.md +++ /dev/null @@ -1,5 +0,0 @@ -# Version History - -## v1.0 - -Initial version diff --git a/lib/commands/README.md b/lib/commands/README.md deleted file mode 100644 index 040f4bd5..00000000 --- a/lib/commands/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# Commands - -This module provides a way to run executable programs and get their output. You -can also feed in text to the programs' `stdin`. Think of it as `popen()` on -steroids. diff --git a/lib/commands/commands.c b/lib/commands/commands.c deleted file mode 100644 index 110ae696..00000000 --- a/lib/commands/commands.c +++ /dev/null @@ -1,295 +0,0 @@ -// Logic for running system commands - -#pragma once - -#include <errno.h> -#include <gc.h> -#include <poll.h> -#include <pthread.h> -#include <signal.h> -#include <spawn.h> -#include <stdbool.h> -#include <stdint.h> -#include <string.h> -#include <sys/param.h> -#include <sys/wait.h> -#include <unistd.h> - -// This is a workaround fix for an issue on some systems that don't have `__GLIBC__` defined -// and run into problems with <unistr.h> - -#ifndef __GLIBC__ -#define __GLIBC__ 2 -#include <unistr.h> // IWYU pragma: export -#undef __GLIBC__ -#else -#include <unistr.h> // IWYU pragma: export -#endif - -#define READ_END 0 -#define WRITE_END 1 - -static void xpipe(int fd[2]) { - if (pipe(fd) != 0) fail("Failed to create pipe: ", strerror(errno)); -} - -int run_command(Text_t exe, List_t arg_list, Table_t env_table, OptionalList_t input_bytes, List_t *output_bytes, - List_t *error_bytes) { - pthread_testcancel(); - - struct sigaction sa = {.sa_handler = SIG_IGN}, oldint, oldquit; - sigaction(SIGINT, &sa, &oldint); - sigaction(SIGQUIT, &sa, &oldquit); - sigaddset(&sa.sa_mask, SIGCHLD); - sigset_t old, reset; - sigprocmask(SIG_BLOCK, &sa.sa_mask, &old); - sigemptyset(&reset); - if (oldint.sa_handler != SIG_IGN) sigaddset(&reset, SIGINT); - if (oldquit.sa_handler != SIG_IGN) sigaddset(&reset, SIGQUIT); - posix_spawnattr_t attr; - posix_spawnattr_init(&attr); - posix_spawnattr_setsigmask(&attr, &old); - posix_spawnattr_setsigdefault(&attr, &reset); - posix_spawnattr_setflags(&attr, POSIX_SPAWN_SETSIGDEF | POSIX_SPAWN_SETSIGMASK); - - int child_inpipe[2], child_outpipe[2], child_errpipe[2]; - if (input_bytes.length >= 0) xpipe(child_inpipe); - if (output_bytes) xpipe(child_outpipe); - if (error_bytes) xpipe(child_errpipe); - - posix_spawn_file_actions_t actions; - posix_spawn_file_actions_init(&actions); - if (input_bytes.length >= 0) { - posix_spawn_file_actions_adddup2(&actions, child_inpipe[READ_END], STDIN_FILENO); - posix_spawn_file_actions_addclose(&actions, child_inpipe[WRITE_END]); - } - if (output_bytes) { - posix_spawn_file_actions_adddup2(&actions, child_outpipe[WRITE_END], STDOUT_FILENO); - posix_spawn_file_actions_addclose(&actions, child_outpipe[READ_END]); - } - if (error_bytes) { - posix_spawn_file_actions_adddup2(&actions, child_errpipe[WRITE_END], STDERR_FILENO); - posix_spawn_file_actions_addclose(&actions, child_errpipe[READ_END]); - } - - const char *exe_str = Text$as_c_string(exe); - - List_t arg_strs = {}; - List$insert_value(&arg_strs, exe_str, I(0), sizeof(char *)); - for (int64_t i = 0; i < arg_list.length; i++) - List$insert_value(&arg_strs, Text$as_c_string(*(Text_t *)(arg_list.data + i * arg_list.stride)), I(0), - sizeof(char *)); - List$insert_value(&arg_strs, NULL, I(0), sizeof(char *)); - char **args = arg_strs.data; - - extern char **environ; - char **env = environ; - if (env_table.entries.length > 0) { - List_t env_list = {}; // List of const char* - for (char **e = environ; *e; e++) - List$insert(&env_list, e, I(0), sizeof(char *)); - - for (int64_t i = 0; i < env_table.entries.length; i++) { - struct { - Text_t key, value; - } *entry = env_table.entries.data + env_table.entries.stride * i; - const char *env_entry = String(entry->key, "=", entry->value); - List$insert(&env_list, &env_entry, I(0), sizeof(char *)); - } - List$insert_value(&env_list, NULL, I(0), sizeof(char *)); - assert(env_list.stride == sizeof(char *)); - env = env_list.data; - } - - pid_t pid; - int ret = exe_str[0] == '/' ? posix_spawn(&pid, exe_str, &actions, &attr, args, env) - : posix_spawnp(&pid, exe_str, &actions, &attr, args, env); - if (ret != 0) return -1; - - posix_spawnattr_destroy(&attr); - posix_spawn_file_actions_destroy(&actions); - - if (input_bytes.length >= 0) close(child_inpipe[READ_END]); - if (output_bytes) close(child_outpipe[WRITE_END]); - if (error_bytes) close(child_errpipe[WRITE_END]); - - struct pollfd pollfds[3] = {}; - if (input_bytes.length >= 0) pollfds[0] = (struct pollfd){.fd = child_inpipe[WRITE_END], .events = POLLOUT}; - if (output_bytes) pollfds[1] = (struct pollfd){.fd = child_outpipe[WRITE_END], .events = POLLIN}; - if (error_bytes) pollfds[2] = (struct pollfd){.fd = child_errpipe[WRITE_END], .events = POLLIN}; - - if (input_bytes.length > 0 && input_bytes.stride != 1) List$compact(&input_bytes, sizeof(char)); - if (output_bytes) *output_bytes = (List_t){.atomic = 1, .stride = 1, .length = 0}; - if (error_bytes) *error_bytes = (List_t){.atomic = 1, .stride = 1, .length = 0}; - - while (input_bytes.length > 0 || output_bytes || error_bytes) { - (void)poll(pollfds, sizeof(pollfds) / sizeof(pollfds[0]), -1); // Wait for data or readiness - bool did_something = false; - if (input_bytes.length >= 0 && pollfds[0].revents) { - if (input_bytes.length > 0) { - ssize_t written = write(child_inpipe[WRITE_END], input_bytes.data, (size_t)input_bytes.length); - if (written > 0) { - input_bytes.data += written; - input_bytes.length -= (int64_t)written; - did_something = true; - } else if (written < 0) { - close(child_inpipe[WRITE_END]); - pollfds[0].events = 0; - } - } - if (input_bytes.length <= 0) { - close(child_inpipe[WRITE_END]); - pollfds[0].events = 0; - } - } - char buf[256]; - if (output_bytes && pollfds[1].revents) { - ssize_t n = read(child_outpipe[READ_END], buf, sizeof(buf)); - did_something = did_something || (n > 0); - if (n <= 0) { - close(child_outpipe[READ_END]); - pollfds[1].events = 0; - } else if (n > 0) { - if (output_bytes->free < n) { - output_bytes->data = GC_REALLOC(output_bytes->data, (size_t)(output_bytes->length + n)); - output_bytes->free = 0; - } - memcpy(output_bytes->data + output_bytes->length, buf, (size_t)n); - output_bytes->length += n; - } - } - if (error_bytes && pollfds[2].revents) { - ssize_t n = read(child_errpipe[READ_END], buf, sizeof(buf)); - did_something = did_something || (n > 0); - if (n <= 0) { - close(child_errpipe[READ_END]); - pollfds[2].events = 0; - } else if (n > 0) { - if (error_bytes->free < n) { - error_bytes->data = GC_REALLOC(error_bytes->data, (size_t)(error_bytes->length + n)); - error_bytes->free = 0; - } - memcpy(error_bytes->data + error_bytes->length, buf, (size_t)n); - error_bytes->length += n; - } - } - if (!did_something) break; - } - - int status = 0; - if (ret == 0) { - while (waitpid(pid, &status, 0) < 0 && errno == EINTR) { - if (WIFEXITED(status) || WIFSIGNALED(status)) break; - else if (WIFSTOPPED(status)) kill(pid, SIGCONT); - } - } - - if (input_bytes.length >= 0) close(child_inpipe[WRITE_END]); - if (output_bytes) close(child_outpipe[READ_END]); - if (error_bytes) close(child_errpipe[READ_END]); - - sigaction(SIGINT, &oldint, NULL); - sigaction(SIGQUIT, &oldquit, NULL); - sigprocmask(SIG_SETMASK, &old, NULL); - - if (ret) errno = ret; - return status; -} - -typedef struct { - pid_t pid; - FILE *out; -} child_info_t; - -static void _line_reader_cleanup(child_info_t *child) { - if (child && child->out) { - fclose(child->out); - child->out = NULL; - } - if (child->pid) { - kill(child->pid, SIGTERM); - child->pid = 0; - } -} - -static Text_t _next_line(child_info_t *child) { - if (!child || !child->out) return NONE_TEXT; - - char *line = NULL; - size_t size = 0; - ssize_t len = getline(&line, &size, child->out); - if (len <= 0) { - _line_reader_cleanup(child); - return NONE_TEXT; - } - - while (len > 0 && (line[len - 1] == '\r' || line[len - 1] == '\n')) - --len; - - if (u8_check((uint8_t *)line, (size_t)len) != NULL) fail("Invalid UTF8!"); - - Text_t line_text = Text$from_strn(line, len); - free(line); - return line_text; -} - -OptionalClosure_t command_by_line(Text_t exe, List_t arg_list, Table_t env_table) { - posix_spawnattr_t attr; - posix_spawnattr_init(&attr); - - int child_outpipe[2]; - xpipe(child_outpipe); - - posix_spawn_file_actions_t actions; - posix_spawn_file_actions_init(&actions); - posix_spawn_file_actions_adddup2(&actions, child_outpipe[WRITE_END], STDOUT_FILENO); - posix_spawn_file_actions_addclose(&actions, child_outpipe[READ_END]); - - const char *exe_str = Text$as_c_string(exe); - - List_t arg_strs = {}; - List$insert_value(&arg_strs, exe_str, I(0), sizeof(char *)); - for (int64_t i = 0; i < arg_list.length; i++) - List$insert_value(&arg_strs, Text$as_c_string(*(Text_t *)(arg_list.data + i * arg_list.stride)), I(0), - sizeof(char *)); - List$insert_value(&arg_strs, NULL, I(0), sizeof(char *)); - char **args = arg_strs.data; - - extern char **environ; - char **env = environ; - if (env_table.entries.length > 0) { - List_t env_list = {}; // List of const char* - for (char **e = environ; *e; e++) - List$insert(&env_list, e, I(0), sizeof(char *)); - - for (int64_t i = 0; i < env_table.entries.length; i++) { - struct { - Text_t key, value; - } *entry = env_table.entries.data + env_table.entries.stride * i; - const char *env_entry = String(entry->key, "=", entry->value); - List$insert(&env_list, &env_entry, I(0), sizeof(char *)); - } - List$insert_value(&env_list, NULL, I(0), sizeof(char *)); - assert(env_list.stride == sizeof(char *)); - env = env_list.data; - } - - pid_t pid; - int ret = exe_str[0] == '/' ? posix_spawn(&pid, exe_str, &actions, &attr, args, env) - : posix_spawnp(&pid, exe_str, &actions, &attr, args, env); - if (ret != 0) return NONE_CLOSURE; - - posix_spawnattr_destroy(&attr); - posix_spawn_file_actions_destroy(&actions); - - close(child_outpipe[WRITE_END]); - - child_info_t *child_info = GC_MALLOC(sizeof(child_info_t)); - child_info->out = fdopen(child_outpipe[READ_END], "r"); - child_info->pid = pid; - GC_register_finalizer(child_info, (void *)_line_reader_cleanup, NULL, NULL, NULL); - return (Closure_t){.fn = (void *)_next_line, .userdata = child_info}; -} - -#undef READ_END -#undef WRITE_END diff --git a/lib/commands/commands.tm b/lib/commands/commands.tm deleted file mode 100644 index 8a131042..00000000 --- a/lib/commands/commands.tm +++ /dev/null @@ -1,90 +0,0 @@ -# Functions for running system commands - -use ./commands.c - -extern run_command : func(exe:Text, args:[Text], env:{Text=Text}, input:[Byte]?, output:&[Byte]?, error:&[Byte]? -> Int32) -extern command_by_line : func(exe:Text, args:[Text], env:{Text=Text} -> func(->Text?)?) - -enum ExitType(Exited(status:Int32), Signaled(signal:Int32), Failed) - func succeeded(e:ExitType -> Bool) - when e is Exited(status) return (status == 0) - else return no - - func or_fail(e:ExitType, message:Text?=none) - if not e.succeeded() - fail(message or "Program failed: $e") - -struct ProgramResult(output:[Byte], errors:[Byte], exit_type:ExitType) - func or_fail(r:ProgramResult, message:Text?=none -> ProgramResult) - when r.exit_type is Exited(status) - if status != 0 - fail(message or "Program failed: $r") - else fail(message or "Program failed: $r") - return r - - func output_text(r:ProgramResult, trim_newline=yes -> Text?) - when r.exit_type is Exited(status) - if status == 0 - if text := Text.from_bytes(r.output) - if trim_newline - text = text.without_suffix("\n") - return text - else return none - return none - - func error_text(r:ProgramResult -> Text?) - when r.exit_type is Exited(status) - if status == 0 - return Text.from_bytes(r.errors) - else return none - return none - - func succeeded(r:ProgramResult -> Bool) - when r.exit_type is Exited(status) - return (status == 0) - else - return no - -struct Command(command:Text, args:[Text]=[], env:{Text=Text}={}) - func from_path(path:Path, args:[Text]=[], env:{Text=Text}={} -> Command) - return Command(Text(path), args, env) - - func result(command:Command, input="", input_bytes:[Byte]=[] -> ProgramResult) - if input.length > 0 - (&input_bytes).insert_all(input.bytes()) - - output : [Byte] - errors : [Byte] - status := run_command(command.command, command.args, command.env, input_bytes, &output, &errors) - - if C_code:Bool(WIFEXITED(@status)) - return ProgramResult(output, errors, ExitType.Exited(C_code:Int32(WEXITSTATUS(@status)))) - - if C_code:Bool(WIFSIGNALED(@status)) - return ProgramResult(output, errors, ExitType.Signaled(C_code:Int32(WTERMSIG(@status)))) - - return ProgramResult(output, errors, ExitType.Failed) - - func run(command:Command, -> ExitType) - status := run_command(command.command, command.args, command.env, none, none, none) - - if C_code:Bool(WIFEXITED(@status)) - return ExitType.Exited(C_code:Int32(WEXITSTATUS(@status))) - - if C_code:Bool(WIFSIGNALED(@status)) - return ExitType.Signaled(C_code:Int32(WTERMSIG(@status))) - - return ExitType.Failed - - func get_output(command:Command, input="", trim_newline=yes -> Text?) - return command.result(input=input).output_text(trim_newline=trim_newline) - - func get_output_bytes(command:Command, input="", input_bytes:[Byte]=[] -> [Byte]?) - result := command.result(input=input, input_bytes=input_bytes) - when result.exit_type is Exited(status) - if status == 0 return result.output - return none - else return none - - func by_line(command:Command -> func(->Text?)?) - return command_by_line(command.command, command.args, command.env) diff --git a/lib/core/CHANGES.md b/lib/core/CHANGES.md deleted file mode 100644 index 42ae752c..00000000 --- a/lib/core/CHANGES.md +++ /dev/null @@ -1,5 +0,0 @@ -# Version History - -## v1.0 - -Initial version diff --git a/lib/core/core.tm b/lib/core/core.tm deleted file mode 100644 index 5ed77756..00000000 --- a/lib/core/core.tm +++ /dev/null @@ -1,9 +0,0 @@ -# This file just uses all the most commonly used standard -# library modules so you don't have to import them one-by-one - -use patterns_v1.1 -use commands_v1.0 -use shell_v1.0 -use pthreads_v1.0 -use random_v1.0 -use time_v1.0 diff --git a/lib/json/CHANGES.md b/lib/json/CHANGES.md deleted file mode 100644 index 42ae752c..00000000 --- a/lib/json/CHANGES.md +++ /dev/null @@ -1,5 +0,0 @@ -# Version History - -## v1.0 - -Initial version diff --git a/lib/json/README.md b/lib/json/README.md deleted file mode 100644 index 33e2101a..00000000 --- a/lib/json/README.md +++ /dev/null @@ -1,18 +0,0 @@ -# JSON - -This is a library for encoding/decoding JSON values. - -## Usage - -```tomo ->> j := JSON({"key1"=123, "key2"=[yes, {"ok"="inner"}, JSON.Null]}) -= JSON.Object({"key1"=Number(123), "key2"=Array([Boolean(yes), Object({"ok"=String("inner")}), Null])}) - -say("$(j.encode())") -say("$(j.pretty_print())") - -when JSON.parse("[1, null, true]") is Success(obj) - >> obj -is Failure(msg) - fail("Failed to parse JSON: $msg") -``` diff --git a/lib/json/json.tm b/lib/json/json.tm deleted file mode 100644 index 8127ce52..00000000 --- a/lib/json/json.tm +++ /dev/null @@ -1,173 +0,0 @@ -# Base 64 encoding and decoding -use patterns - -enum JSONDecodeResult( - Success(json:JSON) - Failure(reason:Text) -) - func invalid(text:Text -> JSONDecodeResult) - return Failure("Unrecognized JSON: $(text.quoted())") - -extend Text - func json_quoted(text:Text -> Text) - return '"' ++ text.translate({ - "\\"="\\\\", - '"'='\\"', - "\f"="\\f", - "\r"="\\r", - "\n"="\\n", - "\b"="\\b", - "\t"="\\t", - }) ++ '"' - -enum JSON( - Object(items:{Text=JSON}) - Array(items:[JSON]) - Boolean(value:Bool) - String(text:Text) - Number(n:Num) - Null -) - func encode(j:JSON -> Text) - when j is Object(items) - return "{" ++ ", ".join([ - '$(k.json_quoted()): $(v.encode())' - for k,v in items - ]) ++ "}" - is Array(items) - return "[" ++ ", ".join([item.encode() for item in items]) ++ "]" - is Boolean(value) - return (if value then "true" else "false") - is String(text) - return text.json_quoted() - is Number(n) - return "$n" - is Null - return "null" - - func pretty_print(j:JSON, max_line:Int=80, indent:Text=" ", current_indent:Text="" -> Text) - inline := j.encode() - if inline.length > max_line - next_indent := current_indent ++ indent - when j is Object(items) - return "{\n$next_indent" ++ ",\n$next_indent".join([ - '$(k.json_quoted()): $(v.pretty_print(max_line, indent, next_indent))' - for k,v in items - ]) ++ "\n$current_indent}" - is Array(items) - return "[\n$next_indent" ++ ",\n$next_indent".join([item.pretty_print(max_line, indent, next_indent) for item in items]) ++ "\n$current_indent]" - else pass - - return inline - - func parse_text(text:Text, remainder:&Text? = none -> JSONDecodeResult) - if text.starts_with('"') - string := "" - pos := 2 - escapes := {"n"="\n", "t"="\t", "r"="\r", '"'='"', "\\"="\\", "/"="/", "b"="\b", "f"="\f"} - while pos <= text.length - c := text[pos] - if c == '"' - if remainder - remainder[] = text.from(pos + 1) - return Success(JSON.String(string)) - - if c == "\\" - stop if pos + 1 > text.length - - if esc := escapes[text[pos+1]] - string ++= esc - pos += 2 - else if m := text.matching_pattern($Pat/u{4 digit}/) - string ++= Text.from_codepoints([Int32.parse(m.captures[1])!]) - pos += 1 + m.text.length - else - if remainder - remainder[] = text - return JSONDecodeResult.invalid(text) - else - string ++= c - pos += 1 - - if remainder - remainder[] = text - return JSONDecodeResult.invalid(text) - - func parse(text:Text, remainder:&Text? = none, trailing_commas:Bool=no -> JSONDecodeResult) - if text.starts_with("true", remainder) - return Success(JSON.Boolean(yes)) - else if text.starts_with("false", remainder) - return Success(JSON.Boolean(no)) - else if text.starts_with("null", remainder) - return Success(JSON.Null) - else if n := Num.parse(text, remainder) - return Success(JSON.Number(n)) - else if text.starts_with('"') - return JSON.parse_text(text, remainder) - else if text.starts_with("[") - elements : &[JSON] - text = text.from(2).trim_pattern($Pat"{whitespace}", right=no) - repeat - when JSON.parse(text, &text) is Success(elem) - elements.insert(elem) - else stop - - if delim := text.matching_pattern($Pat'{0+ ws},{0+ ws}') - text = text.from(delim.text.length + 1) - else stop - - if trailing_commas - if delim := text.matching_pattern($Pat'{0+ ws},{0+ ws}') - text = text.from(delim.text.length + 1) - - if terminator := text.matching_pattern($Pat'{0+ ws}]') - if remainder - remainder[] = text.from(terminator.text.length + 1) - return Success(JSON.Array(elements)) - else if text.starts_with("{") - object : &{Text=JSON} - text = text.from(2).trim_pattern($Pat"{whitespace}", right=no) - repeat - key_text := text - when JSON.parse_text(text, &text) is Success(key) - if separator := text.matching_pattern($Pat'{0+ ws}:{0+ ws}') - text = text.from(separator.text.length + 1) - else - return JSONDecodeResult.invalid(text) - - when JSON.parse(text, &text) is Success(value) - when key is String(str) - object[str] = value - else - return JSONDecodeResult.invalid(key_text) - else - return JSONDecodeResult.invalid(text) - else stop - - if delim := text.matching_pattern($Pat'{0+ ws},{0+ ws}') - text = text.from(delim.text.length + 1) - else stop - - if trailing_commas - if delim := text.matching_pattern($Pat'{0+ ws},{0+ ws}') - text = text.from(delim.text.length + 1) - - if terminator := text.matching_pattern($Pat'{0+ ws}{}}') - if remainder - remainder[] = text.from(terminator.text.length + 1) - return Success(JSON.Object(object)) - - return JSONDecodeResult.invalid(text) - -func main(input=(/dev/stdin), pretty_print:Bool = no, trailing_commas:Bool = yes) - text := (input.read() or exit("Invalid file: $input")).trim_pattern($Pat"{whitespace}") - while text.length > 0 - when JSON.parse(text, remainder=&text, trailing_commas=trailing_commas) is Success(json) - if pretty_print - say(json.pretty_print()) - else - say(json.encode()) - is Failure(msg) - exit("\033[31;1m$msg\033[m", code=1) - - text = text.trim_pattern($Pat"{whitespace}") diff --git a/lib/patterns/CHANGES.md b/lib/patterns/CHANGES.md deleted file mode 100644 index cf6254cb..00000000 --- a/lib/patterns/CHANGES.md +++ /dev/null @@ -1,8 +0,0 @@ -# Version History - -## v1.1 -- Added `Text.matching_pattern(text:Text, pattern:Pattern, pos:Int = 1 -> PatternMatch?)` - -## v1.0 - -Initial version diff --git a/lib/patterns/README.md b/lib/patterns/README.md deleted file mode 100644 index faf2854e..00000000 --- a/lib/patterns/README.md +++ /dev/null @@ -1,444 +0,0 @@ -# Text Pattern Matching - -As an alternative to full regular expressions, Tomo provides a limited text -matching pattern syntax that is intended to solve 80% of use cases in under 1% -of the code size (PCRE's codebase is roughly 150k lines of code, and Tomo's -pattern matching code is a bit under 1k lines of code). Tomo's pattern matching -syntax is highly readable and works well for matching literal text without -getting [leaning toothpick syndrome](https://en.wikipedia.org/wiki/Leaning_toothpick_syndrome). - -For more advanced use cases, consider linking against a C library for regular -expressions or pattern matching. - -`Pat` is a [domain-specific language](docs/langs.md), in other words, it's -like a `Text`, but it has a distinct type. - -Patterns are used in a small, but very powerful API that handles many text -functions that would normally be handled by a more extensive API: - -- [`by_pattern(text:Text, pattern:Pat -> func(->PatternMatch?))`](#by_pattern) -- [`by_pattern_split(text:Text, pattern:Pat -> func(->Text?))`](#by_pattern_split) -- [`each_pattern(text:Text, pattern:Pat, fn:func(m:PatternMatch), recursive=yes)`](#each_pattern) -- [`find_patterns(text:Text, pattern:Pat -> [PatternMatch])`](#find_patterns) -- [`has_pattern(text:Text, pattern:Pat -> Bool)`](#has_pattern) -- [`map_pattern(text:Text, pattern:Pat, fn:func(m:PatternMatch -> Text), recursive=yes -> Text)`](#map_pattern) -- [`matches_pattern(text:Text, pattern:Pat -> Bool)`](#matches_pattern) -- [`pattern_captures(text:Text, pattern:Pat -> [Text]?)`](#pattern_captures) -- [`replace_pattern(text:Text, pattern:Pat, replacement:Text, backref="@", recursive=yes -> Text)`](#replace_pattern) -- [`split_pattern(text:Text, pattern:Pat -> [Text])`](#split_pattern) -- [`translate_patterns(text:Text, replacements:{Pat,Text}, backref="@", recursive=yes -> Text)`](#translate_patterns) -- [`trim_pattern(text:Text, pattern=$Pat"{space}", left=yes, right=yes -> Text)`](#trim_pattern) - -## Matches - -Pattern matching functions work with a type called `PatternMatch` that has three fields: - -- `text`: The full text of the match. -- `index`: The index in the text where the match was found. -- `captures`: A list containing the matching text of each non-literal pattern group. - -See [Text Functions](text.md#Text-Functions) for the full API documentation. - -## Syntax - -Patterns have three types of syntax: - -- `{` followed by an optional count (`n`, `n-m`, or `n+`), followed by an - optional `!` to negate the pattern, followed by an optional pattern name or - Unicode character name, followed by a required `}`. - -- Any matching pair of quotes or parentheses or braces with a `?` in the middle - (e.g. `"?"` or `(?)`). - -- Any other character is treated as a literal to be matched exactly. - -## Named Patterns - -Named patterns match certain pre-defined patterns that are commonly useful. To -use a named pattern, use the syntax `{name}`. Names are case-insensitive and -mostly ignore spaces, underscores, and dashes. - -- `..` - Any character (note that a single `.` would mean the literal period - character). -- `digit` - A unicode digit -- `email` - an email address -- `emoji` - an emoji -- `end` - the very end of the text -- `id` - A unicode identifier -- `int` - One or more digits with an optional `-` (minus sign) in front -- `ip` - an IP address (IPv4 or IPv6) -- `ipv4` - an IPv4 address -- `ipv6` - an IPv6 address -- `nl`/`newline`/`crlf` - A line break (either `\r\n` or `\n`) -- `num` - One or more digits with an optional `-` (minus sign) in front and an optional `.` and more digits after -- `start` - the very start of the text -- `uri` - a URI -- `url` - a URL (URI that specifically starts with `http://`, `https://`, `ws://`, `wss://`, or `ftp://`) -- `word` - A unicode identifier (same as `id`) - -For non-alphabetic characters, any single character is treated as matching -exactly that character. For example, `{1{}` matches exactly one `{` -character. Or, `{1.}` matches exactly one `.` character. - -Patterns can also use any Unicode property name. Some helpful ones are: - -- `hex` - Hexidecimal digits -- `lower` - Lowercase letters -- `space` - The space character -- `upper` - Uppercase letters -- `whitespace` - Whitespace characters - -Patterns may also use exact Unicode codepoint names. For example: `{1 latin -small letter A}` matches `a`. - -## Negating Patterns - -If an exclamation mark (`!`) is placed before a pattern's name, then characters -are matched only when they _don't_ match the pattern. For example, `{!alpha}` -will match all characters _except_ alphabetic ones. - -## Interpolating Text and Escaping - -To escape a character in a pattern (e.g. if you want to match the literal -character `?`), you can use the syntax `{1 ?}`. This is almost never necessary -unless you have text that looks like a Tomo text pattern and has something like -`{` or `(?)` inside it. - -However, if you're trying to do an exact match of arbitrary text values, you'll -want to have the text automatically escaped. Fortunately, Tomo's injection-safe -DSL text interpolation supports automatic text escaping. This means that if you -use text interpolation with the `$` sign to insert a text value, the value will -be automatically escaped using the `{1 ?}` rule described above: - -```tomo -# Risk of code injection (would cause an error because 'xxx' is not a valid -# pattern name: ->> user_input := get_user_input() -= "{xxx}" - -# Interpolation automatically escapes: ->> $/$user_input/ -= $/{1{}..xxx}/ - -# This is: `{ 1{ }` (one open brace) followed by the literal text "..xxx}" - -# No error: ->> some_text.find($/$user_input/) -= 0 -``` - -If you prefer, you can also use this to insert literal characters: - -```tomo ->> $/literal $"{..}"/ -= $/literal {1{}..}/ -``` - -## Repetitions - -By default, named patterns match 1 or more repetitions, but you can specify how -many repetitions you want by putting a number or range of numbers first using -`n` (exactly `n` repetitions), `n-m` (between `n` and `m` repetitions), or `n+` -(`n` or more repetitions): - -``` -{4-5 alpha} -0x{hex} -{4 digit}-{2 digit}-{2 digit} -{2+ space} -{0-1 question mark} -``` - - -# Methods - -### `by_pattern` -Returns an iterator function that yields `PatternMatch` objects for each occurrence. - -```tomo -func by_pattern(text:Text, pattern:Pat -> func(->PatternMatch?)) -``` - -- `text`: The text to search. -- `pattern`: The pattern to match. - -**Returns:** -An iterator function that yields `PatternMatch` objects one at a time. - -**Example:** -```tomo -text := "one, two, three" -for word in text.by_pattern($Pat"{id}"): - say(word.text) -``` - ---- - -### `by_pattern_split` -Returns an iterator function that yields text segments split by a pattern. - -```tomo -func by_pattern_split(text:Text, pattern:Pat -> func(->Text?)) -``` - -- `text`: The text to split. -- `pattern`: The pattern to use as a separator. - -**Returns:** -An iterator function that yields text segments. - -**Example:** -```tomo -text := "one two three" -for word in text.by_pattern_split($Pat"{whitespace}"): - say(word.text) -``` - ---- - -### `each_pattern` -Applies a function to each occurrence of a pattern in the text. - -```tomo -func each_pattern(text:Text, pattern:Pat, fn:func(m:PatternMatch), recursive=yes) -``` - -- `text`: The text to search. -- `pattern`: The pattern to match. -- `fn`: The function to apply to each match. -- `recursive`: If `yes`, applies the function recursively on modified text. - -**Example:** -```tomo -text := "one two three" -text.each_pattern($Pat"{id}", func(m:PatternMatch): - say(m.txt) -) -``` - ---- - -### `find_patterns` -Finds all occurrences of a pattern in a text and returns them as `PatternMatch` objects. - -```tomo -func find_patterns(text:Text, pattern:Pat -> [PatternMatch]) -``` - -- `text`: The text to search. -- `pattern`: The pattern to match. - -**Returns:** -A list of `PatternMatch` objects. - -**Example:** -```tomo -text := "one! two three!" ->> text.find_patterns($Pat"{id}!") -= [PatternMatch(text="one!", index=1, captures=["one"]), PatternMatch(text="three!", index=10, captures=["three"])] -``` - ---- - -### `has_pattern` -Checks whether a given pattern appears in the text. - -```tomo -func has_pattern(text:Text, pattern:Pat -> Bool) -``` - -- `text`: The text to search. -- `pattern`: The pattern to check for. - -**Returns:** -`yes` if a match is found, otherwise `no`. - -**Example:** -```tomo -text := "...okay..." ->> text.has_pattern($Pat"{id}") -= yes -``` - ---- - -### `map_pattern` -Transforms matches of a pattern using a mapping function. - -```tomo -func map_pattern(text:Text, pattern:Pat, fn:func(m:PatternMatch -> Text), recursive=yes -> Text) -``` - -- `text`: The text to modify. -- `pattern`: The pattern to match. -- `fn`: A function that transforms matches. -- `recursive`: If `yes`, applies transformations recursively. - -**Returns:** -A new text with the transformed matches. - -**Example:** -```tomo -text := "I have #apples and #oranges and #plums" -fruits := {"apples"=4, "oranges"=5} ->> text.map_pattern($Pat'#{id}', func(match:PatternMatch): - fruit := match.captures[1] - "$(fruits[fruit] or 0) $fruit" -) -= "I have 4 apples and 5 oranges and 0 plums" -``` - ---- - -### `matches_pattern` -Returns whether or not text matches a pattern completely. - -```tomo -func matches_pattern(text:Text, pattern:Pat -> Bool) -``` - -- `text`: The text to match against. -- `pattern`: The pattern to match. - -**Returns:** -`yes` if the whole text matches the pattern, otherwise `no`. - -**Example:** -```tomo ->> "Hello!!!".matches_pattern($Pat"{id}") -= no ->> "Hello".matches_pattern($Pat"{id}") -= yes -``` - ---- - -### `pattern_captures` -Returns a list of pattern captures for the given pattern. - -```tomo -func pattern_captures(text:Text, pattern:Pat -> [Text]?) -``` - -- `text`: The text to match against. -- `pattern`: The pattern to match. - -**Returns:** -An optional list of matched pattern captures. Returns `none` if the text does -not match the pattern. - -**Example:** -```tomo ->> "123 boxes".pattern_captures($Pat"{int} {id}") -= ["123", "boxes"]? ->> "xxx".pattern_captures($Pat"{int} {id}") -= none -``` - ---- - -### `replace_pattern` -Replaces occurrences of a pattern with a replacement text, supporting backreferences. - -```tomo -func replace_pattern(text:Text, pattern:Pat, replacement:Text, backref="@", recursive=yes -> Text) -``` - -- `text`: The text to modify. -- `pattern`: The pattern to match. -- `replacement`: The text to replace matches with. -- `backref`: The symbol for backreferences in the replacement. -- `recursive`: If `yes`, applies replacements recursively. - -**Returns:** -A new text with replacements applied. - -**Example:** -```tomo ->> "I have 123 apples and 456 oranges".replace_pattern($Pat"{int}", "some") -= "I have some apples and some oranges" - ->> "I have 123 apples and 456 oranges".replace_pattern($Pat"{int}", "(@1)") -= "I have (123) apples and (456) oranges" - ->> "I have 123 apples and 456 oranges".replace_pattern($Pat"{int}", "(?1)", backref="?") -= "I have (123) apples and (456) oranges" - ->> "bad(fn(), bad(notbad))".replace_pattern($Pat"bad(?)", "good(@1)") -= "good(fn(), good(notbad))" - ->> "bad(fn(), bad(notbad))".replace_pattern($Pat"bad(?)", "good(@1)", recursive=no) -= "good(fn(), bad(notbad))" -``` - ---- - -### `split_pattern` -Splits a text into segments using a pattern as the delimiter. - -```tomo -func split_pattern(text:Text, pattern:Pat -> [Text]) -``` - -- `text`: The text to split. -- `pattern`: The pattern to use as a separator. - -**Returns:** -A list of text segments. - -**Example:** -```tomo ->> "one two three".split_pattern($Pat"{whitespace}") -= ["one", "two", "three"] -``` - ---- - -### `translate_patterns` -Replaces multiple patterns using a mapping of patterns to replacement texts. - -```tomo -func translate_patterns(text:Text, replacements:{Pat,Text}, backref="@", recursive=yes -> Text) -``` - -- `text`: The text to modify. -- `replacements`: A table mapping patterns to their replacements. -- `backref`: The symbol for backreferences in replacements. -- `recursive`: If `yes`, applies replacements recursively. - -**Returns:** -A new text with all specified replacements applied. - -**Example:** -```tomo ->> text := "foo(x, baz(1))" ->> text.translate_patterns({ - $Pat"{id}(?)"="call(fn('@1'), @2)", - $Pat"{id}"="var('@1')", - $Pat"{int}"="int(@1)", -}) -= "call(fn('foo'), var('x'), call(fn('baz'), int(1)))" -``` - ---- - -### `trim_pattern` -Removes matching patterns from the beginning and/or end of a text. - -```tomo -func trim_pattern(text:Text, pattern=$Pat"{space}", left=yes, right=yes -> Text) -``` - -- `text`: The text to trim. -- `pattern`: The pattern to trim (defaults to whitespace). -- `left`: If `yes`, trims from the beginning. -- `right`: If `yes`, trims from the end. - -**Returns:** -The trimmed text. - -**Example:** -```tomo ->> "123abc456".trim_pattern($Pat"{digit}") -= "abc" -``` diff --git a/lib/patterns/_test.tm b/lib/patterns/_test.tm deleted file mode 100644 index 26c23628..00000000 --- a/lib/patterns/_test.tm +++ /dev/null @@ -1,256 +0,0 @@ -use patterns_v1.0 - -func main() - amelie := "Am\{UE9}lie" - amelie2 := "Am\{U65}\{U301}lie" - >> "Hello".replace_pattern($Pat/e/, "X") - = "HXllo" - - >> "Hello".has_pattern($Pat/l/) - = yes - >> "Hello".has_pattern($Pat/l{end}/) - = no - >> "Hello".has_pattern($Pat/{start}l/) - = no - - >> "Hello".has_pattern($Pat/o/) - = yes - >> "Hello".has_pattern($Pat/o{end}/) - = yes - >> "Hello".has_pattern($Pat/{start}o/) - = no - - >> "Hello".has_pattern($Pat/H/) - = yes - >> "Hello".has_pattern($Pat/H{end}/) - = no - >> "Hello".has_pattern($Pat/{start}H/) - = yes - - >> "Hello".replace_pattern($Pat/l/, "") - = "Heo" - >> "xxxx".replace_pattern($Pat/x/, "") - = "" - >> "xxxx".replace_pattern($Pat/y/, "") - = "xxxx" - >> "One two three four five six".replace_pattern($Pat/e /, "") - = "Ontwo threfour fivsix" - - >> " one ".replace_pattern($Pat/{start}{space}/, "") - = "one " - >> " one ".replace_pattern($Pat/{space}{end}/, "") - = " one" - - >> amelie.has_pattern($Pat/$amelie2/) - = yes - - >> "one two three".replace_pattern($Pat/{alpha}/, "") - = " " - >> "one two three".replace_pattern($Pat/{alpha}/, "word") - = "word word word" - - say("Test splitting and joining text:") - - >> "one two three".split_pattern($Pat/ /) - = ["one", "two", "three"] - - >> "one,two,three,".split_pattern($Pat/,/) - = ["one", "two", "three", ""] - - >> "one two three".split_pattern($Pat/{space}/) - = ["one", "two", "three"] - - >> "abc".split_pattern($Pat//) - = ["a", "b", "c"] - - >> ", ".join(["one", "two", "three"]) - = "one, two, three" - - >> "".join(["one", "two", "three"]) - = "onetwothree" - - >> "+".join(["one"]) - = "one" - - >> "+".join([]) - = "" - - say("Test text.find_patterns()") - >> " #one #two #three ".find_patterns($Pat/#{alpha}/) - = [PatternMatch(text="#one", index=2, captures=["one"]), PatternMatch(text="#two", index=8, captures=["two"]), PatternMatch(text="#three", index=13, captures=["three"])] - - >> " #one #two #three ".find_patterns($Pat/#{!space}/) - = [PatternMatch(text="#one", index=2, captures=["one"]), PatternMatch(text="#two", index=8, captures=["two"]), PatternMatch(text="#three", index=13, captures=["three"])] - - >> " ".find_patterns($Pat/{alpha}/) - = [] - - >> " foo(baz(), 1) doop() ".find_patterns($Pat/{id}(?)/) - = [PatternMatch(text="foo(baz(), 1)", index=2, captures=["foo", "baz(), 1"]), PatternMatch(text="doop()", index=17, captures=["doop", ""])] - - >> "".find_patterns($Pat'') - = [] - - >> "Hello".find_patterns($Pat'') - = [] - - say("Test text slicing:") - >> "abcdef".slice() - = "abcdef" - >> "abcdef".slice(from=3) - = "cdef" - >> "abcdef".slice(to=-2) - = "abcde" - >> "abcdef".slice(from=2, to=4) - = "bcd" - >> "abcdef".slice(from=5, to=1) - = "" - - >> house := "家" - = "家" - >> house.length - = 1 - >> house.codepoint_names() - = ["CJK Unified Ideographs-5BB6"] - >> house.utf32_codepoints() - = [23478] - - >> "🐧".codepoint_names() - = ["PENGUIN"] - - >> Text.from_codepoint_names(["not a valid name here buddy"]) - = none - - >> "one two; three four".find_patterns($Pat/; {..}/) - = [PatternMatch(text="; three four", index=8, captures=["three four"])] - - malicious := "{xxx}" - >> $Pat/$malicious/ - = $Pat/{1{}xxx}/ - - >> "Hello".replace_pattern($Pat/{lower}/, "(@0)") - = "H(ello)" - - >> " foo(xyz) foo(yyy) foo(z()) ".replace_pattern($Pat/foo(?)/, "baz(@1)") - = " baz(xyz) baz(yyy) baz(z()) " - - >> "<tag>".translate_patterns({$Pat/</="<", $Pat/>/=">"}) - = "<tag>" - - >> " BAD(x, fn(y), BAD(z), w) ".replace_pattern($Pat/BAD(?)/, "good(@1)", recursive=yes) - = " good(x, fn(y), good(z), w) " - - >> " BAD(x, fn(y), BAD(z), w) ".replace_pattern($Pat/BAD(?)/, "good(@1)", recursive=no) - = " good(x, fn(y), BAD(z), w) " - - >> "Hello".matches_pattern($Pat/{id}/) - = yes - >> "Hello".matches_pattern($Pat/{lower}/) - = no - >> "Hello".matches_pattern($Pat/{upper}/) - = no - >> "Hello...".matches_pattern($Pat/{id}/) - = no - - >> "hello world".map_pattern($Pat/world/, func(m:PatternMatch) m.text.upper()) - = "hello WORLD" - - >> "Abc".repeat(3) - = "AbcAbcAbc" - - >> " abc def ".trim_pattern() - = "abc def" - >> " abc123def ".trim_pattern($Pat/{!digit}/) - = "123" - >> " abc123def ".trim_pattern($Pat/{!digit}/, left=no) - = " abc123" - >> " abc123def ".trim_pattern($Pat/{!digit}/, right=no) - = "123def " - # Only trim single whole matches that bookend the text: - >> "AbcAbcxxxxxxxxAbcAbc".trim_pattern($Pat/Abc/) - = "AbcxxxxxxxxAbc" - - >> "A=B=C=D".replace_pattern($Pat/{..}={..}/, "1:(@1) 2:(@2)") - = "1:(A) 2:(B=C=D)" - - >> "abcde".starts_with("ab") - = yes - >> "abcde".starts_with("bc") - = no - - >> "abcde".ends_with("de") - = yes - >> "abcde".starts_with("cd") - = no - - >> ("hello" ++ " " ++ "Amélie").reversed() - = "eilémA olleh" - - do - say("Testing concatenation-stability:") - ab := Text.from_codepoint_names(["LATIN SMALL LETTER E", "COMBINING VERTICAL LINE BELOW"])! - >> ab.codepoint_names() - = ["LATIN SMALL LETTER E", "COMBINING VERTICAL LINE BELOW"] - >> ab.length - = 1 - - a := Text.from_codepoint_names(["LATIN SMALL LETTER E"])! - b := Text.from_codepoint_names(["COMBINING VERTICAL LINE BELOW"])! - >> (a++b).codepoint_names() - = ["LATIN SMALL LETTER E", "COMBINING VERTICAL LINE BELOW"] - >> (a++b) == ab - = yes - >> (a++b).length - = 1 - - - do - concat := "e" ++ Text.from_codepoints([Int32(0x300)]) - >> concat.length - = 1 - - concat2 := concat ++ Text.from_codepoints([Int32(0x302)]) - >> concat2.length - = 1 - - concat3 := concat2 ++ Text.from_codepoints([Int32(0x303)]) - >> concat3.length - = 1 - - final := Text.from_codepoints([Int32(0x65), Int32(0x300), Int32(0x302), Int32(0x303)]) - >> final.length - = 1 - >> concat3 == final - = yes - - concat4 := Text.from_codepoints([Int32(0x65), Int32(0x300)]) ++ Text.from_codepoints([Int32(0x302), Int32(0x303)]) - >> concat4.length - = 1 - >> concat4 == final - = yes - - >> "x".left_pad(5) - = " x" - >> "x".right_pad(5) - = "x " - >> "x".middle_pad(5) - = " x " - >> "1234".left_pad(8, "XYZ") - = "XYZX1234" - >> "1234".right_pad(8, "XYZ") - = "1234XYZX" - >> "1234".middle_pad(9, "XYZ") - = "XY1234XYZ" - - >> amelie.width() - = 6 - cowboy := "🤠" - >> cowboy.width() - = 2 - >> cowboy.left_pad(4) - = " 🤠" - >> cowboy.right_pad(4) - = "🤠 " - >> cowboy.middle_pad(4) - = " 🤠 " - diff --git a/lib/patterns/match_type.h b/lib/patterns/match_type.h deleted file mode 100644 index f85f1862..00000000 --- a/lib/patterns/match_type.h +++ /dev/null @@ -1,9 +0,0 @@ -// A datatype used for pattern match results - -#pragma once - -typedef struct { - Text_t text; - Int_t index; - List_t captures; -} XMatch; diff --git a/lib/patterns/patterns.c b/lib/patterns/patterns.c deleted file mode 100644 index 27799c40..00000000 --- a/lib/patterns/patterns.c +++ /dev/null @@ -1,1212 +0,0 @@ -// Logic for text pattern matching - -#include <ctype.h> -#include <string.h> -#include <strings.h> -#include <sys/param.h> -#include <unictype.h> -#include <uniname.h> -#include <unistring/version.h> - -#define MAX_BACKREFS 100 - -typedef struct { - Text_t text; - Int_t index; - List_t captures; -} PatternMatch; - -typedef struct { - Text_t text; - Int_t index; - List_t captures; - bool is_none : 1; -} OptionalPatternMatch; - -#define NONE_MATCH ((OptionalPatternMatch){.is_none = true}) - -typedef struct { - int64_t index, length; - bool occupied, recursive; -} capture_t; - -typedef struct { - enum { PAT_START, PAT_END, PAT_ANY, PAT_GRAPHEME, PAT_PROPERTY, PAT_QUOTE, PAT_PAIR, PAT_FUNCTION } tag; - bool negated, non_capturing; - int64_t min, max; - union { - int32_t grapheme; - uc_property_t property; - int64_t (*fn)(TextIter_t *, int64_t); - int32_t quote_graphemes[2]; - int32_t pair_graphemes[2]; - }; -} pat_t; - -static Text_t replace_list(Text_t text, List_t replacements, Text_t backref_pat, bool recursive); - -static INLINE void skip_whitespace(TextIter_t *state, int64_t *i) { - while (*i < state->stack[0].text.length) { - int32_t grapheme = Text$get_grapheme_fast(state, *i); - if (grapheme > 0 && !uc_is_property_white_space((ucs4_t)grapheme)) return; - *i += 1; - } -} - -static INLINE bool match_grapheme(TextIter_t *state, int64_t *i, int32_t grapheme) { - if (*i < state->stack[0].text.length && Text$get_grapheme_fast(state, *i) == grapheme) { - *i += 1; - return true; - } - return false; -} - -static INLINE bool match_str(TextIter_t *state, int64_t *i, const char *str) { - int64_t matched = 0; - while (matched[str]) { - if (*i + matched >= state->stack[0].text.length || Text$get_grapheme_fast(state, *i + matched) != str[matched]) - return false; - matched += 1; - } - *i += matched; - return true; -} - -static int64_t parse_int(TextIter_t *state, int64_t *i) { - int64_t value = 0; - for (;; *i += 1) { - uint32_t grapheme = Text$get_main_grapheme_fast(state, *i); - int digit = uc_digit_value(grapheme); - if (digit < 0) break; - if (value >= INT64_MAX / 10) break; - value = 10 * value + digit; - } - return value; -} - -static const char *get_property_name(TextIter_t *state, int64_t *i) { - skip_whitespace(state, i); - char *name = GC_MALLOC_ATOMIC(UNINAME_MAX); - char *dest = name; - while (*i < state->stack[0].text.length) { - int32_t grapheme = Text$get_grapheme_fast(state, *i); - if (!(grapheme & ~0xFF) && (isalnum(grapheme) || grapheme == ' ' || grapheme == '_' || grapheme == '-')) { - *dest = (char)grapheme; - ++dest; - if (dest >= name + UNINAME_MAX - 1) break; - } else { - break; - } - *i += 1; - } - - while (dest > name && dest[-1] == ' ') - *(dest--) = '\0'; - - if (dest == name) return NULL; - *dest = '\0'; - return name; -} - -#define EAT1(state, index, cond) \ - ({ \ - int32_t grapheme = Text$get_grapheme_fast(state, index); \ - bool success = (cond); \ - if (success) index += 1; \ - success; \ - }) - -#define EAT2(state, index, cond1, cond2) \ - ({ \ - int32_t grapheme = Text$get_grapheme_fast(state, index); \ - bool success = (cond1); \ - if (success) { \ - grapheme = Text$get_grapheme_fast(state, index + 1); \ - success = (cond2); \ - if (success) index += 2; \ - } \ - success; \ - }) - -#define EAT_MANY(state, index, cond) \ - ({ \ - int64_t _n = 0; \ - while (EAT1(state, index, cond)) { \ - _n += 1; \ - } \ - _n; \ - }) - -static int64_t match_email(TextIter_t *state, int64_t index) { - // email = local "@" domain - // local = 1-64 ([a-zA-Z0-9!#$%&‘*+–/=?^_`.{|}~] | non-ascii) - // domain = dns-label ("." dns-label)* - // dns-label = 1-63 ([a-zA-Z0-9-] | non-ascii) - - if (index > 0) { - uint32_t prev_codepoint = Text$get_main_grapheme_fast(state, index - 1); - if (uc_is_property_alphabetic(prev_codepoint)) return -1; - } - - int64_t start_index = index; - - // Local part: - int64_t local_len = 0; - static const char *allowed_local = "!#$%&‘*+–/=?^_`.{|}~"; - while (EAT1(state, index, (grapheme & ~0x7F) || isalnum((char)grapheme) || strchr(allowed_local, (char)grapheme))) { - local_len += 1; - if (local_len > 64) return -1; - } - - if (!EAT1(state, index, grapheme == '@')) return -1; - - // Host - int64_t host_len = 0; - do { - int64_t label_len = 0; - while (EAT1(state, index, (grapheme & ~0x7F) || isalnum((char)grapheme) || grapheme == '-')) { - label_len += 1; - if (label_len > 63) return -1; - } - - if (label_len == 0) return -1; - - host_len += label_len; - if (host_len > 255) return -1; - host_len += 1; - } while (EAT1(state, index, grapheme == '.')); - - return index - start_index; -} - -static int64_t match_ipv6(TextIter_t *state, int64_t index) { - if (index > 0) { - int32_t prev_codepoint = Text$get_grapheme_fast(state, index - 1); - if ((prev_codepoint & ~0x7F) && (isxdigit(prev_codepoint) || prev_codepoint == ':')) return -1; - } - int64_t start_index = index; - const int NUM_CLUSTERS = 8; - bool double_colon_used = false; - for (int cluster = 0; cluster < NUM_CLUSTERS; cluster++) { - for (int digits = 0; digits < 4; digits++) { - if (!EAT1(state, index, ~(grapheme & ~0x7F) && isxdigit((char)grapheme))) break; - } - if (EAT1(state, index, ~(grapheme & ~0x7F) && isxdigit((char)grapheme))) return -1; // Too many digits - - if (cluster == NUM_CLUSTERS - 1) { - break; - } else if (!EAT1(state, index, grapheme == ':')) { - if (double_colon_used) break; - return -1; - } - - if (EAT1(state, index, grapheme == ':')) { - if (double_colon_used) return -1; - double_colon_used = true; - } - } - return index - start_index; -} - -static int64_t match_ipv4(TextIter_t *state, int64_t index) { - if (index > 0) { - int32_t prev_codepoint = Text$get_grapheme_fast(state, index - 1); - if ((prev_codepoint & ~0x7F) && (isdigit(prev_codepoint) || prev_codepoint == '.')) return -1; - } - int64_t start_index = index; - - const int NUM_CLUSTERS = 4; - for (int cluster = 0; cluster < NUM_CLUSTERS; cluster++) { - for (int digits = 0; digits < 3; digits++) { - if (!EAT1(state, index, ~(grapheme & ~0x7F) && isdigit((char)grapheme))) { - if (digits == 0) return -1; - break; - } - } - - if (EAT1(state, index, ~(grapheme & ~0x7F) && isdigit((char)grapheme))) return -1; // Too many digits - - if (cluster == NUM_CLUSTERS - 1) break; - else if (!EAT1(state, index, grapheme == '.')) return -1; - } - return (index - start_index); -} - -static int64_t match_ip(TextIter_t *state, int64_t index) { - int64_t len = match_ipv6(state, index); - if (len >= 0) return len; - len = match_ipv4(state, index); - return (len >= 0) ? len : -1; -} - -static int64_t match_host(TextIter_t *state, int64_t index) { - int64_t ip_len = match_ip(state, index); - if (ip_len > 0) return ip_len; - - int64_t start_index = index; - if (match_grapheme(state, &index, '[')) { - ip_len = match_ip(state, index); - if (ip_len <= 0) return -1; - index += ip_len; - if (match_grapheme(state, &index, ']')) return (index - start_index); - return -1; - } - - if (!EAT1(state, index, isalpha(grapheme))) return -1; - - static const char *non_host_chars = "/#?:@ \t\r\n<>[]{}\\^|\"`"; - EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_host_chars, (char)grapheme)); - return (index - start_index); -} - -static int64_t match_authority(TextIter_t *state, int64_t index) { - int64_t authority_start = index; - static const char *non_segment_chars = "/#?:@ \t\r\n<>[]{}\\^|\"`."; - - // Optional user@ prefix: - int64_t username_len = EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_segment_chars, (char)grapheme)); - if (username_len < 1 || !EAT1(state, index, grapheme == '@')) index = authority_start; // No user@ part - - // Host: - int64_t host_len = match_host(state, index); - if (host_len <= 0) return -1; - index += host_len; - - // Port: - if (EAT1(state, index, grapheme == ':')) { - if (EAT_MANY(state, index, !(grapheme & ~0x7F) && isdigit(grapheme)) == 0) return -1; - } - return (index - authority_start); -} - -static int64_t match_uri(TextIter_t *state, int64_t index) { - // URI = scheme ":" ["//" authority] path ["?" query] ["#" fragment] - // scheme = [a-zA-Z] [a-zA-Z0-9+.-] - // authority = [userinfo "@"] host [":" port] - - if (index > 0) { - // Don't match if we're not at a word edge: - uint32_t prev_codepoint = Text$get_main_grapheme_fast(state, index - 1); - if (uc_is_property_alphabetic(prev_codepoint)) return -1; - } - - int64_t start_index = index; - - // Scheme: - if (!EAT1(state, index, isalpha(grapheme))) return -1; - EAT_MANY(state, index, - !(grapheme & ~0x7F) && (isalnum(grapheme) || grapheme == '+' || grapheme == '.' || grapheme == '-')); - if (!match_grapheme(state, &index, ':')) return -1; - - // Authority: - int64_t authority_len; - if (match_str(state, &index, "//")) { - authority_len = match_authority(state, index); - if (authority_len > 0) index += authority_len; - } else { - authority_len = 0; - } - - // Path: - int64_t path_start = index; - if (EAT1(state, index, grapheme == '/') || authority_len <= 0) { - static const char *non_path = " \"#?<>[]{}\\^`|"; - EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_path, (char)grapheme)); - - if (EAT1(state, index, grapheme == '?')) { // Query - static const char *non_query = " \"#<>[]{}\\^`|"; - EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_query, (char)grapheme)); - } - - if (EAT1(state, index, grapheme == '#')) { // Fragment - static const char *non_fragment = " \"#<>[]{}\\^`|"; - EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_fragment, (char)grapheme)); - } - } - - if (authority_len <= 0 && index == path_start) return -1; - - return index - start_index; -} - -static int64_t match_url(TextIter_t *state, int64_t index) { - int64_t lookahead = index; - if (!(match_str(state, &lookahead, "https:") || match_str(state, &lookahead, "http:") - || match_str(state, &lookahead, "ftp:") || match_str(state, &lookahead, "wss:") - || match_str(state, &lookahead, "ws:"))) - return -1; - - return match_uri(state, index); -} - -static int64_t match_id(TextIter_t *state, int64_t index) { - if (!EAT1(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_XID_START))) return -1; - return 1 + EAT_MANY(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_XID_CONTINUE)); -} - -static int64_t match_int(TextIter_t *state, int64_t index) { - int64_t negative = EAT1(state, index, grapheme == '-') ? 1 : 0; - int64_t len = EAT_MANY(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT)); - return len > 0 ? negative + len : -1; -} - -static int64_t match_alphanumeric(TextIter_t *state, int64_t index) { - return EAT1(state, index, uc_is_property_alphabetic((ucs4_t)grapheme) || uc_is_property_numeric((ucs4_t)grapheme)) - ? 1 - : -1; -} - -static int64_t match_num(TextIter_t *state, int64_t index) { - bool negative = EAT1(state, index, grapheme == '-') ? 1 : 0; - int64_t pre_decimal = EAT_MANY(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT)); - bool decimal = (EAT1(state, index, grapheme == '.') == 1); - int64_t post_decimal = - decimal ? EAT_MANY(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT)) : 0; - if (pre_decimal == 0 && post_decimal == 0) return -1; - return negative + pre_decimal + decimal + post_decimal; -} - -static int64_t match_newline(TextIter_t *state, int64_t index) { - if (index >= state->stack[0].text.length) return -1; - - uint32_t grapheme = index >= state->stack[0].text.length ? 0 : Text$get_main_grapheme_fast(state, index); - if (grapheme == '\n') return 1; - if (grapheme == '\r' && Text$get_grapheme_fast(state, index + 1) == '\n') return 2; - return -1; -} - -static int64_t match_pat(TextIter_t *state, int64_t index, pat_t pat) { - Text_t text = state->stack[0].text; - int32_t grapheme = index >= text.length ? 0 : Text$get_grapheme_fast(state, index); - - switch (pat.tag) { - case PAT_START: { - if (index == 0) return pat.negated ? -1 : 0; - return pat.negated ? 0 : -1; - } - case PAT_END: { - if (index >= text.length) return pat.negated ? -1 : 0; - return pat.negated ? 0 : -1; - } - case PAT_ANY: { - assert(!pat.negated); - return (index < text.length) ? 1 : -1; - } - case PAT_GRAPHEME: { - if (index >= text.length) return -1; - else if (grapheme == pat.grapheme) return pat.negated ? -1 : 1; - return pat.negated ? 1 : -1; - } - case PAT_PROPERTY: { - if (index >= text.length) return -1; - else if (uc_is_property((ucs4_t)grapheme, pat.property)) return pat.negated ? -1 : 1; - return pat.negated ? 1 : -1; - } - case PAT_PAIR: { - // Nested punctuation: (?), [?], etc - if (index >= text.length) return -1; - - int32_t open = pat.pair_graphemes[0]; - if (grapheme != open) return pat.negated ? 1 : -1; - - int32_t close = pat.pair_graphemes[1]; - int64_t depth = 1; - int64_t match_len = 1; - for (; depth > 0; match_len++) { - if (index + match_len >= text.length) return pat.negated ? 1 : -1; - - int32_t c = Text$get_grapheme_fast(state, index + match_len); - if (c == open) depth += 1; - else if (c == close) depth -= 1; - } - return pat.negated ? -1 : match_len; - } - case PAT_QUOTE: { - // Nested quotes: "?", '?', etc - if (index >= text.length) return -1; - - int32_t open = pat.quote_graphemes[0]; - if (grapheme != open) return pat.negated ? 1 : -1; - - int32_t close = pat.quote_graphemes[1]; - for (int64_t i = index + 1; i < text.length; i++) { - int32_t c = Text$get_grapheme_fast(state, i); - if (c == close) { - return pat.negated ? -1 : (i - index) + 1; - } else if (c == '\\' && index + 1 < text.length) { - i += 1; // Skip ahead an extra step - } - } - return pat.negated ? 1 : -1; - } - case PAT_FUNCTION: { - int64_t match_len = pat.fn(state, index); - if (match_len >= 0) return pat.negated ? -1 : match_len; - return pat.negated ? 1 : -1; - } - default: errx(1, "Invalid pattern"); - } - errx(1, "Unreachable"); - return 0; -} - -static pat_t parse_next_pat(TextIter_t *state, int64_t *index) { - if (EAT2(state, *index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_QUOTATION_MARK), grapheme == '?')) { - // Quotations: "?", '?', etc - int32_t open = Text$get_grapheme_fast(state, *index - 2); - int32_t close = open; - uc_mirror_char((ucs4_t)open, (ucs4_t *)&close); - if (!match_grapheme(state, index, close)) fail("Pattern's closing quote is missing: ", state->stack[0].text); - - return (pat_t){ - .tag = PAT_QUOTE, - .min = 1, - .max = 1, - .quote_graphemes = {open, close}, - }; - } else if (EAT2(state, *index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_PAIRED_PUNCTUATION), grapheme == '?')) { - // Nested punctuation: (?), [?], etc - int32_t open = Text$get_grapheme_fast(state, *index - 2); - int32_t close = open; - uc_mirror_char((ucs4_t)open, (ucs4_t *)&close); - if (!match_grapheme(state, index, close)) fail("Pattern's closing brace is missing: ", state->stack[0].text); - - return (pat_t){ - .tag = PAT_PAIR, - .min = 1, - .max = 1, - .pair_graphemes = {open, close}, - }; - } else if (EAT1(state, *index, grapheme == '{')) { // named patterns {id}, {2-3 hex}, etc. - skip_whitespace(state, index); - int64_t min, max; - if (uc_is_digit((ucs4_t)Text$get_grapheme_fast(state, *index))) { - min = parse_int(state, index); - skip_whitespace(state, index); - if (match_grapheme(state, index, '+')) { - max = INT64_MAX; - } else if (match_grapheme(state, index, '-')) { - max = parse_int(state, index); - } else { - max = min; - } - if (min > max) fail("Minimum repetitions (", min, ") is less than the maximum (", max, ")"); - } else { - min = -1, max = -1; - } - - skip_whitespace(state, index); - - bool negated = match_grapheme(state, index, '!'); -#define PAT(_tag, ...) ((pat_t){.min = min, .max = max, .negated = negated, .tag = _tag, __VA_ARGS__}) - const char *prop_name; - if (match_str(state, index, "..")) prop_name = ".."; - else prop_name = get_property_name(state, index); - - if (!prop_name) { - // Literal character, e.g. {1?} - skip_whitespace(state, index); - int32_t grapheme = Text$get_grapheme_fast(state, (*index)++); - if (!match_grapheme(state, index, '}')) fail("Missing closing '}' in pattern: ", state->stack[0].text); - return PAT(PAT_GRAPHEME, .grapheme = grapheme); - } else if (strlen(prop_name) == 1) { - // Single letter names: {1+ A} - skip_whitespace(state, index); - if (!match_grapheme(state, index, '}')) fail("Missing closing '}' in pattern: ", state->stack[0].text); - return PAT(PAT_GRAPHEME, .grapheme = prop_name[0]); - } - - skip_whitespace(state, index); - if (!match_grapheme(state, index, '}')) fail("Missing closing '}' in pattern: ", state->stack[0].text); - - switch (tolower(prop_name[0])) { - case '.': - if (prop_name[1] == '.') { - if (negated) return ((pat_t){.tag = PAT_END, .min = min, .max = max, .non_capturing = true}); - else return PAT(PAT_ANY); - } - break; - case 'a': - if (strcasecmp(prop_name, "authority") == 0) { - return PAT(PAT_FUNCTION, .fn = match_authority); - } else if (strcasecmp(prop_name, "alphanum") == 0 || strcasecmp(prop_name, "anum") == 0 - || strcasecmp(prop_name, "alphanumeric") == 0) { - return PAT(PAT_FUNCTION, .fn = match_alphanumeric); - } - break; - case 'c': - if (strcasecmp(prop_name, "crlf") == 0) return PAT(PAT_FUNCTION, .fn = match_newline); - break; - case 'd': - if (strcasecmp(prop_name, "digit") == 0) { - return PAT(PAT_PROPERTY, .property = UC_PROPERTY_DECIMAL_DIGIT); - } - break; - case 'e': - if (strcasecmp(prop_name, "end") == 0) { - return PAT(PAT_END, .non_capturing = !negated); - } else if (strcasecmp(prop_name, "email") == 0) { - return PAT(PAT_FUNCTION, .fn = match_email); - } -#if _LIBUNISTRING_VERSION >= 0x0100000 - else if (strcasecmp(prop_name, "emoji") == 0) { - return PAT(PAT_PROPERTY, .property = UC_PROPERTY_EMOJI); - } -#endif - break; - case 'h': - if (strcasecmp(prop_name, "host") == 0) { - return PAT(PAT_FUNCTION, .fn = match_host); - } - break; - case 'i': - if (strcasecmp(prop_name, "id") == 0) { - return PAT(PAT_FUNCTION, .fn = match_id); - } else if (strcasecmp(prop_name, "int") == 0) { - return PAT(PAT_FUNCTION, .fn = match_int); - } else if (strcasecmp(prop_name, "ipv4") == 0) { - return PAT(PAT_FUNCTION, .fn = match_ipv4); - } else if (strcasecmp(prop_name, "ipv6") == 0) { - return PAT(PAT_FUNCTION, .fn = match_ipv6); - } else if (strcasecmp(prop_name, "ip") == 0) { - return PAT(PAT_FUNCTION, .fn = match_ip); - } - break; - case 'n': - if (strcasecmp(prop_name, "nl") == 0 || strcasecmp(prop_name, "newline") == 0) { - return PAT(PAT_FUNCTION, .fn = match_newline); - } else if (strcasecmp(prop_name, "num") == 0) { - return PAT(PAT_FUNCTION, .fn = match_num); - } - break; - case 's': - if (strcasecmp(prop_name, "start") == 0) { - return PAT(PAT_START, .non_capturing = !negated); - } - break; - case 'u': - if (strcasecmp(prop_name, "uri") == 0) { - return PAT(PAT_FUNCTION, .fn = match_uri); - } else if (strcasecmp(prop_name, "url") == 0) { - return PAT(PAT_FUNCTION, .fn = match_url); - } - break; - case 'w': - if (strcasecmp(prop_name, "word") == 0) { - return PAT(PAT_FUNCTION, .fn = match_id); - } else if (strcasecmp(prop_name, "ws") == 0 || strcasecmp(prop_name, "whitespace") == 0) { - return PAT(PAT_PROPERTY, .property = UC_PROPERTY_WHITE_SPACE); - } - break; - default: break; - } - - uc_property_t prop = uc_property_byname(prop_name); - if (uc_property_is_valid(prop)) return PAT(PAT_PROPERTY, .property = prop); - - ucs4_t grapheme = unicode_name_character(prop_name); - if (grapheme == UNINAME_INVALID) fail("Not a valid property or character name: ", prop_name); - return PAT(PAT_GRAPHEME, .grapheme = (int32_t)grapheme); -#undef PAT - } else { - return (pat_t){.tag = PAT_GRAPHEME, - .non_capturing = true, - .min = 1, - .max = 1, - .grapheme = Text$get_grapheme_fast(state, (*index)++)}; - } -} - -static int64_t match(Text_t text, int64_t text_index, Text_t pattern, int64_t pattern_index, capture_t *captures, - int64_t capture_index) { - if (pattern_index >= pattern.length) // End of the pattern - return 0; - - int64_t start_index = text_index; - TextIter_t pattern_state = NEW_TEXT_ITER_STATE(pattern), text_state = NEW_TEXT_ITER_STATE(text); - pat_t pat = parse_next_pat(&pattern_state, &pattern_index); - - if (pat.min == -1 && pat.max == -1) { - if (pat.tag == PAT_ANY && pattern_index >= pattern.length) { - pat.min = pat.max = MAX(1, text.length - text_index); - } else { - pat.min = 1; - pat.max = INT64_MAX; - } - } - - int64_t capture_start = text_index; - int64_t count = 0, capture_len = 0, next_match_len = 0; - - if (pat.tag == PAT_ANY && pattern_index >= pattern.length) { - int64_t remaining = text.length - text_index; - capture_len = remaining >= pat.min ? MIN(remaining, pat.max) : -1; - text_index += capture_len; - goto success; - } - - if (pat.min == 0 && pattern_index < pattern.length) { - next_match_len = - match(text, text_index, pattern, pattern_index, captures, capture_index + (pat.non_capturing ? 0 : 1)); - if (next_match_len >= 0) { - capture_len = 0; - goto success; - } - } - - while (count < pat.max) { - int64_t match_len = match_pat(&text_state, text_index, pat); - if (match_len < 0) break; - capture_len += match_len; - text_index += match_len; - count += 1; - - if (pattern_index < pattern.length) { // More stuff after this - if (count < pat.min) next_match_len = -1; - else - next_match_len = match(text, text_index, pattern, pattern_index, captures, - capture_index + (pat.non_capturing ? 0 : 1)); - } else { - next_match_len = 0; - } - - if (match_len == 0) { - if (next_match_len >= 0) { - // If we're good to go, no need to keep re-matching zero-length - // matches till we hit max: - count = pat.max; - break; - } else { - return -1; - } - } - - if (pattern_index < pattern.length && next_match_len >= 0) break; // Next guy exists and wants to stop here - - if (text_index >= text.length) break; - } - - if (count < pat.min || next_match_len < 0) return -1; - -success: - if (captures && capture_index < MAX_BACKREFS && !pat.non_capturing) { - if (pat.tag == PAT_PAIR || pat.tag == PAT_QUOTE) { - assert(capture_len > 0); - captures[capture_index] = (capture_t){ - .index = capture_start + 1, // Skip leading quote/paren - .length = capture_len - 2, // Skip open/close - .occupied = true, - .recursive = (pat.tag == PAT_PAIR), - }; - } else { - captures[capture_index] = (capture_t){ - .index = capture_start, - .length = capture_len, - .occupied = true, - .recursive = false, - }; - } - } - return (text_index - start_index) + next_match_len; -} - -#undef EAT1 -#undef EAT2 -#undef EAT_MANY - -static int64_t _find(Text_t text, Text_t pattern, int64_t first, int64_t last, int64_t *match_length, - capture_t *captures) { - int32_t first_grapheme = Text$get_grapheme(pattern, 0); - bool find_first = (first_grapheme != '{' && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK) - && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); - - TextIter_t text_state = NEW_TEXT_ITER_STATE(text); - for (int64_t i = first; i <= last; i++) { - // Optimization: quickly skip ahead to first char in pattern: - if (find_first) { - while (i < text.length && Text$get_grapheme_fast(&text_state, i) != first_grapheme) - ++i; - } - - int64_t m = match(text, i, pattern, 0, captures, 0); - if (m >= 0) { - if (match_length) *match_length = m; - return i; - } - } - if (match_length) *match_length = -1; - return -1; -} - -static OptionalPatternMatch find(Text_t text, Text_t pattern, Int_t from_index) { - int64_t first = Int64$from_int(from_index, false); - if (first == 0) fail("Invalid index: 0"); - if (first < 0) first = text.length + first + 1; - if (first > text.length || first < 1) return NONE_MATCH; - - capture_t captures[MAX_BACKREFS] = {}; - int64_t len = 0; - int64_t found = _find(text, pattern, first - 1, text.length - 1, &len, captures); - if (found == -1) return NONE_MATCH; - - List_t capture_list = {}; - for (int i = 0; captures[i].occupied; i++) { - Text_t capture = Text$slice(text, I(captures[i].index + 1), I(captures[i].index + captures[i].length)); - List$insert(&capture_list, &capture, I(0), sizeof(Text_t)); - } - return (OptionalPatternMatch){ - .text = Text$slice(text, I(found + 1), I(found + len)), - .index = I(found + 1), - .captures = capture_list, - }; -} - -PUREFUNC static bool Pattern$has(Text_t text, Text_t pattern) { - if (Text$starts_with(pattern, Text("{start}"), &pattern)) { - int64_t m = match(text, 0, pattern, 0, NULL, 0); - return m >= 0; - } else if (Text$ends_with(text, Text("{end}"), NULL)) { - for (int64_t i = text.length - 1; i >= 0; i--) { - int64_t match_len = match(text, i, pattern, 0, NULL, 0); - if (match_len >= 0 && i + match_len == text.length) return true; - } - return false; - } else { - int64_t found = _find(text, pattern, 0, text.length - 1, NULL, NULL); - return (found >= 0); - } -} - -static bool Pattern$matches(Text_t text, Text_t pattern) { - capture_t captures[MAX_BACKREFS] = {}; - int64_t match_len = match(text, 0, pattern, 0, NULL, 0); - return (match_len == text.length); -} - -static bool Pattern$match_at(Text_t text, Text_t pattern, Int_t pos, PatternMatch *dest) { - int64_t start = Int64$from_int(pos, false) - 1; - capture_t captures[MAX_BACKREFS] = {}; - int64_t match_len = match(text, start, pattern, 0, captures, 0); - if (match_len < 0) return false; - - List_t capture_list = {}; - for (int i = 0; captures[i].occupied; i++) { - Text_t capture = Text$slice(text, I(captures[i].index + 1), I(captures[i].index + captures[i].length)); - List$insert(&capture_list, &capture, I(0), sizeof(Text_t)); - } - dest->text = Text$slice(text, I(start + 1), I(start + match_len)); - dest->index = I(start + 1); - dest->captures = capture_list; - return true; -} - -static OptionalList_t Pattern$captures(Text_t text, Text_t pattern) { - capture_t captures[MAX_BACKREFS] = {}; - int64_t match_len = match(text, 0, pattern, 0, captures, 0); - if (match_len != text.length) return NONE_LIST; - - List_t capture_list = {}; - for (int i = 0; captures[i].occupied; i++) { - Text_t capture = Text$slice(text, I(captures[i].index + 1), I(captures[i].index + captures[i].length)); - List$insert(&capture_list, &capture, I(0), sizeof(Text_t)); - } - return capture_list; -} - -static List_t Pattern$find_all(Text_t text, Text_t pattern) { - if (pattern.length == 0) // special case - return (List_t){.length = 0}; - - List_t matches = {}; - for (int64_t i = 1;;) { - OptionalPatternMatch m = find(text, pattern, I(i)); - if (m.is_none) break; - i = Int64$from_int(m.index, false) + m.text.length; - List$insert(&matches, &m, I_small(0), sizeof(PatternMatch)); - } - return matches; -} - -typedef struct { - TextIter_t state; - Int_t i; - Text_t pattern; -} match_iter_state_t; - -static OptionalPatternMatch next_match(match_iter_state_t *state) { - if (Int64$from_int(state->i, false) > state->state.stack[0].text.length) return NONE_MATCH; - - OptionalPatternMatch m = find(state->state.stack[0].text, state->pattern, state->i); - if (m.is_none) // No match - state->i = I(state->state.stack[0].text.length + 1); - else state->i = Int$plus(m.index, I(MAX(1, m.text.length))); - return m; -} - -static Closure_t Pattern$by_match(Text_t text, Text_t pattern) { - return (Closure_t){ - .fn = (void *)next_match, - .userdata = new (match_iter_state_t, .state = NEW_TEXT_ITER_STATE(text), .i = I_small(1), .pattern = pattern), - }; -} - -static Text_t apply_backrefs(Text_t text, List_t recursive_replacements, Text_t replacement, Text_t backref_pat, - capture_t *captures) { - if (backref_pat.length == 0) return replacement; - - int32_t first_grapheme = Text$get_grapheme(backref_pat, 0); - bool find_first = (first_grapheme != '{' && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK) - && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); - - Text_t ret = Text(""); - TextIter_t replacement_state = NEW_TEXT_ITER_STATE(replacement); - int64_t nonmatching_pos = 0; - for (int64_t pos = 0; pos < replacement.length;) { - // Optimization: quickly skip ahead to first char in the backref pattern: - if (find_first) { - while (pos < replacement.length && Text$get_grapheme_fast(&replacement_state, pos) != first_grapheme) - ++pos; - } - - int64_t backref_len = match(replacement, pos, backref_pat, 0, NULL, 0); - if (backref_len < 0) { - pos += 1; - continue; - } - - int64_t after_backref = pos + backref_len; - int64_t backref = parse_int(&replacement_state, &after_backref); - if (after_backref == pos + backref_len) { // Not actually a backref if there's no number - pos += 1; - continue; - } - if (backref < 0 || backref > 9) - fail("Invalid backref index: ", backref, " (only 0-", MAX_BACKREFS - 1, " are allowed)"); - backref_len = (after_backref - pos); - - if (Text$get_grapheme_fast(&replacement_state, pos + backref_len) == ';') - backref_len += 1; // skip optional semicolon - - if (!captures[backref].occupied) fail("There is no capture number ", backref, "!"); - - Text_t backref_text = - Text$slice(text, I(captures[backref].index + 1), I(captures[backref].index + captures[backref].length)); - - if (captures[backref].recursive && recursive_replacements.length > 0) - backref_text = replace_list(backref_text, recursive_replacements, backref_pat, true); - - if (pos > nonmatching_pos) { - Text_t before_slice = Text$slice(replacement, I(nonmatching_pos + 1), I(pos)); - ret = Text$concat(ret, before_slice, backref_text); - } else { - ret = Text$concat(ret, backref_text); - } - - pos += backref_len; - nonmatching_pos = pos; - } - if (nonmatching_pos < replacement.length) { - Text_t last_slice = Text$slice(replacement, I(nonmatching_pos + 1), I(replacement.length)); - ret = Text$concat(ret, last_slice); - } - return ret; -} - -static Text_t Pattern$replace(Text_t text, Text_t pattern, Text_t replacement, Text_t backref_pat, bool recursive) { - Text_t ret = EMPTY_TEXT; - - int32_t first_grapheme = Text$get_grapheme(pattern, 0); - bool find_first = (first_grapheme != '{' && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK) - && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); - - Text_t entries[2] = {pattern, replacement}; - List_t replacements = { - .data = entries, - .length = 1, - .stride = sizeof(entries), - }; - - TextIter_t text_state = NEW_TEXT_ITER_STATE(text); - int64_t nonmatching_pos = 0; - for (int64_t pos = 0; pos < text.length;) { - // Optimization: quickly skip ahead to first char in pattern: - if (find_first) { - while (pos < text.length && Text$get_grapheme_fast(&text_state, pos) != first_grapheme) - ++pos; - } - - capture_t captures[MAX_BACKREFS] = {}; - int64_t match_len = match(text, pos, pattern, 0, captures, 1); - if (match_len < 0) { - pos += 1; - continue; - } - captures[0] = (capture_t){ - .index = pos, - .length = match_len, - .occupied = true, - .recursive = false, - }; - - Text_t replacement_text = - apply_backrefs(text, recursive ? replacements : (List_t){}, replacement, backref_pat, captures); - if (pos > nonmatching_pos) { - Text_t before_slice = Text$slice(text, I(nonmatching_pos + 1), I(pos)); - ret = Text$concat(ret, before_slice, replacement_text); - } else { - ret = Text$concat(ret, replacement_text); - } - nonmatching_pos = pos + match_len; - pos += MAX(match_len, 1); - } - if (nonmatching_pos < text.length) { - Text_t last_slice = Text$slice(text, I(nonmatching_pos + 1), I(text.length)); - ret = Text$concat(ret, last_slice); - } - return ret; -} - -static Text_t Pattern$trim(Text_t text, Text_t pattern, bool trim_left, bool trim_right) { - int64_t first = 0, last = text.length - 1; - if (trim_left) { - int64_t match_len = match(text, 0, pattern, 0, NULL, 0); - if (match_len > 0) first = match_len; - } - - if (trim_right) { - for (int64_t i = text.length - 1; i >= first; i--) { - int64_t match_len = match(text, i, pattern, 0, NULL, 0); - if (match_len > 0 && i + match_len == text.length) last = i - 1; - } - } - return Text$slice(text, I(first + 1), I(last + 1)); -} - -static Text_t Pattern$map(Text_t text, Text_t pattern, Closure_t fn, bool recursive) { - Text_t ret = EMPTY_TEXT; - - int32_t first_grapheme = Text$get_grapheme(pattern, 0); - bool find_first = (first_grapheme != '{' && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK) - && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); - - TextIter_t text_state = NEW_TEXT_ITER_STATE(text); - int64_t nonmatching_pos = 0; - - Text_t (*text_mapper)(PatternMatch, void *) = fn.fn; - for (int64_t pos = 0; pos < text.length; pos++) { - // Optimization: quickly skip ahead to first char in pattern: - if (find_first) { - while (pos < text.length && Text$get_grapheme_fast(&text_state, pos) != first_grapheme) - ++pos; - } - - capture_t captures[MAX_BACKREFS] = {}; - int64_t match_len = match(text, pos, pattern, 0, captures, 0); - if (match_len < 0) continue; - - PatternMatch m = { - .text = Text$slice(text, I(pos + 1), I(pos + match_len)), - .index = I(pos + 1), - .captures = {}, - }; - for (int i = 0; captures[i].occupied; i++) { - Text_t capture = Text$slice(text, I(captures[i].index + 1), I(captures[i].index + captures[i].length)); - if (recursive) capture = Pattern$map(capture, pattern, fn, recursive); - List$insert(&m.captures, &capture, I(0), sizeof(Text_t)); - } - - Text_t replacement = text_mapper(m, fn.userdata); - if (pos > nonmatching_pos) { - Text_t before_slice = Text$slice(text, I(nonmatching_pos + 1), I(pos)); - ret = Text$concat(ret, before_slice, replacement); - } else { - ret = Text$concat(ret, replacement); - } - nonmatching_pos = pos + match_len; - pos += (match_len - 1); - } - if (nonmatching_pos < text.length) { - Text_t last_slice = Text$slice(text, I(nonmatching_pos + 1), I(text.length)); - ret = Text$concat(ret, last_slice); - } - return ret; -} - -static void Pattern$each(Text_t text, Text_t pattern, Closure_t fn, bool recursive) { - int32_t first_grapheme = Text$get_grapheme(pattern, 0); - bool find_first = (first_grapheme != '{' && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK) - && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); - - TextIter_t text_state = NEW_TEXT_ITER_STATE(text); - void (*action)(PatternMatch, void *) = fn.fn; - for (int64_t pos = 0; pos < text.length; pos++) { - // Optimization: quickly skip ahead to first char in pattern: - if (find_first) { - while (pos < text.length && Text$get_grapheme_fast(&text_state, pos) != first_grapheme) - ++pos; - } - - capture_t captures[MAX_BACKREFS] = {}; - int64_t match_len = match(text, pos, pattern, 0, captures, 0); - if (match_len < 0) continue; - - PatternMatch m = { - .text = Text$slice(text, I(pos + 1), I(pos + match_len)), - .index = I(pos + 1), - .captures = {}, - }; - for (int i = 0; captures[i].occupied; i++) { - Text_t capture = Text$slice(text, I(captures[i].index + 1), I(captures[i].index + captures[i].length)); - if (recursive) Pattern$each(capture, pattern, fn, recursive); - List$insert(&m.captures, &capture, I(0), sizeof(Text_t)); - } - - action(m, fn.userdata); - pos += (match_len - 1); - } -} - -Text_t replace_list(Text_t text, List_t replacements, Text_t backref_pat, bool recursive) { - if (replacements.length == 0) return text; - - Text_t ret = EMPTY_TEXT; - - int64_t nonmatch_pos = 0; - for (int64_t pos = 0; pos < text.length;) { - // Find the first matching pattern at this position: - for (int64_t i = 0; i < replacements.length; i++) { - Text_t pattern = *(Text_t *)(replacements.data + i * replacements.stride); - capture_t captures[MAX_BACKREFS] = {}; - int64_t len = match(text, pos, pattern, 0, captures, 1); - if (len < 0) continue; - captures[0].index = pos; - captures[0].length = len; - - // If we skipped over some non-matching text before finding a match, insert it here: - if (pos > nonmatch_pos) { - Text_t before_slice = Text$slice(text, I(nonmatch_pos + 1), I(pos)); - ret = Text$concat(ret, before_slice); - } - - // Concatenate the replacement: - Text_t replacement = *(Text_t *)(replacements.data + i * replacements.stride + sizeof(Text_t)); - Text_t replacement_text = - apply_backrefs(text, recursive ? replacements : (List_t){}, replacement, backref_pat, captures); - ret = Text$concat(ret, replacement_text); - pos += MAX(len, 1); - nonmatch_pos = pos; - goto next_pos; - } - - pos += 1; - next_pos: - continue; - } - - if (nonmatch_pos <= text.length) { - Text_t last_slice = Text$slice(text, I(nonmatch_pos + 1), I(text.length)); - ret = Text$concat(ret, last_slice); - } - return ret; -} - -static Text_t Pattern$replace_all(Text_t text, Table_t replacements, Text_t backref_pat, bool recursive) { - return replace_list(text, replacements.entries, backref_pat, recursive); -} - -static List_t Pattern$split(Text_t text, Text_t pattern) { - if (text.length == 0) // special case - return (List_t){.length = 0}; - - if (pattern.length == 0) // special case - return Text$clusters(text); - - List_t chunks = {}; - - int64_t i = 0; - for (;;) { - int64_t len = 0; - int64_t found = _find(text, pattern, i, text.length - 1, &len, NULL); - if (found == i && len == 0) found = _find(text, pattern, i + 1, text.length - 1, &len, NULL); - if (found < 0) break; - Text_t chunk = Text$slice(text, I(i + 1), I(found)); - List$insert(&chunks, &chunk, I_small(0), sizeof(Text_t)); - i = MAX(found + len, i + 1); - } - - Text_t last_chunk = Text$slice(text, I(i + 1), I(text.length)); - List$insert(&chunks, &last_chunk, I_small(0), sizeof(Text_t)); - - return chunks; -} - -typedef struct { - TextIter_t state; - int64_t i; - Text_t pattern; -} split_iter_state_t; - -static OptionalText_t next_split(split_iter_state_t *state) { - Text_t text = state->state.stack[0].text; - if (state->i >= text.length) { - if (state->pattern.length > 0 && state->i == text.length) { // special case - state->i = text.length + 1; - return EMPTY_TEXT; - } - return NONE_TEXT; - } - - if (state->pattern.length == 0) { // special case - Text_t ret = Text$cluster(text, I(state->i + 1)); - state->i += 1; - return ret; - } - - int64_t start = state->i; - int64_t len = 0; - int64_t found = _find(text, state->pattern, start, text.length - 1, &len, NULL); - - if (found == start && len == 0) found = _find(text, state->pattern, start + 1, text.length - 1, &len, NULL); - - if (found >= 0) { - state->i = MAX(found + len, state->i + 1); - return Text$slice(text, I(start + 1), I(found)); - } else { - state->i = state->state.stack[0].text.length + 1; - return Text$slice(text, I(start + 1), I(text.length)); - } -} - -static Closure_t Pattern$by_split(Text_t text, Text_t pattern) { - return (Closure_t){ - .fn = (void *)next_split, - .userdata = new (split_iter_state_t, .state = NEW_TEXT_ITER_STATE(text), .i = 0, .pattern = pattern), - }; -} - -static Text_t Pattern$escape_text(Text_t text) { - // TODO: optimize for spans of non-escaped text - Text_t ret = EMPTY_TEXT; - TextIter_t state = NEW_TEXT_ITER_STATE(text); - for (int64_t i = 0; i < text.length; i++) { - uint32_t g = Text$get_main_grapheme_fast(&state, i); - if (g == '{') { - ret = Text$concat(ret, Text("{1{}")); - } else if (g == '?' || uc_is_property_quotation_mark(g) - || (uc_is_property_paired_punctuation(g) && uc_is_property_left_of_pair(g))) { - ret = Text$concat(ret, Text("{1"), Text$slice(text, I(i + 1), I(i + 1)), Text("}")); - } else { - ret = Text$concat(ret, Text$slice(text, I(i + 1), I(i + 1))); - } - } - return ret; -} - -static Text_t Pattern$as_text(const void *obj, bool colorize, const TypeInfo_t *info) { - (void)info; - if (!obj) return Text("Pattern"); - - Text_t pat = *(Text_t *)obj; - Text_t quote = Pattern$has(pat, Text("/")) && !Pattern$has(pat, Text("|")) ? Text("|") : Text("/"); - return Text$concat(colorize ? Text("\x1b[1m$\033[m") : Text("$"), Text$quoted(pat, colorize, quote)); -} diff --git a/lib/patterns/patterns.tm b/lib/patterns/patterns.tm deleted file mode 100644 index c5444b86..00000000 --- a/lib/patterns/patterns.tm +++ /dev/null @@ -1,66 +0,0 @@ -use ./patterns.c - -struct PatternMatch(text:Text, index:Int, captures:[Text]) - -lang Pat - convert(text:Text -> Pat) - return C_code:Pat(Pattern$escape_text(@text)) - - convert(n:Int -> Pat) - return Pat.from_text("$n") - -extend Text - func matching_pattern(text:Text, pattern:Pat, pos:Int = 1 -> PatternMatch?) - result : PatternMatch - if C_code:Bool(Pattern$match_at(@text, @pattern, @pos, (void*)&@result)) - return result - return none - - func matches_pattern(text:Text, pattern:Pat -> Bool) - return C_code:Bool(Pattern$matches(@text, @pattern)) - - func pattern_captures(text:Text, pattern:Pat -> [Text]?) - return C_code:[Text]?(Pattern$captures(@text, @pattern)) - - func replace_pattern(text:Text, pattern:Pat, replacement:Text, backref="@", recursive=yes -> Text) - return C_code:Text(Pattern$replace(@text, @pattern, @replacement, @backref, @recursive)) - - func translate_patterns(text:Text, replacements:{Pat=Text}, backref="@", recursive=yes -> Text) - return C_code:Text(Pattern$replace_all(@text, @replacements, @backref, @recursive)) - - func has_pattern(text:Text, pattern:Pat -> Bool) - return C_code:Bool(Pattern$has(@text, @pattern)) - - func find_patterns(text:Text, pattern:Pat -> [PatternMatch]) - return C_code:[PatternMatch](Pattern$find_all(@text, @pattern)) - - func by_pattern(text:Text, pattern:Pat -> func(->PatternMatch?)) - return C_code:func(->PatternMatch?)(Pattern$by_match(@text, @pattern)) - - func each_pattern(text:Text, pattern:Pat, fn:func(m:PatternMatch), recursive=yes) - C_code { Pattern$each(@text, @pattern, @fn, @recursive); } - - func map_pattern(text:Text, pattern:Pat, fn:func(m:PatternMatch -> Text), recursive=yes -> Text) - return C_code:Text(Pattern$map(@text, @pattern, @fn, @recursive)) - - func split_pattern(text:Text, pattern:Pat -> [Text]) - return C_code:[Text](Pattern$split(@text, @pattern)) - - func by_pattern_split(text:Text, pattern:Pat -> func(->Text?)) - return C_code:func(->Text?)(Pattern$by_split(@text, @pattern)) - - func trim_pattern(text:Text, pattern=$Pat"{space}", left=yes, right=yes -> Text) - return C_code:Text(Pattern$trim(@text, @pattern, @left, @right)) - -func main() - >> "Hello world".matching_pattern($Pat'{id}') - >> "...Hello world".matching_pattern($Pat'{id}') -# func main(pattern:Pat, input=(/dev/stdin)) -# for line in input.by_line()! -# skip if not line.has_pattern(pattern) -# pos := 1 -# for match in line.by_pattern(pattern) -# say(line.slice(pos, match.index-1), newline=no) -# say("\033[34;1m$(match.text)\033[m", newline=no) -# pos = match.index + match.text.length -# say(line.from(pos), newline=yes) diff --git a/lib/pthreads/CHANGES.md b/lib/pthreads/CHANGES.md deleted file mode 100644 index 42ae752c..00000000 --- a/lib/pthreads/CHANGES.md +++ /dev/null @@ -1,5 +0,0 @@ -# Version History - -## v1.0 - -Initial version diff --git a/lib/pthreads/pthreads.tm b/lib/pthreads/pthreads.tm deleted file mode 100644 index 8fca6bd6..00000000 --- a/lib/pthreads/pthreads.tm +++ /dev/null @@ -1,117 +0,0 @@ -# A Posix Threads (pthreads) wrapper -use <pthread.h> - -struct pthread_mutex_t(; extern, opaque) - func new(->@pthread_mutex_t) - return C_code : @pthread_mutex_t( - pthread_mutex_t *mutex = GC_MALLOC(sizeof(pthread_mutex_t)); - pthread_mutex_init(mutex, NULL); - GC_register_finalizer(mutex, (void*)pthread_mutex_destroy, NULL, NULL, NULL); - mutex - ) - - func lock(m:&pthread_mutex_t) - fail("Failed to lock mutex") unless C_code:Int32(pthread_mutex_lock(@m)) == 0 - - func unlock(m:&pthread_mutex_t) - fail("Failed to unlock mutex") unless C_code:Int32(pthread_mutex_unlock(@m)) == 0 - -struct pthread_cond_t(; extern, opaque) - func new(->@pthread_cond_t) - return C_code : @pthread_cond_t( - pthread_cond_t *cond = GC_MALLOC(sizeof(pthread_cond_t)); - pthread_cond_init(cond, NULL); - GC_register_finalizer(cond, (void*)pthread_cond_destroy, NULL, NULL, NULL); - cond - ) - - func wait(cond:&pthread_cond_t, mutex:&pthread_mutex_t) - fail("Failed to wait on condition") unless C_code:Int32(pthread_cond_wait(@cond, @mutex)) == 0 - - func signal(cond:&pthread_cond_t) - fail("Failed to signal pthread_cond_t") unless C_code:Int32(pthread_cond_signal(@cond)) == 0 - - func broadcast(cond:&pthread_cond_t) - fail("Failed to broadcast pthread_cond_t") unless C_code:Int32(pthread_cond_broadcast(@cond)) == 0 - -struct pthread_rwlock_t(; extern, opaque) - func new(->@pthread_rwlock_t) - return C_code : @pthread_rwlock_t ( - pthread_rwlock_t *lock = GC_MALLOC(sizeof(pthread_rwlock_t)); - pthread_rwlock_init(lock, NULL); - GC_register_finalizer(lock, (void*)pthread_rwlock_destroy, NULL, NULL, NULL); - lock - ) - - func read_lock(lock:&pthread_rwlock_t) - C_code { pthread_rwlock_rdlock(@lock); } - - func write_lock(lock:&pthread_rwlock_t) - C_code { pthread_rwlock_wrlock(@lock); } - - func unlock(lock:&pthread_rwlock_t) - C_code { pthread_rwlock_unlock(@lock); } - -struct pthread_t(; extern, opaque) - func new(fn:func() -> @pthread_t) - return C_code:@pthread_t( - pthread_t *thread = GC_MALLOC(sizeof(pthread_t)); - pthread_create(thread, NULL, @fn.fn, @fn.userdata); - thread - ) - - func join(p:pthread_t) C_code { pthread_join(@p, NULL); } - func cancel(p:pthread_t) C_code { pthread_cancel(@p); } - func detatch(p:pthread_t) C_code { pthread_detach(@p); } - -struct IntQueue(_queue:@[Int], _mutex:@pthread_mutex_t, _cond:@pthread_cond_t) - func new(initial:[Int]=[] -> IntQueue) - return IntQueue(@initial, pthread_mutex_t.new(), pthread_cond_t.new()) - - func give(q:IntQueue, n:Int) - do q._mutex.lock() - q._queue.insert(n) - q._mutex.unlock() - q._cond.signal() - - func take(q:IntQueue -> Int) - do q._mutex.lock() - n := q._queue.pop(1) - while not n - q._cond.wait(q._mutex) - n = q._queue.pop(1) - q._mutex.unlock() - return n! - -func main() - jobs := IntQueue.new() - results := IntQueue.new() - - say_mutex := pthread_mutex_t.new() - announce := func(speaker:Text, text:Text) - do say_mutex.lock() - say("\[2][$speaker]\[] $text") - say_mutex.unlock() - - worker := pthread_t.new(func() - say("I'm in the thread!") - repeat - announce("worker", "waiting for job") - job := jobs.take() - result := job * 10 - announce("worker", "Jobbing $job into $result") - results.give(result) - announce("worker", "Signaled $result") - ) - - for i in 10 - announce("boss", "Pushing job $i") - jobs.give(i) - announce("boss", "Gave job $i") - - for i in 10 - announce("boss", "Getting result...") - result := results.take() - announce("boss", "Got result $result") - - >> worker.cancel() diff --git a/lib/random/CHANGES.md b/lib/random/CHANGES.md deleted file mode 100644 index 42ae752c..00000000 --- a/lib/random/CHANGES.md +++ /dev/null @@ -1,5 +0,0 @@ -# Version History - -## v1.0 - -Initial version diff --git a/lib/random/README.md b/lib/random/README.md deleted file mode 100644 index 183b9d0b..00000000 --- a/lib/random/README.md +++ /dev/null @@ -1,196 +0,0 @@ -# Random Number Generators (RNG) - -This library provides an `RNG` type (Random Number Generator). This type -represents a self-contained piece of data that encapsulates the state of a -relatively fast and relatively secure pseudo-random number generator. The -current implementation is based on the [ChaCha20 stream -cipher,](https://en.wikipedia.org/wiki/Salsa20#ChaCha_variant) inspired by -[`arc4random` in OpenBSD.](https://man.openbsd.org/arc4random.3) - -An `RNG` object can be used for deterministic, repeatable generation of -pseudorandom numbers (for example, to be used in a video game for creating -seeded levels). The default random number generator for Tomo is called `random` -and is, by default, initialized with random data from the operating system when -a Tomo program launches. - -## RNG Functions - -This documentation provides details on RNG functions available in the API. -Lists also have some methods which use RNG values: -`list.shuffle()`, `list.shuffled()`, `list.random()`, and `list.sample()`. - -- [`func bool(rng: RNG, p: Num = 0.5 -> Bool)`](#bool) -- [`func byte(rng: RNG -> Byte)`](#byte) -- [`func bytes(rng: RNG, count: Int -> [Byte])`](#bytes) -- [`func copy(rng: RNG -> RNG)`](#copy) -- [`func int(rng: RNG, min: Int, max: Int -> Int)`](#int`, `int64`, `int32`, `int16`, `int8) -- [`func new(seed: [Byte] = (/dev/urandom).read_bytes(40)! -> RNG)`](#new) -- [`func num(rng: RNG, min: Num = 0.0, max: Num = 1.0 -> Num)`](#num`, `num32) - -------------- - -### `bool` -Generate a random boolean value with a given probability. - -```tomo -func bool(rng: RNG, p: Num = 0.5 -> Bool) -``` - -- `rng`: The random number generator to use. -- `p`: The probability of returning a `yes` value. Values less than zero and - `NaN` values are treated as equal to zero and values larger than zero are - treated as equal to one. - -**Returns:** -`yes` with probability `p` and `no` with probability `1-p`. - -**Example:** -```tomo ->> random.bool() -= no ->> random.bool(1.0) -= yes -``` - ---- - -### `byte` -Generate a random byte with uniform probability. - -```tomo -func byte(rng: RNG -> Byte) -``` - -- `rng`: The random number generator to use. - -**Returns:** -A random byte (0-255). - -**Example:** -```tomo ->> random.byte() -= 103[B] -``` - ---- - -### `bytes` -Generate a list of uniformly random bytes with the given length. - -```tomo -func bytes(rng: RNG, count: Int -> [Byte]) -``` - -- `rng`: The random number generator to use. -- `count`: The number of random bytes to return. - -**Returns:** -A list of length `count` random bytes with uniform random distribution (0-255). - -**Example:** -```tomo ->> random.bytes(4) -= [135[B], 169[B], 103[B], 212[B]] -``` - ---- - -### `copy` -Return a copy of a random number generator. This copy will be a parallel version of -the given RNG with its own internal state. - -```tomo -func copy(rng: RNG -> RNG) -``` - -- `rng`: The random number generator to copy. - -**Returns:** -A copy of the given RNG. - -**Example:** -```tomo ->> rng := RNG.new([]) ->> copy := rng.copy() - ->> rng.bytes(10) -= [224[B], 102[B], 190[B], 59[B], 251[B], 50[B], 217[B], 170[B], 15[B], 221[B]] - -# The copy runs in parallel to the original RNG: ->> copy.bytes(10) -= [224[B], 102[B], 190[B], 59[B], 251[B], 50[B], 217[B], 170[B], 15[B], 221[B]] -``` - ---- - -### `int`, `int64`, `int32`, `int16`, `int8` -Generate a random integer value with the given range. - -```tomo -func int(rng: RNG, min: Int, max: Int -> Int) -func int64(rng: RNG, min: Int64 = Int64.min, max: Int64 = Int64.max -> Int) -func int32(rng: RNG, min: Int32 = Int32.min, max: Int32 = Int32.max -> Int) -func int16(rng: RNG, min: Int16 = Int16.min, max: Int16 = Int16.max -> Int) -func int8(rng: RNG, min: Int8 = Int8.min, max: Int8 = Int8.max -> Int) -``` - -- `rng`: The random number generator to use. -- `min`: The minimum value to be returned. -- `max`: The maximum value to be returned. - -**Returns:** -An integer uniformly chosen from the range `[min, max]` (inclusive). If `min` -is greater than `max`, an error will be raised. - -**Example:** -```tomo ->> random.int(1, 10) -= 8 -``` - ---- - -### `new` -Return a new random number generator. - -```tomo -func new(seed: [Byte] = (/dev/urandom).read_bytes(40)! -> RNG) -``` - -- `seed`: The seed use for the random number generator. A seed length of 40 - bytes is recommended. Seed lengths of less than 40 bytes are padded with - zeroes. - -**Returns:** -A new random number generator. - -**Example:** -```tomo ->> my_rng := RNG.new([1[B], 2[B], 3[B], 4[B]]) ->> my_rng.bool() -= yes -``` - ---- - -### `num`, `num32` -Generate a random floating point value with the given range. - -```tomo -func num(rng: RNG, min: Num = 0.0, max: Num = 1.0 -> Int) -func num32(rng: RNG, min: Num = 0.0_f32, max: Num = 1.0_f32 -> Int) -``` - -- `rng`: The random number generator to use. -- `min`: The minimum value to be returned. -- `max`: The maximum value to be returned. - -**Returns:** -A floating point number uniformly chosen from the range `[min, max]` -(inclusive). If `min` is greater than `max`, an error will be raised. - -**Example:** -```tomo ->> random.num(1, 10) -= 9.512830439975572 -``` diff --git a/lib/random/chacha.h b/lib/random/chacha.h deleted file mode 100644 index 22015819..00000000 --- a/lib/random/chacha.h +++ /dev/null @@ -1,192 +0,0 @@ -// The ChaCha stream cipher used for pseudorandom number generation - -#pragma once -/* -chacha-merged.c version 20080118 -D. J. Bernstein -Public domain. -*/ - -/* $OpenBSD: chacha_private.h,v 1.3 2022/02/28 21:56:29 dtucker Exp $ */ -/* Tomo: chacha.h,v 1.0 2024/11/03 Bruce Hill */ - -typedef unsigned char u8; -typedef unsigned int u32; - -typedef struct { - u32 input[16]; /* could be compressed */ -} chacha_ctx; - -#define KEYSZ 32 -#define IVSZ 8 - -#define U8C(v) (v##U) -#define U32C(v) (v##U) - -#define U8V(v) ((u8)(v) & U8C(0xFF)) -#define U32V(v) ((u32)(v) & U32C(0xFFFFFFFF)) - -#define ROTL32(v, n) (U32V((v) << (n)) | ((v) >> (32 - (n)))) - -#define U8TO32_LITTLE(p) (((u32)((p)[0])) | ((u32)((p)[1]) << 8) | ((u32)((p)[2]) << 16) | ((u32)((p)[3]) << 24)) - -#define U32TO8_LITTLE(p, v) \ - do { \ - (p)[0] = U8V((v)); \ - (p)[1] = U8V((v) >> 8); \ - (p)[2] = U8V((v) >> 16); \ - (p)[3] = U8V((v) >> 24); \ - } while (0) - -#define ROTATE(v, c) (ROTL32(v, c)) -#define XOR(v, w) ((v) ^ (w)) -#define PLUS(v, w) (U32V((v) + (w))) -#define PLUSONE(v) (PLUS((v), 1)) - -#define QUARTERROUND(a, b, c, d) \ - a = PLUS(a, b); \ - d = ROTATE(XOR(d, a), 16); \ - c = PLUS(c, d); \ - b = ROTATE(XOR(b, c), 12); \ - a = PLUS(a, b); \ - d = ROTATE(XOR(d, a), 8); \ - c = PLUS(c, d); \ - b = ROTATE(XOR(b, c), 7); - -static const char sigma[16] = "expand 32-byte k"; - -static void chacha_keysetup(chacha_ctx *chacha, const u8 *k) { - chacha->input[0] = U8TO32_LITTLE(sigma + 0); - chacha->input[1] = U8TO32_LITTLE(sigma + 4); - chacha->input[2] = U8TO32_LITTLE(sigma + 8); - chacha->input[3] = U8TO32_LITTLE(sigma + 12); - chacha->input[4] = U8TO32_LITTLE(k + 0); - chacha->input[5] = U8TO32_LITTLE(k + 4); - chacha->input[6] = U8TO32_LITTLE(k + 8); - chacha->input[7] = U8TO32_LITTLE(k + 12); - chacha->input[8] = U8TO32_LITTLE(k + 16); - chacha->input[9] = U8TO32_LITTLE(k + 20); - chacha->input[10] = U8TO32_LITTLE(k + 24); - chacha->input[11] = U8TO32_LITTLE(k + 28); -} - -static void chacha_ivsetup(chacha_ctx *chacha, const u8 *iv) { - chacha->input[12] = 0; - chacha->input[13] = 0; - chacha->input[14] = U8TO32_LITTLE(iv + 0); - chacha->input[15] = U8TO32_LITTLE(iv + 4); -} - -static void chacha_encrypt_bytes(chacha_ctx *chacha, const u8 *m, u8 *c, u32 bytes) { - u32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; - u32 j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14, j15; - u8 *ctarget = (u8 *)0; - u8 tmp[64]; - unsigned int i; - - if (!bytes) return; - - j0 = chacha->input[0]; - j1 = chacha->input[1]; - j2 = chacha->input[2]; - j3 = chacha->input[3]; - j4 = chacha->input[4]; - j5 = chacha->input[5]; - j6 = chacha->input[6]; - j7 = chacha->input[7]; - j8 = chacha->input[8]; - j9 = chacha->input[9]; - j10 = chacha->input[10]; - j11 = chacha->input[11]; - j12 = chacha->input[12]; - j13 = chacha->input[13]; - j14 = chacha->input[14]; - j15 = chacha->input[15]; - - for (;;) { - if (bytes < 64) { - for (i = 0; i < bytes; ++i) - tmp[i] = m[i]; - m = tmp; - ctarget = c; - c = tmp; - } - x0 = j0; - x1 = j1; - x2 = j2; - x3 = j3; - x4 = j4; - x5 = j5; - x6 = j6; - x7 = j7; - x8 = j8; - x9 = j9; - x10 = j10; - x11 = j11; - x12 = j12; - x13 = j13; - x14 = j14; - x15 = j15; - for (i = 20; i > 0; i -= 2) { - QUARTERROUND(x0, x4, x8, x12) - QUARTERROUND(x1, x5, x9, x13) - QUARTERROUND(x2, x6, x10, x14) - QUARTERROUND(x3, x7, x11, x15) - QUARTERROUND(x0, x5, x10, x15) - QUARTERROUND(x1, x6, x11, x12) - QUARTERROUND(x2, x7, x8, x13) - QUARTERROUND(x3, x4, x9, x14) - } - x0 = PLUS(x0, j0); - x1 = PLUS(x1, j1); - x2 = PLUS(x2, j2); - x3 = PLUS(x3, j3); - x4 = PLUS(x4, j4); - x5 = PLUS(x5, j5); - x6 = PLUS(x6, j6); - x7 = PLUS(x7, j7); - x8 = PLUS(x8, j8); - x9 = PLUS(x9, j9); - x10 = PLUS(x10, j10); - x11 = PLUS(x11, j11); - x12 = PLUS(x12, j12); - x13 = PLUS(x13, j13); - x14 = PLUS(x14, j14); - x15 = PLUS(x15, j15); - - j12 = PLUSONE(j12); - if (!j12) { - j13 = PLUSONE(j13); - /* stopping at 2^70 bytes per nonce is user's responsibility */ - } - - U32TO8_LITTLE(c + 0, x0); - U32TO8_LITTLE(c + 4, x1); - U32TO8_LITTLE(c + 8, x2); - U32TO8_LITTLE(c + 12, x3); - U32TO8_LITTLE(c + 16, x4); - U32TO8_LITTLE(c + 20, x5); - U32TO8_LITTLE(c + 24, x6); - U32TO8_LITTLE(c + 28, x7); - U32TO8_LITTLE(c + 32, x8); - U32TO8_LITTLE(c + 36, x9); - U32TO8_LITTLE(c + 40, x10); - U32TO8_LITTLE(c + 44, x11); - U32TO8_LITTLE(c + 48, x12); - U32TO8_LITTLE(c + 52, x13); - U32TO8_LITTLE(c + 56, x14); - U32TO8_LITTLE(c + 60, x15); - - if (bytes <= 64) { - if (bytes < 64) { - for (i = 0; i < bytes; ++i) - ctarget[i] = c[i]; - } - chacha->input[12] = j12; - chacha->input[13] = j13; - return; - } - bytes -= 64; - c += 64; - } -} diff --git a/lib/random/random.tm b/lib/random/random.tm deleted file mode 100644 index 107fad66..00000000 --- a/lib/random/random.tm +++ /dev/null @@ -1,234 +0,0 @@ -# Random Number Generator (RNG) implementation based on ChaCha - -use <assert.h> -use ./sysrandom.h -use ./chacha.h - -struct chacha_ctx(j0,j1,j2,j3,j4,j5,j6,j7,j8,j9,j10,j11,j12,j13,j14,j15:Int32; extern, secret) - func from_seed(seed:[Byte]=[] -> chacha_ctx) - return C_code:chacha_ctx( - chacha_ctx ctx; - uint8_t seed_bytes[KEYSZ + IVSZ] = {}; - for (int64_t i = 0; i < (int64_t)sizeof(seed_bytes); i++) - seed_bytes[i] = i < @seed.length ? *(uint8_t*)(@seed.data + i*@seed.stride) : 0; - chacha_keysetup(&ctx, seed_bytes); - chacha_ivsetup(&ctx, seed_bytes + KEYSZ); - ctx - ) - -random := RandomNumberGenerator.new() - -func _os_random_bytes(count:Int64 -> [Byte]) - return C_code:[Byte]( - uint8_t *random_bytes = GC_MALLOC_ATOMIC(@count); - assert(getrandom(random_bytes, (size_t)@count, 0) == (size_t)@count); - (List_t){.length=@count, .data=random_bytes, .stride=1, .atomic=1} - ) -struct RandomNumberGenerator(_chacha:chacha_ctx, _random_bytes:[Byte]=[]; secret) - func new(seed:[Byte]?=none, -> @RandomNumberGenerator) - ctx := chacha_ctx.from_seed(seed or _os_random_bytes(40)) - return @RandomNumberGenerator(ctx, []) - - func _rekey(rng:&RandomNumberGenerator) - rng._random_bytes = C_code:[Byte]( - Byte_t new_keystream[KEYSZ + IVSZ] = {}; - // Fill the buffer with the keystream - chacha_encrypt_bytes(&@rng->_chacha, new_keystream, new_keystream, sizeof(new_keystream)); - // Immediately reinitialize for backtracking resistance - chacha_keysetup(&@rng->_chacha, new_keystream); - chacha_ivsetup(&@rng->_chacha, new_keystream + KEYSZ); - List_t new_bytes = (List_t){.data=GC_MALLOC_ATOMIC(1024), .length=1024, .stride=1, .atomic=1}; - memset(new_bytes.data, 0, new_bytes.length); - chacha_encrypt_bytes(&@rng->_chacha, new_bytes.data, new_bytes.data, new_bytes.length); - new_bytes - ) - - func _fill_bytes(rng:&RandomNumberGenerator, dest:&Memory, needed:Int64) - C_code { - while (@needed > 0) { - if (@rng->_random_bytes.length == 0) - @(rng._rekey()); - - assert(@rng->_random_bytes.stride == 1); - - int64_t batch_size = MIN(@needed, @rng->_random_bytes.length); - uint8_t *batch_src = @rng->_random_bytes.data; - memcpy(@dest, batch_src, batch_size); - memset(batch_src, 0, batch_size); - @rng->_random_bytes.data += batch_size; - @rng->_random_bytes.length -= batch_size; - @dest += batch_size; - @needed -= batch_size; - } - } - - func bytes(rng:&RandomNumberGenerator, count:Int -> [Byte]) - count64 := Int64(count) - buf := C_code:@Memory(GC_MALLOC_ATOMIC(@count64)) - rng._fill_bytes(buf, count64) - return C_code:[Byte]((List_t){.data=@buf, .stride=1, .atomic=1, .length=@count64}) - - func byte(rng:&RandomNumberGenerator -> Byte) - byte : &Byte - rng._fill_bytes(byte, 1) - return byte[] - - func bool(rng:&RandomNumberGenerator, probability=0.5 -> Bool) - if probability == 0.5 - return rng.byte() < 0x80 - else - return rng.num(0., 1.) < 0.5 - - func int64(rng:&RandomNumberGenerator, min=Int64.min, max=Int64.max -> Int64) - fail("Random minimum value $min is larger than the maximum value $max") if min > max - return min if min == max - random_int64 : &Int64 - rng._fill_bytes(random_int64, 8) - if min == Int64.min and max == Int64.max - return random_int64 - - return C_code:Int64( - uint64_t range = (uint64_t)@max - (uint64_t)@min + 1; - uint64_t min_r = -range % range; - uint64_t r; - @random_int64 = (int64_t*)&r; - for (;;) { - @(rng._fill_bytes(random_int64, 8)); - if (r >= min_r) break; - } - (int64_t)((uint64_t)@min + (r % range)) - ) - - func int32(rng:&RandomNumberGenerator, min=Int32.min, max=Int32.max -> Int32) - fail("Random minimum value $min is larger than the maximum value $max") if min > max - return min if min == max - random_int32 : &Int32 - rng._fill_bytes(random_int32, 8) - if min == Int32.min and max == Int32.max - return random_int32 - - return C_code:Int32( - uint32_t range = (uint32_t)@max - (uint32_t)@min + 1; - uint32_t min_r = -range % range; - uint32_t r; - @random_int32 = (int32_t*)&r; - for (;;) { - @(rng._fill_bytes(random_int32, 8)); - if (r >= min_r) break; - } - (int32_t)((uint32_t)@min + (r % range)) - ) - - func int16(rng:&RandomNumberGenerator, min=Int16.min, max=Int16.max -> Int16) - fail("Random minimum value $min is larger than the maximum value $max") if min > max - return min if min == max - random_int16 : &Int16 - rng._fill_bytes(random_int16, 8) - if min == Int16.min and max == Int16.max - return random_int16 - - return C_code:Int16( - uint16_t range = (uint16_t)@max - (uint16_t)@min + 1; - uint16_t min_r = -range % range; - uint16_t r; - @random_int16 = (int16_t*)&r; - for (;;) { - @(rng._fill_bytes(random_int16, 8)); - if (r >= min_r) break; - } - (int16_t)((uint16_t)@min + (r % range)) - ) - - func int8(rng:&RandomNumberGenerator, min=Int8.min, max=Int8.max -> Int8) - fail("Random minimum value $min is larger than the maximum value $max") if min > max - return min if min == max - random_int8 : &Int8 - rng._fill_bytes(random_int8, 8) - if min == Int8.min and max == Int8.max - return random_int8 - - return C_code:Int8( - uint8_t range = (uint8_t)@max - (uint8_t)@min + 1; - uint8_t min_r = -range % range; - uint8_t r; - @random_int8 = (int8_t*)&r; - for (;;) { - @(rng._fill_bytes(random_int8, 8)); - if (r >= min_r) break; - } - (int8_t)((uint8_t)@min + (r % range)) - ) - - func num(rng:&RandomNumberGenerator, min=0., max=1. -> Num) - num_buf : &Num - return C_code:Num( - if (@min > @max) fail("Random minimum value (", @min, ") is larger than the maximum value (", @max, ")"); - if (@min == @max) return @min; - - union { - Num_t num; - uint64_t bits; - } r = {.bits=0}, one = {.num=1.0}; - @num_buf = &r.num; - @(rng._fill_bytes(num_buf, 8)); - - // Set r.num to 1.<random-bits> - r.bits &= ~(0xFFFULL << 52); - r.bits |= (one.bits & (0xFFFULL << 52)); - - r.num -= 1.0; - - (@min == 0.0 && @max == 1.0) ? r.num : ((1.0-r.num)*@min + r.num*@max) - ) - - func num32(rng:&RandomNumberGenerator, min=Num32(0.), max=Num32(1.) -> Num32) - return Num32(rng.num(Num(min), Num(max))) - - func int(rng:&RandomNumberGenerator, min:Int, max:Int -> Int) - return C_code:Int( - if (likely(((@min.small & @max.small) & 1) != 0)) { - int32_t r = @(rng.int32(Int32(min), Int32(max))); - return I_small(r); - } - - int32_t cmp = @(min <> max); - if (cmp > 0) - fail("Random minimum value (", @min, ") is larger than the maximum value (", @max, ")"); - if (cmp == 0) return @min; - - mpz_t range_size; - mpz_init_set_int(range_size, @max); - if (@min.small & 1) { - mpz_t min_mpz; - mpz_init_set_si(min_mpz, @min.small >> 2); - mpz_sub(range_size, range_size, min_mpz); - } else { - mpz_sub(range_size, range_size, *@min.big); - } - - gmp_randstate_t gmp_rng; - gmp_randinit_default(gmp_rng); - int64_t seed = @(rng.int64()); - gmp_randseed_ui(gmp_rng, (unsigned long)seed); - - mpz_t r; - mpz_init(r); - mpz_urandomm(r, gmp_rng, range_size); - - gmp_randclear(gmp_rng); - Int$plus(@min, Int$from_mpz(r)) - ) - - -func main() - >> rng := RandomNumberGenerator.new() - >> rng.num() - >> rng.num() - >> rng.num() - >> rng.num(0, 100) - >> rng.byte() - >> rng.bytes(20) - # >> rng.int(1, 100) - # >> rng.int(1, 100) - # >> rng.int(1, 100) - # >> rng.int(1, 100) diff --git a/lib/random/sysrandom.h b/lib/random/sysrandom.h deleted file mode 100644 index 3d6b800f..00000000 --- a/lib/random/sysrandom.h +++ /dev/null @@ -1,17 +0,0 @@ -// Logic for using system random numbers - -#pragma once - -#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) || defined(__APPLE__) -#include <stdlib.h> -static ssize_t getrandom(void *buf, size_t buflen, unsigned int flags) { - (void)flags; - arc4random_buf(buf, buflen); - return buflen; -} -#elif defined(__linux__) -// Use getrandom() -#include <sys/random.h> -#else -#error "Unsupported platform for secure random number generation" -#endif diff --git a/lib/shell/CHANGES.md b/lib/shell/CHANGES.md deleted file mode 100644 index 42ae752c..00000000 --- a/lib/shell/CHANGES.md +++ /dev/null @@ -1,5 +0,0 @@ -# Version History - -## v1.0 - -Initial version diff --git a/lib/shell/README.md b/lib/shell/README.md deleted file mode 100644 index d4bcd42e..00000000 --- a/lib/shell/README.md +++ /dev/null @@ -1,13 +0,0 @@ -# Shell - -This module defines a `lang` for running shell scripts: - -```tomo -use shell_v1.0 - ->> $Shell" - seq 5 - echo DONE -":get_output() -= "1$\n2$\n3$\n4$\n5$\nDONE" -``` diff --git a/lib/shell/shell.tm b/lib/shell/shell.tm deleted file mode 100644 index f9476161..00000000 --- a/lib/shell/shell.tm +++ /dev/null @@ -1,44 +0,0 @@ -use commands_v1.0 - -lang Shell - convert(text:Text -> Shell) - return Shell.from_text("'" ++ text.replace($/'/, `'"'"'`) ++ "'") - - convert(texts:[Text] -> Shell) - return Shell.from_text(" ".join([Shell(t).text for t in texts])) - - convert(path:Path -> Shell) - return Shell(Text(path.expand_home())) - - convert(paths:[Path] -> Shell) - return Shell.from_text(" ".join([Shell(Text(p)).text for p in paths])) - - convert(n:Int -> Shell) return Shell.from_text(Text(n)) - convert(n:Int64 -> Shell) return Shell.from_text(Text(n)) - convert(n:Int32 -> Shell) return Shell.from_text(Text(n)) - convert(n:Int16 -> Shell) return Shell.from_text(Text(n)) - convert(n:Int8 -> Shell) return Shell.from_text(Text(n)) - convert(n:Num -> Shell) return Shell.from_text(Text(n)) - convert(n:Num32 -> Shell) return Shell.from_text(Text(n)) - - func command(shell:Shell -> Command) - return Command("sh", ["-c", shell.text]) - - func result(shell:Shell, input="", input_bytes:[Byte]=[] -> ProgramResult) - return shell.command().result(input=input, input_bytes=input_bytes) - - func run(shell:Shell -> ExitType) - return shell.command().run() - - func get_output(shell:Shell, input="", trim_newline=yes -> Text?) - return shell.command().get_output(input=input, trim_newline=trim_newline) - - func get_output_bytes(shell:Shell, input="", input_bytes:[Byte]=[] -> [Byte]?) - return shell.command().get_output_bytes(input=input, input_bytes=input_bytes) - - func by_line(shell:Shell -> func(->Text?)?) - return shell.command().by_line() - -func main(command:Shell) - for line in command.by_line()! - >> line diff --git a/lib/time/CHANGES.md b/lib/time/CHANGES.md deleted file mode 100644 index 42ae752c..00000000 --- a/lib/time/CHANGES.md +++ /dev/null @@ -1,5 +0,0 @@ -# Version History - -## v1.0 - -Initial version diff --git a/lib/time/README.md b/lib/time/README.md deleted file mode 100644 index 55f725f1..00000000 --- a/lib/time/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# Time - -This is a Tomo library for working with dates and times. The `Time` type that -is provided is a date-and-time datatype that refers to a specific moment in -time. diff --git a/lib/time/time.tm b/lib/time/time.tm deleted file mode 100644 index d26d7218..00000000 --- a/lib/time/time.tm +++ /dev/null @@ -1,214 +0,0 @@ -# Time - a module for dealing with dates and times -use <math.h> -use ./time_defs.h - -enum Weekday(Sunday, Monday, Tuesday, Wednesday, Thursday, Friday, Saturday) - -struct TimeInfo(year,month,day,hour,minute,second,nanosecond:Int, weekday:Weekday, day_of_year:Int, timezone:Text) - -struct Time(tv_sec:Int64, tv_usec:Int64; extern) - func now(->Time) - return C_code : Time ( - struct timespec ts; - if (clock_gettime(CLOCK_REALTIME, &ts) != 0) - fail("Couldn't get the time!"); - (Time){.tv_sec=ts.tv_sec, .tv_usec=ts.tv_nsec/1000} - ) - - func local_timezone(->Text) - C_code { - if (_local_timezone.length < 0) { - static char buf[PATH_MAX]; - ssize_t len = readlink("/etc/localtime", buf, sizeof(buf)); - if (len < 0) - fail("Could not get local tz!"); - - char *zoneinfo = strstr(buf, "/zoneinfo/"); - if (zoneinfo) - _local_timezone = Text$from_str(zoneinfo + strlen("/zoneinfo/")); - else - fail("Could not resolve local tz!"); - } - } - return C_code:Text(_local_timezone) - - func set_local_timezone(timezone:Text) - C_code { - setenv("TZ", @(CString(timezone)), 1); - _local_timezone = @timezone; - tzset(); - } - - func format(t:Time, format="%c", timezone=Time.local_timezone() -> Text) - return C_code : Text ( - struct tm result; - time_t time = @t.tv_sec; - struct tm *final_info; - WITH_TIMEZONE(@timezone, final_info = localtime_r(&time, &result)); - static char buf[256]; - size_t len = strftime(buf, sizeof(buf), String(@format), final_info); - Text$from_strn(buf, len) - ) - - func new(year,month,day:Int, hour=0, minute=0, second=0.0, timezone=Time.local_timezone() -> Time) - return C_code : Time( - struct tm info = { - .tm_min=Int32$from_int(@minute, false), - .tm_hour=Int32$from_int(@hour, false), - .tm_mday=Int32$from_int(@day, false), - .tm_mon=Int32$from_int(@month, false) - 1, - .tm_year=Int32$from_int(@year, false) - 1900, - .tm_isdst=-1, - }; - - time_t t; - WITH_TIMEZONE(@timezone, t = mktime(&info)); - (Time){.tv_sec=t + (time_t)@second, .tv_usec=(suseconds_t)(fmod(@second, 1.0) * 1e9)} - ) - - func unix_timestamp(t:Time -> Int64) - return C_code:Int64((int64_t)@t.tv_sec) - - func from_unix_timestamp(timestamp:Int64 -> Time) - return C_code:Time((Time){.tv_sec=@timestamp};) - - func seconds_till(t:Time, target:Time -> Num) - seconds := Num(target.tv_sec - t.tv_sec) - seconds += 1e-9*Num(target.tv_usec - t.tv_usec) - return seconds - - func minutes_till(t:Time, target:Time -> Num) - return t.seconds_till(target)/60. - - func hours_till(t:Time, target:Time -> Num) - return t.seconds_till(target)/3600. - - func relative(t:Time, relative_to=Time.now(), timezone=Time.local_timezone() -> Text) - C_code { - struct tm info = {}; - struct tm relative_info = {}; - WITH_TIMEZONE(@timezone, { - localtime_r(&@t.tv_sec, &info); - localtime_r(&@relative_to.tv_sec, &relative_info); - }); - double second_diff = @(relative_to.seconds_till(t)); - if (info.tm_year != relative_info.tm_year && fabs(second_diff) > 365.*24.*60.*60.) - return num_format((long)info.tm_year - (long)relative_info.tm_year, "year"); - else if (info.tm_mon != relative_info.tm_mon && fabs(second_diff) > 31.*24.*60.*60.) - return num_format(12*((long)info.tm_year - (long)relative_info.tm_year) + (long)info.tm_mon - (long)relative_info.tm_mon, "month"); - else if (info.tm_yday != relative_info.tm_yday && fabs(second_diff) > 24.*60.*60.) - return num_format(round(second_diff/(24.*60.*60.)), "day"); - else if (info.tm_hour != relative_info.tm_hour && fabs(second_diff) > 60.*60.) - return num_format(round(second_diff/(60.*60.)), "hour"); - else if (info.tm_min != relative_info.tm_min && fabs(second_diff) > 60.) - return num_format(round(second_diff/(60.)), "minute"); - else { - if (fabs(second_diff) < 1e-6) - return num_format((long)(second_diff*1e9), "nanosecond"); - else if (fabs(second_diff) < 1e-3) - return num_format((long)(second_diff*1e6), "microsecond"); - else if (fabs(second_diff) < 1.0) - return num_format((long)(second_diff*1e3), "millisecond"); - else - return num_format((long)(second_diff), "second"); - } - } - fail("Unreachable") - - func time(t:Time, seconds=no, am_pm=yes, timezone=Time.local_timezone() -> Text) - time := if seconds and am_pm - t.format("%l:%M:%S%P") - else if seconds and not am_pm - t.format("%T") - else if not seconds and am_pm - t.format("%l:%M%P") - else - t.format("%H:%M") - return time.trim() - - func date(t:Time, timezone=Time.local_timezone() -> Text) - return t.format("%F") - - func info(t:Time, timezone=Time.local_timezone() -> TimeInfo) - ret : TimeInfo - C_code { - struct tm info = {}; - WITH_TIMEZONE(@timezone, localtime_r(&@t.tv_sec, &info)); - @ret.year = I(info.tm_year + 1900); - @ret.month = I(info.tm_mon + 1); - @ret.day = I(info.tm_mday); - @ret.hour = I(info.tm_hour); - @ret.minute = I(info.tm_min); - @ret.second = I(info.tm_sec); - @ret.nanosecond = I(@t.tv_usec); - @ret.weekday = info.tm_wday + 1; - @ret.day_of_year = I(info.tm_yday); - @ret.timezone = @timezone; - } - return ret - - func after(t:Time, seconds=0.0, minutes=0.0, hours=0.0, days=0, weeks=0, months=0, years=0, timezone=Time.local_timezone() -> Time) - return C_code : Time ( - double offset = @seconds + 60.*@minutes + 3600.*@hours ; - @t.tv_sec += (time_t)offset; - - struct tm info = {}; - WITH_TIMEZONE(@timezone, localtime_r(&@t.tv_sec, &info)); - - info.tm_mday += Int32$from_int(@days, false) + 7*Int32$from_int(@weeks, false); - info.tm_mon += Int32$from_int(@months, false); - info.tm_year += Int32$from_int(@years, false); - - time_t t = mktime(&info); - (Time){ - .tv_sec=t, - .tv_usec=@t.tv_usec + (suseconds_t)(fmod(offset, 1.0) * 1e9), - } - ) - - func parse(text:Text, format="%Y-%m-%dT%H:%M:%S%z", timezone=Time.local_timezone() -> Time?) - ret : Time? - C_code { - struct tm info = {.tm_isdst=-1}; - const char *str = Text$as_c_string(@text); - const char *fmt = Text$as_c_string(@format); - if (strstr(fmt, "%Z")) - fail("The %Z specifier is not supported for time parsing!"); - - char *invalid; - WITH_TIMEZONE(@timezone, invalid = strptime(str, fmt, &info)); - if (!invalid || invalid[0] != '\0') { - @ret.is_none = true; - } else { - long offset = info.tm_gmtoff; // Need to cache this because mktime() mutates it to local tz >:( - time_t t; - WITH_TIMEZONE(@timezone, t = mktime(&info)); - @ret.value.tv_sec = t + offset - info.tm_gmtoff; - } - } - return ret - -func _run_tests() - >> Time.now().format() - >> Time.set_local_timezone("Europe/Paris") - >> Time.now().format() - >> Time.set_local_timezone("America/New_York") - >> Time.now().format() - # >> Time.now().format(timezone="Europe/Paris") - # >> Time.now().format() - # >> Time.now().format("%Y-%m-%d") - # >> Time.new(2023, 11, 5).format() - # >> Time.local_timezone() - - # >> Time.new(2023, 11, 5).seconds_till(Time.now()) - # >> Time.new(2023, 11, 5).relative() - - # >> Time.now().info() - # >> Time.now().time() - # >> Time.now().date() - - # >> Time.parse("2023-11-05 01:01", "%Y-%m-%d %H:%M") - # >> Time.parse("2023-11-05 01:01", "%Y-%m-%d %H:%M", timezone="Europe/Paris") - -func main() - _run_tests() diff --git a/lib/time/time_defs.h b/lib/time/time_defs.h deleted file mode 100644 index eac3f23a..00000000 --- a/lib/time/time_defs.h +++ /dev/null @@ -1,34 +0,0 @@ -// Some helper logic for working with times. - -#pragma once -#include <stdlib.h> -#include <sys/time.h> -#include <time.h> - -typedef struct timeval Time; - -static OptionalText_t _local_timezone = NONE_TEXT; - -static INLINE Text_t num_format(long n, const char *unit) { - if (n == 0) return Text("now"); - return Text$from_str( - String((int64_t)labs(n), " ", unit, (n == -1 || n == 1) ? "" : "s", n <= 0 ? " ago" : " later")); -} - -static void set_local_timezone(Text_t tz) { - setenv("TZ", Text$as_c_string(tz), 1); - _local_timezone = tz; - tzset(); -} - -#define WITH_TIMEZONE(tz, body) \ - ({ \ - if (tz.length >= 0) { \ - OptionalText_t old_timezone = _local_timezone; \ - set_local_timezone(tz); \ - body; \ - set_local_timezone(old_timezone); \ - } else { \ - body; \ - } \ - }) diff --git a/lib/uuid/CHANGES.md b/lib/uuid/CHANGES.md deleted file mode 100644 index 42ae752c..00000000 --- a/lib/uuid/CHANGES.md +++ /dev/null @@ -1,5 +0,0 @@ -# Version History - -## v1.0 - -Initial version diff --git a/lib/uuid/uuid.tm b/lib/uuid/uuid.tm deleted file mode 100644 index f2be618e..00000000 --- a/lib/uuid/uuid.tm +++ /dev/null @@ -1,39 +0,0 @@ -use random_v1.0 -use time_v1.0 - -lang UUID - func v4(-> UUID) # Random UUID - bytes := &random.bytes(16) - bytes[7; unchecked] = 0x40 or (bytes[7; unchecked] and 0x0F) - bytes[9; unchecked] = (Byte(random.int8(0x8, 0xB)) << 4) or (bytes[9; unchecked] and 0x0F) - hex := "".join([b.hex() for b in bytes]) - uuid := "$(hex.slice(1, 8))-$(hex.slice(9, 12))-$(hex.slice(13, 16))-$(hex.slice(17, -1))" - return UUID.from_text(uuid) - - func v7(-> UUID) # Timestamp + random UUID - n := Time.now() - timestamp := n.tv_sec*1000 + n.tv_usec/1_000 - - bytes := [ - Byte((timestamp >> 40)), - Byte((timestamp >> 32)), - Byte((timestamp >> 24)), - Byte((timestamp >> 16)), - Byte((timestamp >> 8)), - Byte(timestamp), - (random.byte() and 0x0F) or 0x70, - random.byte(), - (random.byte() and 0x3F) or 0x80, - random.byte() for _ in 7, - ] - - hex := "".join([b.hex() for b in bytes]) - uuid := "$(hex.slice(1, 8))-$(hex.slice(9, 12))-$(hex.slice(13, 16))-$(hex.slice(17, -1))" - return UUID.from_text(uuid) - -enum UUIDVersion(v4, v7) -func main(version=UUIDVersion.v7) - when version is v4 - say(UUID.v4().text) - is v7 - say(UUID.v7().text) |
