From 3efd7d9cfbd330ebb45f39648ee96a3e429a06f9 Mon Sep 17 00:00:00 2001 From: Bruce Hill Date: Mon, 7 Apr 2025 18:14:20 -0400 Subject: Move core libraries into their own folder --- lib/base64/README.md | 3 + lib/base64/base64.tm | 96 ++++ lib/commands/README.md | 5 + lib/commands/commands.c | 285 ++++++++++ lib/commands/commands.tm | 91 ++++ lib/core/core.tm | 9 + lib/patterns/README.md | 444 ++++++++++++++++ lib/patterns/match_type.h | 8 + lib/patterns/patterns.c | 1298 +++++++++++++++++++++++++++++++++++++++++++++ lib/patterns/patterns.tm | 47 ++ lib/pthreads/pthreads.tm | 118 +++++ lib/random/README.md | 196 +++++++ lib/random/chacha.h | 196 +++++++ lib/random/random.tm | 233 ++++++++ lib/random/sysrandom.h | 14 + lib/shell/shell.tm | 44 ++ lib/time/README.md | 5 + lib/time/time.tm | 210 ++++++++ lib/time/time_defs.h | 30 ++ lib/uuid/uuid.tm | 36 ++ 20 files changed, 3368 insertions(+) create mode 100644 lib/base64/README.md create mode 100644 lib/base64/base64.tm create mode 100644 lib/commands/README.md create mode 100644 lib/commands/commands.c create mode 100644 lib/commands/commands.tm create mode 100644 lib/core/core.tm create mode 100644 lib/patterns/README.md create mode 100644 lib/patterns/match_type.h create mode 100644 lib/patterns/patterns.c create mode 100644 lib/patterns/patterns.tm create mode 100644 lib/pthreads/pthreads.tm create mode 100644 lib/random/README.md create mode 100644 lib/random/chacha.h create mode 100644 lib/random/random.tm create mode 100644 lib/random/sysrandom.h create mode 100644 lib/shell/shell.tm create mode 100644 lib/time/README.md create mode 100644 lib/time/time.tm create mode 100644 lib/time/time_defs.h create mode 100644 lib/uuid/uuid.tm (limited to 'lib') diff --git a/lib/base64/README.md b/lib/base64/README.md new file mode 100644 index 00000000..8db0f8ec --- /dev/null +++ b/lib/base64/README.md @@ -0,0 +1,3 @@ +# Base64 + +This is a library for encoding/decoding Base64 values. diff --git a/lib/base64/base64.tm b/lib/base64/base64.tm new file mode 100644 index 00000000..bf512a83 --- /dev/null +++ b/lib/base64/base64.tm @@ -0,0 +1,96 @@ +# Base 64 encoding and decoding + +_enc := "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/".bytes() + +_EQUAL_BYTE := Byte(0x3D) + +_dec : [Byte] = [ + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 62, 255, 255, 255, 63, + 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 255, 255, 255, 255, 255, 255, + 255, 0, 1, 2, 3, 4, 5, 6, + 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, + 23, 24, 25, 255, 255, 255, 255, 255, + 255, 26, 27, 28, 29, 30, 31, 32, + 33, 34, 35, 36, 37, 38, 39, 40, + 41, 42, 43, 44, 45, 46, 47, 48, + 49, 50, 51, 255, 255, 255, 255, 255, +] + +lang Base64 + func parse(text:Text -> Base64?) + return Base64.from_bytes(text.bytes()) + + func from_bytes(bytes:[Byte] -> Base64?) + output := &[Byte(0) for _ in bytes.length * 4 / 3 + 4] + src := Int64(1) + dest := Int64(1) + while src + 2 <= Int64(bytes.length) + chunk24 := ( + (Int32(bytes[src]) <<< 16) or (Int32(bytes[src+1]) <<< 8) or Int32(bytes[src+2]) + ) + src += 3 + + output[dest] = _enc[1 + ((chunk24 >>> 18) and 0b111111)] + output[dest+1] = _enc[1 + ((chunk24 >>> 12) and 0b111111)] + output[dest+2] = _enc[1 + ((chunk24 >>> 6) and 0b111111)] + output[dest+3] = _enc[1 + (chunk24 and 0b111111)] + dest += 4 + + if src + 1 == bytes.length + chunk16 := ( + (Int32(bytes[src]) <<< 8) or Int32(bytes[src+1]) + ) + output[dest] = _enc[1 + ((chunk16 >>> 10) and 0b11111)] + output[dest+1] = _enc[1 + ((chunk16 >>> 4) and 0b111111)] + output[dest+2] = _enc[1 + ((chunk16 <<< 2)and 0b111111)] + output[dest+3] = _EQUAL_BYTE + else if src == bytes.length + chunk8 := Int32(bytes[src]) + output[dest] = _enc[1 + ((chunk8 >>> 2) and 0b111111)] + output[dest+1] = _enc[1 + ((chunk8 <<< 4) and 0b111111)] + output[dest+2] = _EQUAL_BYTE + output[dest+3] = _EQUAL_BYTE + + return Base64.from_text(Text.from_bytes(output[]) or return none) + + func decode_text(b64:Base64 -> Text?) + return Text.from_bytes(b64.decode_bytes() or return none) + + func decode_bytes(b64:Base64 -> [Byte]?) + bytes := b64.text.bytes() + output := &[Byte(0) for _ in bytes.length/4 * 3] + src := Int64(1) + dest := Int64(1) + while src + 3 <= Int64(bytes.length) + chunk24 := ( + (Int32(_dec[1+bytes[src]]) <<< 18) or + (Int32(_dec[1+bytes[src+1]]) <<< 12) or + (Int32(_dec[1+bytes[src+2]]) <<< 6) or + Int32(_dec[1+bytes[src+3]]) + ) + src += 4 + + output[dest] = Byte((chunk24 >>> 16) and 0xFF) + output[dest+1] = Byte((chunk24 >>> 8) and 0xFF) + output[dest+2] = Byte(chunk24 and 0xFF) + dest += 3 + + while output[-1] == 0xFF + output[] = output.to(-2) + + return output[] + +func main(input=(/dev/stdin), decode=no) + if decode + b := Base64.from_text(input.read()!) + say(b.decode_text()!) + else + text := input.read()! + say(Base64.parse(text)!.text) diff --git a/lib/commands/README.md b/lib/commands/README.md new file mode 100644 index 00000000..040f4bd5 --- /dev/null +++ b/lib/commands/README.md @@ -0,0 +1,5 @@ +# Commands + +This module provides a way to run executable programs and get their output. You +can also feed in text to the programs' `stdin`. Think of it as `popen()` on +steroids. diff --git a/lib/commands/commands.c b/lib/commands/commands.c new file mode 100644 index 00000000..973fc0c5 --- /dev/null +++ b/lib/commands/commands.c @@ -0,0 +1,285 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define READ_END 0 +#define WRITE_END 1 +int run_command(Text_t exe, List_t arg_list, Table_t env_table, + OptionalList_t input_bytes, List_t *output_bytes, List_t *error_bytes) +{ + pthread_testcancel(); + + struct sigaction sa = { .sa_handler = SIG_IGN }, oldint, oldquit; + sigaction(SIGINT, &sa, &oldint); + sigaction(SIGQUIT, &sa, &oldquit); + sigaddset(&sa.sa_mask, SIGCHLD); + sigset_t old, reset; + sigprocmask(SIG_BLOCK, &sa.sa_mask, &old); + sigemptyset(&reset); + if (oldint.sa_handler != SIG_IGN) sigaddset(&reset, SIGINT); + if (oldquit.sa_handler != SIG_IGN) sigaddset(&reset, SIGQUIT); + posix_spawnattr_t attr; + posix_spawnattr_init(&attr); + posix_spawnattr_setsigmask(&attr, &old); + posix_spawnattr_setsigdefault(&attr, &reset); + posix_spawnattr_setflags(&attr, POSIX_SPAWN_SETSIGDEF|POSIX_SPAWN_SETSIGMASK); + + int child_inpipe[2], child_outpipe[2], child_errpipe[2]; + if (input_bytes.length >= 0) pipe(child_inpipe); + if (output_bytes) pipe(child_outpipe); + if (error_bytes) pipe(child_errpipe); + + posix_spawn_file_actions_t actions; + posix_spawn_file_actions_init(&actions); + if (input_bytes.length >= 0) { + posix_spawn_file_actions_adddup2(&actions, child_inpipe[READ_END], STDIN_FILENO); + posix_spawn_file_actions_addclose(&actions, child_inpipe[WRITE_END]); + } + if (output_bytes) { + posix_spawn_file_actions_adddup2(&actions, child_outpipe[WRITE_END], STDOUT_FILENO); + posix_spawn_file_actions_addclose(&actions, child_outpipe[READ_END]); + } + if (error_bytes) { + posix_spawn_file_actions_adddup2(&actions, child_errpipe[WRITE_END], STDERR_FILENO); + posix_spawn_file_actions_addclose(&actions, child_errpipe[READ_END]); + } + + const char *exe_str = Text$as_c_string(exe); + + List_t arg_strs = {}; + List$insert_value(&arg_strs, exe_str, I(0), sizeof(char*)); + for (int64_t i = 0; i < arg_list.length; i++) + List$insert_value(&arg_strs, Text$as_c_string(*(Text_t*)(arg_list.data + i*arg_list.stride)), I(0), sizeof(char*)); + List$insert_value(&arg_strs, NULL, I(0), sizeof(char*)); + char **args = arg_strs.data; + + extern char **environ; + char **env = environ; + if (env_table.entries.length > 0) { + List_t env_list = {}; // List of const char* + for (char **e = environ; *e; e++) + List$insert(&env_list, e, I(0), sizeof(char*)); + + for (int64_t i = 0; i < env_table.entries.length; i++) { + struct { Text_t key, value; } *entry = env_table.entries.data + env_table.entries.stride*i; + const char *env_entry = String(entry->key, "=", entry->value); + List$insert(&env_list, &env_entry, I(0), sizeof(char*)); + } + List$insert_value(&env_list, NULL, I(0), sizeof(char*)); + assert(env_list.stride == sizeof(char*)); + env = env_list.data; + } + + pid_t pid; + int ret = exe_str[0] == '/' ? + posix_spawn(&pid, exe_str, &actions, &attr, args, env) + : posix_spawnp(&pid, exe_str, &actions, &attr, args, env); + if (ret != 0) + return -1; + + posix_spawnattr_destroy(&attr); + posix_spawn_file_actions_destroy(&actions); + + if (input_bytes.length >= 0) close(child_inpipe[READ_END]); + if (output_bytes) close(child_outpipe[WRITE_END]); + if (error_bytes) close(child_errpipe[WRITE_END]); + + struct pollfd pollfds[3] = {}; + if (input_bytes.length >= 0) pollfds[0] = (struct pollfd){.fd=child_inpipe[WRITE_END], .events=POLLOUT}; + if (output_bytes) pollfds[1] = (struct pollfd){.fd=child_outpipe[WRITE_END], .events=POLLIN}; + if (error_bytes) pollfds[2] = (struct pollfd){.fd=child_errpipe[WRITE_END], .events=POLLIN}; + + if (input_bytes.length > 0 && input_bytes.stride != 1) + List$compact(&input_bytes, sizeof(char)); + if (output_bytes) + *output_bytes = (List_t){.atomic=1, .stride=1, .length=0}; + if (error_bytes) + *error_bytes = (List_t){.atomic=1, .stride=1, .length=0}; + + while (input_bytes.length > 0 || output_bytes || error_bytes) { + (void)poll(pollfds, sizeof(pollfds)/sizeof(pollfds[0]), -1); // Wait for data or readiness + bool did_something = false; + if (input_bytes.length >= 0 && pollfds[0].revents) { + if (input_bytes.length > 0) { + ssize_t written = write(child_inpipe[WRITE_END], input_bytes.data, (size_t)input_bytes.length); + if (written > 0) { + input_bytes.data += written; + input_bytes.length -= (int64_t)written; + did_something = true; + } else if (written < 0) { + close(child_inpipe[WRITE_END]); + pollfds[0].events = 0; + } + } + if (input_bytes.length <= 0) { + close(child_inpipe[WRITE_END]); + pollfds[0].events = 0; + } + } + char buf[256]; + if (output_bytes && pollfds[1].revents) { + ssize_t n = read(child_outpipe[READ_END], buf, sizeof(buf)); + did_something = did_something || (n > 0); + if (n <= 0) { + close(child_outpipe[READ_END]); + pollfds[1].events = 0; + } else if (n > 0) { + if (output_bytes->free < n) { + output_bytes->data = GC_REALLOC(output_bytes->data, (size_t)(output_bytes->length + n)); + output_bytes->free = 0; + } + memcpy(output_bytes->data + output_bytes->length, buf, (size_t)n); + output_bytes->length += n; + } + } + if (error_bytes && pollfds[2].revents) { + ssize_t n = read(child_errpipe[READ_END], buf, sizeof(buf)); + did_something = did_something || (n > 0); + if (n <= 0) { + close(child_errpipe[READ_END]); + pollfds[2].events = 0; + } else if (n > 0) { + if (error_bytes->free < n) { + error_bytes->data = GC_REALLOC(error_bytes->data, (size_t)(error_bytes->length + n)); + error_bytes->free = 0; + } + memcpy(error_bytes->data + error_bytes->length, buf, (size_t)n); + error_bytes->length += n; + } + } + if (!did_something) break; + } + + int status = 0; + if (ret == 0) { + while (waitpid(pid, &status, 0) < 0 && errno == EINTR) { + if (WIFEXITED(status) || WIFSIGNALED(status)) + break; + else if (WIFSTOPPED(status)) + kill(pid, SIGCONT); + } + } + + if (input_bytes.length >= 0) close(child_inpipe[WRITE_END]); + if (output_bytes) close(child_outpipe[READ_END]); + if (error_bytes) close(child_errpipe[READ_END]); + + sigaction(SIGINT, &oldint, NULL); + sigaction(SIGQUIT, &oldquit, NULL); + sigprocmask(SIG_SETMASK, &old, NULL); + + if (ret) errno = ret; + return status; +} + +typedef struct { + pid_t pid; + FILE *out; +} child_info_t; + +static void _line_reader_cleanup(child_info_t *child) +{ + if (child && child->out) { + fclose(child->out); + child->out = NULL; + } + if (child->pid) { + kill(child->pid, SIGTERM); + child->pid = 0; + } +} + +static Text_t _next_line(child_info_t *child) +{ + if (!child || !child->out) return NONE_TEXT; + + char *line = NULL; + size_t size = 0; + ssize_t len = getline(&line, &size, child->out); + if (len <= 0) { + _line_reader_cleanup(child); + return NONE_TEXT; + } + + while (len > 0 && (line[len-1] == '\r' || line[len-1] == '\n')) + --len; + + if (u8_check((uint8_t*)line, (size_t)len) != NULL) + fail("Invalid UTF8!"); + + Text_t line_text = Text$from_strn(line, len); + free(line); + return line_text; +} + +OptionalClosure_t command_by_line(Text_t exe, List_t arg_list, Table_t env_table) +{ + posix_spawnattr_t attr; + posix_spawnattr_init(&attr); + + int child_outpipe[2]; + pipe(child_outpipe); + + posix_spawn_file_actions_t actions; + posix_spawn_file_actions_init(&actions); + posix_spawn_file_actions_adddup2(&actions, child_outpipe[WRITE_END], STDOUT_FILENO); + posix_spawn_file_actions_addclose(&actions, child_outpipe[READ_END]); + + const char *exe_str = Text$as_c_string(exe); + + List_t arg_strs = {}; + List$insert_value(&arg_strs, exe_str, I(0), sizeof(char*)); + for (int64_t i = 0; i < arg_list.length; i++) + List$insert_value(&arg_strs, Text$as_c_string(*(Text_t*)(arg_list.data + i*arg_list.stride)), I(0), sizeof(char*)); + List$insert_value(&arg_strs, NULL, I(0), sizeof(char*)); + char **args = arg_strs.data; + + extern char **environ; + char **env = environ; + if (env_table.entries.length > 0) { + List_t env_list = {}; // List of const char* + for (char **e = environ; *e; e++) + List$insert(&env_list, e, I(0), sizeof(char*)); + + for (int64_t i = 0; i < env_table.entries.length; i++) { + struct { Text_t key, value; } *entry = env_table.entries.data + env_table.entries.stride*i; + const char *env_entry = String(entry->key, "=", entry->value); + List$insert(&env_list, &env_entry, I(0), sizeof(char*)); + } + List$insert_value(&env_list, NULL, I(0), sizeof(char*)); + assert(env_list.stride == sizeof(char*)); + env = env_list.data; + } + + pid_t pid; + int ret = exe_str[0] == '/' ? + posix_spawn(&pid, exe_str, &actions, &attr, args, env) + : posix_spawnp(&pid, exe_str, &actions, &attr, args, env); + if (ret != 0) + return NONE_CLOSURE; + + posix_spawnattr_destroy(&attr); + posix_spawn_file_actions_destroy(&actions); + + close(child_outpipe[WRITE_END]); + + child_info_t *child_info = GC_MALLOC(sizeof(child_info_t)); + child_info->out = fdopen(child_outpipe[READ_END], "r"); + child_info->pid = pid; + GC_register_finalizer(child_info, (void*)_line_reader_cleanup, NULL, NULL, NULL); + return (Closure_t){.fn=(void*)_next_line, .userdata=child_info}; +} + +#undef READ_END +#undef WRITE_END diff --git a/lib/commands/commands.tm b/lib/commands/commands.tm new file mode 100644 index 00000000..d72398b9 --- /dev/null +++ b/lib/commands/commands.tm @@ -0,0 +1,91 @@ +# Functions for running system commands + +use ./commands.c +use -lunistring + +extern run_command : func(exe:Text, args:[Text], env:{Text=Text}, input:[Byte]?, output:&[Byte]?, error:&[Byte]? -> Int32) +extern command_by_line : func(exe:Text, args:[Text], env:{Text=Text} -> func(->Text?)?) + +enum ExitType(Exited(status:Int32), Signaled(signal:Int32), Failed) + func succeeded(e:ExitType -> Bool) + when e is Exited(status) return (status == 0) + else return no + + func or_fail(e:ExitType, message:Text?=none) + if not e.succeeded() + fail(message or "Program failed: $e") + +struct ProgramResult(stdout:[Byte], stderr:[Byte], exit_type:ExitType) + func or_fail(r:ProgramResult, message:Text?=none -> ProgramResult) + when r.exit_type is Exited(status) + if status == 0 + return r + else fail(message or "Program failed: $r") + fail(message or "Program failed: $r") + + func output_text(r:ProgramResult, trim_newline=yes -> Text?) + when r.exit_type is Exited(status) + if status == 0 + if text := Text.from_bytes(r.stdout) + if trim_newline + text = text.without_suffix("\n") + return text + else return none + return none + + func error_text(r:ProgramResult -> Text?) + when r.exit_type is Exited(status) + if status == 0 + return Text.from_bytes(r.stderr) + else return none + return none + + func succeeded(r:ProgramResult -> Bool) + when r.exit_type is Exited(status) + return (status == 0) + else + return no + +struct Command(command:Text, args:[Text]=[], env:{Text=Text}={}) + func from_path(path:Path, args:[Text]=[], env:{Text=Text}={} -> Command) + return Command(Text(path), args, env) + + func result(command:Command, input="", input_bytes:[Byte]=[] -> ProgramResult) + if input.length > 0 + (&input_bytes).insert_all(input.bytes()) + + stdout : [Byte] + stderr : [Byte] + status := run_command(command.command, command.args, command.env, input_bytes, &stdout, &stderr) + + if C_code:Bool(WIFEXITED(_$status)) + return ProgramResult(stdout, stderr, ExitType.Exited(C_code:Int32(WEXITSTATUS(_$status)))) + + if C_code:Bool(WIFSIGNALED(_$status)) + return ProgramResult(stdout, stderr, ExitType.Signaled(C_code:Int32(WTERMSIG(_$status)))) + + return ProgramResult(stdout, stderr, ExitType.Failed) + + func run(command:Command, -> ExitType) + status := run_command(command.command, command.args, command.env, none, none, none) + + if C_code:Bool(WIFEXITED(_$status)) + return ExitType.Exited(C_code:Int32(WEXITSTATUS(_$status))) + + if C_code:Bool(WIFSIGNALED(_$status)) + return ExitType.Signaled(C_code:Int32(WTERMSIG(_$status))) + + return ExitType.Failed + + func get_output(command:Command, input="", trim_newline=yes -> Text?) + return command.result(input=input).output_text(trim_newline=trim_newline) + + func get_output_bytes(command:Command, input="", input_bytes:[Byte]=[] -> [Byte]?) + result := command.result(input=input, input_bytes=input_bytes) + when result.exit_type is Exited(status) + if status == 0 return result.stdout + return none + else return none + + func by_line(command:Command -> func(->Text?)?) + return command_by_line(command.command, command.args, command.env) diff --git a/lib/core/core.tm b/lib/core/core.tm new file mode 100644 index 00000000..6aa4c267 --- /dev/null +++ b/lib/core/core.tm @@ -0,0 +1,9 @@ +# This file just uses all the most commonly used standard +# library modules so you don't have to import them one-by-one + +use patterns +use commands +use shell +use pthreads +use random +use time diff --git a/lib/patterns/README.md b/lib/patterns/README.md new file mode 100644 index 00000000..faf2854e --- /dev/null +++ b/lib/patterns/README.md @@ -0,0 +1,444 @@ +# Text Pattern Matching + +As an alternative to full regular expressions, Tomo provides a limited text +matching pattern syntax that is intended to solve 80% of use cases in under 1% +of the code size (PCRE's codebase is roughly 150k lines of code, and Tomo's +pattern matching code is a bit under 1k lines of code). Tomo's pattern matching +syntax is highly readable and works well for matching literal text without +getting [leaning toothpick syndrome](https://en.wikipedia.org/wiki/Leaning_toothpick_syndrome). + +For more advanced use cases, consider linking against a C library for regular +expressions or pattern matching. + +`Pat` is a [domain-specific language](docs/langs.md), in other words, it's +like a `Text`, but it has a distinct type. + +Patterns are used in a small, but very powerful API that handles many text +functions that would normally be handled by a more extensive API: + +- [`by_pattern(text:Text, pattern:Pat -> func(->PatternMatch?))`](#by_pattern) +- [`by_pattern_split(text:Text, pattern:Pat -> func(->Text?))`](#by_pattern_split) +- [`each_pattern(text:Text, pattern:Pat, fn:func(m:PatternMatch), recursive=yes)`](#each_pattern) +- [`find_patterns(text:Text, pattern:Pat -> [PatternMatch])`](#find_patterns) +- [`has_pattern(text:Text, pattern:Pat -> Bool)`](#has_pattern) +- [`map_pattern(text:Text, pattern:Pat, fn:func(m:PatternMatch -> Text), recursive=yes -> Text)`](#map_pattern) +- [`matches_pattern(text:Text, pattern:Pat -> Bool)`](#matches_pattern) +- [`pattern_captures(text:Text, pattern:Pat -> [Text]?)`](#pattern_captures) +- [`replace_pattern(text:Text, pattern:Pat, replacement:Text, backref="@", recursive=yes -> Text)`](#replace_pattern) +- [`split_pattern(text:Text, pattern:Pat -> [Text])`](#split_pattern) +- [`translate_patterns(text:Text, replacements:{Pat,Text}, backref="@", recursive=yes -> Text)`](#translate_patterns) +- [`trim_pattern(text:Text, pattern=$Pat"{space}", left=yes, right=yes -> Text)`](#trim_pattern) + +## Matches + +Pattern matching functions work with a type called `PatternMatch` that has three fields: + +- `text`: The full text of the match. +- `index`: The index in the text where the match was found. +- `captures`: A list containing the matching text of each non-literal pattern group. + +See [Text Functions](text.md#Text-Functions) for the full API documentation. + +## Syntax + +Patterns have three types of syntax: + +- `{` followed by an optional count (`n`, `n-m`, or `n+`), followed by an + optional `!` to negate the pattern, followed by an optional pattern name or + Unicode character name, followed by a required `}`. + +- Any matching pair of quotes or parentheses or braces with a `?` in the middle + (e.g. `"?"` or `(?)`). + +- Any other character is treated as a literal to be matched exactly. + +## Named Patterns + +Named patterns match certain pre-defined patterns that are commonly useful. To +use a named pattern, use the syntax `{name}`. Names are case-insensitive and +mostly ignore spaces, underscores, and dashes. + +- `..` - Any character (note that a single `.` would mean the literal period + character). +- `digit` - A unicode digit +- `email` - an email address +- `emoji` - an emoji +- `end` - the very end of the text +- `id` - A unicode identifier +- `int` - One or more digits with an optional `-` (minus sign) in front +- `ip` - an IP address (IPv4 or IPv6) +- `ipv4` - an IPv4 address +- `ipv6` - an IPv6 address +- `nl`/`newline`/`crlf` - A line break (either `\r\n` or `\n`) +- `num` - One or more digits with an optional `-` (minus sign) in front and an optional `.` and more digits after +- `start` - the very start of the text +- `uri` - a URI +- `url` - a URL (URI that specifically starts with `http://`, `https://`, `ws://`, `wss://`, or `ftp://`) +- `word` - A unicode identifier (same as `id`) + +For non-alphabetic characters, any single character is treated as matching +exactly that character. For example, `{1{}` matches exactly one `{` +character. Or, `{1.}` matches exactly one `.` character. + +Patterns can also use any Unicode property name. Some helpful ones are: + +- `hex` - Hexidecimal digits +- `lower` - Lowercase letters +- `space` - The space character +- `upper` - Uppercase letters +- `whitespace` - Whitespace characters + +Patterns may also use exact Unicode codepoint names. For example: `{1 latin +small letter A}` matches `a`. + +## Negating Patterns + +If an exclamation mark (`!`) is placed before a pattern's name, then characters +are matched only when they _don't_ match the pattern. For example, `{!alpha}` +will match all characters _except_ alphabetic ones. + +## Interpolating Text and Escaping + +To escape a character in a pattern (e.g. if you want to match the literal +character `?`), you can use the syntax `{1 ?}`. This is almost never necessary +unless you have text that looks like a Tomo text pattern and has something like +`{` or `(?)` inside it. + +However, if you're trying to do an exact match of arbitrary text values, you'll +want to have the text automatically escaped. Fortunately, Tomo's injection-safe +DSL text interpolation supports automatic text escaping. This means that if you +use text interpolation with the `$` sign to insert a text value, the value will +be automatically escaped using the `{1 ?}` rule described above: + +```tomo +# Risk of code injection (would cause an error because 'xxx' is not a valid +# pattern name: +>> user_input := get_user_input() += "{xxx}" + +# Interpolation automatically escapes: +>> $/$user_input/ += $/{1{}..xxx}/ + +# This is: `{ 1{ }` (one open brace) followed by the literal text "..xxx}" + +# No error: +>> some_text.find($/$user_input/) += 0 +``` + +If you prefer, you can also use this to insert literal characters: + +```tomo +>> $/literal $"{..}"/ += $/literal {1{}..}/ +``` + +## Repetitions + +By default, named patterns match 1 or more repetitions, but you can specify how +many repetitions you want by putting a number or range of numbers first using +`n` (exactly `n` repetitions), `n-m` (between `n` and `m` repetitions), or `n+` +(`n` or more repetitions): + +``` +{4-5 alpha} +0x{hex} +{4 digit}-{2 digit}-{2 digit} +{2+ space} +{0-1 question mark} +``` + + +# Methods + +### `by_pattern` +Returns an iterator function that yields `PatternMatch` objects for each occurrence. + +```tomo +func by_pattern(text:Text, pattern:Pat -> func(->PatternMatch?)) +``` + +- `text`: The text to search. +- `pattern`: The pattern to match. + +**Returns:** +An iterator function that yields `PatternMatch` objects one at a time. + +**Example:** +```tomo +text := "one, two, three" +for word in text.by_pattern($Pat"{id}"): + say(word.text) +``` + +--- + +### `by_pattern_split` +Returns an iterator function that yields text segments split by a pattern. + +```tomo +func by_pattern_split(text:Text, pattern:Pat -> func(->Text?)) +``` + +- `text`: The text to split. +- `pattern`: The pattern to use as a separator. + +**Returns:** +An iterator function that yields text segments. + +**Example:** +```tomo +text := "one two three" +for word in text.by_pattern_split($Pat"{whitespace}"): + say(word.text) +``` + +--- + +### `each_pattern` +Applies a function to each occurrence of a pattern in the text. + +```tomo +func each_pattern(text:Text, pattern:Pat, fn:func(m:PatternMatch), recursive=yes) +``` + +- `text`: The text to search. +- `pattern`: The pattern to match. +- `fn`: The function to apply to each match. +- `recursive`: If `yes`, applies the function recursively on modified text. + +**Example:** +```tomo +text := "one two three" +text.each_pattern($Pat"{id}", func(m:PatternMatch): + say(m.txt) +) +``` + +--- + +### `find_patterns` +Finds all occurrences of a pattern in a text and returns them as `PatternMatch` objects. + +```tomo +func find_patterns(text:Text, pattern:Pat -> [PatternMatch]) +``` + +- `text`: The text to search. +- `pattern`: The pattern to match. + +**Returns:** +A list of `PatternMatch` objects. + +**Example:** +```tomo +text := "one! two three!" +>> text.find_patterns($Pat"{id}!") += [PatternMatch(text="one!", index=1, captures=["one"]), PatternMatch(text="three!", index=10, captures=["three"])] +``` + +--- + +### `has_pattern` +Checks whether a given pattern appears in the text. + +```tomo +func has_pattern(text:Text, pattern:Pat -> Bool) +``` + +- `text`: The text to search. +- `pattern`: The pattern to check for. + +**Returns:** +`yes` if a match is found, otherwise `no`. + +**Example:** +```tomo +text := "...okay..." +>> text.has_pattern($Pat"{id}") += yes +``` + +--- + +### `map_pattern` +Transforms matches of a pattern using a mapping function. + +```tomo +func map_pattern(text:Text, pattern:Pat, fn:func(m:PatternMatch -> Text), recursive=yes -> Text) +``` + +- `text`: The text to modify. +- `pattern`: The pattern to match. +- `fn`: A function that transforms matches. +- `recursive`: If `yes`, applies transformations recursively. + +**Returns:** +A new text with the transformed matches. + +**Example:** +```tomo +text := "I have #apples and #oranges and #plums" +fruits := {"apples"=4, "oranges"=5} +>> text.map_pattern($Pat'#{id}', func(match:PatternMatch): + fruit := match.captures[1] + "$(fruits[fruit] or 0) $fruit" +) += "I have 4 apples and 5 oranges and 0 plums" +``` + +--- + +### `matches_pattern` +Returns whether or not text matches a pattern completely. + +```tomo +func matches_pattern(text:Text, pattern:Pat -> Bool) +``` + +- `text`: The text to match against. +- `pattern`: The pattern to match. + +**Returns:** +`yes` if the whole text matches the pattern, otherwise `no`. + +**Example:** +```tomo +>> "Hello!!!".matches_pattern($Pat"{id}") += no +>> "Hello".matches_pattern($Pat"{id}") += yes +``` + +--- + +### `pattern_captures` +Returns a list of pattern captures for the given pattern. + +```tomo +func pattern_captures(text:Text, pattern:Pat -> [Text]?) +``` + +- `text`: The text to match against. +- `pattern`: The pattern to match. + +**Returns:** +An optional list of matched pattern captures. Returns `none` if the text does +not match the pattern. + +**Example:** +```tomo +>> "123 boxes".pattern_captures($Pat"{int} {id}") += ["123", "boxes"]? +>> "xxx".pattern_captures($Pat"{int} {id}") += none +``` + +--- + +### `replace_pattern` +Replaces occurrences of a pattern with a replacement text, supporting backreferences. + +```tomo +func replace_pattern(text:Text, pattern:Pat, replacement:Text, backref="@", recursive=yes -> Text) +``` + +- `text`: The text to modify. +- `pattern`: The pattern to match. +- `replacement`: The text to replace matches with. +- `backref`: The symbol for backreferences in the replacement. +- `recursive`: If `yes`, applies replacements recursively. + +**Returns:** +A new text with replacements applied. + +**Example:** +```tomo +>> "I have 123 apples and 456 oranges".replace_pattern($Pat"{int}", "some") += "I have some apples and some oranges" + +>> "I have 123 apples and 456 oranges".replace_pattern($Pat"{int}", "(@1)") += "I have (123) apples and (456) oranges" + +>> "I have 123 apples and 456 oranges".replace_pattern($Pat"{int}", "(?1)", backref="?") += "I have (123) apples and (456) oranges" + +>> "bad(fn(), bad(notbad))".replace_pattern($Pat"bad(?)", "good(@1)") += "good(fn(), good(notbad))" + +>> "bad(fn(), bad(notbad))".replace_pattern($Pat"bad(?)", "good(@1)", recursive=no) += "good(fn(), bad(notbad))" +``` + +--- + +### `split_pattern` +Splits a text into segments using a pattern as the delimiter. + +```tomo +func split_pattern(text:Text, pattern:Pat -> [Text]) +``` + +- `text`: The text to split. +- `pattern`: The pattern to use as a separator. + +**Returns:** +A list of text segments. + +**Example:** +```tomo +>> "one two three".split_pattern($Pat"{whitespace}") += ["one", "two", "three"] +``` + +--- + +### `translate_patterns` +Replaces multiple patterns using a mapping of patterns to replacement texts. + +```tomo +func translate_patterns(text:Text, replacements:{Pat,Text}, backref="@", recursive=yes -> Text) +``` + +- `text`: The text to modify. +- `replacements`: A table mapping patterns to their replacements. +- `backref`: The symbol for backreferences in replacements. +- `recursive`: If `yes`, applies replacements recursively. + +**Returns:** +A new text with all specified replacements applied. + +**Example:** +```tomo +>> text := "foo(x, baz(1))" +>> text.translate_patterns({ + $Pat"{id}(?)"="call(fn('@1'), @2)", + $Pat"{id}"="var('@1')", + $Pat"{int}"="int(@1)", +}) += "call(fn('foo'), var('x'), call(fn('baz'), int(1)))" +``` + +--- + +### `trim_pattern` +Removes matching patterns from the beginning and/or end of a text. + +```tomo +func trim_pattern(text:Text, pattern=$Pat"{space}", left=yes, right=yes -> Text) +``` + +- `text`: The text to trim. +- `pattern`: The pattern to trim (defaults to whitespace). +- `left`: If `yes`, trims from the beginning. +- `right`: If `yes`, trims from the end. + +**Returns:** +The trimmed text. + +**Example:** +```tomo +>> "123abc456".trim_pattern($Pat"{digit}") += "abc" +``` diff --git a/lib/patterns/match_type.h b/lib/patterns/match_type.h new file mode 100644 index 00000000..abbc4fce --- /dev/null +++ b/lib/patterns/match_type.h @@ -0,0 +1,8 @@ +#pragma once + +typedef struct { + Text_t text; + Int_t index; + List_t captures; +} XMatch; + diff --git a/lib/patterns/patterns.c b/lib/patterns/patterns.c new file mode 100644 index 00000000..ee27e4e3 --- /dev/null +++ b/lib/patterns/patterns.c @@ -0,0 +1,1298 @@ +// Logic for text pattern matching + +#include +#include +#include +#include +#include +#include + +#define MAX_BACKREFS 100 + +typedef struct { + Text_t text; + Int_t index; + List_t captures; +} PatternMatch; + +typedef struct { + Text_t text; + Int_t index; + List_t captures; + bool is_none:1; +} OptionalPatternMatch; + +#define NONE_MATCH ((OptionalPatternMatch){.is_none=true}) + +typedef struct { + int64_t index, length; + bool occupied, recursive; +} capture_t; + +typedef struct { + enum { PAT_START, PAT_END, PAT_ANY, PAT_GRAPHEME, PAT_PROPERTY, PAT_QUOTE, PAT_PAIR, PAT_FUNCTION } tag; + bool negated, non_capturing; + int64_t min, max; + union { + int32_t grapheme; + uc_property_t property; + int64_t (*fn)(TextIter_t *, int64_t); + int32_t quote_graphemes[2]; + int32_t pair_graphemes[2]; + }; +} pat_t; + +static Text_t replace_list(Text_t text, List_t replacements, Text_t backref_pat, bool recursive); + +static INLINE void skip_whitespace(TextIter_t *state, int64_t *i) +{ + while (*i < state->stack[0].text.length) { + int32_t grapheme = Text$get_grapheme_fast(state, *i); + if (grapheme > 0 && !uc_is_property_white_space((ucs4_t)grapheme)) + return; + *i += 1; + } +} + +static INLINE bool match_grapheme(TextIter_t *state, int64_t *i, int32_t grapheme) +{ + if (*i < state->stack[0].text.length && Text$get_grapheme_fast(state, *i) == grapheme) { + *i += 1; + return true; + } + return false; +} + +static INLINE bool match_str(TextIter_t *state, int64_t *i, const char *str) +{ + int64_t matched = 0; + while (matched[str]) { + if (*i + matched >= state->stack[0].text.length || Text$get_grapheme_fast(state, *i + matched) != str[matched]) + return false; + matched += 1; + } + *i += matched; + return true; +} + +static int64_t parse_int(TextIter_t *state, int64_t *i) +{ + int64_t value = 0; + for (;; *i += 1) { + uint32_t grapheme = Text$get_main_grapheme_fast(state, *i); + int digit = uc_digit_value(grapheme); + if (digit < 0) break; + if (value >= INT64_MAX/10) break; + value = 10*value + digit; + } + return value; +} + +static const char *get_property_name(TextIter_t *state, int64_t *i) +{ + skip_whitespace(state, i); + char *name = GC_MALLOC_ATOMIC(UNINAME_MAX); + char *dest = name; + while (*i < state->stack[0].text.length) { + int32_t grapheme = Text$get_grapheme_fast(state, *i); + if (!(grapheme & ~0xFF) && (isalnum(grapheme) || grapheme == ' ' || grapheme == '_' || grapheme == '-')) { + *dest = (char)grapheme; + ++dest; + if (dest >= name + UNINAME_MAX - 1) + break; + } else { + break; + } + *i += 1; + } + + while (dest > name && dest[-1] == ' ') + *(dest--) = '\0'; + + if (dest == name) return NULL; + *dest = '\0'; + return name; +} + +#define EAT1(state, index, cond) ({\ + int32_t grapheme = Text$get_grapheme_fast(state, index); \ + bool success = (cond); \ + if (success) index += 1; \ + success; }) + +#define EAT2(state, index, cond1, cond2) ({\ + int32_t grapheme = Text$get_grapheme_fast(state, index); \ + bool success = (cond1); \ + if (success) { \ + grapheme = Text$get_grapheme_fast(state, index + 1); \ + success = (cond2); \ + if (success) \ + index += 2; \ + } \ + success; }) + + +#define EAT_MANY(state, index, cond) ({ int64_t _n = 0; while (EAT1(state, index, cond)) { _n += 1; } _n; }) + +static int64_t match_email(TextIter_t *state, int64_t index) +{ + // email = local "@" domain + // local = 1-64 ([a-zA-Z0-9!#$%&‘*+–/=?^_`.{|}~] | non-ascii) + // domain = dns-label ("." dns-label)* + // dns-label = 1-63 ([a-zA-Z0-9-] | non-ascii) + + if (index > 0) { + uint32_t prev_codepoint = Text$get_main_grapheme_fast(state, index - 1); + if (uc_is_property_alphabetic(prev_codepoint)) + return -1; + } + + int64_t start_index = index; + + // Local part: + int64_t local_len = 0; + static const char *allowed_local = "!#$%&‘*+–/=?^_`.{|}~"; + while (EAT1(state, index, + (grapheme & ~0x7F) || isalnum((char)grapheme) || strchr(allowed_local, (char)grapheme))) { + local_len += 1; + if (local_len > 64) return -1; + } + + if (!EAT1(state, index, grapheme == '@')) + return -1; + + // Host + int64_t host_len = 0; + do { + int64_t label_len = 0; + while (EAT1(state, index, + (grapheme & ~0x7F) || isalnum((char)grapheme) || grapheme == '-')) { + label_len += 1; + if (label_len > 63) return -1; + } + + if (label_len == 0) + return -1; + + host_len += label_len; + if (host_len > 255) + return -1; + host_len += 1; + } while (EAT1(state, index, grapheme == '.')); + + return index - start_index; +} + +static int64_t match_ipv6(TextIter_t *state, int64_t index) +{ + if (index > 0) { + int32_t prev_codepoint = Text$get_grapheme_fast(state, index - 1); + if ((prev_codepoint & ~0x7F) && (isxdigit(prev_codepoint) || prev_codepoint == ':')) + return -1; + } + int64_t start_index = index; + const int NUM_CLUSTERS = 8; + bool double_colon_used = false; + for (int cluster = 0; cluster < NUM_CLUSTERS; cluster++) { + for (int digits = 0; digits < 4; digits++) { + if (!EAT1(state, index, ~(grapheme & ~0x7F) && isxdigit((char)grapheme))) + break; + } + if (EAT1(state, index, ~(grapheme & ~0x7F) && isxdigit((char)grapheme))) + return -1; // Too many digits + + if (cluster == NUM_CLUSTERS-1) { + break; + } else if (!EAT1(state, index, grapheme == ':')) { + if (double_colon_used) + break; + return -1; + } + + if (EAT1(state, index, grapheme == ':')) { + if (double_colon_used) + return -1; + double_colon_used = true; + } + } + return index - start_index; +} + +static int64_t match_ipv4(TextIter_t *state, int64_t index) +{ + if (index > 0) { + int32_t prev_codepoint = Text$get_grapheme_fast(state, index - 1); + if ((prev_codepoint & ~0x7F) && (isdigit(prev_codepoint) || prev_codepoint == '.')) + return -1; + } + int64_t start_index = index; + + const int NUM_CLUSTERS = 4; + for (int cluster = 0; cluster < NUM_CLUSTERS; cluster++) { + for (int digits = 0; digits < 3; digits++) { + if (!EAT1(state, index, ~(grapheme & ~0x7F) && isdigit((char)grapheme))) { + if (digits == 0) return -1; + break; + } + } + + if (EAT1(state, index, ~(grapheme & ~0x7F) && isdigit((char)grapheme))) + return -1; // Too many digits + + if (cluster == NUM_CLUSTERS-1) + break; + else if (!EAT1(state, index, grapheme == '.')) + return -1; + } + return (index - start_index); +} + +static int64_t match_ip(TextIter_t *state, int64_t index) +{ + int64_t len = match_ipv6(state, index); + if (len >= 0) return len; + len = match_ipv4(state, index); + return (len >= 0) ? len : -1; +} + +static int64_t match_host(TextIter_t *state, int64_t index) +{ + int64_t ip_len = match_ip(state, index); + if (ip_len > 0) return ip_len; + + int64_t start_index = index; + if (match_grapheme(state, &index, '[')) { + ip_len = match_ip(state, index); + if (ip_len <= 0) return -1; + index += ip_len; + if (match_grapheme(state, &index, ']')) + return (index - start_index); + return -1; + } + + if (!EAT1(state, index, isalpha(grapheme))) + return -1; + + static const char *non_host_chars = "/#?:@ \t\r\n<>[]{}\\^|\"`"; + EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_host_chars, (char)grapheme)); + return (index - start_index); +} + +static int64_t match_authority(TextIter_t *state, int64_t index) +{ + int64_t authority_start = index; + static const char *non_segment_chars = "/#?:@ \t\r\n<>[]{}\\^|\"`."; + + // Optional user@ prefix: + int64_t username_len = EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_segment_chars, (char)grapheme)); + if (username_len < 1 || !EAT1(state, index, grapheme == '@')) + index = authority_start; // No user@ part + + // Host: + int64_t host_len = match_host(state, index); + if (host_len <= 0) return -1; + index += host_len; + + // Port: + if (EAT1(state, index, grapheme == ':')) { + if (EAT_MANY(state, index, !(grapheme & ~0x7F) && isdigit(grapheme)) == 0) + return -1; + } + return (index - authority_start); +} + +static int64_t match_uri(TextIter_t *state, int64_t index) +{ + // URI = scheme ":" ["//" authority] path ["?" query] ["#" fragment] + // scheme = [a-zA-Z] [a-zA-Z0-9+.-] + // authority = [userinfo "@"] host [":" port] + + if (index > 0) { + // Don't match if we're not at a word edge: + uint32_t prev_codepoint = Text$get_main_grapheme_fast(state, index - 1); + if (uc_is_property_alphabetic(prev_codepoint)) + return -1; + } + + int64_t start_index = index; + + // Scheme: + if (!EAT1(state, index, isalpha(grapheme))) + return -1; + EAT_MANY(state, index, !(grapheme & ~0x7F) && (isalnum(grapheme) || grapheme == '+' || grapheme == '.' || grapheme == '-')); + if (!match_grapheme(state, &index, ':')) + return -1; + + // Authority: + int64_t authority_len; + if (match_str(state, &index, "//")) { + authority_len = match_authority(state, index); + if (authority_len > 0) + index += authority_len; + } else { + authority_len = 0; + } + + // Path: + int64_t path_start = index; + if (EAT1(state, index, grapheme == '/') || authority_len <= 0) { + static const char *non_path = " \"#?<>[]{}\\^`|"; + EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_path, (char)grapheme)); + + if (EAT1(state, index, grapheme == '?')) { // Query + static const char *non_query = " \"#<>[]{}\\^`|"; + EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_query, (char)grapheme)); + } + + if (EAT1(state, index, grapheme == '#')) { // Fragment + static const char *non_fragment = " \"#<>[]{}\\^`|"; + EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_fragment, (char)grapheme)); + } + } + + if (authority_len <= 0 && index == path_start) + return -1; + + return index - start_index; +} + +static int64_t match_url(TextIter_t *state, int64_t index) +{ + int64_t lookahead = index; + if (!(match_str(state, &lookahead, "https:") + || match_str(state, &lookahead, "http:") + || match_str(state, &lookahead, "ftp:") + || match_str(state, &lookahead, "wss:") + || match_str(state, &lookahead, "ws:"))) + return -1; + + return match_uri(state, index); +} + +static int64_t match_id(TextIter_t *state, int64_t index) +{ + if (!EAT1(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_XID_START))) + return -1; + return 1 + EAT_MANY(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_XID_CONTINUE)); +} + +static int64_t match_int(TextIter_t *state, int64_t index) +{ + int64_t negative = EAT1(state, index, grapheme == '-') ? 1 : 0; + int64_t len = EAT_MANY(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT)); + return len > 0 ? negative + len : -1; +} + +static int64_t match_alphanumeric(TextIter_t *state, int64_t index) +{ + return EAT1(state, index, uc_is_property_alphabetic((ucs4_t)grapheme) || uc_is_property_numeric((ucs4_t)grapheme)) + ? 1 : -1; +} + +static int64_t match_num(TextIter_t *state, int64_t index) +{ + bool negative = EAT1(state, index, grapheme == '-') ? 1 : 0; + int64_t pre_decimal = EAT_MANY(state, index, + uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT)); + bool decimal = (EAT1(state, index, grapheme == '.') == 1); + int64_t post_decimal = decimal ? EAT_MANY(state, index, + uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT)) : 0; + if (pre_decimal == 0 && post_decimal == 0) + return -1; + return negative + pre_decimal + decimal + post_decimal; +} + +static int64_t match_newline(TextIter_t *state, int64_t index) +{ + if (index >= state->stack[0].text.length) + return -1; + + uint32_t grapheme = index >= state->stack[0].text.length ? 0 : Text$get_main_grapheme_fast(state, index); + if (grapheme == '\n') + return 1; + if (grapheme == '\r' && Text$get_grapheme_fast(state, index + 1) == '\n') + return 2; + return -1; +} + +static int64_t match_pat(TextIter_t *state, int64_t index, pat_t pat) +{ + Text_t text = state->stack[0].text; + int32_t grapheme = index >= text.length ? 0 : Text$get_grapheme_fast(state, index); + + switch (pat.tag) { + case PAT_START: { + if (index == 0) + return pat.negated ? -1 : 0; + return pat.negated ? 0 : -1; + } + case PAT_END: { + if (index >= text.length) + return pat.negated ? -1 : 0; + return pat.negated ? 0 : -1; + } + case PAT_ANY: { + assert(!pat.negated); + return (index < text.length) ? 1 : -1; + } + case PAT_GRAPHEME: { + if (index >= text.length) + return -1; + else if (grapheme == pat.grapheme) + return pat.negated ? -1 : 1; + return pat.negated ? 1 : -1; + } + case PAT_PROPERTY: { + if (index >= text.length) + return -1; + else if (uc_is_property((ucs4_t)grapheme, pat.property)) + return pat.negated ? -1 : 1; + return pat.negated ? 1 : -1; + } + case PAT_PAIR: { + // Nested punctuation: (?), [?], etc + if (index >= text.length) + return -1; + + int32_t open = pat.pair_graphemes[0]; + if (grapheme != open) + return pat.negated ? 1 : -1; + + int32_t close = pat.pair_graphemes[1]; + int64_t depth = 1; + int64_t match_len = 1; + for (; depth > 0; match_len++) { + if (index + match_len >= text.length) + return pat.negated ? 1 : -1; + + int32_t c = Text$get_grapheme_fast(state, index + match_len); + if (c == open) + depth += 1; + else if (c == close) + depth -= 1; + } + return pat.negated ? -1 : match_len; + } + case PAT_QUOTE: { + // Nested quotes: "?", '?', etc + if (index >= text.length) + return -1; + + int32_t open = pat.quote_graphemes[0]; + if (grapheme != open) + return pat.negated ? 1 : -1; + + int32_t close = pat.quote_graphemes[1]; + for (int64_t i = index + 1; i < text.length; i++) { + int32_t c = Text$get_grapheme_fast(state, i); + if (c == close) { + return pat.negated ? -1 : (i - index) + 1; + } else if (c == '\\' && index + 1 < text.length) { + i += 1; // Skip ahead an extra step + } + } + return pat.negated ? 1 : -1; + } + case PAT_FUNCTION: { + int64_t match_len = pat.fn(state, index); + if (match_len >= 0) + return pat.negated ? -1 : match_len; + return pat.negated ? 1 : -1; + } + default: errx(1, "Invalid pattern"); + } + errx(1, "Unreachable"); +} + +static pat_t parse_next_pat(TextIter_t *state, int64_t *index) +{ + if (EAT2(state, *index, + uc_is_property((ucs4_t)grapheme, UC_PROPERTY_QUOTATION_MARK), + grapheme == '?')) { + // Quotations: "?", '?', etc + int32_t open = Text$get_grapheme_fast(state, *index-2); + int32_t close = open; + uc_mirror_char((ucs4_t)open, (ucs4_t*)&close); + if (!match_grapheme(state, index, close)) + fail("Pattern's closing quote is missing: ", state->stack[0].text); + + return (pat_t){ + .tag=PAT_QUOTE, + .min=1, .max=1, + .quote_graphemes={open, close}, + }; + } else if (EAT2(state, *index, + uc_is_property((ucs4_t)grapheme, UC_PROPERTY_PAIRED_PUNCTUATION), + grapheme == '?')) { + // Nested punctuation: (?), [?], etc + int32_t open = Text$get_grapheme_fast(state, *index-2); + int32_t close = open; + uc_mirror_char((ucs4_t)open, (ucs4_t*)&close); + if (!match_grapheme(state, index, close)) + fail("Pattern's closing brace is missing: ", state->stack[0].text); + + return (pat_t){ + .tag=PAT_PAIR, + .min=1, .max=1, + .pair_graphemes={open, close}, + }; + } else if (EAT1(state, *index, grapheme == '{')) { // named patterns {id}, {2-3 hex}, etc. + skip_whitespace(state, index); + int64_t min, max; + if (uc_is_digit((ucs4_t)Text$get_grapheme_fast(state, *index))) { + min = parse_int(state, index); + skip_whitespace(state, index); + if (match_grapheme(state, index, '+')) { + max = INT64_MAX; + } else if (match_grapheme(state, index, '-')) { + max = parse_int(state, index); + } else { + max = min; + } + if (min > max) fail("Minimum repetitions (", min, ") is less than the maximum (", max, ")"); + } else { + min = -1, max = -1; + } + + skip_whitespace(state, index); + + bool negated = match_grapheme(state, index, '!'); +#define PAT(_tag, ...) ((pat_t){.min=min, .max=max, .negated=negated, .tag=_tag, __VA_ARGS__}) + const char *prop_name; + if (match_str(state, index, "..")) + prop_name = ".."; + else + prop_name = get_property_name(state, index); + + if (!prop_name) { + // Literal character, e.g. {1?} + skip_whitespace(state, index); + int32_t grapheme = Text$get_grapheme_fast(state, (*index)++); + if (!match_grapheme(state, index, '}')) + fail("Missing closing '}' in pattern: ", state->stack[0].text); + return PAT(PAT_GRAPHEME, .grapheme=grapheme); + } else if (strlen(prop_name) == 1) { + // Single letter names: {1+ A} + skip_whitespace(state, index); + if (!match_grapheme(state, index, '}')) + fail("Missing closing '}' in pattern: ", state->stack[0].text); + return PAT(PAT_GRAPHEME, .grapheme=prop_name[0]); + } + + skip_whitespace(state, index); + if (!match_grapheme(state, index, '}')) + fail("Missing closing '}' in pattern: ", state->stack[0].text); + + switch (tolower(prop_name[0])) { + case '.': + if (prop_name[1] == '.') { + if (negated) + return ((pat_t){.tag=PAT_END, .min=min, .max=max, .non_capturing=true}); + else + return PAT(PAT_ANY); + } + break; + case 'a': + if (strcasecmp(prop_name, "authority") == 0) { + return PAT(PAT_FUNCTION, .fn=match_authority); + } else if (strcasecmp(prop_name, "alphanum") == 0 || strcasecmp(prop_name, "anum") == 0 + || strcasecmp(prop_name, "alphanumeric") == 0) { + return PAT(PAT_FUNCTION, .fn=match_alphanumeric); + } + break; + case 'c': + if (strcasecmp(prop_name, "crlf") == 0) + return PAT(PAT_FUNCTION, .fn=match_newline); + break; + case 'd': + if (strcasecmp(prop_name, "digit") == 0) { + return PAT(PAT_PROPERTY, .property=UC_PROPERTY_DECIMAL_DIGIT); + } + break; + case 'e': + if (strcasecmp(prop_name, "end") == 0) { + return PAT(PAT_END, .non_capturing=!negated); + } else if (strcasecmp(prop_name, "email") == 0) { + return PAT(PAT_FUNCTION, .fn=match_email); + } +#if _LIBUNISTRING_VERSION >= 0x0100000 + else if (strcasecmp(prop_name, "emoji") == 0) { + return PAT(PAT_PROPERTY, .property=UC_PROPERTY_EMOJI); + } +#endif + break; + case 'h': + if (strcasecmp(prop_name, "host") == 0) { + return PAT(PAT_FUNCTION, .fn=match_host); + } + break; + case 'i': + if (strcasecmp(prop_name, "id") == 0) { + return PAT(PAT_FUNCTION, .fn=match_id); + } else if (strcasecmp(prop_name, "int") == 0) { + return PAT(PAT_FUNCTION, .fn=match_int); + } else if (strcasecmp(prop_name, "ipv4") == 0) { + return PAT(PAT_FUNCTION, .fn=match_ipv4); + } else if (strcasecmp(prop_name, "ipv6") == 0) { + return PAT(PAT_FUNCTION, .fn=match_ipv6); + } else if (strcasecmp(prop_name, "ip") == 0) { + return PAT(PAT_FUNCTION, .fn=match_ip); + } + break; + case 'n': + if (strcasecmp(prop_name, "nl") == 0 || strcasecmp(prop_name, "newline") == 0) { + return PAT(PAT_FUNCTION, .fn=match_newline); + } else if (strcasecmp(prop_name, "num") == 0) { + return PAT(PAT_FUNCTION, .fn=match_num); + } + break; + case 's': + if (strcasecmp(prop_name, "start") == 0) { + return PAT(PAT_START, .non_capturing=!negated); + } + break; + case 'u': + if (strcasecmp(prop_name, "uri") == 0) { + return PAT(PAT_FUNCTION, .fn=match_uri); + } else if (strcasecmp(prop_name, "url") == 0) { + return PAT(PAT_FUNCTION, .fn=match_url); + } + break; + case 'w': + if (strcasecmp(prop_name, "word") == 0) { + return PAT(PAT_FUNCTION, .fn=match_id); + } + break; + default: break; + } + + uc_property_t prop = uc_property_byname(prop_name); + if (uc_property_is_valid(prop)) + return PAT(PAT_PROPERTY, .property=prop); + + ucs4_t grapheme = unicode_name_character(prop_name); + if (grapheme == UNINAME_INVALID) + fail("Not a valid property or character name: ", prop_name); + return PAT(PAT_GRAPHEME, .grapheme=(int32_t)grapheme); +#undef PAT + } else { + return (pat_t){.tag=PAT_GRAPHEME, .non_capturing=true, .min=1, .max=1, .grapheme=Text$get_grapheme_fast(state, (*index)++)}; + } +} + +static int64_t match(Text_t text, int64_t text_index, Text_t pattern, int64_t pattern_index, capture_t *captures, int64_t capture_index) +{ + if (pattern_index >= pattern.length) // End of the pattern + return 0; + + int64_t start_index = text_index; + TextIter_t pattern_state = NEW_TEXT_ITER_STATE(pattern), text_state = NEW_TEXT_ITER_STATE(text); + pat_t pat = parse_next_pat(&pattern_state, &pattern_index); + + if (pat.min == -1 && pat.max == -1) { + if (pat.tag == PAT_ANY && pattern_index >= pattern.length) { + pat.min = pat.max = MAX(1, text.length - text_index); + } else { + pat.min = 1; + pat.max = INT64_MAX; + } + } + + int64_t capture_start = text_index; + int64_t count = 0, capture_len = 0, next_match_len = 0; + + if (pat.tag == PAT_ANY && pattern_index >= pattern.length) { + int64_t remaining = text.length - text_index; + capture_len = remaining >= pat.min ? MIN(remaining, pat.max) : -1; + text_index += capture_len; + goto success; + } + + if (pat.min == 0 && pattern_index < pattern.length) { + next_match_len = match(text, text_index, pattern, pattern_index, captures, capture_index + (pat.non_capturing ? 0 : 1)); + if (next_match_len >= 0) { + capture_len = 0; + goto success; + } + } + + while (count < pat.max) { + int64_t match_len = match_pat(&text_state, text_index, pat); + if (match_len < 0) + break; + capture_len += match_len; + text_index += match_len; + count += 1; + + if (pattern_index < pattern.length) { // More stuff after this + if (count < pat.min) + next_match_len = -1; + else + next_match_len = match(text, text_index, pattern, pattern_index, captures, capture_index + (pat.non_capturing ? 0 : 1)); + } else { + next_match_len = 0; + } + + if (match_len == 0) { + if (next_match_len >= 0) { + // If we're good to go, no need to keep re-matching zero-length + // matches till we hit max: + count = pat.max; + break; + } else { + return -1; + } + } + + if (pattern_index < pattern.length && next_match_len >= 0) + break; // Next guy exists and wants to stop here + + if (text_index >= text.length) + break; + } + + if (count < pat.min || next_match_len < 0) + return -1; + + success: + if (captures && capture_index < MAX_BACKREFS && !pat.non_capturing) { + if (pat.tag == PAT_PAIR || pat.tag == PAT_QUOTE) { + assert(capture_len > 0); + captures[capture_index] = (capture_t){ + .index=capture_start + 1, // Skip leading quote/paren + .length=capture_len - 2, // Skip open/close + .occupied=true, + .recursive=(pat.tag == PAT_PAIR), + }; + } else { + captures[capture_index] = (capture_t){ + .index=capture_start, + .length=capture_len, + .occupied=true, + .recursive=false, + }; + } + } + return (text_index - start_index) + next_match_len; +} + +#undef EAT1 +#undef EAT2 +#undef EAT_MANY + +static int64_t _find(Text_t text, Text_t pattern, int64_t first, int64_t last, int64_t *match_length, capture_t *captures) +{ + int32_t first_grapheme = Text$get_grapheme(pattern, 0); + bool find_first = (first_grapheme != '{' + && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK) + && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); + + TextIter_t text_state = NEW_TEXT_ITER_STATE(text); + for (int64_t i = first; i <= last; i++) { + // Optimization: quickly skip ahead to first char in pattern: + if (find_first) { + while (i < text.length && Text$get_grapheme_fast(&text_state, i) != first_grapheme) + ++i; + } + + int64_t m = match(text, i, pattern, 0, captures, 0); + if (m >= 0) { + if (match_length) + *match_length = m; + return i; + } + } + if (match_length) + *match_length = -1; + return -1; +} + +static OptionalPatternMatch find(Text_t text, Text_t pattern, Int_t from_index) +{ + int64_t first = Int64$from_int(from_index, false); + if (first == 0) fail("Invalid index: 0"); + if (first < 0) first = text.length + first + 1; + if (first > text.length || first < 1) + return NONE_MATCH; + + capture_t captures[MAX_BACKREFS] = {}; + int64_t len = 0; + int64_t found = _find(text, pattern, first-1, text.length-1, &len, captures); + if (found == -1) + return NONE_MATCH; + + List_t capture_list = {}; + for (int i = 0; captures[i].occupied; i++) { + Text_t capture = Text$slice(text, I(captures[i].index+1), I(captures[i].index+captures[i].length)); + List$insert(&capture_list, &capture, I(0), sizeof(Text_t)); + } + return (OptionalPatternMatch){ + .text=Text$slice(text, I(found+1), I(found+len)), + .index=I(found+1), + .captures=capture_list, + }; +} + +PUREFUNC static bool Pattern$has(Text_t text, Text_t pattern) +{ + if (Text$starts_with(pattern, Text("{start}"))) { + int64_t m = match(text, 0, pattern, 0, NULL, 0); + return m >= 0; + } else if (Text$ends_with(text, Text("{end}"))) { + for (int64_t i = text.length-1; i >= 0; i--) { + int64_t match_len = match(text, i, pattern, 0, NULL, 0); + if (match_len >= 0 && i + match_len == text.length) + return true; + } + return false; + } else { + int64_t found = _find(text, pattern, 0, text.length-1, NULL, NULL); + return (found >= 0); + } +} + +static bool Pattern$matches(Text_t text, Text_t pattern) +{ + capture_t captures[MAX_BACKREFS] = {}; + int64_t match_len = match(text, 0, pattern, 0, NULL, 0); + return (match_len == text.length); +} + +static OptionalList_t Pattern$captures(Text_t text, Text_t pattern) +{ + capture_t captures[MAX_BACKREFS] = {}; + int64_t match_len = match(text, 0, pattern, 0, captures, 0); + if (match_len != text.length) + return NONE_LIST; + + List_t capture_list = {}; + for (int i = 0; captures[i].occupied; i++) { + Text_t capture = Text$slice(text, I(captures[i].index+1), I(captures[i].index+captures[i].length)); + List$insert(&capture_list, &capture, I(0), sizeof(Text_t)); + } + return capture_list; +} + +static List_t Pattern$find_all(Text_t text, Text_t pattern) +{ + if (pattern.length == 0) // special case + return (List_t){.length=0}; + + List_t matches = {}; + for (int64_t i = 1; ; ) { + OptionalPatternMatch m = find(text, pattern, I(i)); + if (m.is_none) + break; + i = Int64$from_int(m.index, false) + m.text.length; + List$insert(&matches, &m, I_small(0), sizeof(PatternMatch)); + } + return matches; +} + +typedef struct { + TextIter_t state; + Int_t i; + Text_t pattern; +} match_iter_state_t; + +static OptionalPatternMatch next_match(match_iter_state_t *state) +{ + if (Int64$from_int(state->i, false) > state->state.stack[0].text.length) + return NONE_MATCH; + + OptionalPatternMatch m = find(state->state.stack[0].text, state->pattern, state->i); + if (m.is_none) // No match + state->i = I(state->state.stack[0].text.length + 1); + else + state->i = Int$plus(m.index, I(MAX(1, m.text.length))); + return m; +} + +static Closure_t Pattern$by_match(Text_t text, Text_t pattern) +{ + return (Closure_t){ + .fn=(void*)next_match, + .userdata=new(match_iter_state_t, .state=NEW_TEXT_ITER_STATE(text), .i=I_small(1), .pattern=pattern), + }; +} + +static Text_t apply_backrefs(Text_t text, List_t recursive_replacements, Text_t replacement, Text_t backref_pat, capture_t *captures) +{ + if (backref_pat.length == 0) + return replacement; + + int32_t first_grapheme = Text$get_grapheme(backref_pat, 0); + bool find_first = (first_grapheme != '{' + && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK) + && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); + + Text_t ret = Text(""); + TextIter_t replacement_state = NEW_TEXT_ITER_STATE(replacement); + int64_t nonmatching_pos = 0; + for (int64_t pos = 0; pos < replacement.length; ) { + // Optimization: quickly skip ahead to first char in the backref pattern: + if (find_first) { + while (pos < replacement.length && Text$get_grapheme_fast(&replacement_state, pos) != first_grapheme) + ++pos; + } + + int64_t backref_len = match(replacement, pos, backref_pat, 0, NULL, 0); + if (backref_len < 0) { + pos += 1; + continue; + } + + int64_t after_backref = pos + backref_len; + int64_t backref = parse_int(&replacement_state, &after_backref); + if (after_backref == pos + backref_len) { // Not actually a backref if there's no number + pos += 1; + continue; + } + if (backref < 0 || backref > 9) fail("Invalid backref index: ", backref, " (only 0-", MAX_BACKREFS-1, " are allowed)"); + backref_len = (after_backref - pos); + + if (Text$get_grapheme_fast(&replacement_state, pos + backref_len) == ';') + backref_len += 1; // skip optional semicolon + + if (!captures[backref].occupied) + fail("There is no capture number ", backref, "!"); + + Text_t backref_text = Text$slice(text, I(captures[backref].index+1), I(captures[backref].index + captures[backref].length)); + + if (captures[backref].recursive && recursive_replacements.length > 0) + backref_text = replace_list(backref_text, recursive_replacements, backref_pat, true); + + if (pos > nonmatching_pos) { + Text_t before_slice = Text$slice(replacement, I(nonmatching_pos+1), I(pos)); + ret = Text$concat(ret, before_slice, backref_text); + } else { + ret = Text$concat(ret, backref_text); + } + + pos += backref_len; + nonmatching_pos = pos; + } + if (nonmatching_pos < replacement.length) { + Text_t last_slice = Text$slice(replacement, I(nonmatching_pos+1), I(replacement.length)); + ret = Text$concat(ret, last_slice); + } + return ret; +} + +static Text_t Pattern$replace(Text_t text, Text_t pattern, Text_t replacement, Text_t backref_pat, bool recursive) +{ + Text_t ret = EMPTY_TEXT; + + int32_t first_grapheme = Text$get_grapheme(pattern, 0); + bool find_first = (first_grapheme != '{' + && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK) + && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); + + Text_t entries[2] = {pattern, replacement}; + List_t replacements = { + .data=entries, + .length=1, + .stride=sizeof(entries), + }; + + TextIter_t text_state = NEW_TEXT_ITER_STATE(text); + int64_t nonmatching_pos = 0; + for (int64_t pos = 0; pos < text.length; ) { + // Optimization: quickly skip ahead to first char in pattern: + if (find_first) { + while (pos < text.length && Text$get_grapheme_fast(&text_state, pos) != first_grapheme) + ++pos; + } + + capture_t captures[MAX_BACKREFS] = {}; + int64_t match_len = match(text, pos, pattern, 0, captures, 1); + if (match_len < 0) { + pos += 1; + continue; + } + captures[0] = (capture_t){ + .index = pos, .length = match_len, + .occupied = true, .recursive = false, + }; + + Text_t replacement_text = apply_backrefs(text, recursive ? replacements : (List_t){}, replacement, backref_pat, captures); + if (pos > nonmatching_pos) { + Text_t before_slice = Text$slice(text, I(nonmatching_pos+1), I(pos)); + ret = Text$concat(ret, before_slice, replacement_text); + } else { + ret = Text$concat(ret, replacement_text); + } + nonmatching_pos = pos + match_len; + pos += MAX(match_len, 1); + } + if (nonmatching_pos < text.length) { + Text_t last_slice = Text$slice(text, I(nonmatching_pos+1), I(text.length)); + ret = Text$concat(ret, last_slice); + } + return ret; +} + +static Text_t Pattern$trim(Text_t text, Text_t pattern, bool trim_left, bool trim_right) +{ + int64_t first = 0, last = text.length-1; + if (trim_left) { + int64_t match_len = match(text, 0, pattern, 0, NULL, 0); + if (match_len > 0) + first = match_len; + } + + if (trim_right) { + for (int64_t i = text.length-1; i >= first; i--) { + int64_t match_len = match(text, i, pattern, 0, NULL, 0); + if (match_len > 0 && i + match_len == text.length) + last = i-1; + } + } + return Text$slice(text, I(first+1), I(last+1)); +} + +static Text_t Pattern$map(Text_t text, Text_t pattern, Closure_t fn, bool recursive) +{ + Text_t ret = EMPTY_TEXT; + + int32_t first_grapheme = Text$get_grapheme(pattern, 0); + bool find_first = (first_grapheme != '{' + && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK) + && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); + + TextIter_t text_state = NEW_TEXT_ITER_STATE(text); + int64_t nonmatching_pos = 0; + + Text_t (*text_mapper)(PatternMatch, void*) = fn.fn; + for (int64_t pos = 0; pos < text.length; pos++) { + // Optimization: quickly skip ahead to first char in pattern: + if (find_first) { + while (pos < text.length && Text$get_grapheme_fast(&text_state, pos) != first_grapheme) + ++pos; + } + + capture_t captures[MAX_BACKREFS] = {}; + int64_t match_len = match(text, pos, pattern, 0, captures, 0); + if (match_len < 0) continue; + + PatternMatch m = { + .text=Text$slice(text, I(pos+1), I(pos+match_len)), + .index=I(pos+1), + .captures={}, + }; + for (int i = 0; captures[i].occupied; i++) { + Text_t capture = Text$slice(text, I(captures[i].index+1), I(captures[i].index+captures[i].length)); + if (recursive) + capture = Pattern$map(capture, pattern, fn, recursive); + List$insert(&m.captures, &capture, I(0), sizeof(Text_t)); + } + + Text_t replacement = text_mapper(m, fn.userdata); + if (pos > nonmatching_pos) { + Text_t before_slice = Text$slice(text, I(nonmatching_pos+1), I(pos)); + ret = Text$concat(ret, before_slice, replacement); + } else { + ret = Text$concat(ret, replacement); + } + nonmatching_pos = pos + match_len; + pos += (match_len - 1); + } + if (nonmatching_pos < text.length) { + Text_t last_slice = Text$slice(text, I(nonmatching_pos+1), I(text.length)); + ret = Text$concat(ret, last_slice); + } + return ret; +} + +static void Pattern$each(Text_t text, Text_t pattern, Closure_t fn, bool recursive) +{ + int32_t first_grapheme = Text$get_grapheme(pattern, 0); + bool find_first = (first_grapheme != '{' + && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK) + && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); + + TextIter_t text_state = NEW_TEXT_ITER_STATE(text); + void (*action)(PatternMatch, void*) = fn.fn; + for (int64_t pos = 0; pos < text.length; pos++) { + // Optimization: quickly skip ahead to first char in pattern: + if (find_first) { + while (pos < text.length && Text$get_grapheme_fast(&text_state, pos) != first_grapheme) + ++pos; + } + + capture_t captures[MAX_BACKREFS] = {}; + int64_t match_len = match(text, pos, pattern, 0, captures, 0); + if (match_len < 0) continue; + + PatternMatch m = { + .text=Text$slice(text, I(pos+1), I(pos+match_len)), + .index=I(pos+1), + .captures={}, + }; + for (int i = 0; captures[i].occupied; i++) { + Text_t capture = Text$slice(text, I(captures[i].index+1), I(captures[i].index+captures[i].length)); + if (recursive) + Pattern$each(capture, pattern, fn, recursive); + List$insert(&m.captures, &capture, I(0), sizeof(Text_t)); + } + + action(m, fn.userdata); + pos += (match_len - 1); + } +} + +Text_t replace_list(Text_t text, List_t replacements, Text_t backref_pat, bool recursive) +{ + if (replacements.length == 0) return text; + + Text_t ret = EMPTY_TEXT; + + int64_t nonmatch_pos = 0; + for (int64_t pos = 0; pos < text.length; ) { + // Find the first matching pattern at this position: + for (int64_t i = 0; i < replacements.length; i++) { + Text_t pattern = *(Text_t*)(replacements.data + i*replacements.stride); + capture_t captures[MAX_BACKREFS] = {}; + int64_t len = match(text, pos, pattern, 0, captures, 1); + if (len < 0) continue; + captures[0].index = pos; + captures[0].length = len; + + // If we skipped over some non-matching text before finding a match, insert it here: + if (pos > nonmatch_pos) { + Text_t before_slice = Text$slice(text, I(nonmatch_pos+1), I(pos)); + ret = Text$concat(ret, before_slice); + } + + // Concatenate the replacement: + Text_t replacement = *(Text_t*)(replacements.data + i*replacements.stride + sizeof(Text_t)); + Text_t replacement_text = apply_backrefs(text, recursive ? replacements : (List_t){}, replacement, backref_pat, captures); + ret = Text$concat(ret, replacement_text); + pos += MAX(len, 1); + nonmatch_pos = pos; + goto next_pos; + } + + pos += 1; + next_pos: + continue; + } + + if (nonmatch_pos <= text.length) { + Text_t last_slice = Text$slice(text, I(nonmatch_pos+1), I(text.length)); + ret = Text$concat(ret, last_slice); + } + return ret; +} + +static Text_t Pattern$replace_all(Text_t text, Table_t replacements, Text_t backref_pat, bool recursive) +{ + return replace_list(text, replacements.entries, backref_pat, recursive); +} + +static List_t Pattern$split(Text_t text, Text_t pattern) +{ + if (text.length == 0) // special case + return (List_t){.length=0}; + + if (pattern.length == 0) // special case + return Text$clusters(text); + + List_t chunks = {}; + + int64_t i = 0; + for (;;) { + int64_t len = 0; + int64_t found = _find(text, pattern, i, text.length-1, &len, NULL); + if (found == i && len == 0) + found = _find(text, pattern, i + 1, text.length-1, &len, NULL); + if (found < 0) break; + Text_t chunk = Text$slice(text, I(i+1), I(found)); + List$insert(&chunks, &chunk, I_small(0), sizeof(Text_t)); + i = MAX(found + len, i + 1); + } + + Text_t last_chunk = Text$slice(text, I(i+1), I(text.length)); + List$insert(&chunks, &last_chunk, I_small(0), sizeof(Text_t)); + + return chunks; +} + +typedef struct { + TextIter_t state; + int64_t i; + Text_t pattern; +} split_iter_state_t; + +static OptionalText_t next_split(split_iter_state_t *state) +{ + Text_t text = state->state.stack[0].text; + if (state->i >= text.length) { + if (state->pattern.length > 0 && state->i == text.length) { // special case + state->i = text.length + 1; + return EMPTY_TEXT; + } + return NONE_TEXT; + } + + if (state->pattern.length == 0) { // special case + Text_t ret = Text$cluster(text, I(state->i+1)); + state->i += 1; + return ret; + } + + int64_t start = state->i; + int64_t len = 0; + int64_t found = _find(text, state->pattern, start, text.length-1, &len, NULL); + + if (found == start && len == 0) + found = _find(text, state->pattern, start + 1, text.length-1, &len, NULL); + + if (found >= 0) { + state->i = MAX(found + len, state->i + 1); + return Text$slice(text, I(start+1), I(found)); + } else { + state->i = state->state.stack[0].text.length + 1; + return Text$slice(text, I(start+1), I(text.length)); + } +} + +static Closure_t Pattern$by_split(Text_t text, Text_t pattern) +{ + return (Closure_t){ + .fn=(void*)next_split, + .userdata=new(split_iter_state_t, .state=NEW_TEXT_ITER_STATE(text), .i=0, .pattern=pattern), + }; +} + +static Text_t Pattern$escape_text(Text_t text) +{ + // TODO: optimize for spans of non-escaped text + Text_t ret = EMPTY_TEXT; + TextIter_t state = NEW_TEXT_ITER_STATE(text); + for (int64_t i = 0; i < text.length; i++) { + uint32_t g = Text$get_main_grapheme_fast(&state, i); + if (g == '{') { + ret = Text$concat(ret, Text("{1{}")); + } else if (g == '?' + || uc_is_property_quotation_mark(g) + || (uc_is_property_paired_punctuation(g) && uc_is_property_left_of_pair(g))) { + ret = Text$concat(ret, Text("{1"), Text$slice(text, I(i+1), I(i+1)), Text("}")); + } else { + ret = Text$concat(ret, Text$slice(text, I(i+1), I(i+1))); + } + } + return ret; +} + +static Text_t Pattern$as_text(const void *obj, bool colorize, const TypeInfo_t *info) +{ + (void)info; + if (!obj) return Text("Pattern"); + + Text_t pat = *(Text_t*)obj; + Text_t quote = Pattern$has(pat, Text("/")) && !Pattern$has(pat, Text("|")) ? Text("|") : Text("/"); + return Text$concat(colorize ? Text("\x1b[1m$\033[m") : Text("$"), Text$quoted(pat, colorize, quote)); +} + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/lib/patterns/patterns.tm b/lib/patterns/patterns.tm new file mode 100644 index 00000000..bab0c3dc --- /dev/null +++ b/lib/patterns/patterns.tm @@ -0,0 +1,47 @@ +use ./patterns.c + +struct PatternMatch(text:Text, index:Int, captures:[Text]) + +lang Pat + convert(text:Text -> Pat) + return C_code:Pat(Pattern$escape_text(@text)) + + convert(n:Int -> Pat) + return Pat.from_text("$n") + +extend Text + func matches_pattern(text:Text, pattern:Pat -> Bool) + return C_code:Bool(Pattern$matches(@text, @pattern)) + + func pattern_captures(text:Text, pattern:Pat -> [Text]?) + return C_code:[Text]?(Pattern$captures(@text, @pattern)) + + func replace_pattern(text:Text, pattern:Pat, replacement:Text, backref="@", recursive=yes -> Text) + return C_code:Text(Pattern$replace(@text, @pattern, @replacement, @backref, @recursive)) + + func translate_patterns(text:Text, replacements:{Pat=Text}, backref="@", recursive=yes -> Text) + return C_code:Text(Pattern$replace_all(@text, @replacements, @backref, @recursive)) + + func has_pattern(text:Text, pattern:Pat -> Bool) + return C_code:Bool(Pattern$has(@text, @pattern)) + + func find_patterns(text:Text, pattern:Pat -> [PatternMatch]) + return C_code:[PatternMatch](Pattern$find_all(@text, @pattern)) + + func by_pattern(text:Text, pattern:Pat -> func(->PatternMatch?)) + return C_code:func(->PatternMatch?)(Pattern$by_match(@text, @pattern)) + + func each_pattern(text:Text, pattern:Pat, fn:func(m:PatternMatch), recursive=yes) + C_code { Pattern$each(@text, @pattern, @fn, @recursive); } + + func map_pattern(text:Text, pattern:Pat, fn:func(m:PatternMatch -> Text), recursive=yes -> Text) + return C_code:Text(Pattern$map(@text, @pattern, @fn, @recursive)) + + func split_pattern(text:Text, pattern:Pat -> [Text]) + return C_code:[Text](Pattern$split(@text, @pattern)) + + func by_pattern_split(text:Text, pattern:Pat -> func(->Text?)) + return C_code:func(->Text?)(Pattern$by_split(@text, @pattern)) + + func trim_pattern(text:Text, pattern=$Pat"{space}", left=yes, right=yes -> Text) + return C_code:Text(Pattern$trim(@text, @pattern, @left, @right)) diff --git a/lib/pthreads/pthreads.tm b/lib/pthreads/pthreads.tm new file mode 100644 index 00000000..fee7ce5d --- /dev/null +++ b/lib/pthreads/pthreads.tm @@ -0,0 +1,118 @@ +# A Posix Threads (pthreads) wrapper +use + +struct pthread_mutex_t(; extern, opaque) + func new(->@pthread_mutex_t) + return C_code : @pthread_mutex_t( + pthread_mutex_t *mutex = GC_MALLOC(sizeof(pthread_mutex_t)); + pthread_mutex_init(mutex, NULL); + GC_register_finalizer(mutex, (void*)pthread_mutex_destroy, NULL, NULL, NULL); + mutex + ) + + func lock(m:&pthread_mutex_t) + fail("Failed to lock mutex") unless C_code:Int32(pthread_mutex_lock(@m)) == 0 + + func unlock(m:&pthread_mutex_t) + fail("Failed to unlock mutex") unless C_code:Int32(pthread_mutex_unlock(@m)) == 0 + +struct pthread_cond_t(; extern, opaque) + func new(->@pthread_cond_t) + return C_code : @pthread_cond_t( + pthread_cond_t *cond = GC_MALLOC(sizeof(pthread_cond_t)); + pthread_cond_init(cond, NULL); + GC_register_finalizer(cond, (void*)pthread_cond_destroy, NULL, NULL, NULL); + cond + ) + + func wait(cond:&pthread_cond_t, mutex:&pthread_mutex_t) + fail("Failed to wait on condition") unless C_code:Int32(pthread_cond_wait(@cond, @mutex)) == 0 + + func signal(cond:&pthread_cond_t) + fail("Failed to signal pthread_cond_t") unless C_code:Int32(pthread_cond_signal(@cond)) == 0 + + func broadcast(cond:&pthread_cond_t) + fail("Failed to broadcast pthread_cond_t") unless C_code:Int32(pthread_cond_broadcast(@cond)) == 0 + +struct pthread_rwlock_t(; extern, opaque) + func new(->@pthread_rwlock_t) + return C_code : @pthread_rwlock_t ( + pthread_rwlock_t *lock = GC_MALLOC(sizeof(pthread_rwlock_t)); + pthread_rwlock_init(lock, NULL); + GC_register_finalizer(lock, (void*)pthread_rwlock_destroy, NULL, NULL, NULL); + lock + ) + + func read_lock(lock:&pthread_rwlock_t) + C_code { pthread_rwlock_rdlock(@lock); } + + func write_lock(lock:&pthread_rwlock_t) + C_code { pthread_rwlock_wrlock(@lock); } + + func unlock(lock:&pthread_rwlock_t) + C_code { pthread_rwlock_unlock(@lock); } + +struct pthread_t(; extern, opaque) + func new(fn:func() -> @pthread_t) + return C_code:@pthread_t( + pthread_t *thread = GC_MALLOC(sizeof(pthread_t)); + pthread_create(thread, NULL, @fn.fn, @fn.userdata); + thread + ) + + func join(p:pthread_t) C_code { pthread_join(@p, NULL); } + func cancel(p:pthread_t) C_code { pthread_cancel(@p); } + func detatch(p:pthread_t) C_code { pthread_detach(@p); } + +struct IntQueue(_queue:@[Int], _mutex:@pthread_mutex_t, _cond:@pthread_cond_t) + func new(initial:[Int]=[] -> IntQueue) + return IntQueue(@initial, pthread_mutex_t.new(), pthread_cond_t.new()) + + func give(q:IntQueue, n:Int) + do q._mutex.lock() + q._queue.insert(n) + q._mutex.unlock() + q._cond.signal() + + func take(q:IntQueue -> Int) + do q._mutex.lock() + n := q._queue.pop(1) + while not n + q._cond.wait(q._mutex) + n = q._queue.pop(1) + q._mutex.unlock() + return n! + fail("Unreachable") + +func main() + jobs := IntQueue.new() + results := IntQueue.new() + + say_mutex := pthread_mutex_t.new() + announce := func(speaker:Text, text:Text) + do say_mutex.lock() + say("\[2][$speaker]\[] $text") + say_mutex.unlock() + + worker := pthread_t.new(func() + say("I'm in the thread!") + repeat + announce("worker", "waiting for job") + job := jobs.take() + result := job * 10 + announce("worker", "Jobbing $job into $result") + results.give(result) + announce("worker", "Signaled $result") + ) + + for i in 10 + announce("boss", "Pushing job $i") + jobs.give(i) + announce("boss", "Gave job $i") + + for i in 10 + announce("boss", "Getting result...") + result := results.take() + announce("boss", "Got result $result") + + >> worker.cancel() diff --git a/lib/random/README.md b/lib/random/README.md new file mode 100644 index 00000000..183b9d0b --- /dev/null +++ b/lib/random/README.md @@ -0,0 +1,196 @@ +# Random Number Generators (RNG) + +This library provides an `RNG` type (Random Number Generator). This type +represents a self-contained piece of data that encapsulates the state of a +relatively fast and relatively secure pseudo-random number generator. The +current implementation is based on the [ChaCha20 stream +cipher,](https://en.wikipedia.org/wiki/Salsa20#ChaCha_variant) inspired by +[`arc4random` in OpenBSD.](https://man.openbsd.org/arc4random.3) + +An `RNG` object can be used for deterministic, repeatable generation of +pseudorandom numbers (for example, to be used in a video game for creating +seeded levels). The default random number generator for Tomo is called `random` +and is, by default, initialized with random data from the operating system when +a Tomo program launches. + +## RNG Functions + +This documentation provides details on RNG functions available in the API. +Lists also have some methods which use RNG values: +`list.shuffle()`, `list.shuffled()`, `list.random()`, and `list.sample()`. + +- [`func bool(rng: RNG, p: Num = 0.5 -> Bool)`](#bool) +- [`func byte(rng: RNG -> Byte)`](#byte) +- [`func bytes(rng: RNG, count: Int -> [Byte])`](#bytes) +- [`func copy(rng: RNG -> RNG)`](#copy) +- [`func int(rng: RNG, min: Int, max: Int -> Int)`](#int`, `int64`, `int32`, `int16`, `int8) +- [`func new(seed: [Byte] = (/dev/urandom).read_bytes(40)! -> RNG)`](#new) +- [`func num(rng: RNG, min: Num = 0.0, max: Num = 1.0 -> Num)`](#num`, `num32) + +------------- + +### `bool` +Generate a random boolean value with a given probability. + +```tomo +func bool(rng: RNG, p: Num = 0.5 -> Bool) +``` + +- `rng`: The random number generator to use. +- `p`: The probability of returning a `yes` value. Values less than zero and + `NaN` values are treated as equal to zero and values larger than zero are + treated as equal to one. + +**Returns:** +`yes` with probability `p` and `no` with probability `1-p`. + +**Example:** +```tomo +>> random.bool() += no +>> random.bool(1.0) += yes +``` + +--- + +### `byte` +Generate a random byte with uniform probability. + +```tomo +func byte(rng: RNG -> Byte) +``` + +- `rng`: The random number generator to use. + +**Returns:** +A random byte (0-255). + +**Example:** +```tomo +>> random.byte() += 103[B] +``` + +--- + +### `bytes` +Generate a list of uniformly random bytes with the given length. + +```tomo +func bytes(rng: RNG, count: Int -> [Byte]) +``` + +- `rng`: The random number generator to use. +- `count`: The number of random bytes to return. + +**Returns:** +A list of length `count` random bytes with uniform random distribution (0-255). + +**Example:** +```tomo +>> random.bytes(4) += [135[B], 169[B], 103[B], 212[B]] +``` + +--- + +### `copy` +Return a copy of a random number generator. This copy will be a parallel version of +the given RNG with its own internal state. + +```tomo +func copy(rng: RNG -> RNG) +``` + +- `rng`: The random number generator to copy. + +**Returns:** +A copy of the given RNG. + +**Example:** +```tomo +>> rng := RNG.new([]) +>> copy := rng.copy() + +>> rng.bytes(10) += [224[B], 102[B], 190[B], 59[B], 251[B], 50[B], 217[B], 170[B], 15[B], 221[B]] + +# The copy runs in parallel to the original RNG: +>> copy.bytes(10) += [224[B], 102[B], 190[B], 59[B], 251[B], 50[B], 217[B], 170[B], 15[B], 221[B]] +``` + +--- + +### `int`, `int64`, `int32`, `int16`, `int8` +Generate a random integer value with the given range. + +```tomo +func int(rng: RNG, min: Int, max: Int -> Int) +func int64(rng: RNG, min: Int64 = Int64.min, max: Int64 = Int64.max -> Int) +func int32(rng: RNG, min: Int32 = Int32.min, max: Int32 = Int32.max -> Int) +func int16(rng: RNG, min: Int16 = Int16.min, max: Int16 = Int16.max -> Int) +func int8(rng: RNG, min: Int8 = Int8.min, max: Int8 = Int8.max -> Int) +``` + +- `rng`: The random number generator to use. +- `min`: The minimum value to be returned. +- `max`: The maximum value to be returned. + +**Returns:** +An integer uniformly chosen from the range `[min, max]` (inclusive). If `min` +is greater than `max`, an error will be raised. + +**Example:** +```tomo +>> random.int(1, 10) += 8 +``` + +--- + +### `new` +Return a new random number generator. + +```tomo +func new(seed: [Byte] = (/dev/urandom).read_bytes(40)! -> RNG) +``` + +- `seed`: The seed use for the random number generator. A seed length of 40 + bytes is recommended. Seed lengths of less than 40 bytes are padded with + zeroes. + +**Returns:** +A new random number generator. + +**Example:** +```tomo +>> my_rng := RNG.new([1[B], 2[B], 3[B], 4[B]]) +>> my_rng.bool() += yes +``` + +--- + +### `num`, `num32` +Generate a random floating point value with the given range. + +```tomo +func num(rng: RNG, min: Num = 0.0, max: Num = 1.0 -> Int) +func num32(rng: RNG, min: Num = 0.0_f32, max: Num = 1.0_f32 -> Int) +``` + +- `rng`: The random number generator to use. +- `min`: The minimum value to be returned. +- `max`: The maximum value to be returned. + +**Returns:** +A floating point number uniformly chosen from the range `[min, max]` +(inclusive). If `min` is greater than `max`, an error will be raised. + +**Example:** +```tomo +>> random.num(1, 10) += 9.512830439975572 +``` diff --git a/lib/random/chacha.h b/lib/random/chacha.h new file mode 100644 index 00000000..7df352f9 --- /dev/null +++ b/lib/random/chacha.h @@ -0,0 +1,196 @@ +#pragma once +/* +chacha-merged.c version 20080118 +D. J. Bernstein +Public domain. +*/ + +/* $OpenBSD: chacha_private.h,v 1.3 2022/02/28 21:56:29 dtucker Exp $ */ +/* Tomo: chacha.h,v 1.0 2024/11/03 Bruce Hill */ + +typedef unsigned char u8; +typedef unsigned int u32; + +typedef struct +{ + u32 input[16]; /* could be compressed */ +} chacha_ctx; + +#define KEYSZ 32 +#define IVSZ 8 + +#define U8C(v) (v##U) +#define U32C(v) (v##U) + +#define U8V(v) ((u8)(v) & U8C(0xFF)) +#define U32V(v) ((u32)(v) & U32C(0xFFFFFFFF)) + +#define ROTL32(v, n) \ + (U32V((v) << (n)) | ((v) >> (32 - (n)))) + +#define U8TO32_LITTLE(p) \ + (((u32)((p)[0]) ) | \ + ((u32)((p)[1]) << 8) | \ + ((u32)((p)[2]) << 16) | \ + ((u32)((p)[3]) << 24)) + +#define U32TO8_LITTLE(p, v) \ + do { \ + (p)[0] = U8V((v) ); \ + (p)[1] = U8V((v) >> 8); \ + (p)[2] = U8V((v) >> 16); \ + (p)[3] = U8V((v) >> 24); \ + } while (0) + +#define ROTATE(v, c) (ROTL32(v, c)) +#define XOR(v, w) ((v) ^ (w)) +#define PLUS(v, w) (U32V((v) + (w))) +#define PLUSONE(v) (PLUS((v), 1)) + +#define QUARTERROUND(a, b, c, d) \ + a = PLUS(a, b); d = ROTATE(XOR(d, a), 16); \ + c = PLUS(c, d); b = ROTATE(XOR(b, c), 12); \ + a = PLUS(a, b); d = ROTATE(XOR(d, a), 8); \ + c = PLUS(c, d); b = ROTATE(XOR(b, c), 7); + +static const char sigma[16] = "expand 32-byte k"; + +static void +chacha_keysetup(chacha_ctx *chacha, const u8 *k) +{ + chacha->input[0] = U8TO32_LITTLE(sigma + 0); + chacha->input[1] = U8TO32_LITTLE(sigma + 4); + chacha->input[2] = U8TO32_LITTLE(sigma + 8); + chacha->input[3] = U8TO32_LITTLE(sigma + 12); + chacha->input[4] = U8TO32_LITTLE(k + 0); + chacha->input[5] = U8TO32_LITTLE(k + 4); + chacha->input[6] = U8TO32_LITTLE(k + 8); + chacha->input[7] = U8TO32_LITTLE(k + 12); + chacha->input[8] = U8TO32_LITTLE(k + 16); + chacha->input[9] = U8TO32_LITTLE(k + 20); + chacha->input[10] = U8TO32_LITTLE(k + 24); + chacha->input[11] = U8TO32_LITTLE(k + 28); +} + +static void +chacha_ivsetup(chacha_ctx *chacha, const u8 *iv) +{ + chacha->input[12] = 0; + chacha->input[13] = 0; + chacha->input[14] = U8TO32_LITTLE(iv + 0); + chacha->input[15] = U8TO32_LITTLE(iv + 4); +} + +static void +chacha_encrypt_bytes(chacha_ctx *chacha, const u8 *m, u8 *c, u32 bytes) +{ + u32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; + u32 j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14, j15; + u8 *ctarget = NULL; + u8 tmp[64]; + u_int i; + + if (!bytes) return; + + j0 = chacha->input[0]; + j1 = chacha->input[1]; + j2 = chacha->input[2]; + j3 = chacha->input[3]; + j4 = chacha->input[4]; + j5 = chacha->input[5]; + j6 = chacha->input[6]; + j7 = chacha->input[7]; + j8 = chacha->input[8]; + j9 = chacha->input[9]; + j10 = chacha->input[10]; + j11 = chacha->input[11]; + j12 = chacha->input[12]; + j13 = chacha->input[13]; + j14 = chacha->input[14]; + j15 = chacha->input[15]; + + for (;;) { + if (bytes < 64) { + for (i = 0;i < bytes;++i) tmp[i] = m[i]; + m = tmp; + ctarget = c; + c = tmp; + } + x0 = j0; + x1 = j1; + x2 = j2; + x3 = j3; + x4 = j4; + x5 = j5; + x6 = j6; + x7 = j7; + x8 = j8; + x9 = j9; + x10 = j10; + x11 = j11; + x12 = j12; + x13 = j13; + x14 = j14; + x15 = j15; + for (i = 20;i > 0;i -= 2) { + QUARTERROUND( x0, x4, x8, x12) + QUARTERROUND( x1, x5, x9, x13) + QUARTERROUND( x2, x6, x10, x14) + QUARTERROUND( x3, x7, x11, x15) + QUARTERROUND( x0, x5, x10, x15) + QUARTERROUND( x1, x6, x11, x12) + QUARTERROUND( x2, x7, x8, x13) + QUARTERROUND( x3, x4, x9, x14) + } + x0 = PLUS(x0, j0); + x1 = PLUS(x1, j1); + x2 = PLUS(x2, j2); + x3 = PLUS(x3, j3); + x4 = PLUS(x4, j4); + x5 = PLUS(x5, j5); + x6 = PLUS(x6, j6); + x7 = PLUS(x7, j7); + x8 = PLUS(x8, j8); + x9 = PLUS(x9, j9); + x10 = PLUS(x10, j10); + x11 = PLUS(x11, j11); + x12 = PLUS(x12, j12); + x13 = PLUS(x13, j13); + x14 = PLUS(x14, j14); + x15 = PLUS(x15, j15); + + j12 = PLUSONE(j12); + if (!j12) { + j13 = PLUSONE(j13); + /* stopping at 2^70 bytes per nonce is user's responsibility */ + } + + U32TO8_LITTLE(c + 0, x0); + U32TO8_LITTLE(c + 4, x1); + U32TO8_LITTLE(c + 8, x2); + U32TO8_LITTLE(c + 12, x3); + U32TO8_LITTLE(c + 16, x4); + U32TO8_LITTLE(c + 20, x5); + U32TO8_LITTLE(c + 24, x6); + U32TO8_LITTLE(c + 28, x7); + U32TO8_LITTLE(c + 32, x8); + U32TO8_LITTLE(c + 36, x9); + U32TO8_LITTLE(c + 40, x10); + U32TO8_LITTLE(c + 44, x11); + U32TO8_LITTLE(c + 48, x12); + U32TO8_LITTLE(c + 52, x13); + U32TO8_LITTLE(c + 56, x14); + U32TO8_LITTLE(c + 60, x15); + + if (bytes <= 64) { + if (bytes < 64) { + for (i = 0;i < bytes;++i) ctarget[i] = c[i]; + } + chacha->input[12] = j12; + chacha->input[13] = j13; + return; + } + bytes -= 64; + c += 64; + } +} diff --git a/lib/random/random.tm b/lib/random/random.tm new file mode 100644 index 00000000..14b68d7f --- /dev/null +++ b/lib/random/random.tm @@ -0,0 +1,233 @@ +# Random Number Generator (RNG) implementation based on ChaCha + +use ./sysrandom.h +use ./chacha.h + +struct chacha_ctx(j0,j1,j2,j3,j4,j5,j6,j7,j8,j9,j10,j11,j12,j13,j14,j15:Int32; extern, secret) + func from_seed(seed:[Byte]=[] -> chacha_ctx) + return C_code:chacha_ctx( + chacha_ctx ctx; + uint8_t seed_bytes[KEYSZ + IVSZ] = {}; + for (int64_t i = 0; i < (int64_t)sizeof(seed_bytes); i++) + seed_bytes[i] = i < @seed.length ? *(uint8_t*)(@seed.data + i*@seed.stride) : 0; + chacha_keysetup(&ctx, seed_bytes); + chacha_ivsetup(&ctx, seed_bytes + KEYSZ); + ctx + ) + +random := RandomNumberGenerator.new() + +func _os_random_bytes(count:Int64 -> [Byte]) + return C_code:[Byte]( + uint8_t *random_bytes = GC_MALLOC_ATOMIC(@count); + getrandom(random_bytes, @count, 0); + (List_t){.length=@count, .data=random_bytes, .stride=1, .atomic=1} + ) +struct RandomNumberGenerator(_chacha:chacha_ctx, _random_bytes:[Byte]=[]; secret) + func new(seed:[Byte]?=none, -> @RandomNumberGenerator) + ctx := chacha_ctx.from_seed(seed or _os_random_bytes(40)) + return @RandomNumberGenerator(ctx, []) + + func _rekey(rng:&RandomNumberGenerator) + rng._random_bytes = C_code:[Byte]( + Byte_t new_keystream[KEYSZ + IVSZ] = {}; + // Fill the buffer with the keystream + chacha_encrypt_bytes(&@rng->_chacha, new_keystream, new_keystream, sizeof(new_keystream)); + // Immediately reinitialize for backtracking resistance + chacha_keysetup(&@rng->_chacha, new_keystream); + chacha_ivsetup(&@rng->_chacha, new_keystream + KEYSZ); + List_t new_bytes = (List_t){.data=GC_MALLOC_ATOMIC(1024), .length=1024, .stride=1, .atomic=1}; + memset(new_bytes.data, 0, new_bytes.length); + chacha_encrypt_bytes(&@rng->_chacha, new_bytes.data, new_bytes.data, new_bytes.length); + new_bytes + ) + + func _fill_bytes(rng:&RandomNumberGenerator, dest:&Memory, needed:Int64) + C_code { + while (@needed > 0) { + if (@rng->_random_bytes.length == 0) + @(rng._rekey()); + + assert(@rng->_random_bytes.stride == 1); + + int64_t batch_size = MIN(@needed, @rng->_random_bytes.length); + uint8_t *batch_src = @rng->_random_bytes.data; + memcpy(@dest, batch_src, batch_size); + memset(batch_src, 0, batch_size); + @rng->_random_bytes.data += batch_size; + @rng->_random_bytes.length -= batch_size; + @dest += batch_size; + @needed -= batch_size; + } + } + + func bytes(rng:&RandomNumberGenerator, count:Int -> [Byte]) + count64 := Int64(count) + buf := C_code:@Memory(GC_MALLOC_ATOMIC(@count64)) + rng._fill_bytes(buf, count64) + return C_code:[Byte]((List_t){.data=@buf, .stride=1, .atomic=1, .length=@count64}) + + func byte(rng:&RandomNumberGenerator -> Byte) + byte : &Byte + rng._fill_bytes(byte, 1) + return byte[] + + func bool(rng:&RandomNumberGenerator, probability=0.5 -> Bool) + if probability == 0.5 + return rng.byte() < 0x80 + else + return rng.num(0., 1.) < 0.5 + + func int64(rng:&RandomNumberGenerator, min=Int64.min, max=Int64.max -> Int64) + fail("Random minimum value $min is larger than the maximum value $max") if min > max + return min if min == max + random_int64 : &Int64 + rng._fill_bytes(random_int64, 8) + if min == Int64.min and max == Int64.max + return random_int64 + + return C_code:Int64( + uint64_t range = (uint64_t)@max - (uint64_t)@min + 1; + uint64_t min_r = -range % range; + uint64_t r; + @random_int64 = &r; + for (;;) { + @(rng._fill_bytes(random_int64, 8)); + if (r >= min_r) break; + } + (int64_t)((uint64_t)@min + (r % range)) + ) + + func int32(rng:&RandomNumberGenerator, min=Int32.min, max=Int32.max -> Int32) + fail("Random minimum value $min is larger than the maximum value $max") if min > max + return min if min == max + random_int32 : &Int32 + rng._fill_bytes(random_int32, 8) + if min == Int32.min and max == Int32.max + return random_int32 + + return C_code:Int32( + uint32_t range = (uint32_t)@max - (uint32_t)@min + 1; + uint32_t min_r = -range % range; + uint32_t r; + @random_int32 = &r; + for (;;) { + @(rng._fill_bytes(random_int32, 8)); + if (r >= min_r) break; + } + (int32_t)((uint32_t)@min + (r % range)) + ) + + func int16(rng:&RandomNumberGenerator, min=Int16.min, max=Int16.max -> Int16) + fail("Random minimum value $min is larger than the maximum value $max") if min > max + return min if min == max + random_int16 : &Int16 + rng._fill_bytes(random_int16, 8) + if min == Int16.min and max == Int16.max + return random_int16 + + return C_code:Int16( + uint16_t range = (uint16_t)@max - (uint16_t)@min + 1; + uint16_t min_r = -range % range; + uint16_t r; + @random_int16 = &r; + for (;;) { + @(rng._fill_bytes(random_int16, 8)); + if (r >= min_r) break; + } + (int16_t)((uint16_t)@min + (r % range)) + ) + + func int8(rng:&RandomNumberGenerator, min=Int8.min, max=Int8.max -> Int8) + fail("Random minimum value $min is larger than the maximum value $max") if min > max + return min if min == max + random_int8 : &Int8 + rng._fill_bytes(random_int8, 8) + if min == Int8.min and max == Int8.max + return random_int8 + + return C_code:Int8( + uint8_t range = (uint8_t)@max - (uint8_t)@min + 1; + uint8_t min_r = -range % range; + uint8_t r; + @random_int8 = &r; + for (;;) { + @(rng._fill_bytes(random_int8, 8)); + if (r >= min_r) break; + } + (int8_t)((uint8_t)@min + (r % range)) + ) + + func num(rng:&RandomNumberGenerator, min=0., max=1. -> Num) + num_buf : &Num + return C_code:Num( + if (@min > @max) fail("Random minimum value (", @min, ") is larger than the maximum value (", @max, ")"); + if (@min == @max) return @min; + + union { + Num_t num; + uint64_t bits; + } r = {.bits=0}, one = {.num=1.0}; + @num_buf = &r.num; + @(rng._fill_bytes(num_buf, 8)); + + // Set r.num to 1. + r.bits &= ~(0xFFFULL << 52); + r.bits |= (one.bits & (0xFFFULL << 52)); + + r.num -= 1.0; + + (@min == 0.0 && @max == 1.0) ? r.num : ((1.0-r.num)*@min + r.num*@max) + ) + + func num32(rng:&RandomNumberGenerator, min=Num32(0.), max=Num32(1.) -> Num32) + return Num32(rng.num(Num(min), Num(max))) + + func int(rng:&RandomNumberGenerator, min:Int, max:Int -> Int) + return C_code:Int( + if (likely(((@min.small & @max.small) & 1) != 0)) { + int32_t r = @(rng.int32(Int32(min), Int32(max))); + return I_small(r); + } + + int32_t cmp = @(min <> max); + if (cmp > 0) + fail("Random minimum value (", @min, ") is larger than the maximum value (", @max, ")"); + if (cmp == 0) return @min; + + mpz_t range_size; + mpz_init_set_int(range_size, @max); + if (@min.small & 1) { + mpz_t min_mpz; + mpz_init_set_si(min_mpz, @min.small >> 2); + mpz_sub(range_size, range_size, min_mpz); + } else { + mpz_sub(range_size, range_size, *@min.big); + } + + gmp_randstate_t gmp_rng; + gmp_randinit_default(gmp_rng); + int64_t seed = @(rng.int64()); + gmp_randseed_ui(gmp_rng, (unsigned long)seed); + + mpz_t r; + mpz_init(r); + mpz_urandomm(r, gmp_rng, range_size); + + gmp_randclear(gmp_rng); + Int$plus(@min, Int$from_mpz(r)) + ) + + +func main() + >> rng := RandomNumberGenerator.new() + >> rng.num() + >> rng.num() + >> rng.num() + >> rng.num(0, 100) + >> rng.byte() + >> rng.bytes(20) + # >> rng.int(1, 100) + # >> rng.int(1, 100) + # >> rng.int(1, 100) + # >> rng.int(1, 100) diff --git a/lib/random/sysrandom.h b/lib/random/sysrandom.h new file mode 100644 index 00000000..ea29296b --- /dev/null +++ b/lib/random/sysrandom.h @@ -0,0 +1,14 @@ +#pragma once + +#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) || defined(__APPLE__) +static ssize_t getrandom(void *buf, size_t buflen, unsigned int flags) { + (void)flags; + arc4random_buf(buf, buflen); + return buflen; +} +#elif defined(__linux__) +// Use getrandom() +# include +#else + #error "Unsupported platform for secure random number generation" +#endif diff --git a/lib/shell/shell.tm b/lib/shell/shell.tm new file mode 100644 index 00000000..da03f843 --- /dev/null +++ b/lib/shell/shell.tm @@ -0,0 +1,44 @@ +use commands + +lang Shell + convert(text:Text -> Shell) + return Shell.from_text("'" ++ text.replace($/'/, `'"'"'`) ++ "'") + + convert(texts:[Text] -> Shell) + return Shell.from_text(" ".join([Shell(t).text for t in texts])) + + convert(path:Path -> Shell) + return Shell(Text(path.expand_home())) + + convert(paths:[Path] -> Shell) + return Shell.from_text(" ".join([Shell(Text(p)).text for p in paths])) + + convert(n:Int -> Shell) return Shell.from_text(Text(n)) + convert(n:Int64 -> Shell) return Shell.from_text(Text(n)) + convert(n:Int32 -> Shell) return Shell.from_text(Text(n)) + convert(n:Int16 -> Shell) return Shell.from_text(Text(n)) + convert(n:Int8 -> Shell) return Shell.from_text(Text(n)) + convert(n:Num -> Shell) return Shell.from_text(Text(n)) + convert(n:Num32 -> Shell) return Shell.from_text(Text(n)) + + func command(shell:Shell -> Command) + return Command("sh", ["-c", shell.text]) + + func result(shell:Shell, input="", input_bytes:[Byte]=[] -> ProgramResult) + return shell.command().result(input=input, input_bytes=input_bytes) + + func run(shell:Shell -> ExitType) + return shell.command().run() + + func get_output(shell:Shell, input="", trim_newline=yes -> Text?) + return shell.command().get_output(input=input, trim_newline=trim_newline) + + func get_output_bytes(shell:Shell, input="", input_bytes:[Byte]=[] -> [Byte]?) + return shell.command().get_output_bytes(input=input, input_bytes=input_bytes) + + func by_line(shell:Shell -> func(->Text?)?) + return shell.command().by_line() + +func main(command:Shell) + for line in command.by_line()! + >> line diff --git a/lib/time/README.md b/lib/time/README.md new file mode 100644 index 00000000..55f725f1 --- /dev/null +++ b/lib/time/README.md @@ -0,0 +1,5 @@ +# Time + +This is a Tomo library for working with dates and times. The `Time` type that +is provided is a date-and-time datatype that refers to a specific moment in +time. diff --git a/lib/time/time.tm b/lib/time/time.tm new file mode 100644 index 00000000..1796654d --- /dev/null +++ b/lib/time/time.tm @@ -0,0 +1,210 @@ +# Time - a module for dealing with dates and times +use ./time_defs.h + +enum Weekday(Sunday, Monday, Tuesday, Wednesday, Thursday, Friday, Saturday) + +struct TimeInfo(year,month,day,hour,minute,second,nanosecond:Int, weekday:Weekday, day_of_year:Int, timezone:Text) + +struct Time(tv_sec:Int64, tv_usec:Int64; extern) + func now(->Time) + return C_code : Time ( + struct timespec ts; + if (clock_gettime(CLOCK_REALTIME, &ts) != 0) + fail("Couldn't get the time!"); + (Time){.tv_sec=ts.tv_sec, .tv_usec=ts.tv_nsec/1000} + ) + + func local_timezone(->Text) + C_code { + if (_local_timezone.length < 0) { + static char buf[PATH_MAX]; + ssize_t len = readlink("/etc/localtime", buf, sizeof(buf)); + if (len < 0) + fail("Could not get local tz!"); + + char *zoneinfo = strstr(buf, "/zoneinfo/"); + if (zoneinfo) + _local_timezone = Text$from_str(zoneinfo + strlen("/zoneinfo/")); + else + fail("Could not resolve local tz!"); + } + } + return C_code:Text(_local_timezone) + + func set_local_timezone(timezone:Text) + C_code { + setenv("TZ", @(CString(timezone)), 1); + _local_timezone = @timezone; + tzset(); + } + + func format(t:Time, format="%c", timezone=Time.local_timezone() -> Text) + return C_code : Text ( + struct tm result; + time_t time = @t.tv_sec; + struct tm *final_info; + WITH_TIMEZONE(@timezone, final_info = localtime_r(&time, &result)); + static char buf[256]; + size_t len = strftime(buf, sizeof(buf), String(@format), final_info); + Text$from_strn(buf, len) + ) + + func new(year,month,day:Int, hour=0, minute=0, second=0.0, timezone=Time.local_timezone() -> Time) + return C_code : Time( + struct tm info = { + .tm_min=Int32$from_int(@minute, false), + .tm_hour=Int32$from_int(@hour, false), + .tm_mday=Int32$from_int(@day, false), + .tm_mon=Int32$from_int(@month, false) - 1, + .tm_year=Int32$from_int(@year, false) - 1900, + .tm_isdst=-1, + }; + + time_t t; + WITH_TIMEZONE(@timezone, t = mktime(&info)); + (Time){.tv_sec=t + (time_t)@second, .tv_usec=(suseconds_t)(fmod(@second, 1.0) * 1e9)} + ) + + func unix_timestamp(t:Time -> Int64) + return C_code:Int64((int64_t)@t.tv_sec) + + func from_unix_timestamp(timestamp:Int64 -> Time) + return C_code:Time((Time){.tv_sec=@timestamp};) + + func seconds_till(t:Time, target:Time -> Num) + seconds := Num(target.tv_sec - t.tv_sec) + seconds += 1e-9*Num(target.tv_usec - t.tv_usec) + return seconds + + func minutes_till(t:Time, target:Time -> Num) + return t.seconds_till(target)/60. + + func hours_till(t:Time, target:Time -> Num) + return t.seconds_till(target)/3600. + + func relative(t:Time, relative_to=Time.now(), timezone=Time.local_timezone() -> Text) + C_code { + struct tm info = {}; + struct tm relative_info = {}; + WITH_TIMEZONE(@timezone, { + localtime_r(&@t.tv_sec, &info); + localtime_r(&@relative_to.tv_sec, &relative_info); + }); + double second_diff = @(relative_to.seconds_till(t)); + if (info.tm_year != relative_info.tm_year && fabs(second_diff) > 365.*24.*60.*60.) + return num_format((long)info.tm_year - (long)relative_info.tm_year, "year"); + else if (info.tm_mon != relative_info.tm_mon && fabs(second_diff) > 31.*24.*60.*60.) + return num_format(12*((long)info.tm_year - (long)relative_info.tm_year) + (long)info.tm_mon - (long)relative_info.tm_mon, "month"); + else if (info.tm_yday != relative_info.tm_yday && fabs(second_diff) > 24.*60.*60.) + return num_format(round(second_diff/(24.*60.*60.)), "day"); + else if (info.tm_hour != relative_info.tm_hour && fabs(second_diff) > 60.*60.) + return num_format(round(second_diff/(60.*60.)), "hour"); + else if (info.tm_min != relative_info.tm_min && fabs(second_diff) > 60.) + return num_format(round(second_diff/(60.)), "minute"); + else { + if (fabs(second_diff) < 1e-6) + return num_format((long)(second_diff*1e9), "nanosecond"); + else if (fabs(second_diff) < 1e-3) + return num_format((long)(second_diff*1e6), "microsecond"); + else if (fabs(second_diff) < 1.0) + return num_format((long)(second_diff*1e3), "millisecond"); + else + return num_format((long)(second_diff), "second"); + } + } + fail("Unreachable") + + func time(t:Time, seconds=no, am_pm=yes, timezone=Time.local_timezone() -> Text) + time := if seconds and am_pm + t.format("%l:%M:%S%P") + else if seconds and not am_pm + t.format("%T") + else if not seconds and am_pm + t.format("%l:%M%P") + else + t.format("%H:%M") + return time.trim() + + func date(t:Time, timezone=Time.local_timezone() -> Text) + return t.format("%F") + + func info(t:Time, timezone=Time.local_timezone() -> TimeInfo) + return C_code : TimeInfo ( + struct tm info = {}; + WITH_TIMEZONE(@timezone, localtime_r(&@t.tv_sec, &info)); + (_$time$TimeInfo$$type){ + .year = I(info.tm_year + 1900), + .month = I(info.tm_mon + 1), + .day = I(info.tm_mday), + .hour = I(info.tm_hour), + .minute = I(info.tm_min), + .second = I(info.tm_sec), + .nanosecond = I(@t.tv_usec), + .weekday = (_$time$Weekday$$type){.$tag=info.tm_wday + 1}, + .day_of_year = I(info.tm_yday), + .timezone = @timezone, + } + ) + + func after(t:Time, seconds=0.0, minutes=0.0, hours=0.0, days=0, weeks=0, months=0, years=0, timezone=Time.local_timezone() -> Time) + return C_code : Time ( + double offset = @seconds + 60.*@minutes + 3600.*@hours ; + @t.tv_sec += (time_t)offset; + + struct tm info = {}; + WITH_TIMEZONE(@timezone, localtime_r(&@t.tv_sec, &info)); + + info.tm_mday += Int32$from_int(@days, false) + 7*Int32$from_int(@weeks, false); + info.tm_mon += Int32$from_int(@months, false); + info.tm_year += Int32$from_int(@years, false); + + time_t t = mktime(&info); + (Time){ + .tv_sec=t, + .tv_usec=@t.tv_usec + (suseconds_t)(fmod(offset, 1.0) * 1e9), + } + ) + + func parse(text:Text, format="%Y-%m-%dT%H:%M:%S%z", timezone=Time.local_timezone() -> Time?) + return C_code : Time? ( + struct tm info = {.tm_isdst=-1}; + const char *str = Text$as_c_string(@text); + const char *fmt = Text$as_c_string(@format); + if (strstr(fmt, "%Z")) + fail("The %Z specifier is not supported for time parsing!"); + + char *invalid; + WITH_TIMEZONE(@timezone, invalid = strptime(str, fmt, &info)); + if (!invalid || invalid[0] != '\0') + return (_$time$$OptionalTime$$type){.is_none=true}; + + long offset = info.tm_gmtoff; // Need to cache this because mktime() mutates it to local tz >:( + time_t t; + WITH_TIMEZONE(@timezone, t = mktime(&info)); + (_$time$$OptionalTime$$type){.value={.tv_sec=t + offset - info.tm_gmtoff}}; + ) + +func _run_tests() + >> Time.now().format() + >> Time.set_local_timezone("Europe/Paris") + >> Time.now().format() + >> Time.set_local_timezone("America/New_York") + >> Time.now().format() + # >> Time.now().format(timezone="Europe/Paris") + # >> Time.now().format() + # >> Time.now().format("%Y-%m-%d") + # >> Time.new(2023, 11, 5).format() + # >> Time.local_timezone() + + # >> Time.new(2023, 11, 5).seconds_till(Time.now()) + # >> Time.new(2023, 11, 5).relative() + + # >> Time.now().info() + # >> Time.now().time() + # >> Time.now().date() + + # >> Time.parse("2023-11-05 01:01", "%Y-%m-%d %H:%M") + # >> Time.parse("2023-11-05 01:01", "%Y-%m-%d %H:%M", timezone="Europe/Paris") + +func main() + _run_tests() diff --git a/lib/time/time_defs.h b/lib/time/time_defs.h new file mode 100644 index 00000000..fd8fd4f3 --- /dev/null +++ b/lib/time/time_defs.h @@ -0,0 +1,30 @@ +#pragma once +#include +#include + +typedef struct timeval Time; + +static OptionalText_t _local_timezone = NONE_TEXT; + +static INLINE Text_t num_format(long n, const char *unit) +{ + if (n == 0) + return Text("now"); + return Text$format((n == 1 || n == -1) ? "%ld %s %s" : "%ld %ss %s", n < 0 ? -n : n, unit, n < 0 ? "ago" : "later"); +} + +static void set_local_timezone(Text_t tz) +{ + setenv("TZ", Text$as_c_string(tz), 1); + _local_timezone = tz; + tzset(); +} + +#define WITH_TIMEZONE(tz, body) ({ if (tz.length >= 0) { \ + OptionalText_t old_timezone = _local_timezone; \ + set_local_timezone(tz); \ + body; \ + set_local_timezone(old_timezone); \ + } else { \ + body; \ + }}) diff --git a/lib/uuid/uuid.tm b/lib/uuid/uuid.tm new file mode 100644 index 00000000..002b4808 --- /dev/null +++ b/lib/uuid/uuid.tm @@ -0,0 +1,36 @@ +lang UUID + func v4(-> UUID) # Random UUID + bytes := &random.bytes(16) + bytes[7; unchecked] = 0x40 or (bytes[7; unchecked] and 0x0F) + bytes[9; unchecked] = (Byte(random.int8(0x8, 0xB)) << 4) or (bytes[9; unchecked] and 0x0F) + hex := "".join([b.hex() for b in bytes]) + uuid := "$(hex.slice(1, 8))-$(hex.slice(9, 12))-$(hex.slice(13, 16))-$(hex.slice(17, -1))" + return UUID.from_text(uuid) + + func v7(-> UUID) # Timestamp + random UUID + n := now() + timestamp := n.seconds*1000 + n.microseconds/1_000 + + bytes := [ + Byte((timestamp >> 40)), + Byte((timestamp >> 32)), + Byte((timestamp >> 24)), + Byte((timestamp >> 16)), + Byte((timestamp >> 8)), + Byte(timestamp), + (random.byte() and 0x0F) or 0x70, + random.byte(), + (random.byte() and 0x3F) or 0x80, + random.byte() for _ in 7, + ] + + hex := "".join([b.hex() for b in bytes]) + uuid := "$(hex.slice(1, 8))-$(hex.slice(9, 12))-$(hex.slice(13, 16))-$(hex.slice(17, -1))" + return UUID.from_text(uuid) + +enum UUIDVersion(v4, v7) +func main(version=UUIDVersion.v7) + when version is v4 + say(UUID.v4().text_content) + is v7 + say(UUID.v7().text_content) -- cgit v1.2.3