about summary refs log tree commit diff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/ast.c 1
-rw-r--r-- src/ast.h 5
-rw-r--r-- src/compile.c 51
-rw-r--r-- src/environment.c 39
-rw-r--r-- src/environment.h 1
-rw-r--r-- src/parse.c 53
-rw-r--r-- src/stdlib/README.md 1
-rw-r--r-- src/stdlib/datatypes.h 3
-rw-r--r-- src/stdlib/optionals.c 1
-rw-r--r-- src/stdlib/paths.c 22
-rw-r--r-- src/stdlib/patterns.c 1337
-rw-r--r-- src/stdlib/patterns.h 46
-rw-r--r-- src/stdlib/stdlib.c 1
-rw-r--r-- src/stdlib/text.c 251
-rw-r--r-- src/stdlib/text.h 10
-rw-r--r-- src/stdlib/tomo.h 1
-rw-r--r-- src/tomo.c 8
-rw-r--r-- src/typecheck.c 30
18 files changed, 366 insertions, 1495 deletions
diff --git a/src/ast.c b/src/ast.c
index 5f4e24f1..75795d61 100644
--- a/src/ast.c
+++ b/src/ast.c
@@ -165,6 +165,7 @@ CORD ast_to_xml(ast_t *ast)
T(Use, "<Use>%r%r</Use>", optional_tagged("var", data.var), xml_escape(data.path))
T(InlineCCode, "<InlineCode>%r</InlineCode>", xml_escape(data.code))
T(Deserialize, "<Deserialize><type>%r</type>%r</Deserialize>", type_ast_to_xml(data.type), ast_to_xml(data.value))
+ T(Extend, "<Extend name=\"%s\">%r</Extend>", data.name, ast_to_xml(data.body))
default: return "???";
#undef T
}
diff --git a/src/ast.h b/src/ast.h
index b5b1ad3c..cad7bb64 100644
--- a/src/ast.h
+++ b/src/ast.h
@@ -143,6 +143,7 @@ typedef enum {
Use,
InlineCCode,
Deserialize,
+ Extend,
} ast_e;
struct ast_s {
@@ -331,6 +332,10 @@ struct ast_s {
ast_t *value;
type_ast_t *type;
} Deserialize;
+ struct {
+ const char *name;
+ ast_t *body;
+ } Extend;
} __data;
};
diff --git a/src/compile.c b/src/compile.c
index f228148a..2cc20f39 100644
--- a/src/compile.c
+++ b/src/compile.c
@@ -15,7 +15,6 @@
#include "stdlib/integers.h"
#include "stdlib/nums.h"
#include "stdlib/paths.h"
-#include "stdlib/patterns.h"
#include "stdlib/text.h"
#include "stdlib/util.h"
#include "structs.h"
@@ -39,7 +38,7 @@ static CORD compile_string_literal(CORD literal);
CORD promote_to_optional(type_t *t, CORD code)
{
- if (t == PATH_TYPE || t == PATH_TYPE_TYPE || t == MATCH_TYPE) {
+ if (t == PATH_TYPE || t == PATH_TYPE_TYPE) {
return code;
} else if (t->tag == IntType) {
switch (Match(t, IntType)->bits) {
@@ -442,7 +441,7 @@ static void add_closed_vars(Table_t *closed_vars, env_t *enclosing_scope, env_t
add_closed_vars(closed_vars, enclosing_scope, env, Match(ast, Deserialize)->value);
break;
}
- case Use: case FunctionDef: case ConvertDef: case StructDef: case EnumDef: case LangDef: {
+ case Use: case FunctionDef: case ConvertDef: case StructDef: case EnumDef: case LangDef: case Extend: {
errx(1, "Definitions should not be reachable in a closure.");
}
default:
@@ -497,7 +496,6 @@ PUREFUNC CORD compile_unsigned_type(type_t *t)
CORD compile_type(type_t *t)
{
if (t == RNG_TYPE) return "RNG_t";
- else if (t == MATCH_TYPE) return "Match_t";
else if (t == PATH_TYPE) return "Path_t";
else if (t == PATH_TYPE_TYPE) return "PathType_t";
@@ -516,8 +514,6 @@ CORD compile_type(type_t *t)
auto text = Match(t, TextType);
if (!text->lang || streq(text->lang, "Text"))
return "Text_t";
- else if (streq(text->lang, "Pattern"))
- return "Pattern_t";
else
return CORD_all(namespace_prefix(text->env, text->env->namespace->parent), text->lang, "$$type");
}
@@ -558,8 +554,6 @@ CORD compile_type(type_t *t)
case ArrayType: case TableType: case SetType:
return CORD_all("Optional", compile_type(nonnull));
case StructType: {
- if (nonnull == MATCH_TYPE)
- return "OptionalMatch_t";
if (nonnull == PATH_TYPE)
return "OptionalPath_t";
if (nonnull == PATH_TYPE_TYPE)
@@ -680,7 +674,7 @@ CORD optional_into_nonnone(type_t *t, CORD value)
case IntType:
return CORD_all(value, ".value");
case StructType:
- if (t == MATCH_TYPE || t == PATH_TYPE || t == PATH_TYPE_TYPE)
+ if (t == PATH_TYPE || t == PATH_TYPE_TYPE)
return value;
return CORD_all(value, ".value");
default:
@@ -695,8 +689,6 @@ CORD check_none(type_t *t, CORD value)
// complain about excessive parens around equality comparisons
if (t->tag == PointerType || t->tag == FunctionType || t->tag == CStringType)
return CORD_all("({", value, " == NULL;})");
- else if (t == MATCH_TYPE)
- return CORD_all("({(", value, ").index.small == 0;})");
else if (t == PATH_TYPE)
return CORD_all("({(", value, ").type.$tag == PATH_NONE;})");
else if (t == PATH_TYPE_TYPE)
@@ -1168,7 +1160,7 @@ static CORD _compile_statement(env_t *env, ast_t *ast)
default: code_err(ast, "Update assignments are not implemented for this operation");
}
}
- case StructDef: case EnumDef: case LangDef: case FunctionDef: case ConvertDef: {
+ case StructDef: case EnumDef: case LangDef: case Extend: case FunctionDef: case ConvertDef: {
return CORD_EMPTY;
}
case Skip: {
@@ -1730,8 +1722,13 @@ static CORD _compile_statement(env_t *env, ast_t *ast)
code_err(ast, "Could not find library");
CORD initialization = CORD_EMPTY;
- const char *lib_id = Text$as_c_string(
- Text$replace(Text$from_str(use->path), Pattern("{1+ !alphanumeric}"), Text("_"), Pattern(""), false));
+
+ char *lib_id = String(use->path);
+ for (char *p = lib_id; *p; p++) {
+ if (!isalnum(*p) && *p != '_')
+ *p = '_';
+ }
+
for (size_t i = 0; i < tm_files.gl_pathc; i++) {
const char *filename = tm_files.gl_pathv[i];
initialization = CORD_all(
@@ -2165,7 +2162,6 @@ CORD compile_none(type_t *t)
if (t == PATH_TYPE) return "NONE_PATH";
else if (t == PATH_TYPE_TYPE) return "((OptionalPathType_t){})";
- else if (t == MATCH_TYPE) return "NONE_MATCH";
switch (t->tag) {
case BigIntType: return "NONE_INT";
@@ -2597,8 +2593,6 @@ CORD compile(env_t *env, ast_t *ast)
CORD lang_constructor;
if (!lang || streq(lang, "Text"))
lang_constructor = "Text";
- else if (streq(lang, "Pattern"))
- lang_constructor = lang;
else
lang_constructor = CORD_all(namespace_prefix(Match(text_t, TextType)->env, Match(text_t, TextType)->env->namespace->parent), lang);
@@ -3752,7 +3746,7 @@ CORD compile(env_t *env, ast_t *ast)
case Defer: code_err(ast, "Compiling 'defer' as expression!");
case Extern: code_err(ast, "Externs are not supported as expressions");
case TableEntry: code_err(ast, "Table entries should not be compiled directly");
- case Declare: case Assign: case UpdateAssign: case For: case While: case Repeat: case StructDef: case LangDef:
+ case Declare: case Assign: case UpdateAssign: case For: case While: case Repeat: case StructDef: case LangDef: case Extend:
case EnumDef: case FunctionDef: case ConvertDef: case Skip: case Stop: case Pass: case Return: case DocTest: case PrintStatement:
code_err(ast, "This is not a valid expression");
default: case Unknown: code_err(ast, "Unknown AST");
@@ -3762,7 +3756,6 @@ CORD compile(env_t *env, ast_t *ast)
CORD compile_type_info(type_t *t)
{
if (t == RNG_TYPE) return "&RNG$info";
- else if (t == MATCH_TYPE) return "&Match$info";
else if (t == PATH_TYPE) return "&Path$info";
else if (t == PATH_TYPE_TYPE) return "&PathType$info";
@@ -3773,8 +3766,6 @@ CORD compile_type_info(type_t *t)
auto text = Match(t, TextType);
if (!text->lang || streq(text->lang, "Text"))
return "&Text$info";
- else if (streq(text->lang, "Pattern"))
- return "&Pattern$info";
return CORD_all("(&", namespace_prefix(text->env, text->env->namespace->parent), text->lang, "$$info)");
}
case StructType: {
@@ -4206,6 +4197,12 @@ CORD compile_top_level_code(env_t *env, ast_t *ast)
env_t *ns_env = namespace_env(env, def->name);
return CORD_all(code, def->namespace ? compile_top_level_code(ns_env, def->namespace) : CORD_EMPTY);
}
+ case Extend: {
+ auto extend = Match(ast, Extend);
+ env_t *ns_env = namespace_env(env, extend->name);
+ ns_env->libname = env->libname;
+ return compile_top_level_code(ns_env, extend->body);
+ }
case Extern: return CORD_EMPTY;
case Block: {
CORD code = CORD_EMPTY;
@@ -4258,6 +4255,9 @@ static void initialize_vars_and_statics(env_t *env, ast_t *ast)
} else if (stmt->ast->tag == LangDef) {
initialize_vars_and_statics(namespace_env(env, Match(stmt->ast, LangDef)->name),
Match(stmt->ast, LangDef)->namespace);
+ } else if (stmt->ast->tag == Extend) {
+ initialize_vars_and_statics(namespace_env(env, Match(stmt->ast, Extend)->name),
+ Match(stmt->ast, Extend)->body);
} else if (stmt->ast->tag == Use) {
continue;
} else {
@@ -4348,6 +4348,9 @@ CORD compile_statement_type_header(env_t *env, Path_t header_path, ast_t *ast)
"extern const TypeInfo_t ", full_name, ";\n"
);
}
+ case Extend: {
+ return CORD_EMPTY;
+ }
default:
return CORD_EMPTY;
}
@@ -4364,6 +4367,12 @@ CORD compile_statement_namespace_header(env_t *env, Path_t header_path, ast_t *a
block = def->namespace;
break;
}
+ case Extend: {
+ auto extend = Match(ast, Extend);
+ ns_name = extend->name;
+ block = extend->body;
+ break;
+ }
case StructDef: {
auto def = Match(ast, StructDef);
ns_name = def->name;
diff --git a/src/environment.c b/src/environment.c
index 776e7852..6822502a 100644
--- a/src/environment.c
+++ b/src/environment.c
@@ -13,7 +13,6 @@
#include "typecheck.h"
type_t *TEXT_TYPE = NULL;
-type_t *MATCH_TYPE = NULL;
type_t *RNG_TYPE = NULL;
public type_t *PATH_TYPE = NULL;
public type_t *PATH_TYPE_TYPE = NULL;
@@ -67,7 +66,6 @@ env_t *global_env(void)
(void)bind_type(env, "Int32", Type(IntType, .bits=TYPE_IBITS32));
(void)bind_type(env, "Memory", Type(MemoryType));
PATH_TYPE_TYPE = declare_type(env, "enum PathType(Relative, Absolute, Home)");
- MATCH_TYPE = declare_type(env, "struct Match(text:Text, index:Int, captures:[Text])");
PATH_TYPE = declare_type(env, "struct Path(type:PathType, components:[Text])");
RNG_TYPE = declare_type(env, "struct RNG(state:@Memory)");
@@ -279,13 +277,6 @@ env_t *global_env(void)
#undef F_opt
#undef F
#undef C
- {"Match", MATCH_TYPE, "Match_t", "Match", TypedArray(ns_entry_t,
- // No methods
- )},
- {"Pattern", Type(TextType, .lang="Pattern", .env=namespace_env(env, "Pattern")), "Pattern_t", "Pattern$info", TypedArray(ns_entry_t,
- {"escape_int", "Int$value_as_text", "func(i:Int -> Pattern)"},
- {"escape_text", "Pattern$escape_text", "func(text:Text -> Pattern)"},
- )},
{"PathType", PATH_TYPE_TYPE, "PathType_t", "PathType$info", TypedArray(ns_entry_t,
{"Relative", "((PathType_t){.$tag=PATH_RELATIVE})", "PathType"},
{"Absolute", "((PathType_t){.$tag=PATH_ABSOLUTE})", "PathType"},
@@ -353,44 +344,42 @@ env_t *global_env(void)
{"as_c_string", "Text$as_c_string", "func(text:Text -> CString)"},
{"at", "Text$cluster", "func(text:Text, index:Int -> Text)"},
{"by_line", "Text$by_line", "func(text:Text -> func(->Text?))"},
- {"by_match", "Text$by_match", "func(text:Text, pattern:Pattern -> func(->Match?))"},
- {"by_split", "Text$by_split", "func(text:Text, pattern=$Pattern'' -> func(->Text?))"},
+ {"by_split", "Text$by_split", "func(text:Text, delimiter='' -> func(->Text?))"},
+ {"by_split_any", "Text$by_split_any", "func(text:Text, delimiters=\" $\\t\\r\\n\" -> func(->Text?))"},
{"bytes", "Text$utf8_bytes", "func(text:Text -> [Byte])"},
{"caseless_equals", "Text$equal_ignoring_case", "func(a,b:Text, language='C' -> Bool)"},
{"codepoint_names", "Text$codepoint_names", "func(text:Text -> [Text])"},
{"ends_with", "Text$ends_with", "func(text,suffix:Text -> Bool)"},
- {"each", "Text$each", "func(text:Text, pattern:Pattern, fn:func(match:Match), recursive=yes)"},
- {"find", "Text$find", "func(text:Text, pattern:Pattern, start=1 -> Match?)"},
- {"find_all", "Text$find_all", "func(text:Text, pattern:Pattern -> [Match])"},
{"from", "Text$from", "func(text:Text, first:Int -> Text)"},
{"from_bytes", "Text$from_bytes", "func(bytes:[Byte] -> Text?)"},
{"from_c_string", "Text$from_str", "func(str:CString -> Text?)"},
{"from_codepoint_names", "Text$from_codepoint_names", "func(codepoint_names:[Text] -> Text?)"},
{"from_codepoints", "Text$from_codepoints", "func(codepoints:[Int32] -> Text)"},
{"from_text", "Path$from_text", "func(text:Text -> Path)"},
- {"has", "Text$has", "func(text:Text, pattern:Pattern -> Bool)"},
+ {"has", "Text$has", "func(text:Text, target:Text -> Bool)"},
{"join", "Text$join", "func(glue:Text, pieces:[Text] -> Text)"},
{"left_pad", "Text$left_pad", "func(text:Text, count:Int, pad=' ', language='C' -> Text)"},
{"lines", "Text$lines", "func(text:Text -> [Text])"},
{"lower", "Text$lower", "func(text:Text, language='C' -> Text)"},
- {"map", "Text$map", "func(text:Text, pattern:Pattern, fn:func(match:Match -> Text), recursive=yes -> Text)"},
- {"matches", "Text$matches", "func(text:Text, pattern:Pattern -> [Text]?)"},
{"middle_pad", "Text$middle_pad", "func(text:Text, count:Int, pad=' ', language='C' -> Text)"},
{"quoted", "Text$quoted", "func(text:Text, color=no, quotation_mark='\"' -> Text)"},
{"repeat", "Text$repeat", "func(text:Text, count:Int -> Text)"},
- {"replace", "Text$replace", "func(text:Text, pattern:Pattern, replacement:Text, backref=$/\\/, recursive=yes -> Text)"},
- {"replace_all", "Text$replace_all", "func(text:Text, replacements:{Pattern,Text}, backref=$/\\/, recursive=yes -> Text)"},
+ {"replace", "Text$replace", "func(text:Text, target:Text, replacement:Text -> Text)"},
{"reversed", "Text$reversed", "func(text:Text -> Text)"},
{"right_pad", "Text$right_pad", "func(text:Text, count:Int, pad=' ', language='C' -> Text)"},
{"slice", "Text$slice", "func(text:Text, from=1, to=-1 -> Text)"},
- {"split", "Text$split", "func(text:Text, pattern=$Pattern'' -> [Text])"},
+ {"split", "Text$split", "func(text:Text, delimiter='' -> [Text])"},
+ {"split_any", "Text$split_any", "func(text:Text, delimiters=\" $\\t\\r\\n\" -> [Text])"},
{"starts_with", "Text$starts_with", "func(text,prefix:Text -> Bool)"},
{"title", "Text$title", "func(text:Text, language='C' -> Text)"},
{"to", "Text$to", "func(text:Text, last:Int -> Text)"},
- {"trim", "Text$trim", "func(text:Text, pattern=$/{whitespace}/, trim_left=yes, trim_right=yes -> Text)"},
+ {"translate", "Text$translate", "func(text:Text, translations:{Text,Text} -> Text)"},
+ {"trim", "Text$trim", "func(text:Text, to_trim=\" \t\r\n\", left=yes, right=yes -> Text)"},
{"upper", "Text$upper", "func(text:Text, language='C' -> Text)"},
{"utf32_codepoints", "Text$utf32_codepoints", "func(text:Text -> [Int32])"},
{"width", "Text$width", "func(text:Text, language='C' -> Int)"},
+ {"without_prefix", "Text$without_prefix", "func(text,prefix:Text -> Text)"},
+ {"without_suffix", "Text$without_suffix", "func(text,suffix:Text -> Text)"},
)},
};
@@ -518,9 +507,6 @@ env_t *global_env(void)
{"Num32$from_int64", "func(i:Int64, truncate=no -> Num32)"},
{"Num32$from_int", "func(i:Int, truncate=no -> Num32)"},
{"Num32$from_num", "func(n:Num -> Num32)"});
- ADD_CONSTRUCTORS("Pattern",
- {"Pattern$escape_text", "func(text:Text -> Pattern)"},
- {"Int$value_as_text", "func(i:Int -> Pattern)"});
ADD_CONSTRUCTORS("Path",
{"Path$escape_text", "func(text:Text -> Path)"},
{"Path$escape_path", "func(path:Path -> Path)"},
@@ -534,11 +520,6 @@ env_t *global_env(void)
.ret=PATH_TYPE),
"Path$from_text");
- set_binding(namespace_env(env, "Pattern"), "from_text",
- Type(FunctionType, .args=new(arg_t, .name="text", .type=TEXT_TYPE),
- .ret=Type(TextType, .lang="Pattern", .env=namespace_env(env, "Pattern"))),
- "(Pattern_t)");
-
struct {
const char *name, *code, *type_str;
} global_vars[] = {
diff --git a/src/environment.h b/src/environment.h
index 95e3c3e1..00b8fbba 100644
--- a/src/environment.h
+++ b/src/environment.h
@@ -89,7 +89,6 @@ void set_binding(env_t *env, const char *name, type_t *type, CORD code);
binding_t *get_namespace_binding(env_t *env, ast_t *self, const char *name);
#define code_err(ast, ...) compiler_err((ast)->file, (ast)->start, (ast)->end, __VA_ARGS__)
extern type_t *TEXT_TYPE;
-extern type_t *MATCH_TYPE;
extern type_t *RNG_TYPE;
extern type_t *PATH_TYPE;
extern type_t *PATH_TYPE_TYPE;
diff --git a/src/parse.c b/src/parse.c
index 14221cc0..2e3e2ece 100644
--- a/src/parse.c
+++ b/src/parse.c
@@ -22,7 +22,6 @@
#include "ast.h"
#include "cordhelpers.h"
#include "stdlib/integers.h"
-#include "stdlib/patterns.h"
#include "stdlib/paths.h"
#include "stdlib/print.h"
#include "stdlib/stdlib.h"
@@ -64,7 +63,7 @@ int op_tightness[] = {
static const char *keywords[] = {
"yes", "xor", "while", "when", "use", "unless", "struct", "stop", "skip", "return",
"or", "not", "none", "no", "mod1", "mod", "pass", "lang", "inline", "in", "if",
- "func", "for", "extern", "enum", "else", "do", "deserialize", "defer", "and",
+ "func", "for", "extern", "extend", "enum", "else", "do", "deserialize", "defer", "and",
"_min_", "_max_", NULL,
};
@@ -120,6 +119,7 @@ static PARSER(parse_inline_c);
static PARSER(parse_int);
static PARSER(parse_lambda);
static PARSER(parse_lang_def);
+static PARSER(parse_extend);
static PARSER(parse_namespace);
static PARSER(parse_negative);
static PARSER(parse_not);
@@ -1241,9 +1241,6 @@ PARSER(parse_text) {
open_quote = *pos;
++pos;
close_quote = closing[(int)open_quote] ? closing[(int)open_quote] : open_quote;
-
- if (!lang && (open_quote == '/' || open_quote == '|'))
- lang = "Pattern";
} else {
return NULL;
}
@@ -1904,9 +1901,10 @@ PARSER(parse_namespace) {
if (get_indent(ctx, next) != indent) break;
ast_t *stmt;
if ((stmt=optional(ctx, &pos, parse_struct_def))
+ ||(stmt=optional(ctx, &pos, parse_func_def))
||(stmt=optional(ctx, &pos, parse_enum_def))
||(stmt=optional(ctx, &pos, parse_lang_def))
- ||(stmt=optional(ctx, &pos, parse_func_def))
+ ||(stmt=optional(ctx, &pos, parse_extend))
||(stmt=optional(ctx, &pos, parse_convert_def))
||(stmt=optional(ctx, &pos, parse_use))
||(stmt=optional(ctx, &pos, parse_extern))
@@ -1940,9 +1938,10 @@ PARSER(parse_file_body) {
if (get_indent(ctx, next) != 0) break;
ast_t *stmt;
if ((stmt=optional(ctx, &pos, parse_struct_def))
+ ||(stmt=optional(ctx, &pos, parse_func_def))
||(stmt=optional(ctx, &pos, parse_enum_def))
||(stmt=optional(ctx, &pos, parse_lang_def))
- ||(stmt=optional(ctx, &pos, parse_func_def))
+ ||(stmt=optional(ctx, &pos, parse_extend))
||(stmt=optional(ctx, &pos, parse_convert_def))
||(stmt=optional(ctx, &pos, parse_use))
||(stmt=optional(ctx, &pos, parse_extern))
@@ -2112,6 +2111,32 @@ PARSER(parse_lang_def) {
return NewAST(ctx->file, start, pos, LangDef, .name=name, .namespace=namespace);
}
+// Parse a statement of the form `extend Name: body...` into an Extend AST node.
+// Returns NULL when the text at `pos` does not start with the `extend` keyword,
+// and reports a parser error when no identifier follows the keyword.
+// The body is parsed by parse_namespace; when the `:` or a more deeply indented
+// block is missing, the node gets an empty Block as its body instead.
+PARSER(parse_extend) {
+    const char *start = pos;
+    if (!match_word(&pos, "extend")) return NULL;
+    int64_t starting_indent = get_indent(ctx, pos);
+    spaces(&pos);
+    const char *name = get_id(&pos);
+    if (!name)
+        parser_err(ctx, start, pos, "I expected a name after 'extend'");
+
+    ast_t *body = NULL;
+    if (match(&pos, ":")) {
+        // Probe ahead with a scratch cursor so that `pos` only advances when
+        // what follows is indented more deeply than `starting_indent`.
+        const char *ns_pos = pos;
+        whitespace(&ns_pos);
+        int64_t ns_indent = get_indent(ctx, ns_pos);
+        if (ns_indent > starting_indent) {
+            pos = ns_pos;
+            body = optional(ctx, &pos, parse_namespace);
+        }
+    }
+    // No `:` / no indented block / nothing parsed: fall back to an empty body.
+    if (!body)
+        body = NewAST(ctx->file, pos, pos, Block, .statements=NULL);
+
+    return NewAST(ctx->file, start, pos, Extend, .name=name, .body=body);
+}
+
arg_ast_t *parse_args(parse_ctx_t *ctx, const char **pos)
{
arg_ast_t *args = NULL;
@@ -2373,20 +2398,6 @@ PARSER(parse_use) {
what = USE_LOCAL;
} else {
what = USE_MODULE;
-
- // When `use`ing a URL, convert it to a hash:
- Text_t text = Text$from_str(name);
- Array_t m = Text$matches(text, Pattern("{url}"));
- if (m.length >= 0) {
- text = Text$trim(text, Pattern("http{0-1 s}://"), true, false);
- FILE *shasum = popen(String("echo -n '", text, "' | sha256sum"), "r");
- const size_t HASH_LEN = 32;
- char *hash = GC_MALLOC_ATOMIC(HASH_LEN + 1);
- size_t just_read = fread(hash, sizeof(char), HASH_LEN, shasum);
- if (just_read < HASH_LEN)
- print_err("Failed to get SHA sum for 'use': ", name);
- name = hash;
- }
}
return NewAST(ctx->file, start, pos, Use, .var=var, .path=name, .what=what);
}
diff --git a/src/stdlib/README.md b/src/stdlib/README.md
index 6591ead6..1c72d3d3 100644
--- a/src/stdlib/README.md
+++ b/src/stdlib/README.md
@@ -27,7 +27,6 @@ some common functionality.
- Nums: [nums.h](nums.h), [nums.c](nums.c)
- Optionals: [optionals.h](optionals.h), [optionals.c](optionals.c)
- Paths: [paths.h](paths.h), [paths.c](paths.c)
-- Patterns: [patterns.h](patterns.h), [patterns.c](patterns.c)
- Pointers: [pointers.h](pointers.h), [pointers.c](pointers.c)
- Tables: [tables.h](tables.h), [tables.c](tables.c)
- Text: [text.h](text.h), [text.c](text.c)
diff --git a/src/stdlib/datatypes.h b/src/stdlib/datatypes.h
index b1265fc3..26bd9c3c 100644
--- a/src/stdlib/datatypes.h
+++ b/src/stdlib/datatypes.h
@@ -94,9 +94,6 @@ typedef struct Text_s {
};
} Text_t;
-#define Pattern_t Text_t
-#define OptionalPattern_t Text_t
-
typedef struct {
enum { PATH_NONE, PATH_RELATIVE, PATH_ABSOLUTE, PATH_HOME } $tag;
} PathType_t;
diff --git a/src/stdlib/optionals.c b/src/stdlib/optionals.c
index 797cb111..d3309029 100644
--- a/src/stdlib/optionals.c
+++ b/src/stdlib/optionals.c
@@ -6,7 +6,6 @@
#include "integers.h"
#include "metamethods.h"
#include "nums.h"
-#include "patterns.h"
#include "text.h"
#include "util.h"
diff --git a/src/stdlib/paths.c b/src/stdlib/paths.c
index 05575620..3f27aef7 100644
--- a/src/stdlib/paths.c
+++ b/src/stdlib/paths.c
@@ -24,7 +24,6 @@
#include "integers.h"
#include "optionals.h"
#include "paths.h"
-#include "patterns.h"
#include "structs.h"
#include "text.h"
#include "types.h"
@@ -599,15 +598,10 @@ public PUREFUNC Text_t Path$base_name(Path_t path)
public Text_t Path$extension(Path_t path, bool full)
{
- Text_t base = Path$base_name(path);
- Array_t results = Text$matches(base, full ? Pattern(".{!.}.{..}") : Pattern(".{..}.{!.}{end}"));
- if (results.length > 0)
- return *((Text_t*)(results.data + results.stride*1));
- results = Text$matches(base, full ? Pattern("{!.}.{..}") : Pattern("{..}.{!.}{end}"));
- if (results.length > 0)
- return *((Text_t*)(results.data + results.stride*1));
- else
- return Text("");
+ const char *base = Text$as_c_string(Path$base_name(path));
+ const char *dot = full ? strchr(base + 1, '.') : strrchr(base + 1, '.');
+ const char *extension = dot ? dot + 1 : "";
+ return Text$from_str(extension);
}
public Path_t Path$with_component(Path_t path, Text_t component)
@@ -635,10 +629,10 @@ public Path_t Path$with_extension(Path_t path, Text_t extension, bool replace)
Text_t last = *(Text_t*)(path.components.data + path.components.stride*(path.components.length-1));
Array$remove_at(&result.components, I(-1), I(1), sizeof(Text_t));
if (replace) {
- if (Text$starts_with(last, Text(".")))
- last = Text$replace(last, Pattern(".{!.}.{..}"), Text(".@1"), Pattern("@"), false);
- else
- last = Text$replace(last, Pattern("{!.}.{..}"), Text("@1"), Pattern("@"), false);
+ const char *base = Text$as_c_string(last);
+ const char *dot = strchr(base + 1, '.');
+ if (dot)
+ last = Text$from_strn(base, (size_t)(dot - base));
}
last = Text$concat(last, extension);
diff --git a/src/stdlib/patterns.c b/src/stdlib/patterns.c
deleted file mode 100644
index b7891f88..00000000
--- a/src/stdlib/patterns.c
+++ /dev/null
@@ -1,1337 +0,0 @@
-// Logic for text pattern matching
-
-#include <ctype.h>
-#include <sys/param.h>
-#include <unictype.h>
-#include <uniname.h>
-#include <unistring/version.h>
-
-#include "arrays.h"
-#include "integers.h"
-#include "optionals.h"
-#include "patterns.h"
-#include "structs.h"
-#include "tables.h"
-#include "text.h"
-#include "types.h"
-
-#define MAX_BACKREFS 100
-
-typedef struct {
- int64_t index, length;
- bool occupied, recursive;
-} capture_t;
-
-typedef struct {
- enum { PAT_START, PAT_END, PAT_ANY, PAT_GRAPHEME, PAT_PROPERTY, PAT_QUOTE, PAT_PAIR, PAT_FUNCTION } tag;
- bool negated, non_capturing;
- int64_t min, max;
- union {
- int32_t grapheme;
- uc_property_t property;
- int64_t (*fn)(TextIter_t *, int64_t);
- int32_t quote_graphemes[2];
- int32_t pair_graphemes[2];
- };
-} pat_t;
-
-static Text_t Text$replace_array(Text_t text, Array_t replacements, Text_t backref_pat, bool recursive);
-
-static INLINE void skip_whitespace(TextIter_t *state, int64_t *i)
-{
- while (*i < state->stack[0].text.length) {
- int32_t grapheme = Text$get_grapheme_fast(state, *i);
- if (grapheme > 0 && !uc_is_property_white_space((ucs4_t)grapheme))
- return;
- *i += 1;
- }
-}
-
-static INLINE bool match_grapheme(TextIter_t *state, int64_t *i, int32_t grapheme)
-{
- if (*i < state->stack[0].text.length && Text$get_grapheme_fast(state, *i) == grapheme) {
- *i += 1;
- return true;
- }
- return false;
-}
-
-static INLINE bool match_str(TextIter_t *state, int64_t *i, const char *str)
-{
- int64_t matched = 0;
- while (matched[str]) {
- if (*i + matched >= state->stack[0].text.length || Text$get_grapheme_fast(state, *i + matched) != str[matched])
- return false;
- matched += 1;
- }
- *i += matched;
- return true;
-}
-
-static int64_t parse_int(TextIter_t *state, int64_t *i)
-{
- int64_t value = 0;
- for (;; *i += 1) {
- uint32_t grapheme = Text$get_main_grapheme_fast(state, *i);
- int digit = uc_digit_value(grapheme);
- if (digit < 0) break;
- if (value >= INT64_MAX/10) break;
- value = 10*value + digit;
- }
- return value;
-}
-
-static const char *get_property_name(TextIter_t *state, int64_t *i)
-{
- skip_whitespace(state, i);
- char *name = GC_MALLOC_ATOMIC(UNINAME_MAX);
- char *dest = name;
- while (*i < state->stack[0].text.length) {
- int32_t grapheme = Text$get_grapheme_fast(state, *i);
- if (!(grapheme & ~0xFF) && (isalnum(grapheme) || grapheme == ' ' || grapheme == '_' || grapheme == '-')) {
- *dest = (char)grapheme;
- ++dest;
- if (dest >= name + UNINAME_MAX - 1)
- break;
- } else {
- break;
- }
- *i += 1;
- }
-
- while (dest > name && dest[-1] == ' ')
- *(dest--) = '\0';
-
- if (dest == name) return NULL;
- *dest = '\0';
- return name;
-}
-
-#define EAT1(state, index, cond) ({\
- int32_t grapheme = Text$get_grapheme_fast(state, index); \
- bool success = (cond); \
- if (success) index += 1; \
- success; })
-
-#define EAT2(state, index, cond1, cond2) ({\
- int32_t grapheme = Text$get_grapheme_fast(state, index); \
- bool success = (cond1); \
- if (success) { \
- grapheme = Text$get_grapheme_fast(state, index + 1); \
- success = (cond2); \
- if (success) \
- index += 2; \
- } \
- success; })
-
-
-#define EAT_MANY(state, index, cond) ({ int64_t _n = 0; while (EAT1(state, index, cond)) { _n += 1; } _n; })
-
-static int64_t match_email(TextIter_t *state, int64_t index)
-{
- // email = local "@" domain
- // local = 1-64 ([a-zA-Z0-9!#$%&‘*+–/=?^_`.{|}~] | non-ascii)
- // domain = dns-label ("." dns-label)*
- // dns-label = 1-63 ([a-zA-Z0-9-] | non-ascii)
-
- if (index > 0) {
- uint32_t prev_codepoint = Text$get_main_grapheme_fast(state, index - 1);
- if (uc_is_property_alphabetic(prev_codepoint))
- return -1;
- }
-
- int64_t start_index = index;
-
- // Local part:
- int64_t local_len = 0;
- static const char *allowed_local = "!#$%&‘*+–/=?^_`.{|}~";
- while (EAT1(state, index,
- (grapheme & ~0x7F) || isalnum((char)grapheme) || strchr(allowed_local, (char)grapheme))) {
- local_len += 1;
- if (local_len > 64) return -1;
- }
-
- if (!EAT1(state, index, grapheme == '@'))
- return -1;
-
- // Host
- int64_t host_len = 0;
- do {
- int64_t label_len = 0;
- while (EAT1(state, index,
- (grapheme & ~0x7F) || isalnum((char)grapheme) || grapheme == '-')) {
- label_len += 1;
- if (label_len > 63) return -1;
- }
-
- if (label_len == 0)
- return -1;
-
- host_len += label_len;
- if (host_len > 255)
- return -1;
- host_len += 1;
- } while (EAT1(state, index, grapheme == '.'));
-
- return index - start_index;
-}
-
-static int64_t match_ipv6(TextIter_t *state, int64_t index)
-{
- if (index > 0) {
- int32_t prev_codepoint = Text$get_grapheme_fast(state, index - 1);
- if ((prev_codepoint & ~0x7F) && (isxdigit(prev_codepoint) || prev_codepoint == ':'))
- return -1;
- }
- int64_t start_index = index;
- const int NUM_CLUSTERS = 8;
- bool double_colon_used = false;
- for (int cluster = 0; cluster < NUM_CLUSTERS; cluster++) {
- for (int digits = 0; digits < 4; digits++) {
- if (!EAT1(state, index, ~(grapheme & ~0x7F) && isxdigit((char)grapheme)))
- break;
- }
- if (EAT1(state, index, ~(grapheme & ~0x7F) && isxdigit((char)grapheme)))
- return -1; // Too many digits
-
- if (cluster == NUM_CLUSTERS-1) {
- break;
- } else if (!EAT1(state, index, grapheme == ':')) {
- if (double_colon_used)
- break;
- return -1;
- }
-
- if (EAT1(state, index, grapheme == ':')) {
- if (double_colon_used)
- return -1;
- double_colon_used = true;
- }
- }
- return index - start_index;
-}
-
-static int64_t match_ipv4(TextIter_t *state, int64_t index)
-{
- if (index > 0) {
- int32_t prev_codepoint = Text$get_grapheme_fast(state, index - 1);
- if ((prev_codepoint & ~0x7F) && (isdigit(prev_codepoint) || prev_codepoint == '.'))
- return -1;
- }
- int64_t start_index = index;
-
- const int NUM_CLUSTERS = 4;
- for (int cluster = 0; cluster < NUM_CLUSTERS; cluster++) {
- for (int digits = 0; digits < 3; digits++) {
- if (!EAT1(state, index, ~(grapheme & ~0x7F) && isdigit((char)grapheme))) {
- if (digits == 0) return -1;
- break;
- }
- }
-
- if (EAT1(state, index, ~(grapheme & ~0x7F) && isdigit((char)grapheme)))
- return -1; // Too many digits
-
- if (cluster == NUM_CLUSTERS-1)
- break;
- else if (!EAT1(state, index, grapheme == '.'))
- return -1;
- }
- return (index - start_index);
-}
-
-static int64_t match_ip(TextIter_t *state, int64_t index)
-{
- int64_t len = match_ipv6(state, index);
- if (len >= 0) return len;
- len = match_ipv4(state, index);
- return (len >= 0) ? len : -1;
-}
-
-static int64_t match_host(TextIter_t *state, int64_t index)
-{
- int64_t ip_len = match_ip(state, index);
- if (ip_len > 0) return ip_len;
-
- int64_t start_index = index;
- if (match_grapheme(state, &index, '[')) {
- ip_len = match_ip(state, index);
- if (ip_len <= 0) return -1;
- index += ip_len;
- if (match_grapheme(state, &index, ']'))
- return (index - start_index);
- return -1;
- }
-
- if (!EAT1(state, index, isalpha(grapheme)))
- return -1;
-
- static const char *non_host_chars = "/#?:@ \t\r\n<>[]{}\\^|\"`";
- EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_host_chars, (char)grapheme));
- return (index - start_index);
-}
-
-static int64_t match_authority(TextIter_t *state, int64_t index)
-{
- int64_t authority_start = index;
- static const char *non_segment_chars = "/#?:@ \t\r\n<>[]{}\\^|\"`.";
-
- // Optional user@ prefix:
- int64_t username_len = EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_segment_chars, (char)grapheme));
- if (username_len < 1 || !EAT1(state, index, grapheme == '@'))
- index = authority_start; // No user@ part
-
- // Host:
- int64_t host_len = match_host(state, index);
- if (host_len <= 0) return -1;
- index += host_len;
-
- // Port:
- if (EAT1(state, index, grapheme == ':')) {
- if (EAT_MANY(state, index, !(grapheme & ~0x7F) && isdigit(grapheme)) == 0)
- return -1;
- }
- return (index - authority_start);
-}
-
-static int64_t match_uri(TextIter_t *state, int64_t index)
-{
- // URI = scheme ":" ["//" authority] path ["?" query] ["#" fragment]
- // scheme = [a-zA-Z] [a-zA-Z0-9+.-]
- // authority = [userinfo "@"] host [":" port]
-
- if (index > 0) {
- // Don't match if we're not at a word edge:
- uint32_t prev_codepoint = Text$get_main_grapheme_fast(state, index - 1);
- if (uc_is_property_alphabetic(prev_codepoint))
- return -1;
- }
-
- int64_t start_index = index;
-
- // Scheme:
- if (!EAT1(state, index, isalpha(grapheme)))
- return -1;
- EAT_MANY(state, index, !(grapheme & ~0x7F) && (isalnum(grapheme) || grapheme == '+' || grapheme == '.' || grapheme == '-'));
- if (!match_grapheme(state, &index, ':'))
- return -1;
-
- // Authority:
- int64_t authority_len;
- if (match_str(state, &index, "//")) {
- authority_len = match_authority(state, index);
- if (authority_len > 0)
- index += authority_len;
- } else {
- authority_len = 0;
- }
-
- // Path:
- int64_t path_start = index;
- if (EAT1(state, index, grapheme == '/') || authority_len <= 0) {
- static const char *non_path = " \"#?<>[]{}\\^`|";
- EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_path, (char)grapheme));
-
- if (EAT1(state, index, grapheme == '?')) { // Query
- static const char *non_query = " \"#<>[]{}\\^`|";
- EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_query, (char)grapheme));
- }
-
- if (EAT1(state, index, grapheme == '#')) { // Fragment
- static const char *non_fragment = " \"#<>[]{}\\^`|";
- EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_fragment, (char)grapheme));
- }
- }
-
- if (authority_len <= 0 && index == path_start)
- return -1;
-
- return index - start_index;
-}
-
-static int64_t match_url(TextIter_t *state, int64_t index)
-{
- int64_t lookahead = index;
- if (!(match_str(state, &lookahead, "https:")
- || match_str(state, &lookahead, "http:")
- || match_str(state, &lookahead, "ftp:")
- || match_str(state, &lookahead, "wss:")
- || match_str(state, &lookahead, "ws:")))
- return -1;
-
- return match_uri(state, index);
-}
-
-static int64_t match_id(TextIter_t *state, int64_t index)
-{
- if (!EAT1(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_XID_START)))
- return -1;
- return 1 + EAT_MANY(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_XID_CONTINUE));
-}
-
-static int64_t match_int(TextIter_t *state, int64_t index)
-{
- int64_t negative = EAT1(state, index, grapheme == '-') ? 1 : 0;
- int64_t len = EAT_MANY(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT));
- return len > 0 ? negative + len : -1;
-}
-
-static int64_t match_alphanumeric(TextIter_t *state, int64_t index)
-{
- return EAT1(state, index, uc_is_property_alphabetic((ucs4_t)grapheme) || uc_is_property_numeric((ucs4_t)grapheme))
- ? 1 : -1;
-}
-
-static int64_t match_num(TextIter_t *state, int64_t index)
-{
- bool negative = EAT1(state, index, grapheme == '-') ? 1 : 0;
- int64_t pre_decimal = EAT_MANY(state, index,
- uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT));
- bool decimal = (EAT1(state, index, grapheme == '.') == 1);
- int64_t post_decimal = decimal ? EAT_MANY(state, index,
- uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT)) : 0;
- if (pre_decimal == 0 && post_decimal == 0)
- return -1;
- return negative + pre_decimal + decimal + post_decimal;
-}
-
-static int64_t match_newline(TextIter_t *state, int64_t index)
-{
- if (index >= state->stack[0].text.length)
- return -1;
-
- uint32_t grapheme = index >= state->stack[0].text.length ? 0 : Text$get_main_grapheme_fast(state, index);
- if (grapheme == '\n')
- return 1;
- if (grapheme == '\r' && Text$get_grapheme_fast(state, index + 1) == '\n')
- return 2;
- return -1;
-}
-
-static int64_t match_pat(TextIter_t *state, int64_t index, pat_t pat)
-{
- Text_t text = state->stack[0].text;
- int32_t grapheme = index >= text.length ? 0 : Text$get_grapheme_fast(state, index);
-
- switch (pat.tag) {
- case PAT_START: {
- if (index == 0)
- return pat.negated ? -1 : 0;
- return pat.negated ? 0 : -1;
- }
- case PAT_END: {
- if (index >= text.length)
- return pat.negated ? -1 : 0;
- return pat.negated ? 0 : -1;
- }
- case PAT_ANY: {
- assert(!pat.negated);
- return (index < text.length) ? 1 : -1;
- }
- case PAT_GRAPHEME: {
- if (index >= text.length)
- return -1;
- else if (grapheme == pat.grapheme)
- return pat.negated ? -1 : 1;
- return pat.negated ? 1 : -1;
- }
- case PAT_PROPERTY: {
- if (index >= text.length)
- return -1;
- else if (uc_is_property((ucs4_t)grapheme, pat.property))
- return pat.negated ? -1 : 1;
- return pat.negated ? 1 : -1;
- }
- case PAT_PAIR: {
- // Nested punctuation: (?), [?], etc
- if (index >= text.length)
- return -1;
-
- int32_t open = pat.pair_graphemes[0];
- if (grapheme != open)
- return pat.negated ? 1 : -1;
-
- int32_t close = pat.pair_graphemes[1];
- int64_t depth = 1;
- int64_t match_len = 1;
- for (; depth > 0; match_len++) {
- if (index + match_len >= text.length)
- return pat.negated ? 1 : -1;
-
- int32_t c = Text$get_grapheme_fast(state, index + match_len);
- if (c == open)
- depth += 1;
- else if (c == close)
- depth -= 1;
- }
- return pat.negated ? -1 : match_len;
- }
- case PAT_QUOTE: {
- // Nested quotes: "?", '?', etc
- if (index >= text.length)
- return -1;
-
- int32_t open = pat.quote_graphemes[0];
- if (grapheme != open)
- return pat.negated ? 1 : -1;
-
- int32_t close = pat.quote_graphemes[1];
- for (int64_t i = index + 1; i < text.length; i++) {
- int32_t c = Text$get_grapheme_fast(state, i);
- if (c == close) {
- return pat.negated ? -1 : (i - index) + 1;
- } else if (c == '\\' && index + 1 < text.length) {
- i += 1; // Skip ahead an extra step
- }
- }
- return pat.negated ? 1 : -1;
- }
- case PAT_FUNCTION: {
- int64_t match_len = pat.fn(state, index);
- if (match_len >= 0)
- return pat.negated ? -1 : match_len;
- return pat.negated ? 1 : -1;
- }
- default: errx(1, "Invalid pattern");
- }
- errx(1, "Unreachable");
-}
-
-static pat_t parse_next_pat(TextIter_t *state, int64_t *index)
-{
- if (EAT2(state, *index,
- uc_is_property((ucs4_t)grapheme, UC_PROPERTY_QUOTATION_MARK),
- grapheme == '?')) {
- // Quotations: "?", '?', etc
- int32_t open = Text$get_grapheme_fast(state, *index-2);
- int32_t close = open;
- uc_mirror_char((ucs4_t)open, (ucs4_t*)&close);
- if (!match_grapheme(state, index, close))
- fail("Pattern's closing quote is missing: ", state->stack[0].text);
-
- return (pat_t){
- .tag=PAT_QUOTE,
- .min=1, .max=1,
- .quote_graphemes={open, close},
- };
- } else if (EAT2(state, *index,
- uc_is_property((ucs4_t)grapheme, UC_PROPERTY_PAIRED_PUNCTUATION),
- grapheme == '?')) {
- // Nested punctuation: (?), [?], etc
- int32_t open = Text$get_grapheme_fast(state, *index-2);
- int32_t close = open;
- uc_mirror_char((ucs4_t)open, (ucs4_t*)&close);
- if (!match_grapheme(state, index, close))
- fail("Pattern's closing brace is missing: ", state->stack[0].text);
-
- return (pat_t){
- .tag=PAT_PAIR,
- .min=1, .max=1,
- .pair_graphemes={open, close},
- };
- } else if (EAT1(state, *index, grapheme == '{')) { // named patterns {id}, {2-3 hex}, etc.
- skip_whitespace(state, index);
- int64_t min, max;
- if (uc_is_digit((ucs4_t)Text$get_grapheme_fast(state, *index))) {
- min = parse_int(state, index);
- skip_whitespace(state, index);
- if (match_grapheme(state, index, '+')) {
- max = INT64_MAX;
- } else if (match_grapheme(state, index, '-')) {
- max = parse_int(state, index);
- } else {
- max = min;
- }
- if (min > max) fail("Minimum repetitions (", min, ") is less than the maximum (", max, ")");
- } else {
- min = -1, max = -1;
- }
-
- skip_whitespace(state, index);
-
- bool negated = match_grapheme(state, index, '!');
-#define PAT(_tag, ...) ((pat_t){.min=min, .max=max, .negated=negated, .tag=_tag, __VA_ARGS__})
- const char *prop_name;
- if (match_str(state, index, ".."))
- prop_name = "..";
- else
- prop_name = get_property_name(state, index);
-
- if (!prop_name) {
- // Literal character, e.g. {1?}
- skip_whitespace(state, index);
- int32_t grapheme = Text$get_grapheme_fast(state, (*index)++);
- if (!match_grapheme(state, index, '}'))
- fail("Missing closing '}' in pattern: ", state->stack[0].text);
- return PAT(PAT_GRAPHEME, .grapheme=grapheme);
- } else if (strlen(prop_name) == 1) {
- // Single letter names: {1+ A}
- skip_whitespace(state, index);
- if (!match_grapheme(state, index, '}'))
- fail("Missing closing '}' in pattern: ", state->stack[0].text);
- return PAT(PAT_GRAPHEME, .grapheme=prop_name[0]);
- }
-
- skip_whitespace(state, index);
- if (!match_grapheme(state, index, '}'))
- fail("Missing closing '}' in pattern: ", state->stack[0].text);
-
- switch (tolower(prop_name[0])) {
- case '.':
- if (prop_name[1] == '.') {
- if (negated)
- return ((pat_t){.tag=PAT_END, .min=min, .max=max, .non_capturing=true});
- else
- return PAT(PAT_ANY);
- }
- break;
- case 'a':
- if (strcasecmp(prop_name, "authority") == 0) {
- return PAT(PAT_FUNCTION, .fn=match_authority);
- } else if (strcasecmp(prop_name, "alphanum") == 0 || strcasecmp(prop_name, "anum") == 0
- || strcasecmp(prop_name, "alphanumeric") == 0) {
- return PAT(PAT_FUNCTION, .fn=match_alphanumeric);
- }
- break;
- case 'c':
- if (strcasecmp(prop_name, "crlf") == 0)
- return PAT(PAT_FUNCTION, .fn=match_newline);
- break;
- case 'd':
- if (strcasecmp(prop_name, "digit") == 0) {
- return PAT(PAT_PROPERTY, .property=UC_PROPERTY_DECIMAL_DIGIT);
- }
- break;
- case 'e':
- if (strcasecmp(prop_name, "end") == 0) {
- return PAT(PAT_END, .non_capturing=!negated);
- } else if (strcasecmp(prop_name, "email") == 0) {
- return PAT(PAT_FUNCTION, .fn=match_email);
- }
-#if _LIBUNISTRING_VERSION >= 0x0100000
- else if (strcasecmp(prop_name, "emoji") == 0) {
- return PAT(PAT_PROPERTY, .property=UC_PROPERTY_EMOJI);
- }
-#endif
- break;
- case 'h':
- if (strcasecmp(prop_name, "host") == 0) {
- return PAT(PAT_FUNCTION, .fn=match_host);
- }
- break;
- case 'i':
- if (strcasecmp(prop_name, "id") == 0) {
- return PAT(PAT_FUNCTION, .fn=match_id);
- } else if (strcasecmp(prop_name, "int") == 0) {
- return PAT(PAT_FUNCTION, .fn=match_int);
- } else if (strcasecmp(prop_name, "ipv4") == 0) {
- return PAT(PAT_FUNCTION, .fn=match_ipv4);
- } else if (strcasecmp(prop_name, "ipv6") == 0) {
- return PAT(PAT_FUNCTION, .fn=match_ipv6);
- } else if (strcasecmp(prop_name, "ip") == 0) {
- return PAT(PAT_FUNCTION, .fn=match_ip);
- }
- break;
- case 'n':
- if (strcasecmp(prop_name, "nl") == 0 || strcasecmp(prop_name, "newline") == 0) {
- return PAT(PAT_FUNCTION, .fn=match_newline);
- } else if (strcasecmp(prop_name, "num") == 0) {
- return PAT(PAT_FUNCTION, .fn=match_num);
- }
- break;
- case 's':
- if (strcasecmp(prop_name, "start") == 0) {
- return PAT(PAT_START, .non_capturing=!negated);
- }
- break;
- case 'u':
- if (strcasecmp(prop_name, "uri") == 0) {
- return PAT(PAT_FUNCTION, .fn=match_uri);
- } else if (strcasecmp(prop_name, "url") == 0) {
- return PAT(PAT_FUNCTION, .fn=match_url);
- }
- break;
- case 'w':
- if (strcasecmp(prop_name, "word") == 0) {
- return PAT(PAT_FUNCTION, .fn=match_id);
- }
- break;
- default: break;
- }
-
- uc_property_t prop = uc_property_byname(prop_name);
- if (uc_property_is_valid(prop))
- return PAT(PAT_PROPERTY, .property=prop);
-
- ucs4_t grapheme = unicode_name_character(prop_name);
- if (grapheme == UNINAME_INVALID)
- fail("Not a valid property or character name: ", prop_name);
- return PAT(PAT_GRAPHEME, .grapheme=(int32_t)grapheme);
-#undef PAT
- } else {
- return (pat_t){.tag=PAT_GRAPHEME, .non_capturing=true, .min=1, .max=1, .grapheme=Text$get_grapheme_fast(state, (*index)++)};
- }
-}
-
-static int64_t match(Text_t text, int64_t text_index, Pattern_t pattern, int64_t pattern_index, capture_t *captures, int64_t capture_index)
-{
- if (pattern_index >= pattern.length) // End of the pattern
- return 0;
-
- int64_t start_index = text_index;
- TextIter_t pattern_state = NEW_TEXT_ITER_STATE(pattern), text_state = NEW_TEXT_ITER_STATE(text);
- pat_t pat = parse_next_pat(&pattern_state, &pattern_index);
-
- if (pat.min == -1 && pat.max == -1) {
- if (pat.tag == PAT_ANY && pattern_index >= pattern.length) {
- pat.min = pat.max = MAX(1, text.length - text_index);
- } else {
- pat.min = 1;
- pat.max = INT64_MAX;
- }
- }
-
- int64_t capture_start = text_index;
- int64_t count = 0, capture_len = 0, next_match_len = 0;
-
- if (pat.tag == PAT_ANY && pattern_index >= pattern.length) {
- int64_t remaining = text.length - text_index;
- capture_len = remaining >= pat.min ? MIN(remaining, pat.max) : -1;
- text_index += capture_len;
- goto success;
- }
-
- if (pat.min == 0 && pattern_index < pattern.length) {
- next_match_len = match(text, text_index, pattern, pattern_index, captures, capture_index + (pat.non_capturing ? 0 : 1));
- if (next_match_len >= 0) {
- capture_len = 0;
- goto success;
- }
- }
-
- while (count < pat.max) {
- int64_t match_len = match_pat(&text_state, text_index, pat);
- if (match_len < 0)
- break;
- capture_len += match_len;
- text_index += match_len;
- count += 1;
-
- if (pattern_index < pattern.length) { // More stuff after this
- if (count < pat.min)
- next_match_len = -1;
- else
- next_match_len = match(text, text_index, pattern, pattern_index, captures, capture_index + (pat.non_capturing ? 0 : 1));
- } else {
- next_match_len = 0;
- }
-
- if (match_len == 0) {
- if (next_match_len >= 0) {
- // If we're good to go, no need to keep re-matching zero-length
- // matches till we hit max:
- count = pat.max;
- break;
- } else {
- return -1;
- }
- }
-
- if (pattern_index < pattern.length && next_match_len >= 0)
- break; // Next guy exists and wants to stop here
-
- if (text_index >= text.length)
- break;
- }
-
- if (count < pat.min || next_match_len < 0)
- return -1;
-
- success:
- if (captures && capture_index < MAX_BACKREFS && !pat.non_capturing) {
- if (pat.tag == PAT_PAIR || pat.tag == PAT_QUOTE) {
- assert(capture_len > 0);
- captures[capture_index] = (capture_t){
- .index=capture_start + 1, // Skip leading quote/paren
- .length=capture_len - 2, // Skip open/close
- .occupied=true,
- .recursive=(pat.tag == PAT_PAIR),
- };
- } else {
- captures[capture_index] = (capture_t){
- .index=capture_start,
- .length=capture_len,
- .occupied=true,
- .recursive=false,
- };
- }
- }
- return (text_index - start_index) + next_match_len;
-}
-
-#undef EAT1
-#undef EAT2
-#undef EAT_MANY
-
-static int64_t _find(Text_t text, Pattern_t pattern, int64_t first, int64_t last, int64_t *match_length, capture_t *captures)
-{
- int32_t first_grapheme = Text$get_grapheme(pattern, 0);
- bool find_first = (first_grapheme != '{'
- && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK)
- && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
-
- TextIter_t text_state = NEW_TEXT_ITER_STATE(text);
- for (int64_t i = first; i <= last; i++) {
- // Optimization: quickly skip ahead to first char in pattern:
- if (find_first) {
- while (i < text.length && Text$get_grapheme_fast(&text_state, i) != first_grapheme)
- ++i;
- }
-
- int64_t m = match(text, i, pattern, 0, captures, 0);
- if (m >= 0) {
- if (match_length)
- *match_length = m;
- return i;
- }
- }
- if (match_length)
- *match_length = -1;
- return -1;
-}
-
-public OptionalMatch_t Text$find(Text_t text, Pattern_t pattern, Int_t from_index)
-{
- int64_t first = Int64$from_int(from_index, false);
- if (first == 0) fail("Invalid index: 0");
- if (first < 0) first = text.length + first + 1;
- if (first > text.length || first < 1)
- return NONE_MATCH;
-
- capture_t captures[MAX_BACKREFS] = {};
- int64_t len = 0;
- int64_t found = _find(text, pattern, first-1, text.length-1, &len, captures);
- if (found == -1)
- return NONE_MATCH;
-
- Array_t capture_array = {};
- for (int i = 0; captures[i].occupied; i++) {
- Text_t capture = Text$slice(text, I(captures[i].index+1), I(captures[i].index+captures[i].length));
- Array$insert(&capture_array, &capture, I(0), sizeof(Text_t));
- }
- return (OptionalMatch_t){
- .text=Text$slice(text, I(found+1), I(found+len)),
- .index=I(found+1),
- .captures=capture_array,
- };
-}
-
-PUREFUNC public bool Text$has(Text_t text, Pattern_t pattern)
-{
- if (Text$starts_with(pattern, Text("{start}"))) {
- int64_t m = match(text, 0, pattern, 0, NULL, 0);
- return m >= 0;
- } else if (Text$ends_with(text, Text("{end}"))) {
- for (int64_t i = text.length-1; i >= 0; i--) {
- int64_t match_len = match(text, i, pattern, 0, NULL, 0);
- if (match_len >= 0 && i + match_len == text.length)
- return true;
- }
- return false;
- } else {
- int64_t found = _find(text, pattern, 0, text.length-1, NULL, NULL);
- return (found >= 0);
- }
-}
-
-public OptionalArray_t Text$matches(Text_t text, Pattern_t pattern)
-{
- capture_t captures[MAX_BACKREFS] = {};
- int64_t match_len = match(text, 0, pattern, 0, captures, 0);
- if (match_len != text.length)
- return NONE_ARRAY;
-
- Array_t capture_array = {};
- for (int i = 0; captures[i].occupied; i++) {
- Text_t capture = Text$slice(text, I(captures[i].index+1), I(captures[i].index+captures[i].length));
- Array$insert(&capture_array, &capture, I(0), sizeof(Text_t));
- }
- return capture_array;
-}
-
-public Array_t Text$find_all(Text_t text, Pattern_t pattern)
-{
- if (pattern.length == 0) // special case
- return (Array_t){.length=0};
-
- Array_t matches = {};
- for (int64_t i = 1; ; ) {
- OptionalMatch_t m = Text$find(text, pattern, I(i));
- if (!m.index.small)
- break;
- i = Int64$from_int(m.index, false) + m.text.length;
- Array$insert(&matches, &m, I_small(0), sizeof(Match_t));
- }
- return matches;
-}
-
-typedef struct {
- TextIter_t state;
- Int_t i;
- Pattern_t pattern;
-} match_iter_state_t;
-
-static OptionalMatch_t next_match(match_iter_state_t *state)
-{
- if (Int64$from_int(state->i, false) > state->state.stack[0].text.length)
- return NONE_MATCH;
-
- OptionalMatch_t m = Text$find(state->state.stack[0].text, state->pattern, state->i);
- if (m.index.small == 0) // No match
- state->i = I(state->state.stack[0].text.length + 1);
- else
- state->i = Int$plus(m.index, I(MAX(1, m.text.length)));
- return m;
-}
-
-public Closure_t Text$by_match(Text_t text, Pattern_t pattern)
-{
- return (Closure_t){
- .fn=(void*)next_match,
- .userdata=new(match_iter_state_t, .state=NEW_TEXT_ITER_STATE(text), .i=I_small(1), .pattern=pattern),
- };
-}
-
-static Text_t apply_backrefs(Text_t text, Array_t recursive_replacements, Text_t replacement, Pattern_t backref_pat, capture_t *captures)
-{
- if (backref_pat.length == 0)
- return replacement;
-
- int32_t first_grapheme = Text$get_grapheme(backref_pat, 0);
- bool find_first = (first_grapheme != '{'
- && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK)
- && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
-
- Text_t ret = Text("");
- TextIter_t replacement_state = NEW_TEXT_ITER_STATE(replacement);
- int64_t nonmatching_pos = 0;
- for (int64_t pos = 0; pos < replacement.length; ) {
- // Optimization: quickly skip ahead to first char in the backref pattern:
- if (find_first) {
- while (pos < replacement.length && Text$get_grapheme_fast(&replacement_state, pos) != first_grapheme)
- ++pos;
- }
-
- int64_t backref_len = match(replacement, pos, backref_pat, 0, NULL, 0);
- if (backref_len < 0) {
- pos += 1;
- continue;
- }
-
- int64_t after_backref = pos + backref_len;
- int64_t backref = parse_int(&replacement_state, &after_backref);
- if (after_backref == pos + backref_len) { // Not actually a backref if there's no number
- pos += 1;
- continue;
- }
- if (backref < 0 || backref > 9) fail("Invalid backref index: ", backref, " (only 0-", MAX_BACKREFS-1, " are allowed)");
- backref_len = (after_backref - pos);
-
- if (Text$get_grapheme_fast(&replacement_state, pos + backref_len) == ';')
- backref_len += 1; // skip optional semicolon
-
- if (!captures[backref].occupied)
- fail("There is no capture number ", backref, "!");
-
- Text_t backref_text = Text$slice(text, I(captures[backref].index+1), I(captures[backref].index + captures[backref].length));
-
- if (captures[backref].recursive && recursive_replacements.length > 0)
- backref_text = Text$replace_array(backref_text, recursive_replacements, backref_pat, true);
-
- if (pos > nonmatching_pos) {
- Text_t before_slice = Text$slice(replacement, I(nonmatching_pos+1), I(pos));
- ret = Text$concat(ret, before_slice, backref_text);
- } else {
- ret = Text$concat(ret, backref_text);
- }
-
- pos += backref_len;
- nonmatching_pos = pos;
- }
- if (nonmatching_pos < replacement.length) {
- Text_t last_slice = Text$slice(replacement, I(nonmatching_pos+1), I(replacement.length));
- ret = Text$concat(ret, last_slice);
- }
- return ret;
-}
-
-public Text_t Text$replace(Text_t text, Pattern_t pattern, Text_t replacement, Pattern_t backref_pat, bool recursive)
-{
- Text_t ret = EMPTY_TEXT;
-
- int32_t first_grapheme = Text$get_grapheme(pattern, 0);
- bool find_first = (first_grapheme != '{'
- && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK)
- && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
-
- Text_t entries[2] = {pattern, replacement};
- Array_t replacements = {
- .data=entries,
- .length=1,
- .stride=sizeof(entries),
- };
-
- TextIter_t text_state = NEW_TEXT_ITER_STATE(text);
- int64_t nonmatching_pos = 0;
- for (int64_t pos = 0; pos < text.length; ) {
- // Optimization: quickly skip ahead to first char in pattern:
- if (find_first) {
- while (pos < text.length && Text$get_grapheme_fast(&text_state, pos) != first_grapheme)
- ++pos;
- }
-
- capture_t captures[MAX_BACKREFS] = {};
- int64_t match_len = match(text, pos, pattern, 0, captures, 1);
- if (match_len < 0) {
- pos += 1;
- continue;
- }
- captures[0] = (capture_t){
- .index = pos, .length = match_len,
- .occupied = true, .recursive = false,
- };
-
- Text_t replacement_text = apply_backrefs(text, recursive ? replacements : (Array_t){}, replacement, backref_pat, captures);
- if (pos > nonmatching_pos) {
- Text_t before_slice = Text$slice(text, I(nonmatching_pos+1), I(pos));
- ret = Text$concat(ret, before_slice, replacement_text);
- } else {
- ret = Text$concat(ret, replacement_text);
- }
- nonmatching_pos = pos + match_len;
- pos += MAX(match_len, 1);
- }
- if (nonmatching_pos < text.length) {
- Text_t last_slice = Text$slice(text, I(nonmatching_pos+1), I(text.length));
- ret = Text$concat(ret, last_slice);
- }
- return ret;
-}
-
-public Text_t Text$trim(Text_t text, Pattern_t pattern, bool trim_left, bool trim_right)
-{
- int64_t first = 0, last = text.length-1;
- if (trim_left) {
- int64_t match_len = match(text, 0, pattern, 0, NULL, 0);
- if (match_len > 0)
- first = match_len;
- }
-
- if (trim_right) {
- for (int64_t i = text.length-1; i >= first; i--) {
- int64_t match_len = match(text, i, pattern, 0, NULL, 0);
- if (match_len > 0 && i + match_len == text.length)
- last = i-1;
- }
- }
- return Text$slice(text, I(first+1), I(last+1));
-}
-
-public Text_t Text$map(Text_t text, Pattern_t pattern, Closure_t fn, bool recursive)
-{
- Text_t ret = EMPTY_TEXT;
-
- int32_t first_grapheme = Text$get_grapheme(pattern, 0);
- bool find_first = (first_grapheme != '{'
- && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK)
- && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
-
- TextIter_t text_state = NEW_TEXT_ITER_STATE(text);
- int64_t nonmatching_pos = 0;
-
- Text_t (*text_mapper)(Match_t, void*) = fn.fn;
- for (int64_t pos = 0; pos < text.length; pos++) {
- // Optimization: quickly skip ahead to first char in pattern:
- if (find_first) {
- while (pos < text.length && Text$get_grapheme_fast(&text_state, pos) != first_grapheme)
- ++pos;
- }
-
- capture_t captures[MAX_BACKREFS] = {};
- int64_t match_len = match(text, pos, pattern, 0, captures, 0);
- if (match_len < 0) continue;
-
- Match_t m = {
- .text=Text$slice(text, I(pos+1), I(pos+match_len)),
- .index=I(pos+1),
- .captures={},
- };
- for (int i = 0; captures[i].occupied; i++) {
- Text_t capture = Text$slice(text, I(captures[i].index+1), I(captures[i].index+captures[i].length));
- if (recursive)
- capture = Text$map(capture, pattern, fn, recursive);
- Array$insert(&m.captures, &capture, I(0), sizeof(Text_t));
- }
-
- Text_t replacement = text_mapper(m, fn.userdata);
- if (pos > nonmatching_pos) {
- Text_t before_slice = Text$slice(text, I(nonmatching_pos+1), I(pos));
- ret = Text$concat(ret, before_slice, replacement);
- } else {
- ret = Text$concat(ret, replacement);
- }
- nonmatching_pos = pos + match_len;
- pos += (match_len - 1);
- }
- if (nonmatching_pos < text.length) {
- Text_t last_slice = Text$slice(text, I(nonmatching_pos+1), I(text.length));
- ret = Text$concat(ret, last_slice);
- }
- return ret;
-}
-
-public void Text$each(Text_t text, Pattern_t pattern, Closure_t fn, bool recursive)
-{
- int32_t first_grapheme = Text$get_grapheme(pattern, 0);
- bool find_first = (first_grapheme != '{'
- && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK)
- && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
-
- TextIter_t text_state = NEW_TEXT_ITER_STATE(text);
- void (*action)(Match_t, void*) = fn.fn;
- for (int64_t pos = 0; pos < text.length; pos++) {
- // Optimization: quickly skip ahead to first char in pattern:
- if (find_first) {
- while (pos < text.length && Text$get_grapheme_fast(&text_state, pos) != first_grapheme)
- ++pos;
- }
-
- capture_t captures[MAX_BACKREFS] = {};
- int64_t match_len = match(text, pos, pattern, 0, captures, 0);
- if (match_len < 0) continue;
-
- Match_t m = {
- .text=Text$slice(text, I(pos+1), I(pos+match_len)),
- .index=I(pos+1),
- .captures={},
- };
- for (int i = 0; captures[i].occupied; i++) {
- Text_t capture = Text$slice(text, I(captures[i].index+1), I(captures[i].index+captures[i].length));
- if (recursive)
- Text$each(capture, pattern, fn, recursive);
- Array$insert(&m.captures, &capture, I(0), sizeof(Text_t));
- }
-
- action(m, fn.userdata);
- pos += (match_len - 1);
- }
-}
-
-Text_t Text$replace_array(Text_t text, Array_t replacements, Text_t backref_pat, bool recursive)
-{
- if (replacements.length == 0) return text;
-
- Text_t ret = EMPTY_TEXT;
-
- int64_t nonmatch_pos = 0;
- for (int64_t pos = 0; pos < text.length; ) {
- // Find the first matching pattern at this position:
- for (int64_t i = 0; i < replacements.length; i++) {
- Pattern_t pattern = *(Pattern_t*)(replacements.data + i*replacements.stride);
- capture_t captures[MAX_BACKREFS] = {};
- int64_t len = match(text, pos, pattern, 0, captures, 1);
- if (len < 0) continue;
- captures[0].index = pos;
- captures[0].length = len;
-
- // If we skipped over some non-matching text before finding a match, insert it here:
- if (pos > nonmatch_pos) {
- Text_t before_slice = Text$slice(text, I(nonmatch_pos+1), I(pos));
- ret = Text$concat(ret, before_slice);
- }
-
- // Concatenate the replacement:
- Text_t replacement = *(Text_t*)(replacements.data + i*replacements.stride + sizeof(Text_t));
- Text_t replacement_text = apply_backrefs(text, recursive ? replacements : (Array_t){}, replacement, backref_pat, captures);
- ret = Text$concat(ret, replacement_text);
- pos += MAX(len, 1);
- nonmatch_pos = pos;
- goto next_pos;
- }
-
- pos += 1;
- next_pos:
- continue;
- }
-
- if (nonmatch_pos <= text.length) {
- Text_t last_slice = Text$slice(text, I(nonmatch_pos+1), I(text.length));
- ret = Text$concat(ret, last_slice);
- }
- return ret;
-}
-
-public Text_t Text$replace_all(Text_t text, Table_t replacements, Text_t backref_pat, bool recursive)
-{
- return Text$replace_array(text, replacements.entries, backref_pat, recursive);
-}
-
-public Array_t Text$split(Text_t text, Pattern_t pattern)
-{
- if (text.length == 0) // special case
- return (Array_t){.length=0};
-
- if (pattern.length == 0) // special case
- return Text$clusters(text);
-
- Array_t chunks = {};
-
- int64_t i = 0;
- for (;;) {
- int64_t len = 0;
- int64_t found = _find(text, pattern, i, text.length-1, &len, NULL);
- if (found == i && len == 0)
- found = _find(text, pattern, i + 1, text.length-1, &len, NULL);
- if (found < 0) break;
- Text_t chunk = Text$slice(text, I(i+1), I(found));
- Array$insert(&chunks, &chunk, I_small(0), sizeof(Text_t));
- i = MAX(found + len, i + 1);
- }
-
- Text_t last_chunk = Text$slice(text, I(i+1), I(text.length));
- Array$insert(&chunks, &last_chunk, I_small(0), sizeof(Text_t));
-
- return chunks;
-}
-
-typedef struct {
- TextIter_t state;
- int64_t i;
- Pattern_t pattern;
-} split_iter_state_t;
-
-static OptionalText_t next_split(split_iter_state_t *state)
-{
- Text_t text = state->state.stack[0].text;
- if (state->i >= text.length) {
- if (state->pattern.length > 0 && state->i == text.length) { // special case
- state->i = text.length + 1;
- return EMPTY_TEXT;
- }
- return NONE_TEXT;
- }
-
- if (state->pattern.length == 0) { // special case
- Text_t ret = Text$cluster(text, I(state->i+1));
- state->i += 1;
- return ret;
- }
-
- int64_t start = state->i;
- int64_t len = 0;
- int64_t found = _find(text, state->pattern, start, text.length-1, &len, NULL);
-
- if (found == start && len == 0)
- found = _find(text, state->pattern, start + 1, text.length-1, &len, NULL);
-
- if (found >= 0) {
- state->i = MAX(found + len, state->i + 1);
- return Text$slice(text, I(start+1), I(found));
- } else {
- state->i = state->state.stack[0].text.length + 1;
- return Text$slice(text, I(start+1), I(text.length));
- }
-}
-
-public Closure_t Text$by_split(Text_t text, Pattern_t pattern)
-{
- return (Closure_t){
- .fn=(void*)next_split,
- .userdata=new(split_iter_state_t, .state=NEW_TEXT_ITER_STATE(text), .i=0, .pattern=pattern),
- };
-}
-
-public Pattern_t Pattern$escape_text(Text_t text)
-{
- // TODO: optimize for spans of non-escaped text
- Text_t ret = EMPTY_TEXT;
- TextIter_t state = NEW_TEXT_ITER_STATE(text);
- for (int64_t i = 0; i < text.length; i++) {
- uint32_t g = Text$get_main_grapheme_fast(&state, i);
- if (g == '{') {
- ret = Text$concat(ret, Text("{1{}"));
- } else if (g == '?'
- || uc_is_property_quotation_mark(g)
- || (uc_is_property_paired_punctuation(g) && uc_is_property_left_of_pair(g))) {
- ret = Text$concat(ret, Text("{1"), Text$slice(text, I(i+1), I(i+1)), Text("}"));
- } else {
- ret = Text$concat(ret, Text$slice(text, I(i+1), I(i+1)));
- }
- }
- return ret;
-}
-
-static Text_t Pattern$as_text(const void *obj, bool colorize, const TypeInfo_t *info)
-{
- (void)info;
- if (!obj) return Text("Pattern");
-
- Pattern_t pat = *(Pattern_t*)obj;
- Text_t quote = Text$has(pat, Pattern("/")) && !Text$has(pat, Pattern("|")) ? Text("|") : Text("/");
- return Text$concat( colorize ? Text("\x1b[1m$\033[m") : Text("$"), Text$quoted(pat, colorize, quote));
-}
-
-public const TypeInfo_t Pattern$info = {
- .size=sizeof(Pattern_t),
- .align=__alignof__(Pattern_t),
- .tag=TextInfo,
- .TextInfo={.lang="Pattern"},
- .metamethods={
- .as_text=Pattern$as_text,
- .hash=Text$hash,
- .compare=Text$compare,
- .equal=Text$equal,
- .is_none=Text$is_none,
- .serialize=Text$serialize,
- .deserialize=Text$deserialize,
- },
-};
-
-static const TypeInfo_t _text_array = {
- .size=sizeof(Array_t),
- .align=__alignof__(Array_t),
- .tag=ArrayInfo,
- .ArrayInfo.item=&Text$info,
- .metamethods=Array$metamethods,
-};
-
-static NamedType_t _match_fields[3] = {
- {"text", &Text$info},
- {"index", &Int$info},
- {"captures", &_text_array},
-};
-
-static bool Match$is_none(const void *m, const TypeInfo_t*)
-{
- return ((OptionalMatch_t*)m)->index.small == 0;
-}
-
-public const TypeInfo_t Match$info = {
- .size=sizeof(Match_t),
- .align=__alignof__(Match_t),
- .tag=StructInfo,
- .StructInfo={
- .name="Match",
- .num_fields=3,
- .fields=_match_fields,
- },
- .metamethods={
- .as_text=Struct$as_text,
- .hash=Struct$hash,
- .compare=Struct$compare,
- .equal=Struct$equal,
- .is_none=Match$is_none,
- },
-};
-
-// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0
diff --git a/src/stdlib/patterns.h b/src/stdlib/patterns.h
deleted file mode 100644
index 2b77e490..00000000
--- a/src/stdlib/patterns.h
+++ /dev/null
@@ -1,46 +0,0 @@
-#pragma once
-
-// The type representing text patterns for pattern matching.
-
-#include <stdbool.h>
-#include <stdint.h>
-
-#include "datatypes.h"
-#include "integers.h"
-#include "optionals.h"
-#include "types.h"
-
-#define Pattern(text) ((Pattern_t)Text(text))
-#define Patterns(...) ((Pattern_t)Texts(__VA_ARGS__))
-
-typedef struct {
- Text_t text;
- Int_t index;
- Array_t captures;
-} Match_t;
-
-typedef Match_t OptionalMatch_t;
-#define NONE_MATCH ((OptionalMatch_t){.index=NONE_INT})
-
-Text_t Text$replace(Text_t str, Pattern_t pat, Text_t replacement, Pattern_t backref_pat, bool recursive);
-Pattern_t Pattern$escape_text(Text_t text);
-Text_t Text$replace_all(Text_t text, Table_t replacements, Pattern_t backref_pat, bool recursive);
-Array_t Text$split(Text_t text, Pattern_t pattern);
-Closure_t Text$by_split(Text_t text, Pattern_t pattern);
-Text_t Text$trim(Text_t text, Pattern_t pattern, bool trim_left, bool trim_right);
-OptionalMatch_t Text$find(Text_t text, Pattern_t pattern, Int_t i);
-Array_t Text$find_all(Text_t text, Pattern_t pattern);
-Closure_t Text$by_match(Text_t text, Pattern_t pattern);
-PUREFUNC bool Text$has(Text_t text, Pattern_t pattern);
-OptionalArray_t Text$matches(Text_t text, Pattern_t pattern);
-Text_t Text$map(Text_t text, Pattern_t pattern, Closure_t fn, bool recursive);
-void Text$each(Text_t text, Pattern_t pattern, Closure_t fn, bool recursive);
-
-#define Pattern$hash Text$hash
-#define Pattern$compare Text$compare
-#define Pattern$equal Text$equal
-
-extern const TypeInfo_t Match$info;
-extern const TypeInfo_t Pattern$info;
-
-// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0
diff --git a/src/stdlib/stdlib.c b/src/stdlib/stdlib.c
index fc94bc97..77383cd1 100644
--- a/src/stdlib/stdlib.c
+++ b/src/stdlib/stdlib.c
@@ -20,7 +20,6 @@
#include "optionals.h"
#include "metamethods.h"
#include "nums.h"
-#include "patterns.h"
#include "paths.h"
#include "rng.h"
#include "siphash.h"
diff --git a/src/stdlib/text.c b/src/stdlib/text.c
index 27acdfa4..621de942 100644
--- a/src/stdlib/text.c
+++ b/src/stdlib/text.c
@@ -998,17 +998,22 @@ PUREFUNC public int32_t Text$compare(const void *va, const void *vb, const TypeI
return 0;
}
+// Check whether the graphemes of the target text occur in the searched text
+// starting at the 0-indexed grapheme offset `pos`.
+//   text_state:   iterator state over the text being searched
+//   target_state: iterator state over the text to look for
+//   pos:          0-indexed grapheme offset into the searched text
+// Returns true only when every target grapheme equals the text grapheme at
+// the corresponding offset. An empty target matches at any in-range offset.
+// NOTE(review): this helper has external linkage but no prototype is visible
+// in text.h; consider `static` like _has_grapheme if no other file uses it.
+bool _matches(TextIter_t *text_state, TextIter_t *target_state, int64_t pos)
+{
+    int64_t text_len = text_state->stack[0].text.length;
+    int64_t target_len = target_state->stack[0].text.length;
+    // A target that would run off either end of the text cannot match, so
+    // refuse before asking the iterator for out-of-range graphemes.
+    if (pos < 0 || target_len > text_len - pos)
+        return false;
+    for (int64_t i = 0; i < target_len; i++) {
+        int32_t text_g = Text$get_grapheme_fast(text_state, pos + i);
+        int32_t target_g = Text$get_grapheme_fast(target_state, i);
+        if (text_g != target_g) return false;
+    }
+    return true;
+}
+
+// Report whether `text` begins with `prefix`, compared grapheme by grapheme.
+// A prefix longer than the text is rejected up front, so the comparison
+// helper is only called with a prefix that fits.
PUREFUNC public bool Text$starts_with(Text_t text, Text_t prefix)
{
if (text.length < prefix.length)
return false;
TextIter_t text_state = NEW_TEXT_ITER_STATE(text), prefix_state = NEW_TEXT_ITER_STATE(prefix);
- for (int64_t i = 0; i < prefix.length; i++) {
- int32_t text_i = Text$get_grapheme_fast(&text_state, i);
- int32_t prefix_i = Text$get_grapheme_fast(&prefix_state, i);
- if (text_i != prefix_i) return false;
- }
- return true;
+ // The shared helper compares the prefix against the text at offset 0.
+ return _matches(&text_state, &prefix_state, 0);
}
PUREFUNC public bool Text$ends_with(Text_t text, Text_t suffix)
@@ -1016,12 +1021,236 @@ PUREFUNC public bool Text$ends_with(Text_t text, Text_t suffix)
if (text.length < suffix.length)
return false;
TextIter_t text_state = NEW_TEXT_ITER_STATE(text), suffix_state = NEW_TEXT_ITER_STATE(suffix);
- for (int64_t i = 0; i < suffix.length; i++) {
- int32_t text_i = Text$get_grapheme_fast(&text_state, text.length - suffix.length + i);
- int32_t suffix_i = Text$get_grapheme_fast(&suffix_state, i);
- if (text_i != suffix_i) return false;
+ return _matches(&text_state, &suffix_state, text.length - suffix.length);
+}
+
+// Drop `prefix` from the front of `text` when the text starts with it;
+// otherwise hand back `text` untouched.
+public Text_t Text$without_prefix(Text_t text, Text_t prefix)
+{
+    if (!Text$starts_with(text, prefix))
+        return text;
+    // Keep everything after the prefix (slice bounds here are 1-based).
+    return Text$slice(text, I(prefix.length + 1), I(text.length));
+}
+
+// Drop `suffix` from the end of `text` when the text ends with it;
+// otherwise hand back `text` untouched.
+public Text_t Text$without_suffix(Text_t text, Text_t suffix)
+{
+    if (!Text$ends_with(text, suffix))
+        return text;
+    // Keep everything before the suffix (slice bounds here are 1-based).
+    return Text$slice(text, I(1), I(text.length - suffix.length));
+}
+
+// Report whether grapheme `g` occurs anywhere in the text behind the iterator
+// state `text`. This is a linear scan over every grapheme index of that text.
+static bool _has_grapheme(TextIter_t *text, int32_t g)
+{
+ for (int64_t t = 0; t < text->stack[0].text.length; t++) {
+ if (g == Text$get_grapheme_fast(text, t)) {
+ return true;
+ }
}
- return true;
+ return false;
+}
+
+// Strip graphemes that appear in `to_trim` from the ends of `text`.
+//   left:  when true, strip matching graphemes from the start
+//   right: when true, strip matching graphemes from the end
+// `to_trim` is treated as a set of individual graphemes, not as a sequence.
+// Returns `text` itself when nothing was stripped.
+public Text_t Text$trim(Text_t text, Text_t to_trim, bool left, bool right)
+{
+    TextIter_t text_state = NEW_TEXT_ITER_STATE(text), trim_state = NEW_TEXT_ITER_STATE(to_trim);
+
+    // 0-indexed position of the first grapheme that is kept.
+    int64_t start = 0;
+    if (left) {
+        for (; start < text.length; start++) {
+            if (!_has_grapheme(&trim_state, Text$get_grapheme_fast(&text_state, start)))
+                break;
+        }
+    }
+
+    // 0-indexed position of the last grapheme that is kept; it never moves
+    // further left than one before `start`.
+    int64_t end = text.length-1;
+    if (right) {
+        for (; end >= start; end--) {
+            if (!_has_grapheme(&trim_state, Text$get_grapheme_fast(&text_state, end)))
+                break;
+        }
+    }
+
+    if (start == 0 && end == text.length-1)
+        return text;
+    return Text$slice(text, I(start+1), I(end+1));
+}
+
+// Replace every occurrence of each key of `translations` in `text` with the
+// value stored for that key, scanning the text left to right.
+//   text:         the text to translate
+//   translations: table whose entries are read as {Text_t target, replacement}
+//                 pairs (assumed layout of a Text-to-Text table; confirm)
+// At each position the entries are tried in the table's entry order and the
+// first one that matches wins; the scan then resumes after the matched span.
+// Entries with an empty target are skipped: they would match without
+// consuming any text and the scan would never advance.
+public Text_t Text$translate(Text_t text, Table_t translations)
+{
+    TextIter_t text_state = NEW_TEXT_ITER_STATE(text);
+    Text_t result = EMPTY_TEXT;
+    int64_t span_start = 0; // start of the pending run of untranslated text
+    Array_t replacement_array = translations.entries;
+    for (int64_t i = 0; i < text.length; ) {
+        bool found_match = false;
+        for (int64_t r = 0; r < replacement_array.length; r++) {
+            struct { Text_t target, replacement; } *entry = replacement_array.data + r*replacement_array.stride;
+            // Skip targets that are empty or cannot fit in the remaining text.
+            if (entry->target.length == 0 || entry->target.length > text.length - i)
+                continue;
+            TextIter_t target_state = NEW_TEXT_ITER_STATE(entry->target);
+            if (!_matches(&text_state, &target_state, i))
+                continue;
+
+            // Flush the untranslated run that precedes this match.
+            if (i > span_start)
+                result = concat2(result, Text$slice(text, I(span_start+1), I(i)));
+
+            result = concat2(result, entry->replacement);
+            i += entry->target.length;
+            span_start = i;
+            found_match = true;
+            break;
+        }
+        if (!found_match)
+            i += 1;
+    }
+    // Append whatever untranslated text remains after the last match.
+    if (span_start < text.length)
+        result = concat2(result, Text$slice(text, I(span_start+1), I(text.length)));
+    return result;
+}
+
+// Replace every non-overlapping occurrence of `target` in `text` with
+// `replacement`, scanning left to right.
+//   text:        the text to search
+//   target:      the literal text to look for
+//   replacement: the text substituted for each occurrence
+// Returns `text` unchanged when `target` is empty or longer than `text`.
+public Text_t Text$replace(Text_t text, Text_t target, Text_t replacement)
+{
+    // An empty target would match without consuming anything, so the scan
+    // could never advance; a target longer than the text can never match.
+    if (target.length == 0 || target.length > text.length)
+        return text;
+
+    TextIter_t text_state = NEW_TEXT_ITER_STATE(text), target_state = NEW_TEXT_ITER_STATE(target);
+    Text_t result = EMPTY_TEXT;
+    int64_t span_start = 0; // start of the pending run of unmatched text
+    // Only try positions where the whole target still fits in the text.
+    for (int64_t i = 0; i + target.length <= text.length; ) {
+        if (_matches(&text_state, &target_state, i)) {
+            // Flush the unmatched run that precedes this occurrence.
+            if (i > span_start)
+                result = concat2(result, Text$slice(text, I(span_start+1), I(i)));
+
+            result = concat2(result, replacement);
+            i += target.length;
+            span_start = i;
+        } else {
+            i += 1;
+        }
+    }
+    // Append whatever text remains after the last occurrence.
+    if (span_start < text.length)
+        result = concat2(result, Text$slice(text, I(span_start+1), I(text.length)));
+    return result;
+}
+
+// Report whether `target` occurs anywhere inside `text` as a contiguous run
+// of graphemes.
+// An empty target is contained in every text, including the empty text.
+public bool Text$has(Text_t text, Text_t target)
+{
+    if (target.length == 0)
+        return true;
+
+    TextIter_t text_state = NEW_TEXT_ITER_STATE(text), target_state = NEW_TEXT_ITER_STATE(target);
+    // Only try positions where the whole target still fits in the text.
+    for (int64_t i = 0; i + target.length <= text.length; i++) {
+        if (_matches(&text_state, &target_state, i))
+            return true;
+    }
+    return false;
+}
+
+// Split `text` into an array of texts at every occurrence of `delimiters`.
+// Despite the plural parameter name, `delimiters` is matched as one whole
+// sequence (via _matches), not as a set of separator graphemes.
+// An empty delimiter returns Text$clusters(text) instead.
+// NOTE(review): an empty `text` never enters the loop, so the result is an
+// empty array, while Text$by_split yields one empty text for the same input.
+public Array_t Text$split(Text_t text, Text_t delimiters)
+{
+ if (delimiters.length == 0)
+ return Text$clusters(text);
+
+ TextIter_t text_state = NEW_TEXT_ITER_STATE(text), delim_state = NEW_TEXT_ITER_STATE(delimiters);
+ Array_t splits = {};
+ for (int64_t i = 0; i < text.length; ) {
+ // Grow the span until a delimiter starts at its end or the text runs out.
+ int64_t span_len = 0;
+ while (i + span_len < text.length && !_matches(&text_state, &delim_state, i + span_len)) {
+ span_len += 1;
+ }
+ Text_t slice = Text$slice(text, I(i+1), I(i+span_len));
+ // Index I(0) presumably means "append at the end" -- TODO confirm.
+ Array$insert(&splits, &slice, I(0), sizeof(slice));
+ // Step over the span and the delimiter that follows it.
+ i += span_len + delimiters.length;
+ // Landing exactly on the end means the text ended with a delimiter,
+ // so one final empty segment is emitted.
+ if (i == text.length) {
+ Text_t empty = Text("");
+ Array$insert(&splits, &empty, I(0), sizeof(empty));
+ }
+ }
+ return splits;
+}
+
+// Split `text` into an array of texts, treating every grapheme of
+// `delimiters` as a separator. A run of consecutive separator graphemes
+// counts as a single separator.
+// With no delimiters the result is Array(text).
+public Array_t Text$split_any(Text_t text, Text_t delimiters)
+{
+ if (delimiters.length == 0)
+ return Array(text);
+
+ TextIter_t text_state = NEW_TEXT_ITER_STATE(text), delim_state = NEW_TEXT_ITER_STATE(delimiters);
+ Array_t splits = {};
+ for (int64_t i = 0; i < text.length; ) {
+ // Grow the span until a separator grapheme or the end of the text.
+ int64_t span_len = 0;
+ while (i + span_len < text.length && !_has_grapheme(&delim_state, Text$get_grapheme_fast(&text_state, i + span_len))) {
+ span_len += 1;
+ }
+ // True when the span stopped at a separator rather than at the end.
+ bool trailing_delim = i + span_len < text.length;
+ Text_t slice = Text$slice(text, I(i+1), I(i+span_len));
+ // Index I(0) presumably means "append at the end" -- TODO confirm.
+ Array$insert(&splits, &slice, I(0), sizeof(slice));
+ // Step over the span, its separator, and any separators that follow.
+ i += span_len + 1;
+ while (i < text.length && _has_grapheme(&delim_state, Text$get_grapheme_fast(&text_state, i))) {
+ i += 1;
+ }
+ // The text ended with separator(s): emit one final empty segment.
+ if (i >= text.length && trailing_delim) {
+ Text_t empty = Text("");
+ Array$insert(&splits, &empty, I(0), sizeof(empty));
+ }
+ }
+ return splits;
+}
+
+// State kept between calls of the lazy split iterators (next_split and
+// next_split_any).
+typedef struct {
+ TextIter_t state; // iterator over the text being split; stack[0].text is read back as that text
+ int64_t i; // 0-indexed grapheme offset where the next segment starts
+ Text_t delimiter; // delimiter sequence (next_split) or set of separator graphemes (next_split_any)
+} split_iter_state_t;
+
+// Iterator step for Text$by_split: return the next segment of the text,
+// split on the whole `delimiter` sequence, or NONE_TEXT when finished.
+// NOTE(review): with an empty delimiter this yields the whole text once,
+// while Text$split returns Text$clusters(text) for the same input -- confirm
+// which behavior is intended.
+static OptionalText_t next_split(split_iter_state_t *state)
+{
+ Text_t text = state->state.stack[0].text;
+ if (state->i >= text.length) {
+ // The offset sits exactly on the end when the previous step consumed a
+ // delimiter that ended the text, or when the text is empty on the first
+ // call. Emit one empty segment, then move past the end so later calls
+ // return NONE_TEXT.
+ if (state->delimiter.length > 0 && state->i == text.length) { // special case
+ state->i = text.length + 1;
+ return EMPTY_TEXT;
+ }
+ return NONE_TEXT;
+ }
+
+ if (state->delimiter.length == 0) { // special case
+ state->i = text.length + 1;
+ return text;
+ }
+
+ TextIter_t delim_state = NEW_TEXT_ITER_STATE(state->delimiter);
+ int64_t i = state->i;
+ // Grow the span until a delimiter starts at its end or the text runs out.
+ int64_t span_len = 0;
+ while (i + span_len < text.length && !_matches(&state->state, &delim_state, i + span_len)) {
+ span_len += 1;
+ }
+ Text_t slice = Text$slice(text, I(i+1), I(i+span_len));
+ // Resume after the delimiter; with no delimiter found this lands beyond
+ // the end of the text and the next call returns NONE_TEXT.
+ state->i = i + span_len + state->delimiter.length;
+ return slice;
+}
+
+// Build a closure that yields the segments of `text` split on `delimiter`
+// one call at a time; each call runs next_split on a freshly allocated
+// iteration state that starts at offset 0.
+public Closure_t Text$by_split(Text_t text, Text_t delimiter)
+{
+    void *iter_state = new(split_iter_state_t, .state=NEW_TEXT_ITER_STATE(text), .i=0, .delimiter=delimiter);
+    return (Closure_t){.fn=(void*)next_split, .userdata=iter_state};
+}
+
+// Iterator step for Text$by_split_any: return the next segment of the text,
+// treating every grapheme of `delimiter` as a separator, or NONE_TEXT when
+// finished.
+// NOTE(review): with an empty delimiter this yields one cluster per call,
+// while Text$split_any returns Array(text) for the same input -- confirm
+// which behavior is intended.
+static OptionalText_t next_split_any(split_iter_state_t *state)
+{
+ Text_t text = state->state.stack[0].text;
+ if (state->i >= text.length) {
+ // Sitting exactly on the end (after trailing separators were consumed,
+ // or for an empty text on the first call) yields one empty segment;
+ // the offset then moves past the end so later calls return NONE_TEXT.
+ if (state->delimiter.length > 0 && state->i == text.length) { // special case
+ state->i = text.length + 1;
+ return EMPTY_TEXT;
+ }
+ return NONE_TEXT;
+ }
+
+ if (state->delimiter.length == 0) { // special case
+ Text_t ret = Text$cluster(text, I(state->i+1));
+ state->i += 1;
+ return ret;
+ }
+
+ TextIter_t delim_state = NEW_TEXT_ITER_STATE(state->delimiter);
+ int64_t i = state->i;
+ // Grow the span until a separator grapheme or the end of the text.
+ int64_t span_len = 0;
+ while (i + span_len < text.length && !_has_grapheme(&delim_state, Text$get_grapheme_fast(&state->state, i + span_len))) {
+ span_len += 1;
+ }
+ Text_t slice = Text$slice(text, I(i+1), I(i+span_len));
+ // Step over the span, its separator, and any separators that follow.
+ i += span_len + 1;
+ while (i < text.length && _has_grapheme(&delim_state, Text$get_grapheme_fast(&state->state, i))) {
+ i += 1;
+ }
+ state->i = i;
+ return slice;
+}
+
+// Build a closure that yields the segments of `text` one call at a time,
+// splitting on any grapheme of `delimiters`; each call runs next_split_any
+// on a freshly allocated iteration state that starts at offset 0.
+public Closure_t Text$by_split_any(Text_t text, Text_t delimiters)
+{
+    void *iter_state = new(split_iter_state_t, .state=NEW_TEXT_ITER_STATE(text), .i=0, .delimiter=delimiters);
+    return (Closure_t){.fn=(void*)next_split_any, .userdata=iter_state};
}
PUREFUNC public bool Text$equal_values(Text_t a, Text_t b)
diff --git a/src/stdlib/text.h b/src/stdlib/text.h
index 4acca8a2..662c6e5f 100644
--- a/src/stdlib/text.h
+++ b/src/stdlib/text.h
@@ -50,6 +50,16 @@ Text_t Text$as_text(const void *text, bool colorize, const TypeInfo_t *info);
Text_t Text$quoted(Text_t str, bool colorize, Text_t quotation_mark);
PUREFUNC bool Text$starts_with(Text_t text, Text_t prefix);
PUREFUNC bool Text$ends_with(Text_t text, Text_t suffix);
+// Literal-text operations: every `target`/`delimiter` argument below is a
+// plain Text_t that is compared grapheme by grapheme.
+// Remove a leading/trailing occurrence of the given text, if present.
+Text_t Text$without_prefix(Text_t text, Text_t prefix);
+Text_t Text$without_suffix(Text_t text, Text_t suffix);
+// Replace occurrences of one target, or of every key of a table, left to right.
+Text_t Text$replace(Text_t text, Text_t target, Text_t replacement);
+Text_t Text$translate(Text_t text, Table_t translations);
+// Whether `target` occurs somewhere in `text`.
+bool Text$has(Text_t text, Text_t target);
+// Eager splitting: on one whole delimiter sequence, or on any grapheme of `delimiters`.
+Array_t Text$split(Text_t text, Text_t delimiter);
+Array_t Text$split_any(Text_t text, Text_t delimiters);
+// Lazy splitting: closures that return one segment per call.
+Closure_t Text$by_split(Text_t text, Text_t delimiter);
+Closure_t Text$by_split_any(Text_t text, Text_t delimiters);
+// Strip graphemes found in `to_trim` from the left and/or right end of `text`.
+Text_t Text$trim(Text_t text, Text_t to_trim, bool left, bool right);
char *Text$as_c_string(Text_t text);
__attribute__((format(printf, 1, 2)))
public Text_t Text$format(const char *fmt, ...);
diff --git a/src/stdlib/tomo.h b/src/stdlib/tomo.h
index 4aa1253d..e42b562e 100644
--- a/src/stdlib/tomo.h
+++ b/src/stdlib/tomo.h
@@ -20,7 +20,6 @@
#include "nums.h"
#include "optionals.h"
#include "paths.h"
-#include "patterns.h"
#include "pointers.h"
#include "print.h"
#include "rng.h"
diff --git a/src/tomo.c b/src/tomo.c
index b91d45e0..f93dd1f8 100644
--- a/src/tomo.c
+++ b/src/tomo.c
@@ -21,7 +21,6 @@
#include "stdlib/datatypes.h"
#include "stdlib/integers.h"
#include "stdlib/optionals.h"
-#include "stdlib/patterns.h"
#include "stdlib/paths.h"
#include "stdlib/print.h"
#include "stdlib/text.h"
@@ -294,7 +293,12 @@ int main(int argc, char *argv[])
+// Turn a library name into text usable as an identifier: every byte that is
+// neither alphanumeric (per isalnum()) nor '_' is replaced with '_'.
+// Each offending byte is replaced individually; runs are not collapsed.
Text_t escape_lib_name(Text_t lib_name)
{
- return Text$replace(lib_name, Pattern("{1+ !alphanumeric}"), Text("_"), Pattern(""), false);
+    // NOTE(review): relies on String() returning a writable, NUL-terminated
+    // copy of the text -- confirm.
+    char *libname_id = String(lib_name);
+    for (char *p = libname_id; *p; p++) {
+        // <ctype.h> classifiers need a value representable as unsigned char;
+        // a plain (possibly signed) char holding a non-ASCII byte is undefined.
+        if (!isalnum((unsigned char)*p) && *p != '_')
+            *p = '_';
+    }
+    return Text$from_str(libname_id);
}
Path_t build_file(Path_t path, const char *extension)
diff --git a/src/typecheck.c b/src/typecheck.c
index 0bfe6a07..ff609435 100644
--- a/src/typecheck.c
+++ b/src/typecheck.c
@@ -12,7 +12,6 @@
#include "cordhelpers.h"
#include "environment.h"
#include "parse.h"
-#include "stdlib/patterns.h"
#include "stdlib/paths.h"
#include "stdlib/tables.h"
#include "stdlib/text.h"
@@ -195,8 +194,11 @@ static env_t *load_module(env_t *env, ast_t *module_ast)
env_t *module_env = fresh_scope(env);
Table$str_set(env->imports, use->path, module_env);
- char *libname_id = Text$as_c_string(
- Text$replace(Text$from_str(use->path), Pattern("{1+ !alphanumeric}"), Text("_"), Pattern(""), false));
+    // Derive an identifier-safe library name from the import path: every byte
+    // that is neither alphanumeric nor '_' becomes '_'.
+    char *libname_id = String(use->path);
+    for (char *p = libname_id; *p; p++) {
+        // Cast first: passing a negative plain char to isalnum() is undefined.
+        if (!isalnum((unsigned char)*p) && *p != '_')
+            *p = '_';
+    }
module_env->libname = libname_id;
for (size_t i = 0; i < tm_files.gl_pathc; i++) {
const char *filename = tm_files.gl_pathv[i];
@@ -269,6 +271,14 @@ void prebind_statement(env_t *env, ast_t *statement)
prebind_statement(ns_env, stmt->ast);
break;
}
+ case Extend: {
+ auto extend = Match(statement, Extend);
+ env_t *ns_env = namespace_env(env, extend->name);
+ ns_env->libname = env->libname;
+ for (ast_list_t *stmt = extend->body ? Match(extend->body, Block)->statements : NULL; stmt; stmt = stmt->next)
+ prebind_statement(ns_env, stmt->ast);
+ break;
+ }
default: break;
}
}
@@ -435,6 +445,14 @@ void bind_statement(env_t *env, ast_t *statement)
bind_statement(ns_env, stmt->ast);
break;
}
+ case Extend: {
+ auto extend = Match(statement, Extend);
+ env_t *ns_env = namespace_env(env, extend->name);
+ ns_env->libname = env->libname;
+ for (ast_list_t *stmt = extend->body ? Match(extend->body, Block)->statements : NULL; stmt; stmt = stmt->next)
+ bind_statement(ns_env, stmt->ast);
+ break;
+ }
case Use: {
env_t *module_env = load_module(env, statement);
if (!module_env) break;
@@ -940,7 +958,7 @@ type_t *get_type(env_t *env, ast_t *ast)
// Early out if the type is knowable without any context from the block:
switch (last->ast->tag) {
- case UpdateAssign: case Assign: case Declare: case FunctionDef: case ConvertDef: case StructDef: case EnumDef: case LangDef:
+ case UpdateAssign: case Assign: case Declare: case FunctionDef: case ConvertDef: case StructDef: case EnumDef: case LangDef: case Extend:
return Type(VoidType);
default: break;
}
@@ -1240,7 +1258,7 @@ type_t *get_type(env_t *env, ast_t *ast)
return Type(ClosureType, Type(FunctionType, .args=args, .ret=ret));
}
- case FunctionDef: case ConvertDef: case StructDef: case EnumDef: case LangDef: {
+ case FunctionDef: case ConvertDef: case StructDef: case EnumDef: case LangDef: case Extend: {
return Type(VoidType);
}
@@ -1399,7 +1417,7 @@ PUREFUNC bool is_discardable(env_t *env, ast_t *ast)
{
switch (ast->tag) {
case UpdateAssign: case Assign: case Declare: case FunctionDef: case ConvertDef: case StructDef: case EnumDef:
- case LangDef: case Use:
+ case LangDef: case Use: case Extend:
return true;
default: break;
}