diff options
| author | Bruce Hill <bruce@bruce-hill.com> | 2025-04-01 14:05:10 -0400 |
|---|---|---|
| committer | Bruce Hill <bruce@bruce-hill.com> | 2025-04-01 14:05:10 -0400 |
| commit | 4d59fc2987e52da0274e6b204a9d2885613f74b7 (patch) | |
| tree | 8c262f99cb6ae9b550b9f8abf0ab0477044d087a /src | |
| parent | 7a2c99de74f5870e1dea5b59d049678ad0ef8e44 (diff) | |
Move patterns into a module
Diffstat (limited to 'src')
| -rw-r--r-- | src/ast.c | 1 | ||||
| -rw-r--r-- | src/ast.h | 5 | ||||
| -rw-r--r-- | src/compile.c | 51 | ||||
| -rw-r--r-- | src/environment.c | 39 | ||||
| -rw-r--r-- | src/environment.h | 1 | ||||
| -rw-r--r-- | src/parse.c | 53 | ||||
| -rw-r--r-- | src/stdlib/README.md | 1 | ||||
| -rw-r--r-- | src/stdlib/datatypes.h | 3 | ||||
| -rw-r--r-- | src/stdlib/optionals.c | 1 | ||||
| -rw-r--r-- | src/stdlib/paths.c | 22 | ||||
| -rw-r--r-- | src/stdlib/patterns.c | 1337 | ||||
| -rw-r--r-- | src/stdlib/patterns.h | 46 | ||||
| -rw-r--r-- | src/stdlib/stdlib.c | 1 | ||||
| -rw-r--r-- | src/stdlib/text.c | 251 | ||||
| -rw-r--r-- | src/stdlib/text.h | 10 | ||||
| -rw-r--r-- | src/stdlib/tomo.h | 1 | ||||
| -rw-r--r-- | src/tomo.c | 8 | ||||
| -rw-r--r-- | src/typecheck.c | 30 |
18 files changed, 366 insertions, 1495 deletions
@@ -165,6 +165,7 @@ CORD ast_to_xml(ast_t *ast) T(Use, "<Use>%r%r</Use>", optional_tagged("var", data.var), xml_escape(data.path)) T(InlineCCode, "<InlineCode>%r</InlineCode>", xml_escape(data.code)) T(Deserialize, "<Deserialize><type>%r</type>%r</Deserialize>", type_ast_to_xml(data.type), ast_to_xml(data.value)) + T(Extend, "<Extend name=\"%s\">%r</Extend>", data.name, ast_to_xml(data.body)) default: return "???"; #undef T } @@ -143,6 +143,7 @@ typedef enum { Use, InlineCCode, Deserialize, + Extend, } ast_e; struct ast_s { @@ -331,6 +332,10 @@ struct ast_s { ast_t *value; type_ast_t *type; } Deserialize; + struct { + const char *name; + ast_t *body; + } Extend; } __data; }; diff --git a/src/compile.c b/src/compile.c index f228148a..2cc20f39 100644 --- a/src/compile.c +++ b/src/compile.c @@ -15,7 +15,6 @@ #include "stdlib/integers.h" #include "stdlib/nums.h" #include "stdlib/paths.h" -#include "stdlib/patterns.h" #include "stdlib/text.h" #include "stdlib/util.h" #include "structs.h" @@ -39,7 +38,7 @@ static CORD compile_string_literal(CORD literal); CORD promote_to_optional(type_t *t, CORD code) { - if (t == PATH_TYPE || t == PATH_TYPE_TYPE || t == MATCH_TYPE) { + if (t == PATH_TYPE || t == PATH_TYPE_TYPE) { return code; } else if (t->tag == IntType) { switch (Match(t, IntType)->bits) { @@ -442,7 +441,7 @@ static void add_closed_vars(Table_t *closed_vars, env_t *enclosing_scope, env_t add_closed_vars(closed_vars, enclosing_scope, env, Match(ast, Deserialize)->value); break; } - case Use: case FunctionDef: case ConvertDef: case StructDef: case EnumDef: case LangDef: { + case Use: case FunctionDef: case ConvertDef: case StructDef: case EnumDef: case LangDef: case Extend: { errx(1, "Definitions should not be reachable in a closure."); } default: @@ -497,7 +496,6 @@ PUREFUNC CORD compile_unsigned_type(type_t *t) CORD compile_type(type_t *t) { if (t == RNG_TYPE) return "RNG_t"; - else if (t == MATCH_TYPE) return "Match_t"; else if (t == PATH_TYPE) return "Path_t"; else if (t == PATH_TYPE_TYPE) return "PathType_t"; @@ -516,8 +514,6 @@ CORD compile_type(type_t *t) auto text = Match(t, TextType); if (!text->lang || streq(text->lang, "Text")) return "Text_t"; - else if (streq(text->lang, "Pattern")) - return "Pattern_t"; else return CORD_all(namespace_prefix(text->env, text->env->namespace->parent), text->lang, "$$type"); } @@ -558,8 +554,6 @@ CORD compile_type(type_t *t) case ArrayType: case TableType: case SetType: return CORD_all("Optional", compile_type(nonnull)); case StructType: { - if (nonnull == MATCH_TYPE) - return "OptionalMatch_t"; if (nonnull == PATH_TYPE) return "OptionalPath_t"; if (nonnull == PATH_TYPE_TYPE) @@ -680,7 +674,7 @@ CORD optional_into_nonnone(type_t *t, CORD value) case IntType: return CORD_all(value, ".value"); case StructType: - if (t == MATCH_TYPE || t == PATH_TYPE || t == PATH_TYPE_TYPE) + if (t == PATH_TYPE || t == PATH_TYPE_TYPE) return value; return CORD_all(value, ".value"); default: @@ -695,8 +689,6 @@ CORD check_none(type_t *t, CORD value) // complain about excessive parens around equality comparisons if (t->tag == PointerType || t->tag == FunctionType || t->tag == CStringType) return CORD_all("({", value, " == NULL;})"); - else if (t == MATCH_TYPE) - return CORD_all("({(", value, ").index.small == 0;})"); else if (t == PATH_TYPE) return CORD_all("({(", value, ").type.$tag == PATH_NONE;})"); else if (t == PATH_TYPE_TYPE) @@ -1168,7 +1160,7 @@ static CORD _compile_statement(env_t *env, ast_t *ast) default: code_err(ast, "Update assignments are not implemented for this operation"); } } - case StructDef: case EnumDef: case LangDef: case FunctionDef: case ConvertDef: { + case StructDef: case EnumDef: case LangDef: case Extend: case FunctionDef: case ConvertDef: { return CORD_EMPTY; } case Skip: { @@ -1730,8 +1722,13 @@ static CORD _compile_statement(env_t *env, ast_t *ast) code_err(ast, "Could not find library"); CORD initialization = CORD_EMPTY; - const char *lib_id = Text$as_c_string( - Text$replace(Text$from_str(use->path), Pattern("{1+ !alphanumeric}"), Text("_"), Pattern(""), false)); + + char *lib_id = String(use->path); + for (char *p = lib_id; *p; p++) { + if (!isalnum(*p) && *p != '_') + *p = '_'; + } + for (size_t i = 0; i < tm_files.gl_pathc; i++) { const char *filename = tm_files.gl_pathv[i]; initialization = CORD_all( @@ -2165,7 +2162,6 @@ CORD compile_none(type_t *t) if (t == PATH_TYPE) return "NONE_PATH"; else if (t == PATH_TYPE_TYPE) return "((OptionalPathType_t){})"; - else if (t == MATCH_TYPE) return "NONE_MATCH"; switch (t->tag) { case BigIntType: return "NONE_INT"; @@ -2597,8 +2593,6 @@ CORD compile(env_t *env, ast_t *ast) CORD lang_constructor; if (!lang || streq(lang, "Text")) lang_constructor = "Text"; - else if (streq(lang, "Pattern")) - lang_constructor = lang; else lang_constructor = CORD_all(namespace_prefix(Match(text_t, TextType)->env, Match(text_t, TextType)->env->namespace->parent), lang); @@ -3752,7 +3746,7 @@ CORD compile(env_t *env, ast_t *ast) case Defer: code_err(ast, "Compiling 'defer' as expression!"); case Extern: code_err(ast, "Externs are not supported as expressions"); case TableEntry: code_err(ast, "Table entries should not be compiled directly"); - case Declare: case Assign: case UpdateAssign: case For: case While: case Repeat: case StructDef: case LangDef: + case Declare: case Assign: case UpdateAssign: case For: case While: case Repeat: case StructDef: case LangDef: case Extend: case EnumDef: case FunctionDef: case ConvertDef: case Skip: case Stop: case Pass: case Return: case DocTest: case PrintStatement: code_err(ast, "This is not a valid expression"); default: case Unknown: code_err(ast, "Unknown AST"); @@ -3762,7 +3756,6 @@ CORD compile(env_t *env, ast_t *ast) CORD compile_type_info(type_t *t) { if (t == RNG_TYPE) return "&RNG$info"; - else if (t == MATCH_TYPE) return "&Match$info"; else if (t == PATH_TYPE) return "&Path$info"; else if (t == PATH_TYPE_TYPE) return "&PathType$info"; @@ -3773,8 +3766,6 @@ CORD compile_type_info(type_t *t) auto text = Match(t, TextType); if (!text->lang || streq(text->lang, "Text")) return "&Text$info"; - else if (streq(text->lang, "Pattern")) - return "&Pattern$info"; return CORD_all("(&", namespace_prefix(text->env, text->env->namespace->parent), text->lang, "$$info)"); } case StructType: { @@ -4206,6 +4197,12 @@ CORD compile_top_level_code(env_t *env, ast_t *ast) env_t *ns_env = namespace_env(env, def->name); return CORD_all(code, def->namespace ? compile_top_level_code(ns_env, def->namespace) : CORD_EMPTY); } + case Extend: { + auto extend = Match(ast, Extend); + env_t *ns_env = namespace_env(env, extend->name); + ns_env->libname = env->libname; + return compile_top_level_code(ns_env, extend->body); + } case Extern: return CORD_EMPTY; case Block: { CORD code = CORD_EMPTY; @@ -4258,6 +4255,9 @@ static void initialize_vars_and_statics(env_t *env, ast_t *ast) } else if (stmt->ast->tag == LangDef) { initialize_vars_and_statics(namespace_env(env, Match(stmt->ast, LangDef)->name), Match(stmt->ast, LangDef)->namespace); + } else if (stmt->ast->tag == Extend) { + initialize_vars_and_statics(namespace_env(env, Match(stmt->ast, Extend)->name), + Match(stmt->ast, Extend)->body); } else if (stmt->ast->tag == Use) { continue; } else { @@ -4348,6 +4348,9 @@ CORD compile_statement_type_header(env_t *env, Path_t header_path, ast_t *ast) "extern const TypeInfo_t ", full_name, ";\n" ); } + case Extend: { + return CORD_EMPTY; + } default: return CORD_EMPTY; } @@ -4364,6 +4367,12 @@ CORD compile_statement_namespace_header(env_t *env, Path_t header_path, ast_t *a block = def->namespace; break; } + case Extend: { + auto extend = Match(ast, Extend); + ns_name = extend->name; + block = extend->body; + break; + } case StructDef: { auto def = Match(ast, StructDef); ns_name = def->name; diff --git a/src/environment.c b/src/environment.c index 776e7852..6822502a 100644 --- a/src/environment.c +++ b/src/environment.c @@ -13,7 +13,6 @@ #include "typecheck.h" type_t *TEXT_TYPE = NULL; -type_t *MATCH_TYPE = NULL; type_t *RNG_TYPE = NULL; public type_t *PATH_TYPE = NULL; public type_t *PATH_TYPE_TYPE = NULL; @@ -67,7 +66,6 @@ env_t *global_env(void) (void)bind_type(env, "Int32", Type(IntType, .bits=TYPE_IBITS32)); (void)bind_type(env, "Memory", Type(MemoryType)); PATH_TYPE_TYPE = declare_type(env, "enum PathType(Relative, Absolute, Home)"); - MATCH_TYPE = declare_type(env, "struct Match(text:Text, index:Int, captures:[Text])"); PATH_TYPE = declare_type(env, "struct Path(type:PathType, components:[Text])"); RNG_TYPE = declare_type(env, "struct RNG(state:@Memory)"); @@ -279,13 +277,6 @@ env_t *global_env(void) #undef F_opt #undef F #undef C - {"Match", MATCH_TYPE, "Match_t", "Match", TypedArray(ns_entry_t, - // No methods - )}, - {"Pattern", Type(TextType, .lang="Pattern", .env=namespace_env(env, "Pattern")), "Pattern_t", "Pattern$info", TypedArray(ns_entry_t, - {"escape_int", "Int$value_as_text", "func(i:Int -> Pattern)"}, - {"escape_text", "Pattern$escape_text", "func(text:Text -> Pattern)"}, - )}, {"PathType", PATH_TYPE_TYPE, "PathType_t", "PathType$info", TypedArray(ns_entry_t, {"Relative", "((PathType_t){.$tag=PATH_RELATIVE})", "PathType"}, {"Absolute", "((PathType_t){.$tag=PATH_ABSOLUTE})", "PathType"}, @@ -353,44 +344,42 @@ env_t *global_env(void) {"as_c_string", "Text$as_c_string", "func(text:Text -> CString)"}, {"at", "Text$cluster", "func(text:Text, index:Int -> Text)"}, {"by_line", "Text$by_line", "func(text:Text -> func(->Text?))"}, - {"by_match", "Text$by_match", "func(text:Text, pattern:Pattern -> func(->Match?))"}, - {"by_split", "Text$by_split", "func(text:Text, pattern=$Pattern'' -> func(->Text?))"}, + {"by_split", "Text$by_split", "func(text:Text, delimiter='' -> func(->Text?))"}, + {"by_split_any", "Text$by_split_any", "func(text:Text, delimiters=\" $\\t\\r\\n\" -> func(->Text?))"}, {"bytes", "Text$utf8_bytes", "func(text:Text -> [Byte])"}, {"caseless_equals", "Text$equal_ignoring_case", "func(a,b:Text, language='C' -> Bool)"}, {"codepoint_names", "Text$codepoint_names", "func(text:Text -> [Text])"}, {"ends_with", "Text$ends_with", "func(text,suffix:Text -> Bool)"}, - {"each", "Text$each", "func(text:Text, pattern:Pattern, fn:func(match:Match), recursive=yes)"}, - {"find", "Text$find", "func(text:Text, pattern:Pattern, start=1 -> Match?)"}, - {"find_all", "Text$find_all", "func(text:Text, pattern:Pattern -> [Match])"}, {"from", "Text$from", "func(text:Text, first:Int -> Text)"}, {"from_bytes", "Text$from_bytes", "func(bytes:[Byte] -> Text?)"}, {"from_c_string", "Text$from_str", "func(str:CString -> Text?)"}, {"from_codepoint_names", "Text$from_codepoint_names", "func(codepoint_names:[Text] -> Text?)"}, {"from_codepoints", "Text$from_codepoints", "func(codepoints:[Int32] -> Text)"}, {"from_text", "Path$from_text", "func(text:Text -> Path)"}, - {"has", "Text$has", "func(text:Text, pattern:Pattern -> Bool)"}, + {"has", "Text$has", "func(text:Text, target:Text -> Bool)"}, {"join", "Text$join", "func(glue:Text, pieces:[Text] -> Text)"}, {"left_pad", "Text$left_pad", "func(text:Text, count:Int, pad=' ', language='C' -> Text)"}, {"lines", "Text$lines", "func(text:Text -> [Text])"}, {"lower", "Text$lower", "func(text:Text, language='C' -> Text)"}, - {"map", "Text$map", "func(text:Text, pattern:Pattern, fn:func(match:Match -> Text), recursive=yes -> Text)"}, - {"matches", "Text$matches", "func(text:Text, pattern:Pattern -> [Text]?)"}, {"middle_pad", "Text$middle_pad", "func(text:Text, count:Int, pad=' ', language='C' -> Text)"}, {"quoted", "Text$quoted", "func(text:Text, color=no, quotation_mark='\"' -> Text)"}, {"repeat", "Text$repeat", "func(text:Text, count:Int -> Text)"}, - {"replace", "Text$replace", "func(text:Text, pattern:Pattern, replacement:Text, backref=$/\\/, recursive=yes -> Text)"}, - {"replace_all", "Text$replace_all", "func(text:Text, replacements:{Pattern,Text}, backref=$/\\/, recursive=yes -> Text)"}, + {"replace", "Text$replace", "func(text:Text, target:Text, replacement:Text -> Text)"}, {"reversed", "Text$reversed", "func(text:Text -> Text)"}, {"right_pad", "Text$right_pad", "func(text:Text, count:Int, pad=' ', language='C' -> Text)"}, {"slice", "Text$slice", "func(text:Text, from=1, to=-1 -> Text)"}, - {"split", "Text$split", "func(text:Text, pattern=$Pattern'' -> [Text])"}, + {"split", "Text$split", "func(text:Text, delimiter='' -> [Text])"}, + {"split_any", "Text$split_any", "func(text:Text, delimiters=\" $\\t\\r\\n\" -> [Text])"}, {"starts_with", "Text$starts_with", "func(text,prefix:Text -> Bool)"}, {"title", "Text$title", "func(text:Text, language='C' -> Text)"}, {"to", "Text$to", "func(text:Text, last:Int -> Text)"}, - {"trim", "Text$trim", "func(text:Text, pattern=$/{whitespace}/, trim_left=yes, trim_right=yes -> Text)"}, + {"translate", "Text$translate", "func(text:Text, translations:{Text,Text} -> Text)"}, + {"trim", "Text$trim", "func(text:Text, to_trim=\" \t\r\n\", left=yes, right=yes -> Text)"}, {"upper", "Text$upper", "func(text:Text, language='C' -> Text)"}, {"utf32_codepoints", "Text$utf32_codepoints", "func(text:Text -> [Int32])"}, {"width", "Text$width", "func(text:Text, language='C' -> Int)"}, + {"without_prefix", "Text$without_prefix", "func(text,prefix:Text -> Text)"}, + {"without_suffix", "Text$without_suffix", "func(text,suffix:Text -> Text)"}, )}, }; @@ -518,9 +507,6 @@ env_t *global_env(void) {"Num32$from_int64", "func(i:Int64, truncate=no -> Num32)"}, {"Num32$from_int", "func(i:Int, truncate=no -> Num32)"}, {"Num32$from_num", "func(n:Num -> Num32)"}); - ADD_CONSTRUCTORS("Pattern", - {"Pattern$escape_text", "func(text:Text -> Pattern)"}, - {"Int$value_as_text", "func(i:Int -> Pattern)"}); ADD_CONSTRUCTORS("Path", {"Path$escape_text", "func(text:Text -> Path)"}, {"Path$escape_path", "func(path:Path -> Path)"}, @@ -534,11 +520,6 @@ env_t *global_env(void) .ret=PATH_TYPE), "Path$from_text"); - set_binding(namespace_env(env, "Pattern"), "from_text", - Type(FunctionType, .args=new(arg_t, .name="text", .type=TEXT_TYPE), - .ret=Type(TextType, .lang="Pattern", .env=namespace_env(env, "Pattern"))), - "(Pattern_t)"); - struct { const char *name, *code, *type_str; } global_vars[] = { diff --git a/src/environment.h b/src/environment.h index 95e3c3e1..00b8fbba 100644 --- a/src/environment.h +++ b/src/environment.h @@ -89,7 +89,6 @@ void set_binding(env_t *env, const char *name, type_t *type, CORD code); binding_t *get_namespace_binding(env_t *env, ast_t *self, const char *name); #define code_err(ast, ...) compiler_err((ast)->file, (ast)->start, (ast)->end, __VA_ARGS__) extern type_t *TEXT_TYPE; -extern type_t *MATCH_TYPE; extern type_t *RNG_TYPE; extern type_t *PATH_TYPE; extern type_t *PATH_TYPE_TYPE; diff --git a/src/parse.c b/src/parse.c index 14221cc0..2e3e2ece 100644 --- a/src/parse.c +++ b/src/parse.c @@ -22,7 +22,6 @@ #include "ast.h" #include "cordhelpers.h" #include "stdlib/integers.h" -#include "stdlib/patterns.h" #include "stdlib/paths.h" #include "stdlib/print.h" #include "stdlib/stdlib.h" @@ -64,7 +63,7 @@ int op_tightness[] = { static const char *keywords[] = { "yes", "xor", "while", "when", "use", "unless", "struct", "stop", "skip", "return", "or", "not", "none", "no", "mod1", "mod", "pass", "lang", "inline", "in", "if", - "func", "for", "extern", "enum", "else", "do", "deserialize", "defer", "and", + "func", "for", "extern", "extend", "enum", "else", "do", "deserialize", "defer", "and", "_min_", "_max_", NULL, }; @@ -120,6 +119,7 @@ static PARSER(parse_inline_c); static PARSER(parse_int); static PARSER(parse_lambda); static PARSER(parse_lang_def); +static PARSER(parse_extend); static PARSER(parse_namespace); static PARSER(parse_negative); static PARSER(parse_not); @@ -1241,9 +1241,6 @@ PARSER(parse_text) { open_quote = *pos; ++pos; close_quote = closing[(int)open_quote] ? closing[(int)open_quote] : open_quote; - - if (!lang && (open_quote == '/' || open_quote == '|')) - lang = "Pattern"; } else { return NULL; } @@ -1904,9 +1901,10 @@ PARSER(parse_namespace) { if (get_indent(ctx, next) != indent) break; ast_t *stmt; if ((stmt=optional(ctx, &pos, parse_struct_def)) + ||(stmt=optional(ctx, &pos, parse_func_def)) ||(stmt=optional(ctx, &pos, parse_enum_def)) ||(stmt=optional(ctx, &pos, parse_lang_def)) - ||(stmt=optional(ctx, &pos, parse_func_def)) + ||(stmt=optional(ctx, &pos, parse_extend)) ||(stmt=optional(ctx, &pos, parse_convert_def)) ||(stmt=optional(ctx, &pos, parse_use)) ||(stmt=optional(ctx, &pos, parse_extern)) @@ -1940,9 +1938,10 @@ PARSER(parse_file_body) { if (get_indent(ctx, next) != 0) break; ast_t *stmt; if ((stmt=optional(ctx, &pos, parse_struct_def)) + ||(stmt=optional(ctx, &pos, parse_func_def)) ||(stmt=optional(ctx, &pos, parse_enum_def)) ||(stmt=optional(ctx, &pos, parse_lang_def)) - ||(stmt=optional(ctx, &pos, parse_func_def)) + ||(stmt=optional(ctx, &pos, parse_extend)) ||(stmt=optional(ctx, &pos, parse_convert_def)) ||(stmt=optional(ctx, &pos, parse_use)) ||(stmt=optional(ctx, &pos, parse_extern)) @@ -2112,6 +2111,32 @@ PARSER(parse_lang_def) { return NewAST(ctx->file, start, pos, LangDef, .name=name, .namespace=namespace); } +PARSER(parse_extend) { + const char *start = pos; + // extend Name: body... + if (!match_word(&pos, "extend")) return NULL; + int64_t starting_indent = get_indent(ctx, pos); + spaces(&pos); + const char *name = get_id(&pos); + if (!name) + parser_err(ctx, start, pos, "I expected a name for this lang"); + + ast_t *body = NULL; + if (match(&pos, ":")) { + const char *ns_pos = pos; + whitespace(&ns_pos); + int64_t ns_indent = get_indent(ctx, ns_pos); + if (ns_indent > starting_indent) { + pos = ns_pos; + body = optional(ctx, &pos, parse_namespace); + } + } + if (!body) + body = NewAST(ctx->file, pos, pos, Block, .statements=NULL); + + return NewAST(ctx->file, start, pos, Extend, .name=name, .body=body); +} + arg_ast_t *parse_args(parse_ctx_t *ctx, const char **pos) { arg_ast_t *args = NULL; @@ -2373,20 +2398,6 @@ PARSER(parse_use) { what = USE_LOCAL; } else { what = USE_MODULE; - - // When `use`ing a URL, convert it to a hash: - Text_t text = Text$from_str(name); - Array_t m = Text$matches(text, Pattern("{url}")); - if (m.length >= 0) { - text = Text$trim(text, Pattern("http{0-1 s}://"), true, false); - FILE *shasum = popen(String("echo -n '", text, "' | sha256sum"), "r"); - const size_t HASH_LEN = 32; - char *hash = GC_MALLOC_ATOMIC(HASH_LEN + 1); - size_t just_read = fread(hash, sizeof(char), HASH_LEN, shasum); - if (just_read < HASH_LEN) - print_err("Failed to get SHA sum for 'use': ", name); - name = hash; - } } return NewAST(ctx->file, start, pos, Use, .var=var, .path=name, .what=what); } diff --git a/src/stdlib/README.md b/src/stdlib/README.md index 6591ead6..1c72d3d3 100644 --- a/src/stdlib/README.md +++ b/src/stdlib/README.md @@ -27,7 +27,6 @@ some common functionality. - Nums: [nums.h](nums.h), [nums.c](nums.c) - Optionals: [optionals.h](optionals.h), [optionals.c](optionals.c) - Paths: [paths.h](paths.h), [paths.c](paths.c) -- Patterns: [patterns.h](patterns.h), [patterns.c](patterns.c) - Pointers: [pointers.h](pointers.h), [pointers.c](pointers.c) - Tables: [tables.h](tables.h), [tables.c](tables.c) - Text: [text.h](text.h), [text.c](text.c) diff --git a/src/stdlib/datatypes.h b/src/stdlib/datatypes.h index b1265fc3..26bd9c3c 100644 --- a/src/stdlib/datatypes.h +++ b/src/stdlib/datatypes.h @@ -94,9 +94,6 @@ typedef struct Text_s { }; } Text_t; -#define Pattern_t Text_t -#define OptionalPattern_t Text_t - typedef struct { enum { PATH_NONE, PATH_RELATIVE, PATH_ABSOLUTE, PATH_HOME } $tag; } PathType_t; diff --git a/src/stdlib/optionals.c b/src/stdlib/optionals.c index 797cb111..d3309029 100644 --- a/src/stdlib/optionals.c +++ b/src/stdlib/optionals.c @@ -6,7 +6,6 @@ #include "integers.h" #include "metamethods.h" #include "nums.h" -#include "patterns.h" #include "text.h" #include "util.h" diff --git a/src/stdlib/paths.c b/src/stdlib/paths.c index 05575620..3f27aef7 100644 --- a/src/stdlib/paths.c +++ b/src/stdlib/paths.c @@ -24,7 +24,6 @@ #include "integers.h" #include "optionals.h" #include "paths.h" -#include "patterns.h" #include "structs.h" #include "text.h" #include "types.h" @@ -599,15 +598,10 @@ public PUREFUNC Text_t Path$base_name(Path_t path) public Text_t Path$extension(Path_t path, bool full) { - Text_t base = Path$base_name(path); - Array_t results = Text$matches(base, full ? Pattern(".{!.}.{..}") : Pattern(".{..}.{!.}{end}")); - if (results.length > 0) - return *((Text_t*)(results.data + results.stride*1)); - results = Text$matches(base, full ? Pattern("{!.}.{..}") : Pattern("{..}.{!.}{end}")); - if (results.length > 0) - return *((Text_t*)(results.data + results.stride*1)); - else - return Text(""); + const char *base = Text$as_c_string(Path$base_name(path)); + const char *dot = full ? strchr(base + 1, '.') : strrchr(base + 1, '.'); + const char *extension = dot ? dot + 1 : ""; + return Text$from_str(extension); } public Path_t Path$with_component(Path_t path, Text_t component) @@ -635,10 +629,10 @@ public Path_t Path$with_extension(Path_t path, Text_t extension, bool replace) Text_t last = *(Text_t*)(path.components.data + path.components.stride*(path.components.length-1)); Array$remove_at(&result.components, I(-1), I(1), sizeof(Text_t)); if (replace) { - if (Text$starts_with(last, Text("."))) - last = Text$replace(last, Pattern(".{!.}.{..}"), Text(".@1"), Pattern("@"), false); - else - last = Text$replace(last, Pattern("{!.}.{..}"), Text("@1"), Pattern("@"), false); + const char *base = Text$as_c_string(last); + const char *dot = strchr(base + 1, '.'); + if (dot) + last = Text$from_strn(base, (size_t)(dot - base)); } last = Text$concat(last, extension); diff --git a/src/stdlib/patterns.c b/src/stdlib/patterns.c deleted file mode 100644 index b7891f88..00000000 --- a/src/stdlib/patterns.c +++ /dev/null @@ -1,1337 +0,0 @@ -// Logic for text pattern matching - -#include <ctype.h> -#include <sys/param.h> -#include <unictype.h> -#include <uniname.h> -#include <unistring/version.h> - -#include "arrays.h" -#include "integers.h" -#include "optionals.h" -#include "patterns.h" -#include "structs.h" -#include "tables.h" -#include "text.h" -#include "types.h" - -#define MAX_BACKREFS 100 - -typedef struct { - int64_t index, length; - bool occupied, recursive; -} capture_t; - -typedef struct { - enum { PAT_START, PAT_END, PAT_ANY, PAT_GRAPHEME, PAT_PROPERTY, PAT_QUOTE, PAT_PAIR, PAT_FUNCTION } tag; - bool negated, non_capturing; - int64_t min, max; - union { - int32_t grapheme; - uc_property_t property; - int64_t (*fn)(TextIter_t *, int64_t); - int32_t quote_graphemes[2]; - int32_t pair_graphemes[2]; - }; -} pat_t; - -static Text_t Text$replace_array(Text_t text, Array_t replacements, Text_t backref_pat, bool recursive); - -static INLINE void skip_whitespace(TextIter_t *state, int64_t *i) -{ - while (*i < state->stack[0].text.length) { - int32_t grapheme = Text$get_grapheme_fast(state, *i); - if (grapheme > 0 && !uc_is_property_white_space((ucs4_t)grapheme)) - return; - *i += 1; - } -} - -static INLINE bool match_grapheme(TextIter_t *state, int64_t *i, int32_t grapheme) -{ - if (*i < state->stack[0].text.length && Text$get_grapheme_fast(state, *i) == grapheme) { - *i += 1; - return true; - } - return false; -} - -static INLINE bool match_str(TextIter_t *state, int64_t *i, const char *str) -{ - int64_t matched = 0; - while (matched[str]) { - if (*i + matched >= state->stack[0].text.length || Text$get_grapheme_fast(state, *i + matched) != str[matched]) - return false; - matched += 1; - } - *i += matched; - return true; -} - -static int64_t parse_int(TextIter_t *state, int64_t *i) -{ - int64_t value = 0; - for (;; *i += 1) { - uint32_t grapheme = Text$get_main_grapheme_fast(state, *i); - int digit = uc_digit_value(grapheme); - if (digit < 0) break; - if (value >= INT64_MAX/10) break; - value = 10*value + digit; - } - return value; -} - -static const char *get_property_name(TextIter_t *state, int64_t *i) -{ - skip_whitespace(state, i); - char *name = GC_MALLOC_ATOMIC(UNINAME_MAX); - char *dest = name; - while (*i < state->stack[0].text.length) { - int32_t grapheme = Text$get_grapheme_fast(state, *i); - if (!(grapheme & ~0xFF) && (isalnum(grapheme) || grapheme == ' ' || grapheme == '_' || grapheme == '-')) { - *dest = (char)grapheme; - ++dest; - if (dest >= name + UNINAME_MAX - 1) - break; - } else { - break; - } - *i += 1; - } - - while (dest > name && dest[-1] == ' ') - *(dest--) = '\0'; - - if (dest == name) return NULL; - *dest = '\0'; - return name; -} - -#define EAT1(state, index, cond) ({\ - int32_t grapheme = Text$get_grapheme_fast(state, index); \ - bool success = (cond); \ - if (success) index += 1; \ - success; }) - -#define EAT2(state, index, cond1, cond2) ({\ - int32_t grapheme = Text$get_grapheme_fast(state, index); \ - bool success = (cond1); \ - if (success) { \ - grapheme = Text$get_grapheme_fast(state, index + 1); \ - success = (cond2); \ - if (success) \ - index += 2; \ - } \ - success; }) - - -#define EAT_MANY(state, index, cond) ({ int64_t _n = 0; while (EAT1(state, index, cond)) { _n += 1; } _n; }) - -static int64_t match_email(TextIter_t *state, int64_t index) -{ - // email = local "@" domain - // local = 1-64 ([a-zA-Z0-9!#$%&‘*+–/=?^_`.{|}~] | non-ascii) - // domain = dns-label ("." dns-label)* - // dns-label = 1-63 ([a-zA-Z0-9-] | non-ascii) - - if (index > 0) { - uint32_t prev_codepoint = Text$get_main_grapheme_fast(state, index - 1); - if (uc_is_property_alphabetic(prev_codepoint)) - return -1; - } - - int64_t start_index = index; - - // Local part: - int64_t local_len = 0; - static const char *allowed_local = "!#$%&‘*+–/=?^_`.{|}~"; - while (EAT1(state, index, - (grapheme & ~0x7F) || isalnum((char)grapheme) || strchr(allowed_local, (char)grapheme))) { - local_len += 1; - if (local_len > 64) return -1; - } - - if (!EAT1(state, index, grapheme == '@')) - return -1; - - // Host - int64_t host_len = 0; - do { - int64_t label_len = 0; - while (EAT1(state, index, - (grapheme & ~0x7F) || isalnum((char)grapheme) || grapheme == '-')) { - label_len += 1; - if (label_len > 63) return -1; - } - - if (label_len == 0) - return -1; - - host_len += label_len; - if (host_len > 255) - return -1; - host_len += 1; - } while (EAT1(state, index, grapheme == '.')); - - return index - start_index; -} - -static int64_t match_ipv6(TextIter_t *state, int64_t index) -{ - if (index > 0) { - int32_t prev_codepoint = Text$get_grapheme_fast(state, index - 1); - if ((prev_codepoint & ~0x7F) && (isxdigit(prev_codepoint) || prev_codepoint == ':')) - return -1; - } - int64_t start_index = index; - const int NUM_CLUSTERS = 8; - bool double_colon_used = false; - for (int cluster = 0; cluster < NUM_CLUSTERS; cluster++) { - for (int digits = 0; digits < 4; digits++) { - if (!EAT1(state, index, ~(grapheme & ~0x7F) && isxdigit((char)grapheme))) - break; - } - if (EAT1(state, index, ~(grapheme & ~0x7F) && isxdigit((char)grapheme))) - return -1; // Too many digits - - if (cluster == NUM_CLUSTERS-1) { - break; - } else if (!EAT1(state, index, grapheme == ':')) { - if (double_colon_used) - break; - return -1; - } - - if (EAT1(state, index, grapheme == ':')) { - if (double_colon_used) - return -1; - double_colon_used = true; - } - } - return index - start_index; -} - -static int64_t match_ipv4(TextIter_t *state, int64_t index) -{ - if (index > 0) { - int32_t prev_codepoint = Text$get_grapheme_fast(state, index - 1); - if ((prev_codepoint & ~0x7F) && (isdigit(prev_codepoint) || prev_codepoint == '.')) - return -1; - } - int64_t start_index = index; - - const int NUM_CLUSTERS = 4; - for (int cluster = 0; cluster < NUM_CLUSTERS; cluster++) { - for (int digits = 0; digits < 3; digits++) { - if (!EAT1(state, index, ~(grapheme & ~0x7F) && isdigit((char)grapheme))) { - if (digits == 0) return -1; - break; - } - } - - if (EAT1(state, index, ~(grapheme & ~0x7F) && isdigit((char)grapheme))) - return -1; // Too many digits - - if (cluster == NUM_CLUSTERS-1) - break; - else if (!EAT1(state, index, grapheme == '.')) - return -1; - } - return (index - start_index); -} - -static int64_t match_ip(TextIter_t *state, int64_t index) -{ - int64_t len = match_ipv6(state, index); - if (len >= 0) return len; - len = match_ipv4(state, index); - return (len >= 0) ? len : -1; -} - -static int64_t match_host(TextIter_t *state, int64_t index) -{ - int64_t ip_len = match_ip(state, index); - if (ip_len > 0) return ip_len; - - int64_t start_index = index; - if (match_grapheme(state, &index, '[')) { - ip_len = match_ip(state, index); - if (ip_len <= 0) return -1; - index += ip_len; - if (match_grapheme(state, &index, ']')) - return (index - start_index); - return -1; - } - - if (!EAT1(state, index, isalpha(grapheme))) - return -1; - - static const char *non_host_chars = "/#?:@ \t\r\n<>[]{}\\^|\"`"; - EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_host_chars, (char)grapheme)); - return (index - start_index); -} - -static int64_t match_authority(TextIter_t *state, int64_t index) -{ - int64_t authority_start = index; - static const char *non_segment_chars = "/#?:@ \t\r\n<>[]{}\\^|\"`."; - - // Optional user@ prefix: - int64_t username_len = EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_segment_chars, (char)grapheme)); - if (username_len < 1 || !EAT1(state, index, grapheme == '@')) - index = authority_start; // No user@ part - - // Host: - int64_t host_len = match_host(state, index); - if (host_len <= 0) return -1; - index += host_len; - - // Port: - if (EAT1(state, index, grapheme == ':')) { - if (EAT_MANY(state, index, !(grapheme & ~0x7F) && isdigit(grapheme)) == 0) - return -1; - } - return (index - authority_start); -} - -static int64_t match_uri(TextIter_t *state, int64_t index) -{ - // URI = scheme ":" ["//" authority] path ["?" query] ["#" fragment] - // scheme = [a-zA-Z] [a-zA-Z0-9+.-] - // authority = [userinfo "@"] host [":" port] - - if (index > 0) { - // Don't match if we're not at a word edge: - uint32_t prev_codepoint = Text$get_main_grapheme_fast(state, index - 1); - if (uc_is_property_alphabetic(prev_codepoint)) - return -1; - } - - int64_t start_index = index; - - // Scheme: - if (!EAT1(state, index, isalpha(grapheme))) - return -1; - EAT_MANY(state, index, !(grapheme & ~0x7F) && (isalnum(grapheme) || grapheme == '+' || grapheme == '.' || grapheme == '-')); - if (!match_grapheme(state, &index, ':')) - return -1; - - // Authority: - int64_t authority_len; - if (match_str(state, &index, "//")) { - authority_len = match_authority(state, index); - if (authority_len > 0) - index += authority_len; - } else { - authority_len = 0; - } - - // Path: - int64_t path_start = index; - if (EAT1(state, index, grapheme == '/') || authority_len <= 0) { - static const char *non_path = " \"#?<>[]{}\\^`|"; - EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_path, (char)grapheme)); - - if (EAT1(state, index, grapheme == '?')) { // Query - static const char *non_query = " \"#<>[]{}\\^`|"; - EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_query, (char)grapheme)); - } - - if (EAT1(state, index, grapheme == '#')) { // Fragment - static const char *non_fragment = " \"#<>[]{}\\^`|"; - EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_fragment, (char)grapheme)); - } - } - - if (authority_len <= 0 && index == path_start) - return -1; - - return index - start_index; -} - -static int64_t match_url(TextIter_t *state, int64_t index) -{ - int64_t lookahead = index; - if (!(match_str(state, &lookahead, "https:") - || match_str(state, &lookahead, "http:") - || match_str(state, &lookahead, "ftp:") - || match_str(state, &lookahead, "wss:") - || match_str(state, &lookahead, "ws:"))) - return -1; - - return match_uri(state, index); -} - -static int64_t match_id(TextIter_t *state, int64_t index) -{ - if (!EAT1(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_XID_START))) - return -1; - return 1 + EAT_MANY(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_XID_CONTINUE)); -} - -static int64_t match_int(TextIter_t *state, int64_t index) -{ - int64_t negative = EAT1(state, index, grapheme == '-') ? 1 : 0; - int64_t len = EAT_MANY(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT)); - return len > 0 ? negative + len : -1; -} - -static int64_t match_alphanumeric(TextIter_t *state, int64_t index) -{ - return EAT1(state, index, uc_is_property_alphabetic((ucs4_t)grapheme) || uc_is_property_numeric((ucs4_t)grapheme)) - ? 1 : -1; -} - -static int64_t match_num(TextIter_t *state, int64_t index) -{ - bool negative = EAT1(state, index, grapheme == '-') ? 1 : 0; - int64_t pre_decimal = EAT_MANY(state, index, - uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT)); - bool decimal = (EAT1(state, index, grapheme == '.') == 1); - int64_t post_decimal = decimal ? EAT_MANY(state, index, - uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT)) : 0; - if (pre_decimal == 0 && post_decimal == 0) - return -1; - return negative + pre_decimal + decimal + post_decimal; -} - -static int64_t match_newline(TextIter_t *state, int64_t index) -{ - if (index >= state->stack[0].text.length) - return -1; - - uint32_t grapheme = index >= state->stack[0].text.length ? 0 : Text$get_main_grapheme_fast(state, index); - if (grapheme == '\n') - return 1; - if (grapheme == '\r' && Text$get_grapheme_fast(state, index + 1) == '\n') - return 2; - return -1; -} - -static int64_t match_pat(TextIter_t *state, int64_t index, pat_t pat) -{ - Text_t text = state->stack[0].text; - int32_t grapheme = index >= text.length ? 0 : Text$get_grapheme_fast(state, index); - - switch (pat.tag) { - case PAT_START: { - if (index == 0) - return pat.negated ? -1 : 0; - return pat.negated ? 0 : -1; - } - case PAT_END: { - if (index >= text.length) - return pat.negated ? -1 : 0; - return pat.negated ? 0 : -1; - } - case PAT_ANY: { - assert(!pat.negated); - return (index < text.length) ? 1 : -1; - } - case PAT_GRAPHEME: { - if (index >= text.length) - return -1; - else if (grapheme == pat.grapheme) - return pat.negated ? -1 : 1; - return pat.negated ? 1 : -1; - } - case PAT_PROPERTY: { - if (index >= text.length) - return -1; - else if (uc_is_property((ucs4_t)grapheme, pat.property)) - return pat.negated ? -1 : 1; - return pat.negated ? 1 : -1; - } - case PAT_PAIR: { - // Nested punctuation: (?), [?], etc - if (index >= text.length) - return -1; - - int32_t open = pat.pair_graphemes[0]; - if (grapheme != open) - return pat.negated ? 1 : -1; - - int32_t close = pat.pair_graphemes[1]; - int64_t depth = 1; - int64_t match_len = 1; - for (; depth > 0; match_len++) { - if (index + match_len >= text.length) - return pat.negated ? 1 : -1; - - int32_t c = Text$get_grapheme_fast(state, index + match_len); - if (c == open) - depth += 1; - else if (c == close) - depth -= 1; - } - return pat.negated ? -1 : match_len; - } - case PAT_QUOTE: { - // Nested quotes: "?", '?', etc - if (index >= text.length) - return -1; - - int32_t open = pat.quote_graphemes[0]; - if (grapheme != open) - return pat.negated ? 1 : -1; - - int32_t close = pat.quote_graphemes[1]; - for (int64_t i = index + 1; i < text.length; i++) { - int32_t c = Text$get_grapheme_fast(state, i); - if (c == close) { - return pat.negated ? -1 : (i - index) + 1; - } else if (c == '\\' && index + 1 < text.length) { - i += 1; // Skip ahead an extra step - } - } - return pat.negated ? 1 : -1; - } - case PAT_FUNCTION: { - int64_t match_len = pat.fn(state, index); - if (match_len >= 0) - return pat.negated ? -1 : match_len; - return pat.negated ? 1 : -1; - } - default: errx(1, "Invalid pattern"); - } - errx(1, "Unreachable"); -} - -static pat_t parse_next_pat(TextIter_t *state, int64_t *index) -{ - if (EAT2(state, *index, - uc_is_property((ucs4_t)grapheme, UC_PROPERTY_QUOTATION_MARK), - grapheme == '?')) { - // Quotations: "?", '?', etc - int32_t open = Text$get_grapheme_fast(state, *index-2); - int32_t close = open; - uc_mirror_char((ucs4_t)open, (ucs4_t*)&close); - if (!match_grapheme(state, index, close)) - fail("Pattern's closing quote is missing: ", state->stack[0].text); - - return (pat_t){ - .tag=PAT_QUOTE, - .min=1, .max=1, - .quote_graphemes={open, close}, - }; - } else if (EAT2(state, *index, - uc_is_property((ucs4_t)grapheme, UC_PROPERTY_PAIRED_PUNCTUATION), - grapheme == '?')) { - // Nested punctuation: (?), [?], etc - int32_t open = Text$get_grapheme_fast(state, *index-2); - int32_t close = open; - uc_mirror_char((ucs4_t)open, (ucs4_t*)&close); - if (!match_grapheme(state, index, close)) - fail("Pattern's closing brace is missing: ", state->stack[0].text); - - return (pat_t){ - .tag=PAT_PAIR, - .min=1, .max=1, - .pair_graphemes={open, close}, - }; - } else if (EAT1(state, *index, grapheme == '{')) { // named patterns {id}, {2-3 hex}, etc. - skip_whitespace(state, index); - int64_t min, max; - if (uc_is_digit((ucs4_t)Text$get_grapheme_fast(state, *index))) { - min = parse_int(state, index); - skip_whitespace(state, index); - if (match_grapheme(state, index, '+')) { - max = INT64_MAX; - } else if (match_grapheme(state, index, '-')) { - max = parse_int(state, index); - } else { - max = min; - } - if (min > max) fail("Minimum repetitions (", min, ") is less than the maximum (", max, ")"); - } else { - min = -1, max = -1; - } - - skip_whitespace(state, index); - - bool negated = match_grapheme(state, index, '!'); -#define PAT(_tag, ...) ((pat_t){.min=min, .max=max, .negated=negated, .tag=_tag, __VA_ARGS__}) - const char *prop_name; - if (match_str(state, index, "..")) - prop_name = ".."; - else - prop_name = get_property_name(state, index); - - if (!prop_name) { - // Literal character, e.g. {1?} - skip_whitespace(state, index); - int32_t grapheme = Text$get_grapheme_fast(state, (*index)++); - if (!match_grapheme(state, index, '}')) - fail("Missing closing '}' in pattern: ", state->stack[0].text); - return PAT(PAT_GRAPHEME, .grapheme=grapheme); - } else if (strlen(prop_name) == 1) { - // Single letter names: {1+ A} - skip_whitespace(state, index); - if (!match_grapheme(state, index, '}')) - fail("Missing closing '}' in pattern: ", state->stack[0].text); - return PAT(PAT_GRAPHEME, .grapheme=prop_name[0]); - } - - skip_whitespace(state, index); - if (!match_grapheme(state, index, '}')) - fail("Missing closing '}' in pattern: ", state->stack[0].text); - - switch (tolower(prop_name[0])) { - case '.': - if (prop_name[1] == '.') { - if (negated) - return ((pat_t){.tag=PAT_END, .min=min, .max=max, .non_capturing=true}); - else - return PAT(PAT_ANY); - } - break; - case 'a': - if (strcasecmp(prop_name, "authority") == 0) { - return PAT(PAT_FUNCTION, .fn=match_authority); - } else if (strcasecmp(prop_name, "alphanum") == 0 || strcasecmp(prop_name, "anum") == 0 - || strcasecmp(prop_name, "alphanumeric") == 0) { - return PAT(PAT_FUNCTION, .fn=match_alphanumeric); - } - break; - case 'c': - if (strcasecmp(prop_name, "crlf") == 0) - return PAT(PAT_FUNCTION, .fn=match_newline); - break; - case 'd': - if (strcasecmp(prop_name, "digit") == 0) { - return PAT(PAT_PROPERTY, .property=UC_PROPERTY_DECIMAL_DIGIT); - } - break; - case 'e': - if (strcasecmp(prop_name, "end") == 0) { - return PAT(PAT_END, .non_capturing=!negated); - } else if (strcasecmp(prop_name, "email") == 0) { - return PAT(PAT_FUNCTION, .fn=match_email); - } -#if _LIBUNISTRING_VERSION >= 0x0100000 - else if (strcasecmp(prop_name, "emoji") == 0) { - return PAT(PAT_PROPERTY, .property=UC_PROPERTY_EMOJI); - } -#endif - break; - case 'h': - if (strcasecmp(prop_name, "host") == 0) { - return PAT(PAT_FUNCTION, .fn=match_host); - } - break; - case 'i': - if (strcasecmp(prop_name, "id") == 0) { - return PAT(PAT_FUNCTION, .fn=match_id); - } else if (strcasecmp(prop_name, "int") == 0) { - return PAT(PAT_FUNCTION, .fn=match_int); - } else if (strcasecmp(prop_name, "ipv4") == 0) { - return PAT(PAT_FUNCTION, .fn=match_ipv4); - } else if (strcasecmp(prop_name, "ipv6") == 0) { - return PAT(PAT_FUNCTION, .fn=match_ipv6); - } else if (strcasecmp(prop_name, "ip") == 0) { - return PAT(PAT_FUNCTION, .fn=match_ip); - } - break; - case 'n': - if (strcasecmp(prop_name, "nl") == 0 || strcasecmp(prop_name, "newline") == 0) { - return PAT(PAT_FUNCTION, .fn=match_newline); - } else if (strcasecmp(prop_name, "num") == 0) { - return PAT(PAT_FUNCTION, .fn=match_num); - } - break; - case 's': - if (strcasecmp(prop_name, "start") == 0) { - return PAT(PAT_START, .non_capturing=!negated); - } - break; - case 'u': - if (strcasecmp(prop_name, "uri") == 0) { - return PAT(PAT_FUNCTION, .fn=match_uri); - } else if (strcasecmp(prop_name, "url") == 0) { - return PAT(PAT_FUNCTION, .fn=match_url); - } - break; - case 'w': - if (strcasecmp(prop_name, "word") == 0) { - return PAT(PAT_FUNCTION, .fn=match_id); - } - break; - default: break; - } - - uc_property_t prop = uc_property_byname(prop_name); - if (uc_property_is_valid(prop)) - return PAT(PAT_PROPERTY, .property=prop); - - ucs4_t grapheme = unicode_name_character(prop_name); - if (grapheme == UNINAME_INVALID) - fail("Not a valid property or character name: ", prop_name); - return PAT(PAT_GRAPHEME, .grapheme=(int32_t)grapheme); -#undef PAT - } else { - return (pat_t){.tag=PAT_GRAPHEME, .non_capturing=true, .min=1, .max=1, .grapheme=Text$get_grapheme_fast(state, (*index)++)}; - } -} - -static int64_t match(Text_t text, int64_t text_index, Pattern_t pattern, int64_t pattern_index, capture_t *captures, int64_t capture_index) -{ - if (pattern_index >= pattern.length) // End of the pattern - return 0; - - int64_t start_index = text_index; - TextIter_t pattern_state = NEW_TEXT_ITER_STATE(pattern), text_state = NEW_TEXT_ITER_STATE(text); - pat_t pat = parse_next_pat(&pattern_state, &pattern_index); - - if (pat.min == -1 && pat.max == -1) { - if (pat.tag == PAT_ANY && pattern_index >= pattern.length) { - pat.min = pat.max = MAX(1, text.length - text_index); - } else { - pat.min = 1; - pat.max = INT64_MAX; - } - } - - int64_t capture_start = text_index; - int64_t count = 0, capture_len = 0, next_match_len = 0; - - if (pat.tag == PAT_ANY && pattern_index >= pattern.length) { - int64_t remaining = text.length - text_index; - capture_len = remaining >= pat.min ? MIN(remaining, pat.max) : -1; - text_index += capture_len; - goto success; - } - - if (pat.min == 0 && pattern_index < pattern.length) { - next_match_len = match(text, text_index, pattern, pattern_index, captures, capture_index + (pat.non_capturing ? 0 : 1)); - if (next_match_len >= 0) { - capture_len = 0; - goto success; - } - } - - while (count < pat.max) { - int64_t match_len = match_pat(&text_state, text_index, pat); - if (match_len < 0) - break; - capture_len += match_len; - text_index += match_len; - count += 1; - - if (pattern_index < pattern.length) { // More stuff after this - if (count < pat.min) - next_match_len = -1; - else - next_match_len = match(text, text_index, pattern, pattern_index, captures, capture_index + (pat.non_capturing ? 0 : 1)); - } else { - next_match_len = 0; - } - - if (match_len == 0) { - if (next_match_len >= 0) { - // If we're good to go, no need to keep re-matching zero-length - // matches till we hit max: - count = pat.max; - break; - } else { - return -1; - } - } - - if (pattern_index < pattern.length && next_match_len >= 0) - break; // Next guy exists and wants to stop here - - if (text_index >= text.length) - break; - } - - if (count < pat.min || next_match_len < 0) - return -1; - - success: - if (captures && capture_index < MAX_BACKREFS && !pat.non_capturing) { - if (pat.tag == PAT_PAIR || pat.tag == PAT_QUOTE) { - assert(capture_len > 0); - captures[capture_index] = (capture_t){ - .index=capture_start + 1, // Skip leading quote/paren - .length=capture_len - 2, // Skip open/close - .occupied=true, - .recursive=(pat.tag == PAT_PAIR), - }; - } else { - captures[capture_index] = (capture_t){ - .index=capture_start, - .length=capture_len, - .occupied=true, - .recursive=false, - }; - } - } - return (text_index - start_index) + next_match_len; -} - -#undef EAT1 -#undef EAT2 -#undef EAT_MANY - -static int64_t _find(Text_t text, Pattern_t pattern, int64_t first, int64_t last, int64_t *match_length, capture_t *captures) -{ - int32_t first_grapheme = Text$get_grapheme(pattern, 0); - bool find_first = (first_grapheme != '{' - && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK) - && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); - - TextIter_t text_state = NEW_TEXT_ITER_STATE(text); - for (int64_t i = first; i <= last; i++) { - // Optimization: quickly skip ahead to first char in pattern: - if (find_first) { - while (i < text.length && Text$get_grapheme_fast(&text_state, i) != first_grapheme) - ++i; - } - - int64_t m = match(text, i, pattern, 0, captures, 0); - if (m >= 0) { - if (match_length) - *match_length = m; - return i; - } - } - if (match_length) - *match_length = -1; - return -1; -} - -public OptionalMatch_t Text$find(Text_t text, Pattern_t pattern, Int_t from_index) -{ - int64_t first = Int64$from_int(from_index, false); - if (first == 0) fail("Invalid index: 0"); - if (first < 0) first = text.length + first + 1; - if (first > text.length || first < 1) - return NONE_MATCH; - - capture_t captures[MAX_BACKREFS] = {}; - int64_t len = 0; - int64_t found = _find(text, pattern, first-1, text.length-1, &len, captures); - if (found == -1) - return NONE_MATCH; - - Array_t capture_array = {}; - for (int i = 0; captures[i].occupied; i++) { - Text_t capture = Text$slice(text, I(captures[i].index+1), I(captures[i].index+captures[i].length)); - Array$insert(&capture_array, &capture, I(0), sizeof(Text_t)); - } - return (OptionalMatch_t){ - .text=Text$slice(text, I(found+1), I(found+len)), - .index=I(found+1), - .captures=capture_array, - }; -} - -PUREFUNC public bool Text$has(Text_t text, Pattern_t pattern) -{ - if (Text$starts_with(pattern, Text("{start}"))) { - int64_t m = match(text, 0, pattern, 0, NULL, 0); - return m >= 0; - } else if (Text$ends_with(text, Text("{end}"))) { - for (int64_t i = text.length-1; i >= 0; i--) { - int64_t match_len = match(text, i, pattern, 0, NULL, 0); - if (match_len >= 0 && i + match_len == text.length) - return true; - } - return false; - } else { - int64_t found = _find(text, pattern, 0, text.length-1, NULL, NULL); - return (found >= 0); - } -} - -public OptionalArray_t Text$matches(Text_t text, Pattern_t pattern) -{ - capture_t captures[MAX_BACKREFS] = {}; - int64_t match_len = match(text, 0, pattern, 0, captures, 0); - if (match_len != text.length) - return NONE_ARRAY; - - Array_t capture_array = {}; - for (int i = 0; captures[i].occupied; i++) { - Text_t capture = Text$slice(text, I(captures[i].index+1), I(captures[i].index+captures[i].length)); - Array$insert(&capture_array, &capture, I(0), sizeof(Text_t)); - } - return capture_array; -} - -public Array_t Text$find_all(Text_t text, Pattern_t pattern) -{ - if (pattern.length == 0) // special case - return (Array_t){.length=0}; - - Array_t matches = {}; - for (int64_t i = 1; ; ) { - OptionalMatch_t m = Text$find(text, pattern, I(i)); - if (!m.index.small) - break; - i = Int64$from_int(m.index, false) + m.text.length; - Array$insert(&matches, &m, I_small(0), sizeof(Match_t)); - } - return matches; -} - -typedef struct { - TextIter_t state; - Int_t i; - Pattern_t pattern; -} match_iter_state_t; - -static OptionalMatch_t next_match(match_iter_state_t *state) -{ - if (Int64$from_int(state->i, false) > state->state.stack[0].text.length) - return NONE_MATCH; - - OptionalMatch_t m = Text$find(state->state.stack[0].text, state->pattern, state->i); - if (m.index.small == 0) // No match - state->i = I(state->state.stack[0].text.length + 1); - else - state->i = Int$plus(m.index, I(MAX(1, m.text.length))); - return m; -} - -public Closure_t Text$by_match(Text_t text, Pattern_t pattern) -{ - return (Closure_t){ - .fn=(void*)next_match, - .userdata=new(match_iter_state_t, .state=NEW_TEXT_ITER_STATE(text), .i=I_small(1), .pattern=pattern), - }; -} - -static Text_t apply_backrefs(Text_t text, Array_t recursive_replacements, Text_t replacement, Pattern_t backref_pat, capture_t *captures) -{ - if (backref_pat.length == 0) - return replacement; - - int32_t first_grapheme = Text$get_grapheme(backref_pat, 0); - bool find_first = (first_grapheme != '{' - && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK) - && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); - - Text_t ret = Text(""); - TextIter_t replacement_state = NEW_TEXT_ITER_STATE(replacement); - int64_t nonmatching_pos = 0; - for (int64_t pos = 0; pos < replacement.length; ) { - // Optimization: quickly skip ahead to first char in the backref pattern: - if (find_first) { - while (pos < replacement.length && Text$get_grapheme_fast(&replacement_state, pos) != first_grapheme) - ++pos; - } - - int64_t backref_len = match(replacement, pos, backref_pat, 0, NULL, 0); - if (backref_len < 0) { - pos += 1; - continue; - } - - int64_t after_backref = pos + backref_len; - int64_t backref = parse_int(&replacement_state, &after_backref); - if (after_backref == pos + backref_len) { // Not actually a backref if there's no number - pos += 1; - continue; - } - if (backref < 0 || backref > 9) fail("Invalid backref index: ", backref, " (only 0-", MAX_BACKREFS-1, " are allowed)"); - backref_len = (after_backref - pos); - - if (Text$get_grapheme_fast(&replacement_state, pos + backref_len) == ';') - backref_len += 1; // skip optional semicolon - - if (!captures[backref].occupied) - fail("There is no capture number ", backref, "!"); - - Text_t backref_text = Text$slice(text, I(captures[backref].index+1), I(captures[backref].index + captures[backref].length)); - - if (captures[backref].recursive && recursive_replacements.length > 0) - backref_text = Text$replace_array(backref_text, recursive_replacements, backref_pat, true); - - if (pos > nonmatching_pos) { - Text_t before_slice = Text$slice(replacement, I(nonmatching_pos+1), I(pos)); - ret = Text$concat(ret, before_slice, backref_text); - } else { - ret = Text$concat(ret, backref_text); - } - - pos += backref_len; - nonmatching_pos = pos; - } - if (nonmatching_pos < replacement.length) { - Text_t last_slice = Text$slice(replacement, I(nonmatching_pos+1), I(replacement.length)); - ret = Text$concat(ret, last_slice); - } - return ret; -} - -public Text_t Text$replace(Text_t text, Pattern_t pattern, Text_t replacement, Pattern_t backref_pat, bool recursive) -{ - Text_t ret = EMPTY_TEXT; - - int32_t first_grapheme = Text$get_grapheme(pattern, 0); - bool find_first = (first_grapheme != '{' - && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK) - && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); - - Text_t entries[2] = {pattern, replacement}; - Array_t replacements = { - .data=entries, - .length=1, - .stride=sizeof(entries), - }; - - TextIter_t text_state = NEW_TEXT_ITER_STATE(text); - int64_t nonmatching_pos = 0; - for (int64_t pos = 0; pos < text.length; ) { - // Optimization: quickly skip ahead to first char in pattern: - if (find_first) { - while (pos < text.length && Text$get_grapheme_fast(&text_state, pos) != first_grapheme) - ++pos; - } - - capture_t captures[MAX_BACKREFS] = {}; - int64_t match_len = match(text, pos, pattern, 0, captures, 1); - if (match_len < 0) { - pos += 1; - continue; - } - captures[0] = (capture_t){ - .index = pos, .length = match_len, - .occupied = true, .recursive = false, - }; - - Text_t replacement_text = apply_backrefs(text, recursive ? replacements : (Array_t){}, replacement, backref_pat, captures); - if (pos > nonmatching_pos) { - Text_t before_slice = Text$slice(text, I(nonmatching_pos+1), I(pos)); - ret = Text$concat(ret, before_slice, replacement_text); - } else { - ret = Text$concat(ret, replacement_text); - } - nonmatching_pos = pos + match_len; - pos += MAX(match_len, 1); - } - if (nonmatching_pos < text.length) { - Text_t last_slice = Text$slice(text, I(nonmatching_pos+1), I(text.length)); - ret = Text$concat(ret, last_slice); - } - return ret; -} - -public Text_t Text$trim(Text_t text, Pattern_t pattern, bool trim_left, bool trim_right) -{ - int64_t first = 0, last = text.length-1; - if (trim_left) { - int64_t match_len = match(text, 0, pattern, 0, NULL, 0); - if (match_len > 0) - first = match_len; - } - - if (trim_right) { - for (int64_t i = text.length-1; i >= first; i--) { - int64_t match_len = match(text, i, pattern, 0, NULL, 0); - if (match_len > 0 && i + match_len == text.length) - last = i-1; - } - } - return Text$slice(text, I(first+1), I(last+1)); -} - -public Text_t Text$map(Text_t text, Pattern_t pattern, Closure_t fn, bool recursive) -{ - Text_t ret = EMPTY_TEXT; - - int32_t first_grapheme = Text$get_grapheme(pattern, 0); - bool find_first = (first_grapheme != '{' - && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK) - && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); - - TextIter_t text_state = NEW_TEXT_ITER_STATE(text); - int64_t nonmatching_pos = 0; - - Text_t (*text_mapper)(Match_t, void*) = fn.fn; - for (int64_t pos = 0; pos < text.length; pos++) { - // Optimization: quickly skip ahead to first char in pattern: - if (find_first) { - while (pos < text.length && Text$get_grapheme_fast(&text_state, pos) != first_grapheme) - ++pos; - } - - capture_t captures[MAX_BACKREFS] = {}; - int64_t match_len = match(text, pos, pattern, 0, captures, 0); - if (match_len < 0) continue; - - Match_t m = { - .text=Text$slice(text, I(pos+1), I(pos+match_len)), - .index=I(pos+1), - .captures={}, - }; - for (int i = 0; captures[i].occupied; i++) { - Text_t capture = Text$slice(text, I(captures[i].index+1), I(captures[i].index+captures[i].length)); - if (recursive) - capture = Text$map(capture, pattern, fn, recursive); - Array$insert(&m.captures, &capture, I(0), sizeof(Text_t)); - } - - Text_t replacement = text_mapper(m, fn.userdata); - if (pos > nonmatching_pos) { - Text_t before_slice = Text$slice(text, I(nonmatching_pos+1), I(pos)); - ret = Text$concat(ret, before_slice, replacement); - } else { - ret = Text$concat(ret, replacement); - } - nonmatching_pos = pos + match_len; - pos += (match_len - 1); - } - if (nonmatching_pos < text.length) { - Text_t last_slice = Text$slice(text, I(nonmatching_pos+1), I(text.length)); - ret = Text$concat(ret, last_slice); - } - return ret; -} - -public void Text$each(Text_t text, Pattern_t pattern, Closure_t fn, bool recursive) -{ - int32_t first_grapheme = Text$get_grapheme(pattern, 0); - bool find_first = (first_grapheme != '{' - && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK) - && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); - - TextIter_t text_state = NEW_TEXT_ITER_STATE(text); - void (*action)(Match_t, void*) = fn.fn; - for (int64_t pos = 0; pos < text.length; pos++) { - // Optimization: quickly skip ahead to first char in pattern: - if (find_first) { - while (pos < text.length && Text$get_grapheme_fast(&text_state, pos) != first_grapheme) - ++pos; - } - - capture_t captures[MAX_BACKREFS] = {}; - int64_t match_len = match(text, pos, pattern, 0, captures, 0); - if (match_len < 0) continue; - - Match_t m = { - .text=Text$slice(text, I(pos+1), I(pos+match_len)), - .index=I(pos+1), - .captures={}, - }; - for (int i = 0; captures[i].occupied; i++) { - Text_t capture = Text$slice(text, I(captures[i].index+1), I(captures[i].index+captures[i].length)); - if (recursive) - Text$each(capture, pattern, fn, recursive); - Array$insert(&m.captures, &capture, I(0), sizeof(Text_t)); - } - - action(m, fn.userdata); - pos += (match_len - 1); - } -} - -Text_t Text$replace_array(Text_t text, Array_t replacements, Text_t backref_pat, bool recursive) -{ - if (replacements.length == 0) return text; - - Text_t ret = EMPTY_TEXT; - - int64_t nonmatch_pos = 0; - for (int64_t pos = 0; pos < text.length; ) { - // Find the first matching pattern at this position: - for (int64_t i = 0; i < replacements.length; i++) { - Pattern_t pattern = *(Pattern_t*)(replacements.data + i*replacements.stride); - capture_t captures[MAX_BACKREFS] = {}; - int64_t len = match(text, pos, pattern, 0, captures, 1); - if (len < 0) continue; - captures[0].index = pos; - captures[0].length = len; - - // If we skipped over some non-matching text before finding a match, insert it here: - if (pos > nonmatch_pos) { - Text_t before_slice = Text$slice(text, I(nonmatch_pos+1), I(pos)); - ret = Text$concat(ret, before_slice); - } - - // Concatenate the replacement: - Text_t replacement = *(Text_t*)(replacements.data + i*replacements.stride + sizeof(Text_t)); - Text_t replacement_text = apply_backrefs(text, recursive ? replacements : (Array_t){}, replacement, backref_pat, captures); - ret = Text$concat(ret, replacement_text); - pos += MAX(len, 1); - nonmatch_pos = pos; - goto next_pos; - } - - pos += 1; - next_pos: - continue; - } - - if (nonmatch_pos <= text.length) { - Text_t last_slice = Text$slice(text, I(nonmatch_pos+1), I(text.length)); - ret = Text$concat(ret, last_slice); - } - return ret; -} - -public Text_t Text$replace_all(Text_t text, Table_t replacements, Text_t backref_pat, bool recursive) -{ - return Text$replace_array(text, replacements.entries, backref_pat, recursive); -} - -public Array_t Text$split(Text_t text, Pattern_t pattern) -{ - if (text.length == 0) // special case - return (Array_t){.length=0}; - - if (pattern.length == 0) // special case - return Text$clusters(text); - - Array_t chunks = {}; - - int64_t i = 0; - for (;;) { - int64_t len = 0; - int64_t found = _find(text, pattern, i, text.length-1, &len, NULL); - if (found == i && len == 0) - found = _find(text, pattern, i + 1, text.length-1, &len, NULL); - if (found < 0) break; - Text_t chunk = Text$slice(text, I(i+1), I(found)); - Array$insert(&chunks, &chunk, I_small(0), sizeof(Text_t)); - i = MAX(found + len, i + 1); - } - - Text_t last_chunk = Text$slice(text, I(i+1), I(text.length)); - Array$insert(&chunks, &last_chunk, I_small(0), sizeof(Text_t)); - - return chunks; -} - -typedef struct { - TextIter_t state; - int64_t i; - Pattern_t pattern; -} split_iter_state_t; - -static OptionalText_t next_split(split_iter_state_t *state) -{ - Text_t text = state->state.stack[0].text; - if (state->i >= text.length) { - if (state->pattern.length > 0 && state->i == text.length) { // special case - state->i = text.length + 1; - return EMPTY_TEXT; - } - return NONE_TEXT; - } - - if (state->pattern.length == 0) { // special case - Text_t ret = Text$cluster(text, I(state->i+1)); - state->i += 1; - return ret; - } - - int64_t start = state->i; - int64_t len = 0; - int64_t found = _find(text, state->pattern, start, text.length-1, &len, NULL); - - if (found == start && len == 0) - found = _find(text, state->pattern, start + 1, text.length-1, &len, NULL); - - if (found >= 0) { - state->i = MAX(found + len, state->i + 1); - return Text$slice(text, I(start+1), I(found)); - } else { - state->i = state->state.stack[0].text.length + 1; - return Text$slice(text, I(start+1), I(text.length)); - } -} - -public Closure_t Text$by_split(Text_t text, Pattern_t pattern) -{ - return (Closure_t){ - .fn=(void*)next_split, - .userdata=new(split_iter_state_t, .state=NEW_TEXT_ITER_STATE(text), .i=0, .pattern=pattern), - }; -} - -public Pattern_t Pattern$escape_text(Text_t text) -{ - // TODO: optimize for spans of non-escaped text - Text_t ret = EMPTY_TEXT; - TextIter_t state = NEW_TEXT_ITER_STATE(text); - for (int64_t i = 0; i < text.length; i++) { - uint32_t g = Text$get_main_grapheme_fast(&state, i); - if (g == '{') { - ret = Text$concat(ret, Text("{1{}")); - } else if (g == '?' - || uc_is_property_quotation_mark(g) - || (uc_is_property_paired_punctuation(g) && uc_is_property_left_of_pair(g))) { - ret = Text$concat(ret, Text("{1"), Text$slice(text, I(i+1), I(i+1)), Text("}")); - } else { - ret = Text$concat(ret, Text$slice(text, I(i+1), I(i+1))); - } - } - return ret; -} - -static Text_t Pattern$as_text(const void *obj, bool colorize, const TypeInfo_t *info) -{ - (void)info; - if (!obj) return Text("Pattern"); - - Pattern_t pat = *(Pattern_t*)obj; - Text_t quote = Text$has(pat, Pattern("/")) && !Text$has(pat, Pattern("|")) ? Text("|") : Text("/"); - return Text$concat( colorize ? Text("\x1b[1m$\033[m") : Text("$"), Text$quoted(pat, colorize, quote)); -} - -public const TypeInfo_t Pattern$info = { - .size=sizeof(Pattern_t), - .align=__alignof__(Pattern_t), - .tag=TextInfo, - .TextInfo={.lang="Pattern"}, - .metamethods={ - .as_text=Pattern$as_text, - .hash=Text$hash, - .compare=Text$compare, - .equal=Text$equal, - .is_none=Text$is_none, - .serialize=Text$serialize, - .deserialize=Text$deserialize, - }, -}; - -static const TypeInfo_t _text_array = { - .size=sizeof(Array_t), - .align=__alignof__(Array_t), - .tag=ArrayInfo, - .ArrayInfo.item=&Text$info, - .metamethods=Array$metamethods, -}; - -static NamedType_t _match_fields[3] = { - {"text", &Text$info}, - {"index", &Int$info}, - {"captures", &_text_array}, -}; - -static bool Match$is_none(const void *m, const TypeInfo_t*) -{ - return ((OptionalMatch_t*)m)->index.small == 0; -} - -public const TypeInfo_t Match$info = { - .size=sizeof(Match_t), - .align=__alignof__(Match_t), - .tag=StructInfo, - .StructInfo={ - .name="Match", - .num_fields=3, - .fields=_match_fields, - }, - .metamethods={ - .as_text=Struct$as_text, - .hash=Struct$hash, - .compare=Struct$compare, - .equal=Struct$equal, - .is_none=Match$is_none, - }, -}; - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/src/stdlib/patterns.h b/src/stdlib/patterns.h deleted file mode 100644 index 2b77e490..00000000 --- a/src/stdlib/patterns.h +++ /dev/null @@ -1,46 +0,0 @@ -#pragma once - -// The type representing text patterns for pattern matching. - -#include <stdbool.h> -#include <stdint.h> - -#include "datatypes.h" -#include "integers.h" -#include "optionals.h" -#include "types.h" - -#define Pattern(text) ((Pattern_t)Text(text)) -#define Patterns(...) ((Pattern_t)Texts(__VA_ARGS__)) - -typedef struct { - Text_t text; - Int_t index; - Array_t captures; -} Match_t; - -typedef Match_t OptionalMatch_t; -#define NONE_MATCH ((OptionalMatch_t){.index=NONE_INT}) - -Text_t Text$replace(Text_t str, Pattern_t pat, Text_t replacement, Pattern_t backref_pat, bool recursive); -Pattern_t Pattern$escape_text(Text_t text); -Text_t Text$replace_all(Text_t text, Table_t replacements, Pattern_t backref_pat, bool recursive); -Array_t Text$split(Text_t text, Pattern_t pattern); -Closure_t Text$by_split(Text_t text, Pattern_t pattern); -Text_t Text$trim(Text_t text, Pattern_t pattern, bool trim_left, bool trim_right); -OptionalMatch_t Text$find(Text_t text, Pattern_t pattern, Int_t i); -Array_t Text$find_all(Text_t text, Pattern_t pattern); -Closure_t Text$by_match(Text_t text, Pattern_t pattern); -PUREFUNC bool Text$has(Text_t text, Pattern_t pattern); -OptionalArray_t Text$matches(Text_t text, Pattern_t pattern); -Text_t Text$map(Text_t text, Pattern_t pattern, Closure_t fn, bool recursive); -void Text$each(Text_t text, Pattern_t pattern, Closure_t fn, bool recursive); - -#define Pattern$hash Text$hash -#define Pattern$compare Text$compare -#define Pattern$equal Text$equal - -extern const TypeInfo_t Match$info; -extern const TypeInfo_t Pattern$info; - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/src/stdlib/stdlib.c b/src/stdlib/stdlib.c index fc94bc97..77383cd1 100644 --- a/src/stdlib/stdlib.c +++ b/src/stdlib/stdlib.c @@ -20,7 +20,6 @@ #include "optionals.h" #include "metamethods.h" #include "nums.h" -#include "patterns.h" #include "paths.h" #include "rng.h" #include "siphash.h" diff --git a/src/stdlib/text.c b/src/stdlib/text.c index 27acdfa4..621de942 100644 --- a/src/stdlib/text.c +++ b/src/stdlib/text.c @@ -998,17 +998,22 @@ PUREFUNC public int32_t Text$compare(const void *va, const void *vb, const TypeI return 0; } +bool _matches(TextIter_t *text_state, TextIter_t *target_state, int64_t pos) +{ + for (int64_t i = 0; i < target_state->stack[0].text.length; i++) { + int32_t text_i = Text$get_grapheme_fast(text_state, pos + i); + int32_t prefix_i = Text$get_grapheme_fast(target_state, i); + if (text_i != prefix_i) return false; + } + return true; +} + PUREFUNC public bool Text$starts_with(Text_t text, Text_t prefix) { if (text.length < prefix.length) return false; TextIter_t text_state = NEW_TEXT_ITER_STATE(text), prefix_state = NEW_TEXT_ITER_STATE(prefix); - for (int64_t i = 0; i < prefix.length; i++) { - int32_t text_i = Text$get_grapheme_fast(&text_state, i); - int32_t prefix_i = Text$get_grapheme_fast(&prefix_state, i); - if (text_i != prefix_i) return false; - } - return true; + return _matches(&text_state, &prefix_state, 0); } PUREFUNC public bool Text$ends_with(Text_t text, Text_t suffix) @@ -1016,12 +1021,236 @@ PUREFUNC public bool Text$ends_with(Text_t text, Text_t suffix) if (text.length < suffix.length) return false; TextIter_t text_state = NEW_TEXT_ITER_STATE(text), suffix_state = NEW_TEXT_ITER_STATE(suffix); - for (int64_t i = 0; i < suffix.length; i++) { - int32_t text_i = Text$get_grapheme_fast(&text_state, text.length - suffix.length + i); - int32_t suffix_i = Text$get_grapheme_fast(&suffix_state, i); - if (text_i != suffix_i) return false; + return _matches(&text_state, &suffix_state, text.length - suffix.length); +} + +public Text_t Text$without_prefix(Text_t text, Text_t prefix) +{ + return Text$starts_with(text, prefix) ? Text$slice(text, I(prefix.length + 1), I(text.length)) : text; +} + +public Text_t Text$without_suffix(Text_t text, Text_t suffix) +{ + return Text$ends_with(text, suffix) ? Text$slice(text, I(1), I(text.length - suffix.length)) : text; +} + +static bool _has_grapheme(TextIter_t *text, int32_t g) +{ + for (int64_t t = 0; t < text->stack[0].text.length; t++) { + if (g == Text$get_grapheme_fast(text, t)) { + return true; + } } - return true; + return false; +} + +public Text_t Text$trim(Text_t text, Text_t to_trim, bool left, bool right) +{ + int64_t first = 0; + TextIter_t text_state = NEW_TEXT_ITER_STATE(text), trim_state = NEW_TEXT_ITER_STATE(to_trim); + if (left) { + while (first < text.length && _has_grapheme(&trim_state, Text$get_grapheme_fast(&text_state, first))) { + first += 1; + } + } + int64_t last = text.length-1; + if (right) { + while (last >= first && _has_grapheme(&trim_state, Text$get_grapheme_fast(&text_state, last))) { + last -= 1; + } + } + return (first != 0 || last != text.length-1) ? Text$slice(text, I(first+1), I(last+1)) : text; +} + +public Text_t Text$translate(Text_t text, Table_t translations) +{ + TextIter_t text_state = NEW_TEXT_ITER_STATE(text); + Text_t result = EMPTY_TEXT; + int64_t span_start = 0; + Array_t replacement_array = translations.entries; + for (int64_t i = 0; i < text.length; ) { + for (int64_t r = 0; r < replacement_array.length; r++) { + struct { Text_t target, replacement; } *entry = replacement_array.data + r*replacement_array.stride; + TextIter_t target_state = NEW_TEXT_ITER_STATE(entry->target); + if (_matches(&text_state, &target_state, i)) { + if (i > span_start) + result = concat2(result, Text$slice(text, I(span_start+1), I(i))); + + result = concat2(result, entry->replacement); + i += entry->target.length; + span_start = i; + goto found_match; + } + } + i += 1; + found_match: continue; + } + if (span_start < text.length) + result = concat2(result, Text$slice(text, I(span_start+1), I(text.length))); + return result; +} + +public Text_t Text$replace(Text_t text, Text_t target, Text_t replacement) +{ + TextIter_t text_state = NEW_TEXT_ITER_STATE(text), target_state = NEW_TEXT_ITER_STATE(target); + Text_t result = EMPTY_TEXT; + int64_t span_start = 0; + for (int64_t i = 0; i < text.length; ) { + if (_matches(&text_state, &target_state, i)) { + if (i > span_start) + result = concat2(result, Text$slice(text, I(span_start+1), I(i))); + + result = concat2(result, replacement); + i += target.length; + span_start = i; + } else { + i += 1; + } + } + if (span_start < text.length) + result = concat2(result, Text$slice(text, I(span_start+1), I(text.length))); + return result; +} + +public bool Text$has(Text_t text, Text_t target) +{ + TextIter_t text_state = NEW_TEXT_ITER_STATE(text), target_state = NEW_TEXT_ITER_STATE(target); + for (int64_t i = 0; i < text.length; i++) { + if (_matches(&text_state, &target_state, i)) + return true; + } + return false; +} + +public Array_t Text$split(Text_t text, Text_t delimiters) +{ + if (delimiters.length == 0) + return Text$clusters(text); + + TextIter_t text_state = NEW_TEXT_ITER_STATE(text), delim_state = NEW_TEXT_ITER_STATE(delimiters); + Array_t splits = {}; + for (int64_t i = 0; i < text.length; ) { + int64_t span_len = 0; + while (i + span_len < text.length && !_matches(&text_state, &delim_state, i + span_len)) { + span_len += 1; + } + Text_t slice = Text$slice(text, I(i+1), I(i+span_len)); + Array$insert(&splits, &slice, I(0), sizeof(slice)); + i += span_len + delimiters.length; + if (i == text.length) { + Text_t empty = Text(""); + Array$insert(&splits, &empty, I(0), sizeof(empty)); + } + } + return splits; +} + +public Array_t Text$split_any(Text_t text, Text_t delimiters) +{ + if (delimiters.length == 0) + return Array(text); + + TextIter_t text_state = NEW_TEXT_ITER_STATE(text), delim_state = NEW_TEXT_ITER_STATE(delimiters); + Array_t splits = {}; + for (int64_t i = 0; i < text.length; ) { + int64_t span_len = 0; + while (i + span_len < text.length && !_has_grapheme(&delim_state, Text$get_grapheme_fast(&text_state, i + span_len))) { + span_len += 1; + } + bool trailing_delim = i + span_len < text.length; + Text_t slice = Text$slice(text, I(i+1), I(i+span_len)); + Array$insert(&splits, &slice, I(0), sizeof(slice)); + i += span_len + 1; + while (i < text.length && _has_grapheme(&delim_state, Text$get_grapheme_fast(&text_state, i))) { + i += 1; + } + if (i >= text.length && trailing_delim) { + Text_t empty = Text(""); + Array$insert(&splits, &empty, I(0), sizeof(empty)); + } + } + return splits; +} + +typedef struct { + TextIter_t state; + int64_t i; + Text_t delimiter; +} split_iter_state_t; + +static OptionalText_t next_split(split_iter_state_t *state) +{ + Text_t text = state->state.stack[0].text; + if (state->i >= text.length) { + if (state->delimiter.length > 0 && state->i == text.length) { // special case + state->i = text.length + 1; + return EMPTY_TEXT; + } + return NONE_TEXT; + } + + if (state->delimiter.length == 0) { // special case + state->i = text.length + 1; + return text; + } + + TextIter_t delim_state = NEW_TEXT_ITER_STATE(state->delimiter); + int64_t i = state->i; + int64_t span_len = 0; + while (i + span_len < text.length && !_matches(&state->state, &delim_state, i + span_len)) { + span_len += 1; + } + Text_t slice = Text$slice(text, I(i+1), I(i+span_len)); + state->i = i + span_len + state->delimiter.length; + return slice; +} + +public Closure_t Text$by_split(Text_t text, Text_t delimiter) +{ + return (Closure_t){ + .fn=(void*)next_split, + .userdata=new(split_iter_state_t, .state=NEW_TEXT_ITER_STATE(text), .i=0, .delimiter=delimiter), + }; +} + +static OptionalText_t next_split_any(split_iter_state_t *state) +{ + Text_t text = state->state.stack[0].text; + if (state->i >= text.length) { + if (state->delimiter.length > 0 && state->i == text.length) { // special case + state->i = text.length + 1; + return EMPTY_TEXT; + } + return NONE_TEXT; + } + + if (state->delimiter.length == 0) { // special case + Text_t ret = Text$cluster(text, I(state->i+1)); + state->i += 1; + return ret; + } + + TextIter_t delim_state = NEW_TEXT_ITER_STATE(state->delimiter); + int64_t i = state->i; + int64_t span_len = 0; + while (i + span_len < text.length && !_has_grapheme(&delim_state, Text$get_grapheme_fast(&state->state, i + span_len))) { + span_len += 1; + } + Text_t slice = Text$slice(text, I(i+1), I(i+span_len)); + i += span_len + 1; + while (i < text.length && _has_grapheme(&delim_state, Text$get_grapheme_fast(&state->state, i))) { + i += 1; + } + state->i = i; + return slice; +} + +public Closure_t Text$by_split_any(Text_t text, Text_t delimiters) +{ + return (Closure_t){ + .fn=(void*)next_split_any, + .userdata=new(split_iter_state_t, .state=NEW_TEXT_ITER_STATE(text), .i=0, .delimiter=delimiters), + }; } PUREFUNC public bool Text$equal_values(Text_t a, Text_t b) diff --git a/src/stdlib/text.h b/src/stdlib/text.h index 4acca8a2..662c6e5f 100644 --- a/src/stdlib/text.h +++ b/src/stdlib/text.h @@ -50,6 +50,16 @@ Text_t Text$as_text(const void *text, bool colorize, const TypeInfo_t *info); Text_t Text$quoted(Text_t str, bool colorize, Text_t quotation_mark); PUREFUNC bool Text$starts_with(Text_t text, Text_t prefix); PUREFUNC bool Text$ends_with(Text_t text, Text_t suffix); +Text_t Text$without_prefix(Text_t text, Text_t prefix); +Text_t Text$without_suffix(Text_t text, Text_t suffix); +Text_t Text$replace(Text_t text, Text_t target, Text_t replacement); +Text_t Text$translate(Text_t text, Table_t translations); +bool Text$has(Text_t text, Text_t target); +Array_t Text$split(Text_t text, Text_t delimiter); +Array_t Text$split_any(Text_t text, Text_t delimiters); +Closure_t Text$by_split(Text_t text, Text_t delimiter); +Closure_t Text$by_split_any(Text_t text, Text_t delimiters); +Text_t Text$trim(Text_t text, Text_t to_trim, bool left, bool right); char *Text$as_c_string(Text_t text); __attribute__((format(printf, 1, 2))) public Text_t Text$format(const char *fmt, ...); diff --git a/src/stdlib/tomo.h b/src/stdlib/tomo.h index 4aa1253d..e42b562e 100644 --- a/src/stdlib/tomo.h +++ b/src/stdlib/tomo.h @@ -20,7 +20,6 @@ #include "nums.h" #include "optionals.h" #include "paths.h" -#include "patterns.h" #include "pointers.h" #include "print.h" #include "rng.h" @@ -21,7 +21,6 @@ #include "stdlib/datatypes.h" #include "stdlib/integers.h" #include "stdlib/optionals.h" -#include "stdlib/patterns.h" #include "stdlib/paths.h" #include "stdlib/print.h" #include "stdlib/text.h" @@ -294,7 +293,12 @@ int main(int argc, char *argv[]) Text_t escape_lib_name(Text_t lib_name) { - return Text$replace(lib_name, Pattern("{1+ !alphanumeric}"), Text("_"), Pattern(""), false); + char *libname_id = String(lib_name); + for (char *p = libname_id; *p; p++) { + if (!isalnum(*p) && *p != '_') + *p = '_'; + } + return Text$from_str(libname_id); } Path_t build_file(Path_t path, const char *extension) diff --git a/src/typecheck.c b/src/typecheck.c index 0bfe6a07..ff609435 100644 --- a/src/typecheck.c +++ b/src/typecheck.c @@ -12,7 +12,6 @@ #include "cordhelpers.h" #include "environment.h" #include "parse.h" -#include "stdlib/patterns.h" #include "stdlib/paths.h" #include "stdlib/tables.h" #include "stdlib/text.h" @@ -195,8 +194,11 @@ static env_t *load_module(env_t *env, ast_t *module_ast) env_t *module_env = fresh_scope(env); Table$str_set(env->imports, use->path, module_env); - char *libname_id = Text$as_c_string( - Text$replace(Text$from_str(use->path), Pattern("{1+ !alphanumeric}"), Text("_"), Pattern(""), false)); + char *libname_id = String(use->path); + for (char *p = libname_id; *p; p++) { + if (!isalnum(*p) && *p != '_') + *p = '_'; + } module_env->libname = libname_id; for (size_t i = 0; i < tm_files.gl_pathc; i++) { const char *filename = tm_files.gl_pathv[i]; @@ -269,6 +271,14 @@ void prebind_statement(env_t *env, ast_t *statement) prebind_statement(ns_env, stmt->ast); break; } + case Extend: { + auto extend = Match(statement, Extend); + env_t *ns_env = namespace_env(env, extend->name); + ns_env->libname = env->libname; + for (ast_list_t *stmt = extend->body ? Match(extend->body, Block)->statements : NULL; stmt; stmt = stmt->next) + prebind_statement(ns_env, stmt->ast); + break; + } default: break; } } @@ -435,6 +445,14 @@ void bind_statement(env_t *env, ast_t *statement) bind_statement(ns_env, stmt->ast); break; } + case Extend: { + auto extend = Match(statement, Extend); + env_t *ns_env = namespace_env(env, extend->name); + ns_env->libname = env->libname; + for (ast_list_t *stmt = extend->body ? Match(extend->body, Block)->statements : NULL; stmt; stmt = stmt->next) + bind_statement(ns_env, stmt->ast); + break; + } case Use: { env_t *module_env = load_module(env, statement); if (!module_env) break; @@ -940,7 +958,7 @@ type_t *get_type(env_t *env, ast_t *ast) // Early out if the type is knowable without any context from the block: switch (last->ast->tag) { - case UpdateAssign: case Assign: case Declare: case FunctionDef: case ConvertDef: case StructDef: case EnumDef: case LangDef: + case UpdateAssign: case Assign: case Declare: case FunctionDef: case ConvertDef: case StructDef: case EnumDef: case LangDef: case Extend: return Type(VoidType); default: break; } @@ -1240,7 +1258,7 @@ type_t *get_type(env_t *env, ast_t *ast) return Type(ClosureType, Type(FunctionType, .args=args, .ret=ret)); } - case FunctionDef: case ConvertDef: case StructDef: case EnumDef: case LangDef: { + case FunctionDef: case ConvertDef: case StructDef: case EnumDef: case LangDef: case Extend: { return Type(VoidType); } @@ -1399,7 +1417,7 @@ PUREFUNC bool is_discardable(env_t *env, ast_t *ast) { switch (ast->tag) { case UpdateAssign: case Assign: case Declare: case FunctionDef: case ConvertDef: case StructDef: case EnumDef: - case LangDef: case Use: + case LangDef: case Use: case Extend: return true; default: break; } |
