diff --git a/docs/langs.md b/docs/langs.md index 0e6242d..d7225ae 100644 --- a/docs/langs.md +++ b/docs/langs.md @@ -11,12 +11,12 @@ where a different type of string is needed. ```tomo lang HTML: convert(t:Text -> HTML): - t = t:replace_all({ - $/&/ = "&amp;", - $/</ = "&lt;", - $/>/ = "&gt;", - $/"/ = "&quot;", - $/'/ = "&#39;", + t = t:translate({ + "&" = "&amp;", + "<" = "&lt;", + ">" = "&gt;", + '"' = "&quot;", + "'" = "&#39;", }) return HTML.from_text(t) @@ -75,7 +75,7 @@ instead of building a global function called `execute()` that takes a ```tomo lang Sh: convert(text:Text -> Sh): - return Sh.from_text("'" ++ text:replace($/'/, "''") ++ "'") + return Sh.from_text("'" ++ text:replace("'", "''") ++ "'") func execute(sh:Sh -> Text): ... @@ -94,7 +94,7 @@ another type's block or at the top level. ```tomo lang Sh: convert(text:Text -> Sh): - return Sh.from_text("'" ++ text:replace($/'/, "''") ++ "'") + return Sh.from_text("'" ++ text:replace("'", "''") ++ "'") struct Foo(x,y:Int): convert(f:Foo -> Sh): diff --git a/docs/text.md b/docs/text.md index c1b2cab..62d2315 100644 --- a/docs/text.md +++ b/docs/text.md @@ -273,43 +273,43 @@ pattern documentation](patterns.md) for more details. 
- [`func as_c_string(text: Text -> CString)`](#as_c_string) - [`func at(text: Text, index: Int -> Text)`](#at) - [`func by_line(text: Text -> func(->Text?))`](#by_line) -- [`func by_match(text: Text, pattern: Pattern -> func(->Match?))`](#by_match) -- [`func by_split(text: Text, pattern: Pattern = $// -> func(->Text?))`](#by_split) +- [`func by_split(text: Text, delimiter: Text = "" -> func(->Text?))`](#by_split) +- [`func by_split_any(text: Text, delimiters: Text = " $\t\r\n" -> func(->Text?))`](#by_split_any) - [`func bytes(text: Text -> [Byte])`](#bytes) - [`func caseless_equals(a: Text, b:Text, language:Text = "C" -> Bool)`](#caseless_equals) - [`func codepoint_names(text: Text -> [Text])`](#codepoint_names) -- [`func each(text: Text, pattern: Pattern, fn: func(m: Match), recursive: Bool = yes -> Int?)`](#each) - [`func ends_with(text: Text, suffix: Text -> Bool)`](#ends_with) -- [`func find(text: Text, pattern: Pattern, start: Int = 1 -> Int?)`](#find) -- [`func find_all(text: Text, pattern: Pattern -> [Match])`](#find_all) - [`func from(text: Text, first: Int -> Text)`](#from) -- [`func from_codepoint_names(codepoints: [Int32] -> [Text])`](#from_bytes) +- [`func from_bytes(bytes: [Byte] -> [Text])`](#from_bytes) - [`func from_c_string(str: CString -> Text)`](#from_c_string) - [`func from_codepoint_names(codepoint_names: [Text] -> [Text])`](#from_codepoint_names) -- [`func from_codepoint_names(codepoints: [Int32] -> [Text])`](#from_codepoints) -- [`func has(text: Text, pattern: Pattern -> Bool)`](#has) +- [`func from_codepoints(codepoints: [Int32] -> [Text])`](#from_codepoints) +- [`func has(text: Text, target: Text -> Bool)`](#has) - [`func join(glue: Text, pieces: [Text] -> Text)`](#join) -- [`func split(text: Text -> [Text])`](#lines) +- [`func split(text: Text, delimiter: Text = "" -> [Text])`](#split) +- [`func split_any(text: Text, delimiters: Text = " $\t\r\n" -> [Text])`](#split_any) - [`func middle_pad(text: Text, width: Int, pad: Text = " ", 
language: Text = "C" -> Text)`](#middle_pad) - [`func left_pad(text: Text, width: Int, pad: Text = " ", language: Text = "C" -> Text)`](#left_pad) -- [`func lines(text: Text, pattern: Pattern = "" -> [Text])`](#lines) +- [`func lines(text: Text -> [Text])`](#lines) - [`func lower(text: Text, language: Text = "C" -> Text)`](#lower) -- [`func map(text: Text, pattern: Pattern, fn: func(text:Match)->Text -> Text, recursive: Bool = yes)`](#map) -- [`func matches(text: Text, pattern: Pattern -> [Text])`](#matches) - [`func quoted(text: Text, color: Bool = no, quotation_mark: Text = '"' -> Text)`](#quoted) - [`func repeat(text: Text, count:Int -> Text)`](#repeat) -- [`func replace(text: Text, pattern: Pattern, replacement: Text, backref: Pattern = $/\/, recursive: Bool = yes -> Text)`](#replace) -- [`func replace_all(replacements:{Pattern,Text}, backref: Pattern = $/\/, recursive: Bool = yes -> Text)`](#replace_all) +- [`func replace(text: Text, target: Text, replacement: Text -> Text)`](#replace) - [`func reversed(text: Text -> Text)`](#reversed) - [`func right_pad(text: Text, width: Int, pad: Text = " ", language: Text = "C" -> Text)`](#right_pad) - [`func slice(text: Text, from: Int = 1, to: Int = -1 -> Text)`](#slice) - [`func starts_with(text: Text, prefix: Text -> Bool)`](#starts_with) - [`func title(text: Text, language: Text = "C" -> Text)`](#title) - [`func to(text: Text, last: Int -> Text)`](#to) -- [`func trim(text: Text, pattern: Pattern = $/{whitespace/, trim_left: Bool = yes, trim_right: Bool = yes -> Text)`](#trim) +- [`func translate(translations:{Text,Text} -> Text)`](#translate) +- [`func trim(text: Text, to_trim: Text = " $\t\r\n", left: Bool = yes, right: Bool = yes -> Text)`](#trim) - [`func upper(text: Text, language: Text "C" -> Text)`](#upper) - [`func utf32_codepoints(text: Text -> [Int32])`](#utf32_codepoints) - [`func width(text: Text -> Int)`](#width) +- [`func without_prefix(text: Text, prefix: Text -> Text)`](#without_prefix) +- [`func 
without_suffix(text: Text, suffix: Text -> Text)`](#without_suffix) + +---------------- ### `as_c_string` Converts a `Text` value to a C-style string. @@ -411,24 +411,53 @@ for match in text:by_match($/{alpha}/): ### `by_split` Returns an iterator function that can be used to iterate over text separated by -a pattern. +a delimiter. +**Note:** to split based on a set of delimiters, use [`by_split_any()`](#by_split_any). ```tomo -func by_split(text: Text, pattern: Pattern = $// -> func(->Text?)) +func by_split(text: Text, delimiter: Text = "" -> func(->Text?)) ``` - `text`: The text to be iterated over in pattern-delimited chunks. -- `pattern`: The [pattern](patterns.md) to split the text on. +- `delimiter`: An exact delimiter to use for splitting the text. If an empty text + is given, then each split will be a single grapheme cluster of the text. **Returns:** An iterator function that returns one chunk of text at a time, separated by the -given pattern, until it runs out and returns `none`. **Note:** using an empty -pattern (the default) will iterate over single grapheme clusters in the text. +given delimiter, until it runs out and returns `none`. **Note:** using an empty +delimiter (the default) will iterate over single grapheme clusters in the text. **Example:** ```tomo text := "one,two,three" -for chunk in text:by_split($/,/): +for chunk in text:by_split(","): + # Prints: "one" then "two" then "three": + say(chunk) +``` + +--- + +### `by_split_any` +Returns an iterator function that can be used to iterate over text separated by +one or more characters (grapheme clusters) from a given text of delimiters. +**Note:** to split based on an exact delimiter, use [`by_split()`](#by_split). + +```tomo +func by_split_any(text: Text, delimiters: Text = " $\t\r\n" -> func(->Text?)) +``` + +- `text`: The text to be iterated over in delimiter-separated chunks. +- `delimiters`: A text containing multiple delimiter characters (grapheme clusters) + to use for splitting the text. 
+ +**Returns:** +An iterator function that returns one chunk of text at a time, separated by the +given delimiter characters, until it runs out and returns `none`. + +**Example:** +```tomo +text := "one,two,;,three" +for chunk in text:by_split_any(",;"): # Prints: "one" then "two" then "three": say(chunk) ``` @@ -628,7 +657,7 @@ func from(text: Text, first: Int -> Text) The text from the given grapheme cluster to the end of the text. Note: a negative index counts backwards from the end of the text, so `-1` refers to the last cluster, `-2` the second-to-last, etc. Slice ranges will be truncated to -the length of the string. +the length of the text. **Example:** ```tomo @@ -647,10 +676,10 @@ text will be normalized, so the resulting text's UTF8 bytes may not exactly match the input. ```tomo -func from_codepoint_names(codepoints: [Int32] -> [Text]) +func from_bytes(bytes: [Byte] -> [Text]) ``` -- `codepoints`: The UTF32 codepoints in the desired text. +- `bytes`: The UTF-8 bytes of the desired text. **Returns:** A new text based on the input UTF8 bytes after normalization has been applied. @@ -717,7 +746,7 @@ the text will be normalized, so the resulting text's codepoints may not exactly match the input codepoints. ```tomo -func from_codepoint_names(codepoints: [Int32] -> [Text]) +func from_codepoints(codepoints: [Int32] -> [Text]) ``` - `codepoints`: The UTF32 codepoints in the desired text. @@ -734,28 +763,24 @@ A new text with the specified codepoints after normalization has been applied. --- ### `has` -Checks if the `Text` contains a target [pattern](patterns.md). +Checks if the `Text` contains some target text. ```tomo -func has(text: Text, pattern: Pattern -> Bool) +func has(text: Text, target: Text -> Bool) ``` - `text`: The text to be searched. -- `pattern`: The [pattern](patterns.md) to search for. +- `target`: The text to search for. **Returns:** -`yes` if the target pattern is found, `no` otherwise. +`yes` if the target text is found, `no` otherwise. 
**Example:** ```tomo ->> "hello world":has($/wo/) +>> "hello world":has("wo") = yes ->> "hello world":has($/{alpha}/) -= yes ->> "hello world":has($/{digit}/) +>> "hello world":has("xxx") = no ->> "hello world":has($/{start}he/) -= yes ``` --- @@ -888,63 +913,8 @@ The lowercase version of the text. --- -### `map` -For each occurrence of the given [pattern](patterns.md), replace the text with -the result of calling the given function on that match. - -```tomo -func map(text: Text, pattern: Pattern, fn: func(text:Match)->Text -> Text, recursive: Bool = yes) -``` - -- `text`: The text to be searched. -- `pattern`: The [pattern](patterns.md) to search for. -- `fn`: The function to apply to each match. -- `recursive`: Whether to recursively map `fn` to each of the captures of the - pattern before handing them to `fn`. - -**Returns:** -The text with the matching parts replaced with the result of applying the given -function to each. - -**Example:** -```tomo ->> "hello world":map($/world/, func(m:Match): m.text:upper()) -= "hello WORLD" ->> "Some nums: 1 2 3 4":map($/{int}/, func(m:Match): "$(Int.parse(m.text)! + 10)") -= "Some nums: 11 12 13 14" -``` - ---- - -### `matches` -Checks if the `Text` matches target [pattern](patterns.md) and returns an array -of the matching text captures or a null value if the entire text doesn't match -the pattern. - -```tomo -func matches(text: Text, pattern: Pattern -> [Text]) -``` - -- `text`: The text to be searched. -- `pattern`: The [pattern](patterns.md) to search for. - -**Returns:** -An array of the matching text captures if the entire text matches the pattern, -or a null value otherwise. - -**Example:** -```tomo ->> "hello world":matches($/{id}/) -= none : [Text]? - ->> "hello world":matches($/{id} {id}/) -= ["hello", "world"] : [Text]? -``` - ---- - ### `quoted` -Formats the text as a quoted string. +Formats the text with quotation marks and escapes. 
```tomo func quoted(text: Text, color: Bool = no, quotation_mark: Text = '"' -> Text) @@ -955,7 +925,7 @@ func quoted(text: Text, color: Bool = no, quotation_mark: Text = '"' -> Text) - `quotation_mark`: The quotation mark to use (default is `"`). **Returns:** -The text formatted as a quoted string. +The text formatted as a quoted text. **Example:** ```tomo @@ -987,106 +957,23 @@ The text repeated the given number of times. --- ### `replace` -Replaces occurrences of a [pattern](patterns.md) in the text with a replacement -string. +Replaces occurrences of a target text with a replacement text. ```tomo -func replace(text: Text, pattern: Pattern, replacement: Text, backref: Pattern = $/\/, recursive: Bool = yes -> Text) +func replace(text: Text, target: Text, replacement: Text -> Text) ``` - `text`: The text in which to perform replacements. -- `pattern`: The [pattern](patterns.md) to be replaced. -- `replacement`: The text to replace the pattern with. -- `backref`: If non-empty, the replacement text will have occurrences of this - pattern followed by a number replaced with the corresponding backreference. - By default, the backreference pattern is a single backslash, so - backreferences look like `\0`, `\1`, etc. -- `recursive`: For backreferences of a nested capture, if recursive is set to - `yes`, then the whole replacement will be reapplied recursively to the - backreferenced text if it's used in the replacement. - -**Backreferences** -If a backreference pattern is in the replacement, then that backreference is -replaced with the corresponding group from the matching text. Backreference -`0` is the entire matching text, backreference `1` is the first matched group, -and so on. Literal text is not captured for backreferences, only named group -captures (`{foo}`), quoted captures (`"?"`), and nested group captures (`(?)`). -For quoted and nested group captures, the backreference refers to the *inside* -of the capture without the enclosing punctuation. 
- -If you need to insert a digit immediately after a backreference, you can use an -optional semicolon: `\1;2` (backref 1, followed by the replacement text`"2"`). +- `target`: The target text to be replaced. +- `replacement`: The text to replace the target with. **Returns:** -The text with occurrences of the pattern replaced. +The text with occurrences of the target replaced. **Example:** ```tomo ->> "Hello world":replace($/world/, "there") +>> "Hello world":replace("world", "there") = "Hello there" - ->> "Hello world":replace($/{id}/, "xxx") -= "xxx xxx" - ->> "Hello world":replace($/{id}/, "\0") -= "(Hello) (world)" - ->> "Hello world":replace($/{id}/, "(@0)", backref=$/@/) -= "(Hello) (world)" - ->> "Hello world":replace($/{id} {id}/, "just \2") -= "just world" - -# Recursive is the default behavior: ->> " BAD(x, BAD(y), z) ":replace($/BAD(?)/, "good(\1)", recursive=yes) -= " good(x, good(y), z) " - ->> " BAD(x, BAD(y), z) ":replace($/BAD(?)/, "good(\1)", recursive=no) -= " good(x, BAD(y), z) " -``` - ---- - -### `replace_all` -Takes a table mapping [patterns](patterns.md) to replacement texts and performs -all the replacements in the table on the whole text. At each position, the -first matching pattern's replacement is applied and the pattern matching moves -on to *after* the replacement text, so replacement text is not recursively -modified. See [`replace()`](#replace) for more information about replacement -behavior. - -```tomo -func replace_all(replacements:{Pattern,Text}, backref: Pattern = $/\/, recursive: Bool = yes -> Text) -``` - -- `text`: The text in which to perform replacements. -- `replacements`: A table mapping from [pattern](patterns.md) to the - replacement text associated with that pattern. -- `backref`: If non-empty, the replacement text will have occurrences of this - pattern followed by a number replaced with the corresponding backreference. 
- By default, the backreference pattern is a single backslash, so - backreferences look like `\0`, `\1`, etc. -- `recursive`: For backreferences of a nested capture, if recursive is set to - `yes`, then the matching replacement will be reapplied recursively to the - backreferenced text if it's used in the replacement. - -**Returns:** -The text with all occurrences of the patterns replaced with their corresponding -replacement text. - -**Example:** -```tomo ->> "A & an amperand":replace_all({ - $/&/ = "&", - $// = ">", - $/"/ = """, - $/'/ = "'", -} -= "A <tag> & an ampersand" - ->> "Hello":replace_all({$/{lower}/="[\0]", $/{upper}/="{\0}"}) -= "{H}[ello]" ``` --- @@ -1153,7 +1040,7 @@ func slice(text: Text, from: Int = 1, to: Int = -1 -> Text) The text that spans the given grapheme cluster indices. Note: a negative index counts backwards from the end of the text, so `-1` refers to the last cluster, `-2` the second-to-last, etc. Slice ranges will be truncated to the length of -the string. +the text. **Example:** ```tomo @@ -1170,32 +1057,51 @@ the string. --- ### `split` -Splits the text into an array of substrings based on a [pattern](patterns.md). +Splits the text into an array of substrings based on exact matches of a delimiter. +**Note:** to split based on a set of delimiter characters, use [`split_any()`](#split_any). ```tomo -func split(text: Text, pattern: Pattern = "" -> [Text]) +func split(text: Text, delimiter: Text = "" -> [Text]) ``` - `text`: The text to be split. -- `pattern`: The [pattern](patterns.md) used to split the text. If the pattern - is the empty string, the text will be split into individual grapheme clusters. +- `delimiter`: The delimiter used to split the text. If the delimiter is the + empty text, the text will be split into individual grapheme clusters. **Returns:** -An array of substrings resulting from the split. +An array of subtexts resulting from the split. 
**Example:** ```tomo ->> "one,two,three":split($/,/) -= ["one", "two", "three"] +>> "one,two,,three":split(",") += ["one", "two", "", "three"] >> "abc":split() = ["a", "b", "c"] +``` ->> "a b c":split($/{space}/) -= ["a", "b", "c"] +--- ->> "a,b,c,":split($/,/) -= ["a", "b", "c", ""] +### `split_any` +Splits the text into an array of substrings at one or more occurrences of a set +of delimiter characters (grapheme clusters). +**Note:** to split based on an exact delimiter, use [`split()`](#split). + +```tomo +func split_any(text: Text, delimiters: Text = " $\t\r\n" -> [Text]) +``` + +- `text`: The text to be split. +- `delimiters`: A text containing multiple delimiters to be used for + splitting the text into chunks. + +**Returns:** +An array of subtexts resulting from the split. + +**Example:** +```tomo +>> "one, two,,three":split_any(", ") += ["one", "two", "three"] ``` --- @@ -1260,7 +1166,7 @@ func to(text: Text, last: Int -> Text) The text up to and including the given grapheme cluster. Note: a negative index counts backwards from the end of the text, so `-1` refers to the last cluster, `-2` the second-to-last, etc. Slice ranges will be truncated to the length of -the string. +the text. **Example:** ```tomo @@ -1273,30 +1179,62 @@ the string. --- -### `trim` -Trims the matching [pattern](patterns.md) from the left and/or right side of the text. +### `translate` +Takes a table mapping target texts to their replacements and performs all the +replacements in the table on the whole text. At each position, the first +matching replacement is applied and the matching moves on to *after* the +replacement text, so replacement text is not recursively modified. See +[`replace()`](#replace) for more information about replacement behavior. ```tomo -func trim(text: Text, pattern: Pattern = $/{whitespace/, trim_left: Bool = yes, trim_right: Bool = yes -> Text) +func translate(translations:{Text,Text} -> Text) +``` + +- `text`: The text in which to perform replacements. 
+- `translations`: A table mapping from target text to its replacement. + +**Returns:** +The text with all occurrences of the target texts replaced with their corresponding +replacement text. + +**Example:** +```tomo +>> "A <tag> & an ampersand":translate({ + "&" = "&amp;", + "<" = "&lt;", + ">" = "&gt;", + '"' = "&quot;", + "'" = "&#39;", +}) += "A &lt;tag&gt; &amp; an ampersand" +``` + +--- + +### `trim` +Trims the given characters (grapheme clusters) from the left and/or right side of the text. + +```tomo +func trim(text: Text, to_trim: Text = " $\t\r\n", left: Bool = yes, right: Bool = yes -> Text) ``` - `text`: The text to be trimmed. -- `pattern`: The [pattern](patterns.md) that will be trimmed away. -- `trim_left`: Whether or not to trim from the front of the text. -- `trim_right`: Whether or not to trim from the back of the text. +- `to_trim`: The characters to remove from the left/right of the text. +- `left`: Whether or not to trim from the front of the text. +- `right`: Whether or not to trim from the back of the text. **Returns:** -The text without the trim pattern at either end. +The text without the trim characters at either end. **Example:** ```tomo >> " x y z $(\n)":trim() = "x y z" ->> "abc123def":trim($/{!digit}/) -= "123" +>> "one,":trim(",") += "one" ->> " xyz ":trim(trim_right=no) +>> " xyz ":trim(right=no) = "xyz " ``` @@ -1371,3 +1309,51 @@ An integer representing the display width of the text. >> "🤠":width() = 2 ``` + +--- + +### `without_prefix` +Returns the text with a given prefix removed (if present). + +```tomo +func without_prefix(text: Text, prefix: Text -> Text) +``` + +- `text`: The text to remove the prefix from. +- `prefix`: The prefix to remove. + +**Returns:** +A text without the given prefix (if present) or the unmodified text if the +prefix is not present. + +**Example:** +```tomo +>> "foo:baz":without_prefix("foo:") += "baz" +>> "qux":without_prefix("foo:") += "qux" +``` + +--- + +### `without_suffix` +Returns the text with a given suffix removed (if present). 
+ +```tomo +func without_suffix(text: Text, suffix: Text -> Text) +``` + +- `text`: The text to remove the suffix from. +- `suffix`: The suffix to remove. + +**Returns:** +A text without the given suffix (if present) or the unmodified text if the +suffix is not present. + +**Example:** +```tomo +>> "baz.foo":without_suffix(".foo") += "baz" +>> "qux":without_suffix(".foo") += "qux" +``` diff --git a/examples/colorful/README.md b/examples/colorful/README.md index 5f6ef8a..b504a73 100644 --- a/examples/colorful/README.md +++ b/examples/colorful/README.md @@ -62,32 +62,3 @@ $Colorful" We have @(green,bold:colors)! ":print() ``` - -You can very easily introduce your own syntax highlighting for a custom DSL: - -```tomo -lang Markdown: - func Colorful(md:Markdown -> Colorful): - text := md.text:replace_all({ - $/@/="@(at)", - $/(/="@(lparen)", - $/)/="@(rparen)", - $/**{..}**/="@(b:\1)", - $/*{..}*/="@(i:\1)", - $/[?](?)/="@(blue,underline:\1)", - }) - return Colorful.from_text(text) - - func colorful(md:Markdown -> Colorful): - return $Colorful"$md" -... - -md := $Markdown" - This is [a link with **bold** inside](example.com)! -" ->> colorful := md:colorful() -= $Colorful"This is @(blue,underline:a link with @(b:bold) inside)!" ->> colorful:for_terminal() -= "$\e[mThis is $\e[4;34ma link with $\e[1mbold$\e[22m inside$\e[24;39m!" 
-colorful:print() -``` diff --git a/examples/colorful/colorful.tm b/examples/colorful/colorful.tm index 929c240..e138e03 100644 --- a/examples/colorful/colorful.tm +++ b/examples/colorful/colorful.tm @@ -8,7 +8,7 @@ CSI := "$\033[" lang Colorful: convert(text:Text -> Colorful): - text = text:replace_all({$/@/="@(at)", $/(/="@(lparen)", $/)/="@(rparen)"}) + text = text:translate({"@"="@(at)", "("="@(lparen)", ")"="@(rparen)"}) return Colorful.from_text(text) convert(i:Int -> Colorful): return Colorful.from_text("$i") diff --git a/examples/game/world.tm b/examples/game/world.tm index 58fcd4f..809f1f8 100644 --- a/examples/game/world.tm +++ b/examples/game/world.tm @@ -68,8 +68,8 @@ struct World(player:@Player, goal:@Box, boxes:@[@Box], dt_accum=Num32(0.0), won= DrawText(CString("WINNER"), GetScreenWidth()/Int32(2)-Int32(48*3), GetScreenHeight()/Int32(2)-Int32(24), 48, Color(0,0,0)) func load_map(w:@World, map:Text): - if map:has($/[]/): - map = map:replace_all({$/[]/="#", $/@{1..}/="@", $/ /=" "}) + if map:has("[]"): + map = map:translate({"[]"="#", "@ "="@", " "=" "}) w.boxes = @[:@Box] box_size := Vector2(50., 50.) 
for y,line in map:lines(): diff --git a/docs/patterns.md b/examples/patterns/README.md similarity index 100% rename from docs/patterns.md rename to examples/patterns/README.md diff --git a/examples/patterns/match_type.h b/examples/patterns/match_type.h new file mode 100644 index 0000000..5d06343 --- /dev/null +++ b/examples/patterns/match_type.h @@ -0,0 +1,8 @@ +#pragma once + +typedef struct { + Text_t text; + Int_t index; + Array_t captures; +} XMatch; + diff --git a/src/stdlib/patterns.c b/examples/patterns/patterns.c similarity index 91% rename from src/stdlib/patterns.c rename to examples/patterns/patterns.c index b7891f8..ade68e0 100644 --- a/src/stdlib/patterns.c +++ b/examples/patterns/patterns.c @@ -2,21 +2,28 @@ #include #include +#include #include #include #include -#include "arrays.h" -#include "integers.h" -#include "optionals.h" -#include "patterns.h" -#include "structs.h" -#include "tables.h" -#include "text.h" -#include "types.h" - #define MAX_BACKREFS 100 +typedef struct { + Text_t text; + Int_t index; + Array_t captures; +} PatternMatch; + +typedef struct { + Text_t text; + Int_t index; + Array_t captures; + bool is_none:1; +} OptionalPatternMatch; + +#define NONE_MATCH ((OptionalPatternMatch){.is_none=true}) + typedef struct { int64_t index, length; bool occupied, recursive; @@ -35,7 +42,7 @@ typedef struct { }; } pat_t; -static Text_t Text$replace_array(Text_t text, Array_t replacements, Text_t backref_pat, bool recursive); +static Text_t replace_array(Text_t text, Array_t replacements, Text_t backref_pat, bool recursive); static INLINE void skip_whitespace(TextIter_t *state, int64_t *i) { @@ -673,7 +680,7 @@ static pat_t parse_next_pat(TextIter_t *state, int64_t *index) } } -static int64_t match(Text_t text, int64_t text_index, Pattern_t pattern, int64_t pattern_index, capture_t *captures, int64_t capture_index) +static int64_t match(Text_t text, int64_t text_index, Text_t pattern, int64_t pattern_index, capture_t *captures, int64_t 
capture_index) { if (pattern_index >= pattern.length) // End of the pattern return 0; @@ -773,7 +780,7 @@ static int64_t match(Text_t text, int64_t text_index, Pattern_t pattern, int64_t #undef EAT2 #undef EAT_MANY -static int64_t _find(Text_t text, Pattern_t pattern, int64_t first, int64_t last, int64_t *match_length, capture_t *captures) +static int64_t _find(Text_t text, Text_t pattern, int64_t first, int64_t last, int64_t *match_length, capture_t *captures) { int32_t first_grapheme = Text$get_grapheme(pattern, 0); bool find_first = (first_grapheme != '{' @@ -800,7 +807,7 @@ static int64_t _find(Text_t text, Pattern_t pattern, int64_t first, int64_t last return -1; } -public OptionalMatch_t Text$find(Text_t text, Pattern_t pattern, Int_t from_index) +static OptionalPatternMatch find(Text_t text, Text_t pattern, Int_t from_index) { int64_t first = Int64$from_int(from_index, false); if (first == 0) fail("Invalid index: 0"); @@ -819,14 +826,14 @@ public OptionalMatch_t Text$find(Text_t text, Pattern_t pattern, Int_t from_inde Text_t capture = Text$slice(text, I(captures[i].index+1), I(captures[i].index+captures[i].length)); Array$insert(&capture_array, &capture, I(0), sizeof(Text_t)); } - return (OptionalMatch_t){ + return (OptionalPatternMatch){ .text=Text$slice(text, I(found+1), I(found+len)), .index=I(found+1), .captures=capture_array, }; } -PUREFUNC public bool Text$has(Text_t text, Pattern_t pattern) +PUREFUNC static bool Pattern$has(Text_t text, Text_t pattern) { if (Text$starts_with(pattern, Text("{start}"))) { int64_t m = match(text, 0, pattern, 0, NULL, 0); @@ -844,7 +851,7 @@ PUREFUNC public bool Text$has(Text_t text, Pattern_t pattern) } } -public OptionalArray_t Text$matches(Text_t text, Pattern_t pattern) +static OptionalArray_t Pattern$matches(Text_t text, Text_t pattern) { capture_t captures[MAX_BACKREFS] = {}; int64_t match_len = match(text, 0, pattern, 0, captures, 0); @@ -859,18 +866,18 @@ public OptionalArray_t Text$matches(Text_t text, Pattern_t 
pattern) return capture_array; } -public Array_t Text$find_all(Text_t text, Pattern_t pattern) +static Array_t Pattern$find_all(Text_t text, Text_t pattern) { if (pattern.length == 0) // special case return (Array_t){.length=0}; Array_t matches = {}; for (int64_t i = 1; ; ) { - OptionalMatch_t m = Text$find(text, pattern, I(i)); - if (!m.index.small) + OptionalPatternMatch m = find(text, pattern, I(i)); + if (m.is_none) break; i = Int64$from_int(m.index, false) + m.text.length; - Array$insert(&matches, &m, I_small(0), sizeof(Match_t)); + Array$insert(&matches, &m, I_small(0), sizeof(PatternMatch)); } return matches; } @@ -878,23 +885,23 @@ public Array_t Text$find_all(Text_t text, Pattern_t pattern) typedef struct { TextIter_t state; Int_t i; - Pattern_t pattern; + Text_t pattern; } match_iter_state_t; -static OptionalMatch_t next_match(match_iter_state_t *state) +static OptionalPatternMatch next_match(match_iter_state_t *state) { if (Int64$from_int(state->i, false) > state->state.stack[0].text.length) return NONE_MATCH; - OptionalMatch_t m = Text$find(state->state.stack[0].text, state->pattern, state->i); - if (m.index.small == 0) // No match + OptionalPatternMatch m = find(state->state.stack[0].text, state->pattern, state->i); + if (m.is_none) // No match state->i = I(state->state.stack[0].text.length + 1); else state->i = Int$plus(m.index, I(MAX(1, m.text.length))); return m; } -public Closure_t Text$by_match(Text_t text, Pattern_t pattern) +static Closure_t Pattern$by_match(Text_t text, Text_t pattern) { return (Closure_t){ .fn=(void*)next_match, @@ -902,7 +909,7 @@ public Closure_t Text$by_match(Text_t text, Pattern_t pattern) }; } -static Text_t apply_backrefs(Text_t text, Array_t recursive_replacements, Text_t replacement, Pattern_t backref_pat, capture_t *captures) +static Text_t apply_backrefs(Text_t text, Array_t recursive_replacements, Text_t replacement, Text_t backref_pat, capture_t *captures) { if (backref_pat.length == 0) return replacement; @@ 
-946,7 +953,7 @@ static Text_t apply_backrefs(Text_t text, Array_t recursive_replacements, Text_t Text_t backref_text = Text$slice(text, I(captures[backref].index+1), I(captures[backref].index + captures[backref].length)); if (captures[backref].recursive && recursive_replacements.length > 0) - backref_text = Text$replace_array(backref_text, recursive_replacements, backref_pat, true); + backref_text = replace_array(backref_text, recursive_replacements, backref_pat, true); if (pos > nonmatching_pos) { Text_t before_slice = Text$slice(replacement, I(nonmatching_pos+1), I(pos)); @@ -965,7 +972,7 @@ static Text_t apply_backrefs(Text_t text, Array_t recursive_replacements, Text_t return ret; } -public Text_t Text$replace(Text_t text, Pattern_t pattern, Text_t replacement, Pattern_t backref_pat, bool recursive) +static Text_t Pattern$replace(Text_t text, Text_t pattern, Text_t replacement, Text_t backref_pat, bool recursive) { Text_t ret = EMPTY_TEXT; @@ -1018,7 +1025,7 @@ public Text_t Text$replace(Text_t text, Pattern_t pattern, Text_t replacement, P return ret; } -public Text_t Text$trim(Text_t text, Pattern_t pattern, bool trim_left, bool trim_right) +static Text_t Pattern$trim(Text_t text, Text_t pattern, bool trim_left, bool trim_right) { int64_t first = 0, last = text.length-1; if (trim_left) { @@ -1037,7 +1044,7 @@ public Text_t Text$trim(Text_t text, Pattern_t pattern, bool trim_left, bool tri return Text$slice(text, I(first+1), I(last+1)); } -public Text_t Text$map(Text_t text, Pattern_t pattern, Closure_t fn, bool recursive) +static Text_t Pattern$map(Text_t text, Text_t pattern, Closure_t fn, bool recursive) { Text_t ret = EMPTY_TEXT; @@ -1049,7 +1056,7 @@ public Text_t Text$map(Text_t text, Pattern_t pattern, Closure_t fn, bool recurs TextIter_t text_state = NEW_TEXT_ITER_STATE(text); int64_t nonmatching_pos = 0; - Text_t (*text_mapper)(Match_t, void*) = fn.fn; + Text_t (*text_mapper)(PatternMatch, void*) = fn.fn; for (int64_t pos = 0; pos < text.length; 
pos++) { // Optimization: quickly skip ahead to first char in pattern: if (find_first) { @@ -1061,7 +1068,7 @@ public Text_t Text$map(Text_t text, Pattern_t pattern, Closure_t fn, bool recurs int64_t match_len = match(text, pos, pattern, 0, captures, 0); if (match_len < 0) continue; - Match_t m = { + PatternMatch m = { .text=Text$slice(text, I(pos+1), I(pos+match_len)), .index=I(pos+1), .captures={}, @@ -1069,7 +1076,7 @@ public Text_t Text$map(Text_t text, Pattern_t pattern, Closure_t fn, bool recurs for (int i = 0; captures[i].occupied; i++) { Text_t capture = Text$slice(text, I(captures[i].index+1), I(captures[i].index+captures[i].length)); if (recursive) - capture = Text$map(capture, pattern, fn, recursive); + capture = Pattern$map(capture, pattern, fn, recursive); Array$insert(&m.captures, &capture, I(0), sizeof(Text_t)); } @@ -1090,7 +1097,7 @@ public Text_t Text$map(Text_t text, Pattern_t pattern, Closure_t fn, bool recurs return ret; } -public void Text$each(Text_t text, Pattern_t pattern, Closure_t fn, bool recursive) +static void Pattern$each(Text_t text, Text_t pattern, Closure_t fn, bool recursive) { int32_t first_grapheme = Text$get_grapheme(pattern, 0); bool find_first = (first_grapheme != '{' @@ -1098,7 +1105,7 @@ public void Text$each(Text_t text, Pattern_t pattern, Closure_t fn, bool recursi && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); TextIter_t text_state = NEW_TEXT_ITER_STATE(text); - void (*action)(Match_t, void*) = fn.fn; + void (*action)(PatternMatch, void*) = fn.fn; for (int64_t pos = 0; pos < text.length; pos++) { // Optimization: quickly skip ahead to first char in pattern: if (find_first) { @@ -1110,7 +1117,7 @@ public void Text$each(Text_t text, Pattern_t pattern, Closure_t fn, bool recursi int64_t match_len = match(text, pos, pattern, 0, captures, 0); if (match_len < 0) continue; - Match_t m = { + PatternMatch m = { .text=Text$slice(text, I(pos+1), I(pos+match_len)), .index=I(pos+1), .captures={}, @@ 
-1118,7 +1125,7 @@ public void Text$each(Text_t text, Pattern_t pattern, Closure_t fn, bool recursi for (int i = 0; captures[i].occupied; i++) { Text_t capture = Text$slice(text, I(captures[i].index+1), I(captures[i].index+captures[i].length)); if (recursive) - Text$each(capture, pattern, fn, recursive); + Pattern$each(capture, pattern, fn, recursive); Array$insert(&m.captures, &capture, I(0), sizeof(Text_t)); } @@ -1127,7 +1134,7 @@ public void Text$each(Text_t text, Pattern_t pattern, Closure_t fn, bool recursi } } -Text_t Text$replace_array(Text_t text, Array_t replacements, Text_t backref_pat, bool recursive) +Text_t replace_array(Text_t text, Array_t replacements, Text_t backref_pat, bool recursive) { if (replacements.length == 0) return text; @@ -1137,7 +1144,7 @@ Text_t Text$replace_array(Text_t text, Array_t replacements, Text_t backref_pat, for (int64_t pos = 0; pos < text.length; ) { // Find the first matching pattern at this position: for (int64_t i = 0; i < replacements.length; i++) { - Pattern_t pattern = *(Pattern_t*)(replacements.data + i*replacements.stride); + Text_t pattern = *(Text_t*)(replacements.data + i*replacements.stride); capture_t captures[MAX_BACKREFS] = {}; int64_t len = match(text, pos, pattern, 0, captures, 1); if (len < 0) continue; @@ -1171,12 +1178,12 @@ Text_t Text$replace_array(Text_t text, Array_t replacements, Text_t backref_pat, return ret; } -public Text_t Text$replace_all(Text_t text, Table_t replacements, Text_t backref_pat, bool recursive) +static Text_t Pattern$replace_all(Text_t text, Table_t replacements, Text_t backref_pat, bool recursive) { - return Text$replace_array(text, replacements.entries, backref_pat, recursive); + return replace_array(text, replacements.entries, backref_pat, recursive); } -public Array_t Text$split(Text_t text, Pattern_t pattern) +static Array_t Pattern$split(Text_t text, Text_t pattern) { if (text.length == 0) // special case return (Array_t){.length=0}; @@ -1207,7 +1214,7 @@ public Array_t 
Text$split(Text_t text, Pattern_t pattern) typedef struct { TextIter_t state; int64_t i; - Pattern_t pattern; + Text_t pattern; } split_iter_state_t; static OptionalText_t next_split(split_iter_state_t *state) @@ -1243,7 +1250,7 @@ static OptionalText_t next_split(split_iter_state_t *state) } } -public Closure_t Text$by_split(Text_t text, Pattern_t pattern) +static Closure_t Pattern$by_split(Text_t text, Text_t pattern) { return (Closure_t){ .fn=(void*)next_split, @@ -1251,7 +1258,7 @@ public Closure_t Text$by_split(Text_t text, Pattern_t pattern) }; } -public Pattern_t Pattern$escape_text(Text_t text) +static Text_t Pattern$escape_text(Text_t text) { // TODO: optimize for spans of non-escaped text Text_t ret = EMPTY_TEXT; @@ -1276,62 +1283,9 @@ static Text_t Pattern$as_text(const void *obj, bool colorize, const TypeInfo_t * (void)info; if (!obj) return Text("Pattern"); - Pattern_t pat = *(Pattern_t*)obj; - Text_t quote = Text$has(pat, Pattern("/")) && !Text$has(pat, Pattern("|")) ? Text("|") : Text("/"); - return Text$concat( colorize ? Text("\x1b[1m$\033[m") : Text("$"), Text$quoted(pat, colorize, quote)); + Text_t pat = *(Text_t*)obj; + Text_t quote = Pattern$has(pat, Text("/")) && !Pattern$has(pat, Text("|")) ? Text("|") : Text("/"); + return Text$concat(colorize ? 
Text("\x1b[1m$\033[m") : Text("$"), Text$quoted(pat, colorize, quote)); } -public const TypeInfo_t Pattern$info = { - .size=sizeof(Pattern_t), - .align=__alignof__(Pattern_t), - .tag=TextInfo, - .TextInfo={.lang="Pattern"}, - .metamethods={ - .as_text=Pattern$as_text, - .hash=Text$hash, - .compare=Text$compare, - .equal=Text$equal, - .is_none=Text$is_none, - .serialize=Text$serialize, - .deserialize=Text$deserialize, - }, -}; - -static const TypeInfo_t _text_array = { - .size=sizeof(Array_t), - .align=__alignof__(Array_t), - .tag=ArrayInfo, - .ArrayInfo.item=&Text$info, - .metamethods=Array$metamethods, -}; - -static NamedType_t _match_fields[3] = { - {"text", &Text$info}, - {"index", &Int$info}, - {"captures", &_text_array}, -}; - -static bool Match$is_none(const void *m, const TypeInfo_t*) -{ - return ((OptionalMatch_t*)m)->index.small == 0; -} - -public const TypeInfo_t Match$info = { - .size=sizeof(Match_t), - .align=__alignof__(Match_t), - .tag=StructInfo, - .StructInfo={ - .name="Match", - .num_fields=3, - .fields=_match_fields, - }, - .metamethods={ - .as_text=Struct$as_text, - .hash=Struct$hash, - .compare=Struct$compare, - .equal=Struct$equal, - .is_none=Match$is_none, - }, -}; - // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/examples/patterns/patterns.tm b/examples/patterns/patterns.tm new file mode 100644 index 0000000..fa6c789 --- /dev/null +++ b/examples/patterns/patterns.tm @@ -0,0 +1,58 @@ +use ./patterns.c + +struct PatternMatch(text:Text, index:Int, captures:[Text]) + +lang P: + convert(text:Text -> P): + return inline C : P { Pattern$escape_text(_$text); } + + convert(n:Int -> P): + return P.from_text("$n") + +extend Text: + func matches(text:Text, pattern:P -> [Text]?): + return inline C : [Text]? 
{ Pattern$matches(_$text, _$pattern); } + + func pat_replace(text:Text, pattern:P, replacement:Text, backref="@", recursive=yes -> Text): + return inline C : Text { Pattern$replace(_$text, _$pattern, _$replacement, _$backref, _$recursive); } + + func pat_replace_all(text:Text, replacements:{P,Text}, backref="@", recursive=yes -> Text): + return inline C : Text { Pattern$replace_all(_$text, _$replacements, _$backref, _$recursive); } + + func has(text:Text, pattern:P -> Bool): + return inline C : Bool { Pattern$has(_$text, _$pattern); } + + func find_all(text:Text, pattern:P -> [PatternMatch]): + return inline C : [PatternMatch] { Pattern$find_all(_$text, _$pattern); } + + func by_match(text:Text, pattern:P -> func(->PatternMatch?)): + return inline C : func(->PatternMatch?) { Pattern$by_match(_$text, _$pattern); } + + func each(text:Text, pattern:P, fn:func(m:PatternMatch), recursive=yes): + inline C { Pattern$each(_$text, _$pattern, _$fn, _$recursive); } + + func map(text:Text, pattern:P, fn:func(m:PatternMatch -> Text), recursive=yes -> Text): + return inline C : Text { Pattern$map(_$text, _$pattern, _$fn, _$recursive); } + + func split(text:Text, pattern:P -> [Text]): + return inline C : [Text] { Pattern$split(_$text, _$pattern); } + + func by_split(text:Text, pattern:P -> func(->Text?)): + return inline C : func(->Text?) 
{ Pattern$by_split(_$text, _$pattern); } + + func trim(text:Text, pattern:P, trim_left=yes, trim_right=yes -> Text): + return inline C : Text { Pattern$trim(_$text, _$pattern, _$trim_left, _$trim_right); } + + func trim_left(text:Text, pattern:P -> Text): + return text:trim(pattern, trim_left=yes, trim_right=no) + + func trim_right(text:Text, pattern:P -> Text): + return text:trim(pattern, trim_left=no, trim_right=yes) + +func main(): + >> "hello world":pat_replace($P/{id}/, "XXX") + >> "hello world":find_all($P/l/) + + for m in "hello one two three":by_match($P/{id}/): + >> m + diff --git a/src/ast.c b/src/ast.c index 5f4e24f..75795d6 100644 --- a/src/ast.c +++ b/src/ast.c @@ -165,6 +165,7 @@ CORD ast_to_xml(ast_t *ast) T(Use, "%r%r", optional_tagged("var", data.var), xml_escape(data.path)) T(InlineCCode, "%r", xml_escape(data.code)) T(Deserialize, "%r%r", type_ast_to_xml(data.type), ast_to_xml(data.value)) + T(Extend, "%r", data.name, ast_to_xml(data.body)) default: return "???"; #undef T } diff --git a/src/ast.h b/src/ast.h index b5b1ad3..cad7bb6 100644 --- a/src/ast.h +++ b/src/ast.h @@ -143,6 +143,7 @@ typedef enum { Use, InlineCCode, Deserialize, + Extend, } ast_e; struct ast_s { @@ -331,6 +332,10 @@ struct ast_s { ast_t *value; type_ast_t *type; } Deserialize; + struct { + const char *name; + ast_t *body; + } Extend; } __data; }; diff --git a/src/compile.c b/src/compile.c index f228148..2cc20f3 100644 --- a/src/compile.c +++ b/src/compile.c @@ -15,7 +15,6 @@ #include "stdlib/integers.h" #include "stdlib/nums.h" #include "stdlib/paths.h" -#include "stdlib/patterns.h" #include "stdlib/text.h" #include "stdlib/util.h" #include "structs.h" @@ -39,7 +38,7 @@ static CORD compile_string_literal(CORD literal); CORD promote_to_optional(type_t *t, CORD code) { - if (t == PATH_TYPE || t == PATH_TYPE_TYPE || t == MATCH_TYPE) { + if (t == PATH_TYPE || t == PATH_TYPE_TYPE) { return code; } else if (t->tag == IntType) { switch (Match(t, IntType)->bits) { @@ -442,7 +441,7 
@@ static void add_closed_vars(Table_t *closed_vars, env_t *enclosing_scope, env_t add_closed_vars(closed_vars, enclosing_scope, env, Match(ast, Deserialize)->value); break; } - case Use: case FunctionDef: case ConvertDef: case StructDef: case EnumDef: case LangDef: { + case Use: case FunctionDef: case ConvertDef: case StructDef: case EnumDef: case LangDef: case Extend: { errx(1, "Definitions should not be reachable in a closure."); } default: @@ -497,7 +496,6 @@ PUREFUNC CORD compile_unsigned_type(type_t *t) CORD compile_type(type_t *t) { if (t == RNG_TYPE) return "RNG_t"; - else if (t == MATCH_TYPE) return "Match_t"; else if (t == PATH_TYPE) return "Path_t"; else if (t == PATH_TYPE_TYPE) return "PathType_t"; @@ -516,8 +514,6 @@ CORD compile_type(type_t *t) auto text = Match(t, TextType); if (!text->lang || streq(text->lang, "Text")) return "Text_t"; - else if (streq(text->lang, "Pattern")) - return "Pattern_t"; else return CORD_all(namespace_prefix(text->env, text->env->namespace->parent), text->lang, "$$type"); } @@ -558,8 +554,6 @@ CORD compile_type(type_t *t) case ArrayType: case TableType: case SetType: return CORD_all("Optional", compile_type(nonnull)); case StructType: { - if (nonnull == MATCH_TYPE) - return "OptionalMatch_t"; if (nonnull == PATH_TYPE) return "OptionalPath_t"; if (nonnull == PATH_TYPE_TYPE) @@ -680,7 +674,7 @@ CORD optional_into_nonnone(type_t *t, CORD value) case IntType: return CORD_all(value, ".value"); case StructType: - if (t == MATCH_TYPE || t == PATH_TYPE || t == PATH_TYPE_TYPE) + if (t == PATH_TYPE || t == PATH_TYPE_TYPE) return value; return CORD_all(value, ".value"); default: @@ -695,8 +689,6 @@ CORD check_none(type_t *t, CORD value) // complain about excessive parens around equality comparisons if (t->tag == PointerType || t->tag == FunctionType || t->tag == CStringType) return CORD_all("({", value, " == NULL;})"); - else if (t == MATCH_TYPE) - return CORD_all("({(", value, ").index.small == 0;})"); else if (t == PATH_TYPE) 
return CORD_all("({(", value, ").type.$tag == PATH_NONE;})"); else if (t == PATH_TYPE_TYPE) @@ -1168,7 +1160,7 @@ static CORD _compile_statement(env_t *env, ast_t *ast) default: code_err(ast, "Update assignments are not implemented for this operation"); } } - case StructDef: case EnumDef: case LangDef: case FunctionDef: case ConvertDef: { + case StructDef: case EnumDef: case LangDef: case Extend: case FunctionDef: case ConvertDef: { return CORD_EMPTY; } case Skip: { @@ -1730,8 +1722,13 @@ static CORD _compile_statement(env_t *env, ast_t *ast) code_err(ast, "Could not find library"); CORD initialization = CORD_EMPTY; - const char *lib_id = Text$as_c_string( - Text$replace(Text$from_str(use->path), Pattern("{1+ !alphanumeric}"), Text("_"), Pattern(""), false)); + + char *lib_id = String(use->path); + for (char *p = lib_id; *p; p++) { + if (!isalnum(*p) && *p != '_') + *p = '_'; + } + for (size_t i = 0; i < tm_files.gl_pathc; i++) { const char *filename = tm_files.gl_pathv[i]; initialization = CORD_all( @@ -2165,7 +2162,6 @@ CORD compile_none(type_t *t) if (t == PATH_TYPE) return "NONE_PATH"; else if (t == PATH_TYPE_TYPE) return "((OptionalPathType_t){})"; - else if (t == MATCH_TYPE) return "NONE_MATCH"; switch (t->tag) { case BigIntType: return "NONE_INT"; @@ -2597,8 +2593,6 @@ CORD compile(env_t *env, ast_t *ast) CORD lang_constructor; if (!lang || streq(lang, "Text")) lang_constructor = "Text"; - else if (streq(lang, "Pattern")) - lang_constructor = lang; else lang_constructor = CORD_all(namespace_prefix(Match(text_t, TextType)->env, Match(text_t, TextType)->env->namespace->parent), lang); @@ -3752,7 +3746,7 @@ CORD compile(env_t *env, ast_t *ast) case Defer: code_err(ast, "Compiling 'defer' as expression!"); case Extern: code_err(ast, "Externs are not supported as expressions"); case TableEntry: code_err(ast, "Table entries should not be compiled directly"); - case Declare: case Assign: case UpdateAssign: case For: case While: case Repeat: case StructDef: case 
LangDef: + case Declare: case Assign: case UpdateAssign: case For: case While: case Repeat: case StructDef: case LangDef: case Extend: case EnumDef: case FunctionDef: case ConvertDef: case Skip: case Stop: case Pass: case Return: case DocTest: case PrintStatement: code_err(ast, "This is not a valid expression"); default: case Unknown: code_err(ast, "Unknown AST"); @@ -3762,7 +3756,6 @@ CORD compile(env_t *env, ast_t *ast) CORD compile_type_info(type_t *t) { if (t == RNG_TYPE) return "&RNG$info"; - else if (t == MATCH_TYPE) return "&Match$info"; else if (t == PATH_TYPE) return "&Path$info"; else if (t == PATH_TYPE_TYPE) return "&PathType$info"; @@ -3773,8 +3766,6 @@ CORD compile_type_info(type_t *t) auto text = Match(t, TextType); if (!text->lang || streq(text->lang, "Text")) return "&Text$info"; - else if (streq(text->lang, "Pattern")) - return "&Pattern$info"; return CORD_all("(&", namespace_prefix(text->env, text->env->namespace->parent), text->lang, "$$info)"); } case StructType: { @@ -4206,6 +4197,12 @@ CORD compile_top_level_code(env_t *env, ast_t *ast) env_t *ns_env = namespace_env(env, def->name); return CORD_all(code, def->namespace ? 
compile_top_level_code(ns_env, def->namespace) : CORD_EMPTY); } + case Extend: { + auto extend = Match(ast, Extend); + env_t *ns_env = namespace_env(env, extend->name); + ns_env->libname = env->libname; + return compile_top_level_code(ns_env, extend->body); + } case Extern: return CORD_EMPTY; case Block: { CORD code = CORD_EMPTY; @@ -4258,6 +4255,9 @@ static void initialize_vars_and_statics(env_t *env, ast_t *ast) } else if (stmt->ast->tag == LangDef) { initialize_vars_and_statics(namespace_env(env, Match(stmt->ast, LangDef)->name), Match(stmt->ast, LangDef)->namespace); + } else if (stmt->ast->tag == Extend) { + initialize_vars_and_statics(namespace_env(env, Match(stmt->ast, Extend)->name), + Match(stmt->ast, Extend)->body); } else if (stmt->ast->tag == Use) { continue; } else { @@ -4348,6 +4348,9 @@ CORD compile_statement_type_header(env_t *env, Path_t header_path, ast_t *ast) "extern const TypeInfo_t ", full_name, ";\n" ); } + case Extend: { + return CORD_EMPTY; + } default: return CORD_EMPTY; } @@ -4364,6 +4367,12 @@ CORD compile_statement_namespace_header(env_t *env, Path_t header_path, ast_t *a block = def->namespace; break; } + case Extend: { + auto extend = Match(ast, Extend); + ns_name = extend->name; + block = extend->body; + break; + } case StructDef: { auto def = Match(ast, StructDef); ns_name = def->name; diff --git a/src/environment.c b/src/environment.c index 776e785..6822502 100644 --- a/src/environment.c +++ b/src/environment.c @@ -13,7 +13,6 @@ #include "typecheck.h" type_t *TEXT_TYPE = NULL; -type_t *MATCH_TYPE = NULL; type_t *RNG_TYPE = NULL; public type_t *PATH_TYPE = NULL; public type_t *PATH_TYPE_TYPE = NULL; @@ -67,7 +66,6 @@ env_t *global_env(void) (void)bind_type(env, "Int32", Type(IntType, .bits=TYPE_IBITS32)); (void)bind_type(env, "Memory", Type(MemoryType)); PATH_TYPE_TYPE = declare_type(env, "enum PathType(Relative, Absolute, Home)"); - MATCH_TYPE = declare_type(env, "struct Match(text:Text, index:Int, captures:[Text])"); PATH_TYPE = 
declare_type(env, "struct Path(type:PathType, components:[Text])"); RNG_TYPE = declare_type(env, "struct RNG(state:@Memory)"); @@ -279,13 +277,6 @@ env_t *global_env(void) #undef F_opt #undef F #undef C - {"Match", MATCH_TYPE, "Match_t", "Match", TypedArray(ns_entry_t, - // No methods - )}, - {"Pattern", Type(TextType, .lang="Pattern", .env=namespace_env(env, "Pattern")), "Pattern_t", "Pattern$info", TypedArray(ns_entry_t, - {"escape_int", "Int$value_as_text", "func(i:Int -> Pattern)"}, - {"escape_text", "Pattern$escape_text", "func(text:Text -> Pattern)"}, - )}, {"PathType", PATH_TYPE_TYPE, "PathType_t", "PathType$info", TypedArray(ns_entry_t, {"Relative", "((PathType_t){.$tag=PATH_RELATIVE})", "PathType"}, {"Absolute", "((PathType_t){.$tag=PATH_ABSOLUTE})", "PathType"}, @@ -353,44 +344,42 @@ env_t *global_env(void) {"as_c_string", "Text$as_c_string", "func(text:Text -> CString)"}, {"at", "Text$cluster", "func(text:Text, index:Int -> Text)"}, {"by_line", "Text$by_line", "func(text:Text -> func(->Text?))"}, - {"by_match", "Text$by_match", "func(text:Text, pattern:Pattern -> func(->Match?))"}, - {"by_split", "Text$by_split", "func(text:Text, pattern=$Pattern'' -> func(->Text?))"}, + {"by_split", "Text$by_split", "func(text:Text, delimiter='' -> func(->Text?))"}, + {"by_split_any", "Text$by_split_any", "func(text:Text, delimiters=\" $\\t\\r\\n\" -> func(->Text?))"}, {"bytes", "Text$utf8_bytes", "func(text:Text -> [Byte])"}, {"caseless_equals", "Text$equal_ignoring_case", "func(a,b:Text, language='C' -> Bool)"}, {"codepoint_names", "Text$codepoint_names", "func(text:Text -> [Text])"}, {"ends_with", "Text$ends_with", "func(text,suffix:Text -> Bool)"}, - {"each", "Text$each", "func(text:Text, pattern:Pattern, fn:func(match:Match), recursive=yes)"}, - {"find", "Text$find", "func(text:Text, pattern:Pattern, start=1 -> Match?)"}, - {"find_all", "Text$find_all", "func(text:Text, pattern:Pattern -> [Match])"}, {"from", "Text$from", "func(text:Text, first:Int -> Text)"}, 
{"from_bytes", "Text$from_bytes", "func(bytes:[Byte] -> Text?)"}, {"from_c_string", "Text$from_str", "func(str:CString -> Text?)"}, {"from_codepoint_names", "Text$from_codepoint_names", "func(codepoint_names:[Text] -> Text?)"}, {"from_codepoints", "Text$from_codepoints", "func(codepoints:[Int32] -> Text)"}, {"from_text", "Path$from_text", "func(text:Text -> Path)"}, - {"has", "Text$has", "func(text:Text, pattern:Pattern -> Bool)"}, + {"has", "Text$has", "func(text:Text, target:Text -> Bool)"}, {"join", "Text$join", "func(glue:Text, pieces:[Text] -> Text)"}, {"left_pad", "Text$left_pad", "func(text:Text, count:Int, pad=' ', language='C' -> Text)"}, {"lines", "Text$lines", "func(text:Text -> [Text])"}, {"lower", "Text$lower", "func(text:Text, language='C' -> Text)"}, - {"map", "Text$map", "func(text:Text, pattern:Pattern, fn:func(match:Match -> Text), recursive=yes -> Text)"}, - {"matches", "Text$matches", "func(text:Text, pattern:Pattern -> [Text]?)"}, {"middle_pad", "Text$middle_pad", "func(text:Text, count:Int, pad=' ', language='C' -> Text)"}, {"quoted", "Text$quoted", "func(text:Text, color=no, quotation_mark='\"' -> Text)"}, {"repeat", "Text$repeat", "func(text:Text, count:Int -> Text)"}, - {"replace", "Text$replace", "func(text:Text, pattern:Pattern, replacement:Text, backref=$/\\/, recursive=yes -> Text)"}, - {"replace_all", "Text$replace_all", "func(text:Text, replacements:{Pattern,Text}, backref=$/\\/, recursive=yes -> Text)"}, + {"replace", "Text$replace", "func(text:Text, target:Text, replacement:Text -> Text)"}, {"reversed", "Text$reversed", "func(text:Text -> Text)"}, {"right_pad", "Text$right_pad", "func(text:Text, count:Int, pad=' ', language='C' -> Text)"}, {"slice", "Text$slice", "func(text:Text, from=1, to=-1 -> Text)"}, - {"split", "Text$split", "func(text:Text, pattern=$Pattern'' -> [Text])"}, + {"split", "Text$split", "func(text:Text, delimiter='' -> [Text])"}, + {"split_any", "Text$split_any", "func(text:Text, delimiters=\" $\\t\\r\\n\" -> 
[Text])"}, {"starts_with", "Text$starts_with", "func(text,prefix:Text -> Bool)"}, {"title", "Text$title", "func(text:Text, language='C' -> Text)"}, {"to", "Text$to", "func(text:Text, last:Int -> Text)"}, - {"trim", "Text$trim", "func(text:Text, pattern=$/{whitespace}/, trim_left=yes, trim_right=yes -> Text)"}, + {"translate", "Text$translate", "func(text:Text, translations:{Text,Text} -> Text)"}, + {"trim", "Text$trim", "func(text:Text, to_trim=\" \t\r\n\", left=yes, right=yes -> Text)"}, {"upper", "Text$upper", "func(text:Text, language='C' -> Text)"}, {"utf32_codepoints", "Text$utf32_codepoints", "func(text:Text -> [Int32])"}, {"width", "Text$width", "func(text:Text, language='C' -> Int)"}, + {"without_prefix", "Text$without_prefix", "func(text,prefix:Text -> Text)"}, + {"without_suffix", "Text$without_suffix", "func(text,suffix:Text -> Text)"}, )}, }; @@ -518,9 +507,6 @@ env_t *global_env(void) {"Num32$from_int64", "func(i:Int64, truncate=no -> Num32)"}, {"Num32$from_int", "func(i:Int, truncate=no -> Num32)"}, {"Num32$from_num", "func(n:Num -> Num32)"}); - ADD_CONSTRUCTORS("Pattern", - {"Pattern$escape_text", "func(text:Text -> Pattern)"}, - {"Int$value_as_text", "func(i:Int -> Pattern)"}); ADD_CONSTRUCTORS("Path", {"Path$escape_text", "func(text:Text -> Path)"}, {"Path$escape_path", "func(path:Path -> Path)"}, @@ -534,11 +520,6 @@ env_t *global_env(void) .ret=PATH_TYPE), "Path$from_text"); - set_binding(namespace_env(env, "Pattern"), "from_text", - Type(FunctionType, .args=new(arg_t, .name="text", .type=TEXT_TYPE), - .ret=Type(TextType, .lang="Pattern", .env=namespace_env(env, "Pattern"))), - "(Pattern_t)"); - struct { const char *name, *code, *type_str; } global_vars[] = { diff --git a/src/environment.h b/src/environment.h index 95e3c3e..00b8fbb 100644 --- a/src/environment.h +++ b/src/environment.h @@ -89,7 +89,6 @@ void set_binding(env_t *env, const char *name, type_t *type, CORD code); binding_t *get_namespace_binding(env_t *env, ast_t *self, const char 
*name); #define code_err(ast, ...) compiler_err((ast)->file, (ast)->start, (ast)->end, __VA_ARGS__) extern type_t *TEXT_TYPE; -extern type_t *MATCH_TYPE; extern type_t *RNG_TYPE; extern type_t *PATH_TYPE; extern type_t *PATH_TYPE_TYPE; diff --git a/src/parse.c b/src/parse.c index 14221cc..2e3e2ec 100644 --- a/src/parse.c +++ b/src/parse.c @@ -22,7 +22,6 @@ #include "ast.h" #include "cordhelpers.h" #include "stdlib/integers.h" -#include "stdlib/patterns.h" #include "stdlib/paths.h" #include "stdlib/print.h" #include "stdlib/stdlib.h" @@ -64,7 +63,7 @@ int op_tightness[] = { static const char *keywords[] = { "yes", "xor", "while", "when", "use", "unless", "struct", "stop", "skip", "return", "or", "not", "none", "no", "mod1", "mod", "pass", "lang", "inline", "in", "if", - "func", "for", "extern", "enum", "else", "do", "deserialize", "defer", "and", + "func", "for", "extern", "extend", "enum", "else", "do", "deserialize", "defer", "and", "_min_", "_max_", NULL, }; @@ -120,6 +119,7 @@ static PARSER(parse_inline_c); static PARSER(parse_int); static PARSER(parse_lambda); static PARSER(parse_lang_def); +static PARSER(parse_extend); static PARSER(parse_namespace); static PARSER(parse_negative); static PARSER(parse_not); @@ -1241,9 +1241,6 @@ PARSER(parse_text) { open_quote = *pos; ++pos; close_quote = closing[(int)open_quote] ? 
closing[(int)open_quote] : open_quote; - - if (!lang && (open_quote == '/' || open_quote == '|')) - lang = "Pattern"; } else { return NULL; } @@ -1904,9 +1901,10 @@ PARSER(parse_namespace) { if (get_indent(ctx, next) != indent) break; ast_t *stmt; if ((stmt=optional(ctx, &pos, parse_struct_def)) + ||(stmt=optional(ctx, &pos, parse_func_def)) ||(stmt=optional(ctx, &pos, parse_enum_def)) ||(stmt=optional(ctx, &pos, parse_lang_def)) - ||(stmt=optional(ctx, &pos, parse_func_def)) + ||(stmt=optional(ctx, &pos, parse_extend)) ||(stmt=optional(ctx, &pos, parse_convert_def)) ||(stmt=optional(ctx, &pos, parse_use)) ||(stmt=optional(ctx, &pos, parse_extern)) @@ -1940,9 +1938,10 @@ PARSER(parse_file_body) { if (get_indent(ctx, next) != 0) break; ast_t *stmt; if ((stmt=optional(ctx, &pos, parse_struct_def)) + ||(stmt=optional(ctx, &pos, parse_func_def)) ||(stmt=optional(ctx, &pos, parse_enum_def)) ||(stmt=optional(ctx, &pos, parse_lang_def)) - ||(stmt=optional(ctx, &pos, parse_func_def)) + ||(stmt=optional(ctx, &pos, parse_extend)) ||(stmt=optional(ctx, &pos, parse_convert_def)) ||(stmt=optional(ctx, &pos, parse_use)) ||(stmt=optional(ctx, &pos, parse_extern)) @@ -2112,6 +2111,32 @@ PARSER(parse_lang_def) { return NewAST(ctx->file, start, pos, LangDef, .name=name, .namespace=namespace); } +PARSER(parse_extend) { + const char *start = pos; + // extend Name: body... 
+ if (!match_word(&pos, "extend")) return NULL; + int64_t starting_indent = get_indent(ctx, pos); + spaces(&pos); + const char *name = get_id(&pos); + if (!name) + parser_err(ctx, start, pos, "I expected a name for this lang"); + + ast_t *body = NULL; + if (match(&pos, ":")) { + const char *ns_pos = pos; + whitespace(&ns_pos); + int64_t ns_indent = get_indent(ctx, ns_pos); + if (ns_indent > starting_indent) { + pos = ns_pos; + body = optional(ctx, &pos, parse_namespace); + } + } + if (!body) + body = NewAST(ctx->file, pos, pos, Block, .statements=NULL); + + return NewAST(ctx->file, start, pos, Extend, .name=name, .body=body); +} + arg_ast_t *parse_args(parse_ctx_t *ctx, const char **pos) { arg_ast_t *args = NULL; @@ -2373,20 +2398,6 @@ PARSER(parse_use) { what = USE_LOCAL; } else { what = USE_MODULE; - - // When `use`ing a URL, convert it to a hash: - Text_t text = Text$from_str(name); - Array_t m = Text$matches(text, Pattern("{url}")); - if (m.length >= 0) { - text = Text$trim(text, Pattern("http{0-1 s}://"), true, false); - FILE *shasum = popen(String("echo -n '", text, "' | sha256sum"), "r"); - const size_t HASH_LEN = 32; - char *hash = GC_MALLOC_ATOMIC(HASH_LEN + 1); - size_t just_read = fread(hash, sizeof(char), HASH_LEN, shasum); - if (just_read < HASH_LEN) - print_err("Failed to get SHA sum for 'use': ", name); - name = hash; - } } return NewAST(ctx->file, start, pos, Use, .var=var, .path=name, .what=what); } diff --git a/src/stdlib/README.md b/src/stdlib/README.md index 6591ead..1c72d3d 100644 --- a/src/stdlib/README.md +++ b/src/stdlib/README.md @@ -27,7 +27,6 @@ some common functionality. 
- Nums: [nums.h](nums.h), [nums.c](nums.c) - Optionals: [optionals.h](optionals.h), [optionals.c](optionals.c) - Paths: [paths.h](paths.h), [paths.c](paths.c) -- Patterns: [patterns.h](patterns.h), [patterns.c](patterns.c) - Pointers: [pointers.h](pointers.h), [pointers.c](pointers.c) - Tables: [tables.h](tables.h), [tables.c](tables.c) - Text: [text.h](text.h), [text.c](text.c) diff --git a/src/stdlib/datatypes.h b/src/stdlib/datatypes.h index b1265fc..26bd9c3 100644 --- a/src/stdlib/datatypes.h +++ b/src/stdlib/datatypes.h @@ -94,9 +94,6 @@ typedef struct Text_s { }; } Text_t; -#define Pattern_t Text_t -#define OptionalPattern_t Text_t - typedef struct { enum { PATH_NONE, PATH_RELATIVE, PATH_ABSOLUTE, PATH_HOME } $tag; } PathType_t; diff --git a/src/stdlib/optionals.c b/src/stdlib/optionals.c index 797cb11..d330902 100644 --- a/src/stdlib/optionals.c +++ b/src/stdlib/optionals.c @@ -6,7 +6,6 @@ #include "integers.h" #include "metamethods.h" #include "nums.h" -#include "patterns.h" #include "text.h" #include "util.h" diff --git a/src/stdlib/paths.c b/src/stdlib/paths.c index 0557562..3f27aef 100644 --- a/src/stdlib/paths.c +++ b/src/stdlib/paths.c @@ -24,7 +24,6 @@ #include "integers.h" #include "optionals.h" #include "paths.h" -#include "patterns.h" #include "structs.h" #include "text.h" #include "types.h" @@ -599,15 +598,10 @@ public PUREFUNC Text_t Path$base_name(Path_t path) public Text_t Path$extension(Path_t path, bool full) { - Text_t base = Path$base_name(path); - Array_t results = Text$matches(base, full ? Pattern(".{!.}.{..}") : Pattern(".{..}.{!.}{end}")); - if (results.length > 0) - return *((Text_t*)(results.data + results.stride*1)); - results = Text$matches(base, full ? Pattern("{!.}.{..}") : Pattern("{..}.{!.}{end}")); - if (results.length > 0) - return *((Text_t*)(results.data + results.stride*1)); - else - return Text(""); + const char *base = Text$as_c_string(Path$base_name(path)); + const char *dot = full ? 
strchr(base + 1, '.') : strrchr(base + 1, '.'); + const char *extension = dot ? dot + 1 : ""; + return Text$from_str(extension); } public Path_t Path$with_component(Path_t path, Text_t component) @@ -635,10 +629,10 @@ public Path_t Path$with_extension(Path_t path, Text_t extension, bool replace) Text_t last = *(Text_t*)(path.components.data + path.components.stride*(path.components.length-1)); Array$remove_at(&result.components, I(-1), I(1), sizeof(Text_t)); if (replace) { - if (Text$starts_with(last, Text("."))) - last = Text$replace(last, Pattern(".{!.}.{..}"), Text(".@1"), Pattern("@"), false); - else - last = Text$replace(last, Pattern("{!.}.{..}"), Text("@1"), Pattern("@"), false); + const char *base = Text$as_c_string(last); + const char *dot = strchr(base + 1, '.'); + if (dot) + last = Text$from_strn(base, (size_t)(dot - base)); } last = Text$concat(last, extension); diff --git a/src/stdlib/patterns.h b/src/stdlib/patterns.h deleted file mode 100644 index 2b77e49..0000000 --- a/src/stdlib/patterns.h +++ /dev/null @@ -1,46 +0,0 @@ -#pragma once - -// The type representing text patterns for pattern matching. - -#include -#include - -#include "datatypes.h" -#include "integers.h" -#include "optionals.h" -#include "types.h" - -#define Pattern(text) ((Pattern_t)Text(text)) -#define Patterns(...) 
((Pattern_t)Texts(__VA_ARGS__)) - -typedef struct { - Text_t text; - Int_t index; - Array_t captures; -} Match_t; - -typedef Match_t OptionalMatch_t; -#define NONE_MATCH ((OptionalMatch_t){.index=NONE_INT}) - -Text_t Text$replace(Text_t str, Pattern_t pat, Text_t replacement, Pattern_t backref_pat, bool recursive); -Pattern_t Pattern$escape_text(Text_t text); -Text_t Text$replace_all(Text_t text, Table_t replacements, Pattern_t backref_pat, bool recursive); -Array_t Text$split(Text_t text, Pattern_t pattern); -Closure_t Text$by_split(Text_t text, Pattern_t pattern); -Text_t Text$trim(Text_t text, Pattern_t pattern, bool trim_left, bool trim_right); -OptionalMatch_t Text$find(Text_t text, Pattern_t pattern, Int_t i); -Array_t Text$find_all(Text_t text, Pattern_t pattern); -Closure_t Text$by_match(Text_t text, Pattern_t pattern); -PUREFUNC bool Text$has(Text_t text, Pattern_t pattern); -OptionalArray_t Text$matches(Text_t text, Pattern_t pattern); -Text_t Text$map(Text_t text, Pattern_t pattern, Closure_t fn, bool recursive); -void Text$each(Text_t text, Pattern_t pattern, Closure_t fn, bool recursive); - -#define Pattern$hash Text$hash -#define Pattern$compare Text$compare -#define Pattern$equal Text$equal - -extern const TypeInfo_t Match$info; -extern const TypeInfo_t Pattern$info; - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/src/stdlib/stdlib.c b/src/stdlib/stdlib.c index fc94bc9..77383cd 100644 --- a/src/stdlib/stdlib.c +++ b/src/stdlib/stdlib.c @@ -20,7 +20,6 @@ #include "optionals.h" #include "metamethods.h" #include "nums.h" -#include "patterns.h" #include "paths.h" #include "rng.h" #include "siphash.h" diff --git a/src/stdlib/text.c b/src/stdlib/text.c index 27acdfa..621de94 100644 --- a/src/stdlib/text.c +++ b/src/stdlib/text.c @@ -998,17 +998,22 @@ PUREFUNC public int32_t Text$compare(const void *va, const void *vb, const TypeI return 0; } +bool _matches(TextIter_t *text_state, TextIter_t *target_state, int64_t pos) +{ + for (int64_t i = 0; 
i < target_state->stack[0].text.length; i++) { + int32_t text_i = Text$get_grapheme_fast(text_state, pos + i); + int32_t prefix_i = Text$get_grapheme_fast(target_state, i); + if (text_i != prefix_i) return false; + } + return true; +} + PUREFUNC public bool Text$starts_with(Text_t text, Text_t prefix) { if (text.length < prefix.length) return false; TextIter_t text_state = NEW_TEXT_ITER_STATE(text), prefix_state = NEW_TEXT_ITER_STATE(prefix); - for (int64_t i = 0; i < prefix.length; i++) { - int32_t text_i = Text$get_grapheme_fast(&text_state, i); - int32_t prefix_i = Text$get_grapheme_fast(&prefix_state, i); - if (text_i != prefix_i) return false; - } - return true; + return _matches(&text_state, &prefix_state, 0); } PUREFUNC public bool Text$ends_with(Text_t text, Text_t suffix) @@ -1016,12 +1021,236 @@ PUREFUNC public bool Text$ends_with(Text_t text, Text_t suffix) if (text.length < suffix.length) return false; TextIter_t text_state = NEW_TEXT_ITER_STATE(text), suffix_state = NEW_TEXT_ITER_STATE(suffix); - for (int64_t i = 0; i < suffix.length; i++) { - int32_t text_i = Text$get_grapheme_fast(&text_state, text.length - suffix.length + i); - int32_t suffix_i = Text$get_grapheme_fast(&suffix_state, i); - if (text_i != suffix_i) return false; + return _matches(&text_state, &suffix_state, text.length - suffix.length); +} + +public Text_t Text$without_prefix(Text_t text, Text_t prefix) +{ + return Text$starts_with(text, prefix) ? Text$slice(text, I(prefix.length + 1), I(text.length)) : text; +} + +public Text_t Text$without_suffix(Text_t text, Text_t suffix) +{ + return Text$ends_with(text, suffix) ? 
Text$slice(text, I(1), I(text.length - suffix.length)) : text; +} + +static bool _has_grapheme(TextIter_t *text, int32_t g) +{ + for (int64_t t = 0; t < text->stack[0].text.length; t++) { + if (g == Text$get_grapheme_fast(text, t)) { + return true; + } } - return true; + return false; +} + +public Text_t Text$trim(Text_t text, Text_t to_trim, bool left, bool right) +{ + int64_t first = 0; + TextIter_t text_state = NEW_TEXT_ITER_STATE(text), trim_state = NEW_TEXT_ITER_STATE(to_trim); + if (left) { + while (first < text.length && _has_grapheme(&trim_state, Text$get_grapheme_fast(&text_state, first))) { + first += 1; + } + } + int64_t last = text.length-1; + if (right) { + while (last >= first && _has_grapheme(&trim_state, Text$get_grapheme_fast(&text_state, last))) { + last -= 1; + } + } + return (first != 0 || last != text.length-1) ? Text$slice(text, I(first+1), I(last+1)) : text; +} + +public Text_t Text$translate(Text_t text, Table_t translations) +{ + TextIter_t text_state = NEW_TEXT_ITER_STATE(text); + Text_t result = EMPTY_TEXT; + int64_t span_start = 0; + Array_t replacement_array = translations.entries; + for (int64_t i = 0; i < text.length; ) { + for (int64_t r = 0; r < replacement_array.length; r++) { + struct { Text_t target, replacement; } *entry = replacement_array.data + r*replacement_array.stride; + TextIter_t target_state = NEW_TEXT_ITER_STATE(entry->target); + if (entry->target.length > 0 && _matches(&text_state, &target_state, i)) { /* skip empty targets: they match everywhere without advancing i, which would loop forever */ + if (i > span_start) + result = concat2(result, Text$slice(text, I(span_start+1), I(i))); + + result = concat2(result, entry->replacement); + i += entry->target.length; + span_start = i; + goto found_match; + } + } + i += 1; + found_match: continue; + } + if (span_start < text.length) + result = concat2(result, Text$slice(text, I(span_start+1), I(text.length))); + return result; +} + +public Text_t Text$replace(Text_t text, Text_t target, Text_t replacement) +{ + TextIter_t text_state = NEW_TEXT_ITER_STATE(text), target_state =
NEW_TEXT_ITER_STATE(target); + Text_t result = EMPTY_TEXT; + int64_t span_start = 0; + for (int64_t i = 0; i < text.length; ) { + if (target.length > 0 && _matches(&text_state, &target_state, i)) { /* an empty target never matches; otherwise i would not advance and the loop would not terminate */ + if (i > span_start) + result = concat2(result, Text$slice(text, I(span_start+1), I(i))); + + result = concat2(result, replacement); + i += target.length; + span_start = i; + } else { + i += 1; + } + } + if (span_start < text.length) + result = concat2(result, Text$slice(text, I(span_start+1), I(text.length))); + return result; +} + +public bool Text$has(Text_t text, Text_t target) +{ + TextIter_t text_state = NEW_TEXT_ITER_STATE(text), target_state = NEW_TEXT_ITER_STATE(target); + for (int64_t i = 0; i + target.length <= text.length; i++) { /* only probe positions where target fits inside text; the empty target is found in any text, including "" */ + if (_matches(&text_state, &target_state, i)) + return true; + } + return false; +} + +public Array_t Text$split(Text_t text, Text_t delimiters) +{ + if (delimiters.length == 0) + return Text$clusters(text); + + TextIter_t text_state = NEW_TEXT_ITER_STATE(text), delim_state = NEW_TEXT_ITER_STATE(delimiters); + Array_t splits = {}; + for (int64_t i = 0; i < text.length; ) { + int64_t span_len = 0; + while (i + span_len < text.length && !_matches(&text_state, &delim_state, i + span_len)) { + span_len += 1; + } + Text_t slice = Text$slice(text, I(i+1), I(i+span_len)); + Array$insert(&splits, &slice, I(0), sizeof(slice)); + i += span_len + delimiters.length; + if (i == text.length) { + Text_t empty = Text(""); + Array$insert(&splits, &empty, I(0), sizeof(empty)); + } + } + return splits; +} + +public Array_t Text$split_any(Text_t text, Text_t delimiters) +{ + if (delimiters.length == 0) + return Array(text); + + TextIter_t text_state = NEW_TEXT_ITER_STATE(text), delim_state = NEW_TEXT_ITER_STATE(delimiters); + Array_t splits = {}; + for (int64_t i = 0; i < text.length; ) { + int64_t span_len = 0; + while (i + span_len < text.length && !_has_grapheme(&delim_state, Text$get_grapheme_fast(&text_state, i + span_len))) { + span_len += 1; + } + bool trailing_delim = i + span_len
< text.length; + Text_t slice = Text$slice(text, I(i+1), I(i+span_len)); + Array$insert(&splits, &slice, I(0), sizeof(slice)); + i += span_len + 1; + while (i < text.length && _has_grapheme(&delim_state, Text$get_grapheme_fast(&text_state, i))) { + i += 1; + } + if (i >= text.length && trailing_delim) { + Text_t empty = Text(""); + Array$insert(&splits, &empty, I(0), sizeof(empty)); + } + } + return splits; +} + +typedef struct { + TextIter_t state; + int64_t i; + Text_t delimiter; +} split_iter_state_t; + +static OptionalText_t next_split(split_iter_state_t *state) +{ + Text_t text = state->state.stack[0].text; + if (state->i >= text.length) { + if (state->delimiter.length > 0 && state->i == text.length) { // special case + state->i = text.length + 1; + return EMPTY_TEXT; + } + return NONE_TEXT; + } + + if (state->delimiter.length == 0) { // special case: empty delimiter yields one grapheme cluster per call, like Text$split() + Text_t ret = Text$cluster(text, I(state->i+1)); + state->i += 1; + return ret; + } + + TextIter_t delim_state = NEW_TEXT_ITER_STATE(state->delimiter); + int64_t i = state->i; + int64_t span_len = 0; + while (i + span_len < text.length && !_matches(&state->state, &delim_state, i + span_len)) { + span_len += 1; + } + Text_t slice = Text$slice(text, I(i+1), I(i+span_len)); + state->i = i + span_len + state->delimiter.length; + return slice; +} + +public Closure_t Text$by_split(Text_t text, Text_t delimiter) +{ + return (Closure_t){ + .fn=(void*)next_split, + .userdata=new(split_iter_state_t, .state=NEW_TEXT_ITER_STATE(text), .i=0, .delimiter=delimiter), + }; +} + +static OptionalText_t next_split_any(split_iter_state_t *state) +{ + Text_t text = state->state.stack[0].text; + if (state->i >= text.length) { + if (state->delimiter.length > 0 && state->i == text.length) { // special case + state->i = text.length + 1; + return EMPTY_TEXT; + } + return NONE_TEXT; + } + + if (state->delimiter.length == 0) { // special case: no delimiters to split on, yield the whole text once, like Text$split_any() + state->i = text.length + 1; + return text; + } + + TextIter_t delim_state =
NEW_TEXT_ITER_STATE(state->delimiter); + int64_t i = state->i; + int64_t span_len = 0; + while (i + span_len < text.length && !_has_grapheme(&delim_state, Text$get_grapheme_fast(&state->state, i + span_len))) { + span_len += 1; + } + Text_t slice = Text$slice(text, I(i+1), I(i+span_len)); + i += span_len + 1; + while (i < text.length && _has_grapheme(&delim_state, Text$get_grapheme_fast(&state->state, i))) { + i += 1; + } + state->i = i; + return slice; +} + +public Closure_t Text$by_split_any(Text_t text, Text_t delimiters) +{ + return (Closure_t){ + .fn=(void*)next_split_any, + .userdata=new(split_iter_state_t, .state=NEW_TEXT_ITER_STATE(text), .i=0, .delimiter=delimiters), + }; } PUREFUNC public bool Text$equal_values(Text_t a, Text_t b) diff --git a/src/stdlib/text.h b/src/stdlib/text.h index 4acca8a..662c6e5 100644 --- a/src/stdlib/text.h +++ b/src/stdlib/text.h @@ -50,6 +50,16 @@ Text_t Text$as_text(const void *text, bool colorize, const TypeInfo_t *info); Text_t Text$quoted(Text_t str, bool colorize, Text_t quotation_mark); PUREFUNC bool Text$starts_with(Text_t text, Text_t prefix); PUREFUNC bool Text$ends_with(Text_t text, Text_t suffix); +Text_t Text$without_prefix(Text_t text, Text_t prefix); +Text_t Text$without_suffix(Text_t text, Text_t suffix); +Text_t Text$replace(Text_t text, Text_t target, Text_t replacement); +Text_t Text$translate(Text_t text, Table_t translations); +bool Text$has(Text_t text, Text_t target); +Array_t Text$split(Text_t text, Text_t delimiter); +Array_t Text$split_any(Text_t text, Text_t delimiters); +Closure_t Text$by_split(Text_t text, Text_t delimiter); +Closure_t Text$by_split_any(Text_t text, Text_t delimiters); +Text_t Text$trim(Text_t text, Text_t to_trim, bool left, bool right); char *Text$as_c_string(Text_t text); __attribute__((format(printf, 1, 2))) public Text_t Text$format(const char *fmt, ...); diff --git a/src/stdlib/tomo.h b/src/stdlib/tomo.h index 4aa1253..e42b562 100644 --- a/src/stdlib/tomo.h +++ 
b/src/stdlib/tomo.h @@ -20,7 +20,6 @@ #include "nums.h" #include "optionals.h" #include "paths.h" -#include "patterns.h" #include "pointers.h" #include "print.h" #include "rng.h" diff --git a/src/tomo.c b/src/tomo.c index b91d45e..f93dd1f 100644 --- a/src/tomo.c +++ b/src/tomo.c @@ -21,7 +21,6 @@ #include "stdlib/datatypes.h" #include "stdlib/integers.h" #include "stdlib/optionals.h" -#include "stdlib/patterns.h" #include "stdlib/paths.h" #include "stdlib/print.h" #include "stdlib/text.h" @@ -294,7 +293,12 @@ int main(int argc, char *argv[]) Text_t escape_lib_name(Text_t lib_name) { - return Text$replace(lib_name, Pattern("{1+ !alphanumeric}"), Text("_"), Pattern(""), false); + char *libname_id = String(lib_name); + for (char *p = libname_id; *p; p++) { + if (!isalnum((unsigned char)*p) && *p != '_') /* cast: passing a negative char (e.g. a UTF-8 byte) to isalnum() is undefined behaviour */ + *p = '_'; + } + return Text$from_str(libname_id); } Path_t build_file(Path_t path, const char *extension) diff --git a/src/typecheck.c b/src/typecheck.c index 0bfe6a0..ff60943 100644 --- a/src/typecheck.c +++ b/src/typecheck.c @@ -12,7 +12,6 @@ #include "cordhelpers.h" #include "environment.h" #include "parse.h" -#include "stdlib/patterns.h" #include "stdlib/paths.h" #include "stdlib/tables.h" #include "stdlib/text.h" @@ -195,8 +194,11 @@ static env_t *load_module(env_t *env, ast_t *module_ast) env_t *module_env = fresh_scope(env); Table$str_set(env->imports, use->path, module_env); - char *libname_id = Text$as_c_string( - Text$replace(Text$from_str(use->path), Pattern("{1+ !alphanumeric}"), Text("_"), Pattern(""), false)); + char *libname_id = String(use->path); + for (char *p = libname_id; *p; p++) { + if (!isalnum((unsigned char)*p) && *p != '_') /* cast: passing a negative char (e.g. a UTF-8 byte) to isalnum() is undefined behaviour */ + *p = '_'; + } module_env->libname = libname_id; for (size_t i = 0; i < tm_files.gl_pathc; i++) { const char *filename = tm_files.gl_pathv[i]; @@ -269,6 +271,14 @@ void prebind_statement(env_t *env, ast_t *statement) prebind_statement(ns_env, stmt->ast); break; } + case Extend: { + auto extend = Match(statement, Extend); + env_t *ns_env =
namespace_env(env, extend->name); + ns_env->libname = env->libname; + for (ast_list_t *stmt = extend->body ? Match(extend->body, Block)->statements : NULL; stmt; stmt = stmt->next) + prebind_statement(ns_env, stmt->ast); + break; + } default: break; } } @@ -435,6 +445,14 @@ void bind_statement(env_t *env, ast_t *statement) bind_statement(ns_env, stmt->ast); break; } + case Extend: { + auto extend = Match(statement, Extend); + env_t *ns_env = namespace_env(env, extend->name); + ns_env->libname = env->libname; + for (ast_list_t *stmt = extend->body ? Match(extend->body, Block)->statements : NULL; stmt; stmt = stmt->next) + bind_statement(ns_env, stmt->ast); + break; + } case Use: { env_t *module_env = load_module(env, statement); if (!module_env) break; @@ -940,7 +958,7 @@ type_t *get_type(env_t *env, ast_t *ast) // Early out if the type is knowable without any context from the block: switch (last->ast->tag) { - case UpdateAssign: case Assign: case Declare: case FunctionDef: case ConvertDef: case StructDef: case EnumDef: case LangDef: + case UpdateAssign: case Assign: case Declare: case FunctionDef: case ConvertDef: case StructDef: case EnumDef: case LangDef: case Extend: return Type(VoidType); default: break; } @@ -1240,7 +1258,7 @@ type_t *get_type(env_t *env, ast_t *ast) return Type(ClosureType, Type(FunctionType, .args=args, .ret=ret)); } - case FunctionDef: case ConvertDef: case StructDef: case EnumDef: case LangDef: { + case FunctionDef: case ConvertDef: case StructDef: case EnumDef: case LangDef: case Extend: { return Type(VoidType); } @@ -1399,7 +1417,7 @@ PUREFUNC bool is_discardable(env_t *env, ast_t *ast) { switch (ast->tag) { case UpdateAssign: case Assign: case Declare: case FunctionDef: case ConvertDef: case StructDef: case EnumDef: - case LangDef: case Use: + case LangDef: case Use: case Extend: return true; default: break; } diff --git a/test/lang.tm b/test/lang.tm index 2aebf8e..dba096d 100644 --- a/test/lang.tm +++ b/test/lang.tm @@ -1,12 +1,12 @@ 
lang HTML: HEADER := $HTML"" convert(t:Text->HTML): - t = t:replace_all({ - $/&/="&", - $//=">", - $/"/=""", - $/'/="'", + t = t:translate({ + "&"="&", + "<"="<", + ">"=">", + '"'=""", + "'"="'", }) return HTML.from_text(t) diff --git a/test/text.tm b/test/text.tm index ffd3c06..fe295f9 100644 --- a/test/text.tm +++ b/test/text.tm @@ -74,45 +74,24 @@ func main(): >> amelie2:codepoint_names() = ["LATIN CAPITAL LETTER A", "LATIN SMALL LETTER M", "LATIN SMALL LETTER E WITH ACUTE", "LATIN SMALL LETTER L", "LATIN SMALL LETTER I", "LATIN SMALL LETTER E"] - >> "Hello":replace($/e/, "X") + >> "Hello":replace("e", "X") = "HXllo" - >> "Hello":has($/l/) + >> "Hello":has("l") = yes - >> "Hello":has($/l{end}/) - = no - >> "Hello":has($/{start}l/) + >> "Hello":has("x") = no - >> "Hello":has($/o/) - = yes - >> "Hello":has($/o{end}/) - = yes - >> "Hello":has($/{start}o/) - = no - - >> "Hello":has($/H/) - = yes - >> "Hello":has($/H{end}/) - = no - >> "Hello":has($/{start}H/) - = yes - - >> "Hello":replace($/l/, "") + >> "Hello":replace("l", "") = "Heo" - >> "xxxx":replace($/x/, "") + >> "xxxx":replace("x", "") = "" - >> "xxxx":replace($/y/, "") + >> "xxxx":replace("y", "") = "xxxx" - >> "One two three four five six":replace($/e /, "") + >> "One two three four five six":replace("e ", "") = "Ontwo threfour fivsix" - >> " one ":replace($/{start}{space}/, "") - = "one " - >> " one ":replace($/{space}{end}/, "") - = " one" - - >> amelie:has($/$amelie2/) + >> amelie:has(amelie2) = yes >> multiline := " @@ -138,11 +117,6 @@ func main(): >> ${one {nested} two $(1+2)} = "one {nested} two 3" - >> "one two three":replace($/{alpha}/, "") - = " " - >> "one two three":replace($/{alpha}/, "word") - = "word word word" - c := "É̩" >> c:codepoint_names() = ["LATIN CAPITAL LETTER E WITH ACUTE", "COMBINING VERTICAL LINE BELOW"] @@ -165,18 +139,29 @@ func main(): = [:Text] !! 
Test splitting and joining text: - >> "one two three":split($/ /) + >> "one,, two,three":split(",") + = ["one", "", " two", "three"] + >> [t for t in "one,, two,three":by_split(",")] + = ["one", "", " two", "three"] + >> "one,, two,three":split_any(", ") = ["one", "two", "three"] - - >> "one,two,three,":split($/,/) - = ["one", "two", "three", ""] - - >> "one two three":split($/{space}/) + >> [t for t in "one,, two,three":by_split_any(", ")] = ["one", "two", "three"] + >> ",one,, two,three,":split(",") + = ["", "one", "", " two", "three", ""] + >> [t for t in ",one,, two,three,":by_split(",")] + = ["", "one", "", " two", "three", ""] + >> ",one,, two,three,":split_any(", ") + = ["", "one", "two", "three", ""] + >> [t for t in ",one,, two,three,":by_split_any(", ")] + = ["", "one", "two", "three", ""] - >> "abc":split($//) + >> "abc":split() = ["a", "b", "c"] + >> "one two three":split_any() + = ["one", "two", "three"] + >> ", ":join(["one", "two", "three"]) = "one, two, three" @@ -192,35 +177,6 @@ func main(): >> "":split() = [:Text] - !! Test text:find_all() - >> " #one #two #three ":find_all($/#{alpha}/) - = [Match(text="#one", index=2, captures=["one"]), Match(text="#two", index=8, captures=["two"]), Match(text="#three", index=13, captures=["three"])] - - >> " #one #two #three ":find_all($/#{!space}/) - = [Match(text="#one", index=2, captures=["one"]), Match(text="#two", index=8, captures=["two"]), Match(text="#three", index=13, captures=["three"])] - - >> " ":find_all($/{alpha}/) - = [:Match] - - >> " foo(baz(), 1) doop() ":find_all($/{id}(?)/) - = [Match(text="foo(baz(), 1)", index=2, captures=["foo", "baz(), 1"]), Match(text="doop()", index=17, captures=["doop", ""])] - - >> "":find_all($Pattern'') - = [:Match] - - >> "Hello":find_all($Pattern'') - = [:Match] - - !! 
Test text:find() - >> " one two three ":find($/{id}/, start=-999) - = none : Match - >> " one two three ":find($/{id}/, start=999) - = none : Match - >> " one two three ":find($/{id}/) - = Match(text="one", index=2, captures=["one"])? - >> " one two three ":find($/{id}/, start=5) - = Match(text="two", index=8, captures=["two"])? - !! Test text slicing: >> "abcdef":slice() = "abcdef" @@ -248,64 +204,15 @@ func main(): >> Text.from_codepoint_names(["not a valid name here buddy"]) = none : Text - >> "one two; three four":find_all($/; {..}/) - = [Match(text="; three four", index=8, captures=["three four"])] + >> "Hello":replace("ello", "i") + = "Hi" - malicious := "{xxx}" - >> $/$malicious/ - = $/{1{}xxx}/ - - >> "Hello":replace($/{lower}/, "(\0)") - = "H(ello)" - - >> " foo(xyz) foo(yyy) foo(z()) ":replace($/foo(?)/, "baz(\1)") - = " baz(xyz) baz(yyy) baz(z()) " - - >> "":replace_all({$//=">"}) + >> "":translate({"<"="<", ">"=">"}) = "<tag>" - >> " BAD(x, fn(y), BAD(z), w) ":replace($/BAD(?)/, "good(\1)", recursive=yes) - = " good(x, fn(y), good(z), w) " - - >> " BAD(x, fn(y), BAD(z), w) ":replace($/BAD(?)/, "good(\1)", recursive=no) - = " good(x, fn(y), BAD(z), w) " - - >> "Hello":matches($/{id}/) - = ["Hello"]? 
- >> "Hello":matches($/{lower}/) - = none : [Text] - >> "Hello":matches($/{upper}/) - = none : [Text] - >> "Hello...":matches($/{id}/) - = none : [Text] - - if matches := "hello world":matches($/{id} {id}/): - >> matches - = ["hello", "world"] - else: - fail("Failed to match") - - >> "hello world":map($/world/, func(m:Match): m.text:upper()) - = "hello WORLD" - >> "Abc":repeat(3) = "AbcAbcAbc" - >> " abc def ":trim() - = "abc def" - >> " abc123def ":trim($/{!digit}/) - = "123" - >> " abc123def ":trim($/{!digit}/, trim_left=no) - = " abc123" - >> " abc123def ":trim($/{!digit}/, trim_right=no) - = "123def " - # Only trim single whole matches that bookend the text: - >> "AbcAbcxxxxxxxxAbcAbc":trim($/Abc/) - = "AbcxxxxxxxxAbc" - - >> "A=B=C=D":replace($/{..}={..}/, "1:(\1) 2:(\2)") - = "1:(A) 2:(B=C=D)" - >> "abcde":starts_with("ab") = yes >> "abcde":starts_with("bc") @@ -316,6 +223,16 @@ func main(): >> "abcde":starts_with("cd") = no + >> "abcde":without_prefix("ab") + = "cde" + >> "abcde":without_suffix("ab") + = "abcde" + + >> "abcde":without_prefix("de") + = "abcde" + >> "abcde":without_suffix("de") + = "abc" + >> ("hello" ++ " " ++ "Amélie"):reversed() = "eilémA olleh" @@ -387,3 +304,13 @@ func main(): >> cowboy:middle_pad(4) = " 🤠 " + >> " one, ":trim(" ,") + = "one" + >> " one, ":trim(" ,", left=no) + = " one" + >> " one, ":trim(" ,", right=no) + = "one, " + >> " ":trim(" ,") + = "" + >> " ":trim(" ,", left=no) + = ""