diff options
| -rw-r--r-- | Makefile | 2 | ||||
| -rw-r--r-- | builtins/text.c | 107 | ||||
| -rw-r--r-- | builtins/text.h | 9 | ||||
| -rw-r--r-- | builtins/where.c | 54 | ||||
| -rw-r--r-- | builtins/where.h | 37 | ||||
| -rw-r--r-- | environment.c | 37 | ||||
| -rw-r--r-- | test/text.tm | 37 |
7 files changed, 226 insertions, 57 deletions
@@ -25,7 +25,7 @@ O=-Og CFLAGS=$(CCONFIG) $(EXTRA) $(CWARN) $(G) $(O) $(OSFLAGS) LDLIBS=-lgc -lcord -lm -lunistring -ldl -L. -ltomo BUILTIN_OBJS=builtins/array.o builtins/bool.o builtins/nums.o builtins/functions.o builtins/integers.o \ - builtins/pointer.o builtins/memory.o builtins/text.o builtins/c_string.o builtins/table.o \ + builtins/pointer.o builtins/memory.o builtins/text.o builtins/where.o builtins/c_string.o builtins/table.o \ builtins/types.o builtins/util.o builtins/files.o all: libtomo.so tomo diff --git a/builtins/text.c b/builtins/text.c index 9a9c3801..a3161926 100644 --- a/builtins/text.c +++ b/builtins/text.c @@ -24,6 +24,14 @@ #define CLAMP(x, lo, hi) MIN(hi, MAX(x,lo)) +static inline uint8_t *_normalize(CORD str, uint8_t *buf, size_t *len) +{ + const uint8_t *str_u8 = (const uint8_t*)CORD_to_const_char_star(str); + uint8_t *normalized = u8_normalize(UNINORM_NFD, str_u8, strlen((char*)str_u8)+1, buf, len); + if (!normalized) errx(1, "Unicode normalization error!"); + return normalized; +} + public CORD Text$as_text(const void *text, bool colorize, const TypeInfo *info) { if (!text) return info->TextInfo.lang; @@ -111,12 +119,8 @@ public uint32_t Text$hash(const CORD *cord) { if (!*cord) return 0; - const char *str = CORD_to_const_char_star(*cord); - size_t len = strlen(str); - uint8_t buf[128] = {0}; - size_t norm_len = sizeof(buf); - uint8_t *normalized = u8_normalize(UNINORM_NFD, (uint8_t*)str, len+1, buf, &norm_len); - if (!normalized) errx(1, "Unicode normalization error!"); + uint8_t buf[128] = {0}; size_t norm_len = sizeof(buf); + uint8_t *normalized = _normalize(*cord, buf, &norm_len); uint32_t hash; halfsiphash(normalized, norm_len, TOMO_HASH_VECTOR, (uint8_t*)&hash, sizeof(hash)); @@ -166,51 +170,75 @@ public CORD Text$title(CORD str) return (CORD)u8_totitle((const uint8_t*)str, len-1, uc_locale_language(), NULL, dest, &len); } -public bool Text$has(CORD str, CORD target, where_e where) +public bool Text$has(CORD str, CORD target, Where_t where) { if (!target) return true; if (!str) return false; - if (where == WHERE_START) { - return (CORD_ncmp(str, 0, target, 0, CORD_len(target)) == 0); - } else if (where == WHERE_END) { - size_t str_len = CORD_len(str); - size_t target_len = CORD_len(target); - return (str_len >= target_len && CORD_ncmp(str, str_len-target_len, target, 0, target_len) == 0); + uint8_t str_buf[128] = {0}; size_t str_norm_len = sizeof(str_buf); + uint8_t *str_normalized = _normalize(str, str_buf, &str_norm_len); + + uint8_t target_buf[128] = {0}; size_t target_norm_len = sizeof(target_buf); + uint8_t *target_normalized = _normalize(target, target_buf, &target_norm_len); + + if (target_norm_len > str_norm_len) return false; + + bool ret; + if (where.$tag == $tag$Where$Start) { + ret = (u8_strncmp(str_normalized, target_normalized, target_norm_len-1) == 0); + } else if (where.$tag == $tag$Where$End) { + ret = (u8_strcmp(str_normalized + str_norm_len - target_norm_len, target_normalized) == 0); } else { - size_t pos = CORD_str(str, 0, target); - return (pos != CORD_NOT_FOUND); + assert(where.$tag == $tag$Where$Anywhere); + ret = (u8_strstr(str_normalized, target_normalized) != NULL); } + + if (str_normalized != str_buf) free(str_normalized); + if (target_normalized != target_buf) free(target_normalized); + return ret; } -public CORD Text$without(CORD str, CORD target, where_e where) +public CORD Text$without(CORD str, CORD target, Where_t where) { if (!str || !target) return str; size_t target_len = CORD_len(target); size_t str_len = CORD_len(str); - if (where == WHERE_START) { + if (where.$tag == $tag$Where$Start) { if (CORD_ncmp(str, 0, target, 0, target_len) == 0) return CORD_substr(str, target_len, str_len - target_len); return str; - } else if (where == WHERE_END) { + } else if (where.$tag == $tag$Where$End) { if (CORD_ncmp(str, str_len-target_len, target, 0, target_len) == 0) return CORD_substr(str, 0, str_len - target_len); return str; } else { - errx(1, "Not implemented"); + CORD ret = CORD_EMPTY; + size_t i = 0; + for (;;) { + size_t match = CORD_str(str, i, target); + if (match == CORD_NOT_FOUND) { + if (i == 0) return str; // No matches! + ret = CORD_cat(ret, CORD_substr(str, i, str_len)); + break; + } + ret = CORD_cat(ret, CORD_substr(str, i, (match-i))); + i = match + target_len; + } + return ret; } } -public CORD Text$trimmed(CORD str, CORD skip, where_e where) +public CORD Text$trimmed(CORD str, CORD skip, Where_t where) { if (!str || !skip) return str; const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(str); const uint8_t *uskip = (const uint8_t*)CORD_to_const_char_star(skip); - if (where == WHERE_START) { + // TODO: implement proper reverse iteration with u8_prev() + if (where.$tag == $tag$Where$Start) { size_t span = u8_strspn(ustr, uskip); return (CORD)ustr + span; - } else if (where == WHERE_END) { + } else if (where.$tag == $tag$Where$End) { size_t len = u8_strlen(ustr); const uint8_t *back = ustr + len; size_t back_span = 0; @@ -287,12 +315,8 @@ public CORD Text$join(CORD glue, array_t pieces) public array_t Text$clusters(CORD text) { array_t clusters = {.atomic=1}; - const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(text); - uint8_t buf[128] = {0}; - size_t norm_len = sizeof(buf); - uint8_t *normalized = u8_normalize(UNINORM_NFD, ustr, strlen((char*)ustr)+1, buf, &norm_len); - if (!normalized) errx(1, "Unicode normalization error!"); - + uint8_t buf[128] = {0}; size_t norm_len = sizeof(buf); + uint8_t *normalized = _normalize(text, buf, &norm_len); const uint8_t *end = normalized + strlen((char*)normalized); for (const uint8_t *pos = normalized; pos != end; ) { const uint8_t *next = u8_grapheme_next(pos, end); @@ -310,11 +334,8 @@ public array_t Text$clusters(CORD text) public array_t Text$codepoints(CORD text) { - const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(text); - uint8_t norm_buf[128] = {0}; - size_t norm_len = sizeof(norm_buf); - uint8_t *normalized = u8_normalize(UNINORM_NFD, ustr, strlen((char*)ustr)+1, norm_buf, &norm_len); - if (!normalized) errx(1, "Unicode normalization error!"); + uint8_t norm_buf[128] = {0}; size_t norm_len = sizeof(norm_buf); + uint8_t *normalized = _normalize(text, norm_buf, &norm_len); uint32_t codepoint_buf[128] = {0}; size_t codepoint_len = sizeof(codepoint_buf); @@ -333,11 +354,8 @@ public array_t Text$codepoints(CORD text) public array_t Text$bytes(CORD text) { - const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(text); - uint8_t norm_buf[128] = {0}; - size_t norm_len = sizeof(norm_buf); - uint8_t *normalized = u8_normalize(UNINORM_NFD, ustr, strlen((char*)ustr)+1, norm_buf, &norm_len); - if (!normalized) errx(1, "Unicode normalization error!"); + uint8_t norm_buf[128] = {0}; size_t norm_len = sizeof(norm_buf); + uint8_t *normalized = _normalize(text, norm_buf, &norm_len); --norm_len; // NUL byte array_t ret = { @@ -366,11 +384,8 @@ public int64_t Text$num_clusters(CORD text) public int64_t Text$num_codepoints(CORD text) { - const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(text); - uint8_t buf[128] = {0}; - size_t norm_len = sizeof(buf); - uint8_t *normalized = u8_normalize(UNINORM_NFD, ustr, strlen((char*)ustr)+1, buf, &norm_len); - if (!normalized) errx(1, "Unicode normalization error!"); + uint8_t buf[128] = {0}; size_t norm_len = sizeof(buf); + uint8_t *normalized = _normalize(text, buf, &norm_len); int64_t num_codepoints = u8_mbsnlen(normalized, norm_len-1); if (normalized != buf) free(normalized); return num_codepoints; @@ -378,10 +393,8 @@ public int64_t Text$num_codepoints(CORD text) public int64_t Text$num_bytes(CORD text) { - const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(text); - uint8_t norm_buf[128] = {0}; - size_t norm_len = sizeof(norm_buf); - uint8_t *normalized = u8_normalize(UNINORM_NFD, ustr, strlen((char*)ustr)+1, norm_buf, &norm_len); + uint8_t norm_buf[128] = {0}; size_t norm_len = sizeof(norm_buf); + uint8_t *normalized = _normalize(text, norm_buf, &norm_len); --norm_len; // NUL byte if (!normalized) errx(1, "Unicode normalization error!"); if (normalized != norm_buf) free(normalized); diff --git a/builtins/text.h b/builtins/text.h index cd191cb6..e97c4010 100644 --- a/builtins/text.h +++ b/builtins/text.h @@ -8,11 +8,10 @@ #include <stdint.h> #include "types.h" +#include "where.h" #define Text_t CORD -typedef enum { WHERE_ANYWHERE, WHERE_START, WHERE_END } where_e; - typedef struct { enum { FIND_FAILURE, FIND_SUCCESS } status; int32_t index; @@ -27,9 +26,9 @@ CORD Text$slice(CORD text, int64_t first, int64_t length); CORD Text$upper(CORD str); CORD Text$lower(CORD str); CORD Text$title(CORD str); -bool Text$has(CORD str, CORD target, where_e where); -CORD Text$without(CORD str, CORD target, where_e where); -CORD Text$trimmed(CORD str, CORD skip, where_e where); +bool Text$has(CORD str, CORD target, Where_t where); +CORD Text$without(CORD str, CORD target, Where_t where); +CORD Text$trimmed(CORD str, CORD skip, Where_t where); find_result_t Text$find(CORD str, CORD pat); CORD Text$replace(CORD text, CORD pat, CORD replacement, int64_t limit); array_t Text$split(CORD str, CORD split); diff --git a/builtins/where.c b/builtins/where.c new file mode 100644 index 00000000..5447d8c9 --- /dev/null +++ b/builtins/where.c @@ -0,0 +1,54 @@ +// A type called "Where" that is an enum for "Anywhere", "Start", or "End" +// Mainly used for text methods + +#include <gc/cord.h> +#include <stdbool.h> +#include <stdint.h> + +#include "types.h" +#include "where.h" +#include "util.h" + +static CORD Where$Anywhere$as_text(Where$Anywhere_t *obj, bool use_color) +{ + if (!obj) return "Anywhere"; + return CORD_all(use_color ? "\x1b[0;1mAnywhere\x1b[m(" : "Anywhere(", ")"); +} + +static CORD Where$Start$as_text(Where$Start_t *obj, bool use_color) +{ + if (!obj) return "Start"; + return CORD_all(use_color ? "\x1b[0;1mStart\x1b[m(" : "Start(", ")"); +} + +static CORD Where$End$as_text(Where$End_t *obj, bool use_color) +{ + if (!obj) return "End"; + return CORD_all(use_color ? "\x1b[0;1mEnd\x1b[m(" : "End(", ")"); +} + +static CORD Where$as_text(Where_t *obj, bool use_color) +{ + if (!obj) + return "Where"; + switch (obj->$tag) { + case $tag$Where$Anywhere: + return use_color ? "\x1b[36;1mWhere.Anywhere\x1b[m" : "Where.Anywhere"; + case $tag$Where$Start: + return use_color ? "\x1b[36;1mWhere.Start\x1b[m" : "Where.Start"; + case $tag$Where$End: + return use_color ? "\x1b[36;1mWhere.End\x1b[m" : "Where.End"; + default: + return CORD_EMPTY; + } +} + +public const Where_t Where$tagged$Anywhere = {$tag$Where$Anywhere}; +public const Where_t Where$tagged$Start = {$tag$Where$Start}; +public const Where_t Where$tagged$End = {$tag$Where$End}; +public const TypeInfo Where$Anywhere = {0, 0, {.tag=CustomInfo, .CustomInfo={.as_text=(void*)Where$Anywhere$as_text}}}; +public const TypeInfo Where$Start = {0, 0, {.tag=CustomInfo, .CustomInfo={.as_text=(void*)Where$Start$as_text}}}; +public const TypeInfo Where$End = {0, 0, {.tag=CustomInfo, .CustomInfo={.as_text=(void*)Where$End$as_text}}}; +public const TypeInfo Where = {4, 4, {.tag=CustomInfo, .CustomInfo={.as_text=(void*)Where$as_text}}}; + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/where.h b/builtins/where.h new file mode 100644 index 00000000..75de22a3 --- /dev/null +++ b/builtins/where.h @@ -0,0 +1,37 @@ +#pragma once + +// Type info and methods for Where datatype (Anywhere, Start, or End enum) +// Mainly used for text methods. + +#include <gc/cord.h> +#include <stdbool.h> +#include <stdint.h> + +#include "types.h" + +typedef struct Where_s Where_t; +extern const TypeInfo Where; +typedef struct Where$Anywhere_s Where$Anywhere_t; +extern const TypeInfo Where$Anywhere; +typedef struct Where$Start_s Where$Start_t; +extern const TypeInfo Where$Start; +typedef struct Where$End_s Where$End_t; +extern const TypeInfo Where$End; + +struct Where$Anywhere_s {}; +struct Where$Start_s {}; +struct Where$End_s {}; +struct Where_s { + enum { $tag$Where$Anywhere = 0, $tag$Where$Start = 1, $tag$Where$End = 2 } $tag; + union { + Where$Anywhere_t Anywhere; + Where$Start_t Start; + Where$End_t End; + }; +}; + +extern const Where_t Where$tagged$Anywhere; +extern const Where_t Where$tagged$Start; +extern const Where_t Where$tagged$End; + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/environment.c b/environment.c index a3ea49fd..03c592ea 100644 --- a/environment.c +++ b/environment.c @@ -42,6 +42,21 @@ env_t *new_compilation_unit(void) const char *name, *code, *type_str; } ns_entry_t; + type_t *where; + { + env_t *where_env = namespace_env(env, "Where"); + type_t *anywhere = Type(StructType, .name="Anywhere"); + type_t *start = Type(StructType, .name="Start"); + type_t *end = Type(StructType, .name="End"); + where = Type(EnumType, .name="Where", .env=where_env, + .tags=new(tag_t, .name="Anywhere", .tag_value=0, .type=anywhere, + .next=new(tag_t, .name="Start", .tag_value=0, .type=start, + .next=new(tag_t, .name="End", .tag_value=0, .type=end)))); + set_binding(where_env, "Anywhere", new(binding_t, .type=where, .code="Where$tagged$Anywhere")); + set_binding(where_env, "Start", new(binding_t, .type=where, .code="Where$tagged$Start")); + set_binding(where_env, "End", new(binding_t, .type=where, .code="Where$tagged$End")); + } + struct { const char *name; type_t *type; @@ -157,6 +172,7 @@ env_t *new_compilation_unit(void) #undef F2 #undef F #undef C + {"Where", where, "Where_t", "Where", {}}, {"Text", TEXT_TYPE, "Text_t", "$Text", TypedArray(ns_entry_t, {"slice", "Text$slice", "func(text:Text, index:Int, length=Int.max)->Text"}, {"quoted", "Text$quoted", "func(text:Text, color=no)->Text"}, @@ -165,9 +181,9 @@ env_t *new_compilation_unit(void) {"title", "Text$title", "func(text:Text)->Text"}, {"as_c_string", "CORD_to_char_star", "func(text:Text)->CString"}, {"from_c_string", "CORD_from_char_star", "func(str:CString)->Text"}, - // {"has", "Text$has", "func(text:Text, target:Text, where=ANYWHERE)->Bool"}, - // {"without", "Text$without", "func(text:Text, target:Text, where=ANYWHERE)->Text"}, - // {"trimmed", "Text$without", "func(text:Text, skip:Text, where=ANYWHERE)->Text"}, + {"has", "Text$has", "func(text:Text, target:Text, where=Where.Anywhere)->Bool"}, + {"without", "Text$without", "func(text:Text, target:Text, where=Where.Anywhere)->Text"}, + {"trimmed", "Text$trimmed", "func(text:Text, trim=\" {\\n\\r\\t}\", where=Where.Anywhere)->Text"}, {"title", "Text$title", "func(text:Text)->Text"}, // {"find", "Text$find", "func(text:Text, pattern:Text)->FindResult"}, {"replace", "Text$replace", "func(text:Text, pattern:Text, replacement:Text, limit=Int.max)->Text"}, @@ -184,7 +200,20 @@ env_t *new_compilation_unit(void) }; for (size_t i = 0; i < sizeof(global_types)/sizeof(global_types[0]); i++) { - env_t *ns_env = global_types[i].type == TEXT_TYPE ? Match(TEXT_TYPE, TextType)->env : namespace_env(env, global_types[i].name); + env_t *ns_env = NULL; + switch (global_types[i].type->tag) { + case TextType: + ns_env = Match(global_types[i].type, TextType)->env; + break; + case StructType: + ns_env = Match(global_types[i].type, StructType)->env; + break; + case EnumType: + ns_env = Match(global_types[i].type, EnumType)->env; + break; + default: break; + } + if (ns_env == NULL) ns_env = namespace_env(env, global_types[i].name); binding_t *binding = new(binding_t, .type=Type(TypeInfoType, .name=global_types[i].name, .type=global_types[i].type, .env=ns_env)); Table$str_set(env->globals, global_types[i].name, binding); Table$str_set(env->types, global_types[i].name, global_types[i].type); diff --git a/test/text.tm b/test/text.tm index 9c451b89..9df48467 100644 --- a/test/text.tm +++ b/test/text.tm @@ -55,3 +55,40 @@ func main(): >> "Hello":replace("e", "X") = "HXllo" + + >> "Hello":has("l") + = yes + >> "Hello":has("l", End) + = no + >> "Hello":has("l", Start) + = no + + >> "Hello":has("o") + = yes + >> "Hello":has("o", where=End) + = yes + >> "Hello":has("o", where=Start) + = no + + >> "Hello":has("H") + = yes + >> "Hello":has("H", End) + = no + >> "Hello":has("H", Start) + = yes + + >> "Hello":without("l") + = "Heo" + >> "xxxx":without("x") + = "" + >> "xxxx":without("y") + = "xxxx" + >> "One two three four five six":without("e ") + = "Ontwo threfour fivsix" + + >> " one ":trimmed() + = "one" + >> " one ":trimmed(" aeiou") + = "n" + + >> amelie:has(amelie2) |
