From 8fab88c56f95c03ffcb4be178f5dbb21b239d95e Mon Sep 17 00:00:00 2001 From: Bruce Hill Date: Sun, 3 Mar 2024 18:15:45 -0500 Subject: Rename Str -> Text --- builtins/string.c | 263 ------------------------------------------------------ builtins/string.h | 36 -------- builtins/table.c | 4 +- builtins/text.c | 263 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ builtins/text.h | 35 ++++++++ 5 files changed, 300 insertions(+), 301 deletions(-) delete mode 100644 builtins/string.c delete mode 100644 builtins/string.h create mode 100644 builtins/text.c create mode 100644 builtins/text.h (limited to 'builtins') diff --git a/builtins/string.c b/builtins/string.c deleted file mode 100644 index c28aa510..00000000 --- a/builtins/string.c +++ /dev/null @@ -1,263 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../SipHash/halfsiphash.h" -#include "types.h" -#include "array.h" -#include "string.h" - -#define CLAMP(x, lo, hi) MIN(hi, MAX(x,lo)) - -public CORD Str__as_str(const void *str, bool colorize, const TypeInfo *info) -{ - (void)info; - if (!str) return "Str"; - return Str__quoted(*(CORD*)str, colorize); -} - -public CORD Str__quoted(CORD str, bool colorize) -{ - // Note: it's important to have unicode strings not get broken up with - // escapes, otherwise they won't print right. 
- if (colorize) { - CORD quoted = "\x1b[35m\""; - CORD_pos i; - CORD_FOR(i, str) { - char c = CORD_pos_fetch(i); - switch (c) { -#define BACKSLASHED(esc) "\x1b[34m\\\x1b[1m" esc "\x1b[0;35m" - case '\a': quoted = CORD_cat(quoted, BACKSLASHED("a")); break; - case '\b': quoted = CORD_cat(quoted, BACKSLASHED("b")); break; - case '\x1b': quoted = CORD_cat(quoted, BACKSLASHED("e")); break; - case '\f': quoted = CORD_cat(quoted, BACKSLASHED("f")); break; - case '\n': quoted = CORD_cat(quoted, BACKSLASHED("n")); break; - case '\r': quoted = CORD_cat(quoted, BACKSLASHED("r")); break; - case '\t': quoted = CORD_cat(quoted, BACKSLASHED("t")); break; - case '\v': quoted = CORD_cat(quoted, BACKSLASHED("v")); break; - case '"': quoted = CORD_cat(quoted, BACKSLASHED("\"")); break; - case '\\': quoted = CORD_cat(quoted, BACKSLASHED("\\")); break; - case '\x00' ... '\x06': case '\x0E' ... '\x1A': - case '\x1C' ... '\x1F': case '\x7F' ... '\x7F': - CORD_sprintf(&quoted, "%r" BACKSLASHED("x%02X"), quoted, c); - break; - default: quoted = CORD_cat_char(quoted, c); break; -#undef BACKSLASHED - } - } - quoted = CORD_cat(quoted, "\"\x1b[m"); - return quoted; - } else { - CORD quoted = "\""; - CORD_pos i; - CORD_FOR(i, str) { - char c = CORD_pos_fetch(i); - switch (c) { - case '\a': quoted = CORD_cat(quoted, "\\a"); break; - case '\b': quoted = CORD_cat(quoted, "\\b"); break; - case '\x1b': quoted = CORD_cat(quoted, "\\e"); break; - case '\f': quoted = CORD_cat(quoted, "\\f"); break; - case '\n': quoted = CORD_cat(quoted, "\\n"); break; - case '\r': quoted = CORD_cat(quoted, "\\r"); break; - case '\t': quoted = CORD_cat(quoted, "\\t"); break; - case '\v': quoted = CORD_cat(quoted, "\\v"); break; - case '"': quoted = CORD_cat(quoted, "\\\""); break; - case '\\': quoted = CORD_cat(quoted, "\\\\"); break; - case '\x00' ... '\x06': case '\x0E' ... '\x1A': - case '\x1C' ... '\x1F': case '\x7F' ... 
'\x7F': - CORD_sprintf(&quoted, "%r\\x%02X", quoted, c); - break; - default: quoted = CORD_cat_char(quoted, c); break; - } - } - quoted = CORD_cat_char(quoted, '"'); - return quoted; - } -} - -public int Str__compare(CORD *x, CORD *y) -{ - return CORD_cmp(*x, *y); -} - -public bool Str__equal(CORD *x, CORD *y) -{ - return CORD_cmp(*x, *y) == 0; -} - -public uint32_t Str__hash(CORD *cord) -{ - if (!*cord) return 0; - - const char *str = CORD_to_const_char_star(*cord); - *cord = str; - - uint32_t hash; - halfsiphash(str, strlen(str)+1, SSS_HASH_VECTOR, (uint8_t*)&hash, sizeof(hash)); - return hash; -} - -public CORD Str__upper(CORD str) -{ - if (!str) return str; - size_t len = strlen(str) + 1; - uint8_t *dest = GC_MALLOC_ATOMIC(len); - return (CORD)u8_toupper((const uint8_t*)str, len-1, uc_locale_language(), NULL, dest, &len); -} - -public CORD Str__lower(CORD str) -{ - if (!str) return str; - size_t len = strlen(str) + 1; - uint8_t *dest = GC_MALLOC_ATOMIC(len); - return (CORD)u8_tolower((const uint8_t*)str, len-1, uc_locale_language(), NULL, dest, &len); -} - -public CORD Str__title(CORD str) -{ - if (!str) return str; - size_t len = strlen(str) + 1; - uint8_t *dest = GC_MALLOC_ATOMIC(len); - return (CORD)u8_totitle((const uint8_t*)str, len-1, uc_locale_language(), NULL, dest, &len); -} - -public bool Str__has(CORD str, CORD target, where_e where) -{ - if (!target) return true; - if (!str) return false; - - if (where == WHERE_START) { - return (CORD_ncmp(str, 0, target, 0, CORD_len(target)) == 0); - } else if (where == WHERE_END) { - size_t str_len = CORD_len(str); - size_t target_len = CORD_len(target); - return (str_len >= target_len && CORD_ncmp(str, str_len-target_len, target, 0, target_len) == 0); - } else { - size_t pos = CORD_str(str, 0, target); - return (pos != CORD_NOT_FOUND); - } -} - -public CORD Str__without(CORD str, CORD target, where_e where) -{ - if (!str || !target) return str; - - size_t target_len = CORD_len(target); - if (where == WHERE_START) { - 
if (CORD_ncmp(str, 0, target, 0, target_len) == 0) - return CORD_substr(str, target_len, SIZE_MAX); - return str; - } else if (where == WHERE_END) { - size_t str_len = CORD_len(str); - if (CORD_ncmp(str, str_len-target_len, target, 0, target_len) == 0) - return CORD_substr(str, 0, str_len - target_len); - return str; - } else { - errx(1, "Not implemented"); - } -} - -public CORD Str__trimmed(CORD str, CORD skip, where_e where) -{ - if (!str || !skip) return str; - const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(str); - const uint8_t *uskip = (const uint8_t*)CORD_to_const_char_star(skip); - if (where == WHERE_START) { - size_t span = u8_strspn(ustr, uskip); - return (CORD)ustr + span; - } else if (where == WHERE_END) { - size_t len = u8_strlen(ustr); - const uint8_t *back = ustr + len; - size_t back_span = 0; - while (back - back_span > ustr && u8_strspn(back-back_span-1, uskip) > back_span) - ++back_span; - return CORD_substr((CORD)ustr, 0, len - back_span); - } else { - size_t span = u8_strspn(ustr, uskip); - size_t len = u8_strlen(ustr); - const uint8_t *back = ustr + len; - size_t back_span = 0; - while (back - back_span > ustr + span && u8_strspn(back-back_span-1, uskip) > back_span) - ++back_span; - return CORD_substr((CORD)(ustr + span), 0, len - span - back_span); - } -} - -public find_result_t Str__find(CORD str, CORD pat) -{ - if (!pat) return (find_result_t){.status=FIND_SUCCESS, .index=1}; - size_t pos = CORD_str(str, 0, pat); - return (pos == CORD_NOT_FOUND) ? 
(find_result_t){.status=FIND_FAILURE} : (find_result_t){.status=FIND_SUCCESS, .index=(int32_t)pos}; -} - -public CORD Str__replace(CORD text, CORD pat, CORD replacement, int64_t limit) -{ - if (!text || !pat) return text; - CORD ret = NULL; - size_t pos = 0, pat_len = CORD_len(pat); - for (size_t found; limit > 0 && (found=CORD_str(text, pos, pat)) != CORD_NOT_FOUND; --limit) { - ret = CORD_cat(ret, CORD_substr(text, pos, found)); - ret = CORD_cat(ret, replacement); - pos = found + pat_len; - } - return CORD_cat(ret, CORD_substr(text, pos, SIZE_MAX)); -} - -public array_t Str__split(CORD str, CORD split) -{ - if (!str) return (array_t){.data=GC_MALLOC(sizeof(CORD)), .atomic=1, .length=1, .stride=sizeof(CORD)}; - array_t strings = {.stride=sizeof(CORD), .atomic=1}; - int64_t capacity = 0; - - const uint8_t *ustr = (uint8_t*)CORD_to_const_char_star(str); - const uint8_t *usplit = (uint8_t*)CORD_to_const_char_star(split); - for (int64_t i = 0; ; ) { - size_t non_split = u8_strcspn(ustr + i, usplit); - CORD chunk = CORD_substr((CORD)ustr, i, non_split); - if (capacity <= 0) - strings.data = GC_REALLOC(strings.data, sizeof(CORD)*(capacity += 10)); - ((CORD*)strings.data)[strings.length++] = chunk; - - i += non_split; - - size_t split = u8_strspn(ustr + i, usplit); - if (split == 0) break; - i += split; - } - return strings; -} - -public CORD Str__join(CORD glue, array_t pieces) -{ - if (pieces.length == 0) return CORD_EMPTY; - - CORD ret = CORD_EMPTY; - for (int64_t i = 0; i < pieces.length; i++) { - if (i > 0) ret = CORD_cat(ret, glue); - ret = CORD_cat(ret, *(CORD*)((void*)pieces.data + i*pieces.stride)); - } - return ret; -} - -public const TypeInfo Str = { - .size=sizeof(CORD), - .align=__alignof__(CORD), - .tag=CustomInfo, - .CustomInfo={ - .as_str=(void*)Str__as_str, - .compare=(void*)Str__compare, - .equal=(void*)Str__equal, - .hash=(void*)Str__hash, - }, -}; - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/string.h b/builtins/string.h 
deleted file mode 100644 index c24de72d..00000000 --- a/builtins/string.h +++ /dev/null @@ -1,36 +0,0 @@ -#pragma once -#include -#include -#include - -#include "types.h" - -#define String_t CORD -#define Str_t CORD - -typedef enum { WHERE_ANYWHERE, WHERE_START, WHERE_END } where_e; - -typedef struct { - enum { FIND_FAILURE, FIND_SUCCESS } status; - int32_t index; -} find_result_t; - -CORD Str__as_str(const void *str, bool colorize, const TypeInfo *info); -CORD Str__quoted(CORD str, bool colorize); -int Str__compare(CORD *x, CORD *y); -bool Str__equal(CORD *x, CORD *y); -uint32_t Str__hash(CORD *cord); -CORD Str__upper(CORD str); -CORD Str__lower(CORD str); -CORD Str__title(CORD str); -bool Str__has(CORD str, CORD target, where_e where); -CORD Str__without(CORD str, CORD target, where_e where); -CORD Str__trimmed(CORD str, CORD skip, where_e where); -find_result_t Str__find(CORD str, CORD pat); -CORD Str__replace(CORD text, CORD pat, CORD replacement, int64_t limit); -array_t Str__split(CORD str, CORD split); -CORD Str__join(CORD glue, array_t pieces); - -extern const TypeInfo Str; - -// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/table.c b/builtins/table.c index fcb64e40..523aae70 100644 --- a/builtins/table.c +++ b/builtins/table.c @@ -21,7 +21,7 @@ #include "../util.h" #include "array.h" #include "datatypes.h" -#include "string.h" +#include "text.h" #include "table.h" #include "types.h" @@ -55,7 +55,7 @@ TypeInfo StrToVoidStarTable = { .size=sizeof(table_t), .align=__alignof__(table_t), .tag=TableInfo, - .TableInfo={.key=&Str, .value=&MemoryPointer}, + .TableInfo={.key=&Text, .value=&MemoryPointer}, }; static inline size_t entry_size(const TypeInfo *info) diff --git a/builtins/text.c b/builtins/text.c new file mode 100644 index 00000000..d85c03f0 --- /dev/null +++ b/builtins/text.c @@ -0,0 +1,263 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include 
"../SipHash/halfsiphash.h" +#include "types.h" +#include "array.h" +#include "text.h" + +#define CLAMP(x, lo, hi) MIN(hi, MAX(x,lo)) + +public CORD Text__as_str(const void *str, bool colorize, const TypeInfo *info) +{ + (void)info; + if (!str) return "Text"; + return Text__quoted(*(CORD*)str, colorize); +} + +public CORD Text__quoted(CORD str, bool colorize) +{ + // Note: it's important to have unicode strings not get broken up with + // escapes, otherwise they won't print right. + if (colorize) { + CORD quoted = "\x1b[35m\""; + CORD_pos i; + CORD_FOR(i, str) { + char c = CORD_pos_fetch(i); + switch (c) { +#define BACKSLASHED(esc) "\x1b[34m\\\x1b[1m" esc "\x1b[0;35m" + case '\a': quoted = CORD_cat(quoted, BACKSLASHED("a")); break; + case '\b': quoted = CORD_cat(quoted, BACKSLASHED("b")); break; + case '\x1b': quoted = CORD_cat(quoted, BACKSLASHED("e")); break; + case '\f': quoted = CORD_cat(quoted, BACKSLASHED("f")); break; + case '\n': quoted = CORD_cat(quoted, BACKSLASHED("n")); break; + case '\r': quoted = CORD_cat(quoted, BACKSLASHED("r")); break; + case '\t': quoted = CORD_cat(quoted, BACKSLASHED("t")); break; + case '\v': quoted = CORD_cat(quoted, BACKSLASHED("v")); break; + case '"': quoted = CORD_cat(quoted, BACKSLASHED("\"")); break; + case '\\': quoted = CORD_cat(quoted, BACKSLASHED("\\")); break; + case '\x00' ... '\x06': case '\x0E' ... '\x1A': + case '\x1C' ... '\x1F': case '\x7F' ... 
'\x7F': + CORD_sprintf(&quoted, "%r" BACKSLASHED("x%02X"), quoted, c); + break; + default: quoted = CORD_cat_char(quoted, c); break; +#undef BACKSLASHED + } + } + quoted = CORD_cat(quoted, "\"\x1b[m"); + return quoted; + } else { + CORD quoted = "\""; + CORD_pos i; + CORD_FOR(i, str) { + char c = CORD_pos_fetch(i); + switch (c) { + case '\a': quoted = CORD_cat(quoted, "\\a"); break; + case '\b': quoted = CORD_cat(quoted, "\\b"); break; + case '\x1b': quoted = CORD_cat(quoted, "\\e"); break; + case '\f': quoted = CORD_cat(quoted, "\\f"); break; + case '\n': quoted = CORD_cat(quoted, "\\n"); break; + case '\r': quoted = CORD_cat(quoted, "\\r"); break; + case '\t': quoted = CORD_cat(quoted, "\\t"); break; + case '\v': quoted = CORD_cat(quoted, "\\v"); break; + case '"': quoted = CORD_cat(quoted, "\\\""); break; + case '\\': quoted = CORD_cat(quoted, "\\\\"); break; + case '\x00' ... '\x06': case '\x0E' ... '\x1A': + case '\x1C' ... '\x1F': case '\x7F' ... '\x7F': + CORD_sprintf(&quoted, "%r\\x%02X", quoted, c); + break; + default: quoted = CORD_cat_char(quoted, c); break; + } + } + quoted = CORD_cat_char(quoted, '"'); + return quoted; + } +} + +public int Text__compare(CORD *x, CORD *y) +{ + return CORD_cmp(*x, *y); +} + +public bool Text__equal(CORD *x, CORD *y) +{ + return CORD_cmp(*x, *y) == 0; +} + +public uint32_t Text__hash(CORD *cord) +{ + if (!*cord) return 0; + + const char *str = CORD_to_const_char_star(*cord); + *cord = str; + + uint32_t hash; + halfsiphash(str, strlen(str)+1, SSS_HASH_VECTOR, (uint8_t*)&hash, sizeof(hash)); + return hash; +} + +public CORD Text__upper(CORD str) +{ + if (!str) return str; + size_t len = strlen(str) + 1; + uint8_t *dest = GC_MALLOC_ATOMIC(len); + return (CORD)u8_toupper((const uint8_t*)str, len-1, uc_locale_language(), NULL, dest, &len); +} + +public CORD Text__lower(CORD str) +{ + if (!str) return str; + size_t len = strlen(str) + 1; + uint8_t *dest = GC_MALLOC_ATOMIC(len); + return (CORD)u8_tolower((const uint8_t*)str, len-1, 
uc_locale_language(), NULL, dest, &len); +} + +public CORD Text__title(CORD str) +{ + if (!str) return str; + size_t len = strlen(str) + 1; + uint8_t *dest = GC_MALLOC_ATOMIC(len); + return (CORD)u8_totitle((const uint8_t*)str, len-1, uc_locale_language(), NULL, dest, &len); +} + +public bool Text__has(CORD str, CORD target, where_e where) +{ + if (!target) return true; + if (!str) return false; + + if (where == WHERE_START) { + return (CORD_ncmp(str, 0, target, 0, CORD_len(target)) == 0); + } else if (where == WHERE_END) { + size_t str_len = CORD_len(str); + size_t target_len = CORD_len(target); + return (str_len >= target_len && CORD_ncmp(str, str_len-target_len, target, 0, target_len) == 0); + } else { + size_t pos = CORD_str(str, 0, target); + return (pos != CORD_NOT_FOUND); + } +} + +public CORD Text__without(CORD str, CORD target, where_e where) +{ + if (!str || !target) return str; + + size_t target_len = CORD_len(target); + if (where == WHERE_START) { + if (CORD_ncmp(str, 0, target, 0, target_len) == 0) + return CORD_substr(str, target_len, SIZE_MAX); + return str; + } else if (where == WHERE_END) { + size_t str_len = CORD_len(str); + if (CORD_ncmp(str, str_len-target_len, target, 0, target_len) == 0) + return CORD_substr(str, 0, str_len - target_len); + return str; + } else { + errx(1, "Not implemented"); + } +} + +public CORD Text__trimmed(CORD str, CORD skip, where_e where) +{ + if (!str || !skip) return str; + const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(str); + const uint8_t *uskip = (const uint8_t*)CORD_to_const_char_star(skip); + if (where == WHERE_START) { + size_t span = u8_strspn(ustr, uskip); + return (CORD)ustr + span; + } else if (where == WHERE_END) { + size_t len = u8_strlen(ustr); + const uint8_t *back = ustr + len; + size_t back_span = 0; + while (back - back_span > ustr && u8_strspn(back-back_span-1, uskip) > back_span) + ++back_span; + return CORD_substr((CORD)ustr, 0, len - back_span); + } else { + size_t span = 
u8_strspn(ustr, uskip); + size_t len = u8_strlen(ustr); + const uint8_t *back = ustr + len; + size_t back_span = 0; + while (back - back_span > ustr + span && u8_strspn(back-back_span-1, uskip) > back_span) + ++back_span; + return CORD_substr((CORD)(ustr + span), 0, len - span - back_span); + } +} + +public find_result_t Text__find(CORD str, CORD pat) +{ + if (!pat) return (find_result_t){.status=FIND_SUCCESS, .index=1}; + size_t pos = CORD_str(str, 0, pat); + return (pos == CORD_NOT_FOUND) ? (find_result_t){.status=FIND_FAILURE} : (find_result_t){.status=FIND_SUCCESS, .index=(int32_t)pos}; +} + +public CORD Text__replace(CORD text, CORD pat, CORD replacement, int64_t limit) +{ + if (!text || !pat) return text; + CORD ret = NULL; + size_t pos = 0, pat_len = CORD_len(pat); + for (size_t found; limit > 0 && (found=CORD_str(text, pos, pat)) != CORD_NOT_FOUND; --limit) { + ret = CORD_cat(ret, CORD_substr(text, pos, found)); + ret = CORD_cat(ret, replacement); + pos = found + pat_len; + } + return CORD_cat(ret, CORD_substr(text, pos, SIZE_MAX)); +} + +public array_t Text__split(CORD str, CORD split) +{ + if (!str) return (array_t){.data=GC_MALLOC(sizeof(CORD)), .atomic=1, .length=1, .stride=sizeof(CORD)}; + array_t strings = {.stride=sizeof(CORD), .atomic=1}; + int64_t capacity = 0; + + const uint8_t *ustr = (uint8_t*)CORD_to_const_char_star(str); + const uint8_t *usplit = (uint8_t*)CORD_to_const_char_star(split); + for (int64_t i = 0; ; ) { + size_t non_split = u8_strcspn(ustr + i, usplit); + CORD chunk = CORD_substr((CORD)ustr, i, non_split); + if (capacity <= 0) + strings.data = GC_REALLOC(strings.data, sizeof(CORD)*(capacity += 10)); + ((CORD*)strings.data)[strings.length++] = chunk; + + i += non_split; + + size_t split = u8_strspn(ustr + i, usplit); + if (split == 0) break; + i += split; + } + return strings; +} + +public CORD Text__join(CORD glue, array_t pieces) +{ + if (pieces.length == 0) return CORD_EMPTY; + + CORD ret = CORD_EMPTY; + for (int64_t i = 0; i < 
pieces.length; i++) { + if (i > 0) ret = CORD_cat(ret, glue); + ret = CORD_cat(ret, *(CORD*)((void*)pieces.data + i*pieces.stride)); + } + return ret; +} + +public const TypeInfo Text = { + .size=sizeof(CORD), + .align=__alignof__(CORD), + .tag=CustomInfo, + .CustomInfo={ + .as_str=(void*)Text__as_str, + .compare=(void*)Text__compare, + .equal=(void*)Text__equal, + .hash=(void*)Text__hash, + }, +}; + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/text.h b/builtins/text.h new file mode 100644 index 00000000..a8782d24 --- /dev/null +++ b/builtins/text.h @@ -0,0 +1,35 @@ +#pragma once +#include +#include +#include + +#include "types.h" + +#define Text_t CORD + +typedef enum { WHERE_ANYWHERE, WHERE_START, WHERE_END } where_e; + +typedef struct { + enum { FIND_FAILURE, FIND_SUCCESS } status; + int32_t index; +} find_result_t; + +CORD Text__as_str(const void *str, bool colorize, const TypeInfo *info); +CORD Text__quoted(CORD str, bool colorize); +int Text__compare(CORD *x, CORD *y); +bool Text__equal(CORD *x, CORD *y); +uint32_t Text__hash(CORD *cord); +CORD Text__upper(CORD str); +CORD Text__lower(CORD str); +CORD Text__title(CORD str); +bool Text__has(CORD str, CORD target, where_e where); +CORD Text__without(CORD str, CORD target, where_e where); +CORD Text__trimmed(CORD str, CORD skip, where_e where); +find_result_t Text__find(CORD str, CORD pat); +CORD Text__replace(CORD text, CORD pat, CORD replacement, int64_t limit); +array_t Text__split(CORD str, CORD split); +CORD Text__join(CORD glue, array_t pieces); + +extern const TypeInfo Text; + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 -- cgit v1.2.3