diff options
Diffstat (limited to 'builtins/string.c')
| -rw-r--r-- | builtins/string.c | 597 |
1 files changed, 597 insertions, 0 deletions
diff --git a/builtins/string.c b/builtins/string.c new file mode 100644 index 00000000..92741903 --- /dev/null +++ b/builtins/string.c @@ -0,0 +1,597 @@ +#include <assert.h> +#include <gc.h> +#include <gc/cord.h> +#include <stdalign.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdlib.h> +#include <ctype.h> +#include <sys/param.h> +#include <err.h> + +#include "../SipHash/halfsiphash.h" +#include "types.h" +#include "array.h" +#include "string.h" + +#define CLAMP(x, lo, hi) MIN(hi, MAX(x,lo)) + +extern const void *SSS_HASH_VECTOR; + +typedef struct { + uint8_t success; + int32_t index; +} find_result_t; + +static Str_t compacted(const Str_t str) +{ + if (str.stride == 1) return str; + char *buf = GC_MALLOC_ATOMIC(str.length + 1); + for (int64_t i = 0; i < str.length; i++) + buf[i] = str.data[i*str.stride]; + return (Str_t){.data=buf, .length=str.length, .stride=1}; +} + +public CORD Str__cord(const Str_t *s, bool colorize, const TypeInfo *type) +{ + (void)type; + char *data = s->data; + // Note: it's important to have unicode strings not get broken up with + // escapes, otherwise they won't print right. + if (colorize) { + CORD cord = "\x1b[35m\""; + for (int32_t i = 0; i < s->length; i++) { + char c = data[i*s->stride]; + switch (c) { +#define BACKSLASHED(esc) "\x1b[34m\\\x1b[1m" esc "\x1b[0;35m" + case '\a': cord = CORD_cat(cord, BACKSLASHED("a")); break; + case '\b': cord = CORD_cat(cord, BACKSLASHED("b")); break; + case '\x1b': cord = CORD_cat(cord, BACKSLASHED("e")); break; + case '\f': cord = CORD_cat(cord, BACKSLASHED("f")); break; + case '\n': cord = CORD_cat(cord, BACKSLASHED("n")); break; + case '\r': cord = CORD_cat(cord, BACKSLASHED("r")); break; + case '\t': cord = CORD_cat(cord, BACKSLASHED("t")); break; + case '\v': cord = CORD_cat(cord, BACKSLASHED("v")); break; + case '"': cord = CORD_cat(cord, BACKSLASHED("\"")); break; + case '\\': cord = CORD_cat(cord, BACKSLASHED("\\")); break; + case '\x00' ... '\x06': case '\x0E' ... '\x1A': + case '\x1C' ... '\x1F': case '\x7F' ... '\x7F': + CORD_sprintf(&cord, "%r" BACKSLASHED("x%02X"), cord, c); + break; + default: cord = CORD_cat_char(cord, c); break; +#undef BACKSLASHED + } + } + cord = CORD_cat(cord, "\"\x1b[m"); + if (strcmp(type->name, "Str") != 0) + CORD_sprintf(&cord, "\x1b[0;1m%s::\x1b[m%r", type->name, cord); + return cord; + } else { + CORD cord = "\""; + for (int32_t i = 0; i < s->length; i++) { + char c = data[i*s->stride]; + switch (c) { + case '\a': cord = CORD_cat(cord, "\\a"); break; + case '\b': cord = CORD_cat(cord, "\\b"); break; + case '\x1b': cord = CORD_cat(cord, "\\e"); break; + case '\f': cord = CORD_cat(cord, "\\f"); break; + case '\n': cord = CORD_cat(cord, "\\n"); break; + case '\r': cord = CORD_cat(cord, "\\r"); break; + case '\t': cord = CORD_cat(cord, "\\t"); break; + case '\v': cord = CORD_cat(cord, "\\v"); break; + case '"': cord = CORD_cat(cord, "\\\""); break; + case '\\': cord = CORD_cat(cord, "\\\\"); break; + case '\x00' ... '\x06': case '\x0E' ... '\x1A': + case '\x1C' ... '\x1F': case '\x7F' ... '\x7F': + CORD_sprintf(&cord, "%r\\x%02X", cord, c); + break; + default: cord = CORD_cat_char(cord, c); break; + } + } + cord = CORD_cat_char(cord, '"'); + if (strcmp(type->name, "Str") != 0) + CORD_sprintf(&cord, "%s::%r", type->name, cord); + return cord; + } +} + +public int32_t Str__compare(const Str_t *x, const Str_t *y) +{ + int64_t length = x->length < y->length ? x->length : y->length; + for (int64_t i = 0; i < length; i++) { + char xc = x->data[i*x->stride], yc = y->data[i*y->stride]; + if (xc != yc) + return (xc > yc) ? 1 : -1; + } + return (x->length > y->length) - (x->length < y->length); +} + +public bool Str__equal(const Str_t *x, const Str_t *y) +{ + return (Str__compare(x, y) == 0); +} + +public int Str__hash(const Str_t *s, const TypeInfo *type) +{ + (void)type; + if (s->length == 0 || !s->data) return 0; + + const char *data; + if (s->stride == 1) { + data = s->data; + } else { + char *buf = alloca(s->length); + for (int64_t i = 0; i < s->length; i++) + buf[i] = s->data[i*s->stride]; + data = buf; + } + + uint32_t hash; + halfsiphash(data, s->length, SSS_HASH_VECTOR, (uint8_t*)&hash, sizeof(hash)); + return hash; +} + +public Str_t Str__uppercased(const Str_t s) +{ + char *s2 = GC_MALLOC_ATOMIC(s.length + 1); + for (int64_t i = 0; i < s.length; i++) + s2[i] = toupper(s.data[i*s.stride]); + return (Str_t){.data=s2, .length=s.length, .stride=1}; +} + +public Str_t Str__lowercased(const Str_t s) +{ + char *s2 = GC_MALLOC_ATOMIC(s.length + 1); + for (int64_t i = 0; i < s.length; i++) + s2[i] = tolower(s.data[i*s.stride]); + return (Str_t){.data=s2, .length=s.length, .stride=1}; +} + +public Str_t Str__capitalized(const Str_t s) +{ + char *s2 = GC_MALLOC_ATOMIC(s.length + 1); + int64_t i; + for (i = 0; i < s.length; i++) { + if (isalpha(s.data[i*s.stride])) { + s2[i] = toupper(s.data[i*s.stride]); + ++i; + break; + } else { + s2[i] = s.data[i*s.stride]; + } + } + for (; i < s.length; i++) + s2[i] = s.data[i*s.stride]; + return (Str_t){.data=s2, .length=s.length, .stride=1}; +} + +public Str_t Str__titlecased(const Str_t s) +{ + char *s2 = GC_MALLOC_ATOMIC(s.length + 1); + bool should_uppercase = true; + for (int64_t i = 0; i < s.length; i++) { + if (isalpha(s.data[i*s.stride])) { + if (should_uppercase) { + s2[i] = toupper(s.data[i*s.stride]); + should_uppercase = false; + } else { + s2[i] = tolower(s.data[i*s.stride]); + } + } else { + should_uppercase = true; + s2[i] = s.data[i*s.stride]; + } + } + return (Str_t){.data=s2, .length=s.length, .stride=1}; +} + +public bool Str__starts_with(const Str_t s, const Str_t prefix) +{ + if (s.length < prefix.length) return false; + for (int64_t i = 0; i < prefix.length; i++) { + if (s.data[i*s.stride] != prefix.data[i*prefix.stride]) + return false; + } + return true; +} + +public bool Str__ends_with(const Str_t s, const Str_t suffix) +{ + if (s.length < suffix.length) return false; + for (int64_t i = 0; i < suffix.length; i++) { + if (s.data[(s.length-suffix.length+i)*s.stride] != suffix.data[i*suffix.stride]) + return false; + } + return true; +} + +public Str_t Str__without_prefix(const Str_t s, const Str_t prefix) +{ + if (s.length < prefix.length) return s; + for (int64_t i = 0; i < prefix.length; i++) { + if (s.data[i*s.stride] != prefix.data[i*prefix.stride]) + return s; + } + return (Str_t){ + .data=s.data + prefix.length*s.stride, + .length=s.length - prefix.length, + .stride=s.stride, + .cow=0, .atomic=1, + }; +} + +public Str_t Str__without_suffix(const Str_t s, const Str_t suffix) +{ + if (s.length < suffix.length) return s; + for (int64_t i = 0; i < suffix.length; i++) { + if (s.data[(s.length - suffix.length + i)*s.stride] != suffix.data[i*suffix.stride]) + return s; + } + return (Str_t){ + .data=s.data, + .length=s.length - suffix.length, + .stride=s.stride, + .cow=0, .atomic=1, + }; +} + +public Str_t Str__trimmed(const Str_t s, const Str_t trim_chars, bool trim_left, bool trim_right) +{ + int64_t length = s.length; + int64_t start = 0; + if (trim_left) { + for (; start < s.length; start++) { + for (int64_t t = 0; t < trim_chars.length; t++) { + if (s.data[start*s.stride] == trim_chars.data[t*trim_chars.stride]) + goto found_ltrim; + } + goto done_trimming_left; + found_ltrim: + --length; + } + } + done_trimming_left:; + if (trim_right) { + while (length > 0) { + for (int64_t t = 0; t < trim_chars.length; t++) { + if (s.data[(start+length-1)*s.stride] == trim_chars.data[t*trim_chars.stride]) + goto found_rtrim; + } + goto done_trimming_right; + found_rtrim: + --length; + } + } + done_trimming_right:; + return (Str_t){.data=s.data+start*s.stride, .length=length, .stride=s.stride}; +} + +public const char *Str__c_string(const Str_t str) +{ + if (str.length == 0) + return ""; + else if (str.stride == 1 + // Verify that the '\0' would be on the same page, so unlikely to segfault: + && (((uint64_t)str.data + str.length + 1) & 0xFFF) != 0 + // Check for nul-termination + && str.data[str.length+1] == '\0') + return str.data; + + char *buf = GC_MALLOC_ATOMIC(str.length + 1); + for (int64_t i = 0; i < str.length; i++) + buf[i] = str.data[i*str.stride]; + buf[str.length] = '\0'; + return buf; +} + +public Str_t Str__from_c_string(const char *str) +{ + size_t length = str ? strlen(str) : 0; + if (length == 0) return (Str_t){.length=0, .stride=0}; + char *buf = GC_MALLOC_ATOMIC(length + 1); + memcpy(buf, str, length+1); + buf[length+1] = '\0'; + return (Str_t){.data=buf, .length=(int64_t)length, .stride=1}; +} + +public find_result_t Str__find(const Str_t str, const Str_t pat) +{ + if (str.length < pat.length) return (find_result_t){.success=0}; + if (pat.length == 0) return (find_result_t){.success=1, .index=1}; + + // For short strings, do naive approach: + // if (str.length*pat.length < UCHAR_MAX) { + for (int64_t s = 0; s < str.length; s++) { + for (int64_t p = 0; p < pat.length; p++) { + if (str.data[s*str.stride] != pat.data[p*pat.stride]) + goto not_a_match; + } + return (find_result_t){.success=1, .index=s+1}; + not_a_match:; + } + return (find_result_t){.success=0}; + // } + + // // Boyer-Moore algorithm: + // static int skip[UCHAR_MAX]; + // for (int32_t i = 0; i <= UCHAR_MAX; ++i) + // skip[i] = str.length; + // for (int32_t i = 0; i < pat.length; ++i) + // skip[(unsigned char)pat.data[i*pat.stride]] = pat.length - i - 1; + // char lastpatchar = pat.data[(pat.length - 1)*pat.stride]; + // int32_t min_skip = pat.length; + // for (int32_t p = 0; p < pat.length - 1; ++p) { + // if (pat.data[p*pat.stride] == lastpatchar) + // min_skip = pat.length - p - 1; + // } + + // for (int32_t i = pat.length - 1; i < str.length; ) { + // // Use skip table: + // int32_t can_skip = skip[(unsigned char)str.data[i*str.stride]]; + // if (can_skip != 0) { + // i += can_skip; + // continue; + // } + // // Check for exact match: + // for (int32_t j = 0; j < pat.length; j++) { + // if (str.data[(i-pat.length+j)*str.stride] != pat.data[j*pat.stride]) { + // // Mismatch: + // i += min_skip; + // goto keep_going; + // } + // } + // return i - pat.length + 1; + // keep_going: continue; + // } + // return 0; +} + +public Str_t Str__replace(Str_t text, Str_t pat, Str_t replacement, int64_t limit) +{ + text = compacted(text); + pat = compacted(pat); + replacement = compacted(replacement); + char *buf; + size_t size; + FILE *mem = open_memstream(&buf, &size); + for (const char *pos = text.data; ; --limit) { + const char *match = limit == 0 ? NULL : memmem(pos, (int64_t)text.length - (int64_t)(pos - text.data), pat.data, pat.length); + if (match) { + fwrite(pos, 1, (size_t)(match - pos), mem); + fwrite(replacement.data, 1, replacement.length, mem); + pos = match + pat.length; + } else { + fwrite(pos, 1, (size_t)text.length - (size_t)(pos - text.data), mem); + break; + } + } + fflush(mem); + char *str = GC_MALLOC_ATOMIC(size + 1); + memcpy(str, buf, size+1); + fclose(mem); + free(buf); + return (Str_t){.data=str, .length=size, .stride=1}; +} + +public Str_t Str__quoted(const Str_t text, const char *dsl, bool colorize) +{ + char *buf; + size_t size; + FILE *mem = open_memstream(&buf, &size); + if (colorize) fputs("\x1b[35m", mem); + if (dsl && dsl[0]) fprintf(mem, "$%s", dsl); + const char *escape_color = colorize ? "\x1b[1;34m" : ""; + const char *reset_color = colorize ? "\x1b[0;35m" : ""; + fputc('"', mem); + for (int64_t i = 0; i < text.length; i++) { + char c = text.data[i*text.stride]; + switch (c) { + case '\\': fprintf(mem, "%s\\\\%s", escape_color, reset_color); break; + case '"': fprintf(mem, "%s\\\"%s", escape_color, reset_color); break; + case '\n': fprintf(mem, "%s\\n%s", escape_color, reset_color); break; + case '\t': fprintf(mem, "%s\\t%s", escape_color, reset_color); break; + case '\r': fprintf(mem, "%s\\r%s", escape_color, reset_color); break; + case '\a': fprintf(mem, "%s\\a%s", escape_color, reset_color); break; + case '\b': fprintf(mem, "%s\\b%s", escape_color, reset_color); break; + case '\v': fprintf(mem, "%s\\v%s", escape_color, reset_color); break; + default: { + if (isprint(c)) + fputc(c, mem); + else + fprintf(mem, "%s\\x%02X%s", escape_color, (int)c, reset_color); + } + } + } + fputc('"', mem); + if (colorize) fputs("\x1b[m", mem); + fflush(mem); + char *str = GC_MALLOC_ATOMIC(size + 1); + memcpy(str, buf, size+1); + fclose(mem); + free(buf); + return (Str_t){.data=str, .length=size, .stride=1}; +} + +public Str_Array_t Str__split(const Str_t str, const Str_t split_chars) +{ + if (str.length == 0) return (Str_Array_t){.stride=sizeof(Str_t)}; + Str_Array_t strings = {.stride=sizeof(Str_t)}; + int64_t capacity = 0; + bool separators[256] = {0}; + for (int64_t i = 0; i < split_chars.length; i++) + separators[(int)split_chars.data[split_chars.stride*i]] = true; + + for (int64_t i = 0; i < str.length; i++) { + if (separators[(int)str.data[str.stride*i]]) continue; + int64_t length = 0; + while (i < str.length && !separators[(int)str.data[str.stride*i]]) { + ++length; + ++i; + } + strings.data = GC_REALLOC(strings.data, sizeof(Str_t)*(capacity += 1)); + strings.data[strings.length++] = (Str_t){ + .data=&str.data[str.stride*(i-length)], + .length=length, + .stride=str.stride, + }; + } + return strings; +} + +public Str_t Str__join(Str_t glue, Str_Array_t pieces) +{ + if (pieces.length == 0) return (Str_t){.stride=1}; + + int64_t length = 0; + for (int64_t i = 0; i < pieces.length; i++) { + if (i > 0) length += glue.length; + length += ((Str_t*)((void*)pieces.data + i*pieces.stride))->length; + } + char *data = GC_MALLOC_ATOMIC((size_t)length+1); + char *ptr = data; + for (int64_t i = 0; i < pieces.length; i++) { + if (i > 0) { + for (int64_t j = 0; j < glue.length; j++) + *(ptr++) = glue.data[j*glue.stride]; + } + Str_t piece = *(Str_t*)((void*)pieces.data + i*pieces.stride); + for (int64_t j = 0; j < piece.length; j++) + *(ptr++) = piece.data[j*piece.stride]; + } + return (Str_t){.data=data, .length=length, .stride=1}; +} + +public struct { + TypeInfo type; + Str_t (*uppercased)(const Str_t s); + Str_t (*lowercased)(const Str_t s); + Str_t (*capitalized)(const Str_t s); + Str_t (*titlecased)(const Str_t s); + bool (*starts_with)(const Str_t s, const Str_t prefix); + bool (*ends_with)(const Str_t s, const Str_t suffix); + Str_t (*without_prefix)(const Str_t s, const Str_t prefix); + Str_t (*without_suffix)(const Str_t s, const Str_t suffix); + Str_t (*trimmed)(const Str_t s, const Str_t trim_chars, bool trim_left, bool trim_right); + Str_t (*slice)(Str_t *s, int64_t first, int64_t stride, int64_t length, bool readonly, const TypeInfo *type); + const char *(*c_string)(const Str_t str); + Str_t (*from_c_string)(const char *str); + find_result_t (*find)(const Str_t str, const Str_t pat); + Str_t (*replace)(Str_t text, Str_t pat, Str_t replacement, int64_t limit); + Str_t (*quoted)(const Str_t text, const char *dsl, bool colorize); + Str_Array_t (*split)(const Str_t str, const Str_t split_chars); + Str_t (*join)(Str_t glue, Str_Array_t pieces); + bool (*equal)(const Str_t *x, const Str_t *y); + int32_t (*compare)(const Str_t *x, const Str_t *y); + int (*hash)(const Str_t *s, const TypeInfo *type); + CORD (*cord)(const Str_t *s, bool colorize, const TypeInfo *type); +} Str_type = { + .type={ + .name="Str", + .size=sizeof(Str_t), + .align=alignof(Str_t), + .tag=CustomInfo, + .CustomInfo={ + .cord=(void*)Str__cord, + .compare=(void*)Str__compare, + .equal=(void*)Str__equal, + .hash=(void*)Str__hash, + }, + }, + .uppercased=Str__uppercased, + .lowercased=Str__lowercased, + .capitalized=Str__capitalized, + .titlecased=Str__titlecased, + .starts_with=Str__starts_with, + .ends_with=Str__ends_with, + .without_prefix=Str__without_prefix, + .without_suffix=Str__without_suffix, + .trimmed=Str__trimmed, + .slice=(void*)Array_slice, + .c_string=Str__c_string, + .from_c_string=Str__from_c_string, + .find=Str__find, + .replace=Str__replace, + .quoted=Str__quoted, + .split=Str__split, + .join=Str__join, +}; + +public CORD CString_cord(const char **s, bool colorize, const TypeInfo *type) +{ + Str_t str = {.data=(char*)*s, .length=*s ? strlen(*s) : 0, .stride=1}; + return Str__cord(&str, colorize, type); +} + +public uint32_t CString_hash(const char **s, const TypeInfo *type) +{ + (void)type; + if (!*s) return 0; + uint32_t hash; + halfsiphash(*s, strlen(*s)+1, SSS_HASH_VECTOR, (uint8_t*)&hash, sizeof(hash)); + assert(strlen(*s) > 0); + return hash; +} + +public uint32_t CString_compare(const char **x, const char **y, const TypeInfo *type) +{ + (void)type; + if (!*x || !*y) + return (!*x) - (!*y); + return strcmp(*x, *y); +} + +public struct { + TypeInfo type; + const char *(*from_str)(const Str_t str); + Str_t (*as_str)(const char *str); +} CString_type = { + .type={ + .name="CString", + .size=sizeof(char*), + .align=alignof(char*), + .tag=CustomInfo, + .CustomInfo={ + .cord=(void*)CString_cord, + .hash=(void*)CString_hash, + .compare=(void*)CString_compare, + }, + }, + .from_str=Str__c_string, + .as_str=Str__from_c_string, +}; + +static CORD Cord_cord(const CORD *c, bool colorize, const TypeInfo *type) +{ + const char *str = CORD_to_const_char_star(*c); + return CString_cord(&str, colorize, type); +} + +static uint32_t Cord_hash(const CORD *c, const TypeInfo *type) +{ + const char *str = CORD_to_const_char_star(*c); + return CString_hash(&str, type); +} + +static uint32_t Cord_compare(const char **x, const char **y, const TypeInfo *type) +{ + (void)type; + return CORD_cmp(*x, *y); +} + +public struct { + TypeInfo type; +} Cord_type = { + .type={ + .name="Cord", + .size=sizeof(CORD), + .align=alignof(CORD), + .tag=CustomInfo, + .CustomInfo={ + .cord=(void*)Cord_cord, + .hash=(void*)Cord_hash, + .compare=(void*)Cord_compare, + }, + }, +}; + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 |
