// Type info and methods for Text datatype, which uses the Boehm "cord" library // and libunistr #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "array.h" #include "functions.h" #include "halfsiphash.h" #include "integers.h" #include "text.h" #include "types.h" #define CLAMP(x, lo, hi) MIN(hi, MAX(x,lo)) static inline uint8_t *_normalize(CORD str, uint8_t *buf, size_t *len) { const uint8_t *str_u8 = (const uint8_t*)CORD_to_const_char_star(str); uint8_t *normalized = u8_normalize(UNINORM_NFD, str_u8, strlen((char*)str_u8)+1, buf, len); if (!normalized) errx(1, "Unicode normalization error!"); return normalized; } public CORD Text$as_text(const void *text, bool colorize, const TypeInfo *info) { if (!text) return info->TextInfo.lang; CORD ret = Text$quoted(*(CORD*)text, colorize); if (!streq(info->TextInfo.lang, "Text")) ret = colorize ? CORD_all("\x1b[1m$", info->TextInfo.lang, "\x1b[m", ret) : CORD_all("$", info->TextInfo.lang, ret); return ret; } public CORD Text$quoted(CORD str, bool colorize) { // Note: it's important to have unicode strings not get broken up with // escapes, otherwise they won't print right. if (colorize) { CORD quoted = "\x1b[35m\""; CORD_pos i; CORD_FOR(i, str) { char c = CORD_pos_fetch(i); switch (c) { #define BACKSLASHED(esc) "\x1b[34m\\\x1b[1m" esc "\x1b[0;35m" case '\a': quoted = CORD_cat(quoted, BACKSLASHED("a")); break; case '\b': quoted = CORD_cat(quoted, BACKSLASHED("b")); break; case '\x1b': quoted = CORD_cat(quoted, BACKSLASHED("e")); break; case '\f': quoted = CORD_cat(quoted, BACKSLASHED("f")); break; case '\n': quoted = CORD_cat(quoted, BACKSLASHED("n")); break; case '\r': quoted = CORD_cat(quoted, BACKSLASHED("r")); break; case '\t': quoted = CORD_cat(quoted, BACKSLASHED("t")); break; case '\v': quoted = CORD_cat(quoted, BACKSLASHED("v")); break; case '"': quoted = CORD_cat(quoted, BACKSLASHED("\"")); break; case '\\': quoted = CORD_cat(quoted, BACKSLASHED("\\")); break; case '\x00' ... '\x06': case '\x0E' ... '\x1A': case '\x1C' ... '\x1F': case '\x7F' ... '\x7F': CORD_sprintf("ed, "%r" BACKSLASHED("x%02X"), quoted, c); break; default: quoted = CORD_cat_char(quoted, c); break; #undef BACKSLASHED } } quoted = CORD_cat(quoted, "\"\x1b[m"); return quoted; } else { CORD quoted = "\""; CORD_pos i; CORD_FOR(i, str) { char c = CORD_pos_fetch(i); switch (c) { case '\a': quoted = CORD_cat(quoted, "\\a"); break; case '\b': quoted = CORD_cat(quoted, "\\b"); break; case '\x1b': quoted = CORD_cat(quoted, "\\e"); break; case '\f': quoted = CORD_cat(quoted, "\\f"); break; case '\n': quoted = CORD_cat(quoted, "\\n"); break; case '\r': quoted = CORD_cat(quoted, "\\r"); break; case '\t': quoted = CORD_cat(quoted, "\\t"); break; case '\v': quoted = CORD_cat(quoted, "\\v"); break; case '"': quoted = CORD_cat(quoted, "\\\""); break; case '\\': quoted = CORD_cat(quoted, "\\\\"); break; case '\x00' ... '\x06': case '\x0E' ... '\x1A': case '\x1C' ... '\x1F': case '\x7F' ... '\x7F': CORD_sprintf("ed, "%r\\x%02X", quoted, c); break; default: quoted = CORD_cat_char(quoted, c); break; } } quoted = CORD_cat_char(quoted, '"'); return quoted; } } public int Text$compare(const CORD *x, const CORD *y) { uint8_t *xx = (uint8_t*)CORD_to_const_char_star(*x); uint8_t *yy = (uint8_t*)CORD_to_const_char_star(*y); int result = 0; if (u8_normcmp(xx, strlen((char*)xx), yy, strlen((char*)yy), UNINORM_NFD, &result)) fail("Something went wrong while comparing text"); return result; } public bool Text$equal(const CORD *x, const CORD *y) { return Text$compare(x, y) == 0; } public uint32_t Text$hash(const CORD *cord) { if (!*cord) return 0; uint8_t buf[128] = {0}; size_t norm_len = sizeof(buf); uint8_t *normalized = _normalize(*cord, buf, &norm_len); uint32_t hash; halfsiphash(normalized, norm_len, TOMO_HASH_KEY, (uint8_t*)&hash, sizeof(hash)); if (normalized != buf) free(normalized); return hash; } public CORD Text$upper(CORD str) { if (!str) return str; size_t len = strlen(str) + 1; uint8_t *dest = GC_MALLOC_ATOMIC(len); dest[len-1] = 0; return (CORD)u8_toupper((const uint8_t*)str, len-1, uc_locale_language(), NULL, dest, &len); } public CORD Text$lower(CORD str) { if (!str) return str; size_t len = strlen(str) + 1; uint8_t *dest = GC_MALLOC_ATOMIC(len); dest[len-1] = 0; return (CORD)u8_tolower((const uint8_t*)str, len-1, uc_locale_language(), NULL, dest, &len); } public CORD Text$title(CORD str) { if (!str) return str; size_t len = strlen(str) + 1; uint8_t *dest = GC_MALLOC_ATOMIC(len); dest[len-1] = 0; return (CORD)u8_totitle((const uint8_t*)str, len-1, uc_locale_language(), NULL, dest, &len); } public bool Text$has(CORD str, CORD target, Where_t where) { if (!target) return true; if (!str) return false; uint8_t str_buf[128] = {0}; size_t str_norm_len = sizeof(str_buf); uint8_t *str_normalized = _normalize(str, str_buf, &str_norm_len); uint8_t target_buf[128] = {0}; size_t target_norm_len = sizeof(target_buf); uint8_t *target_normalized = _normalize(target, target_buf, &target_norm_len); if (target_norm_len > str_norm_len) return false; bool ret; if (where.$tag == $tag$Where$Start) { ret = (u8_strncmp(str_normalized, target_normalized, target_norm_len-1) == 0); } else if (where.$tag == $tag$Where$End) { ret = (u8_strcmp(str_normalized + str_norm_len - target_norm_len, target_normalized) == 0); } else { assert(where.$tag == $tag$Where$Anywhere); ret = (u8_strstr(str_normalized, target_normalized) != NULL); } if (str_normalized != str_buf) free(str_normalized); if (target_normalized != target_buf) free(target_normalized); return ret; } public CORD Text$without(CORD str, CORD target, Where_t where) { if (!str || !target) return str; size_t target_len = CORD_len(target); size_t str_len = CORD_len(str); if (where.$tag == $tag$Where$Start) { if (CORD_ncmp(str, 0, target, 0, target_len) == 0) return CORD_substr(str, target_len, str_len - target_len); return str; } else if (where.$tag == $tag$Where$End) { if (CORD_ncmp(str, str_len-target_len, target, 0, target_len) == 0) return CORD_substr(str, 0, str_len - target_len); return str; } else { CORD ret = CORD_EMPTY; size_t i = 0; for (;;) { size_t match = CORD_str(str, i, target); if (match == CORD_NOT_FOUND) { if (i == 0) return str; // No matches! ret = CORD_cat(ret, CORD_substr(str, i, str_len)); break; } ret = CORD_cat(ret, CORD_substr(str, i, (match-i))); i = match + target_len; } return ret; } } public CORD Text$trimmed(CORD str, CORD skip, Where_t where) { if (!str || !skip) return str; const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(str); const uint8_t *uskip = (const uint8_t*)CORD_to_const_char_star(skip); // TODO: implement proper reverse iteration with u8_prev() if (where.$tag == $tag$Where$Start) { size_t span = u8_strspn(ustr, uskip); return (CORD)ustr + span; } else if (where.$tag == $tag$Where$End) { size_t len = u8_strlen(ustr); const uint8_t *back = ustr + len; size_t back_span = 0; while (back - back_span > ustr && u8_strspn(back-back_span-1, uskip) > back_span) ++back_span; return CORD_substr((CORD)ustr, 0, len - back_span); } else { size_t span = u8_strspn(ustr, uskip); size_t len = u8_strlen(ustr); const uint8_t *back = ustr + len; size_t back_span = 0; while (back - back_span > ustr + span && u8_strspn(back-back_span-1, uskip) > back_span) ++back_span; return CORD_substr((CORD)(ustr + span), 0, len - span - back_span); } } public find_result_t Text$find(CORD str, CORD pat) { if (!pat) return (find_result_t){.status=FIND_SUCCESS, .index=1}; size_t pos = CORD_str(str, 0, pat); return (pos == CORD_NOT_FOUND) ? (find_result_t){.status=FIND_FAILURE} : (find_result_t){.status=FIND_SUCCESS, .index=(int32_t)pos}; } public CORD Text$replace(CORD text, CORD pat, CORD replacement, Int_t int_limit) { if (!text || !pat) return text; CORD ret = CORD_EMPTY; size_t pos = 0, pat_len = CORD_len(pat); int64_t limit = Int$as_i64(int_limit); for (size_t found; limit != 0 && (found=CORD_str(text, pos, pat)) != CORD_NOT_FOUND; --limit) { ret = CORD_all(ret, CORD_substr(text, pos, found - pos), replacement); pos = found + pat_len; } size_t str_len = CORD_len(text); return CORD_cat(ret, CORD_substr(text, pos, str_len - pos)); } public array_t Text$split(CORD str, CORD split) { if (!str) return (array_t){.data=GC_MALLOC(sizeof(CORD)), .atomic=1, .length=1, .stride=sizeof(CORD)}; array_t strings = {.stride=sizeof(CORD), .atomic=1}; const uint8_t *ustr = (uint8_t*)CORD_to_const_char_star(str); const uint8_t *usplit = (uint8_t*)CORD_to_const_char_star(split); for (int64_t i = 0; ; ) { size_t non_split = u8_strcspn(ustr + i, usplit); CORD chunk = CORD_substr((CORD)ustr, i, non_split); Array$insert(&strings, &chunk, I(0), sizeof(CORD)); i += non_split; size_t split_span = u8_strspn(ustr + i, usplit); if (split_span == 0) break; i += split_span; } return strings; } public CORD Text$join(CORD glue, array_t pieces) { if (pieces.length == 0) return CORD_EMPTY; CORD ret = CORD_EMPTY; for (int64_t i = 0; i < pieces.length; i++) { if (i > 0) ret = CORD_cat(ret, glue); ret = CORD_cat(ret, *(CORD*)((void*)pieces.data + i*pieces.stride)); } return ret; } public array_t Text$clusters(CORD text) { array_t clusters = {.atomic=1}; uint8_t buf[128] = {0}; size_t norm_len = sizeof(buf); uint8_t *normalized = _normalize(text, buf, &norm_len); const uint8_t *end = normalized + strlen((char*)normalized); for (const uint8_t *pos = normalized; pos != end; ) { const uint8_t *next = u8_grapheme_next(pos, end); size_t len = (size_t)(next - pos); char cluster_buf[len+1]; strlcpy(cluster_buf, (char*)pos, len+1); CORD cluster = CORD_from_char_star(cluster_buf); Array$insert(&clusters, &cluster, I(0), sizeof(CORD)); pos = next; } if (normalized != buf) free(normalized); return clusters; } public array_t Text$codepoints(CORD text) { uint8_t norm_buf[128] = {0}; size_t norm_len = sizeof(norm_buf); uint8_t *normalized = _normalize(text, norm_buf, &norm_len); uint32_t codepoint_buf[128] = {0}; size_t codepoint_len = sizeof(codepoint_buf); uint32_t *codepoints = u8_to_u32(normalized, norm_len-1, codepoint_buf, &codepoint_len); array_t ret = { .length=codepoint_len, .data=memcpy(GC_MALLOC_ATOMIC(sizeof(int32_t[codepoint_len])), codepoints, sizeof(int32_t[codepoint_len])), .stride=sizeof(int32_t), .atomic=1, }; if (normalized != norm_buf) free(normalized); if (codepoints != codepoint_buf) free(codepoints); return ret; } public array_t Text$bytes(CORD text) { uint8_t norm_buf[128] = {0}; size_t norm_len = sizeof(norm_buf); uint8_t *normalized = _normalize(text, norm_buf, &norm_len); --norm_len; // NUL byte array_t ret = { .length=norm_len, .data=memcpy(GC_MALLOC_ATOMIC(sizeof(uint8_t[norm_len])), normalized, sizeof(uint8_t[norm_len])), .stride=sizeof(uint8_t), .atomic=1, }; if (normalized != norm_buf) free(normalized); return ret; } public Int_t Text$num_clusters(CORD text) { const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(text); int64_t num_clusters = 0; const uint8_t *end = ustr + u8_strlen(ustr); for (const uint8_t *pos = ustr; pos != end; ) { const uint8_t *next = u8_grapheme_next(pos, end); ++num_clusters; pos = next; } return I(num_clusters); } public Int_t Text$num_codepoints(CORD text) { uint8_t buf[128] = {0}; size_t norm_len = sizeof(buf); uint8_t *normalized = _normalize(text, buf, &norm_len); int64_t num_codepoints = u8_mbsnlen(normalized, norm_len-1); if (normalized != buf) free(normalized); return I(num_codepoints); } public Int_t Text$num_bytes(CORD text) { uint8_t norm_buf[128] = {0}; size_t norm_len = sizeof(norm_buf); uint8_t *normalized = _normalize(text, norm_buf, &norm_len); --norm_len; // NUL byte if (!normalized) errx(1, "Unicode normalization error!"); if (normalized != norm_buf) free(normalized); return I(norm_len); } public array_t Text$character_names(CORD text) { array_t codepoints = Text$codepoints(text); array_t ret = {.length=codepoints.length, .stride=sizeof(CORD), .data=GC_MALLOC(sizeof(CORD[codepoints.length]))}; for (int64_t i = 0; i < codepoints.length; i++) { char buf[UNINAME_MAX]; unicode_character_name(*(ucs4_t*)(codepoints.data + codepoints.stride*i), buf); *(CORD*)(ret.data + ret.stride*i) = CORD_from_char_star(buf); } return ret; } public const TypeInfo $Text = { .size=sizeof(CORD), .align=__alignof__(CORD), .tag=TextInfo, .TextInfo={.lang="Text"}, }; // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0