Hook up Text.has(), Text.trimmed(), Text.without()

author: Bruce Hill <bruce@bruce-hill.com> 2024-05-20 15:19:31 -0400
committer: Bruce Hill <bruce@bruce-hill.com> 2024-05-20 15:19:31 -0400
commit: 5b1960859fcc6331a486ced98c5f25d93168fa18 (patch)
tree: a4969d21479d68f86a65f2cbb396878336db7bf2 /builtins
parent: 7dddfb71a0ebf8dbbe8a9bdfc156d7471ff943bd (diff)
4 files changed, 155 insertions, 52 deletions
diff --git a/builtins/text.c b/builtins/text.c
index 9a9c3801..a3161926 100644
--- a/builtins/text.c
+++ b/builtins/text.c
@@ -24,6 +24,14 @@
 
 #define CLAMP(x, lo, hi) MIN(hi, MAX(x,lo))
 
+static inline uint8_t *_normalize(CORD str, uint8_t *buf, size_t *len)
+{   // NFD-normalize `str`; callers free() the result when it is not `buf`
+    const char *utf8 = CORD_to_const_char_star(str);
+    uint8_t *nfd = u8_normalize(UNINORM_NFD, (const uint8_t*)utf8, strlen(utf8)+1, buf, len); // +1 includes the NUL
+    if (nfd == NULL) errx(1, "Unicode normalization error!");
+    return nfd;
+}
+
 public CORD Text$as_text(const void *text, bool colorize, const TypeInfo *info)
 {
     if (!text) return info->TextInfo.lang;
@@ -111,12 +119,8 @@ public uint32_t Text$hash(const CORD *cord)
 {
     if (!*cord) return 0;
 
-    const char *str = CORD_to_const_char_star(*cord);
-    size_t len = strlen(str);
-    uint8_t buf[128] = {0};
-    size_t norm_len = sizeof(buf);
-    uint8_t *normalized = u8_normalize(UNINORM_NFD, (uint8_t*)str, len+1, buf, &norm_len);
-    if (!normalized) errx(1, "Unicode normalization error!");
+    uint8_t buf[128] = {0}; size_t norm_len = sizeof(buf);
+    uint8_t *normalized = _normalize(*cord, buf, &norm_len);
 
     uint32_t hash;
     halfsiphash(normalized, norm_len, TOMO_HASH_VECTOR, (uint8_t*)&hash, sizeof(hash));
@@ -166,51 +170,75 @@ public CORD Text$title(CORD str)
     return (CORD)u8_totitle((const uint8_t*)str, len-1, uc_locale_language(), NULL, dest, &len);
 }
 
-public bool Text$has(CORD str, CORD target, where_e where)
+public bool Text$has(CORD str, CORD target, Where_t where) // Is `target` at the start/end/anywhere in `str`? Both are NFD-normalized before comparing
 {
     if (!target) return true;
     if (!str) return false;
 
-    if (where == WHERE_START) {
-        return (CORD_ncmp(str, 0, target, 0, CORD_len(target)) == 0);
-    } else if (where == WHERE_END) {
-        size_t str_len = CORD_len(str);
-        size_t target_len = CORD_len(target);
-        return (str_len >= target_len && CORD_ncmp(str, str_len-target_len, target, 0, target_len) == 0);
+    uint8_t str_buf[128] = {0}; size_t str_norm_len = sizeof(str_buf);
+    uint8_t *str_normalized = _normalize(str, str_buf, &str_norm_len);
+
+    uint8_t target_buf[128] = {0}; size_t target_norm_len = sizeof(target_buf);
+    uint8_t *target_normalized = _normalize(target, target_buf, &target_norm_len);
+
+    bool ret;
+    if (target_norm_len > str_norm_len) {
+        ret = false; // Target cannot fit; fall through (no early return) so heap-allocated normalizations still get freed
+    } else if (where.$tag == $tag$Where$Start) {
+        ret = (u8_strncmp(str_normalized, target_normalized, target_norm_len-1) == 0); // -1: lengths count the NUL
+    } else if (where.$tag == $tag$Where$End) {
+        ret = (u8_strcmp(str_normalized + str_norm_len - target_norm_len, target_normalized) == 0);
     } else {
-        size_t pos = CORD_str(str, 0, target);
-        return (pos != CORD_NOT_FOUND);
+        assert(where.$tag == $tag$Where$Anywhere);
+        ret = (u8_strstr(str_normalized, target_normalized) != NULL);
     }
+
+    if (str_normalized != str_buf) free(str_normalized);
+    if (target_normalized != target_buf) free(target_normalized);
+    return ret;
 }
 
-public CORD Text$without(CORD str, CORD target, where_e where)
+public CORD Text$without(CORD str, CORD target, Where_t where) // Returns `str` with `target` removed at the start, at the end, or everywhere
 {
     if (!str || !target) return str;
 
     size_t target_len = CORD_len(target);
     size_t str_len = CORD_len(str);
-    if (where == WHERE_START) {
+    if (where.$tag == $tag$Where$Start) {
         if (CORD_ncmp(str, 0, target, 0, target_len) == 0)
             return CORD_substr(str, target_len, str_len - target_len);
         return str;
-    } else if (where == WHERE_END) {
+    } else if (where.$tag == $tag$Where$End) {
         if (CORD_ncmp(str, str_len-target_len, target, 0, target_len) == 0)
             return CORD_substr(str, 0, str_len - target_len);
         return str;
     } else {
-        errx(1, "Not implemented");
+        // Anywhere: rebuild the text from the pieces between successive matches.
+        // A zero-length (but non-NULL) target adds nothing to `i` after a match,
+        // so bail out first; otherwise the scan below could never terminate.
+        if (target_len == 0) return str;
+        CORD ret = CORD_EMPTY;
+        size_t i = 0, match;
+        while ((match = CORD_str(str, i, target)) != CORD_NOT_FOUND) {
+            ret = CORD_cat(ret, CORD_substr(str, i, match - i)); // Keep the text before this match
+            i = match + target_len; // Resume just past the match
+        }
+        if (i == 0) return str; // No matches!
+        // Append whatever follows the final match
+        return CORD_cat(ret, CORD_substr(str, i, str_len));
     }
 }
 
-public CORD Text$trimmed(CORD str, CORD skip, where_e where)
+public CORD Text$trimmed(CORD str, CORD skip, Where_t where)
 {
     if (!str || !skip) return str;
     const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(str);
     const uint8_t *uskip = (const uint8_t*)CORD_to_const_char_star(skip);
-    if (where == WHERE_START) {
+    // TODO: implement proper reverse iteration with u8_prev()
+    if (where.$tag == $tag$Where$Start) {
         size_t span = u8_strspn(ustr, uskip);
         return (CORD)ustr + span;
-    } else if (where == WHERE_END) {
+    } else if (where.$tag == $tag$Where$End) {
         size_t len = u8_strlen(ustr);
         const uint8_t *back = ustr + len;
         size_t back_span = 0;
@@ -287,12 +315,8 @@ public CORD Text$join(CORD glue, array_t pieces)
 public array_t Text$clusters(CORD text)
 {
     array_t clusters = {.atomic=1};
-    const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(text);
-    uint8_t buf[128] = {0};
-    size_t norm_len = sizeof(buf);
-    uint8_t *normalized = u8_normalize(UNINORM_NFD, ustr, strlen((char*)ustr)+1, buf, &norm_len);
-    if (!normalized) errx(1, "Unicode normalization error!");
-
+    uint8_t buf[128] = {0}; size_t norm_len = sizeof(buf);
+    uint8_t *normalized = _normalize(text, buf, &norm_len);
     const uint8_t *end = normalized + strlen((char*)normalized);
     for (const uint8_t *pos = normalized; pos != end; ) {
         const uint8_t *next = u8_grapheme_next(pos, end);
@@ -310,11 +334,8 @@ public array_t Text$clusters(CORD text)
 
 public array_t Text$codepoints(CORD text)
 {
-    const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(text);
-    uint8_t norm_buf[128] = {0};
-    size_t norm_len = sizeof(norm_buf);
-    uint8_t *normalized = u8_normalize(UNINORM_NFD, ustr, strlen((char*)ustr)+1, norm_buf, &norm_len);
-    if (!normalized) errx(1, "Unicode normalization error!");
+    uint8_t norm_buf[128] = {0}; size_t norm_len = sizeof(norm_buf);
+    uint8_t *normalized = _normalize(text, norm_buf, &norm_len);
 
     uint32_t codepoint_buf[128] = {0};
     size_t codepoint_len = sizeof(codepoint_buf);
@@ -333,11 +354,8 @@ public array_t Text$codepoints(CORD text)
 
 public array_t Text$bytes(CORD text)
 {
-    const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(text);
-    uint8_t norm_buf[128] = {0};
-    size_t norm_len = sizeof(norm_buf);
-    uint8_t *normalized = u8_normalize(UNINORM_NFD, ustr, strlen((char*)ustr)+1, norm_buf, &norm_len);
-    if (!normalized) errx(1, "Unicode normalization error!");
+    uint8_t norm_buf[128] = {0}; size_t norm_len = sizeof(norm_buf);
+    uint8_t *normalized = _normalize(text, norm_buf, &norm_len);
 
     --norm_len; // NUL byte
     array_t ret = {
@@ -366,11 +384,8 @@ public int64_t Text$num_clusters(CORD text)
 
 public int64_t Text$num_codepoints(CORD text)
 {
-    const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(text);
-    uint8_t buf[128] = {0};
-    size_t norm_len = sizeof(buf);
-    uint8_t *normalized = u8_normalize(UNINORM_NFD, ustr, strlen((char*)ustr)+1, buf, &norm_len);
-    if (!normalized) errx(1, "Unicode normalization error!");
+    uint8_t buf[128] = {0}; size_t norm_len = sizeof(buf);
+    uint8_t *normalized = _normalize(text, buf, &norm_len);
     int64_t num_codepoints = u8_mbsnlen(normalized, norm_len-1);
     if (normalized != buf) free(normalized);
     return num_codepoints;
@@ -378,10 +393,8 @@ public int64_t Text$num_codepoints(CORD text)
 
 public int64_t Text$num_bytes(CORD text)
 {
-    const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(text);
-    uint8_t norm_buf[128] = {0};
-    size_t norm_len = sizeof(norm_buf);
-    uint8_t *normalized = u8_normalize(UNINORM_NFD, ustr, strlen((char*)ustr)+1, norm_buf, &norm_len);
+    uint8_t norm_buf[128] = {0}; size_t norm_len = sizeof(norm_buf);
+    uint8_t *normalized = _normalize(text, norm_buf, &norm_len);
     --norm_len; // NUL byte
     if (!normalized) errx(1, "Unicode normalization error!");
     if (normalized != norm_buf) free(normalized);
diff --git a/builtins/text.h b/builtins/text.h
index cd191cb6..e97c4010 100644
--- a/builtins/text.h
+++ b/builtins/text.h
@@ -8,11 +8,10 @@
 #include <stdint.h>
 
 #include "types.h"
+#include "where.h"
 
 #define Text_t CORD
 
-typedef enum { WHERE_ANYWHERE, WHERE_START, WHERE_END } where_e;
-
 typedef struct {
     enum { FIND_FAILURE, FIND_SUCCESS } status;
     int32_t index;
@@ -27,9 +26,9 @@ CORD Text$slice(CORD text, int64_t first, int64_t length);
 CORD Text$upper(CORD str);
 CORD Text$lower(CORD str);
 CORD Text$title(CORD str);
-bool Text$has(CORD str, CORD target, where_e where);
-CORD Text$without(CORD str, CORD target, where_e where);
-CORD Text$trimmed(CORD str, CORD skip, where_e where);
+bool Text$has(CORD str, CORD target, Where_t where);
+CORD Text$without(CORD str, CORD target, Where_t where);
+CORD Text$trimmed(CORD str, CORD skip, Where_t where);
 find_result_t Text$find(CORD str, CORD pat);
 CORD Text$replace(CORD text, CORD pat, CORD replacement, int64_t limit);
 array_t Text$split(CORD str, CORD split);
diff --git a/builtins/where.c b/builtins/where.c
new file mode 100644
index 00000000..5447d8c9
--- /dev/null
+++ b/builtins/where.c
@@ -0,0 +1,54 @@
+// A type called "Where" that is an enum for "Anywhere", "Start", or "End"
+// Mainly used for text methods
+
+#include <gc/cord.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "types.h"
+#include "where.h"
+#include "util.h"
+
+static CORD Where$Anywhere$as_text(Where$Anywhere_t *obj, bool use_color)
+{
+    if (obj == NULL) return "Anywhere"; // NULL `obj`: return just the bare name
+    return use_color ? CORD_all("\x1b[0;1mAnywhere\x1b[m(", ")") : CORD_all("Anywhere(", ")");
+}
+
+static CORD Where$Start$as_text(Where$Start_t *obj, bool use_color)
+{
+    if (obj == NULL) return "Start"; // NULL `obj`: return just the bare name
+    return use_color ? CORD_all("\x1b[0;1mStart\x1b[m(", ")") : CORD_all("Start(", ")");
+}
+
+static CORD Where$End$as_text(Where$End_t *obj, bool use_color)
+{
+    if (obj == NULL) return "End"; // NULL `obj`: return just the bare name
+    return use_color ? CORD_all("\x1b[0;1mEnd\x1b[m(", ")") : CORD_all("End(", ")");
+}
+
+static CORD Where$as_text(Where_t *obj, bool use_color)
+{
+    // NULL `obj`: return the bare type name
+    if (obj == NULL) return "Where";
+
+    // Otherwise pick the (optionally ANSI-colored) label matching the tag
+    if (obj->$tag == $tag$Where$Anywhere)
+        return use_color ? "\x1b[36;1mWhere.Anywhere\x1b[m" : "Where.Anywhere";
+    else if (obj->$tag == $tag$Where$Start)
+        return use_color ? "\x1b[36;1mWhere.Start\x1b[m" : "Where.Start";
+    else if (obj->$tag == $tag$Where$End)
+        return use_color ? "\x1b[36;1mWhere.End\x1b[m" : "Where.End";
+
+    return CORD_EMPTY; // Unrecognized tag
+}
+
+public const Where_t Where$tagged$Anywhere = {$tag$Where$Anywhere}; // Ready-made Where values, one per tag
+public const Where_t Where$tagged$Start = {$tag$Where$Start};
+public const Where_t Where$tagged$End = {$tag$Where$End};
+public const TypeInfo Where$Anywhere = {0, 0, {.tag=CustomInfo, .CustomInfo={.as_text=(void*)Where$Anywhere$as_text}}}; // TypeInfo records: each wires a type to its as_text function above
+public const TypeInfo Where$Start = {0, 0, {.tag=CustomInfo, .CustomInfo={.as_text=(void*)Where$Start$as_text}}};
+public const TypeInfo Where$End = {0, 0, {.tag=CustomInfo, .CustomInfo={.as_text=(void*)Where$End$as_text}}};
+public const TypeInfo Where = {4, 4, {.tag=CustomInfo, .CustomInfo={.as_text=(void*)Where$as_text}}}; // NOTE(review): the leading {4, 4} is presumably size/alignment of Where_t -- confirm against TypeInfo in types.h
+
+// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0
diff --git a/builtins/where.h b/builtins/where.h
new file mode 100644
index 00000000..75de22a3
--- /dev/null
+++ b/builtins/where.h
@@ -0,0 +1,37 @@
+#pragma once
+
+// Type info and methods for Where datatype (Anywhere, Start, or End enum)
+// Mainly used for text methods.
+
+#include <gc/cord.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "types.h"
+
+typedef struct Where_s Where_t; // Tagged union with three variants: Anywhere, Start, End
+extern const TypeInfo Where;
+typedef struct Where$Anywhere_s Where$Anywhere_t;
+extern const TypeInfo Where$Anywhere;
+typedef struct Where$Start_s Where$Start_t;
+extern const TypeInfo Where$Start;
+typedef struct Where$End_s Where$End_t;
+extern const TypeInfo Where$End;
+
+struct Where$Anywhere_s {}; // The variants carry no data, so their payload structs are empty (NOTE(review): empty structs are a GNU C extension)
+struct Where$Start_s {};
+struct Where$End_s {};
+struct Where_s {
+    enum { $tag$Where$Anywhere = 0, $tag$Where$Start = 1, $tag$Where$End = 2 } $tag; // Which variant this value holds
+    union { // Per-variant payloads (all empty)
+        Where$Anywhere_t Anywhere;
+        Where$Start_t Start;
+        Where$End_t End;
+    };
+};
+
+extern const Where_t Where$tagged$Anywhere; // Constant instances, one per tag
+extern const Where_t Where$tagged$Start;
+extern const Where_t Where$tagged$End;
+
+// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0
author	Bruce Hill <bruce@bruce-hill.com>	2024-05-20 15:19:31 -0400
committer	Bruce Hill <bruce@bruce-hill.com>	2024-05-20 15:19:31 -0400
commit	5b1960859fcc6331a486ced98c5f25d93168fa18 (patch)
tree	a4969d21479d68f86a65f2cbb396878336db7bf2 /builtins
parent	7dddfb71a0ebf8dbbe8a9bdfc156d7471ff943bd (diff)