Hook up Text.has(), Text.trimmed(), Text.without()
This commit is contained in:
parent
7dddfb71a0
commit
5b1960859f
2
Makefile
2
Makefile
@ -25,7 +25,7 @@ O=-Og
|
||||
CFLAGS=$(CCONFIG) $(EXTRA) $(CWARN) $(G) $(O) $(OSFLAGS)
|
||||
LDLIBS=-lgc -lcord -lm -lunistring -ldl -L. -ltomo
|
||||
BUILTIN_OBJS=builtins/array.o builtins/bool.o builtins/nums.o builtins/functions.o builtins/integers.o \
|
||||
builtins/pointer.o builtins/memory.o builtins/text.o builtins/c_string.o builtins/table.o \
|
||||
builtins/pointer.o builtins/memory.o builtins/text.o builtins/where.o builtins/c_string.o builtins/table.o \
|
||||
builtins/types.o builtins/util.o builtins/files.o
|
||||
|
||||
all: libtomo.so tomo
|
||||
|
109
builtins/text.c
109
builtins/text.c
@ -24,6 +24,14 @@
|
||||
|
||||
#define CLAMP(x, lo, hi) MIN(hi, MAX(x,lo))
|
||||
|
||||
// Flatten `str` to a C string and NFD-normalize it (including its NUL
// terminator) with u8_normalize().
//   buf/len: caller-provided scratch buffer and its capacity; on return *len
//            holds the length reported by u8_normalize().
// Returns the pointer u8_normalize() produced. Callers in this file compare
// it against `buf` and free() it when it differs.
// Exits the program with an error message if normalization fails.
static inline uint8_t *_normalize(CORD str, uint8_t *buf, size_t *len)
{
    const char *flat = CORD_to_const_char_star(str);
    size_t flat_size = strlen(flat) + 1; // +1 so the NUL terminator is normalized too
    uint8_t *result = u8_normalize(UNINORM_NFD, (const uint8_t*)flat, flat_size, buf, len);
    if (result == NULL)
        errx(1, "Unicode normalization error!");
    return result;
}
|
||||
|
||||
public CORD Text$as_text(const void *text, bool colorize, const TypeInfo *info)
|
||||
{
|
||||
if (!text) return info->TextInfo.lang;
|
||||
@ -111,12 +119,8 @@ public uint32_t Text$hash(const CORD *cord)
|
||||
{
|
||||
if (!*cord) return 0;
|
||||
|
||||
const char *str = CORD_to_const_char_star(*cord);
|
||||
size_t len = strlen(str);
|
||||
uint8_t buf[128] = {0};
|
||||
size_t norm_len = sizeof(buf);
|
||||
uint8_t *normalized = u8_normalize(UNINORM_NFD, (uint8_t*)str, len+1, buf, &norm_len);
|
||||
if (!normalized) errx(1, "Unicode normalization error!");
|
||||
uint8_t buf[128] = {0}; size_t norm_len = sizeof(buf);
|
||||
uint8_t *normalized = _normalize(*cord, buf, &norm_len);
|
||||
|
||||
uint32_t hash;
|
||||
halfsiphash(normalized, norm_len, TOMO_HASH_VECTOR, (uint8_t*)&hash, sizeof(hash));
|
||||
@ -166,51 +170,75 @@ public CORD Text$title(CORD str)
|
||||
return (CORD)u8_totitle((const uint8_t*)str, len-1, uc_locale_language(), NULL, dest, &len);
|
||||
}
|
||||
|
||||
public bool Text$has(CORD str, CORD target, where_e where)
|
||||
// Report whether `str` contains `target` at the position selected by `where`:
//   Where.Start    -> `str` begins with `target`
//   Where.End      -> `str` ends with `target`
//   Where.Anywhere -> `target` occurs at any offset
// Both texts are NFD-normalized via _normalize() before comparing.
// An empty (NULL) target is always found; an empty (NULL) text never contains
// a non-empty target.
public bool Text$has(CORD str, CORD target, Where_t where)
{
    if (!target) return true;
    if (!str) return false;

    uint8_t str_buf[128] = {0}; size_t str_norm_len = sizeof(str_buf);
    uint8_t *str_normalized = _normalize(str, str_buf, &str_norm_len);

    uint8_t target_buf[128] = {0}; size_t target_norm_len = sizeof(target_buf);
    uint8_t *target_normalized = _normalize(target, target_buf, &target_norm_len);

    // Both normalized lengths count the trailing NUL byte.
    bool ret;
    if (target_norm_len > str_norm_len) {
        // Target can't fit. Fall through to the cleanup below instead of
        // returning early, so buffers that are not the stack ones get freed.
        ret = false;
    } else if (where.$tag == $tag$Where$Start) {
        // Compare only the target's bytes (excluding its NUL) against the prefix
        ret = (u8_strncmp(str_normalized, target_normalized, target_norm_len-1) == 0);
    } else if (where.$tag == $tag$Where$End) {
        // Compare the NUL-terminated tail of `str` that has the target's length
        ret = (u8_strcmp(str_normalized + str_norm_len - target_norm_len, target_normalized) == 0);
    } else {
        assert(where.$tag == $tag$Where$Anywhere);
        ret = (u8_strstr(str_normalized, target_normalized) != NULL);
    }

    if (str_normalized != str_buf) free(str_normalized);
    if (target_normalized != target_buf) free(target_normalized);
    return ret;
}
|
||||
|
||||
// Return `str` with occurrences of `target` removed, as selected by `where`:
//   Where.Start    -> strip one leading `target`, if present
//   Where.End      -> strip one trailing `target`, if present
//   Where.Anywhere -> strip every occurrence, scanning left to right
// Returns `str` itself when there is nothing to remove.
// NOTE(review): unlike Text$has(), this compares the raw CORD contents and
// does not normalize first — confirm whether that is intended.
public CORD Text$without(CORD str, CORD target, Where_t where)
{
    if (!str || !target) return str;

    size_t target_len = CORD_len(target);
    size_t str_len = CORD_len(str);

    // A target longer than the text can never match, and checking here keeps
    // `str_len - target_len` below from wrapping around. A zero-length target
    // would leave `i` unchanged after a match at `i` in the scan below.
    if (target_len == 0 || target_len > str_len) return str;

    if (where.$tag == $tag$Where$Start) {
        if (CORD_ncmp(str, 0, target, 0, target_len) == 0)
            return CORD_substr(str, target_len, str_len - target_len);
        return str;
    } else if (where.$tag == $tag$Where$End) {
        if (CORD_ncmp(str, str_len-target_len, target, 0, target_len) == 0)
            return CORD_substr(str, 0, str_len - target_len);
        return str;
    } else {
        CORD ret = CORD_EMPTY;
        size_t i = 0; // Start of the not-yet-copied remainder of `str`
        for (;;) {
            size_t match = CORD_str(str, i, target);
            if (match == CORD_NOT_FOUND) {
                if (i == 0) return str; // No matches!
                ret = CORD_cat(ret, CORD_substr(str, i, str_len));
                break;
            }
            // Keep the text between the previous match and this one
            ret = CORD_cat(ret, CORD_substr(str, i, (match-i)));
            i = match + target_len;
        }
        return ret;
    }
}
|
||||
|
||||
public CORD Text$trimmed(CORD str, CORD skip, where_e where)
|
||||
public CORD Text$trimmed(CORD str, CORD skip, Where_t where)
|
||||
{
|
||||
if (!str || !skip) return str;
|
||||
const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(str);
|
||||
const uint8_t *uskip = (const uint8_t*)CORD_to_const_char_star(skip);
|
||||
if (where == WHERE_START) {
|
||||
// TODO: implement proper reverse iteration with u8_prev()
|
||||
if (where.$tag == $tag$Where$Start) {
|
||||
size_t span = u8_strspn(ustr, uskip);
|
||||
return (CORD)ustr + span;
|
||||
} else if (where == WHERE_END) {
|
||||
} else if (where.$tag == $tag$Where$End) {
|
||||
size_t len = u8_strlen(ustr);
|
||||
const uint8_t *back = ustr + len;
|
||||
size_t back_span = 0;
|
||||
@ -287,12 +315,8 @@ public CORD Text$join(CORD glue, array_t pieces)
|
||||
public array_t Text$clusters(CORD text)
|
||||
{
|
||||
array_t clusters = {.atomic=1};
|
||||
const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(text);
|
||||
uint8_t buf[128] = {0};
|
||||
size_t norm_len = sizeof(buf);
|
||||
uint8_t *normalized = u8_normalize(UNINORM_NFD, ustr, strlen((char*)ustr)+1, buf, &norm_len);
|
||||
if (!normalized) errx(1, "Unicode normalization error!");
|
||||
|
||||
uint8_t buf[128] = {0}; size_t norm_len = sizeof(buf);
|
||||
uint8_t *normalized = _normalize(text, buf, &norm_len);
|
||||
const uint8_t *end = normalized + strlen((char*)normalized);
|
||||
for (const uint8_t *pos = normalized; pos != end; ) {
|
||||
const uint8_t *next = u8_grapheme_next(pos, end);
|
||||
@ -310,11 +334,8 @@ public array_t Text$clusters(CORD text)
|
||||
|
||||
public array_t Text$codepoints(CORD text)
|
||||
{
|
||||
const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(text);
|
||||
uint8_t norm_buf[128] = {0};
|
||||
size_t norm_len = sizeof(norm_buf);
|
||||
uint8_t *normalized = u8_normalize(UNINORM_NFD, ustr, strlen((char*)ustr)+1, norm_buf, &norm_len);
|
||||
if (!normalized) errx(1, "Unicode normalization error!");
|
||||
uint8_t norm_buf[128] = {0}; size_t norm_len = sizeof(norm_buf);
|
||||
uint8_t *normalized = _normalize(text, norm_buf, &norm_len);
|
||||
|
||||
uint32_t codepoint_buf[128] = {0};
|
||||
size_t codepoint_len = sizeof(codepoint_buf);
|
||||
@ -333,11 +354,8 @@ public array_t Text$codepoints(CORD text)
|
||||
|
||||
public array_t Text$bytes(CORD text)
|
||||
{
|
||||
const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(text);
|
||||
uint8_t norm_buf[128] = {0};
|
||||
size_t norm_len = sizeof(norm_buf);
|
||||
uint8_t *normalized = u8_normalize(UNINORM_NFD, ustr, strlen((char*)ustr)+1, norm_buf, &norm_len);
|
||||
if (!normalized) errx(1, "Unicode normalization error!");
|
||||
uint8_t norm_buf[128] = {0}; size_t norm_len = sizeof(norm_buf);
|
||||
uint8_t *normalized = _normalize(text, norm_buf, &norm_len);
|
||||
|
||||
--norm_len; // NUL byte
|
||||
array_t ret = {
|
||||
@ -366,11 +384,8 @@ public int64_t Text$num_clusters(CORD text)
|
||||
|
||||
public int64_t Text$num_codepoints(CORD text)
|
||||
{
|
||||
const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(text);
|
||||
uint8_t buf[128] = {0};
|
||||
size_t norm_len = sizeof(buf);
|
||||
uint8_t *normalized = u8_normalize(UNINORM_NFD, ustr, strlen((char*)ustr)+1, buf, &norm_len);
|
||||
if (!normalized) errx(1, "Unicode normalization error!");
|
||||
uint8_t buf[128] = {0}; size_t norm_len = sizeof(buf);
|
||||
uint8_t *normalized = _normalize(text, buf, &norm_len);
|
||||
int64_t num_codepoints = u8_mbsnlen(normalized, norm_len-1);
|
||||
if (normalized != buf) free(normalized);
|
||||
return num_codepoints;
|
||||
@ -378,10 +393,8 @@ public int64_t Text$num_codepoints(CORD text)
|
||||
|
||||
public int64_t Text$num_bytes(CORD text)
|
||||
{
|
||||
const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(text);
|
||||
uint8_t norm_buf[128] = {0};
|
||||
size_t norm_len = sizeof(norm_buf);
|
||||
uint8_t *normalized = u8_normalize(UNINORM_NFD, ustr, strlen((char*)ustr)+1, norm_buf, &norm_len);
|
||||
uint8_t norm_buf[128] = {0}; size_t norm_len = sizeof(norm_buf);
|
||||
uint8_t *normalized = _normalize(text, norm_buf, &norm_len);
|
||||
--norm_len; // NUL byte
|
||||
if (!normalized) errx(1, "Unicode normalization error!");
|
||||
if (normalized != norm_buf) free(normalized);
|
||||
|
@ -8,11 +8,10 @@
|
||||
#include <stdint.h>
|
||||
|
||||
#include "types.h"
|
||||
#include "where.h"
|
||||
|
||||
#define Text_t CORD
|
||||
|
||||
typedef enum { WHERE_ANYWHERE, WHERE_START, WHERE_END } where_e;
|
||||
|
||||
typedef struct {
|
||||
enum { FIND_FAILURE, FIND_SUCCESS } status;
|
||||
int32_t index;
|
||||
@ -27,9 +26,9 @@ CORD Text$slice(CORD text, int64_t first, int64_t length);
|
||||
CORD Text$upper(CORD str);
|
||||
CORD Text$lower(CORD str);
|
||||
CORD Text$title(CORD str);
|
||||
bool Text$has(CORD str, CORD target, where_e where);
|
||||
CORD Text$without(CORD str, CORD target, where_e where);
|
||||
CORD Text$trimmed(CORD str, CORD skip, where_e where);
|
||||
bool Text$has(CORD str, CORD target, Where_t where);
|
||||
CORD Text$without(CORD str, CORD target, Where_t where);
|
||||
CORD Text$trimmed(CORD str, CORD skip, Where_t where);
|
||||
find_result_t Text$find(CORD str, CORD pat);
|
||||
CORD Text$replace(CORD text, CORD pat, CORD replacement, int64_t limit);
|
||||
array_t Text$split(CORD str, CORD split);
|
||||
|
54
builtins/where.c
Normal file
54
builtins/where.c
Normal file
@ -0,0 +1,54 @@
|
||||
// A type called "Where" that is an enum for "Anywhere", "Start", or "End"
|
||||
// Mainly used for text methods
|
||||
|
||||
#include <gc/cord.h>
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "types.h"
|
||||
#include "where.h"
|
||||
#include "util.h"
|
||||
|
||||
// Text form of the `Anywhere` variant struct.
// With a NULL `obj`, returns the bare name "Anywhere"; otherwise returns
// "Anywhere()" with the name wrapped in ANSI escapes when `use_color` is set.
static CORD Where$Anywhere$as_text(Where$Anywhere_t *obj, bool use_color)
{
    if (obj == NULL)
        return "Anywhere";

    CORD opening = use_color ? "\x1b[0;1mAnywhere\x1b[m(" : "Anywhere(";
    return CORD_all(opening, ")");
}
|
||||
|
||||
// Text form of the `Start` variant struct.
// With a NULL `obj`, returns the bare name "Start"; otherwise returns
// "Start()" with the name wrapped in ANSI escapes when `use_color` is set.
static CORD Where$Start$as_text(Where$Start_t *obj, bool use_color)
{
    if (obj == NULL)
        return "Start";

    CORD opening = use_color ? "\x1b[0;1mStart\x1b[m(" : "Start(";
    return CORD_all(opening, ")");
}
|
||||
|
||||
// Text form of the `End` variant struct.
// With a NULL `obj`, returns the bare name "End"; otherwise returns
// "End()" with the name wrapped in ANSI escapes when `use_color` is set.
static CORD Where$End$as_text(Where$End_t *obj, bool use_color)
{
    if (obj == NULL)
        return "End";

    CORD opening = use_color ? "\x1b[0;1mEnd\x1b[m(" : "End(";
    return CORD_all(opening, ")");
}
|
||||
|
||||
// Text form of a `Where` enum value.
// With a NULL `obj`, returns the type name "Where"; otherwise returns
// "Where.<Tag>" (wrapped in ANSI escapes when `use_color` is set), or the
// empty CORD if the tag is not one of the three known values.
static CORD Where$as_text(Where_t *obj, bool use_color)
{
    if (obj == NULL)
        return "Where";

    if (obj->$tag == $tag$Where$Anywhere) {
        if (use_color) return "\x1b[36;1mWhere.Anywhere\x1b[m";
        return "Where.Anywhere";
    }
    if (obj->$tag == $tag$Where$Start) {
        if (use_color) return "\x1b[36;1mWhere.Start\x1b[m";
        return "Where.Start";
    }
    if (obj->$tag == $tag$Where$End) {
        if (use_color) return "\x1b[36;1mWhere.End\x1b[m";
        return "Where.End";
    }
    return CORD_EMPTY; // Unrecognized tag
}
|
||||
|
||||
public const Where_t Where$tagged$Anywhere = {$tag$Where$Anywhere};
|
||||
public const Where_t Where$tagged$Start = {$tag$Where$Start};
|
||||
public const Where_t Where$tagged$End = {$tag$Where$End};
|
||||
public const TypeInfo Where$Anywhere = {0, 0, {.tag=CustomInfo, .CustomInfo={.as_text=(void*)Where$Anywhere$as_text}}};
|
||||
public const TypeInfo Where$Start = {0, 0, {.tag=CustomInfo, .CustomInfo={.as_text=(void*)Where$Start$as_text}}};
|
||||
public const TypeInfo Where$End = {0, 0, {.tag=CustomInfo, .CustomInfo={.as_text=(void*)Where$End$as_text}}};
|
||||
public const TypeInfo Where = {4, 4, {.tag=CustomInfo, .CustomInfo={.as_text=(void*)Where$as_text}}};
|
||||
|
||||
// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0
|
37
builtins/where.h
Normal file
37
builtins/where.h
Normal file
@ -0,0 +1,37 @@
|
||||
#pragma once
|
||||
|
||||
// Type info and methods for Where datatype (Anywhere, Start, or End enum)
|
||||
// Mainly used for text methods.
|
||||
|
||||
#include <gc/cord.h>
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "types.h"
|
||||
|
||||
typedef struct Where_s Where_t;
|
||||
extern const TypeInfo Where;
|
||||
typedef struct Where$Anywhere_s Where$Anywhere_t;
|
||||
extern const TypeInfo Where$Anywhere;
|
||||
typedef struct Where$Start_s Where$Start_t;
|
||||
extern const TypeInfo Where$Start;
|
||||
typedef struct Where$End_s Where$End_t;
|
||||
extern const TypeInfo Where$End;
|
||||
|
||||
struct Where$Anywhere_s {};
|
||||
struct Where$Start_s {};
|
||||
struct Where$End_s {};
|
||||
struct Where_s {
|
||||
enum { $tag$Where$Anywhere = 0, $tag$Where$Start = 1, $tag$Where$End = 2 } $tag;
|
||||
union {
|
||||
Where$Anywhere_t Anywhere;
|
||||
Where$Start_t Start;
|
||||
Where$End_t End;
|
||||
};
|
||||
};
|
||||
|
||||
extern const Where_t Where$tagged$Anywhere;
|
||||
extern const Where_t Where$tagged$Start;
|
||||
extern const Where_t Where$tagged$End;
|
||||
|
||||
// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0
|
@ -42,6 +42,21 @@ env_t *new_compilation_unit(void)
|
||||
const char *name, *code, *type_str;
|
||||
} ns_entry_t;
|
||||
|
||||
type_t *where;
|
||||
{
|
||||
env_t *where_env = namespace_env(env, "Where");
|
||||
type_t *anywhere = Type(StructType, .name="Anywhere");
|
||||
type_t *start = Type(StructType, .name="Start");
|
||||
type_t *end = Type(StructType, .name="End");
|
||||
where = Type(EnumType, .name="Where", .env=where_env,
|
||||
.tags=new(tag_t, .name="Anywhere", .tag_value=0, .type=anywhere,
|
||||
.next=new(tag_t, .name="Start", .tag_value=0, .type=start,
|
||||
.next=new(tag_t, .name="End", .tag_value=0, .type=end))));
|
||||
set_binding(where_env, "Anywhere", new(binding_t, .type=where, .code="Where$tagged$Anywhere"));
|
||||
set_binding(where_env, "Start", new(binding_t, .type=where, .code="Where$tagged$Start"));
|
||||
set_binding(where_env, "End", new(binding_t, .type=where, .code="Where$tagged$End"));
|
||||
}
|
||||
|
||||
struct {
|
||||
const char *name;
|
||||
type_t *type;
|
||||
@ -157,6 +172,7 @@ env_t *new_compilation_unit(void)
|
||||
#undef F2
|
||||
#undef F
|
||||
#undef C
|
||||
{"Where", where, "Where_t", "Where", {}},
|
||||
{"Text", TEXT_TYPE, "Text_t", "$Text", TypedArray(ns_entry_t,
|
||||
{"slice", "Text$slice", "func(text:Text, index:Int, length=Int.max)->Text"},
|
||||
{"quoted", "Text$quoted", "func(text:Text, color=no)->Text"},
|
||||
@ -165,9 +181,9 @@ env_t *new_compilation_unit(void)
|
||||
{"title", "Text$title", "func(text:Text)->Text"},
|
||||
{"as_c_string", "CORD_to_char_star", "func(text:Text)->CString"},
|
||||
{"from_c_string", "CORD_from_char_star", "func(str:CString)->Text"},
|
||||
// {"has", "Text$has", "func(text:Text, target:Text, where=ANYWHERE)->Bool"},
|
||||
// {"without", "Text$without", "func(text:Text, target:Text, where=ANYWHERE)->Text"},
|
||||
// {"trimmed", "Text$without", "func(text:Text, skip:Text, where=ANYWHERE)->Text"},
|
||||
{"has", "Text$has", "func(text:Text, target:Text, where=Where.Anywhere)->Bool"},
|
||||
{"without", "Text$without", "func(text:Text, target:Text, where=Where.Anywhere)->Text"},
|
||||
{"trimmed", "Text$trimmed", "func(text:Text, trim=\" {\\n\\r\\t}\", where=Where.Anywhere)->Text"},
|
||||
{"title", "Text$title", "func(text:Text)->Text"},
|
||||
// {"find", "Text$find", "func(text:Text, pattern:Text)->FindResult"},
|
||||
{"replace", "Text$replace", "func(text:Text, pattern:Text, replacement:Text, limit=Int.max)->Text"},
|
||||
@ -184,7 +200,20 @@ env_t *new_compilation_unit(void)
|
||||
};
|
||||
|
||||
for (size_t i = 0; i < sizeof(global_types)/sizeof(global_types[0]); i++) {
|
||||
env_t *ns_env = global_types[i].type == TEXT_TYPE ? Match(TEXT_TYPE, TextType)->env : namespace_env(env, global_types[i].name);
|
||||
env_t *ns_env = NULL;
|
||||
switch (global_types[i].type->tag) {
|
||||
case TextType:
|
||||
ns_env = Match(global_types[i].type, TextType)->env;
|
||||
break;
|
||||
case StructType:
|
||||
ns_env = Match(global_types[i].type, StructType)->env;
|
||||
break;
|
||||
case EnumType:
|
||||
ns_env = Match(global_types[i].type, EnumType)->env;
|
||||
break;
|
||||
default: break;
|
||||
}
|
||||
if (ns_env == NULL) ns_env = namespace_env(env, global_types[i].name);
|
||||
binding_t *binding = new(binding_t, .type=Type(TypeInfoType, .name=global_types[i].name, .type=global_types[i].type, .env=ns_env));
|
||||
Table$str_set(env->globals, global_types[i].name, binding);
|
||||
Table$str_set(env->types, global_types[i].name, global_types[i].type);
|
||||
|
37
test/text.tm
37
test/text.tm
@ -55,3 +55,40 @@ func main():
|
||||
|
||||
>> "Hello":replace("e", "X")
|
||||
= "HXllo"
|
||||
|
||||
>> "Hello":has("l")
|
||||
= yes
|
||||
>> "Hello":has("l", End)
|
||||
= no
|
||||
>> "Hello":has("l", Start)
|
||||
= no
|
||||
|
||||
>> "Hello":has("o")
|
||||
= yes
|
||||
>> "Hello":has("o", where=End)
|
||||
= yes
|
||||
>> "Hello":has("o", where=Start)
|
||||
= no
|
||||
|
||||
>> "Hello":has("H")
|
||||
= yes
|
||||
>> "Hello":has("H", End)
|
||||
= no
|
||||
>> "Hello":has("H", Start)
|
||||
= yes
|
||||
|
||||
>> "Hello":without("l")
|
||||
= "Heo"
|
||||
>> "xxxx":without("x")
|
||||
= ""
|
||||
>> "xxxx":without("y")
|
||||
= "xxxx"
|
||||
>> "One two three four five six":without("e ")
|
||||
= "Ontwo threfour fivsix"
|
||||
|
||||
>> " one ":trimmed()
|
||||
= "one"
|
||||
>> " one ":trimmed(" aeiou")
|
||||
= "n"
|
||||
|
||||
>> amelie:has(amelie2)
|
||||
|
Loading…
Reference in New Issue
Block a user