Add methods for text:clusters(), text:codepoints(), text:bytes()
This commit is contained in:
parent
dc04286e3a
commit
1f6aa4cac7
@ -8,9 +8,10 @@
|
|||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <sys/param.h>
|
#include <sys/param.h>
|
||||||
#include <unistr.h>
|
|
||||||
#include <unicase.h>
|
#include <unicase.h>
|
||||||
|
#include <unigbrk.h>
|
||||||
#include <uninorm.h>
|
#include <uninorm.h>
|
||||||
|
#include <unistr.h>
|
||||||
|
|
||||||
#include "../SipHash/halfsiphash.h"
|
#include "../SipHash/halfsiphash.h"
|
||||||
#include "array.h"
|
#include "array.h"
|
||||||
@ -110,6 +111,7 @@ public uint32_t Text__hash(CORD *cord)
|
|||||||
uint8_t buf[128] = {0};
|
uint8_t buf[128] = {0};
|
||||||
size_t norm_len = sizeof(buf)-1;
|
size_t norm_len = sizeof(buf)-1;
|
||||||
uint8_t *normalized = u8_normalize(UNINORM_NFD, (uint8_t*)str, len, buf, &norm_len);
|
uint8_t *normalized = u8_normalize(UNINORM_NFD, (uint8_t*)str, len, buf, &norm_len);
|
||||||
|
if (!normalized) errx(1, "Unicode normalization error!");
|
||||||
|
|
||||||
uint32_t hash;
|
uint32_t hash;
|
||||||
halfsiphash(normalized, norm_len, SSS_HASH_VECTOR, (uint8_t*)&hash, sizeof(hash));
|
halfsiphash(normalized, norm_len, SSS_HASH_VECTOR, (uint8_t*)&hash, sizeof(hash));
|
||||||
@ -259,6 +261,72 @@ public CORD Text__join(CORD glue, array_t pieces)
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Split `text` into its extended grapheme clusters.
//
// The text is first normalized to NFD, then walked cluster by cluster with
// u8_grapheme_next(). Each cluster is copied into its own CORD and inserted
// into the returned array.
//
// Exits the program via errx() if normalization fails.
public array_t Text__clusters(CORD text)
{
    array_t clusters = {.atomic=1};
    const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(text);
    uint8_t buf[128] = {0};
    size_t norm_len = sizeof(buf)-1;
    uint8_t *normalized = u8_normalize(UNINORM_NFD, ustr, strlen((char*)ustr), buf, &norm_len);
    if (!normalized) errx(1, "Unicode normalization error!");

    // u8_normalize() reports the result length through norm_len and does not
    // promise a NUL terminator (in particular when it returns a freshly
    // allocated buffer), so the end must be derived from norm_len, never
    // from strlen().
    const uint8_t *end = normalized + norm_len;
    for (const uint8_t *pos = normalized; pos < end; ) {
        const uint8_t *next = u8_grapheme_next(pos, end);
        if (!next) break; // Only expected when pos == end; guard against walking from NULL.

        size_t len = (size_t)(next - pos);
        // CORD_from_char_star() needs a NUL-terminated string, so copy exactly
        // `len` bytes and terminate explicitly instead of scanning the source.
        char cluster_buf[len+1];
        memcpy(cluster_buf, pos, len);
        cluster_buf[len] = '\0';

        CORD cluster = CORD_from_char_star(cluster_buf);
        Array__insert(&clusters, &cluster, 0, $ArrayInfo(&Text));
        pos = next;
    }

    // The result is heap-allocated only when it did not fit in `buf`.
    if (normalized != buf) free(normalized);
    return clusters;
}
|
||||||
|
|
||||||
|
// Return the Unicode code points of `text` (after NFD normalization) as an
// array of 32-bit values stored in GC-managed atomic memory.
//
// Exits the program via errx() if normalization or UTF-8 -> UTF-32
// conversion fails.
public array_t Text__codepoints(CORD text)
{
    const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(text);
    uint8_t norm_buf[128] = {0};
    size_t norm_len = sizeof(norm_buf)-1;
    uint8_t *normalized = u8_normalize(UNINORM_NFD, ustr, strlen((char*)ustr), norm_buf, &norm_len);
    if (!normalized) errx(1, "Unicode normalization error!");

    uint32_t codepoint_buf[128] = {0};
    // u8_to_u32() takes the buffer capacity in uint32_t units, not bytes.
    // Passing sizeof(codepoint_buf) would claim ~4x the real capacity and let
    // the conversion write past the end of the stack buffer.
    size_t codepoint_len = sizeof(codepoint_buf)/sizeof(codepoint_buf[0]);
    uint32_t *codepoints = u8_to_u32(normalized, norm_len, codepoint_buf, &codepoint_len);
    if (!codepoints) errx(1, "Unicode conversion error!");

    array_t ret = {
        .length=codepoint_len,
        .data=memcpy(GC_MALLOC_ATOMIC(sizeof(int32_t)*codepoint_len), codepoints, sizeof(int32_t)*codepoint_len),
        .stride=sizeof(int32_t),
        .atomic=1,
    };

    // Either result is heap-allocated only when it did not fit its stack buffer.
    if (normalized != norm_buf) free(normalized);
    if (codepoints != codepoint_buf) free(codepoints);
    return ret;
}
|
||||||
|
|
||||||
|
// Return the UTF-8 bytes of `text` after NFD normalization, copied into
// GC-managed atomic memory and wrapped in an array with a one-byte stride.
//
// Exits the program via errx() if normalization fails.
public array_t Text__bytes(CORD text)
{
    uint8_t stack_buf[128] = {0};
    size_t nbytes = sizeof(stack_buf)-1;

    const char *cstr = CORD_to_const_char_star(text);
    uint8_t *nfd = u8_normalize(UNINORM_NFD, (const uint8_t*)cstr, strlen(cstr), stack_buf, &nbytes);
    if (nfd == NULL)
        errx(1, "Unicode normalization error!");

    // Copy into GC memory before releasing the (possibly heap-allocated) result.
    void *copy = GC_MALLOC_ATOMIC(sizeof(uint8_t)*nbytes);
    memcpy(copy, nfd, sizeof(uint8_t)*nbytes);

    // Heap-allocated only when the normalized form did not fit in stack_buf.
    if (nfd != stack_buf)
        free(nfd);

    array_t ret = {
        .length=nbytes,
        .data=copy,
        .stride=sizeof(uint8_t),
        .atomic=1,
    };
    return ret;
}
|
||||||
|
|
||||||
public const TypeInfo Text = {
|
public const TypeInfo Text = {
|
||||||
.size=sizeof(CORD),
|
.size=sizeof(CORD),
|
||||||
.align=__alignof__(CORD),
|
.align=__alignof__(CORD),
|
||||||
|
@ -29,6 +29,9 @@ find_result_t Text__find(CORD str, CORD pat);
|
|||||||
CORD Text__replace(CORD text, CORD pat, CORD replacement, int64_t limit);
|
CORD Text__replace(CORD text, CORD pat, CORD replacement, int64_t limit);
|
||||||
array_t Text__split(CORD str, CORD split);
|
array_t Text__split(CORD str, CORD split);
|
||||||
CORD Text__join(CORD glue, array_t pieces);
|
CORD Text__join(CORD glue, array_t pieces);
|
||||||
|
array_t Text__clusters(CORD text);
|
||||||
|
array_t Text__codepoints(CORD text);
|
||||||
|
array_t Text__bytes(CORD text);
|
||||||
|
|
||||||
extern const TypeInfo Text;
|
extern const TypeInfo Text;
|
||||||
|
|
||||||
|
@ -147,18 +147,21 @@ env_t *new_compilation_unit(void)
|
|||||||
#undef F
|
#undef F
|
||||||
#undef C
|
#undef C
|
||||||
{"Text", Type(TextType), "Text_t", "Text", $TypedArray(ns_entry_t,
|
{"Text", Type(TextType), "Text_t", "Text", $TypedArray(ns_entry_t,
|
||||||
{"quoted", "Text__quoted", "func(s:Text, color=no)->Text"},
|
{"quoted", "Text__quoted", "func(text:Text, color=no)->Text"},
|
||||||
{"upper", "Text__upper", "func(s:Text)->Text"},
|
{"upper", "Text__upper", "func(text:Text)->Text"},
|
||||||
{"lower", "Text__lower", "func(s:Text)->Text"},
|
{"lower", "Text__lower", "func(text:Text)->Text"},
|
||||||
{"title", "Text__title", "func(s:Text)->Text"},
|
{"title", "Text__title", "func(text:Text)->Text"},
|
||||||
// {"has", "Text__has", "func(s:Text, target:Text, where=ANYWHERE)->Bool"},
|
// {"has", "Text__has", "func(text:Text, target:Text, where=ANYWHERE)->Bool"},
|
||||||
// {"without", "Text__without", "func(s:Text, target:Text, where=ANYWHERE)->Text"},
|
// {"without", "Text__without", "func(text:Text, target:Text, where=ANYWHERE)->Text"},
|
||||||
// {"trimmed", "Text__without", "func(s:Text, skip:Text, where=ANYWHERE)->Text"},
|
// {"trimmed", "Text__without", "func(text:Text, skip:Text, where=ANYWHERE)->Text"},
|
||||||
{"title", "Text__title", "func(s:Text)->Text"},
|
{"title", "Text__title", "func(text:Text)->Text"},
|
||||||
// {"find", "Text__find", "func(s:Text, pattern:Text)->FindResult"},
|
// {"find", "Text__find", "func(text:Text, pattern:Text)->FindResult"},
|
||||||
{"replace", "Text__replace", "func(s:Text, pattern:Text, replacement:Text, limit=Int.max)->Text"},
|
{"replace", "Text__replace", "func(text:Text, pattern:Text, replacement:Text, limit=Int.max)->Text"},
|
||||||
{"split", "Text__split", "func(s:Text, split:Text)->[Text]"},
|
{"split", "Text__split", "func(text:Text, split:Text)->[Text]"},
|
||||||
{"join", "Text__join", "func(glue:Text, pieces:[Text])->Text"},
|
{"join", "Text__join", "func(glue:Text, pieces:[Text])->Text"},
|
||||||
|
{"clusters", "Text__clusters", "func(text:Text)->[Text]"},
|
||||||
|
{"codepoints", "Text__codepoints", "func(text:Text)->[Int32]"},
|
||||||
|
{"bytes", "Text__bytes", "func(text:Text)->[Int8]"},
|
||||||
)},
|
)},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -14,3 +14,11 @@
|
|||||||
|
|
||||||
>> \UE9 == \U65\U301
|
>> \UE9 == \U65\U301
|
||||||
= yes
|
= yes
|
||||||
|
|
||||||
|
>> amelie := "Amélie"
|
||||||
|
>> amelie:clusters()
|
||||||
|
= ["A", "m", "é", "l", "i", "e"] : [Text]
|
||||||
|
>> amelie:codepoints()
|
||||||
|
= [65_i32, 109_i32, 101_i32, 769_i32, 108_i32, 105_i32, 101_i32] : [Int32]
|
||||||
|
>> amelie:bytes()
|
||||||
|
= [65_i8, 109_i8, 101_i8, -52_i8, -127_i8, 108_i8, 105_i8, 101_i8] : [Int8]
|
||||||
|
Loading…
Reference in New Issue
Block a user