aboutsummaryrefslogtreecommitdiff
path: root/builtins
diff options
context:
space:
mode:
authorBruce Hill <bruce@bruce-hill.com>2024-09-05 02:56:56 -0400
committerBruce Hill <bruce@bruce-hill.com>2024-09-05 02:56:56 -0400
commite3cc4f88e015a2ca06a5b53c3908ed45d2d6daf2 (patch)
tree131cd23152ab35d63ef9aa317e7b479742faae0a /builtins
parentfa5ca582ffc543b8093762d2e0c876b3c6068351 (diff)
Be extra correct about prefix codepoints
Diffstat (limited to 'builtins')
-rw-r--r--builtins/text.c29
1 files changed, 19 insertions, 10 deletions
diff --git a/builtins/text.c b/builtins/text.c
index 54b9d3d5..59063a58 100644
--- a/builtins/text.c
+++ b/builtins/text.c
@@ -77,6 +77,7 @@
#include "siphash.c"
typedef struct {
+ uint32_t main_codepoint;
uint32_t *utf32_cluster; // length-prefixed
const uint8_t *utf8;
} synthetic_grapheme_t;
@@ -91,6 +92,7 @@ typedef struct {
static table_t grapheme_ids_by_codepoints = {}; // uint32_t* length-prefixed codepoints -> int32_t ID
static synthetic_grapheme_t crlf_grapheme = {
+ .main_codepoint='\n',
.utf32_cluster=(uint32_t[]){2, '\r', '\n'},
.utf8=(uint8_t[]){'\r', '\n', '\0'},
};
@@ -99,6 +101,7 @@ static synthetic_grapheme_t crlf_grapheme = {
static synthetic_grapheme_t *synthetic_graphemes = &crlf_grapheme;
static int32_t synthetic_grapheme_capacity = 1;
+#define MAIN_GRAPHEME_CODEPOINT(g) ((g) >= 0 ? (uint32_t)(g) : synthetic_graphemes[-(g)-1].main_codepoint)
#define NUM_GRAPHEME_CODEPOINTS(id) (synthetic_graphemes[-(id)-1].utf32_cluster[0])
#define GRAPHEME_CODEPOINTS(id) (&synthetic_graphemes[-(id)-1].utf32_cluster[1])
#define GRAPHEME_UTF8(id) (synthetic_graphemes[-(id)-1].utf8)
@@ -201,6 +204,16 @@ int32_t get_synthetic_grapheme(const uint32_t *codepoints, int64_t utf32_len)
synthetic_graphemes[-grapheme_id-1].utf8 = utf8_final;
arena += sizeof(uint8_t[u8_len + 1]);
+ // Sickos at the unicode consortium decreed that you can have grapheme clusters
+ // that begin with *prefix* modifiers, so we gotta check for that case:
+ synthetic_graphemes[-grapheme_id-1].main_codepoint = length_prefixed[1];
+ for (uint32_t i = 0; i < utf32_len; i++) {
+ if (!__builtin_expect(uc_is_property_prepended_concatenation_mark(length_prefixed[1+i]), 0)) {
+ synthetic_graphemes[-grapheme_id-1].main_codepoint = length_prefixed[1+i];
+ break;
+ }
+ }
+
// Cleanup from unicode API:
if (u8 != u8_buf) free(u8);
@@ -927,8 +940,8 @@ static inline bool match_property(Text_t text, int64_t *i, uc_property_t prop)
{
if (*i >= text.length) return false;
int32_t grapheme = get_grapheme(text, *i);
- if (grapheme < 0) // TODO: check every codepoint in the cluster?
- grapheme = GRAPHEME_CODEPOINTS(grapheme)[0];
+ // TODO: check every codepoint in the cluster?
+ grapheme = MAIN_GRAPHEME_CODEPOINT(grapheme);
if (uc_is_property(grapheme, prop)) {
*i += 1;
@@ -943,8 +956,7 @@ static int64_t parse_int(Text_t text, int64_t *i)
int64_t value = 0;
for (;; *i += 1) {
int32_t grapheme = _next_grapheme(text, &state, *i);
- if (grapheme < 0)
- grapheme = GRAPHEME_CODEPOINTS(grapheme)[0];
+ grapheme = MAIN_GRAPHEME_CODEPOINT(grapheme);
int digit = uc_digit_value(grapheme);
if (digit < 0) break;
if (value >= INT64_MAX/10) break;
@@ -1010,8 +1022,7 @@ int64_t match_email(Text_t text, int64_t text_index)
iteration_state_t state = {0, 0};
if (text_index > 0) {
int32_t prev_codepoint = _next_grapheme(text, &state, text_index - 1);
- if (prev_codepoint < 0)
- prev_codepoint = GRAPHEME_CODEPOINTS(prev_codepoint)[0];
+ prev_codepoint = MAIN_GRAPHEME_CODEPOINT(prev_codepoint);
if (uc_is_property_alphabetic(prev_codepoint))
return -1;
}
@@ -1127,8 +1138,7 @@ int64_t match_uri(Text_t text, int64_t text_index)
iteration_state_t state = {0, 0};
if (text_index > 0) {
int32_t prev_codepoint = _next_grapheme(text, &state, text_index - 1);
- if (prev_codepoint < 0)
- prev_codepoint = GRAPHEME_CODEPOINTS(prev_codepoint)[0];
+ prev_codepoint = MAIN_GRAPHEME_CODEPOINT(prev_codepoint);
if (uc_is_property_alphabetic(prev_codepoint))
return -1;
}
@@ -1477,8 +1487,7 @@ int64_t match(Text_t text, Pattern_t pattern, int64_t text_index, int64_t patter
for (int64_t count = 0; count < max; ) {
int32_t grapheme = _next_grapheme(text, &text_state, text_index);
- if (grapheme < 0)
- grapheme = GRAPHEME_CODEPOINTS(grapheme)[0];
+ grapheme = MAIN_GRAPHEME_CODEPOINT(grapheme);
bool success;
if (any) {