diff options
Diffstat (limited to 'builtins/text.c')
| -rw-r--r-- | builtins/text.c | 29 |
1 files changed, 19 insertions, 10 deletions
diff --git a/builtins/text.c b/builtins/text.c index 54b9d3d5..59063a58 100644 --- a/builtins/text.c +++ b/builtins/text.c @@ -77,6 +77,7 @@ #include "siphash.c" typedef struct { + uint32_t main_codepoint; uint32_t *utf32_cluster; // length-prefixed const uint8_t *utf8; } synthetic_grapheme_t; @@ -91,6 +92,7 @@ typedef struct { static table_t grapheme_ids_by_codepoints = {}; // uint32_t* length-prefixed codepoints -> int32_t ID static synthetic_grapheme_t crlf_grapheme = { + .main_codepoint='\n', .utf32_cluster=(uint32_t[]){2, '\r', '\n'}, .utf8=(uint8_t[]){'\r', '\n', '\0'}, }; @@ -99,6 +101,7 @@ static synthetic_grapheme_t crlf_grapheme = { static synthetic_grapheme_t *synthetic_graphemes = &crlf_grapheme; static int32_t synthetic_grapheme_capacity = 1; +#define MAIN_GRAPHEME_CODEPOINT(g) ((g) >= 0 ? (uint32_t)(g) : synthetic_graphemes[-(g)-1].main_codepoint) #define NUM_GRAPHEME_CODEPOINTS(id) (synthetic_graphemes[-(id)-1].utf32_cluster[0]) #define GRAPHEME_CODEPOINTS(id) (&synthetic_graphemes[-(id)-1].utf32_cluster[1]) #define GRAPHEME_UTF8(id) (synthetic_graphemes[-(id)-1].utf8) @@ -201,6 +204,16 @@ int32_t get_synthetic_grapheme(const uint32_t *codepoints, int64_t utf32_len) synthetic_graphemes[-grapheme_id-1].utf8 = utf8_final; arena += sizeof(uint8_t[u8_len + 1]); + // Sickos at the unicode consortium decreed that you can have grapheme clusters + // that begin with *prefix* modifiers, so we gotta check for that case: + synthetic_graphemes[-grapheme_id-1].main_codepoint = length_prefixed[1]; + for (uint32_t i = 0; i < utf32_len; i++) { + if (!__builtin_expect(uc_is_property_prepended_concatenation_mark(length_prefixed[1+i]), 0)) { + synthetic_graphemes[-grapheme_id-1].main_codepoint = length_prefixed[1+i]; + break; + } + } + // Cleanup from unicode API: if (u8 != u8_buf) free(u8); @@ -927,8 +940,8 @@ static inline bool match_property(Text_t text, int64_t *i, uc_property_t prop) { if (*i >= text.length) return false; int32_t grapheme = get_grapheme(text, *i); - if (grapheme < 0) // TODO: check every codepoint in the cluster? - grapheme = GRAPHEME_CODEPOINTS(grapheme)[0]; + // TODO: check every codepoint in the cluster? + grapheme = MAIN_GRAPHEME_CODEPOINT(grapheme); if (uc_is_property(grapheme, prop)) { *i += 1; @@ -943,8 +956,7 @@ static int64_t parse_int(Text_t text, int64_t *i) int64_t value = 0; for (;; *i += 1) { int32_t grapheme = _next_grapheme(text, &state, *i); - if (grapheme < 0) - grapheme = GRAPHEME_CODEPOINTS(grapheme)[0]; + grapheme = MAIN_GRAPHEME_CODEPOINT(grapheme); int digit = uc_digit_value(grapheme); if (digit < 0) break; if (value >= INT64_MAX/10) break; @@ -1010,8 +1022,7 @@ int64_t match_email(Text_t text, int64_t text_index) iteration_state_t state = {0, 0}; if (text_index > 0) { int32_t prev_codepoint = _next_grapheme(text, &state, text_index - 1); - if (prev_codepoint < 0) - prev_codepoint = GRAPHEME_CODEPOINTS(prev_codepoint)[0]; + prev_codepoint = MAIN_GRAPHEME_CODEPOINT(prev_codepoint); if (uc_is_property_alphabetic(prev_codepoint)) return -1; } @@ -1127,8 +1138,7 @@ int64_t match_uri(Text_t text, int64_t text_index) iteration_state_t state = {0, 0}; if (text_index > 0) { int32_t prev_codepoint = _next_grapheme(text, &state, text_index - 1); - if (prev_codepoint < 0) - prev_codepoint = GRAPHEME_CODEPOINTS(prev_codepoint)[0]; + prev_codepoint = MAIN_GRAPHEME_CODEPOINT(prev_codepoint); if (uc_is_property_alphabetic(prev_codepoint)) return -1; } @@ -1477,8 +1487,7 @@ int64_t match(Text_t text, Pattern_t pattern, int64_t text_index, int64_t patter for (int64_t count = 0; count < max; ) { int32_t grapheme = _next_grapheme(text, &text_state, text_index); - if (grapheme < 0) - grapheme = GRAPHEME_CODEPOINTS(grapheme)[0]; + grapheme = MAIN_GRAPHEME_CODEPOINT(grapheme); bool success; if (any) { |
