diff options
| author | Bruce Hill <bruce@bruce-hill.com> | 2024-09-05 03:58:20 -0400 |
|---|---|---|
| committer | Bruce Hill <bruce@bruce-hill.com> | 2024-09-05 03:58:20 -0400 |
| commit | 41e4ddb9736e84bda457a75f51307ca0bc4bc9ab (patch) | |
| tree | 2171a41551188f38b8cdab1cff533ecd58fe91ef | |
| parent | 73df39ff7e769c705496cf5eb0ba4681e967dcec (diff) | |
Clear out some CRLF crufttext2
| -rw-r--r-- | builtins/text.c | 60 |
1 files changed, 17 insertions, 43 deletions
diff --git a/builtins/text.c b/builtins/text.c index 19a7916b..1871fb1c 100644 --- a/builtins/text.c +++ b/builtins/text.c @@ -91,24 +91,16 @@ typedef struct { // Synthetic grapheme clusters (clusters of more than one codepoint): static table_t grapheme_ids_by_codepoints = {}; // uint32_t* length-prefixed codepoints -> int32_t ID -static synthetic_grapheme_t crlf_grapheme = { - .main_codepoint='\n', - .utf32_cluster=(uint32_t[]){2, '\r', '\n'}, - .utf8=(uint8_t[]){'\r', '\n', '\0'}, -}; -// This will hold a dynamically growing array of synthetic graphemes, but for -// now we can just start it off with an array of one: -static synthetic_grapheme_t *synthetic_graphemes = &crlf_grapheme; -static int32_t synthetic_grapheme_capacity = 1; +// This will hold a dynamically growing array of synthetic graphemes: +static synthetic_grapheme_t *synthetic_graphemes = NULL; +static int32_t synthetic_grapheme_capacity = 0; +static int32_t num_synthetic_graphemes = 0; #define MAIN_GRAPHEME_CODEPOINT(g) ((g) >= 0 ? (uint32_t)(g) : synthetic_graphemes[-(g)-1].main_codepoint) #define NUM_GRAPHEME_CODEPOINTS(id) (synthetic_graphemes[-(id)-1].utf32_cluster[0]) #define GRAPHEME_CODEPOINTS(id) (&synthetic_graphemes[-(id)-1].utf32_cluster[1]) #define GRAPHEME_UTF8(id) (synthetic_graphemes[-(id)-1].utf8) -static int32_t num_synthetic_graphemes = 1; -const int32_t CRLF_GRAPHEME = -1; - static int32_t get_grapheme(Text_t text, int64_t index); static int32_t _next_grapheme(Text_t text, iteration_state_t *state, int64_t index); static Text_t text_from_u32(uint32_t *codepoints, int64_t num_codepoints, bool normalize); @@ -139,9 +131,6 @@ static const TypeInfo GraphemeIDLookupTableInfo = { int32_t get_synthetic_grapheme(const uint32_t *codepoints, int64_t utf32_len) { - if (utf32_len == 2 && codepoints[0] == '\r' && codepoints[1] == '\n') - return CRLF_GRAPHEME; - uint32_t length_prefixed[1+utf32_len] = {}; length_prefixed[0] = (uint32_t)utf32_len; for (int i = 0; i < utf32_len; i++) @@ -149,8 +138,8 @@ int32_t get_synthetic_grapheme(const uint32_t *codepoints, int64_t utf32_len) uint32_t *ptr = &length_prefixed[0]; // Optimization for common case of one frequently used synthetic grapheme: - static int32_t last_grapheme = CRLF_GRAPHEME; - if (graphemes_equal(&ptr, &synthetic_graphemes[-last_grapheme-1].utf32_cluster)) + static int32_t last_grapheme = 0; + if (last_grapheme != 0 && graphemes_equal(&ptr, &synthetic_graphemes[-last_grapheme-1].utf32_cluster)) return last_grapheme; int32_t *found = Table$get(grapheme_ids_by_codepoints, &ptr, &GraphemeIDLookupTableInfo); @@ -302,50 +291,34 @@ public int Text$print(FILE *stream, Text_t t) static bool is_concat_stable(Text_t a, Text_t b) { - /* If either string is empty, we're good. */ if (a.length == 0 || b.length == 0) return true; - /* Get first and last graphemes of the strings. */ int32_t last_a = get_grapheme(a, a.length-1); int32_t first_b = get_grapheme(b, 0); - if (first_b == '\n') - /* If we see \r + \n we need to renormalize. Otherwise we're good */ - return (last_a != '\r'); - - /* As a control code we are always going to break if we see one of these. - * Check first_b for speeding up line endings */ - if (first_b == CRLF_GRAPHEME || last_a == CRLF_GRAPHEME) - return 0; - - /* If either is synthetic other than "\r\n", assume we'll have to re-normalize - * (this is an over-estimate, most likely). Note if you optimize this that it - * serves as a guard for what follows. - * TODO get the last codepoint of last_a and first codepoint of first_b and call - * MVM_unicode_normalize_should_break */ + // Synthetic graphemes are weird and probably need to check with normalization: if (last_a < 0 || first_b < 0) return 0; - /* If both less than the first significant char for NFC we are good */ + // Magic number, we know that no codepoints below here trigger instability: static const int32_t LOWEST_CODEPOINT_TO_CHECK = 0x300; if (last_a < LOWEST_CODEPOINT_TO_CHECK && first_b < LOWEST_CODEPOINT_TO_CHECK) return true; - /* Check if the two codepoints would be joined during normalization. - * Returns 1 if they would break and thus is safe under concat, or 0 if - * they would be joined. */ + // Do a normalization run for these two codepoints and see if it looks different: uint32_t codepoints[2] = {(uint32_t)last_a, (uint32_t)first_b}; - - // Normalization should not exceed 3x in the input length - uint32_t norm_buf[3*2]; + uint32_t norm_buf[3*2]; // Normalization should not exceed 3x in the input length size_t norm_length = sizeof(norm_buf)/sizeof(norm_buf[0]); uint32_t *normalized = u32_normalize(UNINORM_NFC, codepoints, 2, norm_buf, &norm_length); if (norm_length != 2) { + // Looks like these two codepoints merged into one (or maybe had a child, who knows?) if (normalized != norm_buf) free(normalized); return false; } + // If there's still two codepoints, we might end up with a single grapheme + // cluster which will need to turn into a synthetic grapheme: const void *second_grapheme = u32_grapheme_next(normalized, &normalized[2]); if (normalized != norm_buf) free(normalized); return (second_grapheme == &normalized[1]); @@ -1403,7 +1376,7 @@ int64_t match(Text_t text, Pattern_t pattern, int64_t text_index, int64_t patter skip_whitespace(pattern, &pattern_index); - bool any = false; + bool any = false, crlf = false; uc_property_t prop; int32_t specific_grapheme = UNINAME_INVALID; bool want_to_match = !match_grapheme(pattern, &pattern_index, '!'); @@ -1509,7 +1482,8 @@ int64_t match(Text_t text, Pattern_t pattern, int64_t text_index, int64_t patter case 'n': if (strcasecmp(prop_name, "nl") == 0 || strcasecmp(prop_name, "newline") == 0 || strcasecmp(prop_name, "crlf")) { - specific_grapheme = CRLF_GRAPHEME; + crlf = true; + prop = UC_PROPERTY_PRIVATE_USE; goto got_prop; } else if (strcasecmp(prop_name, "num") == 0) { EAT1(text, &text_state, text_index, grapheme == '-'); @@ -1587,7 +1561,7 @@ int64_t match(Text_t text, Pattern_t pattern, int64_t text_index, int64_t patter bool success; if (any) { success = true; - } else if (specific_grapheme == CRLF_GRAPHEME) { + } else if (crlf) { if (grapheme == '\r' && _next_grapheme(text, &text_state, text_index + 1) == '\n') { text_index += 1; grapheme = '\n'; |
