diff options
Diffstat (limited to 'src/stdlib/text.c')
| -rw-r--r-- | src/stdlib/text.c | 64 |
1 files changed, 64 insertions, 0 deletions
diff --git a/src/stdlib/text.c b/src/stdlib/text.c index 4bf6d999..117b4a8d 100644 --- a/src/stdlib/text.c +++ b/src/stdlib/text.c @@ -1427,6 +1427,70 @@ Text_t Text$title(Text_t text, Text_t language) { } public +double Text$distance(Text_t a, Text_t b, Text_t language) { + if (a.length == 0) return (double)b.length; + if (b.length == 0) return (double)a.length; + + // The current implementation of text distance uses a modified form + // of Damerau–Levenshtein distance that gives slightly lower distances + // for letters with the same main grapheme and slightly lower distances + // for letters that are the same, but with different casing. + double *distances = GC_MALLOC_ATOMIC(sizeof(uint32_t[a.length][b.length])); +#define DIST(x, y) distances[(x) * b.length + (y)] + for (int64_t i = 0; i <= a.length; i++) + DIST(i, 0) = i; + for (int64_t j = 0; j <= b.length; j++) + DIST(0, j) = j; + + TextIter_t a_state = NEW_TEXT_ITER_STATE(a); + TextIter_t b_state = NEW_TEXT_ITER_STATE(b); + const char *uc_language = Text$as_c_string(language); + for (int64_t i = 1; i <= a.length; i++) { + for (int64_t j = 1; j <= b.length; j++) { + int32_t ai = Text$get_grapheme_fast(&a_state, i - 1); + int32_t bi = Text$get_grapheme_fast(&b_state, i - 1); + if (ai == bi) { + DIST(i, j) = DIST(i - 1, j - 1); + } else { + ucs4_t main_ai = (ai) >= 0 ? (ucs4_t)ai : synthetic_graphemes[-(ai)-1].main_codepoint; + ucs4_t main_bi = (bi) >= 0 ? (ucs4_t)bi : synthetic_graphemes[-(bi)-1].main_codepoint; + if (main_ai == main_bi) { + // Same main grapheme (different modifiers) + DIST(i, j) = 0.25 + DIST(i - 1, j - 1); + } else { + int cmp; + (void)u32_casecmp(&main_ai, 1, &main_bi, 1, uc_language, UNINORM_NFC, &cmp); + if (cmp == 0) { + // Same main grapheme, different casing (e.g. "a" vs "A") + DIST(i, j) = 0.5 + DIST(i - 1, j - 1); + } else { + // Different main grapheme + double insertion = 1. + DIST(i - 1, j); + double deletion = 1. + DIST(i, j - 1); + double dist = MIN(insertion, deletion); + double substitution = 1. + DIST(i - 1, j - 1); + dist = MIN(dist, substitution); + // Check for transposition: + if (i >= 2 && j >= 2) { + int32_t ai_prev = Text$get_grapheme_fast(&a_state, i - 2); + int32_t bi_prev = Text$get_grapheme_fast(&b_state, i - 2); + if (ai == bi_prev && bi == ai_prev) { + double transposition = 1. + DIST(i - 2, j - 2); + dist = MIN(dist, transposition); + } + } + DIST(i, j) = dist; + } + } + } + } + } +#undef DIST + + return (double)distances[a.length * b.length + b.length]; +} + +public Text_t Text$escaped(Text_t text, bool colorize, Text_t extra_escapes) { Text_t ret = colorize ? Text("\x1b[35m") : EMPTY_TEXT; #define flush_unquoted() \ |
