From 2b7e96835e75e0d153e7f993d1c4fc2add452ddd Mon Sep 17 00:00:00 2001 From: Bruce Hill Date: Sun, 8 Feb 2026 22:47:02 -0500 Subject: Added Text.distance(a,b) for text similarity comparisons. --- src/stdlib/text.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/stdlib/text.h | 1 + 2 files changed, 65 insertions(+) (limited to 'src/stdlib') diff --git a/src/stdlib/text.c b/src/stdlib/text.c index 4bf6d999..117b4a8d 100644 --- a/src/stdlib/text.c +++ b/src/stdlib/text.c @@ -1426,6 +1426,70 @@ Text_t Text$title(Text_t text, Text_t language) { return ret; } +public +double Text$distance(Text_t a, Text_t b, Text_t language) { + if (a.length == 0) return (double)b.length; + if (b.length == 0) return (double)a.length; + + // The current implementation of text distance uses a modified form + // of Damerau–Levenshtein distance that gives slightly lower distances + // for letters with the same main grapheme and slightly lower distances + // for letters that are the same, but with different casing. + double *distances = GC_MALLOC_ATOMIC(sizeof(uint32_t[a.length][b.length])); +#define DIST(x, y) distances[(x) * b.length + (y)] + for (int64_t i = 0; i <= a.length; i++) + DIST(i, 0) = i; + for (int64_t j = 0; j <= b.length; j++) + DIST(0, j) = j; + + TextIter_t a_state = NEW_TEXT_ITER_STATE(a); + TextIter_t b_state = NEW_TEXT_ITER_STATE(b); + const char *uc_language = Text$as_c_string(language); + for (int64_t i = 1; i <= a.length; i++) { + for (int64_t j = 1; j <= b.length; j++) { + int32_t ai = Text$get_grapheme_fast(&a_state, i - 1); + int32_t bi = Text$get_grapheme_fast(&b_state, i - 1); + if (ai == bi) { + DIST(i, j) = DIST(i - 1, j - 1); + } else { + ucs4_t main_ai = (ai) >= 0 ? (ucs4_t)ai : synthetic_graphemes[-(ai)-1].main_codepoint; + ucs4_t main_bi = (bi) >= 0 ? (ucs4_t)bi : synthetic_graphemes[-(bi)-1].main_codepoint; + if (main_ai == main_bi) { + // Same main grapheme (different modifiers) + DIST(i, j) = 0.25 + DIST(i - 1, j - 1); + } else { + int cmp; + (void)u32_casecmp(&main_ai, 1, &main_bi, 1, uc_language, UNINORM_NFC, &cmp); + if (cmp == 0) { + // Same main grapheme, different casing (e.g. "a" vs "A") + DIST(i, j) = 0.5 + DIST(i - 1, j - 1); + } else { + // Different main grapheme + double insertion = 1. + DIST(i - 1, j); + double deletion = 1. + DIST(i, j - 1); + double dist = MIN(insertion, deletion); + double substitution = 1. + DIST(i - 1, j - 1); + dist = MIN(dist, substitution); + // Check for transposition: + if (i >= 2 && j >= 2) { + int32_t ai_prev = Text$get_grapheme_fast(&a_state, i - 2); + int32_t bi_prev = Text$get_grapheme_fast(&b_state, i - 2); + if (ai == bi_prev && bi == ai_prev) { + double transposition = 1. + DIST(i - 2, j - 2); + dist = MIN(dist, transposition); + } + } + DIST(i, j) = dist; + } + } + } + } + } +#undef DIST + + return (double)distances[a.length * b.length + b.length]; +} + public Text_t Text$escaped(Text_t text, bool colorize, Text_t extra_escapes) { Text_t ret = colorize ? Text("\x1b[35m") : EMPTY_TEXT; diff --git a/src/stdlib/text.h b/src/stdlib/text.h index 856b173a..7259b545 100644 --- a/src/stdlib/text.h +++ b/src/stdlib/text.h @@ -116,6 +116,7 @@ Int_t Text$width(Text_t text, Text_t language); Text_t Text$left_pad(Text_t text, Int_t width, Text_t padding, Text_t language); Text_t Text$right_pad(Text_t text, Int_t width, Text_t padding, Text_t language); Text_t Text$middle_pad(Text_t text, Int_t width, Text_t padding, Text_t language); +double Text$distance(Text_t a, Text_t b, Text_t language); int32_t Text$get_grapheme_fast(TextIter_t *state, int64_t index); uint32_t Text$get_main_grapheme_fast(TextIter_t *state, int64_t index); Int_t Text$memory_size(Text_t text); -- cgit v1.2.3