aboutsummaryrefslogtreecommitdiff
path: root/src/stdlib/text.c
diff options
context:
space:
mode:
authorBruce Hill <bruce@bruce-hill.com>2026-02-08 22:47:02 -0500
committerBruce Hill <bruce@bruce-hill.com>2026-02-08 22:47:02 -0500
commit2b7e96835e75e0d153e7f993d1c4fc2add452ddd (patch)
treeed1104f60ed35af2bf3c9d8cd66d17f45683f07c /src/stdlib/text.c
parent2371542adb017afc87ecc572901107bf493e214f (diff)
Added Text.distance(a,b) for text similarity comparisons.
Diffstat (limited to 'src/stdlib/text.c')
-rw-r--r--src/stdlib/text.c64
1 files changed, 64 insertions, 0 deletions
diff --git a/src/stdlib/text.c b/src/stdlib/text.c
index 4bf6d999..117b4a8d 100644
--- a/src/stdlib/text.c
+++ b/src/stdlib/text.c
@@ -1427,6 +1427,70 @@ Text_t Text$title(Text_t text, Text_t language) {
}
public
+double Text$distance(Text_t a, Text_t b, Text_t language) {
+ if (a.length == 0) return (double)b.length;
+ if (b.length == 0) return (double)a.length;
+
+ // The current implementation of text distance uses a modified form
+ // of Damerau–Levenshtein distance that gives slightly lower distances
+ // for letters with the same main grapheme and slightly lower distances
+ // for letters that are the same, but with different casing.
+ double *distances = GC_MALLOC_ATOMIC(sizeof(uint32_t[a.length][b.length]));
+#define DIST(x, y) distances[(x) * b.length + (y)]
+ for (int64_t i = 0; i <= a.length; i++)
+ DIST(i, 0) = i;
+ for (int64_t j = 0; j <= b.length; j++)
+ DIST(0, j) = j;
+
+ TextIter_t a_state = NEW_TEXT_ITER_STATE(a);
+ TextIter_t b_state = NEW_TEXT_ITER_STATE(b);
+ const char *uc_language = Text$as_c_string(language);
+ for (int64_t i = 1; i <= a.length; i++) {
+ for (int64_t j = 1; j <= b.length; j++) {
+ int32_t ai = Text$get_grapheme_fast(&a_state, i - 1);
+ int32_t bi = Text$get_grapheme_fast(&b_state, i - 1);
+ if (ai == bi) {
+ DIST(i, j) = DIST(i - 1, j - 1);
+ } else {
+ ucs4_t main_ai = (ai) >= 0 ? (ucs4_t)ai : synthetic_graphemes[-(ai)-1].main_codepoint;
+ ucs4_t main_bi = (bi) >= 0 ? (ucs4_t)bi : synthetic_graphemes[-(bi)-1].main_codepoint;
+ if (main_ai == main_bi) {
+ // Same main grapheme (different modifiers)
+ DIST(i, j) = 0.25 + DIST(i - 1, j - 1);
+ } else {
+ int cmp;
+ (void)u32_casecmp(&main_ai, 1, &main_bi, 1, uc_language, UNINORM_NFC, &cmp);
+ if (cmp == 0) {
+ // Same main grapheme, different casing (e.g. "a" vs "A")
+ DIST(i, j) = 0.5 + DIST(i - 1, j - 1);
+ } else {
+ // Different main grapheme
+ double insertion = 1. + DIST(i - 1, j);
+ double deletion = 1. + DIST(i, j - 1);
+ double dist = MIN(insertion, deletion);
+ double substitution = 1. + DIST(i - 1, j - 1);
+ dist = MIN(dist, substitution);
+ // Check for transposition:
+ if (i >= 2 && j >= 2) {
+ int32_t ai_prev = Text$get_grapheme_fast(&a_state, i - 2);
+ int32_t bi_prev = Text$get_grapheme_fast(&b_state, i - 2);
+ if (ai == bi_prev && bi == ai_prev) {
+ double transposition = 1. + DIST(i - 2, j - 2);
+ dist = MIN(dist, transposition);
+ }
+ }
+ DIST(i, j) = dist;
+ }
+ }
+ }
+ }
+ }
+#undef DIST
+
+ return (double)distances[a.length * b.length + b.length];
+}
+
+public
Text_t Text$escaped(Text_t text, bool colorize, Text_t extra_escapes) {
Text_t ret = colorize ? Text("\x1b[35m") : EMPTY_TEXT;
#define flush_unquoted() \