diff options
| author | Bruce Hill <bruce@bruce-hill.com> | 2026-02-08 22:47:02 -0500 |
|---|---|---|
| committer | Bruce Hill <bruce@bruce-hill.com> | 2026-02-08 22:47:02 -0500 |
| commit | 2b7e96835e75e0d153e7f993d1c4fc2add452ddd (patch) | |
| tree | ed1104f60ed35af2bf3c9d8cd66d17f45683f07c /src | |
| parent | 2371542adb017afc87ecc572901107bf493e214f (diff) | |
Added Text.distance(a,b) for text similarity comparisons.
Diffstat (limited to 'src')
| -rw-r--r-- | src/environment.c | 1 | ||||
| -rw-r--r-- | src/stdlib/text.c | 64 | ||||
| -rw-r--r-- | src/stdlib/text.h | 1 |
3 files changed, 66 insertions, 0 deletions
diff --git a/src/environment.c b/src/environment.c index eb2c275c..d209471e 100644 --- a/src/environment.c +++ b/src/environment.c @@ -357,6 +357,7 @@ env_t *global_env(bool source_mapping) { {"by_split_any", "Text$by_split_any", "func(text:Text, delimiters=' \\t\\r\\n' -> func(->Text?))"}, // {"caseless_equals", "Text$equal_ignoring_case", "func(a,b:Text, language='C' -> Bool)"}, // {"codepoint_names", "Text$codepoint_names", "func(text:Text -> [Text])"}, // + {"distance", "Text$distance", "func(a,b:Text, language='C' -> Num)"}, // {"ends_with", "Text$ends_with", "func(text,suffix:Text, remainder:&Text? = none -> Bool)"}, // {"find", "Text$find", "func(text,target:Text, start=1 -> Int?)"}, // {"from", "Text$from", "func(text:Text, first:Int -> Text)"}, // diff --git a/src/stdlib/text.c b/src/stdlib/text.c index 4bf6d999..117b4a8d 100644 --- a/src/stdlib/text.c +++ b/src/stdlib/text.c @@ -1427,6 +1427,70 @@ Text_t Text$title(Text_t text, Text_t language) { } public +double Text$distance(Text_t a, Text_t b, Text_t language) { + if (a.length == 0) return (double)b.length; + if (b.length == 0) return (double)a.length; + + // The current implementation of text distance uses a modified form + // of Damerau–Levenshtein distance that gives slightly lower distances + // for letters with the same main grapheme and slightly lower distances + // for letters that are the same, but with different casing. + double *distances = GC_MALLOC_ATOMIC(sizeof(uint32_t[a.length][b.length])); +#define DIST(x, y) distances[(x) * b.length + (y)] + for (int64_t i = 0; i <= a.length; i++) + DIST(i, 0) = i; + for (int64_t j = 0; j <= b.length; j++) + DIST(0, j) = j; + + TextIter_t a_state = NEW_TEXT_ITER_STATE(a); + TextIter_t b_state = NEW_TEXT_ITER_STATE(b); + const char *uc_language = Text$as_c_string(language); + for (int64_t i = 1; i <= a.length; i++) { + for (int64_t j = 1; j <= b.length; j++) { + int32_t ai = Text$get_grapheme_fast(&a_state, i - 1); + int32_t bi = Text$get_grapheme_fast(&b_state, i - 1); + if (ai == bi) { + DIST(i, j) = DIST(i - 1, j - 1); + } else { + ucs4_t main_ai = (ai) >= 0 ? (ucs4_t)ai : synthetic_graphemes[-(ai)-1].main_codepoint; + ucs4_t main_bi = (bi) >= 0 ? (ucs4_t)bi : synthetic_graphemes[-(bi)-1].main_codepoint; + if (main_ai == main_bi) { + // Same main grapheme (different modifiers) + DIST(i, j) = 0.25 + DIST(i - 1, j - 1); + } else { + int cmp; + (void)u32_casecmp(&main_ai, 1, &main_bi, 1, uc_language, UNINORM_NFC, &cmp); + if (cmp == 0) { + // Same main grapheme, different casing (e.g. "a" vs "A") + DIST(i, j) = 0.5 + DIST(i - 1, j - 1); + } else { + // Different main grapheme + double insertion = 1. + DIST(i - 1, j); + double deletion = 1. + DIST(i, j - 1); + double dist = MIN(insertion, deletion); + double substitution = 1. + DIST(i - 1, j - 1); + dist = MIN(dist, substitution); + // Check for transposition: + if (i >= 2 && j >= 2) { + int32_t ai_prev = Text$get_grapheme_fast(&a_state, i - 2); + int32_t bi_prev = Text$get_grapheme_fast(&b_state, i - 2); + if (ai == bi_prev && bi == ai_prev) { + double transposition = 1. + DIST(i - 2, j - 2); + dist = MIN(dist, transposition); + } + } + DIST(i, j) = dist; + } + } + } + } + } +#undef DIST + + return (double)distances[a.length * b.length + b.length]; +} + +public Text_t Text$escaped(Text_t text, bool colorize, Text_t extra_escapes) { Text_t ret = colorize ? Text("\x1b[35m") : EMPTY_TEXT; #define flush_unquoted() \ diff --git a/src/stdlib/text.h b/src/stdlib/text.h index 856b173a..7259b545 100644 --- a/src/stdlib/text.h +++ b/src/stdlib/text.h @@ -116,6 +116,7 @@ Int_t Text$width(Text_t text, Text_t language); Text_t Text$left_pad(Text_t text, Int_t width, Text_t padding, Text_t language); Text_t Text$right_pad(Text_t text, Int_t width, Text_t padding, Text_t language); Text_t Text$middle_pad(Text_t text, Int_t width, Text_t padding, Text_t language); +double Text$distance(Text_t a, Text_t b, Text_t language); int32_t Text$get_grapheme_fast(TextIter_t *state, int64_t index); uint32_t Text$get_main_grapheme_fast(TextIter_t *state, int64_t index); Int_t Text$memory_size(Text_t text); |
