aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorBruce Hill <bruce@bruce-hill.com>2026-02-08 22:47:02 -0500
committerBruce Hill <bruce@bruce-hill.com>2026-02-08 22:47:02 -0500
commit2b7e96835e75e0d153e7f993d1c4fc2add452ddd (patch)
treeed1104f60ed35af2bf3c9d8cd66d17f45683f07c /src
parent2371542adb017afc87ecc572901107bf493e214f (diff)
Added Text.distance(a,b) for text similarity comparisons.
Diffstat (limited to 'src')
-rw-r--r--src/environment.c1
-rw-r--r--src/stdlib/text.c64
-rw-r--r--src/stdlib/text.h1
3 files changed, 66 insertions, 0 deletions
diff --git a/src/environment.c b/src/environment.c
index eb2c275c..d209471e 100644
--- a/src/environment.c
+++ b/src/environment.c
@@ -357,6 +357,7 @@ env_t *global_env(bool source_mapping) {
{"by_split_any", "Text$by_split_any", "func(text:Text, delimiters=' \\t\\r\\n' -> func(->Text?))"}, //
{"caseless_equals", "Text$equal_ignoring_case", "func(a,b:Text, language='C' -> Bool)"}, //
{"codepoint_names", "Text$codepoint_names", "func(text:Text -> [Text])"}, //
+ {"distance", "Text$distance", "func(a,b:Text, language='C' -> Num)"}, //
{"ends_with", "Text$ends_with", "func(text,suffix:Text, remainder:&Text? = none -> Bool)"}, //
{"find", "Text$find", "func(text,target:Text, start=1 -> Int?)"}, //
{"from", "Text$from", "func(text:Text, first:Int -> Text)"}, //
diff --git a/src/stdlib/text.c b/src/stdlib/text.c
index 4bf6d999..117b4a8d 100644
--- a/src/stdlib/text.c
+++ b/src/stdlib/text.c
@@ -1427,6 +1427,70 @@ Text_t Text$title(Text_t text, Text_t language) {
}
public
+double Text$distance(Text_t a, Text_t b, Text_t language) {
+ if (a.length == 0) return (double)b.length;
+ if (b.length == 0) return (double)a.length;
+
+ // The current implementation of text distance uses a modified form
+ // of Damerau–Levenshtein distance that gives slightly lower distances
+ // for letters with the same main grapheme and slightly lower distances
+ // for letters that are the same, but with different casing.
+ double *distances = GC_MALLOC_ATOMIC(sizeof(uint32_t[a.length][b.length]));
+#define DIST(x, y) distances[(x) * b.length + (y)]
+ for (int64_t i = 0; i <= a.length; i++)
+ DIST(i, 0) = i;
+ for (int64_t j = 0; j <= b.length; j++)
+ DIST(0, j) = j;
+
+ TextIter_t a_state = NEW_TEXT_ITER_STATE(a);
+ TextIter_t b_state = NEW_TEXT_ITER_STATE(b);
+ const char *uc_language = Text$as_c_string(language);
+ for (int64_t i = 1; i <= a.length; i++) {
+ for (int64_t j = 1; j <= b.length; j++) {
+ int32_t ai = Text$get_grapheme_fast(&a_state, i - 1);
+ int32_t bi = Text$get_grapheme_fast(&b_state, i - 1);
+ if (ai == bi) {
+ DIST(i, j) = DIST(i - 1, j - 1);
+ } else {
+ ucs4_t main_ai = (ai) >= 0 ? (ucs4_t)ai : synthetic_graphemes[-(ai)-1].main_codepoint;
+ ucs4_t main_bi = (bi) >= 0 ? (ucs4_t)bi : synthetic_graphemes[-(bi)-1].main_codepoint;
+ if (main_ai == main_bi) {
+ // Same main grapheme (different modifiers)
+ DIST(i, j) = 0.25 + DIST(i - 1, j - 1);
+ } else {
+ int cmp;
+ (void)u32_casecmp(&main_ai, 1, &main_bi, 1, uc_language, UNINORM_NFC, &cmp);
+ if (cmp == 0) {
+ // Same main grapheme, different casing (e.g. "a" vs "A")
+ DIST(i, j) = 0.5 + DIST(i - 1, j - 1);
+ } else {
+ // Different main grapheme
+ double insertion = 1. + DIST(i - 1, j);
+ double deletion = 1. + DIST(i, j - 1);
+ double dist = MIN(insertion, deletion);
+ double substitution = 1. + DIST(i - 1, j - 1);
+ dist = MIN(dist, substitution);
+ // Check for transposition:
+ if (i >= 2 && j >= 2) {
+ int32_t ai_prev = Text$get_grapheme_fast(&a_state, i - 2);
+ int32_t bi_prev = Text$get_grapheme_fast(&b_state, i - 2);
+ if (ai == bi_prev && bi == ai_prev) {
+ double transposition = 1. + DIST(i - 2, j - 2);
+ dist = MIN(dist, transposition);
+ }
+ }
+ DIST(i, j) = dist;
+ }
+ }
+ }
+ }
+ }
+#undef DIST
+
+ return (double)distances[a.length * b.length + b.length];
+}
+
+public
Text_t Text$escaped(Text_t text, bool colorize, Text_t extra_escapes) {
Text_t ret = colorize ? Text("\x1b[35m") : EMPTY_TEXT;
#define flush_unquoted() \
diff --git a/src/stdlib/text.h b/src/stdlib/text.h
index 856b173a..7259b545 100644
--- a/src/stdlib/text.h
+++ b/src/stdlib/text.h
@@ -116,6 +116,7 @@ Int_t Text$width(Text_t text, Text_t language);
Text_t Text$left_pad(Text_t text, Int_t width, Text_t padding, Text_t language);
Text_t Text$right_pad(Text_t text, Int_t width, Text_t padding, Text_t language);
Text_t Text$middle_pad(Text_t text, Int_t width, Text_t padding, Text_t language);
+double Text$distance(Text_t a, Text_t b, Text_t language);
int32_t Text$get_grapheme_fast(TextIter_t *state, int64_t index);
uint32_t Text$get_main_grapheme_fast(TextIter_t *state, int64_t index);
Int_t Text$memory_size(Text_t text);