From 2b7e96835e75e0d153e7f993d1c4fc2add452ddd Mon Sep 17 00:00:00 2001
From: Bruce Hill <bruce@bruce-hill.com>
Date: Sun, 8 Feb 2026 22:47:02 -0500
Subject: Added Text.distance(a,b) for text similarity comparisons.

---
 CHANGES.md                    |  4 +++
 api/api.md                    | 27 ++++++++++++++++++
 api/text.md                   | 27 ++++++++++++++++++
 api/text.yaml                 | 33 ++++++++++++++++++++++
 man/man3/tomo-Text.3          | 12 ++++++--
 man/man3/tomo-Text.distance.3 | 43 +++++++++++++++++++++++++++++
 src/environment.c             |  1 +
 src/stdlib/text.c             | 64 +++++++++++++++++++++++++++++++++++++++++++
 src/stdlib/text.h             |  1 +
 test/text.tm                  |  8 ++++++
 10 files changed, 218 insertions(+), 2 deletions(-)
 create mode 100644 man/man3/tomo-Text.distance.3

diff --git a/CHANGES.md b/CHANGES.md
index 856abfb7..d27c60c4 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,5 +1,9 @@
 # Version History
 
+## v2026-02-08
+
+- Added `Text.distance(a,b)` for calculating text distances.
+
 ## v2025-12-31
 
 - Added support for `123.foo()` parsing the same as `(123).foo()`
diff --git a/api/api.md b/api/api.md
index 9e7c553e..0a2213cb 100644
--- a/api/api.md
+++ b/api/api.md
@@ -3965,6 +3965,33 @@ assert "Amélie".codepoint_names() == [
     "LATIN SMALL LETTER E",
 ]
 
+```
+## Text.distance
+
+```tomo
+Text.distance : func(a: Text, b: Text, language: Text = "C" -> Num)
+```
+
+Get an approximate distance between two texts, such that when the distance is small, the texts are similar and when the distance is large, the texts are dissimilar.
+
+The exact distance algorithm is not specified and may be subject to change over time.
+
+Argument | Type | Description | Default
+---------|------|-------------|---------
+a | `Text` | The first text to compare.  | -
+b | `Text` | The second text to compare.  | -
+language | `Text` | The ISO 639 language code for which character width to use.  | `"C"`
+
+**Return:** The distance between the two texts (larger means more dissimilar).
+
+
+**Example:**
+```tomo
+assert "hello".distance("hello") == 0
+texts := &["goodbye", "hello", "hallo"]
+texts.sort(func(a,b:&Text) a.distance("hello") <> b.distance("hello"))
+assert texts == ["hello", "hallo", "goodbye"]
+
 ```
 ## Text.ends_with
 
diff --git a/api/text.md b/api/text.md
index 928cb6ec..2536ff21 100644
--- a/api/text.md
+++ b/api/text.md
@@ -179,6 +179,33 @@ assert "Amélie".codepoint_names() == [
     "LATIN SMALL LETTER E",
 ]
 
+```
+## Text.distance
+
+```tomo
+Text.distance : func(a: Text, b: Text, language: Text = "C" -> Num)
+```
+
+Get an approximate distance between two texts, such that when the distance is small, the texts are similar and when the distance is large, the texts are dissimilar.
+
+The exact distance algorithm is not specified and may be subject to change over time.
+
+Argument | Type | Description | Default
+---------|------|-------------|---------
+a | `Text` | The first text to compare.  | -
+b | `Text` | The second text to compare.  | -
+language | `Text` | The ISO 639 language code for which character width to use.  | `"C"`
+
+**Return:** The distance between the two texts (larger means more dissimilar).
+
+
+**Example:**
+```tomo
+assert "hello".distance("hello") == 0
+texts := &["goodbye", "hello", "hallo"]
+texts.sort(func(a,b:&Text) a.distance("hello") <> b.distance("hello"))
+assert texts == ["hello", "hallo", "goodbye"]
+
 ```
 ## Text.ends_with
 
diff --git a/api/text.yaml b/api/text.yaml
index 6874bfc8..2af7cae4 100644
--- a/api/text.yaml
+++ b/api/text.yaml
@@ -225,6 +225,39 @@ Text.ends_with:
     assert "hello world".ends_with("world", &remainder) == yes
     assert remainder == "hello "
 
+Text.distance:
+  short: distance between two texts
+  description: >
+    Get an approximate distance between two texts, such that when the distance
+    is small, the texts are similar and when the distance is large, the texts
+    are dissimilar.
+  note: >
+    The exact distance algorithm is not specified and may be subject to change
+    over time.
+  return:
+    type: 'Num'
+    description: >
+      The distance between the two texts (larger means more dissimilar).
+  args:
+    a:
+      type: 'Text'
+      description: >
+        The first text to compare.
+    b:
+      type: 'Text'
+      description: >
+        The second text to compare.
+    language:
+      type: 'Text'
+      default: '"C"'
+      description: >
+        The ISO 639 language code for which character width to use.
+  example: |
+    assert "hello".distance("hello") == 0
+    texts := &["goodbye", "hello", "hallo"]
+    texts.sort(func(a,b:&Text) a.distance("hello") <> b.distance("hello"))
+    assert texts == ["hello", "hallo", "goodbye"]
+
 Text.find:
   short: find a substring
   description: >
diff --git a/man/man3/tomo-Text.3 b/man/man3/tomo-Text.3
index 634e3e0a..10032155 100644
--- a/man/man3/tomo-Text.3
+++ b/man/man3/tomo-Text.3
@@ -1,8 +1,8 @@
 '\" t
-.\" Copyright (c) 2025 Bruce Hill
+.\" Copyright (c) 2026 Bruce Hill
 .\" All rights reserved.
 .\"
-.TH Text 3 2025-11-29 "Tomo man-pages"
+.TH Text 3 2026-02-08 "Tomo man-pages"
 .SH NAME
 Text \- a Tomo type
 .SH LIBRARY
@@ -66,6 +66,14 @@ For more, see:
 .BR Tomo-Text.codepoint_names (3)
 
 
+.TP
+.BI Text.distance\ :\ func(a:\ Text,\ b:\ Text,\ language:\ Text\ =\ "C"\ ->\ Num)
+Get an approximate distance between two texts, such that when the distance is small, the texts are similar and when the distance is large, the texts are dissimilar. 
+
+For more, see:
+.BR Tomo-Text.distance (3)
+
+
 .TP
 .BI Text.ends_with\ :\ func(text:\ Text,\ suffix:\ Text,\ remainder:\ &Text?\ =\ none\ ->\ Bool)
 Checks if the \fBText\fR ends with a literal suffix text. 
diff --git a/man/man3/tomo-Text.distance.3 b/man/man3/tomo-Text.distance.3
new file mode 100644
index 00000000..fe516e48
--- /dev/null
+++ b/man/man3/tomo-Text.distance.3
@@ -0,0 +1,43 @@
+'\" t
+.\" Copyright (c) 2026 Bruce Hill
+.\" All rights reserved.
+.\"
+.TH Text.distance 3 2026-02-08 "Tomo man-pages"
+.SH NAME
+Text.distance \- distance between two texts
+.SH LIBRARY
+Tomo Standard Library
+.SH SYNOPSIS
+.nf
+.BI Text.distance\ :\ func(a:\ Text,\ b:\ Text,\ language:\ Text\ =\ "C"\ ->\ Num)
+.fi
+.SH DESCRIPTION
+Get an approximate distance between two texts, such that when the distance is small, the texts are similar and when the distance is large, the texts are dissimilar.
+
+
+.SH ARGUMENTS
+
+.TS
+allbox;
+lb lb lbx lb
+l l l l.
+Name	Type	Description	Default
+a	Text	The first text to compare. 	-
+b	Text	The second text to compare. 	-
+language	Text	The ISO 639 language code for which character width to use. 	"C"
+.TE
+.SH RETURN
+The distance between the two texts (larger means more dissimilar).
+
+.SH NOTES
+The exact distance algorithm is not specified and may be subject to change over time.
+
+.SH EXAMPLES
+.EX
+assert "hello".distance("hello") == 0
+texts := &["goodbye", "hello", "hallo"]
+texts.sort(func(a,b:&Text) a.distance("hello") <> b.distance("hello"))
+assert texts == ["hello", "hallo", "goodbye"]
+.EE
+.SH SEE ALSO
+.BR Tomo-Text (3)
diff --git a/src/environment.c b/src/environment.c
index eb2c275c..d209471e 100644
--- a/src/environment.c
+++ b/src/environment.c
@@ -357,6 +357,7 @@ env_t *global_env(bool source_mapping) {
             {"by_split_any", "Text$by_split_any", "func(text:Text, delimiters=' \\t\\r\\n' -> func(->Text?))"}, //
             {"caseless_equals", "Text$equal_ignoring_case", "func(a,b:Text, language='C' -> Bool)"}, //
             {"codepoint_names", "Text$codepoint_names", "func(text:Text -> [Text])"}, //
+            {"distance", "Text$distance", "func(a,b:Text, language='C' -> Num)"}, //
             {"ends_with", "Text$ends_with", "func(text,suffix:Text, remainder:&Text? = none -> Bool)"}, //
             {"find", "Text$find", "func(text,target:Text, start=1 -> Int?)"}, //
             {"from", "Text$from", "func(text:Text, first:Int -> Text)"}, //
diff --git a/src/stdlib/text.c b/src/stdlib/text.c
index 4bf6d999..117b4a8d 100644
--- a/src/stdlib/text.c
+++ b/src/stdlib/text.c
@@ -1426,6 +1426,70 @@ Text_t Text$title(Text_t text, Text_t language) {
     return ret;
 }
 
+public
+double Text$distance(Text_t a, Text_t b, Text_t language) {
+    if (a.length == 0) return (double)b.length;
+    if (b.length == 0) return (double)a.length;
+
+    // The current implementation of text distance uses a modified form
+    // of Damerau–Levenshtein distance that gives slightly lower distances
+    // for letters with the same main grapheme and slightly lower distances
+    // for letters that are the same, but with different casing.
+    double *distances = GC_MALLOC_ATOMIC(sizeof(uint32_t[a.length][b.length]));
+#define DIST(x, y) distances[(x) * b.length + (y)]
+    for (int64_t i = 0; i <= a.length; i++)
+        DIST(i, 0) = i;
+    for (int64_t j = 0; j <= b.length; j++)
+        DIST(0, j) = j;
+
+    TextIter_t a_state = NEW_TEXT_ITER_STATE(a);
+    TextIter_t b_state = NEW_TEXT_ITER_STATE(b);
+    const char *uc_language = Text$as_c_string(language);
+    for (int64_t i = 1; i <= a.length; i++) {
+        for (int64_t j = 1; j <= b.length; j++) {
+            int32_t ai = Text$get_grapheme_fast(&a_state, i - 1);
+            int32_t bi = Text$get_grapheme_fast(&b_state, i - 1);
+            if (ai == bi) {
+                DIST(i, j) = DIST(i - 1, j - 1);
+            } else {
+                ucs4_t main_ai = (ai) >= 0 ? (ucs4_t)ai : synthetic_graphemes[-(ai)-1].main_codepoint;
+                ucs4_t main_bi = (bi) >= 0 ? (ucs4_t)bi : synthetic_graphemes[-(bi)-1].main_codepoint;
+                if (main_ai == main_bi) {
+                    // Same main grapheme (different modifiers)
+                    DIST(i, j) = 0.25 + DIST(i - 1, j - 1);
+                } else {
+                    int cmp;
+                    (void)u32_casecmp(&main_ai, 1, &main_bi, 1, uc_language, UNINORM_NFC, &cmp);
+                    if (cmp == 0) {
+                        // Same main grapheme, different casing (e.g. "a" vs "A")
+                        DIST(i, j) = 0.5 + DIST(i - 1, j - 1);
+                    } else {
+                        // Different main grapheme
+                        double insertion = 1. + DIST(i - 1, j);
+                        double deletion = 1. + DIST(i, j - 1);
+                        double dist = MIN(insertion, deletion);
+                        double substitution = 1. + DIST(i - 1, j - 1);
+                        dist = MIN(dist, substitution);
+                        // Check for transposition:
+                        if (i >= 2 && j >= 2) {
+                            int32_t ai_prev = Text$get_grapheme_fast(&a_state, i - 2);
+                            int32_t bi_prev = Text$get_grapheme_fast(&b_state, i - 2);
+                            if (ai == bi_prev && bi == ai_prev) {
+                                double transposition = 1. + DIST(i - 2, j - 2);
+                                dist = MIN(dist, transposition);
+                            }
+                        }
+                        DIST(i, j) = dist;
+                    }
+                }
+            }
+        }
+    }
+#undef DIST
+
+    return (double)distances[a.length * b.length + b.length];
+}
+
 public
 Text_t Text$escaped(Text_t text, bool colorize, Text_t extra_escapes) {
     Text_t ret = colorize ? Text("\x1b[35m") : EMPTY_TEXT;
diff --git a/src/stdlib/text.h b/src/stdlib/text.h
index 856b173a..7259b545 100644
--- a/src/stdlib/text.h
+++ b/src/stdlib/text.h
@@ -116,6 +116,7 @@ Int_t Text$width(Text_t text, Text_t language);
 Text_t Text$left_pad(Text_t text, Int_t width, Text_t padding, Text_t language);
 Text_t Text$right_pad(Text_t text, Int_t width, Text_t padding, Text_t language);
 Text_t Text$middle_pad(Text_t text, Int_t width, Text_t padding, Text_t language);
+double Text$distance(Text_t a, Text_t b, Text_t language);
 int32_t Text$get_grapheme_fast(TextIter_t *state, int64_t index);
 uint32_t Text$get_main_grapheme_fast(TextIter_t *state, int64_t index);
 Int_t Text$memory_size(Text_t text);
diff --git a/test/text.tm b/test/text.tm
index 6c23042d..8f0da922 100644
--- a/test/text.tm
+++ b/test/text.tm
@@ -208,3 +208,11 @@ func main()
 	assert "one two".find("two") == 5
 	assert "one two".find("three") == none
     assert "one two".find("o", start=2) == 7
+
+	
+	assert "hello".distance("hello") == 0
+	assert "hello".distance("goodbye") > 2.0
+	assert "hello".distance("hola") < "hello".distance("goodbye")
+	assert "hello".distance("Hello") <= 1.0
+	assert "hello".distance("xello") <= 1.0
+	assert "hello".distance("ehllo") <= "hello".distance("XXllo")
-- 
cgit v1.2.3