aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--CHANGES.md4
-rw-r--r--api/api.md27
-rw-r--r--api/text.md27
-rw-r--r--api/text.yaml33
-rw-r--r--man/man3/tomo-Text.312
-rw-r--r--man/man3/tomo-Text.distance.343
-rw-r--r--src/environment.c1
-rw-r--r--src/stdlib/text.c64
-rw-r--r--src/stdlib/text.h1
-rw-r--r--test/text.tm8
10 files changed, 218 insertions, 2 deletions
diff --git a/CHANGES.md b/CHANGES.md
index 856abfb7..d27c60c4 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,5 +1,9 @@
# Version History
+## v2026-02-08
+
+- Added `Text.distance(a,b)` for calculating text distances.
+
## v2025-12-31
- Added support for `123.foo()` parsing the same as `(123).foo()`
diff --git a/api/api.md b/api/api.md
index 9e7c553e..0a2213cb 100644
--- a/api/api.md
+++ b/api/api.md
@@ -3966,6 +3966,33 @@ assert "Amélie".codepoint_names() == [
]
```
+## Text.distance
+
+```tomo
+Text.distance : func(a: Text, b: Text, language: Text = "C" -> Num)
+```
+
+Get an approximate distance between two texts, such that when the distance is small, the texts are similar and when the distance is large, the texts are dissimilar.
+
+The exact distance algorithm is not specified and may be subject to change over time.
+
+Argument | Type | Description | Default
+---------|------|-------------|---------
+a | `Text` | The first text to compare. | -
+b | `Text` | The second text to compare. | -
+language | `Text` | The ISO 639 language code to use when comparing characters case-insensitively. | `"C"`
+
+**Return:** The distance between the two texts (larger means more dissimilar).
+
+
+**Example:**
+```tomo
+assert "hello".distance("hello") == 0
+texts := &["goodbye", "hello", "hallo"]
+texts.sort(func(a,b:&Text) a.distance("hello") <> b.distance("hello"))
+assert texts == ["hello", "hallo", "goodbye"]
+
+```
## Text.ends_with
```tomo
diff --git a/api/text.md b/api/text.md
index 928cb6ec..2536ff21 100644
--- a/api/text.md
+++ b/api/text.md
@@ -180,6 +180,33 @@ assert "Amélie".codepoint_names() == [
]
```
+## Text.distance
+
+```tomo
+Text.distance : func(a: Text, b: Text, language: Text = "C" -> Num)
+```
+
+Get an approximate distance between two texts, such that when the distance is small, the texts are similar and when the distance is large, the texts are dissimilar.
+
+The exact distance algorithm is not specified and may be subject to change over time.
+
+Argument | Type | Description | Default
+---------|------|-------------|---------
+a | `Text` | The first text to compare. | -
+b | `Text` | The second text to compare. | -
+language | `Text` | The ISO 639 language code to use when comparing characters case-insensitively. | `"C"`
+
+**Return:** The distance between the two texts (larger means more dissimilar).
+
+
+**Example:**
+```tomo
+assert "hello".distance("hello") == 0
+texts := &["goodbye", "hello", "hallo"]
+texts.sort(func(a,b:&Text) a.distance("hello") <> b.distance("hello"))
+assert texts == ["hello", "hallo", "goodbye"]
+
+```
## Text.ends_with
```tomo
diff --git a/api/text.yaml b/api/text.yaml
index 6874bfc8..2af7cae4 100644
--- a/api/text.yaml
+++ b/api/text.yaml
@@ -225,6 +225,39 @@ Text.ends_with:
assert "hello world".ends_with("world", &remainder) == yes
assert remainder == "hello "
+Text.distance:
+ short: distance between two texts
+ description: >
+ Get an approximate distance between two texts, such that when the distance
+ is small, the texts are similar and when the distance is large, the texts
+ are dissimilar.
+ note: >
+ The exact distance algorithm is not specified and may be subject to change
+ over time.
+ return:
+ type: 'Num'
+ description: >
+ The distance between the two texts (larger means more dissimilar).
+ args:
+ a:
+ type: 'Text'
+ description: >
+ The first text to compare.
+ b:
+ type: 'Text'
+ description: >
+ The second text to compare.
+ language:
+ type: 'Text'
+ default: '"C"'
+ description: >
+ The ISO 639 language code to use when comparing characters case-insensitively.
+ example: |
+ assert "hello".distance("hello") == 0
+ texts := &["goodbye", "hello", "hallo"]
+ texts.sort(func(a,b:&Text) a.distance("hello") <> b.distance("hello"))
+ assert texts == ["hello", "hallo", "goodbye"]
+
Text.find:
short: find a substring
description: >
diff --git a/man/man3/tomo-Text.3 b/man/man3/tomo-Text.3
index 634e3e0a..10032155 100644
--- a/man/man3/tomo-Text.3
+++ b/man/man3/tomo-Text.3
@@ -1,8 +1,8 @@
'\" t
-.\" Copyright (c) 2025 Bruce Hill
+.\" Copyright (c) 2026 Bruce Hill
.\" All rights reserved.
.\"
-.TH Text 3 2025-11-29 "Tomo man-pages"
+.TH Text 3 2026-02-08 "Tomo man-pages"
.SH NAME
Text \- a Tomo type
.SH LIBRARY
@@ -67,6 +67,14 @@ For more, see:
.TP
+.BI Text.distance\ :\ func(a:\ Text,\ b:\ Text,\ language:\ Text\ =\ "C"\ ->\ Num)
+Get an approximate distance between two texts, such that when the distance is small, the texts are similar and when the distance is large, the texts are dissimilar.
+
+For more, see:
+.BR Tomo-Text.distance (3)
+
+
+.TP
.BI Text.ends_with\ :\ func(text:\ Text,\ suffix:\ Text,\ remainder:\ &Text?\ =\ none\ ->\ Bool)
Checks if the \fBText\fR ends with a literal suffix text.
diff --git a/man/man3/tomo-Text.distance.3 b/man/man3/tomo-Text.distance.3
new file mode 100644
index 00000000..fe516e48
--- /dev/null
+++ b/man/man3/tomo-Text.distance.3
@@ -0,0 +1,43 @@
+'\" t
+.\" Copyright (c) 2026 Bruce Hill
+.\" All rights reserved.
+.\"
+.TH Text.distance 3 2026-02-08 "Tomo man-pages"
+.SH NAME
+Text.distance \- distance between two texts
+.SH LIBRARY
+Tomo Standard Library
+.SH SYNOPSIS
+.nf
+.BI Text.distance\ :\ func(a:\ Text,\ b:\ Text,\ language:\ Text\ =\ "C"\ ->\ Num)
+.fi
+.SH DESCRIPTION
+Get an approximate distance between two texts, such that when the distance is small, the texts are similar and when the distance is large, the texts are dissimilar.
+
+
+.SH ARGUMENTS
+
+.TS
+allbox;
+lb lb lbx lb
+l l l l.
+Name Type Description Default
+a Text The first text to compare. -
+b Text The second text to compare. -
+language Text The ISO 639 language code to use when comparing characters case-insensitively. "C"
+.TE
+.SH RETURN
+The distance between the two texts (larger means more dissimilar).
+
+.SH NOTES
+The exact distance algorithm is not specified and may be subject to change over time.
+
+.SH EXAMPLES
+.EX
+assert "hello".distance("hello") == 0
+texts := &["goodbye", "hello", "hallo"]
+texts.sort(func(a,b:&Text) a.distance("hello") <> b.distance("hello"))
+assert texts == ["hello", "hallo", "goodbye"]
+.EE
+.SH SEE ALSO
+.BR Tomo-Text (3)
diff --git a/src/environment.c b/src/environment.c
index eb2c275c..d209471e 100644
--- a/src/environment.c
+++ b/src/environment.c
@@ -357,6 +357,7 @@ env_t *global_env(bool source_mapping) {
{"by_split_any", "Text$by_split_any", "func(text:Text, delimiters=' \\t\\r\\n' -> func(->Text?))"}, //
{"caseless_equals", "Text$equal_ignoring_case", "func(a,b:Text, language='C' -> Bool)"}, //
{"codepoint_names", "Text$codepoint_names", "func(text:Text -> [Text])"}, //
+ {"distance", "Text$distance", "func(a,b:Text, language='C' -> Num)"}, //
{"ends_with", "Text$ends_with", "func(text,suffix:Text, remainder:&Text? = none -> Bool)"}, //
{"find", "Text$find", "func(text,target:Text, start=1 -> Int?)"}, //
{"from", "Text$from", "func(text:Text, first:Int -> Text)"}, //
diff --git a/src/stdlib/text.c b/src/stdlib/text.c
index 4bf6d999..117b4a8d 100644
--- a/src/stdlib/text.c
+++ b/src/stdlib/text.c
@@ -1427,6 +1427,70 @@ Text_t Text$title(Text_t text, Text_t language) {
}
public
+// Compute an approximate edit distance between two texts (0 for identical texts).
+// `language` is handed to u32_casecmp() when checking for case-only differences.
+// Returns the accumulated edit cost; larger values mean more dissimilar texts.
+double Text$distance(Text_t a, Text_t b, Text_t language) {
+    if (a.length == 0) return (double)b.length;
+    if (b.length == 0) return (double)a.length;
+
+    // The current implementation of text distance uses a modified form
+    // of Damerau–Levenshtein distance that gives slightly lower distances
+    // for letters with the same main grapheme and slightly lower distances
+    // for letters that are the same, but with different casing.
+    //
+    // The table needs (a.length + 1) x (b.length + 1) doubles, because row 0
+    // and column 0 hold the distances to the empty prefix.
+    size_t stride = (size_t)b.length + 1;
+    double *distances = GC_MALLOC_ATOMIC(sizeof(double) * ((size_t)a.length + 1) * stride);
+#define DIST(x, y) distances[(size_t)(x) * stride + (size_t)(y)]
+    for (int64_t i = 0; i <= a.length; i++)
+        DIST(i, 0) = (double)i;
+    for (int64_t j = 0; j <= b.length; j++)
+        DIST(0, j) = (double)j;
+
+    TextIter_t a_state = NEW_TEXT_ITER_STATE(a);
+    TextIter_t b_state = NEW_TEXT_ITER_STATE(b);
+    const char *uc_language = Text$as_c_string(language);
+    for (int64_t i = 1; i <= a.length; i++) {
+        int32_t ai = Text$get_grapheme_fast(&a_state, i - 1);
+        for (int64_t j = 1; j <= b.length; j++) {
+            int32_t bj = Text$get_grapheme_fast(&b_state, j - 1);
+            if (ai == bj) {
+                DIST(i, j) = DIST(i - 1, j - 1);
+                continue;
+            }
+            // Negative grapheme IDs index into the synthetic grapheme table
+            ucs4_t main_ai = ai >= 0 ? (ucs4_t)ai : synthetic_graphemes[-ai - 1].main_codepoint;
+            ucs4_t main_bj = bj >= 0 ? (ucs4_t)bj : synthetic_graphemes[-bj - 1].main_codepoint;
+            // Substitution cost depends on how closely the two graphemes are related:
+            double cost = 1.; // Different main grapheme
+            int cmp = 1;      // Stays nonzero if u32_casecmp() fails
+            if (main_ai == main_bj) {
+                cost = 0.25; // Same main grapheme (different modifiers)
+            } else if (u32_casecmp(&main_ai, 1, &main_bj, 1, uc_language, UNINORM_NFC, &cmp) == 0 && cmp == 0) {
+                cost = 0.5; // Same main grapheme, different casing (e.g. "a" vs "A")
+            }
+            // Every case competes with insertion and deletion so the cheapest edit path is kept
+            double dist = cost + DIST(i - 1, j - 1);
+            dist = MIN(dist, 1. + DIST(i - 1, j));
+            dist = MIN(dist, 1. + DIST(i, j - 1));
+            // Check for transposition:
+            if (i >= 2 && j >= 2) {
+                int32_t ai_prev = Text$get_grapheme_fast(&a_state, i - 2);
+                int32_t bj_prev = Text$get_grapheme_fast(&b_state, j - 2);
+                if (ai == bj_prev && bj == ai_prev) dist = MIN(dist, 1. + DIST(i - 2, j - 2));
+            }
+            DIST(i, j) = dist;
+        }
+    }
+    double result = DIST(a.length, b.length);
+#undef DIST
+
+    return result;
+}
+
+public
Text_t Text$escaped(Text_t text, bool colorize, Text_t extra_escapes) {
Text_t ret = colorize ? Text("\x1b[35m") : EMPTY_TEXT;
#define flush_unquoted() \
diff --git a/src/stdlib/text.h b/src/stdlib/text.h
index 856b173a..7259b545 100644
--- a/src/stdlib/text.h
+++ b/src/stdlib/text.h
@@ -116,6 +116,7 @@ Int_t Text$width(Text_t text, Text_t language);
Text_t Text$left_pad(Text_t text, Int_t width, Text_t padding, Text_t language);
Text_t Text$right_pad(Text_t text, Int_t width, Text_t padding, Text_t language);
Text_t Text$middle_pad(Text_t text, Int_t width, Text_t padding, Text_t language);
+double Text$distance(Text_t a, Text_t b, Text_t language);
int32_t Text$get_grapheme_fast(TextIter_t *state, int64_t index);
uint32_t Text$get_main_grapheme_fast(TextIter_t *state, int64_t index);
Int_t Text$memory_size(Text_t text);
diff --git a/test/text.tm b/test/text.tm
index 6c23042d..8f0da922 100644
--- a/test/text.tm
+++ b/test/text.tm
@@ -208,3 +208,11 @@ func main()
assert "one two".find("two") == 5
assert "one two".find("three") == none
assert "one two".find("o", start=2) == 7
+
+
+ assert "hello".distance("hello") == 0
+ assert "hello".distance("goodbye") > 2.0
+ assert "hello".distance("hola") < "hello".distance("goodbye")
+ assert "hello".distance("Hello") <= 1.0
+ assert "hello".distance("xello") <= 1.0
+ assert "hello".distance("ehllo") <= "hello".distance("XXllo")