From 2b7e96835e75e0d153e7f993d1c4fc2add452ddd Mon Sep 17 00:00:00 2001 From: Bruce Hill Date: Sun, 8 Feb 2026 22:47:02 -0500 Subject: Added Text.distance(a,b) for text similarity comparisons. --- CHANGES.md | 4 +++ api/api.md | 27 ++++++++++++++++++ api/text.md | 27 ++++++++++++++++++ api/text.yaml | 33 ++++++++++++++++++++++ man/man3/tomo-Text.3 | 12 ++++++-- man/man3/tomo-Text.distance.3 | 43 +++++++++++++++++++++++++++++ src/environment.c | 1 + src/stdlib/text.c | 64 +++++++++++++++++++++++++++++++++++++++++++ src/stdlib/text.h | 1 + test/text.tm | 8 ++++++ 10 files changed, 218 insertions(+), 2 deletions(-) create mode 100644 man/man3/tomo-Text.distance.3 diff --git a/CHANGES.md b/CHANGES.md index 856abfb7..d27c60c4 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,9 @@ # Version History +## v2026-02-08 + +- Added `Text.distance(a,b)` for calculating text distances. + ## v2025-12-31 - Added support for `123.foo()` parsing the same as `(123).foo()` diff --git a/api/api.md b/api/api.md index 9e7c553e..0a2213cb 100644 --- a/api/api.md +++ b/api/api.md @@ -3965,6 +3965,33 @@ assert "Amélie".codepoint_names() == [ "LATIN SMALL LETTER E", ] +``` +## Text.distance + +```tomo +Text.distance : func(a: Text, b: Text, language: Text = "C" -> Num) +``` + +Get an approximate distance between two texts, such that when the distance is small, the texts are similar and when the distance is large, the texts are dissimilar. + +The exact distance algorithm is not specified and may be subject to change over time. + +Argument | Type | Description | Default +---------|------|-------------|--------- +a | `Text` | The first text to compare. | - +b | `Text` | The second text to compare. | - +language | `Text` | The ISO 639 language code for which character width to use. | `"C"` + +**Return:** The distance between the two texts (larger means more dissimilar). + + +**Example:** +```tomo +assert "hello".distance("hello") == 0 +texts := &["goodbye", "hello", "hallo"] +texts.sort(func(a,b:&Text) a.distance("hello") <> b.distance("hello")) +assert texts == ["hello", "hallo", "goodbye"] + ``` ## Text.ends_with diff --git a/api/text.md b/api/text.md index 928cb6ec..2536ff21 100644 --- a/api/text.md +++ b/api/text.md @@ -179,6 +179,33 @@ assert "Amélie".codepoint_names() == [ "LATIN SMALL LETTER E", ] +``` +## Text.distance + +```tomo +Text.distance : func(a: Text, b: Text, language: Text = "C" -> Num) +``` + +Get an approximate distance between two texts, such that when the distance is small, the texts are similar and when the distance is large, the texts are dissimilar. + +The exact distance algorithm is not specified and may be subject to change over time. + +Argument | Type | Description | Default +---------|------|-------------|--------- +a | `Text` | The first text to compare. | - +b | `Text` | The second text to compare. | - +language | `Text` | The ISO 639 language code for which character width to use. | `"C"` + +**Return:** The distance between the two texts (larger means more dissimilar). + + +**Example:** +```tomo +assert "hello".distance("hello") == 0 +texts := &["goodbye", "hello", "hallo"] +texts.sort(func(a,b:&Text) a.distance("hello") <> b.distance("hello")) +assert texts == ["hello", "hallo", "goodbye"] + ``` ## Text.ends_with diff --git a/api/text.yaml b/api/text.yaml index 6874bfc8..2af7cae4 100644 --- a/api/text.yaml +++ b/api/text.yaml @@ -225,6 +225,39 @@ Text.ends_with: assert "hello world".ends_with("world", &remainder) == yes assert remainder == "hello " +Text.distance: + short: distance between two texts + description: > + Get an approximate distance between two texts, such that when the distance + is small, the texts are similar and when the distance is large, the texts + are dissimilar. + note: > + The exact distance algorithm is not specified and may be subject to change + over time. + return: + type: 'Num' + description: > + The distance between the two texts (larger means more dissimilar). + args: + a: + type: 'Text' + description: > + The first text to compare. + b: + type: 'Text' + description: > + The second text to compare. + language: + type: 'Text' + default: '"C"' + description: > + The ISO 639 language code for which character width to use. + example: | + assert "hello".distance("hello") == 0 + texts := &["goodbye", "hello", "hallo"] + texts.sort(func(a,b:&Text) a.distance("hello") <> b.distance("hello")) + assert texts == ["hello", "hallo", "goodbye"] + Text.find: short: find a substring description: > diff --git a/man/man3/tomo-Text.3 b/man/man3/tomo-Text.3 index 634e3e0a..10032155 100644 --- a/man/man3/tomo-Text.3 +++ b/man/man3/tomo-Text.3 @@ -1,8 +1,8 @@ '\" t -.\" Copyright (c) 2025 Bruce Hill +.\" Copyright (c) 2026 Bruce Hill .\" All rights reserved. .\" -.TH Text 3 2025-11-29 "Tomo man-pages" +.TH Text 3 2026-02-08 "Tomo man-pages" .SH NAME Text \- a Tomo type .SH LIBRARY @@ -66,6 +66,14 @@ For more, see: .BR Tomo-Text.codepoint_names (3) +.TP +.BI Text.distance\ :\ func(a:\ Text,\ b:\ Text,\ language:\ Text\ =\ "C"\ ->\ Num) +Get an approximate distance between two texts, such that when the distance is small, the texts are similar and when the distance is large, the texts are dissimilar. + +For more, see: +.BR Tomo-Text.distance (3) + + .TP .BI Text.ends_with\ :\ func(text:\ Text,\ suffix:\ Text,\ remainder:\ &Text?\ =\ none\ ->\ Bool) Checks if the \fBText\fR ends with a literal suffix text. diff --git a/man/man3/tomo-Text.distance.3 b/man/man3/tomo-Text.distance.3 new file mode 100644 index 00000000..fe516e48 --- /dev/null +++ b/man/man3/tomo-Text.distance.3 @@ -0,0 +1,43 @@ +'\" t +.\" Copyright (c) 2026 Bruce Hill +.\" All rights reserved. +.\" +.TH Text.distance 3 2026-02-08 "Tomo man-pages" +.SH NAME +Text.distance \- distance between two texts +.SH LIBRARY +Tomo Standard Library +.SH SYNOPSIS +.nf +.BI Text.distance\ :\ func(a:\ Text,\ b:\ Text,\ language:\ Text\ =\ "C"\ ->\ Num) +.fi +.SH DESCRIPTION +Get an approximate distance between two texts, such that when the distance is small, the texts are similar and when the distance is large, the texts are dissimilar. + + +.SH ARGUMENTS + +.TS +allbox; +lb lb lbx lb +l l l l. +Name Type Description Default +a Text The first text to compare. - +b Text The second text to compare. - +language Text The ISO 639 language code for which character width to use. "C" +.TE +.SH RETURN +The distance between the two texts (larger means more dissimilar). + +.SH NOTES +The exact distance algorithm is not specified and may be subject to change over time. + +.SH EXAMPLES +.EX +assert "hello".distance("hello") == 0 +texts := &["goodbye", "hello", "hallo"] +texts.sort(func(a,b:&Text) a.distance("hello") <> b.distance("hello")) +assert texts == ["hello", "hallo", "goodbye"] +.EE +.SH SEE ALSO +.BR Tomo-Text (3) diff --git a/src/environment.c b/src/environment.c index eb2c275c..d209471e 100644 --- a/src/environment.c +++ b/src/environment.c @@ -357,6 +357,7 @@ env_t *global_env(bool source_mapping) { {"by_split_any", "Text$by_split_any", "func(text:Text, delimiters=' \\t\\r\\n' -> func(->Text?))"}, // {"caseless_equals", "Text$equal_ignoring_case", "func(a,b:Text, language='C' -> Bool)"}, // {"codepoint_names", "Text$codepoint_names", "func(text:Text -> [Text])"}, // + {"distance", "Text$distance", "func(a,b:Text, language='C' -> Num)"}, // {"ends_with", "Text$ends_with", "func(text,suffix:Text, remainder:&Text? = none -> Bool)"}, // {"find", "Text$find", "func(text,target:Text, start=1 -> Int?)"}, // {"from", "Text$from", "func(text:Text, first:Int -> Text)"}, // diff --git a/src/stdlib/text.c b/src/stdlib/text.c index 4bf6d999..117b4a8d 100644 --- a/src/stdlib/text.c +++ b/src/stdlib/text.c @@ -1426,6 +1426,70 @@ Text_t Text$title(Text_t text, Text_t language) { return ret; } +public +double Text$distance(Text_t a, Text_t b, Text_t language) { + if (a.length == 0) return (double)b.length; + if (b.length == 0) return (double)a.length; + + // The current implementation of text distance uses a modified form + // of Damerau–Levenshtein distance that gives slightly lower distances + // for letters with the same main grapheme and slightly lower distances + // for letters that are the same, but with different casing. + double *distances = GC_MALLOC_ATOMIC(sizeof(uint32_t[a.length][b.length])); +#define DIST(x, y) distances[(x) * b.length + (y)] + for (int64_t i = 0; i <= a.length; i++) + DIST(i, 0) = i; + for (int64_t j = 0; j <= b.length; j++) + DIST(0, j) = j; + + TextIter_t a_state = NEW_TEXT_ITER_STATE(a); + TextIter_t b_state = NEW_TEXT_ITER_STATE(b); + const char *uc_language = Text$as_c_string(language); + for (int64_t i = 1; i <= a.length; i++) { + for (int64_t j = 1; j <= b.length; j++) { + int32_t ai = Text$get_grapheme_fast(&a_state, i - 1); + int32_t bi = Text$get_grapheme_fast(&b_state, i - 1); + if (ai == bi) { + DIST(i, j) = DIST(i - 1, j - 1); + } else { + ucs4_t main_ai = (ai) >= 0 ? (ucs4_t)ai : synthetic_graphemes[-(ai)-1].main_codepoint; + ucs4_t main_bi = (bi) >= 0 ? (ucs4_t)bi : synthetic_graphemes[-(bi)-1].main_codepoint; + if (main_ai == main_bi) { + // Same main grapheme (different modifiers) + DIST(i, j) = 0.25 + DIST(i - 1, j - 1); + } else { + int cmp; + (void)u32_casecmp(&main_ai, 1, &main_bi, 1, uc_language, UNINORM_NFC, &cmp); + if (cmp == 0) { + // Same main grapheme, different casing (e.g. "a" vs "A") + DIST(i, j) = 0.5 + DIST(i - 1, j - 1); + } else { + // Different main grapheme + double insertion = 1. + DIST(i - 1, j); + double deletion = 1. + DIST(i, j - 1); + double dist = MIN(insertion, deletion); + double substitution = 1. + DIST(i - 1, j - 1); + dist = MIN(dist, substitution); + // Check for transposition: + if (i >= 2 && j >= 2) { + int32_t ai_prev = Text$get_grapheme_fast(&a_state, i - 2); + int32_t bi_prev = Text$get_grapheme_fast(&b_state, i - 2); + if (ai == bi_prev && bi == ai_prev) { + double transposition = 1. + DIST(i - 2, j - 2); + dist = MIN(dist, transposition); + } + } + DIST(i, j) = dist; + } + } + } + } + } +#undef DIST + + return (double)distances[a.length * b.length + b.length]; +} + public Text_t Text$escaped(Text_t text, bool colorize, Text_t extra_escapes) { Text_t ret = colorize ? Text("\x1b[35m") : EMPTY_TEXT; diff --git a/src/stdlib/text.h b/src/stdlib/text.h index 856b173a..7259b545 100644 --- a/src/stdlib/text.h +++ b/src/stdlib/text.h @@ -116,6 +116,7 @@ Int_t Text$width(Text_t text, Text_t language); Text_t Text$left_pad(Text_t text, Int_t width, Text_t padding, Text_t language); Text_t Text$right_pad(Text_t text, Int_t width, Text_t padding, Text_t language); Text_t Text$middle_pad(Text_t text, Int_t width, Text_t padding, Text_t language); +double Text$distance(Text_t a, Text_t b, Text_t language); int32_t Text$get_grapheme_fast(TextIter_t *state, int64_t index); uint32_t Text$get_main_grapheme_fast(TextIter_t *state, int64_t index); Int_t Text$memory_size(Text_t text); diff --git a/test/text.tm b/test/text.tm index 6c23042d..8f0da922 100644 --- a/test/text.tm +++ b/test/text.tm @@ -208,3 +208,11 @@ func main() assert "one two".find("two") == 5 assert "one two".find("three") == none assert "one two".find("o", start=2) == 7 + + + assert "hello".distance("hello") == 0 + assert "hello".distance("goodbye") > 2.0 + assert "hello".distance("hola") < "hello".distance("goodbye") + assert "hello".distance("Hello") <= 1.0 + assert "hello".distance("xello") <= 1.0 + assert "hello".distance("ehllo") <= "hello".distance("XXllo") -- cgit v1.2.3