aboutsummaryrefslogtreecommitdiff
path: root/src/stdlib
diff options
context:
space:
mode:
authorBruce Hill <bruce@bruce-hill.com>2026-02-08 23:42:27 -0500
committerBruce Hill <bruce@bruce-hill.com>2026-02-08 23:42:27 -0500
commit4e31786efcf32818a620cc87ffa80dadef90a466 (patch)
tree13c105b061230e1207d1415ef8089056abb627b3 /src/stdlib
parent2b7e96835e75e0d153e7f993d1c4fc2add452ddd (diff)
Bugfixes for Text.distance()
Diffstat (limited to 'src/stdlib')
-rw-r--r--src/stdlib/text.c59
1 files changed, 27 insertions, 32 deletions
diff --git a/src/stdlib/text.c b/src/stdlib/text.c
index 117b4a8d..3725bab1 100644
--- a/src/stdlib/text.c
+++ b/src/stdlib/text.c
@@ -1428,14 +1428,14 @@ Text_t Text$title(Text_t text, Text_t language) {
public
double Text$distance(Text_t a, Text_t b, Text_t language) {
- if (a.length == 0) return (double)b.length;
- if (b.length == 0) return (double)a.length;
+ if (a.length <= 0) return (double)b.length;
+ if (b.length <= 0) return (double)a.length;
// The current implementation of text distance uses a modified form
// of Damerau–Levenshtein distance that gives slightly lower distances
// for letters with the same main grapheme and slightly lower distances
// for letters that are the same, but with different casing.
- double *distances = GC_MALLOC_ATOMIC(sizeof(uint32_t[a.length][b.length]));
+ double *distances = GC_MALLOC_ATOMIC(sizeof(uint32_t) * (a.length + 1) * (b.length + 1));
#define DIST(x, y) distances[(x) * b.length + (y)]
for (int64_t i = 0; i <= a.length; i++)
DIST(i, 0) = i;
@@ -1448,41 +1448,36 @@ double Text$distance(Text_t a, Text_t b, Text_t language) {
for (int64_t i = 1; i <= a.length; i++) {
for (int64_t j = 1; j <= b.length; j++) {
int32_t ai = Text$get_grapheme_fast(&a_state, i - 1);
- int32_t bi = Text$get_grapheme_fast(&b_state, i - 1);
- if (ai == bi) {
- DIST(i, j) = DIST(i - 1, j - 1);
- } else {
+ int32_t bj = Text$get_grapheme_fast(&b_state, j - 1);
+ double cost = (double)(ai != bj);
+ if (cost > 0) {
ucs4_t main_ai = (ai) >= 0 ? (ucs4_t)ai : synthetic_graphemes[-(ai)-1].main_codepoint;
- ucs4_t main_bi = (bi) >= 0 ? (ucs4_t)bi : synthetic_graphemes[-(bi)-1].main_codepoint;
- if (main_ai == main_bi) {
+ ucs4_t main_bj = (bj) >= 0 ? (ucs4_t)bj : synthetic_graphemes[-(bj)-1].main_codepoint;
+ if (main_ai == main_bj) {
// Same main grapheme (different modifiers)
- DIST(i, j) = 0.25 + DIST(i - 1, j - 1);
+ cost = 0.25;
} else {
int cmp;
- (void)u32_casecmp(&main_ai, 1, &main_bi, 1, uc_language, UNINORM_NFC, &cmp);
- if (cmp == 0) {
- // Same main grapheme, different casing (e.g. "a" vs "A")
- DIST(i, j) = 0.5 + DIST(i - 1, j - 1);
- } else {
- // Different main grapheme
- double insertion = 1. + DIST(i - 1, j);
- double deletion = 1. + DIST(i, j - 1);
- double dist = MIN(insertion, deletion);
- double substitution = 1. + DIST(i - 1, j - 1);
- dist = MIN(dist, substitution);
- // Check for transposition:
- if (i >= 2 && j >= 2) {
- int32_t ai_prev = Text$get_grapheme_fast(&a_state, i - 2);
- int32_t bi_prev = Text$get_grapheme_fast(&b_state, i - 2);
- if (ai == bi_prev && bi == ai_prev) {
- double transposition = 1. + DIST(i - 2, j - 2);
- dist = MIN(dist, transposition);
- }
- }
- DIST(i, j) = dist;
- }
+ (void)u32_casecmp(&main_ai, 1, &main_bj, 1, uc_language, UNINORM_NFC, &cmp);
+ if (cmp == 0) cost = 0.5;
+ }
+ }
+
+ double insertion = 1. + DIST(i - 1, j);
+ double deletion = 1. + DIST(i, j - 1);
+ double dist = MIN(insertion, deletion);
+ double substitution = cost + DIST(i - 1, j - 1);
+ dist = MIN(dist, substitution);
+ // Check for transposition:
+ if (i >= 2 && j >= 2 && cost != 0) {
+ int32_t ai_prev = Text$get_grapheme_fast(&a_state, i - 2);
+ int32_t bi_prev = Text$get_grapheme_fast(&b_state, i - 2);
+ if (ai == bi_prev && bj == ai_prev) {
+ double transposition = cost + DIST(i - 2, j - 2);
+ dist = MIN(dist, transposition);
}
}
+ DIST(i, j) = dist;
}
}
#undef DIST