OLD | NEW |
| (Empty) |
1 library levenshtein; | |
2 | |
3 import 'dart:math' as math; | |
4 | |
5 /** | |
6 * The value returned by [levenshtein] if the distance is determined | |
7 * to be over the specified threshold. | |
8 */ | |
9 const int LEVENSHTEIN_MAX = 1 << 20; | |
10 | |
11 const int _MAX_VALUE = 1 << 10; | |
12 | |
13 /** | |
14 * Find the Levenshtein distance between two [String]s if it's less than or | |
15 * equal to a given threshold. | |
16 * | |
17 * This is the number of changes needed to change one String into another, | |
18 * where each change is a single character modification (deletion, insertion or | |
19 * substitution). | |
20 * | |
21 * This implementation follows from Algorithms on Strings, Trees and Sequences | |
22 * by Dan Gusfield and Chas Emerick's implementation of the Levenshtein distance | |
23 * algorithm. | |
24 */ | |
25 int levenshtein(String s, String t, int threshold, {bool caseSensitive: true}) { | |
26 if (s == null || t == null) { | |
27 throw new ArgumentError('Strings must not be null'); | |
28 } | |
29 if (threshold < 0) { | |
30 throw new ArgumentError('Threshold must not be negative'); | |
31 } | |
32 | |
33 if (!caseSensitive) { | |
34 s = s.toLowerCase(); | |
35 t = t.toLowerCase(); | |
36 } | |
37 | |
38 int s_len = s.length; | |
39 int t_len = t.length; | |
40 | |
41 // if one string is empty, | |
42 // the edit distance is necessarily the length of the other | |
43 if (s_len == 0) { | |
44 return t_len <= threshold ? t_len : LEVENSHTEIN_MAX; | |
45 } | |
46 if (t_len == 0) { | |
47 return s_len <= threshold ? s_len : LEVENSHTEIN_MAX; | |
48 } | |
49 // the distance can never be less than abs(s_len - t_len) | |
50 if ((s_len - t_len).abs() > threshold) { | |
51 return LEVENSHTEIN_MAX; | |
52 } | |
53 | |
54 // swap the two strings to consume less memory | |
55 if (s_len > t_len) { | |
56 String tmp = s; | |
57 s = t; | |
58 t = tmp; | |
59 s_len = t_len; | |
60 t_len = t.length; | |
61 } | |
62 | |
63 // 'previous' cost array, horizontally | |
64 List<int> p = new List<int>.filled(s_len + 1, 0); | |
65 // cost array, horizontally | |
66 List<int> d = new List<int>.filled(s_len + 1, 0); | |
67 // placeholder to assist in swapping p and d | |
68 List<int> _d; | |
69 | |
70 // fill in starting table values | |
71 int boundary = math.min(s_len, threshold) + 1; | |
72 for (int i = 0; i < boundary; i++) { | |
73 p[i] = i; | |
74 } | |
75 | |
76 // these fills ensure that the value above the rightmost entry of our | |
77 // stripe will be ignored in following loop iterations | |
78 _setRange(p, boundary, p.length, _MAX_VALUE); | |
79 _setRange(d, 0, d.length, _MAX_VALUE); | |
80 | |
81 // iterates through t | |
82 for (int j = 1; j <= t_len; j++) { | |
83 // jth character of t | |
84 int t_j = t.codeUnitAt(j - 1); | |
85 d[0] = j; | |
86 | |
87 // compute stripe indices, constrain to array size | |
88 int min = math.max(1, j - threshold); | |
89 int max = math.min(s_len, j + threshold); | |
90 | |
91 // the stripe may lead off of the table if s and t are of different sizes | |
92 if (min > max) { | |
93 return LEVENSHTEIN_MAX; | |
94 } | |
95 | |
96 // ignore entry left of leftmost | |
97 if (min > 1) { | |
98 d[min - 1] = _MAX_VALUE; | |
99 } | |
100 | |
101 // iterates through [min, max] in s | |
102 for (int i = min; i <= max; i++) { | |
103 if (s.codeUnitAt(i - 1) == t_j) { | |
104 // diagonally left and up | |
105 d[i] = p[i - 1]; | |
106 } else { | |
107 // 1 + minimum of cell to the left, to the top, diagonally left and up | |
108 d[i] = 1 + math.min(math.min(d[i - 1], p[i]), p[i - 1]); | |
109 } | |
110 } | |
111 | |
112 // copy current distance counts to 'previous row' distance counts | |
113 _d = p; | |
114 p = d; | |
115 d = _d; | |
116 } | |
117 | |
118 // if p[n] is greater than the threshold, | |
119 // there's no guarantee on it being the correct distance | |
120 if (p[s_len] <= threshold) { | |
121 return p[s_len]; | |
122 } | |
123 | |
124 return LEVENSHTEIN_MAX; | |
125 } | |
126 | |
127 void _setRange(List<int> a, int start, int end, int value) { | |
128 for (int i = start; i < end; i++) { | |
129 a[i] = value; | |
130 } | |
131 } | |
OLD | NEW |