Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(592)

Side by Side Diff: lib/src/comparators.dart

Issue 1412293008: Add string comparators that can compare ASCII strings ignoring case. (Closed) Base URL: https://github.com/dart-lang/collection.git@master
Patch Set: Address comments. PTAL Created 5 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright (c) 2015, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file.
4
5 library dart.pkg.collection.comparators;
6
7 // Character constants.
8 const int _zero = 0x30;
9 const int _upperCaseA = 0x41;
10 const int _upperCaseZ = 0x5d;
Lasse Reichstein Nielsen 2015/11/19 07:15:37 Whoops, that should be 0x5a, and 0x7a below. Will
11 const int _lowerCaseA = 0x61;
12 const int _lowerCaseZ = 0x7D;
13 const int _asciiCaseBit = 0x20;
14
15 /// Checks if strings [a] and [b] differ only on the case of ASCII letters.
16 ///
17 /// Strings are equal if they have the same length, and the characters at
18 /// each index are the same, or they are ASCII letters where one is upper-case
19 /// and the other is the lower-case version of the same letter.
20 ///
21 /// The comparison does not ignore the case of non-ASCII letters, so
22 /// a capital ae-ligature (Æ) is different from a lower case ae-ligature (æ).
23 ///
24 /// Ignoring non-ASCII letters is not generally a good idea, but it makes sense
25 /// for situations where the strings are known to be ASCII. Examples could
26 /// be Dart identifiers, base-64 or hex encoded strings, GUIDs or similar
27 /// strings with a known structure.
28 bool equalsIgnoreAsciiCase(String a, String b) {
29 if (a.length != b.length) return false;
30 for (int i = 0; i < a.length; i++) {
31 var aChar = a.codeUnitAt(i);
32 var bChar = b.codeUnitAt(i);
33 if (aChar == bChar) continue;
34 // Quick-check for whether this may be different cases of the same letter.
35 if (aChar ^ bChar != _asciiCaseBit) return false;
36 // If it's possible, then check if either character is actually an ASCII
37 // letter.
38 int aCharUpperCase = aChar | _asciiCaseBit;
39 if (_upperCaseA <= aCharUpperCase && aCharUpperCase <= _upperCaseZ) {
40 continue;
41 }
42 return false;
43 }
44 return true;
45 }
46
47
48 /// Hash code for a string which is compatible with [equalsIgnoreAsciiCase].
49 ///
50 /// The hash code is unaffected by changing the case of ASCII letters, but
51 /// the case of non-ASCII letters do affect the result.
52 int hashIgnoreAsciiCase(String string) {
53 // Jenkins hash code ( http://en.wikipedia.org/wiki/Jenkins_hash_function).
sra1 2015/11/18 23:40:16 Jenkins hash code modified to operate on Smi value
Lasse Reichstein Nielsen 2015/11/19 07:15:37 Done.
54 // Same hash used by dart2js for strings, modified to ignore ASCII letter
55 // case.
56 int hash = 0;
57 for (int i = 0; i < string.length; i++) {
58 int char = string.codeUnitAt(i);
59 // Convert lower-case ASCII letters to upper case.upper
60 // This ensures that strings that differ only in case will have the
61 // same hash code.
62 if (_lowerCaseA <= char && char <= _lowerCaseZ) char -= _asciiCaseBit;
63 hash = 0x1fffffff & (hash + char);
64 hash = 0x1fffffff & (hash + ((0x0007ffff & hash) << 10));
65 hash >>= 6;
66 }
67 hash = 0x1fffffff & (hash + ((0x03ffffff & hash) << 3));
68 hash >>= 11;
69 return 0x1fffffff & (hash + ((0x00003fff & hash) << 15));
70 }
71
72
73 /// Compares [a] and [b] lexically, converting ASCII letters to upper case.
74 ///
75 /// Comparison treats all lower-case ASCII letters as upper-case letters,
76 /// but does no case conversion for non-ASCII letters.
77 ///
78 /// If two strings differ only on the case of ASCII letters, the one with the
79 /// capital letter at the first difference will compare as less than the other
80 /// string. This tie-breaking ensures that the comparison is a total ordering
81 /// on strings and is compatible with equality.
82 ///
83 /// Ignoring non-ASCII letters is not generally a good idea, but it makes sense
84 /// for situations where the strings are known to be ASCII. Examples could
85 /// be Dart identifiers, base-64 or hex encoded strings, GUIDs or similar
86 /// strings with a known structure.
87 int compareAsciiUpperCase(String a, String b) {
88 int defaultResult = 0; // Returned if no difference found.
89 for (int i = 0; i < a.length; i++) {
90 if (i == b.length) return 1;
91 var aChar = a.codeUnitAt(i);
92 var bChar = b.codeUnitAt(i);
93 if (aChar == bChar) continue;
94 // Upper-case if letters.
95 int aUpperCase = aChar;
96 int bUpperCase = bChar;
97 if (_lowerCaseA <= aChar && aChar <= _lowerCaseZ) {
98 aUpperCase -= _asciiCaseBit;
99 }
100 if (_lowerCaseA <= bChar && bChar <= _lowerCaseZ) {
101 bUpperCase -= _asciiCaseBit;
102 }
103 if (aUpperCase != bUpperCase) return (aUpperCase - bUpperCase).sign;
104 if (defaultResult == 0) defaultResult = (aChar - bChar);
105 }
106 if (b.length > a.length) return -1;
107 return defaultResult.sign;
108 }
109
110
111 /// Compares [a] and [b] lexically, converting ASCII letters to lower case.
112 ///
113 /// Comparison treats all upper-case ASCII letters as lower-case letters,
114 /// but does no case conversion for non-ASCII letters.
115 ///
116 /// If two strings differ only on the case of ASCII letters, the one with the
117 /// capital letter at the first difference will compare as less than the other
118 /// string. This tie-breaking ensures that the comparison is a total ordering
119 /// on strings.
120 ///
121 /// Ignoring non-ASCII letters is not generally a good idea, but it makes sense
122 /// for situations where the strings are known to be ASCII. Examples could
123 /// be Dart identifiers, base-64 or hex encoded strings, GUIDs or similar
124 /// strings with a known structure.
125 int compareAsciiLowerCase(String a, String b) {
floitsch 2015/11/18 22:50:11 How is this one different from compareAsciiUpperCa
Lasse Reichstein Nielsen 2015/11/19 07:15:37 Only in how it orders, e.g., "a" vs. "_". Since "
126 int defaultResult = 0;
127 for (int i = 0; i < a.length; i++) {
128 if (i == b.length) return 1;
129 var aChar = a.codeUnitAt(i);
130 var bChar = b.codeUnitAt(i);
131 if (aChar == bChar) continue;
132 int aLowerCase = aChar;
133 int bLowerCase = bChar;
134 // Upper case if ASCII letters.
135 if (_upperCaseA <= bChar && bChar <= _upperCaseZ) {
136 bLowerCase += _asciiCaseBit;
137 }
138 if (_upperCaseA <= aChar && aChar <= _upperCaseZ) {
139 aLowerCase += _asciiCaseBit;
140 }
141 if (aLowerCase != bLowerCase) return (aLowerCase - bLowerCase).sign;
142 if (defaultResult == 0) defaultResult = aChar - bChar;
143 }
144 if (b.length > a.length) return -1;
145 return defaultResult.sign;
146 }
147
148 /// Compares strings [a] and [b] according to [natural sort ordering].
149 ///
150 /// A natural sort ordering is a lexical ordering where embedded
151 /// numerals (digit sequeunces) are treated as a single unit and ordered by
152 /// numerical value.
153 /// This means that `"a10b"` will be ordered after `"a7b"` in natural
154 /// ordering, where leixcal ordering would put the `1` before the `7`, ignoring
floitsch 2015/11/18 22:50:11 lexical
155 /// that the `1` is part of a larger number.
156 ///
157 /// Example:
158 /// The following strings are in the order they would be sorted by using this
159 /// comparison function:
160 ///
161 /// "a", "a0", "a0b", "a1", "a01", "a9", "a10", "a100", "a100b", "aa"
162 ///
163 /// [natural sort ordering]: https://en.wikipedia.org/wiki/Natural_sort_order
164 int compareNatural(String a, String b) {
165 for (int i = 0; i < a.length; i++) {
166 if (i == b.length) return 1;
sra1 2015/11/18 23:40:16 Use if (i >= b.length) to improve constraint-b
Lasse Reichstein Nielsen 2015/11/19 07:15:37 Done.
167 var aChar = a.codeUnitAt(i);
168 var bChar = b.codeUnitAt(i);
169 if (aChar != bChar) {
170 return _compareNaturally(a, b, i, aChar, bChar);
171 }
172 }
173 if (b.length > a.length) return -1;
174 return 0;
175 }
176
177 /// Compares strings [a] and [b] according to lower-case
178 /// [natural sort ordering].
179 ///
180 /// ASCII letters are converted to lower case before being compared, like
181 /// for [compareAsciiLowerCase], then the result is compared like for
182 /// [compareNatural].
183 ///
184 /// If two strings differ only on the case of ASCII letters, the one with the
185 /// capital letter at the first difference will compare as less than the other
186 /// string. This tie-breaking ensures that the comparison is a total ordering
187 /// on strings.
188 ///
189 /// [natural sort ordering]: https://en.wikipedia.org/wiki/Natural_sort_order
190 int compareAsciiLowerCaseNatural(String a, String b) {
191 int defaultResult = 0; // Returned if no difference found.
192 for (int i = 0; i < a.length; i++) {
193 if (i == b.length) return 1;
194 var aChar = a.codeUnitAt(i);
195 var bChar = b.codeUnitAt(i);
196 if (aChar == bChar) continue;
197 int aLowerCase = aChar;
198 int bLowerCase = bChar;
199 if (_upperCaseA <= aChar && aChar <= _upperCaseZ) {
200 aLowerCase += _asciiCaseBit;
201 }
202 if (_upperCaseA <= bChar && bChar <= _upperCaseZ) {
203 bLowerCase += _asciiCaseBit;
204 }
205 if (aLowerCase != bLowerCase) {
206 return _compareNaturally(a, b, i, aLowerCase, bLowerCase);
207 }
208 if (defaultResult == 0) defaultResult = aChar - bChar;
209 }
210 if (b.length > a.length) return -1;
211 return defaultResult.sign;
212 }
213
214 /// Compares strings [a] and [b] according to upper-case
215 /// [natural sort ordering].
216 ///
217 /// ASCII letters are converted to upper case before being compared, like
218 /// for [compareAsciiUpperCase], then the result is compared like for
219 /// [compareNatural].
220 ///
221 /// If two strings differ only on the case of ASCII letters, the one with the
222 /// capital letter at the first difference will compare as less than the other
223 /// string. This tie-breaking ensures that the comparison is a total ordering
224 /// on strings
225 ///
226 /// [natural sort ordering]: https://en.wikipedia.org/wiki/Natural_sort_order
227 int compareAsciiUpperCaseNatural(String a, String b) {
floitsch 2015/11/18 22:50:11 How is this one different from compareAsciiLowerCa
Lasse Reichstein Nielsen 2015/11/19 07:15:37 Same as above, differs on six characters that lie
228 int defaultResult = 0;
229 for (int i = 0; i < a.length; i++) {
230 if (i == b.length) return 1;
231 var aChar = a.codeUnitAt(i);
232 var bChar = b.codeUnitAt(i);
233 if (aChar == bChar) continue;
234 int aUpperCase = aChar;
235 int bUpperCase = bChar;
236 if (_lowerCaseA <= aChar && aChar <= _lowerCaseZ) {
237 aUpperCase -= _asciiCaseBit;
238 }
239 if (_lowerCaseA <= bChar && bChar <= _lowerCaseZ) {
240 bUpperCase -= _asciiCaseBit;
241 }
242 if (aUpperCase != bUpperCase) {
243 return _compareNaturally(a, b, i, aUpperCase, bUpperCase);
244 }
245 if (defaultResult == 0) defaultResult = aChar - bChar;
246 }
247 if (b.length > a.length) return -1;
248 return defaultResult.sign;
249 }
250
251 /// Check for numbers overlapping the current mismatched characters.
252 ///
253 /// If both [aChar] and [bChar] are digits, use numerical comparison.
254 /// Check if the previous characters is a non-zero number, and if not,
255 /// skip - but count - leading zeros before comparing numbers.
256 ///
257 /// If one is a digit and the other isn't, check if the previous character
258 /// is a digit, and if so, the the one with the digit is the greater number.
259 ///
260 /// Otherwise just returns the difference between [aChar] and [bChar].
261 int _compareNaturally(
262 String a, String b, int index, int aChar, int bChar) {
263 assert(aChar != bChar);
264 var aIsDigit = _isDigit(aChar);
265 var bIsDigit = _isDigit(bChar);
266 if (aIsDigit) {
267 if (bIsDigit) {
268 return _compareNumerically(a, b, aChar, bChar, index);
269 } else if (index > 0 && _isDigit(a.codeUnitAt(index - 1))) {
270 // aChar is the continuation of a longer number.
271 return 1;
272 }
273 } else if (bIsDigit && index > 0 && _isDigit(b.codeUnitAt(index - 1))) {
274 // bChar is the continuation of a longer number.
275 return -1;
276 }
277 // Characters are both non-digits, or not continuation of earlier number.
278 return (aChar - bChar).sign;
279 }
280
281 /// Compare numbers overlapping [aChar] and [bChar] numerically.
282 ///
283 /// If the numbers have the same numerical value, but one has more leading
284 /// zeros, the longer number is considered greater than the shorter one.
285 ///
286 /// This ensures a total ordering on strings compatible with equality.
287 int _compareNumerically(String a, String b, int aChar, int bChar, int index) {
288 // Both are digits. Find the first significant different digit, then find
289 // the length of the numbers.
290 if (_isNumberSuffix(a, index)) {
291 // Part of a longer number, differs at this index, just count the length.
292 int result = _compareDigitCount(a, b, index, index);
293 if (result != 0) return result;
294 // If same length, the current character is the most significant differing
295 // digit.
296 return (aChar - bChar).sign;
297 }
298 // Not part of larger (non-zero) number, so skip leading zeros before
299 // comparing numbers.
300 int aIndex = index;
301 int bIndex = index;
302 if (aChar == _zero) {
303 do {
304 aIndex++;
305 if (aIndex == a.length) return -1; // number in a is zero, b is not.
306 aChar = a.codeUnitAt(aIndex);
307 } while (aChar == _zero);
308 if (!_isDigit(aChar)) return -1;
309 } else if (bChar == _zero) {
310 do {
311 bIndex++;
312 if (bIndex == b.length) return 1; // number in b is zero, a is not.
313 bChar = b.codeUnitAt(bIndex);
314 } while (bChar == _zero);
315 if (!_isDigit(bChar)) return 1;
316 }
317 if (aChar != bChar) {
318 int result = _compareDigitCount(a, b, aIndex, bIndex);
319 if (result != 0) return result;
320 return (aChar - bChar).sign;
321 }
322 // Same leading digit, one had more leading zeros.
323 // Compare digits until reaching a difference.
324 while (true) {
325 var aIsDigit = false;
326 var bIsDigit = false;
327 aChar = 0;
328 bChar = 0;
329 if (++aIndex < a.length) {
330 aChar = a.codeUnitAt(aIndex);
331 aIsDigit = _isDigit(aChar);
332 }
333 if (++bIndex < b.length) {
334 bChar = b.codeUnitAt(bIndex);
335 bIsDigit = _isDigit(bChar);
336 }
337 if (aIsDigit) {
338 if (bIsDigit) {
339 if (aChar == bChar) continue;
340 // First different digit found.
341 break;
342 }
343 // bChar is non-digit, so a has longer number.
344 return 1;
345 } else if (bIsDigit) {
346 return -1; // b has longer number.
347 } else {
348 // Neither is digit, so numbers had same numerical value.
349 // Fall back on number of leading zeros
350 // (reflected by difference in indices).
351 return (aIndex - bIndex).sign;
352 }
353 }
354 // At first differing digits.
355 int result = _compareDigitCount(a, b, aIndex, bIndex);
356 if (result != 0) return result;
357 return (aChar - bChar).sign;
358 }
359
360 /// Checks which of [a] and [b] has the longest sequence of digits.
361 ///
362 /// Starts counting from `i + 1` and `j + 1` (assumes that `a[i]` and `b[j]` are
363 /// both already known to be digits).
364 int _compareDigitCount(String a, String b, int i, int j) {
365 while (++i < a.length) {
366 bool aIsDigit = _isDigit(a.codeUnitAt(i));
367 if (++j == b.length) return aIsDigit ? 1 : 0;
368 bool bIsDigit = _isDigit(b.codeUnitAt(j));
369 if (aIsDigit) {
370 if (bIsDigit) continue;
371 return 1;
372 } else if (bIsDigit) {
373 return -1;
374 } else {
375 return 0;
376 }
377 }
378 if (++j < b.length && _isDigit(b.codeUnitAt(j))) {
379 return -1;
380 }
381 return 0;
382 }
383
384 bool _isDigit(int charCode) => (charCode ^ _zero) <= 9;
385
386 /// Check if the digit at [index] is continuing a non-zero number.
387 ///
388 /// If there is no non-zero digits before, then leading zeros at [index]
389 /// are also ignored when comparing numerically. If there is a non-zero digit
390 /// before, then zeros at [index] are significant.
391 bool _isNumberSuffix(String string, int index) {
sra1 2015/11/18 23:40:16 maybe call it _isNonZeroNumberSuffix
Lasse Reichstein Nielsen 2015/11/19 07:15:37 Done.
392 while (--index >= 0) {
393 int char = string.codeUnitAt(index);
394 if (char != _zero) return _isDigit(char);
395 }
396 return false;
397 }
OLDNEW
« no previous file with comments | « lib/collection.dart ('k') | pubspec.yaml » ('j') | test/comparators_test.dart » ('J')

Powered by Google App Engine
This is Rietveld 408576698