Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1163)

Side by Side Diff: lib/src/comparators.dart

Issue 1412293008: Add string comparators that can compare ASCII strings ignoring case. (Closed) Base URL: https://github.com/dart-lang/collection.git@master
Patch Set: Address comments. Created 5 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright (c) 2015, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file.
4
5 library dart.pkg.collection.comparators;
6
7 /// Checks if strings [a] and [b] differ only on the case of ASCII letters.
8 ///
9 /// Strings are equal if they have the same length, and the characters at
10 /// each index are the same, or they are ASCII letters where one is upper-case
11 /// and the other is the lower-case version of the same letter.
12 ///
13 /// The comparison does not ignore the case of non-ASCII letters, so
14 /// a capital ae-ligature (Æ) is different from a lower case ae-ligature (æ).
15 ///
16 /// Ignoring non-ASCII letters is not generally a good idea, but it makes sense
17 /// for situations where the strings are known to be ASCII. Examples could
18 /// be Dart identifiers, base-64 or hex encoded strings, GUIDs or similar
19 /// strings with a known structure.
20 bool equalsIgnoreAsciiCase(String a, String b) {
21 if (a.length != b.length) return false;
22 for (int i = 0; i < a.length; i++) {
23 var aChar = a.codeUnitAt(i);
24 var bChar = b.codeUnitAt(i);
25 if (aChar == bChar) continue;
26 const int upperCaseA = 0x41;
sra1 2015/11/13 18:25:40 Hoist these common constants to top level library
Lasse Reichstein Nielsen 2015/11/18 12:12:47 Done.
27 const int upperCaseZ = 0x5d;
28 if (aChar ^ bChar != 0x20) return false;
Bob Nystrom 2015/11/13 16:55:36 Document this. This is because upper/lowercase ASC
Lasse Reichstein Nielsen 2015/11/18 12:12:47 Yes, this is a quick bailout. If the characters di
29 if (upperCaseA <= aChar && aChar <= upperCaseZ) continue;
30 if (upperCaseA <= bChar && bChar <= upperCaseZ) continue;
31 return false;
32 }
33 return true;
34 }
35
36 /// Hash code for a string which is compatible with [equalsIgnoreAsciiCase].
37 ///
38 /// The hash code is unaffected by changing the case of ASCII letters, but
39 /// the case of non-ASCII letters do affect the result.
40 int hashIgnoreAsciiCase(String string) {
41 int hash = 0;
42 for (int i = 0; i < string.length; i++) {
43 int char = string.codeUnitAt(i);
44 const int lowerCaseA = 0x61;
45 const int lowerCaseZ = 0x7D;
46 if (lowerCaseA <= char && char <= lowerCaseZ) char -= 0x20;
Bob Nystrom 2015/11/13 16:55:36 Document this.
Lasse Reichstein Nielsen 2015/11/18 12:12:47 Done.
47 hash = 0x1fffffff & (hash + char);
48 hash = 0x1fffffff & (hash + ((0x0007ffff & hash) << 10));
49 hash >>= 6;
50 }
51 hash = 0x1fffffff & (hash + ((0x03ffffff & hash) << 3));
52 hash >>= 11;
53 return 0x1fffffff & (hash + ((0x00003fff & hash) << 15));
Bob Nystrom 2015/11/13 16:55:36 It would be good to document a reference for where
Lasse Reichstein Nielsen 2015/11/18 12:12:47 It's Jenkins, stole the code from the actual strin
54 }
55
56 /// Compares [a] and [b] lexically, converting ASCII letters to upper case.
57 ///
58 /// Comparison treats all lower-case ASCII letters as their upper-case letter,
59 /// but does no case conversion for non-ASCII letters.
60 ///
61 /// If two strings differ only on the case of ASCII letters, the one with the
62 /// capital letter at the first difference will compare as less than the other
63 /// string. This tie-breaking ensures that the comparison is a total ordering
64 /// on strings.
65 ///
66 /// Ignoring non-ASCII letters is not generally a good idea, but it makes sense
67 /// for situations where the strings are known to be ASCII. Examples could
68 /// be Dart identifiers, base-64 or hex encoded strings, GUIDs or similar
69 /// strings with a known structure.
70 int compareAsciiUpperCase(String a, String b) {
71 int defaultResult = 0; // Returned if no difference found.
72 for (int i = 0; i < a.length; i++) {
73 if (i == b.length) return 1;
74 var aChar = a.codeUnitAt(i);
75 var bChar = b.codeUnitAt(i);
76 if (aChar == bChar) continue;
77 const int lowerCaseA = 0x61;
78 const int lowerCaseZ = 0x7d;
79 // Lower case if letters.
80 int aUpperCase = aChar;
81 int bUpperCase = bChar;
82 if (lowerCaseA <= aChar && aChar <= lowerCaseZ) aUpperCase -= 0x20;
83 if (lowerCaseA <= bChar && bChar <= lowerCaseZ) bUpperCase -= 0x20;
Bob Nystrom 2015/11/13 16:55:36 You do this in a few places. I wonder if it's wort
Lasse Reichstein Nielsen 2015/11/18 12:12:47 I could, but only if I'm sure it will get inlined.
84 if (aUpperCase != bUpperCase) return (aUpperCase - bUpperCase).sign;
85 if (defaultResult == 0) defaultResult = (aChar - bChar);
86 }
87 if (b.length > a.length) return -1;
88 return defaultResult.sign;
89 }
90
91 /// Compares [a] and [b] lexically, converting ASCII letters to lower case.
92 ///
93 /// Comparison treats all upper-case ASCII letters as lower-case letter,
Bob Nystrom 2015/11/13 16:55:36 "letter" -> "letters"
Lasse Reichstein Nielsen 2015/11/18 12:12:47 Done.
94 /// but does no case conversion for non-ASCII letters.
95 ///
96 /// If two strings differ only on the case of ASCII letters, the one with the
97 /// capital letter at the first difference will compare as less than the other
98 /// string. This tie-breaking ensures that the comparison is a total ordering
99 /// on strings.
100 ///
101 /// Ignoring non-ASCII letters is not generally a good idea, but it makes sense
102 /// for situations where the strings are known to be ASCII. Examples could
103 /// be Dart identifiers, base-64 or hex encoded strings, GUIDs or similar
104 /// strings with a known structure.
105 int compareAsciiLowerCase(String a, String b) {
106 int defaultResult = 0;
107 for (int i = 0; i < a.length; i++) {
108 if (i == b.length) return 1;
109 var aChar = a.codeUnitAt(i);
110 var bChar = b.codeUnitAt(i);
111 if (aChar == bChar) continue;
112 const int upperCaseA = 0x41;
113 const int upperCaseZ = 0x5d;
114 int aLowerCase = aChar;
115 int bLowerCase = bChar;
116 // Upper case if ASCII letters.
117 if (upperCaseA <= aChar && aChar <= upperCaseZ) aLowerCase += 0x20;
118 if (upperCaseA <= bChar && bChar <= upperCaseZ) bLowerCase += 0x20;
119 if (aLowerCase != bLowerCase) return (aLowerCase - bLowerCase).sign;
120 if (defaultResult == 0) defaultResult = aChar - bChar;
121 }
122 if (b.length > a.length) return -1;
123 return defaultResult.sign;
124 }
125
126 /// Compares strings [a] and [b] according to natural sort ordering.
127 ///
128 /// A traditional natural sort ordering is a lexical ordering where embedded
129 /// numerals (digit sequeunces) are treated as a single unit and ordered by
130 /// numerical value.
131 /// This means that `"a10b"` will be ordered after `"a7b"` in natural
132 /// ordering, where leixcal ordering would put the `1` before the `7`, ignoring
133 /// that the `1` is part of a larger number.
134 /// (See: [Natural sort order](https://en.wikipedia.org/wiki/Natural_sort_order) )
Bob Nystrom 2015/11/13 16:55:36 Long line. I'd probably do: /// Compares strings
Lasse Reichstein Nielsen 2015/11/18 12:12:47 Done.
135 ///
136 /// The ordering of this function is not pure numerical ordering on embedded
137 /// numerals, which would ignore leading zeros. Instead it orders the numerals
138 /// by length first and then by numerical value if they have the same length.
139 /// This is indistinguishable from natural sort ordering if either all numerals
140 /// are zero padded, or if none of them are.
141 ///
142 /// Example:
143 /// The following strings are in the order they would be sorted by using this
144 /// comparison function:
145 ///
146 /// "a", "a0", "a0b", "a1", "a9", "a01", "a10", "a100", "a100b", "aa"
sra1 2015/11/13 18:25:40 I find it odd that a01 is after a9. This is neithe
Lasse Reichstein Nielsen 2015/11/18 12:12:47 Good point. It's simpler to explain "numerical ord
147 int compareNatural(String a, String b) {
148 for (int i = 0; i < a.length; i++) {
149 if (i == b.length) return 1;
150 var aChar = a.codeUnitAt(i);
151 var bChar = b.codeUnitAt(i);
152 if (aChar != bChar) {
153 return _compareDigitsLexicographically(a, b, i, aChar, bChar);
154 }
155 }
156 if (b.length > a.length) return -1;
157 return 0;
158 }
159
160 /// Compares strings [a] and [b] according to lower-case natural sort ordering.
161 ///
162 /// ASCII letters are converted to lower case before being compared, like
163 /// for [compareAsciiLowerCase], then the result is compared like for
164 /// [compareNatural].
165 ///
166 /// If two strings differ only on the case of ASCII letters, the one with the
167 /// capital letter at the first difference will compare as less than the other
168 /// string. This tie-breaking ensures that the comparison is a total ordering
169 /// on strings.
170 int compareAsciiLowerCaseNatural(String a, String b) {
171 int defaultResult = 0; // Returned if no difference found.
172 for (int i = 0; i < a.length; i++) {
173 if (i == b.length) return 1;
174 var aChar = a.codeUnitAt(i);
175 var bChar = b.codeUnitAt(i);
176 if (aChar == bChar) continue;
177 const int upperCaseA = 0x41;
178 const int upperCaseZ = 0x5d;
179 int aLowerCase = aChar;
180 int bLowerCase = bChar;
181 if (upperCaseA <= aChar && aChar <= upperCaseZ) aLowerCase += 0x20;
182 if (upperCaseA <= bChar && bChar <= upperCaseZ) bLowerCase += 0x20;
183 if (aLowerCase != bLowerCase) {
184 return _compareDigitsLexicographically(a, b, i, aLowerCase, bLowerCase);
185 }
186 if (defaultResult == 0) defaultResult = aChar - bChar;
187 }
188 if (b.length > a.length) return -1;
189 return defaultResult.sign;
190 }
191
192 /// Compares strings [a] and [b] according to upper-case natural sort ordering.
193 ///
194 /// ASCII letters are converted to upper case before being compared, like
195 /// for [compareAsciiUpperCase], then the result is compared like for
196 /// [compareNatural].
197 ///
198 /// If two strings differ only on the case of ASCII letters, the one with the
199 /// capital letter at the first difference will compare as less than the other
200 /// string. This tie-breaking ensures that the comparison is a total ordering
201 /// on strings
202 int compareAsciiUpperCaseNatural(String a, String b) {
203 int defaultResult = 0;
204 for (int i = 0; i < a.length; i++) {
205 if (i == b.length) return 1;
206 var aChar = a.codeUnitAt(i);
207 var bChar = b.codeUnitAt(i);
208 if (aChar == bChar) continue;
209 const int lowerCaseA = 0x61;
210 const int lowerCaseZ = 0x7d;
211 int aUpperCase = aChar;
212 int bUpperCase = bChar;
213 if (lowerCaseA <= aChar && aChar <= lowerCaseZ) aUpperCase -= 0x20;
214 if (lowerCaseA <= bChar && bChar <= lowerCaseZ) bUpperCase -= 0x20;
215 if (aUpperCase != bUpperCase) {
216 return _compareDigitsLexicographically(a, b, i, aUpperCase, bUpperCase);
217 }
218 if (defaultResult == 0) defaultResult = aChar - bChar;
219 }
220 if (b.length > a.length) return -1;
221 return defaultResult.sign;
222 }
223
224 /// Check for numbers overlapping the current mismatched characters.
225 ///
226 /// If either [aChar] or [bChar] or both are digits we check if one is part of
227 /// a longer digits sequence than the other,
228 /// and let the longer sequence compare as greater.
229 ///
230 /// Otherwise just returns the difference between [aChar] and [bChar].
231 int _compareDigitsLexicographically(
232 String a, String b, int index, int aChar, int bChar) {
233 assert(aChar != bChar);
234 var aIsDigit = _isDigit(aChar);
235 var bIsDigit = _isDigit(bChar);
236 if (aIsDigit) {
237 if (bIsDigit) {
238 int lengthCompare = _compareDigitCount(a, b, index);
239 if (lengthCompare != 0) return lengthCompare;
240 } else if (index > 0 && _isDigit(a.codeUnitAt(index - 1))) {
241 // aChar is the continuation of a longer number.
242 return 1;
243 }
244 } else if (bIsDigit && index > 0 && _isDigit(b.codeUnitAt(index - 1))) {
245 // bChar is the continuation of a longer number.
246 return -1;
247 }
248 // Characters are both non-digits.
249 return (aChar - bChar).sign;
250 }
251
252 /// Checks which of [a] and [b] has the longest sequence of digits.
253 ///
254 /// Starts counting from `i + 1` (assumes that `a[i]` and `b[i]` are both
255 /// already known to be digits).
256 int _compareDigitCount(String a, String b, int i) {
257 while (++i < a.length) {
258 bool aIsDigit = _isDigit(a.codeUnitAt(i));
259 if (i == b.length) return aIsDigit ? 1 : 0;
260 bool bIsDigit = _isDigit(b.codeUnitAt(i));
261 if (aIsDigit) {
262 if (bIsDigit) continue;
263 return 1;
264 } else if (bIsDigit) {
265 return -1;
266 } else {
267 return 0;
268 }
269 }
270 if (i < b.length && _isDigit(b.codeUnitAt(i))) {
271 return -1;
272 }
273 return 0;
274 }
275
276 bool _isDigit(int charCode) => (charCode ^ 0x30) <= 9;
OLDNEW
« CHANGELOG.md ('K') | « lib/collection.dart ('k') | pubspec.yaml » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698