lib/src/comparators.dart - Issue 1412293008: Add string comparators that can compare ASCII strings ignoring case.

Side by Side Diff: lib/src/comparators.dart

Issue 1412293008: Add string comparators that can compare ASCII strings ignoring case. (Closed) Base URL: https://github.com/dart-lang/collection.git@master

Patch Set: Address comments. Created 5 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 // Copyright (c) 2015, the Dart project authors. Please see the AUTHORS file

	2 // for details. All rights reserved. Use of this source code is governed by a

	3 // BSD-style license that can be found in the LICENSE file.

	4

	5 library dart.pkg.collection.comparators;

	6

	7 /// Checks if strings [a] and [b] differ only on the case of ASCII letters.

	8 ///

	9 /// Strings are equal if they have the same length, and the characters at

	10 /// each index are the same, or they are ASCII letters where one is upper-case

	11 /// and the other is the lower-case version of the same letter.

	12 ///

	13 /// The comparison does not ignore the case of non-ASCII letters, so

	14 /// a capital ae-ligature (Æ) is different from a lower case ae-ligature (æ).

	15 ///

	16 /// Ignoring non-ASCII letters is not generally a good idea, but it makes sense

	17 /// for situations where the strings are known to be ASCII. Examples could

	18 /// be Dart identifiers, base-64 or hex encoded strings, GUIDs or similar

	19 /// strings with a known structure.

	20 bool equalsIgnoreAsciiCase(String a, String b) {

	21 if (a.length != b.length) return false;

	22 for (int i = 0; i < a.length; i++) {

	23 var aChar = a.codeUnitAt(i);

	24 var bChar = b.codeUnitAt(i);

	25 if (aChar == bChar) continue;

	26 const int upperCaseA = 0x41;
	sra1 2015/11/13 18:25:40 Hoist these common constants to top level library Hoist these common constants to top level library private constants. Naming 0x20 as _asciiCaseBit would 'document' the trick in one place. Lasse Reichstein Nielsen 2015/11/18 12:12:47 Done. Show quoted text On 2015/11/13 18:25:40, sra1 wrote: > Hoist these common constants to top level library private constants. > > Naming 0x20 as _asciiCaseBit would 'document' the trick in one place. Done.
	27 const int upperCaseZ = 0x5d;

	28 if (aChar ^ bChar != 0x20) return false;
	Bob Nystrom 2015/11/13 16:55:36 Document this. This is because upper/lowercase ASC Document this. This is because upper/lowercase ASCII letters only differ by their sixth bit, right? Lasse Reichstein Nielsen 2015/11/18 12:12:47 Yes, this is a quick bailout. If the characters di Yes, this is a quick bailout. If the characters differ by anything but that bit, it's not the same character with different case.
	29 if (upperCaseA <= aChar && aChar <= upperCaseZ) continue;

	30 if (upperCaseA <= bChar && bChar <= upperCaseZ) continue;

	31 return false;

	32 }

	33 return true;

	34 }

	35

	36 /// Hash code for a string which is compatible with [equalsIgnoreAsciiCase].

	37 ///

	38 /// The hash code is unaffected by changing the case of ASCII letters, but

	39 /// the case of non-ASCII letters do affect the result.

	40 int hashIgnoreAsciiCase(String string) {

	41 int hash = 0;

	42 for (int i = 0; i < string.length; i++) {

	43 int char = string.codeUnitAt(i);

	44 const int lowerCaseA = 0x61;

	45 const int lowerCaseZ = 0x7D;

	46 if (lowerCaseA <= char && char <= lowerCaseZ) char -= 0x20;
	Bob Nystrom 2015/11/13 16:55:36 Document this. Document this. Lasse Reichstein Nielsen 2015/11/18 12:12:47 Done. Show quoted text On 2015/11/13 16:55:36, Bob Nystrom wrote: > Document this. Done.
	47 hash = 0x1fffffff & (hash + char);

	48 hash = 0x1fffffff & (hash + ((0x0007ffff & hash) << 10));

	49 hash >>= 6;

	50 }

	51 hash = 0x1fffffff & (hash + ((0x03ffffff & hash) << 3));

	52 hash >>= 11;

	53 return 0x1fffffff & (hash + ((0x00003fff & hash) << 15));
	Bob Nystrom 2015/11/13 16:55:36 It would be good to document a reference for where It would be good to document a reference for where the hash logic comes from. This is Jenkins hash, right? Lasse Reichstein Nielsen 2015/11/18 12:12:47 It's Jenkins, stole the code from the actual strin It's Jenkins, stole the code from the actual string hash.
	54 }

	55

	56 /// Compares [a] and [b] lexically, converting ASCII letters to upper case.

	57 ///

	58 /// Comparison treats all lower-case ASCII letters as their upper-case letter,

	59 /// but does no case conversion for non-ASCII letters.

	60 ///

	61 /// If two strings differ only on the case of ASCII letters, the one with the

	62 /// capital letter at the first difference will compare as less than the other

	63 /// string. This tie-breaking ensures that the comparison is a total ordering

	64 /// on strings.

	65 ///

	66 /// Ignoring non-ASCII letters is not generally a good idea, but it makes sense

	67 /// for situations where the strings are known to be ASCII. Examples could

	68 /// be Dart identifiers, base-64 or hex encoded strings, GUIDs or similar

	69 /// strings with a known structure.

	70 int compareAsciiUpperCase(String a, String b) {

	71 int defaultResult = 0; // Returned if no difference found.

	72 for (int i = 0; i < a.length; i++) {

	73 if (i == b.length) return 1;

	74 var aChar = a.codeUnitAt(i);

	75 var bChar = b.codeUnitAt(i);

	76 if (aChar == bChar) continue;

	77 const int lowerCaseA = 0x61;

	78 const int lowerCaseZ = 0x7d;

	79 // Lower case if letters.

	80 int aUpperCase = aChar;

	81 int bUpperCase = bChar;

	82 if (lowerCaseA <= aChar && aChar <= lowerCaseZ) aUpperCase -= 0x20;

	83 if (lowerCaseA <= bChar && bChar <= lowerCaseZ) bUpperCase -= 0x20;
	Bob Nystrom 2015/11/13 16:55:36 You do this in a few places. I wonder if it's wort You do this in a few places. I wonder if it's worth making a little helper function: int _toAsciiUpper(int char) ... Lasse Reichstein Nielsen 2015/11/18 12:12:47 I could, but only if I'm sure it will get inlined. I could, but only if I'm sure it will get inlined. This line should be compilable to 4-5 machine instructions if both aChar and aUpperCase are in registers, and it's in the hot loop, so I don't want to risk unoptimal compilation.
	84 if (aUpperCase != bUpperCase) return (aUpperCase - bUpperCase).sign;

	85 if (defaultResult == 0) defaultResult = (aChar - bChar);

	86 }

	87 if (b.length > a.length) return -1;

	88 return defaultResult.sign;

	89 }

	90

	91 /// Compares [a] and [b] lexically, converting ASCII letters to lower case.

	92 ///

	93 /// Comparison treats all upper-case ASCII letters as lower-case letter,
	Bob Nystrom 2015/11/13 16:55:36 "letter" -> "letters" "letter" -> "letters" Lasse Reichstein Nielsen 2015/11/18 12:12:47 Done. Show quoted text On 2015/11/13 16:55:36, Bob Nystrom wrote: > "letter" -> "letters" Done.
	94 /// but does no case conversion for non-ASCII letters.

	95 ///

	96 /// If two strings differ only on the case of ASCII letters, the one with the

	97 /// capital letter at the first difference will compare as less than the other

	98 /// string. This tie-breaking ensures that the comparison is a total ordering

	99 /// on strings.

	100 ///

	101 /// Ignoring non-ASCII letters is not generally a good idea, but it makes sense

	102 /// for situations where the strings are known to be ASCII. Examples could

	103 /// be Dart identifiers, base-64 or hex encoded strings, GUIDs or similar

	104 /// strings with a known structure.

	105 int compareAsciiLowerCase(String a, String b) {

	106 int defaultResult = 0;

	107 for (int i = 0; i < a.length; i++) {

	108 if (i == b.length) return 1;

	109 var aChar = a.codeUnitAt(i);

	110 var bChar = b.codeUnitAt(i);

	111 if (aChar == bChar) continue;

	112 const int upperCaseA = 0x41;

	113 const int upperCaseZ = 0x5d;

	114 int aLowerCase = aChar;

	115 int bLowerCase = bChar;

	116 // Upper case if ASCII letters.

	117 if (upperCaseA <= aChar && aChar <= upperCaseZ) aLowerCase += 0x20;

	118 if (upperCaseA <= bChar && bChar <= upperCaseZ) bLowerCase += 0x20;

	119 if (aLowerCase != bLowerCase) return (aLowerCase - bLowerCase).sign;

	120 if (defaultResult == 0) defaultResult = aChar - bChar;

	121 }

	122 if (b.length > a.length) return -1;

	123 return defaultResult.sign;

	124 }

	125

	126 /// Compares strings [a] and [b] according to natural sort ordering.

	127 ///

	128 /// A traditional natural sort ordering is a lexical ordering where embedded

	129 /// numerals (digit sequeunces) are treated as a single unit and ordered by

	130 /// numerical value.

	131 /// This means that `"a10b"` will be ordered after `"a7b"` in natural

	132 /// ordering, where leixcal ordering would put the `1` before the `7`, ignoring

	133 /// that the `1` is part of a larger number.

	134 /// (See: [Natural sort order](https://en.wikipedia.org/wiki/Natural_sort_order) )
	Bob Nystrom 2015/11/13 16:55:36 Long line. I'd probably do: /// Compares strings Long line. I'd probably do: /// Compares strings [a] and [b] according to [natural sort ordering][]. /// /// ... /// /// [natural sort ordering]: https://en.wikipedia.org/wiki/Natural_sort_order Lasse Reichstein Nielsen 2015/11/18 12:12:47 Done. Show quoted text On 2015/11/13 16:55:36, Bob Nystrom wrote: > Long line. I'd probably do: > > /// Compares strings [a] and [b] according to [natural sort ordering][]. > /// > /// ... > /// > /// [natural sort ordering]: https://en.wikipedia.org/wiki/Natural_sort_order Done.
	135 ///

	136 /// The ordering of this function is not pure numerical ordering on embedded

	137 /// numerals, which would ignore leading zeros. Instead it orders the numerals

	138 /// by length first and then by numerical value if they have the same length.

	139 /// This is indistinguishable from natural sort ordering if either all numerals

	140 /// are zero padded, or if none of them are.

	141 ///

	142 /// Example:

	143 /// The following strings are in the order they would be sorted by using this

	144 /// comparison function:

	145 ///

	146 /// "a", "a0", "a0b", "a1", "a9", "a01", "a10", "a100", "a100b", "aa"
	sra1 2015/11/13 18:25:40 I find it odd that a01 is after a9. This is neithe I find it odd that a01 is after a9. This is neither numerical nor plain string ordering. It would feel more 'natural' if different strings with the same numerical value were together (as advertised in the first paragraph). digit strings with the same integer value are resolved with a character-based tie-breaker. This gives order: a a0 a00 a000 a000b a00b a0b a001 a01 a1 sort --version-sort has this ordering. I think you can still do this with 'local' logic starting at the mismatch. Lasse Reichstein Nielsen 2015/11/18 12:12:47 Good point. It's simpler to explain "numerical ord Good point. It's simpler to explain "numerical ordering, length as tie-breaker" than the other way around. I'll do that. It's slightly harder to implement (more special cases), but it's not that bad.
	147 int compareNatural(String a, String b) {

	148 for (int i = 0; i < a.length; i++) {

	149 if (i == b.length) return 1;

	150 var aChar = a.codeUnitAt(i);

	151 var bChar = b.codeUnitAt(i);

	152 if (aChar != bChar) {

	153 return _compareDigitsLexicographically(a, b, i, aChar, bChar);

	154 }

	155 }

	156 if (b.length > a.length) return -1;

	157 return 0;

	158 }

	159

	160 /// Compares strings [a] and [b] according to lower-case natural sort ordering.

	161 ///

	162 /// ASCII letters are converted to lower case before being compared, like

	163 /// for [compareAsciiLowerCase], then the result is compared like for

	164 /// [compareNatural].

	165 ///

	166 /// If two strings differ only on the case of ASCII letters, the one with the

	167 /// capital letter at the first difference will compare as less than the other

	168 /// string. This tie-breaking ensures that the comparison is a total ordering

	169 /// on strings.

	170 int compareAsciiLowerCaseNatural(String a, String b) {

	171 int defaultResult = 0; // Returned if no difference found.

	172 for (int i = 0; i < a.length; i++) {

	173 if (i == b.length) return 1;

	174 var aChar = a.codeUnitAt(i);

	175 var bChar = b.codeUnitAt(i);

	176 if (aChar == bChar) continue;

	177 const int upperCaseA = 0x41;

	178 const int upperCaseZ = 0x5d;

	179 int aLowerCase = aChar;

	180 int bLowerCase = bChar;

	181 if (upperCaseA <= aChar && aChar <= upperCaseZ) aLowerCase += 0x20;

	182 if (upperCaseA <= bChar && bChar <= upperCaseZ) bLowerCase += 0x20;

	183 if (aLowerCase != bLowerCase) {

	184 return _compareDigitsLexicographically(a, b, i, aLowerCase, bLowerCase);

	185 }

	186 if (defaultResult == 0) defaultResult = aChar - bChar;

	187 }

	188 if (b.length > a.length) return -1;

	189 return defaultResult.sign;

	190 }

	191

	192 /// Compares strings [a] and [b] according to upper-case natural sort ordering.

	193 ///

	194 /// ASCII letters are converted to upper case before being compared, like

	195 /// for [compareAsciiUpperCase], then the result is compared like for

	196 /// [compareNatural].

	197 ///

	198 /// If two strings differ only on the case of ASCII letters, the one with the

	199 /// capital letter at the first difference will compare as less than the other

	200 /// string. This tie-breaking ensures that the comparison is a total ordering

	201 /// on strings

	202 int compareAsciiUpperCaseNatural(String a, String b) {

	203 int defaultResult = 0;

	204 for (int i = 0; i < a.length; i++) {

	205 if (i == b.length) return 1;

	206 var aChar = a.codeUnitAt(i);

	207 var bChar = b.codeUnitAt(i);

	208 if (aChar == bChar) continue;

	209 const int lowerCaseA = 0x61;

	210 const int lowerCaseZ = 0x7d;

	211 int aUpperCase = aChar;

	212 int bUpperCase = bChar;

	213 if (lowerCaseA <= aChar && aChar <= lowerCaseZ) aUpperCase -= 0x20;

	214 if (lowerCaseA <= bChar && bChar <= lowerCaseZ) bUpperCase -= 0x20;

	215 if (aUpperCase != bUpperCase) {

	216 return _compareDigitsLexicographically(a, b, i, aUpperCase, bUpperCase);

	217 }

	218 if (defaultResult == 0) defaultResult = aChar - bChar;

	219 }

	220 if (b.length > a.length) return -1;

	221 return defaultResult.sign;

	222 }

	223

	224 /// Check for numbers overlapping the current mismatched characters.

	225 ///

	226 /// If either [aChar] or [bChar] or both are digits we check if one is part of

	227 /// a longer digits sequence than the other,

	228 /// and let the longer sequence compare as greater.

	229 ///

	230 /// Otherwise just returns the difference between [aChar] and [bChar].

	231 int _compareDigitsLexicographically(

	232 String a, String b, int index, int aChar, int bChar) {

	233 assert(aChar != bChar);

	234 var aIsDigit = _isDigit(aChar);

	235 var bIsDigit = _isDigit(bChar);

	236 if (aIsDigit) {

	237 if (bIsDigit) {

	238 int lengthCompare = _compareDigitCount(a, b, index);

	239 if (lengthCompare != 0) return lengthCompare;

	240 } else if (index > 0 && _isDigit(a.codeUnitAt(index - 1))) {

	241 // aChar is the continuation of a longer number.

	242 return 1;

	243 }

	244 } else if (bIsDigit && index > 0 && _isDigit(b.codeUnitAt(index - 1))) {

	245 // bChar is the continuation of a longer number.

	246 return -1;

	247 }

	248 // Characters are both non-digits.

	249 return (aChar - bChar).sign;

	250 }

	251

	252 /// Checks which of [a] and [b] has the longest sequence of digits.

	253 ///

	254 /// Starts counting from `i + 1` (assumes that `a[i]` and `b[i]` are both

	255 /// already known to be digits).

	256 int _compareDigitCount(String a, String b, int i) {

	257 while (++i < a.length) {

	258 bool aIsDigit = _isDigit(a.codeUnitAt(i));

	259 if (i == b.length) return aIsDigit ? 1 : 0;

	260 bool bIsDigit = _isDigit(b.codeUnitAt(i));

	261 if (aIsDigit) {

	262 if (bIsDigit) continue;

	263 return 1;

	264 } else if (bIsDigit) {

	265 return -1;

	266 } else {

	267 return 0;

	268 }

	269 }

	270 if (i < b.length && _isDigit(b.codeUnitAt(i))) {

	271 return -1;

	272 }

	273 return 0;

	274 }

	275

	276 bool _isDigit(int charCode) => (charCode ^ 0x30) <= 9;

OLD	NEW

« CHANGELOG.md ('K') | « lib/collection.dart ('k') | pubspec.yaml » ('j') | no next file with comments »