lib/src/comparators.dart - Issue 1412293008: Add string comparators that can compare ASCII strings ignoring case.

Side by Side Diff: lib/src/comparators.dart

Issue 1412293008: Add string comparators that can compare ASCII strings ignoring case. (Closed) Base URL: https://github.com/dart-lang/collection.git@master

Patch Set: Created 5 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 // Copyright (c) 2015, the Dart project authors. Please see the AUTHORS file

	2 // for details. All rights reserved. Use of this source code is governed by a

	3 // BSD-style license that can be found in the LICENSE file.

	4

	5 library dart.pkg.collection.comparators;

	6

	7 /// Checks if strings [a] and [b] differ only on the case of ASCII letters.

	8 ///

	9 /// Strings are equal if they have the same length, and the characters at

	10 /// each index are the same, or they are ASCII letters where one is upper-case

	11 /// and the other is the lower-case version of the same letter.

	12 ///

	13 /// Ignoring non-ASCII letters is not generally a good idea, but it makes sense

	14 /// for situations where the strings are known to be ASCII. Examples could

	15 /// be Dart identifiers, base-64 or hex encoded strings, GUIDs or similar

	16 /// strings with a known structure.
	kevmoo 2015/11/10 19:30:35 Describe how non-ASCII characters are handled. Describe how non-ASCII characters are handled.
	17 bool equalsIgnoreAsciiCase(String a, String b) {

	18 if (a.length != b.length) return false;

	19 for (int i = 0; i < a.length; i++) {

	20 var aChar = a.codeUnitAt(i);

	21 var bChar = b.codeUnitAt(i);

	22 if (aChar == bChar) continue;

	23 const int upperCaseA = 0x41;

	24 const int upperCaseZ = 0x5d;

	25 if (aChar ^ bChar != 0x20) return false;

	26 if (upperCaseA <= aChar && aChar <= upperCaseZ) continue;

	27 if (upperCaseA <= bChar && bChar <= upperCaseZ) continue;

	28 return false;

	29 }

	30 return true;

	31 }

	32

	33 /// Hash code for a string which is compatible with [equalsIgnoreAsciiCase].
	kevmoo 2015/11/10 19:30:35 Describe how non-ASCII characters are handled. Describe how non-ASCII characters are handled.
	34 int hashIgnoreAsciiCase(String string) {

	35 int hash = 0;

	36 for (int i = 0; i < string.length; i++) {

	37 int char = string.codeUnitAt(i);

	38 const int lowerCaseA = 0x61;

	39 const int lowerCaseZ = 0x7D;

	40 if (lowerCaseA <= char && char <= lowerCaseZ) char -= 0x20;

	41 hash = 0x1fffffff & (hash + char);

	42 hash = 0x1fffffff & (hash + ((0x0007ffff & hash) << 10));

	43 hash >>= 6;

	44 }

	45 hash = 0x1fffffff & (hash + ((0x03ffffff & hash) << 3));

	46 hash >>= 11;

	47 return 0x1fffffff & (hash + ((0x00003fff & hash) << 15));

	48 }

	49

	50 /// Compares [a] and [b] lexically, converting ASCII letters to upper case.

	51 ///

	52 /// Comparison treats all lower-case ASCII letters as their upper-case letter,

	53 /// but does no case conversion for non-ASCII letters.

	54 ///

	55 /// If two strings differ only on the case of ASCII letters, the one with the

	56 /// capital letter at the first difference will compare as less than the other

	57 /// string. This tie-breaking ensures that the comparison is a total ordering

	58 /// on strings.

	59 ///

	60 /// Ignoring non-ASCII letters is not generally a good idea, but it makes sense

	61 /// for situations where the strings are known to be ASCII. Examples could

	62 /// be Dart identifiers, base-64 or hex encoded strings, GUIDs or similar

	63 /// strings with a known structure.

	64 int compareAsciiUpperCase(String a, String b) {

	65 int defaultResult = 0; // Returned if no difference found.

	66 for (int i = 0; i < a.length; i++) {

	67 if (i == b.length) return 1;

	68 var aChar = a.codeUnitAt(i);

	69 var bChar = b.codeUnitAt(i);

	70 if (aChar == bChar) continue;

	71 const int lowerCaseA = 0x61;

	72 const int lowerCaseZ = 0x7d;

	73 // Lower case if letters.

	74 int aUpperCase = aChar;

	75 int bUpperCase = bChar;

	76 if (lowerCaseA <= aChar && aChar <= lowerCaseZ) aUpperCase -= 0x20;

	77 if (lowerCaseA <= bChar && bChar <= lowerCaseZ) bUpperCase -= 0x20;

	78 if (aUpperCase != bUpperCase) return (aUpperCase - bUpperCase).sign;

	79 if (defaultResult == 0) defaultResult = (aChar - bChar);

	80 }

	81 if (b.length > a.length) return -1;

	82 return defaultResult.sign;

	83 }

	84

	85 /// Compares [a] and [b] lexically, converting ASCII letters to lower case.

	86 ///

	87 /// Comparison treats all upper-case ASCII letters as lower-case letter,

	88 /// but does no case conversion for non-ASCII letters.

	89 ///

	90 /// If two strings differ only on the case of ASCII letters, the one with the

	91 /// capital letter at the first difference will compare as less than the other

	92 /// string. This tie-breaking ensures that the comparison is a total ordering

	93 /// on strings.

	94 ///

	95 /// Ignoring non-ASCII letters is not generally a good idea, but it makes sense

	96 /// for situations where the strings are known to be ASCII. Examples could

	97 /// be Dart identifiers, base-64 or hex encoded strings, GUIDs or similar

	98 /// strings with a known structure.

	99 int compareAsciiLowerCase(String a, String b) {

	100 int defaultResult = 0;

	101 for (int i = 0; i < a.length; i++) {

	102 if (i == b.length) return 1;

	103 var aChar = a.codeUnitAt(i);

	104 var bChar = b.codeUnitAt(i);

	105 if (aChar == bChar) continue;

	106 const int upperCaseA = 0x41;

	107 const int upperCaseZ = 0x5d;

	108 int aLowerCase = aChar;

	109 int bLowerCase = bChar;

	110 // Upper case if ASCII letters.

	111 if (upperCaseA <= aChar && aChar <= upperCaseZ) aLowerCase += 0x20;

	112 if (upperCaseA <= bChar && bChar <= upperCaseZ) bLowerCase += 0x20;

	113 if (aLowerCase != bLowerCase) return (aLowerCase - bLowerCase).sign;

	114 if (defaultResult == 0) defaultResult = aChar - bChar;

	115 }

	116 if (b.length > a.length) return -1;

	117 return defaultResult.sign;

	118 }

	119

	120 /// Compares strings [a] and [b] lexically, with embedded numbers recognized.
	sra1 2015/11/10 04:41:40 'with embedded sequences of digits compared as a u 'with embedded sequences of digits compared as a unit' I'm confused by the terms 'lexically' and 'lexicographically'. Do you mean the same thing? If so, use one term throughout. Lasse Reichstein Nielsen 2015/11/10 08:21:33 What I intend is: Leixcally: Dictionary sorting ( What I intend is: Leixcally: Dictionary sorting (alphabetical with most weight on first letter). Lexicographically: Shorter words sorted before longer words, words of same length ordered lexically. (Basically, lexical pads the shorter string on the right, lexicograpical pads it on the left, to ensure comparison of sequences with the same length) Wikipedia doesn't agree with me that there is a difference, it uses the words interchangeably, so I might need to find a different word to describe the second ordering.
	121 ///

	122 /// If both strings contain an embedded number at the same locations, the

	123 /// numbers are compared lexicographically. This means that a longer number

	124 /// is considered greater than a shorter number, and numbers of the same length

	125 /// are ordered numerically (or alhabetically, it's the same thing).

	126 /// If numbers do not have leading zeros, lexicographical ordering is also

	127 /// the same as numerical ordering.
	sra1 2015/11/10 04:41:40 Add examples in the doc to tease out when this is Add examples in the doc to tease out when this is number-like and when it is not. Lasse Reichstein Nielsen 2015/11/10 08:21:33 Good idea. Good idea.
	128 int compareWithNumbers(String a, String b) {
	sra1 2015/11/10 04:41:40 'With' makes it sound like one of the arguments is 'With' makes it sound like one of the arguments is a number. Maybe 'compareStringsContainingNumbers' Lasse Reichstein Nielsen 2015/11/10 08:21:33 Yes, the name sucks. Suggestions will be appreciat Yes, the name sucks. Suggestions will be appreciated. "comparStringsContainingNumbers" is't precise either - the function can compare strings without numbers too. It needs to make the point that any embedded numbers are compared "numerically" (which, in practice, is the goal, but it's achieved by using "lexicographical" ordering on the numerals which does sort "3" before "02").
	129 for (int i = 0; i < a.length; i++) {

	130 if (i == b.length) return 1;

	131 var aChar = a.codeUnitAt(i);

	132 var bChar = b.codeUnitAt(i);

	133 if (aChar != bChar) {

	134 return _compareDigitsLexicographically(a, b, i, aChar, bChar);

	135 }

	136 }

	137 if (b.length > a.length) return -1;

	138 return 0;

	139 }

	140

	141 /// Compares strings [a] and [b] lexically, with embedded numbers recognized.

	142 ///

	143 /// ASCII letters are converted to lower case before being compared, like

	144 /// for [compareAsciiLowerCase].

	145 ///

	146 /// If two strings differ only on the case of ASCII letters, the one with the

	147 /// capital letter at the first difference will compare as less than the other

	148 /// string. This tie-breaking ensures that the comparison is a total ordering

	149 /// on strings.

	150 ///

	151 /// If both strings contain an embedded number at the same locations, the

	152 /// numbers are compared lexicographically. This means that a longer number

	153 /// is considered greater than a shorter number, and numbers of the same length

	154 /// are ordered numerically (or alhabetically, it's the same thing).

	155 /// If numbers do not have leading zeros, lexicographical ordering is also

	156 /// the same as numerical ordering.

	157 int compareAsciiLowerCaseWithNumbers(String a, String b) {

	158 int defaultResult = 0; // Returned if no difference found.

	159 for (int i = 0; i < a.length; i++) {

	160 if (i == b.length) return 1;

	161 var aChar = a.codeUnitAt(i);

	162 var bChar = b.codeUnitAt(i);

	163 if (aChar == bChar) continue;

	164 const int upperCaseA = 0x41;

	165 const int upperCaseZ = 0x5d;

	166 int aLowerCase = aChar;

	167 int bLowerCase = bChar;

	168 if (upperCaseA <= aChar && aChar <= upperCaseZ) aLowerCase += 0x20;

	169 if (upperCaseA <= bChar && bChar <= upperCaseZ) bLowerCase += 0x20;

	170 if (aLowerCase != bLowerCase) {

	171 return _compareDigitsLexicographically(a, b, i, aLowerCase, bLowerCase);

	172 }

	173 if (defaultResult == 0) defaultResult = aChar - bChar;

	174 }

	175 if (b.length > a.length) return -1;

	176 return defaultResult.sign;

	177 }

	178

	179 /// Compares strings [a] and [b] lexically, with embedded numbers recognized.

	180 ///

	181 /// ASCII letters are converted to upper case before being compared, like

	182 /// for [compareAsciiUpperCase].

	183 ///

	184 /// If two strings differ only on the case of ASCII letters, the one with the

	185 /// capital letter at the first difference will compare as less than the other

	186 /// string. This tie-breaking ensures that the comparison is a total ordering

	187 /// on strings.

	188 ///

	189 /// If both strings contain an embedded number at the same locations, the

	190 /// numbers are compared lexicographically. This means that a longer number

	191 /// is considered greater than a shorter number, and numbers of the same length

	192 /// are ordered numerically (or alhabetically, it's the same thing).

	193 /// If numbers do not have leading zeros, lexicographical ordering is also

	194 /// the same as numerical ordering.

	195 int compareAsciiUpperCaseWithNumbers(String a, String b) {

	196 int defaultResult = 0;

	197 for (int i = 0; i < a.length; i++) {

	198 if (i == b.length) return 1;

	199 var aChar = a.codeUnitAt(i);

	200 var bChar = b.codeUnitAt(i);

	201 if (aChar == bChar) continue;

	202 const int lowerCaseA = 0x61;

	203 const int lowerCaseZ = 0x7d;

	204 int aUpperCase = aChar;

	205 int bUpperCase = bChar;

	206 if (lowerCaseA <= aChar && aChar <= lowerCaseZ) aUpperCase -= 0x20;

	207 if (lowerCaseA <= bChar && bChar <= lowerCaseZ) bUpperCase -= 0x20;

	208 if (aUpperCase != bUpperCase) {

	209 return _compareDigitsLexicographically(a, b, i, aUpperCase, bUpperCase);

	210 }

	211 if (defaultResult == 0) defaultResult = aChar - bChar;

	212 }

	213 if (b.length > a.length) return -1;

	214 return defaultResult.sign;

	215 }

	216

	217 /// Check for numbers overlapping the current mismatched characters.

	218 ///

	219 /// If either [aChar] or [bChar] or both are digits we check if one is part of

	220 /// a longer digits sequence than the other,

	221 /// and let the longer sequence compare as greater.

	222 ///

	223 /// Otherwise just returns the difference between [aChar] and [bChar].

	224 int _compareDigitsLexicographically(

	225 String a, String b, int index, int aChar, int bChar) {

	226 assert(aChar != bChar);

	227 var aIsDigit = _isDigit(aChar);

	228 var bIsDigit = _isDigit(bChar);

	229 if (aIsDigit) {

	230 if (bIsDigit) {

	231 int lengthCompare = _compareDigitCount(a, b, index);

	232 if (lengthCompare != 0) return lengthCompare;

	233 } else if (index > 0 && _isDigit(a.codeUnitAt(index - 1))) {

	234 // aChar is the continuation of a longer number.

	235 return 1;

	236 }

	237 } else if (bIsDigit && index > 0 && _isDigit(b.codeUnitAt(index - 1))) {

	238 // bChar is the continuation of a longer number.

	239 return -1;

	240 }

	241 // Characters are not both part , or part
	sra1 2015/11/10 04:41:40 Fix comment. Fix comment. Lasse Reichstein Nielsen 2015/11/10 08:21:33 I wonder what I meant to say :) I wonder what I meant to say :)
	242 return (aChar - bChar).sign;

	243 }

	244

	245 /// Checks which of [a] and [b] has the longest sequence of digits.

	246 ///

	247 /// Starts counting from `i + 1` (assumes that `a[i]` and `b[i]` are both

	248 /// already known to be digits).

	249 int _compareDigitCount(String a, String b, int i) {

	250 while (++i < a.length) {

	251 bool aIsDigit = _isDigit(a.codeUnitAt(i));

	252 if (i == b.length) return aIsDigit ? 1 : 0;

	253 bool bIsDigit = _isDigit(b.codeUnitAt(i));

	254 if (aIsDigit) {

	255 if (bIsDigit) continue;

	256 return 1;

	257 } else if (bIsDigit) {

	258 return -1;

	259 } else {

	260 return 0;

	261 }

	262 }

	263 if (i < b.length && _isDigit(b.codeUnitAt(i))) {

	264 return -1;

	265 }

	266 return 0;

	267 }

	268

	269 bool _isDigit(int charCode) => (charCode ^ 0x30) <= 9;

OLD	NEW

« no previous file with comments | « lib/collection.dart ('k') | test/comparators_test.dart » ('j') | no next file with comments »