lib/src/comparators.dart - Issue 1412293008: Add string comparators that can compare ASCII strings ignoring case.

Unified Diff: lib/src/comparators.dart

Issue 1412293008: Add string comparators that can compare ASCII strings ignoring case. (Closed) Base URL: https://github.com/dart-lang/collection.git@master

Patch Set: Address comments. PTAL Created 5 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: lib/src/comparators.dart

diff --git a/lib/src/comparators.dart b/lib/src/comparators.dart

new file mode 100644

index 0000000000000000000000000000000000000000..b9f496a8310444860afaaa3cce43a59f8d3cbf47

--- /dev/null

+++ b/lib/src/comparators.dart

@@ -0,0 +1,397 @@

+// BSD-style license that can be found in the LICENSE file.

+library dart.pkg.collection.comparators;

+// Character constants.

+const int _zero = 0x30;

+const int _upperCaseA = 0x41;

+const int _upperCaseZ = 0x5d;

Lasse Reichstein Nielsen 2015/11/19 07:15:37 Whoops, that should be 0x5a, and 0x7a below. Will

+const int _lowerCaseA = 0x61;

+const int _lowerCaseZ = 0x7D;

+const int _asciiCaseBit = 0x20;

+/// Checks if strings [a] and [b] differ only on the case of ASCII letters.

+///

+/// Strings are equal if they have the same length, and the characters at

+/// each index are the same, or they are ASCII letters where one is upper-case

+/// and the other is the lower-case version of the same letter.

+///

+/// The comparison does not ignore the case of non-ASCII letters, so

+/// a capital ae-ligature (Æ) is different from a lower case ae-ligature (æ).

+///

+/// Ignoring non-ASCII letters is not generally a good idea, but it makes sense

+/// for situations where the strings are known to be ASCII. Examples could

+/// be Dart identifiers, base-64 or hex encoded strings, GUIDs or similar

+/// strings with a known structure.

+bool equalsIgnoreAsciiCase(String a, String b) {

+ if (a.length != b.length) return false;

+ for (int i = 0; i < a.length; i++) {

+ var aChar = a.codeUnitAt(i);

+ var bChar = b.codeUnitAt(i);

+ if (aChar == bChar) continue;

+ // Quick-check for whether this may be different cases of the same letter.

+ if (aChar ^ bChar != _asciiCaseBit) return false;

+ // If it's possible, then check if either character is actually an ASCII

+ // letter.

+ int aCharUpperCase = aChar | _asciiCaseBit;

+ if (_upperCaseA <= aCharUpperCase && aCharUpperCase <= _upperCaseZ) {

+ continue;

+ }

+ return false;

+ }

+ return true;

+/// Hash code for a string which is compatible with [equalsIgnoreAsciiCase].

+///

+/// The hash code is unaffected by changing the case of ASCII letters, but

+/// the case of non-ASCII letters do affect the result.

+int hashIgnoreAsciiCase(String string) {

+ // Jenkins hash code ( http://en.wikipedia.org/wiki/Jenkins_hash_function).

sra1 2015/11/18 23:40:16 Jenkins hash code modified to operate on Smi value

Lasse Reichstein Nielsen 2015/11/19 07:15:37 Done.

+ // Same hash used by dart2js for strings, modified to ignore ASCII letter

+ // case.

+ int hash = 0;

+ for (int i = 0; i < string.length; i++) {

+ int char = string.codeUnitAt(i);

+ // Convert lower-case ASCII letters to upper case.upper

+ // This ensures that strings that differ only in case will have the

+ // same hash code.

+ if (_lowerCaseA <= char && char <= _lowerCaseZ) char -= _asciiCaseBit;

+ hash = 0x1fffffff & (hash + char);

+ hash = 0x1fffffff & (hash + ((0x0007ffff & hash) << 10));

+ hash >>= 6;

+ }

+ hash = 0x1fffffff & (hash + ((0x03ffffff & hash) << 3));

+ hash >>= 11;

+ return 0x1fffffff & (hash + ((0x00003fff & hash) << 15));

+/// Compares [a] and [b] lexically, converting ASCII letters to upper case.

+///

+/// Comparison treats all lower-case ASCII letters as upper-case letters,

+/// but does no case conversion for non-ASCII letters.

+///

+/// If two strings differ only on the case of ASCII letters, the one with the

+/// capital letter at the first difference will compare as less than the other

+/// string. This tie-breaking ensures that the comparison is a total ordering

+/// on strings and is compatible with equality.

+///

+/// Ignoring non-ASCII letters is not generally a good idea, but it makes sense

+/// for situations where the strings are known to be ASCII. Examples could

+/// be Dart identifiers, base-64 or hex encoded strings, GUIDs or similar

+/// strings with a known structure.

+int compareAsciiUpperCase(String a, String b) {

+ int defaultResult = 0; // Returned if no difference found.

+ for (int i = 0; i < a.length; i++) {

+ if (i == b.length) return 1;

+ var aChar = a.codeUnitAt(i);

+ var bChar = b.codeUnitAt(i);

+ if (aChar == bChar) continue;

+ // Upper-case if letters.

+ int aUpperCase = aChar;

+ int bUpperCase = bChar;

+ if (_lowerCaseA <= aChar && aChar <= _lowerCaseZ) {

+ aUpperCase -= _asciiCaseBit;

+ }

+ if (_lowerCaseA <= bChar && bChar <= _lowerCaseZ) {

+ bUpperCase -= _asciiCaseBit;

+ }

+ if (aUpperCase != bUpperCase) return (aUpperCase - bUpperCase).sign;

+ if (defaultResult == 0) defaultResult = (aChar - bChar);

+ }

+ if (b.length > a.length) return -1;

+ return defaultResult.sign;

+/// Compares [a] and [b] lexically, converting ASCII letters to lower case.

+///

+/// Comparison treats all upper-case ASCII letters as lower-case letters,

+/// but does no case conversion for non-ASCII letters.

+///

+/// If two strings differ only on the case of ASCII letters, the one with the

+/// capital letter at the first difference will compare as less than the other

+/// string. This tie-breaking ensures that the comparison is a total ordering

+/// on strings.

+///

+/// Ignoring non-ASCII letters is not generally a good idea, but it makes sense

+/// for situations where the strings are known to be ASCII. Examples could

+/// be Dart identifiers, base-64 or hex encoded strings, GUIDs or similar

+/// strings with a known structure.

+int compareAsciiLowerCase(String a, String b) {

floitsch 2015/11/18 22:50:11 How is this one different from compareAsciiUpperCa

Lasse Reichstein Nielsen 2015/11/19 07:15:37 Only in how it orders, e.g., "a" vs. "_". Since "

+ int defaultResult = 0;

+ for (int i = 0; i < a.length; i++) {

+ if (i == b.length) return 1;

+ var aChar = a.codeUnitAt(i);

+ var bChar = b.codeUnitAt(i);

+ if (aChar == bChar) continue;

+ int aLowerCase = aChar;

+ int bLowerCase = bChar;

+ // Upper case if ASCII letters.

+ if (_upperCaseA <= bChar && bChar <= _upperCaseZ) {

+ bLowerCase += _asciiCaseBit;

+ }

+ if (_upperCaseA <= aChar && aChar <= _upperCaseZ) {

+ aLowerCase += _asciiCaseBit;

+ }

+ if (aLowerCase != bLowerCase) return (aLowerCase - bLowerCase).sign;

+ if (defaultResult == 0) defaultResult = aChar - bChar;

+ }

+ if (b.length > a.length) return -1;

+ return defaultResult.sign;

+/// Compares strings [a] and [b] according to [natural sort ordering].

+///

+/// A natural sort ordering is a lexical ordering where embedded

+/// numerals (digit sequeunces) are treated as a single unit and ordered by

+/// numerical value.

+/// This means that `"a10b"` will be ordered after `"a7b"` in natural

+/// ordering, where leixcal ordering would put the `1` before the `7`, ignoring

floitsch 2015/11/18 22:50:11 lexical

+/// that the `1` is part of a larger number.

+///

+/// Example:

+/// The following strings are in the order they would be sorted by using this

+/// comparison function:

+///

+/// "a", "a0", "a0b", "a1", "a01", "a9", "a10", "a100", "a100b", "aa"

+///

+/// [natural sort ordering]: https://en.wikipedia.org/wiki/Natural_sort_order

+int compareNatural(String a, String b) {

+ for (int i = 0; i < a.length; i++) {

+ if (i == b.length) return 1;

sra1 2015/11/18 23:40:16 Use if (i >= b.length) to improve constraint-b

Lasse Reichstein Nielsen 2015/11/19 07:15:37 Done.

+ var aChar = a.codeUnitAt(i);

+ var bChar = b.codeUnitAt(i);

+ if (aChar != bChar) {

+ return _compareNaturally(a, b, i, aChar, bChar);

+ }

+ if (b.length > a.length) return -1;

+ return 0;

+/// Compares strings [a] and [b] according to lower-case

+/// [natural sort ordering].

+///

+/// ASCII letters are converted to lower case before being compared, like

+/// for [compareAsciiLowerCase], then the result is compared like for

+/// [compareNatural].

+///

+/// If two strings differ only on the case of ASCII letters, the one with the

+/// capital letter at the first difference will compare as less than the other

+/// string. This tie-breaking ensures that the comparison is a total ordering

+/// on strings.

+///

+/// [natural sort ordering]: https://en.wikipedia.org/wiki/Natural_sort_order

+int compareAsciiLowerCaseNatural(String a, String b) {

+ int defaultResult = 0; // Returned if no difference found.

+ for (int i = 0; i < a.length; i++) {

+ if (i == b.length) return 1;

+ var aChar = a.codeUnitAt(i);

+ var bChar = b.codeUnitAt(i);

+ if (aChar == bChar) continue;

+ int aLowerCase = aChar;

+ int bLowerCase = bChar;

+ if (_upperCaseA <= aChar && aChar <= _upperCaseZ) {

+ aLowerCase += _asciiCaseBit;

+ }

+ if (_upperCaseA <= bChar && bChar <= _upperCaseZ) {

+ bLowerCase += _asciiCaseBit;

+ }

+ if (aLowerCase != bLowerCase) {

+ return _compareNaturally(a, b, i, aLowerCase, bLowerCase);

+ }

+ if (defaultResult == 0) defaultResult = aChar - bChar;

+ }

+ if (b.length > a.length) return -1;

+ return defaultResult.sign;

+/// Compares strings [a] and [b] according to upper-case

+/// [natural sort ordering].

+///

+/// ASCII letters are converted to upper case before being compared, like

+/// for [compareAsciiUpperCase], then the result is compared like for

+/// [compareNatural].

+///

+/// If two strings differ only on the case of ASCII letters, the one with the

+/// capital letter at the first difference will compare as less than the other

+/// string. This tie-breaking ensures that the comparison is a total ordering

+/// on strings

+///

+/// [natural sort ordering]: https://en.wikipedia.org/wiki/Natural_sort_order

+int compareAsciiUpperCaseNatural(String a, String b) {

floitsch 2015/11/18 22:50:11 How is this one different from compareAsciiLowerCa

Lasse Reichstein Nielsen 2015/11/19 07:15:37 Same as above, differs on six characters that lie

+ int defaultResult = 0;

+ for (int i = 0; i < a.length; i++) {

+ if (i == b.length) return 1;

+ var aChar = a.codeUnitAt(i);

+ var bChar = b.codeUnitAt(i);

+ if (aChar == bChar) continue;

+ int aUpperCase = aChar;

+ int bUpperCase = bChar;

+ if (_lowerCaseA <= aChar && aChar <= _lowerCaseZ) {

+ aUpperCase -= _asciiCaseBit;

+ }

+ if (_lowerCaseA <= bChar && bChar <= _lowerCaseZ) {

+ bUpperCase -= _asciiCaseBit;

+ }

+ if (aUpperCase != bUpperCase) {

+ return _compareNaturally(a, b, i, aUpperCase, bUpperCase);

+ }

+ if (defaultResult == 0) defaultResult = aChar - bChar;

+ }

+ if (b.length > a.length) return -1;

+ return defaultResult.sign;

+/// Check for numbers overlapping the current mismatched characters.

+///

+/// If both [aChar] and [bChar] are digits, use numerical comparison.

+/// Check if the previous characters is a non-zero number, and if not,

+/// skip - but count - leading zeros before comparing numbers.

+///

+/// If one is a digit and the other isn't, check if the previous character

+/// is a digit, and if so, the the one with the digit is the greater number.

+///

+/// Otherwise just returns the difference between [aChar] and [bChar].

+int _compareNaturally(

+ String a, String b, int index, int aChar, int bChar) {

+ assert(aChar != bChar);

+ var aIsDigit = _isDigit(aChar);

+ var bIsDigit = _isDigit(bChar);

+ if (aIsDigit) {

+ if (bIsDigit) {

+ return _compareNumerically(a, b, aChar, bChar, index);

+ } else if (index > 0 && _isDigit(a.codeUnitAt(index - 1))) {

+ // aChar is the continuation of a longer number.

+ return 1;

+ }

+ } else if (bIsDigit && index > 0 && _isDigit(b.codeUnitAt(index - 1))) {

+ // bChar is the continuation of a longer number.

+ return -1;

+ }

+ // Characters are both non-digits, or not continuation of earlier number.

+ return (aChar - bChar).sign;

+/// Compare numbers overlapping [aChar] and [bChar] numerically.

+///

+/// If the numbers have the same numerical value, but one has more leading

+/// zeros, the longer number is considered greater than the shorter one.

+///

+/// This ensures a total ordering on strings compatible with equality.

+int _compareNumerically(String a, String b, int aChar, int bChar, int index) {

+ // Both are digits. Find the first significant different digit, then find

+ // the length of the numbers.

+ if (_isNumberSuffix(a, index)) {

+ // Part of a longer number, differs at this index, just count the length.

+ int result = _compareDigitCount(a, b, index, index);

+ if (result != 0) return result;

+ // If same length, the current character is the most significant differing

+ // digit.

+ return (aChar - bChar).sign;

+ }

+ // Not part of larger (non-zero) number, so skip leading zeros before

+ // comparing numbers.

+ int aIndex = index;

+ int bIndex = index;

+ if (aChar == _zero) {

+ do {

+ aIndex++;

+ if (aIndex == a.length) return -1; // number in a is zero, b is not.

+ aChar = a.codeUnitAt(aIndex);

+ } while (aChar == _zero);

+ if (!_isDigit(aChar)) return -1;

+ } else if (bChar == _zero) {

+ do {

+ bIndex++;

+ if (bIndex == b.length) return 1; // number in b is zero, a is not.

+ bChar = b.codeUnitAt(bIndex);

+ } while (bChar == _zero);

+ if (!_isDigit(bChar)) return 1;

+ }

+ if (aChar != bChar) {

+ int result = _compareDigitCount(a, b, aIndex, bIndex);

+ if (result != 0) return result;

+ return (aChar - bChar).sign;

+ }

+ // Same leading digit, one had more leading zeros.

+ // Compare digits until reaching a difference.

+ while (true) {

+ var aIsDigit = false;

+ var bIsDigit = false;

+ aChar = 0;

+ bChar = 0;

+ if (++aIndex < a.length) {

+ aChar = a.codeUnitAt(aIndex);

+ aIsDigit = _isDigit(aChar);

+ }

+ if (++bIndex < b.length) {

+ bChar = b.codeUnitAt(bIndex);

+ bIsDigit = _isDigit(bChar);

+ }

+ if (aIsDigit) {

+ if (bIsDigit) {

+ if (aChar == bChar) continue;

+ // First different digit found.

+ break;

+ }

+ // bChar is non-digit, so a has longer number.

+ return 1;

+ } else if (bIsDigit) {

+ return -1; // b has longer number.

+ } else {

+ // Neither is digit, so numbers had same numerical value.

+ // Fall back on number of leading zeros

+ // (reflected by difference in indices).

+ return (aIndex - bIndex).sign;

+ }

+ // At first differing digits.

+ int result = _compareDigitCount(a, b, aIndex, bIndex);

+ if (result != 0) return result;

+ return (aChar - bChar).sign;

+/// Checks which of [a] and [b] has the longest sequence of digits.

+///

+/// Starts counting from `i + 1` and `j + 1` (assumes that `a[i]` and `b[j]` are

+/// both already known to be digits).

+int _compareDigitCount(String a, String b, int i, int j) {

+ while (++i < a.length) {

+ bool aIsDigit = _isDigit(a.codeUnitAt(i));

+ if (++j == b.length) return aIsDigit ? 1 : 0;

+ bool bIsDigit = _isDigit(b.codeUnitAt(j));

+ if (aIsDigit) {

+ if (bIsDigit) continue;

+ return 1;

+ } else if (bIsDigit) {

+ return -1;

+ } else {

+ return 0;

+ }

+ if (++j < b.length && _isDigit(b.codeUnitAt(j))) {

+ return -1;

+ }

+ return 0;

+bool _isDigit(int charCode) => (charCode ^ _zero) <= 9;

+/// Check if the digit at [index] is continuing a non-zero number.

+///

+/// If there is no non-zero digits before, then leading zeros at [index]

+/// are also ignored when comparing numerically. If there is a non-zero digit

+/// before, then zeros at [index] are significant.

+bool _isNumberSuffix(String string, int index) {

sra1 2015/11/18 23:40:16 maybe call it _isNonZeroNumberSuffix

Lasse Reichstein Nielsen 2015/11/19 07:15:37 Done.

+ while (--index >= 0) {

+ int char = string.codeUnitAt(index);

+ if (char != _zero) return _isDigit(char);

+ }

+ return false;

« no previous file with comments | « lib/collection.dart ('k') | pubspec.yaml » ('j') | test/comparators_test.dart » ('J')