lib/src/comparators.dart - Issue 1412293008: Add string comparators that can compare ASCII strings ignoring case.

Unified Diff: lib/src/comparators.dart

Issue 1412293008: Add string comparators that can compare ASCII strings ignoring case. (Closed) Base URL: https://github.com/dart-lang/collection.git@master

Patch Set: Address comments. Created 5 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: lib/src/comparators.dart

diff --git a/lib/src/comparators.dart b/lib/src/comparators.dart

new file mode 100644

index 0000000000000000000000000000000000000000..42afacf410bdf424dec5319874608009054dc005

--- /dev/null

+++ b/lib/src/comparators.dart

@@ -0,0 +1,276 @@

+// BSD-style license that can be found in the LICENSE file.

+library dart.pkg.collection.comparators;

+/// Checks if strings [a] and [b] differ only on the case of ASCII letters.

+///

+/// Strings are equal if they have the same length, and the characters at

+/// each index are the same, or they are ASCII letters where one is upper-case

+/// and the other is the lower-case version of the same letter.

+///

+/// The comparison does not ignore the case of non-ASCII letters, so

+/// a capital ae-ligature (Æ) is different from a lower case ae-ligature (æ).

+///

+/// Ignoring non-ASCII letters is not generally a good idea, but it makes sense

+/// for situations where the strings are known to be ASCII. Examples could

+/// be Dart identifiers, base-64 or hex encoded strings, GUIDs or similar

+/// strings with a known structure.

+bool equalsIgnoreAsciiCase(String a, String b) {

+ if (a.length != b.length) return false;

+ for (int i = 0; i < a.length; i++) {

+ var aChar = a.codeUnitAt(i);

+ var bChar = b.codeUnitAt(i);

+ if (aChar == bChar) continue;

+ const int upperCaseA = 0x41;

sra1 2015/11/13 18:25:40 Hoist these common constants to top level library

Lasse Reichstein Nielsen 2015/11/18 12:12:47 Done.

+ const int upperCaseZ = 0x5d;

+ if (aChar ^ bChar != 0x20) return false;

Bob Nystrom 2015/11/13 16:55:36 Document this. This is because upper/lowercase ASC

Lasse Reichstein Nielsen 2015/11/18 12:12:47 Yes, this is a quick bailout. If the characters di

+ if (upperCaseA <= aChar && aChar <= upperCaseZ) continue;

+ if (upperCaseA <= bChar && bChar <= upperCaseZ) continue;

+ return false;

+ }

+ return true;

+/// Hash code for a string which is compatible with [equalsIgnoreAsciiCase].

+///

+/// The hash code is unaffected by changing the case of ASCII letters, but

+/// the case of non-ASCII letters do affect the result.

+int hashIgnoreAsciiCase(String string) {

+ int hash = 0;

+ for (int i = 0; i < string.length; i++) {

+ int char = string.codeUnitAt(i);

+ const int lowerCaseA = 0x61;

+ const int lowerCaseZ = 0x7D;

+ if (lowerCaseA <= char && char <= lowerCaseZ) char -= 0x20;

Bob Nystrom 2015/11/13 16:55:36 Document this.

Lasse Reichstein Nielsen 2015/11/18 12:12:47 Done.

+ hash = 0x1fffffff & (hash + char);

+ hash = 0x1fffffff & (hash + ((0x0007ffff & hash) << 10));

+ hash >>= 6;

+ }

+ hash = 0x1fffffff & (hash + ((0x03ffffff & hash) << 3));

+ hash >>= 11;

+ return 0x1fffffff & (hash + ((0x00003fff & hash) << 15));

Bob Nystrom 2015/11/13 16:55:36 It would be good to document a reference for where

Lasse Reichstein Nielsen 2015/11/18 12:12:47 It's Jenkins, stole the code from the actual strin

+/// Compares [a] and [b] lexically, converting ASCII letters to upper case.

+///

+/// Comparison treats all lower-case ASCII letters as their upper-case letter,

+/// but does no case conversion for non-ASCII letters.

+///

+/// If two strings differ only on the case of ASCII letters, the one with the

+/// capital letter at the first difference will compare as less than the other

+/// string. This tie-breaking ensures that the comparison is a total ordering

+/// on strings.

+///

+/// Ignoring non-ASCII letters is not generally a good idea, but it makes sense

+/// for situations where the strings are known to be ASCII. Examples could

+/// be Dart identifiers, base-64 or hex encoded strings, GUIDs or similar

+/// strings with a known structure.

+int compareAsciiUpperCase(String a, String b) {

+ int defaultResult = 0; // Returned if no difference found.

+ for (int i = 0; i < a.length; i++) {

+ if (i == b.length) return 1;

+ var aChar = a.codeUnitAt(i);

+ var bChar = b.codeUnitAt(i);

+ if (aChar == bChar) continue;

+ const int lowerCaseA = 0x61;

+ const int lowerCaseZ = 0x7d;

+ // Lower case if letters.

+ int aUpperCase = aChar;

+ int bUpperCase = bChar;

+ if (lowerCaseA <= aChar && aChar <= lowerCaseZ) aUpperCase -= 0x20;

+ if (lowerCaseA <= bChar && bChar <= lowerCaseZ) bUpperCase -= 0x20;

Bob Nystrom 2015/11/13 16:55:36 You do this in a few places. I wonder if it's wort

Lasse Reichstein Nielsen 2015/11/18 12:12:47 I could, but only if I'm sure it will get inlined.

+ if (aUpperCase != bUpperCase) return (aUpperCase - bUpperCase).sign;

+ if (defaultResult == 0) defaultResult = (aChar - bChar);

+ }

+ if (b.length > a.length) return -1;

+ return defaultResult.sign;

+/// Compares [a] and [b] lexically, converting ASCII letters to lower case.

+///

+/// Comparison treats all upper-case ASCII letters as lower-case letter,

Bob Nystrom 2015/11/13 16:55:36 "letter" -> "letters"

Lasse Reichstein Nielsen 2015/11/18 12:12:47 Done.

+/// but does no case conversion for non-ASCII letters.

+///

+/// If two strings differ only on the case of ASCII letters, the one with the

+/// capital letter at the first difference will compare as less than the other

+/// string. This tie-breaking ensures that the comparison is a total ordering

+/// on strings.

+///

+/// Ignoring non-ASCII letters is not generally a good idea, but it makes sense

+/// for situations where the strings are known to be ASCII. Examples could

+/// be Dart identifiers, base-64 or hex encoded strings, GUIDs or similar

+/// strings with a known structure.

+int compareAsciiLowerCase(String a, String b) {

+ int defaultResult = 0;

+ for (int i = 0; i < a.length; i++) {

+ if (i == b.length) return 1;

+ var aChar = a.codeUnitAt(i);

+ var bChar = b.codeUnitAt(i);

+ if (aChar == bChar) continue;

+ const int upperCaseA = 0x41;

+ const int upperCaseZ = 0x5d;

+ int aLowerCase = aChar;

+ int bLowerCase = bChar;

+ // Upper case if ASCII letters.

+ if (upperCaseA <= aChar && aChar <= upperCaseZ) aLowerCase += 0x20;

+ if (upperCaseA <= bChar && bChar <= upperCaseZ) bLowerCase += 0x20;

+ if (aLowerCase != bLowerCase) return (aLowerCase - bLowerCase).sign;

+ if (defaultResult == 0) defaultResult = aChar - bChar;

+ }

+ if (b.length > a.length) return -1;

+ return defaultResult.sign;

+/// Compares strings [a] and [b] according to natural sort ordering.

+///

+/// A traditional natural sort ordering is a lexical ordering where embedded

+/// numerals (digit sequeunces) are treated as a single unit and ordered by

+/// numerical value.

+/// This means that `"a10b"` will be ordered after `"a7b"` in natural

+/// ordering, where leixcal ordering would put the `1` before the `7`, ignoring

+/// that the `1` is part of a larger number.

+/// (See: [Natural sort order](https://en.wikipedia.org/wiki/Natural_sort_order))

Bob Nystrom 2015/11/13 16:55:36 Long line. I'd probably do: /// Compares strings

Lasse Reichstein Nielsen 2015/11/18 12:12:47 Done.

+///

+/// The ordering of this function is not pure numerical ordering on embedded

+/// numerals, which would ignore leading zeros. Instead it orders the numerals

+/// by length first and then by numerical value if they have the same length.

+/// This is indistinguishable from natural sort ordering if either all numerals

+/// are zero padded, or if none of them are.

+///

+/// Example:

+/// The following strings are in the order they would be sorted by using this

+/// comparison function:

+///

+/// "a", "a0", "a0b", "a1", "a9", "a01", "a10", "a100", "a100b", "aa"

sra1 2015/11/13 18:25:40 I find it odd that a01 is after a9. This is neithe

Lasse Reichstein Nielsen 2015/11/18 12:12:47 Good point. It's simpler to explain "numerical ord

+int compareNatural(String a, String b) {

+ for (int i = 0; i < a.length; i++) {

+ if (i == b.length) return 1;

+ var aChar = a.codeUnitAt(i);

+ var bChar = b.codeUnitAt(i);

+ if (aChar != bChar) {

+ return _compareDigitsLexicographically(a, b, i, aChar, bChar);

+ }

+ if (b.length > a.length) return -1;

+ return 0;

+/// Compares strings [a] and [b] according to lower-case natural sort ordering.

+///

+/// ASCII letters are converted to lower case before being compared, like

+/// for [compareAsciiLowerCase], then the result is compared like for

+/// [compareNatural].

+///

+/// If two strings differ only on the case of ASCII letters, the one with the

+/// capital letter at the first difference will compare as less than the other

+/// string. This tie-breaking ensures that the comparison is a total ordering

+/// on strings.

+int compareAsciiLowerCaseNatural(String a, String b) {

+ int defaultResult = 0; // Returned if no difference found.

+ for (int i = 0; i < a.length; i++) {

+ if (i == b.length) return 1;

+ var aChar = a.codeUnitAt(i);

+ var bChar = b.codeUnitAt(i);

+ if (aChar == bChar) continue;

+ const int upperCaseA = 0x41;

+ const int upperCaseZ = 0x5d;

+ int aLowerCase = aChar;

+ int bLowerCase = bChar;

+ if (upperCaseA <= aChar && aChar <= upperCaseZ) aLowerCase += 0x20;

+ if (upperCaseA <= bChar && bChar <= upperCaseZ) bLowerCase += 0x20;

+ if (aLowerCase != bLowerCase) {

+ return _compareDigitsLexicographically(a, b, i, aLowerCase, bLowerCase);

+ }

+ if (defaultResult == 0) defaultResult = aChar - bChar;

+ }

+ if (b.length > a.length) return -1;

+ return defaultResult.sign;

+/// Compares strings [a] and [b] according to upper-case natural sort ordering.

+///

+/// ASCII letters are converted to upper case before being compared, like

+/// for [compareAsciiUpperCase], then the result is compared like for

+/// [compareNatural].

+///

+/// If two strings differ only on the case of ASCII letters, the one with the

+/// capital letter at the first difference will compare as less than the other

+/// string. This tie-breaking ensures that the comparison is a total ordering

+/// on strings

+int compareAsciiUpperCaseNatural(String a, String b) {

+ int defaultResult = 0;

+ for (int i = 0; i < a.length; i++) {

+ if (i == b.length) return 1;

+ var aChar = a.codeUnitAt(i);

+ var bChar = b.codeUnitAt(i);

+ if (aChar == bChar) continue;

+ const int lowerCaseA = 0x61;

+ const int lowerCaseZ = 0x7d;

+ int aUpperCase = aChar;

+ int bUpperCase = bChar;

+ if (lowerCaseA <= aChar && aChar <= lowerCaseZ) aUpperCase -= 0x20;

+ if (lowerCaseA <= bChar && bChar <= lowerCaseZ) bUpperCase -= 0x20;

+ if (aUpperCase != bUpperCase) {

+ return _compareDigitsLexicographically(a, b, i, aUpperCase, bUpperCase);

+ }

+ if (defaultResult == 0) defaultResult = aChar - bChar;

+ }

+ if (b.length > a.length) return -1;

+ return defaultResult.sign;

+/// Check for numbers overlapping the current mismatched characters.

+///

+/// If either [aChar] or [bChar] or both are digits we check if one is part of

+/// a longer digits sequence than the other,

+/// and let the longer sequence compare as greater.

+///

+/// Otherwise just returns the difference between [aChar] and [bChar].

+int _compareDigitsLexicographically(

+ String a, String b, int index, int aChar, int bChar) {

+ assert(aChar != bChar);

+ var aIsDigit = _isDigit(aChar);

+ var bIsDigit = _isDigit(bChar);

+ if (aIsDigit) {

+ if (bIsDigit) {

+ int lengthCompare = _compareDigitCount(a, b, index);

+ if (lengthCompare != 0) return lengthCompare;

+ } else if (index > 0 && _isDigit(a.codeUnitAt(index - 1))) {

+ // aChar is the continuation of a longer number.

+ return 1;

+ }

+ } else if (bIsDigit && index > 0 && _isDigit(b.codeUnitAt(index - 1))) {

+ // bChar is the continuation of a longer number.

+ return -1;

+ }

+ // Characters are both non-digits.

+ return (aChar - bChar).sign;

+/// Checks which of [a] and [b] has the longest sequence of digits.

+///

+/// Starts counting from `i + 1` (assumes that `a[i]` and `b[i]` are both

+/// already known to be digits).

+int _compareDigitCount(String a, String b, int i) {

+ while (++i < a.length) {

+ bool aIsDigit = _isDigit(a.codeUnitAt(i));

+ if (i == b.length) return aIsDigit ? 1 : 0;

+ bool bIsDigit = _isDigit(b.codeUnitAt(i));

+ if (aIsDigit) {

+ if (bIsDigit) continue;

+ return 1;

+ } else if (bIsDigit) {

+ return -1;

+ } else {

+ return 0;

+ }

+ if (i < b.length && _isDigit(b.codeUnitAt(i))) {

+ return -1;

+ }

+ return 0;

+bool _isDigit(int charCode) => (charCode ^ 0x30) <= 9;

« CHANGELOG.md ('K') | « lib/collection.dart ('k') | pubspec.yaml » ('j') | no next file with comments »