Chromium Code Reviews| Index: Source/wtf/text/StringImpl.cpp |
| diff --git a/Source/wtf/text/StringImpl.cpp b/Source/wtf/text/StringImpl.cpp |
| index 838b6715fcdce7c992b8a90e4fcfaab69c36ac77..b29005523e2d40f46ee94b2741cf4b43bc5babea 100644 |
| --- a/Source/wtf/text/StringImpl.cpp |
| +++ b/Source/wtf/text/StringImpl.cpp |
| @@ -1893,6 +1893,120 @@ PassRefPtr<StringImpl> StringImpl::replace(StringImpl* pattern, StringImpl* repl |
| return newImpl.release(); |
| } |
| +bool StringImpl::hasUnmatchedSurrogates() const |
| +{ |
| + // Reports whether any surrogate code unit lacks its partner. 8-bit strings are |
| + // confined to the Latin-1 code page, so they have no surrogates, matched or otherwise. |
| + if (is8Bit()) |
| + return false; |
| + |
| + const UChar* characters = characters16(); |
| + const unsigned length = m_length; |
| + |
| + for (unsigned i = 0; i < length; ++i) { |
| + UChar c = characters[i]; |
| + if (c < 0xD800 || c > 0xDFFF) { |

tkent
2014/06/16 07:51:16
!U16_IS_SURROGATE(c)
jsbell
2014/06/17 21:39:50
Done - used U16_XXX macros throughout. Also, tight
|
| + // Non-surrogate code unit: nothing to pair up. |
| + continue; |
| + } |
| + if (0xDC00 <= c && c <= 0xDFFF) { |

tkent
2014/06/16 07:51:16
U16_IS_TRAIL(c)
|
| + // Trail surrogate that was not consumed as part of a pair: unmatched. |
| + return true; |
| + } |

Nils Barth (inactive)
2014/06/16 07:08:28
Want to add something like:
// Lead surrogate.
//
|
| + if (i == length - 1) { |
| + // Lead surrogate is the last code unit, so no trail can follow it. |
| + return true; |
| + } |
| + UChar d = characters[i + 1]; |
| + if (0xDC00 <= d && d <= 0xDFFF) { |

tkent
2014/06/16 07:51:16
U16_IS_TRAIL(d)
|
| + // Matching trail surrogate: skip it so it is not re-examined as a stray trail. |
| + ++i; |
| + continue; |
| + } |
| + // Lead surrogate followed by something other than a trail: unmatched. |
| + return true; |
| + } |
| + return false; |
| +} |
| + |
| +// Returns a string in which every unmatched surrogate code unit has been |
| +// replaced by U+FFFD REPLACEMENT CHARACTER. Well-formed surrogate pairs and |
| +// all other code units are copied through unchanged, so the result has the |
| +// same length as this string. When nothing needs replacing, |this| is |
| +// returned and no copy is made. |
| +PassRefPtr<StringImpl> StringImpl::replaceUnmatchedSurrogates() |
| +{ |
| + // This roughly implements http://heycam.github.io/webidl/#dfn-obtain-unicode |
| + // but the output is still a sequence of 16-bit code units, effectively |
| + // re-encoding to UTF-16 after performing the replacements. |
| + |
| + // The concepts of surrogate pairs are explained at: |
| + // http://www.unicode.org/versions/Unicode6.2.0/ch03.pdf#G2630 |
| + |
| + // Blink-specific optimization to avoid making an unnecessary copy. |
| + if (!hasUnmatchedSurrogates()) |
| + return this; |
| + // hasUnmatchedSurrogates() is always false for 8-bit strings, so from here |
| + // on the 16-bit buffer is the one in use. |
| + ASSERT(!is8Bit()); |
| + |
| + // 1. Let S be the DOMString value. |
| + const UChar* s = characters16(); |
| + |
| + // 2. Let n be the length of S. |
| + const unsigned n = m_length; |
| + |
| + // 3. Initialize i to 0. |
| + unsigned i = 0; |
| + |
| + // 4. Initialize U to be an empty sequence of Unicode characters. |
| + // (Blink: we just use an array of UTF-16 code units. Every input code unit |
| + // yields exactly one output code unit, so U is written at the same index.) |
| + UChar* u; |
| + RefPtr<StringImpl> newImpl = createUninitialized(n, u); |
| + |
| + // 5. While i < n: |
| + while (i < n) { |
| + // 1. Let c be the code unit in S at index i. |
| + UChar c = s[i]; |
| + // 2. Depending on the value of c: |
| + if (c < 0xD800 || c > 0xDFFF) { |
| + // c < 0xD800 or c > 0xDFFF |
| + // Append to U the Unicode character with code point c. |
| + u[i] = c; |
| + } else if (0xDC00 <= c && c <= 0xDFFF) { |
| + // 0xDC00 <= c <= 0xDFFF |
| + // Append to U a U+FFFD REPLACEMENT CHARACTER. |
| + u[i] = Unicode::replacementCharacter; |
| + } else { |
| + // 0xD800 <= c <= 0xDBFF |
| + ASSERT(0xD800 <= c && c <= 0xDBFF); |
| + if (i == n - 1) { |
| + // 1. If i = n−1, then append to U a U+FFFD REPLACEMENT CHARACTER. |
| + u[i] = Unicode::replacementCharacter; |
| + } else { |
| + // 2. Otherwise, i < n−1: |
| + ASSERT(i < n - 1); |
| + // ..1. Let d be the code unit in S at index i+1. |
| + UChar d = s[i + 1]; |
| + if (0xDC00 <= d && d <= 0xDFFF) { |
| + // 2. If 0xDC00 ≤ d ≤ 0xDFFF, then: |
| + // ..1. Let a be c & 0x3FF. |
| + // ..2. Let b be d & 0x3FF. |
| + // ..3. Append to U the Unicode character with code point 2^16+2^10*a+b. |
| + // (Blink: Just pass through the UTF-16 code units rather than |
| + // decoding to a Unicode scalar value then re-encoding.) |
| + u[i] = c; |
| + u[i + 1] = d; |
| + // ..4. Set i to i+1. |
| + ++i; |
| + } else { |
| + // 3. Otherwise, d < 0xDC00 or d > 0xDFFF. Append to U a U+FFFD REPLACEMENT CHARACTER. |
| + // d may itself be a lead surrogate (e.g. two leads in a row), so only |
| + // the trail range is excluded here; d is handled on the next iteration. |
| + ASSERT(d < 0xDC00 || d > 0xDFFF); |
| + u[i] = Unicode::replacementCharacter; |
| + } |
| + } |
| + } |
| + // 3. Set i to i+1. |
| + ++i; |
| + } |
| + |
| + // 6. Return U. |
| + return newImpl.release(); |
| +} |
| + |
| PassRefPtr<StringImpl> StringImpl::upconvertedString() |
| { |
| if (is8Bit()) |