OLD | NEW |
---|---|
1 /* | 1 /* |
2 * Copyright (C) 1999 Lars Knoll (knoll@kde.org) | 2 * Copyright (C) 1999 Lars Knoll (knoll@kde.org) |
3 * (C) 1999 Antti Koivisto (koivisto@kde.org) | 3 * (C) 1999 Antti Koivisto (koivisto@kde.org) |
4 * (C) 2001 Dirk Mueller ( mueller@kde.org ) | 4 * (C) 2001 Dirk Mueller ( mueller@kde.org ) |
5 * Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2013 Apple Inc. All rights reserved. | 5 * Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2013 Apple Inc. All rights reserved. |
6 * Copyright (C) 2006 Andrew Wellington (proton@wiretapped.net) | 6 * Copyright (C) 2006 Andrew Wellington (proton@wiretapped.net) |
7 * | 7 * |
8 * This library is free software; you can redistribute it and/or | 8 * This library is free software; you can redistribute it and/or |
9 * modify it under the terms of the GNU Library General Public | 9 * modify it under the terms of the GNU Library General Public |
10 * License as published by the Free Software Foundation; either | 10 * License as published by the Free Software Foundation; either |
(...skipping 1875 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
1886 } else { | 1886 } else { |
1887 // Cases 2 & 4. | 1887 // Cases 2 & 4. |
1888 memcpy(data + dstOffset, characters16() + srcSegmentStart, srcSegmentLen gth * sizeof(UChar)); | 1888 memcpy(data + dstOffset, characters16() + srcSegmentStart, srcSegmentLen gth * sizeof(UChar)); |
1889 } | 1889 } |
1890 | 1890 |
1891 ASSERT(dstOffset + srcSegmentLength == newImpl->length()); | 1891 ASSERT(dstOffset + srcSegmentLength == newImpl->length()); |
1892 | 1892 |
1893 return newImpl.release(); | 1893 return newImpl.release(); |
1894 } | 1894 } |
1895 | 1895 |
1896 bool StringImpl::hasUnmatchedSurrogates() const | |
1897 { | |
1898 // By definition, 8-bit strings are confined to the Latin-1 code page and | |
1899 // have no surrogates, matched or otherwise. | |
1900 if (is8Bit()) | |
1901 return false; | |
1902 | |
1903 const UChar* characters = characters16(); | |
1904 const unsigned length = m_length; | |
1905 | |
1906 for (unsigned i = 0; i < length; ++i) { | |
1907 UChar c = characters[i]; | |
1908 if (c < 0xD800 || c > 0xDFFF) { | |
tkent
2014/06/16 07:51:16
!U16_IS_SURROGATE(c)
jsbell
2014/06/17 21:39:50
Done - used U16_XXX macros throughout. Also, tight
| |
1909 // Non-surrogate | |
1910 continue; | |
1911 } | |
1912 if (0xDC00 <= c && c <= 0xDFFF) { | |
tkent
2014/06/16 07:51:16
U16_IS_TRAIL(c)
| |
1913 // Unmatched trail surrogate. | |
1914 return true; | |
1915 } | |
Nils Barth (inactive)
2014/06/16 07:08:28
Want to add something like:
// Lead surrogate.
//
| |
1916 if (i == length - 1) { | |
1917 // Unmatched lead surrogate at EOF. | |
1918 return true; | |
1919 } | |
1920 UChar d = characters[i + 1]; | |
1921 if (0xDC00 <= d && d <= 0xDFFF) { | |
tkent
2014/06/16 07:51:16
U16_IS_TRAIL(d)
| |
1922 // Matching trail surrogate. | |
1923 ++i; | |
1924 continue; | |
1925 } | |
1926 // Unmatched lead. | |
1927 return true; | |
1928 } | |
1929 return false; | |
1930 } | |
1931 | |
1932 PassRefPtr<StringImpl> StringImpl::replaceUnmatchedSurrogates() | |
1933 { | |
1934 // This roughly implements http://heycam.github.io/webidl/#dfn-obtain-unicod e | |
1935 // but the output is still a sequence of 16-bit code units, effectively | |
1936 // re-encoding to UTF-16 after performing the replacements. | |
1937 | |
1938 // The concepts of surrogate pairs are explained at: | |
1939 // http://www.unicode.org/versions/Unicode6.2.0/ch03.pdf#G2630 | |
1940 | |
1941 // Blink-specific optimization to avoid making an unnecessary copy. | |
1942 if (!hasUnmatchedSurrogates()) | |
1943 return this; | |
1944 ASSERT(!is8Bit()); | |
1945 | |
1946 // 1. Let S be the DOMString value. | |
1947 const UChar* s = characters16(); | |
1948 | |
1949 // 2. Let n be the length of S. | |
1950 const unsigned n = m_length; | |
1951 | |
1952 // 3. Initialize i to 0. | |
1953 unsigned i = 0; | |
1954 | |
1955 // 4. Initialize U to be an empty sequence of Unicode characters. | |
1956 // (Blink: we just use an array of UTF-16 code units.) | |
1957 UChar* u; | |
1958 RefPtr<StringImpl> newImpl = createUninitialized(n, u); | |
1959 | |
1960 // 5. While i < n: | |
1961 while (i < n) { | |
1962 // 1. Let c be the code unit in S at index i. | |
1963 UChar c = s[i]; | |
1964 // 2. Depending on the value of c: | |
1965 if (c < 0xD800 || c > 0xDFFF) { | |
1966 // c < 0xD800 or c > 0xDFFF | |
1967 // Append to U the Unicode character with code point c. | |
1968 u[i] = c; | |
1969 } else if (0xDC00 <= c && c <= 0xDFFF) { | |
1970 // 0xDC00 <= c <= 0xDFFF | |
1971 // Append to U a U+FFFD REPLACEMENT CHARACTER. | |
1972 u[i] = Unicode::replacementCharacter; | |
1973 } else { | |
1974 // 0xD800 <= c <= 0xDBFF | |
1975 ASSERT(0xD800 <= c && c <= 0xDBFF); | |
1976 if (i == n - 1) { | |
1977 // 1. If i = n−1, then append to U a U+FFFD REPLACEMENT CHARACTE R. | |
1978 u[i] = Unicode::replacementCharacter; | |
1979 } else { | |
1980 // 2. Otherwise, i < n−1: | |
1981 ASSERT(i < n - 1); | |
1982 // ..1. Let d be the code unit in S at index i+1. | |
1983 UChar d = s[i + 1]; | |
1984 if (0xDC00 <= d && d <= 0xDFFF) { | |
1985 // 2. If 0xDC00 ≤ d ≤ 0xDFFF, then: | |
1986 // ..1. Let a be c & 0x3FF. | |
1987 // ..2. Let b be d & 0x3FF. | |
1988 // ..3. Append to U the Unicode character with code point 2^ 16+2^10*a+b. | |
1989 // (Blink: Just pass through the UTF-16 code units rather th an | |
1990 // decoding to a Unicode scalar value then re-encoding.) | |
1991 u[i] = c; | |
1992 u[i + 1] = d; | |
1993 // ..4. Set i to i+1. | |
1994 ++i; | |
1995 } else { | |
1996 // 3. Otherwise, d < 0xDC00 or d > 0xDFFF. Append to U a U+F FFD REPLACEMENT CHARACTER. | |
1997 ASSERT(d < 0xD800 || d > 0xDFFF); | |
1998 u[i] = Unicode::replacementCharacter; | |
1999 } | |
2000 } | |
2001 } | |
2002 // 3. Set i to i+1. | |
2003 ++i; | |
2004 } | |
2005 | |
2006 // 6. Return U. | |
2007 return newImpl.release(); | |
2008 } | |
2009 | |
1896 PassRefPtr<StringImpl> StringImpl::upconvertedString() | 2010 PassRefPtr<StringImpl> StringImpl::upconvertedString() |
1897 { | 2011 { |
1898 if (is8Bit()) | 2012 if (is8Bit()) |
1899 return String::make16BitFrom8BitSource(characters8(), m_length).releaseI mpl(); | 2013 return String::make16BitFrom8BitSource(characters8(), m_length).releaseI mpl(); |
1900 return this; | 2014 return this; |
1901 } | 2015 } |
1902 | 2016 |
1903 static inline bool stringImplContentEqual(const StringImpl* a, const StringImpl* b) | 2017 static inline bool stringImplContentEqual(const StringImpl* a, const StringImpl* b) |
1904 { | 2018 { |
1905 unsigned aLength = a->length(); | 2019 unsigned aLength = a->length(); |
(...skipping 193 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
2099 | 2213 |
2100 size_t StringImpl::sizeInBytes() const | 2214 size_t StringImpl::sizeInBytes() const |
2101 { | 2215 { |
2102 size_t size = length(); | 2216 size_t size = length(); |
2103 if (!is8Bit()) | 2217 if (!is8Bit()) |
2104 size *= 2; | 2218 size *= 2; |
2105 return size + sizeof(*this); | 2219 return size + sizeof(*this); |
2106 } | 2220 } |
2107 | 2221 |
2108 } // namespace WTF | 2222 } // namespace WTF |
OLD | NEW |