OLD | NEW |
| (Empty) |
1 /* | |
2 * (C) 1999 Lars Knoll (knoll@kde.org) | |
3 * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2010, 2012 Apple Inc. All rights | |
4 * reserved. | |
5 * Copyright (C) 2007-2009 Torch Mobile, Inc. | |
6 * | |
7 * This library is free software; you can redistribute it and/or | |
8 * modify it under the terms of the GNU Library General Public | |
9 * License as published by the Free Software Foundation; either | |
10 * version 2 of the License, or (at your option) any later version. | |
11 * | |
12 * This library is distributed in the hope that it will be useful, | |
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 * Library General Public License for more details. | |
16 * | |
17 * You should have received a copy of the GNU Library General Public License | |
18 * along with this library; see the file COPYING.LIB. If not, write to | |
19 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, | |
20 * Boston, MA 02110-1301, USA. | |
21 */ | |
22 | |
23 #include "wtf/text/WTFString.h" | |
24 | |
25 #include "base/strings/string_util.h" | |
26 #include "wtf/ASCIICType.h" | |
27 #include "wtf/DataLog.h" | |
28 #include "wtf/HexNumber.h" | |
29 #include "wtf/MathExtras.h" | |
30 #include "wtf/StringExtras.h" | |
31 #include "wtf/Vector.h" | |
32 #include "wtf/dtoa.h" | |
33 #include "wtf/text/CString.h" | |
34 #include "wtf/text/CharacterNames.h" | |
35 #include "wtf/text/IntegerToStringConversion.h" | |
36 #include "wtf/text/UTF8.h" | |
37 #include "wtf/text/Unicode.h" | |
38 #include <algorithm> | |
39 #include <stdarg.h> | |
40 | |
41 namespace WTF { | |
42 | |
43 using namespace Unicode; | |
44 | |
45 namespace { | |
46 | |
47 Vector<char> asciiDebug(StringImpl* impl) { | |
48 if (!impl) | |
49 return asciiDebug(String("[null]").impl()); | |
50 | |
51 Vector<char> buffer; | |
52 for (unsigned i = 0; i < impl->length(); ++i) { | |
53 UChar ch = (*impl)[i]; | |
54 if (isASCIIPrintable(ch)) { | |
55 if (ch == '\\') | |
56 buffer.push_back('\\'); | |
57 buffer.push_back(static_cast<char>(ch)); | |
58 } else { | |
59 buffer.push_back('\\'); | |
60 buffer.push_back('u'); | |
61 HexNumber::appendUnsignedAsHexFixedSize(ch, buffer, 4); | |
62 } | |
63 } | |
64 buffer.push_back('\0'); | |
65 return buffer; | |
66 } | |
67 | |
68 } // namespace | |
69 | |
70 // Construct a string with UTF-16 data. | |
71 String::String(const UChar* characters, unsigned length) | |
72 : m_impl(characters ? StringImpl::create(characters, length) : nullptr) {} | |
73 | |
74 // Construct a string with UTF-16 data, from a null-terminated source. | |
75 String::String(const UChar* str) { | |
76 if (!str) | |
77 return; | |
78 m_impl = StringImpl::create(str, lengthOfNullTerminatedString(str)); | |
79 } | |
80 | |
81 // Construct a string with latin1 data. | |
82 String::String(const LChar* characters, unsigned length) | |
83 : m_impl(characters ? StringImpl::create(characters, length) : nullptr) {} | |
84 | |
85 String::String(const char* characters, unsigned length) | |
86 : m_impl(characters ? StringImpl::create( | |
87 reinterpret_cast<const LChar*>(characters), | |
88 length) | |
89 : nullptr) {} | |
90 | |
91 void String::append(const StringView& string) { | |
92 if (string.isEmpty()) | |
93 return; | |
94 if (!m_impl) { | |
95 m_impl = string.toString().releaseImpl(); | |
96 return; | |
97 } | |
98 | |
99 // FIXME: This is extremely inefficient. So much so that we might want to | |
100 // take this out of String's API. We can make it better by optimizing the | |
101 // case where exactly one String is pointing at this StringImpl, but even | |
102 // then it's going to require a call into the allocator every single time. | |
103 | |
104 if (m_impl->is8Bit() && string.is8Bit()) { | |
105 LChar* data; | |
106 RELEASE_ASSERT(string.length() <= | |
107 std::numeric_limits<unsigned>::max() - m_impl->length()); | |
108 RefPtr<StringImpl> newImpl = StringImpl::createUninitialized( | |
109 m_impl->length() + string.length(), data); | |
110 memcpy(data, m_impl->characters8(), m_impl->length() * sizeof(LChar)); | |
111 memcpy(data + m_impl->length(), string.characters8(), | |
112 string.length() * sizeof(LChar)); | |
113 m_impl = newImpl.release(); | |
114 return; | |
115 } | |
116 | |
117 UChar* data; | |
118 RELEASE_ASSERT(string.length() <= | |
119 std::numeric_limits<unsigned>::max() - m_impl->length()); | |
120 RefPtr<StringImpl> newImpl = | |
121 StringImpl::createUninitialized(m_impl->length() + string.length(), data); | |
122 | |
123 if (m_impl->is8Bit()) | |
124 StringImpl::copyChars(data, m_impl->characters8(), m_impl->length()); | |
125 else | |
126 StringImpl::copyChars(data, m_impl->characters16(), m_impl->length()); | |
127 | |
128 if (string.is8Bit()) | |
129 StringImpl::copyChars(data + m_impl->length(), string.characters8(), | |
130 string.length()); | |
131 else | |
132 StringImpl::copyChars(data + m_impl->length(), string.characters16(), | |
133 string.length()); | |
134 | |
135 m_impl = newImpl.release(); | |
136 } | |
137 | |
138 template <typename CharacterType> | |
139 inline void String::appendInternal(CharacterType c) { | |
140 // FIXME: This is extremely inefficient. So much so that we might want to | |
141 // take this out of String's API. We can make it better by optimizing the | |
142 // case where exactly one String is pointing at this StringImpl, but even | |
143 // then it's going to require a call into the allocator every single time. | |
144 if (!m_impl) { | |
145 m_impl = StringImpl::create(&c, 1); | |
146 return; | |
147 } | |
148 | |
149 // FIXME: We should be able to create an 8 bit string via this code path. | |
150 UChar* data; | |
151 RELEASE_ASSERT(m_impl->length() < std::numeric_limits<unsigned>::max()); | |
152 RefPtr<StringImpl> newImpl = | |
153 StringImpl::createUninitialized(m_impl->length() + 1, data); | |
154 if (m_impl->is8Bit()) | |
155 StringImpl::copyChars(data, m_impl->characters8(), m_impl->length()); | |
156 else | |
157 StringImpl::copyChars(data, m_impl->characters16(), m_impl->length()); | |
158 data[m_impl->length()] = c; | |
159 m_impl = newImpl.release(); | |
160 } | |
161 | |
162 void String::append(LChar c) { | |
163 appendInternal(c); | |
164 } | |
165 | |
166 void String::append(UChar c) { | |
167 appendInternal(c); | |
168 } | |
169 | |
170 int codePointCompare(const String& a, const String& b) { | |
171 return codePointCompare(a.impl(), b.impl()); | |
172 } | |
173 | |
174 int codePointCompareIgnoringASCIICase(const String& a, const char* b) { | |
175 return codePointCompareIgnoringASCIICase(a.impl(), | |
176 reinterpret_cast<const LChar*>(b)); | |
177 } | |
178 | |
179 template <typename CharType> | |
180 PassRefPtr<StringImpl> insertInternal(PassRefPtr<StringImpl> impl, | |
181 const CharType* charactersToInsert, | |
182 unsigned lengthToInsert, | |
183 unsigned position) { | |
184 if (!lengthToInsert) | |
185 return impl; | |
186 | |
187 DCHECK(charactersToInsert); | |
188 UChar* data; // FIXME: We should be able to create an 8 bit string here. | |
189 RELEASE_ASSERT(lengthToInsert <= | |
190 std::numeric_limits<unsigned>::max() - impl->length()); | |
191 RefPtr<StringImpl> newImpl = | |
192 StringImpl::createUninitialized(impl->length() + lengthToInsert, data); | |
193 | |
194 if (impl->is8Bit()) | |
195 StringImpl::copyChars(data, impl->characters8(), position); | |
196 else | |
197 StringImpl::copyChars(data, impl->characters16(), position); | |
198 | |
199 StringImpl::copyChars(data + position, charactersToInsert, lengthToInsert); | |
200 | |
201 if (impl->is8Bit()) | |
202 StringImpl::copyChars(data + position + lengthToInsert, | |
203 impl->characters8() + position, | |
204 impl->length() - position); | |
205 else | |
206 StringImpl::copyChars(data + position + lengthToInsert, | |
207 impl->characters16() + position, | |
208 impl->length() - position); | |
209 | |
210 return newImpl.release(); | |
211 } | |
212 | |
213 void String::insert(const StringView& string, unsigned position) { | |
214 if (string.isEmpty()) { | |
215 if (string.isNull()) | |
216 return; | |
217 if (isNull()) | |
218 m_impl = string.toString().releaseImpl(); | |
219 return; | |
220 } | |
221 | |
222 if (position >= length()) { | |
223 if (string.is8Bit()) | |
224 append(string); | |
225 else | |
226 append(string); | |
227 return; | |
228 } | |
229 | |
230 DCHECK(m_impl); | |
231 if (string.is8Bit()) | |
232 m_impl = insertInternal(m_impl.release(), string.characters8(), | |
233 string.length(), position); | |
234 else | |
235 m_impl = insertInternal(m_impl.release(), string.characters16(), | |
236 string.length(), position); | |
237 } | |
238 | |
239 UChar32 String::characterStartingAt(unsigned i) const { | |
240 if (!m_impl || i >= m_impl->length()) | |
241 return 0; | |
242 return m_impl->characterStartingAt(i); | |
243 } | |
244 | |
245 void String::ensure16Bit() { | |
246 if (isNull()) | |
247 return; | |
248 if (!is8Bit()) | |
249 return; | |
250 if (unsigned length = this->length()) | |
251 m_impl = | |
252 make16BitFrom8BitSource(m_impl->characters8(), length).releaseImpl(); | |
253 else | |
254 m_impl = StringImpl::empty16Bit; | |
255 } | |
256 | |
257 void String::truncate(unsigned length) { | |
258 if (m_impl) | |
259 m_impl = m_impl->truncate(length); | |
260 } | |
261 | |
262 void String::remove(unsigned start, unsigned lengthToRemove) { | |
263 if (m_impl) | |
264 m_impl = m_impl->remove(start, lengthToRemove); | |
265 } | |
266 | |
267 String String::substring(unsigned pos, unsigned len) const { | |
268 if (!m_impl) | |
269 return String(); | |
270 return m_impl->substring(pos, len); | |
271 } | |
272 | |
273 String String::lower() const { | |
274 if (!m_impl) | |
275 return String(); | |
276 return m_impl->lower(); | |
277 } | |
278 | |
279 String String::upper() const { | |
280 if (!m_impl) | |
281 return String(); | |
282 return m_impl->upper(); | |
283 } | |
284 | |
285 String String::lower(const AtomicString& localeIdentifier) const { | |
286 if (!m_impl) | |
287 return String(); | |
288 return m_impl->lower(localeIdentifier); | |
289 } | |
290 | |
291 String String::upper(const AtomicString& localeIdentifier) const { | |
292 if (!m_impl) | |
293 return String(); | |
294 return m_impl->upper(localeIdentifier); | |
295 } | |
296 | |
297 String String::upperASCII() const { | |
298 if (!m_impl) | |
299 return String(); | |
300 return m_impl->upperASCII(); | |
301 } | |
302 | |
303 String String::stripWhiteSpace() const { | |
304 if (!m_impl) | |
305 return String(); | |
306 return m_impl->stripWhiteSpace(); | |
307 } | |
308 | |
309 String String::stripWhiteSpace(IsWhiteSpaceFunctionPtr isWhiteSpace) const { | |
310 if (!m_impl) | |
311 return String(); | |
312 return m_impl->stripWhiteSpace(isWhiteSpace); | |
313 } | |
314 | |
315 String String::simplifyWhiteSpace(StripBehavior stripBehavior) const { | |
316 if (!m_impl) | |
317 return String(); | |
318 return m_impl->simplifyWhiteSpace(stripBehavior); | |
319 } | |
320 | |
321 String String::simplifyWhiteSpace(IsWhiteSpaceFunctionPtr isWhiteSpace, | |
322 StripBehavior stripBehavior) const { | |
323 if (!m_impl) | |
324 return String(); | |
325 return m_impl->simplifyWhiteSpace(isWhiteSpace, stripBehavior); | |
326 } | |
327 | |
328 String String::removeCharacters(CharacterMatchFunctionPtr findMatch) const { | |
329 if (!m_impl) | |
330 return String(); | |
331 return m_impl->removeCharacters(findMatch); | |
332 } | |
333 | |
334 String String::foldCase() const { | |
335 if (!m_impl) | |
336 return String(); | |
337 return m_impl->foldCase(); | |
338 } | |
339 | |
340 String String::format(const char* format, ...) { | |
341 va_list args; | |
342 | |
343 // TODO(esprehn): base uses 1024, maybe we should use a bigger size too. | |
344 static const unsigned kDefaultSize = 256; | |
345 Vector<char, kDefaultSize> buffer(kDefaultSize); | |
346 | |
347 va_start(args, format); | |
348 int length = base::vsnprintf(buffer.data(), buffer.size(), format, args); | |
349 va_end(args); | |
350 | |
351 // TODO(esprehn): This can only happen if there's an encoding error, what's | |
352 // the locale set to inside blink? Can this happen? We should probably CHECK | |
353 // instead. | |
354 if (length < 0) | |
355 return String(); | |
356 | |
357 if (static_cast<unsigned>(length) >= buffer.size()) { | |
358 // vsnprintf doesn't include the NUL terminator in the length so we need to | |
359 // add space for it when growing. | |
360 buffer.grow(length + 1); | |
361 | |
362 // We need to call va_end() and then va_start() each time we use args, as | |
363 // the contents of args is undefined after the call to vsnprintf according | |
364 // to http://man.cx/snprintf(3) | |
365 // | |
366 // Not calling va_end/va_start here happens to work on lots of systems, but | |
367 // fails e.g. on 64bit Linux. | |
368 va_start(args, format); | |
369 length = base::vsnprintf(buffer.data(), buffer.size(), format, args); | |
370 va_end(args); | |
371 } | |
372 | |
373 CHECK_LT(static_cast<unsigned>(length), buffer.size()); | |
374 return String(reinterpret_cast<const LChar*>(buffer.data()), length); | |
375 } | |
376 | |
377 template <typename IntegerType> | |
378 static String integerToString(IntegerType input) { | |
379 IntegerToStringConverter<IntegerType> converter(input); | |
380 return StringImpl::create(converter.characters8(), converter.length()); | |
381 } | |
382 | |
383 String String::number(int number) { | |
384 return integerToString(number); | |
385 } | |
386 | |
387 String String::number(unsigned number) { | |
388 return integerToString(number); | |
389 } | |
390 | |
391 String String::number(long number) { | |
392 return integerToString(number); | |
393 } | |
394 | |
395 String String::number(unsigned long number) { | |
396 return integerToString(number); | |
397 } | |
398 | |
399 String String::number(long long number) { | |
400 return integerToString(number); | |
401 } | |
402 | |
403 String String::number(unsigned long long number) { | |
404 return integerToString(number); | |
405 } | |
406 | |
407 String String::number(double number, unsigned precision) { | |
408 NumberToStringBuffer buffer; | |
409 return String(numberToFixedPrecisionString(number, precision, buffer)); | |
410 } | |
411 | |
412 String String::numberToStringECMAScript(double number) { | |
413 NumberToStringBuffer buffer; | |
414 return String(numberToString(number, buffer)); | |
415 } | |
416 | |
417 String String::numberToStringFixedWidth(double number, unsigned decimalPlaces) { | |
418 NumberToStringBuffer buffer; | |
419 return String(numberToFixedWidthString(number, decimalPlaces, buffer)); | |
420 } | |
421 | |
422 int String::toIntStrict(bool* ok, int base) const { | |
423 if (!m_impl) { | |
424 if (ok) | |
425 *ok = false; | |
426 return 0; | |
427 } | |
428 return m_impl->toIntStrict(ok, base); | |
429 } | |
430 | |
431 unsigned String::toUIntStrict(bool* ok, int base) const { | |
432 if (!m_impl) { | |
433 if (ok) | |
434 *ok = false; | |
435 return 0; | |
436 } | |
437 return m_impl->toUIntStrict(ok, base); | |
438 } | |
439 | |
440 int64_t String::toInt64Strict(bool* ok, int base) const { | |
441 if (!m_impl) { | |
442 if (ok) | |
443 *ok = false; | |
444 return 0; | |
445 } | |
446 return m_impl->toInt64Strict(ok, base); | |
447 } | |
448 | |
449 uint64_t String::toUInt64Strict(bool* ok, int base) const { | |
450 if (!m_impl) { | |
451 if (ok) | |
452 *ok = false; | |
453 return 0; | |
454 } | |
455 return m_impl->toUInt64Strict(ok, base); | |
456 } | |
457 | |
458 int String::toInt(bool* ok) const { | |
459 if (!m_impl) { | |
460 if (ok) | |
461 *ok = false; | |
462 return 0; | |
463 } | |
464 return m_impl->toInt(ok); | |
465 } | |
466 | |
467 unsigned String::toUInt(bool* ok) const { | |
468 if (!m_impl) { | |
469 if (ok) | |
470 *ok = false; | |
471 return 0; | |
472 } | |
473 return m_impl->toUInt(ok); | |
474 } | |
475 | |
476 int64_t String::toInt64(bool* ok) const { | |
477 if (!m_impl) { | |
478 if (ok) | |
479 *ok = false; | |
480 return 0; | |
481 } | |
482 return m_impl->toInt64(ok); | |
483 } | |
484 | |
485 uint64_t String::toUInt64(bool* ok) const { | |
486 if (!m_impl) { | |
487 if (ok) | |
488 *ok = false; | |
489 return 0; | |
490 } | |
491 return m_impl->toUInt64(ok); | |
492 } | |
493 | |
494 double String::toDouble(bool* ok) const { | |
495 if (!m_impl) { | |
496 if (ok) | |
497 *ok = false; | |
498 return 0.0; | |
499 } | |
500 return m_impl->toDouble(ok); | |
501 } | |
502 | |
503 float String::toFloat(bool* ok) const { | |
504 if (!m_impl) { | |
505 if (ok) | |
506 *ok = false; | |
507 return 0.0f; | |
508 } | |
509 return m_impl->toFloat(ok); | |
510 } | |
511 | |
512 String String::isolatedCopy() const { | |
513 if (!m_impl) | |
514 return String(); | |
515 return m_impl->isolatedCopy(); | |
516 } | |
517 | |
518 bool String::isSafeToSendToAnotherThread() const { | |
519 return !m_impl || m_impl->isSafeToSendToAnotherThread(); | |
520 } | |
521 | |
522 void String::split(const StringView& separator, | |
523 bool allowEmptyEntries, | |
524 Vector<String>& result) const { | |
525 result.clear(); | |
526 | |
527 unsigned startPos = 0; | |
528 size_t endPos; | |
529 while ((endPos = find(separator, startPos)) != kNotFound) { | |
530 if (allowEmptyEntries || startPos != endPos) | |
531 result.push_back(substring(startPos, endPos - startPos)); | |
532 startPos = endPos + separator.length(); | |
533 } | |
534 if (allowEmptyEntries || startPos != length()) | |
535 result.push_back(substring(startPos)); | |
536 } | |
537 | |
538 void String::split(UChar separator, | |
539 bool allowEmptyEntries, | |
540 Vector<String>& result) const { | |
541 result.clear(); | |
542 | |
543 unsigned startPos = 0; | |
544 size_t endPos; | |
545 while ((endPos = find(separator, startPos)) != kNotFound) { | |
546 if (allowEmptyEntries || startPos != endPos) | |
547 result.push_back(substring(startPos, endPos - startPos)); | |
548 startPos = endPos + 1; | |
549 } | |
550 if (allowEmptyEntries || startPos != length()) | |
551 result.push_back(substring(startPos)); | |
552 } | |
553 | |
554 CString String::ascii() const { | |
555 // Printable ASCII characters 32..127 and the null character are | |
556 // preserved, characters outside of this range are converted to '?'. | |
557 | |
558 unsigned length = this->length(); | |
559 if (!length) { | |
560 char* characterBuffer; | |
561 return CString::createUninitialized(length, characterBuffer); | |
562 } | |
563 | |
564 if (this->is8Bit()) { | |
565 const LChar* characters = this->characters8(); | |
566 | |
567 char* characterBuffer; | |
568 CString result = CString::createUninitialized(length, characterBuffer); | |
569 | |
570 for (unsigned i = 0; i < length; ++i) { | |
571 LChar ch = characters[i]; | |
572 characterBuffer[i] = ch && (ch < 0x20 || ch > 0x7f) ? '?' : ch; | |
573 } | |
574 | |
575 return result; | |
576 } | |
577 | |
578 const UChar* characters = this->characters16(); | |
579 | |
580 char* characterBuffer; | |
581 CString result = CString::createUninitialized(length, characterBuffer); | |
582 | |
583 for (unsigned i = 0; i < length; ++i) { | |
584 UChar ch = characters[i]; | |
585 characterBuffer[i] = | |
586 ch && (ch < 0x20 || ch > 0x7f) ? '?' : static_cast<char>(ch); | |
587 } | |
588 | |
589 return result; | |
590 } | |
591 | |
592 CString String::latin1() const { | |
593 // Basic Latin1 (ISO) encoding - Unicode characters 0..255 are | |
594 // preserved, characters outside of this range are converted to '?'. | |
595 | |
596 unsigned length = this->length(); | |
597 | |
598 if (!length) | |
599 return CString("", 0); | |
600 | |
601 if (is8Bit()) | |
602 return CString(reinterpret_cast<const char*>(this->characters8()), length); | |
603 | |
604 const UChar* characters = this->characters16(); | |
605 | |
606 char* characterBuffer; | |
607 CString result = CString::createUninitialized(length, characterBuffer); | |
608 | |
609 for (unsigned i = 0; i < length; ++i) { | |
610 UChar ch = characters[i]; | |
611 characterBuffer[i] = ch > 0xff ? '?' : static_cast<char>(ch); | |
612 } | |
613 | |
614 return result; | |
615 } | |
616 | |
617 // Helper to write a three-byte UTF-8 code point to the buffer, caller must | |
618 // check room is available. | |
619 static inline void putUTF8Triple(char*& buffer, UChar ch) { | |
620 DCHECK_GE(ch, 0x0800); | |
621 *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) | 0xE0); | |
622 *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) | 0x80); | |
623 *buffer++ = static_cast<char>((ch & 0x3F) | 0x80); | |
624 } | |
625 | |
626 CString String::utf8(UTF8ConversionMode mode) const { | |
627 unsigned length = this->length(); | |
628 | |
629 if (!length) | |
630 return CString("", 0); | |
631 | |
632 // Allocate a buffer big enough to hold all the characters | |
633 // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes). | |
634 // Optimization ideas, if we find this function is hot: | |
635 // * We could speculatively create a CStringImpl to contain 'length' | |
636 // characters, and resize if necessary (i.e. if the buffer contains | |
637 // non-ascii characters). (Alternatively, scan the buffer first for | |
638 // ascii characters, so we know this will be sufficient). | |
639 // * We could allocate a CStringImpl with an appropriate size to | |
640 // have a good chance of being able to write the string into the | |
641 // buffer without reallocing (say, 1.5 x length). | |
642 if (length > std::numeric_limits<unsigned>::max() / 3) | |
643 return CString(); | |
644 Vector<char, 1024> bufferVector(length * 3); | |
645 | |
646 char* buffer = bufferVector.data(); | |
647 | |
648 if (is8Bit()) { | |
649 const LChar* characters = this->characters8(); | |
650 | |
651 ConversionResult result = | |
652 convertLatin1ToUTF8(&characters, characters + length, &buffer, | |
653 buffer + bufferVector.size()); | |
654 // (length * 3) should be sufficient for any conversion | |
655 DCHECK_NE(result, targetExhausted); | |
656 } else { | |
657 const UChar* characters = this->characters16(); | |
658 | |
659 if (mode == StrictUTF8ConversionReplacingUnpairedSurrogatesWithFFFD) { | |
660 const UChar* charactersEnd = characters + length; | |
661 char* bufferEnd = buffer + bufferVector.size(); | |
662 while (characters < charactersEnd) { | |
663 // Use strict conversion to detect unpaired surrogates. | |
664 ConversionResult result = convertUTF16ToUTF8(&characters, charactersEnd, | |
665 &buffer, bufferEnd, true); | |
666 DCHECK_NE(result, targetExhausted); | |
667 // Conversion fails when there is an unpaired surrogate. Put | |
668 // replacement character (U+FFFD) instead of the unpaired | |
669 // surrogate. | |
670 if (result != conversionOK) { | |
671 DCHECK_LE(0xD800, *characters); | |
672 DCHECK_LE(*characters, 0xDFFF); | |
673 // There should be room left, since one UChar hasn't been | |
674 // converted. | |
675 DCHECK_LE(buffer + 3, bufferEnd); | |
676 putUTF8Triple(buffer, replacementCharacter); | |
677 ++characters; | |
678 } | |
679 } | |
680 } else { | |
681 bool strict = mode == StrictUTF8Conversion; | |
682 ConversionResult result = | |
683 convertUTF16ToUTF8(&characters, characters + length, &buffer, | |
684 buffer + bufferVector.size(), strict); | |
685 // (length * 3) should be sufficient for any conversion | |
686 DCHECK_NE(result, targetExhausted); | |
687 | |
688 // Only produced from strict conversion. | |
689 if (result == sourceIllegal) { | |
690 DCHECK(strict); | |
691 return CString(); | |
692 } | |
693 | |
694 // Check for an unconverted high surrogate. | |
695 if (result == sourceExhausted) { | |
696 if (strict) | |
697 return CString(); | |
698 // This should be one unpaired high surrogate. Treat it the same | |
699 // was as an unpaired high surrogate would have been handled in | |
700 // the middle of a string with non-strict conversion - which is | |
701 // to say, simply encode it to UTF-8. | |
702 DCHECK_EQ(characters + 1, this->characters16() + length); | |
703 DCHECK_GE(*characters, 0xD800); | |
704 DCHECK_LE(*characters, 0xDBFF); | |
705 // There should be room left, since one UChar hasn't been | |
706 // converted. | |
707 DCHECK_LE(buffer + 3, buffer + bufferVector.size()); | |
708 putUTF8Triple(buffer, *characters); | |
709 } | |
710 } | |
711 } | |
712 | |
713 return CString(bufferVector.data(), buffer - bufferVector.data()); | |
714 } | |
715 | |
716 String String::make8BitFrom16BitSource(const UChar* source, size_t length) { | |
717 if (!length) | |
718 return emptyString; | |
719 | |
720 LChar* destination; | |
721 String result = String::createUninitialized(length, destination); | |
722 | |
723 copyLCharsFromUCharSource(destination, source, length); | |
724 | |
725 return result; | |
726 } | |
727 | |
728 String String::make16BitFrom8BitSource(const LChar* source, size_t length) { | |
729 if (!length) | |
730 return emptyString16Bit; | |
731 | |
732 UChar* destination; | |
733 String result = String::createUninitialized(length, destination); | |
734 | |
735 StringImpl::copyChars(destination, source, length); | |
736 | |
737 return result; | |
738 } | |
739 | |
740 String String::fromUTF8(const LChar* stringStart, size_t length) { | |
741 RELEASE_ASSERT(length <= std::numeric_limits<unsigned>::max()); | |
742 | |
743 if (!stringStart) | |
744 return String(); | |
745 | |
746 if (!length) | |
747 return emptyString; | |
748 | |
749 if (charactersAreAllASCII(stringStart, length)) | |
750 return StringImpl::create(stringStart, length); | |
751 | |
752 Vector<UChar, 1024> buffer(length); | |
753 UChar* bufferStart = buffer.data(); | |
754 | |
755 UChar* bufferCurrent = bufferStart; | |
756 const char* stringCurrent = reinterpret_cast<const char*>(stringStart); | |
757 if (convertUTF8ToUTF16( | |
758 &stringCurrent, reinterpret_cast<const char*>(stringStart + length), | |
759 &bufferCurrent, bufferCurrent + buffer.size()) != conversionOK) | |
760 return String(); | |
761 | |
762 unsigned utf16Length = bufferCurrent - bufferStart; | |
763 DCHECK_LT(utf16Length, length); | |
764 return StringImpl::create(bufferStart, utf16Length); | |
765 } | |
766 | |
767 String String::fromUTF8(const LChar* string) { | |
768 if (!string) | |
769 return String(); | |
770 return fromUTF8(string, strlen(reinterpret_cast<const char*>(string))); | |
771 } | |
772 | |
773 String String::fromUTF8(const CString& s) { | |
774 return fromUTF8(s.data()); | |
775 } | |
776 | |
777 String String::fromUTF8WithLatin1Fallback(const LChar* string, size_t size) { | |
778 String utf8 = fromUTF8(string, size); | |
779 if (!utf8) | |
780 return String(string, size); | |
781 return utf8; | |
782 } | |
783 | |
784 std::ostream& operator<<(std::ostream& out, const String& string) { | |
785 if (string.isNull()) | |
786 return out << "<null>"; | |
787 | |
788 out << '"'; | |
789 for (unsigned index = 0; index < string.length(); ++index) { | |
790 // Print shorthands for select cases. | |
791 UChar character = string[index]; | |
792 switch (character) { | |
793 case '\t': | |
794 out << "\\t"; | |
795 break; | |
796 case '\n': | |
797 out << "\\n"; | |
798 break; | |
799 case '\r': | |
800 out << "\\r"; | |
801 break; | |
802 case '"': | |
803 out << "\\\""; | |
804 break; | |
805 case '\\': | |
806 out << "\\\\"; | |
807 break; | |
808 default: | |
809 if (isASCIIPrintable(character)) { | |
810 out << static_cast<char>(character); | |
811 } else { | |
812 // Print "\uXXXX" for control or non-ASCII characters. | |
813 out << "\\u"; | |
814 out.width(4); | |
815 out.fill('0'); | |
816 out.setf(std::ios_base::hex, std::ios_base::basefield); | |
817 out.setf(std::ios::uppercase); | |
818 out << character; | |
819 } | |
820 break; | |
821 } | |
822 } | |
823 return out << '"'; | |
824 } | |
825 | |
826 #ifndef NDEBUG | |
827 void String::show() const { | |
828 dataLogF("%s\n", asciiDebug(impl()).data()); | |
829 } | |
830 #endif | |
831 | |
832 } // namespace WTF | |
OLD | NEW |