| OLD | NEW |
| (Empty) |
| 1 /* | |
| 2 * (C) 1999 Lars Knoll (knoll@kde.org) | |
| 3 * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2010, 2012 Apple Inc. All rights | |
| 4 * reserved. | |
| 5 * Copyright (C) 2007-2009 Torch Mobile, Inc. | |
| 6 * | |
| 7 * This library is free software; you can redistribute it and/or | |
| 8 * modify it under the terms of the GNU Library General Public | |
| 9 * License as published by the Free Software Foundation; either | |
| 10 * version 2 of the License, or (at your option) any later version. | |
| 11 * | |
| 12 * This library is distributed in the hope that it will be useful, | |
| 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
| 15 * Library General Public License for more details. | |
| 16 * | |
| 17 * You should have received a copy of the GNU Library General Public License | |
| 18 * along with this library; see the file COPYING.LIB. If not, write to | |
| 19 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, | |
| 20 * Boston, MA 02110-1301, USA. | |
| 21 */ | |
| 22 | |
| 23 #include "wtf/text/WTFString.h" | |
| 24 | |
| 25 #include "base/strings/string_util.h" | |
| 26 #include "wtf/ASCIICType.h" | |
| 27 #include "wtf/DataLog.h" | |
| 28 #include "wtf/HexNumber.h" | |
| 29 #include "wtf/MathExtras.h" | |
| 30 #include "wtf/StringExtras.h" | |
| 31 #include "wtf/Vector.h" | |
| 32 #include "wtf/dtoa.h" | |
| 33 #include "wtf/text/CString.h" | |
| 34 #include "wtf/text/CharacterNames.h" | |
| 35 #include "wtf/text/IntegerToStringConversion.h" | |
| 36 #include "wtf/text/UTF8.h" | |
| 37 #include "wtf/text/Unicode.h" | |
| 38 #include <algorithm> | |
| 39 #include <stdarg.h> | |
| 40 | |
| 41 namespace WTF { | |
| 42 | |
| 43 using namespace Unicode; | |
| 44 | |
| 45 namespace { | |
| 46 | |
| 47 Vector<char> asciiDebug(StringImpl* impl) { | |
| 48 if (!impl) | |
| 49 return asciiDebug(String("[null]").impl()); | |
| 50 | |
| 51 Vector<char> buffer; | |
| 52 for (unsigned i = 0; i < impl->length(); ++i) { | |
| 53 UChar ch = (*impl)[i]; | |
| 54 if (isASCIIPrintable(ch)) { | |
| 55 if (ch == '\\') | |
| 56 buffer.push_back('\\'); | |
| 57 buffer.push_back(static_cast<char>(ch)); | |
| 58 } else { | |
| 59 buffer.push_back('\\'); | |
| 60 buffer.push_back('u'); | |
| 61 HexNumber::appendUnsignedAsHexFixedSize(ch, buffer, 4); | |
| 62 } | |
| 63 } | |
| 64 buffer.push_back('\0'); | |
| 65 return buffer; | |
| 66 } | |
| 67 | |
| 68 } // namespace | |
| 69 | |
| 70 // Construct a string with UTF-16 data. | |
| 71 String::String(const UChar* characters, unsigned length) | |
| 72 : m_impl(characters ? StringImpl::create(characters, length) : nullptr) {} | |
| 73 | |
| 74 // Construct a string with UTF-16 data, from a null-terminated source. | |
| 75 String::String(const UChar* str) { | |
| 76 if (!str) | |
| 77 return; | |
| 78 m_impl = StringImpl::create(str, lengthOfNullTerminatedString(str)); | |
| 79 } | |
| 80 | |
| 81 // Construct a string with latin1 data. | |
| 82 String::String(const LChar* characters, unsigned length) | |
| 83 : m_impl(characters ? StringImpl::create(characters, length) : nullptr) {} | |
| 84 | |
| 85 String::String(const char* characters, unsigned length) | |
| 86 : m_impl(characters ? StringImpl::create( | |
| 87 reinterpret_cast<const LChar*>(characters), | |
| 88 length) | |
| 89 : nullptr) {} | |
| 90 | |
| 91 void String::append(const StringView& string) { | |
| 92 if (string.isEmpty()) | |
| 93 return; | |
| 94 if (!m_impl) { | |
| 95 m_impl = string.toString().releaseImpl(); | |
| 96 return; | |
| 97 } | |
| 98 | |
| 99 // FIXME: This is extremely inefficient. So much so that we might want to | |
| 100 // take this out of String's API. We can make it better by optimizing the | |
| 101 // case where exactly one String is pointing at this StringImpl, but even | |
| 102 // then it's going to require a call into the allocator every single time. | |
| 103 | |
| 104 if (m_impl->is8Bit() && string.is8Bit()) { | |
| 105 LChar* data; | |
| 106 RELEASE_ASSERT(string.length() <= | |
| 107 std::numeric_limits<unsigned>::max() - m_impl->length()); | |
| 108 RefPtr<StringImpl> newImpl = StringImpl::createUninitialized( | |
| 109 m_impl->length() + string.length(), data); | |
| 110 memcpy(data, m_impl->characters8(), m_impl->length() * sizeof(LChar)); | |
| 111 memcpy(data + m_impl->length(), string.characters8(), | |
| 112 string.length() * sizeof(LChar)); | |
| 113 m_impl = newImpl.release(); | |
| 114 return; | |
| 115 } | |
| 116 | |
| 117 UChar* data; | |
| 118 RELEASE_ASSERT(string.length() <= | |
| 119 std::numeric_limits<unsigned>::max() - m_impl->length()); | |
| 120 RefPtr<StringImpl> newImpl = | |
| 121 StringImpl::createUninitialized(m_impl->length() + string.length(), data); | |
| 122 | |
| 123 if (m_impl->is8Bit()) | |
| 124 StringImpl::copyChars(data, m_impl->characters8(), m_impl->length()); | |
| 125 else | |
| 126 StringImpl::copyChars(data, m_impl->characters16(), m_impl->length()); | |
| 127 | |
| 128 if (string.is8Bit()) | |
| 129 StringImpl::copyChars(data + m_impl->length(), string.characters8(), | |
| 130 string.length()); | |
| 131 else | |
| 132 StringImpl::copyChars(data + m_impl->length(), string.characters16(), | |
| 133 string.length()); | |
| 134 | |
| 135 m_impl = newImpl.release(); | |
| 136 } | |
| 137 | |
| 138 template <typename CharacterType> | |
| 139 inline void String::appendInternal(CharacterType c) { | |
| 140 // FIXME: This is extremely inefficient. So much so that we might want to | |
| 141 // take this out of String's API. We can make it better by optimizing the | |
| 142 // case where exactly one String is pointing at this StringImpl, but even | |
| 143 // then it's going to require a call into the allocator every single time. | |
| 144 if (!m_impl) { | |
| 145 m_impl = StringImpl::create(&c, 1); | |
| 146 return; | |
| 147 } | |
| 148 | |
| 149 // FIXME: We should be able to create an 8 bit string via this code path. | |
| 150 UChar* data; | |
| 151 RELEASE_ASSERT(m_impl->length() < std::numeric_limits<unsigned>::max()); | |
| 152 RefPtr<StringImpl> newImpl = | |
| 153 StringImpl::createUninitialized(m_impl->length() + 1, data); | |
| 154 if (m_impl->is8Bit()) | |
| 155 StringImpl::copyChars(data, m_impl->characters8(), m_impl->length()); | |
| 156 else | |
| 157 StringImpl::copyChars(data, m_impl->characters16(), m_impl->length()); | |
| 158 data[m_impl->length()] = c; | |
| 159 m_impl = newImpl.release(); | |
| 160 } | |
| 161 | |
| 162 void String::append(LChar c) { | |
| 163 appendInternal(c); | |
| 164 } | |
| 165 | |
| 166 void String::append(UChar c) { | |
| 167 appendInternal(c); | |
| 168 } | |
| 169 | |
| 170 int codePointCompare(const String& a, const String& b) { | |
| 171 return codePointCompare(a.impl(), b.impl()); | |
| 172 } | |
| 173 | |
| 174 int codePointCompareIgnoringASCIICase(const String& a, const char* b) { | |
| 175 return codePointCompareIgnoringASCIICase(a.impl(), | |
| 176 reinterpret_cast<const LChar*>(b)); | |
| 177 } | |
| 178 | |
| 179 template <typename CharType> | |
| 180 PassRefPtr<StringImpl> insertInternal(PassRefPtr<StringImpl> impl, | |
| 181 const CharType* charactersToInsert, | |
| 182 unsigned lengthToInsert, | |
| 183 unsigned position) { | |
| 184 if (!lengthToInsert) | |
| 185 return impl; | |
| 186 | |
| 187 DCHECK(charactersToInsert); | |
| 188 UChar* data; // FIXME: We should be able to create an 8 bit string here. | |
| 189 RELEASE_ASSERT(lengthToInsert <= | |
| 190 std::numeric_limits<unsigned>::max() - impl->length()); | |
| 191 RefPtr<StringImpl> newImpl = | |
| 192 StringImpl::createUninitialized(impl->length() + lengthToInsert, data); | |
| 193 | |
| 194 if (impl->is8Bit()) | |
| 195 StringImpl::copyChars(data, impl->characters8(), position); | |
| 196 else | |
| 197 StringImpl::copyChars(data, impl->characters16(), position); | |
| 198 | |
| 199 StringImpl::copyChars(data + position, charactersToInsert, lengthToInsert); | |
| 200 | |
| 201 if (impl->is8Bit()) | |
| 202 StringImpl::copyChars(data + position + lengthToInsert, | |
| 203 impl->characters8() + position, | |
| 204 impl->length() - position); | |
| 205 else | |
| 206 StringImpl::copyChars(data + position + lengthToInsert, | |
| 207 impl->characters16() + position, | |
| 208 impl->length() - position); | |
| 209 | |
| 210 return newImpl.release(); | |
| 211 } | |
| 212 | |
| 213 void String::insert(const StringView& string, unsigned position) { | |
| 214 if (string.isEmpty()) { | |
| 215 if (string.isNull()) | |
| 216 return; | |
| 217 if (isNull()) | |
| 218 m_impl = string.toString().releaseImpl(); | |
| 219 return; | |
| 220 } | |
| 221 | |
| 222 if (position >= length()) { | |
| 223 if (string.is8Bit()) | |
| 224 append(string); | |
| 225 else | |
| 226 append(string); | |
| 227 return; | |
| 228 } | |
| 229 | |
| 230 DCHECK(m_impl); | |
| 231 if (string.is8Bit()) | |
| 232 m_impl = insertInternal(m_impl.release(), string.characters8(), | |
| 233 string.length(), position); | |
| 234 else | |
| 235 m_impl = insertInternal(m_impl.release(), string.characters16(), | |
| 236 string.length(), position); | |
| 237 } | |
| 238 | |
| 239 UChar32 String::characterStartingAt(unsigned i) const { | |
| 240 if (!m_impl || i >= m_impl->length()) | |
| 241 return 0; | |
| 242 return m_impl->characterStartingAt(i); | |
| 243 } | |
| 244 | |
| 245 void String::ensure16Bit() { | |
| 246 if (isNull()) | |
| 247 return; | |
| 248 if (!is8Bit()) | |
| 249 return; | |
| 250 if (unsigned length = this->length()) | |
| 251 m_impl = | |
| 252 make16BitFrom8BitSource(m_impl->characters8(), length).releaseImpl(); | |
| 253 else | |
| 254 m_impl = StringImpl::empty16Bit; | |
| 255 } | |
| 256 | |
| 257 void String::truncate(unsigned length) { | |
| 258 if (m_impl) | |
| 259 m_impl = m_impl->truncate(length); | |
| 260 } | |
| 261 | |
| 262 void String::remove(unsigned start, unsigned lengthToRemove) { | |
| 263 if (m_impl) | |
| 264 m_impl = m_impl->remove(start, lengthToRemove); | |
| 265 } | |
| 266 | |
| 267 String String::substring(unsigned pos, unsigned len) const { | |
| 268 if (!m_impl) | |
| 269 return String(); | |
| 270 return m_impl->substring(pos, len); | |
| 271 } | |
| 272 | |
| 273 String String::lower() const { | |
| 274 if (!m_impl) | |
| 275 return String(); | |
| 276 return m_impl->lower(); | |
| 277 } | |
| 278 | |
| 279 String String::upper() const { | |
| 280 if (!m_impl) | |
| 281 return String(); | |
| 282 return m_impl->upper(); | |
| 283 } | |
| 284 | |
| 285 String String::lower(const AtomicString& localeIdentifier) const { | |
| 286 if (!m_impl) | |
| 287 return String(); | |
| 288 return m_impl->lower(localeIdentifier); | |
| 289 } | |
| 290 | |
| 291 String String::upper(const AtomicString& localeIdentifier) const { | |
| 292 if (!m_impl) | |
| 293 return String(); | |
| 294 return m_impl->upper(localeIdentifier); | |
| 295 } | |
| 296 | |
| 297 String String::upperASCII() const { | |
| 298 if (!m_impl) | |
| 299 return String(); | |
| 300 return m_impl->upperASCII(); | |
| 301 } | |
| 302 | |
| 303 String String::stripWhiteSpace() const { | |
| 304 if (!m_impl) | |
| 305 return String(); | |
| 306 return m_impl->stripWhiteSpace(); | |
| 307 } | |
| 308 | |
| 309 String String::stripWhiteSpace(IsWhiteSpaceFunctionPtr isWhiteSpace) const { | |
| 310 if (!m_impl) | |
| 311 return String(); | |
| 312 return m_impl->stripWhiteSpace(isWhiteSpace); | |
| 313 } | |
| 314 | |
| 315 String String::simplifyWhiteSpace(StripBehavior stripBehavior) const { | |
| 316 if (!m_impl) | |
| 317 return String(); | |
| 318 return m_impl->simplifyWhiteSpace(stripBehavior); | |
| 319 } | |
| 320 | |
| 321 String String::simplifyWhiteSpace(IsWhiteSpaceFunctionPtr isWhiteSpace, | |
| 322 StripBehavior stripBehavior) const { | |
| 323 if (!m_impl) | |
| 324 return String(); | |
| 325 return m_impl->simplifyWhiteSpace(isWhiteSpace, stripBehavior); | |
| 326 } | |
| 327 | |
| 328 String String::removeCharacters(CharacterMatchFunctionPtr findMatch) const { | |
| 329 if (!m_impl) | |
| 330 return String(); | |
| 331 return m_impl->removeCharacters(findMatch); | |
| 332 } | |
| 333 | |
| 334 String String::foldCase() const { | |
| 335 if (!m_impl) | |
| 336 return String(); | |
| 337 return m_impl->foldCase(); | |
| 338 } | |
| 339 | |
| 340 String String::format(const char* format, ...) { | |
| 341 va_list args; | |
| 342 | |
| 343 // TODO(esprehn): base uses 1024, maybe we should use a bigger size too. | |
| 344 static const unsigned kDefaultSize = 256; | |
| 345 Vector<char, kDefaultSize> buffer(kDefaultSize); | |
| 346 | |
| 347 va_start(args, format); | |
| 348 int length = base::vsnprintf(buffer.data(), buffer.size(), format, args); | |
| 349 va_end(args); | |
| 350 | |
| 351 // TODO(esprehn): This can only happen if there's an encoding error, what's | |
| 352 // the locale set to inside blink? Can this happen? We should probably CHECK | |
| 353 // instead. | |
| 354 if (length < 0) | |
| 355 return String(); | |
| 356 | |
| 357 if (static_cast<unsigned>(length) >= buffer.size()) { | |
| 358 // vsnprintf doesn't include the NUL terminator in the length so we need to | |
| 359 // add space for it when growing. | |
| 360 buffer.grow(length + 1); | |
| 361 | |
| 362 // We need to call va_end() and then va_start() each time we use args, as | |
| 363 // the contents of args is undefined after the call to vsnprintf according | |
| 364 // to http://man.cx/snprintf(3) | |
| 365 // | |
| 366 // Not calling va_end/va_start here happens to work on lots of systems, but | |
| 367 // fails e.g. on 64bit Linux. | |
| 368 va_start(args, format); | |
| 369 length = base::vsnprintf(buffer.data(), buffer.size(), format, args); | |
| 370 va_end(args); | |
| 371 } | |
| 372 | |
| 373 CHECK_LT(static_cast<unsigned>(length), buffer.size()); | |
| 374 return String(reinterpret_cast<const LChar*>(buffer.data()), length); | |
| 375 } | |
| 376 | |
| 377 template <typename IntegerType> | |
| 378 static String integerToString(IntegerType input) { | |
| 379 IntegerToStringConverter<IntegerType> converter(input); | |
| 380 return StringImpl::create(converter.characters8(), converter.length()); | |
| 381 } | |
| 382 | |
| 383 String String::number(int number) { | |
| 384 return integerToString(number); | |
| 385 } | |
| 386 | |
| 387 String String::number(unsigned number) { | |
| 388 return integerToString(number); | |
| 389 } | |
| 390 | |
| 391 String String::number(long number) { | |
| 392 return integerToString(number); | |
| 393 } | |
| 394 | |
| 395 String String::number(unsigned long number) { | |
| 396 return integerToString(number); | |
| 397 } | |
| 398 | |
| 399 String String::number(long long number) { | |
| 400 return integerToString(number); | |
| 401 } | |
| 402 | |
| 403 String String::number(unsigned long long number) { | |
| 404 return integerToString(number); | |
| 405 } | |
| 406 | |
| 407 String String::number(double number, unsigned precision) { | |
| 408 NumberToStringBuffer buffer; | |
| 409 return String(numberToFixedPrecisionString(number, precision, buffer)); | |
| 410 } | |
| 411 | |
| 412 String String::numberToStringECMAScript(double number) { | |
| 413 NumberToStringBuffer buffer; | |
| 414 return String(numberToString(number, buffer)); | |
| 415 } | |
| 416 | |
| 417 String String::numberToStringFixedWidth(double number, unsigned decimalPlaces) { | |
| 418 NumberToStringBuffer buffer; | |
| 419 return String(numberToFixedWidthString(number, decimalPlaces, buffer)); | |
| 420 } | |
| 421 | |
| 422 int String::toIntStrict(bool* ok, int base) const { | |
| 423 if (!m_impl) { | |
| 424 if (ok) | |
| 425 *ok = false; | |
| 426 return 0; | |
| 427 } | |
| 428 return m_impl->toIntStrict(ok, base); | |
| 429 } | |
| 430 | |
| 431 unsigned String::toUIntStrict(bool* ok, int base) const { | |
| 432 if (!m_impl) { | |
| 433 if (ok) | |
| 434 *ok = false; | |
| 435 return 0; | |
| 436 } | |
| 437 return m_impl->toUIntStrict(ok, base); | |
| 438 } | |
| 439 | |
| 440 int64_t String::toInt64Strict(bool* ok, int base) const { | |
| 441 if (!m_impl) { | |
| 442 if (ok) | |
| 443 *ok = false; | |
| 444 return 0; | |
| 445 } | |
| 446 return m_impl->toInt64Strict(ok, base); | |
| 447 } | |
| 448 | |
| 449 uint64_t String::toUInt64Strict(bool* ok, int base) const { | |
| 450 if (!m_impl) { | |
| 451 if (ok) | |
| 452 *ok = false; | |
| 453 return 0; | |
| 454 } | |
| 455 return m_impl->toUInt64Strict(ok, base); | |
| 456 } | |
| 457 | |
| 458 int String::toInt(bool* ok) const { | |
| 459 if (!m_impl) { | |
| 460 if (ok) | |
| 461 *ok = false; | |
| 462 return 0; | |
| 463 } | |
| 464 return m_impl->toInt(ok); | |
| 465 } | |
| 466 | |
| 467 unsigned String::toUInt(bool* ok) const { | |
| 468 if (!m_impl) { | |
| 469 if (ok) | |
| 470 *ok = false; | |
| 471 return 0; | |
| 472 } | |
| 473 return m_impl->toUInt(ok); | |
| 474 } | |
| 475 | |
| 476 int64_t String::toInt64(bool* ok) const { | |
| 477 if (!m_impl) { | |
| 478 if (ok) | |
| 479 *ok = false; | |
| 480 return 0; | |
| 481 } | |
| 482 return m_impl->toInt64(ok); | |
| 483 } | |
| 484 | |
| 485 uint64_t String::toUInt64(bool* ok) const { | |
| 486 if (!m_impl) { | |
| 487 if (ok) | |
| 488 *ok = false; | |
| 489 return 0; | |
| 490 } | |
| 491 return m_impl->toUInt64(ok); | |
| 492 } | |
| 493 | |
| 494 double String::toDouble(bool* ok) const { | |
| 495 if (!m_impl) { | |
| 496 if (ok) | |
| 497 *ok = false; | |
| 498 return 0.0; | |
| 499 } | |
| 500 return m_impl->toDouble(ok); | |
| 501 } | |
| 502 | |
| 503 float String::toFloat(bool* ok) const { | |
| 504 if (!m_impl) { | |
| 505 if (ok) | |
| 506 *ok = false; | |
| 507 return 0.0f; | |
| 508 } | |
| 509 return m_impl->toFloat(ok); | |
| 510 } | |
| 511 | |
| 512 String String::isolatedCopy() const { | |
| 513 if (!m_impl) | |
| 514 return String(); | |
| 515 return m_impl->isolatedCopy(); | |
| 516 } | |
| 517 | |
| 518 bool String::isSafeToSendToAnotherThread() const { | |
| 519 return !m_impl || m_impl->isSafeToSendToAnotherThread(); | |
| 520 } | |
| 521 | |
| 522 void String::split(const StringView& separator, | |
| 523 bool allowEmptyEntries, | |
| 524 Vector<String>& result) const { | |
| 525 result.clear(); | |
| 526 | |
| 527 unsigned startPos = 0; | |
| 528 size_t endPos; | |
| 529 while ((endPos = find(separator, startPos)) != kNotFound) { | |
| 530 if (allowEmptyEntries || startPos != endPos) | |
| 531 result.push_back(substring(startPos, endPos - startPos)); | |
| 532 startPos = endPos + separator.length(); | |
| 533 } | |
| 534 if (allowEmptyEntries || startPos != length()) | |
| 535 result.push_back(substring(startPos)); | |
| 536 } | |
| 537 | |
| 538 void String::split(UChar separator, | |
| 539 bool allowEmptyEntries, | |
| 540 Vector<String>& result) const { | |
| 541 result.clear(); | |
| 542 | |
| 543 unsigned startPos = 0; | |
| 544 size_t endPos; | |
| 545 while ((endPos = find(separator, startPos)) != kNotFound) { | |
| 546 if (allowEmptyEntries || startPos != endPos) | |
| 547 result.push_back(substring(startPos, endPos - startPos)); | |
| 548 startPos = endPos + 1; | |
| 549 } | |
| 550 if (allowEmptyEntries || startPos != length()) | |
| 551 result.push_back(substring(startPos)); | |
| 552 } | |
| 553 | |
| 554 CString String::ascii() const { | |
| 555 // Printable ASCII characters 32..127 and the null character are | |
| 556 // preserved, characters outside of this range are converted to '?'. | |
| 557 | |
| 558 unsigned length = this->length(); | |
| 559 if (!length) { | |
| 560 char* characterBuffer; | |
| 561 return CString::createUninitialized(length, characterBuffer); | |
| 562 } | |
| 563 | |
| 564 if (this->is8Bit()) { | |
| 565 const LChar* characters = this->characters8(); | |
| 566 | |
| 567 char* characterBuffer; | |
| 568 CString result = CString::createUninitialized(length, characterBuffer); | |
| 569 | |
| 570 for (unsigned i = 0; i < length; ++i) { | |
| 571 LChar ch = characters[i]; | |
| 572 characterBuffer[i] = ch && (ch < 0x20 || ch > 0x7f) ? '?' : ch; | |
| 573 } | |
| 574 | |
| 575 return result; | |
| 576 } | |
| 577 | |
| 578 const UChar* characters = this->characters16(); | |
| 579 | |
| 580 char* characterBuffer; | |
| 581 CString result = CString::createUninitialized(length, characterBuffer); | |
| 582 | |
| 583 for (unsigned i = 0; i < length; ++i) { | |
| 584 UChar ch = characters[i]; | |
| 585 characterBuffer[i] = | |
| 586 ch && (ch < 0x20 || ch > 0x7f) ? '?' : static_cast<char>(ch); | |
| 587 } | |
| 588 | |
| 589 return result; | |
| 590 } | |
| 591 | |
| 592 CString String::latin1() const { | |
| 593 // Basic Latin1 (ISO) encoding - Unicode characters 0..255 are | |
| 594 // preserved, characters outside of this range are converted to '?'. | |
| 595 | |
| 596 unsigned length = this->length(); | |
| 597 | |
| 598 if (!length) | |
| 599 return CString("", 0); | |
| 600 | |
| 601 if (is8Bit()) | |
| 602 return CString(reinterpret_cast<const char*>(this->characters8()), length); | |
| 603 | |
| 604 const UChar* characters = this->characters16(); | |
| 605 | |
| 606 char* characterBuffer; | |
| 607 CString result = CString::createUninitialized(length, characterBuffer); | |
| 608 | |
| 609 for (unsigned i = 0; i < length; ++i) { | |
| 610 UChar ch = characters[i]; | |
| 611 characterBuffer[i] = ch > 0xff ? '?' : static_cast<char>(ch); | |
| 612 } | |
| 613 | |
| 614 return result; | |
| 615 } | |
| 616 | |
| 617 // Helper to write a three-byte UTF-8 code point to the buffer, caller must | |
| 618 // check room is available. | |
| 619 static inline void putUTF8Triple(char*& buffer, UChar ch) { | |
| 620 DCHECK_GE(ch, 0x0800); | |
| 621 *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) | 0xE0); | |
| 622 *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) | 0x80); | |
| 623 *buffer++ = static_cast<char>((ch & 0x3F) | 0x80); | |
| 624 } | |
| 625 | |
| 626 CString String::utf8(UTF8ConversionMode mode) const { | |
| 627 unsigned length = this->length(); | |
| 628 | |
| 629 if (!length) | |
| 630 return CString("", 0); | |
| 631 | |
| 632 // Allocate a buffer big enough to hold all the characters | |
| 633 // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes). | |
| 634 // Optimization ideas, if we find this function is hot: | |
| 635 // * We could speculatively create a CStringImpl to contain 'length' | |
| 636 // characters, and resize if necessary (i.e. if the buffer contains | |
| 637 // non-ascii characters). (Alternatively, scan the buffer first for | |
| 638 // ascii characters, so we know this will be sufficient). | |
| 639 // * We could allocate a CStringImpl with an appropriate size to | |
| 640 // have a good chance of being able to write the string into the | |
| 641 // buffer without reallocing (say, 1.5 x length). | |
| 642 if (length > std::numeric_limits<unsigned>::max() / 3) | |
| 643 return CString(); | |
| 644 Vector<char, 1024> bufferVector(length * 3); | |
| 645 | |
| 646 char* buffer = bufferVector.data(); | |
| 647 | |
| 648 if (is8Bit()) { | |
| 649 const LChar* characters = this->characters8(); | |
| 650 | |
| 651 ConversionResult result = | |
| 652 convertLatin1ToUTF8(&characters, characters + length, &buffer, | |
| 653 buffer + bufferVector.size()); | |
| 654 // (length * 3) should be sufficient for any conversion | |
| 655 DCHECK_NE(result, targetExhausted); | |
| 656 } else { | |
| 657 const UChar* characters = this->characters16(); | |
| 658 | |
| 659 if (mode == StrictUTF8ConversionReplacingUnpairedSurrogatesWithFFFD) { | |
| 660 const UChar* charactersEnd = characters + length; | |
| 661 char* bufferEnd = buffer + bufferVector.size(); | |
| 662 while (characters < charactersEnd) { | |
| 663 // Use strict conversion to detect unpaired surrogates. | |
| 664 ConversionResult result = convertUTF16ToUTF8(&characters, charactersEnd, | |
| 665 &buffer, bufferEnd, true); | |
| 666 DCHECK_NE(result, targetExhausted); | |
| 667 // Conversion fails when there is an unpaired surrogate. Put | |
| 668 // replacement character (U+FFFD) instead of the unpaired | |
| 669 // surrogate. | |
| 670 if (result != conversionOK) { | |
| 671 DCHECK_LE(0xD800, *characters); | |
| 672 DCHECK_LE(*characters, 0xDFFF); | |
| 673 // There should be room left, since one UChar hasn't been | |
| 674 // converted. | |
| 675 DCHECK_LE(buffer + 3, bufferEnd); | |
| 676 putUTF8Triple(buffer, replacementCharacter); | |
| 677 ++characters; | |
| 678 } | |
| 679 } | |
| 680 } else { | |
| 681 bool strict = mode == StrictUTF8Conversion; | |
| 682 ConversionResult result = | |
| 683 convertUTF16ToUTF8(&characters, characters + length, &buffer, | |
| 684 buffer + bufferVector.size(), strict); | |
| 685 // (length * 3) should be sufficient for any conversion | |
| 686 DCHECK_NE(result, targetExhausted); | |
| 687 | |
| 688 // Only produced from strict conversion. | |
| 689 if (result == sourceIllegal) { | |
| 690 DCHECK(strict); | |
| 691 return CString(); | |
| 692 } | |
| 693 | |
| 694 // Check for an unconverted high surrogate. | |
| 695 if (result == sourceExhausted) { | |
| 696 if (strict) | |
| 697 return CString(); | |
| 698 // This should be one unpaired high surrogate. Treat it the same | |
| 699 // was as an unpaired high surrogate would have been handled in | |
| 700 // the middle of a string with non-strict conversion - which is | |
| 701 // to say, simply encode it to UTF-8. | |
| 702 DCHECK_EQ(characters + 1, this->characters16() + length); | |
| 703 DCHECK_GE(*characters, 0xD800); | |
| 704 DCHECK_LE(*characters, 0xDBFF); | |
| 705 // There should be room left, since one UChar hasn't been | |
| 706 // converted. | |
| 707 DCHECK_LE(buffer + 3, buffer + bufferVector.size()); | |
| 708 putUTF8Triple(buffer, *characters); | |
| 709 } | |
| 710 } | |
| 711 } | |
| 712 | |
| 713 return CString(bufferVector.data(), buffer - bufferVector.data()); | |
| 714 } | |
| 715 | |
| 716 String String::make8BitFrom16BitSource(const UChar* source, size_t length) { | |
| 717 if (!length) | |
| 718 return emptyString; | |
| 719 | |
| 720 LChar* destination; | |
| 721 String result = String::createUninitialized(length, destination); | |
| 722 | |
| 723 copyLCharsFromUCharSource(destination, source, length); | |
| 724 | |
| 725 return result; | |
| 726 } | |
| 727 | |
| 728 String String::make16BitFrom8BitSource(const LChar* source, size_t length) { | |
| 729 if (!length) | |
| 730 return emptyString16Bit; | |
| 731 | |
| 732 UChar* destination; | |
| 733 String result = String::createUninitialized(length, destination); | |
| 734 | |
| 735 StringImpl::copyChars(destination, source, length); | |
| 736 | |
| 737 return result; | |
| 738 } | |
| 739 | |
| 740 String String::fromUTF8(const LChar* stringStart, size_t length) { | |
| 741 RELEASE_ASSERT(length <= std::numeric_limits<unsigned>::max()); | |
| 742 | |
| 743 if (!stringStart) | |
| 744 return String(); | |
| 745 | |
| 746 if (!length) | |
| 747 return emptyString; | |
| 748 | |
| 749 if (charactersAreAllASCII(stringStart, length)) | |
| 750 return StringImpl::create(stringStart, length); | |
| 751 | |
| 752 Vector<UChar, 1024> buffer(length); | |
| 753 UChar* bufferStart = buffer.data(); | |
| 754 | |
| 755 UChar* bufferCurrent = bufferStart; | |
| 756 const char* stringCurrent = reinterpret_cast<const char*>(stringStart); | |
| 757 if (convertUTF8ToUTF16( | |
| 758 &stringCurrent, reinterpret_cast<const char*>(stringStart + length), | |
| 759 &bufferCurrent, bufferCurrent + buffer.size()) != conversionOK) | |
| 760 return String(); | |
| 761 | |
| 762 unsigned utf16Length = bufferCurrent - bufferStart; | |
| 763 DCHECK_LT(utf16Length, length); | |
| 764 return StringImpl::create(bufferStart, utf16Length); | |
| 765 } | |
| 766 | |
| 767 String String::fromUTF8(const LChar* string) { | |
| 768 if (!string) | |
| 769 return String(); | |
| 770 return fromUTF8(string, strlen(reinterpret_cast<const char*>(string))); | |
| 771 } | |
| 772 | |
| 773 String String::fromUTF8(const CString& s) { | |
| 774 return fromUTF8(s.data()); | |
| 775 } | |
| 776 | |
| 777 String String::fromUTF8WithLatin1Fallback(const LChar* string, size_t size) { | |
| 778 String utf8 = fromUTF8(string, size); | |
| 779 if (!utf8) | |
| 780 return String(string, size); | |
| 781 return utf8; | |
| 782 } | |
| 783 | |
| 784 std::ostream& operator<<(std::ostream& out, const String& string) { | |
| 785 if (string.isNull()) | |
| 786 return out << "<null>"; | |
| 787 | |
| 788 out << '"'; | |
| 789 for (unsigned index = 0; index < string.length(); ++index) { | |
| 790 // Print shorthands for select cases. | |
| 791 UChar character = string[index]; | |
| 792 switch (character) { | |
| 793 case '\t': | |
| 794 out << "\\t"; | |
| 795 break; | |
| 796 case '\n': | |
| 797 out << "\\n"; | |
| 798 break; | |
| 799 case '\r': | |
| 800 out << "\\r"; | |
| 801 break; | |
| 802 case '"': | |
| 803 out << "\\\""; | |
| 804 break; | |
| 805 case '\\': | |
| 806 out << "\\\\"; | |
| 807 break; | |
| 808 default: | |
| 809 if (isASCIIPrintable(character)) { | |
| 810 out << static_cast<char>(character); | |
| 811 } else { | |
| 812 // Print "\uXXXX" for control or non-ASCII characters. | |
| 813 out << "\\u"; | |
| 814 out.width(4); | |
| 815 out.fill('0'); | |
| 816 out.setf(std::ios_base::hex, std::ios_base::basefield); | |
| 817 out.setf(std::ios::uppercase); | |
| 818 out << character; | |
| 819 } | |
| 820 break; | |
| 821 } | |
| 822 } | |
| 823 return out << '"'; | |
| 824 } | |
| 825 | |
| 826 #ifndef NDEBUG | |
| 827 void String::show() const { | |
| 828 dataLogF("%s\n", asciiDebug(impl()).data()); | |
| 829 } | |
| 830 #endif | |
| 831 | |
| 832 } // namespace WTF | |
| OLD | NEW |