src/inspector/String16.cpp - Issue 2300823002: Revert of [inspector] Initial import of v8_inspector.

Side by Side Diff: src/inspector/String16.cpp

Issue 2300823002: Revert of [inspector] Initial import of v8_inspector. (Closed)

Patch Set: Created 4 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
	(Empty)
1 // Copyright 2016 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.

4

5 #include "src/inspector/String16.h"

6

7 #include "src/inspector/ProtocolPlatform.h"

8

9 #include <algorithm>

10 #include <cctype>

11 #include <cstdio>

12 #include <cstdlib>

13 #include <cstring>

14 #include <locale>

15 #include <string>

16

17 namespace v8_inspector {

18

19 namespace {

20

21 bool isASCII(UChar c)

22 {

23 return !(c & ~0x7F);

24 }

25

26 bool isSpaceOrNewLine(UChar c)

27 {

28 return isASCII(c) && c <= ' ' && (c == ' ' \|\| (c <= 0xD && c >= 0x9));

29 }

30

31 int charactersToInteger(const UChar* characters, size_t length, bool* ok = nullp tr)

32 {

33 std::vector<char> buffer;

34 buffer.reserve(length + 1);

35 for (size_t i = 0; i < length; ++i) {

36 if (!isASCII(characters[i])) {

37 if (ok)

38 *ok = false;

39 return 0;

40 }

41 buffer.push_back(static_cast<char>(characters[i]));

42 }

43 buffer.push_back('\0');

44

45 char* endptr;

46 int result = std::strtol(buffer.data(), &endptr, 10);

47 if (ok)

48 ok = !(endptr);

49 return result;

50 }

51

52 const UChar replacementCharacter = 0xFFFD;

53 using UChar32 = uint32_t;

54

// Returns the total length (2, 3 or 4) of the UTF-8 sequence introduced by
// the lead byte |b0|, or 0 when |b0| is not a lead byte of such a sequence
// (a continuation byte, an ASCII byte, or a 0xF8+ byte).
inline int inlineUTF8SequenceLengthNonASCII(char b0)
{
    // Lead bytes always have their two top bits set (11xxxxxx).
    if ((b0 & 0xC0) != 0xC0)
        return 0;
    // The first clear bit below the top two determines the sequence length.
    if (!(b0 & 0x20))
        return 2; // 110xxxxx
    if (!(b0 & 0x10))
        return 3; // 1110xxxx
    if (!(b0 & 0x08))
        return 4; // 11110xxx
    return 0;
}

67

68 inline int inlineUTF8SequenceLength(char b0)

69 {

70 return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0);

71 }

72

// Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
// into the first byte, depending on how many bytes follow. The table is
// indexed by the total sequence length in bytes (index 0 is unused).
// Remember that sequences for legal UTF-8 will be 4 or fewer bytes total;
// the entries for 5 and 6 bytes are not reached by the encoder in this file.
static const unsigned char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };

79

// Outcome reported by the UTF-8 <-> UTF-16 conversion routines.
enum ConversionResult {
    // The whole source range was converted.
    conversionOK,
    // The source ended in the middle of a character.
    sourceExhausted,
    // The target buffer has insufficient room for the conversion.
    targetExhausted,
    // The source sequence is illegal or malformed.
    sourceIllegal
};

86

87 ConversionResult convertUTF16ToUTF8(

88 const UChar** sourceStart, const UChar* sourceEnd,

89 char** targetStart, char* targetEnd, bool strict)

90 {

91 ConversionResult result = conversionOK;

92 const UChar* source = *sourceStart;

93 char* target = *targetStart;

94 while (source < sourceEnd) {

95 UChar32 ch;

96 unsigned short bytesToWrite = 0;

97 const UChar32 byteMask = 0xBF;

98 const UChar32 byteMark = 0x80;

99 const UChar* oldSource = source; // In case we have to back up because o f target overflow.

100 ch = static_cast<unsigned short>(*source++);

101 // If we have a surrogate pair, convert to UChar32 first.

102 if (ch >= 0xD800 && ch <= 0xDBFF) {

103 // If the 16 bits following the high surrogate are in the source buf fer...

104 if (source < sourceEnd) {

105 UChar32 ch2 = static_cast<unsigned short>(*source);

106 // If it's a low surrogate, convert to UChar32.

107 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {

108 ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000;

109 ++source;

110 } else if (strict) { // it's an unpaired high surrogate

111 --source; // return to the illegal value itself

112 result = sourceIllegal;

113 break;

114 }

115 } else { // We don't have the 16 bits following the high surrogate.

116 --source; // return to the high surrogate

117 result = sourceExhausted;

118 break;

119 }

120 } else if (strict) {

121 // UTF-16 surrogate values are illegal in UTF-32

122 if (ch >= 0xDC00 && ch <= 0xDFFF) {

123 --source; // return to the illegal value itself

124 result = sourceIllegal;

125 break;

126 }

127 }

128 // Figure out how many bytes the result will require

129 if (ch < (UChar32)0x80) {

130 bytesToWrite = 1;

131 } else if (ch < (UChar32)0x800) {

132 bytesToWrite = 2;

133 } else if (ch < (UChar32)0x10000) {

134 bytesToWrite = 3;

135 } else if (ch < (UChar32)0x110000) {

136 bytesToWrite = 4;

137 } else {

138 bytesToWrite = 3;

139 ch = replacementCharacter;

140 }

141

142 target += bytesToWrite;

143 if (target > targetEnd) {

144 source = oldSource; // Back up source pointer!

145 target -= bytesToWrite;

146 result = targetExhausted;

147 break;

148 }

149 switch (bytesToWrite) { // note: everything falls through.

150 case 4:

151 *--target = (char)((ch \| byteMark) & byteMask);

152 ch >>= 6;

153 case 3:

154 *--target = (char)((ch \| byteMark) & byteMask);

155 ch >>= 6;

156 case 2:

157 *--target = (char)((ch \| byteMark) & byteMask);

158 ch >>= 6;

159 case 1:

160 *--target = (char)(ch \| firstByteMark[bytesToWrite]);

161 }

162 target += bytesToWrite;

163 }

164 *sourceStart = source;

165 *targetStart = target;

166 return result;

167 }

168

/**
 * True when the code point lies in the Basic Multilingual Plane
 * (U+0000..U+ffff). Counterpart of the ICU macro of the same name
 * (stable since ICU 2.8).
 * @param c 32-bit code point
 * @return TRUE or FALSE
 */
#define U_IS_BMP(c) (static_cast<uint32_t>(c) <= 0xffff)

/**
 * True when the code point is supplementary (U+10000..U+10ffff).
 * Counterpart of the ICU macro of the same name (stable since ICU 2.8).
 * @param c 32-bit code point
 * @return TRUE or FALSE
 */
#define U_IS_SUPPLEMENTARY(c) (static_cast<uint32_t>((c) - 0x10000) <= 0xfffff)

/**
 * True when the code point is a surrogate (U+d800..U+dfff).
 * Counterpart of the ICU macro of the same name (stable since ICU 2.4).
 * @param c 32-bit code point
 * @return TRUE or FALSE
 */
#define U_IS_SURROGATE(c) (((c) & 0xfffff800) == 0xd800)

192

/**
 * Get the lead surrogate (0xd800..0xdbff) for a
 * supplementary code point (0x10000..0x10ffff).
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @return lead surrogate (U+d800..U+dbff) for supplementary
 * @stable ICU 2.4
 */
#define U16_LEAD(supplementary) (static_cast<UChar>(((supplementary) >> 10) + 0xd7c0))

/**
 * Get the trail surrogate (0xdc00..0xdfff) for a
 * supplementary code point (0x10000..0x10ffff).
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @return trail surrogate (U+dc00..U+dfff) for supplementary
 * @stable ICU 2.4
 */
#define U16_TRAIL(supplementary) (static_cast<UChar>(((supplementary) & 0x3ff) | 0xdc00))

210

// Checks whether the |length| bytes starting at |source| form one well-formed
// UTF-8 sequence.
//
// This must be called with the length pre-determined by the first byte.
// If presented with a length > 4 (or < 1), this returns false. The Unicode
// definition of UTF-8 goes up to 4-byte sequences.
//
// Rejected inputs include bad continuation bytes, overlong forms (lead bytes
// 0xC0/0xC1, 0xE0 followed by < 0xA0, 0xF0 followed by < 0x90), encoded
// surrogates (0xED followed by > 0x9F), values above U+10FFFF (0xF4 followed
// by > 0x8F, or a lead byte above 0xF4), and a lone continuation byte.
static bool isLegalUTF8(const unsigned char* source, int length)
{
    unsigned char a;
    // Bytes are validated from the last one back towards the lead byte.
    const unsigned char* srcptr = source + length;
    switch (length) {
    default:
        return false;
    // Everything else falls through when "true"...
    case 4:
        if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
            return false;
        // fall through
    case 3:
        if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
            return false;
        // fall through
    case 2:
        // Only the upper bound is checked here; the lower bound for the
        // second byte depends on the lead byte and is handled below.
        if ((a = (*--srcptr)) > 0xBF)
            return false;

        // no fall-through in this inner switch
        switch (*source) {
        case 0xE0:
            if (a < 0xA0)
                return false;
            break;
        case 0xED:
            if (a > 0x9F)
                return false;
            break;
        case 0xF0:
            if (a < 0x90)
                return false;
            break;
        case 0xF4:
            if (a > 0x8F)
                return false;
            break;
        default:
            if (a < 0x80)
                return false;
        }
        // fall through
    case 1:
        // The lead byte itself must not be a continuation byte or one of the
        // overlong two-byte leads 0xC0/0xC1. The byte value is tested, not
        // the pointer.
        if (*source >= 0x80 && *source < 0xC2)
            return false;
    }
    if (*source > 0xF4)
        return false;
    return true;
}

263

264 // Magic values subtracted from a buffer value during UTF8 conversion.

265 // This table contains as many values as there might be trailing bytes

266 // in a UTF-8 sequence.

267 static const UChar32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E20 80UL, 0x03C82080UL, static_cast<UChar32>(0xFA082080UL), static_cast<UChar32>(0x8 2082080UL) };

268

269 static inline UChar32 readUTF8Sequence(const char*& sequence, unsigned length)

270 {

271 UChar32 character = 0;

272

273 // The cases all fall through.

274 switch (length) {

275 case 6:

276 character += static_cast<unsigned char>(*sequence++);

277 character <<= 6;

278 case 5:

279 character += static_cast<unsigned char>(*sequence++);

280 character <<= 6;

281 case 4:

282 character += static_cast<unsigned char>(*sequence++);

283 character <<= 6;

284 case 3:

285 character += static_cast<unsigned char>(*sequence++);

286 character <<= 6;

287 case 2:

288 character += static_cast<unsigned char>(*sequence++);

289 character <<= 6;

290 case 1:

291 character += static_cast<unsigned char>(*sequence++);

292 }

293

294 return character - offsetsFromUTF8[length - 1];

295 }

296

297 ConversionResult convertUTF8ToUTF16(

298 const char** sourceStart, const char* sourceEnd,

299 UChar** targetStart, UChar* targetEnd, bool* sourceAllASCII, bool strict)

300 {

301 ConversionResult result = conversionOK;

302 const char* source = *sourceStart;

303 UChar* target = *targetStart;

304 UChar orAllData = 0;

305 while (source < sourceEnd) {

306 int utf8SequenceLength = inlineUTF8SequenceLength(*source);

307 if (sourceEnd - source < utf8SequenceLength) {

308 result = sourceExhausted;

309 break;

310 }

311 // Do this check whether lenient or strict

312 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), utf8Seq uenceLength)) {

313 result = sourceIllegal;

314 break;

315 }

316

317 UChar32 character = readUTF8Sequence(source, utf8SequenceLength);

318

319 if (target >= targetEnd) {

320 source -= utf8SequenceLength; // Back up source pointer!

321 result = targetExhausted;

322 break;

323 }

324

325 if (U_IS_BMP(character)) {

326 // UTF-16 surrogate values are illegal in UTF-32

327 if (U_IS_SURROGATE(character)) {

328 if (strict) {

329 source -= utf8SequenceLength; // return to the illegal value itself

330 result = sourceIllegal;

331 break;

332 }

333 *target++ = replacementCharacter;

334 orAllData \|= replacementCharacter;

335 } else {

336 *target++ = static_cast<UChar>(character); // normal case

337 orAllData \|= character;

338 }

339 } else if (U_IS_SUPPLEMENTARY(character)) {

340 // target is a character in range 0xFFFF - 0x10FFFF

341 if (target + 1 >= targetEnd) {

342 source -= utf8SequenceLength; // Back up source pointer!

343 result = targetExhausted;

344 break;

345 }

346 *target++ = U16_LEAD(character);

347 *target++ = U16_TRAIL(character);

348 orAllData = 0xffff;

349 } else {

350 if (strict) {

351 source -= utf8SequenceLength; // return to the start

352 result = sourceIllegal;

353 break; // Bail out; shouldn't continue

354 } else {

355 *target++ = replacementCharacter;

356 orAllData \|= replacementCharacter;

357 }

358 }

359 }

360 *sourceStart = source;

361 *targetStart = target;

362

363 if (sourceAllASCII)

364 *sourceAllASCII = !(orAllData & ~0x7f);

365

366 return result;

367 }

368

369 // Helper to write a three-byte UTF-8 code point to the buffer, caller must chec k room is available.

370 static inline void putUTF8Triple(char*& buffer, UChar ch)

371 {

372 *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) \| 0xE0);

373 *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) \| 0x80);

374 *buffer++ = static_cast<char>((ch & 0x3F) \| 0x80);

375 }

376

377 } // namespace

378

379 // static

380 String16 String16::fromInteger(int number)

381 {

382 const size_t kBufferSize = 50;

383 char buffer[kBufferSize];

384 std::snprintf(buffer, kBufferSize, "%d", number);

385 return String16(buffer);

386 }

387

388 // static

389 String16 String16::fromDouble(double number)

390 {

391 const size_t kBufferSize = 100;

392 char buffer[kBufferSize];

393 std::snprintf(buffer, kBufferSize, "%f", number);

394 return String16(buffer);

395 }

396

397 // static

398 String16 String16::fromDoublePrecision3(double number)

399 {

400 const size_t kBufferSize = 100;

401 char buffer[kBufferSize];

402 std::snprintf(buffer, kBufferSize, "%.3g", number);

403 return String16(buffer);

404 }

405

406 // static

407 String16 String16::fromDoublePrecision6(double number)

408 {

409 const size_t kBufferSize = 100;

410 char buffer[kBufferSize];

411 std::snprintf(buffer, kBufferSize, "%.6g", number);

412 return String16(buffer);

413 }

414

415 int String16::toInteger(bool* ok) const

416 {

417 return charactersToInteger(characters16(), length(), ok);

418 }

419

420 String16 String16::stripWhiteSpace() const

421 {

422 if (!length())

423 return String16();

424

425 unsigned start = 0;

426 unsigned end = length() - 1;

427

428 // skip white space from start

429 while (start <= end && isSpaceOrNewLine(characters16()[start]))

430 ++start;

431

432 // only white space

433 if (start > end)

434 return String16();

435

436 // skip white space from end

437 while (end && isSpaceOrNewLine(characters16()[end]))

438 --end;

439

440 if (!start && end == length() - 1)

441 return *this;

442 return String16(characters16() + start, end + 1 - start);

443 }

444

445 String16Builder::String16Builder()

446 {

447 }

448

449 void String16Builder::append(const String16& s)

450 {

451 m_buffer.insert(m_buffer.end(), s.characters16(), s.characters16() + s.lengt h());

452 }

453

454 void String16Builder::append(UChar c)

455 {

456 m_buffer.push_back(c);

457 }

458

459 void String16Builder::append(char c)

460 {

461 UChar u = c;

462 m_buffer.push_back(u);

463 }

464

465 void String16Builder::append(const UChar* characters, size_t length)

466 {

467 m_buffer.insert(m_buffer.end(), characters, characters + length);

468 }

469

470 void String16Builder::append(const char* characters, size_t length)

471 {

472 m_buffer.insert(m_buffer.end(), characters, characters + length);

473 }

474

475 String16 String16Builder::toString()

476 {

477 return String16(m_buffer.data(), m_buffer.size());

478 }

479

480 void String16Builder::reserveCapacity(size_t capacity)

481 {

482 m_buffer.reserve(capacity);

483 }

484

485 String16 String16::fromUTF8(const char* stringStart, size_t length)

486 {

487 if (!stringStart \|\| !length)

488 return String16();

489

490 std::vector<UChar> buffer(length);

491 UChar* bufferStart = buffer.data();

492

493 UChar* bufferCurrent = bufferStart;

494 const char* stringCurrent = stringStart;

495 if (convertUTF8ToUTF16(&stringCurrent, stringStart + length, &bufferCurrent, bufferCurrent + buffer.size(), 0, true) != conversionOK)

496 return String16();

497

498 unsigned utf16Length = bufferCurrent - bufferStart;

499 return String16(bufferStart, utf16Length);

500 }

501

502 std::string String16::utf8() const

503 {

504 unsigned length = this->length();

505

506 if (!length)

507 return std::string("");

508

509 // Allocate a buffer big enough to hold all the characters

510 // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes).

511 // Optimization ideas, if we find this function is hot:

512 // * We could speculatively create a CStringBuffer to contain 'length'

513 // characters, and resize if necessary (i.e. if the buffer contains

514 // non-ascii characters). (Alternatively, scan the buffer first for

515 // ascii characters, so we know this will be sufficient).

516 // * We could allocate a CStringBuffer with an appropriate size to

517 // have a good chance of being able to write the string into the

518 // buffer without reallocing (say, 1.5 x length).

519 if (length > std::numeric_limits<unsigned>::max() / 3)

520 return std::string();

521 std::vector<char> bufferVector(length * 3);

522 char* buffer = bufferVector.data();

523 const UChar* characters = m_impl.data();

524

525 ConversionResult result = convertUTF16ToUTF8(&characters, characters + lengt h, &buffer, buffer + bufferVector.size(), false);

526 DCHECK(result != targetExhausted); // (length * 3) should be sufficient for any conversion

527

528 // Only produced from strict conversion.

529 DCHECK(result != sourceIllegal);

530

531 // Check for an unconverted high surrogate.

532 if (result == sourceExhausted) {

533 // This should be one unpaired high surrogate. Treat it the same

534 // was as an unpaired high surrogate would have been handled in

535 // the middle of a string with non-strict conversion - which is

536 // to say, simply encode it to UTF-8.

537 DCHECK((characters + 1) == (m_impl.data() + length));

538 DCHECK((characters >= 0xD800) && (characters <= 0xDBFF));

539 // There should be room left, since one UChar hasn't been

540 // converted.

541 DCHECK((buffer + 3) <= (buffer + bufferVector.size()));

542 putUTF8Triple(buffer, *characters);

543 }

544

545 return std::string(bufferVector.data(), buffer - bufferVector.data());

546 }

547

548 } // namespace v8_inspector

OLD	NEW

« no previous file with comments | « src/inspector/String16.h ('k') | src/inspector/StringUtil.h » ('j') | no next file with comments »