third_party/WebKit/Source/platform/v8_inspector/String16STL.cpp - Issue 2251343003: [DevTools] Generate separate copies of inspector_protocol.

Side by Side Diff: third_party/WebKit/Source/platform/v8_inspector/String16STL.cpp

Issue 2251343003: [DevTools] Generate separate copies of inspector_protocol. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 4 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« third_party/WebKit/Source/platform/v8_inspector/String16.cpp ('K') | « third_party/WebKit/Source/platform/v8_inspector/String16STL.h ('k') | third_party/WebKit/Source/platform/v8_inspector/String16WTF.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
	(Empty)
1 // Copyright 2016 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.

4

5 #include "platform/inspector_protocol/InspectorProtocol.h"

6

7 #include <algorithm>

8 #include <cctype>

9 #include <cstdio>

10 #include <locale>

11

12 namespace blink {

13 namespace protocol {

14

// U+FFFD; the converters in this file emit it in place of values they cannot
// encode (out-of-range code points, and surrogates when not in strict mode).
15 const UChar replacementCharacter = 0xFFFD;

// Wide enough to hold a full code point while converting between encodings.
16 using UChar32 = uint32_t;

17

// Returns the total length (2, 3 or 4) of the UTF-8 sequence introduced by the
// non-ASCII lead byte |b0|, or 0 when |b0| is not a valid lead byte (for
// example a continuation byte, or a 5/6-byte lead).
inline int inlineUTF8SequenceLengthNonASCII(char b0)
{
    const int leadBits = b0;

    // A lead byte always has its two top bits set.
    if ((leadBits & 0xC0) != 0xC0)
        return 0;

    return (leadBits & 0xE0) == 0xC0 ? 2 // 110xxxxx
        : (leadBits & 0xF0) == 0xE0 ? 3 // 1110xxxx
        : (leadBits & 0xF8) == 0xF0 ? 4 // 11110xxx
        : 0;
}

30

31 inline int inlineUTF8SequenceLength(char b0)

32 {

33 return String16::isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0);

34 }

35

// Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
// into the first byte, depending on how many bytes follow. There are
// as many entries in this table as there are UTF-8 sequence types.
// (I.e., one byte sequence, two byte... etc.). Remember that sequences
// for legal UTF-8 will be 4 or fewer bytes total.
// The table is indexed by the total number of bytes in the sequence.
static const unsigned char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };

42

// Outcome reported by the UTF-8 <-> UTF-16 converters in this file.
enum ConversionResult {
    // The whole source range was converted.
    conversionOK,
    // The source ended in the middle of a character.
    sourceExhausted,
    // The target buffer has no room for the next character.
    targetExhausted,
    // The source contains an illegal or malformed sequence.
    sourceIllegal
};

49

50 ConversionResult convertUTF16ToUTF8(

51 const UChar** sourceStart, const UChar* sourceEnd,

52 char** targetStart, char* targetEnd, bool strict)

53 {

54 ConversionResult result = conversionOK;

55 const UChar* source = *sourceStart;

56 char* target = *targetStart;

57 while (source < sourceEnd) {

58 UChar32 ch;

59 unsigned short bytesToWrite = 0;

60 const UChar32 byteMask = 0xBF;

61 const UChar32 byteMark = 0x80;

62 const UChar* oldSource = source; // In case we have to back up because o f target overflow.

63 ch = static_cast<unsigned short>(*source++);

64 // If we have a surrogate pair, convert to UChar32 first.

65 if (ch >= 0xD800 && ch <= 0xDBFF) {

66 // If the 16 bits following the high surrogate are in the source buf fer...

67 if (source < sourceEnd) {

68 UChar32 ch2 = static_cast<unsigned short>(*source);

69 // If it's a low surrogate, convert to UChar32.

70 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {

71 ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000;

72 ++source;

73 } else if (strict) { // it's an unpaired high surrogate

74 --source; // return to the illegal value itself

75 result = sourceIllegal;

76 break;

77 }

78 } else { // We don't have the 16 bits following the high surrogate.

79 --source; // return to the high surrogate

80 result = sourceExhausted;

81 break;

82 }

83 } else if (strict) {

84 // UTF-16 surrogate values are illegal in UTF-32

85 if (ch >= 0xDC00 && ch <= 0xDFFF) {

86 --source; // return to the illegal value itself

87 result = sourceIllegal;

88 break;

89 }

90 }

91 // Figure out how many bytes the result will require

92 if (ch < (UChar32)0x80) {

93 bytesToWrite = 1;

94 } else if (ch < (UChar32)0x800) {

95 bytesToWrite = 2;

96 } else if (ch < (UChar32)0x10000) {

97 bytesToWrite = 3;

98 } else if (ch < (UChar32)0x110000) {

99 bytesToWrite = 4;

100 } else {

101 bytesToWrite = 3;

102 ch = replacementCharacter;

103 }

104

105 target += bytesToWrite;

106 if (target > targetEnd) {

107 source = oldSource; // Back up source pointer!

108 target -= bytesToWrite;

109 result = targetExhausted;

110 break;

111 }

112 switch (bytesToWrite) { // note: everything falls through.

113 case 4:

114 *--target = (char)((ch \| byteMark) & byteMask);

115 ch >>= 6;

116 case 3:

117 *--target = (char)((ch \| byteMark) & byteMask);

118 ch >>= 6;

119 case 2:

120 *--target = (char)((ch \| byteMark) & byteMask);

121 ch >>= 6;

122 case 1:

123 *--target = (char)(ch \| firstByteMark[bytesToWrite]);

124 }

125 target += bytesToWrite;

126 }

127 *sourceStart = source;

128 *targetStart = target;

129 return result;

130 }

131

/**
 * Is this code point a BMP code point (U+0000..U+ffff)?
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.8
 */
#define U_IS_BMP(c) ((uint32_t)(c) <= 0xffff)

/**
 * Is this code point a supplementary code point (U+10000..U+10ffff)?
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.8
 */
#define U_IS_SUPPLEMENTARY(c) ((uint32_t)((c) - 0x10000) <= 0xfffff)

/**
 * Is this code point a surrogate (U+d800..U+dfff)?
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U_IS_SURROGATE(c) (((c) & 0xfffff800) == 0xd800)

/**
 * Get the lead surrogate (0xd800..0xdbff) for a
 * supplementary code point (0x10000..0x10ffff).
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @return lead surrogate (U+d800..U+dbff) for supplementary
 * @stable ICU 2.4
 */
#define U16_LEAD(supplementary) (UChar)(((supplementary) >> 10) + 0xd7c0)

/**
 * Get the trail surrogate (0xdc00..0xdfff) for a
 * supplementary code point (0x10000..0x10ffff).
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @return trail surrogate (U+dc00..U+dfff) for supplementary
 * @stable ICU 2.4
 */
#define U16_TRAIL(supplementary) (UChar)(((supplementary) & 0x3ff) | 0xdc00)

173

// Checks that the |length| bytes starting at |source| form a single
// well-formed UTF-8 sequence (no overlong forms, no encoded surrogates,
// nothing above U+10FFFF).
//
// This must be called with the length pre-determined by the first byte.
// If presented with a length > 4, this returns false. The Unicode
// definition of UTF-8 goes up to 4-byte sequences. A length below 1 is
// rejected as well.
static bool isLegalUTF8(const unsigned char* source, int length)
{
    unsigned char a;
    // Trailing bytes are validated from the last one back towards the lead.
    const unsigned char* srcptr = source + length;
    switch (length) {
    default:
        return false;
    // Everything else falls through when "true"...
    case 4:
        if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
            return false;
        // fall through
    case 3:
        if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
            return false;
        // fall through
    case 2:
        if ((a = (*--srcptr)) > 0xBF)
            return false;

        // The second byte has a narrower legal range for some lead bytes.
        // no fall-through in this inner switch
        switch (*source) {
        case 0xE0: // would otherwise allow overlong 3-byte forms
            if (a < 0xA0)
                return false;
            break;
        case 0xED: // would otherwise allow encoded surrogates
            if (a > 0x9F)
                return false;
            break;
        case 0xF0: // would otherwise allow overlong 4-byte forms
            if (a < 0x90)
                return false;
            break;
        case 0xF4: // would otherwise allow values above U+10FFFF
            if (a > 0x8F)
                return false;
            break;
        default:
            if (a < 0x80)
                return false;
        }
        // fall through
    case 1:
        // Continuation bytes and the overlong leads 0xC0/0xC1 cannot start a
        // sequence.
        if (*source >= 0x80 && *source < 0xC2)
            return false;
    }
    if (*source > 0xF4)
        return false;
    return true;
}

226

227 // Magic values subtracted from a buffer value during UTF8 conversion.

228 // This table contains as many values as there might be trailing bytes

229 // in a UTF-8 sequence.

230 static const UChar32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E20 80UL, 0x03C82080UL, static_cast<UChar32>(0xFA082080UL), static_cast<UChar32>(0x8 2082080UL) };

231

232 static inline UChar32 readUTF8Sequence(const char*& sequence, unsigned length)

233 {

234 UChar32 character = 0;

235

236 // The cases all fall through.

237 switch (length) {

238 case 6:

239 character += static_cast<unsigned char>(*sequence++);

240 character <<= 6;

241 case 5:

242 character += static_cast<unsigned char>(*sequence++);

243 character <<= 6;

244 case 4:

245 character += static_cast<unsigned char>(*sequence++);

246 character <<= 6;

247 case 3:

248 character += static_cast<unsigned char>(*sequence++);

249 character <<= 6;

250 case 2:

251 character += static_cast<unsigned char>(*sequence++);

252 character <<= 6;

253 case 1:

254 character += static_cast<unsigned char>(*sequence++);

255 }

256

257 return character - offsetsFromUTF8[length - 1];

258 }

259

260 ConversionResult convertUTF8ToUTF16(

261 const char** sourceStart, const char* sourceEnd,

262 UChar** targetStart, UChar* targetEnd, bool* sourceAllASCII, bool strict)

263 {

264 ConversionResult result = conversionOK;

265 const char* source = *sourceStart;

266 UChar* target = *targetStart;

267 UChar orAllData = 0;

268 while (source < sourceEnd) {

269 int utf8SequenceLength = inlineUTF8SequenceLength(*source);

270 if (sourceEnd - source < utf8SequenceLength) {

271 result = sourceExhausted;

272 break;

273 }

274 // Do this check whether lenient or strict

275 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), utf8Seq uenceLength)) {

276 result = sourceIllegal;

277 break;

278 }

279

280 UChar32 character = readUTF8Sequence(source, utf8SequenceLength);

281

282 if (target >= targetEnd) {

283 source -= utf8SequenceLength; // Back up source pointer!

284 result = targetExhausted;

285 break;

286 }

287

288 if (U_IS_BMP(character)) {

289 // UTF-16 surrogate values are illegal in UTF-32

290 if (U_IS_SURROGATE(character)) {

291 if (strict) {

292 source -= utf8SequenceLength; // return to the illegal value itself

293 result = sourceIllegal;

294 break;

295 }

296 *target++ = replacementCharacter;

297 orAllData \|= replacementCharacter;

298 } else {

299 *target++ = static_cast<UChar>(character); // normal case

300 orAllData \|= character;

301 }

302 } else if (U_IS_SUPPLEMENTARY(character)) {

303 // target is a character in range 0xFFFF - 0x10FFFF

304 if (target + 1 >= targetEnd) {

305 source -= utf8SequenceLength; // Back up source pointer!

306 result = targetExhausted;

307 break;

308 }

309 *target++ = U16_LEAD(character);

310 *target++ = U16_TRAIL(character);

311 orAllData = 0xffff;

312 } else {

313 if (strict) {

314 source -= utf8SequenceLength; // return to the start

315 result = sourceIllegal;

316 break; // Bail out; shouldn't continue

317 } else {

318 *target++ = replacementCharacter;

319 orAllData \|= replacementCharacter;

320 }

321 }

322 }

323 *sourceStart = source;

324 *targetStart = target;

325

326 if (sourceAllASCII)

327 *sourceAllASCII = !(orAllData & ~0x7f);

328

329 return result;

330 }

331

332 // Helper to write a three-byte UTF-8 code point to the buffer, caller must chec k room is available.

333 static inline void putUTF8Triple(char*& buffer, UChar ch)

334 {

335 DCHECK_GE(ch, 0x0800);

336 *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) \| 0xE0);

337 *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) \| 0x80);

338 *buffer++ = static_cast<char>((ch & 0x3F) \| 0x80);

339 }

340

341 String16 String16::fromUTF8(const char* stringStart, size_t length)

342 {

343 if (!stringStart \|\| !length)

344 return String16();

345

346 std::vector<UChar> buffer(length);

347 UChar* bufferStart = buffer.data();

348

349 UChar* bufferCurrent = bufferStart;

350 const char* stringCurrent = stringStart;

351 if (convertUTF8ToUTF16(&stringCurrent, stringStart + length, &bufferCurrent, bufferCurrent + buffer.size(), 0, true) != conversionOK)

352 return String16();

353

354 unsigned utf16Length = bufferCurrent - bufferStart;

355 return String16(bufferStart, utf16Length);

356 }

357

358 std::string String16::utf8() const

359 {

360 unsigned length = this->length();

361

362 if (!length)

363 return std::string("");

364

365 // Allocate a buffer big enough to hold all the characters

366 // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes).

367 // Optimization ideas, if we find this function is hot:

368 // * We could speculatively create a CStringBuffer to contain 'length'

369 // characters, and resize if necessary (i.e. if the buffer contains

370 // non-ascii characters). (Alternatively, scan the buffer first for

371 // ascii characters, so we know this will be sufficient).

372 // * We could allocate a CStringBuffer with an appropriate size to

373 // have a good chance of being able to write the string into the

374 // buffer without reallocing (say, 1.5 x length).

375 if (length > std::numeric_limits<unsigned>::max() / 3)

376 return std::string();

377 std::vector<char> bufferVector(length * 3);

378 char* buffer = bufferVector.data();

379 const UChar* characters = m_impl.data();

380

381 ConversionResult result = convertUTF16ToUTF8(&characters, characters + lengt h, &buffer, buffer + bufferVector.size(), false);

382 DCHECK(result != targetExhausted); // (length * 3) should be sufficient for any conversion

383

384 // Only produced from strict conversion.

385 DCHECK(result != sourceIllegal);

386

387 // Check for an unconverted high surrogate.

388 if (result == sourceExhausted) {

389 // This should be one unpaired high surrogate. Treat it the same

390 // was as an unpaired high surrogate would have been handled in

391 // the middle of a string with non-strict conversion - which is

392 // to say, simply encode it to UTF-8.

393 DCHECK((characters + 1) == (m_impl.data() + length));

394 DCHECK((characters >= 0xD800) && (characters <= 0xDBFF));

395 // There should be room left, since one UChar hasn't been

396 // converted.

397 DCHECK((buffer + 3) <= (buffer + bufferVector.size()));

398 putUTF8Triple(buffer, *characters);

399 }

400

401 return std::string(bufferVector.data(), buffer - bufferVector.data());

402 }

403

404 } // namespace protocol

405 } // namespace blink

OLD	NEW