src/inspector/String16.cpp - Issue 2292573002: [inspector] Initial import of v8_inspector.

Side by Side Diff: src/inspector/String16.cpp

Issue 2292573002: [inspector] Initial import of v8_inspector. (Closed)

Patch Set: format the code, disable cpplint Created 4 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
(Empty)
	1 // Copyright 2016 the V8 project authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4

	5 #include "src/inspector/String16.h"

	6

	7 #include "src/inspector/ProtocolPlatform.h"

	8

	9 #include <algorithm>

	10 #include <cctype>

	11 #include <cstdio>

	12 #include <cstdlib>

	13 #include <cstring>

	14 #include <locale>

	15 #include <string>

	16

	17 namespace v8_inspector {

	18

	19 namespace {

	20

	21 bool isASCII(UChar c) { return !(c & ~0x7F); }

	22

	23 bool isSpaceOrNewLine(UChar c) {

	24 return isASCII(c) && c <= ' ' && (c == ' ' \|\| (c <= 0xD && c >= 0x9));

	25 }

	26

	27 int charactersToInteger(const UChar* characters, size_t length,

	28 bool* ok = nullptr) {

	29 std::vector<char> buffer;

	30 buffer.reserve(length + 1);

	31 for (size_t i = 0; i < length; ++i) {

	32 if (!isASCII(characters[i])) {

	33 if (ok) *ok = false;

	34 return 0;

	35 }

	36 buffer.push_back(static_cast<char>(characters[i]));

	37 }

	38 buffer.push_back('\0');

	39

	40 char* endptr;

	41 int result = std::strtol(buffer.data(), &endptr, 10);

	42 if (ok) ok = !(endptr);

	43 return result;

	44 }

	45

	46 const UChar replacementCharacter = 0xFFFD;

	47 using UChar32 = uint32_t;

	48

	49 inline int inlineUTF8SequenceLengthNonASCII(char b0) {

	50 if ((b0 & 0xC0) != 0xC0) return 0;

	51 if ((b0 & 0xE0) == 0xC0) return 2;

	52 if ((b0 & 0xF0) == 0xE0) return 3;

	53 if ((b0 & 0xF8) == 0xF0) return 4;

	54 return 0;

	55 }

	56

	57 inline int inlineUTF8SequenceLength(char b0) {

	58 return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0);

	59 }

	60

	61 // Once the bits are split out into bytes of UTF-8, this is a mask OR-ed

	62 // into the first byte, depending on how many bytes follow. There are

	63 // as many entries in this table as there are UTF-8 sequence types.

	64 // (I.e., one byte sequence, two byte... etc.). Remember that sequences

	65 // for legal UTF-8 will be 4 or fewer bytes total.

	66 static const unsigned char firstByteMark[7] = {0x00, 0x00, 0xC0, 0xE0,

	67 0xF0, 0xF8, 0xFC};

	68

	69 typedef enum {

	70 conversionOK, // conversion successful

	71 sourceExhausted, // partial character in source, but hit end

	72 targetExhausted, // insuff. room in target for conversion

	73 sourceIllegal // source sequence is illegal/malformed

	74 } ConversionResult;

	75

	76 ConversionResult convertUTF16ToUTF8(const UChar** sourceStart,

	77 const UChar* sourceEnd, char** targetStart,

	78 char* targetEnd, bool strict) {

	79 ConversionResult result = conversionOK;

	80 const UChar* source = *sourceStart;

	81 char* target = *targetStart;

	82 while (source < sourceEnd) {

	83 UChar32 ch;

	84 unsigned short bytesToWrite = 0;

	85 const UChar32 byteMask = 0xBF;

	86 const UChar32 byteMark = 0x80;

	87 const UChar* oldSource =

	88 source; // In case we have to back up because of target overflow.

	89 ch = static_cast<unsigned short>(*source++);

	90 // If we have a surrogate pair, convert to UChar32 first.

	91 if (ch >= 0xD800 && ch <= 0xDBFF) {

	92 // If the 16 bits following the high surrogate are in the source buffer...

	93 if (source < sourceEnd) {

	94 UChar32 ch2 = static_cast<unsigned short>(*source);

	95 // If it's a low surrogate, convert to UChar32.

	96 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {

	97 ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000;

	98 ++source;

	99 } else if (strict) { // it's an unpaired high surrogate

	100 --source; // return to the illegal value itself

	101 result = sourceIllegal;

	102 break;

	103 }

	104 } else { // We don't have the 16 bits following the high surrogate.

	105 --source; // return to the high surrogate

	106 result = sourceExhausted;

	107 break;

	108 }

	109 } else if (strict) {

	110 // UTF-16 surrogate values are illegal in UTF-32

	111 if (ch >= 0xDC00 && ch <= 0xDFFF) {

	112 --source; // return to the illegal value itself

	113 result = sourceIllegal;

	114 break;

	115 }

	116 }

	117 // Figure out how many bytes the result will require

	118 if (ch < (UChar32)0x80) {

	119 bytesToWrite = 1;

	120 } else if (ch < (UChar32)0x800) {

	121 bytesToWrite = 2;

	122 } else if (ch < (UChar32)0x10000) {

	123 bytesToWrite = 3;

	124 } else if (ch < (UChar32)0x110000) {

	125 bytesToWrite = 4;

	126 } else {

	127 bytesToWrite = 3;

	128 ch = replacementCharacter;

	129 }

	130

	131 target += bytesToWrite;

	132 if (target > targetEnd) {

	133 source = oldSource; // Back up source pointer!

	134 target -= bytesToWrite;

	135 result = targetExhausted;

	136 break;

	137 }

	138 switch (bytesToWrite) { // note: everything falls through.

	139 case 4:

	140 *--target = (char)((ch \| byteMark) & byteMask);

	141 ch >>= 6;

	142 case 3:

	143 *--target = (char)((ch \| byteMark) & byteMask);

	144 ch >>= 6;

	145 case 2:

	146 *--target = (char)((ch \| byteMark) & byteMask);

	147 ch >>= 6;

	148 case 1:

	149 *--target = (char)(ch \| firstByteMark[bytesToWrite]);

	150 }

	151 target += bytesToWrite;

	152 }

	153 *sourceStart = source;

	154 *targetStart = target;

	155 return result;

	156 }

	157

	158 /**

	159 * Is this code point a BMP code point (U+0000..U+ffff)?

	160 * @param c 32-bit code point

	161 * @return TRUE or FALSE

	162 * @stable ICU 2.8

	163 */

	164 #define U_IS_BMP(c) ((uint32_t)(c) <= 0xffff)

	165

	166 /**

	167 * Is this code point a supplementary code point (U+10000..U+10ffff)?

	168 * @param c 32-bit code point

	169 * @return TRUE or FALSE

	170 * @stable ICU 2.8

	171 */

	172 #define U_IS_SUPPLEMENTARY(c) ((uint32_t)((c)-0x10000) <= 0xfffff)

	173

	174 /**

	175 * Is this code point a surrogate (U+d800..U+dfff)?

	176 * @param c 32-bit code point

	177 * @return TRUE or FALSE

	178 * @stable ICU 2.4

	179 */

	180 #define U_IS_SURROGATE(c) (((c)&0xfffff800) == 0xd800)

	181

	182 /**

	183 * Get the lead surrogate (0xd800..0xdbff) for a

	184 * supplementary code point (0x10000..0x10ffff).

	185 * @param supplementary 32-bit code point (U+10000..U+10ffff)

	186 * @return lead surrogate (U+d800..U+dbff) for supplementary

	187 * @stable ICU 2.4

	188 */

	189 #define U16_LEAD(supplementary) (UChar)(((supplementary) >> 10) + 0xd7c0)

	190

	191 /**

	192 * Get the trail surrogate (0xdc00..0xdfff) for a

	193 * supplementary code point (0x10000..0x10ffff).

	194 * @param supplementary 32-bit code point (U+10000..U+10ffff)

	195 * @return trail surrogate (U+dc00..U+dfff) for supplementary

	196 * @stable ICU 2.4

	197 */

	198 #define U16_TRAIL(supplementary) (UChar)(((supplementary)&0x3ff) \| 0xdc00)

	199

	200 // This must be called with the length pre-determined by the first byte.

	201 // If presented with a length > 4, this returns false. The Unicode

	202 // definition of UTF-8 goes up to 4-byte sequences.

	203 static bool isLegalUTF8(const unsigned char* source, int length) {

	204 unsigned char a;

	205 const unsigned char* srcptr = source + length;

	206 switch (length) {

	207 default:

	208 return false;

	209 // Everything else falls through when "true"...

	210 case 4:

	211 if ((a = (*--srcptr)) < 0x80 \|\| a > 0xBF) return false;

	212 case 3:

	213 if ((a = (*--srcptr)) < 0x80 \|\| a > 0xBF) return false;

	214 case 2:

	215 if ((a = (*--srcptr)) > 0xBF) return false;

	216

	217 // no fall-through in this inner switch

	218 switch (*source) {

	219 case 0xE0:

	220 if (a < 0xA0) return false;

	221 break;

	222 case 0xED:

	223 if (a > 0x9F) return false;

	224 break;

	225 case 0xF0:

	226 if (a < 0x90) return false;

	227 break;

	228 case 0xF4:

	229 if (a > 0x8F) return false;

	230 break;

	231 default:

	232 if (a < 0x80) return false;

	233 }

	234

	235 case 1:

	236 if (source >= 0x80 && source < 0xC2) return false;

	237 }

	238 if (*source > 0xF4) return false;

	239 return true;

	240 }

	241

	242 // Magic values subtracted from a buffer value during UTF8 conversion.

	243 // This table contains as many values as there might be trailing bytes

	244 // in a UTF-8 sequence.

	245 static const UChar32 offsetsFromUTF8[6] = {0x00000000UL,

	246 0x00003080UL,

	247 0x000E2080UL,

	248 0x03C82080UL,

	249 static_cast<UChar32>(0xFA082080UL),

	250 static_cast<UChar32>(0x82082080UL)};

	251

	252 static inline UChar32 readUTF8Sequence(const char*& sequence, unsigned length) {

	253 UChar32 character = 0;

	254

	255 // The cases all fall through.

	256 switch (length) {

	257 case 6:

	258 character += static_cast<unsigned char>(*sequence++);

	259 character <<= 6;

	260 case 5:

	261 character += static_cast<unsigned char>(*sequence++);

	262 character <<= 6;

	263 case 4:

	264 character += static_cast<unsigned char>(*sequence++);

	265 character <<= 6;

	266 case 3:

	267 character += static_cast<unsigned char>(*sequence++);

	268 character <<= 6;

	269 case 2:

	270 character += static_cast<unsigned char>(*sequence++);

	271 character <<= 6;

	272 case 1:

	273 character += static_cast<unsigned char>(*sequence++);

	274 }

	275

	276 return character - offsetsFromUTF8[length - 1];

	277 }

	278

	279 ConversionResult convertUTF8ToUTF16(const char** sourceStart,

	280 const char* sourceEnd, UChar** targetStart,

	281 UChar* targetEnd, bool* sourceAllASCII,

	282 bool strict) {

	283 ConversionResult result = conversionOK;

	284 const char* source = *sourceStart;

	285 UChar* target = *targetStart;

	286 UChar orAllData = 0;

	287 while (source < sourceEnd) {

	288 int utf8SequenceLength = inlineUTF8SequenceLength(*source);

	289 if (sourceEnd - source < utf8SequenceLength) {

	290 result = sourceExhausted;

	291 break;

	292 }

	293 // Do this check whether lenient or strict

	294 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source),

	295 utf8SequenceLength)) {

	296 result = sourceIllegal;

	297 break;

	298 }

	299

	300 UChar32 character = readUTF8Sequence(source, utf8SequenceLength);

	301

	302 if (target >= targetEnd) {

	303 source -= utf8SequenceLength; // Back up source pointer!

	304 result = targetExhausted;

	305 break;

	306 }

	307

	308 if (U_IS_BMP(character)) {

	309 // UTF-16 surrogate values are illegal in UTF-32

	310 if (U_IS_SURROGATE(character)) {

	311 if (strict) {

	312 source -= utf8SequenceLength; // return to the illegal value itself

	313 result = sourceIllegal;

	314 break;

	315 }

	316 *target++ = replacementCharacter;

	317 orAllData \|= replacementCharacter;

	318 } else {

	319 *target++ = static_cast<UChar>(character); // normal case

	320 orAllData \|= character;

	321 }

	322 } else if (U_IS_SUPPLEMENTARY(character)) {

	323 // target is a character in range 0xFFFF - 0x10FFFF

	324 if (target + 1 >= targetEnd) {

	325 source -= utf8SequenceLength; // Back up source pointer!

	326 result = targetExhausted;

	327 break;

	328 }

	329 *target++ = U16_LEAD(character);

	330 *target++ = U16_TRAIL(character);

	331 orAllData = 0xffff;

	332 } else {

	333 if (strict) {

	334 source -= utf8SequenceLength; // return to the start

	335 result = sourceIllegal;

	336 break; // Bail out; shouldn't continue

	337 } else {

	338 *target++ = replacementCharacter;

	339 orAllData \|= replacementCharacter;

	340 }

	341 }

	342 }

	343 *sourceStart = source;

	344 *targetStart = target;

	345

	346 if (sourceAllASCII) *sourceAllASCII = !(orAllData & ~0x7f);

	347

	348 return result;

	349 }

	350

	351 // Helper to write a three-byte UTF-8 code point to the buffer, caller must

	352 // check room is available.

	353 static inline void putUTF8Triple(char*& buffer, UChar ch) {

	354 *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) \| 0xE0);

	355 *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) \| 0x80);

	356 *buffer++ = static_cast<char>((ch & 0x3F) \| 0x80);

	357 }

	358

	359 } // namespace

	360

	361 // static

	362 String16 String16::fromInteger(int number) {

	363 const size_t kBufferSize = 50;

	364 char buffer[kBufferSize];

	365 std::snprintf(buffer, kBufferSize, "%d", number);

	366 return String16(buffer);

	367 }

	368

	369 // static

	370 String16 String16::fromDouble(double number) {

	371 const size_t kBufferSize = 100;

	372 char buffer[kBufferSize];

	373 std::snprintf(buffer, kBufferSize, "%f", number);

	374 return String16(buffer);

	375 }

	376

	377 // static

	378 String16 String16::fromDoublePrecision3(double number) {

	379 const size_t kBufferSize = 100;

	380 char buffer[kBufferSize];

	381 std::snprintf(buffer, kBufferSize, "%.3g", number);

	382 return String16(buffer);

	383 }

	384

	385 // static

	386 String16 String16::fromDoublePrecision6(double number) {

	387 const size_t kBufferSize = 100;

	388 char buffer[kBufferSize];

	389 std::snprintf(buffer, kBufferSize, "%.6g", number);

	390 return String16(buffer);

	391 }

	392

	393 int String16::toInteger(bool* ok) const {

	394 return charactersToInteger(characters16(), length(), ok);

	395 }

	396

	397 String16 String16::stripWhiteSpace() const {

	398 if (!length()) return String16();

	399

	400 unsigned start = 0;

	401 unsigned end = length() - 1;

	402

	403 // skip white space from start

	404 while (start <= end && isSpaceOrNewLine(characters16()[start])) ++start;

	405

	406 // only white space

	407 if (start > end) return String16();

	408

	409 // skip white space from end

	410 while (end && isSpaceOrNewLine(characters16()[end])) --end;

	411

	412 if (!start && end == length() - 1) return *this;

	413 return String16(characters16() + start, end + 1 - start);

	414 }

	415

	416 String16Builder::String16Builder() {}

	417

	418 void String16Builder::append(const String16& s) {

	419 m_buffer.insert(m_buffer.end(), s.characters16(),

	420 s.characters16() + s.length());

	421 }

	422

	423 void String16Builder::append(UChar c) { m_buffer.push_back(c); }

	424

	425 void String16Builder::append(char c) {

	426 UChar u = c;

	427 m_buffer.push_back(u);

	428 }

	429

	430 void String16Builder::append(const UChar* characters, size_t length) {

	431 m_buffer.insert(m_buffer.end(), characters, characters + length);

	432 }

	433

	434 void String16Builder::append(const char* characters, size_t length) {

	435 m_buffer.insert(m_buffer.end(), characters, characters + length);

	436 }

	437

	438 String16 String16Builder::toString() {

	439 return String16(m_buffer.data(), m_buffer.size());

	440 }

	441

	442 void String16Builder::reserveCapacity(size_t capacity) {

	443 m_buffer.reserve(capacity);

	444 }

	445

	446 String16 String16::fromUTF8(const char* stringStart, size_t length) {

	447 if (!stringStart \|\| !length) return String16();

	448

	449 std::vector<UChar> buffer(length);

	450 UChar* bufferStart = buffer.data();

	451

	452 UChar* bufferCurrent = bufferStart;

	453 const char* stringCurrent = stringStart;

	454 if (convertUTF8ToUTF16(&stringCurrent, stringStart + length, &bufferCurrent,

	455 bufferCurrent + buffer.size(), 0,

	456 true) != conversionOK)

	457 return String16();

	458

	459 unsigned utf16Length = bufferCurrent - bufferStart;

	460 return String16(bufferStart, utf16Length);

	461 }

	462

	463 std::string String16::utf8() const {

	464 unsigned length = this->length();

	465

	466 if (!length) return std::string("");

	467

	468 // Allocate a buffer big enough to hold all the characters

	469 // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes).

	470 // Optimization ideas, if we find this function is hot:

	471 // * We could speculatively create a CStringBuffer to contain 'length'

	472 // characters, and resize if necessary (i.e. if the buffer contains

	473 // non-ascii characters). (Alternatively, scan the buffer first for

	474 // ascii characters, so we know this will be sufficient).

	475 // * We could allocate a CStringBuffer with an appropriate size to

	476 // have a good chance of being able to write the string into the

	477 // buffer without reallocing (say, 1.5 x length).

	478 if (length > std::numeric_limits<unsigned>::max() / 3) return std::string();

	479 std::vector<char> bufferVector(length * 3);

	480 char* buffer = bufferVector.data();

	481 const UChar* characters = m_impl.data();

	482

	483 ConversionResult result =

	484 convertUTF16ToUTF8(&characters, characters + length, &buffer,

	485 buffer + bufferVector.size(), false);

	486 DCHECK(

	487 result !=

	488 targetExhausted); // (length * 3) should be sufficient for any conversion

	489

	490 // Only produced from strict conversion.

	491 DCHECK(result != sourceIllegal);

	492

	493 // Check for an unconverted high surrogate.

	494 if (result == sourceExhausted) {

	495 // This should be one unpaired high surrogate. Treat it the same

	496 // was as an unpaired high surrogate would have been handled in

	497 // the middle of a string with non-strict conversion - which is

	498 // to say, simply encode it to UTF-8.

	499 DCHECK((characters + 1) == (m_impl.data() + length));

	500 DCHECK((characters >= 0xD800) && (characters <= 0xDBFF));

	501 // There should be room left, since one UChar hasn't been

	502 // converted.

	503 DCHECK((buffer + 3) <= (buffer + bufferVector.size()));

	504 putUTF8Triple(buffer, *characters);

	505 }

	506

	507 return std::string(bufferVector.data(), buffer - bufferVector.data());

	508 }

	509

	510 } // namespace v8_inspector

OLD	NEW

« no previous file with comments | « src/inspector/String16.h ('k') | src/inspector/StringUtil.h » ('j') | no next file with comments »