src/unicode-inl.h - Issue 121173009: String:WriteUtf8: Add REPLACE_INVALID_UTF8 option

Side by Side Diff: src/unicode-inl.h

Issue 121173009: String:WriteUtf8: Add REPLACE_INVALID_UTF8 option (Closed) Base URL: git://github.com/v8/v8.git@master

Patch Set: DISALLOW_INVALID_UTF8 flag and fixes Created 6 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2007-2010 the V8 project authors. All rights reserved.	1 // Copyright 2007-2010 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without	2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are	3 // modification, are permitted provided that the following conditions are

4 // met:	4 // met:

5 //	5 //

6 // * Redistributions of source code must retain the above copyright	6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.	7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above	8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following	9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided	10 // disclaimer in the documentation and/or other materials provided

(...skipping 89 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
100 static const int kMask = ~(1 << 6);	100 static const int kMask = ~(1 << 6);

101 if (c <= kMaxOneByteChar) {	101 if (c <= kMaxOneByteChar) {

102 str[0] = c;	102 str[0] = c;

103 return 1;	103 return 1;

104 }	104 }

105 str[0] = 0xC0 \| (c >> 6);	105 str[0] = 0xC0 \| (c >> 6);

106 str[1] = 0x80 \| (c & kMask);	106 str[1] = 0x80 \| (c & kMask);

107 return 2;	107 return 2;

108 }	108 }

109	109

	110 // Encode encodes the unicode code point c into the given str buffer. Unless

	111 // allow_invalid is set to true, surrogate code points will be replaced with

	112 // kReplacementCharacter. The caller is required to combine surrogate pairs

	113 // into code points before calling Encode.

	114 unsigned Utf8::Encode(char* str, uchar c, bool allow_invalid) {

	115 if (!allow_invalid &&
	dcarney 2014/01/07 10:12:16 Move this block down into the kMaxThreeByteChar clause; it's in the way of the common case.
	116 (Utf16::IsLeadSurrogate(c) \|\|

	117 Utf16::IsTrailSurrogate(c))) {

	118 c = kReplacementCharacter;

	119 }

110	120

111 unsigned Utf8::Encode(char* str, uchar c, int previous) {

112 static const int kMask = ~(1 << 6);	121 static const int kMask = ~(1 << 6);

113 if (c <= kMaxOneByteChar) {	122 if (c <= kMaxOneByteChar) {

114 str[0] = c;	123 str[0] = c;

115 return 1;	124 return 1;

116 } else if (c <= kMaxTwoByteChar) {	125 } else if (c <= kMaxTwoByteChar) {

117 str[0] = 0xC0 \| (c >> 6);	126 str[0] = 0xC0 \| (c >> 6);

118 str[1] = 0x80 \| (c & kMask);	127 str[1] = 0x80 \| (c & kMask);

119 return 2;	128 return 2;

120 } else if (c <= kMaxThreeByteChar) {	129 } else if (c <= kMaxThreeByteChar) {

121 if (Utf16::IsTrailSurrogate(c) &&

122 Utf16::IsLeadSurrogate(previous)) {

123 const int kUnmatchedSize = kSizeOfUnmatchedSurrogate;

124 return Encode(str - kUnmatchedSize,

125 Utf16::CombineSurrogatePair(previous, c),

126 Utf16::kNoPreviousCharacter) - kUnmatchedSize;

127 }

128 str[0] = 0xE0 \| (c >> 12);	130 str[0] = 0xE0 \| (c >> 12);

129 str[1] = 0x80 \| ((c >> 6) & kMask);	131 str[1] = 0x80 \| ((c >> 6) & kMask);

130 str[2] = 0x80 \| (c & kMask);	132 str[2] = 0x80 \| (c & kMask);

131 return 3;	133 return 3;

132 } else {	134 } else {

133 str[0] = 0xF0 \| (c >> 18);	135 str[0] = 0xF0 \| (c >> 18);

134 str[1] = 0x80 \| ((c >> 12) & kMask);	136 str[1] = 0x80 \| ((c >> 12) & kMask);

135 str[2] = 0x80 \| ((c >> 6) & kMask);	137 str[2] = 0x80 \| ((c >> 6) & kMask);

136 str[3] = 0x80 \| (c & kMask);	138 str[3] = 0x80 \| (c & kMask);

137 return 4;	139 return 4;

138 }	140 }

139 }	141 }

140	142

141	143

142 uchar Utf8::ValueOf(const byte* bytes, unsigned length, unsigned* cursor) {	144 uchar Utf8::ValueOf(const byte* bytes, unsigned length, unsigned* cursor) {

143 if (length <= 0) return kBadChar;	145 if (length <= 0) return kBadChar;

144 byte first = bytes[0];	146 byte first = bytes[0];

145 // Characters between 0000 and 0007F are encoded as a single character	147 // Characters between 0000 and 0007F are encoded as a single character

146 if (first <= kMaxOneByteChar) {	148 if (first <= kMaxOneByteChar) {

147 *cursor += 1;	149 *cursor += 1;

148 return first;	150 return first;

149 }	151 }

150 return CalculateValue(bytes, length, cursor);	152 return CalculateValue(bytes, length, cursor);

151 }	153 }

152	154

	155 // @TODO give this the same semantics as Encode?
	dcarney 2014/01/07 11:05:50 I don't see an easy way to do this. You'd then have to run over the byte stream with a second pass to adjust for matching surrogates, but the current implementation here is already doing that correctly and efficiently, both for the case where the replacement character is inserted and for the case where it isn't. I think you'll ultimately need to keep the rewriting above in Utf8::Encode as it is instead of moving it to WritePair in api.cc. Instead you can just write kReplacementCharacter if the flag is set. This makes the Length and Encode functions easier to understand, since they complement one another and are in the same place. Also, you won't have to change any other call sites of Encode, which is of great benefit.
153 unsigned Utf8::Length(uchar c, int previous) {	156 unsigned Utf8::Length(uchar c, int previous) {

154 if (c <= kMaxOneByteChar) {	157 if (c <= kMaxOneByteChar) {

155 return 1;	158 return 1;

156 } else if (c <= kMaxTwoByteChar) {	159 } else if (c <= kMaxTwoByteChar) {

157 return 2;	160 return 2;

158 } else if (c <= kMaxThreeByteChar) {	161 } else if (c <= kMaxThreeByteChar) {

159 if (Utf16::IsTrailSurrogate(c) &&	162 if (Utf16::IsTrailSurrogate(c) &&

160 Utf16::IsLeadSurrogate(previous)) {	163 Utf16::IsLeadSurrogate(previous)) {

161 return kSizeOfUnmatchedSurrogate - kBytesSavedByCombiningSurrogates;	164 return kSizeOfUnmatchedSurrogate - kBytesSavedByCombiningSurrogates;

162 }	165 }

(...skipping 46 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
209 // Copy the rest the slow way.	212 // Copy the rest the slow way.

210 WriteUtf16Slow(unbuffered_start_,	213 WriteUtf16Slow(unbuffered_start_,

211 data + buffer_length,	214 data + buffer_length,

212 length - buffer_length);	215 length - buffer_length);

213 return length;	216 return length;

214 }	217 }

215	218

216 } // namespace unibrow	219 } // namespace unibrow

217	220

218 #endif // V8_UNICODE_INL_H_	221 #endif // V8_UNICODE_INL_H_

OLD	NEW

« src/api.cc ('K') | « src/unicode.h ('k') | test/cctest/test-api.cc » ('j') | no next file with comments »