src/unicode.cc - Issue 1148653007: Update UTF-8 decoder to detect more special cases.

Side by Side Diff: src/unicode.cc

Issue 1148653007: Update UTF-8 decoder to detect more special cases. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master

Patch Set: updates Created 5 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2012 the V8 project authors. All rights reserved.	1 // Copyright 2012 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4 //	4 //

5 // This file was generated at 2014-10-08 15:25:47.940335	5 // This file was generated at 2014-10-08 15:25:47.940335

6	6

7 #include "src/unicode-inl.h"	7 #include "src/unicode-inl.h"

8 #include <stdio.h>	8 #include <stdio.h>

9 #include <stdlib.h>	9 #include <stdlib.h>

10	10

(...skipping 172 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
183 return 0;	183 return 0;

184 }	184 }

185 return -1;	185 return -1;

186 }	186 }

187 } else {	187 } else {

188 return 0;	188 return 0;

189 }	189 }

190 }	190 }

191	191

192	192

193 uchar Utf8::CalculateValue(const byte* str, size_t length, size_t* cursor) {	193 static inline size_t NonASCIISequenceLength(byte first) {

194 // We only get called for non-ASCII characters.	194 // clang-format off

195 if (length == 1) {	195 static const uint8_t lengths[256] = {

	196 // The first 128 entries correspond to ASCII characters.

	197 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	198 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	199 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	200 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	201 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	202 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	203 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	204 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	205 // The following 64 entries correspond to continuation bytes.

	206 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	207 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	208 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	209 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	210 // The next are two invalid overlong encodings and 30 two-byte sequences.

	211 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

	212 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

	213 // 16 three-byte sequences.

	214 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

	215 // 5 four-byte sequences, followed by sequences that could only encode

	216 // code points outside of the unicode range.

	217 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

	218 // clang-format on

	219 return lengths[first];

	220 }

	221

	222

	223 static inline bool IsContinuationCharacter(byte chr) {

	224 return chr >= 0x80 && chr <= 0xBF;

	225 }

	226

	227

	228 // This method decodes an UTF-8 value according to RFC 3629.

	229 uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) {
	vogelheim 2015/05/22 15:53:38: Not sure if worth the effort, but this might be more compact if written like this: uchar c = kBadChar; size_t length = NonASCIISequenceLength(str[0]); the if/switch-es would then only handle the success cases and update c; *cursor += (c == kBadChar) ? 1 : length; return c; I guess it's a matter of taste. You pick.
	jochen (gone - plz use gerrit)* 2015/05/22 18:12:39 (in reply to vogelheim's comment above): then I pick this version :)
	230 size_t length = NonASCIISequenceLength(str[0]);

	231 if (length == 0 \|\| max_length < length) {

196 *cursor += 1;	232 *cursor += 1;

197 return kBadChar;	233 return kBadChar;

198 }	234 }

199 byte first = str[0];	235 if (length == 2) {

200 byte second = str[1] ^ 0x80;	236 if (!IsContinuationCharacter(str[1])) {

201 if (second & 0xC0) {

202 *cursor += 1;

203 return kBadChar;

204 }

205 if (first < 0xE0) {

206 if (first < 0xC0) {

207 *cursor += 1;

208 return kBadChar;

209 }

210 uchar code_point = ((first << 6) \| second) & kMaxTwoByteChar;

211 if (code_point <= kMaxOneByteChar) {

212 *cursor += 1;	237 *cursor += 1;

213 return kBadChar;	238 return kBadChar;

214 }	239 }

215 *cursor += 2;	240 *cursor += 2;

216 return code_point;	241 return ((str[0] << 6) + str[1]) - 0x00003080;

217 }	242 }

218 if (length == 2) {	243 if (length == 3) {

219 *cursor += 1;	244 switch (str[0]) {

220 return kBadChar;	245 case 0xE0:

221 }	246 // Overlong three-byte sequence.

222 byte third = str[2] ^ 0x80;	247 if (str[1] < 0xA0 \|\| str[1] > 0xBF) {

223 if (third & 0xC0) {	248 *cursor += 1;

224 *cursor += 1;	249 return kBadChar;

225 return kBadChar;	250 }

226 }	251 break;

227 if (first < 0xF0) {	252 case 0xED:

228 uchar code_point = ((((first << 6) \| second) << 6) \| third)	253 // High and low surrogate halves.

229 & kMaxThreeByteChar;	254 if (str[1] < 0x80 \|\| str[1] > 0x9F) {

230 if (code_point <= kMaxTwoByteChar) {	255 *cursor += 1;

	256 return kBadChar;

	257 }

	258 break;

	259 default:

	260 if (!IsContinuationCharacter(str[1])) {

	261 *cursor += 1;

	262 return kBadChar;

	263 }

	264 }

	265 if (!IsContinuationCharacter(str[2])) {

231 *cursor += 1;	266 *cursor += 1;

232 return kBadChar;	267 return kBadChar;

233 }	268 }

234 *cursor += 3;	269 *cursor += 3;

235 return code_point;	270 return ((str[0] << 12) + (str[1] << 6) + str[2]) - 0x000E2080;

236 }	271 }

237 if (length == 3) {	272 DCHECK(length == 4);

	273 switch (str[0]) {

	274 case 0xF0:

	275 // Overlong four-byte sequence.

	276 if (str[1] < 0x90 \|\| str[1] > 0xBF) {

	277 *cursor += 1;

	278 return kBadChar;

	279 }

	280 break;

	281 case 0xF4:

	282 // Code poits outside of the unicode range.
	vogelheim 2015/05/22 15:53:38: poits -> points
	283 if (str[1] < 0x80 \|\| str[1] > 0x8F) {

	284 *cursor += 1;

	285 return kBadChar;

	286 }

	287 break;

	288 default:

	289 if (!IsContinuationCharacter(str[1])) {

	290 *cursor += 1;

	291 return kBadChar;

	292 }

	293 }

	294 if (!IsContinuationCharacter(str[2])) {

238 *cursor += 1;	295 *cursor += 1;

239 return kBadChar;	296 return kBadChar;

240 }	297 }

241 byte fourth = str[3] ^ 0x80;	298 if (!IsContinuationCharacter(str[3])) {

242 if (fourth & 0xC0) {

243 *cursor += 1;	299 *cursor += 1;

244 return kBadChar;	300 return kBadChar;

245 }	301 }

246 if (first < 0xF8) {	302 *cursor += 4;

247 uchar code_point = (((((first << 6 \| second) << 6) \| third) << 6) \| fourth)	303 return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) -

248 & kMaxFourByteChar;	304 0x03C82080;

249 if (code_point <= kMaxThreeByteChar) {

250 *cursor += 1;

251 return kBadChar;

252 }

253 *cursor += 4;

254 return code_point;

255 }

256 *cursor += 1;

257 return kBadChar;

258 }	305 }

259	306

260	307

261 // Uppercase: point.category == 'Lu'	308 // Uppercase: point.category == 'Lu'

262	309

263 static const uint16_t kUppercaseTable0Size = 455;	310 static const uint16_t kUppercaseTable0Size = 455;

264 static const int32_t kUppercaseTable0[455] = {	311 static const int32_t kUppercaseTable0[455] = {

265 1073741889, 90, 1073742016, 214,	312 1073741889, 90, 1073742016, 214,

266 1073742040, 222, 256, 258, // NOLINT	313 1073742040, 222, 256, 258, // NOLINT

267 260, 262, 264, 266,	314 260, 262, 264, 266,

(...skipping 3117 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
3385 sizeof(MultiCharacterSpecialCase<1>) // NOLINT	3432 sizeof(MultiCharacterSpecialCase<1>) // NOLINT

3386 +	3433 +

3387 kCanonicalizationRangeMultiStrings1Size *	3434 kCanonicalizationRangeMultiStrings1Size *

3388 sizeof(MultiCharacterSpecialCase<1>) // NOLINT	3435 sizeof(MultiCharacterSpecialCase<1>) // NOLINT

3389 +	3436 +

3390 kCanonicalizationRangeMultiStrings7Size *	3437 kCanonicalizationRangeMultiStrings7Size *

3391 sizeof(MultiCharacterSpecialCase<1>); // NOLINT	3438 sizeof(MultiCharacterSpecialCase<1>); // NOLINT

3392 }	3439 }

3393	3440

3394 } // namespace unibrow	3441 } // namespace unibrow

OLD	NEW

« no previous file with comments | « no previous file | test/cctest/test-api.cc » ('j') | test/cctest/test-api.cc » ('J')