src/unicode.cc - Issue 1148653007: Update UTF-8 decoder to detect more special cases.

Side by Side Diff: src/unicode.cc

Issue 1148653007: Update UTF-8 decoder to detect more special cases. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master

Patch Set: Created 5 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2012 the V8 project authors. All rights reserved.	1 // Copyright 2012 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4 //	4 //

5 // This file was generated at 2014-10-08 15:25:47.940335	5 // This file was generated at 2014-10-08 15:25:47.940335

6	6

7 #include "src/unicode-inl.h"	7 #include "src/unicode-inl.h"

8 #include <stdio.h>	8 #include <stdio.h>

9 #include <stdlib.h>	9 #include <stdlib.h>

10	10

(...skipping 172 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
183 return 0;	183 return 0;

184 }	184 }

185 return -1;	185 return -1;

186 }	186 }

187 } else {	187 } else {

188 return 0;	188 return 0;

189 }	189 }

190 }	190 }

191	191

192	192

193 uchar Utf8::CalculateValue(const byte* str, size_t length, size_t* cursor) {	193 static inline size_t NonASCIISequenceLength(byte first) {

194 // We only get called for non-ASCII characters.	194 static const uint8_t lengths[256] = {

195 if (length == 1) {	195 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	196 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	197 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	198 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	199 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	200 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

	201 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

	202 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

	203 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

	204 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

	205 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
	vogelheim 2015/05/21 16:58:38 The table is difficult to read. It also leaves me rather confused. - I take it the purpose is to determine the length of a UTF-8 sequence from its first byte. I'm seeing: 128 x 0, 96 x 2, 16 x 3, 5 x 4, 11 x 0. Shouldn't this be: 192 x 0, 32 x 2, 16 x 3, 8 x 4, 8 x 0? - In particular: - I don't understand why you count the continuation characters (0b10xxxxxx) as '2'. - I don't understand why you count some of the 4-byte prefixes as 0/error (0b11110101 + 0b1111011x). - I'd probably find a 16x16 layout more readable. jochen (gone - plz use gerrit) 2015/05/22 12:38:12 On 2015/05/21 at 16:58:38, vogelheim wrote: > The table is difficult to read. It also leaves me rather confused. > > - I take it the purpose is to determine the length of a UTF-8 sequence from its first byte. > I'm seeing: 128 x 0, 96 x 2, 16 x 3, 5 x 4, 11 x 0. > Shouldn't this be: 192 x 0, 32 x 2, 16 x 3, 8 x 4, 8 x 0? I updated the matrix to be 16 x 16, and added comments. > > - In particular: > - I don't understand why you count the continuation characters (0b10xxxxxx) as '2'. Fixed. > - I don't understand why you count some of the 4-byte prefixes as > 0/error (0b11110101 + 0b1111011x). The code points those would encode are outside of the Unicode range. > > - I'd probably find a 16x16 layout more readable. Done.
	206 return lengths[first];

	207 }

	208

	209

	210 uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) {
	vogelheim 2015/05/21 16:58:38 This might also benefit from a unit test that will try some of the edge cases this is meant to fix. vogelheim 2015/05/21 16:58:38 I believe this deserves some commentary, if only to say that it's meant to be consistent with Blink's UTF decoder. jochen (gone - plz use gerrit) 2015/05/22 12:38:12 On 2015/05/21 at 16:58:38, vogelheim wrote: > I believe this deserves some commentary, if only to say that it's meant to be consistent with Blink's UTF decoder. Yeah, actually, it's supposed to be consistent with the definition of UTF-8, which the previous version was not :-/ There are tests, and they fail now. I'll update them.
	211 DCHECK((str[0] & 0x80) == 0x80);

	212 size_t length = NonASCIISequenceLength(str[0]);

	213 if (length == 0 \|\| max_length < length) {

196 *cursor += 1;	214 *cursor += 1;

197 return kBadChar;	215 return kBadChar;

198 }	216 }

199 byte first = str[0];	217 if (length == 2) {
	vogelheim 2015/05/21 16:58:38 I was trying to figure out why these characters are restricted. It seems the ECMAScript specs explicitly allow pretty much all of them. "All Unicode code point values from U+0000 to U+10FFFF, including surrogate code points, may occur in source text where permitted by the ECMAScript grammars." [From an ES6 draft. ES5 final has similar text.] vogelheim 2015/05/21 16:58:38 I find the code below to be somewhat confusing. If I get this correctly, the code below either does the default UTF-8 processing, or for some 'special' code point ranges it does additional checks. Maybe an alternative would be to fold the dispatching of all those 'special' checks into the length matrix above. I.e., for the code block starting with 0xE0, make lengths[0xE0] == 0xE0, etc. Then these blocks below would all look like this: switch (NonASCIISequenceLength(str[0])) { case 0: ... error; case 2..4: normal UTF-8 processing; case 0xE0: extra check, then UTF-8 processing; ... default: UNREACHABLE(); } jochen (gone - plz use gerrit) 2015/05/22 12:38:12 On 2015/05/21 at 16:58:38, vogelheim wrote: > I was trying to figure out why these characters are restricted. It seems the ECMAScript specs explicitly allow pretty much all of them. > > "All Unicode code point values from U+0000 to U+10FFFF, including surrogate code points, may occur in source text where permitted by the ECMAScript grammars." [From an ES6 draft. ES5 final has similar text.] Right. It's just that UTF-8 cannot encode all of Unicode, in particular not U+D800 through U+DFFF.
200 byte second = str[1] ^ 0x80;	218 DCHECK(str[0] <= 0xDF);

201 if (second & 0xC0) {	219 if (str[0] < 0xC2) {

202 *cursor += 1;

203 return kBadChar;

204 }

205 if (first < 0xE0) {

206 if (first < 0xC0) {

207 *cursor += 1;	220 *cursor += 1;

208 return kBadChar;	221 return kBadChar;

209 }	222 }

210 uchar code_point = ((first << 6) \| second) & kMaxTwoByteChar;	223 if (str[1] < 0x80 \|\| str[1] > 0xBF) {

211 if (code_point <= kMaxOneByteChar) {

212 *cursor += 1;	224 *cursor += 1;

213 return kBadChar;	225 return kBadChar;

214 }	226 }

215 *cursor += 2;	227 *cursor += 2;

216 return code_point;	228 return ((str[0] << 6) + str[1]) - 0x00003080;

217 }	229 }

218 if (length == 2) {	230 if (length == 3) {

219 *cursor += 1;	231 DCHECK(str[0] >= 0xE0 && str[0] <= 0xEF);

220 return kBadChar;	232 switch (str[0]) {

221 }	233 case 0xE0:

222 byte third = str[2] ^ 0x80;	234 if (str[1] < 0xA0 \|\| str[1] > 0xBF) {

223 if (third & 0xC0) {	235 *cursor += 1;

224 *cursor += 1;	236 return kBadChar;

225 return kBadChar;	237 }

226 }	238 break;

227 if (first < 0xF0) {	239 case 0xED:

228 uchar code_point = ((((first << 6) \| second) << 6) \| third)	240 if (str[1] < 0x80 \|\| str[1] > 0x9F) {

229 & kMaxThreeByteChar;	241 *cursor += 1;

230 if (code_point <= kMaxTwoByteChar) {	242 return kBadChar;

	243 }

	244 break;

	245 default:

	246 if (str[1] < 0x80 \|\| str[1] > 0xBF) {

	247 *cursor += 1;

	248 return kBadChar;

	249 }

	250 }

	251 if (str[2] < 0x80 \|\| str[2] > 0xBF) {

231 *cursor += 1;	252 *cursor += 1;

232 return kBadChar;	253 return kBadChar;

233 }	254 }

234 *cursor += 3;	255 *cursor += 3;

235 return code_point;	256 return ((str[0] << 12) + (str[1] << 6) + str[2]) - 0x000E2080;

236 }	257 }

237 if (length == 3) {	258 DCHECK(length == 4);

	259 DCHECK(str[0] >= 0xF0 && str[0] <= 0xF4);

	260 switch (str[0]) {

	261 case 0xF0:

	262 if (str[1] < 0x90 \|\| str[1] > 0xBF) {

	263 *cursor += 1;

	264 return kBadChar;

	265 }

	266 break;

	267 case 0xF4:

	268 if (str[1] < 0x80 \|\| str[1] > 0x8F) {

	269 *cursor += 1;

	270 return kBadChar;

	271 }

	272 break;

	273 default:

	274 if (str[1] < 0x80 \|\| str[1] > 0xBF) {

	275 *cursor += 1;

	276 return kBadChar;

	277 }

	278 }

	279 if (str[2] < 0x80 \|\| str[2] > 0xBF) {

238 *cursor += 1;	280 *cursor += 1;

239 return kBadChar;	281 return kBadChar;

240 }	282 }

241 byte fourth = str[3] ^ 0x80;	283 if (str[3] < 0x80 \|\| str[3] > 0xBF) {

242 if (fourth & 0xC0) {

243 *cursor += 1;	284 *cursor += 1;

244 return kBadChar;	285 return kBadChar;

245 }	286 }

246 if (first < 0xF8) {	287 *cursor += 4;

247 uchar code_point = (((((first << 6 \| second) << 6) \| third) << 6) \| fourth)	288 return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) -

248 & kMaxFourByteChar;	289 0x03C82080;

249 if (code_point <= kMaxThreeByteChar) {

250 *cursor += 1;

251 return kBadChar;

252 }

253 *cursor += 4;

254 return code_point;

255 }

256 *cursor += 1;

257 return kBadChar;

258 }	290 }

259	291

260	292

261 // Uppercase: point.category == 'Lu'	293 // Uppercase: point.category == 'Lu'

262	294

263 static const uint16_t kUppercaseTable0Size = 455;	295 static const uint16_t kUppercaseTable0Size = 455;

264 static const int32_t kUppercaseTable0[455] = {	296 static const int32_t kUppercaseTable0[455] = {

265 1073741889, 90, 1073742016, 214,	297 1073741889, 90, 1073742016, 214,

266 1073742040, 222, 256, 258, // NOLINT	298 1073742040, 222, 256, 258, // NOLINT

267 260, 262, 264, 266,	299 260, 262, 264, 266,

(...skipping 3117 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
3385 sizeof(MultiCharacterSpecialCase<1>) // NOLINT	3417 sizeof(MultiCharacterSpecialCase<1>) // NOLINT

3386 +	3418 +

3387 kCanonicalizationRangeMultiStrings1Size *	3419 kCanonicalizationRangeMultiStrings1Size *

3388 sizeof(MultiCharacterSpecialCase<1>) // NOLINT	3420 sizeof(MultiCharacterSpecialCase<1>) // NOLINT

3389 +	3421 +

3390 kCanonicalizationRangeMultiStrings7Size *	3422 kCanonicalizationRangeMultiStrings7Size *

3391 sizeof(MultiCharacterSpecialCase<1>); // NOLINT	3423 sizeof(MultiCharacterSpecialCase<1>); // NOLINT

3392 }	3424 }

3393	3425

3394 } // namespace unibrow	3426 } // namespace unibrow

OLD	NEW

« no previous file with comments | « no previous file | no next file » | no next file with comments »