src/unicode.cc - Issue 2493143003: Return kBadChar for longest subpart of incomplete utf-8 character.

Side by Side Diff: src/unicode.cc

Issue 2493143003: Return kBadChar for longest subpart of incomplete utf-8 character. (Closed)

Patch Set: Fix end of buffer handling. Created 4 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2012 the V8 project authors. All rights reserved.	1 // Copyright 2012 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4 //	4 //

5 // This file was generated at 2014-10-08 15:25:47.940335	5 // This file was generated at 2014-10-08 15:25:47.940335

6	6

7 #include "src/unicode.h"	7 #include "src/unicode.h"

8 #include "src/unicode-inl.h"	8 #include "src/unicode-inl.h"

9 #include <stdio.h>	9 #include <stdio.h>

10 #include <stdlib.h>	10 #include <stdlib.h>

(...skipping 210 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
221	221

222	222

223 static inline bool IsContinuationCharacter(byte chr) {	223 static inline bool IsContinuationCharacter(byte chr) {

224 return chr >= 0x80 && chr <= 0xBF;	224 return chr >= 0x80 && chr <= 0xBF;

225 }	225 }

226	226

227	227

228 // This method decodes an UTF-8 value according to RFC 3629.	228 // This method decodes an UTF-8 value according to RFC 3629.

229 uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) {	229 uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) {

230 size_t length = NonASCIISequenceLength(str[0]);	230 size_t length = NonASCIISequenceLength(str[0]);

231 if (length == 0 || max_length < length) {	231

232 *cursor += 1;	232 // Check continuation characters.

	233 size_t max_count = std::min(length, max_length);

	234 size_t count = 1;

	235 while (count < max_count && IsContinuationCharacter(str[count])) {

	236 count++;

	237 }

	238

	239 // Check overly long sequences & other conditions. Use length as error

	240 // indicator.

	241 if (length == 3) {

	242 if (str[0] == 0xE0 && (str[1] < 0xA0 || str[1] > 0xBF)) {

	243 // Overlong three-byte sequence?

	244 length = 0;

	245 } else if (str[0] == 0xED && (str[1] < 0x80 || str[1] > 0x9F)) {

	246 // High and low surrogate halves?

	247 length = 0;

	248 }

	249 } else if (length == 4) {

	250 if (str[0] == 0xF0 && (str[1] < 0x90 || str[1] > 0xBF)) {

	251 // Overlong four-byte sequence.

	252 length = 0;

	253 } else if (str[0] == 0xF4 && (str[1] < 0x80 || str[1] > 0x8F)) {

	254 // Code points outside of the unicode range.

	255 length = 0;

	256 }

	257 }

	258

	259 if (count != length) {

	260 // All invalid encodings should land here.

	261 *cursor += count;

233 return kBadChar;	262 return kBadChar;

234 }	263 }

235 if (length == 2) {	264

236 if (!IsContinuationCharacter(str[1])) {	265 // All errors have been handled, so we only have to assemble the result.

237 *cursor += 1;	266 *cursor += length;

238 return kBadChar;	267 switch (length) {

239 }	268 case 1:

240 *cursor += 2;	269 return str[0];

241 return ((str[0] << 6) + str[1]) - 0x00003080;	270 case 2:

	271 return ((str[0] << 6) + str[1]) - 0x00003080;

	272 case 3:

	273 return ((str[0] << 12) + (str[1] << 6) + str[2]) - 0x000E2080;

	274 case 4:

	275 return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) -

	276 0x03C82080;

242 }	277 }

243 if (length == 3) {	278

244 switch (str[0]) {	279 UNREACHABLE();

245 case 0xE0:	280 return kBadChar;

246 // Overlong three-byte sequence.

247 if (str[1] < 0xA0 || str[1] > 0xBF) {

248 *cursor += 1;

249 return kBadChar;

250 }

251 break;

252 case 0xED:

253 // High and low surrogate halves.

254 if (str[1] < 0x80 || str[1] > 0x9F) {

255 *cursor += 1;

256 return kBadChar;

257 }

258 break;

259 default:

260 if (!IsContinuationCharacter(str[1])) {

261 *cursor += 1;

262 return kBadChar;

263 }

264 }

265 if (!IsContinuationCharacter(str[2])) {

266 *cursor += 1;

267 return kBadChar;

268 }

269 *cursor += 3;

270 return ((str[0] << 12) + (str[1] << 6) + str[2]) - 0x000E2080;

271 }

272 DCHECK(length == 4);

273 switch (str[0]) {

274 case 0xF0:

275 // Overlong four-byte sequence.

276 if (str[1] < 0x90 || str[1] > 0xBF) {

277 *cursor += 1;

278 return kBadChar;

279 }

280 break;

281 case 0xF4:

282 // Code points outside of the unicode range.

283 if (str[1] < 0x80 || str[1] > 0x8F) {

284 *cursor += 1;

285 return kBadChar;

286 }

287 break;

288 default:

289 if (!IsContinuationCharacter(str[1])) {

290 *cursor += 1;

291 return kBadChar;

292 }

293 }

294 if (!IsContinuationCharacter(str[2])) {

295 *cursor += 1;

296 return kBadChar;

297 }

298 if (!IsContinuationCharacter(str[3])) {

299 *cursor += 1;

300 return kBadChar;

301 }

302 *cursor += 4;

303 return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) -

304 0x03C82080;

305 }	281 }

306	282

307 uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) {	283 uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) {

308 DCHECK_NOT_NULL(buffer);	284 DCHECK_NOT_NULL(buffer);

309	285

310 // The common case: 1-byte Utf8 (and no incomplete char in the buffer)	286 // The common case: 1-byte Utf8 (and no incomplete char in the buffer)

311 if (V8_LIKELY(next <= kMaxOneByteChar && *buffer == 0)) {	287 if (V8_LIKELY(next <= kMaxOneByteChar && *buffer == 0)) {

312 return static_cast<uchar>(next);	288 return static_cast<uchar>(next);

313 }	289 }

314	290

315 if (*buffer == 0) {	291 if (*buffer == 0) {

316 // We're at the start of a new character.	292 // We're at the start of a new character.

317 uint32_t kind = NonASCIISequenceLength(next);	293 uint32_t kind = NonASCIISequenceLength(next);

318 if (kind >= 2 && kind <= 4) {	294 if (kind >= 2 && kind <= 4) {

319 // Start of 2..4 byte character, and no buffer.	295 // Start of 2..4 byte character, and no buffer.

320	296

321 // The mask for the lower bits depends on the kind, and is	297 // The mask for the lower bits depends on the kind, and is

322 // 0x1F, 0x0F, 0x07 for kinds 2, 3, 4 respectively. We can get that	298 // 0x1F, 0x0F, 0x07 for kinds 2, 3, 4 respectively. We can get that

323 // with one shift.	299 // with one shift.

324 uint8_t mask = 0x7f >> kind;	300 uint8_t mask = 0x7f >> kind;

325	301

326 // Store the kind - 1 (i.e., remaining bytes) in the top byte, value	302 // Store the kind in the top nibble, and kind - 1 (i.e., remaining bytes)

327 // in the bottom three.	303 // in 2nd nibble, and the value in the bottom three. The 2nd nibble is

328 *buffer = (kind - 1) << 24 | (next & mask);	304 // intended as a counter about how many bytes are still needed.

	305 *buffer = kind << 28 | (kind - 1) << 24 | (next & mask);

329 return kIncomplete;	306 return kIncomplete;

330 } else {	307 } else {

331 // No buffer, and not the start of a 1-byte char (handled at the	308 // No buffer, and not the start of a 1-byte char (handled at the

332 // beginning), and not the start of a 2..4 byte char? Bad char.	309 // beginning), and not the start of a 2..4 byte char? Bad char.

333 *buffer = 0;	310 *buffer = 0;

334 return kBadChar;	311 return kBadChar;

335 }	312 }

336 } else if (*buffer <= 0xff) {	313 } else if (*buffer <= 0xff) {

337 // We have one unprocessed byte left (from the last else case in this if	314 // We have one unprocessed byte left (from the last else case in this if

338 // statement).	315 // statement).

339 uchar previous = *buffer;	316 uchar previous = *buffer;

340 *buffer = 0;	317 *buffer = 0;

341 uchar t = ValueOfIncremental(previous, buffer);	318 uchar t = ValueOfIncremental(previous, buffer);

342 if (t == kIncomplete) {	319 if (t == kIncomplete) {

343 // If we have an incomplete character, process both the previous and the	320 // If we have an incomplete character, process both the previous and the

344 // next byte at once.	321 // next byte at once.

345 return ValueOfIncremental(next, buffer);	322 return ValueOfIncremental(next, buffer);

346 } else {	323 } else {

347 // Otherwise, process the previous byte and save the next byte for next	324 // Otherwise, process the previous byte and save the next byte for next

348 // time.	325 // time.

349 DCHECK_EQ(0, *buffer);	326 DCHECK_EQ(0, *buffer);

350 *buffer = next;	327 *buffer = next;

351 return t;	328 return t;

352 }	329 }

353 } else if (IsContinuationCharacter(next)) {	330 } else if (IsContinuationCharacter(next)) {

354 // We're inside of a character, as described by buffer.	331 // We're inside of a character, as described by buffer.

355	332

356 // How many bytes (excluding this one) do we still expect?	333 // How many bytes (excluding this one) do we still expect?

357 uint8_t count = (*buffer >> 24) - 1;	334 uint8_t bytes_expected = *buffer >> 28;

	335 uint8_t bytes_left = (*buffer >> 24) & 0x0f;

	336 bytes_left--;

358 // Update the value.	337 // Update the value.

359 uint32_t value = ((*buffer & 0xffffff) << 6) | (next & 0x3F);	338 uint32_t value = ((*buffer & 0xffffff) << 6) | (next & 0x3F);

360 if (count) {	339 if (bytes_left) {

361 *buffer = count << 24 | value;	340 *buffer = (bytes_expected << 28 | bytes_left << 24 | value);

362 return kIncomplete;	341 return kIncomplete;

363 } else {	342 } else {

364 *buffer = 0;	343 *buffer = 0;

365 return value;	344 bool sequence_was_too_long = (bytes_expected == 2 && value < 0x80) ||

	345 (bytes_expected == 3 && value < 0x800);

	346 return sequence_was_too_long ? kBadChar : value;

366 }	347 }

367 } else {	348 } else {

368 // Within a character, but not a continuation character? Then the	349 // Within a character, but not a continuation character? Then the

369 // previous char was a bad char. But we need to save the current	350 // previous char was a bad char. But we need to save the current

370 // one.	351 // one.

371 *buffer = next;	352 *buffer = next;

372 return kBadChar;	353 return kBadChar;

373 }	354 }

374 }	355 }

375	356

(...skipping 3150 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
3526 sizeof(MultiCharacterSpecialCase<1>) // NOLINT	3507 sizeof(MultiCharacterSpecialCase<1>) // NOLINT

3527 +	3508 +

3528 kCanonicalizationRangeMultiStrings1Size *	3509 kCanonicalizationRangeMultiStrings1Size *

3529 sizeof(MultiCharacterSpecialCase<1>) // NOLINT	3510 sizeof(MultiCharacterSpecialCase<1>) // NOLINT

3530 +	3511 +

3531 kCanonicalizationRangeMultiStrings7Size *	3512 kCanonicalizationRangeMultiStrings7Size *

3532 sizeof(MultiCharacterSpecialCase<1>); // NOLINT	3513 sizeof(MultiCharacterSpecialCase<1>); // NOLINT

3533 }	3514 }

3534	3515

3535 } // namespace unibrow	3516 } // namespace unibrow

OLD	NEW

« no previous file with comments | « no previous file | test/cctest/test-parsing.cc » ('j') | no next file with comments »