src/unicode.cc - Issue 2522193002: Merged: Squashed multiple commits.

Side by Side Diff: src/unicode.cc

Issue 2522193002: Merged: Squashed multiple commits. (Closed)

Patch Set: Created 4 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2012 the V8 project authors. All rights reserved.	1 // Copyright 2012 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4 //	4 //

5 // This file was generated at 2014-10-08 15:25:47.940335	5 // This file was generated at 2014-10-08 15:25:47.940335

6	6

7 #include "src/unicode.h"	7 #include "src/unicode.h"

8 #include "src/unicode-inl.h"	8 #include "src/unicode-inl.h"

9 #include <stdio.h>	9 #include <stdio.h>

10 #include <stdlib.h>	10 #include <stdlib.h>

(...skipping 210 matching lines...) Expand 10 before | Expand all | Expand 10 after
221	221

222	222

223 static inline bool IsContinuationCharacter(byte chr) {	223 static inline bool IsContinuationCharacter(byte chr) {

224 return chr >= 0x80 && chr <= 0xBF;	224 return chr >= 0x80 && chr <= 0xBF;

225 }	225 }

226	226

227	227

228 // This method decodes a UTF-8 value according to RFC 3629.	228 // This method decodes a UTF-8 value according to RFC 3629.

229 uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) {	229 uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) {

230 size_t length = NonASCIISequenceLength(str[0]);	230 size_t length = NonASCIISequenceLength(str[0]);

231 if (length == 0 || max_length < length) {	231

232 *cursor += 1;	232 // Check continuation characters.

233 return kBadChar;	233 size_t max_count = std::min(length, max_length);

	234 size_t count = 1;

	235 while (count < max_count && IsContinuationCharacter(str[count])) {

	236 count++;

234 }	237 }

235 if (length == 2) {	238 *cursor += count;

236 if (!IsContinuationCharacter(str[1])) {	239

237 *cursor += 1;	240 // There must be enough continuation characters.

	241 if (count != length) return kBadChar;

	242

	243 // Check overly long sequences & other conditions.

	244 if (length == 3) {

	245 if (str[0] == 0xE0 && (str[1] < 0xA0 || str[1] > 0xBF)) {

	246 // Overlong three-byte sequence?

	247 return kBadChar;

	248 } else if (str[0] == 0xED && (str[1] < 0x80 || str[1] > 0x9F)) {

	249 // High and low surrogate halves?

238 return kBadChar;	250 return kBadChar;

239 }	251 }

240 *cursor += 2;	252 } else if (length == 4) {

241 return ((str[0] << 6) + str[1]) - 0x00003080;	253 if (str[0] == 0xF0 && (str[1] < 0x90 || str[1] > 0xBF)) {

242 }	254 // Overlong four-byte sequence.

243 if (length == 3) {	255 return kBadChar;

244 switch (str[0]) {	256 } else if (str[0] == 0xF4 && (str[1] < 0x80 || str[1] > 0x8F)) {

245 case 0xE0:	257 // Code points outside of the unicode range.

246 // Overlong three-byte sequence.

247 if (str[1] < 0xA0 || str[1] > 0xBF) {

248 *cursor += 1;

249 return kBadChar;

250 }

251 break;

252 case 0xED:

253 // High and low surrogate halves.

254 if (str[1] < 0x80 || str[1] > 0x9F) {

255 *cursor += 1;

256 return kBadChar;

257 }

258 break;

259 default:

260 if (!IsContinuationCharacter(str[1])) {

261 *cursor += 1;

262 return kBadChar;

263 }

264 }

265 if (!IsContinuationCharacter(str[2])) {

266 *cursor += 1;

267 return kBadChar;	258 return kBadChar;

268 }	259 }

269 *cursor += 3;

270 return ((str[0] << 12) + (str[1] << 6) + str[2]) - 0x000E2080;

271 }	260 }

272 DCHECK(length == 4);	261

273 switch (str[0]) {	262 // All errors have been handled, so we only have to assemble the result.

274 case 0xF0:	263 switch (length) {

275 // Overlong four-byte sequence.	264 case 1:

276 if (str[1] < 0x90 || str[1] > 0xBF) {	265 return str[0];

277 *cursor += 1;	266 case 2:

278 return kBadChar;	267 return ((str[0] << 6) + str[1]) - 0x00003080;

279 }	268 case 3:

280 break;	269 return ((str[0] << 12) + (str[1] << 6) + str[2]) - 0x000E2080;

281 case 0xF4:	270 case 4:

282 // Code points outside of the unicode range.	271 return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) -

283 if (str[1] < 0x80 || str[1] > 0x8F) {	272 0x03C82080;

284 *cursor += 1;

285 return kBadChar;

286 }

287 break;

288 default:

289 if (!IsContinuationCharacter(str[1])) {

290 *cursor += 1;

291 return kBadChar;

292 }

293 }	273 }

294 if (!IsContinuationCharacter(str[2])) {	274

295 *cursor += 1;	275 UNREACHABLE();

296 return kBadChar;	276 return kBadChar;

297 }

298 if (!IsContinuationCharacter(str[3])) {

299 *cursor += 1;

300 return kBadChar;

301 }

302 *cursor += 4;

303 return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) -

304 0x03C82080;

305 }	277 }

306	278

307 uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) {	279 uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) {

308 DCHECK_NOT_NULL(buffer);	280 DCHECK_NOT_NULL(buffer);

309	281

310 // The common case: 1-byte Utf8 (and no incomplete char in the buffer)	282 // The common case: 1-byte Utf8 (and no incomplete char in the buffer)

311 if (V8_LIKELY(next <= kMaxOneByteChar && *buffer == 0)) {	283 if (V8_LIKELY(next <= kMaxOneByteChar && *buffer == 0)) {

312 return static_cast<uchar>(next);	284 return static_cast<uchar>(next);

313 }	285 }

314	286

315 if (*buffer == 0) {	287 if (*buffer == 0) {

316 // We're at the start of a new character.	288 // We're at the start of a new character.

317 uint32_t kind = NonASCIISequenceLength(next);	289 uint32_t kind = NonASCIISequenceLength(next);

318 if (kind >= 2 && kind <= 4) {	290 if (kind >= 2 && kind <= 4) {

319 // Start of 2..4 byte character, and no buffer.	291 // Start of 2..4 byte character, and no buffer.

320	292

321 // The mask for the lower bits depends on the kind, and is	293 // The mask for the lower bits depends on the kind, and is

322 // 0x1F, 0x0F, 0x07 for kinds 2, 3, 4 respectively. We can get that	294 // 0x1F, 0x0F, 0x07 for kinds 2, 3, 4 respectively. We can get that

323 // with one shift.	295 // with one shift.

324 uint8_t mask = 0x7f >> kind;	296 uint8_t mask = 0x7f >> kind;

325	297

326 // Store the kind - 1 (i.e., remaining bytes) in the top byte, value	298 // Store the kind in the top nibble, and kind - 1 (i.e., remaining bytes)

327 // in the bottom three.	299 // in 2nd nibble, and the value in the bottom three bytes. The 2nd nibble is

328 *buffer = (kind - 1) << 24 | (next & mask);	300 // intended as a counter about how many bytes are still needed.

	301 *buffer = kind << 28 | (kind - 1) << 24 | (next & mask);

329 return kIncomplete;	302 return kIncomplete;

330 } else {	303 } else {

331 // No buffer, and not the start of a 1-byte char (handled at the	304 // No buffer, and not the start of a 1-byte char (handled at the

332 // beginning), and not the start of a 2..4 byte char? Bad char.	305 // beginning), and not the start of a 2..4 byte char? Bad char.

333 *buffer = 0;	306 *buffer = 0;

334 return kBadChar;	307 return kBadChar;

335 }	308 }

336 } else if (*buffer <= 0xff) {	309 } else if (*buffer <= 0xff) {

337 // We have one unprocessed byte left (from the last else case in this if	310 // We have one unprocessed byte left (from the last else case in this if

338 // statement).	311 // statement).

339 uchar previous = *buffer;	312 uchar previous = *buffer;

340 *buffer = 0;	313 *buffer = 0;

341 uchar t = ValueOfIncremental(previous, buffer);	314 uchar t = ValueOfIncremental(previous, buffer);

342 if (t == kIncomplete) {	315 if (t == kIncomplete) {

343 // If we have an incomplete character, process both the previous and the	316 // If we have an incomplete character, process both the previous and the

344 // next byte at once.	317 // next byte at once.

345 return ValueOfIncremental(next, buffer);	318 return ValueOfIncremental(next, buffer);

346 } else {	319 } else {

347 // Otherwise, process the previous byte and save the next byte for next	320 // Otherwise, process the previous byte and save the next byte for next

348 // time.	321 // time.

349 DCHECK_EQ(0u, *buffer);	322 DCHECK_EQ(0u, *buffer);

350 *buffer = next;	323 *buffer = next;

351 return t;	324 return t;

352 }	325 }

353 } else if (IsContinuationCharacter(next)) {	326 } else if (IsContinuationCharacter(next)) {

354 // We're inside of a character, as described by buffer.	327 // We're inside of a character, as described by buffer.

355	328

356 // How many bytes (excluding this one) do we still expect?	329 // How many bytes (excluding this one) do we still expect?

357 uint8_t count = (*buffer >> 24) - 1;	330 uint8_t bytes_expected = *buffer >> 28;

	331 uint8_t bytes_left = (*buffer >> 24) & 0x0f;

	332 bytes_left--;

358 // Update the value.	333 // Update the value.

359 uint32_t value = ((*buffer & 0xffffff) << 6) | (next & 0x3F);	334 uint32_t value = ((*buffer & 0xffffff) << 6) | (next & 0x3F);

360 if (count) {	335 if (bytes_left) {

361 *buffer = count << 24 | value;	336 *buffer = (bytes_expected << 28 | bytes_left << 24 | value);

362 return kIncomplete;	337 return kIncomplete;

363 } else {	338 } else {

364 *buffer = 0;	339 *buffer = 0;

365 return value;	340 bool sequence_was_too_long = (bytes_expected == 2 && value < 0x80) ||

	341 (bytes_expected == 3 && value < 0x800);

	342 return sequence_was_too_long ? kBadChar : value;

366 }	343 }

367 } else {	344 } else {

368 // Within a character, but not a continuation character? Then the	345 // Within a character, but not a continuation character? Then the

369 // previous char was a bad char. But we need to save the current	346 // previous char was a bad char. But we need to save the current

370 // one.	347 // one.

371 *buffer = next;	348 *buffer = next;

372 return kBadChar;	349 return kBadChar;

373 }	350 }

374 }	351 }

375	352

(...skipping 3150 matching lines...) Expand 10 before | Expand all | Expand 10 after
3526 sizeof(MultiCharacterSpecialCase<1>) // NOLINT	3503 sizeof(MultiCharacterSpecialCase<1>) // NOLINT

3527 +	3504 +

3528 kCanonicalizationRangeMultiStrings1Size *	3505 kCanonicalizationRangeMultiStrings1Size *

3529 sizeof(MultiCharacterSpecialCase<1>) // NOLINT	3506 sizeof(MultiCharacterSpecialCase<1>) // NOLINT

3530 +	3507 +

3531 kCanonicalizationRangeMultiStrings7Size *	3508 kCanonicalizationRangeMultiStrings7Size *

3532 sizeof(MultiCharacterSpecialCase<1>); // NOLINT	3509 sizeof(MultiCharacterSpecialCase<1>); // NOLINT

3533 }	3510 }

3534	3511

3535 } // namespace unibrow	3512 } // namespace unibrow

OLD	NEW

« no previous file with comments | « no previous file | src/unicode-decoder.h » ('j') | no next file with comments »