test/cctest/test-parsing.cc - Issue 1148653007: Update UTF-8 decoder to detect more special cases.

Side by Side Diff: test/cctest/test-parsing.cc

Issue 1148653007: Update UTF-8 decoder to detect more special cases. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master

Patch Set: updates Created 5 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2012 the V8 project authors. All rights reserved.	1 // Copyright 2012 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without	2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are	3 // modification, are permitted provided that the following conditions are

4 // met:	4 // met:

5 //	5 //

6 // * Redistributions of source code must retain the above copyright	6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.	7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above	8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following	9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided	10 // disclaimer in the documentation and/or other materials provided

(...skipping 681 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
692 static const int kAllUtf8CharsSize =	692 static const int kAllUtf8CharsSize =

693 (unibrow::Utf8::kMaxOneByteChar + 1) +	693 (unibrow::Utf8::kMaxOneByteChar + 1) +

694 (unibrow::Utf8::kMaxTwoByteChar - unibrow::Utf8::kMaxOneByteChar) * 2 +	694 (unibrow::Utf8::kMaxTwoByteChar - unibrow::Utf8::kMaxOneByteChar) * 2 +

695 (unibrow::Utf8::kMaxThreeByteChar - unibrow::Utf8::kMaxTwoByteChar) * 3;	695 (unibrow::Utf8::kMaxThreeByteChar - unibrow::Utf8::kMaxTwoByteChar) * 3;

696 static const unsigned kAllUtf8CharsSizeU =	696 static const unsigned kAllUtf8CharsSizeU =

697 static_cast<unsigned>(kAllUtf8CharsSize);	697 static_cast<unsigned>(kAllUtf8CharsSize);

698	698

699 char buffer[kAllUtf8CharsSizeU];	699 char buffer[kAllUtf8CharsSizeU];

700 unsigned cursor = 0;	700 unsigned cursor = 0;

701 for (int i = 0; i <= kMaxUC16Char; i++) {	701 for (int i = 0; i <= kMaxUC16Char; i++) {

702 cursor += unibrow::Utf8::Encode(buffer + cursor,	702 cursor += unibrow::Utf8::Encode(buffer + cursor, i,

703 i,	703 unibrow::Utf16::kNoPreviousCharacter, true);

704 unibrow::Utf16::kNoPreviousCharacter);

705 }	704 }

706 DCHECK(cursor == kAllUtf8CharsSizeU);	705 DCHECK(cursor == kAllUtf8CharsSizeU);

707	706

708 i::Utf8ToUtf16CharacterStream stream(reinterpret_cast<const i::byte*>(buffer),	707 i::Utf8ToUtf16CharacterStream stream(reinterpret_cast<const i::byte*>(buffer),

709 kAllUtf8CharsSizeU);	708 kAllUtf8CharsSizeU);

	709 int32_t bad = unibrow::Utf8::kBadChar;

710 for (int i = 0; i <= kMaxUC16Char; i++) {	710 for (int i = 0; i <= kMaxUC16Char; i++) {

711 CHECK_EQU(i, stream.pos());	711 CHECK_EQU(i, stream.pos());

712 int32_t c = stream.Advance();	712 int32_t c = stream.Advance();

713 CHECK_EQ(i, c);	713 if (i >= 0xd800 && i <= 0xdfff) {

	714 CHECK_EQ(bad, c);

	715 } else {

	716 CHECK_EQ(i, c);

	717 }

714 CHECK_EQU(i + 1, stream.pos());	718 CHECK_EQU(i + 1, stream.pos());

715 }	719 }

716 for (int i = kMaxUC16Char; i >= 0; i--) {	720 for (int i = kMaxUC16Char; i >= 0; i--) {

717 CHECK_EQU(i + 1, stream.pos());	721 CHECK_EQU(i + 1, stream.pos());

718 stream.PushBack(i);	722 stream.PushBack(i);

719 CHECK_EQU(i, stream.pos());	723 CHECK_EQU(i, stream.pos());

720 }	724 }

721 int i = 0;	725 int i = 0;

722 while (stream.pos() < kMaxUC16CharU) {	726 while (stream.pos() < kMaxUC16CharU) {

723 CHECK_EQU(i, stream.pos());	727 CHECK_EQU(i, stream.pos());

724 int progress = static_cast<int>(stream.SeekForward(12));	728 int progress = static_cast<int>(stream.SeekForward(12));

725 i += progress;	729 i += progress;

726 int32_t c = stream.Advance();	730 int32_t c = stream.Advance();

727 if (i <= kMaxUC16Char) {	731 if (i >= 0xd800 && i <= 0xdfff) {

	732 CHECK_EQ(bad, c);

	733 } else if (i <= kMaxUC16Char) {

728 CHECK_EQ(i, c);	734 CHECK_EQ(i, c);

729 } else {	735 } else {

730 CHECK_EQ(-1, c);	736 CHECK_EQ(-1, c);

731 }	737 }

732 i += 1;	738 i += 1;

733 CHECK_EQU(i, stream.pos());	739 CHECK_EQU(i, stream.pos());

734 }	740 }

735 }	741 }

736	742

737 #undef CHECK_EQU	743 #undef CHECK_EQU

(...skipping 168 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
906 }	912 }

907 input_offset = 3;	913 input_offset = 3;

908 // 4 bytes of UTF-8 turn into 2 UTF-16 code units.	914 // 4 bytes of UTF-8 turn into 2 UTF-16 code units.

909 character_length -= 2;	915 character_length -= 2;

910 } else if (c >= 0xe0) {	916 } else if (c >= 0xe0) {

911 if ((c & 0xf) == 0 && ((s[i + 1] & 0x20) == 0)) {	917 if ((c & 0xf) == 0 && ((s[i + 1] & 0x20) == 0)) {

912 // This 3 byte sequence could have been coded as a 2 byte sequence.	918 // This 3 byte sequence could have been coded as a 2 byte sequence.

913 // Record a single kBadChar for the first byte and continue.	919 // Record a single kBadChar for the first byte and continue.

914 continue;	920 continue;

915 }	921 }

	922 if (c == 0xed) {

	923 unsigned char d = s[i + 1];

	924 if ((d < 0x80) || (d > 0x9f)) {

	925 // This 3 byte sequence is part of a surrogate pair which is not

	926 // supported by UTF-8. Record a single kBadChar for the first byte

	927 // and continue.

	928 continue;

	929 }

	930 }

916 input_offset = 2;	931 input_offset = 2;

917 // 3 bytes of UTF-8 turn into 1 UTF-16 code unit.	932 // 3 bytes of UTF-8 turn into 1 UTF-16 code unit.

918 output_adjust = 2;	933 output_adjust = 2;

919 } else {	934 } else {

920 if ((c & 0x1e) == 0) {	935 if ((c & 0x1e) == 0) {

921 // This 2 byte sequence could have been coded as a 1 byte sequence.	936 // This 2 byte sequence could have been coded as a 1 byte sequence.

922 // Record a single kBadChar for the first byte and continue.	937 // Record a single kBadChar for the first byte and continue.

923 continue;	938 continue;

924 }	939 }

925 input_offset = 1;	940 input_offset = 1;

(...skipping 5663 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
6589 "[a, ...]",	6604 "[a, ...]",

6590 "[..., ]",	6605 "[..., ]",

6591 "[..., ...]",	6606 "[..., ...]",

6592 "[ (...a)]",	6607 "[ (...a)]",

6593 NULL};	6608 NULL};

6594 // clang-format on	6609 // clang-format on

6595 static const ParserFlag always_flags[] = {kAllowHarmonySpreadArrays};	6610 static const ParserFlag always_flags[] = {kAllowHarmonySpreadArrays};

6596 RunParserSyncTest(context_data, data, kError, NULL, 0, always_flags,	6611 RunParserSyncTest(context_data, data, kError, NULL, 0, always_flags,

6597 arraysize(always_flags));	6612 arraysize(always_flags));

6598 }	6613 }

OLD	NEW

« no previous file with comments | « test/cctest/test-api.cc ('k') | no next file » | no next file with comments »