Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(905)

Unified Diff: src/lexer/lexer-shell.cc

Issue 192643003: Experimental parser: fix UTF8TO16 handling. (Closed) Base URL: https://v8.googlecode.com/svn/branches/experimental/parser
Patch Set: Created 6 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « no previous file | tools/lexer_generator/test/run_lexing_tests.py » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: src/lexer/lexer-shell.cc
diff --git a/src/lexer/lexer-shell.cc b/src/lexer/lexer-shell.cc
index 09822542d5bfb3a8e4ed2bfc01a079d2acdfb39e..e7a700ea89c0cca5894e9abf23ebd12083812bc6 100644
--- a/src/lexer/lexer-shell.cc
+++ b/src/lexer/lexer-shell.cc
@@ -47,49 +47,6 @@
using namespace v8::internal;
-static byte* ReadFile(const char* name, const byte** end, int repeat,
- bool convert_to_utf16) {
- FILE* file = fopen(name, "rb");
- if (file == NULL) return NULL;
-
- fseek(file, 0, SEEK_END);
- int file_size = ftell(file);
- rewind(file);
-
- int size = file_size * repeat;
-
- byte* chars = new byte[size];
- for (int i = 0; i < file_size;) {
- int read = static_cast<int>(fread(&chars[i], 1, file_size - i, file));
- i += read;
- }
- fclose(file);
-
- for (int i = file_size; i < size; i++) {
- chars[i] = chars[i - file_size];
- }
- *end = &chars[size];
-
- if (!convert_to_utf16) return chars;
-
- // Length of new_chars is not strictly accurate, but should be enough.
- uint16_t* new_chars = new uint16_t[size];
- {
- Utf8ToUtf16CharacterStream stream(chars, size);
- uint16_t* cursor = new_chars;
- // uc32 c;
- // The 32-bit char type is probably only so that we can have -1 as a return
- // value. If the char is not -1, it should fit into 16 bits.
- CHECK(false);
- // while ((c = stream.Advance()) != -1) {
- // *cursor++ = c;
- // }
- *end = reinterpret_cast<byte*>(cursor);
- }
- delete[] chars;
- return reinterpret_cast<byte*>(new_chars);
-}
-
enum Encoding {
LATIN1,
@@ -120,6 +77,90 @@ struct LexerShellSettings {
};
+static uint16_t* ReadFile(const char* name, const uint8_t** end,
+ const LexerShellSettings& settings) {
+ FILE* file = fopen(name, "rb");
+ CHECK(file != NULL);
+
+ fseek(file, 0, SEEK_END);
+ unsigned file_size = ftell(file);
+ rewind(file);
+
+ uint16_t* two_byte_data = new uint16_t[file_size / 2 + file_size % 2];
+
+ uint8_t* char_data = reinterpret_cast<uint8_t*>(two_byte_data);
+ for (unsigned i = 0; i < file_size;) {
+ i += fread(&char_data[i], 1, file_size - i, file);
+ }
+ fclose(file);
+
+ if (settings.encoding == UTF8TO16) {
+ const uint32_t kMaxUtf16Character = 0xffff;
+ // Get utf8 length.
+ unsigned utf16_chars = 0;
+ {
+ unsigned position = 0;
+ while (position < file_size) {
+ uint32_t c = char_data[position];
+ if (c <= unibrow::Utf8::kMaxOneByteChar) {
+ position++;
+ } else {
+ c = unibrow::Utf8::CalculateValue(char_data + position,
+ file_size - position,
+ &position);
+ }
+ if (c > kMaxUtf16Character) {
+ utf16_chars += 2;
+ } else {
+ utf16_chars += 1;
+ }
+ }
+ }
+ // Write new buffer out.
+ uint16_t* data = new uint16_t[utf16_chars];
+ unsigned position = 0;
+ unsigned i = 0;
+ while (position < file_size) {
+ uint32_t c = char_data[position];
+ if (c <= unibrow::Utf8::kMaxOneByteChar) {
+ position++;
+ } else {
+ c = unibrow::Utf8::CalculateValue(char_data + position,
+ file_size - position,
+ &position);
+ }
+ if (c > kMaxUtf16Character) {
+ data[i++] = unibrow::Utf16::LeadSurrogate(c);
+ data[i++] = unibrow::Utf16::TrailSurrogate(c);
+ } else {
+ data[i++] = static_cast<uc16>(c);
+ }
+ }
+ // Swap buffers.
+ delete two_byte_data;
+ file_size = utf16_chars * 2;
+ two_byte_data = data;
+ char_data = reinterpret_cast<uint8_t*>(two_byte_data);
+ }
+
+ // Duplicate buffer if necessary.
+ if (settings.repeat > 1) {
+ unsigned size = file_size * settings.repeat;
+ uint16_t* data = new uint16_t[size / 2 + size % 2];
+ char_data = reinterpret_cast<uint8_t*>(two_byte_data);
+ for (int i = 0; i < settings.repeat; i++) {
+ memcpy(&char_data[i * file_size], two_byte_data, file_size);
+ }
+ delete two_byte_data;
+ file_size = size;
+ two_byte_data = data;
+ }
+
+ *end = &char_data[file_size];
+ return two_byte_data;
+}
+
+
struct TokenWithLocation {
Token::Value value;
size_t beg;
@@ -193,29 +234,30 @@ static TokenWithLocation GetTokenWithLocation(
}
-static TimeDelta RunLexer(const byte* source,
- const byte* source_end,
+static TimeDelta RunLexer(const uint16_t* source,
+ const uint8_t* source_end,
Isolate* isolate,
std::vector<TokenWithLocation>* tokens,
const LexerShellSettings& settings) {
SmartPointer<Utf16CharacterStream> stream;
+ const uint8_t* one_byte_source = reinterpret_cast<const uint8_t*>(source);
+ int bytes = source_end - one_byte_source;
switch (settings.encoding) {
case UTF8:
- case UTF8TO16:
- stream.Reset(new Utf8ToUtf16CharacterStream(source, source_end - source));
+ stream.Reset(new Utf8ToUtf16CharacterStream(one_byte_source, bytes));
break;
+ case UTF8TO16:
case UTF16: {
+ CHECK_EQ(0, bytes % 2);
Handle<String> result = isolate->factory()->NewStringFromTwoByte(
- Vector<const uint16_t>(
- reinterpret_cast<const uint16_t*>(source),
- (source_end - source) / 2));
+ Vector<const uint16_t>(source, bytes / 2));
stream.Reset(
new GenericStringUtf16CharacterStream(result, 0, result->length()));
break;
}
case LATIN1: {
Handle<String> result = isolate->factory()->NewStringFromOneByte(
- Vector<const uint8_t>(source, source_end - source));
+ Vector<const uint8_t>(one_byte_source, bytes));
stream.Reset(
new GenericStringUtf16CharacterStream(result, 0, result->length()));
break;
@@ -258,9 +300,9 @@ static TimeDelta ProcessFile(
std::vector<TokenWithLocation> tokens;
TimeDelta time;
{
- const byte* buffer_end = 0;
- const byte* buffer = ReadFile(fname, &buffer_end, settings.repeat, false);
- if (truncate_by > buffer_end - buffer) {
+ const uint8_t* buffer_end = 0;
+ const uint16_t* buffer = ReadFile(fname, &buffer_end, settings);
+ if (truncate_by > buffer_end - reinterpret_cast<const uint8_t*>(buffer)) {
*can_truncate = false;
} else {
buffer_end -= truncate_by;
« no previous file with comments | « no previous file | tools/lexer_generator/test/run_lexing_tests.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698