src/lexer/lexer-shell.cc - Issue 192643003: Experimental parser: fix UTF8TO16 handling.

Unified Diff: src/lexer/lexer-shell.cc

Issue 192643003: Experimental parser: fix UTF8TO16 handling. (Closed) Base URL: https://v8.googlecode.com/svn/branches/experimental/parser

Patch Set: Created 6 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/lexer/lexer-shell.cc

diff --git a/src/lexer/lexer-shell.cc b/src/lexer/lexer-shell.cc

index 09822542d5bfb3a8e4ed2bfc01a079d2acdfb39e..e7a700ea89c0cca5894e9abf23ebd12083812bc6 100644

--- a/src/lexer/lexer-shell.cc

+++ b/src/lexer/lexer-shell.cc

@@ -47,49 +47,6 @@

using namespace v8::internal;

-static byte* ReadFile(const char* name, const byte** end, int repeat,

- bool convert_to_utf16) {

- FILE* file = fopen(name, "rb");

- if (file == NULL) return NULL;

- fseek(file, 0, SEEK_END);

- int file_size = ftell(file);

- rewind(file);

- int size = file_size * repeat;

- byte* chars = new byte[size];

- for (int i = 0; i < file_size;) {

- int read = static_cast<int>(fread(&chars[i], 1, file_size - i, file));

- i += read;

- }

- fclose(file);

- for (int i = file_size; i < size; i++) {

- chars[i] = chars[i - file_size];

- }

- *end = &chars[size];

- if (!convert_to_utf16) return chars;

- // Length of new_chars is not strictly accurate, but should be enough.

- uint16_t* new_chars = new uint16_t[size];

- {

- Utf8ToUtf16CharacterStream stream(chars, size);

- uint16_t* cursor = new_chars;

- // uc32 c;

- // The 32-bit char type is probably only so that we can have -1 as a return

- // value. If the char is not -1, it should fit into 16 bits.

- CHECK(false);

- // while ((c = stream.Advance()) != -1) {

- // *cursor++ = c;

- // }

- *end = reinterpret_cast<byte*>(cursor);

- }

- delete[] chars;

- return reinterpret_cast<byte*>(new_chars);

enum Encoding {

LATIN1,

@@ -120,6 +77,90 @@ struct LexerShellSettings {

};

+static uint16_t* ReadFile(const char* name, const uint8_t** end,

+ const LexerShellSettings& settings) {

+ FILE* file = fopen(name, "rb");

+ CHECK(file != NULL);

+ fseek(file, 0, SEEK_END);

+ unsigned file_size = ftell(file);

+ rewind(file);

+ uint16_t* two_byte_data = new uint16_t[file_size / 2 + file_size % 2];

+ uint8_t* char_data = reinterpret_cast<uint8_t*>(two_byte_data);

+ for (unsigned i = 0; i < file_size;) {

+ i += fread(&char_data[i], 1, file_size - i, file);

+ }

+ fclose(file);

+ if (settings.encoding == UTF8TO16) {

+ const uint32_t kMaxUtf16Character = 0xffff;

+ // Get utf8 length.

+ unsigned utf16_chars = 0;

+ {

+ unsigned position = 0;

+ while (position < file_size) {

+ uint32_t c = char_data[position];

+ if (c <= unibrow::Utf8::kMaxOneByteChar) {

+ position++;

+ } else {

+ c = unibrow::Utf8::CalculateValue(char_data + position,

+ file_size - position,

+ &position);

+ }

+ if (c > kMaxUtf16Character) {

+ utf16_chars += 2;

+ } else {

+ utf16_chars += 1;

+ }

+ // Write new buffer out.

+ uint16_t* data = new uint16_t[utf16_chars];

+ unsigned position = 0;

+ unsigned i = 0;

+ while (position < file_size) {

+ uint32_t c = char_data[position];

+ if (c <= unibrow::Utf8::kMaxOneByteChar) {

+ position++;

+ } else {

+ c = unibrow::Utf8::CalculateValue(char_data + position,

+ file_size - position,

+ &position);

+ }

+ if (c > kMaxUtf16Character) {

+ data[i++] = unibrow::Utf16::LeadSurrogate(c);

+ data[i++] = unibrow::Utf16::TrailSurrogate(c);

+ } else {

+ data[i++] = static_cast<uc16>(c);

+ }

+ // Swap buffers.

+ delete two_byte_data;

+ file_size = utf16_chars * 2;

+ two_byte_data = data;

+ char_data = reinterpret_cast<uint8_t*>(two_byte_data);

+ }

+ // Duplicate buffer if necessary.

+ if (settings.repeat > 1) {

+ unsigned size = file_size * settings.repeat;

+ uint16_t* data = new uint16_t[size / 2 + size % 2];

+ char_data = reinterpret_cast<uint8_t*>(two_byte_data);

+ for (int i = 0; i < settings.repeat; i++) {

+ memcpy(&char_data[i * file_size], two_byte_data, file_size);

+ }

+ delete two_byte_data;

+ file_size = size;

+ two_byte_data = data;

+ }

+ *end = &char_data[file_size];

+ return two_byte_data;

struct TokenWithLocation {

Token::Value value;

size_t beg;

@@ -193,29 +234,30 @@ static TokenWithLocation GetTokenWithLocation(

}

-static TimeDelta RunLexer(const byte* source,

- const byte* source_end,

+static TimeDelta RunLexer(const uint16_t* source,

+ const uint8_t* source_end,

Isolate* isolate,

std::vector<TokenWithLocation>* tokens,

const LexerShellSettings& settings) {

SmartPointer<Utf16CharacterStream> stream;

+ const uint8_t* one_byte_source = reinterpret_cast<const uint8_t*>(source);

+ int bytes = source_end - one_byte_source;

switch (settings.encoding) {

case UTF8:

- case UTF8TO16:

- stream.Reset(new Utf8ToUtf16CharacterStream(source, source_end - source));

+ stream.Reset(new Utf8ToUtf16CharacterStream(one_byte_source, bytes));

break;

+ case UTF8TO16:

case UTF16: {

+ CHECK_EQ(0, bytes % 2);

Handle<String> result = isolate->factory()->NewStringFromTwoByte(

- Vector<const uint16_t>(

- reinterpret_cast<const uint16_t*>(source),

- (source_end - source) / 2));

+ Vector<const uint16_t>(source, bytes / 2));

stream.Reset(

new GenericStringUtf16CharacterStream(result, 0, result->length()));

break;

}

case LATIN1: {

Handle<String> result = isolate->factory()->NewStringFromOneByte(

- Vector<const uint8_t>(source, source_end - source));

+ Vector<const uint8_t>(one_byte_source, bytes));

stream.Reset(

new GenericStringUtf16CharacterStream(result, 0, result->length()));

break;

@@ -258,9 +300,9 @@ static TimeDelta ProcessFile(

std::vector<TokenWithLocation> tokens;

TimeDelta time;

{

- const byte* buffer_end = 0;

- const byte* buffer = ReadFile(fname, &buffer_end, settings.repeat, false);

- if (truncate_by > buffer_end - buffer) {

+ const uint8_t* buffer_end = 0;

+ const uint16_t* buffer = ReadFile(fname, &buffer_end, settings);

+ if (truncate_by > buffer_end - reinterpret_cast<const uint8_t*>(buffer)) {

*can_truncate = false;

} else {

buffer_end -= truncate_by;

« no previous file with comments | « no previous file | tools/lexer_generator/test/run_lexing_tests.py » ('j') | no next file with comments »