src/lexer/lexer-shell.cc - Issue 196943021: Experimental parser: add utf8tolatin1 conversion

Unified Diff: src/lexer/lexer-shell.cc

Issue 196943021: Experimental parser: add utf8tolatin1 conversion (Closed) Base URL: https://v8.googlecode.com/svn/branches/experimental/parser

Patch Set: Created 6 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/lexer/lexer-shell.cc

diff --git a/src/lexer/lexer-shell.cc b/src/lexer/lexer-shell.cc

index 1eb906a3d06fd237442c1895401685aa66abdc47..b180a6681804f1f1f71ac92a80f5299eda006ad5 100644

--- a/src/lexer/lexer-shell.cc

+++ b/src/lexer/lexer-shell.cc

@@ -53,7 +53,7 @@ enum Encoding {

UTF8,

UTF16,

UTF8TO16, // Convert stream via scanner input stream

- UTF8TO16_PRECONVERT // Convert stream during file read

+ UTF8TOLATIN1, // Convert during file read; Latin-1 if no multi-byte UTF-8 char, else UTF-16

};

@@ -81,12 +81,14 @@ struct LexerShellSettings {

static uint16_t* ConvertUtf8ToUtf16(const uint16_t* const data_in,

- unsigned* length) {

+ unsigned* length,

+ bool* is_one_byte) {

const unsigned file_size = *length;

const uint8_t* char_data = reinterpret_cast<const uint8_t*>(data_in);

const uint32_t kMaxUtf16Character = 0xffff;

// Get utf8 length.

unsigned utf16_chars = 0;

+ *is_one_byte = true;

{

unsigned position = 0;

while (position < file_size) {

@@ -94,6 +96,7 @@ static uint16_t* ConvertUtf8ToUtf16(const uint16_t* const data_in,

if (c <= unibrow::Utf8::kMaxOneByteChar) {

position++;

} else {

+ *is_one_byte = false;

c = unibrow::Utf8::CalculateValue(char_data + position,

file_size - position,

&position);

@@ -130,6 +133,17 @@ static uint16_t* ConvertUtf8ToUtf16(const uint16_t* const data_in,

}

+static uint16_t* ConvertUtf16ToLatin1(const uint16_t* const data_in,

+ unsigned* length) {

+ const unsigned size = *length / 2 + *length % 2;

+ uint16_t* data = new uint16_t[size];

+ uint8_t* char_data = reinterpret_cast<uint8_t*>(data);

+ CopyChars(char_data, data_in, size);

+ *length = size;

+ return data;

static uint16_t* Repeat(int repeat,

const uint16_t* const data_in,

unsigned* length) {

@@ -166,13 +180,29 @@ static uint16_t* ReadFile(const char* name, unsigned* length) {

static uint16_t* ReadFile(const char* name,

const LexerShellSettings& settings,

- unsigned* length) {

+ unsigned* length,

+ Encoding* output_encoding) {

uint16_t* data = ReadFile(name, length);

CHECK_GE(*length, 0);

if (*length == 0) return data;

- if (settings.encoding == UTF8TO16_PRECONVERT) {

- uint16_t* new_data = ConvertUtf8ToUtf16(data, length);

+ *output_encoding = settings.encoding;

+ if (settings.encoding == UTF8TO16 ||

+ settings.encoding == UTF8TOLATIN1) {

+ bool is_one_byte;

+ uint16_t* new_data = ConvertUtf8ToUtf16(data, length, &is_one_byte);

+ if (settings.encoding == UTF8TOLATIN1 && is_one_byte) {

+ *output_encoding = LATIN1;

+ } else {

+ *output_encoding = UTF16;

+ }

+ delete data;

+ data = new_data;

+ }

+ if (settings.encoding == UTF8TOLATIN1 && *output_encoding == LATIN1) {

+ uint16_t* new_data = ConvertUtf16ToLatin1(data, length);

delete data;

data = new_data;

}

@@ -265,16 +295,15 @@ class TokenWithLocation {

static TimeDelta RunLexer(const uint16_t* source,

const uint8_t* source_end,

Isolate* isolate,

+ Encoding output_encoding,

const LexerShellSettings& settings) {

SmartPointer<Utf16CharacterStream> stream;

const uint8_t* one_byte_source = reinterpret_cast<const uint8_t*>(source);

int bytes = source_end - one_byte_source;

- switch (settings.encoding) {

- case UTF8TO16:

+ switch (output_encoding) {

case UTF8:

stream.Reset(new Utf8ToUtf16CharacterStream(one_byte_source, bytes));

break;

- case UTF8TO16_PRECONVERT:

case UTF16: {

CHECK_EQ(0, bytes % 2);

Handle<String> result = isolate->factory()->NewStringFromTwoByte(

@@ -290,6 +319,9 @@ static TimeDelta RunLexer(const uint16_t* source,

new GenericStringUtf16CharacterStream(result, 0, result->length()));

break;

}

+ case UTF8TO16:

+ case UTF8TOLATIN1:

+ CHECK(false);

}

Scanner scanner(isolate->unicode_cache());

scanner.SetHarmonyNumericLiterals(settings.harmony_numeric_literals);

@@ -340,14 +372,16 @@ static TimeDelta ProcessFile(

TimeDelta time;

{

unsigned length_in_bytes;

- const uint16_t* buffer = ReadFile(fname, settings, &length_in_bytes);

+ Encoding output_encoding;

+ const uint16_t* buffer =

+ ReadFile(fname, settings, &length_in_bytes, &output_encoding);

const uint8_t* char_data = reinterpret_cast<const uint8_t*>(buffer);

const uint8_t* buffer_end = &char_data[length_in_bytes];

if (truncate_by > buffer_end - char_data) {

*can_truncate = false;

} else {

buffer_end -= truncate_by;

- time = RunLexer(buffer, buffer_end, isolate, settings);

+ time = RunLexer(buffer, buffer_end, isolate, output_encoding, settings);

}

delete[] buffer;

}

@@ -370,9 +404,15 @@ int main(int argc, char* argv[]) {

settings.encoding = UTF16;

} else if (strcmp(argv[i], "--utf8to16") == 0) {

#ifdef V8_USE_GENERATED_LEXER

- settings.encoding = UTF8TO16_PRECONVERT;

-#else

settings.encoding = UTF8TO16;

+#else

+ settings.encoding = UTF8;

+#endif

+ } else if (strcmp(argv[i], "--utf8tolatin1") == 0) {

+#ifdef V8_USE_GENERATED_LEXER

+ settings.encoding = UTF8TOLATIN1;

+#else

+ settings.encoding = UTF8;

#endif

} else if (strcmp(argv[i], "--print-tokens") == 0) {

settings.print_tokens = true;

« no previous file with comments | « no previous file | tools/lexer_generator/test/run_lexing_tests.py » ('j') | no next file with comments »