src/lexer/lexer.cc - Issue 187603004: Experimental parser: make utf8 sort of work

Unified Diff: src/lexer/lexer.cc

Issue 187603004: Experimental parser: make utf8 sort of work (Closed) Base URL: https://v8.googlecode.com/svn/branches/experimental/parser

Patch Set: Created 6 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/lexer/lexer.cc

diff --git a/src/lexer/lexer.cc b/src/lexer/lexer.cc

index df1e6541858b9edeac70ad05cc842f1348a69224..fe6df5998c567f81ef77c4d1bb406cd977cc527d 100644

--- a/src/lexer/lexer.cc

+++ b/src/lexer/lexer.cc

@@ -139,10 +139,10 @@ void LexerGCHandler::UpdateLexersAfterGC() {

LexerBase::LexerBase(UnicodeCache* unicode_cache)

: unicode_cache_(unicode_cache),

- has_line_terminator_before_next_(true),

- has_multiline_comment_before_next_(false),

current_literal_(&literals_[0]),

next_literal_(&literals_[1]),

+ has_line_terminator_before_next_(true),

+ has_multiline_comment_before_next_(false),

harmony_numeric_literals_(false),

harmony_modules_(false),

harmony_scoping_(false) {

@@ -170,14 +170,13 @@ Lexer<Char>::Lexer(UnicodeCache* unicode_cache,

: LexerBase(unicode_cache),

isolate_(NULL),

source_ptr_(source_ptr),

- start_position_(0),

end_position_(length),

- buffer_(NULL),

- buffer_end_(NULL),

- start_(NULL),

- cursor_(NULL),

+ buffer_(source_ptr),

+ buffer_end_(source_ptr + length),

+ start_(source_ptr),

+ cursor_(source_ptr),

last_octal_end_(NULL) {

- CHECK(false); // not yet supported

+ current_.beg_pos = current_.end_pos = next_.beg_pos = next_.end_pos = 0;

}

@@ -190,20 +189,16 @@ Lexer<Char>::Lexer(UnicodeCache* unicode_cache,

isolate_(source->GetIsolate()),

source_handle_(FlattenGetString(source)),

source_ptr_(NULL),

- start_position_(start_position),

end_position_(end_position),

buffer_(NULL),

buffer_end_(NULL),

start_(NULL),

cursor_(NULL),

last_octal_end_(NULL) {

+ cursor_ += start_position;

UpdateBufferBasedOnHandle();

- current_.beg_pos = current_.end_pos = next_.beg_pos = next_.end_pos = 0;

isolate_->lexer_gc_handler()->AddLexer(this);

- // TODO(dcarney): move this to UpdateBufferBasedOnHandle

- cursor_ = buffer_ + start_position;

- buffer_end_ = buffer_ + end_position;

- start_ = cursor_;

+ current_.beg_pos = current_.end_pos = next_.beg_pos = next_.end_pos = 0;

}

@@ -215,16 +210,19 @@ Lexer<Char>::~Lexer() {

}

+// TODO(dcarney): utf8 handling

template<typename Char>

void Lexer<Char>::SeekForward(int pos) {

+ // TODO(dcarney): utf8 handling

cursor_ = buffer_ + pos;

start_ = cursor_;

has_line_terminator_before_next_ = false;

has_multiline_comment_before_next_ = false;

- Scan(); // Fills in next_.

+ Scan();

}

+// TODO(dcarney): utf8 handling

template<typename Char>

bool Lexer<Char>::ScanRegExpPattern(bool seen_equal) {

// Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags

@@ -269,6 +267,7 @@ bool Lexer<Char>::ScanRegExpPattern(bool seen_equal) {

}

+// TODO(dcarney): utf8 handling

template<typename Char>

bool Lexer<Char>::ScanRegExpFlags() {

next_.beg_pos = cursor_ - buffer_;

@@ -302,7 +301,7 @@ uc32 Lexer<Char>::ScanHexNumber(int length) {

template<typename Char>

-const Char* Lexer<Char>::ScanHexNumber(

+static const Char* ScanHexNumber(

const Char* cursor, const Char* end, uc32* result) {

uc32 x = 0;

for ( ; cursor < end; ++cursor) {

@@ -321,7 +320,7 @@ const Char* Lexer<Char>::ScanHexNumber(

// Octal escapes of the forms '\0xx' and '\xxx' are not a part of

// ECMA-262. Other JS VMs support them.

template<typename Char>

-const Char* Lexer<Char>::ScanOctalEscape(

+static const Char* ScanOctalEscape(

const Char* start, const Char* end, uc32* result) {

uc32 x = *result - '0';

const Char* cursor;

@@ -337,6 +336,7 @@ const Char* Lexer<Char>::ScanOctalEscape(

}

+// TODO(dcarney): utf8 handling

template<typename Char>

bool Lexer<Char>::ScanLiteralUnicodeEscape() {

ASSERT(cursor_ < buffer_end_);

@@ -359,7 +359,7 @@ bool Lexer<Char>::ScanLiteralUnicodeEscape() {

template<typename Char>

-const Char* Lexer<Char>::ScanIdentifierUnicodeEscape(

+static const Char* ScanIdentifierUnicodeEscape(

const Char* cursor, const Char* end, uc32* result) {

ASSERT(*cursor == '\\');

if (++cursor >= end) return NULL;

@@ -372,14 +372,16 @@ const Char* Lexer<Char>::ScanIdentifierUnicodeEscape(

template<typename Char>

-const Char* Lexer<Char>::ScanEscape(

- const Char* cursor, const Char* end, LiteralBuffer* literal) {

+static const Char* ScanEscape(UnicodeCache* cache,

+ const Char* cursor,

+ const Char* end,

+ LiteralBuffer* literal) {

ASSERT(*cursor == '\\');

if (++cursor >= end) return NULL;

uc32 c = *cursor;

if (++cursor > end) return NULL;

// Skip escaped newlines.

- if (unicode_cache_->IsLineTerminator(c)) {

+ if (cache->IsLineTerminator(c)) {

uc32 peek = *cursor;

// Allow CR+LF newlines in multiline string literals.

if (IsCarriageReturn(c) && IsLineFeed(peek)) cursor++;

@@ -432,14 +434,14 @@ const Char* Lexer<Char>::ScanEscape(

template<typename Char>

LexerBase::Location Lexer<Char>::octal_position() const {

- if (!last_octal_end_)

- return Location::invalid();

+ if (!last_octal_end_) return Location::invalid();

// The last octal might be an octal escape or an octal number. Whichever it

// is, we'll find the start by just scanning back until we hit a non-octal

// character.

const Char* temp_cursor = last_octal_end_ - 1;

- while (temp_cursor >= buffer_ && *temp_cursor >= '0' && *temp_cursor <= '7')

+ while (temp_cursor >= buffer_ && *temp_cursor >= '0' && *temp_cursor <= '7') {

--temp_cursor;

+ }

return Location(temp_cursor - buffer_ + 1, last_octal_end_ - buffer_);

}

@@ -477,102 +479,153 @@ void Lexer<Char>::UpdateBufferBasedOnHandle() {

int cursor_offset = cursor_ - buffer_;

int last_octal_end_offset = last_octal_end_ - buffer_;

buffer_ = new_buffer;

- buffer_end_ = buffer_ + source_handle_->length();

+ buffer_end_ = buffer_ + end_position_;

start_ = buffer_ + start_offset;

cursor_ = buffer_ + cursor_offset;

if (last_octal_end_ != NULL) {

last_octal_end_ = buffer_ + last_octal_end_offset;

}

- ResetLiterals();

+ current_literal_->Invalidate();

+ next_literal_->Invalidate();

}

-template<>

-bool Lexer<uint8_t>::IsSubstringOfSource(const TokenDesc& token) {

- return !token.has_escapes;

+void LexerBase::LiteralDesc::SetOneByteString(

+ Vector<const uint8_t> string, bool owned) {

+ is_in_buffer_ = false;

+ if (is_one_byte_string_owned_) {

+ one_byte_string_.Dispose();

+ }

+ is_one_byte_string_owned_ = owned;

+ is_one_byte_ = true;

+ one_byte_string_ = string;

+void LexerBase::LiteralDesc::SetTwoByteString(Vector<const uint16_t> string) {

+ is_in_buffer_ = false;

+ is_one_byte_ = false;

+ two_byte_string_ = string;

+void LexerBase::LiteralDesc::SetStringFromLiteralBuffer() {

+ is_one_byte_ = buffer.is_ascii();

+ is_in_buffer_ = true;

+ length = buffer.length();

+ if (is_one_byte_) {

+ if (is_one_byte_string_owned_) {

+ one_byte_string_.Dispose();

+ }

+ is_one_byte_string_owned_ = false;

+ one_byte_string_ = Vector<const uint8_t>::cast(buffer.ascii_literal());

+ } else {

+ two_byte_string_ = buffer.utf16_literal();

+ }

+static inline bool IsOneByte(const uint8_t* cursor, const uint8_t* end) {

+ return true;

+static inline bool IsOneByte(const uint16_t* cursor, const uint16_t* end) {

+ uint16_t acc = 0;

+ while (cursor != end) {

+ acc |= *cursor++ >> 8;

+ }

+ return acc == 0;

+static inline bool IsOneByte(const int8_t* cursor, const int8_t* end) {

+ int8_t acc = 0;

+ while (cursor != end) {

+ acc |= *cursor++ >> 7;

+ }

+ return acc == 0;

}

template<>

-bool Lexer<uint16_t>::IsSubstringOfSource(

- const TokenDesc& token) {

- if (token.has_escapes) return false;

- const uint16_t* start = buffer_ + token.beg_pos;

- const uint16_t* end = buffer_ + token.end_pos;

- for (const uint16_t* cursor = start; cursor != end; ++cursor) {

- if (*cursor >= unibrow::Latin1::kMaxChar) return true;

+template<>

+inline void Lexer<uint16_t>::SetLiteral<true>(const uint16_t* cursor,

+ const uint16_t* end,

+ LiteralDesc* literal) {

+ Vector<uint8_t> vector = Vector<uint8_t>::New(literal->length);

+ uint8_t* data = vector.start();

+ while (cursor < end) {

+ *data++ = *cursor++;

}

- return false;

+ literal->SetOneByteString(Vector<const uint8_t>::cast(vector), true);

}

template<>

-bool Lexer<int8_t>::IsSubstringOfSource(const TokenDesc& token) {

- // FIXME: implement.

- UNREACHABLE();

- return false;

+template<>

+inline void Lexer<uint16_t>::SetLiteral<false>(const uint16_t* start,

+ const uint16_t* end,

+ LiteralDesc* literal) {

+ literal->SetTwoByteString(Vector<const uint16_t>(start, literal->length));

}

template<>

-bool Lexer<uint8_t>::FillLiteral(

- const TokenDesc& token, LiteralDesc* literal) {

- literal->beg_pos = token.beg_pos;

- const uint8_t* start = buffer_ + token.beg_pos;

- const uint8_t* end = buffer_ + token.end_pos;

- if (token.token == Token::STRING) {

- ++start;

- --end;

- }

- if (IsSubstringOfSource(token)) {

- literal->is_one_byte = true;

- literal->is_in_buffer = false;

- literal->offset = start - buffer_;

- literal->length = end - start;

- literal->one_byte_string = Vector<const uint8_t>(start, literal->length);

- return true;

- }

- return CopyToLiteralBuffer(start, end, token, literal);

+template<>

+inline void Lexer<uint8_t>::SetLiteral<true>(const uint8_t* start,

+ const uint8_t* end,

+ LiteralDesc* literal) {

+ literal->SetOneByteString(

+ Vector<const uint8_t>(start, literal->length), false);

}

template<>

-bool Lexer<uint16_t>::FillLiteral(

- const TokenDesc& token, LiteralDesc* literal) {

+template<>

+inline void Lexer<int8_t>::SetLiteral<true>(const int8_t* start,

+ const int8_t* end,

+ LiteralDesc* literal) {

+ const uint8_t* cast = reinterpret_cast<const uint8_t*>(start);

+ literal->SetOneByteString(

+ Vector<const uint8_t>(cast, literal->length), false);

+template<class Char>

+bool Lexer<Char>::FillLiteral(const TokenDesc& token, LiteralDesc* literal) {

literal->beg_pos = token.beg_pos;

- const uint16_t* start = buffer_ + token.beg_pos;

- const uint16_t* end = buffer_ + token.end_pos;

+ const Char* start = buffer_ + token.beg_pos;

+ const Char* end = buffer_ + token.end_pos;

if (token.token == Token::STRING) {

++start;

--end;

}

- if (IsSubstringOfSource(token)) {

- literal->is_one_byte = false;

- literal->is_in_buffer = false;

- literal->offset = start - buffer_;

- literal->length = end - start;

- literal->two_byte_string = Vector<const uint16_t>(start, literal->length);

- return true;

+ if (!token.has_escapes) {

+ bool is_one_byte = IsOneByte(start, end);

+ if (sizeof(Char) == 2 || is_one_byte) {

+ literal->offset = start - buffer_;

+ literal->length = end - start;

+ if (sizeof(Char) == 1) {

+ SetLiteral<true>(start, end, literal);

+ } else if (is_one_byte) {

+ SetLiteral<true>(start, end, literal);

+ } else {

+ SetLiteral<false>(start, end, literal);

+ }

+ return true;

+ }

}

return CopyToLiteralBuffer(start, end, token, literal);

}

-template<>

-bool Lexer<int8_t>::FillLiteral(

- const TokenDesc& token, LiteralDesc* literal) {

- // FIXME: implement.

- UNREACHABLE();

- return false;

template<class Char>

bool Lexer<Char>::CopyToLiteralBuffer(const Char* start,

- const Char* end,

- const TokenDesc& token,

- LiteralDesc* literal) {

+ const Char* end,

+ const TokenDesc& token,

+ LiteralDesc* literal) {

literal->buffer.Reset();

if (token.has_escapes) {

for (const Char* cursor = start; cursor != end;) {

@@ -585,25 +638,19 @@ bool Lexer<Char>::CopyToLiteralBuffer(const Char* start,

if (cursor == NULL) return false;

literal->buffer.AddChar(c);

} else {

- cursor = ScanEscape(cursor, end, &literal->buffer);

+ cursor = ScanEscape(unicode_cache_, cursor, end, &literal->buffer);

ASSERT(cursor != NULL);

if (cursor == NULL) return false;

}

} else {

+ // TODO(dcarney): This can only happen for utf8 strings

+ // use a helper function.

for (const Char* cursor = start; cursor != end;) {

literal->buffer.AddChar(*cursor++);

}

- literal->is_one_byte = literal->buffer.is_ascii();

- literal->is_in_buffer = true;

- literal->length = literal->buffer.length();

- if (literal->is_one_byte) {

- literal->one_byte_string =

- Vector<const uint8_t>::cast(literal->buffer.ascii_literal());

- } else {

- literal->two_byte_string = literal->buffer.utf16_literal();

- }

+ literal->SetStringFromLiteralBuffer();

return true;

}

@@ -611,73 +658,79 @@ bool Lexer<Char>::CopyToLiteralBuffer(const Char* start,

template<class Char>

Handle<String> Lexer<Char>::InternalizeLiteral(

LiteralDesc* literal) {

- Factory* factory = isolate_->factory();

- if (literal->is_in_buffer) {

- return literal->is_one_byte

- ? factory->InternalizeOneByteString(

- Vector<const uint8_t>::cast(literal->one_byte_string))

- : factory->InternalizeTwoByteString(literal->two_byte_string);

- }

- if (sizeof(Char) == 1) {

- SubStringKey<uint8_t> key(

- source_handle_, literal->offset, literal->length);

- return factory->InternalizeStringWithKey(&key);

- } else {

- SubStringKey<uint16_t> key(

- source_handle_, literal->offset, literal->length);

- return factory->InternalizeStringWithKey(&key);

- }

+ // Factory* factory = isolate_->factory();

+ // if (literal->is_in_buffer) {

+ // return literal->is_one_byte

+ // ? factory->InternalizeOneByteString(

+ // Vector<const uint8_t>::cast(literal->one_byte_string))

+ // : factory->InternalizeTwoByteString(literal->two_byte_string);

+ // }

+ // if (sizeof(Char) == 1) {

+ // SubStringKey<uint8_t> key(

+ // source_handle_, literal->offset, literal->length);

+ // return factory->InternalizeStringWithKey(&key);

+ // } else {

+ // SubStringKey<uint16_t> key(

+ // source_handle_, literal->offset, literal->length);

+ // return factory->InternalizeStringWithKey(&key);

+ // }

+ CHECK(false);

+ return Handle<String>();

}

template<>

Handle<String> Lexer<uint8_t>::AllocateLiteral(

LiteralDesc* literal, PretenureFlag pretenured) {

- Factory* factory = isolate_->factory();

- if (literal->is_in_buffer) {

- return literal->is_one_byte

- ? factory->NewStringFromOneByte(literal->one_byte_string, pretenured)

- : factory->NewStringFromTwoByte(literal->two_byte_string, pretenured);

- }

- int from = literal->offset;

- int length = literal->length;

- // Save the offset and the length before allocating the string as it may

- // cause a GC, invalidate the literal, and move the source.

- Handle<String> result = factory->NewRawOneByteString(length, pretenured);

- uint8_t* chars = SeqOneByteString::cast(*result)->GetChars();

- String::WriteToFlat(*source_handle_, chars, from, from + length);

- return result;

+ // Factory* factory = isolate_->factory();

+ // if (literal->is_in_buffer) {

+ // return literal->is_one_byte

+ // ? factory->NewStringFromOneByte(literal->one_byte_string, pretenured)

+ // : factory->NewStringFromTwoByte(literal->two_byte_string, pretenured)

+ // }

+ // int from = literal->offset;

+ // int length = literal->length;

+ // // Save the offset and the length before allocating the string as it may

+ // // cause a GC, invalidate the literal, and move the source.

+ // Handle<String> result = factory->NewRawOneByteString(length, pretenured);

+ // uint8_t* chars = SeqOneByteString::cast(*result)->GetChars();

+ // String::WriteToFlat(*source_handle_, chars, from, from + length);

+ // return result;

+ CHECK(false);

+ return Handle<String>();

}

template<>

Handle<String> Lexer<uint16_t>::AllocateLiteral(

LiteralDesc* literal, PretenureFlag pretenured) {

- Factory* factory = isolate_->factory();

- if (literal->is_in_buffer) {

- return literal->is_one_byte

- ? factory->NewStringFromOneByte(literal->one_byte_string, pretenured)

- : factory->NewStringFromTwoByte(literal->two_byte_string, pretenured);

- }

- // Save the offset and the length before allocating the string as it may

- // cause a GC, invalidate the literal, and move the source.

- int from = literal->offset;

- int length = literal->length;

- Handle<String> result = factory->NewRawTwoByteString(length, pretenured);

- uint16_t* chars = SeqTwoByteString::cast(*result)->GetChars();

- String::WriteToFlat(*source_handle_, chars, from, from + length);

- return result;

+ // Factory* factory = isolate_->factory();

+ // if (literal->is_in_buffer) {

+ // return literal->is_one_byte

+ // ? factory->NewStringFromOneByte(literal->one_byte_string, pretenured)

+ // : factory->NewStringFromTwoByte(literal->two_byte_string, pretenured)

+ // }

+ // // Save the offset and the length before allocating the string as it may

+ // // cause a GC, invalidate the literal, and move the source.

+ // int from = literal->offset;

+ // int length = literal->length;

+ // Handle<String> result = factory->NewRawTwoByteString(length, pretenured);

+ // uint16_t* chars = SeqTwoByteString::cast(*result)->GetChars();

+ // String::WriteToFlat(*source_handle_, chars, from, from + length);

+ // return result;

+ CHECK(false);

+ return Handle<String>();

}

template<>

Handle<String> Lexer<int8_t>::AllocateLiteral(

LiteralDesc* literal, PretenureFlag pretenured) {

- // FIXME: implement

- UNREACHABLE();

+ CHECK(false);

return Handle<String>();

}

template class Lexer<uint8_t>;

template class Lexer<uint16_t>;

template class Lexer<int8_t>;

« no previous file with comments | « src/lexer/lexer.h ('k') | no next file » | no next file with comments »