Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(524)

Unified Diff: src/lexer/lexer.cc

Issue 187603004: Experimental parser: make utf8 sort of work (Closed) Base URL: https://v8.googlecode.com/svn/branches/experimental/parser
Patch Set: Created 6 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « src/lexer/lexer.h ('k') | no next file » | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: src/lexer/lexer.cc
diff --git a/src/lexer/lexer.cc b/src/lexer/lexer.cc
index df1e6541858b9edeac70ad05cc842f1348a69224..fe6df5998c567f81ef77c4d1bb406cd977cc527d 100644
--- a/src/lexer/lexer.cc
+++ b/src/lexer/lexer.cc
@@ -139,10 +139,10 @@ void LexerGCHandler::UpdateLexersAfterGC() {
LexerBase::LexerBase(UnicodeCache* unicode_cache)
: unicode_cache_(unicode_cache),
- has_line_terminator_before_next_(true),
- has_multiline_comment_before_next_(false),
current_literal_(&literals_[0]),
next_literal_(&literals_[1]),
+ has_line_terminator_before_next_(true),
+ has_multiline_comment_before_next_(false),
harmony_numeric_literals_(false),
harmony_modules_(false),
harmony_scoping_(false) {
@@ -170,14 +170,13 @@ Lexer<Char>::Lexer(UnicodeCache* unicode_cache,
: LexerBase(unicode_cache),
isolate_(NULL),
source_ptr_(source_ptr),
- start_position_(0),
end_position_(length),
- buffer_(NULL),
- buffer_end_(NULL),
- start_(NULL),
- cursor_(NULL),
+ buffer_(source_ptr),
+ buffer_end_(source_ptr + length),
+ start_(source_ptr),
+ cursor_(source_ptr),
last_octal_end_(NULL) {
- CHECK(false); // not yet supported
+ current_.beg_pos = current_.end_pos = next_.beg_pos = next_.end_pos = 0;
}
@@ -190,20 +189,16 @@ Lexer<Char>::Lexer(UnicodeCache* unicode_cache,
isolate_(source->GetIsolate()),
source_handle_(FlattenGetString(source)),
source_ptr_(NULL),
- start_position_(start_position),
end_position_(end_position),
buffer_(NULL),
buffer_end_(NULL),
start_(NULL),
cursor_(NULL),
last_octal_end_(NULL) {
+ cursor_ += start_position;
UpdateBufferBasedOnHandle();
- current_.beg_pos = current_.end_pos = next_.beg_pos = next_.end_pos = 0;
isolate_->lexer_gc_handler()->AddLexer(this);
- // TODO(dcarney): move this to UpdateBufferBasedOnHandle
- cursor_ = buffer_ + start_position;
- buffer_end_ = buffer_ + end_position;
- start_ = cursor_;
+ current_.beg_pos = current_.end_pos = next_.beg_pos = next_.end_pos = 0;
}
@@ -215,16 +210,19 @@ Lexer<Char>::~Lexer() {
}
+// TODO(dcarney): utf8 handling
template<typename Char>
void Lexer<Char>::SeekForward(int pos) {
+ // TODO(dcarney): utf8 handling
cursor_ = buffer_ + pos;
start_ = cursor_;
has_line_terminator_before_next_ = false;
has_multiline_comment_before_next_ = false;
- Scan(); // Fills in next_.
+ Scan();
}
+// TODO(dcarney): utf8 handling
template<typename Char>
bool Lexer<Char>::ScanRegExpPattern(bool seen_equal) {
// Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
@@ -269,6 +267,7 @@ bool Lexer<Char>::ScanRegExpPattern(bool seen_equal) {
}
+// TODO(dcarney): utf8 handling
template<typename Char>
bool Lexer<Char>::ScanRegExpFlags() {
next_.beg_pos = cursor_ - buffer_;
@@ -302,7 +301,7 @@ uc32 Lexer<Char>::ScanHexNumber(int length) {
template<typename Char>
-const Char* Lexer<Char>::ScanHexNumber(
+static const Char* ScanHexNumber(
const Char* cursor, const Char* end, uc32* result) {
uc32 x = 0;
for ( ; cursor < end; ++cursor) {
@@ -321,7 +320,7 @@ const Char* Lexer<Char>::ScanHexNumber(
// Octal escapes of the forms '\0xx' and '\xxx' are not a part of
// ECMA-262. Other JS VMs support them.
template<typename Char>
-const Char* Lexer<Char>::ScanOctalEscape(
+static const Char* ScanOctalEscape(
const Char* start, const Char* end, uc32* result) {
uc32 x = *result - '0';
const Char* cursor;
@@ -337,6 +336,7 @@ const Char* Lexer<Char>::ScanOctalEscape(
}
+// TODO(dcarney): utf8 handling
template<typename Char>
bool Lexer<Char>::ScanLiteralUnicodeEscape() {
ASSERT(cursor_ < buffer_end_);
@@ -359,7 +359,7 @@ bool Lexer<Char>::ScanLiteralUnicodeEscape() {
template<typename Char>
-const Char* Lexer<Char>::ScanIdentifierUnicodeEscape(
+static const Char* ScanIdentifierUnicodeEscape(
const Char* cursor, const Char* end, uc32* result) {
ASSERT(*cursor == '\\');
if (++cursor >= end) return NULL;
@@ -372,14 +372,16 @@ const Char* Lexer<Char>::ScanIdentifierUnicodeEscape(
template<typename Char>
-const Char* Lexer<Char>::ScanEscape(
- const Char* cursor, const Char* end, LiteralBuffer* literal) {
+static const Char* ScanEscape(UnicodeCache* cache,
+ const Char* cursor,
+ const Char* end,
+ LiteralBuffer* literal) {
ASSERT(*cursor == '\\');
if (++cursor >= end) return NULL;
uc32 c = *cursor;
if (++cursor > end) return NULL;
// Skip escaped newlines.
- if (unicode_cache_->IsLineTerminator(c)) {
+ if (cache->IsLineTerminator(c)) {
uc32 peek = *cursor;
// Allow CR+LF newlines in multiline string literals.
if (IsCarriageReturn(c) && IsLineFeed(peek)) cursor++;
@@ -432,14 +434,14 @@ const Char* Lexer<Char>::ScanEscape(
template<typename Char>
LexerBase::Location Lexer<Char>::octal_position() const {
- if (!last_octal_end_)
- return Location::invalid();
+ if (!last_octal_end_) return Location::invalid();
// The last octal might be an octal escape or an octal number. Whichever it
// is, we'll find the start by just scanning back until we hit a non-octal
// character.
const Char* temp_cursor = last_octal_end_ - 1;
- while (temp_cursor >= buffer_ && *temp_cursor >= '0' && *temp_cursor <= '7')
+ while (temp_cursor >= buffer_ && *temp_cursor >= '0' && *temp_cursor <= '7') {
--temp_cursor;
+ }
return Location(temp_cursor - buffer_ + 1, last_octal_end_ - buffer_);
}
@@ -477,102 +479,153 @@ void Lexer<Char>::UpdateBufferBasedOnHandle() {
int cursor_offset = cursor_ - buffer_;
int last_octal_end_offset = last_octal_end_ - buffer_;
buffer_ = new_buffer;
- buffer_end_ = buffer_ + source_handle_->length();
+ buffer_end_ = buffer_ + end_position_;
start_ = buffer_ + start_offset;
cursor_ = buffer_ + cursor_offset;
if (last_octal_end_ != NULL) {
last_octal_end_ = buffer_ + last_octal_end_offset;
}
- ResetLiterals();
+ current_literal_->Invalidate();
+ next_literal_->Invalidate();
}
}
-template<>
-bool Lexer<uint8_t>::IsSubstringOfSource(const TokenDesc& token) {
- return !token.has_escapes;
+void LexerBase::LiteralDesc::SetOneByteString(
+ Vector<const uint8_t> string, bool owned) {
+ is_in_buffer_ = false;
+ if (is_one_byte_string_owned_) {
+ one_byte_string_.Dispose();
+ }
+ is_one_byte_string_owned_ = owned;
+ is_one_byte_ = true;
+ one_byte_string_ = string;
+}
+
+
+void LexerBase::LiteralDesc::SetTwoByteString(Vector<const uint16_t> string) {
+ is_in_buffer_ = false;
+ is_one_byte_ = false;
+ two_byte_string_ = string;
+}
+
+
+void LexerBase::LiteralDesc::SetStringFromLiteralBuffer() {
+ is_one_byte_ = buffer.is_ascii();
+ is_in_buffer_ = true;
+ length = buffer.length();
+ if (is_one_byte_) {
+ if (is_one_byte_string_owned_) {
+ one_byte_string_.Dispose();
+ }
+ is_one_byte_string_owned_ = false;
+ one_byte_string_ = Vector<const uint8_t>::cast(buffer.ascii_literal());
+ } else {
+ two_byte_string_ = buffer.utf16_literal();
+ }
+}
+
+
+static inline bool IsOneByte(const uint8_t* cursor, const uint8_t* end) {
+ return true;
+}
+
+
+static inline bool IsOneByte(const uint16_t* cursor, const uint16_t* end) {
+ uint16_t acc = 0;
+ while (cursor != end) {
+ acc |= *cursor++ >> 8;
+ }
+ return acc == 0;
+}
+
+
+static inline bool IsOneByte(const int8_t* cursor, const int8_t* end) {
+ int8_t acc = 0;
+ while (cursor != end) {
+ acc |= *cursor++ >> 7;
+ }
+ return acc == 0;
}
template<>
-bool Lexer<uint16_t>::IsSubstringOfSource(
- const TokenDesc& token) {
- if (token.has_escapes) return false;
- const uint16_t* start = buffer_ + token.beg_pos;
- const uint16_t* end = buffer_ + token.end_pos;
- for (const uint16_t* cursor = start; cursor != end; ++cursor) {
- if (*cursor >= unibrow::Latin1::kMaxChar) return true;
+template<>
+inline void Lexer<uint16_t>::SetLiteral<true>(const uint16_t* cursor,
+ const uint16_t* end,
+ LiteralDesc* literal) {
+ Vector<uint8_t> vector = Vector<uint8_t>::New(literal->length);
+ uint8_t* data = vector.start();
+ while (cursor < end) {
+ *data++ = *cursor++;
}
- return false;
+ literal->SetOneByteString(Vector<const uint8_t>::cast(vector), true);
}
template<>
-bool Lexer<int8_t>::IsSubstringOfSource(const TokenDesc& token) {
- // FIXME: implement.
- UNREACHABLE();
- return false;
+template<>
+inline void Lexer<uint16_t>::SetLiteral<false>(const uint16_t* start,
+ const uint16_t* end,
+ LiteralDesc* literal) {
+ literal->SetTwoByteString(Vector<const uint16_t>(start, literal->length));
}
template<>
-bool Lexer<uint8_t>::FillLiteral(
- const TokenDesc& token, LiteralDesc* literal) {
- literal->beg_pos = token.beg_pos;
- const uint8_t* start = buffer_ + token.beg_pos;
- const uint8_t* end = buffer_ + token.end_pos;
- if (token.token == Token::STRING) {
- ++start;
- --end;
- }
- if (IsSubstringOfSource(token)) {
- literal->is_one_byte = true;
- literal->is_in_buffer = false;
- literal->offset = start - buffer_;
- literal->length = end - start;
- literal->one_byte_string = Vector<const uint8_t>(start, literal->length);
- return true;
- }
- return CopyToLiteralBuffer(start, end, token, literal);
+template<>
+inline void Lexer<uint8_t>::SetLiteral<true>(const uint8_t* start,
+ const uint8_t* end,
+ LiteralDesc* literal) {
+ literal->SetOneByteString(
+ Vector<const uint8_t>(start, literal->length), false);
}
template<>
-bool Lexer<uint16_t>::FillLiteral(
- const TokenDesc& token, LiteralDesc* literal) {
+template<>
+inline void Lexer<int8_t>::SetLiteral<true>(const int8_t* start,
+ const int8_t* end,
+ LiteralDesc* literal) {
+ const uint8_t* cast = reinterpret_cast<const uint8_t*>(start);
+ literal->SetOneByteString(
+ Vector<const uint8_t>(cast, literal->length), false);
+}
+
+
+template<class Char>
+bool Lexer<Char>::FillLiteral(const TokenDesc& token, LiteralDesc* literal) {
literal->beg_pos = token.beg_pos;
- const uint16_t* start = buffer_ + token.beg_pos;
- const uint16_t* end = buffer_ + token.end_pos;
+ const Char* start = buffer_ + token.beg_pos;
+ const Char* end = buffer_ + token.end_pos;
if (token.token == Token::STRING) {
++start;
--end;
}
- if (IsSubstringOfSource(token)) {
- literal->is_one_byte = false;
- literal->is_in_buffer = false;
- literal->offset = start - buffer_;
- literal->length = end - start;
- literal->two_byte_string = Vector<const uint16_t>(start, literal->length);
- return true;
+ if (!token.has_escapes) {
+ bool is_one_byte = IsOneByte(start, end);
+ if (sizeof(Char) == 2 || is_one_byte) {
+ literal->offset = start - buffer_;
+ literal->length = end - start;
+ if (sizeof(Char) == 1) {
+ SetLiteral<true>(start, end, literal);
+ } else if (is_one_byte) {
+ SetLiteral<true>(start, end, literal);
+ } else {
+ SetLiteral<false>(start, end, literal);
+ }
+ return true;
+ }
}
return CopyToLiteralBuffer(start, end, token, literal);
}
-template<>
-bool Lexer<int8_t>::FillLiteral(
- const TokenDesc& token, LiteralDesc* literal) {
- // FIXME: implement.
- UNREACHABLE();
- return false;
-}
-
-
template<class Char>
bool Lexer<Char>::CopyToLiteralBuffer(const Char* start,
- const Char* end,
- const TokenDesc& token,
- LiteralDesc* literal) {
+ const Char* end,
+ const TokenDesc& token,
+ LiteralDesc* literal) {
literal->buffer.Reset();
if (token.has_escapes) {
for (const Char* cursor = start; cursor != end;) {
@@ -585,25 +638,19 @@ bool Lexer<Char>::CopyToLiteralBuffer(const Char* start,
if (cursor == NULL) return false;
literal->buffer.AddChar(c);
} else {
- cursor = ScanEscape(cursor, end, &literal->buffer);
+ cursor = ScanEscape(unicode_cache_, cursor, end, &literal->buffer);
ASSERT(cursor != NULL);
if (cursor == NULL) return false;
}
}
} else {
+ // TODO(dcarney): This can only happen for utf8 strings;
+ // use a helper function.
for (const Char* cursor = start; cursor != end;) {
literal->buffer.AddChar(*cursor++);
}
}
- literal->is_one_byte = literal->buffer.is_ascii();
- literal->is_in_buffer = true;
- literal->length = literal->buffer.length();
- if (literal->is_one_byte) {
- literal->one_byte_string =
- Vector<const uint8_t>::cast(literal->buffer.ascii_literal());
- } else {
- literal->two_byte_string = literal->buffer.utf16_literal();
- }
+ literal->SetStringFromLiteralBuffer();
return true;
}
@@ -611,73 +658,79 @@ bool Lexer<Char>::CopyToLiteralBuffer(const Char* start,
template<class Char>
Handle<String> Lexer<Char>::InternalizeLiteral(
LiteralDesc* literal) {
- Factory* factory = isolate_->factory();
- if (literal->is_in_buffer) {
- return literal->is_one_byte
- ? factory->InternalizeOneByteString(
- Vector<const uint8_t>::cast(literal->one_byte_string))
- : factory->InternalizeTwoByteString(literal->two_byte_string);
- }
- if (sizeof(Char) == 1) {
- SubStringKey<uint8_t> key(
- source_handle_, literal->offset, literal->length);
- return factory->InternalizeStringWithKey(&key);
- } else {
- SubStringKey<uint16_t> key(
- source_handle_, literal->offset, literal->length);
- return factory->InternalizeStringWithKey(&key);
- }
+ // Factory* factory = isolate_->factory();
+ // if (literal->is_in_buffer) {
+ // return literal->is_one_byte
+ // ? factory->InternalizeOneByteString(
+ // Vector<const uint8_t>::cast(literal->one_byte_string))
+ // : factory->InternalizeTwoByteString(literal->two_byte_string);
+ // }
+ // if (sizeof(Char) == 1) {
+ // SubStringKey<uint8_t> key(
+ // source_handle_, literal->offset, literal->length);
+ // return factory->InternalizeStringWithKey(&key);
+ // } else {
+ // SubStringKey<uint16_t> key(
+ // source_handle_, literal->offset, literal->length);
+ // return factory->InternalizeStringWithKey(&key);
+ // }
+ CHECK(false);
+ return Handle<String>();
}
template<>
Handle<String> Lexer<uint8_t>::AllocateLiteral(
LiteralDesc* literal, PretenureFlag pretenured) {
- Factory* factory = isolate_->factory();
- if (literal->is_in_buffer) {
- return literal->is_one_byte
- ? factory->NewStringFromOneByte(literal->one_byte_string, pretenured)
- : factory->NewStringFromTwoByte(literal->two_byte_string, pretenured);
- }
- int from = literal->offset;
- int length = literal->length;
- // Save the offset and the length before allocating the string as it may
- // cause a GC, invalidate the literal, and move the source.
- Handle<String> result = factory->NewRawOneByteString(length, pretenured);
- uint8_t* chars = SeqOneByteString::cast(*result)->GetChars();
- String::WriteToFlat(*source_handle_, chars, from, from + length);
- return result;
+ // Factory* factory = isolate_->factory();
+ // if (literal->is_in_buffer) {
+ // return literal->is_one_byte
+ // ? factory->NewStringFromOneByte(literal->one_byte_string, pretenured)
+ // : factory->NewStringFromTwoByte(literal->two_byte_string, pretenured);
+ // }
+ // int from = literal->offset;
+ // int length = literal->length;
+ // // Save the offset and the length before allocating the string as it may
+ // // cause a GC, invalidate the literal, and move the source.
+ // Handle<String> result = factory->NewRawOneByteString(length, pretenured);
+ // uint8_t* chars = SeqOneByteString::cast(*result)->GetChars();
+ // String::WriteToFlat(*source_handle_, chars, from, from + length);
+ // return result;
+ CHECK(false);
+ return Handle<String>();
}
template<>
Handle<String> Lexer<uint16_t>::AllocateLiteral(
LiteralDesc* literal, PretenureFlag pretenured) {
- Factory* factory = isolate_->factory();
- if (literal->is_in_buffer) {
- return literal->is_one_byte
- ? factory->NewStringFromOneByte(literal->one_byte_string, pretenured)
- : factory->NewStringFromTwoByte(literal->two_byte_string, pretenured);
- }
- // Save the offset and the length before allocating the string as it may
- // cause a GC, invalidate the literal, and move the source.
- int from = literal->offset;
- int length = literal->length;
- Handle<String> result = factory->NewRawTwoByteString(length, pretenured);
- uint16_t* chars = SeqTwoByteString::cast(*result)->GetChars();
- String::WriteToFlat(*source_handle_, chars, from, from + length);
- return result;
+ // Factory* factory = isolate_->factory();
+ // if (literal->is_in_buffer) {
+ // return literal->is_one_byte
+ // ? factory->NewStringFromOneByte(literal->one_byte_string, pretenured)
+ // : factory->NewStringFromTwoByte(literal->two_byte_string, pretenured);
+ // }
+ // // Save the offset and the length before allocating the string as it may
+ // // cause a GC, invalidate the literal, and move the source.
+ // int from = literal->offset;
+ // int length = literal->length;
+ // Handle<String> result = factory->NewRawTwoByteString(length, pretenured);
+ // uint16_t* chars = SeqTwoByteString::cast(*result)->GetChars();
+ // String::WriteToFlat(*source_handle_, chars, from, from + length);
+ // return result;
+ CHECK(false);
+ return Handle<String>();
}
template<>
Handle<String> Lexer<int8_t>::AllocateLiteral(
LiteralDesc* literal, PretenureFlag pretenured) {
- // FIXME: implement
- UNREACHABLE();
+ CHECK(false);
return Handle<String>();
}
+
template class Lexer<uint8_t>;
template class Lexer<uint16_t>;
template class Lexer<int8_t>;
« no previous file with comments | « src/lexer/lexer.h ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698