| Index: src/scanner.cc
|
| ===================================================================
|
| --- src/scanner.cc (revision 1926)
|
| +++ src/scanner.cc (working copy)
|
| @@ -48,8 +48,12 @@
|
| // ----------------------------------------------------------------------------
|
| // UTF8Buffer
|
|
|
| -UTF8Buffer::UTF8Buffer() : data_(NULL) {
|
| - Initialize(NULL, 0);
|
| +UTF8Buffer::UTF8Buffer() {
|
| + static const int kInitialCapacity = 1 * KB;
|
| + data_ = NewArray<char>(kInitialCapacity);
|
| + limit_ = ComputeLimit(data_, kInitialCapacity);
|
| + Reset();
|
| + ASSERT(Capacity() == kInitialCapacity && pos() == 0);
|
| }
|
|
|
|
|
| @@ -58,33 +62,27 @@
|
| }
|
|
|
|
|
| -void UTF8Buffer::Initialize(char* src, int length) {
|
| - DeleteArray(data_);
|
| - data_ = src;
|
| - size_ = length;
|
| - Reset();
|
| -}
|
| -
|
| -
|
| -void UTF8Buffer::AddChar(uc32 c) {
|
| - const int min_size = 1024;
|
| - if (pos_ + static_cast<int>(unibrow::Utf8::kMaxEncodedSize) > size_) {
|
| - int new_size = size_ * 2;
|
| - if (new_size < min_size) {
|
| - new_size = min_size;
|
| - }
|
| - char* new_data = NewArray<char>(new_size);
|
| - memcpy(new_data, data_, pos_);
|
| +void UTF8Buffer::AddCharSlow(uc32 c) {
|
| + static const int kCapacityGrowthLimit = 1 * MB;
|
| + if (cursor_ > limit_) {
|
| + int old_capacity = Capacity();
|
| + int old_position = pos();
|
| + int new_capacity =
|
| + Min(old_capacity * 2, old_capacity + kCapacityGrowthLimit);
|
| + char* new_data = NewArray<char>(new_capacity);
|
| + memcpy(new_data, data_, old_position);
|
| DeleteArray(data_);
|
| data_ = new_data;
|
| - size_ = new_size;
|
| + cursor_ = new_data + old_position;
|
| + limit_ = ComputeLimit(new_data, new_capacity);
|
| + ASSERT(Capacity() == new_capacity && pos() == old_position);
|
| }
|
| - if (static_cast<unsigned>(c) < unibrow::Utf8::kMaxOneByteChar) {
|
| - data_[pos_++] = c; // common case: 7bit ASCII
|
| + if (static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) {
|
| + *cursor_++ = c; // Common case: 7-bit ASCII.
|
| } else {
|
| - pos_ += unibrow::Utf8::Encode(&data_[pos_], c);
|
| + cursor_ += unibrow::Utf8::Encode(cursor_, c);
|
| }
|
| - ASSERT(pos_ <= size_);
|
| + ASSERT(pos() <= Capacity());
|
| }
|
|
|
|
|
| @@ -172,9 +170,10 @@
|
| ASSERT(kCharacterLookaheadBufferSize == 1);
|
| Advance();
|
|
|
| - // Skip initial whitespace (allowing HTML comment ends) and scan
|
| - // first token.
|
| - SkipWhiteSpace(true);
|
| + // Skip initial whitespace, allowing HTML comment ends ('-->') just as
|
| + // after a newline, and then scan the first token.
|
| + has_line_terminator_before_next_ = true;
|
| + SkipWhiteSpace();
|
| Scan();
|
| }
|
|
|
| @@ -246,18 +245,19 @@
|
| }
|
|
|
|
|
| -void Scanner::SkipWhiteSpace(bool initial) {
|
| - has_line_terminator_before_next_ = initial;
|
| +bool Scanner::SkipWhiteSpace() {
|
| + int start_position = source_pos();
|
|
|
| while (true) {
|
| // We treat byte-order marks (BOMs) as whitespace for better
|
| // compatibility with Spidermonkey and other JavaScript engines.
|
| while (kIsWhiteSpace.get(c0_) || IsByteOrderMark(c0_)) {
|
| // IsWhiteSpace() includes line terminators!
|
| - if (kIsLineTerminator.get(c0_))
|
| + if (kIsLineTerminator.get(c0_)) {
|
| // Ignore line terminators, but remember them. This is necessary
|
| // for automatic semicolon insertion.
|
| has_line_terminator_before_next_ = true;
|
| + }
|
| Advance();
|
| }
|
|
|
| @@ -279,7 +279,8 @@
|
| }
|
| PushBack('-'); // undo Advance()
|
| }
|
| - return;
|
| + // Return whether or not we skipped any characters.
|
| + return source_pos() != start_position;
|
| }
|
| }
|
|
|
| @@ -296,7 +297,7 @@
|
| Advance();
|
| }
|
|
|
| - return Token::COMMENT;
|
| + return Token::WHITESPACE;
|
| }
|
|
|
|
|
| @@ -316,7 +317,7 @@
|
| // matches the behaviour of SpiderMonkey and KJS.
|
| if (ch == '*' && c0_ == '/') {
|
| c0_ = ' ';
|
| - return Token::COMMENT;
|
| + return Token::WHITESPACE;
|
| }
|
| }
|
|
|
| @@ -342,19 +343,239 @@
|
|
|
| void Scanner::Scan() {
|
| Token::Value token;
|
| - bool has_line_terminator = false;
|
| + has_line_terminator_before_next_ = false;
|
| do {
|
| - SkipWhiteSpace(has_line_terminator);
|
| -
|
| - // Remember the line terminator in previous loop
|
| - has_line_terminator = has_line_terminator_before_next();
|
| -
|
| // Remember the position of the next token
|
| next_.location.beg_pos = source_pos();
|
|
|
| - token = ScanToken();
|
| - } while (token == Token::COMMENT);
|
| + switch (c0_) {
|
| + case ' ':
|
| + case '\t':
|
| + Advance();
|
| + token = Token::WHITESPACE;
|
| + break;
|
|
|
| + case '\n':
|
| + Advance();
|
| + has_line_terminator_before_next_ = true;
|
| + token = Token::WHITESPACE;
|
| + break;
|
| +
|
| + case '"': case '\'':
|
| + token = ScanString();
|
| + break;
|
| +
|
| + case '<':
|
| + // < <= << <<= <!--
|
| + Advance();
|
| + if (c0_ == '=') {
|
| + token = Select(Token::LTE);
|
| + } else if (c0_ == '<') {
|
| + token = Select('=', Token::ASSIGN_SHL, Token::SHL);
|
| + } else if (c0_ == '!') {
|
| + token = ScanHtmlComment();
|
| + } else {
|
| + token = Token::LT;
|
| + }
|
| + break;
|
| +
|
| + case '>':
|
| + // > >= >> >>= >>> >>>=
|
| + Advance();
|
| + if (c0_ == '=') {
|
| + token = Select(Token::GTE);
|
| + } else if (c0_ == '>') {
|
| + // >> >>= >>> >>>=
|
| + Advance();
|
| + if (c0_ == '=') {
|
| + token = Select(Token::ASSIGN_SAR);
|
| + } else if (c0_ == '>') {
|
| + token = Select('=', Token::ASSIGN_SHR, Token::SHR);
|
| + } else {
|
| + token = Token::SAR;
|
| + }
|
| + } else {
|
| + token = Token::GT;
|
| + }
|
| + break;
|
| +
|
| + case '=':
|
| + // = == ===
|
| + Advance();
|
| + if (c0_ == '=') {
|
| + token = Select('=', Token::EQ_STRICT, Token::EQ);
|
| + } else {
|
| + token = Token::ASSIGN;
|
| + }
|
| + break;
|
| +
|
| + case '!':
|
| + // ! != !==
|
| + Advance();
|
| + if (c0_ == '=') {
|
| + token = Select('=', Token::NE_STRICT, Token::NE);
|
| + } else {
|
| + token = Token::NOT;
|
| + }
|
| + break;
|
| +
|
| + case '+':
|
| + // + ++ +=
|
| + Advance();
|
| + if (c0_ == '+') {
|
| + token = Select(Token::INC);
|
| + } else if (c0_ == '=') {
|
| + token = Select(Token::ASSIGN_ADD);
|
| + } else {
|
| + token = Token::ADD;
|
| + }
|
| + break;
|
| +
|
| + case '-':
|
| + // - -- --> -=
|
| + Advance();
|
| + if (c0_ == '-') {
|
| + Advance();
|
| + if (c0_ == '>' && has_line_terminator_before_next_) {
|
| + // For compatibility with SpiderMonkey, we skip lines that
|
| + // start with an HTML comment end '-->'.
|
| + token = SkipSingleLineComment();
|
| + } else {
|
| + token = Token::DEC;
|
| + }
|
| + } else if (c0_ == '=') {
|
| + token = Select(Token::ASSIGN_SUB);
|
| + } else {
|
| + token = Token::SUB;
|
| + }
|
| + break;
|
| +
|
| + case '*':
|
| + // * *=
|
| + token = Select('=', Token::ASSIGN_MUL, Token::MUL);
|
| + break;
|
| +
|
| + case '%':
|
| + // % %=
|
| + token = Select('=', Token::ASSIGN_MOD, Token::MOD);
|
| + break;
|
| +
|
| + case '/':
|
| + // / // /* /=
|
| + Advance();
|
| + if (c0_ == '/') {
|
| + token = SkipSingleLineComment();
|
| + } else if (c0_ == '*') {
|
| + token = SkipMultiLineComment();
|
| + } else if (c0_ == '=') {
|
| + token = Select(Token::ASSIGN_DIV);
|
| + } else {
|
| + token = Token::DIV;
|
| + }
|
| + break;
|
| +
|
| + case '&':
|
| + // & && &=
|
| + Advance();
|
| + if (c0_ == '&') {
|
| + token = Select(Token::AND);
|
| + } else if (c0_ == '=') {
|
| + token = Select(Token::ASSIGN_BIT_AND);
|
| + } else {
|
| + token = Token::BIT_AND;
|
| + }
|
| + break;
|
| +
|
| + case '|':
|
| + // | || |=
|
| + Advance();
|
| + if (c0_ == '|') {
|
| + token = Select(Token::OR);
|
| + } else if (c0_ == '=') {
|
| + token = Select(Token::ASSIGN_BIT_OR);
|
| + } else {
|
| + token = Token::BIT_OR;
|
| + }
|
| + break;
|
| +
|
| + case '^':
|
| + // ^ ^=
|
| + token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
|
| + break;
|
| +
|
| + case '.':
|
| + // . Number
|
| + Advance();
|
| + if (IsDecimalDigit(c0_)) {
|
| + token = ScanNumber(true);
|
| + } else {
|
| + token = Token::PERIOD;
|
| + }
|
| + break;
|
| +
|
| + case ':':
|
| + token = Select(Token::COLON);
|
| + break;
|
| +
|
| + case ';':
|
| + token = Select(Token::SEMICOLON);
|
| + break;
|
| +
|
| + case ',':
|
| + token = Select(Token::COMMA);
|
| + break;
|
| +
|
| + case '(':
|
| + token = Select(Token::LPAREN);
|
| + break;
|
| +
|
| + case ')':
|
| + token = Select(Token::RPAREN);
|
| + break;
|
| +
|
| + case '[':
|
| + token = Select(Token::LBRACK);
|
| + break;
|
| +
|
| + case ']':
|
| + token = Select(Token::RBRACK);
|
| + break;
|
| +
|
| + case '{':
|
| + token = Select(Token::LBRACE);
|
| + break;
|
| +
|
| + case '}':
|
| + token = Select(Token::RBRACE);
|
| + break;
|
| +
|
| + case '?':
|
| + token = Select(Token::CONDITIONAL);
|
| + break;
|
| +
|
| + case '~':
|
| + token = Select(Token::BIT_NOT);
|
| + break;
|
| +
|
| + default:
|
| + if (kIsIdentifierStart.get(c0_)) {
|
| + token = ScanIdentifier();
|
| + } else if (IsDecimalDigit(c0_)) {
|
| + token = ScanNumber(false);
|
| + } else if (SkipWhiteSpace()) {
|
| + token = Token::WHITESPACE;
|
| + } else if (c0_ < 0) {
|
| + token = Token::EOS;
|
| + } else {
|
| + token = Select(Token::ILLEGAL);
|
| + }
|
| + break;
|
| + }
|
| +
|
| + // Continue scanning for tokens as long as we're just skipping
|
| + // whitespace or comments (both are reported as Token::WHITESPACE).
|
| + } while (token == Token::WHITESPACE);
|
| +
|
| next_.location.end_pos = source_pos();
|
| next_.token = token;
|
| }
|
| @@ -495,147 +716,6 @@
|
| }
|
|
|
|
|
| -Token::Value Scanner::ScanToken() {
|
| - switch (c0_) {
|
| - // strings
|
| - case '"': case '\'':
|
| - return ScanString();
|
| -
|
| - case '<':
|
| - // < <= << <<= <!--
|
| - Advance();
|
| - if (c0_ == '=') return Select(Token::LTE);
|
| - if (c0_ == '<') return Select('=', Token::ASSIGN_SHL, Token::SHL);
|
| - if (c0_ == '!') return ScanHtmlComment();
|
| - return Token::LT;
|
| -
|
| - case '>':
|
| - // > >= >> >>= >>> >>>=
|
| - Advance();
|
| - if (c0_ == '=') return Select(Token::GTE);
|
| - if (c0_ == '>') {
|
| - // >> >>= >>> >>>=
|
| - Advance();
|
| - if (c0_ == '=') return Select(Token::ASSIGN_SAR);
|
| - if (c0_ == '>') return Select('=', Token::ASSIGN_SHR, Token::SHR);
|
| - return Token::SAR;
|
| - }
|
| - return Token::GT;
|
| -
|
| - case '=':
|
| - // = == ===
|
| - Advance();
|
| - if (c0_ == '=') return Select('=', Token::EQ_STRICT, Token::EQ);
|
| - return Token::ASSIGN;
|
| -
|
| - case '!':
|
| - // ! != !==
|
| - Advance();
|
| - if (c0_ == '=') return Select('=', Token::NE_STRICT, Token::NE);
|
| - return Token::NOT;
|
| -
|
| - case '+':
|
| - // + ++ +=
|
| - Advance();
|
| - if (c0_ == '+') return Select(Token::INC);
|
| - if (c0_ == '=') return Select(Token::ASSIGN_ADD);
|
| - return Token::ADD;
|
| -
|
| - case '-':
|
| - // - -- -=
|
| - Advance();
|
| - if (c0_ == '-') return Select(Token::DEC);
|
| - if (c0_ == '=') return Select(Token::ASSIGN_SUB);
|
| - return Token::SUB;
|
| -
|
| - case '*':
|
| - // * *=
|
| - return Select('=', Token::ASSIGN_MUL, Token::MUL);
|
| -
|
| - case '%':
|
| - // % %=
|
| - return Select('=', Token::ASSIGN_MOD, Token::MOD);
|
| -
|
| - case '/':
|
| - // / // /* /=
|
| - Advance();
|
| - if (c0_ == '/') return SkipSingleLineComment();
|
| - if (c0_ == '*') return SkipMultiLineComment();
|
| - if (c0_ == '=') return Select(Token::ASSIGN_DIV);
|
| - return Token::DIV;
|
| -
|
| - case '&':
|
| - // & && &=
|
| - Advance();
|
| - if (c0_ == '&') return Select(Token::AND);
|
| - if (c0_ == '=') return Select(Token::ASSIGN_BIT_AND);
|
| - return Token::BIT_AND;
|
| -
|
| - case '|':
|
| - // | || |=
|
| - Advance();
|
| - if (c0_ == '|') return Select(Token::OR);
|
| - if (c0_ == '=') return Select(Token::ASSIGN_BIT_OR);
|
| - return Token::BIT_OR;
|
| -
|
| - case '^':
|
| - // ^ ^=
|
| - return Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
|
| -
|
| - case '.':
|
| - // . Number
|
| - Advance();
|
| - if (IsDecimalDigit(c0_)) return ScanNumber(true);
|
| - return Token::PERIOD;
|
| -
|
| - case ':':
|
| - return Select(Token::COLON);
|
| -
|
| - case ';':
|
| - return Select(Token::SEMICOLON);
|
| -
|
| - case ',':
|
| - return Select(Token::COMMA);
|
| -
|
| - case '(':
|
| - return Select(Token::LPAREN);
|
| -
|
| - case ')':
|
| - return Select(Token::RPAREN);
|
| -
|
| - case '[':
|
| - return Select(Token::LBRACK);
|
| -
|
| - case ']':
|
| - return Select(Token::RBRACK);
|
| -
|
| - case '{':
|
| - return Select(Token::LBRACE);
|
| -
|
| - case '}':
|
| - return Select(Token::RBRACE);
|
| -
|
| - case '?':
|
| - return Select(Token::CONDITIONAL);
|
| -
|
| - case '~':
|
| - return Select(Token::BIT_NOT);
|
| -
|
| - default:
|
| - if (kIsIdentifierStart.get(c0_))
|
| - return ScanIdentifier();
|
| - if (IsDecimalDigit(c0_))
|
| - return ScanNumber(false);
|
| - if (c0_ < 0)
|
| - return Token::EOS;
|
| - return Select(Token::ILLEGAL);
|
| - }
|
| -
|
| - UNREACHABLE();
|
| - return Token::ILLEGAL;
|
| -}
|
| -
|
| -
|
| // Returns true if any decimal digits were scanned, returns false otherwise.
|
| void Scanner::ScanDecimalDigits() {
|
| while (IsDecimalDigit(c0_))
|
| @@ -734,7 +814,6 @@
|
|
|
| Token::Value Scanner::ScanIdentifier() {
|
| ASSERT(kIsIdentifierStart.get(c0_));
|
| -
|
| bool has_escapes = false;
|
|
|
| StartLiteral();
|
| @@ -746,8 +825,10 @@
|
| if (!kIsIdentifierStart.get(c)) return Token::ILLEGAL;
|
| AddChar(c);
|
| } else {
|
| - AddCharAdvance();
|
| + AddChar(c0_);
|
| + Advance();
|
| }
|
| +
|
| // Scan the rest of the identifier characters.
|
| while (kIsIdentifierPart.get(c0_)) {
|
| if (c0_ == '\\') {
|
| @@ -757,19 +838,22 @@
|
| if (!kIsIdentifierPart.get(c)) return Token::ILLEGAL;
|
| AddChar(c);
|
| } else {
|
| - AddCharAdvance();
|
| + AddChar(c0_);
|
| + Advance();
|
| }
|
| }
|
| TerminateLiteral();
|
|
|
| // We don't have any 1-letter keywords (this is probably a common case).
|
| - if ((next_.literal_end - next_.literal_pos) == 1)
|
| + if ((next_.literal_end - next_.literal_pos) == 1) {
|
| return Token::IDENTIFIER;
|
| + }
|
|
|
| // If the identifier contains unicode escapes, it must not be
|
| // resolved to a keyword.
|
| - if (has_escapes)
|
| + if (has_escapes) {
|
| return Token::IDENTIFIER;
|
| + }
|
|
|
| return Token::Lookup(&literals_.data()[next_.literal_pos]);
|
| }
|
|
|