Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(549)

Side by Side Diff: src/scanner-base.h

Issue 6075005: Change scanner buffers to not use utf-8. (Closed)
Patch Set: Fixed linto. Created 10 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/scanner.cc ('k') | src/scanner-base.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2010 the V8 project authors. All rights reserved. 1 // Copyright 2010 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 123 matching lines...) Expand 10 before | Expand all | Expand 10 after
134 static unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator; 134 static unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator;
135 static unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace; 135 static unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace;
136 136
137 static bool IsIdentifier(unibrow::CharacterStream* buffer); 137 static bool IsIdentifier(unibrow::CharacterStream* buffer);
138 138
139 private: 139 private:
140 static StaticResource<Utf8Decoder> utf8_decoder_; 140 static StaticResource<Utf8Decoder> utf8_decoder_;
141 }; 141 };
142 142
143 // ---------------------------------------------------------------------------- 143 // ----------------------------------------------------------------------------
144 // LiteralCollector - Collector of chars of literals. 144 // LiteralBuffer - Collector of chars of literals.
145 145
146 class LiteralCollector { 146 class LiteralBuffer {
147 public: 147 public:
148 LiteralCollector(); 148 LiteralBuffer() : is_ascii_(true), position_(0), backing_store_() { }
149 ~LiteralCollector();
150 149
151 inline void AddChar(uc32 c) { 150 ~LiteralBuffer() {
152 if (recording_) { 151 if (backing_store_.length() > 0) {
153 if (static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) { 152 backing_store_.Dispose();
154 buffer_.Add(static_cast<char>(c));
155 } else {
156 AddCharSlow(c);
157 }
158 } 153 }
159 } 154 }
160 155
161 void StartLiteral() { 156 inline void AddChar(uc16 character) {
162 buffer_.StartSequence(); 157 if (position_ >= backing_store_.length()) ExpandBuffer();
163 recording_ = true; 158 if (is_ascii_) {
159 if (character < kMaxAsciiCharCodeU) {
160 backing_store_[position_] = static_cast<byte>(character);
161 position_ += kASCIISize;
162 return;
163 }
164 ConvertToUC16();
165 }
166 *reinterpret_cast<uc16*>(&backing_store_[position_]) = character;
167 position_ += kUC16Size;
164 } 168 }
165 169
166 Vector<const char> EndLiteral() { 170 bool is_ascii() { return is_ascii_; }
167 if (recording_) { 171
168 recording_ = false; 172 Vector<const uc16> uc16_literal() {
169 buffer_.Add(kEndMarker); 173 ASSERT(!is_ascii_);
170 Vector<char> sequence = buffer_.EndSequence(); 174 ASSERT((position_ & 0x1) == 0);
171 return Vector<const char>(sequence.start(), sequence.length()); 175 return Vector<const uc16>(
172 } 176 reinterpret_cast<const uc16*>(backing_store_.start()),
173 return Vector<const char>(); 177 position_ >> 1);
174 } 178 }
175 179
176 void DropLiteral() { 180 Vector<const char> ascii_literal() {
177 if (recording_) { 181 ASSERT(is_ascii_);
178 recording_ = false; 182 return Vector<const char>(
179 buffer_.DropSequence(); 183 reinterpret_cast<const char*>(backing_store_.start()),
180 } 184 position_);
185 }
186
187 int length() {
188 return is_ascii_ ? position_ : (position_ >> 1);
181 } 189 }
182 190
183 void Reset() { 191 void Reset() {
184 buffer_.Reset(); 192 position_ = 0;
193 is_ascii_ = true;
194 }
195 private:
196 static const int kInitialCapacity = 16;
197 static const int kGrowthFactory = 4;
198 static const int kMinConversionSlack = 256;
199 static const int kMaxGrowth = 1 * MB;
200 inline int NewCapacity(int min_capacity) {
201 int capacity = Max(min_capacity, backing_store_.length());
202 int new_capacity = Min(capacity * kGrowthFactory, capacity + kMaxGrowth);
203 return new_capacity;
185 } 204 }
186 205
187 // The end marker added after a parsed literal. 206 void ExpandBuffer() {
188 // Using zero allows the usage of strlen and similar functions on 207 Vector<byte> new_store = Vector<byte>::New(NewCapacity(kInitialCapacity));
189 // identifiers and numbers (but not strings, since they may contain zero 208 memcpy(new_store.start(), backing_store_.start(), position_);
190 // bytes). 209 backing_store_.Dispose();
191 static const char kEndMarker = '\x00'; 210 backing_store_ = new_store;
192 private: 211 }
193 static const int kInitialCapacity = 256; 212
194 SequenceCollector<char, 4> buffer_; 213 void ConvertToUC16() {
195 bool recording_; 214 ASSERT(is_ascii_);
196 void AddCharSlow(uc32 c); 215 Vector<byte> new_store;
216 int new_content_size = position_ * kUC16Size;
217 if (new_content_size > backing_store_.length()) {
218 new_store = Vector<byte>::New(NewCapacity(new_content_size));
219 } else {
220 new_store = backing_store_;
221 }
222 char* src = reinterpret_cast<char*>(backing_store_.start());
223 uc16* dst = reinterpret_cast<uc16*>(new_store.start());
224 for (int i = position_ - 1; i >= 0; i--) {
225 dst[i] = src[i];
226 }
227 if (new_store.start() != backing_store_.start()) {
228 backing_store_.Dispose();
229 backing_store_ = new_store;
230 }
231 position_ = new_content_size;
232 is_ascii_ = false;
233 }
234
235 bool is_ascii_;
236 int position_;
237 Vector<byte> backing_store_;
197 }; 238 };
198 239
240
199 // ---------------------------------------------------------------------------- 241 // ----------------------------------------------------------------------------
200 // Scanner base-class. 242 // Scanner base-class.
201 243
202 // Generic functionality used by both JSON and JavaScript scanners. 244 // Generic functionality used by both JSON and JavaScript scanners.
203 class Scanner { 245 class Scanner {
204 public: 246 public:
205 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder; 247 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
206 248
207 class LiteralScope { 249 class LiteralScope {
208 public: 250 public:
(...skipping 25 matching lines...) Expand all
234 // (the token returned by Next()). 276 // (the token returned by Next()).
235 Location location() const { return current_.location; } 277 Location location() const { return current_.location; }
236 Location peek_location() const { return next_.location; } 278 Location peek_location() const { return next_.location; }
237 279
238 // Returns the literal string, if any, for the current token (the 280 // Returns the literal string, if any, for the current token (the
239 // token returned by Next()). The string is 0-terminated and in 281 // token returned by Next()). The string is 0-terminated and in
240 // UTF-8 format; they may contain 0-characters. Literal strings are 282 // UTF-8 format; they may contain 0-characters. Literal strings are
241 // collected for identifiers, strings, and numbers. 283 // collected for identifiers, strings, and numbers.
242 // These functions only give the correct result if the literal 284 // These functions only give the correct result if the literal
243 // was scanned between calls to StartLiteral() and TerminateLiteral(). 285 // was scanned between calls to StartLiteral() and TerminateLiteral().
244 const char* literal_string() const { 286 bool is_literal_ascii() {
245 return current_.literal_chars.start(); 287 ASSERT_NOT_NULL(current_.literal_chars);
288 return current_.literal_chars->is_ascii();
246 } 289 }
247 290 Vector<const char> literal_ascii_string() {
291 ASSERT_NOT_NULL(current_.literal_chars);
292 return current_.literal_chars->ascii_literal();
293 }
294 Vector<const uc16> literal_uc16_string() {
295 ASSERT_NOT_NULL(current_.literal_chars);
296 return current_.literal_chars->uc16_literal();
297 }
248 int literal_length() const { 298 int literal_length() const {
249 // Excluding terminal '\x00' added by TerminateLiteral(). 299 ASSERT_NOT_NULL(current_.literal_chars);
250 return current_.literal_chars.length() - 1; 300 return current_.literal_chars->length();
251 }
252
253 Vector<const char> literal() const {
254 return Vector<const char>(literal_string(), literal_length());
255 } 301 }
256 302
257 // Returns the literal string for the next token (the token that 303 // Returns the literal string for the next token (the token that
258 // would be returned if Next() were called). 304 // would be returned if Next() were called).
259 const char* next_literal_string() const { 305 bool is_next_literal_ascii() {
260 return next_.literal_chars.start(); 306 ASSERT_NOT_NULL(next_.literal_chars);
307 return next_.literal_chars->is_ascii();
261 } 308 }
262 309 Vector<const char> next_literal_ascii_string() {
263 310 ASSERT_NOT_NULL(next_.literal_chars);
264 // Returns the length of the next token (that would be returned if 311 return next_.literal_chars->ascii_literal();
265 // Next() were called). 312 }
313 Vector<const uc16> next_literal_uc16_string() {
314 ASSERT_NOT_NULL(next_.literal_chars);
315 return next_.literal_chars->uc16_literal();
316 }
266 int next_literal_length() const { 317 int next_literal_length() const {
267 // Excluding terminal '\x00' added by TerminateLiteral(). 318 ASSERT_NOT_NULL(next_.literal_chars);
268 return next_.literal_chars.length() - 1; 319 return next_.literal_chars->length();
269 }
270
271 Vector<const char> next_literal() const {
272 return Vector<const char>(next_literal_string(), next_literal_length());
273 } 320 }
274 321
275 static const int kCharacterLookaheadBufferSize = 1; 322 static const int kCharacterLookaheadBufferSize = 1;
276 323
277 protected: 324 protected:
278 // The current and look-ahead token. 325 // The current and look-ahead token.
279 struct TokenDesc { 326 struct TokenDesc {
280 Token::Value token; 327 Token::Value token;
281 Location location; 328 Location location;
282 Vector<const char> literal_chars; 329 LiteralBuffer* literal_chars;
283 }; 330 };
284 331
285 // Call this after setting source_ to the input. 332 // Call this after setting source_ to the input.
286 void Init() { 333 void Init() {
287 // Set c0_ (one character ahead) 334 // Set c0_ (one character ahead)
288 ASSERT(kCharacterLookaheadBufferSize == 1); 335 ASSERT(kCharacterLookaheadBufferSize == 1);
289 Advance(); 336 Advance();
290 // Initialize current_ to not refer to a literal. 337 // Initialize current_ to not refer to a literal.
291 current_.literal_chars = Vector<const char>(); 338 current_.literal_chars = NULL;
292 // Reset literal buffer.
293 literal_buffer_.Reset();
294 } 339 }
295 340
296 // Literal buffer support 341 // Literal buffer support
297 inline void StartLiteral() { 342 inline void StartLiteral() {
298 literal_buffer_.StartLiteral(); 343 LiteralBuffer* free_buffer = (current_.literal_chars == &literal_buffer1_) ?
344 &literal_buffer2_ : &literal_buffer1_;
345 free_buffer->Reset();
346 next_.literal_chars = free_buffer;
299 } 347 }
300 348
301 inline void AddLiteralChar(uc32 c) { 349 inline void AddLiteralChar(uc32 c) {
302 literal_buffer_.AddChar(c); 350 ASSERT_NOT_NULL(next_.literal_chars);
351 next_.literal_chars->AddChar(c);
303 } 352 }
304 353
305 // Complete scanning of a literal. 354 // Complete scanning of a literal.
306 inline void TerminateLiteral() { 355 inline void TerminateLiteral() {
307 next_.literal_chars = literal_buffer_.EndLiteral(); 356 // Does nothing in the current implementation.
308 } 357 }
309 358
310 // Stops scanning of a literal and drop the collected characters, 359 // Stops scanning of a literal and drop the collected characters,
311 // e.g., due to an encountered error. 360 // e.g., due to an encountered error.
312 inline void DropLiteral() { 361 inline void DropLiteral() {
313 literal_buffer_.DropLiteral(); 362 next_.literal_chars = NULL;
314 } 363 }
315 364
316 inline void AddLiteralCharAdvance() { 365 inline void AddLiteralCharAdvance() {
317 AddLiteralChar(c0_); 366 AddLiteralChar(c0_);
318 Advance(); 367 Advance();
319 } 368 }
320 369
321 // Low-level scanning support. 370 // Low-level scanning support.
322 void Advance() { c0_ = source_->Advance(); } 371 void Advance() { c0_ = source_->Advance(); }
323 void PushBack(uc32 ch) { 372 void PushBack(uc32 ch) {
(...skipping 17 matching lines...) Expand all
341 } 390 }
342 391
343 uc32 ScanHexEscape(uc32 c, int length); 392 uc32 ScanHexEscape(uc32 c, int length);
344 uc32 ScanOctalEscape(uc32 c, int length); 393 uc32 ScanOctalEscape(uc32 c, int length);
345 394
346 // Return the current source position. 395 // Return the current source position.
347 int source_pos() { 396 int source_pos() {
348 return source_->pos() - kCharacterLookaheadBufferSize; 397 return source_->pos() - kCharacterLookaheadBufferSize;
349 } 398 }
350 399
400 // Buffers collecting literal strings, numbers, etc.
401 LiteralBuffer literal_buffer1_;
402 LiteralBuffer literal_buffer2_;
403
351 TokenDesc current_; // desc for current token (as returned by Next()) 404 TokenDesc current_; // desc for current token (as returned by Next())
352 TokenDesc next_; // desc for next token (one token look-ahead) 405 TokenDesc next_; // desc for next token (one token look-ahead)
353 406
354 // Input stream. Must be initialized to an UC16CharacterStream. 407 // Input stream. Must be initialized to an UC16CharacterStream.
355 UC16CharacterStream* source_; 408 UC16CharacterStream* source_;
356 409
357 // Buffer to hold literal values (identifiers, strings, numbers)
358 // using '\x00'-terminated UTF-8 encoding. Handles allocation internally.
359 LiteralCollector literal_buffer_;
360 410
361 // One Unicode character look-ahead; c0_ < 0 at the end of the input. 411 // One Unicode character look-ahead; c0_ < 0 at the end of the input.
362 uc32 c0_; 412 uc32 c0_;
363 }; 413 };
364 414
365 // ---------------------------------------------------------------------------- 415 // ----------------------------------------------------------------------------
366 // JavaScriptScanner - base logic for JavaScript scanning. 416 // JavaScriptScanner - base logic for JavaScript scanning.
367 417
368 class JavaScriptScanner : public Scanner { 418 class JavaScriptScanner : public Scanner {
369 public: 419 public:
370
371 // Bit vector representing set of types of literals.
372 enum LiteralType {
373 kNoLiterals = 0,
374 kLiteralNumber = 1,
375 kLiteralIdentifier = 2,
376 kLiteralString = 4,
377 kLiteralRegExp = 8,
378 kLiteralRegExpFlags = 16,
379 kAllLiterals = 31
380 };
381
382 // A LiteralScope that disables recording of some types of JavaScript 420 // A LiteralScope that disables recording of some types of JavaScript
383 // literals. If the scanner is configured to not record the specific 421 // literals. If the scanner is configured to not record the specific
384 // type of literal, the scope will not call StartLiteral. 422 // type of literal, the scope will not call StartLiteral.
385 class LiteralScope { 423 class LiteralScope {
386 public: 424 public:
387 LiteralScope(JavaScriptScanner* self, LiteralType type) 425 explicit LiteralScope(JavaScriptScanner* self)
388 : scanner_(self), complete_(false) { 426 : scanner_(self), complete_(false) {
389 if (scanner_->RecordsLiteral(type)) { 427 scanner_->StartLiteral();
390 scanner_->StartLiteral();
391 }
392 } 428 }
393 ~LiteralScope() { 429 ~LiteralScope() {
394 if (!complete_) scanner_->DropLiteral(); 430 if (!complete_) scanner_->DropLiteral();
395 } 431 }
396 void Complete() { 432 void Complete() {
397 scanner_->TerminateLiteral(); 433 scanner_->TerminateLiteral();
398 complete_ = true; 434 complete_ = true;
399 } 435 }
400 436
401 private: 437 private:
(...skipping 21 matching lines...) Expand all
423 // Tells whether the buffer contains an identifier (no escapes). 459 // Tells whether the buffer contains an identifier (no escapes).
424 // Used for checking if a property name is an identifier. 460 // Used for checking if a property name is an identifier.
425 static bool IsIdentifier(unibrow::CharacterStream* buffer); 461 static bool IsIdentifier(unibrow::CharacterStream* buffer);
426 462
427 // Seek forward to the given position. This operation does not 463 // Seek forward to the given position. This operation does not
428 // work in general, for instance when there are pushed back 464 // work in general, for instance when there are pushed back
429 // characters, but works for seeking forward until simple delimiter 465 // characters, but works for seeking forward until simple delimiter
430 // tokens, which is what it is used for. 466 // tokens, which is what it is used for.
431 void SeekForward(int pos); 467 void SeekForward(int pos);
432 468
433 // Whether this scanner records the given literal type or not.
434 bool RecordsLiteral(LiteralType type) {
435 return (literal_flags_ & type) != 0;
436 }
437
438 protected: 469 protected:
439 bool SkipWhiteSpace(); 470 bool SkipWhiteSpace();
440 Token::Value SkipSingleLineComment(); 471 Token::Value SkipSingleLineComment();
441 Token::Value SkipMultiLineComment(); 472 Token::Value SkipMultiLineComment();
442 473
443 // Scans a single JavaScript token. 474 // Scans a single JavaScript token.
444 void Scan(); 475 void Scan();
445 476
446 void ScanDecimalDigits(); 477 void ScanDecimalDigits();
447 Token::Value ScanNumber(bool seen_period); 478 Token::Value ScanNumber(bool seen_period);
448 Token::Value ScanIdentifierOrKeyword(); 479 Token::Value ScanIdentifierOrKeyword();
449 Token::Value ScanIdentifierSuffix(LiteralScope* literal); 480 Token::Value ScanIdentifierSuffix(LiteralScope* literal);
450 481
451 void ScanEscape(); 482 void ScanEscape();
452 Token::Value ScanString(); 483 Token::Value ScanString();
453 484
454 // Scans a possible HTML comment -- begins with '<!'. 485 // Scans a possible HTML comment -- begins with '<!'.
455 Token::Value ScanHtmlComment(); 486 Token::Value ScanHtmlComment();
456 487
457 // Decodes a unicode escape-sequence which is part of an identifier. 488 // Decodes a unicode escape-sequence which is part of an identifier.
458 // If the escape sequence cannot be decoded the result is kBadChar. 489 // If the escape sequence cannot be decoded the result is kBadChar.
459 uc32 ScanIdentifierUnicodeEscape(); 490 uc32 ScanIdentifierUnicodeEscape();
460 491
461 int literal_flags_;
462 bool has_line_terminator_before_next_; 492 bool has_line_terminator_before_next_;
463 }; 493 };
464 494
465 495
466 // ---------------------------------------------------------------------------- 496 // ----------------------------------------------------------------------------
467 // Keyword matching state machine. 497 // Keyword matching state machine.
468 498
469 class KeywordMatcher { 499 class KeywordMatcher {
470 // Incrementally recognize keywords. 500 // Incrementally recognize keywords.
471 // 501 //
(...skipping 112 matching lines...) Expand 10 before | Expand all | Expand 10 after
584 // keyword with the current prefix). 614 // keyword with the current prefix).
585 const char* keyword_; 615 const char* keyword_;
586 int counter_; 616 int counter_;
587 Token::Value keyword_token_; 617 Token::Value keyword_token_;
588 }; 618 };
589 619
590 620
591 } } // namespace v8::internal 621 } } // namespace v8::internal
592 622
593 #endif // V8_SCANNER_BASE_H_ 623 #endif // V8_SCANNER_BASE_H_
OLDNEW
« no previous file with comments | « src/scanner.cc ('k') | src/scanner-base.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698