src/scanner-base.h - Issue 6075005: Change scanner buffers to not use utf-8.

Side by Side Diff: src/scanner-base.h

Issue 6075005: Change scanner buffers to not use utf-8. (Closed)

Patch Set: Fixed lint. Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2010 the V8 project authors. All rights reserved.	1 // Copyright 2010 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without	2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are	3 // modification, are permitted provided that the following conditions are

4 // met:	4 // met:

5 //	5 //

6 // * Redistributions of source code must retain the above copyright	6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.	7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above	8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following	9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided	10 // disclaimer in the documentation and/or other materials provided

(...skipping 123 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
134 static unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator;	134 static unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator;

135 static unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace;	135 static unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace;

136	136

137 static bool IsIdentifier(unibrow::CharacterStream* buffer);	137 static bool IsIdentifier(unibrow::CharacterStream* buffer);

138	138

139 private:	139 private:

140 static StaticResource<Utf8Decoder> utf8_decoder_;	140 static StaticResource<Utf8Decoder> utf8_decoder_;

141 };	141 };

142	142

143 // ----------------------------------------------------------------------------	143 // ----------------------------------------------------------------------------

144 // LiteralCollector - Collector of chars of literals.	144 // LiteralBuffer - Collector of chars of literals.

145	145

146 class LiteralCollector {	146 class LiteralBuffer {

147 public:	147 public:

148 LiteralCollector();	148 LiteralBuffer() : is_ascii_(true), position_(0), backing_store_() { }

149 ~LiteralCollector();

150	149

151 inline void AddChar(uc32 c) {	150 ~LiteralBuffer() {

152 if (recording_) {	151 if (backing_store_.length() > 0) {

153 if (static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) {	152 backing_store_.Dispose();

154 buffer_.Add(static_cast<char>(c));

155 } else {

156 AddCharSlow(c);

157 }

158 }	153 }

159 }	154 }

160	155

161 void StartLiteral() {	156 inline void AddChar(uc16 character) {

162 buffer_.StartSequence();	157 if (position_ >= backing_store_.length()) ExpandBuffer();

163 recording_ = true;	158 if (is_ascii_) {

	159 if (character < kMaxAsciiCharCodeU) {

	160 backing_store_[position_] = static_cast<byte>(character);

	161 position_ += kASCIISize;

	162 return;

	163 }

	164 ConvertToUC16();

	165 }

	166 reinterpret_cast<uc16>(&backing_store_[position_]) = character;

	167 position_ += kUC16Size;

164 }	168 }

165	169

166 Vector<const char> EndLiteral() {	170 bool is_ascii() { return is_ascii_; }

167 if (recording_) {	171

168 recording_ = false;	172 Vector<const uc16> uc16_literal() {

169 buffer_.Add(kEndMarker);	173 ASSERT(!is_ascii_);

170 Vector<char> sequence = buffer_.EndSequence();	174 ASSERT((position_ & 0x1) == 0);

171 return Vector<const char>(sequence.start(), sequence.length());	175 return Vector<const uc16>(

172 }	176 reinterpret_cast<const uc16*>(backing_store_.start()),

173 return Vector<const char>();	177 position_ >> 1);

174 }	178 }

175	179

176 void DropLiteral() {	180 Vector<const char> ascii_literal() {

177 if (recording_) {	181 ASSERT(is_ascii_);

178 recording_ = false;	182 return Vector<const char>(

179 buffer_.DropSequence();	183 reinterpret_cast<const char*>(backing_store_.start()),

180 }	184 position_);

	185 }

	186

	187 int length() {

	188 return is_ascii_ ? position_ : (position_ >> 1);

181 }	189 }

182	190

183 void Reset() {	191 void Reset() {

184 buffer_.Reset();	192 position_ = 0;

	193 is_ascii_ = true;

	194 }

	195 private:

	196 static const int kInitialCapacity = 16;

	197 static const int kGrowthFactory = 4;

	198 static const int kMinConversionSlack = 256;

	199 static const int kMaxGrowth = 1 * MB;

	200 inline int NewCapacity(int min_capacity) {

	201 int capacity = Max(min_capacity, backing_store_.length());

	202 int new_capacity = Min(capacity * kGrowthFactory, capacity + kMaxGrowth);

	203 return new_capacity;

185 }	204 }

186	205

187 // The end marker added after a parsed literal.	206 void ExpandBuffer() {

188 // Using zero allows the usage of strlen and similar functions on	207 Vector<byte> new_store = Vector<byte>::New(NewCapacity(kInitialCapacity));

189 // identifiers and numbers (but not strings, since they may contain zero	208 memcpy(new_store.start(), backing_store_.start(), position_);

190 // bytes).	209 backing_store_.Dispose();

191 static const char kEndMarker = '\x00';	210 backing_store_ = new_store;

192 private:	211 }

193 static const int kInitialCapacity = 256;	212

194 SequenceCollector<char, 4> buffer_;	213 void ConvertToUC16() {

195 bool recording_;	214 ASSERT(is_ascii_);

196 void AddCharSlow(uc32 c);	215 Vector<byte> new_store;

	216 int new_content_size = position_ * kUC16Size;

	217 if (new_content_size > backing_store_.length()) {

	218 new_store = Vector<byte>::New(NewCapacity(new_content_size));

	219 } else {

	220 new_store = backing_store_;

	221 }

	222 char* src = reinterpret_cast<char*>(backing_store_.start());

	223 uc16* dst = reinterpret_cast<uc16*>(new_store.start());

	224 for (int i = position_ - 1; i >= 0; i--) {

	225 dst[i] = src[i];

	226 }

	227 if (new_store.start() != backing_store_.start()) {

	228 backing_store_.Dispose();

	229 backing_store_ = new_store;

	230 }

	231 position_ = new_content_size;

	232 is_ascii_ = false;

	233 }

	234

	235 bool is_ascii_;

	236 int position_;

	237 Vector<byte> backing_store_;

197 };	238 };

198	239

	240

199 // ----------------------------------------------------------------------------	241 // ----------------------------------------------------------------------------

200 // Scanner base-class.	242 // Scanner base-class.

201	243

202 // Generic functionality used by both JSON and JavaScript scanners.	244 // Generic functionality used by both JSON and JavaScript scanners.

203 class Scanner {	245 class Scanner {

204 public:	246 public:

205 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;	247 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;

206	248

207 class LiteralScope {	249 class LiteralScope {

208 public:	250 public:

(...skipping 25 matching lines...) Expand all Loading...
234 // (the token returned by Next()).	276 // (the token returned by Next()).

235 Location location() const { return current_.location; }	277 Location location() const { return current_.location; }

236 Location peek_location() const { return next_.location; }	278 Location peek_location() const { return next_.location; }

237	279

238 // Returns the literal string, if any, for the current token (the	280 // Returns the literal string, if any, for the current token (the

239 // token returned by Next()). The string is 0-terminated and in	281 // token returned by Next()). The string is 0-terminated and in

240 // UTF-8 format; they may contain 0-characters. Literal strings are	282 // UTF-8 format; they may contain 0-characters. Literal strings are

241 // collected for identifiers, strings, and numbers.	283 // collected for identifiers, strings, and numbers.

242 // These functions only give the correct result if the literal	284 // These functions only give the correct result if the literal

243 // was scanned between calls to StartLiteral() and TerminateLiteral().	285 // was scanned between calls to StartLiteral() and TerminateLiteral().

244 const char* literal_string() const {	286 bool is_literal_ascii() {

245 return current_.literal_chars.start();	287 ASSERT_NOT_NULL(current_.literal_chars);

	288 return current_.literal_chars->is_ascii();

246 }	289 }

247	290 Vector<const char> literal_ascii_string() {

	291 ASSERT_NOT_NULL(current_.literal_chars);

	292 return current_.literal_chars->ascii_literal();

	293 }

	294 Vector<const uc16> literal_uc16_string() {

	295 ASSERT_NOT_NULL(current_.literal_chars);

	296 return current_.literal_chars->uc16_literal();

	297 }

248 int literal_length() const {	298 int literal_length() const {

249 // Excluding terminal '\x00' added by TerminateLiteral().	299 ASSERT_NOT_NULL(current_.literal_chars);

250 return current_.literal_chars.length() - 1;	300 return current_.literal_chars->length();

251 }

252

253 Vector<const char> literal() const {

254 return Vector<const char>(literal_string(), literal_length());

255 }	301 }

256	302

257 // Returns the literal string for the next token (the token that	303 // Returns the literal string for the next token (the token that

258 // would be returned if Next() were called).	304 // would be returned if Next() were called).

259 const char* next_literal_string() const {	305 bool is_next_literal_ascii() {

260 return next_.literal_chars.start();	306 ASSERT_NOT_NULL(next_.literal_chars);

	307 return next_.literal_chars->is_ascii();

261 }	308 }

262	309 Vector<const char> next_literal_ascii_string() {

263	310 ASSERT_NOT_NULL(next_.literal_chars);

264 // Returns the length of the next token (that would be returned if	311 return next_.literal_chars->ascii_literal();

265 // Next() were called).	312 }

	313 Vector<const uc16> next_literal_uc16_string() {

	314 ASSERT_NOT_NULL(next_.literal_chars);

	315 return next_.literal_chars->uc16_literal();

	316 }

266 int next_literal_length() const {	317 int next_literal_length() const {

267 // Excluding terminal '\x00' added by TerminateLiteral().	318 ASSERT_NOT_NULL(next_.literal_chars);

268 return next_.literal_chars.length() - 1;	319 return next_.literal_chars->length();

269 }

270

271 Vector<const char> next_literal() const {

272 return Vector<const char>(next_literal_string(), next_literal_length());

273 }	320 }

274	321

275 static const int kCharacterLookaheadBufferSize = 1;	322 static const int kCharacterLookaheadBufferSize = 1;

276	323

277 protected:	324 protected:

278 // The current and look-ahead token.	325 // The current and look-ahead token.

279 struct TokenDesc {	326 struct TokenDesc {

280 Token::Value token;	327 Token::Value token;

281 Location location;	328 Location location;

282 Vector<const char> literal_chars;	329 LiteralBuffer* literal_chars;

283 };	330 };

284	331

285 // Call this after setting source_ to the input.	332 // Call this after setting source_ to the input.

286 void Init() {	333 void Init() {

287 // Set c0_ (one character ahead)	334 // Set c0_ (one character ahead)

288 ASSERT(kCharacterLookaheadBufferSize == 1);	335 ASSERT(kCharacterLookaheadBufferSize == 1);

289 Advance();	336 Advance();

290 // Initialize current_ to not refer to a literal.	337 // Initialize current_ to not refer to a literal.

291 current_.literal_chars = Vector<const char>();	338 current_.literal_chars = NULL;

292 // Reset literal buffer.

293 literal_buffer_.Reset();

294 }	339 }

295	340

296 // Literal buffer support	341 // Literal buffer support

297 inline void StartLiteral() {	342 inline void StartLiteral() {

298 literal_buffer_.StartLiteral();	343 LiteralBuffer* free_buffer = (current_.literal_chars == &literal_buffer1_) ?

	344 &literal_buffer2_ : &literal_buffer1_;

	345 free_buffer->Reset();

	346 next_.literal_chars = free_buffer;

299 }	347 }

300	348

301 inline void AddLiteralChar(uc32 c) {	349 inline void AddLiteralChar(uc32 c) {

302 literal_buffer_.AddChar(c);	350 ASSERT_NOT_NULL(next_.literal_chars);

	351 next_.literal_chars->AddChar(c);

303 }	352 }

304	353

305 // Complete scanning of a literal.	354 // Complete scanning of a literal.

306 inline void TerminateLiteral() {	355 inline void TerminateLiteral() {

307 next_.literal_chars = literal_buffer_.EndLiteral();	356 // Does nothing in the current implementation.

308 }	357 }

309	358

310 // Stops scanning of a literal and drop the collected characters,	359 // Stops scanning of a literal and drop the collected characters,

311 // e.g., due to an encountered error.	360 // e.g., due to an encountered error.

312 inline void DropLiteral() {	361 inline void DropLiteral() {

313 literal_buffer_.DropLiteral();	362 next_.literal_chars = NULL;

314 }	363 }

315	364

316 inline void AddLiteralCharAdvance() {	365 inline void AddLiteralCharAdvance() {

317 AddLiteralChar(c0_);	366 AddLiteralChar(c0_);

318 Advance();	367 Advance();

319 }	368 }

320	369

321 // Low-level scanning support.	370 // Low-level scanning support.

322 void Advance() { c0_ = source_->Advance(); }	371 void Advance() { c0_ = source_->Advance(); }

323 void PushBack(uc32 ch) {	372 void PushBack(uc32 ch) {

(...skipping 17 matching lines...) Expand all Loading...
341 }	390 }

342	391

343 uc32 ScanHexEscape(uc32 c, int length);	392 uc32 ScanHexEscape(uc32 c, int length);

344 uc32 ScanOctalEscape(uc32 c, int length);	393 uc32 ScanOctalEscape(uc32 c, int length);

345	394

346 // Return the current source position.	395 // Return the current source position.

347 int source_pos() {	396 int source_pos() {

348 return source_->pos() - kCharacterLookaheadBufferSize;	397 return source_->pos() - kCharacterLookaheadBufferSize;

349 }	398 }

350	399

	400 // Buffers collecting literal strings, numbers, etc.

	401 LiteralBuffer literal_buffer1_;

	402 LiteralBuffer literal_buffer2_;

	403

351 TokenDesc current_; // desc for current token (as returned by Next())	404 TokenDesc current_; // desc for current token (as returned by Next())

352 TokenDesc next_; // desc for next token (one token look-ahead)	405 TokenDesc next_; // desc for next token (one token look-ahead)

353	406

354 // Input stream. Must be initialized to an UC16CharacterStream.	407 // Input stream. Must be initialized to an UC16CharacterStream.

355 UC16CharacterStream* source_;	408 UC16CharacterStream* source_;

356	409

357 // Buffer to hold literal values (identifiers, strings, numbers)

358 // using '\x00'-terminated UTF-8 encoding. Handles allocation internally.

359 LiteralCollector literal_buffer_;

360	410

361 // One Unicode character look-ahead; c0_ < 0 at the end of the input.	411 // One Unicode character look-ahead; c0_ < 0 at the end of the input.

362 uc32 c0_;	412 uc32 c0_;

363 };	413 };

364	414

365 // ----------------------------------------------------------------------------	415 // ----------------------------------------------------------------------------

366 // JavaScriptScanner - base logic for JavaScript scanning.	416 // JavaScriptScanner - base logic for JavaScript scanning.

367	417

368 class JavaScriptScanner : public Scanner {	418 class JavaScriptScanner : public Scanner {

369 public:	419 public:

370

371 // Bit vector representing set of types of literals.

372 enum LiteralType {

373 kNoLiterals = 0,

374 kLiteralNumber = 1,

375 kLiteralIdentifier = 2,

376 kLiteralString = 4,

377 kLiteralRegExp = 8,

378 kLiteralRegExpFlags = 16,

379 kAllLiterals = 31

380 };

381

382 // A LiteralScope that disables recording of some types of JavaScript	420 // A LiteralScope that disables recording of some types of JavaScript

383 // literals. If the scanner is configured to not record the specific	421 // literals. If the scanner is configured to not record the specific

384 // type of literal, the scope will not call StartLiteral.	422 // type of literal, the scope will not call StartLiteral.

385 class LiteralScope {	423 class LiteralScope {

386 public:	424 public:

387 LiteralScope(JavaScriptScanner* self, LiteralType type)	425 explicit LiteralScope(JavaScriptScanner* self)

388 : scanner_(self), complete_(false) {	426 : scanner_(self), complete_(false) {

389 if (scanner_->RecordsLiteral(type)) {	427 scanner_->StartLiteral();

390 scanner_->StartLiteral();

391 }

392 }	428 }

393 ~LiteralScope() {	429 ~LiteralScope() {

394 if (!complete_) scanner_->DropLiteral();	430 if (!complete_) scanner_->DropLiteral();

395 }	431 }

396 void Complete() {	432 void Complete() {

397 scanner_->TerminateLiteral();	433 scanner_->TerminateLiteral();

398 complete_ = true;	434 complete_ = true;

399 }	435 }

400	436

401 private:	437 private:

(...skipping 21 matching lines...) Expand all Loading...
423 // Tells whether the buffer contains an identifier (no escapes).	459 // Tells whether the buffer contains an identifier (no escapes).

424 // Used for checking if a property name is an identifier.	460 // Used for checking if a property name is an identifier.

425 static bool IsIdentifier(unibrow::CharacterStream* buffer);	461 static bool IsIdentifier(unibrow::CharacterStream* buffer);

426	462

427 // Seek forward to the given position. This operation does not	463 // Seek forward to the given position. This operation does not

428 // work in general, for instance when there are pushed back	464 // work in general, for instance when there are pushed back

429 // characters, but works for seeking forward until simple delimiter	465 // characters, but works for seeking forward until simple delimiter

430 // tokens, which is what it is used for.	466 // tokens, which is what it is used for.

431 void SeekForward(int pos);	467 void SeekForward(int pos);

432	468

433 // Whether this scanner records the given literal type or not.

434 bool RecordsLiteral(LiteralType type) {

435 return (literal_flags_ & type) != 0;

436 }

437

438 protected:	469 protected:

439 bool SkipWhiteSpace();	470 bool SkipWhiteSpace();

440 Token::Value SkipSingleLineComment();	471 Token::Value SkipSingleLineComment();

441 Token::Value SkipMultiLineComment();	472 Token::Value SkipMultiLineComment();

442	473

443 // Scans a single JavaScript token.	474 // Scans a single JavaScript token.

444 void Scan();	475 void Scan();

445	476

446 void ScanDecimalDigits();	477 void ScanDecimalDigits();

447 Token::Value ScanNumber(bool seen_period);	478 Token::Value ScanNumber(bool seen_period);

448 Token::Value ScanIdentifierOrKeyword();	479 Token::Value ScanIdentifierOrKeyword();

449 Token::Value ScanIdentifierSuffix(LiteralScope* literal);	480 Token::Value ScanIdentifierSuffix(LiteralScope* literal);

450	481

451 void ScanEscape();	482 void ScanEscape();

452 Token::Value ScanString();	483 Token::Value ScanString();

453	484

454 // Scans a possible HTML comment -- begins with '<!'.	485 // Scans a possible HTML comment -- begins with '<!'.

455 Token::Value ScanHtmlComment();	486 Token::Value ScanHtmlComment();

456	487

457 // Decodes a unicode escape-sequence which is part of an identifier.	488 // Decodes a unicode escape-sequence which is part of an identifier.

458 // If the escape sequence cannot be decoded the result is kBadChar.	489 // If the escape sequence cannot be decoded the result is kBadChar.

459 uc32 ScanIdentifierUnicodeEscape();	490 uc32 ScanIdentifierUnicodeEscape();

460	491

461 int literal_flags_;

462 bool has_line_terminator_before_next_;	492 bool has_line_terminator_before_next_;

463 };	493 };

464	494

465	495

466 // ----------------------------------------------------------------------------	496 // ----------------------------------------------------------------------------

467 // Keyword matching state machine.	497 // Keyword matching state machine.

468	498

469 class KeywordMatcher {	499 class KeywordMatcher {

470 // Incrementally recognize keywords.	500 // Incrementally recognize keywords.

471 //	501 //

(...skipping 112 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
584 // keyword with the current prefix).	614 // keyword with the current prefix).

585 const char* keyword_;	615 const char* keyword_;

586 int counter_;	616 int counter_;

587 Token::Value keyword_token_;	617 Token::Value keyword_token_;

588 };	618 };

589	619

590	620

591 } } // namespace v8::internal	621 } } // namespace v8::internal

592	622

593 #endif // V8_SCANNER_BASE_H_	623 #endif // V8_SCANNER_BASE_H_

OLD	NEW

« no previous file with comments | « src/scanner.cc ('k') | src/scanner-base.cc » ('j') | no next file with comments »