src/scanner.cc - Issue 5545006: Optimized scanner to avoid virtual calls for every character read.

Side by Side Diff: src/scanner.cc

Issue 5545006: Optimized scanner to avoid virtual calls for every character read. (Closed)

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2010 the V8 project authors. All rights reserved.	1 // Copyright 2010 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without	2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are	3 // modification, are permitted provided that the following conditions are

4 // met:	4 // met:

5 //	5 //

6 // * Redistributions of source code must retain the above copyright	6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.	7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above	8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following	9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided	10 // disclaimer in the documentation and/or other materials provided

(...skipping 18 matching lines...) Expand all Loading...
29	29

30 #include "ast.h"	30 #include "ast.h"

31 #include "handles.h"	31 #include "handles.h"

32 #include "scanner.h"	32 #include "scanner.h"

33 #include "unicode-inl.h"	33 #include "unicode-inl.h"

34	34

35 namespace v8 {	35 namespace v8 {

36 namespace internal {	36 namespace internal {

37	37

38 // ----------------------------------------------------------------------------	38 // ----------------------------------------------------------------------------

39 // UTF16Buffer	39 // BufferedUC16CharacterStreams

40	40

41 // CharacterStreamUTF16Buffer	41 BufferedUC16CharacterStream::BufferedUC16CharacterStream()

42 CharacterStreamUTF16Buffer::CharacterStreamUTF16Buffer()	42 : UC16CharacterStream(),

43 : pushback_buffer_(0), last_(0), stream_(NULL) { }	43 pushback_limit_(NULL) {

44	44 // Initialize buffer as being empty. First read will fill the buffer.

45	45 buffer_cursor_ = buffer_;

46 void CharacterStreamUTF16Buffer::Initialize(Handle<String> data,	46 buffer_end_ = buffer_;

47 unibrow::CharacterStream* input,	47 }

48 int start_position,	48

49 int end_position) {	49 BufferedUC16CharacterStream::~BufferedUC16CharacterStream() { }

50 stream_ = input;	50

51 if (start_position > 0) {	51 void BufferedUC16CharacterStream::PushBack(uc16 character) {

52 SeekForward(start_position);	52 if (pushback_limit_ == NULL && buffer_cursor_ > buffer_) {

53 }	53 // buffer_ is writable, buffer_cursor_ is const pointer.

54 end_ = end_position != kNoEndPosition ? end_position : kMaxInt;	54 buffer_[--buffer_cursor_ - buffer_] = character;

55 }	55 pos_--;

56	56 return;

57	57 }

58 void CharacterStreamUTF16Buffer::PushBack(uc32 ch) {	58 SlowPushBack(character);

59 pushback_buffer()->Add(last_);	59 }

60 last_ = ch;	60

	61

	62 void BufferedUC16CharacterStream::SlowPushBack(uc16 character) {

	63 // In pushback mode, the end of the buffer contains pushback,

	64 // and the start of the buffer (from buffer start to pushback_limit_)

	65 // contains valid data that comes just after the pushback.

	66 // We NULL the pushback_limit_ if pushing all the way back to the

	67 // start of the buffer.

	68

	69 if (pushback_limit_ == NULL) {

	70 // Enter pushback mode.

	71 pushback_limit_ = buffer_end_;

	72 buffer_end_ = buffer_ + kBufferSize;

	73 buffer_cursor_ = buffer_end_;

	74 }

	75 ASSERT(pushback_limit_ > buffer_);

	76 ASSERT(pos_ > 0);

	77 buffer_[--buffer_cursor_ - buffer_] = character;

	78 if (buffer_cursor_ == buffer_) {

	79 pushback_limit_ = NULL;

	80 } else if (buffer_cursor_ < pushback_limit_) {

	81 pushback_limit_ = buffer_cursor_;

	82 }

61 pos_--;	83 pos_--;

62 }	84 }

63	85

64	86

65 uc32 CharacterStreamUTF16Buffer::Advance() {	87 bool BufferedUC16CharacterStream::ReadBlock() {

66 ASSERT(end_ != kNoEndPosition);	88 if (pushback_limit_ != NULL) {

67 ASSERT(end_ >= 0);	89 buffer_cursor_ = buffer_;

68 // NOTE: It is of importance to Persian / Farsi resources that we do	90 buffer_end_ = pushback_limit_;

69 // not strip format control characters in the scanner; see	91 pushback_limit_ = NULL;

70 //	92 ASSERT(buffer_cursor_ != buffer_end_);

71 // https://bugzilla.mozilla.org/show_bug.cgi?id=274152	93 return true;

72 //	94 }

73 // So, even though ECMA-262, section 7.1, page 11, dictates that we	95 unsigned length = FillBuffer(pos_, kBufferSize);

74 // must remove Unicode format-control characters, we do not. This is	96 buffer_cursor_ = buffer_;

75 // in line with how IE and SpiderMonkey handles it.	97 buffer_end_ = buffer_ + length;

76 if (!pushback_buffer()->is_empty()) {	98 return length > 0;

77 pos_++;	99 }

78 return last_ = pushback_buffer()->RemoveLast();	100

79 } else if (stream_->has_more() && pos_ < end_) {	101

80 pos_++;	102 unsigned BufferedUC16CharacterStream::SlowSeekForward(unsigned delta) {

81 uc32 next = stream_->GetNext();	103 // Leave pushback mode (i.e., ignore that there might be valid data

82 return last_ = next;	104 // in the buffer before the pushback_limit_ point).

83 } else {	105 pushback_limit_ = NULL;

84 // Note: currently the following increment is necessary to avoid a	106 return BufferSeekForward(delta);

85 // test-parser problem!	107 }

86 pos_++;	108

87 return last_ = static_cast<uc32>(-1);	109 // ----------------------------------------------------------------------------

88 }	110 // GenericStringUC16CharacterStream

89 }	111

90	112

91	113 GenericStringUC16CharacterStream::GenericStringUC16CharacterStream(

92 void CharacterStreamUTF16Buffer::SeekForward(int pos) {	114 Handle<String> data,

93 pos_ = pos;	115 unsigned start_position,

94 ASSERT(pushback_buffer()->is_empty());	116 unsigned end_position)

95 stream_->Seek(pos);	117 : string_(data),

96 }	118 length_(end_position) {

97	119 ASSERT(end_position >= start_position);

	120 buffer_cursor_ = buffer_;

	121 buffer_end_ = buffer_;

	122 pos_ = start_position;

	123 }

	124
	Erik Corry 2010/12/07 12:27:30 blanky blanky here and a few more places.
	125 GenericStringUC16CharacterStream::~GenericStringUC16CharacterStream() { }

	126

	127

	128 unsigned GenericStringUC16CharacterStream::BufferSeekForward(unsigned delta) {

	129 unsigned old_pos = pos_;

	130 pos_ = Min(pos_ + delta, length_);

	131 ReadBlock();

	132 return pos_ - old_pos;

	133 }

	134

	135

	136 unsigned GenericStringUC16CharacterStream::FillBuffer(unsigned from_pos,

	137 unsigned length) {

	138 if (from_pos >= length_) return 0;

	139 if (from_pos + length > length_) {

	140 length = length_ - from_pos;

	141 }

	142 String::WriteToFlat<uc16>(*string_, buffer_, from_pos, from_pos + length);

	143 return length;

	144 }

	145

	146 // ----------------------------------------------------------------------------

	147 // Utf8ToUC16CharacterStream

	148 Utf8ToUC16CharacterStream::Utf8ToUC16CharacterStream(const byte* data,

	149 unsigned length)

	150 : BufferedUC16CharacterStream(),

	151 raw_data_(data),

	152 raw_data_length_(length),

	153 raw_data_pos_(0),

	154 raw_character_position_(0) {

	155 ReadBlock();

	156 }

	157

	158 Utf8ToUC16CharacterStream::~Utf8ToUC16CharacterStream() { }

	159

	160

	161 unsigned Utf8ToUC16CharacterStream::BufferSeekForward(unsigned delta) {

	162 unsigned old_pos = pos_;

	163 unsigned target_pos = pos_ + delta;

	164 SetRawPosition(target_pos);

	165 pos_ = raw_character_position_;

	166 ReadBlock();

	167 return pos_ - old_pos;

	168 }

	169

	170

	171 unsigned Utf8ToUC16CharacterStream::FillBuffer(unsigned char_position,

	172 unsigned length) {

	173 static const unibrow::uchar kMaxUC16Character = 0xffff;

	174 SetRawPosition(char_position);

	175 if (raw_character_position_ != char_position) {

	176 // char_position was not a valid position in the stream (hit the end

	177 // while spooling to it).

	178 return 0u;

	179 }

	180 unsigned i = 0;

	181 while (i < length) {

	182 if (raw_data_pos_ == raw_data_length_) break;

	183 unibrow::uchar c = raw_data_[raw_data_pos_];

	184 if (c <= unibrow::Utf8::kMaxOneByteChar) {

	185 raw_data_pos_++;

	186 } else {

	187 c = unibrow::Utf8::CalculateValue(raw_data_ + raw_data_pos_,

	188 raw_data_length_ - raw_data_pos_,

	189 &raw_data_pos_);

	190 // Don't allow characters outside of the BMP.

	191 if (c > kMaxUC16Character) {

	192 c = unibrow::Utf8::kBadChar;

	193 }

	194 }

	195 buffer_[i++] = static_cast<uc16>(c);

	196 }

	197 raw_character_position_ = char_position + i;

	198 return i;

	199 }

	200

	201 // Move the cursor back to point at the preceding utf-8 character start
	Erik Corry 2010/12/07 12:27:30 utf -> UTF here and in other places. Lasse Reichstein 2010/12/07 14:05:54 Fixed. I hope.
	202 // in the buffer.

	203 static inline void Utf8CharacterBack(const byte* buffer, unsigned* cursor) {

	204 byte character = buffer[--*cursor];

	205 if ((character & 0x80u) != 0) {
	Erik Corry 2010/12/07 12:27:30 if (character > Utf8::kMaxOneByteChar) is nicer because the constant is named. Lasse Reichstein 2010/12/07 14:05:54 Done.
	206 ASSERT((character & 0xC0) == 0x80);
	Erik Corry 2010/12/07 12:27:30 (character & Utf8::kMultiByteEncodingMask) == Utf8::kMultiByteEncodingLastChar These constants don't exist yet. Lasse Reichstein 2010/12/07 14:05:54 Added functions, but not to unibrow::Utf8. I'll consider moving them later if they turn out to be generally useful.
	207 // Last byte of a multi-byte character encoding. Step backwards until

	208 // pointing to the first byte of the encoding, recognized by having the

	209 // top two bits set.

	210 while (buffer[--*cursor] < 0xC0u) { }
	Erik Corry 2010/12/07 12:27:30 kMultiByteEncodingFirstChar Lasse Reichstein 2010/12/07 14:05:54 IsUtf8MultiCharacterFollower(buffer[--cursor])
	211 }

	212 }

	213

	214

	215 // Move the cursor forward to point at the next following utf-8 character start

	216 // in the buffer.

	217 static inline void Utf8CharacterForward(const byte* buffer, unsigned* cursor) {

	218 byte character = buffer[(*cursor)++];

	219 if ((character & 0x80u) != 0) {

	220 // First character of a multi-byte character encoding.

	221 // The number of most-significant one-bits determines the length of the

	222 // encoding:

	223 // 110..... - (0xCx, 0xDx) one additional byte (minimum).

	224 // 1110.... - (0xEx) two additional bytes.

	225 // 11110... - (0xFx) three additional bytes (maximum).

	226 ASSERT((character & 0xC0) == 0xC0);
	Erik Corry 2010/12/07 12:27:30 Named constants. Lasse Reichstein 2010/12/07 14:05:54 Done.
	227 // Additional bytes is:

	228 // 1 if value in range 0xC0 .. 0xDF.

	229 // 2 if value in range 0xE0 .. 0xEF.

	230 // 3 if value in range 0xF0 .. 0xF7.

	231 // Encode that in a single value
	Erik Corry 2010/12/07 12:27:30 Missing full stop. Lasse Reichstein 2010/12/07 14:05:54 Done.
	232 unsigned additional_bytes =

	233 ((0x3211u) >> (((character - 0xC0) >> 2) & 0xC)) & 0x03;
	Erik Corry 2010/12/07 12:27:30 Some places we check whether the unicode character is out of range, but here we just assume that there are no 4 byte encodings. We should be consistent. Lasse Reichstein 2010/12/07 14:05:54 It works with four-byte encodings as well. "additional bytes" is in addition to the first byte that we already read.
	234 *cursor += additional_bytes;

	235 }

	236 }

	237

	238

	239 void Utf8ToUC16CharacterStream::SetRawPosition(unsigned target_position) {

	240 if (raw_character_position_ > target_position) {

	241 // Spool backwards in utf8 buffer.

	242 do {

	243 Utf8CharacterBack(raw_data_, &raw_data_pos_);

	244 raw_character_position_--;

	245 } while (raw_character_position_ > target_position);

	246 return;

	247 }

	248 // Spool forwards in the utf8 buffer.

	249 while (raw_character_position_ < target_position) {

	250 if (raw_data_pos_ == raw_data_length_) return;

	251 Utf8CharacterForward(raw_data_, &raw_data_pos_);

	252 raw_character_position_++;

	253 }

	254 }

	255

	256 // ----------------------------------------------------------------------------

	257 // ExternalTwoByteStringUC16CharacterStream

	258

	259 ExternalTwoByteStringUC16CharacterStream::

	260 ~ExternalTwoByteStringUC16CharacterStream() { }

	261

	262 ExternalTwoByteStringUC16CharacterStream

	263 ::ExternalTwoByteStringUC16CharacterStream(

	264 Handle<ExternalTwoByteString> data,

	265 int start_position,

	266 int end_position)

	267 : UC16CharacterStream(),

	268 source_(data),

	269 raw_data_(data->GetTwoByteData(start_position)) {

	270 buffer_cursor_ = raw_data_,

	271 buffer_end_ = raw_data_ + (end_position - start_position);

	272 pos_ = start_position;

	273 }

98	274

99 // ----------------------------------------------------------------------------	275 // ----------------------------------------------------------------------------

100 // Scanner::LiteralScope	276 // Scanner::LiteralScope

101	277

102 Scanner::LiteralScope::LiteralScope(Scanner* self)	278 Scanner::LiteralScope::LiteralScope(Scanner* self)

103 : scanner_(self), complete_(false) {	279 : scanner_(self), complete_(false) {

104 self->StartLiteral();	280 self->StartLiteral();

105 }	281 }

106	282

107	283

108 Scanner::LiteralScope::~LiteralScope() {	284 Scanner::LiteralScope::~LiteralScope() {

109 if (!complete_) scanner_->DropLiteral();	285 if (!complete_) scanner_->DropLiteral();

110 }	286 }

111	287

112	288

113 void Scanner::LiteralScope::Complete() {	289 void Scanner::LiteralScope::Complete() {

114 scanner_->TerminateLiteral();	290 scanner_->TerminateLiteral();

115 complete_ = true;	291 complete_ = true;

116 }	292 }

117	293

118 // ----------------------------------------------------------------------------	294 // ----------------------------------------------------------------------------

119 // V8JavaScriptScanner	295 // V8JavaScriptScanner

120	296

121 void V8JavaScriptScanner::Initialize(Handle<String> source,	297 V8JavaScriptScanner::V8JavaScriptScanner() : JavaScriptScanner() { }

	298

	299

	300 void V8JavaScriptScanner::Initialize(UC16CharacterStream* source,

122 int literal_flags) {	301 int literal_flags) {

123 source_ = stream_initializer_.Init(source, NULL, 0, source->length());	302 source_ = source;

	303 literal_flags_ = literal_flags \| kLiteralIdentifier;

124 // Need to capture identifiers in order to recognize "get" and "set"	304 // Need to capture identifiers in order to recognize "get" and "set"

125 // in object literals.	305 // in object literals.

126 literal_flags_ = literal_flags \| kLiteralIdentifier;

127 Init();	306 Init();

128 // Skip initial whitespace allowing HTML comment ends just like	307 // Skip initial whitespace allowing HTML comment ends just like

129 // after a newline and scan first token.	308 // after a newline and scan first token.

130 has_line_terminator_before_next_ = true;

131 SkipWhiteSpace();

132 Scan();

133 }

134

135

136 void V8JavaScriptScanner::Initialize(Handle<String> source,

137 unibrow::CharacterStream* stream,

138 int literal_flags) {

139 source_ = stream_initializer_.Init(source, stream,

140 0, UTF16Buffer::kNoEndPosition);

141 literal_flags_ = literal_flags \| kLiteralIdentifier;

142 Init();

143 // Skip initial whitespace allowing HTML comment ends just like

144 // after a newline and scan first token.

145 has_line_terminator_before_next_ = true;

146 SkipWhiteSpace();

147 Scan();

148 }

149

150

151 void V8JavaScriptScanner::Initialize(Handle<String> source,

152 int start_position,

153 int end_position,

154 int literal_flags) {

155 source_ = stream_initializer_.Init(source, NULL,

156 start_position, end_position);

157 literal_flags_ = literal_flags \| kLiteralIdentifier;

158 Init();

159 // Skip initial whitespace allowing HTML comment ends just like

160 // after a newline and scan first token.

161 has_line_terminator_before_next_ = true;	309 has_line_terminator_before_next_ = true;

162 SkipWhiteSpace();	310 SkipWhiteSpace();

163 Scan();	311 Scan();

164 }	312 }

165	313

166	314

167 UTF16Buffer* StreamInitializer::Init(Handle<String> source,

168 unibrow::CharacterStream* stream,

169 int start_position,

170 int end_position) {

171 // Either initialize the scanner from a character stream or from a

172 // string.

173 ASSERT(source.is_null() \|\| stream == NULL);

174

175 // Initialize the source buffer.

176 if (!source.is_null() && StringShape(*source).IsExternalTwoByte()) {

177 two_byte_string_buffer_.Initialize(

178 Handle<ExternalTwoByteString>::cast(source),

179 start_position,

180 end_position);

181 return &two_byte_string_buffer_;

182 } else if (!source.is_null() && StringShape(*source).IsExternalAscii()) {

183 ascii_string_buffer_.Initialize(

184 Handle<ExternalAsciiString>::cast(source),

185 start_position,

186 end_position);

187 return &ascii_string_buffer_;

188 } else {

189 if (!source.is_null()) {

190 safe_string_input_buffer_.Reset(source.location());

191 stream = &safe_string_input_buffer_;

192 }

193 char_stream_buffer_.Initialize(source,

194 stream,

195 start_position,

196 end_position);

197 return &char_stream_buffer_;

198 }

199 }

200

201 // ----------------------------------------------------------------------------	315 // ----------------------------------------------------------------------------

202 // JsonScanner	316 // JsonScanner

203	317

204 JsonScanner::JsonScanner() {}	318 JsonScanner::JsonScanner() : Scanner() { }

205	319

206	320

207 void JsonScanner::Initialize(Handle<String> source) {	321 void JsonScanner::Initialize(UC16CharacterStream* source) {

208 source_ = stream_initializer_.Init(source, NULL, 0, source->length());	322 source_ = source;

209 Init();	323 Init();

210 // Skip initial whitespace.	324 // Skip initial whitespace.

211 SkipJsonWhiteSpace();	325 SkipJsonWhiteSpace();

212 // Preload first token as look-ahead.	326 // Preload first token as look-ahead.

213 ScanJson();	327 ScanJson();

214 }	328 }

215	329

216	330

217 Token::Value JsonScanner::Next() {	331 Token::Value JsonScanner::Next() {

218 // BUG 1215673: Find a thread safe way to set a stack limit in	332 // BUG 1215673: Find a thread safe way to set a stack limit in

(...skipping 198 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
417 text++;	531 text++;

418 }	532 }

419 if (ScannerConstants::kIsIdentifierPart.get(c0_)) return Token::ILLEGAL;	533 if (ScannerConstants::kIsIdentifierPart.get(c0_)) return Token::ILLEGAL;

420 literal.Complete();	534 literal.Complete();

421 return token;	535 return token;

422 }	536 }

423	537

424	538

425	539

426 } } // namespace v8::internal	540 } } // namespace v8::internal

OLD	NEW

« src/scanner.h ('K') | « src/scanner.h ('k') | src/scanner-base.h » ('j') | src/scanner-base.h » ('J')