src/scanner.cc - Issue 5545006: Optimized scanner to avoid virtual calls for every character read.

Side by Side Diff: src/scanner.cc

Issue 5545006: Optimized scanner to avoid virtual calls for every character read. (Closed)

Patch Set: Addressed review comments. Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2010 the V8 project authors. All rights reserved.	1 // Copyright 2010 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without	2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are	3 // modification, are permitted provided that the following conditions are

4 // met:	4 // met:

5 //	5 //

6 // * Redistributions of source code must retain the above copyright	6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.	7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above	8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following	9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided	10 // disclaimer in the documentation and/or other materials provided

(...skipping 18 matching lines...) Expand all Loading...
29	29

30 #include "ast.h"	30 #include "ast.h"

31 #include "handles.h"	31 #include "handles.h"

32 #include "scanner.h"	32 #include "scanner.h"

33 #include "unicode-inl.h"	33 #include "unicode-inl.h"

34	34

35 namespace v8 {	35 namespace v8 {

36 namespace internal {	36 namespace internal {

37	37

38 // ----------------------------------------------------------------------------	38 // ----------------------------------------------------------------------------

39 // UTF16Buffer	39 // BufferedUC16CharacterStreams

40	40

41 // CharacterStreamUTF16Buffer	41 BufferedUC16CharacterStream::BufferedUC16CharacterStream()

42 CharacterStreamUTF16Buffer::CharacterStreamUTF16Buffer()	42 : UC16CharacterStream(),

43 : pushback_buffer_(0), last_(0), stream_(NULL) { }	43 pushback_limit_(NULL) {

44	44 // Initialize buffer as being empty. First read will fill the buffer.

45	45 buffer_cursor_ = buffer_;

46 void CharacterStreamUTF16Buffer::Initialize(Handle<String> data,	46 buffer_end_ = buffer_;

47 unibrow::CharacterStream* input,	47 }

48 int start_position,	48

49 int end_position) {	49 BufferedUC16CharacterStream::~BufferedUC16CharacterStream() { }

50 stream_ = input;	50

51 if (start_position > 0) {	51 void BufferedUC16CharacterStream::PushBack(uc16 character) {

52 SeekForward(start_position);	52 if (pushback_limit_ == NULL && buffer_cursor_ > buffer_) {

53 }	53 // buffer_ is writable, buffer_cursor_ is const pointer.

54 end_ = end_position != kNoEndPosition ? end_position : kMaxInt;	54 buffer_[--buffer_cursor_ - buffer_] = character;

55 }	55 pos_--;

56	56 return;

57	57 }

58 void CharacterStreamUTF16Buffer::PushBack(uc32 ch) {	58 SlowPushBack(character);

59 pushback_buffer()->Add(last_);	59 }

60 last_ = ch;	60

	61

	62 void BufferedUC16CharacterStream::SlowPushBack(uc16 character) {

	63 // In pushback mode, the end of the buffer contains pushback,

	64 // and the start of the buffer (from buffer start to pushback_limit_)

	65 // contains valid data that comes just after the pushback.

	66 // We NULL the pushback_limit_ if pushing all the way back to the

	67 // start of the buffer.

	68

	69 if (pushback_limit_ == NULL) {

	70 // Enter pushback mode.

	71 pushback_limit_ = buffer_end_;

	72 buffer_end_ = buffer_ + kBufferSize;

	73 buffer_cursor_ = buffer_end_;

	74 }

	75 ASSERT(pushback_limit_ > buffer_);

	76 ASSERT(pos_ > 0);

	77 buffer_[--buffer_cursor_ - buffer_] = character;

	78 if (buffer_cursor_ == buffer_) {

	79 pushback_limit_ = NULL;

	80 } else if (buffer_cursor_ < pushback_limit_) {

	81 pushback_limit_ = buffer_cursor_;

	82 }

61 pos_--;	83 pos_--;

62 }	84 }

63	85

64	86

65 uc32 CharacterStreamUTF16Buffer::Advance() {	87 bool BufferedUC16CharacterStream::ReadBlock() {

66 ASSERT(end_ != kNoEndPosition);	88 if (pushback_limit_ != NULL) {

67 ASSERT(end_ >= 0);	89 buffer_cursor_ = buffer_;

68 // NOTE: It is of importance to Persian / Farsi resources that we do	90 buffer_end_ = pushback_limit_;

69 // not strip format control characters in the scanner; see	91 pushback_limit_ = NULL;

70 //	92 ASSERT(buffer_cursor_ != buffer_end_);

71 // https://bugzilla.mozilla.org/show_bug.cgi?id=274152	93 return true;

72 //	94 }

73 // So, even though ECMA-262, section 7.1, page 11, dictates that we	95 unsigned length = FillBuffer(pos_, kBufferSize);

74 // must remove Unicode format-control characters, we do not. This is	96 buffer_cursor_ = buffer_;

75 // in line with how IE and SpiderMonkey handles it.	97 buffer_end_ = buffer_ + length;

76 if (!pushback_buffer()->is_empty()) {	98 return length > 0;

77 pos_++;	99 }

78 return last_ = pushback_buffer()->RemoveLast();	100

79 } else if (stream_->has_more() && pos_ < end_) {	101

80 pos_++;	102 unsigned BufferedUC16CharacterStream::SlowSeekForward(unsigned delta) {

81 uc32 next = stream_->GetNext();	103 // Leave pushback mode (i.e., ignore that there might be valid data

82 return last_ = next;	104 // in the buffer before the pushback_limit_ point).

83 } else {	105 pushback_limit_ = NULL;

84 // Note: currently the following increment is necessary to avoid a	106 return BufferSeekForward(delta);

85 // test-parser problem!	107 }

86 pos_++;	108

87 return last_ = static_cast<uc32>(-1);	109 // ----------------------------------------------------------------------------

88 }	110 // GenericStringUC16CharacterStream

89 }	111

90	112

91	113 GenericStringUC16CharacterStream::GenericStringUC16CharacterStream(

92 void CharacterStreamUTF16Buffer::SeekForward(int pos) {	114 Handle<String> data,

93 pos_ = pos;	115 unsigned start_position,

94 ASSERT(pushback_buffer()->is_empty());	116 unsigned end_position)

95 stream_->Seek(pos);	117 : string_(data),

96 }	118 length_(end_position) {

97	119 ASSERT(end_position >= start_position);

98	120 buffer_cursor_ = buffer_;

	121 buffer_end_ = buffer_;

	122 pos_ = start_position;

	123 }

	124

	125

	126 GenericStringUC16CharacterStream::~GenericStringUC16CharacterStream() { }

	127

	128

	129 unsigned GenericStringUC16CharacterStream::BufferSeekForward(unsigned delta) {

	130 unsigned old_pos = pos_;

	131 pos_ = Min(pos_ + delta, length_);

	132 ReadBlock();

	133 return pos_ - old_pos;

	134 }

	135

	136

	137 unsigned GenericStringUC16CharacterStream::FillBuffer(unsigned from_pos,

	138 unsigned length) {

	139 if (from_pos >= length_) return 0;

	140 if (from_pos + length > length_) {

	141 length = length_ - from_pos;

	142 }

	143 String::WriteToFlat<uc16>(*string_, buffer_, from_pos, from_pos + length);

	144 return length;

	145 }

	146

	147

	148 // ----------------------------------------------------------------------------

	149 // Utf8ToUC16CharacterStream

	150 Utf8ToUC16CharacterStream::Utf8ToUC16CharacterStream(const byte* data,

	151 unsigned length)

	152 : BufferedUC16CharacterStream(),

	153 raw_data_(data),

	154 raw_data_length_(length),

	155 raw_data_pos_(0),

	156 raw_character_position_(0) {

	157 ReadBlock();

	158 }

	159

	160

	161 Utf8ToUC16CharacterStream::~Utf8ToUC16CharacterStream() { }

	162

	163

	164 unsigned Utf8ToUC16CharacterStream::BufferSeekForward(unsigned delta) {

	165 unsigned old_pos = pos_;

	166 unsigned target_pos = pos_ + delta;

	167 SetRawPosition(target_pos);

	168 pos_ = raw_character_position_;

	169 ReadBlock();

	170 return pos_ - old_pos;

	171 }

	172

	173

	174 unsigned Utf8ToUC16CharacterStream::FillBuffer(unsigned char_position,

	175 unsigned length) {

	176 static const unibrow::uchar kMaxUC16Character = 0xffff;

	177 SetRawPosition(char_position);

	178 if (raw_character_position_ != char_position) {

	179 // char_position was not a valid position in the stream (hit the end

	180 // while spooling to it).

	181 return 0u;

	182 }

	183 unsigned i = 0;

	184 while (i < length) {

	185 if (raw_data_pos_ == raw_data_length_) break;

	186 unibrow::uchar c = raw_data_[raw_data_pos_];

	187 if (c <= unibrow::Utf8::kMaxOneByteChar) {

	188 raw_data_pos_++;

	189 } else {

	190 c = unibrow::Utf8::CalculateValue(raw_data_ + raw_data_pos_,

	191 raw_data_length_ - raw_data_pos_,

	192 &raw_data_pos_);

	193 // Don't allow characters outside of the BMP.

	194 if (c > kMaxUC16Character) {

	195 c = unibrow::Utf8::kBadChar;

	196 }

	197 }

	198 buffer_[i++] = static_cast<uc16>(c);

	199 }

	200 raw_character_position_ = char_position + i;

	201 return i;

	202 }

	203

	204

	205 static const byte kUtf8MultiByteMask = 0xC0;

	206 static const byte kUtf8MultiByteCharStart = 0xC0;

	207 static const byte kUtf8MultiByteCharFollower = 0x80;

	208

	209

	210 #ifdef DEBUG

	211 static bool IsUtf8MultiCharacterStart(byte first_byte) {

	212 return (first_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharStart;

	213 }

	214 #endif

	215

	216

	217 static bool IsUtf8MultiCharacterFollower(byte later_byte) {

	218 return (later_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharFollower;

	219 }

	220

	221

	222 // Move the cursor back to point at the preceding UTF-8 character start

	223 // in the buffer.

	224 static inline void Utf8CharacterBack(const byte* buffer, unsigned* cursor) {

	225 byte character = buffer[--*cursor];

	226 if (character > unibrow::Utf8::kMaxOneByteChar) {

	227 ASSERT(IsUtf8MultiCharacterFollower(character));

	228 // Last byte of a multi-byte character encoding. Step backwards until

	229 // pointing to the first byte of the encoding, recognized by having the

	230 // top two bits set.

	231 while (IsUtf8MultiCharacterFollower(buffer[--*cursor])) { }

	232 ASSERT(IsUtf8MultiCharacterStart(buffer[*cursor]));

	233 }

	234 }

	235

	236

	237 // Move the cursor forward to point at the next following UTF-8 character start

	238 // in the buffer.

	239 static inline void Utf8CharacterForward(const byte* buffer, unsigned* cursor) {

	240 byte character = buffer[(*cursor)++];

	241 if (character > unibrow::Utf8::kMaxOneByteChar) {

	242 // First character of a multi-byte character encoding.

	243 // The number of most-significant one-bits determines the length of the

	244 // encoding:

	245 // 110..... - (0xCx, 0xDx) one additional byte (minimum).

	246 // 1110.... - (0xEx) two additional bytes.

	247 // 11110... - (0xFx) three additional bytes (maximum).

	248 ASSERT(IsUtf8MultiCharacterStart(character));

	249 // Additional bytes is:

	250 // 1 if value in range 0xC0 .. 0xDF.

	251 // 2 if value in range 0xE0 .. 0xEF.

	252 // 3 if value in range 0xF0 .. 0xF7.

	253 // Encode that in a single value.

	254 unsigned additional_bytes =

	255 ((0x3211u) >> (((character - 0xC0) >> 2) & 0xC)) & 0x03;

	256 *cursor += additional_bytes;

	257 ASSERT(!IsUtf8MultiCharacterFollower(buffer[1 + additional_bytes]));

	258 }

	259 }

	260

	261

	262 void Utf8ToUC16CharacterStream::SetRawPosition(unsigned target_position) {

	263 if (raw_character_position_ > target_position) {

	264 // Spool backwards in utf8 buffer.

	265 do {

	266 Utf8CharacterBack(raw_data_, &raw_data_pos_);

	267 raw_character_position_--;

	268 } while (raw_character_position_ > target_position);

	269 return;

	270 }

	271 // Spool forwards in the utf8 buffer.

	272 while (raw_character_position_ < target_position) {

	273 if (raw_data_pos_ == raw_data_length_) return;

	274 Utf8CharacterForward(raw_data_, &raw_data_pos_);

	275 raw_character_position_++;

	276 }

	277 }

	278

	279

	280 // ----------------------------------------------------------------------------

	281 // ExternalTwoByteStringUC16CharacterStream

	282

	283 ExternalTwoByteStringUC16CharacterStream::

	284 ~ExternalTwoByteStringUC16CharacterStream() { }

	285

	286

	287 ExternalTwoByteStringUC16CharacterStream

	288 ::ExternalTwoByteStringUC16CharacterStream(

	289 Handle<ExternalTwoByteString> data,

	290 int start_position,

	291 int end_position)

	292 : UC16CharacterStream(),

	293 source_(data),

	294 raw_data_(data->GetTwoByteData(start_position)) {

	295 buffer_cursor_ = raw_data_,

	296 buffer_end_ = raw_data_ + (end_position - start_position);

	297 pos_ = start_position;

	298 }

	299

	300

99 // ----------------------------------------------------------------------------	301 // ----------------------------------------------------------------------------

100 // Scanner::LiteralScope	302 // Scanner::LiteralScope

101	303

102 Scanner::LiteralScope::LiteralScope(Scanner* self)	304 Scanner::LiteralScope::LiteralScope(Scanner* self)

103 : scanner_(self), complete_(false) {	305 : scanner_(self), complete_(false) {

104 self->StartLiteral();	306 self->StartLiteral();

105 }	307 }

106	308

107	309

108 Scanner::LiteralScope::~LiteralScope() {	310 Scanner::LiteralScope::~LiteralScope() {

109 if (!complete_) scanner_->DropLiteral();	311 if (!complete_) scanner_->DropLiteral();

110 }	312 }

111	313

112	314

113 void Scanner::LiteralScope::Complete() {	315 void Scanner::LiteralScope::Complete() {

114 scanner_->TerminateLiteral();	316 scanner_->TerminateLiteral();

115 complete_ = true;	317 complete_ = true;

116 }	318 }

117	319

	320

118 // ----------------------------------------------------------------------------	321 // ----------------------------------------------------------------------------

119 // V8JavaScriptScanner	322 // V8JavaScriptScanner

120	323

121 void V8JavaScriptScanner::Initialize(Handle<String> source,	324 V8JavaScriptScanner::V8JavaScriptScanner() : JavaScriptScanner() { }

	325

	326

	327 void V8JavaScriptScanner::Initialize(UC16CharacterStream* source,

122 int literal_flags) {	328 int literal_flags) {

123 source_ = stream_initializer_.Init(source, NULL, 0, source->length());	329 source_ = source;

	330 literal_flags_ = literal_flags | kLiteralIdentifier;

124 // Need to capture identifiers in order to recognize "get" and "set"	331 // Need to capture identifiers in order to recognize "get" and "set"

125 // in object literals.	332 // in object literals.

126 literal_flags_ = literal_flags | kLiteralIdentifier;

127 Init();	333 Init();

128 // Skip initial whitespace allowing HTML comment ends just like	334 // Skip initial whitespace allowing HTML comment ends just like

129 // after a newline and scan first token.	335 // after a newline and scan first token.

130 has_line_terminator_before_next_ = true;

131 SkipWhiteSpace();

132 Scan();

133 }

134

135

136 void V8JavaScriptScanner::Initialize(Handle<String> source,

137 unibrow::CharacterStream* stream,

138 int literal_flags) {

139 source_ = stream_initializer_.Init(source, stream,

140 0, UTF16Buffer::kNoEndPosition);

141 literal_flags_ = literal_flags | kLiteralIdentifier;

142 Init();

143 // Skip initial whitespace allowing HTML comment ends just like

144 // after a newline and scan first token.

145 has_line_terminator_before_next_ = true;

146 SkipWhiteSpace();

147 Scan();

148 }

149

150

151 void V8JavaScriptScanner::Initialize(Handle<String> source,

152 int start_position,

153 int end_position,

154 int literal_flags) {

155 source_ = stream_initializer_.Init(source, NULL,

156 start_position, end_position);

157 literal_flags_ = literal_flags | kLiteralIdentifier;

158 Init();

159 // Skip initial whitespace allowing HTML comment ends just like

160 // after a newline and scan first token.

161 has_line_terminator_before_next_ = true;	336 has_line_terminator_before_next_ = true;

162 SkipWhiteSpace();	337 SkipWhiteSpace();

163 Scan();	338 Scan();

164 }	339 }

165	340

166	341

167 UTF16Buffer* StreamInitializer::Init(Handle<String> source,

168 unibrow::CharacterStream* stream,

169 int start_position,

170 int end_position) {

171 // Either initialize the scanner from a character stream or from a

172 // string.

173 ASSERT(source.is_null() || stream == NULL);

174

175 // Initialize the source buffer.

176 if (!source.is_null() && StringShape(*source).IsExternalTwoByte()) {

177 two_byte_string_buffer_.Initialize(

178 Handle<ExternalTwoByteString>::cast(source),

179 start_position,

180 end_position);

181 return &two_byte_string_buffer_;

182 } else if (!source.is_null() && StringShape(*source).IsExternalAscii()) {

183 ascii_string_buffer_.Initialize(

184 Handle<ExternalAsciiString>::cast(source),

185 start_position,

186 end_position);

187 return &ascii_string_buffer_;

188 } else {

189 if (!source.is_null()) {

190 safe_string_input_buffer_.Reset(source.location());

191 stream = &safe_string_input_buffer_;

192 }

193 char_stream_buffer_.Initialize(source,

194 stream,

195 start_position,

196 end_position);

197 return &char_stream_buffer_;

198 }

199 }

200

201 // ----------------------------------------------------------------------------	342 // ----------------------------------------------------------------------------

202 // JsonScanner	343 // JsonScanner

203	344

204 JsonScanner::JsonScanner() {}	345 JsonScanner::JsonScanner() : Scanner() { }

205	346

206	347

207 void JsonScanner::Initialize(Handle<String> source) {	348 void JsonScanner::Initialize(UC16CharacterStream* source) {

208 source_ = stream_initializer_.Init(source, NULL, 0, source->length());	349 source_ = source;

209 Init();	350 Init();

210 // Skip initial whitespace.	351 // Skip initial whitespace.

211 SkipJsonWhiteSpace();	352 SkipJsonWhiteSpace();

212 // Preload first token as look-ahead.	353 // Preload first token as look-ahead.

213 ScanJson();	354 ScanJson();

214 }	355 }

215	356

216	357

217 Token::Value JsonScanner::Next() {	358 Token::Value JsonScanner::Next() {

218 // BUG 1215673: Find a thread safe way to set a stack limit in	359 // BUG 1215673: Find a thread safe way to set a stack limit in

(...skipping 198 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
417 text++;	558 text++;

418 }	559 }

419 if (ScannerConstants::kIsIdentifierPart.get(c0_)) return Token::ILLEGAL;	560 if (ScannerConstants::kIsIdentifierPart.get(c0_)) return Token::ILLEGAL;

420 literal.Complete();	561 literal.Complete();

421 return token;	562 return token;

422 }	563 }

423	564

424	565

425	566

426 } } // namespace v8::internal	567 } } // namespace v8::internal

OLD	NEW

« no previous file with comments | « src/scanner.h ('k') | src/scanner-base.h » ('j') | no next file with comments »