src/jsregexp.cc - Issue 12427: Merge regexp2000 back into bleeding_edge

Side by Side Diff: src/jsregexp.cc

Issue 12427: Merge regexp2000 back into bleeding_edge (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/

Patch Set: Created 12 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright 2006-2008 the V8 project authors. All rights reserved.	1 // Copyright 2006-2008 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without	2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are	3 // modification, are permitted provided that the following conditions are

4 // met:	4 // met:

5 //	5 //

6 // * Redistributions of source code must retain the above copyright	6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.	7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above	8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following	9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided	10 // disclaimer in the documentation and/or other materials provided

11 // with the distribution.	11 // with the distribution.

12 // * Neither the name of Google Inc. nor the names of its	12 // * Neither the name of Google Inc. nor the names of its

13 // contributors may be used to endorse or promote products derived	13 // contributors may be used to endorse or promote products derived

14 // from this software without specific prior written permission.	14 // from this software without specific prior written permission.

15 //	15 //

16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS	16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT	17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR	18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT	19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT

20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,	20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT	21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT

22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,	22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,

23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY	23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY

24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT	24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE	25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.	26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

27	27

	28 #define _HAS_EXCEPTIONS 0
	Erik Corry 2008/11/25 11:03:52 ?? ??
	29 #include <set>
	Erik Corry 2008/11/25 11:03:52 Where is this coming from? Where is this coming from? Mads Ager (chromium) 2008/11/25 21:09:41 These are still there. Please remove! Show quoted text On 2008/11/25 11:03:52, Erik Corry wrote: > Where is this coming from? These are still there. Please remove! Christian Plesner Hansen 2008/11/26 06:49:56 It's in use. It's in use.
	30

28 #include "v8.h"	31 #include "v8.h"

29	32

	33 #include "ast.h"

30 #include "execution.h"	34 #include "execution.h"

31 #include "factory.h"	35 #include "factory.h"

32 #include "jsregexp.h"	36 #include "jsregexp-inl.h"

33 #include "platform.h"	37 #include "platform.h"

34 #include "runtime.h"	38 #include "runtime.h"

35 #include "top.h"	39 #include "top.h"

36 #include "compilation-cache.h"	40 #include "compilation-cache.h"

	41 #include "string-stream.h"

	42 #include "parser.h"

	43 #include "assembler-irregexp.h"

	44 #include "regexp-macro-assembler.h"

	45 #include "regexp-macro-assembler-irregexp.h"

	46 #if defined __arm__ \|\| defined __thumb__ \|\| defined ARM

	47 // include regexp-macro-assembler-arm.h when created.

	48 #else // ia32

	49 #include "regexp-macro-assembler-ia32.h"

	50 #endif

	51 #include "interpreter-irregexp.h"

37	52

38 // Including pcre.h undefines DEBUG to avoid getting debug output from	53 // Including pcre.h undefines DEBUG to avoid getting debug output from

39 // the JSCRE implementation. Make sure to redefine it in debug mode	54 // the JSCRE implementation. Make sure to redefine it in debug mode

40 // after having included the header file.	55 // after having included the header file.

41 #ifdef DEBUG	56 #ifdef DEBUG

42 #include "third_party/jscre/pcre.h"	57 #include "third_party/jscre/pcre.h"

43 #define DEBUG	58 #define DEBUG

44 #else	59 #else

45 #include "third_party/jscre/pcre.h"	60 #include "third_party/jscre/pcre.h"

46 #endif	61 #endif

47	62

	63

48 namespace v8 { namespace internal {	64 namespace v8 { namespace internal {

49	65

50	66

51 #define CAPTURE_INDEX 0

52 #define INTERNAL_INDEX 1

53

54 static Failure* malloc_failure;	67 static Failure* malloc_failure;

55	68

56 static void* JSREMalloc(size_t size) {	69 static void* JSREMalloc(size_t size) {

57 Object* obj = Heap::AllocateByteArray(size);	70 Object* obj = Heap::AllocateByteArray(size);

58	71

59 // If allocation failed, return a NULL pointer to JSRE, and jsRegExpCompile	72 // If allocation failed, return a NULL pointer to JSRE, and jsRegExpCompile

60 // will return NULL to the caller, performs GC there.	73 // will return NULL to the caller, performs GC there.

61 // Also pass failure information to the caller.	74 // Also pass failure information to the caller.

62 if (obj->IsFailure()) {	75 if (obj->IsFailure()) {

63 malloc_failure = Failure::cast(obj);	76 malloc_failure = Failure::cast(obj);

(...skipping 105 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
169 break;	182 break;

170 case 'm':	183 case 'm':

171 flags \|= JSRegExp::MULTILINE;	184 flags \|= JSRegExp::MULTILINE;

172 break;	185 break;

173 }	186 }

174 }	187 }

175 return JSRegExp::Flags(flags);	188 return JSRegExp::Flags(flags);

176 }	189 }

177	190

178	191

179 unibrow::Predicate<unibrow::RegExpSpecialChar, 128> is_reg_exp_special_char;	192 static inline void ThrowRegExpException(Handle<JSRegExp> re,

	193 Handle<String> pattern,

	194 Handle<String> error_text,

	195 const char* message) {

	196 Handle<JSArray> array = Factory::NewJSArray(2);

	197 SetElement(array, 0, pattern);

	198 SetElement(array, 1, error_text);

	199 Handle<Object> regexp_err = Factory::NewSyntaxError(message, array);

	200 Top::Throw(*regexp_err);

	201 }

180	202

181	203

182 Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,	204 Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,

183 Handle<String> pattern,	205 Handle<String> pattern,

184 Handle<String> flag_str) {	206 Handle<String> flag_str) {

185 JSRegExp::Flags flags = RegExpFlagsFromString(flag_str);	207 JSRegExp::Flags flags = RegExpFlagsFromString(flag_str);

186 Handle<FixedArray> cached = CompilationCache::LookupRegExp(pattern, flags);	208 Handle<FixedArray> cached = CompilationCache::LookupRegExp(pattern, flags);

187 bool in_cache = !cached.is_null();	209 bool in_cache = !cached.is_null();

188 Handle<Object> result;	210 Handle<Object> result;

189 StringShape shape(*pattern);

190 if (in_cache) {	211 if (in_cache) {

191 re->set_data(*cached);	212 re->set_data(*cached);

192 result = re;	213 result = re;

193 } else {	214 } else {

194 bool is_atom = !flags.is_ignore_case();	215 FlattenString(pattern);

195 for (int i = 0; is_atom && i < pattern->length(shape); i++) {	216 RegExpParseResult parse_result;

196 if (is_reg_exp_special_char.get(pattern->Get(shape, i)))	217 FlatStringReader reader(pattern);

197 is_atom = false;	218 if (!ParseRegExp(&reader, &parse_result)) {

	219 // Throw an exception if we fail to parse the pattern.

	220 ThrowRegExpException(re,

	221 pattern,

	222 parse_result.error,

	223 "malformed_regexp");

	224 return Handle<Object>();

198 }	225 }

199 if (is_atom) {	226 RegExpAtom* atom = parse_result.tree->AsAtom();

200 result = AtomCompile(re, pattern, flags);	227 if (atom != NULL && !flags.is_ignore_case()) {

	228 if (parse_result.has_character_escapes) {

	229 Vector<const uc16> atom_pattern = atom->data();

	230 Handle<String> atom_string =

	231 Factory::NewStringFromTwoByte(atom_pattern);

	232 result = AtomCompile(re, pattern, flags, atom_string);

	233 } else {

	234 result = AtomCompile(re, pattern, flags, pattern);

	235 }

201 } else {	236 } else {

202 result = JsreCompile(re, pattern, flags);	237 RegExpNode* node = NULL;

	238 Handle<FixedArray> irregexp_data =

	239 RegExpEngine::Compile(&parse_result,

	240 &node,

	241 flags.is_ignore_case());

	242 if (irregexp_data.is_null()) {

	243 result = JscrePrepare(re, pattern, flags);

	244 } else {

	245 result = IrregexpPrepare(re, pattern, flags, irregexp_data);

	246 }

203 }	247 }

204 Object* data = re->data();	248 Object* data = re->data();

205 if (data->IsFixedArray()) {	249 if (data->IsFixedArray()) {

206 // If compilation succeeded then the data is set on the regexp	250 // If compilation succeeded then the data is set on the regexp

207 // and we can store it in the cache.	251 // and we can store it in the cache.

208 Handle<FixedArray> data(FixedArray::cast(re->data()));	252 Handle<FixedArray> data(FixedArray::cast(re->data()));

209 CompilationCache::PutRegExp(pattern, flags, data);	253 CompilationCache::PutRegExp(pattern, flags, data);

210 }	254 }

211 }	255 }

212	256

213 LOG(RegExpCompileEvent(re, in_cache));	257 LOG(RegExpCompileEvent(re, in_cache));

214 return result;	258 return result;

215 }	259 }

216	260

217	261

218 Handle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,	262 Handle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,

219 Handle<String> subject,	263 Handle<String> subject,

220 Handle<Object> index) {	264 Handle<Object> index) {

221 switch (regexp->TypeTag()) {	265 switch (regexp->TypeTag()) {

222 case JSRegExp::JSCRE:	266 case JSRegExp::JSCRE:

223 return JsreExec(regexp, subject, index);	267 return JscreExec(regexp, subject, index);

224 case JSRegExp::ATOM:	268 case JSRegExp::ATOM:

225 return AtomExec(regexp, subject, index);	269 return AtomExec(regexp, subject, index);

	270 case JSRegExp::IRREGEXP:

	271 return IrregexpExec(regexp, subject, index);

226 default:	272 default:

227 UNREACHABLE();	273 UNREACHABLE();

228 return Handle<Object>();	274 return Handle<Object>();

229 }	275 }

230 }	276 }

231	277

232	278

233 Handle<Object> RegExpImpl::ExecGlobal(Handle<JSRegExp> regexp,	279 Handle<Object> RegExpImpl::ExecGlobal(Handle<JSRegExp> regexp,

234 Handle<String> subject) {	280 Handle<String> subject) {

235 switch (regexp->TypeTag()) {	281 switch (regexp->TypeTag()) {

236 case JSRegExp::JSCRE:	282 case JSRegExp::JSCRE:

237 return JsreExecGlobal(regexp, subject);	283 return JscreExecGlobal(regexp, subject);

238 case JSRegExp::ATOM:	284 case JSRegExp::ATOM:

239 return AtomExecGlobal(regexp, subject);	285 return AtomExecGlobal(regexp, subject);

	286 case JSRegExp::IRREGEXP:

	287 return IrregexpExecGlobal(regexp, subject);

240 default:	288 default:

241 UNREACHABLE();	289 UNREACHABLE();

242 return Handle<Object>();	290 return Handle<Object>();

243 }	291 }

244 }	292 }

245	293

246	294

247 Handle<Object> RegExpImpl::AtomCompile(Handle<JSRegExp> re,	295 Handle<Object> RegExpImpl::AtomCompile(Handle<JSRegExp> re,

248 Handle<String> pattern,	296 Handle<String> pattern,

249 JSRegExp::Flags flags) {	297 JSRegExp::Flags flags,

250 Factory::SetRegExpData(re, JSRegExp::ATOM, pattern, flags, pattern);	298 Handle<String> match_pattern) {

	299 Factory::SetRegExpData(re, JSRegExp::ATOM, pattern, flags, match_pattern);

251 return re;	300 return re;

252 }	301 }

253	302

254	303

255 Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,	304 Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,

256 Handle<String> subject,	305 Handle<String> subject,

257 Handle<Object> index) {	306 Handle<Object> index) {

258 Handle<String> needle(String::cast(re->DataAt(JSRegExp::kAtomPatternIndex)));	307 Handle<String> needle(String::cast(re->DataAt(JSRegExp::kAtomPatternIndex)));

259	308

260 uint32_t start_index;	309 uint32_t start_index;

261 if (!Array::IndexFromObject(*index, &start_index)) {	310 if (!Array::IndexFromObject(*index, &start_index)) {

262 return Handle<Smi>(Smi::FromInt(-1));	311 return Handle<Smi>(Smi::FromInt(-1));

263 }	312 }

264	313

265 LOG(RegExpExecEvent(re, start_index, subject));	314 LOG(RegExpExecEvent(re, start_index, subject));

266 int value = Runtime::StringMatch(subject, needle, start_index);	315 int value = Runtime::StringMatch(subject, needle, start_index);

267 if (value == -1) return Factory::null_value();	316 if (value == -1) return Factory::null_value();

268	317

269 Handle<FixedArray> array = Factory::NewFixedArray(2);	318 Handle<FixedArray> array = Factory::NewFixedArray(2);

270 array->set(0,	319 array->set(0, Smi::FromInt(value));

271 Smi::FromInt(value),	320 array->set(1, Smi::FromInt(value + needle->length()));

272 SKIP_WRITE_BARRIER);

273 array->set(1,

274 Smi::FromInt(value + needle->length()),

275 SKIP_WRITE_BARRIER);

276 return Factory::NewJSArrayWithElements(array);	321 return Factory::NewJSArrayWithElements(array);

277 }	322 }

278	323

279	324

280 Handle<Object> RegExpImpl::AtomExecGlobal(Handle<JSRegExp> re,	325 Handle<Object> RegExpImpl::AtomExecGlobal(Handle<JSRegExp> re,

281 Handle<String> subject) {	326 Handle<String> subject) {

282 Handle<String> needle(String::cast(re->DataAt(JSRegExp::kAtomPatternIndex)));	327 Handle<String> needle(String::cast(re->DataAt(JSRegExp::kAtomPatternIndex)));

283 Handle<JSArray> result = Factory::NewJSArray(1);	328 Handle<JSArray> result = Factory::NewJSArray(1);

284 int index = 0;	329 int index = 0;

285 int match_count = 0;	330 int match_count = 0;

286 int subject_length = subject->length();	331 int subject_length = subject->length();

287 int needle_length = needle->length();	332 int needle_length = needle->length();

288 while (true) {	333 while (true) {

289 LOG(RegExpExecEvent(re, index, subject));	334 LOG(RegExpExecEvent(re, index, subject));

290 int value = -1;	335 int value = -1;

291 if (index + needle_length <= subject_length) {	336 if (index + needle_length <= subject_length) {

292 value = Runtime::StringMatch(subject, needle, index);	337 value = Runtime::StringMatch(subject, needle, index);

293 }	338 }

294 if (value == -1) break;	339 if (value == -1) break;

295 HandleScope scope;	340 HandleScope scope;

296 int end = value + needle_length;	341 int end = value + needle_length;

297	342

298 Handle<FixedArray> array = Factory::NewFixedArray(2);	343 Handle<FixedArray> array = Factory::NewFixedArray(2);

299 array->set(0,	344 array->set(0, Smi::FromInt(value));

300 Smi::FromInt(value),	345 array->set(1, Smi::FromInt(end));

301 SKIP_WRITE_BARRIER);

302 array->set(1,

303 Smi::FromInt(end),

304 SKIP_WRITE_BARRIER);

305 Handle<JSArray> pair = Factory::NewJSArrayWithElements(array);	346 Handle<JSArray> pair = Factory::NewJSArrayWithElements(array);

306 SetElement(result, match_count, pair);	347 SetElement(result, match_count, pair);

307 match_count++;	348 match_count++;

308 index = end;	349 index = end;

309 if (needle_length == 0) index++;	350 if (needle_length == 0) index++;

310 }	351 }

311 return result;	352 return result;

312 }	353 }

313	354

314	355

	356 Handle<Object>RegExpImpl::JscrePrepare(Handle<JSRegExp> re,

	357 Handle<String> pattern,

	358 JSRegExp::Flags flags) {

	359 Handle<Object> value(Heap::undefined_value());

	360 Factory::SetRegExpData(re, JSRegExp::JSCRE, pattern, flags, value);

	361 return re;

	362 }

	363

	364

	365 Handle<Object>RegExpImpl::IrregexpPrepare(Handle<JSRegExp> re,

	366 Handle<String> pattern,

	367 JSRegExp::Flags flags,

	368 Handle<FixedArray> irregexp_data) {

	369 Factory::SetRegExpData(re, JSRegExp::IRREGEXP, pattern, flags, irregexp_data);

	370 return re;

	371 }

	372

	373

315 static inline Object* DoCompile(String* pattern,	374 static inline Object* DoCompile(String* pattern,

316 JSRegExp::Flags flags,	375 JSRegExp::Flags flags,

317 unsigned* number_of_captures,	376 unsigned* number_of_captures,

318 const char** error_message,	377 const char** error_message,

319 JscreRegExp** code) {	378 JscreRegExp** code) {

320 JSRegExpIgnoreCaseOption case_option = flags.is_ignore_case()	379 JSRegExpIgnoreCaseOption case_option = flags.is_ignore_case()

321 ? JSRegExpIgnoreCase	380 ? JSRegExpIgnoreCase

322 : JSRegExpDoNotIgnoreCase;	381 : JSRegExpDoNotIgnoreCase;

323 JSRegExpMultilineOption multiline_option = flags.is_multiline()	382 JSRegExpMultilineOption multiline_option = flags.is_multiline()

324 ? JSRegExpMultiline	383 ? JSRegExpMultiline

(...skipping 26 matching lines...) Expand all Loading...
351 const char** error_message,	410 const char** error_message,

352 JscreRegExp** code) {	411 JscreRegExp** code) {

353 CALL_HEAP_FUNCTION_VOID(DoCompile(*pattern,	412 CALL_HEAP_FUNCTION_VOID(DoCompile(*pattern,

354 flags,	413 flags,

355 number_of_captures,	414 number_of_captures,

356 error_message,	415 error_message,

357 code));	416 code));

358 }	417 }

359	418

360	419

361 Handle<Object> RegExpImpl::JsreCompile(Handle<JSRegExp> re,	420 Handle<Object> RegExpImpl::JscreCompile(Handle<JSRegExp> re) {

362 Handle<String> pattern,	421 ASSERT_EQ(re->TypeTag(), JSRegExp::JSCRE);

363 JSRegExp::Flags flags) {	422 ASSERT(re->DataAt(JSRegExp::kJscreDataIndex)->IsUndefined());

	423

	424 Handle<String> pattern(re->Pattern());

	425 JSRegExp::Flags flags = re->GetFlags();

	426

364 Handle<String> two_byte_pattern = StringToTwoByte(pattern);	427 Handle<String> two_byte_pattern = StringToTwoByte(pattern);

365	428

366 unsigned number_of_captures;	429 unsigned number_of_captures;

367 const char* error_message = NULL;	430 const char* error_message = NULL;

368	431

369 JscreRegExp* code = NULL;	432 JscreRegExp* code = NULL;

370 FlattenString(pattern);	433 FlattenString(pattern);

371	434

372 CompileWithRetryAfterGC(two_byte_pattern,	435 CompileWithRetryAfterGC(two_byte_pattern,

373 flags,	436 flags,

(...skipping 10 matching lines...) Expand all Loading...
384 Handle<Object> regexp_err =	447 Handle<Object> regexp_err =

385 Factory::NewSyntaxError("malformed_regexp", array);	448 Factory::NewSyntaxError("malformed_regexp", array);

386 Top::Throw(*regexp_err);	449 Top::Throw(*regexp_err);

387 return Handle<Object>();	450 return Handle<Object>();

388 }	451 }

389	452

390 // Convert the return address to a ByteArray pointer.	453 // Convert the return address to a ByteArray pointer.

391 Handle<ByteArray> internal(	454 Handle<ByteArray> internal(

392 ByteArray::FromDataStartAddress(reinterpret_cast<Address>(code)));	455 ByteArray::FromDataStartAddress(reinterpret_cast<Address>(code)));

393	456

394 Handle<FixedArray> value = Factory::NewFixedArray(2);	457 Handle<FixedArray> value = Factory::NewFixedArray(kJscreDataLength);

395 value->set(CAPTURE_INDEX, Smi::FromInt(number_of_captures));	458 value->set(kJscreNumberOfCapturesIndex, Smi::FromInt(number_of_captures));

396 value->set(INTERNAL_INDEX, *internal);	459 value->set(kJscreInternalIndex, *internal);

397 Factory::SetRegExpData(re, JSRegExp::JSCRE, pattern, flags, value);	460 Factory::SetRegExpData(re, JSRegExp::JSCRE, pattern, flags, value);

398	461

399 return re;	462 return re;

400 }	463 }

401	464

402	465

403 Handle<Object> RegExpImpl::JsreExecOnce(Handle<JSRegExp> regexp,	466 Handle<Object> RegExpImpl::IrregexpExecOnce(Handle<JSRegExp> regexp,

404 int num_captures,	467 int num_captures,

405 Handle<String> subject,	468 Handle<String> two_byte_subject,

406 int previous_index,	469 int previous_index,

407 const uc16* two_byte_subject,	470 int* offsets_vector,

408 int* offsets_vector,	471 int offsets_vector_length) {

409 int offsets_vector_length) {	472 #ifdef DEBUG

	473 if (FLAG_trace_regexp_bytecodes) {

	474 String* pattern = regexp->Pattern();

	475 PrintF("\n\nRegexp match: /%s/\n\n", *(pattern->ToCString()));

	476 PrintF("\n\nSubject string: '%s'\n\n", *(two_byte_subject->ToCString()));

	477 }

	478 #endif

	479 ASSERT(StringShape(*two_byte_subject).IsTwoByteRepresentation());

	480 ASSERT(two_byte_subject->IsFlat(StringShape(*two_byte_subject)));

	481 bool rc;

	482 {
	Mads Ager (chromium) 2008/11/25 21:09:41 Why is the extra scope needed here? Why is the extra scope needed here? Erik Corry 2008/11/26 12:18:36 It isn't. Removed. Show quoted text On 2008/11/25 21:09:41, Mads Ager wrote: > Why is the extra scope needed here? It isn't. Removed.
	483 for (int i = (num_captures + 1) * 2 - 1; i >= 0; i--) {

	484 offsets_vector[i] = -1;

	485 }

	486

	487 LOG(RegExpExecEvent(regexp, previous_index, two_byte_subject));

	488

	489 FixedArray* irregexp =

	490 FixedArray::cast(regexp->DataAt(JSRegExp::kIrregexpDataIndex));

	491 int tag = Smi::cast(irregexp->get(kIrregexpImplementationIndex))->value();

	492

	493 switch (tag) {
	Mads Ager (chromium) 2008/11/25 21:09:41 The indentation of this switch is off. Indent the The indentation of this switch is off. Indent the cases please. Erik Corry 2008/11/26 12:18:36 Done. Show quoted text On 2008/11/25 21:09:41, Mads Ager wrote: > The indentation of this switch is off. Indent the cases please. Done.
	494 case RegExpMacroAssembler::kIA32Implementation: {

	495 Code* code = Code::cast(irregexp->get(kIrregexpCodeIndex));

	496 SmartPointer<int> captures(NewArray<int>((num_captures + 1) * 2));

	497 Address start_addr =

	498 Handle<SeqTwoByteString>::cast(two_byte_subject)->GetCharsAddress();

	499 int start_offset =

	500 start_addr - reinterpret_cast<Address>(*two_byte_subject);

	501 int end_offset =

	502 start_offset + (two_byte_subject->length() - previous_index) * 2;

	503 typedef bool testfunc(String*, int, int, int);

	504 testfunc* test = FUNCTION_CAST<testfunc*>(code->entry());

	505 rc = test(two_byte_subject.location(),

	506 start_offset,

	507 end_offset,

	508 *captures);

	509 if (rc) {

	510 // Capture values are relative to start_offset only.

	511 for (int i = 0; i < offsets_vector_length; i++) {

	512 if (offsets_vector[i] >= 0) {

	513 offsets_vector[i] += previous_index;

	514 }

	515 }

	516 }

	517 break;

	518 }

	519 default:
	Mads Ager (chromium) 2008/11/25 21:09:41 Why is this default case in the middle of the rest Why is this default case in the middle of the rest of the cases. Please move it to the end! Erik Corry 2008/11/26 12:18:36 Because it falls through? Lasse, please take a lo Show quoted text On 2008/11/25 21:09:41, Mads Ager wrote: > Why is this default case in the middle of the rest of the cases. Please move it > to the end! Because it falls through? Lasse, please take a look at this, perhaps it is correct.
	520 case RegExpMacroAssembler::kARMImplementation:

	521 UNREACHABLE();

	522 rc = false;

	523 break;

	524 case RegExpMacroAssembler::kBytecodeImplementation: {

	525 Handle<ByteArray> byte_codes = IrregexpCode(regexp);

	526

	527 rc = IrregexpInterpreter::Match(byte_codes,

	528 two_byte_subject,

	529 offsets_vector,

	530 previous_index);

	531 break;

	532 }

	533 }

	534 }

	535

	536 if (!rc) {

	537 return Factory::null_value();

	538 }

	539

	540 Handle<FixedArray> array = Factory::NewFixedArray(2 * (num_captures+1));

	541 // The captures come in (start, end+1) pairs.

	542 for (int i = 0; i < 2 * (num_captures+1); i += 2) {

	543 array->set(i, Smi::FromInt(offsets_vector[i]));

	544 array->set(i+1, Smi::FromInt(offsets_vector[i+1]));

	545 }

	546 return Factory::NewJSArrayWithElements(array);

	547 }

	548

	549

	550 Handle<Object> RegExpImpl::JscreExecOnce(Handle<JSRegExp> regexp,

	551 int num_captures,

	552 Handle<String> subject,

	553 int previous_index,

	554 const uc16* two_byte_subject,

	555 int* offsets_vector,

	556 int offsets_vector_length) {

410 int rc;	557 int rc;

411 {	558 {

412 AssertNoAllocation a;	559 AssertNoAllocation a;

413 ByteArray* internal = JsreInternal(regexp);	560 ByteArray* internal = JscreInternal(regexp);

414 const JscreRegExp* js_regexp =	561 const JscreRegExp* js_regexp =

415 reinterpret_cast<JscreRegExp*>(internal->GetDataStartAddress());	562 reinterpret_cast<JscreRegExp*>(internal->GetDataStartAddress());

416	563

417 LOG(RegExpExecEvent(regexp, previous_index, subject));	564 LOG(RegExpExecEvent(regexp, previous_index, subject));

418	565

419 rc = jsRegExpExecute(js_regexp,	566 rc = jsRegExpExecute(js_regexp,

420 two_byte_subject,	567 two_byte_subject,

421 subject->length(),	568 subject->length(),

422 previous_index,	569 previous_index,

423 offsets_vector,	570 offsets_vector,

(...skipping 13 matching lines...) Expand all Loading...
437 Handle<Object> code(Smi::FromInt(rc));	584 Handle<Object> code(Smi::FromInt(rc));

438 Handle<Object> args[2] = { Factory::LookupAsciiSymbol("jsre_exec"), code };	585 Handle<Object> args[2] = { Factory::LookupAsciiSymbol("jsre_exec"), code };

439 Handle<Object> regexp_err(	586 Handle<Object> regexp_err(

440 Factory::NewTypeError("jsre_error", HandleVector(args, 2)));	587 Factory::NewTypeError("jsre_error", HandleVector(args, 2)));

441 return Handle<Object>(Top::Throw(*regexp_err));	588 return Handle<Object>(Top::Throw(*regexp_err));

442 }	589 }

443	590

444 Handle<FixedArray> array = Factory::NewFixedArray(2 * (num_captures+1));	591 Handle<FixedArray> array = Factory::NewFixedArray(2 * (num_captures+1));

445 // The captures come in (start, end+1) pairs.	592 // The captures come in (start, end+1) pairs.

446 for (int i = 0; i < 2 * (num_captures+1); i += 2) {	593 for (int i = 0; i < 2 * (num_captures+1); i += 2) {

447 array->set(i,	594 array->set(i, Smi::FromInt(offsets_vector[i]));

448 Smi::FromInt(offsets_vector[i]),	595 array->set(i+1, Smi::FromInt(offsets_vector[i+1]));

449 SKIP_WRITE_BARRIER);

450 array->set(i+1,

451 Smi::FromInt(offsets_vector[i+1]),

452 SKIP_WRITE_BARRIER);

453 }	596 }

454 return Factory::NewJSArrayWithElements(array);	597 return Factory::NewJSArrayWithElements(array);

455 }	598 }

456	599

457	600

458 class OffsetsVector {	601 class OffsetsVector {

459 public:	602 public:

460 inline OffsetsVector(int num_captures) {	603 inline OffsetsVector(int num_registers) :
	Mads Ager (chromium) 2008/11/25 21:09:41 Move : to the next line and do a four space indent Move : to the next line and do a four space indent.
461 offsets_vector_length_ = (num_captures + 1) * 3;	604 offsets_vector_length_(num_registers) {

462 if (offsets_vector_length_ > kStaticOffsetsVectorSize) {	605 if (offsets_vector_length_ > kStaticOffsetsVectorSize) {

463 vector_ = NewArray<int>(offsets_vector_length_);	606 vector_ = NewArray<int>(offsets_vector_length_);

464 } else {	607 } else {

465 vector_ = static_offsets_vector_;	608 vector_ = static_offsets_vector_;

466 }	609 }

467 }	610 }

468	611

469	612

470 inline ~OffsetsVector() {	613 inline ~OffsetsVector() {

471 if (offsets_vector_length_ > kStaticOffsetsVectorSize) {	614 if (offsets_vector_length_ > kStaticOffsetsVectorSize) {

472 DeleteArray(vector_);	615 DeleteArray(vector_);

473 vector_ = NULL;	616 vector_ = NULL;

474 }	617 }

475 }	618 }

476	619

477	620

478 inline int* vector() {	621 inline int* vector() {

479 return vector_;	622 return vector_;

480 }	623 }

481	624

482	625

483 inline int length() {	626 inline int length() {

484 return offsets_vector_length_;	627 return offsets_vector_length_;

485 }	628 }

486	629

487 private:	630 private:

488 int* vector_;	631 int* vector_;

489 int offsets_vector_length_;	632 int offsets_vector_length_;

490 static const int kStaticOffsetsVectorSize = 30;	633 static const int kStaticOffsetsVectorSize = 50;

491 static int static_offsets_vector_[kStaticOffsetsVectorSize];	634 static int static_offsets_vector_[kStaticOffsetsVectorSize];

492 };	635 };

493	636

494	637

495 int OffsetsVector::static_offsets_vector_[	638 int OffsetsVector::static_offsets_vector_[

496 OffsetsVector::kStaticOffsetsVectorSize];	639 OffsetsVector::kStaticOffsetsVectorSize];

497	640

498	641

499 Handle<Object> RegExpImpl::JsreExec(Handle<JSRegExp> regexp,	642 Handle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> regexp,

500 Handle<String> subject,	643 Handle<String> subject,
	Mads Ager (chromium) 2008/11/25 21:09:41 Indentation is off. Indentation is off. Erik Corry 2008/11/26 12:18:36 Fixed. Show quoted text On 2008/11/25 21:09:41, Mads Ager wrote: > Indentation is off. Fixed.
501 Handle<Object> index) {	644 Handle<Object> index) {

	645 ASSERT_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);

	646 ASSERT(!regexp->DataAt(JSRegExp::kIrregexpDataIndex)->IsUndefined());

	647

502 // Prepare space for the return values.	648 // Prepare space for the return values.

503 int num_captures = JsreCapture(regexp);	649 int number_of_registers = IrregexpNumberOfRegisters(regexp);

	650 OffsetsVector offsets(number_of_registers);

504	651

505 OffsetsVector offsets(num_captures);	652 int num_captures = IrregexpNumberOfCaptures(regexp);

506	653

507 int previous_index = static_cast<int>(DoubleToInteger(index->Number()));	654 int previous_index = static_cast<int>(DoubleToInteger(index->Number()));

508	655

509 Handle<String> subject16 = CachedStringToTwoByte(subject);	656 Handle<String> subject16 = CachedStringToTwoByte(subject);

510	657

511 Handle<Object> result(JsreExecOnce(regexp, num_captures, subject,	658 Handle<Object> result(

512 previous_index,	659 IrregexpExecOnce(regexp,

513 subject16->GetTwoByteData(),	660 num_captures,

514 offsets.vector(), offsets.length()));	661 subject16,

	662 previous_index,

	663 offsets.vector(),

	664 offsets.length()));

	665 return result;

	666 }

	667

	668

	669 Handle<Object> RegExpImpl::JscreExec(Handle<JSRegExp> regexp,

	670 Handle<String> subject,

	671 Handle<Object> index) {

	672 ASSERT_EQ(regexp->TypeTag(), JSRegExp::JSCRE);

	673 if (regexp->DataAt(JSRegExp::kJscreDataIndex)->IsUndefined()) {

	674 Handle<Object> compile_result = JscreCompile(regexp);

	675 if (compile_result.is_null()) return compile_result;

	676 }

	677 ASSERT(regexp->DataAt(JSRegExp::kJscreDataIndex)->IsFixedArray());

	678

	679 int num_captures = JscreNumberOfCaptures(regexp);

	680

	681 OffsetsVector offsets((num_captures + 1) * 3);

	682

	683 int previous_index = static_cast<int>(DoubleToInteger(index->Number()));

	684

	685 Handle<String> subject16 = CachedStringToTwoByte(subject);

	686

	687 Handle<Object> result(JscreExecOnce(regexp,

	688 num_captures,

	689 subject,

	690 previous_index,

	691 subject16->GetTwoByteData(),

	692 offsets.vector(),

	693 offsets.length()));

515	694

516 return result;	695 return result;

517 }	696 }

518	697

519	698

520 Handle<Object> RegExpImpl::JsreExecGlobal(Handle<JSRegExp> regexp,	699 Handle<Object> RegExpImpl::IrregexpExecGlobal(Handle<JSRegExp> regexp,

521 Handle<String> subject) {	700 Handle<String> subject) {

	701 ASSERT_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);

	702 ASSERT(!regexp->DataAt(JSRegExp::kIrregexpDataIndex)->IsUndefined());

	703

522 // Prepare space for the return values.	704 // Prepare space for the return values.

523 int num_captures = JsreCapture(regexp);	705 int number_of_registers = IrregexpNumberOfRegisters(regexp);

524	706 OffsetsVector offsets(number_of_registers);

525 OffsetsVector offsets(num_captures);

526	707

527 int previous_index = 0;	708 int previous_index = 0;

528	709

529 Handle<JSArray> result = Factory::NewJSArray(0);	710 Handle<JSArray> result = Factory::NewJSArray(0);

	711 int i = 0;

	712 Handle<Object> matches;

	713

	714 Handle<String> subject16 = CachedStringToTwoByte(subject);

	715

	716 do {

	717 if (previous_index > subject->length() \|\| previous_index < 0) {

	718 // Per ECMA-262 15.10.6.2, if the previous index is greater than the

	719 // string length, there is no match.

	720 matches = Factory::null_value();

	721 } else {

	722 matches = IrregexpExecOnce(regexp,

	723 IrregexpNumberOfCaptures(regexp),

	724 subject16,

	725 previous_index,

	726 offsets.vector(),

	727 offsets.length());

	728

	729 if (matches->IsJSArray()) {

	730 SetElement(result, i, matches);

	731 i++;

	732 previous_index = offsets.vector()[1];

	733 if (offsets.vector()[0] == offsets.vector()[1]) {

	734 previous_index++;

	735 }

	736 }

	737 }

	738 } while (matches->IsJSArray());

	739

	740 // If we exited the loop with an exception, throw it.

	741 if (matches->IsNull()) { // Exited loop normally.
	Mads Ager (chromium) 2008/11/25 21:09:41 Move these comments to new lines before the return Move these comments to new lines before the returns? Erik Corry 2008/11/26 12:18:36 I'm not sure why that's any better, but fixed. Wh Show quoted text On 2008/11/25 21:09:41, Mads Ager wrote: > Move these comments to new lines before the returns? I'm not sure why that's any better, but fixed. While I was at it I fixed the same code in JSCREExecGlobal. Why should only new code get the benefit of arbitrary reformatting?
	742 return result;

	743 } else { // Exited loop with the exception in matches.

	744 return matches;

	745 }

	746 }

	747

	748

	749 Handle<Object> RegExpImpl::JscreExecGlobal(Handle<JSRegExp> regexp,

	750 Handle<String> subject) {

	751 ASSERT_EQ(regexp->TypeTag(), JSRegExp::JSCRE);

	752 if (regexp->DataAt(JSRegExp::kJscreDataIndex)->IsUndefined()) {

	753 Handle<Object> compile_result = JscreCompile(regexp);

	754 if (compile_result.is_null()) return compile_result;

	755 }

	756 ASSERT(regexp->DataAt(JSRegExp::kJscreDataIndex)->IsFixedArray());

	757

	758 // Prepare space for the return values.

	759 int num_captures = JscreNumberOfCaptures(regexp);

	760

	761 OffsetsVector offsets((num_captures + 1) * 3);

	762

	763 int previous_index = 0;

	764

	765 Handle<JSArray> result = Factory::NewJSArray(0);

530 int i = 0;	766 int i = 0;

531 Handle<Object> matches;	767 Handle<Object> matches;

532	768

533 Handle<String> subject16 = CachedStringToTwoByte(subject);	769 Handle<String> subject16 = CachedStringToTwoByte(subject);

534	770

535 do {	771 do {

536 if (previous_index > subject->length() \|\| previous_index < 0) {	772 if (previous_index > subject->length() \|\| previous_index < 0) {

537 // Per ECMA-262 15.10.6.2, if the previous index is greater than the	773 // Per ECMA-262 15.10.6.2, if the previous index is greater than the

538 // string length, there is no match.	774 // string length, there is no match.

539 matches = Factory::null_value();	775 matches = Factory::null_value();

540 } else {	776 } else {

541 matches = JsreExecOnce(regexp, num_captures, subject, previous_index,	777 matches = JscreExecOnce(regexp,

542 subject16->GetTwoByteData(),	778 num_captures,

543 offsets.vector(), offsets.length());	779 subject,

	780 previous_index,

	781 subject16->GetTwoByteData(),

	782 offsets.vector(),

	783 offsets.length());

544	784

545 if (matches->IsJSArray()) {	785 if (matches->IsJSArray()) {

546 SetElement(result, i, matches);	786 SetElement(result, i, matches);

547 i++;	787 i++;

548 previous_index = offsets.vector()[1];	788 previous_index = offsets.vector()[1];

549 if (offsets.vector()[0] == offsets.vector()[1]) {	789 if (offsets.vector()[0] == offsets.vector()[1]) {

550 previous_index++;	790 previous_index++;

551 }	791 }

552 }	792 }

553 }	793 }

554 } while (matches->IsJSArray());	794 } while (matches->IsJSArray());

555	795

556 // If we exited the loop with an exception, throw it.	796 // If we exited the loop with an exception, throw it.

557 if (matches->IsNull()) { // Exited loop normally.	797 if (matches->IsNull()) { // Exited loop normally.

558 return result;	798 return result;

559 } else { // Exited loop with the exception in matches.	799 } else { // Exited loop with the exception in matches.

560 return matches;	800 return matches;

561 }	801 }

562 }	802 }

563	803

564	804

565 int RegExpImpl::JsreCapture(Handle<JSRegExp> re) {	805 int RegExpImpl::JscreNumberOfCaptures(Handle<JSRegExp> re) {

566 FixedArray* value = FixedArray::cast(re->DataAt(JSRegExp::kJscreDataIndex));	806 FixedArray* value = FixedArray::cast(re->DataAt(JSRegExp::kJscreDataIndex));

567 return Smi::cast(value->get(CAPTURE_INDEX))->value();	807 return Smi::cast(value->get(kJscreNumberOfCapturesIndex))->

568 }	808 value();
	Mads Ager (chromium) 2008/11/25 21:09:41 This should fit on the previous line. This should fit on the previous line. Erik Corry 2008/11/26 12:18:36 Done. Show quoted text On 2008/11/25 21:09:41, Mads Ager wrote: > This should fit on the previous line. Done.
569	809 }

570	810

571 ByteArray* RegExpImpl::JsreInternal(Handle<JSRegExp> re) {	811

	812 ByteArray* RegExpImpl::JscreInternal(Handle<JSRegExp> re) {

572 FixedArray* value = FixedArray::cast(re->DataAt(JSRegExp::kJscreDataIndex));	813 FixedArray* value = FixedArray::cast(re->DataAt(JSRegExp::kJscreDataIndex));

573 return ByteArray::cast(value->get(INTERNAL_INDEX));	814 return ByteArray::cast(value->get(kJscreInternalIndex));

574 }	815 }

	816

	817

	818 int RegExpImpl::IrregexpNumberOfCaptures(Handle<JSRegExp> re) {

	819 FixedArray* value =

	820 FixedArray::cast(re->DataAt(JSRegExp::kIrregexpDataIndex));

	821 return Smi::cast(value->get(kIrregexpNumberOfCapturesIndex))->value();

	822 }

	823

	824

	825 int RegExpImpl::IrregexpNumberOfRegisters(Handle<JSRegExp> re) {

	826 FixedArray* value =

	827 FixedArray::cast(re->DataAt(JSRegExp::kIrregexpDataIndex));

	828 return Smi::cast(value->get(kIrregexpNumberOfRegistersIndex))->value();

	829 }

	830

	831

	832 Handle<ByteArray> RegExpImpl::IrregexpCode(Handle<JSRegExp> re) {

	833 FixedArray* value =

	834 FixedArray::cast(re->DataAt(JSRegExp::kIrregexpDataIndex));

	835 return Handle<ByteArray>(ByteArray::cast(value->get(kIrregexpCodeIndex)));

	836 }

	837

	838

	839 // -------------------------------------------------------------------

	840 // New regular expression engine
	Erik Corry 2008/11/25 11:03:52 Probably should mention "Irregexp" Probably should mention "Irregexp" Erik Corry 2008/11/26 12:18:36 Fixed. Show quoted text On 2008/11/25 11:03:52, Erik Corry wrote: > Probably should mention "Irregexp" Fixed.
	841

	842

	843 void RegExpTree::AppendToText(RegExpText* text) {

	844 UNREACHABLE();

	845 }

	846

	847

	848 void RegExpAtom::AppendToText(RegExpText* text) {

	849 text->AddElement(TextElement::Atom(this));

	850 }

	851

	852

	853 void RegExpCharacterClass::AppendToText(RegExpText* text) {

	854 text->AddElement(TextElement::CharClass(this));

	855 }

	856

	857

	858 void RegExpText::AppendToText(RegExpText* text) {

	859 for (int i = 0; i < elements()->length(); i++)

	860 text->AddElement(elements()->at(i));

	861 }

	862

	863

	864 TextElement TextElement::Atom(RegExpAtom* atom) {

	865 TextElement result = TextElement(ATOM);

	866 result.data.u_atom = atom;

	867 return result;

	868 }

	869

	870

	871 TextElement TextElement::CharClass(

	872 RegExpCharacterClass* char_class) {

	873 TextElement result = TextElement(CHAR_CLASS);

	874 result.data.u_char_class = char_class;

	875 return result;

	876 }

	877

	878

	879 class RegExpCompiler {

	880 public:

	881 RegExpCompiler(int capture_count, bool ignore_case);

	882

	883 int AllocateRegister() { return next_register_++; }

	884

	885 Handle<FixedArray> Assemble(RegExpMacroAssembler* assembler,

	886 RegExpNode* start,

	887 int capture_count);

	888

	889 inline void AddWork(RegExpNode* node) { work_list_->Add(node); }

	890

	891 static const int kImplementationOffset = 0;

	892 static const int kNumberOfRegistersOffset = 0;

	893 static const int kCodeOffset = 1;

	894

	895 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }

	896 EndNode* accept() { return accept_; }

	897 EndNode* backtrack() { return backtrack_; }

	898

	899 static const int kMaxRecursion = 100;

	900 inline int recursion_depth() { return recursion_depth_; }

	901 inline void IncrementRecursionDepth() { recursion_depth_++; }

	902 inline void DecrementRecursionDepth() { recursion_depth_--; }

	903

	904 inline bool is_case_independent() { return is_case_independent_; }

	905

	906 private:

	907 EndNode* accept_;

	908 EndNode* backtrack_;

	909 int next_register_;

	910 List<RegExpNode> work_list_;

	911 int recursion_depth_;

	912 RegExpMacroAssembler* macro_assembler_;

	913 bool is_case_independent_;

	914 };

	915

	916

	917 // Attempts to compile the regexp using an Irregexp code generator. Returns

	918 // a fixed array or a null handle depending on whether it succeeded.

	919 RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case)

	920 : next_register_(2 * (capture_count + 1)),
	Mads Ager (chromium) 2008/11/25 21:09:41 Four space indent. Four space indent. Christian Plesner Hansen 2008/11/26 06:49:56 Fixed. Fixed.
	921 work_list_(NULL),

	922 recursion_depth_(0),

	923 is_case_independent_(ignore_case) {

	924 accept_ = new EndNode(EndNode::ACCEPT);

	925 backtrack_ = new EndNode(EndNode::BACKTRACK);

	926 }

	927

	928

	929 Handle<FixedArray> RegExpCompiler::Assemble(

	930 RegExpMacroAssembler* macro_assembler,

	931 RegExpNode* start,

	932 int capture_count) {

	933 if (!FLAG_attempt_case_independent && is_case_independent_) {

	934 return Handle<FixedArray>::null();

	935 }

	936 macro_assembler_ = macro_assembler;

	937 List <RegExpNode*> work_list(0);

	938 work_list_ = &work_list;

	939 Label fail;

	940 macro_assembler->PushBacktrack(&fail);

	941 if (!start->GoTo(this)) {

	942 fail.Unuse();

	943 return Handle<FixedArray>::null();

	944 }

	945 while (!work_list.is_empty()) {

	946 if (!work_list.RemoveLast()->GoTo(this)) {

	947 fail.Unuse();

	948 return Handle<FixedArray>::null();

	949 }

	950 }

	951 macro_assembler->Bind(&fail);

	952 macro_assembler->Fail();

	953 Handle<FixedArray> array =

	954 Factory::NewFixedArray(RegExpImpl::kIrregexpDataLength);

	955 array->set(RegExpImpl::kIrregexpImplementationIndex,

	956 Smi::FromInt(macro_assembler->Implementation()));

	957 array->set(RegExpImpl::kIrregexpNumberOfRegistersIndex,

	958 Smi::FromInt(next_register_));

	959 array->set(RegExpImpl::kIrregexpNumberOfCapturesIndex,

	960 Smi::FromInt(capture_count));

	961 Handle<Object> code = macro_assembler->GetCode();

	962 array->set(RegExpImpl::kIrregexpCodeIndex, *code);

	963 work_list_ = NULL;

	964 return array;

	965 }

	966

	967

	968 bool RegExpNode::GoTo(RegExpCompiler* compiler) {

	969 // TODO(erikcorry): Implement support.

	970 if (info_.follows_word_interest \|\|

	971 info_.follows_newline_interest \|\|

	972 info_.follows_start_interest) {

	973 return false;

	974 }

	975 if (label_.is_bound()) {

	976 compiler->macro_assembler()->GoTo(&label_);

	977 return true;

	978 } else {
	Mads Ager (chromium) 2008/11/25 21:09:41 Most of this nesting could go away since all of th Most of this nesting could go away since all of the cases return from the function. Simplify to: if (a) { ... return true; } if (b) { ... return true; } ... return how_it_went; ? Christian Plesner Hansen 2008/11/26 06:49:56 Nesting aids readability: you can trivially see wh Nesting aids readability: you can trivially see which conditions hold by looking at the conditions of the enclosing if statements. Otherwise you have to read the conditions of the preceding if statements which may or may not be related to the code that follows, and read their bodies to convince yourself that they do indeed always return.
	979 if (compiler->recursion_depth() > RegExpCompiler::kMaxRecursion) {

	980 compiler->macro_assembler()->GoTo(&label_);

	981 compiler->AddWork(this);

	982 return true;

	983 } else {

	984 compiler->IncrementRecursionDepth();

	985 bool how_it_went = Emit(compiler);

	986 compiler->DecrementRecursionDepth();

	987 return how_it_went;

	988 }

	989 }

	990 }

	991

	992

	993 bool EndNode::GoTo(RegExpCompiler* compiler) {

	994 if (info()->follows_word_interest \|\|

	995 info()->follows_newline_interest \|\|

	996 info()->follows_start_interest) {

	997 return false;

	998 }

	999 if (!label()->is_bound()) {

	1000 Bind(compiler->macro_assembler());

	1001 }

	1002 switch (action_) {

	1003 case ACCEPT:

	1004 compiler->macro_assembler()->Succeed();

	1005 break;
	Mads Ager (chromium) 2008/11/25 21:09:41 Indent the break. Indent the break. Erik Corry 2008/11/26 12:18:36 Fixed. Show quoted text On 2008/11/25 21:09:41, Mads Ager wrote: > Indent the break. Fixed.
	1006 case BACKTRACK:

	1007 compiler->macro_assembler()->Backtrack();

	1008 break;
	Mads Ager (chromium) 2008/11/25 21:09:41 Indent the break. Indent the break. Erik Corry 2008/11/26 12:18:36 Fixed. Show quoted text On 2008/11/25 21:09:41, Mads Ager wrote: > Indent the break. Fixed.
	1009 }

	1010 return true;

	1011 }

	1012

	1013

	1014 Label* RegExpNode::label() {

	1015 return &label_;

	1016 }

	1017

	1018

	1019 bool EndNode::Emit(RegExpCompiler* compiler) {

	1020 RegExpMacroAssembler* macro = compiler->macro_assembler();

	1021 switch (action_) {

	1022 case ACCEPT:

	1023 Bind(macro);

	1024 macro->Succeed();

	1025 return true;

	1026 case BACKTRACK:

	1027 Bind(macro);

	1028 macro->Backtrack();

	1029 return true;

	1030 }

	1031 return false;

	1032 }

	1033

	1034

	1035 void GuardedAlternative::AddGuard(Guard* guard) {

	1036 if (guards_ == NULL)

	1037 guards_ = new ZoneList<Guard*>(1);

	1038 guards_->Add(guard);

	1039 }

	1040

	1041

	1042 ActionNode* ActionNode::StoreRegister(int reg,

	1043 int val,

	1044 RegExpNode* on_success) {

	1045 ActionNode* result = new ActionNode(STORE_REGISTER, on_success);

	1046 result->data_.u_store_register.reg = reg;

	1047 result->data_.u_store_register.value = val;

	1048 return result;

	1049 }

	1050

	1051

	1052 ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {

	1053 ActionNode* result = new ActionNode(INCREMENT_REGISTER, on_success);

	1054 result->data_.u_increment_register.reg = reg;

	1055 return result;

	1056 }

	1057

	1058

	1059 ActionNode* ActionNode::StorePosition(int reg, RegExpNode* on_success) {

	1060 ActionNode* result = new ActionNode(STORE_POSITION, on_success);

	1061 result->data_.u_position_register.reg = reg;

	1062 return result;

	1063 }

	1064

	1065

	1066 ActionNode* ActionNode::SavePosition(int reg, RegExpNode* on_success) {

	1067 ActionNode* result = new ActionNode(SAVE_POSITION, on_success);

	1068 result->data_.u_position_register.reg = reg;

	1069 return result;

	1070 }

	1071

	1072

	1073 ActionNode* ActionNode::RestorePosition(int reg, RegExpNode* on_success) {

	1074 ActionNode* result = new ActionNode(RESTORE_POSITION, on_success);

	1075 result->data_.u_position_register.reg = reg;

	1076 return result;

	1077 }

	1078

	1079

	1080 ActionNode* ActionNode::BeginSubmatch(int reg, RegExpNode* on_success) {

	1081 ActionNode* result = new ActionNode(BEGIN_SUBMATCH, on_success);

	1082 result->data_.u_submatch_stack_pointer_register.reg = reg;

	1083 return result;

	1084 }

	1085

	1086

	1087 ActionNode* ActionNode::EscapeSubmatch(int reg, RegExpNode* on_success) {

	1088 ActionNode* result = new ActionNode(ESCAPE_SUBMATCH, on_success);

	1089 result->data_.u_submatch_stack_pointer_register.reg = reg;

	1090 return result;

	1091 }

	1092

	1093

	1094 #define DEFINE_ACCEPT(Type) \

	1095 void Type##Node::Accept(NodeVisitor* visitor) { \

	1096 visitor->Visit##Type(this); \

	1097 }

	1098 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)

	1099 #undef DEFINE_ACCEPT

	1100

	1101

	1102 // -------------------------------------------------------------------

	1103 // Emit code.

	1104

	1105

	1106 void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,

	1107 Guard* guard,

	1108 Label* on_failure) {

	1109 switch (guard->op()) {

	1110 case Guard::LT:

	1111 macro_assembler->IfRegisterGE(guard->reg(), guard->value(), on_failure);

	1112 break;

	1113 case Guard::GEQ:

	1114 macro_assembler->IfRegisterLT(guard->reg(), guard->value(), on_failure);

	1115 break;

	1116 }

	1117 }

	1118

	1119

	1120 static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize;

	1121 static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange;

	1122

	1123

	1124 static inline void EmitAtomNonLetters(

	1125 RegExpMacroAssembler* macro_assembler,

	1126 TextElement elm,

	1127 Vector<const uc16> quarks,

	1128 Label* on_failure,

	1129 int cp_offset) {

	1130 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

	1131 for (int i = quarks.length() - 1; i >= 0; i--) {

	1132 uc16 c = quarks[i];

	1133 int length = uncanonicalize.get(c, '\0', chars);

	1134 if (length <= 1) {

	1135 macro_assembler->LoadCurrentCharacter(cp_offset + i, on_failure);

	1136 macro_assembler->CheckNotCharacter(c, on_failure);

	1137 }

	1138 }

	1139 }

	1140

	1141

	1142 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,

	1143 uc16 c1,

	1144 uc16 c2,

	1145 Label* on_failure) {

	1146 uc16 exor = c1 ^ c2;

	1147 // Check whether exor has only one bit set.

	1148 if (((exor - 1) & exor) == 0) {
	Mads Ager (chromium) 2008/11/25 21:09:41 The if-else could be replaced by two ifs that each The if-else could be replaced by two ifs that each return from the function. Erik Corry 2008/11/26 12:18:36 Fixed. Show quoted text On 2008/11/25 21:09:41, Mads Ager wrote: > The if-else could be replaced by two ifs that each return from the function. Fixed.
	1149 // If c1 and c2 differ only by one bit.

	1150 // Ecma262UnCanonicalize always gives the highest number last.

	1151 ASSERT(c2 > c1);

	1152 macro_assembler->CheckNotCharacterAfterOr(c2, exor, on_failure);

	1153 return true;

	1154 } else {

	1155 ASSERT(c2 > c1);

	1156 uc16 diff = c2 - c1;

	1157 if (((diff - 1) & diff) == 0 && c1 >= diff) {

	1158 // If the characters differ by 2^n but don't differ by one bit then

	1159 // subtract the difference from the found character, then do the or

	1160 // trick. We avoid the theoretical case where negative numbers are

	1161 // involved in order to simplify code generation.

	1162 macro_assembler->CheckNotCharacterAfterMinusOr(c2 - diff,

	1163 diff,

	1164 on_failure);

	1165 return true;

	1166 }

	1167 }

	1168 return false;

	1169 }

	1170

	1171

	1172 static inline void EmitAtomLetters(

	1173 RegExpMacroAssembler* macro_assembler,

	1174 TextElement elm,

	1175 Vector<const uc16> quarks,

	1176 Label* on_failure,

	1177 int cp_offset) {

	1178 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

	1179 for (int i = quarks.length() - 1; i >= 0; i--) {

	1180 uc16 c = quarks[i];

	1181 int length = uncanonicalize.get(c, '\0', chars);

	1182 if (length <= 1) continue;

	1183 macro_assembler->LoadCurrentCharacter(cp_offset + i, on_failure);

	1184 Label ok;

	1185 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);

	1186 switch (length) {

	1187 case 2: {

	1188 if (ShortCutEmitCharacterPair(macro_assembler,

	1189 chars[0],

	1190 chars[1],

	1191 on_failure)) {

	1192 ok.Unuse();

	1193 } else {

	1194 macro_assembler->CheckCharacter(chars[0], &ok);

	1195 macro_assembler->CheckNotCharacter(chars[1], on_failure);

	1196 macro_assembler->Bind(&ok);

	1197 }

	1198 break;

	1199 }

	1200 case 4:

	1201 macro_assembler->CheckCharacter(chars[3], &ok);

	1202 // Fall through!

	1203 case 3:

	1204 macro_assembler->CheckCharacter(chars[0], &ok);

	1205 macro_assembler->CheckCharacter(chars[1], &ok);

	1206 macro_assembler->CheckNotCharacter(chars[2], on_failure);

	1207 macro_assembler->Bind(&ok);

	1208 break;

	1209 default:

	1210 UNREACHABLE();

	1211 break;

	1212 }

	1213 }

	1214 }

	1215

	1216

	1217 static void EmitCharClass(RegExpMacroAssembler* macro_assembler,

	1218 RegExpCharacterClass* cc,

	1219 int cp_offset,

	1220 Label* on_failure) {

	1221 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure);

	1222 cp_offset++;

	1223

	1224 ZoneList<CharacterRange>* ranges = cc->ranges();

	1225

	1226 Label success;

	1227

	1228 Label *char_is_in_class =
	Erik Corry 2008/11/25 11:03:52 Misplaced asterisk. Misplaced asterisk. Erik Corry 2008/11/26 12:18:36 Fixed. Show quoted text On 2008/11/25 11:03:52, Erik Corry wrote: > Misplaced asterisk. Fixed.
	1229 cc->is_negated() ? on_failure : &success;

	1230

	1231 int range_count = ranges->length();

	1232

	1233 if (range_count == 0) {

	1234 if (!cc->is_negated()) {

	1235 macro_assembler->GoTo(on_failure);

	1236 }

	1237 return;

	1238 }

	1239

	1240 for (int i = 0; i < range_count - 1; i++) {

	1241 CharacterRange& range = ranges->at(i);

	1242 Label next_range;

	1243 uc16 from = range.from();

	1244 uc16 to = range.to();

	1245 if (to == from) {

	1246 macro_assembler->CheckCharacter(to, char_is_in_class);

	1247 } else {

	1248 if (from != 0) {

	1249 macro_assembler->CheckCharacterLT(from, &next_range);

	1250 }

	1251 if (to != 0xffff) {

	1252 macro_assembler->CheckCharacterLT(to + 1, char_is_in_class);

	1253 } else {

	1254 macro_assembler->GoTo(char_is_in_class);

	1255 }

	1256 }

	1257 macro_assembler->Bind(&next_range);

	1258 }

	1259

	1260 CharacterRange& range = ranges->at(range_count - 1);

	1261 uc16 from = range.from();

	1262 uc16 to = range.to();

	1263

	1264 if (to == from) {

	1265 if (cc->is_negated()) {

	1266 macro_assembler->CheckCharacter(to, on_failure);

	1267 } else {

	1268 macro_assembler->CheckNotCharacter(to, on_failure);

	1269 }

	1270 } else {

	1271 if (from != 0) {

	1272 if (!cc->is_negated()) {

	1273 macro_assembler->CheckCharacterLT(from, on_failure);

	1274 } else {

	1275 macro_assembler->CheckCharacterLT(from, &success);

	1276 }

	1277 }

	1278 if (to != 0xffff) {

	1279 if (!cc->is_negated()) {

	1280 macro_assembler->CheckCharacterGT(to, on_failure);

	1281 } else {

	1282 macro_assembler->CheckCharacterLT(to + 1, on_failure);

	1283 }

	1284 } else {

	1285 if (cc->is_negated()) {

	1286 macro_assembler->GoTo(on_failure);

	1287 }

	1288 }

	1289 }

	1290 macro_assembler->Bind(&success);

	1291 }

	1292

	1293

	1294

	1295 bool TextNode::Emit(RegExpCompiler* compiler) {

	1296 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();

	1297 Bind(macro_assembler);

	1298 int element_count = elms_->length();

	1299 int cp_offset = 0;

	1300 // First, handle straight character matches.

	1301 for (int i = 0; i < element_count; i++) {

	1302 TextElement elm = elms_->at(i);

	1303 if (elm.type == TextElement::ATOM) {

	1304 Vector<const uc16> quarks = elm.data.u_atom->data();

	1305 if (!compiler->is_case_independent()) {

	1306 macro_assembler->CheckCharacters(quarks,

	1307 cp_offset,

	1308 on_failure_->label());

	1309 } else {

	1310 EmitAtomNonLetters(macro_assembler,

	1311 elm,

	1312 quarks,

	1313 on_failure_->label(),

	1314 cp_offset);

	1315 }

	1316 cp_offset += quarks.length();

	1317 } else {

	1318 ASSERT_EQ(elm.type, TextElement::CHAR_CLASS);

	1319 cp_offset++;

	1320 }

	1321 }

	1322 // Second, handle case independent letter matches if any.

	1323 if (compiler->is_case_independent()) {

	1324 cp_offset = 0;

	1325 for (int i = 0; i < element_count; i++) {

	1326 TextElement elm = elms_->at(i);

	1327 if (elm.type == TextElement::ATOM) {

	1328 Vector<const uc16> quarks = elm.data.u_atom->data();

	1329 EmitAtomLetters(macro_assembler,

	1330 elm,

	1331 quarks,

	1332 on_failure_->label(),

	1333 cp_offset);

	1334 cp_offset += quarks.length();

	1335 } else {

	1336 cp_offset++;

	1337 }

	1338 }

	1339 }

	1340 // If the fast character matches passed then do the character classes.

	1341 cp_offset = 0;

	1342 for (int i = 0; i < element_count; i++) {

	1343 TextElement elm = elms_->at(i);

	1344 if (elm.type == TextElement::CHAR_CLASS) {

	1345 RegExpCharacterClass* cc = elm.data.u_char_class;

	1346 EmitCharClass(macro_assembler, cc, cp_offset, on_failure_->label());

	1347 cp_offset ++;

	1348 } else {

	1349 cp_offset += elm.data.u_atom->data().length();

	1350 }

	1351 }

	1352

	1353 compiler->AddWork(on_failure_);

	1354 macro_assembler->AdvanceCurrentPosition(cp_offset);

	1355 return on_success()->GoTo(compiler);

	1356 }

	1357

	1358

	1359 bool ChoiceNode::Emit(RegExpCompiler* compiler) {

	1360 int choice_count = alternatives_->length();

	1361 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();

	1362 Bind(macro_assembler);

	1363 // For now we just call all choices one after the other. The idea ultimately

	1364 // is to use the Dispatch table to try only the relevant ones.

	1365 int i;
	Mads Ager (chromium) 2008/11/25 21:09:41 I don't see how the i is needed after the for loop I don't see how the i is needed after the for loop. There seems to be no breaks out of the for loop or changes to choice_count during the loop, so we know the value of i after the loop. Move "int i" into the for loop and use "choise_count - 1" instead of i after the loop? Erik Corry 2008/11/26 12:18:36 Fixed. Show quoted text On 2008/11/25 21:09:41, Mads Ager wrote: > I don't see how the i is needed after the for loop. There seems to be no breaks > out of the for loop or changes to choice_count during the loop, so we know the > value of i after the loop. Move "int i" into the for loop and use "choise_count > - 1" instead of i after the loop? Fixed.
	1366 for (i = 0; i < choice_count - 1; i++) {

	1367 GuardedAlternative alternative = alternatives_->at(i);

	1368 Label after;

	1369 Label after_no_pop_cp;

	1370 ZoneList<Guard> guards = alternative.guards();

	1371 if (guards != NULL) {

	1372 int guard_count = guards->length();

	1373 for (int j = 0; j < guard_count; j++) {

	1374 GenerateGuard(macro_assembler, guards->at(j), &after_no_pop_cp);

	1375 }

	1376 }

	1377 macro_assembler->PushCurrentPosition();

	1378 macro_assembler->PushBacktrack(&after);

	1379 if (!alternative.node()->GoTo(compiler)) {

	1380 after.Unuse();

	1381 after_no_pop_cp.Unuse();

	1382 return false;

	1383 }

	1384 macro_assembler->Bind(&after);

	1385 macro_assembler->PopCurrentPosition();

	1386 macro_assembler->Bind(&after_no_pop_cp);

	1387 }

	1388 GuardedAlternative alternative = alternatives_->at(i);

	1389 ZoneList<Guard> guards = alternative.guards();

	1390 if (guards != NULL) {

	1391 int guard_count = guards->length();

	1392 for (int j = 0; j < guard_count; j++) {

	1393 GenerateGuard(macro_assembler, guards->at(j), on_failure_->label());

	1394 }

	1395 }

	1396 if (!on_failure_->IsBacktrack()) {

	1397 ASSERT_NOT_NULL(on_failure_ -> label());

	1398 macro_assembler->PushBacktrack(on_failure_->label());

	1399 compiler->AddWork(on_failure_);

	1400 }

	1401 if (!alternative.node()->GoTo(compiler)) {

	1402 return false;

	1403 }

	1404 return true;

	1405 }

	1406

	1407

	1408 bool ActionNode::Emit(RegExpCompiler* compiler) {

	1409 RegExpMacroAssembler* macro = compiler->macro_assembler();

	1410 Bind(macro);

	1411 switch (type_) {

	1412 case STORE_REGISTER:

	1413 macro->SetRegister(data_.u_store_register.reg,

	1414 data_.u_store_register.value);

	1415 break;

	1416 case INCREMENT_REGISTER: {

	1417 Label undo;

	1418 macro->PushBacktrack(&undo);

	1419 macro->AdvanceRegister(data_.u_increment_register.reg, 1);

	1420 bool ok = on_success()->GoTo(compiler);

	1421 if (!ok) {

	1422 undo.Unuse();

	1423 return false;

	1424 }

	1425 macro->Bind(&undo);

	1426 macro->AdvanceRegister(data_.u_increment_register.reg, -1);

	1427 macro->Backtrack();

	1428 break;

	1429 }

	1430 case STORE_POSITION: {

	1431 Label undo;

	1432 macro->PushRegister(data_.u_position_register.reg);

	1433 macro->PushBacktrack(&undo);

	1434 macro->WriteCurrentPositionToRegister(data_.u_position_register.reg);

	1435 bool ok = on_success()->GoTo(compiler);

	1436 if (!ok) {

	1437 undo.Unuse();

	1438 return false;

	1439 }

	1440 macro->Bind(&undo);

	1441 macro->PopRegister(data_.u_position_register.reg);

	1442 macro->Backtrack();

	1443 break;

	1444 }

	1445 case SAVE_POSITION:

	1446 macro->WriteCurrentPositionToRegister(

	1447 data_.u_position_register.reg);

	1448 break;

	1449 case RESTORE_POSITION:

	1450 macro->ReadCurrentPositionFromRegister(

	1451 data_.u_position_register.reg);

	1452 break;

	1453 case BEGIN_SUBMATCH:

	1454 macro->WriteStackPointerToRegister(

	1455 data_.u_submatch_stack_pointer_register.reg);

	1456 break;

	1457 case ESCAPE_SUBMATCH:

	1458 macro->ReadStackPointerFromRegister(

	1459 data_.u_submatch_stack_pointer_register.reg);

	1460 break;

	1461 default:

	1462 UNREACHABLE();

	1463 return false;

	1464 }

	1465 return on_success()->GoTo(compiler);

	1466 }

	1467

	1468

	1469 bool BackReferenceNode::Emit(RegExpCompiler* compiler) {

	1470 RegExpMacroAssembler* macro = compiler->macro_assembler();

	1471 Bind(macro);

	1472 // Check whether the registers are uninitialized and always

	1473 // succeed if they are.

	1474 macro->IfRegisterLT(start_reg_, 0, on_success()->label());

	1475 macro->IfRegisterLT(end_reg_, 0, on_success()->label());

	1476 ASSERT_EQ(start_reg_ + 1, end_reg_);

	1477 macro->CheckNotBackReference(start_reg_, on_failure_->label());

	1478 return on_success()->GoTo(compiler);

	1479 }

	1480

	1481

	1482 // -------------------------------------------------------------------

	1483 // Dot/dotty output

	1484

	1485

	1486 #ifdef DEBUG

	1487

	1488

	1489 class DotPrinter: public NodeVisitor {

	1490 public:

	1491 DotPrinter() : stream_(&alloc_) { }

	1492 void PrintNode(const char* label, RegExpNode* node);

	1493 void Visit(RegExpNode* node);

	1494 void PrintOnFailure(RegExpNode* from, RegExpNode* on_failure);

	1495 StringStream* stream() { return &stream_; }

	1496 #define DECLARE_VISIT(Type) \

	1497 virtual void Visit##Type(Type##Node* that);

	1498 FOR_EACH_NODE_TYPE(DECLARE_VISIT)

	1499 #undef DECLARE_VISIT

	1500 private:

	1501 HeapStringAllocator alloc_;

	1502 StringStream stream_;

	1503 std::set<RegExpNode*> seen_;

	1504 };

	1505

	1506

	1507 void DotPrinter::PrintNode(const char* label, RegExpNode* node) {

	1508 stream()->Add("digraph G {\n graph [label=\"");

	1509 for (int i = 0; label[i]; i++) {

	1510 switch (label[i]) {

	1511 case '\\':

	1512 stream()->Add("\\\\");

	1513 break;

	1514 case '"':

	1515 stream()->Add("\"");

	1516 break;

	1517 default:

	1518 stream()->Put(label[i]);

	1519 break;

	1520 }

	1521 }

	1522 stream()->Add("\"];\n");

	1523 Visit(node);

	1524 stream()->Add("}\n");

	1525 printf("%s", *(stream()->ToCString()));

	1526 }

	1527

	1528

	1529 void DotPrinter::Visit(RegExpNode* node) {

	1530 if (seen_.find(node) != seen_.end())

	1531 return;

	1532 seen_.insert(node);

	1533 node->Accept(this);

	1534 }

	1535

	1536

	1537 void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {

	1538 if (on_failure->IsBacktrack()) return;

	1539 stream()->Add(" n%p -> n%p [style=dotted];\n", from, on_failure);

	1540 Visit(on_failure);

	1541 }

	1542

	1543

	1544 class TableEntryBodyPrinter {

	1545 public:

	1546 TableEntryBodyPrinter(StringStream* stream, ChoiceNode* choice)

	1547 : stream_(stream), choice_(choice) { }
	Mads Ager (chromium) 2008/11/25 21:09:41 Four space indent. Four space indent. Christian Plesner Hansen 2008/11/26 06:49:56 Fixed. Fixed.
	1548 void Call(uc16 from, DispatchTable::Entry entry) {

	1549 OutSet* out_set = entry.out_set();

	1550 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {

	1551 if (out_set->Get(i)) {

	1552 stream()->Add(" n%p:s%io%i -> n%p;\n",

	1553 choice(),

	1554 from,

	1555 i,

	1556 choice()->alternatives()->at(i).node());

	1557 }

	1558 }

	1559 }

	1560 private:

	1561 StringStream* stream() { return stream_; }

	1562 ChoiceNode* choice() { return choice_; }

	1563 StringStream* stream_;

	1564 ChoiceNode* choice_;

	1565 };

	1566

	1567

	1568 class TableEntryHeaderPrinter {

	1569 public:

	1570 explicit TableEntryHeaderPrinter(StringStream* stream)

	1571 : first_(true), stream_(stream) { }
	Mads Ager (chromium) 2008/11/25 21:09:41 Four space indent. Four space indent. Christian Plesner Hansen 2008/11/26 06:49:56 Fixed. Fixed.
	1572 void Call(uc16 from, DispatchTable::Entry entry) {

	1573 if (first_) {

	1574 first_ = false;

	1575 } else {

	1576 stream()->Add("\|");

	1577 }

	1578 stream()->Add("{\\%k-\\%k\|{", from, entry.to());

	1579 OutSet* out_set = entry.out_set();

	1580 int priority = 0;

	1581 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {

	1582 if (out_set->Get(i)) {

	1583 if (priority > 0) stream()->Add("\|");

	1584 stream()->Add("<s%io%i> %i", from, i, priority);

	1585 priority++;

	1586 }

	1587 }

	1588 stream()->Add("}}");

	1589 }

	1590 private:

	1591 bool first_;

	1592 StringStream* stream() { return stream_; }

	1593 StringStream* stream_;

	1594 };

	1595

	1596

	1597 void DotPrinter::VisitChoice(ChoiceNode* that) {

	1598 stream()->Add(" n%p [shape=Mrecord, label=\"", that);

	1599 TableEntryHeaderPrinter header_printer(stream());

	1600 that->table()->ForEach(&header_printer);

	1601 stream()->Add("\"]\n", that);

	1602 TableEntryBodyPrinter body_printer(stream(), that);

	1603 that->table()->ForEach(&body_printer);

	1604 PrintOnFailure(that, that->on_failure());

	1605 for (int i = 0; i < that->alternatives()->length(); i++) {

	1606 GuardedAlternative alt = that->alternatives()->at(i);

	1607 alt.node()->Accept(this);

	1608 }

	1609 }

	1610

	1611

	1612 void DotPrinter::VisitText(TextNode* that) {

	1613 stream()->Add(" n%p [label=\"", that);

	1614 for (int i = 0; i < that->elements()->length(); i++) {

	1615 if (i > 0) stream()->Add(" ");

	1616 TextElement elm = that->elements()->at(i);

	1617 switch (elm.type) {

	1618 case TextElement::ATOM: {

	1619 stream()->Add("'%w'", elm.data.u_atom->data());

	1620 break;

	1621 }

	1622 case TextElement::CHAR_CLASS: {

	1623 RegExpCharacterClass* node = elm.data.u_char_class;

	1624 stream()->Add("[");

	1625 if (node->is_negated())

	1626 stream()->Add("^");

	1627 for (int j = 0; j < node->ranges()->length(); j++) {

	1628 CharacterRange range = node->ranges()->at(j);

	1629 stream()->Add("%k-%k", range.from(), range.to());

	1630 }

	1631 stream()->Add("]");

	1632 break;

	1633 }

	1634 default:

	1635 UNREACHABLE();

	1636 }

	1637 }

	1638 stream()->Add("\", shape=box, peripheries=2];\n");

	1639 stream()->Add(" n%p -> n%p;\n", that, that->on_success());

	1640 Visit(that->on_success());

	1641 PrintOnFailure(that, that->on_failure());

	1642 }

	1643

	1644

	1645 void DotPrinter::VisitBackReference(BackReferenceNode* that) {

	1646 stream()->Add(" n%p [label=\"$%i..$%i\", shape=doubleoctagon];\n",

	1647 that,

	1648 that->start_register(),

	1649 that->end_register());

	1650 stream()->Add(" n%p -> n%p;\n", that, that->on_success());

	1651 Visit(that->on_success());

	1652 PrintOnFailure(that, that->on_failure());

	1653 }

	1654

	1655

	1656 void DotPrinter::VisitEnd(EndNode* that) {

	1657 stream()->Add(" n%p [style=bold, shape=point];\n", that);

	1658 }

	1659

	1660

	1661 void DotPrinter::VisitAction(ActionNode* that) {

	1662 stream()->Add(" n%p [", that);

	1663 switch (that->type_) {

	1664 case ActionNode::STORE_REGISTER:

	1665 stream()->Add("label=\"$%i:=%i\", shape=octagon",

	1666 that->data_.u_store_register.reg,

	1667 that->data_.u_store_register.value);

	1668 break;

	1669 case ActionNode::INCREMENT_REGISTER:

	1670 stream()->Add("label=\"$%i++\", shape=octagon",

	1671 that->data_.u_increment_register.reg);

	1672 break;

	1673 case ActionNode::STORE_POSITION:

	1674 stream()->Add("label=\"$%i:=$pos\", shape=octagon",

	1675 that->data_.u_position_register.reg);

	1676 break;

	1677 case ActionNode::SAVE_POSITION:

	1678 stream()->Add("label=\"$%i:=$pos\", shape=octagon",

	1679 that->data_.u_position_register.reg);

	1680 break;

	1681 case ActionNode::RESTORE_POSITION:

	1682 stream()->Add("label=\"$pos:=$%i\", shape=octagon",

	1683 that->data_.u_position_register.reg);

	1684 break;

	1685 case ActionNode::BEGIN_SUBMATCH:

	1686 stream()->Add("label=\"begin\", shape=septagon");

	1687 break;

	1688 case ActionNode::ESCAPE_SUBMATCH:

	1689 stream()->Add("label=\"escape\", shape=septagon");

	1690 break;

	1691 }

	1692 stream()->Add("];\n");

	1693 stream()->Add(" n%p -> n%p;\n", that, that->on_success());

	1694 Visit(that->on_success());

	1695 }

	1696

	1697

	1698 class DispatchTableDumper {

	1699 public:

	1700 explicit DispatchTableDumper(StringStream* stream) : stream_(stream) { }

	1701 void Call(uc16 key, DispatchTable::Entry entry);

	1702 StringStream* stream() { return stream_; }

	1703 private:

	1704 StringStream* stream_;

	1705 };

	1706

	1707

	1708 void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {

	1709 stream()->Add("[%k-%k]: {", key, entry.to());

	1710 OutSet* set = entry.out_set();

	1711 bool first = true;

	1712 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {

	1713 if (set->Get(i)) {

	1714 if (first) {

	1715 first = false;

	1716 } else {

	1717 stream()->Add(", ");

	1718 }

	1719 stream()->Add("%i", i);

	1720 }

	1721 }

	1722 stream()->Add("}\n");

	1723 }

	1724

	1725

	1726 void DispatchTable::Dump() {

	1727 HeapStringAllocator alloc;

	1728 StringStream stream(&alloc);

	1729 DispatchTableDumper dumper(&stream);

	1730 tree()->ForEach(&dumper);

	1731 OS::PrintError("%s", *stream.ToCString());

	1732 }

	1733

	1734

	1735 void RegExpEngine::DotPrint(const char* label, RegExpNode* node) {

	1736 DotPrinter printer;

	1737 printer.PrintNode(label, node);

	1738 }

	1739

	1740

	1741 #endif // DEBUG

	1742

	1743

	1744 // -------------------------------------------------------------------

	1745 // Tree to graph conversion

	1746

	1747

	1748 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,

	1749 RegExpNode* on_success,

	1750 RegExpNode* on_failure) {

	1751 ZoneList<TextElement>* elms = new ZoneList<TextElement>(1);

	1752 elms->Add(TextElement::Atom(this));

	1753 return new TextNode(elms, on_success, on_failure);

	1754 }

	1755

	1756

	1757 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,

	1758 RegExpNode* on_success,

	1759 RegExpNode* on_failure) {

	1760 return new TextNode(elements(), on_success, on_failure);

	1761 }

	1762

	1763

	1764 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,

	1765 RegExpNode* on_success,

	1766 RegExpNode* on_failure) {

	1767 ZoneList<TextElement>* elms = new ZoneList<TextElement>(1);

	1768 elms->Add(TextElement::CharClass(this));

	1769 return new TextNode(elms, on_success, on_failure);

	1770 }

	1771

	1772

	1773 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,

	1774 RegExpNode* on_success,

	1775 RegExpNode* on_failure) {

	1776 ZoneList<RegExpTree> alternatives = this->alternatives();

	1777 int length = alternatives->length();

	1778 ChoiceNode* result = new ChoiceNode(length, on_failure);

	1779 for (int i = 0; i < length; i++) {

	1780 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,

	1781 on_success,

	1782 on_failure));

	1783 result->AddAlternative(alternative);

	1784 }

	1785 return result;

	1786 }

	1787

	1788

	1789 RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,

	1790 RegExpNode* on_success,

	1791 RegExpNode* on_failure) {

	1792 return ToNode(min(),

	1793 max(),

	1794 is_greedy(),

	1795 body(),

	1796 compiler,

	1797 on_success,

	1798 on_failure);

	1799 }

	1800

	1801

	1802 RegExpNode* RegExpQuantifier::ToNode(int min,

	1803 int max,

	1804 bool is_greedy,

	1805 RegExpTree* body,

	1806 RegExpCompiler* compiler,

	1807 RegExpNode* on_success,

	1808 RegExpNode* on_failure) {

	1809 // x{f, t} becomes this:

	1810 //

	1811 // (r++)<-.

	1812 // \| `

	1813 // \| (x)

	1814 // v ^

	1815 // (r=0)-->(?)---/ [if r < t]

	1816 // \|

	1817 // [if r >= f] \----> ...

	1818 //

	1819 //

	1820 // TODO(someone): clear captures on repetition and handle empty

	1821 // matches.

	1822 bool has_min = min > 0;

	1823 bool has_max = max < RegExpQuantifier::kInfinity;

	1824 bool needs_counter = has_min \|\| has_max;

	1825 int reg_ctr = needs_counter ? compiler->AllocateRegister() : -1;

	1826 ChoiceNode* center = new ChoiceNode(2, on_failure);

	1827 RegExpNode* loop_return = needs_counter

	1828 ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))

	1829 : static_cast<RegExpNode*>(center);

	1830 RegExpNode* body_node = body->ToNode(compiler, loop_return, on_failure);

	1831 GuardedAlternative body_alt(body_node);

	1832 if (has_max) {

	1833 Guard* body_guard = new Guard(reg_ctr, Guard::LT, max);

	1834 body_alt.AddGuard(body_guard);

	1835 }

	1836 GuardedAlternative rest_alt(on_success);

	1837 if (has_min) {

	1838 Guard* rest_guard = new Guard(reg_ctr, Guard::GEQ, min);

	1839 rest_alt.AddGuard(rest_guard);

	1840 }

	1841 if (is_greedy) {

	1842 center->AddAlternative(body_alt);

	1843 center->AddAlternative(rest_alt);

	1844 } else {

	1845 center->AddAlternative(rest_alt);

	1846 center->AddAlternative(body_alt);

	1847 }

	1848 if (needs_counter) {

	1849 return ActionNode::StoreRegister(reg_ctr, 0, center);

	1850 } else {

	1851 return center;

	1852 }

	1853 }

	1854

	1855

	1856 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,

	1857 RegExpNode* on_success,

	1858 RegExpNode* on_failure) {

	1859 NodeInfo info;

	1860 switch (type()) {

	1861 case START_OF_LINE:

	1862 info.follows_newline_interest = true;

	1863 break;

	1864 case START_OF_INPUT:

	1865 info.follows_start_interest = true;

	1866 break;

	1867 case BOUNDARY: case NON_BOUNDARY:

	1868 info.follows_word_interest = true;

	1869 break;

	1870 case END_OF_LINE: case END_OF_INPUT:

	1871 // This is wrong but has the effect of making the compiler abort.

	1872 info.follows_start_interest = true;

	1873 }

	1874 return on_success->PropagateInterest(&info);

	1875 }

	1876

	1877

	1878 RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,

	1879 RegExpNode* on_success,

	1880 RegExpNode* on_failure) {

	1881 return new BackReferenceNode(RegExpCapture::StartRegister(index()),

	1882 RegExpCapture::EndRegister(index()),

	1883 on_success,

	1884 on_failure);

	1885 }

	1886

	1887

	1888 RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,

	1889 RegExpNode* on_success,

	1890 RegExpNode* on_failure) {

	1891 return on_success;

	1892 }

	1893

	1894

	1895 RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler,

	1896 RegExpNode* on_success,

	1897 RegExpNode* on_failure) {

	1898 int stack_pointer_register = compiler->AllocateRegister();

	1899 int position_register = compiler->AllocateRegister();

	1900 if (is_positive()) {

	1901 // begin submatch scope

	1902 // $reg = $pos

	1903 // if [body]

	1904 // then

	1905 // $pos = $reg

	1906 // escape submatch scope (drop all backtracks created in scope)

	1907 // succeed

	1908 // else

	1909 // end submatch scope (nothing to clean up, just exit the scope)

	1910 // fail

	1911 return ActionNode::BeginSubmatch(

	1912 stack_pointer_register,

	1913 ActionNode::SavePosition(

	1914 position_register,

	1915 body()->ToNode(

	1916 compiler,

	1917 ActionNode::RestorePosition(

	1918 position_register,

	1919 ActionNode::EscapeSubmatch(stack_pointer_register,

	1920 on_success)),

	1921 on_failure)));

	1922 } else {

	1923 // begin submatch scope

	1924 // try

	1925 // first if (body)

	1926 // then

	1927 // escape submatch scope

	1928 // fail

	1929 // else

	1930 // backtrack

	1931 // second

	1932 // end submatch scope

	1933 // restore current position

	1934 // succeed

	1935 ChoiceNode* try_node =

	1936 new ChoiceNode(1, ActionNode::RestorePosition(position_register,

	1937 on_success));

	1938 RegExpNode* body_node = body()->ToNode(

	1939 compiler,

	1940 ActionNode::EscapeSubmatch(stack_pointer_register, on_failure),

	1941 compiler->backtrack());

	1942 GuardedAlternative body_alt(body_node);

	1943 try_node->AddAlternative(body_alt);

	1944 return ActionNode::BeginSubmatch(stack_pointer_register,

	1945 ActionNode::SavePosition(

	1946 position_register,

	1947 try_node));

	1948 }

	1949 }

	1950

	1951

	1952 RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,

	1953 RegExpNode* on_success,

	1954 RegExpNode* on_failure) {

	1955 return ToNode(body(), index(), compiler, on_success, on_failure);

	1956 }

	1957

	1958

	1959 RegExpNode* RegExpCapture::ToNode(RegExpTree* body,

	1960 int index,

	1961 RegExpCompiler* compiler,

	1962 RegExpNode* on_success,

	1963 RegExpNode* on_failure) {

	1964 int start_reg = RegExpCapture::StartRegister(index);

	1965 int end_reg = RegExpCapture::EndRegister(index);

	1966 RegExpNode* store_end = ActionNode::StorePosition(end_reg, on_success);

	1967 RegExpNode* body_node = body->ToNode(compiler, store_end, on_failure);

	1968 return ActionNode::StorePosition(start_reg, body_node);

	1969 }

	1970

	1971

	1972 RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,

	1973 RegExpNode* on_success,

	1974 RegExpNode* on_failure) {

	1975 ZoneList<RegExpTree> children = nodes();

	1976 RegExpNode* current = on_success;

	1977 for (int i = children->length() - 1; i >= 0; i--) {

	1978 current = children->at(i)->ToNode(compiler, current, on_failure);

	1979 }

	1980 return current;

	1981 }

	1982

	1983

	1984 static const int kSpaceRangeCount = 20;

	1985 static const uc16 kSpaceRanges[kSpaceRangeCount] = {

	1986 0x0009, 0x000D, 0x0020, 0x0020, 0x00A0, 0x00A0, 0x1680,

	1987 0x1680, 0x180E, 0x180E, 0x2000, 0x200A, 0x2028, 0x2029,

	1988 0x202F, 0x202F, 0x205F, 0x205F, 0x3000, 0x3000

	1989 };

	1990

	1991

	1992 static const int kWordRangeCount = 8;

	1993 static const uc16 kWordRanges[kWordRangeCount] = {

	1994 '0', '9', 'A', 'Z', '_', '_', 'a', 'z'

	1995 };

	1996

	1997

	1998 static const int kDigitRangeCount = 2;

	1999 static const uc16 kDigitRanges[kDigitRangeCount] = {

	2000 '0', '9'

	2001 };

	2002

	2003

	2004 static const int kLineTerminatorRangeCount = 6;

	2005 static const uc16 kLineTerminatorRanges[kLineTerminatorRangeCount] = {

	2006 0x000A, 0x000A, 0x000D, 0x000D, 0x2028, 0x2029

	2007 };

	2008

	2009

	2010 static void AddClass(const uc16* elmv,

	2011 int elmc,

	2012 ZoneList<CharacterRange>* ranges) {

	2013 for (int i = 0; i < elmc; i += 2) {

	2014 ASSERT(elmv[i] <= elmv[i + 1]);

	2015 ranges->Add(CharacterRange(elmv[i], elmv[i + 1]));

	2016 }

	2017 }

	2018

	2019

	2020 static void AddClassNegated(const uc16 *elmv,

	2021 int elmc,

	2022 ZoneList<CharacterRange>* ranges) {

	2023 ASSERT(elmv[0] != 0x0000);

	2024 ASSERT(elmv[elmc-1] != 0xFFFF);

	2025 uc16 last = 0x0000;

	2026 for (int i = 0; i < elmc; i += 2) {

	2027 ASSERT(last <= elmv[i] - 1);

	2028 ASSERT(elmv[i] <= elmv[i + 1]);

	2029 ranges->Add(CharacterRange(last, elmv[i] - 1));

	2030 last = elmv[i + 1] + 1;

	2031 }

	2032 ranges->Add(CharacterRange(last, 0xFFFF));

	2033 }

	2034

	2035

	2036 void CharacterRange::AddClassEscape(uc16 type,

	2037 ZoneList<CharacterRange>* ranges) {

	2038 switch (type) {

	2039 case 's':

	2040 AddClass(kSpaceRanges, kSpaceRangeCount, ranges);

	2041 break;

	2042 case 'S':

	2043 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges);

	2044 break;

	2045 case 'w':

	2046 AddClass(kWordRanges, kWordRangeCount, ranges);

	2047 break;

	2048 case 'W':

	2049 AddClassNegated(kWordRanges, kWordRangeCount, ranges);

	2050 break;

	2051 case 'd':

	2052 AddClass(kDigitRanges, kDigitRangeCount, ranges);

	2053 break;

	2054 case 'D':

	2055 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges);

	2056 break;

	2057 case '.':

	2058 AddClassNegated(kLineTerminatorRanges,

	2059 kLineTerminatorRangeCount,

	2060 ranges);

	2061 break;

	2062 // This is not a character range as defined by the spec but a

	2063 // convenient shorthand for a character class that matches any

	2064 // character.

	2065 case '*':

	2066 ranges->Add(CharacterRange::Everything());

	2067 break;

	2068 default:

	2069 UNREACHABLE();

	2070 }

	2071 }

	2072

	2073

	2074 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges) {

	2075 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

	2076 if (IsSingleton()) {

	2077 // If this is a singleton we just expand the one character.

	2078 int length = uncanonicalize.get(from(), '\0', chars);

	2079 for (int i = 0; i < length; i++) {

	2080 uc32 chr = chars[i];

	2081 if (chr != from()) {

	2082 ranges->Add(CharacterRange::Singleton(chars[i]));

	2083 }

	2084 }

	2085 } else if (from() <= kRangeCanonicalizeMax

	2086 && to() <= kRangeCanonicalizeMax) {
	Mads Ager (chromium) 2008/11/25 21:09:41 Indentation is off. Indentation is off. Christian Plesner Hansen 2008/11/26 06:49:56 Fixed. Fixed.
	2087 // If this is a range we expand the characters block by block,

	2088 // expanding contiguous subranges (blocks) one at a time.

	2089 // The approach is as follows. For a given start character we

	2090 // look up the block that contains it, for instance 'a' if the

	2091 // start character is 'c'. A block is characterized by the property

	2092 // that all characters uncanonicalize in the same way as the first

	2093 // element, except that each entry in the result is incremented

	2094 // by the distance from the first element. So a-z is a block

	2095 // because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter

	2096 // uncanonicalizes to ['a' + k, 'A' + k].

	2097 // Once we've found the start point we look up its uncanonicalization

	2098 // and produce a range for each element. For instance for [c-f]

	2099 // we look up ['a', 'A'] and produce [c-f] and [C-F]. We then only

	2100 // add a range if it is not already contained in the input, so [c-f]

	2101 // will be skipped but [C-F] will be added. If this range is not

	2102 // completely contained in a block we do this for all the blocks

	2103 // covered by the range.

	2104 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];

	2105 // First, look up the block that contains the 'from' character.

	2106 int length = canonrange.get(from(), '\0', range);

	2107 if (length == 0) {

	2108 range[0] = from();

	2109 } else {

	2110 ASSERT_EQ(1, length);

	2111 }

	2112 int pos = from();

	2113 // The start of the current block. Note that except for the first

	2114 // iteration 'start' is always equal to 'pos'.

	2115 int start;

	2116 // If it is not the start point of a block the entry contains the

	2117 // offset of the character from the start point.

	2118 if ((range[0] & kStartMarker) == 0) {

	2119 start = pos - range[0];

	2120 } else {

	2121 start = pos;

	2122 }

	2123 // Then we add the ranges on at a time, incrementing the current

	2124 // position to be after the last block each time. The position

	2125 // always points to the start of a block.

	2126 while (pos < to()) {

	2127 length = canonrange.get(start, '\0', range);

	2128 if (length == 0) {

	2129 range[0] = start;

	2130 } else {

	2131 ASSERT_EQ(1, length);

	2132 }

	2133 ASSERT((range[0] & kStartMarker) != 0);

	2134 // The start point of a block contains the distance to the end

	2135 // of the range.

	2136 int block_end = start + (range[0] & kPayloadMask) - 1;

	2137 int end = (block_end > to()) ? to() : block_end;

	2138 length = uncanonicalize.get(start, '\0', range);

	2139 for (int i = 0; i < length; i++) {

	2140 uc32 c = range[i];

	2141 uc16 range_from = c + (pos - start);

	2142 uc16 range_to = c + (end - start);

	2143 if (!(from() <= range_from && range_to <= to()))

	2144 ranges->Add(CharacterRange(range_from, range_to));

	2145 }

	2146 start = pos = block_end + 1;

	2147 }

	2148 } else {

	2149 // TODO(plesner) when we've fixed the 2^11 bug in unibrow.

	2150 }

	2151 }

	2152

	2153

	2154 // -------------------------------------------------------------------

	2155 // Interest propagation

	2156

	2157

	2158 RegExpNode* RegExpNode::GetSibling(NodeInfo* info) {

	2159 for (int i = 0; i < siblings_.length(); i++) {

	2160 RegExpNode* sibling = siblings_.Get(i);

	2161 if (sibling->info()->SameInterests(info))

	2162 return sibling;

	2163 }

	2164 return NULL;

	2165 }

	2166

	2167

	2168 template <class C>

	2169 static RegExpNode* PropagateToEndpoint(C* node, NodeInfo* info) {

	2170 RegExpNode* sibling = node->GetSibling(info);

	2171 if (sibling != NULL) return sibling;

	2172 node->EnsureSiblings();

	2173 sibling = new C(*node);

	2174 sibling->info()->AdoptInterests(info);

	2175 node->AddSibling(sibling);

	2176 return sibling;

	2177 }

	2178

	2179

	2180 RegExpNode* ActionNode::PropagateInterest(NodeInfo* info) {

	2181 RegExpNode* sibling = GetSibling(info);

	2182 if (sibling != NULL) return sibling;

	2183 EnsureSiblings();

	2184 ActionNode* action = new ActionNode(*this);

	2185 action->info()->AdoptInterests(info);

	2186 AddSibling(action);

	2187 action->set_on_success(action->on_success()->PropagateInterest(info));

	2188 return action;

	2189 }

	2190

	2191

	2192 RegExpNode* ChoiceNode::PropagateInterest(NodeInfo* info) {

	2193 RegExpNode* sibling = GetSibling(info);

	2194 if (sibling != NULL) return sibling;

	2195 EnsureSiblings();

	2196 ChoiceNode* choice = new ChoiceNode(*this);

	2197 choice->info()->AdoptInterests(info);

	2198 AddSibling(choice);

	2199 ZoneList<GuardedAlternative>* old_alternatives = alternatives();

	2200 int count = old_alternatives->length();

	2201 choice->alternatives_ = new ZoneList<GuardedAlternative>(count);

	2202 for (int i = 0; i < count; i++) {

	2203 GuardedAlternative alternative = old_alternatives->at(i);

	2204 alternative.set_node(alternative.node()->PropagateInterest(info));

	2205 choice->alternatives()->Add(alternative);

	2206 }

	2207 return choice;

	2208 }

	2209

	2210

	2211 RegExpNode* EndNode::PropagateInterest(NodeInfo* info) {

	2212 return PropagateToEndpoint(this, info);

	2213 }

	2214

	2215

	2216 RegExpNode* BackReferenceNode::PropagateInterest(NodeInfo* info) {

	2217 return PropagateToEndpoint(this, info);

	2218 }

	2219

	2220

	2221 RegExpNode* TextNode::PropagateInterest(NodeInfo* info) {

	2222 return PropagateToEndpoint(this, info);

	2223 }

	2224

	2225

	2226 // -------------------------------------------------------------------

	2227 // Splay tree

	2228

	2229

	2230 OutSet* OutSet::Extend(unsigned value) {

	2231 if (Get(value))

	2232 return this;

	2233 if (successors() != NULL) {

	2234 for (int i = 0; i < successors()->length(); i++) {

	2235 OutSet* successor = successors()->at(i);

	2236 if (successor->Get(value))

	2237 return successor;

	2238 }

	2239 } else {

	2240 successors_ = new ZoneList<OutSet*>(2);

	2241 }

	2242 OutSet* result = new OutSet(first_, remaining_);

	2243 result->Set(value);

	2244 successors()->Add(result);

	2245 return result;

	2246 }

	2247

	2248

	2249 void OutSet::Set(unsigned value) {

	2250 if (value < kFirstLimit) {

	2251 first_ \|= (1 << value);

	2252 } else {

	2253 if (remaining_ == NULL)

	2254 remaining_ = new ZoneList<unsigned>(1);

	2255 if (remaining_->is_empty() \|\| !remaining_->Contains(value))

	2256 remaining_->Add(value);

	2257 }

	2258 }

	2259

	2260

	2261 bool OutSet::Get(unsigned value) {

	2262 if (value < kFirstLimit) {

	2263 return (first_ & (1 << value)) != 0;

	2264 } else if (remaining_ == NULL) {

	2265 return false;

	2266 } else {

	2267 return remaining_->Contains(value);

	2268 }

	2269 }

	2270

	2271

	2272 const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;

	2273 const DispatchTable::Entry DispatchTable::Config::kNoValue;

	2274

	2275

	2276 void DispatchTable::AddRange(CharacterRange full_range, int value) {

	2277 CharacterRange current = full_range;

	2278 if (tree()->is_empty()) {

	2279 // If this is the first range we just insert into the table.

	2280 ZoneSplayTree<Config>::Locator loc;

	2281 ASSERT_RESULT(tree()->Insert(current.from(), &loc));

	2282 loc.set_value(Entry(current.from(), current.to(), empty()->Extend(value)));

	2283 return;

	2284 }

	2285 // First see if there is a range to the left of this one that

	2286 // overlaps.

	2287 ZoneSplayTree<Config>::Locator loc;

	2288 if (tree()->FindGreatestLessThan(current.from(), &loc)) {

	2289 Entry* entry = &loc.value();

	2290 // If we've found a range that overlaps with this one, and it

	2291 // starts strictly to the left of this one, we have to fix it

	2292 // because the following code only handles ranges that start on

	2293 // or after the start point of the range we're adding.

	2294 if (entry->from() < current.from() && entry->to() >= current.from()) {

	2295 // Snap the overlapping range in half around the start point of

	2296 // the range we're adding.

	2297 CharacterRange left(entry->from(), current.from() - 1);

	2298 CharacterRange right(current.from(), entry->to());

	2299 // The left part of the overlapping range doesn't overlap.

	2300 // Truncate the whole entry to be just the left part.

	2301 entry->set_to(left.to());

	2302 // The right part is the one that overlaps. We add this part

	2303 // to the map and let the next step deal with merging it with

	2304 // the range we're adding.

	2305 ZoneSplayTree<Config>::Locator loc;

	2306 ASSERT_RESULT(tree()->Insert(right.from(), &loc));

	2307 loc.set_value(Entry(right.from(),

	2308 right.to(),

	2309 entry->out_set()));

	2310 }

	2311 }

	2312 while (current.is_valid()) {

	2313 if (tree()->FindLeastGreaterThan(current.from(), &loc) &&

	2314 (loc.value().from() <= current.to()) &&

	2315 (loc.value().to() >= current.from())) {

	2316 Entry* entry = &loc.value();

	2317 // We have overlap. If there is space between the start point of

	2318 // the range we're adding and where the overlapping range starts

	2319 // then we have to add a range covering just that space.

	2320 if (current.from() < entry->from()) {

	2321 ZoneSplayTree<Config>::Locator ins;

	2322 ASSERT_RESULT(tree()->Insert(current.from(), &ins));

	2323 ins.set_value(Entry(current.from(),

	2324 entry->from() - 1,

	2325 empty()->Extend(value)));

	2326 current.set_from(entry->from());

	2327 }

	2328 ASSERT_EQ(current.from(), entry->from());

	2329 // If the overlapping range extends beyond the one we want to add

	2330 // we have to snap the right part off and add it separately.

	2331 if (entry->to() > current.to()) {

	2332 ZoneSplayTree<Config>::Locator ins;

	2333 ASSERT_RESULT(tree()->Insert(current.to() + 1, &ins));

	2334 ins.set_value(Entry(current.to() + 1,

	2335 entry->to(),

	2336 entry->out_set()));

	2337 entry->set_to(current.to());

	2338 }

	2339 ASSERT(entry->to() <= current.to());

	2340 // The overlapping range is now completely contained by the range

	2341 // we're adding so we can just update it and move the start point

	2342 // of the range we're adding just past it.

	2343 entry->AddValue(value);

	2344 // Bail out if the last interval ended at 0xFFFF since otherwise

	2345 // adding 1 will wrap around to 0.

	2346 if (entry->to() == 0xFFFF)

	2347 break;

	2348 ASSERT(entry->to() + 1 > current.from());

	2349 current.set_from(entry->to() + 1);

	2350 } else {

	2351 // There is no overlap so we can just add the range

	2352 ZoneSplayTree<Config>::Locator ins;

	2353 ASSERT_RESULT(tree()->Insert(current.from(), &ins));

	2354 ins.set_value(Entry(current.from(),

	2355 current.to(),

	2356 empty()->Extend(value)));

	2357 break;

	2358 }

	2359 }

	2360 }

	2361

	2362

	2363 OutSet* DispatchTable::Get(uc16 value) {

	2364 ZoneSplayTree<Config>::Locator loc;

	2365 if (!tree()->FindGreatestLessThan(value, &loc))

	2366 return empty();

	2367 Entry* entry = &loc.value();

	2368 if (value <= entry->to())

	2369 return entry->out_set();

	2370 else

	2371 return empty();

	2372 }

	2373

	2374

	2375 // -------------------------------------------------------------------

	2376 // Analysis

	2377

	2378

	2379 void Analysis::EnsureAnalyzed(RegExpNode* that) {

	2380 if (that->info()->been_analyzed \|\| that->info()->being_analyzed)

	2381 return;

	2382 that->info()->being_analyzed = true;

	2383 that->Accept(this);

	2384 that->info()->being_analyzed = false;

	2385 that->info()->been_analyzed = true;

	2386 }

	2387

	2388

	2389 void Analysis::VisitEnd(EndNode* that) {

	2390 // nothing to do

	2391 }

	2392

	2393

	2394 void Analysis::VisitText(TextNode* that) {

	2395 EnsureAnalyzed(that->on_success());

	2396 EnsureAnalyzed(that->on_failure());

	2397 }

	2398

	2399

	2400 void Analysis::VisitAction(ActionNode* that) {

	2401 RegExpNode* next = that->on_success();

	2402 EnsureAnalyzed(next);

	2403 that->info()->determine_newline = next->info()->prev_determine_newline();

	2404 that->info()->determine_word = next->info()->prev_determine_word();

	2405 that->info()->determine_start = next->info()->prev_determine_start();

	2406 }

	2407

	2408

	2409 void Analysis::VisitChoice(ChoiceNode* that) {

	2410 NodeInfo* info = that->info();

	2411 for (int i = 0; i < that->alternatives()->length(); i++) {

	2412 RegExpNode* node = that->alternatives()->at(i).node();

	2413 EnsureAnalyzed(node);

	2414 info->determine_newline \|= node->info()->prev_determine_newline();

	2415 info->determine_word \|= node->info()->prev_determine_word();

	2416 info->determine_start \|= node->info()->prev_determine_start();

	2417 }

	2418 if (!that->table_calculated()) {

	2419 DispatchTableConstructor cons(that->table());

	2420 cons.BuildTable(that);

	2421 }

	2422 EnsureAnalyzed(that->on_failure());

	2423 }

	2424

	2425

	2426 void Analysis::VisitBackReference(BackReferenceNode* that) {

	2427 EnsureAnalyzed(that->on_success());

	2428 EnsureAnalyzed(that->on_failure());

	2429 }

	2430

	2431

	2432 // -------------------------------------------------------------------

	2433 // Dispatch table construction

	2434

	2435

	2436 void DispatchTableConstructor::VisitEnd(EndNode* that) {

	2437 AddRange(CharacterRange::Everything());

	2438 }

	2439

	2440

	2441 void DispatchTableConstructor::BuildTable(ChoiceNode* node) {

	2442 ASSERT(!node->table_calculated());

	2443 node->set_being_calculated(true);

	2444 ZoneList<GuardedAlternative>* alternatives = node->alternatives();

	2445 for (int i = 0; i < alternatives->length(); i++) {

	2446 set_choice_index(i);

	2447 alternatives->at(i).node()->Accept(this);

	2448 }

	2449 node->set_being_calculated(false);

	2450 node->set_table_calculated(true);

	2451 }

	2452

	2453

	2454 class AddDispatchRange {

	2455 public:

	2456 explicit AddDispatchRange(DispatchTableConstructor* constructor)

	2457 : constructor_(constructor) { }

	2458 void Call(uc32 from, DispatchTable::Entry entry);

	2459 private:

	2460 DispatchTableConstructor* constructor_;

	2461 };

	2462

	2463

	2464 void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {

	2465 CharacterRange range(from, entry.to());

	2466 constructor_->AddRange(range);

	2467 }

	2468

	2469

	2470 void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {

	2471 if (node->being_calculated())

	2472 return;

	2473 if (!node->table_calculated()) {

	2474 DispatchTableConstructor constructor(node->table());

	2475 constructor.BuildTable(node);

	2476 }

	2477 ASSERT(node->table_calculated());

	2478 AddDispatchRange adder(this);

	2479 node->table()->ForEach(&adder);

	2480 }

	2481

	2482

	2483 void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {

	2484 // TODO(160): Find the node that we refer back to and propagate its start

	2485 // set back to here. For now we just accept anything.

	2486 AddRange(CharacterRange::Everything());

	2487 }

	2488

	2489

	2490

	2491 static int CompareRangeByFrom(const CharacterRange* a,

	2492 const CharacterRange* b) {

	2493 return Spaceship<uc16>(a->from(), b->from());

	2494 }

	2495

	2496

	2497 void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {

	2498 ranges->Sort(CompareRangeByFrom);

	2499 uc16 last = 0;

	2500 for (int i = 0; i < ranges->length(); i++) {

	2501 CharacterRange range = ranges->at(i);

	2502 if (last < range.from())

	2503 AddRange(CharacterRange(last, range.from() - 1));

	2504 if (range.to() >= last) {

	2505 if (range.to() == 0xFFFF) {

	2506 return;

	2507 } else {

	2508 last = range.to() + 1;

	2509 }

	2510 }

	2511 }

	2512 AddRange(CharacterRange(last, 0xFFFF));

	2513 }

	2514

	2515

	2516 void DispatchTableConstructor::VisitText(TextNode* that) {

	2517 TextElement elm = that->elements()->at(0);

	2518 switch (elm.type) {

	2519 case TextElement::ATOM: {

	2520 uc16 c = elm.data.u_atom->data()[0];

	2521 AddRange(CharacterRange(c, c));

	2522 break;

	2523 }

	2524 case TextElement::CHAR_CLASS: {

	2525 RegExpCharacterClass* tree = elm.data.u_char_class;

	2526 ZoneList<CharacterRange>* ranges = tree->ranges();

	2527 if (tree->is_negated()) {

	2528 AddInverse(ranges);

	2529 } else {

	2530 for (int i = 0; i < ranges->length(); i++)

	2531 AddRange(ranges->at(i));

	2532 }

	2533 break;

	2534 }

	2535 default: {

	2536 UNIMPLEMENTED();

	2537 }

	2538 }

	2539 }

	2540

	2541

	2542 void DispatchTableConstructor::VisitAction(ActionNode* that) {

	2543 that->on_success()->Accept(this);

	2544 }

	2545

	2546

	2547 Handle<FixedArray> RegExpEngine::Compile(RegExpParseResult* input,

	2548 RegExpNode** node_return,

	2549 bool ignore_case) {

	2550 RegExpCompiler compiler(input->capture_count, ignore_case);

	2551 // Wrap the body of the regexp in capture #0.

	2552 RegExpNode* captured_body = RegExpCapture::ToNode(input->tree,

	2553 0,

	2554 &compiler,

	2555 compiler.accept(),

	2556 compiler.backtrack());

	2557 // Add a .*? at the beginning, outside the body capture.

	2558 // Note: We could choose to not add this if the regexp is anchored at

	2559 // the start of the input but I'm not sure how best to do that and

	2560 // since we don't even handle ^ yet I'm saving that optimization for

	2561 // later.

	2562 RegExpNode* node = RegExpQuantifier::ToNode(0,

	2563 RegExpQuantifier::kInfinity,

	2564 false,

	2565 new RegExpCharacterClass('*'),

	2566 &compiler,

	2567 captured_body,

	2568 compiler.backtrack());

	2569 if (node_return != NULL) *node_return = node;

	2570 Analysis analysis;

	2571 analysis.EnsureAnalyzed(node);

	2572

	2573 if (!FLAG_irregexp) {

	2574 return Handle<FixedArray>::null();

	2575 }

	2576

	2577 #if !(defined ARM \|\| defined __arm__ \|\| defined __thumb__)

	2578 if (FLAG_irregexp_native) { // Flag only checked in IA32 mode.

	2579 // TODO(lrn) Move compilation to a later point in the life-cycle

	2580 // of the RegExp. We don't know the type of input string yet.

	2581 // For now, always assume two-byte strings.

	2582 RegExpMacroAssemblerIA32 macro_assembler(RegExpMacroAssemblerIA32::UC16,

	2583 (input->capture_count + 1) * 2,

	2584 ignore_case);

	2585 return compiler.Assemble(&macro_assembler,

	2586 node,

	2587 input->capture_count);

	2588 }

	2589 #endif

	2590 byte codes[1024];

	2591 IrregexpAssembler assembler(Vector<byte>(codes, 1024));

	2592 RegExpMacroAssemblerIrregexp macro_assembler(&assembler);

	2593 return compiler.Assemble(&macro_assembler,

	2594 node,

	2595 input->capture_count);

	2596 }

	2597

575	2598

576 }} // namespace v8::internal	2599 }} // namespace v8::internal

OLD	NEW

« src/jsregexp.h ('K') | « src/jsregexp.h ('k') | src/jsregexp-inl.h » ('j') | src/jsregexp-inl.h » ('J')