src/third_party/jscre/pcre_compile.cpp - Issue 21504: Remove JSCRE

Side by Side Diff: src/third_party/jscre/pcre_compile.cpp

Issue 21504: Remove JSCRE (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/

Patch Set: Created 11 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
	(Empty)
1 /* This is JavaScriptCore's variant of the PCRE library. While this library

2 started out as a copy of PCRE, many of the features of PCRE have been

3 removed. This library now supports only the regular expression features

4 required by the JavaScript language specification, and has only the functions

5 needed by JavaScriptCore and the rest of WebKit.

6

7 Originally written by Philip Hazel

8 Copyright (c) 1997-2006 University of Cambridge

9 Copyright (C) 2002, 2004, 2006, 2007 Apple Inc. All rights reserved.

10 Copyright (C) 2007 Eric Seidel <eric@webkit.org>

11

12 -----------------------------------------------------------------------------

13 Redistribution and use in source and binary forms, with or without

14 modification, are permitted provided that the following conditions are met:

15

16 * Redistributions of source code must retain the above copyright notice,

17 this list of conditions and the following disclaimer.

18

19 * Redistributions in binary form must reproduce the above copyright

20 notice, this list of conditions and the following disclaimer in the

21 documentation and/or other materials provided with the distribution.

22

23 * Neither the name of the University of Cambridge nor the names of its

24 contributors may be used to endorse or promote products derived from

25 this software without specific prior written permission.

26

27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"

28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE

31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR

32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF

33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS

34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN

35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)

36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

37 POSSIBILITY OF SUCH DAMAGE.

38 -----------------------------------------------------------------------------

39 */

40

41 /* This module contains the external function jsRegExpExecute(), along with

42 supporting internal functions that are not used by other modules. */

43

44 #include "config.h"

45

46 #include "pcre_internal.h"

47

48 #include <string.h>

49 #include "ASCIICType.h"

50

51 /* Negative values for the firstchar and reqchar variables */
/* These are the sentinels stored in the firstbyte/reqbyte style variables in
   place of a real character value. */

52

53 #define REQ_UNSET (-2) /* no char matching encountered yet */

54 #define REQ_NONE (-1) /* something matching a non-fixed first char was hit */

55

56 /*************************************************

57 * Code parameters and static tables *

58 *************************************************/

59

60 /* Maximum number of items on the nested bracket stacks at compile time. This

61 applies to the nesting of all kinds of parentheses. It does not limit

62 un-nested, non-capturing parentheses. This number can be made bigger if

63 necessary - it is used to dimension one int and one unsigned char vector at

64 compile time. */

65

66 #define BRASTACK_SIZE 200 /* limit on items held on the nested bracket stacks at compile time */

67

68 namespace v8 { namespace jscre {

69

70 /* Table for handling escaped characters in the range '0'-'z'. Positive returns

71 are simple data values; negative values are for special things like \d and so

72 on. Zero means further processing is needed (for things like \x), or the escape

73 is invalid. */

74

75 static const short escapes[] = {

76 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */

77 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */

78 '@', 0, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */

79 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */

80 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */

81 0, 0, 0, '[', '\\', ']', '^', '_', /* X - _ */

82 '`', 7, -ESC_b, 0, -ESC_d, 0, '\f', 0, /* ` - g */

83 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */

84 0, 0, '\r', -ESC_s, '\t', 0, '\v', -ESC_w, /* p - w */

85 0, 0, 0 /* x - z */

86 };

87

/* Error code numbers. Each code has a name so that it can be tracked easily.
   The position of a code in this list is also the 1-based position of its
   message inside errorText(), so the two must be kept in step. */

enum ErrorCode {
    ERR0,
    ERR1,
    ERR2,
    ERR3,
    ERR4,
    ERR5,
    ERR6,
    ERR7,
    ERR8,
    ERR9,
    ERR10,
    ERR11,
    ERR12,
    ERR13,
    ERR14,
    ERR15,
    ERR16,
    ERR17
};

/* Returns the text of a compile-time error message. The result is a plain
   "char *" because it is passed to the outside world.

   All messages are packed into a single character array, each one terminated
   by a NUL, and the lookup hops from message to message until it reaches the
   one numbered by the code. Codes below 2 (that is, ERR0 and ERR1) both yield
   the first message. */

static const char* errorText(ErrorCode code)
{
    static const char errorTexts[] =
        /* 1 */
        "\\ at end of pattern\0"
        "\\c at end of pattern\0"
        "character value in \\x{...} sequence is too large\0"
        "numbers out of order in {} quantifier\0"
        /* 5 */
        "number too big in {} quantifier\0"
        "missing terminating ] for character class\0"
        "internal error: code overflow\0"
        "range out of order in character class\0"
        "nothing to repeat\0"
        /* 10 */
        "unmatched parentheses\0"
        "internal error: unexpected repeat\0"
        "unrecognized character after (?\0"
        "failed to get memory\0"
        "missing )\0"
        /* 15 */
        "reference to non-existent subpattern\0"
        "regular expression too large\0"
        "parentheses nested too deeply"
    ;

    const char* message = errorTexts;
    for (int remaining = code; remaining > 1; --remaining)
        message += strlen(message) + 1; /* hop over this message and its NUL */
    return message;
}

131

/* Bundle of "static" state that the compiling functions hand to one another.
   A freshly constructed object has every counter and bitmap at zero and
   needOuterBracket cleared. */

struct CompileData {
    CompileData()
        : top_backref(0)
        , backrefMap(0)
        , req_varyopt(0)
        , needOuterBracket(false)
        , numCapturingBrackets(0)
    {
    }

    int top_backref;            /* Maximum back reference */
    unsigned backrefMap;        /* Bitmap of low back refs */
    int req_varyopt;            /* "After variable item" flag for reqbyte */
    bool needOuterBracket;
    int numCapturingBrackets;
};

149

150 /* Definitions to allow mutual recursion */

151

152 static bool compileBracket(int, int, unsigned char, const UChar, const UCha r, ErrorCode, int, int, int*, CompileData&);

153 static bool bracketIsAnchored(const unsigned char* code);

154 static bool bracketNeedsLineStart(const unsigned char* code, unsigned captureMap , unsigned backrefMap);

155 static int bracketFindFirstAssertedCharacter(const unsigned char* code, bool ina ssert);

156

157 /*************************************************

158 * Handle escapes *

159 *************************************************/

160

161 /* This function is called when a \ has been encountered. It either returns a

162 positive value for a simple escape such as \n, or a negative value which

163 encodes one of the more complicated things such as \d. When UTF-8 is enabled,

164 a positive value greater than 255 may be returned. On entry, ptr is pointing at

165 the \. On exit, it is on the final character of the escape sequence.

166

167 Arguments:

168 ptrptr points to the pattern position pointer

169 errorcodeptr points to the errorcode variable

170 bracount number of previous extracting brackets

171 options the options bits

172 isclass true if inside a character class

173

174 Returns: zero or positive => a data character

175 negative => a special escape sequence

176 on error, errorptr is set

177 */

178

179 static int checkEscape(const UChar** ptrptr, const UChar* patternEnd, ErrorCode* errorcodeptr, int bracount, bool isclass)

180 {

181 const UChar* ptr = *ptrptr + 1;

182

183 /* If backslash is at the end of the pattern, it's an error. */

184 if (ptr == patternEnd) {

185 *errorcodeptr = ERR1;

186 *ptrptr = ptr;

187 return 0;

188 }

189

190 int c = *ptr;

191

192 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in

193 a table. A non-zero result is something that can be returned immediately.

194 Otherwise further processing may be required. */

195

196 if (c < '0' \|\| c > 'z') { /* Not alphameric */

197 } else if (int escapeValue = escapes[c - '0']) {

198 c = escapeValue;

199 if (isclass) {

200 if (-c == ESC_b)

201 c = '\b'; /* \b is backslash in a class */

202 else if (-c == ESC_B)

203 c = 'B'; /* and \B is a capital B in a class (in browsers event though ECMAScript 15.10.2.19 says it raises an error) */

204 }

205 /* Escapes that need further processing, or are illegal. */

206

207 } else {

208 switch (c) {

209 case '1':

210 case '2':

211 case '3':

212 case '4':

213 case '5':

214 case '6':

215 case '7':

216 case '8':

217 case '9':

218 /* Escape sequences starting with a non-zero digit are backrefer ences,

219 unless there are insufficient brackets, in which case they are octal

220 escape sequences. Those sequences end on the first non-octal ch aracter

221 or when we overflow 0-255, whichever comes first. */

222

223 if (!isclass) {

224 const UChar* oldptr = ptr;

225 c -= '0';

226 while ((ptr + 1 < patternEnd) && isASCIIDigit(ptr[1]) && c < = bracount)

227 c = c * 10 + *(++ptr) - '0';

228 if (c <= bracount) {

229 c = -(ESC_REF + c);

230 break;

231 }

232 ptr = oldptr; /* Put the pointer back and fall through */

233 }

234

235 /* Handle an octal number following \. If the first digit is 8 o r 9,

236 this is not octal. */

237

238 if ((c = *ptr) >= '8')

239 break;

240

241 /* \0 always starts an octal number, but we may drop through to here with a

242 larger first octal digit. */

243

244 case '0': {

245 c -= '0';

246 int i;

247 for (i = 1; i <= 2; ++i) {

248 if (ptr + i >= patternEnd \|\| ptr[i] < '0' \|\| ptr[i] > '7')

249 break;

250 int cc = c * 8 + ptr[i] - '0';

251 if (cc > 255)

252 break;

253 c = cc;

254 }

255 ptr += i - 1;

256 break;

257 }

258

259 case 'x': {

260 c = 0;

261 int i;

262 for (i = 1; i <= 2; ++i) {

263 if (ptr + i >= patternEnd \|\| !isASCIIHexDigit(ptr[i])) {

264 c = 'x';

265 i = 1;

266 break;

267 }

268 int cc = ptr[i];

269 if (cc >= 'a')

270 cc -= 32; /* Convert to upper case */

271 c = c * 16 + cc - ((cc < 'A') ? '0' : ('A' - 10));

272 }

273 ptr += i - 1;

274 break;

275 }

276

277 case 'u': {

278 c = 0;

279 int i;

280 for (i = 1; i <= 4; ++i) {

281 if (ptr + i >= patternEnd \|\| !isASCIIHexDigit(ptr[i])) {

282 c = 'u';

283 i = 1;

284 break;

285 }

286 int cc = ptr[i];

287 if (cc >= 'a')

288 cc -= 32; /* Convert to upper case */

289 c = c * 16 + cc - ((cc < 'A') ? '0' : ('A' - 10));

290 }

291 ptr += i - 1;

292 break;

293 }

294

295 case 'c':

296 if (++ptr == patternEnd) {

297 *errorcodeptr = ERR2;

298 return 0;

299 }

300 c = *ptr;

301

302 /* A letter is upper-cased; then the 0x40 bit is flipped. This c oding

303 is ASCII-specific, but then the whole concept of \cx is ASCII-s pecific. */

304 c = toASCIIUpper(c) ^ 0x40;

305 break;

306 }

307 }

308

309 *ptrptr = ptr;

310 return c;

311 }

312

313 /*************************************************

314 * Check for counted repeat *

315 *************************************************/

316

317 /* This function is called when a '{' is encountered in a place where it might

318 start a quantifier. It looks ahead to see if it really is a quantifier or not.

319 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}

320 where the ddds are digits.

321

322 Arguments:

323 p pointer to the first char after '{'

324

325 Returns: true or false

326 */

327

328 static bool isCountedRepeat(const UChar* p, const UChar* patternEnd)

329 {

330 if (p >= patternEnd \|\| !isASCIIDigit(*p))

331 return false;

332 p++;

333 while (p < patternEnd && isASCIIDigit(*p))

334 p++;

335 if (p < patternEnd && *p == '}')

336 return true;

337

338 if (p >= patternEnd \|\| *p++ != ',')

339 return false;

340 if (p < patternEnd && *p == '}')

341 return true;

342

343 if (p >= patternEnd \|\| !isASCIIDigit(*p))

344 return false;

345 p++;

346 while (p < patternEnd && isASCIIDigit(*p))

347 p++;

348

349 return (p < patternEnd && *p == '}');

350 }

351

352 /*************************************************

353 * Read repeat counts *

354 *************************************************/

355

356 /* Read an item of the form {n,m} and return the values. This is called only

357 after isCountedRepeat() has confirmed that a repeat-count quantifier exists,

358 so the syntax is guaranteed to be correct, but we need to check the values.

359

360 Arguments:

361 p pointer to first char after '{'

362 minp pointer to int for min

363 maxp pointer to int for max

364 returned as -1 if no max

365 errorcodeptr points to error code variable

366

367 Returns: pointer to '}' on success;

368 current ptr on error, with errorcodeptr set non-zero

369 */

370

371 static const UChar* readRepeatCounts(const UChar* p, int* minp, int* maxp, Error Code* errorcodeptr)

372 {

373 int min = 0;

374 int max = -1;

375

376 /* Read the minimum value and do a paranoid check: a negative value indicate s

377 an integer overflow. */

378

379 while (isASCIIDigit(*p))

380 min = min * 10 + *p++ - '0';

381 if (min < 0 \|\| min > 65535) {

382 *errorcodeptr = ERR5;

383 return p;

384 }

385

386 /* Read the maximum value if there is one, and again do a paranoid on its si ze.

387 Also, max must not be less than min. */

388

389 if (*p == '}')

390 max = min;

391 else {

392 if (*(++p) != '}') {

393 max = 0;

394 while (isASCIIDigit(*p))

395 max = max * 10 + *p++ - '0';

396 if (max < 0 \|\| max > 65535) {

397 *errorcodeptr = ERR5;

398 return p;

399 }

400 if (max < min) {

401 *errorcodeptr = ERR4;

402 return p;

403 }

404 }

405 }

406

407 /* Fill in the required variables, and pass back the pointer to the terminat ing

408 '}'. */

409

410 *minp = min;

411 *maxp = max;

412 return p;

413 }

414

415 /*************************************************

416 * Find first significant op code *

417 *************************************************/

418

419 /* This is called by several functions that scan a compiled expression looking

420 for a fixed first character, or an anchoring op code etc. It skips over things

421 that do not influence this.

422

423 Arguments:

424 code pointer to the start of the group

425 Returns: pointer to the first significant opcode

426 */

427

428 static const unsigned char* firstSignificantOpcode(const unsigned char* code)

429 {

430 while (*code == OP_BRANUMBER)

431 code += 3;

432 return code;

433 }

434

435 static const unsigned char* firstSignificantOpcodeSkippingAssertions(const unsig ned char* code)

436 {

437 while (true) {

438 switch (*code) {

439 case OP_ASSERT_NOT:

440 advanceToEndOfBracket(code);

441 code += 1 + LINK_SIZE;

442 break;

443 case OP_WORD_BOUNDARY:

444 case OP_NOT_WORD_BOUNDARY:

445 ++code;

446 break;

447 case OP_BRANUMBER:

448 code += 3;

449 break;

450 default:

451 return code;

452 }

453 }

454 }

455

456 /*************************************************

457 * Get othercase range *

458 *************************************************/

459

460 /* This function is passed the start and end of a class range, in UTF-8 mode

461 with UCP support. It searches up the characters, looking for internal ranges of

462 characters in the "other" case. Each call returns the next one, updating the

463 start address.

464

465 Arguments:

466 cptr points to starting character value; updated

467 d end value

468 ocptr where to put start of othercase range

469 odptr where to put end of othercase range

470

471 Yield: true when range returned; false when no more

472 */

473

474 static bool getOthercaseRange(int* cptr, int d, int* ocptr, int* odptr)

475 {

476 int c, othercase = 0;

477

478 for (c = *cptr; c <= d; c++) {

479 if ((othercase = kjs_pcre_ucp_othercase(c)) >= 0)

480 break;

481 }

482

483 if (c > d)

484 return false;

485

486 *ocptr = othercase;

487 int next = othercase + 1;

488

489 for (++c; c <= d; c++) {

490 if (kjs_pcre_ucp_othercase(c) != next)

491 break;

492 next++;

493 }

494

495 *odptr = next - 1;

496 *cptr = c;

497

498 return true;

499 }

500

501 /*************************************************

502 * Convert character value to UTF-8 *

503 *************************************************/

504

505 /* This function takes an integer value in the range 0 - 0x7fffffff

506 and encodes it as a UTF-8 character in 0 to 6 bytes.

507

508 Arguments:

509 cvalue the character value

510 buffer pointer to buffer for result - at least 6 bytes long

511

512 Returns: number of characters placed in the buffer

513 */

514

515 static int encodeUTF8(int cvalue, unsigned char *buffer)

516 {

517 int i;

518 for (i = 0; i < kjs_pcre_utf8_table1_size; i++)

519 if (cvalue <= kjs_pcre_utf8_table1[i])

520 break;

521 buffer += i;

522 for (int j = i; j > 0; j--) {

523 *buffer-- = 0x80 \| (cvalue & 0x3f);

524 cvalue >>= 6;

525 }

526 *buffer = kjs_pcre_utf8_table2[i] \| cvalue;

527 return i + 1;

528 }

529

530 /*************************************************

531 * Compile one branch *

532 *************************************************/

533

534 /* Scan the pattern, compiling it into the code vector.

535

536 Arguments:

537 options the option bits

538 brackets points to number of extracting brackets used

539 codeptr points to the pointer to the current code point

540 ptrptr points to the current pattern pointer

541 errorcodeptr points to error code variable

542 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)

543 reqbyteptr set to the last literal character required, else < 0

544 cd contains pointers to tables etc.

545

546 Returns: true on success

547 false, with *errorcodeptr set non-zero on error

548 */

549

550 static inline bool safelyCheckNextChar(const UChar* ptr, const UChar* patternEnd , UChar expected)

551 {

552 return ((ptr + 1 < patternEnd) && ptr[1] == expected);

553 }

554

555 static bool

556 compileBranch(int options, int* brackets, unsigned char** codeptr,

557 const UChar** ptrptr, const UChar* patternEnd, ErrorCode* errorco deptr, int *firstbyteptr,

558 int* reqbyteptr, CompileData& cd)

559 {

560 int repeat_type, op_type;

561 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */

562 int bravalue = 0;

563 int reqvary, tempreqvary;

564 int c;

565 unsigned char* code = *codeptr;

566 unsigned char* tempcode;

567 bool groupsetfirstbyte = false;

568 const UChar* ptr = *ptrptr;

569 const UChar* tempptr;

570 unsigned char* previous = NULL;

571 unsigned char classbits[32];

572

573 bool class_utf8;

574 unsigned char* class_utf8data;

575 unsigned char utf8_char[6];

576

577 /* Initialize no first byte, no required byte. REQ_UNSET means "no char

578 matching encountered yet". It gets changed to REQ_NONE if we hit something that

579 matches a non-fixed char first char; reqbyte just remains unset if we never

580 find one.

581

582 When we hit a repeat whose minimum is zero, we may have to adjust these val ues

583 to take the zero repeat into account. This is implemented by setting them t o

584 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The indivi dual

585 item types that can be repeated set these backoff variables appropriately. */

586

587 int firstbyte = REQ_UNSET;

588 int reqbyte = REQ_UNSET;

589 int zeroreqbyte = REQ_UNSET;

590 int zerofirstbyte = REQ_UNSET;

591

592 /* The variable req_caseopt contains either the REQ_IGNORE_CASE value or zer o,

593 according to the current setting of the ignores-case flag. REQ_IGNORE_CASE is a bit

594 value > 255. It is added into the firstbyte or reqbyte variables to record the

595 case status of the value. This is used only for ASCII characters. */

596

597 int req_caseopt = (options & IgnoreCaseOption) ? REQ_IGNORE_CASE : 0;

598

599 /* Switch on next character until the end of the branch */

600

601 for (;; ptr++) {

602 bool negate_class;

603 bool should_flip_negation; /* If a negative special such as \S is used, we should negate the whole class to properly support Unicode. */

604 int class_charcount;

605 int class_lastchar;

606 int skipbytes;

607 int subreqbyte;

608 int subfirstbyte;

609 int mclength;

610 unsigned char mcbuffer[8];

611

612 /* Next byte in the pattern */

613

614 c = ptr < patternEnd ? *ptr : 0;

615

616 /* Fill in length of a previous callout, except when the next thing is

617 a quantifier. */

618

619 bool is_quantifier = c == '*' \|\| c == '+' \|\| c == '?' \|\| (c == '{' && is CountedRepeat(ptr + 1, patternEnd));

620

621 switch (c) {

622 /* The branch terminates at end of string, \|, or ). */

623

624 case 0:

625 if (ptr < patternEnd)

626 goto NORMAL_CHAR;

627 // End of string; fall through

628 case '\|':

629 case ')':

630 *firstbyteptr = firstbyte;

631 *reqbyteptr = reqbyte;

632 *codeptr = code;

633 *ptrptr = ptr;

634 return true;

635

636 /* Handle single-character metacharacters. In multiline mode, ^ disa bles

637 the setting of any following char as a first character. */

638

639 case '^':

640 if (options & MatchAcrossMultipleLinesOption) {

641 if (firstbyte == REQ_UNSET)

642 firstbyte = REQ_NONE;

643 *code++ = OP_BOL;

644 } else

645 *code++ = OP_CIRC;

646 previous = NULL;

647 break;

648

649 case '$':

650 previous = NULL;

651 if (options & MatchAcrossMultipleLinesOption)

652 *code++ = OP_EOL;

653 else

654 *code++ = OP_DOLL;

655 break;

656

657 /* There can never be a first char if '.' is first, whatever happens about

658 repeats. The value of reqbyte doesn't change either. */

659

660 case '.':

661 if (firstbyte == REQ_UNSET)

662 firstbyte = REQ_NONE;

663 zerofirstbyte = firstbyte;

664 zeroreqbyte = reqbyte;

665 previous = code;

666 *code++ = OP_NOT_NEWLINE;

667 break;

668

669 /* Character classes. If the included characters are all < 256, we b uild a

670 32-byte bitmap of the permitted characters, except in the special c ase

671 where there is only one such character. For negated classes, we bui ld the

672 map as usual, then invert it at the end. However, we use a differen t opcode

673 so that data characters > 255 can be handled correctly.

674

675 If the class contains characters outside the 0-255 range, a differe nt

676 opcode is compiled. It may optionally have a bit map for characters < 256,

677 but those above are are explicitly listed afterwards. A flag byte t ells

678 whether the bitmap is present, and whether this is a negated class or not.

679 */

680

681 case '[': {

682 previous = code;

683 should_flip_negation = false;

684

685 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if

686 they are encountered at the top level, so we'll do that too. */

687

688 /* If the first character is '^', set the negation flag and skip it. */

689

690 if (ptr + 1 >= patternEnd) {

691 *errorcodeptr = ERR6;

692 return false;

693 }

694

695 if (ptr[1] == '^') {

696 negate_class = true;

697 ++ptr;

698 } else

699 negate_class = false;

700

701 /* Keep a count of chars with values < 256 so that we can optimi ze the case

702 of just a single character (as long as it's < 256). For higher valued UTF-8

703 characters, we don't yet do any optimization. */

704

705 class_charcount = 0;

706 class_lastchar = -1;

707

708 class_utf8 = false; /* No chars >= 256 */

709 class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */

710

711 /* Initialize the 32-char bit map to all zeros. We have to build the

712 map in a temporary bit of store, in case the class contains onl y 1

713 character (< 256), because in that case the compiled code doesn 't use the

714 bit map. */

715

716 memset(classbits, 0, 32 * sizeof(unsigned char));

717

718 /* Process characters until ] is reached. The first pass

719 through the regex checked the overall syntax, so we don't need to be very

720 strict here. At the start of the loop, c contains the first byt e of the

721 character. */

722

723 while ((++ptr < patternEnd) && (c = *ptr) != ']') {

724 /* Backslash may introduce a single character, or it may int roduce one

725 of the specials, which just set a flag. Escaped items are c hecked for

726 validity in the pre-compiling pass. The sequence \b is a sp ecial case.

727 Inside a class (and only there) it is treated as backspace. Elsewhere

728 it marks a word boundary. Other escapes have preset maps re ady to

729 or into the one we are building. We assume they have more t han one

730 character in them, so set class_charcount bigger than one. */

731

732 if (c == '\\') {

733 c = checkEscape(&ptr, patternEnd, errorcodeptr, cd.numCa pturingBrackets, true);

734 if (c < 0) {

735 class_charcount += 2; /* Greater than 1 is what matters */

736 switch (-c) {

737 case ESC_d:

738 for (c = 0; c < 32; c++)

739 classbits[c] \|= classBitmapForChar(c + c bit_digit);

740 continue;

741

742 case ESC_D:

743 should_flip_negation = true;

744 for (c = 0; c < 32; c++)

745 classbits[c] \|= ~classBitmapForChar(c + cbit_digit);

746 continue;

747

748 case ESC_w:

749 for (c = 0; c < 32; c++)

750 classbits[c] \|= classBitmapForChar(c + c bit_word);

751 continue;

752

753 case ESC_W:

754 should_flip_negation = true;

755 for (c = 0; c < 32; c++)

756 classbits[c] \|= ~classBitmapForChar(c + cbit_word);

757 continue;

758

759 case ESC_s:

760 for (c = 0; c < 32; c++)

761 classbits[c] \|= classBitmapForChar(c + cbit_space);

762 continue;

763

764 case ESC_S:

765 should_flip_negation = true;

766 for (c = 0; c < 32; c++)

767 classbits[c] \|= ~classBitmapForChar(c + cbit_space);

768 continue;

769

770 /* Unrecognized escapes are faulted if PCRE is running in its

771 strict mode. By default, for compatibility with Perl, they are

772 treated as literals. */

773

774 default:

775 c = ptr; / The final characte r */

776 class_charcount -= 2; /* Undo the default c ount from above */

777 }

778 }

779

780 /* Fall through if we have a single character (c >= 0). This may be

781 > 256 in UTF-8 mode. */

782

783 } /* End of backslash handling */

784

785 /* A single character may be followed by '-' to form a range . However,

786 Perl does not permit ']' to be the end of the range. A '-' character

787 here is treated as a literal. */

788

789 if ((ptr + 2 < patternEnd) && ptr[1] == '-' && ptr[2] != ']' ) {

790 ptr += 2;

791

792 int d = *ptr;

793

794 /* The second part of a range can be a single-character escape, but

795 not any of the other escapes. Perl 5.6 treats a hyphen as a literal

796 in such circumstances. */

797

798 if (d == '\\') {

799 const UChar* oldptr = ptr;

800 d = checkEscape(&ptr, patternEnd, errorcodeptr, cd.n umCapturingBrackets, true);

801

802 /* \X is literal X; any other special means the '-' was literal */

803 if (d < 0) {

804 ptr = oldptr - 2;

805 goto LONE_SINGLE_CHARACTER; /* A few lines belo w */

806 }

807 }

808

809 /* The check that the two values are in the correct orde r happens in

810 the pre-pass. Optimize one-character ranges */

811

812 if (d == c)

813 goto LONE_SINGLE_CHARACTER; /* A few lines below */

814

815 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless

816 matching, we have to use an XCLASS with extra data item s. Caseless

817 matching for characters > 127 is available only if UCP support is

818 available. */

819

820 if ((d > 255 \|\| ((options & IgnoreCaseOption) && d > 127 ))) {

821 class_utf8 = true;

822

823 /* With UCP support, we can find the other case equi valents of

824 the relevant characters. There may be several range s. Optimize how

825 they fit with the basic range. */

826

827 if (options & IgnoreCaseOption) {

828 int occ, ocd;

829 int cc = c;

830 int origd = d;

831 while (getOthercaseRange(&cc, origd, &occ, &ocd) ) {

832 if (occ >= c && ocd <= d)

833 continue; /* Skip embedded ranges */

834

835 if (occ < c && ocd >= c - 1) /* Exte nd the basic range */

836 { /* if the re is overlap, */

837 c = occ; /* no ting that if occ < c */

838 continue; /* we can't have ocd > d */

839 } /* becaus e a subrange is */

840 if (ocd > d && occ <= d + 1) /* alwa ys shorter than */

841 { /* the ba sic range. */

842 d = ocd;

843 continue;

844 }

845

846 if (occ == ocd)

847 *class_utf8data++ = XCL_SINGLE;

848 else {

849 *class_utf8data++ = XCL_RANGE;

850 class_utf8data += encodeUTF8(occ, class_ utf8data);

851 }

852 class_utf8data += encodeUTF8(ocd, class_utf8 data);

853 }

854 }

855

856 /* Now record the original range, possibly modified for UCP caseless

857 overlapping ranges. */

858

859 *class_utf8data++ = XCL_RANGE;

860 class_utf8data += encodeUTF8(c, class_utf8data);

861 class_utf8data += encodeUTF8(d, class_utf8data);

862

863 /* With UCP support, we are done. Without UCP suppor t, there is no

864 caseless matching for UTF-8 characters > 127; we ca n use the bit map

865 for the smaller ones. */

866

867 continue; /* With next character in the class */

868 }

869

870 /* We use the bit map for all cases when not in UTF-8 mo de; else

871 ranges that lie entirely within 0-127 when there is UCP support; else

872 for partial ranges without UCP support. */

873

874 for (; c <= d; c++) {

875 classbits[c/8] \|= (1 << (c&7));

876 if (options & IgnoreCaseOption) {

877 int uc = flipCase(c);

878 classbits[uc/8] \|= (1 << (uc&7));

879 }

880 class_charcount++; /* in case a one-c har range */

881 class_lastchar = c;

882 }

883

884 continue; /* Go get the next char in the class */

885 }

886

887 /* Handle a lone single character - we can get here for a no rmal

888 non-escape char, or after \ that introduces a single charac ter or for an

889 apparent range that isn't. */

890

891 LONE_SINGLE_CHARACTER:

892

893 /* Handle a character that cannot go in the bit map */

894

895 if ((c > 255 \|\| ((options & IgnoreCaseOption) && c > 127))) {

896 class_utf8 = true;

897 *class_utf8data++ = XCL_SINGLE;

898 class_utf8data += encodeUTF8(c, class_utf8data);

899

900 if (options & IgnoreCaseOption) {

901 int othercase;

902 if ((othercase = kjs_pcre_ucp_othercase(c)) >= 0) {

903 *class_utf8data++ = XCL_SINGLE;

904 class_utf8data += encodeUTF8(othercase, class_ut f8data);

905 }

906 }

907 } else {

908 /* Handle a single-byte character */

909 classbits[c/8] \|= (1 << (c&7));

910 if (options & IgnoreCaseOption) {

911 c = flipCase(c);

912 classbits[c/8] \|= (1 << (c&7));

913 }

914 class_charcount++;

915 class_lastchar = c;

916 }

917 }

918

919 /* If class_charcount is 1, we saw precisely one character whose value is

920 less than 256. In non-UTF-8 mode we can always optimize. In UTF -8 mode, we

921 can optimize the negative case only if there were no characters >= 128

922 because OP_NOT and the related opcodes like OP_NOTSTAR operate on

923 single-bytes only. This is an historical hangover. Maybe one da y we can

924 tidy these opcodes to handle multi-byte characters.

925

926 The optimization throws away the bit map. We turn the item into a

927 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's neg ative. Note

928 that OP_NOT does not support multibyte characters. In the posit ive case, it

929 can cause firstbyte to be set. Otherwise, there can be no first char if

930 this item is first, whatever repeat count may follow. In the ca se of

931 reqbyte, save the previous value for reinstating. */

932

933 if (class_charcount == 1 && (!class_utf8 && (!negate_class \|\| cl ass_lastchar < 128))) {

934 zeroreqbyte = reqbyte;

935

936 /* The OP_NOT opcode works on one-byte characters only. */

937

938 if (negate_class) {

939 if (firstbyte == REQ_UNSET)

940 firstbyte = REQ_NONE;

941 zerofirstbyte = firstbyte;

942 *code++ = OP_NOT;

943 *code++ = class_lastchar;

944 break;

945 }

946

947 /* For a single, positive character, get the value into c, a nd

948 then we can handle this with the normal one-character code. */

949

950 c = class_lastchar;

951 goto NORMAL_CHAR;

952 } /* End of 1-char optimization */

953

954 /* The general case - not the one-char optimization. If this is the first

955 thing in the branch, there can be no first char setting, whatev er the

956 repeat count. Any reqbyte setting must remain unchanged after a ny kind of

957 repeat. */

958

959 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;

960 zerofirstbyte = firstbyte;

961 zeroreqbyte = reqbyte;

962

963 /* If there are characters with values > 255, we have to compile an

964 extended class, with its own opcode. If there are no characters < 256,

965 we can omit the bitmap. */

966

967 if (class_utf8 && !should_flip_negation) {

968 *class_utf8data++ = XCL_END; /* Marks the end of extra data */

969 *code++ = OP_XCLASS;

970 code += LINK_SIZE;

971 *code = negate_class? XCL_NOT : 0;

972

973 /* If the map is required, install it, and move on to the en d of

974 the extra data */

975

976 if (class_charcount > 0) {

977 *code++ \|= XCL_MAP;

978 memcpy(code, classbits, 32);

979 code = class_utf8data;

980 }

981

982 /* If the map is not required, slide down the extra data. */

983

984 else {

985 int len = class_utf8data - (code + 33);

986 memmove(code + 1, code + 33, len);

987 code += len + 1;

988 }

989

990 /* Now fill in the complete length of the item */

991

992 putLinkValue(previous + 1, code - previous);

993 break; /* End of class handling */

994 }

995

996 /* If there are no characters > 255, negate the 32-byte map if n ecessary,

997 and copy it into the code vector. If this is the first thing in the branch,

998 there can be no first char setting, whatever the repeat count. Any reqbyte

999 setting must remain unchanged after any kind of repeat. */

1000

1001 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP _NCLASS;

1002 if (negate_class)

1003 for (c = 0; c < 32; c++)

1004 code[c] = ~classbits[c];

1005 else

1006 memcpy(code, classbits, 32);

1007 code += 32;

1008 break;

1009 }

1010

1011 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this

1012 has been tested above. */

1013

1014 case '{':

1015 if (!is_quantifier)

1016 goto NORMAL_CHAR;

1017 ptr = readRepeatCounts(ptr + 1, &repeat_min, &repeat_max, errorc odeptr);

1018 if (*errorcodeptr)

1019 goto FAILED;

1020 goto REPEAT;

1021

1022 case '*':

1023 repeat_min = 0;

1024 repeat_max = -1;

1025 goto REPEAT;

1026

1027 case '+':

1028 repeat_min = 1;

1029 repeat_max = -1;

1030 goto REPEAT;

1031

1032 case '?':

1033 repeat_min = 0;

1034 repeat_max = 1;

1035

1036 REPEAT:

1037 if (!previous) {

1038 *errorcodeptr = ERR9;

1039 goto FAILED;

1040 }

1041

1042 if (repeat_min == 0) {

1043 firstbyte = zerofirstbyte; /* Adjust for zero repeat */

1044 reqbyte = zeroreqbyte; /* Ditto */

1045 }

1046

1047 /* Remember whether this is a variable length repeat */

1048

1049 reqvary = (repeat_min == repeat_max) ? 0 : REQ_VARY;

1050

1051 op_type = 0; /* Default single-char op codes */

1052

1053 /* Save start of previous item, in case we have to move it up to make space

1054 for an inserted OP_ONCE for the additional '+' extension. */

1055 /* FIXME: Probably don't need this because we don't use OP_ONCE. */

1056

1057 tempcode = previous;

1058

1059 /* In PCRE, a following '+' makes the quantifier possessive (implying

1060 greediness whatever the setting of the PCRE_UNGREEDY option), and a

1061 following '?' makes it a minimizing repeat (or, with PCRE_UNGREEDY set,

1062 the other way round). The code below only checks for '?': if present, we

1063 change the repeat type to the non-default (minimizing) one. */

1064

1065 if (safelyCheckNextChar(ptr, patternEnd, '?')) {

1066 repeat_type = 1;

1067 ptr++;

1068 } else

1069 repeat_type = 0;

1070

1071 /* If previous was a character match, abolish the item and generate a

1072 repeat item instead. If a char item has a minimum of more than one, ensure

1073 that it is set in reqbyte - it might not be if a sequence such as x{3} is

1074 the first thing in a branch because the x will have gone into firstbyte

1075 instead. */

1076

1077 if (*previous == OP_CHAR || *previous == OP_CHAR_IGNORING_CASE) {

1078 /* Deal with UTF-8 characters that take up more than one byt e. It's

1079 easier to write this out separately than try to macrify it. Use c to

1080 hold the length of the character in bytes, plus 0x80 to fla g that it's a

1081 length rather than a small character. */

1082

1083 if (code[-1] & 0x80) {

1084 unsigned char *lastchar = code - 1;

1085 while((*lastchar & 0xc0) == 0x80)

1086 lastchar--;

1087 c = code - lastchar; /* Length of UTF-8 chara cter */

1088 memcpy(utf8_char, lastchar, c); /* Save the char */

1089 c \|= 0x80; /* Flag c as a length */

1090 }

1091 else {

1092 c = code[-1];

1093 if (repeat_min > 1)

1094 reqbyte = c \| req_caseopt \| cd.req_varyopt;

1095 }

1096

1097 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single char acter types */

1098 }

1099

1100 else if (*previous == OP_ASCII_CHAR || *previous == OP_ASCII_LETTER_IGNORING_CASE) {

1101 c = previous[1];

1102 if (repeat_min > 1)

1103 reqbyte = c \| req_caseopt \| cd.req_varyopt;

1104 goto OUTPUT_SINGLE_REPEAT;

1105 }

1106

1107 /* If previous was a single negated character ([^a] or similar), we use

1108 one of the special opcodes, replacing it. The code is shared with single-

1109 character repeats by setting op_type to add a suitable offset into

1110 repeat_type. OP_NOT is currently used only for single-byte chars. */

1111

1112 else if (*previous == OP_NOT) {

1113 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */

1114 c = previous[1];

1115 goto OUTPUT_SINGLE_REPEAT;

1116 }

1117

1118 /* If previous was a character type match (\d or similar), aboli sh it and

1119 create a suitable repeat item. The code is shared with single-c haracter

1120 repeats by setting op_type to add a suitable offset into repeat _type. */

1121

1122 else if (*previous <= OP_NOT_NEWLINE) {

1123 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */

1124 c = *previous;

1125

1126 OUTPUT_SINGLE_REPEAT:

1127 int prop_type = -1;

1128 int prop_value = -1;

1129

1130 unsigned char* oldcode = code;

1131 code = previous; /* Usually overwrite previ ous item */

1132

1133 /* If the maximum is zero then the minimum must also be zero ; Perl allows

1134 this case, so we do too - by simply omitting the item altog ether. */

1135

1136 if (repeat_max == 0)

1137 goto END_REPEAT;

1138

1139 /* Combine the op_type with the repeat_type */

1140

1141 repeat_type += op_type;

1142

1143 /* A minimum of zero is handled either as the special case * or ?, or as

1144 an UPTO, with the maximum given. */

1145

1146 if (repeat_min == 0) {

1147 if (repeat_max == -1)

1148 *code++ = OP_STAR + repeat_type;

1149 else if (repeat_max == 1)

1150 *code++ = OP_QUERY + repeat_type;

1151 else {

1152 *code++ = OP_UPTO + repeat_type;

1153 put2ByteValueAndAdvance(code, repeat_max);

1154 }

1155 }

1156

1157 /* A repeat minimum of 1 is optimized into some special cases. If the

1158 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is

1159 left in place and, if the maximum is greater than 1, we use OP_UPTO with

1160 one less than the maximum. */

1161

1162 else if (repeat_min == 1) {

1163 if (repeat_max == -1)

1164 *code++ = OP_PLUS + repeat_type;

1165 else {

1166 code = oldcode; /* leave previous it em in place */

1167 if (repeat_max == 1)

1168 goto END_REPEAT;

1169 *code++ = OP_UPTO + repeat_type;

1170 put2ByteValueAndAdvance(code, repeat_max - 1);

1171 }

1172 }

1173

1174 /* The case {n,n} is just an EXACT, while the general case { n,m} is

1175 handled as an EXACT followed by an UPTO. */

1176

1177 else {

1178 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */

1179 put2ByteValueAndAdvance(code, repeat_min);

1180

1181 /* If the maximum is unlimited, insert an OP_STAR. Befor e doing so,

1182 we have to insert the character for the previous code. For a repeated

1183 Unicode property match, there are two extra bytes that define the

1184 required property. In UTF-8 mode, long characters have their length in

1185 c, with the 0x80 bit as a flag. */

1186

1187 if (repeat_max < 0) {

1188 if (c >= 128) {

1189 memcpy(code, utf8_char, c & 7);

1190 code += c & 7;

1191 } else {

1192 *code++ = c;

1193 if (prop_type >= 0) {

1194 *code++ = prop_type;

1195 *code++ = prop_value;

1196 }

1197 }

1198 *code++ = OP_STAR + repeat_type;

1199 }

1200

1201 /* Else insert an UPTO if the max is greater than the mi n, again

1202 preceded by the character, for the previously inserted code. */

1203

1204 else if (repeat_max != repeat_min) {

1205 if (c >= 128) {

1206 memcpy(code, utf8_char, c & 7);

1207 code += c & 7;

1208 } else

1209 *code++ = c;

1210 if (prop_type >= 0) {

1211 *code++ = prop_type;

1212 *code++ = prop_value;

1213 }

1214 repeat_max -= repeat_min;

1215 *code++ = OP_UPTO + repeat_type;

1216 put2ByteValueAndAdvance(code, repeat_max);

1217 }

1218 }

1219

1220 /* The character or character type itself comes last in all cases. */

1221

1222 if (c >= 128) {

1223 memcpy(code, utf8_char, c & 7);

1224 code += c & 7;

1225 } else

1226 *code++ = c;

1227

1228 /* For a repeated Unicode property match, there are two extr a bytes that

1229 define the required property. */

1230

1231 if (prop_type >= 0) {

1232 *code++ = prop_type;

1233 *code++ = prop_value;

1234 }

1235 }

1236

1237 /* If previous was a character class or a back reference, we put the repeat

1238 stuff after it, but just skip the item if the repeat was {0,0}. */

1239

1240 else if (*previous == OP_CLASS \|\|

1241 *previous == OP_NCLASS \|\|

1242 *previous == OP_XCLASS \|\|

1243 *previous == OP_REF)

1244 {

1245 if (repeat_max == 0) {

1246 code = previous;

1247 goto END_REPEAT;

1248 }

1249

1250 if (repeat_min == 0 && repeat_max == -1)

1251 *code++ = OP_CRSTAR + repeat_type;

1252 else if (repeat_min == 1 && repeat_max == -1)

1253 *code++ = OP_CRPLUS + repeat_type;

1254 else if (repeat_min == 0 && repeat_max == 1)

1255 *code++ = OP_CRQUERY + repeat_type;

1256 else {

1257 *code++ = OP_CRRANGE + repeat_type;

1258 put2ByteValueAndAdvance(code, repeat_min);

1259 if (repeat_max == -1)

1260 repeat_max = 0; /* 2-byte encoding for max */

1261 put2ByteValueAndAdvance(code, repeat_max);

1262 }

1263 }

1264

1265 /* If previous was a bracket group, we may have to replicate it in certain

1266 cases. */

1267

1268 else if (*previous >= OP_BRA) {

1269 int ketoffset = 0;

1270 int len = code - previous;

1271 unsigned char* bralink = NULL;

1272

1273 /* If the maximum repeat count is unlimited, find the end of the bracket

1274 by scanning through from the start, and compute the offset back to it

1275 from the current code pointer. There may be an OP_OPT setti ng following

1276 the final KET, so we can't find the end just by going back from the code

1277 pointer. */

1278

1279 if (repeat_max == -1) {

1280 const unsigned char* ket = previous;

1281 advanceToEndOfBracket(ket);

1282 ketoffset = code - ket;

1283 }

1284

1285 /* The case of a zero minimum is special because of the need to stick

1286 OP_BRAZERO in front of it, and because the group appears on ce in the

1287 data, whereas in other cases it appears the minimum number of times. For

1288 this reason, it is simplest to treat this case separately, as otherwise

1289 the code gets far too messy. There are several special subc ases when the

1290 minimum is zero. */

1291

1292 if (repeat_min == 0) {

1293 /* If the maximum is also zero, we just omit the group f rom the output

1294 altogether. */

1295

1296 if (repeat_max == 0) {

1297 code = previous;

1298 goto END_REPEAT;

1299 }

1300

1301 /* If the maximum is 1 or unlimited, we just have to sti ck in the

1302 BRAZERO and do no more at this point. However, we do ne ed to adjust

1303 any OP_RECURSE calls inside the group that refer to the group itself or

1304 any internal group, because the offset is from the star t of the whole

1305 regex. Temporarily terminate the pattern while doing th is. */

1306

1307 if (repeat_max <= 1) {

1308 *code = OP_END;

1309 memmove(previous+1, previous, len);

1310 code++;

1311 *previous++ = OP_BRAZERO + repeat_type;

1312 }

1313

1314 /* If the maximum is greater than 1 and limited, we have to replicate

1315 in a nested fashion, sticking OP_BRAZERO before each se t of brackets.

1316 The first one has to be handled carefully because it's the original

1317 copy, which has to be moved up. The remainder can be ha ndled by code

1318 that is common with the non-zero minimum case below. We have to

1319 adjust the value of repeat_max, since one less copy is required. */

1320

1321 else {

1322 *code = OP_END;

1323 memmove(previous + 2 + LINK_SIZE, previous, len);

1324 code += 2 + LINK_SIZE;

1325 *previous++ = OP_BRAZERO + repeat_type;

1326 *previous++ = OP_BRA;

1327

1328 /* We chain together the bracket offset fields that have to be

1329 filled in later when the ends of the brackets are r eached. */

1330

1331 int offset = (!bralink) ? 0 : previous - bralink;

1332 bralink = previous;

1333 putLinkValueAllowZeroAndAdvance(previous, offset);

1334 }

1335

1336 repeat_max--;

1337 }

1338

1339 /* If the minimum is greater than zero, replicate the group as many

1340 times as necessary, and adjust the maximum to the number of subsequent

1341 copies that we need. If we set a first char from the group, and didn't

1342 set a required char, copy the latter from the former. */

1343

1344 else {

1345 if (repeat_min > 1) {

1346 if (groupsetfirstbyte && reqbyte < 0)

1347 reqbyte = firstbyte;

1348 for (int i = 1; i < repeat_min; i++) {

1349 memcpy(code, previous, len);

1350 code += len;

1351 }

1352 }

1353 if (repeat_max > 0)

1354 repeat_max -= repeat_min;

1355 }

1356

1357 /* This code is common to both the zero and non-zero minimum cases. If

1358 the maximum is limited, it replicates the group in a nested fashion,

1359 remembering the bracket starts on a stack. In the case of a zero minimum,

1360 the first one was set up above. In all cases the repeat_max now specifies

1361 the number of additional copies needed. */

1362

1363 if (repeat_max >= 0) {

1364 for (int i = repeat_max - 1; i >= 0; i--) {

1365 *code++ = OP_BRAZERO + repeat_type;

1366

1367 /* All but the final copy start a new nesting, maint aining the

1368 chain of brackets outstanding. */

1369

1370 if (i != 0) {

1371 *code++ = OP_BRA;

1372 int offset = (!bralink) ? 0 : code - bralink;

1373 bralink = code;

1374 putLinkValueAllowZeroAndAdvance(code, offset);

1375 }

1376

1377 memcpy(code, previous, len);

1378 code += len;

1379 }

1380

1381 /* Now chain through the pending brackets, and fill in t heir length

1382 fields (which are holding the chain links pro tem). */

1383

1384 while (bralink) {

1385 int offset = code - bralink + 1;

1386 unsigned char* bra = code - offset;

1387 int oldlinkoffset = getLinkValueAllowZero(bra + 1);

1388 bralink = (!oldlinkoffset) ? 0 : bralink - oldlinkof fset;

1389 *code++ = OP_KET;

1390 putLinkValueAndAdvance(code, offset);

1391 putLinkValue(bra + 1, offset);

1392 }

1393 }

1394

1395 /* If the maximum is unlimited, set a repeater in the final copy. We

1396 can't just offset backwards from the current code point, be cause we

1397 don't know if there's been an options resetting after the k et. The

1398 correct offset was computed above. */

1399

1400 else

1401 code[-ketoffset] = OP_KETRMAX + repeat_type;

1402 }

1403

1404 /* Else there's some kind of shambles */

1405

1406 else {

1407 *errorcodeptr = ERR11;

1408 goto FAILED;

1409 }

1410

1411 /* In all cases we no longer have a previous item. We also set the

1412 "follows varying string" flag for subsequently encountered reqbytes if

1413 it isn't already set and we have just passed a varying length item. */

1414

1415 END_REPEAT:

1416 previous = NULL;

1417 cd.req_varyopt \|= reqvary;

1418 break;

1419

1420 /* Start of nested bracket sub-expression, or comment or lookahead o r

1421 lookbehind or option setting or condition. First deal with special things

1422 that can come after a bracket; all are introduced by ?, and the app earance

1423 of any of them means that this is not a referencing group. They wer e

1424 checked for validity in the first pass over the string, so we don't have to

1425 check for syntax errors here. */

1426

1427 case '(':

1428 skipbytes = 0;

1429

1430 if (*(++ptr) == '?') {

1431 switch (*(++ptr)) {

1432 case ':': /* Non-extracting bracket */

1433 bravalue = OP_BRA;

1434 ptr++;

1435 break;

1436

1437 case '=': /* Positive lookahead */

1438 bravalue = OP_ASSERT;

1439 ptr++;

1440 break;

1441

1442 case '!': /* Negative lookahead */

1443 bravalue = OP_ASSERT_NOT;

1444 ptr++;

1445 break;

1446

1447 /* Character after (? not specially recognized */

1448

1449 default:

1450 *errorcodeptr = ERR12;

1451 goto FAILED;

1452 }

1453 }

1454

1455 /* Else we have a referencing group; adjust the opcode. If the b racket

1456 number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and

1457 arrange for the true number to follow later, in an OP_BRANUMBER item. */

1458

1459 else {

1460 if (++(*brackets) > EXTRACT_BASIC_MAX) {

1461 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;

1462 code[1 + LINK_SIZE] = OP_BRANUMBER;

1463 put2ByteValue(code + 2 + LINK_SIZE, *brackets);

1464 skipbytes = 3;

1465 }

1466 else

1467 bravalue = OP_BRA + *brackets;

1468 }

1469

1470 /* Process nested bracketed re. Assertions may not be repeated, but other

1471 kinds can be. We copy code into a non-variable in order to be a ble

1472 to pass its address because some compilers complain otherwise. Pass in a

1473 new setting for the ims options if they have changed. */

1474

1475 previous = (bravalue >= OP_BRAZERO) ? code : 0;

1476 *code = bravalue;

1477 tempcode = code;

1478 tempreqvary = cd.req_varyopt; /* Save value before bracket * /

1479

1480 if (!compileBracket(

1481 options,

1482 brackets, /* Extracting b racket count */

1483 &tempcode, /* Where to put code (updated) */

1484 &ptr, /* Input pointe r (updated) */

1485 patternEnd,

1486 errorcodeptr, /* Where to put an error message */

1487 skipbytes, /* Skip over OP _BRANUMBER */

1488 &subfirstbyte, /* For possible first char */

1489 &subreqbyte, /* For possible last char */

1490 cd)) /* Tables block */

1491 goto FAILED;

1492

1493 /* At the end of compiling, code is still pointing to the start of the

1494 group, while tempcode has been updated to point past the end of the group

1495 and any option resetting that may follow it. The pattern pointe r (ptr)

1496 is on the bracket. */

1497

1498 /* Handle updating of the required and first characters. Update for normal

1499 brackets of all kinds, and conditions with two branches (see co de above).

1500 If the bracket is followed by a quantifier with zero repeat, we have to

1501 back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the

1502 main loop so that they can be accessed for the back off. */

1503

1504 zeroreqbyte = reqbyte;

1505 zerofirstbyte = firstbyte;

1506 groupsetfirstbyte = false;

1507

1508 if (bravalue >= OP_BRA) {

1509 /* If we have not yet set a firstbyte in this branch, take i t from the

1510 subpattern, remembering that it was set here so that a repe at of more

1511 than one can replicate it as reqbyte if necessary. If the s ubpattern has

1512 no firstbyte, set "none" for the whole branch. In both case s, a zero

1513 repeat forces firstbyte to "none". */

1514

1515 if (firstbyte == REQ_UNSET) {

1516 if (subfirstbyte >= 0) {

1517 firstbyte = subfirstbyte;

1518 groupsetfirstbyte = true;

1519 }

1520 else

1521 firstbyte = REQ_NONE;

1522 zerofirstbyte = REQ_NONE;

1523 }

1524

1525 /* If firstbyte was previously set, convert the subpattern's firstbyte

1526 into reqbyte if there wasn't one, using the vary flag that was in

1527 existence beforehand. */

1528

1529 else if (subfirstbyte >= 0 && subreqbyte < 0)

1530 subreqbyte = subfirstbyte \| tempreqvary;

1531

1532 /* If the subpattern set a required byte (or set a first byt e that isn't

1533 really the first byte - see above), set it. */

1534

1535 if (subreqbyte >= 0)

1536 reqbyte = subreqbyte;

1537 }

1538

1539 /* For a forward assertion, we take the reqbyte, if set. This ca n be

1540 helpful if the pattern that follows the assertion doesn't set a different

1541 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte

1542 for an assertion, however because it leads to incorrect effect for patterns

1543 such as /(?=a)a.+/ when the "real" "a" would then become a reqb yte instead

1544 of a firstbyte. This is overcome by a scan at the end if there' s no

1545 firstbyte, looking for an asserted first char. */

1546

1547 else if (bravalue == OP_ASSERT && subreqbyte >= 0)

1548 reqbyte = subreqbyte;

1549

1550 /* Now update the main code pointer to the end of the group. */

1551

1552 code = tempcode;

1553

1554 /* Error if hit end of pattern */

1555

1556 if (ptr >= patternEnd \|\| *ptr != ')') {

1557 *errorcodeptr = ERR14;

1558 goto FAILED;

1559 }

1560 break;

1561

1562 /* Check \ for being a real metacharacter; if not, fall through and handle

1563 it as a data character at the start of a string. Escape items are c hecked

1564 for validity in the pre-compiling pass. */

1565

1566 case '\\':

1567 tempptr = ptr;

1568 c = checkEscape(&ptr, patternEnd, errorcodeptr, cd.numCapturingB rackets, false);

1569

1570 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values

1571 are arranged to be the negation of the corresponding OP_values. For the

1572 back references, the values are ESC_REF plus the reference numb er. Only

1573 back references and those types that consume a character may be repeated.

1574 We can test for values between ESC_b and ESC_w for the latter; this may

1575 have to change if any new ones are ever created. */

1576

1577 if (c < 0) {

1578 /* For metasequences that actually match a character, we dis able the

1579 setting of a first character if it hasn't already been set. */

1580

1581 if (firstbyte == REQ_UNSET && -c > ESC_b && -c <= ESC_w)

1582 firstbyte = REQ_NONE;

1583

1584 /* Set values to reset to if this is followed by a zero repe at. */

1585

1586 zerofirstbyte = firstbyte;

1587 zeroreqbyte = reqbyte;

1588

1589 /* Back references are handled specially */

1590

1591 if (-c >= ESC_REF) {

1592 int number = -c - ESC_REF;

1593 previous = code;

1594 *code++ = OP_REF;

1595 put2ByteValueAndAdvance(code, number);

1596 }

1597

1598 /* For the rest, we can obtain the OP value by negating the escape

1599 value */

1600

1601 else {

1602 previous = (-c > ESC_b && -c <= ESC_w) ? code : NULL;

1603 *code++ = -c;

1604 }

1605 continue;

1606 }

1607

1608 /* Fall through. */

1609

1610 /* Handle a literal character. It is guaranteed not to be whites pace or #

1611 when the extended flag is set. If we are in UTF-8 mode, it may be a

1612 multi-byte literal character. */

1613

1614 default:

1615 NORMAL_CHAR:

1616

1617 previous = code;

1618

1619 if (c < 128) {

1620 mclength = 1;

1621 mcbuffer[0] = c;

1622

1623 if ((options & IgnoreCaseOption) && (c \| 0x20) >= 'a' && (c \| 0x20) <= 'z') {

1624 *code++ = OP_ASCII_LETTER_IGNORING_CASE;

1625 *code++ = c \| 0x20;

1626 } else {

1627 *code++ = OP_ASCII_CHAR;

1628 *code++ = c;

1629 }

1630 } else {

1631 mclength = encodeUTF8(c, mcbuffer);

1632

1633 *code++ = (options & IgnoreCaseOption) ? OP_CHAR_IGNORING_CA SE : OP_CHAR;

1634 for (c = 0; c < mclength; c++)

1635 *code++ = mcbuffer[c];

1636 }

1637

1638 /* Set the first and required bytes appropriately. If no previou s first

1639 byte, set it from this character, but revert to none on a zero repeat.

1640 Otherwise, leave the firstbyte value alone, and don't change it on a zero

1641 repeat. */

1642

1643 if (firstbyte == REQ_UNSET) {

1644 zerofirstbyte = REQ_NONE;

1645 zeroreqbyte = reqbyte;

1646

1647 /* If the character is more than one byte long, we can set f irstbyte

1648 only if it is not to be matched caselessly. */

1649

1650 if (mclength == 1 \|\| req_caseopt == 0) {

1651 firstbyte = mcbuffer[0] \| req_caseopt;

1652 if (mclength != 1)

1653 reqbyte = code[-1] \| cd.req_varyopt;

1654 }

1655 else

1656 firstbyte = reqbyte = REQ_NONE;

1657 }

1658

1659 /* firstbyte was previously set; we can set reqbyte only if the length is

1660 1 or the matching is caseful. */

1661

1662 else {

1663 zerofirstbyte = firstbyte;

1664 zeroreqbyte = reqbyte;

1665 if (mclength == 1 \|\| req_caseopt == 0)

1666 reqbyte = code[-1] \| req_caseopt \| cd.req_varyopt;

1667 }

1668

1669 break; /* End of literal character handling */

1670 }

1671 } /* end of big loop */

1672

1673 /* Control never reaches here by falling through, only by a goto for all the

1674 error states. Pass back the position in the pattern so that it can be displ ayed

1675 to the user for diagnosing the error. */

1676

1677 FAILED:

1678 *ptrptr = ptr;

1679 return false;

1680 }

1681

1682 /*************************************************

1683 * Compile sequence of alternatives *

1684 *************************************************/

1685

1686 /* On entry, ptr is pointing past the bracket character, but on return

1687 it points to the closing bracket, or vertical bar, or end of string.

1688 The code variable is pointing at the byte into which the BRA operator has been

1689 stored. If the ims options are changed at the start (for a (?ims: group) or

1690 during any branch, we need to insert an OP_OPT item at the start of every

1691 following branch to ensure they get set correctly at run time, and also pass

1692 the new options into every subsequent branch compile.

1693

1694 Argument:

1695 options option bits, including any changes for this subpattern

1696 brackets -> int containing the number of extracting brackets used

1697 codeptr -> the address of the current code pointer

1698 ptrptr -> the address of the current pattern pointer

1699 errorcodeptr -> pointer to error code variable

1700 skipbytes skip this many bytes at start (for OP_BRANUMBER)

1701 firstbyteptr place to put the first required character, or a negative number

1702 reqbyteptr place to put the last required character, or a negative number

1703 cd points to the data block with tables pointers etc.

1704

1705 Returns: true on success

1706 */

1707

1708 static bool

1709 compileBracket(int options, int* brackets, unsigned char** codeptr,

1710 const UChar** ptrptr, const UChar* patternEnd, ErrorCode* errorcodeptr, int skipbytes,

1711 int* firstbyteptr, int* reqbyteptr, CompileData& cd)

1712 {

1713 const UChar* ptr = *ptrptr;

1714 unsigned char* code = *codeptr;

1715 unsigned char* last_branch = code;

1716 unsigned char* start_bracket = code;

1717 int firstbyte = REQ_UNSET;

1718 int reqbyte = REQ_UNSET;

1719

1720 /* Offset is set zero to mark that this bracket is still open */

1721

1722 putLinkValueAllowZero(code + 1, 0);

1723 code += 1 + LINK_SIZE + skipbytes;

1724

1725 /* Loop for each alternative branch */

1726

1727 while (true) {

1728 /* Now compile the branch */

1729

1730 int branchfirstbyte;

1731 int branchreqbyte;

1732 if (!compileBranch(options, brackets, &code, &ptr, patternEnd, errorcode ptr,

1733 &branchfirstbyte, &branchreqbyte, cd)) {

1734 *ptrptr = ptr;

1735 return false;

1736 }

1737

1738 /* If this is the first branch, the firstbyte and reqbyte values for the

1739 branch become the values for the regex. */

1740

1741 if (*last_branch != OP_ALT) {

1742 firstbyte = branchfirstbyte;

1743 reqbyte = branchreqbyte;

1744 }

1745

1746 /* If this is not the first branch, the first char and reqbyte have to

1747 match the values from all the previous branches, except that if the pre vious

1748 value for reqbyte didn't have REQ_VARY set, it can still match, and we set

1749 REQ_VARY for the regex. */

1750

1751 else {

1752 /* If we previously had a firstbyte, but it doesn't match the new br anch,

1753 we have to abandon the firstbyte for the regex, but if there was pr eviously

1754 no reqbyte, it takes on the value of the old firstbyte. */

1755

1756 if (firstbyte >= 0 && firstbyte != branchfirstbyte) {

1757 if (reqbyte < 0)

1758 reqbyte = firstbyte;

1759 firstbyte = REQ_NONE;

1760 }

1761

1762 /* If we (now or from before) have no firstbyte, a firstbyte from th e

1763 branch becomes a reqbyte if there isn't a branch reqbyte. */

1764

1765 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)

1766 branchreqbyte = branchfirstbyte;

1767

1768 /* Now ensure that the reqbytes match */

1769

1770 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))

1771 reqbyte = REQ_NONE;

1772 else

1773 reqbyte \|= branchreqbyte; /* To "or" REQ_VARY */

1774 }

1775

1776 /* Reached end of expression, either ')' or end of pattern. Go back thro ugh

1777 the alternative branches and reverse the chain of offsets, with the fie ld in

1778 the BRA item now becoming an offset to the first alternative. If there are

1779 no alternatives, it points to the end of the group. The length in the

1780 terminating ket is always the length of the whole bracketed item. If an y of

1781 the ims options were changed inside the group, compile a resetting op-c ode

1782 following, except at the very end of the pattern. Return leaving the po inter

1783 at the terminating char. */

1784

1785 if (ptr >= patternEnd \|\| *ptr != '\|') {

1786 int length = code - last_branch;

1787 do {

1788 int prev_length = getLinkValueAllowZero(last_branch + 1);

1789 putLinkValue(last_branch + 1, length);

1790 length = prev_length;

1791 last_branch -= length;

1792 } while (length > 0);

1793

1794 /* Fill in the ket */

1795

1796 *code = OP_KET;

1797 putLinkValue(code + 1, code - start_bracket);

1798 code += 1 + LINK_SIZE;

1799

1800 /* Set values to pass back */

1801

1802 *codeptr = code;

1803 *ptrptr = ptr;

1804 *firstbyteptr = firstbyte;

1805 *reqbyteptr = reqbyte;

1806 return true;

1807 }

1808

1809 /* Another branch follows; insert an "or" node. Its length field points back

1810 to the previous branch while the bracket remains open. At the end the c hain

1811 is reversed. It's done like this so that the start of the bracket has a

1812 zero offset until it is closed, making it possible to detect recursion. */

1813

1814 *code = OP_ALT;

1815 putLinkValue(code + 1, code - last_branch);

1816 last_branch = code;

1817 code += 1 + LINK_SIZE;

1818 ptr++;

1819 }

1820 ASSERT_NOT_REACHED();

1821 }

1822

1823 /*************************************************

1824 * Check for anchored expression *

1825 *************************************************/

1826

1827 /* Try to find out if this is an anchored regular expression. Consider each

1828 alternative branch. If they all start OP_CIRC, or with a bracket

1829 all of whose alternatives start OP_CIRC (recurse ad lib), then

1830 it's anchored.

1831

1832 Arguments:

1833 code points to start of expression (the bracket)

1834 captureMap a bitmap of which brackets we are inside while testing; this

1835 handles up to substring 31; all brackets after that share

1836 the zero bit

1837 backrefMap the back reference bitmap

1838 */

1839

1840 static bool branchIsAnchored(const unsigned char* code)

1841 {

1842 const unsigned char* scode = firstSignificantOpcode(code);

1843 int op = *scode;

1844

1845 /* Brackets */

1846 if (op >= OP_BRA \|\| op == OP_ASSERT)

1847 return bracketIsAnchored(scode);

1848

1849 /* Check for explicit anchoring */

1850 return op == OP_CIRC;

1851 }

1852

1853 static bool bracketIsAnchored(const unsigned char* code)

1854 {

1855 do {

1856 if (!branchIsAnchored(code + 1 + LINK_SIZE))

1857 return false;

1858 code += getLinkValue(code + 1);

1859 } while (code == OP_ALT); / Loop for each alternative */

1860 return true;

1861 }

1862

1863 /*************************************************

1864 * Check for starting with ^ or .* *

1865 *************************************************/

1866

1867 /* This is called to find out if every branch starts with ^ or .* so that

1868 "first char" processing can be done to speed things up in multiline

1869 matching and for non-DOTALL patterns that start with .* (which must start at

1870 the beginning or after \n)

1871

1872 Except when the .* appears inside capturing parentheses, and there is a

1873 subsequent back reference to those parentheses. By keeping a bitmap of the

1874 first 31 back references, we can catch some of the more common cases more

1875 precisely; all the greater back references share a single bit.

1876

1877 Arguments:

1878 code points to start of expression (the bracket)

1879 captureMap a bitmap of which brackets we are inside while testing; this

1880 handles up to substring 31; all brackets after that share

1881 the zero bit

1882 backrefMap the back reference bitmap

1883 */

1884

1885 static bool branchNeedsLineStart(const unsigned char* code, unsigned captureMap, unsigned backrefMap)

1886 {

1887 const unsigned char* scode = firstSignificantOpcode(code);

1888 int op = *scode;

1889

1890 /* Capturing brackets */

1891 if (op > OP_BRA) {

1892 int captureNum = op - OP_BRA;

1893 if (captureNum > EXTRACT_BASIC_MAX)

1894 captureNum = get2ByteValue(scode + 2 + LINK_SIZE);

1895 int bracketMask = (captureNum < 32) ? (1 << captureNum) : 1;

1896 return bracketNeedsLineStart(scode, captureMap \| bracketMask, backrefMap );

1897 }

1898

1899 /* Other brackets */

1900 if (op == OP_BRA \|\| op == OP_ASSERT)

1901 return bracketNeedsLineStart(scode, captureMap, backrefMap);

1902

1903 /* .* means "start at start or after \n" if it isn't in brackets that

1904 may be referenced. */

1905

1906 if (op == OP_TYPESTAR \|\| op == OP_TYPEMINSTAR)

1907 return scode[1] == OP_NOT_NEWLINE && !(captureMap & backrefMap);

1908

1909 /* Explicit ^ */

1910 return op == OP_CIRC \|\| op == OP_BOL;

1911 }

1912

1913 static bool bracketNeedsLineStart(const unsigned char* code, unsigned captureMap , unsigned backrefMap)

1914 {

1915 do {

1916 if (!branchNeedsLineStart(code + 1 + LINK_SIZE, captureMap, backrefMap))

1917 return false;

1918 code += getLinkValue(code + 1);

1919 } while (code == OP_ALT); / Loop for each alternative */

1920 return true;

1921 }

1922

1923 /*************************************************

1924 * Check for asserted fixed first char *

1925 *************************************************/

1926

1927 /* During compilation, the "first char" settings from forward assertions are

1928 discarded, because they can cause conflicts with actual literals that follow.

1929 However, if we end up without a first char setting for an unanchored pattern,

1930 it is worth scanning the regex to see if there is an initial asserted first

1931 char. If all branches start with the same asserted char, or with a bracket all

1932 of whose alternatives start with the same asserted char (recurse ad lib), then

1933 we return that char, otherwise -1.

1934

1935 Arguments:

1936 code points to start of expression (the bracket)

1937 options pointer to the options (used to check casing changes)

1938 inassert true if in an assertion

1939

1940 Returns: -1 or the fixed first char

1941 */

1942

1943 static int branchFindFirstAssertedCharacter(const unsigned char* code, bool inas sert)

1944 {

1945 const unsigned char* scode = firstSignificantOpcodeSkippingAssertions(code);

1946 int op = *scode;

1947

1948 if (op >= OP_BRA)

1949 op = OP_BRA;

1950

1951 switch (op) {

1952 default:

1953 return -1;

1954

1955 case OP_BRA:

1956 case OP_ASSERT:

1957 return bracketFindFirstAssertedCharacter(scode, op == OP_ASSERT);

1958

1959 case OP_EXACT:

1960 scode += 2;

1961 /* Fall through */

1962

1963 case OP_CHAR:

1964 case OP_CHAR_IGNORING_CASE:

1965 case OP_ASCII_CHAR:

1966 case OP_ASCII_LETTER_IGNORING_CASE:

1967 case OP_PLUS:

1968 case OP_MINPLUS:

1969 if (!inassert)

1970 return -1;

1971 return scode[1];

1972 }

1973 }

1974

1975 static int bracketFindFirstAssertedCharacter(const unsigned char* code, bool ina ssert)

1976 {

1977 int c = -1;

1978 do {

1979 int d = branchFindFirstAssertedCharacter(code + 1 + LINK_SIZE, inassert) ;

1980 if (d < 0)

1981 return -1;

1982 if (c < 0)

1983 c = d;

1984 else if (c != d)

1985 return -1;

1986 code += getLinkValue(code + 1);

1987 } while (*code == OP_ALT);

1988 return c;

1989 }

1990

1991 static inline int multiplyWithOverflowCheck(int a, int b)

1992 {

1993 if (!a \|\| !b)

1994 return 0;

1995 if (a > MAX_PATTERN_SIZE / b)

1996 return -1;

1997 return a * b;

1998 }

1999

2000 static int calculateCompiledPatternLength(const UChar* pattern, int patternLengt h, JSRegExpIgnoreCaseOption ignoreCase,

2001 CompileData& cd, ErrorCode& errorcode)

2002 {

2003 /* Make a pass over the pattern to compute the

2004 amount of store required to hold the compiled code. This does not have to b e

2005 perfect as long as errors are overestimates. */

2006

2007 if (patternLength > MAX_PATTERN_SIZE) {

2008 errorcode = ERR16;

2009 return -1;

2010 }

2011

2012 int length = 1 + LINK_SIZE; /* For initial BRA plus length */

2013 int branch_extra = 0;

2014 int lastitemlength = 0;

2015 unsigned brastackptr = 0;

2016 int brastack[BRASTACK_SIZE];

2017 unsigned char bralenstack[BRASTACK_SIZE];

2018 int bracount = 0;

2019

2020 const UChar* ptr = (const UChar*)(pattern - 1);

2021 const UChar* patternEnd = (const UChar*)(pattern + patternLength);

2022

2023 while (++ptr < patternEnd) {

2024 int minRepeats = 0, maxRepeats = 0;

2025 int c = *ptr;

2026

2027 switch (c) {

2028 /* A backslashed item may be an escaped data character or it may be a

2029 character type. */

2030

2031 case '\\':

2032 c = checkEscape(&ptr, patternEnd, &errorcode, cd.numCapturingBra ckets, false);

2033 if (errorcode != 0)

2034 return -1;

2035

2036 lastitemlength = 1; /* Default length of last item for repea ts */

2037

2038 if (c >= 0) { /* Data character */

2039 length += 2; /* For a one-byte character */

2040

2041 if (c > 127) {

2042 int i;

2043 for (i = 0; i < kjs_pcre_utf8_table1_size; i++)

2044 if (c <= kjs_pcre_utf8_table1[i]) break;

2045 length += i;

2046 lastitemlength += i;

2047 }

2048

2049 continue;

2050 }

2051

2052 /* Other escapes need one byte */

2053

2054 length++;

2055

2056 /* A back reference needs an additional 2 bytes, plus either one or 5

2057 bytes for a repeat. We also need to keep the value of the highe st

2058 back reference. */

2059

2060 if (c <= -ESC_REF) {

2061 int refnum = -c - ESC_REF;

2062 cd.backrefMap \|= (refnum < 32) ? (1 << refnum) : 1;

2063 if (refnum > cd.top_backref)

2064 cd.top_backref = refnum;

2065 length += 2; /* For single back reference */

2066 if (safelyCheckNextChar(ptr, patternEnd, '{') && isCountedRe peat(ptr + 2, patternEnd)) {

2067 ptr = readRepeatCounts(ptr + 2, &minRepeats, &maxRepeats , &errorcode);

2068 if (errorcode)

2069 return -1;

2070 if ((minRepeats == 0 && (maxRepeats == 1 \|\| maxRepeats = = -1)) \|\|

2071 (minRepeats == 1 && maxRepeats == -1))

2072 length++;

2073 else

2074 length += 5;

2075 if (safelyCheckNextChar(ptr, patternEnd, '?'))

2076 ptr++;

2077 }

2078 }

2079 continue;

2080

2081 case '^': /* Single-byte metacharacters */

2082 case '.':

2083 case '$':

2084 length++;

2085 lastitemlength = 1;

2086 continue;

2087

2088 case '': / These repeats won't be after brackets; */

2089 case '+': /* those are handled separately */

2090 case '?':

2091 length++;

2092 goto POSSESSIVE;

2093

2094 /* This covers the cases of braced repeats after a single char, meta char,

2095 class, or back reference. */

2096

2097 case '{':

2098 if (!isCountedRepeat(ptr + 1, patternEnd))

2099 goto NORMAL_CHAR;

2100 ptr = readRepeatCounts(ptr + 1, &minRepeats, &maxRepeats, &error code);

2101 if (errorcode != 0)

2102 return -1;

2103

2104 /* These special cases just insert one extra opcode */

2105

2106 if ((minRepeats == 0 && (maxRepeats == 1 \|\| maxRepeats == -1)) \| \|

2107 (minRepeats == 1 && maxRepeats == -1))

2108 length++;

2109

2110 /* These cases might insert additional copies of a preceding cha racter. */

2111

2112 else {

2113 if (minRepeats != 1) {

2114 length -= lastitemlength; /* Uncount the original char or metachar */

2115 if (minRepeats > 0)

2116 length += 3 + lastitemlength;

2117 }

2118 length += lastitemlength + ((maxRepeats > 0) ? 3 : 1);

2119 }

2120

2121 if (safelyCheckNextChar(ptr, patternEnd, '?'))

2122 ptr++; /* Needs no extra length */

2123

2124 POSSESSIVE: /* Test for possessive quantifier */

2125 if (safelyCheckNextChar(ptr, patternEnd, '+')) {

2126 ptr++;

2127 length += 2 + 2 * LINK_SIZE; /* Allow for atomic brackets */

2128 }

2129 continue;

2130

2131 /* An alternation contains an offset to the next branch or ket. If a ny ims

2132 options changed in the previous branch(es), and/or if we are in a

2133 lookbehind assertion, extra space will be needed at the start of th e

2134 branch. This is handled by branch_extra. */

2135

2136 case '\|':

2137 if (brastackptr == 0)

2138 cd.needOuterBracket = true;

2139 length += 1 + LINK_SIZE + branch_extra;

2140 continue;

2141

2142 /* A character class uses 33 characters provided that all the charac ter

2143 values are less than 256. Otherwise, it uses a bit map for low valu ed

2144 characters, and individual items for others. Don't worry about char acter

2145 types that aren't allowed in classes - they'll get picked up during the

2146 compile. A character class that contains only one single-byte chara cter

2147 uses 2 or 3 bytes, depending on whether it is negated or not. Notic e this

2148 where we can. (In UTF-8 mode we can do this only for chars < 128.) */

2149

2150 case '[': {

2151 int class_optcount;

2152 if (*(++ptr) == '^') {

2153 class_optcount = 10; /* Greater than one */

2154 ptr++;

2155 }

2156 else

2157 class_optcount = 0;

2158

2159 bool class_utf8 = false;

2160

2161 for (; ptr < patternEnd && *ptr != ']'; ++ptr) {

2162 /* Check for escapes */

2163

2164 if (*ptr == '\\') {

2165 c = checkEscape(&ptr, patternEnd, &errorcode, cd.numCapt uringBrackets, true);

2166 if (errorcode != 0)

2167 return -1;

2168

2169 /* Handle escapes that turn into characters */

2170

2171 if (c >= 0)

2172 goto NON_SPECIAL_CHARACTER;

2173

2174 /* Escapes that are meta-things. The normal ones just af fect the

2175 bit map, but Unicode properties require an XCLASS exten ded item. */

2176

2177 else

2178 class_optcount = 10; /* \d, \s etc; make sur e > 1 */

2179 }

2180

2181 /* Anything else increments the possible optimization count. We have to

2182 detect ranges here so that we can compute the number of ext ra ranges for

2183 caseless wide characters when UCP support is available. If there are wide

2184 characters, we are going to have to use an XCLASS, even for single

2185 characters. */

2186

2187 else {

2188 c = *ptr;

2189

2190 /* Come here from handling \ above when it escapes to a char value */

2191

2192 NON_SPECIAL_CHARACTER:

2193 class_optcount++;

2194

2195 int d = -1;

2196 if (safelyCheckNextChar(ptr, patternEnd, '-')) {

2197 UChar const *hyptr = ptr++;

2198 if (safelyCheckNextChar(ptr, patternEnd, '\\')) {

2199 ptr++;

2200 d = checkEscape(&ptr, patternEnd, &errorcode, cd .numCapturingBrackets, true);

2201 if (errorcode != 0)

2202 return -1;

2203 }

2204 else if ((ptr + 1 < patternEnd) && ptr[1] != ']')

2205 d = *++ptr;

2206 if (d < 0)

2207 ptr = hyptr; /* go back to hyphen as data * /

2208 }

2209

2210 /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >

2211 127 for caseless matching, we will need to use an XCLAS S. */

2212

2213 if (d >= 0) {

2214 class_optcount = 10; /* Ensure > 1 */

2215 if (d < c) {

2216 errorcode = ERR8;

2217 return -1;

2218 }

2219

2220 if ((d > 255 \|\| (ignoreCase && d > 127))) {

2221 unsigned char buffer[6];

2222 if (!class_utf8) /* Allow for XCLASS ove rhead */

2223 {

2224 class_utf8 = true;

2225 length += LINK_SIZE + 2;

2226 }

2227

2228 /* If we have UCP support, find out how many ext ra ranges are

2229 needed to map the other case of characters with in this range. We

2230 have to mimic the range optimization here, beca use extending the

2231 range upwards might push d over a boundary that makes it use

2232 another byte in the UTF-8 representation. */

2233

2234 if (ignoreCase) {

2235 int occ, ocd;

2236 int cc = c;

2237 int origd = d;

2238 while (getOthercaseRange(&cc, origd, &occ, & ocd)) {

2239 if (occ >= c && ocd <= d)

2240 continue; /* Skip embedded */

2241

2242 if (occ < c && ocd >= c - 1) /* Extend the basic range */

2243 { /* if there is overlap, */

2244 c = occ; /* noti ng that if occ < c */

2245 continue; /* we c an't have ocd > d */

2246 } /* because a subrange is */

2247 if (ocd > d && occ <= d + 1) /* always shorter than */

2248 { /* the basi c range. */

2249 d = ocd;

2250 continue;

2251 }

2252

2253 /* An extra item is needed */

2254

2255 length += 1 + encodeUTF8(occ, buffer) +

2256 ((occ == ocd) ? 0 : encodeUTF8(ocd, buff er));

2257 }

2258 }

2259

2260 /* The length of the (possibly extended) range * /

2261

2262 length += 1 + encodeUTF8(c, buffer) + encodeUTF8 (d, buffer);

2263 }

2264

2265 }

2266

2267 /* We have a single character. There is nothing to be do ne unless we

2268 are in UTF-8 mode. If the char is > 255, or 127 when ca seless, we must

2269 allow for an XCL_SINGLE item, doubled for caselessness if there is UCP

2270 support. */

2271

2272 else {

2273 if ((c > 255 \|\| (ignoreCase && c > 127))) {

2274 unsigned char buffer[6];

2275 class_optcount = 10; /* Ensure > 1 */

2276 if (!class_utf8) /* Allow for XCLASS ove rhead */

2277 {

2278 class_utf8 = true;

2279 length += LINK_SIZE + 2;

2280 }

2281 length += (ignoreCase ? 2 : 1) * (1 + encodeUTF8 (c, buffer));

2282 }

2283 }

2284 }

2285 }

2286

2287 if (ptr >= patternEnd) { /* Missing terminating ']' */

2288 errorcode = ERR6;

2289 return -1;

2290 }

2291

2292 /* We can optimize when there was only one optimizable character .

2293 Note that this does not detect the case of a negated single cha racter.

2294 In that case we do an incorrect length computation, but it's no t a serious

2295 problem because the computed length is too large rather than to o small. */

2296

2297 if (class_optcount == 1)

2298 goto NORMAL_CHAR;

2299

2300 /* Here, we handle repeats for the class opcodes. */

2301 {

2302 length += 33;

2303

2304 /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,

2305 we also need extra for wrapping the whole thing in a sub-pa ttern. */

2306

2307 if (safelyCheckNextChar(ptr, patternEnd, '{') && isCountedRe peat(ptr + 2, patternEnd)) {

2308 ptr = readRepeatCounts(ptr + 2, &minRepeats, &maxRepeats , &errorcode);

2309 if (errorcode != 0)

2310 return -1;

2311 if ((minRepeats == 0 && (maxRepeats == 1 \|\| maxRepeats = = -1)) \|\|

2312 (minRepeats == 1 && maxRepeats == -1))

2313 length++;

2314 else

2315 length += 5;

2316 if (safelyCheckNextChar(ptr, patternEnd, '+')) {

2317 ptr++;

2318 length += 2 + 2 * LINK_SIZE;

2319 } else if (safelyCheckNextChar(ptr, patternEnd, '?'))

2320 ptr++;

2321 }

2322 }

2323 continue;

2324 }

2325

2326 /* Brackets may be genuine groups or special things */

2327

2328 case '(': {

2329 int branch_newextra = 0;

2330 int bracket_length = 1 + LINK_SIZE;

2331 bool capturing = false;

2332

2333 /* Handle special forms of bracket, which all start (? */

2334

2335 if (safelyCheckNextChar(ptr, patternEnd, '?')) {

2336 switch (c = (ptr + 2 < patternEnd ? ptr[2] : 0)) {

2337 /* Non-referencing groups and lookaheads just move the p ointer on, and

2338 then behave like a non-special bracket, except that the y don't increment

2339 the count of extracting brackets. Ditto for the "once o nly" bracket,

2340 which is in Perl from version 5.005. */

2341

2342 case ':':

2343 case '=':

2344 case '!':

2345 ptr += 2;

2346 break;

2347

2348 /* Else loop checking valid options until ) is met. Anyt hing else is an

2349 error. If we are without any brackets, i.e. at top leve l, the settings

2350 act as if specified in the options, so massage the opti ons immediately.

2351 This is for backward compatibility with Perl 5.004. */

2352

2353 default:

2354 errorcode = ERR12;

2355 return -1;

2356 }

2357 } else

2358 capturing = 1;

2359

2360 /* Capturing brackets must be counted so we can process escapes in a

2361 Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are goi ng to need

2362 an additional 3 bytes of memory per capturing bracket. */

2363

2364 if (capturing) {

2365 bracount++;

2366 if (bracount > EXTRACT_BASIC_MAX)

2367 bracket_length += 3;

2368 }

2369

2370 /* Save length for computing whole length at end if there's a re peat that

2371 requires duplication of the group. Also save the current value of

2372 branch_extra, and start the new group with the new value. If no n-zero, this

2373 will either be 2 for a (?imsx: group, or 3 for a lookbehind ass ertion. */

2374

2375 if (brastackptr >= sizeof(brastack)/sizeof(int)) {

2376 errorcode = ERR17;

2377 return -1;

2378 }

2379

2380 bralenstack[brastackptr] = branch_extra;

2381 branch_extra = branch_newextra;

2382

2383 brastack[brastackptr++] = length;

2384 length += bracket_length;

2385 continue;

2386 }

2387

2388 /* Handle ket. Look for subsequent maxRepeats/minRepeats; for certai n sets of values we

2389 have to replicate this bracket up to that many times. If brastackpt r is

2390 0 this is an unmatched bracket which will generate an error, but ta ke care

2391 not to try to access brastack[-1] when computing the length and res toring

2392 the branch_extra value. */

2393

2394 case ')': {

2395 int duplength;

2396 length += 1 + LINK_SIZE;

2397 if (brastackptr > 0) {

2398 duplength = length - brastack[--brastackptr];

2399 branch_extra = bralenstack[brastackptr];

2400 }

2401 else

2402 duplength = 0;

2403

2404 /* Leave ptr at the final char; for readRepeatCounts this happen s

2405 automatically; for the others we need an increment. */

2406

2407 if ((ptr + 1 < patternEnd) && (c = ptr[1]) == '{' && isCountedRe peat(ptr + 2, patternEnd)) {

2408 ptr = readRepeatCounts(ptr + 2, &minRepeats, &maxRepeats, &e rrorcode);

2409 if (errorcode)

2410 return -1;

2411 } else if (c == '*') {

2412 minRepeats = 0;

2413 maxRepeats = -1;

2414 ptr++;

2415 } else if (c == '+') {

2416 minRepeats = 1;

2417 maxRepeats = -1;

2418 ptr++;

2419 } else if (c == '?') {

2420 minRepeats = 0;

2421 maxRepeats = 1;

2422 ptr++;

2423 } else {

2424 minRepeats = 1;

2425 maxRepeats = 1;

2426 }

2427

2428 /* If the minimum is zero, we have to allow for an OP_BRAZERO be fore the

2429 group, and if the maximum is greater than zero, we have to repl icate

2430 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting

2431 bracket set. */

2432

2433 int repeatsLength;

2434 if (minRepeats == 0) {

2435 length++;

2436 if (maxRepeats > 0) {

2437 repeatsLength = multiplyWithOverflowCheck(maxRepeats - 1 , duplength + 3 + 2 * LINK_SIZE);

2438 if (repeatsLength < 0) {

2439 errorcode = ERR16;

2440 return -1;

2441 }

2442 length += repeatsLength;

2443 if (length > MAX_PATTERN_SIZE) {

2444 errorcode = ERR16;

2445 return -1;

2446 }

2447 }

2448 }

2449

2450 /* When the minimum is greater than zero, we have to replicate u p to

2451 minval-1 times, with no additions required in the copies. Then, if there

2452 is a limited maximum we have to replicate up to maxval-1 times allowing

2453 for a BRAZERO item before each optional copy and nesting bracke ts for all

2454 but one of the optional copies. */

2455

2456 else {

2457 repeatsLength = multiplyWithOverflowCheck(minRepeats - 1, du plength);

2458 if (repeatsLength < 0) {

2459 errorcode = ERR16;

2460 return -1;

2461 }

2462 length += repeatsLength;

2463 if (maxRepeats > minRepeats) { /* Need this test as maxRepea ts=-1 means no limit */

2464 repeatsLength = multiplyWithOverflowCheck(maxRepeats - m inRepeats, duplength + 3 + 2 * LINK_SIZE);

2465 if (repeatsLength < 0) {

2466 errorcode = ERR16;

2467 return -1;

2468 }

2469 length += repeatsLength - (2 + 2 * LINK_SIZE);

2470 }

2471 if (length > MAX_PATTERN_SIZE) {

2472 errorcode = ERR16;

2473 return -1;

2474 }

2475 }

2476

2477 /* Allow space for once brackets for "possessive quantifier" */

2478

2479 if (safelyCheckNextChar(ptr, patternEnd, '+')) {

2480 ptr++;

2481 length += 2 + 2 * LINK_SIZE;

2482 }

2483 continue;

2484 }

2485

2486 /* Non-special character. It won't be space or # in extended mode, s o it is

2487 always a genuine character. If we are in a \Q...\E sequence, check for the

2488 end; if not, we have a literal. */

2489

2490 default:

2491 NORMAL_CHAR:

2492 length += 2; /* For a one-byte character */

2493 lastitemlength = 1; /* Default length of last item for repeats */

2494

2495 if (c > 127) {

2496 int i;

2497 for (i = 0; i < kjs_pcre_utf8_table1_size; i++)

2498 if (c <= kjs_pcre_utf8_table1[i])

2499 break;

2500 length += i;

2501 lastitemlength += i;

2502 }

2503

2504 continue;

2505 }

2506 }

2507

2508 length += 2 + LINK_SIZE; /* For final KET and END */

2509

2510 cd.numCapturingBrackets = bracount;

2511 return length;

2512 }

2513

2514 /*************************************************

2515 * Compile a Regular Expression *

2516 *************************************************/

2517

2518 /* This function takes a string and returns a pointer to a block of store

2519 holding a compiled version of the expression. The original API for this

2520 function had no error code return variable; it is retained for backwards

2521 compatibility. The new function is given a new name.

2522

2523 Arguments:

2524 pattern the regular expression

2525 options various option bits

2526 errorcodeptr pointer to error code variable (pcre_compile2() only)

2527 can be NULL if you don't want a code value

2528 errorptr pointer to pointer to error text

2529 erroroffset ptr offset in pattern where error was detected

2530 tables pointer to character tables or NULL

2531

2532 Returns: pointer to compiled data block, or NULL on error,

2533 with errorptr and erroroffset set

2534 */

2535

2536 static inline JSRegExp* returnError(ErrorCode errorcode, const char** errorptr)

2537 {

2538 *errorptr = errorText(errorcode);

2539 return 0;

2540 }

2541

2542 JSRegExp* jsRegExpCompile(const UChar* pattern, int patternLength,

2543 JSRegExpIgnoreCaseOption ignoreCase, JSRegExpMultilineOption mul tiline,

2544 unsigned* numSubpatterns, const char** errorptr,

2545 malloc_t* allocate_function, free_t* free_function)

2546 {

2547 /* We can't pass back an error message if errorptr is NULL; I guess the best we

2548 can do is just return NULL, but we can set a code value if there is a code pointer. */

2549 if (!errorptr)

2550 return 0;

2551 *errorptr = NULL;

2552

2553 CompileData cd;

2554

2555 ErrorCode errorcode = ERR0;

2556 /* Call this once just to count the brackets. */

2557 calculateCompiledPatternLength(pattern, patternLength, ignoreCase, cd, error code);

2558 /* Call it again to compute the length. */

2559 int length = calculateCompiledPatternLength(pattern, patternLength, ignoreCa se, cd, errorcode);

2560 if (errorcode)

2561 return returnError(errorcode, errorptr);

2562

2563 if (length > MAX_PATTERN_SIZE)

2564 return returnError(ERR16, errorptr);

2565

2566 size_t size = length + sizeof(JSRegExp);

2567 JSRegExp* re = reinterpret_cast<JSRegExp>((allocate_function)(size));

2568

2569 if (!re)

2570 return returnError(ERR13, errorptr);

2571

2572 re->options = (ignoreCase ? IgnoreCaseOption : 0) \| (multiline ? MatchAcross MultipleLinesOption : 0);

2573

2574 /* The starting points of the name/number translation table and of the code are

2575 passed around in the compile data block. */

2576

2577 const unsigned char* codeStart = (const unsigned char*)(re + 1);

2578

2579 /* Set up a starting, non-extracting bracket, then compile the expression. O n

2580 error, errorcode will be set non-zero, so we don't need to look at the resu lt

2581 of the function here. */

2582

2583 const UChar* ptr = (const UChar*)pattern;

2584 const UChar* patternEnd = pattern + patternLength;

2585 unsigned char* code = (unsigned char*)codeStart;

2586 int firstbyte, reqbyte;

2587 int bracketCount = 0;

2588 if (!cd.needOuterBracket)

2589 compileBranch(re->options, &bracketCount, &code, &ptr, patternEnd, &erro rcode, &firstbyte, &reqbyte, cd);

2590 else {

2591 *code = OP_BRA;

2592 compileBracket(re->options, &bracketCount, &code, &ptr, patternEnd, &err orcode, 0, &firstbyte, &reqbyte, cd);

2593 }

2594 re->top_bracket = bracketCount;

2595 re->top_backref = cd.top_backref;

2596

2597 /* If not reached end of pattern on success, there's an excess bracket. */

2598

2599 if (errorcode == 0 && ptr < patternEnd)

2600 errorcode = ERR10;

2601

2602 /* Fill in the terminating state and check for disastrous overflow, but

2603 if debugging, leave the test till after things are printed out. */

2604

2605 *code++ = OP_END;

2606

2607 ASSERT(code - codeStart <= length);

2608 if (code - codeStart > length)

2609 errorcode = ERR7;

2610

2611 /* Give an error if there's back reference to a non-existent capturing

2612 subpattern. */

2613

2614 if (re->top_backref > re->top_bracket)

2615 errorcode = ERR15;

2616

2617 /* Failed to compile, or error while post-processing */

2618

2619 if (errorcode != ERR0) {

2620 (free_function)(reinterpret_cast<void>(re));

2621 return returnError(errorcode, errorptr);

2622 }

2623

2624 /* If the anchored option was not passed, set the flag if we can determine t hat

2625 the pattern is anchored by virtue of ^ characters or \A or anything else (s uch

2626 as starting with .* when DOTALL is set).

2627

2628 Otherwise, if we know what the first character has to be, save it, because that

2629 speeds up unanchored matches no end. If not, see if we can set the

2630 UseMultiLineFirstByteOptimizationOption flag. This is helpful for multiline matches when all branches

2631 start with ^. and also when all branches start with .* for non-DOTALL match es.

2632 */

2633

2634 if (cd.needOuterBracket ? bracketIsAnchored(codeStart) : branchIsAnchored(co deStart))

2635 re->options \|= IsAnchoredOption;

2636 else {

2637 if (firstbyte < 0) {

2638 firstbyte = (cd.needOuterBracket

2639 ? bracketFindFirstAssertedCharacter(codeStart, false)

2640 : branchFindFirstAssertedCharacter(codeStart, false))

2641 \| ((re->options & IgnoreCaseOption) ? REQ_IGNORE_CASE : 0);

2642 }

2643 if (firstbyte >= 0) {

2644 int ch = firstbyte & 255;

2645 if (ch < 127) {

2646 re->first_byte = ((firstbyte & REQ_IGNORE_CASE) && flipCase(ch) == ch) ? ch : firstbyte;

2647 re->options \|= UseFirstByteOptimizationOption;

2648 }

2649 } else {

2650 if (cd.needOuterBracket ? bracketNeedsLineStart(codeStart, 0, cd.bac krefMap) : branchNeedsLineStart(codeStart, 0, cd.backrefMap))

2651 re->options \|= UseMultiLineFirstByteOptimizationOption;

2652 }

2653 }

2654

2655 /* For an anchored pattern, we use the "required byte" only if it follows a

2656 variable length item in the regex. Remove the caseless flag for non-caseabl e

2657 bytes. */

2658

2659 if (reqbyte >= 0 && (!(re->options & IsAnchoredOption) \|\| (reqbyte & REQ_VAR Y))) {

2660 int ch = reqbyte & 255;

2661 if (ch < 127) {

2662 re->req_byte = ((reqbyte & REQ_IGNORE_CASE) && flipCase(ch) == ch) ? (reqbyte & ~REQ_IGNORE_CASE) : reqbyte;

2663 re->options \|= UseRequiredByteOptimizationOption;

2664 }

2665 }

2666

2667 if (numSubpatterns)

2668 *numSubpatterns = re->top_bracket;

2669 return re;

2670 }

2671

2672 void jsRegExpFree(JSRegExp* re, free_t* free_function)

2673 {

2674 (free_function)(reinterpret_cast<void>(re));

2675 }

2676

2677 } } // namespace v8::jscre

OLD	NEW

« LICENSE ('K') | « src/third_party/jscre/pcre_chartables.c ('k') | src/third_party/jscre/pcre_exec.cpp » ('j') | no next file with comments »