runtime/third_party/jscre/pcre_compile.cpp - Issue 1071713003: - Remove JSCRE from the runtime.

Side by Side Diff: runtime/third_party/jscre/pcre_compile.cpp

Issue 1071713003: - Remove JSCRE from the runtime. (Closed) Base URL: http://dart.googlecode.com/svn/branches/bleeding_edge/dart/

Patch Set: Created 5 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
	(Empty)
1 /* This is JavaScriptCore's variant of the PCRE library. While this library

2 started out as a copy of PCRE, many of the features of PCRE have been

3 removed. This library now supports only the regular expression features

4 required by the JavaScript language specification, and has only the functions

5 needed by JavaScriptCore and the rest of WebKit.

6

7 Originally written by Philip Hazel

8 Copyright (c) 1997-2006 University of Cambridge

9 Copyright (C) 2002, 2004, 2006, 2007 Apple Inc. All rights reserved.

10 Copyright (C) 2007 Eric Seidel <eric@webkit.org>

11

12 -----------------------------------------------------------------------------

13 Redistribution and use in source and binary forms, with or without

14 modification, are permitted provided that the following conditions are met:

15

16 * Redistributions of source code must retain the above copyright notice,

17 this list of conditions and the following disclaimer.

18

19 * Redistributions in binary form must reproduce the above copyright

20 notice, this list of conditions and the following disclaimer in the

21 documentation and/or other materials provided with the distribution.

22

23 * Neither the name of the University of Cambridge nor the names of its

24 contributors may be used to endorse or promote products derived from

25 this software without specific prior written permission.

26

27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"

28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE

31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR

32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF

33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS

34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN

35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)

36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

37 POSSIBILITY OF SUCH DAMAGE.

38 -----------------------------------------------------------------------------

39 */

40

41 /* This module contains the external function jsRegExpExecute(), along with

42 supporting internal functions that are not used by other modules. */

43

44 #include "config.h"

45

46 #include "pcre_internal.h"

47

48 #include <string.h>

49 #include "ASCIICType.h"

50

51 /* Negative values for the firstchar and reqchar variables */

52

53 #define REQ_UNSET (-2)

54 #define REQ_NONE (-1)

55

56 /*************************************************

57 * Code parameters and static tables *

58 *************************************************/

59

60 /* Maximum number of items on the nested bracket stacks at compile time. This

61 applies to the nesting of all kinds of parentheses. It does not limit

62 un-nested, non-capturing parentheses. This number can be made bigger if

63 necessary - it is used to dimension one int and one unsigned char vector at

64 compile time. */

65

66 #define BRASTACK_SIZE 200

67

68 namespace dart { namespace jscre {

69

70 /* Table for handling escaped characters in the range '0'-'z'. Positive returns

71 are simple data values; negative values are for special things like \d and so

72 on. Zero means further processing is needed (for things like \x), or the escape

73 is invalid. */

74

75 static const short escapes[] = {

76 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */

77 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */

78 '@', 0, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */

79 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */

80 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */

81 0, 0, 0, '[', '\\', ']', '^', '_', /* X - _ */

82 '`', 7, -ESC_b, 0, -ESC_d, 0, '\f', 0, /* ` - g */

83 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */

84 0, 0, '\r', -ESC_s, '\t', 0, '\v', -ESC_w, /* p - w */

85 0, 0, 0 /* x - z */

86 };

87

88 /* Error code numbers. They are given names so that they can more easily be

89 tracked. */

90

// Compile-time error identifiers. The numeric value selects the message
// returned by errorText(); ERR0 has no message of its own.
enum ErrorCode {
    ERR0 = 0,
    ERR1,   // \ at end of pattern
    ERR2,   // \c at end of pattern
    ERR3,   // character value in \x{...} sequence is too large
    ERR4,   // numbers out of order in {} quantifier
    ERR5,   // number too big in {} quantifier
    ERR6,   // missing terminating ] for character class
    ERR7,   // internal error: code overflow
    ERR8,   // range out of order in character class
    ERR9,   // nothing to repeat
    ERR10,  // unmatched parentheses
    ERR11,  // internal error: unexpected repeat
    ERR12,  // unrecognized character after (?
    ERR13,  // failed to get memory
    ERR14,  // missing )
    ERR15,  // reference to non-existent subpattern
    ERR16,  // regular expression too large
    ERR17   // parentheses nested too deeply
};

95

96 /* The texts of compile-time error messages. These are "char *" because they

97 are passed to the outside world. */

98

99 static const char* errorText(ErrorCode code)

100 {

101 static const char errorTexts[] =

102 /* 1 */

103 "\\ at end of pattern\0"

104 "\\c at end of pattern\0"

105 "character value in \\x{...} sequence is too large\0"

106 "numbers out of order in {} quantifier\0"

107 /* 5 */

108 "number too big in {} quantifier\0"

109 "missing terminating ] for character class\0"

110 "internal error: code overflow\0"

111 "range out of order in character class\0"

112 "nothing to repeat\0"

113 /* 10 */

114 "unmatched parentheses\0"

115 "internal error: unexpected repeat\0"

116 "unrecognized character after (?\0"

117 "failed to get memory\0"

118 "missing )\0"

119 /* 15 */

120 "reference to non-existent subpattern\0"

121 "regular expression too large\0"

122 "parentheses nested too deeply"

123 ;

124

125 int i = code;

126 const char* text = errorTexts;

127 while (i > 1)

128 i -= !*text++;

129 return text;

130 }

131

132 /* Structure for passing "static" information around between the functions

133 doing the compiling. */

134

// Mutable state shared between the compile functions while one pattern is
// being compiled. Every field starts out zero / false.
struct CompileData {
    CompileData()
        : top_backref(0)
        , backrefMap(0)
        , req_varyopt(0)
        , needOuterBracket(false)
        , numCapturingBrackets(0)
    {
    }

    int top_backref;            // Maximum back reference
    unsigned backrefMap;        // Bitmap of low back refs
    int req_varyopt;            // "After variable item" flag for reqbyte
    bool needOuterBracket;
    int numCapturingBrackets;
};

149

150 /* Definitions to allow mutual recursion */

151

152 static bool compileBracket(int, int, unsigned char, const UChar, const UCha r, ErrorCode, int, int, int*, CompileData&);

153 static bool bracketIsAnchored(const unsigned char* code);

154 static bool bracketNeedsLineStart(const unsigned char* code, unsigned captureMap , unsigned backrefMap);

155 static int bracketFindFirstAssertedCharacter(const unsigned char* code, bool ina ssert);

156

157 /*************************************************

158 * Handle escapes *

159 *************************************************/

160

161 /* This function is called when a \ has been encountered. It either returns a

162 positive value for a simple escape such as \n, or a negative value which

163 encodes one of the more complicated things such as \d. When UTF-8 is enabled,

164 a positive value greater than 255 may be returned. On entry, ptr is pointing at

165 the \. On exit, it is on the final character of the escape sequence.

166

167 Arguments:

168 ptrptr points to the pattern position pointer

169 errorcodeptr points to the errorcode variable

170 bracount number of previous extracting brackets

171 patternEnd points just past the last character of the pattern

172 isclass true if inside a character class

173

174 Returns: zero or positive => a data character

175 negative => a special escape sequence

176 on error, *errorcodeptr is set

177 */

178

179 static int checkEscape(const UChar** ptrptr, const UChar* patternEnd, ErrorCode* errorcodeptr, int bracount, bool isclass)

180 {

181 const UChar* ptr = *ptrptr + 1;

182

183 /* If backslash is at the end of the pattern, it's an error. */

184 if (ptr == patternEnd) {

185 *errorcodeptr = ERR1;

186 *ptrptr = ptr;

187 return 0;

188 }

189

190 int c = *ptr;

191

192 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in

193 a table. A non-zero result is something that can be returned immediately.

194 Otherwise further processing may be required. */

195

196 if (c < '0' \|\| c > 'z') { /* Not alphameric */

197 } else if (int escapeValue = escapes[c - '0']) {

198 c = escapeValue;

199 if (isclass) {

200 if (-c == ESC_b)

201 c = '\b'; /* \b is backslash in a class */

202 else if (-c == ESC_B)

203 c = 'B'; /* and \B is a capital B in a class (in browsers event though ECMAScript 15.10.2.19 says it raises an error) */

204 }

205 /* Escapes that need further processing, or are illegal. */

206

207 } else {

208 switch (c) {

209 case '1':

210 case '2':

211 case '3':

212 case '4':

213 case '5':

214 case '6':

215 case '7':

216 case '8':

217 case '9':

218 /* Escape sequences starting with a non-zero digit are backrefer ences,

219 unless there are insufficient brackets, in which case they are octal

220 escape sequences. Those sequences end on the first non-octal ch aracter

221 or when we overflow 0-255, whichever comes first. */

222

223 if (!isclass) {

224 const UChar* oldptr = ptr;

225 c -= '0';

226 while ((ptr + 1 < patternEnd) && isASCIIDigit(ptr[1]) && c < = bracount)

227 c = c * 10 + *(++ptr) - '0';

228 if (c <= bracount) {

229 c = -(ESC_REF + c);

230 break;

231 }

232 ptr = oldptr; /* Put the pointer back and fall through */

233 }

234

235 /* Handle an octal number following \. If the first digit is 8 o r 9,

236 this is not octal. */

237

238 if ((c = *ptr) >= '8')

239 break;

240

241 /* \0 always starts an octal number, but we may drop through to here with a

242 larger first octal digit. */

243

244 case '0': {

245 c -= '0';

246 int i;

247 for (i = 1; i <= 2; ++i) {

248 if (ptr + i >= patternEnd \|\| ptr[i] < '0' \|\| ptr[i] > '7')

249 break;

250 int cc = c * 8 + ptr[i] - '0';

251 if (cc > 255)

252 break;

253 c = cc;

254 }

255 ptr += i - 1;

256 break;

257 }

258

259 case 'x': {

260 c = 0;

261 int i;

262 for (i = 1; i <= 2; ++i) {

263 if (ptr + i >= patternEnd \|\| !isASCIIHexDigit(ptr[i])) {

264 c = 'x';

265 i = 1;

266 break;

267 }

268 int cc = ptr[i];

269 if (cc >= 'a')

270 cc -= 32; /* Convert to upper case */

271 c = c * 16 + cc - ((cc < 'A') ? '0' : ('A' - 10));

272 }

273 ptr += i - 1;

274 break;

275 }

276

277 case 'u': {

278 c = 0;

279 int i;

280 for (i = 1; i <= 4; ++i) {

281 if (ptr + i >= patternEnd \|\| !isASCIIHexDigit(ptr[i])) {

282 c = 'u';

283 i = 1;

284 break;

285 }

286 int cc = ptr[i];

287 if (cc >= 'a')

288 cc -= 32; /* Convert to upper case */

289 c = c * 16 + cc - ((cc < 'A') ? '0' : ('A' - 10));

290 }

291 ptr += i - 1;

292 break;

293 }

294

295 case 'c':

296 if (++ptr == patternEnd) {

297 *errorcodeptr = ERR2;

298 return 0;

299 }

300 c = *ptr;

301

302 /* A letter is upper-cased; then the 0x40 bit is flipped. This c oding

303 is ASCII-specific, but then the whole concept of \cx is ASCII-s pecific. */

304 c = toASCIIUpper(c) ^ 0x40;

305 break;

306 }

307 }

308

309 *ptrptr = ptr;

310 return c;

311 }

312

313 /*************************************************

314 * Check for counted repeat *

315 *************************************************/

316

317 /* This function is called when a '{' is encountered in a place where it might

318 start a quantifier. It looks ahead to see if it really is a quantifier or not.

319 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}

320 where the ddds are digits.

321

322 Arguments:

323 p pointer to the first char after '{'

324

325 Returns: true or false

326 */

327

328 static bool isCountedRepeat(const UChar* p, const UChar* patternEnd)

329 {

330 if (p >= patternEnd \|\| !isASCIIDigit(*p))

331 return false;

332 p++;

333 while (p < patternEnd && isASCIIDigit(*p))

334 p++;

335 if (p < patternEnd && *p == '}')

336 return true;

337

338 if (p >= patternEnd \|\| *p++ != ',')

339 return false;

340 if (p < patternEnd && *p == '}')

341 return true;

342

343 if (p >= patternEnd \|\| !isASCIIDigit(*p))

344 return false;

345 p++;

346 while (p < patternEnd && isASCIIDigit(*p))

347 p++;

348

349 return (p < patternEnd && *p == '}');

350 }

351

352 /*************************************************

353 * Read repeat counts *

354 *************************************************/

355

356 /* Read an item of the form {n,m} and return the values. This is called only

357 after isCountedRepeat() has confirmed that a repeat-count quantifier exists,

358 so the syntax is guaranteed to be correct, but we need to check the values.

359

360 Arguments:

361 p pointer to first char after '{'

362 minp pointer to int for min

363 maxp pointer to int for max

364 returned as -1 if no max

365 errorcodeptr points to error code variable

366

367 Returns: pointer to '}' on success;

368 current ptr on error, with errorcodeptr set non-zero

369 */

370

371 static const UChar* readRepeatCounts(const UChar* p, int* minp, int* maxp, Error Code* errorcodeptr)

372 {

373 int min = 0;

374 int max = -1;

375

376 /* Read the minimum value and do a paranoid check: a negative value indicate s

377 an integer overflow. */

378

379 while (isASCIIDigit(*p))

380 min = min * 10 + *p++ - '0';

381 if (min < 0 \|\| min > 65535) {

382 *errorcodeptr = ERR5;

383 return p;

384 }

385

386 /* Read the maximum value if there is one, and again do a paranoid on its si ze.

387 Also, max must not be less than min. */

388

389 if (*p == '}')

390 max = min;

391 else {

392 if (*(++p) != '}') {

393 max = 0;

394 while (isASCIIDigit(*p))

395 max = max * 10 + *p++ - '0';

396 if (max < 0 \|\| max > 65535) {

397 *errorcodeptr = ERR5;

398 return p;

399 }

400 if (max < min) {

401 *errorcodeptr = ERR4;

402 return p;

403 }

404 }

405 }

406

407 /* Fill in the required variables, and pass back the pointer to the terminat ing

408 '}'. */

409

410 *minp = min;

411 *maxp = max;

412 return p;

413 }

414

415 /*************************************************

416 * Find first significant op code *

417 *************************************************/

418

419 /* This is called by several functions that scan a compiled expression looking

420 for a fixed first character, or an anchoring op code etc. It skips over things

421 that do not influence this.

422

423 Arguments:

424 code pointer to the start of the group

425 Returns: pointer to the first significant opcode

426 */

427

428 static const unsigned char* firstSignificantOpcode(const unsigned char* code)

429 {

430 while (*code == OP_BRANUMBER)

431 code += 3;

432 return code;

433 }

434

435 static const unsigned char* firstSignificantOpcodeSkippingAssertions(const unsig ned char* code)

436 {

437 while (true) {

438 switch (*code) {

439 case OP_ASSERT_NOT:

440 advanceToEndOfBracket(code);

441 code += 1 + LINK_SIZE;

442 break;

443 case OP_WORD_BOUNDARY:

444 case OP_NOT_WORD_BOUNDARY:

445 ++code;

446 break;

447 case OP_BRANUMBER:

448 code += 3;

449 break;

450 default:

451 return code;

452 }

453 }

454 }

455

456 /*************************************************

457 * Get othercase range *

458 *************************************************/

459

460 /* This function is passed the start and end of a class range, in UTF-8 mode

461 with UCP support. It searches up the characters, looking for internal ranges of

462 characters in the "other" case. Each call returns the next one, updating the

463 start address.

464

465 Arguments:

466 cptr points to starting character value; updated

467 d end value

468 ocptr where to put start of othercase range

469 odptr where to put end of othercase range

470

471 Yield: true when range returned; false when no more

472 */

473

474 static bool getOthercaseRange(int* cptr, int d, int* ocptr, int* odptr)

475 {

476 int c, othercase = 0;

477

478 for (c = *cptr; c <= d; c++) {

479 if ((othercase = kjs_pcre_ucp_othercase(c)) >= 0)

480 break;

481 }

482

483 if (c > d)

484 return false;

485

486 *ocptr = othercase;

487 int next = othercase + 1;

488

489 for (++c; c <= d; c++) {

490 if (kjs_pcre_ucp_othercase(c) != next)

491 break;

492 next++;

493 }

494

495 *odptr = next - 1;

496 *cptr = c;

497

498 return true;

499 }

500

501 /*************************************************

502 * Convert character value to UTF-8 *

503 *************************************************/

504

505 /* This function takes an integer value in the range 0 - 0x7fffffff

506 and encodes it as a UTF-8 character in 1 to 6 bytes.

507

508 Arguments:

509 cvalue the character value

510 buffer pointer to buffer for result - at least 6 bytes long

511

512 Returns: number of bytes placed in the buffer

513 */

514

515 static int encodeUTF8(int cvalue, unsigned char *buffer)

516 {

517 int i;

518 for (i = 0; i < kjs_pcre_utf8_table1_size; i++)

519 if (cvalue <= kjs_pcre_utf8_table1[i])

520 break;

521 buffer += i;

522 for (int j = i; j > 0; j--) {

523 *buffer-- = 0x80 \| (cvalue & 0x3f);

524 cvalue >>= 6;

525 }

526 *buffer = kjs_pcre_utf8_table2[i] \| cvalue;

527 return i + 1;

528 }

529

530 /*************************************************

531 * Compile one branch *

532 *************************************************/

533

534 /* Scan the pattern, compiling it into the code vector.

535

536 Arguments:

537 options the option bits

538 brackets points to number of extracting brackets used

539 codeptr points to the pointer to the current code point

540 ptrptr points to the current pattern pointer

541 errorcodeptr points to error code variable

542 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)

543 reqbyteptr set to the last literal character required, else < 0

544 cd contains pointers to tables etc.

545

546 Returns: true on success

547 false, with *errorcodeptr set non-zero on error

548 */

549

550 static inline bool safelyCheckNextChar(const UChar* ptr, const UChar* patternEnd , UChar expected)

551 {

552 return ((ptr + 1 < patternEnd) && ptr[1] == expected);

553 }

554

555 static bool

556 compileBranch(int options, int* brackets, unsigned char** codeptr,

557 const UChar** ptrptr, const UChar* patternEnd, ErrorCode* errorco deptr, int *firstbyteptr,

558 int* reqbyteptr, CompileData& cd)

559 {

560 int repeat_type, op_type;

561 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */

562 int bravalue = 0;

563 int reqvary, tempreqvary;

564 int c;

565 unsigned char* code = *codeptr;

566 unsigned char* tempcode;

567 bool groupsetfirstbyte = false;

568 const UChar* ptr = *ptrptr;

569 unsigned char* previous = NULL;

570 unsigned char classbits[32];

571

572 bool class_utf8;

573 unsigned char* class_utf8data;

574 unsigned char utf8_char[6];

575

576 /* Initialize no first byte, no required byte. REQ_UNSET means "no char

577 matching encountered yet". It gets changed to REQ_NONE if we hit something that

578 matches a non-fixed char first char; reqbyte just remains unset if we never

579 find one.

580

581 When we hit a repeat whose minimum is zero, we may have to adjust these val ues

582 to take the zero repeat into account. This is implemented by setting them t o

583 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The indivi dual

584 item types that can be repeated set these backoff variables appropriately. */

585

586 int firstbyte = REQ_UNSET;

587 int reqbyte = REQ_UNSET;

588 int zeroreqbyte = REQ_UNSET;

589 int zerofirstbyte = REQ_UNSET;

590

591 /* The variable req_caseopt contains either the REQ_IGNORE_CASE value or zer o,

592 according to the current setting of the ignores-case flag. REQ_IGNORE_CASE is a bit

593 value > 255. It is added into the firstbyte or reqbyte variables to record the

594 case status of the value. This is used only for ASCII characters. */

595

596 int req_caseopt = (options & IgnoreCaseOption) ? REQ_IGNORE_CASE : 0;

597

598 /* Switch on next character until the end of the branch */

599

600 for (;; ptr++) {

601 bool negate_class;

602 bool should_flip_negation; /* If a negative special such as \S is used, we should negate the whole class to properly support Unicode. */

603 int class_charcount;

604 int class_lastchar;

605 int skipbytes;

606 int subreqbyte;

607 int subfirstbyte;

608 int mclength;

609 unsigned char mcbuffer[8];

610

611 /* Next byte in the pattern */

612

613 c = ptr < patternEnd ? *ptr : 0;

614

615 /* Fill in length of a previous callout, except when the next thing is

616 a quantifier. */

617

618 bool is_quantifier = c == '*' \|\| c == '+' \|\| c == '?' \|\| (c == '{' && is CountedRepeat(ptr + 1, patternEnd));

619

620 switch (c) {

621 /* The branch terminates at end of string, \|, or ). */

622

623 case 0:

624 if (ptr < patternEnd)

625 goto NORMAL_CHAR;

626 // End of string; fall through

627 case '\|':

628 case ')':

629 *firstbyteptr = firstbyte;

630 *reqbyteptr = reqbyte;

631 *codeptr = code;

632 *ptrptr = ptr;

633 return true;

634

635 /* Handle single-character metacharacters. In multiline mode, ^ disa bles

636 the setting of any following char as a first character. */

637

638 case '^':

639 if (options & MatchAcrossMultipleLinesOption) {

640 if (firstbyte == REQ_UNSET)

641 firstbyte = REQ_NONE;

642 *code++ = OP_BOL;

643 } else

644 *code++ = OP_CIRC;

645 previous = NULL;

646 break;

647

648 case '$':

649 previous = NULL;

650 if (options & MatchAcrossMultipleLinesOption)

651 *code++ = OP_EOL;

652 else

653 *code++ = OP_DOLL;

654 break;

655

656 /* There can never be a first char if '.' is first, whatever happens about

657 repeats. The value of reqbyte doesn't change either. */

658

659 case '.':

660 if (firstbyte == REQ_UNSET)

661 firstbyte = REQ_NONE;

662 zerofirstbyte = firstbyte;

663 zeroreqbyte = reqbyte;

664 previous = code;

665 *code++ = OP_NOT_NEWLINE;

666 break;

667

668 /* Character classes. If the included characters are all < 256, we b uild a

669 32-byte bitmap of the permitted characters, except in the special c ase

670 where there is only one such character. For negated classes, we bui ld the

671 map as usual, then invert it at the end. However, we use a differen t opcode

672 so that data characters > 255 can be handled correctly.

673

674 If the class contains characters outside the 0-255 range, a differe nt

675 opcode is compiled. It may optionally have a bit map for characters < 256,

676 but those above are are explicitly listed afterwards. A flag byte t ells

677 whether the bitmap is present, and whether this is a negated class or not.

678 */

679

680 case '[': {

681 previous = code;

682 should_flip_negation = false;

683

684 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if

685 they are encountered at the top level, so we'll do that too. */

686

687 /* If the first character is '^', set the negation flag and skip it. */

688

689 if (ptr + 1 >= patternEnd) {

690 *errorcodeptr = ERR6;

691 return false;

692 }

693

694 if (ptr[1] == '^') {

695 negate_class = true;

696 ++ptr;

697 } else

698 negate_class = false;

699

700 /* Keep a count of chars with values < 256 so that we can optimi ze the case

701 of just a single character (as long as it's < 256). For higher valued UTF-8

702 characters, we don't yet do any optimization. */

703

704 class_charcount = 0;

705 class_lastchar = -1;

706

707 class_utf8 = false; /* No chars >= 256 */

708 class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */

709

710 /* Initialize the 32-char bit map to all zeros. We have to build the

711 map in a temporary bit of store, in case the class contains onl y 1

712 character (< 256), because in that case the compiled code doesn 't use the

713 bit map. */

714

715 memset(classbits, 0, 32 * sizeof(unsigned char));

716

717 /* Process characters until ] is reached. The first pass

718 through the regex checked the overall syntax, so we don't need to be very

719 strict here. At the start of the loop, c contains the first byt e of the

720 character. */

721

722 while ((++ptr < patternEnd) && (c = *ptr) != ']') {

723 /* Backslash may introduce a single character, or it may int roduce one

724 of the specials, which just set a flag. Escaped items are c hecked for

725 validity in the pre-compiling pass. The sequence \b is a sp ecial case.

726 Inside a class (and only there) it is treated as backspace. Elsewhere

727 it marks a word boundary. Other escapes have preset maps re ady to

728 or into the one we are building. We assume they have more t han one

729 character in them, so set class_charcount bigger than one. */

730

731 if (c == '\\') {

732 c = checkEscape(&ptr, patternEnd, errorcodeptr, cd.numCa pturingBrackets, true);

733 if (c < 0) {

734 class_charcount += 2; /* Greater than 1 is what matters */

735 switch (-c) {

736 case ESC_d:

737 for (c = 0; c < 32; c++)

738 classbits[c] \|= classBitmapForChar(c + c bit_digit);

739 continue;

740

741 case ESC_D:

742 should_flip_negation = true;

743 for (c = 0; c < 32; c++)

744 classbits[c] \|= ~classBitmapForChar(c + cbit_digit);

745 continue;

746

747 case ESC_w:

748 for (c = 0; c < 32; c++)

749 classbits[c] \|= classBitmapForChar(c + c bit_word);

750 continue;

751

752 case ESC_W:

753 should_flip_negation = true;

754 for (c = 0; c < 32; c++)

755 classbits[c] \|= ~classBitmapForChar(c + cbit_word);

756 continue;

757

758 case ESC_s:

759 for (c = 0; c < 32; c++)

760 classbits[c] \|= classBitmapForChar(c + cbit_space);

761 continue;

762

763 case ESC_S:

764 should_flip_negation = true;

765 for (c = 0; c < 32; c++)

766 classbits[c] \|= ~classBitmapForChar(c + cbit_space);

767 continue;

768

769 /* Unrecognized escapes are faulted if PCRE is running in its

770 strict mode. By default, for compatibility with Perl, they are

771 treated as literals. */

772

773 default:

774 c = ptr; / The final characte r */

775 class_charcount -= 2; /* Undo the default c ount from above */

776 }

777 }

778

779 /* Fall through if we have a single character (c >= 0). This may be

780 > 256 in UTF-8 mode. */

781

782 } /* End of backslash handling */

783

784 /* A single character may be followed by '-' to form a range . However,

785 Perl does not permit ']' to be the end of the range. A '-' character

786 here is treated as a literal. */

787

788 if ((ptr + 2 < patternEnd) && ptr[1] == '-' && ptr[2] != ']' ) {

789 ptr += 2;

790

791 int d = *ptr;

792

793 /* The second part of a range can be a single-character escape, but

794 not any of the other escapes. Perl 5.6 treats a hyphen as a literal

795 in such circumstances. */

796

797 if (d == '\\') {

798 const UChar* oldptr = ptr;

799 d = checkEscape(&ptr, patternEnd, errorcodeptr, cd.n umCapturingBrackets, true);

800

801 /* \X is literal X; any other special means the '-' was literal */

802 if (d < 0) {

803 ptr = oldptr - 2;

804 goto LONE_SINGLE_CHARACTER; /* A few lines belo w */

805 }

806 }

807

808 /* The check that the two values are in the correct orde r happens in

809 the pre-pass. Optimize one-character ranges */

810

811 if (d == c)

812 goto LONE_SINGLE_CHARACTER; /* A few lines below */

813

814 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless

815 matching, we have to use an XCLASS with extra data item s. Caseless

816 matching for characters > 127 is available only if UCP support is

817 available. */

818

819 if ((d > 255 \|\| ((options & IgnoreCaseOption) && d > 127 ))) {

820 class_utf8 = true;

821

822 /* With UCP support, we can find the other case equi valents of

823 the relevant characters. There may be several range s. Optimize how

824 they fit with the basic range. */

825

826 if (options & IgnoreCaseOption) {

827 int occ, ocd;

828 int cc = c;

829 int origd = d;

830 while (getOthercaseRange(&cc, origd, &occ, &ocd) ) {

831 if (occ >= c && ocd <= d)

832 continue; /* Skip embedded ranges */

833

834 if (occ < c && ocd >= c - 1) /* Exte nd the basic range */

835 { /* if the re is overlap, */

836 c = occ; /* no ting that if occ < c */

837 continue; /* we can't have ocd > d */

838 } /* becaus e a subrange is */

839 if (ocd > d && occ <= d + 1) /* alwa ys shorter than */

840 { /* the ba sic range. */

841 d = ocd;

842 continue;

843 }

844

845 if (occ == ocd)

846 *class_utf8data++ = XCL_SINGLE;

847 else {

848 *class_utf8data++ = XCL_RANGE;

849 class_utf8data += encodeUTF8(occ, class_ utf8data);

850 }

851 class_utf8data += encodeUTF8(ocd, class_utf8 data);

852 }

853 }

854

855 /* Now record the original range, possibly modified for UCP caseless

856 overlapping ranges. */

857

858 *class_utf8data++ = XCL_RANGE;

859 class_utf8data += encodeUTF8(c, class_utf8data);

860 class_utf8data += encodeUTF8(d, class_utf8data);

861

862 /* With UCP support, we are done. Without UCP suppor t, there is no

863 caseless matching for UTF-8 characters > 127; we ca n use the bit map

864 for the smaller ones. */

865

866 continue; /* With next character in the class */

867 }

868

869 /* We use the bit map for all cases when not in UTF-8 mo de; else

870 ranges that lie entirely within 0-127 when there is UCP support; else

871 for partial ranges without UCP support. */

872

873 for (; c <= d; c++) {

874 classbits[c/8] \|= (1 << (c&7));

875 if (options & IgnoreCaseOption) {

876 int uc = flipCase(c);

877 classbits[uc/8] \|= (1 << (uc&7));

878 }

879 class_charcount++; /* in case a one-c har range */

880 class_lastchar = c;

881 }

882

883 continue; /* Go get the next char in the class */

884 }

885

886 /* Handle a lone single character - we can get here for a no rmal

887 non-escape char, or after \ that introduces a single charac ter or for an

888 apparent range that isn't. */

889

890 LONE_SINGLE_CHARACTER:

891

892 /* Handle a character that cannot go in the bit map */

893

894 if ((c > 255 \|\| ((options & IgnoreCaseOption) && c > 127))) {

895 class_utf8 = true;

896 *class_utf8data++ = XCL_SINGLE;

897 class_utf8data += encodeUTF8(c, class_utf8data);

898

899 if (options & IgnoreCaseOption) {

900 int othercase;

901 if ((othercase = kjs_pcre_ucp_othercase(c)) >= 0) {

902 *class_utf8data++ = XCL_SINGLE;

903 class_utf8data += encodeUTF8(othercase, class_ut f8data);

904 }

905 }

906 } else {

907 /* Handle a single-byte character */

908 classbits[c/8] \|= (1 << (c&7));

909 if (options & IgnoreCaseOption) {

910 c = flipCase(c);

911 classbits[c/8] \|= (1 << (c&7));

912 }

913 class_charcount++;

914 class_lastchar = c;

915 }

916 }

917

918 /* If class_charcount is 1, we saw precisely one character whose value is

919 less than 256. In non-UTF-8 mode we can always optimize. In UTF -8 mode, we

920 can optimize the negative case only if there were no characters >= 128

921 because OP_NOT and the related opcodes like OP_NOTSTAR operate on

922 single-bytes only. This is an historical hangover. Maybe one da y we can

923 tidy these opcodes to handle multi-byte characters.

924

925 The optimization throws away the bit map. We turn the item into a

926 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's neg ative. Note

927 that OP_NOT does not support multibyte characters. In the posit ive case, it

928 can cause firstbyte to be set. Otherwise, there can be no first char if

929 this item is first, whatever repeat count may follow. In the ca se of

930 reqbyte, save the previous value for reinstating. */

931

932 if (class_charcount == 1 && (!class_utf8 && (!negate_class \|\| cl ass_lastchar < 128))) {

933 zeroreqbyte = reqbyte;

934

935 /* The OP_NOT opcode works on one-byte characters only. */

936

937 if (negate_class) {

938 if (firstbyte == REQ_UNSET)

939 firstbyte = REQ_NONE;

940 zerofirstbyte = firstbyte;

941 *code++ = OP_NOT;

942 *code++ = class_lastchar;

943 break;

944 }

945

946 /* For a single, positive character, get the value into c, a nd

947 then we can handle this with the normal one-character code. */

948

949 c = class_lastchar;

950 goto NORMAL_CHAR;

951 } /* End of 1-char optimization */

952

953 /* The general case - not the one-char optimization. If this is the first

954 thing in the branch, there can be no first char setting, whatev er the

955 repeat count. Any reqbyte setting must remain unchanged after a ny kind of

956 repeat. */

957

958 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;

959 zerofirstbyte = firstbyte;

960 zeroreqbyte = reqbyte;

961

962 /* If there are characters with values > 255, we have to compile an

963 extended class, with its own opcode. If there are no characters < 256,

964 we can omit the bitmap. */

965

966 if (class_utf8 && !should_flip_negation) {

967 class_utf8data++ = XCL_END; / Marks the end of extra da ta */

968 *code++ = OP_XCLASS;

969 code += LINK_SIZE;

970 *code = negate_class? XCL_NOT : 0;

971

972 /* If the map is required, install it, and move on to the en d of

973 the extra data */

974

975 if (class_charcount > 0) {

976 *code++ \|= XCL_MAP;

977 memcpy(code, classbits, 32);

978 code = class_utf8data;

979 }

980

981 /* If the map is not required, slide down the extra data. */

982

983 else {

984 int len = class_utf8data - (code + 33);

985 memmove(code + 1, code + 33, len);

986 code += len + 1;

987 }

988

989 /* Now fill in the complete length of the item */

990

991 putLinkValue(previous + 1, code - previous);

992 break; /* End of class handling */

993 }

994

995 /* If there are no characters > 255, negate the 32-byte map if n ecessary,

996 and copy it into the code vector. If this is the first thing in the branch,

997 there can be no first char setting, whatever the repeat count. Any reqbyte

998 setting must remain unchanged after any kind of repeat. */

999

1000 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP _NCLASS;

1001 if (negate_class)

1002 for (c = 0; c < 32; c++)

1003 code[c] = ~classbits[c];

1004 else

1005 memcpy(code, classbits, 32);

1006 code += 32;

1007 break;

1008 }

1009

1010 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this

1011 has been tested above. */

1012

1013 case '{':

1014 if (!is_quantifier)

1015 goto NORMAL_CHAR;

1016 ptr = readRepeatCounts(ptr + 1, &repeat_min, &repeat_max, errorc odeptr);

1017 if (*errorcodeptr)

1018 goto FAILED;

1019 goto REPEAT;

1020

1021 case '*':

1022 repeat_min = 0;

1023 repeat_max = -1;

1024 goto REPEAT;

1025

1026 case '+':

1027 repeat_min = 1;

1028 repeat_max = -1;

1029 goto REPEAT;

1030

1031 case '?':

1032 repeat_min = 0;

1033 repeat_max = 1;

1034

1035 REPEAT:

1036 if (!previous) {

1037 *errorcodeptr = ERR9;

1038 goto FAILED;

1039 }

1040

1041 if (repeat_min == 0) {

1042 firstbyte = zerofirstbyte; /* Adjust for zero repeat */

1043 reqbyte = zeroreqbyte; /* Ditto */

1044 }

1045

1046 /* Remember whether this is a variable length repeat */

1047

1048 reqvary = (repeat_min == repeat_max) ? 0 : REQ_VARY;

1049

1050 op_type = 0; /* Default single-char op codes */

1051

1052 /* Save start of previous item, in case we have to move it up to make space

1053 for an inserted OP_ONCE for the additional '+' extension. */

1054 /* FIXME: Probably don't need this because we don't use OP_ONCE. */

1055

1056 tempcode = previous;

1057

1058 /* If the next character is '+', we have a possessive quantifier . This

1059 implies greediness, whatever the setting of the PCRE_UNGREEDY o ption.

1060 If the next character is '?' this is a minimizing repeat, by de fault,

1061 but if PCRE_UNGREEDY is set, it works the other way round. We c hange the

1062 repeat type to the non-default. */

1063

1064 if (safelyCheckNextChar(ptr, patternEnd, '?')) {

1065 repeat_type = 1;

1066 ptr++;

1067 } else

1068 repeat_type = 0;

1069

1070 /* If previous was a character match, abolish the item and gener ate a

1071 repeat item instead. If a char item has a minumum of more than one, ensure

1072 that it is set in reqbyte - it might not be if a sequence such as x{3} is

1073 the first thing in a branch because the x will have gone into f irstbyte

1074 instead. */

1075

1076 if (previous == OP_CHAR \|\| previous == OP_CHAR_IGNORING_CASE) {

1077 /* Deal with UTF-8 characters that take up more than one byt e. It's

1078 easier to write this out separately than try to macrify it. Use c to

1079 hold the length of the character in bytes, plus 0x80 to fla g that it's a

1080 length rather than a small character. */

1081

1082 if (code[-1] & 0x80) {

1083 unsigned char *lastchar = code - 1;

1084 while((*lastchar & 0xc0) == 0x80)

1085 lastchar--;

1086 c = code - lastchar; /* Length of UTF-8 chara cter */

1087 memcpy(utf8_char, lastchar, c); /* Save the char */

1088 c \|= 0x80; /* Flag c as a length */

1089 }

1090 else {

1091 c = code[-1];

1092 if (repeat_min > 1)

1093 reqbyte = c \| req_caseopt \| cd.req_varyopt;

1094 }

1095

1096 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single char acter types */

1097 }

1098

1099 else if (previous == OP_ASCII_CHAR \|\| previous == OP_ASCII_LET TER_IGNORING_CASE) {

1100 c = previous[1];

1101 if (repeat_min > 1)

1102 reqbyte = c \| req_caseopt \| cd.req_varyopt;

1103 goto OUTPUT_SINGLE_REPEAT;

1104 }

1105

1106 /* If previous was a single negated character ([^a] or similar), we use

1107 one of the special opcodes, replacing it. The code is shared wi th single-

1108 character repeats by setting opt_type to add a suitable offset into

1109 repeat_type. OP_NOT is currently used only for single-byte char s. */

1110

1111 else if (*previous == OP_NOT) {

1112 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */

1113 c = previous[1];

1114 goto OUTPUT_SINGLE_REPEAT;

1115 }

1116

1117 /* If previous was a character type match (\d or similar), aboli sh it and

1118 create a suitable repeat item. The code is shared with single-c haracter

1119 repeats by setting op_type to add a suitable offset into repeat _type. */

1120

1121 else if (*previous <= OP_NOT_NEWLINE) {

1122 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */

1123 c = *previous;

1124

1125 OUTPUT_SINGLE_REPEAT:

1126 int prop_type = -1;

1127 int prop_value = -1;

1128

1129 unsigned char* oldcode = code;

1130 code = previous; /* Usually overwrite previ ous item */

1131

1132 /* If the maximum is zero then the minimum must also be zero ; Perl allows

1133 this case, so we do too - by simply omitting the item altog ether. */

1134

1135 if (repeat_max == 0)

1136 goto END_REPEAT;

1137

1138 /* Combine the op_type with the repeat_type */

1139

1140 repeat_type += op_type;

1141

1142 /* A minimum of zero is handled either as the special case * or ?, or as

1143 an UPTO, with the maximum given. */

1144

1145 if (repeat_min == 0) {

1146 if (repeat_max == -1)

1147 *code++ = OP_STAR + repeat_type;

1148 else if (repeat_max == 1)

1149 *code++ = OP_QUERY + repeat_type;

1150 else {

1151 *code++ = OP_UPTO + repeat_type;

1152 put2ByteValueAndAdvance(code, repeat_max);

1153 }

1154 }

1155

1156 /* A repeat minimum of 1 is optimized into some special case s. If the

1157 maximum is unlimited, we use OP_PLUS. Otherwise, the origin al item it

1158 left in place and, if the maximum is greater than 1, we use OP_UPTO with

1159 one less than the maximum. */

1160

1161 else if (repeat_min == 1) {

1162 if (repeat_max == -1)

1163 *code++ = OP_PLUS + repeat_type;

1164 else {

1165 code = oldcode; /* leave previous it em in place */

1166 if (repeat_max == 1)

1167 goto END_REPEAT;

1168 *code++ = OP_UPTO + repeat_type;

1169 put2ByteValueAndAdvance(code, repeat_max - 1);

1170 }

1171 }

1172

1173 /* The case {n,n} is just an EXACT, while the general case { n,m} is

1174 handled as an EXACT followed by an UPTO. */

1175

1176 else {

1177 code++ = OP_EXACT + op_type; / NB EXACT doesn't have repeat_type */

1178 put2ByteValueAndAdvance(code, repeat_min);

1179

1180 /* If the maximum is unlimited, insert an OP_STAR. Befor e doing so,

1181 we have to insert the character for the previous code. For a repeated

1182 Unicode property match, there are two extra bytes that define the

1183 required property. In UTF-8 mode, long characters have their length in

1184 c, with the 0x80 bit as a flag. */

1185

1186 if (repeat_max < 0) {

1187 if (c >= 128) {

1188 memcpy(code, utf8_char, c & 7);

1189 code += c & 7;

1190 } else {

1191 *code++ = c;

1192 if (prop_type >= 0) {

1193 *code++ = prop_type;

1194 *code++ = prop_value;

1195 }

1196 }

1197 *code++ = OP_STAR + repeat_type;

1198 }

1199

1200 /* Else insert an UPTO if the max is greater than the mi n, again

1201 preceded by the character, for the previously inserted code. */

1202

1203 else if (repeat_max != repeat_min) {

1204 if (c >= 128) {

1205 memcpy(code, utf8_char, c & 7);

1206 code += c & 7;

1207 } else

1208 *code++ = c;

1209 if (prop_type >= 0) {

1210 *code++ = prop_type;

1211 *code++ = prop_value;

1212 }

1213 repeat_max -= repeat_min;

1214 *code++ = OP_UPTO + repeat_type;

1215 put2ByteValueAndAdvance(code, repeat_max);

1216 }

1217 }

1218

1219 /* The character or character type itself comes last in all cases. */

1220

1221 if (c >= 128) {

1222 memcpy(code, utf8_char, c & 7);

1223 code += c & 7;

1224 } else

1225 *code++ = c;

1226

1227 /* For a repeated Unicode property match, there are two extr a bytes that

1228 define the required property. */

1229

1230 if (prop_type >= 0) {

1231 *code++ = prop_type;

1232 *code++ = prop_value;

1233 }

1234 }

1235

1236 /* If previous was a character class or a back reference, we put the repeat

1237 stuff after it, but just skip the item if the repeat was {0,0}. */

1238

1239 else if (*previous == OP_CLASS \|\|

1240 *previous == OP_NCLASS \|\|

1241 *previous == OP_XCLASS \|\|

1242 *previous == OP_REF)

1243 {

1244 if (repeat_max == 0) {

1245 code = previous;

1246 goto END_REPEAT;

1247 }

1248

1249 if (repeat_min == 0 && repeat_max == -1)

1250 *code++ = OP_CRSTAR + repeat_type;

1251 else if (repeat_min == 1 && repeat_max == -1)

1252 *code++ = OP_CRPLUS + repeat_type;

1253 else if (repeat_min == 0 && repeat_max == 1)

1254 *code++ = OP_CRQUERY + repeat_type;

1255 else {

1256 *code++ = OP_CRRANGE + repeat_type;

1257 put2ByteValueAndAdvance(code, repeat_min);

1258 if (repeat_max == -1)

1259 repeat_max = 0; /* 2-byte encoding for max */

1260 put2ByteValueAndAdvance(code, repeat_max);

1261 }

1262 }

1263

1264 /* If previous was a bracket group, we may have to replicate it in certain

1265 cases. */

1266

1267 else if (*previous >= OP_BRA) {

1268 int ketoffset = 0;

1269 int len = code - previous;

1270 unsigned char* bralink = NULL;

1271

1272 /* If the maximum repeat count is unlimited, find the end of the bracket

1273 by scanning through from the start, and compute the offset back to it

1274 from the current code pointer. There may be an OP_OPT setti ng following

1275 the final KET, so we can't find the end just by going back from the code

1276 pointer. */

1277

1278 if (repeat_max == -1) {

1279 const unsigned char* ket = previous;

1280 advanceToEndOfBracket(ket);

1281 ketoffset = code - ket;

1282 }

1283

1284 /* The case of a zero minimum is special because of the need to stick

1285 OP_BRAZERO in front of it, and because the group appears on ce in the

1286 data, whereas in other cases it appears the minimum number of times. For

1287 this reason, it is simplest to treat this case separately, as otherwise

1288 the code gets far too messy. There are several special subc ases when the

1289 minimum is zero. */

1290

1291 if (repeat_min == 0) {

1292 /* If the maximum is also zero, we just omit the group f rom the output

1293 altogether. */

1294

1295 if (repeat_max == 0) {

1296 code = previous;

1297 goto END_REPEAT;

1298 }

1299

1300 /* If the maximum is 1 or unlimited, we just have to sti ck in the

1301 BRAZERO and do no more at this point. However, we do ne ed to adjust

1302 any OP_RECURSE calls inside the group that refer to the group itself or

1303 any internal group, because the offset is from the star t of the whole

1304 regex. Temporarily terminate the pattern while doing th is. */

1305

1306 if (repeat_max <= 1) {

1307 *code = OP_END;

1308 memmove(previous+1, previous, len);

1309 code++;

1310 *previous++ = OP_BRAZERO + repeat_type;

1311 }

1312

1313 /* If the maximum is greater than 1 and limited, we have to replicate

1314 in a nested fashion, sticking OP_BRAZERO before each se t of brackets.

1315 The first one has to be handled carefully because it's the original

1316 copy, which has to be moved up. The remainder can be ha ndled by code

1317 that is common with the non-zero minimum case below. We have to

1318 adjust the value of repeat_max, since one less copy is required. */

1319

1320 else {

1321 *code = OP_END;

1322 memmove(previous + 2 + LINK_SIZE, previous, len);

1323 code += 2 + LINK_SIZE;

1324 *previous++ = OP_BRAZERO + repeat_type;

1325 *previous++ = OP_BRA;

1326

1327 /* We chain together the bracket offset fields that have to be

1328 filled in later when the ends of the brackets are r eached. */

1329

1330 int offset = (!bralink) ? 0 : previous - bralink;

1331 bralink = previous;

1332 putLinkValueAllowZeroAndAdvance(previous, offset);

1333 }

1334

1335 repeat_max--;

1336 }

1337

1338 /* If the minimum is greater than zero, replicate the group as many

1339 times as necessary, and adjust the maximum to the number of subsequent

1340 copies that we need. If we set a first char from the group, and didn't

1341 set a required char, copy the latter from the former. */

1342

1343 else {

1344 if (repeat_min > 1) {

1345 if (groupsetfirstbyte && reqbyte < 0)

1346 reqbyte = firstbyte;

1347 for (int i = 1; i < repeat_min; i++) {

1348 memcpy(code, previous, len);

1349 code += len;

1350 }

1351 }

1352 if (repeat_max > 0)

1353 repeat_max -= repeat_min;

1354 }

1355

1356 /* This code is common to both the zero and non-zero minimum cases. If

1357 the maximum is limited, it replicates the group in a nested fashion,

1358 remembering the bracket starts on a stack. In the case of a zero minimum,

1359 the first one was set up above. In all cases the repeat_max now specifies

1360 the number of additional copies needed. */

1361

1362 if (repeat_max >= 0) {

1363 for (int i = repeat_max - 1; i >= 0; i--) {

1364 *code++ = OP_BRAZERO + repeat_type;

1365

1366 /* All but the final copy start a new nesting, maint aining the

1367 chain of brackets outstanding. */

1368

1369 if (i != 0) {

1370 *code++ = OP_BRA;

1371 int offset = (!bralink) ? 0 : code - bralink;

1372 bralink = code;

1373 putLinkValueAllowZeroAndAdvance(code, offset);

1374 }

1375

1376 memcpy(code, previous, len);

1377 code += len;

1378 }

1379

1380 /* Now chain through the pending brackets, and fill in t heir length

1381 fields (which are holding the chain links pro tem). */

1382

1383 while (bralink) {

1384 int offset = code - bralink + 1;

1385 unsigned char* bra = code - offset;

1386 int oldlinkoffset = getLinkValueAllowZero(bra + 1);

1387 bralink = (!oldlinkoffset) ? 0 : bralink - oldlinkof fset;

1388 *code++ = OP_KET;

1389 putLinkValueAndAdvance(code, offset);

1390 putLinkValue(bra + 1, offset);

1391 }

1392 }

1393

1394 /* If the maximum is unlimited, set a repeater in the final copy. We

1395 can't just offset backwards from the current code point, be cause we

1396 don't know if there's been an options resetting after the k et. The

1397 correct offset was computed above. */

1398

1399 else

1400 code[-ketoffset] = OP_KETRMAX + repeat_type;

1401 }

1402

1403 /* Else there's some kind of shambles */

1404

1405 else {

1406 *errorcodeptr = ERR11;

1407 goto FAILED;

1408 }

1409

1410 /* In all case we no longer have a previous item. We also set th e

1411 "follows varying string" flag for subsequently encountered reqb ytes if

1412 it isn't already set and we have just passed a varying length i tem. */

1413

1414 END_REPEAT:

1415 previous = NULL;

1416 cd.req_varyopt \|= reqvary;

1417 break;

1418

1419 /* Start of nested bracket sub-expression, or comment or lookahead o r

1420 lookbehind or option setting or condition. First deal with special things

1421 that can come after a bracket; all are introduced by ?, and the app earance

1422 of any of them means that this is not a referencing group. They wer e

1423 checked for validity in the first pass over the string, so we don't have to

1424 check for syntax errors here. */

1425

1426 case '(':

1427 skipbytes = 0;

1428

1429 if (*(++ptr) == '?') {

1430 switch (*(++ptr)) {

1431 case ':': /* Non-extracting bracket */

1432 bravalue = OP_BRA;

1433 ptr++;

1434 break;

1435

1436 case '=': /* Positive lookahead */

1437 bravalue = OP_ASSERT;

1438 ptr++;

1439 break;

1440

1441 case '!': /* Negative lookahead */

1442 bravalue = OP_ASSERT_NOT;

1443 ptr++;

1444 break;

1445

1446 /* Character after (? not specially recognized */

1447

1448 default:

1449 *errorcodeptr = ERR12;

1450 goto FAILED;

1451 }

1452 }

1453

1454 /* Else we have a referencing group; adjust the opcode. If the b racket

1455 number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and

1456 arrange for the true number to follow later, in an OP_BRANUMBER item. */

1457

1458 else {

1459 if (++(*brackets) > EXTRACT_BASIC_MAX) {

1460 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;

1461 code[1 + LINK_SIZE] = OP_BRANUMBER;

1462 put2ByteValue(code + 2 + LINK_SIZE, *brackets);

1463 skipbytes = 3;

1464 }

1465 else

1466 bravalue = OP_BRA + *brackets;

1467 }

1468

1469 /* Process nested bracketed re. Assertions may not be repeated, but other

1470 kinds can be. We copy code into a non-variable in order to be a ble

1471 to pass its address because some compilers complain otherwise. Pass in a

1472 new setting for the ims options if they have changed. */

1473

1474 previous = (bravalue >= OP_BRAZERO) ? code : 0;

1475 *code = bravalue;

1476 tempcode = code;

1477 tempreqvary = cd.req_varyopt; /* Save value before bracket * /

1478

1479 if (!compileBracket(

1480 options,

1481 brackets, /* Extracting b racket count */

1482 &tempcode, /* Where to put code (updated) */

1483 &ptr, /* Input pointe r (updated) */

1484 patternEnd,

1485 errorcodeptr, /* Where to put an error message */

1486 skipbytes, /* Skip over OP _BRANUMBER */

1487 &subfirstbyte, /* For possible first char */

1488 &subreqbyte, /* For possible last char */

1489 cd)) /* Tables block */

1490 goto FAILED;

1491

1492 /* At the end of compiling, code is still pointing to the start of the

1493 group, while tempcode has been updated to point past the end of the group

1494 and any option resetting that may follow it. The pattern pointe r (ptr)

1495 is on the bracket. */

1496

1497 /* Handle updating of the required and first characters. Update for normal

1498 brackets of all kinds, and conditions with two branches (see co de above).

1499 If the bracket is followed by a quantifier with zero repeat, we have to

1500 back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the

1501 main loop so that they can be accessed for the back off. */

1502

1503 zeroreqbyte = reqbyte;

1504 zerofirstbyte = firstbyte;

1505 groupsetfirstbyte = false;

1506

1507 if (bravalue >= OP_BRA) {

1508 /* If we have not yet set a firstbyte in this branch, take i t from the

1509 subpattern, remembering that it was set here so that a repe at of more

1510 than one can replicate it as reqbyte if necessary. If the s ubpattern has

1511 no firstbyte, set "none" for the whole branch. In both case s, a zero

1512 repeat forces firstbyte to "none". */

1513

1514 if (firstbyte == REQ_UNSET) {

1515 if (subfirstbyte >= 0) {

1516 firstbyte = subfirstbyte;

1517 groupsetfirstbyte = true;

1518 }

1519 else

1520 firstbyte = REQ_NONE;

1521 zerofirstbyte = REQ_NONE;

1522 }

1523

1524 /* If firstbyte was previously set, convert the subpattern's firstbyte

1525 into reqbyte if there wasn't one, using the vary flag that was in

1526 existence beforehand. */

1527

1528 else if (subfirstbyte >= 0 && subreqbyte < 0)

1529 subreqbyte = subfirstbyte \| tempreqvary;

1530

1531 /* If the subpattern set a required byte (or set a first byt e that isn't

1532 really the first byte - see above), set it. */

1533

1534 if (subreqbyte >= 0)

1535 reqbyte = subreqbyte;

1536 }

1537

1538 /* For a forward assertion, we take the reqbyte, if set. This ca n be

1539 helpful if the pattern that follows the assertion doesn't set a different

1540 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte

1541 for an assertion, however because it leads to incorrect effect for patterns

1542 such as /(?=a)a.+/ when the "real" "a" would then become a reqb yte instead

1543 of a firstbyte. This is overcome by a scan at the end if there' s no

1544 firstbyte, looking for an asserted first char. */

1545

1546 else if (bravalue == OP_ASSERT && subreqbyte >= 0)

1547 reqbyte = subreqbyte;

1548

1549 /* Now update the main code pointer to the end of the group. */

1550

1551 code = tempcode;

1552

1553 /* Error if hit end of pattern */

1554

1555 if (ptr >= patternEnd \|\| *ptr != ')') {

1556 *errorcodeptr = ERR14;

1557 goto FAILED;

1558 }

1559 break;

1560

1561 /* Check \ for being a real metacharacter; if not, fall through and handle

1562 it as a data character at the start of a string. Escape items are c hecked

1563 for validity in the pre-compiling pass. */

1564

1565 case '\\':

1566 c = checkEscape(&ptr, patternEnd, errorcodeptr, cd.numCapturingB rackets, false);

1567

1568 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values

1569 are arranged to be the negation of the corresponding OP_values. For the

1570 back references, the values are ESC_REF plus the reference numb er. Only

1571 back references and those types that consume a character may be repeated.

1572 We can test for values between ESC_b and ESC_w for the latter; this may

1573 have to change if any new ones are ever created. */

1574

1575 if (c < 0) {

1576 /* For metasequences that actually match a character, we dis able the

1577 setting of a first character if it hasn't already been set. */

1578

1579 if (firstbyte == REQ_UNSET && -c > ESC_b && -c <= ESC_w)

1580 firstbyte = REQ_NONE;

1581

1582 /* Set values to reset to if this is followed by a zero repe at. */

1583

1584 zerofirstbyte = firstbyte;

1585 zeroreqbyte = reqbyte;

1586

1587 /* Back references are handled specially */

1588

1589 if (-c >= ESC_REF) {

1590 int number = -c - ESC_REF;

1591 previous = code;

1592 *code++ = OP_REF;

1593 put2ByteValueAndAdvance(code, number);

1594 }

1595

1596 /* For the rest, we can obtain the OP value by negating the escape

1597 value */

1598

1599 else {

1600 previous = (-c > ESC_b && -c <= ESC_w) ? code : NULL;

1601 *code++ = -c;

1602 }

1603 continue;

1604 }

1605

1606 /* Fall through. */

1607

1608 /* Handle a literal character. It is guaranteed not to be whites pace or #

1609 when the extended flag is set. If we are in UTF-8 mode, it may be a

1610 multi-byte literal character. */

1611

1612 default:

1613 NORMAL_CHAR:

1614

1615 previous = code;

1616

1617 if (c < 128) {

1618 mclength = 1;

1619 mcbuffer[0] = c;

1620

1621 if ((options & IgnoreCaseOption) && (c \| 0x20) >= 'a' && (c \| 0x20) <= 'z') {

1622 *code++ = OP_ASCII_LETTER_IGNORING_CASE;

1623 *code++ = c \| 0x20;

1624 } else {

1625 *code++ = OP_ASCII_CHAR;

1626 *code++ = c;

1627 }

1628 } else {

1629 mclength = encodeUTF8(c, mcbuffer);

1630

1631 *code++ = (options & IgnoreCaseOption) ? OP_CHAR_IGNORING_CA SE : OP_CHAR;

1632 for (c = 0; c < mclength; c++)

1633 *code++ = mcbuffer[c];

1634 }

1635

1636 /* Set the first and required bytes appropriately. If no previou s first

1637 byte, set it from this character, but revert to none on a zero repeat.

1638 Otherwise, leave the firstbyte value alone, and don't change it on a zero

1639 repeat. */

1640

1641 if (firstbyte == REQ_UNSET) {

1642 zerofirstbyte = REQ_NONE;

1643 zeroreqbyte = reqbyte;

1644

1645 /* If the character is more than one byte long, we can set f irstbyte

1646 only if it is not to be matched caselessly. */

1647

1648 if (mclength == 1 \|\| req_caseopt == 0) {

1649 firstbyte = mcbuffer[0] \| req_caseopt;

1650 if (mclength != 1)

1651 reqbyte = code[-1] \| cd.req_varyopt;

1652 }

1653 else

1654 firstbyte = reqbyte = REQ_NONE;

1655 }

1656

1657 /* firstbyte was previously set; we can set reqbyte only the len gth is

1658 1 or the matching is caseful. */

1659

1660 else {

1661 zerofirstbyte = firstbyte;

1662 zeroreqbyte = reqbyte;

1663 if (mclength == 1 \|\| req_caseopt == 0)

1664 reqbyte = code[-1] \| req_caseopt \| cd.req_varyopt;

1665 }

1666

1667 break; /* End of literal character handling */

1668 }

1669 } /* end of big loop */

1670

1671 /* Control never reaches here by falling through, only by a goto for all the

1672 error states. Pass back the position in the pattern so that it can be displ ayed

1673 to the user for diagnosing the error. */

1674

1675 FAILED:

1676 *ptrptr = ptr;

1677 return false;

1678 }

1679

1680 /*************************************************

1681 * Compile sequence of alternatives *

1682 *************************************************/

1683

1684 /* On entry, ptr is pointing past the bracket character, but on return

1685 it points to the closing bracket, or vertical bar, or end of string.

1686 The code variable is pointing at the byte into which the BRA operator has been

1687 stored. If the ims options are changed at the start (for a (?ims: group) or

1688 during any branch, we need to insert an OP_OPT item at the start of every

1689 following branch to ensure they get set correctly at run time, and also pass

1690 the new options into every subsequent branch compile.

1691

1692 Argument:

1693 options option bits, including any changes for this subpattern

1694 brackets -> int containing the number of extracting brackets used

1695 codeptr -> the address of the current code pointer

1696 ptrptr -> the address of the current pattern pointer

1697 errorcodeptr -> pointer to error code variable

1698 skipbytes skip this many bytes at start (for OP_BRANUMBER)

1699 firstbyteptr place to put the first required character, or a negative number

1700 reqbyteptr place to put the last required character, or a negative number

1701 cd points to the data block with tables pointers etc.

1702

1703 Returns: true on success

1704 */

1705

1706 static bool

1707 compileBracket(int options, int* brackets, unsigned char** codeptr,

1708 const UChar** ptrptr, const UChar* patternEnd, ErrorCode* errorcodeptr, int skipbytes,

1709 int* firstbyteptr, int* reqbyteptr, CompileData& cd)

1710 {

1711 const UChar* ptr = *ptrptr;

1712 unsigned char* code = *codeptr;

1713 unsigned char* last_branch = code;

1714 unsigned char* start_bracket = code;

1715 int firstbyte = REQ_UNSET;

1716 int reqbyte = REQ_UNSET;

1717

1718 /* Offset is set zero to mark that this bracket is still open */

1719

1720 putLinkValueAllowZero(code + 1, 0);

1721 code += 1 + LINK_SIZE + skipbytes;

1722

1723 /* Loop for each alternative branch */

1724

1725 while (true) {

1726 /* Now compile the branch */

1727

1728 int branchfirstbyte;

1729 int branchreqbyte;

1730 if (!compileBranch(options, brackets, &code, &ptr, patternEnd, errorcode ptr,

1731 &branchfirstbyte, &branchreqbyte, cd)) {

1732 *ptrptr = ptr;

1733 return false;

1734 }

1735

1736 /* If this is the first branch, the firstbyte and reqbyte values for the

1737 branch become the values for the regex. */

1738

1739 if (*last_branch != OP_ALT) {

1740 firstbyte = branchfirstbyte;

1741 reqbyte = branchreqbyte;

1742 }

1743

1744 /* If this is not the first branch, the first char and reqbyte have to

1745 match the values from all the previous branches, except that if the pre vious

1746 value for reqbyte didn't have REQ_VARY set, it can still match, and we set

1747 REQ_VARY for the regex. */

1748

1749 else {

1750 /* If we previously had a firstbyte, but it doesn't match the new br anch,

1751 we have to abandon the firstbyte for the regex, but if there was pr eviously

1752 no reqbyte, it takes on the value of the old firstbyte. */

1753

1754 if (firstbyte >= 0 && firstbyte != branchfirstbyte) {

1755 if (reqbyte < 0)

1756 reqbyte = firstbyte;

1757 firstbyte = REQ_NONE;

1758 }

1759

1760 /* If we (now or from before) have no firstbyte, a firstbyte from th e

1761 branch becomes a reqbyte if there isn't a branch reqbyte. */

1762

1763 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)

1764 branchreqbyte = branchfirstbyte;

1765

1766 /* Now ensure that the reqbytes match */

1767

1768 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))

1769 reqbyte = REQ_NONE;

1770 else

1771 reqbyte \|= branchreqbyte; /* To "or" REQ_VARY */

1772 }

1773

1774 /* Reached end of expression, either ')' or end of pattern. Go back thro ugh

1775 the alternative branches and reverse the chain of offsets, with the fie ld in

1776 the BRA item now becoming an offset to the first alternative. If there are

1777 no alternatives, it points to the end of the group. The length in the

1778 terminating ket is always the length of the whole bracketed item. If an y of

1779 the ims options were changed inside the group, compile a resetting op-c ode

1780 following, except at the very end of the pattern. Return leaving the po inter

1781 at the terminating char. */

1782

1783 if (ptr >= patternEnd \|\| *ptr != '\|') {

1784 int length = code - last_branch;

1785 do {

1786 int prev_length = getLinkValueAllowZero(last_branch + 1);

1787 putLinkValue(last_branch + 1, length);

1788 length = prev_length;

1789 last_branch -= length;

1790 } while (length > 0);

1791

1792 /* Fill in the ket */

1793

1794 *code = OP_KET;

1795 putLinkValue(code + 1, code - start_bracket);

1796 code += 1 + LINK_SIZE;

1797

1798 /* Set values to pass back */

1799

1800 *codeptr = code;

1801 *ptrptr = ptr;

1802 *firstbyteptr = firstbyte;

1803 *reqbyteptr = reqbyte;

1804 return true;

1805 }

1806

1807 /* Another branch follows; insert an "or" node. Its length field points back

1808 to the previous branch while the bracket remains open. At the end the c hain

1809 is reversed. It's done like this so that the start of the bracket has a

1810 zero offset until it is closed, making it possible to detect recursion. */

1811

1812 *code = OP_ALT;

1813 putLinkValue(code + 1, code - last_branch);

1814 last_branch = code;

1815 code += 1 + LINK_SIZE;

1816 ptr++;

1817 }

1818 ASSERT_NOT_REACHED();

1819 }

1820

1821 /*************************************************

1822 * Check for anchored expression *

1823 *************************************************/

1824

1825 /* Try to find out if this is an anchored regular expression. Consider each

1826 alternative branch. If they all start OP_CIRC, or with a bracket

1827 all of whose alternatives start OP_CIRC (recurse ad lib), then

1828 it's anchored.

1829

1830 Arguments:

1831 code points to start of expression (the bracket)

1832 captureMap a bitmap of which brackets we are inside while testing; this

1833 handles up to substring 31; all brackets after that share

1834 the zero bit

1835 backrefMap the back reference bitmap

1836 */

1837

1838 static bool branchIsAnchored(const unsigned char* code)

1839 {

1840 const unsigned char* scode = firstSignificantOpcode(code);

1841 int op = *scode;

1842

1843 /* Brackets */

1844 if (op >= OP_BRA || op == OP_ASSERT)

1845 return bracketIsAnchored(scode);

1846

1847 /* Check for explicit anchoring */

1848 return op == OP_CIRC;

1849 }

1850

1851 static bool bracketIsAnchored(const unsigned char* code)

1852 {

1853 do {

1854 if (!branchIsAnchored(code + 1 + LINK_SIZE))

1855 return false;

1856 code += getLinkValue(code + 1);

1857 } while (*code == OP_ALT); /* Loop for each alternative */

1858 return true;

1859 }

1860

1861 /*************************************************

1862 * Check for starting with ^ or .* *

1863 *************************************************/

1864

1865 /* This is called to find out if every branch starts with ^ or .* so that

1866 "first char" processing can be done to speed things up in multiline

1867 matching and for non-DOTALL patterns that start with .* (which must start at

1868 the beginning or after \n)

1869

1870 Except when the .* appears inside capturing parentheses, and there is a

1871 subsequent back reference to those parentheses. By keeping a bitmap of the

1872 first 31 back references, we can catch some of the more common cases more

1873 precisely; all the greater back references share a single bit.

1874

1875 Arguments:

1876 code points to start of expression (the bracket)

1877 captureMap a bitmap of which brackets we are inside while testing; this

1878 handles up to substring 31; all brackets after that share

1879 the zero bit

1880 backrefMap the back reference bitmap

1881 */

1882

1883 static bool branchNeedsLineStart(const unsigned char* code, unsigned captureMap, unsigned backrefMap)

1884 {

1885 const unsigned char* scode = firstSignificantOpcode(code);

1886 int op = *scode;

1887

1888 /* Capturing brackets */

1889 if (op > OP_BRA) {

1890 int captureNum = op - OP_BRA;

1891 if (captureNum > EXTRACT_BASIC_MAX)

1892 captureNum = get2ByteValue(scode + 2 + LINK_SIZE);

1893 int bracketMask = (captureNum < 32) ? (1 << captureNum) : 1;

1894 return bracketNeedsLineStart(scode, captureMap | bracketMask, backrefMap);

1895 }

1896

1897 /* Other brackets */

1898 if (op == OP_BRA || op == OP_ASSERT)

1899 return bracketNeedsLineStart(scode, captureMap, backrefMap);

1900

1901 /* .* means "start at start or after \n" if it isn't in brackets that

1902 may be referenced. */

1903

1904 if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)

1905 return scode[1] == OP_NOT_NEWLINE && !(captureMap & backrefMap);

1906

1907 /* Explicit ^ */

1908 return op == OP_CIRC || op == OP_BOL;

1909 }

1910

1911 static bool bracketNeedsLineStart(const unsigned char* code, unsigned captureMap , unsigned backrefMap)

1912 {

1913 do {

1914 if (!branchNeedsLineStart(code + 1 + LINK_SIZE, captureMap, backrefMap))

1915 return false;

1916 code += getLinkValue(code + 1);

1917 } while (*code == OP_ALT); /* Loop for each alternative */

1918 return true;

1919 }

1920

1921 /*************************************************

1922 * Check for asserted fixed first char *

1923 *************************************************/

1924

1925 /* During compilation, the "first char" settings from forward assertions are

1926 discarded, because they can cause conflicts with actual literals that follow.

1927 However, if we end up without a first char setting for an unanchored pattern,

1928 it is worth scanning the regex to see if there is an initial asserted first

1929 char. If all branches start with the same asserted char, or with a bracket all

1930 of whose alternatives start with the same asserted char (recurse ad lib), then

1931 we return that char, otherwise -1.

1932

1933 Arguments:

1934 code points to start of expression (the bracket)

1935 options pointer to the options (used to check casing changes)

1936 inassert true if in an assertion

1937

1938 Returns: -1 or the fixed first char

1939 */

1940

1941 static int branchFindFirstAssertedCharacter(const unsigned char* code, bool inassert)

1942 {

1943 const unsigned char* scode = firstSignificantOpcodeSkippingAssertions(code);

1944 int op = *scode;

1945

1946 if (op >= OP_BRA)

1947 op = OP_BRA;

1948

1949 switch (op) {

1950 default:

1951 return -1;

1952

1953 case OP_BRA:

1954 case OP_ASSERT:

1955 return bracketFindFirstAssertedCharacter(scode, op == OP_ASSERT);

1956

1957 case OP_EXACT:

1958 scode += 2;

1959 /* Fall through */

1960

1961 case OP_CHAR:

1962 case OP_CHAR_IGNORING_CASE:

1963 case OP_ASCII_CHAR:

1964 case OP_ASCII_LETTER_IGNORING_CASE:

1965 case OP_PLUS:

1966 case OP_MINPLUS:

1967 if (!inassert)

1968 return -1;

1969 return scode[1];

1970 }

1971 }

1972

1973 static int bracketFindFirstAssertedCharacter(const unsigned char* code, bool inassert)

1974 {

1975 int c = -1;

1976 do {

1977 int d = branchFindFirstAssertedCharacter(code + 1 + LINK_SIZE, inassert) ;

1978 if (d < 0)

1979 return -1;

1980 if (c < 0)

1981 c = d;

1982 else if (c != d)

1983 return -1;

1984 code += getLinkValue(code + 1);

1985 } while (*code == OP_ALT);

1986 return c;

1987 }

1988

1989 static inline int multiplyWithOverflowCheck(int a, int b)

1990 {

1991 if (!a || !b)

1992 return 0;

1993 if (a > MAX_PATTERN_SIZE / b)

1994 return -1;

1995 return a * b;

1996 }

1997

1998 static int calculateCompiledPatternLength(const UChar* pattern, int patternLength, JSRegExpIgnoreCaseOption ignoreCase,

1999 CompileData& cd, ErrorCode& errorcode)

2000 {

2001 /* Make a pass over the pattern to compute the

2002 amount of store required to hold the compiled code. This does not have to be

2003 perfect as long as errors are overestimates. */

2004

2005 if (patternLength > MAX_PATTERN_SIZE) {

2006 errorcode = ERR16;

2007 return -1;

2008 }

2009

2010 int length = 1 + LINK_SIZE; /* For initial BRA plus length */

2011 int branch_extra = 0;

2012 int lastitemlength = 0;

2013 unsigned brastackptr = 0;

2014 int brastack[BRASTACK_SIZE];

2015 unsigned char bralenstack[BRASTACK_SIZE];

2016 int bracount = 0;

2017

2018 const UChar* ptr = reinterpret_cast<const UChar*>(pattern - 1);

2019 const UChar* patternEnd = reinterpret_cast<const UChar*>(pattern + patternLe ngth);

2020

2021 while (++ptr < patternEnd) {

2022 int minRepeats = 0, maxRepeats = 0;

2023 int c = *ptr;

2024

2025 switch (c) {

2026 /* A backslashed item may be an escaped data character or it may be a

2027 character type. */

2028

2029 case '\\':

2030 c = checkEscape(&ptr, patternEnd, &errorcode, cd.numCapturingBra ckets, false);

2031 if (errorcode != 0)

2032 return -1;

2033

2034 lastitemlength = 1; /* Default length of last item for repea ts */

2035

2036 if (c >= 0) { /* Data character */

2037 length += 2; /* For a one-byte character */

2038

2039 if (c > 127) {

2040 int i;

2041 for (i = 0; i < kjs_pcre_utf8_table1_size; i++)

2042 if (c <= kjs_pcre_utf8_table1[i]) break;

2043 length += i;

2044 lastitemlength += i;

2045 }

2046

2047 continue;

2048 }

2049

2050 /* Other escapes need one byte */

2051

2052 length++;

2053

2054 /* A back reference needs an additional 2 bytes, plus either one or 5

2055 bytes for a repeat. We also need to keep the value of the highe st

2056 back reference. */

2057

2058 if (c <= -ESC_REF) {

2059 int refnum = -c - ESC_REF;

2060 cd.backrefMap |= (refnum < 32) ? (1 << refnum) : 1;

2061 if (refnum > cd.top_backref)

2062 cd.top_backref = refnum;

2063 length += 2; /* For single back reference */

2064 if (safelyCheckNextChar(ptr, patternEnd, '{') && isCountedRe peat(ptr + 2, patternEnd)) {

2065 ptr = readRepeatCounts(ptr + 2, &minRepeats, &maxRepeats , &errorcode);

2066 if (errorcode)

2067 return -1;

2068 if ((minRepeats == 0 && (maxRepeats == 1 || maxRepeats == -1)) ||

2069 (minRepeats == 1 && maxRepeats == -1))

2070 length++;

2071 else

2072 length += 5;

2073 if (safelyCheckNextChar(ptr, patternEnd, '?'))

2074 ptr++;

2075 }

2076 }

2077 continue;

2078

2079 case '^': /* Single-byte metacharacters */

2080 case '.':

2081 case '$':

2082 length++;

2083 lastitemlength = 1;

2084 continue;

2085

2086 case '*': /* These repeats won't be after brackets; */

2087 case '+': /* those are handled separately */

2088 case '?':

2089 length++;

2090 goto POSSESSIVE;

2091

2092 /* This covers the cases of braced repeats after a single char, meta char,

2093 class, or back reference. */

2094

2095 case '{':

2096 if (!isCountedRepeat(ptr + 1, patternEnd))

2097 goto NORMAL_CHAR;

2098 ptr = readRepeatCounts(ptr + 1, &minRepeats, &maxRepeats, &error code);

2099 if (errorcode != 0)

2100 return -1;

2101

2102 /* These special cases just insert one extra opcode */

2103

2104 if ((minRepeats == 0 && (maxRepeats == 1 || maxRepeats == -1)) ||

2105 (minRepeats == 1 && maxRepeats == -1))

2106 length++;

2107

2108 /* These cases might insert additional copies of a preceding cha racter. */

2109

2110 else {

2111 if (minRepeats != 1) {

2112 length -= lastitemlength; /* Uncount the original char or metachar */

2113 if (minRepeats > 0)

2114 length += 3 + lastitemlength;

2115 }

2116 length += lastitemlength + ((maxRepeats > 0) ? 3 : 1);

2117 }

2118

2119 if (safelyCheckNextChar(ptr, patternEnd, '?'))

2120 ptr++; /* Needs no extra length */

2121

2122 POSSESSIVE: /* Test for possessive quantifier */

2123 if (safelyCheckNextChar(ptr, patternEnd, '+')) {

2124 ptr++;

2125 length += 2 + 2 * LINK_SIZE; /* Allow for atomic brackets */

2126 }

2127 continue;

2128

2129 /* An alternation contains an offset to the next branch or ket. If any ims

2130 options changed in the previous branch(es), and/or if we are in a

2131 lookbehind assertion, extra space will be needed at the start of the

2132 branch. This is handled by branch_extra. */

2133

2134 case '|':

2135 if (brastackptr == 0)

2136 cd.needOuterBracket = true;

2137 length += 1 + LINK_SIZE + branch_extra;

2138 continue;

2139

2140 /* A character class uses 33 characters provided that all the character

2141 values are less than 256. Otherwise, it uses a bit map for low valued

2142 characters, and individual items for others. Don't worry about character

2143 types that aren't allowed in classes - they'll get picked up during the

2144 compile. A character class that contains only one single-byte character

2145 uses 2 or 3 bytes, depending on whether it is negated or not. Notice this

2146 where we can. (In UTF-8 mode we can do this only for chars < 128.) */

2147

2148 case '[': {

2149 int class_optcount;

2150 if (*(++ptr) == '^') {

2151 class_optcount = 10; /* Greater than one */

2152 ptr++;

2153 }

2154 else

2155 class_optcount = 0;

2156

2157 bool class_utf8 = false;

2158

2159 for (; ptr < patternEnd && *ptr != ']'; ++ptr) {

2160 /* Check for escapes */

2161

2162 if (*ptr == '\\') {

2163 c = checkEscape(&ptr, patternEnd, &errorcode, cd.numCapt uringBrackets, true);

2164 if (errorcode != 0)

2165 return -1;

2166

2167 /* Handle escapes that turn into characters */

2168

2169 if (c >= 0)

2170 goto NON_SPECIAL_CHARACTER;

2171

2172 /* Escapes that are meta-things. The normal ones just af fect the

2173 bit map, but Unicode properties require an XCLASS exten ded item. */

2174

2175 else

2176 class_optcount = 10; /* \d, \s etc; make sur e > 1 */

2177 }

2178

2179 /* Anything else increments the possible optimization count. We have to

2180 detect ranges here so that we can compute the number of ext ra ranges for

2181 caseless wide characters when UCP support is available. If there are wide

2182 characters, we are going to have to use an XCLASS, even for single

2183 characters. */

2184

2185 else {

2186 c = *ptr;

2187

2188 /* Come here from handling \ above when it escapes to a char value */

2189

2190 NON_SPECIAL_CHARACTER:

2191 class_optcount++;

2192

2193 int d = -1;

2194 if (safelyCheckNextChar(ptr, patternEnd, '-')) {

2195 UChar const *hyptr = ptr++;

2196 if (safelyCheckNextChar(ptr, patternEnd, '\\')) {

2197 ptr++;

2198 d = checkEscape(&ptr, patternEnd, &errorcode, cd .numCapturingBrackets, true);

2199 if (errorcode != 0)

2200 return -1;

2201 }

2202 else if ((ptr + 1 < patternEnd) && ptr[1] != ']')

2203 d = *++ptr;

2204 if (d < 0)

2205 ptr = hyptr; /* go back to hyphen as data * /

2206 }

2207

2208 /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >

2209 127 for caseless matching, we will need to use an XCLAS S. */

2210

2211 if (d >= 0) {

2212 class_optcount = 10; /* Ensure > 1 */

2213 if (d < c) {

2214 errorcode = ERR8;

2215 return -1;

2216 }

2217

2218 if ((d > 255 || (ignoreCase && d > 127))) {

2219 unsigned char buffer[6];

2220 if (!class_utf8) /* Allow for XCLASS ove rhead */

2221 {

2222 class_utf8 = true;

2223 length += LINK_SIZE + 2;

2224 }

2225

2226 /* If we have UCP support, find out how many ext ra ranges are

2227 needed to map the other case of characters with in this range. We

2228 have to mimic the range optimization here, beca use extending the

2229 range upwards might push d over a boundary that makes it use

2230 another byte in the UTF-8 representation. */

2231

2232 if (ignoreCase) {

2233 int occ, ocd;

2234 int cc = c;

2235 int origd = d;

2236 while (getOthercaseRange(&cc, origd, &occ, & ocd)) {

2237 if (occ >= c && ocd <= d)

2238 continue; /* Skip embedded */

2239

2240 if (occ < c && ocd >= c - 1) /* Extend the basic range */

2241 { /* if there is overlap, */

2242 c = occ; /* noti ng that if occ < c */

2243 continue; /* we c an't have ocd > d */

2244 } /* because a subrange is */

2245 if (ocd > d && occ <= d + 1) /* always shorter than */

2246 { /* the basi c range. */

2247 d = ocd;

2248 continue;

2249 }

2250

2251 /* An extra item is needed */

2252

2253 length += 1 + encodeUTF8(occ, buffer) +

2254 ((occ == ocd) ? 0 : encodeUTF8(ocd, buff er));

2255 }

2256 }

2257

2258 /* The length of the (possibly extended) range * /

2259

2260 length += 1 + encodeUTF8(c, buffer) + encodeUTF8 (d, buffer);

2261 }

2262

2263 }

2264

2265 /* We have a single character. There is nothing to be do ne unless we

2266 are in UTF-8 mode. If the char is > 255, or 127 when ca seless, we must

2267 allow for an XCL_SINGLE item, doubled for caselessness if there is UCP

2268 support. */

2269

2270 else {

2271 if ((c > 255 || (ignoreCase && c > 127))) {

2272 unsigned char buffer[6];

2273 class_optcount = 10; /* Ensure > 1 */

2274 if (!class_utf8) /* Allow for XCLASS ove rhead */

2275 {

2276 class_utf8 = true;

2277 length += LINK_SIZE + 2;

2278 }

2279 length += (ignoreCase ? 2 : 1) * (1 + encodeUTF8 (c, buffer));

2280 }

2281 }

2282 }

2283 }

2284

2285 if (ptr >= patternEnd) { /* Missing terminating ']' */

2286 errorcode = ERR6;

2287 return -1;

2288 }

2289

2290 /* We can optimize when there was only one optimizable character .

2291 Note that this does not detect the case of a negated single cha racter.

2292 In that case we do an incorrect length computation, but it's no t a serious

2293 problem because the computed length is too large rather than to o small. */

2294

2295 if (class_optcount == 1)

2296 goto NORMAL_CHAR;

2297

2298 /* Here, we handle repeats for the class opcodes. */

2299 {

2300 length += 33;

2301

2302 /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,

2303 we also need extra for wrapping the whole thing in a sub-pa ttern. */

2304

2305 if (safelyCheckNextChar(ptr, patternEnd, '{') && isCountedRe peat(ptr + 2, patternEnd)) {

2306 ptr = readRepeatCounts(ptr + 2, &minRepeats, &maxRepeats , &errorcode);

2307 if (errorcode != 0)

2308 return -1;

2309 if ((minRepeats == 0 && (maxRepeats == 1 || maxRepeats == -1)) ||

2310 (minRepeats == 1 && maxRepeats == -1))

2311 length++;

2312 else

2313 length += 5;

2314 if (safelyCheckNextChar(ptr, patternEnd, '+')) {

2315 ptr++;

2316 length += 2 + 2 * LINK_SIZE;

2317 } else if (safelyCheckNextChar(ptr, patternEnd, '?'))

2318 ptr++;

2319 }

2320 }

2321 continue;

2322 }

2323

2324 /* Brackets may be genuine groups or special things */

2325

2326 case '(': {

2327 int branch_newextra = 0;

2328 int bracket_length = 1 + LINK_SIZE;

2329 bool capturing = false;

2330

2331 /* Handle special forms of bracket, which all start (? */

2332

2333 if (safelyCheckNextChar(ptr, patternEnd, '?')) {

2334 switch (c = (ptr + 2 < patternEnd ? ptr[2] : 0)) {

2335 /* Non-referencing groups and lookaheads just move the p ointer on, and

2336 then behave like a non-special bracket, except that the y don't increment

2337 the count of extracting brackets. Ditto for the "once o nly" bracket,

2338 which is in Perl from version 5.005. */

2339

2340 case ':':

2341 case '=':

2342 case '!':

2343 ptr += 2;

2344 break;

2345

2346 /* Else loop checking valid options until ) is met. Anyt hing else is an

2347 error. If we are without any brackets, i.e. at top leve l, the settings

2348 act as if specified in the options, so massage the opti ons immediately.

2349 This is for backward compatibility with Perl 5.004. */

2350

2351 default:

2352 errorcode = ERR12;

2353 return -1;

2354 }

2355 } else

2356 capturing = 1;

2357

2358 /* Capturing brackets must be counted so we can process escapes in a

2359 Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are goi ng to need

2360 an additional 3 bytes of memory per capturing bracket. */

2361

2362 if (capturing) {

2363 bracount++;

2364 if (bracount > EXTRACT_BASIC_MAX)

2365 bracket_length += 3;

2366 }

2367

2368 /* Save length for computing whole length at end if there's a re peat that

2369 requires duplication of the group. Also save the current value of

2370 branch_extra, and start the new group with the new value. If no n-zero, this

2371 will either be 2 for a (?imsx: group, or 3 for a lookbehind ass ertion. */

2372

2373 if (brastackptr >= sizeof(brastack)/sizeof(int)) {

2374 errorcode = ERR17;

2375 return -1;

2376 }

2377

2378 bralenstack[brastackptr] = branch_extra;

2379 branch_extra = branch_newextra;

2380

2381 brastack[brastackptr++] = length;

2382 length += bracket_length;

2383 continue;

2384 }

2385

2386 /* Handle ket. Look for subsequent maxRepeats/minRepeats; for certain sets of values we

2387 have to replicate this bracket up to that many times. If brastackptr is

2388 0 this is an unmatched bracket which will generate an error, but take care

2389 not to try to access brastack[-1] when computing the length and restoring

2390 the branch_extra value. */

2391

2392 case ')': {

2393 int duplength;

2394 length += 1 + LINK_SIZE;

2395 if (brastackptr > 0) {

2396 duplength = length - brastack[--brastackptr];

2397 branch_extra = bralenstack[brastackptr];

2398 }

2399 else

2400 duplength = 0;

2401

2402 /* Leave ptr at the final char; for readRepeatCounts this happen s

2403 automatically; for the others we need an increment. */

2404

2405 if ((ptr + 1 < patternEnd) && (c = ptr[1]) == '{' && isCountedRe peat(ptr + 2, patternEnd)) {

2406 ptr = readRepeatCounts(ptr + 2, &minRepeats, &maxRepeats, &e rrorcode);

2407 if (errorcode)

2408 return -1;

2409 } else if (c == '*') {

2410 minRepeats = 0;

2411 maxRepeats = -1;

2412 ptr++;

2413 } else if (c == '+') {

2414 minRepeats = 1;

2415 maxRepeats = -1;

2416 ptr++;

2417 } else if (c == '?') {

2418 minRepeats = 0;

2419 maxRepeats = 1;

2420 ptr++;

2421 } else {

2422 minRepeats = 1;

2423 maxRepeats = 1;

2424 }

2425

2426 /* If the minimum is zero, we have to allow for an OP_BRAZERO be fore the

2427 group, and if the maximum is greater than zero, we have to repl icate

2428 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting

2429 bracket set. */

2430

2431 int repeatsLength;

2432 if (minRepeats == 0) {

2433 length++;

2434 if (maxRepeats > 0) {

2435 repeatsLength = multiplyWithOverflowCheck(maxRepeats - 1 , duplength + 3 + 2 * LINK_SIZE);

2436 if (repeatsLength < 0) {

2437 errorcode = ERR16;

2438 return -1;

2439 }

2440 length += repeatsLength;

2441 if (length > MAX_PATTERN_SIZE) {

2442 errorcode = ERR16;

2443 return -1;

2444 }

2445 }

2446 }

2447

2448 /* When the minimum is greater than zero, we have to replicate u p to

2449 minval-1 times, with no additions required in the copies. Then, if there

2450 is a limited maximum we have to replicate up to maxval-1 times allowing

2451 for a BRAZERO item before each optional copy and nesting bracke ts for all

2452 but one of the optional copies. */

2453

2454 else {

2455 repeatsLength = multiplyWithOverflowCheck(minRepeats - 1, du plength);

2456 if (repeatsLength < 0) {

2457 errorcode = ERR16;

2458 return -1;

2459 }

2460 length += repeatsLength;

2461 if (maxRepeats > minRepeats) { /* Need this test as maxRepea ts=-1 means no limit */

2462 repeatsLength = multiplyWithOverflowCheck(maxRepeats - m inRepeats, duplength + 3 + 2 * LINK_SIZE);

2463 if (repeatsLength < 0) {

2464 errorcode = ERR16;

2465 return -1;

2466 }

2467 length += repeatsLength - (2 + 2 * LINK_SIZE);

2468 }

2469 if (length > MAX_PATTERN_SIZE) {

2470 errorcode = ERR16;

2471 return -1;

2472 }

2473 }

2474

2475 /* Allow space for once brackets for "possessive quantifier" */

2476

2477 if (safelyCheckNextChar(ptr, patternEnd, '+')) {

2478 ptr++;

2479 length += 2 + 2 * LINK_SIZE;

2480 }

2481 continue;

2482 }

2483

2484 /* Non-special character. It won't be space or # in extended mode, s o it is

2485 always a genuine character. If we are in a \Q...\E sequence, check for the

2486 end; if not, we have a literal. */

2487

2488 default:

2489 NORMAL_CHAR:

2490 length += 2; /* For a one-byte character */

2491 lastitemlength = 1; /* Default length of last item for repeats */

2492

2493 if (c > 127) {

2494 int i;

2495 for (i = 0; i < kjs_pcre_utf8_table1_size; i++)

2496 if (c <= kjs_pcre_utf8_table1[i])

2497 break;

2498 length += i;

2499 lastitemlength += i;

2500 }

2501

2502 continue;

2503 }

2504 }

2505

2506 length += 2 + LINK_SIZE; /* For final KET and END */

2507

2508 cd.numCapturingBrackets = bracount;

2509 return length;

2510 }

2511

2512 /*************************************************

2513 * Compile a Regular Expression *

2514 *************************************************/

2515

2516 /* This function takes a string and returns a pointer to a block of store

2517 holding a compiled version of the expression. The original API for this

2518 function had no error code return variable; it is retained for backwards

2519 compatibility. The new function is given a new name.

2520

2521 Arguments:

2522 pattern the regular expression

2523 options various option bits

2524 errorcodeptr pointer to error code variable (pcre_compile2() only)

2525 can be NULL if you don't want a code value

2526 errorptr pointer to pointer to error text

2527 erroroffset ptr offset in pattern where error was detected

2528 tables pointer to character tables or NULL

2529

2530 Returns: pointer to compiled data block, or NULL on error,

2531 with errorptr and erroroffset set

2532 */

2533

2534 static inline JSRegExp* returnError(ErrorCode errorcode, const char** errorptr)

2535 {

2536 *errorptr = errorText(errorcode);

2537 return 0;

2538 }

2539

2540 JSRegExp* jsRegExpCompile(const UChar* pattern, int patternLength,

2541 JSRegExpIgnoreCaseOption ignoreCase, JSRegExpMultilineOption mul tiline,

2542 unsigned* numSubpatterns, const char** errorptr,

2543 malloc_t* allocate_function, free_t* free_function)

2544 {

2545 /* We can't pass back an error message if errorptr is NULL; I guess the best we

2546 can do is just return NULL, but we can set a code value if there is a code pointer. */

2547 if (!errorptr)

2548 return 0;

2549 *errorptr = NULL;

2550

2551 CompileData cd;

2552

2553 ErrorCode errorcode = ERR0;

2554 /* Call this once just to count the brackets. */

2555 calculateCompiledPatternLength(pattern, patternLength, ignoreCase, cd, error code);

2556 /* Call it again to compute the length. */

2557 int length = calculateCompiledPatternLength(pattern, patternLength, ignoreCa se, cd, errorcode);

2558 if (errorcode)

2559 return returnError(errorcode, errorptr);

2560

2561 if (length > MAX_PATTERN_SIZE)

2562 return returnError(ERR16, errorptr);

2563

2564 size_t size = length + sizeof(JSRegExp);

2565 JSRegExp* re = reinterpret_cast<JSRegExp*>((*allocate_function)(size));

2566

2567 if (!re)

2568 return returnError(ERR13, errorptr);

2569

2570 re->options = (ignoreCase ? IgnoreCaseOption : 0) | (multiline ? MatchAcrossMultipleLinesOption : 0);

2571

2572 /* The starting points of the name/number translation table and of the code are

2573 passed around in the compile data block. */

2574

2575 unsigned char* codeStart = reinterpret_cast<unsigned char*>(re + 1);

2576

2577 /* Set up a starting, non-extracting bracket, then compile the expression. On

2578 error, errorcode will be set non-zero, so we don't need to look at the result

2579 of the function here. */

2580

2581 const UChar* ptr = reinterpret_cast<const UChar*>(pattern);

2582 const UChar* patternEnd = pattern + patternLength;

2583 unsigned char* code = reinterpret_cast<unsigned char*>(codeStart);

2584 int firstbyte, reqbyte;

2585 int bracketCount = 0;

2586 if (!cd.needOuterBracket)

2587 compileBranch(re->options, &bracketCount, &code, &ptr, patternEnd, &erro rcode, &firstbyte, &reqbyte, cd);

2588 else {

2589 *code = OP_BRA;

2590 compileBracket(re->options, &bracketCount, &code, &ptr, patternEnd, &err orcode, 0, &firstbyte, &reqbyte, cd);

2591 }

2592 re->top_bracket = bracketCount;

2593 re->top_backref = cd.top_backref;

2594

2595 /* If not reached end of pattern on success, there's an excess bracket. */

2596

2597 if (errorcode == 0 && ptr < patternEnd)

2598 errorcode = ERR10;

2599

2600 /* Fill in the terminating state and check for disastrous overflow, but

2601 if debugging, leave the test till after things are printed out. */

2602

2603 *code++ = OP_END;

2604

2605 ASSERT(code - codeStart <= length);

2606 if (code - codeStart > length)

2607 errorcode = ERR7;

2608

2609 /* Give an error if there's back reference to a non-existent capturing

2610 subpattern. */

2611

2612 if (re->top_backref > re->top_bracket)

2613 errorcode = ERR15;

2614

2615 /* Failed to compile, or error while post-processing */

2616

2617 if (errorcode != ERR0) {

2618 (*free_function)(reinterpret_cast<void*>(re));

2619 return returnError(errorcode, errorptr);

2620 }

2621

2622 /* If the anchored option was not passed, set the flag if we can determine that

2623 the pattern is anchored by virtue of ^ characters or \A or anything else (such

2624 as starting with .* when DOTALL is set).

2625

2626 Otherwise, if we know what the first character has to be, save it, because that

2627 speeds up unanchored matches no end. If not, see if we can set the

2628 UseMultiLineFirstByteOptimizationOption flag. This is helpful for multiline matches when all branches

2629 start with ^. and also when all branches start with .* for non-DOTALL match es.

2630 */

2631

2632 if (cd.needOuterBracket ? bracketIsAnchored(codeStart) : branchIsAnchored(co deStart))

2633 re->options |= IsAnchoredOption;

2634 else {

2635 if (firstbyte < 0) {

2636 firstbyte = (cd.needOuterBracket

2637 ? bracketFindFirstAssertedCharacter(codeStart, false)

2638 : branchFindFirstAssertedCharacter(codeStart, false))

2639 | ((re->options & IgnoreCaseOption) ? REQ_IGNORE_CASE : 0);

2640 }

2641 if (firstbyte >= 0) {

2642 int ch = firstbyte & 255;

2643 if (ch < 127) {

2644 re->first_byte = ((firstbyte & REQ_IGNORE_CASE) && flipCase(ch) == ch) ? ch : firstbyte;

2645 re->options |= UseFirstByteOptimizationOption;

2646 }

2647 } else {

2648 if (cd.needOuterBracket ? bracketNeedsLineStart(codeStart, 0, cd.bac krefMap) : branchNeedsLineStart(codeStart, 0, cd.backrefMap))

2649 re->options |= UseMultiLineFirstByteOptimizationOption;

2650 }

2651 }

2652

2653 /* For an anchored pattern, we use the "required byte" only if it follows a

2654 variable length item in the regex. Remove the caseless flag for non-caseabl e

2655 bytes. */

2656

2657 if (reqbyte >= 0 && (!(re->options & IsAnchoredOption) || (reqbyte & REQ_VARY))) {

2658 int ch = reqbyte & 255;

2659 if (ch < 127) {

2660 re->req_byte = ((reqbyte & REQ_IGNORE_CASE) && flipCase(ch) == ch) ? (reqbyte & ~REQ_IGNORE_CASE) : reqbyte;

2661 re->options |= UseRequiredByteOptimizationOption;

2662 }

2663 }

2664

2665 if (numSubpatterns)

2666 *numSubpatterns = re->top_bracket;

2667 return re;

2668 }

2669

2670 void jsRegExpFree(JSRegExp* re, free_t* free_function)

2671 {

2672 (*free_function)(reinterpret_cast<void*>(re));

2673 }

2674

2675 } } // namespace dart::jscre

OLD	NEW

« no previous file with comments | « runtime/third_party/jscre/pcre_chartables.c ('k') | runtime/third_party/jscre/pcre_exec.cpp » ('j') | runtime/vm/object.h » ('J')