third_party/re2/ucs2.diff - Issue 1544433002: Replace RE2 import with a dependency

Side by Side Diff: third_party/re2/ucs2.diff

Issue 1544433002: Replace RE2 import with a dependency (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Re-Added LICENSE and OWNERS file Created 4 years, 12 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 This is a dump from Google's source control system of the change

2 that removed UCS-2 support from RE2. As the explanation below

3 says, UCS-2 mode is fundamentally at odds with things like ^ and $,

4 so it never really worked very well. But if you are interested in using

5 it without those operators, it did work for that. It assumed that the

6 UCS-2 data was in the native host byte order.

7

8 If you are interested in adding UCS-2 mode back, this patch might

9 be a good starting point.

10

11

12 Change 12780686 by rsc@rsc-re2 on 2009/09/16 15:30:15

13

14 Retire UCS-2 mode.

15

16 I added it as an experiment for V8, but it

17 requires 2-byte lookahead to do completely,

18 and RE2 has 1-byte lookahead (enough for UTF-8)

19 as a fairly deep fundamental assumption,

20 so it did not support ^ or $.

21

22 ==== re2/bitstate.cc#2 - re2/bitstate.cc#3 ====

23 re2/bitstate.cc#2:314,321 - re2/bitstate.cc#3:314,319

24 cap_[0] = p;

25 if (TrySearch(prog_->start(), p)) // Match must be leftmost; done.

26 return true;

27 - if (prog_->flags() & Regexp::UCS2)

28 - p++;

29 }

30 return false;

31 }

32 ==== re2/compile.cc#17 - re2/compile.cc#18 ====

33 re2/compile.cc#17:95,101 - re2/compile.cc#18:95,100

34 // Input encodings.

35 enum Encoding {

36 kEncodingUTF8 = 1, // UTF-8 (0-10FFFF)

37 - kEncodingUCS2, // UCS-2 (0-FFFF), native byte order

38 kEncodingLatin1, // Latin1 (0-FF)

39 };

40

41 re2/compile.cc#17:168,176 - re2/compile.cc#18:167,172

42 void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase);

43 void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase);

44 void Add_80_10ffff();

45 - void AddRuneRangeUCS2(Rune lo, Rune hi, bool foldcase);

46 - void AddUCS2Pair(uint8 lo1, uint8 hi1, bool fold1,

47 - uint8 lo2, uint8 hi2, bool fold2);

48

49 // New suffix that matches the byte range lo-hi, then goes to next.

50 Inst* RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, Inst* next);

51 re2/compile.cc#17:475,481 - re2/compile.cc#18:471,477

52

53 // Converts rune range lo-hi into a fragment that recognizes

54 // the bytes that would make up those runes in the current

55 - // encoding (Latin 1, UTF-8, or UCS-2).

56 + // encoding (Latin 1 or UTF-8).

57 // This lets the machine work byte-by-byte even when

58 // using multibyte encodings.

59

60 re2/compile.cc#17:488,496 - re2/compile.cc#18:484,489

61 case kEncodingLatin1:

62 AddRuneRangeLatin1(lo, hi, foldcase);

63 break;

64 - case kEncodingUCS2:

65 - AddRuneRangeUCS2(lo, hi, foldcase);

66 - break;

67 }

68 }

69

70 re2/compile.cc#17:503,581 - re2/compile.cc#18:496,501

71 AddSuffix(RuneByteSuffix(lo, hi, foldcase, NULL));

72 }

73

74 - // Test whether 16-bit values are big or little endian.

75 - static bool BigEndian() {

76 - union {

77 - char byte[2];

78 - int16 endian;

79 - } u;

80 -

81 - u.byte[0] = 1;

82 - u.byte[1] = 2;

83 - return u.endian == 0x0102;

84 - }

85 -

86 - void Compiler::AddUCS2Pair(uint8 lo1, uint8 hi1, bool fold1,

87 - uint8 lo2, uint8 hi2, bool fold2) {

88 - Inst* ip;

89 - if (reversed_) {

90 - ip = RuneByteSuffix(lo1, hi1, fold1, NULL);

91 - ip = RuneByteSuffix(lo2, hi2, fold2, ip);

92 - } else {

93 - ip = RuneByteSuffix(lo2, hi2, fold2, NULL);

94 - ip = RuneByteSuffix(lo1, hi1, fold1, ip);

95 - }

96 - AddSuffix(ip);

97 - }

98 -

99 - void Compiler::AddRuneRangeUCS2(Rune lo, Rune hi, bool foldcase) {

100 - if (lo > hi || lo > 0xFFFF)

101 - return;

102 - if (hi > 0xFFFF)

103 - hi = 0xFFFF;

104 -

105 - // We'll assemble a pattern assuming big endian.

106 - // If the machine isn't, tell Cat to reverse its arguments.

107 - bool oldreversed = reversed_;

108 - if (!BigEndian()) {

109 - reversed_ = !oldreversed;

110 - }

111 -

112 - // Split into bytes.

113 - int lo1 = lo >> 8;

114 - int lo2 = lo & 0xFF;

115 - int hi1 = hi >> 8;

116 - int hi2 = hi & 0xFF;

117 -

118 - if (lo1 == hi1) {

119 - // Easy case: high bits are same in both.

120 - // Only do ASCII case folding on the second byte if the top byte is 00.

121 - AddUCS2Pair(lo1, lo1, false, lo2, hi2, lo1==0 && foldcase);

122 - } else {

123 - // Harder case: different second byte ranges depending on first byte.

124 -

125 - // Initial fragment.

126 - if (lo2 > 0) {

127 - AddUCS2Pair(lo1, lo1, false, lo2, 0xFF, lo1==0 && foldcase);

128 - lo1++;

129 - }

130 -

131 - // Trailing fragment.

132 - if (hi2 < 0xFF) {

133 - AddUCS2Pair(hi1, hi1, false, 0, hi2, false);

134 - hi1--;

135 - }

136 -

137 - // Inner ranges.

138 - if (lo1 <= hi1) {

139 - AddUCS2Pair(lo1, hi1, false, 0, 0xFF, false);

140 - }

141 - }

142 -

143 - // Restore reverse setting.

144 - reversed_ = oldreversed;

145 - }

146 -

147 // Table describing how to make a UTF-8 matching machine

148 // for the rune range 80-10FFFF (Runeself-Runemax).

149 // This range happens frequently enough (for example /./ and /[^a-z]/)

150 re2/compile.cc#17:707,716 - re2/compile.cc#18:627,634

151

152 Frag Compiler::Literal(Rune r, bool foldcase) {

153 switch (encoding_) {

154 - default: // UCS-2 or something new

155 - BeginRange();

156 - AddRuneRange(r, r, foldcase);

157 - return EndRange();

158 + default:

159 + return kNullFrag;

160

161 case kEncodingLatin1:

162 return ByteRange(r, r, foldcase);

163 re2/compile.cc#17:927,934 - re2/compile.cc#18:845,850

164

165 if (re->parse_flags() & Regexp::Latin1)

166 c.encoding_ = kEncodingLatin1;

167 - else if (re->parse_flags() & Regexp::UCS2)

168 - c.encoding_ = kEncodingUCS2;

169 c.reversed_ = reversed;

170 if (max_mem <= 0) {

171 c.max_inst_ = 100000; // more than enough

172 re2/compile.cc#17:983,993 - re2/compile.cc#18:899,905

173 c.prog_->set_start_unanchored(c.prog_->start());

174 } else {

175 Frag dot;

176 - if (c.encoding_ == kEncodingUCS2) {

177 - dot = c.Cat(c.ByteRange(0x00, 0xFF, false), c.ByteRange(0x00, 0xFF, false));

178 - } else {

179 - dot = c.ByteRange(0x00, 0xFF, false);

180 - }

181 + dot = c.ByteRange(0x00, 0xFF, false);

182 Frag dotloop = c.Star(dot, true);

183 Frag unanchored = c.Cat(dotloop, all);

184 c.prog_->set_start_unanchored(unanchored.begin);

185 ==== re2/nfa.cc#8 - re2/nfa.cc#9 ====

186 re2/nfa.cc#8:426,432 - re2/nfa.cc#9:426,431

187 const char* bp = context.begin();

188 int c = -1;

189 int wasword = 0;

190 - bool ucs2 = prog_->flags() & Regexp::UCS2;

191

192 if (text.begin() > context.begin()) {

193 c = text.begin()[-1] & 0xFF;

194 re2/nfa.cc#8:492,498 - re2/nfa.cc#9:491,497

195 // If there's a required first byte for an unanchored search

196 // and we're not in the middle of any possible matches,

197 // use memchr to search for the byte quickly.

198 - if (!ucs2 && !anchored && first_byte_ >= 0 && runq->size() == 0 &&

199 + if (!anchored && first_byte_ >= 0 && runq->size() == 0 &&

200 p < text.end() && (p[0] & 0xFF) != first_byte_) {

201 p = reinterpret_cast<const char*>(memchr(p, first_byte_,

202 text.end() - p));

203 re2/nfa.cc#8:505,526 - re2/nfa.cc#9:504,514

204 flag = Prog::EmptyFlags(context, p);

205 }

206

207 - // In UCS-2 mode, if we need to start a new thread,

208 - // make sure to do it on an even boundary.

209 - if(ucs2 && runq->size() == 0 &&

210 - (p - context.begin()) % 2 && p < text.end()) {

211 - p++;

212 - flag = Prog::EmptyFlags(context, p);

213 - }

214 -

215 // Steal match storage (cleared but unused as of yet)

216 // temporarily to hold match boundaries for new thread.

217 - // In UCS-2 mode, only start the thread on a 2-byte boundary.

218 - if(!ucs2 || (p - context.begin()) % 2 == 0) {

219 - match_[0] = p;

220 - AddToThreadq(runq, start_, flag, p, match_);

221 - match_[0] = NULL;

222 - }

223 + match_[0] = p;

224 + AddToThreadq(runq, start_, flag, p, match_);

225 + match_[0] = NULL;

226 }

227

228 // If all the threads have died, stop early.

229 ==== re2/parse.cc#22 - re2/parse.cc#23 ====

230 re2/parse.cc#22:160,167 - re2/parse.cc#23:160,165

231 status_(status), stacktop_(NULL), ncap_(0) {

232 if (flags_ & Latin1)

233 rune_max_ = 0xFF;

234 - else if (flags & UCS2)

235 - rune_max_ = 0xFFFF;

236 else

237 rune_max_ = Runemax;

238 }

239 re2/parse.cc#22:365,387 - re2/parse.cc#23:363,374

240 bool Regexp::ParseState::PushCarat() {

241 if (flags_ & OneLine) {

242 return PushSimpleOp(kRegexpBeginText);

243 - } else {

244 - if (flags_ & UCS2) {

245 - status_->set_code(kRegexpUnsupported);

246 - status_->set_error_arg("multiline ^ in UCS-2 mode");

247 - return false;

248 - }

249 - return PushSimpleOp(kRegexpBeginLine);

250 }

251 + return PushSimpleOp(kRegexpBeginLine);

252 }

253

254 // Pushes a \b or \B onto the stack.

255 bool Regexp::ParseState::PushWordBoundary(bool word) {

256 - if (flags_ & UCS2) {

257 - status_->set_code(kRegexpUnsupported);

258 - status_->set_error_arg("\\b or \\B in UCS-2 mode");

259 - return false;

260 - }

261 if (word)

262 return PushSimpleOp(kRegexpWordBoundary);

263 return PushSimpleOp(kRegexpNoWordBoundary);

264 re2/parse.cc#22:397,407 - re2/parse.cc#23:384,389

265 bool ret = PushSimpleOp(kRegexpEndText);

266 flags_ = oflags;

267 return ret;

268 - }

269 - if (flags_ & UCS2) {

270 - status_->set_code(kRegexpUnsupported);

271 - status_->set_error_arg("multiline $ in UCS-2 mode");

272 - return false;

273 }

274 return PushSimpleOp(kRegexpEndLine);

275 }

276 ==== re2/re2.cc#34 - re2/re2.cc#35 ====

277 re2/re2.cc#34:79,86 - re2/re2.cc#35:79,84

278 return RE2::ErrorBadUTF8;

279 case re2::kRegexpBadNamedCapture:

280 return RE2::ErrorBadNamedCapture;

281 - case re2::kRegexpUnsupported:

282 - return RE2::ErrorUnsupported;

283 }

284 return RE2::ErrorInternal;

285 }

286 re2/re2.cc#34:122,130 - re2/re2.cc#35:120,125

287 break;

288 case RE2::Options::EncodingLatin1:

289 flags |= Regexp::Latin1;

290 - break;

291 - case RE2::Options::EncodingUCS2:

292 - flags |= Regexp::UCS2;

293 break;

294 }

295

296 ==== re2/re2.h#36 - re2/re2.h#37 ====

297 re2/re2.h#36:246,252 - re2/re2.h#37:246,251

298 ErrorBadUTF8, // invalid UTF-8 in regexp

299 ErrorBadNamedCapture, // bad named capture group

300 ErrorPatternTooLarge, // pattern too large (compile failed)

301 - ErrorUnsupported, // unsupported feature (in UCS-2 mode)

302 };

303

304 // Predefined common options.

305 re2/re2.h#36:570,576 - re2/re2.h#37:569,574

306

307 enum Encoding {

308 EncodingUTF8 = 1,

309 - EncodingUCS2, // 16-bit Unicode 0-FFFF only

310 EncodingLatin1

311 };

312

313 ==== re2/regexp.cc#15 - re2/regexp.cc#16 ====

314 re2/regexp.cc#15:324,333 - re2/regexp.cc#16:324,329

315 // the regexp that remains after the prefix. The prefix might

316 // be ASCII case-insensitive.

317 bool Regexp::RequiredPrefix(string prefix, bool foldcase, Regexp** suffix) {

318 - // Don't even bother for UCS-2; it's time to throw that code away.

319 - if (parse_flags_ & UCS2)

320 - return false;

321 -

322 // No need for a walker: the regexp must be of the form

323 // 1. some number of ^ anchors

324 // 2. a literal char or string

325 ==== re2/regexp.h#20 - re2/regexp.h#21 ====

326 re2/regexp.h#20:187,193 - re2/regexp.h#21:187,192

327 kRegexpBadPerlOp, // bad perl operator

328 kRegexpBadUTF8, // invalid UTF-8 in regexp

329 kRegexpBadNamedCapture, // bad named capture

330 - kRegexpUnsupported, // unsupported operator

331 };

332

333 // Error status for certain operations.

334 re2/regexp.h#20:307,316 - re2/regexp.h#21:306,314

335 // \Q and \E to disable/enable metacharacters

336 // (?P<name>expr) for named captures

337 // \C to match any single byte

338 - UCS2 = 1<<10, // Text is in UCS-2, regexp is in UTF-8.

339 - UnicodeGroups = 1<<11, // Allow \p{Han} for Unicode Han group

340 + UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group

341 // and \P{Han} for its negation.

342 - NeverNL = 1<<12, // Never match NL, even if the regexp mentions

343 + NeverNL = 1<<11, // Never match NL, even if the regexp mentions

344 // it explicitly.

345

346 // As close to Perl as we can get.

347 ==== re2/testing/backtrack.cc#4 - re2/testing/backtrack.cc#5 ====

348 re2/testing/backtrack.cc#4:134,141 - re2/testing/backtrack.cc#5:134,139

349 cap_[0] = p;

350 if (Visit(prog_->start(), p)) // Match must be leftmost; done.

351 return true;

352 - if (prog_->flags() & Regexp::UCS2)

353 - p++;

354 }

355 return false;

356 }

357 ==== re2/testing/tester.cc#12 - re2/testing/tester.cc#13 ====

358 re2/testing/tester.cc#12:144,154 - re2/testing/tester.cc#13:144,152

359 static ParseMode parse_modes[] = {

360 { single_line, "single-line" },

361 { single_line|Regexp::Latin1, "single-line, latin1" },

362 - { single_line|Regexp::UCS2, "single-line, ucs2" },

363 { multi_line, "multiline" },

364 { multi_line|Regexp::NonGreedy, "multiline, nongreedy" },

365 { multi_line|Regexp::Latin1, "multiline, latin1" },

366 - { multi_line|Regexp::UCS2, "multiline, ucs2" },

367 };

368

369 static string FormatMode(Regexp::ParseFlags flags) {

370 re2/testing/tester.cc#12:179,189 - re2/testing/tester.cc#13:177,185

371 RegexpStatus status;

372 regexp_ = Regexp::Parse(regexp_str, flags, &status);

373 if (regexp_ == NULL) {

374 - if (status.code() != kRegexpUnsupported) {

375 - LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)

376 - << " mode: " << FormatMode(flags);

377 - error_ = true;

378 - }

379 + LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)

380 + << " mode: " << FormatMode(flags);

381 + error_ = true;

382 return;

383 }

384 prog_ = regexp_->CompileToProg(0);

385 re2/testing/tester.cc#12:230,237 - re2/testing/tester.cc#13:226,231

386 RE2::Options options;

387 if (flags & Regexp::Latin1)

388 options.set_encoding(RE2::Options::EncodingLatin1);

389 - else if (flags & Regexp::UCS2)

390 - options.set_encoding(RE2::Options::EncodingUCS2);

391 if (kind_ == Prog::kLongestMatch)

392 options.set_longest_match(true);

393 re2_ = new RE2(re, options);

394 re2/testing/tester.cc#12:281,379 - re2/testing/tester.cc#13:275,280

395 delete re2_;

396 }

397

398 - // Converts UTF-8 string in text into UCS-2 string in new_text.

399 - static bool ConvertUTF8ToUCS2(const StringPiece& text, StringPiece* new_text) {

400 - const char* p = text.begin();

401 - const char* ep = text.end();

402 - uint16* q = new uint16[ep - p];

403 - uint16* q0 = q;

404 -

405 - int n;

406 - Rune r;

407 - for (; p < ep; p += n) {

408 - if (!fullrune(p, ep - p)) {

409 - delete[] q0;

410 - return false;

411 - }

412 - n = chartorune(&r, p);

413 - if (r > 0xFFFF) {

414 - delete[] q0;

415 - return false;

416 - }

417 - *q++ = r;

418 - }

419 - *new_text = StringPiece(reinterpret_cast<char*>(q0), 2*(q - q0));

420 - return true;

421 - }

422 -

423 - // Rewrites *sp from being a pointer into text8 (UTF-8)

424 - // to being a pointer into text16 (equivalent text but in UCS-2).

425 - static void AdjustUTF8ToUCS2(const StringPiece& text8, const StringPiece& text16,

426 - StringPiece *sp) {

427 - if (sp->begin() == NULL && text8.begin() != NULL)

428 - return;

429 -

430 - int nrune = 0;

431 - int n;

432 - Rune r;

433 - const char* p = text8.begin();

434 - const char* ep = text8.end();

435 - const char* spbegin = NULL;

436 - const char* spend = NULL;

437 - for (;;) {

438 - if (p == sp->begin())

439 - spbegin = text16.begin() + sizeof(uint16)*nrune;

440 - if (p == sp->end())

441 - spend = text16.begin() + sizeof(uint16)*nrune;

442 - if (p >= ep)

443 - break;

444 - n = chartorune(&r, p);

445 - p += n;

446 - nrune++;

447 - }

448 - if (spbegin == NULL || spend == NULL) {

449 - LOG(FATAL) << "Error in AdjustUTF8ToUCS2 "

450 - << CEscape(text8) << " "

451 - << (int)(sp->begin() - text8.begin()) << " "

452 - << (int)(sp->end() - text8.begin());

453 - }

454 - *sp = StringPiece(spbegin, spend - spbegin);

455 - }

456 -

457 - // Rewrites *sp from begin a pointer into text16 (UCS-2)

458 - // to being a pointer into text8 (equivalent text but in UTF-8).

459 - static void AdjustUCS2ToUTF8(const StringPiece& text16, const StringPiece& text8,

460 - StringPiece* sp) {

461 - if (sp->begin() == NULL)

462 - return;

463 -

464 - int nrune = 0;

465 - int n;

466 - Rune r;

467 - const char* p = text8.begin();

468 - const char* ep = text8.end();

469 - const char* spbegin = NULL;

470 - const char* spend = NULL;

471 - for (;;) {

472 - if (nrune == (sp->begin() - text16.begin())/2)

473 - spbegin = p;

474 - if (nrune == (sp->end() - text16.begin())/2)

475 - spend = p;

476 - if (p >= ep)

477 - break;

478 - n = chartorune(&r, p);

479 - p += n;

480 - nrune++;

481 - }

482 - if (text8.begin() != NULL && (spbegin == NULL || spend == NULL)) {

483 - LOG(FATAL) << "Error in AdjustUCS2ToUTF8 "

484 - << CEscape(text16) << " "

485 - << (int)(sp->begin() - text16.begin()) << " "

486 - << (int)(sp->end() - text16.begin());

487 - }

488 - *sp = StringPiece(spbegin, spend - spbegin);

489 - }

490 -

491 // Runs a single search using the named engine type.

492 // This interface hides all the irregularities of the various

493 // engine interfaces from the rest of this file.

494 re2/testing/tester.cc#12:393,411 - re2/testing/tester.cc#13:294,300

495

496 StringPiece text = orig_text;

497 StringPiece context = orig_context;

498 - bool ucs2 = false;

499

500 - if ((flags() & Regexp::UCS2) && type != kEnginePCRE) {

501 - if (!ConvertUTF8ToUCS2(orig_context, &context)) {

502 - result->skipped = true;

503 - return;

504 - }

505 -

506 - // Rewrite context to refer to new text.

507 - AdjustUTF8ToUCS2(orig_context, context, &text);

508 - ucs2 = true;

509 - }

510 -

511 switch (type) {

512 default:

513 LOG(FATAL) << "Bad RunSearch type: " << (int)type;

514 re2/testing/tester.cc#12:557,577 - re2/testing/tester.cc#13:446,451

515 }

516 }

517

518 - // If we did UCS-2 matching, rewrite the matches to refer

519 - // to the original UTF-8 text.

520 - if (ucs2) {

521 - if (result->matched) {

522 - if (result->have_submatch0) {

523 - AdjustUCS2ToUTF8(context, orig_context, &result->submatch[0]);

524 - } else if (result->have_submatch) {

525 - for (int i = 0; i < nsubmatch; i++) {

526 - AdjustUCS2ToUTF8(context, orig_context, &result->submatch[i]);

527 - }

528 - }

529 - }

530 - delete[] context.begin();

531 - }

532 -

533 if (!result->matched)

534 memset(result->submatch, 0, sizeof result->submatch);

535 }

536 re2/testing/tester.cc#12:596,617 - re2/testing/tester.cc#13:470,475

537 return true;

538 }

539

540 - // Check whether text uses only Unicode points <= 0xFFFF

541 - // (in the BMP).

542 - static bool IsBMP(const StringPiece& text) {

543 - const char* p = text.begin();

544 - const char* ep = text.end();

545 - while (p < ep) {

546 - if (!fullrune(p, ep - p))

547 - return false;

548 - Rune r;

549 - p += chartorune(&r, p);

550 - if (r > 0xFFFF)

551 - return false;

552 - }

553 - return true;

554 - }

555 -

556 // Runs a single test.

557 bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,

558 Prog::Anchor anchor) {

559 re2/testing/tester.cc#12:619,625 - re2/testing/tester.cc#13:477,483

560 Result correct;

561 RunSearch(kEngineBacktrack, text, context, anchor, &correct);

562 if (correct.skipped) {

563 - if (regexp_ == NULL || !IsBMP(context)) // okay to skip in UCS-2 mode

564 + if (regexp_ == NULL)

565 return true;

566 LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_)

567 << " " << FormatMode(flags_);

OLD	NEW

« no previous file with comments | « third_party/re2/testinstall.cc ('k') | third_party/re2/util/atomicops.h » ('j') | no next file with comments »