Index: third_party/re2/ucs2.diff |
diff --git a/third_party/re2/ucs2.diff b/third_party/re2/ucs2.diff |
deleted file mode 100644 |
index 57aec04a15cfd2fc93dd75468841076a7042fc97..0000000000000000000000000000000000000000 |
--- a/third_party/re2/ucs2.diff |
+++ /dev/null |
@@ -1,567 +0,0 @@ |
-This is a dump from Google's source control system of the change |
-that removed UCS-2 support from RE2. As the explanation below |
-says, UCS-2 mode is fundamentally at odds with things like ^ and $, |
-so it never really worked very well. But if you are interested in using |
-it without those operators, it did work for that. It assumed that the |
-UCS-2 data was in the native host byte order. |
- |
-If you are interested in adding UCS-2 mode back, this patch might |
-be a good starting point. |
- |
- |
-Change 12780686 by rsc@rsc-re2 on 2009/09/16 15:30:15 |
- |
- Retire UCS-2 mode. |
- |
- I added it as an experiment for V8, but it |
- requires 2-byte lookahead to do completely, |
- and RE2 has 1-byte lookahead (enough for UTF-8) |
- as a fairly deep fundamental assumption, |
- so it did not support ^ or $. |
- |
-==== re2/bitstate.cc#2 - re2/bitstate.cc#3 ==== |
-re2/bitstate.cc#2:314,321 - re2/bitstate.cc#3:314,319 |
- cap_[0] = p; |
- if (TrySearch(prog_->start(), p)) // Match must be leftmost; done. |
- return true; |
-- if (prog_->flags() & Regexp::UCS2) |
-- p++; |
- } |
- return false; |
- } |
-==== re2/compile.cc#17 - re2/compile.cc#18 ==== |
-re2/compile.cc#17:95,101 - re2/compile.cc#18:95,100 |
- // Input encodings. |
- enum Encoding { |
- kEncodingUTF8 = 1, // UTF-8 (0-10FFFF) |
-- kEncodingUCS2, // UCS-2 (0-FFFF), native byte order |
- kEncodingLatin1, // Latin1 (0-FF) |
- }; |
- |
-re2/compile.cc#17:168,176 - re2/compile.cc#18:167,172 |
- void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase); |
- void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase); |
- void Add_80_10ffff(); |
-- void AddRuneRangeUCS2(Rune lo, Rune hi, bool foldcase); |
-- void AddUCS2Pair(uint8 lo1, uint8 hi1, bool fold1, |
-- uint8 lo2, uint8 hi2, bool fold2); |
- |
- // New suffix that matches the byte range lo-hi, then goes to next. |
- Inst* RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, Inst* next); |
-re2/compile.cc#17:475,481 - re2/compile.cc#18:471,477 |
- |
- // Converts rune range lo-hi into a fragment that recognizes |
- // the bytes that would make up those runes in the current |
-- // encoding (Latin 1, UTF-8, or UCS-2). |
-+ // encoding (Latin 1 or UTF-8). |
- // This lets the machine work byte-by-byte even when |
- // using multibyte encodings. |
- |
-re2/compile.cc#17:488,496 - re2/compile.cc#18:484,489 |
- case kEncodingLatin1: |
- AddRuneRangeLatin1(lo, hi, foldcase); |
- break; |
-- case kEncodingUCS2: |
-- AddRuneRangeUCS2(lo, hi, foldcase); |
-- break; |
- } |
- } |
- |
-re2/compile.cc#17:503,581 - re2/compile.cc#18:496,501 |
- AddSuffix(RuneByteSuffix(lo, hi, foldcase, NULL)); |
- } |
- |
-- // Test whether 16-bit values are big or little endian. |
-- static bool BigEndian() { |
-- union { |
-- char byte[2]; |
-- int16 endian; |
-- } u; |
-- |
-- u.byte[0] = 1; |
-- u.byte[1] = 2; |
-- return u.endian == 0x0102; |
-- } |
-- |
-- void Compiler::AddUCS2Pair(uint8 lo1, uint8 hi1, bool fold1, |
-- uint8 lo2, uint8 hi2, bool fold2) { |
-- Inst* ip; |
-- if (reversed_) { |
-- ip = RuneByteSuffix(lo1, hi1, fold1, NULL); |
-- ip = RuneByteSuffix(lo2, hi2, fold2, ip); |
-- } else { |
-- ip = RuneByteSuffix(lo2, hi2, fold2, NULL); |
-- ip = RuneByteSuffix(lo1, hi1, fold1, ip); |
-- } |
-- AddSuffix(ip); |
-- } |
-- |
-- void Compiler::AddRuneRangeUCS2(Rune lo, Rune hi, bool foldcase) { |
-- if (lo > hi || lo > 0xFFFF) |
-- return; |
-- if (hi > 0xFFFF) |
-- hi = 0xFFFF; |
-- |
-- // We'll assemble a pattern assuming big endian. |
-- // If the machine isn't, tell Cat to reverse its arguments. |
-- bool oldreversed = reversed_; |
-- if (!BigEndian()) { |
-- reversed_ = !oldreversed; |
-- } |
-- |
-- // Split into bytes. |
-- int lo1 = lo >> 8; |
-- int lo2 = lo & 0xFF; |
-- int hi1 = hi >> 8; |
-- int hi2 = hi & 0xFF; |
-- |
-- if (lo1 == hi1) { |
-- // Easy case: high bits are same in both. |
-- // Only do ASCII case folding on the second byte if the top byte is 00. |
-- AddUCS2Pair(lo1, lo1, false, lo2, hi2, lo1==0 && foldcase); |
-- } else { |
-- // Harder case: different second byte ranges depending on first byte. |
-- |
-- // Initial fragment. |
-- if (lo2 > 0) { |
-- AddUCS2Pair(lo1, lo1, false, lo2, 0xFF, lo1==0 && foldcase); |
-- lo1++; |
-- } |
-- |
-- // Trailing fragment. |
-- if (hi2 < 0xFF) { |
-- AddUCS2Pair(hi1, hi1, false, 0, hi2, false); |
-- hi1--; |
-- } |
-- |
-- // Inner ranges. |
-- if (lo1 <= hi1) { |
-- AddUCS2Pair(lo1, hi1, false, 0, 0xFF, false); |
-- } |
-- } |
-- |
-- // Restore reverse setting. |
-- reversed_ = oldreversed; |
-- } |
-- |
- // Table describing how to make a UTF-8 matching machine |
- // for the rune range 80-10FFFF (Runeself-Runemax). |
- // This range happens frequently enough (for example /./ and /[^a-z]/) |
-re2/compile.cc#17:707,716 - re2/compile.cc#18:627,634 |
- |
- Frag Compiler::Literal(Rune r, bool foldcase) { |
- switch (encoding_) { |
-- default: // UCS-2 or something new |
-- BeginRange(); |
-- AddRuneRange(r, r, foldcase); |
-- return EndRange(); |
-+ default: |
-+ return kNullFrag; |
- |
- case kEncodingLatin1: |
- return ByteRange(r, r, foldcase); |
-re2/compile.cc#17:927,934 - re2/compile.cc#18:845,850 |
- |
- if (re->parse_flags() & Regexp::Latin1) |
- c.encoding_ = kEncodingLatin1; |
-- else if (re->parse_flags() & Regexp::UCS2) |
-- c.encoding_ = kEncodingUCS2; |
- c.reversed_ = reversed; |
- if (max_mem <= 0) { |
- c.max_inst_ = 100000; // more than enough |
-re2/compile.cc#17:983,993 - re2/compile.cc#18:899,905 |
- c.prog_->set_start_unanchored(c.prog_->start()); |
- } else { |
- Frag dot; |
-- if (c.encoding_ == kEncodingUCS2) { |
-- dot = c.Cat(c.ByteRange(0x00, 0xFF, false), c.ByteRange(0x00, 0xFF, false)); |
-- } else { |
-- dot = c.ByteRange(0x00, 0xFF, false); |
-- } |
-+ dot = c.ByteRange(0x00, 0xFF, false); |
- Frag dotloop = c.Star(dot, true); |
- Frag unanchored = c.Cat(dotloop, all); |
- c.prog_->set_start_unanchored(unanchored.begin); |
-==== re2/nfa.cc#8 - re2/nfa.cc#9 ==== |
-re2/nfa.cc#8:426,432 - re2/nfa.cc#9:426,431 |
- const char* bp = context.begin(); |
- int c = -1; |
- int wasword = 0; |
-- bool ucs2 = prog_->flags() & Regexp::UCS2; |
- |
- if (text.begin() > context.begin()) { |
- c = text.begin()[-1] & 0xFF; |
-re2/nfa.cc#8:492,498 - re2/nfa.cc#9:491,497 |
- // If there's a required first byte for an unanchored search |
- // and we're not in the middle of any possible matches, |
- // use memchr to search for the byte quickly. |
-- if (!ucs2 && !anchored && first_byte_ >= 0 && runq->size() == 0 && |
-+ if (!anchored && first_byte_ >= 0 && runq->size() == 0 && |
- p < text.end() && (p[0] & 0xFF) != first_byte_) { |
- p = reinterpret_cast<const char*>(memchr(p, first_byte_, |
- text.end() - p)); |
-re2/nfa.cc#8:505,526 - re2/nfa.cc#9:504,514 |
- flag = Prog::EmptyFlags(context, p); |
- } |
- |
-- // In UCS-2 mode, if we need to start a new thread, |
-- // make sure to do it on an even boundary. |
-- if(ucs2 && runq->size() == 0 && |
-- (p - context.begin()) % 2 && p < text.end()) { |
-- p++; |
-- flag = Prog::EmptyFlags(context, p); |
-- } |
-- |
- // Steal match storage (cleared but unused as of yet) |
- // temporarily to hold match boundaries for new thread. |
-- // In UCS-2 mode, only start the thread on a 2-byte boundary. |
-- if(!ucs2 || (p - context.begin()) % 2 == 0) { |
-- match_[0] = p; |
-- AddToThreadq(runq, start_, flag, p, match_); |
-- match_[0] = NULL; |
-- } |
-+ match_[0] = p; |
-+ AddToThreadq(runq, start_, flag, p, match_); |
-+ match_[0] = NULL; |
- } |
- |
- // If all the threads have died, stop early. |
-==== re2/parse.cc#22 - re2/parse.cc#23 ==== |
-re2/parse.cc#22:160,167 - re2/parse.cc#23:160,165 |
- status_(status), stacktop_(NULL), ncap_(0) { |
- if (flags_ & Latin1) |
- rune_max_ = 0xFF; |
-- else if (flags & UCS2) |
-- rune_max_ = 0xFFFF; |
- else |
- rune_max_ = Runemax; |
- } |
-re2/parse.cc#22:365,387 - re2/parse.cc#23:363,374 |
- bool Regexp::ParseState::PushCarat() { |
- if (flags_ & OneLine) { |
- return PushSimpleOp(kRegexpBeginText); |
-- } else { |
-- if (flags_ & UCS2) { |
-- status_->set_code(kRegexpUnsupported); |
-- status_->set_error_arg("multiline ^ in UCS-2 mode"); |
-- return false; |
-- } |
-- return PushSimpleOp(kRegexpBeginLine); |
- } |
-+ return PushSimpleOp(kRegexpBeginLine); |
- } |
- |
- // Pushes a \b or \B onto the stack. |
- bool Regexp::ParseState::PushWordBoundary(bool word) { |
-- if (flags_ & UCS2) { |
-- status_->set_code(kRegexpUnsupported); |
-- status_->set_error_arg("\\b or \\B in UCS-2 mode"); |
-- return false; |
-- } |
- if (word) |
- return PushSimpleOp(kRegexpWordBoundary); |
- return PushSimpleOp(kRegexpNoWordBoundary); |
-re2/parse.cc#22:397,407 - re2/parse.cc#23:384,389 |
- bool ret = PushSimpleOp(kRegexpEndText); |
- flags_ = oflags; |
- return ret; |
-- } |
-- if (flags_ & UCS2) { |
-- status_->set_code(kRegexpUnsupported); |
-- status_->set_error_arg("multiline $ in UCS-2 mode"); |
-- return false; |
- } |
- return PushSimpleOp(kRegexpEndLine); |
- } |
-==== re2/re2.cc#34 - re2/re2.cc#35 ==== |
-re2/re2.cc#34:79,86 - re2/re2.cc#35:79,84 |
- return RE2::ErrorBadUTF8; |
- case re2::kRegexpBadNamedCapture: |
- return RE2::ErrorBadNamedCapture; |
-- case re2::kRegexpUnsupported: |
-- return RE2::ErrorUnsupported; |
- } |
- return RE2::ErrorInternal; |
- } |
-re2/re2.cc#34:122,130 - re2/re2.cc#35:120,125 |
- break; |
- case RE2::Options::EncodingLatin1: |
- flags |= Regexp::Latin1; |
-- break; |
-- case RE2::Options::EncodingUCS2: |
-- flags |= Regexp::UCS2; |
- break; |
- } |
- |
-==== re2/re2.h#36 - re2/re2.h#37 ==== |
-re2/re2.h#36:246,252 - re2/re2.h#37:246,251 |
- ErrorBadUTF8, // invalid UTF-8 in regexp |
- ErrorBadNamedCapture, // bad named capture group |
- ErrorPatternTooLarge, // pattern too large (compile failed) |
-- ErrorUnsupported, // unsupported feature (in UCS-2 mode) |
- }; |
- |
- // Predefined common options. |
-re2/re2.h#36:570,576 - re2/re2.h#37:569,574 |
- |
- enum Encoding { |
- EncodingUTF8 = 1, |
-- EncodingUCS2, // 16-bit Unicode 0-FFFF only |
- EncodingLatin1 |
- }; |
- |
-==== re2/regexp.cc#15 - re2/regexp.cc#16 ==== |
-re2/regexp.cc#15:324,333 - re2/regexp.cc#16:324,329 |
- // the regexp that remains after the prefix. The prefix might |
- // be ASCII case-insensitive. |
- bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) { |
-- // Don't even bother for UCS-2; it's time to throw that code away. |
-- if (parse_flags_ & UCS2) |
-- return false; |
-- |
- // No need for a walker: the regexp must be of the form |
- // 1. some number of ^ anchors |
- // 2. a literal char or string |
-==== re2/regexp.h#20 - re2/regexp.h#21 ==== |
-re2/regexp.h#20:187,193 - re2/regexp.h#21:187,192 |
- kRegexpBadPerlOp, // bad perl operator |
- kRegexpBadUTF8, // invalid UTF-8 in regexp |
- kRegexpBadNamedCapture, // bad named capture |
-- kRegexpUnsupported, // unsupported operator |
- }; |
- |
- // Error status for certain operations. |
-re2/regexp.h#20:307,316 - re2/regexp.h#21:306,314 |
- // \Q and \E to disable/enable metacharacters |
- // (?P<name>expr) for named captures |
- // \C to match any single byte |
-- UCS2 = 1<<10, // Text is in UCS-2, regexp is in UTF-8. |
-- UnicodeGroups = 1<<11, // Allow \p{Han} for Unicode Han group |
-+ UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group |
- // and \P{Han} for its negation. |
-- NeverNL = 1<<12, // Never match NL, even if the regexp mentions |
-+ NeverNL = 1<<11, // Never match NL, even if the regexp mentions |
- // it explicitly. |
- |
- // As close to Perl as we can get. |
-==== re2/testing/backtrack.cc#4 - re2/testing/backtrack.cc#5 ==== |
-re2/testing/backtrack.cc#4:134,141 - re2/testing/backtrack.cc#5:134,139 |
- cap_[0] = p; |
- if (Visit(prog_->start(), p)) // Match must be leftmost; done. |
- return true; |
-- if (prog_->flags() & Regexp::UCS2) |
-- p++; |
- } |
- return false; |
- } |
-==== re2/testing/tester.cc#12 - re2/testing/tester.cc#13 ==== |
-re2/testing/tester.cc#12:144,154 - re2/testing/tester.cc#13:144,152 |
- static ParseMode parse_modes[] = { |
- { single_line, "single-line" }, |
- { single_line|Regexp::Latin1, "single-line, latin1" }, |
-- { single_line|Regexp::UCS2, "single-line, ucs2" }, |
- { multi_line, "multiline" }, |
- { multi_line|Regexp::NonGreedy, "multiline, nongreedy" }, |
- { multi_line|Regexp::Latin1, "multiline, latin1" }, |
-- { multi_line|Regexp::UCS2, "multiline, ucs2" }, |
- }; |
- |
- static string FormatMode(Regexp::ParseFlags flags) { |
-re2/testing/tester.cc#12:179,189 - re2/testing/tester.cc#13:177,185 |
- RegexpStatus status; |
- regexp_ = Regexp::Parse(regexp_str, flags, &status); |
- if (regexp_ == NULL) { |
-- if (status.code() != kRegexpUnsupported) { |
-- LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_) |
-- << " mode: " << FormatMode(flags); |
-- error_ = true; |
-- } |
-+ LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_) |
-+ << " mode: " << FormatMode(flags); |
-+ error_ = true; |
- return; |
- } |
- prog_ = regexp_->CompileToProg(0); |
-re2/testing/tester.cc#12:230,237 - re2/testing/tester.cc#13:226,231 |
- RE2::Options options; |
- if (flags & Regexp::Latin1) |
- options.set_encoding(RE2::Options::EncodingLatin1); |
-- else if (flags & Regexp::UCS2) |
-- options.set_encoding(RE2::Options::EncodingUCS2); |
- if (kind_ == Prog::kLongestMatch) |
- options.set_longest_match(true); |
- re2_ = new RE2(re, options); |
-re2/testing/tester.cc#12:281,379 - re2/testing/tester.cc#13:275,280 |
- delete re2_; |
- } |
- |
-- // Converts UTF-8 string in text into UCS-2 string in new_text. |
-- static bool ConvertUTF8ToUCS2(const StringPiece& text, StringPiece* new_text) { |
-- const char* p = text.begin(); |
-- const char* ep = text.end(); |
-- uint16* q = new uint16[ep - p]; |
-- uint16* q0 = q; |
-- |
-- int n; |
-- Rune r; |
-- for (; p < ep; p += n) { |
-- if (!fullrune(p, ep - p)) { |
-- delete[] q0; |
-- return false; |
-- } |
-- n = chartorune(&r, p); |
-- if (r > 0xFFFF) { |
-- delete[] q0; |
-- return false; |
-- } |
-- *q++ = r; |
-- } |
-- *new_text = StringPiece(reinterpret_cast<char*>(q0), 2*(q - q0)); |
-- return true; |
-- } |
-- |
-- // Rewrites *sp from being a pointer into text8 (UTF-8) |
-- // to being a pointer into text16 (equivalent text but in UCS-2). |
-- static void AdjustUTF8ToUCS2(const StringPiece& text8, const StringPiece& text16, |
-- StringPiece *sp) { |
-- if (sp->begin() == NULL && text8.begin() != NULL) |
-- return; |
-- |
-- int nrune = 0; |
-- int n; |
-- Rune r; |
-- const char* p = text8.begin(); |
-- const char* ep = text8.end(); |
-- const char* spbegin = NULL; |
-- const char* spend = NULL; |
-- for (;;) { |
-- if (p == sp->begin()) |
-- spbegin = text16.begin() + sizeof(uint16)*nrune; |
-- if (p == sp->end()) |
-- spend = text16.begin() + sizeof(uint16)*nrune; |
-- if (p >= ep) |
-- break; |
-- n = chartorune(&r, p); |
-- p += n; |
-- nrune++; |
-- } |
-- if (spbegin == NULL || spend == NULL) { |
-- LOG(FATAL) << "Error in AdjustUTF8ToUCS2 " |
-- << CEscape(text8) << " " |
-- << (int)(sp->begin() - text8.begin()) << " " |
-- << (int)(sp->end() - text8.begin()); |
-- } |
-- *sp = StringPiece(spbegin, spend - spbegin); |
-- } |
-- |
-- // Rewrites *sp from being a pointer into text16 (UCS-2) |
-- // to being a pointer into text8 (equivalent text but in UTF-8). |
-- static void AdjustUCS2ToUTF8(const StringPiece& text16, const StringPiece& text8, |
-- StringPiece* sp) { |
-- if (sp->begin() == NULL) |
-- return; |
-- |
-- int nrune = 0; |
-- int n; |
-- Rune r; |
-- const char* p = text8.begin(); |
-- const char* ep = text8.end(); |
-- const char* spbegin = NULL; |
-- const char* spend = NULL; |
-- for (;;) { |
-- if (nrune == (sp->begin() - text16.begin())/2) |
-- spbegin = p; |
-- if (nrune == (sp->end() - text16.begin())/2) |
-- spend = p; |
-- if (p >= ep) |
-- break; |
-- n = chartorune(&r, p); |
-- p += n; |
-- nrune++; |
-- } |
-- if (text8.begin() != NULL && (spbegin == NULL || spend == NULL)) { |
-- LOG(FATAL) << "Error in AdjustUCS2ToUTF8 " |
-- << CEscape(text16) << " " |
-- << (int)(sp->begin() - text16.begin()) << " " |
-- << (int)(sp->end() - text16.begin()); |
-- } |
-- *sp = StringPiece(spbegin, spend - spbegin); |
-- } |
-- |
- // Runs a single search using the named engine type. |
- // This interface hides all the irregularities of the various |
- // engine interfaces from the rest of this file. |
-re2/testing/tester.cc#12:393,411 - re2/testing/tester.cc#13:294,300 |
- |
- StringPiece text = orig_text; |
- StringPiece context = orig_context; |
-- bool ucs2 = false; |
- |
-- if ((flags() & Regexp::UCS2) && type != kEnginePCRE) { |
-- if (!ConvertUTF8ToUCS2(orig_context, &context)) { |
-- result->skipped = true; |
-- return; |
-- } |
-- |
-- // Rewrite context to refer to new text. |
-- AdjustUTF8ToUCS2(orig_context, context, &text); |
-- ucs2 = true; |
-- } |
-- |
- switch (type) { |
- default: |
- LOG(FATAL) << "Bad RunSearch type: " << (int)type; |
-re2/testing/tester.cc#12:557,577 - re2/testing/tester.cc#13:446,451 |
- } |
- } |
- |
-- // If we did UCS-2 matching, rewrite the matches to refer |
-- // to the original UTF-8 text. |
-- if (ucs2) { |
-- if (result->matched) { |
-- if (result->have_submatch0) { |
-- AdjustUCS2ToUTF8(context, orig_context, &result->submatch[0]); |
-- } else if (result->have_submatch) { |
-- for (int i = 0; i < nsubmatch; i++) { |
-- AdjustUCS2ToUTF8(context, orig_context, &result->submatch[i]); |
-- } |
-- } |
-- } |
-- delete[] context.begin(); |
-- } |
-- |
- if (!result->matched) |
- memset(result->submatch, 0, sizeof result->submatch); |
- } |
-re2/testing/tester.cc#12:596,617 - re2/testing/tester.cc#13:470,475 |
- return true; |
- } |
- |
-- // Check whether text uses only Unicode points <= 0xFFFF |
-- // (in the BMP). |
-- static bool IsBMP(const StringPiece& text) { |
-- const char* p = text.begin(); |
-- const char* ep = text.end(); |
-- while (p < ep) { |
-- if (!fullrune(p, ep - p)) |
-- return false; |
-- Rune r; |
-- p += chartorune(&r, p); |
-- if (r > 0xFFFF) |
-- return false; |
-- } |
-- return true; |
-- } |
-- |
- // Runs a single test. |
- bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context, |
- Prog::Anchor anchor) { |
-re2/testing/tester.cc#12:619,625 - re2/testing/tester.cc#13:477,483 |
- Result correct; |
- RunSearch(kEngineBacktrack, text, context, anchor, &correct); |
- if (correct.skipped) { |
-- if (regexp_ == NULL || !IsBMP(context)) // okay to skip in UCS-2 mode |
-+ if (regexp_ == NULL) |
- return true; |
- LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_) |
- << " " << FormatMode(flags_); |