OLD | NEW |
| (Empty) |
1 This is a dump from Google's source control system of the change | |
2 that removed UCS-2 support from RE2. As the explanation below | |
3 says, UCS-2 mode is fundamentally at odds with things like ^ and $, | |
4 so it never really worked very well. But if you are interested in using | |
5 it without those operators, it did work for that. It assumed that the | |
6 UCS-2 data was in the native host byte order. | |
7 | |
8 If you are interested in adding UCS-2 mode back, this patch might | |
9 be a good starting point. | |
10 | |
11 | |
12 Change 12780686 by rsc@rsc-re2 on 2009/09/16 15:30:15 | |
13 | |
14 Retire UCS-2 mode. | |
15 | |
16 I added it as an experiment for V8, but it | |
17 requires 2-byte lookahead to do completely, | |
18 and RE2 has 1-byte lookahead (enough for UTF-8) | |
19 as a fairly deep fundamental assumption, | |
20 so it did not support ^ or $. | |
21 | |
22 ==== re2/bitstate.cc#2 - re2/bitstate.cc#3 ==== | |
23 re2/bitstate.cc#2:314,321 - re2/bitstate.cc#3:314,319 | |
24 cap_[0] = p; | |
25 if (TrySearch(prog_->start(), p)) // Match must be leftmost; done. | |
26 return true; | |
27 - if (prog_->flags() & Regexp::UCS2) | |
28 - p++; | |
29 } | |
30 return false; | |
31 } | |
32 ==== re2/compile.cc#17 - re2/compile.cc#18 ==== | |
33 re2/compile.cc#17:95,101 - re2/compile.cc#18:95,100 | |
34 // Input encodings. | |
35 enum Encoding { | |
36 kEncodingUTF8 = 1, // UTF-8 (0-10FFFF) | |
37 - kEncodingUCS2, // UCS-2 (0-FFFF), native byte order | |
38 kEncodingLatin1, // Latin1 (0-FF) | |
39 }; | |
40 | |
41 re2/compile.cc#17:168,176 - re2/compile.cc#18:167,172 | |
42 void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase); | |
43 void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase); | |
44 void Add_80_10ffff(); | |
45 - void AddRuneRangeUCS2(Rune lo, Rune hi, bool foldcase); | |
46 - void AddUCS2Pair(uint8 lo1, uint8 hi1, bool fold1, | |
47 - uint8 lo2, uint8 hi2, bool fold2); | |
48 | |
49 // New suffix that matches the byte range lo-hi, then goes to next. | |
50 Inst* RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, Inst* next); | |
51 re2/compile.cc#17:475,481 - re2/compile.cc#18:471,477 | |
52 | |
53 // Converts rune range lo-hi into a fragment that recognizes | |
54 // the bytes that would make up those runes in the current | |
55 - // encoding (Latin 1, UTF-8, or UCS-2). | |
56 + // encoding (Latin 1 or UTF-8). | |
57 // This lets the machine work byte-by-byte even when | |
58 // using multibyte encodings. | |
59 | |
60 re2/compile.cc#17:488,496 - re2/compile.cc#18:484,489 | |
61 case kEncodingLatin1: | |
62 AddRuneRangeLatin1(lo, hi, foldcase); | |
63 break; | |
64 - case kEncodingUCS2: | |
65 - AddRuneRangeUCS2(lo, hi, foldcase); | |
66 - break; | |
67 } | |
68 } | |
69 | |
70 re2/compile.cc#17:503,581 - re2/compile.cc#18:496,501 | |
71 AddSuffix(RuneByteSuffix(lo, hi, foldcase, NULL)); | |
72 } | |
73 | |
74 - // Test whether 16-bit values are big or little endian. | |
75 - static bool BigEndian() { | |
76 - union { | |
77 - char byte[2]; | |
78 - int16 endian; | |
79 - } u; | |
80 - | |
81 - u.byte[0] = 1; | |
82 - u.byte[1] = 2; | |
83 - return u.endian == 0x0102; | |
84 - } | |
85 - | |
86 - void Compiler::AddUCS2Pair(uint8 lo1, uint8 hi1, bool fold1, | |
87 - uint8 lo2, uint8 hi2, bool fold2) { | |
88 - Inst* ip; | |
89 - if (reversed_) { | |
90 - ip = RuneByteSuffix(lo1, hi1, fold1, NULL); | |
91 - ip = RuneByteSuffix(lo2, hi2, fold2, ip); | |
92 - } else { | |
93 - ip = RuneByteSuffix(lo2, hi2, fold2, NULL); | |
94 - ip = RuneByteSuffix(lo1, hi1, fold1, ip); | |
95 - } | |
96 - AddSuffix(ip); | |
97 - } | |
98 - | |
99 - void Compiler::AddRuneRangeUCS2(Rune lo, Rune hi, bool foldcase) { | |
100 - if (lo > hi || lo > 0xFFFF) | |
101 - return; | |
102 - if (hi > 0xFFFF) | |
103 - hi = 0xFFFF; | |
104 - | |
105 - // We'll assemble a pattern assuming big endian. | |
106 - // If the machine isn't, tell Cat to reverse its arguments. | |
107 - bool oldreversed = reversed_; | |
108 - if (!BigEndian()) { | |
109 - reversed_ = !oldreversed; | |
110 - } | |
111 - | |
112 - // Split into bytes. | |
113 - int lo1 = lo >> 8; | |
114 - int lo2 = lo & 0xFF; | |
115 - int hi1 = hi >> 8; | |
116 - int hi2 = hi & 0xFF; | |
117 - | |
118 - if (lo1 == hi1) { | |
119 - // Easy case: high bits are same in both. | |
120 - // Only do ASCII case folding on the second byte if the top byte is 00. | |
121 - AddUCS2Pair(lo1, lo1, false, lo2, hi2, lo1==0 && foldcase); | |
122 - } else { | |
123 - // Harder case: different second byte ranges depending on first byte. | |
124 - | |
125 - // Initial fragment. | |
126 - if (lo2 > 0) { | |
127 - AddUCS2Pair(lo1, lo1, false, lo2, 0xFF, lo1==0 && foldcase); | |
128 - lo1++; | |
129 - } | |
130 - | |
131 - // Trailing fragment. | |
132 - if (hi2 < 0xFF) { | |
133 - AddUCS2Pair(hi1, hi1, false, 0, hi2, false); | |
134 - hi1--; | |
135 - } | |
136 - | |
137 - // Inner ranges. | |
138 - if (lo1 <= hi1) { | |
139 - AddUCS2Pair(lo1, hi1, false, 0, 0xFF, false); | |
140 - } | |
141 - } | |
142 - | |
143 - // Restore reverse setting. | |
144 - reversed_ = oldreversed; | |
145 - } | |
146 - | |
147 // Table describing how to make a UTF-8 matching machine | |
148 // for the rune range 80-10FFFF (Runeself-Runemax). | |
149 // This range happens frequently enough (for example /./ and /[^a-z]/) | |
150 re2/compile.cc#17:707,716 - re2/compile.cc#18:627,634 | |
151 | |
152 Frag Compiler::Literal(Rune r, bool foldcase) { | |
153 switch (encoding_) { | |
154 - default: // UCS-2 or something new | |
155 - BeginRange(); | |
156 - AddRuneRange(r, r, foldcase); | |
157 - return EndRange(); | |
158 + default: | |
159 + return kNullFrag; | |
160 | |
161 case kEncodingLatin1: | |
162 return ByteRange(r, r, foldcase); | |
163 re2/compile.cc#17:927,934 - re2/compile.cc#18:845,850 | |
164 | |
165 if (re->parse_flags() & Regexp::Latin1) | |
166 c.encoding_ = kEncodingLatin1; | |
167 - else if (re->parse_flags() & Regexp::UCS2) | |
168 - c.encoding_ = kEncodingUCS2; | |
169 c.reversed_ = reversed; | |
170 if (max_mem <= 0) { | |
171 c.max_inst_ = 100000; // more than enough | |
172 re2/compile.cc#17:983,993 - re2/compile.cc#18:899,905 | |
173 c.prog_->set_start_unanchored(c.prog_->start()); | |
174 } else { | |
175 Frag dot; | |
176 - if (c.encoding_ == kEncodingUCS2) { | |
177 - dot = c.Cat(c.ByteRange(0x00, 0xFF, false), c.ByteRange(0x00, 0xFF, false)); | |
178 - } else { | |
179 - dot = c.ByteRange(0x00, 0xFF, false); | |
180 - } | |
181 + dot = c.ByteRange(0x00, 0xFF, false); | |
182 Frag dotloop = c.Star(dot, true); | |
183 Frag unanchored = c.Cat(dotloop, all); | |
184 c.prog_->set_start_unanchored(unanchored.begin); | |
185 ==== re2/nfa.cc#8 - re2/nfa.cc#9 ==== | |
186 re2/nfa.cc#8:426,432 - re2/nfa.cc#9:426,431 | |
187 const char* bp = context.begin(); | |
188 int c = -1; | |
189 int wasword = 0; | |
190 - bool ucs2 = prog_->flags() & Regexp::UCS2; | |
191 | |
192 if (text.begin() > context.begin()) { | |
193 c = text.begin()[-1] & 0xFF; | |
194 re2/nfa.cc#8:492,498 - re2/nfa.cc#9:491,497 | |
195 // If there's a required first byte for an unanchored search | |
196 // and we're not in the middle of any possible matches, | |
197 // use memchr to search for the byte quickly. | |
198 - if (!ucs2 && !anchored && first_byte_ >= 0 && runq->size() == 0 && | |
199 + if (!anchored && first_byte_ >= 0 && runq->size() == 0 && | |
200 p < text.end() && (p[0] & 0xFF) != first_byte_) { | |
201 p = reinterpret_cast<const char*>(memchr(p, first_byte_, | |
202 text.end() - p)); | |
203 re2/nfa.cc#8:505,526 - re2/nfa.cc#9:504,514 | |
204 flag = Prog::EmptyFlags(context, p); | |
205 } | |
206 | |
207 - // In UCS-2 mode, if we need to start a new thread, | |
208 - // make sure to do it on an even boundary. | |
209 - if(ucs2 && runq->size() == 0 && | |
210 - (p - context.begin()) % 2 && p < text.end()) { | |
211 - p++; | |
212 - flag = Prog::EmptyFlags(context, p); | |
213 - } | |
214 - | |
215 // Steal match storage (cleared but unused as of yet) | |
216 // temporarily to hold match boundaries for new thread. | |
217 - // In UCS-2 mode, only start the thread on a 2-byte boundary. | |
218 - if(!ucs2 || (p - context.begin()) % 2 == 0) { | |
219 - match_[0] = p; | |
220 - AddToThreadq(runq, start_, flag, p, match_); | |
221 - match_[0] = NULL; | |
222 - } | |
223 + match_[0] = p; | |
224 + AddToThreadq(runq, start_, flag, p, match_); | |
225 + match_[0] = NULL; | |
226 } | |
227 | |
228 // If all the threads have died, stop early. | |
229 ==== re2/parse.cc#22 - re2/parse.cc#23 ==== | |
230 re2/parse.cc#22:160,167 - re2/parse.cc#23:160,165 | |
231 status_(status), stacktop_(NULL), ncap_(0) { | |
232 if (flags_ & Latin1) | |
233 rune_max_ = 0xFF; | |
234 - else if (flags & UCS2) | |
235 - rune_max_ = 0xFFFF; | |
236 else | |
237 rune_max_ = Runemax; | |
238 } | |
239 re2/parse.cc#22:365,387 - re2/parse.cc#23:363,374 | |
240 bool Regexp::ParseState::PushCarat() { | |
241 if (flags_ & OneLine) { | |
242 return PushSimpleOp(kRegexpBeginText); | |
243 - } else { | |
244 - if (flags_ & UCS2) { | |
245 - status_->set_code(kRegexpUnsupported); | |
246 - status_->set_error_arg("multiline ^ in UCS-2 mode"); | |
247 - return false; | |
248 - } | |
249 - return PushSimpleOp(kRegexpBeginLine); | |
250 } | |
251 + return PushSimpleOp(kRegexpBeginLine); | |
252 } | |
253 | |
254 // Pushes a \b or \B onto the stack. | |
255 bool Regexp::ParseState::PushWordBoundary(bool word) { | |
256 - if (flags_ & UCS2) { | |
257 - status_->set_code(kRegexpUnsupported); | |
258 - status_->set_error_arg("\\b or \\B in UCS-2 mode"); | |
259 - return false; | |
260 - } | |
261 if (word) | |
262 return PushSimpleOp(kRegexpWordBoundary); | |
263 return PushSimpleOp(kRegexpNoWordBoundary); | |
264 re2/parse.cc#22:397,407 - re2/parse.cc#23:384,389 | |
265 bool ret = PushSimpleOp(kRegexpEndText); | |
266 flags_ = oflags; | |
267 return ret; | |
268 - } | |
269 - if (flags_ & UCS2) { | |
270 - status_->set_code(kRegexpUnsupported); | |
271 - status_->set_error_arg("multiline $ in UCS-2 mode"); | |
272 - return false; | |
273 } | |
274 return PushSimpleOp(kRegexpEndLine); | |
275 } | |
276 ==== re2/re2.cc#34 - re2/re2.cc#35 ==== | |
277 re2/re2.cc#34:79,86 - re2/re2.cc#35:79,84 | |
278 return RE2::ErrorBadUTF8; | |
279 case re2::kRegexpBadNamedCapture: | |
280 return RE2::ErrorBadNamedCapture; | |
281 - case re2::kRegexpUnsupported: | |
282 - return RE2::ErrorUnsupported; | |
283 } | |
284 return RE2::ErrorInternal; | |
285 } | |
286 re2/re2.cc#34:122,130 - re2/re2.cc#35:120,125 | |
287 break; | |
288 case RE2::Options::EncodingLatin1: | |
289 flags |= Regexp::Latin1; | |
290 - break; | |
291 - case RE2::Options::EncodingUCS2: | |
292 - flags |= Regexp::UCS2; | |
293 break; | |
294 } | |
295 | |
296 ==== re2/re2.h#36 - re2/re2.h#37 ==== | |
297 re2/re2.h#36:246,252 - re2/re2.h#37:246,251 | |
298 ErrorBadUTF8, // invalid UTF-8 in regexp | |
299 ErrorBadNamedCapture, // bad named capture group | |
300 ErrorPatternTooLarge, // pattern too large (compile failed) | |
301 - ErrorUnsupported, // unsupported feature (in UCS-2 mode) | |
302 }; | |
303 | |
304 // Predefined common options. | |
305 re2/re2.h#36:570,576 - re2/re2.h#37:569,574 | |
306 | |
307 enum Encoding { | |
308 EncodingUTF8 = 1, | |
309 - EncodingUCS2, // 16-bit Unicode 0-FFFF only | |
310 EncodingLatin1 | |
311 }; | |
312 | |
313 ==== re2/regexp.cc#15 - re2/regexp.cc#16 ==== | |
314 re2/regexp.cc#15:324,333 - re2/regexp.cc#16:324,329 | |
315 // the regexp that remains after the prefix. The prefix might | |
316 // be ASCII case-insensitive. | |
317 bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) { | |
318 - // Don't even bother for UCS-2; it's time to throw that code away. | |
319 - if (parse_flags_ & UCS2) | |
320 - return false; | |
321 - | |
322 // No need for a walker: the regexp must be of the form | |
323 // 1. some number of ^ anchors | |
324 // 2. a literal char or string | |
325 ==== re2/regexp.h#20 - re2/regexp.h#21 ==== | |
326 re2/regexp.h#20:187,193 - re2/regexp.h#21:187,192 | |
327 kRegexpBadPerlOp, // bad perl operator | |
328 kRegexpBadUTF8, // invalid UTF-8 in regexp | |
329 kRegexpBadNamedCapture, // bad named capture | |
330 - kRegexpUnsupported, // unsupported operator | |
331 }; | |
332 | |
333 // Error status for certain operations. | |
334 re2/regexp.h#20:307,316 - re2/regexp.h#21:306,314 | |
335 // \Q and \E to disable/enable metacharacters | |
336 // (?P<name>expr) for named captures | |
337 // \C to match any single byte | |
338 - UCS2 = 1<<10, // Text is in UCS-2, regexp is in UTF-8. | |
339 - UnicodeGroups = 1<<11, // Allow \p{Han} for Unicode Han group | |
340 + UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group | |
341 // and \P{Han} for its negation. | |
342 - NeverNL = 1<<12, // Never match NL, even if the regexp mentions | |
343 + NeverNL = 1<<11, // Never match NL, even if the regexp mentions | |
344 // it explicitly. | |
345 | |
346 // As close to Perl as we can get. | |
347 ==== re2/testing/backtrack.cc#4 - re2/testing/backtrack.cc#5 ==== | |
348 re2/testing/backtrack.cc#4:134,141 - re2/testing/backtrack.cc#5:134,139 | |
349 cap_[0] = p; | |
350 if (Visit(prog_->start(), p)) // Match must be leftmost; done. | |
351 return true; | |
352 - if (prog_->flags() & Regexp::UCS2) | |
353 - p++; | |
354 } | |
355 return false; | |
356 } | |
357 ==== re2/testing/tester.cc#12 - re2/testing/tester.cc#13 ==== | |
358 re2/testing/tester.cc#12:144,154 - re2/testing/tester.cc#13:144,152 | |
359 static ParseMode parse_modes[] = { | |
360 { single_line, "single-line" }, | |
361 { single_line|Regexp::Latin1, "single-line, latin1" }, | |
362 - { single_line|Regexp::UCS2, "single-line, ucs2" }, | |
363 { multi_line, "multiline" }, | |
364 { multi_line|Regexp::NonGreedy, "multiline, nongreedy" }, | |
365 { multi_line|Regexp::Latin1, "multiline, latin1" }, | |
366 - { multi_line|Regexp::UCS2, "multiline, ucs2" }, | |
367 }; | |
368 | |
369 static string FormatMode(Regexp::ParseFlags flags) { | |
370 re2/testing/tester.cc#12:179,189 - re2/testing/tester.cc#13:177,185 | |
371 RegexpStatus status; | |
372 regexp_ = Regexp::Parse(regexp_str, flags, &status); | |
373 if (regexp_ == NULL) { | |
374 - if (status.code() != kRegexpUnsupported) { | |
375 - LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_) | |
376 - << " mode: " << FormatMode(flags); | |
377 - error_ = true; | |
378 - } | |
379 + LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_) | |
380 + << " mode: " << FormatMode(flags); | |
381 + error_ = true; | |
382 return; | |
383 } | |
384 prog_ = regexp_->CompileToProg(0); | |
385 re2/testing/tester.cc#12:230,237 - re2/testing/tester.cc#13:226,231 | |
386 RE2::Options options; | |
387 if (flags & Regexp::Latin1) | |
388 options.set_encoding(RE2::Options::EncodingLatin1); | |
389 - else if (flags & Regexp::UCS2) | |
390 - options.set_encoding(RE2::Options::EncodingUCS2); | |
391 if (kind_ == Prog::kLongestMatch) | |
392 options.set_longest_match(true); | |
393 re2_ = new RE2(re, options); | |
394 re2/testing/tester.cc#12:281,379 - re2/testing/tester.cc#13:275,280 | |
395 delete re2_; | |
396 } | |
397 | |
398 - // Converts UTF-8 string in text into UCS-2 string in new_text. | |
399 - static bool ConvertUTF8ToUCS2(const StringPiece& text, StringPiece* new_text) { | |
400 - const char* p = text.begin(); | |
401 - const char* ep = text.end(); | |
402 - uint16* q = new uint16[ep - p]; | |
403 - uint16* q0 = q; | |
404 - | |
405 - int n; | |
406 - Rune r; | |
407 - for (; p < ep; p += n) { | |
408 - if (!fullrune(p, ep - p)) { | |
409 - delete[] q0; | |
410 - return false; | |
411 - } | |
412 - n = chartorune(&r, p); | |
413 - if (r > 0xFFFF) { | |
414 - delete[] q0; | |
415 - return false; | |
416 - } | |
417 - *q++ = r; | |
418 - } | |
419 - *new_text = StringPiece(reinterpret_cast<char*>(q0), 2*(q - q0)); | |
420 - return true; | |
421 - } | |
422 - | |
423 - // Rewrites *sp from being a pointer into text8 (UTF-8) | |
424 - // to being a pointer into text16 (equivalent text but in UCS-2). | |
425 - static void AdjustUTF8ToUCS2(const StringPiece& text8, const StringPiece& text16, | |
426 - StringPiece *sp) { | |
427 - if (sp->begin() == NULL && text8.begin() != NULL) | |
428 - return; | |
429 - | |
430 - int nrune = 0; | |
431 - int n; | |
432 - Rune r; | |
433 - const char* p = text8.begin(); | |
434 - const char* ep = text8.end(); | |
435 - const char* spbegin = NULL; | |
436 - const char* spend = NULL; | |
437 - for (;;) { | |
438 - if (p == sp->begin()) | |
439 - spbegin = text16.begin() + sizeof(uint16)*nrune; | |
440 - if (p == sp->end()) | |
441 - spend = text16.begin() + sizeof(uint16)*nrune; | |
442 - if (p >= ep) | |
443 - break; | |
444 - n = chartorune(&r, p); | |
445 - p += n; | |
446 - nrune++; | |
447 - } | |
448 - if (spbegin == NULL || spend == NULL) { | |
449 - LOG(FATAL) << "Error in AdjustUTF8ToUCS2 " | |
450 - << CEscape(text8) << " " | |
451 - << (int)(sp->begin() - text8.begin()) << " " | |
452 - << (int)(sp->end() - text8.begin()); | |
453 - } | |
454 - *sp = StringPiece(spbegin, spend - spbegin); | |
455 - } | |
456 - | |
457 - // Rewrites *sp from begin a pointer into text16 (UCS-2) | |
458 - // to being a pointer into text8 (equivalent text but in UTF-8). | |
459 - static void AdjustUCS2ToUTF8(const StringPiece& text16, const StringPiece& text8, | |
460 - StringPiece* sp) { | |
461 - if (sp->begin() == NULL) | |
462 - return; | |
463 - | |
464 - int nrune = 0; | |
465 - int n; | |
466 - Rune r; | |
467 - const char* p = text8.begin(); | |
468 - const char* ep = text8.end(); | |
469 - const char* spbegin = NULL; | |
470 - const char* spend = NULL; | |
471 - for (;;) { | |
472 - if (nrune == (sp->begin() - text16.begin())/2) | |
473 - spbegin = p; | |
474 - if (nrune == (sp->end() - text16.begin())/2) | |
475 - spend = p; | |
476 - if (p >= ep) | |
477 - break; | |
478 - n = chartorune(&r, p); | |
479 - p += n; | |
480 - nrune++; | |
481 - } | |
482 - if (text8.begin() != NULL && (spbegin == NULL || spend == NULL)) { | |
483 - LOG(FATAL) << "Error in AdjustUCS2ToUTF8 " | |
484 - << CEscape(text16) << " " | |
485 - << (int)(sp->begin() - text16.begin()) << " " | |
486 - << (int)(sp->end() - text16.begin()); | |
487 - } | |
488 - *sp = StringPiece(spbegin, spend - spbegin); | |
489 - } | |
490 - | |
491 // Runs a single search using the named engine type. | |
492 // This interface hides all the irregularities of the various | |
493 // engine interfaces from the rest of this file. | |
494 re2/testing/tester.cc#12:393,411 - re2/testing/tester.cc#13:294,300 | |
495 | |
496 StringPiece text = orig_text; | |
497 StringPiece context = orig_context; | |
498 - bool ucs2 = false; | |
499 | |
500 - if ((flags() & Regexp::UCS2) && type != kEnginePCRE) { | |
501 - if (!ConvertUTF8ToUCS2(orig_context, &context)) { | |
502 - result->skipped = true; | |
503 - return; | |
504 - } | |
505 - | |
506 - // Rewrite context to refer to new text. | |
507 - AdjustUTF8ToUCS2(orig_context, context, &text); | |
508 - ucs2 = true; | |
509 - } | |
510 - | |
511 switch (type) { | |
512 default: | |
513 LOG(FATAL) << "Bad RunSearch type: " << (int)type; | |
514 re2/testing/tester.cc#12:557,577 - re2/testing/tester.cc#13:446,451 | |
515 } | |
516 } | |
517 | |
518 - // If we did UCS-2 matching, rewrite the matches to refer | |
519 - // to the original UTF-8 text. | |
520 - if (ucs2) { | |
521 - if (result->matched) { | |
522 - if (result->have_submatch0) { | |
523 - AdjustUCS2ToUTF8(context, orig_context, &result->submatch[0]); | |
524 - } else if (result->have_submatch) { | |
525 - for (int i = 0; i < nsubmatch; i++) { | |
526 - AdjustUCS2ToUTF8(context, orig_context, &result->submatch[i]); | |
527 - } | |
528 - } | |
529 - } | |
530 - delete[] context.begin(); | |
531 - } | |
532 - | |
533 if (!result->matched) | |
534 memset(result->submatch, 0, sizeof result->submatch); | |
535 } | |
536 re2/testing/tester.cc#12:596,617 - re2/testing/tester.cc#13:470,475 | |
537 return true; | |
538 } | |
539 | |
540 - // Check whether text uses only Unicode points <= 0xFFFF | |
541 - // (in the BMP). | |
542 - static bool IsBMP(const StringPiece& text) { | |
543 - const char* p = text.begin(); | |
544 - const char* ep = text.end(); | |
545 - while (p < ep) { | |
546 - if (!fullrune(p, ep - p)) | |
547 - return false; | |
548 - Rune r; | |
549 - p += chartorune(&r, p); | |
550 - if (r > 0xFFFF) | |
551 - return false; | |
552 - } | |
553 - return true; | |
554 - } | |
555 - | |
556 // Runs a single test. | |
557 bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context, | |
558 Prog::Anchor anchor) { | |
559 re2/testing/tester.cc#12:619,625 - re2/testing/tester.cc#13:477,483 | |
560 Result correct; | |
561 RunSearch(kEngineBacktrack, text, context, anchor, &correct); | |
562 if (correct.skipped) { | |
563 - if (regexp_ == NULL || !IsBMP(context)) // okay to skip in UCS-2 mode | |
564 + if (regexp_ == NULL) | |
565 return true; | |
566 LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_) | |
567 << " " << FormatMode(flags_); | |
OLD | NEW |