OLD | NEW |
| (Empty) |
1 // Copyright 2007 The RE2 Authors. All Rights Reserved. | |
2 // Use of this source code is governed by a BSD-style | |
3 // license that can be found in the LICENSE file. | |
4 | |
5 // Compiled regular expression representation. | |
6 // Tested by compile_test.cc | |
7 | |
8 #include "util/util.h" | |
9 #include "util/sparse_set.h" | |
10 #include "re2/prog.h" | |
11 #include "re2/stringpiece.h" | |
12 | |
13 namespace re2 { | |
14 | |
15 // Constructors per Inst opcode | |
16 | |
17 void Prog::Inst::InitAlt(uint32 out, uint32 out1) { | |
18 DCHECK_EQ(out_opcode_, 0); | |
19 set_out_opcode(out, kInstAlt); | |
20 out1_ = out1; | |
21 } | |
22 | |
23 void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32 out) { | |
24 DCHECK_EQ(out_opcode_, 0); | |
25 set_out_opcode(out, kInstByteRange); | |
26 lo_ = lo & 0xFF; | |
27 hi_ = hi & 0xFF; | |
28 foldcase_ = foldcase & 0xFF; | |
29 } | |
30 | |
31 void Prog::Inst::InitCapture(int cap, uint32 out) { | |
32 DCHECK_EQ(out_opcode_, 0); | |
33 set_out_opcode(out, kInstCapture); | |
34 cap_ = cap; | |
35 } | |
36 | |
37 void Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32 out) { | |
38 DCHECK_EQ(out_opcode_, 0); | |
39 set_out_opcode(out, kInstEmptyWidth); | |
40 empty_ = empty; | |
41 } | |
42 | |
43 void Prog::Inst::InitMatch(int32 id) { | |
44 DCHECK_EQ(out_opcode_, 0); | |
45 set_opcode(kInstMatch); | |
46 match_id_ = id; | |
47 } | |
48 | |
49 void Prog::Inst::InitNop(uint32 out) { | |
50 DCHECK_EQ(out_opcode_, 0); | |
51 set_opcode(kInstNop); | |
52 } | |
53 | |
54 void Prog::Inst::InitFail() { | |
55 DCHECK_EQ(out_opcode_, 0); | |
56 set_opcode(kInstFail); | |
57 } | |
58 | |
59 string Prog::Inst::Dump() { | |
60 switch (opcode()) { | |
61 default: | |
62 return StringPrintf("opcode %d", static_cast<int>(opcode())); | |
63 | |
64 case kInstAlt: | |
65 return StringPrintf("alt -> %d | %d", out(), out1_); | |
66 | |
67 case kInstAltMatch: | |
68 return StringPrintf("altmatch -> %d | %d", out(), out1_); | |
69 | |
70 case kInstByteRange: | |
71 return StringPrintf("byte%s [%02x-%02x] -> %d", | |
72 foldcase_ ? "/i" : "", | |
73 lo_, hi_, out()); | |
74 | |
75 case kInstCapture: | |
76 return StringPrintf("capture %d -> %d", cap_, out()); | |
77 | |
78 case kInstEmptyWidth: | |
79 return StringPrintf("emptywidth %#x -> %d", | |
80 static_cast<int>(empty_), out()); | |
81 | |
82 case kInstMatch: | |
83 return StringPrintf("match! %d", match_id()); | |
84 | |
85 case kInstNop: | |
86 return StringPrintf("nop -> %d", out()); | |
87 | |
88 case kInstFail: | |
89 return StringPrintf("fail"); | |
90 } | |
91 } | |
92 | |
93 Prog::Prog() | |
94 : anchor_start_(false), | |
95 anchor_end_(false), | |
96 reversed_(false), | |
97 did_onepass_(false), | |
98 start_(0), | |
99 start_unanchored_(0), | |
100 size_(0), | |
101 byte_inst_count_(0), | |
102 bytemap_range_(0), | |
103 flags_(0), | |
104 onepass_statesize_(0), | |
105 inst_(NULL), | |
106 dfa_first_(NULL), | |
107 dfa_longest_(NULL), | |
108 dfa_mem_(0), | |
109 delete_dfa_(NULL), | |
110 unbytemap_(NULL), | |
111 onepass_nodes_(NULL), | |
112 onepass_start_(NULL) { | |
113 } | |
114 | |
115 Prog::~Prog() { | |
116 if (delete_dfa_) { | |
117 if (dfa_first_) | |
118 delete_dfa_(dfa_first_); | |
119 if (dfa_longest_) | |
120 delete_dfa_(dfa_longest_); | |
121 } | |
122 delete[] onepass_nodes_; | |
123 delete[] inst_; | |
124 delete[] unbytemap_; | |
125 } | |
126 | |
127 typedef SparseSet Workq; | |
128 | |
129 static inline void AddToQueue(Workq* q, int id) { | |
130 if (id != 0) | |
131 q->insert(id); | |
132 } | |
133 | |
134 static string ProgToString(Prog* prog, Workq* q) { | |
135 string s; | |
136 | |
137 for (Workq::iterator i = q->begin(); i != q->end(); ++i) { | |
138 int id = *i; | |
139 Prog::Inst* ip = prog->inst(id); | |
140 StringAppendF(&s, "%d. %s\n", id, ip->Dump().c_str()); | |
141 AddToQueue(q, ip->out()); | |
142 if (ip->opcode() == kInstAlt || ip->opcode() == kInstAltMatch) | |
143 AddToQueue(q, ip->out1()); | |
144 } | |
145 return s; | |
146 } | |
147 | |
148 string Prog::Dump() { | |
149 string map; | |
150 if (false) { // Debugging | |
151 int lo = 0; | |
152 StringAppendF(&map, "byte map:\n"); | |
153 for (int i = 0; i < bytemap_range_; i++) { | |
154 StringAppendF(&map, "\t%d. [%02x-%02x]\n", i, lo, unbytemap_[i]); | |
155 lo = unbytemap_[i] + 1; | |
156 } | |
157 StringAppendF(&map, "\n"); | |
158 } | |
159 | |
160 Workq q(size_); | |
161 AddToQueue(&q, start_); | |
162 return map + ProgToString(this, &q); | |
163 } | |
164 | |
165 string Prog::DumpUnanchored() { | |
166 Workq q(size_); | |
167 AddToQueue(&q, start_unanchored_); | |
168 return ProgToString(this, &q); | |
169 } | |
170 | |
171 static bool IsMatch(Prog*, Prog::Inst*); | |
172 | |
173 // Peep-hole optimizer. | |
174 void Prog::Optimize() { | |
175 Workq q(size_); | |
176 | |
177 // Eliminate nops. Most are taken out during compilation | |
178 // but a few are hard to avoid. | |
179 q.clear(); | |
180 AddToQueue(&q, start_); | |
181 for (Workq::iterator i = q.begin(); i != q.end(); ++i) { | |
182 int id = *i; | |
183 | |
184 Inst* ip = inst(id); | |
185 int j = ip->out(); | |
186 Inst* jp; | |
187 while (j != 0 && (jp=inst(j))->opcode() == kInstNop) { | |
188 j = jp->out(); | |
189 } | |
190 ip->set_out(j); | |
191 AddToQueue(&q, ip->out()); | |
192 | |
193 if (ip->opcode() == kInstAlt) { | |
194 j = ip->out1(); | |
195 while (j != 0 && (jp=inst(j))->opcode() == kInstNop) { | |
196 j = jp->out(); | |
197 } | |
198 ip->out1_ = j; | |
199 AddToQueue(&q, ip->out1()); | |
200 } | |
201 } | |
202 | |
203 // Insert kInstAltMatch instructions | |
204 // Look for | |
205 // ip: Alt -> j | k | |
206 // j: ByteRange [00-FF] -> ip | |
207 // k: Match | |
208 // or the reverse (the above is the greedy one). | |
209 // Rewrite Alt to AltMatch. | |
210 q.clear(); | |
211 AddToQueue(&q, start_); | |
212 for (Workq::iterator i = q.begin(); i != q.end(); ++i) { | |
213 int id = *i; | |
214 Inst* ip = inst(id); | |
215 AddToQueue(&q, ip->out()); | |
216 if (ip->opcode() == kInstAlt) | |
217 AddToQueue(&q, ip->out1()); | |
218 | |
219 if (ip->opcode() == kInstAlt) { | |
220 Inst* j = inst(ip->out()); | |
221 Inst* k = inst(ip->out1()); | |
222 if (j->opcode() == kInstByteRange && j->out() == id && | |
223 j->lo() == 0x00 && j->hi() == 0xFF && | |
224 IsMatch(this, k)) { | |
225 ip->set_opcode(kInstAltMatch); | |
226 continue; | |
227 } | |
228 if (IsMatch(this, j) && | |
229 k->opcode() == kInstByteRange && k->out() == id && | |
230 k->lo() == 0x00 && k->hi() == 0xFF) { | |
231 ip->set_opcode(kInstAltMatch); | |
232 } | |
233 } | |
234 } | |
235 } | |
236 | |
237 // Is ip a guaranteed match at end of text, perhaps after some capturing? | |
238 static bool IsMatch(Prog* prog, Prog::Inst* ip) { | |
239 for (;;) { | |
240 switch (ip->opcode()) { | |
241 default: | |
242 LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode(); | |
243 return false; | |
244 | |
245 case kInstAlt: | |
246 case kInstAltMatch: | |
247 case kInstByteRange: | |
248 case kInstFail: | |
249 case kInstEmptyWidth: | |
250 return false; | |
251 | |
252 case kInstCapture: | |
253 case kInstNop: | |
254 ip = prog->inst(ip->out()); | |
255 break; | |
256 | |
257 case kInstMatch: | |
258 return true; | |
259 } | |
260 } | |
261 } | |
262 | |
263 uint32 Prog::EmptyFlags(const StringPiece& text, const char* p) { | |
264 int flags = 0; | |
265 | |
266 // ^ and \A | |
267 if (p == text.begin()) | |
268 flags |= kEmptyBeginText | kEmptyBeginLine; | |
269 else if (p[-1] == '\n') | |
270 flags |= kEmptyBeginLine; | |
271 | |
272 // $ and \z | |
273 if (p == text.end()) | |
274 flags |= kEmptyEndText | kEmptyEndLine; | |
275 else if (p < text.end() && p[0] == '\n') | |
276 flags |= kEmptyEndLine; | |
277 | |
278 // \b and \B | |
279 if (p == text.begin() && p == text.end()) { | |
280 // no word boundary here | |
281 } else if (p == text.begin()) { | |
282 if (IsWordChar(p[0])) | |
283 flags |= kEmptyWordBoundary; | |
284 } else if (p == text.end()) { | |
285 if (IsWordChar(p[-1])) | |
286 flags |= kEmptyWordBoundary; | |
287 } else { | |
288 if (IsWordChar(p[-1]) != IsWordChar(p[0])) | |
289 flags |= kEmptyWordBoundary; | |
290 } | |
291 if (!(flags & kEmptyWordBoundary)) | |
292 flags |= kEmptyNonWordBoundary; | |
293 | |
294 return flags; | |
295 } | |
296 | |
297 void Prog::MarkByteRange(int lo, int hi) { | |
298 DCHECK_GE(lo, 0); | |
299 DCHECK_GE(hi, 0); | |
300 DCHECK_LE(lo, 255); | |
301 DCHECK_LE(hi, 255); | |
302 DCHECK_LE(lo, hi); | |
303 if (0 < lo && lo <= 255) | |
304 byterange_.Set(lo - 1); | |
305 if (0 <= hi && hi <= 255) | |
306 byterange_.Set(hi); | |
307 } | |
308 | |
309 void Prog::ComputeByteMap() { | |
310 // Fill in bytemap with byte classes for prog_. | |
311 // Ranges of bytes that are treated as indistinguishable | |
312 // by the regexp program are mapped to a single byte class. | |
313 // The vector prog_->byterange() marks the end of each | |
314 // such range. | |
315 const Bitmap<256>& v = byterange(); | |
316 | |
317 COMPILE_ASSERT(8*sizeof(v.Word(0)) == 32, wordsize); | |
318 uint8 n = 0; | |
319 uint32 bits = 0; | |
320 for (int i = 0; i < 256; i++) { | |
321 if ((i&31) == 0) | |
322 bits = v.Word(i >> 5); | |
323 bytemap_[i] = n; | |
324 n += bits & 1; | |
325 bits >>= 1; | |
326 } | |
327 bytemap_range_ = bytemap_[255] + 1; | |
328 unbytemap_ = new uint8[bytemap_range_]; | |
329 for (int i = 0; i < 256; i++) | |
330 unbytemap_[bytemap_[i]] = static_cast<uint8>(i); | |
331 | |
332 if (0) { // For debugging: use trivial byte map. | |
333 for (int i = 0; i < 256; i++) { | |
334 bytemap_[i] = static_cast<uint8>(i); | |
335 unbytemap_[i] = static_cast<uint8>(i); | |
336 } | |
337 bytemap_range_ = 256; | |
338 LOG(INFO) << "Using trivial bytemap."; | |
339 } | |
340 } | |
341 | |
342 } // namespace re2 | |
343 | |
OLD | NEW |