| OLD | NEW |
| (Empty) |
| 1 // Copyright 2007 The RE2 Authors. All Rights Reserved. | |
| 2 // Use of this source code is governed by a BSD-style | |
| 3 // license that can be found in the LICENSE file. | |
| 4 | |
| 5 // Compiled regular expression representation. | |
| 6 // Tested by compile_test.cc | |
| 7 | |
| 8 #include "util/util.h" | |
| 9 #include "util/sparse_set.h" | |
| 10 #include "re2/prog.h" | |
| 11 #include "re2/stringpiece.h" | |
| 12 | |
| 13 namespace re2 { | |
| 14 | |
| 15 // Constructors per Inst opcode | |
| 16 | |
| 17 void Prog::Inst::InitAlt(uint32 out, uint32 out1) { | |
| 18 DCHECK_EQ(out_opcode_, 0); | |
| 19 set_out_opcode(out, kInstAlt); | |
| 20 out1_ = out1; | |
| 21 } | |
| 22 | |
| 23 void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32 out) { | |
| 24 DCHECK_EQ(out_opcode_, 0); | |
| 25 set_out_opcode(out, kInstByteRange); | |
| 26 lo_ = lo & 0xFF; | |
| 27 hi_ = hi & 0xFF; | |
| 28 foldcase_ = foldcase & 0xFF; | |
| 29 } | |
| 30 | |
| 31 void Prog::Inst::InitCapture(int cap, uint32 out) { | |
| 32 DCHECK_EQ(out_opcode_, 0); | |
| 33 set_out_opcode(out, kInstCapture); | |
| 34 cap_ = cap; | |
| 35 } | |
| 36 | |
| 37 void Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32 out) { | |
| 38 DCHECK_EQ(out_opcode_, 0); | |
| 39 set_out_opcode(out, kInstEmptyWidth); | |
| 40 empty_ = empty; | |
| 41 } | |
| 42 | |
| 43 void Prog::Inst::InitMatch(int32 id) { | |
| 44 DCHECK_EQ(out_opcode_, 0); | |
| 45 set_opcode(kInstMatch); | |
| 46 match_id_ = id; | |
| 47 } | |
| 48 | |
| 49 void Prog::Inst::InitNop(uint32 out) { | |
| 50 DCHECK_EQ(out_opcode_, 0); | |
| 51 set_opcode(kInstNop); | |
| 52 } | |
| 53 | |
| 54 void Prog::Inst::InitFail() { | |
| 55 DCHECK_EQ(out_opcode_, 0); | |
| 56 set_opcode(kInstFail); | |
| 57 } | |
| 58 | |
| 59 string Prog::Inst::Dump() { | |
| 60 switch (opcode()) { | |
| 61 default: | |
| 62 return StringPrintf("opcode %d", static_cast<int>(opcode())); | |
| 63 | |
| 64 case kInstAlt: | |
| 65 return StringPrintf("alt -> %d | %d", out(), out1_); | |
| 66 | |
| 67 case kInstAltMatch: | |
| 68 return StringPrintf("altmatch -> %d | %d", out(), out1_); | |
| 69 | |
| 70 case kInstByteRange: | |
| 71 return StringPrintf("byte%s [%02x-%02x] -> %d", | |
| 72 foldcase_ ? "/i" : "", | |
| 73 lo_, hi_, out()); | |
| 74 | |
| 75 case kInstCapture: | |
| 76 return StringPrintf("capture %d -> %d", cap_, out()); | |
| 77 | |
| 78 case kInstEmptyWidth: | |
| 79 return StringPrintf("emptywidth %#x -> %d", | |
| 80 static_cast<int>(empty_), out()); | |
| 81 | |
| 82 case kInstMatch: | |
| 83 return StringPrintf("match! %d", match_id()); | |
| 84 | |
| 85 case kInstNop: | |
| 86 return StringPrintf("nop -> %d", out()); | |
| 87 | |
| 88 case kInstFail: | |
| 89 return StringPrintf("fail"); | |
| 90 } | |
| 91 } | |
| 92 | |
| 93 Prog::Prog() | |
| 94 : anchor_start_(false), | |
| 95 anchor_end_(false), | |
| 96 reversed_(false), | |
| 97 did_onepass_(false), | |
| 98 start_(0), | |
| 99 start_unanchored_(0), | |
| 100 size_(0), | |
| 101 byte_inst_count_(0), | |
| 102 bytemap_range_(0), | |
| 103 flags_(0), | |
| 104 onepass_statesize_(0), | |
| 105 inst_(NULL), | |
| 106 dfa_first_(NULL), | |
| 107 dfa_longest_(NULL), | |
| 108 dfa_mem_(0), | |
| 109 delete_dfa_(NULL), | |
| 110 unbytemap_(NULL), | |
| 111 onepass_nodes_(NULL), | |
| 112 onepass_start_(NULL) { | |
| 113 } | |
| 114 | |
| 115 Prog::~Prog() { | |
| 116 if (delete_dfa_) { | |
| 117 if (dfa_first_) | |
| 118 delete_dfa_(dfa_first_); | |
| 119 if (dfa_longest_) | |
| 120 delete_dfa_(dfa_longest_); | |
| 121 } | |
| 122 delete[] onepass_nodes_; | |
| 123 delete[] inst_; | |
| 124 delete[] unbytemap_; | |
| 125 } | |
| 126 | |
| 127 typedef SparseSet Workq; | |
| 128 | |
| 129 static inline void AddToQueue(Workq* q, int id) { | |
| 130 if (id != 0) | |
| 131 q->insert(id); | |
| 132 } | |
| 133 | |
| 134 static string ProgToString(Prog* prog, Workq* q) { | |
| 135 string s; | |
| 136 | |
| 137 for (Workq::iterator i = q->begin(); i != q->end(); ++i) { | |
| 138 int id = *i; | |
| 139 Prog::Inst* ip = prog->inst(id); | |
| 140 StringAppendF(&s, "%d. %s\n", id, ip->Dump().c_str()); | |
| 141 AddToQueue(q, ip->out()); | |
| 142 if (ip->opcode() == kInstAlt || ip->opcode() == kInstAltMatch) | |
| 143 AddToQueue(q, ip->out1()); | |
| 144 } | |
| 145 return s; | |
| 146 } | |
| 147 | |
| 148 string Prog::Dump() { | |
| 149 string map; | |
| 150 if (false) { // Debugging | |
| 151 int lo = 0; | |
| 152 StringAppendF(&map, "byte map:\n"); | |
| 153 for (int i = 0; i < bytemap_range_; i++) { | |
| 154 StringAppendF(&map, "\t%d. [%02x-%02x]\n", i, lo, unbytemap_[i]); | |
| 155 lo = unbytemap_[i] + 1; | |
| 156 } | |
| 157 StringAppendF(&map, "\n"); | |
| 158 } | |
| 159 | |
| 160 Workq q(size_); | |
| 161 AddToQueue(&q, start_); | |
| 162 return map + ProgToString(this, &q); | |
| 163 } | |
| 164 | |
| 165 string Prog::DumpUnanchored() { | |
| 166 Workq q(size_); | |
| 167 AddToQueue(&q, start_unanchored_); | |
| 168 return ProgToString(this, &q); | |
| 169 } | |
| 170 | |
| 171 static bool IsMatch(Prog*, Prog::Inst*); | |
| 172 | |
| 173 // Peep-hole optimizer. | |
| 174 void Prog::Optimize() { | |
| 175 Workq q(size_); | |
| 176 | |
| 177 // Eliminate nops. Most are taken out during compilation | |
| 178 // but a few are hard to avoid. | |
| 179 q.clear(); | |
| 180 AddToQueue(&q, start_); | |
| 181 for (Workq::iterator i = q.begin(); i != q.end(); ++i) { | |
| 182 int id = *i; | |
| 183 | |
| 184 Inst* ip = inst(id); | |
| 185 int j = ip->out(); | |
| 186 Inst* jp; | |
| 187 while (j != 0 && (jp=inst(j))->opcode() == kInstNop) { | |
| 188 j = jp->out(); | |
| 189 } | |
| 190 ip->set_out(j); | |
| 191 AddToQueue(&q, ip->out()); | |
| 192 | |
| 193 if (ip->opcode() == kInstAlt) { | |
| 194 j = ip->out1(); | |
| 195 while (j != 0 && (jp=inst(j))->opcode() == kInstNop) { | |
| 196 j = jp->out(); | |
| 197 } | |
| 198 ip->out1_ = j; | |
| 199 AddToQueue(&q, ip->out1()); | |
| 200 } | |
| 201 } | |
| 202 | |
| 203 // Insert kInstAltMatch instructions | |
| 204 // Look for | |
| 205 // ip: Alt -> j | k | |
| 206 // j: ByteRange [00-FF] -> ip | |
| 207 // k: Match | |
| 208 // or the reverse (the above is the greedy one). | |
| 209 // Rewrite Alt to AltMatch. | |
| 210 q.clear(); | |
| 211 AddToQueue(&q, start_); | |
| 212 for (Workq::iterator i = q.begin(); i != q.end(); ++i) { | |
| 213 int id = *i; | |
| 214 Inst* ip = inst(id); | |
| 215 AddToQueue(&q, ip->out()); | |
| 216 if (ip->opcode() == kInstAlt) | |
| 217 AddToQueue(&q, ip->out1()); | |
| 218 | |
| 219 if (ip->opcode() == kInstAlt) { | |
| 220 Inst* j = inst(ip->out()); | |
| 221 Inst* k = inst(ip->out1()); | |
| 222 if (j->opcode() == kInstByteRange && j->out() == id && | |
| 223 j->lo() == 0x00 && j->hi() == 0xFF && | |
| 224 IsMatch(this, k)) { | |
| 225 ip->set_opcode(kInstAltMatch); | |
| 226 continue; | |
| 227 } | |
| 228 if (IsMatch(this, j) && | |
| 229 k->opcode() == kInstByteRange && k->out() == id && | |
| 230 k->lo() == 0x00 && k->hi() == 0xFF) { | |
| 231 ip->set_opcode(kInstAltMatch); | |
| 232 } | |
| 233 } | |
| 234 } | |
| 235 } | |
| 236 | |
| 237 // Is ip a guaranteed match at end of text, perhaps after some capturing? | |
| 238 static bool IsMatch(Prog* prog, Prog::Inst* ip) { | |
| 239 for (;;) { | |
| 240 switch (ip->opcode()) { | |
| 241 default: | |
| 242 LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode(); | |
| 243 return false; | |
| 244 | |
| 245 case kInstAlt: | |
| 246 case kInstAltMatch: | |
| 247 case kInstByteRange: | |
| 248 case kInstFail: | |
| 249 case kInstEmptyWidth: | |
| 250 return false; | |
| 251 | |
| 252 case kInstCapture: | |
| 253 case kInstNop: | |
| 254 ip = prog->inst(ip->out()); | |
| 255 break; | |
| 256 | |
| 257 case kInstMatch: | |
| 258 return true; | |
| 259 } | |
| 260 } | |
| 261 } | |
| 262 | |
| 263 uint32 Prog::EmptyFlags(const StringPiece& text, const char* p) { | |
| 264 int flags = 0; | |
| 265 | |
| 266 // ^ and \A | |
| 267 if (p == text.begin()) | |
| 268 flags |= kEmptyBeginText | kEmptyBeginLine; | |
| 269 else if (p[-1] == '\n') | |
| 270 flags |= kEmptyBeginLine; | |
| 271 | |
| 272 // $ and \z | |
| 273 if (p == text.end()) | |
| 274 flags |= kEmptyEndText | kEmptyEndLine; | |
| 275 else if (p < text.end() && p[0] == '\n') | |
| 276 flags |= kEmptyEndLine; | |
| 277 | |
| 278 // \b and \B | |
| 279 if (p == text.begin() && p == text.end()) { | |
| 280 // no word boundary here | |
| 281 } else if (p == text.begin()) { | |
| 282 if (IsWordChar(p[0])) | |
| 283 flags |= kEmptyWordBoundary; | |
| 284 } else if (p == text.end()) { | |
| 285 if (IsWordChar(p[-1])) | |
| 286 flags |= kEmptyWordBoundary; | |
| 287 } else { | |
| 288 if (IsWordChar(p[-1]) != IsWordChar(p[0])) | |
| 289 flags |= kEmptyWordBoundary; | |
| 290 } | |
| 291 if (!(flags & kEmptyWordBoundary)) | |
| 292 flags |= kEmptyNonWordBoundary; | |
| 293 | |
| 294 return flags; | |
| 295 } | |
| 296 | |
| 297 void Prog::MarkByteRange(int lo, int hi) { | |
| 298 DCHECK_GE(lo, 0); | |
| 299 DCHECK_GE(hi, 0); | |
| 300 DCHECK_LE(lo, 255); | |
| 301 DCHECK_LE(hi, 255); | |
| 302 DCHECK_LE(lo, hi); | |
| 303 if (0 < lo && lo <= 255) | |
| 304 byterange_.Set(lo - 1); | |
| 305 if (0 <= hi && hi <= 255) | |
| 306 byterange_.Set(hi); | |
| 307 } | |
| 308 | |
| 309 void Prog::ComputeByteMap() { | |
| 310 // Fill in bytemap with byte classes for prog_. | |
| 311 // Ranges of bytes that are treated as indistinguishable | |
| 312 // by the regexp program are mapped to a single byte class. | |
| 313 // The vector prog_->byterange() marks the end of each | |
| 314 // such range. | |
| 315 const Bitmap<256>& v = byterange(); | |
| 316 | |
| 317 COMPILE_ASSERT(8*sizeof(v.Word(0)) == 32, wordsize); | |
| 318 uint8 n = 0; | |
| 319 uint32 bits = 0; | |
| 320 for (int i = 0; i < 256; i++) { | |
| 321 if ((i&31) == 0) | |
| 322 bits = v.Word(i >> 5); | |
| 323 bytemap_[i] = n; | |
| 324 n += bits & 1; | |
| 325 bits >>= 1; | |
| 326 } | |
| 327 bytemap_range_ = bytemap_[255] + 1; | |
| 328 unbytemap_ = new uint8[bytemap_range_]; | |
| 329 for (int i = 0; i < 256; i++) | |
| 330 unbytemap_[bytemap_[i]] = static_cast<uint8>(i); | |
| 331 | |
| 332 if (0) { // For debugging: use trivial byte map. | |
| 333 for (int i = 0; i < 256; i++) { | |
| 334 bytemap_[i] = static_cast<uint8>(i); | |
| 335 unbytemap_[i] = static_cast<uint8>(i); | |
| 336 } | |
| 337 bytemap_range_ = 256; | |
| 338 LOG(INFO) << "Using trivial bytemap."; | |
| 339 } | |
| 340 } | |
| 341 | |
| 342 } // namespace re2 | |
| 343 | |
| OLD | NEW |