| OLD | NEW |
| (Empty) |
| 1 // Copyright 2006 The RE2 Authors. All Rights Reserved. | |
| 2 // Use of this source code is governed by a BSD-style | |
| 3 // license that can be found in the LICENSE file. | |
| 4 | |
| 5 // Format a regular expression structure as a string. | |
| 6 // Tested by parse_test.cc | |
| 7 | |
| 8 #include "util/util.h" | |
| 9 #include "re2/regexp.h" | |
| 10 #include "re2/walker-inl.h" | |
| 11 | |
| 12 namespace re2 { | |
| 13 | |
| 14 enum { | |
| 15 PrecAtom, | |
| 16 PrecUnary, | |
| 17 PrecConcat, | |
| 18 PrecAlternate, | |
| 19 PrecEmpty, | |
| 20 PrecParen, | |
| 21 PrecToplevel, | |
| 22 }; | |
| 23 | |
| 24 // Helper function. See description below. | |
| 25 static void AppendCCRange(string* t, Rune lo, Rune hi); | |
| 26 | |
| 27 // Walker to generate string in s_. | |
| 28 // The arg pointers are actually integers giving the | |
| 29 // context precedence. | |
| 30 // The child_args are always NULL. | |
| 31 class ToStringWalker : public Regexp::Walker<int> { | |
| 32 public: | |
| 33 explicit ToStringWalker(string* t) : t_(t) {} | |
| 34 | |
| 35 virtual int PreVisit(Regexp* re, int parent_arg, bool* stop); | |
| 36 virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg, | |
| 37 int* child_args, int nchild_args); | |
| 38 virtual int ShortVisit(Regexp* re, int parent_arg) { | |
| 39 return 0; | |
| 40 } | |
| 41 | |
| 42 private: | |
| 43 string* t_; // The string the walker appends to. | |
| 44 | |
| 45 DISALLOW_COPY_AND_ASSIGN(ToStringWalker); | |
| 46 }; | |
| 47 | |
| 48 string Regexp::ToString() { | |
| 49 string t; | |
| 50 ToStringWalker w(&t); | |
| 51 w.WalkExponential(this, PrecToplevel, 100000); | |
| 52 if (w.stopped_early()) | |
| 53 t += " [truncated]"; | |
| 54 return t; | |
| 55 } | |
| 56 | |
| 57 #define ToString DontCallToString // Avoid accidental recursion. | |
| 58 | |
| 59 // Visits re before children are processed. | |
| 60 // Appends ( if needed and passes new precedence to children. | |
| 61 int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) { | |
| 62 int prec = parent_arg; | |
| 63 int nprec = PrecAtom; | |
| 64 | |
| 65 switch (re->op()) { | |
| 66 case kRegexpNoMatch: | |
| 67 case kRegexpEmptyMatch: | |
| 68 case kRegexpLiteral: | |
| 69 case kRegexpAnyChar: | |
| 70 case kRegexpAnyByte: | |
| 71 case kRegexpBeginLine: | |
| 72 case kRegexpEndLine: | |
| 73 case kRegexpBeginText: | |
| 74 case kRegexpEndText: | |
| 75 case kRegexpWordBoundary: | |
| 76 case kRegexpNoWordBoundary: | |
| 77 case kRegexpCharClass: | |
| 78 case kRegexpHaveMatch: | |
| 79 nprec = PrecAtom; | |
| 80 break; | |
| 81 | |
| 82 case kRegexpConcat: | |
| 83 case kRegexpLiteralString: | |
| 84 if (prec < PrecConcat) | |
| 85 t_->append("(?:"); | |
| 86 nprec = PrecConcat; | |
| 87 break; | |
| 88 | |
| 89 case kRegexpAlternate: | |
| 90 if (prec < PrecAlternate) | |
| 91 t_->append("(?:"); | |
| 92 nprec = PrecAlternate; | |
| 93 break; | |
| 94 | |
| 95 case kRegexpCapture: | |
| 96 t_->append("("); | |
| 97 if (re->cap() == 0) | |
| 98 LOG(DFATAL) << "kRegexpCapture cap() == 0"; | |
| 99 if (re->name()) { | |
| 100 t_->append("?P<"); | |
| 101 t_->append(*re->name()); | |
| 102 t_->append(">"); | |
| 103 } | |
| 104 nprec = PrecParen; | |
| 105 break; | |
| 106 | |
| 107 case kRegexpStar: | |
| 108 case kRegexpPlus: | |
| 109 case kRegexpQuest: | |
| 110 case kRegexpRepeat: | |
| 111 if (prec < PrecUnary) | |
| 112 t_->append("(?:"); | |
| 113 // The subprecedence here is PrecAtom instead of PrecUnary | |
| 114 // because PCRE treats two unary ops in a row as a parse error. | |
| 115 nprec = PrecAtom; | |
| 116 break; | |
| 117 } | |
| 118 | |
| 119 return nprec; | |
| 120 } | |
| 121 | |
| 122 static void AppendLiteral(string *t, Rune r, bool foldcase) { | |
| 123 if (r != 0 && r < 0x80 && strchr("(){}[]*+?|.^$\\", r)) { | |
| 124 t->append(1, '\\'); | |
| 125 t->append(1, static_cast<char>(r)); | |
| 126 } else if (foldcase && 'a' <= r && r <= 'z') { | |
| 127 if ('a' <= r && r <= 'z') | |
| 128 r += 'A' - 'a'; | |
| 129 t->append(1, '['); | |
| 130 t->append(1, static_cast<char>(r)); | |
| 131 t->append(1, static_cast<char>(r) + 'a' - 'A'); | |
| 132 t->append(1, ']'); | |
| 133 } else { | |
| 134 AppendCCRange(t, r, r); | |
| 135 } | |
| 136 } | |
| 137 | |
| 138 // Visits re after children are processed. | |
| 139 // For childless regexps, all the work is done here. | |
| 140 // For regexps with children, append any unary suffixes or ). | |
| 141 int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg, | |
| 142 int* child_args, int nchild_args) { | |
| 143 int prec = parent_arg; | |
| 144 switch (re->op()) { | |
| 145 case kRegexpNoMatch: | |
| 146 // There's no simple symbol for "no match", but | |
| 147 // [^0-Runemax] excludes everything. | |
| 148 t_->append("[^\\x00-\\x{10ffff}]"); | |
| 149 break; | |
| 150 | |
| 151 case kRegexpEmptyMatch: | |
| 152 // Append (?:) to make empty string visible, | |
| 153 // unless this is already being parenthesized. | |
| 154 if (prec < PrecEmpty) | |
| 155 t_->append("(?:)"); | |
| 156 break; | |
| 157 | |
| 158 case kRegexpLiteral: | |
| 159 AppendLiteral(t_, re->rune(), | |
| 160 (re->parse_flags() & Regexp::FoldCase) != 0); | |
| 161 break; | |
| 162 | |
| 163 case kRegexpLiteralString: | |
| 164 for (int i = 0; i < re->nrunes(); i++) | |
| 165 AppendLiteral(t_, re->runes()[i], | |
| 166 (re->parse_flags() & Regexp::FoldCase) != 0); | |
| 167 if (prec < PrecConcat) | |
| 168 t_->append(")"); | |
| 169 break; | |
| 170 | |
| 171 case kRegexpConcat: | |
| 172 if (prec < PrecConcat) | |
| 173 t_->append(")"); | |
| 174 break; | |
| 175 | |
| 176 case kRegexpAlternate: | |
| 177 // Clumsy but workable: the children all appended | | |
| 178 // at the end of their strings, so just remove the last one. | |
| 179 if ((*t_)[t_->size()-1] == '|') | |
| 180 t_->erase(t_->size()-1); | |
| 181 else | |
| 182 LOG(DFATAL) << "Bad final char: " << t_; | |
| 183 if (prec < PrecAlternate) | |
| 184 t_->append(")"); | |
| 185 break; | |
| 186 | |
| 187 case kRegexpStar: | |
| 188 t_->append("*"); | |
| 189 if (re->parse_flags() & Regexp::NonGreedy) | |
| 190 t_->append("?"); | |
| 191 if (prec < PrecUnary) | |
| 192 t_->append(")"); | |
| 193 break; | |
| 194 | |
| 195 case kRegexpPlus: | |
| 196 t_->append("+"); | |
| 197 if (re->parse_flags() & Regexp::NonGreedy) | |
| 198 t_->append("?"); | |
| 199 if (prec < PrecUnary) | |
| 200 t_->append(")"); | |
| 201 break; | |
| 202 | |
| 203 case kRegexpQuest: | |
| 204 t_->append("?"); | |
| 205 if (re->parse_flags() & Regexp::NonGreedy) | |
| 206 t_->append("?"); | |
| 207 if (prec < PrecUnary) | |
| 208 t_->append(")"); | |
| 209 break; | |
| 210 | |
| 211 case kRegexpRepeat: | |
| 212 if (re->max() == -1) | |
| 213 t_->append(StringPrintf("{%d,}", re->min())); | |
| 214 else if (re->min() == re->max()) | |
| 215 t_->append(StringPrintf("{%d}", re->min())); | |
| 216 else | |
| 217 t_->append(StringPrintf("{%d,%d}", re->min(), re->max())); | |
| 218 if (re->parse_flags() & Regexp::NonGreedy) | |
| 219 t_->append("?"); | |
| 220 if (prec < PrecUnary) | |
| 221 t_->append(")"); | |
| 222 break; | |
| 223 | |
| 224 case kRegexpAnyChar: | |
| 225 t_->append("."); | |
| 226 break; | |
| 227 | |
| 228 case kRegexpAnyByte: | |
| 229 t_->append("\\C"); | |
| 230 break; | |
| 231 | |
| 232 case kRegexpBeginLine: | |
| 233 t_->append("^"); | |
| 234 break; | |
| 235 | |
| 236 case kRegexpEndLine: | |
| 237 t_->append("$"); | |
| 238 break; | |
| 239 | |
| 240 case kRegexpBeginText: | |
| 241 t_->append("(?-m:^)"); | |
| 242 break; | |
| 243 | |
| 244 case kRegexpEndText: | |
| 245 if (re->parse_flags() & Regexp::WasDollar) | |
| 246 t_->append("(?-m:$)"); | |
| 247 else | |
| 248 t_->append("\\z"); | |
| 249 break; | |
| 250 | |
| 251 case kRegexpWordBoundary: | |
| 252 t_->append("\\b"); | |
| 253 break; | |
| 254 | |
| 255 case kRegexpNoWordBoundary: | |
| 256 t_->append("\\B"); | |
| 257 break; | |
| 258 | |
| 259 case kRegexpCharClass: { | |
| 260 if (re->cc()->size() == 0) { | |
| 261 t_->append("[^\\x00-\\x{10ffff}]"); | |
| 262 break; | |
| 263 } | |
| 264 t_->append("["); | |
| 265 // Heuristic: show class as negated if it contains the | |
| 266 // non-character 0xFFFE. | |
| 267 CharClass* cc = re->cc(); | |
| 268 if (cc->Contains(0xFFFE)) { | |
| 269 cc = cc->Negate(); | |
| 270 t_->append("^"); | |
| 271 } | |
| 272 for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) | |
| 273 AppendCCRange(t_, i->lo, i->hi); | |
| 274 if (cc != re->cc()) | |
| 275 cc->Delete(); | |
| 276 t_->append("]"); | |
| 277 break; | |
| 278 } | |
| 279 | |
| 280 case kRegexpCapture: | |
| 281 t_->append(")"); | |
| 282 break; | |
| 283 | |
| 284 case kRegexpHaveMatch: | |
| 285 // There's no syntax accepted by the parser to generate | |
| 286 // this node (it is generated by RE2::Set) so make something | |
| 287 // up that is readable but won't compile. | |
| 288 t_->append("(?HaveMatch:%d)", re->match_id()); | |
| 289 break; | |
| 290 } | |
| 291 | |
| 292 // If the parent is an alternation, append the | for it. | |
| 293 if (prec == PrecAlternate) | |
| 294 t_->append("|"); | |
| 295 | |
| 296 return 0; | |
| 297 } | |
| 298 | |
| 299 // Appends a rune for use in a character class to the string t. | |
| 300 static void AppendCCChar(string* t, Rune r) { | |
| 301 if (0x20 <= r && r <= 0x7E) { | |
| 302 if (strchr("[]^-\\", r)) | |
| 303 t->append("\\"); | |
| 304 t->append(1, static_cast<char>(r)); | |
| 305 return; | |
| 306 } | |
| 307 switch (r) { | |
| 308 default: | |
| 309 break; | |
| 310 | |
| 311 case '\r': | |
| 312 t->append("\\r"); | |
| 313 return; | |
| 314 | |
| 315 case '\t': | |
| 316 t->append("\\t"); | |
| 317 return; | |
| 318 | |
| 319 case '\n': | |
| 320 t->append("\\n"); | |
| 321 return; | |
| 322 | |
| 323 case '\f': | |
| 324 t->append("\\f"); | |
| 325 return; | |
| 326 } | |
| 327 | |
| 328 if (r < 0x100) { | |
| 329 StringAppendF(t, "\\x%02x", static_cast<int>(r)); | |
| 330 return; | |
| 331 } | |
| 332 StringAppendF(t, "\\x{%x}", static_cast<int>(r)); | |
| 333 } | |
| 334 | |
| 335 static void AppendCCRange(string* t, Rune lo, Rune hi) { | |
| 336 if (lo > hi) | |
| 337 return; | |
| 338 AppendCCChar(t, lo); | |
| 339 if (lo < hi) { | |
| 340 t->append("-"); | |
| 341 AppendCCChar(t, hi); | |
| 342 } | |
| 343 } | |
| 344 | |
| 345 } // namespace re2 | |
| OLD | NEW |