| OLD | NEW | 
|---|
|  | (Empty) | 
| 1 // Copyright 2006 The RE2 Authors.  All Rights Reserved. |  | 
| 2 // Use of this source code is governed by a BSD-style |  | 
| 3 // license that can be found in the LICENSE file. |  | 
| 4 |  | 
| 5 // Format a regular expression structure as a string. |  | 
| 6 // Tested by parse_test.cc |  | 
| 7 |  | 
| 8 #include "util/util.h" |  | 
| 9 #include "re2/regexp.h" |  | 
| 10 #include "re2/walker-inl.h" |  | 
| 11 |  | 
| 12 namespace re2 { |  | 
| 13 |  | 
| 14 enum { |  | 
| 15   PrecAtom, |  | 
| 16   PrecUnary, |  | 
| 17   PrecConcat, |  | 
| 18   PrecAlternate, |  | 
| 19   PrecEmpty, |  | 
| 20   PrecParen, |  | 
| 21   PrecToplevel, |  | 
| 22 }; |  | 
| 23 |  | 
| 24 // Helper function.  See description below. |  | 
| 25 static void AppendCCRange(string* t, Rune lo, Rune hi); |  | 
| 26 |  | 
| 27 // Walker to generate string in s_. |  | 
| 28 // The arg pointers are actually integers giving the |  | 
| 29 // context precedence. |  | 
| 30 // The child_args are always NULL. |  | 
| 31 class ToStringWalker : public Regexp::Walker<int> { |  | 
| 32  public: |  | 
| 33   explicit ToStringWalker(string* t) : t_(t) {} |  | 
| 34 |  | 
| 35   virtual int PreVisit(Regexp* re, int parent_arg, bool* stop); |  | 
| 36   virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg, |  | 
| 37                         int* child_args, int nchild_args); |  | 
| 38   virtual int ShortVisit(Regexp* re, int parent_arg) { |  | 
| 39     return 0; |  | 
| 40   } |  | 
| 41 |  | 
| 42  private: |  | 
| 43   string* t_;  // The string the walker appends to. |  | 
| 44 |  | 
| 45   DISALLOW_COPY_AND_ASSIGN(ToStringWalker); |  | 
| 46 }; |  | 
| 47 |  | 
| 48 string Regexp::ToString() { |  | 
| 49   string t; |  | 
| 50   ToStringWalker w(&t); |  | 
| 51   w.WalkExponential(this, PrecToplevel, 100000); |  | 
| 52   if (w.stopped_early()) |  | 
| 53     t += " [truncated]"; |  | 
| 54   return t; |  | 
| 55 } |  | 
| 56 |  | 
| 57 #define ToString DontCallToString  // Avoid accidental recursion. |  | 
| 58 |  | 
| 59 // Visits re before children are processed. |  | 
| 60 // Appends ( if needed and passes new precedence to children. |  | 
| 61 int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) { |  | 
| 62   int prec = parent_arg; |  | 
| 63   int nprec = PrecAtom; |  | 
| 64 |  | 
| 65   switch (re->op()) { |  | 
| 66     case kRegexpNoMatch: |  | 
| 67     case kRegexpEmptyMatch: |  | 
| 68     case kRegexpLiteral: |  | 
| 69     case kRegexpAnyChar: |  | 
| 70     case kRegexpAnyByte: |  | 
| 71     case kRegexpBeginLine: |  | 
| 72     case kRegexpEndLine: |  | 
| 73     case kRegexpBeginText: |  | 
| 74     case kRegexpEndText: |  | 
| 75     case kRegexpWordBoundary: |  | 
| 76     case kRegexpNoWordBoundary: |  | 
| 77     case kRegexpCharClass: |  | 
| 78     case kRegexpHaveMatch: |  | 
| 79       nprec = PrecAtom; |  | 
| 80       break; |  | 
| 81 |  | 
| 82     case kRegexpConcat: |  | 
| 83     case kRegexpLiteralString: |  | 
| 84       if (prec < PrecConcat) |  | 
| 85         t_->append("(?:"); |  | 
| 86       nprec = PrecConcat; |  | 
| 87       break; |  | 
| 88 |  | 
| 89     case kRegexpAlternate: |  | 
| 90       if (prec < PrecAlternate) |  | 
| 91         t_->append("(?:"); |  | 
| 92       nprec = PrecAlternate; |  | 
| 93       break; |  | 
| 94 |  | 
| 95     case kRegexpCapture: |  | 
| 96       t_->append("("); |  | 
| 97       if (re->cap() == 0) |  | 
| 98         LOG(DFATAL) << "kRegexpCapture cap() == 0"; |  | 
| 99       if (re->name()) { |  | 
| 100         t_->append("?P<"); |  | 
| 101         t_->append(*re->name()); |  | 
| 102         t_->append(">"); |  | 
| 103       } |  | 
| 104       nprec = PrecParen; |  | 
| 105       break; |  | 
| 106 |  | 
| 107     case kRegexpStar: |  | 
| 108     case kRegexpPlus: |  | 
| 109     case kRegexpQuest: |  | 
| 110     case kRegexpRepeat: |  | 
| 111       if (prec < PrecUnary) |  | 
| 112         t_->append("(?:"); |  | 
| 113       // The subprecedence here is PrecAtom instead of PrecUnary |  | 
| 114       // because PCRE treats two unary ops in a row as a parse error. |  | 
| 115       nprec = PrecAtom; |  | 
| 116       break; |  | 
| 117   } |  | 
| 118 |  | 
| 119   return nprec; |  | 
| 120 } |  | 
| 121 |  | 
| 122 static void AppendLiteral(string *t, Rune r, bool foldcase) { |  | 
| 123   if (r != 0 && r < 0x80 && strchr("(){}[]*+?|.^$\\", r)) { |  | 
| 124     t->append(1, '\\'); |  | 
| 125     t->append(1, static_cast<char>(r)); |  | 
| 126   } else if (foldcase && 'a' <= r && r <= 'z') { |  | 
| 127     if ('a' <= r && r <= 'z') |  | 
| 128       r += 'A' - 'a'; |  | 
| 129     t->append(1, '['); |  | 
| 130     t->append(1, static_cast<char>(r)); |  | 
| 131     t->append(1, static_cast<char>(r) + 'a' - 'A'); |  | 
| 132     t->append(1, ']'); |  | 
| 133   } else { |  | 
| 134     AppendCCRange(t, r, r); |  | 
| 135   } |  | 
| 136 } |  | 
| 137 |  | 
| 138 // Visits re after children are processed. |  | 
| 139 // For childless regexps, all the work is done here. |  | 
| 140 // For regexps with children, append any unary suffixes or ). |  | 
| 141 int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg, |  | 
| 142                               int* child_args, int nchild_args) { |  | 
| 143   int prec = parent_arg; |  | 
| 144   switch (re->op()) { |  | 
| 145     case kRegexpNoMatch: |  | 
| 146       // There's no simple symbol for "no match", but |  | 
| 147       // [^0-Runemax] excludes everything. |  | 
| 148       t_->append("[^\\x00-\\x{10ffff}]"); |  | 
| 149       break; |  | 
| 150 |  | 
| 151     case kRegexpEmptyMatch: |  | 
| 152       // Append (?:) to make empty string visible, |  | 
| 153       // unless this is already being parenthesized. |  | 
| 154       if (prec < PrecEmpty) |  | 
| 155         t_->append("(?:)"); |  | 
| 156       break; |  | 
| 157 |  | 
| 158     case kRegexpLiteral: |  | 
| 159       AppendLiteral(t_, re->rune(), |  | 
| 160                     (re->parse_flags() & Regexp::FoldCase) != 0); |  | 
| 161       break; |  | 
| 162 |  | 
| 163     case kRegexpLiteralString: |  | 
| 164       for (int i = 0; i < re->nrunes(); i++) |  | 
| 165         AppendLiteral(t_, re->runes()[i], |  | 
| 166                       (re->parse_flags() & Regexp::FoldCase) != 0); |  | 
| 167       if (prec < PrecConcat) |  | 
| 168         t_->append(")"); |  | 
| 169       break; |  | 
| 170 |  | 
| 171     case kRegexpConcat: |  | 
| 172       if (prec < PrecConcat) |  | 
| 173         t_->append(")"); |  | 
| 174       break; |  | 
| 175 |  | 
| 176     case kRegexpAlternate: |  | 
| 177       // Clumsy but workable: the children all appended | |  | 
| 178       // at the end of their strings, so just remove the last one. |  | 
| 179       if ((*t_)[t_->size()-1] == '|') |  | 
| 180         t_->erase(t_->size()-1); |  | 
| 181       else |  | 
| 182         LOG(DFATAL) << "Bad final char: " << t_; |  | 
| 183       if (prec < PrecAlternate) |  | 
| 184         t_->append(")"); |  | 
| 185       break; |  | 
| 186 |  | 
| 187     case kRegexpStar: |  | 
| 188       t_->append("*"); |  | 
| 189       if (re->parse_flags() & Regexp::NonGreedy) |  | 
| 190         t_->append("?"); |  | 
| 191       if (prec < PrecUnary) |  | 
| 192         t_->append(")"); |  | 
| 193       break; |  | 
| 194 |  | 
| 195     case kRegexpPlus: |  | 
| 196       t_->append("+"); |  | 
| 197       if (re->parse_flags() & Regexp::NonGreedy) |  | 
| 198         t_->append("?"); |  | 
| 199       if (prec < PrecUnary) |  | 
| 200         t_->append(")"); |  | 
| 201       break; |  | 
| 202 |  | 
| 203     case kRegexpQuest: |  | 
| 204       t_->append("?"); |  | 
| 205       if (re->parse_flags() & Regexp::NonGreedy) |  | 
| 206         t_->append("?"); |  | 
| 207       if (prec < PrecUnary) |  | 
| 208         t_->append(")"); |  | 
| 209       break; |  | 
| 210 |  | 
| 211     case kRegexpRepeat: |  | 
| 212       if (re->max() == -1) |  | 
| 213         t_->append(StringPrintf("{%d,}", re->min())); |  | 
| 214       else if (re->min() == re->max()) |  | 
| 215         t_->append(StringPrintf("{%d}", re->min())); |  | 
| 216       else |  | 
| 217         t_->append(StringPrintf("{%d,%d}", re->min(), re->max())); |  | 
| 218       if (re->parse_flags() & Regexp::NonGreedy) |  | 
| 219         t_->append("?"); |  | 
| 220       if (prec < PrecUnary) |  | 
| 221         t_->append(")"); |  | 
| 222       break; |  | 
| 223 |  | 
| 224     case kRegexpAnyChar: |  | 
| 225       t_->append("."); |  | 
| 226       break; |  | 
| 227 |  | 
| 228     case kRegexpAnyByte: |  | 
| 229       t_->append("\\C"); |  | 
| 230       break; |  | 
| 231 |  | 
| 232     case kRegexpBeginLine: |  | 
| 233       t_->append("^"); |  | 
| 234       break; |  | 
| 235 |  | 
| 236     case kRegexpEndLine: |  | 
| 237       t_->append("$"); |  | 
| 238       break; |  | 
| 239 |  | 
| 240     case kRegexpBeginText: |  | 
| 241       t_->append("(?-m:^)"); |  | 
| 242       break; |  | 
| 243 |  | 
| 244     case kRegexpEndText: |  | 
| 245       if (re->parse_flags() & Regexp::WasDollar) |  | 
| 246         t_->append("(?-m:$)"); |  | 
| 247       else |  | 
| 248         t_->append("\\z"); |  | 
| 249       break; |  | 
| 250 |  | 
| 251     case kRegexpWordBoundary: |  | 
| 252       t_->append("\\b"); |  | 
| 253       break; |  | 
| 254 |  | 
| 255     case kRegexpNoWordBoundary: |  | 
| 256       t_->append("\\B"); |  | 
| 257       break; |  | 
| 258 |  | 
| 259     case kRegexpCharClass: { |  | 
| 260       if (re->cc()->size() == 0) { |  | 
| 261         t_->append("[^\\x00-\\x{10ffff}]"); |  | 
| 262         break; |  | 
| 263       } |  | 
| 264       t_->append("["); |  | 
| 265       // Heuristic: show class as negated if it contains the |  | 
| 266       // non-character 0xFFFE. |  | 
| 267       CharClass* cc = re->cc(); |  | 
| 268       if (cc->Contains(0xFFFE)) { |  | 
| 269         cc = cc->Negate(); |  | 
| 270         t_->append("^"); |  | 
| 271       } |  | 
| 272       for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) |  | 
| 273         AppendCCRange(t_, i->lo, i->hi); |  | 
| 274       if (cc != re->cc()) |  | 
| 275         cc->Delete(); |  | 
| 276       t_->append("]"); |  | 
| 277       break; |  | 
| 278     } |  | 
| 279 |  | 
| 280     case kRegexpCapture: |  | 
| 281       t_->append(")"); |  | 
| 282       break; |  | 
| 283 |  | 
| 284     case kRegexpHaveMatch: |  | 
| 285       // There's no syntax accepted by the parser to generate |  | 
| 286       // this node (it is generated by RE2::Set) so make something |  | 
| 287       // up that is readable but won't compile. |  | 
| 288       t_->append("(?HaveMatch:%d)", re->match_id()); |  | 
| 289       break; |  | 
| 290   } |  | 
| 291 |  | 
| 292   // If the parent is an alternation, append the | for it. |  | 
| 293   if (prec == PrecAlternate) |  | 
| 294     t_->append("|"); |  | 
| 295 |  | 
| 296   return 0; |  | 
| 297 } |  | 
| 298 |  | 
| 299 // Appends a rune for use in a character class to the string t. |  | 
| 300 static void AppendCCChar(string* t, Rune r) { |  | 
| 301   if (0x20 <= r && r <= 0x7E) { |  | 
| 302     if (strchr("[]^-\\", r)) |  | 
| 303       t->append("\\"); |  | 
| 304     t->append(1, static_cast<char>(r)); |  | 
| 305     return; |  | 
| 306   } |  | 
| 307   switch (r) { |  | 
| 308     default: |  | 
| 309       break; |  | 
| 310 |  | 
| 311     case '\r': |  | 
| 312       t->append("\\r"); |  | 
| 313       return; |  | 
| 314 |  | 
| 315     case '\t': |  | 
| 316       t->append("\\t"); |  | 
| 317       return; |  | 
| 318 |  | 
| 319     case '\n': |  | 
| 320       t->append("\\n"); |  | 
| 321       return; |  | 
| 322 |  | 
| 323     case '\f': |  | 
| 324       t->append("\\f"); |  | 
| 325       return; |  | 
| 326   } |  | 
| 327 |  | 
| 328   if (r < 0x100) { |  | 
| 329     StringAppendF(t, "\\x%02x", static_cast<int>(r)); |  | 
| 330     return; |  | 
| 331   } |  | 
| 332   StringAppendF(t, "\\x{%x}", static_cast<int>(r)); |  | 
| 333 } |  | 
| 334 |  | 
| 335 static void AppendCCRange(string* t, Rune lo, Rune hi) { |  | 
| 336   if (lo > hi) |  | 
| 337     return; |  | 
| 338   AppendCCChar(t, lo); |  | 
| 339   if (lo < hi) { |  | 
| 340     t->append("-"); |  | 
| 341     AppendCCChar(t, hi); |  | 
| 342   } |  | 
| 343 } |  | 
| 344 |  | 
| 345 }  // namespace re2 |  | 
| OLD | NEW | 
|---|