| OLD | NEW |
| (Empty) |
| 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #ifndef BASE_STRING_TOKENIZER_H_ | |
| 6 #define BASE_STRING_TOKENIZER_H_ | |
| 7 | |
| 8 #include <algorithm> | |
| 9 #include <string> | |
| 10 | |
| 11 #include "base/string_piece.h" | |
| 12 | |
| 13 // StringTokenizerT is a simple string tokenizer class. It works like an | |
| 14 // iterator that with each step (see the GetNext method) updates members that | |
| 15 // refer to the next token in the input string. The user may optionally | |
| 16 // configure the tokenizer to return delimiters. | |
| 17 // | |
| 18 // Warning: be careful not to pass a C string into the 2-arg constructor: | |
| 19 // StringTokenizer t("this is a test", " "); // WRONG | |
| 20 // This will create a temporary std::string, save the begin() and end() | |
| 21 // iterators, and then the string will be freed before we actually start | |
| 22 // tokenizing it. | |
| 23 // Instead, use a std::string or use the 3 arg constructor of CStringTokenizer. | |
| 24 // | |
| 25 // | |
| 26 // EXAMPLE 1: | |
| 27 // | |
| 28 // char input[] = "this is a test"; | |
| 29 // CStringTokenizer t(input, input + strlen(input), " "); | |
| 30 // while (t.GetNext()) { | |
| 31 // printf("%s\n", t.token().c_str()); | |
| 32 // } | |
| 33 // | |
| 34 // Output: | |
| 35 // | |
| 36 // this | |
| 37 // is | |
| 38 // a | |
| 39 // test | |
| 40 // | |
| 41 // | |
| 42 // EXAMPLE 2: | |
| 43 // | |
| 44 // std::string input = "no-cache=\"foo, bar\", private"; | |
| 45 // StringTokenizer t(input, ", "); | |
| 46 // t.set_quote_chars("\""); | |
| 47 // while (t.GetNext()) { | |
| 48 // printf("%s\n", t.token().c_str()); | |
| 49 // } | |
| 50 // | |
| 51 // Output: | |
| 52 // | |
| 53 // no-cache="foo, bar" | |
| 54 // private | |
| 55 // | |
| 56 // | |
| 57 // EXAMPLE 3: | |
| 58 // | |
| 59 // bool next_is_option = false, next_is_value = false; | |
| 60 // std::string input = "text/html; charset=UTF-8; foo=bar"; | |
| 61 // StringTokenizer t(input, "; ="); | |
| 62 // t.set_options(StringTokenizer::RETURN_DELIMS); | |
| 63 // while (t.GetNext()) { | |
| 64 // if (t.token_is_delim()) { | |
| 65 // switch (*t.token_begin()) { | |
| 66 // case ';': | |
| 67 // next_is_option = true; | |
| 68 // break; | |
| 69 // case '=': | |
| 70 // next_is_value = true; | |
| 71 // break; | |
| 72 // } | |
| 73 // } else { | |
| 74 // const char* label; | |
| 75 // if (next_is_option) { | |
| 76 // label = "option-name"; | |
| 77 // next_is_option = false; | |
| 78 // } else if (next_is_value) { | |
| 79 // label = "option-value"; | |
| 80 // next_is_value = false; | |
| 81 // } else { | |
| 82 // label = "mime-type"; | |
| 83 // } | |
| 84 // printf("%s: %s\n", label, t.token().c_str()); | |
| 85 // } | |
| 86 // } | |
| 87 // | |
| 88 // | |
| 89 template <class str, class const_iterator> | |
| 90 class StringTokenizerT { | |
| 91 public: | |
| 92 typedef typename str::value_type char_type; | |
| 93 | |
| 94 // Options that may be pass to set_options() | |
| 95 enum { | |
| 96 // Specifies the delimiters should be returned as tokens | |
| 97 RETURN_DELIMS = 1 << 0, | |
| 98 }; | |
| 99 | |
| 100 // The string object must live longer than the tokenizer. (In particular this | |
| 101 // should not be constructed with a temporary.) | |
| 102 StringTokenizerT(const str& string, | |
| 103 const str& delims) { | |
| 104 Init(string.begin(), string.end(), delims); | |
| 105 } | |
| 106 | |
| 107 StringTokenizerT(const_iterator string_begin, | |
| 108 const_iterator string_end, | |
| 109 const str& delims) { | |
| 110 Init(string_begin, string_end, delims); | |
| 111 } | |
| 112 | |
| 113 // Set the options for this tokenizer. By default, this is 0. | |
| 114 void set_options(int options) { options_ = options; } | |
| 115 | |
| 116 // Set the characters to regard as quotes. By default, this is empty. When | |
| 117 // a quote char is encountered, the tokenizer will switch into a mode where | |
| 118 // it ignores delimiters that it finds. It switches out of this mode once it | |
| 119 // finds another instance of the quote char. If a backslash is encountered | |
| 120 // within a quoted string, then the next character is skipped. | |
| 121 void set_quote_chars(const str& quotes) { quotes_ = quotes; } | |
| 122 | |
| 123 // Call this method to advance the tokenizer to the next delimiter. This | |
| 124 // returns false if the tokenizer is complete. This method must be called | |
| 125 // before calling any of the token* methods. | |
| 126 bool GetNext() { | |
| 127 if (quotes_.empty() && options_ == 0) | |
| 128 return QuickGetNext(); | |
| 129 else | |
| 130 return FullGetNext(); | |
| 131 } | |
| 132 | |
| 133 // Start iterating through tokens from the beginning of the string. | |
| 134 void Reset() { | |
| 135 token_end_ = start_pos_; | |
| 136 } | |
| 137 | |
| 138 // Returns true if token is a delimiter. When the tokenizer is constructed | |
| 139 // with the RETURN_DELIMS option, this method can be used to check if the | |
| 140 // returned token is actually a delimiter. | |
| 141 bool token_is_delim() const { return token_is_delim_; } | |
| 142 | |
| 143 // If GetNext() returned true, then these methods may be used to read the | |
| 144 // value of the token. | |
| 145 const_iterator token_begin() const { return token_begin_; } | |
| 146 const_iterator token_end() const { return token_end_; } | |
| 147 str token() const { return str(token_begin_, token_end_); } | |
| 148 base::StringPiece token_piece() const { | |
| 149 return base::StringPiece(&*token_begin_, | |
| 150 std::distance(token_begin_, token_end_)); | |
| 151 } | |
| 152 | |
| 153 private: | |
| 154 void Init(const_iterator string_begin, | |
| 155 const_iterator string_end, | |
| 156 const str& delims) { | |
| 157 start_pos_ = string_begin; | |
| 158 token_begin_ = string_begin; | |
| 159 token_end_ = string_begin; | |
| 160 end_ = string_end; | |
| 161 delims_ = delims; | |
| 162 options_ = 0; | |
| 163 token_is_delim_ = false; | |
| 164 } | |
| 165 | |
| 166 // Implementation of GetNext() for when we have no quote characters. We have | |
| 167 // two separate implementations because AdvanceOne() is a hot spot in large | |
| 168 // text files with large tokens. | |
| 169 bool QuickGetNext() { | |
| 170 token_is_delim_ = false; | |
| 171 for (;;) { | |
| 172 token_begin_ = token_end_; | |
| 173 if (token_end_ == end_) | |
| 174 return false; | |
| 175 ++token_end_; | |
| 176 if (delims_.find(*token_begin_) == str::npos) | |
| 177 break; | |
| 178 // else skip over delimiter. | |
| 179 } | |
| 180 while (token_end_ != end_ && delims_.find(*token_end_) == str::npos) | |
| 181 ++token_end_; | |
| 182 return true; | |
| 183 } | |
| 184 | |
| 185 // Implementation of GetNext() for when we have to take quotes into account. | |
| 186 bool FullGetNext() { | |
| 187 AdvanceState state; | |
| 188 token_is_delim_ = false; | |
| 189 for (;;) { | |
| 190 token_begin_ = token_end_; | |
| 191 if (token_end_ == end_) | |
| 192 return false; | |
| 193 ++token_end_; | |
| 194 if (AdvanceOne(&state, *token_begin_)) | |
| 195 break; | |
| 196 if (options_ & RETURN_DELIMS) { | |
| 197 token_is_delim_ = true; | |
| 198 return true; | |
| 199 } | |
| 200 // else skip over delimiter. | |
| 201 } | |
| 202 while (token_end_ != end_ && AdvanceOne(&state, *token_end_)) | |
| 203 ++token_end_; | |
| 204 return true; | |
| 205 } | |
| 206 | |
| 207 bool IsDelim(char_type c) const { | |
| 208 return delims_.find(c) != str::npos; | |
| 209 } | |
| 210 | |
| 211 bool IsQuote(char_type c) const { | |
| 212 return quotes_.find(c) != str::npos; | |
| 213 } | |
| 214 | |
| 215 struct AdvanceState { | |
| 216 bool in_quote; | |
| 217 bool in_escape; | |
| 218 char_type quote_char; | |
| 219 AdvanceState() : in_quote(false), in_escape(false), quote_char('\0') {} | |
| 220 }; | |
| 221 | |
| 222 // Returns true if a delimiter was not hit. | |
| 223 bool AdvanceOne(AdvanceState* state, char_type c) { | |
| 224 if (state->in_quote) { | |
| 225 if (state->in_escape) { | |
| 226 state->in_escape = false; | |
| 227 } else if (c == '\\') { | |
| 228 state->in_escape = true; | |
| 229 } else if (c == state->quote_char) { | |
| 230 state->in_quote = false; | |
| 231 } | |
| 232 } else { | |
| 233 if (IsDelim(c)) | |
| 234 return false; | |
| 235 state->in_quote = IsQuote(state->quote_char = c); | |
| 236 } | |
| 237 return true; | |
| 238 } | |
| 239 | |
| 240 const_iterator start_pos_; | |
| 241 const_iterator token_begin_; | |
| 242 const_iterator token_end_; | |
| 243 const_iterator end_; | |
| 244 str delims_; | |
| 245 str quotes_; | |
| 246 int options_; | |
| 247 bool token_is_delim_; | |
| 248 }; | |
| 249 | |
| 250 typedef StringTokenizerT<std::string, std::string::const_iterator> | |
| 251 StringTokenizer; | |
| 252 typedef StringTokenizerT<std::wstring, std::wstring::const_iterator> | |
| 253 WStringTokenizer; | |
| 254 typedef StringTokenizerT<std::string, const char*> CStringTokenizer; | |
| 255 | |
| 256 #endif // BASE_STRING_TOKENIZER_H_ | |
| OLD | NEW |