base/string_tokenizer.h - Issue 12087091: Move string tokenizer to base/strings.

Side by Side Diff: base/string_tokenizer.h

Issue 12087091: Move string tokenizer to base/strings. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: sort Created 7 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
	(Empty)
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.

4

5 #ifndef BASE_STRING_TOKENIZER_H_

6 #define BASE_STRING_TOKENIZER_H_

7

8 #include <algorithm>

9 #include <string>

10

11 #include "base/string_piece.h"

12

13 // StringTokenizerT is a simple string tokenizer class. It works like an

14 // iterator that with each step (see the GetNext method) updates members that

15 // refer to the next token in the input string. The user may optionally

16 // configure the tokenizer to return delimiters.

17 //

18 // Warning: be careful not to pass a C string into the 2-arg constructor:

19 // StringTokenizer t("this is a test", " "); // WRONG

20 // This will create a temporary std::string, save the begin() and end()

21 // iterators, and then the string will be freed before we actually start

22 // tokenizing it.

23 // Instead, use a std::string or use the 3-arg constructor of CStringTokenizer.

24 //

25 //

26 // EXAMPLE 1:

27 //

28 // char input[] = "this is a test";

29 // CStringTokenizer t(input, input + strlen(input), " ");

30 // while (t.GetNext()) {

31 // printf("%s\n", t.token().c_str());

32 // }

33 //

34 // Output:

35 //

36 // this

37 // is

38 // a

39 // test

40 //

41 //

42 // EXAMPLE 2:

43 //

44 // std::string input = "no-cache=\"foo, bar\", private";

45 // StringTokenizer t(input, ", ");

46 // t.set_quote_chars("\"");

47 // while (t.GetNext()) {

48 // printf("%s\n", t.token().c_str());

49 // }

50 //

51 // Output:

52 //

53 // no-cache="foo, bar"

54 // private

55 //

56 //

57 // EXAMPLE 3:

58 //

59 // bool next_is_option = false, next_is_value = false;

60 // std::string input = "text/html; charset=UTF-8; foo=bar";

61 // StringTokenizer t(input, "; =");

62 // t.set_options(StringTokenizer::RETURN_DELIMS);

63 // while (t.GetNext()) {

64 // if (t.token_is_delim()) {

65 // switch (*t.token_begin()) {

66 // case ';':

67 // next_is_option = true;

68 // break;

69 // case '=':

70 // next_is_value = true;

71 // break;

72 // }

73 // } else {

74 // const char* label;

75 // if (next_is_option) {

76 // label = "option-name";

77 // next_is_option = false;

78 // } else if (next_is_value) {

79 // label = "option-value";

80 // next_is_value = false;

81 // } else {

82 // label = "mime-type";

83 // }

84 // printf("%s: %s\n", label, t.token().c_str());

85 // }

86 // }

87 //

88 //

89 template <class str, class const_iterator>

90 class StringTokenizerT {

91 public:

92 typedef typename str::value_type char_type;

93

94 // Options that may be pass to set_options()

95 enum {

96 // Specifies the delimiters should be returned as tokens

97 RETURN_DELIMS = 1 << 0,

98 };

99

100 // The string object must live longer than the tokenizer. (In particular this

101 // should not be constructed with a temporary.)

102 StringTokenizerT(const str& string,

103 const str& delims) {

104 Init(string.begin(), string.end(), delims);

105 }

106

107 StringTokenizerT(const_iterator string_begin,

108 const_iterator string_end,

109 const str& delims) {

110 Init(string_begin, string_end, delims);

111 }

112

113 // Set the options for this tokenizer. By default, this is 0.

114 void set_options(int options) { options_ = options; }

115

116 // Set the characters to regard as quotes. By default, this is empty. When

117 // a quote char is encountered, the tokenizer will switch into a mode where

118 // it ignores delimiters that it finds. It switches out of this mode once it

119 // finds another instance of the quote char. If a backslash is encountered

120 // within a quoted string, then the next character is skipped.

121 void set_quote_chars(const str& quotes) { quotes_ = quotes; }

122

123 // Call this method to advance the tokenizer to the next delimiter. This

124 // returns false if the tokenizer is complete. This method must be called

125 // before calling any of the token* methods.

126 bool GetNext() {

127 if (quotes_.empty() && options_ == 0)

128 return QuickGetNext();

129 else

130 return FullGetNext();

131 }

132

133 // Start iterating through tokens from the beginning of the string.

134 void Reset() {

135 token_end_ = start_pos_;

136 }

137

138 // Returns true if token is a delimiter. When the tokenizer is constructed

139 // with the RETURN_DELIMS option, this method can be used to check if the

140 // returned token is actually a delimiter.

141 bool token_is_delim() const { return token_is_delim_; }

142

143 // If GetNext() returned true, then these methods may be used to read the

144 // value of the token.

145 const_iterator token_begin() const { return token_begin_; }

146 const_iterator token_end() const { return token_end_; }

147 str token() const { return str(token_begin_, token_end_); }

148 base::StringPiece token_piece() const {

149 return base::StringPiece(&*token_begin_,

150 std::distance(token_begin_, token_end_));

151 }

152

153 private:

154 void Init(const_iterator string_begin,

155 const_iterator string_end,

156 const str& delims) {

157 start_pos_ = string_begin;

158 token_begin_ = string_begin;

159 token_end_ = string_begin;

160 end_ = string_end;

161 delims_ = delims;

162 options_ = 0;

163 token_is_delim_ = false;

164 }

165

166 // Implementation of GetNext() for when we have no quote characters. We have

167 // two separate implementations because AdvanceOne() is a hot spot in large

168 // text files with large tokens.

169 bool QuickGetNext() {

170 token_is_delim_ = false;

171 for (;;) {

172 token_begin_ = token_end_;

173 if (token_end_ == end_)

174 return false;

175 ++token_end_;

176 if (delims_.find(*token_begin_) == str::npos)

177 break;

178 // else skip over delimiter.

179 }

180 while (token_end_ != end_ && delims_.find(*token_end_) == str::npos)

181 ++token_end_;

182 return true;

183 }

184

185 // Implementation of GetNext() for when we have to take quotes into account.

186 bool FullGetNext() {

187 AdvanceState state;

188 token_is_delim_ = false;

189 for (;;) {

190 token_begin_ = token_end_;

191 if (token_end_ == end_)

192 return false;

193 ++token_end_;

194 if (AdvanceOne(&state, *token_begin_))

195 break;

196 if (options_ & RETURN_DELIMS) {

197 token_is_delim_ = true;

198 return true;

199 }

200 // else skip over delimiter.

201 }

202 while (token_end_ != end_ && AdvanceOne(&state, *token_end_))

203 ++token_end_;

204 return true;

205 }

206

207 bool IsDelim(char_type c) const {

208 return delims_.find(c) != str::npos;

209 }

210

211 bool IsQuote(char_type c) const {

212 return quotes_.find(c) != str::npos;

213 }

214

215 struct AdvanceState {

216 bool in_quote;

217 bool in_escape;

218 char_type quote_char;

219 AdvanceState() : in_quote(false), in_escape(false), quote_char('\0') {}

220 };

221

222 // Returns true if a delimiter was not hit.

223 bool AdvanceOne(AdvanceState* state, char_type c) {

224 if (state->in_quote) {

225 if (state->in_escape) {

226 state->in_escape = false;

227 } else if (c == '\\') {

228 state->in_escape = true;

229 } else if (c == state->quote_char) {

230 state->in_quote = false;

231 }

232 } else {

233 if (IsDelim(c))

234 return false;

235 state->in_quote = IsQuote(state->quote_char = c);

236 }

237 return true;

238 }

239

240 const_iterator start_pos_;

241 const_iterator token_begin_;

242 const_iterator token_end_;

243 const_iterator end_;

244 str delims_;

245 str quotes_;

246 int options_;

247 bool token_is_delim_;

248 };

249

250 typedef StringTokenizerT<std::string, std::string::const_iterator>

251 StringTokenizer;

252 typedef StringTokenizerT<std::wstring, std::wstring::const_iterator>

253 WStringTokenizer;

254 typedef StringTokenizerT<std::string, const char*> CStringTokenizer;

255

256 #endif // BASE_STRING_TOKENIZER_H_

OLD	NEW

« no previous file with comments | « base/process_util_linux.cc ('k') | base/string_tokenizer_unittest.cc » ('j') | no next file with comments »