base/strings/string_split.cc - Issue 1169393003: Add new SplitString backend.

Side by Side Diff: base/strings/string_split.cc

Issue 1169393003: Add new SplitString backend. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 5 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "base/strings/string_split.h"	5 #include "base/strings/string_split.h"

6	6

7 #include "base/logging.h"	7 #include "base/logging.h"

8 #include "base/strings/string_util.h"	8 #include "base/strings/string_util.h"

9 #include "base/third_party/icu/icu_utf.h"	9 #include "base/third_party/icu/icu_utf.h"

10	10

11 namespace base {	11 namespace base {

12	12

13 namespace {	13 namespace {

14	14

15 template <typename STR>	15 // PieceToOutputType converts a StringPiece as needed to a given output type,

16 void SplitStringT(const STR& str,	16 // which is either the same type of StringPiece (a NOP) or the corresponding

17 const typename STR::value_type s,	17 // non-piece string type.

18 bool trim_whitespace,	18 //

19 std::vector<STR>* r) {	19 // The default converter is a NOP, it works when the OutputType is the

20 r->clear();	20 // correct StringPiece.

21 size_t last = 0;	21 template<typename Str, typename OutputType>

22 size_t c = str.size();	22 OutputType PieceToOutputType(BasicStringPiece<Str> piece) {

23 for (size_t i = 0; i <= c; ++i) {	23 return piece;

24 if (i == c || str[i] == s) {	24 }

25 STR tmp(str, last, i - last);	25 template<> // Convert StringPiece to std::string

26 if (trim_whitespace)	26 std::string PieceToOutputType<std::string, std::string>(StringPiece piece) {

27 TrimWhitespace(tmp, TRIM_ALL, &tmp);	27 return piece.as_string();

28 // Avoid converting an empty or all-whitespace source string into a vector	28 }

29 // of one empty string.	29 template<> // Convert StringPiece16 to string16.

30 if (i != c || !r->empty() || !tmp.empty())	30 r->push_back(tmp);

31 r->push_back(tmp);	31 return piece.as_string();

32 last = i + 1;	32 }

	33

	34 // Returns either the ASCII or UTF-16 whitespace.

	35 template<typename Str> BasicStringPiece<Str> WhitespaceForType();

	36 template<> StringPiece16 WhitespaceForType<string16>() {

	37 return kWhitespaceUTF16;

	38 }

	39 template<> StringPiece WhitespaceForType<std::string>() {

	40 return kWhitespaceASCII;

	41 }

	42

	43 // Optimize the single-character case to call find() on the string instead,

	44 // since this is the common case and can be made faster. This could have been

	45 // done with template specialization too, but would have been less clear.

	46 //

	47 // There is no corresponding FindFirstNotOf because StringPiece already

	48 // implements these different versions that do the optimized searching.

	49 size_t FindFirstOf(StringPiece piece, char c, size_t pos) {

	50 return piece.find(c, pos);

	51 }

	52 size_t FindFirstOf(StringPiece16 piece, char16 c, size_t pos) {

	53 return piece.find(c, pos);

	54 }

	55 size_t FindFirstOf(StringPiece piece, StringPiece one_of, size_t pos) {

	56 return piece.find_first_of(one_of, pos);

	57 }

	58 size_t FindFirstOf(StringPiece16 piece, StringPiece16 one_of, size_t pos) {

	59 return piece.find_first_of(one_of, pos);

	60 }

	61

	62 // General string splitter template. Can take 8- or 16-bit input, can produce

	63 // the corresponding string or StringPiece output, and can take single- or

	64 // multiple-character delimiters.

	65 //

	66 // DelimiterType is either a character (Str::value_type> or a string piece of
	danakj 2015/06/11 23:58:34 typo here with value_type>
	67 // multiple characters (BasicStringPiece<Str>). StringPiece has version of
	danakj 2015/06/11 23:58:34 has a version
	68 // find for both of these cases, and the single-character version is the most

	69 // common and can be implemented faster, which is why this is a template.

	70 template<typename Str, typename OutputStringType, typename DelimiterType>

	71 static std::vector<OutputStringType> SplitStringT(

	72 BasicStringPiece<Str> str,

	73 DelimiterType delimiter,

	74 WhitespaceHandling whitespace,

	75 SplitResult result_type) {

	76 std::vector<OutputStringType> result;

	77 if (str.empty())

	78 return result;

	79

	80 size_t start = 0;

	81 while (start != Str::npos) {

	82 size_t end = FindFirstOf(str, delimiter, start);

	83

	84 BasicStringPiece<Str> piece;

	85 if (end == Str::npos) {

	86 piece = str.substr(start);

	87 start = Str::npos;

	88 } else {

	89 piece = str.substr(start, end - start);

	90 start = end + 1;

33 }	91 }

	92

	93 if (whitespace == TRIM_WHITESPACE)

	94 piece = TrimString(piece, WhitespaceForType<Str>(), TRIM_ALL);

	95

	96 if (result_type == SPLIT_WANT_ALL || !piece.empty())

	97 result.push_back(PieceToOutputType<Str, OutputStringType>(piece));

34 }	98 }

	99 return result;

35 }	100 }

36	101

37 bool SplitStringIntoKeyValue(const std::string& line,	102 bool SplitStringIntoKeyValue(const std::string& line,

38 char key_value_delimiter,	103 char key_value_delimiter,

39 std::string* key,	104 std::string* key,

40 std::string* value) {	105 std::string* value) {

41 key->clear();	106 key->clear();

42 value->clear();	107 value->clear();

43	108

44 // Find the delimiter.	109 // Find the delimiter.

(...skipping 10 matching lines...)
55 if (begin_value_pos == std::string::npos) {	120 if (begin_value_pos == std::string::npos) {

56 DVLOG(1) << "cannot parse value from line: " << line;	121 DVLOG(1) << "cannot parse value from line: " << line;

57 return false; // no value	122 return false; // no value

58 }	123 }

59 value->assign(remains, begin_value_pos, remains.size() - begin_value_pos);	124 value->assign(remains, begin_value_pos, remains.size() - begin_value_pos);

60 return true;	125 return true;

61 }	126 }

62	127

63 template <typename STR>	128 template <typename STR>

64 void SplitStringUsingSubstrT(const STR& str,	129 void SplitStringUsingSubstrT(const STR& str,

65 const STR& s,	130 const STR& s,

66 std::vector<STR>* r) {	131 std::vector<STR>* r) {

67 r->clear();	132 r->clear();

68 typename STR::size_type begin_index = 0;	133 typename STR::size_type begin_index = 0;

69 while (true) {	134 while (true) {

70 const typename STR::size_type end_index = str.find(s, begin_index);	135 const typename STR::size_type end_index = str.find(s, begin_index);

71 if (end_index == STR::npos) {	136 if (end_index == STR::npos) {

72 const STR term = str.substr(begin_index);	137 const STR term = str.substr(begin_index);

73 STR tmp;	138 STR tmp;

74 TrimWhitespace(term, TRIM_ALL, &tmp);	139 TrimWhitespace(term, TRIM_ALL, &tmp);

75 r->push_back(tmp);	140 r->push_back(tmp);

76 return;	141 return;

77 }	142 }

78 const STR term = str.substr(begin_index, end_index - begin_index);	143 const STR term = str.substr(begin_index, end_index - begin_index);

79 STR tmp;	144 STR tmp;

80 TrimWhitespace(term, TRIM_ALL, &tmp);	145 TrimWhitespace(term, TRIM_ALL, &tmp);

81 r->push_back(tmp);	146 r->push_back(tmp);

82 begin_index = end_index + s.size();	147 begin_index = end_index + s.size();

83 }	148 }

84 }	149 }

85	150

86 template<typename STR>

87 void SplitStringAlongWhitespaceT(const STR& str, std::vector<STR>* result) {

88 result->clear();

89 const size_t length = str.length();

90 if (!length)

91 return;

92

93 bool last_was_ws = false;

94 size_t last_non_ws_start = 0;

95 for (size_t i = 0; i < length; ++i) {

96 switch (str[i]) {

97 // HTML 5 defines whitespace as: space, tab, LF, line tab, FF, or CR.

98 case L' ':

99 case L'\t':

100 case L'\xA':

101 case L'\xB':

102 case L'\xC':

103 case L'\xD':

104 if (!last_was_ws) {

105 if (i > 0) {

106 result->push_back(

107 str.substr(last_non_ws_start, i - last_non_ws_start));

108 }

109 last_was_ws = true;

110 }

111 break;

112

113 default: // Not a space character.

114 if (last_was_ws) {

115 last_was_ws = false;

116 last_non_ws_start = i;

117 }

118 break;

119 }

120 }

121 if (!last_was_ws) {

122 result->push_back(

123 str.substr(last_non_ws_start, length - last_non_ws_start));

124 }

125 }

126

127 } // namespace	151 } // namespace

128	152

	153 std::vector<std::string> SplitString(StringPiece input,

	154 StringPiece separators,

	155 WhitespaceHandling whitespace,

	156 SplitResult result_type) {

	157 if (separators.size() == 1) {

	158 return SplitStringT<std::string, std::string, char>(

	159 input, separators[0], whitespace, result_type);

	160 }

	161 return SplitStringT<std::string, std::string, StringPiece>(

	162 input, separators, whitespace, result_type);

	163 }

	164

	165 std::vector<string16> SplitString(StringPiece16 input,

	166 StringPiece16 separators,

	167 WhitespaceHandling whitespace,

	168 SplitResult result_type) {

	169 if (separators.size() == 1) {

	170 return SplitStringT<string16, string16, char16>(

	171 input, separators[0], whitespace, result_type);

	172 }

	173 return SplitStringT<string16, string16, StringPiece16>(

	174 input, separators, whitespace, result_type);

	175 }

	176

	177 std::vector<StringPiece> SplitStringPiece(StringPiece input,

	178 StringPiece separators,

	179 WhitespaceHandling whitespace,

	180 SplitResult result_type) {

	181 if (separators.size() == 1) {

	182 return SplitStringT<std::string, StringPiece, char>(

	183 input, separators[0], whitespace, result_type);

	184 }

	185 return SplitStringT<std::string, StringPiece, StringPiece>(

	186 input, separators, whitespace, result_type);

	187 }

	188

	189 std::vector<StringPiece16> SplitStringPiece(StringPiece16 input,

	190 StringPiece16 separators,

	191 WhitespaceHandling whitespace,

	192 SplitResult result_type) {

	193 if (separators.size() == 1) {

	194 return SplitStringT<string16, StringPiece16, char16>(

	195 input, separators[0], whitespace, result_type);

	196 }

	197 return SplitStringT<string16, StringPiece16, StringPiece16>(

	198 input, separators, whitespace, result_type);

	199 }

	200

129 void SplitString(const string16& str,	201 void SplitString(const string16& str,

130 char16 c,	202 char16 c,

131 std::vector<string16>* r) {	203 std::vector<string16>* result) {

132 DCHECK(CBU16_IS_SINGLE(c));	204 DCHECK(CBU16_IS_SINGLE(c));

133 SplitStringT(str, c, true, r);	205 *result = SplitStringT<string16, string16, char16>(

	206 str, c, TRIM_WHITESPACE, SPLIT_WANT_ALL);

	207

	208 // Backward-compat hack: The old SplitString implementation would keep

	209 // empty substrings, for example:

	210 // "a,,b" -> ["a", "", "b"]

	211 // "a, ,b" -> ["a", "", "b"]

	212 // which the current code also does. But the old one would discard them when

	213 // the only result was that empty string:

	214 // " " -> []

	215 // In the latter case, our new code will give [""]

	216 if (result->size() == 1 && (*result)[0].empty())

	217 result->clear();

134 }	218 }

135	219

136 void SplitString(const std::string& str,	220 void SplitString(const std::string& str,

137 char c,	221 char c,

138 std::vector<std::string>* r) {	222 std::vector<std::string>* result) {

139 #if CHAR_MIN < 0	223 #if CHAR_MIN < 0

140 DCHECK_GE(c, 0);	224 DCHECK_GE(c, 0);

141 #endif	225 #endif

142 DCHECK_LT(c, 0x7F);	226 DCHECK_LT(c, 0x7F);

143 SplitStringT(str, c, true, r);	227 *result = SplitStringT<std::string, std::string, char>(

	228 str, c, TRIM_WHITESPACE, SPLIT_WANT_ALL);

	229

	230 // Backward-compat hack, see above.

	231 if (result->size() == 1 && (*result)[0].empty())

	232 result->clear();

	233

144 }	234 }

145	235

146 bool SplitStringIntoKeyValuePairs(const std::string& line,	236 bool SplitStringIntoKeyValuePairs(const std::string& line,

147 char key_value_delimiter,	237 char key_value_delimiter,

148 char key_value_pair_delimiter,	238 char key_value_pair_delimiter,

149 StringPairs* key_value_pairs) {	239 StringPairs* key_value_pairs) {

150 key_value_pairs->clear();	240 key_value_pairs->clear();

151	241

152 std::vector<std::string> pairs;	242 std::vector<std::string> pairs;

153 SplitString(line, key_value_pair_delimiter, &pairs);	243 SplitString(line, key_value_pair_delimiter, &pairs);

(...skipping 21 matching lines...)
175 std::vector<string16>* r) {	265 std::vector<string16>* r) {

176 SplitStringUsingSubstrT(str, s, r);	266 SplitStringUsingSubstrT(str, s, r);

177 }	267 }

178	268

179 void SplitStringUsingSubstr(const std::string& str,	269 void SplitStringUsingSubstr(const std::string& str,

180 const std::string& s,	270 const std::string& s,

181 std::vector<std::string>* r) {	271 std::vector<std::string>* r) {

182 SplitStringUsingSubstrT(str, s, r);	272 SplitStringUsingSubstrT(str, s, r);

183 }	273 }

184	274

185 void SplitStringDontTrim(const string16& str,	275 void SplitStringDontTrim(StringPiece16 str,

186 char16 c,	276 char16 c,

187 std::vector<string16>* r) {	277 std::vector<string16>* result) {

188 DCHECK(CBU16_IS_SINGLE(c));	278 DCHECK(CBU16_IS_SINGLE(c));

189 SplitStringT(str, c, false, r);	279 *result = SplitStringT<string16, string16, char16>(

	280 str, c, KEEP_WHITESPACE, SPLIT_WANT_ALL);

190 }	281 }

191	282

192 void SplitStringDontTrim(const std::string& str,	283 void SplitStringDontTrim(StringPiece str,

193 char c,	284 char c,

194 std::vector<std::string>* r) {	285 std::vector<std::string>* result) {

195 #if CHAR_MIN < 0	286 #if CHAR_MIN < 0

196 DCHECK_GE(c, 0);	287 DCHECK_GE(c, 0);

197 #endif	288 #endif

198 DCHECK_LT(c, 0x7F);	289 DCHECK_LT(c, 0x7F);

199 SplitStringT(str, c, false, r);	290 *result = SplitStringT<std::string, std::string, char>(

	291 str, c, KEEP_WHITESPACE, SPLIT_WANT_ALL);

200 }	292 }

201	293

202 void SplitStringAlongWhitespace(const string16& str,	294 void SplitStringAlongWhitespace(const string16& str,

203 std::vector<string16>* result) {	295 std::vector<string16>* result) {

204 SplitStringAlongWhitespaceT(str, result);	296 *result = SplitStringT<string16, string16, StringPiece16>(

	297 str, StringPiece16(kWhitespaceASCIIAs16),

	298 TRIM_WHITESPACE, SPLIT_WANT_NONEMPTY);

205 }	299 }

206	300

207 void SplitStringAlongWhitespace(const std::string& str,	301 void SplitStringAlongWhitespace(const std::string& str,

208 std::vector<std::string>* result) {	302 std::vector<std::string>* result) {

209 SplitStringAlongWhitespaceT(str, result);	303 *result = SplitStringT<std::string, std::string, StringPiece>(

	304 str, StringPiece(kWhitespaceASCII),

	305 TRIM_WHITESPACE, SPLIT_WANT_NONEMPTY);

210 }	306 }

211	307

212 } // namespace base	308 } // namespace base

OLD	NEW

« base/strings/string_split.h ('K') | « base/strings/string_split.h ('k') | base/strings/string_split_unittest.cc » ('j') | base/strings/string_split_unittest.cc » ('J')