base/strings/string_util.h - Issue 1647803004: Move base to DEPS

Side by Side Diff: base/strings/string_util.h

Issue 1647803004: Move base to DEPS (Closed) Base URL: git@github.com:domokit/mojo.git@master

Patch Set: Created 4 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
	(Empty)
1 // Copyright 2013 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.

4 //

5 // This file defines utility functions for working with strings.

6

7 #ifndef BASE_STRINGS_STRING_UTIL_H_

8 #define BASE_STRINGS_STRING_UTIL_H_

9

10 #include <ctype.h>

11 #include <stdarg.h> // va_list

12

13 #include <string>

14 #include <vector>

15

16 #include "base/base_export.h"

17 #include "base/basictypes.h"

18 #include "base/compiler_specific.h"

19 #include "base/strings/string16.h"

20 #include "base/strings/string_piece.h" // For implicit conversions.

21

22 namespace base {

23

24 // C standard-library functions that aren't cross-platform are provided as

25 // "base::...", and their prototypes are listed below. These functions are

26 // then implemented as inline calls to the platform-specific equivalents in the

27 // platform-specific headers.

28

29 // Wrapper for vsnprintf that always null-terminates and always returns the

30 // number of characters that would be in an untruncated formatted

31 // string, even when truncation occurs.

32 int vsnprintf(char* buffer, size_t size, const char* format, va_list arguments)

33 PRINTF_FORMAT(3, 0);

34

35 // Some of these implementations need to be inlined.

36

37 // We separate the declaration from the implementation of this inline

38 // function just so the PRINTF_FORMAT works.

39 inline int snprintf(char* buffer, size_t size, const char* format, ...)

40 PRINTF_FORMAT(3, 4);

41 inline int snprintf(char* buffer, size_t size, const char* format, ...) {

42 va_list arguments;

43 va_start(arguments, format);

44 int result = vsnprintf(buffer, size, format, arguments);

45 va_end(arguments);

46 return result;

47 }

48

// TODO(mark) http://crbug.com/472900 crashpad shouldn't use base while
// being DEPSed in. This backwards-compat hack is provided until crashpad is
// updated.
//
// Thin forwarding wrapper: calls _stricmp() when OS_WIN is defined and the
// global ::strcasecmp() otherwise, returning the callee's result unchanged.
inline int strcasecmp(const char* s1, const char* s2) {
#if defined(OS_WIN)
  return _stricmp(s1, s2);
#else  // Posix
  return ::strcasecmp(s1, s2);
#endif
}

61

62 // BSD-style safe and consistent string copy functions.

63 // Copies |src| to |dst|, where |dst_size| is the total allocated size of |dst|.

64 // Copies at most |dst_size|-1 characters, and always NULL terminates |dst|, as

65 // long as |dst_size| is not 0. Returns the length of |src| in characters.

66 // If the return value is >= dst_size, then the output was truncated.

67 // NOTE: All sizes are in number of characters, NOT in bytes.

68 BASE_EXPORT size_t strlcpy(char* dst, const char* src, size_t dst_size);

69 BASE_EXPORT size_t wcslcpy(wchar_t* dst, const wchar_t* src, size_t dst_size);

70

71 // Scan a wprintf format string to determine whether it's portable across a

72 // variety of systems. This function only checks that the conversion

73 // specifiers used by the format string are supported and have the same meaning

74 // on a variety of systems. It doesn't check for other errors that might occur

75 // within a format string.

76 //

77 // Nonportable conversion specifiers for wprintf are:

78 // - 's' and 'c' without an 'l' length modifier. %s and %c operate on char

79 // data on all systems except Windows, which treat them as wchar_t data.

80 // Use %ls and %lc for wchar_t data instead.

81 // - 'S' and 'C', which operate on wchar_t data on all systems except Windows,

82 // which treat them as char data. Use %ls and %lc for wchar_t data

83 // instead.

84 // - 'F', which is not identified by Windows wprintf documentation.

85 // - 'D', 'O', and 'U', which are deprecated and not available on all systems.

86 // Use %ld, %lo, and %lu instead.

87 //

88 // Note that there is no portable conversion specifier for char data when

89 // working with wprintf.

90 //

91 // This function is intended to be called from base::vswprintf.

92 BASE_EXPORT bool IsWprintfFormatPortable(const wchar_t* format);

93

// ASCII-only replacement for tolower(). The standard library's tolower is
// locale sensitive, which is not wanted here. Anything outside 'A'..'Z' is
// handed back untouched.
template <class Char>
inline Char ToLowerASCII(Char c) {
  const bool is_upper = c >= 'A' && c <= 'Z';
  if (!is_upper)
    return c;
  // Shift by the fixed distance between the upper- and lower-case ranges.
  return static_cast<Char>(c + ('a' - 'A'));
}

99

// ASCII-only replacement for toupper(). The standard library's toupper is
// locale sensitive, which is not wanted here. Anything outside 'a'..'z' is
// handed back untouched.
template <class Char>
inline Char ToUpperASCII(Char c) {
  const bool is_lower = c >= 'a' && c <= 'z';
  if (!is_lower)
    return c;
  // Shift by the fixed distance between the lower- and upper-case ranges.
  return static_cast<Char>(c + ('A' - 'a'));
}

105

// Equality predicate that ignores ASCII case; meant to be handed to STL
// algorithms such as std::search.
//
// A full-Unicode variant of this functor cannot be written: case mappings may
// change the number of characters, depend on context (combining accents), and
// require handling UTF-16. For proper Unicode support, use
// base::i18n::ToLower/FoldCase and then compare the results with a normal
// operator==.
template <typename Char>
struct CaseInsensitiveCompareASCII {
  // Returns true when |x| and |y| are equal after both have been passed
  // through ToLowerASCII.
  bool operator()(Char x, Char y) const {
    const Char lowered_x = ToLowerASCII(x);
    const Char lowered_y = ToLowerASCII(y);
    return lowered_x == lowered_y;
  }
};

120

121 // Like strcasecmp for case-insensitive ASCII characters only. Returns:

122 // -1 (a < b)

123 // 0 (a == b)

124 // 1 (a > b)

125 // (unlike strcasecmp which can return values greater or less than 1/-1). For

126 // full Unicode support, use base::i18n::ToLower or base::i18n::FoldCase

127 // and then just call the normal string operators on the result.

128 BASE_EXPORT int CompareCaseInsensitiveASCII(StringPiece a, StringPiece b);

129 BASE_EXPORT int CompareCaseInsensitiveASCII(StringPiece16 a, StringPiece16 b);

130

131 // Equality for ASCII case-insensitive comparisons. For full Unicode support,

132 // use base::i18n::ToLower or base::i18n::FoldCase and then compare with either

133 // == or !=.

134 BASE_EXPORT bool EqualsCaseInsensitiveASCII(StringPiece a, StringPiece b);

135 BASE_EXPORT bool EqualsCaseInsensitiveASCII(StringPiece16 a, StringPiece16 b);

136

137 // These threadsafe functions return references to globally unique empty

138 // strings.

139 //

140 // It is likely faster to construct a new empty string object (just a few

141 // instructions to set the length to 0) than to get the empty string singleton

142 // returned by these functions (which requires threadsafe singleton access).

143 //

144 // Therefore, DO NOT USE THESE AS A GENERAL-PURPOSE SUBSTITUTE FOR DEFAULT

145 // CONSTRUCTORS. There is only one case where you should use these: functions

146 // which need to return a string by reference (e.g. as a class member

147 // accessor), and don't have an empty string to use (e.g. in an error case).

148 // These should not be used as initializers, function arguments, or return

149 // values for functions which return by value or outparam.

150 BASE_EXPORT const std::string& EmptyString();

151 BASE_EXPORT const string16& EmptyString16();

152

153 // Contains the set of characters representing whitespace in the corresponding

154 // encoding. Null-terminated. The ASCII versions are the whitespaces as defined

155 // by HTML5, and don't include control characters.

156 BASE_EXPORT extern const wchar_t kWhitespaceWide[]; // Includes Unicode.

157 BASE_EXPORT extern const char16 kWhitespaceUTF16[]; // Includes Unicode.

158 BASE_EXPORT extern const char kWhitespaceASCII[];

159 BASE_EXPORT extern const char16 kWhitespaceASCIIAs16[]; // No unicode.

160

161 // Null-terminated string representing the UTF-8 byte order mark.

162 BASE_EXPORT extern const char kUtf8ByteOrderMark[];

163

164 // Removes characters in |remove_chars| from anywhere in |input|. Returns true

165 // if any characters were removed. |remove_chars| must be null-terminated.

166 // NOTE: Safe to use the same variable for both |input| and |output|.

167 BASE_EXPORT bool RemoveChars(const string16& input,

168 const StringPiece16& remove_chars,

169 string16* output);

170 BASE_EXPORT bool RemoveChars(const std::string& input,

171 const StringPiece& remove_chars,

172 std::string* output);

173

174 // Replaces characters in |replace_chars| from anywhere in |input| with

175 // |replace_with|. Each character in |replace_chars| will be replaced with

176 // the |replace_with| string. Returns true if any characters were replaced.

177 // |replace_chars| must be null-terminated.

178 // NOTE: Safe to use the same variable for both |input| and |output|.

179 BASE_EXPORT bool ReplaceChars(const string16& input,

180 const StringPiece16& replace_chars,

181 const string16& replace_with,

182 string16* output);

183 BASE_EXPORT bool ReplaceChars(const std::string& input,

184 const StringPiece& replace_chars,

185 const std::string& replace_with,

186 std::string* output);

187

// Bit flags naming the end(s) of a string a trim operation applies to. The
// values are distinct bits so that they can be OR-ed together.
enum TrimPositions {
  TRIM_NONE = 0,           // Neither end.
  TRIM_LEADING = 1 << 0,   // The beginning of the string.
  TRIM_TRAILING = 1 << 1,  // The end of the string.
  TRIM_ALL = TRIM_LEADING | TRIM_TRAILING,  // Both ends.
};

194

195 // Removes characters in |trim_chars| from the beginning and end of |input|.

196 // The 8-bit version only works on 8-bit characters, not UTF-8.

197 //

198 // It is safe to use the same variable for both |input| and |output| (this is

199 // the normal usage to trim in-place).

200 BASE_EXPORT bool TrimString(const string16& input,

201 StringPiece16 trim_chars,

202 string16* output);

203 BASE_EXPORT bool TrimString(const std::string& input,

204 StringPiece trim_chars,

205 std::string* output);

206

207 // StringPiece versions of the above. The returned pieces refer to the original

208 // buffer.

209 BASE_EXPORT StringPiece16 TrimString(StringPiece16 input,

210 const StringPiece16& trim_chars,

211 TrimPositions positions);

212 BASE_EXPORT StringPiece TrimString(StringPiece input,

213 const StringPiece& trim_chars,

214 TrimPositions positions);

215

216 // Truncates a string to the nearest UTF-8 character that will leave

217 // the string less than or equal to the specified byte size.

218 BASE_EXPORT void TruncateUTF8ToByteSize(const std::string& input,

219 const size_t byte_size,

220 std::string* output);

221

222 // Trims any whitespace from either end of the input string.

223 //

224 // The StringPiece versions return a substring referencing the input buffer.

225 // The ASCII versions look only for ASCII whitespace.

226 //

227 // The std::string versions return where whitespace was found.

228 // NOTE: Safe to use the same variable for both input and output.

229 BASE_EXPORT TrimPositions TrimWhitespace(const string16& input,

230 TrimPositions positions,

231 string16* output);

232 BASE_EXPORT StringPiece16 TrimWhitespace(StringPiece16 input,

233 TrimPositions positions);

234 BASE_EXPORT TrimPositions TrimWhitespaceASCII(const std::string& input,

235 TrimPositions positions,

236 std::string* output);

237 BASE_EXPORT StringPiece TrimWhitespaceASCII(StringPiece input,

238 TrimPositions positions);

239

240 // Deprecated. This function is only for backward compatibility and calls

241 // TrimWhitespaceASCII().

242 BASE_EXPORT TrimPositions TrimWhitespace(const std::string& input,

243 TrimPositions positions,

244 std::string* output);

245

246 // Searches for CR or LF characters. Removes all contiguous whitespace

247 // strings that contain them. This is useful when trying to deal with text

248 // copied from terminals.

249 // Returns |text|, with the following three transformations:

250 // (1) Leading and trailing whitespace is trimmed.

251 // (2) If |trim_sequences_with_line_breaks| is true, any other whitespace

252 // sequences containing a CR or LF are trimmed.

253 // (3) All other whitespace sequences are converted to single spaces.

254 BASE_EXPORT string16 CollapseWhitespace(

255 const string16& text,

256 bool trim_sequences_with_line_breaks);

257 BASE_EXPORT std::string CollapseWhitespaceASCII(

258 const std::string& text,

259 bool trim_sequences_with_line_breaks);

260

261 // Returns true if |input| is empty or contains only characters found in

262 // |characters|.

263 BASE_EXPORT bool ContainsOnlyChars(const StringPiece& input,

264 const StringPiece& characters);

265 BASE_EXPORT bool ContainsOnlyChars(const StringPiece16& input,

266 const StringPiece16& characters);

267

268 // Returns true if the specified string matches the criteria. How can a wide

269 // string be 8-bit or UTF8? It contains only characters that are < 256 (in the

270 // first case) or characters that use only 8-bits and whose 8-bit

271 // representation looks like a UTF-8 string (the second case).

272 //

273 // Note that IsStringUTF8 checks not only if the input is structurally

274 // valid but also if it doesn't contain any non-character codepoint

275 // (e.g. U+FFFE). It's done on purpose because all the existing callers want

276 // to have the maximum 'discriminating' power from other encodings. If

277 // there's a use case for just checking the structural validity, we have to

278 // add a new function for that.

279 //

280 // IsStringASCII assumes the input is likely all ASCII, and does not leave early

281 // if it is not the case.

282 BASE_EXPORT bool IsStringUTF8(const StringPiece& str);

283 BASE_EXPORT bool IsStringASCII(const StringPiece& str);

284 BASE_EXPORT bool IsStringASCII(const StringPiece16& str);

285 // A convenience adaptor for WebStrings, as they don't convert into

286 // StringPieces directly.

287 BASE_EXPORT bool IsStringASCII(const string16& str);

288 #if defined(WCHAR_T_IS_UTF32)

289 BASE_EXPORT bool IsStringASCII(const std::wstring& str);

290 #endif

291

// Converts every element of |*s| in place by running it through
// ToLowerASCII. This version takes a pointer to clearly differentiate it from
// the non-pointer variant, which returns a converted copy instead.
template <class str>
inline void StringToLowerASCII(str* s) {
  for (typename str::iterator i = s->begin(); i != s->end(); ++i) {
    // Read and write the element the iterator refers to, not the iterator
    // itself.
    *i = ToLowerASCII(*i);
  }
}

298

// Value-returning variant for std::string and std::wstring: copies |s|, runs
// the pointer overload of StringToLowerASCII on the copy, and returns the
// copy. |s| itself is never modified.
template <class str>
inline str StringToLowerASCII(const str& s) {
  str converted(s);
  StringToLowerASCII(&converted);
  return converted;
}

305

// Converts every element of |*s| in place by running it through
// ToUpperASCII. This version takes a pointer to clearly differentiate it from
// the non-pointer variant, which returns a converted copy instead.
template <class str>
inline void StringToUpperASCII(str* s) {
  for (typename str::iterator i = s->begin(); i != s->end(); ++i) {
    // Read and write the element the iterator refers to, not the iterator
    // itself.
    *i = ToUpperASCII(*i);
  }
}

312

// Value-returning variant for std::string and std::wstring: copies |s|, runs
// the pointer overload of StringToUpperASCII on the copy, and returns the
// copy. |s| itself is never modified.
template <class str>
inline str StringToUpperASCII(const str& s) {
  str converted(s);
  StringToUpperASCII(&converted);
  return converted;
}

319 //

320 // Compare the lower-case form of the given string against the given ASCII

321 // string. This is useful for doing checking if an input string matches some

322 // token, and it is optimized to avoid intermediate string copies. This API is

323 // borrowed from the equivalent APIs in Mozilla.

324 BASE_EXPORT bool LowerCaseEqualsASCII(const std::string& a, const char* b);

325 BASE_EXPORT bool LowerCaseEqualsASCII(const string16& a, const char* b);

326

327 // Same thing, but with string iterators instead.

328 BASE_EXPORT bool LowerCaseEqualsASCII(std::string::const_iterator a_begin,

329 std::string::const_iterator a_end,

330 const char* b);

331 BASE_EXPORT bool LowerCaseEqualsASCII(string16::const_iterator a_begin,

332 string16::const_iterator a_end,

333 const char* b);

334 BASE_EXPORT bool LowerCaseEqualsASCII(const char* a_begin,

335 const char* a_end,

336 const char* b);

337 BASE_EXPORT bool LowerCaseEqualsASCII(const char* a_begin,

338 const char* a_end,

339 const char* b_begin,

340 const char* b_end);

341 BASE_EXPORT bool LowerCaseEqualsASCII(const char16* a_begin,

342 const char16* a_end,

343 const char* b);

344

345 // Performs a case-sensitive string compare. The behavior is undefined if both

346 // strings are not ASCII.

347 BASE_EXPORT bool EqualsASCII(const string16& a, const StringPiece& b);

348

349 // Indicates case sensitivity of comparisons. Only ASCII case insensitivity

350 // is supported. Full Unicode case-insensitive conversions would need to go in

351 // base/i18n so it can use ICU.

352 //

353 // If you need to do Unicode-aware case-insensitive StartsWith/EndsWith, it's

354 // best to call base::i18n::ToLower() or base::i18n::FoldCase() (see

355 // base/i18n/case_conversion.h for usage advice) on the arguments, and then use

356 // the results to a case-sensitive comparison.

enum class CompareCase {
  // Case-sensitive comparison.
  SENSITIVE = 0,
  // Case-insensitive comparison; only ASCII case insensitivity is supported.
  INSENSITIVE_ASCII = 1,
};

361

362 BASE_EXPORT bool StartsWith(StringPiece str,

363 StringPiece search_for,

364 CompareCase case_sensitivity);

365 BASE_EXPORT bool StartsWith(StringPiece16 str,

366 StringPiece16 search_for,

367 CompareCase case_sensitivity);

368 BASE_EXPORT bool EndsWith(StringPiece str,

369 StringPiece search_for,

370 CompareCase case_sensitivity);

371 BASE_EXPORT bool EndsWith(StringPiece16 str,

372 StringPiece16 search_for,

373 CompareCase case_sensitivity);

374

375 // DEPRECATED. Returns true if str starts/ends with search, or false otherwise.

376 // TODO(brettw) remove in favor of the "enum" versions above.

377 inline bool StartsWithASCII(const std::string& str,

378 const std::string& search,

379 bool case_sensitive) {

380 return StartsWith(

381 StringPiece(str), StringPiece(search),

382 case_sensitive ? CompareCase::SENSITIVE : CompareCase::INSENSITIVE_ASCII);

383 }

384

385 // Determines the type of ASCII character, independent of locale (the C

386 // library versions will change based on locale).

// Returns true if |c| is a space, carriage return, line feed or horizontal
// tab. No other character (e.g. form feed or vertical tab) is accepted.
template <typename Char>
inline bool IsAsciiWhitespace(Char c) {
  return c == ' ' || c == '\r' || c == '\n' || c == '\t';
}

// Returns true if |c| lies in 'A'..'Z' or 'a'..'z'.
template <typename Char>
inline bool IsAsciiAlpha(Char c) {
  return ((c >= 'A') && (c <= 'Z')) || ((c >= 'a') && (c <= 'z'));
}

// Returns true if |c| lies in '0'..'9'.
template <typename Char>
inline bool IsAsciiDigit(Char c) {
  if (c < '0')
    return false;
  return c <= '9';
}

399

// Returns true if |c| lies in '0'..'9', 'A'..'F' or 'a'..'f'.
template <typename Char>
inline bool IsHexDigit(Char c) {
  return (c >= '0' && c <= '9') ||
         (c >= 'A' && c <= 'F') ||
         (c >= 'a' && c <= 'f');
}

406

407 // Returns the integer corresponding to the given hex character. For example:

408 // '4' -> 4

409 // 'a' -> 10

410 // 'B' -> 11

411 // Assumes the input is a valid hex character. DCHECKs in debug builds if not.

412 BASE_EXPORT char HexDigitToInt(wchar_t c);

413

414 // Returns true if it's a Unicode whitespace character.

415 inline bool IsUnicodeWhitespace(wchar_t c) {

416 return wcschr(base::kWhitespaceWide, c) != NULL;

417 }

418

419 // Return a byte string in human-readable format with a unit suffix. Not

420 // appropriate for use in any UI; use of FormatBytes and friends in ui/base is

421 // highly recommended instead. TODO(avi): Figure out how to get callers to use

422 // FormatBytes instead; remove this.

423 BASE_EXPORT string16 FormatBytesUnlocalized(int64 bytes);

424

425 // Starting at |start_offset| (usually 0), replace the first instance of

426 // |find_this| with |replace_with|.

427 BASE_EXPORT void ReplaceFirstSubstringAfterOffset(base::string16* str,

428 size_t start_offset,

429 StringPiece16 find_this,

430 StringPiece16 replace_with);

431 BASE_EXPORT void ReplaceFirstSubstringAfterOffset(std::string* str,

432 size_t start_offset,

433 StringPiece find_this,

434 StringPiece replace_with);

435

436 // Starting at |start_offset| (usually 0), look through |str| and replace all

437 // instances of |find_this| with |replace_with|.

438 //

439 // This does entire substrings; use std::replace in <algorithm> for single

440 // characters, for example:

441 // std::replace(str.begin(), str.end(), 'a', 'b');

442 BASE_EXPORT void ReplaceSubstringsAfterOffset(string16* str,

443 size_t start_offset,

444 StringPiece16 find_this,

445 StringPiece16 replace_with);

446 BASE_EXPORT void ReplaceSubstringsAfterOffset(std::string* str,

447 size_t start_offset,

448 StringPiece find_this,

449 StringPiece replace_with);

450

451 // Reserves enough memory in |str| to accommodate |length_with_null| characters,

452 // sets the size of |str| to |length_with_null - 1| characters, and returns a

453 // pointer to the underlying contiguous array of characters. This is typically

454 // used when calling a function that writes results into a character array, but

455 // the caller wants the data to be managed by a string-like object. It is

456 // convenient in that it can be used inline in the call, and fast in that it

457 // avoids copying the results of the call from a char* into a string.

458 //

459 // |length_with_null| must be at least 2, since otherwise the underlying string

460 // would have size 0, and trying to access &((*str)[0]) in that case can result

461 // in a number of problems.

462 //

463 // Internally, this takes linear time because the resize() call 0-fills the

464 // underlying array for potentially all

465 // (|length_with_null - 1| * sizeof(string_type::value_type)) bytes. Ideally we

466 // could avoid this aspect of the resize() call, as we expect the caller to

467 // immediately write over this memory, but there is no other way to set the size

468 // of the string, and not doing that will mean people who access |str| rather

469 // than str.c_str() will get back a string of whatever size |str| had on entry

470 // to this function (probably 0).

471 BASE_EXPORT char* WriteInto(std::string* str, size_t length_with_null);

472 BASE_EXPORT char16* WriteInto(string16* str, size_t length_with_null);

473 #ifndef OS_WIN

474 BASE_EXPORT wchar_t* WriteInto(std::wstring* str, size_t length_with_null);

475 #endif

476

477 // Does the opposite of SplitString().

478 BASE_EXPORT std::string JoinString(const std::vector<std::string>& parts,

479 StringPiece separator);

480 BASE_EXPORT string16 JoinString(const std::vector<string16>& parts,

481 StringPiece16 separator);

482

483 // Replace $1-$2-$3..$9 in the format string with |a|-|b|-|c|..|i| respectively.

484 // Additionally, any number of consecutive '$' characters is replaced by that

485 // number less one. Eg $$->$, $$$->$$, etc. The offsets parameter here can be

486 // NULL. This only allows you to use up to nine replacements.

487 BASE_EXPORT string16

488 ReplaceStringPlaceholders(const string16& format_string,

489 const std::vector<string16>& subst,

490 std::vector<size_t>* offsets);

491

492 BASE_EXPORT std::string ReplaceStringPlaceholders(

493 const StringPiece& format_string,

494 const std::vector<std::string>& subst,

495 std::vector<size_t>* offsets);

496

497 // Single-string shortcut for ReplaceStringPlaceholders. |offset| may be NULL.

498 BASE_EXPORT string16 ReplaceStringPlaceholders(const string16& format_string,

499 const string16& a,

500 size_t* offset);

501

502 } // namespace base

503

504 #if defined(OS_WIN)

505 #include "base/strings/string_util_win.h"

506 #elif defined(OS_POSIX)

507 #include "base/strings/string_util_posix.h"

508 #else

509 #error Define string operations appropriately for your platform

510 #endif

511

512 #endif // BASE_STRINGS_STRING_UTIL_H_

OLD	NEW

« no previous file with comments | « base/strings/string_tokenizer_unittest.cc ('k') | base/strings/string_util.cc » ('j') | no next file with comments »