| OLD | NEW |
| 1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "base/string_util.h" | 5 #include "base/string_util.h" |
| 6 | 6 |
| 7 #include <ctype.h> | 7 #include <ctype.h> |
| 8 #include <errno.h> | 8 #include <errno.h> |
| 9 #include <math.h> | 9 #include <math.h> |
| 10 #include <stdarg.h> | 10 #include <stdarg.h> |
| (...skipping 499 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 510 // the Initial Developer. All Rights Reserved. | 510 // the Initial Developer. All Rights Reserved. |
| 511 // | 511 // |
| 512 // Contributor(s): | 512 // Contributor(s): |
| 513 // Scott Collins <scc@mozilla.org> (original author) | 513 // Scott Collins <scc@mozilla.org> (original author) |
| 514 // | 514 // |
| 515 // This is a template so that it can be run on wide and 8-bit strings. We want | 515 // This is a template so that it can be run on wide and 8-bit strings. We want |
| 516 // to run it on wide strings when we have input that we think may have | 516 // to run it on wide strings when we have input that we think may have |
| 517 // originally been UTF-8, but has been converted to wide characters because | 517 // originally been UTF-8, but has been converted to wide characters because |
| 518 // that's what we (and Windows) use internally. | 518 // that's what we (and Windows) use internally. |
| 519 template<typename CHAR> | 519 template<typename CHAR> |
| 520 static bool IsStringUTF8T(const CHAR* str) { | 520 static bool IsStringUTF8T(const CHAR* str, int length) { |
| 521 bool overlong = false; | 521 bool overlong = false; |
| 522 bool surrogate = false; | 522 bool surrogate = false; |
| 523 bool nonchar = false; | 523 bool nonchar = false; |
| 524 | 524 |
| 525 // overlong byte upper bound | 525 // overlong byte upper bound |
| 526 typename ToUnsigned<CHAR>::Unsigned olupper = 0; | 526 typename ToUnsigned<CHAR>::Unsigned olupper = 0; |
| 527 | 527 |
| 528 // surrogate byte lower bound | 528 // surrogate byte lower bound |
| 529 typename ToUnsigned<CHAR>::Unsigned slower = 0; | 529 typename ToUnsigned<CHAR>::Unsigned slower = 0; |
| 530 | 530 |
| 531 // incremented when inside a multi-byte char to indicate how many bytes | 531 // incremented when inside a multi-byte char to indicate how many bytes |
| 532 // are left in the sequence | 532 // are left in the sequence |
| 533 int positions_left = 0; | 533 int positions_left = 0; |
| 534 | 534 |
| 535 for (int i = 0; str[i] != 0; i++) { | 535 for (int i = 0; i < length; i++) { |
| 536 // This whole function assume an unsigned value so force its conversion to | 536 // This whole function assume an unsigned value so force its conversion to |
| 537 // an unsigned value. | 537 // an unsigned value. |
| 538 typename ToUnsigned<CHAR>::Unsigned c = str[i]; | 538 typename ToUnsigned<CHAR>::Unsigned c = str[i]; |
| 539 if (c < 0x80) | 539 if (c < 0x80) |
| 540 continue; // ASCII | 540 continue; // ASCII |
| 541 | 541 |
| 542 if (c <= 0xC1) { | 542 if (c <= 0xC1) { |
| 543 // [80-BF] where not expected, [C0-C1] for overlong | 543 // [80-BF] where not expected, [C0-C1] for overlong |
| 544 return false; | 544 return false; |
| 545 } else if (IsBegin2ByteUTF8(c)) { | 545 } else if (IsBegin2ByteUTF8(c)) { |
| 546 positions_left = 1; | 546 positions_left = 1; |
| 547 } else if (IsBegin3ByteUTF8(c)) { | 547 } else if (IsBegin3ByteUTF8(c)) { |
| 548 positions_left = 2; | 548 positions_left = 2; |
| 549 if (c == 0xE0) { | 549 if (c == 0xE0) { |
| 550 // to exclude E0[80-9F][80-BF] | 550 // to exclude E0[80-9F][80-BF] |
| 551 overlong = true; | 551 overlong = true; |
| 552 olupper = 0x9F; | 552 olupper = 0x9F; |
| 553 } else if (c == 0xED) { | 553 } else if (c == 0xED) { |
| 554 // ED[A0-BF][80-BF]: surrogate codepoint | 554 // ED[A0-BF][80-BF]: surrogate codepoint |
| 555 surrogate = true; | 555 surrogate = true; |
| 556 slower = 0xA0; | 556 slower = 0xA0; |
| 557 } else if (c == 0xEF) { | 557 } else if (c == 0xEF) { |
| 558 // EF BF [BE-BF] : non-character | 558 // EF BF [BE-BF] : non-character |
| 559 // TODO(jungshik): EF B7 [90-AF] should be checked as well. |
| 559 nonchar = true; | 560 nonchar = true; |
| 560 } | 561 } |
| 561 } else if (c <= 0xF4) { | 562 } else if (c <= 0xF4) { |
| 562 positions_left = 3; | 563 positions_left = 3; |
| 563 nonchar = true; | 564 nonchar = true; |
| 564 if (c == 0xF0) { | 565 if (c == 0xF0) { |
| 565 // to exclude F0[80-8F][80-BF]{2} | 566 // to exclude F0[80-8F][80-BF]{2} |
| 566 overlong = true; | 567 overlong = true; |
| 567 olupper = 0x8F; | 568 olupper = 0x8F; |
| 568 } else if (c == 0xF4) { | 569 } else if (c == 0xF4) { |
| (...skipping 23 matching lines...) Expand all Loading... |
| 592 if (!IsInUTF8Sequence(c) || (overlong && c <= olupper) || | 593 if (!IsInUTF8Sequence(c) || (overlong && c <= olupper) || |
| 593 (surrogate && slower <= c) || (nonchar && !positions_left) ) { | 594 (surrogate && slower <= c) || (nonchar && !positions_left) ) { |
| 594 return false; | 595 return false; |
| 595 } | 596 } |
| 596 overlong = surrogate = false; | 597 overlong = surrogate = false; |
| 597 } | 598 } |
| 598 } | 599 } |
| 599 return true; | 600 return true; |
| 600 } | 601 } |
| 601 | 602 |
| 602 bool IsStringUTF8(const char* str) { | 603 bool IsStringUTF8(const std::string& str) { |
| 603 return IsStringUTF8T(str); | 604 return IsStringUTF8T(str.data(), str.length()); |
| 604 } | 605 } |
| 605 | 606 |
| 606 bool IsStringWideUTF8(const wchar_t* str) { | 607 bool IsStringWideUTF8(const std::wstring& str) { |
| 607 return IsStringUTF8T(str); | 608 return IsStringUTF8T(str.data(), str.length()); |
| 608 } | 609 } |
| 609 | 610 |
| 610 template<typename Iter> | 611 template<typename Iter> |
| 611 static inline bool DoLowerCaseEqualsASCII(Iter a_begin, | 612 static inline bool DoLowerCaseEqualsASCII(Iter a_begin, |
| 612 Iter a_end, | 613 Iter a_end, |
| 613 const char* b) { | 614 const char* b) { |
| 614 for (Iter it = a_begin; it != a_end; ++it, ++b) { | 615 for (Iter it = a_begin; it != a_end; ++it, ++b) { |
| 615 if (!*b || ToLowerASCII(*it) != *b) | 616 if (!*b || ToLowerASCII(*it) != *b) |
| 616 return false; | 617 return false; |
| 617 } | 618 } |
| (...skipping 808 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1426 | 1427 |
| 1427 } // namespace | 1428 } // namespace |
| 1428 | 1429 |
| 1429 size_t base::strlcpy(char* dst, const char* src, size_t dst_size) { | 1430 size_t base::strlcpy(char* dst, const char* src, size_t dst_size) { |
| 1430 return lcpyT<char>(dst, src, dst_size); | 1431 return lcpyT<char>(dst, src, dst_size); |
| 1431 } | 1432 } |
| 1432 size_t base::wcslcpy(wchar_t* dst, const wchar_t* src, size_t dst_size) { | 1433 size_t base::wcslcpy(wchar_t* dst, const wchar_t* src, size_t dst_size) { |
| 1433 return lcpyT<wchar_t>(dst, src, dst_size); | 1434 return lcpyT<wchar_t>(dst, src, dst_size); |
| 1434 } | 1435 } |
| 1435 | 1436 |
| OLD | NEW |