OLD | NEW |
1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "base/string_util.h" | 5 #include "base/string_util.h" |
6 | 6 |
7 #include <ctype.h> | 7 #include <ctype.h> |
8 #include <errno.h> | 8 #include <errno.h> |
9 #include <math.h> | 9 #include <math.h> |
10 #include <stdarg.h> | 10 #include <stdarg.h> |
(...skipping 499 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
510 // the Initial Developer. All Rights Reserved. | 510 // the Initial Developer. All Rights Reserved. |
511 // | 511 // |
512 // Contributor(s): | 512 // Contributor(s): |
513 // Scott Collins <scc@mozilla.org> (original author) | 513 // Scott Collins <scc@mozilla.org> (original author) |
514 // | 514 // |
515 // This is a template so that it can be run on wide and 8-bit strings. We want | 515 // This is a template so that it can be run on wide and 8-bit strings. We want |
516 // to run it on wide strings when we have input that we think may have | 516 // to run it on wide strings when we have input that we think may have |
517 // originally been UTF-8, but has been converted to wide characters because | 517 // originally been UTF-8, but has been converted to wide characters because |
518 // that's what we (and Windows) use internally. | 518 // that's what we (and Windows) use internally. |
519 template<typename CHAR> | 519 template<typename CHAR> |
520 static bool IsStringUTF8T(const CHAR* str) { | 520 static bool IsStringUTF8T(const CHAR* str, int length) { |
521 bool overlong = false; | 521 bool overlong = false; |
522 bool surrogate = false; | 522 bool surrogate = false; |
523 bool nonchar = false; | 523 bool nonchar = false; |
524 | 524 |
525 // overlong byte upper bound | 525 // overlong byte upper bound |
526 typename ToUnsigned<CHAR>::Unsigned olupper = 0; | 526 typename ToUnsigned<CHAR>::Unsigned olupper = 0; |
527 | 527 |
528 // surrogate byte lower bound | 528 // surrogate byte lower bound |
529 typename ToUnsigned<CHAR>::Unsigned slower = 0; | 529 typename ToUnsigned<CHAR>::Unsigned slower = 0; |
530 | 530 |
531 // incremented when inside a multi-byte char to indicate how many bytes | 531 // incremented when inside a multi-byte char to indicate how many bytes |
532 // are left in the sequence | 532 // are left in the sequence |
533 int positions_left = 0; | 533 int positions_left = 0; |
534 | 534 |
535 for (int i = 0; str[i] != 0; i++) { | 535 for (int i = 0; i < length; i++) { |
536 // This whole function assumes an unsigned value so force its conversion to | 536 // This whole function assumes an unsigned value so force its conversion to |
537 // an unsigned value. | 537 // an unsigned value. |
538 typename ToUnsigned<CHAR>::Unsigned c = str[i]; | 538 typename ToUnsigned<CHAR>::Unsigned c = str[i]; |
539 if (c < 0x80) | 539 if (c < 0x80) |
540 continue; // ASCII | 540 continue; // ASCII |
541 | 541 |
542 if (c <= 0xC1) { | 542 if (c <= 0xC1) { |
543 // [80-BF] where not expected, [C0-C1] for overlong | 543 // [80-BF] where not expected, [C0-C1] for overlong |
544 return false; | 544 return false; |
545 } else if (IsBegin2ByteUTF8(c)) { | 545 } else if (IsBegin2ByteUTF8(c)) { |
546 positions_left = 1; | 546 positions_left = 1; |
547 } else if (IsBegin3ByteUTF8(c)) { | 547 } else if (IsBegin3ByteUTF8(c)) { |
548 positions_left = 2; | 548 positions_left = 2; |
549 if (c == 0xE0) { | 549 if (c == 0xE0) { |
550 // to exclude E0[80-9F][80-BF] | 550 // to exclude E0[80-9F][80-BF] |
551 overlong = true; | 551 overlong = true; |
552 olupper = 0x9F; | 552 olupper = 0x9F; |
553 } else if (c == 0xED) { | 553 } else if (c == 0xED) { |
554 // ED[A0-BF][80-BF]: surrogate codepoint | 554 // ED[A0-BF][80-BF]: surrogate codepoint |
555 surrogate = true; | 555 surrogate = true; |
556 slower = 0xA0; | 556 slower = 0xA0; |
557 } else if (c == 0xEF) { | 557 } else if (c == 0xEF) { |
558 // EF BF [BE-BF] : non-character | 558 // EF BF [BE-BF] : non-character |
| 559 // TODO(jungshik): EF B7 [90-AF] should be checked as well. |
559 nonchar = true; | 560 nonchar = true; |
560 } | 561 } |
561 } else if (c <= 0xF4) { | 562 } else if (c <= 0xF4) { |
562 positions_left = 3; | 563 positions_left = 3; |
563 nonchar = true; | 564 nonchar = true; |
564 if (c == 0xF0) { | 565 if (c == 0xF0) { |
565 // to exclude F0[80-8F][80-BF]{2} | 566 // to exclude F0[80-8F][80-BF]{2} |
566 overlong = true; | 567 overlong = true; |
567 olupper = 0x8F; | 568 olupper = 0x8F; |
568 } else if (c == 0xF4) { | 569 } else if (c == 0xF4) { |
(...skipping 23 matching lines...) Expand all Loading... |
592 if (!IsInUTF8Sequence(c) || (overlong && c <= olupper) || | 593 if (!IsInUTF8Sequence(c) || (overlong && c <= olupper) || |
593 (surrogate && slower <= c) || (nonchar && !positions_left) ) { | 594 (surrogate && slower <= c) || (nonchar && !positions_left) ) { |
594 return false; | 595 return false; |
595 } | 596 } |
596 overlong = surrogate = false; | 597 overlong = surrogate = false; |
597 } | 598 } |
598 } | 599 } |
599 return true; | 600 return true; |
600 } | 601 } |
601 | 602 |
// 8-bit entry point for the UTF-8 validity check; returns whatever
// IsStringUTF8T returns for the given input.
// Diff view (OLD | NEW): OLD takes a raw const char* and forwards only the
// pointer; NEW takes a const std::string& and forwards str.data() together
// with str.length(), matching the new (str, length) signature of IsStringUTF8T.
602 bool IsStringUTF8(const char* str) { | 603 bool IsStringUTF8(const std::string& str) { |
603 return IsStringUTF8T(str); | 604 return IsStringUTF8T(str.data(), str.length()); |
604 } | 605 } |
605 | 606 |
// Wide-character entry point for the same check: each wchar_t element is run
// through IsStringUTF8T, and its result is returned unchanged.
// Diff view (OLD | NEW): OLD takes a raw const wchar_t* and forwards only the
// pointer; NEW takes a const std::wstring& and forwards str.data() together
// with str.length(), matching the new (str, length) signature of IsStringUTF8T.
606 bool IsStringWideUTF8(const wchar_t* str) { | 607 bool IsStringWideUTF8(const std::wstring& str) { |
607 return IsStringUTF8T(str); | 608 return IsStringUTF8T(str.data(), str.length()); |
608 } | 609 } |
609 | 610 |
610 template<typename Iter> | 611 template<typename Iter> |
611 static inline bool DoLowerCaseEqualsASCII(Iter a_begin, | 612 static inline bool DoLowerCaseEqualsASCII(Iter a_begin, |
612 Iter a_end, | 613 Iter a_end, |
613 const char* b) { | 614 const char* b) { |
614 for (Iter it = a_begin; it != a_end; ++it, ++b) { | 615 for (Iter it = a_begin; it != a_end; ++it, ++b) { |
615 if (!*b || ToLowerASCII(*it) != *b) | 616 if (!*b || ToLowerASCII(*it) != *b) |
616 return false; | 617 return false; |
617 } | 618 } |
(...skipping 808 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1426 | 1427 |
1427 } // namespace | 1428 } // namespace |
1428 | 1429 |
// char flavor of the bounded copy: forwards dst, src and dst_size to the
// lcpyT<char> template and returns its size_t result. Unchanged between the
// OLD and NEW columns.
// NOTE(review): lcpyT is defined outside this view; presumably it follows
// BSD strlcpy semantics -- confirm against its definition.
1429 size_t base::strlcpy(char* dst, const char* src, size_t dst_size) { | 1430 size_t base::strlcpy(char* dst, const char* src, size_t dst_size) { |
1430 return lcpyT<char>(dst, src, dst_size); | 1431 return lcpyT<char>(dst, src, dst_size); |
1431 } | 1432 } |
// wchar_t flavor of the bounded copy: forwards dst, src and dst_size to the
// lcpyT<wchar_t> template and returns its size_t result. Unchanged between
// the OLD and NEW columns.
// NOTE(review): dst_size is presumably counted in wchar_t elements rather
// than bytes -- confirm against lcpyT, which is defined outside this view.
1432 size_t base::wcslcpy(wchar_t* dst, const wchar_t* src, size_t dst_size) { | 1433 size_t base::wcslcpy(wchar_t* dst, const wchar_t* src, size_t dst_size) { |
1433 return lcpyT<wchar_t>(dst, src, dst_size); | 1434 return lcpyT<wchar_t>(dst, src, dst_size); |
1434 } | 1435 } |
1435 | 1436 |
OLD | NEW |