Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(338)

Side by Side Diff: base/string_util.cc

Issue 4268: IsStringUTF8 unittest and enforcing UTF-8 in JSON deserialization (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/
Patch Set: '' Created 12 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « base/string_util.h ('k') | base/string_util_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "base/string_util.h" 5 #include "base/string_util.h"
6 6
7 #include <ctype.h> 7 #include <ctype.h>
8 #include <errno.h> 8 #include <errno.h>
9 #include <math.h> 9 #include <math.h>
10 #include <stdarg.h> 10 #include <stdarg.h>
(...skipping 499 matching lines...) Expand 10 before | Expand all | Expand 10 after
510 // the Initial Developer. All Rights Reserved. 510 // the Initial Developer. All Rights Reserved.
511 // 511 //
512 // Contributor(s): 512 // Contributor(s):
513 // Scott Collins <scc@mozilla.org> (original author) 513 // Scott Collins <scc@mozilla.org> (original author)
514 // 514 //
515 // This is a template so that it can be run on wide and 8-bit strings. We want 515 // This is a template so that it can be run on wide and 8-bit strings. We want
516 // to run it on wide strings when we have input that we think may have 516 // to run it on wide strings when we have input that we think may have
517 // originally been UTF-8, but has been converted to wide characters because 517 // originally been UTF-8, but has been converted to wide characters because
518 // that's what we (and Windows) use internally. 518 // that's what we (and Windows) use internally.
519 template<typename CHAR> 519 template<typename CHAR>
520 static bool IsStringUTF8T(const CHAR* str) { 520 static bool IsStringUTF8T(const CHAR* str, int length) {
521 bool overlong = false; 521 bool overlong = false;
522 bool surrogate = false; 522 bool surrogate = false;
523 bool nonchar = false; 523 bool nonchar = false;
524 524
525 // overlong byte upper bound 525 // overlong byte upper bound
526 typename ToUnsigned<CHAR>::Unsigned olupper = 0; 526 typename ToUnsigned<CHAR>::Unsigned olupper = 0;
527 527
528 // surrogate byte lower bound 528 // surrogate byte lower bound
529 typename ToUnsigned<CHAR>::Unsigned slower = 0; 529 typename ToUnsigned<CHAR>::Unsigned slower = 0;
530 530
531 // incremented when inside a multi-byte char to indicate how many bytes 531 // incremented when inside a multi-byte char to indicate how many bytes
532 // are left in the sequence 532 // are left in the sequence
533 int positions_left = 0; 533 int positions_left = 0;
534 534
535 for (int i = 0; str[i] != 0; i++) { 535 for (int i = 0; i < length; i++) {
536 // This whole function assumes an unsigned value, so force its conversion to 536 // This whole function assumes an unsigned value, so force its conversion to
537 // an unsigned value. 537 // an unsigned value.
538 typename ToUnsigned<CHAR>::Unsigned c = str[i]; 538 typename ToUnsigned<CHAR>::Unsigned c = str[i];
539 if (c < 0x80) 539 if (c < 0x80)
540 continue; // ASCII 540 continue; // ASCII
541 541
542 if (c <= 0xC1) { 542 if (c <= 0xC1) {
543 // [80-BF] where not expected, [C0-C1] for overlong 543 // [80-BF] where not expected, [C0-C1] for overlong
544 return false; 544 return false;
545 } else if (IsBegin2ByteUTF8(c)) { 545 } else if (IsBegin2ByteUTF8(c)) {
546 positions_left = 1; 546 positions_left = 1;
547 } else if (IsBegin3ByteUTF8(c)) { 547 } else if (IsBegin3ByteUTF8(c)) {
548 positions_left = 2; 548 positions_left = 2;
549 if (c == 0xE0) { 549 if (c == 0xE0) {
550 // to exclude E0[80-9F][80-BF] 550 // to exclude E0[80-9F][80-BF]
551 overlong = true; 551 overlong = true;
552 olupper = 0x9F; 552 olupper = 0x9F;
553 } else if (c == 0xED) { 553 } else if (c == 0xED) {
554 // ED[A0-BF][80-BF]: surrogate codepoint 554 // ED[A0-BF][80-BF]: surrogate codepoint
555 surrogate = true; 555 surrogate = true;
556 slower = 0xA0; 556 slower = 0xA0;
557 } else if (c == 0xEF) { 557 } else if (c == 0xEF) {
558 // EF BF [BE-BF] : non-character 558 // EF BF [BE-BF] : non-character
559 // TODO(jungshik): EF B7 [90-AF] should be checked as well.
559 nonchar = true; 560 nonchar = true;
560 } 561 }
561 } else if (c <= 0xF4) { 562 } else if (c <= 0xF4) {
562 positions_left = 3; 563 positions_left = 3;
563 nonchar = true; 564 nonchar = true;
564 if (c == 0xF0) { 565 if (c == 0xF0) {
565 // to exclude F0[80-8F][80-BF]{2} 566 // to exclude F0[80-8F][80-BF]{2}
566 overlong = true; 567 overlong = true;
567 olupper = 0x8F; 568 olupper = 0x8F;
568 } else if (c == 0xF4) { 569 } else if (c == 0xF4) {
(...skipping 23 matching lines...) Expand all
592 if (!IsInUTF8Sequence(c) || (overlong && c <= olupper) || 593 if (!IsInUTF8Sequence(c) || (overlong && c <= olupper) ||
593 (surrogate && slower <= c) || (nonchar && !positions_left) ) { 594 (surrogate && slower <= c) || (nonchar && !positions_left) ) {
594 return false; 595 return false;
595 } 596 }
596 overlong = surrogate = false; 597 overlong = surrogate = false;
597 } 598 }
598 } 599 }
599 return true; 600 return true;
600 } 601 }
601 602
602 bool IsStringUTF8(const char* str) { 603 bool IsStringUTF8(const std::string& str) {
603 return IsStringUTF8T(str); 604 return IsStringUTF8T(str.data(), str.length());
604 } 605 }
605 606
606 bool IsStringWideUTF8(const wchar_t* str) { 607 bool IsStringWideUTF8(const std::wstring& str) {
607 return IsStringUTF8T(str); 608 return IsStringUTF8T(str.data(), str.length());
608 } 609 }
609 610
610 template<typename Iter> 611 template<typename Iter>
611 static inline bool DoLowerCaseEqualsASCII(Iter a_begin, 612 static inline bool DoLowerCaseEqualsASCII(Iter a_begin,
612 Iter a_end, 613 Iter a_end,
613 const char* b) { 614 const char* b) {
614 for (Iter it = a_begin; it != a_end; ++it, ++b) { 615 for (Iter it = a_begin; it != a_end; ++it, ++b) {
615 if (!*b || ToLowerASCII(*it) != *b) 616 if (!*b || ToLowerASCII(*it) != *b)
616 return false; 617 return false;
617 } 618 }
(...skipping 808 matching lines...) Expand 10 before | Expand all | Expand 10 after
1426 1427
1427 } // namespace 1428 } // namespace
1428 1429
1429 size_t base::strlcpy(char* dst, const char* src, size_t dst_size) { 1430 size_t base::strlcpy(char* dst, const char* src, size_t dst_size) {
1430 return lcpyT<char>(dst, src, dst_size); 1431 return lcpyT<char>(dst, src, dst_size);
1431 } 1432 }
1432 size_t base::wcslcpy(wchar_t* dst, const wchar_t* src, size_t dst_size) { 1433 size_t base::wcslcpy(wchar_t* dst, const wchar_t* src, size_t dst_size) {
1433 return lcpyT<wchar_t>(dst, src, dst_size); 1434 return lcpyT<wchar_t>(dst, src, dst_size);
1434 } 1435 }
1435 1436
OLDNEW
« no previous file with comments | « base/string_util.h ('k') | base/string_util_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698