base/string_util.cc - Issue 4268: IsStringUTF8 unittest and enforcing UTF-8 in JSON deserialization

Side by Side Diff: base/string_util.cc

Issue 4268: IsStringUTF8 unittest and enforcing UTF-8 in JSON deserialization (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/

Patch Set: '' Created 12 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "base/string_util.h"	5 #include "base/string_util.h"

6	6

7 #include <ctype.h>	7 #include <ctype.h>

8 #include <errno.h>	8 #include <errno.h>

9 #include <math.h>	9 #include <math.h>

10 #include <stdarg.h>	10 #include <stdarg.h>

(...skipping 499 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
510 // the Initial Developer. All Rights Reserved.	510 // the Initial Developer. All Rights Reserved.

511 //	511 //

512 // Contributor(s):	512 // Contributor(s):

513 // Scott Collins <scc@mozilla.org> (original author)	513 // Scott Collins <scc@mozilla.org> (original author)

514 //	514 //

515 // This is a template so that it can be run on wide and 8-bit strings. We want	515 // This is a template so that it can be run on wide and 8-bit strings. We want

516 // to run it on wide strings when we have input that we think may have	516 // to run it on wide strings when we have input that we think may have

517 // originally been UTF-8, but has been converted to wide characters because	517 // originally been UTF-8, but has been converted to wide characters because

518 // that's what we (and Windows) use internally.	518 // that's what we (and Windows) use internally.

519 template<typename CHAR>	519 template<typename CHAR>

520 static bool IsStringUTF8T(const CHAR* str) {	520 static bool IsStringUTF8T(const CHAR* str, int length) {

521 bool overlong = false;	521 bool overlong = false;

522 bool surrogate = false;	522 bool surrogate = false;

523 bool nonchar = false;	523 bool nonchar = false;

524	524

525 // overlong byte upper bound	525 // overlong byte upper bound

526 typename ToUnsigned<CHAR>::Unsigned olupper = 0;	526 typename ToUnsigned<CHAR>::Unsigned olupper = 0;

527	527

528 // surrogate byte lower bound	528 // surrogate byte lower bound

529 typename ToUnsigned<CHAR>::Unsigned slower = 0;	529 typename ToUnsigned<CHAR>::Unsigned slower = 0;

530	530

531 // incremented when inside a multi-byte char to indicate how many bytes	531 // incremented when inside a multi-byte char to indicate how many bytes

532 // are left in the sequence	532 // are left in the sequence

533 int positions_left = 0;	533 int positions_left = 0;

534	534

535 for (int i = 0; str[i] != 0; i++) {	535 for (int i = 0; i < length; i++) {

536 // This whole function assume an unsigned value so force its conversion to	536 // This whole function assume an unsigned value so force its conversion to

537 // an unsigned value.	537 // an unsigned value.

538 typename ToUnsigned<CHAR>::Unsigned c = str[i];	538 typename ToUnsigned<CHAR>::Unsigned c = str[i];

539 if (c < 0x80)	539 if (c < 0x80)

540 continue; // ASCII	540 continue; // ASCII

541	541

542 if (c <= 0xC1) {	542 if (c <= 0xC1) {

543 // [80-BF] where not expected, [C0-C1] for overlong	543 // [80-BF] where not expected, [C0-C1] for overlong

544 return false;	544 return false;

545 } else if (IsBegin2ByteUTF8(c)) {	545 } else if (IsBegin2ByteUTF8(c)) {

546 positions_left = 1;	546 positions_left = 1;

547 } else if (IsBegin3ByteUTF8(c)) {	547 } else if (IsBegin3ByteUTF8(c)) {

548 positions_left = 2;	548 positions_left = 2;

549 if (c == 0xE0) {	549 if (c == 0xE0) {

550 // to exclude E0[80-9F][80-BF]	550 // to exclude E0[80-9F][80-BF]

551 overlong = true;	551 overlong = true;

552 olupper = 0x9F;	552 olupper = 0x9F;

553 } else if (c == 0xED) {	553 } else if (c == 0xED) {

554 // ED[A0-BF][80-BF]: surrogate codepoint	554 // ED[A0-BF][80-BF]: surrogate codepoint

555 surrogate = true;	555 surrogate = true;

556 slower = 0xA0;	556 slower = 0xA0;

557 } else if (c == 0xEF) {	557 } else if (c == 0xEF) {

558 // EF BF [BE-BF] : non-character	558 // EF BF [BE-BF] : non-character

	559 // TODO(jungshik): EF B7 [90-AF] should be checked as well.

559 nonchar = true;	560 nonchar = true;

560 }	561 }

561 } else if (c <= 0xF4) {	562 } else if (c <= 0xF4) {

562 positions_left = 3;	563 positions_left = 3;

563 nonchar = true;	564 nonchar = true;

564 if (c == 0xF0) {	565 if (c == 0xF0) {

565 // to exclude F0[80-8F][80-BF]{2}	566 // to exclude F0[80-8F][80-BF]{2}

566 overlong = true;	567 overlong = true;

567 olupper = 0x8F;	568 olupper = 0x8F;

568 } else if (c == 0xF4) {	569 } else if (c == 0xF4) {

(...skipping 23 matching lines...) Expand all Loading...
592 if (!IsInUTF8Sequence(c) || (overlong && c <= olupper) ||	593 if (!IsInUTF8Sequence(c) || (overlong && c <= olupper) ||

593 (surrogate && slower <= c) || (nonchar && !positions_left) ) {	594 (surrogate && slower <= c) || (nonchar && !positions_left) ) {

594 return false;	595 return false;

595 }	596 }

596 overlong = surrogate = false;	597 overlong = surrogate = false;

597 }	598 }

598 }	599 }

599 return true;	600 return true;

600 }	601 }

601	602

602 bool IsStringUTF8(const char* str) {	603 bool IsStringUTF8(const std::string& str) {

603 return IsStringUTF8T(str);	604 return IsStringUTF8T(str.data(), str.length());

604 }	605 }

605	606

606 bool IsStringWideUTF8(const wchar_t* str) {	607 bool IsStringWideUTF8(const std::wstring& str) {

607 return IsStringUTF8T(str);	608 return IsStringUTF8T(str.data(), str.length());

608 }	609 }

609	610

610 template<typename Iter>	611 template<typename Iter>

611 static inline bool DoLowerCaseEqualsASCII(Iter a_begin,	612 static inline bool DoLowerCaseEqualsASCII(Iter a_begin,

612 Iter a_end,	613 Iter a_end,

613 const char* b) {	614 const char* b) {

614 for (Iter it = a_begin; it != a_end; ++it, ++b) {	615 for (Iter it = a_begin; it != a_end; ++it, ++b) {

615 if (!b || ToLowerASCII(it) != *b)	616 if (!b || ToLowerASCII(it) != *b)

616 return false;	617 return false;

617 }	618 }

(...skipping 808 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
1426	1427

1427 } // namespace	1428 } // namespace

1428	1429

1429 size_t base::strlcpy(char* dst, const char* src, size_t dst_size) {	1430 size_t base::strlcpy(char* dst, const char* src, size_t dst_size) {

1430 return lcpyT<char>(dst, src, dst_size);	1431 return lcpyT<char>(dst, src, dst_size);

1431 }	1432 }

1432 size_t base::wcslcpy(wchar_t* dst, const wchar_t* src, size_t dst_size) {	1433 size_t base::wcslcpy(wchar_t* dst, const wchar_t* src, size_t dst_size) {

1433 return lcpyT<wchar_t>(dst, src, dst_size);	1434 return lcpyT<wchar_t>(dst, src, dst_size);

1434 }	1435 }

1435	1436

OLD	NEW

« no previous file with comments | « base/string_util.h ('k') | base/string_util_unittest.cc » ('j') | no next file with comments »