url/url_canon_icu.cc - Issue 13821004: Move googleurl into the Chrome repo.

Side by Side Diff: url/url_canon_icu.cc

Issue 13821004: Move googleurl into the Chrome repo. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Created 7 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 // Copyright 2011, Google Inc.

	2 // All rights reserved.

	3 //

	4 // Redistribution and use in source and binary forms, with or without

	5 // modification, are permitted provided that the following conditions are

	6 // met:

	7 //

	8 // * Redistributions of source code must retain the above copyright

	9 // notice, this list of conditions and the following disclaimer.

	10 // * Redistributions in binary form must reproduce the above

	11 // copyright notice, this list of conditions and the following disclaimer

	12 // in the documentation and/or other materials provided with the

	13 // distribution.

	14 // * Neither the name of Google Inc. nor the names of its

	15 // contributors may be used to endorse or promote products derived from

	16 // this software without specific prior written permission.

	17 //

	18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

	19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

	20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

	21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT

	22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

	23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT

	24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,

	25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY

	26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

	27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

	28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

	29

	30 // ICU integration functions.

	31

	32 #include <stdlib.h>

	33 #include <string.h>

	34 #include <unicode/ucnv.h>

	35 #include <unicode/ucnv_cb.h>

	36 #include <unicode/uidna.h>

	37

	38 #include "googleurl/src/url_canon_icu.h"

	39 #include "googleurl/src/url_canon_internal.h" // for _itoa_s

	40

	41 #include "base/logging.h"

	42

	43 namespace url_canon {

	44

	45 namespace {

	46

	47 // Called when converting a character that can not be represented, this will

	48 // append an escaped version of the numerical character reference for that code

	49 // point. It is of the form "Ӓ" and we will escape the non-digits to

	50 // "%26%231234%3B". Why? This is what Netscape did back in the olden days.

	51 void appendURLEscapedChar(const void* context,

	52 UConverterFromUnicodeArgs* from_args,

	53 const UChar* code_units,

	54 int32_t length,

	55 UChar32 code_point,

	56 UConverterCallbackReason reason,

	57 UErrorCode* err) {

	58 if (reason == UCNV_UNASSIGNED) {

	59 *err = U_ZERO_ERROR;

	60

	61 const static int prefix_len = 6;

	62 const static char prefix[prefix_len + 1] = "%26%23"; // "&#" percent-escape d

	63 ucnv_cbFromUWriteBytes(from_args, prefix, prefix_len, 0, err);

	64

	65 DCHECK(code_point < 0x110000);

	66 char number[8]; // Max Unicode code point is 7 digits.

	67 _itoa_s(code_point, number, 10);

	68 int number_len = static_cast<int>(strlen(number));

	69 ucnv_cbFromUWriteBytes(from_args, number, number_len, 0, err);

	70

	71 const static int postfix_len = 3;

	72 const static char postfix[postfix_len + 1] = "%3B"; // ";" percent-escaped

	73 ucnv_cbFromUWriteBytes(from_args, postfix, postfix_len, 0, err);

	74 }

	75 }

	76

	77 // A class for scoping the installation of the invalid character callback.

	78 class AppendHandlerInstaller {

	79 public:

	80 // The owner of this object must ensure that the converter is alive for the

	81 // duration of this object's lifetime.

	82 AppendHandlerInstaller(UConverter* converter) : converter_(converter) {

	83 UErrorCode err = U_ZERO_ERROR;

	84 ucnv_setFromUCallBack(converter_, appendURLEscapedChar, 0,

	85 &old_callback_, &old_context_, &err);

	86 }

	87

	88 ~AppendHandlerInstaller() {

	89 UErrorCode err = U_ZERO_ERROR;

	90 ucnv_setFromUCallBack(converter_, old_callback_, old_context_, 0, 0, &err);

	91 }

	92

	93 private:

	94 UConverter* converter_;

	95

	96 UConverterFromUCallback old_callback_;

	97 const void* old_context_;

	98 };

	99

	100 } // namespace

	101

	102 ICUCharsetConverter::ICUCharsetConverter(UConverter* converter)

	103 : converter_(converter) {

	104 }

	105

	106 ICUCharsetConverter::~ICUCharsetConverter() {

	107 }

	108

	109 void ICUCharsetConverter::ConvertFromUTF16(const char16* input,

	110 int input_len,

	111 CanonOutput* output) {

	112 // Install our error handler. It will be called for character that can not

	113 // be represented in the destination character set.

	114 AppendHandlerInstaller handler(converter_);

	115

	116 int begin_offset = output->length();

	117 int dest_capacity = output->capacity() - begin_offset;

	118 output->set_length(output->length());

	119

	120 do {

	121 UErrorCode err = U_ZERO_ERROR;

	122 char* dest = &output->data()[begin_offset];

	123 int required_capacity = ucnv_fromUChars(converter_, dest, dest_capacity,

	124 input, input_len, &err);

	125 if (err != U_BUFFER_OVERFLOW_ERROR) {

	126 output->set_length(begin_offset + required_capacity);

	127 return;

	128 }

	129

	130 // Output didn't fit, expand

	131 dest_capacity = required_capacity;

	132 output->Resize(begin_offset + dest_capacity);

	133 } while (true);

	134 }

	135

	136 // Converts the Unicode input representing a hostname to ASCII using IDN rules.

	137 // The output must be ASCII, but is represented as wide characters.

	138 //

	139 // On success, the output will be filled with the ASCII host name and it will

	140 // return true. Unlike most other canonicalization functions, this assumes that

	141 // the output is empty. The beginning of the host will be at offset 0, and

	142 // the length of the output will be set to the length of the new host name.

	143 //

	144 // On error, this will return false. The output in this case is undefined.

	145 bool IDNToASCII(const char16* src, int src_len, CanonOutputW* output) {

	146 DCHECK(output->length() == 0); // Output buffer is assumed empty.

	147 while (true) {

	148 // Use ALLOW_UNASSIGNED to be more tolerant of hostnames that violate

	149 // the spec (which do exist). This does not present any risk and is a

	150 // little more future proof.

	151 UErrorCode err = U_ZERO_ERROR;

	152 int num_converted = uidna_IDNToASCII(src, src_len, output->data(),

	153 output->capacity(),

	154 UIDNA_ALLOW_UNASSIGNED, NULL, &err);

	155 if (err == U_ZERO_ERROR) {

	156 output->set_length(num_converted);

	157 return true;

	158 }

	159 if (err != U_BUFFER_OVERFLOW_ERROR)

	160 return false; // Unknown error, give up.

	161

	162 // Not enough room in our buffer, expand.

	163 output->Resize(output->capacity() * 2);

	164 }

	165 }

	166

	167 bool ReadUTFChar(const char* str, int* begin, int length,

	168 unsigned* code_point_out) {

	169 int code_point; // Avoids warning when U8_NEXT writes -1 to it.

	170 U8_NEXT(str, *begin, length, code_point);

	171 *code_point_out = static_cast<unsigned>(code_point);

	172

	173 // The ICU macro above moves to the next char, we want to point to the last

	174 // char consumed.

	175 (*begin)--;

	176

	177 // Validate the decoded value.

	178 if (U_IS_UNICODE_CHAR(code_point))

	179 return true;

	180 *code_point_out = kUnicodeReplacementCharacter;

	181 return false;

	182 }

	183

	184 bool ReadUTFChar(const char16* str, int* begin, int length,

	185 unsigned* code_point) {

	186 if (U16_IS_SURROGATE(str[*begin])) {

	187 if (!U16_IS_SURROGATE_LEAD(str[begin]) \|\| begin + 1 >= length \|\|

	188 !U16_IS_TRAIL(str[*begin + 1])) {

	189 // Invalid surrogate pair.

	190 *code_point = kUnicodeReplacementCharacter;

	191 return false;

	192 } else {

	193 // Valid surrogate pair.

	194 code_point = U16_GET_SUPPLEMENTARY(str[begin], str[*begin + 1]);

	195 (*begin)++;

	196 }

	197 } else {

	198 // Not a surrogate, just one 16-bit word.

	199 code_point = str[begin];

	200 }

	201

	202 if (U_IS_UNICODE_CHAR(*code_point))

	203 return true;

	204

	205 // Invalid code point.

	206 *code_point = kUnicodeReplacementCharacter;

	207 return false;

	208 }

	209

	210 } // namespace url_canon

OLD	NEW

« no previous file with comments | « url/url_canon_icu.h ('k') | url/url_canon_internal.h » ('j') | no next file with comments »