OLD | NEW |
1 // Copyright 2007, Google Inc. | 1 // Copyright 2007, Google Inc. |
2 // All rights reserved. | 2 // All rights reserved. |
3 // | 3 // |
4 // Redistribution and use in source and binary forms, with or without | 4 // Redistribution and use in source and binary forms, with or without |
5 // modification, are permitted provided that the following conditions are | 5 // modification, are permitted provided that the following conditions are |
6 // met: | 6 // met: |
7 // | 7 // |
8 // * Redistributions of source code must retain the above copyright | 8 // * Redistributions of source code must retain the above copyright |
9 // notice, this list of conditions and the following disclaimer. | 9 // notice, this list of conditions and the following disclaimer. |
10 // * Redistributions in binary form must reproduce the above | 10 // * Redistributions in binary form must reproduce the above |
(...skipping 145 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
156 // UTF-8 functions ------------------------------------------------------------ | 156 // UTF-8 functions ------------------------------------------------------------ |
157 | 157 |
158 // Reads one character in UTF-8 starting at |*begin| in |str| and places | 158 // Reads one character in UTF-8 starting at |*begin| in |str| and places |
159 // the decoded value into |*code_point_out|. If the character is valid, we | 159 // the decoded value into |*code_point_out|. If the character is valid, we |
160 // will return true. If invalid, we'll return false and put the | 160 // will return true. If invalid, we'll return false and put the |
161 // kUnicodeReplacementCharacter into |*code_point_out|. | 161 // kUnicodeReplacementCharacter into |*code_point_out|. |
162 // | 162 // |
163 // |*begin| will be updated to point to the last character consumed so it | 163 // |*begin| will be updated to point to the last character consumed so it |
164 // can be incremented in a loop and will be ready for the next character. | 164 // can be incremented in a loop and will be ready for the next character. |
165 // (for a single-byte ASCII character, it will not be changed). | 165 // (for a single-byte ASCII character, it will not be changed). |
166 inline bool ReadUTFChar(const char* str, int* begin, int length, | 166 // |
167 unsigned* code_point_out) { | 167 // Implementation is in url_canon_icu.cc. |
168 int code_point; // Avoids warning when U8_NEXT writes -1 to it. | 168 bool ReadUTFChar(const char* str, int* begin, int length, |
169 U8_NEXT(str, *begin, length, code_point); | 169 unsigned* code_point_out); |
170 *code_point_out = static_cast<unsigned>(code_point); | |
171 | |
172 // The ICU macro above moves to the next char, we want to point to the last | |
173 // char consumed. | |
174 (*begin)--; | |
175 | |
176 // Validate the decoded value. | |
177 if (U_IS_UNICODE_CHAR(code_point)) | |
178 return true; | |
179 *code_point_out = kUnicodeReplacementCharacter; | |
180 return false; | |
181 } | |
182 | 170 |
183 // Generic To-UTF-8 converter. This will call the given append method for each | 171 // Generic To-UTF-8 converter. This will call the given append method for each |
184 // character that should be appended, with the given output method. Wrappers | 172 // character that should be appended, with the given output method. Wrappers |
185 // are provided below for escaped and non-escaped versions of this. | 173 // are provided below for escaped and non-escaped versions of this. |
186 template<class Output, void Appender(unsigned char, Output*)> | 174 template<class Output, void Appender(unsigned char, Output*)> |
187 inline void DoAppendUTF8(unsigned char_value, Output* output) { | 175 inline void DoAppendUTF8(unsigned char_value, Output* output) { |
188 if (char_value <= 0x7f) { | 176 if (char_value <= 0x7f) { |
189 Appender(static_cast<unsigned char>(char_value), output); | 177 Appender(static_cast<unsigned char>(char_value), output); |
190 } else if (char_value <= 0x7ff) { | 178 } else if (char_value <= 0x7ff) { |
191 // 110xxxxx 10xxxxxx | 179 // 110xxxxx 10xxxxxx |
(...skipping 61 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
253 // UTF-16 functions ----------------------------------------------------------- | 241 // UTF-16 functions ----------------------------------------------------------- |
254 | 242 |
255 // Reads one character in UTF-16 starting at |*begin| in |str| and places | 243 // Reads one character in UTF-16 starting at |*begin| in |str| and places |
256 // the decoded value into |*code_point|. If the character is valid, we will | 244 // the decoded value into |*code_point|. If the character is valid, we will |
257 // return true. If invalid, we'll return false and put the | 245 // return true. If invalid, we'll return false and put the |
258 // kUnicodeReplacementCharacter into |*code_point|. | 246 // kUnicodeReplacementCharacter into |*code_point|. |
259 // | 247 // |
260 // |*begin| will be updated to point to the last character consumed so it | 248 // |*begin| will be updated to point to the last character consumed so it |
261 // can be incremented in a loop and will be ready for the next character. | 249 // can be incremented in a loop and will be ready for the next character. |
262 // (for a single-16-bit-word character, it will not be changed). | 250 // (for a single-16-bit-word character, it will not be changed). |
263 inline bool ReadUTFChar(const char16* str, int* begin, int length, | 251 // |
264 unsigned* code_point) { | 252 // Implementation is in url_canon_icu.cc. |
265 if (U16_IS_SURROGATE(str[*begin])) { | 253 bool ReadUTFChar(const char16* str, int* begin, int length, |
266 if (!U16_IS_SURROGATE_LEAD(str[*begin]) || *begin + 1 >= length || | 254 unsigned* code_point); |
267 !U16_IS_TRAIL(str[*begin + 1])) { | |
268 // Invalid surrogate pair. | |
269 *code_point = kUnicodeReplacementCharacter; | |
270 return false; | |
271 } else { | |
272 // Valid surrogate pair. | |
273 *code_point = U16_GET_SUPPLEMENTARY(str[*begin], str[*begin + 1]); | |
274 (*begin)++; | |
275 } | |
276 } else { | |
277 // Not a surrogate, just one 16-bit word. | |
278 *code_point = str[*begin]; | |
279 } | |
280 | |
281 if (U_IS_UNICODE_CHAR(*code_point)) | |
282 return true; | |
283 | |
284 // Invalid code point. | |
285 *code_point = kUnicodeReplacementCharacter; | |
286 return false; | |
287 } | |
288 | 255 |
289 // Equivalent to U16_APPEND_UNSAFE in ICU but uses our output method. | 256 // Equivalent to U16_APPEND_UNSAFE in ICU but uses our output method. |
290 inline void AppendUTF16Value(unsigned code_point, | 257 inline void AppendUTF16Value(unsigned code_point, |
291 CanonOutputT<char16>* output) { | 258 CanonOutputT<char16>* output) { |
292 if (code_point > 0xffff) { | 259 if (code_point > 0xffff) { |
293 output->push_back(static_cast<char16>((code_point >> 10) + 0xd7c0)); | 260 output->push_back(static_cast<char16>((code_point >> 10) + 0xd7c0)); |
294 output->push_back(static_cast<char16>((code_point & 0x3ff) | 0xdc00)); | 261 output->push_back(static_cast<char16>((code_point & 0x3ff) | 0xdc00)); |
295 } else { | 262 } else { |
296 output->push_back(static_cast<char16>(code_point)); | 263 output->push_back(static_cast<char16>(code_point)); |
297 } | 264 } |
(...skipping 184 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
482 inline unsigned long long _strtoui64(const char* nptr, | 449 inline unsigned long long _strtoui64(const char* nptr, |
483 char** endptr, int base) { | 450 char** endptr, int base) { |
484 return strtoull(nptr, endptr, base); | 451 return strtoull(nptr, endptr, base); |
485 } | 452 } |
486 | 453 |
487 #endif // WIN32 | 454 #endif // WIN32 |
488 | 455 |
489 } // namespace url_canon | 456 } // namespace url_canon |
490 | 457 |
491 #endif // GOOGLEURL_SRC_URL_CANON_INTERNAL_H__ | 458 #endif // GOOGLEURL_SRC_URL_CANON_INTERNAL_H__ |
OLD | NEW |