| OLD | NEW |
| 1 // Copyright 2007, Google Inc. | 1 // Copyright 2007, Google Inc. |
| 2 // All rights reserved. | 2 // All rights reserved. |
| 3 // | 3 // |
| 4 // Redistribution and use in source and binary forms, with or without | 4 // Redistribution and use in source and binary forms, with or without |
| 5 // modification, are permitted provided that the following conditions are | 5 // modification, are permitted provided that the following conditions are |
| 6 // met: | 6 // met: |
| 7 // | 7 // |
| 8 // * Redistributions of source code must retain the above copyright | 8 // * Redistributions of source code must retain the above copyright |
| 9 // notice, this list of conditions and the following disclaimer. | 9 // notice, this list of conditions and the following disclaimer. |
| 10 // * Redistributions in binary form must reproduce the above | 10 // * Redistributions in binary form must reproduce the above |
| (...skipping 145 matching lines...) | (...skipping 145 matching lines...) |
| 156 // UTF-8 functions ------------------------------------------------------------ | 156 // UTF-8 functions ------------------------------------------------------------ |
| 157 | 157 |
| 158 // Reads one character in UTF-8 starting at |*begin| in |str| and places | 158 // Reads one character in UTF-8 starting at |*begin| in |str| and places |
| 159 // the decoded value into |*code_point_out|. If the character is valid, we will | 159 // the decoded value into |*code_point_out|. If the character is valid, we will |
| 160 // return true. If invalid, we'll return false and put the | 160 // return true. If invalid, we'll return false and put the |
| 161 // kUnicodeReplacementCharacter into |*code_point_out|. | 161 // kUnicodeReplacementCharacter into |*code_point_out|. |
| 162 // | 162 // |
| 163 // |*begin| will be updated to point to the last character consumed so it | 163 // |*begin| will be updated to point to the last character consumed so it |
| 164 // can be incremented in a loop and will be ready for the next character. | 164 // can be incremented in a loop and will be ready for the next character. |
| 165 // (for a single-byte ASCII character, it will not be changed). | 165 // (for a single-byte ASCII character, it will not be changed). |
| 166 inline bool ReadUTFChar(const char* str, int* begin, int length, | 166 // |
| 167 unsigned* code_point_out) { | 167 // Implementation is in url_canon_icu.cc. |
| 168 int code_point; // Avoids warning when U8_NEXT writes -1 to it. | 168 bool ReadUTFChar(const char* str, int* begin, int length, |
| 169 U8_NEXT(str, *begin, length, code_point); | 169 unsigned* code_point_out); |
| 170 *code_point_out = static_cast<unsigned>(code_point); | |
| 171 | |
| 172 // The ICU macro above moves to the next char, we want to point to the last | |
| 173 // char consumed. | |
| 174 (*begin)--; | |
| 175 | |
| 176 // Validate the decoded value. | |
| 177 if (U_IS_UNICODE_CHAR(code_point)) | |
| 178 return true; | |
| 179 *code_point_out = kUnicodeReplacementCharacter; | |
| 180 return false; | |
| 181 } | |
| 182 | 170 |
| 183 // Generic To-UTF-8 converter. This will call the given append method for each | 171 // Generic To-UTF-8 converter. This will call the given append method for each |
| 184 // character that should be appended, with the given output method. Wrappers | 172 // character that should be appended, with the given output method. Wrappers |
| 185 // are provided below for escaped and non-escaped versions of this. | 173 // are provided below for escaped and non-escaped versions of this. |
| 186 template<class Output, void Appender(unsigned char, Output*)> | 174 template<class Output, void Appender(unsigned char, Output*)> |
| 187 inline void DoAppendUTF8(unsigned char_value, Output* output) { | 175 inline void DoAppendUTF8(unsigned char_value, Output* output) { |
| 188 if (char_value <= 0x7f) { | 176 if (char_value <= 0x7f) { |
| 189 Appender(static_cast<unsigned char>(char_value), output); | 177 Appender(static_cast<unsigned char>(char_value), output); |
| 190 } else if (char_value <= 0x7ff) { | 178 } else if (char_value <= 0x7ff) { |
| 191 // 110xxxxx 10xxxxxx | 179 // 110xxxxx 10xxxxxx |
| (...skipping 61 matching lines...) | (...skipping 61 matching lines...) |
| 253 // UTF-16 functions ----------------------------------------------------------- | 241 // UTF-16 functions ----------------------------------------------------------- |
| 254 | 242 |
| 255 // Reads one character in UTF-16 starting at |*begin| in |str| and places | 243 // Reads one character in UTF-16 starting at |*begin| in |str| and places |
| 256 // the decoded value into |*code_point|. If the character is valid, we will | 244 // the decoded value into |*code_point|. If the character is valid, we will |
| 257 // return true. If invalid, we'll return false and put the | 245 // return true. If invalid, we'll return false and put the |
| 258 // kUnicodeReplacementCharacter into |*code_point|. | 246 // kUnicodeReplacementCharacter into |*code_point|. |
| 259 // | 247 // |
| 260 // |*begin| will be updated to point to the last character consumed so it | 248 // |*begin| will be updated to point to the last character consumed so it |
| 261 // can be incremented in a loop and will be ready for the next character. | 249 // can be incremented in a loop and will be ready for the next character. |
| 262 // (for a single-16-bit-word character, it will not be changed). | 250 // (for a single-16-bit-word character, it will not be changed). |
| 263 inline bool ReadUTFChar(const char16* str, int* begin, int length, | 251 // |
| 264 unsigned* code_point) { | 252 // Implementation is in url_canon_icu.cc. |
| 265 if (U16_IS_SURROGATE(str[*begin])) { | 253 bool ReadUTFChar(const char16* str, int* begin, int length, |
| 266 if (!U16_IS_SURROGATE_LEAD(str[*begin]) || *begin + 1 >= length || | 254 unsigned* code_point); |
| 267 !U16_IS_TRAIL(str[*begin + 1])) { | |
| 268 // Invalid surrogate pair. | |
| 269 *code_point = kUnicodeReplacementCharacter; | |
| 270 return false; | |
| 271 } else { | |
| 272 // Valid surrogate pair. | |
| 273 *code_point = U16_GET_SUPPLEMENTARY(str[*begin], str[*begin + 1]); | |
| 274 (*begin)++; | |
| 275 } | |
| 276 } else { | |
| 277 // Not a surrogate, just one 16-bit word. | |
| 278 *code_point = str[*begin]; | |
| 279 } | |
| 280 | |
| 281 if (U_IS_UNICODE_CHAR(*code_point)) | |
| 282 return true; | |
| 283 | |
| 284 // Invalid code point. | |
| 285 *code_point = kUnicodeReplacementCharacter; | |
| 286 return false; | |
| 287 } | |
| 288 | 255 |
| 289 // Equivalent to U16_APPEND_UNSAFE in ICU but uses our output method. | 256 // Equivalent to U16_APPEND_UNSAFE in ICU but uses our output method. |
| 290 inline void AppendUTF16Value(unsigned code_point, | 257 inline void AppendUTF16Value(unsigned code_point, |
| 291 CanonOutputT<char16>* output) { | 258 CanonOutputT<char16>* output) { |
| 292 if (code_point > 0xffff) { | 259 if (code_point > 0xffff) { |
| 293 output->push_back(static_cast<char16>((code_point >> 10) + 0xd7c0)); | 260 output->push_back(static_cast<char16>((code_point >> 10) + 0xd7c0)); |
| 294 output->push_back(static_cast<char16>((code_point & 0x3ff) | 0xdc00)); | 261 output->push_back(static_cast<char16>((code_point & 0x3ff) | 0xdc00)); |
| 295 } else { | 262 } else { |
| 296 output->push_back(static_cast<char16>(code_point)); | 263 output->push_back(static_cast<char16>(code_point)); |
| 297 } | 264 } |
| (...skipping 184 matching lines...) | (...skipping 184 matching lines...) |
| 482 inline unsigned long long _strtoui64(const char* nptr, | 449 inline unsigned long long _strtoui64(const char* nptr, |
| 483 char** endptr, int base) { | 450 char** endptr, int base) { |
| 484 return strtoull(nptr, endptr, base); | 451 return strtoull(nptr, endptr, base); |
| 485 } | 452 } |
| 486 | 453 |
| 487 #endif // WIN32 | 454 #endif // WIN32 |
| 488 | 455 |
| 489 } // namespace url_canon | 456 } // namespace url_canon |
| 490 | 457 |
| 491 #endif // GOOGLEURL_SRC_URL_CANON_INTERNAL_H__ | 458 #endif // GOOGLEURL_SRC_URL_CANON_INTERNAL_H__ |
| OLD | NEW |