| OLD | NEW |
| 1 // Copyright 2013 The Chromium Authors. All rights reserved. | 1 // Copyright 2013 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #ifndef URL_URL_CANON_INTERNAL_H_ | 5 #ifndef URL_URL_CANON_INTERNAL_H_ |
| 6 #define URL_URL_CANON_INTERNAL_H_ | 6 #define URL_URL_CANON_INTERNAL_H_ |
| 7 | 7 |
| 8 // This file is intended to be included in another C++ file where the character | 8 // This file is intended to be included in another C++ file where the character |
| 9 // types are defined. This allows us to write mostly generic code, but not have | 9 // types are defined. This allows us to write mostly generic code, but not have |
| 10 // templace bloat because everything is inlined when anybody calls any of our | 10 // template bloat because everything is inlined when anybody calls any of our |
| 11 // functions. | 11 // functions. |
| 12 | 12 |
| 13 #include <stdlib.h> | 13 #include <stdlib.h> |
| 14 | 14 |
| 15 #include "base/logging.h" | 15 #include "base/logging.h" |
| 16 #include "url/url_canon.h" | 16 #include "url/url_canon.h" |
| 17 | 17 |
| 18 namespace url { | 18 namespace url { |
| 19 | 19 |
| 20 // Character type handling ----------------------------------------------------- | 20 // Character type handling ----------------------------------------------------- |
| (...skipping 13 matching lines...) Expand all Loading... |
| 34 | 34 |
| 35 // Valid in an ASCII-representation of a hex digit (as in %-escaped). | 35 // Valid in an ASCII-representation of a hex digit (as in %-escaped). |
| 36 CHAR_HEX = 8, | 36 CHAR_HEX = 8, |
| 37 | 37 |
| 38 // Valid in an ASCII-representation of a decimal digit. | 38 // Valid in an ASCII-representation of a decimal digit. |
| 39 CHAR_DEC = 16, | 39 CHAR_DEC = 16, |
| 40 | 40 |
| 41 // Valid in an ASCII-representation of an octal digit. | 41 // Valid in an ASCII-representation of an octal digit. |
| 42 CHAR_OCT = 32, | 42 CHAR_OCT = 32, |
| 43 | 43 |
| 44 // Characters that do not require escaping in encodeURIComponent. Characters | 44 // Characters that do not require escaping in encodeURIComponent. Characters |
| 45 // that do not have this flag will be escaped; see url_util.cc. | 45 // that do not have this flag will be escaped; see url_util.cc. |
| 46 CHAR_COMPONENT = 64, | 46 CHAR_COMPONENT = 64, |
| 47 }; | 47 }; |
| 48 | 48 |
| 49 // This table contains the flags in SharedCharTypes for each 8-bit character. | 49 // This table contains the flags in SharedCharTypes for each 8-bit character. |
| 50 // Some canonicalization functions have their own specialized lookup table. | 50 // Some canonicalization functions have their own specialized lookup table. |
| 51 // For those with simple requirements, we have collected the flags in one | 51 // For those with simple requirements, we have collected the flags in one |
| 52 // place so there are fewer lookup tables to load into the CPU cache. | 52 // place so there are fewer lookup tables to load into the CPU cache. |
| 53 // | 53 // |
| 54 // Using an unsigned char type has a small but measurable performance benefit | 54 // Using an unsigned char type has a small but measurable performance benefit |
| (...skipping 113 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 168 Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)), | 168 Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)), |
| 169 output); | 169 output); |
| 170 } else if (char_value <= 0xffff) { | 170 } else if (char_value <= 0xffff) { |
| 171 // 1110xxxx 10xxxxxx 10xxxxxx | 171 // 1110xxxx 10xxxxxx 10xxxxxx |
| 172 Appender(static_cast<unsigned char>(0xe0 | (char_value >> 12)), | 172 Appender(static_cast<unsigned char>(0xe0 | (char_value >> 12)), |
| 173 output); | 173 output); |
| 174 Appender(static_cast<unsigned char>(0x80 | ((char_value >> 6) & 0x3f)), | 174 Appender(static_cast<unsigned char>(0x80 | ((char_value >> 6) & 0x3f)), |
| 175 output); | 175 output); |
| 176 Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)), | 176 Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)), |
| 177 output); | 177 output); |
| 178 } else if (char_value <= 0x10FFFF) { // Max unicode code point. | 178 } else if (char_value <= 0x10FFFF) { // Max Unicode code point. |
| 179 // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx | 179 // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
| 180 Appender(static_cast<unsigned char>(0xf0 | (char_value >> 18)), | 180 Appender(static_cast<unsigned char>(0xf0 | (char_value >> 18)), |
| 181 output); | 181 output); |
| 182 Appender(static_cast<unsigned char>(0x80 | ((char_value >> 12) & 0x3f)), | 182 Appender(static_cast<unsigned char>(0x80 | ((char_value >> 12) & 0x3f)), |
| 183 output); | 183 output); |
| 184 Appender(static_cast<unsigned char>(0x80 | ((char_value >> 6) & 0x3f)), | 184 Appender(static_cast<unsigned char>(0x80 | ((char_value >> 6) & 0x3f)), |
| 185 output); | 185 output); |
| 186 Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)), | 186 Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)), |
| 187 output); | 187 output); |
| 188 } else { | 188 } else { |
| 189 // Invalid UTF-8 character (>20 bits). | 189 // Invalid UTF-8 character (>20 bits). |
| 190 NOTREACHED(); | 190 NOTREACHED(); |
| 191 } | 191 } |
| 192 } | 192 } |
| 193 | 193 |
| 194 // Helper used by AppendUTF8Value below. We use an unsigned parameter so there | 194 // Helper used by AppendUTF8Value below. We use an unsigned parameter so there |
| 195 // are no funny sign problems with the input, but then have to convert it to | 195 // are no funny sign problems with the input, but then have to convert it to |
| 196 // a regular char for appending. | 196 // a regular char for appending. |
| 197 inline void AppendCharToOutput(unsigned char ch, CanonOutput* output) { | 197 inline void AppendCharToOutput(unsigned char ch, CanonOutput* output) { |
| 198 output->push_back(static_cast<char>(ch)); | 198 output->push_back(static_cast<char>(ch)); |
| 199 } | 199 } |
| 200 | 200 |
| 201 // Writes the given character to the output as UTF-8. This does NO checking | 201 // Writes the given character to the output as UTF-8. This does NO checking |
| 202 // of the validity of the unicode characters; the caller should ensure that | 202 // of the validity of the Unicode characters; the caller should ensure that |
| 203 // the value it is appending is valid to append. | 203 // the value it is appending is valid to append. |
| 204 inline void AppendUTF8Value(unsigned char_value, CanonOutput* output) { | 204 inline void AppendUTF8Value(unsigned char_value, CanonOutput* output) { |
| 205 DoAppendUTF8<CanonOutput, AppendCharToOutput>(char_value, output); | 205 DoAppendUTF8<CanonOutput, AppendCharToOutput>(char_value, output); |
| 206 } | 206 } |
| 207 | 207 |
| 208 // Writes the given character to the output as UTF-8, escaping ALL | 208 // Writes the given character to the output as UTF-8, escaping ALL |
| 209 // characters (even when they are ASCII). This does NO checking of the | 209 // characters (even when they are ASCII). This does NO checking of the |
| 210 // validity of the unicode characters; the caller should ensure that the value | 210 // validity of the Unicode characters; the caller should ensure that the value |
| 211 // it is appending is valid to append. | 211 // it is appending is valid to append. |
| 212 inline void AppendUTF8EscapedValue(unsigned char_value, CanonOutput* output) { | 212 inline void AppendUTF8EscapedValue(unsigned char_value, CanonOutput* output) { |
| 213 DoAppendUTF8<CanonOutput, AppendEscapedChar>(char_value, output); | 213 DoAppendUTF8<CanonOutput, AppendEscapedChar>(char_value, output); |
| 214 } | 214 } |
| 215 | 215 |
| 216 // UTF-16 functions ----------------------------------------------------------- | 216 // UTF-16 functions ----------------------------------------------------------- |
| 217 | 217 |
| 218 // Reads one character in UTF-16 starting at |*begin| in |str| and places | 218 // Reads one character in UTF-16 starting at |*begin| in |str| and places |
| 219 // the decoded value into |*code_point|. If the character is valid, we will | 219 // the decoded value into |*code_point|. If the character is valid, we will |
| 220 // return true. If invalid, we'll return false and put the | 220 // return true. If invalid, we'll return false and put the |
| (...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 253 // Every single output character will be escaped. This means that if you | 253 // Every single output character will be escaped. This means that if you |
| 254 // give it an ASCII character as input, it will be escaped. Some code uses | 254 // give it an ASCII character as input, it will be escaped. Some code uses |
| 255 // this when it knows that a character is invalid according to its rules | 255 // this when it knows that a character is invalid according to its rules |
| 256 // for validity. If you don't want escaping for ASCII characters, you will | 256 // for validity. If you don't want escaping for ASCII characters, you will |
| 257 // have to filter them out prior to calling this function. | 257 // have to filter them out prior to calling this function. |
| 258 // | 258 // |
| 259 // Assumes that ch[begin] is within range in the array, but does not assume | 259 // Assumes that ch[begin] is within range in the array, but does not assume |
| 260 // that any following characters are. | 260 // that any following characters are. |
| 261 inline bool AppendUTF8EscapedChar(const base::char16* str, int* begin, | 261 inline bool AppendUTF8EscapedChar(const base::char16* str, int* begin, |
| 262 int length, CanonOutput* output) { | 262 int length, CanonOutput* output) { |
| 263 // UTF-16 input. Readchar16 will handle invalid characters for us and give | 263 // UTF-16 input. ReadUTFChar will handle invalid characters for us and give |
| 264 // us the kUnicodeReplacementCharacter, so we don't have to do special | 264 // us the kUnicodeReplacementCharacter, so we don't have to do special |
| 265 // checking after failure, just pass through the failure to the caller. | 265 // checking after failure, just pass through the failure to the caller. |
| 266 unsigned char_value; | 266 unsigned char_value; |
| 267 bool success = ReadUTFChar(str, begin, length, &char_value); | 267 bool success = ReadUTFChar(str, begin, length, &char_value); |
| 268 AppendUTF8EscapedValue(char_value, output); | 268 AppendUTF8EscapedValue(char_value, output); |
| 269 return success; | 269 return success; |
| 270 } | 270 } |
| 271 | 271 |
| 272 // Handles UTF-8 input. See the wide version above for usage. | 272 // Handles UTF-8 input. See the wide version above for usage. |
| 273 inline bool AppendUTF8EscapedChar(const char* str, int* begin, int length, | 273 inline bool AppendUTF8EscapedChar(const char* str, int* begin, int length, |
| (...skipping 150 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
// Provides the name _strtoui64 as a thin wrapper over the standard strtoull().
// Arguments and return value are exactly those of strtoull(): |nptr| is the
// text to parse, |endptr| (may be null) receives the position where parsing
// stopped, and |base| is the numeric base.
// NOTE(review): the "#endif  // WIN32" that follows suggests this is only
// compiled on non-Windows builds; the opening directive is outside this view.
inline unsigned long long _strtoui64(const char* nptr,
                                     char** endptr, int base) {
  return strtoull(nptr, endptr, base);
}
| 428 | 428 |
| 429 #endif // WIN32 | 429 #endif // WIN32 |
| 430 | 430 |
| 431 } // namespace url | 431 } // namespace url |
| 432 | 432 |
| 433 #endif // URL_URL_CANON_INTERNAL_H_ | 433 #endif // URL_URL_CANON_INTERNAL_H_ |
| OLD | NEW |