OLD | NEW |
1 // Copyright 2013 The Chromium Authors. All rights reserved. | 1 // Copyright 2013 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #ifndef URL_URL_CANON_INTERNAL_H_ | 5 #ifndef URL_URL_CANON_INTERNAL_H_ |
6 #define URL_URL_CANON_INTERNAL_H_ | 6 #define URL_URL_CANON_INTERNAL_H_ |
7 | 7 |
8 // This file is intended to be included in another C++ file where the character | 8 // This file is intended to be included in another C++ file where the character |
9 // types are defined. This allows us to write mostly generic code, but not have | 9 // types are defined. This allows us to write mostly generic code, but not have |
10 // templace bloat because everything is inlined when anybody calls any of our | 10 // template bloat because everything is inlined when anybody calls any of our |
11 // functions. | 11 // functions. |
12 | 12 |
13 #include <stdlib.h> | 13 #include <stdlib.h> |
14 | 14 |
15 #include "base/logging.h" | 15 #include "base/logging.h" |
16 #include "url/url_canon.h" | 16 #include "url/url_canon.h" |
17 | 17 |
18 namespace url { | 18 namespace url { |
19 | 19 |
20 // Character type handling ----------------------------------------------------- | 20 // Character type handling ----------------------------------------------------- |
(...skipping 13 matching lines...) Expand all Loading... |
34 | 34 |
35 // Valid in an ASCII-representation of a hex digit (as in %-escaped). | 35 // Valid in an ASCII-representation of a hex digit (as in %-escaped). |
36 CHAR_HEX = 8, | 36 CHAR_HEX = 8, |
37 | 37 |
38 // Valid in an ASCII-representation of a decimal digit. | 38 // Valid in an ASCII-representation of a decimal digit. |
39 CHAR_DEC = 16, | 39 CHAR_DEC = 16, |
40 | 40 |
41 // Valid in an ASCII-representation of an octal digit. | 41 // Valid in an ASCII-representation of an octal digit. |
42 CHAR_OCT = 32, | 42 CHAR_OCT = 32, |
43 | 43 |
44 // Characters that do not require escaping in encodeURIComponent. Characters | 44 // Characters that do not require escaping in encodeURIComponent. Characters |
45 // that do not have this flag will be escaped; see url_util.cc. | 45 // that do not have this flag will be escaped; see url_util.cc. |
46 CHAR_COMPONENT = 64, | 46 CHAR_COMPONENT = 64, |
47 }; | 47 }; |
48 | 48 |
49 // This table contains the flags in SharedCharTypes for each 8-bit character. | 49 // This table contains the flags in SharedCharTypes for each 8-bit character. |
50 // Some canonicalization functions have their own specialized lookup table. | 50 // Some canonicalization functions have their own specialized lookup table. |
51 // For those with simple requirements, we have collected the flags in one | 51 // For those with simple requirements, we have collected the flags in one |
52 // place so there are fewer lookup tables to load into the CPU cache. | 52 // place so there are fewer lookup tables to load into the CPU cache. |
53 // | 53 // |
54 // Using an unsigned char type has a small but measurable performance benefit | 54 // Using an unsigned char type has a small but measurable performance benefit |
(...skipping 113 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
168 Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)), | 168 Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)), |
169 output); | 169 output); |
170 } else if (char_value <= 0xffff) { | 170 } else if (char_value <= 0xffff) { |
171 // 1110xxxx 10xxxxxx 10xxxxxx | 171 // 1110xxxx 10xxxxxx 10xxxxxx |
172 Appender(static_cast<unsigned char>(0xe0 | (char_value >> 12)), | 172 Appender(static_cast<unsigned char>(0xe0 | (char_value >> 12)), |
173 output); | 173 output); |
174 Appender(static_cast<unsigned char>(0x80 | ((char_value >> 6) & 0x3f)), | 174 Appender(static_cast<unsigned char>(0x80 | ((char_value >> 6) & 0x3f)), |
175 output); | 175 output); |
176 Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)), | 176 Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)), |
177 output); | 177 output); |
178 } else if (char_value <= 0x10FFFF) { // Max unicode code point. | 178 } else if (char_value <= 0x10FFFF) { // Max Unicode code point. |
179 // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx | 179 // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
180 Appender(static_cast<unsigned char>(0xf0 | (char_value >> 18)), | 180 Appender(static_cast<unsigned char>(0xf0 | (char_value >> 18)), |
181 output); | 181 output); |
182 Appender(static_cast<unsigned char>(0x80 | ((char_value >> 12) & 0x3f)), | 182 Appender(static_cast<unsigned char>(0x80 | ((char_value >> 12) & 0x3f)), |
183 output); | 183 output); |
184 Appender(static_cast<unsigned char>(0x80 | ((char_value >> 6) & 0x3f)), | 184 Appender(static_cast<unsigned char>(0x80 | ((char_value >> 6) & 0x3f)), |
185 output); | 185 output); |
186 Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)), | 186 Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)), |
187 output); | 187 output); |
188 } else { | 188 } else { |
189 // Invalid UTF-8 character (>20 bits). | 189 // Invalid UTF-8 character (>20 bits). |
190 NOTREACHED(); | 190 NOTREACHED(); |
191 } | 191 } |
192 } | 192 } |
193 | 193 |
194 // Helper used by AppendUTF8Value below. We use an unsigned parameter so there | 194 // Helper used by AppendUTF8Value below. We use an unsigned parameter so there |
195 // are no funny sign problems with the input, but then have to convert it to | 195 // are no funny sign problems with the input, but then have to convert it to |
196 // a regular char for appending. | 196 // a regular char for appending. |
197 inline void AppendCharToOutput(unsigned char ch, CanonOutput* output) { | 197 inline void AppendCharToOutput(unsigned char ch, CanonOutput* output) { |
198 output->push_back(static_cast<char>(ch)); | 198 output->push_back(static_cast<char>(ch)); |
199 } | 199 } |
200 | 200 |
201 // Writes the given character to the output as UTF-8. This does NO checking | 201 // Writes the given character to the output as UTF-8. This does NO checking |
202 // of the validity of the unicode characters; the caller should ensure that | 202 // of the validity of the Unicode characters; the caller should ensure that |
203 // the value it is appending is valid to append. | 203 // the value it is appending is valid to append. |
204 inline void AppendUTF8Value(unsigned char_value, CanonOutput* output) { | 204 inline void AppendUTF8Value(unsigned char_value, CanonOutput* output) { |
205 DoAppendUTF8<CanonOutput, AppendCharToOutput>(char_value, output); | 205 DoAppendUTF8<CanonOutput, AppendCharToOutput>(char_value, output); |
206 } | 206 } |
207 | 207 |
208 // Writes the given character to the output as UTF-8, escaping ALL | 208 // Writes the given character to the output as UTF-8, escaping ALL |
209 // characters (even when they are ASCII). This does NO checking of the | 209 // characters (even when they are ASCII). This does NO checking of the |
210 // validity of the unicode characters; the caller should ensure that the value | 210 // validity of the Unicode characters; the caller should ensure that the value |
211 // it is appending is valid to append. | 211 // it is appending is valid to append. |
212 inline void AppendUTF8EscapedValue(unsigned char_value, CanonOutput* output) { | 212 inline void AppendUTF8EscapedValue(unsigned char_value, CanonOutput* output) { |
213 DoAppendUTF8<CanonOutput, AppendEscapedChar>(char_value, output); | 213 DoAppendUTF8<CanonOutput, AppendEscapedChar>(char_value, output); |
214 } | 214 } |
215 | 215 |
216 // UTF-16 functions ----------------------------------------------------------- | 216 // UTF-16 functions ----------------------------------------------------------- |
217 | 217 |
218 // Reads one character in UTF-16 starting at |*begin| in |str| and places | 218 // Reads one character in UTF-16 starting at |*begin| in |str| and places |
219 // the decoded value into |*code_point|. If the character is valid, we will | 219 // the decoded value into |*code_point|. If the character is valid, we will |
220 // return true. If invalid, we'll return false and put the | 220 // return true. If invalid, we'll return false and put the |
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
253 // Every single output character will be escaped. This means that if you | 253 // Every single output character will be escaped. This means that if you |
254 // give it an ASCII character as input, it will be escaped. Some code uses | 254 // give it an ASCII character as input, it will be escaped. Some code uses |
255 // this when it knows that a character is invalid according to its rules | 255 // this when it knows that a character is invalid according to its rules |
256 // for validity. If you don't want escaping for ASCII characters, you will | 256 // for validity. If you don't want escaping for ASCII characters, you will |
257 // have to filter them out prior to calling this function. | 257 // have to filter them out prior to calling this function. |
258 // | 258 // |
259 // Assumes that ch[begin] is within range in the array, but does not assume | 259 // Assumes that ch[begin] is within range in the array, but does not assume |
260 // that any following characters are. | 260 // that any following characters are. |
261 inline bool AppendUTF8EscapedChar(const base::char16* str, int* begin, | 261 inline bool AppendUTF8EscapedChar(const base::char16* str, int* begin, |
262 int length, CanonOutput* output) { | 262 int length, CanonOutput* output) { |
263 // UTF-16 input. Readchar16 will handle invalid characters for us and give | 263 // UTF-16 input. ReadUTFChar will handle invalid characters for us and give |
264 // us the kUnicodeReplacementCharacter, so we don't have to do special | 264 // us the kUnicodeReplacementCharacter, so we don't have to do special |
265 // checking after failure, just pass through the failure to the caller. | 265 // checking after failure, just pass through the failure to the caller. |
266 unsigned char_value; | 266 unsigned char_value; |
267 bool success = ReadUTFChar(str, begin, length, &char_value); | 267 bool success = ReadUTFChar(str, begin, length, &char_value); |
268 AppendUTF8EscapedValue(char_value, output); | 268 AppendUTF8EscapedValue(char_value, output); |
269 return success; | 269 return success; |
270 } | 270 } |
271 | 271 |
272 // Handles UTF-8 input. See the wide version above for usage. | 272 // Handles UTF-8 input. See the wide version above for usage. |
273 inline bool AppendUTF8EscapedChar(const char* str, int* begin, int length, | 273 inline bool AppendUTF8EscapedChar(const char* str, int* begin, int length, |
(...skipping 150 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
// Thin wrapper that forwards to the C library's strtoull(): parses an unsigned
// 64-bit integer from |nptr| in the given |base| and, when |endptr| is
// non-null, stores a pointer to the first unparsed character in |*endptr|.
// NOTE(review): this sits inside the conditional closed by "#endif  // WIN32"
// below, presumably so non-Windows builds can use the MSVC-style name --
// confirm against the opening #if, which is outside this view.
inline unsigned long long _strtoui64(const char* nptr,
                                     char** endptr, int base) {
  return strtoull(nptr, endptr, base);
}
428 | 428 |
429 #endif // WIN32 | 429 #endif // WIN32 |
430 | 430 |
431 } // namespace url | 431 } // namespace url |
432 | 432 |
433 #endif // URL_URL_CANON_INTERNAL_H_ | 433 #endif // URL_URL_CANON_INTERNAL_H_ |
OLD | NEW |