| OLD | NEW |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "net/base/net_util.h" | 5 #include "net/base/net_util.h" |
| 6 | 6 |
| 7 #include <algorithm> | 7 #include <algorithm> |
| 8 #include <iterator> | 8 #include <iterator> |
| 9 #include <map> | 9 #include <map> |
| 10 | 10 |
| 11 #include "build/build_config.h" | 11 #include "build/build_config.h" |
| 12 | 12 |
| 13 #if defined(OS_WIN) | 13 #if defined(OS_WIN) |
| 14 #include <windows.h> | 14 #include <windows.h> |
| 15 #include <winsock2.h> | 15 #include <winsock2.h> |
| 16 #include <iphlpapi.h> | 16 #include <iphlpapi.h> |
| 17 #pragma comment(lib, "iphlpapi.lib") | 17 #pragma comment(lib, "iphlpapi.lib") |
| 18 #elif defined(OS_POSIX) | 18 #elif defined(OS_POSIX) |
| 19 #include <fcntl.h> | 19 #include <fcntl.h> |
| 20 #if !defined(OS_ANDROID) | 20 #if !defined(OS_ANDROID) |
| 21 #include <ifaddrs.h> | 21 #include <ifaddrs.h> |
| 22 #endif | 22 #endif |
| 23 #include <netdb.h> | 23 #include <netdb.h> |
| 24 #include <net/if.h> | 24 #include <net/if.h> |
| 25 #include <netinet/in.h> | 25 #include <netinet/in.h> |
| 26 #endif | 26 #endif |
| 27 | 27 |
| 28 #include "base/base64.h" | |
| 29 #include "base/basictypes.h" | 28 #include "base/basictypes.h" |
| 30 #include "base/file_path.h" | 29 #include "base/file_path.h" |
| 31 #include "base/file_util.h" | 30 #include "base/file_util.h" |
| 32 #include "base/i18n/file_util_icu.h" | 31 #include "base/i18n/file_util_icu.h" |
| 33 #include "base/i18n/icu_string_conversions.h" | 32 #include "base/i18n/icu_string_conversions.h" |
| 34 #include "base/i18n/time_formatting.h" | 33 #include "base/i18n/time_formatting.h" |
| 35 #include "base/json/string_escape.h" | 34 #include "base/json/string_escape.h" |
| 36 #include "base/lazy_instance.h" | 35 #include "base/lazy_instance.h" |
| 37 #include "base/logging.h" | 36 #include "base/logging.h" |
| 38 #include "base/memory/singleton.h" | 37 #include "base/memory/singleton.h" |
| (...skipping 25 matching lines...) Expand all Loading... |
| 64 #include "net/base/dns_util.h" | 63 #include "net/base/dns_util.h" |
| 65 #include "net/base/escape.h" | 64 #include "net/base/escape.h" |
| 66 #include "net/base/mime_util.h" | 65 #include "net/base/mime_util.h" |
| 67 #include "net/base/net_module.h" | 66 #include "net/base/net_module.h" |
| 68 #if defined(OS_WIN) | 67 #if defined(OS_WIN) |
| 69 #include "net/base/winsock_init.h" | 68 #include "net/base/winsock_init.h" |
| 70 #endif | 69 #endif |
| 71 #include "net/http/http_content_disposition.h" | 70 #include "net/http/http_content_disposition.h" |
| 72 #include "unicode/datefmt.h" | 71 #include "unicode/datefmt.h" |
| 73 #include "unicode/regex.h" | 72 #include "unicode/regex.h" |
| 74 #include "unicode/ucnv.h" | |
| 75 #include "unicode/uidna.h" | 73 #include "unicode/uidna.h" |
| 76 #include "unicode/ulocdata.h" | 74 #include "unicode/ulocdata.h" |
| 77 #include "unicode/uniset.h" | 75 #include "unicode/uniset.h" |
| 78 #include "unicode/uscript.h" | 76 #include "unicode/uscript.h" |
| 79 #include "unicode/uset.h" | 77 #include "unicode/uset.h" |
| 80 | 78 |
| 81 using base::Time; | 79 using base::Time; |
| 82 | 80 |
| 83 namespace net { | 81 namespace net { |
| 84 | 82 |
| (...skipping 83 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 168 #if defined(OS_WIN) | 166 #if defined(OS_WIN) |
| 169 std::string::size_type CountTrailingChars( | 167 std::string::size_type CountTrailingChars( |
| 170 const std::string& input, | 168 const std::string& input, |
| 171 const std::string::value_type trailing_chars[]) { | 169 const std::string::value_type trailing_chars[]) { |
| 172 const size_t last_good_char = input.find_last_not_of(trailing_chars); | 170 const size_t last_good_char = input.find_last_not_of(trailing_chars); |
| 173 return (last_good_char == std::string::npos) ? | 171 return (last_good_char == std::string::npos) ? |
| 174 input.length() : (input.length() - last_good_char - 1); | 172 input.length() : (input.length() - last_good_char - 1); |
| 175 } | 173 } |
| 176 #endif | 174 #endif |
| 177 | 175 |
| 178 // Similar to Base64Decode. Decodes a Q-encoded string to a sequence | |
| 179 // of bytes. If input is invalid, return false. | |
| 180 bool QPDecode(const std::string& input, std::string* output) { | |
| 181 std::string temp; | |
| 182 temp.reserve(input.size()); | |
| 183 for (std::string::const_iterator it = input.begin(); it != input.end(); | |
| 184 ++it) { | |
| 185 if (*it == '_') { | |
| 186 temp.push_back(' '); | |
| 187 } else if (*it == '=') { | |
| 188 if ((input.end() - it < 3) || | |
| 189 !IsHexDigit(static_cast<unsigned char>(*(it + 1))) || | |
| 190 !IsHexDigit(static_cast<unsigned char>(*(it + 2)))) | |
| 191 return false; | |
| 192 unsigned char ch = HexDigitToInt(*(it + 1)) * 16 + | |
| 193 HexDigitToInt(*(it + 2)); | |
| 194 temp.push_back(static_cast<char>(ch)); | |
| 195 ++it; | |
| 196 ++it; | |
| 197 } else if (0x20 < *it && *it < 0x7F) { | |
| 198 // In a Q-encoded word, only printable ASCII characters | |
| 199 // represent themselves. Besides, space, '=', '_' and '?' are | |
| 200 // not allowed, but they're already filtered out. | |
| 201 DCHECK_NE('=', *it); | |
| 202 DCHECK_NE('?', *it); | |
| 203 DCHECK_NE('_', *it); | |
| 204 temp.push_back(*it); | |
| 205 } else { | |
| 206 return false; | |
| 207 } | |
| 208 } | |
| 209 output->swap(temp); | |
| 210 return true; | |
| 211 } | |
| 212 | |
| 213 enum RFC2047EncodingType {Q_ENCODING, B_ENCODING}; | |
| 214 bool DecodeBQEncoding(const std::string& part, | |
| 215 RFC2047EncodingType enc_type, | |
| 216 const std::string& charset, | |
| 217 std::string* output) { | |
| 218 std::string decoded; | |
| 219 if (!((enc_type == B_ENCODING) ? | |
| 220 base::Base64Decode(part, &decoded) : QPDecode(part, &decoded))) | |
| 221 return false; | |
| 222 | |
| 223 if (decoded.empty()) { | |
| 224 output->clear(); | |
| 225 return true; | |
| 226 } | |
| 227 | |
| 228 UErrorCode err = U_ZERO_ERROR; | |
| 229 UConverter* converter(ucnv_open(charset.c_str(), &err)); | |
| 230 if (U_FAILURE(err)) | |
| 231 return false; | |
| 232 | |
| 233 // A single byte in a legacy encoding can be expanded to 3 bytes in UTF-8. | |
| 234 // A 'two-byte character' in a legacy encoding can be expanded to 4 bytes | |
| 235 // in UTF-8. Therefore, the expansion ratio is 3 at most. Add one for a | |
| 236 // trailing '\0'. | |
| 237 size_t output_length = decoded.length() * 3 + 1; | |
| 238 char* buf = WriteInto(output, output_length); | |
| 239 output_length = ucnv_toAlgorithmic(UCNV_UTF8, converter, buf, output_length, | |
| 240 decoded.data(), decoded.length(), &err); | |
| 241 ucnv_close(converter); | |
| 242 if (U_FAILURE(err)) | |
| 243 return false; | |
| 244 output->resize(output_length); | |
| 245 return true; | |
| 246 } | |
| 247 | |
| 248 bool DecodeWord(const std::string& encoded_word, | |
| 249 const std::string& referrer_charset, | |
| 250 bool* is_rfc2047, | |
| 251 std::string* output) { | |
| 252 *is_rfc2047 = false; | |
| 253 output->clear(); | |
| 254 if (encoded_word.empty()) | |
| 255 return true; | |
| 256 | |
| 257 if (!IsStringASCII(encoded_word)) { | |
| 258 // Try UTF-8, referrer_charset and the native OS default charset in turn. | |
| 259 if (IsStringUTF8(encoded_word)) { | |
| 260 *output = encoded_word; | |
| 261 } else { | |
| 262 string16 utf16_output; | |
| 263 if (!referrer_charset.empty() && | |
| 264 base::CodepageToUTF16(encoded_word, referrer_charset.c_str(), | |
| 265 base::OnStringConversionError::FAIL, | |
| 266 &utf16_output)) { | |
| 267 *output = UTF16ToUTF8(utf16_output); | |
| 268 } else { | |
| 269 *output = WideToUTF8(base::SysNativeMBToWide(encoded_word)); | |
| 270 } | |
| 271 } | |
| 272 | |
| 273 return true; | |
| 274 } | |
| 275 | |
| 276 // RFC 2047 : one of encoding methods supported by Firefox and relatively | |
| 277 // widely used by web servers. | |
| 278 // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'. | |
| 279 // We don't care about the length restriction (72 bytes) because | |
| 280 // many web servers generate encoded words longer than the limit. | |
| 281 std::string tmp; | |
| 282 *is_rfc2047 = true; | |
| 283 int part_index = 0; | |
| 284 std::string charset; | |
| 285 StringTokenizer t(encoded_word, "?"); | |
| 286 RFC2047EncodingType enc_type = Q_ENCODING; | |
| 287 while (*is_rfc2047 && t.GetNext()) { | |
| 288 std::string part = t.token(); | |
| 289 switch (part_index) { | |
| 290 case 0: | |
| 291 if (part != "=") { | |
| 292 *is_rfc2047 = false; | |
| 293 break; | |
| 294 } | |
| 295 ++part_index; | |
| 296 break; | |
| 297 case 1: | |
| 298 // Do we need charset validity check here? | |
| 299 charset = part; | |
| 300 ++part_index; | |
| 301 break; | |
| 302 case 2: | |
| 303 if (part.size() > 1 || | |
| 304 part.find_first_of("bBqQ") == std::string::npos) { | |
| 305 *is_rfc2047 = false; | |
| 306 break; | |
| 307 } | |
| 308 if (part[0] == 'b' || part[0] == 'B') { | |
| 309 enc_type = B_ENCODING; | |
| 310 } | |
| 311 ++part_index; | |
| 312 break; | |
| 313 case 3: | |
| 314 *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &tmp); | |
| 315 if (!*is_rfc2047) { | |
| 316 // Last minute failure. Invalid B/Q encoding. Rather than | |
| 317 // passing it through, return now. | |
| 318 return false; | |
| 319 } | |
| 320 ++part_index; | |
| 321 break; | |
| 322 case 4: | |
| 323 if (part != "=") { | |
| 324 // Another last minute failure ! | |
| 325 // Likely to be a case of two encoded-words in a row or | |
| 326 // an encoded word followed by a non-encoded word. We can be | |
| 327 // generous, but it does not help much in terms of compatibility, | |
| 328 // I believe. Return immediately. | |
| 329 *is_rfc2047 = false; | |
| 330 return false; | |
| 331 } | |
| 332 ++part_index; | |
| 333 break; | |
| 334 default: | |
| 335 *is_rfc2047 = false; | |
| 336 return false; | |
| 337 } | |
| 338 } | |
| 339 | |
| 340 if (*is_rfc2047) { | |
| 341 if (*(encoded_word.end() - 1) == '=') { | |
| 342 output->swap(tmp); | |
| 343 return true; | |
| 344 } | |
| 345 // encoded_word ending prematurelly with '?' or extra '?' | |
| 346 *is_rfc2047 = false; | |
| 347 return false; | |
| 348 } | |
| 349 | |
| 350 // We're not handling 'especial' characters quoted with '\', but | |
| 351 // it should be Ok because we're not an email client but a | |
| 352 // web browser. | |
| 353 | |
| 354 // What IE6/7 does: %-escaped UTF-8. | |
| 355 tmp = UnescapeURLComponent(encoded_word, UnescapeRule::SPACES); | |
| 356 if (IsStringUTF8(tmp)) { | |
| 357 output->swap(tmp); | |
| 358 return true; | |
| 359 // We can try either the OS default charset or 'origin charset' here, | |
| 360 // As far as I can tell, IE does not support it. However, I've seen | |
| 361 // web servers emit %-escaped string in a legacy encoding (usually | |
| 362 // origin charset). | |
| 363 // TODO(jungshik) : Test IE further and consider adding a fallback here. | |
| 364 } | |
| 365 return false; | |
| 366 } | |
| 367 | |
| 368 // Does some simple normalization of scripts so we can allow certain scripts | 176 // Does some simple normalization of scripts so we can allow certain scripts |
| 369 // to exist together. | 177 // to exist together. |
| 370 // TODO(brettw) bug 880223: we should allow some other languages to be | 178 // TODO(brettw) bug 880223: we should allow some other languages to be |
| 371 // oombined such as Chinese and Latin. We will probably need a more | 179 // oombined such as Chinese and Latin. We will probably need a more |
| 372 // complicated system of language pairs to have more fine-grained control. | 180 // complicated system of language pairs to have more fine-grained control. |
| 373 UScriptCode NormalizeScript(UScriptCode code) { | 181 UScriptCode NormalizeScript(UScriptCode code) { |
| 374 switch (code) { | 182 switch (code) { |
| 375 case USCRIPT_KATAKANA: | 183 case USCRIPT_KATAKANA: |
| 376 case USCRIPT_HIRAGANA: | 184 case USCRIPT_HIRAGANA: |
| 377 case USCRIPT_KATAKANA_OR_HIRAGANA: | 185 case USCRIPT_KATAKANA_OR_HIRAGANA: |
| (...skipping 554 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 932 // don't attempt to divine a file name out of them. | 740 // don't attempt to divine a file name out of them. |
| 933 if (!url.is_valid() || url.SchemeIs("about") || url.SchemeIs("data")) | 741 if (!url.is_valid() || url.SchemeIs("about") || url.SchemeIs("data")) |
| 934 return std::string(); | 742 return std::string(); |
| 935 | 743 |
| 936 const std::string unescaped_url_filename = UnescapeURLComponent( | 744 const std::string unescaped_url_filename = UnescapeURLComponent( |
| 937 url.ExtractFileName(), | 745 url.ExtractFileName(), |
| 938 UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS); | 746 UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS); |
| 939 | 747 |
| 940 // The URL's path should be escaped UTF-8, but may not be. | 748 // The URL's path should be escaped UTF-8, but may not be. |
| 941 std::string decoded_filename = unescaped_url_filename; | 749 std::string decoded_filename = unescaped_url_filename; |
| 942 if (!IsStringASCII(decoded_filename)) { | 750 if (!IsStringUTF8(decoded_filename)) { |
| 943 bool ignore; | |
| 944 // TODO(jshin): this is probably not robust enough. To be sure, we need | 751 // TODO(jshin): this is probably not robust enough. To be sure, we need |
| 945 // encoding detection. | 752 // encoding detection. |
| 946 DecodeWord(unescaped_url_filename, referrer_charset, &ignore, | 753 string16 utf16_output; |
| 947 &decoded_filename); | 754 if (!referrer_charset.empty() && |
| 755 base::CodepageToUTF16(unescaped_url_filename, |
| 756 referrer_charset.c_str(), |
| 757 base::OnStringConversionError::FAIL, |
| 758 &utf16_output)) { |
| 759 decoded_filename = UTF16ToUTF8(utf16_output); |
| 760 } else { |
| 761 decoded_filename = WideToUTF8( |
| 762 base::SysNativeMBToWide(unescaped_url_filename)); |
| 763 } |
| 948 } | 764 } |
| 949 // If the URL contains a (possibly empty) query, assume it is a generator, and | 765 // If the URL contains a (possibly empty) query, assume it is a generator, and |
| 950 // allow the determined extension to be overwritten. | 766 // allow the determined extension to be overwritten. |
| 951 *should_overwrite_extension = !decoded_filename.empty() && url.has_query(); | 767 *should_overwrite_extension = !decoded_filename.empty() && url.has_query(); |
| 952 | 768 |
| 953 return decoded_filename; | 769 return decoded_filename; |
| 954 } | 770 } |
| 955 | 771 |
| 956 #if defined(OS_WIN) | 772 #if defined(OS_WIN) |
| 957 // Returns whether the specified extension is automatically integrated into the | 773 // Returns whether the specified extension is automatically integrated into the |
| (...skipping 193 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1151 return std::string(); | 967 return std::string(); |
| 1152 | 968 |
| 1153 begin += match.length(); | 969 begin += match.length(); |
| 1154 | 970 |
| 1155 std::string ret; | 971 std::string ret; |
| 1156 TrimWhitespace(std::string(begin, std::find(begin, headers.end(), '\n')), | 972 TrimWhitespace(std::string(begin, std::find(begin, headers.end(), '\n')), |
| 1157 TRIM_ALL, &ret); | 973 TRIM_ALL, &ret); |
| 1158 return ret; | 974 return ret; |
| 1159 } | 975 } |
| 1160 | 976 |
| 1161 bool DecodeCharset(const std::string& input, | |
| 1162 std::string* decoded_charset, | |
| 1163 std::string* value) { | |
| 1164 StringTokenizer t(input, "'"); | |
| 1165 t.set_options(StringTokenizer::RETURN_DELIMS); | |
| 1166 std::string temp_charset; | |
| 1167 std::string temp_value; | |
| 1168 int numDelimsSeen = 0; | |
| 1169 while (t.GetNext()) { | |
| 1170 if (t.token_is_delim()) { | |
| 1171 ++numDelimsSeen; | |
| 1172 continue; | |
| 1173 } else { | |
| 1174 switch (numDelimsSeen) { | |
| 1175 case 0: | |
| 1176 temp_charset = t.token(); | |
| 1177 break; | |
| 1178 case 1: | |
| 1179 // Language is ignored. | |
| 1180 break; | |
| 1181 case 2: | |
| 1182 temp_value = t.token(); | |
| 1183 break; | |
| 1184 default: | |
| 1185 return false; | |
| 1186 } | |
| 1187 } | |
| 1188 } | |
| 1189 if (numDelimsSeen != 2) | |
| 1190 return false; | |
| 1191 if (temp_charset.empty() || temp_value.empty()) | |
| 1192 return false; | |
| 1193 decoded_charset->swap(temp_charset); | |
| 1194 value->swap(temp_value); | |
| 1195 return true; | |
| 1196 } | |
| 1197 | |
| 1198 bool DecodeFilenameValue(const std::string& input, | |
| 1199 const std::string& referrer_charset, | |
| 1200 std::string* output) { | |
| 1201 std::string tmp; | |
| 1202 // Tokenize with whitespace characters. | |
| 1203 StringTokenizer t(input, " \t\n\r"); | |
| 1204 t.set_options(StringTokenizer::RETURN_DELIMS); | |
| 1205 bool is_previous_token_rfc2047 = true; | |
| 1206 while (t.GetNext()) { | |
| 1207 if (t.token_is_delim()) { | |
| 1208 // If the previous non-delimeter token is not RFC2047-encoded, | |
| 1209 // put in a space in its place. Otheriwse, skip over it. | |
| 1210 if (!is_previous_token_rfc2047) { | |
| 1211 tmp.push_back(' '); | |
| 1212 } | |
| 1213 continue; | |
| 1214 } | |
| 1215 // We don't support a single multibyte character split into | |
| 1216 // adjacent encoded words. Some broken mail clients emit headers | |
| 1217 // with that problem, but most web servers usually encode a filename | |
| 1218 // in a single encoded-word. Firefox/Thunderbird do not support | |
| 1219 // it, either. | |
| 1220 std::string decoded; | |
| 1221 if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047, | |
| 1222 &decoded)) | |
| 1223 return false; | |
| 1224 tmp.append(decoded); | |
| 1225 } | |
| 1226 output->swap(tmp); | |
| 1227 return true; | |
| 1228 } | |
| 1229 | |
| 1230 bool DecodeExtValue(const std::string& param_value, std::string* decoded) { | |
| 1231 if (param_value.find('"') != std::string::npos) | |
| 1232 return false; | |
| 1233 | |
| 1234 std::string charset; | |
| 1235 std::string value; | |
| 1236 if (!DecodeCharset(param_value, &charset, &value)) | |
| 1237 return false; | |
| 1238 | |
| 1239 // RFC 5987 value should be ASCII-only. | |
| 1240 if (!IsStringASCII(value)) { | |
| 1241 decoded->clear(); | |
| 1242 return true; | |
| 1243 } | |
| 1244 | |
| 1245 std::string unescaped = UnescapeURLComponent(value, | |
| 1246 UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS); | |
| 1247 | |
| 1248 return base::ConvertToUtf8AndNormalize(unescaped, charset, decoded); | |
| 1249 } | |
| 1250 | |
| 1251 string16 IDNToUnicode(const std::string& host, | 977 string16 IDNToUnicode(const std::string& host, |
| 1252 const std::string& languages) { | 978 const std::string& languages) { |
| 1253 return IDNToUnicodeWithOffsets(host, languages, NULL); | 979 return IDNToUnicodeWithOffsets(host, languages, NULL); |
| 1254 } | 980 } |
| 1255 | 981 |
| 1256 std::string CanonicalizeHost(const std::string& host, | 982 std::string CanonicalizeHost(const std::string& host, |
| 1257 url_canon::CanonHostInfo* host_info) { | 983 url_canon::CanonHostInfo* host_info) { |
| 1258 // Try to canonicalize the host. | 984 // Try to canonicalize the host. |
| 1259 const url_parse::Component raw_host_component( | 985 const url_parse::Component raw_host_component( |
| 1260 0, static_cast<int>(host.length())); | 986 0, static_cast<int>(host.length())); |
| (...skipping 1186 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2447 | 2173 |
| 2448 NetworkInterface::NetworkInterface(const std::string& name, | 2174 NetworkInterface::NetworkInterface(const std::string& name, |
| 2449 const IPAddressNumber& address) | 2175 const IPAddressNumber& address) |
| 2450 : name(name), address(address) { | 2176 : name(name), address(address) { |
| 2451 } | 2177 } |
| 2452 | 2178 |
| 2453 NetworkInterface::~NetworkInterface() { | 2179 NetworkInterface::~NetworkInterface() { |
| 2454 } | 2180 } |
| 2455 | 2181 |
| 2456 } // namespace net | 2182 } // namespace net |
| OLD | NEW |