Index: net/base/net_util.cc |
diff --git a/net/base/net_util.cc b/net/base/net_util.cc |
index 2b11c4dd04d1b7d89495745d9661b72ab380ddaf..5f321c6d01fa68a4379c741da0a27c4540160f45 100644 |
--- a/net/base/net_util.cc |
+++ b/net/base/net_util.cc |
@@ -25,7 +25,6 @@ |
#include <netinet/in.h> |
#endif |
-#include "base/base64.h" |
#include "base/basictypes.h" |
#include "base/file_path.h" |
#include "base/file_util.h" |
@@ -71,7 +70,6 @@ |
#include "net/http/http_content_disposition.h" |
#include "unicode/datefmt.h" |
#include "unicode/regex.h" |
-#include "unicode/ucnv.h" |
#include "unicode/uidna.h" |
#include "unicode/ulocdata.h" |
#include "unicode/uniset.h" |
@@ -175,196 +173,6 @@ std::string::size_type CountTrailingChars( |
} |
#endif |
-// Similar to Base64Decode. Decodes a Q-encoded string to a sequence |
-// of bytes. If input is invalid, return false. |
-bool QPDecode(const std::string& input, std::string* output) { |
- std::string temp; |
- temp.reserve(input.size()); |
- for (std::string::const_iterator it = input.begin(); it != input.end(); |
- ++it) { |
- if (*it == '_') { |
- temp.push_back(' '); |
- } else if (*it == '=') { |
- if ((input.end() - it < 3) || |
- !IsHexDigit(static_cast<unsigned char>(*(it + 1))) || |
- !IsHexDigit(static_cast<unsigned char>(*(it + 2)))) |
- return false; |
- unsigned char ch = HexDigitToInt(*(it + 1)) * 16 + |
- HexDigitToInt(*(it + 2)); |
- temp.push_back(static_cast<char>(ch)); |
- ++it; |
- ++it; |
- } else if (0x20 < *it && *it < 0x7F) { |
- // In a Q-encoded word, only printable ASCII characters |
- // represent themselves. Besides, space, '=', '_' and '?' are |
- // not allowed, but they're already filtered out. |
- DCHECK_NE('=', *it); |
- DCHECK_NE('?', *it); |
- DCHECK_NE('_', *it); |
- temp.push_back(*it); |
- } else { |
- return false; |
- } |
- } |
- output->swap(temp); |
- return true; |
-} |
- |
-enum RFC2047EncodingType {Q_ENCODING, B_ENCODING}; |
-bool DecodeBQEncoding(const std::string& part, |
- RFC2047EncodingType enc_type, |
- const std::string& charset, |
- std::string* output) { |
- std::string decoded; |
- if (!((enc_type == B_ENCODING) ? |
- base::Base64Decode(part, &decoded) : QPDecode(part, &decoded))) |
- return false; |
- |
- if (decoded.empty()) { |
- output->clear(); |
- return true; |
- } |
- |
- UErrorCode err = U_ZERO_ERROR; |
- UConverter* converter(ucnv_open(charset.c_str(), &err)); |
- if (U_FAILURE(err)) |
- return false; |
- |
- // A single byte in a legacy encoding can be expanded to 3 bytes in UTF-8. |
- // A 'two-byte character' in a legacy encoding can be expanded to 4 bytes |
- // in UTF-8. Therefore, the expansion ratio is 3 at most. Add one for a |
- // trailing '\0'. |
- size_t output_length = decoded.length() * 3 + 1; |
- char* buf = WriteInto(output, output_length); |
- output_length = ucnv_toAlgorithmic(UCNV_UTF8, converter, buf, output_length, |
- decoded.data(), decoded.length(), &err); |
- ucnv_close(converter); |
- if (U_FAILURE(err)) |
- return false; |
- output->resize(output_length); |
- return true; |
-} |
- |
-bool DecodeWord(const std::string& encoded_word, |
- const std::string& referrer_charset, |
- bool* is_rfc2047, |
- std::string* output) { |
- *is_rfc2047 = false; |
- output->clear(); |
- if (encoded_word.empty()) |
- return true; |
- |
- if (!IsStringASCII(encoded_word)) { |
- // Try UTF-8, referrer_charset and the native OS default charset in turn. |
- if (IsStringUTF8(encoded_word)) { |
- *output = encoded_word; |
- } else { |
- string16 utf16_output; |
- if (!referrer_charset.empty() && |
- base::CodepageToUTF16(encoded_word, referrer_charset.c_str(), |
- base::OnStringConversionError::FAIL, |
- &utf16_output)) { |
- *output = UTF16ToUTF8(utf16_output); |
- } else { |
- *output = WideToUTF8(base::SysNativeMBToWide(encoded_word)); |
- } |
- } |
- |
- return true; |
- } |
- |
- // RFC 2047 : one of encoding methods supported by Firefox and relatively |
- // widely used by web servers. |
- // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'. |
- // We don't care about the length restriction (72 bytes) because |
- // many web servers generate encoded words longer than the limit. |
- std::string tmp; |
- *is_rfc2047 = true; |
- int part_index = 0; |
- std::string charset; |
- StringTokenizer t(encoded_word, "?"); |
- RFC2047EncodingType enc_type = Q_ENCODING; |
- while (*is_rfc2047 && t.GetNext()) { |
- std::string part = t.token(); |
- switch (part_index) { |
- case 0: |
- if (part != "=") { |
- *is_rfc2047 = false; |
- break; |
- } |
- ++part_index; |
- break; |
- case 1: |
- // Do we need charset validity check here? |
- charset = part; |
- ++part_index; |
- break; |
- case 2: |
- if (part.size() > 1 || |
- part.find_first_of("bBqQ") == std::string::npos) { |
- *is_rfc2047 = false; |
- break; |
- } |
- if (part[0] == 'b' || part[0] == 'B') { |
- enc_type = B_ENCODING; |
- } |
- ++part_index; |
- break; |
- case 3: |
- *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &tmp); |
- if (!*is_rfc2047) { |
- // Last minute failure. Invalid B/Q encoding. Rather than |
- // passing it through, return now. |
- return false; |
- } |
- ++part_index; |
- break; |
- case 4: |
- if (part != "=") { |
- // Another last minute failure ! |
- // Likely to be a case of two encoded-words in a row or |
- // an encoded word followed by a non-encoded word. We can be |
- // generous, but it does not help much in terms of compatibility, |
- // I believe. Return immediately. |
- *is_rfc2047 = false; |
- return false; |
- } |
- ++part_index; |
- break; |
- default: |
- *is_rfc2047 = false; |
- return false; |
- } |
- } |
- |
- if (*is_rfc2047) { |
- if (*(encoded_word.end() - 1) == '=') { |
- output->swap(tmp); |
- return true; |
- } |
- // encoded_word ending prematurelly with '?' or extra '?' |
- *is_rfc2047 = false; |
- return false; |
- } |
- |
- // We're not handling 'especial' characters quoted with '\', but |
- // it should be Ok because we're not an email client but a |
- // web browser. |
- |
- // What IE6/7 does: %-escaped UTF-8. |
- tmp = UnescapeURLComponent(encoded_word, UnescapeRule::SPACES); |
- if (IsStringUTF8(tmp)) { |
- output->swap(tmp); |
- return true; |
- // We can try either the OS default charset or 'origin charset' here, |
- // As far as I can tell, IE does not support it. However, I've seen |
- // web servers emit %-escaped string in a legacy encoding (usually |
- // origin charset). |
- // TODO(jungshik) : Test IE further and consider adding a fallback here. |
- } |
- return false; |
-} |
- |
// Does some simple normalization of scripts so we can allow certain scripts |
// to exist together. |
// TODO(brettw) bug 880223: we should allow some other languages to be |
@@ -939,12 +747,20 @@ std::string GetFileNameFromURL(const GURL& url, |
// The URL's path should be escaped UTF-8, but may not be. |
std::string decoded_filename = unescaped_url_filename; |
- if (!IsStringASCII(decoded_filename)) { |
- bool ignore; |
+ if (!IsStringUTF8(decoded_filename)) { |
// TODO(jshin): this is probably not robust enough. To be sure, we need |
// encoding detection. |
- DecodeWord(unescaped_url_filename, referrer_charset, &ignore, |
- &decoded_filename); |
+ string16 utf16_output; |
+ if (!referrer_charset.empty() && |
+ base::CodepageToUTF16(unescaped_url_filename, |
+ referrer_charset.c_str(), |
+ base::OnStringConversionError::FAIL, |
+ &utf16_output)) { |
+ decoded_filename = UTF16ToUTF8(utf16_output); |
+ } else { |
+ decoded_filename = WideToUTF8( |
+ base::SysNativeMBToWide(unescaped_url_filename)); |
+ } |
} |
// If the URL contains a (possibly empty) query, assume it is a generator, and |
// allow the determined extension to be overwritten. |
@@ -1158,96 +974,6 @@ std::string GetSpecificHeader(const std::string& headers, |
return ret; |
} |
-bool DecodeCharset(const std::string& input, |
- std::string* decoded_charset, |
- std::string* value) { |
- StringTokenizer t(input, "'"); |
- t.set_options(StringTokenizer::RETURN_DELIMS); |
- std::string temp_charset; |
- std::string temp_value; |
- int numDelimsSeen = 0; |
- while (t.GetNext()) { |
- if (t.token_is_delim()) { |
- ++numDelimsSeen; |
- continue; |
- } else { |
- switch (numDelimsSeen) { |
- case 0: |
- temp_charset = t.token(); |
- break; |
- case 1: |
- // Language is ignored. |
- break; |
- case 2: |
- temp_value = t.token(); |
- break; |
- default: |
- return false; |
- } |
- } |
- } |
- if (numDelimsSeen != 2) |
- return false; |
- if (temp_charset.empty() || temp_value.empty()) |
- return false; |
- decoded_charset->swap(temp_charset); |
- value->swap(temp_value); |
- return true; |
-} |
- |
-bool DecodeFilenameValue(const std::string& input, |
- const std::string& referrer_charset, |
- std::string* output) { |
- std::string tmp; |
- // Tokenize with whitespace characters. |
- StringTokenizer t(input, " \t\n\r"); |
- t.set_options(StringTokenizer::RETURN_DELIMS); |
- bool is_previous_token_rfc2047 = true; |
- while (t.GetNext()) { |
- if (t.token_is_delim()) { |
- // If the previous non-delimeter token is not RFC2047-encoded, |
- // put in a space in its place. Otheriwse, skip over it. |
- if (!is_previous_token_rfc2047) { |
- tmp.push_back(' '); |
- } |
- continue; |
- } |
- // We don't support a single multibyte character split into |
- // adjacent encoded words. Some broken mail clients emit headers |
- // with that problem, but most web servers usually encode a filename |
- // in a single encoded-word. Firefox/Thunderbird do not support |
- // it, either. |
- std::string decoded; |
- if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047, |
- &decoded)) |
- return false; |
- tmp.append(decoded); |
- } |
- output->swap(tmp); |
- return true; |
-} |
- |
-bool DecodeExtValue(const std::string& param_value, std::string* decoded) { |
- if (param_value.find('"') != std::string::npos) |
- return false; |
- |
- std::string charset; |
- std::string value; |
- if (!DecodeCharset(param_value, &charset, &value)) |
- return false; |
- |
- // RFC 5987 value should be ASCII-only. |
- if (!IsStringASCII(value)) { |
- decoded->clear(); |
- return true; |
- } |
- |
- std::string unescaped = UnescapeURLComponent(value, |
- UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS); |
- |
- return base::ConvertToUtf8AndNormalize(unescaped, charset, decoded); |
-} |
- |
string16 IDNToUnicode(const std::string& host, |
const std::string& languages) { |
return IDNToUnicodeWithOffsets(host, languages, NULL); |