| Index: net/base/net_util.cc
|
| diff --git a/net/base/net_util.cc b/net/base/net_util.cc
|
| index 2b11c4dd04d1b7d89495745d9661b72ab380ddaf..5f321c6d01fa68a4379c741da0a27c4540160f45 100644
|
| --- a/net/base/net_util.cc
|
| +++ b/net/base/net_util.cc
|
| @@ -25,7 +25,6 @@
|
| #include <netinet/in.h>
|
| #endif
|
|
|
| -#include "base/base64.h"
|
| #include "base/basictypes.h"
|
| #include "base/file_path.h"
|
| #include "base/file_util.h"
|
| @@ -71,7 +70,6 @@
|
| #include "net/http/http_content_disposition.h"
|
| #include "unicode/datefmt.h"
|
| #include "unicode/regex.h"
|
| -#include "unicode/ucnv.h"
|
| #include "unicode/uidna.h"
|
| #include "unicode/ulocdata.h"
|
| #include "unicode/uniset.h"
|
| @@ -175,196 +173,6 @@ std::string::size_type CountTrailingChars(
|
| }
|
| #endif
|
|
|
| -// Similar to Base64Decode. Decodes a Q-encoded string to a sequence
|
| -// of bytes. If input is invalid, return false.
|
| -bool QPDecode(const std::string& input, std::string* output) {
|
| - std::string temp;
|
| - temp.reserve(input.size());
|
| - for (std::string::const_iterator it = input.begin(); it != input.end();
|
| - ++it) {
|
| - if (*it == '_') {
|
| - temp.push_back(' ');
|
| - } else if (*it == '=') {
|
| - if ((input.end() - it < 3) ||
|
| - !IsHexDigit(static_cast<unsigned char>(*(it + 1))) ||
|
| - !IsHexDigit(static_cast<unsigned char>(*(it + 2))))
|
| - return false;
|
| - unsigned char ch = HexDigitToInt(*(it + 1)) * 16 +
|
| - HexDigitToInt(*(it + 2));
|
| - temp.push_back(static_cast<char>(ch));
|
| - ++it;
|
| - ++it;
|
| - } else if (0x20 < *it && *it < 0x7F) {
|
| - // In a Q-encoded word, only printable ASCII characters
|
| - // represent themselves. Besides, space, '=', '_' and '?' are
|
| - // not allowed, but they're already filtered out.
|
| - DCHECK_NE('=', *it);
|
| - DCHECK_NE('?', *it);
|
| - DCHECK_NE('_', *it);
|
| - temp.push_back(*it);
|
| - } else {
|
| - return false;
|
| - }
|
| - }
|
| - output->swap(temp);
|
| - return true;
|
| -}
|
| -
|
| -enum RFC2047EncodingType {Q_ENCODING, B_ENCODING};
|
| -bool DecodeBQEncoding(const std::string& part,
|
| - RFC2047EncodingType enc_type,
|
| - const std::string& charset,
|
| - std::string* output) {
|
| - std::string decoded;
|
| - if (!((enc_type == B_ENCODING) ?
|
| - base::Base64Decode(part, &decoded) : QPDecode(part, &decoded)))
|
| - return false;
|
| -
|
| - if (decoded.empty()) {
|
| - output->clear();
|
| - return true;
|
| - }
|
| -
|
| - UErrorCode err = U_ZERO_ERROR;
|
| - UConverter* converter(ucnv_open(charset.c_str(), &err));
|
| - if (U_FAILURE(err))
|
| - return false;
|
| -
|
| - // A single byte in a legacy encoding can be expanded to 3 bytes in UTF-8.
|
| - // A 'two-byte character' in a legacy encoding can be expanded to 4 bytes
|
| - // in UTF-8. Therefore, the expansion ratio is 3 at most. Add one for a
|
| - // trailing '\0'.
|
| - size_t output_length = decoded.length() * 3 + 1;
|
| - char* buf = WriteInto(output, output_length);
|
| - output_length = ucnv_toAlgorithmic(UCNV_UTF8, converter, buf, output_length,
|
| - decoded.data(), decoded.length(), &err);
|
| - ucnv_close(converter);
|
| - if (U_FAILURE(err))
|
| - return false;
|
| - output->resize(output_length);
|
| - return true;
|
| -}
|
| -
|
| -bool DecodeWord(const std::string& encoded_word,
|
| - const std::string& referrer_charset,
|
| - bool* is_rfc2047,
|
| - std::string* output) {
|
| - *is_rfc2047 = false;
|
| - output->clear();
|
| - if (encoded_word.empty())
|
| - return true;
|
| -
|
| - if (!IsStringASCII(encoded_word)) {
|
| - // Try UTF-8, referrer_charset and the native OS default charset in turn.
|
| - if (IsStringUTF8(encoded_word)) {
|
| - *output = encoded_word;
|
| - } else {
|
| - string16 utf16_output;
|
| - if (!referrer_charset.empty() &&
|
| - base::CodepageToUTF16(encoded_word, referrer_charset.c_str(),
|
| - base::OnStringConversionError::FAIL,
|
| - &utf16_output)) {
|
| - *output = UTF16ToUTF8(utf16_output);
|
| - } else {
|
| - *output = WideToUTF8(base::SysNativeMBToWide(encoded_word));
|
| - }
|
| - }
|
| -
|
| - return true;
|
| - }
|
| -
|
| - // RFC 2047 : one of encoding methods supported by Firefox and relatively
|
| - // widely used by web servers.
|
| - // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'.
|
| - // We don't care about the length restriction (72 bytes) because
|
| - // many web servers generate encoded words longer than the limit.
|
| - std::string tmp;
|
| - *is_rfc2047 = true;
|
| - int part_index = 0;
|
| - std::string charset;
|
| - StringTokenizer t(encoded_word, "?");
|
| - RFC2047EncodingType enc_type = Q_ENCODING;
|
| - while (*is_rfc2047 && t.GetNext()) {
|
| - std::string part = t.token();
|
| - switch (part_index) {
|
| - case 0:
|
| - if (part != "=") {
|
| - *is_rfc2047 = false;
|
| - break;
|
| - }
|
| - ++part_index;
|
| - break;
|
| - case 1:
|
| - // Do we need charset validity check here?
|
| - charset = part;
|
| - ++part_index;
|
| - break;
|
| - case 2:
|
| - if (part.size() > 1 ||
|
| - part.find_first_of("bBqQ") == std::string::npos) {
|
| - *is_rfc2047 = false;
|
| - break;
|
| - }
|
| - if (part[0] == 'b' || part[0] == 'B') {
|
| - enc_type = B_ENCODING;
|
| - }
|
| - ++part_index;
|
| - break;
|
| - case 3:
|
| - *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &tmp);
|
| - if (!*is_rfc2047) {
|
| - // Last minute failure. Invalid B/Q encoding. Rather than
|
| - // passing it through, return now.
|
| - return false;
|
| - }
|
| - ++part_index;
|
| - break;
|
| - case 4:
|
| - if (part != "=") {
|
| - // Another last minute failure !
|
| - // Likely to be a case of two encoded-words in a row or
|
| - // an encoded word followed by a non-encoded word. We can be
|
| - // generous, but it does not help much in terms of compatibility,
|
| - // I believe. Return immediately.
|
| - *is_rfc2047 = false;
|
| - return false;
|
| - }
|
| - ++part_index;
|
| - break;
|
| - default:
|
| - *is_rfc2047 = false;
|
| - return false;
|
| - }
|
| - }
|
| -
|
| - if (*is_rfc2047) {
|
| - if (*(encoded_word.end() - 1) == '=') {
|
| - output->swap(tmp);
|
| - return true;
|
| - }
|
| - // encoded_word ending prematurelly with '?' or extra '?'
|
| - *is_rfc2047 = false;
|
| - return false;
|
| - }
|
| -
|
| - // We're not handling 'especial' characters quoted with '\', but
|
| - // it should be Ok because we're not an email client but a
|
| - // web browser.
|
| -
|
| - // What IE6/7 does: %-escaped UTF-8.
|
| - tmp = UnescapeURLComponent(encoded_word, UnescapeRule::SPACES);
|
| - if (IsStringUTF8(tmp)) {
|
| - output->swap(tmp);
|
| - return true;
|
| - // We can try either the OS default charset or 'origin charset' here,
|
| - // As far as I can tell, IE does not support it. However, I've seen
|
| - // web servers emit %-escaped string in a legacy encoding (usually
|
| - // origin charset).
|
| - // TODO(jungshik) : Test IE further and consider adding a fallback here.
|
| - }
|
| - return false;
|
| -}
|
| -
|
| // Does some simple normalization of scripts so we can allow certain scripts
|
| // to exist together.
|
| // TODO(brettw) bug 880223: we should allow some other languages to be
|
| @@ -939,12 +747,20 @@ std::string GetFileNameFromURL(const GURL& url,
|
|
|
| // The URL's path should be escaped UTF-8, but may not be.
|
| std::string decoded_filename = unescaped_url_filename;
|
| - if (!IsStringASCII(decoded_filename)) {
|
| - bool ignore;
|
| + if (!IsStringUTF8(decoded_filename)) {
|
| // TODO(jshin): this is probably not robust enough. To be sure, we need
|
| // encoding detection.
|
| - DecodeWord(unescaped_url_filename, referrer_charset, &ignore,
|
| - &decoded_filename);
|
| + string16 utf16_output;
|
| + if (!referrer_charset.empty() &&
|
| + base::CodepageToUTF16(unescaped_url_filename,
|
| + referrer_charset.c_str(),
|
| + base::OnStringConversionError::FAIL,
|
| + &utf16_output)) {
|
| + decoded_filename = UTF16ToUTF8(utf16_output);
|
| + } else {
|
| + decoded_filename = WideToUTF8(
|
| + base::SysNativeMBToWide(unescaped_url_filename));
|
| + }
|
| }
|
| // If the URL contains a (possibly empty) query, assume it is a generator, and
|
| // allow the determined extension to be overwritten.
|
| @@ -1158,96 +974,6 @@ std::string GetSpecificHeader(const std::string& headers,
|
| return ret;
|
| }
|
|
|
| -bool DecodeCharset(const std::string& input,
|
| - std::string* decoded_charset,
|
| - std::string* value) {
|
| - StringTokenizer t(input, "'");
|
| - t.set_options(StringTokenizer::RETURN_DELIMS);
|
| - std::string temp_charset;
|
| - std::string temp_value;
|
| - int numDelimsSeen = 0;
|
| - while (t.GetNext()) {
|
| - if (t.token_is_delim()) {
|
| - ++numDelimsSeen;
|
| - continue;
|
| - } else {
|
| - switch (numDelimsSeen) {
|
| - case 0:
|
| - temp_charset = t.token();
|
| - break;
|
| - case 1:
|
| - // Language is ignored.
|
| - break;
|
| - case 2:
|
| - temp_value = t.token();
|
| - break;
|
| - default:
|
| - return false;
|
| - }
|
| - }
|
| - }
|
| - if (numDelimsSeen != 2)
|
| - return false;
|
| - if (temp_charset.empty() || temp_value.empty())
|
| - return false;
|
| - decoded_charset->swap(temp_charset);
|
| - value->swap(temp_value);
|
| - return true;
|
| -}
|
| -
|
| -bool DecodeFilenameValue(const std::string& input,
|
| - const std::string& referrer_charset,
|
| - std::string* output) {
|
| - std::string tmp;
|
| - // Tokenize with whitespace characters.
|
| - StringTokenizer t(input, " \t\n\r");
|
| - t.set_options(StringTokenizer::RETURN_DELIMS);
|
| - bool is_previous_token_rfc2047 = true;
|
| - while (t.GetNext()) {
|
| - if (t.token_is_delim()) {
|
| - // If the previous non-delimeter token is not RFC2047-encoded,
|
| - // put in a space in its place. Otheriwse, skip over it.
|
| - if (!is_previous_token_rfc2047) {
|
| - tmp.push_back(' ');
|
| - }
|
| - continue;
|
| - }
|
| - // We don't support a single multibyte character split into
|
| - // adjacent encoded words. Some broken mail clients emit headers
|
| - // with that problem, but most web servers usually encode a filename
|
| - // in a single encoded-word. Firefox/Thunderbird do not support
|
| - // it, either.
|
| - std::string decoded;
|
| - if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047,
|
| - &decoded))
|
| - return false;
|
| - tmp.append(decoded);
|
| - }
|
| - output->swap(tmp);
|
| - return true;
|
| -}
|
| -
|
| -bool DecodeExtValue(const std::string& param_value, std::string* decoded) {
|
| - if (param_value.find('"') != std::string::npos)
|
| - return false;
|
| -
|
| - std::string charset;
|
| - std::string value;
|
| - if (!DecodeCharset(param_value, &charset, &value))
|
| - return false;
|
| -
|
| - // RFC 5987 value should be ASCII-only.
|
| - if (!IsStringASCII(value)) {
|
| - decoded->clear();
|
| - return true;
|
| - }
|
| -
|
| - std::string unescaped = UnescapeURLComponent(value,
|
| - UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS);
|
| -
|
| - return base::ConvertToUtf8AndNormalize(unescaped, charset, decoded);
|
| -}
|
| -
|
| string16 IDNToUnicode(const std::string& host,
|
| const std::string& languages) {
|
| return IDNToUnicodeWithOffsets(host, languages, NULL);
|
|
|