net/base/net_util.cc - Issue 11471041: Move DecodeFilenameValue and DecodeExtValue into http_content_disposition.

Unified Diff: net/base/net_util.cc

Issue 11471041: Move DecodeFilenameValue and DecodeExtValue into http_content_disposition. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Created 8 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: net/base/net_util.cc

diff --git a/net/base/net_util.cc b/net/base/net_util.cc

index 2b11c4dd04d1b7d89495745d9661b72ab380ddaf..5f321c6d01fa68a4379c741da0a27c4540160f45 100644

--- a/net/base/net_util.cc

+++ b/net/base/net_util.cc

@@ -25,7 +25,6 @@

#include <netinet/in.h>

#endif

-#include "base/base64.h"

#include "base/basictypes.h"

#include "base/file_path.h"

#include "base/file_util.h"

@@ -71,7 +70,6 @@

#include "net/http/http_content_disposition.h"

#include "unicode/datefmt.h"

#include "unicode/regex.h"

-#include "unicode/ucnv.h"

#include "unicode/uidna.h"

#include "unicode/ulocdata.h"

#include "unicode/uniset.h"

@@ -175,196 +173,6 @@ std::string::size_type CountTrailingChars(

}

#endif

-// Similar to Base64Decode. Decodes a Q-encoded string to a sequence

-// of bytes. If input is invalid, return false.

-bool QPDecode(const std::string& input, std::string* output) {

- std::string temp;

- temp.reserve(input.size());

- for (std::string::const_iterator it = input.begin(); it != input.end();

- ++it) {

- if (*it == '_') {

- temp.push_back(' ');

- } else if (*it == '=') {

- if ((input.end() - it < 3) ||

- !IsHexDigit(static_cast<unsigned char>(*(it + 1))) ||

- !IsHexDigit(static_cast<unsigned char>(*(it + 2))))

- return false;

- unsigned char ch = HexDigitToInt(*(it + 1)) * 16 +

- HexDigitToInt(*(it + 2));

- temp.push_back(static_cast<char>(ch));

- ++it;

- } else if (0x20 < *it && *it < 0x7F) {

- // In a Q-encoded word, only printable ASCII characters

- // represent themselves. Besides, space, '=', '_' and '?' are

- // not allowed, but they're already filtered out.

- DCHECK_NE('=', *it);

- DCHECK_NE('?', *it);

- DCHECK_NE('_', *it);

- temp.push_back(*it);

- } else {

- return false;

- }

- output->swap(temp);

- return true;

-enum RFC2047EncodingType {Q_ENCODING, B_ENCODING};

-bool DecodeBQEncoding(const std::string& part,

- RFC2047EncodingType enc_type,

- const std::string& charset,

- std::string* output) {

- std::string decoded;

- if (!((enc_type == B_ENCODING) ?

- base::Base64Decode(part, &decoded) : QPDecode(part, &decoded)))

- return false;

- if (decoded.empty()) {

- output->clear();

- return true;

- }

- UErrorCode err = U_ZERO_ERROR;

- UConverter* converter(ucnv_open(charset.c_str(), &err));

- if (U_FAILURE(err))

- return false;

- // A single byte in a legacy encoding can be expanded to 3 bytes in UTF-8.

- // A 'two-byte character' in a legacy encoding can be expanded to 4 bytes

- // in UTF-8. Therefore, the expansion ratio is 3 at most. Add one for a

- // trailing '\0'.

- size_t output_length = decoded.length() * 3 + 1;

- char* buf = WriteInto(output, output_length);

- output_length = ucnv_toAlgorithmic(UCNV_UTF8, converter, buf, output_length,

- decoded.data(), decoded.length(), &err);

- ucnv_close(converter);

- if (U_FAILURE(err))

- return false;

- output->resize(output_length);

- return true;

-bool DecodeWord(const std::string& encoded_word,

- const std::string& referrer_charset,

- bool* is_rfc2047,

- std::string* output) {

- *is_rfc2047 = false;

- output->clear();

- if (encoded_word.empty())

- return true;

- if (!IsStringASCII(encoded_word)) {

- // Try UTF-8, referrer_charset and the native OS default charset in turn.

- if (IsStringUTF8(encoded_word)) {

- *output = encoded_word;

- } else {

- string16 utf16_output;

- if (!referrer_charset.empty() &&

- base::CodepageToUTF16(encoded_word, referrer_charset.c_str(),

- base::OnStringConversionError::FAIL,

- &utf16_output)) {

- *output = UTF16ToUTF8(utf16_output);

- } else {

- *output = WideToUTF8(base::SysNativeMBToWide(encoded_word));

- }

- return true;

- }

- // RFC 2047 : one of encoding methods supported by Firefox and relatively

- // widely used by web servers.

- // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'.

- // We don't care about the length restriction (72 bytes) because

- // many web servers generate encoded words longer than the limit.

- std::string tmp;

- *is_rfc2047 = true;

- int part_index = 0;

- std::string charset;

- StringTokenizer t(encoded_word, "?");

- RFC2047EncodingType enc_type = Q_ENCODING;

- while (*is_rfc2047 && t.GetNext()) {

- std::string part = t.token();

- switch (part_index) {

- case 0:

- if (part != "=") {

- *is_rfc2047 = false;

- break;

- }

- ++part_index;

- break;

- case 1:

- // Do we need charset validity check here?

- charset = part;

- ++part_index;

- break;

- case 2:

- if (part.size() > 1 ||

- part.find_first_of("bBqQ") == std::string::npos) {

- *is_rfc2047 = false;

- break;

- }

- if (part[0] == 'b' || part[0] == 'B') {

- enc_type = B_ENCODING;

- }

- ++part_index;

- break;

- case 3:

- *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &tmp);

- if (!*is_rfc2047) {

- // Last minute failure. Invalid B/Q encoding. Rather than

- // passing it through, return now.

- return false;

- }

- ++part_index;

- break;

- case 4:

- if (part != "=") {

- // Another last minute failure !

- // Likely to be a case of two encoded-words in a row or

- // an encoded word followed by a non-encoded word. We can be

- // generous, but it does not help much in terms of compatibility,

- // I believe. Return immediately.

- *is_rfc2047 = false;

- return false;

- }

- ++part_index;

- break;

- default:

- *is_rfc2047 = false;

- return false;

- }

- if (*is_rfc2047) {

- if (*(encoded_word.end() - 1) == '=') {

- output->swap(tmp);

- return true;

- }

- // encoded_word ending prematurelly with '?' or extra '?'

- *is_rfc2047 = false;

- return false;

- }

- // We're not handling 'especial' characters quoted with '\', but

- // it should be Ok because we're not an email client but a

- // web browser.

- // What IE6/7 does: %-escaped UTF-8.

- tmp = UnescapeURLComponent(encoded_word, UnescapeRule::SPACES);

- if (IsStringUTF8(tmp)) {

- output->swap(tmp);

- return true;

- // We can try either the OS default charset or 'origin charset' here,

- // As far as I can tell, IE does not support it. However, I've seen

- // web servers emit %-escaped string in a legacy encoding (usually

- // origin charset).

- // TODO(jungshik) : Test IE further and consider adding a fallback here.

- }

- return false;

// Does some simple normalization of scripts so we can allow certain scripts

// to exist together.

// TODO(brettw) bug 880223: we should allow some other languages to be

@@ -939,12 +747,20 @@ std::string GetFileNameFromURL(const GURL& url,

// The URL's path should be escaped UTF-8, but may not be.

std::string decoded_filename = unescaped_url_filename;

- if (!IsStringASCII(decoded_filename)) {

- bool ignore;

+ if (!IsStringUTF8(decoded_filename)) {

// TODO(jshin): this is probably not robust enough. To be sure, we need

// encoding detection.

- DecodeWord(unescaped_url_filename, referrer_charset, &ignore,

- &decoded_filename);

+ string16 utf16_output;

+ if (!referrer_charset.empty() &&

+ base::CodepageToUTF16(unescaped_url_filename,

+ referrer_charset.c_str(),

+ base::OnStringConversionError::FAIL,

+ &utf16_output)) {

+ decoded_filename = UTF16ToUTF8(utf16_output);

+ } else {

+ decoded_filename = WideToUTF8(

+ base::SysNativeMBToWide(unescaped_url_filename));

+ }

}

// If the URL contains a (possibly empty) query, assume it is a generator, and

// allow the determined extension to be overwritten.

@@ -1158,96 +974,6 @@ std::string GetSpecificHeader(const std::string& headers,

return ret;

}

-bool DecodeCharset(const std::string& input,

- std::string* decoded_charset,

- std::string* value) {

- StringTokenizer t(input, "'");

- t.set_options(StringTokenizer::RETURN_DELIMS);

- std::string temp_charset;

- std::string temp_value;

- int numDelimsSeen = 0;

- while (t.GetNext()) {

- if (t.token_is_delim()) {

- ++numDelimsSeen;

- continue;

- } else {

- switch (numDelimsSeen) {

- case 0:

- temp_charset = t.token();

- break;

- case 1:

- // Language is ignored.

- break;

- case 2:

- temp_value = t.token();

- break;

- default:

- return false;

- }

- if (numDelimsSeen != 2)

- return false;

- if (temp_charset.empty() || temp_value.empty())

- return false;

- decoded_charset->swap(temp_charset);

- value->swap(temp_value);

- return true;

-bool DecodeFilenameValue(const std::string& input,

- const std::string& referrer_charset,

- std::string* output) {

- std::string tmp;

- // Tokenize with whitespace characters.

- StringTokenizer t(input, " \t\n\r");

- t.set_options(StringTokenizer::RETURN_DELIMS);

- bool is_previous_token_rfc2047 = true;

- while (t.GetNext()) {

- if (t.token_is_delim()) {

- // If the previous non-delimeter token is not RFC2047-encoded,

- // put in a space in its place. Otheriwse, skip over it.

- if (!is_previous_token_rfc2047) {

- tmp.push_back(' ');

- }

- continue;

- }

- // We don't support a single multibyte character split into

- // adjacent encoded words. Some broken mail clients emit headers

- // with that problem, but most web servers usually encode a filename

- // in a single encoded-word. Firefox/Thunderbird do not support

- // it, either.

- std::string decoded;

- if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047,

- &decoded))

- return false;

- tmp.append(decoded);

- }

- output->swap(tmp);

- return true;

-bool DecodeExtValue(const std::string& param_value, std::string* decoded) {

- if (param_value.find('"') != std::string::npos)

- return false;

- std::string charset;

- std::string value;

- if (!DecodeCharset(param_value, &charset, &value))

- return false;

- // RFC 5987 value should be ASCII-only.

- if (!IsStringASCII(value)) {

- decoded->clear();

- return true;

- }

- std::string unescaped = UnescapeURLComponent(value,

- UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS);

- return base::ConvertToUtf8AndNormalize(unescaped, charset, decoded);

string16 IDNToUnicode(const std::string& host,

const std::string& languages) {

return IDNToUnicodeWithOffsets(host, languages, NULL);

« no previous file with comments | « net/base/net_util.h ('k') | net/base/net_util_unittest.cc » ('j') | net/http/http_content_disposition.cc » ('J')