Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(526)

Side by Side Diff: net/base/net_util.cc

Issue 6898026: Eliminate wstring from base/utf_offset_string_conversions.h, net/base/escape.h, and net/base/net_... (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/
Patch Set: '' Created 9 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « net/base/net_util.h ('k') | net/base/net_util_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "net/base/net_util.h" 5 #include "net/base/net_util.h"
6 6
7 #include <unicode/regex.h> 7 #include <unicode/regex.h>
8 #include <unicode/ucnv.h> 8 #include <unicode/ucnv.h>
9 #include <unicode/uidna.h> 9 #include <unicode/uidna.h>
10 #include <unicode/ulocdata.h> 10 #include <unicode/ulocdata.h>
(...skipping 137 matching lines...) Expand 10 before | Expand all | Expand 10 after
148 0xFFFF, // Used to block all invalid port numbers (see 148 0xFFFF, // Used to block all invalid port numbers (see
149 // third_party/WebKit/Source/WebCore/platform/KURLGoogle.cpp, port()) 149 // third_party/WebKit/Source/WebCore/platform/KURLGoogle.cpp, port())
150 }; 150 };
151 151
152 // FTP overrides the following restricted ports. 152 // FTP overrides the following restricted ports.
153 static const int kAllowedFtpPorts[] = { 153 static const int kAllowedFtpPorts[] = {
154 21, // ftp control 154 21, // ftp control
155 22, // ssh 155 22, // ssh
156 }; 156 };
157 157
158 template<typename STR>
159 STR GetSpecificHeaderT(const STR& headers, const STR& name) {
160 // We want to grab the Value from the "Key: Value" pairs in the headers,
161 // which should look like this (no leading spaces, \n-separated) (we format
162 // them this way in url_request_inet.cc):
163 // HTTP/1.1 200 OK\n
164 // ETag: "6d0b8-947-24f35ec0"\n
165 // Content-Length: 2375\n
166 // Content-Type: text/html; charset=UTF-8\n
167 // Last-Modified: Sun, 03 Sep 2006 04:34:43 GMT\n
168 if (headers.empty())
169 return STR();
170
171 STR match;
172 match.push_back('\n');
173 match.append(name);
174 match.push_back(':');
175
176 typename STR::const_iterator begin =
177 search(headers.begin(), headers.end(), match.begin(), match.end(),
178 base::CaseInsensitiveCompareASCII<typename STR::value_type>());
179
180 if (begin == headers.end())
181 return STR();
182
183 begin += match.length();
184
185 typename STR::const_iterator end = find(begin, headers.end(), '\n');
186
187 STR ret;
188 TrimWhitespace(STR(begin, end), TRIM_ALL, &ret);
189 return ret;
190 }
191
192 // Similar to Base64Decode. Decodes a Q-encoded string to a sequence 158 // Similar to Base64Decode. Decodes a Q-encoded string to a sequence
193 // of bytes. If input is invalid, return false. 159 // of bytes. If input is invalid, return false.
194 bool QPDecode(const std::string& input, std::string* output) { 160 bool QPDecode(const std::string& input, std::string* output) {
195 std::string temp; 161 std::string temp;
196 temp.reserve(input.size()); 162 temp.reserve(input.size());
197 std::string::const_iterator it = input.begin(); 163 std::string::const_iterator it = input.begin();
198 while (it != input.end()) { 164 while (it != input.end()) {
199 if (*it == '_') { 165 if (*it == '_') {
200 temp.push_back(' '); 166 temp.push_back(' ');
201 } else if (*it == '=') { 167 } else if (*it == '=') {
(...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after
269 *is_rfc2047 = false; 235 *is_rfc2047 = false;
270 output->clear(); 236 output->clear();
271 if (encoded_word.empty()) 237 if (encoded_word.empty())
272 return true; 238 return true;
273 239
274 if (!IsStringASCII(encoded_word)) { 240 if (!IsStringASCII(encoded_word)) {
275 // Try UTF-8, referrer_charset and the native OS default charset in turn. 241 // Try UTF-8, referrer_charset and the native OS default charset in turn.
276 if (IsStringUTF8(encoded_word)) { 242 if (IsStringUTF8(encoded_word)) {
277 *output = encoded_word; 243 *output = encoded_word;
278 } else { 244 } else {
279 std::wstring wide_output; 245 string16 utf16_output;
280 if (!referrer_charset.empty() && 246 if (!referrer_charset.empty() &&
281 base::CodepageToWide(encoded_word, referrer_charset.c_str(), 247 base::CodepageToUTF16(encoded_word, referrer_charset.c_str(),
282 base::OnStringConversionError::FAIL, 248 base::OnStringConversionError::FAIL,
283 &wide_output)) { 249 &utf16_output)) {
284 *output = WideToUTF8(wide_output); 250 *output = UTF16ToUTF8(utf16_output);
285 } else { 251 } else {
286 *output = WideToUTF8(base::SysNativeMBToWide(encoded_word)); 252 *output = WideToUTF8(base::SysNativeMBToWide(encoded_word));
287 } 253 }
288 } 254 }
289 255
290 return true; 256 return true;
291 } 257 }
292 258
293 // RFC 2047 : one of the encoding methods supported by Firefox and relatively 259 // RFC 2047 : one of the encoding methods supported by Firefox and relatively
294 // widely used by web servers. 260 // widely used by web servers.
(...skipping 112 matching lines...) Expand 10 before | Expand all | Expand 10 after
407 std::string decoded; 373 std::string decoded;
408 if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047, 374 if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047,
409 &decoded)) 375 &decoded))
410 return false; 376 return false;
411 tmp.append(decoded); 377 tmp.append(decoded);
412 } 378 }
413 output->swap(tmp); 379 output->swap(tmp);
414 return true; 380 return true;
415 } 381 }
416 382
417 // TODO(mpcomplete): This is a quick and dirty implementation for now. I'm
418 // sure this doesn't properly handle all (most?) cases.
419 template<typename STR>
420 STR GetHeaderParamValueT(const STR& header, const STR& param_name,
421 QuoteRule::Type quote_rule) {
422 // This assumes args are formatted exactly like "bla; arg1=value; arg2=value".
423 typename STR::const_iterator param_begin =
424 search(header.begin(), header.end(), param_name.begin(), param_name.end(),
425 base::CaseInsensitiveCompareASCII<typename STR::value_type>());
426
427 if (param_begin == header.end())
428 return STR();
429 param_begin += param_name.length();
430
431 STR whitespace;
432 whitespace.push_back(' ');
433 whitespace.push_back('\t');
434 const typename STR::size_type equals_offset =
435 header.find_first_not_of(whitespace, param_begin - header.begin());
436 if (equals_offset == STR::npos || header.at(equals_offset) != '=')
437 return STR();
438
439 param_begin = header.begin() + equals_offset + 1;
440 if (param_begin == header.end())
441 return STR();
442
443 typename STR::const_iterator param_end;
444 if (*param_begin == '"' && quote_rule == QuoteRule::REMOVE_OUTER_QUOTES) {
445 ++param_begin; // skip past the quote.
446 param_end = find(param_begin, header.end(), '"');
447 // If the closing quote is missing, we will treat the rest of the
448 // string as the parameter. We can't set |param_end| to the
449 // location of the separator (';'), since the separator is
450 // technically quoted. See: http://crbug.com/58840
451 } else {
452 param_end = find(param_begin+1, header.end(), ';');
453 }
454
455 return STR(param_begin, param_end);
456 }
457
458 // Does some simple normalization of scripts so we can allow certain scripts 383 // Does some simple normalization of scripts so we can allow certain scripts
459 // to exist together. 384 // to exist together.
460 // TODO(brettw) bug 880223: we should allow some other languages to be 385 // TODO(brettw) bug 880223: we should allow some other languages to be
461 // combined such as Chinese and Latin. We will probably need a more 386 // combined such as Chinese and Latin. We will probably need a more
462 // complicated system of language pairs to have more fine-grained control. 387 // complicated system of language pairs to have more fine-grained control.
463 UScriptCode NormalizeScript(UScriptCode code) { 388 UScriptCode NormalizeScript(UScriptCode code) {
464 switch (code) { 389 switch (code) {
465 case USCRIPT_KATAKANA: 390 case USCRIPT_KATAKANA:
466 case USCRIPT_HIRAGANA: 391 case USCRIPT_HIRAGANA:
467 case USCRIPT_KATAKANA_OR_HIRAGANA: 392 case USCRIPT_KATAKANA_OR_HIRAGANA:
(...skipping 118 matching lines...) Expand 10 before | Expand all | Expand 10 after
586 ulocdata_close(uld); 511 ulocdata_close(uld);
587 } 512 }
588 } 513 }
589 return !lang_set->isEmpty() && lang_set->containsAll(component_characters); 514 return !lang_set->isEmpty() && lang_set->containsAll(component_characters);
590 } 515 }
591 516
592 // Returns true if the given Unicode host component is safe to display to the 517 // Returns true if the given Unicode host component is safe to display to the
593 // user. 518 // user.
594 bool IsIDNComponentSafe(const char16* str, 519 bool IsIDNComponentSafe(const char16* str,
595 int str_len, 520 int str_len,
596 const std::wstring& languages) { 521 const std::string& languages) {
597 // Most common cases (non-IDN) do not reach here so that we don't 522 // Most common cases (non-IDN) do not reach here so that we don't
598 // need a fast return path. 523 // need a fast return path.
599 // TODO(jungshik) : Check if there's any character inappropriate 524 // TODO(jungshik) : Check if there's any character inappropriate
600 // (although allowed) for domain names. 525 // (although allowed) for domain names.
601 // See http://www.unicode.org/reports/tr39/#IDN_Security_Profiles and 526 // See http://www.unicode.org/reports/tr39/#IDN_Security_Profiles and
602 // http://www.unicode.org/reports/tr39/data/xidmodifications.txt 527 // http://www.unicode.org/reports/tr39/data/xidmodifications.txt
603 // For now, we borrow the list from Mozilla and tweaked it slightly. 528 // For now, we borrow the list from Mozilla and tweaked it slightly.
604 // (e.g. Characters like U+00A0, U+3000, U+3002 are omitted because 529 // (e.g. Characters like U+00A0, U+3000, U+3002 are omitted because
605 // they're gonna be canonicalized to U+0020 and full stop before 530 // they're gonna be canonicalized to U+0020 and full stop before
606 // reaching here.) 531 // reaching here.)
(...skipping 63 matching lines...) Expand 10 before | Expand all | Expand 10 after
670 // (sync'd with characters allowed in url_canon_host with square 595 // (sync'd with characters allowed in url_canon_host with square
671 // brackets excluded.) See kHostCharLookup[] array in url_canon_host.cc. 596 // brackets excluded.) See kHostCharLookup[] array in url_canon_host.cc.
672 icu::UnicodeSet common_characters(UNICODE_STRING_SIMPLE("[[0-9]\\-_+\\ ]"), 597 icu::UnicodeSet common_characters(UNICODE_STRING_SIMPLE("[[0-9]\\-_+\\ ]"),
673 status); 598 status);
674 DCHECK(U_SUCCESS(status)); 599 DCHECK(U_SUCCESS(status));
675 // Subtract common characters because they're always allowed so that 600 // Subtract common characters because they're always allowed so that
676 // we just have to check if a language-specific set contains 601 // we just have to check if a language-specific set contains
677 // the remainder. 602 // the remainder.
678 component_characters.removeAll(common_characters); 603 component_characters.removeAll(common_characters);
679 604
680 std::string languages_list(WideToASCII(languages)); 605 StringTokenizer t(languages, ",");
681 StringTokenizer t(languages_list, ",");
682 while (t.GetNext()) { 606 while (t.GetNext()) {
683 if (IsComponentCoveredByLang(component_characters, t.token())) 607 if (IsComponentCoveredByLang(component_characters, t.token()))
684 return true; 608 return true;
685 } 609 }
686 return false; 610 return false;
687 } 611 }
688 612
689 // Converts one component of a host (between dots) to IDN if safe. The result 613 // Converts one component of a host (between dots) to IDN if safe. The result
690 // will be APPENDED to the given output string and will be the same as the input 614 // will be APPENDED to the given output string and will be the same as the input
691 // if it is not IDN or the IDN is unsafe to display. Returns whether any 615 // if it is not IDN or the IDN is unsafe to display. Returns whether any
692 // conversion was performed. 616 // conversion was performed.
693 bool IDNToUnicodeOneComponent(const char16* comp, 617 bool IDNToUnicodeOneComponent(const char16* comp,
694 size_t comp_len, 618 size_t comp_len,
695 const std::wstring& languages, 619 const std::string& languages,
696 string16* out) { 620 string16* out) {
697 DCHECK(out); 621 DCHECK(out);
698 if (comp_len == 0) 622 if (comp_len == 0)
699 return false; 623 return false;
700 624
701 // Only transform if the input can be an IDN component. 625 // Only transform if the input can be an IDN component.
702 static const char16 kIdnPrefix[] = {'x', 'n', '-', '-'}; 626 static const char16 kIdnPrefix[] = {'x', 'n', '-', '-'};
703 if ((comp_len > arraysize(kIdnPrefix)) && 627 if ((comp_len > arraysize(kIdnPrefix)) &&
704 !memcmp(comp, kIdnPrefix, arraysize(kIdnPrefix) * sizeof(char16))) { 628 !memcmp(comp, kIdnPrefix, arraysize(kIdnPrefix) * sizeof(char16))) {
705 // Repeatedly expand the output string until it's big enough. It looks like 629 // Repeatedly expand the output string until it's big enough. It looks like
(...skipping 21 matching lines...) Expand all
727 // Failed, revert back to original string. 651 // Failed, revert back to original string.
728 out->resize(original_length); 652 out->resize(original_length);
729 } 653 }
730 654
731 // We get here with no IDN or on error, in which case we just append the 655 // We get here with no IDN or on error, in which case we just append the
732 // literal input. 656 // literal input.
733 out->append(comp, comp_len); 657 out->append(comp, comp_len);
734 return false; 658 return false;
735 } 659 }
736 660
737 struct SubtractFromOffset { 661 // Functions may stack-allocate one of these in order to clamp the offsets in
mrossetti 2011/04/26 22:13:34 I'm not sure I see the point of making this an aut
Peter Kasting 2011/04/27 02:07:19 Good point. Originally I wanted an object to guar
738 explicit SubtractFromOffset(size_t amount) 662 // |offsets_for_adjustment| to the length of |output| on exit.
739 : amount(amount) {} 663 class OffsetLimiter {
740 void operator()(size_t& offset) { 664 public:
741 if (offset != std::wstring::npos) { 665 OffsetLimiter(std::vector<size_t>* offsets_for_adjustment, string16* output)
742 if (offset >= amount) 666 : offsets_for_adjustment_(offsets_for_adjustment),
743 offset -= amount; 667 output_(output) {
744 else 668 }
745 offset = std::wstring::npos; 669
670 ~OffsetLimiter() {
671 if (offsets_for_adjustment_) {
672 std::for_each(offsets_for_adjustment_->begin(),
673 offsets_for_adjustment_->end(),
674 LimitOffset<string16>(output_->length()));
746 } 675 }
747 } 676 }
748 677
749 size_t amount; 678 private:
679 std::vector<size_t>* offsets_for_adjustment_;
680 string16* output_;
681
682 DISALLOW_COPY_AND_ASSIGN(OffsetLimiter);
750 }; 683 };
751 684
752 struct AddToOffset { 685 // TODO(brettw) bug 734373: check the scripts for each host component and
753 explicit AddToOffset(size_t amount) 686 // don't un-IDN-ize if there is more than one. Alternatively, only IDN for
754 : amount(amount) {} 687 // scripts that the user has installed. For now, just put the entire
755 void operator()(size_t& offset) { 688 // path through IDN. Maybe this feature can be implemented in ICU itself?
756 if (offset != std::wstring::npos) 689 //
757 offset += amount; 690 // We may want to skip this step in the case of file URLs to allow unicode
691 // UNC hostnames regardless of encodings.
692 string16 IDNToUnicodeWithOffsets(const std::string& host,
693 const std::string& languages,
694 std::vector<size_t>* offsets_for_adjustment) {
695 // Convert the ASCII input to a string16 for ICU.
696 string16 input16;
697 input16.reserve(host.length());
698 input16.insert(input16.end(), host.begin(), host.end());
699
700 // Do each component of the host separately, since we enforce script matching
701 // on a per-component basis.
702 AdjustOffset::Adjustments adjustments;
703 string16 out16;
704 OffsetLimiter offset_limiter(offsets_for_adjustment, &out16);
mrossetti 2011/04/26 22:13:34 Continuing from the previous comment: This auto co
705 for (size_t component_start = 0, component_end;
706 component_start < input16.length();
707 component_start = component_end + 1) {
708 // Find the end of the component.
709 component_end = input16.find('.', component_start);
710 if (component_end == string16::npos)
711 component_end = input16.length(); // For getting the last component.
712 size_t component_length = component_end - component_start;
713 size_t new_component_start = out16.length();
714 bool converted_idn = false;
715 if (component_end > component_start) {
716 // Add the substring that we just found.
717 converted_idn = IDNToUnicodeOneComponent(input16.data() + component_start,
718 component_length, languages, &out16);
719 }
720 size_t new_component_length = out16.length() - new_component_start;
721
722 if (converted_idn && offsets_for_adjustment) {
723 adjustments.push_back(AdjustOffset::Adjustment(
724 component_start, component_length, new_component_length));
725 }
726
727 // Need to add the dot we just found (if we found one).
728 if (component_end < input16.length())
729 out16.push_back('.');
758 } 730 }
759 731
760 size_t amount; 732 // Make offset adjustment.
761 }; 733 if (offsets_for_adjustment && !adjustments.empty()) {
734 std::for_each(offsets_for_adjustment->begin(),
735 offsets_for_adjustment->end(), AdjustOffset(adjustments));
736 }
762 737
763 std::vector<size_t> OffsetsIntoSection( 738 return out16;
764 std::vector<size_t>* offsets_for_adjustment,
765 size_t section_begin) {
766 std::vector<size_t> offsets_into_section;
767 if (offsets_for_adjustment) {
768 std::transform(offsets_for_adjustment->begin(),
769 offsets_for_adjustment->end(),
770 std::back_inserter(offsets_into_section),
771 ClampComponentOffset(section_begin));
772 std::for_each(offsets_into_section.begin(), offsets_into_section.end(),
773 SubtractFromOffset(section_begin));
774 }
775 return offsets_into_section;
776 } 739 }
777 740
778 void ApplySectionAdjustments(const std::vector<size_t>& offsets_into_section, 741 // Transforms |original_offsets| by subtracting |component_begin| from all
779 std::vector<size_t>* offsets_for_adjustment, 742 // offsets. Any offset which was not at least this large to begin with is set
780 size_t old_section_len, 743 // to std::string::npos.
781 size_t new_section_len, 744 std::vector<size_t> OffsetsIntoComponent(
782 size_t section_begin) { 745 const std::vector<size_t>& original_offsets,
783 if (offsets_for_adjustment) { 746 size_t component_begin) {
784 DCHECK_EQ(offsets_for_adjustment->size(), offsets_into_section.size()); 747 DCHECK_NE(std::string::npos, component_begin);
785 std::vector<size_t>::const_iterator host_offsets_iter = 748 std::vector<size_t> offsets_into_component(original_offsets);
786 offsets_into_section.begin(); 749 for (std::vector<size_t>::iterator i(offsets_into_component.begin());
787 for (std::vector<size_t>::iterator offsets_iter = 750 i != offsets_into_component.end(); ++i) {
788 offsets_for_adjustment->begin(); 751 if (*i != std::string::npos)
789 offsets_iter != offsets_for_adjustment->end(); 752 *i = (*i < component_begin) ? std::string::npos : (*i - component_begin);
790 ++offsets_iter, ++host_offsets_iter) { 753 }
791 size_t offset = *offsets_iter; 754 return offsets_into_component;
792 if (offset == std::wstring::npos || offset < section_begin) { 755 }
793 // The offset is before the host section so leave it as is. 756
794 continue; 757 // Called after we transform a component and append it to an output string.
795 } 758 // Maps |transformed_offsets|, which represent offsets into the transformed
796 if (offset >= section_begin + old_section_len) { 759 // component itself, into appropriate offsets for the output string, by adding
797 // The offset is after the host section so adjust by host length delta. 760 // |output_component_begin| to each. Determines which offsets need mapping by
798 offset += new_section_len - old_section_len; 761 // checking to see which of the |original_offsets| were within the designated
799 } else if (*host_offsets_iter != std::wstring::npos) { 762 // original component, using its provided endpoints.
800 // The offset is within the host and valid so adjust by the host 763 void AdjustForComponentTransform(
801 // reformatting offsets results. 764 const std::vector<size_t>& original_offsets,
802 offset = section_begin + *host_offsets_iter; 765 size_t original_component_begin,
803 } else { 766 size_t original_component_end,
804 // The offset is invalid. 767 const std::vector<size_t>& transformed_offsets,
805 offset = std::wstring::npos; 768 size_t output_component_begin,
806 } 769 std::vector<size_t>* offsets_for_adjustment) {
807 *offsets_iter = offset; 770 if (!offsets_for_adjustment)
771 return;
772
773 DCHECK_NE(std::string::npos, original_component_begin);
774 DCHECK_NE(std::string::npos, original_component_end);
775 DCHECK_NE(string16::npos, output_component_begin);
776 size_t offsets_size = offsets_for_adjustment->size();
777 DCHECK_EQ(offsets_size, original_offsets.size());
778 DCHECK_EQ(offsets_size, transformed_offsets.size());
779 for (size_t i = 0; i < offsets_size; ++i) {
780 size_t original_offset = original_offsets[i];
781 if ((original_offset >= original_component_begin) &&
782 (original_offset < original_component_end)) {
783 size_t transformed_offset = transformed_offsets[i];
784 (*offsets_for_adjustment)[i] = (transformed_offset == string16::npos) ?
785 string16::npos : (output_component_begin + transformed_offset);
808 } 786 }
809 } 787 }
810 } 788 }
811 789
812 // If |component| is valid, its begin is incremented by |delta|. 790 // If |component| is valid, its begin is incremented by |delta|.
813 void AdjustComponent(int delta, url_parse::Component* component) { 791 void AdjustComponent(int delta, url_parse::Component* component) {
814 if (!component->is_valid()) 792 if (!component->is_valid())
815 return; 793 return;
816 794
817 DCHECK(delta >= 0 || component->begin >= -delta); 795 DCHECK(delta >= 0 || component->begin >= -delta);
818 component->begin += delta; 796 component->begin += delta;
819 } 797 }
820 798
821 // Adjusts all the components of |parsed| by |delta|, except for the scheme. 799 // Adjusts all the components of |parsed| by |delta|, except for the scheme.
822 void AdjustComponents(int delta, url_parse::Parsed* parsed) { 800 void AdjustComponents(int delta, url_parse::Parsed* parsed) {
823 AdjustComponent(delta, &(parsed->username)); 801 AdjustComponent(delta, &(parsed->username));
824 AdjustComponent(delta, &(parsed->password)); 802 AdjustComponent(delta, &(parsed->password));
825 AdjustComponent(delta, &(parsed->host)); 803 AdjustComponent(delta, &(parsed->host));
826 AdjustComponent(delta, &(parsed->port)); 804 AdjustComponent(delta, &(parsed->port));
827 AdjustComponent(delta, &(parsed->path)); 805 AdjustComponent(delta, &(parsed->path));
828 AdjustComponent(delta, &(parsed->query)); 806 AdjustComponent(delta, &(parsed->query));
829 AdjustComponent(delta, &(parsed->ref)); 807 AdjustComponent(delta, &(parsed->ref));
830 } 808 }
831 809
832 std::wstring FormatUrlInternal(const GURL& url, 810 // Helper for FormatUrlWithOffsets().
833 const std::wstring& languages, 811 string16 FormatViewSourceUrl(const GURL& url,
834 FormatUrlTypes format_types, 812 const std::vector<size_t>& original_offsets,
835 UnescapeRule::Type unescape_rules, 813 const std::string& languages,
836 url_parse::Parsed* new_parsed, 814 FormatUrlTypes format_types,
837 size_t* prefix_end, 815 UnescapeRule::Type unescape_rules,
838 std::vector<size_t>* offsets_for_adjustment); 816 url_parse::Parsed* new_parsed,
817 size_t* prefix_end,
818 std::vector<size_t>* offsets_for_adjustment) {
819 DCHECK(new_parsed);
820 const char kViewSource[] = "view-source:";
821 const size_t kViewSourceLength = arraysize(kViewSource) - 1;
822 std::vector<size_t> offsets_into_url(
823 OffsetsIntoComponent(original_offsets, kViewSourceLength));
839 824
840 // Helper for FormatUrl()/FormatUrlInternal(). 825 GURL real_url(url.possibly_invalid_spec().substr(kViewSourceLength));
841 std::wstring FormatViewSourceUrl(const GURL& url, 826 string16 result(ASCIIToUTF16(kViewSource) +
842 const std::wstring& languages, 827 FormatUrlWithOffsets(real_url, languages, format_types, unescape_rules,
843 FormatUrlTypes format_types, 828 new_parsed, prefix_end, &offsets_into_url));
844 UnescapeRule::Type unescape_rules, 829 OffsetLimiter offset_limiter(offsets_for_adjustment, &result);
845 url_parse::Parsed* new_parsed,
846 size_t* prefix_end,
847 std::vector<size_t>* offsets_for_adjustment) {
848 DCHECK(new_parsed);
849 DCHECK(offsets_for_adjustment);
850 const wchar_t* const kWideViewSource = L"view-source:";
851 const size_t kViewSourceLengthPlus1 = 12;
852 std::vector<size_t> saved_offsets(*offsets_for_adjustment);
853
854 GURL real_url(url.possibly_invalid_spec().substr(kViewSourceLengthPlus1));
855 // Clamp the offsets to the source area.
856 std::for_each(offsets_for_adjustment->begin(),
857 offsets_for_adjustment->end(),
858 SubtractFromOffset(kViewSourceLengthPlus1));
859 std::wstring result = FormatUrlInternal(real_url, languages, format_types,
860 unescape_rules, new_parsed, prefix_end, offsets_for_adjustment);
861 result.insert(0, kWideViewSource);
862 830
863 // Adjust position values. 831 // Adjust position values.
864 if (new_parsed->scheme.is_nonempty()) { 832 if (new_parsed->scheme.is_nonempty()) {
865 // Assume "view-source:real-scheme" as a scheme. 833 // Assume "view-source:real-scheme" as a scheme.
866 new_parsed->scheme.len += kViewSourceLengthPlus1; 834 new_parsed->scheme.len += kViewSourceLength;
867 } else { 835 } else {
868 new_parsed->scheme.begin = 0; 836 new_parsed->scheme.begin = 0;
869 new_parsed->scheme.len = kViewSourceLengthPlus1 - 1; 837 new_parsed->scheme.len = kViewSourceLength - 1;
870 } 838 }
871 AdjustComponents(kViewSourceLengthPlus1, new_parsed); 839 AdjustComponents(kViewSourceLength, new_parsed);
872 if (prefix_end) 840 if (prefix_end)
873 *prefix_end += kViewSourceLengthPlus1; 841 *prefix_end += kViewSourceLength;
874 std::for_each(offsets_for_adjustment->begin(), 842 AdjustForComponentTransform(original_offsets, kViewSourceLength,
875 offsets_for_adjustment->end(), 843 url.possibly_invalid_spec().length(), offsets_into_url, kViewSourceLength,
876 AddToOffset(kViewSourceLengthPlus1)); 844 offsets_for_adjustment);
877 // Restore all offsets which were not affected by FormatUrlInternal.
878 DCHECK_EQ(saved_offsets.size(), offsets_for_adjustment->size());
879 for (size_t i = 0; i < saved_offsets.size(); ++i) {
880 if (saved_offsets[i] < kViewSourceLengthPlus1)
881 (*offsets_for_adjustment)[i] = saved_offsets[i];
882 }
883 return result; 845 return result;
884 } 846 }
885 847
886 // Appends the substring |in_component| inside of the URL |spec| to |output|, 848 class AppendComponentTransform {
887 // and the resulting range will be filled into |out_component|. |unescape_rules| 849 public:
888 // defines how to clean the URL for human readability. |offsets_for_adjustment| 850 AppendComponentTransform() {}
889 // is an array of offsets into |output| each of which will be adjusted based on 851 virtual string16 Execute(
890 // how it maps to the component being converted; if it is less than 852 const std::string& component_text,
891 // output->length(), it will be untouched, and if it is greater than 853 std::vector<size_t>* offsets_into_component) const = 0;
892 // output->length() + in_component.len it will be adjusted by the difference in 854
893 // lengths between the input and output components. Otherwise it points into 855 private:
894 // the component being converted, and is adjusted to point to the same logical 856 DISALLOW_COPY_AND_ASSIGN(AppendComponentTransform);
895 // place in |output|. |offsets_for_adjustment| may not be NULL. 857 };
858
859 class HostComponentTransform : public AppendComponentTransform {
860 public:
861 explicit HostComponentTransform(const std::string& languages)
862 : languages_(languages) {
863 }
864
865 private:
866 virtual string16 Execute(
867 const std::string& component_text,
868 std::vector<size_t>* offsets_into_component) const {
869 return IDNToUnicodeWithOffsets(component_text, languages_,
870 offsets_into_component);
871 }
872
873 const std::string& languages_;
874 };
875
876 class NonHostComponentTransform : public AppendComponentTransform {
877 public:
878 explicit NonHostComponentTransform(UnescapeRule::Type unescape_rules)
879 : unescape_rules_(unescape_rules) {
880 }
881
882 private:
883 virtual string16 Execute(
884 const std::string& component_text,
885 std::vector<size_t>* offsets_into_component) const {
886 return (unescape_rules_ == UnescapeRule::NONE) ?
887 UTF8ToUTF16AndAdjustOffsets(component_text, offsets_into_component) :
888 UnescapeAndDecodeUTF8URLComponentWithOffsets(component_text,
889 unescape_rules_, offsets_into_component);
890 }
891
892 const UnescapeRule::Type unescape_rules_;
893 };
894
896 void AppendFormattedComponent(const std::string& spec, 895 void AppendFormattedComponent(const std::string& spec,
897 const url_parse::Component& in_component, 896 const url_parse::Component& original_component,
898 UnescapeRule::Type unescape_rules, 897 const std::vector<size_t>& original_offsets,
899 std::wstring* output, 898 const AppendComponentTransform& transform,
900 url_parse::Component* out_component, 899 string16* output,
900 url_parse::Component* output_component,
901 std::vector<size_t>* offsets_for_adjustment) { 901 std::vector<size_t>* offsets_for_adjustment) {
902 DCHECK(output); 902 DCHECK(output);
903 DCHECK(offsets_for_adjustment); 903 if (original_component.is_nonempty()) {
904 if (in_component.is_nonempty()) { 904 size_t original_component_begin =
905 size_t component_begin = output->length(); 905 static_cast<size_t>(original_component.begin);
906 out_component->begin = static_cast<int>(component_begin); 906 size_t output_component_begin = output->length();
907 if (output_component)
908 output_component->begin = static_cast<int>(output_component_begin);
907 909
908 // Compose a list of offsets within the component area.
909 std::vector<size_t> offsets_into_component = 910 std::vector<size_t> offsets_into_component =
910 OffsetsIntoSection(offsets_for_adjustment, component_begin); 911 OffsetsIntoComponent(original_offsets, original_component_begin);
912 output->append(transform.Execute(std::string(spec, original_component_begin,
913 static_cast<size_t>(original_component.len)), &offsets_into_component));
911 914
912 if (unescape_rules == UnescapeRule::NONE) { 915 if (output_component) {
913 output->append(UTF8ToWideAndAdjustOffsets( 916 output_component->len =
914 spec.substr(in_component.begin, in_component.len), 917 static_cast<int>(output->length() - output_component_begin);
915 &offsets_into_component));
916 } else {
917 output->append(UTF16ToWideHack(
918 UnescapeAndDecodeUTF8URLComponentWithOffsets(
919 spec.substr(in_component.begin, in_component.len), unescape_rules,
920 &offsets_into_component)));
921 } 918 }
922 size_t new_component_len = output->length() - component_begin; 919 AdjustForComponentTransform(original_offsets, original_component_begin,
923 out_component->len = static_cast<int>(new_component_len); 920 static_cast<size_t>(original_component.end()),
924 921 offsets_into_component, output_component_begin,
925 // Apply offset adjustments. 922 offsets_for_adjustment);
926 size_t old_component_len = static_cast<size_t>(in_component.len); 923 } else if (output_component) {
927 ApplySectionAdjustments(offsets_into_component, offsets_for_adjustment, 924 output_component->reset();
928 old_component_len, new_component_len, component_begin);
929 } else {
930 out_component->reset();
931 } 925 }
932 } 926 }
933 927
934 // TODO(viettrungluu): This is really the old-fashioned version, made internal.
935 // I need to really convert |FormatUrl()|.
936 std::wstring FormatUrlInternal(const GURL& url,
937 const std::wstring& languages,
938 FormatUrlTypes format_types,
939 UnescapeRule::Type unescape_rules,
940 url_parse::Parsed* new_parsed,
941 size_t* prefix_end,
942 std::vector<size_t>* offsets_for_adjustment) {
943 url_parse::Parsed parsed_temp;
944 if (!new_parsed)
945 new_parsed = &parsed_temp;
946 else
947 *new_parsed = url_parse::Parsed();
948
949 std::vector<size_t> offsets_temp;
950 if (!offsets_for_adjustment)
951 offsets_for_adjustment = &offsets_temp;
952
953 std::wstring url_string;
954
955 // Check for empty URLs or 0 available text width.
956 if (url.is_empty()) {
957 if (prefix_end)
958 *prefix_end = 0;
959 std::for_each(offsets_for_adjustment->begin(),
960 offsets_for_adjustment->end(),
961 LimitOffset<std::wstring>(0));
962 return url_string;
963 }
964
965 // Special handling for view-source:. Don't use chrome::kViewSourceScheme
966 // because this library shouldn't depend on chrome.
967 const char* const kViewSource = "view-source";
968 // Reject "view-source:view-source:..." to avoid deep recursion.
969 const char* const kViewSourceTwice = "view-source:view-source:";
970 if (url.SchemeIs(kViewSource) &&
971 !StartsWithASCII(url.possibly_invalid_spec(), kViewSourceTwice, false)) {
972 return FormatViewSourceUrl(url, languages, format_types,
973 unescape_rules, new_parsed, prefix_end, offsets_for_adjustment);
974 }
975
976 // We handle both valid and invalid URLs (this will give us the spec
977 // regardless of validity).
978 const std::string& spec = url.possibly_invalid_spec();
979 const url_parse::Parsed& parsed = url.parsed_for_possibly_invalid_spec();
980 size_t spec_length = spec.length();
981 std::for_each(offsets_for_adjustment->begin(),
982 offsets_for_adjustment->end(),
983 LimitOffset<std::wstring>(spec_length));
984
985 // Copy everything before the username (the scheme and the separators.)
986 // These are ASCII.
987 url_string.insert(url_string.end(), spec.begin(),
988 spec.begin() + parsed.CountCharactersBefore(url_parse::Parsed::USERNAME,
989 true));
990
991 const wchar_t kHTTP[] = L"http://";
992 const char kFTP[] = "ftp.";
993 // URLFixerUpper::FixupURL() treats "ftp.foo.com" as ftp://ftp.foo.com. This
994 // means that if we trim "http://" off a URL whose host starts with "ftp." and
995 // the user inputs this into any field subject to fixup (which is basically
996 // all input fields), the meaning would be changed. (In fact, often the
997 // formatted URL is directly pre-filled into an input field.) For this reason
998 // we avoid stripping "http://" in this case.
999 bool omit_http =
1000 (format_types & kFormatUrlOmitHTTP) && (url_string == kHTTP) &&
1001 (url.host().compare(0, arraysize(kFTP) - 1, kFTP) != 0);
1002
1003 new_parsed->scheme = parsed.scheme;
1004
1005 if ((format_types & kFormatUrlOmitUsernamePassword) != 0) {
1006 // Remove the username and password fields. We don't want to display those
1007 // to the user since they can be used for attacks,
1008 // e.g. "http://google.com:search@evil.ru/"
1009 new_parsed->username.reset();
1010 new_parsed->password.reset();
1011 // Update the offsets based on removed username and/or password.
1012 if (!offsets_for_adjustment->empty() &&
1013 (parsed.username.is_nonempty() || parsed.password.is_nonempty())) {
1014 AdjustOffset::Adjustments adjustments;
1015 if (parsed.username.is_nonempty() && parsed.password.is_nonempty()) {
1016 // The seeming off-by-one and off-by-two in these first two lines are to
1017 // account for the ':' after the username and '@' after the password.
1018 adjustments.push_back(AdjustOffset::Adjustment(
1019 static_cast<size_t>(parsed.username.begin),
1020 static_cast<size_t>(parsed.username.len + parsed.password.len +
1021 2), 0));
1022 } else {
1023 const url_parse::Component* nonempty_component =
1024 parsed.username.is_nonempty() ? &parsed.username : &parsed.password;
1025 // The seeming off-by-one in below is to account for the '@' after the
1026 // username/password.
1027 adjustments.push_back(AdjustOffset::Adjustment(
1028 static_cast<size_t>(nonempty_component->begin),
1029 static_cast<size_t>(nonempty_component->len + 1), 0));
1030 }
1031
1032 // Make offset adjustment.
1033 std::for_each(offsets_for_adjustment->begin(),
1034 offsets_for_adjustment->end(),
1035 AdjustOffset(adjustments));
1036 }
1037 } else {
1038 AppendFormattedComponent(spec, parsed.username, unescape_rules, &url_string,
1039 &new_parsed->username, offsets_for_adjustment);
1040 if (parsed.password.is_valid())
1041 url_string.push_back(':');
1042 AppendFormattedComponent(spec, parsed.password, unescape_rules, &url_string,
1043 &new_parsed->password, offsets_for_adjustment);
1044 if (parsed.username.is_valid() || parsed.password.is_valid())
1045 url_string.push_back('@');
1046 }
1047 if (prefix_end)
1048 *prefix_end = static_cast<size_t>(url_string.length());
1049
1050 AppendFormattedHostWithOffsets(url, languages, &url_string, new_parsed,
1051 offsets_for_adjustment);
1052
1053 // Port.
1054 if (parsed.port.is_nonempty()) {
1055 url_string.push_back(':');
1056 new_parsed->port.begin = url_string.length();
1057 url_string.insert(url_string.end(),
1058 spec.begin() + parsed.port.begin,
1059 spec.begin() + parsed.port.end());
1060 new_parsed->port.len = url_string.length() - new_parsed->port.begin;
1061 } else {
1062 new_parsed->port.reset();
1063 }
1064
1065 // Path and query both get the same general unescape & convert treatment.
1066 if (!(format_types & kFormatUrlOmitTrailingSlashOnBareHostname) ||
1067 !CanStripTrailingSlash(url)) {
1068 AppendFormattedComponent(spec, parsed.path, unescape_rules, &url_string,
1069 &new_parsed->path, offsets_for_adjustment);
1070 }
1071 if (parsed.query.is_valid())
1072 url_string.push_back('?');
1073 AppendFormattedComponent(spec, parsed.query, unescape_rules, &url_string,
1074 &new_parsed->query, offsets_for_adjustment);
1075
1076 // Reference is stored in valid, unescaped UTF-8, so we can just convert.
1077 if (parsed.ref.is_valid()) {
1078 url_string.push_back('#');
1079 size_t ref_begin = url_string.length();
1080 new_parsed->ref.begin = static_cast<int>(ref_begin);
1081
1082 // Compose a list of offsets within the section.
1083 std::vector<size_t> offsets_into_ref =
1084 OffsetsIntoSection(offsets_for_adjustment, ref_begin);
1085
1086 if (parsed.ref.len > 0) {
1087 url_string.append(UTF8ToWideAndAdjustOffsets(spec.substr(parsed.ref.begin,
1088 parsed.ref.len),
1089 &offsets_into_ref));
1090 }
1091 size_t old_ref_len = static_cast<size_t>(parsed.ref.len);
1092 size_t new_ref_len = url_string.length() - new_parsed->ref.begin;
1093 new_parsed->ref.len = static_cast<int>(new_ref_len);
1094
1095 // Apply offset adjustments.
1096 ApplySectionAdjustments(offsets_into_ref, offsets_for_adjustment,
1097 old_ref_len, new_ref_len, ref_begin);
1098 }
1099
1100 // If we need to strip out http do it after the fact. This way we don't need
1101 // to worry about how offset_for_adjustment is interpreted.
1102 const size_t kHTTPSize = arraysize(kHTTP) - 1;
1103 if (omit_http && !url_string.compare(0, kHTTPSize, kHTTP)) {
1104 url_string = url_string.substr(kHTTPSize);
1105 AdjustOffset::Adjustments adjustments;
1106 adjustments.push_back(AdjustOffset::Adjustment(0, kHTTPSize, 0));
1107 std::for_each(offsets_for_adjustment->begin(),
1108 offsets_for_adjustment->end(),
1109 AdjustOffset(adjustments));
1110 if (prefix_end)
1111 *prefix_end -= kHTTPSize;
1112
1113 // Adjust new_parsed.
1114 DCHECK(new_parsed->scheme.is_valid());
1115 int delta = -(new_parsed->scheme.len + 3); // +3 for ://.
1116 new_parsed->scheme.reset();
1117 AdjustComponents(delta, new_parsed);
1118 }
1119
1120 return url_string;
1121 }
1122
1123 } // namespace 928 } // namespace
1124 929
1125 const FormatUrlType kFormatUrlOmitNothing = 0; 930 const FormatUrlType kFormatUrlOmitNothing = 0;
1126 const FormatUrlType kFormatUrlOmitUsernamePassword = 1 << 0; 931 const FormatUrlType kFormatUrlOmitUsernamePassword = 1 << 0;
1127 const FormatUrlType kFormatUrlOmitHTTP = 1 << 1; 932 const FormatUrlType kFormatUrlOmitHTTP = 1 << 1;
1128 const FormatUrlType kFormatUrlOmitTrailingSlashOnBareHostname = 1 << 2; 933 const FormatUrlType kFormatUrlOmitTrailingSlashOnBareHostname = 1 << 2;
1129 const FormatUrlType kFormatUrlOmitAll = kFormatUrlOmitUsernamePassword | 934 const FormatUrlType kFormatUrlOmitAll = kFormatUrlOmitUsernamePassword |
1130 kFormatUrlOmitHTTP | kFormatUrlOmitTrailingSlashOnBareHostname; 935 kFormatUrlOmitHTTP | kFormatUrlOmitTrailingSlashOnBareHostname;
1131 936
1132 // TODO(viettrungluu): We don't want non-POD globals; change this. 937 // TODO(viettrungluu): We don't want non-POD globals; change this.
(...skipping 23 matching lines...) Expand all
1156 FILE_PATH_LITERAL("#"), FILE_PATH_LITERAL("%23")); 961 FILE_PATH_LITERAL("#"), FILE_PATH_LITERAL("%23"));
1157 962
1158 #if defined(OS_POSIX) 963 #if defined(OS_POSIX)
1159 ReplaceSubstringsAfterOffset(&url_string, 0, 964 ReplaceSubstringsAfterOffset(&url_string, 0,
1160 FILE_PATH_LITERAL("\\"), FILE_PATH_LITERAL("%5C")); 965 FILE_PATH_LITERAL("\\"), FILE_PATH_LITERAL("%5C"));
1161 #endif 966 #endif
1162 967
1163 return GURL(url_string); 968 return GURL(url_string);
1164 } 969 }
1165 970
1166 std::wstring GetSpecificHeader(const std::wstring& headers,
1167 const std::wstring& name) {
1168 return GetSpecificHeaderT(headers, name);
1169 }
1170
1171 std::string GetSpecificHeader(const std::string& headers, 971 std::string GetSpecificHeader(const std::string& headers,
1172 const std::string& name) { 972 const std::string& name) {
1173 return GetSpecificHeaderT(headers, name); 973 // We want to grab the Value from the "Key: Value" pairs in the headers,
974 // which should look like this (no leading spaces, \n-separated) (we format
975 // them this way in url_request_inet.cc):
976 // HTTP/1.1 200 OK\n
977 // ETag: "6d0b8-947-24f35ec0"\n
978 // Content-Length: 2375\n
979 // Content-Type: text/html; charset=UTF-8\n
980 // Last-Modified: Sun, 03 Sep 2006 04:34:43 GMT\n
981 if (headers.empty())
982 return std::string();
983
984 std::string match('\n' + name + ':');
985
986 std::string::const_iterator begin =
987 search(headers.begin(), headers.end(), match.begin(), match.end(),
988 base::CaseInsensitiveCompareASCII<char>());
989
990 if (begin == headers.end())
991 return std::string();
992
993 begin += match.length();
994
995 std::string ret;
996 TrimWhitespace(std::string(begin, find(begin, headers.end(), '\n')), TRIM_ALL,
997 &ret);
998 return ret;
1174 } 999 }
1175 1000
1176 bool DecodeCharset(const std::string& input, 1001 bool DecodeCharset(const std::string& input,
1177 std::string* decoded_charset, 1002 std::string* decoded_charset,
1178 std::string* value) { 1003 std::string* value) {
1179 StringTokenizer t(input, "'"); 1004 StringTokenizer t(input, "'");
1180 t.set_options(StringTokenizer::RETURN_DELIMS); 1005 t.set_options(StringTokenizer::RETURN_DELIMS);
1181 std::string temp_charset; 1006 std::string temp_charset;
1182 std::string temp_value; 1007 std::string temp_value;
1183 int numDelimsSeen = 0; 1008 int numDelimsSeen = 0;
(...skipping 54 matching lines...) Expand 10 before | Expand all | Expand 10 after
1238 param_value = GetHeaderParamValue(header, "name", 1063 param_value = GetHeaderParamValue(header, "name",
1239 QuoteRule::REMOVE_OUTER_QUOTES); 1064 QuoteRule::REMOVE_OUTER_QUOTES);
1240 } 1065 }
1241 if (param_value.empty()) 1066 if (param_value.empty())
1242 return std::string(); 1067 return std::string();
1243 if (DecodeParamValue(param_value, referrer_charset, &decoded)) 1068 if (DecodeParamValue(param_value, referrer_charset, &decoded))
1244 return decoded; 1069 return decoded;
1245 return std::string(); 1070 return std::string();
1246 } 1071 }
1247 1072
1248 std::wstring GetHeaderParamValue(const std::wstring& field, 1073 // TODO(mpcomplete): This is a quick and dirty implementation for now. I'm
1249 const std::wstring& param_name, 1074 // sure this doesn't properly handle all (most?) cases.
1250 QuoteRule::Type quote_rule) { 1075 std::string GetHeaderParamValue(const std::string& header,
1251 return GetHeaderParamValueT(field, param_name, quote_rule); 1076 const std::string& param_name,
1077 QuoteRule::Type quote_rule) {
1078 // This assumes args are formatted exactly like "bla; arg1=value; arg2=value".
1079 std::string::const_iterator param_begin =
1080 search(header.begin(), header.end(), param_name.begin(), param_name.end(),
1081 base::CaseInsensitiveCompareASCII<char>());
1082
1083 if (param_begin == header.end())
1084 return std::string();
1085 param_begin += param_name.length();
1086
1087 std::string whitespace(" \t");
1088 size_t equals_offset =
1089 header.find_first_not_of(whitespace, param_begin - header.begin());
1090 if (equals_offset == std::string::npos || header[equals_offset] != '=')
1091 return std::string();
1092
1093 param_begin = header.begin() + equals_offset + 1;
1094 if (param_begin == header.end())
1095 return std::string();
1096
1097 std::string::const_iterator param_end;
1098 if (*param_begin == '"' && quote_rule == QuoteRule::REMOVE_OUTER_QUOTES) {
1099 ++param_begin; // skip past the quote.
1100 param_end = find(param_begin, header.end(), '"');
1101 // If the closing quote is missing, we will treat the rest of the
1102 // string as the parameter. We can't set |param_end| to the
1103 // location of the separator (';'), since the separator is
1104 // technically quoted. See: http://crbug.com/58840
1105 } else {
1106 param_end = find(param_begin + 1, header.end(), ';');
1107 }
1108
1109 return std::string(param_begin, param_end);
1252 } 1110 }
1253 1111
1254 std::string GetHeaderParamValue(const std::string& field, 1112 string16 IDNToUnicode(const std::string& host,
1255 const std::string& param_name, 1113 const std::string& languages) {
1256 QuoteRule::Type quote_rule) {
1257 return GetHeaderParamValueT(field, param_name, quote_rule);
1258 }
1259
1260 // TODO(brettw) bug 734373: check the scripts for each host component and
1261 // don't un-IDN-ize if there is more than one. Alternatively, only IDN for
1262 // scripts that the user has installed. For now, just put the entire
1263 // path through IDN. Maybe this feature can be implemented in ICU itself?
1264 //
1265 // We may want to skip this step in the case of file URLs to allow unicode
1266 // UNC hostnames regardless of encodings.
1267 std::wstring IDNToUnicodeWithOffsets(
1268 const char* host,
1269 size_t host_len,
1270 const std::wstring& languages,
1271 std::vector<size_t>* offsets_for_adjustment) {
1272 // Convert the ASCII input to a wide string for ICU.
1273 string16 input16;
1274 input16.reserve(host_len);
1275 input16.insert(input16.end(), host, host + host_len);
1276
1277 // Do each component of the host separately, since we enforce script matching
1278 // on a per-component basis.
1279 AdjustOffset::Adjustments adjustments;
1280 string16 out16;
1281 for (size_t component_start = 0, component_end;
1282 component_start < input16.length();
1283 component_start = component_end + 1) {
1284 // Find the end of the component.
1285 component_end = input16.find('.', component_start);
1286 if (component_end == string16::npos)
1287 component_end = input16.length(); // For getting the last component.
1288 size_t component_length = component_end - component_start;
1289 size_t new_component_start = out16.length();
1290 bool converted_idn = false;
1291 if (component_end > component_start) {
1292 // Add the substring that we just found.
1293 converted_idn = IDNToUnicodeOneComponent(input16.data() + component_start,
1294 component_length, languages, &out16);
1295 }
1296 size_t new_component_length = out16.length() - new_component_start;
1297
1298 if (converted_idn && offsets_for_adjustment) {
1299 adjustments.push_back(AdjustOffset::Adjustment(
1300 component_start, component_length, new_component_length));
1301 }
1302
1303 // Need to add the dot we just found (if we found one).
1304 if (component_end < input16.length())
1305 out16.push_back('.');
1306 }
1307
1308 // Make offset adjustment.
1309 if (offsets_for_adjustment && !adjustments.empty()) {
1310 std::for_each(offsets_for_adjustment->begin(),
1311 offsets_for_adjustment->end(),
1312 AdjustOffset(adjustments));
1313 }
1314
1315 return UTF16ToWideAndAdjustOffsets(out16, offsets_for_adjustment);
1316 }
1317
1318 std::wstring IDNToUnicode(const char* host,
1319 size_t host_len,
1320 const std::wstring& languages,
1321 size_t* offset_for_adjustment) {
1322 std::vector<size_t> offsets; 1114 std::vector<size_t> offsets;
1323 if (offset_for_adjustment) 1115 return IDNToUnicodeWithOffsets(host, languages, &offsets);
1324 offsets.push_back(*offset_for_adjustment);
1325 std::wstring result =
1326 IDNToUnicodeWithOffsets(host, host_len, languages, &offsets);
1327 if (offset_for_adjustment)
1328 *offset_for_adjustment = offsets[0];
1329 return result;
1330 } 1116 }
1331 1117
1332 std::string CanonicalizeHost(const std::string& host, 1118 std::string CanonicalizeHost(const std::string& host,
1333 url_canon::CanonHostInfo* host_info) { 1119 url_canon::CanonHostInfo* host_info) {
1334 // Try to canonicalize the host. 1120 // Try to canonicalize the host.
1335 const url_parse::Component raw_host_component( 1121 const url_parse::Component raw_host_component(
1336 0, static_cast<int>(host.length())); 1122 0, static_cast<int>(host.length()));
1337 std::string canon_host; 1123 std::string canon_host;
1338 url_canon::StdStringCanonOutput canon_host_output(&canon_host); 1124 url_canon::StdStringCanonOutput canon_host_output(&canon_host);
1339 url_canon::CanonicalizeHostVerbose(host.c_str(), raw_host_component, 1125 url_canon::CanonicalizeHostVerbose(host.c_str(), raw_host_component,
1340 &canon_host_output, host_info); 1126 &canon_host_output, host_info);
1341 1127
1342 if (host_info->out_host.is_nonempty() && 1128 if (host_info->out_host.is_nonempty() &&
1343 host_info->family != url_canon::CanonHostInfo::BROKEN) { 1129 host_info->family != url_canon::CanonHostInfo::BROKEN) {
1344 // Success! Assert that there's no extra garbage. 1130 // Success! Assert that there's no extra garbage.
1345 canon_host_output.Complete(); 1131 canon_host_output.Complete();
1346 DCHECK_EQ(host_info->out_host.len, static_cast<int>(canon_host.length())); 1132 DCHECK_EQ(host_info->out_host.len, static_cast<int>(canon_host.length()));
1347 } else { 1133 } else {
1348 // Empty host, or canonicalization failed. We'll return empty. 1134 // Empty host, or canonicalization failed. We'll return empty.
1349 canon_host.clear(); 1135 canon_host.clear();
1350 } 1136 }
1351 1137
1352 return canon_host; 1138 return canon_host;
1353 } 1139 }
1354 1140
1355 std::string CanonicalizeHost(const std::wstring& host,
1356 url_canon::CanonHostInfo* host_info) {
1357 std::string converted_host;
1358 WideToUTF8(host.c_str(), host.length(), &converted_host);
1359 return CanonicalizeHost(converted_host, host_info);
1360 }
1361
1362 std::string GetDirectoryListingHeader(const string16& title) { 1141 std::string GetDirectoryListingHeader(const string16& title) {
1363 static const base::StringPiece header( 1142 static const base::StringPiece header(
1364 NetModule::GetResource(IDR_DIR_HEADER_HTML)); 1143 NetModule::GetResource(IDR_DIR_HEADER_HTML));
1365 // This can be null in unit tests. 1144 // This can be null in unit tests.
1366 DLOG_IF(WARNING, header.empty()) << 1145 DLOG_IF(WARNING, header.empty()) <<
1367 "Missing resource: directory listing header"; 1146 "Missing resource: directory listing header";
1368 1147
1369 std::string result; 1148 std::string result;
1370 if (!header.empty()) 1149 if (!header.empty())
1371 result.assign(header.data(), header.size()); 1150 result.assign(header.data(), header.size());
(...skipping 360 matching lines...) Expand 10 before | Expand all | Expand 10 after
1732 UnescapeRule::Type flags = 1511 UnescapeRule::Type flags =
1733 UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS; 1512 UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS;
1734 *username = UnescapeAndDecodeUTF8URLComponent(url.username(), flags, NULL); 1513 *username = UnescapeAndDecodeUTF8URLComponent(url.username(), flags, NULL);
1735 *password = UnescapeAndDecodeUTF8URLComponent(url.password(), flags, NULL); 1514 *password = UnescapeAndDecodeUTF8URLComponent(url.password(), flags, NULL);
1736 } 1515 }
1737 1516
1738 std::string GetHostOrSpecFromURL(const GURL& url) { 1517 std::string GetHostOrSpecFromURL(const GURL& url) {
1739 return url.has_host() ? TrimEndingDot(url.host()) : url.spec(); 1518 return url.has_host() ? TrimEndingDot(url.host()) : url.spec();
1740 } 1519 }
1741 1520
1742 void AppendFormattedHostWithOffsets( 1521 void AppendFormattedHost(const GURL& url,
1743 const GURL& url, 1522 const std::string& languages,
1744 const std::wstring& languages, 1523 string16* output) {
1745 std::wstring* output, 1524 std::vector<size_t> offsets;
1746 url_parse::Parsed* new_parsed, 1525 AppendFormattedComponent(url.possibly_invalid_spec(),
1747 std::vector<size_t>* offsets_for_adjustment) { 1526 url.parsed_for_possibly_invalid_spec().host, offsets,
1748 DCHECK(output); 1527 HostComponentTransform(languages), output, NULL, NULL);
1749 const url_parse::Component& host =
1750 url.parsed_for_possibly_invalid_spec().host;
1751
1752 if (host.is_nonempty()) {
1753 // Handle possible IDN in the host name.
1754 size_t host_begin = output->length();
1755 if (new_parsed)
1756 new_parsed->host.begin = static_cast<int>(host_begin);
1757 size_t old_host_len = static_cast<size_t>(host.len);
1758
1759 // Compose a list of offsets within the host area.
1760 std::vector<size_t> offsets_into_host =
1761 OffsetsIntoSection(offsets_for_adjustment, host_begin);
1762
1763 const std::string& spec = url.possibly_invalid_spec();
1764 DCHECK(host.begin >= 0 &&
1765 ((spec.length() == 0 && host.begin == 0) ||
1766 host.begin < static_cast<int>(spec.length())));
1767 output->append(IDNToUnicodeWithOffsets(&spec[host.begin], old_host_len,
1768 languages, &offsets_into_host));
1769
1770 size_t new_host_len = output->length() - host_begin;
1771 if (new_parsed)
1772 new_parsed->host.len = static_cast<int>(new_host_len);
1773
1774 // Apply offset adjustments.
1775 ApplySectionAdjustments(offsets_into_host, offsets_for_adjustment,
1776 old_host_len, new_host_len, host_begin);
1777 } else if (new_parsed) {
1778 new_parsed->host.reset();
1779 }
1780 } 1528 }
1781 1529
1782 void AppendFormattedHost(const GURL& url,
1783 const std::wstring& languages,
1784 std::wstring* output,
1785 url_parse::Parsed* new_parsed,
1786 size_t* offset_for_adjustment) {
1787 std::vector<size_t> offsets;
1788 if (offset_for_adjustment)
1789 offsets.push_back(*offset_for_adjustment);
1790 AppendFormattedHostWithOffsets(url, languages, output, new_parsed, &offsets);
1791 if (offset_for_adjustment)
1792 *offset_for_adjustment = offsets[0];
1793 }
1794
1795 // TODO(viettrungluu): convert the wstring |FormatUrlInternal()|.
1796 string16 FormatUrlWithOffsets(const GURL& url, 1530 string16 FormatUrlWithOffsets(const GURL& url,
1797 const std::string& languages, 1531 const std::string& languages,
1798 FormatUrlTypes format_types, 1532 FormatUrlTypes format_types,
1799 UnescapeRule::Type unescape_rules, 1533 UnescapeRule::Type unescape_rules,
1800 url_parse::Parsed* new_parsed, 1534 url_parse::Parsed* new_parsed,
1801 size_t* prefix_end, 1535 size_t* prefix_end,
1802 std::vector<size_t>* offsets_for_adjustment) { 1536 std::vector<size_t>* offsets_for_adjustment) {
1803 return WideToUTF16Hack( 1537 url_parse::Parsed parsed_temp;
1804 FormatUrlInternal(url, ASCIIToWide(languages), format_types, 1538 if (!new_parsed)
1805 unescape_rules, new_parsed, prefix_end, 1539 new_parsed = &parsed_temp;
1806 offsets_for_adjustment)); 1540 else
1541 *new_parsed = url_parse::Parsed();
1542 std::vector<size_t> original_offsets;
1543 if (offsets_for_adjustment)
1544 original_offsets = *offsets_for_adjustment;
1545
1546 // Special handling for view-source:. Don't use chrome::kViewSourceScheme
1547 // because this library shouldn't depend on chrome.
1548 const char* const kViewSource = "view-source";
1549 // Reject "view-source:view-source:..." to avoid deep recursion.
1550 const char* const kViewSourceTwice = "view-source:view-source:";
1551 if (url.SchemeIs(kViewSource) &&
1552 !StartsWithASCII(url.possibly_invalid_spec(), kViewSourceTwice, false)) {
1553 return FormatViewSourceUrl(url, original_offsets, languages, format_types,
1554 unescape_rules, new_parsed, prefix_end, offsets_for_adjustment);
1555 }
1556
1557 // We handle both valid and invalid URLs (this will give us the spec
1558 // regardless of validity).
1559 const std::string& spec = url.possibly_invalid_spec();
1560 const url_parse::Parsed& parsed = url.parsed_for_possibly_invalid_spec();
1561 size_t spec_length = spec.length();
1562
1563 // Scheme & separators. These are ASCII.
1564 string16 url_string;
1565 OffsetLimiter offset_limiter(offsets_for_adjustment, &url_string);
1566 url_string.insert(url_string.end(), spec.begin(),
1567 spec.begin() + parsed.CountCharactersBefore(url_parse::Parsed::USERNAME,
1568 true));
1569 const char kHTTP[] = "http://";
1570 const char kFTP[] = "ftp.";
1571 // URLFixerUpper::FixupURL() treats "ftp.foo.com" as ftp://ftp.foo.com. This
1572 // means that if we trim "http://" off a URL whose host starts with "ftp." and
1573 // the user inputs this into any field subject to fixup (which is basically
1574 // all input fields), the meaning would be changed. (In fact, often the
1575 // formatted URL is directly pre-filled into an input field.) For this reason
1576 // we avoid stripping "http://" in this case.
1577 bool omit_http = (format_types & kFormatUrlOmitHTTP) &&
1578 EqualsASCII(url_string, kHTTP) &&
1579 !StartsWithASCII(url.host(), kFTP, true);
1580 new_parsed->scheme = parsed.scheme;
1581
1582 // Username & password.
1583 if ((format_types & kFormatUrlOmitUsernamePassword) != 0) {
1584 // Remove the username and password fields. We don't want to display those
1585 // to the user since they can be used for attacks,
1586 // e.g. "http://google.com:search@evil.ru/"
1587 new_parsed->username.reset();
1588 new_parsed->password.reset();
1589 // Update the offsets based on removed username and/or password.
1590 if (offsets_for_adjustment && !offsets_for_adjustment->empty() &&
1591 (parsed.username.is_nonempty() || parsed.password.is_nonempty())) {
1592 AdjustOffset::Adjustments adjustments;
mrossetti 2011/04/26 22:13:34 The AdjustOffset::Adjustments concept would be a g
Peter Kasting 2011/04/27 02:07:19 Good idea. Rewrote this object.
1593 if (parsed.username.is_nonempty() && parsed.password.is_nonempty()) {
1594 // The seeming off-by-one and off-by-two in these first two lines are to
1595 // account for the ':' after the username and '@' after the password.
1596 adjustments.push_back(AdjustOffset::Adjustment(
1597 static_cast<size_t>(parsed.username.begin),
1598 static_cast<size_t>(parsed.username.len + parsed.password.len +
1599 2), 0));
1600 } else {
1601 const url_parse::Component* nonempty_component =
1602 parsed.username.is_nonempty() ? &parsed.username : &parsed.password;
1603 // The seeming off-by-one in below is to account for the '@' after the
1604 // username/password.
1605 adjustments.push_back(AdjustOffset::Adjustment(
1606 static_cast<size_t>(nonempty_component->begin),
1607 static_cast<size_t>(nonempty_component->len + 1), 0));
1608 }
1609 std::for_each(offsets_for_adjustment->begin(),
1610 offsets_for_adjustment->end(), AdjustOffset(adjustments));
1611 }
1612 } else {
1613 AppendFormattedComponent(spec, parsed.username, original_offsets,
1614 NonHostComponentTransform(unescape_rules), &url_string,
1615 &new_parsed->username, offsets_for_adjustment);
1616 if (parsed.password.is_valid()) {
1617 size_t colon = parsed.username.end();
1618 DCHECK_EQ(static_cast<size_t>(parsed.password.begin - 1), colon);
1619 std::vector<size_t>::const_iterator colon_iter =
1620 std::find(original_offsets.begin(), original_offsets.end(), colon);
1621 if (colon_iter != original_offsets.end()) {
1622 (*offsets_for_adjustment)[colon_iter - original_offsets.begin()] =
1623 url_string.length();
1624 }
1625 url_string.push_back(':');
1626 }
1627 AppendFormattedComponent(spec, parsed.password, original_offsets,
1628 NonHostComponentTransform(unescape_rules), &url_string,
1629 &new_parsed->password, offsets_for_adjustment);
1630 if (parsed.username.is_valid() || parsed.password.is_valid()) {
1631 size_t at_sign = (parsed.password.is_valid() ?
1632 parsed.password : parsed.username).end();
1633 DCHECK_EQ(static_cast<size_t>(parsed.host.begin - 1), at_sign);
1634 std::vector<size_t>::const_iterator at_sign_iter =
1635 std::find(original_offsets.begin(), original_offsets.end(), at_sign);
1636 if (at_sign_iter != original_offsets.end()) {
1637 (*offsets_for_adjustment)[at_sign_iter - original_offsets.begin()] =
1638 url_string.length();
1639 }
1640 url_string.push_back('@');
1641 }
1642 }
1643 if (prefix_end)
1644 *prefix_end = static_cast<size_t>(url_string.length());
1645
1646 // Host.
1647 AppendFormattedComponent(spec, parsed.host, original_offsets,
1648 HostComponentTransform(languages), &url_string, &new_parsed->host,
1649 offsets_for_adjustment);
1650
1651 // Port.
1652 if (parsed.port.is_nonempty()) {
1653 url_string.push_back(':');
1654 new_parsed->port.begin = url_string.length();
1655 url_string.insert(url_string.end(),
1656 spec.begin() + parsed.port.begin,
1657 spec.begin() + parsed.port.end());
1658 new_parsed->port.len = url_string.length() - new_parsed->port.begin;
1659 } else {
1660 new_parsed->port.reset();
1661 }
1662
1663 // Path & query. Both get the same general unescape & convert treatment.
1664 if (!(format_types & kFormatUrlOmitTrailingSlashOnBareHostname) ||
1665 !CanStripTrailingSlash(url)) {
1666 AppendFormattedComponent(spec, parsed.path, original_offsets,
1667 NonHostComponentTransform(unescape_rules), &url_string,
1668 &new_parsed->path, offsets_for_adjustment);
1669 }
1670 if (parsed.query.is_valid())
1671 url_string.push_back('?');
1672 AppendFormattedComponent(spec, parsed.query, original_offsets,
1673 NonHostComponentTransform(unescape_rules), &url_string,
1674 &new_parsed->query, offsets_for_adjustment);
1675
1676 // Ref. This is valid, unescaped UTF-8, so we can just convert.
1677 if (parsed.ref.is_valid()) {
1678 url_string.push_back('#');
1679 size_t original_ref_begin = static_cast<size_t>(parsed.ref.begin);
1680 size_t original_ref_len = static_cast<size_t>(parsed.ref.len);
1681 size_t output_ref_begin = url_string.length();
1682 new_parsed->ref.begin = static_cast<int>(output_ref_begin);
1683
1684 std::vector<size_t> offsets_into_ref(
1685 OffsetsIntoComponent(original_offsets, original_ref_begin));
1686 if (parsed.ref.len > 0) {
1687 url_string.append(UTF8ToUTF16AndAdjustOffsets(
1688 spec.substr(original_ref_begin, static_cast<size_t>(parsed.ref.len)),
1689 &offsets_into_ref));
1690 }
1691
1692 new_parsed->ref.len =
1693 static_cast<int>(url_string.length() - new_parsed->ref.begin);
1694 AdjustForComponentTransform(original_offsets, original_ref_begin,
1695 static_cast<size_t>(parsed.ref.end()), offsets_into_ref,
1696 output_ref_begin, offsets_for_adjustment);
1697 }
1698
1699 // If we need to strip out http, do it after the fact. This way we don't need
1700 // to worry about how offsets_for_adjustment is interpreted.
1701 if (omit_http && StartsWith(url_string, ASCIIToUTF16(kHTTP), true)) {
1702 const size_t kHTTPSize = arraysize(kHTTP) - 1;
1703 url_string = url_string.substr(kHTTPSize);
1704 if (offsets_for_adjustment && !offsets_for_adjustment->empty()) {
1705 AdjustOffset::Adjustments adjustments;
1706 adjustments.push_back(AdjustOffset::Adjustment(0, kHTTPSize, 0));
1707 std::for_each(offsets_for_adjustment->begin(),
1708 offsets_for_adjustment->end(), AdjustOffset(adjustments));
1709 }
1710 if (prefix_end)
1711 *prefix_end -= kHTTPSize;
1712
1713 // Adjust new_parsed.
1714 DCHECK(new_parsed->scheme.is_valid());
1715 int delta = -(new_parsed->scheme.len + 3); // +3 for ://.
1716 new_parsed->scheme.reset();
1717 AdjustComponents(delta, new_parsed);
1718 }
1719
1720 return url_string;
1807 } 1721 }
1808 1722
1809 string16 FormatUrl(const GURL& url, 1723 string16 FormatUrl(const GURL& url,
1810 const std::string& languages, 1724 const std::string& languages,
1811 FormatUrlTypes format_types, 1725 FormatUrlTypes format_types,
1812 UnescapeRule::Type unescape_rules, 1726 UnescapeRule::Type unescape_rules,
1813 url_parse::Parsed* new_parsed, 1727 url_parse::Parsed* new_parsed,
1814 size_t* prefix_end, 1728 size_t* prefix_end,
1815 size_t* offset_for_adjustment) { 1729 size_t* offset_for_adjustment) {
1816 std::vector<size_t> offsets; 1730 std::vector<size_t> offsets;
1817 if (offset_for_adjustment) 1731 if (offset_for_adjustment)
1818 offsets.push_back(*offset_for_adjustment); 1732 offsets.push_back(*offset_for_adjustment);
1819 string16 result = WideToUTF16Hack( 1733 string16 result = FormatUrlWithOffsets(url, languages, format_types,
1820 FormatUrlInternal(url, ASCIIToWide(languages), format_types, 1734 unescape_rules, new_parsed, prefix_end, &offsets);
1821 unescape_rules, new_parsed, prefix_end, &offsets));
1822 if (offset_for_adjustment) 1735 if (offset_for_adjustment)
1823 *offset_for_adjustment = offsets[0]; 1736 *offset_for_adjustment = offsets[0];
1824 return result; 1737 return result;
1825 } 1738 }
1826 1739
1827 bool CanStripTrailingSlash(const GURL& url) { 1740 bool CanStripTrailingSlash(const GURL& url) {
1828 // Omit the path only for standard, non-file URLs with nothing but "/" after 1741 // Omit the path only for standard, non-file URLs with nothing but "/" after
1829 // the hostname. 1742 // the hostname.
1830 return url.IsStandard() && !url.SchemeIsFile() && !url.has_query() && 1743 return url.IsStandard() && !url.SchemeIsFile() && !url.has_query() &&
1831 !url.has_ref() && url.path() == "/"; 1744 !url.has_ref() && url.path() == "/";
(...skipping 435 matching lines...) Expand 10 before | Expand all | Expand 10 after
2267 } 2180 }
2268 2181
2269 NetworkInterface::NetworkInterface(const std::string& name, 2182 NetworkInterface::NetworkInterface(const std::string& name,
2270 const IPAddressNumber& address) 2183 const IPAddressNumber& address)
2271 : name(name), address(address) { 2184 : name(name), address(address) {
2272 } 2185 }
2273 2186
2274 NetworkInterface::~NetworkInterface() { 2187 NetworkInterface::~NetworkInterface() {
2275 } 2188 }
2276 2189
2277 ClampComponentOffset::ClampComponentOffset(size_t component_start)
2278 : component_start(component_start) {}
2279
2280 size_t ClampComponentOffset::operator()(size_t offset) {
2281 return (offset >= component_start) ?
2282 offset : std::wstring::npos;
2283 }
2284
2285 } // namespace net 2190 } // namespace net
OLDNEW
« no previous file with comments | « net/base/net_util.h ('k') | net/base/net_util_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698