Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(173)

Side by Side Diff: net/base/net_util.cc

Issue 6898026: Eliminate wstring from base/utf_offset_string_conversions.h, net/base/escape.h, and net/base/net_... (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/
Patch Set: '' Created 9 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "net/base/net_util.h" 5 #include "net/base/net_util.h"
6 6
7 #include <unicode/regex.h> 7 #include <unicode/regex.h>
8 #include <unicode/ucnv.h> 8 #include <unicode/ucnv.h>
9 #include <unicode/uidna.h> 9 #include <unicode/uidna.h>
10 #include <unicode/ulocdata.h> 10 #include <unicode/ulocdata.h>
(...skipping 137 matching lines...) Expand 10 before | Expand all | Expand 10 after
148 0xFFFF, // Used to block all invalid port numbers (see 148 0xFFFF, // Used to block all invalid port numbers (see
149 // third_party/WebKit/Source/WebCore/platform/KURLGoogle.cpp, port()) 149 // third_party/WebKit/Source/WebCore/platform/KURLGoogle.cpp, port())
150 }; 150 };
151 151
152 // FTP overrides the following restricted ports. 152 // FTP overrides the following restricted ports.
153 static const int kAllowedFtpPorts[] = { 153 static const int kAllowedFtpPorts[] = {
154 21, // ftp data 154 21, // ftp data
155 22, // ssh 155 22, // ssh
156 }; 156 };
157 157
158 template<typename STR>
159 STR GetSpecificHeaderT(const STR& headers, const STR& name) {
160 // We want to grab the Value from the "Key: Value" pairs in the headers,
161 // which should look like this (no leading spaces, \n-separated) (we format
162 // them this way in url_request_inet.cc):
163 // HTTP/1.1 200 OK\n
164 // ETag: "6d0b8-947-24f35ec0"\n
165 // Content-Length: 2375\n
166 // Content-Type: text/html; charset=UTF-8\n
167 // Last-Modified: Sun, 03 Sep 2006 04:34:43 GMT\n
168 if (headers.empty())
169 return STR();
170
171 STR match;
172 match.push_back('\n');
173 match.append(name);
174 match.push_back(':');
175
176 typename STR::const_iterator begin =
177 search(headers.begin(), headers.end(), match.begin(), match.end(),
178 base::CaseInsensitiveCompareASCII<typename STR::value_type>());
179
180 if (begin == headers.end())
181 return STR();
182
183 begin += match.length();
184
185 typename STR::const_iterator end = find(begin, headers.end(), '\n');
186
187 STR ret;
188 TrimWhitespace(STR(begin, end), TRIM_ALL, &ret);
189 return ret;
190 }
191
192 // Similar to Base64Decode. Decodes a Q-encoded string to a sequence 158 // Similar to Base64Decode. Decodes a Q-encoded string to a sequence
193 // of bytes. If input is invalid, return false. 159 // of bytes. If input is invalid, return false.
194 bool QPDecode(const std::string& input, std::string* output) { 160 bool QPDecode(const std::string& input, std::string* output) {
195 std::string temp; 161 std::string temp;
196 temp.reserve(input.size()); 162 temp.reserve(input.size());
197 std::string::const_iterator it = input.begin(); 163 std::string::const_iterator it = input.begin();
198 while (it != input.end()) { 164 while (it != input.end()) {
199 if (*it == '_') { 165 if (*it == '_') {
200 temp.push_back(' '); 166 temp.push_back(' ');
201 } else if (*it == '=') { 167 } else if (*it == '=') {
(...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after
269 *is_rfc2047 = false; 235 *is_rfc2047 = false;
270 output->clear(); 236 output->clear();
271 if (encoded_word.empty()) 237 if (encoded_word.empty())
272 return true; 238 return true;
273 239
274 if (!IsStringASCII(encoded_word)) { 240 if (!IsStringASCII(encoded_word)) {
275 // Try UTF-8, referrer_charset and the native OS default charset in turn. 241 // Try UTF-8, referrer_charset and the native OS default charset in turn.
276 if (IsStringUTF8(encoded_word)) { 242 if (IsStringUTF8(encoded_word)) {
277 *output = encoded_word; 243 *output = encoded_word;
278 } else { 244 } else {
279 std::wstring wide_output; 245 string16 utf16_output;
280 if (!referrer_charset.empty() && 246 if (!referrer_charset.empty() &&
281 base::CodepageToWide(encoded_word, referrer_charset.c_str(), 247 base::CodepageToUTF16(encoded_word, referrer_charset.c_str(),
282 base::OnStringConversionError::FAIL, 248 base::OnStringConversionError::FAIL,
283 &wide_output)) { 249 &utf16_output)) {
284 *output = WideToUTF8(wide_output); 250 *output = UTF16ToUTF8(utf16_output);
285 } else { 251 } else {
286 *output = WideToUTF8(base::SysNativeMBToWide(encoded_word)); 252 *output = WideToUTF8(base::SysNativeMBToWide(encoded_word));
287 } 253 }
288 } 254 }
289 255
290 return true; 256 return true;
291 } 257 }
292 258
293 // RFC 2047 : one of encoding methods supported by Firefox and relatively 259 // RFC 2047 : one of encoding methods supported by Firefox and relatively
294 // widely used by web servers. 260 // widely used by web servers.
(...skipping 112 matching lines...) Expand 10 before | Expand all | Expand 10 after
407 std::string decoded; 373 std::string decoded;
408 if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047, 374 if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047,
409 &decoded)) 375 &decoded))
410 return false; 376 return false;
411 tmp.append(decoded); 377 tmp.append(decoded);
412 } 378 }
413 output->swap(tmp); 379 output->swap(tmp);
414 return true; 380 return true;
415 } 381 }
416 382
417 // TODO(mpcomplete): This is a quick and dirty implementation for now. I'm
418 // sure this doesn't properly handle all (most?) cases.
419 template<typename STR>
420 STR GetHeaderParamValueT(const STR& header, const STR& param_name,
421 QuoteRule::Type quote_rule) {
422 // This assumes args are formatted exactly like "bla; arg1=value; arg2=value".
423 typename STR::const_iterator param_begin =
424 search(header.begin(), header.end(), param_name.begin(), param_name.end(),
425 base::CaseInsensitiveCompareASCII<typename STR::value_type>());
426
427 if (param_begin == header.end())
428 return STR();
429 param_begin += param_name.length();
430
431 STR whitespace;
432 whitespace.push_back(' ');
433 whitespace.push_back('\t');
434 const typename STR::size_type equals_offset =
435 header.find_first_not_of(whitespace, param_begin - header.begin());
436 if (equals_offset == STR::npos || header.at(equals_offset) != '=')
437 return STR();
438
439 param_begin = header.begin() + equals_offset + 1;
440 if (param_begin == header.end())
441 return STR();
442
443 typename STR::const_iterator param_end;
444 if (*param_begin == '"' && quote_rule == QuoteRule::REMOVE_OUTER_QUOTES) {
445 ++param_begin; // skip past the quote.
446 param_end = find(param_begin, header.end(), '"');
447 // If the closing quote is missing, we will treat the rest of the
448 // string as the parameter. We can't set |param_end| to the
449 // location of the separator (';'), since the separator is
450 // technically quoted. See: http://crbug.com/58840
451 } else {
452 param_end = find(param_begin+1, header.end(), ';');
453 }
454
455 return STR(param_begin, param_end);
456 }
457
458 // Does some simple normalization of scripts so we can allow certain scripts 383 // Does some simple normalization of scripts so we can allow certain scripts
459 // to exist together. 384 // to exist together.
460 // TODO(brettw) bug 880223: we should allow some other languages to be 385 // TODO(brettw) bug 880223: we should allow some other languages to be
461 // combined such as Chinese and Latin. We will probably need a more 386 // combined such as Chinese and Latin. We will probably need a more
462 // complicated system of language pairs to have more fine-grained control. 387 // complicated system of language pairs to have more fine-grained control.
463 UScriptCode NormalizeScript(UScriptCode code) { 388 UScriptCode NormalizeScript(UScriptCode code) {
464 switch (code) { 389 switch (code) {
465 case USCRIPT_KATAKANA: 390 case USCRIPT_KATAKANA:
466 case USCRIPT_HIRAGANA: 391 case USCRIPT_HIRAGANA:
467 case USCRIPT_KATAKANA_OR_HIRAGANA: 392 case USCRIPT_KATAKANA_OR_HIRAGANA:
(...skipping 118 matching lines...) Expand 10 before | Expand all | Expand 10 after
586 ulocdata_close(uld); 511 ulocdata_close(uld);
587 } 512 }
588 } 513 }
589 return !lang_set->isEmpty() && lang_set->containsAll(component_characters); 514 return !lang_set->isEmpty() && lang_set->containsAll(component_characters);
590 } 515 }
591 516
592 // Returns true if the given Unicode host component is safe to display to the 517 // Returns true if the given Unicode host component is safe to display to the
593 // user. 518 // user.
594 bool IsIDNComponentSafe(const char16* str, 519 bool IsIDNComponentSafe(const char16* str,
595 int str_len, 520 int str_len,
596 const std::wstring& languages) { 521 const std::string& languages) {
597 // Most common cases (non-IDN) do not reach here so that we don't 522 // Most common cases (non-IDN) do not reach here so that we don't
598 // need a fast return path. 523 // need a fast return path.
599 // TODO(jungshik) : Check if there's any character inappropriate 524 // TODO(jungshik) : Check if there's any character inappropriate
600 // (although allowed) for domain names. 525 // (although allowed) for domain names.
601 // See http://www.unicode.org/reports/tr39/#IDN_Security_Profiles and 526 // See http://www.unicode.org/reports/tr39/#IDN_Security_Profiles and
602 // http://www.unicode.org/reports/tr39/data/xidmodifications.txt 527 // http://www.unicode.org/reports/tr39/data/xidmodifications.txt
603 // For now, we borrowed the list from Mozilla and tweaked it slightly. 528 // For now, we borrowed the list from Mozilla and tweaked it slightly.
604 // (e.g. Characters like U+00A0, U+3000, U+3002 are omitted because 529 // (e.g. Characters like U+00A0, U+3000, U+3002 are omitted because
605 // they're gonna be canonicalized to U+0020 and full stop before 530 // they're gonna be canonicalized to U+0020 and full stop before
606 // reaching here.) 531 // reaching here.)
(...skipping 63 matching lines...) Expand 10 before | Expand all | Expand 10 after
670 // (sync'd with characters allowed in url_canon_host with square 595 // (sync'd with characters allowed in url_canon_host with square
671 // brackets excluded.) See kHostCharLookup[] array in url_canon_host.cc. 596 // brackets excluded.) See kHostCharLookup[] array in url_canon_host.cc.
672 icu::UnicodeSet common_characters(UNICODE_STRING_SIMPLE("[[0-9]\\-_+\\ ]"), 597 icu::UnicodeSet common_characters(UNICODE_STRING_SIMPLE("[[0-9]\\-_+\\ ]"),
673 status); 598 status);
674 DCHECK(U_SUCCESS(status)); 599 DCHECK(U_SUCCESS(status));
675 // Subtract common characters because they're always allowed so that 600 // Subtract common characters because they're always allowed so that
676 // we just have to check if a language-specific set contains 601 // we just have to check if a language-specific set contains
677 // the remainder. 602 // the remainder.
678 component_characters.removeAll(common_characters); 603 component_characters.removeAll(common_characters);
679 604
680 std::string languages_list(WideToASCII(languages)); 605 StringTokenizer t(languages, ",");
681 StringTokenizer t(languages_list, ",");
682 while (t.GetNext()) { 606 while (t.GetNext()) {
683 if (IsComponentCoveredByLang(component_characters, t.token())) 607 if (IsComponentCoveredByLang(component_characters, t.token()))
684 return true; 608 return true;
685 } 609 }
686 return false; 610 return false;
687 } 611 }
688 612
689 // Converts one component of a host (between dots) to IDN if safe. The result 613 // Converts one component of a host (between dots) to IDN if safe. The result
690 // will be APPENDED to the given output string and will be the same as the input 614 // will be APPENDED to the given output string and will be the same as the input
691 // if it is not IDN or the IDN is unsafe to display. Returns whether any 615 // if it is not IDN or the IDN is unsafe to display. Returns whether any
692 // conversion was performed. 616 // conversion was performed.
693 bool IDNToUnicodeOneComponent(const char16* comp, 617 bool IDNToUnicodeOneComponent(const char16* comp,
694 size_t comp_len, 618 size_t comp_len,
695 const std::wstring& languages, 619 const std::string& languages,
696 string16* out) { 620 string16* out) {
697 DCHECK(out); 621 DCHECK(out);
698 if (comp_len == 0) 622 if (comp_len == 0)
699 return false; 623 return false;
700 624
701 // Only transform if the input can be an IDN component. 625 // Only transform if the input can be an IDN component.
702 static const char16 kIdnPrefix[] = {'x', 'n', '-', '-'}; 626 static const char16 kIdnPrefix[] = {'x', 'n', '-', '-'};
703 if ((comp_len > arraysize(kIdnPrefix)) && 627 if ((comp_len > arraysize(kIdnPrefix)) &&
704 !memcmp(comp, kIdnPrefix, arraysize(kIdnPrefix) * sizeof(char16))) { 628 !memcmp(comp, kIdnPrefix, arraysize(kIdnPrefix) * sizeof(char16))) {
705 // Repeatedly expand the output string until it's big enough. It looks like 629 // Repeatedly expand the output string until it's big enough. It looks like
(...skipping 21 matching lines...) Expand all
727 // Failed, revert back to original string. 651 // Failed, revert back to original string.
728 out->resize(original_length); 652 out->resize(original_length);
729 } 653 }
730 654
731 // We get here with no IDN or on error, in which case we just append the 655 // We get here with no IDN or on error, in which case we just append the
732 // literal input. 656 // literal input.
733 out->append(comp, comp_len); 657 out->append(comp, comp_len);
734 return false; 658 return false;
735 } 659 }
736 660
737 struct SubtractFromOffset { 661 // Clamps the offsets in |offsets_for_adjustment| to the length of |str|.
738 explicit SubtractFromOffset(size_t amount) 662 void LimitOffsets(const string16& str,
739 : amount(amount) {} 663 std::vector<size_t>* offsets_for_adjustment) {
740 void operator()(size_t& offset) { 664 if (offsets_for_adjustment) {
741 if (offset != std::wstring::npos) { 665 std::for_each(offsets_for_adjustment->begin(),
742 if (offset >= amount) 666 offsets_for_adjustment->end(),
743 offset -= amount; 667 LimitOffset<string16>(str.length()));
744 else 668 }
745 offset = std::wstring::npos; 669 }
670
671 // TODO(brettw) bug 734373: check the scripts for each host component and
672 // don't un-IDN-ize if there is more than one. Alternatively, only IDN for
673 // scripts that the user has installed. For now, just put the entire
674 // path through IDN. Maybe this feature can be implemented in ICU itself?
675 //
676 // We may want to skip this step in the case of file URLs to allow unicode
677 // UNC hostnames regardless of encodings.
678 string16 IDNToUnicodeWithOffsets(const std::string& host,
679 const std::string& languages,
680 std::vector<size_t>* offsets_for_adjustment) {
681 // Convert the ASCII input to a string16 for ICU.
682 string16 input16;
683 input16.reserve(host.length());
684 input16.insert(input16.end(), host.begin(), host.end());
685
686 // Do each component of the host separately, since we enforce script matching
687 // on a per-component basis.
688 string16 out16;
689 {
690 OffsetAdjuster offset_adjuster(offsets_for_adjustment);
691 for (size_t component_start = 0, component_end;
692 component_start < input16.length();
693 component_start = component_end + 1) {
694 // Find the end of the component.
695 component_end = input16.find('.', component_start);
696 if (component_end == string16::npos)
697 component_end = input16.length(); // For getting the last component.
698 size_t component_length = component_end - component_start;
699 size_t new_component_start = out16.length();
700 bool converted_idn = false;
701 if (component_end > component_start) {
702 // Add the substring that we just found.
703 converted_idn = IDNToUnicodeOneComponent(
704 input16.data() + component_start, component_length, languages,
705 &out16);
706 }
707 size_t new_component_length = out16.length() - new_component_start;
708
709 if (converted_idn && offsets_for_adjustment) {
710 offset_adjuster.Add(OffsetAdjuster::Adjustment(component_start,
711 component_length, new_component_length));
712 }
713
714 // Need to add the dot we just found (if we found one).
715 if (component_end < input16.length())
716 out16.push_back('.');
746 } 717 }
747 } 718 }
748 719
749 size_t amount; 720 LimitOffsets(out16, offsets_for_adjustment);
750 }; 721 return out16;
751
752 struct AddToOffset {
753 explicit AddToOffset(size_t amount)
754 : amount(amount) {}
755 void operator()(size_t& offset) {
756 if (offset != std::wstring::npos)
757 offset += amount;
758 }
759
760 size_t amount;
761 };
762
763 std::vector<size_t> OffsetsIntoSection(
764 std::vector<size_t>* offsets_for_adjustment,
765 size_t section_begin) {
766 std::vector<size_t> offsets_into_section;
767 if (offsets_for_adjustment) {
768 std::transform(offsets_for_adjustment->begin(),
769 offsets_for_adjustment->end(),
770 std::back_inserter(offsets_into_section),
771 ClampComponentOffset(section_begin));
772 std::for_each(offsets_into_section.begin(), offsets_into_section.end(),
773 SubtractFromOffset(section_begin));
774 }
775 return offsets_into_section;
776 } 722 }
777 723
778 void ApplySectionAdjustments(const std::vector<size_t>& offsets_into_section, 724 // Transforms |original_offsets| by subtracting |component_begin| from all
brettw 2011/04/27 17:47:51 section_begin -> component_begin
779 std::vector<size_t>* offsets_for_adjustment, 725 // offsets. Any offset which was not at least this large to begin with is set
780 size_t old_section_len, 726 // to std::string::npos.
781 size_t new_section_len, 727 std::vector<size_t> OffsetsIntoComponent(
782 size_t section_begin) { 728 const std::vector<size_t>& original_offsets,
783 if (offsets_for_adjustment) { 729 size_t component_begin) {
784 DCHECK_EQ(offsets_for_adjustment->size(), offsets_into_section.size()); 730 DCHECK_NE(std::string::npos, component_begin);
785 std::vector<size_t>::const_iterator host_offsets_iter = 731 std::vector<size_t> offsets_into_component(original_offsets);
786 offsets_into_section.begin(); 732 for (std::vector<size_t>::iterator i(offsets_into_component.begin());
787 for (std::vector<size_t>::iterator offsets_iter = 733 i != offsets_into_component.end(); ++i) {
788 offsets_for_adjustment->begin(); 734 if (*i != std::string::npos)
789 offsets_iter != offsets_for_adjustment->end(); 735 *i = (*i < component_begin) ? std::string::npos : (*i - component_begin);
790 ++offsets_iter, ++host_offsets_iter) { 736 }
791 size_t offset = *offsets_iter; 737 return offsets_into_component;
792 if (offset == std::wstring::npos || offset < section_begin) { 738 }
793 // The offset is before the host section so leave it as is. 739
794 continue; 740 // Called after we transform a component and append it to an output string.
795 } 741 // Maps |transformed_offsets|, which represent offsets into the transformed
796 if (offset >= section_begin + old_section_len) { 742 // component itself, into appropriate offsets for the output string, by adding
797 // The offset is after the host section so adjust by host length delta. 743 // |output_component_begin| to each. Determines which offsets need mapping by
798 offset += new_section_len - old_section_len; 744 // checking to see which of the |original_offsets| were within the designated
799 } else if (*host_offsets_iter != std::wstring::npos) { 745 // original component, using its provided endpoints.
800 // The offset is within the host and valid so adjust by the host 746 void AdjustForComponentTransform(
801 // reformatting offsets results. 747 const std::vector<size_t>& original_offsets,
802 offset = section_begin + *host_offsets_iter; 748 size_t original_component_begin,
803 } else { 749 size_t original_component_end,
804 // The offset is invalid. 750 const std::vector<size_t>& transformed_offsets,
805 offset = std::wstring::npos; 751 size_t output_component_begin,
806 } 752 std::vector<size_t>* offsets_for_adjustment) {
807 *offsets_iter = offset; 753 if (!offsets_for_adjustment)
754 return;
755
756 DCHECK_NE(std::string::npos, original_component_begin);
757 DCHECK_NE(std::string::npos, original_component_end);
758 DCHECK_NE(string16::npos, output_component_begin);
759 size_t offsets_size = offsets_for_adjustment->size();
760 DCHECK_EQ(offsets_size, original_offsets.size());
761 DCHECK_EQ(offsets_size, transformed_offsets.size());
762 for (size_t i = 0; i < offsets_size; ++i) {
763 size_t original_offset = original_offsets[i];
764 if ((original_offset >= original_component_begin) &&
765 (original_offset < original_component_end)) {
766 size_t transformed_offset = transformed_offsets[i];
767 (*offsets_for_adjustment)[i] = (transformed_offset == string16::npos) ?
768 string16::npos : (output_component_begin + transformed_offset);
808 } 769 }
809 } 770 }
810 } 771 }
811 772
812 // If |component| is valid, its begin is incremented by |delta|. 773 // If |component| is valid, its begin is incremented by |delta|.
813 void AdjustComponent(int delta, url_parse::Component* component) { 774 void AdjustComponent(int delta, url_parse::Component* component) {
814 if (!component->is_valid()) 775 if (!component->is_valid())
815 return; 776 return;
816 777
817 DCHECK(delta >= 0 || component->begin >= -delta); 778 DCHECK(delta >= 0 || component->begin >= -delta);
818 component->begin += delta; 779 component->begin += delta;
819 } 780 }
820 781
821 // Adjusts all the components of |parsed| by |delta|, except for the scheme. 782 // Adjusts all the components of |parsed| by |delta|, except for the scheme.
822 void AdjustComponents(int delta, url_parse::Parsed* parsed) { 783 void AdjustComponents(int delta, url_parse::Parsed* parsed) {
823 AdjustComponent(delta, &(parsed->username)); 784 AdjustComponent(delta, &(parsed->username));
824 AdjustComponent(delta, &(parsed->password)); 785 AdjustComponent(delta, &(parsed->password));
825 AdjustComponent(delta, &(parsed->host)); 786 AdjustComponent(delta, &(parsed->host));
826 AdjustComponent(delta, &(parsed->port)); 787 AdjustComponent(delta, &(parsed->port));
827 AdjustComponent(delta, &(parsed->path)); 788 AdjustComponent(delta, &(parsed->path));
828 AdjustComponent(delta, &(parsed->query)); 789 AdjustComponent(delta, &(parsed->query));
829 AdjustComponent(delta, &(parsed->ref)); 790 AdjustComponent(delta, &(parsed->ref));
830 } 791 }
831 792
832 std::wstring FormatUrlInternal(const GURL& url, 793 // Helper for FormatUrlWithOffsets().
833 const std::wstring& languages, 794 string16 FormatViewSourceUrl(const GURL& url,
834 FormatUrlTypes format_types, 795 const std::vector<size_t>& original_offsets,
835 UnescapeRule::Type unescape_rules, 796 const std::string& languages,
836 url_parse::Parsed* new_parsed, 797 FormatUrlTypes format_types,
837 size_t* prefix_end, 798 UnescapeRule::Type unescape_rules,
838 std::vector<size_t>* offsets_for_adjustment); 799 url_parse::Parsed* new_parsed,
800 size_t* prefix_end,
801 std::vector<size_t>* offsets_for_adjustment) {
802 DCHECK(new_parsed);
803 const char kViewSource[] = "view-source:";
804 const size_t kViewSourceLength = arraysize(kViewSource) - 1;
805 std::vector<size_t> offsets_into_url(
806 OffsetsIntoComponent(original_offsets, kViewSourceLength));
839 807
840 // Helper for FormatUrl()/FormatUrlInternal(). 808 GURL real_url(url.possibly_invalid_spec().substr(kViewSourceLength));
841 std::wstring FormatViewSourceUrl(const GURL& url, 809 string16 result(ASCIIToUTF16(kViewSource) +
842 const std::wstring& languages, 810 FormatUrlWithOffsets(real_url, languages, format_types, unescape_rules,
843 FormatUrlTypes format_types, 811 new_parsed, prefix_end, &offsets_into_url));
844 UnescapeRule::Type unescape_rules,
845 url_parse::Parsed* new_parsed,
846 size_t* prefix_end,
847 std::vector<size_t>* offsets_for_adjustment) {
848 DCHECK(new_parsed);
849 DCHECK(offsets_for_adjustment);
850 const wchar_t* const kWideViewSource = L"view-source:";
851 const size_t kViewSourceLengthPlus1 = 12;
852 std::vector<size_t> saved_offsets(*offsets_for_adjustment);
853
854 GURL real_url(url.possibly_invalid_spec().substr(kViewSourceLengthPlus1));
855 // Clamp the offsets to the source area.
856 std::for_each(offsets_for_adjustment->begin(),
857 offsets_for_adjustment->end(),
858 SubtractFromOffset(kViewSourceLengthPlus1));
859 std::wstring result = FormatUrlInternal(real_url, languages, format_types,
860 unescape_rules, new_parsed, prefix_end, offsets_for_adjustment);
861 result.insert(0, kWideViewSource);
862 812
863 // Adjust position values. 813 // Adjust position values.
864 if (new_parsed->scheme.is_nonempty()) { 814 if (new_parsed->scheme.is_nonempty()) {
865 // Assume "view-source:real-scheme" as a scheme. 815 // Assume "view-source:real-scheme" as a scheme.
866 new_parsed->scheme.len += kViewSourceLengthPlus1; 816 new_parsed->scheme.len += kViewSourceLength;
867 } else { 817 } else {
868 new_parsed->scheme.begin = 0; 818 new_parsed->scheme.begin = 0;
869 new_parsed->scheme.len = kViewSourceLengthPlus1 - 1; 819 new_parsed->scheme.len = kViewSourceLength - 1;
870 } 820 }
871 AdjustComponents(kViewSourceLengthPlus1, new_parsed); 821 AdjustComponents(kViewSourceLength, new_parsed);
872 if (prefix_end) 822 if (prefix_end)
873 *prefix_end += kViewSourceLengthPlus1; 823 *prefix_end += kViewSourceLength;
874 std::for_each(offsets_for_adjustment->begin(), 824 AdjustForComponentTransform(original_offsets, kViewSourceLength,
875 offsets_for_adjustment->end(), 825 url.possibly_invalid_spec().length(), offsets_into_url, kViewSourceLength,
876 AddToOffset(kViewSourceLengthPlus1)); 826 offsets_for_adjustment);
877 // Restore all offsets which were not affected by FormatUrlInternal. 827 LimitOffsets(result, offsets_for_adjustment);
878 DCHECK_EQ(saved_offsets.size(), offsets_for_adjustment->size());
879 for (size_t i = 0; i < saved_offsets.size(); ++i) {
880 if (saved_offsets[i] < kViewSourceLengthPlus1)
881 (*offsets_for_adjustment)[i] = saved_offsets[i];
882 }
883 return result; 828 return result;
884 } 829 }
885 830
886 // Appends the substring |in_component| inside of the URL |spec| to |output|, 831 class AppendComponentTransform {
887 // and the resulting range will be filled into |out_component|. |unescape_rules| 832 public:
888 // defines how to clean the URL for human readability. |offsets_for_adjustment| 833 AppendComponentTransform() {}
889 // is an array of offsets into |output| each of which will be adjusted based on 834 virtual ~AppendComponentTransform() {}
890 // how it maps to the component being converted; if it is less than 835
891 // output->length(), it will be untouched, and if it is greater than 836 virtual string16 Execute(
892 // output->length() + in_component.len it will be adjusted by the difference in 837 const std::string& component_text,
893 // lengths between the input and output components. Otherwise it points into 838 std::vector<size_t>* offsets_into_component) const = 0;
894 // the component being converted, and is adjusted to point to the same logical 839
895 // place in |output|. |offsets_for_adjustment| may not be NULL. 840 // NOTE: No DISALLOW_COPY_AND_ASSIGN here, since gcc < 4.3.0 requires an
841 // accessible copy constructor in order to call AppendFormattedComponent()
842 // with an inline temporary (see http://gcc.gnu.org/bugs/#cxx%5Frvalbind ).
843 };
844
845 class HostComponentTransform : public AppendComponentTransform {
846 public:
847 explicit HostComponentTransform(const std::string& languages)
848 : languages_(languages) {
849 }
850
851 private:
852 virtual string16 Execute(
853 const std::string& component_text,
854 std::vector<size_t>* offsets_into_component) const {
855 return IDNToUnicodeWithOffsets(component_text, languages_,
856 offsets_into_component);
857 }
858
859 const std::string& languages_;
860 };
861
862 class NonHostComponentTransform : public AppendComponentTransform {
863 public:
864 explicit NonHostComponentTransform(UnescapeRule::Type unescape_rules)
865 : unescape_rules_(unescape_rules) {
866 }
867
868 private:
869 virtual string16 Execute(
870 const std::string& component_text,
871 std::vector<size_t>* offsets_into_component) const {
872 return (unescape_rules_ == UnescapeRule::NONE) ?
873 UTF8ToUTF16AndAdjustOffsets(component_text, offsets_into_component) :
874 UnescapeAndDecodeUTF8URLComponentWithOffsets(component_text,
875 unescape_rules_, offsets_into_component);
876 }
877
878 const UnescapeRule::Type unescape_rules_;
879 };
880
896 void AppendFormattedComponent(const std::string& spec, 881 void AppendFormattedComponent(const std::string& spec,
897 const url_parse::Component& in_component, 882 const url_parse::Component& original_component,
898 UnescapeRule::Type unescape_rules, 883 const std::vector<size_t>& original_offsets,
899 std::wstring* output, 884 const AppendComponentTransform& transform,
900 url_parse::Component* out_component, 885 string16* output,
886 url_parse::Component* output_component,
901 std::vector<size_t>* offsets_for_adjustment) { 887 std::vector<size_t>* offsets_for_adjustment) {
902 DCHECK(output); 888 DCHECK(output);
903 DCHECK(offsets_for_adjustment); 889 if (original_component.is_nonempty()) {
904 if (in_component.is_nonempty()) { 890 size_t original_component_begin =
905 size_t component_begin = output->length(); 891 static_cast<size_t>(original_component.begin);
906 out_component->begin = static_cast<int>(component_begin); 892 size_t output_component_begin = output->length();
893 if (output_component)
894 output_component->begin = static_cast<int>(output_component_begin);
907 895
908 // Compose a list of offsets within the component area.
909 std::vector<size_t> offsets_into_component = 896 std::vector<size_t> offsets_into_component =
910 OffsetsIntoSection(offsets_for_adjustment, component_begin); 897 OffsetsIntoComponent(original_offsets, original_component_begin);
898 output->append(transform.Execute(std::string(spec, original_component_begin,
899 static_cast<size_t>(original_component.len)), &offsets_into_component));
911 900
912 if (unescape_rules == UnescapeRule::NONE) { 901 if (output_component) {
913 output->append(UTF8ToWideAndAdjustOffsets( 902 output_component->len =
914 spec.substr(in_component.begin, in_component.len), 903 static_cast<int>(output->length() - output_component_begin);
915 &offsets_into_component));
916 } else {
917 output->append(UTF16ToWideHack(
918 UnescapeAndDecodeUTF8URLComponentWithOffsets(
919 spec.substr(in_component.begin, in_component.len), unescape_rules,
920 &offsets_into_component)));
921 } 904 }
922 size_t new_component_len = output->length() - component_begin; 905 AdjustForComponentTransform(original_offsets, original_component_begin,
923 out_component->len = static_cast<int>(new_component_len); 906 static_cast<size_t>(original_component.end()),
924 907 offsets_into_component, output_component_begin,
925 // Apply offset adjustments. 908 offsets_for_adjustment);
926 size_t old_component_len = static_cast<size_t>(in_component.len); 909 } else if (output_component) {
927 ApplySectionAdjustments(offsets_into_component, offsets_for_adjustment, 910 output_component->reset();
928 old_component_len, new_component_len, component_begin);
929 } else {
930 out_component->reset();
931 } 911 }
932 } 912 }
933 913
934 // TODO(viettrungluu): This is really the old-fashioned version, made internal.
935 // I need to really convert |FormatUrl()|.
936 std::wstring FormatUrlInternal(const GURL& url,
937 const std::wstring& languages,
938 FormatUrlTypes format_types,
939 UnescapeRule::Type unescape_rules,
940 url_parse::Parsed* new_parsed,
941 size_t* prefix_end,
942 std::vector<size_t>* offsets_for_adjustment) {
943 url_parse::Parsed parsed_temp;
944 if (!new_parsed)
945 new_parsed = &parsed_temp;
946 else
947 *new_parsed = url_parse::Parsed();
948
949 std::vector<size_t> offsets_temp;
950 if (!offsets_for_adjustment)
951 offsets_for_adjustment = &offsets_temp;
952
953 std::wstring url_string;
954
955 // Check for empty URLs or 0 available text width.
956 if (url.is_empty()) {
957 if (prefix_end)
958 *prefix_end = 0;
959 std::for_each(offsets_for_adjustment->begin(),
960 offsets_for_adjustment->end(),
961 LimitOffset<std::wstring>(0));
962 return url_string;
963 }
964
965 // Special handling for view-source:. Don't use chrome::kViewSourceScheme
966 // because this library shouldn't depend on chrome.
967 const char* const kViewSource = "view-source";
968 // Reject "view-source:view-source:..." to avoid deep recursion.
969 const char* const kViewSourceTwice = "view-source:view-source:";
970 if (url.SchemeIs(kViewSource) &&
971 !StartsWithASCII(url.possibly_invalid_spec(), kViewSourceTwice, false)) {
972 return FormatViewSourceUrl(url, languages, format_types,
973 unescape_rules, new_parsed, prefix_end, offsets_for_adjustment);
974 }
975
976 // We handle both valid and invalid URLs (this will give us the spec
977 // regardless of validity).
978 const std::string& spec = url.possibly_invalid_spec();
979 const url_parse::Parsed& parsed = url.parsed_for_possibly_invalid_spec();
980 size_t spec_length = spec.length();
981 std::for_each(offsets_for_adjustment->begin(),
982 offsets_for_adjustment->end(),
983 LimitOffset<std::wstring>(spec_length));
984
985 // Copy everything before the username (the scheme and the separators.)
986 // These are ASCII.
987 url_string.insert(url_string.end(), spec.begin(),
988 spec.begin() + parsed.CountCharactersBefore(url_parse::Parsed::USERNAME,
989 true));
990
991 const wchar_t kHTTP[] = L"http://";
992 const char kFTP[] = "ftp.";
993 // URLFixerUpper::FixupURL() treats "ftp.foo.com" as ftp://ftp.foo.com. This
994 // means that if we trim "http://" off a URL whose host starts with "ftp." and
995 // the user inputs this into any field subject to fixup (which is basically
996 // all input fields), the meaning would be changed. (In fact, often the
997 // formatted URL is directly pre-filled into an input field.) For this reason
998 // we avoid stripping "http://" in this case.
999 bool omit_http =
1000 (format_types & kFormatUrlOmitHTTP) && (url_string == kHTTP) &&
1001 (url.host().compare(0, arraysize(kFTP) - 1, kFTP) != 0);
1002
1003 new_parsed->scheme = parsed.scheme;
1004
1005 if ((format_types & kFormatUrlOmitUsernamePassword) != 0) {
1006 // Remove the username and password fields. We don't want to display those
1007 // to the user since they can be used for attacks,
1008 // e.g. "http://google.com:search@evil.ru/"
1009 new_parsed->username.reset();
1010 new_parsed->password.reset();
1011 // Update the offsets based on removed username and/or password.
1012 if (!offsets_for_adjustment->empty() &&
1013 (parsed.username.is_nonempty() || parsed.password.is_nonempty())) {
1014 AdjustOffset::Adjustments adjustments;
1015 if (parsed.username.is_nonempty() && parsed.password.is_nonempty()) {
1016 // The seeming off-by-one and off-by-two in these first two lines are to
1017 // account for the ':' after the username and '@' after the password.
1018 adjustments.push_back(AdjustOffset::Adjustment(
1019 static_cast<size_t>(parsed.username.begin),
1020 static_cast<size_t>(parsed.username.len + parsed.password.len +
1021 2), 0));
1022 } else {
1023 const url_parse::Component* nonempty_component =
1024 parsed.username.is_nonempty() ? &parsed.username : &parsed.password;
1025 // The seeming off-by-one in below is to account for the '@' after the
1026 // username/password.
1027 adjustments.push_back(AdjustOffset::Adjustment(
1028 static_cast<size_t>(nonempty_component->begin),
1029 static_cast<size_t>(nonempty_component->len + 1), 0));
1030 }
1031
1032 // Make offset adjustment.
1033 std::for_each(offsets_for_adjustment->begin(),
1034 offsets_for_adjustment->end(),
1035 AdjustOffset(adjustments));
1036 }
1037 } else {
1038 AppendFormattedComponent(spec, parsed.username, unescape_rules, &url_string,
1039 &new_parsed->username, offsets_for_adjustment);
1040 if (parsed.password.is_valid())
1041 url_string.push_back(':');
1042 AppendFormattedComponent(spec, parsed.password, unescape_rules, &url_string,
1043 &new_parsed->password, offsets_for_adjustment);
1044 if (parsed.username.is_valid() || parsed.password.is_valid())
1045 url_string.push_back('@');
1046 }
1047 if (prefix_end)
1048 *prefix_end = static_cast<size_t>(url_string.length());
1049
1050 AppendFormattedHostWithOffsets(url, languages, &url_string, new_parsed,
1051 offsets_for_adjustment);
1052
1053 // Port.
1054 if (parsed.port.is_nonempty()) {
1055 url_string.push_back(':');
1056 new_parsed->port.begin = url_string.length();
1057 url_string.insert(url_string.end(),
1058 spec.begin() + parsed.port.begin,
1059 spec.begin() + parsed.port.end());
1060 new_parsed->port.len = url_string.length() - new_parsed->port.begin;
1061 } else {
1062 new_parsed->port.reset();
1063 }
1064
1065 // Path and query both get the same general unescape & convert treatment.
1066 if (!(format_types & kFormatUrlOmitTrailingSlashOnBareHostname) ||
1067 !CanStripTrailingSlash(url)) {
1068 AppendFormattedComponent(spec, parsed.path, unescape_rules, &url_string,
1069 &new_parsed->path, offsets_for_adjustment);
1070 }
1071 if (parsed.query.is_valid())
1072 url_string.push_back('?');
1073 AppendFormattedComponent(spec, parsed.query, unescape_rules, &url_string,
1074 &new_parsed->query, offsets_for_adjustment);
1075
1076 // Reference is stored in valid, unescaped UTF-8, so we can just convert.
1077 if (parsed.ref.is_valid()) {
1078 url_string.push_back('#');
1079 size_t ref_begin = url_string.length();
1080 new_parsed->ref.begin = static_cast<int>(ref_begin);
1081
1082 // Compose a list of offsets within the section.
1083 std::vector<size_t> offsets_into_ref =
1084 OffsetsIntoSection(offsets_for_adjustment, ref_begin);
1085
1086 if (parsed.ref.len > 0) {
1087 url_string.append(UTF8ToWideAndAdjustOffsets(spec.substr(parsed.ref.begin,
1088 parsed.ref.len),
1089 &offsets_into_ref));
1090 }
1091 size_t old_ref_len = static_cast<size_t>(parsed.ref.len);
1092 size_t new_ref_len = url_string.length() - new_parsed->ref.begin;
1093 new_parsed->ref.len = static_cast<int>(new_ref_len);
1094
1095 // Apply offset adjustments.
1096 ApplySectionAdjustments(offsets_into_ref, offsets_for_adjustment,
1097 old_ref_len, new_ref_len, ref_begin);
1098 }
1099
1100 // If we need to strip out http do it after the fact. This way we don't need
1101 // to worry about how offset_for_adjustment is interpreted.
1102 const size_t kHTTPSize = arraysize(kHTTP) - 1;
1103 if (omit_http && !url_string.compare(0, kHTTPSize, kHTTP)) {
1104 url_string = url_string.substr(kHTTPSize);
1105 AdjustOffset::Adjustments adjustments;
1106 adjustments.push_back(AdjustOffset::Adjustment(0, kHTTPSize, 0));
1107 std::for_each(offsets_for_adjustment->begin(),
1108 offsets_for_adjustment->end(),
1109 AdjustOffset(adjustments));
1110 if (prefix_end)
1111 *prefix_end -= kHTTPSize;
1112
1113 // Adjust new_parsed.
1114 DCHECK(new_parsed->scheme.is_valid());
1115 int delta = -(new_parsed->scheme.len + 3); // +3 for ://.
1116 new_parsed->scheme.reset();
1117 AdjustComponents(delta, new_parsed);
1118 }
1119
1120 return url_string;
1121 }
1122
1123 } // namespace 914 } // namespace
1124 915
1125 const FormatUrlType kFormatUrlOmitNothing = 0; 916 const FormatUrlType kFormatUrlOmitNothing = 0;
1126 const FormatUrlType kFormatUrlOmitUsernamePassword = 1 << 0; 917 const FormatUrlType kFormatUrlOmitUsernamePassword = 1 << 0;
1127 const FormatUrlType kFormatUrlOmitHTTP = 1 << 1; 918 const FormatUrlType kFormatUrlOmitHTTP = 1 << 1;
1128 const FormatUrlType kFormatUrlOmitTrailingSlashOnBareHostname = 1 << 2; 919 const FormatUrlType kFormatUrlOmitTrailingSlashOnBareHostname = 1 << 2;
1129 const FormatUrlType kFormatUrlOmitAll = kFormatUrlOmitUsernamePassword | 920 const FormatUrlType kFormatUrlOmitAll = kFormatUrlOmitUsernamePassword |
1130 kFormatUrlOmitHTTP | kFormatUrlOmitTrailingSlashOnBareHostname; 921 kFormatUrlOmitHTTP | kFormatUrlOmitTrailingSlashOnBareHostname;
1131 922
1132 // TODO(viettrungluu): We don't want non-POD globals; change this. 923 // TODO(viettrungluu): We don't want non-POD globals; change this.
(...skipping 23 matching lines...) Expand all
1156 FILE_PATH_LITERAL("#"), FILE_PATH_LITERAL("%23")); 947 FILE_PATH_LITERAL("#"), FILE_PATH_LITERAL("%23"));
1157 948
1158 #if defined(OS_POSIX) 949 #if defined(OS_POSIX)
1159 ReplaceSubstringsAfterOffset(&url_string, 0, 950 ReplaceSubstringsAfterOffset(&url_string, 0,
1160 FILE_PATH_LITERAL("\\"), FILE_PATH_LITERAL("%5C")); 951 FILE_PATH_LITERAL("\\"), FILE_PATH_LITERAL("%5C"));
1161 #endif 952 #endif
1162 953
1163 return GURL(url_string); 954 return GURL(url_string);
1164 } 955 }
1165 956
1166 std::wstring GetSpecificHeader(const std::wstring& headers,
1167 const std::wstring& name) {
1168 return GetSpecificHeaderT(headers, name);
1169 }
1170
1171 std::string GetSpecificHeader(const std::string& headers, 957 std::string GetSpecificHeader(const std::string& headers,
1172 const std::string& name) { 958 const std::string& name) {
1173 return GetSpecificHeaderT(headers, name); 959 // We want to grab the Value from the "Key: Value" pairs in the headers,
960 // which should look like this (no leading spaces, \n-separated) (we format
961 // them this way in url_request_inet.cc):
962 // HTTP/1.1 200 OK\n
963 // ETag: "6d0b8-947-24f35ec0"\n
964 // Content-Length: 2375\n
965 // Content-Type: text/html; charset=UTF-8\n
966 // Last-Modified: Sun, 03 Sep 2006 04:34:43 GMT\n
967 if (headers.empty())
968 return std::string();
969
970 std::string match('\n' + name + ':');
971
972 std::string::const_iterator begin =
973 search(headers.begin(), headers.end(), match.begin(), match.end(),
974 base::CaseInsensitiveCompareASCII<char>());
975
976 if (begin == headers.end())
977 return std::string();
978
979 begin += match.length();
980
981 std::string ret;
982 TrimWhitespace(std::string(begin, find(begin, headers.end(), '\n')), TRIM_ALL,
983 &ret);
984 return ret;
1174 } 985 }
1175 986
1176 bool DecodeCharset(const std::string& input, 987 bool DecodeCharset(const std::string& input,
1177 std::string* decoded_charset, 988 std::string* decoded_charset,
1178 std::string* value) { 989 std::string* value) {
1179 StringTokenizer t(input, "'"); 990 StringTokenizer t(input, "'");
1180 t.set_options(StringTokenizer::RETURN_DELIMS); 991 t.set_options(StringTokenizer::RETURN_DELIMS);
1181 std::string temp_charset; 992 std::string temp_charset;
1182 std::string temp_value; 993 std::string temp_value;
1183 int numDelimsSeen = 0; 994 int numDelimsSeen = 0;
(...skipping 54 matching lines...) Expand 10 before | Expand all | Expand 10 after
1238 param_value = GetHeaderParamValue(header, "name", 1049 param_value = GetHeaderParamValue(header, "name",
1239 QuoteRule::REMOVE_OUTER_QUOTES); 1050 QuoteRule::REMOVE_OUTER_QUOTES);
1240 } 1051 }
1241 if (param_value.empty()) 1052 if (param_value.empty())
1242 return std::string(); 1053 return std::string();
1243 if (DecodeParamValue(param_value, referrer_charset, &decoded)) 1054 if (DecodeParamValue(param_value, referrer_charset, &decoded))
1244 return decoded; 1055 return decoded;
1245 return std::string(); 1056 return std::string();
1246 } 1057 }
1247 1058
1248 std::wstring GetHeaderParamValue(const std::wstring& field, 1059 // TODO(mpcomplete): This is a quick and dirty implementation for now. I'm
1249 const std::wstring& param_name, 1060 // sure this doesn't properly handle all (most?) cases.
1250 QuoteRule::Type quote_rule) { 1061 std::string GetHeaderParamValue(const std::string& header,
1251 return GetHeaderParamValueT(field, param_name, quote_rule); 1062 const std::string& param_name,
1063 QuoteRule::Type quote_rule) {
1064 // This assumes args are formatted exactly like "bla; arg1=value; arg2=value".
1065 std::string::const_iterator param_begin =
1066 search(header.begin(), header.end(), param_name.begin(), param_name.end(),
1067 base::CaseInsensitiveCompareASCII<char>());
1068
1069 if (param_begin == header.end())
1070 return std::string();
1071 param_begin += param_name.length();
1072
1073 std::string whitespace(" \t");
1074 size_t equals_offset =
1075 header.find_first_not_of(whitespace, param_begin - header.begin());
1076 if (equals_offset == std::string::npos || header[equals_offset] != '=')
1077 return std::string();
1078
1079 param_begin = header.begin() + equals_offset + 1;
1080 if (param_begin == header.end())
1081 return std::string();
1082
1083 std::string::const_iterator param_end;
1084 if (*param_begin == '"' && quote_rule == QuoteRule::REMOVE_OUTER_QUOTES) {
1085 ++param_begin; // skip past the quote.
1086 param_end = find(param_begin, header.end(), '"');
1087 // If the closing quote is missing, we will treat the rest of the
1088 // string as the parameter. We can't set |param_end| to the
1089 // location of the separator (';'), since the separator is
1090 // technically quoted. See: http://crbug.com/58840
1091 } else {
1092 param_end = find(param_begin + 1, header.end(), ';');
1093 }
1094
1095 return std::string(param_begin, param_end);
1252 } 1096 }
1253 1097
1254 std::string GetHeaderParamValue(const std::string& field, 1098 string16 IDNToUnicode(const std::string& host,
1255 const std::string& param_name, 1099 const std::string& languages) {
1256 QuoteRule::Type quote_rule) {
1257 return GetHeaderParamValueT(field, param_name, quote_rule);
1258 }
1259
1260 // TODO(brettw) bug 734373: check the scripts for each host component and
1261 // don't un-IDN-ize if there is more than one. Alternatively, only IDN for
1262 // scripts that the user has installed. For now, just put the entire
1263 // path through IDN. Maybe this feature can be implemented in ICU itself?
1264 //
1265 // We may want to skip this step in the case of file URLs to allow unicode
1266 // UNC hostnames regardless of encodings.
1267 std::wstring IDNToUnicodeWithOffsets(
1268 const char* host,
1269 size_t host_len,
1270 const std::wstring& languages,
1271 std::vector<size_t>* offsets_for_adjustment) {
1272 // Convert the ASCII input to a wide string for ICU.
1273 string16 input16;
1274 input16.reserve(host_len);
1275 input16.insert(input16.end(), host, host + host_len);
1276
1277 // Do each component of the host separately, since we enforce script matching
1278 // on a per-component basis.
1279 AdjustOffset::Adjustments adjustments;
1280 string16 out16;
1281 for (size_t component_start = 0, component_end;
1282 component_start < input16.length();
1283 component_start = component_end + 1) {
1284 // Find the end of the component.
1285 component_end = input16.find('.', component_start);
1286 if (component_end == string16::npos)
1287 component_end = input16.length(); // For getting the last component.
1288 size_t component_length = component_end - component_start;
1289 size_t new_component_start = out16.length();
1290 bool converted_idn = false;
1291 if (component_end > component_start) {
1292 // Add the substring that we just found.
1293 converted_idn = IDNToUnicodeOneComponent(input16.data() + component_start,
1294 component_length, languages, &out16);
1295 }
1296 size_t new_component_length = out16.length() - new_component_start;
1297
1298 if (converted_idn && offsets_for_adjustment) {
1299 adjustments.push_back(AdjustOffset::Adjustment(
1300 component_start, component_length, new_component_length));
1301 }
1302
1303 // Need to add the dot we just found (if we found one).
1304 if (component_end < input16.length())
1305 out16.push_back('.');
1306 }
1307
1308 // Make offset adjustment.
1309 if (offsets_for_adjustment && !adjustments.empty()) {
1310 std::for_each(offsets_for_adjustment->begin(),
1311 offsets_for_adjustment->end(),
1312 AdjustOffset(adjustments));
1313 }
1314
1315 return UTF16ToWideAndAdjustOffsets(out16, offsets_for_adjustment);
1316 }
1317
1318 std::wstring IDNToUnicode(const char* host,
1319 size_t host_len,
1320 const std::wstring& languages,
1321 size_t* offset_for_adjustment) {
1322 std::vector<size_t> offsets; 1100 std::vector<size_t> offsets;
1323 if (offset_for_adjustment) 1101 return IDNToUnicodeWithOffsets(host, languages, &offsets);
1324 offsets.push_back(*offset_for_adjustment);
1325 std::wstring result =
1326 IDNToUnicodeWithOffsets(host, host_len, languages, &offsets);
1327 if (offset_for_adjustment)
1328 *offset_for_adjustment = offsets[0];
1329 return result;
1330 } 1102 }
1331 1103
1332 std::string CanonicalizeHost(const std::string& host, 1104 std::string CanonicalizeHost(const std::string& host,
1333 url_canon::CanonHostInfo* host_info) { 1105 url_canon::CanonHostInfo* host_info) {
1334 // Try to canonicalize the host. 1106 // Try to canonicalize the host.
1335 const url_parse::Component raw_host_component( 1107 const url_parse::Component raw_host_component(
1336 0, static_cast<int>(host.length())); 1108 0, static_cast<int>(host.length()));
1337 std::string canon_host; 1109 std::string canon_host;
1338 url_canon::StdStringCanonOutput canon_host_output(&canon_host); 1110 url_canon::StdStringCanonOutput canon_host_output(&canon_host);
1339 url_canon::CanonicalizeHostVerbose(host.c_str(), raw_host_component, 1111 url_canon::CanonicalizeHostVerbose(host.c_str(), raw_host_component,
1340 &canon_host_output, host_info); 1112 &canon_host_output, host_info);
1341 1113
1342 if (host_info->out_host.is_nonempty() && 1114 if (host_info->out_host.is_nonempty() &&
1343 host_info->family != url_canon::CanonHostInfo::BROKEN) { 1115 host_info->family != url_canon::CanonHostInfo::BROKEN) {
1344 // Success! Assert that there's no extra garbage. 1116 // Success! Assert that there's no extra garbage.
1345 canon_host_output.Complete(); 1117 canon_host_output.Complete();
1346 DCHECK_EQ(host_info->out_host.len, static_cast<int>(canon_host.length())); 1118 DCHECK_EQ(host_info->out_host.len, static_cast<int>(canon_host.length()));
1347 } else { 1119 } else {
1348 // Empty host, or canonicalization failed. We'll return empty. 1120 // Empty host, or canonicalization failed. We'll return empty.
1349 canon_host.clear(); 1121 canon_host.clear();
1350 } 1122 }
1351 1123
1352 return canon_host; 1124 return canon_host;
1353 } 1125 }
1354 1126
1355 std::string CanonicalizeHost(const std::wstring& host,
1356 url_canon::CanonHostInfo* host_info) {
1357 std::string converted_host;
1358 WideToUTF8(host.c_str(), host.length(), &converted_host);
1359 return CanonicalizeHost(converted_host, host_info);
1360 }
1361
1362 std::string GetDirectoryListingHeader(const string16& title) { 1127 std::string GetDirectoryListingHeader(const string16& title) {
1363 static const base::StringPiece header( 1128 static const base::StringPiece header(
1364 NetModule::GetResource(IDR_DIR_HEADER_HTML)); 1129 NetModule::GetResource(IDR_DIR_HEADER_HTML));
1365 // This can be null in unit tests. 1130 // This can be null in unit tests.
1366 DLOG_IF(WARNING, header.empty()) << 1131 DLOG_IF(WARNING, header.empty()) <<
1367 "Missing resource: directory listing header"; 1132 "Missing resource: directory listing header";
1368 1133
1369 std::string result; 1134 std::string result;
1370 if (!header.empty()) 1135 if (!header.empty())
1371 result.assign(header.data(), header.size()); 1136 result.assign(header.data(), header.size());
(...skipping 360 matching lines...) Expand 10 before | Expand all | Expand 10 after
1732 UnescapeRule::Type flags = 1497 UnescapeRule::Type flags =
1733 UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS; 1498 UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS;
1734 *username = UnescapeAndDecodeUTF8URLComponent(url.username(), flags, NULL); 1499 *username = UnescapeAndDecodeUTF8URLComponent(url.username(), flags, NULL);
1735 *password = UnescapeAndDecodeUTF8URLComponent(url.password(), flags, NULL); 1500 *password = UnescapeAndDecodeUTF8URLComponent(url.password(), flags, NULL);
1736 } 1501 }
1737 1502
1738 std::string GetHostOrSpecFromURL(const GURL& url) { 1503 std::string GetHostOrSpecFromURL(const GURL& url) {
1739 return url.has_host() ? TrimEndingDot(url.host()) : url.spec(); 1504 return url.has_host() ? TrimEndingDot(url.host()) : url.spec();
1740 } 1505 }
1741 1506
1742 void AppendFormattedHostWithOffsets( 1507 void AppendFormattedHost(const GURL& url,
1743 const GURL& url, 1508 const std::string& languages,
1744 const std::wstring& languages, 1509 string16* output) {
1745 std::wstring* output, 1510 std::vector<size_t> offsets;
1746 url_parse::Parsed* new_parsed, 1511 AppendFormattedComponent(url.possibly_invalid_spec(),
1747 std::vector<size_t>* offsets_for_adjustment) { 1512 url.parsed_for_possibly_invalid_spec().host, offsets,
1748 DCHECK(output); 1513 HostComponentTransform(languages), output, NULL, NULL);
1749 const url_parse::Component& host =
1750 url.parsed_for_possibly_invalid_spec().host;
1751
1752 if (host.is_nonempty()) {
1753 // Handle possible IDN in the host name.
1754 size_t host_begin = output->length();
1755 if (new_parsed)
1756 new_parsed->host.begin = static_cast<int>(host_begin);
1757 size_t old_host_len = static_cast<size_t>(host.len);
1758
1759 // Compose a list of offsets within the host area.
1760 std::vector<size_t> offsets_into_host =
1761 OffsetsIntoSection(offsets_for_adjustment, host_begin);
1762
1763 const std::string& spec = url.possibly_invalid_spec();
1764 DCHECK(host.begin >= 0 &&
1765 ((spec.length() == 0 && host.begin == 0) ||
1766 host.begin < static_cast<int>(spec.length())));
1767 output->append(IDNToUnicodeWithOffsets(&spec[host.begin], old_host_len,
1768 languages, &offsets_into_host));
1769
1770 size_t new_host_len = output->length() - host_begin;
1771 if (new_parsed)
1772 new_parsed->host.len = static_cast<int>(new_host_len);
1773
1774 // Apply offset adjustments.
1775 ApplySectionAdjustments(offsets_into_host, offsets_for_adjustment,
1776 old_host_len, new_host_len, host_begin);
1777 } else if (new_parsed) {
1778 new_parsed->host.reset();
1779 }
1780 } 1514 }
1781 1515
1782 void AppendFormattedHost(const GURL& url,
1783 const std::wstring& languages,
1784 std::wstring* output,
1785 url_parse::Parsed* new_parsed,
1786 size_t* offset_for_adjustment) {
1787 std::vector<size_t> offsets;
1788 if (offset_for_adjustment)
1789 offsets.push_back(*offset_for_adjustment);
1790 AppendFormattedHostWithOffsets(url, languages, output, new_parsed, &offsets);
1791 if (offset_for_adjustment)
1792 *offset_for_adjustment = offsets[0];
1793 }
1794
1795 // TODO(viettrungluu): convert the wstring |FormatUrlInternal()|.
1796 string16 FormatUrlWithOffsets(const GURL& url, 1516 string16 FormatUrlWithOffsets(const GURL& url,
1797 const std::string& languages, 1517 const std::string& languages,
1798 FormatUrlTypes format_types, 1518 FormatUrlTypes format_types,
1799 UnescapeRule::Type unescape_rules, 1519 UnescapeRule::Type unescape_rules,
1800 url_parse::Parsed* new_parsed, 1520 url_parse::Parsed* new_parsed,
1801 size_t* prefix_end, 1521 size_t* prefix_end,
1802 std::vector<size_t>* offsets_for_adjustment) { 1522 std::vector<size_t>* offsets_for_adjustment) {
1803 return WideToUTF16Hack( 1523 url_parse::Parsed parsed_temp;
1804 FormatUrlInternal(url, ASCIIToWide(languages), format_types, 1524 if (!new_parsed)
1805 unescape_rules, new_parsed, prefix_end, 1525 new_parsed = &parsed_temp;
1806 offsets_for_adjustment)); 1526 else
1527 *new_parsed = url_parse::Parsed();
1528 std::vector<size_t> original_offsets;
1529 if (offsets_for_adjustment)
1530 original_offsets = *offsets_for_adjustment;
1531
1532 // Special handling for view-source:. Don't use chrome::kViewSourceScheme
1533 // because this library shouldn't depend on chrome.
1534 const char* const kViewSource = "view-source";
1535 // Reject "view-source:view-source:..." to avoid deep recursion.
1536 const char* const kViewSourceTwice = "view-source:view-source:";
1537 if (url.SchemeIs(kViewSource) &&
1538 !StartsWithASCII(url.possibly_invalid_spec(), kViewSourceTwice, false)) {
1539 return FormatViewSourceUrl(url, original_offsets, languages, format_types,
1540 unescape_rules, new_parsed, prefix_end, offsets_for_adjustment);
1541 }
1542
1543 // We handle both valid and invalid URLs (this will give us the spec
1544 // regardless of validity).
1545 const std::string& spec = url.possibly_invalid_spec();
1546 const url_parse::Parsed& parsed = url.parsed_for_possibly_invalid_spec();
1547
1548 // Scheme & separators. These are ASCII.
1549 string16 url_string;
1550 url_string.insert(url_string.end(), spec.begin(),
1551 spec.begin() + parsed.CountCharactersBefore(url_parse::Parsed::USERNAME,
1552 true));
1553 const char kHTTP[] = "http://";
1554 const char kFTP[] = "ftp.";
1555 // URLFixerUpper::FixupURL() treats "ftp.foo.com" as ftp://ftp.foo.com. This
1556 // means that if we trim "http://" off a URL whose host starts with "ftp." and
1557 // the user inputs this into any field subject to fixup (which is basically
1558 // all input fields), the meaning would be changed. (In fact, often the
1559 // formatted URL is directly pre-filled into an input field.) For this reason
1560 // we avoid stripping "http://" in this case.
1561 bool omit_http = (format_types & kFormatUrlOmitHTTP) &&
1562 EqualsASCII(url_string, kHTTP) &&
1563 !StartsWithASCII(url.host(), kFTP, true);
1564 new_parsed->scheme = parsed.scheme;
1565
1566 // Username & password.
1567 if ((format_types & kFormatUrlOmitUsernamePassword) != 0) {
1568 // Remove the username and password fields. We don't want to display those
1569 // to the user since they can be used for attacks,
1570 // e.g. "http://google.com:search@evil.ru/"
1571 new_parsed->username.reset();
1572 new_parsed->password.reset();
1573 // Update the offsets based on removed username and/or password.
1574 if (offsets_for_adjustment && !offsets_for_adjustment->empty() &&
1575 (parsed.username.is_nonempty() || parsed.password.is_nonempty())) {
1576 OffsetAdjuster offset_adjuster(offsets_for_adjustment);
1577 if (parsed.username.is_nonempty() && parsed.password.is_nonempty()) {
1578 // The seeming off-by-one and off-by-two in these first two lines are to
1579 // account for the ':' after the username and '@' after the password.
1580 offset_adjuster.Add(OffsetAdjuster::Adjustment(
1581 static_cast<size_t>(parsed.username.begin),
1582 static_cast<size_t>(parsed.username.len + parsed.password.len + 2),
1583 0));
1584 } else {
1585 const url_parse::Component* nonempty_component =
1586 parsed.username.is_nonempty() ? &parsed.username : &parsed.password;
1587 // The seeming off-by-one in below is to account for the '@' after the
1588 // username/password.
1589 offset_adjuster.Add(OffsetAdjuster::Adjustment(
1590 static_cast<size_t>(nonempty_component->begin),
1591 static_cast<size_t>(nonempty_component->len + 1), 0));
1592 }
1593 }
1594 } else {
1595 AppendFormattedComponent(spec, parsed.username, original_offsets,
1596 NonHostComponentTransform(unescape_rules), &url_string,
1597 &new_parsed->username, offsets_for_adjustment);
1598 if (parsed.password.is_valid()) {
1599 size_t colon = parsed.username.end();
1600 DCHECK_EQ(static_cast<size_t>(parsed.password.begin - 1), colon);
1601 std::vector<size_t>::const_iterator colon_iter =
1602 std::find(original_offsets.begin(), original_offsets.end(), colon);
1603 if (colon_iter != original_offsets.end()) {
1604 (*offsets_for_adjustment)[colon_iter - original_offsets.begin()] =
1605 url_string.length();
1606 }
1607 url_string.push_back(':');
1608 }
1609 AppendFormattedComponent(spec, parsed.password, original_offsets,
1610 NonHostComponentTransform(unescape_rules), &url_string,
1611 &new_parsed->password, offsets_for_adjustment);
1612 if (parsed.username.is_valid() || parsed.password.is_valid()) {
1613 size_t at_sign = (parsed.password.is_valid() ?
1614 parsed.password : parsed.username).end();
1615 DCHECK_EQ(static_cast<size_t>(parsed.host.begin - 1), at_sign);
1616 std::vector<size_t>::const_iterator at_sign_iter =
1617 std::find(original_offsets.begin(), original_offsets.end(), at_sign);
1618 if (at_sign_iter != original_offsets.end()) {
1619 (*offsets_for_adjustment)[at_sign_iter - original_offsets.begin()] =
1620 url_string.length();
1621 }
1622 url_string.push_back('@');
1623 }
1624 }
1625 if (prefix_end)
1626 *prefix_end = static_cast<size_t>(url_string.length());
1627
1628 // Host.
1629 AppendFormattedComponent(spec, parsed.host, original_offsets,
1630 HostComponentTransform(languages), &url_string, &new_parsed->host,
1631 offsets_for_adjustment);
1632
1633 // Port.
1634 if (parsed.port.is_nonempty()) {
1635 url_string.push_back(':');
1636 new_parsed->port.begin = url_string.length();
1637 url_string.insert(url_string.end(),
1638 spec.begin() + parsed.port.begin,
1639 spec.begin() + parsed.port.end());
1640 new_parsed->port.len = url_string.length() - new_parsed->port.begin;
1641 } else {
1642 new_parsed->port.reset();
1643 }
1644
1645 // Path & query. Both get the same general unescape & convert treatment.
1646 if (!(format_types & kFormatUrlOmitTrailingSlashOnBareHostname) ||
1647 !CanStripTrailingSlash(url)) {
1648 AppendFormattedComponent(spec, parsed.path, original_offsets,
1649 NonHostComponentTransform(unescape_rules), &url_string,
1650 &new_parsed->path, offsets_for_adjustment);
1651 }
1652 if (parsed.query.is_valid())
1653 url_string.push_back('?');
1654 AppendFormattedComponent(spec, parsed.query, original_offsets,
1655 NonHostComponentTransform(unescape_rules), &url_string,
1656 &new_parsed->query, offsets_for_adjustment);
1657
1658 // Ref. This is valid, unescaped UTF-8, so we can just convert.
1659 if (parsed.ref.is_valid()) {
1660 url_string.push_back('#');
1661 size_t original_ref_begin = static_cast<size_t>(parsed.ref.begin);
1662 size_t output_ref_begin = url_string.length();
1663 new_parsed->ref.begin = static_cast<int>(output_ref_begin);
1664
1665 std::vector<size_t> offsets_into_ref(
1666 OffsetsIntoComponent(original_offsets, original_ref_begin));
1667 if (parsed.ref.len > 0) {
1668 url_string.append(UTF8ToUTF16AndAdjustOffsets(
1669 spec.substr(original_ref_begin, static_cast<size_t>(parsed.ref.len)),
1670 &offsets_into_ref));
1671 }
1672
1673 new_parsed->ref.len =
1674 static_cast<int>(url_string.length() - new_parsed->ref.begin);
1675 AdjustForComponentTransform(original_offsets, original_ref_begin,
1676 static_cast<size_t>(parsed.ref.end()), offsets_into_ref,
1677 output_ref_begin, offsets_for_adjustment);
1678 }
1679
1680 // If we need to strip out http do it after the fact. This way we don't need
1681 // to worry about how offset_for_adjustment is interpreted.
1682 if (omit_http && StartsWith(url_string, ASCIIToUTF16(kHTTP), true)) {
1683 const size_t kHTTPSize = arraysize(kHTTP) - 1;
1684 url_string = url_string.substr(kHTTPSize);
1685 if (offsets_for_adjustment && !offsets_for_adjustment->empty()) {
1686 OffsetAdjuster offset_adjuster(offsets_for_adjustment);
1687 offset_adjuster.Add(OffsetAdjuster::Adjustment(0, kHTTPSize, 0));
1688 }
1689 if (prefix_end)
1690 *prefix_end -= kHTTPSize;
1691
1692 // Adjust new_parsed.
1693 DCHECK(new_parsed->scheme.is_valid());
1694 int delta = -(new_parsed->scheme.len + 3); // +3 for ://.
1695 new_parsed->scheme.reset();
1696 AdjustComponents(delta, new_parsed);
1697 }
1698
1699 LimitOffsets(url_string, offsets_for_adjustment);
1700 return url_string;
1807 } 1701 }
1808 1702
1809 string16 FormatUrl(const GURL& url, 1703 string16 FormatUrl(const GURL& url,
1810 const std::string& languages, 1704 const std::string& languages,
1811 FormatUrlTypes format_types, 1705 FormatUrlTypes format_types,
1812 UnescapeRule::Type unescape_rules, 1706 UnescapeRule::Type unescape_rules,
1813 url_parse::Parsed* new_parsed, 1707 url_parse::Parsed* new_parsed,
1814 size_t* prefix_end, 1708 size_t* prefix_end,
1815 size_t* offset_for_adjustment) { 1709 size_t* offset_for_adjustment) {
1816 std::vector<size_t> offsets; 1710 std::vector<size_t> offsets;
1817 if (offset_for_adjustment) 1711 if (offset_for_adjustment)
1818 offsets.push_back(*offset_for_adjustment); 1712 offsets.push_back(*offset_for_adjustment);
1819 string16 result = WideToUTF16Hack( 1713 string16 result = FormatUrlWithOffsets(url, languages, format_types,
1820 FormatUrlInternal(url, ASCIIToWide(languages), format_types, 1714 unescape_rules, new_parsed, prefix_end, &offsets);
1821 unescape_rules, new_parsed, prefix_end, &offsets));
1822 if (offset_for_adjustment) 1715 if (offset_for_adjustment)
1823 *offset_for_adjustment = offsets[0]; 1716 *offset_for_adjustment = offsets[0];
1824 return result; 1717 return result;
1825 } 1718 }
1826 1719
1827 bool CanStripTrailingSlash(const GURL& url) { 1720 bool CanStripTrailingSlash(const GURL& url) {
1828 // Omit the path only for standard, non-file URLs with nothing but "/" after 1721 // Omit the path only for standard, non-file URLs with nothing but "/" after
1829 // the hostname. 1722 // the hostname.
1830 return url.IsStandard() && !url.SchemeIsFile() && !url.has_query() && 1723 return url.IsStandard() && !url.SchemeIsFile() && !url.has_query() &&
1831 !url.has_ref() && url.path() == "/"; 1724 !url.has_ref() && url.path() == "/";
(...skipping 435 matching lines...) Expand 10 before | Expand all | Expand 10 after
2267 } 2160 }
2268 2161
2269 NetworkInterface::NetworkInterface(const std::string& name, 2162 NetworkInterface::NetworkInterface(const std::string& name,
2270 const IPAddressNumber& address) 2163 const IPAddressNumber& address)
2271 : name(name), address(address) { 2164 : name(name), address(address) {
2272 } 2165 }
2273 2166
2274 NetworkInterface::~NetworkInterface() { 2167 NetworkInterface::~NetworkInterface() {
2275 } 2168 }
2276 2169
2277 ClampComponentOffset::ClampComponentOffset(size_t component_start)
2278 : component_start(component_start) {}
2279
2280 size_t ClampComponentOffset::operator()(size_t offset) {
2281 return (offset >= component_start) ?
2282 offset : std::wstring::npos;
2283 }
2284
2285 } // namespace net 2170 } // namespace net
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698