Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(306)

Side by Side Diff: chrome/renderer/render_view.cc

Issue 2714012: Convert page contents grabbing from wide to UTF16. The current code is a bit... (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/
Patch Set: '' Created 10 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« chrome/renderer/render_view.h ('K') | « chrome/renderer/render_view.h ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "chrome/renderer/render_view.h" 5 #include "chrome/renderer/render_view.h"
6 6
7 #include <algorithm> 7 #include <algorithm>
8 #include <string> 8 #include <string>
9 #include <vector> 9 #include <vector>
10 10
(...skipping 337 matching lines...) Expand 10 before | Expand all | Expand 10 after
348 std::string new_extension = 348 std::string new_extension =
349 RenderThread::current()->GetExtensionIdForURL(new_url); 349 RenderThread::current()->GetExtensionIdForURL(new_url);
350 return (old_extension != new_extension); 350 return (old_extension != new_extension);
351 } 351 }
352 352
353 // Returns the ISO 639-1 language code of the specified |text|, or 'unknown' 353 // Returns the ISO 639-1 language code of the specified |text|, or 'unknown'
354 // if it failed. 354 // if it failed.
355 // 355 //
356 // Note this only works on Windows at this time. It always returns 'unknown' 356 // Note this only works on Windows at this time. It always returns 'unknown'
357 // on other platforms. 357 // on other platforms.
358 static std::string DetermineTextLanguage(const std::wstring& text) { 358 static std::string DetermineTextLanguage(const string16& text) {
359 // Text with less than 100 bytes will probably not provide good results. 359 // Text with less than 100 bytes will probably not provide good results.
360 // Report it as unknown language. 360 // Report it as unknown language.
361 if (text.length() < 100) 361 if (text.length() < 100)
362 return RenderView::kUnknownLanguageCode; 362 return RenderView::kUnknownLanguageCode;
363 363
364 std::string language = RenderView::kUnknownLanguageCode; 364 std::string language = RenderView::kUnknownLanguageCode;
365 int num_languages = 0; 365 int num_languages = 0;
366 bool is_reliable = false; 366 bool is_reliable = false;
367 string16 input = WideToUTF16(text);
368 Language cld_language = 367 Language cld_language =
369 DetectLanguageOfUnicodeText(NULL, input.c_str(), true, &is_reliable, 368 DetectLanguageOfUnicodeText(NULL, text.c_str(), true, &is_reliable,
370 &num_languages, NULL); 369 &num_languages, NULL);
371 if (is_reliable && cld_language != NUM_LANGUAGES && 370 if (is_reliable && cld_language != NUM_LANGUAGES &&
372 cld_language != UNKNOWN_LANGUAGE && cld_language != TG_UNKNOWN_LANGUAGE) { 371 cld_language != UNKNOWN_LANGUAGE && cld_language != TG_UNKNOWN_LANGUAGE) {
373 // We should not use LanguageCode_ISO_639_1 because it does not cover all 372 // We should not use LanguageCode_ISO_639_1 because it does not cover all
374 // the languages CLD can detect. As a result, it'll return the invalid 373 // the languages CLD can detect. As a result, it'll return the invalid
375 // language code for traditional Chinese among others. 374 // language code for traditional Chinese among others.
376 // |LanguageCodeWithDialects| will go through ISO 639-1, ISO 639-2 and 375 // |LanguageCodeWithDialects| will go through ISO 639-1, ISO 639-2 and
377 // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN 376 // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN
378 // for Simplified Chinese. 377 // for Simplified Chinese.
379 language = LanguageCodeWithDialects(cld_language); 378 language = LanguageCodeWithDialects(cld_language);
(...skipping 435 matching lines...) Expand 10 before | Expand all | Expand 10 after
815 814
816 if (!preliminary_capture) 815 if (!preliminary_capture)
817 last_indexed_page_id_ = load_id; 816 last_indexed_page_id_ = load_id;
818 817
819 // Get the URL for this page. 818 // Get the URL for this page.
820 GURL url(main_frame->url()); 819 GURL url(main_frame->url());
821 if (url.is_empty()) 820 if (url.is_empty())
822 return; 821 return;
823 822
824 // Retrieve the frame's full text. 823 // Retrieve the frame's full text.
825 std::wstring contents; 824 string16 contents;
826 CaptureText(main_frame, &contents); 825 CaptureText(main_frame, &contents);
827 if (contents.size()) { 826 if (contents.size()) {
828 base::TimeTicks begin_time = base::TimeTicks::Now(); 827 base::TimeTicks begin_time = base::TimeTicks::Now();
829 std::string language = DetermineTextLanguage(contents); 828 std::string language = DetermineTextLanguage(contents);
830 UMA_HISTOGRAM_MEDIUM_TIMES("Renderer4.LanguageDetection", 829 UMA_HISTOGRAM_MEDIUM_TIMES("Renderer4.LanguageDetection",
831 base::TimeTicks::Now() - begin_time); 830 base::TimeTicks::Now() - begin_time);
832 831
833 // Send the text to the browser for indexing (the browser might decide not 832 // Send the text to the browser for indexing (the browser might decide not
834 // to index, if the URL is HTTPS for instance) and language discovery. 833 // to index, if the URL is HTTPS for instance) and language discovery.
835 Send(new ViewHostMsg_PageContents(routing_id_, url, load_id, contents, 834 Send(new ViewHostMsg_PageContents(routing_id_, url, load_id, contents,
836 language)); 835 language));
837 } 836 }
838 837
839 OnCaptureThumbnail(); 838 OnCaptureThumbnail();
840 } 839 }
841 840
842 void RenderView::CaptureText(WebFrame* frame, std::wstring* contents) { 841 void RenderView::CaptureText(WebFrame* frame, string16* contents) {
843 contents->clear(); 842 contents->clear();
844 if (!frame) 843 if (!frame)
845 return; 844 return;
846 845
847 #ifdef TIME_TEXT_RETRIEVAL 846 #ifdef TIME_TEXT_RETRIEVAL
848 double begin = time_util::GetHighResolutionTimeNow(); 847 double begin = time_util::GetHighResolutionTimeNow();
849 #endif 848 #endif
850 849
851 // get the contents of the frame 850 // get the contents of the frame
852 *contents = UTF16ToWideHack(frame->contentAsText(kMaxIndexChars)); 851 *contents = frame->contentAsText(kMaxIndexChars);
853 852
854 #ifdef TIME_TEXT_RETRIEVAL 853 #ifdef TIME_TEXT_RETRIEVAL
855 double end = time_util::GetHighResolutionTimeNow(); 854 double end = time_util::GetHighResolutionTimeNow();
856 char buf[128]; 855 char buf[128];
857 sprintf_s(buf, "%d chars retrieved for indexing in %gms\n", 856 sprintf_s(buf, "%d chars retrieved for indexing in %gms\n",
858 contents.size(), (end - begin)*1000); 857 contents.size(), (end - begin)*1000);
859 OutputDebugStringA(buf); 858 OutputDebugStringA(buf);
860 #endif 859 #endif
861 860
862 // When the contents are clipped to the maximum, we don't want to have a 861 // When the contents are clipped to the maximum, we don't want to have a
863 // partial word indexed at the end that might have been clipped. Therefore, 862 // partial word indexed at the end that might have been clipped. Therefore,
864 // terminate the string at the last space to ensure no words are clipped. 863 // terminate the string at the last space to ensure no words are clipped.
865 if (contents->size() == kMaxIndexChars) { 864 if (contents->size() == kMaxIndexChars) {
866 size_t last_space_index = contents->find_last_of(kWhitespaceWide); 865 size_t last_space_index = contents->find_last_of(kWhitespaceUTF16);
867 if (last_space_index == std::wstring::npos) 866 if (last_space_index == std::wstring::npos)
868 return; // don't index if we got a huge block of text with no spaces 867 return; // don't index if we got a huge block of text with no spaces
869 contents->resize(last_space_index); 868 contents->resize(last_space_index);
870 } 869 }
871 } 870 }
872 871
873 bool RenderView::CaptureThumbnail(WebView* view, 872 bool RenderView::CaptureThumbnail(WebView* view,
874 int w, 873 int w,
875 int h, 874 int h,
876 SkBitmap* thumbnail, 875 SkBitmap* thumbnail,
(...skipping 4235 matching lines...) Expand 10 before | Expand all | Expand 10 after
5112 webkit_glue::FormData form; 5111 webkit_glue::FormData form;
5113 const WebInputElement element = node.toConst<WebInputElement>(); 5112 const WebInputElement element = node.toConst<WebInputElement>();
5114 if (!form_manager_.FindFormWithFormControlElement( 5113 if (!form_manager_.FindFormWithFormControlElement(
5115 element, FormManager::REQUIRE_NONE, &form)) 5114 element, FormManager::REQUIRE_NONE, &form))
5116 return; 5115 return;
5117 5116
5118 autofill_action_ = action; 5117 autofill_action_ = action;
5119 Send(new ViewHostMsg_FillAutoFillFormData( 5118 Send(new ViewHostMsg_FillAutoFillFormData(
5120 routing_id_, autofill_query_id_, form, value, label)); 5119 routing_id_, autofill_query_id_, form, value, label));
5121 } 5120 }
OLDNEW
« chrome/renderer/render_view.h ('K') | « chrome/renderer/render_view.h ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698