Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(430)

Side by Side Diff: third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp

Issue 2081653007: Replace ICU with CED for auto encoding detection (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: deps Created 4 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « third_party/WebKit/Source/platform/blink_platform.gyp ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD | NEW
1 /* 1 /*
2 * Copyright (C) 2008, 2009 Google Inc. All rights reserved. 2 * Copyright (C) 2008, 2009 Google Inc. All rights reserved.
3 * 3 *
4 * Redistribution and use in source and binary forms, with or without 4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are 5 * modification, are permitted provided that the following conditions are
6 * met: 6 * met:
7 * 7 *
8 * * Redistributions of source code must retain the above copyright 8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer. 9 * notice, this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above 10 * * Redistributions in binary form must reproduce the above
(...skipping 12 matching lines...) Expand all
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */ 29 */
30 30
31 #include "platform/text/TextEncodingDetector.h" 31 #include "platform/text/TextEncodingDetector.h"
32 32
33 #include "third_party/ced/src/compact_enc_det/compact_enc_det.h"
33 #include "wtf/text/TextEncoding.h" 34 #include "wtf/text/TextEncoding.h"
34 #include <unicode/ucnv.h>
35 #include <unicode/ucsdet.h>
36 35
37 namespace blink { 36 namespace blink {
38 37
39 bool detectTextEncoding(const char* data, size_t length, 38 bool detectTextEncoding(const char* data, size_t length,
40 const char* hintEncodingName, WTF::TextEncoding* detectedEncoding) 39 const char* hintEncodingName, WTF::TextEncoding* detectedEncoding)
tkent 2016/06/23 03:30:39 I'm afraid ignoring |hintEncodingName| degrades CE
Jinsuk Kim 2016/06/23 07:35:57 Passed hint encoding info to CED.
41 { 40 {
42 *detectedEncoding = WTF::TextEncoding(); 41 *detectedEncoding = WTF::TextEncoding();
43 int matchesCount = 0; 42 int consumedBytes;
44 UErrorCode status = U_ZERO_ERROR; 43 bool isReliable;
45 UCharsetDetector* detector = ucsdet_open(&status); 44 Encoding encoding = CompactEncDet::DetectEncoding(
46 if (U_FAILURE(status)) 45 data, length, nullptr, nullptr, nullptr,
47 return false; 46 UNKNOWN_ENCODING,
48 ucsdet_enableInputFilter(detector, true); 47 UNKNOWN_LANGUAGE,
49 ucsdet_setText(detector, data, static_cast<int32_t>(length), &status); 48 CompactEncDet::WEB_CORPUS,
50 if (U_FAILURE(status)) 49 false, // Include 7-bit encodings
51 return false; 50 &consumedBytes,
52 51 &isReliable);
53 // FIXME: A few things we can do other than improving 52 if (encoding != UNKNOWN_ENCODING) {
54 // the ICU detector itself. 53 *detectedEncoding = WTF::TextEncoding(MimeEncodingName(encoding));
55 // 1. Use ucsdet_detectAll and pick the most likely one given
56 // "the context" (parent-encoding, referrer encoding, etc).
57 // 2. 'Emulate' Firefox/IE's non-Universal detectors (e.g.
58 // Chinese, Japanese, Russian, Korean and Hebrew) by picking the
59 // encoding with a highest confidence among the detector-specific
60 // limited set of candidate encodings.
61 // Below is a partial implementation of the first part of what's outlined
62 // above.
63 const UCharsetMatch** matches = ucsdet_detectAll(detector, &matchesCount, &status);
64 if (U_FAILURE(status)) {
65 ucsdet_close(detector);
66 return false;
67 }
68
69 const char* encoding = 0;
70 if (hintEncodingName) {
71 WTF::TextEncoding hintEncoding(hintEncodingName);
72 // 10 is the minimum confidence value consistent with the codepoint
73 // allocation in a given encoding. The size of a chunk passed to
74 // us varies even for the same html file (apparently depending on
75 // the network load). When we're given a rather short chunk, we
76 // don't have a sufficiently reliable signal other than the fact that
77 // the chunk is consistent with a set of encodings. So, instead of
78 // setting an arbitrary threshold, we have to scan all the encodings
79 // consistent with the data.
80 const int32_t kThresold = 10;
81 for (int i = 0; i < matchesCount; ++i) {
82 int32_t confidence = ucsdet_getConfidence(matches[i], &status);
83 if (U_FAILURE(status)) {
84 status = U_ZERO_ERROR;
85 continue;
86 }
87 if (confidence < kThresold)
88 break;
89 const char* matchEncoding = ucsdet_getName(matches[i], &status);
90 if (U_FAILURE(status)) {
91 status = U_ZERO_ERROR;
92 continue;
93 }
94 if (WTF::TextEncoding(matchEncoding) == hintEncoding) {
95 encoding = hintEncodingName;
96 break;
97 }
98 }
99 }
100 // If no match is found so far, just pick the top match.
101 // This can happen, say, when a parent frame in EUC-JP refers to
102 // a child frame in Shift_JIS and both frames do NOT specify the encoding
103 // making us resort to auto-detection (when it IS turned on).
104 if (!encoding && matchesCount > 0)
105 encoding = ucsdet_getName(matches[0], &status);
106 if (U_SUCCESS(status)) {
107 *detectedEncoding = WTF::TextEncoding(encoding);
108 ucsdet_close(detector);
109 return true; 54 return true;
110 } 55 }
111 ucsdet_close(detector);
112 return false; 56 return false;
113 } 57 }
114 58
115 } // namespace blink 59 } // namespace blink
OLD | NEW
« no previous file with comments | « third_party/WebKit/Source/platform/blink_platform.gyp ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698