| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (C) 2008, 2009 Google Inc. All rights reserved. | 2 * Copyright (C) 2008, 2009 Google Inc. All rights reserved. |
| 3 * | 3 * |
| 4 * Redistribution and use in source and binary forms, with or without | 4 * Redistribution and use in source and binary forms, with or without |
| 5 * modification, are permitted provided that the following conditions are | 5 * modification, are permitted provided that the following conditions are |
| 6 * met: | 6 * met: |
| 7 * | 7 * |
| 8 * * Redistributions of source code must retain the above copyright | 8 * * Redistributions of source code must retain the above copyright |
| 9 * notice, this list of conditions and the following disclaimer. | 9 * notice, this list of conditions and the following disclaimer. |
| 10 * * Redistributions in binary form must reproduce the above | 10 * * Redistributions in binary form must reproduce the above |
| (...skipping 12 matching lines...) Expand all Loading... |
| 23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | 23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| 24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | 24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| 28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 29 */ | 29 */ |
| 30 | 30 |
| 31 #include "platform/text/TextEncodingDetector.h" | 31 #include "platform/text/TextEncodingDetector.h" |
| 32 | 32 |
| 33 #include "third_party/ced/src/compact_enc_det/compact_enc_det.h" |
| 33 #include "wtf/text/TextEncoding.h" | 34 #include "wtf/text/TextEncoding.h" |
| 34 #include <unicode/ucnv.h> | |
| 35 #include <unicode/ucsdet.h> | |
| 36 | 35 |
| 37 namespace blink { | 36 namespace blink { |
| 38 | 37 |
| 39 bool detectTextEncoding(const char* data, size_t length, | 38 bool detectTextEncoding(const char* data, size_t length, |
| 40 const char* hintEncodingName, WTF::TextEncoding* detectedEncoding) | 39 const char* hintEncodingName, WTF::TextEncoding* detectedEncoding) |
| 41 { | 40 { |
| 42 *detectedEncoding = WTF::TextEncoding(); | 41 *detectedEncoding = WTF::TextEncoding(); |
| 43 int matchesCount = 0; | 42 int consumedBytes; |
| 44 UErrorCode status = U_ZERO_ERROR; | 43 bool isReliable; |
| 45 UCharsetDetector* detector = ucsdet_open(&status); | 44 Encoding encoding = CompactEncDet::DetectEncoding( |
| 46 if (U_FAILURE(status)) | 45 data, length, nullptr, nullptr, nullptr, |
| 47 return false; | 46 EncodingNameAliasToEncoding(hintEncodingName), |
| 48 ucsdet_enableInputFilter(detector, true); | 47 UNKNOWN_LANGUAGE, |
| 49 ucsdet_setText(detector, data, static_cast<int32_t>(length), &status); | 48 CompactEncDet::WEB_CORPUS, |
| 50 if (U_FAILURE(status)) | 49 false, // Include 7-bit encodings |
| 51 return false; | 50 &consumedBytes, |
| 52 | 51 &isReliable); |
| 53 // FIXME: A few things we can do other than improving | 52 if (encoding != UNKNOWN_ENCODING) { |
| 54 // the ICU detector itself. | 53 *detectedEncoding = WTF::TextEncoding(MimeEncodingName(encoding)); |
| 55 // 1. Use ucsdet_detectAll and pick the most likely one given | |
| 56 // "the context" (parent-encoding, referrer encoding, etc). | |
| 57 // 2. 'Emulate' Firefox/IE's non-Universal detectors (e.g. | |
| 58 // Chinese, Japanese, Russian, Korean and Hebrew) by picking the | |
| 59 // encoding with a highest confidence among the detector-specific | |
| 60 // limited set of candidate encodings. | |
| 61 // Below is a partial implementation of the first part of what's outlined | |
| 62 // above. | |
| 63 const UCharsetMatch** matches = ucsdet_detectAll(detector, &matchesCount, &status); | |
| 64 if (U_FAILURE(status)) { | |
| 65 ucsdet_close(detector); | |
| 66 return false; | |
| 67 } | |
| 68 | |
| 69 const char* encoding = 0; | |
| 70 if (hintEncodingName) { | |
| 71 WTF::TextEncoding hintEncoding(hintEncodingName); | |
| 72 // 10 is the minimum confidence value consistent with the codepoint | |
| 73 // allocation in a given encoding. The size of a chunk passed to | |
| 74 // us varies even for the same html file (apparently depending on | |
| 75 // the network load). When we're given a rather short chunk, we | |
| 76 // don't have a sufficiently reliable signal other than the fact that | |
| 77 // the chunk is consistent with a set of encodings. So, instead of | |
| 78 // setting an arbitrary threshold, we have to scan all the encodings | |
| 79 // consistent with the data. | |
| 80 const int32_t kThresold = 10; | |
| 81 for (int i = 0; i < matchesCount; ++i) { | |
| 82 int32_t confidence = ucsdet_getConfidence(matches[i], &status); | |
| 83 if (U_FAILURE(status)) { | |
| 84 status = U_ZERO_ERROR; | |
| 85 continue; | |
| 86 } | |
| 87 if (confidence < kThresold) | |
| 88 break; | |
| 89 const char* matchEncoding = ucsdet_getName(matches[i], &status); | |
| 90 if (U_FAILURE(status)) { | |
| 91 status = U_ZERO_ERROR; | |
| 92 continue; | |
| 93 } | |
| 94 if (WTF::TextEncoding(matchEncoding) == hintEncoding) { | |
| 95 encoding = hintEncodingName; | |
| 96 break; | |
| 97 } | |
| 98 } | |
| 99 } | |
| 100 // If no match is found so far, just pick the top match. | |
| 101 // This can happen, say, when a parent frame in EUC-JP refers to | |
| 102 // a child frame in Shift_JIS and both frames do NOT specify the encoding | |
| 103 // making us resort to auto-detection (when it IS turned on). | |
| 104 if (!encoding && matchesCount > 0) | |
| 105 encoding = ucsdet_getName(matches[0], &status); | |
| 106 if (U_SUCCESS(status)) { | |
| 107 *detectedEncoding = WTF::TextEncoding(encoding); | |
| 108 ucsdet_close(detector); | |
| 109 return true; | 54 return true; |
| 110 } | 55 } |
| 111 ucsdet_close(detector); | |
| 112 return false; | 56 return false; |
| 113 } | 57 } |
| 114 | 58 |
| 115 } // namespace blink | 59 } // namespace blink |
| OLD | NEW |