| OLD | NEW |
| (Empty) |
| 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #include "base/i18n/icu_encoding_detection.h" | |
| 6 | |
| 7 #include <set> | |
| 8 | |
| 9 #include "base/strings/string_util.h" | |
| 10 #include "third_party/icu/source/i18n/unicode/ucsdet.h" | |
| 11 | |
| 12 namespace base { | |
| 13 | |
| 14 bool DetectEncoding(const std::string& text, std::string* encoding) { | |
| 15 if (IsStringASCII(text)) { | |
| 16 *encoding = std::string(); | |
| 17 return true; | |
| 18 } | |
| 19 | |
| 20 UErrorCode status = U_ZERO_ERROR; | |
| 21 UCharsetDetector* detector = ucsdet_open(&status); | |
| 22 ucsdet_setText(detector, text.data(), static_cast<int32_t>(text.length()), | |
| 23 &status); | |
| 24 const UCharsetMatch* match = ucsdet_detect(detector, &status); | |
| 25 if (match == NULL) | |
| 26 return false; | |
| 27 const char* detected_encoding = ucsdet_getName(match, &status); | |
| 28 ucsdet_close(detector); | |
| 29 | |
| 30 if (U_FAILURE(status)) | |
| 31 return false; | |
| 32 | |
| 33 *encoding = detected_encoding; | |
| 34 return true; | |
| 35 } | |
| 36 | |
| 37 bool DetectAllEncodings(const std::string& text, | |
| 38 std::vector<std::string>* encodings) { | |
| 39 UErrorCode status = U_ZERO_ERROR; | |
| 40 UCharsetDetector* detector = ucsdet_open(&status); | |
| 41 ucsdet_setText(detector, text.data(), static_cast<int32_t>(text.length()), | |
| 42 &status); | |
| 43 int matches_count = 0; | |
| 44 const UCharsetMatch** matches = ucsdet_detectAll(detector, | |
| 45 &matches_count, | |
| 46 &status); | |
| 47 if (U_FAILURE(status)) { | |
| 48 ucsdet_close(detector); | |
| 49 return false; | |
| 50 } | |
| 51 | |
| 52 // ICU has some heuristics for encoding detection, such that the more likely | |
| 53 // encodings should be returned first. However, it doesn't always return | |
| 54 // all encodings that properly decode |text|, so we'll append more encodings | |
| 55 // later. To make that efficient, keep track of encodings sniffed in this | |
| 56 // first phase. | |
| 57 std::set<std::string> sniffed_encodings; | |
| 58 | |
| 59 encodings->clear(); | |
| 60 for (int i = 0; i < matches_count; i++) { | |
| 61 UErrorCode get_name_status = U_ZERO_ERROR; | |
| 62 const char* encoding_name = ucsdet_getName(matches[i], &get_name_status); | |
| 63 | |
| 64 // If we failed to get the encoding's name, ignore the error. | |
| 65 if (U_FAILURE(get_name_status)) | |
| 66 continue; | |
| 67 | |
| 68 int32_t confidence = ucsdet_getConfidence(matches[i], &get_name_status); | |
| 69 | |
| 70 // We also treat this error as non-fatal. | |
| 71 if (U_FAILURE(get_name_status)) | |
| 72 continue; | |
| 73 | |
| 74 // A confidence level >= 10 means that the encoding is expected to properly | |
| 75 // decode the text. Drop all encodings with lower confidence level. | |
| 76 if (confidence < 10) | |
| 77 continue; | |
| 78 | |
| 79 encodings->push_back(encoding_name); | |
| 80 sniffed_encodings.insert(encoding_name); | |
| 81 } | |
| 82 | |
| 83 // Append all encodings not included earlier, in arbitrary order. | |
| 84 // TODO(jshin): This shouldn't be necessary, possible ICU bug. | |
| 85 // See also http://crbug.com/65917. | |
| 86 UEnumeration* detectable_encodings = ucsdet_getAllDetectableCharsets(detector, | |
| 87 &status); | |
| 88 int detectable_count = uenum_count(detectable_encodings, &status); | |
| 89 for (int i = 0; i < detectable_count; i++) { | |
| 90 int name_length; | |
| 91 const char* name_raw = uenum_next(detectable_encodings, | |
| 92 &name_length, | |
| 93 &status); | |
| 94 std::string name(name_raw, name_length); | |
| 95 if (sniffed_encodings.find(name) == sniffed_encodings.end()) | |
| 96 encodings->push_back(name); | |
| 97 } | |
| 98 uenum_close(detectable_encodings); | |
| 99 | |
| 100 ucsdet_close(detector); | |
| 101 return !encodings->empty(); | |
| 102 } | |
| 103 | |
| 104 } // namespace base | |
| OLD | NEW |