| OLD | NEW |
| (Empty) |
| 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #include "base/i18n/icu_encoding_detection.h" | |
| 6 | |
| 7 #include <stdint.h> | |
| 8 | |
| 9 #include <set> | |
| 10 | |
| 11 #include "base/strings/string_util.h" | |
| 12 #include "third_party/icu/source/i18n/unicode/ucsdet.h" | |
| 13 | |
| 14 namespace base { | |
| 15 | |
| 16 bool DetectEncoding(const std::string& text, std::string* encoding) { | |
| 17 if (IsStringASCII(text)) { | |
| 18 *encoding = std::string(); | |
| 19 return true; | |
| 20 } | |
| 21 | |
| 22 UErrorCode status = U_ZERO_ERROR; | |
| 23 UCharsetDetector* detector = ucsdet_open(&status); | |
| 24 ucsdet_setText(detector, text.data(), static_cast<int32_t>(text.length()), | |
| 25 &status); | |
| 26 const UCharsetMatch* match = ucsdet_detect(detector, &status); | |
| 27 if (match != nullptr) | |
| 28 *encoding = ucsdet_getName(match, &status); | |
| 29 ucsdet_close(detector); | |
| 30 return (match != nullptr) && !!U_SUCCESS(status); | |
| 31 } | |
| 32 | |
| 33 bool DetectAllEncodings(const std::string& text, | |
| 34 std::vector<std::string>* encodings) { | |
| 35 UErrorCode status = U_ZERO_ERROR; | |
| 36 UCharsetDetector* detector = ucsdet_open(&status); | |
| 37 ucsdet_setText(detector, text.data(), static_cast<int32_t>(text.length()), | |
| 38 &status); | |
| 39 int matches_count = 0; | |
| 40 const UCharsetMatch** matches = ucsdet_detectAll(detector, | |
| 41 &matches_count, | |
| 42 &status); | |
| 43 if (U_FAILURE(status)) { | |
| 44 ucsdet_close(detector); | |
| 45 return false; | |
| 46 } | |
| 47 | |
| 48 // ICU has some heuristics for encoding detection, such that the more likely | |
| 49 // encodings should be returned first. However, it doesn't always return | |
| 50 // all encodings that properly decode |text|, so we'll append more encodings | |
| 51 // later. To make that efficient, keep track of encodings sniffed in this | |
| 52 // first phase. | |
| 53 std::set<std::string> sniffed_encodings; | |
| 54 | |
| 55 encodings->clear(); | |
| 56 for (int i = 0; i < matches_count; i++) { | |
| 57 UErrorCode get_name_status = U_ZERO_ERROR; | |
| 58 const char* encoding_name = ucsdet_getName(matches[i], &get_name_status); | |
| 59 | |
| 60 // If we failed to get the encoding's name, ignore the error. | |
| 61 if (U_FAILURE(get_name_status)) | |
| 62 continue; | |
| 63 | |
| 64 int32_t confidence = ucsdet_getConfidence(matches[i], &get_name_status); | |
| 65 | |
| 66 // We also treat this error as non-fatal. | |
| 67 if (U_FAILURE(get_name_status)) | |
| 68 continue; | |
| 69 | |
| 70 // A confidence level >= 10 means that the encoding is expected to properly | |
| 71 // decode the text. Drop all encodings with lower confidence level. | |
| 72 if (confidence < 10) | |
| 73 continue; | |
| 74 | |
| 75 encodings->push_back(encoding_name); | |
| 76 sniffed_encodings.insert(encoding_name); | |
| 77 } | |
| 78 | |
| 79 // Append all encodings not included earlier, in arbitrary order. | |
| 80 // TODO(jshin): This shouldn't be necessary, possible ICU bug. | |
| 81 // See also http://crbug.com/65917. | |
| 82 UEnumeration* detectable_encodings = ucsdet_getAllDetectableCharsets(detector, | |
| 83 &status); | |
| 84 int detectable_count = uenum_count(detectable_encodings, &status); | |
| 85 for (int i = 0; i < detectable_count; i++) { | |
| 86 int name_length; | |
| 87 const char* name_raw = uenum_next(detectable_encodings, | |
| 88 &name_length, | |
| 89 &status); | |
| 90 std::string name(name_raw, name_length); | |
| 91 if (sniffed_encodings.find(name) == sniffed_encodings.end()) | |
| 92 encodings->push_back(name); | |
| 93 } | |
| 94 uenum_close(detectable_encodings); | |
| 95 | |
| 96 ucsdet_close(detector); | |
| 97 return !encodings->empty(); | |
| 98 } | |
| 99 | |
| 100 } // namespace base | |
| OLD | NEW |