OLD | NEW |
| (Empty) |
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include "base/i18n/icu_encoding_detection.h" | |
6 | |
7 #include <stdint.h> | |
8 | |
9 #include <set> | |
10 | |
11 #include "base/strings/string_util.h" | |
12 #include "third_party/icu/source/i18n/unicode/ucsdet.h" | |
13 | |
14 namespace base { | |
15 | |
16 bool DetectEncoding(const std::string& text, std::string* encoding) { | |
17 if (IsStringASCII(text)) { | |
18 *encoding = std::string(); | |
19 return true; | |
20 } | |
21 | |
22 UErrorCode status = U_ZERO_ERROR; | |
23 UCharsetDetector* detector = ucsdet_open(&status); | |
24 ucsdet_setText(detector, text.data(), static_cast<int32_t>(text.length()), | |
25 &status); | |
26 const UCharsetMatch* match = ucsdet_detect(detector, &status); | |
27 if (match != nullptr) | |
28 *encoding = ucsdet_getName(match, &status); | |
29 ucsdet_close(detector); | |
30 return (match != nullptr) && !!U_SUCCESS(status); | |
31 } | |
32 | |
33 bool DetectAllEncodings(const std::string& text, | |
34 std::vector<std::string>* encodings) { | |
35 UErrorCode status = U_ZERO_ERROR; | |
36 UCharsetDetector* detector = ucsdet_open(&status); | |
37 ucsdet_setText(detector, text.data(), static_cast<int32_t>(text.length()), | |
38 &status); | |
39 int matches_count = 0; | |
40 const UCharsetMatch** matches = ucsdet_detectAll(detector, | |
41 &matches_count, | |
42 &status); | |
43 if (U_FAILURE(status)) { | |
44 ucsdet_close(detector); | |
45 return false; | |
46 } | |
47 | |
48 // ICU has some heuristics for encoding detection, such that the more likely | |
49 // encodings should be returned first. However, it doesn't always return | |
50 // all encodings that properly decode |text|, so we'll append more encodings | |
51 // later. To make that efficient, keep track of encodings sniffed in this | |
52 // first phase. | |
53 std::set<std::string> sniffed_encodings; | |
54 | |
55 encodings->clear(); | |
56 for (int i = 0; i < matches_count; i++) { | |
57 UErrorCode get_name_status = U_ZERO_ERROR; | |
58 const char* encoding_name = ucsdet_getName(matches[i], &get_name_status); | |
59 | |
60 // If we failed to get the encoding's name, ignore the error. | |
61 if (U_FAILURE(get_name_status)) | |
62 continue; | |
63 | |
64 int32_t confidence = ucsdet_getConfidence(matches[i], &get_name_status); | |
65 | |
66 // We also treat this error as non-fatal. | |
67 if (U_FAILURE(get_name_status)) | |
68 continue; | |
69 | |
70 // A confidence level >= 10 means that the encoding is expected to properly | |
71 // decode the text. Drop all encodings with lower confidence level. | |
72 if (confidence < 10) | |
73 continue; | |
74 | |
75 encodings->push_back(encoding_name); | |
76 sniffed_encodings.insert(encoding_name); | |
77 } | |
78 | |
79 // Append all encodings not included earlier, in arbitrary order. | |
80 // TODO(jshin): This shouldn't be necessary, possible ICU bug. | |
81 // See also http://crbug.com/65917. | |
82 UEnumeration* detectable_encodings = ucsdet_getAllDetectableCharsets(detector, | |
83 &status); | |
84 int detectable_count = uenum_count(detectable_encodings, &status); | |
85 for (int i = 0; i < detectable_count; i++) { | |
86 int name_length; | |
87 const char* name_raw = uenum_next(detectable_encodings, | |
88 &name_length, | |
89 &status); | |
90 std::string name(name_raw, name_length); | |
91 if (sniffed_encodings.find(name) == sniffed_encodings.end()) | |
92 encodings->push_back(name); | |
93 } | |
94 uenum_close(detectable_encodings); | |
95 | |
96 ucsdet_close(detector); | |
97 return !encodings->empty(); | |
98 } | |
99 | |
100 } // namespace base | |
OLD | NEW |