base/i18n/icu_encoding_detection.cc - Issue 2168003003: Replace ICU encoding detection with CED

Side by Side Diff: base/i18n/icu_encoding_detection.cc

Issue 2168003003: Replace ICU encoding detection with CED (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: comments/add datafiles back Created 4 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.

4

5 #include "base/i18n/icu_encoding_detection.h"

6

7 #include <stdint.h>

8

9 #include <set>

10

11 #include "base/strings/string_util.h"

12 #include "third_party/icu/source/i18n/unicode/ucsdet.h"

13

14 namespace base {

15

16 bool DetectEncoding(const std::string& text, std::string* encoding) {

17 if (IsStringASCII(text)) {

18 *encoding = std::string();

19 return true;

20 }

21

22 UErrorCode status = U_ZERO_ERROR;

23 UCharsetDetector* detector = ucsdet_open(&status);

24 ucsdet_setText(detector, text.data(), static_cast<int32_t>(text.length()),

25 &status);

26 const UCharsetMatch* match = ucsdet_detect(detector, &status);

27 if (match != nullptr)

28 *encoding = ucsdet_getName(match, &status);

29 ucsdet_close(detector);

30 return (match != nullptr) && !!U_SUCCESS(status);

31 }

32

33 bool DetectAllEncodings(const std::string& text,

34 std::vector<std::string>* encodings) {

35 UErrorCode status = U_ZERO_ERROR;

36 UCharsetDetector* detector = ucsdet_open(&status);

37 ucsdet_setText(detector, text.data(), static_cast<int32_t>(text.length()),

38 &status);

39 int matches_count = 0;

40 const UCharsetMatch** matches = ucsdet_detectAll(detector,

41 &matches_count,

42 &status);

43 if (U_FAILURE(status)) {

44 ucsdet_close(detector);

45 return false;

46 }

47

48 // ICU has some heuristics for encoding detection, such that the more likely

49 // encodings should be returned first. However, it doesn't always return

50 // all encodings that properly decode \|text\|, so we'll append more encodings

51 // later. To make that efficient, keep track of encodings sniffed in this

52 // first phase.

53 std::set<std::string> sniffed_encodings;

54

55 encodings->clear();

56 for (int i = 0; i < matches_count; i++) {

57 UErrorCode get_name_status = U_ZERO_ERROR;

58 const char* encoding_name = ucsdet_getName(matches[i], &get_name_status);

59

60 // If we failed to get the encoding's name, ignore the error.

61 if (U_FAILURE(get_name_status))

62 continue;

63

64 int32_t confidence = ucsdet_getConfidence(matches[i], &get_name_status);

65

66 // We also treat this error as non-fatal.

67 if (U_FAILURE(get_name_status))

68 continue;

69

70 // A confidence level >= 10 means that the encoding is expected to properly

71 // decode the text. Drop all encodings with lower confidence level.

72 if (confidence < 10)

73 continue;

74

75 encodings->push_back(encoding_name);

76 sniffed_encodings.insert(encoding_name);

77 }

78

79 // Append all encodings not included earlier, in arbitrary order.

80 // TODO(jshin): This shouldn't be necessary, possible ICU bug.

81 // See also http://crbug.com/65917.

82 UEnumeration* detectable_encodings = ucsdet_getAllDetectableCharsets(detector,

83 &status);

84 int detectable_count = uenum_count(detectable_encodings, &status);

85 for (int i = 0; i < detectable_count; i++) {

86 int name_length;

87 const char* name_raw = uenum_next(detectable_encodings,

88 &name_length,

89 &status);

90 std::string name(name_raw, name_length);

91 if (sniffed_encodings.find(name) == sniffed_encodings.end())

92 encodings->push_back(name);

93 }

94 uenum_close(detectable_encodings);

95

96 ucsdet_close(detector);

97 return !encodings->empty();

98 }

99

100 } // namespace base

OLD	NEW

« no previous file with comments | « base/i18n/icu_encoding_detection.h ('k') | chromeos/network/network_state_unittest.cc » ('j') | net/ftp/ftp_directory_listing_parser.cc » ('J')