Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(83)

Side by Side Diff: source/i18n/csrucode.cpp

Issue 845603002: Update ICU to 54.1 step 1 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/icu.git@master
Patch Set: remove unusued directories Created 5 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « source/i18n/csrsbcs.cpp ('k') | source/i18n/csrutf8.cpp » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 ********************************************************************** 2 **********************************************************************
3 * Copyright (C) 2005-2013, International Business Machines 3 * Copyright (C) 2005-2013, International Business Machines
4 * Corporation and others. All Rights Reserved. 4 * Corporation and others. All Rights Reserved.
5 ********************************************************************** 5 **********************************************************************
6 */ 6 */
7 7
8 #include "unicode/utypes.h" 8 #include "unicode/utypes.h"
9 9
10 #if !UCONFIG_NO_CONVERSION 10 #if !UCONFIG_NO_CONVERSION
(...skipping 11 matching lines...) Expand all
22 CharsetRecog_UTF_16_BE::~CharsetRecog_UTF_16_BE() 22 CharsetRecog_UTF_16_BE::~CharsetRecog_UTF_16_BE()
23 { 23 {
24 // nothing to do 24 // nothing to do
25 } 25 }
26 26
27 const char *CharsetRecog_UTF_16_BE::getName() const 27 const char *CharsetRecog_UTF_16_BE::getName() const
28 { 28 {
29 return "UTF-16BE"; 29 return "UTF-16BE";
30 } 30 }
31 31
32 // UTF-16 confidence calculation. Very simple minded, but better than nothing.
33 // Any 8 bit non-control characters bump the confidence up. These have a zero high byte,
34 // and are very likely to be UTF-16, although they could also be part of a UTF-32 code.
35 // NULs are a contra-indication, they will appear commonly if the actual encoding is UTF-32.
36 // NULs should be rare in actual text.
37
38 static int32_t adjustConfidence(UChar codeUnit, int32_t confidence) {
39 if (codeUnit == 0) {
40 confidence -= 10;
41 } else if ((codeUnit >= 0x20 && codeUnit <= 0xff) || codeUnit == 0x0a) {
42 confidence += 10;
43 }
44 if (confidence < 0) {
45 confidence = 0;
46 } else if (confidence > 100) {
47 confidence = 100;
48 }
49 return confidence;
50 }
51
52
32 UBool CharsetRecog_UTF_16_BE::match(InputText* textIn, CharsetMatch *results) const 53 UBool CharsetRecog_UTF_16_BE::match(InputText* textIn, CharsetMatch *results) const
33 { 54 {
34 const uint8_t *input = textIn->fRawInput; 55 const uint8_t *input = textIn->fRawInput;
35 int32_t confidence = 0; 56 int32_t confidence = 10;
36 int32_t length = textIn->fRawLength; 57 int32_t length = textIn->fRawLength;
37 58
38 if (length >=2 && input[0] == 0xFE && input[1] == 0xFF) { 59 int32_t bytesToCheck = (length > 30) ? 30 : length;
39 confidence = 100; 60 for (int32_t charIndex=0; charIndex<bytesToCheck-1; charIndex+=2) {
61 UChar codeUnit = (input[charIndex] << 8) | input[charIndex + 1];
62 if (charIndex == 0 && codeUnit == 0xFEFF) {
63 confidence = 100;
64 break;
65 }
66 confidence = adjustConfidence(codeUnit, confidence);
67 if (confidence == 0 || confidence == 100) {
68 break;
69 }
40 } 70 }
41 71 if (bytesToCheck < 4 && confidence < 100) {
42 // TODO: Do some statastics to check for unsigned UTF-16BE 72 confidence = 0;
73 }
43 results->set(textIn, this, confidence); 74 results->set(textIn, this, confidence);
44 return (confidence > 0); 75 return (confidence > 0);
45 } 76 }
46 77
47 CharsetRecog_UTF_16_LE::~CharsetRecog_UTF_16_LE() 78 CharsetRecog_UTF_16_LE::~CharsetRecog_UTF_16_LE()
48 { 79 {
49 // nothing to do 80 // nothing to do
50 } 81 }
51 82
52 const char *CharsetRecog_UTF_16_LE::getName() const 83 const char *CharsetRecog_UTF_16_LE::getName() const
53 { 84 {
54 return "UTF-16LE"; 85 return "UTF-16LE";
55 } 86 }
56 87
57 UBool CharsetRecog_UTF_16_LE::match(InputText* textIn, CharsetMatch *results) const 88 UBool CharsetRecog_UTF_16_LE::match(InputText* textIn, CharsetMatch *results) const
58 { 89 {
59 const uint8_t *input = textIn->fRawInput; 90 const uint8_t *input = textIn->fRawInput;
60 int32_t confidence = 0; 91 int32_t confidence = 10;
61 int32_t length = textIn->fRawLength; 92 int32_t length = textIn->fRawLength;
62 93
63 if (length >= 4 && input[0] == 0xFF && input[1] == 0xFE && (input[2] != 0x00 || input[3] != 0x00)) { 94 int32_t bytesToCheck = (length > 30) ? 30 : length;
64 confidence = 100; 95 for (int32_t charIndex=0; charIndex<bytesToCheck-1; charIndex+=2) {
96 UChar codeUnit = input[charIndex] | (input[charIndex + 1] << 8);
97 if (charIndex == 0 && codeUnit == 0xFEFF) {
98 confidence = 100; // UTF-16 BOM
99 if (length >= 4 && input[2] == 0 && input[3] == 0) {
100 confidence = 0; // UTF-32 BOM
101 }
102 break;
103 }
104 confidence = adjustConfidence(codeUnit, confidence);
105 if (confidence == 0 || confidence == 100) {
106 break;
107 }
65 } 108 }
66 109 if (bytesToCheck < 4 && confidence < 100) {
67 // TODO: Do some statastics to check for unsigned UTF-16LE 110 confidence = 0;
111 }
68 results->set(textIn, this, confidence); 112 results->set(textIn, this, confidence);
69 return (confidence > 0); 113 return (confidence > 0);
70 } 114 }
71 115
72 CharsetRecog_UTF_32::~CharsetRecog_UTF_32() 116 CharsetRecog_UTF_32::~CharsetRecog_UTF_32()
73 { 117 {
74 // nothing to do 118 // nothing to do
75 } 119 }
76 120
77 UBool CharsetRecog_UTF_32::match(InputText* textIn, CharsetMatch *results) const 121 UBool CharsetRecog_UTF_32::match(InputText* textIn, CharsetMatch *results) const
(...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after
145 189
146 int32_t CharsetRecog_UTF_32_LE::getChar(const uint8_t *input, int32_t index) const 190 int32_t CharsetRecog_UTF_32_LE::getChar(const uint8_t *input, int32_t index) const
147 { 191 {
148 return input[index + 3] << 24 | input[index + 2] << 16 | 192 return input[index + 3] << 24 | input[index + 2] << 16 |
149 input[index + 1] << 8 | input[index + 0]; 193 input[index + 1] << 8 | input[index + 0];
150 } 194 }
151 195
152 U_NAMESPACE_END 196 U_NAMESPACE_END
153 #endif 197 #endif
154 198
OLDNEW
« no previous file with comments | « source/i18n/csrsbcs.cpp ('k') | source/i18n/csrutf8.cpp » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698