source/i18n/csrucode.cpp - Issue 845603002: Update ICU to 54.1 step 1

Side by Side Diff: source/i18n/csrucode.cpp

Issue 845603002: Update ICU to 54.1 step 1 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/icu.git@master

Patch Set: remove unused directories Created 5 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 **********************************************************************	2 **********************************************************************

3 * Copyright (C) 2005-2013, International Business Machines	3 * Copyright (C) 2005-2013, International Business Machines

4 * Corporation and others. All Rights Reserved.	4 * Corporation and others. All Rights Reserved.

5 **********************************************************************	5 **********************************************************************

6 */	6 */

7	7

8 #include "unicode/utypes.h"	8 #include "unicode/utypes.h"

9	9

10 #if !UCONFIG_NO_CONVERSION	10 #if !UCONFIG_NO_CONVERSION

(...skipping 11 matching lines...) Expand all Loading...
22 CharsetRecog_UTF_16_BE::~CharsetRecog_UTF_16_BE()	22 CharsetRecog_UTF_16_BE::~CharsetRecog_UTF_16_BE()

23 {	23 {

24 // nothing to do	24 // nothing to do

25 }	25 }

26	26

27 const char *CharsetRecog_UTF_16_BE::getName() const	27 const char *CharsetRecog_UTF_16_BE::getName() const

28 {	28 {

29 return "UTF-16BE";	29 return "UTF-16BE";

30 }	30 }

31	31

	32 // UTF-16 confidence calculation. Very simple minded, but better than nothing.

	33 // Any 8 bit non-control characters bump the confidence up. These have a zero high byte,

	34 // and are very likely to be UTF-16, although they could also be part of a UTF-32 code.

	35 // NULs are a contra-indication, they will appear commonly if the actual encoding is UTF-32.

	36 // NULs should be rare in actual text.

	37

	38 static int32_t adjustConfidence(UChar codeUnit, int32_t confidence) {

	39 if (codeUnit == 0) {

	40 confidence -= 10;

	41 } else if ((codeUnit >= 0x20 && codeUnit <= 0xff) || codeUnit == 0x0a) {

	42 confidence += 10;

	43 }

	44 if (confidence < 0) {

	45 confidence = 0;

	46 } else if (confidence > 100) {

	47 confidence = 100;

	48 }

	49 return confidence;

	50 }

	51

	52

32 UBool CharsetRecog_UTF_16_BE::match(InputText* textIn, CharsetMatch *results) const	53 UBool CharsetRecog_UTF_16_BE::match(InputText* textIn, CharsetMatch *results) const

33 {	54 {

34 const uint8_t *input = textIn->fRawInput;	55 const uint8_t *input = textIn->fRawInput;

35 int32_t confidence = 0;	56 int32_t confidence = 10;

36 int32_t length = textIn->fRawLength;	57 int32_t length = textIn->fRawLength;

37	58

38 if (length >=2 && input[0] == 0xFE && input[1] == 0xFF) {	59 int32_t bytesToCheck = (length > 30) ? 30 : length;

39 confidence = 100;	60 for (int32_t charIndex=0; charIndex<bytesToCheck-1; charIndex+=2) {

	61 UChar codeUnit = (input[charIndex] << 8) | input[charIndex + 1];

	62 if (charIndex == 0 && codeUnit == 0xFEFF) {

	63 confidence = 100;

	64 break;

	65 }

	66 confidence = adjustConfidence(codeUnit, confidence);

	67 if (confidence == 0 || confidence == 100) {

	68 break;

	69 }

40 }	70 }

41	71 if (bytesToCheck < 4 && confidence < 100) {

42 // TODO: Do some statastics to check for unsigned UTF-16BE	72 confidence = 0;

	73 }

43 results->set(textIn, this, confidence);	74 results->set(textIn, this, confidence);

44 return (confidence > 0);	75 return (confidence > 0);

45 }	76 }

46	77

47 CharsetRecog_UTF_16_LE::~CharsetRecog_UTF_16_LE()	78 CharsetRecog_UTF_16_LE::~CharsetRecog_UTF_16_LE()

48 {	79 {

49 // nothing to do	80 // nothing to do

50 }	81 }

51	82

52 const char *CharsetRecog_UTF_16_LE::getName() const	83 const char *CharsetRecog_UTF_16_LE::getName() const

53 {	84 {

54 return "UTF-16LE";	85 return "UTF-16LE";

55 }	86 }

56	87

57 UBool CharsetRecog_UTF_16_LE::match(InputText* textIn, CharsetMatch *results) const	88 UBool CharsetRecog_UTF_16_LE::match(InputText* textIn, CharsetMatch *results) const

58 {	89 {

59 const uint8_t *input = textIn->fRawInput;	90 const uint8_t *input = textIn->fRawInput;

60 int32_t confidence = 0;	91 int32_t confidence = 10;

61 int32_t length = textIn->fRawLength;	92 int32_t length = textIn->fRawLength;

62	93

63 if (length >= 4 && input[0] == 0xFF && input[1] == 0xFE && (input[2] != 0x00 || input[3] != 0x00)) {	94 int32_t bytesToCheck = (length > 30) ? 30 : length;

64 confidence = 100;	95 for (int32_t charIndex=0; charIndex<bytesToCheck-1; charIndex+=2) {

	96 UChar codeUnit = input[charIndex] | (input[charIndex + 1] << 8);

	97 if (charIndex == 0 && codeUnit == 0xFEFF) {

	98 confidence = 100; // UTF-16 BOM

	99 if (length >= 4 && input[2] == 0 && input[3] == 0) {

	100 confidence = 0; // UTF-32 BOM

	101 }

	102 break;

	103 }

	104 confidence = adjustConfidence(codeUnit, confidence);

	105 if (confidence == 0 || confidence == 100) {

	106 break;

	107 }

65 }	108 }

66	109 if (bytesToCheck < 4 && confidence < 100) {

67 // TODO: Do some statastics to check for unsigned UTF-16LE	110 confidence = 0;

	111 }

68 results->set(textIn, this, confidence);	112 results->set(textIn, this, confidence);

69 return (confidence > 0);	113 return (confidence > 0);

70 }	114 }

71	115

72 CharsetRecog_UTF_32::~CharsetRecog_UTF_32()	116 CharsetRecog_UTF_32::~CharsetRecog_UTF_32()

73 {	117 {

74 // nothing to do	118 // nothing to do

75 }	119 }

76	120

77 UBool CharsetRecog_UTF_32::match(InputText* textIn, CharsetMatch *results) const	121 UBool CharsetRecog_UTF_32::match(InputText* textIn, CharsetMatch *results) const

(...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
145	189

146 int32_t CharsetRecog_UTF_32_LE::getChar(const uint8_t *input, int32_t index) const	190 int32_t CharsetRecog_UTF_32_LE::getChar(const uint8_t *input, int32_t index) const

147 {	191 {

148 return input[index + 3] << 24 | input[index + 2] << 16 |	192 return input[index + 3] << 24 | input[index + 2] << 16 |

149 input[index + 1] << 8 | input[index + 0];	193 input[index + 1] << 8 | input[index + 0];

150 }	194 }

151	195

152 U_NAMESPACE_END	196 U_NAMESPACE_END

153 #endif	197 #endif

154	198

OLD	NEW

« no previous file with comments | « source/i18n/csrsbcs.cpp ('k') | source/i18n/csrutf8.cpp » ('j') | no next file with comments »