source/i18n/csrutf8.cpp - Issue 845603002: Update ICU to 54.1 step 1

Side by Side Diff: source/i18n/csrutf8.cpp

Issue 845603002: Update ICU to 54.1 step 1 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/icu.git@master

Patch Set: remove unused directories Created 5 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 **********************************************************************	2 **********************************************************************

3 * Copyright (C) 2005-2012, International Business Machines	3 * Copyright (C) 2005-2014, International Business Machines

4 * Corporation and others. All Rights Reserved.	4 * Corporation and others. All Rights Reserved.

5 **********************************************************************	5 **********************************************************************

6 */	6 */

7	7

8 #include "unicode/utypes.h"	8 #include "unicode/utypes.h"

9	9

10 #if !UCONFIG_NO_CONVERSION	10 #if !UCONFIG_NO_CONVERSION

11	11

12 #include "csrutf8.h"	12 #include "csrutf8.h"

13 #include "csmatch.h"	13 #include "csmatch.h"

(...skipping 34 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
48	48

49 // Hi bit on char found. Figure out how long the sequence should be	49 // Hi bit on char found. Figure out how long the sequence should be

50 if ((b & 0x0E0) == 0x0C0) {	50 if ((b & 0x0E0) == 0x0C0) {

51 trailBytes = 1;	51 trailBytes = 1;

52 } else if ((b & 0x0F0) == 0x0E0) {	52 } else if ((b & 0x0F0) == 0x0E0) {

53 trailBytes = 2;	53 trailBytes = 2;

54 } else if ((b & 0x0F8) == 0xF0) {	54 } else if ((b & 0x0F8) == 0xF0) {

55 trailBytes = 3;	55 trailBytes = 3;

56 } else {	56 } else {

57 numInvalid += 1;	57 numInvalid += 1;

58	58 continue;

59 if (numInvalid > 5) {

60 break;

61 }

62

63 trailBytes = 0;

64 }	59 }

65	60

66 // Verify that we've got the right number of trail bytes in the sequence	61 // Verify that we've got the right number of trail bytes in the sequence

67 for (;;) {	62 for (;;) {

68 i += 1;	63 i += 1;

69	64

70 if (i >= input->fRawLength) {	65 if (i >= input->fRawLength) {

71 break;	66 break;

72 }	67 }

73	68

74 b = inputBytes[i];	69 b = inputBytes[i];

75	70

76 if ((b & 0xC0) != 0x080) {	71 if ((b & 0xC0) != 0x080) {

77 numInvalid += 1;	72 numInvalid += 1;

78 break;	73 break;

79 }	74 }

80	75

81 if (--trailBytes == 0) {	76 if (--trailBytes == 0) {

82 numValid += 1;	77 numValid += 1;

83 break;	78 break;

84 }	79 }

85 }	80 }

86	81

87 }	82 }

88	83

89 // Cook up some sort of confidence score, based on presense of a BOM	84 // Cook up some sort of confidence score, based on presence of a BOM

90 // and the existence of valid and/or invalid multi-byte sequences.	85 // and the existence of valid and/or invalid multi-byte sequences.

91 confidence = 0;	86 confidence = 0;

92 if (hasBOM && numInvalid == 0) {	87 if (hasBOM && numInvalid == 0) {

93 confidence = 100;	88 confidence = 100;

94 } else if (hasBOM && numValid > numInvalid*10) {	89 } else if (hasBOM && numValid > numInvalid*10) {

95 confidence = 80;	90 confidence = 80;

96 } else if (numValid > 3 && numInvalid == 0) {	91 } else if (numValid > 3 && numInvalid == 0) {

97 confidence = 100;	92 confidence = 100;

98 } else if (numValid > 0 && numInvalid == 0) {	93 } else if (numValid > 0 && numInvalid == 0) {

99 confidence = 80;	94 confidence = 80;

100 } else if (numValid == 0 && numInvalid == 0) {	95 } else if (numValid == 0 && numInvalid == 0) {

101 // Plain ASCII.	96 // Plain ASCII. Confidence must be > 10, it's more likely than UTF-16, which

102 confidence = 10;	97 // accepts ASCII with confidence = 10.

	98 confidence = 15;

103 } else if (numValid > numInvalid*10) {	99 } else if (numValid > numInvalid*10) {

104 // Probably corruput utf-8 data. Valid sequences aren't likely by chance.	100 // Probably corruput utf-8 data. Valid sequences aren't likely by chance.

105 confidence = 25;	101 confidence = 25;

106 }	102 }

107	103

108 results->set(input, this, confidence);	104 results->set(input, this, confidence);

109 return (confidence > 0);	105 return (confidence > 0);

110 }	106 }

111	107

112 U_NAMESPACE_END	108 U_NAMESPACE_END

113 #endif	109 #endif

OLD	NEW

« no previous file with comments | « source/i18n/csrucode.cpp ('k') | source/i18n/currfmt.h » ('j') | no next file with comments »