base/third_party/icu/icu_utf.cc - Issue 1538743002: Switch to standard integer types in base/.

Side by Side Diff: base/third_party/icu/icu_utf.cc

Issue 1538743002: Switch to standard integer types in base/. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: DEPS roll too Created 4 years, 12 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 ******************************************************************************	2 ******************************************************************************

3 *	3 *

4 * Copyright (C) 1999-2006, International Business Machines	4 * Copyright (C) 1999-2006, International Business Machines

5 * Corporation and others. All Rights Reserved.	5 * Corporation and others. All Rights Reserved.

6 *	6 *

7 ******************************************************************************	7 ******************************************************************************

8 * file name: utf_impl.c	8 * file name: utf_impl.c

9 * encoding: US-ASCII	9 * encoding: US-ASCII

10 * tab size: 8 (not used)	10 * tab size: 8 (not used)

(...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
67 * - MOV AX, 6 (result)	67 * - MOV AX, 6 (result)

68 * - JZ finish (ZF==1 if leadByte==0xff)	68 * - JZ finish (ZF==1 if leadByte==0xff)

69 * - SUB AX, BX (result)	69 * - SUB AX, BX (result)

70 * -finish:	70 * -finish:

71 * (BSR: Bit Scan Reverse, scans for a 1-bit, starting from the MSB)	71 * (BSR: Bit Scan Reverse, scans for a 1-bit, starting from the MSB)

72 *	72 *

73 * In Unicode, all UTF-8 byte sequences with more than 4 bytes are illegal;	73 * In Unicode, all UTF-8 byte sequences with more than 4 bytes are illegal;

74 * lead bytes above 0xf4 are illegal.	74 * lead bytes above 0xf4 are illegal.

75 * We keep them in this table for skipping long ISO 10646-UTF-8 sequences.	75 * We keep them in this table for skipping long ISO 10646-UTF-8 sequences.

76 */	76 */

77 const uint8	77 const uint8_t utf8_countTrailBytes[256] =

78 utf8_countTrailBytes[256]={	78 {

79 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	79 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

80 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	80 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

81 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	81 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

82 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

83	82

84 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	83 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

85 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	84 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

86 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	85 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

87 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

88	86

89 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	87 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

90 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	88 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

91 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	89 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

92 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

93	90

94 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	91 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

95 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	92 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

96	93

97 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,	94 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3,

98 3, 3, 3, 3, 3,	95 3, 3, /* illegal in Unicode */

99 3, 3, 3, /* illegal in Unicode */	96 4, 4, 4, 4, /* illegal in Unicode */

100 4, 4, 4, 4, /* illegal in Unicode */	97 5, 5, /* illegal in Unicode */

101 5, 5, /* illegal in Unicode */	98 0, 0 /* illegal bytes 0xfe and 0xff */

102 0, 0 /* illegal bytes 0xfe and 0xff */

103 };	99 };

104	100

105 static const UChar32	101 static const UChar32

106 utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };	102 utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };

107	103

108 static const UChar32	104 static const UChar32

109 utf8_errorValue[6]={	105 utf8_errorValue[6]={

110 CBUTF8_ERROR_VALUE_1, CBUTF8_ERROR_VALUE_2, CBUTF_ERROR_VALUE, 0x10ffff,	106 CBUTF8_ERROR_VALUE_1, CBUTF8_ERROR_VALUE_2, CBUTF_ERROR_VALUE, 0x10ffff,

111 0x3ffffff, 0x7fffffff	107 0x3ffffff, 0x7fffffff

112 };	108 };

(...skipping 13 matching lines...) Expand all Loading...
126 * Same as the obsolete "safe" behavior, but non-characters are also treated	122 * Same as the obsolete "safe" behavior, but non-characters are also treated

127 * like illegal sequences.	123 * like illegal sequences.

128 *	124 *

129 * The special negative (<0) value -2 is used for lenient treatment of surrogate	125 * The special negative (<0) value -2 is used for lenient treatment of surrogate

130 * code points as legal. Some implementations use this for roundtripping of	126 * code points as legal. Some implementations use this for roundtripping of

131 * Unicode 16-bit strings that are not well-formed UTF-16, that is, they	127 * Unicode 16-bit strings that are not well-formed UTF-16, that is, they

132 * contain unpaired surrogates.	128 * contain unpaired surrogates.

133 *	129 *

134 * Note that a UBool is the same as an int8_t.	130 * Note that a UBool is the same as an int8_t.

135 */	131 */

136 UChar32	132 UChar32 utf8_nextCharSafeBody(const uint8_t* s,

137 utf8_nextCharSafeBody(const uint8 *s, int32 *pi, int32 length, UChar32 c, UBool strict) {	133 int32_t* pi,

138 int32 i=*pi;	134 int32_t length,

139 uint8 count=CBU8_COUNT_TRAIL_BYTES(c);	135 UChar32 c,

	136 UBool strict) {

	137 int32_t i = *pi;

	138 uint8_t count = CBU8_COUNT_TRAIL_BYTES(c);

140 if((i)+count<=(length)) {	139 if((i)+count<=(length)) {

141 uint8 trail, illegal=0;	140 uint8_t trail, illegal = 0;

142	141

143 CBU8_MASK_LEAD_BYTE((c), count);	142 CBU8_MASK_LEAD_BYTE((c), count);

144 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */	143 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */

145 switch(count) {	144 switch(count) {

146 /* each branch falls through to the next one */	145 /* each branch falls through to the next one */

147 case 5:	146 case 5:

148 case 4:	147 case 4:

149 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */	148 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */

150 illegal=1;	149 illegal=1;

151 break;	150 break;

(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
185 * Starting with Unicode 3.2, surrogate code points must not be	184 * Starting with Unicode 3.2, surrogate code points must not be

186 * encoded in UTF-8, and there are no irregular sequences any more.	185 * encoded in UTF-8, and there are no irregular sequences any more.

187 *	186 *

188 * U8_ macros (new in ICU 2.4) return negative values for error conditions.	187 * U8_ macros (new in ICU 2.4) return negative values for error conditions.

189 */	188 */

190	189

191 /* correct sequence - all trail bytes have (b7..b6)==(10)? */	190 /* correct sequence - all trail bytes have (b7..b6)==(10)? */

192 /* illegal is also set if count>=4 */	191 /* illegal is also set if count>=4 */

193 if(illegal || (c)<utf8_minLegal[count] || (CBU_IS_SURROGATE(c) && strict !=-2)) {	192 if(illegal || (c)<utf8_minLegal[count] || (CBU_IS_SURROGATE(c) && strict !=-2)) {

194 /* error handling */	193 /* error handling */

195 uint8 errorCount=count;	194 uint8_t errorCount = count;

196 /* don't go beyond this sequence */	195 /* don't go beyond this sequence */

197 i=*pi;	196 i=*pi;

198 while(count>0 && CBU8_IS_TRAIL(s[i])) {	197 while(count>0 && CBU8_IS_TRAIL(s[i])) {

199 ++(i);	198 ++(i);

200 --count;	199 --count;

201 }	200 }

202 if(strict>=0) {	201 if(strict>=0) {

203 c=utf8_errorValue[errorCount-count];	202 c=utf8_errorValue[errorCount-count];

204 } else {	203 } else {

205 c=CBU_SENTINEL;	204 c=CBU_SENTINEL;

206 }	205 }

207 } else if((strict)>0 && CBU_IS_UNICODE_NONCHAR(c)) {	206 } else if((strict)>0 && CBU_IS_UNICODE_NONCHAR(c)) {

208 /* strict: forbid non-characters like U+fffe */	207 /* strict: forbid non-characters like U+fffe */

209 c=utf8_errorValue[count];	208 c=utf8_errorValue[count];

210 }	209 }

211 } else /* too few bytes left */ {	210 } else /* too few bytes left */ {

212 /* error handling */	211 /* error handling */

213 int32 i0=i;	212 int32_t i0 = i;

214 /* don't just set (i)=(length) in case there is an illegal sequence */	213 /* don't just set (i)=(length) in case there is an illegal sequence */

215 while((i)<(length) && CBU8_IS_TRAIL(s[i])) {	214 while((i)<(length) && CBU8_IS_TRAIL(s[i])) {

216 ++(i);	215 ++(i);

217 }	216 }

218 if(strict>=0) {	217 if(strict>=0) {

219 c=utf8_errorValue[i-i0];	218 c=utf8_errorValue[i-i0];

220 } else {	219 } else {

221 c=CBU_SENTINEL;	220 c=CBU_SENTINEL;

222 }	221 }

223 }	222 }

224 *pi=i;	223 *pi=i;

225 return c;	224 return c;

226 }	225 }

227	226

228 } // namespace base_icu	227 } // namespace base_icu

OLD	NEW

« no previous file with comments | « base/third_party/icu/icu_utf.h ('k') | base/third_party/nspr/prtime.cc » ('j') | no next file with comments »