base/third_party/icu/icu_utf.h - Issue 1647803004: Move base to DEPS

Side by Side Diff: base/third_party/icu/icu_utf.h

Issue 1647803004: Move base to DEPS (Closed) Base URL: git@github.com:domokit/mojo.git@master

Patch Set: Created 4 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
	(Empty)
1 /*

2 *******************************************************************************

3 *

4 * Copyright (C) 1999-2004, International Business Machines

5 * Corporation and others. All Rights Reserved.

6 *

7 *******************************************************************************

8 * file name: utf.h

9 * encoding: US-ASCII

10 * tab size: 8 (not used)

11 * indentation:4

12 *

13 * created on: 1999sep09

14 * created by: Markus W. Scherer

15 */

16

17 #ifndef BASE_THIRD_PARTY_ICU_ICU_UTF_H_

18 #define BASE_THIRD_PARTY_ICU_ICU_UTF_H_

19

20 #include "base/basictypes.h"

21

22 namespace base_icu {

23

24 typedef int32 UChar32;

25 typedef uint16 UChar;

26 typedef int8 UBool;

27

28 // General ---------------------------------------------------------------------

29 // from utf.h

30

31 /**

32 * This value is intended for sentinel values for APIs that

33 * (take or) return single code points (UChar32).

34 * It is outside of the Unicode code point range 0..0x10ffff.

35 *

36 * For example, a "done" or "error" value in a new API

37 * could be indicated with CBU_SENTINEL.

38 *

39 * ICU APIs designed before ICU 2.4 usually define service-specific "done"

40 * values, mostly 0xffff.

41 * Those may need to be distinguished from

42 * actual U+ffff text contents by calling functions like

43 * CharacterIterator::hasNext() or UnicodeString::length().

44 *

45 * @return -1

46 * @see UChar32

47 * @stable ICU 2.4

48 */

49 #define CBU_SENTINEL (-1)

50

51 /**

52 * Is this code point a Unicode noncharacter?

53 * @param c 32-bit code point

54 * @return TRUE or FALSE

55 * @stable ICU 2.4

56 */

57 #define CBU_IS_UNICODE_NONCHAR(c) \

58 ((c)>=0xfdd0 && \

59 ((uint32)(c)<=0xfdef || ((c)&0xfffe)==0xfffe) && \

60 (uint32)(c)<=0x10ffff)

61

62 /**

63 * Is c a Unicode code point value (0..U+10ffff)

64 * that can be assigned a character?

65 *

66 * Code points that are not characters include:

67 * - single surrogate code points (U+d800..U+dfff, 2048 code points)

68 * - the last two code points on each plane (U+__fffe and U+__ffff, 34 code points)

69 * - U+fdd0..U+fdef (new with Unicode 3.1, 32 code points)

70 * - the highest Unicode code point value is U+10ffff

71 *

72 * This means that all code points below U+d800 are character code points,

73 * and that boundary is tested first for performance.

74 *

75 * @param c 32-bit code point

76 * @return TRUE or FALSE

77 * @stable ICU 2.4

78 */

79 #define CBU_IS_UNICODE_CHAR(c) \

80 ((uint32)(c)<0xd800 || \

81 ((uint32)(c)>0xdfff && \

82 (uint32)(c)<=0x10ffff && \

83 !CBU_IS_UNICODE_NONCHAR(c)))

84

85 /**

86 * Is this code point a surrogate (U+d800..U+dfff)?

87 * @param c 32-bit code point

88 * @return TRUE or FALSE

89 * @stable ICU 2.4

90 */

91 #define CBU_IS_SURROGATE(c) (((c)&0xfffff800)==0xd800)

92

93 /**

94 * Assuming c is a surrogate code point (CBU_IS_SURROGATE(c)),

95 * is it a lead surrogate?

96 * @param c 32-bit code point

97 * @return TRUE or FALSE

98 * @stable ICU 2.4

99 */

100 #define CBU_IS_SURROGATE_LEAD(c) (((c)&0x400)==0)

101

102

103 // UTF-8 macros ----------------------------------------------------------------

104 // from utf8.h

105

106 extern const uint8 utf8_countTrailBytes[256];

107

108 /**

109 * Count the trail bytes for a UTF-8 lead byte.

110 * @internal

111 */

112 #define CBU8_COUNT_TRAIL_BYTES(leadByte) (base_icu::utf8_countTrailBytes[(uint8)leadByte])

113

114 /**

115 * Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value.

116 * @internal

117 */

118 #define CBU8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1)

119

120 /**

121 * Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)?

122 * @param c 8-bit code unit (byte)

123 * @return TRUE or FALSE

124 * @stable ICU 2.4

125 */

126 #define CBU8_IS_SINGLE(c) (((c)&0x80)==0)

127

128 /**

129 * Is this code unit (byte) a UTF-8 lead byte?

130 * @param c 8-bit code unit (byte)

131 * @return TRUE or FALSE

132 * @stable ICU 2.4

133 */

134 #define CBU8_IS_LEAD(c) ((uint8)((c)-0xc0)<0x3e)

135

136 /**

137 * Is this code unit (byte) a UTF-8 trail byte?

138 * @param c 8-bit code unit (byte)

139 * @return TRUE or FALSE

140 * @stable ICU 2.4

141 */

142 #define CBU8_IS_TRAIL(c) (((c)&0xc0)==0x80)

143

144 /**

145 * How many code units (bytes) are used for the UTF-8 encoding

146 * of this Unicode code point?

147 * @param c 32-bit code point

148 * @return 1..4, or 0 if c is a surrogate or not a Unicode code point

149 * @stable ICU 2.4

150 */

151 #define CBU8_LENGTH(c) \

152 ((uint32)(c)<=0x7f ? 1 : \

153 ((uint32)(c)<=0x7ff ? 2 : \

154 ((uint32)(c)<=0xd7ff ? 3 : \

155 ((uint32)(c)<=0xdfff || (uint32)(c)>0x10ffff ? 0 : \

156 ((uint32)(c)<=0xffff ? 3 : 4)\

157 ) \

158 ) \

159 ) \

160 )

161

162 /**

163 * The maximum number of UTF-8 code units (bytes) per Unicode code point (U+0000..U+10ffff).

164 * @return 4

165 * @stable ICU 2.4

166 */

167 #define CBU8_MAX_LENGTH 4

168

169 /**

170 * Function for handling "next code point" with error-checking.

171 * @internal

172 */

173 UChar32 utf8_nextCharSafeBody(const uint8 *s, int32 *pi, int32 length, UChar32 c, UBool strict);

174

175 /**

176 * Get a code point from a string at a code point boundary offset,

177 * and advance the offset to the next code point boundary.

178 * (Post-incrementing forward iteration.)

179 * "Safe" macro, checks for illegal sequences and for string boundaries.

180 *

181 * The offset may point to the lead byte of a multi-byte sequence,

182 * in which case the macro will read the whole sequence.

183 * If the offset points to a trail byte or an illegal UTF-8 sequence, then

184 * c is set to a negative value.

185 *

186 * @param s const uint8 * string

187 * @param i string offset, i<length

188 * @param length string length

189 * @param c output UChar32 variable, set to <0 in case of an error

190 * @see CBU8_NEXT_UNSAFE

191 * @stable ICU 2.4

192 */

193 #define CBU8_NEXT(s, i, length, c) { \

194 (c)=(s)[(i)++]; \

195 if(((uint8)(c))>=0x80) { \

196 if(CBU8_IS_LEAD(c)) { \

197 (c)=base_icu::utf8_nextCharSafeBody((const uint8 *)s, &(i), (int32)(length), c, -1); \

198 } else { \

199 (c)=CBU_SENTINEL; \

200 } \

201 } \

202 }

203

204 /**

205 * Append a code point to a string, overwriting 1 to 4 bytes.

206 * The offset points to the current end of the string contents

207 * and is advanced (post-increment).

208 * "Unsafe" macro, assumes a valid code point and sufficient space in the string.

209 * Otherwise, the result is undefined.

210 *

211 * @param s const uint8 * string buffer

212 * @param i string offset

213 * @param c code point to append

214 * @see CBU8_APPEND

215 * @stable ICU 2.4

216 */

217 #define CBU8_APPEND_UNSAFE(s, i, c) { \

218 if((uint32)(c)<=0x7f) { \

219 (s)[(i)++]=(uint8)(c); \

220 } else { \

221 if((uint32)(c)<=0x7ff) { \

222 (s)[(i)++]=(uint8)(((c)>>6)|0xc0); \

223 } else { \

224 if((uint32)(c)<=0xffff) { \

225 (s)[(i)++]=(uint8)(((c)>>12)|0xe0); \

226 } else { \

227 (s)[(i)++]=(uint8)(((c)>>18)|0xf0); \

228 (s)[(i)++]=(uint8)((((c)>>12)&0x3f)|0x80); \

229 } \

230 (s)[(i)++]=(uint8)((((c)>>6)&0x3f)|0x80); \

231 } \

232 (s)[(i)++]=(uint8)(((c)&0x3f)|0x80); \

233 } \

234 }

235

236 // UTF-16 macros ---------------------------------------------------------------

237 // from utf16.h

238

239 /**

240 * Does this code unit alone encode a code point (BMP, not a surrogate)?

241 * @param c 16-bit code unit

242 * @return TRUE or FALSE

243 * @stable ICU 2.4

244 */

245 #define CBU16_IS_SINGLE(c) !CBU_IS_SURROGATE(c)

246

247 /**

248 * Is this code unit a lead surrogate (U+d800..U+dbff)?

249 * @param c 16-bit code unit

250 * @return TRUE or FALSE

251 * @stable ICU 2.4

252 */

253 #define CBU16_IS_LEAD(c) (((c)&0xfffffc00)==0xd800)

254

255 /**

256 * Is this code unit a trail surrogate (U+dc00..U+dfff)?

257 * @param c 16-bit code unit

258 * @return TRUE or FALSE

259 * @stable ICU 2.4

260 */

261 #define CBU16_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00)

262

263 /**

264 * Is this code unit a surrogate (U+d800..U+dfff)?

265 * @param c 16-bit code unit

266 * @return TRUE or FALSE

267 * @stable ICU 2.4

268 */

269 #define CBU16_IS_SURROGATE(c) CBU_IS_SURROGATE(c)

270

271 /**

272 * Assuming c is a surrogate code point (CBU16_IS_SURROGATE(c)),

273 * is it a lead surrogate?

274 * @param c 16-bit code unit

275 * @return TRUE or FALSE

276 * @stable ICU 2.4

277 */

278 #define CBU16_IS_SURROGATE_LEAD(c) (((c)&0x400)==0)

279

280 /**

281 * Helper constant for CBU16_GET_SUPPLEMENTARY.

282 * @internal

283 */

284 #define CBU16_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000)

285

286 /**

287 * Get a supplementary code point value (U+10000..U+10ffff)

288 * from its lead and trail surrogates.

289 * The result is undefined if the input values are not

290 * lead and trail surrogates.

291 *

292 * @param lead lead surrogate (U+d800..U+dbff)

293 * @param trail trail surrogate (U+dc00..U+dfff)

294 * @return supplementary code point (U+10000..U+10ffff)

295 * @stable ICU 2.4

296 */

297 #define CBU16_GET_SUPPLEMENTARY(lead, trail) \

298 (((base_icu::UChar32)(lead)<<10UL)+(base_icu::UChar32)(trail)-CBU16_SURROGATE_OFFSET)

299

300

301 /**

302 * Get the lead surrogate (0xd800..0xdbff) for a

303 * supplementary code point (0x10000..0x10ffff).

304 * @param supplementary 32-bit code point (U+10000..U+10ffff)

305 * @return lead surrogate (U+d800..U+dbff) for supplementary

306 * @stable ICU 2.4

307 */

308 #define CBU16_LEAD(supplementary) \

309 (base_icu::UChar)(((supplementary)>>10)+0xd7c0)

310

311 /**

312 * Get the trail surrogate (0xdc00..0xdfff) for a

313 * supplementary code point (0x10000..0x10ffff).

314 * @param supplementary 32-bit code point (U+10000..U+10ffff)

315 * @return trail surrogate (U+dc00..U+dfff) for supplementary

316 * @stable ICU 2.4

317 */

318 #define CBU16_TRAIL(supplementary) \

319 (base_icu::UChar)(((supplementary)&0x3ff)|0xdc00)

320

321 /**

322 * How many 16-bit code units are used to encode this Unicode code point? (1 or 2)

323 * The result is not defined if c is not a Unicode code point (U+0000..U+10ffff).

324 * @param c 32-bit code point

325 * @return 1 or 2

326 * @stable ICU 2.4

327 */

328 #define CBU16_LENGTH(c) ((uint32)(c)<=0xffff ? 1 : 2)

329

330 /**

331 * The maximum number of 16-bit code units per Unicode code point (U+0000..U+10ffff).

332 * @return 2

333 * @stable ICU 2.4

334 */

335 #define CBU16_MAX_LENGTH 2

336

337 /**

338 * Get a code point from a string at a code point boundary offset,

339 * and advance the offset to the next code point boundary.

340 * (Post-incrementing forward iteration.)

341 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.

342 *

343 * The offset may point to the lead surrogate unit

344 * for a supplementary code point, in which case the macro will read

345 * the following trail surrogate as well.

346 * If the offset points to a trail surrogate or

347 * to a single, unpaired lead surrogate, then that itself

348 * will be returned as the code point.

349 *

350 * @param s const UChar * string

351 * @param i string offset, i<length

352 * @param length string length

353 * @param c output UChar32 variable

354 * @stable ICU 2.4

355 */

356 #define CBU16_NEXT(s, i, length, c) { \

357 (c)=(s)[(i)++]; \

358 if(CBU16_IS_LEAD(c)) { \

359 uint16 __c2; \

360 if((i)<(length) && CBU16_IS_TRAIL(__c2=(s)[(i)])) { \

361 ++(i); \

362 (c)=CBU16_GET_SUPPLEMENTARY((c), __c2); \

363 } \

364 } \

365 }

366

367 /**

368 * Append a code point to a string, overwriting 1 or 2 code units.

369 * The offset points to the current end of the string contents

370 * and is advanced (post-increment).

371 * "Unsafe" macro, assumes a valid code point and sufficient space in the string.

372 * Otherwise, the result is undefined.

373 *

374 * @param s const UChar * string buffer

375 * @param i string offset

376 * @param c code point to append

377 * @see CBU16_APPEND

378 * @stable ICU 2.4

379 */

380 #define CBU16_APPEND_UNSAFE(s, i, c) { \

381 if((uint32)(c)<=0xffff) { \

382 (s)[(i)++]=(uint16)(c); \

383 } else { \

384 (s)[(i)++]=(uint16)(((c)>>10)+0xd7c0); \

385 (s)[(i)++]=(uint16)(((c)&0x3ff)|0xdc00); \

386 } \

387 }

388

389 } // namespace base_icu

390

391 #endif // BASE_THIRD_PARTY_ICU_ICU_UTF_H_

OLD	NEW

« no previous file with comments | « base/third_party/icu/README.chromium ('k') | base/third_party/icu/icu_utf.cc » ('j') | no next file with comments »