public/common/unicode/normalizer2.h - Issue 19276009: Move ICU headers part 1

Side by Side Diff: public/common/unicode/normalizer2.h

Issue 19276009: Move ICU headers part 1 (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/icu46/

Patch Set: Created 7 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
	(Empty)
1 /*

2 *******************************************************************************

3 *

4 * Copyright (C) 2009-2010, International Business Machines

5 * Corporation and others. All Rights Reserved.

6 *

7 *******************************************************************************

8 * file name: normalizer2.h

9 * encoding: US-ASCII

10 * tab size: 8 (not used)

11 * indentation:4

12 *

13 * created on: 2009nov22

14 * created by: Markus W. Scherer

15 */

16

17 #ifndef __NORMALIZER2_H__

18 #define __NORMALIZER2_H__

19

20 /**

21 * \file

22 * \brief C++ API: New API for Unicode Normalization.

23 */

24

25 #include "unicode/utypes.h"

26

27 #if !UCONFIG_NO_NORMALIZATION

28

29 #include "unicode/uniset.h"

30 #include "unicode/unistr.h"

31 #include "unicode/unorm2.h"

32

33 U_NAMESPACE_BEGIN

34

35 /**

36 * Unicode normalization functionality for standard Unicode normalization or

37 * for using custom mapping tables.

38 * All instances of this class are unmodifiable/immutable.

39 * Instances returned by getInstance() are singletons that must not be deleted by the caller.

40 * The Normalizer2 class is not intended for public subclassing.

41 *

42 * The primary functions are to produce a normalized string and to detect whether

43 * a string is already normalized.

44 * The most commonly used normalization forms are those defined in

45 * http://www.unicode.org/unicode/reports/tr15/

46 * However, this API supports additional normalization forms for specialized purposes.

47 * For example, NFKC_Casefold is provided via getInstance("nfkc_cf", COMPOSE)

48 * and can be used in implementations of UTS #46.

49 *

50 * Not only are the standard compose and decompose modes supplied,

51 * but additional modes are provided as documented in the Mode enum.

52 *

53 * Some of the functions in this class identify normalization boundaries.

54 * At a normalization boundary, the portions of the string

55 * before it and starting from it do not interact and can be handled independently.

56 *

57 * The spanQuickCheckYes() stops at a normalization boundary.

58 * When the goal is a normalized string, then the text before the boundary

59 * can be copied, and the remainder can be processed with normalizeSecondAndAppend().

60 *

61 * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether

62 * a character is guaranteed to be at a normalization boundary,

63 * regardless of context.

64 * This is used for moving from one normalization boundary to the next

65 * or preceding boundary, and for performing iterative normalization.

66 *

67 * Iterative normalization is useful when only a small portion of a

68 * longer string needs to be processed.

69 * For example, in ICU, iterative normalization is used by the NormalizationTransliterator

70 * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()

71 * (to process only the substring for which sort key bytes are computed).

72 *

73 * The set of normalization boundaries returned by these functions may not be

74 * complete: There may be more boundaries that could be returned.

75 * Different functions may return different boundaries.

76 * @stable ICU 4.4

77 */

78 class U_COMMON_API Normalizer2 : public UObject {

79 public:

80 /**

81 * Returns a Normalizer2 instance which uses the specified data file

82 * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)

83 * and which composes or decomposes text according to the specified mode.

84 * Returns an unmodifiable singleton instance. Do not delete it.

85 *

86 * Use packageName=NULL for data files that are part of ICU's own data.

87 * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.

88 * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.

89 * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.

90 *

91 * @param packageName NULL for ICU built-in data, otherwise application data package name

92 * @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file

93 * @param mode normalization mode (compose or decompose etc.)

94 * @param errorCode Standard ICU error code. Its input value must

95 * pass the U_SUCCESS() test, or else the function returns

96 * immediately. Check for U_FAILURE() on output or use with

97 * function chaining. (See User Guide for details.)

98 * @return the requested Normalizer2, if successful

99 * @stable ICU 4.4

100 */

101 static const Normalizer2 *

102 getInstance(const char *packageName,

103 const char *name,

104 UNormalization2Mode mode,

105 UErrorCode &errorCode);

106

107 /**

108 * Returns the normalized form of the source string.

109 * @param src source string

110 * @param errorCode Standard ICU error code. Its input value must

111 * pass the U_SUCCESS() test, or else the function returns

112 * immediately. Check for U_FAILURE() on output or use with

113 * function chaining. (See User Guide for details.)

114 * @return normalized src

115 * @stable ICU 4.4

116 */

117 UnicodeString

118 normalize(const UnicodeString &src, UErrorCode &errorCode) const {

119 UnicodeString result;

120 normalize(src, result, errorCode);

121 return result;

122 }

123 /**

124 * Writes the normalized form of the source string to the destination string

125 * (replacing its contents) and returns the destination string.

126 * The source and destination strings must be different objects.

127 * @param src source string

128 * @param dest destination string; its contents are replaced with normalized src

129 * @param errorCode Standard ICU error code. Its input value must

130 * pass the U_SUCCESS() test, or else the function returns

131 * immediately. Check for U_FAILURE() on output or use with

132 * function chaining. (See User Guide for details.)

133 * @return dest

134 * @stable ICU 4.4

135 */

136 virtual UnicodeString &

137 normalize(const UnicodeString &src,

138 UnicodeString &dest,

139 UErrorCode &errorCode) const = 0;

140 /**

141 * Appends the normalized form of the second string to the first string

142 * (merging them at the boundary) and returns the first string.

143 * The result is normalized if the first string was normalized.

144 * The first and second strings must be different objects.

145 * @param first string, should be normalized

146 * @param second string, will be normalized

147 * @param errorCode Standard ICU error code. Its input value must

148 * pass the U_SUCCESS() test, or else the function returns

149 * immediately. Check for U_FAILURE() on output or use with

150 * function chaining. (See User Guide for details.)

151 * @return first

152 * @stable ICU 4.4

153 */

154 virtual UnicodeString &

155 normalizeSecondAndAppend(UnicodeString &first,

156 const UnicodeString &second,

157 UErrorCode &errorCode) const = 0;

158 /**

159 * Appends the second string to the first string

160 * (merging them at the boundary) and returns the first string.

161 * The result is normalized if both the strings were normalized.

162 * The first and second strings must be different objects.

163 * @param first string, should be normalized

164 * @param second string, should be normalized

165 * @param errorCode Standard ICU error code. Its input value must

166 * pass the U_SUCCESS() test, or else the function returns

167 * immediately. Check for U_FAILURE() on output or use with

168 * function chaining. (See User Guide for details.)

169 * @return first

170 * @stable ICU 4.4

171 */

172 virtual UnicodeString &

173 append(UnicodeString &first,

174 const UnicodeString &second,

175 UErrorCode &errorCode) const = 0;

176

177 /**

178 * Gets the decomposition mapping of c. Equivalent to normalize(UnicodeString(c))

179 * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster.

180 * This function is independent of the mode of the Normalizer2.

181 * @param c code point

182 * @param decomposition String object which will be set to c's

183 * decomposition mapping, if there is one.

184 * @return TRUE if c has a decomposition, otherwise FALSE

185 * @draft ICU 4.6

186 */

187 virtual UBool

188 getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0;

189

190 /**

191 * Tests if the string is normalized.

192 * Internally, in cases where the quickCheck() method would return "maybe"

193 * (which is only possible for the two COMPOSE modes) this method

194 * resolves to "yes" or "no" to provide a definitive result,

195 * at the cost of doing more work in those cases.

196 * @param s input string

197 * @param errorCode Standard ICU error code. Its input value must

198 * pass the U_SUCCESS() test, or else the function returns

199 * immediately. Check for U_FAILURE() on output or use with

200 * function chaining. (See User Guide for details.)

201 * @return TRUE if s is normalized

202 * @stable ICU 4.4

203 */

204 virtual UBool

205 isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;

206

207 /**

208 * Tests if the string is normalized.

209 * For the two COMPOSE modes, the result could be "maybe" in cases that

210 * would take a little more work to resolve definitively.

211 * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster

212 * combination of quick check + normalization, to avoid

213 * re-checking the "yes" prefix.

214 * @param s input string

215 * @param errorCode Standard ICU error code. Its input value must

216 * pass the U_SUCCESS() test, or else the function returns

217 * immediately. Check for U_FAILURE() on output or use with

218 * function chaining. (See User Guide for details.)

219 * @return UNormalizationCheckResult

220 * @stable ICU 4.4

221 */

222 virtual UNormalizationCheckResult

223 quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;

224

225 /**

226 * Returns the end of the normalized substring of the input string.

227 * In other words, with <code>end=spanQuickCheckYes(s, ec);</code>

228 * the substring <code>UnicodeString(s, 0, end)</code>

229 * will pass the quick check with a "yes" result.

230 *

231 * The returned end index is usually one or more characters before the

232 * "no" or "maybe" character: The end index is at a normalization boundary.

233 * (See the class documentation for more about normalization boundaries.)

234 *

235 * When the goal is a normalized string and most input strings are expected

236 * to be normalized already, then call this method,

237 * and if it returns a prefix shorter than the input string,

238 * copy that prefix and use normalizeSecondAndAppend() for the remainder.

239 * @param s input string

240 * @param errorCode Standard ICU error code. Its input value must

241 * pass the U_SUCCESS() test, or else the function returns

242 * immediately. Check for U_FAILURE() on output or use with

243 * function chaining. (See User Guide for details.)

244 * @return "yes" span end index

245 * @stable ICU 4.4

246 */

247 virtual int32_t

248 spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;

249

250 /**

251 * Tests if the character always has a normalization boundary before it,

252 * regardless of context.

253 * If true, then the character does not normalization-interact with

254 * preceding characters.

255 * In other words, a string containing this character can be normalized

256 * by processing portions before this character and starting from this

257 * character independently.

258 * This is used for iterative normalization. See the class documentation for details.

259 * @param c character to test

260 * @return TRUE if c has a normalization boundary before it

261 * @stable ICU 4.4

262 */

263 virtual UBool hasBoundaryBefore(UChar32 c) const = 0;

264

265 /**

266 * Tests if the character always has a normalization boundary after it,

267 * regardless of context.

268 * If true, then the character does not normalization-interact with

269 * following characters.

270 * In other words, a string containing this character can be normalized

271 * by processing portions up to this character and after this

272 * character independently.

273 * This is used for iterative normalization. See the class documentation for details.

274 * Note that this operation may be significantly slower than hasBoundaryBefore().

275 * @param c character to test

276 * @return TRUE if c has a normalization boundary after it

277 * @stable ICU 4.4

278 */

279 virtual UBool hasBoundaryAfter(UChar32 c) const = 0;

280

281 /**

282 * Tests if the character is normalization-inert.

283 * If true, then the character does not change, nor normalization-interact with

284 * preceding or following characters.

285 * In other words, a string containing this character can be normalized

286 * by processing portions before this character and after this

287 * character independently.

288 * This is used for iterative normalization. See the class documentation for details.

289 * Note that this operation may be significantly slower than hasBoundaryBefore().

290 * @param c character to test

291 * @return TRUE if c is normalization-inert

292 * @stable ICU 4.4

293 */

294 virtual UBool isInert(UChar32 c) const = 0;

295

296 private:

297 // No ICU "poor man's RTTI" for this class nor its subclasses.

298 virtual UClassID getDynamicClassID() const;

299 };

300

301 /**

302 * Normalization filtered by a UnicodeSet.

303 * Normalizes portions of the text contained in the filter set and leaves

304 * portions not contained in the filter set unchanged.

305 * Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE).

306 * Not-in-the-filter text is treated as "is normalized" and "quick check yes".

307 * This class implements all of (and only) the Normalizer2 API.

308 * An instance of this class is unmodifiable/immutable but is constructed and

309 * must be destructed by the owner.

310 * @stable ICU 4.4

311 */

312 class U_COMMON_API FilteredNormalizer2 : public Normalizer2 {

313 public:

314 /**

315 * Constructs a filtered normalizer wrapping any Normalizer2 instance

316 * and a filter set.

317 * Both are aliased and must not be modified or deleted while this object

318 * is used.

319 * The filter set should be frozen; otherwise the performance will suffer greatly.

320 * @param n2 wrapped Normalizer2 instance

321 * @param filterSet UnicodeSet which determines the characters to be normalized

322 * @stable ICU 4.4

323 */

324 FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) :

325 norm2(n2), set(filterSet) {}

326

327 /**

328 * Writes the normalized form of the source string to the destination string

329 * (replacing its contents) and returns the destination string.

330 * The source and destination strings must be different objects.

331 * @param src source string

332 * @param dest destination string; its contents are replaced with normalized src

333 * @param errorCode Standard ICU error code. Its input value must

334 * pass the U_SUCCESS() test, or else the function returns

335 * immediately. Check for U_FAILURE() on output or use with

336 * function chaining. (See User Guide for details.)

337 * @return dest

338 * @stable ICU 4.4

339 */

340 virtual UnicodeString &

341 normalize(const UnicodeString &src,

342 UnicodeString &dest,

343 UErrorCode &errorCode) const;

344 /**

345 * Appends the normalized form of the second string to the first string

346 * (merging them at the boundary) and returns the first string.

347 * The result is normalized if the first string was normalized.

348 * The first and second strings must be different objects.

349 * @param first string, should be normalized

350 * @param second string, will be normalized

351 * @param errorCode Standard ICU error code. Its input value must

352 * pass the U_SUCCESS() test, or else the function returns

353 * immediately. Check for U_FAILURE() on output or use with

354 * function chaining. (See User Guide for details.)

355 * @return first

356 * @stable ICU 4.4

357 */

358 virtual UnicodeString &

359 normalizeSecondAndAppend(UnicodeString &first,

360 const UnicodeString &second,

361 UErrorCode &errorCode) const;

362 /**

363 * Appends the second string to the first string

364 * (merging them at the boundary) and returns the first string.

365 * The result is normalized if both the strings were normalized.

366 * The first and second strings must be different objects.

367 * @param first string, should be normalized

368 * @param second string, should be normalized

369 * @param errorCode Standard ICU error code. Its input value must

370 * pass the U_SUCCESS() test, or else the function returns

371 * immediately. Check for U_FAILURE() on output or use with

372 * function chaining. (See User Guide for details.)

373 * @return first

374 * @stable ICU 4.4

375 */

376 virtual UnicodeString &

377 append(UnicodeString &first,

378 const UnicodeString &second,

379 UErrorCode &errorCode) const;

380

381 /**

382 * Gets the decomposition mapping of c. Equivalent to normalize(UnicodeString(c))

383 * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster.

384 * This function is independent of the mode of the Normalizer2.

385 * @param c code point

386 * @param decomposition String object which will be set to c's

387 * decomposition mapping, if there is one.

388 * @return TRUE if c has a decomposition, otherwise FALSE

389 * @draft ICU 4.6

390 */

391 virtual UBool

392 getDecomposition(UChar32 c, UnicodeString &decomposition) const;

393

394 /**

395 * Tests if the string is normalized.

396 * For details see the Normalizer2 base class documentation.

397 * @param s input string

398 * @param errorCode Standard ICU error code. Its input value must

399 * pass the U_SUCCESS() test, or else the function returns

400 * immediately. Check for U_FAILURE() on output or use with

401 * function chaining. (See User Guide for details.)

402 * @return TRUE if s is normalized

403 * @stable ICU 4.4

404 */

405 virtual UBool

406 isNormalized(const UnicodeString &s, UErrorCode &errorCode) const;

407 /**

408 * Tests if the string is normalized.

409 * For details see the Normalizer2 base class documentation.

410 * @param s input string

411 * @param errorCode Standard ICU error code. Its input value must

412 * pass the U_SUCCESS() test, or else the function returns

413 * immediately. Check for U_FAILURE() on output or use with

414 * function chaining. (See User Guide for details.)

415 * @return UNormalizationCheckResult

416 * @stable ICU 4.4

417 */

418 virtual UNormalizationCheckResult

419 quickCheck(const UnicodeString &s, UErrorCode &errorCode) const;

420 /**

421 * Returns the end of the normalized substring of the input string.

422 * For details see the Normalizer2 base class documentation.

423 * @param s input string

424 * @param errorCode Standard ICU error code. Its input value must

425 * pass the U_SUCCESS() test, or else the function returns

426 * immediately. Check for U_FAILURE() on output or use with

427 * function chaining. (See User Guide for details.)

428 * @return "yes" span end index

429 * @stable ICU 4.4

430 */

431 virtual int32_t

432 spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const;

433

434 /**

435 * Tests if the character always has a normalization boundary before it,

436 * regardless of context.

437 * For details see the Normalizer2 base class documentation.

438 * @param c character to test

439 * @return TRUE if c has a normalization boundary before it

440 * @stable ICU 4.4

441 */

442 virtual UBool hasBoundaryBefore(UChar32 c) const;

443

444 /**

445 * Tests if the character always has a normalization boundary after it,

446 * regardless of context.

447 * For details see the Normalizer2 base class documentation.

448 * @param c character to test

449 * @return TRUE if c has a normalization boundary after it

450 * @stable ICU 4.4

451 */

452 virtual UBool hasBoundaryAfter(UChar32 c) const;

453

454 /**

455 * Tests if the character is normalization-inert.

456 * For details see the Normalizer2 base class documentation.

457 * @param c character to test

458 * @return TRUE if c is normalization-inert

459 * @stable ICU 4.4

460 */

461 virtual UBool isInert(UChar32 c) const;

462 private:

463 UnicodeString &

464 normalize(const UnicodeString &src,

465 UnicodeString &dest,

466 USetSpanCondition spanCondition,

467 UErrorCode &errorCode) const;

468

469 UnicodeString &

470 normalizeSecondAndAppend(UnicodeString &first,

471 const UnicodeString &second,

472 UBool doNormalize,

473 UErrorCode &errorCode) const;

474

475 const Normalizer2 &norm2;

476 const UnicodeSet &set;

477 };

478

479 U_NAMESPACE_END

480

481 #endif // !UCONFIG_NO_NORMALIZATION

482 #endif // __NORMALIZER2_H__

OLD	NEW

« no previous file with comments | « public/common/unicode/locid.h ('k') | public/common/unicode/normlzr.h » ('j') | no next file with comments »