icu46/source/common/unicode/normalizer2.h - Issue 5516007: Check in the pristine copy of ICU 4.6...

Side by Side Diff: icu46/source/common/unicode/normalizer2.h

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /*

	2 *******************************************************************************

	3 *

	4 * Copyright (C) 2009-2010, International Business Machines

	5 * Corporation and others. All Rights Reserved.

	6 *

	7 *******************************************************************************

	8 * file name: normalizer2.h

	9 * encoding: US-ASCII

	10 * tab size: 8 (not used)

	11 * indentation:4

	12 *

	13 * created on: 2009nov22

	14 * created by: Markus W. Scherer

	15 */

	16

	17 #ifndef __NORMALIZER2_H__

	18 #define __NORMALIZER2_H__

	19

	20 /**

	21 * \file

	22 * \brief C++ API: New API for Unicode Normalization.

	23 */

	24

	25 #include "unicode/utypes.h"

	26

	27 #if !UCONFIG_NO_NORMALIZATION

	28

	29 #include "unicode/uniset.h"

	30 #include "unicode/unistr.h"

	31 #include "unicode/unorm2.h"

	32

	33 U_NAMESPACE_BEGIN

	34

	35 /**

	36 * Unicode normalization functionality for standard Unicode normalization or

	37 * for using custom mapping tables.

	38 * All instances of this class are unmodifiable/immutable.

	39 * Instances returned by getInstance() are singletons that must not be deleted by the caller.

	40 * The Normalizer2 class is not intended for public subclassing.

	41 *

	42 * The primary functions are to produce a normalized string and to detect whether

	43 * a string is already normalized.

	44 * The most commonly used normalization forms are those defined in

	45 * http://www.unicode.org/unicode/reports/tr15/

	46 * However, this API supports additional normalization forms for specialized purposes.

	47 * For example, NFKC_Casefold is provided via getInstance("nfkc_cf", COMPOSE)

	48 * and can be used in implementations of UTS #46.

	49 *

	50 * Not only are the standard compose and decompose modes supplied,

	51 * but additional modes are provided as documented in the Mode enum.

	52 *

	53 * Some of the functions in this class identify normalization boundaries.

	54 * At a normalization boundary, the portions of the string

	55 * before it and starting from it do not interact and can be handled independently.

	56 *

	57 * The spanQuickCheckYes() stops at a normalization boundary.

	58 * When the goal is a normalized string, then the text before the boundary

	59 * can be copied, and the remainder can be processed with normalizeSecondAndAppend().

	60 *

	61 * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether

	62 * a character is guaranteed to be at a normalization boundary,

	63 * regardless of context.

	64 * This is used for moving from one normalization boundary to the next

	65 * or preceding boundary, and for performing iterative normalization.

	66 *

	67 * Iterative normalization is useful when only a small portion of a

	68 * longer string needs to be processed.

	69 * For example, in ICU, iterative normalization is used by the NormalizationTransliterator

	70 * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()

	71 * (to process only the substring for which sort key bytes are computed).

	72 *

	73 * The set of normalization boundaries returned by these functions may not be

	74 * complete: There may be more boundaries that could be returned.

	75 * Different functions may return different boundaries.

	76 * @stable ICU 4.4

	77 */

	78 class U_COMMON_API Normalizer2 : public UObject {

	79 public:

	80 /**

	81 * Returns a Normalizer2 instance which uses the specified data file

	82 * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)

	83 * and which composes or decomposes text according to the specified mode.

	84 * Returns an unmodifiable singleton instance. Do not delete it.

	85 *

	86 * Use packageName=NULL for data files that are part of ICU's own data.

	87 * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.

	88 * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.

	89 * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.

	90 *

	91 * @param packageName NULL for ICU built-in data, otherwise application data package name

	92 * @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file

	93 * @param mode normalization mode (compose or decompose etc.)

	94 * @param errorCode Standard ICU error code. Its input value must

	95 * pass the U_SUCCESS() test, or else the function returns

	96 * immediately. Check for U_FAILURE() on output or use with

	97 * function chaining. (See User Guide for details.)

	98 * @return the requested Normalizer2, if successful

	99 * @stable ICU 4.4

	100 */

	101 static const Normalizer2 *

	102 getInstance(const char *packageName,

	103 const char *name,

	104 UNormalization2Mode mode,

	105 UErrorCode &errorCode);

	106

	107 /**

	108 * Returns the normalized form of the source string.

	109 * @param src source string

	110 * @param errorCode Standard ICU error code. Its input value must

	111 * pass the U_SUCCESS() test, or else the function returns

	112 * immediately. Check for U_FAILURE() on output or use with

	113 * function chaining. (See User Guide for details.)

	114 * @return normalized src

	115 * @stable ICU 4.4

	116 */

	117 UnicodeString

	118 normalize(const UnicodeString &src, UErrorCode &errorCode) const {

	119 UnicodeString result;

	120 normalize(src, result, errorCode); // delegates to the pure-virtual overload that writes into a caller-supplied destination

	121 return result;

	122 }

	123 /**

	124 * Writes the normalized form of the source string to the destination string

	125 * (replacing its contents) and returns the destination string.

	126 * The source and destination strings must be different objects.

	127 * @param src source string

	128 * @param dest destination string; its contents is replaced with normalized src

	129 * @param errorCode Standard ICU error code. Its input value must

	130 * pass the U_SUCCESS() test, or else the function returns

	131 * immediately. Check for U_FAILURE() on output or use with

	132 * function chaining. (See User Guide for details.)

	133 * @return dest

	134 * @stable ICU 4.4

	135 */

	136 virtual UnicodeString &

	137 normalize(const UnicodeString &src,

	138 UnicodeString &dest,

	139 UErrorCode &errorCode) const = 0;

	140 /**

	141 * Appends the normalized form of the second string to the first string

	142 * (merging them at the boundary) and returns the first string.

	143 * The result is normalized if the first string was normalized.

	144 * The first and second strings must be different objects.

	145 * @param first string, should be normalized

	146 * @param second string, will be normalized

	147 * @param errorCode Standard ICU error code. Its input value must

	148 * pass the U_SUCCESS() test, or else the function returns

	149 * immediately. Check for U_FAILURE() on output or use with

	150 * function chaining. (See User Guide for details.)

	151 * @return first

	152 * @stable ICU 4.4

	153 */

	154 virtual UnicodeString &

	155 normalizeSecondAndAppend(UnicodeString &first,

	156 const UnicodeString &second,

	157 UErrorCode &errorCode) const = 0;

	158 /**

	159 * Appends the second string to the first string

	160 * (merging them at the boundary) and returns the first string.

	161 * The result is normalized if both the strings were normalized.

	162 * The first and second strings must be different objects.

	163 * @param first string, should be normalized

	164 * @param second string, should be normalized

	165 * @param errorCode Standard ICU error code. Its input value must

	166 * pass the U_SUCCESS() test, or else the function returns

	167 * immediately. Check for U_FAILURE() on output or use with

	168 * function chaining. (See User Guide for details.)

	169 * @return first

	170 * @stable ICU 4.4

	171 */

	172 virtual UnicodeString &

	173 append(UnicodeString &first,

	174 const UnicodeString &second,

	175 UErrorCode &errorCode) const = 0;

	176

	177 /**

	178 * Gets the decomposition mapping of c. Equivalent to normalize(UnicodeString(c))

	179 * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster.

	180 * This function is independent of the mode of the Normalizer2.

	181 * @param c code point

	182 * @param decomposition String object which will be set to c's

	183 * decomposition mapping, if there is one.

	184 * @return TRUE if c has a decomposition, otherwise FALSE

	185 * @draft ICU 4.6

	186 */

	187 virtual UBool

	188 getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0;

	189

	190 /**

	191 * Tests if the string is normalized.

	192 * Internally, in cases where the quickCheck() method would return "maybe"

	193 * (which is only possible for the two COMPOSE modes) this method

	194 * resolves to "yes" or "no" to provide a definitive result,

	195 * at the cost of doing more work in those cases.

	196 * @param s input string

	197 * @param errorCode Standard ICU error code. Its input value must

	198 * pass the U_SUCCESS() test, or else the function returns

	199 * immediately. Check for U_FAILURE() on output or use with

	200 * function chaining. (See User Guide for details.)

	201 * @return TRUE if s is normalized

	202 * @stable ICU 4.4

	203 */

	204 virtual UBool

	205 isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;

	206

	207 /**

	208 * Tests if the string is normalized.

	209 * For the two COMPOSE modes, the result could be "maybe" in cases that

	210 * would take a little more work to resolve definitively.

	211 * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster

	212 * combination of quick check + normalization, to avoid

	213 * re-checking the "yes" prefix.

	214 * @param s input string

	215 * @param errorCode Standard ICU error code. Its input value must

	216 * pass the U_SUCCESS() test, or else the function returns

	217 * immediately. Check for U_FAILURE() on output or use with

	218 * function chaining. (See User Guide for details.)

	219 * @return UNormalizationCheckResult

	220 * @stable ICU 4.4

	221 */

	222 virtual UNormalizationCheckResult

	223 quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;

	224

	225 /**

	226 * Returns the end of the normalized substring of the input string.

	227 * In other words, with <code>end=spanQuickCheckYes(s, ec);</code>

	228 * the substring <code>UnicodeString(s, 0, end)</code>

	229 * will pass the quick check with a "yes" result.

	230 *

	231 * The returned end index is usually one or more characters before the

	232 * "no" or "maybe" character: The end index is at a normalization boundary.

	233 * (See the class documentation for more about normalization boundaries.)

	234 *

	235 * When the goal is a normalized string and most input strings are expected

	236 * to be normalized already, then call this method,

	237 * and if it returns a prefix shorter than the input string,

	238 * copy that prefix and use normalizeSecondAndAppend() for the remainder.

	239 * @param s input string

	240 * @param errorCode Standard ICU error code. Its input value must

	241 * pass the U_SUCCESS() test, or else the function returns

	242 * immediately. Check for U_FAILURE() on output or use with

	243 * function chaining. (See User Guide for details.)

	244 * @return "yes" span end index

	245 * @stable ICU 4.4

	246 */

	247 virtual int32_t

	248 spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;

	249

	250 /**

	251 * Tests if the character always has a normalization boundary before it,

	252 * regardless of context.

	253 * If true, then the character does not normalization-interact with

	254 * preceding characters.

	255 * In other words, a string containing this character can be normalized

	256 * by processing portions before this character and starting from this

	257 * character independently.

	258 * This is used for iterative normalization. See the class documentation for details.

	259 * @param c character to test

	260 * @return TRUE if c has a normalization boundary before it

	261 * @stable ICU 4.4

	262 */

	263 virtual UBool hasBoundaryBefore(UChar32 c) const = 0;

	264

	265 /**

	266 * Tests if the character always has a normalization boundary after it,

	267 * regardless of context.

	268 * If true, then the character does not normalization-interact with

	269 * following characters.

	270 * In other words, a string containing this character can be normalized

	271 * by processing portions up to this character and after this

	272 * character independently.

	273 * This is used for iterative normalization. See the class documentation for details.

	274 * Note that this operation may be significantly slower than hasBoundaryBefore().

	275 * @param c character to test

	276 * @return TRUE if c has a normalization boundary after it

	277 * @stable ICU 4.4

	278 */

	279 virtual UBool hasBoundaryAfter(UChar32 c) const = 0;

	280

	281 /**

	282 * Tests if the character is normalization-inert.

	283 * If true, then the character does not change, nor normalization-interact with

	284 * preceding or following characters.

	285 * In other words, a string containing this character can be normalized

	286 * by processing portions before this character and after this

	287 * character independently.

	288 * This is used for iterative normalization. See the class documentation for details.

	289 * Note that this operation may be significantly slower than hasBoundaryBefore().

	290 * @param c character to test

	291 * @return TRUE if c is normalization-inert

	292 * @stable ICU 4.4

	293 */

	294 virtual UBool isInert(UChar32 c) const = 0;

	295

	296 private:

	297 // No ICU "poor man's RTTI" for this class nor its subclasses.

	298 virtual UClassID getDynamicClassID() const;

	299 };

	300

	301 /**

	302 * Normalization filtered by a UnicodeSet.

	303 * Normalizes portions of the text contained in the filter set and leaves

	304 * portions not contained in the filter set unchanged.

	305 * Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE).

	306 * Not-in-the-filter text is treated as "is normalized" and "quick check yes".

	307 * This class implements all of (and only) the Normalizer2 API.

	308 * An instance of this class is unmodifiable/immutable but is constructed and

	309 * must be destructed by the owner.

	310 * @stable ICU 4.4

	311 */

	312 class U_COMMON_API FilteredNormalizer2 : public Normalizer2 {

	313 public:

	314 /**

	315 * Constructs a filtered normalizer wrapping any Normalizer2 instance

	316 * and a filter set.

	317 * Both are aliased and must not be modified or deleted while this object

	318 * is used.

	319 * The filter set should be frozen; otherwise the performance will suffer greatly.

	320 * @param n2 wrapped Normalizer2 instance

	321 * @param filterSet UnicodeSet which determines the characters to be normalized

	322 * @stable ICU 4.4

	323 */

	324 FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) :

	325 norm2(n2), set(filterSet) {}

	326

	327 /**

	328 * Writes the normalized form of the source string to the destination string

	329 * (replacing its contents) and returns the destination string.

	330 * The source and destination strings must be different objects.

	331 * @param src source string

	332 * @param dest destination string; its contents is replaced with normalized src

	333 * @param errorCode Standard ICU error code. Its input value must

	334 * pass the U_SUCCESS() test, or else the function returns

	335 * immediately. Check for U_FAILURE() on output or use with

	336 * function chaining. (See User Guide for details.)

	337 * @return dest

	338 * @stable ICU 4.4

	339 */

	340 virtual UnicodeString &

	341 normalize(const UnicodeString &src,

	342 UnicodeString &dest,

	343 UErrorCode &errorCode) const;

	344 /**

	345 * Appends the normalized form of the second string to the first string

	346 * (merging them at the boundary) and returns the first string.

	347 * The result is normalized if the first string was normalized.

	348 * The first and second strings must be different objects.

	349 * @param first string, should be normalized

	350 * @param second string, will be normalized

	351 * @param errorCode Standard ICU error code. Its input value must

	352 * pass the U_SUCCESS() test, or else the function returns

	353 * immediately. Check for U_FAILURE() on output or use with

	354 * function chaining. (See User Guide for details.)

	355 * @return first

	356 * @stable ICU 4.4

	357 */

	358 virtual UnicodeString &

	359 normalizeSecondAndAppend(UnicodeString &first,

	360 const UnicodeString &second,

	361 UErrorCode &errorCode) const;

	362 /**

	363 * Appends the second string to the first string

	364 * (merging them at the boundary) and returns the first string.

	365 * The result is normalized if both the strings were normalized.

	366 * The first and second strings must be different objects.

	367 * @param first string, should be normalized

	368 * @param second string, should be normalized

	369 * @param errorCode Standard ICU error code. Its input value must

	370 * pass the U_SUCCESS() test, or else the function returns

	371 * immediately. Check for U_FAILURE() on output or use with

	372 * function chaining. (See User Guide for details.)

	373 * @return first

	374 * @stable ICU 4.4

	375 */

	376 virtual UnicodeString &

	377 append(UnicodeString &first,

	378 const UnicodeString &second,

	379 UErrorCode &errorCode) const;

	380

	381 /**

	382 * Gets the decomposition mapping of c. Equivalent to normalize(UnicodeString(c))

	383 * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster.

	384 * This function is independent of the mode of the Normalizer2.

	385 * @param c code point

	386 * @param decomposition String object which will be set to c's

	387 * decomposition mapping, if there is one.

	388 * @return TRUE if c has a decomposition, otherwise FALSE

	389 * @draft ICU 4.6

	390 */

	391 virtual UBool

	392 getDecomposition(UChar32 c, UnicodeString &decomposition) const;

	393

	394 /**

	395 * Tests if the string is normalized.

	396 * For details see the Normalizer2 base class documentation.

	397 * @param s input string

	398 * @param errorCode Standard ICU error code. Its input value must

	399 * pass the U_SUCCESS() test, or else the function returns

	400 * immediately. Check for U_FAILURE() on output or use with

	401 * function chaining. (See User Guide for details.)

	402 * @return TRUE if s is normalized

	403 * @stable ICU 4.4

	404 */

	405 virtual UBool

	406 isNormalized(const UnicodeString &s, UErrorCode &errorCode) const;

	407 /**

	408 * Tests if the string is normalized.

	409 * For details see the Normalizer2 base class documentation.

	410 * @param s input string

	411 * @param errorCode Standard ICU error code. Its input value must

	412 * pass the U_SUCCESS() test, or else the function returns

	413 * immediately. Check for U_FAILURE() on output or use with

	414 * function chaining. (See User Guide for details.)

	415 * @return UNormalizationCheckResult

	416 * @stable ICU 4.4

	417 */

	418 virtual UNormalizationCheckResult

	419 quickCheck(const UnicodeString &s, UErrorCode &errorCode) const;

	420 /**

	421 * Returns the end of the normalized substring of the input string.

	422 * For details see the Normalizer2 base class documentation.

	423 * @param s input string

	424 * @param errorCode Standard ICU error code. Its input value must

	425 * pass the U_SUCCESS() test, or else the function returns

	426 * immediately. Check for U_FAILURE() on output or use with

	427 * function chaining. (See User Guide for details.)

	428 * @return "yes" span end index

	429 * @stable ICU 4.4

	430 */

	431 virtual int32_t

	432 spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const;

	433

	434 /**

	435 * Tests if the character always has a normalization boundary before it,

	436 * regardless of context.

	437 * For details see the Normalizer2 base class documentation.

	438 * @param c character to test

	439 * @return TRUE if c has a normalization boundary before it

	440 * @stable ICU 4.4

	441 */

	442 virtual UBool hasBoundaryBefore(UChar32 c) const;

	443

	444 /**

	445 * Tests if the character always has a normalization boundary after it,

	446 * regardless of context.

	447 * For details see the Normalizer2 base class documentation.

	448 * @param c character to test

	449 * @return TRUE if c has a normalization boundary after it

	450 * @stable ICU 4.4

	451 */

	452 virtual UBool hasBoundaryAfter(UChar32 c) const;

	453

	454 /**

	455 * Tests if the character is normalization-inert.

	456 * For details see the Normalizer2 base class documentation.

	457 * @param c character to test

	458 * @return TRUE if c is normalization-inert

	459 * @stable ICU 4.4

	460 */

	461 virtual UBool isInert(UChar32 c) const;

	462 private: // non-virtual helper overloads and aliased state; not part of the public API

	463 UnicodeString &

	464 normalize(const UnicodeString &src,

	465 UnicodeString &dest,

	466 USetSpanCondition spanCondition, // extra parameter relative to the public normalize() overload

	467 UErrorCode &errorCode) const;

	468

	469 UnicodeString &

	470 normalizeSecondAndAppend(UnicodeString &first,

	471 const UnicodeString &second,

	472 UBool doNormalize, // NOTE(review): presumably selects normalizeSecondAndAppend() vs. append() behavior -- confirm in the implementation

	473 UErrorCode &errorCode) const;

	474

	475 const Normalizer2 &norm2; // wrapped normalizer passed to the constructor; aliased, not owned

	476 const UnicodeSet &set; // filter set passed to the constructor; aliased, not owned

	477 };

	478

	479 U_NAMESPACE_END

	480

	481 #endif // !UCONFIG_NO_NORMALIZATION

	482 #endif // __NORMALIZER2_H__

OLD	NEW

« no previous file with comments | « icu46/source/common/unicode/locid.h ('k') | icu46/source/common/unicode/normlzr.h » ('j') | no next file with comments »