icu46/source/common/unicode/ubrk.h - Issue 5516007: Check in the pristine copy of ICU 4.6...

Side by Side Diff: icu46/source/common/unicode/ubrk.h

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /*

	2 ******************************************************************************

	3 * Copyright (C) 1996-2010, International Business Machines Corporation and others.

	4 * All Rights Reserved.

	5 ******************************************************************************

	6 */

	7

	8 #ifndef UBRK_H

	9 #define UBRK_H

	10

	11 #include "unicode/utypes.h"

	12 #include "unicode/uloc.h"

	13 #include "unicode/utext.h"

	14 #include "unicode/localpointer.h"

	15

	16 /**

	17 * A text-break iterator.

	18 * For usage in C programs.

	19 */

	20 #ifndef UBRK_TYPEDEF_UBREAK_ITERATOR

	21 # define UBRK_TYPEDEF_UBREAK_ITERATOR

	22 /**

	23 * Opaque type representing an ICU Break iterator object.

	24 * @stable ICU 2.0

	25 */

	26 typedef struct UBreakIterator UBreakIterator;

	27 #endif

	28

	29 #if !UCONFIG_NO_BREAK_ITERATION

	30

	31 #include "unicode/parseerr.h"

	32

	33 /**

	34 * \file

	35 * \brief C API: BreakIterator

	36 *

	37 * <h2> BreakIterator C API </h2>

	38 *

	39 * The BreakIterator C API defines methods for finding the location

	40 * of boundaries in text. A pointer to a UBreakIterator maintains a

	41 * current position and scans over text, returning the index of characters

	42 * where boundaries occur.

	43 * <p>

	44 * Line boundary analysis determines where a text string can be broken

	45 * when line-wrapping. The mechanism correctly handles punctuation and

	46 * hyphenated words.

	47 * <p>

	48 * Sentence boundary analysis allows selection with correct

	49 * interpretation of periods within numbers and abbreviations, and

	50 * trailing punctuation marks such as quotation marks and parentheses.

	51 * <p>

	52 * Word boundary analysis is used by search and replace functions, as

	53 * well as within text editing applications that allow the user to

	54 * select words with a double click. Word selection provides correct

	55 * interpretation of punctuation marks within and following

	56 * words. Characters that are not part of a word, such as symbols or

	57 * punctuation marks, have word-breaks on both sides.

	58 * <p>

	59 * Character boundary analysis identifies the boundaries of

	60 * "Extended Grapheme Clusters", which are groupings of codepoints

	61 * that should be treated as character-like units for many text operations.

	62 * Please see Unicode Standard Annex #29, Unicode Text Segmentation,

	63 * http://www.unicode.org/reports/tr29/ for additional information

	64 * on grapheme clusters and guidelines on their use.

	65 * <p>

	66 * Title boundary analysis locates all positions,

	67 * typically starts of words, that should be set to Title Case

	68 * when title casing the text.

	69 * <p>

	70 * The text boundary positions are found according to the rules

	71 * described in Unicode Standard Annex #29, Text Boundaries, and

	72 * Unicode Standard Annex #14, Line Breaking Properties. These

	73 * are available at http://www.unicode.org/reports/tr14/ and

	74 * http://www.unicode.org/reports/tr29/.

	75 * <p>

	76 * In addition to the plain C API defined in this header file, an

	77 * object oriented C++ API with equivalent functionality is defined in the

	78 * file brkiter.h.

	79 * <p>

	80 * Code snippets illustrating the use of the Break Iterator APIs

	81 * are available in the ICU User Guide,

	82 * http://icu-project.org/userguide/boundaryAnalysis.html

	83 * and in the sample program icu/source/samples/break/break.cpp

	84 */

	85

	86 /** The possible types of text boundaries. @stable ICU 2.0 */

	87 typedef enum UBreakIteratorType {

	88 /** Character breaks @stable ICU 2.0 */

	89 UBRK_CHARACTER = 0,

	90 /** Word breaks @stable ICU 2.0 */

	91 UBRK_WORD = 1,

	92 /** Line breaks @stable ICU 2.0 */

	93 UBRK_LINE = 2,

	94 /** Sentence breaks @stable ICU 2.0 */

	95 UBRK_SENTENCE = 3,

	96

	97 #ifndef U_HIDE_DEPRECATED_API

	98 /**

	99 * Title Case breaks

	100 * The iterator created using this type locates title boundaries as described for

	101 * Unicode 3.2 only. For Unicode 4.0 and above title boundary iteration,

	102 * please use Word Boundary iterator.

	103 *

	104 * @deprecated ICU 2.8 Use the word break iterator for titlecasing for Unicode 4 and later.

	105 */

	106 UBRK_TITLE = 4,

	107 #endif /* U_HIDE_DEPRECATED_API */

	108 UBRK_COUNT = 5

	109 } UBreakIteratorType;

	110

	111 /** Value indicating all text boundaries have been returned.

	112 * @stable ICU 2.0

	113 */

	114 #define UBRK_DONE ((int32_t) -1)

	115

	116

	117 /**

	118 * Enum constants for the word break tags returned by

	119 * getRuleStatus(). A range of values is defined for each category of

	120 * word, to allow for further subdivisions of a category in future releases.

	121 * Applications should check for tag values falling within the range, rather

	122 * than for single individual values.

	123 * @stable ICU 2.2

	124 */

	125 typedef enum UWordBreak {

	126 /** Tag value for "words" that do not fit into any of other categories.

	127 * Includes spaces and most punctuation. */

	128 UBRK_WORD_NONE = 0,

	129 /** Upper bound for tags for uncategorized words. */

	130 UBRK_WORD_NONE_LIMIT = 100,

	131 /** Tag value for words that appear to be numbers, lower limit. */

	132 UBRK_WORD_NUMBER = 100,

	133 /** Tag value for words that appear to be numbers, upper limit. */

	134 UBRK_WORD_NUMBER_LIMIT = 200,

	135 /** Tag value for words that contain letters, excluding

	136 * hiragana, katakana or ideographic characters, lower limit. */

	137 UBRK_WORD_LETTER = 200,

	138 /** Tag value for words containing letters, upper limit */

	139 UBRK_WORD_LETTER_LIMIT = 300,

	140 /** Tag value for words containing kana characters, lower limit */

	141 UBRK_WORD_KANA = 300,

	142 /** Tag value for words containing kana characters, upper limit */

	143 UBRK_WORD_KANA_LIMIT = 400,

	144 /** Tag value for words containing ideographic characters, lower limit */

	145 UBRK_WORD_IDEO = 400,

	146 /** Tag value for words containing ideographic characters, upper limit */

	147 UBRK_WORD_IDEO_LIMIT = 500

	148 } UWordBreak;

	149

	150 /**

	151 * Enum constants for the line break tags returned by getRuleStatus().

	152 * A range of values is defined for each category of

	153 * word, to allow for further subdivisions of a category in future releases.

	154 * Applications should check for tag values falling within the range, rather

	155 * than for single individual values.

	156 * @stable ICU 2.8

	157 */

	158 typedef enum ULineBreakTag {

	159 /** Tag value for soft line breaks, positions at which a line break

	160 * is acceptable but not required */

	161 UBRK_LINE_SOFT = 0,

	162 /** Upper bound for soft line breaks. */

	163 UBRK_LINE_SOFT_LIMIT = 100,

	164 /** Tag value for a hard, or mandatory line break */

	165 UBRK_LINE_HARD = 100,

	166 /** Upper bound for hard line breaks. */

	167 UBRK_LINE_HARD_LIMIT = 200

	168 } ULineBreakTag;

	169

	170

	171

	172 /**

	173 * Enum constants for the sentence break tags returned by getRuleStatus().

	174 * A range of values is defined for each category of

	175 * sentence, to allow for further subdivisions of a category in future releases.

	176 * Applications should check for tag values falling within the range, rather

	177 * than for single individual values.

	178 * @stable ICU 2.8

	179 */

	180 typedef enum USentenceBreakTag {

	181 /** Tag value for sentences ending with a sentence terminator

	182 * ('.', '?', '!', etc.) character, possibly followed by a

	183 * hard separator (CR, LF, PS, etc.)

	184 */

	185 UBRK_SENTENCE_TERM = 0,

	186 /** Upper bound for tags for sentences ended by sentence terminators. */

	187 UBRK_SENTENCE_TERM_LIMIT = 100,

	188 /** Tag value for sentences that do not contain an ending

	189 * sentence terminator ('.', '?', '!', etc.) character, but

	190 * are ended only by a hard separator (CR, LF, PS, etc.) or end of input.

	191 */

	192 UBRK_SENTENCE_SEP = 100,

	193 /** Upper bound for tags for sentences ended by a separator. */

	194 UBRK_SENTENCE_SEP_LIMIT = 200

	195 /** Tag value for a hard, or mandatory line break */

	196 } USentenceBreakTag;

	197

	198

	199 /**

	200 * Open a new UBreakIterator for locating text boundaries for a specified locale.

	201 * A UBreakIterator may be used for detecting character, line, word,

	202 * and sentence breaks in text.

	203 * @param type The type of UBreakIterator to open: one of UBRK_CHARACTER, UBRK_WORD,

	204 * UBRK_LINE, UBRK_SENTENCE

	205 * @param locale The locale specifying the text-breaking conventions.

	206 * @param text The text to be iterated over.

	207 * @param textLength The number of characters in text, or -1 if null-terminated.

	208 * @param status A UErrorCode to receive any errors.

	209 * @return A UBreakIterator for the specified locale.

	210 * @see ubrk_openRules

	211 * @stable ICU 2.0

	212 */

	213 U_STABLE UBreakIterator* U_EXPORT2

	214 ubrk_open(UBreakIteratorType type,

	215 const char *locale,

	216 const UChar *text,

	217 int32_t textLength,

	218 UErrorCode *status);

	219

	220 /**

	221 * Open a new UBreakIterator for locating text boundaries using specified breaking rules.

	222 * The rule syntax is ... (TBD)

	223 * @param rules A set of rules specifying the text breaking conventions.

	224 * @param rulesLength The number of characters in rules, or -1 if null-terminated.

	225 * @param text The text to be iterated over. May be null, in which case ubrk_setText() is

	226 * used to specify the text to be iterated.

	227 * @param textLength The number of characters in text, or -1 if null-terminated.

	228 * @param parseErr Receives position and context information for any syntax errors

	229 * detected while parsing the rules.

	230 * @param status A UErrorCode to receive any errors.

	231 * @return A UBreakIterator for the specified rules.

	232 * @see ubrk_open

	233 * @stable ICU 2.2

	234 */

	235 U_STABLE UBreakIterator* U_EXPORT2

	236 ubrk_openRules(const UChar *rules,

	237 int32_t rulesLength,

	238 const UChar *text,

	239 int32_t textLength,

	240 UParseError *parseErr,

	241 UErrorCode *status);

	242

	243 /**

	244 * Thread safe cloning operation

	245 * @param bi iterator to be cloned

	246 * @param stackBuffer user allocated space for the new clone. If NULL new memory will be allocated.

	247 * If buffer is not large enough, new memory will be allocated.

	248 * Clients can use the U_BRK_SAFECLONE_BUFFERSIZE. This will probably be enough to avoid memory allocations.

	249 * @param pBufferSize pointer to size of allocated space.

	250 * If *pBufferSize == 0, a sufficient size for use in cloning will

	251 * be returned ('pre-flighting')

	252 * If *pBufferSize is not enough for a stack-based safe clone,

	253 * new memory will be allocated.

	254 * @param status to indicate whether the operation went on smoothly or there were errors

	255 * An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were necessary.

	256 * @return pointer to the new clone

	257 * @stable ICU 2.0

	258 */

	259 U_STABLE UBreakIterator * U_EXPORT2

	260 ubrk_safeClone(

	261 const UBreakIterator *bi,

	262 void *stackBuffer,

	263 int32_t *pBufferSize,

	264 UErrorCode *status);

	265

	266 /**

	267 * A recommended size (in bytes) for the memory buffer to be passed to ubrk_safeClone().

	268 * @stable ICU 2.0

	269 */

	270 #define U_BRK_SAFECLONE_BUFFERSIZE 512

	271

	272 /**

	273 * Close a UBreakIterator.

	274 * Once closed, a UBreakIterator may no longer be used.

	275 * @param bi The break iterator to close.

	276 * @stable ICU 2.0

	277 */

	278 U_STABLE void U_EXPORT2

	279 ubrk_close(UBreakIterator *bi);

	280

	281 #if U_SHOW_CPLUSPLUS_API

	282

	283 U_NAMESPACE_BEGIN

	284

	285 /**

	286 * \class LocalUBreakIteratorPointer

	287 * "Smart pointer" class, closes a UBreakIterator via ubrk_close().

	288 * For most methods see the LocalPointerBase base class.

	289 *

	290 * @see LocalPointerBase

	291 * @see LocalPointer

	292 * @stable ICU 4.4

	293 */

	294 U_DEFINE_LOCAL_OPEN_POINTER(LocalUBreakIteratorPointer, UBreakIterator, ubrk_close);

	295

	296 U_NAMESPACE_END

	297

	298 #endif

	299

	300 /**

	301 * Sets an existing iterator to point to a new piece of text

	302 * @param bi The iterator to use

	303 * @param text The text to be set

	304 * @param textLength The length of the text

	305 * @param status The error code

	306 * @stable ICU 2.0

	307 */

	308 U_STABLE void U_EXPORT2

	309 ubrk_setText(UBreakIterator* bi,

	310 const UChar* text,

	311 int32_t textLength,

	312 UErrorCode* status);

	313

	314

	315 /**

	316 * Sets an existing iterator to point to a new piece of text

	317 * @param bi The iterator to use

	318 * @param text The text to be set.

	319 * This function makes a shallow clone of the supplied UText. This means

	320 * that the caller is free to immediately close or otherwise reuse the

	321 * UText that was passed as a parameter, but that the underlying text itself

	322 * must not be altered while being referenced by the break iterator.

	323 * @param status The error code

	324 * @stable ICU 3.4

	325 */

	326 U_STABLE void U_EXPORT2

	327 ubrk_setUText(UBreakIterator* bi,

	328 UText* text,

	329 UErrorCode* status);

	330

	331

	332

	333 /**

	334 * Determine the most recently-returned text boundary.

	335 *

	336 * @param bi The break iterator to use.

	337 * @return The character index most recently returned by \ref ubrk_next, \ref ubrk_previous,

	338 * \ref ubrk_first, or \ref ubrk_last.

	339 * @stable ICU 2.0

	340 */

	341 U_STABLE int32_t U_EXPORT2

	342 ubrk_current(const UBreakIterator *bi);

	343

	344 /**

	345 * Determine the text boundary following the current text boundary.

	346 *

	347 * @param bi The break iterator to use.

	348 * @return The character index of the next text boundary, or UBRK_DONE

	349 * if all text boundaries have been returned.

	350 * @see ubrk_previous

	351 * @stable ICU 2.0

	352 */

	353 U_STABLE int32_t U_EXPORT2

	354 ubrk_next(UBreakIterator *bi);

	355

	356 /**

	357 * Determine the text boundary preceding the current text boundary.

	358 *

	359 * @param bi The break iterator to use.

	360 * @return The character index of the preceding text boundary, or UBRK_DONE

	361 * if all text boundaries have been returned.

	362 * @see ubrk_next

	363 * @stable ICU 2.0

	364 */

	365 U_STABLE int32_t U_EXPORT2

	366 ubrk_previous(UBreakIterator *bi);

	367

	368 /**

	369 * Determine the index of the first character in the text being scanned.

	370 * This is not always the same as index 0 of the text.

	371 * @param bi The break iterator to use.

	372 * @return The character index of the first character in the text being scanned.

	373 * @see ubrk_last

	374 * @stable ICU 2.0

	375 */

	376 U_STABLE int32_t U_EXPORT2

	377 ubrk_first(UBreakIterator *bi);

	378

	379 /**

	380 * Determine the index immediately <EM>beyond</EM> the last character in the text being

	381 * scanned.

	382 * This is not the same as the last character.

	383 * @param bi The break iterator to use.

	384 * @return The character offset immediately <EM>beyond</EM> the last character in the

	385 * text being scanned.

	386 * @see ubrk_first

	387 * @stable ICU 2.0

	388 */

	389 U_STABLE int32_t U_EXPORT2

	390 ubrk_last(UBreakIterator *bi);

	391

	392 /**

	393 * Determine the text boundary preceding the specified offset.

	394 * The value returned is always smaller than offset, or UBRK_DONE.

	395 * @param bi The break iterator to use.

	396 * @param offset The offset to begin scanning.

	397 * @return The text boundary preceding offset, or UBRK_DONE.

	398 * @see ubrk_following

	399 * @stable ICU 2.0

	400 */

	401 U_STABLE int32_t U_EXPORT2

	402 ubrk_preceding(UBreakIterator *bi,

	403 int32_t offset);

	404

	405 /**

	406 * Determine the text boundary following the specified offset.

	407 * The value returned is always greater than offset, or UBRK_DONE.

	408 * @param bi The break iterator to use.

	409 * @param offset The offset to begin scanning.

	410 * @return The text boundary following offset, or UBRK_DONE.

	411 * @see ubrk_preceding

	412 * @stable ICU 2.0

	413 */

	414 U_STABLE int32_t U_EXPORT2

	415 ubrk_following(UBreakIterator *bi,

	416 int32_t offset);

	417

	418 /**

	419 * Get a locale for which text breaking information is available.

	420 * A UBreakIterator in a locale returned by this function will perform the correct

	421 * text breaking for the locale.

	422 * @param index The index of the desired locale.

	423 * @return A locale for which text breaking information is available, or 0 if none.

	424 * @see ubrk_countAvailable

	425 * @stable ICU 2.0

	426 */

	427 U_STABLE const char* U_EXPORT2

	428 ubrk_getAvailable(int32_t index);

	429

	430 /**

	431 * Determine how many locales have text breaking information available.

	432 * This function is most useful as determining the loop ending condition for

	433 * calls to \ref ubrk_getAvailable.

	434 * @return The number of locales for which text breaking information is available.

	435 * @see ubrk_getAvailable

	436 * @stable ICU 2.0

	437 */

	438 U_STABLE int32_t U_EXPORT2

	439 ubrk_countAvailable(void);

	440

	441

	442 /**

	443 * Returns true if the specified position is a boundary position. As a side

	444 * effect, leaves the iterator pointing to the first boundary position at

	445 * or after "offset".

	446 * @param bi The break iterator to use.

	447 * @param offset the offset to check.

	448 * @return True if "offset" is a boundary position.

	449 * @stable ICU 2.0

	450 */

	451 U_STABLE UBool U_EXPORT2

	452 ubrk_isBoundary(UBreakIterator *bi, int32_t offset);

	453

	454 /**

	455 * Return the status from the break rule that determined the most recently

	456 * returned break position. The values appear in the rule source

	457 * within brackets, {123}, for example. For rules that do not specify a

	458 * status, a default value of 0 is returned.

	459 * <p>

	460 * For word break iterators, the possible values are defined in enum UWordBreak.

	461 * @stable ICU 2.2

	462 */

	463 U_STABLE int32_t U_EXPORT2

	464 ubrk_getRuleStatus(UBreakIterator *bi);

	465

	466 /**

	467 * Get the statuses from the break rules that determined the most recently

	468 * returned break position. The values appear in the rule source

	469 * within brackets, {123}, for example. The default status value for rules

	470 * that do not explicitly provide one is zero.

	471 * <p>

	472 * For word break iterators, the possible values are defined in enum UWordBreak.

	473 * @param bi The break iterator to use

	474 * @param fillInVec an array to be filled in with the status values.

	475 * @param capacity the length of the supplied vector. A length of zero causes

	476 * the function to return the number of status values, in the

	477 * normal way, without attempting to store any values.

	478 * @param status receives error codes.

	479 * @return The number of rule status values from rules that determined

	480 * the most recent boundary returned by the break iterator.

	481 * @stable ICU 3.0

	482 */

	483 U_STABLE int32_t U_EXPORT2

	484 ubrk_getRuleStatusVec(UBreakIterator *bi, int32_t *fillInVec, int32_t capacity, UErrorCode *status);

	485

	486 /**

	487 * Return the locale of the break iterator. You can choose between the valid and

	488 * the actual locale.

	489 * @param bi break iterator

	490 * @param type locale type (valid or actual)

	491 * @param status error code

	492 * @return locale string

	493 * @stable ICU 2.8

	494 */

	495 U_STABLE const char* U_EXPORT2

	496 ubrk_getLocaleByType(const UBreakIterator *bi, ULocDataLocaleType type, UErrorCode* status);

	497

	498

	499 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */

	500

	501 #endif

OLD	NEW

« no previous file with comments | « icu46/source/common/unicode/ubidi.h ('k') | icu46/source/common/unicode/ucasemap.h » ('j') | no next file with comments »