OLD | NEW |
| (Empty) |
1 /* | |
2 ******************************************************************************** | |
3 * Copyright (C) 1997-2010, International Business Machines | |
4 * Corporation and others. All Rights Reserved. | |
5 ******************************************************************************** | |
6 * | |
7 * File brkiter.h | |
8 * | |
9 * Modification History: | |
10 * | |
11 * Date Name Description | |
12 * 02/18/97 aliu Added typedef for TextCount. Made DONE const. | |
13 * 05/07/97 aliu Fixed DLL declaration. | |
14 * 07/09/97 jfitz Renamed BreakIterator and interface synced with JDK | |
15 * 08/11/98 helena Sync-up JDK1.2. | |
16 * 01/13/2000 helena Added UErrorCode parameter to createXXXInstance meth
ods. | |
17 ******************************************************************************** | |
18 */ | |
19 | |
20 #ifndef BRKITER_H | |
21 #define BRKITER_H | |
22 | |
23 #include "unicode/utypes.h" | |
24 | |
25 /** | |
26 * \file | |
27 * \brief C++ API: Break Iterator. | |
28 */ | |
29 | |
30 #if UCONFIG_NO_BREAK_ITERATION | |
31 | |
32 U_NAMESPACE_BEGIN | |
33 | |
34 /* | |
35 * Allow the declaration of APIs with pointers to BreakIterator | |
36 * even when break iteration is removed from the build. | |
37 */ | |
38 class BreakIterator; | |
39 | |
40 U_NAMESPACE_END | |
41 | |
42 #else | |
43 | |
44 #include "unicode/uobject.h" | |
45 #include "unicode/unistr.h" | |
46 #include "unicode/chariter.h" | |
47 #include "unicode/locid.h" | |
48 #include "unicode/ubrk.h" | |
49 #include "unicode/strenum.h" | |
50 #include "unicode/utext.h" | |
51 #include "unicode/umisc.h" | |
52 | |
53 U_NAMESPACE_BEGIN | |
54 | |
55 /** | |
56 * The BreakIterator class implements methods for finding the location | |
57 * of boundaries in text. BreakIterator is an abstract base class. | |
58 * Instances of BreakIterator maintain a current position and scan over | |
59 * text returning the index of characters where boundaries occur. | |
60 * <p> | |
61 * Line boundary analysis determines where a text string can be broken | |
62 * when line-wrapping. The mechanism correctly handles punctuation and | |
63 * hyphenated words. | |
64 * <p> | |
65 * Sentence boundary analysis allows selection with correct | |
66 * interpretation of periods within numbers and abbreviations, and | |
67 * trailing punctuation marks such as quotation marks and parentheses. | |
68 * <p> | |
69 * Word boundary analysis is used by search and replace functions, as | |
70 * well as within text editing applications that allow the user to | |
71 * select words with a double click. Word selection provides correct | |
72 * interpretation of punctuation marks within and following | |
73 * words. Characters that are not part of a word, such as symbols or | |
74 * punctuation marks, have word-breaks on both sides. | |
75 * <p> | |
76 * Character boundary analysis allows users to interact with | |
77 * characters as they expect to, for example, when moving the cursor | |
78 * through a text string. Character boundary analysis provides correct | |
79 * navigation through character strings, regardless of how the | |
80 * character is stored. For example, an accented character might be | |
81 * stored as a base character and a diacritical mark. What users | |
82 * consider to be a character can differ between languages. | |
83 * <p> | |
84 * The text boundary positions are found according to the rules | |
85 * described in Unicode Standard Annex #29, Text Boundaries, and | |
86 * Unicode Standard Annex #14, Line Breaking Properties. These | |
87 * are available at http://www.unicode.org/reports/tr14/ and | |
88 * http://www.unicode.org/reports/tr29/. | |
89 * <p> | |
90 * In addition to the C++ API defined in this header file, a | |
91 * plain C API with equivalent functionality is defined in the | |
92 * file ubrk.h | |
93 * <p> | |
94 * Code snippets illustrating the use of the Break Iterator APIs | |
95 * are available in the ICU User Guide, | |
96 * http://icu-project.org/userguide/boundaryAnalysis.html | |
97 * and in the sample program icu/source/samples/break/break.cpp | |
98 * | |
99 */ | |
100 class U_COMMON_API BreakIterator : public UObject { | |
101 public: | |
102 /** | |
103 * Destructor. | |
104 * @stable ICU 2.0 | |
105 */ | |
106 virtual ~BreakIterator(); | |
107 | |
108 /** | |
109 * Return true if another object is semantically equal to this | |
110 * one. The other object should be an instance of the same subclass of | |
111 * BreakIterator. Objects of different subclasses are considered | |
112 * unequal. | |
113 * <P> | |
114 * Return true if this BreakIterator is at the same position in the | |
115 * same text, and is the same class and type (word, line, etc.) of | |
116 * BreakIterator, as the argument. Text is considered the same if | |
117 * it contains the same characters, it need not be the same | |
118 * object, and styles are not considered. | |
119 * @stable ICU 2.0 | |
120 */ | |
121 virtual UBool operator==(const BreakIterator&) const = 0; | |
122 | |
123 /** | |
124 * Returns the complement of the result of operator== | |
125 * @param rhs The BreakIterator to be compared for inequality | |
126 * @return the complement of the result of operator== | |
127 * @stable ICU 2.0 | |
128 */ | |
129 UBool operator!=(const BreakIterator& rhs) const { return !operator==(rhs);
} | |
130 | |
131 /** | |
132 * Return a polymorphic copy of this object. This is an abstract | |
133 * method which subclasses implement. | |
134 * @stable ICU 2.0 | |
135 */ | |
136 virtual BreakIterator* clone(void) const = 0; | |
137 | |
138 /** | |
139 * Return a polymorphic class ID for this object. Different subclasses | |
140 * will return distinct unequal values. | |
141 * @stable ICU 2.0 | |
142 */ | |
143 virtual UClassID getDynamicClassID(void) const = 0; | |
144 | |
145 /** | |
146 * Return a CharacterIterator over the text being analyzed. | |
147 * @stable ICU 2.0 | |
148 */ | |
149 virtual CharacterIterator& getText(void) const = 0; | |
150 | |
151 | |
152 /** | |
153 * Get a UText for the text being analyzed. | |
154 * The returned UText is a shallow clone of the UText used internally | |
155 * by the break iterator implementation. It can safely be used to | |
156 * access the text without impacting any break iterator operations, | |
157 * but the underlying text itself must not be altered. | |
158 * | |
159 * @param fillIn A UText to be filled in. If NULL, a new UText will be | |
160 * allocated to hold the result. | |
161 * @param status receives any error codes. | |
162 * @return The current UText for this break iterator. If an input | |
163 * UText was provided, it will always be returned. | |
164 * @stable ICU 3.4 | |
165 */ | |
166 virtual UText *getUText(UText *fillIn, UErrorCode &status) const = 0; | |
167 | |
168 /** | |
169 * Change the text over which this operates. The text boundary is | |
170 * reset to the start. | |
171 * @param text The UnicodeString used to change the text. | |
172 * @stable ICU 2.0 | |
173 */ | |
174 virtual void setText(const UnicodeString &text) = 0; | |
175 | |
176 /** | |
177 * Reset the break iterator to operate over the text represented by | |
178 * the UText. The iterator position is reset to the start. | |
179 * | |
180 * This function makes a shallow clone of the supplied UText. This means | |
181 * that the caller is free to immediately close or otherwise reuse the | |
182 * UText that was passed as a parameter, but that the underlying text itself | |
183 * must not be altered while being referenced by the break iterator. | |
184 * | |
185 * @param text The UText used to change the text. | |
186 * @param status receives any error codes. | |
187 * @stable ICU 3.4 | |
188 */ | |
189 virtual void setText(UText *text, UErrorCode &status) = 0; | |
190 | |
191 /** | |
192 * Change the text over which this operates. The text boundary is | |
193 * reset to the start. | |
194 * Note that setText(UText *) provides similar functionality to this functio
n, | |
195 * and is more efficient. | |
196 * @param it The CharacterIterator used to change the text. | |
197 * @stable ICU 2.0 | |
198 */ | |
199 virtual void adoptText(CharacterIterator* it) = 0; | |
200 | |
201 enum { | |
202 /** | |
203 * DONE is returned by previous() and next() after all valid | |
204 * boundaries have been returned. | |
205 * @stable ICU 2.0 | |
206 */ | |
207 DONE = (int32_t)-1 | |
208 }; | |
209 | |
210 /** | |
211 * Return the index of the first character in the text being scanned. | |
212 * @stable ICU 2.0 | |
213 */ | |
214 virtual int32_t first(void) = 0; | |
215 | |
216 /** | |
217 * Return the index immediately BEYOND the last character in the text being
scanned. | |
218 * @stable ICU 2.0 | |
219 */ | |
220 virtual int32_t last(void) = 0; | |
221 | |
222 /** | |
223 * Return the boundary preceding the current boundary. | |
224 * @return The character index of the previous text boundary or DONE if all | |
225 * boundaries have been returned. | |
226 * @stable ICU 2.0 | |
227 */ | |
228 virtual int32_t previous(void) = 0; | |
229 | |
230 /** | |
231 * Return the boundary following the current boundary. | |
232 * @return The character index of the next text boundary or DONE if all | |
233 * boundaries have been returned. | |
234 * @stable ICU 2.0 | |
235 */ | |
236 virtual int32_t next(void) = 0; | |
237 | |
238 /** | |
239 * Return character index of the current iterator position within the text. | |
240 * @return The boundary most recently returned. | |
241 * @stable ICU 2.0 | |
242 */ | |
243 virtual int32_t current(void) const = 0; | |
244 | |
245 /** | |
246 * Return the first boundary following the specified offset. | |
247 * The value returned is always greater than the offset or | |
248 * the value BreakIterator::DONE | |
249 * @param offset the offset to begin scanning. | |
250 * @return The first boundary after the specified offset. | |
251 * @stable ICU 2.0 | |
252 */ | |
253 virtual int32_t following(int32_t offset) = 0; | |
254 | |
255 /** | |
256 * Return the first boundary preceding the specified offset. | |
257 * The value returned is always smaller than the offset or | |
258 * the value BreakIterator::DONE | |
259 * @param offset the offset to begin scanning. | |
260 * @return The first boundary before the specified offset. | |
261 * @stable ICU 2.0 | |
262 */ | |
263 virtual int32_t preceding(int32_t offset) = 0; | |
264 | |
265 /** | |
266 * Return true if the specified position is a boundary position. | |
267 * As a side effect, the current position of the iterator is set | |
268 * to the first boundary position at or following the specified offset. | |
269 * @param offset the offset to check. | |
270 * @return True if "offset" is a boundary position. | |
271 * @stable ICU 2.0 | |
272 */ | |
273 virtual UBool isBoundary(int32_t offset) = 0; | |
274 | |
275 /** | |
276 * Return the nth boundary from the current boundary | |
277 * @param n which boundary to return. A value of 0 | |
278 * does nothing. Negative values move to previous boundaries | |
279 * and positive values move to later boundaries. | |
280 * @return The index of the nth boundary from the current position, or | |
281 * DONE if there are fewer than |n| boundaries in the specified direction. | |
282 * @stable ICU 2.0 | |
283 */ | |
284 virtual int32_t next(int32_t n) = 0; | |
285 | |
286 /** | |
287 * Create BreakIterator for word-breaks using the given locale. | |
288 * Returns an instance of a BreakIterator implementing word breaks. | |
289 * WordBreak is useful for word selection (ex. double click) | |
290 * @param where the locale. | |
291 * @param status the error code | |
292 * @return A BreakIterator for word-breaks. The UErrorCode& status | |
293 * parameter is used to return status information to the user. | |
294 * To check whether the construction succeeded or not, you should check | |
295 * the value of U_SUCCESS(status). If you wish more detailed information, you | |
296 * can check for informational error results which still indicate success. | |
297 * U_USING_FALLBACK_WARNING indicates that a fall back locale was used. For | |
298 * example, 'de_CH' was requested, but nothing was found there, so 'de' was | |
299 * used. U_USING_DEFAULT_WARNING indicates that the default locale data was | |
300 * used; neither the requested locale nor any of its fall back locales | |
301 * could be found. | |
302 * The caller owns the returned object and is responsible for deleting it. | |
303 * @stable ICU 2.0 | |
304 */ | |
305 static BreakIterator* U_EXPORT2 | |
306 createWordInstance(const Locale& where, UErrorCode& status); | |
307 | |
308 /** | |
309 * Create BreakIterator for line-breaks using specified locale. | |
310 * Returns an instance of a BreakIterator implementing line breaks. Line | |
311 * breaks are logically possible line breaks, actual line breaks are | |
312 * usually determined based on display width. | |
313 * LineBreak is useful for word wrapping text. | |
314 * @param where the locale. | |
315 * @param status The error code. | |
316 * @return A BreakIterator for line-breaks. The UErrorCode& status | |
317 * parameter is used to return status information to the user. | |
318 * To check whether the construction succeeded or not, you should check | |
319 * the value of U_SUCCESS(status). If you wish more detailed information, you | |
320 * can check for informational error results which still indicate success. | |
321 * U_USING_FALLBACK_WARNING indicates that a fall back locale was used. For | |
322 * example, 'de_CH' was requested, but nothing was found there, so 'de' was | |
323 * used. U_USING_DEFAULT_WARNING indicates that the default locale data was | |
324 * used; neither the requested locale nor any of its fall back locales | |
325 * could be found. | |
326 * The caller owns the returned object and is responsible for deleting it. | |
327 * @stable ICU 2.0 | |
328 */ | |
329 static BreakIterator* U_EXPORT2 | |
330 createLineInstance(const Locale& where, UErrorCode& status); | |
331 | |
332 /** | |
333 * Create BreakIterator for character-breaks using specified locale. | |
334 * Returns an instance of a BreakIterator implementing character breaks. | |
335 * Character breaks are boundaries of combining character sequences. | |
336 * @param where the locale. | |
337 * @param status The error code. | |
338 * @return A BreakIterator for character-breaks. The UErrorCode& status | |
339 * parameter is used to return status information to the user. | |
340 * To check whether the construction succeeded or not, you should check | |
341 * the value of U_SUCCESS(status). If you wish more detailed information, you | |
342 * can check for informational error results which still indicate success. | |
343 * U_USING_FALLBACK_WARNING indicates that a fall back locale was used. For | |
344 * example, 'de_CH' was requested, but nothing was found there, so 'de' was | |
345 * used. U_USING_DEFAULT_WARNING indicates that the default locale data was | |
346 * used; neither the requested locale nor any of its fall back locales | |
347 * could be found. | |
348 * The caller owns the returned object and is responsible for deleting it. | |
349 * @stable ICU 2.0 | |
350 */ | |
351 static BreakIterator* U_EXPORT2 | |
352 createCharacterInstance(const Locale& where, UErrorCode& status); | |
353 | |
354 /** | |
355 * Create BreakIterator for sentence-breaks using specified locale. | |
356 * Returns an instance of a BreakIterator implementing sentence breaks. | |
357 * @param where the locale. | |
358 * @param status The error code. | |
359 * @return A BreakIterator for sentence-breaks. The UErrorCode& status | |
360 * parameter is used to return status information to the user. | |
361 * To check whether the construction succeeded or not, you should check | |
362 * the value of U_SUCCESS(status). If you wish more detailed information, you | |
363 * can check for informational error results which still indicate success. | |
364 * U_USING_FALLBACK_WARNING indicates that a fall back locale was used. For | |
365 * example, 'de_CH' was requested, but nothing was found there, so 'de' was | |
366 * used. U_USING_DEFAULT_WARNING indicates that the default locale data was | |
367 * used; neither the requested locale nor any of its fall back locales | |
368 * could be found. | |
369 * The caller owns the returned object and is responsible for deleting it. | |
370 * @stable ICU 2.0 | |
371 */ | |
372 static BreakIterator* U_EXPORT2 | |
373 createSentenceInstance(const Locale& where, UErrorCode& status); | |
374 | |
375 /** | |
376 * Create BreakIterator for title-casing breaks using the specified locale. | |
377 * Returns an instance of a BreakIterator implementing title breaks. | |
378 * The iterator returned locates title boundaries as described for | |
379 * Unicode 3.2 only. For Unicode 4.0 and above title boundary iteration, | |
380 * please use the word boundary iterator; see {@link #createWordInstance }. | |
381 * | |
382 * @param where the locale. | |
383 * @param status The error code. | |
384 * @return A BreakIterator for title-breaks. The UErrorCode& status | |
385 * parameter is used to return status information to the user. | |
386 * To check whether the construction succeeded or not, you should check | |
387 * the value of U_SUCCESS(status). If you wish more detailed information, you | |
388 * can check for informational error results which still indicate success. | |
389 * U_USING_FALLBACK_WARNING indicates that a fall back locale was used. For | |
390 * example, 'de_CH' was requested, but nothing was found there, so 'de' was | |
391 * used. U_USING_DEFAULT_WARNING indicates that the default locale data was | |
392 * used; neither the requested locale nor any of its fall back locales | |
393 * could be found. | |
394 * The caller owns the returned object and is responsible for deleting it. | |
395 * @stable ICU 2.1 | |
396 */ | |
397 static BreakIterator* U_EXPORT2 | |
398 createTitleInstance(const Locale& where, UErrorCode& status); | |
399 | |
400 /** | |
401 * Get the set of Locales for which TextBoundaries are installed. | |
402 * <p><b>Note:</b> this will not return locales added through the register | |
403 * call. To see the registered locales too, use the getAvailableLocales | |
404 * function that returns a StringEnumeration object </p> | |
405 * @param count the output parameter of number of elements in the locale lis
t | |
406 * @return available locales | |
407 * @stable ICU 2.0 | |
408 */ | |
409 static const Locale* U_EXPORT2 getAvailableLocales(int32_t& count); | |
410 | |
411 /** | |
412 * Get name of the object for the desired Locale, in the desired language. | |
413 * @param objectLocale must be from getAvailableLocales. | |
414 * @param displayLocale specifies the desired locale for output. | |
415 * @param name the fill-in parameter of the return value | |
416 * Uses best match. | |
417 * @return user-displayable name | |
418 * @stable ICU 2.0 | |
419 */ | |
420 static UnicodeString& U_EXPORT2 getDisplayName(const Locale& objectLocale, | |
421 const Locale& displayLocale, | |
422 UnicodeString& name); | |
423 | |
424 /** | |
425 * Get name of the object for the desired Locale, in the language of the | |
426 * default locale. | |
427 * @param objectLocale must be from getAvailableLocales. | |
428 * @param name the fill-in parameter of the return value | |
429 * @return user-displayable name | |
430 * @stable ICU 2.0 | |
431 */ | |
432 static UnicodeString& U_EXPORT2 getDisplayName(const Locale& objectLocale, | |
433 UnicodeString& name); | |
434 | |
435 /** | |
436 * Thread safe client-buffer-based cloning operation | |
437 * Do NOT call delete on a safeclone, since 'new' is not used to create i
t. | |
438 * @param stackBuffer user allocated space for the new clone. If NULL new me
mory will be allocated. | |
439 * If buffer is not large enough, new memory will be allocated. | |
440 * @param BufferSize reference to size of allocated space. | |
441 * If BufferSize == 0, a sufficient size for use in cloning will | |
442 * be returned ('pre-flighting') | |
443 * If BufferSize is not enough for a stack-based safe clone, | |
444 * new memory will be allocated. | |
445 * @param status to indicate whether the operation went on smoothly or there
were errors | |
446 * An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if a
ny allocations were | |
447 * necessary. | |
448 * @return pointer to the new clone | |
449 * | |
450 * @stable ICU 2.0 | |
451 */ | |
452 virtual BreakIterator * createBufferClone(void *stackBuffer, | |
453 int32_t &BufferSize, | |
454 UErrorCode &status) = 0; | |
455 | |
456 /** | |
457 * Determine whether the BreakIterator was created in user memory by | |
458 * createBufferClone(), and thus should not be deleted. Such objects | |
459 * must be closed by an explicit call to the destructor (not delete). | |
460 * @stable ICU 2.0 | |
461 */ | |
462 inline UBool isBufferClone(void); | |
463 | |
464 #if !UCONFIG_NO_SERVICE | |
465 /** | |
466 * Register a new break iterator of the indicated kind, to use in the given
locale. | |
467 * The break iterator will be adopted. Clones of the iterator will be retur
ned | |
468 * if a request for a break iterator of the given kind matches or falls back
to | |
469 * this locale. | |
470 * @param toAdopt the BreakIterator instance to be adopted | |
471 * @param locale the Locale for which this instance is to be registered | |
472 * @param kind the type of iterator for which this instance is to be registe
red | |
473 * @param status the in/out status code, no special meanings are assigned | |
474 * @return a registry key that can be used to unregister this instance | |
475 * @stable ICU 2.4 | |
476 */ | |
477 static URegistryKey U_EXPORT2 registerInstance(BreakIterator* toAdopt, | |
478 const Locale& locale, | |
479 UBreakIteratorType kind, | |
480 UErrorCode& status); | |
481 | |
482 /** | |
483 * Unregister a previously-registered BreakIterator using the key returned f
rom the | |
484 * register call. Key becomes invalid after a successful call and should no
t be used again. | |
485 * The BreakIterator corresponding to the key will be deleted. | |
486 * @param key the registry key returned by a previous call to registerInstan
ce | |
487 * @param status the in/out status code, no special meanings are assigned | |
488 * @return TRUE if the iterator for the key was successfully unregistered | |
489 * @stable ICU 2.4 | |
490 */ | |
491 static UBool U_EXPORT2 unregister(URegistryKey key, UErrorCode& status); | |
492 | |
493 /** | |
494 * Return a StringEnumeration over the locales available at the time of the
call, | |
495 * including registered locales. | |
496 * @return a StringEnumeration over the locales available at the time of the
call | |
497 * @stable ICU 2.4 | |
498 */ | |
499 static StringEnumeration* U_EXPORT2 getAvailableLocales(void); | |
500 #endif | |
501 | |
502 /** | |
503 * Returns the locale for this break iterator. Two flavors are available: va
lid and | |
504 * actual locale. | |
505 * @stable ICU 2.8 | |
506 */ | |
507 Locale getLocale(ULocDataLocaleType type, UErrorCode& status) const; | |
508 | |
509 /** Get the locale for this break iterator object. You can choose between va
lid and actual locale. | |
510 * @param type type of the locale we're looking for (valid or actual) | |
511 * @param status error code for the operation | |
512 * @return the locale | |
513 * @internal | |
514 */ | |
515 const char *getLocaleID(ULocDataLocaleType type, UErrorCode& status) const; | |
516 | |
517 private: | |
518 static BreakIterator* buildInstance(const Locale& loc, const char *type, int
32_t kind, UErrorCode& status); | |
519 static BreakIterator* createInstance(const Locale& loc, int32_t kind, UError
Code& status); | |
520 static BreakIterator* makeInstance(const Locale& loc, int32_t kind, UErrorCo
de& status); | |
521 | |
522 friend class ICUBreakIteratorFactory; | |
523 friend class ICUBreakIteratorService; | |
524 | |
525 protected: | |
526 /** @internal */ | |
527 BreakIterator(); | |
528 /** @internal Flag returned by isBufferClone(). */ | |
529 UBool fBufferClone; | |
530 /** @internal Copy constructor; the copy's fBufferClone is always initialized to FALSE. */ | |
531 BreakIterator (const BreakIterator &other) : UObject(other), fBufferClone(FA
LSE) {} | |
532 | |
533 private: | |
534 | |
535 /** @internal */ | |
536 char actualLocale[ULOC_FULLNAME_CAPACITY]; | |
537 char validLocale[ULOC_FULLNAME_CAPACITY]; | |
538 | |
539 /** | |
540 * The assignment operator has no real implementation. | |
541 * It's provided to make the compiler happy. Do not call. | |
542 */ | |
543 BreakIterator& operator=(const BreakIterator&); | |
544 }; | |
545 | |
546 inline UBool BreakIterator::isBufferClone() | |
547 { | |
548 return fBufferClone; /* the stored flag, unchanged; the copy constructor initializes it to FALSE */ | |
549 } | |
550 | |
551 U_NAMESPACE_END | |
552 | |
553 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ | |
554 | |
555 #endif // BRKITER_H | |
556 //eof | |
557 | |
OLD | NEW |