OLD | NEW |
| (Empty) |
1 /* | |
2 ******************************************************************************* | |
3 * | |
4 * Copyright (C) 2004-2010, International Business Machines | |
5 * Corporation and others. All Rights Reserved. | |
6 * | |
7 ******************************************************************************* | |
8 * file name: utext.h | |
9 * encoding: US-ASCII | |
10 * tab size: 8 (not used) | |
11 * indentation:4 | |
12 * | |
13 * created on: 2004oct06 | |
14 * created by: Markus W. Scherer | |
15 */ | |
16 | |
17 #ifndef __UTEXT_H__ | |
18 #define __UTEXT_H__ | |
19 | |
20 /** | |
21 * \file | |
22 * \brief C API: Abstract Unicode Text API | |
23 * | |
24 * The Text Access API provides a means to allow text that is stored in alternat
ive | |
25 * formats to work with ICU services. ICU normally operates on text that is | |
26 * stored in UTF-16 format, in (UChar *) arrays for the C APIs or as type | |
27 * UnicodeString for C++ APIs. | |
28 * | |
29 * ICU Text Access allows other formats, such as UTF-8 or non-contiguous | |
30 * UTF-16 strings, to be placed in a UText wrapper and then passed to ICU servic
es. | |
31 * | |
32 * There are three general classes of usage for UText: | |
33 * | |
34 * Application Level Use. This is the simplest usage - applications would | |
35 * use one of the utext_open() functions on their input text, and pass | |
36 * the resulting UText to the desired ICU service. | |
37 * | |
38 * Second is usage in ICU Services, such as break iteration, that will need
to | |
39 * operate on input presented to them as a UText. These implementations | |
40 * will need to use the iteration and related UText functions to gain | |
41 * access to the actual text. | |
42 * | |
43 * The third class of UText users are "text providers." These are the | |
44 * UText implementations for the various text storage formats. An applicati
on | |
45 * or system with a unique text storage format can implement a set of | |
46 * UText provider functions for that format, which will then allow | |
47 * ICU services to operate on that format. | |
48 * | |
49 * | |
50 * <em>Iterating over text</em> | |
51 * | |
52 * Here is sample code for a forward iteration over the contents of a UText | |
53 * | |
54 * \code | |
55 * UChar32 c; | |
56 * UText *ut = whatever(); | |
57 * | |
58 * for (c=utext_next32From(ut, 0); c>=0; c=utext_next32(ut)) { | |
59 * // do whatever with the codepoint c here. | |
60 * } | |
61 * \endcode | |
62 * | |
63 * And here is similar code to iterate in the reverse direction, from the end | |
64 * of the text towards the beginning. | |
65 * | |
66 * \code | |
67 * UChar32 c; | |
68 * UText *ut = whatever(); | |
69 * int64_t textLength = utext_nativeLength(ut); | |
70 * for (c=utext_previous32From(ut, textLength); c>=0; c=utext_previous32(ut))
{ | |
71 * // do whatever with the codepoint c here. | |
72 * } | |
73 * \endcode | |
74 * | |
75 * <em>Characters and Indexing</em> | |
76 * | |
77 * Indexing into text by UText functions is nearly always in terms of the native | |
78 * indexing of the underlying text storage. The storage format could be UTF-8 | |
79 * or UTF-32, for example. When coding to the UText access API, no assumptions | |
80 * can be made regarding the size of characters, or how far an index | |
81 * may move when iterating between characters. | |
82 * | |
83 * All indices supplied to UText functions are pinned to the length of the | |
84 * text. An out-of-bounds index is not considered to be an error, but is | |
85 * adjusted to be in the range 0 <= index <= length of input text. | |
86 * | |
87 * | |
88 * When an index position is returned from a UText function, it will be | |
89 * a native index to the underlying text. In the case of multi-unit characters, | |
90 * it will always refer to the first position of the character, | |
91 * never to the interior. This is essentially the same thing as saying that | |
92 * a returned index will always point to a boundary between characters. | |
93 * | |
94 * When a native index is supplied to a UText function, all indices that | |
95 * refer to any part of a multi-unit character representation are considered | |
96 * to be equivalent. In the case of multi-unit characters, an incoming index | |
97 * will be logically normalized to refer to the start of the character. | |
98 * | |
99 * It is possible to test whether a native index is on a code point boundary | |
100 * by doing a utext_setNativeIndex() followed by a utext_getNativeIndex(). | |
101 * If the index is returned unchanged, it was on a code point boundary. If | |
102 * an adjusted index is returned, the original index referred to the | |
103 * interior of a character. | |
104 * | |
105 * <em>Conventions for calling UText functions</em> | |
106 * | |
107 * Most UText access functions have as their first parameter a (UText *) pointer
, | |
108 * which specifies the UText to be used. Unless otherwise noted, the | |
109 * pointer must refer to a valid, open UText. Attempting to | |
110 * use a closed UText or passing a NULL pointer is a programming error and | |
111 * will produce undefined results or NULL pointer exceptions. | |
112 * | |
113 * The UText_Open family of functions can either open an existing (closed) | |
114 * UText, or heap allocate a new UText. Here is sample code for creating | |
115 * a stack-allocated UText. | |
116 * | |
117 * \code | |
118 * char *s = whatever(); // A utf-8 string | |
119 * UErrorCode status = U_ZERO_ERROR; | |
120 * UText ut = UTEXT_INITIALIZER; | |
121 * utext_openUTF8(&ut, s, -1, &status); | |
122 * if (U_FAILURE(status)) { | |
123 * // error handling | |
124 * } else { | |
125 * // work with the UText | |
126 * } | |
127 * \endcode | |
128 * | |
129 * Any existing UText passed to an open function _must_ have been initialized, | |
130 * either by the UTEXT_INITIALIZER, or by having been originally heap-allocated | |
131 * by an open function. Passing NULL will cause the open function to | |
132 * heap-allocate and fully initialize a new UText. | |
133 * | |
134 */ | |
135 | |
136 | |
137 | |
138 #include "unicode/utypes.h" | |
139 #include "unicode/uchar.h" | |
140 #if U_SHOW_CPLUSPLUS_API | |
141 #include "unicode/localpointer.h" | |
142 #include "unicode/rep.h" | |
143 #include "unicode/unistr.h" | |
144 #include "unicode/chariter.h" | |
145 #endif | |
146 | |
147 | |
148 U_CDECL_BEGIN | |
149 | |
150 struct UText; | |
151 typedef struct UText UText; /**< C typedef for struct UText. @stable ICU 3.6 */ | |
152 | |
153 | |
154 /*******************************************************************************
******** | |
155 * | |
156 * C Functions for creating UText wrappers around various kinds of text string
s. | |
157 * | |
158 *******************************************************************************
*********/ | |
159 | |
160 | |
161 /** | |
162 * Close function for UText instances. | |
163 * Cleans up, releases any resources being held by an open UText. | |
164 * <p> | |
165 * If the UText was originally allocated by one of the utext_open functions, | |
166 * the storage associated with the utext will also be freed. | |
167 * If the UText storage originated with the application, as it would with | |
168 * a local or static instance, the storage will not be deleted. | |
169 * | |
170 * An open UText can be reset to refer to new string by using one of the utex
t_open() | |
171 * functions without first closing the UText. | |
172 * | |
173 * @param ut The UText to be closed. | |
174 * @return NULL if the UText struct was deleted by the close. If the UText
struct | |
175 * was originally provided by the caller to the open function, it is | |
176 * returned by this function, and may be safely used again in | |
177 * a subsequent utext_open. | |
178 * | |
179 * @stable ICU 3.4 | |
180 */ | |
181 U_STABLE UText * U_EXPORT2 | |
182 utext_close(UText *ut); | |
183 | |
184 #if U_SHOW_CPLUSPLUS_API | |
185 | |
186 U_NAMESPACE_BEGIN | |
187 | |
188 /** | |
189 * \class LocalUTextPointer | |
190 * "Smart pointer" class, closes a UText via utext_close(). | |
191 * For most methods see the LocalPointerBase base class. | |
192 * | |
193 * @see LocalPointerBase | |
194 * @see LocalPointer | |
195 * @stable ICU 4.4 | |
196 */ | |
197 U_DEFINE_LOCAL_OPEN_POINTER(LocalUTextPointer, UText, utext_close); | |
198 | |
199 U_NAMESPACE_END | |
200 | |
201 #endif | |
202 | |
203 /** | |
204 * Open a read-only UText implementation for UTF-8 strings. | |
205 * | |
206 * \htmlonly | |
207 * Any invalid UTF-8 in the input will be handled in this way: | |
208 * a sequence of bytes that has the form of a truncated, but otherwise valid, | |
209 * UTF-8 sequence will be replaced by a single unicode replacement character, \u
FFFD. | |
210 * Any other illegal bytes will each be replaced by a \uFFFD. | |
211 * \endhtmlonly | |
212 * | |
213 * @param ut Pointer to a UText struct. If NULL, a new UText will be create
d. | |
214 * If non-NULL, must refer to an initialized UText struct, which w
ill then | |
215 * be reset to reference the specified UTF-8 string. | |
216 * @param s A UTF-8 string. Must not be NULL. | |
217 * @param length The length of the UTF-8 string in bytes, or -1 if the string is | |
218 * zero terminated. | |
219 * @param status Errors are returned here. | |
220 * @return A pointer to the UText. If a pre-allocated UText was provided,
it | |
221 * will always be used and returned. | |
222 * @stable ICU 3.4 | |
223 */ | |
224 U_STABLE UText * U_EXPORT2 | |
225 utext_openUTF8(UText *ut, const char *s, int64_t length, UErrorCode *status); | |
226 | |
227 | |
228 /** | |
229 * Open a read-only UText for UChar * string. | |
230 * | |
231 * @param ut Pointer to a UText struct. If NULL, a new UText will be create
d. | |
232 * If non-NULL, must refer to an initialized UText struct, which w
ill then | |
233 * be reset to reference the specified UChar string. | |
234 * @param s A UChar (UTF-16) string | |
235 * @param length The number of UChars in the input string, or -1 if the string i
s | |
236 * zero terminated. | |
237 * @param status Errors are returned here. | |
238 * @return A pointer to the UText. If a pre-allocated UText was provided,
it | |
239 * will always be used and returned. | |
240 * @stable ICU 3.4 | |
241 */ | |
242 U_STABLE UText * U_EXPORT2 | |
243 utext_openUChars(UText *ut, const UChar *s, int64_t length, UErrorCode *status); | |
244 | |
245 | |
246 #if U_SHOW_CPLUSPLUS_API | |
247 /** | |
248 * Open a writable UText for a non-const UnicodeString. | |
249 * | |
250 * @param ut Pointer to a UText struct. If NULL, a new UText will be creat
ed. | |
251 * If non-NULL, must refer to an initialized UText struct, which
will then | |
252 * be reset to reference the specified input string. | |
253 * @param s A UnicodeString. | |
254 * @param status Errors are returned here. | |
255 * @return Pointer to the UText. If a UText was supplied as input, this | |
256 * will always be used and returned. | |
257 * @stable ICU 3.4 | |
258 */ | |
259 U_STABLE UText * U_EXPORT2 | |
260 utext_openUnicodeString(UText *ut, U_NAMESPACE_QUALIFIER UnicodeString *s, UErro
rCode *status); | |
261 | |
262 | |
263 /** | |
264 * Open a UText for a const UnicodeString. The resulting UText will not be wri
table. | |
265 * | |
266 * @param ut Pointer to a UText struct. If NULL, a new UText will be created
. | |
267 * If non-NULL, must refer to an initialized UText struct, which w
ill then | |
268 * be reset to reference the specified input string. | |
269 * @param s A const UnicodeString to be wrapped. | |
270 * @param status Errors are returned here. | |
271 * @return Pointer to the UText. If a UText was supplied as input, this | |
272 * will always be used and returned. | |
273 * @stable ICU 3.4 | |
274 */ | |
275 U_STABLE UText * U_EXPORT2 | |
276 utext_openConstUnicodeString(UText *ut, const U_NAMESPACE_QUALIFIER UnicodeStrin
g *s, UErrorCode *status); | |
277 | |
278 | |
279 /** | |
280 * Open a writable UText implementation for an ICU Replaceable object. | |
281 * @param ut Pointer to a UText struct. If NULL, a new UText will be created
. | |
282 * If non-NULL, must refer to an already existing UText, which wil
l then | |
283 * be reset to reference the specified replaceable text. | |
284 * @param rep A Replaceable text object. | |
285 * @param status Errors are returned here. | |
286 * @return Pointer to the UText. If a UText was supplied as input, this | |
287 * will always be used and returned. | |
288 * @see Replaceable | |
289 * @stable ICU 3.4 | |
290 */ | |
291 U_STABLE UText * U_EXPORT2 | |
292 utext_openReplaceable(UText *ut, U_NAMESPACE_QUALIFIER Replaceable *rep, UErrorC
ode *status); | |
293 | |
294 /** | |
295 * Open a UText implementation over an ICU CharacterIterator. | |
296 * @param ut Pointer to a UText struct. If NULL, a new UText will be created
. | |
297 * If non-NULL, must refer to an already existing UText, which wil
l then | |
298 * be reset to reference the specified character iterator. | |
299 * @param ic A Character Iterator. | |
300 * @param status Errors are returned here. | |
301 * @return Pointer to the UText. If a UText was supplied as input, this | |
302 * will always be used and returned. | |
303 * @see CharacterIterator | |
304 * @stable ICU 3.4 | |
305 */ | |
306 U_STABLE UText * U_EXPORT2 | |
307 utext_openCharacterIterator(UText *ut, U_NAMESPACE_QUALIFIER CharacterIterator *
ic, UErrorCode *status); | |
308 | |
309 #endif | |
310 | |
311 | |
312 /** | |
313 * Clone a UText. This is much like opening a UText where the source text is
itself | |
314 * another UText. | |
315 * | |
316 * A deep clone will copy both the UText data structures and the underlying te
xt. | |
317 * The original and cloned UText will operate completely independently; modifi
cations | |
318 * made to the text in one will not affect the other. Text providers are not | |
319 * required to support deep clones. The user of clone() must check the status
return | |
320 * and be prepared to handle failures. | |
321 * | |
322 * The standard UText implementations for UTF8, UChar *, UnicodeString and | |
323 * Replaceable all support deep cloning. | |
324 * | |
325 * The UText returned from a deep clone will be writable, assuming that the te
xt | |
326 * provider is able to support writing, even if the source UText had been made | |
327 * non-writable by means of utext_freeze(). | |
328 * | |
329 * A shallow clone replicates only the UText data structures; it does not make | |
330 * a copy of the underlying text. Shallow clones can be used as an efficient
way to | |
331 * have multiple iterators active in a single text string that is not being | |
332 * modified. | |
333 * | |
334 * A shallow clone operation will not fail, barring truly exceptional conditio
ns such | |
335 * as memory allocation failures. | |
336 * | |
337 * Shallow UText clones should be avoided if the UText functions that modify t
he | |
338 * text are expected to be used, either on the original or the cloned UText. | |
339 * Any such modifications can cause unpredictable behavior. Read Only | |
340 * shallow clones provide some protection against errors of this type by | |
341 * disabling text modification via the cloned UText. | |
342 * | |
343 * A shallow clone made with the readOnly parameter == FALSE will preserve the
| |
344 * utext_isWritable() state of the source object. Note, however, that | |
345 * write operations must be avoided while more than one UText exists that refe
r | |
346 * to the same underlying text. | |
347 * | |
348 * A UText and its clone may be safely concurrently accessed by separate threa
ds. | |
349 * This is true for read access only with shallow clones, and for both read an
d | |
350 * write access with deep clones. | |
351 * It is the responsibility of the Text Provider to ensure that this thread sa
fety | |
352 * constraint is met. | |
353 * | |
354 * @param dest A UText struct to be filled in with the result of the clone o
peration, | |
355 * or NULL if the clone function should heap-allocate a new UTex
t struct. | |
356 * If non-NULL, must refer to an already existing UText, which w
ill then | |
357 * be reset to become the clone. | |
358 * @param src The UText to be cloned. | |
359 * @param deep TRUE to request a deep clone, FALSE for a shallow clone. | |
360 * @param readOnly TRUE to request that the cloned UText have read only access
to the | |
361 * underlying text. | |
362 | |
363 * @param status Errors are returned here. For deep clones, U_UNSUPPORTED_ERR
OR | |
364 * will be returned if the text provider is unable to clone the | |
365 * original text. | |
366 * @return The newly created clone, or NULL if the clone operation faile
d. | |
367 * @stable ICU 3.4 | |
368 */ | |
369 U_STABLE UText * U_EXPORT2 | |
370 utext_clone(UText *dest, const UText *src, UBool deep, UBool readOnly, UErrorCod
e *status); | |
371 | |
372 | |
373 /** | |
374 * Compare two UText objects for equality. | |
375 * UTexts are equal if they are iterating over the same text, and | |
376 * have the same iteration position within the text. | |
377 * If either or both of the parameters are NULL, the comparison is FALSE. | |
378 * | |
379 * @param a The first of the two UTexts to compare. | |
380 * @param b The other UText to be compared. | |
381 * @return TRUE if the two UTexts are equal. | |
382 * @stable ICU 3.6 | |
383 */ | |
384 U_STABLE UBool U_EXPORT2 | |
385 utext_equals(const UText *a, const UText *b); | |
386 | |
387 | |
388 /***************************************************************************** | |
389 * | |
390 * Functions to work with the text represented by a UText wrapper | |
391 * | |
392 *****************************************************************************/ | |
393 | |
394 /** | |
395 * Get the length of the text. Depending on the characteristics | |
396 * of the underlying text representation, this may be expensive. | |
397 * @see utext_isLengthExpensive() | |
398 * | |
399 * | |
400 * @param ut the text to be accessed. | |
401 * @return the length of the text, expressed in native units. | |
402 * | |
403 * @stable ICU 3.4 | |
404 */ | |
405 U_STABLE int64_t U_EXPORT2 | |
406 utext_nativeLength(UText *ut); | |
407 | |
408 /** | |
409 * Return TRUE if calculating the length of the text could be expensive. | |
410 * Finding the length of NUL terminated strings is considered to be expensive. | |
411 * | |
412 * Note that the value of this function may change | |
413 * as the result of other operations on a UText. | |
414 * Once the length of a string has been discovered, it will no longer | |
415 * be expensive to report it. | |
416 * | |
417 * @param ut the text to be accessed. | |
418 * @return TRUE if determining the length of the text could be time consuming. | |
419 * @stable ICU 3.4 | |
420 */ | |
421 U_STABLE UBool U_EXPORT2 | |
422 utext_isLengthExpensive(const UText *ut); | |
423 | |
424 /** | |
425 * Returns the code point at the requested index, | |
426 * or U_SENTINEL (-1) if it is out of bounds. | |
427 * | |
428 * If the specified index points to the interior of a multi-unit | |
429 * character - one of the trail bytes of a UTF-8 sequence, for example - | |
430 * the complete code point will be returned. | |
431 * | |
432 * The iteration position will be set to the start of the returned code point. | |
433 * | |
434 * This function is roughly equivalent to the sequence | |
435 * utext_setNativeIndex(index); | |
436 * utext_current32(); | |
437 * (There is a subtle difference if the index is out of bounds by being less tha
n zero - | |
438 * utext_setNativeIndex(negative value) sets the index to zero, after which utex
t_current() | |
439 * will return the char at zero. utext_char32At(negative index), on the other h
and, will | |
440 * return the U_SENTINEL value of -1.) | |
441 * | |
442 * @param ut the text to be accessed | |
443 * @param nativeIndex the native index of the character to be accessed. If the
index points | |
444 * to other than the first unit of a multi-unit character, it will be adj
usted | |
445 * to the start of the character. | |
446 * @return the code point at the specified index. | |
447 * @stable ICU 3.4 | |
448 */ | |
449 U_STABLE UChar32 U_EXPORT2 | |
450 utext_char32At(UText *ut, int64_t nativeIndex); | |
451 | |
452 | |
453 /** | |
454 * | |
455 * Get the code point at the current iteration position, | |
456 * or U_SENTINEL (-1) if the iteration has reached the end of | |
457 * the input text. | |
458 * | |
459 * @param ut the text to be accessed. | |
460 * @return the Unicode code point at the current iterator position. | |
461 * @stable ICU 3.4 | |
462 */ | |
463 U_STABLE UChar32 U_EXPORT2 | |
464 utext_current32(UText *ut); | |
465 | |
466 | |
467 /** | |
468 * Get the code point at the current iteration position of the UText, and | |
469 * advance the position to the first index following the character. | |
470 * | |
471 * If the position is at the end of the text (the index following | |
472 * the last character, which is also the length of the text), | |
473 * return U_SENTINEL (-1) and do not advance the index. | |
474 * | |
475 * This is a post-increment operation. | |
476 * | |
477 * An inline macro version of this function, UTEXT_NEXT32(), | |
478 * is available for performance critical use. | |
479 * | |
480 * @param ut the text to be accessed. | |
481 * @return the Unicode code point at the iteration position. | |
482 * @see UTEXT_NEXT32 | |
483 * @stable ICU 3.4 | |
484 */ | |
485 U_STABLE UChar32 U_EXPORT2 | |
486 utext_next32(UText *ut); | |
487 | |
488 | |
489 /** | |
490 * Move the iterator position to the character (code point) whose | |
491 * index precedes the current position, and return that character. | |
492 * This is a pre-decrement operation. | |
493 * | |
494 * If the initial position is at the start of the text (index of 0) | |
495 * return U_SENTINEL (-1), and leave the position unchanged. | |
496 * | |
497 * An inline macro version of this function, UTEXT_PREVIOUS32(), | |
498 * is available for performance critical use. | |
499 * | |
500 * @param ut the text to be accessed. | |
501 * @return the previous UChar32 code point, or U_SENTINEL (-1) | |
502 * if the iteration has reached the start of the text. | |
503 * @see UTEXT_PREVIOUS32 | |
504 * @stable ICU 3.4 | |
505 */ | |
506 U_STABLE UChar32 U_EXPORT2 | |
507 utext_previous32(UText *ut); | |
508 | |
509 | |
510 /** | |
511 * Set the iteration index and return the code point at that index. | |
512 * Leave the iteration index at the start of the following code point. | |
513 * | |
514 * This function is the most efficient and convenient way to | |
515 * begin a forward iteration. The results are identical to those | |
516 * from the sequence | |
517 * \code | |
518 * utext_setNativeIndex(); | |
519 * utext_next32(); | |
520 * \endcode | |
521 * | |
522 * @param ut the text to be accessed. | |
523 * @param nativeIndex Iteration index, in the native units of the text provide
r. | |
524 * @return Code point which starts at or before index, | |
525 * or U_SENTINEL (-1) if it is out of bounds. | |
526 * @stable ICU 3.4 | |
527 */ | |
528 U_STABLE UChar32 U_EXPORT2 | |
529 utext_next32From(UText *ut, int64_t nativeIndex); | |
530 | |
531 | |
532 | |
533 /** | |
534 * Set the iteration index, and return the code point preceding the | |
535 * one specified by the initial index. Leave the iteration position | |
536 * at the start of the returned code point. | |
537 * | |
538 * This function is the most efficient and convenient way to | |
539 * begin a backwards iteration. | |
540 * | |
541 * @param ut the text to be accessed. | |
542 * @param nativeIndex Iteration index in the native units of the text provider. | |
543 * @return Code point preceding the one at the initial index, | |
544 * or U_SENTINEL (-1) if it is out of bounds. | |
545 * | |
546 * @stable ICU 3.4 | |
547 */ | |
548 U_STABLE UChar32 U_EXPORT2 | |
549 utext_previous32From(UText *ut, int64_t nativeIndex); | |
550 | |
551 /** | |
552 * Get the current iterator position, which can range from 0 to | |
553 * the length of the text. | |
554 * The position is a native index into the input text, in whatever format it | |
555 * may have (possibly UTF-8 for example), and may not always be the same as | |
556 * the corresponding UChar (UTF-16) index. | |
557 * The returned position will always be aligned to a code point boundary. | |
558 * | |
559 * @param ut the text to be accessed. | |
560 * @return the current index position, in the native units of the text provider
. | |
561 * @stable ICU 3.4 | |
562 */ | |
563 U_STABLE int64_t U_EXPORT2 | |
564 utext_getNativeIndex(const UText *ut); | |
565 | |
566 /** | |
567 * Set the current iteration position to the nearest code point | |
568 * boundary at or preceding the specified index. | |
569 * The index is in the native units of the original input text. | |
570 * If the index is out of range, it will be pinned to be within | |
571 * the range of the input text. | |
572 * <p> | |
573 * It will usually be more efficient to begin an iteration | |
574 * using the functions utext_next32From() or utext_previous32From() | |
575 * rather than utext_setNativeIndex(). | |
576 * <p> | |
577 * Moving the index position to an adjacent character is best done | |
578 * with utext_next32(), utext_previous32() or utext_moveIndex32(). | |
579 * Attempting to do direct arithmetic on the index position is | |
580 * complicated by the fact that the size (in native units) of a | |
581 * character depends on the underlying representation of the character | |
582 * (UTF-8, UTF-16, UTF-32, arbitrary codepage), and is not | |
583 * easily knowable. | |
584 * | |
585 * @param ut the text to be accessed. | |
586 * @param nativeIndex the native unit index of the new iteration position. | |
587 * @stable ICU 3.4 | |
588 */ | |
589 U_STABLE void U_EXPORT2 | |
590 utext_setNativeIndex(UText *ut, int64_t nativeIndex); | |
591 | |
592 /** | |
593 * Move the iterator position by delta code points. The number of code points | |
594 * is a signed number; a negative delta will move the iterator backwards, | |
595 * towards the start of the text. | |
596 * <p> | |
597 * The index is moved by <code>delta</code> code points | |
598 * forward or backward, but no further backward than to 0 and | |
599 * no further forward than to utext_nativeLength(). | |
600 * The resulting index value will be in between 0 and length, inclusive. | |
601 * | |
602 * @param ut the text to be accessed. | |
603 * @param delta the signed number of code points to move the iteration position. | |
604 * @return TRUE if the position could be moved the requested number of positions
while | |
605 * staying within the range [0 - text length]. | |
606 * @stable ICU 3.4 | |
607 */ | |
608 U_STABLE UBool U_EXPORT2 | |
609 utext_moveIndex32(UText *ut, int32_t delta); | |
610 | |
611 /** | |
612 * Get the native index of the character preceding the current position. | |
613 * If the iteration position is already at the start of the text, zero | |
614 * is returned. | |
615 * The value returned is the same as that obtained from the following sequence, | |
616 * but without the side effect of changing the iteration position. | |
617 * | |
618 * \code | |
619 * UText *ut = whatever; | |
620 * ... | |
621 * utext_previous32(ut); | |
622 * utext_getNativeIndex(ut); | |
623 * \endcode | |
624 * | |
625 * This function is most useful during forwards iteration, where it will get the | |
626 * native index of the character most recently returned from utext_next32(). | |
627 * | |
628 * @param ut the text to be accessed | |
629 * @return the native index of the character preceding the current index positi
on, | |
630 * or zero if the current position is at the start of the text. | |
631 * @stable ICU 3.6 | |
632 */ | |
633 U_STABLE int64_t U_EXPORT2 | |
634 utext_getPreviousNativeIndex(UText *ut); | |
635 | |
636 | |
637 /** | |
638 * | |
639 * Extract text from a UText into a UChar buffer. The range of text to be extra
cted | |
640 * is specified in the native indices of the UText provider. These may not nece
ssarily | |
641 * be UTF-16 indices. | |
642 * <p> | |
643 * The size (number of 16 bit UChars) of the data to be extracted is returned.
The | |
644 * full number of UChars is returned, even when the extracted text is truncated | |
645 * because the specified buffer size is too small. | |
646 * <p> | |
647 * The extracted string will (if you are a user) / must (if you are a text provi
der) | |
648 * be NUL-terminated if there is sufficient space in the destination buffer. Th
is | |
649 * terminating NUL is not included in the returned length. | |
650 * <p> | |
651 * The iteration index is left at the position following the last extracted char
acter. | |
652 * | |
653 * @param ut the UText from which to extract data. | |
654 * @param nativeStart the native index of the first character to extract.\ | |
655 * If the specified index is out of range, | |
656 * it will be pinned to be within 0 <= index <= textLength | |
657 * @param nativeLimit the native string index of the position following the las
t | |
658 * character to extract. If the specified index is out of range, | |
659 * it will be pinned to be within 0 <= index <= textLength. | |
660 * nativeLimit must be >= nativeStart. | |
661 * @param dest the UChar (UTF-16) buffer into which the extracted text is plac
ed | |
662 * @param destCapacity The size, in UChars, of the destination buffer. May be
zero | |
663 * for precomputing the required size. | |
664 * @param status receives any error status. | |
665 * U_BUFFER_OVERFLOW_ERROR: the extracted text was truncated because the
| |
666 * buffer was too small. Returns number of UChars for preflighting. | |
667 * @return Number of UChars in the data to be extracted. Does not include a tra
iling NUL. | |
668 * | |
669 * @stable ICU 3.4 | |
670 */ | |
671 U_STABLE int32_t U_EXPORT2 | |
672 utext_extract(UText *ut, | |
673 int64_t nativeStart, int64_t nativeLimit, | |
674 UChar *dest, int32_t destCapacity, | |
675 UErrorCode *status); | |
676 | |
677 | |
678 /** | |
679 * Compare two UTexts (binary order). The comparison begins at each source text'
s | |
680 * iteration position. The iteration position of each UText will be left followi
ng | |
681 * the last character compared. | |
682 * | |
683 * The comparison is done in code point order; unlike u_strCompare, you | |
684 * cannot choose to use code unit order. This is because the characters | |
685 * in a UText are accessed one code point at a time, and may not be from a UTF-1
6 | |
686 * context. | |
687 * | |
688 * This function works with strings of different explicitly specified lengths | |
689 * unlike the ANSI C-like u_strcmp() and u_memcmp() etc. | |
690 * A length argument of -1 signifies that as much of the string should be used a
s | |
691 * is necessary to compare with the other string. If both length arguments are -
1, | |
692 * the entire remaining portions of both strings are used. | |
693 * | |
694 * @param s1 First source string. | |
695 * @param length1 Length of first source string in UTF-32 code points. | |
696 * | |
697 * @param s2 Second source string. | |
698 * @param length2 Length of second source string in UTF-32 code points. | |
699 * | |
700 * @return <0 or 0 or >0 as usual for string comparisons | |
701 * | |
702 * @internal ICU 4.4 technology preview | |
703 */ | |
704 U_INTERNAL int32_t U_EXPORT2 | |
705 utext_compare(UText *s1, int32_t length1, | |
706 UText *s2, int32_t length2); | |
707 | |
708 /** | |
709 * Compare two UTexts (binary order). The comparison begins at each source text'
s | |
710 * iteration position. The iteration position of each UText will be left followi
ng | |
711 * the last character compared. This method differs from utext_compare in that | |
712 * it accepts native limits rather than lengths for each string. | |
713 * | |
714 * The comparison is done in code point order; unlike u_strCompare, you | |
715 * cannot choose to use code unit order. This is because the characters | |
716 * in a UText are accessed one code point at a time, and may not be from a UTF-1
6 | |
717 * context. | |
718 * | |
719 * This function works with strings of different explicitly specified lengths | |
720 * unlike the ANSI C-like u_strcmp() and u_memcmp() etc. | |
721 * A limit argument of -1 signifies that as much of the string should be used as | |
722 * is necessary to compare with the other string. If both limit arguments are -1
, | |
723 * the entire remaining portions of both strings are used. | |
724 * | |
725 * @param s1 First source string. | |
726 * @param limit1 Native index of the last character in the first source string t
o be considered. | |
727 * | |
728 * @param s2 Second source string. | |
729 * @param limit2 Native index of the last character in the second source string
to be considered. | |
730 * | |
731 * @return <0 or 0 or >0 as usual for string comparisons | |
732 * | |
733 * @internal ICU 4.4 technology preview | |
734 */ | |
735 U_INTERNAL int32_t U_EXPORT2 | |
736 utext_compareNativeLimit(UText *s1, int64_t limit1, | |
737 UText *s2, int64_t limit2); | |
738 | |
739 /** | |
740 * Compare two UTexts case-insensitively using full case folding. The comparison | |
741 * begins at each source text's iteration position. The iteration position of ea
ch | |
742 * UText will be left following the last character compared. | |
743 * | |
744 * The comparison is done in code point order; this is because the characters | |
745 * in a UText are accessed one code point at a time, and may not be from a UTF-1
6 | |
746 * context. | |
747 * | |
748 * This function works with strings of different explicitly specified lengths | |
749 * unlike the ANSI C-like u_strcmp() and u_memcmp() etc. | |
750 * A length argument of -1 signifies that as much of the string should be used a
s | |
751 * is necessary to compare with the other string. If both length arguments are -
1, | |
752 * the entire remaining portions of both strings are used. | |
753 * | |
754 * @param s1 First source string. | |
755 * @param length1 Length of first source string in UTF-32 code points. | |
756 * | |
757 * @param s2 Second source string. | |
758 * @param length2 Length of second source string in UTF-32 code points. | |
759 * | |
760 * @param options A bit set of options: | |
761 * - U_FOLD_CASE_DEFAULT or 0 is used for default options: | |
762 * Comparison in code point order with default case folding. | |
763 * | |
764 * - U_FOLD_CASE_EXCLUDE_SPECIAL_I | |
765 * | |
766 * @param pErrorCode Must be a valid pointer to an error code value, | |
767 * which must not indicate a failure before the function call. | |
768 * | |
769 * @return <0 or 0 or >0 as usual for string comparisons | |
770 * | |
771 * @internal ICU 4.4 technology preview | |
772 */ | |
773 U_INTERNAL int32_t U_EXPORT2 | |
774 utext_caseCompare(UText *s1, int32_t length1, | |
775 UText *s2, int32_t length2, | |
776 uint32_t options, UErrorCode *pErrorCode); | |
777 | |
778 /** | |
779 * Compare two UTexts case-insensitively using full case folding. The comparison | |
780 * begins at each source text's iteration position. The iteration position of ea
ch | |
781 * UText will be left following the last character compared. This method differs
from | |
782 * utext_caseCompare in that it accepts native limits rather than lengths for ea
ch | |
783 * string. | |
784 * | |
785 * The comparison is done in code point order; this is because the characters | |
786 * in a UText are accessed one code point at a time, and may not be from a UTF-1
6 | |
787 * context. | |
788 * | |
789 * This function works with strings of different explicitly specified lengths | |
790 * unlike the ANSI C-like u_strcmp() and u_memcmp() etc. | |
791 * A limit argument of -1 signifies that as much of the string should be used as | |
792 * is necessary to compare with the other string. If both limit arguments are -
1, | |
793 * the entire remaining portions of both strings are used. | |
794 * | |
795 * @param s1 First source string. | |
796 * @param limit1 Native index of the last character in the first source string t
o be considered. | |
797 * | |
798 * @param s2 Second source string. | |
799 * @param limit2 Native index of the last character in the second source string
to be considered. | |
800 * | |
801 * @param options A bit set of options: | |
802 * - U_FOLD_CASE_DEFAULT or 0 is used for default options: | |
803 * Comparison in code point order with default case folding. | |
804 * | |
805 * - U_FOLD_CASE_EXCLUDE_SPECIAL_I | |
806 * | |
807 * @param pErrorCode Must be a valid pointer to an error code value, | |
808 * which must not indicate a failure before the function call. | |
809 * | |
810 * @return <0 or 0 or >0 as usual for string comparisons | |
811 * | |
812 * @internal ICU 4.4 technology preview | |
813 */ | |
814 U_INTERNAL int32_t U_EXPORT2 | |
815 utext_caseCompareNativeLimit(UText *s1, int64_t limit1, | |
816 UText *s2, int64_t limit2, | |
817 uint32_t options, UErrorCode *pErrorCode); | |
818 | |
819 | |
820 /*******************************************************************************
***** | |
821 * | |
822 * #define inline versions of selected performance-critical text access functio
ns | |
823 * Caution: do not use auto increment++ or decrement-- expressions | |
824 * as parameters to these macros. | |
825 * | |
826 * For most use, where there is no extreme performance constraint, the | |
827 * normal, non-inline functions are a better choice. The resulting cod
e | |
828 * will be smaller, and, if the need ever arises, easier to debug. | |
829 * | |
830 * These are implemented as #defines rather than real functions | |
831 * because there is no fully portable way to do inline functions in pla
in C. | |
832 * | |
833 *******************************************************************************
*****/ | |
834 | |
835 /** | |
836 * inline version of utext_current32(), for performance-critical situations. | |
837 * | |
838 * Get the code point at the current iteration position of the UText. | |
839 * Returns U_SENTINEL (-1) if the position is at the end of the | |
840 * text. | |
841 * | |
841 * The chunk is read directly only when chunkOffset is inside the chunk and the | |
841 * code unit there is below 0xd800; every other case is delegated to | |
841 * utext_current32(). The argument ut is evaluated more than once. | |
841 * | |
842 * @internal ICU 4.4 technology preview | |
843 */ | |
844 #define UTEXT_CURRENT32(ut) \ | |
845 ((ut)->chunkOffset < (ut)->chunkLength && ((ut)->chunkContents)[(ut)->chunkO
ffset]<0xd800 ? \ | |
846 ((ut)->chunkContents)[((ut)->chunkOffset)] : utext_current32(ut)) | |
847 | |
848 /** | |
849 * inline version of utext_next32(), for performance-critical situations. | |
850 * | |
851 * Get the code point at the current iteration position of the UText, and | |
852 * advance the position to the first index following the character. | |
853 * This is a post-increment operation. | |
854 * Returns U_SENTINEL (-1) if the position is at the end of the | |
855 * text. | |
856 * | |
856 * The chunk is read (and chunkOffset incremented) directly only when chunkOffset | |
856 * is inside the chunk and the code unit there is below 0xd800; every other case | |
856 * is delegated to utext_next32(). The argument ut is evaluated more than once. | |
856 * | |
857 * @stable ICU 3.4 | |
858 */ | |
859 #define UTEXT_NEXT32(ut) \ | |
860 ((ut)->chunkOffset < (ut)->chunkLength && ((ut)->chunkContents)[(ut)->chunkO
ffset]<0xd800 ? \ | |
861 ((ut)->chunkContents)[((ut)->chunkOffset)++] : utext_next32(ut)) | |
862 | |
863 /** | |
864 * inline version of utext_previous32(), for performance-critical situations. | |
865 * | |
866 * Move the iterator position to the character (code point) whose | |
867 * index precedes the current position, and return that character. | |
868 * This is a pre-decrement operation. | |
869 * Returns U_SENTINEL (-1) if the position is at the start of the text. | |
870 * | |
870 * The chunk is read (and chunkOffset decremented) directly only when chunkOffset | |
870 * is greater than 0 and the preceding code unit is below 0xd800; every other | |
870 * case is delegated to utext_previous32(). The argument ut is evaluated more | |
870 * than once. | |
870 * | |
871 * @stable ICU 3.4 | |
872 */ | |
873 #define UTEXT_PREVIOUS32(ut) \ | |
874 ((ut)->chunkOffset > 0 && \ | |
875 (ut)->chunkContents[(ut)->chunkOffset-1] < 0xd800 ? \ | |
876 (ut)->chunkContents[--((ut)->chunkOffset)] : utext_previous32(ut)) | |
877 | |
878 /** | |
879 * inline version of utext_getNativeIndex(), for performance-critical situatio
ns. | |
880 * | |
881 * Get the current iterator position, which can range from 0 to | |
882 * the length of the text. | |
883 * The position is a native index into the input text, in whatever format it | |
884 * may have (possibly UTF-8 for example), and may not always be the same as | |
885 * the corresponding UChar (UTF-16) index. | |
886 * The returned position will always be aligned to a code point boundary. | |
887 * | |
887 * While chunkOffset does not exceed nativeIndexingLimit the result is computed | |
887 * directly as chunkNativeStart+chunkOffset; otherwise the provider's | |
887 * mapOffsetToNative() function is called. The argument ut is evaluated more | |
887 * than once. | |
887 * | |
888 * @stable ICU 3.6 | |
889 */ | |
890 #define UTEXT_GETNATIVEINDEX(ut) \ | |
891 ((ut)->chunkOffset <= (ut)->nativeIndexingLimit? \ | |
892 (ut)->chunkNativeStart+(ut)->chunkOffset : \ | |
893 (ut)->pFuncs->mapOffsetToNative(ut)) | |
894 | |
895 /** | |
896 * inline version of utext_setNativeIndex(), for performance-critical situatio
ns. | |
897 * | |
898 * Set the current iteration position to the nearest code point | |
899 * boundary at or preceding the specified index. | |
900 * The index is in the native units of the original input text. | |
901 * If the index is out of range, it will be pinned to be within | |
902 * the range of the input text. | |
903 * | |
903 * Unlike the other inline macros, this one expands to a braced statement | |
903 * block rather than an expression, so it cannot be used where a value is | |
903 * expected. When the index falls within the directly indexable part of the | |
903 * current chunk only chunkOffset is updated; otherwise utext_setNativeIndex() | |
903 * is called. The arguments ut and ix may each be evaluated more than once. | |
903 * | |
904 * @stable ICU 3.8 | |
905 */ | |
906 #define UTEXT_SETNATIVEINDEX(ut, ix) \ | |
907 { int64_t __offset = (ix) - (ut)->chunkNativeStart; \ | |
908 if (__offset>=0 && __offset<=(int64_t)(ut)->nativeIndexingLimit) { \ | |
909 (ut)->chunkOffset=(int32_t)__offset; \ | |
910 } else { \ | |
911 utext_setNativeIndex((ut), (ix)); } } | |
912 | |
913 | |
914 | |
915 /*******************************************************************************
***** | |
916 * | |
917 * Functions related to writing or modifying the text. | |
918 * These will work only with modifiable UTexts. Attempting to | |
919 * modify a read-only UText will return an error status. | |
920 * | |
921 *******************************************************************************
*****/ | |
922 | |
923 | |
924 /** | |
925 * Return TRUE if the text can be written (modified) with utext_replace() or | |
926 * utext_copy(). For the text to be writable, the text provider must | |
927 * be of a type that supports writing and the UText must not be frozen. | |
928 * | |
929 * Attempting to modify text when utext_isWritable() is FALSE will fail - | |
930 * the text will not be modified, and an error will be returned from the functi
on | |
931 * that attempted the modification. | |
932 * | |
933 * @param ut the UText to be tested. | |
934 * @return TRUE if the text is modifiable. | |
935 * | |
936 * @see utext_freeze() | |
937 * @see utext_replace() | |
938 * @see utext_copy() | |
939 * @stable ICU 3.4 | |
940 * | |
941 */ | |
942 U_STABLE UBool U_EXPORT2 | |
943 utext_isWritable(const UText *ut); | |
944 | |
945 | |
946 /** | |
947 * Test whether there is meta data associated with the text. | |
948 * @see Replaceable::hasMetaData() | |
949 * | |
950 * @param ut The UText to be tested | |
951 * @return TRUE if the underlying text includes meta data. | |
952 * @stable ICU 3.4 | |
953 */ | |
954 U_STABLE UBool U_EXPORT2 | |
955 utext_hasMetaData(const UText *ut); | |
956 | |
957 | |
958 /** | |
959 * Replace a range of the original text with a replacement text. | |
960 * | |
961 * Leaves the current iteration position at the position following the | |
962 * newly inserted replacement text. | |
963 * | |
964 * This function is only available on UText types that support writing, | |
965 * that is, ones where utext_isWritable() returns TRUE. | |
966 * | |
967 * When using this function, there should be only a single UText opened onto the | |
968 * underlying native text string. Behavior after a replace operation | |
969 * on a UText is undefined for any other additional UTexts that refer to the | |
970 * modified string. | |
971 * | |
972 * @param ut the UText representing the text to be operated on. | |
973 * @param nativeStart the native index of the start of the region to be rep
laced | |
974 * @param nativeLimit the native index of the character following the regio
n to be replaced. | |
975 * @param replacementText pointer to the replacement text | |
976 * @param replacementLength length of the replacement text in UChars, or -1 if the text is
NUL terminated. | |
977 * @param status receives any error status. Possible errors include | |
978 * U_NO_WRITE_PERMISSION | |
979 * | |
980 * @return The signed number of (native) storage units by which | |
981 * the length of the text expanded or contracted. | |
982 * | |
983 * @stable ICU 3.4 | |
984 */ | |
985 U_STABLE int32_t U_EXPORT2 | |
986 utext_replace(UText *ut, | |
987 int64_t nativeStart, int64_t nativeLimit, | |
988 const UChar *replacementText, int32_t replacementLength, | |
989 UErrorCode *status); | |
990 | |
991 | |
992 | |
993 /** | |
994 * | |
995 * Copy or move a substring from one position to another within the text, | |
996 * while retaining any metadata associated with the text. | |
997 * This function is used to duplicate or reorder substrings. | |
998 * The destination index must not overlap the source range. | |
999 * | |
1000 * The text to be copied or moved is inserted at destIndex; | |
1001 * it does not replace or overwrite any existing text. | |
1002 * | |
1003 * The iteration position is left following the newly inserted text | |
1004 * at the destination position. | |
1005 * | |
1006 * This function is only available on UText types that support writing, | |
1007 * that is, ones where utext_isWritable() returns TRUE. | |
1008 * | |
1009 * When using this function, there should be only a single UText opened onto the | |
1010 * underlying native text string. Behavior after a copy operation | |
1011 * on a UText is undefined in any other additional UTexts that refer to the | |
1012 * modified string. | |
1013 * | |
1014 * @param ut The UText representing the text to be operated on. | |
1015 * @param nativeStart The native index of the start of the region to be copied
or moved | |
1016 * @param nativeLimit The native index of the character position following the
region | |
1017 * to be copied. | |
1018 * @param destIndex The native destination index to which the source substrin
g is | |
1019 * copied or moved. | |
1020 * @param move If TRUE, then the substring is moved, not copied/duplicat
ed. | |
1021 * @param status receives any error status. Possible errors include U_NO_
WRITE_PERMISSION | |
1022 * | |
1023 * @stable ICU 3.4 | |
1024 */ | |
1025 U_STABLE void U_EXPORT2 | |
1026 utext_copy(UText *ut, | |
1027 int64_t nativeStart, int64_t nativeLimit, | |
1028 int64_t destIndex, | |
1029 UBool move, | |
1030 UErrorCode *status); | |
1031 | |
1032 | |
1033 /** | |
1034 * <p> | |
1035 * Freeze a UText. This prevents any modification to the underlying text itse
lf | |
1036 * by means of functions operating on this UText. | |
1037 * </p> | |
1038 * <p> | |
1039 * Once frozen, a UText can not be unfrozen. The intent is to ensure | |
1040 * that the text underlying a frozen UText wrapper cannot be modified via th
at UText. | |
1041 * </p> | |
1042 * <p> | |
1043 * Caution: freezing a UText will disable changes made via the specific | |
1044 * frozen UText wrapper only; it will not have any effect on the ability to | |
1045 * directly modify the text by bypassing the UText. Any such backdoor modifi
cations | |
1046 * are always an error while UText access is occurring because the underlying | |
1047 * text can get out of sync with UText's buffering. | |
1048 * </p> | |
1049 * | |
1050 * @param ut The UText to be frozen. | |
1051 * @see utext_isWritable() | |
1052 * @stable ICU 3.6 | |
1053 */ | |
1054 U_STABLE void U_EXPORT2 | |
1055 utext_freeze(UText *ut); | |
1056 | |
1057 | |
1058 /** | |
1059 * UText provider properties (bit field indexes). | |
1060 * | |
1061 * @see UText | |
1062 * @stable ICU 3.4 | |
1063 */ | |
1064 enum { | |
1065 /** | |
1066 * It is potentially time consuming for the provider to determine the length
of the text. | |
1067 * @stable ICU 3.4 | |
1068 */ | |
1069 UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE = 1, | |
1070 /** | |
1071 * Text chunks remain valid and usable until the text object is modified or | |
1072 * deleted, not just until the next time the access() function is called | |
1073 * (which is the default). | |
1074 * @stable ICU 3.4 | |
1075 */ | |
1076 UTEXT_PROVIDER_STABLE_CHUNKS = 2, | |
1077 /** | |
1078 * The provider supports modifying the text via the replace() and copy() | |
1079 * functions. | |
1080 * @see Replaceable | |
1081 * @stable ICU 3.4 | |
1082 */ | |
1083 UTEXT_PROVIDER_WRITABLE = 3, | |
1084 /** | |
1085 * There is meta data associated with the text. | |
1086 * @see Replaceable::hasMetaData() | |
1087 * @stable ICU 3.4 | |
1088 */ | |
1089 UTEXT_PROVIDER_HAS_META_DATA = 4, | |
1090 /** | |
1091 * Text provider owns the text storage. | |
1092 * Generally occurs as the result of a deep clone of the UText. | |
1093 * When closing the UText, the associated text must | |
1094 * also be closed/deleted/freed/ whatever is appropriate. | |
1095 * @stable ICU 3.6 | |
1096 */ | |
1097 UTEXT_PROVIDER_OWNS_TEXT = 5 | |
1098 }; | |
1099 | |
1100 /** | |
1101 * Function type declaration for UText.clone(). | |
1102 * | |
1103 * clone a UText. Much like opening a UText where the source text is itself | |
1104 * another UText. | |
1105 * | |
1106 * A deep clone will copy both the UText data structures and the underlying te
xt. | |
1107 * The original and cloned UText will operate completely independently; modifi
cations | |
1108 * made to the text in one will not affect the other. Text providers are not | |
1109 * required to support deep clones. The user of clone() must check the status
return | |
1110 * and be prepared to handle failures. | |
1111 * | |
1112 * A shallow clone replicates only the UText data structures; it does not make | |
1113 * a copy of the underlying text. Shallow clones can be used as an efficient
way to | |
1114 * have multiple iterators active in a single text string that is not being | |
1115 * modified. | |
1116 * | |
1117 * A shallow clone operation must not fail except for truly exceptional condit
ions such | |
1118 * as memory allocation failures. | |
1119 * | |
1120 * A UText and its clone may be safely concurrently accessed by separate threa
ds. | |
1121 * This is true for both shallow and deep clones. | |
1122 * It is the responsibility of the Text Provider to ensure that this thread sa
fety | |
1123 * constraint is met. | |
1124 * | |
1125 * | |
1126 * @param dest A UText struct to be filled in with the result of the clone o
peration, | |
1127 * or NULL if the clone function should heap-allocate a new UTex
t struct. | |
1128 * @param src The UText to be cloned. | |
1129 * @param deep TRUE to request a deep clone, FALSE for a shallow clone. | |
1130 * @param status Errors are returned here. For deep clones, U_UNSUPPORTED_ERR
OR | |
1131 * should be returned if the text provider is unable to clone th
e | |
1132 * original text. | |
1133 * @return The newly created clone, or NULL if the clone operation faile
d. | |
1134 * | |
1135 * @stable ICU 3.4 | |
1136 */ | |
1137 typedef UText * U_CALLCONV | |
1138 UTextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status); | |
1139 | |
1140 | |
1141 /** | |
1142 * Function type declaration for UText.nativeLength(). | |
1143 * | |
1144 * @param ut the UText to get the length of. | |
1145 * @return the length, in the native units of the original text string. | |
1146 * @see UText | |
1147 * @stable ICU 3.4 | |
1148 */ | |
1149 typedef int64_t U_CALLCONV | |
1150 UTextNativeLength(UText *ut); | |
1151 | |
1152 /** | |
1153 * Function type declaration for UText.access(). Get the description of the tex
t chunk | |
1154 * containing the text at a requested native index. The UText's iteration | |
1155 * position will be left at the requested index. If the index is out | |
1156 * of bounds, the iteration position will be left at the start or end | |
1157 * of the string, as appropriate. | |
1158 * | |
1159 * Chunks must begin and end on code point boundaries. A single code point | |
1160 * comprised of multiple storage units must never span a chunk boundary. | |
1161 * | |
1162 * | |
1163 * @param ut the UText being accessed. | |
1164 * @param nativeIndex Requested index of the text to be accessed. | |
1165 * @param forward If TRUE, then the returned chunk must contain text | |
1166 * starting from the index, so that start<=index<limit. | |
1167 * If FALSE, then the returned chunk must contain text | |
1168 * before the index, so that start<index<=limit. | |
1169 * @return True if the requested index could be accessed. The chunk | |
1170 * will contain the requested text. | |
1171 * False value if a chunk cannot be accessed | |
1172 * (the requested index is out of bounds). | |
1173 * | |
1174 * @see UText | |
1175 * @stable ICU 3.4 | |
1176 */ | |
1177 typedef UBool U_CALLCONV | |
1178 UTextAccess(UText *ut, int64_t nativeIndex, UBool forward); | |
1179 | |
1180 /** | |
1181 * Function type declaration for UText.extract(). | |
1182 * | |
1183 * Extract text from a UText into a UChar buffer. The range of text to be extra
cted | |
1184 * is specified in the native indices of the UText provider. These may not nece
ssarily | |
1185 * be UTF-16 indices. | |
1186 * <p> | |
1187 * The size (number of 16 bit UChars) in the data to be extracted is returned.
The | |
1188 * full amount is returned, even when the specified buffer size is smaller. | |
1189 * <p> | |
1190 * The extracted string will (if you are a user) / must (if you are a text provi
der) | |
1191 * be NUL-terminated if there is sufficient space in the destination buffer. | |
1192 * | |
1193 * @param ut the UText from which to extract data. | |
1194 * @param nativeStart the native index of the first character to extract. | |
1195 * @param nativeLimit the native string index of the position following the l
ast | |
1196 * character to extract. | |
1197 * @param dest the UChar (UTF-16) buffer into which the extracted text
is placed | |
1198 * @param destCapacity The size, in UChars, of the destination buffer. May be
zero | |
1199 * for precomputing the required size. | |
1200 * @param status receives any error status. | |
1201 * If U_BUFFER_OVERFLOW_ERROR: Returns number of UChars fo
r | |
1202 * preflighting. | |
1203 * @return Number of UChars in the data. Does not include a trailing NUL. | |
1204 * | |
1205 * @stable ICU 3.4 | |
1206 */ | |
1207 typedef int32_t U_CALLCONV | |
1208 UTextExtract(UText *ut, | |
1209 int64_t nativeStart, int64_t nativeLimit, | |
1210 UChar *dest, int32_t destCapacity, | |
1211 UErrorCode *status); | |
1212 | |
1213 /** | |
1214 * Function type declaration for UText.replace(). | |
1215 * | |
1216 * Replace a range of the original text with a replacement text. | |
1217 * | |
1218 * Leaves the current iteration position at the position following the | |
1219 * newly inserted replacement text. | |
1220 * | |
1221 * This function need only be implemented on UText types that support writing. | |
1222 * | |
1223 * When using this function, there should be only a single UText opened onto the | |
1224 * underlying native text string. The function is responsible for updating the | |
1225 * text chunk within the UText to reflect the updated iteration position, | |
1226 * taking into account any changes to the underlying string's structure caused | |
1227 * by the replace operation. | |
1228 * | |
1229 * @param ut the UText representing the text to be operated on. | |
1230 * @param nativeStart the index of the start of the region to be replaced | |
1231 * @param nativeLimit the index of the character following the region to be
replaced. | |
1232 * @param replacementText pointer to the replacement text | |
1233 * @param replacmentLength length of the replacement text in UChars, or -1 if th
e text is NUL terminated. | |
1234 * @param status receives any error status. Possible errors include | |
1235 * U_NO_WRITE_PERMISSION | |
1236 * | |
1237 * @return The signed number of (native) storage units by which | |
1238 * the length of the text expanded or contracted. | |
1239 * | |
1240 * @stable ICU 3.4 | |
1241 */ | |
1242 typedef int32_t U_CALLCONV | |
1243 UTextReplace(UText *ut, | |
1244 int64_t nativeStart, int64_t nativeLimit, | |
1245 const UChar *replacementText, int32_t replacmentLength, | |
1246 UErrorCode *status); | |
1247 | |
1248 /** | |
1249 * Function type declaration for UText.copy(). | |
1250 * | |
1251 * Copy or move a substring from one position to another within the text, | |
1252 * while retaining any metadata associated with the text. | |
1253 * This function is used to duplicate or reorder substrings. | |
1254 * The destination index must not overlap the source range. | |
1255 * | |
1256 * The text to be copied or moved is inserted at destIndex; | |
1257 * it does not replace or overwrite any existing text. | |
1258 * | |
1259 * This function need only be implemented for UText types that support writing. | |
1260 * | |
1261 * When using this function, there should be only a single UText opened onto the | |
1262 * underlying native text string. The function is responsible for updating the | |
1263 * text chunk within the UText to reflect the updated iteration position, | |
1264 * taking into account any changes to the underlying string's structure caused | |
1265 * by the copy or move operation. | |
1266 * | |
1267 * @param ut The UText representing the text to be operated on. | |
1268 * @param nativeStart The index of the start of the region to be copied or move
d | |
1269 * @param nativeLimit The index of the character following the region to be cop
ied or moved. | |
1270 * @param nativeDest The destination index to which the source substring is co
pied or moved. | |
1271 * @param move If TRUE, then the substring is moved, not copied/duplicat
ed. | |
1272 * @param status receives any error status. Possible errors include U_NO_
WRITE_PERMISSION | |
1273 * | |
1274 * @stable ICU 3.4 | |
1275 */ | |
1276 typedef void U_CALLCONV | |
1277 UTextCopy(UText *ut, | |
1278 int64_t nativeStart, int64_t nativeLimit, | |
1279 int64_t nativeDest, | |
1280 UBool move, | |
1281 UErrorCode *status); | |
1282 | |
1283 /** | |
1284 * Function type declaration for UText.mapOffsetToNative(). | |
1285 * Map from the current UChar offset within the current text chunk to | |
1286 * the corresponding native index in the original source text. | |
1287 * | |
1288 * This is required only for text providers that do not use native UTF-16 indexe
s. | |
1289 * | |
1290 * @param ut the UText. | |
1291 * @return Absolute (native) index corresponding to chunkOffset in the current c
hunk. | |
1292 * The returned native index should always be to a code point boundary. | |
1293 * | |
1294 * @stable ICU 3.4 | |
1295 */ | |
1296 typedef int64_t U_CALLCONV | |
1297 UTextMapOffsetToNative(const UText *ut); | |
1298 | |
1299 /** | |
1300 * Function type declaration for UText.mapNativeIndexToUTF16(). | |
1301 * Map from a native index to a UChar offset within a text chunk. | |
1302 * Behavior is undefined if the native index does not fall within the | |
1303 * current chunk. | |
1304 * | |
1305 * This function is required only for text providers that do not use native UTF-
16 indexes. | |
1306 * | |
1307 * @param ut The UText containing the text chunk. | |
1308 * @param nativeIndex Absolute (native) text index, chunk->start<=index<=chunk->
limit. | |
1309 * @return Chunk-relative UTF-16 offset corresponding to the specifie
d native | |
1310 * index. | |
1311 * | |
1312 * @stable ICU 3.4 | |
1313 */ | |
1314 typedef int32_t U_CALLCONV | |
1315 UTextMapNativeIndexToUTF16(const UText *ut, int64_t nativeIndex); | |
1316 | |
1317 | |
1318 /** | |
1319 * Function type declaration for UText.close(). | |
1320 * | |
1321 * A Text Provider close function is only required for provider types that make | |
1322 * allocations in their open function (or other functions) that must be | |
1323 * cleaned when the UText is closed. | |
1324 * | |
1325 * The allocation of the UText struct itself and any "extra" storage | |
1326 * associated with the UText is handled by the common UText implementation | |
1327 * and does not require provider specific cleanup in a close function. | |
1328 * | |
1329 * Most UText provider implementations do not need to implement this function. | |
1330 * | |
1331 * @param ut A UText object to be closed. | |
1332 * | |
1333 * @stable ICU 3.4 | |
1334 */ | |
1335 typedef void U_CALLCONV | |
1336 UTextClose(UText *ut); | |
1337 | |
1338 | |
1339 /** | |
1340 * (public) Function dispatch table for UText. | |
1341 * Conceptually very much like a C++ Virtual Function Table. | |
1342 * This struct defines the organization of the table. | |
1343 * Each text provider implementation must provide an | |
1344 * actual table that is initialized with the appropriate functions | |
1345 * for the type of text being handled. | |
1346 * @stable ICU 3.6 | |
1347 */ | |
1348 struct UTextFuncs { | |
1349 /** | |
1350 * (public) Function table size, sizeof(UTextFuncs) | |
1351 * Intended for use should the table grow to accommodate added | |
1352 * functions in the future, to allow tests for older format | |
1353 * function tables that do not contain the extensions. | |
1354 * | |
1355 * Fields are placed for optimal alignment on | |
1356 * 32/64/128-bit-pointer machines, by normally grouping together | |
1357 * 4 32-bit fields, | |
1358 * 4 pointers, | |
1359 * 2 64-bit fields | |
1360 * in sequence. | |
1361 * @stable ICU 3.6 | |
1362 */ | |
1363 int32_t tableSize; | |
1364 | |
1365 /** | |
1366 * (private) Alignment padding. | |
1367 * Do not use, reserved for use by the UText framework only. | |
1368 * @internal | |
1369 */ | |
1370 int32_t reserved1, /** @internal */ reserved2, /** @internal */ reserv
ed3; | |
1371 | |
1372 | |
1373 /** | |
1374 * (public) Function pointer for UTextClone | |
1375 * | |
1376 * @see UTextClone | |
1377 * @stable ICU 3.6 | |
1378 */ | |
1379 UTextClone *clone; | |
1380 | |
1381 /** | |
1382 * (public) function pointer for UTextNativeLength | |
1383 * May be expensive to compute! | |
1384 * | |
1385 * @see UTextNativeLength | |
1386 * @stable ICU 3.6 | |
1387 */ | |
1388 UTextNativeLength *nativeLength; | |
1389 | |
1390 /** | |
1391 * (public) Function pointer for UTextAccess. | |
1392 * | |
1393 * @see UTextAccess | |
1394 * @stable ICU 3.6 | |
1395 */ | |
1396 UTextAccess *access; | |
1397 | |
1398 /** | |
1399 * (public) Function pointer for UTextExtract. | |
1400 * | |
1401 * @see UTextExtract | |
1402 * @stable ICU 3.6 | |
1403 */ | |
1404 UTextExtract *extract; | |
1405 | |
1406 /** | |
1407 * (public) Function pointer for UTextReplace. | |
1408 * | |
1409 * @see UTextReplace | |
1410 * @stable ICU 3.6 | |
1411 */ | |
1412 UTextReplace *replace; | |
1413 | |
1414 /** | |
1415 * (public) Function pointer for UTextCopy. | |
1416 * | |
1417 * @see UTextCopy | |
1418 * @stable ICU 3.6 | |
1419 */ | |
1420 UTextCopy *copy; | |
1421 | |
1422 /** | |
1423 * (public) Function pointer for UTextMapOffsetToNative. | |
1424 * | |
1425 * @see UTextMapOffsetToNative | |
1426 * @stable ICU 3.6 | |
1427 */ | |
1428 UTextMapOffsetToNative *mapOffsetToNative; | |
1429 | |
1430 /** | |
1431 * (public) Function pointer for UTextMapNativeIndexToUTF16. | |
1432 * | |
1433 * @see UTextMapNativeIndexToUTF16 | |
1434 * @stable ICU 3.6 | |
1435 */ | |
1436 UTextMapNativeIndexToUTF16 *mapNativeIndexToUTF16; | |
1437 | |
1438 /** | |
1439 * (public) Function pointer for UTextClose. | |
1440 * | |
1441 * @see UTextClose | |
1442 * @stable ICU 3.6 | |
1443 */ | |
1444 UTextClose *close; | |
1445 | |
1446 /** | |
1447 * (private) Spare function pointer | |
1448 * @internal | |
1449 */ | |
1450 UTextClose *spare1; | |
1451 | |
1452 /** | |
1453 * (private) Spare function pointer | |
1454 * @internal | |
1455 */ | |
1456 UTextClose *spare2; | |
1457 | |
1458 /** | |
1459 * (private) Spare function pointer | |
1460 * @internal | |
1461 */ | |
1462 UTextClose *spare3; | |
1463 | |
1464 }; | |
1465 /** | |
1466 * Function dispatch table for UText | |
1467 * @see UTextFuncs | |
1468 */ | |
1469 typedef struct UTextFuncs UTextFuncs; | |
1470 | |
1471 /** | |
1472 * UText struct. Provides the interface between the generic UText access cod
e | |
1473 * and the UText provider code that works on specific kinds of | |
1474 * text (UTF-8, noncontiguous UTF-16, whatever.) | |
1475 * | |
1476 * Applications that are using predefined types of text provid
ers | |
1477 * to pass text data to ICU services will have no need to view
the | |
1478 * internals of the UText structs that they open. | |
1479 * | |
1480 * @stable ICU 3.6 | |
1481 */ | |
1482 struct UText { | |
1483 /** | |
1484 * (private) Magic. Used to help detect when UText functions are hande
d | |
1485 * invalid or uninitialized UText structs. | |
1486 * utext_openXYZ() functions take an initialized, | |
1487 * but not necessarily open, UText struct as an | |
1488 * optional fill-in parameter. This magic field | |
1489 * is used to check for that initialization. | |
1490 * Text provider close functions must NOT clear | |
1491 * the magic field because that would prevent | |
1492 * reuse of the UText struct. | |
1493 * @internal | |
1494 */ | |
1495 uint32_t magic; | |
1496 | |
1497 | |
1498 /** | |
1499 * (private) Flags for managing the allocation and freeing of | |
1500 * memory associated with this UText. | |
1501 * @internal | |
1502 */ | |
1503 int32_t flags; | |
1504 | |
1505 | |
1506 /** | |
1507 * Text provider properties. This set of flags is maintained by the | |
1508 * text provider implementation. | |
1509 * @stable ICU 3.4 | |
1510 */ | |
1511 int32_t providerProperties; | |
1512 | |
1513 /** | |
1514 * (public) sizeOfStruct=sizeof(UText) | |
1515 * Allows possible backward compatible extension. | |
1516 * | |
1517 * @stable ICU 3.4 | |
1518 */ | |
1519 int32_t sizeOfStruct; | |
1520 | |
1521 /* ------ 16 byte alignment boundary ----------- */ | |
1522 | |
1523 | |
1524 /** | |
1525 * (protected) Native index of the first character position following | |
1526 * the current chunk. | |
1527 * @stable ICU 3.6 | |
1528 */ | |
1529 int64_t chunkNativeLimit; | |
1530 | |
1531 /** | |
1532 * (protected) Size in bytes of the extra space (pExtra). | |
1533 * @stable ICU 3.4 | |
1534 */ | |
1535 int32_t extraSize; | |
1536 | |
1537 /** | |
1538 * (protected) The highest chunk offset where native indexing and | |
1539 * chunk (UTF-16) indexing correspond. For UTF-16 sources, value | |
1540 * will be equal to chunkLength. | |
1541 * | |
1542 * @stable ICU 3.6 | |
1543 */ | |
1544 int32_t nativeIndexingLimit; | |
1545 | |
1546 /* ---- 16 byte alignment boundary------ */ | |
1547 | |
1548 /** | |
1549 * (protected) Native index of the first character in the text chunk. | |
1550 * @stable ICU 3.6 | |
1551 */ | |
1552 int64_t chunkNativeStart; | |
1553 | |
1554 /** | |
1555 * (protected) Current iteration position within the text chunk (UTF-16 buf
fer). | |
1556 * This is the index to the character that will be returned by utext_next32
(). | |
1557 * @stable ICU 3.6 | |
1558 */ | |
1559 int32_t chunkOffset; | |
1560 | |
1561 /** | |
1562 * (protected) Length of the text chunk (UTF-16 buffer), in UChars. | |
1563 * @stable ICU 3.6 | |
1564 */ | |
1565 int32_t chunkLength; | |
1566 | |
1567 /* ---- 16 byte alignment boundary-- */ | |
1568 | |
1569 | |
1570 /** | |
1571 * (protected) pointer to a chunk of text in UTF-16 format. | |
1572 * May refer either to original storage of the source of the text, or | |
1573 * if conversion was required, to a buffer owned by the UText. | |
1574 * @stable ICU 3.6 | |
1575 */ | |
1576 const UChar *chunkContents; | |
1577 | |
1578 /** | |
1579 * (public) Pointer to Dispatch table for accessing functions for this UText. | |
1580 * @stable ICU 3.6 | |
1581 */ | |
1582 const UTextFuncs *pFuncs; | |
1583 | |
1584 /** | |
1585 * (protected) Pointer to additional space requested by the | |
1586 * text provider during the utext_open operation. | |
1587 * @stable ICU 3.4 | |
1588 */ | |
1589 void *pExtra; | |
1590 | |
1591 /** | |
1592 * (protected) Pointer to string or text-containing object or similar. | |
1593 * This is the source of the text that this UText is wrapping, in a format | |
1594 * that is known to the text provider functions. | |
1595 * @stable ICU 3.4 | |
1596 */ | |
1597 const void *context; | |
1598 | |
1599 /* --- 16 byte alignment boundary--- */ | |
1600 | |
1601 /** | |
1602 * (protected) Pointer fields available for use by the text provider. | |
1603 * Not used by UText common code. | |
1604 * @stable ICU 3.6 | |
1605 */ | |
1606 const void *p; | |
1607 /** | |
1608 * (protected) Pointer fields available for use by the text provider. | |
1609 * Not used by UText common code. | |
1610 * @stable ICU 3.6 | |
1611 */ | |
1612 const void *q; | |
1613 /** | |
1614 * (protected) Pointer fields available for use by the text provider. | |
1615 * Not used by UText common code. | |
1616 * @stable ICU 3.6 | |
1617 */ | |
1618 const void *r; | |
1619 | |
1620 /** | |
1621 * Private field reserved for future use by the UText framework | |
1622 * itself. This is not to be touched by the text providers. | |
1623 * @internal ICU 3.4 | |
1624 */ | |
1625 void *privP; | |
1626 | |
1627 | |
1628 /* --- 16 byte alignment boundary--- */ | |
1629 | |
1630 | |
1631 /** | |
1632 * (protected) Integer field reserved for use by the text provider. | |
1633 * Not used by the UText framework, or by the client (user) of the UText. | |
1634 * @stable ICU 3.4 | |
1635 */ | |
1636 int64_t a; | |
1637 | |
1638 /** | |
1639 * (protected) Integer field reserved for use by the text provider. | |
1640 * Not used by the UText framework, or by the client (user) of the UText. | |
1641 * @stable ICU 3.4 | |
1642 */ | |
1643 int32_t b; | |
1644 | |
1645 /** | |
1646 * (protected) Integer field reserved for use by the text provider. | |
1647 * Not used by the UText framework, or by the client (user) of the UText. | |
1648 * @stable ICU 3.4 | |
1649 */ | |
1650 int32_t c; | |
1651 | |
1652 /* ---- 16 byte alignment boundary---- */ | |
1653 | |
1654 | |
1655 /** | |
1656 * Private field reserved for future use by the UText framework | |
1657 * itself. This is not to be touched by the text providers. | |
1658 * @internal ICU 3.4 | |
1659 */ | |
1660 int64_t privA; | |
1661 /** | |
1662 * Private field reserved for future use by the UText framework | |
1663 * itself. This is not to be touched by the text providers. | |
1664 * @internal ICU 3.4 | |
1665 */ | |
1666 int32_t privB; | |
1667 /** | |
1668 * Private field reserved for future use by the UText framework | |
1669 * itself. This is not to be touched by the text providers. | |
1670 * @internal ICU 3.4 | |
1671 */ | |
1672 int32_t privC; | |
1673 }; | |
1674 | |
1675 | |
1676 /** | |
1677 * Common function for Text Provider implementations to allocate and/or | |
1678 * initialize a UText struct; call it from utext_open() implementations. | |
1679 * If ut is null, a new UText struct is allocated on the heap. | |
1680 * If ut is already open, the provider's close function is called so that | |
1681 * the struct can be reused by the open that is in progress. | |
1682 * | |
1683 * @param ut pointer to a UText struct to be re-used, or null if a new UText | |
1684 * should be allocated. | |
1685 * @param extraSpace The amount of additional space to be allocated as part | |
1686 * of this UText, for use by types of providers that require | |
1687 * additional storage; presumably in bytes, like extraSize. | |
1688 * @param status Errors are returned here. | |
1689 * @return pointer to the UText, allocated if necessary, extra space set up. | |
1690 * @stable ICU 3.4 | |
1691 */ | |
1692 U_STABLE UText * U_EXPORT2 | |
1693 utext_setup(UText *ut, int32_t extraSpace, UErrorCode *status); | |
1694 | |
1695 /** | |
1696 * @internal Value that UTEXT_INITIALIZER stores in the magic slot; it | |
1697 * helps identify correctly initialized UText structs. | |
1698 * Note: must be publicly visible so that UTEXT_INITIALIZER can access it. | |
1699 */ | |
1700 enum { | |
1701 UTEXT_MAGIC = 0x345ad82c | |
1702 }; | |
1703 | |
1704 /** | |
1705 * Initializer to be used with local (stack) instances of a UText struct. | |
1706 * UText structs must be initialized before passing them to one of the | |
1707 * utext_open functions. Each value's comment names the field it sets. | |
1708 * | |
1709 * @stable ICU 3.6 | |
1710 */ | |
1711 #define UTEXT_INITIALIZER { \ | |
1712 UTEXT_MAGIC, /* magic */ \ | |
1713 0, /* flags */ \ | |
1714 0, /* providerProperties */ \ | |
1715 sizeof(UText), /* sizeOfStruct */ \ | |
1716 0, /* chunkNativeLimit */ \ | |
1717 0, /* extraSize */ \ | |
1718 0, /* nativeIndexingLimit */ \ | |
1719 0, /* chunkNativeStart */ \ | |
1720 0, /* chunkOffset */ \ | |
1721 0, /* chunkLength */ \ | |
1722 NULL, /* chunkContents */ \ | |
1723 NULL, /* pFuncs */ \ | |
1724 NULL, /* pExtra */ \ | |
1725 NULL, /* context */ \ | |
1726 NULL, NULL, NULL, /* p, q, r */ \ | |
1727 NULL, /* privP */ \ | |
1728 0, 0, 0, /* a, b, c */ \ | |
1729 0, 0, 0 /* privA, privB, privC */ \ | |
1730 } | |
1731 | |
1732 | |
1733 U_CDECL_END | |
1734 | |
1735 | |
1736 | |
1737 #endif | |
OLD | NEW |