OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ****************************************************************************** |
| 3 * |
| 4 * Copyright (C) 2001-2010, International Business Machines |
| 5 * Corporation and others. All Rights Reserved. |
| 6 * |
| 7 ****************************************************************************** |
| 8 * file name: utrie2_builder.c |
| 9 * encoding: US-ASCII |
| 10 * tab size: 8 (not used) |
| 11 * indentation:4 |
| 12 * |
| 13 * created on: 2008sep26 (split off from utrie2.c) |
| 14 * created by: Markus W. Scherer |
| 15 * |
| 16 * This is a common implementation of a Unicode trie. |
| 17 * It is a kind of compressed, serializable table of 16- or 32-bit values assoc
iated with |
| 18 * Unicode code points (0..0x10ffff). |
| 19 * This is the second common version of a Unicode trie (hence the name UTrie2). |
| 20 * See utrie2.h for a comparison. |
| 21 * |
| 22 * This file contains only the builder code. |
| 23 * See utrie2.c for the runtime and enumeration code. |
| 24 */ |
| 25 #ifdef UTRIE2_DEBUG |
| 26 # include <stdio.h> |
| 27 #endif |
| 28 |
| 29 #include "unicode/utypes.h" |
| 30 #include "cmemory.h" |
| 31 #include "utrie2.h" |
| 32 #include "utrie2_impl.h" |
| 33 |
| 34 #include "utrie.h" /* for utrie2_fromUTrie() and utrie_swap() */ |
| 35 |
| 36 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) |
| 37 |
| 38 /* Implementation notes ----------------------------------------------------- */ |
| 39 |
| 40 /* |
| 41 * The UTRIE2_SHIFT_1, UTRIE2_SHIFT_2, UTRIE2_INDEX_SHIFT and other values |
| 42 * have been chosen to minimize trie sizes overall. |
| 43 * Most of the code is flexible enough to work with a range of values, |
| 44 * within certain limits. |
| 45 * |
| 46 * Exception: Support for separate values for lead surrogate code _units_ |
| 47 * vs. code _points_ was added after the constants were fixed, |
| 48 * and has not been tested nor particularly designed for different constant valu
es. |
| 49 * (Especially the utrie2_enum() code that jumps to the special LSCP index-2 |
| 50 * part and back.) |
| 51 * |
| 52 * Requires UTRIE2_SHIFT_2<=6. Otherwise 0xc0 which is the top of the ASCII-line
ar data |
| 53 * including the bad-UTF-8-data block is not a multiple of UTRIE2_DATA_BLOCK_LEN
GTH |
| 54 * and map[block>>UTRIE2_SHIFT_2] (used in reference counting and compaction |
| 55 * remapping) stops working. |
| 56 * |
| 57 * Requires UTRIE2_SHIFT_1>=10 because utrie2_enumForLeadSurrogate() |
| 58 * assumes that a single index-2 block is used for 0x400 code points |
| 59 * corresponding to one lead surrogate. |
| 60 * |
| 61 * Requires UTRIE2_SHIFT_1<=16. Otherwise one single index-2 block contains |
| 62 * more than one Unicode plane, and the split of the index-2 table into a BMP |
| 63 * part and a supplementary part, with a gap in between, would not work. |
| 64 * |
| 65 * Requires UTRIE2_INDEX_SHIFT>=1 not because of the code but because |
| 66 * there is data with more than 64k distinct values, |
| 67 * for example for Unihan collation with a separate collation weight per |
| 68 * Han character. |
| 69 */ |
| 70 |
| 71 /* Building a trie ----------------------------------------------------------*/ |
| 72 |
| 73 enum { |
| 74 /** The null index-2 block, following the gap in the index-2 table. */ |
| 75 UNEWTRIE2_INDEX_2_NULL_OFFSET=UNEWTRIE2_INDEX_GAP_OFFSET+UNEWTRIE2_INDEX_GAP
_LENGTH, |
| 76 |
| 77 /** The start of allocated index-2 blocks. */ |
| 78 UNEWTRIE2_INDEX_2_START_OFFSET=UNEWTRIE2_INDEX_2_NULL_OFFSET+UTRIE2_INDEX_2_
BLOCK_LENGTH, |
| 79 |
| 80 /** |
| 81 * The null data block. |
| 82 * Length 64=0x40 even if UTRIE2_DATA_BLOCK_LENGTH is smaller, |
| 83 * to work with 6-bit trail bytes from 2-byte UTF-8. |
| 84 */ |
| 85 UNEWTRIE2_DATA_NULL_OFFSET=UTRIE2_DATA_START_OFFSET, |
| 86 |
| 87 /** The start of allocated data blocks. */ |
| 88 UNEWTRIE2_DATA_START_OFFSET=UNEWTRIE2_DATA_NULL_OFFSET+0x40, |
| 89 |
| 90 /** |
| 91 * The start of data blocks for U+0800 and above. |
| 92 * Below, compaction uses a block length of 64 for 2-byte UTF-8. |
| 93 * From here on, compaction uses UTRIE2_DATA_BLOCK_LENGTH. |
| 94 * Data values for 0x780 code points beyond ASCII. |
| 95 */ |
| 96 UNEWTRIE2_DATA_0800_OFFSET=UNEWTRIE2_DATA_START_OFFSET+0x780 |
| 97 }; |
| 98 |
| 99 /* Start with allocation of 16k data entries. */ |
| 100 #define UNEWTRIE2_INITIAL_DATA_LENGTH ((int32_t)1<<14) |
| 101 |
| 102 /* Grow about 8x each time. */ |
| 103 #define UNEWTRIE2_MEDIUM_DATA_LENGTH ((int32_t)1<<17) |
| 104 |
| 105 static int32_t |
| 106 allocIndex2Block(UNewTrie2 *trie); |
| 107 |
| 108 U_CAPI UTrie2 * U_EXPORT2 |
| 109 utrie2_open(uint32_t initialValue, uint32_t errorValue, UErrorCode *pErrorCode)
{ |
| 110 UTrie2 *trie; |
| 111 UNewTrie2 *newTrie; |
| 112 uint32_t *data; |
| 113 int32_t i, j; |
| 114 |
| 115 if(U_FAILURE(*pErrorCode)) { |
| 116 return NULL; |
| 117 } |
| 118 |
| 119 trie=(UTrie2 *)uprv_malloc(sizeof(UTrie2)); |
| 120 newTrie=(UNewTrie2 *)uprv_malloc(sizeof(UNewTrie2)); |
| 121 data=(uint32_t *)uprv_malloc(UNEWTRIE2_INITIAL_DATA_LENGTH*4); |
| 122 if(trie==NULL || newTrie==NULL || data==NULL) { |
| 123 uprv_free(trie); |
| 124 uprv_free(newTrie); |
| 125 uprv_free(data); |
| 126 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; |
| 127 return 0; |
| 128 } |
| 129 |
| 130 uprv_memset(trie, 0, sizeof(UTrie2)); |
| 131 trie->initialValue=initialValue; |
| 132 trie->errorValue=errorValue; |
| 133 trie->highStart=0x110000; |
| 134 trie->newTrie=newTrie; |
| 135 |
| 136 newTrie->data=data; |
| 137 newTrie->dataCapacity=UNEWTRIE2_INITIAL_DATA_LENGTH; |
| 138 newTrie->initialValue=initialValue; |
| 139 newTrie->errorValue=errorValue; |
| 140 newTrie->highStart=0x110000; |
| 141 newTrie->firstFreeBlock=0; /* no free block in the list */ |
| 142 newTrie->isCompacted=FALSE; |
| 143 |
| 144 /* |
| 145 * preallocate and reset |
| 146 * - ASCII |
| 147 * - the bad-UTF-8-data block |
| 148 * - the null data block |
| 149 */ |
| 150 for(i=0; i<0x80; ++i) { |
| 151 newTrie->data[i]=initialValue; |
| 152 } |
| 153 for(; i<0xc0; ++i) { |
| 154 newTrie->data[i]=errorValue; |
| 155 } |
| 156 for(i=UNEWTRIE2_DATA_NULL_OFFSET; i<UNEWTRIE2_DATA_START_OFFSET; ++i) { |
| 157 newTrie->data[i]=initialValue; |
| 158 } |
| 159 newTrie->dataNullOffset=UNEWTRIE2_DATA_NULL_OFFSET; |
| 160 newTrie->dataLength=UNEWTRIE2_DATA_START_OFFSET; |
| 161 |
| 162 /* set the index-2 indexes for the 2=0x80>>UTRIE2_SHIFT_2 ASCII data blocks
*/ |
| 163 for(i=0, j=0; j<0x80; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) { |
| 164 newTrie->index2[i]=j; |
| 165 newTrie->map[i]=1; |
| 166 } |
| 167 /* reference counts for the bad-UTF-8-data block */ |
| 168 for(; j<0xc0; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) { |
| 169 newTrie->map[i]=0; |
| 170 } |
| 171 /* |
| 172 * Reference counts for the null data block: all blocks except for the ASCII
blocks. |
| 173 * Plus 1 so that we don't drop this block during compaction. |
| 174 * Plus as many as needed for lead surrogate code points. |
| 175 */ |
| 176 /* i==newTrie->dataNullOffset */ |
| 177 newTrie->map[i++]= |
| 178 (0x110000>>UTRIE2_SHIFT_2)- |
| 179 (0x80>>UTRIE2_SHIFT_2)+ |
| 180 1+ |
| 181 UTRIE2_LSCP_INDEX_2_LENGTH; |
| 182 j+=UTRIE2_DATA_BLOCK_LENGTH; |
| 183 for(; j<UNEWTRIE2_DATA_START_OFFSET; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) { |
| 184 newTrie->map[i]=0; |
| 185 } |
| 186 |
| 187 /* |
| 188 * set the remaining indexes in the BMP index-2 block |
| 189 * to the null data block |
| 190 */ |
| 191 for(i=0x80>>UTRIE2_SHIFT_2; i<UTRIE2_INDEX_2_BMP_LENGTH; ++i) { |
| 192 newTrie->index2[i]=UNEWTRIE2_DATA_NULL_OFFSET; |
| 193 } |
| 194 |
| 195 /* |
| 196 * Fill the index gap with impossible values so that compaction |
| 197 * does not overlap other index-2 blocks with the gap. |
| 198 */ |
| 199 for(i=0; i<UNEWTRIE2_INDEX_GAP_LENGTH; ++i) { |
| 200 newTrie->index2[UNEWTRIE2_INDEX_GAP_OFFSET+i]=-1; |
| 201 } |
| 202 |
| 203 /* set the indexes in the null index-2 block */ |
| 204 for(i=0; i<UTRIE2_INDEX_2_BLOCK_LENGTH; ++i) { |
| 205 newTrie->index2[UNEWTRIE2_INDEX_2_NULL_OFFSET+i]=UNEWTRIE2_DATA_NULL_OFF
SET; |
| 206 } |
| 207 newTrie->index2NullOffset=UNEWTRIE2_INDEX_2_NULL_OFFSET; |
| 208 newTrie->index2Length=UNEWTRIE2_INDEX_2_START_OFFSET; |
| 209 |
| 210 /* set the index-1 indexes for the linear index-2 block */ |
| 211 for(i=0, j=0; |
| 212 i<UTRIE2_OMITTED_BMP_INDEX_1_LENGTH; |
| 213 ++i, j+=UTRIE2_INDEX_2_BLOCK_LENGTH |
| 214 ) { |
| 215 newTrie->index1[i]=j; |
| 216 } |
| 217 |
| 218 /* set the remaining index-1 indexes to the null index-2 block */ |
| 219 for(; i<UNEWTRIE2_INDEX_1_LENGTH; ++i) { |
| 220 newTrie->index1[i]=UNEWTRIE2_INDEX_2_NULL_OFFSET; |
| 221 } |
| 222 |
| 223 /* |
| 224 * Preallocate and reset data for U+0080..U+07ff, |
| 225 * for 2-byte UTF-8 which will be compacted in 64-blocks |
| 226 * even if UTRIE2_DATA_BLOCK_LENGTH is smaller. |
| 227 */ |
| 228 for(i=0x80; i<0x800; i+=UTRIE2_DATA_BLOCK_LENGTH) { |
| 229 utrie2_set32(trie, i, initialValue, pErrorCode); |
| 230 } |
| 231 |
| 232 return trie; |
| 233 } |
| 234 |
| 235 static UNewTrie2 * |
| 236 cloneBuilder(const UNewTrie2 *other) { |
| 237 UNewTrie2 *trie; |
| 238 |
| 239 trie=(UNewTrie2 *)uprv_malloc(sizeof(UNewTrie2)); |
| 240 if(trie==NULL) { |
| 241 return NULL; |
| 242 } |
| 243 |
| 244 trie->data=(uint32_t *)uprv_malloc(other->dataCapacity*4); |
| 245 if(trie->data==NULL) { |
| 246 uprv_free(trie); |
| 247 return NULL; |
| 248 } |
| 249 trie->dataCapacity=other->dataCapacity; |
| 250 |
| 251 /* clone data */ |
| 252 uprv_memcpy(trie->index1, other->index1, sizeof(trie->index1)); |
| 253 uprv_memcpy(trie->index2, other->index2, other->index2Length*4); |
| 254 trie->index2NullOffset=other->index2NullOffset; |
| 255 trie->index2Length=other->index2Length; |
| 256 |
| 257 uprv_memcpy(trie->data, other->data, other->dataLength*4); |
| 258 trie->dataNullOffset=other->dataNullOffset; |
| 259 trie->dataLength=other->dataLength; |
| 260 |
| 261 /* reference counters */ |
| 262 if(other->isCompacted) { |
| 263 trie->firstFreeBlock=0; |
| 264 } else { |
| 265 uprv_memcpy(trie->map, other->map, (other->dataLength>>UTRIE2_SHIFT_2)*4
); |
| 266 trie->firstFreeBlock=other->firstFreeBlock; |
| 267 } |
| 268 |
| 269 trie->initialValue=other->initialValue; |
| 270 trie->errorValue=other->errorValue; |
| 271 trie->highStart=other->highStart; |
| 272 trie->isCompacted=other->isCompacted; |
| 273 |
| 274 return trie; |
| 275 } |
| 276 |
| 277 U_CAPI UTrie2 * U_EXPORT2 |
| 278 utrie2_clone(const UTrie2 *other, UErrorCode *pErrorCode) { |
| 279 UTrie2 *trie; |
| 280 |
| 281 if(U_FAILURE(*pErrorCode)) { |
| 282 return NULL; |
| 283 } |
| 284 if(other==NULL || (other->memory==NULL && other->newTrie==NULL)) { |
| 285 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| 286 return NULL; |
| 287 } |
| 288 |
| 289 trie=(UTrie2 *)uprv_malloc(sizeof(UTrie2)); |
| 290 if(trie==NULL) { |
| 291 return NULL; |
| 292 } |
| 293 uprv_memcpy(trie, other, sizeof(UTrie2)); |
| 294 |
| 295 if(other->memory!=NULL) { |
| 296 trie->memory=uprv_malloc(other->length); |
| 297 if(trie->memory!=NULL) { |
| 298 trie->isMemoryOwned=TRUE; |
| 299 uprv_memcpy(trie->memory, other->memory, other->length); |
| 300 |
| 301 /* make the clone's pointers point to its own memory */ |
| 302 trie->index=(uint16_t *)trie->memory+(other->index-(uint16_t *)other
->memory); |
| 303 if(other->data16!=NULL) { |
| 304 trie->data16=(uint16_t *)trie->memory+(other->data16-(uint16_t *
)other->memory); |
| 305 } |
| 306 if(other->data32!=NULL) { |
| 307 trie->data32=(uint32_t *)trie->memory+(other->data32-(uint32_t *
)other->memory); |
| 308 } |
| 309 } |
| 310 } else /* other->newTrie!=NULL */ { |
| 311 trie->newTrie=cloneBuilder(other->newTrie); |
| 312 } |
| 313 |
| 314 if(trie->memory==NULL && trie->newTrie==NULL) { |
| 315 uprv_free(trie); |
| 316 trie=NULL; |
| 317 } |
| 318 return trie; |
| 319 } |
| 320 |
| 321 typedef struct NewTrieAndStatus { |
| 322 UTrie2 *trie; |
| 323 UErrorCode errorCode; |
| 324 UBool exclusiveLimit; /* rather than inclusive range end */ |
| 325 } NewTrieAndStatus; |
| 326 |
| 327 static UBool U_CALLCONV |
| 328 copyEnumRange(const void *context, UChar32 start, UChar32 end, uint32_t value) { |
| 329 NewTrieAndStatus *nt=(NewTrieAndStatus *)context; |
| 330 if(value!=nt->trie->initialValue) { |
| 331 if(nt->exclusiveLimit) { |
| 332 --end; |
| 333 } |
| 334 if(start==end) { |
| 335 utrie2_set32(nt->trie, start, value, &nt->errorCode); |
| 336 } else { |
| 337 utrie2_setRange32(nt->trie, start, end, value, TRUE, &nt->errorCode)
; |
| 338 } |
| 339 return U_SUCCESS(nt->errorCode); |
| 340 } else { |
| 341 return TRUE; |
| 342 } |
| 343 } |
| 344 |
| 345 #ifdef UTRIE2_DEBUG |
| 346 static void |
| 347 utrie_printLengths(const UTrie *trie) { |
| 348 long indexLength=trie->indexLength; |
| 349 long dataLength=(long)trie->dataLength; |
| 350 long totalLength=(long)sizeof(UTrieHeader)+indexLength*2+dataLength*(trie->d
ata32!=NULL ? 4 : 2); |
| 351 printf("**UTrieLengths** index:%6ld data:%6ld serialized:%6ld\n", |
| 352 indexLength, dataLength, totalLength); |
| 353 } |
| 354 |
| 355 static void |
| 356 utrie2_printLengths(const UTrie2 *trie, const char *which) { |
| 357 long indexLength=trie->indexLength; |
| 358 long dataLength=(long)trie->dataLength; |
| 359 long totalLength=(long)sizeof(UTrie2Header)+indexLength*2+dataLength*(trie->
data32!=NULL ? 4 : 2); |
| 360 printf("**UTrie2Lengths(%s)** index:%6ld data:%6ld serialized:%6ld\n", |
| 361 which, indexLength, dataLength, totalLength); |
| 362 } |
| 363 #endif |
| 364 |
| 365 U_CAPI UTrie2 * U_EXPORT2 |
| 366 utrie2_cloneAsThawed(const UTrie2 *other, UErrorCode *pErrorCode) { |
| 367 NewTrieAndStatus context; |
| 368 UChar lead; |
| 369 |
| 370 if(U_FAILURE(*pErrorCode)) { |
| 371 return NULL; |
| 372 } |
| 373 if(other==NULL || (other->memory==NULL && other->newTrie==NULL)) { |
| 374 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| 375 return NULL; |
| 376 } |
| 377 if(other->newTrie!=NULL && !other->newTrie->isCompacted) { |
| 378 return utrie2_clone(other, pErrorCode); /* clone an unfrozen trie */ |
| 379 } |
| 380 |
| 381 /* Clone the frozen trie by enumerating it and building a new one. */ |
| 382 context.trie=utrie2_open(other->initialValue, other->errorValue, pErrorCode)
; |
| 383 if(U_FAILURE(*pErrorCode)) { |
| 384 return NULL; |
| 385 } |
| 386 context.exclusiveLimit=FALSE; |
| 387 context.errorCode=*pErrorCode; |
| 388 utrie2_enum(other, NULL, copyEnumRange, &context); |
| 389 *pErrorCode=context.errorCode; |
| 390 for(lead=0xd800; lead<0xdc00; ++lead) { |
| 391 uint32_t value; |
| 392 if(other->data32==NULL) { |
| 393 value=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(other, lead); |
| 394 } else { |
| 395 value=UTRIE2_GET32_FROM_U16_SINGLE_LEAD(other, lead); |
| 396 } |
| 397 if(value!=other->initialValue) { |
| 398 utrie2_set32ForLeadSurrogateCodeUnit(context.trie, lead, value, pErr
orCode); |
| 399 } |
| 400 } |
| 401 if(U_FAILURE(*pErrorCode)) { |
| 402 utrie2_close(context.trie); |
| 403 context.trie=NULL; |
| 404 } |
| 405 return context.trie; |
| 406 } |
| 407 |
| 408 /* Almost the same as utrie2_cloneAsThawed() but copies a UTrie and freezes the
clone. */ |
| 409 U_CAPI UTrie2 * U_EXPORT2 |
| 410 utrie2_fromUTrie(const UTrie *trie1, uint32_t errorValue, UErrorCode *pErrorCode
) { |
| 411 NewTrieAndStatus context; |
| 412 UChar lead; |
| 413 |
| 414 if(U_FAILURE(*pErrorCode)) { |
| 415 return NULL; |
| 416 } |
| 417 if(trie1==NULL) { |
| 418 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| 419 return NULL; |
| 420 } |
| 421 context.trie=utrie2_open(trie1->initialValue, errorValue, pErrorCode); |
| 422 if(U_FAILURE(*pErrorCode)) { |
| 423 return NULL; |
| 424 } |
| 425 context.exclusiveLimit=TRUE; |
| 426 context.errorCode=*pErrorCode; |
| 427 utrie_enum(trie1, NULL, copyEnumRange, &context); |
| 428 *pErrorCode=context.errorCode; |
| 429 for(lead=0xd800; lead<0xdc00; ++lead) { |
| 430 uint32_t value; |
| 431 if(trie1->data32==NULL) { |
| 432 value=UTRIE_GET16_FROM_LEAD(trie1, lead); |
| 433 } else { |
| 434 value=UTRIE_GET32_FROM_LEAD(trie1, lead); |
| 435 } |
| 436 if(value!=trie1->initialValue) { |
| 437 utrie2_set32ForLeadSurrogateCodeUnit(context.trie, lead, value, pErr
orCode); |
| 438 } |
| 439 } |
| 440 if(U_SUCCESS(*pErrorCode)) { |
| 441 utrie2_freeze(context.trie, |
| 442 trie1->data32!=NULL ? UTRIE2_32_VALUE_BITS : UTRIE2_16_VAL
UE_BITS, |
| 443 pErrorCode); |
| 444 } |
| 445 #ifdef UTRIE2_DEBUG |
| 446 if(U_SUCCESS(*pErrorCode)) { |
| 447 utrie_printLengths(trie1); |
| 448 utrie2_printLengths(context.trie, "fromUTrie"); |
| 449 } |
| 450 #endif |
| 451 if(U_FAILURE(*pErrorCode)) { |
| 452 utrie2_close(context.trie); |
| 453 context.trie=NULL; |
| 454 } |
| 455 return context.trie; |
| 456 } |
| 457 |
| 458 static U_INLINE UBool |
| 459 isInNullBlock(UNewTrie2 *trie, UChar32 c, UBool forLSCP) { |
| 460 int32_t i2, block; |
| 461 |
| 462 if(U_IS_LEAD(c) && forLSCP) { |
| 463 i2=(UTRIE2_LSCP_INDEX_2_OFFSET-(0xd800>>UTRIE2_SHIFT_2))+ |
| 464 (c>>UTRIE2_SHIFT_2); |
| 465 } else { |
| 466 i2=trie->index1[c>>UTRIE2_SHIFT_1]+ |
| 467 ((c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK); |
| 468 } |
| 469 block=trie->index2[i2]; |
| 470 return (UBool)(block==trie->dataNullOffset); |
| 471 } |
| 472 |
| 473 static int32_t |
| 474 allocIndex2Block(UNewTrie2 *trie) { |
| 475 int32_t newBlock, newTop; |
| 476 |
| 477 newBlock=trie->index2Length; |
| 478 newTop=newBlock+UTRIE2_INDEX_2_BLOCK_LENGTH; |
| 479 if(newTop>LENGTHOF(trie->index2)) { |
| 480 /* |
| 481 * Should never occur. |
| 482 * Either UTRIE2_MAX_BUILD_TIME_INDEX_LENGTH is incorrect, |
| 483 * or the code writes more values than should be possible. |
| 484 */ |
| 485 return -1; |
| 486 } |
| 487 trie->index2Length=newTop; |
| 488 uprv_memcpy(trie->index2+newBlock, trie->index2+trie->index2NullOffset, UTRI
E2_INDEX_2_BLOCK_LENGTH*4); |
| 489 return newBlock; |
| 490 } |
| 491 |
| 492 static int32_t |
| 493 getIndex2Block(UNewTrie2 *trie, UChar32 c, UBool forLSCP) { |
| 494 int32_t i1, i2; |
| 495 |
| 496 if(U_IS_LEAD(c) && forLSCP) { |
| 497 return UTRIE2_LSCP_INDEX_2_OFFSET; |
| 498 } |
| 499 |
| 500 i1=c>>UTRIE2_SHIFT_1; |
| 501 i2=trie->index1[i1]; |
| 502 if(i2==trie->index2NullOffset) { |
| 503 i2=allocIndex2Block(trie); |
| 504 if(i2<0) { |
| 505 return -1; /* program error */ |
| 506 } |
| 507 trie->index1[i1]=i2; |
| 508 } |
| 509 return i2; |
| 510 } |
| 511 |
| 512 static int32_t |
| 513 allocDataBlock(UNewTrie2 *trie, int32_t copyBlock) { |
| 514 int32_t newBlock, newTop; |
| 515 |
| 516 if(trie->firstFreeBlock!=0) { |
| 517 /* get the first free block */ |
| 518 newBlock=trie->firstFreeBlock; |
| 519 trie->firstFreeBlock=-trie->map[newBlock>>UTRIE2_SHIFT_2]; |
| 520 } else { |
| 521 /* get a new block from the high end */ |
| 522 newBlock=trie->dataLength; |
| 523 newTop=newBlock+UTRIE2_DATA_BLOCK_LENGTH; |
| 524 if(newTop>trie->dataCapacity) { |
| 525 /* out of memory in the data array */ |
| 526 int32_t capacity; |
| 527 uint32_t *data; |
| 528 |
| 529 if(trie->dataCapacity<UNEWTRIE2_MEDIUM_DATA_LENGTH) { |
| 530 capacity=UNEWTRIE2_MEDIUM_DATA_LENGTH; |
| 531 } else if(trie->dataCapacity<UNEWTRIE2_MAX_DATA_LENGTH) { |
| 532 capacity=UNEWTRIE2_MAX_DATA_LENGTH; |
| 533 } else { |
| 534 /* |
| 535 * Should never occur. |
| 536 * Either UNEWTRIE2_MAX_DATA_LENGTH is incorrect, |
| 537 * or the code writes more values than should be possible. |
| 538 */ |
| 539 return -1; |
| 540 } |
| 541 data=(uint32_t *)uprv_malloc(capacity*4); |
| 542 if(data==NULL) { |
| 543 return -1; |
| 544 } |
| 545 uprv_memcpy(data, trie->data, trie->dataLength*4); |
| 546 uprv_free(trie->data); |
| 547 trie->data=data; |
| 548 trie->dataCapacity=capacity; |
| 549 } |
| 550 trie->dataLength=newTop; |
| 551 } |
| 552 uprv_memcpy(trie->data+newBlock, trie->data+copyBlock, UTRIE2_DATA_BLOCK_LEN
GTH*4); |
| 553 trie->map[newBlock>>UTRIE2_SHIFT_2]=0; |
| 554 return newBlock; |
| 555 } |
| 556 |
| 557 /* call when the block's reference counter reaches 0 */ |
| 558 static void |
| 559 releaseDataBlock(UNewTrie2 *trie, int32_t block) { |
| 560 /* put this block at the front of the free-block chain */ |
| 561 trie->map[block>>UTRIE2_SHIFT_2]=-trie->firstFreeBlock; |
| 562 trie->firstFreeBlock=block; |
| 563 } |
| 564 |
| 565 static U_INLINE UBool |
| 566 isWritableBlock(UNewTrie2 *trie, int32_t block) { |
| 567 return (UBool)(block!=trie->dataNullOffset && 1==trie->map[block>>UTRIE2_SHI
FT_2]); |
| 568 } |
| 569 |
| 570 static U_INLINE void |
| 571 setIndex2Entry(UNewTrie2 *trie, int32_t i2, int32_t block) { |
| 572 int32_t oldBlock; |
| 573 ++trie->map[block>>UTRIE2_SHIFT_2]; /* increment first, in case block==oldB
lock! */ |
| 574 oldBlock=trie->index2[i2]; |
| 575 if(0 == --trie->map[oldBlock>>UTRIE2_SHIFT_2]) { |
| 576 releaseDataBlock(trie, oldBlock); |
| 577 } |
| 578 trie->index2[i2]=block; |
| 579 } |
| 580 |
| 581 /** |
| 582 * No error checking for illegal arguments. |
| 583 * |
| 584 * @return -1 if no new data block available (out of memory in data array) |
| 585 * @internal |
| 586 */ |
| 587 static int32_t |
| 588 getDataBlock(UNewTrie2 *trie, UChar32 c, UBool forLSCP) { |
| 589 int32_t i2, oldBlock, newBlock; |
| 590 |
| 591 i2=getIndex2Block(trie, c, forLSCP); |
| 592 if(i2<0) { |
| 593 return -1; /* program error */ |
| 594 } |
| 595 |
| 596 i2+=(c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK; |
| 597 oldBlock=trie->index2[i2]; |
| 598 if(isWritableBlock(trie, oldBlock)) { |
| 599 return oldBlock; |
| 600 } |
| 601 |
| 602 /* allocate a new data block */ |
| 603 newBlock=allocDataBlock(trie, oldBlock); |
| 604 if(newBlock<0) { |
| 605 /* out of memory in the data array */ |
| 606 return -1; |
| 607 } |
| 608 setIndex2Entry(trie, i2, newBlock); |
| 609 return newBlock; |
| 610 } |
| 611 |
| 612 /** |
| 613 * @return TRUE if the value was successfully set |
| 614 */ |
| 615 static void |
| 616 set32(UNewTrie2 *trie, |
| 617 UChar32 c, UBool forLSCP, uint32_t value, |
| 618 UErrorCode *pErrorCode) { |
| 619 int32_t block; |
| 620 |
| 621 if(trie==NULL || trie->isCompacted) { |
| 622 *pErrorCode=U_NO_WRITE_PERMISSION; |
| 623 return; |
| 624 } |
| 625 |
| 626 block=getDataBlock(trie, c, forLSCP); |
| 627 if(block<0) { |
| 628 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; |
| 629 return; |
| 630 } |
| 631 |
| 632 trie->data[block+(c&UTRIE2_DATA_MASK)]=value; |
| 633 } |
| 634 |
| 635 U_CAPI void U_EXPORT2 |
| 636 utrie2_set32(UTrie2 *trie, UChar32 c, uint32_t value, UErrorCode *pErrorCode) { |
| 637 if(U_FAILURE(*pErrorCode)) { |
| 638 return; |
| 639 } |
| 640 if((uint32_t)c>0x10ffff) { |
| 641 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| 642 return; |
| 643 } |
| 644 set32(trie->newTrie, c, TRUE, value, pErrorCode); |
| 645 } |
| 646 |
| 647 U_CAPI void U_EXPORT2 |
| 648 utrie2_set32ForLeadSurrogateCodeUnit(UTrie2 *trie, |
| 649 UChar32 c, uint32_t value, |
| 650 UErrorCode *pErrorCode) { |
| 651 if(U_FAILURE(*pErrorCode)) { |
| 652 return; |
| 653 } |
| 654 if(!U_IS_LEAD(c)) { |
| 655 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| 656 return; |
| 657 } |
| 658 set32(trie->newTrie, c, FALSE, value, pErrorCode); |
| 659 } |
| 660 |
| 661 static void |
| 662 writeBlock(uint32_t *block, uint32_t value) { |
| 663 uint32_t *limit=block+UTRIE2_DATA_BLOCK_LENGTH; |
| 664 while(block<limit) { |
| 665 *block++=value; |
| 666 } |
| 667 } |
| 668 |
| 669 /** |
| 670 * initialValue is ignored if overwrite=TRUE |
| 671 * @internal |
| 672 */ |
| 673 static void |
| 674 fillBlock(uint32_t *block, UChar32 start, UChar32 limit, |
| 675 uint32_t value, uint32_t initialValue, UBool overwrite) { |
| 676 uint32_t *pLimit; |
| 677 |
| 678 pLimit=block+limit; |
| 679 block+=start; |
| 680 if(overwrite) { |
| 681 while(block<pLimit) { |
| 682 *block++=value; |
| 683 } |
| 684 } else { |
| 685 while(block<pLimit) { |
| 686 if(*block==initialValue) { |
| 687 *block=value; |
| 688 } |
| 689 ++block; |
| 690 } |
| 691 } |
| 692 } |
| 693 |
| 694 U_CAPI void U_EXPORT2 |
| 695 utrie2_setRange32(UTrie2 *trie, |
| 696 UChar32 start, UChar32 end, |
| 697 uint32_t value, UBool overwrite, |
| 698 UErrorCode *pErrorCode) { |
| 699 /* |
| 700 * repeat value in [start..end] |
| 701 * mark index values for repeat-data blocks by setting bit 31 of the index v
alues |
| 702 * fill around existing values if any, if(overwrite) |
| 703 */ |
| 704 UNewTrie2 *newTrie; |
| 705 int32_t block, rest, repeatBlock; |
| 706 UChar32 limit; |
| 707 |
| 708 if(U_FAILURE(*pErrorCode)) { |
| 709 return; |
| 710 } |
| 711 if((uint32_t)start>0x10ffff || (uint32_t)end>0x10ffff || start>end) { |
| 712 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| 713 return; |
| 714 } |
| 715 newTrie=trie->newTrie; |
| 716 if(newTrie==NULL || newTrie->isCompacted) { |
| 717 *pErrorCode=U_NO_WRITE_PERMISSION; |
| 718 return; |
| 719 } |
| 720 if(!overwrite && value==newTrie->initialValue) { |
| 721 return; /* nothing to do */ |
| 722 } |
| 723 |
| 724 limit=end+1; |
| 725 if(start&UTRIE2_DATA_MASK) { |
| 726 UChar32 nextStart; |
| 727 |
| 728 /* set partial block at [start..following block boundary[ */ |
| 729 block=getDataBlock(newTrie, start, TRUE); |
| 730 if(block<0) { |
| 731 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; |
| 732 return; |
| 733 } |
| 734 |
| 735 nextStart=(start+UTRIE2_DATA_BLOCK_LENGTH)&~UTRIE2_DATA_MASK; |
| 736 if(nextStart<=limit) { |
| 737 fillBlock(newTrie->data+block, start&UTRIE2_DATA_MASK, UTRIE2_DATA_B
LOCK_LENGTH, |
| 738 value, newTrie->initialValue, overwrite); |
| 739 start=nextStart; |
| 740 } else { |
| 741 fillBlock(newTrie->data+block, start&UTRIE2_DATA_MASK, limit&UTRIE2_
DATA_MASK, |
| 742 value, newTrie->initialValue, overwrite); |
| 743 return; |
| 744 } |
| 745 } |
| 746 |
| 747 /* number of positions in the last, partial block */ |
| 748 rest=limit&UTRIE2_DATA_MASK; |
| 749 |
| 750 /* round down limit to a block boundary */ |
| 751 limit&=~UTRIE2_DATA_MASK; |
| 752 |
| 753 /* iterate over all-value blocks */ |
| 754 if(value==newTrie->initialValue) { |
| 755 repeatBlock=newTrie->dataNullOffset; |
| 756 } else { |
| 757 repeatBlock=-1; |
| 758 } |
| 759 |
| 760 while(start<limit) { |
| 761 int32_t i2; |
| 762 UBool setRepeatBlock=FALSE; |
| 763 |
| 764 if(value==newTrie->initialValue && isInNullBlock(newTrie, start, TRUE))
{ |
| 765 start+=UTRIE2_DATA_BLOCK_LENGTH; /* nothing to do */ |
| 766 continue; |
| 767 } |
| 768 |
| 769 /* get index value */ |
| 770 i2=getIndex2Block(newTrie, start, TRUE); |
| 771 if(i2<0) { |
| 772 *pErrorCode=U_INTERNAL_PROGRAM_ERROR; |
| 773 return; |
| 774 } |
| 775 i2+=(start>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK; |
| 776 block=newTrie->index2[i2]; |
| 777 if(isWritableBlock(newTrie, block)) { |
| 778 /* already allocated */ |
| 779 if(overwrite && block>=UNEWTRIE2_DATA_0800_OFFSET) { |
| 780 /* |
| 781 * We overwrite all values, and it's not a |
| 782 * protected (ASCII-linear or 2-byte UTF-8) block: |
| 783 * replace with the repeatBlock. |
| 784 */ |
| 785 setRepeatBlock=TRUE; |
| 786 } else { |
| 787 /* !overwrite, or protected block: just write the values into th
is block */ |
| 788 fillBlock(newTrie->data+block, |
| 789 0, UTRIE2_DATA_BLOCK_LENGTH, |
| 790 value, newTrie->initialValue, overwrite); |
| 791 } |
| 792 } else if(newTrie->data[block]!=value && (overwrite || block==newTrie->d
ataNullOffset)) { |
| 793 /* |
| 794 * Set the repeatBlock instead of the null block or previous repeat
block: |
| 795 * |
| 796 * If !isWritableBlock() then all entries in the block have the same
value |
| 797 * because it's the null block or a range block (the repeatBlock fro
m a previous |
| 798 * call to utrie2_setRange32()). |
| 799 * No other blocks are used multiple times before compacting. |
| 800 * |
| 801 * The null block is the only non-writable block with the initialVal
ue because |
| 802 * of the repeatBlock initialization above. (If value==initialValue,
then |
| 803 * the repeatBlock will be the null data block.) |
| 804 * |
| 805 * We set our repeatBlock if the desired value differs from the bloc
k's value, |
| 806 * and if we overwrite any data or if the data is all initial values |
| 807 * (which is the same as the block being the null block, see above). |
| 808 */ |
| 809 setRepeatBlock=TRUE; |
| 810 } |
| 811 if(setRepeatBlock) { |
| 812 if(repeatBlock>=0) { |
| 813 setIndex2Entry(newTrie, i2, repeatBlock); |
| 814 } else { |
| 815 /* create and set and fill the repeatBlock */ |
| 816 repeatBlock=getDataBlock(newTrie, start, TRUE); |
| 817 if(repeatBlock<0) { |
| 818 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; |
| 819 return; |
| 820 } |
| 821 writeBlock(newTrie->data+repeatBlock, value); |
| 822 } |
| 823 } |
| 824 |
| 825 start+=UTRIE2_DATA_BLOCK_LENGTH; |
| 826 } |
| 827 |
| 828 if(rest>0) { |
| 829 /* set partial block at [last block boundary..limit[ */ |
| 830 block=getDataBlock(newTrie, start, TRUE); |
| 831 if(block<0) { |
| 832 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; |
| 833 return; |
| 834 } |
| 835 |
| 836 fillBlock(newTrie->data+block, 0, rest, value, newTrie->initialValue, ov
erwrite); |
| 837 } |
| 838 |
| 839 return; |
| 840 } |
| 841 |
| 842 /* compaction --------------------------------------------------------------- */ |
| 843 |
| 844 static U_INLINE UBool |
| 845 equal_int32(const int32_t *s, const int32_t *t, int32_t length) { |
| 846 while(length>0 && *s==*t) { |
| 847 ++s; |
| 848 ++t; |
| 849 --length; |
| 850 } |
| 851 return (UBool)(length==0); |
| 852 } |
| 853 |
| 854 static U_INLINE UBool |
| 855 equal_uint32(const uint32_t *s, const uint32_t *t, int32_t length) { |
| 856 while(length>0 && *s==*t) { |
| 857 ++s; |
| 858 ++t; |
| 859 --length; |
| 860 } |
| 861 return (UBool)(length==0); |
| 862 } |
| 863 |
| 864 static int32_t |
| 865 findSameIndex2Block(const int32_t *idx, int32_t index2Length, int32_t otherBlock
) { |
| 866 int32_t block; |
| 867 |
| 868 /* ensure that we do not even partially get past index2Length */ |
| 869 index2Length-=UTRIE2_INDEX_2_BLOCK_LENGTH; |
| 870 |
| 871 for(block=0; block<=index2Length; ++block) { |
| 872 if(equal_int32(idx+block, idx+otherBlock, UTRIE2_INDEX_2_BLOCK_LENGTH))
{ |
| 873 return block; |
| 874 } |
| 875 } |
| 876 return -1; |
| 877 } |
| 878 |
| 879 static int32_t |
| 880 findSameDataBlock(const uint32_t *data, int32_t dataLength, int32_t otherBlock,
int32_t blockLength) { |
| 881 int32_t block; |
| 882 |
| 883 /* ensure that we do not even partially get past dataLength */ |
| 884 dataLength-=blockLength; |
| 885 |
| 886 for(block=0; block<=dataLength; block+=UTRIE2_DATA_GRANULARITY) { |
| 887 if(equal_uint32(data+block, data+otherBlock, blockLength)) { |
| 888 return block; |
| 889 } |
| 890 } |
| 891 return -1; |
| 892 } |
| 893 |
| 894 /* |
| 895 * Find the start of the last range in the trie by enumerating backward. |
| 896 * Indexes for supplementary code points higher than this will be omitted. |
| 897 */ |
| 898 static UChar32 |
| 899 findHighStart(UNewTrie2 *trie, uint32_t highValue) { |
| 900 const uint32_t *data32; |
| 901 |
| 902 uint32_t value, initialValue; |
| 903 UChar32 c, prev; |
| 904 int32_t i1, i2, j, i2Block, prevI2Block, index2NullOffset, block, prevBlock,
nullBlock; |
| 905 |
| 906 data32=trie->data; |
| 907 initialValue=trie->initialValue; |
| 908 |
| 909 index2NullOffset=trie->index2NullOffset; |
| 910 nullBlock=trie->dataNullOffset; |
| 911 |
| 912 /* set variables for previous range */ |
| 913 if(highValue==initialValue) { |
| 914 prevI2Block=index2NullOffset; |
| 915 prevBlock=nullBlock; |
| 916 } else { |
| 917 prevI2Block=-1; |
| 918 prevBlock=-1; |
| 919 } |
| 920 prev=0x110000; |
| 921 |
| 922 /* enumerate index-2 blocks */ |
| 923 i1=UNEWTRIE2_INDEX_1_LENGTH; |
| 924 c=prev; |
| 925 while(c>0) { |
| 926 i2Block=trie->index1[--i1]; |
| 927 if(i2Block==prevI2Block) { |
| 928 /* the index-2 block is the same as the previous one, and filled wit
h highValue */ |
| 929 c-=UTRIE2_CP_PER_INDEX_1_ENTRY; |
| 930 continue; |
| 931 } |
| 932 prevI2Block=i2Block; |
| 933 if(i2Block==index2NullOffset) { |
| 934 /* this is the null index-2 block */ |
| 935 if(highValue!=initialValue) { |
| 936 return c; |
| 937 } |
| 938 c-=UTRIE2_CP_PER_INDEX_1_ENTRY; |
| 939 } else { |
| 940 /* enumerate data blocks for one index-2 block */ |
| 941 for(i2=UTRIE2_INDEX_2_BLOCK_LENGTH; i2>0;) { |
| 942 block=trie->index2[i2Block+ --i2]; |
| 943 if(block==prevBlock) { |
| 944 /* the block is the same as the previous one, and filled wit
h highValue */ |
| 945 c-=UTRIE2_DATA_BLOCK_LENGTH; |
| 946 continue; |
| 947 } |
| 948 prevBlock=block; |
| 949 if(block==nullBlock) { |
| 950 /* this is the null data block */ |
| 951 if(highValue!=initialValue) { |
| 952 return c; |
| 953 } |
| 954 c-=UTRIE2_DATA_BLOCK_LENGTH; |
| 955 } else { |
| 956 for(j=UTRIE2_DATA_BLOCK_LENGTH; j>0;) { |
| 957 value=data32[block+ --j]; |
| 958 if(value!=highValue) { |
| 959 return c; |
| 960 } |
| 961 --c; |
| 962 } |
| 963 } |
| 964 } |
| 965 } |
| 966 } |
| 967 |
| 968 /* deliver last range */ |
| 969 return 0; |
| 970 } |
| 971 |
| 972 /* |
| 973 * Compact a build-time trie. |
| 974 * |
| 975 * The compaction |
| 976 * - removes blocks that are identical with earlier ones |
| 977 * - overlaps adjacent blocks as much as possible (if overlap==TRUE) |
| 978 * - moves blocks in steps of the data granularity |
| 979 * - moves and overlaps blocks that overlap with multiple values in the overlap
region |
| 980 * |
| 981 * It does not |
| 982 * - try to move and overlap blocks that are not already adjacent |
| 983 */ |
| 984 static void |
| 985 compactData(UNewTrie2 *trie) { |
| 986 int32_t start, newStart, movedStart; |
| 987 int32_t blockLength, overlap; |
| 988 int32_t i, mapIndex, blockCount; |
| 989 |
| 990 /* do not compact linear-ASCII data */ |
| 991 newStart=UTRIE2_DATA_START_OFFSET; |
| 992 for(start=0, i=0; start<newStart; start+=UTRIE2_DATA_BLOCK_LENGTH, ++i) { |
| 993 trie->map[i]=start; |
| 994 } |
| 995 |
| 996 /* |
| 997 * Start with a block length of 64 for 2-byte UTF-8, |
| 998 * then switch to UTRIE2_DATA_BLOCK_LENGTH. |
| 999 */ |
| 1000 blockLength=64; |
| 1001 blockCount=blockLength>>UTRIE2_SHIFT_2; |
| 1002 for(start=newStart; start<trie->dataLength;) { |
| 1003 /* |
| 1004 * start: index of first entry of current block |
| 1005 * newStart: index where the current block is to be moved |
| 1006 * (right after current end of already-compacted data) |
| 1007 */ |
| 1008 if(start==UNEWTRIE2_DATA_0800_OFFSET) { |
| 1009 blockLength=UTRIE2_DATA_BLOCK_LENGTH; |
| 1010 blockCount=1; |
| 1011 } |
| 1012 |
| 1013 /* skip blocks that are not used */ |
| 1014 if(trie->map[start>>UTRIE2_SHIFT_2]<=0) { |
| 1015 /* advance start to the next block */ |
| 1016 start+=blockLength; |
| 1017 |
| 1018 /* leave newStart with the previous block! */ |
| 1019 continue; |
| 1020 } |
| 1021 |
| 1022 /* search for an identical block */ |
| 1023 if( (movedStart=findSameDataBlock(trie->data, newStart, start, blockLeng
th)) |
| 1024 >=0 |
| 1025 ) { |
| 1026 /* found an identical block, set the other block's index value for t
he current block */ |
| 1027 for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) { |
| 1028 trie->map[mapIndex++]=movedStart; |
| 1029 movedStart+=UTRIE2_DATA_BLOCK_LENGTH; |
| 1030 } |
| 1031 |
| 1032 /* advance start to the next block */ |
| 1033 start+=blockLength; |
| 1034 |
| 1035 /* leave newStart with the previous block! */ |
| 1036 continue; |
| 1037 } |
| 1038 |
| 1039 /* see if the beginning of this block can be overlapped with the end of
the previous block */ |
| 1040 /* look for maximum overlap (modulo granularity) with the previous, adja
cent block */ |
| 1041 for(overlap=blockLength-UTRIE2_DATA_GRANULARITY; |
| 1042 overlap>0 && !equal_uint32(trie->data+(newStart-overlap), trie->data
+start, overlap); |
| 1043 overlap-=UTRIE2_DATA_GRANULARITY) {} |
| 1044 |
| 1045 if(overlap>0 || newStart<start) { |
| 1046 /* some overlap, or just move the whole block */ |
| 1047 movedStart=newStart-overlap; |
| 1048 for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) { |
| 1049 trie->map[mapIndex++]=movedStart; |
| 1050 movedStart+=UTRIE2_DATA_BLOCK_LENGTH; |
| 1051 } |
| 1052 |
| 1053 /* move the non-overlapping indexes to their new positions */ |
| 1054 start+=overlap; |
| 1055 for(i=blockLength-overlap; i>0; --i) { |
| 1056 trie->data[newStart++]=trie->data[start++]; |
| 1057 } |
| 1058 } else /* no overlap && newStart==start */ { |
| 1059 for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) { |
| 1060 trie->map[mapIndex++]=start; |
| 1061 start+=UTRIE2_DATA_BLOCK_LENGTH; |
| 1062 } |
| 1063 newStart=start; |
| 1064 } |
| 1065 } |
| 1066 |
| 1067 /* now adjust the index-2 table */ |
| 1068 for(i=0; i<trie->index2Length; ++i) { |
| 1069 if(i==UNEWTRIE2_INDEX_GAP_OFFSET) { |
| 1070 /* Gap indexes are invalid (-1). Skip over the gap. */ |
| 1071 i+=UNEWTRIE2_INDEX_GAP_LENGTH; |
| 1072 } |
| 1073 trie->index2[i]=trie->map[trie->index2[i]>>UTRIE2_SHIFT_2]; |
| 1074 } |
| 1075 trie->dataNullOffset=trie->map[trie->dataNullOffset>>UTRIE2_SHIFT_2]; |
| 1076 |
| 1077 /* ensure dataLength alignment */ |
| 1078 while((newStart&(UTRIE2_DATA_GRANULARITY-1))!=0) { |
| 1079 trie->data[newStart++]=trie->initialValue; |
| 1080 } |
| 1081 |
| 1082 #ifdef UTRIE2_DEBUG |
| 1083 /* we saved some space */ |
| 1084 printf("compacting UTrie2: count of 32-bit data words %lu->%lu\n", |
| 1085 (long)trie->dataLength, (long)newStart); |
| 1086 #endif |
| 1087 |
| 1088 trie->dataLength=newStart; |
| 1089 } |
| 1090 |
| 1091 static void |
| 1092 compactIndex2(UNewTrie2 *trie) { |
| 1093 int32_t i, start, newStart, movedStart, overlap; |
| 1094 |
| 1095 /* do not compact linear-BMP index-2 blocks */ |
| 1096 newStart=UTRIE2_INDEX_2_BMP_LENGTH; |
| 1097 for(start=0, i=0; start<newStart; start+=UTRIE2_INDEX_2_BLOCK_LENGTH, ++i) { |
| 1098 trie->map[i]=start; |
| 1099 } |
| 1100 |
| 1101 /* Reduce the index table gap to what will be needed at runtime. */ |
| 1102 newStart+=UTRIE2_UTF8_2B_INDEX_2_LENGTH+((trie->highStart-0x10000)>>UTRIE2_S
HIFT_1); |
| 1103 |
| 1104 for(start=UNEWTRIE2_INDEX_2_NULL_OFFSET; start<trie->index2Length;) { |
| 1105 /* |
| 1106 * start: index of first entry of current block |
| 1107 * newStart: index where the current block is to be moved |
| 1108 * (right after current end of already-compacted data) |
| 1109 */ |
| 1110 |
| 1111 /* search for an identical block */ |
| 1112 if( (movedStart=findSameIndex2Block(trie->index2, newStart, start)) |
| 1113 >=0 |
| 1114 ) { |
| 1115 /* found an identical block, set the other block's index value for t
he current block */ |
| 1116 trie->map[start>>UTRIE2_SHIFT_1_2]=movedStart; |
| 1117 |
| 1118 /* advance start to the next block */ |
| 1119 start+=UTRIE2_INDEX_2_BLOCK_LENGTH; |
| 1120 |
| 1121 /* leave newStart with the previous block! */ |
| 1122 continue; |
| 1123 } |
| 1124 |
| 1125 /* see if the beginning of this block can be overlapped with the end of
the previous block */ |
| 1126 /* look for maximum overlap with the previous, adjacent block */ |
| 1127 for(overlap=UTRIE2_INDEX_2_BLOCK_LENGTH-1; |
| 1128 overlap>0 && !equal_int32(trie->index2+(newStart-overlap), trie->ind
ex2+start, overlap); |
| 1129 --overlap) {} |
| 1130 |
| 1131 if(overlap>0 || newStart<start) { |
| 1132 /* some overlap, or just move the whole block */ |
| 1133 trie->map[start>>UTRIE2_SHIFT_1_2]=newStart-overlap; |
| 1134 |
| 1135 /* move the non-overlapping indexes to their new positions */ |
| 1136 start+=overlap; |
| 1137 for(i=UTRIE2_INDEX_2_BLOCK_LENGTH-overlap; i>0; --i) { |
| 1138 trie->index2[newStart++]=trie->index2[start++]; |
| 1139 } |
| 1140 } else /* no overlap && newStart==start */ { |
| 1141 trie->map[start>>UTRIE2_SHIFT_1_2]=start; |
| 1142 start+=UTRIE2_INDEX_2_BLOCK_LENGTH; |
| 1143 newStart=start; |
| 1144 } |
| 1145 } |
| 1146 |
| 1147 /* now adjust the index-1 table */ |
| 1148 for(i=0; i<UNEWTRIE2_INDEX_1_LENGTH; ++i) { |
| 1149 trie->index1[i]=trie->map[trie->index1[i]>>UTRIE2_SHIFT_1_2]; |
| 1150 } |
| 1151 trie->index2NullOffset=trie->map[trie->index2NullOffset>>UTRIE2_SHIFT_1_2]; |
| 1152 |
| 1153 /* |
| 1154 * Ensure data table alignment: |
| 1155 * Needs to be granularity-aligned for 16-bit trie |
| 1156 * (so that dataMove will be down-shiftable), |
| 1157 * and 2-aligned for uint32_t data. |
| 1158 */ |
| 1159 while((newStart&((UTRIE2_DATA_GRANULARITY-1)|1))!=0) { |
| 1160 /* Arbitrary value: 0x3fffc not possible for real data. */ |
| 1161 trie->index2[newStart++]=(int32_t)0xffff<<UTRIE2_INDEX_SHIFT; |
| 1162 } |
| 1163 |
| 1164 #ifdef UTRIE2_DEBUG |
| 1165 /* we saved some space */ |
| 1166 printf("compacting UTrie2: count of 16-bit index-2 words %lu->%lu\n", |
| 1167 (long)trie->index2Length, (long)newStart); |
| 1168 #endif |
| 1169 |
| 1170 trie->index2Length=newStart; |
| 1171 } |
| 1172 |
| 1173 static void |
| 1174 compactTrie(UTrie2 *trie, UErrorCode *pErrorCode) { |
| 1175 UNewTrie2 *newTrie; |
| 1176 UChar32 highStart, suppHighStart; |
| 1177 uint32_t highValue; |
| 1178 |
| 1179 newTrie=trie->newTrie; |
| 1180 |
| 1181 /* find highStart and round it up */ |
| 1182 highValue=utrie2_get32(trie, 0x10ffff); |
| 1183 highStart=findHighStart(newTrie, highValue); |
| 1184 highStart=(highStart+(UTRIE2_CP_PER_INDEX_1_ENTRY-1))&~(UTRIE2_CP_PER_INDEX_
1_ENTRY-1); |
| 1185 if(highStart==0x110000) { |
| 1186 highValue=trie->errorValue; |
| 1187 } |
| 1188 |
| 1189 /* |
| 1190 * Set trie->highStart only after utrie2_get32(trie, highStart). |
| 1191 * Otherwise utrie2_get32(trie, highStart) would try to read the highValue. |
| 1192 */ |
| 1193 trie->highStart=newTrie->highStart=highStart; |
| 1194 |
| 1195 #ifdef UTRIE2_DEBUG |
| 1196 printf("UTrie2: highStart U+%04lx highValue 0x%lx initialValue 0x%lx\n", |
| 1197 (long)highStart, (long)highValue, (long)trie->initialValue); |
| 1198 #endif |
| 1199 |
| 1200 if(highStart<0x110000) { |
| 1201 /* Blank out [highStart..10ffff] to release associated data blocks. */ |
| 1202 suppHighStart= highStart<=0x10000 ? 0x10000 : highStart; |
| 1203 utrie2_setRange32(trie, suppHighStart, 0x10ffff, trie->initialValue, TRU
E, pErrorCode); |
| 1204 if(U_FAILURE(*pErrorCode)) { |
| 1205 return; |
| 1206 } |
| 1207 } |
| 1208 |
| 1209 compactData(newTrie); |
| 1210 if(highStart>0x10000) { |
| 1211 compactIndex2(newTrie); |
| 1212 #ifdef UTRIE2_DEBUG |
| 1213 } else { |
| 1214 printf("UTrie2: highStart U+%04lx count of 16-bit index-2 words %lu->%l
u\n", |
| 1215 (long)highStart, (long)trie->newTrie->index2Length, (long)UTRIE2
_INDEX_1_OFFSET); |
| 1216 #endif |
| 1217 } |
| 1218 |
| 1219 /* |
| 1220 * Store the highValue in the data array and round up the dataLength. |
| 1221 * Must be done after compactData() because that assumes that dataLength |
| 1222 * is a multiple of UTRIE2_DATA_BLOCK_LENGTH. |
| 1223 */ |
| 1224 newTrie->data[newTrie->dataLength++]=highValue; |
| 1225 while((newTrie->dataLength&(UTRIE2_DATA_GRANULARITY-1))!=0) { |
| 1226 newTrie->data[newTrie->dataLength++]=trie->initialValue; |
| 1227 } |
| 1228 |
| 1229 newTrie->isCompacted=TRUE; |
| 1230 } |
| 1231 |
| 1232 /* serialization ------------------------------------------------------------ */ |
| 1233 |
| 1234 /** |
| 1235 * Maximum length of the runtime index array. |
| 1236 * Limited by its own 16-bit index values, and by uint16_t UTrie2Header.indexLen
gth. |
| 1237 * (The actual maximum length is lower, |
| 1238 * (0x110000>>UTRIE2_SHIFT_2)+UTRIE2_UTF8_2B_INDEX_2_LENGTH+UTRIE2_MAX_INDEX_1_L
ENGTH.) |
| 1239 */ |
| 1240 #define UTRIE2_MAX_INDEX_LENGTH 0xffff |
| 1241 |
| 1242 /** |
| 1243 * Maximum length of the runtime data array. |
| 1244 * Limited by 16-bit index values that are left-shifted by UTRIE2_INDEX_SHIFT, |
| 1245 * and by uint16_t UTrie2Header.shiftedDataLength. |
| 1246 */ |
| 1247 #define UTRIE2_MAX_DATA_LENGTH (0xffff<<UTRIE2_INDEX_SHIFT) |
| 1248 |
| 1249 /* Compact and internally serialize the trie. */ |
| 1250 U_CAPI void U_EXPORT2 |
| 1251 utrie2_freeze(UTrie2 *trie, UTrie2ValueBits valueBits, UErrorCode *pErrorCode) { |
| 1252 UNewTrie2 *newTrie; |
| 1253 UTrie2Header *header; |
| 1254 uint32_t *p; |
| 1255 uint16_t *dest16; |
| 1256 int32_t i, length; |
| 1257 int32_t allIndexesLength; |
| 1258 int32_t dataMove; /* >0 if the data is moved to the end of the index array
*/ |
| 1259 UChar32 highStart; |
| 1260 |
| 1261 /* argument check */ |
| 1262 if(U_FAILURE(*pErrorCode)) { |
| 1263 return; |
| 1264 } |
| 1265 if( trie==NULL || |
| 1266 valueBits<0 || UTRIE2_COUNT_VALUE_BITS<=valueBits |
| 1267 ) { |
| 1268 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| 1269 return; |
| 1270 } |
| 1271 newTrie=trie->newTrie; |
| 1272 if(newTrie==NULL) { |
| 1273 /* already frozen */ |
| 1274 UTrie2ValueBits frozenValueBits= |
| 1275 trie->data16!=NULL ? UTRIE2_16_VALUE_BITS : UTRIE2_32_VALUE_BITS; |
| 1276 if(valueBits!=frozenValueBits) { |
| 1277 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| 1278 } |
| 1279 return; |
| 1280 } |
| 1281 |
| 1282 /* compact if necessary */ |
| 1283 if(!newTrie->isCompacted) { |
| 1284 compactTrie(trie, pErrorCode); |
| 1285 if(U_FAILURE(*pErrorCode)) { |
| 1286 return; |
| 1287 } |
| 1288 } |
| 1289 highStart=trie->highStart; |
| 1290 |
| 1291 if(highStart<=0x10000) { |
| 1292 allIndexesLength=UTRIE2_INDEX_1_OFFSET; |
| 1293 } else { |
| 1294 allIndexesLength=newTrie->index2Length; |
| 1295 } |
| 1296 if(valueBits==UTRIE2_16_VALUE_BITS) { |
| 1297 dataMove=allIndexesLength; |
| 1298 } else { |
| 1299 dataMove=0; |
| 1300 } |
| 1301 |
| 1302 /* are indexLength and dataLength within limits? */ |
| 1303 if( /* for unshifted indexLength */ |
| 1304 allIndexesLength>UTRIE2_MAX_INDEX_LENGTH || |
| 1305 /* for unshifted dataNullOffset */ |
| 1306 (dataMove+newTrie->dataNullOffset)>0xffff || |
| 1307 /* for unshifted 2-byte UTF-8 index-2 values */ |
| 1308 (dataMove+UNEWTRIE2_DATA_0800_OFFSET)>0xffff || |
| 1309 /* for shiftedDataLength */ |
| 1310 (dataMove+newTrie->dataLength)>UTRIE2_MAX_DATA_LENGTH |
| 1311 ) { |
| 1312 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
| 1313 return; |
| 1314 } |
| 1315 |
| 1316 /* calculate the total serialized length */ |
| 1317 length=sizeof(UTrie2Header)+allIndexesLength*2; |
| 1318 if(valueBits==UTRIE2_16_VALUE_BITS) { |
| 1319 length+=newTrie->dataLength*2; |
| 1320 } else { |
| 1321 length+=newTrie->dataLength*4; |
| 1322 } |
| 1323 |
| 1324 trie->memory=uprv_malloc(length); |
| 1325 if(trie->memory==NULL) { |
| 1326 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; |
| 1327 return; |
| 1328 } |
| 1329 trie->length=length; |
| 1330 trie->isMemoryOwned=TRUE; |
| 1331 |
| 1332 trie->indexLength=allIndexesLength; |
| 1333 trie->dataLength=newTrie->dataLength; |
| 1334 if(highStart<=0x10000) { |
| 1335 trie->index2NullOffset=0xffff; |
| 1336 } else { |
| 1337 trie->index2NullOffset=UTRIE2_INDEX_2_OFFSET+newTrie->index2NullOffset; |
| 1338 } |
| 1339 trie->dataNullOffset=(uint16_t)(dataMove+newTrie->dataNullOffset); |
| 1340 trie->highValueIndex=dataMove+trie->dataLength-UTRIE2_DATA_GRANULARITY; |
| 1341 |
| 1342 /* set the header fields */ |
| 1343 header=(UTrie2Header *)trie->memory; |
| 1344 |
| 1345 header->signature=UTRIE2_SIG; /* "Tri2" */ |
| 1346 header->options=(uint16_t)valueBits; |
| 1347 |
| 1348 header->indexLength=(uint16_t)trie->indexLength; |
| 1349 header->shiftedDataLength=(uint16_t)(trie->dataLength>>UTRIE2_INDEX_SHIFT); |
| 1350 header->index2NullOffset=trie->index2NullOffset; |
| 1351 header->dataNullOffset=trie->dataNullOffset; |
| 1352 header->shiftedHighStart=(uint16_t)(highStart>>UTRIE2_SHIFT_1); |
| 1353 |
| 1354 /* fill the index and data arrays */ |
| 1355 dest16=(uint16_t *)(header+1); |
| 1356 trie->index=dest16; |
| 1357 |
| 1358 /* write the index-2 array values shifted right by UTRIE2_INDEX_SHIFT, after
adding dataMove */ |
| 1359 p=(uint32_t *)newTrie->index2; |
| 1360 for(i=UTRIE2_INDEX_2_BMP_LENGTH; i>0; --i) { |
| 1361 *dest16++=(uint16_t)((dataMove + *p++)>>UTRIE2_INDEX_SHIFT); |
| 1362 } |
| 1363 |
| 1364 /* write UTF-8 2-byte index-2 values, not right-shifted */ |
| 1365 for(i=0; i<(0xc2-0xc0); ++i) { /* C0..C1 */ |
| 1366 *dest16++=(uint16_t)(dataMove+UTRIE2_BAD_UTF8_DATA_OFFSET); |
| 1367 } |
| 1368 for(; i<(0xe0-0xc0); ++i) { /* C2..DF */ |
| 1369 *dest16++=(uint16_t)(dataMove+newTrie->index2[i<<(6-UTRIE2_SHIFT_2)]); |
| 1370 } |
| 1371 |
| 1372 if(highStart>0x10000) { |
| 1373 int32_t index1Length=(highStart-0x10000)>>UTRIE2_SHIFT_1; |
| 1374 int32_t index2Offset=UTRIE2_INDEX_2_BMP_LENGTH+UTRIE2_UTF8_2B_INDEX_2_LE
NGTH+index1Length; |
| 1375 |
| 1376 /* write 16-bit index-1 values for supplementary code points */ |
| 1377 p=(uint32_t *)newTrie->index1+UTRIE2_OMITTED_BMP_INDEX_1_LENGTH; |
| 1378 for(i=index1Length; i>0; --i) { |
| 1379 *dest16++=(uint16_t)(UTRIE2_INDEX_2_OFFSET + *p++); |
| 1380 } |
| 1381 |
| 1382 /* |
| 1383 * write the index-2 array values for supplementary code points, |
| 1384 * shifted right by UTRIE2_INDEX_SHIFT, after adding dataMove |
| 1385 */ |
| 1386 p=(uint32_t *)newTrie->index2+index2Offset; |
| 1387 for(i=newTrie->index2Length-index2Offset; i>0; --i) { |
| 1388 *dest16++=(uint16_t)((dataMove + *p++)>>UTRIE2_INDEX_SHIFT); |
| 1389 } |
| 1390 } |
| 1391 |
| 1392 /* write the 16/32-bit data array */ |
| 1393 switch(valueBits) { |
| 1394 case UTRIE2_16_VALUE_BITS: |
| 1395 /* write 16-bit data values */ |
| 1396 trie->data16=dest16; |
| 1397 trie->data32=NULL; |
| 1398 p=newTrie->data; |
| 1399 for(i=newTrie->dataLength; i>0; --i) { |
| 1400 *dest16++=(uint16_t)*p++; |
| 1401 } |
| 1402 break; |
| 1403 case UTRIE2_32_VALUE_BITS: |
| 1404 /* write 32-bit data values */ |
| 1405 trie->data16=NULL; |
| 1406 trie->data32=(uint32_t *)dest16; |
| 1407 uprv_memcpy(dest16, newTrie->data, newTrie->dataLength*4); |
| 1408 break; |
| 1409 default: |
| 1410 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| 1411 return; |
| 1412 } |
| 1413 |
| 1414 /* Delete the UNewTrie2. */ |
| 1415 uprv_free(newTrie->data); |
| 1416 uprv_free(newTrie); |
| 1417 trie->newTrie=NULL; |
| 1418 } |
| 1419 |
| 1420 U_CAPI UBool U_EXPORT2 |
| 1421 utrie2_isFrozen(const UTrie2 *trie) { |
| 1422 return (UBool)(trie->newTrie==NULL); |
| 1423 } |
| 1424 |
| 1425 U_CAPI int32_t U_EXPORT2 |
| 1426 utrie2_serialize(UTrie2 *trie, |
| 1427 void *data, int32_t capacity, |
| 1428 UErrorCode *pErrorCode) { |
| 1429 /* argument check */ |
| 1430 if(U_FAILURE(*pErrorCode)) { |
| 1431 return 0; |
| 1432 } |
| 1433 |
| 1434 if( trie==NULL || trie->memory==NULL || trie->newTrie!=NULL || |
| 1435 capacity<0 || (capacity>0 && (data==NULL || (U_POINTER_MASK_LSB(data, 3)
!=0))) |
| 1436 ) { |
| 1437 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| 1438 return 0; |
| 1439 } |
| 1440 |
| 1441 if(capacity>=trie->length) { |
| 1442 uprv_memcpy(data, trie->memory, trie->length); |
| 1443 } else { |
| 1444 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
| 1445 } |
| 1446 return trie->length; |
| 1447 } |
| 1448 |
| 1449 /* |
| 1450 * This is here to avoid a dependency from utrie2.cpp on utrie.c. |
| 1451 * This file already depends on utrie.c. |
| 1452 * Otherwise, this should be in utrie2.cpp right after utrie2_swap(). |
| 1453 */ |
| 1454 U_CAPI int32_t U_EXPORT2 |
| 1455 utrie2_swapAnyVersion(const UDataSwapper *ds, |
| 1456 const void *inData, int32_t length, void *outData, |
| 1457 UErrorCode *pErrorCode) { |
| 1458 if(U_SUCCESS(*pErrorCode)) { |
| 1459 switch(utrie2_getVersion(inData, length, TRUE)) { |
| 1460 case 1: |
| 1461 return utrie_swap(ds, inData, length, outData, pErrorCode); |
| 1462 case 2: |
| 1463 return utrie2_swap(ds, inData, length, outData, pErrorCode); |
| 1464 default: |
| 1465 *pErrorCode=U_INVALID_FORMAT_ERROR; |
| 1466 return 0; |
| 1467 } |
| 1468 } |
| 1469 return 0; |
| 1470 } |
OLD | NEW |