OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ******************************************************************************* |
| 3 * |
| 4 * Copyright (C) 1999-2009, International Business Machines |
| 5 * Corporation and others. All Rights Reserved. |
| 6 * |
| 7 ******************************************************************************* |
| 8 * file name: store.c |
| 9 * encoding: US-ASCII |
| 10 * tab size: 8 (not used) |
| 11 * indentation:4 |
| 12 * |
| 13 * created on: 2003-02-06 |
| 14 * created by: Ram Viswanadha |
| 15 * |
| 16 */ |
| 17 |
| 18 #include <stdio.h> |
| 19 #include <stdlib.h> |
| 20 #include "unicode/utypes.h" |
| 21 #include "cmemory.h" |
| 22 #include "cstring.h" |
| 23 #include "filestrm.h" |
| 24 #include "unicode/udata.h" |
| 25 #include "utrie.h" |
| 26 #include "unewdata.h" |
| 27 #include "gensprep.h" |
| 28 #include "uhash.h" |
| 29 |
| 30 |
| 31 #define DO_DEBUG_OUT 0 |
| 32 |
| 33 |
| 34 /* |
| 35 * StringPrep profile file format ------------------------------------ |
| 36 * |
| 37 * The file format prepared and written here contains a 16-bit trie and a mappin
g table. |
| 38 * |
| 39 * Before the data contents described below, there are the headers required by |
| 40 * the udata API for loading ICU data. Especially, a UDataInfo structure |
| 41 * precedes the actual data. It contains platform properties values and the |
| 42 * file format version. |
| 43 * |
| 44 * The following is a description of format version 2. |
| 45 * |
| 46 * Data contents: |
| 47 * |
| 48 * The contents is a parsed, binary form of RFC3454 and possibly |
| 49 * NormalizationCorrections.txt depending on the options specified on the profil
e. |
| 50 * |
| 51 * Any Unicode code point from 0 to 0x10ffff can be looked up to get |
| 52 * the trie-word, if any, for that code point. This means that the input |
| 53 * to the lookup are 21-bit unsigned integers, with not all of the |
| 54 * 21-bit range used. |
| 55 * |
| 56 * *.spp files customarily begin with a UDataInfo structure, see udata.h and .c. |
| 57 * After that there are the following structures: |
| 58 * |
| 59 * int32_t indexes[_SPREP_INDEX_TOP]; -- _SPREP_INDEX_TOP=16, see enum
in sprpimpl.h file |
| 60 * |
| 61 * UTrie stringPrepTrie; -- size in bytes=indexes[_SPREP_
INDEX_TRIE_SIZE] |
| 62 * |
| 63 * uint16_t mappingTable[]; -- Contains the sequence of co
de units that the code point maps to |
| 64 * size in bytes = indexes[_SPRE
P_INDEX_MAPPING_DATA_SIZE] |
| 65 * |
| 66 * The indexes array contains the following values: |
| 67 * indexes[_SPREP_INDEX_TRIE_SIZE] -- The size of the StringPr
ep trie in bytes |
| 68 * indexes[_SPREP_INDEX_MAPPING_DATA_SIZE] -- The size of the mappingT
able in bytes |
| 69 * indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION] -- The index of Unicode ver
sion of last entry in NormalizationCorrections.txt |
| 70 * indexes[_SPREP_ONE_UCHAR_MAPPING_INDEX_START] -- The starting index of 1
UChar mapping index in the mapping table |
| 71 * indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START] -- The starting index of 2
UChars mapping index in the mapping table |
| 72 * indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_START] -- The starting index of 3
UChars mapping index in the mapping table |
| 73 * indexes[_SPREP_FOUR_UCHARS_MAPPING_INDEX_START] -- The starting index of 4
UChars mapping index in the mapping table |
| 74 * indexes[_SPREP_OPTIONS] -- Bit set of options to tu
rn on in the profile, e.g: USPREP_NORMALIZATION_ON, USPREP_CHECK_BIDI_ON |
| 75 * |
| 76 * |
| 77 * StringPrep Trie : |
| 78 * |
| 79 * The StringPrep trie is a 16-bit trie that contains data for the profile. |
| 80 * Each code point is associated with a value (trie-word) in the trie. |
| 81 * |
| 82 * - structure of data words from the trie |
| 83 * |
| 84 * i) A value greater than or equal to _SPREP_TYPE_THRESHOLD (0xFFF0) |
| 85 * represents the type associated with the code point |
| 86 * if(trieWord >= _SPREP_TYPE_THRESHOLD){ |
| 87 * type = trieWord - 0xFFF0; |
| 88 * } |
| 89 * The type can be : |
| 90 * USPREP_UNASSIGNED |
| 91 * USPREP_PROHIBITED |
| 92 * USPREP_DELETE |
| 93 * |
| 94 * ii) A value less than _SPREP_TYPE_THRESHOLD means the type is USPREP_MAP and |
| 95 * contains distribution described below |
| 96 * |
| 97 * 0 - ON : The code point is prohibited (USPREP_PROHIBITED). This i
s to allow for code points that are both prohibited and mapped. |
| 98 * 1 - ON : The value in the next 14 bits is an index into the mappi
ng table |
| 99 * OFF: The value in the next 14 bits is a delta value from the
code point |
| 100 * 2..15 - Contains data as described by bit 1. If all bits are set |
| 101 * (value = _SPREP_MAX_INDEX_VALUE) then the type is USPREP_DELE
TE |
| 102 * |
| 103 * |
| 104 * Mapping Table: |
| 105 * The data in mapping table is sorted according to the length of the mapping se
quence. |
| 106 * If the type of the code point is USPREP_MAP and value in trie word is an inde
x, the index |
| 107 * is compared with start indexes of sequence length start to figure out the len
gth according to |
| 108 * the following algorithm: |
| 109 * |
| 110 * if( index >= indexes[_SPREP_ONE_UCHAR_MAPPING_INDEX_START]
&& |
| 111 * index < indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START]
){ |
| 112 * length = 1; |
| 113 * }else if(index >= indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START
] && |
| 114 * index < indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_STAR
T]){ |
| 115 * length = 2; |
| 116 * }else if(index >= indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_STA
RT] && |
| 117 * index < indexes[_SPREP_FOUR_UCHARS_MAPPING_INDEX_START
]){ |
| 118 * length = 3; |
| 119 * }else{ |
| 120 * // The first position in the mapping table contains the len
gth |
| 121 * // of the sequence |
| 122 * length = mappingTable[index++]; |
| 123 * |
| 124 * } |
| 125 * |
| 126 */ |
| 127 |
| 128 /* file data ---------------------------------------------------------------- */ |
| 129 /* indexes[] value names */ |
| 130 |
| 131 #if UCONFIG_NO_IDNA |
| 132 |
| 133 /* dummy UDataInfo cf. udata.h */ |
| 134 static UDataInfo dataInfo = { |
| 135 sizeof(UDataInfo), |
| 136 0, |
| 137 |
| 138 U_IS_BIG_ENDIAN, |
| 139 U_CHARSET_FAMILY, |
| 140 U_SIZEOF_UCHAR, |
| 141 0, |
| 142 |
| 143 { 0, 0, 0, 0 }, /* dummy dataFormat */ |
| 144 { 0, 0, 0, 0 }, /* dummy formatVersion */ |
| 145 { 0, 0, 0, 0 } /* dummy dataVersion */ |
| 146 }; |
| 147 |
| 148 #else |
| 149 |
| 150 static int32_t indexes[_SPREP_INDEX_TOP]={ 0 }; |
| 151 |
| 152 static uint16_t* mappingData= NULL; |
| 153 static int32_t mappingDataCapacity = 0; /* we skip the first index in mapping da
ta */ |
| 154 static int16_t currentIndex = 0; /* the current index into the data trie */ |
| 155 static int32_t maxLength = 0; /* maximum length of mapping string */ |
| 156 |
| 157 |
| 158 /* UDataInfo cf. udata.h */ |
| 159 static UDataInfo dataInfo={ |
| 160 sizeof(UDataInfo), |
| 161 0, |
| 162 |
| 163 U_IS_BIG_ENDIAN, |
| 164 U_CHARSET_FAMILY, |
| 165 U_SIZEOF_UCHAR, |
| 166 0, |
| 167 |
| 168 { 0x53, 0x50, 0x52, 0x50 }, /* dataFormat="SPRP" */ |
| 169 { 3, 2, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */ |
| 170 { 3, 2, 0, 0 } /* dataVersion (Unicode version)
*/ |
| 171 }; |
| 172 void |
| 173 setUnicodeVersion(const char *v) { |
| 174 UVersionInfo version; |
| 175 u_versionFromString(version, v); |
| 176 uprv_memcpy(dataInfo.dataVersion, version, 4); |
| 177 } |
| 178 |
| 179 void |
| 180 setUnicodeVersionNC(UVersionInfo version){ |
| 181 uint32_t univer = version[0] << 24; |
| 182 univer += version[1] << 16; |
| 183 univer += version[2] << 8; |
| 184 univer += version[3]; |
| 185 indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION] = univer; |
| 186 } |
| 187 static UNewTrie *sprepTrie; |
| 188 |
/* Data length passed to utrie_open() in init(). */
#define MAX_DATA_LENGTH 11500

/*
 * Inclusive bounds for a delta that storeMapping() stores directly in a
 * trie word instead of going through the mapping table.
 */
#define SPREP_DELTA_RANGE_POSITIVE_LIMIT 8191
#define SPREP_DELTA_RANGE_NEGATIVE_LIMIT (-8192)
| 194 |
| 195 |
| 196 extern void |
| 197 init() { |
| 198 |
| 199 sprepTrie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie)); |
| 200 uprv_memset(sprepTrie, 0, sizeof(UNewTrie)); |
| 201 |
| 202 /* initialize the two tries */ |
| 203 if(NULL==utrie_open(sprepTrie, NULL, MAX_DATA_LENGTH, 0, 0, FALSE)) { |
| 204 fprintf(stderr, "error: failed to initialize tries\n"); |
| 205 exit(U_MEMORY_ALLOCATION_ERROR); |
| 206 } |
| 207 } |
| 208 |
| 209 static UHashtable* hashTable = NULL; |
| 210 |
| 211 |
| 212 typedef struct ValueStruct { |
| 213 UChar* mapping; |
| 214 int16_t length; |
| 215 UStringPrepType type; |
| 216 } ValueStruct; |
| 217 |
| 218 /* Callback for deleting the value from the hashtable */ |
| 219 static void U_CALLCONV valueDeleter(void* obj){ |
| 220 ValueStruct* value = (ValueStruct*) obj; |
| 221 uprv_free(value->mapping); |
| 222 uprv_free(value); |
| 223 } |
| 224 |
| 225 /* Callback for hashing the entry */ |
| 226 static int32_t U_CALLCONV hashEntry(const UHashTok parm) { |
| 227 return parm.integer; |
| 228 } |
| 229 |
| 230 /* Callback for comparing two entries */ |
| 231 static UBool U_CALLCONV compareEntries(const UHashTok p1, const UHashTok p2) { |
| 232 return (UBool)(p1.integer != p2.integer); |
| 233 } |
| 234 |
| 235 |
| 236 static void |
| 237 storeMappingData(){ |
| 238 |
| 239 int32_t pos = -1; |
| 240 const UHashElement* element = NULL; |
| 241 ValueStruct* value = NULL; |
| 242 int32_t codepoint = 0; |
| 243 int32_t elementCount = 0; |
| 244 int32_t writtenElementCount = 0; |
| 245 int32_t mappingLength = 1; /* minimum mapping length */ |
| 246 int32_t oldMappingLength = 0; |
| 247 uint16_t trieWord =0; |
| 248 int32_t limitIndex = 0; |
| 249 |
| 250 if (hashTable == NULL) { |
| 251 return; |
| 252 } |
| 253 elementCount = uhash_count(hashTable); |
| 254 |
| 255 /*initialize the mapping data */ |
| 256 mappingData = (uint16_t*) uprv_malloc(U_SIZEOF_UCHAR * (mappingDataCapacity)
); |
| 257 |
| 258 uprv_memset(mappingData,0,U_SIZEOF_UCHAR * mappingDataCapacity); |
| 259 |
| 260 while(writtenElementCount < elementCount){ |
| 261 |
| 262 while( (element = uhash_nextElement(hashTable, &pos))!=NULL){ |
| 263 |
| 264 codepoint = element->key.integer; |
| 265 value = (ValueStruct*)element->value.pointer; |
| 266 |
| 267 /* store the start of indexes */ |
| 268 if(oldMappingLength != mappingLength){ |
| 269 /* Assume that index[] is used according to the enums defined */ |
| 270 if(oldMappingLength <=_SPREP_MAX_INDEX_TOP_LENGTH){ |
| 271 indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION+mappingLength
] = currentIndex; |
| 272 } |
| 273 if(oldMappingLength <= _SPREP_MAX_INDEX_TOP_LENGTH && |
| 274 mappingLength == _SPREP_MAX_INDEX_TOP_LENGTH +1){ |
| 275 |
| 276 limitIndex = currentIndex; |
| 277 |
| 278 } |
| 279 oldMappingLength = mappingLength; |
| 280 } |
| 281 |
| 282 if(value->length == mappingLength){ |
| 283 uint32_t savedTrieWord = 0; |
| 284 trieWord = currentIndex << 2; |
| 285 /* turn on the 2nd bit to signal that the following bits contain
an index */ |
| 286 trieWord += 0x02; |
| 287 |
| 288 if(trieWord > _SPREP_TYPE_THRESHOLD){ |
| 289 fprintf(stderr,"trieWord cannot contain value greater than 0
x%04X.\n",_SPREP_TYPE_THRESHOLD); |
| 290 exit(U_ILLEGAL_CHAR_FOUND); |
| 291 } |
| 292 /* figure out if the code point has type already stored */ |
| 293 savedTrieWord= utrie_get32(sprepTrie,codepoint,NULL); |
| 294 if(savedTrieWord!=0){ |
| 295 if((savedTrieWord- _SPREP_TYPE_THRESHOLD) == USPREP_PROHIBIT
ED){ |
| 296 /* turn on the first bit in trie word */ |
| 297 trieWord += 0x01; |
| 298 }else{ |
| 299 /* |
| 300 * the codepoint has value something other than prohibit
ed |
| 301 * and a mapping .. error! |
| 302 */ |
| 303 fprintf(stderr,"Type for codepoint \\U%08X already set!.
\n", (int)codepoint); |
| 304 exit(U_ILLEGAL_ARGUMENT_ERROR); |
| 305 } |
| 306 } |
| 307 |
| 308 /* now set the value in the trie */ |
| 309 if(!utrie_set32(sprepTrie,codepoint,trieWord)){ |
| 310 fprintf(stderr,"Could not set the value for code point.\n"); |
| 311 exit(U_ILLEGAL_ARGUMENT_ERROR); |
| 312 } |
| 313 |
| 314 /* written the trie word for the codepoint... increment the coun
t*/ |
| 315 writtenElementCount++; |
| 316 |
| 317 /* sanity check are we exceeding the max number allowed */ |
| 318 if(currentIndex+value->length+1 > _SPREP_MAX_INDEX_VALUE){ |
| 319 fprintf(stderr, "Too many entries in the mapping table %i. M
aximum allowed is %i\n", currentIndex+value->length, _SPREP_MAX_INDEX_VALUE); |
| 320 exit(U_INDEX_OUTOFBOUNDS_ERROR); |
| 321 } |
| 322 |
| 323 /* copy the mapping data */ |
| 324 if(currentIndex+value->length+1 <= mappingDataCapacity){ |
| 325 /* write the length */ |
| 326 if(mappingLength > _SPREP_MAX_INDEX_TOP_LENGTH ){ |
| 327 /* the cast here is safe since we donot expect the leng
th to be > 65535 */ |
| 328 mappingData[currentIndex++] = (uint16_t) mappingLength; |
| 329 } |
| 330 /* copy the contents to mappindData array */ |
| 331 uprv_memmove(mappingData+currentIndex, value->mapping, value
->length*U_SIZEOF_UCHAR); |
| 332 currentIndex += value->length; |
| 333 |
| 334 }else{ |
| 335 /* realloc */ |
| 336 UChar* newMappingData = (uint16_t*) uprv_malloc(U_SIZEOF_UCH
AR * mappingDataCapacity*2); |
| 337 if(newMappingData == NULL){ |
| 338 fprintf(stderr, "Could not realloc the mapping data!\n")
; |
| 339 exit(U_MEMORY_ALLOCATION_ERROR); |
| 340 } |
| 341 uprv_memmove(newMappingData, mappingData, U_SIZEOF_UCHAR * m
appingDataCapacity); |
| 342 mappingDataCapacity *= 2; |
| 343 uprv_free(mappingData); |
| 344 mappingData = newMappingData; |
| 345 /* write the length */ |
| 346 if(mappingLength > _SPREP_MAX_INDEX_TOP_LENGTH ){ |
| 347 /* the cast here is safe since we donot expect the leng
th to be > 65535 */ |
| 348 mappingData[currentIndex++] = (uint16_t) mappingLength; |
| 349 } |
| 350 /* continue copying */ |
| 351 uprv_memmove(mappingData+currentIndex, value->mapping, value
->length*U_SIZEOF_UCHAR); |
| 352 currentIndex += value->length; |
| 353 } |
| 354 |
| 355 } |
| 356 } |
| 357 mappingLength++; |
| 358 pos = -1; |
| 359 } |
| 360 /* set the last length for range check */ |
| 361 if(mappingLength <= _SPREP_MAX_INDEX_TOP_LENGTH){ |
| 362 indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION+mappingLength] = currentI
ndex+1; |
| 363 }else{ |
| 364 indexes[_SPREP_FOUR_UCHARS_MAPPING_INDEX_START] = limitIndex; |
| 365 } |
| 366 |
| 367 } |
| 368 |
| 369 extern void setOptions(int32_t options){ |
| 370 indexes[_SPREP_OPTIONS] = options; |
| 371 } |
| 372 extern void |
| 373 storeMapping(uint32_t codepoint, uint32_t* mapping,int32_t length, |
| 374 UStringPrepType type, UErrorCode* status){ |
| 375 |
| 376 |
| 377 UChar* map = NULL; |
| 378 int16_t adjustedLen=0, i; |
| 379 uint16_t trieWord = 0; |
| 380 ValueStruct *value = NULL; |
| 381 uint32_t savedTrieWord = 0; |
| 382 |
| 383 /* initialize the hashtable */ |
| 384 if(hashTable==NULL){ |
| 385 hashTable = uhash_open(hashEntry, compareEntries, NULL, status); |
| 386 uhash_setValueDeleter(hashTable, valueDeleter); |
| 387 } |
| 388 |
| 389 /* figure out if the code point has type already stored */ |
| 390 savedTrieWord= utrie_get32(sprepTrie,codepoint,NULL); |
| 391 if(savedTrieWord!=0){ |
| 392 if((savedTrieWord- _SPREP_TYPE_THRESHOLD) == USPREP_PROHIBITED){ |
| 393 /* turn on the first bit in trie word */ |
| 394 trieWord += 0x01; |
| 395 }else{ |
| 396 /* |
| 397 * the codepoint has value something other than prohibited |
| 398 * and a mapping .. error! |
| 399 */ |
| 400 fprintf(stderr,"Type for codepoint \\U%08X already set!.\n", (int)co
depoint); |
| 401 exit(U_ILLEGAL_ARGUMENT_ERROR); |
| 402 } |
| 403 } |
| 404 |
| 405 /* figure out the real length */ |
| 406 for(i=0; i<length; i++){ |
| 407 if(mapping[i] > 0xFFFF){ |
| 408 adjustedLen +=2; |
| 409 }else{ |
| 410 adjustedLen++; |
| 411 } |
| 412 } |
| 413 |
| 414 if(adjustedLen == 0){ |
| 415 trieWord = (uint16_t)(_SPREP_MAX_INDEX_VALUE << 2); |
| 416 /* make sure that the value of trieWord is less than the threshold */ |
| 417 if(trieWord < _SPREP_TYPE_THRESHOLD){ |
| 418 /* now set the value in the trie */ |
| 419 if(!utrie_set32(sprepTrie,codepoint,trieWord)){ |
| 420 fprintf(stderr,"Could not set the value for code point.\n"); |
| 421 exit(U_ILLEGAL_ARGUMENT_ERROR); |
| 422 } |
| 423 /* value is set so just return */ |
| 424 return; |
| 425 }else{ |
| 426 fprintf(stderr,"trieWord cannot contain value greater than threshold
0x%04X.\n",_SPREP_TYPE_THRESHOLD); |
| 427 exit(U_ILLEGAL_CHAR_FOUND); |
| 428 } |
| 429 } |
| 430 |
| 431 if(adjustedLen == 1){ |
| 432 /* calculate the delta */ |
| 433 int16_t delta = (int16_t)((int32_t)codepoint - (int16_t) mapping[0]); |
| 434 if(delta >= SPREP_DELTA_RANGE_NEGATIVE_LIMIT && delta <= SPREP_DELTA_RAN
GE_POSITIVE_LIMIT){ |
| 435 |
| 436 trieWord = delta << 2; |
| 437 |
| 438 |
| 439 /* make sure that the second bit is OFF */ |
| 440 if((trieWord & 0x02) != 0 ){ |
| 441 fprintf(stderr,"The second bit in the trie word is not zero whil
e storing a delta.\n"); |
| 442 exit(U_INTERNAL_PROGRAM_ERROR); |
| 443 } |
| 444 /* make sure that the value of trieWord is less than the threshold *
/ |
| 445 if(trieWord < _SPREP_TYPE_THRESHOLD){ |
| 446 /* now set the value in the trie */ |
| 447 if(!utrie_set32(sprepTrie,codepoint,trieWord)){ |
| 448 fprintf(stderr,"Could not set the value for code point.\n"); |
| 449 exit(U_ILLEGAL_ARGUMENT_ERROR); |
| 450 } |
| 451 /* value is set so just return */ |
| 452 return; |
| 453 } |
| 454 } |
| 455 /* |
| 456 * if the delta is not in the given range or if the trieWord is larger t
han the threshold |
| 457 * just fall through for storing the mapping in the mapping table |
| 458 */ |
| 459 } |
| 460 |
| 461 map = (UChar*) uprv_malloc(U_SIZEOF_UCHAR * (adjustedLen+1)); |
| 462 uprv_memset(map,0,U_SIZEOF_UCHAR * (adjustedLen+1)); |
| 463 |
| 464 i=0; |
| 465 |
| 466 while(i<length){ |
| 467 if(mapping[i] <= 0xFFFF){ |
| 468 map[i] = (uint16_t)mapping[i]; |
| 469 }else{ |
| 470 map[i] = UTF16_LEAD(mapping[i]); |
| 471 map[i+1] = UTF16_TRAIL(mapping[i]); |
| 472 } |
| 473 i++; |
| 474 } |
| 475 |
| 476 value = (ValueStruct*) uprv_malloc(sizeof(ValueStruct)); |
| 477 value->mapping = map; |
| 478 value->type = type; |
| 479 value->length = adjustedLen; |
| 480 if(value->length > _SPREP_MAX_INDEX_TOP_LENGTH){ |
| 481 mappingDataCapacity++; |
| 482 } |
| 483 if(maxLength < value->length){ |
| 484 maxLength = value->length; |
| 485 } |
| 486 uhash_iput(hashTable,codepoint,value,status); |
| 487 mappingDataCapacity += adjustedLen; |
| 488 |
| 489 if(U_FAILURE(*status)){ |
| 490 fprintf(stderr, "Failed to put entries into the hastable. Error: %s\n",
u_errorName(*status)); |
| 491 exit(*status); |
| 492 } |
| 493 } |
| 494 |
| 495 |
| 496 extern void |
| 497 storeRange(uint32_t start, uint32_t end, UStringPrepType type,UErrorCode* status
){ |
| 498 uint16_t trieWord = 0; |
| 499 |
| 500 if((int)(_SPREP_TYPE_THRESHOLD + type) > 0xFFFF){ |
| 501 fprintf(stderr,"trieWord cannot contain value greater than 0xFFFF.\n"); |
| 502 exit(U_ILLEGAL_CHAR_FOUND); |
| 503 } |
| 504 trieWord = (_SPREP_TYPE_THRESHOLD + type); /* the top 4 bits contain the val
ue */ |
| 505 if(start == end){ |
| 506 uint32_t savedTrieWord = utrie_get32(sprepTrie, start, NULL); |
| 507 if(savedTrieWord>0){ |
| 508 if(savedTrieWord < _SPREP_TYPE_THRESHOLD && type == USPREP_PROHIBITE
D){ |
| 509 /* |
| 510 * A mapping is stored in the trie word |
| 511 * and the only other possible type that a |
| 512 * code point can have is USPREP_PROHIBITED |
| 513 * |
| 514 */ |
| 515 |
| 516 /* turn on the 0th bit in the savedTrieWord */ |
| 517 savedTrieWord += 0x01; |
| 518 |
| 519 /* the downcast is safe since we only save 16 bit values */ |
| 520 trieWord = (uint16_t)savedTrieWord; |
| 521 |
| 522 /* make sure that the value of trieWord is less than the thresho
ld */ |
| 523 if(trieWord < _SPREP_TYPE_THRESHOLD){ |
| 524 /* now set the value in the trie */ |
| 525 if(!utrie_set32(sprepTrie,start,trieWord)){ |
| 526 fprintf(stderr,"Could not set the value for code point.\
n"); |
| 527 exit(U_ILLEGAL_ARGUMENT_ERROR); |
| 528 } |
| 529 /* value is set so just return */ |
| 530 return; |
| 531 }else{ |
| 532 fprintf(stderr,"trieWord cannot contain value greater than t
hreshold 0x%04X.\n",_SPREP_TYPE_THRESHOLD); |
| 533 exit(U_ILLEGAL_CHAR_FOUND); |
| 534 } |
| 535 |
| 536 }else if(savedTrieWord != trieWord){ |
| 537 fprintf(stderr,"Value for codepoint \\U%08X already set!.\n", (i
nt)start); |
| 538 exit(U_ILLEGAL_ARGUMENT_ERROR); |
| 539 } |
| 540 /* if savedTrieWord == trieWord .. fall through and set the value */ |
| 541 } |
| 542 if(!utrie_set32(sprepTrie,start,trieWord)){ |
| 543 fprintf(stderr,"Could not set the value for code point \\U%08X.\n",
(int)start); |
| 544 exit(U_ILLEGAL_ARGUMENT_ERROR); |
| 545 } |
| 546 }else{ |
| 547 if(!utrie_setRange32(sprepTrie, start, end+1, trieWord, FALSE)){ |
| 548 fprintf(stderr,"Value for certain codepoint already set.\n"); |
| 549 exit(U_ILLEGAL_CHAR_FOUND); |
| 550 } |
| 551 } |
| 552 |
| 553 } |
| 554 |
| 555 /* folding value: just store the offset (16 bits) if there is any non-0 entry */ |
| 556 static uint32_t U_CALLCONV |
| 557 getFoldedValue(UNewTrie *trie, UChar32 start, int32_t offset) { |
| 558 uint32_t foldedValue, value; |
| 559 UChar32 limit=0; |
| 560 UBool inBlockZero; |
| 561 |
| 562 foldedValue=0; |
| 563 |
| 564 limit=start+0x400; |
| 565 while(start<limit) { |
| 566 value=utrie_get32(trie, start, &inBlockZero); |
| 567 if(inBlockZero) { |
| 568 start+=UTRIE_DATA_BLOCK_LENGTH; |
| 569 } else if(value!=0) { |
| 570 return (uint32_t)offset; |
| 571 } else { |
| 572 ++start; |
| 573 } |
| 574 } |
| 575 return 0; |
| 576 |
| 577 } |
| 578 |
| 579 #endif /* #if !UCONFIG_NO_IDNA */ |
| 580 |
| 581 extern void |
| 582 generateData(const char *dataDir, const char* bundleName) { |
| 583 static uint8_t sprepTrieBlock[100000]; |
| 584 |
| 585 UNewDataMemory *pData; |
| 586 UErrorCode errorCode=U_ZERO_ERROR; |
| 587 int32_t size, dataLength; |
| 588 char* fileName = (char*) uprv_malloc(uprv_strlen(bundleName) +100); |
| 589 |
| 590 #if UCONFIG_NO_IDNA |
| 591 |
| 592 size=0; |
| 593 |
| 594 #else |
| 595 |
| 596 int32_t sprepTrieSize; |
| 597 |
| 598 /* sort and add mapping data */ |
| 599 storeMappingData(); |
| 600 |
| 601 sprepTrieSize=utrie_serialize(sprepTrie, sprepTrieBlock, sizeof(sprepTrieBlo
ck), getFoldedValue, TRUE, &errorCode); |
| 602 if(U_FAILURE(errorCode)) { |
| 603 fprintf(stderr, "error: utrie_serialize(sprep trie) failed, %s\n", u_err
orName(errorCode)); |
| 604 exit(errorCode); |
| 605 } |
| 606 |
| 607 size = sprepTrieSize + mappingDataCapacity*U_SIZEOF_UCHAR + sizeof(indexes); |
| 608 if(beVerbose) { |
| 609 printf("size of sprep trie %5u bytes\n", (int)sprepTrieSize
); |
| 610 printf("size of " U_ICUDATA_NAME "_%s." DATA_TYPE " contents: %ld bytes\
n", bundleName,(long)size); |
| 611 printf("size of mapping data array %5u bytes\n",(int)mappingDataCapacity
* U_SIZEOF_UCHAR); |
| 612 printf("Number of code units in mappingData (currentIndex) are: %i \n",
currentIndex); |
| 613 printf("Maximum length of the mapping string is : %i \n", (int)maxLength
); |
| 614 } |
| 615 |
| 616 #endif |
| 617 |
| 618 fileName[0]=0; |
| 619 uprv_strcat(fileName,bundleName); |
| 620 /* write the data */ |
| 621 pData=udata_create(dataDir, DATA_TYPE, fileName, &dataInfo, |
| 622 haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode); |
| 623 if(U_FAILURE(errorCode)) { |
| 624 fprintf(stderr, "gensprep: unable to create the output file, error %d\n"
, errorCode); |
| 625 exit(errorCode); |
| 626 } |
| 627 |
| 628 #if !UCONFIG_NO_IDNA |
| 629 |
| 630 indexes[_SPREP_INDEX_TRIE_SIZE]=sprepTrieSize; |
| 631 indexes[_SPREP_INDEX_MAPPING_DATA_SIZE]=mappingDataCapacity*U_SIZEOF_UCHAR; |
| 632 |
| 633 udata_writeBlock(pData, indexes, sizeof(indexes)); |
| 634 udata_writeBlock(pData, sprepTrieBlock, sprepTrieSize); |
| 635 udata_writeBlock(pData, mappingData, indexes[_SPREP_INDEX_MAPPING_DATA_SIZE]
); |
| 636 |
| 637 |
| 638 #endif |
| 639 |
| 640 /* finish up */ |
| 641 dataLength=udata_finish(pData, &errorCode); |
| 642 if(U_FAILURE(errorCode)) { |
| 643 fprintf(stderr, "gensprep: error %d writing the output file\n", errorCod
e); |
| 644 exit(errorCode); |
| 645 } |
| 646 |
| 647 if(dataLength!=size) { |
| 648 fprintf(stderr, "gensprep error: data length %ld != calculated size %ld\
n", |
| 649 (long)dataLength, (long)size); |
| 650 exit(U_INTERNAL_PROGRAM_ERROR); |
| 651 } |
| 652 |
| 653 #if !UCONFIG_NO_IDNA |
| 654 /* done with writing the data .. close the hashtable */ |
| 655 if (hashTable != NULL) { |
| 656 uhash_close(hashTable); |
| 657 } |
| 658 #endif |
| 659 } |
| 660 |
| 661 #if !UCONFIG_NO_IDNA |
| 662 |
| 663 extern void |
| 664 cleanUpData(void) { |
| 665 |
| 666 utrie_close(sprepTrie); |
| 667 uprv_free(sprepTrie); |
| 668 } |
| 669 |
| 670 #endif /* #if !UCONFIG_NO_IDNA */ |
| 671 |
| 672 /* |
| 673 * Hey, Emacs, please set the following: |
| 674 * |
| 675 * Local Variables: |
| 676 * indent-tabs-mode: nil |
| 677 * End: |
| 678 * |
| 679 */ |
OLD | NEW |