OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ****************************************************************************** |
| 3 * |
| 4 * Copyright (C) 1999-2009, International Business Machines |
| 5 * Corporation and others. All Rights Reserved. |
| 6 * |
| 7 ****************************************************************************** |
| 8 * file name: unames.c |
| 9 * encoding: US-ASCII |
| 10 * tab size: 8 (not used) |
| 11 * indentation:4 |
| 12 * |
| 13 * created on: 1999oct04 |
| 14 * created by: Markus W. Scherer |
| 15 */ |
| 16 |
| 17 #include "unicode/utypes.h" |
| 18 #include "unicode/putil.h" |
| 19 #include "unicode/uchar.h" |
| 20 #include "unicode/udata.h" |
| 21 #include "ustr_imp.h" |
| 22 #include "umutex.h" |
| 23 #include "cmemory.h" |
| 24 #include "cstring.h" |
| 25 #include "ucln_cmn.h" |
| 26 #include "udataswp.h" |
| 27 #include "uprops.h" |
| 28 |
| 29 /* prototypes ------------------------------------------------------------- */ |
| 30 |
/* Element count of a real array; must not be applied to a pointer. */
#define LENGTHOF(array) (sizeof(array)/sizeof(*(array)))

/* Name and type used to open the character-names data item. */
static const char DATA_NAME[] = "unames";
static const char DATA_TYPE[] = "icu";

/*
 * Names are stored in groups of 32 consecutive code points:
 * the group number is code>>GROUP_SHIFT and the line within
 * the group is code&GROUP_MASK.
 */
#define GROUP_SHIFT 5
#define LINES_PER_GROUP (1UL<<GROUP_SHIFT)
#define GROUP_MASK (LINES_PER_GROUP-1)
| 39 |
| 40 /* |
| 41 * This struct was replaced by explicitly accessing equivalent |
| 42 * fields from triples of uint16_t. |
| 43 * The Group struct was padded to 8 bytes on compilers for early ARM CPUs, |
| 44 * which broke the assumption that sizeof(Group)==6 and that the ++ operator |
| 45 * would advance by 6 bytes (3 uint16_t). |
| 46 * |
| 47 * We can't just change the data structure because it's loaded from a data file, |
| 48 * and we don't want to make it less compact, so we changed the access code. |
| 49 * |
| 50 * For details see ICU tickets 6331 and 6008. |
| 51 typedef struct { |
| 52 uint16_t groupMSB, |
| 53 offsetHigh, offsetLow; / * avoid padding * / |
| 54 } Group; |
| 55 */ |
/* Indexes of the uint16_t fields inside one group triple. */
enum {
    GROUP_MSB,          /* code point >> GROUP_SHIFT of the group */
    GROUP_OFFSET_HIGH,  /* upper 16 bits of the group's string offset */
    GROUP_OFFSET_LOW,   /* lower 16 bits of the group's string offset */
    GROUP_LENGTH        /* number of uint16_t per group */
};

/*
 * Get the 32-bit group offset.
 * The two halves are combined in unsigned arithmetic and only then
 * converted to int32_t: shifting a signed value whose high half is
 * 0x8000 or above left by 16 would overflow, which is undefined in C.
 * @param group (const uint16_t *) pointer to a Group triple of uint16_t
 * @return group offset (int32_t)
 */
#define GET_GROUP_OFFSET(group) \
    ((int32_t)((uint32_t)(group)[GROUP_OFFSET_HIGH]<<16|(group)[GROUP_OFFSET_LOW]))

/* Step from one group triple to the following / preceding one. */
#define NEXT_GROUP(group) ((group)+GROUP_LENGTH)
#define PREV_GROUP(group) ((group)-GROUP_LENGTH)
| 72 |
/*
 * Header of one algorithmic-name range.
 * The data that getAlgName() reads for the range (prefix string, factors,
 * element strings) follows this header directly in memory.
 */
typedef struct {
    uint32_t start;   /* first code point of the range */
    uint32_t end;     /* last code point of the range */
    uint8_t type;     /* 0: prefix + hex digits, 1: prefix + factorized elements */
    uint8_t variant;  /* type 0: number of hex digits; type 1: number of factors */
    uint16_t size;    /* NOTE(review): presumably the byte size of this range record - confirm */
} AlgorithmicRange;
| 78 |
/*
 * Index header at the start of the names data.
 * Each field is a byte offset counted from the start of this struct.
 */
typedef struct {
    uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset;
} UCharNames;

/*
 * Get the groups table from a UCharNames struct.
 * The groups table consists of one uint16_t groupCount followed by
 * groupCount groups. Each group is a triple of uint16_t, see GROUP_LENGTH
 * and the comment for the old struct Group above.
 *
 * The argument and the whole expansion are parenthesized so that the macro
 * can be used with any pointer expression and inside larger expressions
 * (for example GET_GROUPS(p)[0]). The argument is evaluated twice.
 *
 * @param names (const UCharNames *) pointer to the UCharNames indexes
 * @return (const uint16_t *) pointer to the groups table
 */
#define GET_GROUPS(names) \
    ((const uint16_t *)((const char *)(names)+(names)->groupsOffset))
| 93 |
| 94 typedef struct { |
| 95 const char *otherName; |
| 96 UChar32 code; |
| 97 } FindName; |
| 98 |
| 99 #define DO_FIND_NAME NULL |
| 100 |
| 101 static UDataMemory *uCharNamesData=NULL; |
| 102 static UCharNames *uCharNames=NULL; |
| 103 static UErrorCode gLoadErrorCode=U_ZERO_ERROR; |
| 104 |
| 105 /* |
| 106 * Maximum length of character names (regular & 1.0). |
| 107 */ |
| 108 static int32_t gMaxNameLength=0; |
| 109 |
| 110 /* |
| 111 * Set of chars used in character names (regular & 1.0). |
| 112 * Chars are platform-dependent (can be EBCDIC). |
| 113 */ |
| 114 static uint32_t gNameSet[8]={ 0 }; |
| 115 |
/*
 * Pseudo-categories used only for extended names; they are numbered
 * directly after the regular general categories.
 * Each expansion is parenthesized so that it stays one value inside any
 * surrounding expression.
 */
#define U_NONCHARACTER_CODE_POINT (U_CHAR_CATEGORY_COUNT)
#define U_LEAD_SURROGATE (U_CHAR_CATEGORY_COUNT + 1)
#define U_TRAIL_SURROGATE (U_CHAR_CATEGORY_COUNT + 2)

#define U_CHAR_EXTENDED_CATEGORY_COUNT (U_CHAR_CATEGORY_COUNT + 3)
| 121 |
| 122 static const char * const charCatNames[U_CHAR_EXTENDED_CATEGORY_COUNT] = { |
| 123 "unassigned", |
| 124 "uppercase letter", |
| 125 "lowercase letter", |
| 126 "titlecase letter", |
| 127 "modifier letter", |
| 128 "other letter", |
| 129 "non spacing mark", |
| 130 "enclosing mark", |
| 131 "combining spacing mark", |
| 132 "decimal digit number", |
| 133 "letter number", |
| 134 "other number", |
| 135 "space separator", |
| 136 "line separator", |
| 137 "paragraph separator", |
| 138 "control", |
| 139 "format", |
| 140 "private use area", |
| 141 "surrogate", |
| 142 "dash punctuation", |
| 143 "start punctuation", |
| 144 "end punctuation", |
| 145 "connector punctuation", |
| 146 "other punctuation", |
| 147 "math symbol", |
| 148 "currency symbol", |
| 149 "modifier symbol", |
| 150 "other symbol", |
| 151 "initial punctuation", |
| 152 "final punctuation", |
| 153 "noncharacter", |
| 154 "lead surrogate", |
| 155 "trail surrogate" |
| 156 }; |
| 157 |
| 158 /* implementation ----------------------------------------------------------- */ |
| 159 |
| 160 static UBool U_CALLCONV unames_cleanup(void) |
| 161 { |
| 162 if(uCharNamesData) { |
| 163 udata_close(uCharNamesData); |
| 164 uCharNamesData = NULL; |
| 165 } |
| 166 if(uCharNames) { |
| 167 uCharNames = NULL; |
| 168 } |
| 169 gMaxNameLength=0; |
| 170 return TRUE; |
| 171 } |
| 172 |
| 173 static UBool U_CALLCONV |
| 174 isAcceptable(void *context, |
| 175 const char *type, const char *name, |
| 176 const UDataInfo *pInfo) { |
| 177 return (UBool)( |
| 178 pInfo->size>=20 && |
| 179 pInfo->isBigEndian==U_IS_BIG_ENDIAN && |
| 180 pInfo->charsetFamily==U_CHARSET_FAMILY && |
| 181 pInfo->dataFormat[0]==0x75 && /* dataFormat="unam" */ |
| 182 pInfo->dataFormat[1]==0x6e && |
| 183 pInfo->dataFormat[2]==0x61 && |
| 184 pInfo->dataFormat[3]==0x6d && |
| 185 pInfo->formatVersion[0]==1); |
| 186 } |
| 187 |
| 188 static UBool |
| 189 isDataLoaded(UErrorCode *pErrorCode) { |
| 190 /* load UCharNames from file if necessary */ |
| 191 UBool isCached; |
| 192 |
| 193 /* do this because double-checked locking is broken */ |
| 194 UMTX_CHECK(NULL, (uCharNames!=NULL), isCached); |
| 195 |
| 196 if(!isCached) { |
| 197 UCharNames *names; |
| 198 UDataMemory *data; |
| 199 |
| 200 /* check error code from previous attempt */ |
| 201 if(U_FAILURE(gLoadErrorCode)) { |
| 202 *pErrorCode=gLoadErrorCode; |
| 203 return FALSE; |
| 204 } |
| 205 |
| 206 /* open the data outside the mutex block */ |
| 207 data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, pE
rrorCode); |
| 208 if(U_FAILURE(*pErrorCode)) { |
| 209 gLoadErrorCode=*pErrorCode; |
| 210 return FALSE; |
| 211 } |
| 212 |
| 213 names=(UCharNames *)udata_getMemory(data); |
| 214 |
| 215 /* in the mutex block, set the data for this process */ |
| 216 { |
| 217 umtx_lock(NULL); |
| 218 if(uCharNames==NULL) { |
| 219 uCharNamesData=data; |
| 220 uCharNames=names; |
| 221 data=NULL; |
| 222 names=NULL; |
| 223 ucln_common_registerCleanup(UCLN_COMMON_UNAMES, unames_cleanup); |
| 224 } |
| 225 umtx_unlock(NULL); |
| 226 } |
| 227 |
| 228 /* if a different thread set it first, then close the extra data */ |
| 229 if(data!=NULL) { |
| 230 udata_close(data); /* NULL if it was set correctly */ |
| 231 } |
| 232 } |
| 233 return TRUE; |
| 234 } |
| 235 |
/*
 * Append one char to an output buffer with preflighting semantics:
 * the char is stored only while bufferLength>0, but bufferPos is always
 * incremented so that it ends up as the full, untruncated length.
 * buffer, bufferLength and bufferPos are modified in place.
 * c is evaluated only if there is room for it, so it must not have
 * side effects.
 *
 * Wrapped in do { } while(0) so that "WRITE_CHAR(...);" is exactly one
 * statement, also in an unbraced if/else.
 */
#define WRITE_CHAR(buffer, bufferLength, bufferPos, c) do { \
    if((bufferLength)>0) { \
        *(buffer)++=(c); \
        --(bufferLength); \
    } \
    ++(bufferPos); \
} while(0)

/* Internal name choice for the ISO comment, numbered after the public choices. */
#define U_ISO_COMMENT U_CHAR_NAME_CHOICE_COUNT
| 245 |
| 246 /* |
| 247 * Important: expandName() and compareName() are almost the same - |
| 248 * apply fixes to both. |
| 249 * |
| 250 * UnicodeData.txt uses ';' as a field separator, so no |
| 251 * field can contain ';' as part of its contents. |
| 252 * In unames.dat, it is marked as token[';']==-1 only if the |
| 253 * semicolon is used in the data file - which is iff we |
| 254 * have Unicode 1.0 names or ISO comments or aliases. |
| 255 * So, it will be token[';']==-1 if we store U1.0 names/ISO comments/aliases |
| 256 * although we know that it will never be part of a name. |
| 257 */ |
| 258 static uint16_t |
| 259 expandName(UCharNames *names, |
| 260 const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice, |
| 261 char *buffer, uint16_t bufferLength) { |
| 262 uint16_t *tokens=(uint16_t *)names+8; |
| 263 uint16_t token, tokenCount=*tokens++, bufferPos=0; |
| 264 uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset; |
| 265 uint8_t c; |
| 266 |
| 267 if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) { |
| 268 /* |
| 269 * skip the modern name if it is not requested _and_ |
| 270 * if the semicolon byte value is a character, not a token number |
| 271 */ |
| 272 if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) { |
| 273 int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice; |
| 274 do { |
| 275 while(nameLength>0) { |
| 276 --nameLength; |
| 277 if(*name++==';') { |
| 278 break; |
| 279 } |
| 280 } |
| 281 } while(--fieldIndex>0); |
| 282 } else { |
| 283 /* |
| 284 * the semicolon byte value is a token number, therefore |
| 285 * only modern names are stored in unames.dat and there is no |
| 286 * such requested alternate name here |
| 287 */ |
| 288 nameLength=0; |
| 289 } |
| 290 } |
| 291 |
| 292 /* write each letter directly, and write a token word per token */ |
| 293 while(nameLength>0) { |
| 294 --nameLength; |
| 295 c=*name++; |
| 296 |
| 297 if(c>=tokenCount) { |
| 298 if(c!=';') { |
| 299 /* implicit letter */ |
| 300 WRITE_CHAR(buffer, bufferLength, bufferPos, c); |
| 301 } else { |
| 302 /* finished */ |
| 303 break; |
| 304 } |
| 305 } else { |
| 306 token=tokens[c]; |
| 307 if(token==(uint16_t)(-2)) { |
| 308 /* this is a lead byte for a double-byte token */ |
| 309 token=tokens[c<<8|*name++]; |
| 310 --nameLength; |
| 311 } |
| 312 if(token==(uint16_t)(-1)) { |
| 313 if(c!=';') { |
| 314 /* explicit letter */ |
| 315 WRITE_CHAR(buffer, bufferLength, bufferPos, c); |
| 316 } else { |
| 317 /* stop, but skip the semicolon if we are seeking |
| 318 extended names and there was no 2.0 name but there |
| 319 is a 1.0 name. */ |
| 320 if(!bufferPos && nameChoice == U_EXTENDED_CHAR_NAME) { |
| 321 if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(u
int16_t)(-1)) { |
| 322 continue; |
| 323 } |
| 324 } |
| 325 /* finished */ |
| 326 break; |
| 327 } |
| 328 } else { |
| 329 /* write token word */ |
| 330 uint8_t *tokenString=tokenStrings+token; |
| 331 while((c=*tokenString++)!=0) { |
| 332 WRITE_CHAR(buffer, bufferLength, bufferPos, c); |
| 333 } |
| 334 } |
| 335 } |
| 336 } |
| 337 |
| 338 /* zero-terminate */ |
| 339 if(bufferLength>0) { |
| 340 *buffer=0; |
| 341 } |
| 342 |
| 343 return bufferPos; |
| 344 } |
| 345 |
| 346 /* |
| 347 * compareName() is almost the same as expandName() except that it compares |
| 348 * the currently expanded name to an input name. |
| 349 * It returns the match/no match result as soon as possible. |
| 350 */ |
| 351 static UBool |
| 352 compareName(UCharNames *names, |
| 353 const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice
, |
| 354 const char *otherName) { |
| 355 uint16_t *tokens=(uint16_t *)names+8; |
| 356 uint16_t token, tokenCount=*tokens++; |
| 357 uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset; |
| 358 uint8_t c; |
| 359 const char *origOtherName = otherName; |
| 360 |
| 361 if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) { |
| 362 /* |
| 363 * skip the modern name if it is not requested _and_ |
| 364 * if the semicolon byte value is a character, not a token number |
| 365 */ |
| 366 if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) { |
| 367 int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice; |
| 368 do { |
| 369 while(nameLength>0) { |
| 370 --nameLength; |
| 371 if(*name++==';') { |
| 372 break; |
| 373 } |
| 374 } |
| 375 } while(--fieldIndex>0); |
| 376 } else { |
| 377 /* |
| 378 * the semicolon byte value is a token number, therefore |
| 379 * only modern names are stored in unames.dat and there is no |
| 380 * such requested alternate name here |
| 381 */ |
| 382 nameLength=0; |
| 383 } |
| 384 } |
| 385 |
| 386 /* compare each letter directly, and compare a token word per token */ |
| 387 while(nameLength>0) { |
| 388 --nameLength; |
| 389 c=*name++; |
| 390 |
| 391 if(c>=tokenCount) { |
| 392 if(c!=';') { |
| 393 /* implicit letter */ |
| 394 if((char)c!=*otherName++) { |
| 395 return FALSE; |
| 396 } |
| 397 } else { |
| 398 /* finished */ |
| 399 break; |
| 400 } |
| 401 } else { |
| 402 token=tokens[c]; |
| 403 if(token==(uint16_t)(-2)) { |
| 404 /* this is a lead byte for a double-byte token */ |
| 405 token=tokens[c<<8|*name++]; |
| 406 --nameLength; |
| 407 } |
| 408 if(token==(uint16_t)(-1)) { |
| 409 if(c!=';') { |
| 410 /* explicit letter */ |
| 411 if((char)c!=*otherName++) { |
| 412 return FALSE; |
| 413 } |
| 414 } else { |
| 415 /* stop, but skip the semicolon if we are seeking |
| 416 extended names and there was no 2.0 name but there |
| 417 is a 1.0 name. */ |
| 418 if(otherName == origOtherName && nameChoice == U_EXTENDED_CH
AR_NAME) { |
| 419 if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(u
int16_t)(-1)) { |
| 420 continue; |
| 421 } |
| 422 } |
| 423 /* finished */ |
| 424 break; |
| 425 } |
| 426 } else { |
| 427 /* write token word */ |
| 428 uint8_t *tokenString=tokenStrings+token; |
| 429 while((c=*tokenString++)!=0) { |
| 430 if((char)c!=*otherName++) { |
| 431 return FALSE; |
| 432 } |
| 433 } |
| 434 } |
| 435 } |
| 436 } |
| 437 |
| 438 /* complete match? */ |
| 439 return (UBool)(*otherName==0); |
| 440 } |
| 441 |
| 442 static uint8_t getCharCat(UChar32 cp) { |
| 443 uint8_t cat; |
| 444 |
| 445 if (UTF_IS_UNICODE_NONCHAR(cp)) { |
| 446 return U_NONCHARACTER_CODE_POINT; |
| 447 } |
| 448 |
| 449 if ((cat = u_charType(cp)) == U_SURROGATE) { |
| 450 cat = UTF_IS_LEAD(cp) ? U_LEAD_SURROGATE : U_TRAIL_SURROGATE; |
| 451 } |
| 452 |
| 453 return cat; |
| 454 } |
| 455 |
| 456 static const char *getCharCatName(UChar32 cp) { |
| 457 uint8_t cat = getCharCat(cp); |
| 458 |
| 459 /* Return unknown if the table of names above is not up to |
| 460 date. */ |
| 461 |
| 462 if (cat >= LENGTHOF(charCatNames)) { |
| 463 return "unknown"; |
| 464 } else { |
| 465 return charCatNames[cat]; |
| 466 } |
| 467 } |
| 468 |
/*
 * Write the extended name "<category-XXXX>" for a code point, where XXXX
 * is the code point in uppercase hex with at least 4 digits.
 *
 * The output is NOT NUL-terminated here; callers terminate it themselves.
 * Nothing is ever written beyond bufferLength chars. If the buffer is too
 * small, the output is truncated and the return value is the length that
 * the complete name needs.
 *
 * @param code         the code point
 * @param buffer       output buffer
 * @param bufferLength capacity of buffer in chars
 * @return the full length of the extended name
 */
static uint16_t getExtName(uint32_t code, char *buffer, uint16_t bufferLength) {
    const char *catname = getCharCatName(code);
    uint16_t length = 0;

    /* unsigned, so that the right shifts below always reach 0 */
    uint32_t cp;
    int ndigits, i;

    WRITE_CHAR(buffer, bufferLength, length, '<');
    while (*catname != 0) {
        WRITE_CHAR(buffer, bufferLength, length, *catname);
        ++catname;
    }
    WRITE_CHAR(buffer, bufferLength, length, '-');

    /* count the hex digits, with a minimum of 4 */
    for (cp = code, ndigits = 0; cp != 0; ++ndigits, cp >>= 4) {
    }
    if (ndigits < 4) {
        ndigits = 4;
    }

    /*
     * The digits are produced from the least significant one, that is,
     * from right to left. Store only those whose position is inside the
     * remaining buffer.
     */
    for (cp = code, i = ndigits; i > 0; cp >>= 4) {
        uint8_t v = (uint8_t)(cp & 0xf);
        if (--i < bufferLength) {
            buffer[i] = (char)(v < 10 ? '0' + v : 'A' + v - 10);
        }
    }

    /* advance over the digits, but never beyond the end of the buffer */
    if (ndigits <= bufferLength) {
        buffer += ndigits;
        bufferLength = (uint16_t)(bufferLength - ndigits);
    } else {
        buffer += bufferLength;
        bufferLength = 0;
    }
    length = (uint16_t)(length + ndigits);

    WRITE_CHAR(buffer, bufferLength, length, '>');

    return length;
}
| 495 |
| 496 /* |
| 497 * getGroup() does a binary search for the group that contains the |
| 498 * Unicode code point "code". |
| 499 * The return value is always a valid Group* that may contain "code" |
| 500 * or else is the highest group before "code". |
| 501 * If the lowest group is after "code", then that one is returned. |
| 502 */ |
| 503 static const uint16_t * |
| 504 getGroup(UCharNames *names, uint32_t code) { |
| 505 const uint16_t *groups=GET_GROUPS(names); |
| 506 uint16_t groupMSB=(uint16_t)(code>>GROUP_SHIFT), |
| 507 start=0, |
| 508 limit=*groups++, |
| 509 number; |
| 510 |
| 511 /* binary search for the group of names that contains the one for code */ |
| 512 while(start<limit-1) { |
| 513 number=(uint16_t)((start+limit)/2); |
| 514 if(groupMSB<groups[number*GROUP_LENGTH+GROUP_MSB]) { |
| 515 limit=number; |
| 516 } else { |
| 517 start=number; |
| 518 } |
| 519 } |
| 520 |
| 521 /* return this regardless of whether it is an exact match */ |
| 522 return groups+start*GROUP_LENGTH; |
| 523 } |
| 524 |
| 525 /* |
| 526 * expandGroupLengths() reads a block of compressed lengths of 32 strings and |
| 527 * expands them into offsets and lengths for each string. |
| 528 * Lengths are stored with a variable-width encoding in consecutive nibbles: |
| 529 * If a nibble<0xc, then it is the length itself (0=empty string). |
| 530 * If a nibble>=0xc, then it forms a length value with the following nibble. |
| 531 * Calculation see below. |
| 532 * The offsets and lengths arrays must be at least 33 (one more) long because |
| 533 * there is no check here at the end if the last nibble is still used. |
| 534 */ |
| 535 static const uint8_t * |
| 536 expandGroupLengths(const uint8_t *s, |
| 537 uint16_t offsets[LINES_PER_GROUP+1], uint16_t lengths[LINES_P
ER_GROUP+1]) { |
| 538 /* read the lengths of the 32 strings in this group and get each string's of
fset */ |
| 539 uint16_t i=0, offset=0, length=0; |
| 540 uint8_t lengthByte; |
| 541 |
| 542 /* all 32 lengths must be read to get the offset of the first group string *
/ |
| 543 while(i<LINES_PER_GROUP) { |
| 544 lengthByte=*s++; |
| 545 |
| 546 /* read even nibble - MSBs of lengthByte */ |
| 547 if(length>=12) { |
| 548 /* double-nibble length spread across two bytes */ |
| 549 length=(uint16_t)(((length&0x3)<<4|lengthByte>>4)+12); |
| 550 lengthByte&=0xf; |
| 551 } else if((lengthByte /* &0xf0 */)>=0xc0) { |
| 552 /* double-nibble length spread across this one byte */ |
| 553 length=(uint16_t)((lengthByte&0x3f)+12); |
| 554 } else { |
| 555 /* single-nibble length in MSBs */ |
| 556 length=(uint16_t)(lengthByte>>4); |
| 557 lengthByte&=0xf; |
| 558 } |
| 559 |
| 560 *offsets++=offset; |
| 561 *lengths++=length; |
| 562 |
| 563 offset+=length; |
| 564 ++i; |
| 565 |
| 566 /* read odd nibble - LSBs of lengthByte */ |
| 567 if((lengthByte&0xf0)==0) { |
| 568 /* this nibble was not consumed for a double-nibble length above */ |
| 569 length=lengthByte; |
| 570 if(length<12) { |
| 571 /* single-nibble length in LSBs */ |
| 572 *offsets++=offset; |
| 573 *lengths++=length; |
| 574 |
| 575 offset+=length; |
| 576 ++i; |
| 577 } |
| 578 } else { |
| 579 length=0; /* prevent double-nibble detection in the next iteration
*/ |
| 580 } |
| 581 } |
| 582 |
| 583 /* now, s is at the first group string */ |
| 584 return s; |
| 585 } |
| 586 |
| 587 static uint16_t |
| 588 expandGroupName(UCharNames *names, const uint16_t *group, |
| 589 uint16_t lineNumber, UCharNameChoice nameChoice, |
| 590 char *buffer, uint16_t bufferLength) { |
| 591 uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2]; |
| 592 const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(
group); |
| 593 s=expandGroupLengths(s, offsets, lengths); |
| 594 return expandName(names, s+offsets[lineNumber], lengths[lineNumber], nameCho
ice, |
| 595 buffer, bufferLength); |
| 596 } |
| 597 |
| 598 static uint16_t |
| 599 getName(UCharNames *names, uint32_t code, UCharNameChoice nameChoice, |
| 600 char *buffer, uint16_t bufferLength) { |
| 601 const uint16_t *group=getGroup(names, code); |
| 602 if((uint16_t)(code>>GROUP_SHIFT)==group[GROUP_MSB]) { |
| 603 return expandGroupName(names, group, (uint16_t)(code&GROUP_MASK), nameCh
oice, |
| 604 buffer, bufferLength); |
| 605 } else { |
| 606 /* group not found */ |
| 607 /* zero-terminate */ |
| 608 if(bufferLength>0) { |
| 609 *buffer=0; |
| 610 } |
| 611 return 0; |
| 612 } |
| 613 } |
| 614 |
| 615 /* |
| 616 * enumGroupNames() enumerates all the names in a 32-group |
| 617 * and either calls the enumerator function or finds a given input name. |
| 618 */ |
| 619 static UBool |
| 620 enumGroupNames(UCharNames *names, const uint16_t *group, |
| 621 UChar32 start, UChar32 end, |
| 622 UEnumCharNamesFn *fn, void *context, |
| 623 UCharNameChoice nameChoice) { |
| 624 uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2]; |
| 625 const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(
group); |
| 626 |
| 627 s=expandGroupLengths(s, offsets, lengths); |
| 628 if(fn!=DO_FIND_NAME) { |
| 629 char buffer[200]; |
| 630 uint16_t length; |
| 631 |
| 632 while(start<=end) { |
| 633 length=expandName(names, s+offsets[start&GROUP_MASK], lengths[start&
GROUP_MASK], nameChoice, buffer, sizeof(buffer)); |
| 634 if (!length && nameChoice == U_EXTENDED_CHAR_NAME) { |
| 635 buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0; |
| 636 } |
| 637 /* here, we assume that the buffer is large enough */ |
| 638 if(length>0) { |
| 639 if(!fn(context, start, nameChoice, buffer, length)) { |
| 640 return FALSE; |
| 641 } |
| 642 } |
| 643 ++start; |
| 644 } |
| 645 } else { |
| 646 const char *otherName=((FindName *)context)->otherName; |
| 647 while(start<=end) { |
| 648 if(compareName(names, s+offsets[start&GROUP_MASK], lengths[start&GRO
UP_MASK], nameChoice, otherName)) { |
| 649 ((FindName *)context)->code=start; |
| 650 return FALSE; |
| 651 } |
| 652 ++start; |
| 653 } |
| 654 } |
| 655 return TRUE; |
| 656 } |
| 657 |
| 658 /* |
| 659 * enumExtNames enumerate extended names. |
| 660 * It only needs to do it if it is called with a real function and not |
| 661 * with the dummy DO_FIND_NAME, because u_charFromName() does a check |
| 662 * for extended names by itself. |
| 663 */ |
| 664 static UBool |
| 665 enumExtNames(UChar32 start, UChar32 end, |
| 666 UEnumCharNamesFn *fn, void *context) |
| 667 { |
| 668 if(fn!=DO_FIND_NAME) { |
| 669 char buffer[200]; |
| 670 uint16_t length; |
| 671 |
| 672 while(start<=end) { |
| 673 buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0; |
| 674 /* here, we assume that the buffer is large enough */ |
| 675 if(length>0) { |
| 676 if(!fn(context, start, U_EXTENDED_CHAR_NAME, buffer, length)) { |
| 677 return FALSE; |
| 678 } |
| 679 } |
| 680 ++start; |
| 681 } |
| 682 } |
| 683 |
| 684 return TRUE; |
| 685 } |
| 686 |
| 687 static UBool |
| 688 enumNames(UCharNames *names, |
| 689 UChar32 start, UChar32 limit, |
| 690 UEnumCharNamesFn *fn, void *context, |
| 691 UCharNameChoice nameChoice) { |
| 692 uint16_t startGroupMSB, endGroupMSB, groupCount; |
| 693 const uint16_t *group, *groupLimit; |
| 694 |
| 695 startGroupMSB=(uint16_t)(start>>GROUP_SHIFT); |
| 696 endGroupMSB=(uint16_t)((limit-1)>>GROUP_SHIFT); |
| 697 |
| 698 /* find the group that contains start, or the highest before it */ |
| 699 group=getGroup(names, start); |
| 700 |
| 701 if(startGroupMSB==endGroupMSB) { |
| 702 if(startGroupMSB==group[GROUP_MSB]) { |
| 703 /* if start and limit-1 are in the same group, then enumerate only i
n that one */ |
| 704 return enumGroupNames(names, group, start, limit-1, fn, context, nam
eChoice); |
| 705 } |
| 706 } else { |
| 707 const uint16_t *groups=GET_GROUPS(names); |
| 708 groupCount=*groups++; |
| 709 groupLimit=groups+groupCount*GROUP_LENGTH; |
| 710 |
| 711 if(startGroupMSB==group[GROUP_MSB]) { |
| 712 /* enumerate characters in the partial start group */ |
| 713 if((start&GROUP_MASK)!=0) { |
| 714 if(!enumGroupNames(names, group, |
| 715 start, ((UChar32)startGroupMSB<<GROUP_SHIFT)+
LINES_PER_GROUP-1, |
| 716 fn, context, nameChoice)) { |
| 717 return FALSE; |
| 718 } |
| 719 group=NEXT_GROUP(group); /* continue with the next group */ |
| 720 } |
| 721 } else if(startGroupMSB>group[GROUP_MSB]) { |
| 722 /* make sure that we start enumerating with the first group after st
art */ |
| 723 const uint16_t *nextGroup=NEXT_GROUP(group); |
| 724 if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > startGroupMSB &
& nameChoice == U_EXTENDED_CHAR_NAME) { |
| 725 UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT; |
| 726 if (end > limit) { |
| 727 end = limit; |
| 728 } |
| 729 if (!enumExtNames(start, end - 1, fn, context)) { |
| 730 return FALSE; |
| 731 } |
| 732 } |
| 733 group=nextGroup; |
| 734 } |
| 735 |
| 736 /* enumerate entire groups between the start- and end-groups */ |
| 737 while(group<groupLimit && group[GROUP_MSB]<endGroupMSB) { |
| 738 const uint16_t *nextGroup; |
| 739 start=(UChar32)group[GROUP_MSB]<<GROUP_SHIFT; |
| 740 if(!enumGroupNames(names, group, start, start+LINES_PER_GROUP-1, fn,
context, nameChoice)) { |
| 741 return FALSE; |
| 742 } |
| 743 nextGroup=NEXT_GROUP(group); |
| 744 if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > group[GROUP_MSB
] + 1 && nameChoice == U_EXTENDED_CHAR_NAME) { |
| 745 UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT; |
| 746 if (end > limit) { |
| 747 end = limit; |
| 748 } |
| 749 if (!enumExtNames((group[GROUP_MSB] + 1) << GROUP_SHIFT, end - 1
, fn, context)) { |
| 750 return FALSE; |
| 751 } |
| 752 } |
| 753 group=nextGroup; |
| 754 } |
| 755 |
| 756 /* enumerate within the end group (group[GROUP_MSB]==endGroupMSB) */ |
| 757 if(group<groupLimit && group[GROUP_MSB]==endGroupMSB) { |
| 758 return enumGroupNames(names, group, (limit-1)&~GROUP_MASK, limit-1,
fn, context, nameChoice); |
| 759 } else if (nameChoice == U_EXTENDED_CHAR_NAME && group == groupLimit) { |
| 760 UChar32 next = (PREV_GROUP(group)[GROUP_MSB] + 1) << GROUP_SHIFT; |
| 761 if (next > start) { |
| 762 start = next; |
| 763 } |
| 764 } else { |
| 765 return TRUE; |
| 766 } |
| 767 } |
| 768 |
| 769 /* we have not found a group, which means everything is made of |
| 770 extended names. */ |
| 771 if (nameChoice == U_EXTENDED_CHAR_NAME) { |
| 772 if (limit > UCHAR_MAX_VALUE + 1) { |
| 773 limit = UCHAR_MAX_VALUE + 1; |
| 774 } |
| 775 return enumExtNames(start, limit - 1, fn, context); |
| 776 } |
| 777 |
| 778 return TRUE; |
| 779 } |
| 780 |
/*
 * Write the factorized suffix of an algorithmic name.
 * code is split into one index per factor by modulo arithmetic, and for
 * each factor the element string selected by its index is appended.
 * s points to the element strings: for each factor in order, factors[i]
 * NUL-terminated strings.
 *
 * @param factors      the count factors
 * @param count        number of factors
 * @param s            suffix elements
 * @param code         value to factorize
 * @param indexes      output: index per factor
 * @param elementBases output, can be NULL: first element string per factor
 * @param elements     output, can be NULL: selected element string per factor
 * @param buffer       output buffer, NUL-terminated if there is room
 * @param bufferLength capacity of buffer
 * @return full length of the suffix
 */
static uint16_t
writeFactorSuffix(const uint16_t *factors, uint16_t count,
                  const char *s, /* suffix elements */
                  uint32_t code,
                  uint16_t indexes[8], /* output fields from here */
                  const char *elementBases[8], const char *elements[8],
                  char *buffer, uint16_t bufferLength) {
    const uint16_t last=(uint16_t)(count-1);
    uint16_t i, skip, bufferPos=0;
    char c;

    /*
     * factorize from the last factor down to the second one;
     * what remains of code is the index for the first factor
     */
    for(i=last; i>0; --i) {
        const uint16_t factor=factors[i];
        indexes[i]=(uint16_t)(code%factor);
        code/=factor;
    }
    indexes[0]=(uint16_t)code;

    /* write one element per factor */
    for(i=0;; ++i) {
        if(elementBases!=NULL) {
            elementBases[i]=s;
        }

        /* skip indexes[i] strings to get to the selected one */
        for(skip=indexes[i]; skip>0; --skip) {
            while(*s++!=0) {}
        }
        if(elements!=NULL) {
            elements[i]=s;
        }

        /* write element */
        while((c=*s++)!=0) {
            WRITE_CHAR(buffer, bufferLength, bufferPos, c);
        }

        /* nothing follows the last factor's element */
        if(i>=last) {
            break;
        }

        /* skip the rest of the strings for this factors[i] */
        for(skip=(uint16_t)(factors[i]-indexes[i]-1); skip>0; --skip) {
            while(*s++!=0) {}
        }
    }

    /* zero-terminate */
    if(bufferLength>0) {
        *buffer=0;
    }

    return bufferPos;
}
| 854 |
| 855 /* |
| 856 * Important: |
| 857 * Parts of findAlgName() are almost the same as some of getAlgName(). |
| 858 * Fixes must be applied to both. |
| 859 */ |
| 860 static uint16_t |
| 861 getAlgName(AlgorithmicRange *range, uint32_t code, UCharNameChoice nameChoice, |
| 862 char *buffer, uint16_t bufferLength) { |
| 863 uint16_t bufferPos=0; |
| 864 |
| 865 /* Only the normative character name can be algorithmic. */ |
| 866 if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) { |
| 867 /* zero-terminate */ |
| 868 if(bufferLength>0) { |
| 869 *buffer=0; |
| 870 } |
| 871 return 0; |
| 872 } |
| 873 |
| 874 switch(range->type) { |
| 875 case 0: { |
| 876 /* name = prefix hex-digits */ |
| 877 const char *s=(const char *)(range+1); |
| 878 char c; |
| 879 |
| 880 uint16_t i, count; |
| 881 |
| 882 /* copy prefix */ |
| 883 while((c=*s++)!=0) { |
| 884 WRITE_CHAR(buffer, bufferLength, bufferPos, c); |
| 885 } |
| 886 |
| 887 /* write hexadecimal code point value */ |
| 888 count=range->variant; |
| 889 |
| 890 /* zero-terminate */ |
| 891 if(count<bufferLength) { |
| 892 buffer[count]=0; |
| 893 } |
| 894 |
| 895 for(i=count; i>0;) { |
| 896 if(--i<bufferLength) { |
| 897 c=(char)(code&0xf); |
| 898 if(c<10) { |
| 899 c+='0'; |
| 900 } else { |
| 901 c+='A'-10; |
| 902 } |
| 903 buffer[i]=c; |
| 904 } |
| 905 code>>=4; |
| 906 } |
| 907 |
| 908 bufferPos+=count; |
| 909 break; |
| 910 } |
| 911 case 1: { |
| 912 /* name = prefix factorized-elements */ |
| 913 uint16_t indexes[8]; |
| 914 const uint16_t *factors=(const uint16_t *)(range+1); |
| 915 uint16_t count=range->variant; |
| 916 const char *s=(const char *)(factors+count); |
| 917 char c; |
| 918 |
| 919 /* copy prefix */ |
| 920 while((c=*s++)!=0) { |
| 921 WRITE_CHAR(buffer, bufferLength, bufferPos, c); |
| 922 } |
| 923 |
| 924 bufferPos+=writeFactorSuffix(factors, count, |
| 925 s, code-range->start, indexes, NULL, NULL,
buffer, bufferLength); |
| 926 break; |
| 927 } |
| 928 default: |
| 929 /* undefined type */ |
| 930 /* zero-terminate */ |
| 931 if(bufferLength>0) { |
| 932 *buffer=0; |
| 933 } |
| 934 break; |
| 935 } |
| 936 |
| 937 return bufferPos; |
| 938 } |
| 939 |
| 940 /* |
| 941 * Important: enumAlgNames() and findAlgName() are almost the same. |
| 942 * Any fix must be applied to both. |
| 943 */ |
| 944 static UBool |
| 945 enumAlgNames(AlgorithmicRange *range, |
| 946 UChar32 start, UChar32 limit, |
| 947 UEnumCharNamesFn *fn, void *context, |
| 948 UCharNameChoice nameChoice) { |
| 949 char buffer[200]; |
| 950 uint16_t length; |
| 951 |
| 952 if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) { |
| 953 return TRUE; |
| 954 } |
| 955 |
| 956 switch(range->type) { |
| 957 case 0: { |
| 958 char *s, *end; |
| 959 char c; |
| 960 |
| 961 /* get the full name of the start character */ |
| 962 length=getAlgName(range, (uint32_t)start, nameChoice, buffer, sizeof(buf
fer)); |
| 963 if(length<=0) { |
| 964 return TRUE; |
| 965 } |
| 966 |
| 967 /* call the enumerator function with this first character */ |
| 968 if(!fn(context, start, nameChoice, buffer, length)) { |
| 969 return FALSE; |
| 970 } |
| 971 |
| 972 /* go to the end of the name; all these names have the same length */ |
| 973 end=buffer; |
| 974 while(*end!=0) { |
| 975 ++end; |
| 976 } |
| 977 |
| 978 /* enumerate the rest of the names */ |
| 979 while(++start<limit) { |
| 980 /* increment the hexadecimal number on a character-basis */ |
| 981 s=end; |
| 982 for (;;) { |
| 983 c=*--s; |
| 984 if(('0'<=c && c<'9') || ('A'<=c && c<'F')) { |
| 985 *s=(char)(c+1); |
| 986 break; |
| 987 } else if(c=='9') { |
| 988 *s='A'; |
| 989 break; |
| 990 } else if(c=='F') { |
| 991 *s='0'; |
| 992 } |
| 993 } |
| 994 |
| 995 if(!fn(context, start, nameChoice, buffer, length)) { |
| 996 return FALSE; |
| 997 } |
| 998 } |
| 999 break; |
| 1000 } |
| 1001 case 1: { |
| 1002 uint16_t indexes[8]; |
| 1003 const char *elementBases[8], *elements[8]; |
| 1004 const uint16_t *factors=(const uint16_t *)(range+1); |
| 1005 uint16_t count=range->variant; |
| 1006 const char *s=(const char *)(factors+count); |
| 1007 char *suffix, *t; |
| 1008 uint16_t prefixLength, i, idx; |
| 1009 |
| 1010 char c; |
| 1011 |
| 1012 /* name = prefix factorized-elements */ |
| 1013 |
| 1014 /* copy prefix */ |
| 1015 suffix=buffer; |
| 1016 prefixLength=0; |
| 1017 while((c=*s++)!=0) { |
| 1018 *suffix++=c; |
| 1019 ++prefixLength; |
| 1020 } |
| 1021 |
| 1022 /* append the suffix of the start character */ |
| 1023 length=(uint16_t)(prefixLength+writeFactorSuffix(factors, count, |
| 1024 s, (uint32_t)start-range->start, |
| 1025 indexes, elementBases, elements, |
| 1026 suffix, (uint16_t)(sizeof(buffer)-
prefixLength))); |
| 1027 |
| 1028 /* call the enumerator function with this first character */ |
| 1029 if(!fn(context, start, nameChoice, buffer, length)) { |
| 1030 return FALSE; |
| 1031 } |
| 1032 |
| 1033 /* enumerate the rest of the names */ |
| 1034 while(++start<limit) { |
| 1035 /* increment the indexes in lexical order bound by the factors */ |
| 1036 i=count; |
| 1037 for (;;) { |
| 1038 idx=(uint16_t)(indexes[--i]+1); |
| 1039 if(idx<factors[i]) { |
| 1040 /* skip one index and its element string */ |
| 1041 indexes[i]=idx; |
| 1042 s=elements[i]; |
| 1043 while(*s++!=0) { |
| 1044 } |
| 1045 elements[i]=s; |
| 1046 break; |
| 1047 } else { |
| 1048 /* reset this index to 0 and its element string to the first
one */ |
| 1049 indexes[i]=0; |
| 1050 elements[i]=elementBases[i]; |
| 1051 } |
| 1052 } |
| 1053 |
| 1054 /* to make matters a little easier, just append all elements to the
suffix */ |
| 1055 t=suffix; |
| 1056 length=prefixLength; |
| 1057 for(i=0; i<count; ++i) { |
| 1058 s=elements[i]; |
| 1059 while((c=*s++)!=0) { |
| 1060 *t++=c; |
| 1061 ++length; |
| 1062 } |
| 1063 } |
| 1064 /* zero-terminate */ |
| 1065 *t=0; |
| 1066 |
| 1067 if(!fn(context, start, nameChoice, buffer, length)) { |
| 1068 return FALSE; |
| 1069 } |
| 1070 } |
| 1071 break; |
| 1072 } |
| 1073 default: |
| 1074 /* undefined type */ |
| 1075 break; |
| 1076 } |
| 1077 |
| 1078 return TRUE; |
| 1079 } |
| 1080 |
| 1081 /* |
| 1082 * findAlgName() is almost the same as enumAlgNames() except that it |
| 1083 * returns the code point for a name if it fits into the range. |
| 1084 * It returns 0xffff otherwise. |
| 1085 */ |
| 1086 static UChar32 |
| 1087 findAlgName(AlgorithmicRange *range, UCharNameChoice nameChoice, const char *oth
erName) { |
| 1088 UChar32 code; |
| 1089 |
| 1090 if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) { |
| 1091 return 0xffff; |
| 1092 } |
| 1093 |
| 1094 switch(range->type) { |
| 1095 case 0: { |
| 1096 /* name = prefix hex-digits */ |
| 1097 const char *s=(const char *)(range+1); |
| 1098 char c; |
| 1099 |
| 1100 uint16_t i, count; |
| 1101 |
| 1102 /* compare prefix */ |
| 1103 while((c=*s++)!=0) { |
| 1104 if((char)c!=*otherName++) { |
| 1105 return 0xffff; |
| 1106 } |
| 1107 } |
| 1108 |
| 1109 /* read hexadecimal code point value */ |
| 1110 count=range->variant; |
| 1111 code=0; |
| 1112 for(i=0; i<count; ++i) { |
| 1113 c=*otherName++; |
| 1114 if('0'<=c && c<='9') { |
| 1115 code=(code<<4)|(c-'0'); |
| 1116 } else if('A'<=c && c<='F') { |
| 1117 code=(code<<4)|(c-'A'+10); |
| 1118 } else { |
| 1119 return 0xffff; |
| 1120 } |
| 1121 } |
| 1122 |
| 1123 /* does it fit into the range? */ |
| 1124 if(*otherName==0 && range->start<=(uint32_t)code && (uint32_t)code<=rang
e->end) { |
| 1125 return code; |
| 1126 } |
| 1127 break; |
| 1128 } |
| 1129 case 1: { |
| 1130 char buffer[64]; |
| 1131 uint16_t indexes[8]; |
| 1132 const char *elementBases[8], *elements[8]; |
| 1133 const uint16_t *factors=(const uint16_t *)(range+1); |
| 1134 uint16_t count=range->variant; |
| 1135 const char *s=(const char *)(factors+count), *t; |
| 1136 UChar32 start, limit; |
| 1137 uint16_t i, idx; |
| 1138 |
| 1139 char c; |
| 1140 |
| 1141 /* name = prefix factorized-elements */ |
| 1142 |
| 1143 /* compare prefix */ |
| 1144 while((c=*s++)!=0) { |
| 1145 if((char)c!=*otherName++) { |
| 1146 return 0xffff; |
| 1147 } |
| 1148 } |
| 1149 |
| 1150 start=(UChar32)range->start; |
| 1151 limit=(UChar32)(range->end+1); |
| 1152 |
| 1153 /* initialize the suffix elements for enumeration; indexes should all be
set to 0 */ |
| 1154 writeFactorSuffix(factors, count, s, 0, |
| 1155 indexes, elementBases, elements, buffer, sizeof(buffer
)); |
| 1156 |
| 1157 /* compare the first suffix */ |
| 1158 if(0==uprv_strcmp(otherName, buffer)) { |
| 1159 return start; |
| 1160 } |
| 1161 |
| 1162 /* enumerate and compare the rest of the suffixes */ |
| 1163 while(++start<limit) { |
| 1164 /* increment the indexes in lexical order bound by the factors */ |
| 1165 i=count; |
| 1166 for (;;) { |
| 1167 idx=(uint16_t)(indexes[--i]+1); |
| 1168 if(idx<factors[i]) { |
| 1169 /* skip one index and its element string */ |
| 1170 indexes[i]=idx; |
| 1171 s=elements[i]; |
| 1172 while(*s++!=0) {} |
| 1173 elements[i]=s; |
| 1174 break; |
| 1175 } else { |
| 1176 /* reset this index to 0 and its element string to the first
one */ |
| 1177 indexes[i]=0; |
| 1178 elements[i]=elementBases[i]; |
| 1179 } |
| 1180 } |
| 1181 |
| 1182 /* to make matters a little easier, just compare all elements of the
suffix */ |
| 1183 t=otherName; |
| 1184 for(i=0; i<count; ++i) { |
| 1185 s=elements[i]; |
| 1186 while((c=*s++)!=0) { |
| 1187 if(c!=*t++) { |
| 1188 s=""; /* does not match */ |
| 1189 i=99; |
| 1190 } |
| 1191 } |
| 1192 } |
| 1193 if(i<99 && *t==0) { |
| 1194 return start; |
| 1195 } |
| 1196 } |
| 1197 break; |
| 1198 } |
| 1199 default: |
| 1200 /* undefined type */ |
| 1201 break; |
| 1202 } |
| 1203 |
| 1204 return 0xffff; |
| 1205 } |
| 1206 |
| 1207 /* sets of name characters, maximum name lengths ---------------------------- */ |
| 1208 |
/*
 * A set of char values is an array of 8 uint32_t, i.e. 256 bit flags:
 * bits 7..5 of the char select the array element, bits 4..0 the bit in it.
 *
 * The argument c is parenthesized so that an expression such as a|b is
 * converted to uint8_t as a whole before it is shifted or masked.
 * Note that both macros evaluate c twice; do not pass expressions with
 * side effects.
 */
#define SET_ADD(set, c) ((set)[(uint8_t)(c)>>5]|=((uint32_t)1<<((uint8_t)(c)&0x1f)))
#define SET_CONTAINS(set, c) (((set)[(uint8_t)(c)>>5]&((uint32_t)1<<((uint8_t)(c)&0x1f)))!=0)
| 1211 |
/*
 * Adds every character of the NUL-terminated string s to the char set
 * and returns the length of s (not counting the NUL).
 */
static int32_t
calcStringSetLength(uint32_t set[8], const char *s) {
    const char *p;
    char ch;

    for(p=s; (ch=*p)!=0; ++p) {
        SET_ADD(set, ch);
    }
    return (int32_t)(p-s);
}
| 1223 |
| 1224 static int32_t |
| 1225 calcAlgNameSetsLengths(int32_t maxNameLength) { |
| 1226 AlgorithmicRange *range; |
| 1227 uint32_t *p; |
| 1228 uint32_t rangeCount; |
| 1229 int32_t length; |
| 1230 |
| 1231 /* enumerate algorithmic ranges */ |
| 1232 p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset); |
| 1233 rangeCount=*p; |
| 1234 range=(AlgorithmicRange *)(p+1); |
| 1235 while(rangeCount>0) { |
| 1236 switch(range->type) { |
| 1237 case 0: |
| 1238 /* name = prefix + (range->variant times) hex-digits */ |
| 1239 /* prefix */ |
| 1240 length=calcStringSetLength(gNameSet, (const char *)(range+1))+range-
>variant; |
| 1241 if(length>maxNameLength) { |
| 1242 maxNameLength=length; |
| 1243 } |
| 1244 break; |
| 1245 case 1: { |
| 1246 /* name = prefix factorized-elements */ |
| 1247 const uint16_t *factors=(const uint16_t *)(range+1); |
| 1248 const char *s; |
| 1249 int32_t i, count=range->variant, factor, factorLength, maxFactorLeng
th; |
| 1250 |
| 1251 /* prefix length */ |
| 1252 s=(const char *)(factors+count); |
| 1253 length=calcStringSetLength(gNameSet, s); |
| 1254 s+=length+1; /* start of factor suffixes */ |
| 1255 |
| 1256 /* get the set and maximum factor suffix length for each factor */ |
| 1257 for(i=0; i<count; ++i) { |
| 1258 maxFactorLength=0; |
| 1259 for(factor=factors[i]; factor>0; --factor) { |
| 1260 factorLength=calcStringSetLength(gNameSet, s); |
| 1261 s+=factorLength+1; |
| 1262 if(factorLength>maxFactorLength) { |
| 1263 maxFactorLength=factorLength; |
| 1264 } |
| 1265 } |
| 1266 length+=maxFactorLength; |
| 1267 } |
| 1268 |
| 1269 if(length>maxNameLength) { |
| 1270 maxNameLength=length; |
| 1271 } |
| 1272 break; |
| 1273 } |
| 1274 default: |
| 1275 /* unknown type */ |
| 1276 break; |
| 1277 } |
| 1278 |
| 1279 range=(AlgorithmicRange *)((uint8_t *)range+range->size); |
| 1280 --rangeCount; |
| 1281 } |
| 1282 return maxNameLength; |
| 1283 } |
| 1284 |
| 1285 static int32_t |
| 1286 calcExtNameSetsLengths(int32_t maxNameLength) { |
| 1287 int32_t i, length; |
| 1288 |
| 1289 for(i=0; i<LENGTHOF(charCatNames); ++i) { |
| 1290 /* |
| 1291 * for each category, count the length of the category name |
| 1292 * plus 9= |
| 1293 * 2 for <> |
| 1294 * 1 for - |
| 1295 * 6 for most hex digits per code point |
| 1296 */ |
| 1297 length=9+calcStringSetLength(gNameSet, charCatNames[i]); |
| 1298 if(length>maxNameLength) { |
| 1299 maxNameLength=length; |
| 1300 } |
| 1301 } |
| 1302 return maxNameLength; |
| 1303 } |
| 1304 |
/*
 * Scans one compressed name field, starting at *pLine and ending at the next
 * ';' byte or at lineLimit, adds all of its characters to set, and returns
 * the length of the expanded name.
 *
 * On return *pLine points behind the consumed ';' or equals lineLimit.
 *
 * tokenLengths may be NULL. Otherwise it caches the expanded length per token
 * index, with 0 meaning "not computed yet".
 */
static int32_t
calcNameSetLength(const uint16_t *tokens, uint16_t tokenCount, const uint8_t *tokenStrings, int8_t *tokenLengths,
                  uint32_t set[8],
                  const uint8_t **pLine, const uint8_t *lineLimit) {
    const uint8_t *line=*pLine;
    int32_t total=0, wordLength;
    uint16_t b, token;

    while(line!=lineLimit) {
        b=*line++;
        if(b==(uint8_t)';') {
            break;      /* end of this field */
        }

        if(b<tokenCount) {
            token=tokens[b];
            if(token==(uint16_t)(-2)) {
                /* lead byte of a double-byte token: combine it with the trail byte */
                b=b<<8|*line++;
                token=tokens[b];
            }
            if(token!=(uint16_t)(-1)) {
                /* a token word: use the cached length if there is one */
                if(tokenLengths==NULL) {
                    wordLength=calcStringSetLength(set, (const char *)tokenStrings+token);
                } else {
                    wordLength=tokenLengths[b];
                    if(wordLength==0) {
                        wordLength=calcStringSetLength(set, (const char *)tokenStrings+token);
                        tokenLengths[b]=(int8_t)wordLength;
                    }
                }
                total+=wordLength;
                continue;
            }
        }

        /* implicit (b>=tokenCount) or explicit (token==-1) letter */
        SET_ADD(set, b);
        ++total;
    }

    *pLine=line;
    return total;
}
| 1349 |
| 1350 static void |
| 1351 calcGroupNameSetsLengths(int32_t maxNameLength) { |
| 1352 uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2]; |
| 1353 |
| 1354 uint16_t *tokens=(uint16_t *)uCharNames+8; |
| 1355 uint16_t tokenCount=*tokens++; |
| 1356 uint8_t *tokenStrings=(uint8_t *)uCharNames+uCharNames->tokenStringOffset; |
| 1357 |
| 1358 int8_t *tokenLengths; |
| 1359 |
| 1360 const uint16_t *group; |
| 1361 const uint8_t *s, *line, *lineLimit; |
| 1362 |
| 1363 int32_t groupCount, lineNumber, length; |
| 1364 |
| 1365 tokenLengths=(int8_t *)uprv_malloc(tokenCount); |
| 1366 if(tokenLengths!=NULL) { |
| 1367 uprv_memset(tokenLengths, 0, tokenCount); |
| 1368 } |
| 1369 |
| 1370 group=GET_GROUPS(uCharNames); |
| 1371 groupCount=*group++; |
| 1372 |
| 1373 /* enumerate all groups */ |
| 1374 while(groupCount>0) { |
| 1375 s=(uint8_t *)uCharNames+uCharNames->groupStringOffset+GET_GROUP_OFFSET(g
roup); |
| 1376 s=expandGroupLengths(s, offsets, lengths); |
| 1377 |
| 1378 /* enumerate all lines in each group */ |
| 1379 for(lineNumber=0; lineNumber<LINES_PER_GROUP; ++lineNumber) { |
| 1380 line=s+offsets[lineNumber]; |
| 1381 length=lengths[lineNumber]; |
| 1382 if(length==0) { |
| 1383 continue; |
| 1384 } |
| 1385 |
| 1386 lineLimit=line+length; |
| 1387 |
| 1388 /* read regular name */ |
| 1389 length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLeng
ths, gNameSet, &line, lineLimit); |
| 1390 if(length>maxNameLength) { |
| 1391 maxNameLength=length; |
| 1392 } |
| 1393 if(line==lineLimit) { |
| 1394 continue; |
| 1395 } |
| 1396 |
| 1397 /* read Unicode 1.0 name */ |
| 1398 length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLeng
ths, gNameSet, &line, lineLimit); |
| 1399 if(length>maxNameLength) { |
| 1400 maxNameLength=length; |
| 1401 } |
| 1402 if(line==lineLimit) { |
| 1403 continue; |
| 1404 } |
| 1405 |
| 1406 /* read ISO comment */ |
| 1407 /*length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLe
ngths, gISOCommentSet, &line, lineLimit);*/ |
| 1408 } |
| 1409 |
| 1410 group=NEXT_GROUP(group); |
| 1411 --groupCount; |
| 1412 } |
| 1413 |
| 1414 if(tokenLengths!=NULL) { |
| 1415 uprv_free(tokenLengths); |
| 1416 } |
| 1417 |
| 1418 /* set gMax... - name length last for threading */ |
| 1419 gMaxNameLength=maxNameLength; |
| 1420 } |
| 1421 |
| 1422 static UBool |
| 1423 calcNameSetsLengths(UErrorCode *pErrorCode) { |
| 1424 static const char extChars[]="0123456789ABCDEF<>-"; |
| 1425 int32_t i, maxNameLength; |
| 1426 |
| 1427 if(gMaxNameLength!=0) { |
| 1428 return TRUE; |
| 1429 } |
| 1430 |
| 1431 if(!isDataLoaded(pErrorCode)) { |
| 1432 return FALSE; |
| 1433 } |
| 1434 |
| 1435 /* set hex digits, used in various names, and <>-, used in extended names */ |
| 1436 for(i=0; i<sizeof(extChars)-1; ++i) { |
| 1437 SET_ADD(gNameSet, extChars[i]); |
| 1438 } |
| 1439 |
| 1440 /* set sets and lengths from algorithmic names */ |
| 1441 maxNameLength=calcAlgNameSetsLengths(0); |
| 1442 |
| 1443 /* set sets and lengths from extended names */ |
| 1444 maxNameLength=calcExtNameSetsLengths(maxNameLength); |
| 1445 |
| 1446 /* set sets and lengths from group names, set global maximum values */ |
| 1447 calcGroupNameSetsLengths(maxNameLength); |
| 1448 |
| 1449 return TRUE; |
| 1450 } |
| 1451 |
| 1452 /* public API --------------------------------------------------------------- */ |
| 1453 |
| 1454 U_CAPI int32_t U_EXPORT2 |
| 1455 u_charName(UChar32 code, UCharNameChoice nameChoice, |
| 1456 char *buffer, int32_t bufferLength, |
| 1457 UErrorCode *pErrorCode) { |
| 1458 AlgorithmicRange *algRange; |
| 1459 uint32_t *p; |
| 1460 uint32_t i; |
| 1461 int32_t length; |
| 1462 |
| 1463 /* check the argument values */ |
| 1464 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { |
| 1465 return 0; |
| 1466 } else if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || |
| 1467 bufferLength<0 || (bufferLength>0 && buffer==NULL) |
| 1468 ) { |
| 1469 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| 1470 return 0; |
| 1471 } |
| 1472 |
| 1473 if((uint32_t)code>UCHAR_MAX_VALUE || !isDataLoaded(pErrorCode)) { |
| 1474 return u_terminateChars(buffer, bufferLength, 0, pErrorCode); |
| 1475 } |
| 1476 |
| 1477 length=0; |
| 1478 |
| 1479 /* try algorithmic names first */ |
| 1480 p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset); |
| 1481 i=*p; |
| 1482 algRange=(AlgorithmicRange *)(p+1); |
| 1483 while(i>0) { |
| 1484 if(algRange->start<=(uint32_t)code && (uint32_t)code<=algRange->end) { |
| 1485 length=getAlgName(algRange, (uint32_t)code, nameChoice, buffer, (uin
t16_t)bufferLength); |
| 1486 break; |
| 1487 } |
| 1488 algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size); |
| 1489 --i; |
| 1490 } |
| 1491 |
| 1492 if(i==0) { |
| 1493 if (nameChoice == U_EXTENDED_CHAR_NAME) { |
| 1494 length = getName(uCharNames, (uint32_t )code, U_EXTENDED_CHAR_NAME,
buffer, (uint16_t) bufferLength); |
| 1495 if (!length) { |
| 1496 /* extended character name */ |
| 1497 length = getExtName((uint32_t) code, buffer, (uint16_t) bufferLe
ngth); |
| 1498 } |
| 1499 } else { |
| 1500 /* normal character name */ |
| 1501 length=getName(uCharNames, (uint32_t)code, nameChoice, buffer, (uint
16_t)bufferLength); |
| 1502 } |
| 1503 } |
| 1504 |
| 1505 return u_terminateChars(buffer, bufferLength, length, pErrorCode); |
| 1506 } |
| 1507 |
| 1508 U_CAPI int32_t U_EXPORT2 |
| 1509 u_getISOComment(UChar32 c, |
| 1510 char *dest, int32_t destCapacity, |
| 1511 UErrorCode *pErrorCode) { |
| 1512 int32_t length; |
| 1513 |
| 1514 /* check the argument values */ |
| 1515 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { |
| 1516 return 0; |
| 1517 } else if(destCapacity<0 || (destCapacity>0 && dest==NULL)) { |
| 1518 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| 1519 return 0; |
| 1520 } |
| 1521 |
| 1522 if((uint32_t)c>UCHAR_MAX_VALUE || !isDataLoaded(pErrorCode)) { |
| 1523 return u_terminateChars(dest, destCapacity, 0, pErrorCode); |
| 1524 } |
| 1525 |
| 1526 /* the ISO comment is stored like a normal character name */ |
| 1527 length=getName(uCharNames, (uint32_t)c, U_ISO_COMMENT, dest, (uint16_t)destC
apacity); |
| 1528 return u_terminateChars(dest, destCapacity, length, pErrorCode); |
| 1529 } |
| 1530 |
| 1531 U_CAPI UChar32 U_EXPORT2 |
| 1532 u_charFromName(UCharNameChoice nameChoice, |
| 1533 const char *name, |
| 1534 UErrorCode *pErrorCode) { |
| 1535 char upper[120], lower[120]; |
| 1536 FindName findName; |
| 1537 AlgorithmicRange *algRange; |
| 1538 uint32_t *p; |
| 1539 uint32_t i; |
| 1540 UChar32 cp = 0; |
| 1541 char c0; |
| 1542 UChar32 error = 0xffff; /* Undefined, but use this for backwards compati
bility. */ |
| 1543 |
| 1544 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { |
| 1545 return error; |
| 1546 } |
| 1547 |
| 1548 if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || name==NULL || *name==0) { |
| 1549 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| 1550 return error; |
| 1551 } |
| 1552 |
| 1553 if(!isDataLoaded(pErrorCode)) { |
| 1554 return error; |
| 1555 } |
| 1556 |
| 1557 /* construct the uppercase and lowercase of the name first */ |
| 1558 for(i=0; i<sizeof(upper); ++i) { |
| 1559 if((c0=*name++)!=0) { |
| 1560 upper[i]=uprv_toupper(c0); |
| 1561 lower[i]=uprv_tolower(c0); |
| 1562 } else { |
| 1563 upper[i]=lower[i]=0; |
| 1564 break; |
| 1565 } |
| 1566 } |
| 1567 if(i==sizeof(upper)) { |
| 1568 /* name too long, there is no such character */ |
| 1569 *pErrorCode = U_ILLEGAL_CHAR_FOUND; |
| 1570 return error; |
| 1571 } |
| 1572 |
| 1573 /* try extended names first */ |
| 1574 if (lower[0] == '<') { |
| 1575 if (nameChoice == U_EXTENDED_CHAR_NAME) { |
| 1576 if (lower[--i] == '>') { |
| 1577 for (--i; lower[i] && lower[i] != '-'; --i) { |
| 1578 } |
| 1579 |
| 1580 if (lower[i] == '-') { /* We've got a category. */ |
| 1581 uint32_t cIdx; |
| 1582 |
| 1583 lower[i] = 0; |
| 1584 |
| 1585 for (++i; lower[i] != '>'; ++i) { |
| 1586 if (lower[i] >= '0' && lower[i] <= '9') { |
| 1587 cp = (cp << 4) + lower[i] - '0'; |
| 1588 } else if (lower[i] >= 'a' && lower[i] <= 'f') { |
| 1589 cp = (cp << 4) + lower[i] - 'a' + 10; |
| 1590 } else { |
| 1591 *pErrorCode = U_ILLEGAL_CHAR_FOUND; |
| 1592 return error; |
| 1593 } |
| 1594 } |
| 1595 |
| 1596 /* Now validate the category name. |
| 1597 We could use a binary search, or a trie, if |
| 1598 we really wanted to. */ |
| 1599 |
| 1600 for (lower[i] = 0, cIdx = 0; cIdx < LENGTHOF(charCatNames);
++cIdx) { |
| 1601 |
| 1602 if (!uprv_strcmp(lower + 1, charCatNames[cIdx])) { |
| 1603 if (getCharCat(cp) == cIdx) { |
| 1604 return cp; |
| 1605 } |
| 1606 break; |
| 1607 } |
| 1608 } |
| 1609 } |
| 1610 } |
| 1611 } |
| 1612 |
| 1613 *pErrorCode = U_ILLEGAL_CHAR_FOUND; |
| 1614 return error; |
| 1615 } |
| 1616 |
| 1617 /* try algorithmic names now */ |
| 1618 p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset); |
| 1619 i=*p; |
| 1620 algRange=(AlgorithmicRange *)(p+1); |
| 1621 while(i>0) { |
| 1622 if((cp=findAlgName(algRange, nameChoice, upper))!=0xffff) { |
| 1623 return cp; |
| 1624 } |
| 1625 algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size); |
| 1626 --i; |
| 1627 } |
| 1628 |
| 1629 /* normal character name */ |
| 1630 findName.otherName=upper; |
| 1631 findName.code=error; |
| 1632 enumNames(uCharNames, 0, UCHAR_MAX_VALUE + 1, DO_FIND_NAME, &findName, nameC
hoice); |
| 1633 if (findName.code == error) { |
| 1634 *pErrorCode = U_ILLEGAL_CHAR_FOUND; |
| 1635 } |
| 1636 return findName.code; |
| 1637 } |
| 1638 |
| 1639 U_CAPI void U_EXPORT2 |
| 1640 u_enumCharNames(UChar32 start, UChar32 limit, |
| 1641 UEnumCharNamesFn *fn, |
| 1642 void *context, |
| 1643 UCharNameChoice nameChoice, |
| 1644 UErrorCode *pErrorCode) { |
| 1645 AlgorithmicRange *algRange; |
| 1646 uint32_t *p; |
| 1647 uint32_t i; |
| 1648 |
| 1649 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { |
| 1650 return; |
| 1651 } |
| 1652 |
| 1653 if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || fn==NULL) { |
| 1654 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| 1655 return; |
| 1656 } |
| 1657 |
| 1658 if((uint32_t) limit > UCHAR_MAX_VALUE + 1) { |
| 1659 limit = UCHAR_MAX_VALUE + 1; |
| 1660 } |
| 1661 if((uint32_t)start>=(uint32_t)limit) { |
| 1662 return; |
| 1663 } |
| 1664 |
| 1665 if(!isDataLoaded(pErrorCode)) { |
| 1666 return; |
| 1667 } |
| 1668 |
| 1669 /* interleave the data-driven ones with the algorithmic ones */ |
| 1670 /* iterate over all algorithmic ranges; assume that they are in ascending or
der */ |
| 1671 p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset); |
| 1672 i=*p; |
| 1673 algRange=(AlgorithmicRange *)(p+1); |
| 1674 while(i>0) { |
| 1675 /* enumerate the character names before the current algorithmic range */ |
| 1676 /* here: start<limit */ |
| 1677 if((uint32_t)start<algRange->start) { |
| 1678 if((uint32_t)limit<=algRange->start) { |
| 1679 enumNames(uCharNames, start, limit, fn, context, nameChoice); |
| 1680 return; |
| 1681 } |
| 1682 if(!enumNames(uCharNames, start, (UChar32)algRange->start, fn, conte
xt, nameChoice)) { |
| 1683 return; |
| 1684 } |
| 1685 start=(UChar32)algRange->start; |
| 1686 } |
| 1687 /* enumerate the character names in the current algorithmic range */ |
| 1688 /* here: algRange->start<=start<limit */ |
| 1689 if((uint32_t)start<=algRange->end) { |
| 1690 if((uint32_t)limit<=(algRange->end+1)) { |
| 1691 enumAlgNames(algRange, start, limit, fn, context, nameChoice); |
| 1692 return; |
| 1693 } |
| 1694 if(!enumAlgNames(algRange, start, (UChar32)algRange->end+1, fn, cont
ext, nameChoice)) { |
| 1695 return; |
| 1696 } |
| 1697 start=(UChar32)algRange->end+1; |
| 1698 } |
| 1699 /* continue to the next algorithmic range (here: start<limit) */ |
| 1700 algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size); |
| 1701 --i; |
| 1702 } |
| 1703 /* enumerate the character names after the last algorithmic range */ |
| 1704 enumNames(uCharNames, start, limit, fn, context, nameChoice); |
| 1705 } |
| 1706 |
| 1707 U_CAPI int32_t U_EXPORT2 |
| 1708 uprv_getMaxCharNameLength() { |
| 1709 UErrorCode errorCode=U_ZERO_ERROR; |
| 1710 if(calcNameSetsLengths(&errorCode)) { |
| 1711 return gMaxNameLength; |
| 1712 } else { |
| 1713 return 0; |
| 1714 } |
| 1715 } |
| 1716 |
| 1717 /** |
| 1718 * Converts the char set cset into a Unicode set uset. |
| 1719 * @param cset Set of 256 bit flags corresponding to a set of chars. |
| 1720 * @param uset USet to receive characters. Existing contents are deleted. |
| 1721 */ |
| 1722 static void |
| 1723 charSetToUSet(uint32_t cset[8], const USetAdder *sa) { |
| 1724 UChar us[256]; |
| 1725 char cs[256]; |
| 1726 |
| 1727 int32_t i, length; |
| 1728 UErrorCode errorCode; |
| 1729 |
| 1730 errorCode=U_ZERO_ERROR; |
| 1731 |
| 1732 if(!calcNameSetsLengths(&errorCode)) { |
| 1733 return; |
| 1734 } |
| 1735 |
| 1736 /* build a char string with all chars that are used in character names */ |
| 1737 length=0; |
| 1738 for(i=0; i<256; ++i) { |
| 1739 if(SET_CONTAINS(cset, i)) { |
| 1740 cs[length++]=(char)i; |
| 1741 } |
| 1742 } |
| 1743 |
| 1744 /* convert the char string to a UChar string */ |
| 1745 u_charsToUChars(cs, us, length); |
| 1746 |
| 1747 /* add each UChar to the USet */ |
| 1748 for(i=0; i<length; ++i) { |
| 1749 if(us[i]!=0 || cs[i]==0) { /* non-invariant chars become (UChar)0 */ |
| 1750 sa->add(sa->set, us[i]); |
| 1751 } |
| 1752 } |
| 1753 } |
| 1754 |
| 1755 /** |
| 1756 * Fills set with characters that are used in Unicode character names. |
| 1757 * @param set USet to receive characters. |
| 1758 */ |
| 1759 U_CAPI void U_EXPORT2 |
| 1760 uprv_getCharNameCharacters(const USetAdder *sa) { |
| 1761 charSetToUSet(gNameSet, sa); |
| 1762 } |
| 1763 |
| 1764 /* data swapping ------------------------------------------------------------ */ |
| 1765 |
| 1766 /* |
| 1767 * The token table contains non-negative entries for token bytes, |
| 1768 * and -1 for bytes that represent themselves in the data file's charset. |
| 1769 * -2 entries are used for lead bytes. |
| 1770 * |
| 1771 * Direct bytes (-1 entries) must be translated from the input charset family |
| 1772 * to the output charset family. |
| 1773 * makeTokenMap() writes a permutation mapping for this. |
| 1774 * Use it once for single-/lead-byte tokens and once more for all trail byte |
| 1775 * tokens. (';' is an unused trail byte marked with -1.) |
| 1776 */ |
| 1777 static void |
| 1778 makeTokenMap(const UDataSwapper *ds, |
| 1779 int16_t tokens[], uint16_t tokenCount, |
| 1780 uint8_t map[256], |
| 1781 UErrorCode *pErrorCode) { |
| 1782 UBool usedOutChar[256]; |
| 1783 uint16_t i, j; |
| 1784 uint8_t c1, c2; |
| 1785 |
| 1786 if(U_FAILURE(*pErrorCode)) { |
| 1787 return; |
| 1788 } |
| 1789 |
| 1790 if(ds->inCharset==ds->outCharset) { |
| 1791 /* Same charset family: identity permutation */ |
| 1792 for(i=0; i<256; ++i) { |
| 1793 map[i]=(uint8_t)i; |
| 1794 } |
| 1795 } else { |
| 1796 uprv_memset(map, 0, 256); |
| 1797 uprv_memset(usedOutChar, 0, 256); |
| 1798 |
| 1799 if(tokenCount>256) { |
| 1800 tokenCount=256; |
| 1801 } |
| 1802 |
| 1803 /* set the direct bytes (byte 0 always maps to itself) */ |
| 1804 for(i=1; i<tokenCount; ++i) { |
| 1805 if(tokens[i]==-1) { |
| 1806 /* convert the direct byte character */ |
| 1807 c1=(uint8_t)i; |
| 1808 ds->swapInvChars(ds, &c1, 1, &c2, pErrorCode); |
| 1809 if(U_FAILURE(*pErrorCode)) { |
| 1810 udata_printError(ds, "unames/makeTokenMap() finds variant ch
aracter 0x%02x used (input charset family %d)\n", |
| 1811 i, ds->inCharset); |
| 1812 return; |
| 1813 } |
| 1814 |
| 1815 /* enter the converted character into the map and mark it used *
/ |
| 1816 map[c1]=c2; |
| 1817 usedOutChar[c2]=TRUE; |
| 1818 } |
| 1819 } |
| 1820 |
| 1821 /* set the mappings for the rest of the permutation */ |
| 1822 for(i=j=1; i<tokenCount; ++i) { |
| 1823 /* set mappings that were not set for direct bytes */ |
| 1824 if(map[i]==0) { |
| 1825 /* set an output byte value that was not used as an output byte
above */ |
| 1826 while(usedOutChar[j]) { |
| 1827 ++j; |
| 1828 } |
| 1829 map[i]=(uint8_t)j++; |
| 1830 } |
| 1831 } |
| 1832 |
| 1833 /* |
| 1834 * leave mappings at tokenCount and above unset if tokenCount<256 |
| 1835 * because they won't be used |
| 1836 */ |
| 1837 } |
| 1838 } |
| 1839 |
| 1840 U_CAPI int32_t U_EXPORT2 |
| 1841 uchar_swapNames(const UDataSwapper *ds, |
| 1842 const void *inData, int32_t length, void *outData, |
| 1843 UErrorCode *pErrorCode) { |
| 1844 const UDataInfo *pInfo; |
| 1845 int32_t headerSize; |
| 1846 |
| 1847 const uint8_t *inBytes; |
| 1848 uint8_t *outBytes; |
| 1849 |
| 1850 uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset, |
| 1851 offset, i, count, stringsCount; |
| 1852 |
| 1853 const AlgorithmicRange *inRange; |
| 1854 AlgorithmicRange *outRange; |
| 1855 |
| 1856 /* udata_swapDataHeader checks the arguments */ |
| 1857 headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode); |
| 1858 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { |
| 1859 return 0; |
| 1860 } |
| 1861 |
| 1862 /* check data format and format version */ |
| 1863 pInfo=(const UDataInfo *)((const char *)inData+4); |
| 1864 if(!( |
| 1865 pInfo->dataFormat[0]==0x75 && /* dataFormat="unam" */ |
| 1866 pInfo->dataFormat[1]==0x6e && |
| 1867 pInfo->dataFormat[2]==0x61 && |
| 1868 pInfo->dataFormat[3]==0x6d && |
| 1869 pInfo->formatVersion[0]==1 |
| 1870 )) { |
| 1871 udata_printError(ds, "uchar_swapNames(): data format %02x.%02x.%02x.%02x
(format version %02x) is not recognized as unames.icu\n", |
| 1872 pInfo->dataFormat[0], pInfo->dataFormat[1], |
| 1873 pInfo->dataFormat[2], pInfo->dataFormat[3], |
| 1874 pInfo->formatVersion[0]); |
| 1875 *pErrorCode=U_UNSUPPORTED_ERROR; |
| 1876 return 0; |
| 1877 } |
| 1878 |
| 1879 inBytes=(const uint8_t *)inData+headerSize; |
| 1880 outBytes=(uint8_t *)outData+headerSize; |
| 1881 if(length<0) { |
| 1882 algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]); |
| 1883 } else { |
| 1884 length-=headerSize; |
| 1885 if( length<20 || |
| 1886 (uint32_t)length<(algNamesOffset=ds->readUInt32(((const uint32_t *)i
nBytes)[3])) |
| 1887 ) { |
| 1888 udata_printError(ds, "uchar_swapNames(): too few bytes (%d after hea
der) for unames.icu\n", |
| 1889 length); |
| 1890 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
| 1891 return 0; |
| 1892 } |
| 1893 } |
| 1894 |
| 1895 if(length<0) { |
| 1896 /* preflighting: iterate through algorithmic ranges */ |
| 1897 offset=algNamesOffset; |
| 1898 count=ds->readUInt32(*((const uint32_t *)(inBytes+offset))); |
| 1899 offset+=4; |
| 1900 |
| 1901 for(i=0; i<count; ++i) { |
| 1902 inRange=(const AlgorithmicRange *)(inBytes+offset); |
| 1903 offset+=ds->readUInt16(inRange->size); |
| 1904 } |
| 1905 } else { |
| 1906 /* swap data */ |
| 1907 const uint16_t *p; |
| 1908 uint16_t *q, *temp; |
| 1909 |
| 1910 int16_t tokens[512]; |
| 1911 uint16_t tokenCount; |
| 1912 |
| 1913 uint8_t map[256], trailMap[256]; |
| 1914 |
| 1915 /* copy the data for inaccessible bytes */ |
| 1916 if(inBytes!=outBytes) { |
| 1917 uprv_memcpy(outBytes, inBytes, length); |
| 1918 } |
| 1919 |
| 1920 /* the initial 4 offsets first */ |
| 1921 tokenStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[0]); |
| 1922 groupsOffset=ds->readUInt32(((const uint32_t *)inBytes)[1]); |
| 1923 groupStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[2]); |
| 1924 ds->swapArray32(ds, inBytes, 16, outBytes, pErrorCode); |
| 1925 |
| 1926 /* |
| 1927 * now the tokens table |
| 1928 * it needs to be permutated along with the compressed name strings |
| 1929 */ |
| 1930 p=(const uint16_t *)(inBytes+16); |
| 1931 q=(uint16_t *)(outBytes+16); |
| 1932 |
| 1933 /* read and swap the tokenCount */ |
| 1934 tokenCount=ds->readUInt16(*p); |
| 1935 ds->swapArray16(ds, p, 2, q, pErrorCode); |
| 1936 ++p; |
| 1937 ++q; |
| 1938 |
| 1939 /* read the first 512 tokens and make the token maps */ |
| 1940 if(tokenCount<=512) { |
| 1941 count=tokenCount; |
| 1942 } else { |
| 1943 count=512; |
| 1944 } |
| 1945 for(i=0; i<count; ++i) { |
| 1946 tokens[i]=udata_readInt16(ds, p[i]); |
| 1947 } |
| 1948 for(; i<512; ++i) { |
| 1949 tokens[i]=0; /* fill the rest of the tokens array if tokenCount<512
*/ |
| 1950 } |
| 1951 makeTokenMap(ds, tokens, tokenCount, map, pErrorCode); |
| 1952 makeTokenMap(ds, tokens+256, (uint16_t)(tokenCount>256 ? tokenCount-256
: 0), trailMap, pErrorCode); |
| 1953 if(U_FAILURE(*pErrorCode)) { |
| 1954 return 0; |
| 1955 } |
| 1956 |
| 1957 /* |
| 1958 * swap and permutate the tokens |
| 1959 * go through a temporary array to support in-place swapping |
| 1960 */ |
| 1961 temp=(uint16_t *)uprv_malloc(tokenCount*2); |
| 1962 if(temp==NULL) { |
| 1963 udata_printError(ds, "out of memory swapping %u unames.icu tokens\n"
, |
| 1964 tokenCount); |
| 1965 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; |
| 1966 return 0; |
| 1967 } |
| 1968 |
| 1969 /* swap and permutate single-/lead-byte tokens */ |
| 1970 for(i=0; i<tokenCount && i<256; ++i) { |
| 1971 ds->swapArray16(ds, p+i, 2, temp+map[i], pErrorCode); |
| 1972 } |
| 1973 |
| 1974 /* swap and permutate trail-byte tokens */ |
| 1975 for(; i<tokenCount; ++i) { |
| 1976 ds->swapArray16(ds, p+i, 2, temp+(i&0xffffff00)+trailMap[i&0xff], pE
rrorCode); |
| 1977 } |
| 1978 |
| 1979 /* copy the result into the output and free the temporary array */ |
| 1980 uprv_memcpy(q, temp, tokenCount*2); |
| 1981 uprv_free(temp); |
| 1982 |
| 1983 /* |
| 1984 * swap the token strings but not a possible padding byte after |
| 1985 * the terminating NUL of the last string |
| 1986 */ |
| 1987 udata_swapInvStringBlock(ds, inBytes+tokenStringOffset, (int32_t)(groups
Offset-tokenStringOffset), |
| 1988 outBytes+tokenStringOffset, pErrorCode); |
| 1989 if(U_FAILURE(*pErrorCode)) { |
| 1990 udata_printError(ds, "uchar_swapNames(token strings) failed\n"); |
| 1991 return 0; |
| 1992 } |
| 1993 |
| 1994 /* swap the group table */ |
| 1995 count=ds->readUInt16(*((const uint16_t *)(inBytes+groupsOffset))); |
| 1996 ds->swapArray16(ds, inBytes+groupsOffset, (int32_t)((1+count*3)*2), |
| 1997 outBytes+groupsOffset, pErrorCode); |
| 1998 |
| 1999 /* |
| 2000 * swap the group strings |
| 2001 * swap the string bytes but not the nibble-encoded string lengths |
| 2002 */ |
| 2003 if(ds->inCharset!=ds->outCharset) { |
| 2004 uint16_t offsets[LINES_PER_GROUP+1], lengths[LINES_PER_GROUP+1]; |
| 2005 |
| 2006 const uint8_t *inStrings, *nextInStrings; |
| 2007 uint8_t *outStrings; |
| 2008 |
| 2009 uint8_t c; |
| 2010 |
| 2011 inStrings=inBytes+groupStringOffset; |
| 2012 outStrings=outBytes+groupStringOffset; |
| 2013 |
| 2014 stringsCount=algNamesOffset-groupStringOffset; |
| 2015 |
| 2016 /* iterate through string groups until only a few padding bytes are
left */ |
| 2017 while(stringsCount>32) { |
| 2018 nextInStrings=expandGroupLengths(inStrings, offsets, lengths); |
| 2019 |
| 2020 /* move past the length bytes */ |
| 2021 stringsCount-=(uint32_t)(nextInStrings-inStrings); |
| 2022 outStrings+=nextInStrings-inStrings; |
| 2023 inStrings=nextInStrings; |
| 2024 |
| 2025 count=offsets[31]+lengths[31]; /* total number of string bytes i
n this group */ |
| 2026 stringsCount-=count; |
| 2027 |
| 2028 /* swap the string bytes using map[] and trailMap[] */ |
| 2029 while(count>0) { |
| 2030 c=*inStrings++; |
| 2031 *outStrings++=map[c]; |
| 2032 if(tokens[c]!=-2) { |
| 2033 --count; |
| 2034 } else { |
| 2035 /* token lead byte: swap the trail byte, too */ |
| 2036 *outStrings++=trailMap[*inStrings++]; |
| 2037 count-=2; |
| 2038 } |
| 2039 } |
| 2040 } |
| 2041 } |
| 2042 |
| 2043 /* swap the algorithmic ranges */ |
| 2044 offset=algNamesOffset; |
| 2045 count=ds->readUInt32(*((const uint32_t *)(inBytes+offset))); |
| 2046 ds->swapArray32(ds, inBytes+offset, 4, outBytes+offset, pErrorCode); |
| 2047 offset+=4; |
| 2048 |
| 2049 for(i=0; i<count; ++i) { |
| 2050 if(offset>(uint32_t)length) { |
| 2051 udata_printError(ds, "uchar_swapNames(): too few bytes (%d after
header) for unames.icu algorithmic range %u\n", |
| 2052 length, i); |
| 2053 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
| 2054 return 0; |
| 2055 } |
| 2056 |
| 2057 inRange=(const AlgorithmicRange *)(inBytes+offset); |
| 2058 outRange=(AlgorithmicRange *)(outBytes+offset); |
| 2059 offset+=ds->readUInt16(inRange->size); |
| 2060 |
| 2061 ds->swapArray32(ds, inRange, 8, outRange, pErrorCode); |
| 2062 ds->swapArray16(ds, &inRange->size, 2, &outRange->size, pErrorCode); |
| 2063 switch(inRange->type) { |
| 2064 case 0: |
| 2065 /* swap prefix string */ |
| 2066 ds->swapInvChars(ds, inRange+1, (int32_t)uprv_strlen((const char
*)(inRange+1)), |
| 2067 outRange+1, pErrorCode); |
| 2068 if(U_FAILURE(*pErrorCode)) { |
| 2069 udata_printError(ds, "uchar_swapNames(prefix string of algor
ithmic range %u) failed\n", |
| 2070 i); |
| 2071 return 0; |
| 2072 } |
| 2073 break; |
| 2074 case 1: |
| 2075 { |
| 2076 /* swap factors and the prefix and factor strings */ |
| 2077 uint32_t factorsCount; |
| 2078 |
| 2079 factorsCount=inRange->variant; |
| 2080 p=(const uint16_t *)(inRange+1); |
| 2081 q=(uint16_t *)(outRange+1); |
| 2082 ds->swapArray16(ds, p, (int32_t)(factorsCount*2), q, pErrorC
ode); |
| 2083 |
| 2084 /* swap the strings, up to the last terminating NUL */ |
| 2085 p+=factorsCount; |
| 2086 q+=factorsCount; |
| 2087 stringsCount=(uint32_t)((inBytes+offset)-(const uint8_t *)p)
; |
| 2088 while(stringsCount>0 && ((const uint8_t *)p)[stringsCount-1]
!=0) { |
| 2089 --stringsCount; |
| 2090 } |
| 2091 ds->swapInvChars(ds, p, (int32_t)stringsCount, q, pErrorCode
); |
| 2092 } |
| 2093 break; |
| 2094 default: |
| 2095 udata_printError(ds, "uchar_swapNames(): unknown type %u of algo
rithmic range %u\n", |
| 2096 inRange->type, i); |
| 2097 *pErrorCode=U_UNSUPPORTED_ERROR; |
| 2098 return 0; |
| 2099 } |
| 2100 } |
| 2101 } |
| 2102 |
| 2103 return headerSize+(int32_t)offset; |
| 2104 } |
| 2105 |
| 2106 /* |
| 2107 * Hey, Emacs, please set the following: |
| 2108 * |
| 2109 * Local Variables: |
| 2110 * indent-tabs-mode: nil |
| 2111 * End: |
| 2112 * |
| 2113 */ |
OLD | NEW |