OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ********************************************************************** |
| 3 * Copyright (c) 2002-2004, International Business Machines |
| 4 * Corporation and others. All Rights Reserved. |
| 5 ********************************************************************** |
| 6 * Author: Alan Liu |
| 7 * Created: October 30 2002 |
| 8 * Since: ICU 2.4 |
| 9 ********************************************************************** |
| 10 */ |
| 11 #ifndef PROPNAME_H |
| 12 #define PROPNAME_H |
| 13 |
| 14 #include "unicode/utypes.h" |
| 15 #include "unicode/uchar.h" |
| 16 #include "udataswp.h" |
| 17 #include "uprops.h" |
| 18 |
| 19 /* |
| 20 * This header defines the in-memory layout of the property names data |
| 21 * structure representing the UCD data files PropertyAliases.txt and |
| 22 * PropertyValueAliases.txt. It is used by: |
| 23 * propname.cpp - reads data |
| 24 * genpname - creates data |
| 25 */ |
| 26 |
| 27 /* low-level char * property name comparison -------------------------------- */ |
| 28 |
| 29 U_CDECL_BEGIN |
| 30 |
| 31 /** |
| 32 * \var uprv_comparePropertyNames |
| 33 * Unicode property names and property value names are compared "loosely". |
| 34 * |
| 35 * UCD.html 4.0.1 says: |
| 36 * For all property names, property value names, and for property values for |
| 37 * Enumerated, Binary, or Catalog properties, use the following |
| 38 * loose matching rule: |
| 39 * |
| 40 * LM3. Ignore case, whitespace, underscore ('_'), and hyphens. |
| 41 * |
| 42 * This function does just that, for (char *) name strings. |
| 43 * It is almost identical to ucnv_compareNames() but also ignores |
| 44 * C0 White_Space characters (U+0009..U+000d, and U+0085 on EBCDIC). |
| 45 * |
| 46 * @internal |
| 47 */ |
| 48 |
| 49 U_CAPI int32_t U_EXPORT2 |
| 50 uprv_compareASCIIPropertyNames(const char *name1, const char *name2); |
| 51 |
| 52 U_CAPI int32_t U_EXPORT2 |
| 53 uprv_compareEBCDICPropertyNames(const char *name1, const char *name2); |
| 54 |
| 55 #if U_CHARSET_FAMILY==U_ASCII_FAMILY |
| 56 # define uprv_comparePropertyNames uprv_compareASCIIPropertyNames |
| 57 #elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY |
| 58 # define uprv_comparePropertyNames uprv_compareEBCDICPropertyNames |
| 59 #else |
| 60 # error U_CHARSET_FAMILY is not valid |
| 61 #endif |
| 62 |
| 63 U_CDECL_END |
| 64 |
| 65 /* UDataMemory structure and signatures ------------------------------------- */ |
| 66 |
/* Name and type of the pnames data item. */
#define PNAME_DATA_NAME "pnames"
#define PNAME_DATA_TYPE "icu"

/* Fields in UDataInfo: */

/*
 * PNAME_SIG[] is encoded as numeric literals for compatibility with the
 * HP compiler.  The trailing comment on each line shows the letter that
 * the byte value stands for.
 */
#define PNAME_SIG_0 ((uint8_t)0x70) /* p */
#define PNAME_SIG_1 ((uint8_t)0x6E) /* n */
#define PNAME_SIG_2 ((uint8_t)0x61) /* a */
#define PNAME_SIG_3 ((uint8_t)0x6D) /* m */

#define PNAME_FORMAT_VERSION ((int8_t)1) /* formatVersion[0] */
| 79 |
| 80 /** |
| 81 * Swap pnames.icu. See udataswp.h. |
| 82 * @internal |
| 83 */ |
| 84 U_CAPI int32_t U_EXPORT2 |
| 85 upname_swap(const UDataSwapper *ds, |
| 86 const void *inData, int32_t length, void *outData, |
| 87 UErrorCode *pErrorCode); |
| 88 |
| 89 |
| 90 #ifdef XP_CPLUSPLUS |
| 91 |
| 92 class Builder; |
| 93 |
| 94 U_NAMESPACE_BEGIN |
| 95 |
/**
 * Location of a contained entity, expressed as an offset from the start
 * of the pnames data.
 *
 * The type must be signed, because negative offsets serve as an
 * end-of-list marker.  An offset to an actual object is never zero; a
 * zero offset stands for an absent entry, which corresponds to an alias
 * marked "n/a" in the original Unicode data files.
 */
typedef int16_t Offset; /* must be signed */

/* Largest value representable in an Offset. */
#define MAX_OFFSET 0x7FFF
| 106 |
/**
 * Generic carrier for a property or a property value.  Usually this is
 * an enum from uchar.h, although non-enum values occur as well.  The
 * type must be wide enough to accommodate the largest enum value, which
 * as of this writing is the largest general category mask.  Signedness
 * is not required and typically does not matter, because callers cast
 * the value to the proper type before using it.  The special value
 * UCHAR_INVALID_CODE is used for invalid input.
 */
typedef int32_t EnumValue;
| 117 |
| 118 /* ---------------------------------------------------------------------- */ |
| 119 /* ValueMap */ |
| 120 |
| 121 /** |
| 122 * For any top-level property that has named values (binary and |
| 123 * enumerated properties), there is a ValueMap object. This object |
| 124 * maps from enum values to two other maps. One goes from value enums |
| 125 * to value names. The other goes from value names to value enums. |
| 126 * |
| 127 * The value enum values may be contiguous or disjoint. If they are |
| 128 * contiguous then the enumToName_offset is nonzero, and the |
| 129 * ncEnumToName_offset is zero. Vice versa if the value enums are |
| 130 * disjoint. |
| 131 * |
| 132 * There are n of these objects, where n is the number of binary |
| 133 * properties + the number of enumerated properties. |
| 134 */ |
| 135 struct ValueMap { |
| 136 |
| 137 /* -- begin pnames data -- */ |
| 138 /* Enum=>name EnumToOffset / NonContiguousEnumToOffset objects. */ |
| 139 /* Exactly one of these will be nonzero. */ |
| 140 Offset enumToName_offset; |
| 141 Offset ncEnumToName_offset; |
| 142 |
| 143 Offset nameToEnum_offset; /* Name=>enum data */ |
| 144 /* -- end pnames data -- */ |
| 145 }; |
| 146 |
| 147 /* ---------------------------------------------------------------------- */ |
| 148 /* PropertyAliases class */ |
| 149 |
| 150 /** |
| 151 * A class encapsulating access to the memory-mapped data representing |
| 152 * property aliases and property value aliases (pnames). The class |
| 153 * MUST have no v-table and declares certain methods inline -- small |
| 154 * methods and methods that are called from only one point. |
| 155 * |
| 156 * The data members in this class correspond to the in-memory layout |
| 157 * of the header of the pnames data. |
| 158 */ |
| 159 class PropertyAliases { |
| 160 |
| 161 /* -- begin pnames data -- */ |
| 162 /* Enum=>name EnumToOffset object for binary and enumerated */ |
| 163 /* properties */ |
| 164 Offset enumToName_offset; |
| 165 |
| 166 /* Name=>enum data for binary & enumerated properties */ |
| 167 Offset nameToEnum_offset; |
| 168 |
| 169 /* Enum=>offset EnumToOffset object mapping enumerated properties */ |
| 170 /* to ValueMap objects */ |
| 171 Offset enumToValue_offset; |
| 172 |
| 173 /* The following are needed by external readers of this data. */ |
| 174 /* We don't use them ourselves. */ |
| 175 int16_t total_size; /* size in bytes excluding the udata header */ |
| 176 Offset valueMap_offset; /* offset to start of array */ |
| 177 int16_t valueMap_count; /* number of entries */ |
| 178 Offset nameGroupPool_offset; /* offset to start of array */ |
| 179 int16_t nameGroupPool_count; /* number of entries (not groups) */ |
| 180 Offset stringPool_offset; /* offset to start of pool */ |
| 181 int16_t stringPool_count; /* number of strings (not size in bytes) */ |
| 182 |
| 183 /* -- end pnames data -- */ |
| 184 |
| 185 friend class ::Builder; |
| 186 |
| 187 const ValueMap* getValueMap(EnumValue prop) const; |
| 188 |
| 189 const char* chooseNameInGroup(Offset offset, |
| 190 UPropertyNameChoice choice) const; |
| 191 |
| 192 public: |
| 193 |
| 194 inline const int8_t* getPointer(Offset o) const { |
| 195 return ((const int8_t*) this) + o; |
| 196 } |
| 197 |
| 198 inline const int8_t* getPointerNull(Offset o) const { |
| 199 return o ? getPointer(o) : NULL; |
| 200 } |
| 201 |
| 202 inline const char* getPropertyName(EnumValue prop, |
| 203 UPropertyNameChoice choice) const; |
| 204 |
| 205 inline EnumValue getPropertyEnum(const char* alias) const; |
| 206 |
| 207 inline const char* getPropertyValueName(EnumValue prop, EnumValue value, |
| 208 UPropertyNameChoice choice) const; |
| 209 |
| 210 inline EnumValue getPropertyValueEnum(EnumValue prop, |
| 211 const char* alias) const; |
| 212 |
| 213 static int32_t |
| 214 swap(const UDataSwapper *ds, |
| 215 const uint8_t *inBytes, int32_t length, uint8_t *outBytes, |
| 216 UErrorCode *pErrorCode); |
| 217 }; |
| 218 |
| 219 /* ---------------------------------------------------------------------- */ |
| 220 /* EnumToOffset */ |
| 221 |
| 222 /** |
| 223 * A generic map from enum values to Offsets. The enum values must be |
| 224 * contiguous, from enumStart to enumLimit. The Offset values may |
| 225 * point to anything. |
| 226 */ |
| 227 class EnumToOffset { |
| 228 |
| 229 /* -- begin pnames data -- */ |
| 230 EnumValue enumStart; |
| 231 EnumValue enumLimit; |
| 232 Offset _offsetArray; /* [array of enumLimit-enumStart] */ |
| 233 /* -- end pnames data -- */ |
| 234 |
| 235 friend class ::Builder; |
| 236 |
| 237 Offset* getOffsetArray() { |
| 238 return &_offsetArray; |
| 239 } |
| 240 |
| 241 const Offset* getOffsetArray() const { |
| 242 return &_offsetArray; |
| 243 } |
| 244 |
| 245 static int32_t getSize(int32_t n) { |
| 246 return sizeof(EnumToOffset) + sizeof(Offset) * (n - 1); |
| 247 } |
| 248 |
| 249 int32_t getSize() { |
| 250 return getSize(enumLimit - enumStart); |
| 251 } |
| 252 |
| 253 public: |
| 254 |
| 255 Offset getOffset(EnumValue enumProbe) const { |
| 256 if (enumProbe < enumStart || |
| 257 enumProbe >= enumLimit) { |
| 258 return 0; /* not found */ |
| 259 } |
| 260 const Offset* p = getOffsetArray(); |
| 261 return p[enumProbe - enumStart]; |
| 262 } |
| 263 |
| 264 static int32_t |
| 265 swap(const UDataSwapper *ds, |
| 266 const uint8_t *inBytes, int32_t length, uint8_t *outBytes, |
| 267 uint8_t *temp, int32_t pos, |
| 268 UErrorCode *pErrorCode); |
| 269 }; |
| 270 |
| 271 /* ---------------------------------------------------------------------- */ |
| 272 /* NonContiguousEnumToOffset */ |
| 273 |
| 274 /** |
| 275 * A generic map from enum values to Offsets. The enum values may be |
| 276 * disjoint. If they are contiguous, an EnumToOffset should be used |
| 277 * instead. The Offset values may point to anything. |
| 278 */ |
| 279 class NonContiguousEnumToOffset { |
| 280 |
| 281 /* -- begin pnames data -- */ |
| 282 int32_t count; |
| 283 EnumValue _enumArray; /* [array of count] */ |
| 284 /* Offset _offsetArray; // [array of count] after enumValue[count-1] */ |
| 285 /* -- end pnames data -- */ |
| 286 |
| 287 friend class ::Builder; |
| 288 |
| 289 EnumValue* getEnumArray() { |
| 290 return &_enumArray; |
| 291 } |
| 292 |
| 293 const EnumValue* getEnumArray() const { |
| 294 return &_enumArray; |
| 295 } |
| 296 |
| 297 Offset* getOffsetArray() { |
| 298 return (Offset*) (getEnumArray() + count); |
| 299 } |
| 300 |
| 301 const Offset* getOffsetArray() const { |
| 302 return (Offset*) (getEnumArray() + count); |
| 303 } |
| 304 |
| 305 static int32_t getSize(int32_t n) { |
| 306 return sizeof(int32_t) + (sizeof(EnumValue) + sizeof(Offset)) * n; |
| 307 } |
| 308 |
| 309 int32_t getSize() { |
| 310 return getSize(count); |
| 311 } |
| 312 |
| 313 public: |
| 314 |
| 315 Offset getOffset(EnumValue enumProbe) const { |
| 316 const EnumValue* e = getEnumArray(); |
| 317 const Offset* p = getOffsetArray(); |
| 318 /* linear search; binary later if warranted */ |
| 319 /* (binary is not faster for short lists) */ |
| 320 for (int32_t i=0; i<count; ++i) { |
| 321 if (e[i] < enumProbe) continue; |
| 322 if (e[i] > enumProbe) break; |
| 323 return p[i]; |
| 324 } |
| 325 return 0; /* not found */ |
| 326 } |
| 327 |
| 328 static int32_t |
| 329 swap(const UDataSwapper *ds, |
| 330 const uint8_t *inBytes, int32_t length, uint8_t *outBytes, |
| 331 uint8_t *temp, int32_t pos, |
| 332 UErrorCode *pErrorCode); |
| 333 }; |
| 334 |
| 335 /* ---------------------------------------------------------------------- */ |
| 336 /* NameToEnum */ |
| 337 |
| 338 /** |
| 339 * A map from names to enum values. |
| 340 */ |
| 341 class NameToEnum { |
| 342 |
| 343 /* -- begin pnames data -- */ |
| 344 int32_t count; /* number of entries */ |
| 345 EnumValue _enumArray; /* [array of count] EnumValues */ |
| 346 /* Offset _nameArray; // [array of count] offsets to names */ |
| 347 /* -- end pnames data -- */ |
| 348 |
| 349 friend class ::Builder; |
| 350 |
| 351 EnumValue* getEnumArray() { |
| 352 return &_enumArray; |
| 353 } |
| 354 |
| 355 const EnumValue* getEnumArray() const { |
| 356 return &_enumArray; |
| 357 } |
| 358 |
| 359 Offset* getNameArray() { |
| 360 return (Offset*) (getEnumArray() + count); |
| 361 } |
| 362 |
| 363 const Offset* getNameArray() const { |
| 364 return (Offset*) (getEnumArray() + count); |
| 365 } |
| 366 |
| 367 static int32_t getSize(int32_t n) { |
| 368 return sizeof(int32_t) + (sizeof(Offset) + sizeof(EnumValue)) * n; |
| 369 } |
| 370 |
| 371 int32_t getSize() { |
| 372 return getSize(count); |
| 373 } |
| 374 |
| 375 public: |
| 376 |
| 377 EnumValue getEnum(const char* alias, const PropertyAliases& data) const { |
| 378 |
| 379 const Offset* n = getNameArray(); |
| 380 const EnumValue* e = getEnumArray(); |
| 381 |
| 382 /* linear search; binary later if warranted */ |
| 383 /* (binary is not faster for short lists) */ |
| 384 for (int32_t i=0; i<count; ++i) { |
| 385 const char* name = (const char*) data.getPointer(n[i]); |
| 386 int32_t c = uprv_comparePropertyNames(alias, name); |
| 387 if (c > 0) continue; |
| 388 if (c < 0) break; |
| 389 return e[i]; |
| 390 } |
| 391 |
| 392 return UCHAR_INVALID_CODE; |
| 393 } |
| 394 |
| 395 static int32_t |
| 396 swap(const UDataSwapper *ds, |
| 397 const uint8_t *inBytes, int32_t length, uint8_t *outBytes, |
| 398 uint8_t *temp, int32_t pos, |
| 399 UErrorCode *pErrorCode); |
| 400 }; |
| 401 |
| 402 /*---------------------------------------------------------------------- |
| 403 * |
| 404 * In-memory layout. THIS IS NOT A STANDALONE DOCUMENT. It goes |
| 405 * together with above C++ declarations and gives an overview. |
| 406 * |
| 407 * See above for definitions of Offset and EnumValue. Also, refer to |
| 408 * above class declarations for the "bottom line" on data layout. |
| 409 * |
| 410 * Sizes: |
| 411 * '*_offset' is an Offset (see above) |
| 412 * 'count' members are typically int32_t (see above declarations) |
| 413 * 'enumArray' is an array of EnumValue (see above) |
| 414 * 'offsetArray' is an array of Offset (see above) |
| 415 * 'nameArray' is an array of Offset (see above) |
| 416 * 'enum*' is an EnumValue (see above) |
| 417 * '*Array [x n]' means that *Array has n elements |
| 418 * |
| 419 * References: |
| 420 * Instead of pointers, this flat data structure contains offsets. |
| 421 * All offsets are relative to the start of 'header'. A notation |
| 422 * is used to indicate what structure each offset points to: |
| 423 * 'foo (>x)' the offset(s) in foo point to structure x |
| 424 * |
| 425 * Structures: |
| 426 * Each structure is assigned a number, except for the header, |
| 427 * which is called 'header'. The numbers are not contiguous |
| 428 * for historical reasons. Some structures have sub-parts |
| 429 * that are denoted with a letter, e.g., "5a". |
| 430 * |
| 431 * BEGIN LAYOUT |
| 432 * ============ |
| 433 * header: |
| 434 * enumToName_offset (>0) |
| 435 * nameToEnum_offset (>2) |
| 436 * enumToValue_offset (>3) |
| 437 * (alignment padding built into header) |
| 438 * |
| 439 * The header also contains the following, used by "external readers" |
| 440 * like ICU4J and icuswap. |
| 441 * |
| 442 * // The following are needed by external readers of this data. |
| 443 * // We don't use them ourselves. |
| 444 * int16_t total_size; // size in bytes excluding the udata header |
| 445 * Offset valueMap_offset; // offset to start of array |
| 446 * int16_t valueMap_count; // number of entries |
| 447 * Offset nameGroupPool_offset; // offset to start of array |
| 448 * int16_t nameGroupPool_count; // number of entries (not groups) |
| 449 * Offset stringPool_offset; // offset to start of pool |
| 450 * int16_t stringPool_count; // number of strings (not size in bytes) |
| 451 * |
| 452 * 0: # NonContiguousEnumToOffset obj for props => name groups |
| 453 * count |
| 454 * enumArray [x count] |
| 455 * offsetArray [x count] (>98) |
| 456 * |
| 457 * => pad to next 4-byte boundary |
| 458 * |
| 459 * (1: omitted -- no longer used) |
| 460 * |
| 461 * 2: # NameToEnum obj for binary & enumerated props |
| 462 * count |
| 463 * enumArray [x count] |
| 464 * nameArray [x count] (>99) |
| 465 * |
| 466 * => pad to next 4-byte boundary |
| 467 * |
| 468 * 3: # NonContiguousEnumToOffset obj for enumerated props => ValueMaps |
| 469 * count |
| 470 * enumArray [x count] |
| 471 * offsetArray [x count] (>4) |
| 472 * |
| 473 * => pad to next 4-byte boundary |
| 474 * |
| 475 * 4: # ValueMap array [x one for each enumerated prop i] |
| 476 * enumToName_offset (>5a +2*i) one of these two is NULL, one is not |
| 477 * ncEnumToName_offset (>5b +2*i) |
| 478 * nameToEnum_offset (>6 +2*i) |
| 479 * |
| 480 * => pad to next 4-byte boundary |
| 481 * |
| 482 * for each enumerated prop (either 5a or 5b): |
| 483 * |
| 484 * 5a: # EnumToOffset for enumerated prop's values => name groups |
| 485 * enumStart |
| 486 * enumLimit |
| 487 * offsetArray [x enumLimit - enumStart] (>98) |
| 488 * |
| 489 * => pad to next 4-byte boundary |
| 490 * |
| 491 * 5b: # NonContiguousEnumToOffset for enumerated prop's values => name groups |
| 492 * count |
| 493 * enumArray [x count] |
| 494 * offsetArray [x count] (>98) |
| 495 * |
| 496 * => pad to next 4-byte boundary |
| 497 * |
| 498 * 6: # NameToEnum for enumerated prop's values |
| 499 * count |
| 500 * enumArray [x count] |
| 501 * nameArray [x count] (>99) |
| 502 * |
| 503 * => pad to next 4-byte boundary |
| 504 * |
| 505 * 98: # name group pool {NGP} |
| 506 * [array of Offset values] (>99) |
| 507 * |
| 508 * 99: # string pool {SP} |
| 509 * [pool of nul-terminated char* strings] |
| 510 */ |
| 511 U_NAMESPACE_END |
| 512 |
| 513 #endif /* C++ */ |
| 514 |
| 515 #endif |
OLD | NEW |