OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ******************************************************************************* |
| 3 * |
| 4 * Copyright (C) 1999-2014, International Business Machines |
| 5 * Corporation and others. All Rights Reserved. |
| 6 * |
| 7 ******************************************************************************* |
| 8 * file name: collationweights.cpp |
| 9 * encoding: US-ASCII |
| 10 * tab size: 8 (not used) |
| 11 * indentation:4 |
| 12 * |
| 13 * created on: 2001mar08 as ucol_wgt.cpp |
| 14 * created by: Markus W. Scherer |
| 15 * |
| 16 * This file contains code for allocating n collation element weights |
| 17 * between two exclusive limits. |
| 18 * It is used only internally by the collation tailoring builder. |
| 19 */ |
| 20 |
| 21 #include "unicode/utypes.h" |
| 22 |
| 23 #if !UCONFIG_NO_COLLATION |
| 24 |
| 25 #include "cmemory.h" |
| 26 #include "collation.h" |
| 27 #include "collationweights.h" |
| 28 #include "uarrsort.h" |
| 29 #include "uassert.h" |
| 30 |
| 31 #ifdef UCOL_DEBUG |
| 32 # include <stdio.h> |
| 33 #endif |
| 34 |
| 35 U_NAMESPACE_BEGIN |
| 36 |
| 37 /* collation element weight allocation -------------------------------------- */ |
| 38 |
| 39 /* helper functions for CE weights */ |
| 40 |
| 41 static inline uint32_t |
| 42 getWeightTrail(uint32_t weight, int32_t length) { |
| 43 return (uint32_t)(weight>>(8*(4-length)))&0xff; |
| 44 } |
| 45 |
| 46 static inline uint32_t |
| 47 setWeightTrail(uint32_t weight, int32_t length, uint32_t trail) { |
| 48 length=8*(4-length); |
| 49 return (uint32_t)((weight&(0xffffff00<<length))|(trail<<length)); |
| 50 } |
| 51 |
| 52 static inline uint32_t |
| 53 getWeightByte(uint32_t weight, int32_t idx) { |
| 54 return getWeightTrail(weight, idx); /* same calculation */ |
| 55 } |
| 56 |
| 57 static inline uint32_t |
| 58 setWeightByte(uint32_t weight, int32_t idx, uint32_t byte) { |
| 59 uint32_t mask; /* 0xffffffff except a 00 "hole" for the index-th byte */ |
| 60 |
| 61 idx*=8; |
| 62 if(idx<32) { |
| 63 mask=((uint32_t)0xffffffff)>>idx; |
| 64 } else { |
| 65 // Do not use uint32_t>>32 because on some platforms that does not shift
at all |
| 66 // while we need it to become 0. |
| 67 // PowerPC: 0xffffffff>>32 = 0 (wanted) |
| 68 // x86: 0xffffffff>>32 = 0xffffffff (not wanted) |
| 69 // |
| 70 // ANSI C99 6.5.7 Bitwise shift operators: |
| 71 // "If the value of the right operand is negative |
| 72 // or is greater than or equal to the width of the promoted left operand
, |
| 73 // the behavior is undefined." |
| 74 mask=0; |
| 75 } |
| 76 idx=32-idx; |
| 77 mask|=0xffffff00<<idx; |
| 78 return (uint32_t)((weight&mask)|(byte<<idx)); |
| 79 } |
| 80 |
| 81 static inline uint32_t |
| 82 truncateWeight(uint32_t weight, int32_t length) { |
| 83 return (uint32_t)(weight&(0xffffffff<<(8*(4-length)))); |
| 84 } |
| 85 |
| 86 static inline uint32_t |
| 87 incWeightTrail(uint32_t weight, int32_t length) { |
| 88 return (uint32_t)(weight+(1UL<<(8*(4-length)))); |
| 89 } |
| 90 |
| 91 static inline uint32_t |
| 92 decWeightTrail(uint32_t weight, int32_t length) { |
| 93 return (uint32_t)(weight-(1UL<<(8*(4-length)))); |
| 94 } |
| 95 |
| 96 CollationWeights::CollationWeights() |
| 97 : middleLength(0), rangeIndex(0), rangeCount(0) { |
| 98 for(int32_t i = 0; i < 5; ++i) { |
| 99 minBytes[i] = maxBytes[i] = 0; |
| 100 } |
| 101 } |
| 102 |
| 103 void |
| 104 CollationWeights::initForPrimary(UBool compressible) { |
| 105 middleLength=1; |
| 106 minBytes[1] = Collation::MERGE_SEPARATOR_BYTE + 1; |
| 107 maxBytes[1] = Collation::TRAIL_WEIGHT_BYTE; |
| 108 if(compressible) { |
| 109 minBytes[2] = Collation::PRIMARY_COMPRESSION_LOW_BYTE + 1; |
| 110 maxBytes[2] = Collation::PRIMARY_COMPRESSION_HIGH_BYTE - 1; |
| 111 } else { |
| 112 minBytes[2] = 2; |
| 113 maxBytes[2] = 0xff; |
| 114 } |
| 115 minBytes[3] = 2; |
| 116 maxBytes[3] = 0xff; |
| 117 minBytes[4] = 2; |
| 118 maxBytes[4] = 0xff; |
| 119 } |
| 120 |
| 121 void |
| 122 CollationWeights::initForSecondary() { |
| 123 // We use only the lower 16 bits for secondary weights. |
| 124 middleLength=3; |
| 125 minBytes[1] = 0; |
| 126 maxBytes[1] = 0; |
| 127 minBytes[2] = 0; |
| 128 maxBytes[2] = 0; |
| 129 minBytes[3] = Collation::MERGE_SEPARATOR_BYTE + 1; |
| 130 maxBytes[3] = 0xff; |
| 131 minBytes[4] = 2; |
| 132 maxBytes[4] = 0xff; |
| 133 } |
| 134 |
| 135 void |
| 136 CollationWeights::initForTertiary() { |
| 137 // We use only the lower 16 bits for tertiary weights. |
| 138 middleLength=3; |
| 139 minBytes[1] = 0; |
| 140 maxBytes[1] = 0; |
| 141 minBytes[2] = 0; |
| 142 maxBytes[2] = 0; |
| 143 // We use only 6 bits per byte. |
| 144 // The other bits are used for case & quaternary weights. |
| 145 minBytes[3] = Collation::MERGE_SEPARATOR_BYTE + 1; |
| 146 maxBytes[3] = 0x3f; |
| 147 minBytes[4] = 2; |
| 148 maxBytes[4] = 0x3f; |
| 149 } |
| 150 |
| 151 uint32_t |
| 152 CollationWeights::incWeight(uint32_t weight, int32_t length) const { |
| 153 for(;;) { |
| 154 uint32_t byte=getWeightByte(weight, length); |
| 155 if(byte<maxBytes[length]) { |
| 156 return setWeightByte(weight, length, byte+1); |
| 157 } else { |
| 158 // Roll over, set this byte to the minimum and increment the previou
s one. |
| 159 weight=setWeightByte(weight, length, minBytes[length]); |
| 160 --length; |
| 161 U_ASSERT(length > 0); |
| 162 } |
| 163 } |
| 164 } |
| 165 |
| 166 uint32_t |
| 167 CollationWeights::incWeightByOffset(uint32_t weight, int32_t length, int32_t off
set) const { |
| 168 for(;;) { |
| 169 offset += getWeightByte(weight, length); |
| 170 if((uint32_t)offset <= maxBytes[length]) { |
| 171 return setWeightByte(weight, length, offset); |
| 172 } else { |
| 173 // Split the offset between this byte and the previous one. |
| 174 offset -= minBytes[length]; |
| 175 weight = setWeightByte(weight, length, minBytes[length] + offset % c
ountBytes(length)); |
| 176 offset /= countBytes(length); |
| 177 --length; |
| 178 U_ASSERT(length > 0); |
| 179 } |
| 180 } |
| 181 } |
| 182 |
| 183 void |
| 184 CollationWeights::lengthenRange(WeightRange &range) const { |
| 185 int32_t length=range.length+1; |
| 186 range.start=setWeightTrail(range.start, length, minBytes[length]); |
| 187 range.end=setWeightTrail(range.end, length, maxBytes[length]); |
| 188 range.count*=countBytes(length); |
| 189 range.length=length; |
| 190 } |
| 191 |
| 192 /* for uprv_sortArray: sort ranges in weight order */ |
| 193 static int32_t U_CALLCONV |
| 194 compareRanges(const void * /*context*/, const void *left, const void *right) { |
| 195 uint32_t l, r; |
| 196 |
| 197 l=((const CollationWeights::WeightRange *)left)->start; |
| 198 r=((const CollationWeights::WeightRange *)right)->start; |
| 199 if(l<r) { |
| 200 return -1; |
| 201 } else if(l>r) { |
| 202 return 1; |
| 203 } else { |
| 204 return 0; |
| 205 } |
| 206 } |
| 207 |
| 208 UBool |
| 209 CollationWeights::getWeightRanges(uint32_t lowerLimit, uint32_t upperLimit) { |
| 210 U_ASSERT(lowerLimit != 0); |
| 211 U_ASSERT(upperLimit != 0); |
| 212 |
| 213 /* get the lengths of the limits */ |
| 214 int32_t lowerLength=lengthOfWeight(lowerLimit); |
| 215 int32_t upperLength=lengthOfWeight(upperLimit); |
| 216 |
| 217 #ifdef UCOL_DEBUG |
| 218 printf("length of lower limit 0x%08lx is %ld\n", lowerLimit, lowerLength); |
| 219 printf("length of upper limit 0x%08lx is %ld\n", upperLimit, upperLength); |
| 220 #endif |
| 221 U_ASSERT(lowerLength>=middleLength); |
| 222 // Permit upperLength<middleLength: The upper limit for secondaries is 0x100
00. |
| 223 |
| 224 if(lowerLimit>=upperLimit) { |
| 225 #ifdef UCOL_DEBUG |
| 226 printf("error: no space between lower & upper limits\n"); |
| 227 #endif |
| 228 return FALSE; |
| 229 } |
| 230 |
| 231 /* check that neither is a prefix of the other */ |
| 232 if(lowerLength<upperLength) { |
| 233 if(lowerLimit==truncateWeight(upperLimit, lowerLength)) { |
| 234 #ifdef UCOL_DEBUG |
| 235 printf("error: lower limit 0x%08lx is a prefix of upper limit 0x%08l
x\n", lowerLimit, upperLimit); |
| 236 #endif |
| 237 return FALSE; |
| 238 } |
| 239 } |
| 240 /* if the upper limit is a prefix of the lower limit then the earlier test l
owerLimit>=upperLimit has caught it */ |
| 241 |
| 242 WeightRange lower[5], middle, upper[5]; /* [0] and [1] are not used - this s
implifies indexing */ |
| 243 uprv_memset(lower, 0, sizeof(lower)); |
| 244 uprv_memset(&middle, 0, sizeof(middle)); |
| 245 uprv_memset(upper, 0, sizeof(upper)); |
| 246 |
| 247 /* |
| 248 * With the limit lengths of 1..4, there are up to 7 ranges for allocation: |
| 249 * range minimum length |
| 250 * lower[4] 4 |
| 251 * lower[3] 3 |
| 252 * lower[2] 2 |
| 253 * middle 1 |
| 254 * upper[2] 2 |
| 255 * upper[3] 3 |
| 256 * upper[4] 4 |
| 257 * |
| 258 * We are now going to calculate up to 7 ranges. |
| 259 * Some of them will typically overlap, so we will then have to merge and el
iminate ranges. |
| 260 */ |
| 261 uint32_t weight=lowerLimit; |
| 262 for(int32_t length=lowerLength; length>middleLength; --length) { |
| 263 uint32_t trail=getWeightTrail(weight, length); |
| 264 if(trail<maxBytes[length]) { |
| 265 lower[length].start=incWeightTrail(weight, length); |
| 266 lower[length].end=setWeightTrail(weight, length, maxBytes[length]); |
| 267 lower[length].length=length; |
| 268 lower[length].count=maxBytes[length]-trail; |
| 269 } |
| 270 weight=truncateWeight(weight, length-1); |
| 271 } |
| 272 if(weight<0xff000000) { |
| 273 middle.start=incWeightTrail(weight, middleLength); |
| 274 } else { |
| 275 // Prevent overflow for primary lead byte FF |
| 276 // which would yield a middle range starting at 0. |
| 277 middle.start=0xffffffff; // no middle range |
| 278 } |
| 279 |
| 280 weight=upperLimit; |
| 281 for(int32_t length=upperLength; length>middleLength; --length) { |
| 282 uint32_t trail=getWeightTrail(weight, length); |
| 283 if(trail>minBytes[length]) { |
| 284 upper[length].start=setWeightTrail(weight, length, minBytes[length])
; |
| 285 upper[length].end=decWeightTrail(weight, length); |
| 286 upper[length].length=length; |
| 287 upper[length].count=trail-minBytes[length]; |
| 288 } |
| 289 weight=truncateWeight(weight, length-1); |
| 290 } |
| 291 middle.end=decWeightTrail(weight, middleLength); |
| 292 |
| 293 /* set the middle range */ |
| 294 middle.length=middleLength; |
| 295 if(middle.end>=middle.start) { |
| 296 middle.count=(int32_t)((middle.end-middle.start)>>(8*(4-middleLength)))+
1; |
| 297 } else { |
| 298 /* no middle range, eliminate overlaps */ |
| 299 |
| 300 /* reduce or remove the lower ranges that go beyond upperLimit */ |
| 301 for(int32_t length=4; length>middleLength; --length) { |
| 302 if(lower[length].count>0 && upper[length].count>0) { |
| 303 uint32_t start=upper[length].start; |
| 304 uint32_t end=lower[length].end; |
| 305 |
| 306 if(end>=start || incWeight(end, length)==start) { |
| 307 /* lower and upper ranges collide or are directly adjacent:
merge these two and remove all shorter ranges */ |
| 308 start=lower[length].start; |
| 309 end=lower[length].end=upper[length].end; |
| 310 /* |
| 311 * merging directly adjacent ranges needs to subtract the 0/
1 gaps in between; |
| 312 * it may result in a range with count>countBytes |
| 313 */ |
| 314 lower[length].count= |
| 315 (int32_t)(getWeightTrail(end, length)-getWeightTrail(sta
rt, length)+1+ |
| 316 countBytes(length)*(getWeightByte(end, length-
1)-getWeightByte(start, length-1))); |
| 317 upper[length].count=0; |
| 318 while(--length>middleLength) { |
| 319 lower[length].count=upper[length].count=0; |
| 320 } |
| 321 break; |
| 322 } |
| 323 } |
| 324 } |
| 325 } |
| 326 |
| 327 #ifdef UCOL_DEBUG |
| 328 /* print ranges */ |
| 329 for(int32_t length=4; length>=2; --length) { |
| 330 if(lower[length].count>0) { |
| 331 printf("lower[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length
, lower[length].start, lower[length].end, lower[length].count); |
| 332 } |
| 333 } |
| 334 if(middle.count>0) { |
| 335 printf("middle .start=0x%08lx .end=0x%08lx .count=%ld\n", middle.start
, middle.end, middle.count); |
| 336 } |
| 337 for(int32_t length=2; length<=4; ++length) { |
| 338 if(upper[length].count>0) { |
| 339 printf("upper[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length
, upper[length].start, upper[length].end, upper[length].count); |
| 340 } |
| 341 } |
| 342 #endif |
| 343 |
| 344 /* copy the ranges, shortest first, into the result array */ |
| 345 rangeCount=0; |
| 346 if(middle.count>0) { |
| 347 uprv_memcpy(ranges, &middle, sizeof(WeightRange)); |
| 348 rangeCount=1; |
| 349 } |
| 350 for(int32_t length=middleLength+1; length<=4; ++length) { |
| 351 /* copy upper first so that later the middle range is more likely the fi
rst one to use */ |
| 352 if(upper[length].count>0) { |
| 353 uprv_memcpy(ranges+rangeCount, upper+length, sizeof(WeightRange)); |
| 354 ++rangeCount; |
| 355 } |
| 356 if(lower[length].count>0) { |
| 357 uprv_memcpy(ranges+rangeCount, lower+length, sizeof(WeightRange)); |
| 358 ++rangeCount; |
| 359 } |
| 360 } |
| 361 return rangeCount>0; |
| 362 } |
| 363 |
| 364 UBool |
| 365 CollationWeights::allocWeightsInShortRanges(int32_t n, int32_t minLength) { |
| 366 // See if the first few minLength and minLength+1 ranges have enough weights
. |
| 367 for(int32_t i = 0; i < rangeCount && ranges[i].length <= (minLength + 1); ++
i) { |
| 368 if(n <= ranges[i].count) { |
| 369 // Use the first few minLength and minLength+1 ranges. |
| 370 if(ranges[i].length > minLength) { |
| 371 // Reduce the number of weights from the last minLength+1 range |
| 372 // which might sort before some minLength ranges, |
| 373 // so that we use all weights in the minLength ranges. |
| 374 ranges[i].count = n; |
| 375 } |
| 376 rangeCount = i + 1; |
| 377 #ifdef UCOL_DEBUG |
| 378 printf("take first %ld ranges\n", rangeCount); |
| 379 #endif |
| 380 |
| 381 if(rangeCount>1) { |
| 382 /* sort the ranges by weight values */ |
| 383 UErrorCode errorCode=U_ZERO_ERROR; |
| 384 uprv_sortArray(ranges, rangeCount, sizeof(WeightRange), |
| 385 compareRanges, NULL, FALSE, &errorCode); |
| 386 /* ignore error code: we know that the internal sort function wi
ll not fail here */ |
| 387 } |
| 388 return TRUE; |
| 389 } |
| 390 n -= ranges[i].count; // still >0 |
| 391 } |
| 392 return FALSE; |
| 393 } |
| 394 |
| 395 UBool |
| 396 CollationWeights::allocWeightsInMinLengthRanges(int32_t n, int32_t minLength) { |
| 397 // See if the minLength ranges have enough weights |
| 398 // when we split one and lengthen the following ones. |
| 399 int32_t count = 0; |
| 400 int32_t minLengthRangeCount; |
| 401 for(minLengthRangeCount = 0; |
| 402 minLengthRangeCount < rangeCount && |
| 403 ranges[minLengthRangeCount].length == minLength; |
| 404 ++minLengthRangeCount) { |
| 405 count += ranges[minLengthRangeCount].count; |
| 406 } |
| 407 |
| 408 int32_t nextCountBytes = countBytes(minLength + 1); |
| 409 if(n > count * nextCountBytes) { return FALSE; } |
| 410 |
| 411 // Use the minLength ranges. Merge them, and then split again as necessary. |
| 412 uint32_t start = ranges[0].start; |
| 413 uint32_t end = ranges[0].end; |
| 414 for(int32_t i = 1; i < minLengthRangeCount; ++i) { |
| 415 if(ranges[i].start < start) { start = ranges[i].start; } |
| 416 if(ranges[i].end > end) { end = ranges[i].end; } |
| 417 } |
| 418 |
| 419 // Calculate how to split the range between minLength (count1) and minLength
+1 (count2). |
| 420 // Goal: |
| 421 // count1 + count2 * nextCountBytes = n |
| 422 // count1 + count2 = count |
| 423 // These turn into |
| 424 // (count - count2) + count2 * nextCountBytes = n |
| 425 // and then into the following count1 & count2 computations. |
| 426 int32_t count2 = (n - count) / (nextCountBytes - 1); // number of weights t
o be lengthened |
| 427 int32_t count1 = count - count2; // number of minLength weights |
| 428 if(count2 == 0 || (count1 + count2 * nextCountBytes) < n) { |
| 429 // round up |
| 430 ++count2; |
| 431 --count1; |
| 432 U_ASSERT((count1 + count2 * nextCountBytes) >= n); |
| 433 } |
| 434 |
| 435 ranges[0].start = start; |
| 436 |
| 437 if(count1 == 0) { |
| 438 // Make one long range. |
| 439 ranges[0].end = end; |
| 440 ranges[0].count = count; |
| 441 lengthenRange(ranges[0]); |
| 442 rangeCount = 1; |
| 443 } else { |
| 444 // Split the range, lengthen the second part. |
| 445 #ifdef UCOL_DEBUG |
| 446 printf("split the range number %ld (out of %ld minLength ranges) by %ld:
%ld\n", |
| 447 splitRange, rangeCount, count1, count2); |
| 448 #endif |
| 449 |
| 450 // Next start = start + count1. First end = 1 before that. |
| 451 ranges[0].end = incWeightByOffset(start, minLength, count1 - 1); |
| 452 ranges[0].count = count1; |
| 453 |
| 454 ranges[1].start = incWeight(ranges[0].end, minLength); |
| 455 ranges[1].end = end; |
| 456 ranges[1].length = minLength; // +1 when lengthened |
| 457 ranges[1].count = count2; // *countBytes when lengthened |
| 458 lengthenRange(ranges[1]); |
| 459 rangeCount = 2; |
| 460 } |
| 461 return TRUE; |
| 462 } |
| 463 |
| 464 /* |
| 465 * call getWeightRanges and then determine heuristically |
| 466 * which ranges to use for a given number of weights between (excluding) |
| 467 * two limits |
| 468 */ |
| 469 UBool |
| 470 CollationWeights::allocWeights(uint32_t lowerLimit, uint32_t upperLimit, int32_t
n) { |
| 471 #ifdef UCOL_DEBUG |
| 472 puts(""); |
| 473 #endif |
| 474 |
| 475 if(!getWeightRanges(lowerLimit, upperLimit)) { |
| 476 #ifdef UCOL_DEBUG |
| 477 printf("error: unable to get Weight ranges\n"); |
| 478 #endif |
| 479 return FALSE; |
| 480 } |
| 481 |
| 482 /* try until we find suitably large ranges */ |
| 483 for(;;) { |
| 484 /* get the smallest number of bytes in a range */ |
| 485 int32_t minLength=ranges[0].length; |
| 486 |
| 487 if(allocWeightsInShortRanges(n, minLength)) { break; } |
| 488 |
| 489 if(minLength == 4) { |
| 490 #ifdef UCOL_DEBUG |
| 491 printf("error: the maximum number of %ld weights is insufficient for
n=%ld\n", |
| 492 minLengthCount, n); |
| 493 #endif |
| 494 return FALSE; |
| 495 } |
| 496 |
| 497 if(allocWeightsInMinLengthRanges(n, minLength)) { break; } |
| 498 |
| 499 /* no good match, lengthen all minLength ranges and iterate */ |
| 500 #ifdef UCOL_DEBUG |
| 501 printf("lengthen the short ranges from %ld bytes to %ld and iterate\n",
minLength, minLength+1); |
| 502 #endif |
| 503 for(int32_t i=0; ranges[i].length==minLength; ++i) { |
| 504 lengthenRange(ranges[i]); |
| 505 } |
| 506 } |
| 507 |
| 508 #ifdef UCOL_DEBUG |
| 509 puts("final ranges:"); |
| 510 for(int32_t i=0; i<rangeCount; ++i) { |
| 511 printf("ranges[%ld] .start=0x%08lx .end=0x%08lx .length=%ld .count=%ld\n
", |
| 512 i, ranges[i].start, ranges[i].end, ranges[i].length, ranges[i].co
unt); |
| 513 } |
| 514 #endif |
| 515 |
| 516 rangeIndex = 0; |
| 517 return TRUE; |
| 518 } |
| 519 |
| 520 uint32_t |
| 521 CollationWeights::nextWeight() { |
| 522 if(rangeIndex >= rangeCount) { |
| 523 return 0xffffffff; |
| 524 } else { |
| 525 /* get the next weight */ |
| 526 WeightRange &range = ranges[rangeIndex]; |
| 527 uint32_t weight = range.start; |
| 528 if(--range.count == 0) { |
| 529 /* this range is finished */ |
| 530 ++rangeIndex; |
| 531 } else { |
| 532 /* increment the weight for the next value */ |
| 533 range.start = incWeight(weight, range.length); |
| 534 U_ASSERT(range.start <= range.end); |
| 535 } |
| 536 |
| 537 return weight; |
| 538 } |
| 539 } |
| 540 |
| 541 U_NAMESPACE_END |
| 542 |
| 543 #endif /* #if !UCONFIG_NO_COLLATION */ |
OLD | NEW |