source/i18n/collationweights.cpp - Issue 845603002: Update ICU to 54.1 step 1

Side by Side Diff: source/i18n/collationweights.cpp

Issue 845603002: Update ICU to 54.1 step 1 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/icu.git@master

Patch Set: remove unusued directories Created 5 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 /*

	2 *******************************************************************************

	3 *

	4 * Copyright (C) 1999-2014, International Business Machines

	5 * Corporation and others. All Rights Reserved.

	6 *

	7 *******************************************************************************

	8 * file name: collationweights.cpp

	9 * encoding: US-ASCII

	10 * tab size: 8 (not used)

	11 * indentation:4

	12 *

	13 * created on: 2001mar08 as ucol_wgt.cpp

	14 * created by: Markus W. Scherer

	15 *

	16 * This file contains code for allocating n collation element weights

	17 * between two exclusive limits.

	18 * It is used only internally by the collation tailoring builder.

	19 */

	20

	21 #include "unicode/utypes.h"

	22

	23 #if !UCONFIG_NO_COLLATION

	24

	25 #include "cmemory.h"

	26 #include "collation.h"

	27 #include "collationweights.h"

	28 #include "uarrsort.h"

	29 #include "uassert.h"

	30

	31 #ifdef UCOL_DEBUG

	32 # include <stdio.h>

	33 #endif

	34

	35 U_NAMESPACE_BEGIN

	36

	37 /* collation element weight allocation -------------------------------------- */

	38

	39 /* helper functions for CE weights */

	40

	41 static inline uint32_t

	42 getWeightTrail(uint32_t weight, int32_t length) {

	43 return (uint32_t)(weight>>(8*(4-length)))&0xff;

	44 }

	45

	46 static inline uint32_t

	47 setWeightTrail(uint32_t weight, int32_t length, uint32_t trail) {

	48 length=8*(4-length);

	49 return (uint32_t)((weight&(0xffffff00<<length))\|(trail<<length));

	50 }

	51

	52 static inline uint32_t

	53 getWeightByte(uint32_t weight, int32_t idx) {

	54 return getWeightTrail(weight, idx); /* same calculation */

	55 }

	56

	57 static inline uint32_t

	58 setWeightByte(uint32_t weight, int32_t idx, uint32_t byte) {

	59 uint32_t mask; /* 0xffffffff except a 00 "hole" for the index-th byte */

	60

	61 idx*=8;

	62 if(idx<32) {

	63 mask=((uint32_t)0xffffffff)>>idx;

	64 } else {

	65 // Do not use uint32_t>>32 because on some platforms that does not shift at all

	66 // while we need it to become 0.

	67 // PowerPC: 0xffffffff>>32 = 0 (wanted)

	68 // x86: 0xffffffff>>32 = 0xffffffff (not wanted)

	69 //

	70 // ANSI C99 6.5.7 Bitwise shift operators:

	71 // "If the value of the right operand is negative

	72 // or is greater than or equal to the width of the promoted left operand ,

	73 // the behavior is undefined."

	74 mask=0;

	75 }

	76 idx=32-idx;

	77 mask\|=0xffffff00<<idx;

	78 return (uint32_t)((weight&mask)\|(byte<<idx));

	79 }

	80

	81 static inline uint32_t

	82 truncateWeight(uint32_t weight, int32_t length) {

	83 return (uint32_t)(weight&(0xffffffff<<(8*(4-length))));

	84 }

	85

	86 static inline uint32_t

	87 incWeightTrail(uint32_t weight, int32_t length) {

	88 return (uint32_t)(weight+(1UL<<(8*(4-length))));

	89 }

	90

	91 static inline uint32_t

	92 decWeightTrail(uint32_t weight, int32_t length) {

	93 return (uint32_t)(weight-(1UL<<(8*(4-length))));

	94 }

	95

	96 CollationWeights::CollationWeights()

	97 : middleLength(0), rangeIndex(0), rangeCount(0) {

	98 for(int32_t i = 0; i < 5; ++i) {

	99 minBytes[i] = maxBytes[i] = 0;

	100 }

	101 }

	102

	103 void

	104 CollationWeights::initForPrimary(UBool compressible) {

	105 middleLength=1;

	106 minBytes[1] = Collation::MERGE_SEPARATOR_BYTE + 1;

	107 maxBytes[1] = Collation::TRAIL_WEIGHT_BYTE;

	108 if(compressible) {

	109 minBytes[2] = Collation::PRIMARY_COMPRESSION_LOW_BYTE + 1;

	110 maxBytes[2] = Collation::PRIMARY_COMPRESSION_HIGH_BYTE - 1;

	111 } else {

	112 minBytes[2] = 2;

	113 maxBytes[2] = 0xff;

	114 }

	115 minBytes[3] = 2;

	116 maxBytes[3] = 0xff;

	117 minBytes[4] = 2;

	118 maxBytes[4] = 0xff;

	119 }

	120

	121 void

	122 CollationWeights::initForSecondary() {

	123 // We use only the lower 16 bits for secondary weights.

	124 middleLength=3;

	125 minBytes[1] = 0;

	126 maxBytes[1] = 0;

	127 minBytes[2] = 0;

	128 maxBytes[2] = 0;

	129 minBytes[3] = Collation::MERGE_SEPARATOR_BYTE + 1;

	130 maxBytes[3] = 0xff;

	131 minBytes[4] = 2;

	132 maxBytes[4] = 0xff;

	133 }

	134

	135 void

	136 CollationWeights::initForTertiary() {

	137 // We use only the lower 16 bits for tertiary weights.

	138 middleLength=3;

	139 minBytes[1] = 0;

	140 maxBytes[1] = 0;

	141 minBytes[2] = 0;

	142 maxBytes[2] = 0;

	143 // We use only 6 bits per byte.

	144 // The other bits are used for case & quaternary weights.

	145 minBytes[3] = Collation::MERGE_SEPARATOR_BYTE + 1;

	146 maxBytes[3] = 0x3f;

	147 minBytes[4] = 2;

	148 maxBytes[4] = 0x3f;

	149 }

	150

	151 uint32_t

	152 CollationWeights::incWeight(uint32_t weight, int32_t length) const {

	153 for(;;) {

	154 uint32_t byte=getWeightByte(weight, length);

	155 if(byte<maxBytes[length]) {

	156 return setWeightByte(weight, length, byte+1);

	157 } else {

	158 // Roll over, set this byte to the minimum and increment the previou s one.

	159 weight=setWeightByte(weight, length, minBytes[length]);

	160 --length;

	161 U_ASSERT(length > 0);

	162 }

	163 }

	164 }

	165

	166 uint32_t

	167 CollationWeights::incWeightByOffset(uint32_t weight, int32_t length, int32_t off set) const {

	168 for(;;) {

	169 offset += getWeightByte(weight, length);

	170 if((uint32_t)offset <= maxBytes[length]) {

	171 return setWeightByte(weight, length, offset);

	172 } else {

	173 // Split the offset between this byte and the previous one.

	174 offset -= minBytes[length];

	175 weight = setWeightByte(weight, length, minBytes[length] + offset % c ountBytes(length));

	176 offset /= countBytes(length);

	177 --length;

	178 U_ASSERT(length > 0);

	179 }

	180 }

	181 }

	182

	183 void

	184 CollationWeights::lengthenRange(WeightRange &range) const {

	185 int32_t length=range.length+1;

	186 range.start=setWeightTrail(range.start, length, minBytes[length]);

	187 range.end=setWeightTrail(range.end, length, maxBytes[length]);

	188 range.count*=countBytes(length);

	189 range.length=length;

	190 }

	191

	192 /* for uprv_sortArray: sort ranges in weight order */

	193 static int32_t U_CALLCONV

	194 compareRanges(const void * /context/, const void left, const void right) {

	195 uint32_t l, r;

	196

	197 l=((const CollationWeights::WeightRange *)left)->start;

	198 r=((const CollationWeights::WeightRange *)right)->start;

	199 if(l<r) {

	200 return -1;

	201 } else if(l>r) {

	202 return 1;

	203 } else {

	204 return 0;

	205 }

	206 }

	207

	208 UBool

	209 CollationWeights::getWeightRanges(uint32_t lowerLimit, uint32_t upperLimit) {

	210 U_ASSERT(lowerLimit != 0);

	211 U_ASSERT(upperLimit != 0);

	212

	213 /* get the lengths of the limits */

	214 int32_t lowerLength=lengthOfWeight(lowerLimit);

	215 int32_t upperLength=lengthOfWeight(upperLimit);

	216

	217 #ifdef UCOL_DEBUG

	218 printf("length of lower limit 0x%08lx is %ld\n", lowerLimit, lowerLength);

	219 printf("length of upper limit 0x%08lx is %ld\n", upperLimit, upperLength);

	220 #endif

	221 U_ASSERT(lowerLength>=middleLength);

	222 // Permit upperLength<middleLength: The upper limit for secondaries is 0x100 00.

	223

	224 if(lowerLimit>=upperLimit) {

	225 #ifdef UCOL_DEBUG

	226 printf("error: no space between lower & upper limits\n");

	227 #endif

	228 return FALSE;

	229 }

	230

	231 /* check that neither is a prefix of the other */

	232 if(lowerLength<upperLength) {

	233 if(lowerLimit==truncateWeight(upperLimit, lowerLength)) {

	234 #ifdef UCOL_DEBUG

	235 printf("error: lower limit 0x%08lx is a prefix of upper limit 0x%08l x\n", lowerLimit, upperLimit);

	236 #endif

	237 return FALSE;

	238 }

	239 }

	240 /* if the upper limit is a prefix of the lower limit then the earlier test l owerLimit>=upperLimit has caught it */

	241

	242 WeightRange lower[5], middle, upper[5]; /* [0] and [1] are not used - this s implifies indexing */

	243 uprv_memset(lower, 0, sizeof(lower));

	244 uprv_memset(&middle, 0, sizeof(middle));

	245 uprv_memset(upper, 0, sizeof(upper));

	246

	247 /*

	248 * With the limit lengths of 1..4, there are up to 7 ranges for allocation:

	249 * range minimum length

	250 * lower[4] 4

	251 * lower[3] 3

	252 * lower[2] 2

	253 * middle 1

	254 * upper[2] 2

	255 * upper[3] 3

	256 * upper[4] 4

	257 *

	258 * We are now going to calculate up to 7 ranges.

	259 * Some of them will typically overlap, so we will then have to merge and el iminate ranges.

	260 */

	261 uint32_t weight=lowerLimit;

	262 for(int32_t length=lowerLength; length>middleLength; --length) {

	263 uint32_t trail=getWeightTrail(weight, length);

	264 if(trail<maxBytes[length]) {

	265 lower[length].start=incWeightTrail(weight, length);

	266 lower[length].end=setWeightTrail(weight, length, maxBytes[length]);

	267 lower[length].length=length;

	268 lower[length].count=maxBytes[length]-trail;

	269 }

	270 weight=truncateWeight(weight, length-1);

	271 }

	272 if(weight<0xff000000) {

	273 middle.start=incWeightTrail(weight, middleLength);

	274 } else {

	275 // Prevent overflow for primary lead byte FF

	276 // which would yield a middle range starting at 0.

	277 middle.start=0xffffffff; // no middle range

	278 }

	279

	280 weight=upperLimit;

	281 for(int32_t length=upperLength; length>middleLength; --length) {

	282 uint32_t trail=getWeightTrail(weight, length);

	283 if(trail>minBytes[length]) {

	284 upper[length].start=setWeightTrail(weight, length, minBytes[length]) ;

	285 upper[length].end=decWeightTrail(weight, length);

	286 upper[length].length=length;

	287 upper[length].count=trail-minBytes[length];

	288 }

	289 weight=truncateWeight(weight, length-1);

	290 }

	291 middle.end=decWeightTrail(weight, middleLength);

	292

	293 /* set the middle range */

	294 middle.length=middleLength;

	295 if(middle.end>=middle.start) {

	296 middle.count=(int32_t)((middle.end-middle.start)>>(8*(4-middleLength)))+ 1;

	297 } else {

	298 /* no middle range, eliminate overlaps */

	299

	300 /* reduce or remove the lower ranges that go beyond upperLimit */

	301 for(int32_t length=4; length>middleLength; --length) {

	302 if(lower[length].count>0 && upper[length].count>0) {

	303 uint32_t start=upper[length].start;

	304 uint32_t end=lower[length].end;

	305

	306 if(end>=start \|\| incWeight(end, length)==start) {

	307 /* lower and upper ranges collide or are directly adjacent: merge these two and remove all shorter ranges */

	308 start=lower[length].start;

	309 end=lower[length].end=upper[length].end;

	310 /*

	311 * merging directly adjacent ranges needs to subtract the 0/ 1 gaps in between;

	312 * it may result in a range with count>countBytes

	313 */

	314 lower[length].count=

	315 (int32_t)(getWeightTrail(end, length)-getWeightTrail(sta rt, length)+1+

	316 countBytes(length)*(getWeightByte(end, length- 1)-getWeightByte(start, length-1)));

	317 upper[length].count=0;

	318 while(--length>middleLength) {

	319 lower[length].count=upper[length].count=0;

	320 }

	321 break;

	322 }

	323 }

	324 }

	325 }

	326

	327 #ifdef UCOL_DEBUG

	328 /* print ranges */

	329 for(int32_t length=4; length>=2; --length) {

	330 if(lower[length].count>0) {

	331 printf("lower[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length , lower[length].start, lower[length].end, lower[length].count);

	332 }

	333 }

	334 if(middle.count>0) {

	335 printf("middle .start=0x%08lx .end=0x%08lx .count=%ld\n", middle.start , middle.end, middle.count);

	336 }

	337 for(int32_t length=2; length<=4; ++length) {

	338 if(upper[length].count>0) {

	339 printf("upper[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length , upper[length].start, upper[length].end, upper[length].count);

	340 }

	341 }

	342 #endif

	343

	344 /* copy the ranges, shortest first, into the result array */

	345 rangeCount=0;

	346 if(middle.count>0) {

	347 uprv_memcpy(ranges, &middle, sizeof(WeightRange));

	348 rangeCount=1;

	349 }

	350 for(int32_t length=middleLength+1; length<=4; ++length) {

	351 /* copy upper first so that later the middle range is more likely the fi rst one to use */

	352 if(upper[length].count>0) {

	353 uprv_memcpy(ranges+rangeCount, upper+length, sizeof(WeightRange));

	354 ++rangeCount;

	355 }

	356 if(lower[length].count>0) {

	357 uprv_memcpy(ranges+rangeCount, lower+length, sizeof(WeightRange));

	358 ++rangeCount;

	359 }

	360 }

	361 return rangeCount>0;

	362 }

	363

	364 UBool

	365 CollationWeights::allocWeightsInShortRanges(int32_t n, int32_t minLength) {

	366 // See if the first few minLength and minLength+1 ranges have enough weights .

	367 for(int32_t i = 0; i < rangeCount && ranges[i].length <= (minLength + 1); ++ i) {

	368 if(n <= ranges[i].count) {

	369 // Use the first few minLength and minLength+1 ranges.

	370 if(ranges[i].length > minLength) {

	371 // Reduce the number of weights from the last minLength+1 range

	372 // which might sort before some minLength ranges,

	373 // so that we use all weights in the minLength ranges.

	374 ranges[i].count = n;

	375 }

	376 rangeCount = i + 1;

	377 #ifdef UCOL_DEBUG

	378 printf("take first %ld ranges\n", rangeCount);

	379 #endif

	380

	381 if(rangeCount>1) {

	382 /* sort the ranges by weight values */

	383 UErrorCode errorCode=U_ZERO_ERROR;

	384 uprv_sortArray(ranges, rangeCount, sizeof(WeightRange),

	385 compareRanges, NULL, FALSE, &errorCode);

	386 /* ignore error code: we know that the internal sort function wi ll not fail here */

	387 }

	388 return TRUE;

	389 }

	390 n -= ranges[i].count; // still >0

	391 }

	392 return FALSE;

	393 }

	394

	395 UBool

	396 CollationWeights::allocWeightsInMinLengthRanges(int32_t n, int32_t minLength) {

	397 // See if the minLength ranges have enough weights

	398 // when we split one and lengthen the following ones.

	399 int32_t count = 0;

	400 int32_t minLengthRangeCount;

	401 for(minLengthRangeCount = 0;

	402 minLengthRangeCount < rangeCount &&

	403 ranges[minLengthRangeCount].length == minLength;

	404 ++minLengthRangeCount) {

	405 count += ranges[minLengthRangeCount].count;

	406 }

	407

	408 int32_t nextCountBytes = countBytes(minLength + 1);

	409 if(n > count * nextCountBytes) { return FALSE; }

	410

	411 // Use the minLength ranges. Merge them, and then split again as necessary.

	412 uint32_t start = ranges[0].start;

	413 uint32_t end = ranges[0].end;

	414 for(int32_t i = 1; i < minLengthRangeCount; ++i) {

	415 if(ranges[i].start < start) { start = ranges[i].start; }

	416 if(ranges[i].end > end) { end = ranges[i].end; }

	417 }

	418

	419 // Calculate how to split the range between minLength (count1) and minLength +1 (count2).

	420 // Goal:

	421 // count1 + count2 * nextCountBytes = n

	422 // count1 + count2 = count

	423 // These turn into

	424 // (count - count2) + count2 * nextCountBytes = n

	425 // and then into the following count1 & count2 computations.

	426 int32_t count2 = (n - count) / (nextCountBytes - 1); // number of weights t o be lengthened

	427 int32_t count1 = count - count2; // number of minLength weights

	428 if(count2 == 0 \|\| (count1 + count2 * nextCountBytes) < n) {

	429 // round up

	430 ++count2;

	431 --count1;

	432 U_ASSERT((count1 + count2 * nextCountBytes) >= n);

	433 }

	434

	435 ranges[0].start = start;

	436

	437 if(count1 == 0) {

	438 // Make one long range.

	439 ranges[0].end = end;

	440 ranges[0].count = count;

	441 lengthenRange(ranges[0]);

	442 rangeCount = 1;

	443 } else {

	444 // Split the range, lengthen the second part.

	445 #ifdef UCOL_DEBUG

	446 printf("split the range number %ld (out of %ld minLength ranges) by %ld: %ld\n",

	447 splitRange, rangeCount, count1, count2);

	448 #endif

	449

	450 // Next start = start + count1. First end = 1 before that.

	451 ranges[0].end = incWeightByOffset(start, minLength, count1 - 1);

	452 ranges[0].count = count1;

	453

	454 ranges[1].start = incWeight(ranges[0].end, minLength);

	455 ranges[1].end = end;

	456 ranges[1].length = minLength; // +1 when lengthened

	457 ranges[1].count = count2; // *countBytes when lengthened

	458 lengthenRange(ranges[1]);

	459 rangeCount = 2;

	460 }

	461 return TRUE;

	462 }

	463

	464 /*

	465 * call getWeightRanges and then determine heuristically

	466 * which ranges to use for a given number of weights between (excluding)

	467 * two limits

	468 */

	469 UBool

	470 CollationWeights::allocWeights(uint32_t lowerLimit, uint32_t upperLimit, int32_t n) {

	471 #ifdef UCOL_DEBUG

	472 puts("");

	473 #endif

	474

	475 if(!getWeightRanges(lowerLimit, upperLimit)) {

	476 #ifdef UCOL_DEBUG

	477 printf("error: unable to get Weight ranges\n");

	478 #endif

	479 return FALSE;

	480 }

	481

	482 /* try until we find suitably large ranges */

	483 for(;;) {

	484 /* get the smallest number of bytes in a range */

	485 int32_t minLength=ranges[0].length;

	486

	487 if(allocWeightsInShortRanges(n, minLength)) { break; }

	488

	489 if(minLength == 4) {

	490 #ifdef UCOL_DEBUG

	491 printf("error: the maximum number of %ld weights is insufficient for n=%ld\n",

	492 minLengthCount, n);

	493 #endif

	494 return FALSE;

	495 }

	496

	497 if(allocWeightsInMinLengthRanges(n, minLength)) { break; }

	498

	499 /* no good match, lengthen all minLength ranges and iterate */

	500 #ifdef UCOL_DEBUG

	501 printf("lengthen the short ranges from %ld bytes to %ld and iterate\n", minLength, minLength+1);

	502 #endif

	503 for(int32_t i=0; ranges[i].length==minLength; ++i) {

	504 lengthenRange(ranges[i]);

	505 }

	506 }

	507

	508 #ifdef UCOL_DEBUG

	509 puts("final ranges:");

	510 for(int32_t i=0; i<rangeCount; ++i) {

	511 printf("ranges[%ld] .start=0x%08lx .end=0x%08lx .length=%ld .count=%ld\n ",

	512 i, ranges[i].start, ranges[i].end, ranges[i].length, ranges[i].co unt);

	513 }

	514 #endif

	515

	516 rangeIndex = 0;

	517 return TRUE;

	518 }

	519

	520 uint32_t

	521 CollationWeights::nextWeight() {

	522 if(rangeIndex >= rangeCount) {

	523 return 0xffffffff;

	524 } else {

	525 /* get the next weight */

	526 WeightRange &range = ranges[rangeIndex];

	527 uint32_t weight = range.start;

	528 if(--range.count == 0) {

	529 /* this range is finished */

	530 ++rangeIndex;

	531 } else {

	532 /* increment the weight for the next value */

	533 range.start = incWeight(weight, range.length);

	534 U_ASSERT(range.start <= range.end);

	535 }

	536

	537 return weight;

	538 }

	539 }

	540

	541 U_NAMESPACE_END

	542

	543 #endif /* #if !UCONFIG_NO_COLLATION */

OLD	NEW

« no previous file with comments | « source/i18n/collationweights.h ('k') | source/i18n/compactdecimalformat.cpp » ('j') | no next file with comments »