OLD | NEW |
1 /* | 1 /* |
2 ******************************************************************************* | 2 ******************************************************************************* |
3 * | 3 * |
4 * Copyright (C) 1999-2014, International Business Machines | 4 * Copyright (C) 1999-2015, International Business Machines |
5 * Corporation and others. All Rights Reserved. | 5 * Corporation and others. All Rights Reserved. |
6 * | 6 * |
7 ******************************************************************************* | 7 ******************************************************************************* |
8 * file name: collationweights.cpp | 8 * file name: collationweights.cpp |
9 * encoding: US-ASCII | 9 * encoding: US-ASCII |
10 * tab size: 8 (not used) | 10 * tab size: 8 (not used) |
11 * indentation:4 | 11 * indentation:4 |
12 * | 12 * |
13 * created on: 2001mar08 as ucol_wgt.cpp | 13 * created on: 2001mar08 as ucol_wgt.cpp |
14 * created by: Markus W. Scherer | 14 * created by: Markus W. Scherer |
(...skipping 104 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
119 } | 119 } |
120 | 120 |
121 void | 121 void |
122 CollationWeights::initForSecondary() { | 122 CollationWeights::initForSecondary() { |
123 // We use only the lower 16 bits for secondary weights. | 123 // We use only the lower 16 bits for secondary weights. |
124 middleLength=3; | 124 middleLength=3; |
125 minBytes[1] = 0; | 125 minBytes[1] = 0; |
126 maxBytes[1] = 0; | 126 maxBytes[1] = 0; |
127 minBytes[2] = 0; | 127 minBytes[2] = 0; |
128 maxBytes[2] = 0; | 128 maxBytes[2] = 0; |
129 minBytes[3] = Collation::MERGE_SEPARATOR_BYTE + 1; | 129 minBytes[3] = Collation::LEVEL_SEPARATOR_BYTE + 1; |
130 maxBytes[3] = 0xff; | 130 maxBytes[3] = 0xff; |
131 minBytes[4] = 2; | 131 minBytes[4] = 2; |
132 maxBytes[4] = 0xff; | 132 maxBytes[4] = 0xff; |
133 } | 133 } |
134 | 134 |
135 void | 135 void |
136 CollationWeights::initForTertiary() { | 136 CollationWeights::initForTertiary() { |
137 // We use only the lower 16 bits for tertiary weights. | 137 // We use only the lower 16 bits for tertiary weights. |
138 middleLength=3; | 138 middleLength=3; |
139 minBytes[1] = 0; | 139 minBytes[1] = 0; |
140 maxBytes[1] = 0; | 140 maxBytes[1] = 0; |
141 minBytes[2] = 0; | 141 minBytes[2] = 0; |
142 maxBytes[2] = 0; | 142 maxBytes[2] = 0; |
143 // We use only 6 bits per byte. | 143 // We use only 6 bits per byte. |
144 // The other bits are used for case & quaternary weights. | 144 // The other bits are used for case & quaternary weights. |
145 minBytes[3] = Collation::MERGE_SEPARATOR_BYTE + 1; | 145 minBytes[3] = Collation::LEVEL_SEPARATOR_BYTE + 1; |
146 maxBytes[3] = 0x3f; | 146 maxBytes[3] = 0x3f; |
147 minBytes[4] = 2; | 147 minBytes[4] = 2; |
148 maxBytes[4] = 0x3f; | 148 maxBytes[4] = 0x3f; |
149 } | 149 } |
150 | 150 |
151 uint32_t | 151 uint32_t |
152 CollationWeights::incWeight(uint32_t weight, int32_t length) const { | 152 CollationWeights::incWeight(uint32_t weight, int32_t length) const { |
153 for(;;) { | 153 for(;;) { |
154 uint32_t byte=getWeightByte(weight, length); | 154 uint32_t byte=getWeightByte(weight, length); |
155 if(byte<maxBytes[length]) { | 155 if(byte<maxBytes[length]) { |
(...skipping 133 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
289 weight=truncateWeight(weight, length-1); | 289 weight=truncateWeight(weight, length-1); |
290 } | 290 } |
291 middle.end=decWeightTrail(weight, middleLength); | 291 middle.end=decWeightTrail(weight, middleLength); |
292 | 292 |
293 /* set the middle range */ | 293 /* set the middle range */ |
294 middle.length=middleLength; | 294 middle.length=middleLength; |
295 if(middle.end>=middle.start) { | 295 if(middle.end>=middle.start) { |
296 middle.count=(int32_t)((middle.end-middle.start)>>(8*(4-middleLength)))+
1; | 296 middle.count=(int32_t)((middle.end-middle.start)>>(8*(4-middleLength)))+
1; |
297 } else { | 297 } else { |
298 /* no middle range, eliminate overlaps */ | 298 /* no middle range, eliminate overlaps */ |
299 | |
300 /* reduce or remove the lower ranges that go beyond upperLimit */ | |
301 for(int32_t length=4; length>middleLength; --length) { | 299 for(int32_t length=4; length>middleLength; --length) { |
302 if(lower[length].count>0 && upper[length].count>0) { | 300 if(lower[length].count>0 && upper[length].count>0) { |
303 uint32_t start=upper[length].start; | 301 // Note: The lowerEnd and upperStart weights are versions of |
304 uint32_t end=lower[length].end; | 302 // lowerLimit and upperLimit (which are lowerLimit<upperLimit), |
| 303 // truncated (still less-or-equal) |
| 304 // and then with their last bytes changed to the |
| 305 // maxByte (for lowerEnd) or minByte (for upperStart). |
| 306 const uint32_t lowerEnd=lower[length].end; |
| 307 const uint32_t upperStart=upper[length].start; |
| 308 UBool merged=FALSE; |
305 | 309 |
306 if(end>=start || incWeight(end, length)==start) { | 310 if(lowerEnd>upperStart) { |
307 /* lower and upper ranges collide or are directly adjacent:
merge these two and remove all shorter ranges */ | 311 // These two lower and upper ranges collide. |
308 start=lower[length].start; | 312 // Since lowerLimit<upperLimit and lowerEnd and upperStart |
309 end=lower[length].end=upper[length].end; | 313 // are versions with only their last bytes modified |
310 /* | 314 // (and following ones removed/reset to 0), |
311 * merging directly adjacent ranges needs to subtract the 0/
1 gaps in between; | 315 // lowerEnd>upperStart is only possible |
312 * it may result in a range with count>countBytes | 316 // if the leading bytes are equal |
313 */ | 317 // and lastByte(lowerEnd)>lastByte(upperStart). |
| 318 U_ASSERT(truncateWeight(lowerEnd, length-1)== |
| 319 truncateWeight(upperStart, length-1)); |
| 320 // Intersect these two ranges. |
| 321 lower[length].end=upper[length].end; |
314 lower[length].count= | 322 lower[length].count= |
315 (int32_t)(getWeightTrail(end, length)-getWeightTrail(sta
rt, length)+1+ | 323 (int32_t)getWeightTrail(lower[length].end, length)- |
316 countBytes(length)*(getWeightByte(end, length-
1)-getWeightByte(start, length-1))); | 324 (int32_t)getWeightTrail(lower[length].start, length)
+1; |
| 325 // count might be <=0 in which case there is no room, |
| 326 // and the range-collecting code below will ignore this rang
e. |
| 327 merged=TRUE; |
| 328 } else if(lowerEnd==upperStart) { |
| 329 // Not possible, unless minByte==maxByte which is not allowe
d. |
| 330 U_ASSERT(minBytes[length]<maxBytes[length]); |
| 331 } else /* lowerEnd<upperStart */ { |
| 332 if(incWeight(lowerEnd, length)==upperStart) { |
| 333 // Merge adjacent ranges. |
| 334 lower[length].end=upper[length].end; |
| 335 lower[length].count+=upper[length].count; // might be >
countBytes |
| 336 merged=TRUE; |
| 337 } |
| 338 } |
| 339 if(merged) { |
| 340 // Remove all shorter ranges. |
| 341 // There was no room available for them between the ranges w
e just merged. |
317 upper[length].count=0; | 342 upper[length].count=0; |
318 while(--length>middleLength) { | 343 while(--length>middleLength) { |
319 lower[length].count=upper[length].count=0; | 344 lower[length].count=upper[length].count=0; |
320 } | 345 } |
321 break; | 346 break; |
322 } | 347 } |
323 } | 348 } |
324 } | 349 } |
325 } | 350 } |
326 | 351 |
(...skipping 207 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
534 U_ASSERT(range.start <= range.end); | 559 U_ASSERT(range.start <= range.end); |
535 } | 560 } |
536 | 561 |
537 return weight; | 562 return weight; |
538 } | 563 } |
539 } | 564 } |
540 | 565 |
541 U_NAMESPACE_END | 566 U_NAMESPACE_END |
542 | 567 |
543 #endif /* #if !UCONFIG_NO_COLLATION */ | 568 #endif /* #if !UCONFIG_NO_COLLATION */ |
OLD | NEW |