OLD | NEW |
1 /* | 1 /* |
2 ******************************************************************************* | 2 ******************************************************************************* |
3 * Copyright (C) 2012-2014, International Business Machines | 3 * Copyright (C) 2012-2015, International Business Machines |
4 * Corporation and others. All Rights Reserved. | 4 * Corporation and others. All Rights Reserved. |
5 ******************************************************************************* | 5 ******************************************************************************* |
6 * collationdata.cpp | 6 * collationdata.cpp |
7 * | 7 * |
8 * created on: 2012jul28 | 8 * created on: 2012jul28 |
9 * created by: Markus W. Scherer | 9 * created by: Markus W. Scherer |
10 */ | 10 */ |
11 | 11 |
12 #include "unicode/utypes.h" | 12 #include "unicode/utypes.h" |
13 | 13 |
14 #if !UCONFIG_NO_COLLATION | 14 #if !UCONFIG_NO_COLLATION |
15 | 15 |
16 #include "unicode/ucol.h" | 16 #include "unicode/ucol.h" |
17 #include "unicode/udata.h" | 17 #include "unicode/udata.h" |
18 #include "unicode/uscript.h" | 18 #include "unicode/uscript.h" |
19 #include "cmemory.h" | 19 #include "cmemory.h" |
20 #include "collation.h" | 20 #include "collation.h" |
21 #include "collationdata.h" | 21 #include "collationdata.h" |
22 #include "uassert.h" | 22 #include "uassert.h" |
23 #include "utrie2.h" | 23 #include "utrie2.h" |
| 24 #include "uvectr32.h" |
24 | 25 |
25 U_NAMESPACE_BEGIN | 26 U_NAMESPACE_BEGIN |
26 | 27 |
27 uint32_t | 28 uint32_t |
28 CollationData::getIndirectCE32(uint32_t ce32) const { | 29 CollationData::getIndirectCE32(uint32_t ce32) const { |
29 U_ASSERT(Collation::isSpecialCE32(ce32)); | 30 U_ASSERT(Collation::isSpecialCE32(ce32)); |
30 int32_t tag = Collation::tagFromCE32(ce32); | 31 int32_t tag = Collation::tagFromCE32(ce32); |
31 if(tag == Collation::DIGIT_TAG) { | 32 if(tag == Collation::DIGIT_TAG) { |
32 // Fetch the non-numeric-collation CE32. | 33 // Fetch the non-numeric-collation CE32. |
33 ce32 = ce32s[Collation::indexFromCE32(ce32)]; | 34 ce32 = ce32s[Collation::indexFromCE32(ce32)]; |
(...skipping 73 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
107 return d->getCEFromOffsetCE32(c, ce32); | 108 return d->getCEFromOffsetCE32(c, ce32); |
108 case Collation::IMPLICIT_TAG: | 109 case Collation::IMPLICIT_TAG: |
109 return Collation::unassignedCEFromCodePoint(c); | 110 return Collation::unassignedCEFromCodePoint(c); |
110 } | 111 } |
111 } | 112 } |
112 return Collation::ceFromSimpleCE32(ce32); | 113 return Collation::ceFromSimpleCE32(ce32); |
113 } | 114 } |
114 | 115 |
115 uint32_t | 116 uint32_t |
116 CollationData::getFirstPrimaryForGroup(int32_t script) const { | 117 CollationData::getFirstPrimaryForGroup(int32_t script) const { |
117 int32_t index = findScript(script); | 118 int32_t index = getScriptIndex(script); |
118 if(index < 0) { | 119 return index == 0 ? 0 : (uint32_t)scriptStarts[index] << 16; |
119 return 0; | |
120 } | |
121 uint32_t head = scripts[index]; | |
122 return (head & 0xff00) << 16; | |
123 } | 120 } |
124 | 121 |
125 uint32_t | 122 uint32_t |
126 CollationData::getLastPrimaryForGroup(int32_t script) const { | 123 CollationData::getLastPrimaryForGroup(int32_t script) const { |
127 int32_t index = findScript(script); | 124 int32_t index = getScriptIndex(script); |
128 if(index < 0) { | 125 if(index == 0) { |
129 return 0; | 126 return 0; |
130 } | 127 } |
131 uint32_t head = scripts[index]; | 128 uint32_t limit = scriptStarts[index + 1]; |
132 uint32_t lastByte = head & 0xff; | 129 return (limit << 16) - 1; |
133 return ((lastByte + 1) << 24) - 1; | |
134 } | 130 } |
135 | 131 |
136 int32_t | 132 int32_t |
137 CollationData::getGroupForPrimary(uint32_t p) const { | 133 CollationData::getGroupForPrimary(uint32_t p) const { |
138 p >>= 24; // Reordering groups are distinguished by primary lead bytes. | 134 p >>= 16; |
139 for(int32_t i = 0; i < scriptsLength; i = i + 2 + scripts[i + 1]) { | 135 if(p < scriptStarts[1] || scriptStarts[scriptStartsLength - 1] <= p) { |
140 uint32_t lastByte = scripts[i] & 0xff; | 136 return -1; |
141 if(p <= lastByte) { | 137 } |
142 return scripts[i + 2]; | 138 int32_t index = 1; |
| 139 while(p >= scriptStarts[index + 1]) { ++index; } |
| 140 for(int32_t i = 0; i < numScripts; ++i) { |
| 141 if(scriptsIndex[i] == index) { |
| 142 return i; |
| 143 } |
| 144 } |
| 145 for(int32_t i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) { |
| 146 if(scriptsIndex[numScripts + i] == index) { |
| 147 return UCOL_REORDER_CODE_FIRST + i; |
143 } | 148 } |
144 } | 149 } |
145 return -1; | 150 return -1; |
146 } | 151 } |
147 | 152 |
148 int32_t | 153 int32_t |
149 CollationData::findScript(int32_t script) const { | 154 CollationData::getScriptIndex(int32_t script) const { |
150 if(script < 0 || 0xffff < script) { return -1; } | 155 if(script < 0) { |
151 for(int32_t i = 0; i < scriptsLength;) { | 156 return 0; |
152 int32_t limit = i + 2 + scripts[i + 1]; | 157 } else if(script < numScripts) { |
153 for(int32_t j = i + 2; j < limit; ++j) { | 158 return scriptsIndex[script]; |
154 if(script == scripts[j]) { return i; } | 159 } else if(script < UCOL_REORDER_CODE_FIRST) { |
| 160 return 0; |
| 161 } else { |
| 162 script -= UCOL_REORDER_CODE_FIRST; |
| 163 if(script < MAX_NUM_SPECIAL_REORDER_CODES) { |
| 164 return scriptsIndex[numScripts + script]; |
| 165 } else { |
| 166 return 0; |
155 } | 167 } |
156 i = limit; | |
157 } | 168 } |
158 return -1; | |
159 } | 169 } |
160 | 170 |
161 int32_t | 171 int32_t |
162 CollationData::getEquivalentScripts(int32_t script, | 172 CollationData::getEquivalentScripts(int32_t script, |
163 int32_t dest[], int32_t capacity, | 173 int32_t dest[], int32_t capacity, |
164 UErrorCode &errorCode) const { | 174 UErrorCode &errorCode) const { |
165 if(U_FAILURE(errorCode)) { return 0; } | 175 if(U_FAILURE(errorCode)) { return 0; } |
166 int32_t i = findScript(script); | 176 int32_t index = getScriptIndex(script); |
167 if(i < 0) { return 0; } | 177 if(index == 0) { return 0; } |
168 int32_t length = scripts[i + 1]; | 178 if(script >= UCOL_REORDER_CODE_FIRST) { |
169 U_ASSERT(length != 0); | 179 // Special groups have no aliases. |
| 180 if(capacity > 0) { |
| 181 dest[0] = script; |
| 182 } else { |
| 183 errorCode = U_BUFFER_OVERFLOW_ERROR; |
| 184 } |
| 185 return 1; |
| 186 } |
| 187 |
| 188 int32_t length = 0; |
| 189 for(int32_t i = 0; i < numScripts; ++i) { |
| 190 if(scriptsIndex[i] == index) { |
| 191 if(length < capacity) { |
| 192 dest[length] = i; |
| 193 } |
| 194 ++length; |
| 195 } |
| 196 } |
170 if(length > capacity) { | 197 if(length > capacity) { |
171 errorCode = U_BUFFER_OVERFLOW_ERROR; | 198 errorCode = U_BUFFER_OVERFLOW_ERROR; |
172 return length; | |
173 } | |
174 i += 2; | |
175 dest[0] = scripts[i++]; | |
176 for(int32_t j = 1; j < length; ++j) { | |
177 script = scripts[i++]; | |
178 // Sorted insertion. | |
179 for(int32_t k = j;; --k) { | |
180 // Invariant: dest[k] is free to receive either script or dest[k - 1
]. | |
181 if(k > 0 && script < dest[k - 1]) { | |
182 dest[k] = dest[k - 1]; | |
183 } else { | |
184 dest[k] = script; | |
185 break; | |
186 } | |
187 } | |
188 } | 199 } |
189 return length; | 200 return length; |
190 } | 201 } |
191 | 202 |
192 void | 203 void |
193 CollationData::makeReorderTable(const int32_t *reorder, int32_t length, | 204 CollationData::makeReorderRanges(const int32_t *reorder, int32_t length, |
194 uint8_t table[256], UErrorCode &errorCode) const
{ | 205 UVector32 &ranges, UErrorCode &errorCode) const
{ |
| 206 makeReorderRanges(reorder, length, FALSE, ranges, errorCode); |
| 207 } |
| 208 |
| 209 void |
| 210 CollationData::makeReorderRanges(const int32_t *reorder, int32_t length, |
| 211 UBool latinMustMove, |
| 212 UVector32 &ranges, UErrorCode &errorCode) const
{ |
195 if(U_FAILURE(errorCode)) { return; } | 213 if(U_FAILURE(errorCode)) { return; } |
196 | 214 ranges.removeAllElements(); |
197 // Initialize the table. | 215 if(length == 0 || (length == 1 && reorder[0] == USCRIPT_UNKNOWN)) { |
198 // Never reorder special low and high primary lead bytes. | 216 return; |
199 int32_t lowByte; | |
200 for(lowByte = 0; lowByte <= Collation::MERGE_SEPARATOR_BYTE; ++lowByte) { | |
201 table[lowByte] = lowByte; | |
202 } | |
203 // lowByte == 03 | |
204 | |
205 int32_t highByte; | |
206 for(highByte = 0xff; highByte >= Collation::TRAIL_WEIGHT_BYTE; --highByte) { | |
207 table[highByte] = highByte; | |
208 } | |
209 // highByte == FE | |
210 | |
211 // Set intermediate bytes to 0 to indicate that they have not been set yet. | |
212 for(int32_t i = lowByte; i <= highByte; ++i) { | |
213 table[i] = 0; | |
214 } | 217 } |
215 | 218 |
| 219 // Maps each script-or-group range to a new lead byte. |
| 220 uint8_t table[MAX_NUM_SCRIPT_RANGES]; |
| 221 uprv_memset(table, 0, sizeof(table)); |
| 222 |
| 223 { |
| 224 // Set "don't care" values for reserved ranges. |
| 225 int32_t index = scriptsIndex[ |
| 226 numScripts + REORDER_RESERVED_BEFORE_LATIN - UCOL_REORDER_CODE_F
IRST]; |
| 227 if(index != 0) { |
| 228 table[index] = 0xff; |
| 229 } |
| 230 index = scriptsIndex[ |
| 231 numScripts + REORDER_RESERVED_AFTER_LATIN - UCOL_REORDER_CODE_FI
RST]; |
| 232 if(index != 0) { |
| 233 table[index] = 0xff; |
| 234 } |
| 235 } |
| 236 |
| 237 // Never reorder special low and high primary lead bytes. |
| 238 U_ASSERT(scriptStartsLength >= 2); |
| 239 U_ASSERT(scriptStarts[0] == 0); |
| 240 int32_t lowStart = scriptStarts[1]; |
| 241 U_ASSERT(lowStart == ((Collation::MERGE_SEPARATOR_BYTE + 1) << 8)); |
| 242 int32_t highLimit = scriptStarts[scriptStartsLength - 1]; |
| 243 U_ASSERT(highLimit == (Collation::TRAIL_WEIGHT_BYTE << 8)); |
| 244 |
216 // Get the set of special reorder codes in the input list. | 245 // Get the set of special reorder codes in the input list. |
217 // This supports up to 32 special reorder codes; | 246 // This supports a fixed number of special reorder codes; |
218 // it works for data with codes beyond UCOL_REORDER_CODE_LIMIT. | 247 // it works for data with codes beyond UCOL_REORDER_CODE_LIMIT. |
219 uint32_t specials = 0; | 248 uint32_t specials = 0; |
220 for(int32_t i = 0; i < length; ++i) { | 249 for(int32_t i = 0; i < length; ++i) { |
221 int32_t reorderCode = reorder[i] - UCOL_REORDER_CODE_FIRST; | 250 int32_t reorderCode = reorder[i] - UCOL_REORDER_CODE_FIRST; |
222 if(0 <= reorderCode && reorderCode <= 31) { | 251 if(0 <= reorderCode && reorderCode < MAX_NUM_SPECIAL_REORDER_CODES) { |
223 specials |= (uint32_t)1 << reorderCode; | 252 specials |= (uint32_t)1 << reorderCode; |
224 } | 253 } |
225 } | 254 } |
226 | 255 |
227 // Start the reordering with the special low reorder codes that do not occur
in the input. | 256 // Start the reordering with the special low reorder codes that do not occur
in the input. |
228 for(int32_t i = 0;; i += 3) { | 257 for(int32_t i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) { |
229 if(scripts[i + 1] != 1) { break; } // Went beyond special single-code r
eorder codes. | 258 int32_t index = scriptsIndex[numScripts + i]; |
230 int32_t reorderCode = (int32_t)scripts[i + 2] - UCOL_REORDER_CODE_FIRST; | 259 if(index != 0 && (specials & ((uint32_t)1 << i)) == 0) { |
231 if(reorderCode < 0) { break; } // Went beyond special reorder codes. | 260 lowStart = addLowScriptRange(table, index, lowStart); |
232 if((specials & ((uint32_t)1 << reorderCode)) == 0) { | |
233 int32_t head = scripts[i]; | |
234 int32_t firstByte = head >> 8; | |
235 int32_t lastByte = head & 0xff; | |
236 do { table[firstByte++] = lowByte++; } while(firstByte <= lastByte); | |
237 } | 261 } |
238 } | 262 } |
239 | 263 |
240 // Reorder according to the input scripts, continuing from the bottom of the
bytes range. | 264 // Skip the reserved range before Latin if Latin is the first script, |
| 265 // so that we do not move it unnecessarily. |
| 266 int32_t skippedReserved = 0; |
| 267 if(specials == 0 && reorder[0] == USCRIPT_LATIN && !latinMustMove) { |
| 268 int32_t index = scriptsIndex[USCRIPT_LATIN]; |
| 269 U_ASSERT(index != 0); |
| 270 int32_t start = scriptStarts[index]; |
| 271 U_ASSERT(lowStart <= start); |
| 272 skippedReserved = start - lowStart; |
| 273 lowStart = start; |
| 274 } |
| 275 |
| 276 // Reorder according to the input scripts, continuing from the bottom of the
primary range. |
| 277 int32_t originalLength = length; // length will be decremented if "others"
is in the list. |
| 278 UBool hasReorderToEnd = FALSE; |
241 for(int32_t i = 0; i < length;) { | 279 for(int32_t i = 0; i < length;) { |
242 int32_t script = reorder[i++]; | 280 int32_t script = reorder[i++]; |
243 if(script == USCRIPT_UNKNOWN) { | 281 if(script == USCRIPT_UNKNOWN) { |
244 // Put the remaining scripts at the top. | 282 // Put the remaining scripts at the top. |
| 283 hasReorderToEnd = TRUE; |
245 while(i < length) { | 284 while(i < length) { |
246 script = reorder[--length]; | 285 script = reorder[--length]; |
247 if(script == USCRIPT_UNKNOWN || // Must occur at most once. | 286 if(script == USCRIPT_UNKNOWN || // Must occur at most once. |
248 script == UCOL_REORDER_CODE_DEFAULT) { | 287 script == UCOL_REORDER_CODE_DEFAULT) { |
249 errorCode = U_ILLEGAL_ARGUMENT_ERROR; | 288 errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
250 return; | 289 return; |
251 } | 290 } |
252 int32_t index = findScript(script); | 291 int32_t index = getScriptIndex(script); |
253 if(index < 0) { continue; } | 292 if(index == 0) { continue; } |
254 int32_t head = scripts[index]; | 293 if(table[index] != 0) { // Duplicate or equivalent script. |
255 int32_t firstByte = head >> 8; | |
256 int32_t lastByte = head & 0xff; | |
257 if(table[firstByte] != 0) { // Duplicate or equivalent script. | |
258 errorCode = U_ILLEGAL_ARGUMENT_ERROR; | 294 errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
259 return; | 295 return; |
260 } | 296 } |
261 do { table[lastByte--] = highByte--; } while(firstByte <= lastBy
te); | 297 highLimit = addHighScriptRange(table, index, highLimit); |
262 } | 298 } |
263 break; | 299 break; |
264 } | 300 } |
265 if(script == UCOL_REORDER_CODE_DEFAULT) { | 301 if(script == UCOL_REORDER_CODE_DEFAULT) { |
266 // The default code must be the only one in the list, and that is ha
ndled by the caller. | 302 // The default code must be the only one in the list, and that is ha
ndled by the caller. |
267 // Otherwise it must not be used. | 303 // Otherwise it must not be used. |
268 errorCode = U_ILLEGAL_ARGUMENT_ERROR; | 304 errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
269 return; | 305 return; |
270 } | 306 } |
271 int32_t index = findScript(script); | 307 int32_t index = getScriptIndex(script); |
272 if(index < 0) { continue; } | 308 if(index == 0) { continue; } |
273 int32_t head = scripts[index]; | 309 if(table[index] != 0) { // Duplicate or equivalent script. |
274 int32_t firstByte = head >> 8; | |
275 int32_t lastByte = head & 0xff; | |
276 if(table[firstByte] != 0) { // Duplicate or equivalent script. | |
277 errorCode = U_ILLEGAL_ARGUMENT_ERROR; | 310 errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
278 return; | 311 return; |
279 } | 312 } |
280 do { table[firstByte++] = lowByte++; } while(firstByte <= lastByte); | 313 lowStart = addLowScriptRange(table, index, lowStart); |
281 } | 314 } |
282 | 315 |
283 // Put all remaining scripts into the middle. | 316 // Put all remaining scripts into the middle. |
284 // Avoid table[0] which must remain 0. | 317 for(int32_t i = 1; i < scriptStartsLength - 1; ++i) { |
285 for(int32_t i = 1; i <= 0xff; ++i) { | 318 int32_t leadByte = table[i]; |
286 if(table[i] == 0) { table[i] = lowByte++; } | 319 if(leadByte != 0) { continue; } |
| 320 int32_t start = scriptStarts[i]; |
| 321 if(!hasReorderToEnd && start > lowStart) { |
| 322 // No need to move this script. |
| 323 lowStart = start; |
| 324 } |
| 325 lowStart = addLowScriptRange(table, i, lowStart); |
287 } | 326 } |
288 U_ASSERT(lowByte == highByte + 1); | 327 if(lowStart > highLimit) { |
| 328 if((lowStart - (skippedReserved & 0xff00)) <= highLimit) { |
| 329 // Try not skipping the before-Latin reserved range. |
| 330 makeReorderRanges(reorder, originalLength, TRUE, ranges, errorCode); |
| 331 return; |
| 332 } |
| 333 // We need more primary lead bytes than available, despite the reserved
ranges. |
| 334 errorCode = U_BUFFER_OVERFLOW_ERROR; |
| 335 return; |
| 336 } |
| 337 |
| 338 // Turn lead bytes into a list of (limit, offset) pairs. |
| 339 // Encode each pair in one list element: |
| 340 // Upper 16 bits = limit, lower 16 = signed lead byte offset. |
| 341 int32_t offset = 0; |
| 342 for(int32_t i = 1;; ++i) { |
| 343 int32_t nextOffset = offset; |
| 344 while(i < scriptStartsLength - 1) { |
| 345 int32_t newLeadByte = table[i]; |
| 346 if(newLeadByte == 0xff) { |
| 347 // "Don't care" lead byte for reserved range, continue with curr
ent offset. |
| 348 } else { |
| 349 nextOffset = newLeadByte - (scriptStarts[i] >> 8); |
| 350 if(nextOffset != offset) { break; } |
| 351 } |
| 352 ++i; |
| 353 } |
| 354 if(offset != 0 || i < scriptStartsLength - 1) { |
| 355 ranges.addElement(((int32_t)scriptStarts[i] << 16) | (offset & 0xfff
f), errorCode); |
| 356 } |
| 357 if(i == scriptStartsLength - 1) { break; } |
| 358 offset = nextOffset; |
| 359 } |
| 360 } |
| 361 |
| 362 int32_t |
| 363 CollationData::addLowScriptRange(uint8_t table[], int32_t index, int32_t lowStar
t) const { |
| 364 int32_t start = scriptStarts[index]; |
| 365 if((start & 0xff) < (lowStart & 0xff)) { |
| 366 lowStart += 0x100; |
| 367 } |
| 368 table[index] = (uint8_t)(lowStart >> 8); |
| 369 int32_t limit = scriptStarts[index + 1]; |
| 370 lowStart = ((lowStart & 0xff00) + ((limit & 0xff00) - (start & 0xff00))) | (
limit & 0xff); |
| 371 return lowStart; |
| 372 } |
| 373 |
| 374 int32_t |
| 375 CollationData::addHighScriptRange(uint8_t table[], int32_t index, int32_t highLi
mit) const { |
| 376 int32_t limit = scriptStarts[index + 1]; |
| 377 if((limit & 0xff) > (highLimit & 0xff)) { |
| 378 highLimit -= 0x100; |
| 379 } |
| 380 int32_t start = scriptStarts[index]; |
| 381 highLimit = ((highLimit & 0xff00) - ((limit & 0xff00) - (start & 0xff00))) |
(start & 0xff); |
| 382 table[index] = (uint8_t)(highLimit >> 8); |
| 383 return highLimit; |
289 } | 384 } |
290 | 385 |
291 U_NAMESPACE_END | 386 U_NAMESPACE_END |
292 | 387 |
293 #endif // !UCONFIG_NO_COLLATION | 388 #endif // !UCONFIG_NO_COLLATION |
OLD | NEW |