Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(741)

Side by Side Diff: source/i18n/collationdata.cpp

Issue 1621843002: ICU 56 update step 1 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/icu.git@561
Patch Set: Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « source/i18n/collationdata.h ('k') | source/i18n/collationdatabuilder.cpp » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 ******************************************************************************* 2 *******************************************************************************
3 * Copyright (C) 2012-2014, International Business Machines 3 * Copyright (C) 2012-2015, International Business Machines
4 * Corporation and others. All Rights Reserved. 4 * Corporation and others. All Rights Reserved.
5 ******************************************************************************* 5 *******************************************************************************
6 * collationdata.cpp 6 * collationdata.cpp
7 * 7 *
8 * created on: 2012jul28 8 * created on: 2012jul28
9 * created by: Markus W. Scherer 9 * created by: Markus W. Scherer
10 */ 10 */
11 11
12 #include "unicode/utypes.h" 12 #include "unicode/utypes.h"
13 13
14 #if !UCONFIG_NO_COLLATION 14 #if !UCONFIG_NO_COLLATION
15 15
16 #include "unicode/ucol.h" 16 #include "unicode/ucol.h"
17 #include "unicode/udata.h" 17 #include "unicode/udata.h"
18 #include "unicode/uscript.h" 18 #include "unicode/uscript.h"
19 #include "cmemory.h" 19 #include "cmemory.h"
20 #include "collation.h" 20 #include "collation.h"
21 #include "collationdata.h" 21 #include "collationdata.h"
22 #include "uassert.h" 22 #include "uassert.h"
23 #include "utrie2.h" 23 #include "utrie2.h"
24 #include "uvectr32.h"
24 25
25 U_NAMESPACE_BEGIN 26 U_NAMESPACE_BEGIN
26 27
27 uint32_t 28 uint32_t
28 CollationData::getIndirectCE32(uint32_t ce32) const { 29 CollationData::getIndirectCE32(uint32_t ce32) const {
29 U_ASSERT(Collation::isSpecialCE32(ce32)); 30 U_ASSERT(Collation::isSpecialCE32(ce32));
30 int32_t tag = Collation::tagFromCE32(ce32); 31 int32_t tag = Collation::tagFromCE32(ce32);
31 if(tag == Collation::DIGIT_TAG) { 32 if(tag == Collation::DIGIT_TAG) {
32 // Fetch the non-numeric-collation CE32. 33 // Fetch the non-numeric-collation CE32.
33 ce32 = ce32s[Collation::indexFromCE32(ce32)]; 34 ce32 = ce32s[Collation::indexFromCE32(ce32)];
(...skipping 73 matching lines...) Expand 10 before | Expand all | Expand 10 after
107 return d->getCEFromOffsetCE32(c, ce32); 108 return d->getCEFromOffsetCE32(c, ce32);
108 case Collation::IMPLICIT_TAG: 109 case Collation::IMPLICIT_TAG:
109 return Collation::unassignedCEFromCodePoint(c); 110 return Collation::unassignedCEFromCodePoint(c);
110 } 111 }
111 } 112 }
112 return Collation::ceFromSimpleCE32(ce32); 113 return Collation::ceFromSimpleCE32(ce32);
113 } 114 }
114 115
115 uint32_t 116 uint32_t
116 CollationData::getFirstPrimaryForGroup(int32_t script) const { 117 CollationData::getFirstPrimaryForGroup(int32_t script) const {
117 int32_t index = findScript(script); 118 int32_t index = getScriptIndex(script);
118 if(index < 0) { 119 return index == 0 ? 0 : (uint32_t)scriptStarts[index] << 16;
119 return 0;
120 }
121 uint32_t head = scripts[index];
122 return (head & 0xff00) << 16;
123 } 120 }
124 121
125 uint32_t 122 uint32_t
126 CollationData::getLastPrimaryForGroup(int32_t script) const { 123 CollationData::getLastPrimaryForGroup(int32_t script) const {
127 int32_t index = findScript(script); 124 int32_t index = getScriptIndex(script);
128 if(index < 0) { 125 if(index == 0) {
129 return 0; 126 return 0;
130 } 127 }
131 uint32_t head = scripts[index]; 128 uint32_t limit = scriptStarts[index + 1];
132 uint32_t lastByte = head & 0xff; 129 return (limit << 16) - 1;
133 return ((lastByte + 1) << 24) - 1;
134 } 130 }
135 131
136 int32_t 132 int32_t
137 CollationData::getGroupForPrimary(uint32_t p) const { 133 CollationData::getGroupForPrimary(uint32_t p) const {
138 p >>= 24; // Reordering groups are distinguished by primary lead bytes. 134 p >>= 16;
139 for(int32_t i = 0; i < scriptsLength; i = i + 2 + scripts[i + 1]) { 135 if(p < scriptStarts[1] || scriptStarts[scriptStartsLength - 1] <= p) {
140 uint32_t lastByte = scripts[i] & 0xff; 136 return -1;
141 if(p <= lastByte) { 137 }
142 return scripts[i + 2]; 138 int32_t index = 1;
139 while(p >= scriptStarts[index + 1]) { ++index; }
140 for(int32_t i = 0; i < numScripts; ++i) {
141 if(scriptsIndex[i] == index) {
142 return i;
143 }
144 }
145 for(int32_t i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
146 if(scriptsIndex[numScripts + i] == index) {
147 return UCOL_REORDER_CODE_FIRST + i;
143 } 148 }
144 } 149 }
145 return -1; 150 return -1;
146 } 151 }
147 152
148 int32_t 153 int32_t
149 CollationData::findScript(int32_t script) const { 154 CollationData::getScriptIndex(int32_t script) const {
150 if(script < 0 || 0xffff < script) { return -1; } 155 if(script < 0) {
151 for(int32_t i = 0; i < scriptsLength;) { 156 return 0;
152 int32_t limit = i + 2 + scripts[i + 1]; 157 } else if(script < numScripts) {
153 for(int32_t j = i + 2; j < limit; ++j) { 158 return scriptsIndex[script];
154 if(script == scripts[j]) { return i; } 159 } else if(script < UCOL_REORDER_CODE_FIRST) {
160 return 0;
161 } else {
162 script -= UCOL_REORDER_CODE_FIRST;
163 if(script < MAX_NUM_SPECIAL_REORDER_CODES) {
164 return scriptsIndex[numScripts + script];
165 } else {
166 return 0;
155 } 167 }
156 i = limit;
157 } 168 }
158 return -1;
159 } 169 }
160 170
161 int32_t 171 int32_t
162 CollationData::getEquivalentScripts(int32_t script, 172 CollationData::getEquivalentScripts(int32_t script,
163 int32_t dest[], int32_t capacity, 173 int32_t dest[], int32_t capacity,
164 UErrorCode &errorCode) const { 174 UErrorCode &errorCode) const {
165 if(U_FAILURE(errorCode)) { return 0; } 175 if(U_FAILURE(errorCode)) { return 0; }
166 int32_t i = findScript(script); 176 int32_t index = getScriptIndex(script);
167 if(i < 0) { return 0; } 177 if(index == 0) { return 0; }
168 int32_t length = scripts[i + 1]; 178 if(script >= UCOL_REORDER_CODE_FIRST) {
169 U_ASSERT(length != 0); 179 // Special groups have no aliases.
180 if(capacity > 0) {
181 dest[0] = script;
182 } else {
183 errorCode = U_BUFFER_OVERFLOW_ERROR;
184 }
185 return 1;
186 }
187
188 int32_t length = 0;
189 for(int32_t i = 0; i < numScripts; ++i) {
190 if(scriptsIndex[i] == index) {
191 if(length < capacity) {
192 dest[length] = i;
193 }
194 ++length;
195 }
196 }
170 if(length > capacity) { 197 if(length > capacity) {
171 errorCode = U_BUFFER_OVERFLOW_ERROR; 198 errorCode = U_BUFFER_OVERFLOW_ERROR;
172 return length;
173 }
174 i += 2;
175 dest[0] = scripts[i++];
176 for(int32_t j = 1; j < length; ++j) {
177 script = scripts[i++];
178 // Sorted insertion.
179 for(int32_t k = j;; --k) {
180 // Invariant: dest[k] is free to receive either script or dest[k - 1 ].
181 if(k > 0 && script < dest[k - 1]) {
182 dest[k] = dest[k - 1];
183 } else {
184 dest[k] = script;
185 break;
186 }
187 }
188 } 199 }
189 return length; 200 return length;
190 } 201 }
191 202
192 void 203 void
193 CollationData::makeReorderTable(const int32_t *reorder, int32_t length, 204 CollationData::makeReorderRanges(const int32_t *reorder, int32_t length,
194 uint8_t table[256], UErrorCode &errorCode) const { 205 UVector32 &ranges, UErrorCode &errorCode) const {
206 makeReorderRanges(reorder, length, FALSE, ranges, errorCode);
207 }
208
209 void
210 CollationData::makeReorderRanges(const int32_t *reorder, int32_t length,
211 UBool latinMustMove,
212 UVector32 &ranges, UErrorCode &errorCode) const {
195 if(U_FAILURE(errorCode)) { return; } 213 if(U_FAILURE(errorCode)) { return; }
196 214 ranges.removeAllElements();
197 // Initialize the table. 215 if(length == 0 || (length == 1 && reorder[0] == USCRIPT_UNKNOWN)) {
198 // Never reorder special low and high primary lead bytes. 216 return;
199 int32_t lowByte;
200 for(lowByte = 0; lowByte <= Collation::MERGE_SEPARATOR_BYTE; ++lowByte) {
201 table[lowByte] = lowByte;
202 }
203 // lowByte == 03
204
205 int32_t highByte;
206 for(highByte = 0xff; highByte >= Collation::TRAIL_WEIGHT_BYTE; --highByte) {
207 table[highByte] = highByte;
208 }
209 // highByte == FE
210
211 // Set intermediate bytes to 0 to indicate that they have not been set yet.
212 for(int32_t i = lowByte; i <= highByte; ++i) {
213 table[i] = 0;
214 } 217 }
215 218
219 // Maps each script-or-group range to a new lead byte.
220 uint8_t table[MAX_NUM_SCRIPT_RANGES];
221 uprv_memset(table, 0, sizeof(table));
222
223 {
224 // Set "don't care" values for reserved ranges.
225 int32_t index = scriptsIndex[
226 numScripts + REORDER_RESERVED_BEFORE_LATIN - UCOL_REORDER_CODE_F IRST];
227 if(index != 0) {
228 table[index] = 0xff;
229 }
230 index = scriptsIndex[
231 numScripts + REORDER_RESERVED_AFTER_LATIN - UCOL_REORDER_CODE_FI RST];
232 if(index != 0) {
233 table[index] = 0xff;
234 }
235 }
236
237 // Never reorder special low and high primary lead bytes.
238 U_ASSERT(scriptStartsLength >= 2);
239 U_ASSERT(scriptStarts[0] == 0);
240 int32_t lowStart = scriptStarts[1];
241 U_ASSERT(lowStart == ((Collation::MERGE_SEPARATOR_BYTE + 1) << 8));
242 int32_t highLimit = scriptStarts[scriptStartsLength - 1];
243 U_ASSERT(highLimit == (Collation::TRAIL_WEIGHT_BYTE << 8));
244
216 // Get the set of special reorder codes in the input list. 245 // Get the set of special reorder codes in the input list.
217 // This supports up to 32 special reorder codes; 246 // This supports a fixed number of special reorder codes;
218 // it works for data with codes beyond UCOL_REORDER_CODE_LIMIT. 247 // it works for data with codes beyond UCOL_REORDER_CODE_LIMIT.
219 uint32_t specials = 0; 248 uint32_t specials = 0;
220 for(int32_t i = 0; i < length; ++i) { 249 for(int32_t i = 0; i < length; ++i) {
221 int32_t reorderCode = reorder[i] - UCOL_REORDER_CODE_FIRST; 250 int32_t reorderCode = reorder[i] - UCOL_REORDER_CODE_FIRST;
222 if(0 <= reorderCode && reorderCode <= 31) { 251 if(0 <= reorderCode && reorderCode < MAX_NUM_SPECIAL_REORDER_CODES) {
223 specials |= (uint32_t)1 << reorderCode; 252 specials |= (uint32_t)1 << reorderCode;
224 } 253 }
225 } 254 }
226 255
227 // Start the reordering with the special low reorder codes that do not occur in the input. 256 // Start the reordering with the special low reorder codes that do not occur in the input.
228 for(int32_t i = 0;; i += 3) { 257 for(int32_t i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
229 if(scripts[i + 1] != 1) { break; } // Went beyond special single-code r eorder codes. 258 int32_t index = scriptsIndex[numScripts + i];
230 int32_t reorderCode = (int32_t)scripts[i + 2] - UCOL_REORDER_CODE_FIRST; 259 if(index != 0 && (specials & ((uint32_t)1 << i)) == 0) {
231 if(reorderCode < 0) { break; } // Went beyond special reorder codes. 260 lowStart = addLowScriptRange(table, index, lowStart);
232 if((specials & ((uint32_t)1 << reorderCode)) == 0) {
233 int32_t head = scripts[i];
234 int32_t firstByte = head >> 8;
235 int32_t lastByte = head & 0xff;
236 do { table[firstByte++] = lowByte++; } while(firstByte <= lastByte);
237 } 261 }
238 } 262 }
239 263
240 // Reorder according to the input scripts, continuing from the bottom of the bytes range. 264 // Skip the reserved range before Latin if Latin is the first script,
265 // so that we do not move it unnecessarily.
266 int32_t skippedReserved = 0;
267 if(specials == 0 && reorder[0] == USCRIPT_LATIN && !latinMustMove) {
268 int32_t index = scriptsIndex[USCRIPT_LATIN];
269 U_ASSERT(index != 0);
270 int32_t start = scriptStarts[index];
271 U_ASSERT(lowStart <= start);
272 skippedReserved = start - lowStart;
273 lowStart = start;
274 }
275
276 // Reorder according to the input scripts, continuing from the bottom of the primary range.
277 int32_t originalLength = length; // length will be decremented if "others" is in the list.
278 UBool hasReorderToEnd = FALSE;
241 for(int32_t i = 0; i < length;) { 279 for(int32_t i = 0; i < length;) {
242 int32_t script = reorder[i++]; 280 int32_t script = reorder[i++];
243 if(script == USCRIPT_UNKNOWN) { 281 if(script == USCRIPT_UNKNOWN) {
244 // Put the remaining scripts at the top. 282 // Put the remaining scripts at the top.
283 hasReorderToEnd = TRUE;
245 while(i < length) { 284 while(i < length) {
246 script = reorder[--length]; 285 script = reorder[--length];
247 if(script == USCRIPT_UNKNOWN || // Must occur at most once. 286 if(script == USCRIPT_UNKNOWN || // Must occur at most once.
248 script == UCOL_REORDER_CODE_DEFAULT) { 287 script == UCOL_REORDER_CODE_DEFAULT) {
249 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 288 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
250 return; 289 return;
251 } 290 }
252 int32_t index = findScript(script); 291 int32_t index = getScriptIndex(script);
253 if(index < 0) { continue; } 292 if(index == 0) { continue; }
254 int32_t head = scripts[index]; 293 if(table[index] != 0) { // Duplicate or equivalent script.
255 int32_t firstByte = head >> 8;
256 int32_t lastByte = head & 0xff;
257 if(table[firstByte] != 0) { // Duplicate or equivalent script.
258 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 294 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
259 return; 295 return;
260 } 296 }
261 do { table[lastByte--] = highByte--; } while(firstByte <= lastBy te); 297 highLimit = addHighScriptRange(table, index, highLimit);
262 } 298 }
263 break; 299 break;
264 } 300 }
265 if(script == UCOL_REORDER_CODE_DEFAULT) { 301 if(script == UCOL_REORDER_CODE_DEFAULT) {
266 // The default code must be the only one in the list, and that is ha ndled by the caller. 302 // The default code must be the only one in the list, and that is ha ndled by the caller.
267 // Otherwise it must not be used. 303 // Otherwise it must not be used.
268 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 304 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
269 return; 305 return;
270 } 306 }
271 int32_t index = findScript(script); 307 int32_t index = getScriptIndex(script);
272 if(index < 0) { continue; } 308 if(index == 0) { continue; }
273 int32_t head = scripts[index]; 309 if(table[index] != 0) { // Duplicate or equivalent script.
274 int32_t firstByte = head >> 8;
275 int32_t lastByte = head & 0xff;
276 if(table[firstByte] != 0) { // Duplicate or equivalent script.
277 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 310 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
278 return; 311 return;
279 } 312 }
280 do { table[firstByte++] = lowByte++; } while(firstByte <= lastByte); 313 lowStart = addLowScriptRange(table, index, lowStart);
281 } 314 }
282 315
283 // Put all remaining scripts into the middle. 316 // Put all remaining scripts into the middle.
284 // Avoid table[0] which must remain 0. 317 for(int32_t i = 1; i < scriptStartsLength - 1; ++i) {
285 for(int32_t i = 1; i <= 0xff; ++i) { 318 int32_t leadByte = table[i];
286 if(table[i] == 0) { table[i] = lowByte++; } 319 if(leadByte != 0) { continue; }
320 int32_t start = scriptStarts[i];
321 if(!hasReorderToEnd && start > lowStart) {
322 // No need to move this script.
323 lowStart = start;
324 }
325 lowStart = addLowScriptRange(table, i, lowStart);
287 } 326 }
288 U_ASSERT(lowByte == highByte + 1); 327 if(lowStart > highLimit) {
328 if((lowStart - (skippedReserved & 0xff00)) <= highLimit) {
329 // Try not skipping the before-Latin reserved range.
330 makeReorderRanges(reorder, originalLength, TRUE, ranges, errorCode);
331 return;
332 }
333 // We need more primary lead bytes than available, despite the reserved ranges.
334 errorCode = U_BUFFER_OVERFLOW_ERROR;
335 return;
336 }
337
338 // Turn lead bytes into a list of (limit, offset) pairs.
339 // Encode each pair in one list element:
340 // Upper 16 bits = limit, lower 16 = signed lead byte offset.
341 int32_t offset = 0;
342 for(int32_t i = 1;; ++i) {
343 int32_t nextOffset = offset;
344 while(i < scriptStartsLength - 1) {
345 int32_t newLeadByte = table[i];
346 if(newLeadByte == 0xff) {
347 // "Don't care" lead byte for reserved range, continue with curr ent offset.
348 } else {
349 nextOffset = newLeadByte - (scriptStarts[i] >> 8);
350 if(nextOffset != offset) { break; }
351 }
352 ++i;
353 }
354 if(offset != 0 || i < scriptStartsLength - 1) {
355 ranges.addElement(((int32_t)scriptStarts[i] << 16) | (offset & 0xfff f), errorCode);
356 }
357 if(i == scriptStartsLength - 1) { break; }
358 offset = nextOffset;
359 }
360 }
361
362 int32_t
363 CollationData::addLowScriptRange(uint8_t table[], int32_t index, int32_t lowStar t) const {
364 int32_t start = scriptStarts[index];
365 if((start & 0xff) < (lowStart & 0xff)) {
366 lowStart += 0x100;
367 }
368 table[index] = (uint8_t)(lowStart >> 8);
369 int32_t limit = scriptStarts[index + 1];
370 lowStart = ((lowStart & 0xff00) + ((limit & 0xff00) - (start & 0xff00))) | ( limit & 0xff);
371 return lowStart;
372 }
373
374 int32_t
375 CollationData::addHighScriptRange(uint8_t table[], int32_t index, int32_t highLi mit) const {
376 int32_t limit = scriptStarts[index + 1];
377 if((limit & 0xff) > (highLimit & 0xff)) {
378 highLimit -= 0x100;
379 }
380 int32_t start = scriptStarts[index];
381 highLimit = ((highLimit & 0xff00) - ((limit & 0xff00) - (start & 0xff00))) | (start & 0xff);
382 table[index] = (uint8_t)(highLimit >> 8);
383 return highLimit;
289 } 384 }
290 385
291 U_NAMESPACE_END 386 U_NAMESPACE_END
292 387
293 #endif // !UCONFIG_NO_COLLATION 388 #endif // !UCONFIG_NO_COLLATION
OLDNEW
« no previous file with comments | « source/i18n/collationdata.h ('k') | source/i18n/collationdatabuilder.cpp » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698