source/i18n/collationdata.cpp - Issue 1621843002: ICU 56 update step 1

Side by Side Diff: source/i18n/collationdata.cpp

Issue 1621843002: ICU 56 update step 1 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/icu.git@561

Patch Set: Created 4 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 *******************************************************************************	2 *******************************************************************************

3 * Copyright (C) 2012-2014, International Business Machines	3 * Copyright (C) 2012-2015, International Business Machines

4 * Corporation and others. All Rights Reserved.	4 * Corporation and others. All Rights Reserved.

5 *******************************************************************************	5 *******************************************************************************

6 * collationdata.cpp	6 * collationdata.cpp

7 *	7 *

8 * created on: 2012jul28	8 * created on: 2012jul28

9 * created by: Markus W. Scherer	9 * created by: Markus W. Scherer

10 */	10 */

11	11

12 #include "unicode/utypes.h"	12 #include "unicode/utypes.h"

13	13

14 #if !UCONFIG_NO_COLLATION	14 #if !UCONFIG_NO_COLLATION

15	15

16 #include "unicode/ucol.h"	16 #include "unicode/ucol.h"

17 #include "unicode/udata.h"	17 #include "unicode/udata.h"

18 #include "unicode/uscript.h"	18 #include "unicode/uscript.h"

19 #include "cmemory.h"	19 #include "cmemory.h"

20 #include "collation.h"	20 #include "collation.h"

21 #include "collationdata.h"	21 #include "collationdata.h"

22 #include "uassert.h"	22 #include "uassert.h"

23 #include "utrie2.h"	23 #include "utrie2.h"

	24 #include "uvectr32.h"

24	25

25 U_NAMESPACE_BEGIN	26 U_NAMESPACE_BEGIN

26	27

27 uint32_t	28 uint32_t

28 CollationData::getIndirectCE32(uint32_t ce32) const {	29 CollationData::getIndirectCE32(uint32_t ce32) const {

29 U_ASSERT(Collation::isSpecialCE32(ce32));	30 U_ASSERT(Collation::isSpecialCE32(ce32));

30 int32_t tag = Collation::tagFromCE32(ce32);	31 int32_t tag = Collation::tagFromCE32(ce32);

31 if(tag == Collation::DIGIT_TAG) {	32 if(tag == Collation::DIGIT_TAG) {

32 // Fetch the non-numeric-collation CE32.	33 // Fetch the non-numeric-collation CE32.

33 ce32 = ce32s[Collation::indexFromCE32(ce32)];	34 ce32 = ce32s[Collation::indexFromCE32(ce32)];

(...skipping 73 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
107 return d->getCEFromOffsetCE32(c, ce32);	108 return d->getCEFromOffsetCE32(c, ce32);

108 case Collation::IMPLICIT_TAG:	109 case Collation::IMPLICIT_TAG:

109 return Collation::unassignedCEFromCodePoint(c);	110 return Collation::unassignedCEFromCodePoint(c);

110 }	111 }

111 }	112 }

112 return Collation::ceFromSimpleCE32(ce32);	113 return Collation::ceFromSimpleCE32(ce32);

113 }	114 }

114	115

115 uint32_t	116 uint32_t

116 CollationData::getFirstPrimaryForGroup(int32_t script) const {	117 CollationData::getFirstPrimaryForGroup(int32_t script) const {

117 int32_t index = findScript(script);	118 int32_t index = getScriptIndex(script);

118 if(index < 0) {	119 return index == 0 ? 0 : (uint32_t)scriptStarts[index] << 16;

119 return 0;

120 }

121 uint32_t head = scripts[index];

122 return (head & 0xff00) << 16;

123 }	120 }

124	121

125 uint32_t	122 uint32_t

126 CollationData::getLastPrimaryForGroup(int32_t script) const {	123 CollationData::getLastPrimaryForGroup(int32_t script) const {

127 int32_t index = findScript(script);	124 int32_t index = getScriptIndex(script);

128 if(index < 0) {	125 if(index == 0) {

129 return 0;	126 return 0;

130 }	127 }

131 uint32_t head = scripts[index];	128 uint32_t limit = scriptStarts[index + 1];

132 uint32_t lastByte = head & 0xff;	129 return (limit << 16) - 1;

133 return ((lastByte + 1) << 24) - 1;

134 }	130 }

135	131

136 int32_t	132 int32_t

137 CollationData::getGroupForPrimary(uint32_t p) const {	133 CollationData::getGroupForPrimary(uint32_t p) const {

138 p >>= 24; // Reordering groups are distinguished by primary lead bytes.	134 p >>= 16;

139 for(int32_t i = 0; i < scriptsLength; i = i + 2 + scripts[i + 1]) {	135 if(p < scriptStarts[1] \|\| scriptStarts[scriptStartsLength - 1] <= p) {

140 uint32_t lastByte = scripts[i] & 0xff;	136 return -1;

141 if(p <= lastByte) {	137 }

142 return scripts[i + 2];	138 int32_t index = 1;

	139 while(p >= scriptStarts[index + 1]) { ++index; }

	140 for(int32_t i = 0; i < numScripts; ++i) {

	141 if(scriptsIndex[i] == index) {

	142 return i;

	143 }

	144 }

	145 for(int32_t i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {

	146 if(scriptsIndex[numScripts + i] == index) {

	147 return UCOL_REORDER_CODE_FIRST + i;

143 }	148 }

144 }	149 }

145 return -1;	150 return -1;

146 }	151 }

147	152

148 int32_t	153 int32_t

149 CollationData::findScript(int32_t script) const {	154 CollationData::getScriptIndex(int32_t script) const {

150 if(script < 0 \|\| 0xffff < script) { return -1; }	155 if(script < 0) {

151 for(int32_t i = 0; i < scriptsLength;) {	156 return 0;

152 int32_t limit = i + 2 + scripts[i + 1];	157 } else if(script < numScripts) {

153 for(int32_t j = i + 2; j < limit; ++j) {	158 return scriptsIndex[script];

154 if(script == scripts[j]) { return i; }	159 } else if(script < UCOL_REORDER_CODE_FIRST) {

	160 return 0;

	161 } else {

	162 script -= UCOL_REORDER_CODE_FIRST;

	163 if(script < MAX_NUM_SPECIAL_REORDER_CODES) {

	164 return scriptsIndex[numScripts + script];

	165 } else {

	166 return 0;

155 }	167 }

156 i = limit;

157 }	168 }

158 return -1;

159 }	169 }

160	170

161 int32_t	171 int32_t

162 CollationData::getEquivalentScripts(int32_t script,	172 CollationData::getEquivalentScripts(int32_t script,

163 int32_t dest[], int32_t capacity,	173 int32_t dest[], int32_t capacity,

164 UErrorCode &errorCode) const {	174 UErrorCode &errorCode) const {

165 if(U_FAILURE(errorCode)) { return 0; }	175 if(U_FAILURE(errorCode)) { return 0; }

166 int32_t i = findScript(script);	176 int32_t index = getScriptIndex(script);

167 if(i < 0) { return 0; }	177 if(index == 0) { return 0; }

168 int32_t length = scripts[i + 1];	178 if(script >= UCOL_REORDER_CODE_FIRST) {

169 U_ASSERT(length != 0);	179 // Special groups have no aliases.

	180 if(capacity > 0) {

	181 dest[0] = script;

	182 } else {

	183 errorCode = U_BUFFER_OVERFLOW_ERROR;

	184 }

	185 return 1;

	186 }

	187

	188 int32_t length = 0;

	189 for(int32_t i = 0; i < numScripts; ++i) {

	190 if(scriptsIndex[i] == index) {

	191 if(length < capacity) {

	192 dest[length] = i;

	193 }

	194 ++length;

	195 }

	196 }

170 if(length > capacity) {	197 if(length > capacity) {

171 errorCode = U_BUFFER_OVERFLOW_ERROR;	198 errorCode = U_BUFFER_OVERFLOW_ERROR;

172 return length;

173 }

174 i += 2;

175 dest[0] = scripts[i++];

176 for(int32_t j = 1; j < length; ++j) {

177 script = scripts[i++];

178 // Sorted insertion.

179 for(int32_t k = j;; --k) {

180 // Invariant: dest[k] is free to receive either script or dest[k - 1 ].

181 if(k > 0 && script < dest[k - 1]) {

182 dest[k] = dest[k - 1];

183 } else {

184 dest[k] = script;

185 break;

186 }

187 }

188 }	199 }

189 return length;	200 return length;

190 }	201 }

191	202

192 void	203 void

193 CollationData::makeReorderTable(const int32_t *reorder, int32_t length,	204 CollationData::makeReorderRanges(const int32_t *reorder, int32_t length,

194 uint8_t table[256], UErrorCode &errorCode) const {	205 UVector32 &ranges, UErrorCode &errorCode) const {

	206 makeReorderRanges(reorder, length, FALSE, ranges, errorCode);

	207 }

	208

	209 void

	210 CollationData::makeReorderRanges(const int32_t *reorder, int32_t length,

	211 UBool latinMustMove,

	212 UVector32 &ranges, UErrorCode &errorCode) const {

195 if(U_FAILURE(errorCode)) { return; }	213 if(U_FAILURE(errorCode)) { return; }

196	214 ranges.removeAllElements();

197 // Initialize the table.	215 if(length == 0 \|\| (length == 1 && reorder[0] == USCRIPT_UNKNOWN)) {

198 // Never reorder special low and high primary lead bytes.	216 return;

199 int32_t lowByte;

200 for(lowByte = 0; lowByte <= Collation::MERGE_SEPARATOR_BYTE; ++lowByte) {

201 table[lowByte] = lowByte;

202 }

203 // lowByte == 03

204

205 int32_t highByte;

206 for(highByte = 0xff; highByte >= Collation::TRAIL_WEIGHT_BYTE; --highByte) {

207 table[highByte] = highByte;

208 }

209 // highByte == FE

210

211 // Set intermediate bytes to 0 to indicate that they have not been set yet.

212 for(int32_t i = lowByte; i <= highByte; ++i) {

213 table[i] = 0;

214 }	217 }

215	218

	219 // Maps each script-or-group range to a new lead byte.

	220 uint8_t table[MAX_NUM_SCRIPT_RANGES];

	221 uprv_memset(table, 0, sizeof(table));

	222

	223 {

	224 // Set "don't care" values for reserved ranges.

	225 int32_t index = scriptsIndex[

	226 numScripts + REORDER_RESERVED_BEFORE_LATIN - UCOL_REORDER_CODE_F IRST];

	227 if(index != 0) {

	228 table[index] = 0xff;

	229 }

	230 index = scriptsIndex[

	231 numScripts + REORDER_RESERVED_AFTER_LATIN - UCOL_REORDER_CODE_FI RST];

	232 if(index != 0) {

	233 table[index] = 0xff;

	234 }

	235 }

	236

	237 // Never reorder special low and high primary lead bytes.

	238 U_ASSERT(scriptStartsLength >= 2);

	239 U_ASSERT(scriptStarts[0] == 0);

	240 int32_t lowStart = scriptStarts[1];

	241 U_ASSERT(lowStart == ((Collation::MERGE_SEPARATOR_BYTE + 1) << 8));

	242 int32_t highLimit = scriptStarts[scriptStartsLength - 1];

	243 U_ASSERT(highLimit == (Collation::TRAIL_WEIGHT_BYTE << 8));

	244

216 // Get the set of special reorder codes in the input list.	245 // Get the set of special reorder codes in the input list.

217 // This supports up to 32 special reorder codes;	246 // This supports a fixed number of special reorder codes;

218 // it works for data with codes beyond UCOL_REORDER_CODE_LIMIT.	247 // it works for data with codes beyond UCOL_REORDER_CODE_LIMIT.

219 uint32_t specials = 0;	248 uint32_t specials = 0;

220 for(int32_t i = 0; i < length; ++i) {	249 for(int32_t i = 0; i < length; ++i) {

221 int32_t reorderCode = reorder[i] - UCOL_REORDER_CODE_FIRST;	250 int32_t reorderCode = reorder[i] - UCOL_REORDER_CODE_FIRST;

222 if(0 <= reorderCode && reorderCode <= 31) {	251 if(0 <= reorderCode && reorderCode < MAX_NUM_SPECIAL_REORDER_CODES) {

223 specials \|= (uint32_t)1 << reorderCode;	252 specials \|= (uint32_t)1 << reorderCode;

224 }	253 }

225 }	254 }

226	255

227 // Start the reordering with the special low reorder codes that do not occur in the input.	256 // Start the reordering with the special low reorder codes that do not occur in the input.

228 for(int32_t i = 0;; i += 3) {	257 for(int32_t i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {

229 if(scripts[i + 1] != 1) { break; } // Went beyond special single-code r eorder codes.	258 int32_t index = scriptsIndex[numScripts + i];

230 int32_t reorderCode = (int32_t)scripts[i + 2] - UCOL_REORDER_CODE_FIRST;	259 if(index != 0 && (specials & ((uint32_t)1 << i)) == 0) {

231 if(reorderCode < 0) { break; } // Went beyond special reorder codes.	260 lowStart = addLowScriptRange(table, index, lowStart);

232 if((specials & ((uint32_t)1 << reorderCode)) == 0) {

233 int32_t head = scripts[i];

234 int32_t firstByte = head >> 8;

235 int32_t lastByte = head & 0xff;

236 do { table[firstByte++] = lowByte++; } while(firstByte <= lastByte);

237 }	261 }

238 }	262 }

239	263

240 // Reorder according to the input scripts, continuing from the bottom of the bytes range.	264 // Skip the reserved range before Latin if Latin is the first script,

	265 // so that we do not move it unnecessarily.

	266 int32_t skippedReserved = 0;

	267 if(specials == 0 && reorder[0] == USCRIPT_LATIN && !latinMustMove) {

	268 int32_t index = scriptsIndex[USCRIPT_LATIN];

	269 U_ASSERT(index != 0);

	270 int32_t start = scriptStarts[index];

	271 U_ASSERT(lowStart <= start);

	272 skippedReserved = start - lowStart;

	273 lowStart = start;

	274 }

	275

	276 // Reorder according to the input scripts, continuing from the bottom of the primary range.

	277 int32_t originalLength = length; // length will be decremented if "others" is in the list.

	278 UBool hasReorderToEnd = FALSE;

241 for(int32_t i = 0; i < length;) {	279 for(int32_t i = 0; i < length;) {

242 int32_t script = reorder[i++];	280 int32_t script = reorder[i++];

243 if(script == USCRIPT_UNKNOWN) {	281 if(script == USCRIPT_UNKNOWN) {

244 // Put the remaining scripts at the top.	282 // Put the remaining scripts at the top.

	283 hasReorderToEnd = TRUE;

245 while(i < length) {	284 while(i < length) {

246 script = reorder[--length];	285 script = reorder[--length];

247 if(script == USCRIPT_UNKNOWN \|\| // Must occur at most once.	286 if(script == USCRIPT_UNKNOWN \|\| // Must occur at most once.

248 script == UCOL_REORDER_CODE_DEFAULT) {	287 script == UCOL_REORDER_CODE_DEFAULT) {

249 errorCode = U_ILLEGAL_ARGUMENT_ERROR;	288 errorCode = U_ILLEGAL_ARGUMENT_ERROR;

250 return;	289 return;

251 }	290 }

252 int32_t index = findScript(script);	291 int32_t index = getScriptIndex(script);

253 if(index < 0) { continue; }	292 if(index == 0) { continue; }

254 int32_t head = scripts[index];	293 if(table[index] != 0) { // Duplicate or equivalent script.

255 int32_t firstByte = head >> 8;

256 int32_t lastByte = head & 0xff;

257 if(table[firstByte] != 0) { // Duplicate or equivalent script.

258 errorCode = U_ILLEGAL_ARGUMENT_ERROR;	294 errorCode = U_ILLEGAL_ARGUMENT_ERROR;

259 return;	295 return;

260 }	296 }

261 do { table[lastByte--] = highByte--; } while(firstByte <= lastBy te);	297 highLimit = addHighScriptRange(table, index, highLimit);

262 }	298 }

263 break;	299 break;

264 }	300 }

265 if(script == UCOL_REORDER_CODE_DEFAULT) {	301 if(script == UCOL_REORDER_CODE_DEFAULT) {

266 // The default code must be the only one in the list, and that is ha ndled by the caller.	302 // The default code must be the only one in the list, and that is ha ndled by the caller.

267 // Otherwise it must not be used.	303 // Otherwise it must not be used.

268 errorCode = U_ILLEGAL_ARGUMENT_ERROR;	304 errorCode = U_ILLEGAL_ARGUMENT_ERROR;

269 return;	305 return;

270 }	306 }

271 int32_t index = findScript(script);	307 int32_t index = getScriptIndex(script);

272 if(index < 0) { continue; }	308 if(index == 0) { continue; }

273 int32_t head = scripts[index];	309 if(table[index] != 0) { // Duplicate or equivalent script.

274 int32_t firstByte = head >> 8;

275 int32_t lastByte = head & 0xff;

276 if(table[firstByte] != 0) { // Duplicate or equivalent script.

277 errorCode = U_ILLEGAL_ARGUMENT_ERROR;	310 errorCode = U_ILLEGAL_ARGUMENT_ERROR;

278 return;	311 return;

279 }	312 }

280 do { table[firstByte++] = lowByte++; } while(firstByte <= lastByte);	313 lowStart = addLowScriptRange(table, index, lowStart);

281 }	314 }

282	315

283 // Put all remaining scripts into the middle.	316 // Put all remaining scripts into the middle.

284 // Avoid table[0] which must remain 0.	317 for(int32_t i = 1; i < scriptStartsLength - 1; ++i) {

285 for(int32_t i = 1; i <= 0xff; ++i) {	318 int32_t leadByte = table[i];

286 if(table[i] == 0) { table[i] = lowByte++; }	319 if(leadByte != 0) { continue; }

	320 int32_t start = scriptStarts[i];

	321 if(!hasReorderToEnd && start > lowStart) {

	322 // No need to move this script.

	323 lowStart = start;

	324 }

	325 lowStart = addLowScriptRange(table, i, lowStart);

287 }	326 }

288 U_ASSERT(lowByte == highByte + 1);	327 if(lowStart > highLimit) {

	328 if((lowStart - (skippedReserved & 0xff00)) <= highLimit) {

	329 // Try not skipping the before-Latin reserved range.

	330 makeReorderRanges(reorder, originalLength, TRUE, ranges, errorCode);

	331 return;

	332 }

	333 // We need more primary lead bytes than available, despite the reserved ranges.

	334 errorCode = U_BUFFER_OVERFLOW_ERROR;

	335 return;

	336 }

	337

	338 // Turn lead bytes into a list of (limit, offset) pairs.

	339 // Encode each pair in one list element:

	340 // Upper 16 bits = limit, lower 16 = signed lead byte offset.

	341 int32_t offset = 0;

	342 for(int32_t i = 1;; ++i) {

	343 int32_t nextOffset = offset;

	344 while(i < scriptStartsLength - 1) {

	345 int32_t newLeadByte = table[i];

	346 if(newLeadByte == 0xff) {

	347 // "Don't care" lead byte for reserved range, continue with curr ent offset.

	348 } else {

	349 nextOffset = newLeadByte - (scriptStarts[i] >> 8);

	350 if(nextOffset != offset) { break; }

	351 }

	352 ++i;

	353 }

	354 if(offset != 0 \|\| i < scriptStartsLength - 1) {

	355 ranges.addElement(((int32_t)scriptStarts[i] << 16) \| (offset & 0xfff f), errorCode);

	356 }

	357 if(i == scriptStartsLength - 1) { break; }

	358 offset = nextOffset;

	359 }

	360 }

	361

	362 int32_t

	363 CollationData::addLowScriptRange(uint8_t table[], int32_t index, int32_t lowStar t) const {

	364 int32_t start = scriptStarts[index];

	365 if((start & 0xff) < (lowStart & 0xff)) {

	366 lowStart += 0x100;

	367 }

	368 table[index] = (uint8_t)(lowStart >> 8);

	369 int32_t limit = scriptStarts[index + 1];

	370 lowStart = ((lowStart & 0xff00) + ((limit & 0xff00) - (start & 0xff00))) \| ( limit & 0xff);

	371 return lowStart;

	372 }

	373

	374 int32_t

	375 CollationData::addHighScriptRange(uint8_t table[], int32_t index, int32_t highLi mit) const {

	376 int32_t limit = scriptStarts[index + 1];

	377 if((limit & 0xff) > (highLimit & 0xff)) {

	378 highLimit -= 0x100;

	379 }

	380 int32_t start = scriptStarts[index];

	381 highLimit = ((highLimit & 0xff00) - ((limit & 0xff00) - (start & 0xff00))) \| (start & 0xff);

	382 table[index] = (uint8_t)(highLimit >> 8);

	383 return highLimit;

289 }	384 }

290	385

291 U_NAMESPACE_END	386 U_NAMESPACE_END

292	387

293 #endif // !UCONFIG_NO_COLLATION	388 #endif // !UCONFIG_NO_COLLATION

OLD	NEW

« no previous file with comments | « source/i18n/collationdata.h ('k') | source/i18n/collationdatabuilder.cpp » ('j') | no next file with comments »