source/i18n/ucol_bld.cpp - Issue 845603002: Update ICU to 54.1 step 1

Side by Side Diff: source/i18n/ucol_bld.cpp

Issue 845603002: Update ICU to 54.1 step 1 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/icu.git@master

Patch Set: remove unused directories Created 5 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 /*

2 *******************************************************************************

3 *

4 * Copyright (C) 2001-2013, International Business Machines

5 * Corporation and others. All Rights Reserved.

6 *

7 *******************************************************************************

8 * file name: ucol_bld.cpp

9 * encoding: US-ASCII

10 * tab size: 8 (not used)

11 * indentation:4

12 *

13 * created 02/22/2001

14 * created by: Vladimir Weinstein

15 *

16 * This module builds a collator based on the rule set.

17 *

18 */

19

20 #include "unicode/utypes.h"

21

22 #if !UCONFIG_NO_COLLATION

23

24 #include "unicode/ucoleitr.h"

25 #include "unicode/udata.h"

26 #include "unicode/uchar.h"

27 #include "unicode/uniset.h"

28 #include "unicode/uscript.h"

29 #include "unicode/ustring.h"

30 #include "unicode/utf16.h"

31 #include "normalizer2impl.h"

32 #include "uassert.h"

33 #include "ucol_bld.h"

34 #include "ucol_elm.h"

35 #include "ucol_cnt.h"

36 #include "ucln_in.h"

37 #include "umutex.h"

38 #include "cmemory.h"

39 #include "cstring.h"

40

/* Number of elements in a true array (not a pointer), as an int32_t. */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))

42

43 static const InverseUCATableHeader* _staticInvUCA = NULL;

44 static UDataMemory* invUCA_DATA_MEM = NULL;

45 static icu::UInitOnce gStaticInvUCAInitOnce = U_INITONCE_INITIALIZER;

46

47 U_CDECL_BEGIN

48 static UBool U_CALLCONV

49 isAcceptableInvUCA(void * /context/,

50 const char * /type/, const char * /name/,

51 const UDataInfo *pInfo)

52 {

53 /* context, type & name are intentionally not used */

54 if( pInfo->size>=20 &&

55 pInfo->isBigEndian==U_IS_BIG_ENDIAN &&

56 pInfo->charsetFamily==U_CHARSET_FAMILY &&

57 pInfo->dataFormat[0]==INVUCA_DATA_FORMAT_0 && /* dataFormat="InvC" */

58 pInfo->dataFormat[1]==INVUCA_DATA_FORMAT_1 &&

59 pInfo->dataFormat[2]==INVUCA_DATA_FORMAT_2 &&

60 pInfo->dataFormat[3]==INVUCA_DATA_FORMAT_3 &&

61 pInfo->formatVersion[0]==INVUCA_FORMAT_VERSION_0 &&

62 pInfo->formatVersion[1]>=INVUCA_FORMAT_VERSION_1 //&&

63 //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 &&

64 //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 &&

65 //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 &&

66 )

67 {

68 // TODO: Check that the invuca data version (pInfo->dataVersion)

69 // matches the ucadata version.

70 return TRUE;

71 } else {

72 return FALSE;

73 }

74 }

75 U_CDECL_END

76

77 /*

78 * Takes two CEs (lead and continuation) and

79 * compares them as CEs should be compared:

80 * primary vs. primary, secondary vs. secondary

81 * tertiary vs. tertiary

82 */

83 static int32_t compareCEs(uint32_t source0, uint32_t source1, uint32_t target0, uint32_t target1) {

84 uint32_t s1 = source0, s2, t1 = target0, t2;

85 if(isContinuation(source1)) {

86 s2 = source1;

87 } else {

88 s2 = 0;

89 }

90 if(isContinuation(target1)) {

91 t2 = target1;

92 } else {

93 t2 = 0;

94 }

95

96 uint32_t s = 0, t = 0;

97 if(s1 == t1 && s2 == t2) {

98 return 0;

99 }

100 s = (s1 & 0xFFFF0000)\|((s2 & 0xFFFF0000)>>16);

101 t = (t1 & 0xFFFF0000)\|((t2 & 0xFFFF0000)>>16);

102 if(s < t) {

103 return -1;

104 } else if(s > t) {

105 return 1;

106 } else {

107 s = (s1 & 0x0000FF00) \| (s2 & 0x0000FF00)>>8;

108 t = (t1 & 0x0000FF00) \| (t2 & 0x0000FF00)>>8;

109 if(s < t) {

110 return -1;

111 } else if(s > t) {

112 return 1;

113 } else {

114 s = (s1 & 0x000000FF)<<8 \| (s2 & 0x000000FF);

115 t = (t1 & 0x000000FF)<<8 \| (t2 & 0x000000FF);

116 if(s < t) {

117 return -1;

118 } else {

119 return 1;

120 }

121 }

122 }

123 }

124

125 static

126 int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t Second CE) {

127 uint32_t bottom = 0, top = src->invUCA->tableSize;

128 uint32_t i = 0;

129 uint32_t first = 0, second = 0;

130 uint32_t CETable = (uint32_t )((uint8_t *)src->invUCA+src->invUCA->table);

131 int32_t res = 0;

132

133 while(bottom < top-1) {

134 i = (top+bottom)/2;

135 first = (CETable+3i);

136 second = (CETable+3i+1);

137 res = compareCEs(first, second, CE, SecondCE);

138 if(res > 0) {

139 top = i;

140 } else if(res < 0) {

141 bottom = i;

142 } else {

143 break;

144 }

145 }

146

147 /* weiv: */

148 /* in searching for elements, I have removed the failure */

149 /* The reason for this is that the builder does not rely */

150 /* on search mechanism telling it that it didn't find an */

151 /* element. However, indirect positioning relies on being */

152 /* able to find the elements around any CE, even if it is */

153 /* not defined in the UCA. */

154 return i;

155 /*

156 if((first == CE && second == SecondCE)) {

157 return i;

158 } else {

159 return -1;

160 }

161 */

162 }

163

164 static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = {

165 0xFFFF0000,

166 0xFFFFFF00,

167 0xFFFFFFFF

168 };

169

170 U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(const UColTokenParser *src,

171 uint32_t CE, uint32_t contCE,

172 uint32_t nextCE, uint32_t nextCont CE,

173 uint32_t strength)

174 {

175 uint32_t CETable = (uint32_t )((uint8_t *)src->invUCA+src->invUCA->table);

176 int32_t iCE;

177

178 iCE = ucol_inv_findCE(src, CE, contCE);

179

180 if(iCE<0) {

181 *nextCE = UCOL_NOT_FOUND;

182 return -1;

183 }

184

185 CE &= strengthMask[strength];

186 contCE &= strengthMask[strength];

187

188 *nextCE = CE;

189 *nextContCE = contCE;

190

191 while((*nextCE & strengthMask[strength]) == CE

192 && (*nextContCE & strengthMask[strength]) == contCE)

193 {

194 nextCE = ((CETable+3*(++iCE)));

195 nextContCE = ((CETable+3*(iCE)+1));

196 }

197

198 return iCE;

199 }

200

201 U_CFUNC int32_t U_EXPORT2 ucol_inv_getPrevCE(const UColTokenParser *src,

202 uint32_t CE, uint32_t contCE,

203 uint32_t prevCE, uint32_t prevCont CE,

204 uint32_t strength)

205 {

206 uint32_t CETable = (uint32_t )((uint8_t *)src->invUCA+src->invUCA->table);

207 int32_t iCE;

208

209 iCE = ucol_inv_findCE(src, CE, contCE);

210

211 if(iCE<0) {

212 *prevCE = UCOL_NOT_FOUND;

213 return -1;

214 }

215

216 CE &= strengthMask[strength];

217 contCE &= strengthMask[strength];

218

219 *prevCE = CE;

220 *prevContCE = contCE;

221

222 while((*prevCE & strengthMask[strength]) == CE

223 && (*prevContCE & strengthMask[strength])== contCE

224 && iCE > 0) /* this condition should prevent falling off the edge of the world */

225 {

226 /* here, we end up in a singularity - zero */

227 prevCE = ((CETable+3*(--iCE)));

228 prevContCE = ((CETable+3*(iCE)+1));

229 }

230

231 return iCE;

232 }

233

234 U_CFUNC uint32_t U_EXPORT2 ucol_getCEStrengthDifference(uint32_t CE, uint32_t co ntCE,

235 uint32_t prevCE, uint32_t prevContCE)

236 {

237 if(prevCE == CE && prevContCE == contCE) {

238 return UCOL_IDENTICAL;

239 }

240 if((prevCE & strengthMask[UCOL_PRIMARY]) != (CE & strengthMask[UCOL_PRIMARY] )

241 \|\| (prevContCE & strengthMask[UCOL_PRIMARY]) != (contCE & strengthMask[U COL_PRIMARY]))

242 {

243 return UCOL_PRIMARY;

244 }

245 if((prevCE & strengthMask[UCOL_SECONDARY]) != (CE & strengthMask[UCOL_SECOND ARY])

246 \|\| (prevContCE & strengthMask[UCOL_SECONDARY]) != (contCE & strengthMask [UCOL_SECONDARY]))

247 {

248 return UCOL_SECONDARY;

249 }

250 return UCOL_TERTIARY;

251 }

252

253

254 /*static

255 inline int32_t ucol_inv_getPrevious(UColTokenParser src, UColTokListHeader lh, uint32_t strength) {

256

257 uint32_t CE = lh->baseCE;

258 uint32_t SecondCE = lh->baseContCE;

259

260 uint32_t CETable = (uint32_t )((uint8_t *)src->invUCA+src->invUCA->table);

261 uint32_t previousCE, previousContCE;

262 int32_t iCE;

263

264 iCE = ucol_inv_findCE(src, CE, SecondCE);

265

266 if(iCE<0) {

267 return -1;

268 }

269

270 CE &= strengthMask[strength];

271 SecondCE &= strengthMask[strength];

272

273 previousCE = CE;

274 previousContCE = SecondCE;

275

276 while((previousCE & strengthMask[strength]) == CE && (previousContCE & str engthMask[strength])== SecondCE) {

277 previousCE = ((CETable+3(--iCE)));

278 previousContCE = ((CETable+3(iCE)+1));

279 }

280 lh->previousCE = previousCE;

281 lh->previousContCE = previousContCE;

282

283 return iCE;

284 }*/

285

286 static

287 inline int32_t ucol_inv_getNext(UColTokenParser src, UColTokListHeader lh, uin t32_t strength) {

288 uint32_t CE = lh->baseCE;

289 uint32_t SecondCE = lh->baseContCE;

290

291 uint32_t CETable = (uint32_t )((uint8_t *)src->invUCA+src->invUCA->table);

292 uint32_t nextCE, nextContCE;

293 int32_t iCE;

294

295 iCE = ucol_inv_findCE(src, CE, SecondCE);

296

297 if(iCE<0) {

298 return -1;

299 }

300

301 CE &= strengthMask[strength];

302 SecondCE &= strengthMask[strength];

303

304 nextCE = CE;

305 nextContCE = SecondCE;

306

307 while((nextCE & strengthMask[strength]) == CE

308 && (nextContCE & strengthMask[strength]) == SecondCE)

309 {

310 nextCE = ((CETable+3(++iCE)));

311 nextContCE = ((CETable+3(iCE)+1));

312 }

313

314 lh->nextCE = nextCE;

315 lh->nextContCE = nextContCE;

316

317 return iCE;

318 }

319

320 static void ucol_inv_getGapPositions(UColTokenParser src, UColTokListHeader lh , UErrorCode *status) {

321 /* reset all the gaps */

322 int32_t i = 0;

323 uint32_t CETable = (uint32_t )((uint8_t *)src->invUCA+src->invUCA->table);

324 uint32_t st = 0;

325 uint32_t t1, t2;

326 int32_t pos;

327

328 UColToken *tok = lh->first;

329 uint32_t tokStrength = tok->strength;

330

331 for(i = 0; i<3; i++) {

332 lh->gapsHi[3*i] = 0;

333 lh->gapsHi[3*i+1] = 0;

334 lh->gapsHi[3*i+2] = 0;

335 lh->gapsLo[3*i] = 0;

336 lh->gapsLo[3*i+1] = 0;

337 lh->gapsLo[3*i+2] = 0;

338 lh->numStr[i] = 0;

339 lh->fStrToken[i] = NULL;

340 lh->lStrToken[i] = NULL;

341 lh->pos[i] = -1;

342 }

343

344 UCAConstants consts = (UCAConstants )((uint8_t *)src->UCA->image + src->UC A->image->UCAConsts);

345

346 if((lh->baseCE & 0xFF000000)>= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (lh ->baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicit s - */

347 //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT _MAX ) { /* implicits - */

348 lh->pos[0] = 0;

349 t1 = lh->baseCE;

350 t2 = lh->baseContCE & UCOL_REMOVE_CONTINUATION;

351 lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) \| (t2 & UCOL_PRIMARYMASK) >> 16;

352 lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 \| (t2 & UCOL_SECONDARYMA SK) << 8;

353 lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 \| (UCOL_TERTIARYORDER(t2) ) << 16;

354 uint32_t primaryCE = (t1 & UCOL_PRIMARYMASK) \| ((t2 & UCOL_PRIMARYMASK) >> 16);

355 primaryCE = uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(prim aryCE)+1);

356

357 t1 = (primaryCE & UCOL_PRIMARYMASK) \| 0x0505;

358 t2 = (primaryCE << 16) & UCOL_PRIMARYMASK; // \| UCOL_CONTINUATION_MARKER ;

359

360 lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) \| (t2 & UCOL_PRIMARYMASK) >> 16;

361 lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 \| (t2 & UCOL_SECONDARYMA SK) << 8;

362 lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 \| (UCOL_TERTIARYORDER(t2) ) << 16;

363 } else if(lh->indirect == TRUE && lh->nextCE != 0) {

364 //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) {

365 lh->pos[0] = 0;

366 t1 = lh->baseCE;

367 t2 = lh->baseContCE&UCOL_REMOVE_CONTINUATION;

368 lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) \| (t2 & UCOL_PRIMARYMASK) >> 16;

369 lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 \| (t2 & UCOL_SECONDARYMA SK) << 8;

370 lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 \| (UCOL_TERTIARYORDER(t2) ) << 16;

371 t1 = lh->nextCE;

372 t2 = lh->nextContCE&UCOL_REMOVE_CONTINUATION;

373 lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) \| (t2 & UCOL_PRIMARYMASK) >> 16;

374 lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 \| (t2 & UCOL_SECONDARYMA SK) << 8;

375 lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 \| (UCOL_TERTIARYORDER(t2) ) << 16;

376 } else {

377 for(;;) {

378 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {

379 if((lh->pos[tokStrength] = ucol_inv_getNext(src, lh, tokStrength )) >= 0) {

380 lh->fStrToken[tokStrength] = tok;

381 } else { /* The CE must be implicit, since it's not in the table */

382 /* Error */

383 *status = U_INTERNAL_PROGRAM_ERROR;

384 }

385 }

386

387 while(tok != NULL && tok->strength >= tokStrength) {

388 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {

389 lh->lStrToken[tokStrength] = tok;

390 }

391 tok = tok->next;

392 }

393 if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) {

394 /* check if previous interval is the same and merge the interval s if it is so */

395 if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) {

396 lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1];

397 lh->fStrToken[tokStrength+1] = NULL;

398 lh->lStrToken[tokStrength+1] = NULL;

399 lh->pos[tokStrength+1] = -1;

400 }

401 }

402 if(tok != NULL) {

403 tokStrength = tok->strength;

404 } else {

405 break;

406 }

407 }

408 for(st = 0; st < 3; st++) {

409 if((pos = lh->pos[st]) >= 0) {

410 t1 = (CETable+3(pos));

411 t2 = (CETable+3(pos)+1);

412 lh->gapsHi[3*st] = (t1 & UCOL_PRIMARYMASK) \| (t2 & UCOL_PRIMARYM ASK) >> 16;

413 lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 \| (t2 & UCO L_SECONDARYMASK) << 8;

414 //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 \| (UCOL_TE RTIARYORDER(t2)) << 16;

415 lh->gapsHi[3*st+2] = (t1&0x3f) << 24 \| (t2&0x3f) << 16;

416 //pos--;

417 //t1 = (CETable+3(pos));

418 //t2 = (CETable+3(pos)+1);

419 t1 = lh->baseCE;

420 t2 = lh->baseContCE;

421 lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) \| (t2 & UCOL_PRIMARYM ASK) >> 16;

422 lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 \| (t2 & UCO L_SECONDARYMASK) << 8;

423 lh->gapsLo[3*st+2] = (t1&0x3f) << 24 \| (t2&0x3f) << 16;

424 }

425 }

426 }

427 }

428

429

/*
 * Stores in noOfBytes how many bytes of the 32-bit value are in use, counted
 * from the most significant byte down to and including the least significant
 * non-zero byte (0 for value == 0, 1 for 0x05000000, 4 for 0x00000001).
 *
 * Wrapped in do { } while(0) so that "ucol_countBytes(v, n);" is a single
 * statement everywhere, including unbraced if/else bodies. The internal local
 * has a distinctive name so an argument expression that mentions a variable
 * called "mask" is not captured.
 */
#define ucol_countBytes(value, noOfBytes)                   \
do {                                                        \
    uint32_t ucolCountBytesMask_ = 0xFFFFFFFF;              \
    (noOfBytes) = 0;                                        \
    while(ucolCountBytesMask_ != 0) {                       \
        if(((value) & ucolCountBytesMask_) != 0) {          \
            (noOfBytes)++;                                  \
        }                                                   \
        ucolCountBytesMask_ >>= 8;                          \
    }                                                       \
} while(0)

441

442 static uint32_t ucol_getNextGenerated(ucolCEGenerator g, UErrorCode status) {

443 if(U_SUCCESS(*status)) {

444 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);

445 }

446 return g->current;

447 }

448

449 static uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator g, UColToken tok, ui nt32_t strength, UErrorCode *status) {

450 /* TODO: rename to enum names */

451 uint32_t high, low, count=1;

452 uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF;

453

454 if(strength == UCOL_SECONDARY) {

455 low = UCOL_COMMON_TOP2<<24;

456 high = 0xFFFFFFFF;

457 count = 0xFF - UCOL_COMMON_TOP2;

458 } else {

459 low = UCOL_BYTE_COMMON << 24; //0x05000000;

460 high = 0x40000000;

461 count = 0x40 - UCOL_BYTE_COMMON;

462 }

463

464 if(tok->next != NULL && tok->next->strength == strength) {

465 count = tok->next->toInsert;

466 }

467

468 g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);

469 g->current = UCOL_BYTE_COMMON<<24;

470

471 if(g->noOfRanges == 0) {

472 *status = U_INTERNAL_PROGRAM_ERROR;

473 }

474 return g->current;

475 }

476

477 static uint32_t ucol_getCEGenerator(ucolCEGenerator g, uint32_t lows, uint32_t * highs, UColToken tok, uint32_t fStrength, UErrorCode status) {

478 uint32_t strength = tok->strength;

479 uint32_t low = lows[fStrength*3+strength];

480 uint32_t high = highs[fStrength*3+strength];

481 uint32_t maxByte = 0;

482 if(strength == UCOL_TERTIARY) {

483 maxByte = 0x3F;

484 } else if(strength == UCOL_PRIMARY) {

485 maxByte = 0xFE;

486 } else {

487 maxByte = 0xFF;

488 }

489

490 uint32_t count = tok->toInsert;

491

492 if(low >= high && strength > UCOL_PRIMARY) {

493 int32_t s = strength;

494 for(;;) {

495 s--;

496 if(lows[fStrength3+s] != highs[fStrength3+s]) {

497 if(strength == UCOL_SECONDARY) {

498 if (low < UCOL_COMMON_TOP2<<24 ) {

499 // Override if low range is less than UCOL_COMMON_TOP2.

500 low = UCOL_COMMON_TOP2<<24;

501 }

502 high = 0xFFFFFFFF;

503 } else {

504 // Override if low range is less than UCOL_COMMON_BOT3.

505 if ( low < UCOL_COMMON_BOT3<<24 ) {

506 low = UCOL_COMMON_BOT3<<24;

507 }

508 high = 0x40000000;

509 }

510 break;

511 }

512 if(s<0) {

513 *status = U_INTERNAL_PROGRAM_ERROR;

514 return 0;

515 }

516 }

517 }

518

519 if(low < 0x02000000) {

520 // We must not use CE weight byte 02, so we set it as the minimum lower bound.

521 // See http://site.icu-project.org/design/collation/bytes

522 low = 0x02000000;

523 }

524

525 if(strength == UCOL_SECONDARY) { /* similar as simple */

526 if(low >= (UCOL_COMMON_BOT2<<24) && low < (uint32_t)(UCOL_COMMON_TOP2<<2 4)) {

527 low = UCOL_COMMON_TOP2<<24;

528 }

529 if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<< 24)) {

530 high = UCOL_COMMON_TOP2<<24;

531 }

532 if(low < (UCOL_COMMON_BOT2<<24)) {

533 g->noOfRanges = ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN<<24, high, count, maxByte, g->ranges);

534 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);

535 //g->current = UCOL_COMMON_BOT2<<24;

536 return g->current;

537 }

538 }

539

540 g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);

541 if(g->noOfRanges == 0) {

542 *status = U_INTERNAL_PROGRAM_ERROR;

543 }

544 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);

545 return g->current;

546 }

547

548 static

549 uint32_t u_toLargeKana(const UChar source, const uint32_t sourceLen, UChar res Buf, const uint32_t resLen, UErrorCode *status) {

550 uint32_t i = 0;

551 UChar c;

552

553 if(U_FAILURE(*status)) {

554 return 0;

555 }

556

557 if(sourceLen > resLen) {

558 *status = U_MEMORY_ALLOCATION_ERROR;

559 return 0;

560 }

561

562 for(i = 0; i < sourceLen; i++) {

563 c = source[i];

564 if(0x3041 <= c && c <= 0x30FA) { /* Kana range */

565 switch(c - 0x3000) {

566 case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: ca se 0x83: case 0x85: case 0x8E:

567 case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: ca se 0xE3: case 0xE5: case 0xEE:

568 c++;

569 break;

570 case 0xF5:

571 c = 0x30AB;

572 break;

573 case 0xF6:

574 c = 0x30B1;

575 break;

576 }

577 }

578 resBuf[i] = c;

579 }

580 return sourceLen;

581 }

582

583 static

584 uint32_t u_toSmallKana(const UChar source, const uint32_t sourceLen, UChar res Buf, const uint32_t resLen, UErrorCode *status) {

585 uint32_t i = 0;

586 UChar c;

587

588 if(U_FAILURE(*status)) {

589 return 0;

590 }

591

592 if(sourceLen > resLen) {

593 *status = U_MEMORY_ALLOCATION_ERROR;

594 return 0;

595 }

596

597 for(i = 0; i < sourceLen; i++) {

598 c = source[i];

599 if(0x3041 <= c && c <= 0x30FA) { /* Kana range */

600 switch(c - 0x3000) {

601 case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: ca se 0x84: case 0x86: case 0x8F:

602 case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: ca se 0xE4: case 0xE6: case 0xEF:

603 c--;

604 break;

605 case 0xAB:

606 c = 0x30F5;

607 break;

608 case 0xB1:

609 c = 0x30F6;

610 break;

611 }

612 }

613 resBuf[i] = c;

614 }

615 return sourceLen;

616 }

617

618 U_NAMESPACE_BEGIN

619

620 static

621 uint8_t ucol_uprv_getCaseBits(const UCollator UCA, const UChar src, uint32_t l en, UErrorCode *status) {

622 uint32_t i = 0;

623 UChar n[128];

624 uint32_t nLen = 0;

625 uint32_t uCount = 0, lCount = 0;

626

627 collIterate s;

628 uint32_t order = 0;

629

630 if(U_FAILURE(*status)) {

631 return UCOL_LOWER_CASE;

632 }

633

634 nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status);

635 if(U_SUCCESS(*status)) {

636 for(i = 0; i < nLen; i++) {

637 uprv_init_collIterate(UCA, &n[i], 1, &s, status);

638 order = ucol_getNextCE(UCA, &s, status);

639 if(isContinuation(order)) {

640 *status = U_INTERNAL_PROGRAM_ERROR;

641 return UCOL_LOWER_CASE;

642 }

643 if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) {

644 uCount++;

645 } else {

646 if(u_islower(n[i])) {

647 lCount++;

648 } else if(U_SUCCESS(*status)) {

649 UChar sk[1], lk[1];

650 u_toSmallKana(&n[i], 1, sk, 1, status);

651 u_toLargeKana(&n[i], 1, lk, 1, status);

652 if(sk[0] == n[i] && lk[0] != n[i]) {

653 lCount++;

654 }

655 }

656 }

657 }

658 }

659

660 if(uCount != 0 && lCount != 0) {

661 return UCOL_MIXED_CASE;

662 } else if(uCount != 0) {

663 return UCOL_UPPER_CASE;

664 } else {

665 return UCOL_LOWER_CASE;

666 }

667 }

668

669

670 U_CFUNC void ucol_doCE(UColTokenParser src, uint32_t CEparts, UColToken tok, UErrorCode status) {

671 /* this one makes the table and stuff */

672 uint32_t noOfBytes[3];

673 uint32_t i;

674

675 for(i = 0; i<3; i++) {

676 ucol_countBytes(CEparts[i], noOfBytes[i]);

677 }

678

679 /* Here we have to pack CEs from parts */

680

681 uint32_t CEi = 0;

682 uint32_t value = 0;

683

684 while(2*CEi<noOfBytes[0] \|\| CEi<noOfBytes[1] \|\| CEi<noOfBytes[2]) {

685 if(CEi > 0) {

686 value = UCOL_CONTINUATION_MARKER; /* Continuation marker */

687 } else {

688 value = 0;

689 }

690

691 if(2*CEi<noOfBytes[0]) {

692 value \|= ((CEparts[0]>>(32-16*(CEi+1))) & 0xFFFF) << 16;

693 }

694 if(CEi<noOfBytes[1]) {

695 value \|= ((CEparts[1]>>(32-8*(CEi+1))) & 0xFF) << 8;

696 }

697 if(CEi<noOfBytes[2]) {

698 value \|= ((CEparts[2]>>(32-8*(CEi+1))) & 0x3F);

699 }

700 tok->CEs[CEi] = value;

701 CEi++;

702 }

703 if(CEi == 0) { /* totally ignorable */

704 tok->noOfCEs = 1;

705 tok->CEs[0] = 0;

706 } else { /* there is at least something */

707 tok->noOfCEs = CEi;

708 }

709

710

711 // we want to set case bits here and now, not later.

712 // Case bits handling

713 if(tok->CEs[0] != 0) { // case bits should be set only for non-ignorables

714 tok->CEs[0] &= 0xFFFFFF3F; // Clean the case bits field

715 int32_t cSize = (tok->source & 0xFF000000) >> 24;

716 UChar *cPoints = (tok->source & 0x00FFFFFF) + src->source;

717

718 if(cSize > 1) {

719 // Do it manually

720 tok->CEs[0] \|= ucol_uprv_getCaseBits(src->UCA, cPoints, cSize, statu s);

721 } else {

722 // Copy it from the UCA

723 uint32_t caseCE = ucol_getFirstCE(src->UCA, cPoints[0], status);

724 tok->CEs[0] \|= (caseCE & 0xC0);

725 }

726 }

727

728 #if UCOL_DEBUG==2

729 fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8noOfBytes[0]), CEparts[1] >> (32-8noOfBytes [1]), CEparts[2]>> (32-8*noOfBytes[2]));

730 for(i = 0; i<tok->noOfCEs; i++) {

731 fprintf(stderr, "%08X ", tok->CEs[i]);

732 }

733 fprintf(stderr, "\n");

734 #endif

735 }

736

737 U_CFUNC void ucol_initBuffers(UColTokenParser src, UColTokListHeader lh, UErro rCode *status) {

738 ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT];

739 uint32_t CEparts[UCOL_CE_STRENGTH_LIMIT];

740

741 UColToken *tok = lh->last;

742 uint32_t t[UCOL_STRENGTH_LIMIT];

743

744 uprv_memset(t, 0, UCOL_STRENGTH_LIMIT*sizeof(uint32_t));

745

746 /* must initialize ranges to avoid memory check warnings */

747 for (int i = 0; i < UCOL_CE_STRENGTH_LIMIT; i++) {

748 uprv_memset(Gens[i].ranges, 0, sizeof(Gens[i].ranges));

749 }

750

751 tok->toInsert = 1;

752 t[tok->strength] = 1;

753

754 while(tok->previous != NULL) {

755 if(tok->previous->strength < tok->strength) { /* going up */

756 t[tok->strength] = 0;

757 t[tok->previous->strength]++;

758 } else if(tok->previous->strength > tok->strength) { /* going down */

759 t[tok->previous->strength] = 1;

760 } else {

761 t[tok->strength]++;

762 }

763 tok=tok->previous;

764 tok->toInsert = t[tok->strength];

765 }

766

767 tok->toInsert = t[tok->strength];

768 ucol_inv_getGapPositions(src, lh, status);

769

770 #if UCOL_DEBUG

771 fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE);

772 int32_t j = 2;

773 for(j = 2; j >= 0; j--) {

774 fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j3], lh- >gapsLo[j3+1], lh->gapsLo[j*3+2]);

775 fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j3], lh- >gapsHi[j3+1], lh->gapsHi[j*3+2]);

776 }

777 tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];

778

779 do {

780 fprintf(stderr,"%i", tok->strength);

781 tok = tok->next;

782 } while(tok != NULL);

783 fprintf(stderr, "\n");

784

785 tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];

786

787 do {

788 fprintf(stderr,"%i", tok->toInsert);

789 tok = tok->next;

790 } while(tok != NULL);

791 #endif

792

793 tok = lh->first;

794 uint32_t fStrength = UCOL_IDENTICAL;

795 uint32_t initStrength = UCOL_IDENTICAL;

796

797

798 CEparts[UCOL_PRIMARY] = (lh->baseCE & UCOL_PRIMARYMASK) \| (lh->baseContCE & UCOL_PRIMARYMASK) >> 16;

799 CEparts[UCOL_SECONDARY] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 \| (lh->bas eContCE & UCOL_SECONDARYMASK) << 8;

800 CEparts[UCOL_TERTIARY] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 \| (UCOL_TERT IARYORDER(lh->baseContCE)) << 16;

801

802 while (tok != NULL && U_SUCCESS(*status)) {

803 fStrength = tok->strength;

804 if(fStrength < initStrength) {

805 initStrength = fStrength;

806 if(lh->pos[fStrength] == -1) {

807 while(lh->pos[fStrength] == -1 && fStrength > 0) {

808 fStrength--;

809 }

810 if(lh->pos[fStrength] == -1) {

811 *status = U_INTERNAL_PROGRAM_ERROR;

812 return;

813 }

814 }

815 if(initStrength == UCOL_TERTIARY) { /* starting with tertiary */

816 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];

817 CEparts[UCOL_SECONDARY] = lh->gapsLo[fStrength*3+1];

818 /CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gap sLo[fStrength3+2], lh->gapsHi[fStrength3+2], tok, UCOL_TERTIARY); /

819 CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[UCOL_TERTIARY ], lh->gapsLo, lh->gapsHi, tok, fStrength, status);

820 } else if(initStrength == UCOL_SECONDARY) { /* secondaries */

821 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];

822 /CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrengt h3+1], lh->gapsHi[fStrength3+1], tok, 1);/

823 CEparts[UCOL_SECONDARY] = ucol_getCEGenerator(&Gens[UCOL_SECONDA RY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);

824 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TE RTIARY], tok, UCOL_TERTIARY, status);

825 } else { /* primaries */

826 /CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gaps Lo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);/

827 CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[UCOL_PRIMARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);

828 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_S ECONDARY], tok, UCOL_SECONDARY, status);

829 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TE RTIARY], tok, UCOL_TERTIARY, status);

830 }

831 } else {

832 if(tok->strength == UCOL_TERTIARY) {

833 CEparts[UCOL_TERTIARY] = ucol_getNextGenerated(&Gens[UCOL_TERTIA RY], status);

834 } else if(tok->strength == UCOL_SECONDARY) {

835 CEparts[UCOL_SECONDARY] = ucol_getNextGenerated(&Gens[UCOL_SECON DARY], status);

836 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TE RTIARY], tok, UCOL_TERTIARY, status);

837 } else if(tok->strength == UCOL_PRIMARY) {

838 CEparts[UCOL_PRIMARY] = ucol_getNextGenerated(&Gens[UCOL_PRIMARY ], status);

839 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_S ECONDARY], tok, UCOL_SECONDARY, status);

840 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TE RTIARY], tok, UCOL_TERTIARY, status);

841 }

842 }

843 ucol_doCE(src, CEparts, tok, status);

844 tok = tok->next;

845 }

846 }

847

848 U_CFUNC void ucol_createElements(UColTokenParser src, tempUCATable t, UColTokL istHeader lh, UErrorCode status) {

849 UCAElements el;

850 UColToken *tok = lh->first;

851 UColToken *expt = NULL;

852 uint32_t i = 0, j = 0;

853 const Normalizer2Impl nfcImpl = Normalizer2Factory::getNFCImpl(status);

854

855 while(tok != NULL && U_SUCCESS(*status)) {

856 /* first, check if there are any expansions */

857 /* if there are expansions, we need to do a little bit more processing * /

858 /* since parts of expansion can be tailored, while others are not */

859 if(tok->expansion != 0) {

860 uint32_t len = tok->expansion >> 24;

861 uint32_t currentSequenceLen = len;

862 uint32_t expOffset = tok->expansion & 0x00FFFFFF;

863 //uint32_t exp = currentSequenceLen \| expOffset;

864 UColToken exp;

865 exp.source = currentSequenceLen \| expOffset;

866 exp.rulesToParseHdl = &(src->source);

867

868 while(len > 0) {

869 currentSequenceLen = len;

870 while(currentSequenceLen > 0) {

871 exp.source = (currentSequenceLen << 24) \| expOffset;

872 if((expt = (UColToken )uhash_get(src->tailored, &exp)) != N ULL && expt->strength != UCOL_TOK_RESET) { / expansion is tailored */

873 uint32_t noOfCEsToCopy = expt->noOfCEs;

874 for(j = 0; j<noOfCEsToCopy; j++) {

875 tok->expCEs[tok->noOfExpCEs + j] = expt->CEs[j];

876 }

877 tok->noOfExpCEs += noOfCEsToCopy;

878 // Smart people never try to add codepoints and CEs.

879 // For some odd reason, it won't work.

880 expOffset += currentSequenceLen; //noOfCEsToCopy;

881 len -= currentSequenceLen; //noOfCEsToCopy;

882 break;

883 } else {

884 currentSequenceLen--;

885 }

886 }

887 if(currentSequenceLen == 0) { /* couldn't find any tailored subs equence */

888 /* will have to get one from UCA */

889 /* first, get the UChars from the rules */

890 /* then pick CEs out until there is no more and stuff them i nto expansion */

891 collIterate s;

892 uint32_t order = 0;

893 uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s, status);

894

895 for(;;) {

896 order = ucol_getNextCE(src->UCA, &s, status);

897 if(order == UCOL_NO_MORE_CES) {

898 break;

899 }

900 tok->expCEs[tok->noOfExpCEs++] = order;

901 }

902 expOffset++;

903 len--;

904 }

905 }

906 } else {

907 tok->noOfExpCEs = 0;

908 }

909

910 /* set the ucaelement with obtained values */

911 el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs;

912 /* copy CEs */

913 for(i = 0; i<tok->noOfCEs; i++) {

914 el.CEs[i] = tok->CEs[i];

915 }

916 for(i = 0; i<tok->noOfExpCEs; i++) {

917 el.CEs[i+tok->noOfCEs] = tok->expCEs[i];

918 }

919

920 /* copy UChars */

921 // We kept prefix and source kind of together, as it is a kind of a cont raction.

922 // However, now we have to slice the prefix off the main thing -

923 el.prefix = el.prefixChars;

924 el.cPoints = el.uchars;

925 if(tok->prefix != 0) { // we will just copy the prefix here, and adjust accordingly in the

926 // addPrefix function in ucol_elm. The reason is that we need to add both composed AND

927 // decomposed elements to the unsaf table.

928 el.prefixSize = tok->prefix>>24;

929 uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el. prefixSize*sizeof(UChar));

930

931 el.cSize = (tok->source >> 24)-(tok->prefix>>24);

932 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar));

933 } else {

934 el.prefixSize = 0;

935 *el.prefix = 0;

936

937 el.cSize = (tok->source >> 24);

938 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el. cSize*sizeof(UChar));

939 }

940 if(src->UCA != NULL) {

941 for(i = 0; i<el.cSize; i++) {

942 if(UCOL_ISJAMO(el.cPoints[i])) {

943 t->image->jamoSpecial = TRUE;

944 }

945 }

946 if (!src->buildCCTabFlag && el.cSize > 0) {

947 // Check the trailing canonical combining class (tccc) of the la st character.

948 const UChar *s = el.cPoints + el.cSize;

949 uint16_t fcd = nfcImpl->previousFCD16(el.cPoints, s);

950 if ((fcd & 0xff) != 0) {

951 src->buildCCTabFlag = TRUE;

952 }

953 }

954 }

955

956 /* and then, add it */

957 #if UCOL_DEBUG==2

958 fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]);

959 #endif

960 uprv_uca_addAnElement(t, &el, status);

961

962 #if UCOL_DEBUG_DUPLICATES

963 if(*status != U_ZERO_ERROR) {

964 fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoint s[0], tok->debugSource);

965 *status = U_ZERO_ERROR;

966 }

967 #endif

968

969 tok = tok->next;

970 }

971 }

972

973 U_CDECL_BEGIN

974 static UBool U_CALLCONV

975 _processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit, uint32_t value) {

976 UErrorCode status = U_ZERO_ERROR;

977 tempUCATable t = (tempUCATable )context;

978 if(value == 0) {

979 while(start < limit) {

980 uint32_t CE = utrie_get32(t->mapping, start, NULL);

981 if(CE == UCOL_NOT_FOUND) {

982 UCAElements el;

983 el.isThai = FALSE;

984 el.prefixSize = 0;

985 el.prefixChars[0] = 0;

986 el.prefix = el.prefixChars;

987 el.cPoints = el.uchars;

988

989 el.cSize = 0;

990 U16_APPEND_UNSAFE(el.uchars, el.cSize, start);

991

992 el.noOfCEs = 1;

993 el.CEs[0] = 0;

994 uprv_uca_addAnElement(t, &el, &status);

995

996 }

997 start++;

998 }

999 }

1000 if(U_FAILURE(status)) {

1001 return FALSE;

1002 } else {

1003 return TRUE;

1004 }

1005 }

1006 U_CDECL_END

1007

1008 static void

1009 ucol_uprv_bld_copyRangeFromUCA(UColTokenParser src, tempUCATable t,

1010 UChar32 start, UChar32 end,

1011 UErrorCode *status)

1012 {

1013 //UChar decomp[256];

1014 uint32_t CE = UCOL_NOT_FOUND;

1015 UChar32 u = 0;

1016 UCAElements el;

1017 el.isThai = FALSE;

1018 el.prefixSize = 0;

1019 el.prefixChars[0] = 0;

1020 collIterate colIt;

1021

1022 if(U_SUCCESS(*status)) {

1023 for(u = start; u<=end; u++) {

1024 if((CE = utrie_get32(t->mapping, u, NULL)) == UCOL_NOT_FOUND

1025 /* this test is for contractions that are missing the starting e lement. */

1026 \|\| ((isCntTableElement(CE)) &&

1027 (uprv_cnttab_getCE(t->contractions, CE, 0, status) == UCOL_NOT_F OUND))

1028 )

1029 {

1030 el.cSize = 0;

1031 U16_APPEND_UNSAFE(el.uchars, el.cSize, u);

1032 //decomp[0] = (UChar)u;

1033 //el.uchars[0] = (UChar)u;

1034 el.cPoints = el.uchars;

1035 //el.cSize = 1;

1036 el.noOfCEs = 0;

1037 el.prefix = el.prefixChars;

1038 el.prefixSize = 0;

1039 //uprv_init_collIterate(src->UCA, decomp, 1, &colIt);

1040 // We actually want to check whether this element is a special

1041 // If it is an implicit element (hangul, CJK - we want to copy t he

1042 // special, not the resolved CEs) - for hangul, copying resolved

1043 // would just make things the same (there is an expansion and it

1044 // takes approximately the same amount of time to resolve as

1045 // falling back to the UCA).

1046 /*

1047 UTRIE_GET32(src->UCA->mapping, u, CE);

1048 tag = getCETag(CE);

1049 if(tag == HANGUL_SYLLABLE_TAG \|\| tag == CJK_IMPLICIT_TAG

1050 \|\| tag == IMPLICIT_TAG \|\| tag == TRAIL_SURROGATE_TAG

1051 \|\| tag == LEAD_SURROGATE_TAG) {

1052 el.CEs[el.noOfCEs++] = CE;

1053 } else {

1054 */

1055 // It turns out that it does not make sense to keep implicits

1056 // unresolved. The cost of resolving them is big enough so that

1057 // it doesn't make any difference whether we have to go to the U CA

1058 // or not.

1059 {

1060 uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt, status);

1061 while(CE != UCOL_NO_MORE_CES) {

1062 CE = ucol_getNextCE(src->UCA, &colIt, status);

1063 if(CE != UCOL_NO_MORE_CES) {

1064 el.CEs[el.noOfCEs++] = CE;

1065 }

1066 }

1067 }

1068 uprv_uca_addAnElement(t, &el, status);

1069 }

1070 }

1071 }

1072 }

1073

1074 U_NAMESPACE_END

1075

1076 U_CFUNC UCATableHeader *

1077 ucol_assembleTailoringTable(UColTokenParser src, UErrorCode status) {

1078 U_NAMESPACE_USE

1079

1080 uint32_t i = 0;

1081 if(U_FAILURE(*status)) {

1082 return NULL;

1083 }

1084 /*

1085 2. Eliminate the negative lists by doing the following for each non-null ne gative list:

1086 o if previousCE(baseCE, strongestN) != some ListHeader X's baseCE,

1087 create new ListHeader X

1088 o reverse the list, add to the end of X's positive list. Reset the strengt h of the

1089 first item you add, based on the stronger strength levels of the two lists.

1090 */

1091 /*

1092 3. For each ListHeader with a non-null positive list:

1093 */

1094 /*

1095 o Find all character strings with CEs between the baseCE and the

1096 next/previous CE, at the strength of the first token. Add these to the

1097 tailoring.

1098 ? That is, if UCA has ... x <<< X << x' <<< X' < y ..., and the

1099 tailoring has & x < z...

1100 ? Then we change the tailoring to & x <<< X << x' <<< X' < z ...

1101 */

1102 /* It is possible that this part should be done even while constructing list */

1103 /* The problem is that it is unknown what is going to be the strongest weigh t */

1104 /* So we might as well do it here */

1105

1106 /*

1107 o Allocate CEs for each token in the list, based on the total number N of the

1108 largest level difference, and the gap G between baseCE and nextCE at that

1109 level. The relation * between the last item and nextCE is the same as the

1110 strongest strength.

1111 o Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1)

1112 ? There are 3 primary items: a, d, e. Fit them into the primary gap.

1113 Then fit b and c into the secondary gap between a and d, then fit q

1114 into the tertiary gap between b and c.

1115

1116 o Example: baseCE << b <<< q << c * nextCE(X,2)

1117 ? There are 2 secondary items: b, c. Fit them into the secondary gap.

1118 Then fit q into the tertiary gap between b and c.

1119 o When incrementing primary values, we will not cross high byte

1120 boundaries except where there is only a single-byte primary. That is to

1121 ensure that the script reordering will continue to work.

1122 */

1123 UCATableHeader image = (UCATableHeader )uprv_malloc(sizeof(UCATableHeader) );

1124 /* test for NULL */

1125 if (image == NULL) {

1126 *status = U_MEMORY_ALLOCATION_ERROR;

1127 return NULL;

1128 }

1129 uprv_memcpy(image, src->UCA->image, sizeof(UCATableHeader));

1130

1131 for(i = 0; i<src->resultLen; i++) {

1132 /* now we need to generate the CEs */

1133 /* We stuff the initial value in the buffers, and increase the appropria te buffer */

1134 /* According to strength */

1135 if(U_SUCCESS(*status)) {

1136 if(src->lh[i].first) { // if there are any elements

1137 // due to the way parser works, subsequent tailorings

1138 // may remove all the elements from a sequence, therefore

1139 // leaving an empty tailoring sequence.

1140 ucol_initBuffers(src, &src->lh[i], status);

1141 }

1142 }

1143 if(U_FAILURE(*status)) {

1144 uprv_free(image);

1145 return NULL;

1146 }

1147 }

1148

1149 if(src->varTop != NULL) { /* stuff the variable top value */

1150 src->opts->variableTopValue = (*(src->varTop->CEs))>>16;

1151 /* remove it from the list */

1152 if(src->varTop->listHeader->first == src->varTop) { /* first in list */

1153 src->varTop->listHeader->first = src->varTop->next;

1154 }

1155 if(src->varTop->listHeader->last == src->varTop) { /* first in list */

1156 src->varTop->listHeader->last = src->varTop->previous;

1157 }

1158 if(src->varTop->next != NULL) {

1159 src->varTop->next->previous = src->varTop->previous;

1160 }

1161 if(src->varTop->previous != NULL) {

1162 src->varTop->previous->next = src->varTop->next;

1163 }

1164 }

1165

1166

1167 tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOU ND_TAG, NOT_FOUND_TAG, status);

1168 if(U_FAILURE(*status)) {

1169 uprv_free(image);

1170 return NULL;

1171 }

1172

1173

1174 /* After this, we have assigned CE values to all regular CEs */

1175 /* now we will go through list once more and resolve expansions, */

1176 /* make UCAElements structs and add them to table */

1177 for(i = 0; i<src->resultLen; i++) {

1178 /* now we need to generate the CEs */

1179 /* We stuff the initial value in the buffers, and increase the appropria te buffer */

1180 /* According to strength */

1181 if(U_SUCCESS(*status)) {

1182 ucol_createElements(src, t, &src->lh[i], status);

1183 }

1184 }

1185

1186 UCAElements el;

1187 el.isThai = FALSE;

1188 el.prefixSize = 0;

1189 el.prefixChars[0] = 0;

1190

1191 /* add latin-1 stuff */

1192 ucol_uprv_bld_copyRangeFromUCA(src, t, 0, 0xFF, status);

1193

1194 /* add stuff for copying */

1195 if(src->copySet != NULL) {

1196 int32_t i = 0;

1197 UnicodeSet set = (UnicodeSet )src->copySet;

1198 for(i = 0; i < set->getRangeCount(); i++) {

1199 ucol_uprv_bld_copyRangeFromUCA(src, t, set->getRangeStart(i), set->g etRangeEnd(i), status);

1200 }

1201 }

1202

1203 if(U_SUCCESS(*status)) {

1204 /* copy contractions from the UCA - this is felt mostly for cyrillic*/

1205

1206 uint32_t tailoredCE = UCOL_NOT_FOUND;

1207 UChar conts = (UChar )((uint8_t *)src->UCA->image + src->UCA->image->c ontractionUCACombos);

1208 int32_t maxUCAContractionLength = src->UCA->image->contractionUCACombosW idth;

1209 UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status) ;

1210 // Check for null pointer

1211 if (ucaEl == NULL) {

1212 *status = U_MEMORY_ALLOCATION_ERROR;

1213 return NULL;

1214 }

1215 while(*conts != 0) {

1216 // A continuation is NUL-terminated and NUL-padded

1217 // except if it has the maximum length.

1218 int32_t contractionLength = maxUCAContractionLength;

1219 while(contractionLength > 0 && conts[contractionLength - 1] == 0) {

1220 --contractionLength;

1221 }

1222 UChar32 first;

1223 int32_t firstLength = 0;

1224 U16_NEXT(conts, firstLength, contractionLength, first);

1225 tailoredCE = utrie_get32(t->mapping, first, NULL);

1226 if(tailoredCE != UCOL_NOT_FOUND) {

1227 UBool needToAdd = TRUE;

1228 if(isCntTableElement(tailoredCE)) {

1229 if(uprv_cnttab_isTailored(t->contractions, tailoredCE, conts +firstLength, status) == TRUE) {

1230 needToAdd = FALSE;

1231 }

1232 }

1233 if (!needToAdd && isPrefix(tailoredCE) && *(conts+1)==0) {

1234 UCAElements elm;

1235 elm.cPoints = el.uchars;

1236 elm.noOfCEs = 0;

1237 elm.uchars[0] = *conts;

1238 elm.uchars[1] = 0;

1239 elm.cSize = 1;

1240 elm.prefixChars[0] = *(conts+2);

1241 elm.isThai = FALSE;

1242 elm.prefix = elm.prefixChars;

1243 elm.prefixSize = 1;

1244 UCAElements prefixEnt=(UCAElements )uhash_get(t->prefixLoo kup, &elm);

1245 if ((prefixEnt==NULL) \|\| (prefixEnt->prefix)!=(conts+2)) {

1246 needToAdd = TRUE;

1247 }

1248 }

1249 if(src->removeSet != NULL && uset_contains(src->removeSet, first )) {

1250 needToAdd = FALSE;

1251 }

1252

1253 if(needToAdd == TRUE) { // we need to add if this contraction is not tailored.

1254 if (*(conts+1) != 0) { // contractions

1255 el.prefix = el.prefixChars;

1256 el.prefixSize = 0;

1257 el.cPoints = el.uchars;

1258 el.noOfCEs = 0;

1259 u_memcpy(el.uchars, conts, contractionLength);

1260 el.cSize = contractionLength;

1261 ucol_setText(ucaEl, el.uchars, el.cSize, status);

1262 }

1263 else { // pre-context character

1264 UChar str[4] = { 0 };

1265 int32_t len=0;

1266 int32_t preKeyLen=0;

1267

1268 el.cPoints = el.uchars;

1269 el.noOfCEs = 0;

1270 el.uchars[0] = *conts;

1271 el.uchars[1] = 0;

1272 el.cSize = 1;

1273 el.prefixChars[0] = *(conts+2);

1274 el.prefix = el.prefixChars;

1275 el.prefixSize = 1;

1276 if (el.prefixChars[0]!=0) {

1277 // get CE of prefix character first

1278 str[0]=el.prefixChars[0];

1279 str[1]=0;

1280 ucol_setText(ucaEl, str, 1, status);

1281 while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaE l, status))

1282 != UCOL_NULLORDER) {

1283 preKeyLen++; // count number of keys for prefix character

1284 }

1285 str[len++] = el.prefixChars[0];

1286 }

1287

1288 str[len++] = el.uchars[0];

1289 str[len]=0;

1290 ucol_setText(ucaEl, str, len, status);

1291 // Skip the keys for prefix character, then copy the res t to el.

1292 while ((preKeyLen-->0) &&

1293 (int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, s tatus)) != UCOL_NULLORDER) {

1294 continue;

1295 }

1296

1297 }

1298 while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, statu s)) != UCOL_NULLORDER) {

1299 el.noOfCEs++;

1300 }

1301 uprv_uca_addAnElement(t, &el, status);

1302 }

1303

1304 } else if(src->removeSet != NULL && uset_contains(src->removeSet, fi rst)) {

1305 ucol_uprv_bld_copyRangeFromUCA(src, t, first, first, status);

1306 }

1307 conts+=maxUCAContractionLength;

1308 }

1309 ucol_closeElements(ucaEl);

1310 }

1311

1312 // Add completely ignorable elements

1313 utrie_enum(&t->UCA->mapping, NULL, _processUCACompleteIgnorables, t);

1314

1315 // add tailoring characters related canonical closures

1316 uprv_uca_canonicalClosure(t, src, NULL, status);

1317

1318 /* still need to produce compatibility closure */

1319

1320 UCATableHeader *myData = uprv_uca_assembleTable(t, status);

1321

1322 uprv_uca_closeTempTable(t);

1323 uprv_free(image);

1324

1325 return myData;

1326 }

1327

1328 U_CDECL_BEGIN

1329 static UBool U_CALLCONV

1330 ucol_bld_cleanup(void)

1331 {

1332 udata_close(invUCA_DATA_MEM);

1333 invUCA_DATA_MEM = NULL;

1334 _staticInvUCA = NULL;

1335 gStaticInvUCAInitOnce.reset();

1336 return TRUE;

1337 }

1338 U_CDECL_END

1339

1340 static void U_CALLCONV initInverseUCA(UErrorCode &status) {

1341 U_ASSERT(invUCA_DATA_MEM == NULL);

1342 U_ASSERT(_staticInvUCA == NULL);

1343 ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ucol_bld_cleanup);

1344 InverseUCATableHeader *newInvUCA = NULL;

1345 UDataMemory *result = udata_openChoice(U_ICUDATA_COLL, INVC_DATA_TYPE, INVC_ DATA_NAME, isAcceptableInvUCA, NULL, &status);

1346

1347 if(U_FAILURE(status)) {

1348 if (result) {

1349 udata_close(result);

1350 }

1351 // This is not needed, as we are talking about

1352 // memory we got from UData

1353 //uprv_free(newInvUCA);

1354 return;

1355 }

1356

1357 if(result != NULL) { /* It looks like sometimes we can fail to find the data file */

1358 newInvUCA = (InverseUCATableHeader *)udata_getMemory(result);

1359 UCollator *UCA = ucol_initUCA(&status);

1360 // UCA versions of UCA and inverse UCA should match

1361 if(uprv_memcmp(newInvUCA->UCAVersion, UCA->image->UCAVersion, sizeof(UVe rsionInfo)) != 0) {

1362 status = U_INVALID_FORMAT_ERROR;

1363 udata_close(result);

1364 return;

1365 }

1366

1367 invUCA_DATA_MEM = result;

1368 _staticInvUCA = newInvUCA;

1369 }

1370 }

1371

1372

1373 U_CAPI const InverseUCATableHeader * U_EXPORT2

1374 ucol_initInverseUCA(UErrorCode *status)

1375 {

1376 umtx_initOnce(gStaticInvUCAInitOnce, &initInverseUCA, *status);

1377 return _staticInvUCA;

1378 }

1379

/*
 * Names of the non-script reordering codes.
 * The entries _must_ be kept in the order in which they are to be applied as
 * defaults, and in sync with the UColReorderCode enum.
 */
static const char * const ReorderingTokenNames[] = {
    "SPACE", "PUNCT", "SYMBOL", "CURRENCY", "DIGIT"
};

1390

1391 static void toUpper(const char* src, char* dst, uint32_t length) {

1392 for (uint32_t i = 0; *src != '\0' && i < length - 1; ++src, ++dst, ++i) {

1393 dst = uprv_toupper(src);

1394 }

1395 *dst = '\0';

1396 }

1397

1398 U_INTERNAL int32_t U_EXPORT2

1399 ucol_findReorderingEntry(const char* name) {

1400 char buffer[32];

1401 toUpper(name, buffer, 32);

1402 for (uint32_t entry = 0; entry < LENGTHOF(ReorderingTokenNames); entry++) {

1403 if (uprv_strcmp(buffer, ReorderingTokenNames[entry]) == 0) {

1404 return entry + UCOL_REORDER_CODE_FIRST;

1405 }

1406 }

1407 return USCRIPT_INVALID_CODE;

1408 }

1409

1410 #endif /* #if !UCONFIG_NO_COLLATION */

OLD	NEW

« no previous file with comments | « source/i18n/ucol_bld.h ('k') | source/i18n/ucol_cnt.h » ('j') | no next file with comments »