source/i18n/ucol_tok.cpp - Issue 845603002: Update ICU to 54.1 step 1

Side by Side Diff: source/i18n/ucol_tok.cpp

Issue 845603002: Update ICU to 54.1 step 1 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/icu.git@master

Patch Set: remove unusued directories Created 5 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 /*

2 *******************************************************************************

3 *

4 * Copyright (C) 2001-2012, International Business Machines

5 * Corporation and others. All Rights Reserved.

6 *

7 *******************************************************************************

8 * file name: ucol_tok.cpp

9 * encoding: US-ASCII

10 * tab size: 8 (not used)

11 * indentation:4

12 *

13 * created 02/22/2001

14 * created by: Vladimir Weinstein

15 *

16 * This module reads a tailoring rule string and produces a list of

17 * tokens that will be turned into collation elements

18 *

19 */

20

21 #include "unicode/utypes.h"

22

23 #if !UCONFIG_NO_COLLATION

24

25 #include "unicode/uscript.h"

26 #include "unicode/ustring.h"

27 #include "unicode/uchar.h"

28 #include "unicode/uniset.h"

29

30 #include "cmemory.h"

31 #include "cstring.h"

32 #include "patternprops.h"

33 #include "ucol_bld.h"

34 #include "ucol_tok.h"

35 #include "ulocimp.h"

36 #include "uresimp.h"

37

38 // Define this only for debugging.

39 // #define DEBUG_FOR_COLL_RULES 1

40

41 #ifdef DEBUG_FOR_COLL_RULES

42 #include <iostream>

43 #endif

44

45 U_NAMESPACE_USE

46

47 U_CDECL_BEGIN

48 static int32_t U_CALLCONV

49 uhash_hashTokens(const UHashTok k)

50 {

51 int32_t hash = 0;

52 //uint32_t key = (uint32_t)k.integer;

53 UColToken key = (UColToken )k.pointer;

54 if (key != 0) {

55 int32_t len = (key->source & 0xFF000000)>>24;

56 int32_t inc = ((len - 32) / 32) + 1;

57

58 const UChar p = (key->source & 0x00FFFFFF) + (key->rulesToParseHdl);

59 const UChar *limit = p + len;

60

61 while (p<limit) {

62 hash = (hash * 37) + *p;

63 p += inc;

64 }

65 }

66 return hash;

67 }

68

69 static UBool U_CALLCONV

70 uhash_compareTokens(const UHashTok key1, const UHashTok key2)

71 {

72 //uint32_t p1 = (uint32_t) key1.integer;

73 //uint32_t p2 = (uint32_t) key2.integer;

74 UColToken p1 = (UColToken )key1.pointer;

75 UColToken p2 = (UColToken )key2.pointer;

76 const UChar s1 = (p1->source & 0x00FFFFFF) + (p1->rulesToParseHdl);

77 const UChar s2 = (p2->source & 0x00FFFFFF) + (p2->rulesToParseHdl);

78 uint32_t s1L = ((p1->source & 0xFF000000) >> 24);

79 uint32_t s2L = ((p2->source & 0xFF000000) >> 24);

80 const UChar *end = s1+s1L-1;

81

82 if (p1 == p2) {

83 return TRUE;

84 }

85 if (p1->source == 0 \|\| p2->source == 0) {

86 return FALSE;

87 }

88 if(s1L != s2L) {

89 return FALSE;

90 }

91 if(p1->source == p2->source) {

92 return TRUE;

93 }

94 while((s1 < end) && s1 == s2) {

95 ++s1;

96 ++s2;

97 }

98 if(s1 == s2) {

99 return TRUE;

100 } else {

101 return FALSE;

102 }

103 }

104 U_CDECL_END

105

106 /*

107 * Debug messages used to pinpoint where a format error occurred.

108 * A better way is to include context-sensitive information in the syntaxError() function.

109 *

110 * To turn this debugging on, either uncomment the following line, or define use -DDEBUG_FOR_FORMAT_ERROR

111 * in the compile line.

112 */

113 /* #define DEBUG_FOR_FORMAT_ERROR 1 */

114

/*
 * DBG_FORMAT_ERROR expands to a braced printf block that reports the current
 * source line when DEBUG_FOR_FORMAT_ERROR is defined, and to nothing otherwise.
 */
#ifdef DEBUG_FOR_FORMAT_ERROR
#define DBG_FORMAT_ERROR { printf("U_INVALID_FORMAT_ERROR at line %d", __LINE__);}
#else
#define DBG_FORMAT_ERROR
#endif

120

121

122 /*

123 * Controls debug messages so that the output can be compared before and after a

124 * big change. Prints the information of every code point that comes out of the

125 * collation parser and its strength into a file. When a big change in format

126 * happens, the files before and after the change should be identical.

127 *

128 * To turn this debugging on, either uncomment the following line, or define use -DDEBUG_FOR_CODE_POINTS

129 * in the compile line.

130 */

131 // #define DEBUG_FOR_CODE_POINTS 1

132

#ifdef DEBUG_FOR_CODE_POINTS
/* Output stream for the code-point debug dump; only exists in debug builds. */
FILE* dfcp_fp = NULL;
#endif

136

137

138 typedef struct {

139 uint32_t startCE;

140 uint32_t startContCE;

141 uint32_t limitCE;

142 uint32_t limitContCE;

143 } indirectBoundaries;

144

145 /* these values are used for finding CE values for indirect positioning. */

146 /* Indirect positioning is a mechanism for allowing resets on symbolic */

147 /* values. It only works for resets and you cannot tailor indirect names */

148 /* An indirect name can define either an anchor point or a range. An */

149 /* anchor point behaves in exactly the same way as a code point in reset */

150 /* would, except that it cannot be tailored. A range (we currently only */

151 /* know for the [top] range will explicitly set the upper bound for */

152 /* generated CEs, thus allowing for better control over how many CEs can */

153 /* be squeezed between in the range without performance penalty. */

154 /* In that respect, we use [top] for tailoring of locales that use CJK */

155 /* characters. Other indirect values are currently a pure convenience, */

156 /* they can be used to assure that the CEs will be always positioned in */

157 /* the same place relative to a point with known properties (e.g. first */

158 /* primary ignorable). */

159 static indirectBoundaries ucolIndirectBoundaries[15];

160 /*

161 static indirectBoundaries ucolIndirectBoundaries[11] = {

162 { UCOL_RESET_TOP_VALUE, 0,

163 UCOL_NEXT_TOP_VALUE, 0 },

164 { UCOL_FIRST_PRIMARY_IGNORABLE, 0,

165 0, 0 },

166 { UCOL_LAST_PRIMARY_IGNORABLE, UCOL_LAST_PRIMARY_IGNORABLE_CONT,

167 0, 0 },

168 { UCOL_FIRST_SECONDARY_IGNORABLE, 0,

169 0, 0 },

170 { UCOL_LAST_SECONDARY_IGNORABLE, 0,

171 0, 0 },

172 { UCOL_FIRST_TERTIARY_IGNORABLE, 0,

173 0, 0 },

174 { UCOL_LAST_TERTIARY_IGNORABLE, 0,

175 0, 0 },

176 { UCOL_FIRST_VARIABLE, 0,

177 0, 0 },

178 { UCOL_LAST_VARIABLE, 0,

179 0, 0 },

180 { UCOL_FIRST_NON_VARIABLE, 0,

181 0, 0 },

182 { UCOL_LAST_NON_VARIABLE, 0,

183 0, 0 },

184 };

185 */

186

187 static void setIndirectBoundaries(uint32_t indexR, uint32_t start, uint32_t en d) {

188

189 // Set values for the top - TODO: once we have values for all the indirects, we are going

190 // to initalize here.

191 ucolIndirectBoundaries[indexR].startCE = start[0];

192 ucolIndirectBoundaries[indexR].startContCE = start[1];

193 if(end) {

194 ucolIndirectBoundaries[indexR].limitCE = end[0];

195 ucolIndirectBoundaries[indexR].limitContCE = end[1];

196 } else {

197 ucolIndirectBoundaries[indexR].limitCE = 0;

198 ucolIndirectBoundaries[indexR].limitContCE = 0;

199 }

200 }

201

202

203 static inline

204 void syntaxError(const UChar* rules,

205 int32_t pos,

206 int32_t rulesLen,

207 UParseError* parseError)

208 {

209 parseError->offset = pos;

210 parseError->line = 0 ; /* we are not using line numbers */

211

212 // for pre-context

213 int32_t start = (pos < U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN -1));

214 int32_t stop = pos;

215

216 u_memcpy(parseError->preContext,rules+start,stop-start);

217 //null terminate the buffer

218 parseError->preContext[stop-start] = 0;

219

220 //for post-context

221 start = pos+1;

222 stop = ((pos+U_PARSE_CONTEXT_LEN)<= rulesLen )? (pos+(U_PARSE_CONTEXT_LEN-1 )) :

223 rulesLen;

224

225 if(start < stop) {

226 u_memcpy(parseError->postContext,rules+start,stop-start);

227 //null terminate the buffer

228 parseError->postContext[stop-start]= 0;

229 } else {

230 parseError->postContext[0] = 0;

231 }

232 }

233

234 static

235 void ucol_uprv_tok_setOptionInImage(UColOptionSet *opts, UColAttribute attrib, U ColAttributeValue value) {

236 switch(attrib) {

237 case UCOL_HIRAGANA_QUATERNARY_MODE:

238 opts->hiraganaQ = value;

239 break;

240 case UCOL_FRENCH_COLLATION:

241 opts->frenchCollation = value;

242 break;

243 case UCOL_ALTERNATE_HANDLING:

244 opts->alternateHandling = value;

245 break;

246 case UCOL_CASE_FIRST:

247 opts->caseFirst = value;

248 break;

249 case UCOL_CASE_LEVEL:

250 opts->caseLevel = value;

251 break;

252 case UCOL_NORMALIZATION_MODE:

253 opts->normalizationMode = value;

254 break;

255 case UCOL_STRENGTH:

256 opts->strength = value;

257 break;

258 case UCOL_NUMERIC_COLLATION:

259 opts->numericCollation = value;

260 break;

261 case UCOL_ATTRIBUTE_COUNT:

262 default:

263 break;

264 }

265 }

266

267 #define UTOK_OPTION_COUNT 22

268

269 static UBool didInit = FALSE;

270 /* we can be strict, or we can be lenient */

271 /* I'd surely be lenient with the option arguments */

272 /* maybe even with options */

273 U_STRING_DECL(suboption_00, "non-ignorable", 13);

274 U_STRING_DECL(suboption_01, "shifted", 7);

275

276 U_STRING_DECL(suboption_02, "lower", 5);

277 U_STRING_DECL(suboption_03, "upper", 5);

278 U_STRING_DECL(suboption_04, "off", 3);

279 U_STRING_DECL(suboption_05, "on", 2);

280 U_STRING_DECL(suboption_06, "1", 1);

281 U_STRING_DECL(suboption_07, "2", 1);

282 U_STRING_DECL(suboption_08, "3", 1);

283 U_STRING_DECL(suboption_09, "4", 1);

284 U_STRING_DECL(suboption_10, "I", 1);

285

286 U_STRING_DECL(suboption_11, "primary", 7);

287 U_STRING_DECL(suboption_12, "secondary", 9);

288 U_STRING_DECL(suboption_13, "tertiary", 8);

289 U_STRING_DECL(suboption_14, "variable", 8);

290 U_STRING_DECL(suboption_15, "regular", 7);

291 U_STRING_DECL(suboption_16, "implicit", 8);

292 U_STRING_DECL(suboption_17, "trailing", 8);

293

294

295 U_STRING_DECL(option_00, "undefined", 9);

296 U_STRING_DECL(option_01, "rearrange", 9);

297 U_STRING_DECL(option_02, "alternate", 9);

298 U_STRING_DECL(option_03, "backwards", 9);

299 U_STRING_DECL(option_04, "variable top", 12);

300 U_STRING_DECL(option_05, "top", 3);

301 U_STRING_DECL(option_06, "normalization", 13);

302 U_STRING_DECL(option_07, "caseLevel", 9);

303 U_STRING_DECL(option_08, "caseFirst", 9);

304 U_STRING_DECL(option_09, "scriptOrder", 11);

305 U_STRING_DECL(option_10, "charsetname", 11);

306 U_STRING_DECL(option_11, "charset", 7);

307 U_STRING_DECL(option_12, "before", 6);

308 U_STRING_DECL(option_13, "hiraganaQ", 9);

309 U_STRING_DECL(option_14, "strength", 8);

310 U_STRING_DECL(option_15, "first", 5);

311 U_STRING_DECL(option_16, "last", 4);

312 U_STRING_DECL(option_17, "optimize", 8);

313 U_STRING_DECL(option_18, "suppressContractions", 20);

314 U_STRING_DECL(option_19, "numericOrdering", 15);

315 U_STRING_DECL(option_20, "import", 6);

316 U_STRING_DECL(option_21, "reorder", 7);

317

318 /*

319 [last variable] last variable value

320 [last primary ignorable] largest CE for primary ignorable

321 [last secondary ignorable] largest CE for secondary ignorable

322 [last tertiary ignorable] largest CE for tertiary ignorable

323 [top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8 )

324 */

325

326

327 static const ucolTokSuboption alternateSub[2] = {

328 {suboption_00, 13, UCOL_NON_IGNORABLE},

329 {suboption_01, 7, UCOL_SHIFTED}

330 };

331

332 static const ucolTokSuboption caseFirstSub[3] = {

333 {suboption_02, 5, UCOL_LOWER_FIRST},

334 {suboption_03, 5, UCOL_UPPER_FIRST},

335 {suboption_04, 3, UCOL_OFF},

336 };

337

338 static const ucolTokSuboption onOffSub[2] = {

339 {suboption_04, 3, UCOL_OFF},

340 {suboption_05, 2, UCOL_ON}

341 };

342

343 static const ucolTokSuboption frenchSub[1] = {

344 {suboption_07, 1, UCOL_ON}

345 };

346

347 static const ucolTokSuboption beforeSub[3] = {

348 {suboption_06, 1, UCOL_PRIMARY},

349 {suboption_07, 1, UCOL_SECONDARY},

350 {suboption_08, 1, UCOL_TERTIARY}

351 };

352

353 static const ucolTokSuboption strengthSub[5] = {

354 {suboption_06, 1, UCOL_PRIMARY},

355 {suboption_07, 1, UCOL_SECONDARY},

356 {suboption_08, 1, UCOL_TERTIARY},

357 {suboption_09, 1, UCOL_QUATERNARY},

358 {suboption_10, 1, UCOL_IDENTICAL},

359 };

360

361 static const ucolTokSuboption firstLastSub[7] = {

362 {suboption_11, 7, UCOL_PRIMARY},

363 {suboption_12, 9, UCOL_PRIMARY},

364 {suboption_13, 8, UCOL_PRIMARY},

365 {suboption_14, 8, UCOL_PRIMARY},

366 {suboption_15, 7, UCOL_PRIMARY},

367 {suboption_16, 8, UCOL_PRIMARY},

368 {suboption_17, 8, UCOL_PRIMARY},

369 };

370

/*
 * Indices into rulesOptions.  The enumerators must stay in the same order as
 * the rows of that table, because the parser switches on the row index using
 * these names.  Everything up to OPTION_NORMAL_OPTIONS_LIMIT maps directly to
 * a collator attribute.
 */
enum OptionNumber {
    OPTION_ALTERNATE_HANDLING = 0,
    OPTION_FRENCH_COLLATION,
    OPTION_CASE_LEVEL,
    OPTION_CASE_FIRST,
    OPTION_NORMALIZATION_MODE,
    OPTION_HIRAGANA_QUATERNARY,
    OPTION_STRENGTH,
    OPTION_NUMERIC_COLLATION,
    OPTION_NORMAL_OPTIONS_LIMIT = OPTION_NUMERIC_COLLATION,
    OPTION_VARIABLE_TOP,
    OPTION_REARRANGE,
    OPTION_BEFORE,
    OPTION_TOP,
    OPTION_FIRST,
    OPTION_LAST,
    OPTION_OPTIMIZE,
    OPTION_SUPPRESS_CONTRACTIONS,
    OPTION_UNDEFINED,
    OPTION_SCRIPT_ORDER,
    OPTION_CHARSET_NAME,
    OPTION_CHARSET,
    OPTION_IMPORT,
    OPTION_SCRIPTREORDER
};

396

397 static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = {

398 /00/ {option_02, 9, alternateSub, 2, UCOL_ALTERNATE_HANDLING}, /"alterna te" /

399 /01/ {option_03, 9, frenchSub, 1, UCOL_FRENCH_COLLATION}, /"backwards" /

400 /02/ {option_07, 9, onOffSub, 2, UCOL_CASE_LEVEL}, /"caseLevel" /

401 /03/ {option_08, 9, caseFirstSub, 3, UCOL_CASE_FIRST}, /"caseFirst" /

402 /04/ {option_06, 13, onOffSub, 2, UCOL_NORMALIZATION_MODE}, /"normalizati on" /

403 /05/ {option_13, 9, onOffSub, 2, UCOL_HIRAGANA_QUATERNARY_MODE}, /"hiraga naQ" /

404 /06/ {option_14, 8, strengthSub, 5, UCOL_STRENGTH}, /"strength" /

405 /07/ {option_19, 15, onOffSub, 2, UCOL_NUMERIC_COLLATION}, /"numericOrde ring"/

406 /08/ {option_04, 12, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /"variable top" /

407 /09/ {option_01, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /"rearrange" /

408 /10/ {option_12, 6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /"before" /

409 /11/ {option_05, 3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /"top" /

410 /12/ {option_15, 5, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /"first" /

411 /13/ {option_16, 4, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /"last" /

412 /14/ {option_17, 8, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /"optimize" /

413 /15/ {option_18, 20, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /"suppressContractio ns" /

414 /16/ {option_00, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /"undefined" /

415 /17/ {option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /"scriptOrder" /

416 /18/ {option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /"charsetname" /

417 /19/ {option_11, 7, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /"charset" /

418 /20/ {option_20, 6, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /"import" /

419 /21/ {option_21, 7, NULL, 0, UCOL_ATTRIBUTE_COUNT} /"reorder" /

420 };

421

422 static

423 int32_t u_strncmpNoCase(const UChar *s1,

424 const UChar *s2,

425 int32_t n)

426 {

427 if(n > 0) {

428 int32_t rc;

429 for(;;) {

430 rc = (int32_t)u_tolower(s1) - (int32_t)u_tolower(s2);

431 if(rc != 0 \|\| *s1 == 0 \|\| --n == 0) {

432 return rc;

433 }

434 ++s1;

435 ++s2;

436 }

437 }

438 return 0;

439 }

440

441 static

442 void ucol_uprv_tok_initData() {

443 if(!didInit) {

444 U_STRING_INIT(suboption_00, "non-ignorable", 13);

445 U_STRING_INIT(suboption_01, "shifted", 7);

446

447 U_STRING_INIT(suboption_02, "lower", 5);

448 U_STRING_INIT(suboption_03, "upper", 5);

449 U_STRING_INIT(suboption_04, "off", 3);

450 U_STRING_INIT(suboption_05, "on", 2);

451

452 U_STRING_INIT(suboption_06, "1", 1);

453 U_STRING_INIT(suboption_07, "2", 1);

454 U_STRING_INIT(suboption_08, "3", 1);

455 U_STRING_INIT(suboption_09, "4", 1);

456 U_STRING_INIT(suboption_10, "I", 1);

457

458 U_STRING_INIT(suboption_11, "primary", 7);

459 U_STRING_INIT(suboption_12, "secondary", 9);

460 U_STRING_INIT(suboption_13, "tertiary", 8);

461 U_STRING_INIT(suboption_14, "variable", 8);

462 U_STRING_INIT(suboption_15, "regular", 7);

463 U_STRING_INIT(suboption_16, "implicit", 8);

464 U_STRING_INIT(suboption_17, "trailing", 8);

465

466

467 U_STRING_INIT(option_00, "undefined", 9);

468 U_STRING_INIT(option_01, "rearrange", 9);

469 U_STRING_INIT(option_02, "alternate", 9);

470 U_STRING_INIT(option_03, "backwards", 9);

471 U_STRING_INIT(option_04, "variable top", 12);

472 U_STRING_INIT(option_05, "top", 3);

473 U_STRING_INIT(option_06, "normalization", 13);

474 U_STRING_INIT(option_07, "caseLevel", 9);

475 U_STRING_INIT(option_08, "caseFirst", 9);

476 U_STRING_INIT(option_09, "scriptOrder", 11);

477 U_STRING_INIT(option_10, "charsetname", 11);

478 U_STRING_INIT(option_11, "charset", 7);

479 U_STRING_INIT(option_12, "before", 6);

480 U_STRING_INIT(option_13, "hiraganaQ", 9);

481 U_STRING_INIT(option_14, "strength", 8);

482 U_STRING_INIT(option_15, "first", 5);

483 U_STRING_INIT(option_16, "last", 4);

484 U_STRING_INIT(option_17, "optimize", 8);

485 U_STRING_INIT(option_18, "suppressContractions", 20);

486 U_STRING_INIT(option_19, "numericOrdering", 15);

487 U_STRING_INIT(option_20, "import ", 6);

488 U_STRING_INIT(option_21, "reorder", 7);

489 didInit = TRUE;

490 }

491 }

492

493

494 // This function reads basic options to set in the runtime collator

495 // used by data driven tests. Should not support build time options

496 U_CAPI const UChar * U_EXPORT2

497 ucol_tok_getNextArgument(const UChar start, const UChar end,

498 UColAttribute attrib, UColAttributeValue value,

499 UErrorCode *status)

500 {

501 uint32_t i = 0;

502 int32_t j=0;

503 UBool foundOption = FALSE;

504 const UChar *optionArg = NULL;

505

506 ucol_uprv_tok_initData();

507

508 while(start < end && PatternProps::isWhiteSpace(start)) { / eat whitespace */

509 start++;

510 }

511 if(start >= end) {

512 return NULL;

513 }

514 /* skip opening '[' */

515 if(*start == 0x005b) {

516 start++;

517 } else {

518 *status = U_ILLEGAL_ARGUMENT_ERROR; // no opening '['

519 return NULL;

520 }

521

522 while(i < UTOK_OPTION_COUNT) {

523 if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].op tionLen) == 0) {

524 foundOption = TRUE;

525 if(end - start > rulesOptions[i].optionLen) {

526 optionArg = start+rulesOptions[i].optionLen+1; /* start of the o ptions, skip space */

527 while(PatternProps::isWhiteSpace(optionArg)) { / eat whitespac e */

528 optionArg++;

529 }

530 }

531 break;

532 }

533 i++;

534 }

535

536 if(!foundOption) {

537 *status = U_ILLEGAL_ARGUMENT_ERROR;

538 return NULL;

539 }

540

541 if(optionArg) {

542 for(j = 0; j<rulesOptions[i].subSize; j++) {

543 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, ru lesOptions[i].subopts[j].subLen) == 0) {

544 //ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr , rulesOptions[i].subopts[j].attrVal);

545 *attrib = rulesOptions[i].attr;

546 *value = rulesOptions[i].subopts[j].attrVal;

547 optionArg += rulesOptions[i].subopts[j].subLen;

548 while(PatternProps::isWhiteSpace(optionArg)) { / eat whitespac e */

549 optionArg++;

550 }

551 if(*optionArg == 0x005d) {

552 optionArg++;

553 return optionArg;

554 } else {

555 *status = U_ILLEGAL_ARGUMENT_ERROR;

556 return NULL;

557 }

558 }

559 }

560 }

561 *status = U_ILLEGAL_ARGUMENT_ERROR;

562 return NULL;

563 }

564

565 static

566 USet ucol_uprv_tok_readAndSetUnicodeSet(const UChar start, const UChar end, U ErrorCode status) {

567 while(start != 0x005b) { / advance while we find the first '[' */

568 start++;

569 }

570 // now we need to get a balanced set of '[]'. The problem is that a set can have

571 // many, and *end point to the first closing '['

572 int32_t noOpenBraces = 1;

573 int32_t current = 1; // skip the opening brace

574 while(start+current < end && noOpenBraces != 0) {

575 if(start[current] == 0x005b) {

576 noOpenBraces++;

577 } else if(start[current] == 0x005D) { // closing brace

578 noOpenBraces--;

579 }

580 current++;

581 }

582

583 if(noOpenBraces != 0 \|\| u_strchr(start+current, 0x005d /']'/) == NULL) {

584 *status = U_ILLEGAL_ARGUMENT_ERROR;

585 return NULL;

586 }

587 return uset_openPattern(start, current, status);

588 }

589

590 /**

591 * Reads an option and matches the option name with the predefined options. (Cas e-insensitive.)

592 * @param start Pointer to the start UChar.

593 * @param end Pointer to the last valid pointer beyond which the option will not extend.

594 * @param optionArg Address of the pointer at which the options start (after the option name)

595 * @return The index of the option, or -1 if the option is not valid.

596 */

597 static

598 int32_t ucol_uprv_tok_readOption(const UChar start, const UChar end, const UCh ar **optionArg) {

599 int32_t i = 0;

600 ucol_uprv_tok_initData();

601

602 while(PatternProps::isWhiteSpace(start)) { / eat whitespace */

603 start++;

604 }

605 while(i < UTOK_OPTION_COUNT) {

606 if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].op tionLen) == 0) {

607 if(end - start > rulesOptions[i].optionLen) {

608 optionArg = start+rulesOptions[i].optionLen; / End of option n ame; start of the options */

609 while(PatternProps::isWhiteSpace(*optionArg)) { / eat whitespa ce */

610 (*optionArg)++;

611 }

612 }

613 break;

614 }

615 i++;

616 }

617 if(i == UTOK_OPTION_COUNT) {

618 i = -1; // didn't find an option

619 }

620 return i;

621 }

622

623

624 static

625 void ucol_tok_parseScriptReorder(UColTokenParser src, UErrorCode status) {

626 int32_t codeCount = 0;

627 int32_t codeIndex = 0;

628 char conversion[64];

629 int32_t tokenLength = 0;

630 const UChar* space;

631

632 const UChar* current = src->current;

633 const UChar* end = u_memchr(src->current, 0x005d, src->end - src->current);

634

635 // eat leading whitespace

636 while(current < end && u_isWhitespace(*current)) {

637 current++;

638 }

639

640 while(current < end) {

641 space = u_memchr(current, 0x0020, end - current);

642 space = space == 0 ? end : space;

643 tokenLength = space - current;

644 if (tokenLength < 4) {

645 *status = U_INVALID_FORMAT_ERROR;

646 return;

647 }

648 codeCount++;

649 current += tokenLength;

650 while(current < end && u_isWhitespace(current)) { / eat whitespace */

651 ++current;

652 }

653 }

654

655 if (codeCount == 0) {

656 *status = U_INVALID_FORMAT_ERROR;

657 }

658

659 src->reorderCodesLength = codeCount;

660 src->reorderCodes = (int32_t)uprv_malloc(codeCount sizeof(int32_t));

661 current = src->current;

662

663 // eat leading whitespace

664 while(current < end && u_isWhitespace(*current)) {

665 current++;

666 }

667

668 while(current < end) {

669 space = u_memchr(current, 0x0020, end - current);

670 space = space == 0 ? end : space;

671 tokenLength = space - current;

672 if (tokenLength < 4) {

673 *status = U_ILLEGAL_ARGUMENT_ERROR;

674 return;

675 } else {

676 u_UCharsToChars(current, conversion, tokenLength);

677 conversion[tokenLength] = '\0';

678 src->reorderCodes[codeIndex] = ucol_findReorderingEntry(conversion);

679 if (src->reorderCodes[codeIndex] == USCRIPT_INVALID_CODE) {

680 src->reorderCodes[codeIndex] = u_getPropertyValueEnum(UCHAR_SCRI PT, conversion);

681 }

682 if (src->reorderCodes[codeIndex] == USCRIPT_INVALID_CODE) {

683 *status = U_ILLEGAL_ARGUMENT_ERROR;

684 }

685 }

686 codeIndex++;

687 current += tokenLength;

688 while(current < end && u_isWhitespace(current)) { / eat whitespace */

689 ++current;

690 }

691 }

692 }

693

694 // reads and conforms to various options in rules

695 // end is the position of the first closing ']'

696 // However, some of the options take an UnicodeSet definition

697 // which needs to duplicate the closing ']'

698 // for example: '[copy [\uAC00-\uD7FF]]'

699 // These options will move end to the second ']' and the

700 // caller will set the current to it.

701 static

702 uint8_t ucol_uprv_tok_readAndSetOption(UColTokenParser src, UErrorCode status) {

703 const UChar* start = src->current;

704 int32_t i = 0;

705 int32_t j=0;

706 const UChar *optionArg = NULL;

707

708 uint8_t result = 0;

709

710 start++; /skip opening '['/

711 i = ucol_uprv_tok_readOption(start, src->end, &optionArg);

712 if(optionArg) {

713 src->current = optionArg;

714 }

715

716 if(i < 0) {

717 *status = U_ILLEGAL_ARGUMENT_ERROR;

718 } else {

719 int32_t noOpenBraces = 1;

720 switch(i) {

721 case OPTION_ALTERNATE_HANDLING:

722 case OPTION_FRENCH_COLLATION:

723 case OPTION_CASE_LEVEL:

724 case OPTION_CASE_FIRST:

725 case OPTION_NORMALIZATION_MODE:

726 case OPTION_HIRAGANA_QUATERNARY:

727 case OPTION_STRENGTH:

728 case OPTION_NUMERIC_COLLATION:

729 if(optionArg) {

730 for(j = 0; j<rulesOptions[i].subSize; j++) {

731 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName , rulesOptions[i].subopts[j].subLen) == 0) {

732 ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].at tr, rulesOptions[i].subopts[j].attrVal);

733 result = UCOL_TOK_SUCCESS;

734 }

735 }

736 }

737 if(result == 0) {

738 *status = U_ILLEGAL_ARGUMENT_ERROR;

739 }

740 break;

741 case OPTION_VARIABLE_TOP:

742 result = UCOL_TOK_SUCCESS \| UCOL_TOK_VARIABLE_TOP;

743 break;

744 case OPTION_REARRANGE:

745 result = UCOL_TOK_SUCCESS;

746 break;

747 case OPTION_BEFORE:

748 if(optionArg) {

749 for(j = 0; j<rulesOptions[i].subSize; j++) {

750 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName , rulesOptions[i].subopts[j].subLen) == 0) {

751 result = UCOL_TOK_SUCCESS \| (rulesOptions[i].subopts[j].attr Val + 1);

752 }

753 }

754 }

755 if(result == 0) {

756 *status = U_ILLEGAL_ARGUMENT_ERROR;

757 }

758 break;

759 case OPTION_TOP: /* we are going to have an array with structures of limit C Es */

760 /* index to this array will be src->parsedToken.indirectIndex*/

761 src->parsedToken.indirectIndex = 0;

762 result = UCOL_TOK_SUCCESS \| UCOL_TOK_TOP;

763 break;

764 case OPTION_FIRST:

765 case OPTION_LAST: /* first, last */

766 for(j = 0; j<rulesOptions[i].subSize; j++) {

767 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, ru lesOptions[i].subopts[j].subLen) == 0) {

768 // the calculation below assumes that OPTION_FIRST and OPTION_LA ST are at i and i+1 and that the first

769 // element of indirect boundaries is reserved for top.

770 src->parsedToken.indirectIndex = (uint16_t)(i-OPTION_FIRST+1+j*2 );

771 result = UCOL_TOK_SUCCESS \| UCOL_TOK_TOP;;

772 }

773 }

774 if(result == 0) {

775 *status = U_ILLEGAL_ARGUMENT_ERROR;

776 }

777 break;

778 case OPTION_OPTIMIZE:

779 case OPTION_SUPPRESS_CONTRACTIONS: // copy and remove are handled before no rmalization

780 // we need to move end here

781 src->current++; // skip opening brace

782 while(src->current < src->end && noOpenBraces != 0) {

783 if(*src->current == 0x005b) {

784 noOpenBraces++;

785 } else if(*src->current == 0x005D) { // closing brace

786 noOpenBraces--;

787 }

788 src->current++;

789 }

790 result = UCOL_TOK_SUCCESS;

791 break;

792 case OPTION_SCRIPTREORDER:

793 ucol_tok_parseScriptReorder(src, status);

794 break;

795 default:

796 *status = U_UNSUPPORTED_ERROR;

797 break;

798 }

799 }

800 src->current = u_memchr(src->current, 0x005d, (int32_t)(src->end-src->curren t));

801 return result;

802 }

803

804

805 inline void ucol_tok_addToExtraCurrent(UColTokenParser src, const UChar stuff, int32_t len, UErrorCode *status) {

806 if (stuff == NULL \|\| len <= 0) {

807 return;

808 }

809 UnicodeString tempStuff(FALSE, stuff, len);

810 if(src->extraCurrent+len >= src->extraEnd) {

811 /* reallocate */

812 if (stuff >= src->source && stuff <= src->end) {

813 // Copy the "stuff" contents into tempStuff's own buffer.

814 // UnicodeString is copy-on-write.

815 if (len > 0) {

816 tempStuff.setCharAt(0, tempStuff[0]);

817 } else {

818 tempStuff.remove();

819 }

820 }

821 UChar newSrc = (UChar )uprv_realloc(src->source, (src->extraEnd-src->s ource)2sizeof(UChar));

822 if(newSrc != NULL) {

823 src->current = newSrc + (src->current - src->source);

824 src->extraCurrent = newSrc + (src->extraCurrent - src->source);

825 src->end = newSrc + (src->end - src->source);

826 src->extraEnd = newSrc + (src->extraEnd-src->source)*2;

827 src->sourceCurrent = newSrc + (src->sourceCurrent-src->source);

828 src->source = newSrc;

829 } else {

830 *status = U_MEMORY_ALLOCATION_ERROR;

831 return;

832 }

833 }

834 if(len == 1) {

835 *src->extraCurrent++ = tempStuff[0];

836 } else {

837 u_memcpy(src->extraCurrent, tempStuff.getBuffer(), len);

838 src->extraCurrent += len;

839 }

840 }

841

842 inline UBool ucol_tok_doSetTop(UColTokenParser src, UErrorCode status) {

843 /*

844 top = TRUE;

845 */

846 UChar buff[5];

847 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);

848 buff[0] = 0xFFFE;

849 buff[1] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].sta rtCE >> 16);

850 buff[2] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].sta rtCE & 0xFFFF);

851 if(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE == 0) {

852 src->parsedToken.charsLen = 3;

853 ucol_tok_addToExtraCurrent(src, buff, 3, status);

854 } else {

855 buff[3] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex] .startContCE >> 16);

856 buff[4] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex] .startContCE & 0xFFFF);

857 src->parsedToken.charsLen = 5;

858 ucol_tok_addToExtraCurrent(src, buff, 5, status);

859 }

860 return TRUE;

861 }

862

863 static UBool isCharNewLine(UChar c){

864 switch(c){

865 case 0x000A: /* LF */

866 case 0x000D: /* CR */

867 case 0x000C: /* FF */

868 case 0x0085: /* NEL */

869 case 0x2028: /* LS */

870 case 0x2029: /* PS */

871 return TRUE;

872 default:

873 return FALSE;

874 }

875 }

876

877 /*

878 * This function is called several times when a range is processed. Each time, the next code point

879 * is processed.

880 * The following variables must be set before calling this function:

881 * src->currentRangeCp: The current code point to process.

882 * src->lastRangeCp: The last code point in the range.

883 * Pre-requisite: src->currentRangeCp <= src->lastRangeCp.

884 */

885 static const UChar*

886 ucol_tok_processNextCodePointInRange(UColTokenParser *src,

887 UErrorCode *status)

888 {

889 // Append current code point to source

890 UChar buff[U16_MAX_LENGTH];

891 uint32_t i = 0;

892

893 uint32_t nChars = U16_LENGTH(src->currentRangeCp);

894 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);

895 src->parsedToken.charsLen = nChars;

896

897 U16_APPEND_UNSAFE(buff, i, src->currentRangeCp);

898 ucol_tok_addToExtraCurrent(src, buff, nChars, status);

899

900 ++src->currentRangeCp;

901 if (src->currentRangeCp > src->lastRangeCp) {

902 src->inRange = FALSE;

903

904 if (src->currentStarredCharIndex > src->lastStarredCharIndex) {

905 src->isStarred = FALSE;

906 }

907 } else {

908 src->previousCp = src->currentRangeCp;

909 }

910 return src->current;

911 }

912

913 /*

914 * This function is called several times when a starred list is processed. Each time, the next code point

915 * in the list is processed.

916 * The following variables must be set before calling this function:

917 * src->currentStarredCharIndex: Index (in src->source) of the first char of the current code point.

918 * src->lastStarredCharIndex: Index to the last character in the list.

919 * Pre-requisite: src->currentStarredCharIndex <= src->lastStarredCharIndex.

920 */

921 static const UChar*

922 ucol_tok_processNextTokenInStarredList(UColTokenParser *src)

923 {

924 // Extract the characters corresponding to the next code point.

925 UChar32 cp;

926 src->parsedToken.charsOffset = src->currentStarredCharIndex;

927 int32_t prev = src->currentStarredCharIndex;

928 U16_NEXT(src->source, src->currentStarredCharIndex, (uint32_t)(src->end - src- >source), cp);

929 src->parsedToken.charsLen = src->currentStarredCharIndex - prev;

930

931 // When we are done parsing the starred string, turn the flag off so that

932 // the normal processing is restored.

933 if (src->currentStarredCharIndex > src->lastStarredCharIndex) {

934 src->isStarred = FALSE;

935 }

936 src->previousCp = cp;

937 return src->current;

938 }

939

940 /*

941 * Partially parses the next token, keeps the indices in src->parsedToken, and u pdates the counters.

942 *

943 * This routine parses and separates almost all tokens. The following are the sy ntax characters recognized.

944 * # : Comment character

945 * & : Reset operator

946 * = : Equality

947 * < : Primary collation

948 * << : Secondary collation

949 * <<< : Tertiary collation

950 * ; : Secondary collation

951 * , : Tertiary collation

952 * / : Expansions

953 * \| : Prefix

954 * - : Range

955

956 * ! : Java Thai modifier, ignored

957 * @ : French only

958

959 * [] : Options

960 * '' : Quotes

961 *

962 * Along with operators =, <, <<, <<<, the operator * is supported to indicate a list. For example, &a<*bcdexyz

963 * is equivalent to &a<b<c<d<e<x<y<z. In lists, ranges also can be given, so & a*b-ex-z is equivalent to the above.

964 * This function do not separate the tokens in a list. Instead, &a<*b-ex-z is parsed as three tokens - "&a",

965 * "<*b", "-ex", "-z". The strength (< in this case), whether in a list, wheth er in a range and the previous

966 * character returned as cached so that the calling program can do further spli tting.

967 */

968 static const UChar*

969 ucol_tok_parseNextTokenInternal(UColTokenParser *src,

970 UBool startOfRules,

971 UParseError *parseError,

972 UErrorCode *status)

973 {

974 UBool variableTop = FALSE;

975 UBool top = FALSE;

976 UBool inChars = TRUE;

977 UBool inQuote = FALSE;

978 UBool wasInQuote = FALSE;

979 uint8_t before = 0;

980 UBool isEscaped = FALSE;

981

982 // TODO: replace these variables with src->parsedToken counterparts

983 // no need to use them anymore since we have src->parsedToken.

984 // Ideally, token parser would be a nice class... Once, when I have

985 // more time (around 2020 probably).

986 uint32_t newExtensionLen = 0;

987 uint32_t extensionOffset = 0;

988 uint32_t newStrength = UCOL_TOK_UNSET;

989 UChar buff[10];

990

991 src->parsedToken.charsOffset = 0; src->parsedToken.charsLen = 0;

992 src->parsedToken.prefixOffset = 0; src->parsedToken.prefixLen = 0;

993 src->parsedToken.indirectIndex = 0;

994

995 while (src->current < src->end) {

996 UChar ch = *(src->current);

997

998 if (inQuote) {

999 if (ch == 0x0027/'\''/) {

1000 inQuote = FALSE;

1001 } else {

1002 if ((src->parsedToken.charsLen == 0) \|\| inChars) {

1003 if(src->parsedToken.charsLen == 0) {

1004 src->parsedToken.charsOffset = (uint32_t)(src->extraCurr ent - src->source);

1005 }

1006 src->parsedToken.charsLen++;

1007 } else {

1008 if(newExtensionLen == 0) {

1009 extensionOffset = (uint32_t)(src->extraCurrent - src->so urce);

1010 }

1011 newExtensionLen++;

1012 }

1013 }

1014 }else if(isEscaped){

1015 isEscaped =FALSE;

1016 if (newStrength == UCOL_TOK_UNSET) {

1017 *status = U_INVALID_FORMAT_ERROR;

1018 syntaxError(src->source,(int32_t)(src->current-src->source),(int 32_t)(src->end-src->source),parseError);

1019 DBG_FORMAT_ERROR

1020 return NULL;

1021 // enabling rules to start with non-tokens a < b

1022 // newStrength = UCOL_TOK_RESET;

1023 }

1024 if(ch != 0x0000 && src->current != src->end) {

1025 if (inChars) {

1026 if(src->parsedToken.charsLen == 0) {

1027 src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);

1028 }

1029 src->parsedToken.charsLen++;

1030 } else {

1031 if(newExtensionLen == 0) {

1032 extensionOffset = (uint32_t)(src->current - src->source) ;

1033 }

1034 newExtensionLen++;

1035 }

1036 }

1037 }else {

1038 if(!PatternProps::isWhiteSpace(ch)) {

1039 /* Sets the strength for this entry */

1040 switch (ch) {

1041 case 0x003D/'='/ :

1042 if (newStrength != UCOL_TOK_UNSET) {

1043 goto EndOfLoop;

1044 }

1045

1046 /* if we start with strength, we'll reset to top */

1047 if(startOfRules == TRUE) {

1048 src->parsedToken.indirectIndex = 5;

1049 top = ucol_tok_doSetTop(src, status);

1050 newStrength = UCOL_TOK_RESET;

1051 goto EndOfLoop;

1052 }

1053 newStrength = UCOL_IDENTICAL;

1054 if((src->current+1) == 0x002A) {/''/

1055 src->current++;

1056 src->isStarred = TRUE;

1057 }

1058 break;

1059

1060 case 0x002C/','/:

1061 if (newStrength != UCOL_TOK_UNSET) {

1062 goto EndOfLoop;

1063 }

1064

1065 /* if we start with strength, we'll reset to top */

1066 if(startOfRules == TRUE) {

1067 src->parsedToken.indirectIndex = 5;

1068 top = ucol_tok_doSetTop(src, status);

1069 newStrength = UCOL_TOK_RESET;

1070 goto EndOfLoop;

1071 }

1072 newStrength = UCOL_TERTIARY;

1073 break;

1074

1075 case 0x003B/';'/:

1076 if (newStrength != UCOL_TOK_UNSET) {

1077 goto EndOfLoop;

1078 }

1079

1080 /* if we start with strength, we'll reset to top */

1081 if(startOfRules == TRUE) {

1082 src->parsedToken.indirectIndex = 5;

1083 top = ucol_tok_doSetTop(src, status);

1084 newStrength = UCOL_TOK_RESET;

1085 goto EndOfLoop;

1086 }

1087 newStrength = UCOL_SECONDARY;

1088 break;

1089

1090 case 0x003C/'<'/:

1091 if (newStrength != UCOL_TOK_UNSET) {

1092 goto EndOfLoop;

1093 }

1094

1095 /* if we start with strength, we'll reset to top */

1096 if(startOfRules == TRUE) {

1097 src->parsedToken.indirectIndex = 5;

1098 top = ucol_tok_doSetTop(src, status);

1099 newStrength = UCOL_TOK_RESET;

1100 goto EndOfLoop;

1101 }

1102 /* before this, do a scan to verify whether this is */

1103 /* another strength */

1104 if(*(src->current+1) == 0x003C) {

1105 src->current++;

1106 if(*(src->current+1) == 0x003C) {

1107 src->current++; /* three in a row! */

1108 newStrength = UCOL_TERTIARY;

1109 } else { /* two in a row */

1110 newStrength = UCOL_SECONDARY;

1111 }

1112 } else { /* just one */

1113 newStrength = UCOL_PRIMARY;

1114 }

1115 if((src->current+1) == 0x002A) {/''/

1116 src->current++;

1117 src->isStarred = TRUE;

1118 }

1119 break;

1120

1121 case 0x0026/'&'/:

1122 if (newStrength != UCOL_TOK_UNSET) {

1123 /**/

1124 goto EndOfLoop;

1125 }

1126

1127 newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */

1128 break;

1129

1130 case 0x005b/'['/:

1131 /* options - read an option, analyze it */

1132 if(u_strchr(src->current, 0x005d /']'/) != NULL) {

1133 uint8_t result = ucol_uprv_tok_readAndSetOption(src, sta tus);

1134 if(U_SUCCESS(*status)) {

1135 if(result & UCOL_TOK_TOP) {

1136 if(newStrength == UCOL_TOK_RESET) {

1137 top = ucol_tok_doSetTop(src, status);

1138 if(before) { // This is a combination of bef ore and indirection like '&[before 2][first regular]<b'

1139 src->parsedToken.charsLen+=2;

1140 buff[0] = 0x002d;

1141 buff[1] = before;

1142 ucol_tok_addToExtraCurrent(src, buff, 2, status);

1143 }

1144

1145 src->current++;

1146 goto EndOfLoop;

1147 } else {

1148 *status = U_INVALID_FORMAT_ERROR;

1149 syntaxError(src->source,(int32_t)(src->curre nt-src->source),(int32_t)(src->end-src->source),parseError);

1150 DBG_FORMAT_ERROR

1151 }

1152 } else if(result & UCOL_TOK_VARIABLE_TOP) {

1153 if(newStrength != UCOL_TOK_RESET && newStrength != UCOL_TOK_UNSET) {

1154 variableTop = TRUE;

1155 src->parsedToken.charsOffset = (uint32_t)(sr c->extraCurrent - src->source);

1156 src->parsedToken.charsLen = 1;

1157 buff[0] = 0xFFFF;

1158 ucol_tok_addToExtraCurrent(src, buff, 1, sta tus);

1159 src->current++;

1160 goto EndOfLoop;

1161 } else {

1162 *status = U_INVALID_FORMAT_ERROR;

1163 syntaxError(src->source,(int32_t)(src->curre nt-src->source),(int32_t)(src->end-src->source),parseError);

1164 DBG_FORMAT_ERROR

1165 }

1166 } else if (result & UCOL_TOK_BEFORE){

1167 if(newStrength == UCOL_TOK_RESET) {

1168 before = result & UCOL_TOK_BEFORE;

1169 } else {

1170 *status = U_INVALID_FORMAT_ERROR;

1171 syntaxError(src->source,(int32_t)(src->curre nt-src->source),(int32_t)(src->end-src->source),parseError);

1172 DBG_FORMAT_ERROR

1173 }

1174 }

1175 } else {

1176 *status = U_INVALID_FORMAT_ERROR;

1177 syntaxError(src->source,(int32_t)(src->current-src-> source),(int32_t)(src->end-src->source),parseError);

1178 DBG_FORMAT_ERROR

1179 return NULL;

1180 }

1181 }

1182 break;

1183 case 0x0021/! skip java thai modifier reordering/:

1184 break;

1185 case 0x002F/'/'/:

1186 wasInQuote = FALSE; /* if we were copying source characters, we want to stop now */

1187 inChars = FALSE; /* we're now processing expansion */

1188 break;

1189 case 0x005C /* back slash for escaped chars */:

1190 isEscaped = TRUE;

1191 break;

1192 /* found a quote, we're gonna start copying */

1193 case 0x0027/'\''/:

1194 if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal unt il we have a strength */

1195 *status = U_INVALID_FORMAT_ERROR;

1196 syntaxError(src->source,(int32_t)(src->current-src->source ),(int32_t)(src->end-src->source),parseError);

1197 DBG_FORMAT_ERROR

1198 return NULL;

1199 // enabling rules to start with a non-token character a < b

1200 // newStrength = UCOL_TOK_RESET;

1201 }

1202

1203 inQuote = TRUE;

1204

1205 if(inChars) { /* we're doing characters */

1206 if(wasInQuote == FALSE) {

1207 src->parsedToken.charsOffset = (uint32_t)(src->extra Current - src->source);

1208 }

1209 if (src->parsedToken.charsLen != 0) {

1210 ucol_tok_addToExtraCurrent(src, src->current - src-> parsedToken.charsLen, src->parsedToken.charsLen, status);

1211 }

1212 src->parsedToken.charsLen++;

1213 } else { /* we're doing an expansion */

1214 if(wasInQuote == FALSE) {

1215 extensionOffset = (uint32_t)(src->extraCurrent - src ->source);

1216 }

1217 if (newExtensionLen != 0) {

1218 ucol_tok_addToExtraCurrent(src, src->current - newEx tensionLen, newExtensionLen, status);

1219 }

1220 newExtensionLen++;

1221 }

1222

1223 wasInQuote = TRUE;

1224

1225 ch = *(++(src->current));

1226 if(ch == 0x0027) { /* copy the double quote */

1227 ucol_tok_addToExtraCurrent(src, &ch, 1, status);

1228 inQuote = FALSE;

1229 }

1230 break;

1231

1232 /* '@' is french only if the strength is not currently set * /

1233 /* if it is, it's just a regular character in collation rule s */

1234 case 0x0040/'@'/:

1235 if (newStrength == UCOL_TOK_UNSET) {

1236 src->opts->frenchCollation = UCOL_ON;

1237 break;

1238 }

1239

1240 case 0x007C /\|/: /* this means we have actually been reading p refix part */

1241 // we want to store read characters to the prefix part and c ontinue reading

1242 // the characters (proper way would be to restart reading th e chars, but in

1243 // that case we would have to complicate the token hasher, w hich I do not

1244 // intend to play with. Instead, we will do prefixes when pr efixes are due

1245 // (before adding the elements).

1246 src->parsedToken.prefixOffset = src->parsedToken.charsOffset ;

1247 src->parsedToken.prefixLen = src->parsedToken.charsLen;

1248

1249 if(inChars) { /* we're doing characters */

1250 if(wasInQuote == FALSE) {

1251 src->parsedToken.charsOffset = (uint32_t)(src->extra Current - src->source);

1252 }

1253 if (src->parsedToken.charsLen != 0) {

1254 ucol_tok_addToExtraCurrent(src, src->current - src-> parsedToken.charsLen, src->parsedToken.charsLen, status);

1255 }

1256 src->parsedToken.charsLen++;

1257 }

1258

1259 wasInQuote = TRUE;

1260

1261 do {

1262 ch = *(++(src->current));

1263 // skip whitespace between '\|' and the character

1264 } while (PatternProps::isWhiteSpace(ch));

1265 break;

1266

1267 //charsOffset = 0;

1268 //newCharsLen = 0;

1269 //break; // We want to store the whole prefix/character sequ ence. If we break

1270 // the '\|' is going to get lost.

1271

1272 case 0x002D /-/: /* A range. */

1273 if (newStrength != UCOL_TOK_UNSET) {

1274 // While processing the pending token, the isStarred field

1275 // is reset, so it needs to be saved for the next

1276 // invocation.

1277 src->savedIsStarred = src->isStarred;

1278 goto EndOfLoop;

1279 }

1280 src->isStarred = src->savedIsStarred;

1281

1282 // Ranges are valid only in starred tokens.

1283 if (!src->isStarred) {

1284 *status = U_INVALID_FORMAT_ERROR;

1285 syntaxError(src->source,(int32_t)(src->current-src->source) ,(int32_t)(src->end-src->source),parseError);

1286 DBG_FORMAT_ERROR

1287 return NULL;

1288 }

1289 newStrength = src->parsedToken.strength;

1290 src->inRange = TRUE;

1291 break;

1292

1293 case 0x0023 /#/: /* this is a comment, skip everything through the end of line */

1294 do {

1295 ch = *(++(src->current));

1296 } while (!isCharNewLine(ch));

1297

1298 break;

1299 default:

1300 if (newStrength == UCOL_TOK_UNSET) {

1301 *status = U_INVALID_FORMAT_ERROR;

1302 syntaxError(src->source,(int32_t)(src->current-src->source ),(int32_t)(src->end-src->source),parseError);

1303 DBG_FORMAT_ERROR

1304 return NULL;

1305 }

1306

1307 if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) {

1308 *status = U_INVALID_FORMAT_ERROR;

1309 syntaxError(src->source,(int32_t)(src->current-src->sour ce),(int32_t)(src->end-src->source),parseError);

1310 DBG_FORMAT_ERROR

1311 return NULL;

1312 }

1313

1314 if(ch == 0x0000 && src->current+1 == src->end) {

1315 break;

1316 }

1317

1318 if (inChars) {

1319 if(src->parsedToken.charsLen == 0) {

1320 src->parsedToken.charsOffset = (uint32_t)(src->curre nt - src->source);

1321 }

1322 src->parsedToken.charsLen++;

1323 } else {

1324 if(newExtensionLen == 0) {

1325 extensionOffset = (uint32_t)(src->current - src->sou rce);

1326 }

1327 newExtensionLen++;

1328 }

1329

1330 break;

1331 }

1332 }

1333 }

1334

1335 if(wasInQuote) {

1336 if(ch != 0x27) {

1337 if(inQuote \|\| !PatternProps::isWhiteSpace(ch)) {

1338 ucol_tok_addToExtraCurrent(src, &ch, 1, status);

1339 }

1340 }

1341 }

1342

1343 src->current++;

1344 }

1345

1346 EndOfLoop:

1347 wasInQuote = FALSE;

1348 if (newStrength == UCOL_TOK_UNSET) {

1349 return NULL;

1350 }

1351

1352 if (src->parsedToken.charsLen == 0 && top == FALSE) {

1353 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(sr c->end-src->source),parseError);

1354 *status = U_INVALID_FORMAT_ERROR;

1355 DBG_FORMAT_ERROR

1356 return NULL;

1357 }

1358

1359 src->parsedToken.strength = newStrength;

1360 src->parsedToken.extensionOffset = extensionOffset;

1361 src->parsedToken.extensionLen = newExtensionLen;

1362 src->parsedToken.flags = (UCOL_TOK_VARIABLE_TOP * (variableTop?1:0)) \| (UCOL _TOK_TOP * (top?1:0)) \| before;

1363

1364 return src->current;

1365 }

1366

1367 /*

1368 * Parses the next token, keeps the indices in src->parsedToken, and updates the counters.

1369 * @see ucol_tok_parseNextTokenInternal() for the description of what operators are supported.

1370 *

1371 * In addition to what ucol_tok_parseNextTokenInternal() does, this function doe s the following:

1372 * 1) ucol_tok_parseNextTokenInternal() returns a range as a single token. Thi s function separates

1373 * it to separate tokens and returns one by one. In order to do that, the n ecessary states are

1374 * cached as member variables of the token parser.

1375 * 2) When encountering a range, ucol_tok_parseNextTokenInternal() processes ch aracters up to the

1376 * starting character as a single list token (which is separated into indivi dual characters here)

1377 * and as another list token starting with the last character in the range. Before expanding it

1378 * as a list of tokens, this function expands the range by filling the inter mediate characters and

1379 * returns them one by one as separate tokens.

1380 * Necessary checks are done for invalid combinations.

1381 */

1382 U_CAPI const UChar* U_EXPORT2

1383 ucol_tok_parseNextToken(UColTokenParser *src,

1384 UBool startOfRules,

1385 UParseError *parseError,

1386 UErrorCode *status)

1387 {

1388 const UChar *nextToken;

1389

1390 if (src->inRange) {

1391 // We are not done processing a range. Continue it.

1392 return ucol_tok_processNextCodePointInRange(src, status);

1393 } else if (src->isStarred) {

1394 // We are not done processing a starred token. Continue it.

1395 return ucol_tok_processNextTokenInStarredList(src);

1396 }

1397

1398 // Get the next token.

1399 nextToken = ucol_tok_parseNextTokenInternal(src, startOfRules, parseError, sta tus);

1400

1401 if (nextToken == NULL) {

1402 return NULL;

1403 }

1404

1405 if (src->inRange) {

1406 // A new range has started.

1407 // Check whether it is a chain of ranges with more than one hyphen.

1408 if (src->lastRangeCp > 0 && src->lastRangeCp == src->previousCp) {

1409 *status = U_INVALID_FORMAT_ERROR;

1410 syntaxError(src->source,src->parsedToken.charsOffset-1,

1411 src->parsedToken.charsOffset+src->parsedToken.charsLen, pars eError);

1412 DBG_FORMAT_ERROR

1413 return NULL;

1414 }

1415

1416 // The current token indicates the second code point of the range.

1417 // Process just that, and then proceed with the star.

1418 src->currentStarredCharIndex = src->parsedToken.charsOffset;

1419 U16_NEXT(src->source, src->currentStarredCharIndex,

1420 (uint32_t)(src->end - src->source), src->lastRangeCp);

1421 if (src->lastRangeCp <= src->previousCp) {

1422 *status = U_INVALID_FORMAT_ERROR;

1423 syntaxError(src->source,src->parsedToken.charsOffset-1,

1424 src->parsedToken.charsOffset+src->parsedToken.charsLen,parse Error);

1425 DBG_FORMAT_ERROR

1426 return NULL;

1427 }

1428

1429 // Set current range code point to process the range loop

1430 src->currentRangeCp = src->previousCp + 1;

1431

1432 src->lastStarredCharIndex = src->parsedToken.charsOffset + src->parsedToken. charsLen - 1;

1433

1434 return ucol_tok_processNextCodePointInRange(src, status);

1435 } else if (src->isStarred) {

1436 // We define two indices m_currentStarredCharIndex_ and m_lastStarredCharInd ex_ so that

1437 // [m_currentStarredCharIndex_ .. m_lastStarredCharIndex_], both inclusive, need to be

1438 // separated into several tokens and returned.

1439 src->currentStarredCharIndex = src->parsedToken.charsOffset;

1440 src->lastStarredCharIndex = src->parsedToken.charsOffset + src->parsedToken .charsLen - 1;

1441

1442 return ucol_tok_processNextTokenInStarredList(src);

1443 } else {

1444 // Set previous codepoint

1445 U16_GET(src->source, 0, src->parsedToken.charsOffset, (uint32_t)(src->end - src->source), src->previousCp);

1446 }

1447 return nextToken;

1448 }

1449

1450

1451 /*

1452 Processing Description

1453 1 Build a ListList. Each list has a header, which contains two lists (positive

1454 and negative), a reset token, a baseCE, nextCE, and previousCE. The lists and

1455 reset may be null.

1456 2 As you process, you keep a LAST pointer that points to the last token you

1457 handled.

1458

1459 */

1460

1461 static UColToken ucol_tok_initAReset(UColTokenParser src, const UChar expand, uint32_t expandNext,

1462 UParseError parseError, UErrorCode statu s)

1463 {

1464 if(src->resultLen == src->listCapacity) {

1465 // Unfortunately, this won't work, as we store addresses of lhs in token

1466 src->listCapacity *= 2;

1467 src->lh = (UColTokListHeader )uprv_realloc(src->lh, src->listCapacitys izeof(UColTokListHeader));

1468 if(src->lh == NULL) {

1469 *status = U_MEMORY_ALLOCATION_ERROR;

1470 return NULL;

1471 }

1472 }

1473 /* do the reset thing */

1474 UColToken sourceToken = (UColToken )uprv_malloc(sizeof(UColToken));

1475 /* test for NULL */

1476 if (sourceToken == NULL) {

1477 *status = U_MEMORY_ALLOCATION_ERROR;

1478 return NULL;

1479 }

1480 sourceToken->rulesToParseHdl = &(src->source);

1481 sourceToken->source = src->parsedToken.charsLen << 24 \| src->parsedToken.cha rsOffset;

1482 sourceToken->expansion = src->parsedToken.extensionLen << 24 \| src->parsedTo ken.extensionOffset;

1483

1484 sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);

1485 sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffs et);

1486

1487 // keep the flags around so that we know about before

1488 sourceToken->flags = src->parsedToken.flags;

1489

1490 if(src->parsedToken.prefixOffset != 0) {

1491 // this is a syntax error

1492 *status = U_INVALID_FORMAT_ERROR;

1493 syntaxError(src->source,src->parsedToken.charsOffset-1,src->parsedToken. charsOffset+src->parsedToken.charsLen,parseError);

1494 DBG_FORMAT_ERROR

1495 uprv_free(sourceToken);

1496 return 0;

1497 } else {

1498 sourceToken->prefix = 0;

1499 }

1500

1501 sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should als o handle reverse */

1502 sourceToken->strength = UCOL_TOK_RESET;

1503 sourceToken->next = NULL;

1504 sourceToken->previous = NULL;

1505 sourceToken->noOfCEs = 0;

1506 sourceToken->noOfExpCEs = 0;

1507 sourceToken->listHeader = &src->lh[src->resultLen];

1508

1509 src->lh[src->resultLen].first = NULL;

1510 src->lh[src->resultLen].last = NULL;

1511 src->lh[src->resultLen].first = NULL;

1512 src->lh[src->resultLen].last = NULL;

1513

1514 src->lh[src->resultLen].reset = sourceToken;

1515

1516 /*

1517 3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...

1518 First convert all expansions into normal form. Examples:

1519 If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *

1520 d * ... into &x * c/y * d * ...

1521 Note: reset values can never have expansions, although they can cause the

1522 very next item to have one. They may be contractions, if they are found

1523 earlier in the list.

1524 */

1525 *expandNext = 0;

1526 if(expand != NULL) {

1527 /* check to see if there is an expansion */

1528 if(src->parsedToken.charsLen > 1) {

1529 uint32_t resetCharsOffset;

1530 resetCharsOffset = (uint32_t)(expand - src->source);

1531 sourceToken->source = ((resetCharsOffset - src->parsedToken.charsOff set ) << 24) \| src->parsedToken.charsOffset;

1532 *expandNext = ((src->parsedToken.charsLen + src->parsedToken.charsOf fset - resetCharsOffset)<<24) \| (resetCharsOffset);

1533 }

1534 }

1535

1536 src->resultLen++;

1537

1538 uhash_put(src->tailored, sourceToken, sourceToken, status);

1539

1540 return sourceToken;

1541 }

1542

1543 static

1544 inline UColToken getVirginBefore(UColTokenParser src, UColToken sourceToken, uint8_t strength, UParseError parseError, UErrorCode *status) {

1545 if(U_FAILURE(*status)) {

1546 return NULL;

1547 }

1548 /* this is a virgin before - we need to fish the anchor from the UCA */

1549 collIterate s;

1550 uint32_t baseCE = UCOL_NOT_FOUND, baseContCE = UCOL_NOT_FOUND;

1551 uint32_t CE, SecondCE;

1552 // uint32_t invPos;

1553 if(sourceToken != NULL) {

1554 uprv_init_collIterate(src->UCA, src->source+((sourceToken->source)&0xFFF FFF), 1, &s, status);

1555 } else {

1556 uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset /*charsOffset/, 1, &s, status);

1557 }

1558 if(U_FAILURE(*status)) {

1559 return NULL;

1560 }

1561

1562 baseCE = ucol_getNextCE(src->UCA, &s, status) & 0xFFFFFF3F;

1563 baseContCE = ucol_getNextCE(src->UCA, &s, status);

1564 if(baseContCE == UCOL_NO_MORE_CES) {

1565 baseContCE = 0;

1566 }

1567

1568

1569 UCAConstants consts = (UCAConstants )((uint8_t *)src->UCA->image + src->UC A->image->UCAConsts);

1570 uint32_t ch = 0;

1571 uint32_t expandNext = 0;

1572 UColToken key;

1573

1574 if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseC E & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */

1575 uint32_t primary = (baseCE & UCOL_PRIMARYMASK) \| ((baseContCE & UCOL_PRI MARYMASK) >> 16);

1576 uint32_t raw = uprv_uca_getRawFromImplicit(primary);

1577 ch = uprv_uca_getCodePointFromRaw(raw-1);

1578 uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);

1579 CE = (primaryCE & UCOL_PRIMARYMASK) \| 0x0505;

1580 SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) \| UCOL_CONTINUATION_MA RKER;

1581

1582 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->sourc e);

1583 *src->extraCurrent++ = 0xFFFE;

1584 *src->extraCurrent++ = (UChar)ch;

1585 src->parsedToken.charsLen++;

1586

1587 key.source = (src->parsedToken.charsLen/*newCharsLen/ << 24) \| src->pa rsedToken.charsOffset/*charsOffset/;

1588 key.rulesToParseHdl = &(src->source);

1589

1590 //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);

1591 sourceToken = (UColToken *)uhash_get(src->tailored, &key);

1592

1593 if(sourceToken == NULL) {

1594 src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;

1595 if(isContinuation(SecondCE)) {

1596 src->lh[src->resultLen].baseContCE = SecondCE;

1597 } else {

1598 src->lh[src->resultLen].baseContCE = 0;

1599 }

1600 src->lh[src->resultLen].nextCE = 0;

1601 src->lh[src->resultLen].nextContCE = 0;

1602 src->lh[src->resultLen].previousCE = 0;

1603 src->lh[src->resultLen].previousContCE = 0;

1604

1605 src->lh[src->resultLen].indirect = FALSE;

1606

1607 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, s tatus);

1608 }

1609

1610 } else {

1611 /* invPos = */ ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondC E, strength);

1612

1613 // we got the previous CE. Now we need to see if the difference between

1614 // the two CEs is really of the requested strength.

1615 // if it's a bigger difference (we asked for secondary and got primary), we

1616 // need to modify the CE.

1617 if(ucol_getCEStrengthDifference(baseCE, baseContCE, CE, SecondCE) < stre ngth) {

1618 // adjust the strength

1619 // now we are in the situation where our baseCE should actually be m odified in

1620 // order to get the CE in the right position.

1621 if(strength == UCOL_SECONDARY) {

1622 CE = baseCE - 0x0200;

1623 } else { // strength == UCOL_TERTIARY

1624 CE = baseCE - 0x02;

1625 }

1626 if(baseContCE) {

1627 if(strength == UCOL_SECONDARY) {

1628 SecondCE = baseContCE - 0x0200;

1629 } else { // strength == UCOL_TERTIARY

1630 SecondCE = baseContCE - 0x02;

1631 }

1632 }

1633 }

1634

1635 #if 0

1636 // the code below relies on getting a code point from the inverse table, in order to be

1637 // able to merge the situations like &x < 9 &[before 1]a < d. This won't work:

1638 // 1. There are many code points that have the same CE

1639 // 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2 ] are broken.

1640 // Also, in case when there is no equivalent strength before an element, we have to actually

1641 // construct one. For example, &[before 2]a << x won't result in x << a, because the element

1642 // before a is a primary difference.

1643

1644 //uint32_t CETable = (uint32_t )((uint8_t *)src->invUCA+src->invUCA->t able);

1645

1646

1647 ch = CETable[3*invPos+2];

1648

1649 if((ch & UCOL_INV_SIZEMASK) != 0) {

1650 uint16_t conts = (uint16_t )((uint8_t *)src->invUCA+src->invUCA->c onts);

1651 uint32_t offset = (ch & UCOL_INV_OFFSETMASK);

1652 ch = conts[offset];

1653 }

1654

1655 *src->extraCurrent++ = (UChar)ch;

1656 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->sourc e - 1);

1657 src->parsedToken.charsLen = 1;

1658

1659 // We got an UCA before. However, this might have been tailored.

1660 // example:

1661 // &\u30ca = \u306a

1662 // &[before 3]\u306a<<<\u306a\|\u309d

1663

1664

1665 // uint32_t key = (newCharsLen << 24) \| charsOffset;

1666 key.source = (src->parsedToken.charsLen/*newCharsLen/ << 24) \| src->pa rsedToken.charsOffset/*charsOffset/;

1667 key.rulesToParseHdl = &(src->source);

1668

1669 //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);

1670 sourceToken = (UColToken *)uhash_get(src->tailored, &key);

1671 #endif

1672

1673 // here is how it should be. The situation such as &[before 1]a < x, sho uld be

1674 // resolved exactly as if we wrote &a > x.

1675 // therefore, I don't really care if the UCA value before a has been cha nged.

1676 // However, I do care if the strength between my element and the previou s element

1677 // is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2], then i'll

1678 // have to construct the base CE.

1679

1680

1681

1682 // if we found a tailored thing, we have to use the UCA value and constr uct

1683 // a new reset token with constructed name

1684 //if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {

1685 // character to which we want to anchor is already tailored.

1686 // We need to construct a new token which will be the anchor

1687 // point

1688 //*(src->extraCurrent-1) = 0xFFFE;

1689 //*src->extraCurrent++ = (UChar)ch;

1690 // grab before

1691 src->parsedToken.charsOffset -= 10;

1692 src->parsedToken.charsLen += 10;

1693 src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;

1694 if(isContinuation(SecondCE)) {

1695 src->lh[src->resultLen].baseContCE = SecondCE;

1696 } else {

1697 src->lh[src->resultLen].baseContCE = 0;

1698 }

1699 src->lh[src->resultLen].nextCE = 0;

1700 src->lh[src->resultLen].nextContCE = 0;

1701 src->lh[src->resultLen].previousCE = 0;

1702 src->lh[src->resultLen].previousContCE = 0;

1703

1704 src->lh[src->resultLen].indirect = FALSE;

1705

1706 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, statu s);

1707 //}

1708 }

1709

1710 return sourceToken;

1711

1712 }

1713

1714 uint32_t ucol_tok_assembleTokenList(UColTokenParser src, UParseError parseErro r, UErrorCode *status) {

1715 UColToken *lastToken = NULL;

1716 const UChar *parseEnd = NULL;

1717 uint32_t expandNext = 0;

1718 UBool variableTop = FALSE;

1719 UBool top = FALSE;

1720 uint16_t specs = 0;

1721 UColTokListHeader *ListList = NULL;

1722

1723 src->parsedToken.strength = UCOL_TOK_UNSET;

1724

1725 ListList = src->lh;

1726

1727 if(U_FAILURE(*status)) {

1728 return 0;

1729 }

1730 #ifdef DEBUG_FOR_CODE_POINTS

1731 char filename[35];

1732 sprintf(filename, "/tmp/debug_for_cp_%09d.txt", getpid());

1733 dfcp_fp = fopen(filename, "a");

1734 fprintf(stdout, "Output is in the file %s.\n", filename);

1735 #endif

1736

1737 #ifdef DEBUG_FOR_COLL_RULES

1738 std::string s3;

1739 UnicodeString(src->source).toUTF8String(s3);

1740 std::cout << "src->source = " << s3 << std::endl;

1741 #endif

1742

1743 while(src->current < src->end \|\| src->isStarred) {

1744 src->parsedToken.prefixOffset = 0;

1745

1746 parseEnd = ucol_tok_parseNextToken(src,

1747 (UBool)(lastToken == NULL),

1748 parseError,

1749 status);

1750

1751 specs = src->parsedToken.flags;

1752

1753

1754 variableTop = ((specs & UCOL_TOK_VARIABLE_TOP) != 0);

1755 top = ((specs & UCOL_TOK_TOP) != 0);

1756

1757 if(U_SUCCESS(*status) && parseEnd != NULL) {

1758 UColToken *sourceToken = NULL;

1759 //uint32_t key = 0;

1760 uint32_t lastStrength = UCOL_TOK_UNSET;

1761

1762 if(lastToken != NULL ) {

1763 lastStrength = lastToken->strength;

1764 }

1765

1766 #ifdef DEBUG_FOR_CODE_POINTS

1767 UChar32 cp;

1768 U16_GET(src->source, 0, src->parsedToken.charsOffset, (uint32_t)(src ->extraEnd - src->source), cp);

1769 fprintf(dfcp_fp, "Code point = %x, Strength = %x\n", cp, src->parsed Token.strength);

1770 #endif

1771 //key = newCharsLen << 24 \| charsOffset;

1772 UColToken key;

1773 key.source = src->parsedToken.charsLen << 24 \| src->parsedToken.char sOffset;

1774 key.rulesToParseHdl = &(src->source);

1775

1776 /* 4 Lookup each source in the CharsToToken map, and find a sourceT oken */

1777 sourceToken = (UColToken *)uhash_get(src->tailored, &key);

1778

1779 if(src->parsedToken.strength != UCOL_TOK_RESET) {

1780 if(lastToken == NULL) { /* this means that rules haven't started properly */

1781 *status = U_INVALID_FORMAT_ERROR;

1782 syntaxError(src->source,0,(int32_t)(src->end-src->source),pa rseError);

1783 DBG_FORMAT_ERROR

1784 return 0;

1785 }

1786 /* 6 Otherwise (when relation != reset) */

1787 if(sourceToken == NULL) {

1788 /* If sourceToken is null, create new one, */

1789 sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));

1790 /* test for NULL */

1791 if (sourceToken == NULL) {

1792 *status = U_MEMORY_ALLOCATION_ERROR;

1793 return 0;

1794 }

1795 sourceToken->rulesToParseHdl = &(src->source);

1796 sourceToken->source = src->parsedToken.charsLen << 24 \| src- >parsedToken.charsOffset;

1797

1798 sourceToken->debugSource = *(src->source + src->parsedToken. charsOffset);

1799

1800 sourceToken->prefix = src->parsedToken.prefixLen << 24 \| src ->parsedToken.prefixOffset;

1801 sourceToken->debugPrefix = *(src->source + src->parsedToken. prefixOffset);

1802

1803 sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */

1804 sourceToken->next = NULL;

1805 sourceToken->previous = NULL;

1806 sourceToken->noOfCEs = 0;

1807 sourceToken->noOfExpCEs = 0;

1808 // keep the flags around so that we know about before

1809 sourceToken->flags = src->parsedToken.flags;

1810 uhash_put(src->tailored, sourceToken, sourceToken, status);

1811 if(U_FAILURE(*status)) {

1812 return 0;

1813 }

1814 } else {

1815 /* we could have fished out a reset here */

1816 if(sourceToken->strength != UCOL_TOK_RESET && lastToken != s ourceToken) {

1817 /* otherwise remove sourceToken from where it was. */

1818 if(sourceToken->next != NULL) {

1819 if(sourceToken->next->strength > sourceToken->streng th) {

1820 sourceToken->next->strength = sourceToken->stren gth;

1821 }

1822 sourceToken->next->previous = sourceToken->previous;

1823 } else {

1824 sourceToken->listHeader->last = sourceToken->previou s;

1825 }

1826

1827 if(sourceToken->previous != NULL) {

1828 sourceToken->previous->next = sourceToken->next;

1829 } else {

1830 sourceToken->listHeader->first = sourceToken->next;

1831 }

1832 sourceToken->next = NULL;

1833 sourceToken->previous = NULL;

1834 }

1835 }

1836

1837 sourceToken->strength = src->parsedToken.strength;

1838 sourceToken->listHeader = lastToken->listHeader;

1839

1840 /*

1841 1. Find the strongest strength in each list, and set strongestP and strongestN

1842 accordingly in the headers.

1843 */

1844 if(lastStrength == UCOL_TOK_RESET

1845 \|\| sourceToken->listHeader->first == 0) {

1846 /* If LAST is a reset

1847 insert sourceToken in the list. */

1848 if(sourceToken->listHeader->first == 0) {

1849 sourceToken->listHeader->first = sourceToken;

1850 sourceToken->listHeader->last = sourceToken;

1851 } else { /* we need to find a place for us */

1852 /* and we'll get in front of the same strength */

1853 if(sourceToken->listHeader->first->strength <= sourc eToken->strength) {

1854 sourceToken->next = sourceToken->listHeader->fir st;

1855 sourceToken->next->previous = sourceToken;

1856 sourceToken->listHeader->first = sourceToken;

1857 sourceToken->previous = NULL;

1858 } else {

1859 lastToken = sourceToken->listHeader->first;

1860 while(lastToken->next != NULL && lastToken->next ->strength > sourceToken->strength) {

1861 lastToken = lastToken->next;

1862 }

1863 if(lastToken->next != NULL) {

1864 lastToken->next->previous = sourceToken;

1865 } else {

1866 sourceToken->listHeader->last = sourceToken;

1867 }

1868 sourceToken->previous = lastToken;

1869 sourceToken->next = lastToken->next;

1870 lastToken->next = sourceToken;

1871 }

1872 }

1873 } else {

1874 /* Otherwise (when LAST is not a reset)

1875 if polarity (LAST) == polarity(relation), insert sourceT oken after LAST,

1876 otherwise insert before.

1877 when inserting after or before, search to the next posit ion with the same

1878 strength in that direction. (This is called postpone ins ertion). */

1879 if(sourceToken != lastToken) {

1880 if(lastToken->polarity == sourceToken->polarity) {

1881 while(lastToken->next != NULL && lastToken->next ->strength > sourceToken->strength) {

1882 lastToken = lastToken->next;

1883 }

1884 sourceToken->previous = lastToken;

1885 if(lastToken->next != NULL) {

1886 lastToken->next->previous = sourceToken;

1887 } else {

1888 sourceToken->listHeader->last = sourceToken;

1889 }

1890

1891 sourceToken->next = lastToken->next;

1892 lastToken->next = sourceToken;

1893 } else {

1894 while(lastToken->previous != NULL && lastToken-> previous->strength > sourceToken->strength) {

1895 lastToken = lastToken->previous;

1896 }

1897 sourceToken->next = lastToken;

1898 if(lastToken->previous != NULL) {

1899 lastToken->previous->next = sourceToken;

1900 } else {

1901 sourceToken->listHeader->first = sourceToken ;

1902 }

1903 sourceToken->previous = lastToken->previous;

1904 lastToken->previous = sourceToken;

1905 }

1906 } else { /* repeated one thing twice in rules, stay with the stronger strength */

1907 if(lastStrength < sourceToken->strength) {

1908 sourceToken->strength = lastStrength;

1909 }

1910 }

1911 }

1912

1913 /* if the token was a variable top, we're gonna put it in */

1914 if(variableTop == TRUE && src->varTop == NULL) {

1915 variableTop = FALSE;

1916 src->varTop = sourceToken;

1917 }

1918

1919 // Treat the expansions.

1920 // There are two types of expansions: explicit (x / y) and r eset based propagating expansions

1921 // (&abc * d * e <=> &ab * d / c * e / c)

1922 // if both of them are in effect for a token, they are combi ned.

1923

1924 sourceToken->expansion = src->parsedToken.extensionLen << 24 \| src->parsedToken.extensionOffset;

1925

1926 if(expandNext != 0) {

1927 if(sourceToken->strength == UCOL_PRIMARY) { /* primary s trength kills off the implicit expansion */

1928 expandNext = 0;

1929 } else if(sourceToken->expansion == 0) { /* if there is no expansion, implicit is just added to the token */

1930 sourceToken->expansion = expandNext;

1931 } else { /* there is both explicit and implicit expansio n. We need to make a combination */

1932 uprv_memcpy(src->extraCurrent, src->source + (expand Next & 0xFFFFFF), (expandNext >> 24)*sizeof(UChar));

1933 uprv_memcpy(src->extraCurrent+(expandNext >> 24), sr c->source + src->parsedToken.extensionOffset, src->parsedToken.extensionLen*size of(UChar));

1934 sourceToken->expansion = (uint32_t)(((expandNext >> 24) + src->parsedToken.extensionLen)<<24 \| (uint32_t)(src->extraCurrent - src->s ource));

1935 src->extraCurrent += (expandNext >> 24) + src->parse dToken.extensionLen;

1936 }

1937 }

1938

1939 // This is just for debugging purposes

1940 if(sourceToken->expansion != 0) {

1941 sourceToken->debugExpansion = *(src->source + src->parse dToken.extensionOffset);

1942 } else {

1943 sourceToken->debugExpansion = 0;

1944 }

1945 // if the previous token was a reset before, the strength of this

1946 // token must match the strength of before. Otherwise we hav e an

1947 // undefined situation.

1948 // In other words, we currently have a cludge which we use t o

1949 // represent &a >> x. This is written as &[before 2]a << x.

1950 if((lastToken->flags & UCOL_TOK_BEFORE) != 0) {

1951 uint8_t beforeStrength = (lastToken->flags & UCOL_TOK_BE FORE) - 1;

1952 if(beforeStrength != sourceToken->strength) {

1953 *status = U_INVALID_FORMAT_ERROR;

1954 syntaxError(src->source,0,(int32_t)(src->end-src->so urce),parseError);

1955 DBG_FORMAT_ERROR

1956 return 0;

1957 }

1958 }

1959 } else {

1960 if(lastToken != NULL && lastStrength == UCOL_TOK_RESET) {

1961 /* if the previous token was also a reset, */

1962 /this means that we have two consecutive resets /

1963 /* and we want to remove the previous one if empty*/

1964 if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {

1965 src->resultLen--;

1966 }

1967 }

1968

1969 if(sourceToken == NULL) { /* this is a reset, but it might still be somewhere in the tailoring, in shorter form */

1970 uint32_t searchCharsLen = src->parsedToken.charsLen;

1971 while(searchCharsLen > 1 && sourceToken == NULL) {

1972 searchCharsLen--;

1973 //key = searchCharsLen << 24 \| charsOffset;

1974 UColToken key;

1975 key.source = searchCharsLen << 24 \| src->parsedToken.cha rsOffset;

1976 key.rulesToParseHdl = &(src->source);

1977 sourceToken = (UColToken *)uhash_get(src->tailored, &key );

1978 }

1979 if(sourceToken != NULL) {

1980 expandNext = (src->parsedToken.charsLen - searchCharsLen ) << 24 \| (src->parsedToken.charsOffset + searchCharsLen);

1981 }

1982 }

1983

1984 if((specs & UCOL_TOK_BEFORE) != 0) { /* we're doing before */

1985 if(top == FALSE) { /* there is no indirection */

1986 uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;

1987 if(sourceToken != NULL && sourceToken->strength != UCOL_ TOK_RESET) {

1988 /* this is a before that is already ordered in the U CA - so we need to get the previous with good strength */

1989 while(sourceToken->strength > strength && sourceToke n->previous != NULL) {

1990 sourceToken = sourceToken->previous;

1991 }

1992 /* here, either we hit the strength or NULL */

1993 if(sourceToken->strength == strength) {

1994 if(sourceToken->previous != NULL) {

1995 sourceToken = sourceToken->previous;

1996 } else { /* start of list */

1997 sourceToken = sourceToken->listHeader->reset ;

1998 }

1999 } else { /* we hit NULL */

2000 /* we should be doing the else part */

2001 sourceToken = sourceToken->listHeader->reset;

2002 sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);

2003 }

2004 } else {

2005 sourceToken = getVirginBefore(src, sourceToken, stre ngth, parseError, status);

2006 }

2007 } else { /* this is both before and indirection */

2008 top = FALSE;

2009 ListList[src->resultLen].previousCE = 0;

2010 ListList[src->resultLen].previousContCE = 0;

2011 ListList[src->resultLen].indirect = TRUE;

2012 /* we need to do slightly more work. we need to get the baseCE using the */

2013 /* inverse UCA & getPrevious. The next bound is not set, and will be decided */

2014 /* in ucol_bld */

2015 uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;

2016 uint32_t baseCE = ucolIndirectBoundaries[src->parsedToke n.indirectIndex].startCE;

2017 uint32_t baseContCE = ucolIndirectBoundaries[src->parsed Token.indirectIndex].startContCE;//&0xFFFFFF3F;

2018 uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;

2019

2020 UCAConstants consts = (UCAConstants )((uint8_t *)src-> UCA->image + src->UCA->image->UCAConsts);

2021 if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICI T_MIN<<24) &&

2022 (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICI T_MAX<<24) ) { /* implicits - */

2023 uint32_t primary = (baseCE & UCOL_PRIMARYMASK) \| ((b aseContCE & UCOL_PRIMARYMASK) >> 16);

2024 uint32_t raw = uprv_uca_getRawFromImplicit(primary);

2025 uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw -1);

2026 CE = (primaryCE & UCOL_PRIMARYMASK) \| 0x0505;

2027 SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) \| UCOL_CONTINUATION_MARKER;

2028 } else {

2029 /int32_t invPos = ucol_inv_getPrevCE(baseCE, baseCo ntCE, &CE, &SecondCE, strength);/

2030 ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &Se condCE, strength);

2031 }

2032

2033 ListList[src->resultLen].baseCE = CE;

2034 ListList[src->resultLen].baseContCE = SecondCE;

2035 ListList[src->resultLen].nextCE = 0;

2036 ListList[src->resultLen].nextContCE = 0;

2037

2038 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, p arseError, status);

2039 }

2040 }

2041

2042

2043 /* 5 If the relation is a reset:

2044 If sourceToken is null

2045 Create new list, create new sourceToken, make the baseCE from so urce, put

2046 the sourceToken in ListHeader of the new list */

2047 if(sourceToken == NULL) {

2048 /*

2049 3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...

2050 First convert all expansions into normal form. Examples:

2051 If "xy" doesn't occur earlier in the list or in the UCA, con vert &xy * c *

2052 d * ... into &x * c/y * d * ...

2053 Note: reset values can never have expansions, although they can cause the

2054 very next item to have one. They may be contractions, if the y are found

2055 earlier in the list.

2056 */

2057 if(top == FALSE) {

2058 collIterate s;

2059 uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;

2060

2061 uprv_init_collIterate(src->UCA, src->source+src->parsedT oken.charsOffset, src->parsedToken.charsLen, &s, status);

2062

2063 CE = ucol_getNextCE(src->UCA, &s, status);

2064 const UChar *expand = s.pos;

2065 SecondCE = ucol_getNextCE(src->UCA, &s, status);

2066

2067 ListList[src->resultLen].baseCE = CE & 0xFFFFFF3F;

2068 if(isContinuation(SecondCE)) {

2069 ListList[src->resultLen].baseContCE = SecondCE;

2070 } else {

2071 ListList[src->resultLen].baseContCE = 0;

2072 }

2073 ListList[src->resultLen].nextCE = 0;

2074 ListList[src->resultLen].nextContCE = 0;

2075 ListList[src->resultLen].previousCE = 0;

2076 ListList[src->resultLen].previousContCE = 0;

2077 ListList[src->resultLen].indirect = FALSE;

2078 sourceToken = ucol_tok_initAReset(src, expand, &expandNe xt, parseError, status);

2079 } else { /* top == TRUE */

2080 /* just use the supplied values */

2081 top = FALSE;

2082 ListList[src->resultLen].previousCE = 0;

2083 ListList[src->resultLen].previousContCE = 0;

2084 ListList[src->resultLen].indirect = TRUE;

2085 ListList[src->resultLen].baseCE = ucolIndirectBoundaries [src->parsedToken.indirectIndex].startCE;

2086 ListList[src->resultLen].baseContCE = ucolIndirectBounda ries[src->parsedToken.indirectIndex].startContCE;

2087 ListList[src->resultLen].nextCE = ucolIndirectBoundaries [src->parsedToken.indirectIndex].limitCE;

2088 ListList[src->resultLen].nextContCE = ucolIndirectBounda ries[src->parsedToken.indirectIndex].limitContCE;

2089

2090 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, p arseError, status);

2091

2092 }

2093 } else { /* reset to something already in rules */

2094 top = FALSE;

2095 }

2096 }

2097 /* 7 After all this, set LAST to point to sourceToken, and goto ste p 3. */

2098 lastToken = sourceToken;

2099 } else {

2100 if(U_FAILURE(*status)) {

2101 return 0;

2102 }

2103 }

2104 }

2105 #ifdef DEBUG_FOR_CODE_POINTS

2106 fclose(dfcp_fp);

2107 #endif

2108

2109

2110 if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {

2111 src->resultLen--;

2112 }

2113 return src->resultLen;

2114 }

2115

2116 const UChar* ucol_tok_getRulesFromBundle(

2117 void* /context/,

2118 const char* locale,

2119 const char* type,

2120 int32_t* pLength,

2121 UErrorCode* status)

2122 {

2123 const UChar* rules = NULL;

2124 UResourceBundle* bundle;

2125 UResourceBundle* collations;

2126 UResourceBundle* collation;

2127

2128 *pLength = 0;

2129

2130 bundle = ures_open(U_ICUDATA_COLL, locale, status);

2131 if(U_SUCCESS(*status)){

2132 collations = ures_getByKey(bundle, "collations", NULL, status);

2133 if(U_SUCCESS(*status)){

2134 collation = ures_getByKey(collations, type, NULL, status);

2135 if(U_SUCCESS(*status)){

2136 rules = ures_getStringByKey(collation, "Sequence", pLength, stat us);

2137 if(U_FAILURE(*status)){

2138 *pLength = 0;

2139 rules = NULL;

2140 }

2141 ures_close(collation);

2142 }

2143 ures_close(collations);

2144 }

2145 }

2146

2147 ures_close(bundle);

2148

2149 return rules;

2150 }

2151

2152 void ucol_tok_initTokenList(

2153 UColTokenParser *src,

2154 const UChar *rules,

2155 uint32_t rulesLength,

2156 const UCollator *UCA,

2157 GetCollationRulesFunction importFunc,

2158 void* context,

2159 UErrorCode *status) {

2160 U_NAMESPACE_USE

2161

2162 uint32_t nSize = 0;

2163 uint32_t estimatedSize = (2*rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE);

2164

2165 bool needToDeallocRules = false;

2166

2167 if(U_FAILURE(*status)) {

2168 return;

2169 }

2170

2171 // set everything to zero, so that we can clean up gracefully

2172 uprv_memset(src, 0, sizeof(UColTokenParser));

2173

2174 // first we need to find options that don't like to be normalized,

2175 // like copy and remove...

2176 //const UChar *openBrace = rules;

2177 int32_t optionNumber = -1;

2178 const UChar *setStart = NULL;

2179 uint32_t i = 0;

2180 while(i < rulesLength) {

2181 if(rules[i] == 0x005B) { // '[': start of an option

2182 /* Gets the following:

2183 optionNumber: The index of the option.

2184 setStart: The pointer at which the option arguments start.

2185 */

2186 optionNumber = ucol_uprv_tok_readOption(rules+i+1, rules+rulesLength , &setStart);

2187

2188 if(optionNumber == OPTION_OPTIMIZE) { /* copy - parts of UCA to tail oring */

2189 // [optimize]

2190 USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rule s+rulesLength, status);

2191 if(U_SUCCESS(*status)) {

2192 if(src->copySet == NULL) {

2193 src->copySet = newSet;

2194 } else {

2195 uset_addAll(src->copySet, newSet);

2196 uset_close(newSet);

2197 }

2198 } else {

2199 return;

2200 }

2201 } else if(optionNumber == OPTION_SUPPRESS_CONTRACTIONS) {

2202 USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rule s+rulesLength, status);

2203 if(U_SUCCESS(*status)) {

2204 if(src->removeSet == NULL) {

2205 src->removeSet = newSet;

2206 } else {

2207 uset_addAll(src->removeSet, newSet);

2208 uset_close(newSet);

2209 }

2210 } else {

2211 return;

2212 }

2213 } else if(optionNumber == OPTION_IMPORT){

2214 // [import <collation-name>]

2215

2216 // Find the address of the closing ].

2217 UChar* import_end = u_strchr(setStart, 0x005D);

2218 int32_t optionEndOffset = (int32_t)(import_end + 1 - rules);

2219 // Ignore trailing whitespace.

2220 while(PatternProps::isWhiteSpace(*(import_end-1))) {

2221 --import_end;

2222 }

2223

2224 int32_t optionLength = (int32_t)(import_end - setStart);

2225 char option[50];

2226 if(optionLength >= (int32_t)sizeof(option)) {

2227 *status = U_ILLEGAL_ARGUMENT_ERROR;

2228 return;

2229 }

2230 u_UCharsToChars(setStart, option, optionLength);

2231 option[optionLength] = 0;

2232

2233 *status = U_ZERO_ERROR;

2234 char locale[50];

2235 int32_t templ;

2236 uloc_forLanguageTag(option, locale, (int32_t)sizeof(locale), &te mpl, status);

2237 if(U_FAILURE(*status)) {

2238 *status = U_ILLEGAL_ARGUMENT_ERROR;

2239 return;

2240 }

2241

2242 char type[50];

2243 if (uloc_getKeywordValue(locale, "collation", type, (int32_t)siz eof(type), status) <= 0 \|\|

2244 U_FAILURE(*status)

2245 ) {

2246 *status = U_ZERO_ERROR;

2247 uprv_strcpy(type, "standard");

2248 }

2249

2250 // TODO: Use public functions when available, see ticket #8134.

2251 char keywords = (char )locale_getKeywordsStart(locale);

2252 if(keywords != NULL) {

2253 *keywords = 0;

2254 }

2255

2256 int32_t importRulesLength = 0;

2257 const UChar* importRules = importFunc(context, locale, type, &im portRulesLength, status);

2258

2259 #ifdef DEBUG_FOR_COLL_RULES

2260 std::string s;

2261 UnicodeString(importRules).toUTF8String(s);

2262 std::cout << "Import rules = " << s << std::endl;

2263 #endif

2264

2265 // Add the length of the imported rules to length of the origina l rules,

2266 // and subtract the length of the import option.

2267 uint32_t newRulesLength = rulesLength + importRulesLength - (opt ionEndOffset - i);

2268

2269 UChar* newRules = (UChar)uprv_malloc(newRulesLengthsizeof(UCha r));

2270

2271 #ifdef DEBUG_FOR_COLL_RULES

2272 std::string s1;

2273 UnicodeString(rules).toUTF8String(s1);

2274 std::cout << "Original rules = " << s1 << std::endl;

2275 #endif

2276

2277

2278 // Copy the section of the original rules leading up to the impo rt

2279 uprv_memcpy(newRules, rules, i*sizeof(UChar));

2280 // Copy the imported rules

2281 uprv_memcpy(newRules+i, importRules, importRulesLength*sizeof(UC har));

2282 // Copy the rest of the original rules (minus the import option itself)

2283 uprv_memcpy(newRules+i+importRulesLength,

2284 rules+optionEndOffset,

2285 (rulesLength-optionEndOffset)*sizeof(UChar));

2286

2287 #ifdef DEBUG_FOR_COLL_RULES

2288 std::string s2;

2289 UnicodeString(newRules).toUTF8String(s2);

2290 std::cout << "Resulting rules = " << s2 << std::endl;

2291 #endif

2292

2293 if(needToDeallocRules){

2294 // if needToDeallocRules is set, then we allocated rules, so it's safe to cast and free

2295 uprv_free((void*)rules);

2296 }

2297 needToDeallocRules = true;

2298 rules = newRules;

2299 rulesLength = newRulesLength;

2300

2301 estimatedSize += importRulesLength*2;

2302

2303 // First character of the new rules needs to be processed

2304 i--;

2305 }

2306 }

2307 //openBrace++;

2308 i++;

2309 }

2310

2311 src->source = (UChar )uprv_malloc(estimatedSizesizeof(UChar));

2312 /* test for NULL */

2313 if (src->source == NULL) {

2314 *status = U_MEMORY_ALLOCATION_ERROR;

2315 return;

2316 }

2317 uprv_memset(src->source, 0, estimatedSize*sizeof(UChar));

2318 nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, estim atedSize, status);

2319 if(nSize > estimatedSize \|\| *status == U_BUFFER_OVERFLOW_ERROR) {

2320 *status = U_ZERO_ERROR;

2321 src->source = (UChar )uprv_realloc(src->source, (nSize+UCOL_TOK_EXTRA_R ULE_SPACE_SIZE)sizeof(UChar));

2322 /* test for NULL */

2323 if (src->source == NULL) {

2324 *status = U_MEMORY_ALLOCATION_ERROR;

2325 return;

2326 }

2327 nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, n Size+UCOL_TOK_EXTRA_RULE_SPACE_SIZE, status);

2328 }

2329 if(needToDeallocRules){

2330 // if needToDeallocRules is set, then we allocated rules, so it's safe t o cast and free

2331 uprv_free((void*)rules);

2332 }

2333

2334

2335 src->current = src->source;

2336 src->end = src->source+nSize;

2337 src->sourceCurrent = src->source;

2338 src->extraCurrent = src->end+1; // Preserve terminating zero in the rule str ing so that option scanning works correctly

2339 src->extraEnd = src->source+estimatedSize; //src->end+UCOL_TOK_EXTRA_RULE_SP ACE_SIZE;

2340 src->varTop = NULL;

2341 src->UCA = UCA;

2342 src->invUCA = ucol_initInverseUCA(status);

2343 src->parsedToken.charsLen = 0;

2344 src->parsedToken.charsOffset = 0;

2345 src->parsedToken.extensionLen = 0;

2346 src->parsedToken.extensionOffset = 0;

2347 src->parsedToken.prefixLen = 0;

2348 src->parsedToken.prefixOffset = 0;

2349 src->parsedToken.flags = 0;

2350 src->parsedToken.strength = UCOL_TOK_UNSET;

2351 src->buildCCTabFlag = FALSE;

2352 src->isStarred = FALSE;

2353 src->inRange = FALSE;

2354 src->lastRangeCp = 0;

2355 src->previousCp = 0;

2356

2357 if(U_FAILURE(*status)) {

2358 return;

2359 }

2360 src->tailored = uhash_open(uhash_hashTokens, uhash_compareTokens, NULL, stat us);

2361 if(U_FAILURE(*status)) {

2362 return;

2363 }

2364 uhash_setValueDeleter(src->tailored, uprv_free);

2365

2366 src->opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet));

2367 /* test for NULL */

2368 if (src->opts == NULL) {

2369 *status = U_MEMORY_ALLOCATION_ERROR;

2370 return;

2371 }

2372

2373 uprv_memcpy(src->opts, UCA->options, sizeof(UColOptionSet));

2374

2375 src->lh = 0;

2376 src->listCapacity = 1024;

2377 src->lh = (UColTokListHeader )uprv_malloc(src->listCapacitysizeof(UColTokL istHeader));

2378 //Test for NULL

2379 if (src->lh == NULL) {

2380 *status = U_MEMORY_ALLOCATION_ERROR;

2381 return;

2382 }

2383 uprv_memset(src->lh, 0, src->listCapacity*sizeof(UColTokListHeader));

2384 src->resultLen = 0;

2385

2386 UCAConstants consts = (UCAConstants )((uint8_t *)src->UCA->image + src->UC A->image->UCAConsts);

2387

2388 // UCOL_RESET_TOP_VALUE

2389 setIndirectBoundaries(0, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IM PLICIT);

2390 // UCOL_FIRST_PRIMARY_IGNORABLE

2391 setIndirectBoundaries(1, consts->UCA_FIRST_PRIMARY_IGNORABLE, 0);

2392 // UCOL_LAST_PRIMARY_IGNORABLE

2393 setIndirectBoundaries(2, consts->UCA_LAST_PRIMARY_IGNORABLE, 0);

2394 // UCOL_FIRST_SECONDARY_IGNORABLE

2395 setIndirectBoundaries(3, consts->UCA_FIRST_SECONDARY_IGNORABLE, 0);

2396 // UCOL_LAST_SECONDARY_IGNORABLE

2397 setIndirectBoundaries(4, consts->UCA_LAST_SECONDARY_IGNORABLE, 0);

2398 // UCOL_FIRST_TERTIARY_IGNORABLE

2399 setIndirectBoundaries(5, consts->UCA_FIRST_TERTIARY_IGNORABLE, 0);

2400 // UCOL_LAST_TERTIARY_IGNORABLE

2401 setIndirectBoundaries(6, consts->UCA_LAST_TERTIARY_IGNORABLE, 0);

2402 // UCOL_FIRST_VARIABLE

2403 setIndirectBoundaries(7, consts->UCA_FIRST_VARIABLE, 0);

2404 // UCOL_LAST_VARIABLE

2405 setIndirectBoundaries(8, consts->UCA_LAST_VARIABLE, 0);

2406 // UCOL_FIRST_NON_VARIABLE

2407 setIndirectBoundaries(9, consts->UCA_FIRST_NON_VARIABLE, 0);

2408 // UCOL_LAST_NON_VARIABLE

2409 setIndirectBoundaries(10, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_I MPLICIT);

2410 // UCOL_FIRST_IMPLICIT

2411 setIndirectBoundaries(11, consts->UCA_FIRST_IMPLICIT, 0);

2412 // UCOL_LAST_IMPLICIT

2413 setIndirectBoundaries(12, consts->UCA_LAST_IMPLICIT, consts->UCA_FIRST_TRAIL ING);

2414 // UCOL_FIRST_TRAILING

2415 setIndirectBoundaries(13, consts->UCA_FIRST_TRAILING, 0);

2416 // UCOL_LAST_TRAILING

2417 setIndirectBoundaries(14, consts->UCA_LAST_TRAILING, 0);

2418 ucolIndirectBoundaries[14].limitCE = (consts->UCA_PRIMARY_SPECIAL_MIN<<24);

2419 }

2420

2421

2422 void ucol_tok_closeTokenList(UColTokenParser *src) {

2423 if(src->copySet != NULL) {

2424 uset_close(src->copySet);

2425 }

2426 if(src->removeSet != NULL) {

2427 uset_close(src->removeSet);

2428 }

2429 if(src->tailored != NULL) {

2430 uhash_close(src->tailored);

2431 }

2432 if(src->lh != NULL) {

2433 uprv_free(src->lh);

2434 }

2435 if(src->source != NULL) {

2436 uprv_free(src->source);

2437 }

2438 if(src->opts != NULL) {

2439 uprv_free(src->opts);

2440 }

2441 if (src->reorderCodes != NULL) {

2442 uprv_free(src->reorderCodes);

2443 }

2444 }

2445

2446 #endif /* #if !UCONFIG_NO_COLLATION */

OLD	NEW

« no previous file with comments | « source/i18n/ucol_tok.h ('k') | source/i18n/ucol_wgt.h » ('j') | no next file with comments »