OLD | NEW |
| (Empty) |
1 /******************************************************************** | |
2 * COPYRIGHT: | |
3 * Copyright (C) 2001-2011 IBM, Inc. All Rights Reserved. | |
4 * | |
5 ********************************************************************/ | |
6 /*******************************************************************************
* | |
7 * | |
8 * File dumpce.cpp | |
9 * | |
10 * Modification History: | |
11 * Name Date Description | |
12 * synwee May 31 2001 Creation | |
13 * | |
14 ********************************************************************************
* | |
15 */ | |
16 | |
17 /** | |
18 * This program outputs the collation elements used for a requested tailoring. | |
19 * | |
20 * Usage: | |
21 * dumpce options... please check main function. | |
22 */ | |
23 #include <unicode/utypes.h> | |
24 #include <unicode/ucol.h> | |
25 #include <unicode/uloc.h> | |
26 #include <unicode/ucoleitr.h> | |
27 #include <unicode/uchar.h> | |
28 #include <unicode/uscript.h> | |
29 #include <unicode/utf16.h> | |
30 #include <unicode/putil.h> | |
31 #include <unicode/ustring.h> | |
32 #include <stdio.h> | |
33 #include <stdlib.h> | |
34 #include <string.h> | |
35 #include <time.h> | |
36 #include "ucol_tok.h" | |
37 #include "cstring.h" | |
38 #include "uoptions.h" | |
39 #include "ucol_imp.h" | |
40 #include <unicode/ures.h> | |
41 #include <unicode/uniset.h> | |
42 #include <unicode/usetiter.h> | |
43 | |
44 /** | |
45 * Command line option variables. | |
46 * These global variables are set according to the options specified on the | |
47 * command line by the user. | |
48 */ | |
49 static UOption options[]={ | |
50 /* 00 */ UOPTION_HELP_H, | |
51 /* 01 */ UOPTION_HELP_QUESTION_MARK, | |
52 /* 02 */ {"locale", NULL, NULL, NULL, 'l', UOPT_REQUIRES_ARG, 0}, | |
53 /* 03 */ {"serialize", NULL, NULL, NULL, 'z', UOPT_NO_ARG, 0}, | |
54 /* 04 */ UOPTION_DESTDIR, | |
55 /* 05 */ UOPTION_SOURCEDIR, | |
56 /* 06 */ {"attribute", NULL, NULL, NULL, 'a', UOPT_REQUIRES_ARG, 0}, | |
57 /* 07 */ {"rule", NULL, NULL, NULL, 'r', UOPT_REQUIRES_ARG, 0}, | |
58 /* 08 */ {"normalization", NULL, NULL, NULL, 'n', UOPT_REQUIRES_ARG, 0}, | |
59 /* 09 */ {"scripts", NULL, NULL, NULL, 't', UOPT_NO_ARG, 0}, | |
60 /* 10 */ {"reducehan", NULL, NULL, NULL, 'e', UOPT_NO_ARG, 0}, | |
61 /* 11 */ UOPTION_VERBOSE, | |
62 /* 12 */ {"wholescripts", NULL, NULL, NULL, 'W', UOPT_NO_ARG, 0} | |
63 }; | |
64 | |
65 /** | |
66 * Collator used in this program | |
67 */ | |
68 static UCollator *COLLATOR_; | |
69 /** | |
70 * Output strea, used in this program | |
71 */ | |
72 static FILE *OUTPUT_; | |
73 | |
74 static UColAttributeValue ATTRIBUTE_[UCOL_ATTRIBUTE_COUNT] = { | |
75 UCOL_DEFAULT, UCOL_DEFAULT, UCOL_DEFAULT, UCOL_DEFAULT, UCOL_DEFAULT, | |
76 UCOL_DEFAULT, UCOL_DEFAULT, UCOL_DEFAULT, | |
77 }; | |
78 | |
/**
 * Associates an enum value with its printable name.
 * A name may list several aliases separated by '|'.
 * The name is const because the tables in this file initialize it from
 * string literals, which must not be modified.
 */
typedef struct {
    int value;
    const char *name;
} EnumNameValuePair;
83 | |
84 static const EnumNameValuePair ATTRIBUTE_NAME_[] = { | |
85 {UCOL_FRENCH_COLLATION, "UCOL_FRENCH_COLLATION"}, | |
86 {UCOL_ALTERNATE_HANDLING, "UCOL_ALTERNATE_HANDLING"}, | |
87 {UCOL_CASE_FIRST, "UCOL_CASE_FIRST"}, | |
88 {UCOL_CASE_LEVEL, "UCOL_CASE_LEVEL"}, | |
89 {UCOL_NORMALIZATION_MODE, | |
90 "UCOL_NORMALIZATION_MODE|UCOL_DECOMPOSITION_MODE"}, | |
91 {UCOL_STRENGTH, "UCOL_STRENGTH"}, | |
92 {UCOL_HIRAGANA_QUATERNARY_MODE, "UCOL_HIRAGANA_QUATERNARY_MODE"}, | |
93 {UCOL_NUMERIC_COLLATION, "UCOL_NUMERIC_COLLATION"}, | |
94 NULL | |
95 }; | |
96 | |
97 static const EnumNameValuePair ATTRIBUTE_VALUE_[] = { | |
98 {UCOL_PRIMARY, "UCOL_PRIMARY"}, | |
99 {UCOL_SECONDARY, "UCOL_SECONDARY"}, | |
100 {UCOL_TERTIARY, "UCOL_TERTIARY|UCOL_DEFAULT_STRENGTH"}, | |
101 {UCOL_QUATERNARY, "UCOL_QUATERNARY"}, | |
102 {UCOL_IDENTICAL, "UCOL_IDENTICAL"}, | |
103 {UCOL_OFF, "UCOL_OFF"}, | |
104 {UCOL_ON, "UCOL_ON"}, | |
105 {UCOL_SHIFTED, "UCOL_SHIFTED"}, | |
106 {UCOL_NON_IGNORABLE, "UCOL_NON_IGNORABLE"}, | |
107 {UCOL_LOWER_FIRST, "UCOL_LOWER_FIRST"}, | |
108 {UCOL_UPPER_FIRST, "UCOL_UPPER_FIRST"}, | |
109 NULL | |
110 }; | |
111 | |
112 typedef struct { | |
113 UChar ch[32]; | |
114 int count; // number of codepoint | |
115 UBool tailored; | |
116 } ScriptElement; | |
117 | |
118 /** | |
119 * Writes the hexadecimal of a null-terminated array of codepoints into a | |
120 * file | |
121 * @param f UFILE instance to store | |
122 * @param c codepoints array | |
123 */ | |
124 void serialize(FILE *f, const UChar *c) | |
125 { | |
126 UChar cp = *(c ++); | |
127 | |
128 fprintf(f, " %04x", cp); | |
129 | |
130 while (*c != 0) { | |
131 cp = *(c ++); | |
132 fprintf(f, " %04x", cp); | |
133 } | |
134 } | |
135 | |
136 /** | |
137 * Writes the hexadecimal of a non-null-terminated array of codepoints into a | |
138 * file | |
139 * @param f UFILE instance to store | |
140 * @param c codepoints array | |
141 * @param l codepoints array length | |
142 */ | |
143 void serialize(FILE *f, const UChar *c, int l) | |
144 { | |
145 int count = 1; | |
146 UChar cp = *(c ++); | |
147 | |
148 fprintf(f, " %04x", cp); | |
149 | |
150 while (count < l) { | |
151 cp = *(c ++); | |
152 fprintf(f, " %04x", cp); | |
153 count ++; | |
154 } | |
155 } | |
156 | |
157 /** | |
158 * Sets the iterator to the argument string and outputs the collation elements. | |
159 * @param f file output stream | |
160 * @param iter collation element iterator | |
161 */ | |
162 void serialize(FILE *f, UCollationElements *iter) { | |
163 const UChar *codepoint = iter->iteratordata_.string; | |
164 // unlikely that sortkeys will be over this size | |
165 uint8_t sortkey[64]; | |
166 uint8_t *psortkey = sortkey; | |
167 int sortkeylength = 0; | |
168 | |
169 if (iter->iteratordata_.flags & UCOL_ITER_HASLEN) { | |
170 serialize(f, codepoint, iter->iteratordata_.endp - codepoint); | |
171 sortkeylength = ucol_getSortKey(iter->iteratordata_.coll, codepoint, | |
172 iter->iteratordata_.endp - codepoint, sortkey, 64); | |
173 } | |
174 else { | |
175 serialize(f, codepoint); | |
176 sortkeylength = ucol_getSortKey(iter->iteratordata_.coll, codepoint, | |
177 -1, sortkey, 64); | |
178 } | |
179 if (options[11].doesOccur) { | |
180 serialize(stdout, codepoint); | |
181 fprintf(stdout, "\n"); | |
182 } | |
183 | |
184 fprintf(f, "; "); | |
185 | |
186 UErrorCode error = U_ZERO_ERROR; | |
187 uint32_t ce = ucol_next(iter, &error); | |
188 if (U_FAILURE(error)) { | |
189 fprintf(f, "Error retrieving collation elements\n"); | |
190 return; | |
191 } | |
192 | |
193 while (TRUE) { | |
194 fprintf(f, "["); | |
195 if (UCOL_PRIMARYORDER(ce) != 0) { | |
196 fprintf(f, "%04x", UCOL_PRIMARYORDER(ce)); | |
197 } | |
198 fprintf(f, ","); | |
199 if (UCOL_SECONDARYORDER(ce) != 0) { | |
200 fprintf(f, " %02x", UCOL_SECONDARYORDER(ce)); | |
201 } | |
202 fprintf(f, ","); | |
203 if (UCOL_TERTIARYORDER(ce) != 0) { | |
204 fprintf(f, " %02x", UCOL_TERTIARYORDER(ce)); | |
205 } | |
206 fprintf(f, "] "); | |
207 | |
208 ce = ucol_next(iter, &error); | |
209 if (ce == UCOL_NULLORDER) { | |
210 break; | |
211 } | |
212 if (U_FAILURE(error)) { | |
213 fprintf(stdout, "Error retrieving collation elements"); | |
214 return; | |
215 } | |
216 } | |
217 | |
218 if (sortkeylength > 64) { | |
219 fprintf(f, "Sortkey exceeds pre-allocated size"); | |
220 } | |
221 | |
222 fprintf(f, "["); | |
223 while (TRUE) { | |
224 fprintf(f, "%02x", *psortkey); | |
225 psortkey ++; | |
226 if ((*psortkey) == 0) { | |
227 break; | |
228 } | |
229 fprintf(f, " "); | |
230 } | |
231 fprintf(f, "]\n"); | |
232 } | |
233 | |
234 /** | |
235 * Serializes the contraction within the given argument rule | |
236 * @param f file output stream | |
237 * @param r rule | |
238 * @param rlen rule length | |
239 * @param contractionsonly flag to indicate if only contractions are to be | |
240 * output or all collation elements | |
241 * @param iter iterator to iterate over collation elements | |
242 */ | |
243 void serialize(FILE *f, UChar *rule, int rlen, UBool contractiononly, | |
244 UCollationElements *iter) { | |
245 const UChar *current = NULL; | |
246 uint32_t strength = 0; | |
247 uint32_t chOffset = 0; | |
248 uint32_t chLen = 0; | |
249 uint32_t exOffset = 0; | |
250 uint32_t exLen = 0; | |
251 uint32_t prefixOffset = 0; | |
252 uint32_t prefixLen = 0; | |
253 uint8_t specs = 0; | |
254 UBool rstart = TRUE; | |
255 UColTokenParser src; | |
256 UColOptionSet opts; | |
257 UParseError parseError; | |
258 UErrorCode error = U_ZERO_ERROR; | |
259 | |
260 src.opts = &opts; | |
261 | |
262 src.source = rule; | |
263 src.current = rule; | |
264 src.end = rule + rlen; | |
265 src.extraCurrent = src.end; | |
266 src.extraEnd = src.end + UCOL_TOK_EXTRA_RULE_SPACE_SIZE; | |
267 | |
268 | |
269 while ((current = ucol_tok_parseNextToken(&src, rstart, &parseError, | |
270 &error)) != NULL) { | |
271 chOffset = src.parsedToken.charsOffset; | |
272 chLen = src.parsedToken.charsLen; | |
273 // contractions handled here | |
274 if (!contractiononly || chLen > 1) { | |
275 ucol_setText(iter, rule + chOffset, chLen, &error); | |
276 if (U_FAILURE(error)) { | |
277 fprintf(stdout, "Error setting text in iterator\n"); | |
278 return; | |
279 } | |
280 serialize(f, iter); | |
281 } | |
282 rstart = FALSE; | |
283 } | |
284 } | |
285 | |
286 /** | |
287 * Prints the attribute values in the argument collator into the output stream | |
288 * @param collator | |
289 */ | |
290 void outputAttribute(UCollator *collator, UErrorCode *error) | |
291 { | |
292 UColAttribute attribute = UCOL_FRENCH_COLLATION; | |
293 while (attribute < UCOL_ATTRIBUTE_COUNT) { | |
294 int count = 0; | |
295 while (TRUE) { | |
296 // getting attribute name | |
297 if (ATTRIBUTE_NAME_[count].value == attribute) { | |
298 fprintf(OUTPUT_, "%s = ", ATTRIBUTE_NAME_[count].name); | |
299 break; | |
300 } | |
301 count ++; | |
302 } | |
303 count = 0; | |
304 int attributeval = ucol_getAttribute(collator, attribute, error); | |
305 if (U_FAILURE(*error)) { | |
306 fprintf(stdout, "Failure in reading collator attribute\n"); | |
307 return; | |
308 } | |
309 while (TRUE) { | |
310 // getting attribute value | |
311 if (ATTRIBUTE_VALUE_[count].value == attributeval) { | |
312 fprintf(OUTPUT_, "%s\n", ATTRIBUTE_VALUE_[count].name); | |
313 break; | |
314 } | |
315 count ++; | |
316 } | |
317 attribute = (UColAttribute)(attribute + 1); | |
318 } | |
319 } | |
320 | |
321 /** | |
322 * Prints the normalization mode in the argument collator into the output stream | |
323 * @param collator | |
324 */ | |
325 void outputNormalization(UCollator *collator) | |
326 { | |
327 UErrorCode status = U_ZERO_ERROR; | |
328 int normmode = ucol_getAttribute(collator, UCOL_NORMALIZATION_MODE, &status)
; | |
329 int count = 0; | |
330 while (TRUE) { | |
331 // getting attribute name | |
332 if (ATTRIBUTE_VALUE_[count].value == normmode) { | |
333 break; | |
334 } | |
335 count ++; | |
336 } | |
337 fprintf(OUTPUT_, "NORMALIZATION MODE = %s\n", | |
338 ATTRIBUTE_VALUE_[count].name); | |
339 } | |
340 | |
341 /** | |
342 * Output the collation element belonging to the locale into a file | |
343 * @param locale string | |
344 * @param fullrules flag to indicate if only tailored collation elements are to | |
345 * be output or all collation elements | |
346 */ | |
347 void serialize(const char *locale, UBool tailoredonly) { | |
348 UErrorCode error = U_ZERO_ERROR; | |
349 UChar str[128]; | |
350 int strlen = 0; | |
351 | |
352 fprintf(OUTPUT_, "# This file contains the serialized collation elements\n")
; | |
353 fprintf(OUTPUT_, "# as of the collation version indicated below.\n"); | |
354 fprintf(OUTPUT_, "# Data format: xxxx xxxx..; [yyyy, yy, yy] [yyyy, yy, yy]
... [yyyy, yy, yy] [zz zz..\n"); | |
355 fprintf(OUTPUT_, "# where xxxx are codepoints in hexadecimals,\
n"); | |
356 fprintf(OUTPUT_, "# yyyyyyyy are the corresponding\n"); | |
357 fprintf(OUTPUT_, "# collation elements in hexadecimals\n"); | |
358 fprintf(OUTPUT_, "# and zz are the sortkey values in hexadecima
ls\n"); | |
359 | |
360 fprintf(OUTPUT_, "\n# Collator information\n"); | |
361 | |
362 fprintf(OUTPUT_, "\nLocale: %s\n", locale); | |
363 fprintf(stdout, "Locale: %s\n", locale); | |
364 UVersionInfo version; | |
365 ucol_getVersion(COLLATOR_, version); | |
366 fprintf(OUTPUT_, "Version number: %d.%d.%d.%d\n", | |
367 version[0], version[1], version[2], version[3]); | |
368 outputAttribute(COLLATOR_, &error); | |
369 outputNormalization(COLLATOR_); | |
370 | |
371 UCollationElements *iter = ucol_openElements(COLLATOR_, str, strlen, | |
372 &error); | |
373 if (U_FAILURE(error)) { | |
374 fprintf(stdout, "Error creating iterator\n"); | |
375 return; | |
376 } | |
377 | |
378 if (!tailoredonly) { | |
379 fprintf(OUTPUT_, "\n# Range of unicode characters\n\n"); | |
380 UChar32 codepoint = 0; | |
381 while (codepoint <= UCHAR_MAX_VALUE) { | |
382 if (u_isdefined(codepoint)) { | |
383 strlen = 0; | |
384 UTF16_APPEND_CHAR_UNSAFE(str, strlen, codepoint); | |
385 str[strlen] = 0; | |
386 ucol_setText(iter, str, strlen, &error); | |
387 if (U_FAILURE(error)) { | |
388 fprintf(stdout, "Error setting text in iterator\n"); | |
389 return; | |
390 } | |
391 serialize(OUTPUT_, iter); | |
392 } | |
393 codepoint ++; | |
394 } | |
395 } | |
396 | |
397 UChar ucarules[0x10000]; | |
398 UChar *rules; | |
399 int32_t rulelength = 0; | |
400 rules = ucarules; | |
401 | |
402 if (tailoredonly) { | |
403 int32_t rulelength = 0; | |
404 const UChar *temp = ucol_getRules(COLLATOR_, &rulelength); | |
405 if (rulelength + UCOL_TOK_EXTRA_RULE_SPACE_SIZE > 0x10000) { | |
406 rules = (UChar *)malloc(sizeof(UChar) * | |
407 (rulelength + UCOL_TOK_EXTRA_RULE_SPACE_SIZE)); | |
408 } | |
409 memcpy(rules, temp, rulelength * sizeof(UChar)); | |
410 rules[rulelength] = 0; | |
411 fprintf(OUTPUT_, "\n# Tailorings\n\n"); | |
412 serialize(OUTPUT_, rules, rulelength, FALSE, iter); | |
413 if (rules != ucarules) { | |
414 free(rules); | |
415 } | |
416 } | |
417 else { | |
418 rulelength = ucol_getRulesEx(COLLATOR_, UCOL_FULL_RULES, ucarules, | |
419 0x10000); | |
420 if (rulelength + UCOL_TOK_EXTRA_RULE_SPACE_SIZE > 0x10000) { | |
421 rules = (UChar *)malloc(sizeof(UChar) * | |
422 (rulelength + UCOL_TOK_EXTRA_RULE_SPACE_SIZE)); | |
423 rulelength = ucol_getRulesEx(COLLATOR_, UCOL_FULL_RULES, rules, | |
424 rulelength); | |
425 } | |
426 fprintf(OUTPUT_, "\n# Contractions\n\n"); | |
427 serialize(OUTPUT_, rules, rulelength, TRUE, iter); | |
428 if (rules != ucarules) { | |
429 free(rules); | |
430 } | |
431 } | |
432 | |
433 ucol_closeElements(iter); | |
434 } | |
435 | |
436 /** | |
437 * Sets the collator with the attribute values | |
438 * @param collator | |
439 * @param error status | |
440 */ | |
441 void setAttributes(UCollator *collator, UErrorCode *error) | |
442 { | |
443 int count = 0; | |
444 while (count < UCOL_ATTRIBUTE_COUNT) { | |
445 if (ATTRIBUTE_[count] != UCOL_DEFAULT) { | |
446 ucol_setAttribute(collator, (UColAttribute)count, | |
447 ATTRIBUTE_[count], error); | |
448 if (U_FAILURE(*error)) { | |
449 return; | |
450 } | |
451 } | |
452 count ++; | |
453 } | |
454 } | |
455 | |
456 /** | |
457 * Appends directory path with an ending seperator if necessary. | |
458 * @param path with enough space to append one seperator | |
459 * @return new directory path length | |
460 */ | |
461 int appendDirSeparator(char *dir) | |
462 { | |
463 int dirlength = strlen(dir); | |
464 char dirending = dir[dirlength - 1]; | |
465 if (dirending != U_FILE_SEP_CHAR) { | |
466 dir[dirlength] = U_FILE_SEP_CHAR; | |
467 dir[dirlength + 1] = 0; | |
468 return dirlength + 1; | |
469 } | |
470 return dirlength; | |
471 } | |
472 | |
473 /** | |
474 * Output the collation element into a file | |
475 */ | |
476 void serialize() { | |
477 char filename[128]; | |
478 int dirlength = 0; | |
479 | |
480 if (options[4].doesOccur) { | |
481 strcpy(filename, options[4].value); | |
482 dirlength = appendDirSeparator(filename); | |
483 } | |
484 | |
485 if (options[2].doesOccur) { | |
486 const char *locale = (char *)options[2].value; | |
487 int32_t localeindex = 0; | |
488 | |
489 if (strcmp(locale, "all") == 0) { | |
490 if (options[4].doesOccur) { | |
491 strcat(filename, "UCA.txt"); | |
492 OUTPUT_ = fopen(filename, "w"); | |
493 if (OUTPUT_ == NULL) { | |
494 fprintf(stdout, "Cannot open file:%s\n", filename); | |
495 return; | |
496 } | |
497 } | |
498 fprintf(stdout, "UCA\n"); | |
499 UErrorCode error = U_ZERO_ERROR; | |
500 COLLATOR_ = ucol_open("en_US", &error); | |
501 if (U_FAILURE(error)) { | |
502 fprintf(stdout, "Collator creation failed:"); | |
503 fprintf(stdout, u_errorName(error)); | |
504 goto CLOSEUCA; | |
505 return; | |
506 } | |
507 setAttributes(COLLATOR_, &error); | |
508 if (U_FAILURE(error)) { | |
509 fprintf(stdout, "Collator attribute setting failed:"); | |
510 fprintf(stdout, u_errorName(error)); | |
511 goto CLOSEUCA; | |
512 return; | |
513 } | |
514 | |
515 serialize("UCA", FALSE); | |
516 CLOSEUCA : | |
517 if (options[4].doesOccur) { | |
518 filename[dirlength] = 0; | |
519 fclose(OUTPUT_); | |
520 } | |
521 ucol_close(COLLATOR_); | |
522 localeindex = ucol_countAvailable() - 1; | |
523 fprintf(stdout, "Number of locales: %d\n", localeindex + 1); | |
524 locale = ucol_getAvailable(localeindex); | |
525 } | |
526 | |
527 while (TRUE) { | |
528 UErrorCode error = U_ZERO_ERROR; | |
529 COLLATOR_ = ucol_open(locale, &error); | |
530 if (U_FAILURE(error)) { | |
531 fprintf(stdout, "Collator creation failed:"); | |
532 fprintf(stdout, u_errorName(error)); | |
533 goto CLOSETAILOR; | |
534 return; | |
535 } | |
536 setAttributes(COLLATOR_, &error); | |
537 if (U_FAILURE(error)) { | |
538 fprintf(stdout, "Collator attribute setting failed:"); | |
539 fprintf(stdout, u_errorName(error)); | |
540 goto CLOSETAILOR; | |
541 return; | |
542 } | |
543 | |
544 if (options[4].doesOccur) { | |
545 strcat(filename, locale); | |
546 strcat(filename, ".txt"); | |
547 OUTPUT_ = fopen(filename, "w"); | |
548 if (OUTPUT_ == NULL) { | |
549 fprintf(stdout, "Cannot open file:%s\n", filename); | |
550 return; | |
551 } | |
552 } | |
553 | |
554 if (options[3].doesOccur) { | |
555 serialize(locale, TRUE); | |
556 } | |
557 | |
558 ucol_close(COLLATOR_); | |
559 | |
560 CLOSETAILOR : | |
561 if (options[4].doesOccur) { | |
562 filename[dirlength] = 0; | |
563 fclose(OUTPUT_); | |
564 } | |
565 | |
566 localeindex --; | |
567 if (localeindex < 0) { | |
568 break; | |
569 } | |
570 locale = ucol_getAvailable(localeindex); | |
571 } | |
572 } | |
573 | |
574 if (options[7].doesOccur) { | |
575 char inputfilename[128] = ""; | |
576 // rules are to be used | |
577 if (options[5].doesOccur) { | |
578 strcpy(inputfilename, options[5].value); | |
579 appendDirSeparator(inputfilename); | |
580 } | |
581 strcat(inputfilename, options[7].value); | |
582 FILE *input = fopen(inputfilename, "r"); | |
583 if (input == NULL) { | |
584 fprintf(stdout, "Cannot open file:%s\n", filename); | |
585 return; | |
586 } | |
587 | |
588 char s[1024]; | |
589 UChar rule[1024]; | |
590 UChar *prule = rule; | |
591 int size = 1024; | |
592 // synwee TODO: make this part dynamic | |
593 while (fscanf(input, "%[^\n]s", s) != EOF) { | |
594 size -= u_unescape(s, prule, size); | |
595 prule = prule + u_strlen(prule); | |
596 } | |
597 fclose(input); | |
598 | |
599 if (options[4].doesOccur) { | |
600 strcat(filename, "Rules.txt"); | |
601 OUTPUT_ = fopen(filename, "w"); | |
602 if (OUTPUT_ == NULL) { | |
603 fprintf(stdout, "Cannot open file:%s\n", filename); | |
604 return; | |
605 } | |
606 } | |
607 | |
608 fprintf(stdout, "Rules\n"); | |
609 UErrorCode error = U_ZERO_ERROR; | |
610 UParseError parseError; | |
611 COLLATOR_ = ucol_openRules(rule, u_strlen(rule), UCOL_DEFAULT, | |
612 UCOL_DEFAULT_STRENGTH, &parseError, &error); | |
613 if (U_FAILURE(error)) { | |
614 fprintf(stdout, "Collator creation failed:"); | |
615 fprintf(stdout, u_errorName(error)); | |
616 goto CLOSERULES; | |
617 return; | |
618 } | |
619 setAttributes(COLLATOR_, &error); | |
620 if (U_FAILURE(error)) { | |
621 fprintf(stdout, "Collator attribute setting failed:"); | |
622 fprintf(stdout, u_errorName(error)); | |
623 goto CLOSERULES; | |
624 return; | |
625 } | |
626 | |
627 serialize("Rule-based", TRUE); | |
628 ucol_close(COLLATOR_); | |
629 | |
630 CLOSERULES : | |
631 if (options[4].doesOccur) { | |
632 filename[dirlength] = 0; | |
633 fclose(OUTPUT_); | |
634 } | |
635 } | |
636 } | |
637 | |
638 /** | |
639 * Parse for enum values. | |
640 * Note this only works for positive enum values. | |
641 * @param enumarray array containing names of the enum values in string and | |
642 * their corresponding value. | |
643 * declared enum value. | |
644 * @param str string to be parsed | |
645 * @return corresponding integer enum value or -1 if value is not found. | |
646 */ | |
647 int parseEnums(const EnumNameValuePair enumarray[], const char *str) | |
648 { | |
649 const char *enumname = enumarray[0].name; | |
650 int result = atoi(str); | |
651 if (result == 0 && str[0] != '0') { | |
652 while (strcmp(enumname, str) != 0) { | |
653 // checking for multiple enum names sharing the same values | |
654 enumname = strstr(enumname, str); | |
655 if (enumname != NULL) { | |
656 int size = strchr(enumname, '|') - enumname; | |
657 if (size < 0) { | |
658 size = strlen(enumname); | |
659 } | |
660 if (size == (int)strlen(str)) { | |
661 return enumarray[result].value; | |
662 } | |
663 } | |
664 result ++; | |
665 if (&(enumarray[result]) == NULL) { | |
666 return -1; | |
667 } | |
668 enumname = enumarray[result].name; | |
669 } | |
670 } | |
671 return -1; | |
672 } | |
673 | |
674 /** | |
675 * Parser for attribute name value pair | |
676 */ | |
677 void parseAttributes() { | |
678 char str[32]; | |
679 const char *pname = options[6].value; | |
680 const char *pend = options[6].value + strlen(options[6].value); | |
681 const char *pvalue; | |
682 | |
683 while (pname < pend) { | |
684 pvalue = strchr(pname, '='); | |
685 if (pvalue == NULL) { | |
686 fprintf(stdout, | |
687 "No matching value found for attribute argument %s\n", | |
688 pname); | |
689 return; | |
690 } | |
691 int count = pvalue - pname; | |
692 strncpy(str, pname, count); | |
693 str[count] = 0; | |
694 | |
695 int name = parseEnums(ATTRIBUTE_NAME_, str); | |
696 if (name == -1) { | |
697 fprintf(stdout, "Attribute name not found: %s\n", str); | |
698 return; | |
699 } | |
700 | |
701 pvalue ++; | |
702 // getting corresponding enum value | |
703 pname = strchr(pvalue, ','); | |
704 if (pname == NULL) { | |
705 pname = pend; | |
706 } | |
707 count = pname - pvalue; | |
708 strncpy(str, pvalue, count); | |
709 str[count] = 0; | |
710 int value = parseEnums(ATTRIBUTE_VALUE_, str); | |
711 if (value == -1) { | |
712 fprintf(stdout, "Attribute value not found: %s\n", str); | |
713 return; | |
714 } | |
715 ATTRIBUTE_[name] = (UColAttributeValue)value; | |
716 pname ++; | |
717 } | |
718 } | |
719 | |
720 /** | |
721 * Checks if the locale argument is a base language | |
722 * @param locale to be checked | |
723 * @return TRUE if it is a base language | |
724 */ | |
725 inline UBool checkLocaleForLanguage(const char *locale) | |
726 { | |
727 return strlen(locale) <= 2; | |
728 } | |
729 | |
730 /** | |
731 * Converts a UChar array into its string form "xxxx xxxx" | |
732 * @param ch array of UChar characters | |
733 * @param count number of UChar characters | |
734 */ | |
735 void outputUChar(UChar ch[], int count) | |
736 { | |
737 for (int i = 0; i < count; i ++) { | |
738 fprintf(OUTPUT_, "%04X ", ch[i]); | |
739 } | |
740 } | |
741 | |
742 /** | |
743 * If it is a primary difference returns -1 or 1. | |
744 * If it is a secondary difference returns -2 or 2. | |
745 * If it is a tertiary difference returns -3 or 3. | |
746 * If equals returns 0. | |
747 */ | |
748 int compareSortKey(const void *elem1, const void *elem2) | |
749 { | |
750 // compare the 2 script element sort key | |
751 UChar *ch1 = ((ScriptElement *)elem1)->ch; | |
752 UChar *ch2 = ((ScriptElement *)elem2)->ch; | |
753 int size1 = ((ScriptElement *)elem1)->count; | |
754 int size2 = ((ScriptElement *)elem2)->count; | |
755 UErrorCode error = U_ZERO_ERROR; | |
756 | |
757 ucol_setStrength(COLLATOR_, UCOL_PRIMARY); | |
758 int result = ucol_strcoll(COLLATOR_, ch1, size1, ch2, size2); | |
759 if (result == 0) { | |
760 ucol_setStrength(COLLATOR_, UCOL_SECONDARY); | |
761 result = ucol_strcoll(COLLATOR_, ch1, size1, ch2, size2); | |
762 if (result == 0) { | |
763 ucol_setStrength(COLLATOR_, UCOL_TERTIARY); | |
764 result = ucol_strcoll(COLLATOR_, ch1, size1, ch2, size2); | |
765 if (result < 0) { | |
766 return -3; | |
767 } | |
768 if (result > 0) { | |
769 return 3; | |
770 } | |
771 } | |
772 if (result < 0) { | |
773 return -2; | |
774 } | |
775 if (result > 0) { | |
776 return 2; | |
777 } | |
778 } | |
779 return result; | |
780 } | |
781 | |
782 /** | |
783 * Output serialized script elements | |
784 * @param element the element to output | |
785 * @param compare the comparison with the previous element | |
786 * @param expansion flags TRUE if element has an expansion | |
787 */ | |
788 void outputScriptElem(ScriptElement &element, int compare, UBool expansion) | |
789 { | |
790 switch (compare) { | |
791 case 0: | |
792 if (expansion) { | |
793 fprintf(OUTPUT_, "<tr><td class='eq' title='["); | |
794 } | |
795 else { | |
796 fprintf(OUTPUT_, "<tr><td class='q' title='["); | |
797 } | |
798 break; | |
799 case -1: | |
800 if (expansion) { | |
801 fprintf(OUTPUT_, "<tr><td class='ep' title='["); | |
802 } | |
803 else { | |
804 fprintf(OUTPUT_, "<tr><td class='p' title='["); | |
805 } | |
806 break; | |
807 case -2: | |
808 if (expansion) { | |
809 fprintf(OUTPUT_, "<tr><td class='es' title='["); | |
810 } | |
811 else { | |
812 fprintf(OUTPUT_, "<tr><td class='s' title='["); | |
813 } | |
814 break; | |
815 default: | |
816 if (expansion) { | |
817 fprintf(OUTPUT_, "<tr><td class='et' title='["); | |
818 } | |
819 else { | |
820 fprintf(OUTPUT_, "<tr><td class='t' title='["); | |
821 } | |
822 } | |
823 | |
824 uint8_t sortkey[32]; | |
825 ucol_setStrength(COLLATOR_, UCOL_TERTIARY); | |
826 ucol_getSortKey(COLLATOR_, element.ch, element.count, sortkey, 32); | |
827 int i = 0; | |
828 while (sortkey[i] != 0) { | |
829 if (sortkey[i] == 1) { | |
830 fprintf(OUTPUT_, " | "); | |
831 } | |
832 else { | |
833 fprintf(OUTPUT_, "%02x", sortkey[i]); | |
834 } | |
835 | |
836 i ++; | |
837 } | |
838 | |
839 fprintf(OUTPUT_, "]'>"); | |
840 | |
841 UErrorCode error = U_ZERO_ERROR; | |
842 char utf8[64]; | |
843 UChar nfc[32]; | |
844 int32_t length = unorm_normalize(element.ch, element.count, UNORM_NFC, 0,
nfc, | |
845 32, &error); | |
846 if (U_FAILURE(error)) { | |
847 fprintf(stdout, "Error normalizing contractions to NFC\n"); | |
848 } | |
849 u_strToUTF8(utf8, 64, &length, nfc, length, &error); | |
850 if (U_FAILURE(error)) { | |
851 fprintf(stdout, "Error converting UChar to utf8\n"); | |
852 return; | |
853 } | |
854 | |
855 fprintf(OUTPUT_, "%s<br>", utf8); | |
856 fprintf(OUTPUT_, "<tt>"); | |
857 outputUChar(element.ch, element.count); | |
858 | |
859 if (compare == 0) { | |
860 fprintf(OUTPUT_, "</tt></td><td> </td><td> </td><td> </td
><td>Q</td><td>"); | |
861 } | |
862 else if (compare == -1) { | |
863 fprintf(OUTPUT_, "</tt></td><td>P</td><td> </td><td> </td><td>
</td><td>"); | |
864 } | |
865 else if (compare == -2) { | |
866 fprintf(OUTPUT_, "</tt></td><td> </td><td>S</td><td> </td><td>
</td><td>"); | |
867 } | |
868 else if (compare == -3) { | |
869 fprintf(OUTPUT_, "</tt></td><td> </td><td> </td><td>T</td><td>
</td><td>"); | |
870 } | |
871 | |
872 i = 0; | |
873 while (i < element.count) { | |
874 char str[128]; | |
875 UChar32 codepoint; | |
876 U16_NEXT(element.ch, i, element.count, codepoint); | |
877 int32_t temp = u_charName(codepoint, U_UNICODE_CHAR_NAME, str, 128, | |
878 &error); | |
879 if (U_FAILURE(error)) { | |
880 fprintf(stdout, "Error getting character name\n"); | |
881 return; | |
882 } | |
883 if (element.tailored) { | |
884 fprintf(OUTPUT_, "<b>"); | |
885 } | |
886 fprintf(OUTPUT_, "%s", str); | |
887 if (element.tailored) { | |
888 fprintf(OUTPUT_, " *</b>"); | |
889 } | |
890 if (i < element.count) { | |
891 fprintf(OUTPUT_, "<br>\n"); | |
892 } | |
893 } | |
894 | |
895 fprintf(OUTPUT_, "</td></tr>\n"); | |
896 } | |
897 | |
898 /** | |
899 * Checks if codepoint belongs to scripts | |
900 * @param script list | |
901 * @param scriptcount number of scripts | |
902 * @param codepoint to test | |
903 * @return TRUE if codepoint belongs to scripts | |
904 */ | |
905 UBool checkInScripts(UScriptCode script[], int scriptcount, | |
906 UChar32 codepoint) | |
907 { | |
908 UErrorCode error = U_ZERO_ERROR; | |
909 for (int i = 0; i < scriptcount; i ++) { | |
910 if (script[i] == USCRIPT_HAN && options[10].doesOccur) { | |
911 if ((codepoint >= 0x2E80 && codepoint <= 0x2EE4) || | |
912 (codepoint >= 0x2A672 && codepoint <= 0x2A6D6)) { | |
913 // reduce han | |
914 return TRUE; | |
915 } | |
916 } | |
917 else if (uscript_getScript(codepoint, &error) == script[i]) { | |
918 return TRUE; | |
919 } | |
920 if (U_FAILURE(error)) { | |
921 fprintf(stdout, "Error checking character in scripts\n"); | |
922 return FALSE; | |
923 } | |
924 } | |
925 return FALSE; | |
926 } | |
927 | |
928 /** | |
929 * Checks if the set of codepoints belongs to the script | |
930 * @param script list | |
931 * @param scriptcount number of scripts | |
932 * @param scriptelem | |
933 * @return TRUE if all codepoints belongs to the script | |
934 */ | |
935 inline UBool checkInScripts(UScriptCode script[], int scriptcount, | |
936 ScriptElement scriptelem) | |
937 { | |
938 int i = 0; | |
939 while (i < scriptelem.count) { | |
940 UChar32 codepoint; | |
941 U16_NEXT(scriptelem.ch, i, scriptelem.count, codepoint); | |
942 UErrorCode error = U_ZERO_ERROR; | |
943 if (checkInScripts(script, scriptcount, codepoint)) { | |
944 return TRUE; | |
945 } | |
946 } | |
947 return FALSE; | |
948 } | |
949 | |
950 /** | |
951 * Gets the script elements and contractions belonging to the script | |
952 * @param elems output list | |
953 * @param locale locale | |
954 * @return number of script elements | |
955 * Add by Richard | |
956 */ | |
957 int getScriptElementsFromExemplars(ScriptElement scriptelem[], const char* local
e) { | |
958 UErrorCode error = U_ZERO_ERROR; | |
959 UChar32 codepoint = 0; | |
960 | |
961 UResourceBundle* ures = ures_open(NULL, locale, &error); | |
962 if (U_FAILURE(error)) { | |
963 fprintf(stdout, "Can not find resource bundle for locale: %s\n", locale)
; | |
964 return -1; | |
965 } | |
966 int32_t length; | |
967 const UChar* exemplarChars = ures_getStringByKey(ures, "ExemplarCharacters",
&length, &error); | |
968 | |
969 if (U_FAILURE(error)) { | |
970 fprintf(stdout, "Can not find ExemplarCharacters in resource bundle\n"); | |
971 return -1; | |
972 } | |
973 | |
974 UChar* upperChars = new UChar[length * 2]; | |
975 if (upperChars == 0) { | |
976 fprintf(stdout, "Memory error\n"); | |
977 return -1; | |
978 } | |
979 | |
980 int32_t destLength = u_strToUpper(upperChars, length * 2, exemplarChars, -1,
locale, &error); | |
981 if (U_FAILURE(error)) { | |
982 fprintf(stdout, "Error when u_strToUpper() \n"); | |
983 return -1; | |
984 } | |
985 | |
986 UChar* pattern = new UChar[length + destLength + 10]; | |
987 UChar left[2] = {0x005b, 0x0}; | |
988 UChar right[2] = {0x005d, 0x0}; | |
989 pattern = u_strcpy(pattern, left); | |
990 pattern = u_strcat(pattern, exemplarChars); | |
991 pattern = u_strcat(pattern, upperChars); | |
992 pattern = u_strcat(pattern, right); | |
993 | |
994 UnicodeSet * uniset = new UnicodeSet(UnicodeString(pattern), error); | |
995 if (U_FAILURE(error)) { | |
996 fprintf(stdout, "Can not open USet \n"); | |
997 return -1; | |
998 } | |
999 | |
1000 UnicodeSetIterator* usetiter = new UnicodeSetIterator(*uniset); | |
1001 | |
1002 int32_t count = 0; | |
1003 | |
1004 while (usetiter -> next()) { | |
1005 if (usetiter -> isString()) { | |
1006 UnicodeString strItem = usetiter -> getString(); | |
1007 | |
1008 scriptelem[count].count = 0; | |
1009 for (int i = 0; i < strItem.length(); i++) { | |
1010 codepoint = strItem.char32At(i); | |
1011 UTF16_APPEND_CHAR_UNSAFE(scriptelem[count].ch, scriptelem[count]
.count, codepoint); | |
1012 scriptelem[count].tailored = FALSE; | |
1013 } | |
1014 } else { | |
1015 codepoint = usetiter -> getCodepoint(); | |
1016 scriptelem[count].count = 0; | |
1017 UTF16_APPEND_CHAR_UNSAFE(scriptelem[count].ch, scriptelem[count].cou
nt, codepoint); | |
1018 scriptelem[count].tailored = FALSE; | |
1019 } | |
1020 | |
1021 count++; | |
1022 } | |
1023 delete []pattern; | |
1024 | |
1025 return count; | |
1026 } | |
1027 | |
1028 /** | |
1029 * Gets the script elements and contractions belonging to the script | |
1030 * @param script list | |
1031 * @param scriptcount number of scripts | |
1032 * @param elems output list | |
1033 * @return number of script elements | |
1034 */ | |
1035 int getScriptElements(UScriptCode script[], int scriptcount, | |
1036 ScriptElement scriptelem[]) | |
1037 { | |
1038 UErrorCode error = U_ZERO_ERROR; | |
1039 UChar32 codepoint = 0; | |
1040 int count = 0; | |
1041 while (codepoint <= UCHAR_MAX_VALUE) { | |
1042 if (checkInScripts(script, scriptcount, codepoint)) { | |
1043 scriptelem[count].count = 0; | |
1044 UTF16_APPEND_CHAR_UNSAFE(scriptelem[count].ch, | |
1045 scriptelem[count].count, codepoint); | |
1046 scriptelem[count].tailored = FALSE; | |
1047 count ++; | |
1048 } | |
1049 if (U_FAILURE(error)) { | |
1050 fprintf(stdout, "Error determining codepoint in script\n"); | |
1051 return -1; | |
1052 } | |
1053 codepoint ++; | |
1054 } | |
1055 | |
1056 const UChar *current = NULL; | |
1057 uint32_t strength = 0; | |
1058 uint32_t chOffset = 0; | |
1059 uint32_t chLen = 0; | |
1060 uint32_t exOffset = 0; | |
1061 uint32_t exLen = 0; | |
1062 uint32_t prefixOffset = 0; | |
1063 uint32_t prefixLen = 0; | |
1064 uint8_t specs = 0; | |
1065 UBool rstart = TRUE; | |
1066 UColTokenParser src; | |
1067 UColOptionSet opts; | |
1068 UParseError parseError; | |
1069 | |
1070 int32_t rulelength = ucol_getRulesEx(COLLATOR_, UCOL_FULL_RULES, NULL, 0); | |
1071 src.source = (UChar *)malloc(sizeof(UChar) * | |
1072 (rulelength + UCOL_TOK_EXTRA_RULE_SPACE_SIZE)); | |
1073 rulelength = ucol_getRulesEx(COLLATOR_, UCOL_FULL_RULES, src.source, | |
1074 rulelength); | |
1075 src.current = src.source; | |
1076 src.end = src.source + rulelength; | |
1077 src.extraCurrent = src.end; | |
1078 src.extraEnd = src.end + UCOL_TOK_EXTRA_RULE_SPACE_SIZE; | |
1079 src.opts = &opts; | |
1080 | |
1081 /* | |
1082 ucol_tok_parseNextToken(&src, &strength, &chOffset, | |
1083 &chLen, &exOffset, &exLen, | |
1084 &prefixOffset, &prefixLen, | |
1085 &specs, rstart, &parseError, | |
1086 &error) | |
1087 */ | |
1088 while ((current = ucol_tok_parseNextToken(&src, rstart, &parseError, | |
1089 &error)) != NULL) { | |
1090 // contractions handled here | |
1091 if (chLen > 1) { | |
1092 u_strncpy(scriptelem[count].ch, src.source + chOffset, chLen); | |
1093 scriptelem[count].count = chLen; | |
1094 if (checkInScripts(script, scriptcount, scriptelem[count])) { | |
1095 scriptelem[count].tailored = FALSE; | |
1096 count ++; | |
1097 } | |
1098 } | |
1099 rstart = FALSE; | |
1100 } | |
1101 if (U_FAILURE(error)) { | |
1102 fprintf(stdout, "Error parsing rules: %s\n", u_errorName(error)); | |
1103 } | |
1104 // rule might have been reallocated, so delete this instead | |
1105 free(src.source); | |
1106 return count; | |
1107 } | |
1108 | |
1109 int compareCodepoints(const void *elem1, const void *elem2) | |
1110 { | |
1111 UChar *ch1 = ((ScriptElement *)elem1)->ch; // key | |
1112 UChar *ch2 = ((ScriptElement *)elem2)->ch; | |
1113 ch1[((ScriptElement *)elem1)->count] = 0; | |
1114 ch2[((ScriptElement *)elem2)->count] = 0; | |
1115 | |
1116 // compare the 2 codepoints | |
1117 return u_strcmp(ch1, ch2); | |
1118 } | |
1119 | |
1120 UBool hasSubNFD(ScriptElement &se, ScriptElement &key) | |
1121 { | |
1122 UChar *ch1 = se.ch; | |
1123 UChar *ch2 = key.ch; // key | |
1124 ch1[se.count] = 0; | |
1125 ch2[key.count] = 0; | |
1126 | |
1127 // compare the 2 codepoints | |
1128 if (u_strstr(ch1, ch2) != NULL) { | |
1129 return TRUE; | |
1130 } | |
1131 | |
1132 // check the decomposition | |
1133 UChar norm[32]; | |
1134 UErrorCode error = U_ZERO_ERROR; | |
1135 int size = unorm_normalize(ch1, se.count, UNORM_NFD, 0, norm, 32, | |
1136 &error); | |
1137 if (U_FAILURE(error)) { | |
1138 fprintf(stdout, "Error normalizing\n"); | |
1139 } | |
1140 if (u_strstr(norm, ch2) != NULL) { | |
1141 return TRUE; | |
1142 } | |
1143 return FALSE; | |
1144 } | |
1145 | |
1146 /** | |
1147 * Marks tailored elements | |
1148 * @param script list | |
1149 * @param scriptcount number of scripts | |
1150 * @param scriptelem script element list | |
1151 * @param scriptelemlength size of the script element list | |
1152 */ | |
1153 void markTailored(UScriptCode script[], int scriptcount, | |
1154 ScriptElement scriptelem[], int scriptelemlength) | |
1155 { | |
1156 int32_t rulelength; | |
1157 const UChar *rule = ucol_getRules(COLLATOR_, &rulelength); | |
1158 | |
1159 const UChar *current = NULL; | |
1160 uint32_t strength = 0; | |
1161 uint32_t chOffset = 0; | |
1162 uint32_t chLen = 0; | |
1163 uint32_t exOffset = 0; | |
1164 uint32_t exLen = 0; | |
1165 uint32_t prefixOffset = 0; | |
1166 uint32_t prefixLen = 0; | |
1167 uint8_t specs = 0; | |
1168 UBool rstart = TRUE; | |
1169 UColTokenParser src; | |
1170 UColOptionSet opts; | |
1171 UParseError parseError; | |
1172 | |
1173 src.opts = &opts; | |
1174 src.source = (UChar *)malloc( | |
1175 (rulelength + UCOL_TOK_EXTRA_RULE_SPACE_SIZE) * sizeof(UChar)); | |
1176 memcpy(src.source, rule, rulelength * sizeof(UChar)); | |
1177 src.current = src.source; | |
1178 src.end = (UChar *)src.source + rulelength; | |
1179 src.extraCurrent = src.end; | |
1180 src.extraEnd = src.end + UCOL_TOK_EXTRA_RULE_SPACE_SIZE; | |
1181 | |
1182 UErrorCode error = U_ZERO_ERROR; | |
1183 | |
1184 while ((current = ucol_tok_parseNextToken(&src, rstart, &parseError, | |
1185 &error)) != NULL) { | |
1186 if (chLen >= 1 && strength != UCOL_TOK_RESET) { | |
1187 // skipping the reset characters and non useful stuff. | |
1188 ScriptElement se; | |
1189 u_strncpy(se.ch, src.source + chOffset, chLen); | |
1190 se.count = chLen; | |
1191 | |
1192 if (checkInScripts(script, scriptcount, se)) { | |
1193 /* | |
1194 ScriptElement *tse = (ScriptElement *)bsearch(&se, scriptelem, | |
1195 scriptelemlength, | |
1196 sizeof(ScriptElement), | |
1197 compareCodepoints); | |
1198 */ | |
1199 for (int i = 0; i < scriptelemlength; i ++) { | |
1200 if (!scriptelem[i].tailored && | |
1201 hasSubNFD(scriptelem[i], se)) { | |
1202 scriptelem[i].tailored = TRUE; | |
1203 } | |
1204 } | |
1205 } | |
1206 } | |
1207 rstart = FALSE; | |
1208 } | |
1209 free(src.source); | |
1210 if (U_FAILURE(error)) { | |
1211 fprintf(stdout, "Error parsing rules\n"); | |
1212 } | |
1213 } | |
1214 | |
1215 /** | |
1216 * Checks if the collation iterator has more than 1 collation element | |
1217 * @parem coleiter collation element iterator | |
1218 * @return TRUE if collation iterator has more than 1 collation element | |
1219 */ | |
1220 UBool hasExpansions(UCollationElements *coleiter) | |
1221 { | |
1222 UErrorCode error = U_ZERO_ERROR; | |
1223 int32_t ce = ucol_next(coleiter, &error); | |
1224 int count = 0; | |
1225 | |
1226 if (U_FAILURE(error)) { | |
1227 fprintf(stdout, "Error getting next collation element\n"); | |
1228 } | |
1229 while (ce != UCOL_NULLORDER) { | |
1230 if ((UCOL_PRIMARYORDER(ce) != 0) && !isContinuation(ce)) { | |
1231 count ++; | |
1232 if (count == 2) { | |
1233 return TRUE; | |
1234 } | |
1235 } | |
1236 ce = ucol_next(coleiter, &error); | |
1237 if (U_FAILURE(error)) { | |
1238 fprintf(stdout, "Error getting next collation element\n"); | |
1239 } | |
1240 } | |
1241 return FALSE; | |
1242 } | |
1243 | |
1244 /** | |
1245 * Prints the footer for index.html | |
1246 * @param file output file | |
1247 */ | |
1248 void outputHTMLFooter() | |
1249 { | |
1250 fprintf(OUTPUT_, "</table>\n"); | |
1251 fprintf(OUTPUT_, "</body>\n"); | |
1252 fprintf(OUTPUT_, "</html>\n"); | |
1253 } | |
1254 | |
1255 /** | |
1256 * Serialize the codepoints from start to end into an html file. | |
1257 * Arranging them into ascending collation order. | |
1258 * @param script code list | |
1259 * @param scriptcount number of scripts | |
1260 */ | |
1261 //void serializeScripts(UScriptCode script[], int scriptcount) | |
1262 //Richard | |
1263 void serializeScripts(UScriptCode script[], int scriptcount, const char* locale
= NULL) | |
1264 { | |
1265 UErrorCode error = U_ZERO_ERROR; | |
1266 | |
1267 ScriptElement *scriptelem = | |
1268 (ScriptElement *)malloc(sizeof(ScriptElement) * 0x20000); | |
1269 if (scriptelem == NULL) { | |
1270 fprintf(stdout, "Memory error\n"); | |
1271 return; | |
1272 } | |
1273 int count = 0; | |
1274 if(locale) { | |
1275 count = getScriptElementsFromExemplars(scriptelem, locale); | |
1276 } else { | |
1277 count = getScriptElements(script, scriptcount, scriptelem); | |
1278 } | |
1279 | |
1280 // Sort script elements using Quicksort algorithm: | |
1281 qsort(scriptelem, count, sizeof(ScriptElement), compareCodepoints); | |
1282 markTailored(script, scriptcount, scriptelem, count); | |
1283 // Sort script elements using Quicksort algorithm: | |
1284 qsort(scriptelem, count, sizeof(ScriptElement), compareSortKey); | |
1285 | |
1286 UCollationElements* coleiter = ucol_openElements(COLLATOR_, | |
1287 scriptelem[0].ch, | |
1288 scriptelem[0].count, | |
1289 &error); | |
1290 if (U_FAILURE(error)) { | |
1291 fprintf(stdout, "Error creating collation element iterator\n"); | |
1292 return; | |
1293 } | |
1294 | |
1295 outputScriptElem(scriptelem[0], -1, hasExpansions(coleiter)); | |
1296 for (int i = 0; i < count - 1; i ++) { | |
1297 ucol_setText(coleiter, scriptelem[i + 1].ch, scriptelem[i + 1].count, | |
1298 &error); | |
1299 if (U_FAILURE(error)) { | |
1300 fprintf(stdout, "Error setting text in collation element iterator\n"
); | |
1301 return; | |
1302 } | |
1303 outputScriptElem(scriptelem[i + 1], | |
1304 compareSortKey(scriptelem + i, scriptelem + i + 1), | |
1305 hasExpansions(coleiter)); | |
1306 } | |
1307 free(scriptelem); | |
1308 outputHTMLFooter(); | |
1309 } | |
1310 | |
1311 /** | |
1312 * Prints the header for the html | |
1313 * @param locale name | |
1314 * @param script | |
1315 * @param scriptcount number of scripts | |
1316 */ | |
1317 void outputHTMLHeader(const char *locale, UScriptCode script[], | |
1318 int scriptcount) | |
1319 { | |
1320 fprintf(OUTPUT_, "<html>\n"); | |
1321 fprintf(OUTPUT_, "<head>\n"); | |
1322 fprintf(OUTPUT_, "<meta http-equiv=\"Content-Type\" content=\"text/html; cha
rset=utf-8\">\n"); | |
1323 fprintf(OUTPUT_, "<meta http-equiv=\"Content-Language\" content=\"en-us\">\n
"); | |
1324 fprintf(OUTPUT_, "<link rel=\"stylesheet\" href=\"charts.css\" type=\"text/c
ss\">\n"); | |
1325 fprintf(OUTPUT_, "<title>ICU Collation charts</title>\n"); | |
1326 fprintf(OUTPUT_, "<base target=\"main\">\n"); | |
1327 fprintf(OUTPUT_, "</head>\n"); | |
1328 | |
1329 fprintf(OUTPUT_, "<body bgcolor=#FFFFFF>\n"); | |
1330 fprintf(OUTPUT_, "<!--\n"); | |
1331 fprintf(OUTPUT_, "This file contains sorted characters in ascending order ac
cording to the locale stated\n"); | |
1332 fprintf(OUTPUT_, "If the character is in red, it is tailored in the collatio
n rules.\n"); | |
1333 fprintf(OUTPUT_, "Background colours have certain meanings:\n"); | |
1334 fprintf(OUTPUT_, "White - equals the previous character\n"); | |
1335 fprintf(OUTPUT_, "dark blue - primary greater than the previous character\n"
); | |
1336 fprintf(OUTPUT_, "blue - secondary greater than the previous character\n"); | |
1337 fprintf(OUTPUT_, "light blue - tertiary greater than the previous character\
n"); | |
1338 fprintf(OUTPUT_, "--!>\n"); | |
1339 | |
1340 fprintf(OUTPUT_, "<table border=0>\n"); | |
1341 UChar displayname[64]; | |
1342 UErrorCode error = U_ZERO_ERROR; | |
1343 int32_t size = uloc_getDisplayName(locale, "en_US", displayname, 64, &error)
; | |
1344 char utf8displayname[128]; | |
1345 if (U_FAILURE(error)) { | |
1346 utf8displayname[0] = 0; | |
1347 } | |
1348 else { | |
1349 int32_t utf8size = 0; | |
1350 u_strToUTF8(utf8displayname, 128, &utf8size, displayname, size, &error); | |
1351 } | |
1352 | |
1353 fprintf(OUTPUT_, "<tr><th>Locale</th><td class='noborder'>%s</td></tr>\n", u
tf8displayname); | |
1354 fprintf(OUTPUT_, "<tr><th>Script(s)</th>"); | |
1355 fprintf(OUTPUT_, "<td class='noborder'>"); | |
1356 for (int i = 0; i < scriptcount; i ++) { | |
1357 fprintf(OUTPUT_, "%s", uscript_getName(script[i])); | |
1358 if (i + 1 != scriptcount) { | |
1359 fprintf(OUTPUT_, ", "); | |
1360 } | |
1361 } | |
1362 fprintf(OUTPUT_, "</td></tr>\n"); | |
1363 | |
1364 fprintf(OUTPUT_, "<tr><th>Rules</th><td class='noborder'><a href=\"http://de
v.icu-project.org/cgi-bin/viewcvs.cgi/*checkout*/icu/source/data/coll/%s.txt\">%
s.txt</a></td></tr>\n", locale, locale); | |
1365 | |
1366 UVersionInfo version; | |
1367 ucol_getVersion(COLLATOR_, version); | |
1368 fprintf(OUTPUT_, "<tr><th>Collator version</th><td class='noborder'>%d.%d.%d
.%d</td></tr>\n", | |
1369 version[0], version[1], version[2], version[3]); | |
1370 | |
1371 UColAttribute attr = UCOL_FRENCH_COLLATION; | |
1372 while (attr < UCOL_ATTRIBUTE_COUNT) { | |
1373 UColAttributeValue value = ucol_getAttribute(COLLATOR_, attr, &error); | |
1374 if (U_FAILURE(error)) { | |
1375 fprintf(stdout, "Error getting attribute\n"); | |
1376 return; | |
1377 } | |
1378 if (value != UCOL_DEFAULT) { | |
1379 if (attr == UCOL_FRENCH_COLLATION && value != UCOL_OFF) { | |
1380 fprintf(OUTPUT_, "<tr><th>French Collation</th><td class='nobord
er'>on, code %d</td></tr>\n", value); | |
1381 } | |
1382 if (attr == UCOL_ALTERNATE_HANDLING && value != UCOL_NON_IGNORABLE)
{ | |
1383 fprintf(OUTPUT_, "<tr><th>Alternate Handling</th><td class='nobo
rder'>shifted, code%d</td></tr>\n", value); | |
1384 } | |
1385 if (attr == UCOL_CASE_FIRST && value != UCOL_OFF) { | |
1386 fprintf(OUTPUT_, "<tr><th>Case First</th><td class='noborder'>on
, code %d</td></tr>\n", value); | |
1387 } | |
1388 if (attr == UCOL_CASE_LEVEL && value != UCOL_OFF) { | |
1389 fprintf(OUTPUT_, "<tr><th>Case Level</th><td class='noborder'>on
, code %d</td></tr>\n", value); | |
1390 } | |
1391 if (attr == UCOL_NORMALIZATION_MODE && value != UCOL_OFF) { | |
1392 fprintf(OUTPUT_, "<tr><th>Normalization</th><td class='noborder'
>on, code %d</td></tr>\n", value); | |
1393 } | |
1394 if (attr == UCOL_STRENGTH && value != UCOL_TERTIARY) { | |
1395 fprintf(OUTPUT_, "<tr><th>Strength</th><td class='noborder'>code
%d</td></tr>\n", value); | |
1396 } | |
1397 if (attr == UCOL_HIRAGANA_QUATERNARY_MODE && value != UCOL_OFF) { | |
1398 fprintf(OUTPUT_, "<tr><th>Hiragana Quaternary</th><td class='nob
order'>on, code %d</td></tr>\n", value); | |
1399 } | |
1400 } | |
1401 attr = (UColAttribute)(attr + 1); | |
1402 } | |
1403 | |
1404 // Get UNIX-style time and display as number and string. | |
1405 time_t ltime; | |
1406 time( <ime ); | |
1407 fprintf(OUTPUT_, "<tr><th>Date Generated</th><td class='noborder'>%s</td></t
r>", ctime(<ime)); | |
1408 | |
1409 fprintf(OUTPUT_, "</table>\n"); | |
1410 | |
1411 fprintf(OUTPUT_, "<p><a href=help.html>How to read the table</a><br>\n"); | |
1412 fprintf(OUTPUT_, "<a href=http://www.jtcsv.com/cgi-bin/icu-bugs/ target=new>
Submit a bug</a></p>\n"); | |
1413 fprintf(OUTPUT_, "\n<table>\n"); | |
1414 fprintf(OUTPUT_, "\n<tr><th>Codepoint</th><th>P</th><th>S</th><th>T</th><th>
Q</th><th>Name</th></tr>\n"); | |
1415 } | |
1416 | |
/**
 * Writes the opening part of the chart list page: html head, page title and
 * the start of the centered paragraph with the link to the UCA charts.
 * serializeScripts() calls it for "list.html".
 * @param file output file
 */
void outputListHTMLHeader(FILE *file)
{
    static const char *const HEADER_LINES[] = {
        "<html>\n",
        "<head>\n",
        "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">\n",
        "<meta http-equiv=\"Content-Language\" content=\"en-us\">\n",
        "<title>ICU Collation Charts</title>\n",
        "<base target=\"main\">\n",
        "</head>\n",
        "<body bgcolor=#FFFFFF>\n",
        "<h2 align=center>ICU Collation Charts</h2>\n",
        "<p align=center>\n",
        "<a href=http://www.unicode.org/charts/collation/ target=new>UCA Charts</a><br>"
    };
    const size_t lineCount = sizeof(HEADER_LINES) / sizeof(HEADER_LINES[0]);

    for (size_t i = 0; i < lineCount; i ++) {
        fputs(HEADER_LINES[i], file);
    }
}
1435 | |
/**
 * Writes the closing part of the chart list page: the end of the paragraph
 * opened by outputListHTMLHeader() and the closing body and html tags.
 * @param file output file
 */
void outputListHTMLFooter(FILE *file)
{
    fputs("</p>\n"
          "</body>\n"
          "</html>\n", file);
}
1447 | |
1448 /** | |
1449 * Gets all scripts and serialize their codepoints into an html file. | |
1450 */ | |
1451 void serializeScripts() { | |
1452 char filename[128]; | |
1453 int dirlength = 0; | |
1454 | |
1455 if (options[4].doesOccur) { | |
1456 strcpy(filename, options[4].value); | |
1457 dirlength = appendDirSeparator(filename); | |
1458 } else { | |
1459 filename[0] = 0; | |
1460 } | |
1461 | |
1462 const char *locale; | |
1463 int32_t localelist = 0; | |
1464 int32_t localesize; | |
1465 | |
1466 localesize = ucol_countAvailable(); | |
1467 locale = ucol_getAvailable(localelist); | |
1468 | |
1469 strcat(filename, "list.html"); | |
1470 FILE *list = fopen(filename, "w"); | |
1471 filename[dirlength] = 0; | |
1472 if (list == NULL) { | |
1473 fprintf(stdout, "Cannot open file: %s\n", filename); | |
1474 return; | |
1475 } | |
1476 | |
1477 outputListHTMLHeader(list); | |
1478 fprintf(list, "<blockquote>\n"); | |
1479 while (TRUE) { | |
1480 UErrorCode error = U_ZERO_ERROR; | |
1481 COLLATOR_ = ucol_open(locale, &error); | |
1482 if (U_FAILURE(error)) { | |
1483 fprintf(stdout, "Collator creation failed:"); | |
1484 fprintf(stdout, u_errorName(error)); | |
1485 break; | |
1486 } | |
1487 if ((error != U_USING_FALLBACK_WARNING && // not tailored | |
1488 error != U_USING_DEFAULT_WARNING) || | |
1489 checkLocaleForLanguage(locale)) { | |
1490 fprintf(list, "<a href=%s.html>%s</a> ", locale, locale); | |
1491 setAttributes(COLLATOR_, &error); | |
1492 if (U_FAILURE(error)) { | |
1493 fprintf(stdout, "Collator attribute setting failed:"); | |
1494 fprintf(stdout, u_errorName(error)); | |
1495 break; | |
1496 } | |
1497 | |
1498 UScriptCode scriptcode[32]; | |
1499 uint32_t scriptcount = uscript_getCode(locale, scriptcode, 32, | |
1500 &error); | |
1501 if (U_FAILURE(error)) { | |
1502 fprintf(stdout, "Error getting lcale scripts\n"); | |
1503 break; | |
1504 } | |
1505 | |
1506 strcat(filename, locale); | |
1507 strcat(filename, ".html"); | |
1508 OUTPUT_ = fopen(filename, "w"); | |
1509 if (OUTPUT_ == NULL) { | |
1510 fprintf(stdout, "Cannot open file:%s\n", filename); | |
1511 break; | |
1512 } | |
1513 outputHTMLHeader(locale, scriptcode, scriptcount); | |
1514 fprintf(stdout, "%s\n", locale); | |
1515 | |
1516 if(options[12].doesOccur) { | |
1517 // use whole scripts | |
1518 serializeScripts(scriptcode, scriptcount); | |
1519 } else { | |
1520 // use exemplar chars | |
1521 serializeScripts(scriptcode, scriptcount, locale); | |
1522 } | |
1523 fclose(OUTPUT_); | |
1524 } | |
1525 ucol_close(COLLATOR_); | |
1526 | |
1527 filename[dirlength] = 0; | |
1528 localelist ++; | |
1529 if (localelist == localesize) { | |
1530 break; | |
1531 } | |
1532 locale = ucol_getAvailable(localelist); | |
1533 } | |
1534 fprintf(list, "<br><a href=help.html>help</a><br>"); | |
1535 fprintf(list, "</blockquote>\n"); | |
1536 outputListHTMLFooter(list); | |
1537 fclose(list); | |
1538 } | |
1539 | |
1540 /** | |
1541 * Main -- process command line, read in and pre-process the test file, | |
1542 * call other functions to do the actual tests. | |
1543 */ | |
1544 int main(int argc, char *argv[]) { | |
1545 | |
1546 argc = u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), | |
1547 options); | |
1548 | |
1549 // error handling, printing usage message | |
1550 if (argc < 0) { | |
1551 fprintf(stdout, "error in command line argument: "); | |
1552 fprintf(stdout, argv[-argc]); | |
1553 fprintf(stdout, "\n"); | |
1554 } | |
1555 if (argc < 0 || options[0].doesOccur || options[1].doesOccur) { | |
1556 fprintf(stdout, "Usage: dumpce options...\n" | |
1557 "--help\n" | |
1558 " Display this message.\n" | |
1559 "--locale name|all\n" | |
1560 " ICU locale to use. Default is en_US\n" | |
1561 "--serialize\n" | |
1562 " Serializes the collation elements in -locale or all
locales available and outputs them into --outputdir/locale_ce.txt\n" | |
1563 "--destdir dir_name\n" | |
1564 " Path for outputing the serialized collation element
s. Defaults to stdout if no defined\n" | |
1565 "--sourcedir dir_name\n" | |
1566 " Path for the input rule file for collation\n" | |
1567 "--attribute name=value,name=value...\n" | |
1568 " Pairs of attribute names and values for setting\n" | |
1569 "--rule filename\n" | |
1570 " Name of file containing the collation rules.\n" | |
1571 "--normalizaton mode\n" | |
1572 " UNormalizationMode mode to be used.\n" | |
1573 "--scripts\n" | |
1574 " Codepoints from all scripts are sorted and serializ
ed.\n" | |
1575 "--reducehan\n" | |
1576 " Only 200 Han script characters will be displayed wi
th the use of --scripts.\n" | |
1577 "--wholescripts\n" | |
1578 " Show collation order for whole scripts instead of j
ust for exemplar characters of a locale\n\n"); | |
1579 | |
1580 fprintf(stdout, "Example to generate *.txt files : dumpce --serialize --
locale af --destdir /temp --attribute UCOL_STRENGTH=UCOL_DEFAULT_STRENGTH,4=17\n
\n"); | |
1581 fprintf(stdout, "Example to generate *.html files for oss web display: d
umpce --scripts --destdir /temp --reducehan\n"); | |
1582 return argc < 0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; | |
1583 } | |
1584 | |
1585 OUTPUT_ = stdout; | |
1586 if (options[6].doesOccur) { | |
1587 fprintf(stdout, "attributes %s\n", options[6].value); | |
1588 parseAttributes(); | |
1589 } | |
1590 if (options[3].doesOccur) { | |
1591 serialize(); | |
1592 } | |
1593 if (options[9].doesOccur) { | |
1594 serializeScripts(); | |
1595 } | |
1596 return 0; | |
1597 } | |
OLD | NEW |