icu46/source/tools/dumpce/dumpce.cpp - Issue 5516007: Check in the pristine copy of ICU 4.6...

Side by Side Diff: icu46/source/tools/dumpce/dumpce.cpp

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /********************************************************************

	2 * COPYRIGHT:

	3 * Copyright (C) 2001-2005 IBM, Inc. All Rights Reserved.

	4 *

	5 ********************************************************************/

	6 /******************************************************************************* *

	7 *

	8 * File dumpce.cpp

	9 *

	10 * Modification History:

	11 * Name Date Description

	12 * synwee May 31 2001 Creation

	13 *

	14 ******************************************************************************** *

	15 */

	16

	17 /**

	18 * This program outputs the collation elements used for a requested tailoring.

	19 *

	20 * Usage:

	21 * dumpce options... please check main function.

	22 */

	23 #include <unicode/utypes.h>

	24 #include <unicode/ucol.h>

	25 #include <unicode/uloc.h>

	26 #include <unicode/ucoleitr.h>

	27 #include <unicode/uchar.h>

	28 #include <unicode/uscript.h>

	29 #include <unicode/utf16.h>

	30 #include <unicode/putil.h>

	31 #include <unicode/ustring.h>

	32 #include <stdio.h>

	33 #include <stdlib.h>

	34 #include <string.h>

	35 #include <time.h>

	36 #include "ucol_tok.h"

	37 #include "cstring.h"

	38 #include "uoptions.h"

	39 #include "ucol_imp.h"

	40 #include <unicode/ures.h>

	41 #include <unicode/uniset.h>

	42 #include <unicode/usetiter.h>

	43

	44 /**

	45 * Command line option variables.

	46 * These global variables are set according to the options specified on the

	47 * command line by the user.

	48 */

	49 static UOption options[]={

	50 /* 00 */ UOPTION_HELP_H,

	51 /* 01 */ UOPTION_HELP_QUESTION_MARK,

	52 /* 02 */ {"locale", NULL, NULL, NULL, 'l', UOPT_REQUIRES_ARG, 0},

	53 /* 03 */ {"serialize", NULL, NULL, NULL, 'z', UOPT_NO_ARG, 0},

	54 /* 04 */ UOPTION_DESTDIR,

	55 /* 05 */ UOPTION_SOURCEDIR,

	56 /* 06 */ {"attribute", NULL, NULL, NULL, 'a', UOPT_REQUIRES_ARG, 0},

	57 /* 07 */ {"rule", NULL, NULL, NULL, 'r', UOPT_REQUIRES_ARG, 0},

	58 /* 08 */ {"normalization", NULL, NULL, NULL, 'n', UOPT_REQUIRES_ARG, 0},

	59 /* 09 */ {"scripts", NULL, NULL, NULL, 't', UOPT_NO_ARG, 0},

	60 /* 10 */ {"reducehan", NULL, NULL, NULL, 'e', UOPT_NO_ARG, 0},

	61 /* 11 */ UOPTION_VERBOSE,

	62 /* 12 */ {"wholescripts", NULL, NULL, NULL, 'W', UOPT_NO_ARG, 0}

	63 };

	64

	65 /**

	66 * Collator used in this program

	67 */

	68 static UCollator *COLLATOR_;

	69 /**

	70 * Output strea, used in this program

	71 */

	72 static FILE *OUTPUT_;

	73

	74 static UColAttributeValue ATTRIBUTE_[UCOL_ATTRIBUTE_COUNT] = {

	75 UCOL_DEFAULT, UCOL_DEFAULT, UCOL_DEFAULT, UCOL_DEFAULT, UCOL_DEFAULT,

	76 UCOL_DEFAULT, UCOL_DEFAULT, UCOL_DEFAULT,

	77 };

	78

	79 typedef struct {

	80 int value;

	81 char *name;

	82 } EnumNameValuePair;

	83

	84 static const EnumNameValuePair ATTRIBUTE_NAME_[] = {

	85 {UCOL_FRENCH_COLLATION, "UCOL_FRENCH_COLLATION"},

	86 {UCOL_ALTERNATE_HANDLING, "UCOL_ALTERNATE_HANDLING"},

	87 {UCOL_CASE_FIRST, "UCOL_CASE_FIRST"},

	88 {UCOL_CASE_LEVEL, "UCOL_CASE_LEVEL"},

	89 {UCOL_NORMALIZATION_MODE,

	90 "UCOL_NORMALIZATION_MODE\|UCOL_DECOMPOSITION_MODE"},

	91 {UCOL_STRENGTH, "UCOL_STRENGTH"},

	92 {UCOL_HIRAGANA_QUATERNARY_MODE, "UCOL_HIRAGANA_QUATERNARY_MODE"},

	93 {UCOL_NUMERIC_COLLATION, "UCOL_NUMERIC_COLLATION"},

	94 NULL

	95 };

	96

	97 static const EnumNameValuePair ATTRIBUTE_VALUE_[] = {

	98 {UCOL_PRIMARY, "UCOL_PRIMARY"},

	99 {UCOL_SECONDARY, "UCOL_SECONDARY"},

	100 {UCOL_TERTIARY, "UCOL_TERTIARY\|UCOL_DEFAULT_STRENGTH"},

	101 {UCOL_QUATERNARY, "UCOL_QUATERNARY"},

	102 {UCOL_IDENTICAL, "UCOL_IDENTICAL"},

	103 {UCOL_OFF, "UCOL_OFF"},

	104 {UCOL_ON, "UCOL_ON"},

	105 {UCOL_SHIFTED, "UCOL_SHIFTED"},

	106 {UCOL_NON_IGNORABLE, "UCOL_NON_IGNORABLE"},

	107 {UCOL_LOWER_FIRST, "UCOL_LOWER_FIRST"},

	108 {UCOL_UPPER_FIRST, "UCOL_UPPER_FIRST"},

	109 NULL

	110 };

	111

	112 typedef struct {

	113 UChar ch[32];

	114 int count; // number of codepoint

	115 UBool tailored;

	116 } ScriptElement;

	117

	118 /**

	119 * Writes the hexadecimal of a null-terminated array of codepoints into a

	120 * file

	121 * @param f UFILE instance to store

	122 * @param c codepoints array

	123 */

	124 void serialize(FILE f, const UChar c)

	125 {

	126 UChar cp = *(c ++);

	127

	128 fprintf(f, " %04x", cp);

	129

	130 while (*c != 0) {

	131 cp = *(c ++);

	132 fprintf(f, " %04x", cp);

	133 }

	134 }

	135

	136 /**

	137 * Writes the hexadecimal of a non-null-terminated array of codepoints into a

	138 * file

	139 * @param f UFILE instance to store

	140 * @param c codepoints array

	141 * @param l codepoints array length

	142 */

	143 void serialize(FILE f, const UChar c, int l)

	144 {

	145 int count = 1;

	146 UChar cp = *(c ++);

	147

	148 fprintf(f, " %04x", cp);

	149

	150 while (count < l) {

	151 cp = *(c ++);

	152 fprintf(f, " %04x", cp);

	153 count ++;

	154 }

	155 }

	156

	157 /**

	158 * Sets the iterator to the argument string and outputs the collation elements.

	159 * @param f file output stream

	160 * @param iter collation element iterator

	161 */

	162 void serialize(FILE f, UCollationElements iter) {

	163 UChar *codepoint = iter->iteratordata_.string;

	164 // unlikely that sortkeys will be over this size

	165 uint8_t sortkey[64];

	166 uint8_t *psortkey = sortkey;

	167 int sortkeylength = 0;

	168

	169 if (iter->iteratordata_.flags & UCOL_ITER_HASLEN) {

	170 serialize(f, codepoint, iter->iteratordata_.endp - codepoint);

	171 sortkeylength = ucol_getSortKey(iter->iteratordata_.coll, codepoint,

	172 iter->iteratordata_.endp - codepoint, sortkey, 64);

	173 }

	174 else {

	175 serialize(f, codepoint);

	176 sortkeylength = ucol_getSortKey(iter->iteratordata_.coll, codepoint,

	177 -1, sortkey, 64);

	178 }

	179 if (options[11].doesOccur) {

	180 serialize(stdout, codepoint);

	181 fprintf(stdout, "\n");

	182 }

	183

	184 fprintf(f, "; ");

	185

	186 UErrorCode error = U_ZERO_ERROR;

	187 uint32_t ce = ucol_next(iter, &error);

	188 if (U_FAILURE(error)) {

	189 fprintf(f, "Error retrieving collation elements\n");

	190 return;

	191 }

	192

	193 while (TRUE) {

	194 fprintf(f, "[");

	195 if (UCOL_PRIMARYORDER(ce) != 0) {

	196 fprintf(f, "%04x", UCOL_PRIMARYORDER(ce));

	197 }

	198 fprintf(f, ",");

	199 if (UCOL_SECONDARYORDER(ce) != 0) {

	200 fprintf(f, " %02x", UCOL_SECONDARYORDER(ce));

	201 }

	202 fprintf(f, ",");

	203 if (UCOL_TERTIARYORDER(ce) != 0) {

	204 fprintf(f, " %02x", UCOL_TERTIARYORDER(ce));

	205 }

	206 fprintf(f, "] ");

	207

	208 ce = ucol_next(iter, &error);

	209 if (ce == UCOL_NULLORDER) {

	210 break;

	211 }

	212 if (U_FAILURE(error)) {

	213 fprintf(stdout, "Error retrieving collation elements");

	214 return;

	215 }

	216 }

	217

	218 if (sortkeylength > 64) {

	219 fprintf(f, "Sortkey exceeds pre-allocated size");

	220 }

	221

	222 fprintf(f, "[");

	223 while (TRUE) {

	224 fprintf(f, "%02x", *psortkey);

	225 psortkey ++;

	226 if ((*psortkey) == 0) {

	227 break;

	228 }

	229 fprintf(f, " ");

	230 }

	231 fprintf(f, "]\n");

	232 }

	233

	234 /**

	235 * Serializes the contraction within the given argument rule

	236 * @param f file output stream

	237 * @param r rule

	238 * @param rlen rule length

	239 * @param contractionsonly flag to indicate if only contractions are to be

	240 * output or all collation elements

	241 * @param iter iterator to iterate over collation elements

	242 */

	243 void serialize(FILE f, UChar rule, int rlen, UBool contractiononly,

	244 UCollationElements *iter) {

	245 const UChar *current = NULL;

	246 uint32_t strength = 0;

	247 uint32_t chOffset = 0;

	248 uint32_t chLen = 0;

	249 uint32_t exOffset = 0;

	250 uint32_t exLen = 0;

	251 uint32_t prefixOffset = 0;

	252 uint32_t prefixLen = 0;

	253 uint8_t specs = 0;

	254 UBool rstart = TRUE;

	255 UColTokenParser src;

	256 UColOptionSet opts;

	257 UParseError parseError;

	258 UErrorCode error = U_ZERO_ERROR;

	259

	260 src.opts = &opts;

	261

	262 src.source = rule;

	263 src.current = rule;

	264 src.end = rule + rlen;

	265 src.extraCurrent = src.end;

	266 src.extraEnd = src.end + UCOL_TOK_EXTRA_RULE_SPACE_SIZE;

	267

	268

	269 while ((current = ucol_tok_parseNextToken(&src, rstart, &parseError,

	270 &error)) != NULL) {

	271 chOffset = src.parsedToken.charsOffset;

	272 chLen = src.parsedToken.charsLen;

	273 // contractions handled here

	274 if (!contractiononly \|\| chLen > 1) {

	275 ucol_setText(iter, rule + chOffset, chLen, &error);

	276 if (U_FAILURE(error)) {

	277 fprintf(stdout, "Error setting text in iterator\n");

	278 return;

	279 }

	280 serialize(f, iter);

	281 }

	282 rstart = FALSE;

	283 }

	284 }

	285

	286 /**

	287 * Prints the attribute values in the argument collator into the output stream

	288 * @param collator

	289 */

	290 void outputAttribute(UCollator collator, UErrorCode error)

	291 {

	292 UColAttribute attribute = UCOL_FRENCH_COLLATION;

	293 while (attribute < UCOL_ATTRIBUTE_COUNT) {

	294 int count = 0;

	295 while (TRUE) {

	296 // getting attribute name

	297 if (ATTRIBUTE_NAME_[count].value == attribute) {

	298 fprintf(OUTPUT_, "%s = ", ATTRIBUTE_NAME_[count].name);

	299 break;

	300 }

	301 count ++;

	302 }

	303 count = 0;

	304 int attributeval = ucol_getAttribute(collator, attribute, error);

	305 if (U_FAILURE(*error)) {

	306 fprintf(stdout, "Failure in reading collator attribute\n");

	307 return;

	308 }

	309 while (TRUE) {

	310 // getting attribute value

	311 if (ATTRIBUTE_VALUE_[count].value == attributeval) {

	312 fprintf(OUTPUT_, "%s\n", ATTRIBUTE_VALUE_[count].name);

	313 break;

	314 }

	315 count ++;

	316 }

	317 attribute = (UColAttribute)(attribute + 1);

	318 }

	319 }

	320

	321 /**

	322 * Prints the normalization mode in the argument collator into the output stream

	323 * @param collator

	324 */

	325 void outputNormalization(UCollator *collator)

	326 {

	327 UErrorCode status = U_ZERO_ERROR;

	328 int normmode = ucol_getAttribute(collator, UCOL_NORMALIZATION_MODE, &status) ;

	329 int count = 0;

	330 while (TRUE) {

	331 // getting attribute name

	332 if (ATTRIBUTE_VALUE_[count].value == normmode) {

	333 break;

	334 }

	335 count ++;

	336 }

	337 fprintf(OUTPUT_, "NORMALIZATION MODE = %s\n",

	338 ATTRIBUTE_VALUE_[count].name);

	339 }

	340

	341 /**

	342 * Output the collation element belonging to the locale into a file

	343 * @param locale string

	344 * @param fullrules flag to indicate if only tailored collation elements are to

	345 * be output or all collation elements

	346 */

	347 void serialize(const char *locale, UBool tailoredonly) {

	348 UErrorCode error = U_ZERO_ERROR;

	349 UChar str[128];

	350 int strlen = 0;

	351

	352 fprintf(OUTPUT_, "# This file contains the serialized collation elements\n") ;

	353 fprintf(OUTPUT_, "# as of the collation version indicated below.\n");

	354 fprintf(OUTPUT_, "# Data format: xxxx xxxx..; [yyyy, yy, yy] [yyyy, yy, yy] ... [yyyy, yy, yy] [zz zz..\n");

	355 fprintf(OUTPUT_, "# where xxxx are codepoints in hexadecimals,\ n");

	356 fprintf(OUTPUT_, "# yyyyyyyy are the corresponding\n");

	357 fprintf(OUTPUT_, "# collation elements in hexadecimals\n");

	358 fprintf(OUTPUT_, "# and zz are the sortkey values in hexadecima ls\n");

	359

	360 fprintf(OUTPUT_, "\n# Collator information\n");

	361

	362 fprintf(OUTPUT_, "\nLocale: %s\n", locale);

	363 fprintf(stdout, "Locale: %s\n", locale);

	364 UVersionInfo version;

	365 ucol_getVersion(COLLATOR_, version);

	366 fprintf(OUTPUT_, "Version number: %d.%d.%d.%d\n",

	367 version[0], version[1], version[2], version[3]);

	368 outputAttribute(COLLATOR_, &error);

	369 outputNormalization(COLLATOR_);

	370

	371 UCollationElements *iter = ucol_openElements(COLLATOR_, str, strlen,

	372 &error);

	373 if (U_FAILURE(error)) {

	374 fprintf(stdout, "Error creating iterator\n");

	375 return;

	376 }

	377

	378 if (!tailoredonly) {

	379 fprintf(OUTPUT_, "\n# Range of unicode characters\n\n");

	380 UChar32 codepoint = 0;

	381 while (codepoint <= UCHAR_MAX_VALUE) {

	382 if (u_isdefined(codepoint)) {

	383 strlen = 0;

	384 UTF16_APPEND_CHAR_UNSAFE(str, strlen, codepoint);

	385 str[strlen] = 0;

	386 ucol_setText(iter, str, strlen, &error);

	387 if (U_FAILURE(error)) {

	388 fprintf(stdout, "Error setting text in iterator\n");

	389 return;

	390 }

	391 serialize(OUTPUT_, iter);

	392 }

	393 codepoint ++;

	394 }

	395 }

	396

	397 UChar ucarules[0x10000];

	398 UChar *rules;

	399 int32_t rulelength = 0;

	400 rules = ucarules;

	401

	402 if (tailoredonly) {

	403 int32_t rulelength = 0;

	404 const UChar *temp = ucol_getRules(COLLATOR_, &rulelength);

	405 if (rulelength + UCOL_TOK_EXTRA_RULE_SPACE_SIZE > 0x10000) {

	406 rules = (UChar )malloc(sizeof(UChar)

	407 (rulelength + UCOL_TOK_EXTRA_RULE_SPACE_SIZE));

	408 }

	409 memcpy(rules, temp, rulelength * sizeof(UChar));

	410 rules[rulelength] = 0;

	411 fprintf(OUTPUT_, "\n# Tailorings\n\n");

	412 serialize(OUTPUT_, rules, rulelength, FALSE, iter);

	413 if (rules != ucarules) {

	414 free(rules);

	415 }

	416 }

	417 else {

	418 rulelength = ucol_getRulesEx(COLLATOR_, UCOL_FULL_RULES, ucarules,

	419 0x10000);

	420 if (rulelength + UCOL_TOK_EXTRA_RULE_SPACE_SIZE > 0x10000) {

	421 rules = (UChar )malloc(sizeof(UChar)

	422 (rulelength + UCOL_TOK_EXTRA_RULE_SPACE_SIZE));

	423 rulelength = ucol_getRulesEx(COLLATOR_, UCOL_FULL_RULES, rules,

	424 rulelength);

	425 }

	426 fprintf(OUTPUT_, "\n# Contractions\n\n");

	427 serialize(OUTPUT_, rules, rulelength, TRUE, iter);

	428 if (rules != ucarules) {

	429 free(rules);

	430 }

	431 }

	432

	433 ucol_closeElements(iter);

	434 }

	435

	436 /**

	437 * Sets the collator with the attribute values

	438 * @param collator

	439 * @param error status

	440 */

	441 void setAttributes(UCollator collator, UErrorCode error)

	442 {

	443 int count = 0;

	444 while (count < UCOL_ATTRIBUTE_COUNT) {

	445 if (ATTRIBUTE_[count] != UCOL_DEFAULT) {

	446 ucol_setAttribute(collator, (UColAttribute)count,

	447 ATTRIBUTE_[count], error);

	448 if (U_FAILURE(*error)) {

	449 return;

	450 }

	451 }

	452 count ++;

	453 }

	454 }

	455

	456 /**

	457 * Appends directory path with an ending seperator if necessary.

	458 * @param path with enough space to append one seperator

	459 * @return new directory path length

	460 */

	461 int appendDirSeparator(char *dir)

	462 {

	463 int dirlength = strlen(dir);

	464 char dirending = dir[dirlength - 1];

	465 if (dirending != U_FILE_SEP_CHAR) {

	466 dir[dirlength] = U_FILE_SEP_CHAR;

	467 dir[dirlength + 1] = 0;

	468 return dirlength + 1;

	469 }

	470 return dirlength;

	471 }

	472

	473 /**

	474 * Output the collation element into a file

	475 */

	476 void serialize() {

	477 char filename[128];

	478 int dirlength = 0;

	479

	480 if (options[4].doesOccur) {

	481 strcpy(filename, options[4].value);

	482 dirlength = appendDirSeparator(filename);

	483 }

	484

	485 if (options[2].doesOccur) {

	486 const char locale = (char )options[2].value;

	487 int32_t localeindex = 0;

	488

	489 if (strcmp(locale, "all") == 0) {

	490 if (options[4].doesOccur) {

	491 strcat(filename, "UCA.txt");

	492 OUTPUT_ = fopen(filename, "w");

	493 if (OUTPUT_ == NULL) {

	494 fprintf(stdout, "Cannot open file:%s\n", filename);

	495 return;

	496 }

	497 }

	498 fprintf(stdout, "UCA\n");

	499 UErrorCode error = U_ZERO_ERROR;

	500 COLLATOR_ = ucol_open("en_US", &error);

	501 if (U_FAILURE(error)) {

	502 fprintf(stdout, "Collator creation failed:");

	503 fprintf(stdout, u_errorName(error));

	504 goto CLOSEUCA;

	505 return;

	506 }

	507 setAttributes(COLLATOR_, &error);

	508 if (U_FAILURE(error)) {

	509 fprintf(stdout, "Collator attribute setting failed:");

	510 fprintf(stdout, u_errorName(error));

	511 goto CLOSEUCA;

	512 return;

	513 }

	514

	515 serialize("UCA", FALSE);

	516 CLOSEUCA :

	517 if (options[4].doesOccur) {

	518 filename[dirlength] = 0;

	519 fclose(OUTPUT_);

	520 }

	521 ucol_close(COLLATOR_);

	522 localeindex = ucol_countAvailable() - 1;

	523 fprintf(stdout, "Number of locales: %d\n", localeindex + 1);

	524 locale = ucol_getAvailable(localeindex);

	525 }

	526

	527 while (TRUE) {

	528 UErrorCode error = U_ZERO_ERROR;

	529 COLLATOR_ = ucol_open(locale, &error);

	530 if (U_FAILURE(error)) {

	531 fprintf(stdout, "Collator creation failed:");

	532 fprintf(stdout, u_errorName(error));

	533 goto CLOSETAILOR;

	534 return;

	535 }

	536 setAttributes(COLLATOR_, &error);

	537 if (U_FAILURE(error)) {

	538 fprintf(stdout, "Collator attribute setting failed:");

	539 fprintf(stdout, u_errorName(error));

	540 goto CLOSETAILOR;

	541 return;

	542 }

	543

	544 if (options[4].doesOccur) {

	545 strcat(filename, locale);

	546 strcat(filename, ".txt");

	547 OUTPUT_ = fopen(filename, "w");

	548 if (OUTPUT_ == NULL) {

	549 fprintf(stdout, "Cannot open file:%s\n", filename);

	550 return;

	551 }

	552 }

	553

	554 if (options[3].doesOccur) {

	555 serialize(locale, TRUE);

	556 }

	557

	558 ucol_close(COLLATOR_);

	559

	560 CLOSETAILOR :

	561 if (options[4].doesOccur) {

	562 filename[dirlength] = 0;

	563 fclose(OUTPUT_);

	564 }

	565

	566 localeindex --;

	567 if (localeindex < 0) {

	568 break;

	569 }

	570 locale = ucol_getAvailable(localeindex);

	571 }

	572 }

	573

	574 if (options[7].doesOccur) {

	575 char inputfilename[128];

	576 // rules are to be used

	577 if (options[5].doesOccur) {

	578 strcpy(inputfilename, options[5].value);

	579 appendDirSeparator(inputfilename);

	580 }

	581 strcat(inputfilename, options[7].value);

	582 FILE *input = fopen(inputfilename, "r");

	583 if (input == NULL) {

	584 fprintf(stdout, "Cannot open file:%s\n", filename);

	585 return;

	586 }

	587

	588 char s[1024];

	589 UChar rule[1024];

	590 UChar *prule = rule;

	591 int size = 1024;

	592 // synwee TODO: make this part dynamic

	593 while (fscanf(input, "%[^\n]s", s) != EOF) {

	594 size -= u_unescape(s, prule, size);

	595 prule = prule + u_strlen(prule);

	596 }

	597 fclose(input);

	598

	599 if (options[4].doesOccur) {

	600 strcat(filename, "Rules.txt");

	601 OUTPUT_ = fopen(filename, "w");

	602 if (OUTPUT_ == NULL) {

	603 fprintf(stdout, "Cannot open file:%s\n", filename);

	604 return;

	605 }

	606 }

	607

	608 fprintf(stdout, "Rules\n");

	609 UErrorCode error = U_ZERO_ERROR;

	610 UParseError parseError;

	611 COLLATOR_ = ucol_openRules(rule, u_strlen(rule), UCOL_DEFAULT,

	612 UCOL_DEFAULT_STRENGTH, &parseError, &error);

	613 if (U_FAILURE(error)) {

	614 fprintf(stdout, "Collator creation failed:");

	615 fprintf(stdout, u_errorName(error));

	616 goto CLOSERULES;

	617 return;

	618 }

	619 setAttributes(COLLATOR_, &error);

	620 if (U_FAILURE(error)) {

	621 fprintf(stdout, "Collator attribute setting failed:");

	622 fprintf(stdout, u_errorName(error));

	623 goto CLOSERULES;

	624 return;

	625 }

	626

	627 serialize("Rule-based", TRUE);

	628 ucol_close(COLLATOR_);

	629

	630 CLOSERULES :

	631 if (options[4].doesOccur) {

	632 filename[dirlength] = 0;

	633 fclose(OUTPUT_);

	634 }

	635 }

	636 }

	637

	638 /**

	639 * Parse for enum values.

	640 * Note this only works for positive enum values.

	641 * @param enumarray array containing names of the enum values in string and

	642 * their corresponding value.

	643 * declared enum value.

	644 * @param str string to be parsed

	645 * @return corresponding integer enum value or -1 if value is not found.

	646 */

	647 int parseEnums(const EnumNameValuePair enumarray[], const char *str)

	648 {

	649 const char *enumname = enumarray[0].name;

	650 int result = atoi(str);

	651 if (result == 0 && str[0] != '0') {

	652 while (strcmp(enumname, str) != 0) {

	653 // checking for multiple enum names sharing the same values

	654 enumname = strstr(enumname, str);

	655 if (enumname != NULL) {

	656 int size = strchr(enumname, '\|') - enumname;

	657 if (size < 0) {

	658 size = strlen(enumname);

	659 }

	660 if (size == (int)strlen(str)) {

	661 return enumarray[result].value;

	662 }

	663 }

	664 result ++;

	665 if (&(enumarray[result]) == NULL) {

	666 return -1;

	667 }

	668 enumname = enumarray[result].name;

	669 }

	670 }

	671 return -1;

	672 }

	673

	674 /**

	675 * Parser for attribute name value pair

	676 */

	677 void parseAttributes() {

	678 char str[32];

	679 const char *pname = options[6].value;

	680 const char *pend = options[6].value + strlen(options[6].value);

	681 const char *pvalue;

	682

	683 while (pname < pend) {

	684 pvalue = strchr(pname, '=');

	685 if (pvalue == NULL) {

	686 fprintf(stdout,

	687 "No matching value found for attribute argument %s\n",

	688 pname);

	689 return;

	690 }

	691 int count = pvalue - pname;

	692 strncpy(str, pname, count);

	693 str[count] = 0;

	694

	695 int name = parseEnums(ATTRIBUTE_NAME_, str);

	696 if (name == -1) {

	697 fprintf(stdout, "Attribute name not found: %s\n", str);

	698 return;

	699 }

	700

	701 pvalue ++;

	702 // getting corresponding enum value

	703 pname = strchr(pvalue, ',');

	704 if (pname == NULL) {

	705 pname = pend;

	706 }

	707 count = pname - pvalue;

	708 strncpy(str, pvalue, count);

	709 str[count] = 0;

	710 int value = parseEnums(ATTRIBUTE_VALUE_, str);

	711 if (value == -1) {

	712 fprintf(stdout, "Attribute value not found: %s\n", str);

	713 return;

	714 }

	715 ATTRIBUTE_[name] = (UColAttributeValue)value;

	716 pname ++;

	717 }

	718 }

	719

	720 /**

	721 * Checks if the locale argument is a base language

	722 * @param locale to be checked

	723 * @return TRUE if it is a base language

	724 */

	725 inline UBool checkLocaleForLanguage(const char *locale)

	726 {

	727 return strlen(locale) <= 2;

	728 }

	729

	730 /**

	731 * Converts a UChar array into its string form "xxxx xxxx"

	732 * @param ch array of UChar characters

	733 * @param count number of UChar characters

	734 */

	735 void outputUChar(UChar ch[], int count)

	736 {

	737 for (int i = 0; i < count; i ++) {

	738 fprintf(OUTPUT_, "%04X ", ch[i]);

	739 }

	740 }

	741

	742 /**

	743 * If it is a primary difference returns -1 or 1.

	744 * If it is a secondary difference returns -2 or 2.

	745 * If it is a tertiary difference returns -3 or 3.

	746 * If equals returns 0.

	747 */

	748 int compareSortKey(const void elem1, const void elem2)

	749 {

	750 // compare the 2 script element sort key

	751 UChar ch1 = ((ScriptElement )elem1)->ch;

	752 UChar ch2 = ((ScriptElement )elem2)->ch;

	753 int size1 = ((ScriptElement *)elem1)->count;

	754 int size2 = ((ScriptElement *)elem2)->count;

	755 UErrorCode error = U_ZERO_ERROR;

	756

	757 ucol_setStrength(COLLATOR_, UCOL_PRIMARY);

	758 int result = ucol_strcoll(COLLATOR_, ch1, size1, ch2, size2);

	759 if (result == 0) {

	760 ucol_setStrength(COLLATOR_, UCOL_SECONDARY);

	761 result = ucol_strcoll(COLLATOR_, ch1, size1, ch2, size2);

	762 if (result == 0) {

	763 ucol_setStrength(COLLATOR_, UCOL_TERTIARY);

	764 result = ucol_strcoll(COLLATOR_, ch1, size1, ch2, size2);

	765 if (result < 0) {

	766 return -3;

	767 }

	768 if (result > 0) {

	769 return 3;

	770 }

	771 }

	772 if (result < 0) {

	773 return -2;

	774 }

	775 if (result > 0) {

	776 return 2;

	777 }

	778 }

	779 return result;

	780 }

	781

	782 /**

	783 * Output serialized script elements

	784 * @param element the element to output

	785 * @param compare the comparison with the previous element

	786 * @param expansion flags TRUE if element has an expansion

	787 */

	788 void outputScriptElem(ScriptElement &element, int compare, UBool expansion)

	789 {

	790 switch (compare) {

	791 case 0:

	792 if (expansion) {

	793 fprintf(OUTPUT_, "<tr><td class='eq' title='[");

	794 }

	795 else {

	796 fprintf(OUTPUT_, "<tr><td class='q' title='[");

	797 }

	798 break;

	799 case -1:

	800 if (expansion) {

	801 fprintf(OUTPUT_, "<tr><td class='ep' title='[");

	802 }

	803 else {

	804 fprintf(OUTPUT_, "<tr><td class='p' title='[");

	805 }

	806 break;

	807 case -2:

	808 if (expansion) {

	809 fprintf(OUTPUT_, "<tr><td class='es' title='[");

	810 }

	811 else {

	812 fprintf(OUTPUT_, "<tr><td class='s' title='[");

	813 }

	814 break;

	815 default:

	816 if (expansion) {

	817 fprintf(OUTPUT_, "<tr><td class='et' title='[");

	818 }

	819 else {

	820 fprintf(OUTPUT_, "<tr><td class='t' title='[");

	821 }

	822 }

	823

	824 uint8_t sortkey[32];

	825 ucol_setStrength(COLLATOR_, UCOL_TERTIARY);

	826 ucol_getSortKey(COLLATOR_, element.ch, element.count, sortkey, 32);

	827 int i = 0;

	828 while (sortkey[i] != 0) {

	829 if (sortkey[i] == 1) {

	830 fprintf(OUTPUT_, " \| ");

	831 }

	832 else {

	833 fprintf(OUTPUT_, "%02x", sortkey[i]);

	834 }

	835

	836 i ++;

	837 }

	838

	839 fprintf(OUTPUT_, "]'>");

	840

	841 UErrorCode error = U_ZERO_ERROR;

	842 char utf8[64];

	843 UChar nfc[32];

	844 int32_t length = unorm_normalize(element.ch, element.count, UNORM_NFC, 0, nfc,

	845 32, &error);

	846 if (U_FAILURE(error)) {

	847 fprintf(stdout, "Error normalizing contractions to NFC\n");

	848 }

	849 u_strToUTF8(utf8, 64, &length, nfc, length, &error);

	850 if (U_FAILURE(error)) {

	851 fprintf(stdout, "Error converting UChar to utf8\n");

	852 return;

	853 }

	854

	855 fprintf(OUTPUT_, "%s<br>", utf8);

	856 fprintf(OUTPUT_, "<tt>");

	857 outputUChar(element.ch, element.count);

	858

	859 if (compare == 0) {

	860 fprintf(OUTPUT_, "</tt></td><td> </td><td> </td><td> </td ><td>Q</td><td>");

	861 }

	862 else if (compare == -1) {

	863 fprintf(OUTPUT_, "</tt></td><td>P</td><td> </td><td> </td><td>  </td><td>");

	864 }

	865 else if (compare == -2) {

	866 fprintf(OUTPUT_, "</tt></td><td> </td><td>S</td><td> </td><td>  </td><td>");

	867 }

	868 else if (compare == -3) {

	869 fprintf(OUTPUT_, "</tt></td><td> </td><td> </td><td>T</td><td>  </td><td>");

	870 }

	871

	872 i = 0;

	873 while (i < element.count) {

	874 char str[128];

	875 UChar32 codepoint;

	876 UTF_NEXT_CHAR(element.ch, i, element.count, codepoint);

	877 int32_t temp = u_charName(codepoint, U_UNICODE_CHAR_NAME, str, 128,

	878 &error);

	879 if (U_FAILURE(error)) {

	880 fprintf(stdout, "Error getting character name\n");

	881 return;

	882 }

	883 if (element.tailored) {

	884 fprintf(OUTPUT_, "<b>");

	885 }

	886 fprintf(OUTPUT_, "%s", str);

	887 if (element.tailored) {

	888 fprintf(OUTPUT_, " *</b>");

	889 }

	890 if (i < element.count) {

	891 fprintf(OUTPUT_, "<br>\n");

	892 }

	893 }

	894

	895 fprintf(OUTPUT_, "</td></tr>\n");

	896 }

	897

	898 /**

	899 * Checks if codepoint belongs to scripts

	900 * @param script list

	901 * @param scriptcount number of scripts

	902 * @param codepoint to test

	903 * @return TRUE if codepoint belongs to scripts

	904 */

	905 UBool checkInScripts(UScriptCode script[], int scriptcount,

	906 UChar32 codepoint)

	907 {

	908 UErrorCode error = U_ZERO_ERROR;

	909 for (int i = 0; i < scriptcount; i ++) {

	910 if (script[i] == USCRIPT_HAN && options[10].doesOccur) {

	911 if ((codepoint >= 0x2E80 && codepoint <= 0x2EE4) \|\|

	912 (codepoint >= 0x2A672 && codepoint <= 0x2A6D6)) {

	913 // reduce han

	914 return TRUE;

	915 }

	916 }

	917 else if (uscript_getScript(codepoint, &error) == script[i]) {

	918 return TRUE;

	919 }

	920 if (U_FAILURE(error)) {

	921 fprintf(stdout, "Error checking character in scripts\n");

	922 return FALSE;

	923 }

	924 }

	925 return FALSE;

	926 }

	927

	928 /**

	929 * Checks if the set of codepoints belongs to the script

	930 * @param script list

	931 * @param scriptcount number of scripts

	932 * @param scriptelem

	933 * @return TRUE if all codepoints belongs to the script

	934 */

	935 inline UBool checkInScripts(UScriptCode script[], int scriptcount,

	936 ScriptElement scriptelem)

	937 {

	938 int i = 0;

	939 while (i < scriptelem.count) {

	940 UChar32 codepoint;

	941 UTF_NEXT_CHAR(scriptelem.ch, i, scriptelem.count, codepoint);

	942 UErrorCode error = U_ZERO_ERROR;

	943 if (checkInScripts(script, scriptcount, codepoint)) {

	944 return TRUE;

	945 }

	946 }

	947 return FALSE;

	948 }

	949

	950 /**

	951 * Gets the script elements and contractions belonging to the script

	952 * @param elems output list

	953 * @param locale locale

	954 * @return number of script elements

	955 * Add by Richard

	956 */

	957 int getScriptElementsFromExemplars(ScriptElement scriptelem[], const char* local e) {

	958 UErrorCode error = U_ZERO_ERROR;

	959 UChar32 codepoint = 0;

	960

	961 UResourceBundle* ures = ures_open(NULL, locale, &error);

	962 if (U_FAILURE(error)) {

	963 fprintf(stdout, "Can not find resource bundle for locale: %s\n", locale);

	964 return -1;

	965 }

	966 int32_t length;

	967 const UChar* exemplarChars = ures_getStringByKey(ures, "ExemplarCharacte rs", &length, &error);

	968

	969 if (U_FAILURE(error)) {

	970 fprintf(stdout, "Can not find ExemplarCharacters in resource bun dle\n");

	971 return -1;

	972 }

	973

	974 UChar* upperChars = new UChar[length*2];

	975 if (upperChars == 0) {

	976 fprintf(stdout, "Memory error\n");

	977 return -1;

	978 }

	979

	980 int32_t destLength = u_strToUpper(upperChars, length*2, exemplarChars, - 1, locale, &error);

	981 if (U_FAILURE(error)) {

	982 fprintf(stdout, "Error when u_strToUpper() \n");

	983 return -1;

	984 }

	985

	986 UChar* pattern = new UChar[length + destLength + 10];

	987 UChar left[2] = {0x005b, 0x0};

	988 UChar right[2] = {0x005d, 0x0};

	989 pattern = u_strcpy(pattern, left);

	990 pattern = u_strcat(pattern, exemplarChars);

	991 pattern = u_strcat(pattern, upperChars);

	992 pattern = u_strcat(pattern, right);

	993

	994 UnicodeSet * uniset = new UnicodeSet(UnicodeString(pattern), error);

	995 if (U_FAILURE(error)) {

	996 fprintf(stdout, "Can not open USet \n");

	997 return -1;

	998 }

	999

	1000 UnicodeSetIterator* usetiter = new UnicodeSetIterator(*uniset);

	1001

	1002 int32_t count = 0;

	1003

	1004 while (usetiter -> next()) {

	1005 if (usetiter -> isString()) {

	1006 UnicodeString strItem = usetiter -> getString();

	1007

	1008 scriptelem[count].count = 0;

	1009 for (int i = 0; i < strItem.length(); i++) {

	1010 codepoint = strItem.char32At(i);

	1011 UTF16_APPEND_CHAR_UNSAFE(scriptelem[count].ch,

	1012 scriptelem[count].count, codepoint);

	1013 scriptelem[count].tailored = FALSE;

	1014 }

	1015 } else {

	1016 codepoint = usetiter -> getCodepoint();

	1017 scriptelem[count].count = 0;

	1018 UTF16_APPEND_CHAR_UNSAFE(scriptelem[count].ch,

	1019 scriptelem[count].count, co depoint);

	1020 scriptelem[count].tailored = FALSE;

	1021 }

	1022

	1023 count++;

	1024 }

	1025

	1026 return count;

	1027 }

	1028

	1029 /**

	1030 * Gets the script elements and contractions belonging to the script

	1031 * @param script list

	1032 * @param scriptcount number of scripts

	1033 * @param elems output list

	1034 * @return number of script elements

	1035 */

	1036 int getScriptElements(UScriptCode script[], int scriptcount,

	1037 ScriptElement scriptelem[])

	1038 {

	1039 UErrorCode error = U_ZERO_ERROR;

	1040 UChar32 codepoint = 0;

	1041 int count = 0;

	1042 while (codepoint <= UCHAR_MAX_VALUE) {

	1043 if (checkInScripts(script, scriptcount, codepoint)) {

	1044 scriptelem[count].count = 0;

	1045 UTF16_APPEND_CHAR_UNSAFE(scriptelem[count].ch,

	1046 scriptelem[count].count, codepoint);

	1047 scriptelem[count].tailored = FALSE;

	1048 count ++;

	1049 }

	1050 if (U_FAILURE(error)) {

	1051 fprintf(stdout, "Error determining codepoint in script\n");

	1052 return -1;

	1053 }

	1054 codepoint ++;

	1055 }

	1056

	1057 const UChar *current = NULL;

	1058 uint32_t strength = 0;

	1059 uint32_t chOffset = 0;

	1060 uint32_t chLen = 0;

	1061 uint32_t exOffset = 0;

	1062 uint32_t exLen = 0;

	1063 uint32_t prefixOffset = 0;

	1064 uint32_t prefixLen = 0;

	1065 uint8_t specs = 0;

	1066 UBool rstart = TRUE;

	1067 UColTokenParser src;

	1068 UColOptionSet opts;

	1069 UParseError parseError;

	1070

	1071 int32_t rulelength = ucol_getRulesEx(COLLATOR_, UCOL_FULL_RULES, NULL, 0);

	1072 src.source = (UChar )malloc(sizeof(UChar)

	1073 (rulelength + UCOL_TOK_EXTRA_RULE_SPACE_SIZE));

	1074 rulelength = ucol_getRulesEx(COLLATOR_, UCOL_FULL_RULES, src.source,

	1075 rulelength);

	1076 src.current = src.source;

	1077 src.end = src.source + rulelength;

	1078 src.extraCurrent = src.end;

	1079 src.extraEnd = src.end + UCOL_TOK_EXTRA_RULE_SPACE_SIZE;

	1080 src.opts = &opts;

	1081

	1082 /*

	1083 ucol_tok_parseNextToken(&src, &strength, &chOffset,

	1084 &chLen, &exOffset, &exLen,

	1085 &prefixOffset, &prefixLen,

	1086 &specs, rstart, &parseError,

	1087 &error)

	1088 */

	1089 while ((current = ucol_tok_parseNextToken(&src, rstart, &parseError,

	1090 &error)) != NULL) {

	1091 // contractions handled here

	1092 if (chLen > 1) {

	1093 u_strncpy(scriptelem[count].ch, src.source + chOffset, chLen);

	1094 scriptelem[count].count = chLen;

	1095 if (checkInScripts(script, scriptcount, scriptelem[count])) {

	1096 scriptelem[count].tailored = FALSE;

	1097 count ++;

	1098 }

	1099 }

	1100 rstart = FALSE;

	1101 }

	1102 if (U_FAILURE(error)) {

	1103 fprintf(stdout, "Error parsing rules: %s\n", u_errorName(error));

	1104 }

	1105 // rule might have been reallocated, so delete this instead

	1106 free(src.source);

	1107 return count;

	1108 }

	1109

	1110 int compareCodepoints(const void elem1, const void elem2)

	1111 {

	1112 UChar ch1 = ((ScriptElement )elem1)->ch; // key

	1113 UChar ch2 = ((ScriptElement )elem2)->ch;

	1114 ch1[((ScriptElement *)elem1)->count] = 0;

	1115 ch2[((ScriptElement *)elem2)->count] = 0;

	1116

	1117 // compare the 2 codepoints

	1118 return u_strcmp(ch1, ch2);

	1119 }

	1120

	1121 UBool hasSubNFD(ScriptElement &se, ScriptElement &key)

	1122 {

	1123 UChar *ch1 = se.ch;

	1124 UChar *ch2 = key.ch; // key

	1125 ch1[se.count] = 0;

	1126 ch2[key.count] = 0;

	1127

	1128 // compare the 2 codepoints

	1129 if (u_strstr(ch1, ch2) != NULL) {

	1130 return TRUE;

	1131 }

	1132

	1133 // check the decomposition

	1134 UChar norm[32];

	1135 UErrorCode error = U_ZERO_ERROR;

	1136 int size = unorm_normalize(ch1, se.count, UNORM_NFD, 0, norm, 32,

	1137 &error);

	1138 if (U_FAILURE(error)) {

	1139 fprintf(stdout, "Error normalizing\n");

	1140 }

	1141 if (u_strstr(norm, ch2) != NULL) {

	1142 return TRUE;

	1143 }

	1144 return FALSE;

	1145 }

	1146

	1147 /**

	1148 * Marks tailored elements

	1149 * @param script list

	1150 * @param scriptcount number of scripts

	1151 * @param scriptelem script element list

	1152 * @param scriptelemlength size of the script element list

	1153 */

	1154 void markTailored(UScriptCode script[], int scriptcount,

	1155 ScriptElement scriptelem[], int scriptelemlength)

	1156 {

	1157 int32_t rulelength;

	1158 const UChar *rule = ucol_getRules(COLLATOR_, &rulelength);

	1159

	1160 const UChar *current = NULL;

	1161 uint32_t strength = 0;

	1162 uint32_t chOffset = 0;

	1163 uint32_t chLen = 0;

	1164 uint32_t exOffset = 0;

	1165 uint32_t exLen = 0;

	1166 uint32_t prefixOffset = 0;

	1167 uint32_t prefixLen = 0;

	1168 uint8_t specs = 0;

	1169 UBool rstart = TRUE;

	1170 UColTokenParser src;

	1171 UColOptionSet opts;

	1172 UParseError parseError;

	1173

	1174 src.opts = &opts;

	1175 src.source = (UChar *)malloc(

	1176 (rulelength + UCOL_TOK_EXTRA_RULE_SPACE_SIZE) * sizeof(UChar));

	1177 memcpy(src.source, rule, rulelength * sizeof(UChar));

	1178 src.current = src.source;

	1179 src.end = (UChar *)src.source + rulelength;

	1180 src.extraCurrent = src.end;

	1181 src.extraEnd = src.end + UCOL_TOK_EXTRA_RULE_SPACE_SIZE;

	1182

	1183 UErrorCode error = U_ZERO_ERROR;

	1184

	1185 while ((current = ucol_tok_parseNextToken(&src, rstart, &parseError,

	1186 &error)) != NULL) {

	1187 if (chLen >= 1 && strength != UCOL_TOK_RESET) {

	1188 // skipping the reset characters and non useful stuff.

	1189 ScriptElement se;

	1190 u_strncpy(se.ch, src.source + chOffset, chLen);

	1191 se.count = chLen;

	1192

	1193 if (checkInScripts(script, scriptcount, se)) {

	1194 /*

	1195 ScriptElement tse = (ScriptElement )bsearch(&se, scriptelem,

	1196 scriptelemlength,

	1197 sizeof(ScriptElement),

	1198 compareCodepoints);

	1199 */

	1200 for (int i = 0; i < scriptelemlength; i ++) {

	1201 if (!scriptelem[i].tailored &&

	1202 hasSubNFD(scriptelem[i], se)) {

	1203 scriptelem[i].tailored = TRUE;

	1204 }

	1205 }

	1206 }

	1207 }

	1208 rstart = FALSE;

	1209 }

	1210 free(src.source);

	1211 if (U_FAILURE(error)) {

	1212 fprintf(stdout, "Error parsing rules\n");

	1213 }

	1214 }

	1215

	1216 /**

	1217 * Checks if the collation iterator has more than 1 collation element

	1218 * @parem coleiter collation element iterator

	1219 * @return TRUE if collation iterator has more than 1 collation element

	1220 */

	1221 UBool hasExpansions(UCollationElements *coleiter)

	1222 {

	1223 UErrorCode error = U_ZERO_ERROR;

	1224 int32_t ce = ucol_next(coleiter, &error);

	1225 int count = 0;

	1226

	1227 if (U_FAILURE(error)) {

	1228 fprintf(stdout, "Error getting next collation element\n");

	1229 }

	1230 while (ce != UCOL_NULLORDER) {

	1231 if ((UCOL_PRIMARYORDER(ce) != 0) && !isContinuation(ce)) {

	1232 count ++;

	1233 if (count == 2) {

	1234 return TRUE;

	1235 }

	1236 }

	1237 ce = ucol_next(coleiter, &error);

	1238 if (U_FAILURE(error)) {

	1239 fprintf(stdout, "Error getting next collation element\n");

	1240 }

	1241 }

	1242 return FALSE;

	1243 }

	1244

	1245 /**

	1246 * Prints the footer for index.html

	1247 * @param file output file

	1248 */

	1249 void outputHTMLFooter()

	1250 {

	1251 fprintf(OUTPUT_, "</table>\n");

	1252 fprintf(OUTPUT_, "</body>\n");

	1253 fprintf(OUTPUT_, "</html>\n");

	1254 }

	1255

	1256 /**

	1257 * Serialize the codepoints from start to end into an html file.

	1258 * Arranging them into ascending collation order.

	1259 * @param script code list

	1260 * @param scriptcount number of scripts

	1261 */

	1262 //void serializeScripts(UScriptCode script[], int scriptcount)

	1263 //Richard

	1264 void serializeScripts(UScriptCode script[], int scriptcount, const char* locale = NULL)

	1265 {

	1266 UErrorCode error = U_ZERO_ERROR;

	1267

	1268 ScriptElement *scriptelem =

	1269 (ScriptElement )malloc(sizeof(ScriptElement) 0x20000);

	1270 if (scriptelem == NULL) {

	1271 fprintf(stdout, "Memory error\n");

	1272 return;

	1273 }

	1274 int count = 0;

	1275 if(locale) {

	1276 count = getScriptElementsFromExemplars(scriptelem, locale);

	1277 } else {

	1278 count = getScriptElements(script, scriptcount, scriptelem);

	1279 }

	1280

	1281 // Sort script elements using Quicksort algorithm:

	1282 qsort(scriptelem, count, sizeof(ScriptElement), compareCodepoints);

	1283 markTailored(script, scriptcount, scriptelem, count);

	1284 // Sort script elements using Quicksort algorithm:

	1285 qsort(scriptelem, count, sizeof(ScriptElement), compareSortKey);

	1286

	1287 UCollationElements* coleiter = ucol_openElements(COLLATOR_,

	1288 scriptelem[0].ch,

	1289 scriptelem[0].count,

	1290 &error);

	1291 if (U_FAILURE(error)) {

	1292 fprintf(stdout, "Error creating collation element iterator\n");

	1293 return;

	1294 }

	1295

	1296 outputScriptElem(scriptelem[0], -1, hasExpansions(coleiter));

	1297 for (int i = 0; i < count - 1; i ++) {

	1298 ucol_setText(coleiter, scriptelem[i + 1].ch, scriptelem[i + 1].count,

	1299 &error);

	1300 if (U_FAILURE(error)) {

	1301 fprintf(stdout, "Error setting text in collation element iterator\n" );

	1302 return;

	1303 }

	1304 outputScriptElem(scriptelem[i + 1],

	1305 compareSortKey(scriptelem + i, scriptelem + i + 1),

	1306 hasExpansions(coleiter));

	1307 }

	1308 free(scriptelem);

	1309 outputHTMLFooter();

	1310 }

	1311

	1312 /**

	1313 * Prints the header for the html

	1314 * @param locale name

	1315 * @param script

	1316 * @param scriptcount number of scripts

	1317 */

	1318 void outputHTMLHeader(const char *locale, UScriptCode script[],

	1319 int scriptcount)

	1320 {

	1321 fprintf(OUTPUT_, "<html>\n");

	1322 fprintf(OUTPUT_, "<head>\n");

	1323 fprintf(OUTPUT_, "<meta http-equiv=\"Content-Type\" content=\"text/html; cha rset=utf-8\">\n");

	1324 fprintf(OUTPUT_, "<meta http-equiv=\"Content-Language\" content=\"en-us\">\n ");

	1325 fprintf(OUTPUT_, "<link rel=\"stylesheet\" href=\"charts.css\" type=\"text/c ss\">\n");

	1326 fprintf(OUTPUT_, "<title>ICU Collation charts</title>\n");

	1327 fprintf(OUTPUT_, "<base target=\"main\">\n");

	1328 fprintf(OUTPUT_, "</head>\n");

	1329

	1330 fprintf(OUTPUT_, "<body bgcolor=#FFFFFF>\n");

	1331 fprintf(OUTPUT_, "<!--\n");

	1332 fprintf(OUTPUT_, "This file contains sorted characters in ascending order ac cording to the locale stated\n");

	1333 fprintf(OUTPUT_, "If the character is in red, it is tailored in the collatio n rules.\n");

	1334 fprintf(OUTPUT_, "Background colours have certain meanings:\n");

	1335 fprintf(OUTPUT_, "White - equals the previous character\n");

	1336 fprintf(OUTPUT_, "dark blue - primary greater than the previous character\n" );

	1337 fprintf(OUTPUT_, "blue - secondary greater than the previous character\n");

	1338 fprintf(OUTPUT_, "light blue - tertiary greater than the previous character\ n");

	1339 fprintf(OUTPUT_, "--!>\n");

	1340

	1341 fprintf(OUTPUT_, "<table border=0>\n");

	1342 UChar displayname[64];

	1343 UErrorCode error = U_ZERO_ERROR;

	1344 int32_t size = uloc_getDisplayName(locale, "en_US", displayname, 64, &error) ;

	1345 char utf8displayname[128];

	1346 if (U_FAILURE(error)) {

	1347 utf8displayname[0] = 0;

	1348 }

	1349 else {

	1350 int32_t utf8size = 0;

	1351 u_strToUTF8(utf8displayname, 128, &utf8size, displayname, size, &error);

	1352 }

	1353

	1354 fprintf(OUTPUT_, "<tr><th>Locale</th><td class='noborder'>%s</td></tr>\n", u tf8displayname);

	1355 fprintf(OUTPUT_, "<tr><th>Script(s)</th>");

	1356 fprintf(OUTPUT_, "<td class='noborder'>");

	1357 for (int i = 0; i < scriptcount; i ++) {

	1358 fprintf(OUTPUT_, "%s", uscript_getName(script[i]));

	1359 if (i + 1 != scriptcount) {

	1360 fprintf(OUTPUT_, ", ");

	1361 }

	1362 }

	1363 fprintf(OUTPUT_, "</td></tr>\n");

	1364

	1365 fprintf(OUTPUT_, "<tr><th>Rules</th><td class='noborder'><a href=\"http://de v.icu-project.org/cgi-bin/viewcvs.cgi/checkout/icu/source/data/coll/%s.txt\">% s.txt</a></td></tr>\n", locale, locale);

	1366

	1367 UVersionInfo version;

	1368 ucol_getVersion(COLLATOR_, version);

	1369 fprintf(OUTPUT_, "<tr><th>Collator version</th><td class='noborder'>%d.%d.%d .%d</td></tr>\n",

	1370 version[0], version[1], version[2], version[3]);

	1371

	1372 UColAttribute attr = UCOL_FRENCH_COLLATION;

	1373 while (attr < UCOL_ATTRIBUTE_COUNT) {

	1374 UColAttributeValue value = ucol_getAttribute(COLLATOR_, attr, &error);

	1375 if (U_FAILURE(error)) {

	1376 fprintf(stdout, "Error getting attribute\n");

	1377 return;

	1378 }

	1379 if (value != UCOL_DEFAULT) {

	1380 if (attr == UCOL_FRENCH_COLLATION && value != UCOL_OFF) {

	1381 fprintf(OUTPUT_, "<tr><th>French Collation</th><td class='nobord er'>on, code %d</td></tr>\n", value);

	1382 }

	1383 if (attr == UCOL_ALTERNATE_HANDLING && value != UCOL_NON_IGNORABLE) {

	1384 fprintf(OUTPUT_, "<tr><th>Alternate Handling</th><td class='nobo rder'>shifted, code%d</td></tr>\n", value);

	1385 }

	1386 if (attr == UCOL_CASE_FIRST && value != UCOL_OFF) {

	1387 fprintf(OUTPUT_, "<tr><th>Case First</th><td class='noborder'>on , code %d</td></tr>\n", value);

	1388 }

	1389 if (attr == UCOL_CASE_LEVEL && value != UCOL_OFF) {

	1390 fprintf(OUTPUT_, "<tr><th>Case Level</th><td class='noborder'>on , code %d</td></tr>\n", value);

	1391 }

	1392 if (attr == UCOL_NORMALIZATION_MODE && value != UCOL_OFF) {

	1393 fprintf(OUTPUT_, "<tr><th>Normalization</th><td class='noborder' >on, code %d</td></tr>\n", value);

	1394 }

	1395 if (attr == UCOL_STRENGTH && value != UCOL_TERTIARY) {

	1396 fprintf(OUTPUT_, "<tr><th>Strength</th><td class='noborder'>code %d</td></tr>\n", value);

	1397 }

	1398 if (attr == UCOL_HIRAGANA_QUATERNARY_MODE && value != UCOL_OFF) {

	1399 fprintf(OUTPUT_, "<tr><th>Hiragana Quaternary</th><td class='nob order'>on, code %d</td></tr>\n", value);

	1400 }

	1401 }

	1402 attr = (UColAttribute)(attr + 1);

	1403 }

	1404

	1405 // Get UNIX-style time and display as number and string.

	1406 time_t ltime;

	1407 time( &ltime );

	1408 fprintf(OUTPUT_, "<tr><th>Date Generated</th><td class='noborder'>%s</td></t r>", ctime(&ltime));

	1409

	1410 fprintf(OUTPUT_, "</table>\n");

	1411

	1412 fprintf(OUTPUT_, "<p><a href=help.html>How to read the table</a><br>\n");

	1413 fprintf(OUTPUT_, "<a href=http://www.jtcsv.com/cgi-bin/icu-bugs/ target=new> Submit a bug</a></p>\n");

	1414 fprintf(OUTPUT_, "\n<table>\n");

	1415 fprintf(OUTPUT_, "\n<tr><th>Codepoint</th><th>P</th><th>S</th><th>T</th><th> Q</th><th>Name</th></tr>\n");

	1416 }

	1417

	/**
	 * Prints the header of the locale index page: html head, page title and
	 * the opening of a centred paragraph that starts with a link to the UCA
	 * charts. The paragraph is left open for the caller.
	 * @param file output file
	 */
	void outputListHTMLHeader(FILE *file)
	{
	    fprintf(file, "<html>\n");
	    fprintf(file, "<head>\n");
	    fprintf(file, "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">\n");
	    fprintf(file, "<meta http-equiv=\"Content-Language\" content=\"en-us\">\n");
	    fprintf(file, "<title>ICU Collation Charts</title>\n");
	    fprintf(file, "<base target=\"main\">\n");
	    fprintf(file, "</head>\n");
	    fprintf(file, "<body bgcolor=#FFFFFF>\n");
	    fprintf(file, "<h2 align=center>ICU Collation Charts</h2>\n");
	    fprintf(file, "<p align=center>\n");
	    fprintf(file, "<a href=http://www.unicode.org/charts/collation/ target=new>UCA Charts</a><br>");
	}

	1436

	/**
	 * Prints the footer of the locale index page: closes the paragraph left
	 * open by the header, then the body and the html element.
	 * @param file output file
	 */
	void outputListHTMLFooter(FILE *file)
	{
	    fputs("</p>\n", file);
	    fputs("</body>\n", file);
	    fputs("</html>\n", file);
	}

	1448

	1449 /**

	1450 * Gets all scripts and serialize their codepoints into an html file.

	1451 */

	1452 void serializeScripts() {

	1453 char filename[128];

	1454 int dirlength = 0;

	1455

	1456 if (options[4].doesOccur) {

	1457 strcpy(filename, options[4].value);

	1458 dirlength = appendDirSeparator(filename);

	1459 } else {

	1460 filename[0] = 0;

	1461 }

	1462

	1463 const char *locale;

	1464 int32_t localelist = 0;

	1465 int32_t localesize;

	1466

	1467 localesize = ucol_countAvailable();

	1468 locale = ucol_getAvailable(localelist);

	1469

	1470 strcat(filename, "list.html");

	1471 FILE *list = fopen(filename, "w");

	1472 filename[dirlength] = 0;

	1473 if (list == NULL) {

	1474 fprintf(stdout, "Cannot open file: %s\n", filename);

	1475 return;

	1476 }

	1477

	1478 outputListHTMLHeader(list);

	1479 fprintf(list, "<blockquote>\n");

	1480 while (TRUE) {

	1481 UErrorCode error = U_ZERO_ERROR;

	1482 COLLATOR_ = ucol_open(locale, &error);

	1483 if (U_FAILURE(error)) {

	1484 fprintf(stdout, "Collator creation failed:");

	1485 fprintf(stdout, u_errorName(error));

	1486 return;

	1487 }

	1488 if ((error != U_USING_FALLBACK_WARNING && // not tailored

	1489 error != U_USING_DEFAULT_WARNING) \|\|

	1490 checkLocaleForLanguage(locale)) {

	1491 fprintf(list, "<a href=%s.html>%s</a> ", locale, locale);

	1492 setAttributes(COLLATOR_, &error);

	1493 if (U_FAILURE(error)) {

	1494 fprintf(stdout, "Collator attribute setting failed:");

	1495 fprintf(stdout, u_errorName(error));

	1496 return;

	1497 }

	1498

	1499 UScriptCode scriptcode[32];

	1500 uint32_t scriptcount = uscript_getCode(locale, scriptcode, 32,

	1501 &error);

	1502 if (U_FAILURE(error)) {

	1503 fprintf(stdout, "Error getting lcale scripts\n");

	1504 return;

	1505 }

	1506

	1507 strcat(filename, locale);

	1508 strcat(filename, ".html");

	1509 OUTPUT_ = fopen(filename, "w");

	1510 if (OUTPUT_ == NULL) {

	1511 fprintf(stdout, "Cannot open file:%s\n", filename);

	1512 return;

	1513 }

	1514 outputHTMLHeader(locale, scriptcode, scriptcount);

	1515 fprintf(stdout, "%s\n", locale);

	1516

	1517 if(options[12].doesOccur) {

	1518 // use whole scripts

	1519 serializeScripts(scriptcode, scriptcount);

	1520 } else {

	1521 // use exemplar chars

	1522 serializeScripts(scriptcode, scriptcount, locale);

	1523 }

	1524 fclose(OUTPUT_);

	1525 }

	1526 ucol_close(COLLATOR_);

	1527

	1528 filename[dirlength] = 0;

	1529 localelist ++;

	1530 if (localelist == localesize) {

	1531 break;

	1532 }

	1533 locale = ucol_getAvailable(localelist);

	1534 }

	1535 fprintf(list, "<br><a href=help.html>help</a><br>");

	1536 fprintf(list, "</blockquote>\n");

	1537 outputListHTMLFooter(list);

	1538 fclose(list);

	1539 }

	1540

	1541 /**

	1542 * Main -- process command line, read in and pre-process the test file,

	1543 * call other functions to do the actual tests.

	1544 */

	1545 int main(int argc, char *argv[]) {

	1546

	1547 argc = u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]),

	1548 options);

	1549

	1550 // error handling, printing usage message

	1551 if (argc < 0) {

	1552 fprintf(stdout, "error in command line argument: ");

	1553 fprintf(stdout, argv[-argc]);

	1554 fprintf(stdout, "\n");

	1555 }

	1556 if (argc < 0 \|\| options[0].doesOccur \|\| options[1].doesOccur) {

	1557 fprintf(stdout, "Usage: dumpce options...\n"

	1558 "--help\n"

	1559 " Display this message.\n"

	1560 "--locale name\|all\n"

	1561 " ICU locale to use. Default is en_US\n"

	1562 "--serialize\n"

	1563 " Serializes the collation elements in -locale or all locales available and outputs them into --outputdir/locale_ce.txt\n"

	1564 "--destdir dir_name\n"

	1565 " Path for outputing the serialized collation element s. Defaults to stdout if no defined\n"

	1566 "--sourcedir dir_name\n"

	1567 " Path for the input rule file for collation\n"

	1568 "--attribute name=value,name=value...\n"

	1569 " Pairs of attribute names and values for setting\n"

	1570 "--rule filename\n"

	1571 " Name of file containing the collation rules.\n"

	1572 "--normalizaton mode\n"

	1573 " UNormalizationMode mode to be used.\n"

	1574 "--scripts\n"

	1575 " Codepoints from all scripts are sorted and serializ ed.\n"

	1576 "--reducehan\n"

	1577 " Only 200 Han script characters will be displayed wi th the use of --scripts.\n"

	1578 "--wholescripts\n"

	1579 " Show collation order for whole scripts instead of j ust for exemplar characters of a locale\n\n");

	1580

	1581 fprintf(stdout, "Example to generate *.txt files : dumpce --serialize -- locale af --destdir /temp --attribute UCOL_STRENGTH=UCOL_DEFAULT_STRENGTH,4=17\n \n");

	1582 fprintf(stdout, "Example to generate *.html files for oss web display: d umpce --scripts --destdir /temp --reducehan\n");

	1583 return argc < 0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;

	1584 }

	1585

	1586 OUTPUT_ = stdout;

	1587 if (options[6].doesOccur) {

	1588 fprintf(stdout, "attributes %s\n", options[6].value);

	1589 parseAttributes();

	1590 }

	1591 if (options[3].doesOccur) {

	1592 serialize();

	1593 }

	1594 if (options[9].doesOccur) {

	1595 serializeScripts();

	1596 }

	1597 return 0;

	1598 }

OLD	NEW

« no previous file with comments | « icu46/source/tools/dumpce/Makefile.in ('k') | icu46/source/tools/dumpce/dumpce.dsp » ('j') | no next file with comments »