OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ******************************************************************************* |
| 3 * |
| 4 * Copyright (C) 2009-2010, International Business Machines |
| 5 * Corporation and others. All Rights Reserved. |
| 6 * |
| 7 ******************************************************************************* |
| 8 * file name: gennorm2.cpp |
| 9 * encoding: US-ASCII |
| 10 * tab size: 8 (not used) |
| 11 * indentation:4 |
| 12 * |
| 13 * created on: 2009nov25 |
| 14 * created by: Markus W. Scherer |
| 15 * |
| 16 * This program reads text files that define Unicode normalization, |
| 17 * parses them, and builds a binary data file. |
| 18 */ |
| 19 |
| 20 #include "unicode/utypes.h" |
| 21 #include "n2builder.h" |
| 22 |
| 23 #include <stdio.h> |
| 24 #include <stdlib.h> |
| 25 #include <string.h> |
| 26 #include "unicode/errorcode.h" |
| 27 #include "unicode/localpointer.h" |
| 28 #include "unicode/putil.h" |
| 29 #include "unicode/uchar.h" |
| 30 #include "unicode/unistr.h" |
| 31 #include "charstr.h" |
| 32 #include "normalizer2impl.h" |
| 33 #include "toolutil.h" |
| 34 #include "uoptions.h" |
| 35 #include "uparse.h" |
| 36 |
| 37 #if UCONFIG_NO_NORMALIZATION |
| 38 #include "unewdata.h" |
| 39 #endif |
| 40 |
/*
 * Number of elements in a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so that the macro behaves like a
 * single primary expression wherever it is used (e.g. as a sizeof operand).
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
| 42 |
| 43 U_NAMESPACE_BEGIN |
| 44 |
| 45 UBool beVerbose=FALSE, haveCopyright=TRUE; |
| 46 |
| 47 U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose); |
| 48 |
| 49 #if !UCONFIG_NO_NORMALIZATION |
| 50 void parseFile(FILE *f, Normalizer2DataBuilder &builder); |
| 51 #endif |
| 52 |
| 53 /* -------------------------------------------------------------------------- */ |
| 54 |
// Indexes into the options[] array.
// The enumerators must stay in the same order as the options[] initializers.
enum {
    HELP_H=0,            // -h
    HELP_QUESTION_MARK,  // -?
    VERBOSE,             // -v, --verbose
    COPYRIGHT,           // -c, --copyright
    SOURCEDIR,           // -s, --sourcedir
    OUTPUT_FILENAME,     // -o, --output
    UNICODE_VERSION,     // -u, --unicode
    OPT_FAST             // --fast
};
| 65 |
| 66 static UOption options[]={ |
| 67 UOPTION_HELP_H, |
| 68 UOPTION_HELP_QUESTION_MARK, |
| 69 UOPTION_VERBOSE, |
| 70 UOPTION_COPYRIGHT, |
| 71 UOPTION_SOURCEDIR, |
| 72 UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG), |
| 73 UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG), |
| 74 UOPTION_DEF("fast", '\1', UOPT_NO_ARG) |
| 75 }; |
| 76 |
| 77 extern "C" int |
| 78 main(int argc, char* argv[]) { |
| 79 U_MAIN_INIT_ARGS(argc, argv); |
| 80 |
| 81 /* preset then read command line options */ |
| 82 options[SOURCEDIR].value=""; |
| 83 options[UNICODE_VERSION].value=U_UNICODE_VERSION; |
| 84 argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[HELP_H]), option
s); |
| 85 |
| 86 /* error handling, printing usage message */ |
| 87 if(argc<0) { |
| 88 fprintf(stderr, |
| 89 "error in command line argument \"%s\"\n", |
| 90 argv[-argc]); |
| 91 } |
| 92 if(!options[OUTPUT_FILENAME].doesOccur) { |
| 93 argc=-1; |
| 94 } |
| 95 if( argc<2 || |
| 96 options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur |
| 97 ) { |
| 98 /* |
| 99 * Broken into chunks because the C89 standard says the minimum |
| 100 * required supported string length is 509 bytes. |
| 101 */ |
| 102 fprintf(stderr, |
| 103 "Usage: %s [-options] infiles+ -o outputfilename\n" |
| 104 "\n" |
| 105 "Reads the infiles with normalization data and\n" |
| 106 "creates a binary file (outputfilename) with the data.\n" |
| 107 "\n", |
| 108 argv[0]); |
| 109 fprintf(stderr, |
| 110 "Options:\n" |
| 111 "\t-h or -? or --help this usage text\n" |
| 112 "\t-v or --verbose verbose output\n" |
| 113 "\t-c or --copyright include a copyright notice\n" |
| 114 "\t-u or --unicode Unicode version, followed by the version like
5.2.0\n"); |
| 115 fprintf(stderr, |
| 116 "\t-s or --sourcedir source directory, followed by the path\n" |
| 117 "\t-o or --output output filename\n"); |
| 118 fprintf(stderr, |
| 119 "\t --fast optimize the .nrm file for fast normalization
,\n" |
| 120 "\t which might increase its size (Writes fully
decomposed\n" |
| 121 "\t regular mappings instead of delta mappings.\n
" |
| 122 "\t You should measure the runtime speed to make
sure that\n" |
| 123 "\t this is a good trade-off.)\n"); |
| 124 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; |
| 125 } |
| 126 |
| 127 beVerbose=options[VERBOSE].doesOccur; |
| 128 haveCopyright=options[COPYRIGHT].doesOccur; |
| 129 |
| 130 IcuToolErrorCode errorCode("gennorm2/main()"); |
| 131 |
| 132 #if UCONFIG_NO_NORMALIZATION |
| 133 |
| 134 fprintf(stderr, |
| 135 "gennorm2 writes a dummy binary data file " |
| 136 "because UCONFIG_NO_NORMALIZATION is set, \n" |
| 137 "see icu/source/common/unicode/uconfig.h\n"); |
| 138 udata_createDummy(NULL, NULL, options[OUTPUT_FILENAME].value, errorCode); |
| 139 // Should not return an error since this is the expected behaviour if UCONFI
G_NO_NORMALIZATION is on. |
| 140 // return U_UNSUPPORTED_ERROR; |
| 141 return 0; |
| 142 |
| 143 #else |
| 144 |
| 145 LocalPointer<Normalizer2DataBuilder> builder(new Normalizer2DataBuilder(erro
rCode)); |
| 146 errorCode.assertSuccess(); |
| 147 |
| 148 builder->setUnicodeVersion(options[UNICODE_VERSION].value); |
| 149 |
| 150 if(options[OPT_FAST].doesOccur) { |
| 151 builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST); |
| 152 } |
| 153 |
| 154 // prepare the filename beginning with the source dir |
| 155 CharString filename(options[SOURCEDIR].value, errorCode); |
| 156 int32_t pathLength=filename.length(); |
| 157 if( pathLength>0 && |
| 158 filename[pathLength-1]!=U_FILE_SEP_CHAR && |
| 159 filename[pathLength-1]!=U_FILE_ALT_SEP_CHAR |
| 160 ) { |
| 161 filename.append(U_FILE_SEP_CHAR, errorCode); |
| 162 pathLength=filename.length(); |
| 163 } |
| 164 |
| 165 for(int i=1; i<argc; ++i) { |
| 166 printf("gennorm2: processing %s\n", argv[i]); |
| 167 filename.append(argv[i], errorCode); |
| 168 LocalStdioFilePointer f(fopen(filename.data(), "r")); |
| 169 if(f==NULL) { |
| 170 fprintf(stderr, "gennorm2 error: unable to open %s\n", filename.data
()); |
| 171 exit(U_FILE_ACCESS_ERROR); |
| 172 } |
| 173 builder->setOverrideHandling(Normalizer2DataBuilder::OVERRIDE_PREVIOUS); |
| 174 parseFile(f.getAlias(), *builder); |
| 175 filename.truncate(pathLength); |
| 176 } |
| 177 |
| 178 builder->writeBinaryFile(options[OUTPUT_FILENAME].value); |
| 179 |
| 180 return errorCode.get(); |
| 181 |
| 182 #endif |
| 183 } |
| 184 |
| 185 #if !UCONFIG_NO_NORMALIZATION |
| 186 |
| 187 void parseFile(FILE *f, Normalizer2DataBuilder &builder) { |
| 188 IcuToolErrorCode errorCode("gennorm2/parseFile()"); |
| 189 char line[300]; |
| 190 uint32_t startCP, endCP; |
| 191 while(NULL!=fgets(line, (int)sizeof(line), f)) { |
| 192 char *comment=(char *)strchr(line, '#'); |
| 193 if(comment!=NULL) { |
| 194 *comment=0; |
| 195 } |
| 196 u_rtrim(line); |
| 197 if(line[0]==0) { |
| 198 continue; // skip empty and comment-only lines |
| 199 } |
| 200 if(line[0]=='*') { |
| 201 continue; // reserved syntax |
| 202 } |
| 203 const char *delimiter; |
| 204 int32_t rangeLength= |
| 205 u_parseCodePointRangeAnyTerminator(line, &startCP, &endCP, &delimite
r, errorCode); |
| 206 if(errorCode.isFailure()) { |
| 207 fprintf(stderr, "gennorm2 error: parsing code point range from %s\n"
, line); |
| 208 exit(errorCode.reset()); |
| 209 } |
| 210 delimiter=u_skipWhitespace(delimiter); |
| 211 if(*delimiter==':') { |
| 212 const char *s=u_skipWhitespace(delimiter+1); |
| 213 char *end; |
| 214 unsigned long value=strtoul(s, &end, 10); |
| 215 if(end<=s || *u_skipWhitespace(end)!=0 || value>=0xff) { |
| 216 fprintf(stderr, "gennorm2 error: parsing ccc from %s\n", line); |
| 217 exit(U_PARSE_ERROR); |
| 218 } |
| 219 for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) { |
| 220 builder.setCC(c, (uint8_t)value); |
| 221 } |
| 222 continue; |
| 223 } |
| 224 if(*delimiter=='-') { |
| 225 if(*u_skipWhitespace(delimiter+1)!=0) { |
| 226 fprintf(stderr, "gennorm2 error: parsing remove-mapping %s\n", l
ine); |
| 227 exit(U_PARSE_ERROR); |
| 228 } |
| 229 for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) { |
| 230 builder.removeMapping(c); |
| 231 } |
| 232 continue; |
| 233 } |
| 234 if(*delimiter=='=' || *delimiter=='>') { |
| 235 UChar uchars[Normalizer2Impl::MAPPING_LENGTH_MASK]; |
| 236 int32_t length=u_parseString(delimiter+1, uchars, LENGTHOF(uchars),
NULL, errorCode); |
| 237 if(errorCode.isFailure()) { |
| 238 fprintf(stderr, "gennorm2 error: parsing mapping string from %s\
n", line); |
| 239 exit(errorCode.reset()); |
| 240 } |
| 241 UnicodeString mapping(FALSE, uchars, length); |
| 242 if(*delimiter=='=') { |
| 243 if(rangeLength!=1) { |
| 244 fprintf(stderr, |
| 245 "gennorm2 error: round-trip mapping for more than 1
code point on %s\n", |
| 246 line); |
| 247 exit(U_PARSE_ERROR); |
| 248 } |
| 249 builder.setRoundTripMapping((UChar32)startCP, mapping); |
| 250 } else { |
| 251 for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) { |
| 252 builder.setOneWayMapping(c, mapping); |
| 253 } |
| 254 } |
| 255 continue; |
| 256 } |
| 257 fprintf(stderr, "gennorm2 error: unrecognized data line %s\n", line); |
| 258 exit(U_PARSE_ERROR); |
| 259 } |
| 260 } |
| 261 |
| 262 #endif // !UCONFIG_NO_NORMALIZATION |
| 263 |
| 264 U_NAMESPACE_END |
| 265 |
| 266 /* |
| 267 * Hey, Emacs, please set the following: |
| 268 * |
| 269 * Local Variables: |
| 270 * indent-tabs-mode: nil |
| 271 * End: |
| 272 * |
| 273 */ |
OLD | NEW |