OLD | NEW |
1 /* | 1 /* |
2 ******************************************************************************* | 2 ******************************************************************************* |
3 * | 3 * |
4 * Copyright (C) 2009-2012, International Business Machines | 4 * Copyright (C) 2009-2014, International Business Machines |
5 * Corporation and others. All Rights Reserved. | 5 * Corporation and others. All Rights Reserved. |
6 * | 6 * |
7 ******************************************************************************* | 7 ******************************************************************************* |
8 * file name: n2builder.cpp | 8 * file name: n2builder.cpp |
9 * encoding: US-ASCII | 9 * encoding: US-ASCII |
10 * tab size: 8 (not used) | 10 * tab size: 8 (not used) |
11 * indentation:4 | 11 * indentation:4 |
12 * | 12 * |
13 * created on: 2009nov25 | 13 * created on: 2009nov25 |
14 * created by: Markus W. Scherer | 14 * created by: Markus W. Scherer |
(...skipping 11 matching lines...) Expand all Loading... |
26 #if U_HAVE_STD_STRING | 26 #if U_HAVE_STD_STRING |
27 #include <vector> | 27 #include <vector> |
28 #endif | 28 #endif |
29 #include "unicode/errorcode.h" | 29 #include "unicode/errorcode.h" |
30 #include "unicode/localpointer.h" | 30 #include "unicode/localpointer.h" |
31 #include "unicode/putil.h" | 31 #include "unicode/putil.h" |
32 #include "unicode/udata.h" | 32 #include "unicode/udata.h" |
33 #include "unicode/uniset.h" | 33 #include "unicode/uniset.h" |
34 #include "unicode/unistr.h" | 34 #include "unicode/unistr.h" |
35 #include "unicode/ustring.h" | 35 #include "unicode/ustring.h" |
| 36 #include "charstr.h" |
36 #include "hash.h" | 37 #include "hash.h" |
37 #include "normalizer2impl.h" | 38 #include "normalizer2impl.h" |
38 #include "toolutil.h" | 39 #include "toolutil.h" |
39 #include "unewdata.h" | 40 #include "unewdata.h" |
40 #include "utrie2.h" | 41 #include "utrie2.h" |
41 #include "uvectr32.h" | 42 #include "uvectr32.h" |
42 | 43 #include "writesrc.h" |
43 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) | |
44 | 44 |
45 #if !UCONFIG_NO_NORMALIZATION | 45 #if !UCONFIG_NO_NORMALIZATION |
46 | 46 |
47 /* UDataInfo cf. udata.h */ | 47 /* UDataInfo cf. udata.h */ |
48 static UDataInfo dataInfo={ | 48 static UDataInfo dataInfo={ |
49 sizeof(UDataInfo), | 49 sizeof(UDataInfo), |
50 0, | 50 0, |
51 | 51 |
52 U_IS_BIG_ENDIAN, | 52 U_IS_BIG_ENDIAN, |
53 U_CHARSET_FAMILY, | 53 U_CHARSET_FAMILY, |
54 U_SIZEOF_UCHAR, | 54 U_SIZEOF_UCHAR, |
55 0, | 55 0, |
56 | 56 |
57 { 0x4e, 0x72, 0x6d, 0x32 }, /* dataFormat="Nrm2" */ | 57 { 0x4e, 0x72, 0x6d, 0x32 }, /* dataFormat="Nrm2" */ |
58 { 2, 0, 0, 0 }, /* formatVersion */ | 58 { 2, 0, 0, 0 }, /* formatVersion */ |
59 { 5, 2, 0, 0 } /* dataVersion (Unicode version) */ | 59 { 5, 2, 0, 0 } /* dataVersion (Unicode version) */ |
60 }; | 60 }; |
61 | 61 |
62 U_NAMESPACE_BEGIN | 62 U_NAMESPACE_BEGIN |
63 | 63 |
64 class HangulIterator { | 64 class HangulIterator { |
65 public: | 65 public: |
66 struct Range { | 66 struct Range { |
67 UChar32 start, limit; | 67 UChar32 start, limit; |
68 uint16_t norm16; | 68 uint16_t norm16; |
69 }; | 69 }; |
70 | 70 |
71 HangulIterator() : rangeIndex(0) {} | 71 HangulIterator() : rangeIndex(0) {} |
72 const Range *nextRange() { | 72 const Range *nextRange() { |
73 if(rangeIndex<LENGTHOF(ranges)) { | 73 if(rangeIndex<UPRV_LENGTHOF(ranges)) { |
74 return ranges+rangeIndex++; | 74 return ranges+rangeIndex++; |
75 } else { | 75 } else { |
76 return NULL; | 76 return NULL; |
77 } | 77 } |
78 } | 78 } |
79 void reset() { rangeIndex=0; } | 79 void reset() { rangeIndex=0; } |
80 private: | 80 private: |
81 static const Range ranges[4]; | 81 static const Range ranges[4]; |
82 int32_t rangeIndex; | 82 int32_t rangeIndex; |
83 }; | 83 }; |
(...skipping 79 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
163 U_CDECL_BEGIN | 163 U_CDECL_BEGIN |
164 | 164 |
165 static UBool U_CALLCONV | 165 static UBool U_CALLCONV |
166 enumRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value
) { | 166 enumRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value
) { |
167 return ((Normalizer2DBEnumerator *)context)->rangeHandler(start, end, value)
; | 167 return ((Normalizer2DBEnumerator *)context)->rangeHandler(start, end, value)
; |
168 } | 168 } |
169 | 169 |
170 U_CDECL_END | 170 U_CDECL_END |
171 | 171 |
172 Normalizer2DataBuilder::Normalizer2DataBuilder(UErrorCode &errorCode) : | 172 Normalizer2DataBuilder::Normalizer2DataBuilder(UErrorCode &errorCode) : |
173 phase(0), overrideHandling(OVERRIDE_PREVIOUS), optimization(OPTIMIZE_NOR
MAL) { | 173 phase(0), overrideHandling(OVERRIDE_PREVIOUS), optimization(OPTIMIZE_NOR
MAL), |
| 174 norm16TrieLength(0) { |
174 memset(unicodeVersion, 0, sizeof(unicodeVersion)); | 175 memset(unicodeVersion, 0, sizeof(unicodeVersion)); |
175 normTrie=utrie2_open(0, 0, &errorCode); | 176 normTrie=utrie2_open(0, 0, &errorCode); |
176 normMem=utm_open("gennorm2 normalization structs", 10000, 0x110100, sizeof(N
orm)); | 177 normMem=utm_open("gennorm2 normalization structs", 10000, 0x110100, sizeof(N
orm)); |
177 norms=allocNorm(); // unused Norm struct at index 0 | 178 norms=allocNorm(); // unused Norm struct at index 0 |
178 memset(indexes, 0, sizeof(indexes)); | 179 memset(indexes, 0, sizeof(indexes)); |
179 memset(smallFCD, 0, sizeof(smallFCD)); | 180 memset(smallFCD, 0, sizeof(smallFCD)); |
180 } | 181 } |
181 | 182 |
182 Normalizer2DataBuilder::~Normalizer2DataBuilder() { | 183 Normalizer2DataBuilder::~Normalizer2DataBuilder() { |
183 utrie2_close(normTrie); | 184 utrie2_close(normTrie); |
(...skipping 954 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1138 // which is harmless. | 1139 // which is harmless. |
1139 // As a result, the minimum code points are always BMP code points. | 1140 // As a result, the minimum code points are always BMP code points. |
1140 int32_t minCP=indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]; | 1141 int32_t minCP=indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]; |
1141 if(minCP>=0x10000) { | 1142 if(minCP>=0x10000) { |
1142 indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=U16_LEAD(minCP); | 1143 indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=U16_LEAD(minCP); |
1143 } | 1144 } |
1144 minCP=indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]; | 1145 minCP=indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]; |
1145 if(minCP>=0x10000) { | 1146 if(minCP>=0x10000) { |
1146 indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=U16_LEAD(minCP); | 1147 indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=U16_LEAD(minCP); |
1147 } | 1148 } |
1148 } | |
1149 | 1149 |
1150 void Normalizer2DataBuilder::writeBinaryFile(const char *filename) { | |
1151 processData(); | |
1152 | |
1153 IcuToolErrorCode errorCode("gennorm2/writeBinaryFile()"); | |
1154 utrie2_freeze(norm16Trie, UTRIE2_16_VALUE_BITS, errorCode); | 1150 utrie2_freeze(norm16Trie, UTRIE2_16_VALUE_BITS, errorCode); |
1155 int32_t norm16TrieLength=utrie2_serialize(norm16Trie, NULL, 0, errorCode); | 1151 norm16TrieLength=utrie2_serialize(norm16Trie, NULL, 0, errorCode); |
1156 if(errorCode.get()!=U_BUFFER_OVERFLOW_ERROR) { | 1152 if(errorCode.get()!=U_BUFFER_OVERFLOW_ERROR) { |
1157 fprintf(stderr, "gennorm2 error: unable to freeze/serialize the normaliz
ation trie - %s\n", | 1153 fprintf(stderr, "gennorm2 error: unable to freeze/serialize the normaliz
ation trie - %s\n", |
1158 errorCode.errorName()); | 1154 errorCode.errorName()); |
1159 exit(errorCode.reset()); | 1155 exit(errorCode.reset()); |
1160 } | 1156 } |
1161 errorCode.reset(); | 1157 errorCode.reset(); |
1162 LocalArray<uint8_t> norm16TrieBytes(new uint8_t[norm16TrieLength]); | |
1163 utrie2_serialize(norm16Trie, norm16TrieBytes.getAlias(), norm16TrieLength, e
rrorCode); | |
1164 errorCode.assertSuccess(); | |
1165 | 1158 |
1166 int32_t offset=(int32_t)sizeof(indexes); | 1159 int32_t offset=(int32_t)sizeof(indexes); |
1167 indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET]=offset; | 1160 indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET]=offset; |
1168 offset+=norm16TrieLength; | 1161 offset+=norm16TrieLength; |
1169 indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET]=offset; | 1162 indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET]=offset; |
1170 offset+=extraData.length()*2; | 1163 offset+=extraData.length()*2; |
1171 indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET]=offset; | 1164 indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET]=offset; |
1172 offset+=sizeof(smallFCD); | 1165 offset+=sizeof(smallFCD); |
1173 int32_t totalSize=offset; | 1166 int32_t totalSize=offset; |
1174 for(int32_t i=Normalizer2Impl::IX_RESERVED3_OFFSET; i<=Normalizer2Impl::IX_T
OTAL_SIZE; ++i) { | 1167 for(int32_t i=Normalizer2Impl::IX_RESERVED3_OFFSET; i<=Normalizer2Impl::IX_T
OTAL_SIZE; ++i) { |
(...skipping 12 matching lines...) Expand all Loading... |
1187 printf("minNoNo: 0x%04x\n", (int)indexes[Norma
lizer2Impl::IX_MIN_NO_NO]); | 1180 printf("minNoNo: 0x%04x\n", (int)indexes[Norma
lizer2Impl::IX_MIN_NO_NO]); |
1188 printf("limitNoNo: 0x%04x\n", (int)indexes[Norma
lizer2Impl::IX_LIMIT_NO_NO]); | 1181 printf("limitNoNo: 0x%04x\n", (int)indexes[Norma
lizer2Impl::IX_LIMIT_NO_NO]); |
1189 printf("minMaybeYes: 0x%04x\n", (int)indexes[Norma
lizer2Impl::IX_MIN_MAYBE_YES]); | 1182 printf("minMaybeYes: 0x%04x\n", (int)indexes[Norma
lizer2Impl::IX_MIN_MAYBE_YES]); |
1190 } | 1183 } |
1191 | 1184 |
1192 UVersionInfo nullVersion={ 0, 0, 0, 0 }; | 1185 UVersionInfo nullVersion={ 0, 0, 0, 0 }; |
1193 if(0==memcmp(nullVersion, unicodeVersion, 4)) { | 1186 if(0==memcmp(nullVersion, unicodeVersion, 4)) { |
1194 u_versionFromString(unicodeVersion, U_UNICODE_VERSION); | 1187 u_versionFromString(unicodeVersion, U_UNICODE_VERSION); |
1195 } | 1188 } |
1196 memcpy(dataInfo.dataVersion, unicodeVersion, 4); | 1189 memcpy(dataInfo.dataVersion, unicodeVersion, 4); |
| 1190 } |
| 1191 |
| 1192 void Normalizer2DataBuilder::writeBinaryFile(const char *filename) { |
| 1193 processData(); |
| 1194 |
| 1195 IcuToolErrorCode errorCode("gennorm2/writeBinaryFile()"); |
| 1196 LocalArray<uint8_t> norm16TrieBytes(new uint8_t[norm16TrieLength]); |
| 1197 utrie2_serialize(norm16Trie, norm16TrieBytes.getAlias(), norm16TrieLength, e
rrorCode); |
| 1198 errorCode.assertSuccess(); |
| 1199 |
1197 UNewDataMemory *pData= | 1200 UNewDataMemory *pData= |
1198 udata_create(NULL, NULL, filename, &dataInfo, | 1201 udata_create(NULL, NULL, filename, &dataInfo, |
1199 haveCopyright ? U_COPYRIGHT_STRING : NULL, errorCode); | 1202 haveCopyright ? U_COPYRIGHT_STRING : NULL, errorCode); |
1200 if(errorCode.isFailure()) { | 1203 if(errorCode.isFailure()) { |
1201 fprintf(stderr, "gennorm2 error: unable to create the output file %s - %
s\n", | 1204 fprintf(stderr, "gennorm2 error: unable to create the output file %s - %
s\n", |
1202 filename, errorCode.errorName()); | 1205 filename, errorCode.errorName()); |
1203 exit(errorCode.reset()); | 1206 exit(errorCode.reset()); |
1204 } | 1207 } |
1205 udata_writeBlock(pData, indexes, sizeof(indexes)); | 1208 udata_writeBlock(pData, indexes, sizeof(indexes)); |
1206 udata_writeBlock(pData, norm16TrieBytes.getAlias(), norm16TrieLength); | 1209 udata_writeBlock(pData, norm16TrieBytes.getAlias(), norm16TrieLength); |
1207 udata_writeUString(pData, extraData.getBuffer(), extraData.length()); | 1210 udata_writeUString(pData, extraData.getBuffer(), extraData.length()); |
1208 udata_writeBlock(pData, smallFCD, sizeof(smallFCD)); | 1211 udata_writeBlock(pData, smallFCD, sizeof(smallFCD)); |
1209 int32_t writtenSize=udata_finish(pData, errorCode); | 1212 int32_t writtenSize=udata_finish(pData, errorCode); |
1210 if(errorCode.isFailure()) { | 1213 if(errorCode.isFailure()) { |
1211 fprintf(stderr, "gennorm2: error %s writing the output file\n", errorCod
e.errorName()); | 1214 fprintf(stderr, "gennorm2: error %s writing the output file\n", errorCod
e.errorName()); |
1212 exit(errorCode.reset()); | 1215 exit(errorCode.reset()); |
1213 } | 1216 } |
| 1217 int32_t totalSize=indexes[Normalizer2Impl::IX_TOTAL_SIZE]; |
1214 if(writtenSize!=totalSize) { | 1218 if(writtenSize!=totalSize) { |
1215 fprintf(stderr, "gennorm2 error: written size %ld != calculated size %ld
\n", | 1219 fprintf(stderr, "gennorm2 error: written size %ld != calculated size %ld
\n", |
1216 (long)writtenSize, (long)totalSize); | 1220 (long)writtenSize, (long)totalSize); |
1217 exit(U_INTERNAL_PROGRAM_ERROR); | 1221 exit(U_INTERNAL_PROGRAM_ERROR); |
1218 } | 1222 } |
1219 } | 1223 } |
1220 | 1224 |
| 1225 void |
| 1226 Normalizer2DataBuilder::writeCSourceFile(const char *filename) { |
| 1227 processData(); |
| 1228 |
| 1229 IcuToolErrorCode errorCode("gennorm2/writeCSourceFile()"); |
| 1230 const char *basename=findBasename(filename); |
| 1231 CharString path(filename, (int32_t)(basename-filename), errorCode); |
| 1232 CharString dataName(basename, errorCode); |
| 1233 const char *extension=strrchr(basename, '.'); |
| 1234 if(extension!=NULL) { |
| 1235 dataName.truncate((int32_t)(extension-basename)); |
| 1236 } |
| 1237 errorCode.assertSuccess(); |
| 1238 |
| 1239 LocalArray<uint8_t> norm16TrieBytes(new uint8_t[norm16TrieLength]); |
| 1240 utrie2_serialize(norm16Trie, norm16TrieBytes.getAlias(), norm16TrieLength, e
rrorCode); |
| 1241 errorCode.assertSuccess(); |
| 1242 |
| 1243 FILE *f=usrc_create(path.data(), basename, "icu/source/tools/gennorm2/n2buil
der.cpp"); |
| 1244 if(f==NULL) { |
| 1245 fprintf(stderr, "gennorm2/writeCSourceFile() error: unable to create the
output file %s\n", |
| 1246 filename); |
| 1247 exit(U_FILE_ACCESS_ERROR); |
| 1248 return; |
| 1249 } |
| 1250 char line[100]; |
| 1251 sprintf(line, "static const UVersionInfo %s_formatVersion={", dataName.data(
)); |
| 1252 usrc_writeArray(f, line, dataInfo.formatVersion, 8, 4, "};\n"); |
| 1253 sprintf(line, "static const UVersionInfo %s_dataVersion={", dataName.data())
; |
| 1254 usrc_writeArray(f, line, dataInfo.dataVersion, 8, 4, "};\n\n"); |
| 1255 sprintf(line, "static const int32_t %s_indexes[Normalizer2Impl::IX_COUNT]={\
n", |
| 1256 dataName.data()); |
| 1257 usrc_writeArray(f, |
| 1258 line, |
| 1259 indexes, 32, Normalizer2Impl::IX_COUNT, |
| 1260 "\n};\n\n"); |
| 1261 sprintf(line, "static const uint16_t %s_trieIndex[%%ld]={\n", dataName.data(
)); |
| 1262 usrc_writeUTrie2Arrays(f, |
| 1263 line, NULL, |
| 1264 norm16Trie, |
| 1265 "\n};\n\n"); |
| 1266 sprintf(line, "static const uint16_t %s_extraData[%%ld]={\n", dataName.data(
)); |
| 1267 usrc_writeArray(f, |
| 1268 line, |
| 1269 extraData.getBuffer(), 16, extraData.length(), |
| 1270 "\n};\n\n"); |
| 1271 sprintf(line, "static const uint8_t %s_smallFCD[%%ld]={\n", dataName.data())
; |
| 1272 usrc_writeArray(f, |
| 1273 line, |
| 1274 smallFCD, 8, sizeof(smallFCD), |
| 1275 "\n};\n\n"); |
| 1276 /*fputs( // TODO |
| 1277 "static const UCaseProps %s_singleton={\n" |
| 1278 " NULL,\n" |
| 1279 " %s_indexes,\n" |
| 1280 " %s_extraData,\n" |
| 1281 " %s_smallFCD,\n", |
| 1282 f);*/ |
| 1283 sprintf(line, "static const UTrie2 %s_trie={\n", dataName.data()); |
| 1284 char line2[100]; |
| 1285 sprintf(line2, "%s_trieIndex", dataName.data()); |
| 1286 usrc_writeUTrie2Struct(f, |
| 1287 line, |
| 1288 norm16Trie, line2, NULL, |
| 1289 "};\n"); |
| 1290 fclose(f); |
| 1291 } |
| 1292 |
1221 U_NAMESPACE_END | 1293 U_NAMESPACE_END |
1222 | 1294 |
1223 #endif /* #if !UCONFIG_NO_NORMALIZATION */ | 1295 #endif /* #if !UCONFIG_NO_NORMALIZATION */ |
1224 | 1296 |
1225 /* | 1297 /* |
1226 * Hey, Emacs, please set the following: | 1298 * Hey, Emacs, please set the following: |
1227 * | 1299 * |
1228 * Local Variables: | 1300 * Local Variables: |
1229 * indent-tabs-mode: nil | 1301 * indent-tabs-mode: nil |
1230 * End: | 1302 * End: |
1231 */ | 1303 */ |
OLD | NEW |