OLD | NEW |
1 /* | 1 /* |
2 ********************************************************************** | 2 ********************************************************************** |
3 * Copyright (C) 2002-2009, International Business Machines | 3 * Copyright (C) 2002-2010, International Business Machines |
4 * Corporation and others. All Rights Reserved. | 4 * Corporation and others. All Rights Reserved. |
5 ********************************************************************** | 5 ********************************************************************** |
6 * | 6 * |
7 * File genctd.c | 7 * File genctd.c |
8 */ | 8 */ |
9 | 9 |
10 //-------------------------------------------------------------------- | 10 //-------------------------------------------------------------------- |
11 // | 11 // |
12 // Tool for generating CompactTrieDictionary data files (.ctd files). | 12 // Tool for generating CompactTrieDictionary data files (.ctd files). |
13 // | 13 // |
(...skipping 13 matching lines...) Expand all Loading... |
27 | 27 |
28 #include "unicode/utypes.h" | 28 #include "unicode/utypes.h" |
29 #include "unicode/uchar.h" | 29 #include "unicode/uchar.h" |
30 #include "unicode/ucnv.h" | 30 #include "unicode/ucnv.h" |
31 #include "unicode/uniset.h" | 31 #include "unicode/uniset.h" |
32 #include "unicode/unistr.h" | 32 #include "unicode/unistr.h" |
33 #include "unicode/uclean.h" | 33 #include "unicode/uclean.h" |
34 #include "unicode/udata.h" | 34 #include "unicode/udata.h" |
35 #include "unicode/putil.h" | 35 #include "unicode/putil.h" |
36 | 36 |
| 37 //#include "unicode/ustdio.h" |
| 38 |
37 #include "uoptions.h" | 39 #include "uoptions.h" |
38 #include "unewdata.h" | 40 #include "unewdata.h" |
39 #include "ucmndata.h" | 41 #include "ucmndata.h" |
40 #include "rbbidata.h" | 42 #include "rbbidata.h" |
41 #include "triedict.h" | 43 #include "triedict.h" |
42 #include "cmemory.h" | 44 #include "cmemory.h" |
| 45 #include "uassert.h" |
43 | 46 |
44 #include <stdio.h> | 47 #include <stdio.h> |
45 #include <stdlib.h> | 48 #include <stdlib.h> |
46 #include <string.h> | 49 #include <string.h> |
47 | 50 |
48 U_NAMESPACE_USE | 51 U_NAMESPACE_USE |
49 | 52 |
50 static char *progName; | 53 static char *progName; |
51 static UOption options[]={ | 54 static UOption options[]={ |
52 UOPTION_HELP_H, /* 0 */ | 55 UOPTION_HELP_H, /* 0 */ |
(...skipping 139 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
192 } | 195 } |
193 status = U_ZERO_ERROR; | 196 status = U_ZERO_ERROR; |
194 | 197 |
195 // | 198 // |
196 // Read in the dictionary source file | 199 // Read in the dictionary source file |
197 // | 200 // |
198 long result; | 201 long result; |
199 long wordFileSize; | 202 long wordFileSize; |
200 FILE *file; | 203 FILE *file; |
201 char *wordBufferC; | 204 char *wordBufferC; |
| 205 MutableTrieDictionary *mtd = NULL; |
| 206 |
| 207 file = fopen(wordFileName, "rb"); |
| 208 if( file == 0 ) { //cannot find file |
| 209 //create 1-line dummy file: ie 1 char, 1 value |
| 210 UNewDataMemory *pData; |
| 211 char msg[1024]; |
202 | 212 |
203 file = fopen(wordFileName, "rb"); | 213 /* write message with just the name */ |
204 if( file == 0 ) { | 214 sprintf(msg, "%s not found, genctd writes dummy %s", wordFileName, outFi
leName); |
205 fprintf(stderr, "Could not open file \"%s\"\n", wordFileName); | 215 fprintf(stderr, "%s\n", msg); |
206 exit(-1); | |
207 } | |
208 fseek(file, 0, SEEK_END); | |
209 wordFileSize = ftell(file); | |
210 fseek(file, 0, SEEK_SET); | |
211 wordBufferC = new char[wordFileSize+10]; | |
212 | 216 |
213 result = (long)fread(wordBufferC, 1, wordFileSize, file); | 217 UChar c = 0x0020; |
214 if (result != wordFileSize) { | 218 mtd = new MutableTrieDictionary(c, status, TRUE); |
215 fprintf(stderr, "Error reading file \"%s\"\n", wordFileName); | 219 mtd->addWord(&c, 1, status, 1); |
216 exit (-1); | |
217 } | |
218 wordBufferC[wordFileSize]=0; | |
219 fclose(file); | |
220 | 220 |
221 // | 221 } else { //read words in from input file |
222 // Look for a Unicode Signature (BOM) on the word file | 222 fseek(file, 0, SEEK_END); |
223 // | 223 wordFileSize = ftell(file); |
224 int32_t signatureLength; | 224 fseek(file, 0, SEEK_SET); |
225 const char * wordSourceC = wordBufferC; | 225 wordBufferC = new char[wordFileSize+10]; |
226 const char* encoding = ucnv_detectUnicodeSignature( | 226 |
227 wordSourceC, wordFileSize, &signatureLength, &status)
; | 227 result = (long)fread(wordBufferC, 1, wordFileSize, file); |
228 if (U_FAILURE(status)) { | 228 if (result != wordFileSize) { |
229 exit(status); | 229 fprintf(stderr, "Error reading file \"%s\"\n", wordFileName); |
230 } | 230 exit (-1); |
231 if(encoding!=NULL ){ | |
232 wordSourceC += signatureLength; | |
233 wordFileSize -= signatureLength; | |
234 } | |
235 | |
236 // | |
237 // Open a converter to take the word file to UTF-16 | |
238 // | |
239 UConverter* conv; | |
240 conv = ucnv_open(encoding, &status); | |
241 if (U_FAILURE(status)) { | |
242 fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status)); | |
243 exit(status); | |
244 } | |
245 | |
246 // | |
247 // Convert the words to UChar. | |
248 // Preflight first to determine required buffer size. | |
249 // | |
250 uint32_t destCap = ucnv_toUChars(conv, | |
251 NULL, // dest, | |
252 0, // destCapacity, | |
253 wordSourceC, | |
254 wordFileSize, | |
255 &status); | |
256 if (status != U_BUFFER_OVERFLOW_ERROR) { | |
257 fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)
); | |
258 exit(status); | |
259 }; | |
260 | |
261 status = U_ZERO_ERROR; | |
262 UChar *wordSourceU = new UChar[destCap+1]; | |
263 ucnv_toUChars(conv, | |
264 wordSourceU, // dest, | |
265 destCap+1, | |
266 wordSourceC, | |
267 wordFileSize, | |
268 &status); | |
269 if (U_FAILURE(status)) { | |
270 fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)
); | |
271 exit(status); | |
272 }; | |
273 ucnv_close(conv); | |
274 | |
275 // Get rid of the original file buffer | |
276 delete[] wordBufferC; | |
277 | |
278 // Create a MutableTrieDictionary, and loop through all the lines, inserting | |
279 // words. | |
280 | |
281 // First, pick a median character. | |
282 UChar *current = wordSourceU + (destCap/2); | |
283 UChar uc = *current++; | |
284 UnicodeSet breaks; | |
285 breaks.add(0x000A); // Line Feed | |
286 breaks.add(0x000D); // Carriage Return | |
287 breaks.add(0x2028); // Line Separator | |
288 breaks.add(0x2029); // Paragraph Separator | |
289 | |
290 do { | |
291 // Look for line break | |
292 while (uc && !breaks.contains(uc)) { | |
293 uc = *current++; | |
294 } | 231 } |
295 // Now skip to first non-line-break | 232 wordBufferC[wordFileSize]=0; |
296 while (uc && breaks.contains(uc)) { | 233 fclose(file); |
297 uc = *current++; | 234 |
| 235 // |
| 236 // Look for a Unicode Signature (BOM) on the word file |
| 237 // |
| 238 int32_t signatureLength; |
| 239 const char * wordSourceC = wordBufferC; |
| 240 const char* encoding = ucnv_detectUnicodeSignature( |
| 241 wordSourceC, wordFileSize, &signatureLength, &sta
tus); |
| 242 if (U_FAILURE(status)) { |
| 243 exit(status); |
298 } | 244 } |
299 } | 245 if(encoding!=NULL ){ |
300 while (uc && (breaks.contains(uc) || u_isspace(uc))); | 246 wordSourceC += signatureLength; |
301 | 247 wordFileSize -= signatureLength; |
302 MutableTrieDictionary *mtd = new MutableTrieDictionary(uc, status); | 248 } |
303 | 249 |
304 if (U_FAILURE(status)) { | 250 // |
305 fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_error
Name(status)); | 251 // Open a converter to take the word file to UTF-16 |
306 exit(status); | 252 // |
307 } | 253 UConverter* conv; |
| 254 conv = ucnv_open(encoding, &status); |
| 255 if (U_FAILURE(status)) { |
| 256 fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status)
); |
| 257 exit(status); |
| 258 } |
308 | 259 |
309 // Now add the words. Words are non-space characters at the beginning of | 260 // |
310 // lines, and must be at least one UChar. | 261 // Convert the words to UChar. |
311 current = wordSourceU; | 262 // Preflight first to determine required buffer size. |
312 UChar *candidate = current; | 263 // |
313 uc = *current++; | 264 uint32_t destCap = ucnv_toUChars(conv, |
314 int32_t length = 0; | 265 NULL, // dest, |
315 | 266 0, // destCapacity, |
316 while (uc) { | 267 wordSourceC, |
317 while (uc && !u_isspace(uc)) { | 268 wordFileSize, |
318 ++length; | 269 &status); |
319 uc = *current++; | 270 if (status != U_BUFFER_OVERFLOW_ERROR) { |
320 } | 271 fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(sta
tus)); |
321 if (length > 0) { | 272 exit(status); |
322 mtd->addWord(candidate, length, status); | 273 }; |
323 if (U_FAILURE(status)) { | 274 |
324 fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s\
"\n", | 275 status = U_ZERO_ERROR; |
325 u_errorName(status)); | 276 UChar *wordSourceU = new UChar[destCap+1]; |
326 exit(status); | 277 ucnv_toUChars(conv, |
| 278 wordSourceU, // dest, |
| 279 destCap+1, |
| 280 wordSourceC, |
| 281 wordFileSize, |
| 282 &status); |
| 283 if (U_FAILURE(status)) { |
| 284 fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(sta
tus)); |
| 285 exit(status); |
| 286 }; |
| 287 ucnv_close(conv); |
| 288 |
| 289 // Get rid of the original file buffer |
| 290 delete[] wordBufferC; |
| 291 |
| 292 // Create a MutableTrieDictionary, and loop through all the lines, inser
ting |
| 293 // words. |
| 294 |
| 295 // First, pick a median character. |
| 296 UChar *current = wordSourceU + (destCap/2); |
| 297 UChar uc = *current++; |
| 298 UnicodeSet breaks; |
| 299 breaks.add(0x000A); // Line Feed |
| 300 breaks.add(0x000D); // Carriage Return |
| 301 breaks.add(0x2028); // Line Separator |
| 302 breaks.add(0x2029); // Paragraph Separator |
| 303 |
| 304 do { |
| 305 // Look for line break |
| 306 while (uc && !breaks.contains(uc)) { |
| 307 uc = *current++; |
| 308 } |
| 309 // Now skip to first non-line-break |
| 310 while (uc && breaks.contains(uc)) { |
| 311 uc = *current++; |
327 } | 312 } |
328 } | 313 } |
329 // Find beginning of next line | 314 while (uc && (breaks.contains(uc) || u_isspace(uc))); |
330 while (uc && !breaks.contains(uc)) { | 315 |
331 uc = *current++; | 316 mtd = new MutableTrieDictionary(uc, status); |
| 317 |
| 318 if (U_FAILURE(status)) { |
| 319 fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_e
rrorName(status)); |
| 320 exit(status); |
332 } | 321 } |
333 while (uc && breaks.contains(uc)) { | 322 |
334 uc = *current++; | 323 // Now add the words. Words are non-space characters at the beginning of |
| 324 // lines, and must be at least one UChar. If a word has an associated va
lue, |
| 325 // the value should follow the word on the same line after a tab charact
er. |
| 326 current = wordSourceU; |
| 327 UChar *candidate = current; |
| 328 uc = *current++; |
| 329 int32_t length = 0; |
| 330 int count = 0; |
| 331 |
| 332 while (uc) { |
| 333 while (uc && !u_isspace(uc)) { |
| 334 ++length; |
| 335 uc = *current++; |
| 336 } |
| 337 |
| 338 UnicodeString valueString; |
| 339 UChar candidateValue; |
| 340 if(uc == 0x0009){ //separator is a tab char, read in number after sp
ace |
| 341 » while (uc && u_isspace(uc)) { |
| 342 » » uc = *current++; |
| 343 » } |
| 344 while (uc && !u_isspace(uc)) { |
| 345 valueString.append(uc); |
| 346 uc = *current++; |
| 347 } |
| 348 } |
| 349 |
| 350 if (length > 0) { |
| 351 count++; |
| 352 if(valueString.length() > 0){ |
| 353 mtd->setValued(TRUE); |
| 354 |
| 355 uint32_t value = 0; |
| 356 char* s = new char[valueString.length()]; |
| 357 valueString.extract(0,valueString.length(), s, valueString.l
ength()); |
| 358 int n = sscanf(s, "%ud", &value); |
| 359 U_ASSERT(n == 1); |
| 360 U_ASSERT(value >= 0); |
| 361 mtd->addWord(candidate, length, status, (uint16_t)value); |
| 362 delete[] s; |
| 363 } else { |
| 364 mtd->addWord(candidate, length, status); |
| 365 } |
| 366 |
| 367 if (U_FAILURE(status)) { |
| 368 fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \
"%s\" at line %d in input file\n", |
| 369 u_errorName(status), count); |
| 370 exit(status); |
| 371 } |
| 372 } |
| 373 |
| 374 // Find beginning of next line |
| 375 while (uc && !breaks.contains(uc)) { |
| 376 uc = *current++; |
| 377 } |
| 378 // Find next non-line-breaking character |
| 379 while (uc && breaks.contains(uc)) { |
| 380 uc = *current++; |
| 381 } |
| 382 candidate = current-1; |
| 383 length = 0; |
335 } | 384 } |
336 candidate = current-1; | 385 |
337 length = 0; | 386 // Get rid of the Unicode text buffer |
| 387 delete[] wordSourceU; |
338 } | 388 } |
339 | 389 |
340 // Get rid of the Unicode text buffer | |
341 delete[] wordSourceU; | |
342 | |
343 // Now, create a CompactTrieDictionary from the mutable dictionary | 390 // Now, create a CompactTrieDictionary from the mutable dictionary |
344 CompactTrieDictionary *ctd = new CompactTrieDictionary(*mtd, status); | 391 CompactTrieDictionary *ctd = new CompactTrieDictionary(*mtd, status); |
345 if (U_FAILURE(status)) { | 392 if (U_FAILURE(status)) { |
346 fprintf(stderr, "new CompactTrieDictionary: ICU Error \"%s\"\n", u_error
Name(status)); | 393 fprintf(stderr, "new CompactTrieDictionary: ICU Error \"%s\"\n", u_error
Name(status)); |
347 exit(status); | 394 exit(status); |
348 } | 395 } |
349 | 396 |
350 // Get rid of the MutableTrieDictionary | 397 // Get rid of the MutableTrieDictionary |
351 delete mtd; | 398 delete mtd; |
352 | 399 |
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
386 // Get rid of the CompactTrieDictionary | 433 // Get rid of the CompactTrieDictionary |
387 delete ctd; | 434 delete ctd; |
388 | 435 |
389 u_cleanup(); | 436 u_cleanup(); |
390 | 437 |
391 printf("genctd: tool completed successfully.\n"); | 438 printf("genctd: tool completed successfully.\n"); |
392 return 0; | 439 return 0; |
393 | 440 |
394 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ | 441 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |
395 } | 442 } |
396 | |
OLD | NEW |