Index: icu46/source/tools/genctd/genctd.cpp |
=================================================================== |
--- icu46/source/tools/genctd/genctd.cpp (revision 68397) |
+++ icu46/source/tools/genctd/genctd.cpp (working copy) |
@@ -1,6 +1,6 @@ |
/* |
********************************************************************** |
-* Copyright (C) 2002-2009, International Business Machines |
+* Copyright (C) 2002-2010, International Business Machines |
* Corporation and others. All Rights Reserved. |
********************************************************************** |
* |
@@ -34,12 +34,15 @@ |
#include "unicode/udata.h" |
#include "unicode/putil.h" |
+//#include "unicode/ustdio.h" |
+ |
#include "uoptions.h" |
#include "unewdata.h" |
#include "ucmndata.h" |
#include "rbbidata.h" |
#include "triedict.h" |
#include "cmemory.h" |
+#include "uassert.h" |
#include <stdio.h> |
#include <stdlib.h> |
@@ -199,147 +202,191 @@ |
long wordFileSize; |
FILE *file; |
char *wordBufferC; |
- |
+ MutableTrieDictionary *mtd = NULL; |
+ |
file = fopen(wordFileName, "rb"); |
- if( file == 0 ) { |
- fprintf(stderr, "Could not open file \"%s\"\n", wordFileName); |
- exit(-1); |
- } |
- fseek(file, 0, SEEK_END); |
- wordFileSize = ftell(file); |
- fseek(file, 0, SEEK_SET); |
- wordBufferC = new char[wordFileSize+10]; |
+ if( file == 0 ) { //cannot find file |
+ //no word list available: build a dummy dictionary holding one word (a single space, U+0020) with value 1 |
+ UNewDataMemory *pData; |
+ char msg[1024]; |
- result = (long)fread(wordBufferC, 1, wordFileSize, file); |
- if (result != wordFileSize) { |
- fprintf(stderr, "Error reading file \"%s\"\n", wordFileName); |
- exit (-1); |
- } |
- wordBufferC[wordFileSize]=0; |
- fclose(file); |
+ /* report the missing word file and the name of the dummy output file being written */ |
+ sprintf(msg, "%s not found, genctd writes dummy %s", wordFileName, outFileName); |
+ fprintf(stderr, "%s\n", msg); |
- // |
- // Look for a Unicode Signature (BOM) on the word file |
- // |
- int32_t signatureLength; |
- const char * wordSourceC = wordBufferC; |
- const char* encoding = ucnv_detectUnicodeSignature( |
- wordSourceC, wordFileSize, &signatureLength, &status); |
- if (U_FAILURE(status)) { |
- exit(status); |
- } |
- if(encoding!=NULL ){ |
- wordSourceC += signatureLength; |
- wordFileSize -= signatureLength; |
- } |
+ UChar c = 0x0020; |
+ mtd = new MutableTrieDictionary(c, status, TRUE); |
+ mtd->addWord(&c, 1, status, 1); |
- // |
- // Open a converter to take the rule file to UTF-16 |
- // |
- UConverter* conv; |
- conv = ucnv_open(encoding, &status); |
- if (U_FAILURE(status)) { |
- fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status)); |
- exit(status); |
- } |
- |
- // |
- // Convert the words to UChar. |
- // Preflight first to determine required buffer size. |
- // |
- uint32_t destCap = ucnv_toUChars(conv, |
- NULL, // dest, |
- 0, // destCapacity, |
- wordSourceC, |
- wordFileSize, |
- &status); |
- if (status != U_BUFFER_OVERFLOW_ERROR) { |
- fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); |
- exit(status); |
- }; |
- |
- status = U_ZERO_ERROR; |
- UChar *wordSourceU = new UChar[destCap+1]; |
- ucnv_toUChars(conv, |
- wordSourceU, // dest, |
- destCap+1, |
- wordSourceC, |
- wordFileSize, |
- &status); |
- if (U_FAILURE(status)) { |
- fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); |
- exit(status); |
- }; |
- ucnv_close(conv); |
- |
- // Get rid of the original file buffer |
- delete[] wordBufferC; |
- |
- // Create a MutableTrieDictionary, and loop through all the lines, inserting |
- // words. |
- |
- // First, pick a median character. |
- UChar *current = wordSourceU + (destCap/2); |
- UChar uc = *current++; |
- UnicodeSet breaks; |
- breaks.add(0x000A); // Line Feed |
- breaks.add(0x000D); // Carriage Return |
- breaks.add(0x2028); // Line Separator |
- breaks.add(0x2029); // Paragraph Separator |
- |
- do { |
- // Look for line break |
- while (uc && !breaks.contains(uc)) { |
- uc = *current++; |
+ } else { //read words in from input file |
+ fseek(file, 0, SEEK_END); |
+ wordFileSize = ftell(file); |
+ fseek(file, 0, SEEK_SET); |
+ wordBufferC = new char[wordFileSize+10]; |
+ |
+ result = (long)fread(wordBufferC, 1, wordFileSize, file); |
+ if (result != wordFileSize) { |
+ fprintf(stderr, "Error reading file \"%s\"\n", wordFileName); |
+ exit (-1); |
} |
- // Now skip to first non-line-break |
- while (uc && breaks.contains(uc)) { |
- uc = *current++; |
+ wordBufferC[wordFileSize]=0; |
+ fclose(file); |
+ |
+ // |
+ // Look for a Unicode Signature (BOM) on the word file |
+ // |
+ int32_t signatureLength; |
+ const char * wordSourceC = wordBufferC; |
+ const char* encoding = ucnv_detectUnicodeSignature( |
+ wordSourceC, wordFileSize, &signatureLength, &status); |
+ if (U_FAILURE(status)) { |
+ exit(status); |
} |
- } |
- while (uc && (breaks.contains(uc) || u_isspace(uc))); |
- |
- MutableTrieDictionary *mtd = new MutableTrieDictionary(uc, status); |
+ if(encoding!=NULL ){ |
+ wordSourceC += signatureLength; |
+ wordFileSize -= signatureLength; |
+ } |
- if (U_FAILURE(status)) { |
- fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_errorName(status)); |
- exit(status); |
- } |
+ // |
+ // Open a converter to take the word file to UTF-16 |
+ // |
+ UConverter* conv; |
+ conv = ucnv_open(encoding, &status); |
+ if (U_FAILURE(status)) { |
+ fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status)); |
+ exit(status); |
+ } |
- // Now add the words. Words are non-space characters at the beginning of |
- // lines, and must be at least one UChar. |
- current = wordSourceU; |
- UChar *candidate = current; |
- uc = *current++; |
- int32_t length = 0; |
- |
- while (uc) { |
- while (uc && !u_isspace(uc)) { |
- ++length; |
- uc = *current++; |
- } |
- if (length > 0) { |
- mtd->addWord(candidate, length, status); |
- if (U_FAILURE(status)) { |
- fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s\"\n", |
- u_errorName(status)); |
- exit(status); |
+ // |
+ // Convert the words to UChar. |
+ // Preflight first to determine required buffer size. |
+ // |
+ uint32_t destCap = ucnv_toUChars(conv, |
+ NULL, // dest, |
+ 0, // destCapacity, |
+ wordSourceC, |
+ wordFileSize, |
+ &status); |
+ if (status != U_BUFFER_OVERFLOW_ERROR) { |
+ fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); |
+ exit(status); |
+ }; |
+ |
+ status = U_ZERO_ERROR; |
+ UChar *wordSourceU = new UChar[destCap+1]; |
+ ucnv_toUChars(conv, |
+ wordSourceU, // dest, |
+ destCap+1, |
+ wordSourceC, |
+ wordFileSize, |
+ &status); |
+ if (U_FAILURE(status)) { |
+ fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); |
+ exit(status); |
+ }; |
+ ucnv_close(conv); |
+ |
+ // Get rid of the original file buffer |
+ delete[] wordBufferC; |
+ |
+ // Create a MutableTrieDictionary, and loop through all the lines, inserting |
+ // words. |
+ |
+ // First, pick a median character. |
+ UChar *current = wordSourceU + (destCap/2); |
+ UChar uc = *current++; |
+ UnicodeSet breaks; |
+ breaks.add(0x000A); // Line Feed |
+ breaks.add(0x000D); // Carriage Return |
+ breaks.add(0x2028); // Line Separator |
+ breaks.add(0x2029); // Paragraph Separator |
+ |
+ do { |
+ // Look for line break |
+ while (uc && !breaks.contains(uc)) { |
+ uc = *current++; |
} |
+ // Now skip to first non-line-break |
+ while (uc && breaks.contains(uc)) { |
+ uc = *current++; |
+ } |
} |
- // Find beginning of next line |
- while (uc && !breaks.contains(uc)) { |
- uc = *current++; |
+ while (uc && (breaks.contains(uc) || u_isspace(uc))); |
+ |
+ mtd = new MutableTrieDictionary(uc, status); |
+ |
+ if (U_FAILURE(status)) { |
+ fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_errorName(status)); |
+ exit(status); |
} |
- while (uc && breaks.contains(uc)) { |
- uc = *current++; |
+ |
+ // Now add the words. Words are non-space characters at the beginning of |
+ // lines, and must be at least one UChar. If a word has an associated value, |
+ // the value should follow the word on the same line after a tab character. |
+ current = wordSourceU; |
+ UChar *candidate = current; |
+ uc = *current++; |
+ int32_t length = 0; |
+ int count = 0; |
+ |
+ while (uc) { |
+ while (uc && !u_isspace(uc)) { |
+ ++length; |
+ uc = *current++; |
+ } |
+ |
+ UnicodeString valueString; |
+ UChar candidateValue; |
+ if(uc == 0x0009){ //separator is a tab char, read in number after space |
+ while (uc && u_isspace(uc)) { |
+ uc = *current++; |
+ } |
+ while (uc && !u_isspace(uc)) { |
+ valueString.append(uc); |
+ uc = *current++; |
+ } |
+ } |
+ |
+ if (length > 0) { |
+ count++; |
+ if(valueString.length() > 0){ |
+ mtd->setValued(TRUE); |
+ |
+ uint32_t value = 0; |
+ char* s = new char[valueString.length()]; |
+ valueString.extract(0,valueString.length(), s, valueString.length()); |
+ int n = sscanf(s, "%ud", &value); |
+ U_ASSERT(n == 1); |
+ U_ASSERT(value >= 0); |
+ mtd->addWord(candidate, length, status, (uint16_t)value); |
+ delete[] s; |
+ } else { |
+ mtd->addWord(candidate, length, status); |
+ } |
+ |
+ if (U_FAILURE(status)) { |
+ fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s\" at line %d in input file\n", |
+ u_errorName(status), count); |
+ exit(status); |
+ } |
+ } |
+ |
+ // Find beginning of next line |
+ while (uc && !breaks.contains(uc)) { |
+ uc = *current++; |
+ } |
+ // Find next non-line-breaking character |
+ while (uc && breaks.contains(uc)) { |
+ uc = *current++; |
+ } |
+ candidate = current-1; |
+ length = 0; |
} |
- candidate = current-1; |
- length = 0; |
+ |
+ // Get rid of the Unicode text buffer |
+ delete[] wordSourceU; |
} |
- // Get rid of the Unicode text buffer |
- delete[] wordSourceU; |
- |
// Now, create a CompactTrieDictionary from the mutable dictionary |
CompactTrieDictionary *ctd = new CompactTrieDictionary(*mtd, status); |
if (U_FAILURE(status)) { |
@@ -393,4 +440,3 @@ |
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |
} |
- |