Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(3)

Unified Diff: icu46/source/tools/genctd/genctd.cpp

Issue 6370014: CJK segmentation patch for ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/
Patch Set: Created 9 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « icu46/source/tools/genctd/Makefile.in ('k') | no next file » | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: icu46/source/tools/genctd/genctd.cpp
===================================================================
--- icu46/source/tools/genctd/genctd.cpp (revision 68397)
+++ icu46/source/tools/genctd/genctd.cpp (working copy)
@@ -1,6 +1,6 @@
/*
**********************************************************************
-* Copyright (C) 2002-2009, International Business Machines
+* Copyright (C) 2002-2010, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*
@@ -34,12 +34,15 @@
#include "unicode/udata.h"
#include "unicode/putil.h"
+//#include "unicode/ustdio.h"
+
#include "uoptions.h"
#include "unewdata.h"
#include "ucmndata.h"
#include "rbbidata.h"
#include "triedict.h"
#include "cmemory.h"
+#include "uassert.h"
#include <stdio.h>
#include <stdlib.h>
@@ -199,147 +202,191 @@
long wordFileSize;
FILE *file;
char *wordBufferC;
-
+ MutableTrieDictionary *mtd = NULL;
+
file = fopen(wordFileName, "rb");
- if( file == 0 ) {
- fprintf(stderr, "Could not open file \"%s\"\n", wordFileName);
- exit(-1);
- }
- fseek(file, 0, SEEK_END);
- wordFileSize = ftell(file);
- fseek(file, 0, SEEK_SET);
- wordBufferC = new char[wordFileSize+10];
+ if( file == 0 ) { //cannot find file
+ //create 1-line dummy file: i.e. 1 char, 1 value
+ UNewDataMemory *pData;
+ char msg[1024];
- result = (long)fread(wordBufferC, 1, wordFileSize, file);
- if (result != wordFileSize) {
- fprintf(stderr, "Error reading file \"%s\"\n", wordFileName);
- exit (-1);
- }
- wordBufferC[wordFileSize]=0;
- fclose(file);
+ /* write message with just the name */
+ sprintf(msg, "%s not found, genctd writes dummy %s", wordFileName, outFileName);
+ fprintf(stderr, "%s\n", msg);
- //
- // Look for a Unicode Signature (BOM) on the word file
- //
- int32_t signatureLength;
- const char * wordSourceC = wordBufferC;
- const char* encoding = ucnv_detectUnicodeSignature(
- wordSourceC, wordFileSize, &signatureLength, &status);
- if (U_FAILURE(status)) {
- exit(status);
- }
- if(encoding!=NULL ){
- wordSourceC += signatureLength;
- wordFileSize -= signatureLength;
- }
+ UChar c = 0x0020;
+ mtd = new MutableTrieDictionary(c, status, TRUE);
+ mtd->addWord(&c, 1, status, 1);
- //
- // Open a converter to take the rule file to UTF-16
- //
- UConverter* conv;
- conv = ucnv_open(encoding, &status);
- if (U_FAILURE(status)) {
- fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));
- exit(status);
- }
-
- //
- // Convert the words to UChar.
- // Preflight first to determine required buffer size.
- //
- uint32_t destCap = ucnv_toUChars(conv,
- NULL, // dest,
- 0, // destCapacity,
- wordSourceC,
- wordFileSize,
- &status);
- if (status != U_BUFFER_OVERFLOW_ERROR) {
- fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
- exit(status);
- };
-
- status = U_ZERO_ERROR;
- UChar *wordSourceU = new UChar[destCap+1];
- ucnv_toUChars(conv,
- wordSourceU, // dest,
- destCap+1,
- wordSourceC,
- wordFileSize,
- &status);
- if (U_FAILURE(status)) {
- fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
- exit(status);
- };
- ucnv_close(conv);
-
- // Get rid of the original file buffer
- delete[] wordBufferC;
-
- // Create a MutableTrieDictionary, and loop through all the lines, inserting
- // words.
-
- // First, pick a median character.
- UChar *current = wordSourceU + (destCap/2);
- UChar uc = *current++;
- UnicodeSet breaks;
- breaks.add(0x000A); // Line Feed
- breaks.add(0x000D); // Carriage Return
- breaks.add(0x2028); // Line Separator
- breaks.add(0x2029); // Paragraph Separator
-
- do {
- // Look for line break
- while (uc && !breaks.contains(uc)) {
- uc = *current++;
+ } else { //read words in from input file
+ fseek(file, 0, SEEK_END);
+ wordFileSize = ftell(file);
+ fseek(file, 0, SEEK_SET);
+ wordBufferC = new char[wordFileSize+10];
+
+ result = (long)fread(wordBufferC, 1, wordFileSize, file);
+ if (result != wordFileSize) {
+ fprintf(stderr, "Error reading file \"%s\"\n", wordFileName);
+ exit (-1);
}
- // Now skip to first non-line-break
- while (uc && breaks.contains(uc)) {
- uc = *current++;
+ wordBufferC[wordFileSize]=0;
+ fclose(file);
+
+ //
+ // Look for a Unicode Signature (BOM) on the word file
+ //
+ int32_t signatureLength;
+ const char * wordSourceC = wordBufferC;
+ const char* encoding = ucnv_detectUnicodeSignature(
+ wordSourceC, wordFileSize, &signatureLength, &status);
+ if (U_FAILURE(status)) {
+ exit(status);
}
- }
- while (uc && (breaks.contains(uc) || u_isspace(uc)));
-
- MutableTrieDictionary *mtd = new MutableTrieDictionary(uc, status);
+ if(encoding!=NULL ){
+ wordSourceC += signatureLength;
+ wordFileSize -= signatureLength;
+ }
- if (U_FAILURE(status)) {
- fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_errorName(status));
- exit(status);
- }
+ //
+ // Open a converter to take the word file to UTF-16
+ //
+ UConverter* conv;
+ conv = ucnv_open(encoding, &status);
+ if (U_FAILURE(status)) {
+ fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));
+ exit(status);
+ }
- // Now add the words. Words are non-space characters at the beginning of
- // lines, and must be at least one UChar.
- current = wordSourceU;
- UChar *candidate = current;
- uc = *current++;
- int32_t length = 0;
-
- while (uc) {
- while (uc && !u_isspace(uc)) {
- ++length;
- uc = *current++;
- }
- if (length > 0) {
- mtd->addWord(candidate, length, status);
- if (U_FAILURE(status)) {
- fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s\"\n",
- u_errorName(status));
- exit(status);
+ //
+ // Convert the words to UChar.
+ // Preflight first to determine required buffer size.
+ //
+ uint32_t destCap = ucnv_toUChars(conv,
+ NULL, // dest,
+ 0, // destCapacity,
+ wordSourceC,
+ wordFileSize,
+ &status);
+ if (status != U_BUFFER_OVERFLOW_ERROR) {
+ fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
+ exit(status);
+ };
+
+ status = U_ZERO_ERROR;
+ UChar *wordSourceU = new UChar[destCap+1];
+ ucnv_toUChars(conv,
+ wordSourceU, // dest,
+ destCap+1,
+ wordSourceC,
+ wordFileSize,
+ &status);
+ if (U_FAILURE(status)) {
+ fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
+ exit(status);
+ };
+ ucnv_close(conv);
+
+ // Get rid of the original file buffer
+ delete[] wordBufferC;
+
+ // Create a MutableTrieDictionary, and loop through all the lines, inserting
+ // words.
+
+ // First, pick a median character.
+ UChar *current = wordSourceU + (destCap/2);
+ UChar uc = *current++;
+ UnicodeSet breaks;
+ breaks.add(0x000A); // Line Feed
+ breaks.add(0x000D); // Carriage Return
+ breaks.add(0x2028); // Line Separator
+ breaks.add(0x2029); // Paragraph Separator
+
+ do {
+ // Look for line break
+ while (uc && !breaks.contains(uc)) {
+ uc = *current++;
}
+ // Now skip to first non-line-break
+ while (uc && breaks.contains(uc)) {
+ uc = *current++;
+ }
}
- // Find beginning of next line
- while (uc && !breaks.contains(uc)) {
- uc = *current++;
+ while (uc && (breaks.contains(uc) || u_isspace(uc)));
+
+ mtd = new MutableTrieDictionary(uc, status);
+
+ if (U_FAILURE(status)) {
+ fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_errorName(status));
+ exit(status);
}
- while (uc && breaks.contains(uc)) {
- uc = *current++;
+
+ // Now add the words. Words are non-space characters at the beginning of
+ // lines, and must be at least one UChar. If a word has an associated value,
+ // the value should follow the word on the same line after a tab character.
+ current = wordSourceU;
+ UChar *candidate = current;
+ uc = *current++;
+ int32_t length = 0;
+ int count = 0;
+
+ while (uc) {
+ while (uc && !u_isspace(uc)) {
+ ++length;
+ uc = *current++;
+ }
+
+ UnicodeString valueString;
+ UChar candidateValue;
+ if(uc == 0x0009){ //separator is a tab char, read in number after space
+ while (uc && u_isspace(uc)) {
+ uc = *current++;
+ }
+ while (uc && !u_isspace(uc)) {
+ valueString.append(uc);
+ uc = *current++;
+ }
+ }
+
+ if (length > 0) {
+ count++;
+ if(valueString.length() > 0){
+ mtd->setValued(TRUE);
+
+ uint32_t value = 0;
+ char* s = new char[valueString.length()];
+ valueString.extract(0,valueString.length(), s, valueString.length());
+ int n = sscanf(s, "%ud", &value);
+ U_ASSERT(n == 1);
+ U_ASSERT(value >= 0);
+ mtd->addWord(candidate, length, status, (uint16_t)value);
+ delete[] s;
+ } else {
+ mtd->addWord(candidate, length, status);
+ }
+
+ if (U_FAILURE(status)) {
+ fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s\" at line %d in input file\n",
+ u_errorName(status), count);
+ exit(status);
+ }
+ }
+
+ // Find beginning of next line
+ while (uc && !breaks.contains(uc)) {
+ uc = *current++;
+ }
+ // Find next non-line-breaking character
+ while (uc && breaks.contains(uc)) {
+ uc = *current++;
+ }
+ candidate = current-1;
+ length = 0;
}
- candidate = current-1;
- length = 0;
+
+ // Get rid of the Unicode text buffer
+ delete[] wordSourceU;
}
- // Get rid of the Unicode text buffer
- delete[] wordSourceU;
-
// Now, create a CompactTrieDictionary from the mutable dictionary
CompactTrieDictionary *ctd = new CompactTrieDictionary(*mtd, status);
if (U_FAILURE(status)) {
@@ -393,4 +440,3 @@
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
}
-
« no previous file with comments | « icu46/source/tools/genctd/Makefile.in ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698