Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(300)

Side by Side Diff: icu46/source/tools/genctd/genctd.cpp

Issue 6370014: CJK segmentation patch for ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/
Patch Set: Created 9 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « icu46/source/tools/genctd/Makefile.in ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 ********************************************************************** 2 **********************************************************************
3 * Copyright (C) 2002-2009, International Business Machines 3 * Copyright (C) 2002-2010, International Business Machines
4 * Corporation and others. All Rights Reserved. 4 * Corporation and others. All Rights Reserved.
5 ********************************************************************** 5 **********************************************************************
6 * 6 *
7 * File genctd.c 7 * File genctd.c
8 */ 8 */
9 9
10 //-------------------------------------------------------------------- 10 //--------------------------------------------------------------------
11 // 11 //
12 // Tool for generating CompactTrieDictionary data files (.ctd files). 12 // Tool for generating CompactTrieDictionary data files (.ctd files).
13 // 13 //
(...skipping 13 matching lines...) Expand all
27 27
28 #include "unicode/utypes.h" 28 #include "unicode/utypes.h"
29 #include "unicode/uchar.h" 29 #include "unicode/uchar.h"
30 #include "unicode/ucnv.h" 30 #include "unicode/ucnv.h"
31 #include "unicode/uniset.h" 31 #include "unicode/uniset.h"
32 #include "unicode/unistr.h" 32 #include "unicode/unistr.h"
33 #include "unicode/uclean.h" 33 #include "unicode/uclean.h"
34 #include "unicode/udata.h" 34 #include "unicode/udata.h"
35 #include "unicode/putil.h" 35 #include "unicode/putil.h"
36 36
37 //#include "unicode/ustdio.h"
38
37 #include "uoptions.h" 39 #include "uoptions.h"
38 #include "unewdata.h" 40 #include "unewdata.h"
39 #include "ucmndata.h" 41 #include "ucmndata.h"
40 #include "rbbidata.h" 42 #include "rbbidata.h"
41 #include "triedict.h" 43 #include "triedict.h"
42 #include "cmemory.h" 44 #include "cmemory.h"
45 #include "uassert.h"
43 46
44 #include <stdio.h> 47 #include <stdio.h>
45 #include <stdlib.h> 48 #include <stdlib.h>
46 #include <string.h> 49 #include <string.h>
47 50
48 U_NAMESPACE_USE 51 U_NAMESPACE_USE
49 52
50 static char *progName; 53 static char *progName;
51 static UOption options[]={ 54 static UOption options[]={
52 UOPTION_HELP_H, /* 0 */ 55 UOPTION_HELP_H, /* 0 */
(...skipping 139 matching lines...) Expand 10 before | Expand all | Expand 10 after
192 } 195 }
193 status = U_ZERO_ERROR; 196 status = U_ZERO_ERROR;
194 197
195 // 198 //
196 // Read in the dictionary source file 199 // Read in the dictionary source file
197 // 200 //
198 long result; 201 long result;
199 long wordFileSize; 202 long wordFileSize;
200 FILE *file; 203 FILE *file;
201 char *wordBufferC; 204 char *wordBufferC;
205 MutableTrieDictionary *mtd = NULL;
206
207 file = fopen(wordFileName, "rb");
208 if( file == 0 ) { //cannot find file
209 //create 1-line dummy file: ie 1 char, 1 value
210 UNewDataMemory *pData;
211 char msg[1024];
202 212
203 file = fopen(wordFileName, "rb"); 213 /* write message with just the name */
204 if( file == 0 ) { 214 sprintf(msg, "%s not found, genctd writes dummy %s", wordFileName, outFileName);
205 fprintf(stderr, "Could not open file \"%s\"\n", wordFileName); 215 fprintf(stderr, "%s\n", msg);
206 exit(-1);
207 }
208 fseek(file, 0, SEEK_END);
209 wordFileSize = ftell(file);
210 fseek(file, 0, SEEK_SET);
211 wordBufferC = new char[wordFileSize+10];
212 216
213 result = (long)fread(wordBufferC, 1, wordFileSize, file); 217 UChar c = 0x0020;
214 if (result != wordFileSize) { 218 mtd = new MutableTrieDictionary(c, status, TRUE);
215 fprintf(stderr, "Error reading file \"%s\"\n", wordFileName); 219 mtd->addWord(&c, 1, status, 1);
216 exit (-1);
217 }
218 wordBufferC[wordFileSize]=0;
219 fclose(file);
220 220
221 // 221 } else { //read words in from input file
222 // Look for a Unicode Signature (BOM) on the word file 222 fseek(file, 0, SEEK_END);
223 // 223 wordFileSize = ftell(file);
224 int32_t signatureLength; 224 fseek(file, 0, SEEK_SET);
225 const char * wordSourceC = wordBufferC; 225 wordBufferC = new char[wordFileSize+10];
226 const char* encoding = ucnv_detectUnicodeSignature( 226
227 wordSourceC, wordFileSize, &signatureLength, &status) ; 227 result = (long)fread(wordBufferC, 1, wordFileSize, file);
228 if (U_FAILURE(status)) { 228 if (result != wordFileSize) {
229 exit(status); 229 fprintf(stderr, "Error reading file \"%s\"\n", wordFileName);
230 } 230 exit (-1);
231 if(encoding!=NULL ){
232 wordSourceC += signatureLength;
233 wordFileSize -= signatureLength;
234 }
235
236 //
237 // Open a converter to take the rule file to UTF-16
238 //
239 UConverter* conv;
240 conv = ucnv_open(encoding, &status);
241 if (U_FAILURE(status)) {
242 fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));
243 exit(status);
244 }
245
246 //
247 // Convert the words to UChar.
248 // Preflight first to determine required buffer size.
249 //
250 uint32_t destCap = ucnv_toUChars(conv,
251 NULL, // dest,
252 0, // destCapacity,
253 wordSourceC,
254 wordFileSize,
255 &status);
256 if (status != U_BUFFER_OVERFLOW_ERROR) {
257 fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
258 exit(status);
259 };
260
261 status = U_ZERO_ERROR;
262 UChar *wordSourceU = new UChar[destCap+1];
263 ucnv_toUChars(conv,
264 wordSourceU, // dest,
265 destCap+1,
266 wordSourceC,
267 wordFileSize,
268 &status);
269 if (U_FAILURE(status)) {
270 fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
271 exit(status);
272 };
273 ucnv_close(conv);
274
275 // Get rid of the original file buffer
276 delete[] wordBufferC;
277
278 // Create a MutableTrieDictionary, and loop through all the lines, inserting
279 // words.
280
281 // First, pick a median character.
282 UChar *current = wordSourceU + (destCap/2);
283 UChar uc = *current++;
284 UnicodeSet breaks;
285 breaks.add(0x000A); // Line Feed
286 breaks.add(0x000D); // Carriage Return
287 breaks.add(0x2028); // Line Separator
288 breaks.add(0x2029); // Paragraph Separator
289
290 do {
291 // Look for line break
292 while (uc && !breaks.contains(uc)) {
293 uc = *current++;
294 } 231 }
295 // Now skip to first non-line-break 232 wordBufferC[wordFileSize]=0;
296 while (uc && breaks.contains(uc)) { 233 fclose(file);
297 uc = *current++; 234
235 //
236 // Look for a Unicode Signature (BOM) on the word file
237 //
238 int32_t signatureLength;
239 const char * wordSourceC = wordBufferC;
240 const char* encoding = ucnv_detectUnicodeSignature(
241 wordSourceC, wordFileSize, &signatureLength, &status);
242 if (U_FAILURE(status)) {
243 exit(status);
298 } 244 }
299 } 245 if(encoding!=NULL ){
300 while (uc && (breaks.contains(uc) || u_isspace(uc))); 246 wordSourceC += signatureLength;
301 247 wordFileSize -= signatureLength;
302 MutableTrieDictionary *mtd = new MutableTrieDictionary(uc, status); 248 }
303 249
304 if (U_FAILURE(status)) { 250 //
305 fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_errorName(status)); 251 // Open a converter to take the rule file to UTF-16
306 exit(status); 252 //
307 } 253 UConverter* conv;
254 conv = ucnv_open(encoding, &status);
255 if (U_FAILURE(status)) {
256 fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));
257 exit(status);
258 }
308 259
309 // Now add the words. Words are non-space characters at the beginning of 260 //
310 // lines, and must be at least one UChar. 261 // Convert the words to UChar.
311 current = wordSourceU; 262 // Preflight first to determine required buffer size.
312 UChar *candidate = current; 263 //
313 uc = *current++; 264 uint32_t destCap = ucnv_toUChars(conv,
314 int32_t length = 0; 265 NULL, // dest,
315 266 0, // destCapacity,
316 while (uc) { 267 wordSourceC,
317 while (uc && !u_isspace(uc)) { 268 wordFileSize,
318 ++length; 269 &status);
319 uc = *current++; 270 if (status != U_BUFFER_OVERFLOW_ERROR) {
320 } 271 fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
321 if (length > 0) { 272 exit(status);
322 mtd->addWord(candidate, length, status); 273 };
323 if (U_FAILURE(status)) { 274
324 fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s\"\n", 275 status = U_ZERO_ERROR;
325 u_errorName(status)); 276 UChar *wordSourceU = new UChar[destCap+1];
326 exit(status); 277 ucnv_toUChars(conv,
278 wordSourceU, // dest,
279 destCap+1,
280 wordSourceC,
281 wordFileSize,
282 &status);
283 if (U_FAILURE(status)) {
284 fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
285 exit(status);
286 };
287 ucnv_close(conv);
288
289 // Get rid of the original file buffer
290 delete[] wordBufferC;
291
292 // Create a MutableTrieDictionary, and loop through all the lines, inserting
293 // words.
294
295 // First, pick a median character.
296 UChar *current = wordSourceU + (destCap/2);
297 UChar uc = *current++;
298 UnicodeSet breaks;
299 breaks.add(0x000A); // Line Feed
300 breaks.add(0x000D); // Carriage Return
301 breaks.add(0x2028); // Line Separator
302 breaks.add(0x2029); // Paragraph Separator
303
304 do {
305 // Look for line break
306 while (uc && !breaks.contains(uc)) {
307 uc = *current++;
308 }
309 // Now skip to first non-line-break
310 while (uc && breaks.contains(uc)) {
311 uc = *current++;
327 } 312 }
328 } 313 }
329 // Find beginning of next line 314 while (uc && (breaks.contains(uc) || u_isspace(uc)));
330 while (uc && !breaks.contains(uc)) { 315
331 uc = *current++; 316 mtd = new MutableTrieDictionary(uc, status);
317
318 if (U_FAILURE(status)) {
319 fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_errorName(status));
320 exit(status);
332 } 321 }
333 while (uc && breaks.contains(uc)) { 322
334 uc = *current++; 323 // Now add the words. Words are non-space characters at the beginning of
324 // lines, and must be at least one UChar. If a word has an associated value,
325 // the value should follow the word on the same line after a tab character.
326 current = wordSourceU;
327 UChar *candidate = current;
328 uc = *current++;
329 int32_t length = 0;
330 int count = 0;
331
332 while (uc) {
333 while (uc && !u_isspace(uc)) {
334 ++length;
335 uc = *current++;
336 }
337
338 UnicodeString valueString;
339 UChar candidateValue;
340 if(uc == 0x0009){ //separator is a tab char, read in number after space
341 » while (uc && u_isspace(uc)) {
342 » » uc = *current++;
343 » }
344 while (uc && !u_isspace(uc)) {
345 valueString.append(uc);
346 uc = *current++;
347 }
348 }
349
350 if (length > 0) {
351 count++;
352 if(valueString.length() > 0){
353 mtd->setValued(TRUE);
354
355 uint32_t value = 0;
356 char* s = new char[valueString.length()];
357 valueString.extract(0,valueString.length(), s, valueString.length());
358 int n = sscanf(s, "%ud", &value);
359 U_ASSERT(n == 1);
360 U_ASSERT(value >= 0);
361 mtd->addWord(candidate, length, status, (uint16_t)value);
362 delete[] s;
363 } else {
364 mtd->addWord(candidate, length, status);
365 }
366
367 if (U_FAILURE(status)) {
368 fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s\" at line %d in input file\n",
369 u_errorName(status), count);
370 exit(status);
371 }
372 }
373
374 // Find beginning of next line
375 while (uc && !breaks.contains(uc)) {
376 uc = *current++;
377 }
378 // Find next non-line-breaking character
379 while (uc && breaks.contains(uc)) {
380 uc = *current++;
381 }
382 candidate = current-1;
383 length = 0;
335 } 384 }
336 candidate = current-1; 385
337 length = 0; 386 // Get rid of the Unicode text buffer
387 delete[] wordSourceU;
338 } 388 }
339 389
340 // Get rid of the Unicode text buffer
341 delete[] wordSourceU;
342
343 // Now, create a CompactTrieDictionary from the mutable dictionary 390 // Now, create a CompactTrieDictionary from the mutable dictionary
344 CompactTrieDictionary *ctd = new CompactTrieDictionary(*mtd, status); 391 CompactTrieDictionary *ctd = new CompactTrieDictionary(*mtd, status);
345 if (U_FAILURE(status)) { 392 if (U_FAILURE(status)) {
346 fprintf(stderr, "new CompactTrieDictionary: ICU Error \"%s\"\n", u_errorName(status)); 393 fprintf(stderr, "new CompactTrieDictionary: ICU Error \"%s\"\n", u_errorName(status));
347 exit(status); 394 exit(status);
348 } 395 }
349 396
350 // Get rid of the MutableTrieDictionary 397 // Get rid of the MutableTrieDictionary
351 delete mtd; 398 delete mtd;
352 399
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after
386 // Get rid of the CompactTrieDictionary 433 // Get rid of the CompactTrieDictionary
387 delete ctd; 434 delete ctd;
388 435
389 u_cleanup(); 436 u_cleanup();
390 437
391 printf("genctd: tool completed successfully.\n"); 438 printf("genctd: tool completed successfully.\n");
392 return 0; 439 return 0;
393 440
394 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 441 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
395 } 442 }
396
OLDNEW
« no previous file with comments | « icu46/source/tools/genctd/Makefile.in ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698