icu46/source/tools/genctd/genctd.cpp - Issue 6370014: CJK segmentation patch for ICU 4.6...

Side by Side Diff: icu46/source/tools/genctd/genctd.cpp

Issue 6370014: CJK segmentation patch for ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 9 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 /*	1 /*

2 **********************************************************************	2 **********************************************************************

3 * Copyright (C) 2002-2009, International Business Machines	3 * Copyright (C) 2002-2010, International Business Machines

4 * Corporation and others. All Rights Reserved.	4 * Corporation and others. All Rights Reserved.

5 **********************************************************************	5 **********************************************************************

6 *	6 *

7 * File genctd.c	7 * File genctd.c

8 */	8 */

9	9

10 //--------------------------------------------------------------------	10 //--------------------------------------------------------------------

11 //	11 //

12 // Tool for generating CompactTrieDictionary data files (.ctd files).	12 // Tool for generating CompactTrieDictionary data files (.ctd files).

13 //	13 //

(...skipping 13 matching lines...) Expand all Loading...
27	27

28 #include "unicode/utypes.h"	28 #include "unicode/utypes.h"

29 #include "unicode/uchar.h"	29 #include "unicode/uchar.h"

30 #include "unicode/ucnv.h"	30 #include "unicode/ucnv.h"

31 #include "unicode/uniset.h"	31 #include "unicode/uniset.h"

32 #include "unicode/unistr.h"	32 #include "unicode/unistr.h"

33 #include "unicode/uclean.h"	33 #include "unicode/uclean.h"

34 #include "unicode/udata.h"	34 #include "unicode/udata.h"

35 #include "unicode/putil.h"	35 #include "unicode/putil.h"

36	36

	37 //#include "unicode/ustdio.h"

	38

37 #include "uoptions.h"	39 #include "uoptions.h"

38 #include "unewdata.h"	40 #include "unewdata.h"

39 #include "ucmndata.h"	41 #include "ucmndata.h"

40 #include "rbbidata.h"	42 #include "rbbidata.h"

41 #include "triedict.h"	43 #include "triedict.h"

42 #include "cmemory.h"	44 #include "cmemory.h"

	45 #include "uassert.h"

43	46

44 #include <stdio.h>	47 #include <stdio.h>

45 #include <stdlib.h>	48 #include <stdlib.h>

46 #include <string.h>	49 #include <string.h>

47	50

48 U_NAMESPACE_USE	51 U_NAMESPACE_USE

49	52

50 static char *progName;	53 static char *progName;

51 static UOption options[]={	54 static UOption options[]={

52 UOPTION_HELP_H, /* 0 */	55 UOPTION_HELP_H, /* 0 */

(...skipping 139 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
192 }	195 }

193 status = U_ZERO_ERROR;	196 status = U_ZERO_ERROR;

194	197

195 //	198 //

196 // Read in the dictionary source file	199 // Read in the dictionary source file

197 //	200 //

198 long result;	201 long result;

199 long wordFileSize;	202 long wordFileSize;

200 FILE *file;	203 FILE *file;

201 char *wordBufferC;	204 char *wordBufferC;

	205 MutableTrieDictionary *mtd = NULL;

	206

	207 file = fopen(wordFileName, "rb");

	208 if( file == 0 ) { //cannot find file

	209 //create 1-line dummy file: ie 1 char, 1 value

	210 UNewDataMemory *pData;

	211 char msg[1024];

202	212

203 file = fopen(wordFileName, "rb");	213 /* write message with just the name */

204 if( file == 0 ) {	214 sprintf(msg, "%s not found, genctd writes dummy %s", wordFileName, outFileName);

205 fprintf(stderr, "Could not open file \"%s\"\n", wordFileName);	215 fprintf(stderr, "%s\n", msg);

206 exit(-1);

207 }

208 fseek(file, 0, SEEK_END);

209 wordFileSize = ftell(file);

210 fseek(file, 0, SEEK_SET);

211 wordBufferC = new char[wordFileSize+10];

212	216

213 result = (long)fread(wordBufferC, 1, wordFileSize, file);	217 UChar c = 0x0020;

214 if (result != wordFileSize) {	218 mtd = new MutableTrieDictionary(c, status, TRUE);

215 fprintf(stderr, "Error reading file \"%s\"\n", wordFileName);	219 mtd->addWord(&c, 1, status, 1);

216 exit (-1);

217 }

218 wordBufferC[wordFileSize]=0;

219 fclose(file);

220	220

221 //	221 } else { //read words in from input file

222 // Look for a Unicode Signature (BOM) on the word file	222 fseek(file, 0, SEEK_END);

223 //	223 wordFileSize = ftell(file);

224 int32_t signatureLength;	224 fseek(file, 0, SEEK_SET);

225 const char * wordSourceC = wordBufferC;	225 wordBufferC = new char[wordFileSize+10];

226 const char* encoding = ucnv_detectUnicodeSignature(	226

227 wordSourceC, wordFileSize, &signatureLength, &status) ;	227 result = (long)fread(wordBufferC, 1, wordFileSize, file);

228 if (U_FAILURE(status)) {	228 if (result != wordFileSize) {

229 exit(status);	229 fprintf(stderr, "Error reading file \"%s\"\n", wordFileName);

230 }	230 exit (-1);

231 if(encoding!=NULL ){

232 wordSourceC += signatureLength;

233 wordFileSize -= signatureLength;

234 }

235

236 //

237 // Open a converter to take the rule file to UTF-16

238 //

239 UConverter* conv;

240 conv = ucnv_open(encoding, &status);

241 if (U_FAILURE(status)) {

242 fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));

243 exit(status);

244 }

245

246 //

247 // Convert the words to UChar.

248 // Preflight first to determine required buffer size.

249 //

250 uint32_t destCap = ucnv_toUChars(conv,

251 NULL, // dest,

252 0, // destCapacity,

253 wordSourceC,

254 wordFileSize,

255 &status);

256 if (status != U_BUFFER_OVERFLOW_ERROR) {

257 fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));

258 exit(status);

259 };

260

261 status = U_ZERO_ERROR;

262 UChar *wordSourceU = new UChar[destCap+1];

263 ucnv_toUChars(conv,

264 wordSourceU, // dest,

265 destCap+1,

266 wordSourceC,

267 wordFileSize,

268 &status);

269 if (U_FAILURE(status)) {

270 fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));

271 exit(status);

272 };

273 ucnv_close(conv);

274

275 // Get rid of the original file buffer

276 delete[] wordBufferC;

277

278 // Create a MutableTrieDictionary, and loop through all the lines, inserting

279 // words.

280

281 // First, pick a median character.

282 UChar *current = wordSourceU + (destCap/2);

283 UChar uc = *current++;

284 UnicodeSet breaks;

285 breaks.add(0x000A); // Line Feed

286 breaks.add(0x000D); // Carriage Return

287 breaks.add(0x2028); // Line Separator

288 breaks.add(0x2029); // Paragraph Separator

289

290 do {

291 // Look for line break

292 while (uc && !breaks.contains(uc)) {

293 uc = *current++;

294 }	231 }

295 // Now skip to first non-line-break	232 wordBufferC[wordFileSize]=0;

296 while (uc && breaks.contains(uc)) {	233 fclose(file);

297 uc = *current++;	234

	235 //

	236 // Look for a Unicode Signature (BOM) on the word file

	237 //

	238 int32_t signatureLength;

	239 const char * wordSourceC = wordBufferC;

	240 const char* encoding = ucnv_detectUnicodeSignature(

	241 wordSourceC, wordFileSize, &signatureLength, &status);

	242 if (U_FAILURE(status)) {

	243 exit(status);

298 }	244 }

299 }	245 if(encoding!=NULL ){

300 while (uc && (breaks.contains(uc) || u_isspace(uc)));	246 wordSourceC += signatureLength;

301	247 wordFileSize -= signatureLength;

302 MutableTrieDictionary *mtd = new MutableTrieDictionary(uc, status);	248 }

303	249

304 if (U_FAILURE(status)) {	250 //

305 fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_errorName(status));	251 // Open a converter to take the rule file to UTF-16

306 exit(status);	252 //

307 }	253 UConverter* conv;

	254 conv = ucnv_open(encoding, &status);

	255 if (U_FAILURE(status)) {

	256 fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));

	257 exit(status);

	258 }

308	259

309 // Now add the words. Words are non-space characters at the beginning of	260 //

310 // lines, and must be at least one UChar.	261 // Convert the words to UChar.

311 current = wordSourceU;	262 // Preflight first to determine required buffer size.

312 UChar *candidate = current;	263 //

313 uc = *current++;	264 uint32_t destCap = ucnv_toUChars(conv,

314 int32_t length = 0;	265 NULL, // dest,

315	266 0, // destCapacity,

316 while (uc) {	267 wordSourceC,

317 while (uc && !u_isspace(uc)) {	268 wordFileSize,

318 ++length;	269 &status);

319 uc = *current++;	270 if (status != U_BUFFER_OVERFLOW_ERROR) {

320 }	271 fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));

321 if (length > 0) {	272 exit(status);

322 mtd->addWord(candidate, length, status);	273 };

323 if (U_FAILURE(status)) {	274

324 fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s\"\n",	275 status = U_ZERO_ERROR;

325 u_errorName(status));	276 UChar *wordSourceU = new UChar[destCap+1];

326 exit(status);	277 ucnv_toUChars(conv,

	278 wordSourceU, // dest,

	279 destCap+1,

	280 wordSourceC,

	281 wordFileSize,

	282 &status);

	283 if (U_FAILURE(status)) {

	284 fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));

	285 exit(status);

	286 };

	287 ucnv_close(conv);

	288

	289 // Get rid of the original file buffer

	290 delete[] wordBufferC;

	291

	292 // Create a MutableTrieDictionary, and loop through all the lines, inserting

	293 // words.

	294

	295 // First, pick a median character.

	296 UChar *current = wordSourceU + (destCap/2);

	297 UChar uc = *current++;

	298 UnicodeSet breaks;

	299 breaks.add(0x000A); // Line Feed

	300 breaks.add(0x000D); // Carriage Return

	301 breaks.add(0x2028); // Line Separator

	302 breaks.add(0x2029); // Paragraph Separator

	303

	304 do {

	305 // Look for line break

	306 while (uc && !breaks.contains(uc)) {

	307 uc = *current++;

	308 }

	309 // Now skip to first non-line-break

	310 while (uc && breaks.contains(uc)) {

	311 uc = *current++;

327 }	312 }

328 }	313 }

329 // Find beginning of next line	314 while (uc && (breaks.contains(uc) || u_isspace(uc)));

330 while (uc && !breaks.contains(uc)) {	315

331 uc = *current++;	316 mtd = new MutableTrieDictionary(uc, status);

	317

	318 if (U_FAILURE(status)) {

	319 fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_errorName(status));

	320 exit(status);

332 }	321 }

333 while (uc && breaks.contains(uc)) {	322

334 uc = *current++;	323 // Now add the words. Words are non-space characters at the beginning of

	324 // lines, and must be at least one UChar. If a word has an associated value,

	325 // the value should follow the word on the same line after a tab character.

	326 current = wordSourceU;

	327 UChar *candidate = current;

	328 uc = *current++;

	329 int32_t length = 0;

	330 int count = 0;

	331

	332 while (uc) {

	333 while (uc && !u_isspace(uc)) {

	334 ++length;

	335 uc = *current++;

	336 }

	337

	338 UnicodeString valueString;

	339 UChar candidateValue;

	340 if(uc == 0x0009){ //separator is a tab char, read in number after space

	341 » while (uc && u_isspace(uc)) {

	342 » » uc = *current++;

	343 » }

	344 while (uc && !u_isspace(uc)) {

	345 valueString.append(uc);

	346 uc = *current++;

	347 }

	348 }

	349

	350 if (length > 0) {

	351 count++;

	352 if(valueString.length() > 0){

	353 mtd->setValued(TRUE);

	354

	355 uint32_t value = 0;

	356 char* s = new char[valueString.length()];

	357 valueString.extract(0,valueString.length(), s, valueString.length());

	358 int n = sscanf(s, "%ud", &value);

	359 U_ASSERT(n == 1);

	360 U_ASSERT(value >= 0);

	361 mtd->addWord(candidate, length, status, (uint16_t)value);

	362 delete[] s;

	363 } else {

	364 mtd->addWord(candidate, length, status);

	365 }

	366

	367 if (U_FAILURE(status)) {

	368 fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s\" at line %d in input file\n",

	369 u_errorName(status), count);

	370 exit(status);

	371 }

	372 }

	373

	374 // Find beginning of next line

	375 while (uc && !breaks.contains(uc)) {

	376 uc = *current++;

	377 }

	378 // Find next non-line-breaking character

	379 while (uc && breaks.contains(uc)) {

	380 uc = *current++;

	381 }

	382 candidate = current-1;

	383 length = 0;

335 }	384 }

336 candidate = current-1;	385

337 length = 0;	386 // Get rid of the Unicode text buffer

	387 delete[] wordSourceU;

338 }	388 }

339	389

340 // Get rid of the Unicode text buffer

341 delete[] wordSourceU;

342

343 // Now, create a CompactTrieDictionary from the mutable dictionary	390 // Now, create a CompactTrieDictionary from the mutable dictionary

344 CompactTrieDictionary ctd = new CompactTrieDictionary(mtd, status);	391 CompactTrieDictionary ctd = new CompactTrieDictionary(mtd, status);

345 if (U_FAILURE(status)) {	392 if (U_FAILURE(status)) {

346 fprintf(stderr, "new CompactTrieDictionary: ICU Error \"%s\"\n", u_errorName(status));	393 fprintf(stderr, "new CompactTrieDictionary: ICU Error \"%s\"\n", u_errorName(status));

347 exit(status);	394 exit(status);

348 }	395 }

349	396

350 // Get rid of the MutableTrieDictionary	397 // Get rid of the MutableTrieDictionary

351 delete mtd;	398 delete mtd;

352	399

(...skipping 33 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
386 // Get rid of the CompactTrieDictionary	433 // Get rid of the CompactTrieDictionary

387 delete ctd;	434 delete ctd;

388	435

389 u_cleanup();	436 u_cleanup();

390	437

391 printf("genctd: tool completed successfully.\n");	438 printf("genctd: tool completed successfully.\n");

392 return 0;	439 return 0;

393	440

394 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */	441 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */

395 }	442 }

396

OLD	NEW

« no previous file with comments | « icu46/source/tools/genctd/Makefile.in ('k') | no next file » | no next file with comments »