OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ******************************************************************************* |
| 3 * |
| 4 * Copyright (C) 2009-2010, International Business Machines |
| 5 * Corporation and others. All Rights Reserved. |
| 6 * |
| 7 ******************************************************************************* |
| 8 * file name: n2builder.cpp |
| 9 * encoding: US-ASCII |
| 10 * tab size: 8 (not used) |
| 11 * indentation:4 |
| 12 * |
| 13 * created on: 2009nov25 |
| 14 * created by: Markus W. Scherer |
| 15 * |
| 16 * Builds Normalizer2 data and writes a binary .nrm file. |
| 17 * For the file format see source/common/normalizer2impl.h. |
| 18 */ |
| 19 |
| 20 #include "unicode/utypes.h" |
| 21 #include "n2builder.h" |
| 22 |
| 23 #include <stdio.h> |
| 24 #include <stdlib.h> |
| 25 #include <string.h> |
| 26 #if U_HAVE_STD_STRING |
| 27 #include <vector> |
| 28 #endif |
| 29 #include "unicode/errorcode.h" |
| 30 #include "unicode/localpointer.h" |
| 31 #include "unicode/putil.h" |
| 32 #include "unicode/udata.h" |
| 33 #include "unicode/uniset.h" |
| 34 #include "unicode/unistr.h" |
| 35 #include "unicode/ustring.h" |
| 36 #include "hash.h" |
| 37 #include "normalizer2impl.h" |
| 38 #include "toolutil.h" |
| 39 #include "unewdata.h" |
| 40 #include "utrie2.h" |
| 41 #include "uvectr32.h" |
| 42 |
/*
 * Number of elements of a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so that the macro behaves like a
 * single primary expression wherever it is used.
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
| 44 |
| 45 #if !UCONFIG_NO_NORMALIZATION |
| 46 |
| 47 /* UDataInfo cf. udata.h */ |
| 48 static UDataInfo dataInfo={ |
| 49 sizeof(UDataInfo), |
| 50 0, |
| 51 |
| 52 U_IS_BIG_ENDIAN, |
| 53 U_CHARSET_FAMILY, |
| 54 U_SIZEOF_UCHAR, |
| 55 0, |
| 56 |
| 57 { 0x4e, 0x72, 0x6d, 0x32 }, /* dataFormat="Nrm2" */ |
| 58 { 1, 0, 0, 0 }, /* formatVersion */ |
| 59 { 5, 2, 0, 0 } /* dataVersion (Unicode version) */ |
| 60 }; |
| 61 |
| 62 U_NAMESPACE_BEGIN |
| 63 |
| 64 class HangulIterator { |
| 65 public: |
| 66 struct Range { |
| 67 UChar32 start, limit; |
| 68 uint16_t norm16; |
| 69 }; |
| 70 |
| 71 HangulIterator() : rangeIndex(0) {} |
| 72 const Range *nextRange() { |
| 73 if(rangeIndex<LENGTHOF(ranges)) { |
| 74 return ranges+rangeIndex++; |
| 75 } else { |
| 76 return NULL; |
| 77 } |
| 78 } |
| 79 void reset() { rangeIndex=0; } |
| 80 private: |
| 81 static const Range ranges[4]; |
| 82 int32_t rangeIndex; |
| 83 }; |
| 84 |
| 85 const HangulIterator::Range HangulIterator::ranges[4]={ |
| 86 { Hangul::JAMO_L_BASE, Hangul::JAMO_L_BASE+Hangul::JAMO_L_COUNT, 1 }, |
| 87 { Hangul::JAMO_V_BASE, Hangul::JAMO_V_BASE+Hangul::JAMO_V_COUNT, Normalizer2
Impl::JAMO_VT }, |
| 88 // JAMO_T_BASE+1: not U+11A7 |
| 89 { Hangul::JAMO_T_BASE+1, Hangul::JAMO_T_BASE+Hangul::JAMO_T_COUNT, Normalize
r2Impl::JAMO_VT }, |
| 90 { Hangul::HANGUL_BASE, Hangul::HANGUL_BASE+Hangul::HANGUL_COUNT, 0 }, // wi
ll become minYesNo |
| 91 }; |
| 92 |
| 93 struct CompositionPair { |
| 94 CompositionPair(UChar32 t, UChar32 c) : trail(t), composite(c) {} |
| 95 UChar32 trail, composite; |
| 96 }; |
| 97 |
| 98 struct Norm { |
| 99 enum MappingType { NONE, REMOVED, ROUND_TRIP, ONE_WAY }; |
| 100 |
| 101 UBool hasMapping() const { return mappingType>REMOVED; } |
| 102 |
| 103 // Requires hasMapping() and well-formed mapping. |
| 104 void setMappingCP() { |
| 105 UChar32 c; |
| 106 if(!mapping->isEmpty() && mapping->length()==U16_LENGTH(c=mapping->char3
2At(0))) { |
| 107 mappingCP=c; |
| 108 } else { |
| 109 mappingCP=U_SENTINEL; |
| 110 } |
| 111 } |
| 112 |
| 113 const CompositionPair *getCompositionPairs(int32_t &length) const { |
| 114 if(compositions==NULL) { |
| 115 length=0; |
| 116 return NULL; |
| 117 } else { |
| 118 length=compositions->size()/2; |
| 119 return reinterpret_cast<const CompositionPair *>(compositions->getBu
ffer()); |
| 120 } |
| 121 } |
| 122 |
| 123 UnicodeString *mapping; |
| 124 UChar32 mappingCP; // >=0 if mapping to 1 code point |
| 125 int32_t mappingPhase; |
| 126 MappingType mappingType; |
| 127 |
| 128 UVector32 *compositions; // (trail, composite) pairs |
| 129 uint8_t cc; |
| 130 UBool combinesBack; |
| 131 UBool hasNoCompBoundaryAfter; |
| 132 |
| 133 enum OffsetType { |
| 134 OFFSET_NONE, OFFSET_MAYBE_YES, |
| 135 OFFSET_YES_YES, OFFSET_YES_NO, OFFSET_NO_NO, |
| 136 OFFSET_DELTA |
| 137 }; |
| 138 enum { OFFSET_SHIFT=4, OFFSET_MASK=(1<<OFFSET_SHIFT)-1 }; |
| 139 int32_t offset; |
| 140 }; |
| 141 |
| 142 class Normalizer2DBEnumerator { |
| 143 public: |
| 144 Normalizer2DBEnumerator(Normalizer2DataBuilder &b) : builder(b) {} |
| 145 virtual ~Normalizer2DBEnumerator() {} |
| 146 virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) = 0; |
| 147 Normalizer2DBEnumerator *ptr() { return this; } |
| 148 protected: |
| 149 Normalizer2DataBuilder &builder; |
| 150 }; |
| 151 |
| 152 U_CDECL_BEGIN |
| 153 |
| 154 static UBool U_CALLCONV |
| 155 enumRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value
) { |
| 156 return ((Normalizer2DBEnumerator *)context)->rangeHandler(start, end, value)
; |
| 157 } |
| 158 |
| 159 U_CDECL_END |
| 160 |
| 161 Normalizer2DataBuilder::Normalizer2DataBuilder(UErrorCode &errorCode) : |
| 162 phase(0), overrideHandling(OVERRIDE_PREVIOUS), optimization(OPTIMIZE_NOR
MAL) { |
| 163 memset(unicodeVersion, 0, sizeof(unicodeVersion)); |
| 164 normTrie=utrie2_open(0, 0, &errorCode); |
| 165 normMem=utm_open("gennorm2 normalization structs", 10000, 0x110100, sizeof(N
orm)); |
| 166 norms=allocNorm(); // unused Norm struct at index 0 |
| 167 memset(indexes, 0, sizeof(indexes)); |
| 168 } |
| 169 |
| 170 Normalizer2DataBuilder::~Normalizer2DataBuilder() { |
| 171 utrie2_close(normTrie); |
| 172 int32_t normsLength=utm_countItems(normMem); |
| 173 for(int32_t i=1; i<normsLength; ++i) { |
| 174 delete norms[i].mapping; |
| 175 delete norms[i].compositions; |
| 176 } |
| 177 utm_close(normMem); |
| 178 utrie2_close(norm16Trie); |
| 179 } |
| 180 |
| 181 void |
| 182 Normalizer2DataBuilder::setUnicodeVersion(const char *v) { |
| 183 u_versionFromString(unicodeVersion, v); |
| 184 } |
| 185 |
| 186 Norm *Normalizer2DataBuilder::allocNorm() { |
| 187 Norm *p=(Norm *)utm_alloc(normMem); |
| 188 norms=(Norm *)utm_getStart(normMem); // in case it got reallocated |
| 189 return p; |
| 190 } |
| 191 |
| 192 /* get an existing Norm unit */ |
| 193 Norm *Normalizer2DataBuilder::getNorm(UChar32 c) { |
| 194 uint32_t i=utrie2_get32(normTrie, c); |
| 195 if(i==0) { |
| 196 return NULL; |
| 197 } |
| 198 return norms+i; |
| 199 } |
| 200 |
| 201 const Norm &Normalizer2DataBuilder::getNormRef(UChar32 c) const { |
| 202 return norms[utrie2_get32(normTrie, c)]; |
| 203 } |
| 204 |
| 205 /* |
| 206 * get or create a Norm unit; |
| 207 * get or create the intermediate trie entries for it as well |
| 208 */ |
| 209 Norm *Normalizer2DataBuilder::createNorm(UChar32 c) { |
| 210 uint32_t i=utrie2_get32(normTrie, c); |
| 211 if(i!=0) { |
| 212 return norms+i; |
| 213 } else { |
| 214 /* allocate Norm */ |
| 215 Norm *p=allocNorm(); |
| 216 IcuToolErrorCode errorCode("gennorm2/createNorm()"); |
| 217 utrie2_set32(normTrie, c, (uint32_t)(p-norms), errorCode); |
| 218 return p; |
| 219 } |
| 220 } |
| 221 |
| 222 Norm *Normalizer2DataBuilder::checkNormForMapping(Norm *p, UChar32 c) { |
| 223 if(p!=NULL) { |
| 224 if(p->mappingType!=Norm::NONE) { |
| 225 if( overrideHandling==OVERRIDE_NONE || |
| 226 (overrideHandling==OVERRIDE_PREVIOUS && p->mappingPhase==phase) |
| 227 ) { |
| 228 fprintf(stderr, |
| 229 "error in gennorm2 phase %d: " |
| 230 "not permitted to override mapping for U+%04lX from phas
e %d\n", |
| 231 (int)phase, (long)c, (int)p->mappingPhase); |
| 232 exit(U_INVALID_FORMAT_ERROR); |
| 233 } |
| 234 delete p->mapping; |
| 235 p->mapping=NULL; |
| 236 } |
| 237 p->mappingPhase=phase; |
| 238 } |
| 239 return p; |
| 240 } |
| 241 |
| 242 void Normalizer2DataBuilder::setOverrideHandling(OverrideHandling oh) { |
| 243 overrideHandling=oh; |
| 244 ++phase; |
| 245 } |
| 246 |
| 247 void Normalizer2DataBuilder::setCC(UChar32 c, uint8_t cc) { |
| 248 createNorm(c)->cc=cc; |
| 249 } |
| 250 |
| 251 uint8_t Normalizer2DataBuilder::getCC(UChar32 c) const { |
| 252 return getNormRef(c).cc; |
| 253 } |
| 254 |
| 255 static UBool isWellFormed(const UnicodeString &s) { |
| 256 UErrorCode errorCode=U_ZERO_ERROR; |
| 257 u_strToUTF8(NULL, 0, NULL, s.getBuffer(), s.length(), &errorCode); |
| 258 return U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR; |
| 259 } |
| 260 |
| 261 void Normalizer2DataBuilder::setOneWayMapping(UChar32 c, const UnicodeString &m)
{ |
| 262 if(!isWellFormed(m)) { |
| 263 fprintf(stderr, |
| 264 "error in gennorm2 phase %d: " |
| 265 "illegal one-way mapping from U+%04lX to malformed string\n", |
| 266 (int)phase, (long)c); |
| 267 exit(U_INVALID_FORMAT_ERROR); |
| 268 } |
| 269 Norm *p=checkNormForMapping(createNorm(c), c); |
| 270 p->mapping=new UnicodeString(m); |
| 271 p->mappingType=Norm::ONE_WAY; |
| 272 p->setMappingCP(); |
| 273 } |
| 274 |
| 275 void Normalizer2DataBuilder::setRoundTripMapping(UChar32 c, const UnicodeString
&m) { |
| 276 if(U_IS_SURROGATE(c)) { |
| 277 fprintf(stderr, |
| 278 "error in gennorm2 phase %d: " |
| 279 "illegal round-trip mapping from surrogate code point U+%04lX\n"
, |
| 280 (int)phase, (long)c); |
| 281 exit(U_INVALID_FORMAT_ERROR); |
| 282 } |
| 283 if(!isWellFormed(m)) { |
| 284 fprintf(stderr, |
| 285 "error in gennorm2 phase %d: " |
| 286 "illegal round-trip mapping from U+%04lX to malformed string\n", |
| 287 (int)phase, (long)c); |
| 288 exit(U_INVALID_FORMAT_ERROR); |
| 289 } |
| 290 int32_t numCP=u_countChar32(m.getBuffer(), m.length()); |
| 291 if(numCP!=2) { |
| 292 fprintf(stderr, |
| 293 "error in gennorm2 phase %d: " |
| 294 "illegal round-trip mapping from U+%04lX to %d!=2 code points\n"
, |
| 295 (int)phase, (long)c, (int)numCP); |
| 296 exit(U_INVALID_FORMAT_ERROR); |
| 297 } |
| 298 Norm *p=checkNormForMapping(createNorm(c), c); |
| 299 p->mapping=new UnicodeString(m); |
| 300 p->mappingType=Norm::ROUND_TRIP; |
| 301 p->mappingCP=U_SENTINEL; |
| 302 } |
| 303 |
| 304 void Normalizer2DataBuilder::removeMapping(UChar32 c) { |
| 305 Norm *p=checkNormForMapping(getNorm(c), c); |
| 306 if(p!=NULL) { |
| 307 p->mappingType=Norm::REMOVED; |
| 308 } |
| 309 } |
| 310 |
| 311 class CompositionBuilder : public Normalizer2DBEnumerator { |
| 312 public: |
| 313 CompositionBuilder(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b) {
} |
| 314 virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) { |
| 315 builder.addComposition(start, end, value); |
| 316 return TRUE; |
| 317 } |
| 318 }; |
| 319 |
| 320 void |
| 321 Normalizer2DataBuilder::addComposition(UChar32 start, UChar32 end, uint32_t valu
e) { |
| 322 if(norms[value].mappingType==Norm::ROUND_TRIP) { |
| 323 if(start!=end) { |
| 324 fprintf(stderr, |
| 325 "gennorm2 error: same round-trip mapping for " |
| 326 "more than 1 code point U+%04lX..U+%04lX\n", |
| 327 (long)start, (long)end); |
| 328 exit(U_INVALID_FORMAT_ERROR); |
| 329 } |
| 330 if(norms[value].cc!=0) { |
| 331 fprintf(stderr, |
| 332 "gennorm2 error: " |
| 333 "U+%04lX has a round-trip mapping and ccc!=0, " |
| 334 "not possible in Unicode normalization\n", |
| 335 (long)start); |
| 336 exit(U_INVALID_FORMAT_ERROR); |
| 337 } |
| 338 // setRoundTripMapping() ensured that there are exactly two code points. |
| 339 const UnicodeString &m=*norms[value].mapping; |
| 340 UChar32 lead=m.char32At(0); |
| 341 UChar32 trail=m.char32At(m.length()-1); |
| 342 if(getCC(lead)!=0) { |
| 343 fprintf(stderr, |
| 344 "gennorm2 error: " |
| 345 "U+%04lX's round-trip mapping's starter U+%04lX has ccc!=0,
" |
| 346 "not possible in Unicode normalization\n", |
| 347 (long)start, (long)lead); |
| 348 exit(U_INVALID_FORMAT_ERROR); |
| 349 } |
| 350 // Flag for trailing character. |
| 351 createNorm(trail)->combinesBack=TRUE; |
| 352 // Insert (trail, composite) pair into compositions list for the lead ch
aracter. |
| 353 IcuToolErrorCode errorCode("gennorm2/addComposition()"); |
| 354 Norm *leadNorm=createNorm(lead); |
| 355 UVector32 *compositions=leadNorm->compositions; |
| 356 int32_t i; |
| 357 if(compositions==NULL) { |
| 358 compositions=leadNorm->compositions=new UVector32(errorCode); |
| 359 i=0; // "insert" the first pair at index 0 |
| 360 } else { |
| 361 // Insertion sort, and check for duplicate trail characters. |
| 362 int32_t length; |
| 363 const CompositionPair *pairs=leadNorm->getCompositionPairs(length); |
| 364 for(i=0; i<length; ++i) { |
| 365 if(trail==pairs[i].trail) { |
| 366 fprintf(stderr, |
| 367 "gennorm2 error: same round-trip mapping for " |
| 368 "more than 1 code point (e.g., U+%04lX) to U+%04lX +
U+%04lX\n", |
| 369 (long)start, (long)lead, (long)trail); |
| 370 exit(U_INVALID_FORMAT_ERROR); |
| 371 } |
| 372 if(trail<pairs[i].trail) { |
| 373 break; |
| 374 } |
| 375 } |
| 376 } |
| 377 compositions->insertElementAt(trail, 2*i, errorCode); |
| 378 compositions->insertElementAt(start, 2*i+1, errorCode); |
| 379 } |
| 380 } |
| 381 |
| 382 UBool Normalizer2DataBuilder::combinesWithCCBetween(const Norm &norm, |
| 383 uint8_t lowCC, uint8_t highC
C) const { |
| 384 if((highCC-lowCC)>=2) { |
| 385 int32_t length; |
| 386 const CompositionPair *pairs=norm.getCompositionPairs(length); |
| 387 for(int32_t i=0; i<length; ++i) { |
| 388 uint8_t trailCC=getCC(pairs[i].trail); |
| 389 if(lowCC<trailCC && trailCC<highCC) { |
| 390 return TRUE; |
| 391 } |
| 392 } |
| 393 } |
| 394 return FALSE; |
| 395 } |
| 396 |
| 397 UChar32 Normalizer2DataBuilder::combine(const Norm &norm, UChar32 trail) const { |
| 398 int32_t length; |
| 399 const CompositionPair *pairs=norm.getCompositionPairs(length); |
| 400 for(int32_t i=0; i<length; ++i) { |
| 401 if(trail==pairs[i].trail) { |
| 402 return pairs[i].composite; |
| 403 } |
| 404 if(trail<pairs[i].trail) { |
| 405 break; |
| 406 } |
| 407 } |
| 408 return U_SENTINEL; |
| 409 } |
| 410 |
| 411 class Decomposer : public Normalizer2DBEnumerator { |
| 412 public: |
| 413 Decomposer(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b), didDecom
pose(FALSE) {} |
| 414 virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) { |
| 415 didDecompose|=builder.decompose(start, end, value); |
| 416 return TRUE; |
| 417 } |
| 418 UBool didDecompose; |
| 419 }; |
| 420 |
| 421 UBool |
| 422 Normalizer2DataBuilder::decompose(UChar32 start, UChar32 end, uint32_t value) { |
| 423 if(norms[value].hasMapping()) { |
| 424 const UnicodeString &m=*norms[value].mapping; |
| 425 UnicodeString *decomposed=NULL; |
| 426 const UChar *s=m.getBuffer(); |
| 427 int32_t length=m.length(); |
| 428 int32_t prev, i=0; |
| 429 UChar32 c; |
| 430 while(i<length) { |
| 431 prev=i; |
| 432 U16_NEXT(s, i, length, c); |
| 433 if(start<=c && c<=end) { |
| 434 fprintf(stderr, |
| 435 "gennorm2 error: U+%04lX maps to itself directly or indi
rectly\n", |
| 436 (long)c); |
| 437 exit(U_INVALID_FORMAT_ERROR); |
| 438 } |
| 439 const Norm &cNorm=getNormRef(c); |
| 440 if(cNorm.hasMapping()) { |
| 441 if(norms[value].mappingType==Norm::ROUND_TRIP) { |
| 442 if(prev==0) { |
| 443 if(cNorm.mappingType!=Norm::ROUND_TRIP) { |
| 444 fprintf(stderr, |
| 445 "gennorm2 error: " |
| 446 "U+%04lX's round-trip mapping's starter " |
| 447 "U+%04lX one-way-decomposes, " |
| 448 "not possible in Unicode normalization\n", |
| 449 (long)start, (long)c); |
| 450 exit(U_INVALID_FORMAT_ERROR); |
| 451 } |
| 452 uint8_t myTrailCC=getCC(m.char32At(i)); |
| 453 UChar32 cTrailChar=cNorm.mapping->char32At(cNorm.mapping
->length()-1); |
| 454 uint8_t cTrailCC=getCC(cTrailChar); |
| 455 if(cTrailCC>myTrailCC) { |
| 456 fprintf(stderr, |
| 457 "gennorm2 error: " |
| 458 "U+%04lX's round-trip mapping's starter " |
| 459 "U+%04lX decomposes and the " |
| 460 "inner/earlier tccc=%hu > outer/following tc
cc=%hu, " |
| 461 "not possible in Unicode normalization\n", |
| 462 (long)start, (long)c, |
| 463 (short)cTrailCC, (short)myTrailCC); |
| 464 exit(U_INVALID_FORMAT_ERROR); |
| 465 } |
| 466 } else { |
| 467 fprintf(stderr, |
| 468 "gennorm2 error: " |
| 469 "U+%04lX's round-trip mapping's non-starter " |
| 470 "U+%04lX decomposes, " |
| 471 "not possible in Unicode normalization\n", |
| 472 (long)start, (long)c); |
| 473 exit(U_INVALID_FORMAT_ERROR); |
| 474 } |
| 475 } |
| 476 if(decomposed==NULL) { |
| 477 decomposed=new UnicodeString(m, 0, prev); |
| 478 } |
| 479 decomposed->append(*cNorm.mapping); |
| 480 } else if(Hangul::isHangul(c)) { |
| 481 UChar buffer[3]; |
| 482 int32_t hangulLength=Hangul::decompose(c, buffer); |
| 483 if(norms[value].mappingType==Norm::ROUND_TRIP && prev!=0) { |
| 484 fprintf(stderr, |
| 485 "gennorm2 error: " |
| 486 "U+%04lX's round-trip mapping's non-starter " |
| 487 "U+%04lX decomposes, " |
| 488 "not possible in Unicode normalization\n", |
| 489 (long)start, (long)c); |
| 490 exit(U_INVALID_FORMAT_ERROR); |
| 491 } |
| 492 if(decomposed==NULL) { |
| 493 decomposed=new UnicodeString(m, 0, prev); |
| 494 } |
| 495 decomposed->append(buffer, hangulLength); |
| 496 } else if(decomposed!=NULL) { |
| 497 decomposed->append(m, prev, i-prev); |
| 498 } |
| 499 } |
| 500 if(decomposed!=NULL) { |
| 501 delete norms[value].mapping; |
| 502 norms[value].mapping=decomposed; |
| 503 // Not norms[value].setMappingCP(); because the original mapping |
| 504 // is most likely to be encodable as a delta. |
| 505 return TRUE; |
| 506 } |
| 507 } |
| 508 return FALSE; |
| 509 } |
| 510 |
| 511 class BuilderReorderingBuffer { |
| 512 public: |
| 513 BuilderReorderingBuffer() : fLength(0), fLastStarterIndex(-1), fDidReorder(F
ALSE) {} |
| 514 void reset() { |
| 515 fLength=0; |
| 516 fLastStarterIndex=-1; |
| 517 fDidReorder=FALSE; |
| 518 } |
| 519 int32_t length() const { return fLength; } |
| 520 UBool isEmpty() const { return fLength==0; } |
| 521 int32_t lastStarterIndex() const { return fLastStarterIndex; } |
| 522 UChar32 charAt(int32_t i) const { return fArray[i]>>8; } |
| 523 uint8_t ccAt(int32_t i) const { return (uint8_t)fArray[i]; } |
| 524 UBool didReorder() const { return fDidReorder; } |
| 525 void append(UChar32 c, uint8_t cc) { |
| 526 if(cc==0 || fLength==0 || ccAt(fLength-1)<=cc) { |
| 527 if(cc==0) { |
| 528 fLastStarterIndex=fLength; |
| 529 } |
| 530 fArray[fLength++]=(c<<8)|cc; |
| 531 return; |
| 532 } |
| 533 // Let this character bubble back to its canonical order. |
| 534 int32_t i=fLength-1; |
| 535 while(i>fLastStarterIndex && ccAt(i)>cc) { |
| 536 --i; |
| 537 } |
| 538 ++i; // after the last starter or prevCC<=cc |
| 539 // Move this and the following characters forward one to make space. |
| 540 for(int32_t j=fLength; i<j; --j) { |
| 541 fArray[j]=fArray[j-1]; |
| 542 } |
| 543 fArray[i]=(c<<8)|cc; |
| 544 ++fLength; |
| 545 fDidReorder=TRUE; |
| 546 } |
| 547 void toString(UnicodeString &dest) { |
| 548 dest.remove(); |
| 549 for(int32_t i=0; i<fLength; ++i) { |
| 550 dest.append(charAt(i)); |
| 551 } |
| 552 } |
| 553 void setComposite(UChar32 composite, int32_t combMarkIndex) { |
| 554 fArray[fLastStarterIndex]=composite<<8; |
| 555 // Remove the combining mark that contributed to the composite. |
| 556 --fLength; |
| 557 while(combMarkIndex<fLength) { |
| 558 fArray[combMarkIndex]=fArray[combMarkIndex+1]; |
| 559 ++combMarkIndex; |
| 560 } |
| 561 } |
| 562 private: |
| 563 int32_t fArray[Normalizer2Impl::MAPPING_LENGTH_MASK]; |
| 564 int32_t fLength; |
| 565 int32_t fLastStarterIndex; |
| 566 UBool fDidReorder; |
| 567 }; |
| 568 |
| 569 void |
| 570 Normalizer2DataBuilder::reorder(Norm *p, BuilderReorderingBuffer &buffer) { |
| 571 UnicodeString &m=*p->mapping; |
| 572 int32_t length=m.length(); |
| 573 if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) { |
| 574 return; // writeMapping() will complain about it and print the code poi
nt. |
| 575 } |
| 576 const UChar *s=m.getBuffer(); |
| 577 int32_t i=0; |
| 578 UChar32 c; |
| 579 while(i<length) { |
| 580 U16_NEXT(s, i, length, c); |
| 581 buffer.append(c, getCC(c)); |
| 582 } |
| 583 if(buffer.didReorder()) { |
| 584 buffer.toString(m); |
| 585 } |
| 586 } |
| 587 |
| 588 UBool Normalizer2DataBuilder::hasNoCompBoundaryAfter(BuilderReorderingBuffer &bu
ffer) { |
| 589 if(buffer.isEmpty()) { |
| 590 return TRUE; // maps-to-empty string is no boundary of any kind |
| 591 } |
| 592 int32_t lastStarterIndex=buffer.lastStarterIndex(); |
| 593 if(lastStarterIndex<0) { |
| 594 return TRUE; // no starter |
| 595 } |
| 596 UChar32 starter=buffer.charAt(lastStarterIndex); |
| 597 if( Hangul::isJamoL(starter) || |
| 598 (Hangul::isJamoV(starter) && |
| 599 0<lastStarterIndex && Hangul::isJamoL(buffer.charAt(lastStarterIndex-1)
)) |
| 600 ) { |
| 601 // A Jamo leading consonant or an LV pair combines-forward if it is at t
he end, |
| 602 // otherwise it is blocked. |
| 603 return lastStarterIndex==buffer.length()-1; |
| 604 } |
| 605 // no Hangul in fully decomposed mapping |
| 606 const Norm *starterNorm=&getNormRef(starter); |
| 607 if(starterNorm->compositions==NULL) { |
| 608 return FALSE; // the last starter does not combine forward |
| 609 } |
| 610 // Compose as far as possible, and see if further compositions are possible. |
| 611 uint8_t prevCC=0; |
| 612 for(int32_t combMarkIndex=lastStarterIndex+1; combMarkIndex<buffer.length();
) { |
| 613 uint8_t cc=buffer.ccAt(combMarkIndex); // !=0 because after last starte
r |
| 614 if(combinesWithCCBetween(*starterNorm, prevCC, cc)) { |
| 615 return TRUE; |
| 616 } |
| 617 if( prevCC<cc && |
| 618 (starter=combine(*starterNorm, buffer.charAt(combMarkIndex)))>=0 |
| 619 ) { |
| 620 buffer.setComposite(starter, combMarkIndex); |
| 621 starterNorm=&getNormRef(starter); |
| 622 if(starterNorm->compositions==NULL) { |
| 623 return FALSE; // the composite does not combine further |
| 624 } |
| 625 } else { |
| 626 prevCC=cc; |
| 627 ++combMarkIndex; |
| 628 } |
| 629 } |
| 630 // TRUE if the final, forward-combining starter is at the end. |
| 631 return prevCC==0; |
| 632 } |
| 633 |
| 634 // Requires p->hasMapping(). |
| 635 void Normalizer2DataBuilder::writeMapping(UChar32 c, const Norm *p, UnicodeStrin
g &dataString) { |
| 636 UnicodeString &m=*p->mapping; |
| 637 int32_t length=m.length(); |
| 638 if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) { |
| 639 fprintf(stderr, |
| 640 "gennorm2 error: " |
| 641 "mapping for U+%04lX longer than maximum of %d\n", |
| 642 (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK); |
| 643 exit(U_INVALID_FORMAT_ERROR); |
| 644 } |
| 645 int32_t leadCC, trailCC; |
| 646 if(length==0) { |
| 647 leadCC=trailCC=0; |
| 648 } else { |
| 649 leadCC=getCC(m.char32At(0)); |
| 650 trailCC=getCC(m.char32At(length-1)); |
| 651 } |
| 652 if(c<Normalizer2Impl::MIN_CCC_LCCC_CP && (p->cc!=0 || leadCC!=0)) { |
| 653 fprintf(stderr, |
| 654 "gennorm2 error: " |
| 655 "U+%04lX below U+0300 has ccc!=0 or lccc!=0, not supported by IC
U\n", |
| 656 (long)c); |
| 657 exit(U_INVALID_FORMAT_ERROR); |
| 658 } |
| 659 int32_t firstUnit=length|(trailCC<<8); |
| 660 int32_t secondUnit=p->cc|(leadCC<<8); |
| 661 if(secondUnit!=0) { |
| 662 firstUnit|=Normalizer2Impl::MAPPING_HAS_CCC_LCCC_WORD; |
| 663 } |
| 664 if(p->compositions!=NULL) { |
| 665 firstUnit|=Normalizer2Impl::MAPPING_PLUS_COMPOSITION_LIST; |
| 666 } |
| 667 if(p->hasNoCompBoundaryAfter) { |
| 668 firstUnit|=Normalizer2Impl::MAPPING_NO_COMP_BOUNDARY_AFTER; |
| 669 } |
| 670 dataString.append((UChar)firstUnit); |
| 671 if(secondUnit!=0) { |
| 672 dataString.append((UChar)secondUnit); |
| 673 } |
| 674 dataString.append(m); |
| 675 } |
| 676 |
| 677 // Requires p->compositions!=NULL. |
| 678 void Normalizer2DataBuilder::writeCompositions(UChar32 c, const Norm *p, Unicode
String &dataString) { |
| 679 if(p->cc!=0) { |
| 680 fprintf(stderr, |
| 681 "gennorm2 error: " |
| 682 "U+%04lX combines-forward and has ccc!=0, not possible in Unicod
e normalization\n", |
| 683 (long)c); |
| 684 exit(U_INVALID_FORMAT_ERROR); |
| 685 } |
| 686 int32_t length; |
| 687 const CompositionPair *pairs=p->getCompositionPairs(length); |
| 688 for(int32_t i=0; i<length; ++i) { |
| 689 const CompositionPair &pair=pairs[i]; |
| 690 // 22 bits for the composite character and whether it combines forward. |
| 691 UChar32 compositeAndFwd=pair.composite<<1; |
| 692 if(getNormRef(pair.composite).compositions!=NULL) { |
| 693 compositeAndFwd|=1; // The composite character also combines-forwar
d. |
| 694 } |
| 695 // Encode most pairs in two units and some in three. |
| 696 int32_t firstUnit, secondUnit, thirdUnit; |
| 697 if(pair.trail<Normalizer2Impl::COMP_1_TRAIL_LIMIT) { |
| 698 if(compositeAndFwd<=0xffff) { |
| 699 firstUnit=pair.trail<<1; |
| 700 secondUnit=compositeAndFwd; |
| 701 thirdUnit=-1; |
| 702 } else { |
| 703 firstUnit=(pair.trail<<1)|Normalizer2Impl::COMP_1_TRIPLE; |
| 704 secondUnit=compositeAndFwd>>16; |
| 705 thirdUnit=compositeAndFwd; |
| 706 } |
| 707 } else { |
| 708 firstUnit=(Normalizer2Impl::COMP_1_TRAIL_LIMIT+ |
| 709 (pair.trail>>Normalizer2Impl::COMP_1_TRAIL_SHIFT))| |
| 710 Normalizer2Impl::COMP_1_TRIPLE; |
| 711 secondUnit=(pair.trail<<Normalizer2Impl::COMP_2_TRAIL_SHIFT)| |
| 712 (compositeAndFwd>>16); |
| 713 thirdUnit=compositeAndFwd; |
| 714 } |
| 715 // Set the high bit of the first unit if this is the last composition pa
ir. |
| 716 if(i==(length-1)) { |
| 717 firstUnit|=Normalizer2Impl::COMP_1_LAST_TUPLE; |
| 718 } |
| 719 dataString.append((UChar)firstUnit).append((UChar)secondUnit); |
| 720 if(thirdUnit>=0) { |
| 721 dataString.append((UChar)thirdUnit); |
| 722 } |
| 723 } |
| 724 } |
| 725 |
| 726 class ExtraDataWriter : public Normalizer2DBEnumerator { |
| 727 public: |
| 728 ExtraDataWriter(Normalizer2DataBuilder &b) : |
| 729 Normalizer2DBEnumerator(b), |
| 730 yesYesCompositions(1000, (UChar32)0xffff, 2), // 0=inert, 1=Jamo L, 2=s
tart of compositions |
| 731 yesNoData(1000, (UChar32)0, 1) {} // 0=Hangul, 1=start of normal data |
| 732 virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) { |
| 733 if(value!=0) { |
| 734 if(start!=end) { |
| 735 fprintf(stderr, |
| 736 "gennorm2 error: unexpected shared data for " |
| 737 "multiple code points U+%04lX..U+%04lX\n", |
| 738 (long)start, (long)end); |
| 739 exit(U_INTERNAL_PROGRAM_ERROR); |
| 740 } |
| 741 builder.writeExtraData(start, value, *this); |
| 742 } |
| 743 return TRUE; |
| 744 } |
| 745 UnicodeString maybeYesCompositions; |
| 746 UnicodeString yesYesCompositions; |
| 747 UnicodeString yesNoData; |
| 748 UnicodeString noNoMappings; |
| 749 Hashtable previousNoNoMappings; // If constructed in runtime code, pass in
UErrorCode. |
| 750 }; |
| 751 |
| 752 void Normalizer2DataBuilder::writeExtraData(UChar32 c, uint32_t value, ExtraData
Writer &writer) { |
| 753 Norm *p=norms+value; |
| 754 if(p->combinesBack) { |
| 755 if(p->hasMapping()) { |
| 756 fprintf(stderr, |
| 757 "gennorm2 error: " |
| 758 "U+%04lX combines-back and decomposes, not possible in Unico
de normalization\n", |
| 759 (long)c); |
| 760 exit(U_INVALID_FORMAT_ERROR); |
| 761 } |
| 762 if(p->compositions!=NULL) { |
| 763 p->offset= |
| 764 (writer.maybeYesCompositions.length()<<Norm::OFFSET_SHIFT)| |
| 765 Norm::OFFSET_MAYBE_YES; |
| 766 writeCompositions(c, p, writer.maybeYesCompositions); |
| 767 } |
| 768 } else if(!p->hasMapping()) { |
| 769 if(p->compositions!=NULL) { |
| 770 p->offset= |
| 771 (writer.yesYesCompositions.length()<<Norm::OFFSET_SHIFT)| |
| 772 Norm::OFFSET_YES_YES; |
| 773 writeCompositions(c, p, writer.yesYesCompositions); |
| 774 } |
| 775 } else if(p->mappingType==Norm::ROUND_TRIP) { |
| 776 p->offset= |
| 777 (writer.yesNoData.length()<<Norm::OFFSET_SHIFT)| |
| 778 Norm::OFFSET_YES_NO; |
| 779 writeMapping(c, p, writer.yesNoData); |
| 780 if(p->compositions!=NULL) { |
| 781 writeCompositions(c, p, writer.yesNoData); |
| 782 } |
| 783 } else /* one-way */ { |
| 784 if(p->compositions!=NULL) { |
| 785 fprintf(stderr, |
| 786 "gennorm2 error: " |
| 787 "U+%04lX combines-forward and has a one-way mapping, " |
| 788 "not possible in Unicode normalization\n", |
| 789 (long)c); |
| 790 exit(U_INVALID_FORMAT_ERROR); |
| 791 } |
| 792 if(p->cc==0 && optimization!=OPTIMIZE_FAST) { |
| 793 // Try a compact, algorithmic encoding. |
| 794 // Only for ccc=0, because we can't store additional information. |
| 795 if(p->mappingCP>=0) { |
| 796 int32_t delta=p->mappingCP-c; |
| 797 if(-Normalizer2Impl::MAX_DELTA<=delta && delta<=Normalizer2Impl:
:MAX_DELTA) { |
| 798 p->offset=(delta<<Norm::OFFSET_SHIFT)|Norm::OFFSET_DELTA; |
| 799 } |
| 800 } |
| 801 } |
| 802 if(p->offset==0) { |
| 803 int32_t oldNoNoLength=writer.noNoMappings.length(); |
| 804 writeMapping(c, p, writer.noNoMappings); |
| 805 UnicodeString newMapping=writer.noNoMappings.tempSubString(oldNoNoLe
ngth); |
| 806 int32_t previousOffset=writer.previousNoNoMappings.geti(newMapping); |
| 807 if(previousOffset!=0) { |
| 808 // Duplicate, remove the new units and point to the old ones. |
| 809 writer.noNoMappings.truncate(oldNoNoLength); |
| 810 p->offset= |
| 811 ((previousOffset-1)<<Norm::OFFSET_SHIFT)| |
| 812 Norm::OFFSET_NO_NO; |
| 813 } else { |
| 814 // Enter this new mapping into the hashtable, avoiding value 0 w
hich is "not found". |
| 815 IcuToolErrorCode errorCode("gennorm2/writeExtraData()/Hashtable.
puti()"); |
| 816 writer.previousNoNoMappings.puti(newMapping, oldNoNoLength+1, er
rorCode); |
| 817 p->offset= |
| 818 (oldNoNoLength<<Norm::OFFSET_SHIFT)| |
| 819 Norm::OFFSET_NO_NO; |
| 820 } |
| 821 } |
| 822 } |
| 823 } |
| 824 |
| 825 class Norm16Writer : public Normalizer2DBEnumerator { |
| 826 public: |
| 827 Norm16Writer(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b) {} |
| 828 virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) { |
| 829 builder.writeNorm16(start, end, value); |
| 830 return TRUE; |
| 831 } |
| 832 }; |
| 833 |
| 834 void Normalizer2DataBuilder::writeNorm16(UChar32 start, UChar32 end, uint32_t va
lue) { |
| 835 if(value!=0) { |
| 836 const Norm *p=norms+value; |
| 837 int32_t offset=p->offset>>Norm::OFFSET_SHIFT; |
| 838 int32_t norm16=0; |
| 839 UBool isDecompNo=FALSE; |
| 840 UBool isCompNoMaybe=FALSE; |
| 841 switch(p->offset&Norm::OFFSET_MASK) { |
| 842 case Norm::OFFSET_NONE: |
| 843 // No mapping, no compositions list. |
| 844 if(p->combinesBack) { |
| 845 norm16=Normalizer2Impl::MIN_NORMAL_MAYBE_YES+p->cc; |
| 846 isDecompNo=(UBool)(p->cc!=0); |
| 847 isCompNoMaybe=TRUE; |
| 848 } else if(p->cc!=0) { |
| 849 norm16=Normalizer2Impl::MIN_YES_YES_WITH_CC-1+p->cc; |
| 850 isDecompNo=isCompNoMaybe=TRUE; |
| 851 } |
| 852 break; |
| 853 case Norm::OFFSET_MAYBE_YES: |
| 854 norm16=indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]+offset; |
| 855 isCompNoMaybe=TRUE; |
| 856 break; |
| 857 case Norm::OFFSET_YES_YES: |
| 858 norm16=offset; |
| 859 break; |
| 860 case Norm::OFFSET_YES_NO: |
| 861 norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO]+offset; |
| 862 isDecompNo=TRUE; |
| 863 break; |
| 864 case Norm::OFFSET_NO_NO: |
| 865 norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO]+offset; |
| 866 isDecompNo=isCompNoMaybe=TRUE; |
| 867 break; |
| 868 case Norm::OFFSET_DELTA: |
| 869 norm16=getCenterNoNoDelta()+offset; |
| 870 isDecompNo=isCompNoMaybe=TRUE; |
| 871 break; |
| 872 default: // Should not occur. |
| 873 exit(U_INTERNAL_PROGRAM_ERROR); |
| 874 } |
| 875 IcuToolErrorCode errorCode("gennorm2/writeNorm16()"); |
| 876 utrie2_setRange32(norm16Trie, start, end, (uint32_t)norm16, TRUE, errorC
ode); |
| 877 if(isDecompNo && start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) { |
| 878 indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=start; |
| 879 } |
| 880 if(isCompNoMaybe && start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_
CP]) { |
| 881 indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=start; |
| 882 } |
| 883 } |
| 884 } |
| 885 |
| 886 void Normalizer2DataBuilder::setHangulData() { |
| 887 HangulIterator hi; |
| 888 const HangulIterator::Range *range; |
| 889 // Check that none of the Hangul/Jamo code points have data. |
| 890 while((range=hi.nextRange())!=NULL) { |
| 891 for(UChar32 c=range->start; c<range->limit; ++c) { |
| 892 if(utrie2_get32(norm16Trie, c)!=0) { |
| 893 fprintf(stderr, |
| 894 "gennorm2 error: " |
| 895 "illegal mapping/composition/ccc data for Hangul or Jamo
U+%04lX\n", |
| 896 (long)c); |
| 897 exit(U_INVALID_FORMAT_ERROR); |
| 898 } |
| 899 } |
| 900 } |
| 901 // Set data for algorithmic runtime handling. |
| 902 IcuToolErrorCode errorCode("gennorm2/setHangulData()"); |
| 903 hi.reset(); |
| 904 while((range=hi.nextRange())!=NULL) { |
| 905 uint16_t norm16=range->norm16; |
| 906 if(norm16==0) { |
| 907 norm16=(uint16_t)indexes[Normalizer2Impl::IX_MIN_YES_NO]; // Hangul
LV/LVT encoded as minYesNo |
| 908 if(range->start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) { |
| 909 indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=range->start; |
| 910 } |
| 911 } else { |
| 912 if(range->start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) {
// Jamo V/T are maybeYes |
| 913 indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=range->start; |
| 914 } |
| 915 } |
| 916 utrie2_setRange32(norm16Trie, range->start, range->limit-1, norm16, TRUE
, errorCode); |
| 917 errorCode.assertSuccess(); |
| 918 } |
| 919 } |
| 920 |
| 921 U_CDECL_BEGIN |
| 922 |
| 923 static UBool U_CALLCONV |
| 924 enumRangeMaxValue(const void *context, UChar32 /*start*/, UChar32 /*end*/, uint3
2_t value) { |
| 925 uint32_t *pMaxValue=(uint32_t *)context; |
| 926 if(value>*pMaxValue) { |
| 927 *pMaxValue=value; |
| 928 } |
| 929 return TRUE; |
| 930 } |
| 931 |
| 932 U_CDECL_END |
| 933 |
| 934 void Normalizer2DataBuilder::processData() { |
| 935 IcuToolErrorCode errorCode("gennorm2/processData()"); |
| 936 norm16Trie=utrie2_open(0, 0, errorCode); |
| 937 errorCode.assertSuccess(); |
| 938 |
| 939 utrie2_enum(normTrie, NULL, enumRangeHandler, CompositionBuilder(*this).ptr(
)); |
| 940 |
| 941 Decomposer decomposer(*this); |
| 942 do { |
| 943 decomposer.didDecompose=FALSE; |
| 944 utrie2_enum(normTrie, NULL, enumRangeHandler, &decomposer); |
| 945 } while(decomposer.didDecompose); |
| 946 |
| 947 BuilderReorderingBuffer buffer; |
| 948 int32_t normsLength=utm_countItems(normMem); |
| 949 for(int32_t i=1; i<normsLength; ++i) { |
| 950 if(norms[i].hasMapping()) { |
| 951 buffer.reset(); |
| 952 reorder(norms+i, buffer); |
| 953 norms[i].hasNoCompBoundaryAfter=hasNoCompBoundaryAfter(buffer); |
| 954 } |
| 955 } |
| 956 |
| 957 indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=0x110000; |
| 958 indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=0x110000; |
| 959 |
| 960 ExtraDataWriter extraDataWriter(*this); |
| 961 utrie2_enum(normTrie, NULL, enumRangeHandler, &extraDataWriter); |
| 962 |
| 963 extraData=extraDataWriter.maybeYesCompositions; |
| 964 extraData.append(extraDataWriter.yesYesCompositions). |
| 965 append(extraDataWriter.yesNoData). |
| 966 append(extraDataWriter.noNoMappings); |
| 967 // Pad to even length for 4-byte alignment of following data. |
| 968 if(extraData.length()&1) { |
| 969 extraData.append((UChar)0); |
| 970 } |
| 971 |
| 972 indexes[Normalizer2Impl::IX_MIN_YES_NO]= |
| 973 extraDataWriter.yesYesCompositions.length(); |
| 974 indexes[Normalizer2Impl::IX_MIN_NO_NO]= |
| 975 indexes[Normalizer2Impl::IX_MIN_YES_NO]+ |
| 976 extraDataWriter.yesNoData.length(); |
| 977 indexes[Normalizer2Impl::IX_LIMIT_NO_NO]= |
| 978 indexes[Normalizer2Impl::IX_MIN_NO_NO]+ |
| 979 extraDataWriter.noNoMappings.length(); |
| 980 indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]= |
| 981 Normalizer2Impl::MIN_NORMAL_MAYBE_YES- |
| 982 extraDataWriter.maybeYesCompositions.length(); |
| 983 |
| 984 int32_t minNoNoDelta=getCenterNoNoDelta()-Normalizer2Impl::MAX_DELTA; |
| 985 if(indexes[Normalizer2Impl::IX_LIMIT_NO_NO]>minNoNoDelta) { |
| 986 fprintf(stderr, |
| 987 "gennorm2 error: " |
| 988 "data structure overflow, too much mapping composition data\n"); |
| 989 exit(U_BUFFER_OVERFLOW_ERROR); |
| 990 } |
| 991 |
| 992 utrie2_enum(normTrie, NULL, enumRangeHandler, Norm16Writer(*this).ptr()); |
| 993 |
| 994 setHangulData(); |
| 995 |
| 996 // Look for the "worst" norm16 value of any supplementary code point |
| 997 // corresponding to a lead surrogate, and set it as that surrogate's value. |
| 998 // Enables quick check inner loops to look at only code units. |
| 999 // |
| 1000 // We could be more sophisticated: |
| 1001 // We could collect a bit set for whether there are values in the different |
| 1002 // norm16 ranges (yesNo, maybeYes, yesYesWithCC etc.) |
| 1003 // and select the best value that only breaks the composition and/or decompo
sition |
| 1004 // inner loops if necessary. |
| 1005 // However, that seems like overkill for an optimization for supplementary c
haracters. |
| 1006 for(UChar lead=0xd800; lead<0xdc00; ++lead) { |
| 1007 uint32_t maxValue=utrie2_get32(norm16Trie, lead); |
| 1008 utrie2_enumForLeadSurrogate(norm16Trie, lead, NULL, enumRangeMaxValue, &
maxValue); |
| 1009 if( maxValue>=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO] && |
| 1010 maxValue>(uint32_t)indexes[Normalizer2Impl::IX_MIN_NO_NO] |
| 1011 ) { |
| 1012 // Set noNo ("worst" value) if it got into "less-bad" maybeYes or cc
c!=0. |
| 1013 // Otherwise it might end up at something like JAMO_VT which stays i
n |
| 1014 // the inner decomposition quick check loop. |
| 1015 maxValue=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]-1; |
| 1016 } |
| 1017 utrie2_set32ForLeadSurrogateCodeUnit(norm16Trie, lead, maxValue, errorCo
de); |
| 1018 } |
| 1019 |
| 1020 // Adjust supplementary minimum code points to break quick check loops at th
eir lead surrogates. |
| 1021 // For an empty data file, minCP=0x110000 turns into 0xdc00 (first trail sur
rogate) |
| 1022 // which is harmless. |
| 1023 // As a result, the minimum code points are always BMP code points. |
| 1024 int32_t minCP=indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]; |
| 1025 if(minCP>=0x10000) { |
| 1026 indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=U16_LEAD(minCP); |
| 1027 } |
| 1028 minCP=indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]; |
| 1029 if(minCP>=0x10000) { |
| 1030 indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=U16_LEAD(minCP); |
| 1031 } |
| 1032 } |
| 1033 |
| 1034 void Normalizer2DataBuilder::writeBinaryFile(const char *filename) { |
| 1035 processData(); |
| 1036 |
| 1037 IcuToolErrorCode errorCode("gennorm2/writeBinaryFile()"); |
| 1038 utrie2_freeze(norm16Trie, UTRIE2_16_VALUE_BITS, errorCode); |
| 1039 int32_t norm16TrieLength=utrie2_serialize(norm16Trie, NULL, 0, errorCode); |
| 1040 if(errorCode.get()!=U_BUFFER_OVERFLOW_ERROR) { |
| 1041 fprintf(stderr, "gennorm2 error: unable to freeze/serialize the normaliz
ation trie - %s\n", |
| 1042 errorCode.errorName()); |
| 1043 exit(errorCode.reset()); |
| 1044 } |
| 1045 errorCode.reset(); |
| 1046 LocalArray<uint8_t> norm16TrieBytes(new uint8_t[norm16TrieLength]); |
| 1047 utrie2_serialize(norm16Trie, norm16TrieBytes.getAlias(), norm16TrieLength, e
rrorCode); |
| 1048 errorCode.assertSuccess(); |
| 1049 |
| 1050 int32_t offset=(int32_t)sizeof(indexes); |
| 1051 indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET]=offset; |
| 1052 offset+=norm16TrieLength; |
| 1053 indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET]=offset; |
| 1054 int32_t totalSize=offset+=extraData.length()*2; |
| 1055 for(int32_t i=Normalizer2Impl::IX_RESERVED2_OFFSET; i<=Normalizer2Impl::IX_T
OTAL_SIZE; ++i) { |
| 1056 indexes[i]=totalSize; |
| 1057 } |
| 1058 |
| 1059 if(beVerbose) { |
| 1060 printf("size of normalization trie: %5ld bytes\n", (long)norm16T
rieLength); |
| 1061 printf("size of 16-bit extra data: %5ld uint16_t\n", (long)extr
aData.length()); |
| 1062 printf("size of binary data file contents: %5ld bytes\n", (long)totalSi
ze); |
| 1063 printf("minDecompNoCodePoint: U+%04lX\n", (long)indexes[Nor
malizer2Impl::IX_MIN_DECOMP_NO_CP]); |
| 1064 printf("minCompNoMaybeCodePoint: U+%04lX\n", (long)indexes[Nor
malizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]); |
| 1065 printf("minYesNo: 0x%04x\n", (int)indexes[Norma
lizer2Impl::IX_MIN_YES_NO]); |
| 1066 printf("minNoNo: 0x%04x\n", (int)indexes[Norma
lizer2Impl::IX_MIN_NO_NO]); |
| 1067 printf("limitNoNo: 0x%04x\n", (int)indexes[Norma
lizer2Impl::IX_LIMIT_NO_NO]); |
| 1068 printf("minMaybeYes: 0x%04x\n", (int)indexes[Norma
lizer2Impl::IX_MIN_MAYBE_YES]); |
| 1069 } |
| 1070 |
| 1071 memcpy(dataInfo.dataVersion, unicodeVersion, 4); |
| 1072 UNewDataMemory *pData= |
| 1073 udata_create(NULL, NULL, filename, &dataInfo, |
| 1074 haveCopyright ? U_COPYRIGHT_STRING : NULL, errorCode); |
| 1075 if(errorCode.isFailure()) { |
| 1076 fprintf(stderr, "gennorm2 error: unable to create the output file %s - %
s\n", |
| 1077 filename, errorCode.errorName()); |
| 1078 exit(errorCode.reset()); |
| 1079 } |
| 1080 udata_writeBlock(pData, indexes, sizeof(indexes)); |
| 1081 udata_writeBlock(pData, norm16TrieBytes.getAlias(), norm16TrieLength); |
| 1082 udata_writeUString(pData, extraData.getBuffer(), extraData.length()); |
| 1083 |
| 1084 int32_t writtenSize=udata_finish(pData, errorCode); |
| 1085 if(errorCode.isFailure()) { |
| 1086 fprintf(stderr, "gennorm2: error %s writing the output file\n", errorCod
e.errorName()); |
| 1087 exit(errorCode.reset()); |
| 1088 } |
| 1089 if(writtenSize!=totalSize) { |
| 1090 fprintf(stderr, "gennorm2 error: written size %ld != calculated size %ld
\n", |
| 1091 (long)writtenSize, (long)totalSize); |
| 1092 exit(U_INTERNAL_PROGRAM_ERROR); |
| 1093 } |
| 1094 } |
| 1095 |
| 1096 U_NAMESPACE_END |
| 1097 |
| 1098 #endif /* #if !UCONFIG_NO_NORMALIZATION */ |
| 1099 |
| 1100 /* |
| 1101 * Hey, Emacs, please set the following: |
| 1102 * |
| 1103 * Local Variables: |
| 1104 * indent-tabs-mode: nil |
| 1105 * End: |
| 1106 */ |
OLD | NEW |