OLD | NEW |
1 /* | 1 /* |
2 ******************************************************************************* | 2 ******************************************************************************* |
3 * | 3 * |
4 * Copyright (C) 2009-2013, International Business Machines | 4 * Copyright (C) 2009-2014, International Business Machines |
5 * Corporation and others. All Rights Reserved. | 5 * Corporation and others. All Rights Reserved. |
6 * | 6 * |
7 ******************************************************************************* | 7 ******************************************************************************* |
8 * file name: normalizer2impl.h | 8 * file name: normalizer2impl.h |
9 * encoding: US-ASCII | 9 * encoding: US-ASCII |
10 * tab size: 8 (not used) | 10 * tab size: 8 (not used) |
11 * indentation:4 | 11 * indentation:4 |
12 * | 12 * |
13 * created on: 2009nov22 | 13 * created on: 2009nov22 |
14 * created by: Markus W. Scherer | 14 * created by: Markus W. Scherer |
15 */ | 15 */ |
16 | 16 |
17 #ifndef __NORMALIZER2IMPL_H__ | 17 #ifndef __NORMALIZER2IMPL_H__ |
18 #define __NORMALIZER2IMPL_H__ | 18 #define __NORMALIZER2IMPL_H__ |
19 | 19 |
20 #include "unicode/utypes.h" | 20 #include "unicode/utypes.h" |
21 | 21 |
22 #if !UCONFIG_NO_NORMALIZATION | 22 #if !UCONFIG_NO_NORMALIZATION |
23 | 23 |
24 #include "unicode/normalizer2.h" | 24 #include "unicode/normalizer2.h" |
25 #include "unicode/udata.h" | |
26 #include "unicode/unistr.h" | 25 #include "unicode/unistr.h" |
27 #include "unicode/unorm.h" | 26 #include "unicode/unorm.h" |
28 #include "unicode/utf16.h" | 27 #include "unicode/utf16.h" |
29 #include "mutex.h" | 28 #include "mutex.h" |
30 #include "uset_imp.h" | 29 #include "uset_imp.h" |
31 #include "utrie2.h" | 30 #include "utrie2.h" |
32 | 31 |
33 U_NAMESPACE_BEGIN | 32 U_NAMESPACE_BEGIN |
34 | 33 |
35 struct CanonIterData; | 34 struct CanonIterData; |
36 | 35 |
37 class Hangul { | 36 class U_COMMON_API Hangul { |
38 public: | 37 public: |
39 /* Korean Hangul and Jamo constants */ | 38 /* Korean Hangul and Jamo constants */ |
40 enum { | 39 enum { |
41 JAMO_L_BASE=0x1100, /* "lead" jamo */ | 40 JAMO_L_BASE=0x1100, /* "lead" jamo */ |
| 41 JAMO_L_END=0x1112, |
42 JAMO_V_BASE=0x1161, /* "vowel" jamo */ | 42 JAMO_V_BASE=0x1161, /* "vowel" jamo */ |
| 43 JAMO_V_END=0x1175, |
43 JAMO_T_BASE=0x11a7, /* "trail" jamo */ | 44 JAMO_T_BASE=0x11a7, /* "trail" jamo */ |
| 45 JAMO_T_END=0x11c2, |
44 | 46 |
45 HANGUL_BASE=0xac00, | 47 HANGUL_BASE=0xac00, |
| 48 HANGUL_END=0xd7a3, |
46 | 49 |
47 JAMO_L_COUNT=19, | 50 JAMO_L_COUNT=19, |
48 JAMO_V_COUNT=21, | 51 JAMO_V_COUNT=21, |
49 JAMO_T_COUNT=28, | 52 JAMO_T_COUNT=28, |
50 | 53 |
51 JAMO_VT_COUNT=JAMO_V_COUNT*JAMO_T_COUNT, | 54 JAMO_VT_COUNT=JAMO_V_COUNT*JAMO_T_COUNT, |
52 | 55 |
53 HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT, | 56 HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT, |
54 HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT | 57 HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT |
55 }; | 58 }; |
(...skipping 47 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
103 buffer[0]=orig-c2; // LV syllable | 106 buffer[0]=orig-c2; // LV syllable |
104 buffer[1]=(UChar)(JAMO_T_BASE+c2); | 107 buffer[1]=(UChar)(JAMO_T_BASE+c2); |
105 } | 108 } |
106 } | 109 } |
107 private: | 110 private: |
108 Hangul(); // no instantiation | 111 Hangul(); // no instantiation |
109 }; | 112 }; |
110 | 113 |
111 class Normalizer2Impl; | 114 class Normalizer2Impl; |
112 | 115 |
113 class ReorderingBuffer : public UMemory { | 116 class U_COMMON_API ReorderingBuffer : public UMemory { |
114 public: | 117 public: |
115 ReorderingBuffer(const Normalizer2Impl &ni, UnicodeString &dest) : | 118 ReorderingBuffer(const Normalizer2Impl &ni, UnicodeString &dest) : |
116 impl(ni), str(dest), | 119 impl(ni), str(dest), |
117 start(NULL), reorderStart(NULL), limit(NULL), | 120 start(NULL), reorderStart(NULL), limit(NULL), |
118 remainingCapacity(0), lastCC(0) {} | 121 remainingCapacity(0), lastCC(0) {} |
119 ~ReorderingBuffer() { | 122 ~ReorderingBuffer() { |
120 if(start!=NULL) { | 123 if(start!=NULL) { |
121 str.releaseBuffer((int32_t)(limit-start)); | 124 str.releaseBuffer((int32_t)(limit-start)); |
122 } | 125 } |
123 } | 126 } |
(...skipping 82 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
206 uint8_t lastCC; | 209 uint8_t lastCC; |
207 | 210 |
208 // private backward iterator | 211 // private backward iterator |
209 void setIterator() { codePointStart=limit; } | 212 void setIterator() { codePointStart=limit; } |
210 void skipPrevious(); // Requires start<codePointStart. | 213 void skipPrevious(); // Requires start<codePointStart. |
211 uint8_t previousCC(); // Returns 0 if there is no previous character. | 214 uint8_t previousCC(); // Returns 0 if there is no previous character. |
212 | 215 |
213 UChar *codePointStart, *codePointLimit; | 216 UChar *codePointStart, *codePointLimit; |
214 }; | 217 }; |
215 | 218 |
216 class U_COMMON_API Normalizer2Impl : public UMemory { | 219 class U_COMMON_API Normalizer2Impl : public UObject { |
217 public: | 220 public: |
218 Normalizer2Impl() : memory(NULL), normTrie(NULL), fCanonIterData(NULL) { | 221 Normalizer2Impl() : normTrie(NULL), fCanonIterData(NULL) { |
219 fCanonIterDataInitOnce.reset(); | 222 fCanonIterDataInitOnce.reset(); |
220 } | 223 } |
221 ~Normalizer2Impl(); | 224 virtual ~Normalizer2Impl(); |
222 | 225 |
223 void load(const char *packageName, const char *name, UErrorCode &errorCode); | 226 void init(const int32_t *inIndexes, const UTrie2 *inTrie, |
| 227 const uint16_t *inExtraData, const uint8_t *inSmallFCD); |
224 | 228 |
| 229 void addLcccChars(UnicodeSet &set) const; |
225 void addPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const; | 230 void addPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const; |
226 void addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode)
const; | 231 void addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode)
const; |
227 | 232 |
228 // low-level properties ------------------------------------------------ *** | 233 // low-level properties ------------------------------------------------ *** |
229 | 234 |
230 const UTrie2 *getNormTrie() const { return normTrie; } | 235 const UTrie2 *getNormTrie() const { return normTrie; } |
231 | 236 |
232 UBool ensureCanonIterData(UErrorCode &errorCode) const; | 237 UBool ensureCanonIterData(UErrorCode &errorCode) const; |
233 | 238 |
234 uint16_t getNorm16(UChar32 c) const { return UTRIE2_GET16(normTrie, c); } | 239 uint16_t getNorm16(UChar32 c) const { return UTRIE2_GET16(normTrie, c); } |
235 | 240 |
236 UNormalizationCheckResult getCompQuickCheck(uint16_t norm16) const { | 241 UNormalizationCheckResult getCompQuickCheck(uint16_t norm16) const { |
237 if(norm16<minNoNo || MIN_YES_YES_WITH_CC<=norm16) { | 242 if(norm16<minNoNo || MIN_YES_YES_WITH_CC<=norm16) { |
238 return UNORM_YES; | 243 return UNORM_YES; |
239 } else if(minMaybeYes<=norm16) { | 244 } else if(minMaybeYes<=norm16) { |
240 return UNORM_MAYBE; | 245 return UNORM_MAYBE; |
241 } else { | 246 } else { |
242 return UNORM_NO; | 247 return UNORM_NO; |
243 } | 248 } |
244 } | 249 } |
| 250 UBool isAlgorithmicNoNo(uint16_t norm16) const { return limitNoNo<=norm16 &&
norm16<minMaybeYes; } |
245 UBool isCompNo(uint16_t norm16) const { return minNoNo<=norm16 && norm16<min
MaybeYes; } | 251 UBool isCompNo(uint16_t norm16) const { return minNoNo<=norm16 && norm16<min
MaybeYes; } |
246 UBool isDecompYes(uint16_t norm16) const { return norm16<minYesNo || minMayb
eYes<=norm16; } | 252 UBool isDecompYes(uint16_t norm16) const { return norm16<minYesNo || minMayb
eYes<=norm16; } |
247 | 253 |
248 uint8_t getCC(uint16_t norm16) const { | 254 uint8_t getCC(uint16_t norm16) const { |
249 if(norm16>=MIN_NORMAL_MAYBE_YES) { | 255 if(norm16>=MIN_NORMAL_MAYBE_YES) { |
250 return (uint8_t)norm16; | 256 return (uint8_t)norm16; |
251 } | 257 } |
252 if(norm16<minNoNo || limitNoNo<=norm16) { | 258 if(norm16<minNoNo || limitNoNo<=norm16) { |
253 return 0; | 259 return 0; |
254 } | 260 } |
(...skipping 154 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
409 COMP_1_TRIPLE=1, | 415 COMP_1_TRIPLE=1, |
410 COMP_1_TRAIL_LIMIT=0x3400, | 416 COMP_1_TRAIL_LIMIT=0x3400, |
411 COMP_1_TRAIL_MASK=0x7ffe, | 417 COMP_1_TRAIL_MASK=0x7ffe, |
412 COMP_1_TRAIL_SHIFT=9, // 10-1 for the "triple" bit | 418 COMP_1_TRAIL_SHIFT=9, // 10-1 for the "triple" bit |
413 COMP_2_TRAIL_SHIFT=6, | 419 COMP_2_TRAIL_SHIFT=6, |
414 COMP_2_TRAIL_MASK=0xffc0 | 420 COMP_2_TRAIL_MASK=0xffc0 |
415 }; | 421 }; |
416 | 422 |
417 // higher-level functionality ------------------------------------------ *** | 423 // higher-level functionality ------------------------------------------ *** |
418 | 424 |
| 425 // NFD without an NFD Normalizer2 instance. |
| 426 UnicodeString &decompose(const UnicodeString &src, UnicodeString &dest, |
| 427 UErrorCode &errorCode) const; |
| 428 /** |
| 429 * Decomposes [src, limit[ and writes the result to dest. |
| 430 * limit can be NULL if src is NUL-terminated. |
| 431 * destLengthEstimate is the initial dest buffer capacity and can be -1. |
| 432 */ |
| 433 void decompose(const UChar *src, const UChar *limit, |
| 434 UnicodeString &dest, int32_t destLengthEstimate, |
| 435 UErrorCode &errorCode) const; |
| 436 |
419 const UChar *decompose(const UChar *src, const UChar *limit, | 437 const UChar *decompose(const UChar *src, const UChar *limit, |
420 ReorderingBuffer *buffer, UErrorCode &errorCode) cons
t; | 438 ReorderingBuffer *buffer, UErrorCode &errorCode) cons
t; |
421 void decomposeAndAppend(const UChar *src, const UChar *limit, | 439 void decomposeAndAppend(const UChar *src, const UChar *limit, |
422 UBool doDecompose, | 440 UBool doDecompose, |
423 UnicodeString &safeMiddle, | 441 UnicodeString &safeMiddle, |
424 ReorderingBuffer &buffer, | 442 ReorderingBuffer &buffer, |
425 UErrorCode &errorCode) const; | 443 UErrorCode &errorCode) const; |
426 UBool compose(const UChar *src, const UChar *limit, | 444 UBool compose(const UChar *src, const UChar *limit, |
427 UBool onlyContiguous, | 445 UBool onlyContiguous, |
428 UBool doCompose, | 446 UBool doCompose, |
(...skipping 24 matching lines...) Expand all Loading... |
453 } | 471 } |
454 UBool hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBool testInert)
const; | 472 UBool hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBool testInert)
const; |
455 | 473 |
456 UBool hasFCDBoundaryBefore(UChar32 c) const { return c<MIN_CCC_LCCC_CP || ge
tFCD16(c)<=0xff; } | 474 UBool hasFCDBoundaryBefore(UChar32 c) const { return c<MIN_CCC_LCCC_CP || ge
tFCD16(c)<=0xff; } |
457 UBool hasFCDBoundaryAfter(UChar32 c) const { | 475 UBool hasFCDBoundaryAfter(UChar32 c) const { |
458 uint16_t fcd16=getFCD16(c); | 476 uint16_t fcd16=getFCD16(c); |
459 return fcd16<=1 || (fcd16&0xff)==0; | 477 return fcd16<=1 || (fcd16&0xff)==0; |
460 } | 478 } |
461 UBool isFCDInert(UChar32 c) const { return getFCD16(c)<=1; } | 479 UBool isFCDInert(UChar32 c) const { return getFCD16(c)<=1; } |
462 private: | 480 private: |
463 static UBool U_CALLCONV | |
464 isAcceptable(void *context, const char *type, const char *name, const UDataI
nfo *pInfo); | |
465 | |
466 UBool isMaybe(uint16_t norm16) const { return minMaybeYes<=norm16 && norm16<
=JAMO_VT; } | 481 UBool isMaybe(uint16_t norm16) const { return minMaybeYes<=norm16 && norm16<
=JAMO_VT; } |
467 UBool isMaybeOrNonZeroCC(uint16_t norm16) const { return norm16>=minMaybeYes
; } | 482 UBool isMaybeOrNonZeroCC(uint16_t norm16) const { return norm16>=minMaybeYes
; } |
468 static UBool isInert(uint16_t norm16) { return norm16==0; } | 483 static UBool isInert(uint16_t norm16) { return norm16==0; } |
469 static UBool isJamoL(uint16_t norm16) { return norm16==1; } | 484 static UBool isJamoL(uint16_t norm16) { return norm16==1; } |
470 static UBool isJamoVT(uint16_t norm16) { return norm16==JAMO_VT; } | 485 static UBool isJamoVT(uint16_t norm16) { return norm16==JAMO_VT; } |
471 UBool isHangul(uint16_t norm16) const { return norm16==minYesNo; } | 486 UBool isHangul(uint16_t norm16) const { return norm16==minYesNo; } |
472 UBool isCompYesAndZeroCC(uint16_t norm16) const { return norm16<minNoNo; } | 487 UBool isCompYesAndZeroCC(uint16_t norm16) const { return norm16<minNoNo; } |
473 // UBool isCompYes(uint16_t norm16) const { | 488 // UBool isCompYes(uint16_t norm16) const { |
474 // return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo; | 489 // return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo; |
475 // } | 490 // } |
(...skipping 83 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
559 UBool hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const; | 574 UBool hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const; |
560 const UChar *findPreviousCompBoundary(const UChar *start, const UChar *p) co
nst; | 575 const UChar *findPreviousCompBoundary(const UChar *start, const UChar *p) co
nst; |
561 const UChar *findNextCompBoundary(const UChar *p, const UChar *limit) const; | 576 const UChar *findNextCompBoundary(const UChar *p, const UChar *limit) const; |
562 | 577 |
563 const UChar *findPreviousFCDBoundary(const UChar *start, const UChar *p) con
st; | 578 const UChar *findPreviousFCDBoundary(const UChar *start, const UChar *p) con
st; |
564 const UChar *findNextFCDBoundary(const UChar *p, const UChar *limit) const; | 579 const UChar *findNextFCDBoundary(const UChar *p, const UChar *limit) const; |
565 | 580 |
566 int32_t getCanonValue(UChar32 c) const; | 581 int32_t getCanonValue(UChar32 c) const; |
567 const UnicodeSet &getCanonStartSet(int32_t n) const; | 582 const UnicodeSet &getCanonStartSet(int32_t n) const; |
568 | 583 |
569 UDataMemory *memory; | 584 // UVersionInfo dataVersion; |
570 UVersionInfo dataVersion; | |
571 | 585 |
572 // Code point thresholds for quick check codes. | 586 // Code point thresholds for quick check codes. |
573 UChar32 minDecompNoCP; | 587 UChar32 minDecompNoCP; |
574 UChar32 minCompNoMaybeCP; | 588 UChar32 minCompNoMaybeCP; |
575 | 589 |
576 // Norm16 value thresholds for quick check combinations and types of extra d
ata. | 590 // Norm16 value thresholds for quick check combinations and types of extra d
ata. |
577 uint16_t minYesNo; | 591 uint16_t minYesNo; |
578 uint16_t minYesNoMappingsOnly; | 592 uint16_t minYesNoMappingsOnly; |
579 uint16_t minNoNo; | 593 uint16_t minNoNo; |
580 uint16_t limitNoNo; | 594 uint16_t limitNoNo; |
581 uint16_t minMaybeYes; | 595 uint16_t minMaybeYes; |
582 | 596 |
583 UTrie2 *normTrie; | 597 const UTrie2 *normTrie; |
584 const uint16_t *maybeYesCompositions; | 598 const uint16_t *maybeYesCompositions; |
585 const uint16_t *extraData; // mappings and/or compositions for yesYes, yesN
o & noNo characters | 599 const uint16_t *extraData; // mappings and/or compositions for yesYes, yesN
o & noNo characters |
586 const uint8_t *smallFCD; // [0x100] one bit per 32 BMP code points, set if
any FCD!=0 | 600 const uint8_t *smallFCD; // [0x100] one bit per 32 BMP code points, set if
any FCD!=0 |
587 uint8_t tccc180[0x180]; // tccc values for U+0000..U+017F | 601 uint8_t tccc180[0x180]; // tccc values for U+0000..U+017F |
588 | 602 |
589 public: // CanonIterData is public to allow access from C callback f
unctions. | 603 public: // CanonIterData is public to allow access from C callback functions. |
590 UInitOnce fCanonIterDataInitOnce; | 604 UInitOnce fCanonIterDataInitOnce; |
591 CanonIterData *fCanonIterData; | 605 CanonIterData *fCanonIterData; |
592 }; | 606 }; |
593 | 607 |
594 // bits in canonIterData | 608 // bits in canonIterData |
595 #define CANON_NOT_SEGMENT_STARTER 0x80000000 | 609 #define CANON_NOT_SEGMENT_STARTER 0x80000000 |
596 #define CANON_HAS_COMPOSITIONS 0x40000000 | 610 #define CANON_HAS_COMPOSITIONS 0x40000000 |
597 #define CANON_HAS_SET 0x200000 | 611 #define CANON_HAS_SET 0x200000 |
598 #define CANON_VALUE_MASK 0x1fffff | 612 #define CANON_VALUE_MASK 0x1fffff |
599 | 613 |
600 /** | 614 /** |
601 * ICU-internal shortcut for quick access to standard Unicode normalization. | 615 * ICU-internal shortcut for quick access to standard Unicode normalization. |
602 */ | 616 */ |
603 class U_COMMON_API Normalizer2Factory { | 617 class U_COMMON_API Normalizer2Factory { |
604 public: | 618 public: |
605 static const Normalizer2 *getNFCInstance(UErrorCode &errorCode); | |
606 static const Normalizer2 *getNFDInstance(UErrorCode &errorCode); | |
607 static const Normalizer2 *getFCDInstance(UErrorCode &errorCode); | 619 static const Normalizer2 *getFCDInstance(UErrorCode &errorCode); |
608 static const Normalizer2 *getFCCInstance(UErrorCode &errorCode); | 620 static const Normalizer2 *getFCCInstance(UErrorCode &errorCode); |
609 static const Normalizer2 *getNFKCInstance(UErrorCode &errorCode); | |
610 static const Normalizer2 *getNFKDInstance(UErrorCode &errorCode); | |
611 static const Normalizer2 *getNFKC_CFInstance(UErrorCode &errorCode); | |
612 static const Normalizer2 *getNoopInstance(UErrorCode &errorCode); | 621 static const Normalizer2 *getNoopInstance(UErrorCode &errorCode); |
613 | 622 |
614 static const Normalizer2 *getInstance(UNormalizationMode mode, UErrorCode &e
rrorCode); | 623 static const Normalizer2 *getInstance(UNormalizationMode mode, UErrorCode &e
rrorCode); |
615 | 624 |
616 static const Normalizer2Impl *getNFCImpl(UErrorCode &errorCode); | 625 static const Normalizer2Impl *getNFCImpl(UErrorCode &errorCode); |
617 static const Normalizer2Impl *getNFKCImpl(UErrorCode &errorCode); | 626 static const Normalizer2Impl *getNFKCImpl(UErrorCode &errorCode); |
618 static const Normalizer2Impl *getNFKC_CFImpl(UErrorCode &errorCode); | 627 static const Normalizer2Impl *getNFKC_CFImpl(UErrorCode &errorCode); |
619 | 628 |
620 // Get the Impl instance of the Normalizer2. | 629 // Get the Impl instance of the Normalizer2. |
621 // Must be used only when it is known that norm2 is a Normalizer2WithImpl in
stance. | 630 // Must be used only when it is known that norm2 is a Normalizer2WithImpl in
stance. |
(...skipping 148 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
770 * - Addition of indexes[IX_MIN_YES_NO_MAPPINGS_ONLY] and separation of the yesN
o extraData into | 779 * - Addition of indexes[IX_MIN_YES_NO_MAPPINGS_ONLY] and separation of the yesN
o extraData into |
771 * distinct ranges (combines-forward vs. not) | 780 * distinct ranges (combines-forward vs. not) |
772 * so that a range check can be used to find out if there is a compositions li
st. | 781 * so that a range check can be used to find out if there is a compositions li
st. |
773 * This is fully equivalent with formatVersion 1's MAPPING_PLUS_COMPOSITION_LI
ST flag. | 782 * This is fully equivalent with formatVersion 1's MAPPING_PLUS_COMPOSITION_LI
ST flag. |
774 * It is needed for the new (in ICU 49) composePair(), not for other normaliza
tion. | 783 * It is needed for the new (in ICU 49) composePair(), not for other normaliza
tion. |
775 * - Addition of the smallFCD[] bit set. | 784 * - Addition of the smallFCD[] bit set. |
776 */ | 785 */ |
777 | 786 |
778 #endif /* !UCONFIG_NO_NORMALIZATION */ | 787 #endif /* !UCONFIG_NO_NORMALIZATION */ |
779 #endif /* __NORMALIZER2IMPL_H__ */ | 788 #endif /* __NORMALIZER2IMPL_H__ */ |
OLD | NEW |