OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ********************************************************************** |
| 3 * Copyright (C) 2001-2010, International Business Machines |
| 4 * Corporation and others. All Rights Reserved. |
| 5 ********************************************************************** |
| 6 * Date Name Description |
| 7 * 07/03/01 aliu Creation. |
| 8 ********************************************************************** |
| 9 */ |
| 10 |
| 11 #include "unicode/utypes.h" |
| 12 |
| 13 #if !UCONFIG_NO_TRANSLITERATION |
| 14 |
| 15 #include "unicode/normalizer2.h" |
| 16 #include "cstring.h" |
| 17 #include "nortrans.h" |
| 18 |
| 19 U_NAMESPACE_BEGIN |
| 20 |
| 21 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator) |
| 22 |
| 23 static inline Transliterator::Token cstrToken(const char *s) { |
| 24 return Transliterator::pointerToken((void *)s); |
| 25 } |
| 26 |
| 27 /** |
| 28 * System registration hook. |
| 29 */ |
| 30 void NormalizationTransliterator::registerIDs() { |
| 31 // In the Token, the byte after the NUL is the UNormalization2Mode. |
| 32 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"), |
| 33 _create, cstrToken("nfc\0\0")); |
| 34 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"), |
| 35 _create, cstrToken("nfkc\0\0")); |
| 36 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"), |
| 37 _create, cstrToken("nfc\0\1")); |
| 38 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"), |
| 39 _create, cstrToken("nfkc\0\1")); |
| 40 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCD"), |
| 41 _create, cstrToken("nfc\0\2")); |
| 42 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCC"), |
| 43 _create, cstrToken("nfc\0\3")); |
| 44 Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"), |
| 45 UNICODE_STRING_SIMPLE("NFD"), TRUE); |
| 46 Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"), |
| 47 UNICODE_STRING_SIMPLE("NFKD"), TRUE)
; |
| 48 Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCC"), |
| 49 UNICODE_STRING_SIMPLE("NFD"), FALSE)
; |
| 50 Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCD"), |
| 51 UNICODE_STRING_SIMPLE("FCD"), FALSE)
; |
| 52 } |
| 53 |
| 54 /** |
| 55 * Factory methods |
| 56 */ |
| 57 Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID, |
| 58 Token context) { |
| 59 const char *name = (const char *)context.pointer; |
| 60 UNormalization2Mode mode = (UNormalization2Mode)uprv_strchr(name, 0)[1]; |
| 61 UErrorCode errorCode = U_ZERO_ERROR; |
| 62 const Normalizer2 *norm2 = Normalizer2::getInstance(NULL, name, mode, errorC
ode); |
| 63 if(U_SUCCESS(errorCode)) { |
| 64 return new NormalizationTransliterator(ID, *norm2); |
| 65 } else { |
| 66 return NULL; |
| 67 } |
| 68 } |
| 69 |
| 70 /** |
| 71 * Constructs a transliterator. |
| 72 */ |
| 73 NormalizationTransliterator::NormalizationTransliterator(const UnicodeString& id
, |
| 74 const Normalizer2 &norm
2) : |
| 75 Transliterator(id, 0), fNorm2(norm2) {} |
| 76 |
| 77 /** |
| 78 * Destructor. |
| 79 */ |
| 80 NormalizationTransliterator::~NormalizationTransliterator() { |
| 81 } |
| 82 |
| 83 /** |
| 84 * Copy constructor. |
| 85 */ |
| 86 NormalizationTransliterator::NormalizationTransliterator(const NormalizationTran
sliterator& o) : |
| 87 Transliterator(o), fNorm2(o.fNorm2) {} |
| 88 |
| 89 /** |
| 90 * Transliterator API. |
| 91 */ |
| 92 Transliterator* NormalizationTransliterator::clone(void) const { |
| 93 return new NormalizationTransliterator(*this); |
| 94 } |
| 95 |
| 96 /** |
| 97 * Implements {@link Transliterator#handleTransliterate}. |
| 98 */ |
| 99 void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransP
osition& offsets, |
| 100 UBool isIncremental) const
{ |
| 101 // start and limit of the input range |
| 102 int32_t start = offsets.start; |
| 103 int32_t limit = offsets.limit; |
| 104 if(start >= limit) { |
| 105 return; |
| 106 } |
| 107 |
| 108 /* |
| 109 * Normalize as short chunks at a time as possible even in |
| 110 * bulk mode, so that styled text is minimally disrupted. |
| 111 * In incremental mode, a chunk that ends with offsets.limit |
| 112 * must not be normalized. |
| 113 * |
| 114 * If it was known that the input text is not styled, then |
| 115 * a bulk mode normalization could look like this: |
| 116 |
| 117 UnicodeString input, normalized; |
| 118 int32_t length = limit - start; |
| 119 _Replaceable_extractBetween(text, start, limit, input.getBuffer(length)); |
| 120 input.releaseBuffer(length); |
| 121 |
| 122 UErrorCode status = U_ZERO_ERROR; |
| 123 fNorm2.normalize(input, normalized, status); |
| 124 |
| 125 text.handleReplaceBetween(start, limit, normalized); |
| 126 |
| 127 int32_t delta = normalized.length() - length; |
| 128 offsets.contextLimit += delta; |
| 129 offsets.limit += delta; |
| 130 offsets.start = limit + delta; |
| 131 |
| 132 */ |
| 133 UErrorCode errorCode = U_ZERO_ERROR; |
| 134 UnicodeString segment; |
| 135 UnicodeString normalized; |
| 136 UChar32 c = text.char32At(start); |
| 137 do { |
| 138 int32_t prev = start; |
| 139 // Skip at least one character so we make progress. |
| 140 // c holds the character at start. |
| 141 segment.remove(); |
| 142 do { |
| 143 segment.append(c); |
| 144 start += U16_LENGTH(c); |
| 145 } while(start < limit && !fNorm2.hasBoundaryBefore(c = text.char32At(sta
rt))); |
| 146 if(start == limit && isIncremental && !fNorm2.hasBoundaryAfter(c)) { |
| 147 // stop in incremental mode when we reach the input limit |
| 148 // in case there are additional characters that could change the |
| 149 // normalization result |
| 150 start=prev; |
| 151 break; |
| 152 } |
| 153 fNorm2.normalize(segment, normalized, errorCode); |
| 154 if(U_FAILURE(errorCode)) { |
| 155 break; |
| 156 } |
| 157 if(segment != normalized) { |
| 158 // replace the input chunk with its normalized form |
| 159 text.handleReplaceBetween(prev, start, normalized); |
| 160 |
| 161 // update all necessary indexes accordingly |
| 162 int32_t delta = normalized.length() - (start - prev); |
| 163 start += delta; |
| 164 limit += delta; |
| 165 } |
| 166 } while(start < limit); |
| 167 |
| 168 offsets.start = start; |
| 169 offsets.contextLimit += limit - offsets.limit; |
| 170 offsets.limit = limit; |
| 171 } |
| 172 |
| 173 U_NAMESPACE_END |
| 174 |
| 175 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ |
OLD | NEW |