OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ******************************************************************************* |
| 3 * |
| 4 * Copyright (C) 2009-2010, International Business Machines |
| 5 * Corporation and others. All Rights Reserved. |
| 6 * |
| 7 ******************************************************************************* |
| 8 * file name: filterednormalizer2.cpp |
| 9 * encoding: US-ASCII |
| 10 * tab size: 8 (not used) |
| 11 * indentation:4 |
| 12 * |
| 13 * created on: 2009dec10 |
| 14 * created by: Markus W. Scherer |
| 15 */ |
| 16 |
| 17 #include "unicode/utypes.h" |
| 18 |
| 19 #if !UCONFIG_NO_NORMALIZATION |
| 20 |
| 21 #include "unicode/normalizer2.h" |
| 22 #include "unicode/uniset.h" |
| 23 #include "unicode/unistr.h" |
| 24 #include "unicode/unorm.h" |
| 25 #include "cpputils.h" |
| 26 |
| 27 U_NAMESPACE_BEGIN |
| 28 |
| 29 UnicodeString & |
| 30 FilteredNormalizer2::normalize(const UnicodeString &src, |
| 31 UnicodeString &dest, |
| 32 UErrorCode &errorCode) const { |
| 33 uprv_checkCanGetBuffer(src, errorCode); |
| 34 if(U_FAILURE(errorCode)) { |
| 35 dest.setToBogus(); |
| 36 return dest; |
| 37 } |
| 38 if(&dest==&src) { |
| 39 errorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| 40 return dest; |
| 41 } |
| 42 dest.remove(); |
| 43 return normalize(src, dest, USET_SPAN_SIMPLE, errorCode); |
| 44 } |
| 45 |
| 46 // Internal: No argument checking, and appends to dest. |
| 47 // Pass as input spanCondition the one that is likely to yield a non-zero |
| 48 // span length at the start of src. |
| 49 // For set=[:age=3.2:], since almost all common characters were in Unicode 3.2, |
| 50 // USET_SPAN_SIMPLE should be passed in for the start of src |
| 51 // and USET_SPAN_NOT_CONTAINED should be passed in if we continue after |
| 52 // an in-filter prefix. |
| 53 UnicodeString & |
| 54 FilteredNormalizer2::normalize(const UnicodeString &src, |
| 55 UnicodeString &dest, |
| 56 USetSpanCondition spanCondition, |
| 57 UErrorCode &errorCode) const { |
| 58 UnicodeString tempDest; // Don't throw away destination buffer between iter
ations. |
| 59 for(int32_t prevSpanLimit=0; prevSpanLimit<src.length();) { |
| 60 int32_t spanLimit=set.span(src, prevSpanLimit, spanCondition); |
| 61 int32_t spanLength=spanLimit-prevSpanLimit; |
| 62 if(spanCondition==USET_SPAN_NOT_CONTAINED) { |
| 63 if(spanLength!=0) { |
| 64 dest.append(src, prevSpanLimit, spanLength); |
| 65 } |
| 66 spanCondition=USET_SPAN_SIMPLE; |
| 67 } else { |
| 68 if(spanLength!=0) { |
| 69 // Not norm2.normalizeSecondAndAppend() because we do not want |
| 70 // to modify the non-filter part of dest. |
| 71 dest.append(norm2.normalize(src.tempSubStringBetween(prevSpanLim
it, spanLimit), |
| 72 tempDest, errorCode)); |
| 73 if(U_FAILURE(errorCode)) { |
| 74 break; |
| 75 } |
| 76 } |
| 77 spanCondition=USET_SPAN_NOT_CONTAINED; |
| 78 } |
| 79 prevSpanLimit=spanLimit; |
| 80 } |
| 81 return dest; |
| 82 } |
| 83 |
| 84 UnicodeString & |
| 85 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first, |
| 86 const UnicodeString &second, |
| 87 UErrorCode &errorCode) const { |
| 88 return normalizeSecondAndAppend(first, second, TRUE, errorCode); |
| 89 } |
| 90 |
| 91 UnicodeString & |
| 92 FilteredNormalizer2::append(UnicodeString &first, |
| 93 const UnicodeString &second, |
| 94 UErrorCode &errorCode) const { |
| 95 return normalizeSecondAndAppend(first, second, FALSE, errorCode); |
| 96 } |
| 97 |
| 98 UnicodeString & |
| 99 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first, |
| 100 const UnicodeString &second, |
| 101 UBool doNormalize, |
| 102 UErrorCode &errorCode) const { |
| 103 uprv_checkCanGetBuffer(first, errorCode); |
| 104 uprv_checkCanGetBuffer(second, errorCode); |
| 105 if(U_FAILURE(errorCode)) { |
| 106 return first; |
| 107 } |
| 108 if(&first==&second) { |
| 109 errorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| 110 return first; |
| 111 } |
| 112 if(first.isEmpty()) { |
| 113 if(doNormalize) { |
| 114 return normalize(second, first, errorCode); |
| 115 } else { |
| 116 return first=second; |
| 117 } |
| 118 } |
| 119 // merge the in-filter suffix of the first string with the in-filter prefix
of the second |
| 120 int32_t prefixLimit=set.span(second, 0, USET_SPAN_SIMPLE); |
| 121 if(prefixLimit!=0) { |
| 122 UnicodeString prefix(second.tempSubString(0, prefixLimit)); |
| 123 int32_t suffixStart=set.spanBack(first, INT32_MAX, USET_SPAN_SIMPLE); |
| 124 if(suffixStart==0) { |
| 125 if(doNormalize) { |
| 126 norm2.normalizeSecondAndAppend(first, prefix, errorCode); |
| 127 } else { |
| 128 norm2.append(first, prefix, errorCode); |
| 129 } |
| 130 } else { |
| 131 UnicodeString middle(first, suffixStart, INT32_MAX); |
| 132 if(doNormalize) { |
| 133 norm2.normalizeSecondAndAppend(middle, prefix, errorCode); |
| 134 } else { |
| 135 norm2.append(middle, prefix, errorCode); |
| 136 } |
| 137 first.replace(suffixStart, INT32_MAX, middle); |
| 138 } |
| 139 } |
| 140 if(prefixLimit<second.length()) { |
| 141 UnicodeString rest(second.tempSubString(prefixLimit, INT32_MAX)); |
| 142 if(doNormalize) { |
| 143 normalize(rest, first, USET_SPAN_NOT_CONTAINED, errorCode); |
| 144 } else { |
| 145 first.append(rest); |
| 146 } |
| 147 } |
| 148 return first; |
| 149 } |
| 150 |
| 151 UBool |
| 152 FilteredNormalizer2::getDecomposition(UChar32 c, UnicodeString &decomposition) c
onst { |
| 153 return set.contains(c) && norm2.getDecomposition(c, decomposition); |
| 154 } |
| 155 |
| 156 UBool |
| 157 FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode)
const { |
| 158 uprv_checkCanGetBuffer(s, errorCode); |
| 159 if(U_FAILURE(errorCode)) { |
| 160 return FALSE; |
| 161 } |
| 162 USetSpanCondition spanCondition=USET_SPAN_SIMPLE; |
| 163 for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) { |
| 164 int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition); |
| 165 if(spanCondition==USET_SPAN_NOT_CONTAINED) { |
| 166 spanCondition=USET_SPAN_SIMPLE; |
| 167 } else { |
| 168 if( !norm2.isNormalized(s.tempSubStringBetween(prevSpanLimit, spanLi
mit), errorCode) || |
| 169 U_FAILURE(errorCode) |
| 170 ) { |
| 171 return FALSE; |
| 172 } |
| 173 spanCondition=USET_SPAN_NOT_CONTAINED; |
| 174 } |
| 175 prevSpanLimit=spanLimit; |
| 176 } |
| 177 return TRUE; |
| 178 } |
| 179 |
| 180 UNormalizationCheckResult |
| 181 FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) c
onst { |
| 182 uprv_checkCanGetBuffer(s, errorCode); |
| 183 if(U_FAILURE(errorCode)) { |
| 184 return UNORM_MAYBE; |
| 185 } |
| 186 UNormalizationCheckResult result=UNORM_YES; |
| 187 USetSpanCondition spanCondition=USET_SPAN_SIMPLE; |
| 188 for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) { |
| 189 int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition); |
| 190 if(spanCondition==USET_SPAN_NOT_CONTAINED) { |
| 191 spanCondition=USET_SPAN_SIMPLE; |
| 192 } else { |
| 193 UNormalizationCheckResult qcResult= |
| 194 norm2.quickCheck(s.tempSubStringBetween(prevSpanLimit, spanLimit
), errorCode); |
| 195 if(U_FAILURE(errorCode) || qcResult==UNORM_NO) { |
| 196 return qcResult; |
| 197 } else if(qcResult==UNORM_MAYBE) { |
| 198 result=qcResult; |
| 199 } |
| 200 spanCondition=USET_SPAN_NOT_CONTAINED; |
| 201 } |
| 202 prevSpanLimit=spanLimit; |
| 203 } |
| 204 return result; |
| 205 } |
| 206 |
| 207 int32_t |
| 208 FilteredNormalizer2::spanQuickCheckYes(const UnicodeString &s, UErrorCode &error
Code) const { |
| 209 uprv_checkCanGetBuffer(s, errorCode); |
| 210 if(U_FAILURE(errorCode)) { |
| 211 return 0; |
| 212 } |
| 213 USetSpanCondition spanCondition=USET_SPAN_SIMPLE; |
| 214 for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) { |
| 215 int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition); |
| 216 if(spanCondition==USET_SPAN_NOT_CONTAINED) { |
| 217 spanCondition=USET_SPAN_SIMPLE; |
| 218 } else { |
| 219 int32_t yesLimit= |
| 220 prevSpanLimit+ |
| 221 norm2.spanQuickCheckYes( |
| 222 s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode)
; |
| 223 if(U_FAILURE(errorCode) || yesLimit<spanLimit) { |
| 224 return yesLimit; |
| 225 } |
| 226 spanCondition=USET_SPAN_NOT_CONTAINED; |
| 227 } |
| 228 prevSpanLimit=spanLimit; |
| 229 } |
| 230 return s.length(); |
| 231 } |
| 232 |
| 233 UBool |
| 234 FilteredNormalizer2::hasBoundaryBefore(UChar32 c) const { |
| 235 return !set.contains(c) || norm2.hasBoundaryBefore(c); |
| 236 } |
| 237 |
| 238 UBool |
| 239 FilteredNormalizer2::hasBoundaryAfter(UChar32 c) const { |
| 240 return !set.contains(c) || norm2.hasBoundaryAfter(c); |
| 241 } |
| 242 |
| 243 UBool |
| 244 FilteredNormalizer2::isInert(UChar32 c) const { |
| 245 return !set.contains(c) || norm2.isInert(c); |
| 246 } |
| 247 |
| 248 U_NAMESPACE_END |
| 249 |
| 250 // C API ------------------------------------------------------------------- *** |
| 251 |
| 252 U_NAMESPACE_USE |
| 253 |
| 254 U_DRAFT UNormalizer2 * U_EXPORT2 |
| 255 unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode
*pErrorCode) { |
| 256 if(U_FAILURE(*pErrorCode)) { |
| 257 return NULL; |
| 258 } |
| 259 if(filterSet==NULL) { |
| 260 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| 261 return NULL; |
| 262 } |
| 263 Normalizer2 *fn2=new FilteredNormalizer2(*(Normalizer2 *)norm2, |
| 264 *UnicodeSet::fromUSet(filterSet)); |
| 265 if(fn2==NULL) { |
| 266 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; |
| 267 } |
| 268 return (UNormalizer2 *)fn2; |
| 269 } |
| 270 |
| 271 #endif // !UCONFIG_NO_NORMALIZATION |
OLD | NEW |