OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ********************************************************************** |
| 3 * Copyright (c) 2002-2004, International Business Machines Corporation |
| 4 * and others. All Rights Reserved. |
| 5 ********************************************************************** |
| 6 * Date Name Description |
| 7 * 01/21/2002 aliu Creation. |
| 8 ********************************************************************** |
| 9 */ |
| 10 |
| 11 #include "unicode/utypes.h" |
| 12 |
| 13 #if !UCONFIG_NO_TRANSLITERATION |
| 14 |
| 15 #include "strrepl.h" |
| 16 #include "rbt_data.h" |
| 17 #include "util.h" |
| 18 #include "unicode/uniset.h" |
| 19 |
| 20 U_NAMESPACE_BEGIN |
| 21 |
| 22 static const UChar EMPTY[] = { 0 }; // empty string: "" |
| 23 |
| 24 UnicodeReplacer::~UnicodeReplacer() {} |
| 25 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringReplacer) |
| 26 |
| 27 /** |
| 28 * Construct a StringReplacer that sets the emits the given output |
| 29 * text and sets the cursor to the given position. |
| 30 * @param theOutput text that will replace input text when the |
| 31 * replace() method is called. May contain stand-in characters |
| 32 * that represent nested replacers. |
| 33 * @param theCursorPos cursor position that will be returned by |
| 34 * the replace() method |
| 35 * @param theData transliterator context object that translates |
| 36 * stand-in characters to UnicodeReplacer objects |
| 37 */ |
| 38 StringReplacer::StringReplacer(const UnicodeString& theOutput, |
| 39 int32_t theCursorPos, |
| 40 const TransliterationRuleData* theData) { |
| 41 output = theOutput; |
| 42 cursorPos = theCursorPos; |
| 43 hasCursor = TRUE; |
| 44 data = theData; |
| 45 isComplex = TRUE; |
| 46 } |
| 47 |
| 48 /** |
| 49 * Construct a StringReplacer that sets the emits the given output |
| 50 * text and does not modify the cursor. |
| 51 * @param theOutput text that will replace input text when the |
| 52 * replace() method is called. May contain stand-in characters |
| 53 * that represent nested replacers. |
| 54 * @param theData transliterator context object that translates |
| 55 * stand-in characters to UnicodeReplacer objects |
| 56 */ |
| 57 StringReplacer::StringReplacer(const UnicodeString& theOutput, |
| 58 const TransliterationRuleData* theData) { |
| 59 output = theOutput; |
| 60 cursorPos = 0; |
| 61 hasCursor = FALSE; |
| 62 data = theData; |
| 63 isComplex = TRUE; |
| 64 } |
| 65 |
| 66 /** |
| 67 * Copy constructor. |
| 68 */ |
| 69 StringReplacer::StringReplacer(const StringReplacer& other) : |
| 70 UnicodeFunctor(other), |
| 71 UnicodeReplacer(other) |
| 72 { |
| 73 output = other.output; |
| 74 cursorPos = other.cursorPos; |
| 75 hasCursor = other.hasCursor; |
| 76 data = other.data; |
| 77 isComplex = other.isComplex; |
| 78 } |
| 79 |
| 80 /** |
| 81 * Destructor |
| 82 */ |
| 83 StringReplacer::~StringReplacer() { |
| 84 } |
| 85 |
| 86 /** |
| 87 * Implement UnicodeFunctor |
| 88 */ |
| 89 UnicodeFunctor* StringReplacer::clone() const { |
| 90 return new StringReplacer(*this); |
| 91 } |
| 92 |
| 93 /** |
| 94 * Implement UnicodeFunctor |
| 95 */ |
| 96 UnicodeReplacer* StringReplacer::toReplacer() const { |
| 97 return (UnicodeReplacer*) this; |
| 98 } |
| 99 |
| 100 /** |
| 101 * UnicodeReplacer API |
| 102 */ |
| 103 int32_t StringReplacer::replace(Replaceable& text, |
| 104 int32_t start, |
| 105 int32_t limit, |
| 106 int32_t& cursor) { |
| 107 int32_t outLen; |
| 108 int32_t newStart = 0; |
| 109 |
| 110 // NOTE: It should be possible to _always_ run the complex |
| 111 // processing code; just slower. If not, then there is a bug |
| 112 // in the complex processing code. |
| 113 |
| 114 // Simple (no nested replacers) Processing Code : |
| 115 if (!isComplex) { |
| 116 text.handleReplaceBetween(start, limit, output); |
| 117 outLen = output.length(); |
| 118 |
| 119 // Setup default cursor position (for cursorPos within output) |
| 120 newStart = cursorPos; |
| 121 } |
| 122 |
| 123 // Complex (nested replacers) Processing Code : |
| 124 else { |
| 125 /* When there are segments to be copied, use the Replaceable.copy() |
| 126 * API in order to retain out-of-band data. Copy everything to the |
| 127 * end of the string, then copy them back over the key. This preserves |
| 128 * the integrity of indices into the key and surrounding context while |
| 129 * generating the output text. |
| 130 */ |
| 131 UnicodeString buf; |
| 132 int32_t oOutput; // offset into 'output' |
| 133 isComplex = FALSE; |
| 134 |
| 135 // The temporary buffer starts at tempStart, and extends |
| 136 // to destLimit. The start of the buffer has a single |
| 137 // character from before the key. This provides style |
| 138 // data when addition characters are filled into the |
| 139 // temporary buffer. If there is nothing to the left, use |
| 140 // the non-character U+FFFF, which Replaceable subclasses |
| 141 // should treat specially as a "no-style character." |
| 142 // destStart points to the point after the style context |
| 143 // character, so it is tempStart+1 or tempStart+2. |
| 144 int32_t tempStart = text.length(); // start of temp buffer |
| 145 int32_t destStart = tempStart; // copy new text to here |
| 146 if (start > 0) { |
| 147 int32_t len = UTF_CHAR_LENGTH(text.char32At(start-1)); |
| 148 text.copy(start-len, start, tempStart); |
| 149 destStart += len; |
| 150 } else { |
| 151 UnicodeString str((UChar) 0xFFFF); |
| 152 text.handleReplaceBetween(tempStart, tempStart, str); |
| 153 destStart++; |
| 154 } |
| 155 int32_t destLimit = destStart; |
| 156 |
| 157 for (oOutput=0; oOutput<output.length(); ) { |
| 158 if (oOutput == cursorPos) { |
| 159 // Record the position of the cursor |
| 160 newStart = destLimit - destStart; // relative to start |
| 161 } |
| 162 UChar32 c = output.char32At(oOutput); |
| 163 UnicodeReplacer* r = data->lookupReplacer(c); |
| 164 if (r == NULL) { |
| 165 // Accumulate straight (non-segment) text. |
| 166 buf.append(c); |
| 167 } else { |
| 168 isComplex = TRUE; |
| 169 |
| 170 // Insert any accumulated straight text. |
| 171 if (buf.length() > 0) { |
| 172 text.handleReplaceBetween(destLimit, destLimit, buf); |
| 173 destLimit += buf.length(); |
| 174 buf.truncate(0); |
| 175 } |
| 176 |
| 177 // Delegate output generation to replacer object |
| 178 int32_t len = r->replace(text, destLimit, destLimit, cursor); |
| 179 destLimit += len; |
| 180 } |
| 181 oOutput += UTF_CHAR_LENGTH(c); |
| 182 } |
| 183 // Insert any accumulated straight text. |
| 184 if (buf.length() > 0) { |
| 185 text.handleReplaceBetween(destLimit, destLimit, buf); |
| 186 destLimit += buf.length(); |
| 187 } |
| 188 if (oOutput == cursorPos) { |
| 189 // Record the position of the cursor |
| 190 newStart = destLimit - destStart; // relative to start |
| 191 } |
| 192 |
| 193 outLen = destLimit - destStart; |
| 194 |
| 195 // Copy new text to start, and delete it |
| 196 text.copy(destStart, destLimit, start); |
| 197 text.handleReplaceBetween(tempStart + outLen, destLimit + outLen, EMPTY)
; |
| 198 |
| 199 // Delete the old text (the key) |
| 200 text.handleReplaceBetween(start + outLen, limit + outLen, EMPTY); |
| 201 } |
| 202 |
| 203 if (hasCursor) { |
| 204 // Adjust the cursor for positions outside the key. These |
| 205 // refer to code points rather than code units. If cursorPos |
| 206 // is within the output string, then use newStart, which has |
| 207 // already been set above. |
| 208 if (cursorPos < 0) { |
| 209 newStart = start; |
| 210 int32_t n = cursorPos; |
| 211 // Outside the output string, cursorPos counts code points |
| 212 while (n < 0 && newStart > 0) { |
| 213 newStart -= UTF_CHAR_LENGTH(text.char32At(newStart-1)); |
| 214 ++n; |
| 215 } |
| 216 newStart += n; |
| 217 } else if (cursorPos > output.length()) { |
| 218 newStart = start + outLen; |
| 219 int32_t n = cursorPos - output.length(); |
| 220 // Outside the output string, cursorPos counts code points |
| 221 while (n > 0 && newStart < text.length()) { |
| 222 newStart += UTF_CHAR_LENGTH(text.char32At(newStart)); |
| 223 --n; |
| 224 } |
| 225 newStart += n; |
| 226 } else { |
| 227 // Cursor is within output string. It has been set up above |
| 228 // to be relative to start. |
| 229 newStart += start; |
| 230 } |
| 231 |
| 232 cursor = newStart; |
| 233 } |
| 234 |
| 235 return outLen; |
| 236 } |
| 237 |
| 238 /** |
| 239 * UnicodeReplacer API |
| 240 */ |
| 241 UnicodeString& StringReplacer::toReplacerPattern(UnicodeString& rule, |
| 242 UBool escapeUnprintable) const
{ |
| 243 rule.truncate(0); |
| 244 UnicodeString quoteBuf; |
| 245 |
| 246 int32_t cursor = cursorPos; |
| 247 |
| 248 // Handle a cursor preceding the output |
| 249 if (hasCursor && cursor < 0) { |
| 250 while (cursor++ < 0) { |
| 251 ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnp
rintable, quoteBuf); |
| 252 } |
| 253 // Fall through and append '|' below |
| 254 } |
| 255 |
| 256 for (int32_t i=0; i<output.length(); ++i) { |
| 257 if (hasCursor && i == cursor) { |
| 258 ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnp
rintable, quoteBuf); |
| 259 } |
| 260 UChar c = output.charAt(i); // Ok to use 16-bits here |
| 261 |
| 262 UnicodeReplacer* r = data->lookupReplacer(c); |
| 263 if (r == NULL) { |
| 264 ICU_Utility::appendToRule(rule, c, FALSE, escapeUnprintable, quoteBu
f); |
| 265 } else { |
| 266 UnicodeString buf; |
| 267 r->toReplacerPattern(buf, escapeUnprintable); |
| 268 buf.insert(0, (UChar)0x20); |
| 269 buf.append((UChar)0x20); |
| 270 ICU_Utility::appendToRule(rule, buf, |
| 271 TRUE, escapeUnprintable, quoteBuf); |
| 272 } |
| 273 } |
| 274 |
| 275 // Handle a cursor after the output. Use > rather than >= because |
| 276 // if cursor == output.length() it is at the end of the output, |
| 277 // which is the default position, so we need not emit it. |
| 278 if (hasCursor && cursor > output.length()) { |
| 279 cursor -= output.length(); |
| 280 while (cursor-- > 0) { |
| 281 ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnp
rintable, quoteBuf); |
| 282 } |
| 283 ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprint
able, quoteBuf); |
| 284 } |
| 285 // Flush quoteBuf out to result |
| 286 ICU_Utility::appendToRule(rule, -1, |
| 287 TRUE, escapeUnprintable, quoteBuf); |
| 288 |
| 289 return rule; |
| 290 } |
| 291 |
| 292 /** |
| 293 * Implement UnicodeReplacer |
| 294 */ |
| 295 void StringReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const { |
| 296 UChar32 ch; |
| 297 for (int32_t i=0; i<output.length(); i+=UTF_CHAR_LENGTH(ch)) { |
| 298 ch = output.char32At(i); |
| 299 UnicodeReplacer* r = data->lookupReplacer(ch); |
| 300 if (r == NULL) { |
| 301 toUnionTo.add(ch); |
| 302 } else { |
| 303 r->addReplacementSetTo(toUnionTo); |
| 304 } |
| 305 } |
| 306 } |
| 307 |
| 308 /** |
| 309 * UnicodeFunctor API |
| 310 */ |
| 311 void StringReplacer::setData(const TransliterationRuleData* d) { |
| 312 data = d; |
| 313 int32_t i = 0; |
| 314 while (i<output.length()) { |
| 315 UChar32 c = output.char32At(i); |
| 316 UnicodeFunctor* f = data->lookup(c); |
| 317 if (f != NULL) { |
| 318 f->setData(data); |
| 319 } |
| 320 i += UTF_CHAR_LENGTH(c); |
| 321 } |
| 322 } |
| 323 |
| 324 U_NAMESPACE_END |
| 325 |
| 326 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ |
| 327 |
| 328 //eof |
OLD | NEW |