OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ***************************************************************** |
| 3 * Copyright (c) 2002-2008, International Business Machines Corporation |
| 4 * and others. All Rights Reserved. |
| 5 ***************************************************************** |
| 6 * Date Name Description |
| 7 * 06/06/2002 aliu Creation. |
| 8 ***************************************************************** |
| 9 */ |
| 10 |
| 11 #include "unicode/utypes.h" |
| 12 |
| 13 #if !UCONFIG_NO_TRANSLITERATION |
| 14 |
| 15 #include "unicode/uobject.h" |
| 16 #include "unicode/uscript.h" |
| 17 #include "nultrans.h" |
| 18 #include "anytrans.h" |
| 19 #include "uvector.h" |
| 20 #include "tridpars.h" |
| 21 #include "hash.h" |
| 22 #include "putilimp.h" |
| 23 #include "uinvchar.h" |
| 24 |
| 25 //------------------------------------------------------------ |
| 26 // Constants |
| 27 |
| 28 static const UChar TARGET_SEP = 45; // '-' |
| 29 static const UChar VARIANT_SEP = 47; // '/' |
| 30 static const UChar ANY[] = {65,110,121,0}; // "Any" |
| 31 static const UChar NULL_ID[] = {78,117,108,108,0}; // "Null" |
| 32 static const UChar LATIN_PIVOT[] = {45,76,97,116,105,110,59,76,97,116,105,110,45
,0}; // "-Latin;Latin-" |
| 33 |
| 34 //------------------------------------------------------------ |
| 35 |
| 36 U_CDECL_BEGIN |
| 37 /** |
| 38 * Deleter function for Transliterator*. |
| 39 */ |
| 40 static void U_CALLCONV |
| 41 _deleteTransliterator(void *obj) { |
| 42 delete (U_NAMESPACE_QUALIFIER Transliterator*) obj; |
| 43 } |
| 44 U_CDECL_END |
| 45 |
| 46 //------------------------------------------------------------ |
| 47 |
| 48 U_NAMESPACE_BEGIN |
| 49 |
| 50 //------------------------------------------------------------ |
| 51 // ScriptRunIterator |
| 52 |
| 53 /** |
| 54 * Returns a series of ranges corresponding to scripts. They will be |
| 55 * of the form: |
| 56 * |
| 57 * ccccSScSSccccTTcTcccc - c = common, S = first script, T = second |
| 58 * | | - first run (start, limit) |
| 59 * | | - second run (start, limit) |
| 60 * |
| 61 * That is, the runs will overlap. The reason for this is so that a |
| 62 * transliterator can consider common characters both before and after |
| 63 * the scripts. |
| 64 */ |
| 65 class ScriptRunIterator : public UMemory { |
| 66 private: |
| 67 const Replaceable& text; |
| 68 int32_t textStart; |
| 69 int32_t textLimit; |
| 70 |
| 71 public: |
| 72 /** |
| 73 * The code of the current run, valid after next() returns. May |
| 74 * be USCRIPT_INVALID_CODE if and only if the entire text is |
| 75 * COMMON/INHERITED. |
| 76 */ |
| 77 UScriptCode scriptCode; |
| 78 |
| 79 /** |
| 80 * The start of the run, inclusive, valid after next() returns. |
| 81 */ |
| 82 int32_t start; |
| 83 |
| 84 /** |
| 85 * The end of the run, exclusive, valid after next() returns. |
| 86 */ |
| 87 int32_t limit; |
| 88 |
| 89 /** |
| 90 * Constructs a run iterator over the given text from start |
| 91 * (inclusive) to limit (exclusive). |
| 92 */ |
| 93 ScriptRunIterator(const Replaceable& text, int32_t start, int32_t limit); |
| 94 |
| 95 /** |
| 96 * Returns TRUE if there are any more runs. TRUE is always |
| 97 * returned at least once. Upon return, the caller should |
| 98 * examine scriptCode, start, and limit. |
| 99 */ |
| 100 UBool next(); |
| 101 |
| 102 /** |
| 103 * Adjusts internal indices for a change in the limit index of the |
| 104 * given delta. A positive delta means the limit has increased. |
| 105 */ |
| 106 void adjustLimit(int32_t delta); |
| 107 |
| 108 private: |
| 109 ScriptRunIterator(const ScriptRunIterator &other); // forbid copying of this
class |
| 110 ScriptRunIterator &operator=(const ScriptRunIterator &other); // forbid copy
ing of this class |
| 111 }; |
| 112 |
| 113 ScriptRunIterator::ScriptRunIterator(const Replaceable& theText, |
| 114 int32_t myStart, int32_t myLimit) : |
| 115 text(theText) |
| 116 { |
| 117 textStart = myStart; |
| 118 textLimit = myLimit; |
| 119 limit = myStart; |
| 120 } |
| 121 |
| 122 UBool ScriptRunIterator::next() { |
| 123 UChar32 ch; |
| 124 UScriptCode s; |
| 125 UErrorCode ec = U_ZERO_ERROR; |
| 126 |
| 127 scriptCode = USCRIPT_INVALID_CODE; // don't know script yet |
| 128 start = limit; |
| 129 |
| 130 // Are we done? |
| 131 if (start == textLimit) { |
| 132 return FALSE; |
| 133 } |
| 134 |
| 135 // Move start back to include adjacent COMMON or INHERITED |
| 136 // characters |
| 137 while (start > textStart) { |
| 138 ch = text.char32At(start - 1); // look back |
| 139 s = uscript_getScript(ch, &ec); |
| 140 if (s == USCRIPT_COMMON || s == USCRIPT_INHERITED) { |
| 141 --start; |
| 142 } else { |
| 143 break; |
| 144 } |
| 145 } |
| 146 |
| 147 // Move limit ahead to include COMMON, INHERITED, and characters |
| 148 // of the current script. |
| 149 while (limit < textLimit) { |
| 150 ch = text.char32At(limit); // look ahead |
| 151 s = uscript_getScript(ch, &ec); |
| 152 if (s != USCRIPT_COMMON && s != USCRIPT_INHERITED) { |
| 153 if (scriptCode == USCRIPT_INVALID_CODE) { |
| 154 scriptCode = s; |
| 155 } else if (s != scriptCode) { |
| 156 break; |
| 157 } |
| 158 } |
| 159 ++limit; |
| 160 } |
| 161 |
| 162 // Return TRUE even if the entire text is COMMON / INHERITED, in |
| 163 // which case scriptCode will be USCRIPT_INVALID_CODE. |
| 164 return TRUE; |
| 165 } |
| 166 |
| 167 void ScriptRunIterator::adjustLimit(int32_t delta) { |
| 168 limit += delta; |
| 169 textLimit += delta; |
| 170 } |
| 171 |
| 172 //------------------------------------------------------------ |
| 173 // AnyTransliterator |
| 174 |
| 175 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(AnyTransliterator) |
| 176 |
| 177 AnyTransliterator::AnyTransliterator(const UnicodeString& id, |
| 178 const UnicodeString& theTarget, |
| 179 const UnicodeString& theVariant, |
| 180 UScriptCode theTargetScript, |
| 181 UErrorCode& ec) : |
| 182 Transliterator(id, NULL), |
| 183 targetScript(theTargetScript) |
| 184 { |
| 185 cache = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &ec); |
| 186 if (U_FAILURE(ec)) { |
| 187 return; |
| 188 } |
| 189 uhash_setValueDeleter(cache, _deleteTransliterator); |
| 190 |
| 191 target = theTarget; |
| 192 if (theVariant.length() > 0) { |
| 193 target.append(VARIANT_SEP).append(theVariant); |
| 194 } |
| 195 } |
| 196 |
| 197 AnyTransliterator::~AnyTransliterator() { |
| 198 uhash_close(cache); |
| 199 } |
| 200 |
| 201 /** |
| 202 * Copy constructor. |
| 203 */ |
| 204 AnyTransliterator::AnyTransliterator(const AnyTransliterator& o) : |
| 205 Transliterator(o), |
| 206 target(o.target), |
| 207 targetScript(o.targetScript) |
| 208 { |
| 209 // Don't copy the cache contents |
| 210 UErrorCode ec = U_ZERO_ERROR; |
| 211 cache = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &ec); |
| 212 if (U_FAILURE(ec)) { |
| 213 return; |
| 214 } |
| 215 uhash_setValueDeleter(cache, _deleteTransliterator); |
| 216 } |
| 217 |
| 218 /** |
| 219 * Transliterator API. |
| 220 */ |
| 221 Transliterator* AnyTransliterator::clone() const { |
| 222 return new AnyTransliterator(*this); |
| 223 } |
| 224 |
| 225 /** |
| 226 * Implements {@link Transliterator#handleTransliterate}. |
| 227 */ |
| 228 void AnyTransliterator::handleTransliterate(Replaceable& text, UTransPosition& p
os, |
| 229 UBool isIncremental) const { |
| 230 int32_t allStart = pos.start; |
| 231 int32_t allLimit = pos.limit; |
| 232 |
| 233 ScriptRunIterator it(text, pos.contextStart, pos.contextLimit); |
| 234 |
| 235 while (it.next()) { |
| 236 // Ignore runs in the ante context |
| 237 if (it.limit <= allStart) continue; |
| 238 |
| 239 // Try to instantiate transliterator from it.scriptCode to |
| 240 // our target or target/variant |
| 241 Transliterator* t = getTransliterator(it.scriptCode); |
| 242 |
| 243 if (t == NULL) { |
| 244 // We have no transliterator. Do nothing, but keep |
| 245 // pos.start up to date. |
| 246 pos.start = it.limit; |
| 247 continue; |
| 248 } |
| 249 |
| 250 // If the run end is before the transliteration limit, do |
| 251 // a non-incremental transliteration. Otherwise do an |
| 252 // incremental one. |
| 253 UBool incremental = isIncremental && (it.limit >= allLimit); |
| 254 |
| 255 pos.start = uprv_max(allStart, it.start); |
| 256 pos.limit = uprv_min(allLimit, it.limit); |
| 257 int32_t limit = pos.limit; |
| 258 t->filteredTransliterate(text, pos, incremental); |
| 259 int32_t delta = pos.limit - limit; |
| 260 allLimit += delta; |
| 261 it.adjustLimit(delta); |
| 262 |
| 263 // We're done if we enter the post context |
| 264 if (it.limit >= allLimit) break; |
| 265 } |
| 266 |
| 267 // Restore limit. pos.start is fine where the last transliterator |
| 268 // left it, or at the end of the last run. |
| 269 pos.limit = allLimit; |
| 270 } |
| 271 |
| 272 Transliterator* AnyTransliterator::getTransliterator(UScriptCode source) const { |
| 273 |
| 274 if (source == targetScript || source == USCRIPT_INVALID_CODE) { |
| 275 return NULL; |
| 276 } |
| 277 |
| 278 Transliterator* t = (Transliterator*) uhash_iget(cache, (int32_t) source); |
| 279 if (t == NULL) { |
| 280 UErrorCode ec = U_ZERO_ERROR; |
| 281 UnicodeString sourceName(uscript_getName(source), -1, US_INV); |
| 282 UnicodeString id(sourceName); |
| 283 id.append(TARGET_SEP).append(target); |
| 284 |
| 285 t = Transliterator::createInstance(id, UTRANS_FORWARD, ec); |
| 286 if (U_FAILURE(ec) || t == NULL) { |
| 287 delete t; |
| 288 |
| 289 // Try to pivot around Latin, our most common script |
| 290 id = sourceName; |
| 291 id.append(LATIN_PIVOT).append(target); |
| 292 t = Transliterator::createInstance(id, UTRANS_FORWARD, ec); |
| 293 if (U_FAILURE(ec) || t == NULL) { |
| 294 delete t; |
| 295 t = NULL; |
| 296 } |
| 297 } |
| 298 |
| 299 if (t != NULL) { |
| 300 uhash_iput(cache, (int32_t) source, t, &ec); |
| 301 } |
| 302 } |
| 303 |
| 304 return t; |
| 305 } |
| 306 |
| 307 /** |
| 308 * Return the script code for a given name, or -1 if not found. |
| 309 */ |
| 310 static UScriptCode scriptNameToCode(const UnicodeString& name) { |
| 311 char buf[128]; |
| 312 UScriptCode code; |
| 313 UErrorCode ec = U_ZERO_ERROR; |
| 314 int32_t nameLen = name.length(); |
| 315 UBool isInvariant = uprv_isInvariantUString(name.getBuffer(), nameLen); |
| 316 |
| 317 if (isInvariant) { |
| 318 name.extract(0, nameLen, buf, (int32_t)sizeof(buf), US_INV); |
| 319 buf[127] = 0; // Make sure that we NULL terminate the string. |
| 320 } |
| 321 if (!isInvariant || uscript_getCode(buf, &code, 1, &ec) != 1 || U_FAILURE(ec
)) |
| 322 { |
| 323 code = USCRIPT_INVALID_CODE; |
| 324 } |
| 325 return code; |
| 326 } |
| 327 |
| 328 /** |
| 329 * Registers standard transliterators with the system. Called by |
| 330 * Transliterator during initialization. Scan all current targets and |
| 331 * register those that are scripts T as Any-T/V. |
| 332 */ |
| 333 void AnyTransliterator::registerIDs() { |
| 334 |
| 335 UErrorCode ec = U_ZERO_ERROR; |
| 336 Hashtable seen(TRUE, ec); |
| 337 |
| 338 int32_t sourceCount = Transliterator::_countAvailableSources(); |
| 339 for (int32_t s=0; s<sourceCount; ++s) { |
| 340 UnicodeString source; |
| 341 Transliterator::_getAvailableSource(s, source); |
| 342 |
| 343 // Ignore the "Any" source |
| 344 if (source.caseCompare(ANY, 0 /*U_FOLD_CASE_DEFAULT*/) == 0) continue; |
| 345 |
| 346 int32_t targetCount = Transliterator::_countAvailableTargets(source); |
| 347 for (int32_t t=0; t<targetCount; ++t) { |
| 348 UnicodeString target; |
| 349 Transliterator::_getAvailableTarget(t, source, target); |
| 350 |
| 351 // Only process each target once |
| 352 if (seen.geti(target) != 0) continue; |
| 353 ec = U_ZERO_ERROR; |
| 354 seen.puti(target, 1, ec); |
| 355 |
| 356 // Get the script code for the target. If not a script, ignore. |
| 357 UScriptCode targetScript = scriptNameToCode(target); |
| 358 if (targetScript == USCRIPT_INVALID_CODE) continue; |
| 359 |
| 360 int32_t variantCount = Transliterator::_countAvailableVariants(sourc
e, target); |
| 361 // assert(variantCount >= 1); |
| 362 for (int32_t v=0; v<variantCount; ++v) { |
| 363 UnicodeString variant; |
| 364 Transliterator::_getAvailableVariant(v, source, target, variant)
; |
| 365 |
| 366 UnicodeString id; |
| 367 TransliteratorIDParser::STVtoID(ANY, target, variant, id); |
| 368 ec = U_ZERO_ERROR; |
| 369 AnyTransliterator* t = new AnyTransliterator(id, target, variant
, |
| 370 targetScript, ec); |
| 371 if (U_FAILURE(ec)) { |
| 372 delete t; |
| 373 } else { |
| 374 Transliterator::_registerInstance(t); |
| 375 Transliterator::_registerSpecialInverse(target, NULL_ID, FAL
SE); |
| 376 } |
| 377 } |
| 378 } |
| 379 } |
| 380 } |
| 381 |
| 382 U_NAMESPACE_END |
| 383 |
| 384 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ |
| 385 |
| 386 //eof |
OLD | NEW |