OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ********************************************************************** |
| 3 * Copyright (C) 2008-2010, International Business Machines |
| 4 * Corporation and others. All Rights Reserved. |
| 5 ********************************************************************** |
| 6 * Date Name Description |
| 7 * 05/11/2008 Andy Heninger Port from Java |
| 8 ********************************************************************** |
| 9 */ |
| 10 |
| 11 #include "unicode/utypes.h" |
| 12 |
| 13 #if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION |
| 14 |
| 15 #include "unicode/unifilt.h" |
| 16 #include "unicode/uchar.h" |
| 17 #include "unicode/uniset.h" |
| 18 #include "unicode/brkiter.h" |
| 19 #include "brktrans.h" |
| 20 #include "unicode/uchar.h" |
| 21 #include "cmemory.h" |
| 22 #include "uprops.h" |
| 23 #include "uinvchar.h" |
| 24 #include "util.h" |
| 25 #include "uvectr32.h" |
| 26 |
| 27 U_NAMESPACE_BEGIN |
| 28 |
| 29 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator) |
| 30 |
| 31 static const UChar SPACE = 32; // ' ' |
| 32 |
| 33 |
| 34 /** |
| 35 * Constructs a transliterator with the default delimiters '{' and |
| 36 * '}'. |
| 37 */ |
| 38 BreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter) : |
| 39 Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter), |
| 40 fInsertion(SPACE) { |
| 41 bi = NULL; |
| 42 UErrorCode status = U_ZERO_ERROR; |
| 43 boundaries = new UVector32(status); |
| 44 } |
| 45 |
| 46 |
| 47 /** |
| 48 * Destructor. |
| 49 */ |
| 50 BreakTransliterator::~BreakTransliterator() { |
| 51 delete bi; |
| 52 bi = NULL; |
| 53 delete boundaries; |
| 54 boundaries = NULL; |
| 55 } |
| 56 |
| 57 /** |
| 58 * Copy constructor. |
| 59 */ |
| 60 BreakTransliterator::BreakTransliterator(const BreakTransliterator& o) : |
| 61 Transliterator(o) { |
| 62 bi = NULL; |
| 63 if (o.bi != NULL) { |
| 64 bi = o.bi->clone(); |
| 65 } |
| 66 fInsertion = o.fInsertion; |
| 67 UErrorCode status = U_ZERO_ERROR; |
| 68 boundaries = new UVector32(status); |
| 69 } |
| 70 |
| 71 |
| 72 /** |
| 73 * Transliterator API. |
| 74 */ |
| 75 Transliterator* BreakTransliterator::clone(void) const { |
| 76 return new BreakTransliterator(*this); |
| 77 } |
| 78 |
| 79 /** |
| 80 * Implements {@link Transliterator#handleTransliterate}. |
| 81 */ |
| 82 void BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition&
offsets, |
| 83 UBool isIncremental ) const
{ |
| 84 |
| 85 UErrorCode status = U_ZERO_ERROR; |
| 86 boundaries->removeAllElements(); |
| 87 BreakTransliterator *nonConstThis = (BreakTransliterator *)this; |
| 88 nonConstThis->getBreakIterator(); // Lazy-create it if necessary |
| 89 UnicodeString sText = replaceableAsString(text); |
| 90 bi->setText(sText); |
| 91 bi->preceding(offsets.start); |
| 92 |
| 93 // To make things much easier, we will stack the boundaries, and then in
sert at the end. |
| 94 // generally, we won't need too many, since we will be filtered. |
| 95 |
| 96 int32_t boundary; |
| 97 for(boundary = bi->next(); boundary != UBRK_DONE && boundary < offsets.l
imit; boundary = bi->next()) { |
| 98 if (boundary == 0) continue; |
| 99 // HACK: Check to see that preceeding item was a letter |
| 100 |
| 101 UChar32 cp = sText.char32At(boundary-1); |
| 102 int type = u_charType(cp); |
| 103 //System.out.println(Integer.toString(cp,16) + " (before): " + type)
; |
| 104 if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue; |
| 105 |
| 106 cp = sText.char32At(boundary); |
| 107 type = u_charType(cp); |
| 108 //System.out.println(Integer.toString(cp,16) + " (after): " + type); |
| 109 if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue; |
| 110 |
| 111 boundaries->addElement(boundary, status); |
| 112 // printf("Boundary at %d\n", boundary); |
| 113 } |
| 114 |
| 115 int delta = 0; |
| 116 int lastBoundary = 0; |
| 117 |
| 118 if (boundaries->size() != 0) { // if we found something, adjust |
| 119 delta = boundaries->size() * fInsertion.length(); |
| 120 lastBoundary = boundaries->lastElementi(); |
| 121 |
| 122 // we do this from the end backwards, so that we don't have to keep
updating. |
| 123 |
| 124 while (boundaries->size() > 0) { |
| 125 boundary = boundaries->popi(); |
| 126 text.handleReplaceBetween(boundary, boundary, fInsertion); |
| 127 } |
| 128 } |
| 129 |
| 130 // Now fix up the return values |
| 131 offsets.contextLimit += delta; |
| 132 offsets.limit += delta; |
| 133 offsets.start = isIncremental ? lastBoundary + delta : offsets.limit; |
| 134 |
| 135 // TODO: do something with U_FAILURE(status); |
| 136 // (need to look at transliterators overall, not just here.) |
| 137 } |
| 138 |
| 139 // |
| 140 // getInsertion() |
| 141 // |
| 142 const UnicodeString &BreakTransliterator::getInsertion() const { |
| 143 return fInsertion; |
| 144 } |
| 145 |
| 146 // |
| 147 // setInsertion() |
| 148 // |
| 149 void BreakTransliterator::setInsertion(const UnicodeString &insertion) { |
| 150 this->fInsertion = insertion; |
| 151 } |
| 152 |
| 153 // |
| 154 // getBreakIterator Lazily create the break iterator if it does |
| 155 // not already exist. Copied from Java, probably |
| 156 // better to just create it in the constructor. |
| 157 // |
| 158 BreakIterator *BreakTransliterator::getBreakIterator() { |
| 159 UErrorCode status = U_ZERO_ERROR; |
| 160 if (bi == NULL) { |
| 161 // Note: Thai breaking behavior is universal, it is not |
| 162 // tied to the Thai locale. |
| 163 bi = BreakIterator::createWordInstance(Locale::getEnglish(), status); |
| 164 } |
| 165 return bi; |
| 166 } |
| 167 |
| 168 // |
| 169 // replaceableAsString Hack to let break iterators work |
| 170 // on the replaceable text from transliterators. |
| 171 // In practice, the only real Replaceable type that we |
| 172 // will be seeing is UnicodeString, so this function |
| 173 // will normally be efficient. |
| 174 // |
| 175 UnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) { |
| 176 UnicodeString s; |
| 177 UnicodeString *rs = dynamic_cast<UnicodeString *>(&r); |
| 178 if (rs != NULL) { |
| 179 s = *rs; |
| 180 } else { |
| 181 r.extractBetween(0, r.length(), s); |
| 182 } |
| 183 return s; |
| 184 } |
| 185 |
| 186 U_NAMESPACE_END |
| 187 |
| 188 #endif /* #if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION */ |
OLD | NEW |