OLD | NEW |
(Empty) | |
| 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 |
| 5 #include "config.h" |
| 6 #include "ScriptRunIterator.h" |
| 7 |
| 8 #include "platform/Logging.h" |
| 9 #include "wtf/Threading.h" |
| 10 |
| 11 #include <ubidi_props.h> |
| 12 |
| 13 namespace blink { |
| 14 |
| 15 typedef ScriptData::PairedBracketType PairedBracketType; |
| 16 |
| 17 const int ScriptData::kMaxScriptCount = 20; |
| 18 |
| 19 ScriptData::~ScriptData() |
| 20 { |
| 21 } |
| 22 |
| 23 void ICUScriptData::getScripts(UChar32 ch, Vector<UScriptCode>& dst) const |
| 24 { |
| 25 |
| 26 UErrorCode status = U_ZERO_ERROR; |
| 27 // Leave room to insert primary script. It's not strictly necessary but |
| 28 // it ensures that the result won't ever be greater than kMaxScriptCount, |
| 29 // which some client someday might expect. |
| 30 dst.resize(kMaxScriptCount - 1); |
| 31 // Note, ICU convention is to return the number of available items |
| 32 // regardless of the capacity passed to the call. So count can be greater |
| 33 // than dst->size(), if a later version of the unicode data has more |
| 34 // than kMaxScriptCount items. |
| 35 int count = uscript_getScriptExtensions( |
| 36 ch, &dst[0], dst.size(), &status); |
| 37 if (status == U_BUFFER_OVERFLOW_ERROR) { |
| 38 // Allow this, we'll just use what we have. |
| 39 WTF_LOG_ERROR("Exceeded maximum script count of %d for 0x%x", kMaxScript
Count, ch); |
| 40 count = dst.size(); |
| 41 status = U_ZERO_ERROR; |
| 42 } |
| 43 UScriptCode primaryScript = uscript_getScript(ch, &status); |
| 44 |
| 45 if (U_FAILURE(status)) { |
| 46 WTF_LOG_ERROR("Could not get icu script data: %d for 0x%x", status, ch); |
| 47 dst.clear(); |
| 48 return; |
| 49 } |
| 50 |
| 51 dst.resize(count); |
| 52 |
| 53 if (primaryScript == dst.at(0)) { |
| 54 // Only one script (might be common or inherited -- these are never in |
| 55 // the extensions unless they're the only script), or extensions are in |
| 56 // priority order already. |
| 57 return; |
| 58 } |
| 59 |
| 60 if (primaryScript != USCRIPT_INHERITED |
| 61 && primaryScript != USCRIPT_COMMON |
| 62 && primaryScript != USCRIPT_INVALID_CODE) { |
| 63 // Not common or primary, with extensions that are not in order. We know |
| 64 // the primary, so we insert it at the front and swap the previous front |
| 65 // to somewhere else in the list. |
| 66 auto it = std::find(dst.begin() + 1, dst.end(), primaryScript); |
| 67 if (it == dst.end()) { |
| 68 dst.append(primaryScript); |
| 69 } |
| 70 std::swap(*dst.begin(), *it); |
| 71 return; |
| 72 } |
| 73 |
| 74 if (primaryScript == USCRIPT_COMMON) { |
| 75 if (count == 1) { |
| 76 // Common with a preferred script. Keep common at head. |
| 77 dst.prepend(primaryScript); |
| 78 return; |
| 79 } |
| 80 |
| 81 // Ignore common. Find the preferred script of the multiple scripts that |
| 82 // remain, and ensure it is at the head. Just keep swapping them in, |
| 83 // there aren't likely to be many. |
| 84 for (size_t i = 1; i < dst.size(); ++i) { |
| 85 if (dst.at(0) == USCRIPT_LATIN || dst.at(i) < dst.at(0)) { |
| 86 std::swap(dst.at(0), dst.at(i)); |
| 87 } |
| 88 } |
| 89 return; |
| 90 } |
| 91 |
| 92 // The primary is inherited, and there are other scripts. Put inherited at |
| 93 // the front, the true primary next, and then the others in random order. |
| 94 dst.append(dst.at(0)); |
| 95 dst.at(0) = primaryScript; |
| 96 for (size_t i = 2; i < dst.size(); ++i) { |
| 97 if (dst.at(1) == USCRIPT_LATIN || dst.at(i) < dst.at(1)) { |
| 98 std::swap(dst.at(1), dst.at(i)); |
| 99 } |
| 100 } |
| 101 } |
| 102 |
| 103 UChar32 ICUScriptData::getPairedBracket(UChar32 ch) const |
| 104 { |
| 105 return u_getBidiPairedBracket(ch); |
| 106 } |
| 107 |
| 108 PairedBracketType ICUScriptData::getPairedBracketType(UChar32 ch) const |
| 109 { |
| 110 return static_cast<PairedBracketType>( |
| 111 u_getIntPropertyValue(ch, UCHAR_BIDI_PAIRED_BRACKET_TYPE)); |
| 112 } |
| 113 |
| 114 const ICUScriptData* ICUScriptData::instance() |
| 115 { |
| 116 AtomicallyInitializedStaticReference(const ICUScriptData, icuScriptDataInsta
nce, (new ICUScriptData())); |
| 117 return &icuScriptDataInstance; |
| 118 } |
| 119 |
| 120 ScriptRunIterator::ScriptRunIterator(const UChar* text, size_t length, const Scr
iptData* data) |
| 121 : m_text(text) |
| 122 , m_length(length) |
| 123 , m_bracketsFixupDepth(0) |
| 124 // The initial value of m_aheadCharacter is not used. |
| 125 , m_aheadCharacter(0) |
| 126 , m_aheadPos(0) |
| 127 , m_commonPreferred(USCRIPT_COMMON) |
| 128 , m_scriptData(data) |
| 129 { |
| 130 ASSERT(text); |
| 131 ASSERT(data); |
| 132 |
| 133 if (m_aheadPos < m_length) { |
| 134 m_currentSet.clear(); |
| 135 // Priming the m_currentSet with USCRIPT_COMMON here so that the first |
| 136 // resolution between m_currentSet and m_nextSet in mergeSets() leads to |
| 137 // chosing the script of the first consumed character. |
| 138 m_currentSet.append(USCRIPT_COMMON); |
| 139 U16_NEXT(m_text, m_aheadPos, m_length, m_aheadCharacter); |
| 140 m_scriptData->getScripts(m_aheadCharacter, m_aheadSet); |
| 141 } |
| 142 } |
| 143 |
| 144 ScriptRunIterator::ScriptRunIterator(const UChar* text, size_t length) |
| 145 : ScriptRunIterator(text, length, ICUScriptData::instance()) |
| 146 { |
| 147 } |
| 148 |
| 149 bool ScriptRunIterator::consume(unsigned& limit, UScriptCode& script) |
| 150 { |
| 151 if (m_currentSet.isEmpty()) { |
| 152 return false; |
| 153 } |
| 154 |
| 155 size_t pos; |
| 156 UChar32 ch; |
| 157 while (fetch(&pos, &ch)) { |
| 158 PairedBracketType pairedType = m_scriptData->getPairedBracketType(ch); |
| 159 switch (pairedType) { |
| 160 case PairedBracketType::BracketTypeOpen: |
| 161 openBracket(ch); |
| 162 break; |
| 163 case PairedBracketType::BracketTypeClose: |
| 164 closeBracket(ch); |
| 165 break; |
| 166 default: |
| 167 break; |
| 168 } |
| 169 if (!mergeSets()) { |
| 170 limit = pos; |
| 171 script = resolveCurrentScript(); |
| 172 fixupStack(script); |
| 173 m_currentSet = m_nextSet; |
| 174 return true; |
| 175 } |
| 176 } |
| 177 |
| 178 limit = m_length; |
| 179 script = resolveCurrentScript(); |
| 180 m_currentSet.clear(); |
| 181 return true; |
| 182 } |
| 183 |
| 184 void ScriptRunIterator::openBracket(UChar32 ch) |
| 185 { |
| 186 if (m_brackets.size() == kMaxBrackets) { |
| 187 m_brackets.removeFirst(); |
| 188 if (m_bracketsFixupDepth == kMaxBrackets) { |
| 189 --m_bracketsFixupDepth; |
| 190 } |
| 191 } |
| 192 m_brackets.append(BracketRec({ ch, USCRIPT_COMMON })); |
| 193 ++m_bracketsFixupDepth; |
| 194 } |
| 195 |
| 196 void ScriptRunIterator::closeBracket(UChar32 ch) |
| 197 { |
| 198 if (m_brackets.size() > 0) { |
| 199 UChar32 target = m_scriptData->getPairedBracket(ch); |
| 200 for (auto it = m_brackets.rbegin(); it != m_brackets.rend(); ++it) { |
| 201 if (it->ch == target) { |
| 202 // Have a match, use open paren's resolved script. |
| 203 UScriptCode script = it->script; |
| 204 m_nextSet.clear(); |
| 205 m_nextSet.append(script); |
| 206 |
| 207 // And pop stack to this point. |
| 208 int numPopped = std::distance(m_brackets.rbegin(), it); |
| 209 // TODO: No resize operation in WTF::Deque? |
| 210 for (int i = 0; i < numPopped; ++i) |
| 211 m_brackets.removeLast(); |
| 212 m_bracketsFixupDepth = std::max(static_cast<size_t>(0), |
| 213 m_bracketsFixupDepth - numPopped); |
| 214 return; |
| 215 } |
| 216 } |
| 217 } |
| 218 // leave stack alone, no match |
| 219 } |
| 220 |
| 221 // Keep items in m_currentSet that are in m_nextSet. |
| 222 // |
| 223 // If the sets are disjoint, return false and leave m_currentSet unchanged. Else |
| 224 // return true and make current set the intersection. Make sure to maintain |
| 225 // current priority script as priority if it remains, else retain next priority |
| 226 // script if it remains. |
| 227 // |
| 228 // Also maintain a common preferred script. If current and next are both |
| 229 // common, and there is no common preferred script and next has a preferred |
| 230 // script, set the common preferred script to that of next. |
| 231 bool ScriptRunIterator::mergeSets() |
| 232 { |
| 233 if (m_nextSet.isEmpty() || m_currentSet.isEmpty()) { |
| 234 return false; |
| 235 } |
| 236 |
| 237 auto currentSetIt = m_currentSet.begin(); |
| 238 auto currentEnd = m_currentSet.end(); |
| 239 // Most of the time, this is the only one. |
| 240 // Advance the current iterator, we won't need to check it again later. |
| 241 UScriptCode priorityScript = *currentSetIt++; |
| 242 |
| 243 // If next is common or inherited, the only thing that might change |
| 244 // is the common preferred script. |
| 245 if (m_nextSet.at(0) <= USCRIPT_INHERITED) { |
| 246 if (m_nextSet.size() == 2 && priorityScript <= USCRIPT_INHERITED && m_co
mmonPreferred == USCRIPT_COMMON) { |
| 247 m_commonPreferred = m_nextSet.at(1); |
| 248 } |
| 249 return true; |
| 250 } |
| 251 |
| 252 // If current is common or inherited, use the next script set. |
| 253 if (priorityScript <= USCRIPT_INHERITED) { |
| 254 m_currentSet = m_nextSet; |
| 255 return true; |
| 256 } |
| 257 |
| 258 // Neither is common or inherited. If current is a singleton, |
| 259 // just see if it exists in the next set. This is the common case. |
| 260 auto next_it = m_nextSet.begin(); |
| 261 auto next_end = m_nextSet.end(); |
| 262 if (currentSetIt == currentEnd) { |
| 263 return std::find(next_it, next_end, priorityScript) != next_end; |
| 264 } |
| 265 |
| 266 // Establish the priority script, if we have one. |
| 267 // First try current priority script. |
| 268 bool havePriority = std::find(next_it, next_end, priorityScript) |
| 269 != next_end; |
| 270 if (!havePriority) { |
| 271 // So try next priority script. |
| 272 // Skip the first current script, we already know it's not there. |
| 273 // Advance the next iterator, later we won't need to check it again. |
| 274 priorityScript = *next_it++; |
| 275 havePriority = std::find(currentSetIt, currentEnd, priorityScript) != cu
rrentEnd; |
| 276 } |
| 277 |
| 278 // Note that we can never write more scripts into the current vector than |
| 279 // it already contains, so currentWriteIt won't ever exceed the size/capacit
y. |
| 280 auto currentWriteIt = m_currentSet.begin(); |
| 281 if (havePriority) { |
| 282 // keep the priority script. |
| 283 *currentWriteIt++ = priorityScript; |
| 284 } |
| 285 |
| 286 if (next_it != next_end) { |
| 287 // Iterate over the remaining current scripts, and keep them if |
| 288 // they occur in the remaining next scripts. |
| 289 while (currentSetIt != currentEnd) { |
| 290 UScriptCode sc = *currentSetIt++; |
| 291 if (std::find(next_it, next_end, sc) != next_end) { |
| 292 *currentWriteIt++ = sc; |
| 293 } |
| 294 } |
| 295 } |
| 296 |
| 297 // Only change current if the run continues. |
| 298 int written = std::distance(m_currentSet.begin(), currentWriteIt); |
| 299 if (written > 0) { |
| 300 m_currentSet.resize(written); |
| 301 return true; |
| 302 } |
| 303 return false; |
| 304 } |
| 305 |
| 306 // When we hit the end of the run, and resolve the script, we now know the |
| 307 // resolved script of any open bracket that was pushed on the stack since |
| 308 // the start of the run. Fixup depth records how many of these there |
| 309 // were. We've maintained this count during pushes, and taken care to |
| 310 // adjust it if the stack got overfull and open brackets were pushed off |
| 311 // the bottom. This sets the script of the fixup_depth topmost entries of the |
| 312 // stack to the resolved script. |
| 313 void ScriptRunIterator::fixupStack(UScriptCode resolvedScript) |
| 314 { |
| 315 if (m_bracketsFixupDepth > 0) { |
| 316 if (m_bracketsFixupDepth > m_brackets.size()) { |
| 317 // Should never happen unless someone breaks the code. |
| 318 WTF_LOG_ERROR("Brackets fixup depth exceeds size of bracket vector."
); |
| 319 m_bracketsFixupDepth = m_brackets.size(); |
| 320 } |
| 321 auto it = m_brackets.rbegin(); |
| 322 for (size_t i = 0; i < m_bracketsFixupDepth; ++i) { |
| 323 it->script = resolvedScript; |
| 324 ++it; |
| 325 } |
| 326 m_bracketsFixupDepth = 0; |
| 327 } |
| 328 } |
| 329 |
| 330 bool ScriptRunIterator::fetch(size_t* pos, UChar32* ch) |
| 331 { |
| 332 if (m_aheadPos > m_length) { |
| 333 return false; |
| 334 } |
| 335 *pos = m_aheadPos - (m_aheadCharacter >= 0x10000 ? 2 : 1); |
| 336 *ch = m_aheadCharacter; |
| 337 |
| 338 m_nextSet.swap(m_aheadSet); |
| 339 if (m_aheadPos == m_length) { |
| 340 // No more data to fetch, but last character still needs to be |
| 341 // processed. Advance m_aheadPos so that next time we will know |
| 342 // this has been done. |
| 343 m_aheadPos++; |
| 344 return true; |
| 345 } |
| 346 |
| 347 U16_NEXT(m_text, m_aheadPos, m_length, m_aheadCharacter); |
| 348 m_scriptData->getScripts(m_aheadCharacter, m_aheadSet); |
| 349 if (m_aheadSet.isEmpty()) { |
| 350 // No scripts for this character. This has already been logged, so |
| 351 // we just terminate processing this text. |
| 352 return false; |
| 353 } |
| 354 if (m_aheadSet[0] == USCRIPT_INHERITED && m_aheadSet.size() > 1) { |
| 355 if (m_nextSet[0] == USCRIPT_COMMON) { |
| 356 // Overwrite the next set with the non-inherited portion of the set. |
| 357 m_nextSet = m_aheadSet; |
| 358 m_nextSet.remove(0); |
| 359 // Discard the remaining values, we'll inherit. |
| 360 m_aheadSet.resize(1); |
| 361 } else { |
| 362 // Else, this applies to anything. |
| 363 m_aheadSet.resize(1); |
| 364 } |
| 365 } |
| 366 return true; |
| 367 } |
| 368 |
| 369 UScriptCode ScriptRunIterator::resolveCurrentScript() const |
| 370 { |
| 371 UScriptCode result = m_currentSet.at(0); |
| 372 return result == USCRIPT_COMMON ? m_commonPreferred : result; |
| 373 } |
| 374 |
| 375 } // namespace blink |
OLD | NEW |