Chromium Code Reviews| OLD | NEW |
|---|---|
| (Empty) | |
| 1 // Copyright 2015 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #include "config.h" | |
| 6 #include "ScriptRunIterator.h" | |
| 7 | |
| 8 #include "platform/Logging.h" | |
| 9 #include "wtf/Threading.h" | |
| 10 | |
| 11 #include <ubidi_props.h> | |
| 12 | |
| 13 namespace blink { | |
| 14 | |
| 15 typedef ScriptData::PairedBracketType PairedBracketType; | |
| 16 | |
| 17 ScriptData::~ScriptData() | |
| 18 { | |
| 19 } | |
| 20 | |
| 21 const int ScriptData::kMaxScriptCount; | |
| 22 | |
| 23 void ICUScriptData::getScripts(UChar32 ch, Vector<UScriptCode>& dst) const | |
| 24 { | |
| 25 | |
| 26 UErrorCode status = U_ZERO_ERROR; | |
| 27 // Leave room to insert primary script. It's not strictly necessary but | |
| 28 // it ensures that the result won't ever be greater than kMaxScriptCount, | |
| 29 // which some client someday might expect. | |
| 30 dst.resize(kMaxScriptCount - 1); | |
| 31 // Note, ICU convention is to return the number of available items | |
| 32 // regardless of the capacity passed to the call. So count can be greater | |
| 33 // than dst->size(), if a later version of the unicode data has more | |
| 34 // than kMaxScriptCount items. | |
| 35 int count = uscript_getScriptExtensions( | |
| 36 ch, &dst[0], dst.size(), &status); | |
| 37 if (status == U_BUFFER_OVERFLOW_ERROR) { | |
| 38 // Allow this, we'll just use what we have. | |
| 39 WTF_LOG_ERROR("Exceeded maximum script count of %d for 0x%x", kMaxScript Count, ch); | |
| 40 count = dst.size(); | |
| 41 status = U_ZERO_ERROR; | |
| 42 } | |
| 43 UScriptCode primaryScript = uscript_getScript(ch, &status); | |
| 44 | |
| 45 if (U_FAILURE(status)) { | |
| 46 WTF_LOG_ERROR("Could not get icu script data: %d for 0x%x", status, ch); | |
| 47 dst.clear(); | |
| 48 return; | |
| 49 } | |
| 50 | |
| 51 dst.resize(count); | |
| 52 | |
| 53 if (primaryScript == dst.at(0)) { | |
| 54 // Only one script (might be common or inherited -- these are never in | |
| 55 // the extensions unless they're the only script), or extensions are in | |
| 56 // priority order already. | |
| 57 return; | |
| 58 } | |
| 59 | |
| 60 if (primaryScript > USCRIPT_INHERITED) { | |
|
behdad_google
2015/09/01 11:16:36
This relies on order of USCRIPT values. Can you r
| |
| 61 // Not common or primary, with extensions that are not in order. We know | |
| 62 // the primary, so we insert it at the front and swap the previous front | |
| 63 // to somewhere else in the list. | |
| 64 auto it = std::find(dst.begin() + 1, dst.end(), primaryScript); | |
| 65 if (it == dst.end()) { | |
| 66 dst.append(primaryScript); | |
| 67 } | |
| 68 std::swap(*dst.begin(), *it); | |
| 69 return; | |
| 70 } | |
| 71 | |
| 72 if (primaryScript == USCRIPT_COMMON) { | |
| 73 if (count == 1) { | |
| 74 // Common with a preferred script. Keep common at head. | |
| 75 dst.prepend(primaryScript); | |
| 76 return; | |
| 77 } | |
| 78 | |
| 79 // Ignore common. Find the preferred script of the multiple scripts that | |
| 80 // remain, and ensure it is at the head. Just keep swapping them in, | |
| 81 // there aren't likely to be many. | |
| 82 for (size_t i = 1; i < dst.size(); ++i) { | |
| 83 if (dst.at(0) == USCRIPT_LATIN || dst.at(i) < dst.at(0)) { | |
|
behdad_google
2015/09/01 11:16:36
Note to self. This algorithm uses the order that
| |
| 84 std::swap(dst.at(0), dst.at(i)); | |
| 85 } | |
| 86 } | |
| 87 return; | |
| 88 } | |
| 89 | |
| 90 // The primary is inherited, and there are other scripts. Put inherited at | |
| 91 // the front, the true primary next, and then the others in random order. | |
| 92 dst.append(dst.at(0)); | |
| 93 dst.at(0) = primaryScript; | |
| 94 for (size_t i = 2; i < dst.size(); ++i) { | |
| 95 if (dst.at(1) == USCRIPT_LATIN || dst.at(i) < dst.at(1)) { | |
| 96 std::swap(dst.at(1), dst.at(i)); | |
| 97 } | |
| 98 } | |
| 99 } | |
| 100 | |
| 101 UChar32 ICUScriptData::getPairedBracket(UChar32 ch) const | |
| 102 { | |
| 103 return u_getBidiPairedBracket(ch); | |
| 104 } | |
| 105 | |
| 106 PairedBracketType ICUScriptData::getPairedBracketType(UChar32 ch) const | |
| 107 { | |
| 108 return static_cast<PairedBracketType>( | |
| 109 u_getIntPropertyValue(ch, UCHAR_BIDI_PAIRED_BRACKET_TYPE)); | |
| 110 } | |
| 111 | |
| 112 const ICUScriptData* ICUScriptData::instance() | |
| 113 { | |
| 114 AtomicallyInitializedStaticReference(const ICUScriptData, icuScriptDataInsta nce, (new ICUScriptData())); | |
| 115 return &icuScriptDataInstance; | |
| 116 } | |
| 117 | |
| 118 ScriptRunIterator::ScriptRunIterator(const UChar* text, size_t length, const Scr iptData* data) | |
| 119 : m_text(text) | |
| 120 , m_length(length) | |
| 121 , m_bracketsFixupDepth(0) | |
| 122 // The initial value of m_aheadCharacter is not used. | |
| 123 , m_aheadCharacter(0) | |
| 124 , m_aheadPos(0) | |
| 125 , m_commonPreferred(USCRIPT_COMMON) | |
| 126 , m_scriptData(data) | |
| 127 { | |
| 128 ASSERT(text); | |
| 129 ASSERT(data); | |
| 130 | |
| 131 if (m_aheadPos < m_length) { | |
| 132 m_currentSet.clear(); | |
| 133 m_currentSet.append(USCRIPT_COMMON); | |
| 134 U16_NEXT(m_text, m_aheadPos, m_length, m_aheadCharacter); | |
| 135 m_scriptData->getScripts(m_aheadCharacter, m_aheadSet); | |
| 136 } | |
| 137 } | |
| 138 | |
| 139 ScriptRunIterator::ScriptRunIterator(const UChar* text, size_t length) | |
| 140 : ScriptRunIterator(text, length, ICUScriptData::instance()) | |
| 141 { | |
| 142 } | |
| 143 | |
| 144 bool ScriptRunIterator::consume(unsigned& limit, UScriptCode& script) | |
| 145 { | |
| 146 if (m_currentSet.isEmpty()) { | |
| 147 return false; | |
| 148 } | |
| 149 | |
| 150 size_t pos; | |
| 151 UChar32 ch; | |
| 152 while (fetch(&pos, &ch)) { | |
| 153 PairedBracketType pairedType = m_scriptData->getPairedBracketType(ch); | |
| 154 switch (pairedType) { | |
| 155 case PairedBracketType::BracketTypeOpen: | |
| 156 openBracket(ch); | |
| 157 break; | |
| 158 case PairedBracketType::BracketTypeClose: | |
| 159 closeBracket(ch); | |
| 160 break; | |
| 161 default: | |
| 162 break; | |
| 163 } | |
| 164 if (!mergeSets()) { | |
| 165 limit = pos; | |
| 166 script = resolveCurrentScript(); | |
| 167 fixupStack(script); | |
| 168 m_currentSet = m_nextSet; | |
| 169 return true; | |
| 170 } | |
| 171 } | |
| 172 | |
| 173 limit = m_length; | |
| 174 script = resolveCurrentScript(); | |
| 175 m_currentSet.clear(); | |
| 176 return true; | |
| 177 } | |
| 178 | |
| 179 void ScriptRunIterator::openBracket(UChar32 ch) | |
| 180 { | |
| 181 if (m_brackets.size() == kMaxBrackets) { | |
| 182 m_brackets.removeFirst(); | |
| 183 if (m_bracketsFixupDepth == kMaxBrackets) { | |
| 184 --m_bracketsFixupDepth; | |
| 185 } | |
| 186 } | |
| 187 m_brackets.append(BracketRec({ ch, USCRIPT_COMMON })); | |
| 188 ++m_bracketsFixupDepth; | |
| 189 } | |
| 190 | |
| 191 void ScriptRunIterator::closeBracket(UChar32 ch) | |
| 192 { | |
| 193 if (m_brackets.size() > 0) { | |
| 194 UChar32 target = m_scriptData->getPairedBracket(ch); | |
| 195 for (auto it = m_brackets.rbegin(); it != m_brackets.rend(); ++it) { | |
| 196 if (it->ch == target) { | |
| 197 // Have a match, use open paren's resolved script. | |
| 198 UScriptCode script = it->script; | |
| 199 m_nextSet.clear(); | |
| 200 m_nextSet.append(script); | |
| 201 | |
| 202 // And pop stack to this point. | |
| 203 int numPopped = std::distance(m_brackets.rbegin(), it); | |
| 204 // TODO: No resize operation in WTF::Deque? | |
| 205 for (int i = 0; i < numPopped; ++i) | |
| 206 m_brackets.removeLast(); | |
| 207 m_bracketsFixupDepth = std::max(static_cast<size_t>(0), | |
| 208 m_bracketsFixupDepth - numPopped); | |
| 209 return; | |
| 210 } | |
| 211 } | |
| 212 } | |
| 213 // leave stack alone, no match | |
| 214 } | |
| 215 | |
| 216 // Keep items in m_currentSet that are in m_nextSet. | |
| 217 // | |
| 218 // If the sets are disjoint, return false and leave m_currentSet unchanged. Else | |
| 219 // return true and make current set the intersection. Make sure to maintain | |
| 220 // current priority script as priority if it remains, else retain next priority | |
| 221 // script if it remains. | |
| 222 // | |
| 223 // Also maintain a common preferred script. If current and next are both | |
| 224 // common, and there is no common preferred script and next has a preferred | |
| 225 // script, set the common preferred script to that of next. | |
| 226 bool ScriptRunIterator::mergeSets() | |
| 227 { | |
| 228 if (m_nextSet.isEmpty() || m_currentSet.isEmpty()) { | |
| 229 return false; | |
| 230 } | |
| 231 | |
| 232 auto currentSetIt = m_currentSet.begin(); | |
| 233 auto currentEnd = m_currentSet.end(); | |
| 234 // Most of the time, this is the only one. | |
| 235 // Advance the current iterator, we won't need to check it again later. | |
| 236 UScriptCode priorityScript = *currentSetIt++; | |
| 237 | |
| 238 // If next is common or inherited, the only thing that might change | |
| 239 // is the common preferred script. | |
| 240 if (m_nextSet.at(0) <= USCRIPT_INHERITED) { | |
| 241 if (m_nextSet.size() == 2 && priorityScript <= USCRIPT_INHERITED && m_co mmonPreferred == USCRIPT_COMMON) { | |
| 242 m_commonPreferred = m_nextSet.at(1); | |
| 243 } | |
| 244 return true; | |
| 245 } | |
| 246 | |
| 247 // If current is common or inherited, use the next script set. | |
| 248 if (priorityScript <= USCRIPT_INHERITED) { | |
| 249 m_currentSet = m_nextSet; | |
| 250 return true; | |
| 251 } | |
| 252 | |
| 253 // Neither is common or inherited. If current is a singleton, | |
| 254 // just see if it exists in the next set. This is the common case. | |
| 255 auto next_it = m_nextSet.begin(); | |
| 256 auto next_end = m_nextSet.end(); | |
| 257 if (currentSetIt == currentEnd) { | |
| 258 return std::find(next_it, next_end, priorityScript) != next_end; | |
| 259 } | |
| 260 | |
| 261 // Establish the priority script, if we have one. | |
| 262 // First try current priority script. | |
| 263 bool havePriority = std::find(next_it, next_end, priorityScript) | |
| 264 != next_end; | |
| 265 if (!havePriority) { | |
| 266 // So try next priority script. | |
| 267 // Skip the first current script, we already know it's not there. | |
| 268 // Advance the next iterator, later we won't need to check it again. | |
| 269 priorityScript = *next_it++; | |
| 270 havePriority = std::find(currentSetIt, currentEnd, priorityScript) != cu rrentEnd; | |
| 271 } | |
| 272 | |
| 273 // Note that we can never write more scripts into the current vector than | |
| 274 // it already contains, so currentWriteIt won't ever exceed the size/capacit y. | |
| 275 auto currentWriteIt = m_currentSet.begin(); | |
| 276 if (havePriority) { | |
| 277 // keep the priority script. | |
| 278 *currentWriteIt++ = priorityScript; | |
| 279 } | |
| 280 | |
| 281 if (next_it != next_end) { | |
| 282 // Iterate over the remaining current scripts, and keep them if | |
| 283 // they occur in the remaining next scripts. | |
| 284 while (currentSetIt != currentEnd) { | |
| 285 UScriptCode sc = *currentSetIt++; | |
| 286 if (std::find(next_it, next_end, sc) != next_end) { | |
| 287 *currentWriteIt++ = sc; | |
| 288 } | |
| 289 } | |
| 290 } | |
| 291 | |
| 292 // Only change current if the run continues. | |
| 293 int written = std::distance(m_currentSet.begin(), currentWriteIt); | |
| 294 if (written > 0) { | |
| 295 m_currentSet.resize(written); | |
| 296 return true; | |
| 297 } | |
| 298 return false; | |
| 299 } | |
| 300 | |
| 301 // When we hit the end of the run, and resolve the script, we now know the | |
| 302 // resolved script of any open bracket that was pushed on the stack since | |
| 303 // the start of the run. Fixup depth records how many of these there | |
| 304 // were. We've maintained this count during pushes, and taken care to | |
| 305 // adjust it if the stack got overfull and open brackets were pushed off | |
| 306 // the bottom. This sets the script of the fixup_depth topmost entries of the | |
| 307 // stack to the resolved script. | |
| 308 void ScriptRunIterator::fixupStack(UScriptCode resolvedScript) | |
| 309 { | |
| 310 if (m_bracketsFixupDepth > 0) { | |
| 311 if (m_bracketsFixupDepth > m_brackets.size()) { | |
| 312 // Should never happen unless someone breaks the code. | |
| 313 WTF_LOG_ERROR("Brackets fixup depth exceeds size of bracket vector." ); | |
| 314 m_bracketsFixupDepth = m_brackets.size(); | |
| 315 } | |
| 316 auto it = m_brackets.rbegin(); | |
| 317 for (size_t i = 0; i < m_bracketsFixupDepth; ++i) { | |
| 318 it->script = resolvedScript; | |
| 319 ++it; | |
| 320 } | |
| 321 m_bracketsFixupDepth = 0; | |
| 322 } | |
| 323 } | |
| 324 | |
| 325 bool ScriptRunIterator::fetch(size_t* pos, UChar32* ch) | |
| 326 { | |
| 327 if (m_aheadPos > m_length) { | |
| 328 return false; | |
| 329 } | |
| 330 *pos = m_aheadPos - (m_aheadCharacter >= 0x10000 ? 2 : 1); | |
| 331 *ch = m_aheadCharacter; | |
| 332 | |
| 333 m_nextSet.swap(m_aheadSet); | |
| 334 if (m_aheadPos == m_length) { | |
| 335 // No more data to fetch, but last character still needs to be | |
| 336 // processed. Advance m_aheadPos so that next time we will know | |
| 337 // this has been done. | |
| 338 m_aheadPos++; | |
| 339 return true; | |
| 340 } | |
| 341 | |
| 342 U16_NEXT(m_text, m_aheadPos, m_length, m_aheadCharacter); | |
| 343 m_scriptData->getScripts(m_aheadCharacter, m_aheadSet); | |
| 344 if (m_aheadSet.isEmpty()) { | |
| 345 // No scripts for this character. This has already been logged, so | |
| 346 // we just terminate processing this text. | |
| 347 return false; | |
| 348 } | |
| 349 if (m_aheadSet[0] == USCRIPT_INHERITED && m_aheadSet.size() > 1) { | |
| 350 if (m_nextSet[0] == USCRIPT_COMMON) { | |
| 351 // Overwrite the next set with the non-inherited portion of the set. | |
| 352 m_nextSet = m_aheadSet; | |
| 353 m_nextSet.remove(0); | |
| 354 // Discard the remaining values, we'll inherit. | |
| 355 m_aheadSet.resize(1); | |
| 356 } else { | |
| 357 // Else, this applies to anything. | |
| 358 m_aheadSet.resize(1); | |
| 359 } | |
| 360 } | |
| 361 return true; | |
| 362 } | |
| 363 | |
| 364 UScriptCode ScriptRunIterator::resolveCurrentScript() const | |
| 365 { | |
| 366 UScriptCode result = m_currentSet.at(0); | |
| 367 return result == USCRIPT_COMMON ? m_commonPreferred : result; | |
| 368 } | |
| 369 | |
| 370 } // namespace blink | |
| OLD | NEW |