Source/platform/fonts/ScriptRunIterator.cpp - Issue 1323513006: Upstream ScriptRunIterator for segmenting text runs by script

Unified Diff: Source/platform/fonts/ScriptRunIterator.cpp

Issue 1323513006: Upstream ScriptRunIterator for segmenting text runs by script (Closed) Base URL: svn://svn.chromium.org/blink/trunk

Patch Set: Created 5 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« Source/platform/fonts/ScriptRunIterator.h ('K') | « Source/platform/fonts/ScriptRunIterator.h ('k') | Source/platform/fonts/ScriptRunIteratorTest.cpp » ('j') | Source/platform/fonts/ScriptRunIteratorTest.cpp » ('J')
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: Source/platform/fonts/ScriptRunIterator.cpp

diff --git a/Source/platform/fonts/ScriptRunIterator.cpp b/Source/platform/fonts/ScriptRunIterator.cpp

new file mode 100644

index 0000000000000000000000000000000000000000..e68a30eb54b056719f65d2c4c04ca5c5a3ce5b72

--- /dev/null

+++ b/Source/platform/fonts/ScriptRunIterator.cpp

@@ -0,0 +1,371 @@

+// Use of this source code is governed by a BSD-style license that can be

+// found in the LICENSE file.

+#include "config.h"

+#include "ScriptRunIterator.h"

+#include "platform/Logging.h"

+#include "wtf/Threading.h"

+#include <ubidi_props.h>

+namespace blink {

+typedef ScriptData::PairedBracketType PairedBracketType;

+ScriptData::~ScriptData() // Out-of-line destructor definition; its body is not visible in this excerpt.

+const int ScriptData::kMaxScriptCount; // Out-of-class definition of the static constant (no initializer here; presumably the value is given in ScriptRunIterator.h -- confirm).

+// Fills |dst| with the scripts ICU associates with |ch|, ordered so that the
+// priority script comes first:
+//  - if the primary script already leads the script extensions, the list is
+//    returned exactly as ICU reported it;
+//  - a real (neither common nor inherited) primary script is moved to the
+//    front, being added to the list first if the extensions lack it;
+//  - a common primary stays at the head only when there is exactly one
+//    extension script; otherwise common is dropped and a preferred extension
+//    script is moved to the front;
+//  - an inherited primary is placed first, followed by the preferred
+//    extension script and then the remaining scripts in no particular order.
+// If ICU reports a failure, the error is logged and |dst| is left empty.
+void ICUScriptData::getScripts(UChar32 ch, Vector<UScriptCode>& dst) const
+{
+    UErrorCode status = U_ZERO_ERROR;
+    // Leave room to insert primary script. It's not strictly necessary but
+    // it ensures that the result won't ever be greater than kMaxScriptCount,
+    // which some client someday might expect.
+    dst.resize(kMaxScriptCount - 1);
+    // Note, ICU convention is to return the number of available items
+    // regardless of the capacity passed to the call. So count can be greater
+    // than dst.size(), if a later version of the unicode data has more
+    // than kMaxScriptCount items.
+    int count = uscript_getScriptExtensions(
+        ch, &dst[0], dst.size(), &status);
+    if (status == U_BUFFER_OVERFLOW_ERROR) {
+        // Allow this, we'll just use what we have.
+        WTF_LOG_ERROR("Exceeded maximum script count of %d for 0x%x", kMaxScriptCount, ch);
+        count = dst.size();
+        status = U_ZERO_ERROR;
+    }
+    UScriptCode primaryScript = uscript_getScript(ch, &status);
+
+    if (U_FAILURE(status)) {
+        WTF_LOG_ERROR("Could not get icu script data: %d for 0x%x", status, ch);
+        dst.clear();
+        return;
+    }
+
+    dst.resize(count);
+
+    if (primaryScript == dst.at(0)) {
+        // Only one script (might be common or inherited -- these are never in
+        // the extensions unless they're the only script), or extensions are in
+        // priority order already.
+        return;
+    }
+
+    if (primaryScript > USCRIPT_INHERITED) {
+        // Not common or inherited, with extensions that are not in order. We
+        // know the primary, so we insert it at the front and swap the previous
+        // front to somewhere else in the list.
+        auto it = std::find(dst.begin() + 1, dst.end(), primaryScript);
+        if (it == dst.end()) {
+            dst.append(primaryScript);
+            // append() may reallocate the buffer, and |it| was the old end()
+            // in any case, so it must not be dereferenced. Re-derive it to
+            // point at the element that was just appended.
+            it = dst.end() - 1;
+        }
+        std::swap(*dst.begin(), *it);
+        return;
+    }
+
+    if (primaryScript == USCRIPT_COMMON) {
+        if (count == 1) {
+            // Common with a preferred script. Keep common at head.
+            dst.prepend(primaryScript);
+            return;
+        }
+
+        // Ignore common. Find the preferred script of the multiple scripts that
+        // remain, and ensure it is at the head. Just keep swapping them in,
+        // there aren't likely to be many.
+        for (size_t i = 1; i < dst.size(); ++i) {
+            if (dst.at(0) == USCRIPT_LATIN || dst.at(i) < dst.at(0)) {
+                std::swap(dst.at(0), dst.at(i));
+            }
+        }
+        return;
+    }
+
+    // The primary is inherited, and there are other scripts. Put inherited at
+    // the front, the true primary next, and then the others in no particular
+    // order.
+    // Copy the front element before appending so the argument cannot alias
+    // storage that append() might reallocate.
+    const UScriptCode previousFront = dst.at(0);
+    dst.append(previousFront);
+    dst.at(0) = primaryScript;
+    for (size_t i = 2; i < dst.size(); ++i) {
+        if (dst.at(1) == USCRIPT_LATIN || dst.at(i) < dst.at(1)) {
+            std::swap(dst.at(1), dst.at(i));
+        }
+    }
+}

+UChar32 ICUScriptData::getPairedBracket(UChar32 ch) const // Returns whatever ICU's u_getBidiPairedBracket() reports for |ch|.

+ return u_getBidiPairedBracket(ch);

+PairedBracketType ICUScriptData::getPairedBracketType(UChar32 ch) const // Returns the paired-bracket type of |ch| as a PairedBracketType.

+ return static_cast<PairedBracketType>( // NOTE(review): the cast assumes PairedBracketType's enumerator values match ICU's UCHAR_BIDI_PAIRED_BRACKET_TYPE values -- confirm in ScriptRunIterator.h.

+ u_getIntPropertyValue(ch, UCHAR_BIDI_PAIRED_BRACKET_TYPE));

+const ICUScriptData* ICUScriptData::instance() // Returns a pointer to the shared ICUScriptData object.

+ AtomicallyInitializedStaticReference(const ICUScriptData, icuScriptDataInstance, (new ICUScriptData())); // Static reference to an object created with new; nothing here deletes it. Judging by the macro name, initialization is meant to be thread-safe.

+ return &icuScriptDataInstance;

+ScriptRunIterator::ScriptRunIterator(const UChar* text, size_t length, const ScriptData* data) // Sets up iteration over |text|[0, length) using |data| for script lookups, and primes the one-character lookahead.

+ : m_text(text)

+ , m_length(length)

+ , m_bracketsFixupDepth(0)

+ ,

+ // The initial value of m_aheadCharacter is never read: for non-empty text it is overwritten by U16_NEXT below. 0xffff is just a distinctive placeholder.

eae 2015/08/28 21:10:24 Why FFFF instead of 0?

+ m_aheadCharacter(0xffff)

+ , m_aheadPos(0)

+ , m_commonPreferred(USCRIPT_COMMON)

+ , m_scriptData(data)

+ ASSERT(text);

+ ASSERT(data);

+ if (m_aheadPos < m_length) { // Only non-empty text is primed; for empty text m_currentSet is not filled here.

+ m_currentSet.clear();

+ m_currentSet.append(USCRIPT_COMMON); // Seed the current set with common: consume() treats an empty current set as "finished", and mergeSets() replaces a common current set with the next set.

eae 2015/08/28 21:10:24 Why would we append commonn to the set here?

+ U16_NEXT(m_text, m_aheadPos, m_length, m_aheadCharacter); // Decode the first code point into m_aheadCharacter and advance m_aheadPos past it.

+ m_scriptData->getScripts(m_aheadCharacter, m_aheadSet); // Look up its scripts in advance; fetch() hands them out on its first call.

+ }

+ScriptRunIterator::ScriptRunIterator(const UChar* text, size_t length) // Convenience overload: delegates to the three-argument constructor, passing ICUScriptData::instance() as the script data.

+ : ScriptRunIterator(text, length, ICUScriptData::instance())

+bool ScriptRunIterator::consume(unsigned& limit, UScriptCode& script) // Reports the next script run: sets |limit| to the run's end offset and |script| to its resolved script and returns true, or returns false once every run has been reported.

+ if (m_currentSet.isEmpty()) { // An empty current set means there is nothing (left) to report.

+ return false;

+ }

+ size_t pos;

+ UChar32 ch;

+ while (fetch(&pos, &ch)) {

+ PairedBracketType paired_type = m_scriptData->getPairedBracketType(ch);

+ switch (paired_type) { // Bracket handling runs before the merge; closeBracket() may replace m_nextSet.

+ case PairedBracketType::OPEN:

+ openBracket(ch);

+ break;

+ case PairedBracketType::CLOSE:

+ closeBracket(ch);

+ break;

+ default:

+ break;

+ }

+ if (!mergeSets()) { // The fetched character cannot join the current run, so the run ends where that character starts.

+ limit = pos;

+ script = resolveCurrentScript();

+ fixupStack(script); // Record the resolved script on the open brackets pushed during this run.

+ m_currentSet = m_nextSet; // The character that ended the run seeds the next one.

+ return true;

+ }

+ limit = m_length; // fetch() returned false: the final run extends to the end of the text.

+ script = resolveCurrentScript();

+ m_currentSet.clear(); // Makes the next consume() call return false.

+ return true;

+void ScriptRunIterator::openBracket(UChar32 ch) // Pushes open bracket |ch| onto the bracket stack with USCRIPT_COMMON as its initial script.

+ if (m_brackets.size() == kMaxBrackets) { // Stack is full: make room before pushing.

+ m_brackets.removeFirst(); // Drops the oldest (bottom) entry.

+ if (m_bracketsFixupDepth == kMaxBrackets) { // Every entry was awaiting fixup, so the dropped one was too.

+ --m_bracketsFixupDepth;

+ }

+ m_brackets.append(BracketRec({ ch, USCRIPT_COMMON }));

+ ++m_bracketsFixupDepth; // One more entry whose script fixupStack() will overwrite once the run is resolved.

+// Handles closing bracket |ch|. Searches the bracket stack from the top for
+// the open bracket that pairs with |ch|. On a match, m_nextSet is replaced by
+// the single script recorded for that open bracket, the entries above the
+// match are popped, and the fixup depth is reduced by the number popped
+// (never below zero). If nothing matches, the stack is left untouched.
+void ScriptRunIterator::closeBracket(UChar32 ch)
+{
+    if (m_brackets.size() > 0) {
+        UChar32 target = m_scriptData->getPairedBracket(ch);
+        for (auto it = m_brackets.rbegin(); it != m_brackets.rend(); ++it) {
+            if (it->ch == target) {
+                // Have a match, use open paren's resolved script.
+                UScriptCode script = it->script;
+                m_nextSet.clear();
+                m_nextSet.append(script);
+
+                // And pop stack to this point. The distance is taken before
+                // any removal; |it| is not used again afterwards.
+                const size_t numPopped = static_cast<size_t>(std::distance(m_brackets.rbegin(), it));
+                // TODO: No resize operation in WTF::Deque?
+                for (size_t i = 0; i < numPopped; ++i)
+                    m_brackets.removeLast();
+
+                // The fixup depth is unsigned, so subtracting first and
+                // clamping afterwards would wrap around instead of stopping at
+                // zero. Compare before subtracting.
+                m_bracketsFixupDepth = m_bracketsFixupDepth > numPopped
+                    ? m_bracketsFixupDepth - numPopped
+                    : 0;
+                return;
+            }
+        }
+    }
+    // leave stack alone, no match
+}

+// Keep items in m_currentSet that are also in m_nextSet.

+//

+// If the sets are disjoint, return false and leave m_currentSet unchanged. Else

+// return true and make the current set the intersection. Make sure to maintain

+// current priority script as priority if it remains, else retain next priority

+// script if it remains.

+//

+// Also maintain a common preferred script. If current and next are both

+// common, and there is no common preferred script and next has a preferred

+// script, set the common preferred script to that of next.

+bool ScriptRunIterator::mergeSets()

+ if (m_nextSet.isEmpty() || m_currentSet.isEmpty()) { // Nothing to merge with: treated as "run does not continue".

+ return false;

+ }

+ auto cur_it = m_currentSet.begin();

eae 2015/08/28 21:10:24 curIt, or better yet, currentScript

+ auto cur_end = m_currentSet.end();

+ // Most of the time, this is the only one.

+ // Advance the current iterator, we won't need to check it again later.

+ UScriptCode priority_script = *cur_it++;

+ // If next is common or inherited, the only thing that might change

+ // is the common preferred script.

+ if (m_nextSet.at(0) <= USCRIPT_INHERITED) {

+ if (m_nextSet.size() == 2 && priority_script <= USCRIPT_INHERITED && m_commonPreferred == USCRIPT_COMMON) { // A two-element next set is read as {common/inherited, preferred script}; record it only if none has been recorded yet.

+ m_commonPreferred = m_nextSet.at(1);

+ }

+ return true; // Run continues; m_currentSet is left as it was.

+ }

+ // If current is common or inherited, use the next script set.

+ if (priority_script <= USCRIPT_INHERITED) {

+ m_currentSet = m_nextSet;

+ return true;

+ }

+ // Neither is common or inherited. If current is a singleton,

+ // just see if it exists in the next set. This is the common case.

+ auto next_it = m_nextSet.begin();

+ auto next_end = m_nextSet.end();

+ if (cur_it == cur_end) { // cur_it was advanced past the first element above, so this means "exactly one current script".

+ return std::find(next_it, next_end, priority_script) != next_end;

+ }

+ // Establish the priority script, if we have one.

+ // First try current priority script.

+ bool have_priority = std::find(next_it, next_end, priority_script)

+ != next_end;

+ if (!have_priority) {

+ // So try next priority script.

+ // Skip the first current script, we already know it's not there.

+ // Advance the next iterator, later we won't need to check it again.

+ priority_script = *next_it++;

+ have_priority = std::find(cur_it, cur_end, priority_script) != cur_end;

+ }

+ // Note that we can never write more scripts into the current vector than

+ // it already contains, so cur_write_it won't ever exceed the size/capacity.

+ auto cur_write_it = m_currentSet.begin(); // The intersection is compacted in place at the front of m_currentSet.

+ if (have_priority) {

+ // keep the priority script.

+ *cur_write_it++ = priority_script;

+ }

+ if (next_it != next_end) {

+ // Iterate over the remaining current scripts, and keep them if

+ // they occur in the remaining next scripts.

+ while (cur_it != cur_end) {

+ UScriptCode sc = *cur_it++;

+ if (std::find(next_it, next_end, sc) != next_end) {

+ *cur_write_it++ = sc;

+ }

+ // Only change current if the run continues.

+ int written = std::distance(m_currentSet.begin(), cur_write_it);

+ if (written > 0) {

+ m_currentSet.resize(written); // Truncate to the scripts that were kept.

+ return true;

+ }

+ return false; // Nothing was kept, so nothing was written: the sets are disjoint.

+// When we hit the end of the run, and resolve the script, we now know the

+// resolved script of any open bracket that was pushed on the stack since

+// the start of the run. Fixup depth records how many of these there

+// were. We've maintained this count during pushes, and taken care to

+// adjust it if the stack got overfull and open brackets were pushed off

+// the bottom. This sets the script of the m_bracketsFixupDepth topmost

+// entries of the stack to |resolved_script|, then resets the depth to zero.

+void ScriptRunIterator::fixupStack(UScriptCode resolved_script)

+ if (m_bracketsFixupDepth > 0) {

+ if (m_bracketsFixupDepth > m_brackets.size()) {

+ // Should never happen unless someone breaks the code.

+ WTF_LOG_ERROR("Brackets fixup depth exceeds size of bracket vector.");

+ m_bracketsFixupDepth = m_brackets.size(); // Clamp so the loop below cannot walk past the bottom of the stack.

+ }

+ auto it = m_brackets.rbegin(); // Start at the most recently pushed entry.

+ for (size_t i = 0; i < m_bracketsFixupDepth; ++i) {

+ it->script = resolved_script;

+ ++it;

+ }

+ m_bracketsFixupDepth = 0; // Entries pushed from here on belong to the next run.

+ }

+bool ScriptRunIterator::fetch(size_t* pos, UChar32* ch) // Hands out the lookahead character (*ch), its start offset (*pos) and its script set (in m_nextSet), then reads the following character ahead. Returns false when there is nothing more to process.

+ if (m_aheadPos > m_length) { // Only true after the final character was handed out (see the increment below).

+ return false;

+ }

+ *pos = m_aheadPos - (m_aheadCharacter >= 0x10000 ? 2 : 1); // m_aheadPos is already past the lookahead character; back up by its UTF-16 length (2 code units at or above U+10000, else 1).

+ *ch = m_aheadCharacter;

+ m_nextSet.swap(m_aheadSet); // The set looked up in advance becomes the "next" set; m_aheadSet is refilled below.

+ if (m_aheadPos == m_length) {

+ // No more data to fetch, but last character still needs to be

+ // processed. Advance m_aheadPos so that next time we will know

+ // this has been done.

+ m_aheadPos++;

+ return true;

+ }

+ U16_NEXT(m_text, m_aheadPos, m_length, m_aheadCharacter);

+ m_scriptData->getScripts(m_aheadCharacter, m_aheadSet);

+ if (m_aheadSet.isEmpty()) {

+ // No scripts for this character. This has already been logged, so

+ // we just terminate processing this text.

+ return false; // NOTE(review): *pos/*ch were already set for the previous character, but consume() stops on false and so never merges it -- confirm this is intended.

+ }

+ if (m_aheadSet[0] == USCRIPT_INHERITED && m_aheadSet.size() > 1) { // The upcoming character is inherited but also lists specific scripts.

+ if (m_nextSet[0] == USCRIPT_COMMON) { // NOTE(review): indexes m_nextSet without an emptiness check; the constructor does not verify that the first lookup produced any scripts -- confirm.

+ // Overwrite the next set with the non-inherited portion of the set.

+ m_nextSet = m_aheadSet;

+ m_nextSet.remove(0);

+ // Discard the remaining values, we'll inherit.

+ m_aheadSet.resize(1);

+ }

+ else {

+ // Else, this applies to anything.

+ m_aheadSet.resize(1);

+ }

+ return true;

+UScriptCode ScriptRunIterator::resolveCurrentScript() const // Returns the script to report for the current run.

+ UScriptCode result = m_currentSet.at(0); // The first entry of the current set is its priority script.

+ return result == USCRIPT_COMMON ? m_commonPreferred : result; // A common run reports the recorded preferred script instead, which is itself USCRIPT_COMMON if none was recorded.

+} // namespace blink