Source/platform/fonts/ScriptRunIteratorTest.cpp - Issue 1323513006: Upstream ScriptRunIterator for segmenting text runs by script

Unified Diff: Source/platform/fonts/ScriptRunIteratorTest.cpp

Issue 1323513006: Upstream ScriptRunIterator for segmenting text runs by script (Closed) Base URL: svn://svn.chromium.org/blink/trunk

Patch Set: Additional review comments addressed, new linkage attempt for kMaxScripts constant Created 5 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: Source/platform/fonts/ScriptRunIteratorTest.cpp

diff --git a/Source/platform/fonts/ScriptRunIteratorTest.cpp b/Source/platform/fonts/ScriptRunIteratorTest.cpp

new file mode 100644

index 0000000000000000000000000000000000000000..624ad0b5809c23868fa36ee9cdb2c9800e5627d7

--- /dev/null

+++ b/Source/platform/fonts/ScriptRunIteratorTest.cpp

@@ -0,0 +1,786 @@

+// Use of this source code is governed by a BSD-style license that can be

+// found in the LICENSE file.

+#include "config.h"

+#include "platform/fonts/ScriptRunIterator.h"

+#include "platform/Logging.h"

+#include "wtf/Assertions.h"

+#include "wtf/Threading.h"

+#include "wtf/text/WTFString.h"

+#include <gtest/gtest.h>

+#include <string>

+#include <vector>

+namespace blink {

+struct TestRun { // One input segment of a test: a piece of text plus the script its run is expected to get.
+ std::string text; // CheckRuns decodes this as UTF-8; CheckMockRuns passes it to MockScriptData::ToTestString.
+ UScriptCode code; // Script expected for the run that ends where this segment ends.
+};

+struct ExpectedRun { // Expected iterator output for one run, as compared in VerifyRuns.
+ unsigned limit; // Length of the accumulated test text after this run's segment was appended.
+ UScriptCode code; // Script the iterator is expected to report for the run.
+ ExpectedRun(unsigned the_limit, UScriptCode the_code) // Stores both values unchanged.
+ : limit(the_limit)
+ , code(the_code)
+ {
+ }
+};

+class MockScriptData : public ScriptData {

+public:

+ ~MockScriptData() override {}
+ static const MockScriptData* instance() // Returns the single shared mock object (passed to ScriptRunIterator by CheckMockRuns).
+ {
+ AtomicallyInitializedStaticReference(const MockScriptData, mockScriptData, (new MockScriptData())); // created once through the WTF macro; nothing in this file deletes it
+ return &mockScriptData;
+ }

+ void getScripts(UChar32 ch, Vector<UScriptCode>& dst) const override // Replaces dst with the script list encoded in mock character ch.
+ {
+ ASSERT(ch >= kMockCharMin);
+ ASSERT(ch < kMockCharLimit);
+ int code = ch - kMockCharMin; // all mock properties live in the offset from kMockCharMin
+ dst.clear();
+ switch (code & kCodeSpecialMask) { // a common/inherited marker, when present, becomes the first entry
+ case kCodeSpecialCommon:
+ dst.append(USCRIPT_COMMON);
+ break;
+ case kCodeSpecialInherited:
+ dst.append(USCRIPT_INHERITED);
+ break;
+ default:
+ break;
+ }
+ int listBits = kTable[code & kCodeListIndexMask]; // packed list: up to three 2-bit script ids
+ if (dst.isEmpty() && listBits == 0) { // neither a special marker nor any listed script
+ dst.append(USCRIPT_UNKNOWN);
+ return;
+ }
+ while (listBits) { // unpack the lowest 2-bit slot first, so list order is preserved
+ switch (listBits & kListMask) {
+ case 0:
+ break;
+ case kLatin:
+ dst.append(USCRIPT_LATIN);
+ break;
+ case kHan:
+ dst.append(USCRIPT_HAN);
+ break;
+ case kGreek:
+ dst.append(USCRIPT_GREEK);
+ break;
+ }
+ listBits >>= kListShift;
+ }

+ UChar32 getPairedBracket(UChar32 ch) const override // Maps a mock bracket to its partner; any other character maps to itself.
+ {
+ switch (getPairedBracketType(ch)) {
+ case PairedBracketType::BracketTypeClose:
+ return ch - kBracketDelta; // kBracketDelta equals the close bit, so this yields the open form
+ case PairedBracketType::BracketTypeOpen:
+ return ch + kBracketDelta; // adding the close bit yields the close form
+ default:
+ return ch;
+ }

+ PairedBracketType getPairedBracketType(UChar32 ch) const override // Classifies mock character ch as open bracket, close bracket or neither.
+ {
+ ASSERT(ch >= kMockCharMin && ch < kMockCharLimit);
+ int code = ch - kMockCharMin;
+ if ((code & kCodeBracketBit) == 0) { // bracket bit clear: not a bracket, whatever the other bits say
+ return PairedBracketType::BracketTypeNone;
+ }
+ if (code & kCodeBracketCloseBit) {
+ return PairedBracketType::BracketTypeClose;
+ }
+ return PairedBracketType::BracketTypeOpen;
+ }

+ static int TableLookup(int value) // Returns the kTable index whose entry equals the packed script list `value`.
+ {
+ for (int i = 0; i < 16; ++i) { // 16 is the declared size of kTable
+ if (kTable[i] == value) {
+ return i;
+ }
+ WTF_LOG_ERROR("Table does not contain value 0x%x", value); // reported when the scan finds no matching entry
+ return 0; // fallback: index 0, whose entry is the empty list
+ }

+ static String ToTestString(const std::string& input) // Translates the compact test notation into a 16-bit String of mock code points.
+ {
+ String result(String::make16BitFrom8BitSource(0, 0));
+ bool inSet = false; // true between '<' and '>'
+ int seen = 0; // kSaw* flags for the set currently being parsed
+ int code = 0; // offset from kMockCharMin of the character being built
+ int list = 0; // packed script list of the current set, first script in the lowest slot
+ int currentShift = 0; // number of scripts already stored in `list`
+ for (char c : input) {
+ if (inSet) {
+ switch (c) {
+ case '(':
+ ASSERT(seen == 0); // a bracket must be the first item of a set
+ seen |= kSawBracket;
+ code |= kCodeBracketBit;
+ break;
+ case '[':
+ ASSERT(seen == 0);
+ seen |= kSawBracket;
+ code |= kCodeBracketBit | kCodeSquareBracketBit;
+ break;
+ case ')':
+ ASSERT(seen == 0);
+ seen |= kSawBracket;
+ code |= kCodeBracketBit | kCodeBracketCloseBit;
+ break;
+ case ']':
+ ASSERT(seen == 0);
+ seen |= kSawBracket;
+ code |= kCodeBracketBit | kCodeSquareBracketBit | kCodeBracketCloseBit;
+ break;
+ case 'i':
+ ASSERT(seen == 0); // must come first in the set: brackets can't be inherited
+ seen |= kSawSpecial;
+ code |= kCodeSpecialInherited;
+ break;
+ case 'c':
+ ASSERT((seen & ~kSawBracket) == 0); // may follow a bracket, but nothing else
+ seen |= kSawSpecial;
+ code |= kCodeSpecialCommon;
+ break;
+ case 'l':
+ ASSERT((seen & kSawLatin) == 0); // each script may appear at most once per set
+ ASSERT(currentShift < 3); // at most three scripts fit in the packed list
+ seen |= kSawLatin;
+ list |= kLatin << (2 * currentShift++);
+ break;
+ case 'h':
+ ASSERT((seen & kSawHan) == 0);
+ ASSERT(currentShift < 3);
+ seen |= kSawHan;
+ list |= kHan << (2 * currentShift++);
+ break;
+ case 'g':
+ ASSERT((seen & kSawGreek) == 0);
+ ASSERT(currentShift < 3);
+ seen |= kSawGreek;
+ list |= kGreek << (2 * currentShift++);
+ break;
+ case '>':
+ ASSERT(seen != 0); // an empty set "<>" is not allowed
+ code |= TableLookup(list); // low four bits of the code are the kTable index of the list
+ result.append(static_cast<UChar>(kMockCharMin + code));
+ inSet = false;
+ break;
+ default:
+ WTF_LOG_ERROR("Illegal mock string set char: '%c'", c);
+ break;
+ }
+ continue;
+ }
+ // not in set: every accepted character below except '<' yields exactly one mock character
+ switch (c) {
+ case '<':
+ seen = 0;
+ code = 0;
+ list = 0;
+ currentShift = 0;
+ inSet = true;
+ break;
+ case '(':
+ code = kCodeBracketBit | kCodeSpecialCommon; // brackets written outside a set are common-script brackets
+ break;
+ case '[':
+ code = kCodeBracketBit | kCodeSquareBracketBit | kCodeSpecialCommon;
+ break;
+ case ')':
+ code = kCodeBracketBit | kCodeBracketCloseBit | kCodeSpecialCommon;
+ break;
+ case ']':
+ code = kCodeBracketBit | kCodeSquareBracketBit | kCodeBracketCloseBit | kCodeSpecialCommon;
+ break;
+ case 'i':
+ code = kCodeSpecialInherited;
+ break;
+ case 'c':
+ code = kCodeSpecialCommon;
+ break;
+ case 'l':
+ code = kLatin; // usable directly as the table index: kTable[1..3] hold kLatin, kHan, kGreek
+ break;
+ case 'h':
+ code = kHan;
+ break;
+ case 'g':
+ code = kGreek;
+ break;
+ case '?':
+ code = 0; // unknown: no special marker and an empty script list
+ break;
+ default:
+ WTF_LOG_ERROR("Illegal mock string set char: '%c'", c);
+ }
+ if (!inSet) { // skipped only for '<', which has just opened a set
+ result.append(static_cast<UChar>(kMockCharMin + code));
+ }
+ return result;
+ }

+    // Renders one mock character in the notation that ToTestString accepts:
+    // a single letter or bracket where that is unambiguous, otherwise a
+    // "<...>" set holding the bracket, the special marker and the scripts in
+    // list order. A character with no special marker, no bracket bit and an
+    // empty script list yields an empty string.
+    static std::string MockCharString(UChar mockch)
+    {
+        ASSERT(mockch >= kMockCharMin && mockch < kMockCharLimit);
+        int code = mockch - kMockCharMin;
+
+        // We use set notation in these cases:
+        // - more than one of special, kLatin, kHan, kGreek
+        // - bracket and not common (since non-set brackets are common)
+        //
+        // All property bits are read from the offset `code`, never from the
+        // raw character, so the result does not rely on kMockCharMin having
+        // zero low bits.
+        bool isBracket = (code & kCodeBracketBit) != 0;
+        bool isSpecial = (code & kCodeSpecialMask) != 0;
+        bool isCommon = (code & kCodeSpecialMask) == kCodeSpecialCommon;
+
+        // Single-character form of the bracket or special marker. It is only
+        // read when isBracket or isSpecial is set, but is initialized so that
+        // it never holds an indeterminate value.
+        char c = 0;
+        if (isBracket) {
+            if (code & kCodeSquareBracketBit) {
+                if (code & kCodeBracketCloseBit) {
+                    c = ']';
+                } else {
+                    c = '[';
+                }
+            } else {
+                if (code & kCodeBracketCloseBit) {
+                    c = ')';
+                } else {
+                    c = '(';
+                }
+            }
+        } else if (isSpecial) {
+            c = isCommon ? 'c' : 'i';
+        }
+
+        // Unpack the script list, lowest 2-bit slot first.
+        std::string result;
+        int listBits = kTable[code & kCodeListIndexMask];
+        while (listBits) {
+            switch (listBits & kListMask) {
+            case 0:
+                break;
+            case kLatin:
+                result += 'l';
+                break;
+            case kHan:
+                result += 'h';
+                break;
+            case kGreek:
+                result += 'g';
+                break;
+            }
+            listBits >>= kListShift;
+        }
+
+        bool needSet = result.length() + (isSpecial ? 1 : 0) > 1 || (isBracket && (result.length() > 0 || !isCommon));
+        if (needSet) {
+            std::string setResult("<");
+            if (isBracket) {
+                setResult += c;
+            }
+            if (isSpecial) {
+                if (isCommon) {
+                    setResult += "c";
+                } else {
+                    setResult += "i";
+                }
+            }
+            setResult += result;
+            setResult += ">";
+            return setResult;
+        }
+
+        // No set needed: a lone bracket or special marker replaces the (empty)
+        // script string; otherwise the single script letter is returned.
+        if (isBracket || isSpecial) {
+            result = c;
+        }
+        return result;
+    }

+ // We determine properties based on the offset from kMockCharMin:
+ // bits 0-3 represent the list of l, h, g scripts (index into kTable)
+ // bits 4-5 mean: 0 plain, 1 common, 2 inherited, 3 illegal
+ // bit 6 clear means open bracket, set means close bracket
+ // bit 7 clear means non-bracket, set means bracket
+ // bit 8 clear means paren, set means square bracket
+ // if it's a bracket, the matching bracket is 64 (kBracketDelta) code points away
+ static const UChar32 kMockCharMin = 0xe000; // first mock code point
+ static const UChar32 kMockCharLimit = kMockCharMin + 0x200; // one past the last: nine property bits
+ static const int kLatin = 1; // 2-bit script ids stored in packed lists; 0 means "no script"
+ static const int kHan = 2;
+ static const int kGreek = 3;
+ static const int kCodeListIndexMask = 0xf;
+ static const int kCodeSpecialMask = 0x30;
+ static const int kCodeSpecialCommon = 0x10;
+ static const int kCodeSpecialInherited = 0x20;
+ static const int kCodeBracketCloseBit = 0x40;
+ static const int kCodeBracketBit = 0x80;
+ static const int kCodeSquareBracketBit = 0x100;
+ static const int kListShift = 2; // width of one slot in a packed script list
+ static const int kListMask = 0x3; // mask for one slot
+ static const int kBracketDelta = kCodeBracketCloseBit; // distance between an open bracket and its close form
+ static const int kTable[16]; // packed script lists, indexed by bits 0-3 of the code
+ static const int kSawBracket = 0x1; // flags used by ToTestString while parsing a "<...>" set
+ static const int kSawSpecial = 0x2;
+ static const int kSawLatin = 0x4;
+ static const int kSawHan = 0x8;
+ static const int kSawGreek = 0x10;

+};

+static const int kLatin2 = MockScriptData::kLatin << 2; // script id placed in the second 2-bit slot
+static const int kHan2 = MockScriptData::kHan << 2;
+static const int kGreek2 = MockScriptData::kGreek << 2;
+static const int kLatin3 = MockScriptData::kLatin << 4; // script id placed in the third 2-bit slot
+static const int kHan3 = MockScriptData::kHan << 4;
+static const int kGreek3 = MockScriptData::kGreek << 4;
+const int MockScriptData::kTable[] = { // all 16 ordered lists of distinct scripts; the lowest slot is the first script
+ 0, kLatin, kHan, kGreek, // empty list, then the three single-script lists (entry value == index)
+ kLatin2 + kHan, kLatin2 + kGreek, // two-script lists, e.g. kLatin2 + kHan is Han followed by Latin
+ kHan2 + kLatin, kHan2 + kGreek,
+ kGreek2 + kLatin, kGreek2 + kHan,
+ kLatin3 + kHan2 + kGreek, kLatin3 + kGreek2 + kHan, // three-script lists
+ kHan3 + kLatin2 + kGreek, kHan3 + kGreek2 + kLatin,
+ kGreek3 + kLatin2 + kHan, kGreek3 + kHan2 + kLatin,
+};

+// Fixture with helpers that build a text from a list of TestRun segments, run
+// a ScriptRunIterator over it and compare the runs it reports with the
+// segment boundaries and scripts given in the list.
+class ScriptRunIteratorTest : public testing::Test {
+protected:
+    // Checks segmentation of real text. Each segment's text is UTF-8; the
+    // iterator is constructed without an explicit ScriptData argument.
+    void CheckRuns(const std::vector<TestRun>& runs)
+    {
+        String text(String::make16BitFrom8BitSource(0, 0));
+        std::vector<ExpectedRun> expect;
+        for (const TestRun& run : runs) {
+            text.append(String::fromUTF8(run.text.c_str()));
+            // The expected limit is the cumulative length up to and including
+            // this segment.
+            expect.push_back(ExpectedRun(text.length(), run.code));
+        }
+        ScriptRunIterator scriptRunIterator(text.characters16(), text.length());
+        VerifyRuns(&scriptRunIterator, expect);
+    }
+
+    // FIXME crbug.com/527329 - CheckMockRuns should be replaced by finding
+    // suitable equivalent real codepoint sequences instead.
+    //
+    // Checks segmentation of mock text. Each segment's text is written in the
+    // notation understood by MockScriptData::ToTestString, and the iterator
+    // takes its script data from MockScriptData.
+    void CheckMockRuns(const std::vector<TestRun>& runs)
+    {
+        String text(String::make16BitFrom8BitSource(0, 0));
+        std::vector<ExpectedRun> expect;
+        for (const TestRun& run : runs) {
+            text.append(MockScriptData::ToTestString(run.text));
+            expect.push_back(ExpectedRun(text.length(), run.code));
+        }
+        ScriptRunIterator scriptRunIterator(text.characters16(), text.length(),
+            MockScriptData::instance());
+        VerifyRuns(&scriptRunIterator, expect);
+    }
+
+    // Drains the iterator and asserts that it yields exactly the runs in
+    // `expect`, in order. The ASSERT_* macros return from this helper on the
+    // first mismatch.
+    void VerifyRuns(ScriptRunIterator* scriptRunIterator,
+        const std::vector<ExpectedRun>& expect)
+    {
+        unsigned limit = 0;
+        UScriptCode code = USCRIPT_INVALID_CODE;
+        unsigned long runCount = 0;
+        while (scriptRunIterator->consume(limit, code)) {
+            ASSERT_LT(runCount, expect.size());
+            ASSERT_EQ(expect[runCount].limit, limit);
+            ASSERT_EQ(expect[runCount].code, code);
+            ++runCount;
+        }
+        // Log only when the counts differ, so passing tests do not write an
+        // error line to the log.
+        if (expect.size() != runCount) {
+            WTF_LOG_ERROR("Expected %zu runs, got %lu ", expect.size(), runCount);
+        }
+        ASSERT_EQ(expect.size(), runCount);
+    }
+};

+// An empty text produces no runs, and a failed consume() leaves both output
+// arguments untouched.
+TEST_F(ScriptRunIteratorTest, Empty)
+{
+    String empty(String::make16BitFrom8BitSource(0, 0));
+    ScriptRunIterator scriptRunIterator(empty.characters16(), empty.length());
+    unsigned limit = 0;
+    UScriptCode code = USCRIPT_INVALID_CODE;
+    // Use the gtest assertion rather than WTF's ASSERT: ASSERT is compiled out
+    // when assertions are disabled, which would drop the consume() call and
+    // the check along with it.
+    ASSERT_FALSE(scriptRunIterator.consume(limit, code));
+    ASSERT_EQ(limit, 0u);
+    ASSERT_EQ(code, USCRIPT_INVALID_CODE);
+}

+// Some of our compilers cannot initialize a vector from a braced list yet, so the runs go through a static array first.
+#define DECLARE_RUNSVECTOR(...) \
+ static const TestRun runsArray[] = __VA_ARGS__; \
+ std::vector<TestRun> runs(runsArray, runsArray + sizeof(runsArray) / sizeof(*runsArray)); // declares `runsArray` and `runs` in the enclosing scope
+#define CHECK_RUNS(...) \
+ DECLARE_RUNSVECTOR(__VA_ARGS__); \
+ CheckRuns(runs); // expands to several statements and fixed names: use at most once per scope
+#define CHECK_MOCK_RUNS(...) \
+ DECLARE_RUNSVECTOR(__VA_ARGS__); \
+ CheckMockRuns(runs); // same as CHECK_RUNS, but the text is in MockScriptData notation

+TEST_F(ScriptRunIteratorTest, Whitespace) // spaces and a tab alone: one COMMON run
+ CHECK_RUNS({ { " \t ", USCRIPT_COMMON } });
+TEST_F(ScriptRunIteratorTest, Common) // spaces and punctuation only: one COMMON run
+ CHECK_RUNS({ { " ... !?", USCRIPT_COMMON } });
+TEST_F(ScriptRunIteratorTest, Latin) // a single-script word: one run in that script
+ CHECK_RUNS({ { "latin", USCRIPT_LATIN } });
+TEST_F(ScriptRunIteratorTest, Chinese)
+ CHECK_RUNS({ { "萬國碼", USCRIPT_HAN } });

+// A close bracket without a matching open bracket is ignored.
+TEST_F(ScriptRunIteratorTest, UnbalancedParens1)
+ CHECK_RUNS({ { "(萬", USCRIPT_HAN },
+ { "a]", USCRIPT_LATIN },
+ { ")", USCRIPT_HAN } });
+// An open bracket without a matching close is popped when it sits inside a
+// matched pair, so it doesn't match a later close bracket.
+TEST_F(ScriptRunIteratorTest, UnbalancedParens2)
+ CHECK_RUNS({ { "(萬", USCRIPT_HAN },
+ { "a[", USCRIPT_LATIN },
+ { ")]", USCRIPT_HAN } });
+// The space goes with the leading script.
+TEST_F(ScriptRunIteratorTest, LatinHan)
+ CHECK_RUNS({ { "Unicode ", USCRIPT_LATIN },
+ { "萬國碼", USCRIPT_HAN } });
+// The space goes with the leading script.
+TEST_F(ScriptRunIteratorTest, HanLatin)
+ CHECK_RUNS({ { "萬國碼 ", USCRIPT_HAN },
+ { "Unicode", USCRIPT_LATIN } });
+TEST_F(ScriptRunIteratorTest, ParenEmptyParen)
+ CHECK_RUNS({ { "()", USCRIPT_COMMON } });
+TEST_F(ScriptRunIteratorTest, ParenChineseParen)
+ CHECK_RUNS({ { "(萬國碼)", USCRIPT_HAN } });
+TEST_F(ScriptRunIteratorTest, ParenLatinParen)
+ CHECK_RUNS({ { "(Unicode)", USCRIPT_LATIN } });
+// The open paren gets the leading script; its matching close paren gets the same script.
+TEST_F(ScriptRunIteratorTest, LatinParenChineseParen)
+ CHECK_RUNS({ { "Unicode (", USCRIPT_LATIN },
+ { "萬國碼", USCRIPT_HAN },
+ { ")", USCRIPT_LATIN } });
+// The open paren gets the first trailing script if there is no leading script.
+TEST_F(ScriptRunIteratorTest, ParenChineseParenLatin)
+ CHECK_RUNS({ { "(萬國碼) ", USCRIPT_HAN },
+ { "Unicode", USCRIPT_LATIN } });
+// Leading common characters and the open paren get the first trailing script.
+// TODO(dougfelt): we don't do quote matching, but probably should figure out
+// something better than doing nothing.
+TEST_F(ScriptRunIteratorTest, QuoteParenChineseParenLatinQuote)
+ CHECK_RUNS({ { "\"(萬國碼) ", USCRIPT_HAN },
+ { "Unicode\"", USCRIPT_LATIN } });
+// An unmatched close bracket gets the leading context.
+TEST_F(ScriptRunIteratorTest, UnmatchedClose)
+ CHECK_RUNS({ { "Unicode (", USCRIPT_LATIN },
+ { "萬國碼] ", USCRIPT_HAN },
+ { ") Unicode\"", USCRIPT_LATIN } });

+// Match up to 32 bracket pairs: the '[' plus the open parens nested inside it.
+TEST_F(ScriptRunIteratorTest, Match32Brackets)
+ CHECK_RUNS({ { "[萬國碼 ", USCRIPT_HAN },
+ { "Unicode (((((((((((((((((((((((((((((((!"
+ ")))))))))))))))))))))))))))))))",
+ USCRIPT_LATIN },
+ { "]", USCRIPT_HAN } });
+// Matches the 32 most recent bracket pairs. Beyond that, we revert to the
+// surrounding script.
+TEST_F(ScriptRunIteratorTest, Match32MostRecentBrackets)
+ CHECK_RUNS({ { "((([萬國碼 ", USCRIPT_HAN },
+ { "Unicode (((((((((((((((((((((((((((((((", USCRIPT_LATIN },
+ { "萬國碼!", USCRIPT_HAN },
+ { ")))))))))))))))))))))))))))))))", USCRIPT_LATIN },
+ { "]", USCRIPT_HAN },
+ { "But )))", USCRIPT_LATIN } });

+// A char with multiple scripts that match both leading and trailing context
+// gets the leading context.
+TEST_F(ScriptRunIteratorTest, ExtensionsPreferLeadingContext)
+ CHECK_MOCK_RUNS({ { "h<lh>", USCRIPT_HAN },
+ { "l", USCRIPT_LATIN } });
+// A char with multiple scripts that only match trailing context gets the
+// trailing context.
+TEST_F(ScriptRunIteratorTest, ExtensionsMatchTrailingContext)
+ CHECK_MOCK_RUNS({ { "h", USCRIPT_HAN },
+ { "<gl>l", USCRIPT_LATIN } });
+// Retain the first established priority script. <lhg><gh> produces the script
+// set <gh> with g as priority: of the two priority scripts l and g, only g
+// remains. Then <gh><hgl> retains g as priority: of the two priority scripts
+// g and h that remain, g was encountered first.
+TEST_F(ScriptRunIteratorTest, ExtensionsRetainFirstPriorityScript)
+ CHECK_MOCK_RUNS({ { "<lhg><gh><hgl>", USCRIPT_GREEK } });
+// Parens can have scripts that break script runs.
+TEST_F(ScriptRunIteratorTest, ExtensionsParens)
+ CHECK_MOCK_RUNS({ { "<gl><(lg>", USCRIPT_GREEK },
+ { "h<[hl>", USCRIPT_HAN },
+ { "l", USCRIPT_LATIN },
+ { "<]hl>", USCRIPT_HAN },
+ { "<)lg>", USCRIPT_GREEK } });
+// The close paren might be encountered before we've established the open
+// paren's script, but when this is the case the current set is still valid, so
+// this neither affects it nor breaks the run.
+TEST_F(ScriptRunIteratorTest, ExtensionsParens2)
+ CHECK_MOCK_RUNS({ { "<(lhg><gh><)lhg>", USCRIPT_GREEK } });
+// A common-script character with a single extension should be treated as
+// common, but with the extended script as a default. If we encounter anything
+// other than common, that takes priority. If we encounter other common-script
+// characters with a single extension, the current priority remains.
+TEST_F(ScriptRunIteratorTest, CommonWithPriority)
+ CHECK_MOCK_RUNS({ { "<ch>", USCRIPT_HAN } });
+TEST_F(ScriptRunIteratorTest, CommonWithPriority2)
+ CHECK_MOCK_RUNS({ { "<ch><lh>", USCRIPT_LATIN } });
+TEST_F(ScriptRunIteratorTest, CommonWithPriority3)
+ CHECK_MOCK_RUNS({ { "<ch><cl><cg>", USCRIPT_HAN } });

+// UDatta (\xE0\xA5\x91) is inherited with LATIN and DEVANAGARI extensions.
+// Since it has LATIN, and the dotted circle (\xE2\x97\x8C) is COMMON and has
+// adopted the preceding LATIN, it gets the LATIN. This is standard.
+TEST_F(ScriptRunIteratorTest, LatinDottedCircleUdatta)
+ CHECK_RUNS({ { "Latin \xE2\x97\x8C\xE0\xA5\x91", USCRIPT_LATIN } });
+// In this situation, UDatta (\xE0\xA5\x91) doesn't share a script with the
+// value inherited by the dotted circle (\xE2\x97\x8C). It captures the
+// preceding dotted circle and breaks it from the run it would normally have
+// been in.
+TEST_F(ScriptRunIteratorTest, HanDottedCircleUdatta)
+ CHECK_RUNS({ { "萬國碼 ", USCRIPT_HAN },
+ { "\xE2\x97\x8C\xE0\xA5\x91", USCRIPT_DEVANAGARI } });
+// Tatweel is \xD9\x80 Lm, Fathatan is \xD9\x8B Mn. The script of Tatweel is
+// common, that of Fathatan is inherited. The script extensions for Fathatan
+// are Arabic and Syriac. The Syriac script is 34 in ICU, Arabic is 2. So the
+// preferred script for Fathatan is Arabic, according to Behdad's
+// heuristic. This is exactly analogous to the Udatta tests above, except
+// Tatweel is Lm. But we don't take properties into account, only scripts.
+TEST_F(ScriptRunIteratorTest, LatinTatweelFathatan)
+ CHECK_RUNS({ { "Latin ", USCRIPT_LATIN },
+ { "\xD9\x80\xD9\x8B", USCRIPT_ARABIC } });
+// Another case where if the mark accepts a script that was inherited by the
+// preceding common-script character, they both continue in that script.
+// SYRIAC LETTER NUN \xDC\xA2
+// ARABIC TATWEEL \xD9\x80
+// ARABIC FATHATAN \xD9\x8B
+TEST_F(ScriptRunIteratorTest, SyriacTatweelFathatan)
+ CHECK_RUNS({ { "\xDC\xA2\xD9\x80\xD9\x8B", USCRIPT_SYRIAC } });
+// The Udatta (\xE0\xA5\x91) is inherited, so will share runs with anything that
+// is not common.
+TEST_F(ScriptRunIteratorTest, HanUdatta)
+ CHECK_RUNS({ { "萬國碼\xE0\xA5\x91", USCRIPT_HAN } });
+// The Udatta (\xE0\xA5\x91) is inherited, and will capture the space and turn
+// it into Devanagari.
+TEST_F(ScriptRunIteratorTest, HanSpaceUdatta)
+ CHECK_RUNS({ { "萬國碼", USCRIPT_HAN },
+ { " \xE0\xA5\x91", USCRIPT_DEVANAGARI } });

+// Make sure the mock code works too.
+TEST_F(ScriptRunIteratorTest, MockHanInheritedGL)
+ CHECK_MOCK_RUNS({ { "h<igl>", USCRIPT_HAN } });
+TEST_F(ScriptRunIteratorTest, MockHanCommonInheritedGL)
+ CHECK_MOCK_RUNS({ { "h", USCRIPT_HAN },
+ { "c<igl>", USCRIPT_GREEK } });
+// A leading inherited character acts just like common, except there's no preferred script.
+TEST_F(ScriptRunIteratorTest, MockLeadingInherited)
+ CHECK_MOCK_RUNS({ { "<igl>", USCRIPT_COMMON } });
+// Same with two leading inherited characters: still no preferred script.
+TEST_F(ScriptRunIteratorTest, MockLeadingInherited2)
+ CHECK_MOCK_RUNS({ { "<igl><ih>", USCRIPT_COMMON } });
+TEST_F(ScriptRunIteratorTest, LeadingInheritedHan)
+ // DEVANAGARI STRESS SIGN UDATTA \xE0\xA5\x91
+ CHECK_RUNS({ { "\xE0\xA5\x91萬國碼", USCRIPT_HAN } });
+TEST_F(ScriptRunIteratorTest, LeadingInheritedHan2)
+ // DEVANAGARI STRESS SIGN UDATTA \xE0\xA5\x91
+ // ARABIC FATHATAN \xD9\x8B
+ CHECK_RUNS({ { "\xE0\xA5\x91\xD9\x8B萬國碼", USCRIPT_HAN } });
+TEST_F(ScriptRunIteratorTest, OddLatinString)
+ CHECK_RUNS({ { "ç̈", USCRIPT_LATIN } });

+// Fixture that scans ICU's script data once per test and remembers the code
+// point with the largest script-extension count.
+class ScriptRunIteratorICUDataTest : public testing::Test {
+public:
+    ScriptRunIteratorICUDataTest()
+        : m_maxExtensions(0)
+        , m_maxExtensionsCodepoint(0xffff)
+    {
+        int maxExtensions = 0;
+        UChar32 maxExtensionsCodepoint = 0;
+        // Scan the whole Unicode code space, U+0000 through U+10FFFF. The
+        // limit was previously 0x11000, which left everything from U+11000
+        // upwards unexamined.
+        for (UChar32 cp = 0; cp < 0x110000; ++cp) {
+            UErrorCode status = U_ZERO_ERROR;
+            // Called with no output buffer: only the returned count is used,
+            // and `status` is deliberately not inspected.
+            int count = uscript_getScriptExtensions(cp, 0, 0, &status);
+            if (count > maxExtensions) {
+                maxExtensions = count;
+                maxExtensionsCodepoint = cp;
+            }
+        }
+        m_maxExtensions = maxExtensions;
+        m_maxExtensionsCodepoint = maxExtensionsCodepoint;
+    }
+
+protected:
+    // Returns the first code point found with the largest extension count.
+    // If numExtensions is non-null, that count is stored through it.
+    UChar32 GetACharWithMaxExtensions(int* numExtensions)
+    {
+        if (numExtensions) {
+            *numExtensions = m_maxExtensions;
+        }
+        return m_maxExtensionsCodepoint;
+    }
+
+private:
+    int m_maxExtensions;
+    UChar32 m_maxExtensionsCodepoint;
+};

+// Validate that ICU never returns more than our maximum expected number of
+// script extensions (ScriptData::kMaxScriptCount).
+TEST_F(ScriptRunIteratorICUDataTest, ValidateICUMaxScriptExtensions)
+ int maxExtensions;
+ UChar32 cp = GetACharWithMaxExtensions(&maxExtensions); // largest count found by the fixture's constructor scan
+ ASSERT_LE(maxExtensions, ScriptData::kMaxScriptCount)
+ << "char " << std::hex << cp << std::dec; // names the code point in the failure message

+// Check that ICUScriptData returns all of a character's scripts.
+// This only checks one likely character (the one with the most extensions), not all cases.
+TEST_F(ScriptRunIteratorICUDataTest, ICUDataGetScriptsReturnsAllExtensions)
+ int maxExtensions;
+ UChar32 cp = GetACharWithMaxExtensions(&maxExtensions);
+ Vector<UScriptCode> extensions;
+ ICUScriptData::instance()->getScripts(cp, extensions);
+ // It's possible that getScripts adds the primary script to the list of
+ // extensions, resulting in one more script than the raw extension count.
+ ASSERT_GE(static_cast<int>(extensions.size()), maxExtensions) // hence >= rather than ==
+ << "char " << std::hex << cp << std::dec;

+TEST_F(ScriptRunIteratorICUDataTest, CommonHaveNoMoreThanOneExtension) // checks every code point whose first reported script is COMMON
+ Vector<UScriptCode> extensions;
+ for (UChar32 cp = 0; cp < 0x110000; ++cp) { // the whole Unicode code space
+ ICUScriptData::instance()->getScripts(cp, extensions);
+ UScriptCode primary = extensions.at(0); // assumes getScripts always yields at least one entry - TODO confirm
+ if (primary == USCRIPT_COMMON) {
+ ASSERT_LE(extensions.size(), 2ul) // COMMON itself plus at most one extension
+ << "cp: " << std::hex << cp << std::dec;
+ }

+// ZWJ is \u200D Cf (Format, other) and its script is inherited. I'm going to

+// ignore this for now, as I think it shouldn't matter which run it ends up

+// in. HarfBuzz needs to be able to use it as context and shape each

+// neighboring character appropriately no matter what run it got assigned to.

+} // namespace blink

« no previous file with comments | « Source/platform/fonts/ScriptRunIterator.cpp ('k') | no next file » | no next file with comments »