Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(202)

Unified Diff: Source/platform/fonts/ScriptRunIteratorTest.cpp

Issue 1323513006: Upstream ScriptRunIterator for segmenting text runs by script (Closed) Base URL: svn://svn.chromium.org/blink/trunk
Patch Set: Additional review comments addressed, new linkage attempt for kMaxScripts constant Created 5 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « Source/platform/fonts/ScriptRunIterator.cpp ('k') | no next file » | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: Source/platform/fonts/ScriptRunIteratorTest.cpp
diff --git a/Source/platform/fonts/ScriptRunIteratorTest.cpp b/Source/platform/fonts/ScriptRunIteratorTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..624ad0b5809c23868fa36ee9cdb2c9800e5627d7
--- /dev/null
+++ b/Source/platform/fonts/ScriptRunIteratorTest.cpp
@@ -0,0 +1,786 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "config.h"
+#include "platform/fonts/ScriptRunIterator.h"
+
+#include "platform/Logging.h"
+#include "wtf/Assertions.h"
+#include "wtf/Threading.h"
+#include "wtf/text/WTFString.h"
+
+#include <gtest/gtest.h>
+#include <string>
+#include <vector>
+
+namespace blink {
+
+struct TestRun {
+ std::string text; // UTF-8 text for CheckRuns, mock notation for CheckMockRuns
+ UScriptCode code; // script expected for the run that covers this text
+};
+
+struct ExpectedRun {
+ unsigned limit; // length of the whole test text through the end of this run
+ UScriptCode code; // script the iterator must report for this run
+
+ ExpectedRun(unsigned the_limit, UScriptCode the_code)
+ : limit(the_limit)
+ , code(the_code)
+ {
+ }
+};
+
+class MockScriptData : public ScriptData {
+public:
+ ~MockScriptData() override {} // empty: this class declares only static constants, no per-instance data
+
+ static const MockScriptData* instance() // accessor for the shared mock passed to ScriptRunIterator by CheckMockRuns
+ {
+ AtomicallyInitializedStaticReference(const MockScriptData, mockScriptData, (new MockScriptData())); // presumably created once, on first use; never deleted here
+
+ return &mockScriptData;
+ }
+
+ // Fills |dst| with the scripts encoded in the mock character |ch|: first
+ // the special script (Common or Inherited) if one is set, then the
+ // entries of the script list selected by the low four bits, in list
+ // order. A character with neither yields USCRIPT_UNKNOWN.
+ void getScripts(UChar32 ch, Vector<UScriptCode>& dst) const override
+ {
+ ASSERT(ch >= kMockCharMin);
+ ASSERT(ch < kMockCharLimit);
+
+ const int offset = ch - kMockCharMin;
+ dst.clear();
+
+ const int special = offset & kCodeSpecialMask;
+ if (special == kCodeSpecialCommon) {
+ dst.append(USCRIPT_COMMON);
+ } else if (special == kCodeSpecialInherited) {
+ dst.append(USCRIPT_INHERITED);
+ }
+
+ int packedList = kTable[offset & kCodeListIndexMask];
+ if (packedList == 0 && dst.isEmpty()) {
+ dst.append(USCRIPT_UNKNOWN);
+ return;
+ }
+
+ // Each list entry occupies kListShift bits; the first entry is lowest.
+ for (; packedList; packedList >>= kListShift) {
+ const int entry = packedList & kListMask;
+ if (entry == kLatin) {
+ dst.append(USCRIPT_LATIN);
+ } else if (entry == kHan) {
+ dst.append(USCRIPT_HAN);
+ } else if (entry == kGreek) {
+ dst.append(USCRIPT_GREEK);
+ }
+ }
+ }
+
+ // Returns the mock character paired with |ch|: kBracketDelta above an
+ // open bracket, kBracketDelta below a close bracket, and |ch| itself for
+ // anything that is not a bracket.
+ UChar32 getPairedBracket(UChar32 ch) const override
+ {
+ const PairedBracketType type = getPairedBracketType(ch);
+ if (type == PairedBracketType::BracketTypeOpen) {
+ return ch + kBracketDelta;
+ }
+ if (type == PairedBracketType::BracketTypeClose) {
+ return ch - kBracketDelta;
+ }
+ return ch;
+ }
+
+ // Classifies the mock character |ch| as not a bracket, an open bracket or
+ // a close bracket, from the bracket bits of its offset from kMockCharMin.
+ PairedBracketType getPairedBracketType(UChar32 ch) const override
+ {
+ ASSERT(ch >= kMockCharMin && ch < kMockCharLimit);
+ const int offset = ch - kMockCharMin;
+ if (!(offset & kCodeBracketBit)) {
+ return PairedBracketType::BracketTypeNone;
+ }
+ return (offset & kCodeBracketCloseBit)
+ ? PairedBracketType::BracketTypeClose
+ : PairedBracketType::BracketTypeOpen;
+ }
+
+ // Returns the index of |value| in kTable. If the value is not present,
+ // logs an error and returns 0.
+ static int TableLookup(int value)
+ {
+ int index = 0;
+ while (index < 16 && kTable[index] != value) {
+ ++index;
+ }
+ if (index < 16) {
+ return index;
+ }
+ WTF_LOG_ERROR("Table does not contain value 0x%x", value);
+ return 0;
+ }
+
+ // Builds a 16-bit string of mock characters from the compact notation
+ // used by the CHECK_MOCK_RUNS tests.
+ //
+ // Outside a set, each input character produces one mock character:
+ // l, h, g - a character whose only script is Latin, Han or Greek
+ // c, i - a Common or Inherited character with an empty script list
+ // ( ) [ ] - a Common bracket of the given shape and direction
+ // ? - a character with no properties (getScripts reports Unknown)
+ // "<...>" describes a single mock character with several properties: an
+ // optional bracket first, then an optional 'c' (or a leading 'i' with no
+ // bracket), then up to three distinct letters from l/h/g whose order is
+ // kept as the order of the character's script list.
+ //
+ // Illegal characters are logged. Inside a set they are ignored; outside a
+ // set they are encoded like '?', so that they never repeat the code of
+ // the previous character.
+ static String ToTestString(const std::string& input)
+ {
+ String result(String::make16BitFrom8BitSource(0, 0));
+ bool inSet = false;
+ int seen = 0;
+ int code = 0;
+ int list = 0;
+ int currentShift = 0;
+ for (char c : input) {
+ if (inSet) {
+ switch (c) {
+ case '(':
+ ASSERT(seen == 0);
+ seen |= kSawBracket;
+ code |= kCodeBracketBit;
+ break;
+ case '[':
+ ASSERT(seen == 0);
+ seen |= kSawBracket;
+ code |= kCodeBracketBit | kCodeSquareBracketBit;
+ break;
+ case ')':
+ ASSERT(seen == 0);
+ seen |= kSawBracket;
+ code |= kCodeBracketBit | kCodeBracketCloseBit;
+ break;
+ case ']':
+ ASSERT(seen == 0);
+ seen |= kSawBracket;
+ code |= kCodeBracketBit | kCodeSquareBracketBit | kCodeBracketCloseBit;
+ break;
+ case 'i':
+ ASSERT(seen == 0); // brackets can't be inherited
+ seen |= kSawSpecial;
+ code |= kCodeSpecialInherited;
+ break;
+ case 'c':
+ ASSERT((seen & ~kSawBracket) == 0);
+ seen |= kSawSpecial;
+ code |= kCodeSpecialCommon;
+ break;
+ case 'l':
+ ASSERT((seen & kSawLatin) == 0);
+ ASSERT(currentShift < 3);
+ seen |= kSawLatin;
+ list |= kLatin << (kListShift * currentShift++);
+ break;
+ case 'h':
+ ASSERT((seen & kSawHan) == 0);
+ ASSERT(currentShift < 3);
+ seen |= kSawHan;
+ list |= kHan << (kListShift * currentShift++);
+ break;
+ case 'g':
+ ASSERT((seen & kSawGreek) == 0);
+ ASSERT(currentShift < 3);
+ seen |= kSawGreek;
+ list |= kGreek << (kListShift * currentShift++);
+ break;
+ case '>':
+ // End of set: turn the packed list into its table index and
+ // emit the single character the set describes.
+ ASSERT(seen != 0);
+ code |= TableLookup(list);
+ result.append(static_cast<UChar>(kMockCharMin + code));
+ inSet = false;
+ break;
+ default:
+ WTF_LOG_ERROR("Illegal mock string set char: '%c'", c);
+ break;
+ }
+ continue;
+ }
+ // not in set
+ switch (c) {
+ case '<':
+ seen = 0;
+ code = 0;
+ list = 0;
+ currentShift = 0;
+ inSet = true;
+ break;
+ case '(':
+ code = kCodeBracketBit | kCodeSpecialCommon;
+ break;
+ case '[':
+ code = kCodeBracketBit | kCodeSquareBracketBit | kCodeSpecialCommon;
+ break;
+ case ')':
+ code = kCodeBracketBit | kCodeBracketCloseBit | kCodeSpecialCommon;
+ break;
+ case ']':
+ code = kCodeBracketBit | kCodeSquareBracketBit | kCodeBracketCloseBit | kCodeSpecialCommon;
+ break;
+ case 'i':
+ code = kCodeSpecialInherited;
+ break;
+ case 'c':
+ code = kCodeSpecialCommon;
+ break;
+ case 'l':
+ // For single scripts the table index equals the script value.
+ code = kLatin;
+ break;
+ case 'h':
+ code = kHan;
+ break;
+ case 'g':
+ code = kGreek;
+ break;
+ case '?':
+ code = 0; // unknown
+ break;
+ default:
+ WTF_LOG_ERROR("Illegal mock string char: '%c'", c);
+ code = 0; // encode as unknown instead of reusing the previous code
+ break;
+ }
+ if (!inSet) {
+ result.append(static_cast<UChar>(kMockCharMin + code));
+ }
+ }
+ // A set left open at the end of the input would be dropped silently.
+ ASSERT(!inSet);
+ return result;
+ }
+
+ // Converts a mock character back into the notation that ToTestString()
+ // accepts: a single letter, bracket or '?' for simple characters, or a
+ // "<...>" set when the character carries several properties.
+ static std::string MockCharString(UChar mockch)
+ {
+ ASSERT(mockch >= kMockCharMin && mockch < kMockCharLimit);
+ const int code = mockch - kMockCharMin;
+
+ // All property tests use |code|, the offset from kMockCharMin, so they
+ // do not rely on the low bits of kMockCharMin being zero.
+ const bool isBracket = (code & kCodeBracketBit) != 0;
+ const bool isSpecial = (code & kCodeSpecialMask) != 0;
+ const bool isCommon = (code & kCodeSpecialMask) == kCodeSpecialCommon;
+
+ // Notation for the bracket shape and direction; used only if isBracket.
+ char bracketChar = '(';
+ if (code & kCodeSquareBracketBit) {
+ bracketChar = (code & kCodeBracketCloseBit) ? ']' : '[';
+ } else if (code & kCodeBracketCloseBit) {
+ bracketChar = ')';
+ }
+ // Notation for the special script; used only if isSpecial.
+ const char specialChar = isCommon ? 'c' : 'i';
+
+ // Spell out the script list, first entry first.
+ std::string scripts;
+ for (int listBits = kTable[code & kCodeListIndexMask]; listBits; listBits >>= kListShift) {
+ switch (listBits & kListMask) {
+ case 0:
+ break;
+ case kLatin:
+ scripts += 'l';
+ break;
+ case kHan:
+ scripts += 'h';
+ break;
+ case kGreek:
+ scripts += 'g';
+ break;
+ }
+ }
+
+ // We use set notation in these cases:
+ // - more than one of special, kLatin, kHan, kGreek
+ // - bracket and not common (since non-set brackets are common)
+ // - bracket with a script list
+ const bool needSet = scripts.length() + (isSpecial ? 1 : 0) > 1
+ || (isBracket && (scripts.length() > 0 || !isCommon));
+ if (needSet) {
+ std::string setResult("<");
+ if (isBracket) {
+ setResult += bracketChar;
+ }
+ if (isSpecial) {
+ setResult += specialChar;
+ }
+ setResult += scripts;
+ setResult += ">";
+ return setResult;
+ }
+ if (isBracket) {
+ return std::string(1, bracketChar);
+ }
+ if (isSpecial) {
+ return std::string(1, specialChar);
+ }
+ if (scripts.empty()) {
+ // No properties at all: ToTestString() writes this character as '?'.
+ return "?";
+ }
+ return scripts;
+ }
+
+ // We determine properties based on the offset from kMockCharMin:
+ // bits 0-3 represent the list of l, h, g scripts (index into kTable)
+ // bits 4-5 mean: 0 plain, 1 common, 2 inherited, 3 illegal
+ // bit 6 clear means open bracket, set means close bracket
+ // bit 7 clear means non-bracket, set means bracket
+ // bit 8 clear means paren, set means square bracket
+ // if it's a bracket, the matching bracket is 64 code points away
+ static const UChar32 kMockCharMin = 0xe000;
+ static const UChar32 kMockCharLimit = kMockCharMin + 0x200;
+ static const int kLatin = 1; // script values stored in kTable entries, two bits each
+ static const int kHan = 2;
+ static const int kGreek = 3;
+ static const int kCodeListIndexMask = 0xf;
+ static const int kCodeSpecialMask = 0x30;
+ static const int kCodeSpecialCommon = 0x10;
+ static const int kCodeSpecialInherited = 0x20;
+ static const int kCodeBracketCloseBit = 0x40;
+ static const int kCodeBracketBit = 0x80;
+ static const int kCodeSquareBracketBit = 0x100;
+ static const int kListShift = 2; // width in bits of one script entry in a kTable value
+ static const int kListMask = 0x3; // extracts the lowest script entry of a kTable value
+ static const int kBracketDelta = kCodeBracketCloseBit;
+ static const int kTable[16]; // every ordered list of up to three distinct scripts
+
+ static const int kSawBracket = 0x1; // flags used by ToTestString while parsing a <...> set
+ static const int kSawSpecial = 0x2;
+ static const int kSawLatin = 0x4;
+ static const int kSawHan = 0x8;
+ static const int kSawGreek = 0x10;
+};
+
+static const int kLatin2 = MockScriptData::kLatin << 2; // Latin as the second list entry
+static const int kHan2 = MockScriptData::kHan << 2; // Han as the second list entry
+static const int kGreek2 = MockScriptData::kGreek << 2; // Greek as the second list entry
+static const int kLatin3 = MockScriptData::kLatin << 4; // Latin as the third list entry
+static const int kHan3 = MockScriptData::kHan << 4; // Han as the third list entry
+static const int kGreek3 = MockScriptData::kGreek << 4; // Greek as the third list entry
+const int MockScriptData::kTable[] = {
+ 0, kLatin, kHan, kGreek, // empty list, then the single-script lists l, h, g
+ kLatin2 + kHan, kLatin2 + kGreek, // hl, gl (the first entry is in the low bits)
+ kHan2 + kLatin, kHan2 + kGreek, // lh, gh
+ kGreek2 + kLatin, kGreek2 + kHan, // lg, hg
+ kLatin3 + kHan2 + kGreek, kLatin3 + kGreek2 + kHan, // ghl, hgl
+ kHan3 + kLatin2 + kGreek, kHan3 + kGreek2 + kLatin, // glh, lgh
+ kGreek3 + kLatin2 + kHan, kGreek3 + kHan2 + kLatin, // hlg, lhg
+};
+
+// Fixture that feeds text to a ScriptRunIterator and compares the runs it
+// reports with expectations derived from a list of TestRun entries.
+class ScriptRunIteratorTest : public testing::Test {
+protected:
+ // Concatenates the UTF-8 text of |runs| into one 16-bit string, iterates
+ // over it with a ScriptRunIterator built without explicit script data,
+ // and verifies that each reported run ends where the corresponding
+ // TestRun ends and has that TestRun's script.
+ void CheckRuns(const std::vector<TestRun>& runs)
+ {
+ String text(String::make16BitFrom8BitSource(0, 0));
+ std::vector<ExpectedRun> expect;
+ for (const TestRun& run : runs) {
+ text.append(String::fromUTF8(run.text.c_str()));
+ expect.push_back(ExpectedRun(text.length(), run.code));
+ }
+ ScriptRunIterator scriptRunIterator(text.characters16(), text.length());
+ VerifyRuns(&scriptRunIterator, expect);
+ }
+
+ // Same as CheckRuns, except that each TestRun's text is mock notation
+ // expanded by MockScriptData::ToTestString and the iterator is given the
+ // MockScriptData instance.
+ // FIXME crbug.com/527329 - CheckMockRuns should be replaced by finding
+ // suitable equivalent real codepoint sequences instead.
+ void CheckMockRuns(const std::vector<TestRun>& runs)
+ {
+ String text(String::make16BitFrom8BitSource(0, 0));
+ std::vector<ExpectedRun> expect;
+ for (const TestRun& run : runs) {
+ text.append(MockScriptData::ToTestString(run.text));
+ expect.push_back(ExpectedRun(text.length(), run.code));
+ }
+
+ ScriptRunIterator scriptRunIterator(text.characters16(), text.length(),
+ MockScriptData::instance());
+ VerifyRuns(&scriptRunIterator, expect);
+ }
+
+ // Consumes every run from |scriptRunIterator| and asserts that the runs
+ // match |expect| in number, end offset and script. Nothing is logged on
+ // success; on a mismatch the gtest assertion reports both values.
+ void VerifyRuns(ScriptRunIterator* scriptRunIterator,
+ const std::vector<ExpectedRun>& expect)
+ {
+ unsigned limit;
+ UScriptCode code;
+ size_t runCount = 0;
+ while (scriptRunIterator->consume(limit, code)) {
+ ASSERT_LT(runCount, expect.size());
+ ASSERT_EQ(expect[runCount].limit, limit);
+ ASSERT_EQ(expect[runCount].code, code);
+ ++runCount;
+ }
+ ASSERT_EQ(expect.size(), runCount);
+ }
+};
+
+// An empty string produces no runs: consume() must return false and leave
+// its two output arguments unchanged.
+TEST_F(ScriptRunIteratorTest, Empty)
+{
+ String empty(String::make16BitFrom8BitSource(0, 0));
+ ScriptRunIterator scriptRunIterator(empty.characters16(), empty.length());
+ unsigned limit = 0;
+ UScriptCode code = USCRIPT_INVALID_CODE;
+ // Use a gtest assertion rather than WTF's ASSERT(): the latter is compiled
+ // out when assertions are disabled, which would skip the consume() call.
+ ASSERT_FALSE(scriptRunIterator.consume(limit, code));
+ ASSERT_EQ(0u, limit);
+ ASSERT_EQ(USCRIPT_INVALID_CODE, code);
+}
+
+// Some of our compilers cannot yet initialize a vector directly from a braced list (presumably what "from an array" meant), so each test builds a static array first and copies it into a vector named |runs|.
+#define DECLARE_RUNSVECTOR(...) \
+ static const TestRun runsArray[] = __VA_ARGS__; \
+ std::vector<TestRun> runs(runsArray, runsArray + sizeof(runsArray) / sizeof(*runsArray));
+
+#define CHECK_RUNS(...) \
+ DECLARE_RUNSVECTOR(__VA_ARGS__); \
+ CheckRuns(runs);
+
+#define CHECK_MOCK_RUNS(...) \
+ DECLARE_RUNSVECTOR(__VA_ARGS__); \
+ CheckMockRuns(runs);
+
+TEST_F(ScriptRunIteratorTest, Whitespace) // spaces and a tab: one Common run expected
+{
+ CHECK_RUNS({ { " \t ", USCRIPT_COMMON } });
+}
+
+TEST_F(ScriptRunIteratorTest, Common) // spaces and punctuation only: one Common run expected
+{
+ CHECK_RUNS({ { " ... !?", USCRIPT_COMMON } });
+}
+
+TEST_F(ScriptRunIteratorTest, Latin) // Latin letters only: one Latin run expected
+{
+ CHECK_RUNS({ { "latin", USCRIPT_LATIN } });
+}
+
+TEST_F(ScriptRunIteratorTest, Chinese) // Chinese text only: one Han run expected
+{
+ CHECK_RUNS({ { "萬國碼", USCRIPT_HAN } });
+}
+
+// A close bracket without a matching open bracket is ignored.
+TEST_F(ScriptRunIteratorTest, UnbalancedParens1)
+{
+ CHECK_RUNS({ { "(萬", USCRIPT_HAN },
+ { "a]", USCRIPT_LATIN }, // the unmatched ']' stays with the Latin text
+ { ")", USCRIPT_HAN } }); // expected to get the same script as the leading '('
+}
+
+// An open bracket without a matching close is popped when it lies inside
+// a matched pair of brackets, so it doesn't match a later close.
+TEST_F(ScriptRunIteratorTest, UnbalancedParens2)
+{
+ CHECK_RUNS({ { "(萬", USCRIPT_HAN },
+ { "a[", USCRIPT_LATIN },
+ { ")]", USCRIPT_HAN } }); // the ']' is expected in the Han run, not paired with the Latin '['
+}
+
+// The space goes with the leading script (here Latin).
+TEST_F(ScriptRunIteratorTest, LatinHan)
+{
+ CHECK_RUNS({ { "Unicode ", USCRIPT_LATIN },
+ { "萬國碼", USCRIPT_HAN } });
+}
+
+// The space goes with the leading script (here Han).
+TEST_F(ScriptRunIteratorTest, HanLatin)
+{
+ CHECK_RUNS({ { "萬國碼 ", USCRIPT_HAN },
+ { "Unicode", USCRIPT_LATIN } });
+}
+
+TEST_F(ScriptRunIteratorTest, ParenEmptyParen) // a bracket pair with no other text: one Common run expected
+{
+ CHECK_RUNS({ { "()", USCRIPT_COMMON } });
+}
+
+TEST_F(ScriptRunIteratorTest, ParenChineseParen) // both parens are expected in the Han run they enclose
+{
+ CHECK_RUNS({ { "(萬國碼)", USCRIPT_HAN } });
+}
+
+TEST_F(ScriptRunIteratorTest, ParenLatinParen) // both parens are expected in the Latin run they enclose
+{
+ CHECK_RUNS({ { "(Unicode)", USCRIPT_LATIN } });
+}
+
+// The open paren gets the leading script.
+TEST_F(ScriptRunIteratorTest, LatinParenChineseParen)
+{
+ CHECK_RUNS({ { "Unicode (", USCRIPT_LATIN },
+ { "萬國碼", USCRIPT_HAN },
+ { ")", USCRIPT_LATIN } }); // the close paren is expected to get the open paren's script
+}
+
+// The open paren gets the first trailing script if there is no leading script.
+TEST_F(ScriptRunIteratorTest, ParenChineseParenLatin)
+{
+ CHECK_RUNS({ { "(萬國碼) ", USCRIPT_HAN },
+ { "Unicode", USCRIPT_LATIN } });
+}
+
+// Leading common characters and the open paren get the first trailing script.
+// TODO(dougfelt): we don't do quote matching, but probably should figure out
+// something better than doing nothing.
+TEST_F(ScriptRunIteratorTest, QuoteParenChineseParenLatinQuote)
+{
+ CHECK_RUNS({ { "\"(萬國碼) ", USCRIPT_HAN },
+ { "Unicode\"", USCRIPT_LATIN } });
+}
+
+// An unmatched close bracket gets the leading context.
+TEST_F(ScriptRunIteratorTest, UnmatchedClose)
+{
+ CHECK_RUNS({ { "Unicode (", USCRIPT_LATIN },
+ { "萬國碼] ", USCRIPT_HAN }, // the unmatched ']' stays in the Han run
+ { ") Unicode\"", USCRIPT_LATIN } });
+}
+
+// Up to 32 nested bracket pairs are matched.
+TEST_F(ScriptRunIteratorTest, Match32Brackets)
+{
+ CHECK_RUNS({ { "[萬國碼 ", USCRIPT_HAN },
+ { "Unicode (((((((((((((((((((((((((((((((!"
+ ")))))))))))))))))))))))))))))))",
+ USCRIPT_LATIN },
+ { "]", USCRIPT_HAN } }); // the outermost ']' is still expected to match the Han '['
+}
+
+// Only the 32 most recent open brackets are matched. Close brackets beyond
+// that revert to the surrounding script.
+TEST_F(ScriptRunIteratorTest, Match32MostRecentBrackets)
+{
+ CHECK_RUNS({ { "((([萬國碼 ", USCRIPT_HAN },
+ { "Unicode (((((((((((((((((((((((((((((((", USCRIPT_LATIN },
+ { "萬國碼!", USCRIPT_HAN },
+ { ")))))))))))))))))))))))))))))))", USCRIPT_LATIN },
+ { "]", USCRIPT_HAN },
+ { "But )))", USCRIPT_LATIN } }); // these three ')' are expected to stay Latin, unlike the Han "((("
+}
+
+// A char with multiple scripts that match both the leading and the trailing
+// context gets the leading context.
+TEST_F(ScriptRunIteratorTest, ExtensionsPreferLeadingContext)
+{
+ CHECK_MOCK_RUNS({ { "h<lh>", USCRIPT_HAN },
+ { "l", USCRIPT_LATIN } });
+}
+
+// A char with multiple scripts that only match the trailing context gets the
+// trailing context.
+TEST_F(ScriptRunIteratorTest, ExtensionsMatchTrailingContext)
+{
+ CHECK_MOCK_RUNS({ { "h", USCRIPT_HAN },
+ { "<gl>l", USCRIPT_LATIN } });
+}
+
+// Retain the first established priority script. <lhg><gh> produce the script
+// set <gh> with g as priority because, of the two priority scripts l and g,
+// only g remains. Then <gh><hgl> retains g as priority because, of the two
+// priority scripts g and h that remain, g was encountered first.
+TEST_F(ScriptRunIteratorTest, ExtensionsRetainFirstPriorityScript)
+{
+ CHECK_MOCK_RUNS({ { "<lhg><gh><hgl>", USCRIPT_GREEK } });
+}
+
+// Parens can have scripts of their own, which break script runs.
+TEST_F(ScriptRunIteratorTest, ExtensionsParens)
+{
+ CHECK_MOCK_RUNS({ { "<gl><(lg>", USCRIPT_GREEK },
+ { "h<[hl>", USCRIPT_HAN },
+ { "l", USCRIPT_LATIN },
+ { "<]hl>", USCRIPT_HAN }, // expected to get the same script as the matching <[hl>
+ { "<)lg>", USCRIPT_GREEK } }); // expected to get the same script as the matching <(lg>
+}
+
+// The close paren might be encountered before we've established the open
+// paren's script, but when this is the case the current set is still valid,
+// so this neither affects it nor breaks the run.
+TEST_F(ScriptRunIteratorTest, ExtensionsParens2)
+{
+ CHECK_MOCK_RUNS({ { "<(lhg><gh><)lhg>", USCRIPT_GREEK } });
+}
+
+// A common character with a single extension should be treated as common,
+// but with the extended script as a default. If we encounter anything other
+// than common, that takes priority. If we encounter other common characters
+// with a single extension, the current priority remains.
+TEST_F(ScriptRunIteratorTest, CommonWithPriority)
+{
+ CHECK_MOCK_RUNS({ { "<ch>", USCRIPT_HAN } });
+}
+
+TEST_F(ScriptRunIteratorTest, CommonWithPriority2) // the non-common <lh> is expected to win over the Han default of <ch>
+{
+ CHECK_MOCK_RUNS({ { "<ch><lh>", USCRIPT_LATIN } });
+}
+
+TEST_F(ScriptRunIteratorTest, CommonWithPriority3) // the Han default of the first common character is expected to be kept
+{
+ CHECK_MOCK_RUNS({ { "<ch><cl><cg>", USCRIPT_HAN } });
+}
+
+// Udatta (\xE0\xA5\x91) is inherited with LATIN and DEVANAGARI extensions.
+// Since it has LATIN, and the dotted circle (\xE2\x97\x8C) is COMMON and has
+// adopted the preceding LATIN, it gets LATIN as well. This is standard.
+TEST_F(ScriptRunIteratorTest, LatinDottedCircleUdatta)
+{
+ CHECK_RUNS({ { "Latin \xE2\x97\x8C\xE0\xA5\x91", USCRIPT_LATIN } });
+}
+
+// In this situation, Udatta (\xE0\xA5\x91) doesn't share a script with the
+// value inherited by the dotted circle (\xE2\x97\x8C). It captures the
+// preceding dotted circle and breaks it from the run it would normally have
+// been in.
+TEST_F(ScriptRunIteratorTest, HanDottedCircleUdatta)
+{
+ CHECK_RUNS({ { "萬國碼 ", USCRIPT_HAN },
+ { "\xE2\x97\x8C\xE0\xA5\x91", USCRIPT_DEVANAGARI } });
+}
+
+// Tatweel is \xD9\x80 (Lm), Fathatan is \xD9\x8B (Mn). The script of Tatweel
+// is common, that of Fathatan is inherited. The script extensions for Fathatan
+// are Arabic and Syriac. The Syriac script is 34 in ICU, Arabic is 2. So the
+// preferred script for Fathatan is Arabic, according to Behdad's
+// heuristic. This is exactly analogous to the Udatta tests above, except
+// Tatweel is Lm. But we don't take properties into account, only scripts.
+TEST_F(ScriptRunIteratorTest, LatinTatweelFathatan)
+{
+ CHECK_RUNS({ { "Latin ", USCRIPT_LATIN },
+ { "\xD9\x80\xD9\x8B", USCRIPT_ARABIC } });
+}
+
+// Another case where, if the mark accepts a script that was inherited by the
+// preceding common-script character, they both continue in that script.
+// SYRIAC LETTER NUN \xDC\xA2
+// ARABIC TATWEEL \xD9\x80
+// ARABIC FATHATAN \xD9\x8B
+TEST_F(ScriptRunIteratorTest, SyriacTatweelFathatan)
+{
+ CHECK_RUNS({ { "\xDC\xA2\xD9\x80\xD9\x8B", USCRIPT_SYRIAC } });
+}
+
+// The Udatta (\xE0\xA5\x91) is inherited, so it will share a run with
+// anything that is not common.
+TEST_F(ScriptRunIteratorTest, HanUdatta)
+{
+ CHECK_RUNS({ { "萬國碼\xE0\xA5\x91", USCRIPT_HAN } });
+}
+
+// The Udatta (\xE0\xA5\x91) is inherited, and will capture the preceding
+// space and turn it into Devanagari.
+TEST_F(ScriptRunIteratorTest, HanSpaceUdatta)
+{
+ CHECK_RUNS({ { "萬國碼", USCRIPT_HAN },
+ { " \xE0\xA5\x91", USCRIPT_DEVANAGARI } });
+}
+
+// Make sure the mock script data handles inherited characters too.
+TEST_F(ScriptRunIteratorTest, MockHanInheritedGL)
+{
+ CHECK_MOCK_RUNS({ { "h<igl>", USCRIPT_HAN } }); // the inherited <igl> is expected to stay in the Han run
+}
+
+TEST_F(ScriptRunIteratorTest, MockHanCommonInheritedGL) // mock analogue of HanSpaceUdatta
+{
+ CHECK_MOCK_RUNS({ { "h", USCRIPT_HAN },
+ { "c<igl>", USCRIPT_GREEK } }); // <igl> is expected to capture the common char into a Greek run
+}
+
+// A leading inherited character acts like common, except that there is no preferred script.
+TEST_F(ScriptRunIteratorTest, MockLeadingInherited)
+{
+ CHECK_MOCK_RUNS({ { "<igl>", USCRIPT_COMMON } });
+}
+
+// Several leading inherited characters also act like common, with no preferred script.
+TEST_F(ScriptRunIteratorTest, MockLeadingInherited2)
+{
+ CHECK_MOCK_RUNS({ { "<igl><ih>", USCRIPT_COMMON } });
+}
+
+TEST_F(ScriptRunIteratorTest, LeadingInheritedHan) // a leading inherited mark is expected to join the following Han run
+{
+ // DEVANAGARI STRESS SIGN UDATTA is \xE0\xA5\x91 in UTF-8.
+ CHECK_RUNS({ { "\xE0\xA5\x91萬國碼", USCRIPT_HAN } });
+}
+
+TEST_F(ScriptRunIteratorTest, LeadingInheritedHan2) // two leading inherited marks are expected to join the following Han run
+{
+ // DEVANAGARI STRESS SIGN UDATTA is \xE0\xA5\x91 in UTF-8.
+ // ARABIC FATHATAN is \xD9\x8B in UTF-8.
+ CHECK_RUNS({ { "\xE0\xA5\x91\xD9\x8B萬國碼", USCRIPT_HAN } });
+}
+
+TEST_F(ScriptRunIteratorTest, OddLatinString) // presumably a Latin letter with combining mark(s); one Latin run expected
+{
+ CHECK_RUNS({ { "ç̈", USCRIPT_LATIN } });
+}
+
+// Fixture that scans ICU's script-extension data at construction to find the
+// largest number of script extensions reported for any code point, together
+// with one code point that has that many.
+class ScriptRunIteratorICUDataTest : public testing::Test {
+public:
+ ScriptRunIteratorICUDataTest()
+ : m_maxExtensions(0)
+ , m_maxExtensionsCodepoint(0xffff)
+ {
+ int maxExtensions = 0;
+ UChar32 maxExtensionsCodepoint = 0;
+ // Scan the whole Unicode code space, U+0000 through U+10FFFF.
+ for (UChar32 cp = 0; cp < 0x110000; ++cp) {
+ // Use a fresh status for every call. The query passes no buffer
+ // and is used only for the count it returns.
+ UErrorCode status = U_ZERO_ERROR;
+ int count = uscript_getScriptExtensions(cp, 0, 0, &status);
+ if (count > maxExtensions) {
+ maxExtensions = count;
+ maxExtensionsCodepoint = cp;
+ }
+ }
+ m_maxExtensions = maxExtensions;
+ m_maxExtensionsCodepoint = maxExtensionsCodepoint;
+ }
+
+protected:
+ // Returns the first code point found with the largest extension count.
+ // If |numExtensions| is non-null, it receives that count.
+ UChar32 GetACharWithMaxExtensions(int* numExtensions)
+ {
+ if (numExtensions) {
+ *numExtensions = m_maxExtensions;
+ }
+ return m_maxExtensionsCodepoint;
+ }
+
+private:
+ int m_maxExtensions; // largest count returned by uscript_getScriptExtensions
+ UChar32 m_maxExtensionsCodepoint; // first code point with that count
+};
+
+// Validate that ICU never returns more than our maximum expected number of
+// script extensions (ScriptData::kMaxScriptCount).
+TEST_F(ScriptRunIteratorICUDataTest, ValidateICUMaxScriptExtensions)
+{
+ int maxExtensions;
+ UChar32 cp = GetACharWithMaxExtensions(&maxExtensions);
+ ASSERT_LE(maxExtensions, ScriptData::kMaxScriptCount)
+ << "char " << std::hex << cp << std::dec; // on failure, names the code point in hex
+}
+
+// Check that ICUScriptData returns all of a character's scripts.
+// This only checks one likely character; it doesn't check all cases.
+TEST_F(ScriptRunIteratorICUDataTest, ICUDataGetScriptsReturnsAllExtensions)
+{
+ int maxExtensions;
+ UChar32 cp = GetACharWithMaxExtensions(&maxExtensions);
+ Vector<UScriptCode> extensions;
+ ICUScriptData::instance()->getScripts(cp, extensions);
+
+ // It's possible that getScripts adds the primary script to the list of
+ // extensions, resulting in one more script than the raw extension count.
+ ASSERT_GE(static_cast<int>(extensions.size()), maxExtensions)
+ << "char " << std::hex << cp << std::dec;
+}
+
+// Every code point whose first reported script is Common must report at most
+// two scripts in total.
+TEST_F(ScriptRunIteratorICUDataTest, CommonHaveNoMoreThanOneExtension)
+{
+ Vector<UScriptCode> scripts;
+ for (UChar32 codepoint = 0; codepoint < 0x110000; ++codepoint) {
+ ICUScriptData::instance()->getScripts(codepoint, scripts);
+ if (scripts.at(0) != USCRIPT_COMMON) {
+ continue;
+ }
+ ASSERT_LE(scripts.size(), 2ul)
+ << "cp: " << std::hex << codepoint << std::dec;
+ }
+}
+
+// ZWJ is \u200D Cf (Format, other) and its script is inherited. I'm going to
+// ignore this for now, as I think it shouldn't matter which run it ends up
+// in. HarfBuzz needs to be able to use it as context and shape each
+// neighboring character appropriately no matter what run it got assigned to.
+
+} // namespace blink
« no previous file with comments | « Source/platform/fonts/ScriptRunIterator.cpp ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698