Index: Source/platform/fonts/ScriptRunIteratorTest.cpp |
diff --git a/Source/platform/fonts/ScriptRunIteratorTest.cpp b/Source/platform/fonts/ScriptRunIteratorTest.cpp |
new file mode 100644 |
index 0000000000000000000000000000000000000000..173c7d309f2bf646e888d9e907a7dcfd50f8cccf |
--- /dev/null |
+++ b/Source/platform/fonts/ScriptRunIteratorTest.cpp |
@@ -0,0 +1,786 @@ |
+// Copyright 2015 The Chromium Authors. All rights reserved. |
+// Use of this source code is governed by a BSD-style license that can be |
+// found in the LICENSE file. |
+ |
+#include "config.h" |
+#include "platform/fonts/ScriptRunIterator.h" |
+ |
+#include "platform/Logging.h" |
+#include "wtf/Assertions.h" |
+#include "wtf/Threading.h" |
+#include "wtf/text/WTFString.h" |
+ |
+#include <gtest/gtest.h> |
+#include <string> |
+#include <vector> |
+ |
+namespace blink { |
+ |
+struct TestRun { |
+ std::string text; |
+ UScriptCode code; |
+}; |
+ |
+struct ExpectedRun { |
+ unsigned limit; |
+ UScriptCode code; |
+ |
+ ExpectedRun(unsigned the_limit, UScriptCode the_code) |
+ : limit(the_limit) |
+ , code(the_code) |
+ { |
+ } |
+}; |
+ |
+class MockScriptData : public ScriptData { |
+public: |
+ ~MockScriptData() override {} |
+ |
+ static const MockScriptData* instance() |
+ { |
+ AtomicallyInitializedStaticReference(const MockScriptData, mockScriptData, (new MockScriptData())); |
+ |
+ return &mockScriptData; |
+ } |
+ |
+ void getScripts(UChar32 ch, Vector<UScriptCode>& dst) const override |
+ { |
+ ASSERT(ch >= kMockCharMin); |
+ ASSERT(ch < kMockCharLimit); |
+ |
+ int code = ch - kMockCharMin; |
+ dst.clear(); |
+ switch (code & kCodeSpecialMask) { |
+ case kCodeSpecialCommon: |
+ dst.append(USCRIPT_COMMON); |
+ break; |
+ case kCodeSpecialInherited: |
+ dst.append(USCRIPT_INHERITED); |
+ break; |
+ default: |
+ break; |
+ } |
+ int listBits = kTable[code & kCodeListIndexMask]; |
+ if (dst.isEmpty() && listBits == 0) { |
+ dst.append(USCRIPT_UNKNOWN); |
+ return; |
+ } |
+ while (listBits) { |
+ switch (listBits & kListMask) { |
+ case 0: |
+ break; |
+ case kLatin: |
+ dst.append(USCRIPT_LATIN); |
+ break; |
+ case kHan: |
+ dst.append(USCRIPT_HAN); |
+ break; |
+ case kGreek: |
+ dst.append(USCRIPT_GREEK); |
+ break; |
+ } |
+ listBits >>= kListShift; |
+ } |
+ } |
+ |
+ UChar32 getPairedBracket(UChar32 ch) const override |
+ { |
+ switch (getPairedBracketType(ch)) { |
+ case PairedBracketType::BracketTypeClose: |
+ return ch - kBracketDelta; |
+ case PairedBracketType::BracketTypeOpen: |
+ return ch + kBracketDelta; |
+ default: |
+ return ch; |
+ } |
+ } |
+ |
+ PairedBracketType getPairedBracketType(UChar32 ch) const override |
+ { |
+ ASSERT(ch >= kMockCharMin && ch < kMockCharLimit); |
+ int code = ch - kMockCharMin; |
+ if ((code & kCodeBracketBit) == 0) { |
+ return PairedBracketType::BracketTypeNone; |
+ } |
+ if (code & kCodeBracketCloseBit) { |
+ return PairedBracketType::BracketTypeClose; |
+ } |
+ return PairedBracketType::BracketTypeOpen; |
+ } |
+ |
+ static int TableLookup(int value) |
+ { |
+ for (int i = 0; i < 16; ++i) { |
+ if (kTable[i] == value) { |
+ return i; |
+ } |
+ } |
+ WTF_LOG_ERROR("Table does not contain value 0x%x", value); |
+ return 0; |
+ } |
+ |
+ static String ToTestString(const std::string& input) |
+ { |
+ String result(String::make16BitFrom8BitSource(0, 0)); |
+ bool inSet = false; |
+ int seen = 0; |
+ int code = 0; |
+ int list = 0; |
+ int currentShift = 0; |
+ for (char c : input) { |
+ if (inSet) { |
+ switch (c) { |
+ case '(': |
+ ASSERT(seen == 0); |
+ seen |= kSawBracket; |
+ code |= kCodeBracketBit; |
+ break; |
+ case '[': |
+ ASSERT(seen == 0); |
+ seen |= kSawBracket; |
+ code |= kCodeBracketBit | kCodeSquareBracketBit; |
+ break; |
+ case ')': |
+ ASSERT(seen == 0); |
+ seen |= kSawBracket; |
+ code |= kCodeBracketBit | kCodeBracketCloseBit; |
+ break; |
+ case ']': |
+ ASSERT(seen == 0); |
+ seen |= kSawBracket; |
+ code |= kCodeBracketBit | kCodeSquareBracketBit | kCodeBracketCloseBit; |
+ break; |
+ case 'i': |
+ ASSERT(seen == 0); // brackets can't be inherited |
+ seen |= kSawSpecial; |
+ code |= kCodeSpecialInherited; |
+ break; |
+ case 'c': |
+ ASSERT((seen & ~kSawBracket) == 0); |
+ seen |= kSawSpecial; |
+ code |= kCodeSpecialCommon; |
+ break; |
+ case 'l': |
+ ASSERT((seen & kSawLatin) == 0); |
+ ASSERT(currentShift < 3); |
+ seen |= kSawLatin; |
+ list |= kLatin << (2 * currentShift++); |
+ break; |
+ case 'h': |
+ ASSERT((seen & kSawHan) == 0); |
+ ASSERT(currentShift < 3); |
+ seen |= kSawHan; |
+ list |= kHan << (2 * currentShift++); |
+ break; |
+ case 'g': |
+ ASSERT((seen & kSawGreek) == 0); |
+ ASSERT(currentShift < 3); |
+ seen |= kSawGreek; |
+ list |= kGreek << (2 * currentShift++); |
+ break; |
+ case '>': |
+ ASSERT(seen != 0); |
+ code |= TableLookup(list); |
+ result.append(static_cast<UChar>(kMockCharMin + code)); |
+ inSet = false; |
+ break; |
+ default: |
+ WTF_LOG_ERROR("Illegal mock string set char: '%c'", c); |
+ break; |
+ } |
+ continue; |
+ } |
+ // not in set |
+ switch (c) { |
+ case '<': |
+ seen = 0; |
+ code = 0; |
+ list = 0; |
+ currentShift = 0; |
+ inSet = true; |
+ break; |
+ case '(': |
+ code = kCodeBracketBit | kCodeSpecialCommon; |
+ break; |
+ case '[': |
+ code = kCodeBracketBit | kCodeSquareBracketBit | kCodeSpecialCommon; |
+ break; |
+ case ')': |
+ code = kCodeBracketBit | kCodeBracketCloseBit | kCodeSpecialCommon; |
+ break; |
+ case ']': |
+ code = kCodeBracketBit | kCodeSquareBracketBit | kCodeBracketCloseBit | kCodeSpecialCommon; |
+ break; |
+ case 'i': |
+ code = kCodeSpecialInherited; |
+ break; |
+ case 'c': |
+ code = kCodeSpecialCommon; |
+ break; |
+ case 'l': |
+ code = kLatin; |
+ break; |
+ case 'h': |
+ code = kHan; |
+ break; |
+ case 'g': |
+ code = kGreek; |
+ break; |
+ case '?': |
+ code = 0; // unknown |
+ break; |
+ default: |
+ WTF_LOG_ERROR("Illegal mock string set char: '%c'", c); |
+ } |
+ if (!inSet) { |
+ result.append(static_cast<UChar>(kMockCharMin + code)); |
+ } |
+ } |
+ return result; |
+ } |
+ |
+ static std::string MockCharString(UChar mockch) |
+ { |
+ ASSERT(mockch >= kMockCharMin && mockch < kMockCharLimit); |
+ int code = mockch - kMockCharMin; |
+ |
+ // We use set notation in these cases: |
+ // - more than one of special, kLatin, kHan, kGreek |
+ // - bracket and not common (since non-set brackets are common) |
+ bool isBracket = (code & kCodeBracketBit) != 0; |
+ bool isSpecial = (mockch & kCodeSpecialMask) != 0; |
+ bool isCommon = (mockch & kCodeSpecialMask) == kCodeSpecialCommon; |
+ char c; |
+ if (isBracket) { |
+ if (code & kCodeSquareBracketBit) { |
+ if (code & kCodeBracketCloseBit) { |
+ c = ']'; |
+ } else { |
+ c = '['; |
+ } |
+ } else { |
+ if (code & kCodeBracketCloseBit) { |
+ c = ')'; |
+ } else { |
+ c = '('; |
+ } |
+ } |
+ } else if (isSpecial) { |
+ c = isCommon ? 'c' : 'i'; |
+ } |
+ std::string result; |
+ int listBits = kTable[code & kCodeListIndexMask]; |
+ while (listBits) { |
+ switch (listBits & kListMask) { |
+ case 0: |
+ break; |
+ case kLatin: |
+ result += 'l'; |
+ break; |
+ case kHan: |
+ result += 'h'; |
+ break; |
+ case kGreek: |
+ result += 'g'; |
+ break; |
+ } |
+ listBits >>= kListShift; |
+ } |
+ bool needSet = result.length() + (isSpecial ? 1 : 0) > 1 || (isBracket && (result.length() > 0 || !isCommon)); |
+ if (needSet) { |
+ std::string setResult("<"); |
+ if (isBracket) { |
+ setResult += c; |
+ } |
+ if (isSpecial) { |
+ if (isCommon) { |
+ setResult += "c"; |
+ } else { |
+ setResult += "i"; |
+ } |
+ } |
+ setResult += result; |
+ setResult += ">"; |
+ return setResult; |
+ } |
+ if (isBracket || isSpecial) { |
+ result = c; |
+ } |
+ return result; |
+ } |
+ |
+ // We determine properties based on the offset from kMockCharMin: |
+ // bits 0-3 represent the list of l, h, c scripts (index into table) |
+ // bit 4-5 means: 0 plain, 1 common, 2 inherited, 3 illegal |
+ // bit 6 clear means non-bracket, open means bracket |
+ // bit 7 clear means open bracket, set means close bracket |
+ // bit 8 clear means paren, set means bracket |
+ // if it's a bracket, the matching bracket is 64 code points away |
+ static const UChar32 kMockCharMin = 0xe000; |
+ static const UChar32 kMockCharLimit = kMockCharMin + 0x200; |
+ static const int kLatin = 1; |
+ static const int kHan = 2; |
+ static const int kGreek = 3; |
+ static const int kCodeListIndexMask = 0xf; |
+ static const int kCodeSpecialMask = 0x30; |
+ static const int kCodeSpecialCommon = 0x10; |
+ static const int kCodeSpecialInherited = 0x20; |
+ static const int kCodeBracketCloseBit = 0x40; |
+ static const int kCodeBracketBit = 0x80; |
+ static const int kCodeSquareBracketBit = 0x100; |
+ static const int kListShift = 2; |
+ static const int kListMask = 0x3; |
+ static const int kBracketDelta = kCodeBracketCloseBit; |
+ static const int kTable[16]; |
+ |
+ static const int kSawBracket = 0x1; |
+ static const int kSawSpecial = 0x2; |
+ static const int kSawLatin = 0x4; |
+ static const int kSawHan = 0x8; |
+ static const int kSawGreek = 0x10; |
+}; |
+ |
+static const int kLatin2 = MockScriptData::kLatin << 2; |
+static const int kHan2 = MockScriptData::kHan << 2; |
+static const int kGreek2 = MockScriptData::kGreek << 2; |
+static const int kLatin3 = MockScriptData::kLatin << 4; |
+static const int kHan3 = MockScriptData::kHan << 4; |
+static const int kGreek3 = MockScriptData::kGreek << 4; |
+const int MockScriptData::kTable[] = { |
+ 0, kLatin, kHan, kGreek, |
+ kLatin2 + kHan, kLatin2 + kGreek, |
+ kHan2 + kLatin, kHan2 + kGreek, |
+ kGreek2 + kLatin, kGreek2 + kHan, |
+ kLatin3 + kHan2 + kGreek, kLatin3 + kGreek2 + kHan, |
+ kHan3 + kLatin2 + kGreek, kHan3 + kGreek2 + kLatin, |
+ kGreek3 + kLatin2 + kHan, kGreek3 + kHan2 + kLatin, |
+}; |
+ |
+class ScriptRunIteratorTest : public testing::Test { |
+protected: |
+ void CheckRuns(const std::vector<TestRun>& runs) |
+ { |
+ String text(String::make16BitFrom8BitSource(0, 0)); |
+ std::vector<ExpectedRun> expect; |
+ for (auto& run : runs) { |
+ text.append(String::fromUTF8(run.text.c_str())); |
+ expect.push_back(ExpectedRun(text.length(), run.code)); |
+ } |
+ ScriptRunIterator scriptRunIterator(text.characters16(), text.length()); |
+ VerifyRuns(&scriptRunIterator, expect); |
+ } |
+ |
+ void CheckMockRuns(const std::vector<TestRun>& runs) |
+ { |
+ String text(String::make16BitFrom8BitSource(0, 0)); |
+ std::vector<ExpectedRun> expect; |
+ for (const TestRun& run : runs) { |
+ text.append(MockScriptData::ToTestString(run.text)); |
+ expect.push_back({ text.length(), run.code }); |
+ } |
+ |
+ ScriptRunIterator scriptRunIterator(text.characters16(), text.length(), |
+ MockScriptData::instance()); |
+ VerifyRuns(&scriptRunIterator, expect); |
+ } |
+ |
+ void VerifyRuns(ScriptRunIterator* scriptRunIterator, |
+ const std::vector<ExpectedRun>& expect) |
+ { |
+ unsigned limit; |
+ UScriptCode code; |
+ unsigned long runCount = 0; |
+ while (scriptRunIterator->consume(limit, code)) { |
+ ASSERT_LT(runCount, expect.size()); |
+ ASSERT_EQ(expect[runCount].limit, limit); |
+ ASSERT_EQ(expect[runCount].code, code); |
+ ++runCount; |
+ } |
+ WTF_LOG_ERROR("Expected %zu runs, got %lu ", expect.size(), runCount); |
+ ASSERT_EQ(expect.size(), runCount); |
+ } |
+}; |
+ |
+TEST_F(ScriptRunIteratorTest, Empty) |
+{ |
+ String empty(String::make16BitFrom8BitSource(0, 0)); |
+ ScriptRunIterator scriptRunIterator(empty.characters16(), empty.length()); |
+ unsigned limit = 0; |
+ UScriptCode code = USCRIPT_INVALID_CODE; |
+ ASSERT(!scriptRunIterator.consume(limit, code)); |
+ ASSERT_EQ(limit, 0u); |
+ ASSERT_EQ(code, USCRIPT_INVALID_CODE); |
+} |
+ |
+// Some of our compilers cannot initialize a vector from an array yet. |
+#define DECLARE_RUNSVECTOR(...) \ |
+ static const TestRun runsArray[] = __VA_ARGS__; \ |
+ std::vector<TestRun> runs(runsArray, runsArray + sizeof(runsArray) / sizeof(*runsArray)); |
+ |
+#define CHECK_RUNS(...) \ |
+ DECLARE_RUNSVECTOR(__VA_ARGS__); \ |
+ CheckRuns(runs); |
+ |
+#define CHECK_MOCK_RUNS(...) \ |
+ DECLARE_RUNSVECTOR(__VA_ARGS__); \ |
+ CheckMockRuns(runs); |
+ |
+TEST_F(ScriptRunIteratorTest, Whitespace) |
+{ |
+ CHECK_RUNS({ { " \t ", USCRIPT_COMMON } }); |
+} |
+ |
+TEST_F(ScriptRunIteratorTest, Common) |
+{ |
+ CHECK_RUNS({ { " ... !?", USCRIPT_COMMON } }); |
+} |
+ |
+TEST_F(ScriptRunIteratorTest, Latin) |
+{ |
+ CHECK_RUNS({ { "latin", USCRIPT_LATIN } }); |
+} |
+ |
+TEST_F(ScriptRunIteratorTest, Chinese) |
+{ |
+ CHECK_RUNS({ { "萬國碼", USCRIPT_HAN } }); |
+} |
+ |
+// Close bracket without matching open is ignored |
+TEST_F(ScriptRunIteratorTest, UnbalancedParens1) |
+{ |
+ CHECK_RUNS({ { "(萬", USCRIPT_HAN }, |
+ { "a]", USCRIPT_LATIN }, |
+ { ")", USCRIPT_HAN } }); |
+} |
+ |
+// Open bracket without matching close is popped when inside |
+// matching close brackets, so doesn't match later close. |
+TEST_F(ScriptRunIteratorTest, UnbalancedParens2) |
+{ |
+ CHECK_RUNS({ { "(萬", USCRIPT_HAN }, |
+ { "a[", USCRIPT_LATIN }, |
+ { ")]", USCRIPT_HAN } }); |
+} |
+ |
+// space goes with leading script |
+TEST_F(ScriptRunIteratorTest, LatinHan) |
+{ |
+ CHECK_RUNS({ { "Unicode ", USCRIPT_LATIN }, |
+ { "萬國碼", USCRIPT_HAN } }); |
+} |
+ |
+// space goes with leading script |
+TEST_F(ScriptRunIteratorTest, HanLatin) |
+{ |
+ CHECK_RUNS({ { "萬國碼 ", USCRIPT_HAN }, |
+ { "Unicode", USCRIPT_LATIN } }); |
+} |
+ |
+TEST_F(ScriptRunIteratorTest, ParenEmptyParen) |
+{ |
+ CHECK_RUNS({ { "()", USCRIPT_COMMON } }); |
+} |
+ |
+TEST_F(ScriptRunIteratorTest, ParenChineseParen) |
+{ |
+ CHECK_RUNS({ { "(萬國碼)", USCRIPT_HAN } }); |
+} |
+ |
+TEST_F(ScriptRunIteratorTest, ParenLatinParen) |
+{ |
+ CHECK_RUNS({ { "(Unicode)", USCRIPT_LATIN } }); |
+} |
+ |
+// open paren gets leading script |
+TEST_F(ScriptRunIteratorTest, LatinParenChineseParen) |
+{ |
+ CHECK_RUNS({ { "Unicode (", USCRIPT_LATIN }, |
+ { "萬國碼", USCRIPT_HAN }, |
+ { ")", USCRIPT_LATIN } }); |
+} |
+ |
+// open paren gets first trailing script if no leading script |
+TEST_F(ScriptRunIteratorTest, ParenChineseParenLatin) |
+{ |
+ CHECK_RUNS({ { "(萬國碼) ", USCRIPT_HAN }, |
+ { "Unicode", USCRIPT_LATIN } }); |
+} |
+ |
+// leading common and open paren get first trailing script. |
+// TODO(dougfelt): we don't do quote matching, but probably should figure out |
+// something better then doing nothing. |
+TEST_F(ScriptRunIteratorTest, QuoteParenChineseParenLatinQuote) |
+{ |
+ CHECK_RUNS({ { "\"(萬國碼) ", USCRIPT_HAN }, |
+ { "Unicode\"", USCRIPT_LATIN } }); |
+} |
+ |
+// Unmatched close brace gets leading context |
+TEST_F(ScriptRunIteratorTest, UnmatchedClose) |
+{ |
+ CHECK_RUNS({ { "Unicode (", USCRIPT_LATIN }, |
+ { "萬國碼] ", USCRIPT_HAN }, |
+ { ") Unicode\"", USCRIPT_LATIN } }); |
+} |
+ |
+// Match up to 32 bracket pairs |
+TEST_F(ScriptRunIteratorTest, Match32Brackets) |
+{ |
+ CHECK_RUNS({ { "[萬國碼 ", USCRIPT_HAN }, |
+ { "Unicode (((((((((((((((((((((((((((((((!" |
+ ")))))))))))))))))))))))))))))))", |
+ USCRIPT_LATIN }, |
+ { "]", USCRIPT_HAN } }); |
+} |
+ |
+// Matches 32 most recent bracket pairs. More than that, and we revert to |
+// surrounding script. |
+TEST_F(ScriptRunIteratorTest, Match32MostRecentBrackets) |
+{ |
+ CHECK_RUNS({ { "((([萬國碼 ", USCRIPT_HAN }, |
+ { "Unicode (((((((((((((((((((((((((((((((", USCRIPT_LATIN }, |
+ { "萬國碼!", USCRIPT_HAN }, |
+ { ")))))))))))))))))))))))))))))))", USCRIPT_LATIN }, |
+ { "]", USCRIPT_HAN }, |
+ { "But )))", USCRIPT_LATIN } }); |
+} |
+ |
+// A char with multiple scripts that match both leading and trailing context |
+// gets the leading context. |
+TEST_F(ScriptRunIteratorTest, ExtensionsPreferLeadingContext) |
+{ |
+ CHECK_MOCK_RUNS({ { "h<lh>", USCRIPT_HAN }, |
+ { "l", USCRIPT_LATIN } }); |
+} |
+ |
+// A char with multiple scripts that only match trailing context gets the |
+// trailing context. |
+TEST_F(ScriptRunIteratorTest, ExtensionsMatchTrailingContext) |
+{ |
+ CHECK_MOCK_RUNS({ { "h", USCRIPT_HAN }, |
+ { "<gl>l", USCRIPT_LATIN } }); |
+} |
+ |
+// Retain first established priority script. <lhg><gh> produce the script <gh> |
+// with g as priority, because of the two priority scripts l and g, only g |
+// remains. Then <gh><hgl> retains g as priority, because of the two priority |
+// scripts g and h that remain, g was encountered first. |
+TEST_F(ScriptRunIteratorTest, ExtensionsRetainFirstPriorityScript) |
+{ |
+ CHECK_MOCK_RUNS({ { "<lhg><gh><hgl>", USCRIPT_GREEK } }); |
+} |
+ |
+// Parens can have scripts that break script runs. |
+TEST_F(ScriptRunIteratorTest, ExtensionsParens) |
+{ |
+ CHECK_MOCK_RUNS({ { "<gl><(lg>", USCRIPT_GREEK }, |
+ { "h<[hl>", USCRIPT_HAN }, |
+ { "l", USCRIPT_LATIN }, |
+ { "<]hl>", USCRIPT_HAN }, |
+ { "<)lg>", USCRIPT_GREEK } }); |
+} |
+ |
+// The close paren might be encountered before we've established the open |
+// paren's script, but when this is the case the current set is still valid, so |
+// this doesn't affect it nor break the run. |
+TEST_F(ScriptRunIteratorTest, ExtensionsParens2) |
+{ |
+ CHECK_MOCK_RUNS({ { "<(lhg><gh><)lhg>", USCRIPT_GREEK } }); |
+} |
+ |
+// A common script with a single extension should be treated as common, but |
+// with the extended script as a default. If we encounter anything other than |
+// common, that takes priority. If we encounter other common scripts with a |
+// single extension, the current priority remains. |
+TEST_F(ScriptRunIteratorTest, CommonWithPriority) |
+{ |
+ CHECK_MOCK_RUNS({ { "<ch>", USCRIPT_HAN } }); |
+} |
+ |
+TEST_F(ScriptRunIteratorTest, CommonWithPriority2) |
+{ |
+ CHECK_MOCK_RUNS({ { "<ch><lh>", USCRIPT_LATIN } }); |
+} |
+ |
+TEST_F(ScriptRunIteratorTest, CommonWithPriority3) |
+{ |
+ CHECK_MOCK_RUNS({ { "<ch><cl><cg>", USCRIPT_HAN } }); |
+} |
+ |
+// UDatta (\xE0\xA5\x91) is inherited with LATIN and DEVANAGARI extensions. |
+// Since it has LATIN, and the dotted circle (\xE2\x97\x8C) is COMMON and has |
+// adopted the preceding LATIN, it gets the LATIN. This is standard. |
+TEST_F(ScriptRunIteratorTest, LatinDottedCircleUdatta) |
+{ |
+ CHECK_RUNS({ { "Latin \xE2\x97\x8C\xE0\xA5\x91", USCRIPT_LATIN } }); |
+} |
+ |
+// In this situation, UDatta (\xE0\xA5\x91) doesn't share a script with the |
+// value inherited by the dotted circle (\xE2\x97\x8C). It captures the |
+// preceding dotted circle and breaks it from the run it would normally have |
+// been in. |
+TEST_F(ScriptRunIteratorTest, HanDottedCircleUdatta) |
+{ |
+ CHECK_RUNS({ { "萬國碼 ", USCRIPT_HAN }, |
+ { "\xE2\x97\x8C\xE0\xA5\x91", USCRIPT_DEVANAGARI } }); |
+} |
+ |
+// Tatweel is \xD9\x80 Lm, Fathatan is \xD9\x8B Mn. The script of tatweel is |
+// common, that of Fathatan is inherited. The script extensions for Fathatan |
+// are Arabic and Syriac. The Syriac script is 34 in ICU, Arabic is 2. So the |
+// preferred script for Fathatan is Arabic, according to Behdad's |
+// heuristic. This is exactly analogous to the Udatta tests above, except |
+// Tatweel is Lm. But we don't take properties into account, only scripts. |
+TEST_F(ScriptRunIteratorTest, LatinTatweelFathatan) |
+{ |
+ CHECK_RUNS({ { "Latin ", USCRIPT_LATIN }, |
+ { "\xD9\x80\xD9\x8B", USCRIPT_ARABIC } }); |
+} |
+ |
+// Another case where if the mark accepts a script that was inherited by the |
+// preceding common-script character, they both continue in that script. |
+// SYRIAC LETTER NUN \xDC\xA2 |
+// ARABIC TATWEEL \xD9\x80 |
+// ARABIC FATHATAN \xD9\x82 |
+TEST_F(ScriptRunIteratorTest, SyriacTatweelFathatan) |
+{ |
+ CHECK_RUNS({ { "\xDC\xA2\xD9\x80\xD9\x8B", USCRIPT_SYRIAC } }); |
+} |
+ |
+// The Udatta (\xE0\xA5\x91) is inherited, so will share runs with anything that |
+// is not common. |
+TEST_F(ScriptRunIteratorTest, HanUdatta) |
+{ |
+ CHECK_RUNS({ { "萬國碼\xE0\xA5\x91", USCRIPT_HAN } }); |
+} |
+ |
+// The Udatta (\xE0\xA5\x91) is inherited, and will capture the space and turn |
+// it into Devanagari. |
+TEST_F(ScriptRunIteratorTest, HanSpaceUdatta) |
+{ |
+ CHECK_RUNS({ { "萬國碼", USCRIPT_HAN }, |
+ { " \xE0\xA5\x91", USCRIPT_DEVANAGARI } }); |
+} |
+ |
+// Make sure Mock code works too. |
+TEST_F(ScriptRunIteratorTest, MockHanInheritedGL) |
+{ |
+ CHECK_MOCK_RUNS({ { "h<igl>", USCRIPT_HAN } }); |
+} |
+ |
+TEST_F(ScriptRunIteratorTest, MockHanCommonInheritedGL) |
+{ |
+ CHECK_MOCK_RUNS({ { "h", USCRIPT_HAN }, |
+ { "c<igl>", USCRIPT_GREEK } }); |
+} |
+ |
+// Leading inherited just act like common, except there's no preferred script. |
+TEST_F(ScriptRunIteratorTest, MockLeadingInherited) |
+{ |
+ CHECK_MOCK_RUNS({ { "<igl>", USCRIPT_COMMON } }); |
+} |
+ |
+// Leading inherited just act like common, except there's no preferred script. |
+TEST_F(ScriptRunIteratorTest, MockLeadingInherited2) |
+{ |
+ CHECK_MOCK_RUNS({ { "<igl><ih>", USCRIPT_COMMON } }); |
+} |
+ |
+TEST_F(ScriptRunIteratorTest, LeadingInheritedHan) |
+{ |
+ // DEVANAGARI STRESS SIGN UDATTA \xE0\xA5\x91 |
+ CHECK_RUNS({ { "\xE0\xA5\x91萬國碼", USCRIPT_HAN } }); |
+} |
+ |
+TEST_F(ScriptRunIteratorTest, LeadingInheritedHan2) |
+{ |
+ // DEVANAGARI STRESS SIGN UDATTA \xE0\xA5\x91 |
+ // ARABIC FATHATAN \xD9\x8B |
+ CHECK_RUNS({ { "\xE0\xA5\x91\xD9\x8B萬國碼", USCRIPT_HAN } }); |
+} |
+ |
+TEST_F(ScriptRunIteratorTest, OddLatinString) |
+{ |
+ CHECK_RUNS({ { "ç̈", USCRIPT_LATIN } }); |
+} |
+ |
+class ScriptRunIteratorICUDataTest : public testing::Test { |
+public: |
+ ScriptRunIteratorICUDataTest() |
+ : m_maxExtensions(0) |
+ , m_maxExtensionsCodepoint(0xffff) |
+ { |
+ int maxExtensions = 0; |
+ UChar32 m_maxExtensionscp = 0; |
+ for (UChar32 cp = 0; cp < 0x11000; ++cp) { |
+ UErrorCode status = U_ZERO_ERROR; |
+ int count = uscript_getScriptExtensions(cp, 0, 0, &status); |
+ if (count > maxExtensions) { |
+ maxExtensions = count; |
+ m_maxExtensionscp = cp; |
+ } |
+ if (count > ScriptData::kMaxScriptCount) { |
+ } |
+ } |
+ m_maxExtensions = maxExtensions; |
+ m_maxExtensionsCodepoint = m_maxExtensionscp; |
+ } |
+ |
+protected: |
+ UChar32 GetACharWithMaxExtensions(int* numExtensions) |
+ { |
+ if (numExtensions) { |
+ *numExtensions = m_maxExtensions; |
+ } |
+ return m_maxExtensionsCodepoint; |
+ } |
+ |
+private: |
+ int m_maxExtensions; |
+ UChar32 m_maxExtensionsCodepoint; |
+}; |
+ |
+// Validate that ICU never returns more than our maximum expected number of |
+// script extensions. |
+TEST_F(ScriptRunIteratorICUDataTest, ValidateICUMaxScriptExtensions) |
+{ |
+ int maxExtensions; |
+ UChar32 cp = GetACharWithMaxExtensions(&maxExtensions); |
+ ASSERT_LE(maxExtensions, ScriptData::kMaxScriptCount) |
+ << "char " << std::hex << cp << std::dec; |
+} |
+ |
+// Check that ICUScriptData returns all of a character's scripts. |
+// This only checks one likely character, but doesn't check all cases. |
+TEST_F(ScriptRunIteratorICUDataTest, ICUDataGetScriptsReturnsAllExtensions) |
+{ |
+ int maxExtensions; |
+ UChar32 cp = GetACharWithMaxExtensions(&maxExtensions); |
+ Vector<UScriptCode> extensions; |
+ ICUScriptData::instance()->getScripts(cp, extensions); |
+ |
+ // It's possible that GetScripts adds the primary script to the list of |
+ // extensions, resulting in one more script than the raw extension count. |
+ ASSERT_GE(static_cast<int>(extensions.size()), maxExtensions) |
+ << "char " << std::hex << cp << std::dec; |
+} |
+ |
+TEST_F(ScriptRunIteratorICUDataTest, CommonHaveNoMoreThanOneExtension) |
+{ |
+ Vector<UScriptCode> extensions; |
+ for (UChar32 cp = 0; cp < 0x110000; ++cp) { |
+ ICUScriptData::instance()->getScripts(cp, extensions); |
+ UScriptCode primary = extensions.at(0); |
+ if (primary == USCRIPT_COMMON) { |
+ ASSERT_LE(extensions.size(), 2ul) |
+ << "cp: " << std::hex << cp << std::dec; |
+ } |
+ } |
+} |
+ |
+// ZWJ is \u200D Cf (Format, other) and its script is inherited. I'm going to |
+// ignore this for now, as I think it shouldn't matter which run it ends up |
+// in. HarfBuzz needs to be able to use it as context and shape each |
+// neighboring character appropriately no matter what run it got assigned to. |
+ |
+} // namespace blink |