Source/platform/fonts/ScriptRunIteratorTest.cpp - Issue 1323513006: Upstream ScriptRunIterator for segmenting text runs by script

Side by Side Diff: Source/platform/fonts/ScriptRunIteratorTest.cpp

Issue 1323513006: Upstream ScriptRunIterator for segmenting text runs by script (Closed) Base URL: svn://svn.chromium.org/blink/trunk

Patch Set: Created 5 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
(Empty)
	1 // Copyright 2015 The Chromium Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4

	5 #include "config.h"

	6

	7 #include "wtf/Assertions.h"

	8 #include "platform/fonts/ScriptRunIterator.h"

	9 #include "platform/Logging.h"

	10 #include "wtf/Threading.h"

	11 #include "wtf/text/WTFString.h"

	12

	13 #include <gtest/gtest.h>

	14

	15 #include <string>

	16 #include <vector>

	17

	18 namespace blink {

	19

	20 struct TestRun {

	21 std::string text;

	22 UScriptCode code;

	23 };

	24

	25 struct ExpectedRun {

	26 unsigned limit;

	27 UScriptCode code;

	28

	29 ExpectedRun(unsigned the_limit, UScriptCode the_code)

	30 : limit(the_limit)

	31 , code(the_code)

	32 {

	33 }

	34 };

	35

	36 class MockScriptData : public ScriptData {
	eae 2015/08/28 21:10:25 Do we really need to mock out the data object? It would be nice if we could use the actual implementation and real content instead of mocking out half of the logic. As it is, this test is more of a test for the mock than the actual implementation.
	37 public:

	38 ~MockScriptData() override {}

	39

	40 static const MockScriptData* instance()

	41 {

	42 AtomicallyInitializedStaticReference(const MockScriptData, mockScriptDat a, (new MockScriptData()));

	43

	44 return &mockScriptData;

	45 }

	46

	47 virtual void getScripts(UChar32 ch, Vector<UScriptCode>& dst) const override

	48 {

	49 ASSERT(ch >= kMockCharMin);

	50 ASSERT(ch < kMockCharLimit);

	51

	52 int code = ch - kMockCharMin;

	53 dst.clear();

	54 switch (code & kCodeSpecialMask) {

	55 case kCodeSpecialCommon:

	56 dst.append(USCRIPT_COMMON);

	57 break;

	58 case kCodeSpecialInherited:

	59 dst.append(USCRIPT_INHERITED);

	60 break;

	61 default:

	62 break;

	63 }

	64 int list_bits = kTable[code & kCodeListIndexMask];

	65 if (dst.isEmpty() && list_bits == 0) {

	66 dst.append(USCRIPT_UNKNOWN);

	67 return;

	68 }

	69 while (list_bits) {

	70 switch (list_bits & kListMask) {

	71 case 0:

	72 break;

	73 case kLatin:

	74 dst.append(USCRIPT_LATIN);

	75 break;

	76 case kHan:

	77 dst.append(USCRIPT_HAN);

	78 break;

	79 case kGreek:

	80 dst.append(USCRIPT_GREEK);

	81 break;

	82 }

	83 list_bits >>= kListShift;

	84 }

	85 }

	86

	87 UChar32 getPairedBracket(UChar32 ch) const override

	88 {

	89 switch (getPairedBracketType(ch)) {

	90 case PairedBracketType::CLOSE:

	91 return ch - kBracketDelta;

	92 case PairedBracketType::OPEN:

	93 return ch + kBracketDelta;

	94 default:

	95 return ch;

	96 }

	97 }

	98

	99 PairedBracketType getPairedBracketType(UChar32 ch) const override

	100 {

	101 ASSERT(ch >= kMockCharMin && ch < kMockCharLimit);

	102 int code = ch - kMockCharMin;

	103 if ((code & kCodeBracketBit) == 0) {

	104 return PairedBracketType::NONE;

	105 }

	106 if (code & kCodeBracketCloseBit) {

	107 return PairedBracketType::CLOSE;

	108 }

	109 return PairedBracketType::OPEN;

	110 }

	111

	112 static int TableLookup(int value)

	113 {

	114 for (int i = 0; i < 16; ++i) {

	115 if (kTable[i] == value) {

	116 return i;

	117 }

	118 }

	119 WTF_LOG_ERROR("Table does not contain value 0x%x", value);

	120 return 0;

	121 }

	122

	123 static String ToTestString(const std::string& input)

	124 {

	125 String result(String::make16BitFrom8BitSource(0, 0));

	126 bool in_set = false;

	127 int seen = 0;

	128 int code = 0;

	129 int list = 0;

	130 int cur_shift = 0;

	131 for (char c : input) {

	132 if (in_set) {

	133 switch (c) {

	134 case '(':

	135 ASSERT(seen == 0);

	136 seen \|= kSawBracket;

	137 code \|= kCodeBracketBit;

	138 break;

	139 case '[':

	140 ASSERT(seen == 0);

	141 seen \|= kSawBracket;

	142 code \|= kCodeBracketBit \| kCodeSquareBracketBit;

	143 break;

	144 case ')':

	145 ASSERT(seen == 0);

	146 seen \|= kSawBracket;

	147 code \|= kCodeBracketBit \| kCodeBracketCloseBit;

	148 break;

	149 case ']':

	150 ASSERT(seen == 0);

	151 seen \|= kSawBracket;

	152 code \|= kCodeBracketBit \| kCodeSquareBracketBit \| kCodeBrack etCloseBit;

	153 break;

	154 case 'i':

	155 ASSERT(seen == 0); // brackets can't be inherited

	156 seen \|= kSawSpecial;

	157 code \|= kCodeSpecialInherited;

	158 break;

	159 case 'c':

	160 ASSERT((seen & ~kSawBracket) == 0);

	161 seen \|= kSawSpecial;

	162 code \|= kCodeSpecialCommon;

	163 break;

	164 case 'l':

	165 ASSERT((seen & kSawLatin) == 0);

	166 ASSERT(cur_shift < 3);

	167 seen \|= kSawLatin;

	168 list \|= kLatin << (2 * cur_shift++);

	169 break;

	170 case 'h':

	171 ASSERT((seen & kSawHan) == 0);

	172 ASSERT(cur_shift < 3);

	173 seen \|= kSawHan;

	174 list \|= kHan << (2 * cur_shift++);

	175 break;

	176 case 'g':

	177 ASSERT((seen & kSawGreek) == 0);

	178 ASSERT(cur_shift < 3);

	179 seen \|= kSawGreek;

	180 list \|= kGreek << (2 * cur_shift++);

	181 break;

	182 case '>':

	183 ASSERT(seen != 0);

	184 code \|= TableLookup(list);

	185 result.append(static_cast<UChar>(kMockCharMin + code));

	186 in_set = false;

	187 break;

	188 default:

	189 WTF_LOG_ERROR("Illegal mock string set char: '%c'", c);

	190 break;

	191 }

	192 continue;

	193 }

	194 // not in set

	195 switch (c) {

	196 case '<':

	197 seen = 0;

	198 code = 0;

	199 list = 0;

	200 cur_shift = 0;

	201 in_set = true;

	202 break;

	203 case '(':

	204 code = kCodeBracketBit \| kCodeSpecialCommon;

	205 break;

	206 case '[':

	207 code = kCodeBracketBit \| kCodeSquareBracketBit \| kCodeSpecialCom mon;

	208 break;

	209 case ')':

	210 code = kCodeBracketBit \| kCodeBracketCloseBit \| kCodeSpecialComm on;

	211 break;

	212 case ']':

	213 code = kCodeBracketBit \| kCodeSquareBracketBit \| kCodeBracketClo seBit \| kCodeSpecialCommon;

	214 break;

	215 case 'i':

	216 code = kCodeSpecialInherited;

	217 break;

	218 case 'c':

	219 code = kCodeSpecialCommon;

	220 break;

	221 case 'l':

	222 code = kLatin;

	223 break;

	224 case 'h':

	225 code = kHan;

	226 break;

	227 case 'g':

	228 code = kGreek;

	229 break;

	230 case '?':

	231 code = 0; // unknown

	232 break;

	233 default:

	234 WTF_LOG_ERROR("Illegal mock string set char: '%c'", c);

	235 }

	236 if (!in_set) {

	237 result.append(static_cast<UChar>(kMockCharMin + code));

	238 }

	239 }

	240 return result;

	241 }

	242

	243 static std::string MockCharString(UChar mockch)

	244 {

	245 ASSERT(mockch >= kMockCharMin && mockch < kMockCharLimit);

	246 int code = mockch - kMockCharMin;

	247

	248 // We use set notation in these cases:

	249 // - more than one of special, kLatin, kHan, kGreek

	250 // - bracket and not common (since non-set brackets are common)

	251 bool is_bracket = (code & kCodeBracketBit) != 0;

	252 bool is_special = (mockch & kCodeSpecialMask) != 0;

	253 bool is_common = (mockch & kCodeSpecialMask) == kCodeSpecialCommon;

	254 char c;

	255 if (is_bracket) {

	256 if (code & kCodeSquareBracketBit) {

	257 if (code & kCodeBracketCloseBit) {

	258 c = ']';

	259 }

	260 else {

	261 c = '[';

	262 }

	263 }

	264 else {

	265 if (code & kCodeBracketCloseBit) {

	266 c = ')';

	267 }

	268 else {

	269 c = '(';

	270 }

	271 }

	272 }

	273 else if (is_special) {

	274 c = is_common ? 'c' : 'i';

	275 }

	276 std::string result;

	277 int list_bits = kTable[code & kCodeListIndexMask];

	278 while (list_bits) {

	279 switch (list_bits & kListMask) {

	280 case 0:

	281 break;

	282 case kLatin:

	283 result += 'l';

	284 break;

	285 case kHan:

	286 result += 'h';

	287 break;

	288 case kGreek:

	289 result += 'g';

	290 break;

	291 }

	292 list_bits >>= kListShift;

	293 }

	294 bool need_set = result.length() + (is_special ? 1 : 0) > 1 \|\| (is_bracke t && (result.length() > 0 \|\| !is_common));

	295 if (need_set) {

	296 std::string set_result("<");

	297 if (is_bracket) {

	298 set_result += c;

	299 }

	300 if (is_special) {

	301 if (is_common) {

	302 set_result += "c";

	303 }

	304 else {

	305 set_result += "i";

	306 }

	307 }

	308 set_result += result;

	309 set_result += ">";

	310 return set_result;

	311 }

	312 if (is_bracket \|\| is_special) {

	313 result = c;

	314 }

	315 return result;

	316 }

	317

	318 // we determine properties based on the offset from kMockCharMin

	319 // bits 0-3 represent the list of l, h, g scripts (index into kTable)

	320 // bits 4-5 mean: 0 plain, 1 common, 2 inherited, 3 illegal

	321 // bit 6 clear means open bracket, set means close bracket

	322 // bit 7 clear means non-bracket, set means bracket

	323 // bit 8 clear means paren, set means square bracket

	324 // if it's a bracket, the matching bracket is 64 code points away

	325

	326 static const UChar32 kMockCharMin = 0xe000;

	327 static const UChar32 kMockCharLimit = kMockCharMin + 0x200;

	328 static const int kLatin = 1;

	329 static const int kHan = 2;

	330 static const int kGreek = 3;

	331 static const int kCodeListIndexMask = 0xf;

	332 static const int kCodeSpecialMask = 0x30;

	333 static const int kCodeSpecialCommon = 0x10;

	334 static const int kCodeSpecialInherited = 0x20;

	335 static const int kCodeBracketCloseBit = 0x40;

	336 static const int kCodeBracketBit = 0x80;

	337 static const int kCodeSquareBracketBit = 0x100;

	338 static const int kListShift = 2;

	339 static const int kListMask = 0x3;

	340 static const int kBracketDelta = kCodeBracketCloseBit;

	341 static const int kTable[16];

	342

	343 static const int kSawBracket = 0x1;

	344 static const int kSawSpecial = 0x2;

	345 static const int kSawLatin = 0x4;

	346 static const int kSawHan = 0x8;

	347 static const int kSawGreek = 0x10;

	348 };

	349

	350 static constexpr int kLatin2 = MockScriptData::kLatin << 2;

	351 static constexpr int kHan2 = MockScriptData::kHan << 2;

	352 static constexpr int kGreek2 = MockScriptData::kGreek << 2;

	353 static constexpr int kLatin3 = MockScriptData::kLatin << 4;

	354 static constexpr int kHan3 = MockScriptData::kHan << 4;

	355 static constexpr int kGreek3 = MockScriptData::kGreek << 4;

	356 const int MockScriptData::kTable[] = {

	357 0, kLatin, kHan, kGreek,

	358 kLatin2 + kHan, kLatin2 + kGreek,

	359 kHan2 + kLatin, kHan2 + kGreek,

	360 kGreek2 + kLatin, kGreek2 + kHan,

	361 kLatin3 + kHan2 + kGreek, kLatin3 + kGreek2 + kHan,

	362 kHan3 + kLatin2 + kGreek, kHan3 + kGreek2 + kLatin,

	363 kGreek3 + kLatin2 + kHan, kGreek3 + kHan2 + kLatin,

	364 };

	365

	366 class ScriptRunIteratorTest : public testing::Test {

	367 protected:

	368 void CheckRuns(const std::vector<TestRun>& runs)

	369 {

	370 String text(String::make16BitFrom8BitSource(0, 0));

	371 std::vector<ExpectedRun> expect;

	372 for (auto& run : runs) {

	373 text.append(String::fromUTF8(run.text.c_str()));

	374 expect.push_back(ExpectedRun(text.length(), run.code));

	375 }

	376 ScriptRunIterator scriptRunIterator(text.characters16(), text.length());

	377 VerifyRuns(&scriptRunIterator, expect);

	378 }

	379

	380 void CheckMockRuns(const std::vector<TestRun>& runs)

	381 {

	382 String text(String::make16BitFrom8BitSource(0, 0));

	383 std::vector<ExpectedRun> expect;

	384 for (const TestRun& run : runs) {

	385 text.append(MockScriptData::ToTestString(run.text));

	386 expect.push_back({ text.length(), run.code });

	387 }

	388

	389 ScriptRunIterator scriptRunIterator(text.characters16(), text.length(),

	390 MockScriptData::instance());

	391 VerifyRuns(&scriptRunIterator, expect);

	392 }

	393

	394 void VerifyRuns(ScriptRunIterator* scriptRunIterator,

	395 const std::vector<ExpectedRun>& expect)

	396 {

	397 unsigned limit;

	398 UScriptCode code;

	399 unsigned long run_count = 0;

	400 while (scriptRunIterator->consume(limit, code)) {

	401 ASSERT_LT(run_count, expect.size());

	402 ASSERT_EQ(expect[run_count].limit, limit);

	403 ASSERT_EQ(expect[run_count].code, code);

	404 ++run_count;

	405 }

	406 WTF_LOG_ERROR("Expected %ld runs, got %lu ", expect.size(), run_count);

	407 ASSERT_EQ(expect.size(), run_count);

	408 }

	409 };

	410

	411 TEST_F(ScriptRunIteratorTest, Empty)

	412 {

	413 String empty(String::make16BitFrom8BitSource(0, 0));

	414 ScriptRunIterator scriptRunIterator(empty.characters16(), empty.length());

	415 unsigned limit;

	416 UScriptCode code;

	417 ASSERT(!scriptRunIterator.consume(limit, code));

	418 }

	419

	420 // Some of our compilers cannot initialize a vector from an array yet.

	421 #define DECLARE_RUNSVECTOR(...) \

	422 static const TestRun runsArray[] = __VA_ARGS__; \

	423 std::vector<TestRun> runs(runsArray, runsArray + sizeof(runsArray) / sizeof( *runsArray));

	424

	425 #define CHECK_RUNS(...) \

	426 DECLARE_RUNSVECTOR(__VA_ARGS__); \

	427 CheckRuns(runs);

	428

	429 #define CHECK_MOCK_RUNS(...) \

	430 DECLARE_RUNSVECTOR(__VA_ARGS__); \

	431 CheckMockRuns(runs);

	432

	433 TEST_F(ScriptRunIteratorTest, Whitespace)

	434 {

	435 CHECK_RUNS({ { " \t ", USCRIPT_COMMON } });

	436 }

	437

	438 TEST_F(ScriptRunIteratorTest, Common)

	439 {

	440 CHECK_RUNS({ { " ... !?", USCRIPT_COMMON } });

	441 }

	442

	443 TEST_F(ScriptRunIteratorTest, Latin)

	444 {

	445 CHECK_RUNS({ { "latin", USCRIPT_LATIN } });

	446 }

	447

	448 TEST_F(ScriptRunIteratorTest, Chinese)

	449 {

	450 CHECK_RUNS({ { "萬國碼", USCRIPT_HAN } });

	451 }

	452

	453 // Close bracket without matching open is ignored

	454 TEST_F(ScriptRunIteratorTest, UnbalancedParens1)

	455 {

	456 CHECK_RUNS({ { "(萬", USCRIPT_HAN },

	457 { "a]", USCRIPT_LATIN },

	458 { ")", USCRIPT_HAN } });

	459 }

	460

	461 // Open bracket without matching close is popped when inside

	462 // matching close brackets, so doesn't match later close.

	463 TEST_F(ScriptRunIteratorTest, UnbalancedParens2)

	464 {

	465 CHECK_RUNS({ { "(萬", USCRIPT_HAN },

	466 { "a[", USCRIPT_LATIN },

	467 { ")]", USCRIPT_HAN } });

	468 }

	469

	470 // space goes with leading script

	471 TEST_F(ScriptRunIteratorTest, LatinHan)

	472 {

	473 CHECK_RUNS({ { "Unicode ", USCRIPT_LATIN },

	474 { "萬國碼", USCRIPT_HAN } });

	475 }

	476

	477 // space goes with leading script

	478 TEST_F(ScriptRunIteratorTest, HanLatin)

	479 {

	480 CHECK_RUNS({ { "萬國碼 ", USCRIPT_HAN },

	481 { "Unicode", USCRIPT_LATIN } });

	482 }

	483

	484 TEST_F(ScriptRunIteratorTest, ParenEmptyParen)

	485 {

	486 CHECK_RUNS({ { "()", USCRIPT_COMMON } });

	487 }

	488

	489 TEST_F(ScriptRunIteratorTest, ParenChineseParen)

	490 {

	491 CHECK_RUNS({ { "(萬國碼)", USCRIPT_HAN } });

	492 }

	493

	494 TEST_F(ScriptRunIteratorTest, ParenLatinParen)

	495 {

	496 CHECK_RUNS({ { "(Unicode)", USCRIPT_LATIN } });

	497 }

	498

	499 // open paren gets leading script

	500 TEST_F(ScriptRunIteratorTest, LatinParenChineseParen)

	501 {

	502 CHECK_RUNS({ { "Unicode (", USCRIPT_LATIN },

	503 { "萬國碼", USCRIPT_HAN },

	504 { ")", USCRIPT_LATIN } });

	505 }

	506

	507 // open paren gets first trailing script if no leading script

	508 TEST_F(ScriptRunIteratorTest, ParenChineseParenLatin)

	509 {

	510 CHECK_RUNS({ { "(萬國碼) ", USCRIPT_HAN },

	511 { "Unicode", USCRIPT_LATIN } });

	512 }

	513

	514 // leading common and open paren get first trailing script.

	515 // TODO(dougfelt): we don't do quote matching, but probably should figure out

	516 // something better than doing nothing.

	517 TEST_F(ScriptRunIteratorTest, QuoteParenChineseParenLatinQuote)

	518 {

	519 CHECK_RUNS({ { "\"(萬國碼) ", USCRIPT_HAN },

	520 { "Unicode\"", USCRIPT_LATIN } });

	521 }

	522

	523 // Unmatched close brace gets leading context

	524 TEST_F(ScriptRunIteratorTest, UnmatchedClose)

	525 {

	526 CHECK_RUNS({ { "Unicode (", USCRIPT_LATIN },

	527 { "萬國碼] ", USCRIPT_HAN },

	528 { ") Unicode\"", USCRIPT_LATIN } });

	529 }

	530

	531 // Match up to 32 bracket pairs

	532 TEST_F(ScriptRunIteratorTest, Match32Brackets)

	533 {

	534 CHECK_RUNS({ { "[萬國碼 ", USCRIPT_HAN },

	535 { "Unicode (((((((((((((((((((((((((((((((!"

	536 ")))))))))))))))))))))))))))))))",

	537 USCRIPT_LATIN },

	538 { "]", USCRIPT_HAN } });

	539 }

	540

	541 // Matches 32 most recent bracket pairs. More than that, and we revert to

	542 // surrounding script.

	543 TEST_F(ScriptRunIteratorTest, Match32MostRecentBrackets)

	544 {

	545 CHECK_RUNS({ { "((([萬國碼 ", USCRIPT_HAN },

	546 { "Unicode (((((((((((((((((((((((((((((((", USCRIPT_LATIN },

	547 { "萬國碼!", USCRIPT_HAN },

	548 { ")))))))))))))))))))))))))))))))", USCRIPT_LATIN },

	549 { "]", USCRIPT_HAN },

	550 { "But )))", USCRIPT_LATIN } });

	551 }

	552

	553 // A char with multiple scripts that match both leading and trailing context

	554 // gets the leading context.

	555 TEST_F(ScriptRunIteratorTest, ExtensionsPreferLeadingContext)

	556 {

	557 CHECK_MOCK_RUNS({ { "h<lh>", USCRIPT_HAN },

	558 { "l", USCRIPT_LATIN } });

	559 }

	560

	561 // A char with multiple scripts that only match trailing context gets the

	562 // trailing context.

	563 TEST_F(ScriptRunIteratorTest, ExtensionsMatchTrailingContext)

	564 {

	565 CHECK_MOCK_RUNS({ { "h", USCRIPT_HAN },

	566 { "<gl>l", USCRIPT_LATIN } });

	567 }

	568

	569 // Retain first established priority script. <lhg><gh> produce the script <gh>

	570 // with g as priority, because of the two priority scripts l and g, only g

	571 // remains. Then <gh><hgl> retains g as priority, because of the two priority

	572 // scripts g and h that remain, g was encountered first.

	573 TEST_F(ScriptRunIteratorTest, ExtensionsRetainFirstPriorityScript)

	574 {

	575 CHECK_MOCK_RUNS({ { "<lhg><gh><hgl>", USCRIPT_GREEK } });

	576 }

	577

	578 // Parens can have scripts that break script runs.

	579 TEST_F(ScriptRunIteratorTest, ExtensionsParens)

	580 {

	581 CHECK_MOCK_RUNS({ { "<gl><(lg>", USCRIPT_GREEK },

	582 { "h<[hl>", USCRIPT_HAN },

	583 { "l", USCRIPT_LATIN },

	584 { "<]hl>", USCRIPT_HAN },

	585 { "<)lg>", USCRIPT_GREEK } });

	586 }

	587

	588 // The close paren might be encountered before we've established the open

	589 // paren's script, but when this is the case the current set is still valid, so

	590 // this doesn't affect it nor break the run.

	591 TEST_F(ScriptRunIteratorTest, ExtensionsParens2)

	592 {

	593 CHECK_MOCK_RUNS({ { "<(lhg><gh><)lhg>", USCRIPT_GREEK } });

	594 }

	595

	596 // A common script with a single extension should be treated as common, but

	597 // with the extended script as a default. If we encounter anything other than

	598 // common, that takes priority. If we encounter other common scripts with a

	599 // single extension, the current priority remains.

	600 TEST_F(ScriptRunIteratorTest, CommonWithPriority)

	601 {

	602 CHECK_MOCK_RUNS({ { "<ch>", USCRIPT_HAN } });

	603 }

	604

	605 TEST_F(ScriptRunIteratorTest, CommonWithPriority2)

	606 {

	607 CHECK_MOCK_RUNS({ { "<ch><lh>", USCRIPT_LATIN } });

	608 }

	609

	610 TEST_F(ScriptRunIteratorTest, CommonWithPriority3)

	611 {

	612 CHECK_MOCK_RUNS({ { "<ch><cl><cg>", USCRIPT_HAN } });

	613 }

	614

	615 // UDatta is inherited with LATIN and DEVANAGARI extensions. Since it has

	616 // LATIN, and the dotted circle is COMMON and has adopted the preceding LATIN,

	617 // it gets the LATIN. This is standard.

	618 TEST_F(ScriptRunIteratorTest, LatinDottedCircleUdatta)

	619 {

	620 CHECK_RUNS({ { "Latin \u25cc\u0951", USCRIPT_LATIN } });

	621 }

	622

	623 // In this situation, UDatta doesn't share a script with the value inherited by

	624 // the dotted circle. It captures the preceding dotted circle and breaks it

	625 // from the run it would normally have been in.

	626 TEST_F(ScriptRunIteratorTest, HanDottedCircleUdatta)

	627 {

	628 CHECK_RUNS({ { "萬國碼 ", USCRIPT_HAN },

	629 { "\u25cc\u0951", USCRIPT_DEVANAGARI } });

	630 }

	631

	632 // Tatweel is \u0640 Lm, Fathatan is \u064b Mn. The script of tatweel is

	633 // common, that of Fathatan is inherited. The script extensions for Fathatan

	634 // are Arabic and Syriac. The Syriac script is 34 in ICU, Arabic is 2. So the

	635 // preferred script for Fathatan is Arabic, according to Behdad's

	636 // heuristic. This is exactly analogous to the Udatta tests above, except

	637 // Tatweel is Lm. But we don't take properties into account, only scripts.

	638 TEST_F(ScriptRunIteratorTest, LatinTatweelFathatan)

	639 {

	640 CHECK_RUNS({ { "Latin ", USCRIPT_LATIN },

	641 { "\u0640\u064b", USCRIPT_ARABIC } });

	642 }

	643

	644 // Another case where if the mark accepts a script that was inherited by the

	645 // preceding common-script character, they both continue in that script.

	646 TEST_F(ScriptRunIteratorTest, SyriacTatweelFathatan)

	647 {

	648 CHECK_RUNS({ { "\u0722\u0640\u064b", USCRIPT_SYRIAC } });

	649 }

	650

	651 // The Udatta is inherited, so will share runs with anything that is not

	652 // common.

	653 TEST_F(ScriptRunIteratorTest, HanUdatta)

	654 {

	655 CHECK_RUNS({ { "萬國碼\u0951", USCRIPT_HAN } });

	656 }

	657

	658 // The Udatta is inherited, and will capture the space and turn it into

	659 // Devanagari.

	660 TEST_F(ScriptRunIteratorTest, HanSpaceUdatta)

	661 {

	662 CHECK_RUNS({ { "萬國碼", USCRIPT_HAN },

	663 { " \u0951", USCRIPT_DEVANAGARI } });

	664 }

	665

	666 // Make sure Mock code works too.

	667 TEST_F(ScriptRunIteratorTest, MockHanInheritedGL)

	668 {

	669 CHECK_MOCK_RUNS({ { "h<igl>", USCRIPT_HAN } });

	670 }

	671

	672 TEST_F(ScriptRunIteratorTest, MockHanCommonInheritedGL)

	673 {

	674 CHECK_MOCK_RUNS({ { "h", USCRIPT_HAN },

	675 { "c<igl>", USCRIPT_GREEK } });

	676 }

	677

	678 // Leading inherited just act like common, except there's no preferred script.

	679 TEST_F(ScriptRunIteratorTest, MockLeadingInherited)

	680 {

	681 CHECK_MOCK_RUNS({ { "<igl>", USCRIPT_COMMON } });

	682 }

	683

	684 // Leading inherited just act like common, except there's no preferred script.

	685 TEST_F(ScriptRunIteratorTest, MockLeadingInherited2)

	686 {

	687 CHECK_MOCK_RUNS({ { "<igl><ih>", USCRIPT_COMMON } });

	688 }

	689

	690 TEST_F(ScriptRunIteratorTest, LeadingInheritedHan)

	691 {

	692 CHECK_RUNS({ { "\u0951萬國碼", USCRIPT_HAN } });

	693 }

	694

	695 TEST_F(ScriptRunIteratorTest, LeadingInheritedHan2)

	696 {

	697 CHECK_RUNS({ { "\u0951\u064b萬國碼", USCRIPT_HAN } });

	698 }

	699

	700 TEST_F(ScriptRunIteratorTest, OddLatinString)

	701 {

	702 CHECK_RUNS({ { "ç̈", USCRIPT_LATIN } });

	703 }

	704

	705 class ScriptRunIteratorICUDataTest : public testing::Test {

	706 public:

	707 ScriptRunIteratorICUDataTest()

	708 : max_extensions_(0)

	709 , max_extensions_cp_(0xffff)

	710 {

	711 int max_extensions = 0;

	712 UChar32 max_extensions_cp = 0;

	713 for (UChar32 cp = 0; cp < 0x11000; ++cp) {

	714 UErrorCode status = U_ZERO_ERROR;

	715 int count = uscript_getScriptExtensions(cp, NULL, 0, &status);

	716 if (count > max_extensions) {

	717 max_extensions = count;

	718 max_extensions_cp = cp;

	719 }

	720 if (count > ScriptData::kMaxScriptCount) {

	721 }

	722 }

	723 max_extensions_ = max_extensions;

	724 max_extensions_cp_ = max_extensions_cp;

	725 }

	726

	727 protected:

	728 UChar32 GetACharWithMaxExtensions(int* num_extensions)

	729 {

	730 if (num_extensions) {

	731 *num_extensions = max_extensions_;

	732 }

	733 return max_extensions_cp_;

	734 }

	735

	736 private:

	737 int max_extensions_;

	738 UChar32 max_extensions_cp_;

	739 };

	740

	741 // Validate that ICU never returns more than our maximum expected number of

	742 // script extensions.

	743 TEST_F(ScriptRunIteratorICUDataTest, ValidateICUMaxScriptExtensions)

	744 {

	745 int max_extensions;

	746 UChar32 cp = GetACharWithMaxExtensions(&max_extensions);

	747 ASSERT_LE(max_extensions, ScriptData::kMaxScriptCount)

	748 << "char " << std::hex << cp << std::dec;

	749 }

	750

	751 // Check that ICUScriptData returns all of a character's scripts.

	752 // This only checks one likely character, but doesn't check all cases.

	753 TEST_F(ScriptRunIteratorICUDataTest, ICUDataGetScriptsReturnsAllExtensions)

	754 {

	755 int max_extensions;

	756 UChar32 cp = GetACharWithMaxExtensions(&max_extensions);

	757 Vector<UScriptCode> extensions;

	758 ICUScriptData::instance()->getScripts(cp, extensions);

	759

	760 // It's possible that GetScripts adds the primary script to the list of

	761 // extensions, resulting in one more script than the raw extension count.

	762 ASSERT_GE(static_cast<int>(extensions.size()), max_extensions)

	763 << "char " << std::hex << cp << std::dec;

	764 }

	765

	766 TEST_F(ScriptRunIteratorICUDataTest, CommonHaveNoMoreThanOneExtension)

	767 {

	768 Vector<UScriptCode> extensions;

	769 for (UChar32 cp = 0; cp < 0x110000; ++cp) {

	770 ICUScriptData::instance()->getScripts(cp, extensions);

	771 UScriptCode primary = extensions.at(0);

	772 if (primary == USCRIPT_COMMON) {

	773 ASSERT_LE(extensions.size(), 2ul)

	774 << "cp: " << std::hex << cp << std::dec;

	775 }

	776 }

	777 }

	778

	779 // ZWJ is \u200D Cf (Format, other) and its script is inherited. I'm going to

	780 // ignore this for now, as I think it shouldn't matter which run it ends up

	781 // in. HarfBuzz needs to be able to use it as context and shape each

	782 // neighboring character appropriately no matter what run it got assigned to.

	783

	784 } // namespace blink

OLD	NEW

« Source/platform/fonts/ScriptRunIterator.cpp ('K') | « Source/platform/fonts/ScriptRunIterator.cpp ('k') | no next file » | no next file with comments »