Source/platform/fonts/ScriptRunIteratorTest.cpp - Issue 1323513006: Upstream ScriptRunIterator for segmenting text runs by script

Side by Side Diff: Source/platform/fonts/ScriptRunIteratorTest.cpp

Issue 1323513006: Upstream ScriptRunIterator for segmenting text runs by script (Closed) Base URL: svn://svn.chromium.org/blink/trunk

Patch Set: Hex escapes seem to be the safest cross-platform bet Created 5 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
(Empty)
	1 // Copyright 2015 The Chromium Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4

	5 #include "config.h"

	6 #include "platform/fonts/ScriptRunIterator.h"

	7

	8 #include "platform/Logging.h"

	9 #include "wtf/Assertions.h"

	10 #include "wtf/Threading.h"

	11 #include "wtf/text/WTFString.h"

	12

	13 #include <gtest/gtest.h>

	14 #include <string>

	15 #include <vector>

	16

	17 namespace blink {

	18

	19 struct TestRun {

	20 std::string text;

	21 UScriptCode code;

	22 };

	23

	24 struct ExpectedRun {

	25 unsigned limit;

	26 UScriptCode code;

	27

	28 ExpectedRun(unsigned the_limit, UScriptCode the_code)

	29 : limit(the_limit)

	30 , code(the_code)

	31 {

	32 }

	33 };

	34

	35 class MockScriptData : public ScriptData {

	36 public:

	37 ~MockScriptData() override {}

	38

	39 static const MockScriptData* instance()

	40 {

	41 AtomicallyInitializedStaticReference(const MockScriptData, mockScriptDat a, (new MockScriptData()));

	42

	43 return &mockScriptData;

	44 }

	45

	46 void getScripts(UChar32 ch, Vector<UScriptCode>& dst) const override

	47 {

	48 ASSERT(ch >= kMockCharMin);

	49 ASSERT(ch < kMockCharLimit);

	50

	51 int code = ch - kMockCharMin;

	52 dst.clear();

	53 switch (code & kCodeSpecialMask) {

	54 case kCodeSpecialCommon:

	55 dst.append(USCRIPT_COMMON);

	56 break;

	57 case kCodeSpecialInherited:

	58 dst.append(USCRIPT_INHERITED);

	59 break;

	60 default:

	61 break;

	62 }

	63 int listBits = kTable[code & kCodeListIndexMask];

	64 if (dst.isEmpty() && listBits == 0) {

	65 dst.append(USCRIPT_UNKNOWN);

	66 return;

	67 }

	68 while (listBits) {

	69 switch (listBits & kListMask) {

	70 case 0:

	71 break;

	72 case kLatin:

	73 dst.append(USCRIPT_LATIN);

	74 break;

	75 case kHan:

	76 dst.append(USCRIPT_HAN);

	77 break;

	78 case kGreek:

	79 dst.append(USCRIPT_GREEK);

	80 break;

	81 }

	82 listBits >>= kListShift;

	83 }

	84 }

	85

	86 UChar32 getPairedBracket(UChar32 ch) const override

	87 {

	88 switch (getPairedBracketType(ch)) {

	89 case PairedBracketType::BracketTypeClose:

	90 return ch - kBracketDelta;

	91 case PairedBracketType::BracketTypeOpen:

	92 return ch + kBracketDelta;

	93 default:

	94 return ch;

	95 }

	96 }

	97

	98 PairedBracketType getPairedBracketType(UChar32 ch) const override

	99 {

	100 ASSERT(ch >= kMockCharMin && ch < kMockCharLimit);

	101 int code = ch - kMockCharMin;

	102 if ((code & kCodeBracketBit) == 0) {

	103 return PairedBracketType::BracketTypeNone;

	104 }

	105 if (code & kCodeBracketCloseBit) {

	106 return PairedBracketType::BracketTypeClose;

	107 }

	108 return PairedBracketType::BracketTypeOpen;

	109 }

	110

	111 static int TableLookup(int value)

	112 {

	113 for (int i = 0; i < 16; ++i) {

	114 if (kTable[i] == value) {

	115 return i;

	116 }

	117 }

	118 WTF_LOG_ERROR("Table does not contain value 0x%x", value);

	119 return 0;

	120 }

	121

	122 static String ToTestString(const std::string& input)

	123 {

	124 String result(String::make16BitFrom8BitSource(0, 0));

	125 bool inSet = false;

	126 int seen = 0;

	127 int code = 0;

	128 int list = 0;

	129 int currentShift = 0;

	130 for (char c : input) {

	131 if (inSet) {

	132 switch (c) {

	133 case '(':

	134 ASSERT(seen == 0);

	135 seen \|= kSawBracket;

	136 code \|= kCodeBracketBit;

	137 break;

	138 case '[':

	139 ASSERT(seen == 0);

	140 seen \|= kSawBracket;

	141 code \|= kCodeBracketBit \| kCodeSquareBracketBit;

	142 break;

	143 case ')':

	144 ASSERT(seen == 0);

	145 seen \|= kSawBracket;

	146 code \|= kCodeBracketBit \| kCodeBracketCloseBit;

	147 break;

	148 case ']':

	149 ASSERT(seen == 0);

	150 seen \|= kSawBracket;

	151 code \|= kCodeBracketBit \| kCodeSquareBracketBit \| kCodeBrack etCloseBit;

	152 break;

	153 case 'i':

	154 ASSERT(seen == 0); // brackets can't be inherited

	155 seen \|= kSawSpecial;

	156 code \|= kCodeSpecialInherited;

	157 break;

	158 case 'c':

	159 ASSERT((seen & ~kSawBracket) == 0);

	160 seen \|= kSawSpecial;

	161 code \|= kCodeSpecialCommon;

	162 break;

	163 case 'l':

	164 ASSERT((seen & kSawLatin) == 0);

	165 ASSERT(currentShift < 3);

	166 seen \|= kSawLatin;

	167 list \|= kLatin << (2 * currentShift++);

	168 break;

	169 case 'h':

	170 ASSERT((seen & kSawHan) == 0);

	171 ASSERT(currentShift < 3);

	172 seen \|= kSawHan;

	173 list \|= kHan << (2 * currentShift++);

	174 break;

	175 case 'g':

	176 ASSERT((seen & kSawGreek) == 0);

	177 ASSERT(currentShift < 3);

	178 seen \|= kSawGreek;

	179 list \|= kGreek << (2 * currentShift++);

	180 break;

	181 case '>':

	182 ASSERT(seen != 0);

	183 code \|= TableLookup(list);

	184 result.append(static_cast<UChar>(kMockCharMin + code));

	185 inSet = false;

	186 break;

	187 default:

	188 WTF_LOG_ERROR("Illegal mock string set char: '%c'", c);

	189 break;

	190 }

	191 continue;

	192 }

	193 // not in set

	194 switch (c) {

	195 case '<':

	196 seen = 0;

	197 code = 0;

	198 list = 0;

	199 currentShift = 0;

	200 inSet = true;

	201 break;

	202 case '(':

	203 code = kCodeBracketBit \| kCodeSpecialCommon;

	204 break;

	205 case '[':

	206 code = kCodeBracketBit \| kCodeSquareBracketBit \| kCodeSpecialCom mon;

	207 break;

	208 case ')':

	209 code = kCodeBracketBit \| kCodeBracketCloseBit \| kCodeSpecialComm on;

	210 break;

	211 case ']':

	212 code = kCodeBracketBit \| kCodeSquareBracketBit \| kCodeBracketClo seBit \| kCodeSpecialCommon;

	213 break;

	214 case 'i':

	215 code = kCodeSpecialInherited;

	216 break;

	217 case 'c':

	218 code = kCodeSpecialCommon;

	219 break;

	220 case 'l':

	221 code = kLatin;

	222 break;

	223 case 'h':

	224 code = kHan;

	225 break;

	226 case 'g':

	227 code = kGreek;

	228 break;

	229 case '?':

	230 code = 0; // unknown

	231 break;

	232 default:

	233 WTF_LOG_ERROR("Illegal mock string set char: '%c'", c);

	234 }

	235 if (!inSet) {

	236 result.append(static_cast<UChar>(kMockCharMin + code));

	237 }

	238 }

	239 return result;

	240 }

	241

	242 static std::string MockCharString(UChar mockch)

	243 {

	244 ASSERT(mockch >= kMockCharMin && mockch < kMockCharLimit);

	245 int code = mockch - kMockCharMin;

	246

	247 // We use set notation in these cases:

	248 // - more than one of special, kLatin, kHan, kGreek

	249 // - bracket and not common (since non-set brackets are common)

	250 bool isBracket = (code & kCodeBracketBit) != 0;

	251 bool isSpecial = (mockch & kCodeSpecialMask) != 0;

	252 bool isCommon = (mockch & kCodeSpecialMask) == kCodeSpecialCommon;

	253 char c;

	254 if (isBracket) {

	255 if (code & kCodeSquareBracketBit) {

	256 if (code & kCodeBracketCloseBit) {

	257 c = ']';

	258 } else {

	259 c = '[';

	260 }

	261 } else {

	262 if (code & kCodeBracketCloseBit) {

	263 c = ')';

	264 } else {

	265 c = '(';

	266 }

	267 }

	268 } else if (isSpecial) {

	269 c = isCommon ? 'c' : 'i';

	270 }

	271 std::string result;

	272 int listBits = kTable[code & kCodeListIndexMask];

	273 while (listBits) {

	274 switch (listBits & kListMask) {

	275 case 0:

	276 break;

	277 case kLatin:

	278 result += 'l';

	279 break;

	280 case kHan:

	281 result += 'h';

	282 break;

	283 case kGreek:

	284 result += 'g';

	285 break;

	286 }

	287 listBits >>= kListShift;

	288 }

	289 bool needSet = result.length() + (isSpecial ? 1 : 0) > 1 \|\| (isBracket & & (result.length() > 0 \|\| !isCommon));

	290 if (needSet) {

	291 std::string setResult("<");

	292 if (isBracket) {

	293 setResult += c;

	294 }

	295 if (isSpecial) {

	296 if (isCommon) {

	297 setResult += "c";

	298 } else {

	299 setResult += "i";

	300 }

	301 }

	302 setResult += result;

	303 setResult += ">";

	304 return setResult;

	305 }

	306 if (isBracket \|\| isSpecial) {

	307 result = c;

	308 }

	309 return result;

	310 }

	311

	312 // We determine properties based on the offset from kMockCharMin:

	313 // bits 0-3 represent the list of l, h, c scripts (index into table)

	314 // bit 4-5 means: 0 plain, 1 common, 2 inherited, 3 illegal

	315 // bit 6 clear means non-bracket, open means bracket

	316 // bit 7 clear means open bracket, set means close bracket

	317 // bit 8 clear means paren, set means bracket

	318 // if it's a bracket, the matching bracket is 64 code points away

	319 static const UChar32 kMockCharMin = 0xe000;

	320 static const UChar32 kMockCharLimit = kMockCharMin + 0x200;

	321 static const int kLatin = 1;

	322 static const int kHan = 2;

	323 static const int kGreek = 3;

	324 static const int kCodeListIndexMask = 0xf;

	325 static const int kCodeSpecialMask = 0x30;

	326 static const int kCodeSpecialCommon = 0x10;

	327 static const int kCodeSpecialInherited = 0x20;

	328 static const int kCodeBracketCloseBit = 0x40;

	329 static const int kCodeBracketBit = 0x80;

	330 static const int kCodeSquareBracketBit = 0x100;

	331 static const int kListShift = 2;

	332 static const int kListMask = 0x3;

	333 static const int kBracketDelta = kCodeBracketCloseBit;

	334 static const int kTable[16];

	335

	336 static const int kSawBracket = 0x1;

	337 static const int kSawSpecial = 0x2;

	338 static const int kSawLatin = 0x4;

	339 static const int kSawHan = 0x8;

	340 static const int kSawGreek = 0x10;

	341 };

	342

	343 static const int kLatin2 = MockScriptData::kLatin << 2;

	344 static const int kHan2 = MockScriptData::kHan << 2;

	345 static const int kGreek2 = MockScriptData::kGreek << 2;

	346 static const int kLatin3 = MockScriptData::kLatin << 4;

	347 static const int kHan3 = MockScriptData::kHan << 4;

	348 static const int kGreek3 = MockScriptData::kGreek << 4;

	349 const int MockScriptData::kTable[] = {

	350 0, kLatin, kHan, kGreek,

	351 kLatin2 + kHan, kLatin2 + kGreek,

	352 kHan2 + kLatin, kHan2 + kGreek,

	353 kGreek2 + kLatin, kGreek2 + kHan,

	354 kLatin3 + kHan2 + kGreek, kLatin3 + kGreek2 + kHan,

	355 kHan3 + kLatin2 + kGreek, kHan3 + kGreek2 + kLatin,

	356 kGreek3 + kLatin2 + kHan, kGreek3 + kHan2 + kLatin,

	357 };

	358

	359 class ScriptRunIteratorTest : public testing::Test {

	360 protected:

	361 void CheckRuns(const std::vector<TestRun>& runs)

	362 {

	363 String text(String::make16BitFrom8BitSource(0, 0));

	364 std::vector<ExpectedRun> expect;

	365 for (auto& run : runs) {

	366 text.append(String::fromUTF8(run.text.c_str()));

	367 expect.push_back(ExpectedRun(text.length(), run.code));

	368 }

	369 ScriptRunIterator scriptRunIterator(text.characters16(), text.length());

	370 VerifyRuns(&scriptRunIterator, expect);

	371 }

	372

	373 void CheckMockRuns(const std::vector<TestRun>& runs)

	374 {

	375 String text(String::make16BitFrom8BitSource(0, 0));

	376 std::vector<ExpectedRun> expect;

	377 for (const TestRun& run : runs) {

	378 text.append(MockScriptData::ToTestString(run.text));

	379 expect.push_back({ text.length(), run.code });

	380 }

	381

	382 ScriptRunIterator scriptRunIterator(text.characters16(), text.length(),

	383 MockScriptData::instance());

	384 VerifyRuns(&scriptRunIterator, expect);

	385 }

	386

	387 void VerifyRuns(ScriptRunIterator* scriptRunIterator,

	388 const std::vector<ExpectedRun>& expect)

	389 {

	390 unsigned limit;

	391 UScriptCode code;

	392 unsigned long runCount = 0;

	393 while (scriptRunIterator->consume(limit, code)) {

	394 ASSERT_LT(runCount, expect.size());

	395 ASSERT_EQ(expect[runCount].limit, limit);

	396 ASSERT_EQ(expect[runCount].code, code);

	397 ++runCount;

	398 }

	399 WTF_LOG_ERROR("Expected %zu runs, got %lu ", expect.size(), runCount);

	400 ASSERT_EQ(expect.size(), runCount);

	401 }

	402 };

	403

	404 TEST_F(ScriptRunIteratorTest, Empty)

	405 {

	406 String empty(String::make16BitFrom8BitSource(0, 0));

	407 ScriptRunIterator scriptRunIterator(empty.characters16(), empty.length());

	408 unsigned limit = 0;

	409 UScriptCode code = USCRIPT_INVALID_CODE;

	410 ASSERT(!scriptRunIterator.consume(limit, code));

	411 ASSERT_EQ(limit, 0u);

	412 ASSERT_EQ(code, USCRIPT_INVALID_CODE);

	413 }

	414

	// Some of our compilers cannot initialize a vector from an array yet, so the
	// braced run list is first stored in a static array named |runsArray| and then
	// copied, element by element, into a vector named |runs|.
	#define DECLARE_RUNSVECTOR(...)                     \
	    static const TestRun runsArray[] = __VA_ARGS__; \
	    std::vector<TestRun> runs(runsArray, runsArray + sizeof(runsArray) / sizeof(runsArray[0]));

	// Declares |runs| from the braced list and checks it against real script data.
	#define CHECK_RUNS(...)              \
	    DECLARE_RUNSVECTOR(__VA_ARGS__); \
	    CheckRuns(runs);

	// Declares |runs| from the braced list and checks it against the mock data.
	#define CHECK_MOCK_RUNS(...)         \
	    DECLARE_RUNSVECTOR(__VA_ARGS__); \
	    CheckMockRuns(runs);

	427

	428 TEST_F(ScriptRunIteratorTest, Whitespace)

	429 {

	430 CHECK_RUNS({ { " \t ", USCRIPT_COMMON } });

	431 }

	432

	433 TEST_F(ScriptRunIteratorTest, Common)

	434 {

	435 CHECK_RUNS({ { " ... !?", USCRIPT_COMMON } });

	436 }

	437

	438 TEST_F(ScriptRunIteratorTest, Latin)

	439 {

	440 CHECK_RUNS({ { "latin", USCRIPT_LATIN } });

	441 }

	442

	443 TEST_F(ScriptRunIteratorTest, Chinese)

	444 {

	445 CHECK_RUNS({ { "萬國碼", USCRIPT_HAN } });

	446 }

	447

	448 // Close bracket without matching open is ignored

	449 TEST_F(ScriptRunIteratorTest, UnbalancedParens1)

	450 {

	451 CHECK_RUNS({ { "(萬", USCRIPT_HAN },

	452 { "a]", USCRIPT_LATIN },

	453 { ")", USCRIPT_HAN } });

	454 }

	455

	456 // Open bracket without matching close is popped when inside

	457 // matching close brackets, so doesn't match later close.

	458 TEST_F(ScriptRunIteratorTest, UnbalancedParens2)

	459 {

	460 CHECK_RUNS({ { "(萬", USCRIPT_HAN },

	461 { "a[", USCRIPT_LATIN },

	462 { ")]", USCRIPT_HAN } });

	463 }

	464

	465 // space goes with leading script

	466 TEST_F(ScriptRunIteratorTest, LatinHan)

	467 {

	468 CHECK_RUNS({ { "Unicode ", USCRIPT_LATIN },

	469 { "萬國碼", USCRIPT_HAN } });

	470 }

	471

	472 // space goes with leading script

	473 TEST_F(ScriptRunIteratorTest, HanLatin)

	474 {

	475 CHECK_RUNS({ { "萬國碼 ", USCRIPT_HAN },

	476 { "Unicode", USCRIPT_LATIN } });

	477 }

	478

	479 TEST_F(ScriptRunIteratorTest, ParenEmptyParen)

	480 {

	481 CHECK_RUNS({ { "()", USCRIPT_COMMON } });

	482 }

	483

	484 TEST_F(ScriptRunIteratorTest, ParenChineseParen)

	485 {

	486 CHECK_RUNS({ { "(萬國碼)", USCRIPT_HAN } });

	487 }

	488

	489 TEST_F(ScriptRunIteratorTest, ParenLatinParen)

	490 {

	491 CHECK_RUNS({ { "(Unicode)", USCRIPT_LATIN } });

	492 }

	493

	494 // open paren gets leading script

	495 TEST_F(ScriptRunIteratorTest, LatinParenChineseParen)

	496 {

	497 CHECK_RUNS({ { "Unicode (", USCRIPT_LATIN },

	498 { "萬國碼", USCRIPT_HAN },

	499 { ")", USCRIPT_LATIN } });

	500 }

	501

	502 // open paren gets first trailing script if no leading script

	503 TEST_F(ScriptRunIteratorTest, ParenChineseParenLatin)

	504 {

	505 CHECK_RUNS({ { "(萬國碼) ", USCRIPT_HAN },

	506 { "Unicode", USCRIPT_LATIN } });

	507 }

	508

	509 // leading common and open paren get first trailing script.

	510 // TODO(dougfelt): we don't do quote matching, but probably should figure out

	511 // something better than doing nothing.

	512 TEST_F(ScriptRunIteratorTest, QuoteParenChineseParenLatinQuote)

	513 {

	514 CHECK_RUNS({ { "\"(萬國碼) ", USCRIPT_HAN },

	515 { "Unicode\"", USCRIPT_LATIN } });

	516 }

	517

	518 // Unmatched close brace gets leading context

	519 TEST_F(ScriptRunIteratorTest, UnmatchedClose)

	520 {

	521 CHECK_RUNS({ { "Unicode (", USCRIPT_LATIN },

	522 { "萬國碼] ", USCRIPT_HAN },

	523 { ") Unicode\"", USCRIPT_LATIN } });

	524 }

	525

	526 // Match up to 32 bracket pairs

	527 TEST_F(ScriptRunIteratorTest, Match32Brackets)

	528 {

	529 CHECK_RUNS({ { "[萬國碼 ", USCRIPT_HAN },

	530 { "Unicode (((((((((((((((((((((((((((((((!"

	531 ")))))))))))))))))))))))))))))))",

	532 USCRIPT_LATIN },

	533 { "]", USCRIPT_HAN } });

	534 }

	535

	536 // Matches 32 most recent bracket pairs. More than that, and we revert to

	537 // surrounding script.

	538 TEST_F(ScriptRunIteratorTest, Match32MostRecentBrackets)

	539 {

	540 CHECK_RUNS({ { "((([萬國碼 ", USCRIPT_HAN },

	541 { "Unicode (((((((((((((((((((((((((((((((", USCRIPT_LATIN },

	542 { "萬國碼!", USCRIPT_HAN },

	543 { ")))))))))))))))))))))))))))))))", USCRIPT_LATIN },

	544 { "]", USCRIPT_HAN },

	545 { "But )))", USCRIPT_LATIN } });

	546 }

	547

	548 // A char with multiple scripts that match both leading and trailing context

	549 // gets the leading context.

	550 TEST_F(ScriptRunIteratorTest, ExtensionsPreferLeadingContext)

	551 {

	552 CHECK_MOCK_RUNS({ { "h<lh>", USCRIPT_HAN },

	553 { "l", USCRIPT_LATIN } });

	554 }

	555

	556 // A char with multiple scripts that only match trailing context gets the

	557 // trailing context.

	558 TEST_F(ScriptRunIteratorTest, ExtensionsMatchTrailingContext)

	559 {

	560 CHECK_MOCK_RUNS({ { "h", USCRIPT_HAN },

	561 { "<gl>l", USCRIPT_LATIN } });

	562 }

	563

	564 // Retain first established priority script. <lhg><gh> produce the script <gh>

	565 // with g as priority, because of the two priority scripts l and g, only g

	566 // remains. Then <gh><hgl> retains g as priority, because of the two priority

	567 // scripts g and h that remain, g was encountered first.

	568 TEST_F(ScriptRunIteratorTest, ExtensionsRetainFirstPriorityScript)

	569 {

	570 CHECK_MOCK_RUNS({ { "<lhg><gh><hgl>", USCRIPT_GREEK } });

	571 }

	572

	573 // Parens can have scripts that break script runs.

	574 TEST_F(ScriptRunIteratorTest, ExtensionsParens)

	575 {

	576 CHECK_MOCK_RUNS({ { "<gl><(lg>", USCRIPT_GREEK },

	577 { "h<[hl>", USCRIPT_HAN },

	578 { "l", USCRIPT_LATIN },

	579 { "<]hl>", USCRIPT_HAN },

	580 { "<)lg>", USCRIPT_GREEK } });

	581 }

	582

	583 // The close paren might be encountered before we've established the open

	584 // paren's script, but when this is the case the current set is still valid, so

	585 // this doesn't affect it nor break the run.

	586 TEST_F(ScriptRunIteratorTest, ExtensionsParens2)

	587 {

	588 CHECK_MOCK_RUNS({ { "<(lhg><gh><)lhg>", USCRIPT_GREEK } });

	589 }

	590

	591 // A common script with a single extension should be treated as common, but

	592 // with the extended script as a default. If we encounter anything other than

	593 // common, that takes priority. If we encounter other common scripts with a

	594 // single extension, the current priority remains.

	595 TEST_F(ScriptRunIteratorTest, CommonWithPriority)

	596 {

	597 CHECK_MOCK_RUNS({ { "<ch>", USCRIPT_HAN } });

	598 }

	599

	600 TEST_F(ScriptRunIteratorTest, CommonWithPriority2)

	601 {

	602 CHECK_MOCK_RUNS({ { "<ch><lh>", USCRIPT_LATIN } });

	603 }

	604

	605 TEST_F(ScriptRunIteratorTest, CommonWithPriority3)

	606 {

	607 CHECK_MOCK_RUNS({ { "<ch><cl><cg>", USCRIPT_HAN } });

	608 }

	609

	610 // UDatta (\xE0\xA5\x91) is inherited with LATIN and DEVANAGARI extensions.

	611 // Since it has LATIN, and the dotted circle (\xE2\x97\x8C) is COMMON and has

	612 // adopted the preceding LATIN, it gets the LATIN. This is standard.

	613 TEST_F(ScriptRunIteratorTest, LatinDottedCircleUdatta)

	614 {

	615 CHECK_RUNS({ { "Latin \xE2\x97\x8C\xE0\xA5\x91", USCRIPT_LATIN } });

	616 }

	617

	618 // In this situation, UDatta (\xE0\xA5\x91) doesn't share a script with the

	619 // value inherited by the dotted circle (\xE2\x97\x8C). It captures the

	620 // preceding dotted circle and breaks it from the run it would normally have

	621 // been in.

	622 TEST_F(ScriptRunIteratorTest, HanDottedCircleUdatta)

	623 {

	624 CHECK_RUNS({ { "萬國碼 ", USCRIPT_HAN },

	625 { "\xE2\x97\x8C\xE0\xA5\x91", USCRIPT_DEVANAGARI } });

	626 }

	627

	628 // Tatweel is \xD9\x80 Lm, Fathatan is \xD9\x8B Mn. The script of tatweel is

	629 // common, that of Fathatan is inherited. The script extensions for Fathatan

	630 // are Arabic and Syriac. The Syriac script is 34 in ICU, Arabic is 2. So the

	631 // preferred script for Fathatan is Arabic, according to Behdad's

	632 // heuristic. This is exactly analogous to the Udatta tests above, except

	633 // Tatweel is Lm. But we don't take properties into account, only scripts.

	634 TEST_F(ScriptRunIteratorTest, LatinTatweelFathatan)

	635 {

	636 CHECK_RUNS({ { "Latin ", USCRIPT_LATIN },

	637 { "\xD9\x80\xD9\x8B", USCRIPT_ARABIC } });

	638 }

	639

	640 // Another case where if the mark accepts a script that was inherited by the

	641 // preceding common-script character, they both continue in that script.

	642 // SYRIAC LETTER NUN \xDC\xA2

	643 // ARABIC TATWEEL \xD9\x80

	644 // ARABIC FATHATAN \xD9\x8B

	645 TEST_F(ScriptRunIteratorTest, SyriacTatweelFathatan)

	646 {

	647 CHECK_RUNS({ { "\xDC\xA2\xD9\x80\xD9\x8B", USCRIPT_SYRIAC } });

	648 }

	649

	650 // The Udatta (\xE0\xA5\x91) is inherited, so will share runs with anything that

	651 // is not common.

	652 TEST_F(ScriptRunIteratorTest, HanUdatta)

	653 {

	654 CHECK_RUNS({ { "萬國碼\xE0\xA5\x91", USCRIPT_HAN } });

	655 }

	656

	657 // The Udatta (\xE0\xA5\x91) is inherited, and will capture the space and turn

	658 // it into Devanagari.

	659 TEST_F(ScriptRunIteratorTest, HanSpaceUdatta)

	660 {

	661 CHECK_RUNS({ { "萬國碼", USCRIPT_HAN },

	662 { " \xE0\xA5\x91", USCRIPT_DEVANAGARI } });

	663 }

	664

	665 // Make sure Mock code works too.

	666 TEST_F(ScriptRunIteratorTest, MockHanInheritedGL)

	667 {

	668 CHECK_MOCK_RUNS({ { "h<igl>", USCRIPT_HAN } });

	669 }

	670

	671 TEST_F(ScriptRunIteratorTest, MockHanCommonInheritedGL)

	672 {

	673 CHECK_MOCK_RUNS({ { "h", USCRIPT_HAN },

	674 { "c<igl>", USCRIPT_GREEK } });

	675 }

	676

	677 // A leading inherited character just acts like common, except there's no preferred script.

	678 TEST_F(ScriptRunIteratorTest, MockLeadingInherited)

	679 {

	680 CHECK_MOCK_RUNS({ { "<igl>", USCRIPT_COMMON } });

	681 }

	682

	683 // Leading inherited characters just act like common, except there's no preferred script.

	684 TEST_F(ScriptRunIteratorTest, MockLeadingInherited2)

	685 {

	686 CHECK_MOCK_RUNS({ { "<igl><ih>", USCRIPT_COMMON } });

	687 }

	688

	689 TEST_F(ScriptRunIteratorTest, LeadingInheritedHan)

	690 {

	691 // DEVANAGARI STRESS SIGN UDATTA \xE0\xA5\x91

	692 CHECK_RUNS({ { "\xE0\xA5\x91萬國碼", USCRIPT_HAN } });

	693 }

	694

	695 TEST_F(ScriptRunIteratorTest, LeadingInheritedHan2)

	696 {

	697 // DEVANAGARI STRESS SIGN UDATTA \xE0\xA5\x91

	698 // ARABIC FATHATAN \xD9\x8B

	699 CHECK_RUNS({ { "\xE0\xA5\x91\xD9\x8B萬國碼", USCRIPT_HAN } });

	700 }

	701

	702 TEST_F(ScriptRunIteratorTest, OddLatinString)

	703 {

	704 CHECK_RUNS({ { "ç̈", USCRIPT_LATIN } });

	705 }

	706

	707 class ScriptRunIteratorICUDataTest : public testing::Test {

	708 public:

	709 ScriptRunIteratorICUDataTest()

	710 : m_maxExtensions(0)

	711 , m_maxExtensionsCodepoint(0xffff)

	712 {

	713 int maxExtensions = 0;

	714 UChar32 m_maxExtensionscp = 0;

	715 for (UChar32 cp = 0; cp < 0x11000; ++cp) {

	716 UErrorCode status = U_ZERO_ERROR;

	717 int count = uscript_getScriptExtensions(cp, 0, 0, &status);

	718 if (count > maxExtensions) {

	719 maxExtensions = count;

	720 m_maxExtensionscp = cp;

	721 }

	722 if (count > ScriptData::kMaxScriptCount) {

	723 }

	724 }

	725 m_maxExtensions = maxExtensions;

	726 m_maxExtensionsCodepoint = m_maxExtensionscp;

	727 }

	728

	729 protected:

	730 UChar32 GetACharWithMaxExtensions(int* numExtensions)

	731 {

	732 if (numExtensions) {

	733 *numExtensions = m_maxExtensions;

	734 }

	735 return m_maxExtensionsCodepoint;

	736 }

	737

	738 private:

	739 int m_maxExtensions;

	740 UChar32 m_maxExtensionsCodepoint;

	741 };

	742

	743 // Validate that ICU never returns more than our maximum expected number of

	744 // script extensions.

	745 TEST_F(ScriptRunIteratorICUDataTest, ValidateICUMaxScriptExtensions)

	746 {

	747 int maxExtensions;

	748 UChar32 cp = GetACharWithMaxExtensions(&maxExtensions);

	749 ASSERT_LE(maxExtensions, ScriptData::kMaxScriptCount)

	750 << "char " << std::hex << cp << std::dec;

	751 }

	752

	753 // Check that ICUScriptData returns all of a character's scripts.

	754 // This only checks one likely character, but doesn't check all cases.

	755 TEST_F(ScriptRunIteratorICUDataTest, ICUDataGetScriptsReturnsAllExtensions)

	756 {

	757 int maxExtensions;

	758 UChar32 cp = GetACharWithMaxExtensions(&maxExtensions);

	759 Vector<UScriptCode> extensions;

	760 ICUScriptData::instance()->getScripts(cp, extensions);

	761

	762 // It's possible that GetScripts adds the primary script to the list of

	763 // extensions, resulting in one more script than the raw extension count.

	764 ASSERT_GE(static_cast<int>(extensions.size()), maxExtensions)

	765 << "char " << std::hex << cp << std::dec;

	766 }

	767

	768 TEST_F(ScriptRunIteratorICUDataTest, CommonHaveNoMoreThanOneExtension)

	769 {

	770 Vector<UScriptCode> extensions;

	771 for (UChar32 cp = 0; cp < 0x110000; ++cp) {

	772 ICUScriptData::instance()->getScripts(cp, extensions);

	773 UScriptCode primary = extensions.at(0);

	774 if (primary == USCRIPT_COMMON) {

	775 ASSERT_LE(extensions.size(), 2ul)

	776 << "cp: " << std::hex << cp << std::dec;

	777 }

	778 }

	779 }

	780

	781 // ZWJ is \u200D Cf (Format, other) and its script is inherited. I'm going to

	782 // ignore this for now, as I think it shouldn't matter which run it ends up

	783 // in. HarfBuzz needs to be able to use it as context and shape each

	784 // neighboring character appropriately no matter what run it got assigned to.

	785

	786 } // namespace blink

OLD	NEW

« Source/platform/fonts/ScriptRunIterator.cpp ('K') | « Source/platform/fonts/ScriptRunIterator.cpp ('k') | no next file » | no next file with comments »