OLD | NEW |
(Empty) | |
| 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 |
| 5 #include "config.h" |
| 6 #include "platform/fonts/ScriptRunIterator.h" |
| 7 |
| 8 #include "platform/Logging.h" |
| 9 #include "wtf/Assertions.h" |
| 10 #include "wtf/Threading.h" |
| 11 #include "wtf/text/WTFString.h" |
| 12 |
| 13 #include <gtest/gtest.h> |
| 14 #include <string> |
| 15 #include <vector> |
| 16 |
| 17 namespace blink { |
| 18 |
| 19 struct TestRun { |
| 20 std::string text; |
| 21 UScriptCode code; |
| 22 }; |
| 23 |
| 24 struct ExpectedRun { |
| 25 unsigned limit; |
| 26 UScriptCode code; |
| 27 |
| 28 ExpectedRun(unsigned the_limit, UScriptCode the_code) |
| 29 : limit(the_limit) |
| 30 , code(the_code) |
| 31 { |
| 32 } |
| 33 }; |
| 34 |
| 35 class MockScriptData : public ScriptData { |
| 36 public: |
| 37 ~MockScriptData() override {} |
| 38 |
| 39 static const MockScriptData* instance() |
| 40 { |
| 41 AtomicallyInitializedStaticReference(const MockScriptData, mockScriptDat
a, (new MockScriptData())); |
| 42 |
| 43 return &mockScriptData; |
| 44 } |
| 45 |
| 46 void getScripts(UChar32 ch, Vector<UScriptCode>& dst) const override |
| 47 { |
| 48 ASSERT(ch >= kMockCharMin); |
| 49 ASSERT(ch < kMockCharLimit); |
| 50 |
| 51 int code = ch - kMockCharMin; |
| 52 dst.clear(); |
| 53 switch (code & kCodeSpecialMask) { |
| 54 case kCodeSpecialCommon: |
| 55 dst.append(USCRIPT_COMMON); |
| 56 break; |
| 57 case kCodeSpecialInherited: |
| 58 dst.append(USCRIPT_INHERITED); |
| 59 break; |
| 60 default: |
| 61 break; |
| 62 } |
| 63 int listBits = kTable[code & kCodeListIndexMask]; |
| 64 if (dst.isEmpty() && listBits == 0) { |
| 65 dst.append(USCRIPT_UNKNOWN); |
| 66 return; |
| 67 } |
| 68 while (listBits) { |
| 69 switch (listBits & kListMask) { |
| 70 case 0: |
| 71 break; |
| 72 case kLatin: |
| 73 dst.append(USCRIPT_LATIN); |
| 74 break; |
| 75 case kHan: |
| 76 dst.append(USCRIPT_HAN); |
| 77 break; |
| 78 case kGreek: |
| 79 dst.append(USCRIPT_GREEK); |
| 80 break; |
| 81 } |
| 82 listBits >>= kListShift; |
| 83 } |
| 84 } |
| 85 |
| 86 UChar32 getPairedBracket(UChar32 ch) const override |
| 87 { |
| 88 switch (getPairedBracketType(ch)) { |
| 89 case PairedBracketType::BracketTypeClose: |
| 90 return ch - kBracketDelta; |
| 91 case PairedBracketType::BracketTypeOpen: |
| 92 return ch + kBracketDelta; |
| 93 default: |
| 94 return ch; |
| 95 } |
| 96 } |
| 97 |
| 98 PairedBracketType getPairedBracketType(UChar32 ch) const override |
| 99 { |
| 100 ASSERT(ch >= kMockCharMin && ch < kMockCharLimit); |
| 101 int code = ch - kMockCharMin; |
| 102 if ((code & kCodeBracketBit) == 0) { |
| 103 return PairedBracketType::BracketTypeNone; |
| 104 } |
| 105 if (code & kCodeBracketCloseBit) { |
| 106 return PairedBracketType::BracketTypeClose; |
| 107 } |
| 108 return PairedBracketType::BracketTypeOpen; |
| 109 } |
| 110 |
| 111 static int TableLookup(int value) |
| 112 { |
| 113 for (int i = 0; i < 16; ++i) { |
| 114 if (kTable[i] == value) { |
| 115 return i; |
| 116 } |
| 117 } |
| 118 WTF_LOG_ERROR("Table does not contain value 0x%x", value); |
| 119 return 0; |
| 120 } |
| 121 |
| 122 static String ToTestString(const std::string& input) |
| 123 { |
| 124 String result(String::make16BitFrom8BitSource(0, 0)); |
| 125 bool inSet = false; |
| 126 int seen = 0; |
| 127 int code = 0; |
| 128 int list = 0; |
| 129 int currentShift = 0; |
| 130 for (char c : input) { |
| 131 if (inSet) { |
| 132 switch (c) { |
| 133 case '(': |
| 134 ASSERT(seen == 0); |
| 135 seen |= kSawBracket; |
| 136 code |= kCodeBracketBit; |
| 137 break; |
| 138 case '[': |
| 139 ASSERT(seen == 0); |
| 140 seen |= kSawBracket; |
| 141 code |= kCodeBracketBit | kCodeSquareBracketBit; |
| 142 break; |
| 143 case ')': |
| 144 ASSERT(seen == 0); |
| 145 seen |= kSawBracket; |
| 146 code |= kCodeBracketBit | kCodeBracketCloseBit; |
| 147 break; |
| 148 case ']': |
| 149 ASSERT(seen == 0); |
| 150 seen |= kSawBracket; |
| 151 code |= kCodeBracketBit | kCodeSquareBracketBit | kCodeBrack
etCloseBit; |
| 152 break; |
| 153 case 'i': |
| 154 ASSERT(seen == 0); // brackets can't be inherited |
| 155 seen |= kSawSpecial; |
| 156 code |= kCodeSpecialInherited; |
| 157 break; |
| 158 case 'c': |
| 159 ASSERT((seen & ~kSawBracket) == 0); |
| 160 seen |= kSawSpecial; |
| 161 code |= kCodeSpecialCommon; |
| 162 break; |
| 163 case 'l': |
| 164 ASSERT((seen & kSawLatin) == 0); |
| 165 ASSERT(currentShift < 3); |
| 166 seen |= kSawLatin; |
| 167 list |= kLatin << (2 * currentShift++); |
| 168 break; |
| 169 case 'h': |
| 170 ASSERT((seen & kSawHan) == 0); |
| 171 ASSERT(currentShift < 3); |
| 172 seen |= kSawHan; |
| 173 list |= kHan << (2 * currentShift++); |
| 174 break; |
| 175 case 'g': |
| 176 ASSERT((seen & kSawGreek) == 0); |
| 177 ASSERT(currentShift < 3); |
| 178 seen |= kSawGreek; |
| 179 list |= kGreek << (2 * currentShift++); |
| 180 break; |
| 181 case '>': |
| 182 ASSERT(seen != 0); |
| 183 code |= TableLookup(list); |
| 184 result.append(static_cast<UChar>(kMockCharMin + code)); |
| 185 inSet = false; |
| 186 break; |
| 187 default: |
| 188 WTF_LOG_ERROR("Illegal mock string set char: '%c'", c); |
| 189 break; |
| 190 } |
| 191 continue; |
| 192 } |
| 193 // not in set |
| 194 switch (c) { |
| 195 case '<': |
| 196 seen = 0; |
| 197 code = 0; |
| 198 list = 0; |
| 199 currentShift = 0; |
| 200 inSet = true; |
| 201 break; |
| 202 case '(': |
| 203 code = kCodeBracketBit | kCodeSpecialCommon; |
| 204 break; |
| 205 case '[': |
| 206 code = kCodeBracketBit | kCodeSquareBracketBit | kCodeSpecialCom
mon; |
| 207 break; |
| 208 case ')': |
| 209 code = kCodeBracketBit | kCodeBracketCloseBit | kCodeSpecialComm
on; |
| 210 break; |
| 211 case ']': |
| 212 code = kCodeBracketBit | kCodeSquareBracketBit | kCodeBracketClo
seBit | kCodeSpecialCommon; |
| 213 break; |
| 214 case 'i': |
| 215 code = kCodeSpecialInherited; |
| 216 break; |
| 217 case 'c': |
| 218 code = kCodeSpecialCommon; |
| 219 break; |
| 220 case 'l': |
| 221 code = kLatin; |
| 222 break; |
| 223 case 'h': |
| 224 code = kHan; |
| 225 break; |
| 226 case 'g': |
| 227 code = kGreek; |
| 228 break; |
| 229 case '?': |
| 230 code = 0; // unknown |
| 231 break; |
| 232 default: |
| 233 WTF_LOG_ERROR("Illegal mock string set char: '%c'", c); |
| 234 } |
| 235 if (!inSet) { |
| 236 result.append(static_cast<UChar>(kMockCharMin + code)); |
| 237 } |
| 238 } |
| 239 return result; |
| 240 } |
| 241 |
| 242 static std::string MockCharString(UChar mockch) |
| 243 { |
| 244 ASSERT(mockch >= kMockCharMin && mockch < kMockCharLimit); |
| 245 int code = mockch - kMockCharMin; |
| 246 |
| 247 // We use set notation in these cases: |
| 248 // - more than one of special, kLatin, kHan, kGreek |
| 249 // - bracket and not common (since non-set brackets are common) |
| 250 bool isBracket = (code & kCodeBracketBit) != 0; |
| 251 bool isSpecial = (mockch & kCodeSpecialMask) != 0; |
| 252 bool isCommon = (mockch & kCodeSpecialMask) == kCodeSpecialCommon; |
| 253 char c; |
| 254 if (isBracket) { |
| 255 if (code & kCodeSquareBracketBit) { |
| 256 if (code & kCodeBracketCloseBit) { |
| 257 c = ']'; |
| 258 } else { |
| 259 c = '['; |
| 260 } |
| 261 } else { |
| 262 if (code & kCodeBracketCloseBit) { |
| 263 c = ')'; |
| 264 } else { |
| 265 c = '('; |
| 266 } |
| 267 } |
| 268 } else if (isSpecial) { |
| 269 c = isCommon ? 'c' : 'i'; |
| 270 } |
| 271 std::string result; |
| 272 int listBits = kTable[code & kCodeListIndexMask]; |
| 273 while (listBits) { |
| 274 switch (listBits & kListMask) { |
| 275 case 0: |
| 276 break; |
| 277 case kLatin: |
| 278 result += 'l'; |
| 279 break; |
| 280 case kHan: |
| 281 result += 'h'; |
| 282 break; |
| 283 case kGreek: |
| 284 result += 'g'; |
| 285 break; |
| 286 } |
| 287 listBits >>= kListShift; |
| 288 } |
| 289 bool needSet = result.length() + (isSpecial ? 1 : 0) > 1 || (isBracket &
& (result.length() > 0 || !isCommon)); |
| 290 if (needSet) { |
| 291 std::string setResult("<"); |
| 292 if (isBracket) { |
| 293 setResult += c; |
| 294 } |
| 295 if (isSpecial) { |
| 296 if (isCommon) { |
| 297 setResult += "c"; |
| 298 } else { |
| 299 setResult += "i"; |
| 300 } |
| 301 } |
| 302 setResult += result; |
| 303 setResult += ">"; |
| 304 return setResult; |
| 305 } |
| 306 if (isBracket || isSpecial) { |
| 307 result = c; |
| 308 } |
| 309 return result; |
| 310 } |
| 311 |
| 312 // We determine properties based on the offset from kMockCharMin: |
| 313 // bits 0-3 represent the list of l, h, c scripts (index into table) |
| 314 // bit 4-5 means: 0 plain, 1 common, 2 inherited, 3 illegal |
| 315 // bit 6 clear means non-bracket, open means bracket |
| 316 // bit 7 clear means open bracket, set means close bracket |
| 317 // bit 8 clear means paren, set means bracket |
| 318 // if it's a bracket, the matching bracket is 64 code points away |
| 319 static const UChar32 kMockCharMin = 0xe000; |
| 320 static const UChar32 kMockCharLimit = kMockCharMin + 0x200; |
| 321 static const int kLatin = 1; |
| 322 static const int kHan = 2; |
| 323 static const int kGreek = 3; |
| 324 static const int kCodeListIndexMask = 0xf; |
| 325 static const int kCodeSpecialMask = 0x30; |
| 326 static const int kCodeSpecialCommon = 0x10; |
| 327 static const int kCodeSpecialInherited = 0x20; |
| 328 static const int kCodeBracketCloseBit = 0x40; |
| 329 static const int kCodeBracketBit = 0x80; |
| 330 static const int kCodeSquareBracketBit = 0x100; |
| 331 static const int kListShift = 2; |
| 332 static const int kListMask = 0x3; |
| 333 static const int kBracketDelta = kCodeBracketCloseBit; |
| 334 static const int kTable[16]; |
| 335 |
| 336 static const int kSawBracket = 0x1; |
| 337 static const int kSawSpecial = 0x2; |
| 338 static const int kSawLatin = 0x4; |
| 339 static const int kSawHan = 0x8; |
| 340 static const int kSawGreek = 0x10; |
| 341 }; |
| 342 |
| 343 static const int kLatin2 = MockScriptData::kLatin << 2; |
| 344 static const int kHan2 = MockScriptData::kHan << 2; |
| 345 static const int kGreek2 = MockScriptData::kGreek << 2; |
| 346 static const int kLatin3 = MockScriptData::kLatin << 4; |
| 347 static const int kHan3 = MockScriptData::kHan << 4; |
| 348 static const int kGreek3 = MockScriptData::kGreek << 4; |
| 349 const int MockScriptData::kTable[] = { |
| 350 0, kLatin, kHan, kGreek, |
| 351 kLatin2 + kHan, kLatin2 + kGreek, |
| 352 kHan2 + kLatin, kHan2 + kGreek, |
| 353 kGreek2 + kLatin, kGreek2 + kHan, |
| 354 kLatin3 + kHan2 + kGreek, kLatin3 + kGreek2 + kHan, |
| 355 kHan3 + kLatin2 + kGreek, kHan3 + kGreek2 + kLatin, |
| 356 kGreek3 + kLatin2 + kHan, kGreek3 + kHan2 + kLatin, |
| 357 }; |
| 358 |
| 359 class ScriptRunIteratorTest : public testing::Test { |
| 360 protected: |
| 361 void CheckRuns(const std::vector<TestRun>& runs) |
| 362 { |
| 363 String text(String::make16BitFrom8BitSource(0, 0)); |
| 364 std::vector<ExpectedRun> expect; |
| 365 for (auto& run : runs) { |
| 366 text.append(String::fromUTF8(run.text.c_str())); |
| 367 expect.push_back(ExpectedRun(text.length(), run.code)); |
| 368 } |
| 369 ScriptRunIterator scriptRunIterator(text.characters16(), text.length()); |
| 370 VerifyRuns(&scriptRunIterator, expect); |
| 371 } |
| 372 |
| 373 void CheckMockRuns(const std::vector<TestRun>& runs) |
| 374 { |
| 375 String text(String::make16BitFrom8BitSource(0, 0)); |
| 376 std::vector<ExpectedRun> expect; |
| 377 for (const TestRun& run : runs) { |
| 378 text.append(MockScriptData::ToTestString(run.text)); |
| 379 expect.push_back({ text.length(), run.code }); |
| 380 } |
| 381 |
| 382 ScriptRunIterator scriptRunIterator(text.characters16(), text.length(), |
| 383 MockScriptData::instance()); |
| 384 VerifyRuns(&scriptRunIterator, expect); |
| 385 } |
| 386 |
| 387 void VerifyRuns(ScriptRunIterator* scriptRunIterator, |
| 388 const std::vector<ExpectedRun>& expect) |
| 389 { |
| 390 unsigned limit; |
| 391 UScriptCode code; |
| 392 unsigned long runCount = 0; |
| 393 while (scriptRunIterator->consume(limit, code)) { |
| 394 ASSERT_LT(runCount, expect.size()); |
| 395 ASSERT_EQ(expect[runCount].limit, limit); |
| 396 ASSERT_EQ(expect[runCount].code, code); |
| 397 ++runCount; |
| 398 } |
| 399 WTF_LOG_ERROR("Expected %zu runs, got %lu ", expect.size(), runCount); |
| 400 ASSERT_EQ(expect.size(), runCount); |
| 401 } |
| 402 }; |
| 403 |
| 404 TEST_F(ScriptRunIteratorTest, Empty) |
| 405 { |
| 406 String empty(String::make16BitFrom8BitSource(0, 0)); |
| 407 ScriptRunIterator scriptRunIterator(empty.characters16(), empty.length()); |
| 408 unsigned limit = 0; |
| 409 UScriptCode code = USCRIPT_INVALID_CODE; |
| 410 ASSERT(!scriptRunIterator.consume(limit, code)); |
| 411 ASSERT_EQ(limit, 0u); |
| 412 ASSERT_EQ(code, USCRIPT_INVALID_CODE); |
| 413 } |
| 414 |
// Some of our compilers cannot initialize a vector from an array yet, so the
// braced run list is first stored in a static array and then copied into a
// std::vector named |runs|.
#define DECLARE_RUNSVECTOR(...) \
    static const TestRun runsArray[] = __VA_ARGS__; \
    std::vector<TestRun> runs(runsArray, runsArray + sizeof(runsArray) / sizeof(runsArray[0]));

// Declares |runs| from the braced list and passes it to CheckRuns().
#define CHECK_RUNS(...) \
    DECLARE_RUNSVECTOR(__VA_ARGS__); \
    CheckRuns(runs);

// Declares |runs| from the braced list and passes it to CheckMockRuns().
#define CHECK_MOCK_RUNS(...) \
    DECLARE_RUNSVECTOR(__VA_ARGS__); \
    CheckMockRuns(runs);
| 427 |
| 428 TEST_F(ScriptRunIteratorTest, Whitespace) |
| 429 { |
| 430 CHECK_RUNS({ { " \t ", USCRIPT_COMMON } }); |
| 431 } |
| 432 |
| 433 TEST_F(ScriptRunIteratorTest, Common) |
| 434 { |
| 435 CHECK_RUNS({ { " ... !?", USCRIPT_COMMON } }); |
| 436 } |
| 437 |
| 438 TEST_F(ScriptRunIteratorTest, Latin) |
| 439 { |
| 440 CHECK_RUNS({ { "latin", USCRIPT_LATIN } }); |
| 441 } |
| 442 |
| 443 TEST_F(ScriptRunIteratorTest, Chinese) |
| 444 { |
| 445 CHECK_RUNS({ { "萬國碼", USCRIPT_HAN } }); |
| 446 } |
| 447 |
| 448 // Close bracket without matching open is ignored |
| 449 TEST_F(ScriptRunIteratorTest, UnbalancedParens1) |
| 450 { |
| 451 CHECK_RUNS({ { "(萬", USCRIPT_HAN }, |
| 452 { "a]", USCRIPT_LATIN }, |
| 453 { ")", USCRIPT_HAN } }); |
| 454 } |
| 455 |
| 456 // Open bracket without matching close is popped when inside |
| 457 // matching close brackets, so doesn't match later close. |
| 458 TEST_F(ScriptRunIteratorTest, UnbalancedParens2) |
| 459 { |
| 460 CHECK_RUNS({ { "(萬", USCRIPT_HAN }, |
| 461 { "a[", USCRIPT_LATIN }, |
| 462 { ")]", USCRIPT_HAN } }); |
| 463 } |
| 464 |
| 465 // space goes with leading script |
| 466 TEST_F(ScriptRunIteratorTest, LatinHan) |
| 467 { |
| 468 CHECK_RUNS({ { "Unicode ", USCRIPT_LATIN }, |
| 469 { "萬國碼", USCRIPT_HAN } }); |
| 470 } |
| 471 |
| 472 // space goes with leading script |
| 473 TEST_F(ScriptRunIteratorTest, HanLatin) |
| 474 { |
| 475 CHECK_RUNS({ { "萬國碼 ", USCRIPT_HAN }, |
| 476 { "Unicode", USCRIPT_LATIN } }); |
| 477 } |
| 478 |
| 479 TEST_F(ScriptRunIteratorTest, ParenEmptyParen) |
| 480 { |
| 481 CHECK_RUNS({ { "()", USCRIPT_COMMON } }); |
| 482 } |
| 483 |
| 484 TEST_F(ScriptRunIteratorTest, ParenChineseParen) |
| 485 { |
| 486 CHECK_RUNS({ { "(萬國碼)", USCRIPT_HAN } }); |
| 487 } |
| 488 |
| 489 TEST_F(ScriptRunIteratorTest, ParenLatinParen) |
| 490 { |
| 491 CHECK_RUNS({ { "(Unicode)", USCRIPT_LATIN } }); |
| 492 } |
| 493 |
| 494 // open paren gets leading script |
| 495 TEST_F(ScriptRunIteratorTest, LatinParenChineseParen) |
| 496 { |
| 497 CHECK_RUNS({ { "Unicode (", USCRIPT_LATIN }, |
| 498 { "萬國碼", USCRIPT_HAN }, |
| 499 { ")", USCRIPT_LATIN } }); |
| 500 } |
| 501 |
| 502 // open paren gets first trailing script if no leading script |
| 503 TEST_F(ScriptRunIteratorTest, ParenChineseParenLatin) |
| 504 { |
| 505 CHECK_RUNS({ { "(萬國碼) ", USCRIPT_HAN }, |
| 506 { "Unicode", USCRIPT_LATIN } }); |
| 507 } |
| 508 |
| 509 // leading common and open paren get first trailing script. |
| 510 // TODO(dougfelt): we don't do quote matching, but probably should figure out |
| 511 // something better then doing nothing. |
| 512 TEST_F(ScriptRunIteratorTest, QuoteParenChineseParenLatinQuote) |
| 513 { |
| 514 CHECK_RUNS({ { "\"(萬國碼) ", USCRIPT_HAN }, |
| 515 { "Unicode\"", USCRIPT_LATIN } }); |
| 516 } |
| 517 |
| 518 // Unmatched close brace gets leading context |
| 519 TEST_F(ScriptRunIteratorTest, UnmatchedClose) |
| 520 { |
| 521 CHECK_RUNS({ { "Unicode (", USCRIPT_LATIN }, |
| 522 { "萬國碼] ", USCRIPT_HAN }, |
| 523 { ") Unicode\"", USCRIPT_LATIN } }); |
| 524 } |
| 525 |
| 526 // Match up to 32 bracket pairs |
| 527 TEST_F(ScriptRunIteratorTest, Match32Brackets) |
| 528 { |
| 529 CHECK_RUNS({ { "[萬國碼 ", USCRIPT_HAN }, |
| 530 { "Unicode (((((((((((((((((((((((((((((((!" |
| 531 ")))))))))))))))))))))))))))))))", |
| 532 USCRIPT_LATIN }, |
| 533 { "]", USCRIPT_HAN } }); |
| 534 } |
| 535 |
| 536 // Matches 32 most recent bracket pairs. More than that, and we revert to |
| 537 // surrounding script. |
| 538 TEST_F(ScriptRunIteratorTest, Match32MostRecentBrackets) |
| 539 { |
| 540 CHECK_RUNS({ { "((([萬國碼 ", USCRIPT_HAN }, |
| 541 { "Unicode (((((((((((((((((((((((((((((((", USCRIPT_LATIN }, |
| 542 { "萬國碼!", USCRIPT_HAN }, |
| 543 { ")))))))))))))))))))))))))))))))", USCRIPT_LATIN }, |
| 544 { "]", USCRIPT_HAN }, |
| 545 { "But )))", USCRIPT_LATIN } }); |
| 546 } |
| 547 |
| 548 // A char with multiple scripts that match both leading and trailing context |
| 549 // gets the leading context. |
| 550 TEST_F(ScriptRunIteratorTest, ExtensionsPreferLeadingContext) |
| 551 { |
| 552 CHECK_MOCK_RUNS({ { "h<lh>", USCRIPT_HAN }, |
| 553 { "l", USCRIPT_LATIN } }); |
| 554 } |
| 555 |
| 556 // A char with multiple scripts that only match trailing context gets the |
| 557 // trailing context. |
| 558 TEST_F(ScriptRunIteratorTest, ExtensionsMatchTrailingContext) |
| 559 { |
| 560 CHECK_MOCK_RUNS({ { "h", USCRIPT_HAN }, |
| 561 { "<gl>l", USCRIPT_LATIN } }); |
| 562 } |
| 563 |
| 564 // Retain first established priority script. <lhg><gh> produce the script <gh> |
| 565 // with g as priority, because of the two priority scripts l and g, only g |
| 566 // remains. Then <gh><hgl> retains g as priority, because of the two priority |
| 567 // scripts g and h that remain, g was encountered first. |
| 568 TEST_F(ScriptRunIteratorTest, ExtensionsRetainFirstPriorityScript) |
| 569 { |
| 570 CHECK_MOCK_RUNS({ { "<lhg><gh><hgl>", USCRIPT_GREEK } }); |
| 571 } |
| 572 |
| 573 // Parens can have scripts that break script runs. |
| 574 TEST_F(ScriptRunIteratorTest, ExtensionsParens) |
| 575 { |
| 576 CHECK_MOCK_RUNS({ { "<gl><(lg>", USCRIPT_GREEK }, |
| 577 { "h<[hl>", USCRIPT_HAN }, |
| 578 { "l", USCRIPT_LATIN }, |
| 579 { "<]hl>", USCRIPT_HAN }, |
| 580 { "<)lg>", USCRIPT_GREEK } }); |
| 581 } |
| 582 |
| 583 // The close paren might be encountered before we've established the open |
| 584 // paren's script, but when this is the case the current set is still valid, so |
| 585 // this doesn't affect it nor break the run. |
| 586 TEST_F(ScriptRunIteratorTest, ExtensionsParens2) |
| 587 { |
| 588 CHECK_MOCK_RUNS({ { "<(lhg><gh><)lhg>", USCRIPT_GREEK } }); |
| 589 } |
| 590 |
| 591 // A common script with a single extension should be treated as common, but |
| 592 // with the extended script as a default. If we encounter anything other than |
| 593 // common, that takes priority. If we encounter other common scripts with a |
| 594 // single extension, the current priority remains. |
| 595 TEST_F(ScriptRunIteratorTest, CommonWithPriority) |
| 596 { |
| 597 CHECK_MOCK_RUNS({ { "<ch>", USCRIPT_HAN } }); |
| 598 } |
| 599 |
| 600 TEST_F(ScriptRunIteratorTest, CommonWithPriority2) |
| 601 { |
| 602 CHECK_MOCK_RUNS({ { "<ch><lh>", USCRIPT_LATIN } }); |
| 603 } |
| 604 |
| 605 TEST_F(ScriptRunIteratorTest, CommonWithPriority3) |
| 606 { |
| 607 CHECK_MOCK_RUNS({ { "<ch><cl><cg>", USCRIPT_HAN } }); |
| 608 } |
| 609 |
| 610 // UDatta (\xE0\xA5\x91) is inherited with LATIN and DEVANAGARI extensions. |
| 611 // Since it has LATIN, and the dotted circle (\xE2\x97\x8C) is COMMON and has |
| 612 // adopted the preceding LATIN, it gets the LATIN. This is standard. |
| 613 TEST_F(ScriptRunIteratorTest, LatinDottedCircleUdatta) |
| 614 { |
| 615 CHECK_RUNS({ { "Latin \xE2\x97\x8C\xE0\xA5\x91", USCRIPT_LATIN } }); |
| 616 } |
| 617 |
| 618 // In this situation, UDatta (\xE0\xA5\x91) doesn't share a script with the |
| 619 // value inherited by the dotted circle (\xE2\x97\x8C). It captures the |
| 620 // preceding dotted circle and breaks it from the run it would normally have |
| 621 // been in. |
| 622 TEST_F(ScriptRunIteratorTest, HanDottedCircleUdatta) |
| 623 { |
| 624 CHECK_RUNS({ { "萬國碼 ", USCRIPT_HAN }, |
| 625 { "\xE2\x97\x8C\xE0\xA5\x91", USCRIPT_DEVANAGARI } }); |
| 626 } |
| 627 |
| 628 // Tatweel is \xD9\x80 Lm, Fathatan is \xD9\x8B Mn. The script of tatweel is |
| 629 // common, that of Fathatan is inherited. The script extensions for Fathatan |
| 630 // are Arabic and Syriac. The Syriac script is 34 in ICU, Arabic is 2. So the |
| 631 // preferred script for Fathatan is Arabic, according to Behdad's |
| 632 // heuristic. This is exactly analogous to the Udatta tests above, except |
| 633 // Tatweel is Lm. But we don't take properties into account, only scripts. |
| 634 TEST_F(ScriptRunIteratorTest, LatinTatweelFathatan) |
| 635 { |
| 636 CHECK_RUNS({ { "Latin ", USCRIPT_LATIN }, |
| 637 { "\xD9\x80\xD9\x8B", USCRIPT_ARABIC } }); |
| 638 } |
| 639 |
| 640 // Another case where if the mark accepts a script that was inherited by the |
| 641 // preceding common-script character, they both continue in that script. |
| 642 // SYRIAC LETTER NUN \xDC\xA2 |
| 643 // ARABIC TATWEEL \xD9\x80 |
| 644 // ARABIC FATHATAN \xD9\x82 |
| 645 TEST_F(ScriptRunIteratorTest, SyriacTatweelFathatan) |
| 646 { |
| 647 CHECK_RUNS({ { "\xDC\xA2\xD9\x80\xD9\x8B", USCRIPT_SYRIAC } }); |
| 648 } |
| 649 |
| 650 // The Udatta (\xE0\xA5\x91) is inherited, so will share runs with anything that |
| 651 // is not common. |
| 652 TEST_F(ScriptRunIteratorTest, HanUdatta) |
| 653 { |
| 654 CHECK_RUNS({ { "萬國碼\xE0\xA5\x91", USCRIPT_HAN } }); |
| 655 } |
| 656 |
| 657 // The Udatta (\xE0\xA5\x91) is inherited, and will capture the space and turn |
| 658 // it into Devanagari. |
| 659 TEST_F(ScriptRunIteratorTest, HanSpaceUdatta) |
| 660 { |
| 661 CHECK_RUNS({ { "萬國碼", USCRIPT_HAN }, |
| 662 { " \xE0\xA5\x91", USCRIPT_DEVANAGARI } }); |
| 663 } |
| 664 |
| 665 // Make sure Mock code works too. |
| 666 TEST_F(ScriptRunIteratorTest, MockHanInheritedGL) |
| 667 { |
| 668 CHECK_MOCK_RUNS({ { "h<igl>", USCRIPT_HAN } }); |
| 669 } |
| 670 |
| 671 TEST_F(ScriptRunIteratorTest, MockHanCommonInheritedGL) |
| 672 { |
| 673 CHECK_MOCK_RUNS({ { "h", USCRIPT_HAN }, |
| 674 { "c<igl>", USCRIPT_GREEK } }); |
| 675 } |
| 676 |
| 677 // Leading inherited just act like common, except there's no preferred script. |
| 678 TEST_F(ScriptRunIteratorTest, MockLeadingInherited) |
| 679 { |
| 680 CHECK_MOCK_RUNS({ { "<igl>", USCRIPT_COMMON } }); |
| 681 } |
| 682 |
| 683 // Leading inherited just act like common, except there's no preferred script. |
| 684 TEST_F(ScriptRunIteratorTest, MockLeadingInherited2) |
| 685 { |
| 686 CHECK_MOCK_RUNS({ { "<igl><ih>", USCRIPT_COMMON } }); |
| 687 } |
| 688 |
| 689 TEST_F(ScriptRunIteratorTest, LeadingInheritedHan) |
| 690 { |
| 691 // DEVANAGARI STRESS SIGN UDATTA \xE0\xA5\x91 |
| 692 CHECK_RUNS({ { "\xE0\xA5\x91萬國碼", USCRIPT_HAN } }); |
| 693 } |
| 694 |
| 695 TEST_F(ScriptRunIteratorTest, LeadingInheritedHan2) |
| 696 { |
| 697 // DEVANAGARI STRESS SIGN UDATTA \xE0\xA5\x91 |
| 698 // ARABIC FATHATAN \xD9\x8B |
| 699 CHECK_RUNS({ { "\xE0\xA5\x91\xD9\x8B萬國碼", USCRIPT_HAN } }); |
| 700 } |
| 701 |
| 702 TEST_F(ScriptRunIteratorTest, OddLatinString) |
| 703 { |
| 704 CHECK_RUNS({ { "ç̈", USCRIPT_LATIN } }); |
| 705 } |
| 706 |
| 707 class ScriptRunIteratorICUDataTest : public testing::Test { |
| 708 public: |
| 709 ScriptRunIteratorICUDataTest() |
| 710 : m_maxExtensions(0) |
| 711 , m_maxExtensionsCodepoint(0xffff) |
| 712 { |
| 713 int maxExtensions = 0; |
| 714 UChar32 m_maxExtensionscp = 0; |
| 715 for (UChar32 cp = 0; cp < 0x11000; ++cp) { |
| 716 UErrorCode status = U_ZERO_ERROR; |
| 717 int count = uscript_getScriptExtensions(cp, 0, 0, &status); |
| 718 if (count > maxExtensions) { |
| 719 maxExtensions = count; |
| 720 m_maxExtensionscp = cp; |
| 721 } |
| 722 if (count > ScriptData::kMaxScriptCount) { |
| 723 } |
| 724 } |
| 725 m_maxExtensions = maxExtensions; |
| 726 m_maxExtensionsCodepoint = m_maxExtensionscp; |
| 727 } |
| 728 |
| 729 protected: |
| 730 UChar32 GetACharWithMaxExtensions(int* numExtensions) |
| 731 { |
| 732 if (numExtensions) { |
| 733 *numExtensions = m_maxExtensions; |
| 734 } |
| 735 return m_maxExtensionsCodepoint; |
| 736 } |
| 737 |
| 738 private: |
| 739 int m_maxExtensions; |
| 740 UChar32 m_maxExtensionsCodepoint; |
| 741 }; |
| 742 |
| 743 // Validate that ICU never returns more than our maximum expected number of |
| 744 // script extensions. |
| 745 TEST_F(ScriptRunIteratorICUDataTest, ValidateICUMaxScriptExtensions) |
| 746 { |
| 747 int maxExtensions; |
| 748 UChar32 cp = GetACharWithMaxExtensions(&maxExtensions); |
| 749 ASSERT_LE(maxExtensions, ScriptData::kMaxScriptCount) |
| 750 << "char " << std::hex << cp << std::dec; |
| 751 } |
| 752 |
| 753 // Check that ICUScriptData returns all of a character's scripts. |
| 754 // This only checks one likely character, but doesn't check all cases. |
| 755 TEST_F(ScriptRunIteratorICUDataTest, ICUDataGetScriptsReturnsAllExtensions) |
| 756 { |
| 757 int maxExtensions; |
| 758 UChar32 cp = GetACharWithMaxExtensions(&maxExtensions); |
| 759 Vector<UScriptCode> extensions; |
| 760 ICUScriptData::instance()->getScripts(cp, extensions); |
| 761 |
| 762 // It's possible that GetScripts adds the primary script to the list of |
| 763 // extensions, resulting in one more script than the raw extension count. |
| 764 ASSERT_GE(static_cast<int>(extensions.size()), maxExtensions) |
| 765 << "char " << std::hex << cp << std::dec; |
| 766 } |
| 767 |
| 768 TEST_F(ScriptRunIteratorICUDataTest, CommonHaveNoMoreThanOneExtension) |
| 769 { |
| 770 Vector<UScriptCode> extensions; |
| 771 for (UChar32 cp = 0; cp < 0x110000; ++cp) { |
| 772 ICUScriptData::instance()->getScripts(cp, extensions); |
| 773 UScriptCode primary = extensions.at(0); |
| 774 if (primary == USCRIPT_COMMON) { |
| 775 ASSERT_LE(extensions.size(), 2ul) |
| 776 << "cp: " << std::hex << cp << std::dec; |
| 777 } |
| 778 } |
| 779 } |
| 780 |
| 781 // ZWJ is \u200D Cf (Format, other) and its script is inherited. I'm going to |
| 782 // ignore this for now, as I think it shouldn't matter which run it ends up |
| 783 // in. HarfBuzz needs to be able to use it as context and shape each |
| 784 // neighboring character appropriately no matter what run it got assigned to. |
| 785 |
| 786 } // namespace blink |
OLD | NEW |