Chromium Code Reviews| OLD | NEW |
|---|---|
| (Empty) | |
| 1 // Copyright 2015 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #include "config.h" | |
| 6 | |
| 7 #include "wtf/Assertions.h" | |
| 8 #include "platform/fonts/ScriptRunIterator.h" | |
| 9 #include "platform/Logging.h" | |
| 10 #include "wtf/Threading.h" | |
| 11 #include "wtf/text/WTFString.h" | |
| 12 | |
| 13 #include <gtest/gtest.h> | |
| 14 | |
| 15 #include <string> | |
| 16 #include <vector> | |
| 17 | |
| 18 namespace blink { | |
| 19 | |
| 20 struct TestRun { | |
| 21 std::string text; | |
| 22 UScriptCode code; | |
| 23 }; | |
| 24 | |
| 25 struct ExpectedRun { | |
| 26 unsigned limit; | |
| 27 UScriptCode code; | |
| 28 | |
| 29 ExpectedRun(unsigned the_limit, UScriptCode the_code) | |
| 30 : limit(the_limit) | |
| 31 , code(the_code) | |
| 32 { | |
| 33 } | |
| 34 }; | |
| 35 | |
| 36 class MockScriptData : public ScriptData { | |
|
eae
2015/08/28 21:10:25
Do we really need to mock out the data object? It
| |
| 37 public: | |
| 38 ~MockScriptData() override {} | |
| 39 | |
| 40 static const MockScriptData* instance() | |
| 41 { | |
| 42 AtomicallyInitializedStaticReference(const MockScriptData, mockScriptData, (new MockScriptData())); | |
| 43 | |
| 44 return &mockScriptData; | |
| 45 } | |
| 46 | |
| 47 virtual void getScripts(UChar32 ch, Vector<UScriptCode>& dst) const override | |
| 48 { | |
| 49 ASSERT(ch >= kMockCharMin); | |
| 50 ASSERT(ch < kMockCharLimit); | |
| 51 | |
| 52 int code = ch - kMockCharMin; | |
| 53 dst.clear(); | |
| 54 switch (code & kCodeSpecialMask) { | |
| 55 case kCodeSpecialCommon: | |
| 56 dst.append(USCRIPT_COMMON); | |
| 57 break; | |
| 58 case kCodeSpecialInherited: | |
| 59 dst.append(USCRIPT_INHERITED); | |
| 60 break; | |
| 61 default: | |
| 62 break; | |
| 63 } | |
| 64 int list_bits = kTable[code & kCodeListIndexMask]; | |
| 65 if (dst.isEmpty() && list_bits == 0) { | |
| 66 dst.append(USCRIPT_UNKNOWN); | |
| 67 return; | |
| 68 } | |
| 69 while (list_bits) { | |
| 70 switch (list_bits & kListMask) { | |
| 71 case 0: | |
| 72 break; | |
| 73 case kLatin: | |
| 74 dst.append(USCRIPT_LATIN); | |
| 75 break; | |
| 76 case kHan: | |
| 77 dst.append(USCRIPT_HAN); | |
| 78 break; | |
| 79 case kGreek: | |
| 80 dst.append(USCRIPT_GREEK); | |
| 81 break; | |
| 82 } | |
| 83 list_bits >>= kListShift; | |
| 84 } | |
| 85 } | |
| 86 | |
| 87 UChar32 getPairedBracket(UChar32 ch) const override | |
| 88 { | |
| 89 switch (getPairedBracketType(ch)) { | |
| 90 case PairedBracketType::CLOSE: | |
| 91 return ch - kBracketDelta; | |
| 92 case PairedBracketType::OPEN: | |
| 93 return ch + kBracketDelta; | |
| 94 default: | |
| 95 return ch; | |
| 96 } | |
| 97 } | |
| 98 | |
| 99 PairedBracketType getPairedBracketType(UChar32 ch) const override | |
| 100 { | |
| 101 ASSERT(ch >= kMockCharMin && ch < kMockCharLimit); | |
| 102 int code = ch - kMockCharMin; | |
| 103 if ((code & kCodeBracketBit) == 0) { | |
| 104 return PairedBracketType::NONE; | |
| 105 } | |
| 106 if (code & kCodeBracketCloseBit) { | |
| 107 return PairedBracketType::CLOSE; | |
| 108 } | |
| 109 return PairedBracketType::OPEN; | |
| 110 } | |
| 111 | |
| 112 static int TableLookup(int value) | |
| 113 { | |
| 114 for (int i = 0; i < 16; ++i) { | |
| 115 if (kTable[i] == value) { | |
| 116 return i; | |
| 117 } | |
| 118 } | |
| 119 WTF_LOG_ERROR("Table does not contain value 0x%x", value); | |
| 120 return 0; | |
| 121 } | |
| 122 | |
| 123 static String ToTestString(const std::string& input) | |
| 124 { | |
| 125 String result(String::make16BitFrom8BitSource(0, 0)); | |
| 126 bool in_set = false; | |
| 127 int seen = 0; | |
| 128 int code = 0; | |
| 129 int list = 0; | |
| 130 int cur_shift = 0; | |
| 131 for (char c : input) { | |
| 132 if (in_set) { | |
| 133 switch (c) { | |
| 134 case '(': | |
| 135 ASSERT(seen == 0); | |
| 136 seen |= kSawBracket; | |
| 137 code |= kCodeBracketBit; | |
| 138 break; | |
| 139 case '[': | |
| 140 ASSERT(seen == 0); | |
| 141 seen |= kSawBracket; | |
| 142 code |= kCodeBracketBit | kCodeSquareBracketBit; | |
| 143 break; | |
| 144 case ')': | |
| 145 ASSERT(seen == 0); | |
| 146 seen |= kSawBracket; | |
| 147 code |= kCodeBracketBit | kCodeBracketCloseBit; | |
| 148 break; | |
| 149 case ']': | |
| 150 ASSERT(seen == 0); | |
| 151 seen |= kSawBracket; | |
| 152 code |= kCodeBracketBit | kCodeSquareBracketBit | kCodeBracketCloseBit; | |
| 153 break; | |
| 154 case 'i': | |
| 155 ASSERT(seen == 0); // brackets can't be inherited | |
| 156 seen |= kSawSpecial; | |
| 157 code |= kCodeSpecialInherited; | |
| 158 break; | |
| 159 case 'c': | |
| 160 ASSERT((seen & ~kSawBracket) == 0); | |
| 161 seen |= kSawSpecial; | |
| 162 code |= kCodeSpecialCommon; | |
| 163 break; | |
| 164 case 'l': | |
| 165 ASSERT((seen & kSawLatin) == 0); | |
| 166 ASSERT(cur_shift < 3); | |
| 167 seen |= kSawLatin; | |
| 168 list |= kLatin << (2 * cur_shift++); | |
| 169 break; | |
| 170 case 'h': | |
| 171 ASSERT((seen & kSawHan) == 0); | |
| 172 ASSERT(cur_shift < 3); | |
| 173 seen |= kSawHan; | |
| 174 list |= kHan << (2 * cur_shift++); | |
| 175 break; | |
| 176 case 'g': | |
| 177 ASSERT((seen & kSawGreek) == 0); | |
| 178 ASSERT(cur_shift < 3); | |
| 179 seen |= kSawGreek; | |
| 180 list |= kGreek << (2 * cur_shift++); | |
| 181 break; | |
| 182 case '>': | |
| 183 ASSERT(seen != 0); | |
| 184 code |= TableLookup(list); | |
| 185 result.append(static_cast<UChar>(kMockCharMin + code)); | |
| 186 in_set = false; | |
| 187 break; | |
| 188 default: | |
| 189 WTF_LOG_ERROR("Illegal mock string set char: '%c'", c); | |
| 190 break; | |
| 191 } | |
| 192 continue; | |
| 193 } | |
| 194 // not in set | |
| 195 switch (c) { | |
| 196 case '<': | |
| 197 seen = 0; | |
| 198 code = 0; | |
| 199 list = 0; | |
| 200 cur_shift = 0; | |
| 201 in_set = true; | |
| 202 break; | |
| 203 case '(': | |
| 204 code = kCodeBracketBit | kCodeSpecialCommon; | |
| 205 break; | |
| 206 case '[': | |
| 207 code = kCodeBracketBit | kCodeSquareBracketBit | kCodeSpecialCommon; | |
| 208 break; | |
| 209 case ')': | |
| 210 code = kCodeBracketBit | kCodeBracketCloseBit | kCodeSpecialCommon; | |
| 211 break; | |
| 212 case ']': | |
| 213 code = kCodeBracketBit | kCodeSquareBracketBit | kCodeBracketCloseBit | kCodeSpecialCommon; | |
| 214 break; | |
| 215 case 'i': | |
| 216 code = kCodeSpecialInherited; | |
| 217 break; | |
| 218 case 'c': | |
| 219 code = kCodeSpecialCommon; | |
| 220 break; | |
| 221 case 'l': | |
| 222 code = kLatin; | |
| 223 break; | |
| 224 case 'h': | |
| 225 code = kHan; | |
| 226 break; | |
| 227 case 'g': | |
| 228 code = kGreek; | |
| 229 break; | |
| 230 case '?': | |
| 231 code = 0; // unknown | |
| 232 break; | |
| 233 default: | |
| 234 WTF_LOG_ERROR("Illegal mock string set char: '%c'", c); | |
| 235 } | |
| 236 if (!in_set) { | |
| 237 result.append(static_cast<UChar>(kMockCharMin + code)); | |
| 238 } | |
| 239 } | |
| 240 return result; | |
| 241 } | |
| 242 | |
| 243 static std::string MockCharString(UChar mockch) | |
| 244 { | |
| 245 ASSERT(mockch >= kMockCharMin && mockch < kMockCharLimit); | |
| 246 int code = mockch - kMockCharMin; | |
| 247 // Renders one mock character using the notation accepted by ToTestString. | |
| 248 // We use set notation in these cases: | |
| 249 // - more than one of special, kLatin, kHan, kGreek | |
| 250 // - bracket and not common (since non-set brackets are common) | |
| 251 bool is_bracket = (code & kCodeBracketBit) != 0; | |
| 252 bool is_special = (code & kCodeSpecialMask) != 0; // test the offset, not the raw char | |
| 253 bool is_common = (code & kCodeSpecialMask) == kCodeSpecialCommon; | |
| 254 char c = '\0'; // only read when is_bracket || is_special | |
| 255 if (is_bracket) { | |
| 256 if (code & kCodeSquareBracketBit) { | |
| 257 if (code & kCodeBracketCloseBit) { | |
| 258 c = ']'; | |
| 259 } | |
| 260 else { | |
| 261 c = '['; | |
| 262 } | |
| 263 } | |
| 264 else { | |
| 265 if (code & kCodeBracketCloseBit) { | |
| 266 c = ')'; | |
| 267 } | |
| 268 else { | |
| 269 c = '('; | |
| 270 } | |
| 271 } | |
| 272 } | |
| 273 else if (is_special) { | |
| 274 c = is_common ? 'c' : 'i'; | |
| 275 } | |
| 276 std::string result; | |
| 277 int list_bits = kTable[code & kCodeListIndexMask]; | |
| 278 while (list_bits) { | |
| 279 switch (list_bits & kListMask) { | |
| 280 case 0: | |
| 281 break; | |
| 282 case kLatin: | |
| 283 result += 'l'; | |
| 284 break; | |
| 285 case kHan: | |
| 286 result += 'h'; | |
| 287 break; | |
| 288 case kGreek: | |
| 289 result += 'g'; | |
| 290 break; | |
| 291 } | |
| 292 list_bits >>= kListShift; | |
| 293 } | |
| 294 bool need_set = result.length() + (is_special ? 1 : 0) > 1 || (is_bracket && (result.length() > 0 || !is_common)); | |
| 295 if (need_set) { | |
| 296 std::string set_result("<"); | |
| 297 if (is_bracket) { | |
| 298 set_result += c; | |
| 299 } | |
| 300 if (is_special) { | |
| 301 if (is_common) { | |
| 302 set_result += "c"; | |
| 303 } | |
| 304 else { | |
| 305 set_result += "i"; | |
| 306 } | |
| 307 } | |
| 308 set_result += result; | |
| 309 set_result += ">"; | |
| 310 return set_result; | |
| 311 } | |
| 312 if (is_bracket || is_special) { | |
| 313 result = c; | |
| 314 } | |
| 315 return result; | |
| 316 } | |
| 317 | |
| 318 // we determine properties based on the offset from kMockCharMin | |
| 319 // bits 0-3 represent the list of l, h, g scripts (index into table) | |
| 320 // bits 4-5 mean: 0 plain, 1 common, 2 inherited, 3 illegal | |
| 321 // bit 6 clear means open bracket, set means close bracket | |
| 322 // bit 7 clear means non-bracket, set means bracket | |
| 323 // bit 8 clear means paren, set means square bracket | |
| 324 // if it's a bracket, the matching bracket is 64 code points away | |
| 325 | |
| 326 static const UChar32 kMockCharMin = 0xe000; | |
| 327 static const UChar32 kMockCharLimit = kMockCharMin + 0x200; | |
| 328 static const int kLatin = 1; | |
| 329 static const int kHan = 2; | |
| 330 static const int kGreek = 3; | |
| 331 static const int kCodeListIndexMask = 0xf; | |
| 332 static const int kCodeSpecialMask = 0x30; | |
| 333 static const int kCodeSpecialCommon = 0x10; | |
| 334 static const int kCodeSpecialInherited = 0x20; | |
| 335 static const int kCodeBracketCloseBit = 0x40; | |
| 336 static const int kCodeBracketBit = 0x80; | |
| 337 static const int kCodeSquareBracketBit = 0x100; | |
| 338 static const int kListShift = 2; | |
| 339 static const int kListMask = 0x3; | |
| 340 static const int kBracketDelta = kCodeBracketCloseBit; | |
| 341 static const int kTable[16]; | |
| 342 | |
| 343 static const int kSawBracket = 0x1; | |
| 344 static const int kSawSpecial = 0x2; | |
| 345 static const int kSawLatin = 0x4; | |
| 346 static const int kSawHan = 0x8; | |
| 347 static const int kSawGreek = 0x10; | |
| 348 }; | |
| 349 | |
| 350 static constexpr int kLatin2 = MockScriptData::kLatin << 2; | |
| 351 static constexpr int kHan2 = MockScriptData::kHan << 2; | |
| 352 static constexpr int kGreek2 = MockScriptData::kGreek << 2; | |
| 353 static constexpr int kLatin3 = MockScriptData::kLatin << 4; | |
| 354 static constexpr int kHan3 = MockScriptData::kHan << 4; | |
| 355 static constexpr int kGreek3 = MockScriptData::kGreek << 4; | |
| 356 const int MockScriptData::kTable[] = { | |
| 357 0, kLatin, kHan, kGreek, | |
| 358 kLatin2 + kHan, kLatin2 + kGreek, | |
| 359 kHan2 + kLatin, kHan2 + kGreek, | |
| 360 kGreek2 + kLatin, kGreek2 + kHan, | |
| 361 kLatin3 + kHan2 + kGreek, kLatin3 + kGreek2 + kHan, | |
| 362 kHan3 + kLatin2 + kGreek, kHan3 + kGreek2 + kLatin, | |
| 363 kGreek3 + kLatin2 + kHan, kGreek3 + kHan2 + kLatin, | |
| 364 }; | |
| 365 | |
| 366 class ScriptRunIteratorTest : public testing::Test { | |
| 367 protected: | |
| 368 void CheckRuns(const std::vector<TestRun>& runs) | |
| 369 { | |
| 370 String text(String::make16BitFrom8BitSource(0, 0)); | |
| 371 std::vector<ExpectedRun> expect; | |
| 372 for (const TestRun& run : runs) { | |
| 373 text.append(String::fromUTF8(run.text.c_str())); | |
| 374 expect.push_back(ExpectedRun(text.length(), run.code)); | |
| 375 } | |
| 376 ScriptRunIterator scriptRunIterator(text.characters16(), text.length()); | |
| 377 VerifyRuns(&scriptRunIterator, expect); | |
| 378 } | |
| 379 // Like CheckRuns, but run.text is mock notation and MockScriptData is used. | |
| 380 void CheckMockRuns(const std::vector<TestRun>& runs) | |
| 381 { | |
| 382 String text(String::make16BitFrom8BitSource(0, 0)); | |
| 383 std::vector<ExpectedRun> expect; | |
| 384 for (const TestRun& run : runs) { | |
| 385 text.append(MockScriptData::ToTestString(run.text)); | |
| 386 expect.push_back(ExpectedRun(text.length(), run.code)); | |
| 387 } | |
| 388 | |
| 389 ScriptRunIterator scriptRunIterator(text.characters16(), text.length(), | |
| 390 MockScriptData::instance()); | |
| 391 VerifyRuns(&scriptRunIterator, expect); | |
| 392 } | |
| 393 // Asserts the iterator yields exactly the expected (limit, script) runs, in order. | |
| 394 void VerifyRuns(ScriptRunIterator* scriptRunIterator, | |
| 395 const std::vector<ExpectedRun>& expect) | |
| 396 { | |
| 397 unsigned limit; | |
| 398 UScriptCode code; | |
| 399 size_t run_count = 0; | |
| 400 while (scriptRunIterator->consume(limit, code)) { | |
| 401 ASSERT_LT(run_count, expect.size()); | |
| 402 ASSERT_EQ(expect[run_count].limit, limit); | |
| 403 ASSERT_EQ(expect[run_count].code, code); | |
| 404 ++run_count; | |
| 405 } | |
| 406 // ASSERT_EQ reports both counts on mismatch, so nothing is logged on success. | |
| 407 ASSERT_EQ(expect.size(), run_count); | |
| 408 } | |
| 409 }; | |
| 410 | |
| 411 TEST_F(ScriptRunIteratorTest, Empty) | |
| 412 { | |
| 413 String empty(String::make16BitFrom8BitSource(0, 0)); | |
| 414 ScriptRunIterator scriptRunIterator(empty.characters16(), empty.length()); | |
| 415 unsigned limit; | |
| 416 UScriptCode code; | |
| 417 ASSERT_FALSE(scriptRunIterator.consume(limit, code)); // gtest check, so a failure fails the test | |
| 418 } | |
| 419 | |
| 420 // Some of our compilers cannot initialize a vector from an array yet. | |
| 421 #define DECLARE_RUNSVECTOR(...) \ | |
| 422 static const TestRun runsArray[] = __VA_ARGS__; \ | |
| 423 std::vector<TestRun> runs(runsArray, runsArray + sizeof(runsArray) / sizeof( *runsArray)); | |
| 424 | |
| 425 #define CHECK_RUNS(...) \ | |
| 426 DECLARE_RUNSVECTOR(__VA_ARGS__); \ | |
| 427 CheckRuns(runs); | |
| 428 | |
| 429 #define CHECK_MOCK_RUNS(...) \ | |
| 430 DECLARE_RUNSVECTOR(__VA_ARGS__); \ | |
| 431 CheckMockRuns(runs); | |
| 432 | |
| 433 TEST_F(ScriptRunIteratorTest, Whitespace) | |
| 434 { | |
| 435 CHECK_RUNS({ { " \t ", USCRIPT_COMMON } }); | |
| 436 } | |
| 437 | |
| 438 TEST_F(ScriptRunIteratorTest, Common) | |
| 439 { | |
| 440 CHECK_RUNS({ { " ... !?", USCRIPT_COMMON } }); | |
| 441 } | |
| 442 | |
| 443 TEST_F(ScriptRunIteratorTest, Latin) | |
| 444 { | |
| 445 CHECK_RUNS({ { "latin", USCRIPT_LATIN } }); | |
| 446 } | |
| 447 | |
| 448 TEST_F(ScriptRunIteratorTest, Chinese) | |
| 449 { | |
| 450 CHECK_RUNS({ { "萬國碼", USCRIPT_HAN } }); | |
| 451 } | |
| 452 | |
| 453 // Close bracket without matching open is ignored | |
| 454 TEST_F(ScriptRunIteratorTest, UnbalancedParens1) | |
| 455 { | |
| 456 CHECK_RUNS({ { "(萬", USCRIPT_HAN }, | |
| 457 { "a]", USCRIPT_LATIN }, | |
| 458 { ")", USCRIPT_HAN } }); | |
| 459 } | |
| 460 | |
| 461 // Open bracket without matching close is popped when inside | |
| 462 // matching close brackets, so doesn't match later close. | |
| 463 TEST_F(ScriptRunIteratorTest, UnbalancedParens2) | |
| 464 { | |
| 465 CHECK_RUNS({ { "(萬", USCRIPT_HAN }, | |
| 466 { "a[", USCRIPT_LATIN }, | |
| 467 { ")]", USCRIPT_HAN } }); | |
| 468 } | |
| 469 | |
| 470 // space goes with leading script | |
| 471 TEST_F(ScriptRunIteratorTest, LatinHan) | |
| 472 { | |
| 473 CHECK_RUNS({ { "Unicode ", USCRIPT_LATIN }, | |
| 474 { "萬國碼", USCRIPT_HAN } }); | |
| 475 } | |
| 476 | |
| 477 // space goes with leading script | |
| 478 TEST_F(ScriptRunIteratorTest, HanLatin) | |
| 479 { | |
| 480 CHECK_RUNS({ { "萬國碼 ", USCRIPT_HAN }, | |
| 481 { "Unicode", USCRIPT_LATIN } }); | |
| 482 } | |
| 483 | |
| 484 TEST_F(ScriptRunIteratorTest, ParenEmptyParen) | |
| 485 { | |
| 486 CHECK_RUNS({ { "()", USCRIPT_COMMON } }); | |
| 487 } | |
| 488 | |
| 489 TEST_F(ScriptRunIteratorTest, ParenChineseParen) | |
| 490 { | |
| 491 CHECK_RUNS({ { "(萬國碼)", USCRIPT_HAN } }); | |
| 492 } | |
| 493 | |
| 494 TEST_F(ScriptRunIteratorTest, ParenLatinParen) | |
| 495 { | |
| 496 CHECK_RUNS({ { "(Unicode)", USCRIPT_LATIN } }); | |
| 497 } | |
| 498 | |
| 499 // open paren gets leading script | |
| 500 TEST_F(ScriptRunIteratorTest, LatinParenChineseParen) | |
| 501 { | |
| 502 CHECK_RUNS({ { "Unicode (", USCRIPT_LATIN }, | |
| 503 { "萬國碼", USCRIPT_HAN }, | |
| 504 { ")", USCRIPT_LATIN } }); | |
| 505 } | |
| 506 | |
| 507 // open paren gets first trailing script if no leading script | |
| 508 TEST_F(ScriptRunIteratorTest, ParenChineseParenLatin) | |
| 509 { | |
| 510 CHECK_RUNS({ { "(萬國碼) ", USCRIPT_HAN }, | |
| 511 { "Unicode", USCRIPT_LATIN } }); | |
| 512 } | |
| 513 | |
| 514 // leading common and open paren get first trailing script. | |
| 515 // TODO(dougfelt): we don't do quote matching, but probably should figure out | |
| 516 // something better than doing nothing. | |
| 517 TEST_F(ScriptRunIteratorTest, QuoteParenChineseParenLatinQuote) | |
| 518 { | |
| 519 CHECK_RUNS({ { "\"(萬國碼) ", USCRIPT_HAN }, | |
| 520 { "Unicode\"", USCRIPT_LATIN } }); | |
| 521 } | |
| 522 | |
| 523 // Unmatched close brace gets leading context | |
| 524 TEST_F(ScriptRunIteratorTest, UnmatchedClose) | |
| 525 { | |
| 526 CHECK_RUNS({ { "Unicode (", USCRIPT_LATIN }, | |
| 527 { "萬國碼] ", USCRIPT_HAN }, | |
| 528 { ") Unicode\"", USCRIPT_LATIN } }); | |
| 529 } | |
| 530 | |
| 531 // Match up to 32 bracket pairs | |
| 532 TEST_F(ScriptRunIteratorTest, Match32Brackets) | |
| 533 { | |
| 534 CHECK_RUNS({ { "[萬國碼 ", USCRIPT_HAN }, | |
| 535 { "Unicode (((((((((((((((((((((((((((((((!" | |
| 536 ")))))))))))))))))))))))))))))))", | |
| 537 USCRIPT_LATIN }, | |
| 538 { "]", USCRIPT_HAN } }); | |
| 539 } | |
| 540 | |
| 541 // Matches 32 most recent bracket pairs. More than that, and we revert to | |
| 542 // surrounding script. | |
| 543 TEST_F(ScriptRunIteratorTest, Match32MostRecentBrackets) | |
| 544 { | |
| 545 CHECK_RUNS({ { "((([萬國碼 ", USCRIPT_HAN }, | |
| 546 { "Unicode (((((((((((((((((((((((((((((((", USCRIPT_LATIN }, | |
| 547 { "萬國碼!", USCRIPT_HAN }, | |
| 548 { ")))))))))))))))))))))))))))))))", USCRIPT_LATIN }, | |
| 549 { "]", USCRIPT_HAN }, | |
| 550 { "But )))", USCRIPT_LATIN } }); | |
| 551 } | |
| 552 | |
| 553 // A char with multiple scripts that match both leading and trailing context | |
| 554 // gets the leading context. | |
| 555 TEST_F(ScriptRunIteratorTest, ExtensionsPreferLeadingContext) | |
| 556 { | |
| 557 CHECK_MOCK_RUNS({ { "h<lh>", USCRIPT_HAN }, | |
| 558 { "l", USCRIPT_LATIN } }); | |
| 559 } | |
| 560 | |
| 561 // A char with multiple scripts that only match trailing context gets the | |
| 562 // trailing context. | |
| 563 TEST_F(ScriptRunIteratorTest, ExtensionsMatchTrailingContext) | |
| 564 { | |
| 565 CHECK_MOCK_RUNS({ { "h", USCRIPT_HAN }, | |
| 566 { "<gl>l", USCRIPT_LATIN } }); | |
| 567 } | |
| 568 | |
| 569 // Retain first established priority script. <lhg><gh> produce the script <gh> | |
| 570 // with g as priority, because of the two priority scripts l and g, only g | |
| 571 // remains. Then <gh><hgl> retains g as priority, because of the two priority | |
| 572 // scripts g and h that remain, g was encountered first. | |
| 573 TEST_F(ScriptRunIteratorTest, ExtensionsRetainFirstPriorityScript) | |
| 574 { | |
| 575 CHECK_MOCK_RUNS({ { "<lhg><gh><hgl>", USCRIPT_GREEK } }); | |
| 576 } | |
| 577 | |
| 578 // Parens can have scripts that break script runs. | |
| 579 TEST_F(ScriptRunIteratorTest, ExtensionsParens) | |
| 580 { | |
| 581 CHECK_MOCK_RUNS({ { "<gl><(lg>", USCRIPT_GREEK }, | |
| 582 { "h<[hl>", USCRIPT_HAN }, | |
| 583 { "l", USCRIPT_LATIN }, | |
| 584 { "<]hl>", USCRIPT_HAN }, | |
| 585 { "<)lg>", USCRIPT_GREEK } }); | |
| 586 } | |
| 587 | |
| 588 // The close paren might be encountered before we've established the open | |
| 589 // paren's script, but when this is the case the current set is still valid, so | |
| 590 // this doesn't affect it nor break the run. | |
| 591 TEST_F(ScriptRunIteratorTest, ExtensionsParens2) | |
| 592 { | |
| 593 CHECK_MOCK_RUNS({ { "<(lhg><gh><)lhg>", USCRIPT_GREEK } }); | |
| 594 } | |
| 595 | |
| 596 // A common script with a single extension should be treated as common, but | |
| 597 // with the extended script as a default. If we encounter anything other than | |
| 598 // common, that takes priority. If we encounter other common scripts with a | |
| 599 // single extension, the current priority remains. | |
| 600 TEST_F(ScriptRunIteratorTest, CommonWithPriority) | |
| 601 { | |
| 602 CHECK_MOCK_RUNS({ { "<ch>", USCRIPT_HAN } }); | |
| 603 } | |
| 604 | |
| 605 TEST_F(ScriptRunIteratorTest, CommonWithPriority2) | |
| 606 { | |
| 607 CHECK_MOCK_RUNS({ { "<ch><lh>", USCRIPT_LATIN } }); | |
| 608 } | |
| 609 | |
| 610 TEST_F(ScriptRunIteratorTest, CommonWithPriority3) | |
| 611 { | |
| 612 CHECK_MOCK_RUNS({ { "<ch><cl><cg>", USCRIPT_HAN } }); | |
| 613 } | |
| 614 | |
| 615 // UDatta is inherited with LATIN and DEVANAGARI extensions. Since it has | |
| 616 // LATIN, and the dotted circle is COMMON and has adopted the preceding LATIN, | |
| 617 // it gets the LATIN. This is standard. | |
| 618 TEST_F(ScriptRunIteratorTest, LatinDottedCircleUdatta) | |
| 619 { | |
| 620 CHECK_RUNS({ { "Latin \u25cc\u0951", USCRIPT_LATIN } }); | |
| 621 } | |
| 622 | |
| 623 // In this situation, UDatta doesn't share a script with the value inherited by | |
| 624 // the dotted circle. It captures the preceding dotted circle and breaks it | |
| 625 // from the run it would normally have been in. | |
| 626 TEST_F(ScriptRunIteratorTest, HanDottedCircleUdatta) | |
| 627 { | |
| 628 CHECK_RUNS({ { "萬國碼 ", USCRIPT_HAN }, | |
| 629 { "\u25cc\u0951", USCRIPT_DEVANAGARI } }); | |
| 630 } | |
| 631 | |
| 632 // Tatweel is \u0640 Lm, Fathatan is \u064b Mn. The script of tatweel is | |
| 633 // common, that of Fathatan is inherited. The script extensions for Fathatan | |
| 634 // are Arabic and Syriac. The Syriac script is 34 in ICU, Arabic is 2. So the | |
| 635 // preferred script for Fathatan is Arabic, according to Behdad's | |
| 636 // heuristic. This is exactly analogous to the Udatta tests above, except | |
| 637 // Tatweel is Lm. But we don't take properties into account, only scripts. | |
| 638 TEST_F(ScriptRunIteratorTest, LatinTatweelFathatan) | |
| 639 { | |
| 640 CHECK_RUNS({ { "Latin ", USCRIPT_LATIN }, | |
| 641 { "\u0640\u064b", USCRIPT_ARABIC } }); | |
| 642 } | |
| 643 | |
| 644 // Another case where if the mark accepts a script that was inherited by the | |
| 645 // preceding common-script character, they both continue in that script. | |
| 646 TEST_F(ScriptRunIteratorTest, SyriacTatweelFathatan) | |
| 647 { | |
| 648 CHECK_RUNS({ { "\u0722\u0640\u064b", USCRIPT_SYRIAC } }); | |
| 649 } | |
| 650 | |
| 651 // The Udatta is inherited, so will share runs with anything that is not | |
| 652 // common. | |
| 653 TEST_F(ScriptRunIteratorTest, HanUdatta) | |
| 654 { | |
| 655 CHECK_RUNS({ { "萬國碼\u0951", USCRIPT_HAN } }); | |
| 656 } | |
| 657 | |
| 658 // The Udatta is inherited, and will capture the space and turn it into | |
| 659 // Devanagari. | |
| 660 TEST_F(ScriptRunIteratorTest, HanSpaceUdatta) | |
| 661 { | |
| 662 CHECK_RUNS({ { "萬國碼", USCRIPT_HAN }, | |
| 663 { " \u0951", USCRIPT_DEVANAGARI } }); | |
| 664 } | |
| 665 | |
| 666 // Make sure Mock code works too. | |
| 667 TEST_F(ScriptRunIteratorTest, MockHanInheritedGL) | |
| 668 { | |
| 669 CHECK_MOCK_RUNS({ { "h<igl>", USCRIPT_HAN } }); | |
| 670 } | |
| 671 | |
| 672 TEST_F(ScriptRunIteratorTest, MockHanCommonInheritedGL) | |
| 673 { | |
| 674 CHECK_MOCK_RUNS({ { "h", USCRIPT_HAN }, | |
| 675 { "c<igl>", USCRIPT_GREEK } }); | |
| 676 } | |
| 677 | |
| 678 // Leading inherited just acts like common, except there's no preferred script. | |
| 679 TEST_F(ScriptRunIteratorTest, MockLeadingInherited) | |
| 680 { | |
| 681 CHECK_MOCK_RUNS({ { "<igl>", USCRIPT_COMMON } }); | |
| 682 } | |
| 683 | |
| 684 // Leading inherited just acts like common, except there's no preferred script. | |
| 685 TEST_F(ScriptRunIteratorTest, MockLeadingInherited2) | |
| 686 { | |
| 687 CHECK_MOCK_RUNS({ { "<igl><ih>", USCRIPT_COMMON } }); | |
| 688 } | |
| 689 | |
| 690 TEST_F(ScriptRunIteratorTest, LeadingInheritedHan) | |
| 691 { | |
| 692 CHECK_RUNS({ { "\u0951萬國碼", USCRIPT_HAN } }); | |
| 693 } | |
| 694 | |
| 695 TEST_F(ScriptRunIteratorTest, LeadingInheritedHan2) | |
| 696 { | |
| 697 CHECK_RUNS({ { "\u0951\u064b萬國碼", USCRIPT_HAN } }); | |
| 698 } | |
| 699 | |
| 700 TEST_F(ScriptRunIteratorTest, OddLatinString) | |
| 701 { | |
| 702 CHECK_RUNS({ { "ç̈", USCRIPT_LATIN } }); | |
| 703 } | |
| 704 | |
| 705 class ScriptRunIteratorICUDataTest : public testing::Test { | |
| 706 public: | |
| 707 // Scans the whole Unicode code space (0..0x10FFFF) once and records a code | |
| 708 // point with the largest script-extension count reported by ICU. | |
| 709 ScriptRunIteratorICUDataTest() | |
| 710 : max_extensions_(0) | |
| 711 , max_extensions_cp_(0xffff) | |
| 712 { | |
| 713 int max_extensions = 0; | |
| 714 UChar32 max_extensions_cp = 0; | |
| 715 for (UChar32 cp = 0; cp < 0x110000; ++cp) { | |
| 716 UErrorCode status = U_ZERO_ERROR; | |
| 717 int count = uscript_getScriptExtensions(cp, nullptr, 0, &status); | |
| 718 if (count > max_extensions) { | |
| 719 max_extensions = count; | |
| 720 max_extensions_cp = cp; | |
| 721 } | |
| 722 } | |
| 723 max_extensions_ = max_extensions; | |
| 724 max_extensions_cp_ = max_extensions_cp; | |
| 725 } | |
| 726 | |
| 727 protected: | |
| 728 UChar32 GetACharWithMaxExtensions(int* num_extensions) | |
| 729 { | |
| 730 if (num_extensions) { | |
| 731 *num_extensions = max_extensions_; | |
| 732 } | |
| 733 return max_extensions_cp_; | |
| 734 } | |
| 735 | |
| 736 private: | |
| 737 int max_extensions_; | |
| 738 UChar32 max_extensions_cp_; | |
| 739 }; | |
| 740 | |
| 741 // Validate that ICU never returns more than our maximum expected number of | |
| 742 // script extensions. | |
| 743 TEST_F(ScriptRunIteratorICUDataTest, ValidateICUMaxScriptExtensions) | |
| 744 { | |
| 745 int max_extensions; | |
| 746 UChar32 cp = GetACharWithMaxExtensions(&max_extensions); | |
| 747 ASSERT_LE(max_extensions, ScriptData::kMaxScriptCount) | |
| 748 << "char " << std::hex << cp << std::dec; | |
| 749 } | |
| 750 | |
| 751 // Check that ICUScriptData returns all of a character's scripts. | |
| 752 // This only checks one likely character, but doesn't check all cases. | |
| 753 TEST_F(ScriptRunIteratorICUDataTest, ICUDataGetScriptsReturnsAllExtensions) | |
| 754 { | |
| 755 int max_extensions; | |
| 756 UChar32 cp = GetACharWithMaxExtensions(&max_extensions); | |
| 757 Vector<UScriptCode> extensions; | |
| 758 ICUScriptData::instance()->getScripts(cp, extensions); | |
| 759 | |
| 760 // It's possible that GetScripts adds the primary script to the list of | |
| 761 // extensions, resulting in one more script than the raw extension count. | |
| 762 ASSERT_GE(static_cast<int>(extensions.size()), max_extensions) | |
| 763 << "char " << std::hex << cp << std::dec; | |
| 764 } | |
| 765 | |
| 766 TEST_F(ScriptRunIteratorICUDataTest, CommonHaveNoMoreThanOneExtension) | |
| 767 { | |
| 768 Vector<UScriptCode> extensions; | |
| 769 for (UChar32 cp = 0; cp < 0x110000; ++cp) { | |
| 770 ICUScriptData::instance()->getScripts(cp, extensions); | |
| 771 UScriptCode primary = extensions.at(0); | |
| 772 if (primary == USCRIPT_COMMON) { | |
| 773 ASSERT_LE(extensions.size(), 2ul) | |
| 774 << "cp: " << std::hex << cp << std::dec; | |
| 775 } | |
| 776 } | |
| 777 } | |
| 778 | |
| 779 // ZWJ is \u200D Cf (Format, other) and its script is inherited. I'm going to | |
| 780 // ignore this for now, as I think it shouldn't matter which run it ends up | |
| 781 // in. HarfBuzz needs to be able to use it as context and shape each | |
| 782 // neighboring character appropriately no matter what run it got assigned to. | |
| 783 | |
| 784 } // namespace blink | |
| OLD | NEW |