OLD | NEW |
---|---|
(Empty) | |
1 // Copyright 2015 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include "config.h" | |
6 | |
7 #include "wtf/Assertions.h" | |
8 #include "platform/fonts/ScriptRunIterator.h" | |
9 #include "platform/Logging.h" | |
10 #include "wtf/Threading.h" | |
11 #include "wtf/text/WTFString.h" | |
12 | |
13 #include <gtest/gtest.h> | |
14 | |
15 #include <string> | |
16 #include <vector> | |
17 | |
18 namespace blink { | |
19 | |
20 struct TestRun { | |
21 std::string text; | |
22 UScriptCode code; | |
23 }; | |
24 | |
25 struct ExpectedRun { | |
26 unsigned limit; | |
27 UScriptCode code; | |
28 | |
29 ExpectedRun(unsigned the_limit, UScriptCode the_code) | |
30 : limit(the_limit) | |
31 , code(the_code) | |
32 { | |
33 } | |
34 }; | |
35 | |
36 class MockScriptData : public ScriptData { | |
eae
2015/08/28 21:10:25
Do we really need to mock out the data object? It
| |
37 public: | |
38 ~MockScriptData() override {} | |
39 | |
40 static const MockScriptData* instance() | |
41 { | |
42 AtomicallyInitializedStaticReference(const MockScriptData, mockScriptDat a, (new MockScriptData())); | |
43 | |
44 return &mockScriptData; | |
45 } | |
46 | |
47 virtual void getScripts(UChar32 ch, Vector<UScriptCode>& dst) const override | |
48 { | |
49 ASSERT(ch >= kMockCharMin); | |
50 ASSERT(ch < kMockCharLimit); | |
51 | |
52 int code = ch - kMockCharMin; | |
53 dst.clear(); | |
54 switch (code & kCodeSpecialMask) { | |
55 case kCodeSpecialCommon: | |
56 dst.append(USCRIPT_COMMON); | |
57 break; | |
58 case kCodeSpecialInherited: | |
59 dst.append(USCRIPT_INHERITED); | |
60 break; | |
61 default: | |
62 break; | |
63 } | |
64 int list_bits = kTable[code & kCodeListIndexMask]; | |
65 if (dst.isEmpty() && list_bits == 0) { | |
66 dst.append(USCRIPT_UNKNOWN); | |
67 return; | |
68 } | |
69 while (list_bits) { | |
70 switch (list_bits & kListMask) { | |
71 case 0: | |
72 break; | |
73 case kLatin: | |
74 dst.append(USCRIPT_LATIN); | |
75 break; | |
76 case kHan: | |
77 dst.append(USCRIPT_HAN); | |
78 break; | |
79 case kGreek: | |
80 dst.append(USCRIPT_GREEK); | |
81 break; | |
82 } | |
83 list_bits >>= kListShift; | |
84 } | |
85 } | |
86 | |
87 UChar32 getPairedBracket(UChar32 ch) const override | |
88 { | |
89 switch (getPairedBracketType(ch)) { | |
90 case PairedBracketType::CLOSE: | |
91 return ch - kBracketDelta; | |
92 case PairedBracketType::OPEN: | |
93 return ch + kBracketDelta; | |
94 default: | |
95 return ch; | |
96 } | |
97 } | |
98 | |
99 PairedBracketType getPairedBracketType(UChar32 ch) const override | |
100 { | |
101 ASSERT(ch >= kMockCharMin && ch < kMockCharLimit); | |
102 int code = ch - kMockCharMin; | |
103 if ((code & kCodeBracketBit) == 0) { | |
104 return PairedBracketType::NONE; | |
105 } | |
106 if (code & kCodeBracketCloseBit) { | |
107 return PairedBracketType::CLOSE; | |
108 } | |
109 return PairedBracketType::OPEN; | |
110 } | |
111 | |
112 static int TableLookup(int value) | |
113 { | |
114 for (int i = 0; i < 16; ++i) { | |
115 if (kTable[i] == value) { | |
116 return i; | |
117 } | |
118 } | |
119 WTF_LOG_ERROR("Table does not contain value 0x%x", value); | |
120 return 0; | |
121 } | |
122 | |
123 static String ToTestString(const std::string& input) | |
124 { | |
125 String result(String::make16BitFrom8BitSource(0, 0)); | |
126 bool in_set = false; | |
127 int seen = 0; | |
128 int code = 0; | |
129 int list = 0; | |
130 int cur_shift = 0; | |
131 for (char c : input) { | |
132 if (in_set) { | |
133 switch (c) { | |
134 case '(': | |
135 ASSERT(seen == 0); | |
136 seen |= kSawBracket; | |
137 code |= kCodeBracketBit; | |
138 break; | |
139 case '[': | |
140 ASSERT(seen == 0); | |
141 seen |= kSawBracket; | |
142 code |= kCodeBracketBit | kCodeSquareBracketBit; | |
143 break; | |
144 case ')': | |
145 ASSERT(seen == 0); | |
146 seen |= kSawBracket; | |
147 code |= kCodeBracketBit | kCodeBracketCloseBit; | |
148 break; | |
149 case ']': | |
150 ASSERT(seen == 0); | |
151 seen |= kSawBracket; | |
152 code |= kCodeBracketBit | kCodeSquareBracketBit | kCodeBrack etCloseBit; | |
153 break; | |
154 case 'i': | |
155 ASSERT(seen == 0); // brackets can't be inherited | |
156 seen |= kSawSpecial; | |
157 code |= kCodeSpecialInherited; | |
158 break; | |
159 case 'c': | |
160 ASSERT((seen & ~kSawBracket) == 0); | |
161 seen |= kSawSpecial; | |
162 code |= kCodeSpecialCommon; | |
163 break; | |
164 case 'l': | |
165 ASSERT((seen & kSawLatin) == 0); | |
166 ASSERT(cur_shift < 3); | |
167 seen |= kSawLatin; | |
168 list |= kLatin << (2 * cur_shift++); | |
169 break; | |
170 case 'h': | |
171 ASSERT((seen & kSawHan) == 0); | |
172 ASSERT(cur_shift < 3); | |
173 seen |= kSawHan; | |
174 list |= kHan << (2 * cur_shift++); | |
175 break; | |
176 case 'g': | |
177 ASSERT((seen & kSawGreek) == 0); | |
178 ASSERT(cur_shift < 3); | |
179 seen |= kSawGreek; | |
180 list |= kGreek << (2 * cur_shift++); | |
181 break; | |
182 case '>': | |
183 ASSERT(seen != 0); | |
184 code |= TableLookup(list); | |
185 result.append(static_cast<UChar>(kMockCharMin + code)); | |
186 in_set = false; | |
187 break; | |
188 default: | |
189 WTF_LOG_ERROR("Illegal mock string set char: '%c'", c); | |
190 break; | |
191 } | |
192 continue; | |
193 } | |
194 // not in set | |
195 switch (c) { | |
196 case '<': | |
197 seen = 0; | |
198 code = 0; | |
199 list = 0; | |
200 cur_shift = 0; | |
201 in_set = true; | |
202 break; | |
203 case '(': | |
204 code = kCodeBracketBit | kCodeSpecialCommon; | |
205 break; | |
206 case '[': | |
207 code = kCodeBracketBit | kCodeSquareBracketBit | kCodeSpecialCom mon; | |
208 break; | |
209 case ')': | |
210 code = kCodeBracketBit | kCodeBracketCloseBit | kCodeSpecialComm on; | |
211 break; | |
212 case ']': | |
213 code = kCodeBracketBit | kCodeSquareBracketBit | kCodeBracketClo seBit | kCodeSpecialCommon; | |
214 break; | |
215 case 'i': | |
216 code = kCodeSpecialInherited; | |
217 break; | |
218 case 'c': | |
219 code = kCodeSpecialCommon; | |
220 break; | |
221 case 'l': | |
222 code = kLatin; | |
223 break; | |
224 case 'h': | |
225 code = kHan; | |
226 break; | |
227 case 'g': | |
228 code = kGreek; | |
229 break; | |
230 case '?': | |
231 code = 0; // unknown | |
232 break; | |
233 default: | |
234 WTF_LOG_ERROR("Illegal mock string set char: '%c'", c); | |
235 } | |
236 if (!in_set) { | |
237 result.append(static_cast<UChar>(kMockCharMin + code)); | |
238 } | |
239 } | |
240 return result; | |
241 } | |
242 | |
243 static std::string MockCharString(UChar mockch) | |
244 { | |
245 ASSERT(mockch >= kMockCharMin && mockch < kMockCharLimit); | |
246 int code = mockch - kMockCharMin; | |
247 | |
248 // We use set notation in these cases: | |
249 // - more than one of special, kLatin, kHan, kGreek | |
250 // - bracket and not common (since non-set brackets are common) | |
251 bool is_bracket = (code & kCodeBracketBit) != 0; | |
252 bool is_special = (mockch & kCodeSpecialMask) != 0; | |
253 bool is_common = (mockch & kCodeSpecialMask) == kCodeSpecialCommon; | |
254 char c; | |
255 if (is_bracket) { | |
256 if (code & kCodeSquareBracketBit) { | |
257 if (code & kCodeBracketCloseBit) { | |
258 c = ']'; | |
259 } | |
260 else { | |
261 c = '['; | |
262 } | |
263 } | |
264 else { | |
265 if (code & kCodeBracketCloseBit) { | |
266 c = ')'; | |
267 } | |
268 else { | |
269 c = '('; | |
270 } | |
271 } | |
272 } | |
273 else if (is_special) { | |
274 c = is_common ? 'c' : 'i'; | |
275 } | |
276 std::string result; | |
277 int list_bits = kTable[code & kCodeListIndexMask]; | |
278 while (list_bits) { | |
279 switch (list_bits & kListMask) { | |
280 case 0: | |
281 break; | |
282 case kLatin: | |
283 result += 'l'; | |
284 break; | |
285 case kHan: | |
286 result += 'h'; | |
287 break; | |
288 case kGreek: | |
289 result += 'g'; | |
290 break; | |
291 } | |
292 list_bits >>= kListShift; | |
293 } | |
294 bool need_set = result.length() + (is_special ? 1 : 0) > 1 || (is_bracke t && (result.length() > 0 || !is_common)); | |
295 if (need_set) { | |
296 std::string set_result("<"); | |
297 if (is_bracket) { | |
298 set_result += c; | |
299 } | |
300 if (is_special) { | |
301 if (is_common) { | |
302 set_result += "c"; | |
303 } | |
304 else { | |
305 set_result += "i"; | |
306 } | |
307 } | |
308 set_result += result; | |
309 set_result += ">"; | |
310 return set_result; | |
311 } | |
312 if (is_bracket || is_special) { | |
313 result = c; | |
314 } | |
315 return result; | |
316 } | |
317 | |
// We determine properties based on the offset from kMockCharMin:
// bits 0-3 are an index into kTable, which encodes the ordered list of
//   l, h, g scripts
// bits 4-5 mean: 0 plain, 1 common, 2 inherited, 3 illegal
// bit 6 clear means open bracket, set means close bracket
// bit 7 clear means non-bracket, set means bracket
// bit 8 clear means paren, set means square bracket
// If it's a bracket, the matching bracket is 64 code points away.
325 | |
326 static const UChar32 kMockCharMin = 0xe000; | |
327 static const UChar32 kMockCharLimit = kMockCharMin + 0x200; | |
328 static const int kLatin = 1; | |
329 static const int kHan = 2; | |
330 static const int kGreek = 3; | |
331 static const int kCodeListIndexMask = 0xf; | |
332 static const int kCodeSpecialMask = 0x30; | |
333 static const int kCodeSpecialCommon = 0x10; | |
334 static const int kCodeSpecialInherited = 0x20; | |
335 static const int kCodeBracketCloseBit = 0x40; | |
336 static const int kCodeBracketBit = 0x80; | |
337 static const int kCodeSquareBracketBit = 0x100; | |
338 static const int kListShift = 2; | |
339 static const int kListMask = 0x3; | |
340 static const int kBracketDelta = kCodeBracketCloseBit; | |
341 static const int kTable[16]; | |
342 | |
343 static const int kSawBracket = 0x1; | |
344 static const int kSawSpecial = 0x2; | |
345 static const int kSawLatin = 0x4; | |
346 static const int kSawHan = 0x8; | |
347 static const int kSawGreek = 0x10; | |
348 }; | |
349 | |
350 static constexpr int kLatin2 = MockScriptData::kLatin << 2; | |
351 static constexpr int kHan2 = MockScriptData::kHan << 2; | |
352 static constexpr int kGreek2 = MockScriptData::kGreek << 2; | |
353 static constexpr int kLatin3 = MockScriptData::kLatin << 4; | |
354 static constexpr int kHan3 = MockScriptData::kHan << 4; | |
355 static constexpr int kGreek3 = MockScriptData::kGreek << 4; | |
356 const int MockScriptData::kTable[] = { | |
357 0, kLatin, kHan, kGreek, | |
358 kLatin2 + kHan, kLatin2 + kGreek, | |
359 kHan2 + kLatin, kHan2 + kGreek, | |
360 kGreek2 + kLatin, kGreek2 + kHan, | |
361 kLatin3 + kHan2 + kGreek, kLatin3 + kGreek2 + kHan, | |
362 kHan3 + kLatin2 + kGreek, kHan3 + kGreek2 + kLatin, | |
363 kGreek3 + kLatin2 + kHan, kGreek3 + kHan2 + kLatin, | |
364 }; | |
365 | |
366 class ScriptRunIteratorTest : public testing::Test { | |
367 protected: | |
368 void CheckRuns(const std::vector<TestRun>& runs) | |
369 { | |
370 String text(String::make16BitFrom8BitSource(0, 0)); | |
371 std::vector<ExpectedRun> expect; | |
372 for (auto& run : runs) { | |
373 text.append(String::fromUTF8(run.text.c_str())); | |
374 expect.push_back(ExpectedRun(text.length(), run.code)); | |
375 } | |
376 ScriptRunIterator scriptRunIterator(text.characters16(), text.length()); | |
377 VerifyRuns(&scriptRunIterator, expect); | |
378 } | |
379 | |
380 void CheckMockRuns(const std::vector<TestRun>& runs) | |
381 { | |
382 String text(String::make16BitFrom8BitSource(0, 0)); | |
383 std::vector<ExpectedRun> expect; | |
384 for (const TestRun& run : runs) { | |
385 text.append(MockScriptData::ToTestString(run.text)); | |
386 expect.push_back({ text.length(), run.code }); | |
387 } | |
388 | |
389 ScriptRunIterator scriptRunIterator(text.characters16(), text.length(), | |
390 MockScriptData::instance()); | |
391 VerifyRuns(&scriptRunIterator, expect); | |
392 } | |
393 | |
394 void VerifyRuns(ScriptRunIterator* scriptRunIterator, | |
395 const std::vector<ExpectedRun>& expect) | |
396 { | |
397 unsigned limit; | |
398 UScriptCode code; | |
399 unsigned long run_count = 0; | |
400 while (scriptRunIterator->consume(limit, code)) { | |
401 ASSERT_LT(run_count, expect.size()); | |
402 ASSERT_EQ(expect[run_count].limit, limit); | |
403 ASSERT_EQ(expect[run_count].code, code); | |
404 ++run_count; | |
405 } | |
406 WTF_LOG_ERROR("Expected %ld runs, got %lu ", expect.size(), run_count); | |
407 ASSERT_EQ(expect.size(), run_count); | |
408 } | |
409 }; | |
410 | |
411 TEST_F(ScriptRunIteratorTest, Empty) | |
412 { | |
413 String empty(String::make16BitFrom8BitSource(0, 0)); | |
414 ScriptRunIterator scriptRunIterator(empty.characters16(), empty.length()); | |
415 unsigned limit; | |
416 UScriptCode code; | |
417 ASSERT(!scriptRunIterator.consume(limit, code)); | |
418 } | |
419 | |
// Some of our compilers cannot initialize a vector from a braced list yet,
// so the runs are first stored in a static array. Declares a local
// std::vector<TestRun> named |runs| in the current scope.
#define DECLARE_RUNSVECTOR(...)                     \
    static const TestRun runsArray[] = __VA_ARGS__; \
    std::vector<TestRun> runs(runsArray, runsArray + sizeof(runsArray) / sizeof(*runsArray));

// Checks the given runs with real Unicode data. Wrapped in do/while so the
// macro behaves as a single statement and can be used more than once in the
// same scope.
#define CHECK_RUNS(...)                 \
    do {                                \
        DECLARE_RUNSVECTOR(__VA_ARGS__) \
        CheckRuns(runs);                \
    } while (0)

// Same as CHECK_RUNS, but the run text is mock notation and the iterator
// uses MockScriptData.
#define CHECK_MOCK_RUNS(...)            \
    do {                                \
        DECLARE_RUNSVECTOR(__VA_ARGS__) \
        CheckMockRuns(runs);            \
    } while (0)
432 | |
433 TEST_F(ScriptRunIteratorTest, Whitespace) | |
434 { | |
435 CHECK_RUNS({ { " \t ", USCRIPT_COMMON } }); | |
436 } | |
437 | |
438 TEST_F(ScriptRunIteratorTest, Common) | |
439 { | |
440 CHECK_RUNS({ { " ... !?", USCRIPT_COMMON } }); | |
441 } | |
442 | |
443 TEST_F(ScriptRunIteratorTest, Latin) | |
444 { | |
445 CHECK_RUNS({ { "latin", USCRIPT_LATIN } }); | |
446 } | |
447 | |
448 TEST_F(ScriptRunIteratorTest, Chinese) | |
449 { | |
450 CHECK_RUNS({ { "萬國碼", USCRIPT_HAN } }); | |
451 } | |
452 | |
453 // Close bracket without matching open is ignored | |
454 TEST_F(ScriptRunIteratorTest, UnbalancedParens1) | |
455 { | |
456 CHECK_RUNS({ { "(萬", USCRIPT_HAN }, | |
457 { "a]", USCRIPT_LATIN }, | |
458 { ")", USCRIPT_HAN } }); | |
459 } | |
460 | |
461 // Open bracket without matching close is popped when inside | |
462 // matching close brackets, so doesn't match later close. | |
463 TEST_F(ScriptRunIteratorTest, UnbalancedParens2) | |
464 { | |
465 CHECK_RUNS({ { "(萬", USCRIPT_HAN }, | |
466 { "a[", USCRIPT_LATIN }, | |
467 { ")]", USCRIPT_HAN } }); | |
468 } | |
469 | |
470 // space goes with leading script | |
471 TEST_F(ScriptRunIteratorTest, LatinHan) | |
472 { | |
473 CHECK_RUNS({ { "Unicode ", USCRIPT_LATIN }, | |
474 { "萬國碼", USCRIPT_HAN } }); | |
475 } | |
476 | |
477 // space goes with leading script | |
478 TEST_F(ScriptRunIteratorTest, HanLatin) | |
479 { | |
480 CHECK_RUNS({ { "萬國碼 ", USCRIPT_HAN }, | |
481 { "Unicode", USCRIPT_LATIN } }); | |
482 } | |
483 | |
484 TEST_F(ScriptRunIteratorTest, ParenEmptyParen) | |
485 { | |
486 CHECK_RUNS({ { "()", USCRIPT_COMMON } }); | |
487 } | |
488 | |
489 TEST_F(ScriptRunIteratorTest, ParenChineseParen) | |
490 { | |
491 CHECK_RUNS({ { "(萬國碼)", USCRIPT_HAN } }); | |
492 } | |
493 | |
494 TEST_F(ScriptRunIteratorTest, ParenLatinParen) | |
495 { | |
496 CHECK_RUNS({ { "(Unicode)", USCRIPT_LATIN } }); | |
497 } | |
498 | |
499 // open paren gets leading script | |
500 TEST_F(ScriptRunIteratorTest, LatinParenChineseParen) | |
501 { | |
502 CHECK_RUNS({ { "Unicode (", USCRIPT_LATIN }, | |
503 { "萬國碼", USCRIPT_HAN }, | |
504 { ")", USCRIPT_LATIN } }); | |
505 } | |
506 | |
507 // open paren gets first trailing script if no leading script | |
508 TEST_F(ScriptRunIteratorTest, ParenChineseParenLatin) | |
509 { | |
510 CHECK_RUNS({ { "(萬國碼) ", USCRIPT_HAN }, | |
511 { "Unicode", USCRIPT_LATIN } }); | |
512 } | |
513 | |
514 // leading common and open paren get first trailing script. | |
515 // TODO(dougfelt): we don't do quote matching, but probably should figure out | |
516 // something better then doing nothing. | |
517 TEST_F(ScriptRunIteratorTest, QuoteParenChineseParenLatinQuote) | |
518 { | |
519 CHECK_RUNS({ { "\"(萬國碼) ", USCRIPT_HAN }, | |
520 { "Unicode\"", USCRIPT_LATIN } }); | |
521 } | |
522 | |
523 // Unmatched close brace gets leading context | |
524 TEST_F(ScriptRunIteratorTest, UnmatchedClose) | |
525 { | |
526 CHECK_RUNS({ { "Unicode (", USCRIPT_LATIN }, | |
527 { "萬國碼] ", USCRIPT_HAN }, | |
528 { ") Unicode\"", USCRIPT_LATIN } }); | |
529 } | |
530 | |
531 // Match up to 32 bracket pairs | |
532 TEST_F(ScriptRunIteratorTest, Match32Brackets) | |
533 { | |
534 CHECK_RUNS({ { "[萬國碼 ", USCRIPT_HAN }, | |
535 { "Unicode (((((((((((((((((((((((((((((((!" | |
536 ")))))))))))))))))))))))))))))))", | |
537 USCRIPT_LATIN }, | |
538 { "]", USCRIPT_HAN } }); | |
539 } | |
540 | |
541 // Matches 32 most recent bracket pairs. More than that, and we revert to | |
542 // surrounding script. | |
543 TEST_F(ScriptRunIteratorTest, Match32MostRecentBrackets) | |
544 { | |
545 CHECK_RUNS({ { "((([萬國碼 ", USCRIPT_HAN }, | |
546 { "Unicode (((((((((((((((((((((((((((((((", USCRIPT_LATIN }, | |
547 { "萬國碼!", USCRIPT_HAN }, | |
548 { ")))))))))))))))))))))))))))))))", USCRIPT_LATIN }, | |
549 { "]", USCRIPT_HAN }, | |
550 { "But )))", USCRIPT_LATIN } }); | |
551 } | |
552 | |
553 // A char with multiple scripts that match both leading and trailing context | |
554 // gets the leading context. | |
555 TEST_F(ScriptRunIteratorTest, ExtensionsPreferLeadingContext) | |
556 { | |
557 CHECK_MOCK_RUNS({ { "h<lh>", USCRIPT_HAN }, | |
558 { "l", USCRIPT_LATIN } }); | |
559 } | |
560 | |
561 // A char with multiple scripts that only match trailing context gets the | |
562 // trailing context. | |
563 TEST_F(ScriptRunIteratorTest, ExtensionsMatchTrailingContext) | |
564 { | |
565 CHECK_MOCK_RUNS({ { "h", USCRIPT_HAN }, | |
566 { "<gl>l", USCRIPT_LATIN } }); | |
567 } | |
568 | |
569 // Retain first established priority script. <lhg><gh> produce the script <gh> | |
570 // with g as priority, because of the two priority scripts l and g, only g | |
571 // remains. Then <gh><hgl> retains g as priority, because of the two priority | |
572 // scripts g and h that remain, g was encountered first. | |
573 TEST_F(ScriptRunIteratorTest, ExtensionsRetainFirstPriorityScript) | |
574 { | |
575 CHECK_MOCK_RUNS({ { "<lhg><gh><hgl>", USCRIPT_GREEK } }); | |
576 } | |
577 | |
578 // Parens can have scripts that break script runs. | |
579 TEST_F(ScriptRunIteratorTest, ExtensionsParens) | |
580 { | |
581 CHECK_MOCK_RUNS({ { "<gl><(lg>", USCRIPT_GREEK }, | |
582 { "h<[hl>", USCRIPT_HAN }, | |
583 { "l", USCRIPT_LATIN }, | |
584 { "<]hl>", USCRIPT_HAN }, | |
585 { "<)lg>", USCRIPT_GREEK } }); | |
586 } | |
587 | |
588 // The close paren might be encountered before we've established the open | |
589 // paren's script, but when this is the case the current set is still valid, so | |
590 // this doesn't affect it nor break the run. | |
591 TEST_F(ScriptRunIteratorTest, ExtensionsParens2) | |
592 { | |
593 CHECK_MOCK_RUNS({ { "<(lhg><gh><)lhg>", USCRIPT_GREEK } }); | |
594 } | |
595 | |
596 // A common script with a single extension should be treated as common, but | |
597 // with the extended script as a default. If we encounter anything other than | |
598 // common, that takes priority. If we encounter other common scripts with a | |
599 // single extension, the current priority remains. | |
600 TEST_F(ScriptRunIteratorTest, CommonWithPriority) | |
601 { | |
602 CHECK_MOCK_RUNS({ { "<ch>", USCRIPT_HAN } }); | |
603 } | |
604 | |
605 TEST_F(ScriptRunIteratorTest, CommonWithPriority2) | |
606 { | |
607 CHECK_MOCK_RUNS({ { "<ch><lh>", USCRIPT_LATIN } }); | |
608 } | |
609 | |
610 TEST_F(ScriptRunIteratorTest, CommonWithPriority3) | |
611 { | |
612 CHECK_MOCK_RUNS({ { "<ch><cl><cg>", USCRIPT_HAN } }); | |
613 } | |
614 | |
615 // UDatta is inherited with LATIN and DEVANAGARI extensions. Since it has | |
616 // LATIN, and the dotted circle is COMMON and has adopted the preceding LATIN, | |
617 // it gets the LATIN. This is standard. | |
618 TEST_F(ScriptRunIteratorTest, LatinDottedCircleUdatta) | |
619 { | |
620 CHECK_RUNS({ { "Latin \u25cc\u0951", USCRIPT_LATIN } }); | |
621 } | |
622 | |
623 // In this situation, UDatta doesn't share a script with the value inherited by | |
624 // the dotted circle. It captures the preceding dotted circle and breaks it | |
625 // from the run it would normally have been in. | |
626 TEST_F(ScriptRunIteratorTest, HanDottedCircleUdatta) | |
627 { | |
628 CHECK_RUNS({ { "萬國碼 ", USCRIPT_HAN }, | |
629 { "\u25cc\u0951", USCRIPT_DEVANAGARI } }); | |
630 } | |
631 | |
632 // Tatweel is \u0640 Lm, Fathatan is \u064b Mn. The script of tatweel is | |
633 // common, that of Fathatan is inherited. The script extensions for Fathatan | |
634 // are Arabic and Syriac. The Syriac script is 34 in ICU, Arabic is 2. So the | |
635 // preferred script for Fathatan is Arabic, according to Behdad's | |
636 // heuristic. This is exactly analogous to the Udatta tests above, except | |
637 // Tatweel is Lm. But we don't take properties into account, only scripts. | |
638 TEST_F(ScriptRunIteratorTest, LatinTatweelFathatan) | |
639 { | |
640 CHECK_RUNS({ { "Latin ", USCRIPT_LATIN }, | |
641 { "\u0640\u064b", USCRIPT_ARABIC } }); | |
642 } | |
643 | |
644 // Another case where if the mark accepts a script that was inherited by the | |
645 // preceding common-script character, they both continue in that script. | |
646 TEST_F(ScriptRunIteratorTest, SyriacTatweelFathatan) | |
647 { | |
648 CHECK_RUNS({ { "\u0722\u0640\u064b", USCRIPT_SYRIAC } }); | |
649 } | |
650 | |
651 // The Udatta is inherited, so will share runs with anything that is not | |
652 // common. | |
653 TEST_F(ScriptRunIteratorTest, HanUdatta) | |
654 { | |
655 CHECK_RUNS({ { "萬國碼\u0951", USCRIPT_HAN } }); | |
656 } | |
657 | |
658 // The Udatta is inherited, and will capture the space and turn it into | |
659 // Devanagari. | |
660 TEST_F(ScriptRunIteratorTest, HanSpaceUdatta) | |
661 { | |
662 CHECK_RUNS({ { "萬國碼", USCRIPT_HAN }, | |
663 { " \u0951", USCRIPT_DEVANAGARI } }); | |
664 } | |
665 | |
666 // Make sure Mock code works too. | |
667 TEST_F(ScriptRunIteratorTest, MockHanInheritedGL) | |
668 { | |
669 CHECK_MOCK_RUNS({ { "h<igl>", USCRIPT_HAN } }); | |
670 } | |
671 | |
672 TEST_F(ScriptRunIteratorTest, MockHanCommonInheritedGL) | |
673 { | |
674 CHECK_MOCK_RUNS({ { "h", USCRIPT_HAN }, | |
675 { "c<igl>", USCRIPT_GREEK } }); | |
676 } | |
677 | |
678 // Leading inherited just act like common, except there's no preferred script. | |
679 TEST_F(ScriptRunIteratorTest, MockLeadingInherited) | |
680 { | |
681 CHECK_MOCK_RUNS({ { "<igl>", USCRIPT_COMMON } }); | |
682 } | |
683 | |
684 // Leading inherited just act like common, except there's no preferred script. | |
685 TEST_F(ScriptRunIteratorTest, MockLeadingInherited2) | |
686 { | |
687 CHECK_MOCK_RUNS({ { "<igl><ih>", USCRIPT_COMMON } }); | |
688 } | |
689 | |
690 TEST_F(ScriptRunIteratorTest, LeadingInheritedHan) | |
691 { | |
692 CHECK_RUNS({ { "\u0951萬國碼", USCRIPT_HAN } }); | |
693 } | |
694 | |
695 TEST_F(ScriptRunIteratorTest, LeadingInheritedHan2) | |
696 { | |
697 CHECK_RUNS({ { "\u0951\u064b萬國碼", USCRIPT_HAN } }); | |
698 } | |
699 | |
700 TEST_F(ScriptRunIteratorTest, OddLatinString) | |
701 { | |
702 CHECK_RUNS({ { "ç̈", USCRIPT_LATIN } }); | |
703 } | |
704 | |
705 class ScriptRunIteratorICUDataTest : public testing::Test { | |
706 public: | |
707 ScriptRunIteratorICUDataTest() | |
708 : max_extensions_(0) | |
709 , max_extensions_cp_(0xffff) | |
710 { | |
711 int max_extensions = 0; | |
712 UChar32 max_extensions_cp = 0; | |
713 for (UChar32 cp = 0; cp < 0x11000; ++cp) { | |
714 UErrorCode status = U_ZERO_ERROR; | |
715 int count = uscript_getScriptExtensions(cp, NULL, 0, &status); | |
716 if (count > max_extensions) { | |
717 max_extensions = count; | |
718 max_extensions_cp = cp; | |
719 } | |
720 if (count > ScriptData::kMaxScriptCount) { | |
721 } | |
722 } | |
723 max_extensions_ = max_extensions; | |
724 max_extensions_cp_ = max_extensions_cp; | |
725 } | |
726 | |
727 protected: | |
728 UChar32 GetACharWithMaxExtensions(int* num_extensions) | |
729 { | |
730 if (num_extensions) { | |
731 *num_extensions = max_extensions_; | |
732 } | |
733 return max_extensions_cp_; | |
734 } | |
735 | |
736 private: | |
737 int max_extensions_; | |
738 UChar32 max_extensions_cp_; | |
739 }; | |
740 | |
741 // Validate that ICU never returns more than our maximum expected number of | |
742 // script extensions. | |
743 TEST_F(ScriptRunIteratorICUDataTest, ValidateICUMaxScriptExtensions) | |
744 { | |
745 int max_extensions; | |
746 UChar32 cp = GetACharWithMaxExtensions(&max_extensions); | |
747 ASSERT_LE(max_extensions, ScriptData::kMaxScriptCount) | |
748 << "char " << std::hex << cp << std::dec; | |
749 } | |
750 | |
751 // Check that ICUScriptData returns all of a character's scripts. | |
752 // This only checks one likely character, but doesn't check all cases. | |
753 TEST_F(ScriptRunIteratorICUDataTest, ICUDataGetScriptsReturnsAllExtensions) | |
754 { | |
755 int max_extensions; | |
756 UChar32 cp = GetACharWithMaxExtensions(&max_extensions); | |
757 Vector<UScriptCode> extensions; | |
758 ICUScriptData::instance()->getScripts(cp, extensions); | |
759 | |
760 // It's possible that GetScripts adds the primary script to the list of | |
761 // extensions, resulting in one more script than the raw extension count. | |
762 ASSERT_GE(static_cast<int>(extensions.size()), max_extensions) | |
763 << "char " << std::hex << cp << std::dec; | |
764 } | |
765 | |
766 TEST_F(ScriptRunIteratorICUDataTest, CommonHaveNoMoreThanOneExtension) | |
767 { | |
768 Vector<UScriptCode> extensions; | |
769 for (UChar32 cp = 0; cp < 0x110000; ++cp) { | |
770 ICUScriptData::instance()->getScripts(cp, extensions); | |
771 UScriptCode primary = extensions.at(0); | |
772 if (primary == USCRIPT_COMMON) { | |
773 ASSERT_LE(extensions.size(), 2ul) | |
774 << "cp: " << std::hex << cp << std::dec; | |
775 } | |
776 } | |
777 } | |
778 | |
779 // ZWJ is \u200D Cf (Format, other) and its script is inherited. I'm going to | |
780 // ignore this for now, as I think it shouldn't matter which run it ends up | |
781 // in. HarfBuzz needs to be able to use it as context and shape each | |
782 // neighboring character appropriately no matter what run it got assigned to. | |
783 | |
784 } // namespace blink | |
OLD | NEW |