Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(657)

Side by Side Diff: Source/platform/fonts/ScriptRunIteratorTest.cpp

Issue 1323513006: Upstream ScriptRunIterator for segmenting text runs by script (Closed) Base URL: svn://svn.chromium.org/blink/trunk
Patch Set: Created 5 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "config.h"
6
7 #include "wtf/Assertions.h"
8 #include "platform/fonts/ScriptRunIterator.h"
9 #include "platform/Logging.h"
10 #include "wtf/Threading.h"
11 #include "wtf/text/WTFString.h"
12
13 #include <gtest/gtest.h>
14
15 #include <string>
16 #include <vector>
17
18 namespace blink {
19
20 struct TestRun {
21 std::string text;
22 UScriptCode code;
23 };
24
25 struct ExpectedRun {
26 unsigned limit;
27 UScriptCode code;
28
29 ExpectedRun(unsigned the_limit, UScriptCode the_code)
30 : limit(the_limit)
31 , code(the_code)
32 {
33 }
34 };
35
36 class MockScriptData : public ScriptData {
eae 2015/08/28 21:10:25 Do we really need to mock out the data object? It
37 public:
38 ~MockScriptData() override {}
39
40 static const MockScriptData* instance()
41 {
42 AtomicallyInitializedStaticReference(const MockScriptData, mockScriptDat a, (new MockScriptData()));
43
44 return &mockScriptData;
45 }
46
47 virtual void getScripts(UChar32 ch, Vector<UScriptCode>& dst) const override
48 {
49 ASSERT(ch >= kMockCharMin);
50 ASSERT(ch < kMockCharLimit);
51
52 int code = ch - kMockCharMin;
53 dst.clear();
54 switch (code & kCodeSpecialMask) {
55 case kCodeSpecialCommon:
56 dst.append(USCRIPT_COMMON);
57 break;
58 case kCodeSpecialInherited:
59 dst.append(USCRIPT_INHERITED);
60 break;
61 default:
62 break;
63 }
64 int list_bits = kTable[code & kCodeListIndexMask];
65 if (dst.isEmpty() && list_bits == 0) {
66 dst.append(USCRIPT_UNKNOWN);
67 return;
68 }
69 while (list_bits) {
70 switch (list_bits & kListMask) {
71 case 0:
72 break;
73 case kLatin:
74 dst.append(USCRIPT_LATIN);
75 break;
76 case kHan:
77 dst.append(USCRIPT_HAN);
78 break;
79 case kGreek:
80 dst.append(USCRIPT_GREEK);
81 break;
82 }
83 list_bits >>= kListShift;
84 }
85 }
86
87 UChar32 getPairedBracket(UChar32 ch) const override
88 {
89 switch (getPairedBracketType(ch)) {
90 case PairedBracketType::CLOSE:
91 return ch - kBracketDelta;
92 case PairedBracketType::OPEN:
93 return ch + kBracketDelta;
94 default:
95 return ch;
96 }
97 }
98
99 PairedBracketType getPairedBracketType(UChar32 ch) const override
100 {
101 ASSERT(ch >= kMockCharMin && ch < kMockCharLimit);
102 int code = ch - kMockCharMin;
103 if ((code & kCodeBracketBit) == 0) {
104 return PairedBracketType::NONE;
105 }
106 if (code & kCodeBracketCloseBit) {
107 return PairedBracketType::CLOSE;
108 }
109 return PairedBracketType::OPEN;
110 }
111
112 static int TableLookup(int value)
113 {
114 for (int i = 0; i < 16; ++i) {
115 if (kTable[i] == value) {
116 return i;
117 }
118 }
119 WTF_LOG_ERROR("Table does not contain value 0x%x", value);
120 return 0;
121 }
122
123 static String ToTestString(const std::string& input)
124 {
125 String result(String::make16BitFrom8BitSource(0, 0));
126 bool in_set = false;
127 int seen = 0;
128 int code = 0;
129 int list = 0;
130 int cur_shift = 0;
131 for (char c : input) {
132 if (in_set) {
133 switch (c) {
134 case '(':
135 ASSERT(seen == 0);
136 seen |= kSawBracket;
137 code |= kCodeBracketBit;
138 break;
139 case '[':
140 ASSERT(seen == 0);
141 seen |= kSawBracket;
142 code |= kCodeBracketBit | kCodeSquareBracketBit;
143 break;
144 case ')':
145 ASSERT(seen == 0);
146 seen |= kSawBracket;
147 code |= kCodeBracketBit | kCodeBracketCloseBit;
148 break;
149 case ']':
150 ASSERT(seen == 0);
151 seen |= kSawBracket;
152 code |= kCodeBracketBit | kCodeSquareBracketBit | kCodeBrack etCloseBit;
153 break;
154 case 'i':
155 ASSERT(seen == 0); // brackets can't be inherited
156 seen |= kSawSpecial;
157 code |= kCodeSpecialInherited;
158 break;
159 case 'c':
160 ASSERT((seen & ~kSawBracket) == 0);
161 seen |= kSawSpecial;
162 code |= kCodeSpecialCommon;
163 break;
164 case 'l':
165 ASSERT((seen & kSawLatin) == 0);
166 ASSERT(cur_shift < 3);
167 seen |= kSawLatin;
168 list |= kLatin << (2 * cur_shift++);
169 break;
170 case 'h':
171 ASSERT((seen & kSawHan) == 0);
172 ASSERT(cur_shift < 3);
173 seen |= kSawHan;
174 list |= kHan << (2 * cur_shift++);
175 break;
176 case 'g':
177 ASSERT((seen & kSawGreek) == 0);
178 ASSERT(cur_shift < 3);
179 seen |= kSawGreek;
180 list |= kGreek << (2 * cur_shift++);
181 break;
182 case '>':
183 ASSERT(seen != 0);
184 code |= TableLookup(list);
185 result.append(static_cast<UChar>(kMockCharMin + code));
186 in_set = false;
187 break;
188 default:
189 WTF_LOG_ERROR("Illegal mock string set char: '%c'", c);
190 break;
191 }
192 continue;
193 }
194 // not in set
195 switch (c) {
196 case '<':
197 seen = 0;
198 code = 0;
199 list = 0;
200 cur_shift = 0;
201 in_set = true;
202 break;
203 case '(':
204 code = kCodeBracketBit | kCodeSpecialCommon;
205 break;
206 case '[':
207 code = kCodeBracketBit | kCodeSquareBracketBit | kCodeSpecialCom mon;
208 break;
209 case ')':
210 code = kCodeBracketBit | kCodeBracketCloseBit | kCodeSpecialComm on;
211 break;
212 case ']':
213 code = kCodeBracketBit | kCodeSquareBracketBit | kCodeBracketClo seBit | kCodeSpecialCommon;
214 break;
215 case 'i':
216 code = kCodeSpecialInherited;
217 break;
218 case 'c':
219 code = kCodeSpecialCommon;
220 break;
221 case 'l':
222 code = kLatin;
223 break;
224 case 'h':
225 code = kHan;
226 break;
227 case 'g':
228 code = kGreek;
229 break;
230 case '?':
231 code = 0; // unknown
232 break;
233 default:
234 WTF_LOG_ERROR("Illegal mock string set char: '%c'", c);
235 }
236 if (!in_set) {
237 result.append(static_cast<UChar>(kMockCharMin + code));
238 }
239 }
240 return result;
241 }
242
243 static std::string MockCharString(UChar mockch)
244 {
245 ASSERT(mockch >= kMockCharMin && mockch < kMockCharLimit);
246 int code = mockch - kMockCharMin;
247
248 // We use set notation in these cases:
249 // - more than one of special, kLatin, kHan, kGreek
250 // - bracket and not common (since non-set brackets are common)
251 bool is_bracket = (code & kCodeBracketBit) != 0;
252 bool is_special = (mockch & kCodeSpecialMask) != 0;
253 bool is_common = (mockch & kCodeSpecialMask) == kCodeSpecialCommon;
254 char c;
255 if (is_bracket) {
256 if (code & kCodeSquareBracketBit) {
257 if (code & kCodeBracketCloseBit) {
258 c = ']';
259 }
260 else {
261 c = '[';
262 }
263 }
264 else {
265 if (code & kCodeBracketCloseBit) {
266 c = ')';
267 }
268 else {
269 c = '(';
270 }
271 }
272 }
273 else if (is_special) {
274 c = is_common ? 'c' : 'i';
275 }
276 std::string result;
277 int list_bits = kTable[code & kCodeListIndexMask];
278 while (list_bits) {
279 switch (list_bits & kListMask) {
280 case 0:
281 break;
282 case kLatin:
283 result += 'l';
284 break;
285 case kHan:
286 result += 'h';
287 break;
288 case kGreek:
289 result += 'g';
290 break;
291 }
292 list_bits >>= kListShift;
293 }
294 bool need_set = result.length() + (is_special ? 1 : 0) > 1 || (is_bracke t && (result.length() > 0 || !is_common));
295 if (need_set) {
296 std::string set_result("<");
297 if (is_bracket) {
298 set_result += c;
299 }
300 if (is_special) {
301 if (is_common) {
302 set_result += "c";
303 }
304 else {
305 set_result += "i";
306 }
307 }
308 set_result += result;
309 set_result += ">";
310 return set_result;
311 }
312 if (is_bracket || is_special) {
313 result = c;
314 }
315 return result;
316 }
317
318 // we determine properties based on the offset from kMockCharMin
319 // bits 0-3 represent the list of l, h, c scripts (index into table)
320 // bit 4-5 means: 0 plain, 1 common, 2 inherited, 3 illegal
321 // bit 6 clear means non-bracket, open means bracket
322 // bit 7 clear means open bracket, set means close bracket
323 // bit 8 clear means paren, set means bracket
324 // if it's a bracket, the matching bracket is 64 code points away
325
326 static const UChar32 kMockCharMin = 0xe000;
327 static const UChar32 kMockCharLimit = kMockCharMin + 0x200;
328 static const int kLatin = 1;
329 static const int kHan = 2;
330 static const int kGreek = 3;
331 static const int kCodeListIndexMask = 0xf;
332 static const int kCodeSpecialMask = 0x30;
333 static const int kCodeSpecialCommon = 0x10;
334 static const int kCodeSpecialInherited = 0x20;
335 static const int kCodeBracketCloseBit = 0x40;
336 static const int kCodeBracketBit = 0x80;
337 static const int kCodeSquareBracketBit = 0x100;
338 static const int kListShift = 2;
339 static const int kListMask = 0x3;
340 static const int kBracketDelta = kCodeBracketCloseBit;
341 static const int kTable[16];
342
343 static const int kSawBracket = 0x1;
344 static const int kSawSpecial = 0x2;
345 static const int kSawLatin = 0x4;
346 static const int kSawHan = 0x8;
347 static const int kSawGreek = 0x10;
348 };
349
350 static constexpr int kLatin2 = MockScriptData::kLatin << 2;
351 static constexpr int kHan2 = MockScriptData::kHan << 2;
352 static constexpr int kGreek2 = MockScriptData::kGreek << 2;
353 static constexpr int kLatin3 = MockScriptData::kLatin << 4;
354 static constexpr int kHan3 = MockScriptData::kHan << 4;
355 static constexpr int kGreek3 = MockScriptData::kGreek << 4;
356 const int MockScriptData::kTable[] = {
357 0, kLatin, kHan, kGreek,
358 kLatin2 + kHan, kLatin2 + kGreek,
359 kHan2 + kLatin, kHan2 + kGreek,
360 kGreek2 + kLatin, kGreek2 + kHan,
361 kLatin3 + kHan2 + kGreek, kLatin3 + kGreek2 + kHan,
362 kHan3 + kLatin2 + kGreek, kHan3 + kGreek2 + kLatin,
363 kGreek3 + kLatin2 + kHan, kGreek3 + kHan2 + kLatin,
364 };
365
366 class ScriptRunIteratorTest : public testing::Test {
367 protected:
368 void CheckRuns(const std::vector<TestRun>& runs)
369 {
370 String text(String::make16BitFrom8BitSource(0, 0));
371 std::vector<ExpectedRun> expect;
372 for (auto& run : runs) {
373 text.append(String::fromUTF8(run.text.c_str()));
374 expect.push_back(ExpectedRun(text.length(), run.code));
375 }
376 ScriptRunIterator scriptRunIterator(text.characters16(), text.length());
377 VerifyRuns(&scriptRunIterator, expect);
378 }
379
380 void CheckMockRuns(const std::vector<TestRun>& runs)
381 {
382 String text(String::make16BitFrom8BitSource(0, 0));
383 std::vector<ExpectedRun> expect;
384 for (const TestRun& run : runs) {
385 text.append(MockScriptData::ToTestString(run.text));
386 expect.push_back({ text.length(), run.code });
387 }
388
389 ScriptRunIterator scriptRunIterator(text.characters16(), text.length(),
390 MockScriptData::instance());
391 VerifyRuns(&scriptRunIterator, expect);
392 }
393
394 void VerifyRuns(ScriptRunIterator* scriptRunIterator,
395 const std::vector<ExpectedRun>& expect)
396 {
397 unsigned limit;
398 UScriptCode code;
399 unsigned long run_count = 0;
400 while (scriptRunIterator->consume(limit, code)) {
401 ASSERT_LT(run_count, expect.size());
402 ASSERT_EQ(expect[run_count].limit, limit);
403 ASSERT_EQ(expect[run_count].code, code);
404 ++run_count;
405 }
406 WTF_LOG_ERROR("Expected %ld runs, got %lu ", expect.size(), run_count);
407 ASSERT_EQ(expect.size(), run_count);
408 }
409 };
410
411 TEST_F(ScriptRunIteratorTest, Empty)
412 {
413 String empty(String::make16BitFrom8BitSource(0, 0));
414 ScriptRunIterator scriptRunIterator(empty.characters16(), empty.length());
415 unsigned limit;
416 UScriptCode code;
417 ASSERT(!scriptRunIterator.consume(limit, code));
418 }
419
// Some of our compilers cannot initialize a vector from a braced list yet, so
// the runs are first stored in an array and the vector |runs| is built from
// that array.
#define DECLARE_RUNSVECTOR(...) \
    static const TestRun runsArray[] = __VA_ARGS__; \
    std::vector<TestRun> runs(runsArray, runsArray + sizeof(runsArray) / sizeof(runsArray[0]));

// Declares |runs| from the braced list and checks it with CheckRuns().
#define CHECK_RUNS(...) \
    DECLARE_RUNSVECTOR(__VA_ARGS__); \
    CheckRuns(runs);

// Declares |runs| from the braced list and checks it with CheckMockRuns().
#define CHECK_MOCK_RUNS(...) \
    DECLARE_RUNSVECTOR(__VA_ARGS__); \
    CheckMockRuns(runs);
432
433 TEST_F(ScriptRunIteratorTest, Whitespace)
434 {
435 CHECK_RUNS({ { " \t ", USCRIPT_COMMON } });
436 }
437
438 TEST_F(ScriptRunIteratorTest, Common)
439 {
440 CHECK_RUNS({ { " ... !?", USCRIPT_COMMON } });
441 }
442
443 TEST_F(ScriptRunIteratorTest, Latin)
444 {
445 CHECK_RUNS({ { "latin", USCRIPT_LATIN } });
446 }
447
448 TEST_F(ScriptRunIteratorTest, Chinese)
449 {
450 CHECK_RUNS({ { "萬國碼", USCRIPT_HAN } });
451 }
452
453 // Close bracket without matching open is ignored
454 TEST_F(ScriptRunIteratorTest, UnbalancedParens1)
455 {
456 CHECK_RUNS({ { "(萬", USCRIPT_HAN },
457 { "a]", USCRIPT_LATIN },
458 { ")", USCRIPT_HAN } });
459 }
460
461 // Open bracket without matching close is popped when inside
462 // matching close brackets, so doesn't match later close.
463 TEST_F(ScriptRunIteratorTest, UnbalancedParens2)
464 {
465 CHECK_RUNS({ { "(萬", USCRIPT_HAN },
466 { "a[", USCRIPT_LATIN },
467 { ")]", USCRIPT_HAN } });
468 }
469
470 // space goes with leading script
471 TEST_F(ScriptRunIteratorTest, LatinHan)
472 {
473 CHECK_RUNS({ { "Unicode ", USCRIPT_LATIN },
474 { "萬國碼", USCRIPT_HAN } });
475 }
476
477 // space goes with leading script
478 TEST_F(ScriptRunIteratorTest, HanLatin)
479 {
480 CHECK_RUNS({ { "萬國碼 ", USCRIPT_HAN },
481 { "Unicode", USCRIPT_LATIN } });
482 }
483
484 TEST_F(ScriptRunIteratorTest, ParenEmptyParen)
485 {
486 CHECK_RUNS({ { "()", USCRIPT_COMMON } });
487 }
488
489 TEST_F(ScriptRunIteratorTest, ParenChineseParen)
490 {
491 CHECK_RUNS({ { "(萬國碼)", USCRIPT_HAN } });
492 }
493
494 TEST_F(ScriptRunIteratorTest, ParenLatinParen)
495 {
496 CHECK_RUNS({ { "(Unicode)", USCRIPT_LATIN } });
497 }
498
499 // open paren gets leading script
500 TEST_F(ScriptRunIteratorTest, LatinParenChineseParen)
501 {
502 CHECK_RUNS({ { "Unicode (", USCRIPT_LATIN },
503 { "萬國碼", USCRIPT_HAN },
504 { ")", USCRIPT_LATIN } });
505 }
506
507 // open paren gets first trailing script if no leading script
508 TEST_F(ScriptRunIteratorTest, ParenChineseParenLatin)
509 {
510 CHECK_RUNS({ { "(萬國碼) ", USCRIPT_HAN },
511 { "Unicode", USCRIPT_LATIN } });
512 }
513
514 // leading common and open paren get first trailing script.
515 // TODO(dougfelt): we don't do quote matching, but probably should figure out
516 // something better then doing nothing.
517 TEST_F(ScriptRunIteratorTest, QuoteParenChineseParenLatinQuote)
518 {
519 CHECK_RUNS({ { "\"(萬國碼) ", USCRIPT_HAN },
520 { "Unicode\"", USCRIPT_LATIN } });
521 }
522
523 // Unmatched close brace gets leading context
524 TEST_F(ScriptRunIteratorTest, UnmatchedClose)
525 {
526 CHECK_RUNS({ { "Unicode (", USCRIPT_LATIN },
527 { "萬國碼] ", USCRIPT_HAN },
528 { ") Unicode\"", USCRIPT_LATIN } });
529 }
530
531 // Match up to 32 bracket pairs
532 TEST_F(ScriptRunIteratorTest, Match32Brackets)
533 {
534 CHECK_RUNS({ { "[萬國碼 ", USCRIPT_HAN },
535 { "Unicode (((((((((((((((((((((((((((((((!"
536 ")))))))))))))))))))))))))))))))",
537 USCRIPT_LATIN },
538 { "]", USCRIPT_HAN } });
539 }
540
541 // Matches 32 most recent bracket pairs. More than that, and we revert to
542 // surrounding script.
543 TEST_F(ScriptRunIteratorTest, Match32MostRecentBrackets)
544 {
545 CHECK_RUNS({ { "((([萬國碼 ", USCRIPT_HAN },
546 { "Unicode (((((((((((((((((((((((((((((((", USCRIPT_LATIN },
547 { "萬國碼!", USCRIPT_HAN },
548 { ")))))))))))))))))))))))))))))))", USCRIPT_LATIN },
549 { "]", USCRIPT_HAN },
550 { "But )))", USCRIPT_LATIN } });
551 }
552
553 // A char with multiple scripts that match both leading and trailing context
554 // gets the leading context.
555 TEST_F(ScriptRunIteratorTest, ExtensionsPreferLeadingContext)
556 {
557 CHECK_MOCK_RUNS({ { "h<lh>", USCRIPT_HAN },
558 { "l", USCRIPT_LATIN } });
559 }
560
561 // A char with multiple scripts that only match trailing context gets the
562 // trailing context.
563 TEST_F(ScriptRunIteratorTest, ExtensionsMatchTrailingContext)
564 {
565 CHECK_MOCK_RUNS({ { "h", USCRIPT_HAN },
566 { "<gl>l", USCRIPT_LATIN } });
567 }
568
569 // Retain first established priority script. <lhg><gh> produce the script <gh>
570 // with g as priority, because of the two priority scripts l and g, only g
571 // remains. Then <gh><hgl> retains g as priority, because of the two priority
572 // scripts g and h that remain, g was encountered first.
573 TEST_F(ScriptRunIteratorTest, ExtensionsRetainFirstPriorityScript)
574 {
575 CHECK_MOCK_RUNS({ { "<lhg><gh><hgl>", USCRIPT_GREEK } });
576 }
577
578 // Parens can have scripts that break script runs.
579 TEST_F(ScriptRunIteratorTest, ExtensionsParens)
580 {
581 CHECK_MOCK_RUNS({ { "<gl><(lg>", USCRIPT_GREEK },
582 { "h<[hl>", USCRIPT_HAN },
583 { "l", USCRIPT_LATIN },
584 { "<]hl>", USCRIPT_HAN },
585 { "<)lg>", USCRIPT_GREEK } });
586 }
587
588 // The close paren might be encountered before we've established the open
589 // paren's script, but when this is the case the current set is still valid, so
590 // this doesn't affect it nor break the run.
591 TEST_F(ScriptRunIteratorTest, ExtensionsParens2)
592 {
593 CHECK_MOCK_RUNS({ { "<(lhg><gh><)lhg>", USCRIPT_GREEK } });
594 }
595
596 // A common script with a single extension should be treated as common, but
597 // with the extended script as a default. If we encounter anything other than
598 // common, that takes priority. If we encounter other common scripts with a
599 // single extension, the current priority remains.
600 TEST_F(ScriptRunIteratorTest, CommonWithPriority)
601 {
602 CHECK_MOCK_RUNS({ { "<ch>", USCRIPT_HAN } });
603 }
604
605 TEST_F(ScriptRunIteratorTest, CommonWithPriority2)
606 {
607 CHECK_MOCK_RUNS({ { "<ch><lh>", USCRIPT_LATIN } });
608 }
609
610 TEST_F(ScriptRunIteratorTest, CommonWithPriority3)
611 {
612 CHECK_MOCK_RUNS({ { "<ch><cl><cg>", USCRIPT_HAN } });
613 }
614
615 // UDatta is inherited with LATIN and DEVANAGARI extensions. Since it has
616 // LATIN, and the dotted circle is COMMON and has adopted the preceding LATIN,
617 // it gets the LATIN. This is standard.
618 TEST_F(ScriptRunIteratorTest, LatinDottedCircleUdatta)
619 {
620 CHECK_RUNS({ { "Latin \u25cc\u0951", USCRIPT_LATIN } });
621 }
622
623 // In this situation, UDatta doesn't share a script with the value inherited by
624 // the dotted circle. It captures the preceding dotted circle and breaks it
625 // from the run it would normally have been in.
626 TEST_F(ScriptRunIteratorTest, HanDottedCircleUdatta)
627 {
628 CHECK_RUNS({ { "萬國碼 ", USCRIPT_HAN },
629 { "\u25cc\u0951", USCRIPT_DEVANAGARI } });
630 }
631
632 // Tatweel is \u0640 Lm, Fathatan is \u064b Mn. The script of tatweel is
633 // common, that of Fathatan is inherited. The script extensions for Fathatan
634 // are Arabic and Syriac. The Syriac script is 34 in ICU, Arabic is 2. So the
635 // preferred script for Fathatan is Arabic, according to Behdad's
636 // heuristic. This is exactly analogous to the Udatta tests above, except
637 // Tatweel is Lm. But we don't take properties into account, only scripts.
638 TEST_F(ScriptRunIteratorTest, LatinTatweelFathatan)
639 {
640 CHECK_RUNS({ { "Latin ", USCRIPT_LATIN },
641 { "\u0640\u064b", USCRIPT_ARABIC } });
642 }
643
644 // Another case where if the mark accepts a script that was inherited by the
645 // preceding common-script character, they both continue in that script.
646 TEST_F(ScriptRunIteratorTest, SyriacTatweelFathatan)
647 {
648 CHECK_RUNS({ { "\u0722\u0640\u064b", USCRIPT_SYRIAC } });
649 }
650
651 // The Udatta is inherited, so will share runs with anything that is not
652 // common.
653 TEST_F(ScriptRunIteratorTest, HanUdatta)
654 {
655 CHECK_RUNS({ { "萬國碼\u0951", USCRIPT_HAN } });
656 }
657
658 // The Udatta is inherited, and will capture the space and turn it into
659 // Devanagari.
660 TEST_F(ScriptRunIteratorTest, HanSpaceUdatta)
661 {
662 CHECK_RUNS({ { "萬國碼", USCRIPT_HAN },
663 { " \u0951", USCRIPT_DEVANAGARI } });
664 }
665
666 // Make sure Mock code works too.
667 TEST_F(ScriptRunIteratorTest, MockHanInheritedGL)
668 {
669 CHECK_MOCK_RUNS({ { "h<igl>", USCRIPT_HAN } });
670 }
671
672 TEST_F(ScriptRunIteratorTest, MockHanCommonInheritedGL)
673 {
674 CHECK_MOCK_RUNS({ { "h", USCRIPT_HAN },
675 { "c<igl>", USCRIPT_GREEK } });
676 }
677
678 // Leading inherited just act like common, except there's no preferred script.
679 TEST_F(ScriptRunIteratorTest, MockLeadingInherited)
680 {
681 CHECK_MOCK_RUNS({ { "<igl>", USCRIPT_COMMON } });
682 }
683
684 // Leading inherited just act like common, except there's no preferred script.
685 TEST_F(ScriptRunIteratorTest, MockLeadingInherited2)
686 {
687 CHECK_MOCK_RUNS({ { "<igl><ih>", USCRIPT_COMMON } });
688 }
689
690 TEST_F(ScriptRunIteratorTest, LeadingInheritedHan)
691 {
692 CHECK_RUNS({ { "\u0951萬國碼", USCRIPT_HAN } });
693 }
694
695 TEST_F(ScriptRunIteratorTest, LeadingInheritedHan2)
696 {
697 CHECK_RUNS({ { "\u0951\u064b萬國碼", USCRIPT_HAN } });
698 }
699
700 TEST_F(ScriptRunIteratorTest, OddLatinString)
701 {
702 CHECK_RUNS({ { "ç̈", USCRIPT_LATIN } });
703 }
704
705 class ScriptRunIteratorICUDataTest : public testing::Test {
706 public:
707 ScriptRunIteratorICUDataTest()
708 : max_extensions_(0)
709 , max_extensions_cp_(0xffff)
710 {
711 int max_extensions = 0;
712 UChar32 max_extensions_cp = 0;
713 for (UChar32 cp = 0; cp < 0x11000; ++cp) {
714 UErrorCode status = U_ZERO_ERROR;
715 int count = uscript_getScriptExtensions(cp, NULL, 0, &status);
716 if (count > max_extensions) {
717 max_extensions = count;
718 max_extensions_cp = cp;
719 }
720 if (count > ScriptData::kMaxScriptCount) {
721 }
722 }
723 max_extensions_ = max_extensions;
724 max_extensions_cp_ = max_extensions_cp;
725 }
726
727 protected:
728 UChar32 GetACharWithMaxExtensions(int* num_extensions)
729 {
730 if (num_extensions) {
731 *num_extensions = max_extensions_;
732 }
733 return max_extensions_cp_;
734 }
735
736 private:
737 int max_extensions_;
738 UChar32 max_extensions_cp_;
739 };
740
741 // Validate that ICU never returns more than our maximum expected number of
742 // script extensions.
743 TEST_F(ScriptRunIteratorICUDataTest, ValidateICUMaxScriptExtensions)
744 {
745 int max_extensions;
746 UChar32 cp = GetACharWithMaxExtensions(&max_extensions);
747 ASSERT_LE(max_extensions, ScriptData::kMaxScriptCount)
748 << "char " << std::hex << cp << std::dec;
749 }
750
751 // Check that ICUScriptData returns all of a character's scripts.
752 // This only checks one likely character, but doesn't check all cases.
753 TEST_F(ScriptRunIteratorICUDataTest, ICUDataGetScriptsReturnsAllExtensions)
754 {
755 int max_extensions;
756 UChar32 cp = GetACharWithMaxExtensions(&max_extensions);
757 Vector<UScriptCode> extensions;
758 ICUScriptData::instance()->getScripts(cp, extensions);
759
760 // It's possible that GetScripts adds the primary script to the list of
761 // extensions, resulting in one more script than the raw extension count.
762 ASSERT_GE(static_cast<int>(extensions.size()), max_extensions)
763 << "char " << std::hex << cp << std::dec;
764 }
765
766 TEST_F(ScriptRunIteratorICUDataTest, CommonHaveNoMoreThanOneExtension)
767 {
768 Vector<UScriptCode> extensions;
769 for (UChar32 cp = 0; cp < 0x110000; ++cp) {
770 ICUScriptData::instance()->getScripts(cp, extensions);
771 UScriptCode primary = extensions.at(0);
772 if (primary == USCRIPT_COMMON) {
773 ASSERT_LE(extensions.size(), 2ul)
774 << "cp: " << std::hex << cp << std::dec;
775 }
776 }
777 }
778
779 // ZWJ is \u200D Cf (Format, other) and its script is inherited. I'm going to
780 // ignore this for now, as I think it shouldn't matter which run it ends up
781 // in. HarfBuzz needs to be able to use it as context and shape each
782 // neighboring character appropriately no matter what run it got assigned to.
783
784 } // namespace blink
OLDNEW
« Source/platform/fonts/ScriptRunIterator.cpp ('K') | « Source/platform/fonts/ScriptRunIterator.cpp ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698