Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(157)

Side by Side Diff: Source/platform/fonts/ScriptRunIteratorTest.cpp

Issue 1323513006: Upstream ScriptRunIterator for segmenting text runs by script (Closed) Base URL: svn://svn.chromium.org/blink/trunk
Patch Set: Additional review comments addressed, new linkage attempt for kMaxScripts constant Created 5 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « Source/platform/fonts/ScriptRunIterator.cpp ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "config.h"
6 #include "platform/fonts/ScriptRunIterator.h"
7
8 #include "platform/Logging.h"
9 #include "wtf/Assertions.h"
10 #include "wtf/Threading.h"
11 #include "wtf/text/WTFString.h"
12
13 #include <gtest/gtest.h>
14 #include <string>
15 #include <vector>
16
17 namespace blink {
18
19 struct TestRun {
20 std::string text;
21 UScriptCode code;
22 };
23
24 struct ExpectedRun {
25 unsigned limit;
26 UScriptCode code;
27
28 ExpectedRun(unsigned the_limit, UScriptCode the_code)
29 : limit(the_limit)
30 , code(the_code)
31 {
32 }
33 };
34
35 class MockScriptData : public ScriptData {
36 public:
37 ~MockScriptData() override {}
38
39 static const MockScriptData* instance()
40 {
41 AtomicallyInitializedStaticReference(const MockScriptData, mockScriptDat a, (new MockScriptData()));
42
43 return &mockScriptData;
44 }
45
46 void getScripts(UChar32 ch, Vector<UScriptCode>& dst) const override
47 {
48 ASSERT(ch >= kMockCharMin);
49 ASSERT(ch < kMockCharLimit);
50
51 int code = ch - kMockCharMin;
52 dst.clear();
53 switch (code & kCodeSpecialMask) {
54 case kCodeSpecialCommon:
55 dst.append(USCRIPT_COMMON);
56 break;
57 case kCodeSpecialInherited:
58 dst.append(USCRIPT_INHERITED);
59 break;
60 default:
61 break;
62 }
63 int listBits = kTable[code & kCodeListIndexMask];
64 if (dst.isEmpty() && listBits == 0) {
65 dst.append(USCRIPT_UNKNOWN);
66 return;
67 }
68 while (listBits) {
69 switch (listBits & kListMask) {
70 case 0:
71 break;
72 case kLatin:
73 dst.append(USCRIPT_LATIN);
74 break;
75 case kHan:
76 dst.append(USCRIPT_HAN);
77 break;
78 case kGreek:
79 dst.append(USCRIPT_GREEK);
80 break;
81 }
82 listBits >>= kListShift;
83 }
84 }
85
86 UChar32 getPairedBracket(UChar32 ch) const override
87 {
88 switch (getPairedBracketType(ch)) {
89 case PairedBracketType::BracketTypeClose:
90 return ch - kBracketDelta;
91 case PairedBracketType::BracketTypeOpen:
92 return ch + kBracketDelta;
93 default:
94 return ch;
95 }
96 }
97
98 PairedBracketType getPairedBracketType(UChar32 ch) const override
99 {
100 ASSERT(ch >= kMockCharMin && ch < kMockCharLimit);
101 int code = ch - kMockCharMin;
102 if ((code & kCodeBracketBit) == 0) {
103 return PairedBracketType::BracketTypeNone;
104 }
105 if (code & kCodeBracketCloseBit) {
106 return PairedBracketType::BracketTypeClose;
107 }
108 return PairedBracketType::BracketTypeOpen;
109 }
110
111 static int TableLookup(int value)
112 {
113 for (int i = 0; i < 16; ++i) {
114 if (kTable[i] == value) {
115 return i;
116 }
117 }
118 WTF_LOG_ERROR("Table does not contain value 0x%x", value);
119 return 0;
120 }
121
122 static String ToTestString(const std::string& input)
123 {
124 String result(String::make16BitFrom8BitSource(0, 0));
125 bool inSet = false;
126 int seen = 0;
127 int code = 0;
128 int list = 0;
129 int currentShift = 0;
130 for (char c : input) {
131 if (inSet) {
132 switch (c) {
133 case '(':
134 ASSERT(seen == 0);
135 seen |= kSawBracket;
136 code |= kCodeBracketBit;
137 break;
138 case '[':
139 ASSERT(seen == 0);
140 seen |= kSawBracket;
141 code |= kCodeBracketBit | kCodeSquareBracketBit;
142 break;
143 case ')':
144 ASSERT(seen == 0);
145 seen |= kSawBracket;
146 code |= kCodeBracketBit | kCodeBracketCloseBit;
147 break;
148 case ']':
149 ASSERT(seen == 0);
150 seen |= kSawBracket;
151 code |= kCodeBracketBit | kCodeSquareBracketBit | kCodeBrack etCloseBit;
152 break;
153 case 'i':
154 ASSERT(seen == 0); // brackets can't be inherited
155 seen |= kSawSpecial;
156 code |= kCodeSpecialInherited;
157 break;
158 case 'c':
159 ASSERT((seen & ~kSawBracket) == 0);
160 seen |= kSawSpecial;
161 code |= kCodeSpecialCommon;
162 break;
163 case 'l':
164 ASSERT((seen & kSawLatin) == 0);
165 ASSERT(currentShift < 3);
166 seen |= kSawLatin;
167 list |= kLatin << (2 * currentShift++);
168 break;
169 case 'h':
170 ASSERT((seen & kSawHan) == 0);
171 ASSERT(currentShift < 3);
172 seen |= kSawHan;
173 list |= kHan << (2 * currentShift++);
174 break;
175 case 'g':
176 ASSERT((seen & kSawGreek) == 0);
177 ASSERT(currentShift < 3);
178 seen |= kSawGreek;
179 list |= kGreek << (2 * currentShift++);
180 break;
181 case '>':
182 ASSERT(seen != 0);
183 code |= TableLookup(list);
184 result.append(static_cast<UChar>(kMockCharMin + code));
185 inSet = false;
186 break;
187 default:
188 WTF_LOG_ERROR("Illegal mock string set char: '%c'", c);
189 break;
190 }
191 continue;
192 }
193 // not in set
194 switch (c) {
195 case '<':
196 seen = 0;
197 code = 0;
198 list = 0;
199 currentShift = 0;
200 inSet = true;
201 break;
202 case '(':
203 code = kCodeBracketBit | kCodeSpecialCommon;
204 break;
205 case '[':
206 code = kCodeBracketBit | kCodeSquareBracketBit | kCodeSpecialCom mon;
207 break;
208 case ')':
209 code = kCodeBracketBit | kCodeBracketCloseBit | kCodeSpecialComm on;
210 break;
211 case ']':
212 code = kCodeBracketBit | kCodeSquareBracketBit | kCodeBracketClo seBit | kCodeSpecialCommon;
213 break;
214 case 'i':
215 code = kCodeSpecialInherited;
216 break;
217 case 'c':
218 code = kCodeSpecialCommon;
219 break;
220 case 'l':
221 code = kLatin;
222 break;
223 case 'h':
224 code = kHan;
225 break;
226 case 'g':
227 code = kGreek;
228 break;
229 case '?':
230 code = 0; // unknown
231 break;
232 default:
233 WTF_LOG_ERROR("Illegal mock string set char: '%c'", c);
234 }
235 if (!inSet) {
236 result.append(static_cast<UChar>(kMockCharMin + code));
237 }
238 }
239 return result;
240 }
241
242 static std::string MockCharString(UChar mockch)
243 {
244 ASSERT(mockch >= kMockCharMin && mockch < kMockCharLimit);
245 int code = mockch - kMockCharMin;
246
247 // We use set notation in these cases:
248 // - more than one of special, kLatin, kHan, kGreek
249 // - bracket and not common (since non-set brackets are common)
250 bool isBracket = (code & kCodeBracketBit) != 0;
251 bool isSpecial = (mockch & kCodeSpecialMask) != 0;
252 bool isCommon = (mockch & kCodeSpecialMask) == kCodeSpecialCommon;
253 char c;
254 if (isBracket) {
255 if (code & kCodeSquareBracketBit) {
256 if (code & kCodeBracketCloseBit) {
257 c = ']';
258 } else {
259 c = '[';
260 }
261 } else {
262 if (code & kCodeBracketCloseBit) {
263 c = ')';
264 } else {
265 c = '(';
266 }
267 }
268 } else if (isSpecial) {
269 c = isCommon ? 'c' : 'i';
270 }
271 std::string result;
272 int listBits = kTable[code & kCodeListIndexMask];
273 while (listBits) {
274 switch (listBits & kListMask) {
275 case 0:
276 break;
277 case kLatin:
278 result += 'l';
279 break;
280 case kHan:
281 result += 'h';
282 break;
283 case kGreek:
284 result += 'g';
285 break;
286 }
287 listBits >>= kListShift;
288 }
289 bool needSet = result.length() + (isSpecial ? 1 : 0) > 1 || (isBracket & & (result.length() > 0 || !isCommon));
290 if (needSet) {
291 std::string setResult("<");
292 if (isBracket) {
293 setResult += c;
294 }
295 if (isSpecial) {
296 if (isCommon) {
297 setResult += "c";
298 } else {
299 setResult += "i";
300 }
301 }
302 setResult += result;
303 setResult += ">";
304 return setResult;
305 }
306 if (isBracket || isSpecial) {
307 result = c;
308 }
309 return result;
310 }
311
312 // We determine properties based on the offset from kMockCharMin:
313 // bits 0-3 represent the list of l, h, c scripts (index into table)
314 // bit 4-5 means: 0 plain, 1 common, 2 inherited, 3 illegal
315 // bit 6 clear means non-bracket, open means bracket
316 // bit 7 clear means open bracket, set means close bracket
317 // bit 8 clear means paren, set means bracket
318 // if it's a bracket, the matching bracket is 64 code points away
319 static const UChar32 kMockCharMin = 0xe000;
320 static const UChar32 kMockCharLimit = kMockCharMin + 0x200;
321 static const int kLatin = 1;
322 static const int kHan = 2;
323 static const int kGreek = 3;
324 static const int kCodeListIndexMask = 0xf;
325 static const int kCodeSpecialMask = 0x30;
326 static const int kCodeSpecialCommon = 0x10;
327 static const int kCodeSpecialInherited = 0x20;
328 static const int kCodeBracketCloseBit = 0x40;
329 static const int kCodeBracketBit = 0x80;
330 static const int kCodeSquareBracketBit = 0x100;
331 static const int kListShift = 2;
332 static const int kListMask = 0x3;
333 static const int kBracketDelta = kCodeBracketCloseBit;
334 static const int kTable[16];
335
336 static const int kSawBracket = 0x1;
337 static const int kSawSpecial = 0x2;
338 static const int kSawLatin = 0x4;
339 static const int kSawHan = 0x8;
340 static const int kSawGreek = 0x10;
341 };
342
343 static const int kLatin2 = MockScriptData::kLatin << 2;
344 static const int kHan2 = MockScriptData::kHan << 2;
345 static const int kGreek2 = MockScriptData::kGreek << 2;
346 static const int kLatin3 = MockScriptData::kLatin << 4;
347 static const int kHan3 = MockScriptData::kHan << 4;
348 static const int kGreek3 = MockScriptData::kGreek << 4;
349 const int MockScriptData::kTable[] = {
350 0, kLatin, kHan, kGreek,
351 kLatin2 + kHan, kLatin2 + kGreek,
352 kHan2 + kLatin, kHan2 + kGreek,
353 kGreek2 + kLatin, kGreek2 + kHan,
354 kLatin3 + kHan2 + kGreek, kLatin3 + kGreek2 + kHan,
355 kHan3 + kLatin2 + kGreek, kHan3 + kGreek2 + kLatin,
356 kGreek3 + kLatin2 + kHan, kGreek3 + kHan2 + kLatin,
357 };
358
359 class ScriptRunIteratorTest : public testing::Test {
360 protected:
361 void CheckRuns(const std::vector<TestRun>& runs)
362 {
363 String text(String::make16BitFrom8BitSource(0, 0));
364 std::vector<ExpectedRun> expect;
365 for (auto& run : runs) {
366 text.append(String::fromUTF8(run.text.c_str()));
367 expect.push_back(ExpectedRun(text.length(), run.code));
368 }
369 ScriptRunIterator scriptRunIterator(text.characters16(), text.length());
370 VerifyRuns(&scriptRunIterator, expect);
371 }
372
373 // FIXME crbug.com/527329 - CheckMockRuns should be replaced by finding
374 // suitable equivalent real codepoint sequences instead.
375 void CheckMockRuns(const std::vector<TestRun>& runs)
376 {
377 String text(String::make16BitFrom8BitSource(0, 0));
378 std::vector<ExpectedRun> expect;
379 for (const TestRun& run : runs) {
380 text.append(MockScriptData::ToTestString(run.text));
381 expect.push_back({ text.length(), run.code });
382 }
383
384 ScriptRunIterator scriptRunIterator(text.characters16(), text.length(),
385 MockScriptData::instance());
386 VerifyRuns(&scriptRunIterator, expect);
387 }
388
389 void VerifyRuns(ScriptRunIterator* scriptRunIterator,
390 const std::vector<ExpectedRun>& expect)
391 {
392 unsigned limit;
393 UScriptCode code;
394 unsigned long runCount = 0;
395 while (scriptRunIterator->consume(limit, code)) {
396 ASSERT_LT(runCount, expect.size());
397 ASSERT_EQ(expect[runCount].limit, limit);
398 ASSERT_EQ(expect[runCount].code, code);
399 ++runCount;
400 }
401 WTF_LOG_ERROR("Expected %zu runs, got %lu ", expect.size(), runCount);
402 ASSERT_EQ(expect.size(), runCount);
403 }
404 };
405
406 TEST_F(ScriptRunIteratorTest, Empty)
407 {
408 String empty(String::make16BitFrom8BitSource(0, 0));
409 ScriptRunIterator scriptRunIterator(empty.characters16(), empty.length());
410 unsigned limit = 0;
411 UScriptCode code = USCRIPT_INVALID_CODE;
412 ASSERT(!scriptRunIterator.consume(limit, code));
413 ASSERT_EQ(limit, 0u);
414 ASSERT_EQ(code, USCRIPT_INVALID_CODE);
415 }
416
// Some of our compilers cannot initialize a vector from an array yet, so the
// brace list first becomes a static array and the vector |runs| is then built
// from that array's begin/end pointers.
#define DECLARE_RUNSVECTOR(...)                     \
    static const TestRun runsArray[] = __VA_ARGS__; \
    std::vector<TestRun> runs(runsArray, runsArray + sizeof(runsArray) / sizeof(runsArray[0]));

// Checks a brace list of { text, script } runs written as real UTF-8 text.
#define CHECK_RUNS(...)              \
    DECLARE_RUNSVECTOR(__VA_ARGS__); \
    CheckRuns(runs);

// Checks a brace list of { text, script } runs written in mock notation.
#define CHECK_MOCK_RUNS(...)         \
    DECLARE_RUNSVECTOR(__VA_ARGS__); \
    CheckMockRuns(runs);
429
430 TEST_F(ScriptRunIteratorTest, Whitespace)
431 {
432 CHECK_RUNS({ { " \t ", USCRIPT_COMMON } });
433 }
434
435 TEST_F(ScriptRunIteratorTest, Common)
436 {
437 CHECK_RUNS({ { " ... !?", USCRIPT_COMMON } });
438 }
439
440 TEST_F(ScriptRunIteratorTest, Latin)
441 {
442 CHECK_RUNS({ { "latin", USCRIPT_LATIN } });
443 }
444
445 TEST_F(ScriptRunIteratorTest, Chinese)
446 {
447 CHECK_RUNS({ { "萬國碼", USCRIPT_HAN } });
448 }
449
450 // Close bracket without matching open is ignored
451 TEST_F(ScriptRunIteratorTest, UnbalancedParens1)
452 {
453 CHECK_RUNS({ { "(萬", USCRIPT_HAN },
454 { "a]", USCRIPT_LATIN },
455 { ")", USCRIPT_HAN } });
456 }
457
458 // Open bracket without matching close is popped when inside
459 // matching close brackets, so doesn't match later close.
460 TEST_F(ScriptRunIteratorTest, UnbalancedParens2)
461 {
462 CHECK_RUNS({ { "(萬", USCRIPT_HAN },
463 { "a[", USCRIPT_LATIN },
464 { ")]", USCRIPT_HAN } });
465 }
466
467 // space goes with leading script
468 TEST_F(ScriptRunIteratorTest, LatinHan)
469 {
470 CHECK_RUNS({ { "Unicode ", USCRIPT_LATIN },
471 { "萬國碼", USCRIPT_HAN } });
472 }
473
474 // space goes with leading script
475 TEST_F(ScriptRunIteratorTest, HanLatin)
476 {
477 CHECK_RUNS({ { "萬國碼 ", USCRIPT_HAN },
478 { "Unicode", USCRIPT_LATIN } });
479 }
480
481 TEST_F(ScriptRunIteratorTest, ParenEmptyParen)
482 {
483 CHECK_RUNS({ { "()", USCRIPT_COMMON } });
484 }
485
486 TEST_F(ScriptRunIteratorTest, ParenChineseParen)
487 {
488 CHECK_RUNS({ { "(萬國碼)", USCRIPT_HAN } });
489 }
490
491 TEST_F(ScriptRunIteratorTest, ParenLatinParen)
492 {
493 CHECK_RUNS({ { "(Unicode)", USCRIPT_LATIN } });
494 }
495
496 // open paren gets leading script
497 TEST_F(ScriptRunIteratorTest, LatinParenChineseParen)
498 {
499 CHECK_RUNS({ { "Unicode (", USCRIPT_LATIN },
500 { "萬國碼", USCRIPT_HAN },
501 { ")", USCRIPT_LATIN } });
502 }
503
504 // open paren gets first trailing script if no leading script
505 TEST_F(ScriptRunIteratorTest, ParenChineseParenLatin)
506 {
507 CHECK_RUNS({ { "(萬國碼) ", USCRIPT_HAN },
508 { "Unicode", USCRIPT_LATIN } });
509 }
510
511 // leading common and open paren get first trailing script.
512 // TODO(dougfelt): we don't do quote matching, but probably should figure out
513 // something better then doing nothing.
514 TEST_F(ScriptRunIteratorTest, QuoteParenChineseParenLatinQuote)
515 {
516 CHECK_RUNS({ { "\"(萬國碼) ", USCRIPT_HAN },
517 { "Unicode\"", USCRIPT_LATIN } });
518 }
519
520 // Unmatched close brace gets leading context
521 TEST_F(ScriptRunIteratorTest, UnmatchedClose)
522 {
523 CHECK_RUNS({ { "Unicode (", USCRIPT_LATIN },
524 { "萬國碼] ", USCRIPT_HAN },
525 { ") Unicode\"", USCRIPT_LATIN } });
526 }
527
528 // Match up to 32 bracket pairs
529 TEST_F(ScriptRunIteratorTest, Match32Brackets)
530 {
531 CHECK_RUNS({ { "[萬國碼 ", USCRIPT_HAN },
532 { "Unicode (((((((((((((((((((((((((((((((!"
533 ")))))))))))))))))))))))))))))))",
534 USCRIPT_LATIN },
535 { "]", USCRIPT_HAN } });
536 }
537
538 // Matches 32 most recent bracket pairs. More than that, and we revert to
539 // surrounding script.
540 TEST_F(ScriptRunIteratorTest, Match32MostRecentBrackets)
541 {
542 CHECK_RUNS({ { "((([萬國碼 ", USCRIPT_HAN },
543 { "Unicode (((((((((((((((((((((((((((((((", USCRIPT_LATIN },
544 { "萬國碼!", USCRIPT_HAN },
545 { ")))))))))))))))))))))))))))))))", USCRIPT_LATIN },
546 { "]", USCRIPT_HAN },
547 { "But )))", USCRIPT_LATIN } });
548 }
549
550 // A char with multiple scripts that match both leading and trailing context
551 // gets the leading context.
552 TEST_F(ScriptRunIteratorTest, ExtensionsPreferLeadingContext)
553 {
554 CHECK_MOCK_RUNS({ { "h<lh>", USCRIPT_HAN },
555 { "l", USCRIPT_LATIN } });
556 }
557
558 // A char with multiple scripts that only match trailing context gets the
559 // trailing context.
560 TEST_F(ScriptRunIteratorTest, ExtensionsMatchTrailingContext)
561 {
562 CHECK_MOCK_RUNS({ { "h", USCRIPT_HAN },
563 { "<gl>l", USCRIPT_LATIN } });
564 }
565
566 // Retain first established priority script. <lhg><gh> produce the script <gh>
567 // with g as priority, because of the two priority scripts l and g, only g
568 // remains. Then <gh><hgl> retains g as priority, because of the two priority
569 // scripts g and h that remain, g was encountered first.
570 TEST_F(ScriptRunIteratorTest, ExtensionsRetainFirstPriorityScript)
571 {
572 CHECK_MOCK_RUNS({ { "<lhg><gh><hgl>", USCRIPT_GREEK } });
573 }
574
575 // Parens can have scripts that break script runs.
576 TEST_F(ScriptRunIteratorTest, ExtensionsParens)
577 {
578 CHECK_MOCK_RUNS({ { "<gl><(lg>", USCRIPT_GREEK },
579 { "h<[hl>", USCRIPT_HAN },
580 { "l", USCRIPT_LATIN },
581 { "<]hl>", USCRIPT_HAN },
582 { "<)lg>", USCRIPT_GREEK } });
583 }
584
585 // The close paren might be encountered before we've established the open
586 // paren's script, but when this is the case the current set is still valid, so
587 // this doesn't affect it nor break the run.
588 TEST_F(ScriptRunIteratorTest, ExtensionsParens2)
589 {
590 CHECK_MOCK_RUNS({ { "<(lhg><gh><)lhg>", USCRIPT_GREEK } });
591 }
592
593 // A common script with a single extension should be treated as common, but
594 // with the extended script as a default. If we encounter anything other than
595 // common, that takes priority. If we encounter other common scripts with a
596 // single extension, the current priority remains.
597 TEST_F(ScriptRunIteratorTest, CommonWithPriority)
598 {
599 CHECK_MOCK_RUNS({ { "<ch>", USCRIPT_HAN } });
600 }
601
602 TEST_F(ScriptRunIteratorTest, CommonWithPriority2)
603 {
604 CHECK_MOCK_RUNS({ { "<ch><lh>", USCRIPT_LATIN } });
605 }
606
607 TEST_F(ScriptRunIteratorTest, CommonWithPriority3)
608 {
609 CHECK_MOCK_RUNS({ { "<ch><cl><cg>", USCRIPT_HAN } });
610 }
611
612 // UDatta (\xE0\xA5\x91) is inherited with LATIN and DEVANAGARI extensions.
613 // Since it has LATIN, and the dotted circle (\xE2\x97\x8C) is COMMON and has
614 // adopted the preceding LATIN, it gets the LATIN. This is standard.
615 TEST_F(ScriptRunIteratorTest, LatinDottedCircleUdatta)
616 {
617 CHECK_RUNS({ { "Latin \xE2\x97\x8C\xE0\xA5\x91", USCRIPT_LATIN } });
618 }
619
620 // In this situation, UDatta (\xE0\xA5\x91) doesn't share a script with the
621 // value inherited by the dotted circle (\xE2\x97\x8C). It captures the
622 // preceding dotted circle and breaks it from the run it would normally have
623 // been in.
624 TEST_F(ScriptRunIteratorTest, HanDottedCircleUdatta)
625 {
626 CHECK_RUNS({ { "萬國碼 ", USCRIPT_HAN },
627 { "\xE2\x97\x8C\xE0\xA5\x91", USCRIPT_DEVANAGARI } });
628 }
629
630 // Tatweel is \xD9\x80 Lm, Fathatan is \xD9\x8B Mn. The script of tatweel is
631 // common, that of Fathatan is inherited. The script extensions for Fathatan
632 // are Arabic and Syriac. The Syriac script is 34 in ICU, Arabic is 2. So the
633 // preferred script for Fathatan is Arabic, according to Behdad's
634 // heuristic. This is exactly analogous to the Udatta tests above, except
635 // Tatweel is Lm. But we don't take properties into account, only scripts.
636 TEST_F(ScriptRunIteratorTest, LatinTatweelFathatan)
637 {
638 CHECK_RUNS({ { "Latin ", USCRIPT_LATIN },
639 { "\xD9\x80\xD9\x8B", USCRIPT_ARABIC } });
640 }
641
642 // Another case where if the mark accepts a script that was inherited by the
643 // preceding common-script character, they both continue in that script.
644 // SYRIAC LETTER NUN \xDC\xA2
645 // ARABIC TATWEEL \xD9\x80
646 // ARABIC FATHATAN \xD9\x82
647 TEST_F(ScriptRunIteratorTest, SyriacTatweelFathatan)
648 {
649 CHECK_RUNS({ { "\xDC\xA2\xD9\x80\xD9\x8B", USCRIPT_SYRIAC } });
650 }
651
652 // The Udatta (\xE0\xA5\x91) is inherited, so will share runs with anything that
653 // is not common.
654 TEST_F(ScriptRunIteratorTest, HanUdatta)
655 {
656 CHECK_RUNS({ { "萬國碼\xE0\xA5\x91", USCRIPT_HAN } });
657 }
658
659 // The Udatta (\xE0\xA5\x91) is inherited, and will capture the space and turn
660 // it into Devanagari.
661 TEST_F(ScriptRunIteratorTest, HanSpaceUdatta)
662 {
663 CHECK_RUNS({ { "萬國碼", USCRIPT_HAN },
664 { " \xE0\xA5\x91", USCRIPT_DEVANAGARI } });
665 }
666
667 // Make sure Mock code works too.
668 TEST_F(ScriptRunIteratorTest, MockHanInheritedGL)
669 {
670 CHECK_MOCK_RUNS({ { "h<igl>", USCRIPT_HAN } });
671 }
672
673 TEST_F(ScriptRunIteratorTest, MockHanCommonInheritedGL)
674 {
675 CHECK_MOCK_RUNS({ { "h", USCRIPT_HAN },
676 { "c<igl>", USCRIPT_GREEK } });
677 }
678
679 // Leading inherited just act like common, except there's no preferred script.
680 TEST_F(ScriptRunIteratorTest, MockLeadingInherited)
681 {
682 CHECK_MOCK_RUNS({ { "<igl>", USCRIPT_COMMON } });
683 }
684
685 // Leading inherited just act like common, except there's no preferred script.
686 TEST_F(ScriptRunIteratorTest, MockLeadingInherited2)
687 {
688 CHECK_MOCK_RUNS({ { "<igl><ih>", USCRIPT_COMMON } });
689 }
690
691 TEST_F(ScriptRunIteratorTest, LeadingInheritedHan)
692 {
693 // DEVANAGARI STRESS SIGN UDATTA \xE0\xA5\x91
694 CHECK_RUNS({ { "\xE0\xA5\x91萬國碼", USCRIPT_HAN } });
695 }
696
697 TEST_F(ScriptRunIteratorTest, LeadingInheritedHan2)
698 {
699 // DEVANAGARI STRESS SIGN UDATTA \xE0\xA5\x91
700 // ARABIC FATHATAN \xD9\x8B
701 CHECK_RUNS({ { "\xE0\xA5\x91\xD9\x8B萬國碼", USCRIPT_HAN } });
702 }
703
704 TEST_F(ScriptRunIteratorTest, OddLatinString)
705 {
706 CHECK_RUNS({ { "ç̈", USCRIPT_LATIN } });
707 }
708
709 class ScriptRunIteratorICUDataTest : public testing::Test {
710 public:
711 ScriptRunIteratorICUDataTest()
712 : m_maxExtensions(0)
713 , m_maxExtensionsCodepoint(0xffff)
714 {
715 int maxExtensions = 0;
716 UChar32 m_maxExtensionscp = 0;
717 for (UChar32 cp = 0; cp < 0x11000; ++cp) {
718 UErrorCode status = U_ZERO_ERROR;
719 int count = uscript_getScriptExtensions(cp, 0, 0, &status);
720 if (count > maxExtensions) {
721 maxExtensions = count;
722 m_maxExtensionscp = cp;
723 }
724 }
725 m_maxExtensions = maxExtensions;
726 m_maxExtensionsCodepoint = m_maxExtensionscp;
727 }
728
729 protected:
730 UChar32 GetACharWithMaxExtensions(int* numExtensions)
731 {
732 if (numExtensions) {
733 *numExtensions = m_maxExtensions;
734 }
735 return m_maxExtensionsCodepoint;
736 }
737
738 private:
739 int m_maxExtensions;
740 UChar32 m_maxExtensionsCodepoint;
741 };
742
743 // Validate that ICU never returns more than our maximum expected number of
744 // script extensions.
745 TEST_F(ScriptRunIteratorICUDataTest, ValidateICUMaxScriptExtensions)
746 {
747 int maxExtensions;
748 UChar32 cp = GetACharWithMaxExtensions(&maxExtensions);
749 ASSERT_LE(maxExtensions, ScriptData::kMaxScriptCount)
750 << "char " << std::hex << cp << std::dec;
751 }
752
753 // Check that ICUScriptData returns all of a character's scripts.
754 // This only checks one likely character, but doesn't check all cases.
755 TEST_F(ScriptRunIteratorICUDataTest, ICUDataGetScriptsReturnsAllExtensions)
756 {
757 int maxExtensions;
758 UChar32 cp = GetACharWithMaxExtensions(&maxExtensions);
759 Vector<UScriptCode> extensions;
760 ICUScriptData::instance()->getScripts(cp, extensions);
761
762 // It's possible that GetScripts adds the primary script to the list of
763 // extensions, resulting in one more script than the raw extension count.
764 ASSERT_GE(static_cast<int>(extensions.size()), maxExtensions)
765 << "char " << std::hex << cp << std::dec;
766 }
767
768 TEST_F(ScriptRunIteratorICUDataTest, CommonHaveNoMoreThanOneExtension)
769 {
770 Vector<UScriptCode> extensions;
771 for (UChar32 cp = 0; cp < 0x110000; ++cp) {
772 ICUScriptData::instance()->getScripts(cp, extensions);
773 UScriptCode primary = extensions.at(0);
774 if (primary == USCRIPT_COMMON) {
775 ASSERT_LE(extensions.size(), 2ul)
776 << "cp: " << std::hex << cp << std::dec;
777 }
778 }
779 }
780
781 // ZWJ is \u200D Cf (Format, other) and its script is inherited. I'm going to
782 // ignore this for now, as I think it shouldn't matter which run it ends up
783 // in. HarfBuzz needs to be able to use it as context and shape each
784 // neighboring character appropriately no matter what run it got assigned to.
785
786 } // namespace blink
OLDNEW
« no previous file with comments | « Source/platform/fonts/ScriptRunIterator.cpp ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698