OLD | NEW |
1 // Copyright 2008 the V8 project authors. All rights reserved. | 1 // Copyright 2008 the V8 project authors. All rights reserved. |
2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without |
3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are |
4 // met: | 4 // met: |
5 // | 5 // |
6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright |
7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. |
8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above |
9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following |
10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided |
(...skipping 1381 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1392 CHECK_EQ(canonicalize(lower), canonicalize(upper)); | 1392 CHECK_EQ(canonicalize(lower), canonicalize(upper)); |
1393 unibrow::uchar uncanon[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 1393 unibrow::uchar uncanon[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
1394 int length = un_canonicalize.get(lower, '\0', uncanon); | 1394 int length = un_canonicalize.get(lower, '\0', uncanon); |
1395 CHECK_EQ(2, length); | 1395 CHECK_EQ(2, length); |
1396 CHECK_EQ(upper, uncanon[0]); | 1396 CHECK_EQ(upper, uncanon[0]); |
1397 CHECK_EQ(lower, uncanon[1]); | 1397 CHECK_EQ(lower, uncanon[1]); |
1398 } | 1398 } |
1399 for (uc32 c = 128; c < (1 << 21); c++) | 1399 for (uc32 c = 128; c < (1 << 21); c++) |
1400 CHECK_GE(canonicalize(c), 128); | 1400 CHECK_GE(canonicalize(c), 128); |
1401 unibrow::Mapping<unibrow::ToUppercase> to_upper; | 1401 unibrow::Mapping<unibrow::ToUppercase> to_upper; |
1402 for (uc32 c = 0; c < (1 << 21); c++) { | 1402 // Canonicalization is only defined for the Basic Multilingual Plane. |
| 1403 for (uc32 c = 0; c < (1 << 16); c++) { |
1403 unibrow::uchar upper[unibrow::ToUppercase::kMaxWidth]; | 1404 unibrow::uchar upper[unibrow::ToUppercase::kMaxWidth]; |
1404 int length = to_upper.get(c, '\0', upper); | 1405 int length = to_upper.get(c, '\0', upper); |
1405 if (length == 0) { | 1406 if (length == 0) { |
1406 length = 1; | 1407 length = 1; |
1407 upper[0] = c; | 1408 upper[0] = c; |
1408 } | 1409 } |
1409 uc32 u = upper[0]; | 1410 uc32 u = upper[0]; |
1410 if (length > 1 || (c >= 128 && u < 128)) | 1411 if (length > 1 || (c >= 128 && u < 128)) |
1411 u = c; | 1412 u = c; |
1412 CHECK_EQ(u, canonicalize(c)); | 1413 CHECK_EQ(u, canonicalize(c)); |
1413 } | 1414 } |
1414 } | 1415 } |
1415 | 1416 |
1416 | 1417 |
1417 static uc32 CanonRange(uc32 c) { | 1418 static uc32 CanonRangeEnd(uc32 c) { |
1418 unibrow::uchar canon[unibrow::CanonicalizationRange::kMaxWidth]; | 1419 unibrow::uchar canon[unibrow::CanonicalizationRange::kMaxWidth]; |
1419 int count = unibrow::CanonicalizationRange::Convert(c, '\0', canon, NULL); | 1420 int count = unibrow::CanonicalizationRange::Convert(c, '\0', canon, NULL); |
1420 if (count == 0) { | 1421 if (count == 0) { |
1421 return c; | 1422 return c; |
1422 } else { | 1423 } else { |
1423 CHECK_EQ(1, count); | 1424 CHECK_EQ(1, count); |
1424 return canon[0]; | 1425 return canon[0]; |
1425 } | 1426 } |
1426 } | 1427 } |
1427 | 1428 |
1428 | 1429 |
1429 TEST(RangeCanonicalization) { | 1430 TEST(RangeCanonicalization) { |
1430 CHECK_NE(CanonRange(0) & CharacterRange::kStartMarker, 0); | |
1431 // Check that we arrive at the same result when using the basic | 1431 // Check that we arrive at the same result when using the basic |
1432 // range canonicalization primitives as when using immediate | 1432 // range canonicalization primitives as when using immediate |
1433 // canonicalization. | 1433 // canonicalization. |
1434 unibrow::Mapping<unibrow::Ecma262UnCanonicalize> un_canonicalize; | 1434 unibrow::Mapping<unibrow::Ecma262UnCanonicalize> un_canonicalize; |
1435 for (int i = 0; i < CharacterRange::kRangeCanonicalizeMax; i++) { | 1435 int block_start = 0; |
1436 int range = CanonRange(i); | 1436 while (block_start <= 0xFFFF) { |
1437 int indirect_length = 0; | 1437 uc32 block_end = CanonRangeEnd(block_start); |
1438 unibrow::uchar indirect[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 1438 unsigned block_length = block_end - block_start + 1; |
1439 if ((range & CharacterRange::kStartMarker) == 0) { | 1439 if (block_length > 1) { |
1440 indirect_length = un_canonicalize.get(i - range, '\0', indirect); | 1440 unibrow::uchar first[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
1441 for (int i = 0; i < indirect_length; i++) | 1441 int first_length = un_canonicalize.get(block_start, '\0', first); |
1442 indirect[i] += range; | 1442 for (unsigned i = 1; i < block_length; i++) { |
1443 } else { | 1443 unibrow::uchar succ[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
1444 indirect_length = un_canonicalize.get(i, '\0', indirect); | 1444 int succ_length = un_canonicalize.get(block_start + i, '\0', succ); |
1445 } | 1445 CHECK_EQ(first_length, succ_length); |
1446 unibrow::uchar direct[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 1446 for (int j = 0; j < succ_length; j++) { |
1447 int direct_length = un_canonicalize.get(i, '\0', direct); | 1447 int calc = first[j] + i; |
1448 CHECK_EQ(direct_length, indirect_length); | 1448 int found = succ[j]; |
1449 } | 1449 CHECK_EQ(calc, found); |
1450 // Check that we arrive at the same results when skipping over | 1450 } |
1451 // canonicalization ranges. | |
1452 int next_block = 0; | |
1453 while (next_block < CharacterRange::kRangeCanonicalizeMax) { | |
1454 uc32 start = CanonRange(next_block); | |
1455 CHECK_NE((start & CharacterRange::kStartMarker), 0); | |
1456 unsigned dist = start & CharacterRange::kPayloadMask; | |
1457 unibrow::uchar first[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | |
1458 int first_length = un_canonicalize.get(next_block, '\0', first); | |
1459 for (unsigned i = 1; i < dist; i++) { | |
1460 CHECK_EQ(i, CanonRange(next_block + i)); | |
1461 unibrow::uchar succ[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | |
1462 int succ_length = un_canonicalize.get(next_block + i, '\0', succ); | |
1463 CHECK_EQ(first_length, succ_length); | |
1464 for (int j = 0; j < succ_length; j++) { | |
1465 int calc = first[j] + i; | |
1466 int found = succ[j]; | |
1467 CHECK_EQ(calc, found); | |
1468 } | 1451 } |
1469 } | 1452 } |
1470 next_block = next_block + dist; | 1453 block_start = block_start + block_length; |
1471 } | 1454 } |
1472 } | 1455 } |
1473 | 1456 |
1474 | 1457 |
1475 TEST(UncanonicalizeEquivalence) { | 1458 TEST(UncanonicalizeEquivalence) { |
1476 unibrow::Mapping<unibrow::Ecma262UnCanonicalize> un_canonicalize; | 1459 unibrow::Mapping<unibrow::Ecma262UnCanonicalize> un_canonicalize; |
1477 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 1460 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
1478 for (int i = 0; i < (1 << 16); i++) { | 1461 for (int i = 0; i < (1 << 16); i++) { |
1479 int length = un_canonicalize.get(i, '\0', chars); | 1462 int length = un_canonicalize.get(i, '\0', chars); |
1480 for (int j = 0; j < length; j++) { | 1463 for (int j = 0; j < length; j++) { |
(...skipping 311 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1792 bool in_second = CharacterInSet(&l2, i); | 1775 bool in_second = CharacterInSet(&l2, i); |
1793 CHECK((in_first || in_second) == CharacterInSet(&all, i)); | 1776 CHECK((in_first || in_second) == CharacterInSet(&all, i)); |
1794 } | 1777 } |
1795 } | 1778 } |
1796 | 1779 |
1797 | 1780 |
1798 TEST(Graph) { | 1781 TEST(Graph) { |
1799 V8::Initialize(NULL); | 1782 V8::Initialize(NULL); |
1800 Execute("\\b\\w+\\b", false, true, true); | 1783 Execute("\\b\\w+\\b", false, true, true); |
1801 } | 1784 } |
OLD | NEW |