src/jsregexp.cc - Issue 11349: Character range uncanonicalization.

Side by Side Diff: src/jsregexp.cc

Issue 11349: Character range uncanonicalization. (Closed)

Patch Set: Created 12 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2006-2008 the V8 project authors. All rights reserved.	1 // Copyright 2006-2008 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without	2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are	3 // modification, are permitted provided that the following conditions are

4 // met:	4 // met:

5 //	5 //

6 // * Redistributions of source code must retain the above copyright	6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.	7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above	8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following	9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided	10 // disclaimer in the documentation and/or other materials provided

(...skipping 1856 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1867 // character.	1867 // character.

1868 case '*':	1868 case '*':

1869 ranges->Add(CharacterRange::Everything());	1869 ranges->Add(CharacterRange::Everything());

1870 break;	1870 break;

1871 default:	1871 default:

1872 UNREACHABLE();	1872 UNREACHABLE();

1873 }	1873 }

1874 }	1874 }

1875	1875

1876	1876

	1877 static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize;

	1878 static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange;

	1879

	1880

	1881 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges) {

	1882 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

	1883 if (IsSingleton()) {

	1884 // If this is a singleton we just expand the one character.

	1885 int length = uncanonicalize.get(from(), '\0', chars);

	1886 for (int i = 0; i < length; i++) {

	1887 uc32 chr = chars[i];

	1888 if (chr != from()) {

	1889 ranges->Add(CharacterRange::Singleton(chars[i]));

	1890 }

	1891 }

	1892 } else if (from() <= kRangeCanonicalizeMax

	1893 && to() <= kRangeCanonicalizeMax) {

	1894 // If this is a range we expand the characters block by block,

	1895 // expanding contiguous subranges (blocks) one at a time.

	1896 // The approach is as follows. For a given start character we

	1897 // look up the block that contains it, for instance 'a' if the

	1898 // start character is 'c'. A block is characterized by the property

	1899 // that all characters uncanonicalize in the same way as the first

	1900 // element, except that each entry in the result is incremented

	1901 // by the distance from the first element. So a-z is a block

	1902 // because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter

	1903 // uncanonicalizes to ['a' + k, 'A' + k].

	1904 // Once we've found the start point we look up its uncanonicalization

	1905 // and produce a range for each element. For instance for [c-f]

	1906 // we look up ['a', 'A'] and produce [c-f] and [C-F]. We then only

	1907 // add a range if it is not already contained in the input, so [c-f]

	1908 // will be skipped but [C-F] will be added. If this range is not

	1909 // completely contained in a block we do this for all the blocks

	1910 // covered by the range.

	1911 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];

	1912 // First, look up the block that contains the 'from' character.

	1913 int length = canonrange.get(from(), '\0', range);

	1914 if (length == 0) {

	1915 range[0] = from();

	1916 } else {

	1917 ASSERT_EQ(1, length);

	1918 }

	1919 int pos = from();

	1920 // The start of the current block. Note that except for the first

	1921 // iteration 'start' is always equal to 'pos'.

	1922 int start;

	1923 // If it is not the start point of a block the entry contains the

	1924 // offset of the character from the start point.

	1925 if ((range[0] & kStartMarker) == 0) {

	1926 start = pos - range[0];

	1927 } else {

	1928 start = pos;

	1929 }

	1930 // Then we add the ranges one at a time, incrementing the current

	1931 // position to be after the last block each time. The position

	1932 // always points to the start of a block.

	1933 while (pos < to()) {

	1934 length = canonrange.get(start, '\0', range);

	1935 if (length == 0) {

	1936 range[0] = start;

	1937 } else {

	1938 ASSERT_EQ(1, length);

	1939 }

	1940 ASSERT((range[0] & kStartMarker) != 0);

	1941 // The start point of a block contains the distance to the end

	1942 // of the block.

	1943 int block_end = start + (range[0] & kPayloadMask) - 1;

	1944 int end = (block_end > to()) ? to() : block_end;

	1945 length = uncanonicalize.get(start, '\0', range);

	1946 for (int i = 0; i < length; i++) {

	1947 uc32 c = range[i];

	1948 uc16 range_from = c + (pos - start);

	1949 uc16 range_to = c + (end - start);

	1950 if (!(from() <= range_from && range_to <= to()))

	1951 ranges->Add(CharacterRange(range_from, range_to));

	1952 }

	1953 start = pos = block_end + 1;

	1954 }

	1955 } else {

	1956 // TODO: handle ranges beyond kRangeCanonicalizeMax once the 2^11 bug in unibrow is fixed.

	1957 }

	1958 }

	1959

	1960

1877 // -------------------------------------------------------------------	1961 // -------------------------------------------------------------------

1878 // Interest propagation	1962 // Interest propagation

1879	1963

1880	1964

1881 RegExpNode* RegExpNode::GetSibling(NodeInfo* info) {	1965 RegExpNode* RegExpNode::GetSibling(NodeInfo* info) {

1882 for (int i = 0; i < siblings_.length(); i++) {	1966 for (int i = 0; i < siblings_.length(); i++) {

1883 RegExpNode* sibling = siblings_.Get(i);	1967 RegExpNode* sibling = siblings_.Get(i);

1884 if (sibling->info()->SameInterests(info))	1968 if (sibling->info()->SameInterests(info))

1885 return sibling;	1969 return sibling;

1886 }	1970 }

(...skipping 413 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2300 }	2384 }

2301	2385

2302 RegExpMacroAssembler::RegExpMacroAssembler() {	2386 RegExpMacroAssembler::RegExpMacroAssembler() {

2303 }	2387 }

2304	2388

2305 RegExpMacroAssembler::~RegExpMacroAssembler() {	2389 RegExpMacroAssembler::~RegExpMacroAssembler() {

2306 }	2390 }

2307	2391

2308	2392

2309 }} // namespace v8::internal	2393 }} // namespace v8::internal

OLD	NEW

« no previous file with comments | « src/jsregexp.h ('k') | src/unicode.h » ('j') | no next file with comments »