Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1088)

Side by Side Diff: src/jsregexp.cc

Issue 11349: Character range uncanonicalization. (Closed)
Patch Set: Created 12 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/jsregexp.h ('k') | src/unicode.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2006-2008 the V8 project authors. All rights reserved. 1 // Copyright 2006-2008 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 1856 matching lines...) Expand 10 before | Expand all | Expand 10 after
1867 // character. 1867 // character.
1868 case '*': 1868 case '*':
1869 ranges->Add(CharacterRange::Everything()); 1869 ranges->Add(CharacterRange::Everything());
1870 break; 1870 break;
1871 default: 1871 default:
1872 UNREACHABLE(); 1872 UNREACHABLE();
1873 } 1873 }
1874 } 1874 }
1875 1875
1876 1876
// File-local unibrow lookup tables used by CharacterRange::AddCaseEquivalents:
// 'uncanonicalize' is queried for the case variants of a character, and
// 'canonrange' is queried for the block information of a character.
1877 static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize;
1878 static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange;
1879
1880
// Appends to 'ranges' the case-equivalent characters of this range that are
// not already covered by the range itself.
//  - Singleton: every uncanonicalized variant other than from() is added as
//    its own singleton.
//  - Range with both ends <= kRangeCanonicalizeMax: the range is walked block
//    by block and a shifted copy is added for every variant of the block
//    that is not fully contained in [from(), to()].
//  - Any other range: nothing is added (see the TODO at the end).
// The receiver itself is not modified; only 'ranges' is appended to.
1881 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges) {
1882 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1883 if (IsSingleton()) {
1884 // If this is a singleton we just expand the one character.
1885 int length = uncanonicalize.get(from(), '\0', chars);
1886 for (int i = 0; i < length; i++) {
1887 uc32 chr = chars[i];
1888 if (chr != from()) {
1889 ranges->Add(CharacterRange::Singleton(chars[i]));
1890 }
1891 }
1892 } else if (from() <= kRangeCanonicalizeMax
1893 && to() <= kRangeCanonicalizeMax) {
1894 // If this is a range we expand the characters block by block,
1895 // expanding contiguous subranges (blocks) one at a time.
1896 // The approach is as follows. For a given start character we
1897 // look up the block that contains it, for instance 'a' if the
1898 // start character is 'c'. A block is characterized by the property
1899 // that all characters uncanonicalize in the same way as the first
1900 // element, except that each entry in the result is incremented
1901 // by the distance from the first element. So a-z is a block
1902 // because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter
1903 // uncanonicalizes to ['a' + k, 'A' + k].
1904 // Once we've found the start point we look up its uncanonicalization
1905 // and produce a range for each element. For instance for [c-f]
1906 // we look up ['a', 'A'] and produce [c-f] and [C-F]. We then only
1907 // add a range if it is not already contained in the input, so [c-f]
1908 // will be skipped but [C-F] will be added. If this range is not
1909 // completely contained in a block we do this for all the blocks
1910 // covered by the range.
// NOTE(review): 'range' is sized by Ecma262UnCanonicalize::kMaxWidth but is
// also used as the output buffer for canonrange lookups, which are asserted
// below to yield at most one entry.
1911 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1912 // First, look up the block that contains the 'from' character.
1913 int length = canonrange.get(from(), '\0', range);
1914 if (length == 0) {
// No table entry: fall back to the character itself.
// NOTE(review): the value stored here is a raw code point, yet it is tested
// against kStartMarker just below -- confirm this fallback is intended.
1915 range[0] = from();
1916 } else {
1917 ASSERT_EQ(1, length);
1918 }
1919 int pos = from();
1920 // The start of the current block. Note that except for the first
1921 // iteration 'start' is always equal to 'pos'.
1922 int start;
1923 // If it is not the start point of a block the entry contains the
1924 // offset of the character from the start point.
1925 if ((range[0] & kStartMarker) == 0) {
1926 start = pos - range[0];
1927 } else {
1928 start = pos;
1929 }
1930 // Then we add the ranges one at a time, incrementing the current
1931 // position to be after the last block each time. The position
1932 // always points to the start of a block.
// NOTE(review): the loop condition is strict, so when a block ends exactly
// at to() - 1 the loop exits with pos == to() and the final character is
// never expanded -- confirm whether 'pos <= to()' was intended.
1933 while (pos < to()) {
1934 length = canonrange.get(start, '\0', range);
1935 if (length == 0) {
1936 range[0] = start;
1937 } else {
1938 ASSERT_EQ(1, length);
1939 }
1940 ASSERT((range[0] & kStartMarker) != 0);
1941 // The start point of a block contains the distance to the end
1942 // of the range.
1943 int block_end = start + (range[0] & kPayloadMask) - 1;
// Clamp the block to the end of this character range.
1944 int end = (block_end > to()) ? to() : block_end;
// From here on 'range' holds the case variants of the block's first
// character rather than block information.
1945 length = uncanonicalize.get(start, '\0', range);
1946 for (int i = 0; i < length; i++) {
1947 uc32 c = range[i];
// Shift the variant by the same offsets that [pos, end] has from 'start'.
1948 uc16 range_from = c + (pos - start);
1949 uc16 range_to = c + (end - start);
// Skip variants that lie entirely inside the original range.
1950 if (!(from() <= range_from && range_to <= to()))
1951 ranges->Add(CharacterRange(range_from, range_to));
1952 }
1953 start = pos = block_end + 1;
1954 }
1955 } else {
1956 // TODO: handle ranges that extend beyond kRangeCanonicalizeMax once
// the 2^11 bug in unibrow has been fixed. Until then such ranges get
// no case equivalents added.
1957 }
1958 }
1959
1960
1877 // ------------------------------------------------------------------- 1961 // -------------------------------------------------------------------
1878 // Interest propagation 1962 // Interest propagation
1879 1963
1880 1964
1881 RegExpNode* RegExpNode::GetSibling(NodeInfo* info) { 1965 RegExpNode* RegExpNode::GetSibling(NodeInfo* info) {
1882 for (int i = 0; i < siblings_.length(); i++) { 1966 for (int i = 0; i < siblings_.length(); i++) {
1883 RegExpNode* sibling = siblings_.Get(i); 1967 RegExpNode* sibling = siblings_.Get(i);
1884 if (sibling->info()->SameInterests(info)) 1968 if (sibling->info()->SameInterests(info))
1885 return sibling; 1969 return sibling;
1886 } 1970 }
(...skipping 413 matching lines...) Expand 10 before | Expand all | Expand 10 after
2300 } 2384 }
2301 2385
2302 RegExpMacroAssembler::RegExpMacroAssembler() { 2386 RegExpMacroAssembler::RegExpMacroAssembler() {
2303 } 2387 }
2304 2388
2305 RegExpMacroAssembler::~RegExpMacroAssembler() { 2389 RegExpMacroAssembler::~RegExpMacroAssembler() {
2306 } 2390 }
2307 2391
2308 2392
2309 }} // namespace v8::internal 2393 }} // namespace v8::internal
OLDNEW
« no previous file with comments | « src/jsregexp.h ('k') | src/unicode.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698