Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(440)

Unified Diff: src/jsregexp.cc

Issue 13247: * Have an ASCII and a UC16 interpreter for Irregexp bytecodes -... (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/
Patch Set: Created 12 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « src/interpreter-irregexp.cc ('k') | no next file » | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: src/jsregexp.cc
===================================================================
--- src/jsregexp.cc (revision 939)
+++ src/jsregexp.cc (working copy)
@@ -962,10 +962,12 @@
}
Handle<ByteArray> byte_codes = IrregexpByteCode(irregexp);
- Handle<String> two_byte_subject = CachedStringToTwoByte(subject);
+ if (!subject->IsFlat(StringShape(*subject))) {
+ FlattenString(subject);
[Review comment by Lasse Reichstein, 2008/12/09 07:43:07] The string is also flattened in the IA32 branch (l… [comment truncated in the collapsed view]
+ }
rc = IrregexpInterpreter::Match(byte_codes,
- two_byte_subject,
+ subject,
offsets_vector,
previous_index);
break;
@@ -1191,7 +1193,7 @@
class RegExpCompiler {
public:
- RegExpCompiler(int capture_count, bool ignore_case);
+ RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii);
int AllocateRegister() { return next_register_++; }
@@ -1215,6 +1217,7 @@
inline void DecrementRecursionDepth() { recursion_depth_--; }
inline bool ignore_case() { return ignore_case_; }
+ inline bool ascii() { return ascii_; }
private:
EndNode* accept_;
@@ -1223,6 +1226,7 @@
int recursion_depth_;
RegExpMacroAssembler* macro_assembler_;
bool ignore_case_;
+ bool ascii_;
};
@@ -1239,11 +1243,12 @@
// Attempts to compile the regexp using an Irregexp code generator. Returns
// a fixed array or a null handle depending on whether it succeeded.
-RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case)
+RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii)
: next_register_(2 * (capture_count + 1)),
work_list_(NULL),
recursion_depth_(0),
- ignore_case_(ignore_case) {
+ ignore_case_(ignore_case),
+ ascii_(ascii) {
accept_ = new EndNode(EndNode::ACCEPT);
}
@@ -1682,7 +1687,6 @@
chars[0],
chars[1],
on_failure)) {
- ok.Unuse();
} else {
macro_assembler->CheckCharacter(chars[0], &ok);
macro_assembler->CheckNotCharacter(chars[1], on_failure);
@@ -1711,8 +1715,10 @@
RegExpCharacterClass* cc,
int cp_offset,
Label* on_failure,
- bool check_offset) {
+ bool check_offset,
+ bool ascii) {
ZoneList<CharacterRange>* ranges = cc->ranges();
+ const int max_char = ascii ? 0x7f : 0xffff;
[Review comment by Lasse Reichstein, 2008/12/09 07:43:07] Use String::kMaxAsciiCharCode instead of 0x7f?
Label success;
@@ -1721,16 +1727,27 @@
int range_count = ranges->length();
- if (range_count == 0) {
+ int last_valid_range = range_count - 1;
+ while (last_valid_range >= 0) {
+ CharacterRange& range = ranges->at(last_valid_range);
+ if (range.from() <= max_char) {
+ break;
+ }
+ last_valid_range--;
+ }
+
+ if (last_valid_range < 0) {
if (!cc->is_negated()) {
+ // TODO(plesner): We can remove this when the node level does our
+ // ASCII optimizations for us.
macro_assembler->GoTo(on_failure);
}
return;
}
- if (range_count == 1 &&
+ if (last_valid_range == 0 &&
!cc->is_negated() &&
- ranges->at(0).IsEverything(0xffff)) {
+ ranges->at(0).IsEverything(max_char)) {
// This is a common case hit by non-anchored expressions.
// TODO(erikcorry): We should have a macro assembler instruction that just
// checks for end of string without loading the character.
@@ -1748,18 +1765,22 @@
macro_assembler->LoadCurrentCharacterUnchecked(cp_offset);
}
- for (int i = 0; i < range_count - 1; i++) {
+ for (int i = 0; i <= last_valid_range; i++) {
CharacterRange& range = ranges->at(i);
Label next_range;
uc16 from = range.from();
uc16 to = range.to();
+ if (from > max_char) {
+ continue;
+ }
+ if (to > max_char) to = max_char;
if (to == from) {
macro_assembler->CheckCharacter(to, char_is_in_class);
} else {
if (from != 0) {
macro_assembler->CheckCharacterLT(from, &next_range);
[Review comment by Lasse Reichstein, 2008/12/09 07:43:07] How about a CheckCharacterRange(from, to, char_is_… [comment truncated in the collapsed view]
}
- if (to != 0xffff) {
+ if (to != max_char) {
macro_assembler->CheckCharacterLT(to + 1, char_is_in_class);
} else {
macro_assembler->GoTo(char_is_in_class);
@@ -1768,10 +1789,13 @@
macro_assembler->Bind(&next_range);
}
- CharacterRange& range = ranges->at(range_count - 1);
+ CharacterRange& range = ranges->at(last_valid_range);
uc16 from = range.from();
uc16 to = range.to();
+ if (to > max_char) to = max_char;
+ ASSERT(to >= from);
+
if (to == from) {
if (cc->is_negated()) {
macro_assembler->CheckCharacter(to, on_failure);
@@ -1875,7 +1899,25 @@
macro_assembler->GoTo(backtrack);
return true;
}
- // First, handle straight character matches.
+ // First check for non-ASCII text.
+ // TODO(plesner): We should do this at node level.
+ if (compiler->ascii()) {
+ for (int i = element_count - 1; i >= 0; i--) {
+ TextElement elm = elms_->at(i);
+ if (elm.type == TextElement::ATOM) {
+ Vector<const uc16> quarks = elm.data.u_atom->data();
+ for (int j = quarks.length() - 1; j >= 0; j--) {
+ if (quarks[j] > 0x7f) {
[Review comment by Lasse Reichstein, 2008/12/09 07:43:07] Use String::kMaxAsciiCharCode.
+ macro_assembler->GoTo(backtrack);
+ return true;
+ }
+ }
+ } else {
+ ASSERT_EQ(elm.type, TextElement::CHAR_CLASS);
+ }
+ }
+ }
+ // Second, handle straight character matches.
int checked_up_to = -1;
for (int i = element_count - 1; i >= 0; i--) {
TextElement elm = elms_->at(i);
@@ -1902,7 +1944,7 @@
ASSERT_EQ(elm.type, TextElement::CHAR_CLASS);
}
}
- // Second, handle case independent letter matches if any.
+ // Third, handle case independent letter matches if any.
if (compiler->ignore_case()) {
for (int i = element_count - 1; i >= 0; i--) {
TextElement elm = elms_->at(i);
@@ -1930,7 +1972,8 @@
cc,
cp_offset,
backtrack,
- checked_up_to < cp_offset);
+ checked_up_to < cp_offset,
+ compiler->ascii());
if (cp_offset > checked_up_to) checked_up_to = cp_offset;
}
}
@@ -3611,7 +3654,7 @@
bool is_multiline,
Handle<String> pattern,
bool is_ascii) {
- RegExpCompiler compiler(input->capture_count, ignore_case);
+ RegExpCompiler compiler(input->capture_count, ignore_case, is_ascii);
// Wrap the body of the regexp in capture #0.
RegExpNode* captured_body = RegExpCapture::ToNode(input->tree,
0,
« no previous file with comments | « src/interpreter-irregexp.cc ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698