Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(659)

Unified Diff: src/jsregexp.cc

Issue 11352: * Match literals in a case independent way.... (Closed) Base URL: http://v8.googlecode.com/svn/branches/experimental/regexp2000/
Patch Set: '' Created 12 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « src/interpreter-re2k.cc ('k') | src/regexp-macro-assembler.h » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: src/jsregexp.cc
===================================================================
--- src/jsregexp.cc (revision 825)
+++ src/jsregexp.cc (working copy)
@@ -875,14 +875,13 @@
class RegExpCompiler {
public:
- explicit RegExpCompiler(int capture_count);
+ RegExpCompiler(int capture_count, bool ignore_case);
int AllocateRegister() { return next_register_++; }
Handle<FixedArray> Assemble(RegExpMacroAssembler* assembler,
RegExpNode* start,
- int capture_count,
- bool case_independent);
+ int capture_count);
inline void AddWork(RegExpNode* node) { work_list_->Add(node); }
@@ -899,6 +898,8 @@
inline void IncrementRecursionDepth() { recursion_depth_++; }
inline void DecrementRecursionDepth() { recursion_depth_--; }
+ inline bool is_case_independent() { return is_case_independent_; }
+
private:
EndNode* accept_;
EndNode* backtrack_;
@@ -906,15 +907,17 @@
List<RegExpNode*>* work_list_;
int recursion_depth_;
RegExpMacroAssembler* macro_assembler_;
+ bool is_case_independent_;
};
// Attempts to compile the regexp using a Regexp2000 code generator. Returns
// a fixed array or a null handle depending on whether it succeeded.
-RegExpCompiler::RegExpCompiler(int capture_count)
+RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case)
: next_register_(2 * (capture_count + 1)),
work_list_(NULL),
- recursion_depth_(0) {
+ recursion_depth_(0),
+ is_case_independent_(ignore_case) {
accept_ = new EndNode(EndNode::ACCEPT);
backtrack_ = new EndNode(EndNode::BACKTRACK);
}
@@ -923,9 +926,10 @@
Handle<FixedArray> RegExpCompiler::Assemble(
RegExpMacroAssembler* macro_assembler,
RegExpNode* start,
- int capture_count,
- bool case_independent) {
- if (case_independent) return Handle<FixedArray>::null();
+ int capture_count) {
+ if (!FLAG_attempt_case_independent && is_case_independent_) {
+ return Handle<FixedArray>::null();
+ }
macro_assembler_ = macro_assembler;
List <RegExpNode*> work_list(0);
work_list_ = &work_list;
@@ -1110,107 +1114,231 @@
}
-bool TextNode::Emit(RegExpCompiler* compiler) {
- RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
- Bind(macro_assembler);
- int element_count = elms_->length();
- int cp_offset = 0;
- for (int i = 0; i < element_count; i++) {
- TextElement elm = (*elms_)[i];
- switch (elm.type) {
- case TextElement::ATOM: {
- Vector<const uc16> quarks = elm.data.u_atom->data();
- macro_assembler->CheckCharacters(quarks,
- cp_offset,
- on_failure_->label());
- cp_offset += quarks.length();
+static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize;
+static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange;
+
+
+static inline void EmitAtomNonLetters(
+ RegExpMacroAssembler* macro_assembler,
+ TextElement elm,
+ Vector<const uc16> quarks,
+ Label* on_failure,
+ int cp_offset) {
+ unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
+ for (int i = quarks.length() - 1; i >= 0; i--) {
+ uc16 c = quarks[i];
+ int length = uncanonicalize.get(c, '\0', chars);
+ if (length <= 1) {
+ macro_assembler->LoadCurrentCharacter(cp_offset + i, on_failure);
+ macro_assembler->CheckNotCharacter(c, on_failure);
+ }
+ }
+}
+
+
+static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
+ uc16 c1,
+ uc16 c2,
+ Label* on_failure) {
+ uc16 exor = c1 ^ c2;
+ // Check whether exor has only one bit set.
+ if (((exor - 1) & exor) == 0) {
+ // If c1 and c2 differ only by one bit.
+ // Ecma262UnCanonicalize always gives the highest number last.
+ ASSERT(c2 > c1);
+ macro_assembler->CheckNotCharacterAfterOr(c2, exor, on_failure);
+ return true;
+ } else {
+ ASSERT(c2 > c1);
+ uc16 diff = c2 - c1;
+ if (((diff - 1) & diff) == 0 && c1 >= diff) {
+ // If the characters differ by 2^n but don't differ by one bit then
+ // subtract the difference from the found character, then do the or
+ // trick. We avoid the theoretical case where negative numbers are
+ // involved in order to simplify code generation.
+ macro_assembler->CheckNotCharacterAfterMinusOr(c2 - diff,
+ diff,
+ on_failure);
+ return true;
+ }
+ }
+ return false;
+}
+
+
+static inline void EmitAtomLetters(
+ RegExpMacroAssembler* macro_assembler,
+ TextElement elm,
+ Vector<const uc16> quarks,
+ Label* on_failure,
+ int cp_offset) {
+ unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
+ for (int i = quarks.length() - 1; i >= 0; i--) {
+ uc16 c = quarks[i];
+ int length = uncanonicalize.get(c, '\0', chars);
+ if (length <= 1) continue;
+ macro_assembler->LoadCurrentCharacter(cp_offset + i, on_failure);
+ Label ok;
+ ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
+ switch (length) {
+ case 2: {
+ if (ShortCutEmitCharacterPair(macro_assembler,
+ chars[0],
+ chars[1],
+ on_failure)) {
+ ok.Unuse();
+ } else {
+ macro_assembler->CheckCharacter(chars[0], &ok);
+ macro_assembler->CheckNotCharacter(chars[1], on_failure);
+ macro_assembler->Bind(&ok);
+ }
break;
}
- case TextElement::CHAR_CLASS: {
- RegExpCharacterClass* cc = elm.data.u_char_class;
- macro_assembler->LoadCurrentCharacter(cp_offset, on_failure_->label());
- cp_offset++;
+ case 4:
+ macro_assembler->CheckCharacter(chars[3], &ok);
+ // Fall through!
+ case 3:
+ macro_assembler->CheckCharacter(chars[0], &ok);
+ macro_assembler->CheckCharacter(chars[1], &ok);
+ macro_assembler->CheckNotCharacter(chars[2], on_failure);
+ macro_assembler->Bind(&ok);
+ break;
+ default:
+ UNREACHABLE();
+ break;
+ }
+ }
+}
- ZoneList<CharacterRange>* ranges = cc->ranges();
- Label success;
+static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
+ RegExpCharacterClass* cc,
+ int cp_offset,
+ Label* on_failure) {
+ macro_assembler->LoadCurrentCharacter(cp_offset, on_failure);
+ cp_offset++;
- Label *char_is_in_class =
- cc->is_negated() ? on_failure_->label() : &success;
+ ZoneList<CharacterRange>* ranges = cc->ranges();
- int range_count = ranges->length();
+ Label success;
- if (range_count == 0) {
- if (!cc->is_negated()) {
- on_failure()->GoTo(compiler);
- }
- break;
- }
+ Label *char_is_in_class =
+ cc->is_negated() ? on_failure : &success;
- for (int i = 0; i < range_count - 1; i++) {
- CharacterRange& range = (*ranges)[i];
- Label next_range;
- uc16 from = range.from();
- uc16 to = range.to();
- if (to == from) {
- macro_assembler->CheckCharacter(to, char_is_in_class);
- } else {
- if (from != 0) {
- macro_assembler->CheckCharacterLT(from, &next_range);
- }
- if (to != 0xffff) {
- macro_assembler->CheckCharacterLT(to + 1, char_is_in_class);
- } else {
- macro_assembler->GoTo(char_is_in_class);
- }
- }
- macro_assembler->Bind(&next_range);
- }
+ int range_count = ranges->length();
- if (range_count != 0) {
- CharacterRange& range = (*ranges)[range_count - 1];
- uc16 from = range.from();
- uc16 to = range.to();
+ if (range_count == 0) {
+ if (!cc->is_negated()) {
+ macro_assembler->GoTo(on_failure);
+ }
+ return;
+ }
- if (to == from) {
- if (cc->is_negated()) {
- macro_assembler->CheckCharacter(to, on_failure_->label());
- } else {
- macro_assembler->CheckNotCharacter(to, on_failure_->label());
- }
- } else {
- if (from != 0) {
- if (!cc->is_negated()) {
- macro_assembler->CheckCharacterLT(from, on_failure_->label());
- } else {
- macro_assembler->CheckCharacterLT(from, &success);
- }
- }
- if (to != 0xffff) {
- if (!cc->is_negated()) {
- macro_assembler->CheckCharacterGT(to, on_failure_->label());
- } else {
- macro_assembler->CheckCharacterLT(to + 1, on_failure_->label());
- }
- } else {
- if (cc->is_negated()) {
- macro_assembler->GoTo(on_failure_->label());
- }
- }
- }
- } else if (cc->is_negated()) {
- macro_assembler->GoTo(on_failure_->label());
- }
+ for (int i = 0; i < range_count - 1; i++) {
+ CharacterRange& range = ranges->at(i);
+ Label next_range;
+ uc16 from = range.from();
+ uc16 to = range.to();
+ if (to == from) {
+ macro_assembler->CheckCharacter(to, char_is_in_class);
+ } else {
+ if (from != 0) {
+ macro_assembler->CheckCharacterLT(from, &next_range);
+ }
+ if (to != 0xffff) {
+ macro_assembler->CheckCharacterLT(to + 1, char_is_in_class);
+ } else {
+ macro_assembler->GoTo(char_is_in_class);
+ }
+ }
+ macro_assembler->Bind(&next_range);
+ }
- macro_assembler->Bind(&success);
+ CharacterRange& range = ranges->at(range_count - 1);
+ uc16 from = range.from();
+ uc16 to = range.to();
- break;
+ if (to == from) {
+ if (cc->is_negated()) {
+ macro_assembler->CheckCharacter(to, on_failure);
+ } else {
+ macro_assembler->CheckNotCharacter(to, on_failure);
+ }
+ } else {
+ if (from != 0) {
+ if (!cc->is_negated()) {
+ macro_assembler->CheckCharacterLT(from, on_failure);
+ } else {
+ macro_assembler->CheckCharacterLT(from, &success);
}
- default:
- UNREACHABLE();
- return false;
}
+ if (to != 0xffff) {
+ if (!cc->is_negated()) {
+ macro_assembler->CheckCharacterGT(to, on_failure);
+ } else {
+ macro_assembler->CheckCharacterLT(to + 1, on_failure);
+ }
+ } else {
+ if (cc->is_negated()) {
+ macro_assembler->GoTo(on_failure);
+ }
+ }
}
+ macro_assembler->Bind(&success);
+}
+
+
+
+bool TextNode::Emit(RegExpCompiler* compiler) {
+ RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
+ Bind(macro_assembler);
+ int element_count = elms_->length();
+ int cp_offset = 0;
+ // First, handle straight character matches.
+ for (int i = 0; i < element_count; i++) {
+ TextElement elm = elms_->at(i);
+ if (elm.type == TextElement::ATOM) {
+ Vector<const uc16> quarks = elm.data.u_atom->data();
+ if (!compiler->is_case_independent()) {
+ macro_assembler->CheckCharacters(quarks,
+ cp_offset,
+ on_failure_->label());
+ } else {
+ EmitAtomNonLetters(macro_assembler, elm, quarks, on_failure_->label(), cp_offset);
+ }
+ cp_offset += quarks.length();
+ } else {
+ ASSERT_EQ(elm.type, TextElement::CHAR_CLASS);
+ cp_offset++;
+ }
+ }
+ // Second, handle case independent letter matches if any.
+ if (compiler->is_case_independent()) {
+ cp_offset = 0;
+ for (int i = 0; i < element_count; i++) {
+ TextElement elm = elms_->at(i);
+ if (elm.type == TextElement::ATOM) {
+ Vector<const uc16> quarks = elm.data.u_atom->data();
+ EmitAtomLetters(macro_assembler, elm, quarks, on_failure_->label(), cp_offset);
+ cp_offset += quarks.length();
+ } else {
+ cp_offset++;
+ }
+ }
+ }
+ // If the fast character matches passed then do the character classes.
+ cp_offset = 0;
+ for (int i = 0; i < element_count; i++) {
+ TextElement elm = elms_->at(i);
+ if (elm.type == TextElement::CHAR_CLASS) {
+ RegExpCharacterClass* cc = elm.data.u_char_class;
+ EmitCharClass(macro_assembler, cc, cp_offset, on_failure_->label());
+ cp_offset ++;
+ } else {
+ cp_offset += elm.data.u_atom->data().length();
+ }
+ }
+
compiler->AddWork(on_failure_);
macro_assembler->AdvanceCurrentPosition(cp_offset);
return on_success()->GoTo(compiler);
@@ -1225,14 +1353,14 @@
// is to use the Dispatch table to try only the relevant ones.
int i;
for (i = 0; i < choice_count - 1; i++) {
- GuardedAlternative alternative = (*alternatives_)[i];
+ GuardedAlternative alternative = alternatives_->at(i);
Label after;
Label after_no_pop_cp;
ZoneList<Guard*>* guards = alternative.guards();
if (guards != NULL) {
int guard_count = guards->length();
for (int j = 0; j < guard_count; j++) {
- GenerateGuard(macro_assembler, (*guards)[j], &after_no_pop_cp);
+ GenerateGuard(macro_assembler, guards->at(j), &after_no_pop_cp);
}
}
macro_assembler->PushCurrentPosition();
@@ -1246,12 +1374,12 @@
macro_assembler->PopCurrentPosition();
macro_assembler->Bind(&after_no_pop_cp);
}
- GuardedAlternative alternative = (*alternatives_)[i];
+ GuardedAlternative alternative = alternatives_->at(i);
ZoneList<Guard*>* guards = alternative.guards();
if (guards != NULL) {
int guard_count = guards->length();
for (int j = 0; j < guard_count; j++) {
- GenerateGuard(macro_assembler, (*guards)[j], on_failure_->label());
+ GenerateGuard(macro_assembler, guards->at(j), on_failure_->label());
}
}
if (!on_failure_->IsBacktrack()) {
@@ -1932,10 +2060,6 @@
}
-static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize;
-static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange;
-
-
void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges) {
unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
if (IsSingleton()) {
@@ -2412,7 +2536,7 @@
Handle<FixedArray> RegExpEngine::Compile(RegExpParseResult* input,
RegExpNode** node_return,
bool ignore_case) {
- RegExpCompiler compiler(input->capture_count);
+ RegExpCompiler compiler(input->capture_count, ignore_case);
// Wrap the body of the regexp in capture #0.
RegExpNode* captured_body = RegExpCapture::ToNode(input->tree,
0,
@@ -2445,8 +2569,7 @@
ignore_case);
return compiler.Assemble(&macro_assembler,
node,
- input->capture_count,
- ignore_case);
+ input->capture_count);
}
#endif
byte codes[1024];
@@ -2454,8 +2577,7 @@
RegExpMacroAssemblerRe2k macro_assembler(&assembler);
return compiler.Assemble(&macro_assembler,
node,
- input->capture_count,
- ignore_case);
+ input->capture_count);
}
« no previous file with comments | « src/interpreter-re2k.cc ('k') | src/regexp-macro-assembler.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698