src/jsregexp.cc - Issue 11352: * Match literals in a case independent way....

Unified Diff: src/jsregexp.cc

Issue 11352: * Match literals in a case independent way.... (Closed) Base URL: http://v8.googlecode.com/svn/branches/experimental/regexp2000/

Patch Set: '' Created 12 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/jsregexp.cc

===================================================================

--- src/jsregexp.cc (revision 825)

+++ src/jsregexp.cc (working copy)

@@ -875,14 +875,13 @@

class RegExpCompiler {

public:

- explicit RegExpCompiler(int capture_count);

+ RegExpCompiler(int capture_count, bool ignore_case);

int AllocateRegister() { return next_register_++; }

Handle<FixedArray> Assemble(RegExpMacroAssembler* assembler,

RegExpNode* start,

- int capture_count,

- bool case_independent);

+ int capture_count);

inline void AddWork(RegExpNode* node) { work_list_->Add(node); }

@@ -899,6 +898,8 @@

inline void IncrementRecursionDepth() { recursion_depth_++; }

inline void DecrementRecursionDepth() { recursion_depth_--; }

+ inline bool is_case_independent() { return is_case_independent_; }

private:

EndNode* accept_;

EndNode* backtrack_;

@@ -906,15 +907,17 @@

List<RegExpNode*>* work_list_;

int recursion_depth_;

RegExpMacroAssembler* macro_assembler_;

+ bool is_case_independent_;

};

// Attempts to compile the regexp using a Regexp2000 code generator. Returns

// a fixed array or a null handle depending on whether it succeeded.

-RegExpCompiler::RegExpCompiler(int capture_count)

+RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case)

: next_register_(2 * (capture_count + 1)),

work_list_(NULL),

- recursion_depth_(0) {

+ recursion_depth_(0),

+ is_case_independent_(ignore_case) {

accept_ = new EndNode(EndNode::ACCEPT);

backtrack_ = new EndNode(EndNode::BACKTRACK);

}

@@ -923,9 +926,10 @@

Handle<FixedArray> RegExpCompiler::Assemble(

RegExpMacroAssembler* macro_assembler,

RegExpNode* start,

- int capture_count,

- bool case_independent) {

- if (case_independent) return Handle<FixedArray>::null();

+ int capture_count) {

+ if (!FLAG_attempt_case_independent && is_case_independent_) {

+ return Handle<FixedArray>::null();

+ }

macro_assembler_ = macro_assembler;

List <RegExpNode*> work_list(0);

work_list_ = &work_list;

@@ -1110,107 +1114,231 @@

}

-bool TextNode::Emit(RegExpCompiler* compiler) {

- RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();

- Bind(macro_assembler);

- int element_count = elms_->length();

- int cp_offset = 0;

- for (int i = 0; i < element_count; i++) {

- TextElement elm = (*elms_)[i];

- switch (elm.type) {

- case TextElement::ATOM: {

- Vector<const uc16> quarks = elm.data.u_atom->data();

- macro_assembler->CheckCharacters(quarks,

- cp_offset,

- on_failure_->label());

- cp_offset += quarks.length();

+static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize;

+static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange;

+static inline void EmitAtomNonLetters(

+ RegExpMacroAssembler* macro_assembler,

+ TextElement elm,

+ Vector<const uc16> quarks,

+ Label* on_failure,

+ int cp_offset) {

+ unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

+ for (int i = quarks.length() - 1; i >= 0; i--) {

+ uc16 c = quarks[i];

+ int length = uncanonicalize.get(c, '\0', chars);

+ if (length <= 1) {

+ macro_assembler->LoadCurrentCharacter(cp_offset + i, on_failure);

+ macro_assembler->CheckNotCharacter(c, on_failure);

+ }

+static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,

+ uc16 c1,

+ uc16 c2,

+ Label* on_failure) {

+ uc16 exor = c1 ^ c2;

+ // Check whether exor has only one bit set.

+ if (((exor - 1) & exor) == 0) {

+ // If c1 and c2 differ only by one bit.

+ // Ecma262UnCanonicalize always gives the highest number last.

+ ASSERT(c2 > c1);

+ macro_assembler->CheckNotCharacterAfterOr(c2, exor, on_failure);

+ return true;

+ } else {

+ ASSERT(c2 > c1);

+ uc16 diff = c2 - c1;

+ if (((diff - 1) & diff) == 0 && c1 >= diff) {

+ // If the characters differ by 2^n but don't differ by one bit then

+ // subtract the difference from the found character, then do the or

+ // trick. We avoid the theoretical case where negative numbers are

+ // involved in order to simplify code generation.

+ macro_assembler->CheckNotCharacterAfterMinusOr(c2 - diff,

+ diff,

+ on_failure);

+ return true;

+ }

+ return false;

+static inline void EmitAtomLetters(

+ RegExpMacroAssembler* macro_assembler,

+ TextElement elm,

+ Vector<const uc16> quarks,

+ Label* on_failure,

+ int cp_offset) {

+ unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

+ for (int i = quarks.length() - 1; i >= 0; i--) {

+ uc16 c = quarks[i];

+ int length = uncanonicalize.get(c, '\0', chars);

+ if (length <= 1) continue;

+ macro_assembler->LoadCurrentCharacter(cp_offset + i, on_failure);

+ Label ok;

+ ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);

+ switch (length) {

+ case 2: {

+ if (ShortCutEmitCharacterPair(macro_assembler,

+ chars[0],

+ chars[1],

+ on_failure)) {

+ ok.Unuse();

+ } else {

+ macro_assembler->CheckCharacter(chars[0], &ok);

+ macro_assembler->CheckNotCharacter(chars[1], on_failure);

+ macro_assembler->Bind(&ok);

+ }

break;

}

- case TextElement::CHAR_CLASS: {

- RegExpCharacterClass* cc = elm.data.u_char_class;

- macro_assembler->LoadCurrentCharacter(cp_offset, on_failure_->label());

- cp_offset++;

+ case 4:

+ macro_assembler->CheckCharacter(chars[3], &ok);

+ // Fall through!

+ case 3:

+ macro_assembler->CheckCharacter(chars[0], &ok);

+ macro_assembler->CheckCharacter(chars[1], &ok);

+ macro_assembler->CheckNotCharacter(chars[2], on_failure);

+ macro_assembler->Bind(&ok);

+ break;

+ default:

+ UNREACHABLE();

+ break;

+ }

- ZoneList<CharacterRange>* ranges = cc->ranges();

- Label success;

+static void EmitCharClass(RegExpMacroAssembler* macro_assembler,

+ RegExpCharacterClass* cc,

+ int cp_offset,

+ Label* on_failure) {

+ macro_assembler->LoadCurrentCharacter(cp_offset, on_failure);

+ cp_offset++;

- Label *char_is_in_class =

- cc->is_negated() ? on_failure_->label() : &success;

+ ZoneList<CharacterRange>* ranges = cc->ranges();

- int range_count = ranges->length();

+ Label success;

- if (range_count == 0) {

- if (!cc->is_negated()) {

- on_failure()->GoTo(compiler);

- }

- break;

- }

+ Label *char_is_in_class =

+ cc->is_negated() ? on_failure : &success;

- for (int i = 0; i < range_count - 1; i++) {

- CharacterRange& range = (*ranges)[i];

- Label next_range;

- uc16 from = range.from();

- uc16 to = range.to();

- if (to == from) {

- macro_assembler->CheckCharacter(to, char_is_in_class);

- } else {

- if (from != 0) {

- macro_assembler->CheckCharacterLT(from, &next_range);

- }

- if (to != 0xffff) {

- macro_assembler->CheckCharacterLT(to + 1, char_is_in_class);

- } else {

- macro_assembler->GoTo(char_is_in_class);

- }

- macro_assembler->Bind(&next_range);

- }

+ int range_count = ranges->length();

- if (range_count != 0) {

- CharacterRange& range = (*ranges)[range_count - 1];

- uc16 from = range.from();

- uc16 to = range.to();

+ if (range_count == 0) {

+ if (!cc->is_negated()) {

+ macro_assembler->GoTo(on_failure);

+ }

+ return;

+ }

- if (to == from) {

- if (cc->is_negated()) {

- macro_assembler->CheckCharacter(to, on_failure_->label());

- } else {

- macro_assembler->CheckNotCharacter(to, on_failure_->label());

- }

- } else {

- if (from != 0) {

- if (!cc->is_negated()) {

- macro_assembler->CheckCharacterLT(from, on_failure_->label());

- } else {

- macro_assembler->CheckCharacterLT(from, &success);

- }

- if (to != 0xffff) {

- if (!cc->is_negated()) {

- macro_assembler->CheckCharacterGT(to, on_failure_->label());

- } else {

- macro_assembler->CheckCharacterLT(to + 1, on_failure_->label());

- }

- } else {

- if (cc->is_negated()) {

- macro_assembler->GoTo(on_failure_->label());

- }

- } else if (cc->is_negated()) {

- macro_assembler->GoTo(on_failure_->label());

- }

+ for (int i = 0; i < range_count - 1; i++) {

+ CharacterRange& range = ranges->at(i);

+ Label next_range;

+ uc16 from = range.from();

+ uc16 to = range.to();

+ if (to == from) {

+ macro_assembler->CheckCharacter(to, char_is_in_class);

+ } else {

+ if (from != 0) {

+ macro_assembler->CheckCharacterLT(from, &next_range);

+ }

+ if (to != 0xffff) {

+ macro_assembler->CheckCharacterLT(to + 1, char_is_in_class);

+ } else {

+ macro_assembler->GoTo(char_is_in_class);

+ }

+ macro_assembler->Bind(&next_range);

+ }

- macro_assembler->Bind(&success);

+ CharacterRange& range = ranges->at(range_count - 1);

+ uc16 from = range.from();

+ uc16 to = range.to();

- break;

+ if (to == from) {

+ if (cc->is_negated()) {

+ macro_assembler->CheckCharacter(to, on_failure);

+ } else {

+ macro_assembler->CheckNotCharacter(to, on_failure);

+ }

+ } else {

+ if (from != 0) {

+ if (!cc->is_negated()) {

+ macro_assembler->CheckCharacterLT(from, on_failure);

+ } else {

+ macro_assembler->CheckCharacterLT(from, &success);

}

- default:

- UNREACHABLE();

- return false;

}

+ if (to != 0xffff) {

+ if (!cc->is_negated()) {

+ macro_assembler->CheckCharacterGT(to, on_failure);

+ } else {

+ macro_assembler->CheckCharacterLT(to + 1, on_failure);

+ }

+ } else {

+ if (cc->is_negated()) {

+ macro_assembler->GoTo(on_failure);

+ }

}

+ macro_assembler->Bind(&success);

+bool TextNode::Emit(RegExpCompiler* compiler) {

+ RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();

+ Bind(macro_assembler);

+ int element_count = elms_->length();

+ int cp_offset = 0;

+ // First, handle straight character matches.

+ for (int i = 0; i < element_count; i++) {

+ TextElement elm = elms_->at(i);

+ if (elm.type == TextElement::ATOM) {

+ Vector<const uc16> quarks = elm.data.u_atom->data();

+ if (!compiler->is_case_independent()) {

+ macro_assembler->CheckCharacters(quarks,

+ cp_offset,

+ on_failure_->label());

+ } else {

+ EmitAtomNonLetters(macro_assembler, elm, quarks, on_failure_->label(), cp_offset);

+ }

+ cp_offset += quarks.length();

+ } else {

+ ASSERT_EQ(elm.type, TextElement::CHAR_CLASS);

+ cp_offset++;

+ }

+ // Second, handle case independent letter matches if any.

+ if (compiler->is_case_independent()) {

+ cp_offset = 0;

+ for (int i = 0; i < element_count; i++) {

+ TextElement elm = elms_->at(i);

+ if (elm.type == TextElement::ATOM) {

+ Vector<const uc16> quarks = elm.data.u_atom->data();

+ EmitAtomLetters(macro_assembler, elm, quarks, on_failure_->label(), cp_offset);

+ cp_offset += quarks.length();

+ } else {

+ cp_offset++;

+ }

+ // If the fast character matches passed then do the character classes.

+ cp_offset = 0;

+ for (int i = 0; i < element_count; i++) {

+ TextElement elm = elms_->at(i);

+ if (elm.type == TextElement::CHAR_CLASS) {

+ RegExpCharacterClass* cc = elm.data.u_char_class;

+ EmitCharClass(macro_assembler, cc, cp_offset, on_failure_->label());

+ cp_offset ++;

+ } else {

+ cp_offset += elm.data.u_atom->data().length();

+ }

compiler->AddWork(on_failure_);

macro_assembler->AdvanceCurrentPosition(cp_offset);

return on_success()->GoTo(compiler);

@@ -1225,14 +1353,14 @@

// is to use the Dispatch table to try only the relevant ones.

int i;

for (i = 0; i < choice_count - 1; i++) {

- GuardedAlternative alternative = (*alternatives_)[i];

+ GuardedAlternative alternative = alternatives_->at(i);

Label after;

Label after_no_pop_cp;

ZoneList<Guard*>* guards = alternative.guards();

if (guards != NULL) {

int guard_count = guards->length();

for (int j = 0; j < guard_count; j++) {

- GenerateGuard(macro_assembler, (*guards)[j], &after_no_pop_cp);

+ GenerateGuard(macro_assembler, guards->at(j), &after_no_pop_cp);

}

macro_assembler->PushCurrentPosition();

@@ -1246,12 +1374,12 @@

macro_assembler->PopCurrentPosition();

macro_assembler->Bind(&after_no_pop_cp);

}

- GuardedAlternative alternative = (*alternatives_)[i];

+ GuardedAlternative alternative = alternatives_->at(i);

ZoneList<Guard*>* guards = alternative.guards();

if (guards != NULL) {

int guard_count = guards->length();

for (int j = 0; j < guard_count; j++) {

- GenerateGuard(macro_assembler, (*guards)[j], on_failure_->label());

+ GenerateGuard(macro_assembler, guards->at(j), on_failure_->label());

}

if (!on_failure_->IsBacktrack()) {

@@ -1932,10 +2060,6 @@

}

-static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize;

-static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange;

void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges) {

unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

if (IsSingleton()) {

@@ -2412,7 +2536,7 @@

Handle<FixedArray> RegExpEngine::Compile(RegExpParseResult* input,

RegExpNode** node_return,

bool ignore_case) {

- RegExpCompiler compiler(input->capture_count);

+ RegExpCompiler compiler(input->capture_count, ignore_case);

// Wrap the body of the regexp in capture #0.

RegExpNode* captured_body = RegExpCapture::ToNode(input->tree,

@@ -2445,8 +2569,7 @@

ignore_case);

return compiler.Assemble(&macro_assembler,

node,

- input->capture_count,

- ignore_case);

+ input->capture_count);

}

#endif

byte codes[1024];

@@ -2454,8 +2577,7 @@

RegExpMacroAssemblerRe2k macro_assembler(&assembler);

return compiler.Assemble(&macro_assembler,

node,

- input->capture_count,

- ignore_case);

+ input->capture_count);

}

« no previous file with comments | « src/interpreter-re2k.cc ('k') | src/regexp-macro-assembler.h » ('j') | no next file with comments »