Chromium Code Reviews| Index: src/jsregexp.cc |
| diff --git a/src/jsregexp.cc b/src/jsregexp.cc |
| index 8a94e819e2c1cbbc04e79e904b80b2e6fef4acdf..0d9f83af37ba48b7ee7b2cf24e81e3ed34d151ab 100644 |
| --- a/src/jsregexp.cc |
| +++ b/src/jsregexp.cc |
| @@ -290,25 +290,18 @@ int RegExpImpl::AtomExecRaw(Handle<JSRegExp> regexp, |
| DCHECK(needle_content.IsFlat()); |
| DCHECK(subject_content.IsFlat()); |
| // dispatch on type of strings |
| - index = (needle_content.IsAscii() |
| - ? (subject_content.IsAscii() |
| - ? SearchString(isolate, |
| - subject_content.ToOneByteVector(), |
| - needle_content.ToOneByteVector(), |
| - index) |
| - : SearchString(isolate, |
| - subject_content.ToUC16Vector(), |
| - needle_content.ToOneByteVector(), |
| - index)) |
| - : (subject_content.IsAscii() |
| - ? SearchString(isolate, |
| - subject_content.ToOneByteVector(), |
| - needle_content.ToUC16Vector(), |
| - index) |
| - : SearchString(isolate, |
| - subject_content.ToUC16Vector(), |
| - needle_content.ToUC16Vector(), |
| - index))); |
| + index = |
| + (needle_content.IsOneByte() |
| + ? (subject_content.IsOneByte() |
| + ? SearchString(isolate, subject_content.ToOneByteVector(), |
| + needle_content.ToOneByteVector(), index) |
| + : SearchString(isolate, subject_content.ToUC16Vector(), |
| + needle_content.ToOneByteVector(), index)) |
| + : (subject_content.IsOneByte() |
| + ? SearchString(isolate, subject_content.ToOneByteVector(), |
| + needle_content.ToUC16Vector(), index) |
| + : SearchString(isolate, subject_content.ToUC16Vector(), |
| + needle_content.ToUC16Vector(), index))); |
| if (index == -1) { |
| return i / 2; // Return number of matches. |
| } else { |
| @@ -346,14 +339,15 @@ Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re, |
| // Irregexp implementation. |
| // Ensures that the regexp object contains a compiled version of the |
| -// source for either ASCII or non-ASCII strings. |
| +// source for either one-byte or two-byte subject strings. |
| // If the compiled version doesn't already exist, it is compiled |
| // from the source pattern. |
| // If compilation fails, an exception is thrown and this function |
| // returns false. |
| -bool RegExpImpl::EnsureCompiledIrregexp( |
| - Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii) { |
| - Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii)); |
| +bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re, |
| + Handle<String> sample_subject, |
| + bool is_one_byte) { |
| + Object* compiled_code = re->DataAt(JSRegExp::code_index(is_one_byte)); |
| #ifdef V8_INTERPRETED_REGEXP |
| if (compiled_code->IsByteArray()) return true; |
| #else // V8_INTERPRETED_REGEXP (RegExp native code) |
| @@ -361,18 +355,18 @@ bool RegExpImpl::EnsureCompiledIrregexp( |
| #endif |
| // We could potentially have marked this as flushable, but have kept |
| // a saved version if we did not flush it yet. |
| - Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_ascii)); |
| + Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_one_byte)); |
| if (saved_code->IsCode()) { |
| // Reinstate the code in the original place. |
| - re->SetDataAt(JSRegExp::code_index(is_ascii), saved_code); |
| + re->SetDataAt(JSRegExp::code_index(is_one_byte), saved_code); |
| DCHECK(compiled_code->IsSmi()); |
| return true; |
| } |
| - return CompileIrregexp(re, sample_subject, is_ascii); |
| + return CompileIrregexp(re, sample_subject, is_one_byte); |
| } |
| -static void CreateRegExpErrorObjectAndThrow(Handle<JSRegExp> re, bool is_ascii, |
| +static void CreateRegExpErrorObjectAndThrow(Handle<JSRegExp> re, |
| Handle<String> error_message, |
| Isolate* isolate) { |
| Factory* factory = isolate->factory(); |
| @@ -389,14 +383,14 @@ static void CreateRegExpErrorObjectAndThrow(Handle<JSRegExp> re, bool is_ascii, |
| bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, |
| Handle<String> sample_subject, |
| - bool is_ascii) { |
| + bool is_one_byte) { |
| // Compile the RegExp. |
| Isolate* isolate = re->GetIsolate(); |
| Zone zone(isolate); |
| PostponeInterruptsScope postpone(isolate); |
| // If we had a compilation error the last time this is saved at the |
| // saved code index. |
| - Object* entry = re->DataAt(JSRegExp::code_index(is_ascii)); |
| + Object* entry = re->DataAt(JSRegExp::code_index(is_one_byte)); |
| // When arriving here entry can only be a smi, either representing an |
| // uncompiled regexp, a previous compilation error, or code that has |
| // been flushed. |
| @@ -410,10 +404,10 @@ bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, |
| // A previous compilation failed and threw an error which we store in |
| // the saved code index (we store the error message, not the actual |
| // error). Recreate the error object and throw it. |
| - Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_ascii)); |
| + Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_one_byte)); |
| DCHECK(error_string->IsString()); |
| Handle<String> error_message(String::cast(error_string)); |
| - CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate); |
| + CreateRegExpErrorObjectAndThrow(re, error_message, isolate); |
| return false; |
| } |
| @@ -434,25 +428,19 @@ bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, |
| "malformed_regexp")); |
| return false; |
| } |
| - RegExpEngine::CompilationResult result = |
| - RegExpEngine::Compile(&compile_data, |
| - flags.is_ignore_case(), |
| - flags.is_global(), |
| - flags.is_multiline(), |
| - pattern, |
| - sample_subject, |
| - is_ascii, |
| - &zone); |
| + RegExpEngine::CompilationResult result = RegExpEngine::Compile( |
| + &compile_data, flags.is_ignore_case(), flags.is_global(), |
| + flags.is_multiline(), pattern, sample_subject, is_one_byte, &zone); |
| if (result.error_message != NULL) { |
| // Unable to compile regexp. |
| Handle<String> error_message = isolate->factory()->NewStringFromUtf8( |
| CStrVector(result.error_message)).ToHandleChecked(); |
| - CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate); |
| + CreateRegExpErrorObjectAndThrow(re, error_message, isolate); |
| return false; |
| } |
| Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data())); |
| - data->set(JSRegExp::code_index(is_ascii), result.code); |
| + data->set(JSRegExp::code_index(is_one_byte), result.code); |
| int register_max = IrregexpMaxRegisterCount(*data); |
| if (result.num_registers > register_max) { |
| SetIrregexpMaxRegisterCount(*data, result.num_registers); |
| @@ -483,13 +471,13 @@ int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) { |
| } |
| -ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_ascii) { |
| - return ByteArray::cast(re->get(JSRegExp::code_index(is_ascii))); |
| +ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_one_byte) { |
| + return ByteArray::cast(re->get(JSRegExp::code_index(is_one_byte))); |
| } |
| -Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_ascii) { |
| - return Code::cast(re->get(JSRegExp::code_index(is_ascii))); |
| +Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_one_byte) { |
| + return Code::cast(re->get(JSRegExp::code_index(is_one_byte))); |
| } |
| @@ -510,9 +498,9 @@ int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp, |
| Handle<String> subject) { |
| subject = String::Flatten(subject); |
| - // Check the asciiness of the underlying storage. |
| - bool is_ascii = subject->IsOneByteRepresentationUnderneath(); |
| - if (!EnsureCompiledIrregexp(regexp, subject, is_ascii)) return -1; |
| + // Check representation of the underlying storage. |
| + bool is_one_byte = subject->IsOneByteRepresentationUnderneath(); |
| + if (!EnsureCompiledIrregexp(regexp, subject, is_one_byte)) return -1; |
| #ifdef V8_INTERPRETED_REGEXP |
| // Byte-code regexp needs space allocated for all its registers. |
| @@ -542,13 +530,13 @@ int RegExpImpl::IrregexpExecRaw(Handle<JSRegExp> regexp, |
| DCHECK(index <= subject->length()); |
| DCHECK(subject->IsFlat()); |
| - bool is_ascii = subject->IsOneByteRepresentationUnderneath(); |
| + bool is_one_byte = subject->IsOneByteRepresentationUnderneath(); |
| #ifndef V8_INTERPRETED_REGEXP |
| DCHECK(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2); |
| do { |
| - EnsureCompiledIrregexp(regexp, subject, is_ascii); |
| - Handle<Code> code(IrregexpNativeCode(*irregexp, is_ascii), isolate); |
| + EnsureCompiledIrregexp(regexp, subject, is_one_byte); |
| + Handle<Code> code(IrregexpNativeCode(*irregexp, is_one_byte), isolate); |
| // The stack is used to allocate registers for the compiled regexp code. |
| // This means that in case of failure, the output registers array is left |
| // untouched and contains the capture results from the previous successful |
| @@ -575,10 +563,10 @@ int RegExpImpl::IrregexpExecRaw(Handle<JSRegExp> regexp, |
| // must restart from scratch. |
| // In this case, it means we must make sure we are prepared to handle |
| // the, potentially, different subject (the string can switch between |
| - // being internal and external, and even between being ASCII and UC16, |
| + // being internal and external, and even between being Latin1 and UC16, |
| // but the characters are always the same). |
| IrregexpPrepare(regexp, subject); |
| - is_ascii = subject->IsOneByteRepresentationUnderneath(); |
| + is_one_byte = subject->IsOneByteRepresentationUnderneath(); |
| } while (true); |
| UNREACHABLE(); |
| return RE_EXCEPTION; |
| @@ -596,7 +584,8 @@ int RegExpImpl::IrregexpExecRaw(Handle<JSRegExp> regexp, |
| for (int i = number_of_capture_registers - 1; i >= 0; i--) { |
| raw_output[i] = -1; |
| } |
| - Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_ascii), isolate); |
| + Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_one_byte), |
| + isolate); |
| IrregexpResult result = IrregexpInterpreter::Match(isolate, |
| byte_codes, |
| @@ -997,7 +986,7 @@ class FrequencyCollator { |
| class RegExpCompiler { |
| public: |
| - RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii, |
| + RegExpCompiler(int capture_count, bool ignore_case, bool is_one_byte, |
| Zone* zone); |
| int AllocateRegister() { |
| @@ -1030,7 +1019,7 @@ class RegExpCompiler { |
| void SetRegExpTooBig() { reg_exp_too_big_ = true; } |
| inline bool ignore_case() { return ignore_case_; } |
| - inline bool ascii() { return ascii_; } |
| + inline bool one_byte() { return one_byte_; } |
| FrequencyCollator* frequency_collator() { return &frequency_collator_; } |
| int current_expansion_factor() { return current_expansion_factor_; } |
| @@ -1049,7 +1038,7 @@ class RegExpCompiler { |
| int recursion_depth_; |
| RegExpMacroAssembler* macro_assembler_; |
| bool ignore_case_; |
| - bool ascii_; |
| + bool one_byte_; |
| bool reg_exp_too_big_; |
| int current_expansion_factor_; |
| FrequencyCollator frequency_collator_; |
| @@ -1075,13 +1064,13 @@ static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) { |
| // Attempts to compile the regexp using an Irregexp code generator. Returns |
| // a fixed array or a null handle depending on whether it succeeded. |
| -RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii, |
| - Zone* zone) |
| +RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, |
| + bool one_byte, Zone* zone) |
| : next_register_(2 * (capture_count + 1)), |
| work_list_(NULL), |
| recursion_depth_(0), |
| ignore_case_(ignore_case), |
| - ascii_(ascii), |
| + one_byte_(one_byte), |
| reg_exp_too_big_(false), |
| current_expansion_factor_(1), |
| frequency_collator_(), |
| @@ -1592,9 +1581,8 @@ void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler, |
| // Returns the number of characters in the equivalence class, omitting those |
| // that cannot occur in the source string because it is ASCII. |
| -static int GetCaseIndependentLetters(Isolate* isolate, |
| - uc16 character, |
| - bool ascii_subject, |
| +static int GetCaseIndependentLetters(Isolate* isolate, uc16 character, |
| + bool one_byte_subject, |
| unibrow::uchar* letters) { |
| int length = |
| isolate->jsregexp_uncanonicalize()->get(character, '\0', letters); |
| @@ -1604,11 +1592,14 @@ static int GetCaseIndependentLetters(Isolate* isolate, |
| letters[0] = character; |
| length = 1; |
| } |
| - if (!ascii_subject || character <= String::kMaxOneByteCharCode) { |
| + if (!one_byte_subject || character <= String::kMaxOneByteCharCode) { |
| return length; |
| } |
| + |
| // The standard requires that non-ASCII characters cannot have ASCII |
| // character codes in their equivalence class. |
| + // TODO(dcarney): issue 3550 this is not actually true for Latin1 anymore, |
| + // is it? For example, \u00C5 is equivalent to \u212B. |
|
Yang
2014/09/10 08:26:36
This is one of the TODOs I mentioned.
dcarney
2014/09/10 09:35:12
I checked other browsers I think originally, and w
|
| return 0; |
| } |
| @@ -1644,18 +1635,19 @@ static inline bool EmitAtomNonLetter(Isolate* isolate, |
| bool check, |
| bool preloaded) { |
| RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); |
| - bool ascii = compiler->ascii(); |
| + bool one_byte = compiler->one_byte(); |
| unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
| - int length = GetCaseIndependentLetters(isolate, c, ascii, chars); |
| + int length = GetCaseIndependentLetters(isolate, c, one_byte, chars); |
| if (length < 1) { |
| - // This can't match. Must be an ASCII subject and a non-ASCII character. |
| - // We do not need to do anything since the ASCII pass already handled this. |
| + // This can't match. Must be a one-byte subject and a non-one-byte |
| + // character. We do not need to do anything since the one-byte pass |
| + // already handled this. |
| return false; // Bounds not checked. |
| } |
| bool checked = false; |
| // We handle the length > 1 case in a later pass. |
| if (length == 1) { |
| - if (ascii && c > String::kMaxOneByteCharCodeU) { |
| + if (one_byte && c > String::kMaxOneByteCharCodeU) { |
| // Can't match - see above. |
| return false; // Bounds not checked. |
| } |
| @@ -1670,12 +1662,10 @@ static inline bool EmitAtomNonLetter(Isolate* isolate, |
| static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler, |
| - bool ascii, |
| - uc16 c1, |
| - uc16 c2, |
| + bool one_byte, uc16 c1, uc16 c2, |
| Label* on_failure) { |
| uc16 char_mask; |
| - if (ascii) { |
| + if (one_byte) { |
| char_mask = String::kMaxOneByteCharCode; |
| } else { |
| char_mask = String::kMaxUtf16CodeUnit; |
| @@ -1726,9 +1716,9 @@ static inline bool EmitAtomLetter(Isolate* isolate, |
| bool check, |
| bool preloaded) { |
| RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); |
| - bool ascii = compiler->ascii(); |
| + bool one_byte = compiler->one_byte(); |
| unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
| - int length = GetCaseIndependentLetters(isolate, c, ascii, chars); |
| + int length = GetCaseIndependentLetters(isolate, c, one_byte, chars); |
| if (length <= 1) return false; |
| // We may not need to check against the end of the input string |
| // if this character lies before a character that matched. |
| @@ -1739,11 +1729,8 @@ static inline bool EmitAtomLetter(Isolate* isolate, |
| DCHECK(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4); |
| switch (length) { |
| case 2: { |
| - if (ShortCutEmitCharacterPair(macro_assembler, |
| - ascii, |
| - chars[0], |
| - chars[1], |
| - on_failure)) { |
| + if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0], |
| + chars[1], on_failure)) { |
| } else { |
| macro_assembler->CheckCharacter(chars[0], &ok); |
| macro_assembler->CheckNotCharacter(chars[1], on_failure); |
| @@ -1918,7 +1905,7 @@ static void SplitSearchSpace(ZoneList<int>* ranges, |
| // new_start_index is the index of the first edge that is beyond the |
| // current kSize space. |
| - // For very large search spaces we do a binary chop search of the non-ASCII |
| + // For very large search spaces we do a binary chop search of the non-Latin1 |
| // space instead of just going to the end of the current kSize space. The |
| // heuristics are complicated a little by the fact that any 128-character |
| // encoding space can be quickly tested with a table lookup, so we don't |
| @@ -1927,14 +1914,13 @@ static void SplitSearchSpace(ZoneList<int>* ranges, |
| // for example, we only want to match every second character (eg. the lower |
| // case characters on some Unicode pages). |
| int binary_chop_index = (end_index + start_index) / 2; |
| - // The first test ensures that we get to the code that handles the ASCII |
| + // The first test ensures that we get to the code that handles the Latin1 |
| // range with a single not-taken branch, speeding up this important |
| - // character range (even non-ASCII charset-based text has spaces and |
| + // character range (even non-Latin1 charset-based text has spaces and |
| // punctuation). |
| - if (*border - 1 > String::kMaxOneByteCharCode && // ASCII case. |
| + if (*border - 1 > String::kMaxOneByteCharCode && // Latin1 case. |
| end_index - start_index > (*new_start_index - start_index) * 2 && |
| - last - first > kSize * 2 && |
| - binary_chop_index > *new_start_index && |
| + last - first > kSize * 2 && binary_chop_index > *new_start_index && |
| ranges->at(binary_chop_index) >= first + 2 * kSize) { |
| int scan_forward_for_section_border = binary_chop_index;; |
| int new_border = (ranges->at(binary_chop_index) | kMask) + 1; |
| @@ -2121,20 +2107,16 @@ static void GenerateBranches(RegExpMacroAssembler* masm, |
| static void EmitCharClass(RegExpMacroAssembler* macro_assembler, |
| - RegExpCharacterClass* cc, |
| - bool ascii, |
| - Label* on_failure, |
| - int cp_offset, |
| - bool check_offset, |
| - bool preloaded, |
| - Zone* zone) { |
| + RegExpCharacterClass* cc, bool one_byte, |
| + Label* on_failure, int cp_offset, bool check_offset, |
| + bool preloaded, Zone* zone) { |
| ZoneList<CharacterRange>* ranges = cc->ranges(zone); |
| if (!CharacterRange::IsCanonical(ranges)) { |
| CharacterRange::Canonicalize(ranges); |
| } |
| int max_char; |
| - if (ascii) { |
| + if (one_byte) { |
| max_char = String::kMaxOneByteCharCode; |
| } else { |
| max_char = String::kMaxUtf16CodeUnit; |
| @@ -2464,7 +2446,7 @@ bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler, |
| GetQuickCheckDetails( |
| details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE); |
| if (details->cannot_match()) return false; |
| - if (!details->Rationalize(compiler->ascii())) return false; |
| + if (!details->Rationalize(compiler->one_byte())) return false; |
| DCHECK(details->characters() == 1 || |
| compiler->macro_assembler()->CanReadUnaligned()); |
| uint32_t mask = details->mask(); |
| @@ -2486,7 +2468,7 @@ bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler, |
| // If number of characters preloaded is 1 then we used a byte or 16 bit |
| // load so the value is already masked down. |
| uint32_t char_mask; |
| - if (compiler->ascii()) { |
| + if (compiler->one_byte()) { |
| char_mask = String::kMaxOneByteCharCode; |
| } else { |
| char_mask = String::kMaxUtf16CodeUnit; |
| @@ -2494,11 +2476,11 @@ bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler, |
| if ((mask & char_mask) == char_mask) need_mask = false; |
| mask &= char_mask; |
| } else { |
| - // For 2-character preloads in ASCII mode or 1-character preloads in |
| - // TWO_BYTE mode we also use a 16 bit load with zero extend. |
| - if (details->characters() == 2 && compiler->ascii()) { |
| + // For 2-character preloads in one-byte mode or 1-character preloads in |
| + // two-byte mode we also use a 16 bit load with zero extend. |
| + if (details->characters() == 2 && compiler->one_byte()) { |
| if ((mask & 0xffff) == 0xffff) need_mask = false; |
| - } else if (details->characters() == 1 && !compiler->ascii()) { |
| + } else if (details->characters() == 1 && !compiler->one_byte()) { |
| if ((mask & 0xffff) == 0xffff) need_mask = false; |
| } else { |
| if (mask == 0xffffffff) need_mask = false; |
| @@ -2538,7 +2520,7 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, |
| DCHECK(characters_filled_in < details->characters()); |
| int characters = details->characters(); |
| int char_mask; |
| - if (compiler->ascii()) { |
| + if (compiler->one_byte()) { |
| char_mask = String::kMaxOneByteCharCode; |
| } else { |
| char_mask = String::kMaxUtf16CodeUnit; |
| @@ -2552,18 +2534,20 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, |
| details->positions(characters_filled_in); |
| uc16 c = quarks[i]; |
| if (c > char_mask) { |
| - // If we expect a non-ASCII character from an ASCII string, |
| - // there is no way we can match. Not even case independent |
| - // matching can turn an ASCII character into non-ASCII or |
| + // If we expect a non-Latin1 character from a one-byte string, |
| + // there is no way we can match. Not even case-independent |
| + // matching can turn a Latin1 character into non-Latin1 or |
| // vice versa. |
| + // TODO(dcarney): issue 3550. Verify that this works as expected. |
| + // For example, \u0178 is uppercase of \u00ff (y-umlaut). |
|
Yang
2014/09/10 08:26:36
This is the other.
|
| details->set_cannot_match(); |
| pos->determines_perfectly = false; |
| return; |
| } |
| if (compiler->ignore_case()) { |
| unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
| - int length = GetCaseIndependentLetters(isolate, c, compiler->ascii(), |
| - chars); |
| + int length = GetCaseIndependentLetters(isolate, c, |
| + compiler->one_byte(), chars); |
| DCHECK(length != 0); // Can only happen if c > char_mask (see above). |
| if (length == 1) { |
| // This letter has no case equivalents, so it's nice and simple |
| @@ -2692,7 +2676,7 @@ void QuickCheckDetails::Clear() { |
| } |
| -void QuickCheckDetails::Advance(int by, bool ascii) { |
| +void QuickCheckDetails::Advance(int by, bool one_byte) { |
| DCHECK(by >= 0); |
| if (by >= characters_) { |
| Clear(); |
| @@ -2756,7 +2740,7 @@ class VisitMarker { |
| }; |
| -RegExpNode* SeqRegExpNode::FilterASCII(int depth, bool ignore_case) { |
| +RegExpNode* SeqRegExpNode::FilterOneByte(int depth, bool ignore_case) { |
| if (info()->replacement_calculated) return replacement(); |
| if (depth < 0) return this; |
| DCHECK(!info()->visited); |
| @@ -2766,7 +2750,7 @@ RegExpNode* SeqRegExpNode::FilterASCII(int depth, bool ignore_case) { |
| RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) { |
| - RegExpNode* next = on_success_->FilterASCII(depth - 1, ignore_case); |
| + RegExpNode* next = on_success_->FilterOneByte(depth - 1, ignore_case); |
| if (next == NULL) return set_replacement(NULL); |
| on_success_ = next; |
| return set_replacement(this); |
| @@ -2790,7 +2774,7 @@ static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) { |
| } |
| -RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) { |
| +RegExpNode* TextNode::FilterOneByte(int depth, bool ignore_case) { |
| if (info()->replacement_calculated) return replacement(); |
| if (depth < 0) return this; |
| DCHECK(!info()->visited); |
| @@ -2844,7 +2828,7 @@ RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) { |
| } |
| -RegExpNode* LoopChoiceNode::FilterASCII(int depth, bool ignore_case) { |
| +RegExpNode* LoopChoiceNode::FilterOneByte(int depth, bool ignore_case) { |
| if (info()->replacement_calculated) return replacement(); |
| if (depth < 0) return this; |
| if (info()->visited) return this; |
| @@ -2852,17 +2836,17 @@ RegExpNode* LoopChoiceNode::FilterASCII(int depth, bool ignore_case) { |
| VisitMarker marker(info()); |
| RegExpNode* continue_replacement = |
| - continue_node_->FilterASCII(depth - 1, ignore_case); |
| + continue_node_->FilterOneByte(depth - 1, ignore_case); |
| // If we can't continue after the loop then there is no sense in doing the |
| // loop. |
| if (continue_replacement == NULL) return set_replacement(NULL); |
| } |
| - return ChoiceNode::FilterASCII(depth - 1, ignore_case); |
| + return ChoiceNode::FilterOneByte(depth - 1, ignore_case); |
| } |
| -RegExpNode* ChoiceNode::FilterASCII(int depth, bool ignore_case) { |
| +RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) { |
| if (info()->replacement_calculated) return replacement(); |
| if (depth < 0) return this; |
| if (info()->visited) return this; |
| @@ -2882,7 +2866,7 @@ RegExpNode* ChoiceNode::FilterASCII(int depth, bool ignore_case) { |
| for (int i = 0; i < choice_count; i++) { |
| GuardedAlternative alternative = alternatives_->at(i); |
| RegExpNode* replacement = |
| - alternative.node()->FilterASCII(depth - 1, ignore_case); |
| + alternative.node()->FilterOneByte(depth - 1, ignore_case); |
| DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK. |
| if (replacement != NULL) { |
| alternatives_->at(i).set_node(replacement); |
| @@ -2902,7 +2886,7 @@ RegExpNode* ChoiceNode::FilterASCII(int depth, bool ignore_case) { |
| new(zone()) ZoneList<GuardedAlternative>(surviving, zone()); |
| for (int i = 0; i < choice_count; i++) { |
| RegExpNode* replacement = |
| - alternatives_->at(i).node()->FilterASCII(depth - 1, ignore_case); |
| + alternatives_->at(i).node()->FilterOneByte(depth - 1, ignore_case); |
| if (replacement != NULL) { |
| alternatives_->at(i).set_node(replacement); |
| new_alternatives->Add(alternatives_->at(i), zone()); |
| @@ -2913,8 +2897,8 @@ RegExpNode* ChoiceNode::FilterASCII(int depth, bool ignore_case) { |
| } |
| -RegExpNode* NegativeLookaheadChoiceNode::FilterASCII(int depth, |
| - bool ignore_case) { |
| +RegExpNode* NegativeLookaheadChoiceNode::FilterOneByte(int depth, |
| + bool ignore_case) { |
| if (info()->replacement_calculated) return replacement(); |
| if (depth < 0) return this; |
| if (info()->visited) return this; |
| @@ -2922,12 +2906,12 @@ RegExpNode* NegativeLookaheadChoiceNode::FilterASCII(int depth, |
| // Alternative 0 is the negative lookahead, alternative 1 is what comes |
| // afterwards. |
| RegExpNode* node = alternatives_->at(1).node(); |
| - RegExpNode* replacement = node->FilterASCII(depth - 1, ignore_case); |
| + RegExpNode* replacement = node->FilterOneByte(depth - 1, ignore_case); |
| if (replacement == NULL) return set_replacement(NULL); |
| alternatives_->at(1).set_node(replacement); |
| RegExpNode* neg_node = alternatives_->at(0).node(); |
| - RegExpNode* neg_replacement = neg_node->FilterASCII(depth - 1, ignore_case); |
| + RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, ignore_case); |
| // If the negative lookahead is always going to fail then |
| // we don't need to check it. |
| if (neg_replacement == NULL) return set_replacement(replacement); |
| @@ -3036,7 +3020,7 @@ static void EmitHat(RegExpCompiler* compiler, |
| if (!assembler->CheckSpecialCharacterClass('n', |
| new_trace.backtrack())) { |
| // Newline means \n, \r, 0x2028 or 0x2029. |
| - if (!compiler->ascii()) { |
| + if (!compiler->one_byte()) { |
| assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok); |
| } |
| assembler->CheckCharacter('\n', &ok); |
| @@ -3234,7 +3218,7 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler, |
| int* checked_up_to) { |
| RegExpMacroAssembler* assembler = compiler->macro_assembler(); |
| Isolate* isolate = assembler->zone()->isolate(); |
| - bool ascii = compiler->ascii(); |
| + bool one_byte = compiler->one_byte(); |
| Label* backtrack = trace->backtrack(); |
| QuickCheckDetails* quick_check = trace->quick_check_performed(); |
| int element_count = elms_->length(); |
| @@ -3248,8 +3232,8 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler, |
| if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue; |
| EmitCharacterFunction* emit_function = NULL; |
| switch (pass) { |
| - case NON_ASCII_MATCH: |
| - DCHECK(ascii); |
| + case NON_LATIN1_MATCH: |
| + DCHECK(one_byte); |
| if (quarks[j] > String::kMaxOneByteCharCode) { |
| assembler->GoTo(backtrack); |
| return; |
| @@ -3284,14 +3268,8 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler, |
| if (first_element_checked && i == 0) continue; |
| if (DeterminedAlready(quick_check, elm.cp_offset())) continue; |
| RegExpCharacterClass* cc = elm.char_class(); |
| - EmitCharClass(assembler, |
| - cc, |
| - ascii, |
| - backtrack, |
| - cp_offset, |
| - *checked_up_to < cp_offset, |
| - preloaded, |
| - zone()); |
| + EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset, |
| + *checked_up_to < cp_offset, preloaded, zone()); |
| UpdateBoundsCheck(cp_offset, checked_up_to); |
| } |
| } |
| @@ -3332,9 +3310,9 @@ void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) { |
| return; |
| } |
| - if (compiler->ascii()) { |
| + if (compiler->one_byte()) { |
| int dummy = 0; |
| - TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy); |
| + TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy); |
| } |
| bool first_elt_done = false; |
| @@ -3390,7 +3368,7 @@ void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) { |
| // Adjust the offsets of the quick check performed information. This |
| // information is used to find out what we already determined about the |
| // characters by means of mask and compare. |
| - quick_check_performed_.Advance(by, compiler->ascii()); |
| + quick_check_performed_.Advance(by, compiler->one_byte()); |
| cp_offset_ += by; |
| if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) { |
| compiler->SetRegExpTooBig(); |
| @@ -3400,7 +3378,7 @@ void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) { |
| } |
| -void TextNode::MakeCaseIndependent(bool is_ascii) { |
| +void TextNode::MakeCaseIndependent(bool is_one_byte) { |
| int element_count = elms_->length(); |
| for (int i = 0; i < element_count; i++) { |
| TextElement elm = elms_->at(i); |
| @@ -3412,7 +3390,7 @@ void TextNode::MakeCaseIndependent(bool is_ascii) { |
| ZoneList<CharacterRange>* ranges = cc->ranges(zone()); |
| int range_count = ranges->length(); |
| for (int j = 0; j < range_count; j++) { |
| - ranges->at(j).AddCaseEquivalents(ranges, is_ascii, zone()); |
| + ranges->at(j).AddCaseEquivalents(ranges, is_one_byte, zone()); |
| } |
| } |
| } |
| @@ -3440,7 +3418,7 @@ RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode( |
| } |
| if (ranges->length() != 1) return NULL; |
| uint32_t max_char; |
| - if (compiler->ascii()) { |
| + if (compiler->one_byte()) { |
| max_char = String::kMaxOneByteCharCode; |
| } else { |
| max_char = String::kMaxUtf16CodeUnit; |
| @@ -3517,8 +3495,8 @@ int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler, |
| int eats_at_least) { |
| int preload_characters = Min(4, eats_at_least); |
| if (compiler->macro_assembler()->CanReadUnaligned()) { |
| - bool ascii = compiler->ascii(); |
| - if (ascii) { |
| + bool one_byte = compiler->one_byte(); |
| + if (one_byte) { |
| if (preload_characters > 4) preload_characters = 4; |
| // We can't preload 3 characters because there is no machine instruction |
| // to do that. We can't just load 4 because we could be reading |
| @@ -3644,7 +3622,7 @@ BoyerMooreLookahead::BoyerMooreLookahead( |
| int length, RegExpCompiler* compiler, Zone* zone) |
| : length_(length), |
| compiler_(compiler) { |
| - if (compiler->ascii()) { |
| + if (compiler->one_byte()) { |
| max_char_ = String::kMaxOneByteCharCode; |
| } else { |
| max_char_ = String::kMaxUtf16CodeUnit; |
| @@ -3712,8 +3690,9 @@ int BoyerMooreLookahead::FindBestInterval( |
| // dividing by 2 we switch off the skipping if the probability of skipping |
| // is less than 50%. This is because the multibyte mask-and-compare |
| // skipping in quickcheck is more likely to do well on this case. |
| - bool in_quickcheck_range = ((i - remembered_from < 4) || |
| - (compiler_->ascii() ? remembered_from <= 4 : remembered_from <= 2)); |
| + bool in_quickcheck_range = |
| + ((i - remembered_from < 4) || |
| + (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2)); |
| // Called 'probability' but it is only a rough estimate and can actually |
| // be outside the 0-kSize range. |
| int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency; |
| @@ -3931,8 +3910,7 @@ void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler, |
| if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) { |
| // Save some time by looking at most one machine word ahead. |
| state->eats_at_least_ = |
| - EatsAtLeast(compiler->ascii() ? 4 : 2, |
| - kRecursionBudget, |
| + EatsAtLeast(compiler->one_byte() ? 4 : 2, kRecursionBudget, |
| current_trace->at_start() == Trace::FALSE_VALUE); |
| } |
| state->preload_characters_ = |
| @@ -5347,12 +5325,11 @@ void CharacterRange::Split(ZoneList<CharacterRange>* base, |
| void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges, |
| - bool is_ascii, |
| - Zone* zone) { |
| + bool is_one_byte, Zone* zone) { |
| Isolate* isolate = zone->isolate(); |
| uc16 bottom = from(); |
| uc16 top = to(); |
| - if (is_ascii && !RangeContainsLatin1Equivalents(*this)) { |
| + if (is_one_byte && !RangeContainsLatin1Equivalents(*this)) { |
| if (bottom > String::kMaxOneByteCharCode) return; |
| if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode; |
| } |
| @@ -5771,7 +5748,7 @@ void TextNode::CalculateOffsets() { |
| void Analysis::VisitText(TextNode* that) { |
| if (ignore_case_) { |
| - that->MakeCaseIndependent(is_ascii_); |
| + that->MakeCaseIndependent(is_one_byte_); |
| } |
| EnsureAnalyzed(that->on_success()); |
| if (!has_failed()) { |
| @@ -6047,18 +6024,13 @@ void DispatchTableConstructor::VisitAction(ActionNode* that) { |
| RegExpEngine::CompilationResult RegExpEngine::Compile( |
| - RegExpCompileData* data, |
| - bool ignore_case, |
| - bool is_global, |
| - bool is_multiline, |
| - Handle<String> pattern, |
| - Handle<String> sample_subject, |
| - bool is_ascii, |
| - Zone* zone) { |
| + RegExpCompileData* data, bool ignore_case, bool is_global, |
| + bool is_multiline, Handle<String> pattern, Handle<String> sample_subject, |
| + bool is_one_byte, Zone* zone) { |
| if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) { |
| return IrregexpRegExpTooBig(zone->isolate()); |
| } |
| - RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii, zone); |
| + RegExpCompiler compiler(data->capture_count, ignore_case, is_one_byte, zone); |
| // Sample some characters from the middle of the string. |
| static const int kSampleSize = 128; |
| @@ -6105,18 +6077,18 @@ RegExpEngine::CompilationResult RegExpEngine::Compile( |
| node = loop_node; |
| } |
| } |
| - if (is_ascii) { |
| - node = node->FilterASCII(RegExpCompiler::kMaxRecursion, ignore_case); |
| + if (is_one_byte) { |
| + node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case); |
| // Do it again to propagate the new nodes to places where they were not |
| // put because they had not been calculated yet. |
| if (node != NULL) { |
| - node = node->FilterASCII(RegExpCompiler::kMaxRecursion, ignore_case); |
| + node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case); |
| } |
| } |
| if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone); |
| data->node = node; |
| - Analysis analysis(ignore_case, is_ascii); |
| + Analysis analysis(ignore_case, is_one_byte); |
| analysis.EnsureAnalyzed(node); |
| if (analysis.has_failed()) { |
| const char* error_message = analysis.error_message(); |
| @@ -6128,8 +6100,8 @@ RegExpEngine::CompilationResult RegExpEngine::Compile( |
| // Native regexp implementation. |
| NativeRegExpMacroAssembler::Mode mode = |
| - is_ascii ? NativeRegExpMacroAssembler::ASCII |
| - : NativeRegExpMacroAssembler::UC16; |
| + is_one_byte ? NativeRegExpMacroAssembler::LATIN1 |
| + : NativeRegExpMacroAssembler::UC16; |
| #if V8_TARGET_ARCH_IA32 |
| RegExpMacroAssemblerIA32 macro_assembler(mode, (data->capture_count + 1) * 2, |