| OLD | NEW |
| 1 // Copyright 2006-2009 the V8 project authors. All rights reserved. | 1 // Copyright 2006-2009 the V8 project authors. All rights reserved. |
| 2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without |
| 3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are |
| 4 // met: | 4 // met: |
| 5 // | 5 // |
| 6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright |
| 7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. |
| 8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above |
| 9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following |
| 10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided |
| (...skipping 1230 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1241 case Guard::GEQ: | 1241 case Guard::GEQ: |
| 1242 ASSERT(!trace->mentions_reg(guard->reg())); | 1242 ASSERT(!trace->mentions_reg(guard->reg())); |
| 1243 macro_assembler->IfRegisterLT(guard->reg(), | 1243 macro_assembler->IfRegisterLT(guard->reg(), |
| 1244 guard->value(), | 1244 guard->value(), |
| 1245 trace->backtrack()); | 1245 trace->backtrack()); |
| 1246 break; | 1246 break; |
| 1247 } | 1247 } |
| 1248 } | 1248 } |
| 1249 | 1249 |
| 1250 | 1250 |
| 1251 static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize; | |
| 1252 static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange; | |
| 1253 | |
| 1254 | |
| 1255 // Returns the number of characters in the equivalence class, omitting those | 1251 // Returns the number of characters in the equivalence class, omitting those |
| 1256 // that cannot occur in the source string because it is ASCII. | 1252 // that cannot occur in the source string because it is ASCII. |
| 1257 static int GetCaseIndependentLetters(uc16 character, | 1253 static int GetCaseIndependentLetters(Isolate* isolate, |
| 1254 uc16 character, |
| 1258 bool ascii_subject, | 1255 bool ascii_subject, |
| 1259 unibrow::uchar* letters) { | 1256 unibrow::uchar* letters) { |
| 1260 int length = uncanonicalize.get(character, '\0', letters); | 1257 int length = |
| 1258 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters); |
| 1261 // Unibrow returns 0 or 1 for characters where case independence is | 1259 // Unibrow returns 0 or 1 for characters where case independence is |
| 1262 // trivial. | 1260 // trivial. |
| 1263 if (length == 0) { | 1261 if (length == 0) { |
| 1264 letters[0] = character; | 1262 letters[0] = character; |
| 1265 length = 1; | 1263 length = 1; |
| 1266 } | 1264 } |
| 1267 if (!ascii_subject || character <= String::kMaxAsciiCharCode) { | 1265 if (!ascii_subject || character <= String::kMaxAsciiCharCode) { |
| 1268 return length; | 1266 return length; |
| 1269 } | 1267 } |
| 1270 // The standard requires that non-ASCII characters cannot have ASCII | 1268 // The standard requires that non-ASCII characters cannot have ASCII |
| 1271 // character codes in their equivalence class. | 1269 // character codes in their equivalence class. |
| 1272 return 0; | 1270 return 0; |
| 1273 } | 1271 } |
| 1274 | 1272 |
| 1275 | 1273 |
| 1276 static inline bool EmitSimpleCharacter(RegExpCompiler* compiler, | 1274 static inline bool EmitSimpleCharacter(Isolate* isolate, |
| 1275 RegExpCompiler* compiler, |
| 1277 uc16 c, | 1276 uc16 c, |
| 1278 Label* on_failure, | 1277 Label* on_failure, |
| 1279 int cp_offset, | 1278 int cp_offset, |
| 1280 bool check, | 1279 bool check, |
| 1281 bool preloaded) { | 1280 bool preloaded) { |
| 1282 RegExpMacroAssembler* assembler = compiler->macro_assembler(); | 1281 RegExpMacroAssembler* assembler = compiler->macro_assembler(); |
| 1283 bool bound_checked = false; | 1282 bool bound_checked = false; |
| 1284 if (!preloaded) { | 1283 if (!preloaded) { |
| 1285 assembler->LoadCurrentCharacter( | 1284 assembler->LoadCurrentCharacter( |
| 1286 cp_offset, | 1285 cp_offset, |
| 1287 on_failure, | 1286 on_failure, |
| 1288 check); | 1287 check); |
| 1289 bound_checked = true; | 1288 bound_checked = true; |
| 1290 } | 1289 } |
| 1291 assembler->CheckNotCharacter(c, on_failure); | 1290 assembler->CheckNotCharacter(c, on_failure); |
| 1292 return bound_checked; | 1291 return bound_checked; |
| 1293 } | 1292 } |
| 1294 | 1293 |
| 1295 | 1294 |
| 1296 // Only emits non-letters (things that don't have case). Only used for case | 1295 // Only emits non-letters (things that don't have case). Only used for case |
| 1297 // independent matches. | 1296 // independent matches. |
| 1298 static inline bool EmitAtomNonLetter(RegExpCompiler* compiler, | 1297 static inline bool EmitAtomNonLetter(Isolate* isolate, |
| 1298 RegExpCompiler* compiler, |
| 1299 uc16 c, | 1299 uc16 c, |
| 1300 Label* on_failure, | 1300 Label* on_failure, |
| 1301 int cp_offset, | 1301 int cp_offset, |
| 1302 bool check, | 1302 bool check, |
| 1303 bool preloaded) { | 1303 bool preloaded) { |
| 1304 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); | 1304 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); |
| 1305 bool ascii = compiler->ascii(); | 1305 bool ascii = compiler->ascii(); |
| 1306 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 1306 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
| 1307 int length = GetCaseIndependentLetters(c, ascii, chars); | 1307 int length = GetCaseIndependentLetters(isolate, c, ascii, chars); |
| 1308 if (length < 1) { | 1308 if (length < 1) { |
| 1309 // This can't match. Must be an ASCII subject and a non-ASCII character. | 1309 // This can't match. Must be an ASCII subject and a non-ASCII character. |
| 1310 // We do not need to do anything since the ASCII pass already handled this. | 1310 // We do not need to do anything since the ASCII pass already handled this. |
| 1311 return false; // Bounds not checked. | 1311 return false; // Bounds not checked. |
| 1312 } | 1312 } |
| 1313 bool checked = false; | 1313 bool checked = false; |
| 1314 // We handle the length > 1 case in a later pass. | 1314 // We handle the length > 1 case in a later pass. |
| 1315 if (length == 1) { | 1315 if (length == 1) { |
| 1316 if (ascii && c > String::kMaxAsciiCharCodeU) { | 1316 if (ascii && c > String::kMaxAsciiCharCodeU) { |
| 1317 // Can't match - see above. | 1317 // Can't match - see above. |
| (...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1359 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff, | 1359 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff, |
| 1360 diff, | 1360 diff, |
| 1361 mask, | 1361 mask, |
| 1362 on_failure); | 1362 on_failure); |
| 1363 return true; | 1363 return true; |
| 1364 } | 1364 } |
| 1365 return false; | 1365 return false; |
| 1366 } | 1366 } |
| 1367 | 1367 |
| 1368 | 1368 |
| 1369 typedef bool EmitCharacterFunction(RegExpCompiler* compiler, | 1369 typedef bool EmitCharacterFunction(Isolate* isolate, |
| 1370 RegExpCompiler* compiler, |
| 1370 uc16 c, | 1371 uc16 c, |
| 1371 Label* on_failure, | 1372 Label* on_failure, |
| 1372 int cp_offset, | 1373 int cp_offset, |
| 1373 bool check, | 1374 bool check, |
| 1374 bool preloaded); | 1375 bool preloaded); |
| 1375 | 1376 |
| 1376 // Only emits letters (things that have case). Only used for case independent | 1377 // Only emits letters (things that have case). Only used for case independent |
| 1377 // matches. | 1378 // matches. |
| 1378 static inline bool EmitAtomLetter(RegExpCompiler* compiler, | 1379 static inline bool EmitAtomLetter(Isolate* isolate, |
| 1380 RegExpCompiler* compiler, |
| 1379 uc16 c, | 1381 uc16 c, |
| 1380 Label* on_failure, | 1382 Label* on_failure, |
| 1381 int cp_offset, | 1383 int cp_offset, |
| 1382 bool check, | 1384 bool check, |
| 1383 bool preloaded) { | 1385 bool preloaded) { |
| 1384 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); | 1386 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); |
| 1385 bool ascii = compiler->ascii(); | 1387 bool ascii = compiler->ascii(); |
| 1386 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 1388 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
| 1387 int length = GetCaseIndependentLetters(c, ascii, chars); | 1389 int length = GetCaseIndependentLetters(isolate, c, ascii, chars); |
| 1388 if (length <= 1) return false; | 1390 if (length <= 1) return false; |
| 1389 // We may not need to check against the end of the input string | 1391 // We may not need to check against the end of the input string |
| 1390 // if this character lies before a character that matched. | 1392 // if this character lies before a character that matched. |
| 1391 if (!preloaded) { | 1393 if (!preloaded) { |
| 1392 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); | 1394 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); |
| 1393 } | 1395 } |
| 1394 Label ok; | 1396 Label ok; |
| 1395 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4); | 1397 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4); |
| 1396 switch (length) { | 1398 switch (length) { |
| 1397 case 2: { | 1399 case 2: { |
| (...skipping 380 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1778 // | 1780 // |
| 1779 // We iterate along the text object, building up for each character a | 1781 // We iterate along the text object, building up for each character a |
| 1780 // mask and value that can be used to test for a quick failure to match. | 1782 // mask and value that can be used to test for a quick failure to match. |
| 1781 // The masks and values for the positions will be combined into a single | 1783 // The masks and values for the positions will be combined into a single |
| 1782 // machine word for the current character width in order to be used in | 1784 // machine word for the current character width in order to be used in |
| 1783 // generating a quick check. | 1785 // generating a quick check. |
| 1784 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, | 1786 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, |
| 1785 RegExpCompiler* compiler, | 1787 RegExpCompiler* compiler, |
| 1786 int characters_filled_in, | 1788 int characters_filled_in, |
| 1787 bool not_at_start) { | 1789 bool not_at_start) { |
| 1790 Isolate* isolate = Isolate::Current(); |
| 1788 ASSERT(characters_filled_in < details->characters()); | 1791 ASSERT(characters_filled_in < details->characters()); |
| 1789 int characters = details->characters(); | 1792 int characters = details->characters(); |
| 1790 int char_mask; | 1793 int char_mask; |
| 1791 int char_shift; | 1794 int char_shift; |
| 1792 if (compiler->ascii()) { | 1795 if (compiler->ascii()) { |
| 1793 char_mask = String::kMaxAsciiCharCode; | 1796 char_mask = String::kMaxAsciiCharCode; |
| 1794 char_shift = 8; | 1797 char_shift = 8; |
| 1795 } else { | 1798 } else { |
| 1796 char_mask = String::kMaxUC16CharCode; | 1799 char_mask = String::kMaxUC16CharCode; |
| 1797 char_shift = 16; | 1800 char_shift = 16; |
| (...skipping 10 matching lines...) Expand all Loading... |
| 1808 // If we expect a non-ASCII character from an ASCII string, | 1811 // If we expect a non-ASCII character from an ASCII string, |
| 1809 // there is no way we can match. Not even case independent | 1812 // there is no way we can match. Not even case independent |
| 1810 // matching can turn an ASCII character into non-ASCII or | 1813 // matching can turn an ASCII character into non-ASCII or |
| 1811 // vice versa. | 1814 // vice versa. |
| 1812 details->set_cannot_match(); | 1815 details->set_cannot_match(); |
| 1813 pos->determines_perfectly = false; | 1816 pos->determines_perfectly = false; |
| 1814 return; | 1817 return; |
| 1815 } | 1818 } |
| 1816 if (compiler->ignore_case()) { | 1819 if (compiler->ignore_case()) { |
| 1817 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 1820 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
| 1818 int length = GetCaseIndependentLetters(c, compiler->ascii(), chars); | 1821 int length = GetCaseIndependentLetters(isolate, c, compiler->ascii(), |
| 1822 chars); |
| 1819 ASSERT(length != 0); // Can only happen if c > char_mask (see above). | 1823 ASSERT(length != 0); // Can only happen if c > char_mask (see above). |
| 1820 if (length == 1) { | 1824 if (length == 1) { |
| 1821 // This letter has no case equivalents, so it's nice and simple | 1825 // This letter has no case equivalents, so it's nice and simple |
| 1822 // and the mask-compare will determine definitely whether we have | 1826 // and the mask-compare will determine definitely whether we have |
| 1823 // a match at this character position. | 1827 // a match at this character position. |
| 1824 pos->mask = char_mask; | 1828 pos->mask = char_mask; |
| 1825 pos->value = c; | 1829 pos->value = c; |
| 1826 pos->determines_perfectly = true; | 1830 pos->determines_perfectly = true; |
| 1827 } else { | 1831 } else { |
| 1828 uint32_t common_bits = char_mask; | 1832 uint32_t common_bits = char_mask; |
| (...skipping 479 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2308 // loading characters, which means we do not need to recheck the bounds | 2312 // loading characters, which means we do not need to recheck the bounds |
| 2309 // up to the limit the quick check already checked. In addition the quick | 2313 // up to the limit the quick check already checked. In addition the quick |
| 2310 // check can have involved a mask and compare operation which may simplify | 2314 // check can have involved a mask and compare operation which may simplify |
| 2311 // or obviate the need for further checks at some character positions. | 2315 // or obviate the need for further checks at some character positions. |
| 2312 void TextNode::TextEmitPass(RegExpCompiler* compiler, | 2316 void TextNode::TextEmitPass(RegExpCompiler* compiler, |
| 2313 TextEmitPassType pass, | 2317 TextEmitPassType pass, |
| 2314 bool preloaded, | 2318 bool preloaded, |
| 2315 Trace* trace, | 2319 Trace* trace, |
| 2316 bool first_element_checked, | 2320 bool first_element_checked, |
| 2317 int* checked_up_to) { | 2321 int* checked_up_to) { |
| 2322 Isolate* isolate = Isolate::Current(); |
| 2318 RegExpMacroAssembler* assembler = compiler->macro_assembler(); | 2323 RegExpMacroAssembler* assembler = compiler->macro_assembler(); |
| 2319 bool ascii = compiler->ascii(); | 2324 bool ascii = compiler->ascii(); |
| 2320 Label* backtrack = trace->backtrack(); | 2325 Label* backtrack = trace->backtrack(); |
| 2321 QuickCheckDetails* quick_check = trace->quick_check_performed(); | 2326 QuickCheckDetails* quick_check = trace->quick_check_performed(); |
| 2322 int element_count = elms_->length(); | 2327 int element_count = elms_->length(); |
| 2323 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) { | 2328 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) { |
| 2324 TextElement elm = elms_->at(i); | 2329 TextElement elm = elms_->at(i); |
| 2325 int cp_offset = trace->cp_offset() + elm.cp_offset; | 2330 int cp_offset = trace->cp_offset() + elm.cp_offset; |
| 2326 if (elm.type == TextElement::ATOM) { | 2331 if (elm.type == TextElement::ATOM) { |
| 2327 Vector<const uc16> quarks = elm.data.u_atom->data(); | 2332 Vector<const uc16> quarks = elm.data.u_atom->data(); |
| (...skipping 15 matching lines...) Expand all Loading... |
| 2343 case SIMPLE_CHARACTER_MATCH: | 2348 case SIMPLE_CHARACTER_MATCH: |
| 2344 emit_function = &EmitSimpleCharacter; | 2349 emit_function = &EmitSimpleCharacter; |
| 2345 break; | 2350 break; |
| 2346 case CASE_CHARACTER_MATCH: | 2351 case CASE_CHARACTER_MATCH: |
| 2347 emit_function = &EmitAtomLetter; | 2352 emit_function = &EmitAtomLetter; |
| 2348 break; | 2353 break; |
| 2349 default: | 2354 default: |
| 2350 break; | 2355 break; |
| 2351 } | 2356 } |
| 2352 if (emit_function != NULL) { | 2357 if (emit_function != NULL) { |
| 2353 bool bound_checked = emit_function(compiler, | 2358 bool bound_checked = emit_function(isolate, |
| 2359 compiler, |
| 2354 quarks[j], | 2360 quarks[j], |
| 2355 backtrack, | 2361 backtrack, |
| 2356 cp_offset + j, | 2362 cp_offset + j, |
| 2357 *checked_up_to < cp_offset + j, | 2363 *checked_up_to < cp_offset + j, |
| 2358 preloaded); | 2364 preloaded); |
| 2359 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to); | 2365 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to); |
| 2360 } | 2366 } |
| 2361 } | 2367 } |
| 2362 } else { | 2368 } else { |
| 2363 ASSERT_EQ(elm.type, TextElement::CHAR_CLASS); | 2369 ASSERT_EQ(elm.type, TextElement::CHAR_CLASS); |
| (...skipping 1616 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3980 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase); | 3986 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase); |
| 3981 for (int i = 0; i < overlay.length(); i += 2) { | 3987 for (int i = 0; i < overlay.length(); i += 2) { |
| 3982 table.AddRange(CharacterRange(overlay[i], overlay[i+1]), | 3988 table.AddRange(CharacterRange(overlay[i], overlay[i+1]), |
| 3983 CharacterRangeSplitter::kInOverlay); | 3989 CharacterRangeSplitter::kInOverlay); |
| 3984 } | 3990 } |
| 3985 CharacterRangeSplitter callback(included, excluded); | 3991 CharacterRangeSplitter callback(included, excluded); |
| 3986 table.ForEach(&callback); | 3992 table.ForEach(&callback); |
| 3987 } | 3993 } |
| 3988 | 3994 |
| 3989 | 3995 |
| 3990 static void AddUncanonicals(ZoneList<CharacterRange>* ranges, | 3996 static void AddUncanonicals(Isolate* isolate, |
| 3997 ZoneList<CharacterRange>* ranges, |
| 3991 int bottom, | 3998 int bottom, |
| 3992 int top); | 3999 int top); |
| 3993 | 4000 |
| 3994 | 4001 |
| 3995 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges, | 4002 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges, |
| 3996 bool is_ascii) { | 4003 bool is_ascii) { |
| 4004 Isolate* isolate = Isolate::Current(); |
| 3997 uc16 bottom = from(); | 4005 uc16 bottom = from(); |
| 3998 uc16 top = to(); | 4006 uc16 top = to(); |
| 3999 if (is_ascii) { | 4007 if (is_ascii) { |
| 4000 if (bottom > String::kMaxAsciiCharCode) return; | 4008 if (bottom > String::kMaxAsciiCharCode) return; |
| 4001 if (top > String::kMaxAsciiCharCode) top = String::kMaxAsciiCharCode; | 4009 if (top > String::kMaxAsciiCharCode) top = String::kMaxAsciiCharCode; |
| 4002 } | 4010 } |
| 4003 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 4011 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
| 4004 if (top == bottom) { | 4012 if (top == bottom) { |
| 4005 // If this is a singleton we just expand the one character. | 4013 // If this is a singleton we just expand the one character. |
| 4006 int length = uncanonicalize.get(bottom, '\0', chars); | 4014 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars); |
| 4007 for (int i = 0; i < length; i++) { | 4015 for (int i = 0; i < length; i++) { |
| 4008 uc32 chr = chars[i]; | 4016 uc32 chr = chars[i]; |
| 4009 if (chr != bottom) { | 4017 if (chr != bottom) { |
| 4010 ranges->Add(CharacterRange::Singleton(chars[i])); | 4018 ranges->Add(CharacterRange::Singleton(chars[i])); |
| 4011 } | 4019 } |
| 4012 } | 4020 } |
| 4013 } else if (bottom <= kRangeCanonicalizeMax && | 4021 } else if (bottom <= kRangeCanonicalizeMax && |
| 4014 top <= kRangeCanonicalizeMax) { | 4022 top <= kRangeCanonicalizeMax) { |
| 4015 // If this is a range we expand the characters block by block, | 4023 // If this is a range we expand the characters block by block, |
| 4016 // expanding contiguous subranges (blocks) one at a time. | 4024 // expanding contiguous subranges (blocks) one at a time. |
| 4017 // The approach is as follows. For a given start character we | 4025 // The approach is as follows. For a given start character we |
| 4018 // look up the block that contains it, for instance 'a' if the | 4026 // look up the block that contains it, for instance 'a' if the |
| 4019 // start character is 'c'. A block is characterized by the property | 4027 // start character is 'c'. A block is characterized by the property |
| 4020 // that all characters uncanonicalize in the same way as the first | 4028 // that all characters uncanonicalize in the same way as the first |
| 4021 // element, except that each entry in the result is incremented | 4029 // element, except that each entry in the result is incremented |
| 4022 // by the distance from the first element. So a-z is a block | 4030 // by the distance from the first element. So a-z is a block |
| 4023 // because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter | 4031 // because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter |
| 4024 // uncanonicalizes to ['a' + k, 'A' + k]. | 4032 // uncanonicalizes to ['a' + k, 'A' + k]. |
| 4025 // Once we've found the start point we look up its uncanonicalization | 4033 // Once we've found the start point we look up its uncanonicalization |
| 4026 // and produce a range for each element. For instance for [c-f] | 4034 // and produce a range for each element. For instance for [c-f] |
| 4027 // we look up ['a', 'A'] and produce [c-f] and [C-F]. We then only | 4035 // we look up ['a', 'A'] and produce [c-f] and [C-F]. We then only |
| 4028 // add a range if it is not already contained in the input, so [c-f] | 4036 // add a range if it is not already contained in the input, so [c-f] |
| 4029 // will be skipped but [C-F] will be added. If this range is not | 4037 // will be skipped but [C-F] will be added. If this range is not |
| 4030 // completely contained in a block we do this for all the blocks | 4038 // completely contained in a block we do this for all the blocks |
| 4031 // covered by the range. | 4039 // covered by the range. |
| 4032 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 4040 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
| 4033 // First, look up the block that contains the 'bottom' character. | 4041 // First, look up the block that contains the 'bottom' character. |
| 4034 int length = canonrange.get(bottom, '\0', range); | 4042 int length = isolate->jsregexp_canonrange()->get(bottom, '\0', range); |
| 4035 if (length == 0) { | 4043 if (length == 0) { |
| 4036 range[0] = bottom; | 4044 range[0] = bottom; |
| 4037 } else { | 4045 } else { |
| 4038 ASSERT_EQ(1, length); | 4046 ASSERT_EQ(1, length); |
| 4039 } | 4047 } |
| 4040 int pos = bottom; | 4048 int pos = bottom; |
| 4041 // The start of the current block. Note that except for the first | 4049 // The start of the current block. Note that except for the first |
| 4042 // iteration 'start' is always equal to 'pos'. | 4050 // iteration 'start' is always equal to 'pos'. |
| 4043 int start; | 4051 int start; |
| 4044 // If it is not the start point of a block the entry contains the | 4052 // If it is not the start point of a block the entry contains the |
| 4045 // offset of the character from the start point. | 4053 // offset of the character from the start point. |
| 4046 if ((range[0] & kStartMarker) == 0) { | 4054 if ((range[0] & kStartMarker) == 0) { |
| 4047 start = pos - range[0]; | 4055 start = pos - range[0]; |
| 4048 } else { | 4056 } else { |
| 4049 start = pos; | 4057 start = pos; |
| 4050 } | 4058 } |
| 4051 // Then we add the ranges one at a time, incrementing the current | 4059 // Then we add the ranges one at a time, incrementing the current |
| 4052 // position to be after the last block each time. The position | 4060 // position to be after the last block each time. The position |
| 4053 // always points to the start of a block. | 4061 // always points to the start of a block. |
| 4054 while (pos < top) { | 4062 while (pos < top) { |
| 4055 length = canonrange.get(start, '\0', range); | 4063 length = isolate->jsregexp_canonrange()->get(start, '\0', range); |
| 4056 if (length == 0) { | 4064 if (length == 0) { |
| 4057 range[0] = start; | 4065 range[0] = start; |
| 4058 } else { | 4066 } else { |
| 4059 ASSERT_EQ(1, length); | 4067 ASSERT_EQ(1, length); |
| 4060 } | 4068 } |
| 4061 ASSERT((range[0] & kStartMarker) != 0); | 4069 ASSERT((range[0] & kStartMarker) != 0); |
| 4062 // The start point of a block contains the distance to the end | 4070 // The start point of a block contains the distance to the end |
| 4063 // of the range. | 4071 // of the range. |
| 4064 int block_end = start + (range[0] & kPayloadMask) - 1; | 4072 int block_end = start + (range[0] & kPayloadMask) - 1; |
| 4065 int end = (block_end > top) ? top : block_end; | 4073 int end = (block_end > top) ? top : block_end; |
| 4066 length = uncanonicalize.get(start, '\0', range); | 4074 length = isolate->jsregexp_uncanonicalize()->get(start, '\0', range); |
| 4067 for (int i = 0; i < length; i++) { | 4075 for (int i = 0; i < length; i++) { |
| 4068 uc32 c = range[i]; | 4076 uc32 c = range[i]; |
| 4069 uc16 range_from = c + (pos - start); | 4077 uc16 range_from = c + (pos - start); |
| 4070 uc16 range_to = c + (end - start); | 4078 uc16 range_to = c + (end - start); |
| 4071 if (!(bottom <= range_from && range_to <= top)) { | 4079 if (!(bottom <= range_from && range_to <= top)) { |
| 4072 ranges->Add(CharacterRange(range_from, range_to)); | 4080 ranges->Add(CharacterRange(range_from, range_to)); |
| 4073 } | 4081 } |
| 4074 } | 4082 } |
| 4075 start = pos = block_end + 1; | 4083 start = pos = block_end + 1; |
| 4076 } | 4084 } |
| 4077 } else { | 4085 } else { |
| 4078 // Unibrow ranges don't work for high characters due to the "2^11 bug". | 4086 // Unibrow ranges don't work for high characters due to the "2^11 bug". |
| 4079 // Therefore we do something dumber for these ranges. | 4087 // Therefore we do something dumber for these ranges. |
| 4080 AddUncanonicals(ranges, bottom, top); | 4088 AddUncanonicals(isolate, ranges, bottom, top); |
| 4081 } | 4089 } |
| 4082 } | 4090 } |
| 4083 | 4091 |
| 4084 | 4092 |
| 4085 bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) { | 4093 bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) { |
| 4086 ASSERT_NOT_NULL(ranges); | 4094 ASSERT_NOT_NULL(ranges); |
| 4087 int n = ranges->length(); | 4095 int n = ranges->length(); |
| 4088 if (n <= 1) return true; | 4096 if (n <= 1) return true; |
| 4089 int max = ranges->at(0).to(); | 4097 int max = ranges->at(0).to(); |
| 4090 for (int i = 1; i < n; i++) { | 4098 for (int i = 1; i < n; i++) { |
| (...skipping 79 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4170 result.SetElementsInSecondSet(); | 4178 result.SetElementsInSecondSet(); |
| 4171 } else if (j < range->length()) { | 4179 } else if (j < range->length()) { |
| 4172 // Argument range contains something not in word range. | 4180 // Argument range contains something not in word range. |
| 4173 result.SetElementsInFirstSet(); | 4181 result.SetElementsInFirstSet(); |
| 4174 } | 4182 } |
| 4175 | 4183 |
| 4176 return result; | 4184 return result; |
| 4177 } | 4185 } |
| 4178 | 4186 |
| 4179 | 4187 |
| 4180 static void AddUncanonicals(ZoneList<CharacterRange>* ranges, | 4188 static void AddUncanonicals(Isolate* isolate, |
| 4189 ZoneList<CharacterRange>* ranges, |
| 4181 int bottom, | 4190 int bottom, |
| 4182 int top) { | 4191 int top) { |
| 4183 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 4192 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
| 4184 // Zones with no case mappings. There is a DEBUG-mode loop to assert that | 4193 // Zones with no case mappings. There is a DEBUG-mode loop to assert that |
| 4185 // this table is correct. | 4194 // this table is correct. |
| 4186 // 0x0600 - 0x0fff | 4195 // 0x0600 - 0x0fff |
| 4187 // 0x1100 - 0x1cff | 4196 // 0x1100 - 0x1cff |
| 4188 // 0x2000 - 0x20ff | 4197 // 0x2000 - 0x20ff |
| 4189 // 0x2200 - 0x23ff | 4198 // 0x2200 - 0x23ff |
| 4190 // 0x2500 - 0x2bff | 4199 // 0x2500 - 0x2bff |
| (...skipping 17 matching lines...) Expand all Loading... |
| 4208 if (top <= CharacterRange::kRangeCanonicalizeMax) { | 4217 if (top <= CharacterRange::kRangeCanonicalizeMax) { |
| 4209 CharacterRange range(bottom, top); | 4218 CharacterRange range(bottom, top); |
| 4210 range.AddCaseEquivalents(ranges, false); | 4219 range.AddCaseEquivalents(ranges, false); |
| 4211 return; | 4220 return; |
| 4212 } | 4221 } |
| 4213 | 4222 |
| 4214 // Split up very large ranges. This helps remove ranges where there are no | 4223 // Split up very large ranges. This helps remove ranges where there are no |
| 4215 // case mappings. | 4224 // case mappings. |
| 4216 for (int i = 0; i < boundary_count; i++) { | 4225 for (int i = 0; i < boundary_count; i++) { |
| 4217 if (bottom < boundaries[i] && top >= boundaries[i]) { | 4226 if (bottom < boundaries[i] && top >= boundaries[i]) { |
| 4218 AddUncanonicals(ranges, bottom, boundaries[i] - 1); | 4227 AddUncanonicals(isolate, ranges, bottom, boundaries[i] - 1); |
| 4219 AddUncanonicals(ranges, boundaries[i], top); | 4228 AddUncanonicals(isolate, ranges, boundaries[i], top); |
| 4220 return; | 4229 return; |
| 4221 } | 4230 } |
| 4222 } | 4231 } |
| 4223 | 4232 |
| 4224 // If we are completely in a zone with no case mappings then we are done. | 4233 // If we are completely in a zone with no case mappings then we are done. |
| 4225 // We start at 2 so as not to except the ASCII range from mappings. | 4234 // We start at 2 so as not to except the ASCII range from mappings. |
| 4226 for (int i = kFirstRealCaselessZoneIndex; i < boundary_count; i += 2) { | 4235 for (int i = kFirstRealCaselessZoneIndex; i < boundary_count; i += 2) { |
| 4227 if (bottom >= boundaries[i] && top < boundaries[i + 1]) { | 4236 if (bottom >= boundaries[i] && top < boundaries[i + 1]) { |
| 4228 #ifdef DEBUG | 4237 #ifdef DEBUG |
| 4229 for (int j = bottom; j <= top; j++) { | 4238 for (int j = bottom; j <= top; j++) { |
| 4230 unsigned current_char = j; | 4239 unsigned current_char = j; |
| 4231 int length = uncanonicalize.get(current_char, '\0', chars); | 4240 int length = isolate->jsregexp_uncanonicalize()->get(current_char, |
| 4241 '\0', chars); |
| 4232 for (int k = 0; k < length; k++) { | 4242 for (int k = 0; k < length; k++) { |
| 4233 ASSERT(chars[k] == current_char); | 4243 ASSERT(chars[k] == current_char); |
| 4234 } | 4244 } |
| 4235 } | 4245 } |
| 4236 #endif | 4246 #endif |
| 4237 return; | 4247 return; |
| 4238 } | 4248 } |
| 4239 } | 4249 } |
| 4240 | 4250 |
| 4241 // Step through the range finding equivalent characters. | 4251 // Step through the range finding equivalent characters. |
| 4242 ZoneList<unibrow::uchar> *characters = new ZoneList<unibrow::uchar>(100); | 4252 ZoneList<unibrow::uchar> *characters = new ZoneList<unibrow::uchar>(100); |
| 4243 for (int i = bottom; i <= top; i++) { | 4253 for (int i = bottom; i <= top; i++) { |
| 4244 int length = uncanonicalize.get(i, '\0', chars); | 4254 int length = isolate->jsregexp_uncanonicalize()->get(i, '\0', chars); |
| 4245 for (int j = 0; j < length; j++) { | 4255 for (int j = 0; j < length; j++) { |
| 4246 uc32 chr = chars[j]; | 4256 uc32 chr = chars[j]; |
| 4247 if (chr != i && (chr < bottom || chr > top)) { | 4257 if (chr != i && (chr < bottom || chr > top)) { |
| 4248 characters->Add(chr); | 4258 characters->Add(chr); |
| 4249 } | 4259 } |
| 4250 } | 4260 } |
| 4251 } | 4261 } |
| 4252 | 4262 |
| 4253 // Step through the equivalent characters finding simple ranges and | 4263 // Step through the equivalent characters finding simple ranges and |
| 4254 // adding ranges to the character class. | 4264 // adding ranges to the character class. |
| (...skipping 999 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 5254 RegExpMacroAssemblerIrregexp macro_assembler(codes); | 5264 RegExpMacroAssemblerIrregexp macro_assembler(codes); |
| 5255 #endif // V8_INTERPRETED_REGEXP | 5265 #endif // V8_INTERPRETED_REGEXP |
| 5256 | 5266 |
| 5257 return compiler.Assemble(&macro_assembler, | 5267 return compiler.Assemble(&macro_assembler, |
| 5258 node, | 5268 node, |
| 5259 data->capture_count, | 5269 data->capture_count, |
| 5260 pattern); | 5270 pattern); |
| 5261 } | 5271 } |
| 5262 | 5272 |
| 5263 | 5273 |
| 5264 int OffsetsVector::static_offsets_vector_[ | |
| 5265 OffsetsVector::kStaticOffsetsVectorSize]; | |
| 5266 | |
| 5267 }} // namespace v8::internal | 5274 }} // namespace v8::internal |
| OLD | NEW |