src/runtime/runtime-i18n.cc - Issue 1971943002: Make normalize, collator:compare and breakiterator a bit more efficient

Side by Side Diff: src/runtime/runtime-i18n.cc

Issue 1971943002: Make normalize, collator:compare and breakiterator a bit more efficient (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master

Patch Set: use RUNTIME_ASSERT Created 4 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2014 the V8 project authors. All rights reserved.	1 // Copyright 2014 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5	5

6 #ifdef V8_I18N_SUPPORT	6 #ifdef V8_I18N_SUPPORT

7 #include "src/runtime/runtime-utils.h"	7 #include "src/runtime/runtime-utils.h"

8	8

9 #include "src/api.h"	9 #include "src/api.h"

10 #include "src/api-natives.h"	10 #include "src/api-natives.h"

11 #include "src/arguments.h"	11 #include "src/arguments.h"

12 #include "src/factory.h"	12 #include "src/factory.h"

13 #include "src/i18n.h"	13 #include "src/i18n.h"

14 #include "src/isolate-inl.h"	14 #include "src/isolate-inl.h"

15 #include "src/messages.h"	15 #include "src/messages.h"

16	16

17 #include "unicode/brkiter.h"	17 #include "unicode/brkiter.h"

18 #include "unicode/calendar.h"	18 #include "unicode/calendar.h"

19 #include "unicode/coll.h"	19 #include "unicode/coll.h"

20 #include "unicode/curramt.h"	20 #include "unicode/curramt.h"

21 #include "unicode/datefmt.h"	21 #include "unicode/datefmt.h"

22 #include "unicode/dcfmtsym.h"	22 #include "unicode/dcfmtsym.h"

23 #include "unicode/decimfmt.h"	23 #include "unicode/decimfmt.h"

24 #include "unicode/dtfmtsym.h"	24 #include "unicode/dtfmtsym.h"

25 #include "unicode/dtptngen.h"	25 #include "unicode/dtptngen.h"

26 #include "unicode/locid.h"	26 #include "unicode/locid.h"

	27 #include "unicode/normalizer2.h"

27 #include "unicode/numfmt.h"	28 #include "unicode/numfmt.h"

28 #include "unicode/numsys.h"	29 #include "unicode/numsys.h"

29 #include "unicode/rbbi.h"	30 #include "unicode/rbbi.h"

30 #include "unicode/smpdtfmt.h"	31 #include "unicode/smpdtfmt.h"

31 #include "unicode/timezone.h"	32 #include "unicode/timezone.h"

32 #include "unicode/translit.h"	33 #include "unicode/translit.h"

33 #include "unicode/uchar.h"	34 #include "unicode/uchar.h"

34 #include "unicode/ucol.h"	35 #include "unicode/ucol.h"

35 #include "unicode/ucurr.h"	36 #include "unicode/ucurr.h"

36 #include "unicode/uloc.h"	37 #include "unicode/uloc.h"

37 #include "unicode/unistr.h"	38 #include "unicode/unistr.h"

38 #include "unicode/unum.h"	39 #include "unicode/unum.h"

39 #include "unicode/uversion.h"	40 #include "unicode/uversion.h"

40	41

41	42

42 namespace v8 {	43 namespace v8 {

43 namespace internal {	44 namespace internal {

	45 namespace {

	46

	47 const UChar* GetUCharBufferFromFlat(const String::FlatContent& flat,

	48 base::SmartArrayPointer<uc16>* dest,

	49 int32_t length) {

	50 DCHECK(flat.IsFlat());

	51 if (flat.IsOneByte()) {

	52 if (dest->is_empty()) {

	53 dest->Reset(NewArray<uc16>(length));

	54 CopyChars(dest->get(), flat.ToOneByteVector().start(), length);

	55 }

	56 return reinterpret_cast<const UChar*>(dest->get());

	57 } else {

	58 return reinterpret_cast<const UChar*>(flat.ToUC16Vector().start());

	59 }

	60 }

	61

	62 } // namespace

44	63

45 RUNTIME_FUNCTION(Runtime_CanonicalizeLanguageTag) {	64 RUNTIME_FUNCTION(Runtime_CanonicalizeLanguageTag) {

46 HandleScope scope(isolate);	65 HandleScope scope(isolate);

47 Factory* factory = isolate->factory();	66 Factory* factory = isolate->factory();

48	67

49 DCHECK(args.length() == 1);	68 DCHECK(args.length() == 1);

50 CONVERT_ARG_HANDLE_CHECKED(String, locale_id_str, 0);	69 CONVERT_ARG_HANDLE_CHECKED(String, locale_id_str, 0);

51	70

52 v8::String::Utf8Value locale_id(v8::Utils::ToLocal(locale_id_str));	71 v8::String::Utf8Value locale_id(v8::Utils::ToLocal(locale_id_str));

53	72

(...skipping 496 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
550	569

551 DCHECK(args.length() == 3);	570 DCHECK(args.length() == 3);

552	571

553 CONVERT_ARG_HANDLE_CHECKED(JSObject, collator_holder, 0);	572 CONVERT_ARG_HANDLE_CHECKED(JSObject, collator_holder, 0);

554 CONVERT_ARG_HANDLE_CHECKED(String, string1, 1);	573 CONVERT_ARG_HANDLE_CHECKED(String, string1, 1);

555 CONVERT_ARG_HANDLE_CHECKED(String, string2, 2);	574 CONVERT_ARG_HANDLE_CHECKED(String, string2, 2);

556	575

557 icu::Collator* collator = Collator::UnpackCollator(isolate, collator_holder);	576 icu::Collator* collator = Collator::UnpackCollator(isolate, collator_holder);

558 if (!collator) return isolate->ThrowIllegalOperation();	577 if (!collator) return isolate->ThrowIllegalOperation();

559	578

560 v8::String::Value string_value1(v8::Utils::ToLocal(string1));	579 string1 = String::Flatten(string1);

561 v8::String::Value string_value2(v8::Utils::ToLocal(string2));	580 string2 = String::Flatten(string2);

562 const UChar* u_string1 = reinterpret_cast<const UChar*>(*string_value1);	581 DisallowHeapAllocation no_gc;

563 const UChar* u_string2 = reinterpret_cast<const UChar*>(*string_value2);	582 int32_t length1 = string1->length();

	583 int32_t length2 = string2->length();

	584 String::FlatContent flat1 = string1->GetFlatContent();

	585 String::FlatContent flat2 = string2->GetFlatContent();

	586 base::SmartArrayPointer<uc16> sap1;

	587 base::SmartArrayPointer<uc16> sap2;

	588 const UChar* string_val1 = GetUCharBufferFromFlat(flat1, &sap1, length1);

	589 const UChar* string_val2 = GetUCharBufferFromFlat(flat2, &sap2, length2);

564 UErrorCode status = U_ZERO_ERROR;	590 UErrorCode status = U_ZERO_ERROR;

565 UCollationResult result =	591 UCollationResult result =

566 collator->compare(u_string1, string_value1.length(), u_string2,	592 collator->compare(string_val1, length1, string_val2, length2, status);

567 string_value2.length(), status);

568 if (U_FAILURE(status)) return isolate->ThrowIllegalOperation();	593 if (U_FAILURE(status)) return isolate->ThrowIllegalOperation();

569	594

570 return *isolate->factory()->NewNumberFromInt(result);	595 return *isolate->factory()->NewNumberFromInt(result);

571 }	596 }

572	597

573	598

574 RUNTIME_FUNCTION(Runtime_StringNormalize) {	599 RUNTIME_FUNCTION(Runtime_StringNormalize) {

575 HandleScope scope(isolate);	600 HandleScope scope(isolate);

576 static const UNormalizationMode normalizationForms[] = {	601 static const struct {

577 UNORM_NFC, UNORM_NFD, UNORM_NFKC, UNORM_NFKD};	602 const char* name;

	603 UNormalization2Mode mode;

	604 } normalizationForms[] = {

	605 {"nfc", UNORM2_COMPOSE},

	606 {"nfc", UNORM2_DECOMPOSE},

	607 {"nfkc", UNORM2_COMPOSE},

	608 {"nfkc", UNORM2_DECOMPOSE},

	609 };

578	610

579 DCHECK(args.length() == 2);	611 DCHECK(args.length() == 2);

580	612

581 CONVERT_ARG_HANDLE_CHECKED(String, stringValue, 0);	613 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);

582 CONVERT_NUMBER_CHECKED(int, form_id, Int32, args[1]);	614 CONVERT_NUMBER_CHECKED(int, form_id, Int32, args[1]);

583 RUNTIME_ASSERT(form_id >= 0 &&	615 RUNTIME_ASSERT(form_id >= 0 &&

584 static_cast<size_t>(form_id) < arraysize(normalizationForms));	616 static_cast<size_t>(form_id) < arraysize(normalizationForms));

585	617

586 v8::String::Value string_value(v8::Utils::ToLocal(stringValue));	618 int length = s->length();

587 const UChar* u_value = reinterpret_cast<const UChar*>(*string_value);	619 s = String::Flatten(s);

	620 icu::UnicodeString result;

	621 base::SmartArrayPointer<uc16> sap;

	622 UErrorCode status = U_ZERO_ERROR;

	623 {

	624 DisallowHeapAllocation no_gc;

	625 String::FlatContent flat = s->GetFlatContent();

	626 const UChar* src = GetUCharBufferFromFlat(flat, &sap, length);

	627 icu::UnicodeString input(false, src, length);

	628 // Getting a singleton. Should not free it.

	629 const icu::Normalizer2* normalizer =

	630 icu::Normalizer2::getInstance(nullptr, normalizationForms[form_id].name,

	631 normalizationForms[form_id].mode, status);

	632 DCHECK(U_SUCCESS(status));

	633 RUNTIME_ASSERT(normalizer != nullptr);

	634 int32_t normalized_prefix_length =

	635 normalizer->spanQuickCheckYes(input, status);

	636 // Quick return if the input is already normalized.

	637 if (length == normalized_prefix_length) return *s;

	638 icu::UnicodeString unnormalized =

	639 input.tempSubString(normalized_prefix_length);

	640 // Read-only alias of the normalized prefix.

	641 result.setTo(false, input.getBuffer(), normalized_prefix_length);

	642 // copy-on-write; normalize the suffix and append to |result|.

	643 normalizer->normalizeSecondAndAppend(result, unnormalized, status);

	644 }

588	645

589 // TODO(mnita): check Normalizer2 (not available in ICU 46)

590 UErrorCode status = U_ZERO_ERROR;

591 icu::UnicodeString input(false, u_value, string_value.length());

592 icu::UnicodeString result;

593 icu::Normalizer::normalize(input, normalizationForms[form_id], 0, result,

594 status);

595 if (U_FAILURE(status)) {	646 if (U_FAILURE(status)) {

596 return isolate->heap()->undefined_value();	647 return isolate->heap()->undefined_value();

597 }	648 }

598	649

599 Handle<String> result_str;	650 Handle<String> result_str;

600 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(	651 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(

601 isolate, result_str,	652 isolate, result_str,

602 isolate->factory()->NewStringFromTwoByte(Vector<const uint16_t>(	653 isolate->factory()->NewStringFromTwoByte(Vector<const uint16_t>(

603 reinterpret_cast<const uint16_t*>(result.getBuffer()),	654 reinterpret_cast<const uint16_t*>(result.getBuffer()),

604 result.length())));	655 result.length())));

(...skipping 53 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
658 CONVERT_ARG_HANDLE_CHECKED(String, text, 1);	709 CONVERT_ARG_HANDLE_CHECKED(String, text, 1);

659	710

660 icu::BreakIterator* break_iterator =	711 icu::BreakIterator* break_iterator =

661 BreakIterator::UnpackBreakIterator(isolate, break_iterator_holder);	712 BreakIterator::UnpackBreakIterator(isolate, break_iterator_holder);

662 if (!break_iterator) return isolate->ThrowIllegalOperation();	713 if (!break_iterator) return isolate->ThrowIllegalOperation();

663	714

664 icu::UnicodeString* u_text = reinterpret_cast<icu::UnicodeString*>(	715 icu::UnicodeString* u_text = reinterpret_cast<icu::UnicodeString*>(

665 break_iterator_holder->GetInternalField(1));	716 break_iterator_holder->GetInternalField(1));

666 delete u_text;	717 delete u_text;

667	718

668 v8::String::Value text_value(v8::Utils::ToLocal(text));	719 int length = text->length();

669 u_text = new icu::UnicodeString(reinterpret_cast<const UChar*>(*text_value),	720 text = String::Flatten(text);

670 text_value.length());	721 DisallowHeapAllocation no_gc;

	722 String::FlatContent flat = text->GetFlatContent();

	723 base::SmartArrayPointer<uc16> sap;

	724 const UChar* text_value = GetUCharBufferFromFlat(flat, &sap, length);

	725 u_text = new icu::UnicodeString(text_value, length);

671 break_iterator_holder->SetInternalField(1, reinterpret_cast<Smi*>(u_text));	726 break_iterator_holder->SetInternalField(1, reinterpret_cast<Smi*>(u_text));

672	727

673 break_iterator->setText(*u_text);	728 break_iterator->setText(*u_text);

674	729

675 return isolate->heap()->undefined_value();	730 return isolate->heap()->undefined_value();

676 }	731 }

677	732

678	733

679 RUNTIME_FUNCTION(Runtime_BreakIteratorFirst) {	734 RUNTIME_FUNCTION(Runtime_BreakIteratorFirst) {

680 HandleScope scope(isolate);	735 HandleScope scope(isolate);

(...skipping 76 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
757 const char* transliterator_id) {	812 const char* transliterator_id) {

758 UErrorCode status = U_ZERO_ERROR;	813 UErrorCode status = U_ZERO_ERROR;

759 base::SmartPointer<icu::Transliterator> translit(	814 base::SmartPointer<icu::Transliterator> translit(

760 icu::Transliterator::createInstance(	815 icu::Transliterator::createInstance(

761 icu::UnicodeString(transliterator_id, -1, US_INV), UTRANS_FORWARD,	816 icu::UnicodeString(transliterator_id, -1, US_INV), UTRANS_FORWARD,

762 status));	817 status));

763 if (U_FAILURE(status)) return;	818 if (U_FAILURE(status)) return;

764 translit->transliterate(*input);	819 translit->transliterate(*input);

765 }	820 }

766	821

767 const UChar* GetUCharBufferFromFlat(const String::FlatContent& flat,

768 base::SmartArrayPointer<uc16>* dest,

769 int32_t length) {

770 DCHECK(flat.IsFlat());

771 if (flat.IsOneByte()) {

772 if (dest->is_empty()) {

773 dest->Reset(NewArray<uc16>(length));

774 CopyChars(dest->get(), flat.ToOneByteVector().start(), length);

775 }

776 return reinterpret_cast<const UChar*>(dest->get());

777 } else {

778 return reinterpret_cast<const UChar*>(flat.ToUC16Vector().start());

779 }

780 }

781

782 MUST_USE_RESULT Object* LocaleConvertCase(Handle<String> s, Isolate* isolate,	822 MUST_USE_RESULT Object* LocaleConvertCase(Handle<String> s, Isolate* isolate,

783 bool is_to_upper, const char* lang) {	823 bool is_to_upper, const char* lang) {

784 int32_t src_length = s->length();	824 int32_t src_length = s->length();

785	825

786 // Greek uppercasing has to be done via transliteration.	826 // Greek uppercasing has to be done via transliteration.

787 // TODO(jshin): Drop this special-casing once ICU's regular case conversion	827 // TODO(jshin): Drop this special-casing once ICU's regular case conversion

788 // API supports Greek uppercasing. See	828 // API supports Greek uppercasing. See

789 // http://bugs.icu-project.org/trac/ticket/10582 .	829 // http://bugs.icu-project.org/trac/ticket/10582 .

790 // In the meantime, if there's no Greek character in |s|, call this	830 // In the meantime, if there's no Greek character in |s|, call this

791 // function again with the root locale (lang="").	831 // function again with the root locale (lang="").

(...skipping 310 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1102 // mapping of ASCII range characters are different in those locales.	1142 // mapping of ASCII range characters are different in those locales.

1103 // Greek (el) does not require any adjustment, though.	1143 // Greek (el) does not require any adjustment, though.

1104 return LocaleConvertCase(s, isolate, is_upper,	1144 return LocaleConvertCase(s, isolate, is_upper,

1105 reinterpret_cast<const char*>(lang_str));	1145 reinterpret_cast<const char*>(lang_str));

1106 }	1146 }

1107	1147

1108 } // namespace internal	1148 } // namespace internal

1109 } // namespace v8	1149 } // namespace v8

1110	1150

1111 #endif // V8_I18N_SUPPORT	1151 #endif // V8_I18N_SUPPORT

OLD	NEW

« no previous file with comments | « no previous file | no next file » | no next file with comments »