runtime/vm/dart_api_message.cc - Issue 11280150: Add support for surrogates when serializing and deserializing for native ports

Side by Side Diff: runtime/vm/dart_api_message.cc

Issue 11280150: Add support for surrogates when serializing and deserializing for native ports (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: Added Utf16::CodePointIterator Created 8 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file	1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file

2 // for details. All rights reserved. Use of this source code is governed by a	2 // for details. All rights reserved. Use of this source code is governed by a

3 // BSD-style license that can be found in the LICENSE file.	3 // BSD-style license that can be found in the LICENSE file.

4	4

5 #include "vm/dart_api_message.h"	5 #include "vm/dart_api_message.h"

6 #include "vm/object.h"	6 #include "vm/object.h"

7 #include "vm/snapshot_ids.h"	7 #include "vm/snapshot_ids.h"

8 #include "vm/symbols.h"	8 #include "vm/symbols.h"

9 #include "vm/unicode.h"	9 #include "vm/unicode.h"

10	10

(...skipping 371 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
382 ::free(latin1);	382 ::free(latin1);

383 return object;	383 return object;

384 }	384 }

385 case kTwoByteStringCid: {	385 case kTwoByteStringCid: {

386 intptr_t len = ReadSmiValue();	386 intptr_t len = ReadSmiValue();

387 intptr_t hash = ReadSmiValue();	387 intptr_t hash = ReadSmiValue();

388 USE(hash);	388 USE(hash);

389 uint16_t *utf16 =	389 uint16_t *utf16 =

390 reinterpret_cast<uint16_t*>(::malloc(len * sizeof(uint16_t)));	390 reinterpret_cast<uint16_t*>(::malloc(len * sizeof(uint16_t)));

391 intptr_t utf8_len = 0;	391 intptr_t utf8_len = 0;

	392 // Read all the UTF-16 code units.

392 for (intptr_t i = 0; i < len; i++) {	393 for (intptr_t i = 0; i < len; i++) {

393 utf16[i] = Read<uint16_t>();	394 utf16[i] = Read<uint16_t>();

394 // TODO(sgjesse): Check for surrogate pairs.	395 }

395 utf8_len += Utf8::Length(utf16[i]);	396 // Calculate the UTF-8 length.

	397 Utf16::CodePointIterator it(utf16, len);
	siva 2012/11/27 03:00:25: If you get invalid characters here, it.Next() could potentially return false right at the first character, and we would end up with utf8_len being 0. Should that be reported as an error, or just silently dropped and an empty string returned, as is being done now?
	Søren Gjesse 2012/11/27 11:35:54: On 2012/11/27 03:00:25, siva wrote: > If you get invalid characters here it.Next() could > potentially return false right at the first character > and we would end up with utf8_len being 0. > > Should that be reported as an error or just silently > dropped and an empty string returned like it is being > done now? -- There are no invalid characters in a UTF-16 sequence, so that cannot happen. Added a test to unicode_test.cc.
	398 while (it.Next()) {

	399 utf8_len += Utf8::Length(it.Current());

396 }	400 }

397 Dart_CObject* object = AllocateDartCObjectString(utf8_len);	401 Dart_CObject* object = AllocateDartCObjectString(utf8_len);

398 AddBackRef(object_id, object, kIsDeserialized);	402 AddBackRef(object_id, object, kIsDeserialized);

399 char* p = object->value.as_string;	403 char* p = object->value.as_string;

400 for (intptr_t i = 0; i < len; i++) {	404 Utf16::CodePointIterator it2(utf16, len);
	siva 2012/11/27 03:00:25: Would it make sense to have a reset method on the iterator instead, so that you can start iterating again on the same iterator?
	Søren Gjesse 2012/11/27 11:35:54: On 2012/11/27 03:00:25, siva wrote: > Would it make sense to have a reset method on the iterator > instead so that you can start iterating again on the same iterator? -- Good point; added Reset() here and for String::CodePointIterator as well.
401 // TODO(sgjesse): Check for surrogate pairs.	405 while (it2.Next()) {

402 p += Utf8::Encode(utf16[i], p);	406 p += Utf8::Encode(it2.Current(), p);

403 }	407 }

404 *p = '\0';	408 *p = '\0';

405 ASSERT(p == (object->value.as_string + utf8_len));	409 ASSERT(p == (object->value.as_string + utf8_len));

406 ::free(utf16);	410 ::free(utf16);

407 return object;	411 return object;

408 }	412 }

409 case kUint8ArrayCid: {	413 case kUint8ArrayCid: {

410 intptr_t len = ReadSmiValue();	414 intptr_t len = ReadSmiValue();

411 Dart_CObject* object = AllocateDartCObjectUint8Array(len);	415 Dart_CObject* object = AllocateDartCObjectUint8Array(len);

412 AddBackRef(object_id, object, kIsDeserialized);	416 AddBackRef(object_id, object, kIsDeserialized);

(...skipping 368 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
781 // Write out the class and tags information.	785 // Write out the class and tags information.

782 WriteIndexedObject(kDoubleCid);	786 WriteIndexedObject(kDoubleCid);

783 WriteIntptrValue(0);	787 WriteIntptrValue(0);

784 // Write double value.	788 // Write double value.

785 Write<double>(object->value.as_double);	789 Write<double>(object->value.as_double);

786 break;	790 break;

787 case Dart_CObject::kString: {	791 case Dart_CObject::kString: {

788 const uint8_t* utf8_str =	792 const uint8_t* utf8_str =

789 reinterpret_cast<const uint8_t*>(object->value.as_string);	793 reinterpret_cast<const uint8_t*>(object->value.as_string);

790 intptr_t utf8_len = strlen(object->value.as_string);	794 intptr_t utf8_len = strlen(object->value.as_string);

791 if (!Utf8::IsValid(utf8_str, utf8_len)) {	795 if (!Utf8::IsValidAllowSurrogates(utf8_str, utf8_len)) {
	siva 2012/11/27 03:00:25: I am not sure I understand the need for this to be IsValidAllowSurrogates and not just IsValid. Is it because you might have read a partial utf8 string and are expecting more?
	Søren Gjesse 2012/11/27 11:35:54: On 2012/11/27 03:00:25, siva wrote: > I am not sure I understand the need for this to be > IsValidAllowSurrogates and not just IsValid? > > Is it because you might have read a partial utf8 string and are expecting more? -- The current Utf8::IsValid does not allow for Utf8 encoded Utf16 surrogate code points. That is, the 3-byte Utf8 encodings of the code point ranges d800 - dbff and dc00 - dfff are not allowed. However, as the Utf16 two byte strings posted can contain these code points, the Utf8 strings in the Dart_CObject structures can contain 3-byte Utf8 encodings of Utf16 surrogate code points. We need to allow sending the same data as can be received. Maybe we should just make IsValid allow surrogate code points in all cases. Currently you cannot create all the strings using the Dart API Dart_NewStringFromUTF8 that you can using String.fromCharCodes inside Dart (e.g. String.fromCharCodes([0xd800])).
	siva 2012/11/28 03:28:23: I was under the impression that we allow for Utf8 encoded Utf16 surrogate code points; at least that was the intention. This might be a bug; I should probably write a test case to verify this. I would prefer if we did not have to distinguish between IsValid and IsValidAllowSurrogates. Similarly, Utf8::DecodeToUTF16 should allow surrogates and not have to distinguish between the two. For instance, if you look at Utf8::DecodeToUTF16, it tries to deal with supplementary characters. The fact that Utf8::Decode doesn't deal with it correctly is a bug. I think we should remove this distinction, fix Utf8::IsValid and Utf8::Decode, and then this CL would be good to go. What do you think? -- On 2012/11/27 11:35:54, Søren Gjesse wrote: > On 2012/11/27 03:00:25, siva wrote: > > I am not sure I understand the need for this to be > > IsValidAllowSurrogates and not just IsValid? > > > > Is it because you might have read a partial utf8 string and are expecting > more? > > The current Utf8::IsValid does not allow for Utf8 encoded Utf16 surrogate code > points. That is the 3-byte Utf8 encodings of the code point range d800 - dbff > and dc00 - dfff are not allowed. However as the Utf16 two byte strings posted > can contain these code points the Utf8 strings in the Dart_CObject structures > can contain 3-byte Utf8 encodings of Utf16 surrogate code points. We need to > allow sending the same data as can be received. > > Maybe we should just make IsValid allow surrogate code points in all cases. > Currently you cannot create all the strings using the Dart API > Dart_NewStringFromUTF8 that you can using String.fromCharCodes inside Dart (e.g. > String.fromCharCodes([0xd800])).
792 return false;	796 return false;

793 }	797 }

794	798

795 Utf8::Type type;	799 Utf8::Type type;

796 intptr_t len = Utf8::CodePointCount(utf8_str, utf8_len, &type);	800 intptr_t len = Utf8::CodePointCount(utf8_str, utf8_len, &type);

797	801

798 // Write out the serialization header value for this object.	802 // Write out the serialization header value for this object.

799 WriteInlinedHeader(object);	803 WriteInlinedHeader(object);

800 // Write out the class and tags information.	804 // Write out the class and tags information.

801 WriteIndexedObject(type == Utf8::kLatin1 ? kOneByteStringCid	805 WriteIndexedObject(type == Utf8::kLatin1 ? kOneByteStringCid

802 : kTwoByteStringCid);	806 : kTwoByteStringCid);

803 WriteIntptrValue(0);	807 WriteIntptrValue(0);

804 // Write string length, hash and content	808 // Write string length, hash and content

805 WriteSmi(len);	809 WriteSmi(len);

806 WriteSmi(0); // TODO(sgjesse): Hash - not written.	810 WriteSmi(0); // TODO(sgjesse): Hash - not written.

807 if (type == Utf8::kLatin1) {	811 if (type == Utf8::kLatin1) {

808 uint8_t* latin1_str =	812 uint8_t* latin1_str =

809 reinterpret_cast<uint8_t*>(::malloc(len * sizeof(uint8_t)));	813 reinterpret_cast<uint8_t*>(::malloc(len * sizeof(uint8_t)));

810 Utf8::DecodeToLatin1(utf8_str, utf8_len, latin1_str, len);	814 bool success = Utf8::DecodeToLatin1(utf8_str,

	815 utf8_len,

	816 latin1_str,

	817 len);

	818 ASSERT(success);

811 for (intptr_t i = 0; i < len; i++) {	819 for (intptr_t i = 0; i < len; i++) {

812 Write<uint8_t>(latin1_str[i]);	820 Write<uint8_t>(latin1_str[i]);

813 }	821 }

814 ::free(latin1_str);	822 ::free(latin1_str);

815 } else {	823 } else {

816 // TODO(sgjesse): Make sure surrogate pairs are handled.	824 // TODO(sgjesse): Make sure surrogate pairs are handled.

817 uint16_t* utf16_str =	825 uint16_t* utf16_str =

818 reinterpret_cast<uint16_t*>(::malloc(len * sizeof(uint16_t)));	826 reinterpret_cast<uint16_t*>(::malloc(len * sizeof(uint16_t)));

819 Utf8::DecodeToUTF16(utf8_str, utf8_len, utf16_str, len);	827 bool success = Utf8::DecodeToUTF16AllowSurrogates(utf8_str,

	828 utf8_len,

	829 utf16_str,

	830 len);

	831 ASSERT(success);

820 for (intptr_t i = 0; i < len; i++) {	832 for (intptr_t i = 0; i < len; i++) {

821 Write<uint16_t>(utf16_str[i]);	833 Write<uint16_t>(utf16_str[i]);

822 }	834 }

823 ::free(utf16_str);	835 ::free(utf16_str);

824 }	836 }

825 break;	837 break;

826 }	838 }

827 case Dart_CObject::kUint8Array: {	839 case Dart_CObject::kUint8Array: {

828 // Write out the serialization header value for this object.	840 // Write out the serialization header value for this object.

829 WriteInlinedHeader(object);	841 WriteInlinedHeader(object);

(...skipping 52 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
882 if (!success) {	894 if (!success) {

883 UnmarkAllCObjects(object);	895 UnmarkAllCObjects(object);

884 return false;	896 return false;

885 }	897 }

886 }	898 }

887 UnmarkAllCObjects(object);	899 UnmarkAllCObjects(object);

888 return true;	900 return true;

889 }	901 }

890	902

891 } // namespace dart	903 } // namespace dart

OLD	NEW

« no previous file with comments | « no previous file | runtime/vm/snapshot_test.cc » ('j') | runtime/vm/unicode.h » ('J')