src/uri.cc - Issue 1994733003: Rewrite decodeURL as builtin function, remove now unused runtime functions.

Side by Side Diff: src/uri.cc

Issue 1994733003: Rewrite decodeURL as builtin function, remove now unused runtime functions. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master

Patch Set: Address review comments Created 4 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2016 the V8 project authors. All rights reserved.	1 // Copyright 2016 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "src/uri.h"	5 #include "src/uri.h"

6	6

7 #include "src/char-predicates-inl.h"	7 #include "src/char-predicates-inl.h"

8 #include "src/handles.h"	8 #include "src/handles.h"

9 #include "src/isolate-inl.h"	9 #include "src/isolate-inl.h"

10 #include "src/list.h"	10 #include "src/list.h"

(...skipping 42 matching lines...) Expand 10 before | Expand all | Expand 10 after
53 }	53 }

54 }	54 }

55	55

56 void AddHexEncodedToBuffer(uint8_t octet, List<uint8_t>* buffer) {	56 void AddHexEncodedToBuffer(uint8_t octet, List<uint8_t>* buffer) {

57 buffer->Add('%');	57 buffer->Add('%');

58 buffer->Add(HexCharOfValue(octet >> 4));	58 buffer->Add(HexCharOfValue(octet >> 4));

59 buffer->Add(HexCharOfValue(octet & 0x0F));	59 buffer->Add(HexCharOfValue(octet & 0x0F));

60 }	60 }

61	61

62 void EncodeSingle(uc16 c, List<uint8_t>* buffer) {	62 void EncodeSingle(uc16 c, List<uint8_t>* buffer) {

63 uint8_t x = (c >> 12) & 0xF;	63 char s[4];

64 uint8_t y = (c >> 6) & 63;	64 int number_of_bytes;

65 uint8_t z = c & 63;	65 number_of_bytes =

66 if (c <= 0x007F) {	66 unibrow::Utf8::Encode(s, c, unibrow::Utf16::kNoPreviousCharacter, false);

67 AddHexEncodedToBuffer(c, buffer);	67 for (int k = 0; k < number_of_bytes; k++) {

68 } else if (c <= 0x07FF) {	68 AddHexEncodedToBuffer(s[k], buffer);

69 AddHexEncodedToBuffer(y + 192, buffer);

70 AddHexEncodedToBuffer(z + 128, buffer);

71 } else {

72 AddHexEncodedToBuffer(x + 224, buffer);

73 AddHexEncodedToBuffer(y + 128, buffer);

74 AddHexEncodedToBuffer(z + 128, buffer);

75 }	69 }

76 }	70 }

77	71

78 void EncodePair(uc16 cc1, uc16 cc2, List<uint8_t>* buffer) {	72 void EncodePair(uc16 cc1, uc16 cc2, List<uint8_t>* buffer) {

79 uint8_t u = ((cc1 >> 6) & 0xF) + 1;	73 char s[4];

80 uint8_t w = (cc1 >> 2) & 0xF;	74 int number_of_bytes =

81 uint8_t x = cc1 & 3;	75 unibrow::Utf8::Encode(s, unibrow::Utf16::CombineSurrogatePair(cc1, cc2),

82 uint8_t y = (cc2 >> 6) & 0xF;	76 unibrow::Utf16::kNoPreviousCharacter, false);

83 uint8_t z = cc2 & 63;	77 for (int k = 0; k < number_of_bytes; k++) {

84 AddHexEncodedToBuffer((u >> 2) + 240, buffer);	78 AddHexEncodedToBuffer(s[k], buffer);

85 AddHexEncodedToBuffer((((u & 3) << 4) \| w) + 128, buffer);	79 }

86 AddHexEncodedToBuffer(((x << 4) \| y) + 128, buffer);

87 AddHexEncodedToBuffer(z + 128, buffer);

88 }	80 }

89	81

90 } // anonymous namespace	82 } // anonymous namespace

91	83

92 Object* Uri::Encode(Isolate* isolate, Handle<String> uri, bool is_uri) {	84 MaybeHandle<Object> Uri::Encode(Isolate* isolate, Handle<String> uri,

	85 bool is_uri) {

93 uri = String::Flatten(uri);	86 uri = String::Flatten(uri);

94 int uri_length = uri->length();	87 int uri_length = uri->length();

95 List<uint8_t> buffer(uri_length);	88 List<uint8_t> buffer(uri_length);

96	89

97 {	90 {

98 DisallowHeapAllocation no_gc;	91 DisallowHeapAllocation no_gc;

99 String::FlatContent uri_content = uri->GetFlatContent();	92 String::FlatContent uri_content = uri->GetFlatContent();

100	93

101 for (int k = 0; k < uri_length; k++) {	94 for (int k = 0; k < uri_length; k++) {

102 uc16 cc1 = uri_content.Get(k);	95 uc16 cc1 = uri_content.Get(k);

(...skipping 10 matching lines...) Expand all Loading...
113 if (IsUnescapePredicateInUriComponent(cc1) \|\|	106 if (IsUnescapePredicateInUriComponent(cc1) \|\|

114 (is_uri && IsUriSeparator(cc1))) {	107 (is_uri && IsUriSeparator(cc1))) {

115 buffer.Add(cc1);	108 buffer.Add(cc1);

116 } else {	109 } else {

117 EncodeSingle(cc1, &buffer);	110 EncodeSingle(cc1, &buffer);

118 }	111 }

119 continue;	112 continue;

120 }	113 }

121	114

122 AllowHeapAllocation allocate_error_and_return;	115 AllowHeapAllocation allocate_error_and_return;

123 THROW_NEW_ERROR_RETURN_FAILURE(isolate, NewURIError());	116 THROW_NEW_ERROR(isolate, NewURIError(), Object);

124 }	117 }

125 }	118 }

126	119

127 Handle<String> result;	120 Handle<String> result;

128 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(	121 ASSIGN_RETURN_ON_EXCEPTION(

129 isolate, result,	122 isolate, result,

130 isolate->factory()->NewStringFromOneByte(buffer.ToConstVector()));	123 isolate->factory()->NewStringFromOneByte(buffer.ToConstVector()), Object);

131 return *result;	124 return result;

	125 }

	126

	127 namespace { // anonymous namespace for DecodeURI helper functions

	128

	129 bool IsReservedPredicate(uc16 c) {

	130 switch (c) {

	131 case '#':

	132 case '$':

	133 case '&':

	134 case '+':

	135 case ',':

	136 case '/':

	137 case ':':

	138 case ';':

	139 case '=':

	140 case '?':

	141 case '@':

	142 return true;

	143 default:

	144 return false;

	145 }

	146 }

	147

	148 bool IsRepalcementCharacter(List<uint8_t>* octets) {
	Yang (2016/05/23 06:44:32): typo. Franzi (2016/05/23 08:55:57): Done.
	149 // 0xFFFD is %ef%bf%bd
	Yang (2016/05/23 06:44:32): What does this comment mean? Franzi (2016/05/23 08:55:57): Reworded the comment to clarify why we check for this particular encoding.
	150 if (octets->length() != 3 \|\| octets->at(0) != 0xef \|\| octets->at(1) != 0xbf \|\|

	151 octets->at(2) != 0xbd) {

	152 return false;

	153 }

	154 return true;

	155 }

	156

	157 bool DecodeOctets(List<uint8_t>* octets, List<uc16>* two_byte_buffer) {

	158 size_t cursor = 0;

	159 uc32 value = unibrow::Utf8::ValueOf(octets->ToConstVector().start(),

	160 octets->length(), &cursor);

	161 // kBadChar is the Replacement Character, which is the decoding of

	162 // valid input %ef%bf%bd

	163 if (value == unibrow::Utf8::kBadChar && !IsRepalcementCharacter(octets)) {

	164 return false;

	165 }

	166

	167 if (value <= unibrow::Utf16::kMaxNonSurrogateCharCode) {

	168 two_byte_buffer->Add(value);

	169 } else {

	170 two_byte_buffer->Add(unibrow::Utf16::LeadSurrogate(value));

	171 two_byte_buffer->Add(unibrow::Utf16::TrailSurrogate(value));

	172 }

	173 return true;

	174 }

	175

	176 bool TwoDigitHex(uc16& decoded, int k, String::FlatContent* uri_content) {
	Yang (2016/05/23 06:44:31): Can we use uc16* as argument type? That way it's easier to understand, from the call site, that the argument is an output. Also see https://google.github.io/styleguide/cppguide.html#Reference_Arguments Yang (2016/05/23 06:44:32): Can we call the second argument "index" or something instead of "k"? Franzi (2016/05/23 08:55:57, replying to the uc16* comment): Done. Franzi (2016/05/23 08:55:57, replying to the "index" comment): Done.
	177 char high = HexValue(uri_content->Get(k + 1));
	Yang (2016/05/23 06:44:32): FlatContent::Get returns a uc16. Casting that to signed char and then comparing that to 0 smells like undefined behavior. Let's store the result as uc16 and compare to Utf8::kMaxOneByteChar. Franzi (2016/05/23 08:55:57): Not sure I understand the comment. HexValue takes a uc32 and returns an int between -1 and 16. Is that not OK to store as a signed char? Yang (2016/05/23 11:24:59): Ah, I see. I misunderstood. Never mind this comment.
	178 char low = HexValue(uri_content->Get(k + 2));

	179 if (high < 0 \|\| low < 0) {

	180 return false;

	181 }

	182 decoded = (high << 4) \| low;

	183 return true;

	184 }

	185

	186 template <typename T>

	187 void AddToBuffer(uc16 decoded, String::FlatContent* uri_content, int k,
	Yang (2016/05/23 06:44:32): Same here, "index" instead of "k". Franzi (2016/05/23 08:55:57): Done.
	188 bool is_uri, List<T>* buffer) {

	189 if (is_uri && IsReservedPredicate(decoded)) {

	190 buffer->Add('%');

	191 buffer->Add(uri_content->Get(k + 1));

	192 buffer->Add(uri_content->Get(k + 2));
	Yang (2016/05/23 06:44:31): Can we have a safeguard here so that we don't have implicit clamping? I.e. that the return value of uri_content->Get fits into the buffer type T? Franzi (2016/05/23 08:55:57): Done. Throwing an exception if uri_content->Get() is larger than numeric_limits::max() for type T.
	193 } else {

	194 buffer->Add(decoded);

	195 }

	196 }

	197

	198 bool IntoTwoByte(int index, bool is_uri, int uri_length,

	199 String::FlatContent* uri_content,

	200 List<uc16>* two_byte_buffer) {

	201 for (int k = index; k < uri_length; k++) {

	202 uc16 code = uri_content->Get(k);

	203 if (code == '%') {

	204 uc16 decoded;

	205 if (k + 2 >= uri_length \|\| !TwoDigitHex(decoded, k, uri_content)) {

	206 return false;

	207 }

	208 k += 2;

	209 if (decoded > unibrow::Utf8::kMaxOneByteChar) {

	210 int n = 0;

	211 while (((decoded << ++n) & 0x80) != 0) {
	Yang (2016/05/23 06:44:31): Can we have this as: do { n++; } while ((decoded << n) & 0x80); Franzi (2016/05/23 08:55:57): Changed it to a simple while loop: int n = 1; while ((decoded << n) & 0x80) { n++; }
	212 }

	213 if (n == 1 \|\| n > 4 \|\| k + 3 * (n - 1) >= uri_length) {

	214 return false;

	215 }

	216 List<uint8_t> octets;
	Yang (2016/05/23 06:44:31): octets will at most have the length 4, right? Can we have a simple array and pass the length to DecodeOctets? Franzi (2016/05/23 08:55:57): Done.
	217 octets.Add(decoded);

	218

	219 for (int i = 1; i < n; i++) {

	220 uc16 decodedTrail;

	221

	222 if (uri_content->Get(++k) != '%' \|\| k + 2 >= uri_length \|\|

	223 !TwoDigitHex(decodedTrail, k, uri_content)) {

	224 return false;

	225 }

	226 k += 2;

	227 octets.Add(decodedTrail);

	228 }

	229

	230 if (!DecodeOctets(&octets, two_byte_buffer)) {

	231 return false;

	232 }

	233 } else {

	234 AddToBuffer(decoded, uri_content, k - 2, is_uri, two_byte_buffer);

	235 }

	236 } else {

	237 two_byte_buffer->Add(code);

	238 }

	239 }

	240 return true;

	241 }

	242

	243 bool IntoOneAndTwoByte(Handle<String> uri, bool is_uri,

	244 List<uint8_t>* one_byte_buffer,

	245 List<uc16>* two_byte_buffer) {

	246 DisallowHeapAllocation no_gc;

	247 String::FlatContent uri_content = uri->GetFlatContent();

	248

	249 int uri_length = uri->length();

	250 for (int k = 0; k < uri_length; k++) {

	251 uc16 code = uri_content.Get(k);

	252 if (code == '%') {

	253 uc16 decoded;

	254 if (k + 2 >= uri_length \|\| !TwoDigitHex(decoded, k, &uri_content)) {

	255 return false;

	256 }

	257

	258 if (decoded > unibrow::Utf8::kMaxOneByteChar) {

	259 return IntoTwoByte(k, is_uri, uri_length, &uri_content,

	260 two_byte_buffer);

	261 }

	262

	263 AddToBuffer(decoded, &uri_content, k, is_uri, one_byte_buffer);

	264 k += 2;

	265 } else {

	266 if (code > unibrow::Utf8::kMaxOneByteChar) {

	267 return IntoTwoByte(k, is_uri, uri_length, &uri_content,

	268 two_byte_buffer);

	269 }

	270 one_byte_buffer->Add(code);

	271 }

	272 }

	273 return true;

	274 }

	275

	276 } // anonymous namespace

	277

	278 MaybeHandle<Object> Uri::Decode(Isolate* isolate, Handle<String> uri,

	279 bool is_uri) {

	280 uri = String::Flatten(uri);

	281 List<uint8_t> one_byte_buffer;

	282 List<uc16> two_byte_buffer;

	283

	284 if (!IntoOneAndTwoByte(uri, is_uri, &one_byte_buffer, &two_byte_buffer)) {

	285 THROW_NEW_ERROR(isolate, NewURIError(), Object);

	286 }

	287

	288 Handle<String> left = isolate->factory()->InternalizeOneByteString(

	289 one_byte_buffer.ToConstVector());

	290

	291 Handle<String> right = isolate->factory()->InternalizeTwoByteString(

	292 two_byte_buffer.ToConstVector());

	293

	294 Handle<String> result;

	295 ASSIGN_RETURN_ON_EXCEPTION(

	296 isolate, result, isolate->factory()->NewConsString(left, right), Object);
	Yang (2016/05/23 06:44:32): Since we are going to copy from the lists into the heap anyway, can we simply, for the mixed-encoding case, allocate a large raw two-byte string and CopyChar both lists into it? Franzi (2016/05/23 08:55:57): Returning a sequential one- or two-byte string.
	297

	298 return result;

132 }	299 }

133	300

134 } // namespace internal	301 } // namespace internal

135 } // namespace v8	302 } // namespace v8

OLD	NEW

« no previous file with comments | « src/uri.h ('k') | test/cctest/compiler/test-run-intrinsics.cc » ('j') | test/mjsunit/regress-612109.js » ('J')