Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(360)

Side by Side Diff: runtime/vm/uri.cc

Issue 2011543002: Canonicalize uris in C++ instead of Dart for the standalone embedder. (Closed) Base URL: git@github.com:dart-lang/sdk.git@master
Patch Set: Created 4 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright (c) 2016, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file.
4
5 #include "vm/uri.h"
6
7 #include "vm/zone.h"
8
9 namespace dart {
10
// Converts the ASCII upper-case letters in |str| to lower case, in place.
//
// This is deliberately hand-rolled rather than built on libc tolower():
// only the characters 'A'..'Z' are modified, every other character
// (including any non-ASCII byte) is left untouched.
static void StringLower(char* str) {
  for (char* cursor = str; *cursor != '\0'; ++cursor) {
    const char ch = *cursor;
    const bool is_ascii_upper = (ch >= 'A') && (ch <= 'Z');
    if (is_ascii_upper) {
      *cursor = ch + ('a' - 'A');
    }
  }
}
20
21
// Returns true if |value| is one of the uri "unreserved" characters: an
// ASCII letter, an ASCII digit, or one of '-', '.', '_' and '~'.
static bool IsUnreservedChar(intptr_t value) {
  if (value >= 'a' && value <= 'z') {
    return true;
  }
  if (value >= 'A' && value <= 'Z') {
    return true;
  }
  if (value >= '0' && value <= '9') {
    return true;
  }
  switch (value) {
    case '-':
    case '.':
    case '_':
    case '~':
      return true;
    default:
      return false;
  }
}
31
32
// Returns true if |value| is one of the uri delimiter characters:
//   : / ? # [ ] @ ! $ & ' ( ) * + , ; =
static bool IsDelimiter(intptr_t value) {
  // The scan stops at the terminating '\0', so a |value| of 0 never matches.
  for (const char* delimiter = ":/?#[]@!$&'()*+,;="; *delimiter != '\0';
       ++delimiter) {
    if (value == *delimiter) {
      return true;
    }
  }
  return false;
}
45
46
// Returns true if |value| is an ASCII hexadecimal digit: '0'..'9',
// 'A'..'F' or 'a'..'f'.
static bool IsHexDigit(char value) {
  // Note: the lower bound must be ">= '0'". Writing ">- '0'" compiles, but
  // means "greater than minus '0'" and accepts characters such as '/'.
  return ((value >= '0' && value <= '9') ||
          (value >= 'A' && value <= 'F') ||
          (value >= 'a' && value <= 'f'));
}
52
53
54 static int HexValue(char digit) {
55 if ((digit >= '0' && digit <= '9')) {
56 return digit - '0';
57 }
58 if ((digit >= 'A' && digit <= 'F')) {
59 return digit - 'A' + 10;
60 }
61 if ((digit >= 'a' && digit <= 'f')) {
62 return digit - 'a' + 10;
63 }
64 UNREACHABLE();
65 return 0;
66 }
67
68
69 static int GetEscapedValue(const char* str, intptr_t pos, intptr_t len) {
70 if (pos + 2 >= len) {
71 // Not enough room for a valid escape sequence.
72 return -1;
73 }
74 if (str[pos] != '%') {
75 // Escape sequences start with '%'.
76 return -1;
77 }
78
79 char digit1 = str[pos + 1];
80 char digit2 = str[pos + 2];
81 if (!IsHexDigit(digit1) || !IsHexDigit(digit2)) {
82 // Invalid escape sequence. Ignore it.
ahe 2016/05/31 21:44:42 What does the RFC say about this situation?
turnidge 2016/06/01 20:00:40 I couldn't exactly figure it out, but my best gues
83 return -1;
84 }
85 return HexValue(digit1) * 16 + HexValue(digit2);
86 }
87
88
89 static char* NormalizeEscapes(const char* str, intptr_t len) {
90 // Allocate the buffer.
91 Zone* zone = Thread::Current()->zone();
92 char* buffer = zone->Alloc<char>(len * 3 + 1); // +1 for '\0'
Cutch 2016/05/24 23:53:46 nit: (len * 3) + 1 maybe a comment about the * 3
turnidge 2016/05/31 18:25:27 Done.
93
94 // Copy the string, normalizing as we go.
95 intptr_t buffer_pos = 0;
96 intptr_t pos = 0;
97 while (pos < len) {
98 int escaped_value = GetEscapedValue(str, pos, len);
ahe 2016/05/25 12:44:52 I don't understand this code. As far as I can tell
turnidge 2016/05/27 21:40:05 I think you probably need to look at GetEscapedVal
ahe 2016/05/31 21:44:42 This is embarrassing, I spent a lot of time lookin
99 if (escaped_value >= 0) {
100 // If one of the special "unreserved" characters has been
101 // escaped, revert the escaping. Otherwise preserve the
102 // escaping.
103 if (IsUnreservedChar(escaped_value)) {
104 buffer[buffer_pos] = escaped_value;
105 buffer_pos++;
106 } else {
107 OS::SNPrint(buffer + buffer_pos, 4, "%%%02X", escaped_value);
Cutch 2016/05/24 23:53:46 Would copying the the three characters from the or
turnidge 2016/05/27 21:40:06 By reprinting the escaped value we normalize it to
108 buffer_pos += 3;
109 }
110 pos += 3;
111 } else {
112 char c = str[pos];
113 // If a delimiter or unreserved character is currently not
114 // escaped, preserve that. If there is a busted %-sequence in
115 // the input, preserve that too.
116 if (c == '%' || IsDelimiter(c) || IsUnreservedChar(c)) {
117 buffer[buffer_pos] = c;
118 buffer_pos++;
119 } else {
120 // Escape funky characters.
121 OS::SNPrint(buffer + buffer_pos, 4, "%%%02X", c);
122 buffer_pos += 3;
123 }
124 pos++;
125 }
126 }
127 buffer[buffer_pos] = '\0';
128 return buffer;
129 }
130
131
132 static void ClearParsedUri(ParsedUri* parsed_uri) {
133 parsed_uri->scheme = NULL;
134 parsed_uri->userinfo = NULL;
135 parsed_uri->host = NULL;
136 parsed_uri->port = NULL;
137 parsed_uri->path = NULL;
138 parsed_uri->query = NULL;
139 parsed_uri->fragment = NULL;
140 }
141
142
143 static intptr_t ParseAuthority(const char* authority, ParsedUri* parsed_uri) {
144 Zone* zone = Thread::Current()->zone();
145 const char* current = authority;
146 intptr_t len = 0;
147
148 size_t userinfo_len = strcspn(current, "@/");
149 if (current[userinfo_len] == '@') {
150 // The '@' character follows the optional userinfo string.
151 parsed_uri->userinfo = NormalizeEscapes(current, userinfo_len);
152 current += userinfo_len + 1;
153 len += userinfo_len + 1;
154 } else {
155 parsed_uri->userinfo = NULL;
156 }
157
158 size_t host_len = strcspn(current, ":/");
159 char* host = NormalizeEscapes(current, host_len);
160 StringLower(host);
161 parsed_uri->host = host;
162 len += host_len;
163
164 if (current[host_len] == ':') {
165 // The ':' character precedes the optional port string.
166 const char* port_start = current + host_len + 1; // +1 for ':'
167 size_t port_len = strcspn(port_start, "/");
168 parsed_uri->port = zone->MakeCopyOfStringN(port_start, port_len);
169 len += 1 + port_len; // +1 for ':'
170 } else {
171 parsed_uri->port = NULL;
172 }
173 return len;
174 }
175
176
177 // Performs a simple parse of a uri into its components.
178 // See RFC 3986 Section 3: Syntax.
179 bool ParseUri(const char* uri, ParsedUri* parsed_uri) {
180 Zone* zone = Thread::Current()->zone();
181
182 // The first ':' separates the scheme from the rest of the uri. If
183 // a ':' occurs after the first '/' it doesn't count.
184 size_t scheme_len = strcspn(uri, ":/");
185 const char* rest = uri;
186 if (uri[scheme_len] == ':') {
187 char* scheme = zone->MakeCopyOfStringN(uri, scheme_len);
188 StringLower(scheme);
189 parsed_uri->scheme = scheme;
190 rest = uri + scheme_len + 1;
191 } else {
192 parsed_uri->scheme = NULL;
193 }
194
195 // The first '#' separates the optional fragment
196 const char* hash_pos = rest + strcspn(rest, "#");
197 if (*hash_pos == '#') {
198 // There is a fragment part.
199 const char* fragment_start = hash_pos + 1;
200 parsed_uri->fragment =
201 NormalizeEscapes(fragment_start, strlen(fragment_start));
202 } else {
203 parsed_uri->fragment = NULL;
204 }
205
206 // The first '?' or '#' separates the hierarchical part from the
207 // optional query.
208 const char* question_pos = rest + strcspn(rest, "?#");
209 if (*question_pos == '?') {
210 // There is a query part.
211 const char* query_start = question_pos + 1;
212 parsed_uri->query =
213 NormalizeEscapes(query_start, (hash_pos - query_start));
214 } else {
215 parsed_uri->query = NULL;
216 }
217
218 const char* path_start = rest;
219 if (rest[0] == '/' && rest[1] == '/') {
220 // There is an authority part.
221 const char* authority_start = rest + 2; // 2 for '//'.
222
223 intptr_t authority_len =
224 ParseAuthority(authority_start, parsed_uri);
225 if (authority_len < 0) {
226 ClearParsedUri(parsed_uri);
227 return false;
228 }
229 path_start = authority_start + authority_len;
230 } else {
231 parsed_uri->userinfo = NULL;
232 parsed_uri->host = NULL;
233 parsed_uri->port = NULL;
234 }
235
236 // Double slashes in the path do not parse.
237 bool saw_slash = false;
238 for (const char* pos = path_start; pos < question_pos; pos++) {
239 if (*pos == '/') {
240 if (saw_slash) {
241 ClearParsedUri(parsed_uri);
242 return false;
243 }
244 saw_slash = true;
245 } else {
246 saw_slash = false;
247 }
248 }
249
250 // The path is the substring between the authority and the query.
251 parsed_uri->path = NormalizeEscapes(path_start, (question_pos - path_start));
252 return true;
253 }
254
255
256 static char* RemoveLastSegment(char* current,
257 const char* base) {
258 if (current == base) {
259 return current;
260 }
261 ASSERT(current > base);
262 for (current--; current > base; current--) {
263 if (*current == '/') {
264 // We have found the beginning of the last segment.
265 return current;
266 }
267 }
268 ASSERT(current == base);
269 return current;
270 }
271
272
// Returns the length of the first path segment of |input|. A leading '/'
// counts as part of the segment; the '/' that ends the segment does not.
static intptr_t SegmentLength(const char* input) {
  // Include initial slash in the segment, if any.
  const intptr_t leading_slash = (input[0] == '/') ? 1 : 0;

  // Everything up to (but not including) the next slash.
  const intptr_t body_len =
      static_cast<intptr_t>(strcspn(input + leading_slash, "/"));
  return leading_slash + body_len;
}
285
286
287 // See RFC 3986 Section 5.2.4: Remove Dot Segments.
288 static const char* RemoveDotSegments(const char* path) {
289 const char* input = path;
290
291 // The output path will always be less than or equal to the size of
292 // the input path.
293 Zone* zone = Thread::Current()->zone();
294 char* buffer = zone->Alloc<char>(strlen(path) + 1); // +1 for '\0'
295 char* output = buffer;
296
297 while (*input != '\0') {
298 if (strncmp("../", input, 3) == 0) {
299 // Discard initial "../" from the input. It's junk.
300 input += 3;
301
302 } else if (strncmp("./", input, 3) == 0) {
303 // Discard initial "./" from the input. It's junk.
304 input += 2;
305
306 } else if (strncmp("/./", input, 3) == 0) {
307 // Advance past the "/." part of the input.
308 input += 2;
309
310 } else if (strcmp("/.", input) == 0) {
311 // Pretend the input just contains a "/".
312 input = "/";
313
314 } else if (strncmp("/../", input, 4) == 0) {
315 // Advance past the "/.." part of the input and remove one
316 // segment from the output.
317 input += 3;
318 output = RemoveLastSegment(output, buffer);
319
320 } else if (strcmp("/..", input) == 0) {
321 // Pretend the input contains a "/" and remove one segment from
322 // the output.
323 input = "/";
324 output = RemoveLastSegment(output, buffer);
325
326 } else if (strcmp("..", input) == 0) {
327 // The input has been reduced to nothing useful.
328 input += 2;
329
330 } else if (strcmp(".", input) == 0) {
331 // The input has been reduced to nothing useful.
332 input += 1;
333
334 } else {
335 intptr_t segment_len = SegmentLength(input);
336 strncpy(output, input, segment_len);
337 output += segment_len;
338 input += segment_len;
339 }
340 }
341 *output = '\0';
342 return buffer;
343 }
344
345
346 // See RFC 3986 Section 5.2.3: Merge Paths.
347 static const char* MergePaths(const char* base_path, const char* ref_path) {
348 Zone* zone = Thread::Current()->zone();
349 if (base_path[0] == '\0') {
350 // If the base_path is empty, we prepend '/'.
351 return zone->PrintToString("/%s", ref_path);
352 }
353
354 // We need to find the last '/' in base_path.
355 char* last_slash = strrchr(base_path, '/');
356 if (last_slash == NULL) {
357 // There is no slash in the base_path. Return the ref_path unchanged.
358 return ref_path;
359 }
360
361 // We found a '/' in the base_path. Cut off everything after it and
362 // add the ref_path.
363 intptr_t truncated_base_len = last_slash - base_path;
364 intptr_t ref_path_len = strlen(ref_path);
365 intptr_t len = truncated_base_len + ref_path_len + 1; // +1 for '/'
366 char* buffer = zone->Alloc<char>(len + 1); // +1 for '\0'
367
368 // Copy truncated base.
369 strncpy(buffer, base_path, truncated_base_len);
370
371 // Add a slash.
372 buffer[truncated_base_len] = '/';
373
374 // Copy the ref_path.
375 strncpy((buffer + truncated_base_len + 1), ref_path, ref_path_len);
376
377 // Add the trailing '\0'.
378 buffer[len] = '\0';
379
380 return buffer;
381 }
382
383
384 static char* BuildUri(const ParsedUri& uri) {
385 Zone* zone = Thread::Current()->zone();
386 ASSERT(uri.path != NULL);
387
388 const char* fragment = uri.fragment == NULL ? "" : uri.fragment;
389 const char* fragment_separator = uri.fragment == NULL ? "" : "#";
390 const char* query = uri.query == NULL ? "" : uri.query;
391 const char* query_separator = uri.query == NULL ? "" : "?";
392
393 // If there is no scheme for this uri, just build a relative uri of
394 // the form: "path[?query][#fragment]". This is sort of a
395 // degenerate case, but it occurs when we resolve relative urls
ahe 2016/05/31 21:44:42 I wouldn't call this a degenerate case. It's quite
turnidge 2016/06/01 20:00:40 Corrected the comment.
396 // inside a "dart:" library.
397 if (uri.scheme == NULL) {
398 ASSERT(uri.userinfo == NULL && uri.host == NULL && uri.port == NULL);
399 ASSERT(uri.query == NULL);
400 return zone->PrintToString("%s%s%s%s%s",
401 uri.path, query_separator, query,
402 fragment_separator, fragment);
403 }
404
405 // Uri with no authority: "scheme:path[?query][#fragment]"
406 if (uri.host == NULL) {
407 ASSERT(uri.userinfo == NULL && uri.port == NULL);
408 return zone->PrintToString("%s:%s%s%s%s%s",
409 uri.scheme, uri.path, query_separator, query,
410 fragment_separator, fragment);
411 }
412
413 const char* user = uri.userinfo == NULL ? "" : uri.userinfo;
414 const char* user_separator = uri.userinfo == NULL ? "" : "@";
415 const char* port = uri.port == NULL ? "" : uri.port;
416 const char* port_separator = uri.port == NULL ? "" : ":";
417
418 // If the path doesn't start with a '/', add one. We need it to
419 // separate the path from the authority.
420 const char* path_separator = ((uri.path[0] == '\0' || uri.path[0] == '/')
421 ? "" : "/");
422
423 // Uri with authority:
424 // "scheme://[userinfo@]host[:port][/]path[?query][#fragment]"
425 return zone->PrintToString(
426 "%s://%s%s%s%s%s%s%s%s%s%s%s", // There is *nothing* wrong with this.
427 uri.scheme, user, user_separator, uri.host, port_separator, port,
428 path_separator, uri.path, query_separator, query,
429 fragment_separator, fragment);
430 }
431
432
433 // See RFC 3986 Section 5: Reference Resolution
434 bool ResolveUri(const char* ref_uri,
435 const char* base_uri,
436 const char** target_uri) {
437 // Parse the reference uri.
438 ParsedUri ref;
439 if (!ParseUri(ref_uri, &ref)) {
440 *target_uri = NULL;
441 return false;
442 }
443
444 ParsedUri target;
445 if (ref.scheme != NULL) {
446 if (strcmp(ref.scheme, "dart") == 0) {
447 Zone* zone = Thread::Current()->zone();
448 *target_uri = zone->MakeCopyOfString(ref_uri);
449 return true;
450 }
451
452 // When the ref_uri specifies a scheme, the base_uri is ignored.
453 target.scheme = ref.scheme;
454 target.userinfo = ref.userinfo;
455 target.host = ref.host;
456 target.port = ref.port;
457 target.path = RemoveDotSegments(ref.path);
458 target.query = ref.query;
459 target.fragment = ref.fragment;
460 *target_uri = BuildUri(target);
461 return true;
462 }
463
464 // Parse the base uri.
465 ParsedUri base;
466 if (!ParseUri(base_uri, &base)) {
467 *target_uri = NULL;
468 return false;
469 }
470
471 if (base.scheme != NULL && strcmp(base.scheme, "dart") == 0) {
Cutch 2016/05/24 23:53:46 nits about parenthesis: (base.scheme != NULL) &&
turnidge 2016/05/27 21:40:06 Done.
ahe 2016/05/31 21:44:42 This is odd. Why is there a special case for the d
turnidge 2016/06/01 20:00:40 Discussed offline. Sometimes we resolve a relativ
472 Zone* zone = Thread::Current()->zone();
473 *target_uri = zone->MakeCopyOfString(ref_uri);
474 return true;
475 }
476
477 if (ref.host != NULL) {
478 // When the ref_uri specifies an authority, we only use the base scheme.
479 target.scheme = base.scheme;
480 target.userinfo = ref.userinfo;
481 target.host = ref.host;
482 target.port = ref.port;
483 target.path = RemoveDotSegments(ref.path);
484 target.query = ref.query;
485 target.fragment = ref.fragment;
486 *target_uri = BuildUri(target);
487 return true;
488 }
489
490 if (ref.path[0] == '\0') {
491 // Empty path. Use most parts of base_uri.
492 target.scheme = base.scheme;
493 target.userinfo = base.userinfo;
494 target.host = base.host;
495 target.port = base.port;
496 target.path = base.path;
497 target.query = ((ref.query == NULL) ? base.query : ref.query);
498 target.fragment = ref.fragment;
499 *target_uri = BuildUri(target);
500 return true;
501
502 } else if (ref.path[0] == '/') {
503 // Absolute path. ref_path wins.
504 target.scheme = base.scheme;
505 target.userinfo = base.userinfo;
506 target.host = base.host;
507 target.port = base.port;
508 target.path = RemoveDotSegments(ref.path);
509 target.query = ref.query;
510 target.fragment = ref.fragment;
511 *target_uri = BuildUri(target);
512 return true;
513
514 } else {
515 // Relative path. We need to merge base_path and ref_path.
516 target.scheme = base.scheme;
517 target.userinfo = base.userinfo;
518 target.host = base.host;
519 target.port = base.port;
520 target.path = RemoveDotSegments(MergePaths(base.path, ref.path));
521 target.query = ref.query;
522 target.fragment = ref.fragment;
523 *target_uri = BuildUri(target);
524 return true;
525 }
526 }
527
528 } // namespace dart
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698