Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(79)

Side by Side Diff: runtime/vm/uri.cc

Issue 2011543002: Canonicalize uris in C++ instead of Dart for the standalone embedder. (Closed) Base URL: git@github.com:dart-lang/sdk.git@master
Patch Set: more code review fixes Created 4 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright (c) 2016, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file.
4
5 #include "vm/uri.h"
6
7 #include "vm/zone.h"
8
9 namespace dart {
10
// Converts every ASCII upper-case letter ('A'..'Z') of the
// NUL-terminated string |str| to its lower-case counterpart. The
// string is modified in place; all other bytes are left untouched.
static void StringLower(char* str) {
  for (char* cursor = str; *cursor != '\0'; ++cursor) {
    const char ch = *cursor;
    const bool is_upper = (ch >= 'A') && (ch <= 'Z');
    if (is_upper) {
      *cursor = ch + ('a' - 'A');
    }
  }
}
20
21
// Returns true if |value| is an ASCII letter, a decimal digit, or one
// of '-', '.', '_' and '~' (the "unreserved" set of RFC 3986).
static bool IsUnreservedChar(intptr_t value) {
  if (value >= '0' && value <= '9') {
    return true;
  }
  if (value >= 'A' && value <= 'Z') {
    return true;
  }
  if (value >= 'a' && value <= 'z') {
    return true;
  }
  switch (value) {
    case '-':
    case '.':
    case '_':
    case '~':
      return true;
    default:
      return false;
  }
}
31
32
// Returns true if |value| is one of the delimiter characters
//   : / ? # [ ] @ ! $ & ' ( ) * + , ; =
// (the gen-delims and sub-delims sets of RFC 3986).
static bool IsDelimiter(intptr_t value) {
  static const char kDelimiters[] = {
    ':', '/', '?', '#', '[', ']', '@', '!', '$',
    '&', '\'', '(', ')', '*', '+', ',', ';', '=',
  };
  for (size_t i = 0; i < sizeof(kDelimiters); i++) {
    if (value == kDelimiters[i]) {
      return true;
    }
  }
  return false;
}
45
46
// Returns true if |value| is an ASCII hexadecimal digit: '0'..'9',
// 'A'..'F' or 'a'..'f'.
static bool IsHexDigit(char value) {
  // Note: the lower bound must be ">= '0'". Writing it as ">- '0'"
  // compares against the negated character code and accepts
  // characters such as ' ', '%' and '/'.
  return ((value >= '0' && value <= '9') ||
          (value >= 'A' && value <= 'F') ||
          (value >= 'a' && value <= 'f'));
}
52
53
54 static int HexValue(char digit) {
55 if ((digit >= '0' && digit <= '9')) {
56 return digit - '0';
57 }
58 if ((digit >= 'A' && digit <= 'F')) {
59 return digit - 'A' + 10;
60 }
61 if ((digit >= 'a' && digit <= 'f')) {
62 return digit - 'a' + 10;
63 }
64 UNREACHABLE();
65 return 0;
66 }
67
68
69 static int GetEscapedValue(const char* str, intptr_t pos, intptr_t len) {
70 if (pos + 2 >= len) {
71 // Not enough room for a valid escape sequence.
72 return -1;
73 }
74 if (str[pos] != '%') {
75 // Escape sequences start with '%'.
76 return -1;
77 }
78
79 char digit1 = str[pos + 1];
80 char digit2 = str[pos + 2];
81 if (!IsHexDigit(digit1) || !IsHexDigit(digit2)) {
82 // Invalid escape sequence. Ignore it.
83 return -1;
84 }
85 return HexValue(digit1) * 16 + HexValue(digit2);
86 }
87
88
89 static char* NormalizeEscapes(const char* str, intptr_t len) {
90 // Allocate the buffer.
91 Zone* zone = Thread::Current()->zone();
92 // We multiply len by three because a percent-escape sequence is
93 // three characters long (e.g. ' ' -> '%20). +1 for '\0'. We could
94 // take two passes through the string and avoid the excess
95 // allocation, but it's zone-memory so it doesn't seem necessary.
96 char* buffer = zone->Alloc<char>(len * 3 + 1);
97
98 // Copy the string, normalizing as we go.
99 intptr_t buffer_pos = 0;
100 intptr_t pos = 0;
101 while (pos < len) {
102 int escaped_value = GetEscapedValue(str, pos, len);
103 if (escaped_value >= 0) {
104 // If one of the special "unreserved" characters has been
105 // escaped, revert the escaping. Otherwise preserve the
106 // escaping.
107 if (IsUnreservedChar(escaped_value)) {
108 buffer[buffer_pos] = escaped_value;
109 buffer_pos++;
110 } else {
111 OS::SNPrint(buffer + buffer_pos, 4, "%%%02X", escaped_value);
112 buffer_pos += 3;
113 }
114 pos += 3;
115 } else {
116 char c = str[pos];
117 // If a delimiter or unreserved character is currently not
118 // escaped, preserve that. If there is a busted %-sequence in
119 // the input, preserve that too.
120 if (c == '%' || IsDelimiter(c) || IsUnreservedChar(c)) {
121 buffer[buffer_pos] = c;
122 buffer_pos++;
123 } else {
124 // Escape funky characters.
125 OS::SNPrint(buffer + buffer_pos, 4, "%%%02X", c);
126 buffer_pos += 3;
127 }
128 pos++;
129 }
130 }
131 buffer[buffer_pos] = '\0';
132 return buffer;
133 }
134
135
136 static void ClearParsedUri(ParsedUri* parsed_uri) {
137 parsed_uri->scheme = NULL;
138 parsed_uri->userinfo = NULL;
139 parsed_uri->host = NULL;
140 parsed_uri->port = NULL;
141 parsed_uri->path = NULL;
142 parsed_uri->query = NULL;
143 parsed_uri->fragment = NULL;
144 }
145
146
147 static intptr_t ParseAuthority(const char* authority, ParsedUri* parsed_uri) {
148 Zone* zone = Thread::Current()->zone();
149 const char* current = authority;
150 intptr_t len = 0;
151
152 size_t userinfo_len = strcspn(current, "@/");
153 if (current[userinfo_len] == '@') {
154 // The '@' character follows the optional userinfo string.
155 parsed_uri->userinfo = NormalizeEscapes(current, userinfo_len);
156 current += userinfo_len + 1;
157 len += userinfo_len + 1;
158 } else {
159 parsed_uri->userinfo = NULL;
160 }
161
162 size_t host_len = strcspn(current, ":/");
163 char* host = NormalizeEscapes(current, host_len);
164 StringLower(host);
165 parsed_uri->host = host;
166 len += host_len;
167
168 if (current[host_len] == ':') {
169 // The ':' character precedes the optional port string.
170 const char* port_start = current + host_len + 1; // +1 for ':'
171 size_t port_len = strcspn(port_start, "/");
172 parsed_uri->port = zone->MakeCopyOfStringN(port_start, port_len);
173 len += 1 + port_len; // +1 for ':'
174 } else {
175 parsed_uri->port = NULL;
176 }
177 return len;
178 }
179
180
181 // Performs a simple parse of a uri into its components.
182 // See RFC 3986 Section 3: Syntax.
183 bool ParseUri(const char* uri, ParsedUri* parsed_uri) {
184 Zone* zone = Thread::Current()->zone();
185
186 // The first ':' separates the scheme from the rest of the uri. If
187 // a ':' occurs after the first '/' it doesn't count.
188 size_t scheme_len = strcspn(uri, ":/");
189 const char* rest = uri;
190 if (uri[scheme_len] == ':') {
191 char* scheme = zone->MakeCopyOfStringN(uri, scheme_len);
192 StringLower(scheme);
193 parsed_uri->scheme = scheme;
194 rest = uri + scheme_len + 1;
195 } else {
196 parsed_uri->scheme = NULL;
197 }
198
199 // The first '#' separates the optional fragment
200 const char* hash_pos = rest + strcspn(rest, "#");
201 if (*hash_pos == '#') {
202 // There is a fragment part.
203 const char* fragment_start = hash_pos + 1;
204 parsed_uri->fragment =
205 NormalizeEscapes(fragment_start, strlen(fragment_start));
206 } else {
207 parsed_uri->fragment = NULL;
208 }
209
210 // The first '?' or '#' separates the hierarchical part from the
211 // optional query.
212 const char* question_pos = rest + strcspn(rest, "?#");
213 if (*question_pos == '?') {
214 // There is a query part.
215 const char* query_start = question_pos + 1;
216 parsed_uri->query =
217 NormalizeEscapes(query_start, (hash_pos - query_start));
218 } else {
219 parsed_uri->query = NULL;
220 }
221
222 const char* path_start = rest;
223 if (rest[0] == '/' && rest[1] == '/') {
224 // There is an authority part.
225 const char* authority_start = rest + 2; // 2 for '//'.
226
227 intptr_t authority_len =
228 ParseAuthority(authority_start, parsed_uri);
229 if (authority_len < 0) {
230 ClearParsedUri(parsed_uri);
231 return false;
232 }
233 path_start = authority_start + authority_len;
234 } else {
235 parsed_uri->userinfo = NULL;
236 parsed_uri->host = NULL;
237 parsed_uri->port = NULL;
238 }
239
240 // Double slashes in the path do not parse.
241 bool saw_slash = false;
242 for (const char* pos = path_start; pos < question_pos; pos++) {
243 if (*pos == '/') {
244 if (saw_slash) {
245 ClearParsedUri(parsed_uri);
246 return false;
247 }
248 saw_slash = true;
249 } else {
250 saw_slash = false;
251 }
252 }
253
254 // The path is the substring between the authority and the query.
255 parsed_uri->path = NormalizeEscapes(path_start, (question_pos - path_start));
256 return true;
257 }
258
259
260 static char* RemoveLastSegment(char* current,
261 char* base,
262 bool relative) {
263 if (relative) {
264 // If we are removing segments from a relative url, do not remove
265 // any initial ".." segments.
266 if (current == base ||
267 (((current - base) == 1) && (base[0] == '/'))) {
268 strncpy(base, "/..", 3);
269 return base + 3;
270 } else if (((current - base) >= 3) &&
271 (strncmp(current - 3, "/..", 3) == 0)) {
272 // We have run out of segments to remove. Since the base is
273 // relative, start adding ".."s onto the output buffer.
274 strncpy(current, "/..", 3);
275 return current + 3;
276 }
277 }
278 if (current == base) {
279 return current;
280 }
281 ASSERT(current > base);
282 for (current--; current > base; current--) {
283 if (*current == '/') {
284 // We have found the beginning of the last segment.
285 return current;
286 }
287 }
288 ASSERT(current == base);
289 return current;
290 }
291
292
// Returns the length of the first path segment of |input|. A leading
// '/' (if any) counts as part of the segment; the '/' that ends the
// segment does not.
static intptr_t SegmentLength(const char* input) {
  // Step over the initial slash, if there is one.
  const intptr_t leading = (input[0] == '/') ? 1 : 0;

  // The segment body runs up to the next slash or the end of string.
  const intptr_t body = static_cast<intptr_t>(strcspn(input + leading, "/"));
  return leading + body;
}
305
306
307 // See RFC 3986 Section 5.2.4: Remove Dot Segments.
308 static const char* RemoveDotSegments(const char* path, bool relative) {
309 const char* input = path;
310
311 // The output path will always be less than or equal to the size of
312 // the input path.
313 Zone* zone = Thread::Current()->zone();
314 char* buffer = zone->Alloc<char>(strlen(path) + 1); // +1 for '\0'
315 char* output = buffer;
316
317 while (*input != '\0') {
318 if (strncmp("../", input, 3) == 0) {
319 // Discard initial "../" from the input. It's junk.
320 if (relative) {
321 output = RemoveLastSegment(output, buffer, relative);
322 }
323 input += 3;
324
325 } else if (strncmp("./", input, 3) == 0) {
326 // Discard initial "./" from the input. It's junk.
327 input += 2;
328
329 } else if (strncmp("/./", input, 3) == 0) {
330 // Advance past the "/." part of the input.
331 input += 2;
332
333 } else if (strcmp("/.", input) == 0) {
334 // Pretend the input just contains a "/".
335 input = "/";
336
337 } else if (strncmp("/../", input, 4) == 0) {
338 // Advance past the "/.." part of the input and remove one
339 // segment from the output.
340 input += 3;
341 output = RemoveLastSegment(output, buffer, relative);
342
343 } else if (strcmp("/..", input) == 0) {
344 // Pretend the input contains a "/" and remove one segment from
345 // the output.
346 input = "/";
347 output = RemoveLastSegment(output, buffer, relative);
348
349 } else if (strcmp("..", input) == 0) {
350 // The input has been reduced to nothing useful.
351 input += 2;
352
353 } else if (strcmp(".", input) == 0) {
354 // The input has been reduced to nothing useful.
355 input += 1;
356
357 } else {
358 intptr_t segment_len = SegmentLength(input);
359 if (input[0] != '/' && output != buffer) {
360 *output = '/';
361 output++;
362 }
363 strncpy(output, input, segment_len);
364 output += segment_len;
365 input += segment_len;
366 }
367 }
368 *output = '\0';
369 if (relative) {
370 // When resolving against a relative base path, the result should
371 // be relative.
372 if (buffer[0] == '/') {
373 buffer = buffer + 1;
374 }
375 // If we have wittled the path down to nothing, normalize to "./".
376 if (buffer[0] == '\0') {
377 return "./";
378 }
379 }
380 return buffer;
381 }
382
383
384 // See RFC 3986 Section 5.2.3: Merge Paths.
385 static const char* MergePaths(const char* base_path, const char* ref_path) {
386 Zone* zone = Thread::Current()->zone();
387 if (base_path[0] == '\0') {
388 // If the base_path is empty, we prepend '/'.
389 return zone->PrintToString("/%s", ref_path);
390 }
391
392 // We need to find the last '/' in base_path.
393 char* last_slash = strrchr(base_path, '/');
394 if (last_slash == NULL) {
395 // There is no slash in the base_path. Return the ref_path unchanged.
396 return ref_path;
397 }
398
399 // We found a '/' in the base_path. Cut off everything after it and
400 // add the ref_path.
401 intptr_t truncated_base_len = last_slash - base_path;
402 intptr_t ref_path_len = strlen(ref_path);
403 intptr_t len = truncated_base_len + ref_path_len + 1; // +1 for '/'
404 char* buffer = zone->Alloc<char>(len + 1); // +1 for '\0'
405
406 // Copy truncated base.
407 strncpy(buffer, base_path, truncated_base_len);
408
409 // Add a slash.
410 buffer[truncated_base_len] = '/';
411
412 // Copy the ref_path.
413 strncpy((buffer + truncated_base_len + 1), ref_path, ref_path_len);
414
415 // Add the trailing '\0'.
416 buffer[len] = '\0';
417
418 return buffer;
419 }
420
421
422 static char* BuildUri(const ParsedUri& uri) {
423 Zone* zone = Thread::Current()->zone();
424 ASSERT(uri.path != NULL);
425
426 const char* fragment = uri.fragment == NULL ? "" : uri.fragment;
427 const char* fragment_separator = uri.fragment == NULL ? "" : "#";
428 const char* query = uri.query == NULL ? "" : uri.query;
429 const char* query_separator = uri.query == NULL ? "" : "?";
430
431 // If there is no scheme for this uri, just build a relative uri of
432 // the form: "path[?query][#fragment]". This is sort of a
433 // degenerate case, but it occurs when we resolve relative urls
434 // inside a "dart:" library.
435 if (uri.scheme == NULL) {
436 ASSERT(uri.userinfo == NULL && uri.host == NULL && uri.port == NULL);
437 return zone->PrintToString("%s%s%s%s%s",
438 uri.path, query_separator, query,
439 fragment_separator, fragment);
440 }
441
442 // Uri with no authority: "scheme:path[?query][#fragment]"
443 if (uri.host == NULL) {
444 ASSERT(uri.userinfo == NULL && uri.port == NULL);
445 return zone->PrintToString("%s:%s%s%s%s%s",
446 uri.scheme, uri.path, query_separator, query,
447 fragment_separator, fragment);
448 }
449
450 const char* user = uri.userinfo == NULL ? "" : uri.userinfo;
451 const char* user_separator = uri.userinfo == NULL ? "" : "@";
452 const char* port = uri.port == NULL ? "" : uri.port;
453 const char* port_separator = uri.port == NULL ? "" : ":";
454
455 // If the path doesn't start with a '/', add one. We need it to
456 // separate the path from the authority.
457 const char* path_separator = ((uri.path[0] == '\0' || uri.path[0] == '/')
458 ? "" : "/");
459
460 // Uri with authority:
461 // "scheme://[userinfo@]host[:port][/]path[?query][#fragment]"
462 return zone->PrintToString(
463 "%s://%s%s%s%s%s%s%s%s%s%s%s", // There is *nothing* wrong with this.
464 uri.scheme, user, user_separator, uri.host, port_separator, port,
465 path_separator, uri.path, query_separator, query,
466 fragment_separator, fragment);
467 }
468
469
470 // See RFC 3986 Section 5: Reference Resolution
471 //
472 // If the base uri is a relative path with no scheme or authority
473 // specified, then we diverge from the spec and instead we resolve the
474 // uri in a manner which is consistent with the dart:core Uri
475 // implementation. Nicely, this makes uri resolution associative:
476 //
477 // resolve(resolve(absUrl, relUrl1), relUrl2)
478 // ==
479 // resolve((absUrl, resolve(relUrl1, relUrl2))
480 //
481 // In practice, during uri canonicalization the base uri is always
482 // absolute, so the issue is moot, but consistency is nice to have.
483 bool ResolveUri(const char* ref_uri,
484 const char* base_uri,
485 const char** target_uri) {
486 // Parse the reference uri.
487 ParsedUri ref;
488 if (!ParseUri(ref_uri, &ref)) {
489 *target_uri = NULL;
490 return false;
491 }
492
493 ParsedUri target;
494 if (ref.scheme != NULL) {
495 if (strcmp(ref.scheme, "dart") == 0) {
496 Zone* zone = Thread::Current()->zone();
497 *target_uri = zone->MakeCopyOfString(ref_uri);
498 return true;
499 }
500
501 // When the ref_uri specifies a scheme, the base_uri is ignored.
502 target.scheme = ref.scheme;
503 target.userinfo = ref.userinfo;
504 target.host = ref.host;
505 target.port = ref.port;
506 target.path = RemoveDotSegments(ref.path, false);
507 target.query = ref.query;
508 target.fragment = ref.fragment;
509 *target_uri = BuildUri(target);
510 return true;
511 }
512
513 // Parse the base uri.
514 ParsedUri base;
515 if (!ParseUri(base_uri, &base)) {
516 *target_uri = NULL;
517 return false;
518 }
519
520 if ((base.scheme != NULL) && strcmp(base.scheme, "dart") == 0) {
521 Zone* zone = Thread::Current()->zone();
522 *target_uri = zone->MakeCopyOfString(ref_uri);
523 return true;
524 }
525
526 if (ref.host != NULL) {
527 // When the ref_uri specifies an authority, we only use the base scheme.
528 target.scheme = base.scheme;
529 target.userinfo = ref.userinfo;
530 target.host = ref.host;
531 target.port = ref.port;
532 target.path = RemoveDotSegments(ref.path, false);
533 target.query = ref.query;
534 target.fragment = ref.fragment;
535 *target_uri = BuildUri(target);
536 return true;
537 }
538
539 if (ref.path[0] == '\0') {
540 // Empty path. Use most parts of base_uri.
541 target.scheme = base.scheme;
542 target.userinfo = base.userinfo;
543 target.host = base.host;
544 target.port = base.port;
545 target.path = base.path;
546 target.query = ((ref.query == NULL) ? base.query : ref.query);
547 target.fragment = ref.fragment;
548 *target_uri = BuildUri(target);
549 return true;
550
551 } else if (ref.path[0] == '/') {
552 // Absolute path. ref_path wins.
553 target.scheme = base.scheme;
554 target.userinfo = base.userinfo;
555 target.host = base.host;
556 target.port = base.port;
557 target.path = RemoveDotSegments(ref.path, false);
558 target.query = ref.query;
559 target.fragment = ref.fragment;
560 *target_uri = BuildUri(target);
561 return true;
562
563 } else {
564 // Relative path. We need to merge base_path and ref_path.
565 bool relative_base = (base.scheme == NULL &&
566 base.host == NULL &&
567 base.path[0] != '/');
568
569 target.scheme = base.scheme;
570 target.userinfo = base.userinfo;
571 target.host = base.host;
572 target.port = base.port;
573 target.path = RemoveDotSegments(MergePaths(base.path, ref.path),
574 relative_base);
575 target.query = ref.query;
576 target.fragment = ref.fragment;
577 *target_uri = BuildUri(target);
578 return true;
579 }
580 }
581
582 } // namespace dart
OLDNEW
« no previous file with comments | « runtime/vm/uri.h ('k') | runtime/vm/uri_test.cc » ('j') | tests/corelib/uri_test.dart » ('J')

Powered by Google App Engine
This is Rietveld 408576698