runtime/vm/uri.cc - Issue 2011543002: Canonicalize uris in C++ instead of Dart for the standalone embedder.

Side by Side Diff: runtime/vm/uri.cc

Issue 2011543002: Canonicalize uris in C++ instead of Dart for the standalone embedder. (Closed) Base URL: git@github.com:dart-lang/sdk.git@master

Patch Set: more code review fixes Created 4 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 // Copyright (c) 2016, the Dart project authors. Please see the AUTHORS file

	2 // for details. All rights reserved. Use of this source code is governed by a

	3 // BSD-style license that can be found in the LICENSE file.

	4

	5 #include "vm/uri.h"

	6

	7 #include "vm/zone.h"

	8

	9 namespace dart {

	10

	11 // Lower-case a string in place.

	12 static void StringLower(char* str) {

	13 for (int i = 0; str[i] != '\0'; i++) {

	14 char c = str[i];

	15 if (c >= 'A' && c <= 'Z') {

	16 str[i] = c + ('a' - 'A');

	17 }

	18 }

	19 }

	20

	21

	22 static bool IsUnreservedChar(intptr_t value) {

	23 return ((value >= 'a' && value <= 'z') \|\|

	24 (value >= 'A' && value <= 'Z') \|\|

	25 (value >= '0' && value <= '9') \|\|

	26 value == '-' \|\|

	27 value == '.' \|\|

	28 value == '_' \|\|

	29 value == '~');

	30 }

	31

	32

	33 static bool IsDelimiter(intptr_t value) {

	34 switch (value) {

	35 case ':': case '/': case '?': case '#':

	36 case '[': case ']': case '@': case '!':

	37 case '$': case '&': case '\'': case '(':

	38 case ')': case '*': case '+': case ',':

	39 case ';': case '=':

	40 return true;

	41 default:

	42 return false;

	43 }

	44 }

	45

	46

	47 static bool IsHexDigit(char value) {

	48 return ((value >- '0' && value <= '9') \|\|

	49 (value >= 'A' && value <= 'F') \|\|

	50 (value >= 'a' && value <= 'f'));

	51 }

	52

	53

	54 static int HexValue(char digit) {

	55 if ((digit >= '0' && digit <= '9')) {

	56 return digit - '0';

	57 }

	58 if ((digit >= 'A' && digit <= 'F')) {

	59 return digit - 'A' + 10;

	60 }

	61 if ((digit >= 'a' && digit <= 'f')) {

	62 return digit - 'a' + 10;

	63 }

	64 UNREACHABLE();

	65 return 0;

	66 }

	67

	68

	69 static int GetEscapedValue(const char* str, intptr_t pos, intptr_t len) {

	70 if (pos + 2 >= len) {

	71 // Not enough room for a valid escape sequence.

	72 return -1;

	73 }

	74 if (str[pos] != '%') {

	75 // Escape sequences start with '%'.

	76 return -1;

	77 }

	78

	79 char digit1 = str[pos + 1];

	80 char digit2 = str[pos + 2];

	81 if (!IsHexDigit(digit1) \|\| !IsHexDigit(digit2)) {

	82 // Invalid escape sequence. Ignore it.

	83 return -1;

	84 }

	85 return HexValue(digit1) * 16 + HexValue(digit2);

	86 }

	87

	88

	89 static char* NormalizeEscapes(const char* str, intptr_t len) {

	90 // Allocate the buffer.

	91 Zone* zone = Thread::Current()->zone();

	92 // We multiply len by three because a percent-escape sequence is

	93 // three characters long (e.g. ' ' -> '%20). +1 for '\0'. We could

	94 // take two passes through the string and avoid the excess

	95 // allocation, but it's zone-memory so it doesn't seem necessary.

	96 char* buffer = zone->Alloc<char>(len * 3 + 1);

	97

	98 // Copy the string, normalizing as we go.

	99 intptr_t buffer_pos = 0;

	100 intptr_t pos = 0;

	101 while (pos < len) {

	102 int escaped_value = GetEscapedValue(str, pos, len);

	103 if (escaped_value >= 0) {

	104 // If one of the special "unreserved" characters has been

	105 // escaped, revert the escaping. Otherwise preserve the

	106 // escaping.

	107 if (IsUnreservedChar(escaped_value)) {

	108 buffer[buffer_pos] = escaped_value;

	109 buffer_pos++;

	110 } else {

	111 OS::SNPrint(buffer + buffer_pos, 4, "%%%02X", escaped_value);

	112 buffer_pos += 3;

	113 }

	114 pos += 3;

	115 } else {

	116 char c = str[pos];

	117 // If a delimiter or unreserved character is currently not

	118 // escaped, preserve that. If there is a busted %-sequence in

	119 // the input, preserve that too.

	120 if (c == '%' \|\| IsDelimiter(c) \|\| IsUnreservedChar(c)) {

	121 buffer[buffer_pos] = c;

	122 buffer_pos++;

	123 } else {

	124 // Escape funky characters.

	125 OS::SNPrint(buffer + buffer_pos, 4, "%%%02X", c);

	126 buffer_pos += 3;

	127 }

	128 pos++;

	129 }

	130 }

	131 buffer[buffer_pos] = '\0';

	132 return buffer;

	133 }

	134

	135

	136 static void ClearParsedUri(ParsedUri* parsed_uri) {

	137 parsed_uri->scheme = NULL;

	138 parsed_uri->userinfo = NULL;

	139 parsed_uri->host = NULL;

	140 parsed_uri->port = NULL;

	141 parsed_uri->path = NULL;

	142 parsed_uri->query = NULL;

	143 parsed_uri->fragment = NULL;

	144 }

	145

	146

	147 static intptr_t ParseAuthority(const char* authority, ParsedUri* parsed_uri) {

	148 Zone* zone = Thread::Current()->zone();

	149 const char* current = authority;

	150 intptr_t len = 0;

	151

	152 size_t userinfo_len = strcspn(current, "@/");

	153 if (current[userinfo_len] == '@') {

	154 // The '@' character follows the optional userinfo string.

	155 parsed_uri->userinfo = NormalizeEscapes(current, userinfo_len);

	156 current += userinfo_len + 1;

	157 len += userinfo_len + 1;

	158 } else {

	159 parsed_uri->userinfo = NULL;

	160 }

	161

	162 size_t host_len = strcspn(current, ":/");

	163 char* host = NormalizeEscapes(current, host_len);

	164 StringLower(host);

	165 parsed_uri->host = host;

	166 len += host_len;

	167

	168 if (current[host_len] == ':') {

	169 // The ':' character precedes the optional port string.

	170 const char* port_start = current + host_len + 1; // +1 for ':'

	171 size_t port_len = strcspn(port_start, "/");

	172 parsed_uri->port = zone->MakeCopyOfStringN(port_start, port_len);

	173 len += 1 + port_len; // +1 for ':'

	174 } else {

	175 parsed_uri->port = NULL;

	176 }

	177 return len;

	178 }

	179

	180

	181 // Performs a simple parse of a uri into its components.

	182 // See RFC 3986 Section 3: Syntax.

	183 bool ParseUri(const char* uri, ParsedUri* parsed_uri) {

	184 Zone* zone = Thread::Current()->zone();

	185

	186 // The first ':' separates the scheme from the rest of the uri. If

	187 // a ':' occurs after the first '/' it doesn't count.

	188 size_t scheme_len = strcspn(uri, ":/");

	189 const char* rest = uri;

	190 if (uri[scheme_len] == ':') {

	191 char* scheme = zone->MakeCopyOfStringN(uri, scheme_len);

	192 StringLower(scheme);

	193 parsed_uri->scheme = scheme;

	194 rest = uri + scheme_len + 1;

	195 } else {

	196 parsed_uri->scheme = NULL;

	197 }

	198

	199 // The first '#' separates the optional fragment

	200 const char* hash_pos = rest + strcspn(rest, "#");

	201 if (*hash_pos == '#') {

	202 // There is a fragment part.

	203 const char* fragment_start = hash_pos + 1;

	204 parsed_uri->fragment =

	205 NormalizeEscapes(fragment_start, strlen(fragment_start));

	206 } else {

	207 parsed_uri->fragment = NULL;

	208 }

	209

	210 // The first '?' or '#' separates the hierarchical part from the

	211 // optional query.

	212 const char* question_pos = rest + strcspn(rest, "?#");

	213 if (*question_pos == '?') {

	214 // There is a query part.

	215 const char* query_start = question_pos + 1;

	216 parsed_uri->query =

	217 NormalizeEscapes(query_start, (hash_pos - query_start));

	218 } else {

	219 parsed_uri->query = NULL;

	220 }

	221

	222 const char* path_start = rest;

	223 if (rest[0] == '/' && rest[1] == '/') {

	224 // There is an authority part.

	225 const char* authority_start = rest + 2; // 2 for '//'.

	226

	227 intptr_t authority_len =

	228 ParseAuthority(authority_start, parsed_uri);

	229 if (authority_len < 0) {

	230 ClearParsedUri(parsed_uri);

	231 return false;

	232 }

	233 path_start = authority_start + authority_len;

	234 } else {

	235 parsed_uri->userinfo = NULL;

	236 parsed_uri->host = NULL;

	237 parsed_uri->port = NULL;

	238 }

	239

	240 // Double slashes in the path do not parse.

	241 bool saw_slash = false;

	242 for (const char* pos = path_start; pos < question_pos; pos++) {

	243 if (*pos == '/') {

	244 if (saw_slash) {

	245 ClearParsedUri(parsed_uri);

	246 return false;

	247 }

	248 saw_slash = true;

	249 } else {

	250 saw_slash = false;

	251 }

	252 }

	253

	254 // The path is the substring between the authority and the query.

	255 parsed_uri->path = NormalizeEscapes(path_start, (question_pos - path_start));

	256 return true;

	257 }

	258

	259

	260 static char* RemoveLastSegment(char* current,

	261 char* base,

	262 bool relative) {

	263 if (relative) {

	264 // If we are removing segments from a relative url, do not remove

	265 // any initial ".." segments.

	266 if (current == base \|\|

	267 (((current - base) == 1) && (base[0] == '/'))) {

	268 strncpy(base, "/..", 3);

	269 return base + 3;

	270 } else if (((current - base) >= 3) &&

	271 (strncmp(current - 3, "/..", 3) == 0)) {

	272 // We have run out of segments to remove. Since the base is

	273 // relative, start adding ".."s onto the output buffer.

	274 strncpy(current, "/..", 3);

	275 return current + 3;

	276 }

	277 }

	278 if (current == base) {

	279 return current;

	280 }

	281 ASSERT(current > base);

	282 for (current--; current > base; current--) {

	283 if (*current == '/') {

	284 // We have found the beginning of the last segment.

	285 return current;

	286 }

	287 }

	288 ASSERT(current == base);

	289 return current;

	290 }

	291

	292

	293 static intptr_t SegmentLength(const char* input) {

	294 const char* cp = input;

	295

	296 // Include initial slash in the segment, if any.

	297 if (*cp == '/') {

	298 cp++;

	299 }

	300

	301 // Don't include trailing slash in the segment.

	302 cp += strcspn(cp, "/");

	303 return cp - input;

	304 }

	305

	306

	307 // See RFC 3986 Section 5.2.4: Remove Dot Segments.

	308 static const char* RemoveDotSegments(const char* path, bool relative) {

	309 const char* input = path;

	310

	311 // The output path will always be less than or equal to the size of

	312 // the input path.

	313 Zone* zone = Thread::Current()->zone();

	314 char* buffer = zone->Alloc<char>(strlen(path) + 1); // +1 for '\0'

	315 char* output = buffer;

	316

	317 while (*input != '\0') {

	318 if (strncmp("../", input, 3) == 0) {

	319 // Discard initial "../" from the input. It's junk.

	320 if (relative) {

	321 output = RemoveLastSegment(output, buffer, relative);

	322 }

	323 input += 3;

	324

	325 } else if (strncmp("./", input, 3) == 0) {

	326 // Discard initial "./" from the input. It's junk.

	327 input += 2;

	328

	329 } else if (strncmp("/./", input, 3) == 0) {

	330 // Advance past the "/." part of the input.

	331 input += 2;

	332

	333 } else if (strcmp("/.", input) == 0) {

	334 // Pretend the input just contains a "/".

	335 input = "/";

	336

	337 } else if (strncmp("/../", input, 4) == 0) {

	338 // Advance past the "/.." part of the input and remove one

	339 // segment from the output.

	340 input += 3;

	341 output = RemoveLastSegment(output, buffer, relative);

	342

	343 } else if (strcmp("/..", input) == 0) {

	344 // Pretend the input contains a "/" and remove one segment from

	345 // the output.

	346 input = "/";

	347 output = RemoveLastSegment(output, buffer, relative);

	348

	349 } else if (strcmp("..", input) == 0) {

	350 // The input has been reduced to nothing useful.

	351 input += 2;

	352

	353 } else if (strcmp(".", input) == 0) {

	354 // The input has been reduced to nothing useful.

	355 input += 1;

	356

	357 } else {

	358 intptr_t segment_len = SegmentLength(input);

	359 if (input[0] != '/' && output != buffer) {

	360 *output = '/';

	361 output++;

	362 }

	363 strncpy(output, input, segment_len);

	364 output += segment_len;

	365 input += segment_len;

	366 }

	367 }

	368 *output = '\0';

	369 if (relative) {

	370 // When resolving against a relative base path, the result should

	371 // be relative.

	372 if (buffer[0] == '/') {

	373 buffer = buffer + 1;

	374 }

	375 // If we have wittled the path down to nothing, normalize to "./".

	376 if (buffer[0] == '\0') {

	377 return "./";

	378 }

	379 }

	380 return buffer;

	381 }

	382

	383

	384 // See RFC 3986 Section 5.2.3: Merge Paths.

	385 static const char* MergePaths(const char* base_path, const char* ref_path) {

	386 Zone* zone = Thread::Current()->zone();

	387 if (base_path[0] == '\0') {

	388 // If the base_path is empty, we prepend '/'.

	389 return zone->PrintToString("/%s", ref_path);

	390 }

	391

	392 // We need to find the last '/' in base_path.

	393 char* last_slash = strrchr(base_path, '/');

	394 if (last_slash == NULL) {

	395 // There is no slash in the base_path. Return the ref_path unchanged.

	396 return ref_path;

	397 }

	398

	399 // We found a '/' in the base_path. Cut off everything after it and

	400 // add the ref_path.

	401 intptr_t truncated_base_len = last_slash - base_path;

	402 intptr_t ref_path_len = strlen(ref_path);

	403 intptr_t len = truncated_base_len + ref_path_len + 1; // +1 for '/'

	404 char* buffer = zone->Alloc<char>(len + 1); // +1 for '\0'

	405

	406 // Copy truncated base.

	407 strncpy(buffer, base_path, truncated_base_len);

	408

	409 // Add a slash.

	410 buffer[truncated_base_len] = '/';

	411

	412 // Copy the ref_path.

	413 strncpy((buffer + truncated_base_len + 1), ref_path, ref_path_len);

	414

	415 // Add the trailing '\0'.

	416 buffer[len] = '\0';

	417

	418 return buffer;

	419 }

	420

	421

	422 static char* BuildUri(const ParsedUri& uri) {

	423 Zone* zone = Thread::Current()->zone();

	424 ASSERT(uri.path != NULL);

	425

	426 const char* fragment = uri.fragment == NULL ? "" : uri.fragment;

	427 const char* fragment_separator = uri.fragment == NULL ? "" : "#";

	428 const char* query = uri.query == NULL ? "" : uri.query;

	429 const char* query_separator = uri.query == NULL ? "" : "?";

	430

	431 // If there is no scheme for this uri, just build a relative uri of

	432 // the form: "path[?query][#fragment]". This is sort of a

	433 // degenerate case, but it occurs when we resolve relative urls

	434 // inside a "dart:" library.

	435 if (uri.scheme == NULL) {

	436 ASSERT(uri.userinfo == NULL && uri.host == NULL && uri.port == NULL);

	437 return zone->PrintToString("%s%s%s%s%s",

	438 uri.path, query_separator, query,

	439 fragment_separator, fragment);

	440 }

	441

	442 // Uri with no authority: "scheme:path[?query][#fragment]"

	443 if (uri.host == NULL) {

	444 ASSERT(uri.userinfo == NULL && uri.port == NULL);

	445 return zone->PrintToString("%s:%s%s%s%s%s",

	446 uri.scheme, uri.path, query_separator, query,

	447 fragment_separator, fragment);

	448 }

	449

	450 const char* user = uri.userinfo == NULL ? "" : uri.userinfo;

	451 const char* user_separator = uri.userinfo == NULL ? "" : "@";

	452 const char* port = uri.port == NULL ? "" : uri.port;

	453 const char* port_separator = uri.port == NULL ? "" : ":";

	454

	455 // If the path doesn't start with a '/', add one. We need it to

	456 // separate the path from the authority.

	457 const char* path_separator = ((uri.path[0] == '\0' \|\| uri.path[0] == '/')

	458 ? "" : "/");

	459

	460 // Uri with authority:

	461 // "scheme://[userinfo@]host[:port][/]path[?query][#fragment]"

	462 return zone->PrintToString(

	463 "%s://%s%s%s%s%s%s%s%s%s%s%s", // There is nothing wrong with this.

	464 uri.scheme, user, user_separator, uri.host, port_separator, port,

	465 path_separator, uri.path, query_separator, query,

	466 fragment_separator, fragment);

	467 }

	468

	469

	470 // See RFC 3986 Section 5: Reference Resolution

	471 //

	472 // If the base uri is a relative path with no scheme or authority

	473 // specified, then we diverge from the spec and instead we resolve the

	474 // uri in a manner which is consistent with the dart:core Uri

	475 // implementation. Nicely, this makes uri resolution associative:

	476 //

	477 // resolve(resolve(absUrl, relUrl1), relUrl2)

	478 // ==

	479 // resolve((absUrl, resolve(relUrl1, relUrl2))

	480 //

	481 // In practice, during uri canonicalization the base uri is always

	482 // absolute, so the issue is moot, but consistency is nice to have.

	483 bool ResolveUri(const char* ref_uri,

	484 const char* base_uri,

	485 const char** target_uri) {

	486 // Parse the reference uri.

	487 ParsedUri ref;

	488 if (!ParseUri(ref_uri, &ref)) {

	489 *target_uri = NULL;

	490 return false;

	491 }

	492

	493 ParsedUri target;

	494 if (ref.scheme != NULL) {

	495 if (strcmp(ref.scheme, "dart") == 0) {

	496 Zone* zone = Thread::Current()->zone();

	497 *target_uri = zone->MakeCopyOfString(ref_uri);

	498 return true;

	499 }

	500

	501 // When the ref_uri specifies a scheme, the base_uri is ignored.

	502 target.scheme = ref.scheme;

	503 target.userinfo = ref.userinfo;

	504 target.host = ref.host;

	505 target.port = ref.port;

	506 target.path = RemoveDotSegments(ref.path, false);

	507 target.query = ref.query;

	508 target.fragment = ref.fragment;

	509 *target_uri = BuildUri(target);

	510 return true;

	511 }

	512

	513 // Parse the base uri.

	514 ParsedUri base;

	515 if (!ParseUri(base_uri, &base)) {

	516 *target_uri = NULL;

	517 return false;

	518 }

	519

	520 if ((base.scheme != NULL) && strcmp(base.scheme, "dart") == 0) {

	521 Zone* zone = Thread::Current()->zone();

	522 *target_uri = zone->MakeCopyOfString(ref_uri);

	523 return true;

	524 }

	525

	526 if (ref.host != NULL) {

	527 // When the ref_uri specifies an authority, we only use the base scheme.

	528 target.scheme = base.scheme;

	529 target.userinfo = ref.userinfo;

	530 target.host = ref.host;

	531 target.port = ref.port;

	532 target.path = RemoveDotSegments(ref.path, false);

	533 target.query = ref.query;

	534 target.fragment = ref.fragment;

	535 *target_uri = BuildUri(target);

	536 return true;

	537 }

	538

	539 if (ref.path[0] == '\0') {

	540 // Empty path. Use most parts of base_uri.

	541 target.scheme = base.scheme;

	542 target.userinfo = base.userinfo;

	543 target.host = base.host;

	544 target.port = base.port;

	545 target.path = base.path;

	546 target.query = ((ref.query == NULL) ? base.query : ref.query);

	547 target.fragment = ref.fragment;

	548 *target_uri = BuildUri(target);

	549 return true;

	550

	551 } else if (ref.path[0] == '/') {

	552 // Absolute path. ref_path wins.

	553 target.scheme = base.scheme;

	554 target.userinfo = base.userinfo;

	555 target.host = base.host;

	556 target.port = base.port;

	557 target.path = RemoveDotSegments(ref.path, false);

	558 target.query = ref.query;

	559 target.fragment = ref.fragment;

	560 *target_uri = BuildUri(target);

	561 return true;

	562

	563 } else {

	564 // Relative path. We need to merge base_path and ref_path.

	565 bool relative_base = (base.scheme == NULL &&

	566 base.host == NULL &&

	567 base.path[0] != '/');

	568

	569 target.scheme = base.scheme;

	570 target.userinfo = base.userinfo;

	571 target.host = base.host;

	572 target.port = base.port;

	573 target.path = RemoveDotSegments(MergePaths(base.path, ref.path),

	574 relative_base);

	575 target.query = ref.query;

	576 target.fragment = ref.fragment;

	577 *target_uri = BuildUri(target);

	578 return true;

	579 }

	580 }

	581

	582 } // namespace dart

OLD	NEW

« no previous file with comments | « runtime/vm/uri.h ('k') | runtime/vm/uri_test.cc » ('j') | tests/corelib/uri_test.dart » ('J')