Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(2)

Unified Diff: sdk/lib/core/uri.dart

Issue 321543003: New, more validating, parser for URI. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Update and add tests. Created 6 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: sdk/lib/core/uri.dart
diff --git a/sdk/lib/core/uri.dart b/sdk/lib/core/uri.dart
index 091e1fece45903402fde1890ed9698afe85b10e8..8d3f589c7602eb2703828493f66289e2ff904b72 100644
--- a/sdk/lib/core/uri.dart
+++ b/sdk/lib/core/uri.dart
@@ -172,174 +172,256 @@ class Uri {
// query = *( pchar / "/" / "?" )
//
// fragment = *( pchar / "/" / "?" )
- bool isRegName(int ch) {
- return ch < 128 && ((_regNameTable[ch >> 4] & (1 << (ch & 0x0f))) != 0);
- }
-
- int ipV6Address(int index) {
- // IPv6. Skip to ']'.
- index = uri.indexOf(']', index);
- if (index == -1) {
- throw new FormatException("Bad end of IPv6 host");
- }
- return index + 1;
- }
-
- int length = uri.length;
- int index = 0;
-
- int schemeEndIndex = 0;
-
- if (length == 0) {
+ if (uri.isEmpty) {
return new Uri();
}
+ String scheme;
+ String userInfo = "";
+ String host = "";
+ int port = 0;
+ String path = "";
+ String query;
+ String fragment;
+ bool allowColonInPath = false;
- if (uri.codeUnitAt(0) != _SLASH) {
- // Can be scheme.
- while (index < length) {
- // Look for ':'. If found, continue from the post of ':'. If not (end
- // reached or invalid scheme char found) back up one char, and continue
- // to path.
- // Note that scheme-chars is contained in path-chars.
- int codeUnit = uri.codeUnitAt(index++);
- if (!_isSchemeCharacter(codeUnit)) {
- if (codeUnit == _COLON) {
- schemeEndIndex = index;
- } else {
- // Back up one char, since we met an invalid scheme char.
- index--;
+ /// Index of current char.
Søren Gjesse 2014/06/11 08:33:15 Why dart-doc comments here?
Lasse Reichstein Nielsen 2014/06/12 08:07:31 Because I assume the editor will display that comm
+ int index = 0;
+ /// Current char.
+ int current = _EOI;
+ /// Start index of current section being parsed.
+ int start = 0;
Anders Johnsen 2014/06/11 10:00:03 Start is a bit ambiguous. currentSectionStart ?
Lasse Reichstein Nielsen 2014/06/12 08:07:31 .... loooooong name. But ok.
+ /// The position after the current character. If parsing a percent
+ /// escape, this is index + 3, otherwise index + 1.
+ int nextIndex = 0;
+
+ void advance() {
+ index = nextIndex;
+ if (index < uri.length) {
+ current = uri.codeUnitAt(index);
+ if (_MAX_VALID_CHAR >= current) {
+ if (_PERCENT != current) {
+ nextIndex = nextIndex + 1;
+ return;
+ }
+ if (index + 2 >= uri.length ||
+ !_isHexDigit(uri.codeUnitAt(index + 1)) ||
+ !_isHexDigit(uri.codeUnitAt(index + 2))) {
+ _fail(uri, index, "Incomplete percent escape");
}
- break;
+ // Valid escape, returns _PERCENT as current.
+ nextIndex = nextIndex + 3;
+ return;
}
+ _fail(uri, index, "Unexpected character");
}
- }
-
- int userInfoEndIndex = -1;
- int portIndex = -1;
- int authorityEndIndex = schemeEndIndex;
- // If we see '//', there must be an authority.
- if (authorityEndIndex == index &&
- authorityEndIndex + 1 < length &&
- uri.codeUnitAt(authorityEndIndex) == _SLASH &&
- uri.codeUnitAt(authorityEndIndex + 1) == _SLASH) {
- // Skip '//'.
- authorityEndIndex += 2;
- // It can both be host and userInfo.
- while (authorityEndIndex < length) {
- int codeUnit = uri.codeUnitAt(authorityEndIndex++);
- if (!isRegName(codeUnit)) {
- if (codeUnit == _LEFT_BRACKET) {
- authorityEndIndex = ipV6Address(authorityEndIndex);
- } else if (portIndex == -1 && codeUnit == _COLON) {
- // First time ':'.
- portIndex = authorityEndIndex;
- } else if (codeUnit == _AT_SIGN || codeUnit == _COLON) {
- // Second time ':' or first '@'. Must be userInfo.
- userInfoEndIndex = uri.indexOf('@', authorityEndIndex - 1);
- // Not found. Must be path then.
- if (userInfoEndIndex == -1) {
- authorityEndIndex = index;
- break;
- }
- portIndex = -1;
- authorityEndIndex = userInfoEndIndex + 1;
- // Now it can only be host:port.
- while (authorityEndIndex < length) {
- int codeUnit = uri.codeUnitAt(authorityEndIndex++);
- if (!isRegName(codeUnit)) {
- if (codeUnit == _LEFT_BRACKET) {
- authorityEndIndex = ipV6Address(authorityEndIndex);
- } else if (codeUnit == _COLON) {
- if (portIndex != -1) {
- throw new FormatException("Double port in host");
- }
- portIndex = authorityEndIndex;
- } else {
- authorityEndIndex--;
- break;
- }
- }
- }
- break;
- } else {
- authorityEndIndex--;
- break;
+ current = _EOI;
+ }
+
+ // Parse authority.
+ void parseAuth() {
+ start = index;
+ void parseIpV6() {
+ assert(current == _LEFT_BRACKET);
+ assert(start == index);
+ for (int i = index + 1; i < uri.length; i++) {
+ if (uri.codeUnitAt(i) == _RIGHT_BRACKET) {
+ nextIndex = i + 1;
+ advance();
+ host = uri.substring(start, index);
Søren Gjesse 2014/06/11 08:33:15 Are we doing proper IPv6 validation later, or is t
Lasse Reichstein Nielsen 2014/06/12 08:07:31 I believe we are doing it later. I don't know if
+ return;
}
}
+ _fail(uri, start, "Unmatched [ in host name");
kevmoo 2014/06/10 21:29:08 quote '['
}
- } else {
- authorityEndIndex = schemeEndIndex;
- }
- // At path now.
- int pathEndIndex = authorityEndIndex;
- while (pathEndIndex < length) {
- int codeUnit = uri.codeUnitAt(pathEndIndex++);
- if (codeUnit == _QUESTION || codeUnit == _NUMBER_SIGN) {
- pathEndIndex--;
- break;
+ void parseHost() {
+ assert(start == index);
+ if (current == _LEFT_BRACKET) {
+ parseIpV6();
+ return;
+ }
+ while (_isRegNameChar(current)) {
+ advance();
+ }
+ host = uri.substring(start, index);
}
- }
- // Maybe query.
- int queryEndIndex = pathEndIndex;
- if (queryEndIndex < length && uri.codeUnitAt(queryEndIndex) == _QUESTION) {
- while (queryEndIndex < length) {
- int codeUnit = uri.codeUnitAt(queryEndIndex++);
- if (codeUnit == _NUMBER_SIGN) {
- queryEndIndex--;
- break;
+ int parsePort() {
+ assert(_isDigit(current));
+ int portVal = current - _ZERO;
+ advance();
+ while (_isDigit(current)) {
+ portVal = portVal * 10 + (current - _ZERO);
Søren Gjesse 2014/06/11 08:33:15 Any limitations on the port value in the spec?
+ advance();
}
+ return portVal;
}
- }
-
- var scheme = null;
- if (schemeEndIndex > 0) {
- scheme = uri.substring(0, schemeEndIndex - 1);
- }
- var host = "";
- var userInfo = "";
- var port = 0;
- if (schemeEndIndex != authorityEndIndex) {
- int startIndex = schemeEndIndex + 2;
- if (userInfoEndIndex > 0) {
- userInfo = uri.substring(startIndex, userInfoEndIndex);
- startIndex = userInfoEndIndex + 1;
+ maybeUserInfo: {
+ // Break this when we know user-info is done or not there.
+ // user-info or reg-name
+ if (current == _LEFT_BRACKET) break maybeUserInfo;
+ while (_isRegNameChar(current)) {
+ advance();
+ }
+ if (current == _SLASH) {
+ host = uri.substring(start, index);
+ return;
+ }
+ if (current == _AT_SIGN) {
+ userInfo = uri.substring(start, index);
+ advance();
+ start = index;
+ break maybeUserInfo;
+ }
+ if (current == _COLON) {
+ // First colon seen after what might be a host name.
+ // Can be either part of user-info or preceding a port.
+ int hostEnd = index;
+ advance();
+ // user-info or port.
+ if (_isDigit(current)) {
+ int portVal = parsePort();
+ if (current == _SLASH || current == _EOI) {
+ host = uri.substring(start, hostEnd);
+ port = portVal;
+ return;
+ }
+ }
+ }
+ if (current == _EOI) {
+ host = uri.substring(start, index);
+ return;
+ }
+ // A non-port character seen after a colon.
+ // This must be user-info.
+ while (_isUserInfoChar(current)) {
+ advance();
+ }
+ if (current != _AT_SIGN) {
+ _fail(uri, index, "Expected @");
kevmoo 2014/06/10 21:29:08 quote '@'
+ }
+ userInfo = uri.substring(start, index);
+ advance();
+ start = index;
+ } // end maybeUserInfo
+ parseHost();
+ if (current == _COLON) {
+ advance();
+ if (!_isDigit(current)) {
+ _fail(uri, index, "Expected port number");
+ }
+ port = parsePort();
}
- if (portIndex > 0) {
- var portStr = uri.substring(portIndex, authorityEndIndex);
- try {
- port = int.parse(portStr);
- } catch (_) {
- throw new FormatException("Invalid port: '$portStr'");
+ } // end parseAuth().
+
+ // Start parsing.
+ to_path: { // Break this block to go to parsing the path.
+ advance();
+ if (_isAlpha(current)) {
Anders Johnsen 2014/06/11 10:00:03 Do we need 'scheme != null' to prevent double-pars
Lasse Reichstein Nielsen 2014/06/12 08:07:31 No, there is no parsing before this point. The cod
+ // May be scheme or path.
+ do {
+ advance();
+ } while (_isSchemeCharacter(current));
+ if (current != _COLON) {
+ break to_path;
+ }
+ allowColonInPath = true;
+ scheme = uri.substring(0, index);
+ advance();
+ start = index;
+ }
+ // Path or authority.
+ if (current == _SLASH) {
+ allowColonInPath = true;
+ advance();
+ if (current == _SLASH) {
+ advance();
+ parseAuth();
+ if (current == _EOI) {
+ start = index;
+ break to_path;
+ }
+ if (current != _SLASH) {
+ _fail(uri, index, "Expected /");
kevmoo 2014/06/10 21:29:08 quote '/'
+ }
+ start = index;
+ advance();
}
- host = uri.substring(startIndex, portIndex - 1);
- } else {
- host = uri.substring(startIndex, authorityEndIndex);
}
+ } // end to_path.
+ if (!allowColonInPath) {
+ while (_isPathChar(current) || current == _PERCENT) {
Anders Johnsen 2014/06/11 10:00:03 Will _isPathChar return true for '/' ?
Lasse Reichstein Nielsen 2014/06/12 08:07:31 No.
+ if (current == _COLON) {
+ _fail(uri, index, "Colon in path before first '/'");
+ }
+ advance();
+ }
+ }
+ // Start or continue path. May be empty.
+ while (_isPathChar(current) || current == _SLASH || current == _PERCENT) {
+ advance();
}
+ path = uri.substring(start, index);
- var path = uri.substring(authorityEndIndex, pathEndIndex);
- var query = "";
- if (pathEndIndex < queryEndIndex) {
- query = uri.substring(pathEndIndex + 1, queryEndIndex);
+ if (current == _QUESTION) {
+ start = nextIndex;
+ do {
+ advance();
+ } while (_isQueryChar(current) || current == _PERCENT);
+ query = uri.substring(start, index);
}
- var fragment = "";
- // If queryEndIndex is not at end (length), there is a fragment.
- if (queryEndIndex < length) {
- fragment = uri.substring(queryEndIndex + 1, length);
+
+ if (current == _NUMBER_SIGN) {
+ // Fragment can contain same characters as query.
Søren Gjesse 2014/06/11 08:33:15 No number sign in fragment?
Lasse Reichstein Nielsen 2014/06/12 08:07:32 No. At most one # in any URL.
+ start = nextIndex;
+ do {
+ advance();
+ } while (_isQueryChar(current) || current == _PERCENT);
+ fragment = uri.substring(start, index);
+ }
+
+ if (current != _EOI) {
+ _fail(uri, index, "Unexpected character");
}
return new Uri(scheme: scheme,
Anders Johnsen 2014/06/11 10:00:03 This constructor does some extra validation. Do we
Lasse Reichstein Nielsen 2014/06/12 08:07:31 As pointed out elsewhere, the validation needs to
userInfo: userInfo,
- host: host,
port: port,
kevmoo 2014/06/10 21:29:08 nit: might as well keep these params in the origin
Lasse Reichstein Nielsen 2014/06/12 08:07:31 ACK.
+ host: host,
path: path,
query: query,
fragment: fragment);
}
+ // Report a parse failure.
+ //
+ // Always throws a FormatException whose text starts with [message].
+ // If [index] is the end of [uri], the text only notes the unexpected end
+ // of input. Otherwise it names the position, then appends an excerpt of
+ // [uri] and, on the following line, a '^' placed under the character at
+ // [index].
+ static void _fail(uri, index, message) {
kevmoo 2014/06/10 21:29:08 Add types to parameters
+   if (index == uri.length) {
+     message += ": Unexpected end of input.";
kevmoo 2014/06/10 21:29:08 Could this be changed so each message reads like a
+   } else {
+     message += ": Unexpected character at position $index.\n";
kevmoo 2014/06/10 21:29:08 ditto for reading like a sentence from above.
+     // Excerpt bounds [min, max) into uri; by default the whole input is
+     // shown. pre/post are ellipsis markers for a truncated excerpt.
+     int min = 0;
+     int max = uri.length;
+     String pre = "";
+     String post = "";
kevmoo 2014/06/10 21:29:08 Code comments for what you're doing here? Is this
Lasse Reichstein Nielsen 2014/06/12 08:07:31 Yes, at most 78 characters of the source will be d
+     if (uri.length > 78) {
+       // Long input: show a 72-character window that starts 10 characters
+       // before the error position (72 + two "..." markers = 78).
+       min = index - 10;
+       if (min < 0) min = 0;
+       // Assign the outer max. Declaring a new local here would shadow it
+       // and leave the excerpt running to the end of the input.
+       max = min + 72;
+       if (max > uri.length) {
+         // Window ran past the end: slide it left, keeping 72 characters.
+         max = uri.length;
+         min = max - 72;
+       }
+       if (min != 0) pre = "...";
+       if (max != uri.length) post = "...";
+     }
+     // The marker line is padded so '^' lines up with uri[index] in the
+     // excerpt, accounting for the leading ellipsis.
+     message = "$message$pre${uri.substring(min, max)}$post\n"
+         "${' ' * (pre.length + index - min)}^";
+   }
+   throw new FormatException(message);
+ }
+
+
/**
* Creates a new URI from its components.
*
@@ -934,6 +1016,22 @@ class Uri {
return ch < 128 && ((_schemeTable[ch >> 4] & (1 << (ch & 0x0f))) != 0);
}
+ // Whether [ch] is below 128 and has its bit set in _pathCharTable.
+ // The table entry is selected by ch >> 4 and the bit by ch & 0x0f.
+ static bool _isPathChar(int ch) {
+   if (ch >= 128) return false;
+   int bit = 1 << (ch & 0x0f);
+   return (_pathCharTable[ch >> 4] & bit) != 0;
+ }
+
+ // Whether [ch] is below 128 and has its bit set in _regNameTable.
+ // The table entry is selected by ch >> 4 and the bit by ch & 0x0f.
+ static bool _isRegNameChar(int ch) {
+   if (ch >= 128) return false;
+   int bit = 1 << (ch & 0x0f);
+   return (_regNameTable[ch >> 4] & bit) != 0;
+ }
+
+ // Whether [ch] may appear in user-info: any character accepted by the
+ // _regNameTable lookup, plus ':'.
+ static bool _isUserInfoChar(int ch) {
+   if (ch >= 128) return false;
+   if (ch == _COLON) return true;
+   int bit = 1 << (ch & 0x0f);
+   return (_regNameTable[ch >> 4] & bit) != 0;
+ }
+
+ // Whether [ch] is below 128 and has its bit set in _queryCharTable.
+ // The table entry is selected by ch >> 4 and the bit by ch & 0x0f.
+ static bool _isQueryChar(int ch) {
+   if (ch >= 128) return false;
+   int bit = 1 << (ch & 0x0f);
+   return (_queryCharTable[ch >> 4] & bit) != 0;
+ }
/**
* Returns whether the URI is absolute.
@@ -1551,6 +1649,25 @@ class Uri {
static const int _LOWER_CASE_F = 0x66;
static const int _LOWER_CASE_Z = 0x7A;
static const int _BAR = 0x7C;
+ // Highest code unit the parser's advance() accepts as `current`; anything
+ // above it is reported as "Unexpected character".
+ static const int _MAX_VALID_CHAR = 0x7e;
+ // End-of-input sentinel stored in `current` once the input is exhausted.
+ // It is >= 128 so the `ch < 128` guard in the _is*Char table lookups
+ // rejects it before indexing; a negative sentinel would pass that guard.
+ static const int _EOI = 0x10000; // Not a code unit.
Anders Johnsen 2014/06/11 10:00:03 -1?
Lasse Reichstein Nielsen 2014/06/12 08:07:31 Breaks the _isPathChar function (or requires an ex
+
+ // Whether [char], with bit 0x20 set, lies between _LOWER_CASE_A and
+ // _LOWER_CASE_Z inclusive. Setting that bit folds upper-case ASCII
+ // letters onto their lower-case counterparts.
+ static bool _isAlpha(int char) {
+   int folded = char | 0x20;
+   if (folded < _LOWER_CASE_A) return false;
+   return folded <= _LOWER_CASE_Z;
+   // TODO: Test:
+   // return ((char - _a) & 0xffff) <= (_z - _a);
+ }
+
+ // Whether [char] lies between _ZERO and _NINE inclusive.
+ static bool _isDigit(int char) {
+   if (char < _ZERO) return false;
+   return char <= _NINE;
+ }
+
+ // Whether [char] is a decimal digit, or, with bit 0x20 set, lies between
+ // _LOWER_CASE_A and _LOWER_CASE_F inclusive (so both cases are accepted).
+ static bool _isHexDigit(int char) {
+   if (char <= _NINE) {
+     // At or below '9': a hex digit only if it is not below '0'.
+     return char >= _ZERO;
+   }
+   int folded = char | 0x20;
+   return folded >= _LOWER_CASE_A && folded <= _LOWER_CASE_F;
+ }
/**
* This is the internal implementation of JavaScript's encodeURI function.

Powered by Google App Engine
This is Rietveld 408576698