| OLD | NEW |
| (Empty) |
| 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #include "net/tools/flip_server/url_to_filename_encoder.h" | |
| 6 | |
| 7 #include <stdlib.h> | |
| 8 | |
| 9 #include "base/logging.h" | |
| 10 #include "base/strings/string_util.h" | |
| 11 | |
| 12 using std::string; | |
| 13 | |
| 14 namespace { | |
| 15 | |
| 16 #ifdef WIN32 | |
| 17 #define strtoull _strtoui64 | |
| 18 #endif | |
| 19 | |
// Parses the hexadecimal number at the start of |str| into an unsigned
// 64-bit value.
//
// Returns the parsed value, or |deflt| when |str| is null or when no
// characters could be converted (for example an empty string or one that
// starts with a non-hex character). Parsing stops at the first character
// that cannot be part of a base-16 number, so trailing text is ignored.
uint64_t ParseLeadingHex64Value(const char* str, uint64_t deflt) {
  // strtoull must not be handed a null pointer.
  if (str == nullptr)
    return deflt;

  char* end = nullptr;
  const uint64_t value = strtoull(str, &end, 16);
  // strtoull leaves |end| equal to |str| when nothing was converted.
  return (end == str) ? deflt : value;
}
| 28 | |
| 29 } // namespace | |
| 30 | |
| 31 namespace net { | |
| 32 | |
// The escape character choice is made here -- all code and tests in this
// directory are based off of this constant. However, our testdata
// has tons of dependencies on this, so it cannot be changed without
// re-running those tests and fixing them.
const char UrlToFilenameEncoder::kEscapeChar = ',';
// Written immediately after kEscapeChar to mark the point where an over-long
// segment was split by an artificially inserted directory separator.
const char UrlToFilenameEncoder::kTruncationChar = '-';
// Longest segment, in characters, that is emitted without being chopped into
// smaller chunks.
const size_t UrlToFilenameEncoder::kMaximumSubdirectoryLength = 128;
| 40 | |
| 41 void UrlToFilenameEncoder::AppendSegment(string* segment, string* dest) { | |
| 42 CHECK(!segment->empty()); | |
| 43 if ((*segment == ".") || (*segment == "..")) { | |
| 44 dest->append(1, kEscapeChar); | |
| 45 dest->append(*segment); | |
| 46 segment->clear(); | |
| 47 } else { | |
| 48 size_t segment_size = segment->size(); | |
| 49 if (segment_size > kMaximumSubdirectoryLength) { | |
| 50 // We need to inject ",-" at the end of the segment to signify that | |
| 51 // we are inserting an artificial '/'. This means we have to chop | |
| 52 // off at least two characters to make room. | |
| 53 segment_size = kMaximumSubdirectoryLength - 2; | |
| 54 | |
| 55 // But we don't want to break up an escape sequence that happens to lie at | |
| 56 // the end. Escape sequences are at most 2 characters. | |
| 57 if ((*segment)[segment_size - 1] == kEscapeChar) { | |
| 58 segment_size -= 1; | |
| 59 } else if ((*segment)[segment_size - 2] == kEscapeChar) { | |
| 60 segment_size -= 2; | |
| 61 } | |
| 62 dest->append(segment->data(), segment_size); | |
| 63 dest->append(1, kEscapeChar); | |
| 64 dest->append(1, kTruncationChar); | |
| 65 segment->erase(0, segment_size); | |
| 66 | |
| 67 // At this point, if we had segment_size=3, and segment="abcd", | |
| 68 // then after this erase, we will have written "abc,-" and set segment="d" | |
| 69 } else { | |
| 70 dest->append(*segment); | |
| 71 segment->clear(); | |
| 72 } | |
| 73 } | |
| 74 } | |
| 75 | |
| 76 void UrlToFilenameEncoder::EncodeSegment(const string& filename_prefix, | |
| 77 const string& escaped_ending, | |
| 78 char dir_separator, | |
| 79 string* encoded_filename) { | |
| 80 string filename_ending = UrlUtilities::Unescape(escaped_ending); | |
| 81 | |
| 82 char encoded[3]; | |
| 83 int encoded_len; | |
| 84 string segment; | |
| 85 | |
| 86 // TODO(jmarantz): This code would be a bit simpler if we disallowed | |
| 87 // Instaweb allowing filename_prefix to not end in "/". We could | |
| 88 // then change the is routine to just take one input string. | |
| 89 size_t start_of_segment = filename_prefix.find_last_of(dir_separator); | |
| 90 if (start_of_segment == string::npos) { | |
| 91 segment = filename_prefix; | |
| 92 } else { | |
| 93 segment = filename_prefix.substr(start_of_segment + 1); | |
| 94 *encoded_filename = filename_prefix.substr(0, start_of_segment + 1); | |
| 95 } | |
| 96 | |
| 97 size_t index = 0; | |
| 98 // Special case the first / to avoid adding a leading kEscapeChar. | |
| 99 if (!filename_ending.empty() && (filename_ending[0] == dir_separator)) { | |
| 100 encoded_filename->append(segment); | |
| 101 segment.clear(); | |
| 102 encoded_filename->append(1, dir_separator); | |
| 103 ++index; | |
| 104 } | |
| 105 | |
| 106 for (; index < filename_ending.length(); ++index) { | |
| 107 unsigned char ch = static_cast<unsigned char>(filename_ending[index]); | |
| 108 | |
| 109 // Note: instead of outputing an empty segment, we let the second slash | |
| 110 // be escaped below. | |
| 111 if ((ch == dir_separator) && !segment.empty()) { | |
| 112 AppendSegment(&segment, encoded_filename); | |
| 113 encoded_filename->append(1, dir_separator); | |
| 114 segment.clear(); | |
| 115 } else { | |
| 116 // After removing unsafe chars the only safe ones are _.=+- and alphanums. | |
| 117 if ((ch == '_') || (ch == '.') || (ch == '=') || (ch == '+') || | |
| 118 (ch == '-') || (('0' <= ch) && (ch <= '9')) || | |
| 119 (('A' <= ch) && (ch <= 'Z')) || (('a' <= ch) && (ch <= 'z'))) { | |
| 120 encoded[0] = ch; | |
| 121 encoded_len = 1; | |
| 122 } else { | |
| 123 encoded[0] = kEscapeChar; | |
| 124 encoded[1] = ch / 16; | |
| 125 encoded[1] += (encoded[1] >= 10) ? 'A' - 10 : '0'; | |
| 126 encoded[2] = ch % 16; | |
| 127 encoded[2] += (encoded[2] >= 10) ? 'A' - 10 : '0'; | |
| 128 encoded_len = 3; | |
| 129 } | |
| 130 segment.append(encoded, encoded_len); | |
| 131 | |
| 132 // If segment is too big, we must chop it into chunks. | |
| 133 if (segment.size() > kMaximumSubdirectoryLength) { | |
| 134 AppendSegment(&segment, encoded_filename); | |
| 135 encoded_filename->append(1, dir_separator); | |
| 136 } | |
| 137 } | |
| 138 } | |
| 139 | |
| 140 // Append "," to the leaf filename so the leaf can also be a branch., e.g. | |
| 141 // allow http://a/b/c and http://a/b/c/d to co-exist as files "/a/b/c," and | |
| 142 // /a/b/c/d". So we will rename the "d" here to "d,". If doing that pushed | |
| 143 // us over the 128 char limit, then we will need to append "/" and the | |
| 144 // remaining chars. | |
| 145 segment += kEscapeChar; | |
| 146 AppendSegment(&segment, encoded_filename); | |
| 147 if (!segment.empty()) { | |
| 148 // The last overflow segment is special, because we appended in | |
| 149 // kEscapeChar above. We won't need to check it again for size | |
| 150 // or further escaping. | |
| 151 encoded_filename->append(1, dir_separator); | |
| 152 encoded_filename->append(segment); | |
| 153 } | |
| 154 } | |
| 155 | |
| 156 // Note: this decoder is not the exact inverse of the EncodeSegment above, | |
| 157 // because it does not take into account a prefix. | |
| 158 bool UrlToFilenameEncoder::Decode(const string& encoded_filename, | |
| 159 char dir_separator, | |
| 160 string* decoded_url) { | |
| 161 enum State { kStart, kEscape, kFirstDigit, kTruncate, kEscapeDot }; | |
| 162 State state = kStart; | |
| 163 char hex_buffer[3]; | |
| 164 hex_buffer[2] = '\0'; | |
| 165 for (size_t i = 0; i < encoded_filename.size(); ++i) { | |
| 166 char ch = encoded_filename[i]; | |
| 167 switch (state) { | |
| 168 case kStart: | |
| 169 if (ch == kEscapeChar) { | |
| 170 state = kEscape; | |
| 171 } else if (ch == dir_separator) { | |
| 172 decoded_url->append(1, '/'); // URLs only use '/' not '\\' | |
| 173 } else { | |
| 174 decoded_url->append(1, ch); | |
| 175 } | |
| 176 break; | |
| 177 case kEscape: | |
| 178 if (base::IsHexDigit(ch)) { | |
| 179 hex_buffer[0] = ch; | |
| 180 state = kFirstDigit; | |
| 181 } else if (ch == kTruncationChar) { | |
| 182 state = kTruncate; | |
| 183 } else if (ch == '.') { | |
| 184 decoded_url->append(1, '.'); | |
| 185 state = kEscapeDot; // Look for at most one more dot. | |
| 186 } else if (ch == dir_separator) { | |
| 187 // Consider url "//x". This was once encoded to "/,/x,". | |
| 188 // This code is what skips the first Escape. | |
| 189 decoded_url->append(1, '/'); // URLs only use '/' not '\\' | |
| 190 state = kStart; | |
| 191 } else { | |
| 192 return false; | |
| 193 } | |
| 194 break; | |
| 195 case kFirstDigit: | |
| 196 if (base::IsHexDigit(ch)) { | |
| 197 hex_buffer[1] = ch; | |
| 198 uint64_t hex_value = ParseLeadingHex64Value(hex_buffer, 0); | |
| 199 decoded_url->append(1, static_cast<char>(hex_value)); | |
| 200 state = kStart; | |
| 201 } else { | |
| 202 return false; | |
| 203 } | |
| 204 break; | |
| 205 case kTruncate: | |
| 206 if (ch == dir_separator) { | |
| 207 // Skip this separator, it was only put in to break up long | |
| 208 // path segments, but is not part of the URL. | |
| 209 state = kStart; | |
| 210 } else { | |
| 211 return false; | |
| 212 } | |
| 213 break; | |
| 214 case kEscapeDot: | |
| 215 decoded_url->append(1, ch); | |
| 216 state = kStart; | |
| 217 break; | |
| 218 } | |
| 219 } | |
| 220 | |
| 221 // All legal encoded filenames end in kEscapeChar. | |
| 222 return (state == kEscape); | |
| 223 } | |
| 224 | |
| 225 // Escape the given input |path| and chop any individual components | |
| 226 // of the path which are greater than kMaximumSubdirectoryLength characters | |
| 227 // into two chunks. | |
| 228 // | |
| 229 // This legacy version has several issues with aliasing of different URLs, | |
| 230 // inability to represent both /a/b/c and /a/b/c/d, and inability to decode | |
| 231 // the filenames back into URLs. | |
| 232 // | |
| 233 // But there is a large body of slurped data which depends on this format, | |
| 234 // so leave it as the default for spdy_in_mem_edsm_server. | |
| 235 string UrlToFilenameEncoder::LegacyEscape(const string& path) { | |
| 236 string output; | |
| 237 | |
| 238 // Note: We also chop paths into medium sized 'chunks'. | |
| 239 // This is due to the incompetence of the windows | |
| 240 // filesystem, which still hasn't figured out how | |
| 241 // to deal with long filenames. | |
| 242 int last_slash = 0; | |
| 243 for (size_t index = 0; index < path.length(); index++) { | |
| 244 char ch = path[index]; | |
| 245 if (ch == 0x5C) | |
| 246 last_slash = index; | |
| 247 if ((ch == 0x2D) || // hyphen | |
| 248 (ch == 0x5C) || (ch == 0x5F) || // backslash, underscore | |
| 249 ((0x30 <= ch) && (ch <= 0x39)) || // Digits [0-9] | |
| 250 ((0x41 <= ch) && (ch <= 0x5A)) || // Uppercase [A-Z] | |
| 251 ((0x61 <= ch) && (ch <= 0x7A))) { // Lowercase [a-z] | |
| 252 output.append(&path[index], 1); | |
| 253 } else { | |
| 254 char encoded[3]; | |
| 255 encoded[0] = 'x'; | |
| 256 encoded[1] = ch / 16; | |
| 257 encoded[1] += (encoded[1] >= 10) ? 'A' - 10 : '0'; | |
| 258 encoded[2] = ch % 16; | |
| 259 encoded[2] += (encoded[2] >= 10) ? 'A' - 10 : '0'; | |
| 260 output.append(encoded, 3); | |
| 261 } | |
| 262 if (index - last_slash > kMaximumSubdirectoryLength) { | |
| 263 #ifdef WIN32 | |
| 264 char slash = '\\'; | |
| 265 #else | |
| 266 char slash = '/'; | |
| 267 #endif | |
| 268 output.append(&slash, 1); | |
| 269 last_slash = index; | |
| 270 } | |
| 271 } | |
| 272 return output; | |
| 273 } | |
| 274 | |
| 275 } // namespace net | |
| OLD | NEW |