OLD | NEW |
| (Empty) |
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include <stdlib.h> | |
6 | |
7 #include "base/logging.h" | |
8 #include "base/strings/string_util.h" | |
9 #include "net/base/net_util.h" | |
10 #include "net/tools/dump_cache/url_to_filename_encoder.h" | |
11 | |
12 using std::string; | |
13 | |
14 namespace { | |
15 | |
16 // Returns 1 if buf is prefixed by "num_digits" of hex digits | |
17 // Teturns 0 otherwise. | |
18 // The function checks for '\0' for string termination. | |
19 int HexDigitsPrefix(const char* buf, int num_digits) { | |
20 for (int i = 0; i < num_digits; i++) { | |
21 if (!IsHexDigit(buf[i])) | |
22 return 0; // This also detects end of string as '\0' is not xdigit. | |
23 } | |
24 return 1; | |
25 } | |
26 | |
27 #ifdef WIN32 | |
28 #define strtoull _strtoui64 | |
29 #endif | |
30 | |
31 // A simple parser for long long values. Returns the parsed value if a | |
32 // valid integer is found; else returns deflt | |
33 // UInt64 and Int64 cannot handle decimal numbers with leading 0s. | |
34 uint64 ParseLeadingHex64Value(const char *str, uint64 deflt) { | |
35 char *error = NULL; | |
36 const uint64 value = strtoull(str, &error, 16); | |
37 return (error == str) ? deflt : value; | |
38 } | |
39 | |
40 } | |
41 | |
42 namespace net { | |
43 | |
44 // The escape character choice is made here -- all code and tests in this | |
45 // directory are based on this constant. However, our testdata | |
46 // has tons of dependencies on this, so it cannot be changed without | |
47 // re-running those tests and fixing them. | |
48 const char UrlToFilenameEncoder::kEscapeChar = ','; | |
// Written immediately after kEscapeChar when AppendSegment splits an
// over-long segment, to signify that an artificial separator is inserted.
49 const char UrlToFilenameEncoder::kTruncationChar = '-'; | |
// Segments longer than this many characters are split by AppendSegment.
50 const size_t UrlToFilenameEncoder::kMaximumSubdirectoryLength = 128; | |
51 | |
52 void UrlToFilenameEncoder::AppendSegment(string* segment, string* dest) { | |
53 CHECK(!segment->empty()); | |
54 if ((*segment == ".") || (*segment == "..")) { | |
55 dest->append(1, kEscapeChar); | |
56 dest->append(*segment); | |
57 segment->clear(); | |
58 } else { | |
59 size_t segment_size = segment->size(); | |
60 if (segment_size > kMaximumSubdirectoryLength) { | |
61 // We need to inject ",-" at the end of the segment to signify that | |
62 // we are inserting an artificial '/'. This means we have to chop | |
63 // off at least two characters to make room. | |
64 segment_size = kMaximumSubdirectoryLength - 2; | |
65 | |
66 // But we don't want to break up an escape sequence that happens to lie at | |
67 // the end. Escape sequences are at most 2 characters. | |
68 if ((*segment)[segment_size - 1] == kEscapeChar) { | |
69 segment_size -= 1; | |
70 } else if ((*segment)[segment_size - 2] == kEscapeChar) { | |
71 segment_size -= 2; | |
72 } | |
73 dest->append(segment->data(), segment_size); | |
74 dest->append(1, kEscapeChar); | |
75 dest->append(1, kTruncationChar); | |
76 segment->erase(0, segment_size); | |
77 | |
78 // At this point, if we had segment_size=3, and segment="abcd", | |
79 // then after this erase, we will have written "abc,-" and set segment="d" | |
80 } else { | |
81 dest->append(*segment); | |
82 segment->clear(); | |
83 } | |
84 } | |
85 } | |
86 | |
87 void UrlToFilenameEncoder::EncodeSegment(const string& filename_prefix, | |
88 const string& escaped_ending, | |
89 char dir_separator, | |
90 string* encoded_filename) { | |
91 string filename_ending = UrlUtilities::Unescape(escaped_ending); | |
92 | |
93 char encoded[3]; | |
94 int encoded_len; | |
95 string segment; | |
96 | |
97 // TODO(jmarantz): This code would be a bit simpler if we disallowed | |
98 // Instaweb allowing filename_prefix to not end in "/". We could | |
99 // then change the is routine to just take one input string. | |
100 size_t start_of_segment = filename_prefix.find_last_of(dir_separator); | |
101 if (start_of_segment == string::npos) { | |
102 segment = filename_prefix; | |
103 } else { | |
104 segment = filename_prefix.substr(start_of_segment + 1); | |
105 *encoded_filename = filename_prefix.substr(0, start_of_segment + 1); | |
106 } | |
107 | |
108 size_t index = 0; | |
109 // Special case the first / to avoid adding a leading kEscapeChar. | |
110 if (!filename_ending.empty() && (filename_ending[0] == dir_separator)) { | |
111 encoded_filename->append(segment); | |
112 segment.clear(); | |
113 encoded_filename->append(1, dir_separator); | |
114 ++index; | |
115 } | |
116 | |
117 for (; index < filename_ending.length(); ++index) { | |
118 unsigned char ch = static_cast<unsigned char>(filename_ending[index]); | |
119 | |
120 // Note: instead of outputing an empty segment, we let the second slash | |
121 // be escaped below. | |
122 if ((ch == dir_separator) && !segment.empty()) { | |
123 AppendSegment(&segment, encoded_filename); | |
124 encoded_filename->append(1, dir_separator); | |
125 segment.clear(); | |
126 } else { | |
127 // After removing unsafe chars the only safe ones are _.=+- and alphanums. | |
128 if ((ch == '_') || (ch == '.') || (ch == '=') || (ch == '+') || | |
129 (ch == '-') || (('0' <= ch) && (ch <= '9')) || | |
130 (('A' <= ch) && (ch <= 'Z')) || (('a' <= ch) && (ch <= 'z'))) { | |
131 encoded[0] = ch; | |
132 encoded_len = 1; | |
133 } else { | |
134 encoded[0] = kEscapeChar; | |
135 encoded[1] = ch / 16; | |
136 encoded[1] += (encoded[1] >= 10) ? 'A' - 10 : '0'; | |
137 encoded[2] = ch % 16; | |
138 encoded[2] += (encoded[2] >= 10) ? 'A' - 10 : '0'; | |
139 encoded_len = 3; | |
140 } | |
141 segment.append(encoded, encoded_len); | |
142 | |
143 // If segment is too big, we must chop it into chunks. | |
144 if (segment.size() > kMaximumSubdirectoryLength) { | |
145 AppendSegment(&segment, encoded_filename); | |
146 encoded_filename->append(1, dir_separator); | |
147 } | |
148 } | |
149 } | |
150 | |
151 // Append "," to the leaf filename so the leaf can also be a branch., e.g. | |
152 // allow http://a/b/c and http://a/b/c/d to co-exist as files "/a/b/c," and | |
153 // /a/b/c/d". So we will rename the "d" here to "d,". If doing that pushed | |
154 // us over the 128 char limit, then we will need to append "/" and the | |
155 // remaining chars. | |
156 segment += kEscapeChar; | |
157 AppendSegment(&segment, encoded_filename); | |
158 if (!segment.empty()) { | |
159 // The last overflow segment is special, because we appended in | |
160 // kEscapeChar above. We won't need to check it again for size | |
161 // or further escaping. | |
162 encoded_filename->append(1, dir_separator); | |
163 encoded_filename->append(segment); | |
164 } | |
165 } | |
166 | |
167 // Note: this decoder is not the exact inverse of the EncodeSegment above, | |
168 // because it does not take into account a prefix. | |
169 bool UrlToFilenameEncoder::Decode(const string& encoded_filename, | |
170 char dir_separator, | |
171 string* decoded_url) { | |
172 enum State { | |
173 kStart, | |
174 kEscape, | |
175 kFirstDigit, | |
176 kTruncate, | |
177 kEscapeDot | |
178 }; | |
179 State state = kStart; | |
180 char hex_buffer[3]; | |
181 hex_buffer[2] = '\0'; | |
182 for (size_t i = 0; i < encoded_filename.size(); ++i) { | |
183 char ch = encoded_filename[i]; | |
184 switch (state) { | |
185 case kStart: | |
186 if (ch == kEscapeChar) { | |
187 state = kEscape; | |
188 } else if (ch == dir_separator) { | |
189 decoded_url->append(1, '/'); // URLs only use '/' not '\\' | |
190 } else { | |
191 decoded_url->append(1, ch); | |
192 } | |
193 break; | |
194 case kEscape: | |
195 if (HexDigitsPrefix(&ch, 1) == 1) { | |
196 hex_buffer[0] = ch; | |
197 state = kFirstDigit; | |
198 } else if (ch == kTruncationChar) { | |
199 state = kTruncate; | |
200 } else if (ch == '.') { | |
201 decoded_url->append(1, '.'); | |
202 state = kEscapeDot; // Look for at most one more dot. | |
203 } else if (ch == dir_separator) { | |
204 // Consider url "//x". This was once encoded to "/,/x,". | |
205 // This code is what skips the first Escape. | |
206 decoded_url->append(1, '/'); // URLs only use '/' not '\\' | |
207 state = kStart; | |
208 } else { | |
209 return false; | |
210 } | |
211 break; | |
212 case kFirstDigit: | |
213 if (HexDigitsPrefix(&ch, 1) == 1) { | |
214 hex_buffer[1] = ch; | |
215 uint64 hex_value = ParseLeadingHex64Value(hex_buffer, 0); | |
216 decoded_url->append(1, static_cast<char>(hex_value)); | |
217 state = kStart; | |
218 } else { | |
219 return false; | |
220 } | |
221 break; | |
222 case kTruncate: | |
223 if (ch == dir_separator) { | |
224 // Skip this separator, it was only put in to break up long | |
225 // path segments, but is not part of the URL. | |
226 state = kStart; | |
227 } else { | |
228 return false; | |
229 } | |
230 break; | |
231 case kEscapeDot: | |
232 decoded_url->append(1, ch); | |
233 state = kStart; | |
234 break; | |
235 } | |
236 } | |
237 | |
238 // All legal encoded filenames end in kEscapeChar. | |
239 return (state == kEscape); | |
240 } | |
241 | |
242 // Escape the given input |path| and chop any individual components | |
243 // of the path which are greater than kMaximumSubdirectoryLength characters | |
244 // into two chunks. | |
245 // | |
246 // This legacy version has several issues with aliasing of different URLs, | |
247 // inability to represent both /a/b/c and /a/b/c/d, and inability to decode | |
248 // the filenames back into URLs. | |
249 // | |
250 // But there is a large body of slurped data which depends on this format, | |
251 // so leave it as the default for spdy_in_mem_edsm_server. | |
252 string UrlToFilenameEncoder::LegacyEscape(const string& path) { | |
253 string output; | |
254 | |
255 // Note: We also chop paths into medium sized 'chunks'. | |
256 // This is due to the incompetence of the windows | |
257 // filesystem, which still hasn't figured out how | |
258 // to deal with long filenames. | |
259 int last_slash = 0; | |
260 for (size_t index = 0; index < path.length(); index++) { | |
261 char ch = path[index]; | |
262 if (ch == 0x5C) | |
263 last_slash = index; | |
264 if ((ch == 0x2D) || // hyphen | |
265 (ch == 0x5C) || (ch == 0x5F) || // backslash, underscore | |
266 ((0x30 <= ch) && (ch <= 0x39)) || // Digits [0-9] | |
267 ((0x41 <= ch) && (ch <= 0x5A)) || // Uppercase [A-Z] | |
268 ((0x61 <= ch) && (ch <= 0x7A))) { // Lowercase [a-z] | |
269 output.append(&path[index], 1); | |
270 } else { | |
271 char encoded[3]; | |
272 encoded[0] = 'x'; | |
273 encoded[1] = ch / 16; | |
274 encoded[1] += (encoded[1] >= 10) ? 'A' - 10 : '0'; | |
275 encoded[2] = ch % 16; | |
276 encoded[2] += (encoded[2] >= 10) ? 'A' - 10 : '0'; | |
277 output.append(encoded, 3); | |
278 } | |
279 if (index - last_slash > kMaximumSubdirectoryLength) { | |
280 #ifdef WIN32 | |
281 char slash = '\\'; | |
282 #else | |
283 char slash = '/'; | |
284 #endif | |
285 output.append(&slash, 1); | |
286 last_slash = index; | |
287 } | |
288 } | |
289 return output; | |
290 } | |
291 | |
292 } // namespace net | |
OLD | NEW |