OLD | NEW |
| (Empty) |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include "chrome/browser/search_engines/template_url_parser.h" | |
6 | |
#include <algorithm>
#include <limits>
#include <map>
#include <vector>
10 | |
11 #include "base/logging.h" | |
12 #include "base/memory/scoped_ptr.h" | |
13 #include "base/strings/string_number_conversions.h" | |
14 #include "base/strings/string_util.h" | |
15 #include "base/strings/utf_string_conversions.h" | |
16 #include "components/search_engines/template_url.h" | |
17 #include "libxml/parser.h" | |
18 #include "libxml/xmlwriter.h" | |
19 #include "ui/gfx/favicon_size.h" | |
20 #include "url/gurl.h" | |
21 #include "url/url_constants.h" | |
22 | |
23 namespace { | |
24 | |
25 // NOTE: libxml uses the UTF-8 encoding. As 0-127 of UTF-8 corresponds | |
26 // to that of char, the following names are all in terms of char. This avoids | |
27 // having to convert to wide, then do comparisons. | |
28 | |
29 // Element names of the OpenSearch description (OSD) document. They are the | |
// keys registered by TemplateURLParsingContext::InitMapping() and are compared
// against element names after any "prefix:" namespace part has been removed.
30 const char kURLElement[] = "Url"; | |
31 const char kParamElement[] = "Param"; | |
32 const char kShortNameElement[] = "ShortName"; | |
33 const char kImageElement[] = "Image"; | |
34 const char kOpenSearchDescriptionElement[] = "OpenSearchDescription"; | |
// Alternative root element name; InitMapping() maps it to the same element
// type as kOpenSearchDescriptionElement.
35 const char kFirefoxSearchDescriptionElement[] = "SearchPlugin"; | |
36 const char kInputEncodingElement[] = "InputEncoding"; | |
37 | |
38 // XML attribute names read from the Url, Image and Param elements. | |
39 const char kURLTypeAttribute[] = "type"; | |
40 const char kURLTemplateAttribute[] = "template"; | |
41 const char kImageTypeAttribute[] = "type"; | |
42 const char kImageWidthAttribute[] = "width"; | |
43 const char kImageHeightAttribute[] = "height"; | |
44 const char kParamNameAttribute[] = "name"; | |
45 const char kParamValueAttribute[] = "value"; | |
// NOTE(review): despite the "Param" in its name, this attribute is only looked
// up on the Url element (see ParseURL()), where it selects GET vs. POST.
46 const char kParamMethodAttribute[] = "method"; | |
47 | |
48 // Mime type of a Url element that describes the search results page. | |
49 const char kHTMLType[] = "text/html"; | |
50 | |
51 // Mime type of a Url element that describes as-you-type suggestions. | |
52 const char kSuggestionType[] = "application/x-suggestions+json"; | |
53 | |
54 std::string XMLCharToString(const xmlChar* value) { | |
55 return std::string(reinterpret_cast<const char*>(value)); | |
56 } | |
57 | |
58 // Returns true if input_encoding contains a valid input encoding string. This | |
59 // doesn't verify that we have a valid encoding for the string, just that the | |
60 // string contains characters that constitute a valid input encoding. | |
61 bool IsValidEncodingString(const std::string& input_encoding) { | |
62 if (input_encoding.empty()) | |
63 return false; | |
64 | |
65 if (!IsAsciiAlpha(input_encoding[0])) | |
66 return false; | |
67 | |
68 for (size_t i = 1, max = input_encoding.size(); i < max; ++i) { | |
69 char c = input_encoding[i]; | |
70 if (!IsAsciiAlpha(c) && !IsAsciiDigit(c) && c != '.' && c != '_' && | |
71 c != '-') { | |
72 return false; | |
73 } | |
74 } | |
75 return true; | |
76 } | |
77 | |
// Appends one parameter to |*query|. A '&' separator is written first when
// |*query| already has content. The parameter is written as "key=value", or as
// just "value" when |key| is empty. Nothing is escaped.
void AppendParamToQuery(const std::string& key,
                        const std::string& value,
                        std::string* query) {
  std::string& out = *query;
  if (!out.empty())
    out += "&";
  if (!key.empty()) {
    out += key;
    out += "=";
  }
  out += value;
}
89 | |
90 // Returns true if |url| is empty or is a valid URL with a scheme of HTTP[S]. | |
91 bool IsHTTPRef(const std::string& url) { | |
92 if (url.empty()) | |
93 return true; | |
94 GURL gurl(url); | |
95 return gurl.is_valid() && (gurl.SchemeIs(url::kHttpScheme) || | |
96 gurl.SchemeIs(url::kHttpsScheme)); | |
97 } | |
98 | |
99 } // namespace | |
100 | |
101 | |
102 // TemplateURLParsingContext -------------------------------------------------- | |
103 | |
104 // To minimize memory overhead while parsing, a SAX style parser is used. | |
105 // TemplateURLParsingContext is used to maintain the state we're in the document | |
106 // while parsing. | |
107 class TemplateURLParsingContext { | |
108 public: | |
109 // Enum of the known element types. | |
110 enum ElementType { | |
111 UNKNOWN, | |
112 OPEN_SEARCH_DESCRIPTION, | |
113 URL, | |
114 PARAM, | |
115 SHORT_NAME, | |
116 IMAGE, | |
117 INPUT_ENCODING, | |
118 }; | |
119 | |
120 enum Method { | |
121 GET, | |
122 POST | |
123 }; | |
124 | |
125 // Key/value of a Param node. | |
126 typedef std::pair<std::string, std::string> Param; | |
127 | |
128 explicit TemplateURLParsingContext( | |
129 TemplateURLParser::ParameterFilter* parameter_filter); | |
130 | |
131 static void StartElementImpl(void* ctx, | |
132 const xmlChar* name, | |
133 const xmlChar** atts); | |
134 static void EndElementImpl(void* ctx, const xmlChar* name); | |
135 static void CharactersImpl(void* ctx, const xmlChar* ch, int len); | |
136 | |
137 // Returns a heap-allocated TemplateURL representing the result of parsing. | |
138 // This will be NULL if parsing failed or if the results were invalid for some | |
139 // reason (e.g. the resulting URL was not HTTP[S], a name wasn't supplied, | |
140 // a resulting TemplateURLRef was invalid, etc.). | |
141 TemplateURL* GetTemplateURL(const SearchTermsData& search_terms_data, | |
142 bool show_in_default_list); | |
143 | |
144 private: | |
145 // Key is UTF8 encoded. | |
146 typedef std::map<std::string, ElementType> ElementNameToElementTypeMap; | |
147 | |
148 static void InitMapping(); | |
149 | |
150 void ParseURL(const xmlChar** atts); | |
151 void ParseImage(const xmlChar** atts); | |
152 void ParseParam(const xmlChar** atts); | |
153 void ProcessURLParams(); | |
154 | |
155 // Returns the current ElementType. | |
156 ElementType GetKnownType(); | |
157 | |
158 static ElementNameToElementTypeMap* kElementNameToElementTypeMap; | |
159 | |
160 // Data that gets updated as we parse, and is converted to a TemplateURL by | |
161 // GetTemplateURL(). | |
162 TemplateURLData data_; | |
163 | |
164 std::vector<ElementType> elements_; | |
165 bool image_is_valid_for_favicon_; | |
166 | |
167 // Character content for the current element. | |
168 base::string16 string_; | |
169 | |
170 TemplateURLParser::ParameterFilter* parameter_filter_; | |
171 | |
172 // The list of parameters parsed in the Param nodes of a Url node. | |
173 std::vector<Param> extra_params_; | |
174 | |
175 // The HTTP methods used. | |
176 Method method_; | |
177 Method suggestion_method_; | |
178 | |
179 // If true, we are currently parsing a suggest URL, otherwise it is an HTML | |
180 // search. Note that we don't need a stack as URL nodes cannot be nested. | |
181 bool is_suggest_url_; | |
182 | |
183 // Whether we should derive the image from the URL (when images are data | |
184 // URLs). | |
185 bool derive_image_from_url_; | |
186 | |
187 DISALLOW_COPY_AND_ASSIGN(TemplateURLParsingContext); | |
188 }; | |
189 | |
190 // static | |
191 TemplateURLParsingContext::ElementNameToElementTypeMap* | |
192 TemplateURLParsingContext::kElementNameToElementTypeMap = NULL; | |
193 | |
194 TemplateURLParsingContext::TemplateURLParsingContext( | |
195 TemplateURLParser::ParameterFilter* parameter_filter) | |
196 : image_is_valid_for_favicon_(false), | |
197 parameter_filter_(parameter_filter), | |
198 method_(GET), | |
199 suggestion_method_(GET), | |
200 is_suggest_url_(false), | |
201 derive_image_from_url_(false) { | |
202 if (kElementNameToElementTypeMap == NULL) | |
203 InitMapping(); | |
204 } | |
205 | |
206 // static | |
207 void TemplateURLParsingContext::StartElementImpl(void* ctx, | |
208 const xmlChar* name, | |
209 const xmlChar** atts) { | |
210 // Remove the namespace from |name|, ex: os:Url -> Url. | |
211 std::string node_name(XMLCharToString(name)); | |
212 size_t index = node_name.find_first_of(":"); | |
213 if (index != std::string::npos) | |
214 node_name.erase(0, index + 1); | |
215 | |
216 TemplateURLParsingContext* context = | |
217 reinterpret_cast<TemplateURLParsingContext*>(ctx); | |
218 context->elements_.push_back( | |
219 context->kElementNameToElementTypeMap->count(node_name) ? | |
220 (*context->kElementNameToElementTypeMap)[node_name] : UNKNOWN); | |
221 switch (context->GetKnownType()) { | |
222 case TemplateURLParsingContext::URL: | |
223 context->extra_params_.clear(); | |
224 context->ParseURL(atts); | |
225 break; | |
226 case TemplateURLParsingContext::IMAGE: | |
227 context->ParseImage(atts); | |
228 break; | |
229 case TemplateURLParsingContext::PARAM: | |
230 context->ParseParam(atts); | |
231 break; | |
232 default: | |
233 break; | |
234 } | |
235 context->string_.clear(); | |
236 } | |
237 | |
238 // static | |
239 void TemplateURLParsingContext::EndElementImpl(void* ctx, const xmlChar* name) { | |
240 TemplateURLParsingContext* context = | |
241 reinterpret_cast<TemplateURLParsingContext*>(ctx); | |
242 switch (context->GetKnownType()) { | |
243 case TemplateURLParsingContext::SHORT_NAME: | |
244 context->data_.short_name = context->string_; | |
245 break; | |
246 case TemplateURLParsingContext::IMAGE: { | |
247 GURL image_url(base::UTF16ToUTF8(context->string_)); | |
248 if (image_url.SchemeIs(url::kDataScheme)) { | |
249 // TODO (jcampan): bug 1169256: when dealing with data URL, we need to | |
250 // decode the data URL in the renderer. For now, we'll just point to the | |
251 // favicon from the URL. | |
252 context->derive_image_from_url_ = true; | |
253 } else if (context->image_is_valid_for_favicon_ && image_url.is_valid() && | |
254 (image_url.SchemeIs(url::kHttpScheme) || | |
255 image_url.SchemeIs(url::kHttpsScheme))) { | |
256 context->data_.favicon_url = image_url; | |
257 } | |
258 context->image_is_valid_for_favicon_ = false; | |
259 break; | |
260 } | |
261 case TemplateURLParsingContext::INPUT_ENCODING: { | |
262 std::string input_encoding = base::UTF16ToASCII(context->string_); | |
263 if (IsValidEncodingString(input_encoding)) | |
264 context->data_.input_encodings.push_back(input_encoding); | |
265 break; | |
266 } | |
267 case TemplateURLParsingContext::URL: | |
268 context->ProcessURLParams(); | |
269 break; | |
270 default: | |
271 break; | |
272 } | |
273 context->string_.clear(); | |
274 context->elements_.pop_back(); | |
275 } | |
276 | |
277 // static | |
278 void TemplateURLParsingContext::CharactersImpl(void* ctx, | |
279 const xmlChar* ch, | |
280 int len) { | |
281 reinterpret_cast<TemplateURLParsingContext*>(ctx)->string_ += | |
282 base::UTF8ToUTF16(std::string(reinterpret_cast<const char*>(ch), len)); | |
283 } | |
284 | |
285 TemplateURL* TemplateURLParsingContext::GetTemplateURL( | |
286 const SearchTermsData& search_terms_data, | |
287 bool show_in_default_list) { | |
288 // TODO(jcampan): Support engines that use POST; see http://crbug.com/18107 | |
289 if (method_ == TemplateURLParsingContext::POST || data_.short_name.empty() || | |
290 !IsHTTPRef(data_.url()) || !IsHTTPRef(data_.suggestions_url)) | |
291 return NULL; | |
292 if (suggestion_method_ == TemplateURLParsingContext::POST) | |
293 data_.suggestions_url.clear(); | |
294 | |
295 // If the image was a data URL, use the favicon from the search URL instead. | |
296 // (see the TODO in EndElementImpl()). | |
297 GURL search_url(data_.url()); | |
298 if (derive_image_from_url_ && data_.favicon_url.is_empty()) | |
299 data_.favicon_url = TemplateURL::GenerateFaviconURL(search_url); | |
300 | |
301 data_.SetKeyword(TemplateURL::GenerateKeyword(search_url)); | |
302 data_.show_in_default_list = show_in_default_list; | |
303 | |
304 // Bail if the search URL is empty or if either TemplateURLRef is invalid. | |
305 scoped_ptr<TemplateURL> template_url(new TemplateURL(data_)); | |
306 if (template_url->url().empty() || | |
307 !template_url->url_ref().IsValid(search_terms_data) || | |
308 (!template_url->suggestions_url().empty() && | |
309 !template_url->suggestions_url_ref().IsValid(search_terms_data))) { | |
310 return NULL; | |
311 } | |
312 | |
313 return template_url.release(); | |
314 } | |
315 | |
316 // static | |
317 void TemplateURLParsingContext::InitMapping() { | |
318 kElementNameToElementTypeMap = new std::map<std::string, ElementType>; | |
319 (*kElementNameToElementTypeMap)[kURLElement] = URL; | |
320 (*kElementNameToElementTypeMap)[kParamElement] = PARAM; | |
321 (*kElementNameToElementTypeMap)[kShortNameElement] = SHORT_NAME; | |
322 (*kElementNameToElementTypeMap)[kImageElement] = IMAGE; | |
323 (*kElementNameToElementTypeMap)[kOpenSearchDescriptionElement] = | |
324 OPEN_SEARCH_DESCRIPTION; | |
325 (*kElementNameToElementTypeMap)[kFirefoxSearchDescriptionElement] = | |
326 OPEN_SEARCH_DESCRIPTION; | |
327 (*kElementNameToElementTypeMap)[kInputEncodingElement] = INPUT_ENCODING; | |
328 } | |
329 | |
330 void TemplateURLParsingContext::ParseURL(const xmlChar** atts) { | |
331 if (!atts) | |
332 return; | |
333 | |
334 std::string template_url; | |
335 bool is_post = false; | |
336 bool is_html_url = false; | |
337 bool is_suggest_url = false; | |
338 for (; *atts; atts += 2) { | |
339 std::string name(XMLCharToString(*atts)); | |
340 const xmlChar* value = atts[1]; | |
341 if (name == kURLTypeAttribute) { | |
342 std::string type = XMLCharToString(value); | |
343 is_html_url = (type == kHTMLType); | |
344 is_suggest_url = (type == kSuggestionType); | |
345 } else if (name == kURLTemplateAttribute) { | |
346 template_url = XMLCharToString(value); | |
347 } else if (name == kParamMethodAttribute) { | |
348 is_post = LowerCaseEqualsASCII(XMLCharToString(value), "post"); | |
349 } | |
350 } | |
351 | |
352 if (is_html_url && !template_url.empty()) { | |
353 data_.SetURL(template_url); | |
354 is_suggest_url_ = false; | |
355 if (is_post) | |
356 method_ = POST; | |
357 } else if (is_suggest_url) { | |
358 data_.suggestions_url = template_url; | |
359 is_suggest_url_ = true; | |
360 if (is_post) | |
361 suggestion_method_ = POST; | |
362 } | |
363 } | |
364 | |
365 void TemplateURLParsingContext::ParseImage(const xmlChar** atts) { | |
366 if (!atts) | |
367 return; | |
368 | |
369 int width = 0; | |
370 int height = 0; | |
371 std::string type; | |
372 for (; *atts; atts += 2) { | |
373 std::string name(XMLCharToString(*atts)); | |
374 const xmlChar* value = atts[1]; | |
375 if (name == kImageTypeAttribute) { | |
376 type = XMLCharToString(value); | |
377 } else if (name == kImageWidthAttribute) { | |
378 base::StringToInt(XMLCharToString(value), &width); | |
379 } else if (name == kImageHeightAttribute) { | |
380 base::StringToInt(XMLCharToString(value), &height); | |
381 } | |
382 } | |
383 | |
384 image_is_valid_for_favicon_ = (width == gfx::kFaviconSize) && | |
385 (height == gfx::kFaviconSize) && | |
386 ((type == "image/x-icon") || (type == "image/vnd.microsoft.icon")); | |
387 } | |
388 | |
389 void TemplateURLParsingContext::ParseParam(const xmlChar** atts) { | |
390 if (!atts) | |
391 return; | |
392 | |
393 std::string key, value; | |
394 for (; *atts; atts += 2) { | |
395 std::string name(XMLCharToString(*atts)); | |
396 const xmlChar* val = atts[1]; | |
397 if (name == kParamNameAttribute) { | |
398 key = XMLCharToString(val); | |
399 } else if (name == kParamValueAttribute) { | |
400 value = XMLCharToString(val); | |
401 } | |
402 } | |
403 | |
404 if (!key.empty() && | |
405 (!parameter_filter_ || parameter_filter_->KeepParameter(key, value))) | |
406 extra_params_.push_back(Param(key, value)); | |
407 } | |
408 | |
409 void TemplateURLParsingContext::ProcessURLParams() { | |
410 if (!parameter_filter_ && extra_params_.empty()) | |
411 return; | |
412 | |
413 GURL url(is_suggest_url_ ? data_.suggestions_url : data_.url()); | |
414 if (url.is_empty()) | |
415 return; | |
416 | |
417 // If there is a parameter filter, parse the existing URL and remove any | |
418 // unwanted parameter. | |
419 std::string new_query; | |
420 bool modified = false; | |
421 if (parameter_filter_) { | |
422 url::Component query = url.parsed_for_possibly_invalid_spec().query; | |
423 url::Component key, value; | |
424 const char* url_spec = url.spec().c_str(); | |
425 while (url::ExtractQueryKeyValue(url_spec, &query, &key, &value)) { | |
426 std::string key_str(url_spec, key.begin, key.len); | |
427 std::string value_str(url_spec, value.begin, value.len); | |
428 if (parameter_filter_->KeepParameter(key_str, value_str)) { | |
429 AppendParamToQuery(key_str, value_str, &new_query); | |
430 } else { | |
431 modified = true; | |
432 } | |
433 } | |
434 } | |
435 if (!modified) | |
436 new_query = url.query(); | |
437 | |
438 // Add the extra parameters if any. | |
439 if (!extra_params_.empty()) { | |
440 modified = true; | |
441 for (std::vector<Param>::const_iterator iter(extra_params_.begin()); | |
442 iter != extra_params_.end(); ++iter) | |
443 AppendParamToQuery(iter->first, iter->second, &new_query); | |
444 } | |
445 | |
446 if (modified) { | |
447 GURL::Replacements repl; | |
448 repl.SetQueryStr(new_query); | |
449 url = url.ReplaceComponents(repl); | |
450 if (is_suggest_url_) | |
451 data_.suggestions_url = url.spec(); | |
452 else if (url.is_valid()) | |
453 data_.SetURL(url.spec()); | |
454 } | |
455 } | |
456 | |
457 TemplateURLParsingContext::ElementType | |
458 TemplateURLParsingContext::GetKnownType() { | |
459 if (elements_.size() == 2 && elements_[0] == OPEN_SEARCH_DESCRIPTION) | |
460 return elements_[1]; | |
461 // We only expect PARAM nodes under the URL node. | |
462 return (elements_.size() == 3 && elements_[0] == OPEN_SEARCH_DESCRIPTION && | |
463 elements_[1] == URL && elements_[2] == PARAM) ? PARAM : UNKNOWN; | |
464 } | |
465 | |
466 | |
467 // TemplateURLParser ---------------------------------------------------------- | |
468 | |
469 // static | |
470 TemplateURL* TemplateURLParser::Parse( | |
471 const SearchTermsData& search_terms_data, | |
472 bool show_in_default_list, | |
473 const char* data, | |
474 size_t length, | |
475 TemplateURLParser::ParameterFilter* param_filter) { | |
476 // xmlSubstituteEntitiesDefault(1) makes it so that & isn't mapped to | |
477 // & . Unfortunately xmlSubstituteEntitiesDefault affects global state. | |
478 // If this becomes problematic we'll need to provide our own entity | |
479 // type for &, or strip out & by hand after parsing. | |
480 int last_sub_entities_value = xmlSubstituteEntitiesDefault(1); | |
481 TemplateURLParsingContext context(param_filter); | |
482 xmlSAXHandler sax_handler; | |
483 memset(&sax_handler, 0, sizeof(sax_handler)); | |
484 sax_handler.startElement = &TemplateURLParsingContext::StartElementImpl; | |
485 sax_handler.endElement = &TemplateURLParsingContext::EndElementImpl; | |
486 sax_handler.characters = &TemplateURLParsingContext::CharactersImpl; | |
487 int error = xmlSAXUserParseMemory(&sax_handler, &context, data, | |
488 static_cast<int>(length)); | |
489 xmlSubstituteEntitiesDefault(last_sub_entities_value); | |
490 | |
491 return error ? | |
492 NULL : context.GetTemplateURL(search_terms_data, show_in_default_list); | |
493 } | |
OLD | NEW |