| OLD | NEW |
| (Empty) |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #include "chrome/browser/search_engines/template_url_parser.h" | |
| 6 | |
| 7 #include <algorithm> | |
| 8 #include <map> | |
| 9 #include <vector> | |
| 10 | |
| 11 #include "base/logging.h" | |
| 12 #include "base/memory/scoped_ptr.h" | |
| 13 #include "base/strings/string_number_conversions.h" | |
| 14 #include "base/strings/string_util.h" | |
| 15 #include "base/strings/utf_string_conversions.h" | |
| 16 #include "components/search_engines/template_url.h" | |
| 17 #include "libxml/parser.h" | |
| 18 #include "libxml/xmlwriter.h" | |
| 19 #include "ui/gfx/favicon_size.h" | |
| 20 #include "url/gurl.h" | |
| 21 #include "url/url_constants.h" | |
| 22 | |
| 23 namespace { | |
| 24 | |
// NOTE: libxml uses the UTF-8 encoding. As 0-127 of UTF-8 corresponds
// to that of char, the following names are all in terms of char. This avoids
// having to convert to wide, then do comparisons.

// Defines for element names of the OSD document. These are compared against
// element names after any namespace prefix has been stripped (see
// StartElementImpl()).
const char kURLElement[] = "Url";
const char kParamElement[] = "Param";
const char kShortNameElement[] = "ShortName";
const char kImageElement[] = "Image";
const char kOpenSearchDescriptionElement[] = "OpenSearchDescription";
// Alternative root element name; InitMapping() maps it to the same element
// type as kOpenSearchDescriptionElement.
const char kFirefoxSearchDescriptionElement[] = "SearchPlugin";
const char kInputEncodingElement[] = "InputEncoding";

// Various XML attributes used.
const char kURLTypeAttribute[] = "type";
const char kURLTemplateAttribute[] = "template";
const char kImageTypeAttribute[] = "type";
const char kImageWidthAttribute[] = "width";
const char kImageHeightAttribute[] = "height";
const char kParamNameAttribute[] = "name";
const char kParamValueAttribute[] = "value";
// Despite the "Param" in its name, this attribute is read from the Url element
// by ParseURL().
const char kParamMethodAttribute[] = "method";

// Mime type for search results.
const char kHTMLType[] = "text/html";

// Mime type for as you type suggestions.
const char kSuggestionType[] = "application/x-suggestions+json";
| 53 | |
| 54 std::string XMLCharToString(const xmlChar* value) { | |
| 55 return std::string(reinterpret_cast<const char*>(value)); | |
| 56 } | |
| 57 | |
| 58 // Returns true if input_encoding contains a valid input encoding string. This | |
| 59 // doesn't verify that we have a valid encoding for the string, just that the | |
| 60 // string contains characters that constitute a valid input encoding. | |
| 61 bool IsValidEncodingString(const std::string& input_encoding) { | |
| 62 if (input_encoding.empty()) | |
| 63 return false; | |
| 64 | |
| 65 if (!IsAsciiAlpha(input_encoding[0])) | |
| 66 return false; | |
| 67 | |
| 68 for (size_t i = 1, max = input_encoding.size(); i < max; ++i) { | |
| 69 char c = input_encoding[i]; | |
| 70 if (!IsAsciiAlpha(c) && !IsAsciiDigit(c) && c != '.' && c != '_' && | |
| 71 c != '-') { | |
| 72 return false; | |
| 73 } | |
| 74 } | |
| 75 return true; | |
| 76 } | |
| 77 | |
// Appends one parameter to the query string |query|. A '&' separator is added
// first when |query| already has content; the parameter itself is written as
// "key=value", or as just "value" when |key| is empty. Nothing is escaped.
void AppendParamToQuery(const std::string& key,
                        const std::string& value,
                        std::string* query) {
  std::string& out = *query;
  if (!out.empty())
    out += "&";
  if (!key.empty()) {
    out += key;
    out += "=";
  }
  out += value;
}
| 89 | |
| 90 // Returns true if |url| is empty or is a valid URL with a scheme of HTTP[S]. | |
| 91 bool IsHTTPRef(const std::string& url) { | |
| 92 if (url.empty()) | |
| 93 return true; | |
| 94 GURL gurl(url); | |
| 95 return gurl.is_valid() && (gurl.SchemeIs(url::kHttpScheme) || | |
| 96 gurl.SchemeIs(url::kHttpsScheme)); | |
| 97 } | |
| 98 | |
| 99 } // namespace | |
| 100 | |
| 101 | |
| 102 // TemplateURLParsingContext -------------------------------------------------- | |
| 103 | |
| 104 // To minimize memory overhead while parsing, a SAX style parser is used. | |
| 105 // TemplateURLParsingContext is used to maintain the state we're in the document | |
| 106 // while parsing. | |
| 107 class TemplateURLParsingContext { | |
| 108 public: | |
| 109 // Enum of the known element types. | |
| 110 enum ElementType { | |
| 111 UNKNOWN, | |
| 112 OPEN_SEARCH_DESCRIPTION, | |
| 113 URL, | |
| 114 PARAM, | |
| 115 SHORT_NAME, | |
| 116 IMAGE, | |
| 117 INPUT_ENCODING, | |
| 118 }; | |
| 119 | |
| 120 enum Method { | |
| 121 GET, | |
| 122 POST | |
| 123 }; | |
| 124 | |
| 125 // Key/value of a Param node. | |
| 126 typedef std::pair<std::string, std::string> Param; | |
| 127 | |
| 128 explicit TemplateURLParsingContext( | |
| 129 TemplateURLParser::ParameterFilter* parameter_filter); | |
| 130 | |
| 131 static void StartElementImpl(void* ctx, | |
| 132 const xmlChar* name, | |
| 133 const xmlChar** atts); | |
| 134 static void EndElementImpl(void* ctx, const xmlChar* name); | |
| 135 static void CharactersImpl(void* ctx, const xmlChar* ch, int len); | |
| 136 | |
| 137 // Returns a heap-allocated TemplateURL representing the result of parsing. | |
| 138 // This will be NULL if parsing failed or if the results were invalid for some | |
| 139 // reason (e.g. the resulting URL was not HTTP[S], a name wasn't supplied, | |
| 140 // a resulting TemplateURLRef was invalid, etc.). | |
| 141 TemplateURL* GetTemplateURL(const SearchTermsData& search_terms_data, | |
| 142 bool show_in_default_list); | |
| 143 | |
| 144 private: | |
| 145 // Key is UTF8 encoded. | |
| 146 typedef std::map<std::string, ElementType> ElementNameToElementTypeMap; | |
| 147 | |
| 148 static void InitMapping(); | |
| 149 | |
| 150 void ParseURL(const xmlChar** atts); | |
| 151 void ParseImage(const xmlChar** atts); | |
| 152 void ParseParam(const xmlChar** atts); | |
| 153 void ProcessURLParams(); | |
| 154 | |
| 155 // Returns the current ElementType. | |
| 156 ElementType GetKnownType(); | |
| 157 | |
| 158 static ElementNameToElementTypeMap* kElementNameToElementTypeMap; | |
| 159 | |
| 160 // Data that gets updated as we parse, and is converted to a TemplateURL by | |
| 161 // GetTemplateURL(). | |
| 162 TemplateURLData data_; | |
| 163 | |
| 164 std::vector<ElementType> elements_; | |
| 165 bool image_is_valid_for_favicon_; | |
| 166 | |
| 167 // Character content for the current element. | |
| 168 base::string16 string_; | |
| 169 | |
| 170 TemplateURLParser::ParameterFilter* parameter_filter_; | |
| 171 | |
| 172 // The list of parameters parsed in the Param nodes of a Url node. | |
| 173 std::vector<Param> extra_params_; | |
| 174 | |
| 175 // The HTTP methods used. | |
| 176 Method method_; | |
| 177 Method suggestion_method_; | |
| 178 | |
| 179 // If true, we are currently parsing a suggest URL, otherwise it is an HTML | |
| 180 // search. Note that we don't need a stack as URL nodes cannot be nested. | |
| 181 bool is_suggest_url_; | |
| 182 | |
| 183 // Whether we should derive the image from the URL (when images are data | |
| 184 // URLs). | |
| 185 bool derive_image_from_url_; | |
| 186 | |
| 187 DISALLOW_COPY_AND_ASSIGN(TemplateURLParsingContext); | |
| 188 }; | |
| 189 | |
// static
// Shared by all parsing contexts. Starts out NULL and is created on demand by
// the constructor through InitMapping(); no code that deletes it is visible
// in this file.
TemplateURLParsingContext::ElementNameToElementTypeMap*
    TemplateURLParsingContext::kElementNameToElementTypeMap = NULL;
| 193 | |
| 194 TemplateURLParsingContext::TemplateURLParsingContext( | |
| 195 TemplateURLParser::ParameterFilter* parameter_filter) | |
| 196 : image_is_valid_for_favicon_(false), | |
| 197 parameter_filter_(parameter_filter), | |
| 198 method_(GET), | |
| 199 suggestion_method_(GET), | |
| 200 is_suggest_url_(false), | |
| 201 derive_image_from_url_(false) { | |
| 202 if (kElementNameToElementTypeMap == NULL) | |
| 203 InitMapping(); | |
| 204 } | |
| 205 | |
| 206 // static | |
| 207 void TemplateURLParsingContext::StartElementImpl(void* ctx, | |
| 208 const xmlChar* name, | |
| 209 const xmlChar** atts) { | |
| 210 // Remove the namespace from |name|, ex: os:Url -> Url. | |
| 211 std::string node_name(XMLCharToString(name)); | |
| 212 size_t index = node_name.find_first_of(":"); | |
| 213 if (index != std::string::npos) | |
| 214 node_name.erase(0, index + 1); | |
| 215 | |
| 216 TemplateURLParsingContext* context = | |
| 217 reinterpret_cast<TemplateURLParsingContext*>(ctx); | |
| 218 context->elements_.push_back( | |
| 219 context->kElementNameToElementTypeMap->count(node_name) ? | |
| 220 (*context->kElementNameToElementTypeMap)[node_name] : UNKNOWN); | |
| 221 switch (context->GetKnownType()) { | |
| 222 case TemplateURLParsingContext::URL: | |
| 223 context->extra_params_.clear(); | |
| 224 context->ParseURL(atts); | |
| 225 break; | |
| 226 case TemplateURLParsingContext::IMAGE: | |
| 227 context->ParseImage(atts); | |
| 228 break; | |
| 229 case TemplateURLParsingContext::PARAM: | |
| 230 context->ParseParam(atts); | |
| 231 break; | |
| 232 default: | |
| 233 break; | |
| 234 } | |
| 235 context->string_.clear(); | |
| 236 } | |
| 237 | |
| 238 // static | |
| 239 void TemplateURLParsingContext::EndElementImpl(void* ctx, const xmlChar* name) { | |
| 240 TemplateURLParsingContext* context = | |
| 241 reinterpret_cast<TemplateURLParsingContext*>(ctx); | |
| 242 switch (context->GetKnownType()) { | |
| 243 case TemplateURLParsingContext::SHORT_NAME: | |
| 244 context->data_.short_name = context->string_; | |
| 245 break; | |
| 246 case TemplateURLParsingContext::IMAGE: { | |
| 247 GURL image_url(base::UTF16ToUTF8(context->string_)); | |
| 248 if (image_url.SchemeIs(url::kDataScheme)) { | |
| 249 // TODO (jcampan): bug 1169256: when dealing with data URL, we need to | |
| 250 // decode the data URL in the renderer. For now, we'll just point to the | |
| 251 // favicon from the URL. | |
| 252 context->derive_image_from_url_ = true; | |
| 253 } else if (context->image_is_valid_for_favicon_ && image_url.is_valid() && | |
| 254 (image_url.SchemeIs(url::kHttpScheme) || | |
| 255 image_url.SchemeIs(url::kHttpsScheme))) { | |
| 256 context->data_.favicon_url = image_url; | |
| 257 } | |
| 258 context->image_is_valid_for_favicon_ = false; | |
| 259 break; | |
| 260 } | |
| 261 case TemplateURLParsingContext::INPUT_ENCODING: { | |
| 262 std::string input_encoding = base::UTF16ToASCII(context->string_); | |
| 263 if (IsValidEncodingString(input_encoding)) | |
| 264 context->data_.input_encodings.push_back(input_encoding); | |
| 265 break; | |
| 266 } | |
| 267 case TemplateURLParsingContext::URL: | |
| 268 context->ProcessURLParams(); | |
| 269 break; | |
| 270 default: | |
| 271 break; | |
| 272 } | |
| 273 context->string_.clear(); | |
| 274 context->elements_.pop_back(); | |
| 275 } | |
| 276 | |
| 277 // static | |
| 278 void TemplateURLParsingContext::CharactersImpl(void* ctx, | |
| 279 const xmlChar* ch, | |
| 280 int len) { | |
| 281 reinterpret_cast<TemplateURLParsingContext*>(ctx)->string_ += | |
| 282 base::UTF8ToUTF16(std::string(reinterpret_cast<const char*>(ch), len)); | |
| 283 } | |
| 284 | |
| 285 TemplateURL* TemplateURLParsingContext::GetTemplateURL( | |
| 286 const SearchTermsData& search_terms_data, | |
| 287 bool show_in_default_list) { | |
| 288 // TODO(jcampan): Support engines that use POST; see http://crbug.com/18107 | |
| 289 if (method_ == TemplateURLParsingContext::POST || data_.short_name.empty() || | |
| 290 !IsHTTPRef(data_.url()) || !IsHTTPRef(data_.suggestions_url)) | |
| 291 return NULL; | |
| 292 if (suggestion_method_ == TemplateURLParsingContext::POST) | |
| 293 data_.suggestions_url.clear(); | |
| 294 | |
| 295 // If the image was a data URL, use the favicon from the search URL instead. | |
| 296 // (see the TODO in EndElementImpl()). | |
| 297 GURL search_url(data_.url()); | |
| 298 if (derive_image_from_url_ && data_.favicon_url.is_empty()) | |
| 299 data_.favicon_url = TemplateURL::GenerateFaviconURL(search_url); | |
| 300 | |
| 301 data_.SetKeyword(TemplateURL::GenerateKeyword(search_url)); | |
| 302 data_.show_in_default_list = show_in_default_list; | |
| 303 | |
| 304 // Bail if the search URL is empty or if either TemplateURLRef is invalid. | |
| 305 scoped_ptr<TemplateURL> template_url(new TemplateURL(data_)); | |
| 306 if (template_url->url().empty() || | |
| 307 !template_url->url_ref().IsValid(search_terms_data) || | |
| 308 (!template_url->suggestions_url().empty() && | |
| 309 !template_url->suggestions_url_ref().IsValid(search_terms_data))) { | |
| 310 return NULL; | |
| 311 } | |
| 312 | |
| 313 return template_url.release(); | |
| 314 } | |
| 315 | |
| 316 // static | |
| 317 void TemplateURLParsingContext::InitMapping() { | |
| 318 kElementNameToElementTypeMap = new std::map<std::string, ElementType>; | |
| 319 (*kElementNameToElementTypeMap)[kURLElement] = URL; | |
| 320 (*kElementNameToElementTypeMap)[kParamElement] = PARAM; | |
| 321 (*kElementNameToElementTypeMap)[kShortNameElement] = SHORT_NAME; | |
| 322 (*kElementNameToElementTypeMap)[kImageElement] = IMAGE; | |
| 323 (*kElementNameToElementTypeMap)[kOpenSearchDescriptionElement] = | |
| 324 OPEN_SEARCH_DESCRIPTION; | |
| 325 (*kElementNameToElementTypeMap)[kFirefoxSearchDescriptionElement] = | |
| 326 OPEN_SEARCH_DESCRIPTION; | |
| 327 (*kElementNameToElementTypeMap)[kInputEncodingElement] = INPUT_ENCODING; | |
| 328 } | |
| 329 | |
| 330 void TemplateURLParsingContext::ParseURL(const xmlChar** atts) { | |
| 331 if (!atts) | |
| 332 return; | |
| 333 | |
| 334 std::string template_url; | |
| 335 bool is_post = false; | |
| 336 bool is_html_url = false; | |
| 337 bool is_suggest_url = false; | |
| 338 for (; *atts; atts += 2) { | |
| 339 std::string name(XMLCharToString(*atts)); | |
| 340 const xmlChar* value = atts[1]; | |
| 341 if (name == kURLTypeAttribute) { | |
| 342 std::string type = XMLCharToString(value); | |
| 343 is_html_url = (type == kHTMLType); | |
| 344 is_suggest_url = (type == kSuggestionType); | |
| 345 } else if (name == kURLTemplateAttribute) { | |
| 346 template_url = XMLCharToString(value); | |
| 347 } else if (name == kParamMethodAttribute) { | |
| 348 is_post = LowerCaseEqualsASCII(XMLCharToString(value), "post"); | |
| 349 } | |
| 350 } | |
| 351 | |
| 352 if (is_html_url && !template_url.empty()) { | |
| 353 data_.SetURL(template_url); | |
| 354 is_suggest_url_ = false; | |
| 355 if (is_post) | |
| 356 method_ = POST; | |
| 357 } else if (is_suggest_url) { | |
| 358 data_.suggestions_url = template_url; | |
| 359 is_suggest_url_ = true; | |
| 360 if (is_post) | |
| 361 suggestion_method_ = POST; | |
| 362 } | |
| 363 } | |
| 364 | |
| 365 void TemplateURLParsingContext::ParseImage(const xmlChar** atts) { | |
| 366 if (!atts) | |
| 367 return; | |
| 368 | |
| 369 int width = 0; | |
| 370 int height = 0; | |
| 371 std::string type; | |
| 372 for (; *atts; atts += 2) { | |
| 373 std::string name(XMLCharToString(*atts)); | |
| 374 const xmlChar* value = atts[1]; | |
| 375 if (name == kImageTypeAttribute) { | |
| 376 type = XMLCharToString(value); | |
| 377 } else if (name == kImageWidthAttribute) { | |
| 378 base::StringToInt(XMLCharToString(value), &width); | |
| 379 } else if (name == kImageHeightAttribute) { | |
| 380 base::StringToInt(XMLCharToString(value), &height); | |
| 381 } | |
| 382 } | |
| 383 | |
| 384 image_is_valid_for_favicon_ = (width == gfx::kFaviconSize) && | |
| 385 (height == gfx::kFaviconSize) && | |
| 386 ((type == "image/x-icon") || (type == "image/vnd.microsoft.icon")); | |
| 387 } | |
| 388 | |
| 389 void TemplateURLParsingContext::ParseParam(const xmlChar** atts) { | |
| 390 if (!atts) | |
| 391 return; | |
| 392 | |
| 393 std::string key, value; | |
| 394 for (; *atts; atts += 2) { | |
| 395 std::string name(XMLCharToString(*atts)); | |
| 396 const xmlChar* val = atts[1]; | |
| 397 if (name == kParamNameAttribute) { | |
| 398 key = XMLCharToString(val); | |
| 399 } else if (name == kParamValueAttribute) { | |
| 400 value = XMLCharToString(val); | |
| 401 } | |
| 402 } | |
| 403 | |
| 404 if (!key.empty() && | |
| 405 (!parameter_filter_ || parameter_filter_->KeepParameter(key, value))) | |
| 406 extra_params_.push_back(Param(key, value)); | |
| 407 } | |
| 408 | |
| 409 void TemplateURLParsingContext::ProcessURLParams() { | |
| 410 if (!parameter_filter_ && extra_params_.empty()) | |
| 411 return; | |
| 412 | |
| 413 GURL url(is_suggest_url_ ? data_.suggestions_url : data_.url()); | |
| 414 if (url.is_empty()) | |
| 415 return; | |
| 416 | |
| 417 // If there is a parameter filter, parse the existing URL and remove any | |
| 418 // unwanted parameter. | |
| 419 std::string new_query; | |
| 420 bool modified = false; | |
| 421 if (parameter_filter_) { | |
| 422 url::Component query = url.parsed_for_possibly_invalid_spec().query; | |
| 423 url::Component key, value; | |
| 424 const char* url_spec = url.spec().c_str(); | |
| 425 while (url::ExtractQueryKeyValue(url_spec, &query, &key, &value)) { | |
| 426 std::string key_str(url_spec, key.begin, key.len); | |
| 427 std::string value_str(url_spec, value.begin, value.len); | |
| 428 if (parameter_filter_->KeepParameter(key_str, value_str)) { | |
| 429 AppendParamToQuery(key_str, value_str, &new_query); | |
| 430 } else { | |
| 431 modified = true; | |
| 432 } | |
| 433 } | |
| 434 } | |
| 435 if (!modified) | |
| 436 new_query = url.query(); | |
| 437 | |
| 438 // Add the extra parameters if any. | |
| 439 if (!extra_params_.empty()) { | |
| 440 modified = true; | |
| 441 for (std::vector<Param>::const_iterator iter(extra_params_.begin()); | |
| 442 iter != extra_params_.end(); ++iter) | |
| 443 AppendParamToQuery(iter->first, iter->second, &new_query); | |
| 444 } | |
| 445 | |
| 446 if (modified) { | |
| 447 GURL::Replacements repl; | |
| 448 repl.SetQueryStr(new_query); | |
| 449 url = url.ReplaceComponents(repl); | |
| 450 if (is_suggest_url_) | |
| 451 data_.suggestions_url = url.spec(); | |
| 452 else if (url.is_valid()) | |
| 453 data_.SetURL(url.spec()); | |
| 454 } | |
| 455 } | |
| 456 | |
| 457 TemplateURLParsingContext::ElementType | |
| 458 TemplateURLParsingContext::GetKnownType() { | |
| 459 if (elements_.size() == 2 && elements_[0] == OPEN_SEARCH_DESCRIPTION) | |
| 460 return elements_[1]; | |
| 461 // We only expect PARAM nodes under the URL node. | |
| 462 return (elements_.size() == 3 && elements_[0] == OPEN_SEARCH_DESCRIPTION && | |
| 463 elements_[1] == URL && elements_[2] == PARAM) ? PARAM : UNKNOWN; | |
| 464 } | |
| 465 | |
| 466 | |
| 467 // TemplateURLParser ---------------------------------------------------------- | |
| 468 | |
| 469 // static | |
| 470 TemplateURL* TemplateURLParser::Parse( | |
| 471 const SearchTermsData& search_terms_data, | |
| 472 bool show_in_default_list, | |
| 473 const char* data, | |
| 474 size_t length, | |
| 475 TemplateURLParser::ParameterFilter* param_filter) { | |
| 476 // xmlSubstituteEntitiesDefault(1) makes it so that & isn't mapped to | |
| 477 // & . Unfortunately xmlSubstituteEntitiesDefault affects global state. | |
| 478 // If this becomes problematic we'll need to provide our own entity | |
| 479 // type for &, or strip out & by hand after parsing. | |
| 480 int last_sub_entities_value = xmlSubstituteEntitiesDefault(1); | |
| 481 TemplateURLParsingContext context(param_filter); | |
| 482 xmlSAXHandler sax_handler; | |
| 483 memset(&sax_handler, 0, sizeof(sax_handler)); | |
| 484 sax_handler.startElement = &TemplateURLParsingContext::StartElementImpl; | |
| 485 sax_handler.endElement = &TemplateURLParsingContext::EndElementImpl; | |
| 486 sax_handler.characters = &TemplateURLParsingContext::CharactersImpl; | |
| 487 int error = xmlSAXUserParseMemory(&sax_handler, &context, data, | |
| 488 static_cast<int>(length)); | |
| 489 xmlSubstituteEntitiesDefault(last_sub_entities_value); | |
| 490 | |
| 491 return error ? | |
| 492 NULL : context.GetTemplateURL(search_terms_data, show_in_default_list); | |
| 493 } | |
| OLD | NEW |