Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(44)

Side by Side Diff: chrome/browser/search_engines/template_url_parser.cc

Issue 373343003: Componentize TemplateURLFetcher (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Created 6 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chrome/browser/search_engines/template_url_parser.h"
6
7 #include <algorithm>
8 #include <map>
9 #include <vector>
10
11 #include "base/logging.h"
12 #include "base/memory/scoped_ptr.h"
13 #include "base/strings/string_number_conversions.h"
14 #include "base/strings/string_util.h"
15 #include "base/strings/utf_string_conversions.h"
16 #include "components/search_engines/template_url.h"
17 #include "libxml/parser.h"
18 #include "libxml/xmlwriter.h"
19 #include "ui/gfx/favicon_size.h"
20 #include "url/gurl.h"
21 #include "url/url_constants.h"
22
23 namespace {
24
// libxml hands us UTF-8. Code points 0-127 are encoded identically in UTF-8
// and in plain char, so every name below is a narrow string; that lets us
// compare directly instead of converting to wide strings first.

// Element names that may appear in an OSD document. Both the OpenSearch and
// the Firefox root element names are listed.
const char kOpenSearchDescriptionElement[] = "OpenSearchDescription";
const char kFirefoxSearchDescriptionElement[] = "SearchPlugin";
const char kShortNameElement[] = "ShortName";
const char kInputEncodingElement[] = "InputEncoding";
const char kImageElement[] = "Image";
const char kURLElement[] = "Url";
const char kParamElement[] = "Param";

// Attribute names read from Url elements.
const char kURLTypeAttribute[] = "type";
const char kURLTemplateAttribute[] = "template";

// Attribute names read from Image elements.
const char kImageTypeAttribute[] = "type";
const char kImageWidthAttribute[] = "width";
const char kImageHeightAttribute[] = "height";

// Attribute names read from Param elements, plus the HTTP method attribute.
const char kParamNameAttribute[] = "name";
const char kParamValueAttribute[] = "value";
const char kParamMethodAttribute[] = "method";

// Url "type" value identifying the search results page.
const char kHTMLType[] = "text/html";

// Url "type" value identifying as-you-type suggestions.
const char kSuggestionType[] = "application/x-suggestions+json";
53
54 std::string XMLCharToString(const xmlChar* value) {
55 return std::string(reinterpret_cast<const char*>(value));
56 }
57
58 // Returns true if input_encoding contains a valid input encoding string. This
59 // doesn't verify that we have a valid encoding for the string, just that the
60 // string contains characters that constitute a valid input encoding.
61 bool IsValidEncodingString(const std::string& input_encoding) {
62 if (input_encoding.empty())
63 return false;
64
65 if (!IsAsciiAlpha(input_encoding[0]))
66 return false;
67
68 for (size_t i = 1, max = input_encoding.size(); i < max; ++i) {
69 char c = input_encoding[i];
70 if (!IsAsciiAlpha(c) && !IsAsciiDigit(c) && c != '.' && c != '_' &&
71 c != '-') {
72 return false;
73 }
74 }
75 return true;
76 }
77
// Appends one parameter to the query string in |query|.
//
// A '&' separator is written first when |query| already has content. The
// parameter itself is written as "key=value", or as just "value" when |key| is
// empty. Nothing is escaped.
void AppendParamToQuery(const std::string& key,
                        const std::string& value,
                        std::string* query) {
  std::string& out = *query;
  if (!out.empty())
    out += "&";
  if (!key.empty()) {
    out += key;
    out += "=";
  }
  out += value;
}
89
90 // Returns true if |url| is empty or is a valid URL with a scheme of HTTP[S].
91 bool IsHTTPRef(const std::string& url) {
92 if (url.empty())
93 return true;
94 GURL gurl(url);
95 return gurl.is_valid() && (gurl.SchemeIs(url::kHttpScheme) ||
96 gurl.SchemeIs(url::kHttpsScheme));
97 }
98
99 } // namespace
100
101
102 // TemplateURLParsingContext --------------------------------------------------
103
104 // To minimize memory overhead while parsing, a SAX style parser is used.
105 // TemplateURLParsingContext is used to maintain the state we're in the document
106 // while parsing.
107 class TemplateURLParsingContext {
108 public:
109 // Enum of the known element types.
110 enum ElementType {
111 UNKNOWN,
112 OPEN_SEARCH_DESCRIPTION,
113 URL,
114 PARAM,
115 SHORT_NAME,
116 IMAGE,
117 INPUT_ENCODING,
118 };
119
120 enum Method {
121 GET,
122 POST
123 };
124
125 // Key/value of a Param node.
126 typedef std::pair<std::string, std::string> Param;
127
128 explicit TemplateURLParsingContext(
129 TemplateURLParser::ParameterFilter* parameter_filter);
130
131 static void StartElementImpl(void* ctx,
132 const xmlChar* name,
133 const xmlChar** atts);
134 static void EndElementImpl(void* ctx, const xmlChar* name);
135 static void CharactersImpl(void* ctx, const xmlChar* ch, int len);
136
137 // Returns a heap-allocated TemplateURL representing the result of parsing.
138 // This will be NULL if parsing failed or if the results were invalid for some
139 // reason (e.g. the resulting URL was not HTTP[S], a name wasn't supplied,
140 // a resulting TemplateURLRef was invalid, etc.).
141 TemplateURL* GetTemplateURL(const SearchTermsData& search_terms_data,
142 bool show_in_default_list);
143
144 private:
145 // Key is UTF8 encoded.
146 typedef std::map<std::string, ElementType> ElementNameToElementTypeMap;
147
148 static void InitMapping();
149
150 void ParseURL(const xmlChar** atts);
151 void ParseImage(const xmlChar** atts);
152 void ParseParam(const xmlChar** atts);
153 void ProcessURLParams();
154
155 // Returns the current ElementType.
156 ElementType GetKnownType();
157
158 static ElementNameToElementTypeMap* kElementNameToElementTypeMap;
159
160 // Data that gets updated as we parse, and is converted to a TemplateURL by
161 // GetTemplateURL().
162 TemplateURLData data_;
163
164 std::vector<ElementType> elements_;
165 bool image_is_valid_for_favicon_;
166
167 // Character content for the current element.
168 base::string16 string_;
169
170 TemplateURLParser::ParameterFilter* parameter_filter_;
171
172 // The list of parameters parsed in the Param nodes of a Url node.
173 std::vector<Param> extra_params_;
174
175 // The HTTP methods used.
176 Method method_;
177 Method suggestion_method_;
178
179 // If true, we are currently parsing a suggest URL, otherwise it is an HTML
180 // search. Note that we don't need a stack as URL nodes cannot be nested.
181 bool is_suggest_url_;
182
183 // Whether we should derive the image from the URL (when images are data
184 // URLs).
185 bool derive_image_from_url_;
186
187 DISALLOW_COPY_AND_ASSIGN(TemplateURLParsingContext);
188 };
189
190 // static
191 TemplateURLParsingContext::ElementNameToElementTypeMap*
192 TemplateURLParsingContext::kElementNameToElementTypeMap = NULL;
193
194 TemplateURLParsingContext::TemplateURLParsingContext(
195 TemplateURLParser::ParameterFilter* parameter_filter)
196 : image_is_valid_for_favicon_(false),
197 parameter_filter_(parameter_filter),
198 method_(GET),
199 suggestion_method_(GET),
200 is_suggest_url_(false),
201 derive_image_from_url_(false) {
202 if (kElementNameToElementTypeMap == NULL)
203 InitMapping();
204 }
205
206 // static
207 void TemplateURLParsingContext::StartElementImpl(void* ctx,
208 const xmlChar* name,
209 const xmlChar** atts) {
210 // Remove the namespace from |name|, ex: os:Url -> Url.
211 std::string node_name(XMLCharToString(name));
212 size_t index = node_name.find_first_of(":");
213 if (index != std::string::npos)
214 node_name.erase(0, index + 1);
215
216 TemplateURLParsingContext* context =
217 reinterpret_cast<TemplateURLParsingContext*>(ctx);
218 context->elements_.push_back(
219 context->kElementNameToElementTypeMap->count(node_name) ?
220 (*context->kElementNameToElementTypeMap)[node_name] : UNKNOWN);
221 switch (context->GetKnownType()) {
222 case TemplateURLParsingContext::URL:
223 context->extra_params_.clear();
224 context->ParseURL(atts);
225 break;
226 case TemplateURLParsingContext::IMAGE:
227 context->ParseImage(atts);
228 break;
229 case TemplateURLParsingContext::PARAM:
230 context->ParseParam(atts);
231 break;
232 default:
233 break;
234 }
235 context->string_.clear();
236 }
237
238 // static
239 void TemplateURLParsingContext::EndElementImpl(void* ctx, const xmlChar* name) {
240 TemplateURLParsingContext* context =
241 reinterpret_cast<TemplateURLParsingContext*>(ctx);
242 switch (context->GetKnownType()) {
243 case TemplateURLParsingContext::SHORT_NAME:
244 context->data_.short_name = context->string_;
245 break;
246 case TemplateURLParsingContext::IMAGE: {
247 GURL image_url(base::UTF16ToUTF8(context->string_));
248 if (image_url.SchemeIs(url::kDataScheme)) {
249 // TODO (jcampan): bug 1169256: when dealing with data URL, we need to
250 // decode the data URL in the renderer. For now, we'll just point to the
251 // favicon from the URL.
252 context->derive_image_from_url_ = true;
253 } else if (context->image_is_valid_for_favicon_ && image_url.is_valid() &&
254 (image_url.SchemeIs(url::kHttpScheme) ||
255 image_url.SchemeIs(url::kHttpsScheme))) {
256 context->data_.favicon_url = image_url;
257 }
258 context->image_is_valid_for_favicon_ = false;
259 break;
260 }
261 case TemplateURLParsingContext::INPUT_ENCODING: {
262 std::string input_encoding = base::UTF16ToASCII(context->string_);
263 if (IsValidEncodingString(input_encoding))
264 context->data_.input_encodings.push_back(input_encoding);
265 break;
266 }
267 case TemplateURLParsingContext::URL:
268 context->ProcessURLParams();
269 break;
270 default:
271 break;
272 }
273 context->string_.clear();
274 context->elements_.pop_back();
275 }
276
277 // static
278 void TemplateURLParsingContext::CharactersImpl(void* ctx,
279 const xmlChar* ch,
280 int len) {
281 reinterpret_cast<TemplateURLParsingContext*>(ctx)->string_ +=
282 base::UTF8ToUTF16(std::string(reinterpret_cast<const char*>(ch), len));
283 }
284
285 TemplateURL* TemplateURLParsingContext::GetTemplateURL(
286 const SearchTermsData& search_terms_data,
287 bool show_in_default_list) {
288 // TODO(jcampan): Support engines that use POST; see http://crbug.com/18107
289 if (method_ == TemplateURLParsingContext::POST || data_.short_name.empty() ||
290 !IsHTTPRef(data_.url()) || !IsHTTPRef(data_.suggestions_url))
291 return NULL;
292 if (suggestion_method_ == TemplateURLParsingContext::POST)
293 data_.suggestions_url.clear();
294
295 // If the image was a data URL, use the favicon from the search URL instead.
296 // (see the TODO in EndElementImpl()).
297 GURL search_url(data_.url());
298 if (derive_image_from_url_ && data_.favicon_url.is_empty())
299 data_.favicon_url = TemplateURL::GenerateFaviconURL(search_url);
300
301 data_.SetKeyword(TemplateURL::GenerateKeyword(search_url));
302 data_.show_in_default_list = show_in_default_list;
303
304 // Bail if the search URL is empty or if either TemplateURLRef is invalid.
305 scoped_ptr<TemplateURL> template_url(new TemplateURL(data_));
306 if (template_url->url().empty() ||
307 !template_url->url_ref().IsValid(search_terms_data) ||
308 (!template_url->suggestions_url().empty() &&
309 !template_url->suggestions_url_ref().IsValid(search_terms_data))) {
310 return NULL;
311 }
312
313 return template_url.release();
314 }
315
316 // static
317 void TemplateURLParsingContext::InitMapping() {
318 kElementNameToElementTypeMap = new std::map<std::string, ElementType>;
319 (*kElementNameToElementTypeMap)[kURLElement] = URL;
320 (*kElementNameToElementTypeMap)[kParamElement] = PARAM;
321 (*kElementNameToElementTypeMap)[kShortNameElement] = SHORT_NAME;
322 (*kElementNameToElementTypeMap)[kImageElement] = IMAGE;
323 (*kElementNameToElementTypeMap)[kOpenSearchDescriptionElement] =
324 OPEN_SEARCH_DESCRIPTION;
325 (*kElementNameToElementTypeMap)[kFirefoxSearchDescriptionElement] =
326 OPEN_SEARCH_DESCRIPTION;
327 (*kElementNameToElementTypeMap)[kInputEncodingElement] = INPUT_ENCODING;
328 }
329
330 void TemplateURLParsingContext::ParseURL(const xmlChar** atts) {
331 if (!atts)
332 return;
333
334 std::string template_url;
335 bool is_post = false;
336 bool is_html_url = false;
337 bool is_suggest_url = false;
338 for (; *atts; atts += 2) {
339 std::string name(XMLCharToString(*atts));
340 const xmlChar* value = atts[1];
341 if (name == kURLTypeAttribute) {
342 std::string type = XMLCharToString(value);
343 is_html_url = (type == kHTMLType);
344 is_suggest_url = (type == kSuggestionType);
345 } else if (name == kURLTemplateAttribute) {
346 template_url = XMLCharToString(value);
347 } else if (name == kParamMethodAttribute) {
348 is_post = LowerCaseEqualsASCII(XMLCharToString(value), "post");
349 }
350 }
351
352 if (is_html_url && !template_url.empty()) {
353 data_.SetURL(template_url);
354 is_suggest_url_ = false;
355 if (is_post)
356 method_ = POST;
357 } else if (is_suggest_url) {
358 data_.suggestions_url = template_url;
359 is_suggest_url_ = true;
360 if (is_post)
361 suggestion_method_ = POST;
362 }
363 }
364
365 void TemplateURLParsingContext::ParseImage(const xmlChar** atts) {
366 if (!atts)
367 return;
368
369 int width = 0;
370 int height = 0;
371 std::string type;
372 for (; *atts; atts += 2) {
373 std::string name(XMLCharToString(*atts));
374 const xmlChar* value = atts[1];
375 if (name == kImageTypeAttribute) {
376 type = XMLCharToString(value);
377 } else if (name == kImageWidthAttribute) {
378 base::StringToInt(XMLCharToString(value), &width);
379 } else if (name == kImageHeightAttribute) {
380 base::StringToInt(XMLCharToString(value), &height);
381 }
382 }
383
384 image_is_valid_for_favicon_ = (width == gfx::kFaviconSize) &&
385 (height == gfx::kFaviconSize) &&
386 ((type == "image/x-icon") || (type == "image/vnd.microsoft.icon"));
387 }
388
389 void TemplateURLParsingContext::ParseParam(const xmlChar** atts) {
390 if (!atts)
391 return;
392
393 std::string key, value;
394 for (; *atts; atts += 2) {
395 std::string name(XMLCharToString(*atts));
396 const xmlChar* val = atts[1];
397 if (name == kParamNameAttribute) {
398 key = XMLCharToString(val);
399 } else if (name == kParamValueAttribute) {
400 value = XMLCharToString(val);
401 }
402 }
403
404 if (!key.empty() &&
405 (!parameter_filter_ || parameter_filter_->KeepParameter(key, value)))
406 extra_params_.push_back(Param(key, value));
407 }
408
409 void TemplateURLParsingContext::ProcessURLParams() {
410 if (!parameter_filter_ && extra_params_.empty())
411 return;
412
413 GURL url(is_suggest_url_ ? data_.suggestions_url : data_.url());
414 if (url.is_empty())
415 return;
416
417 // If there is a parameter filter, parse the existing URL and remove any
418 // unwanted parameter.
419 std::string new_query;
420 bool modified = false;
421 if (parameter_filter_) {
422 url::Component query = url.parsed_for_possibly_invalid_spec().query;
423 url::Component key, value;
424 const char* url_spec = url.spec().c_str();
425 while (url::ExtractQueryKeyValue(url_spec, &query, &key, &value)) {
426 std::string key_str(url_spec, key.begin, key.len);
427 std::string value_str(url_spec, value.begin, value.len);
428 if (parameter_filter_->KeepParameter(key_str, value_str)) {
429 AppendParamToQuery(key_str, value_str, &new_query);
430 } else {
431 modified = true;
432 }
433 }
434 }
435 if (!modified)
436 new_query = url.query();
437
438 // Add the extra parameters if any.
439 if (!extra_params_.empty()) {
440 modified = true;
441 for (std::vector<Param>::const_iterator iter(extra_params_.begin());
442 iter != extra_params_.end(); ++iter)
443 AppendParamToQuery(iter->first, iter->second, &new_query);
444 }
445
446 if (modified) {
447 GURL::Replacements repl;
448 repl.SetQueryStr(new_query);
449 url = url.ReplaceComponents(repl);
450 if (is_suggest_url_)
451 data_.suggestions_url = url.spec();
452 else if (url.is_valid())
453 data_.SetURL(url.spec());
454 }
455 }
456
457 TemplateURLParsingContext::ElementType
458 TemplateURLParsingContext::GetKnownType() {
459 if (elements_.size() == 2 && elements_[0] == OPEN_SEARCH_DESCRIPTION)
460 return elements_[1];
461 // We only expect PARAM nodes under the URL node.
462 return (elements_.size() == 3 && elements_[0] == OPEN_SEARCH_DESCRIPTION &&
463 elements_[1] == URL && elements_[2] == PARAM) ? PARAM : UNKNOWN;
464 }
465
466
467 // TemplateURLParser ----------------------------------------------------------
468
469 // static
470 TemplateURL* TemplateURLParser::Parse(
471 const SearchTermsData& search_terms_data,
472 bool show_in_default_list,
473 const char* data,
474 size_t length,
475 TemplateURLParser::ParameterFilter* param_filter) {
476 // xmlSubstituteEntitiesDefault(1) makes it so that &amp; isn't mapped to
477 // &#38; . Unfortunately xmlSubstituteEntitiesDefault affects global state.
478 // If this becomes problematic we'll need to provide our own entity
479 // type for &amp;, or strip out &#38; by hand after parsing.
480 int last_sub_entities_value = xmlSubstituteEntitiesDefault(1);
481 TemplateURLParsingContext context(param_filter);
482 xmlSAXHandler sax_handler;
483 memset(&sax_handler, 0, sizeof(sax_handler));
484 sax_handler.startElement = &TemplateURLParsingContext::StartElementImpl;
485 sax_handler.endElement = &TemplateURLParsingContext::EndElementImpl;
486 sax_handler.characters = &TemplateURLParsingContext::CharactersImpl;
487 int error = xmlSAXUserParseMemory(&sax_handler, &context, data,
488 static_cast<int>(length));
489 xmlSubstituteEntitiesDefault(last_sub_entities_value);
490
491 return error ?
492 NULL : context.GetTemplateURL(search_terms_data, show_in_default_list);
493 }
OLDNEW
« no previous file with comments | « chrome/browser/search_engines/template_url_parser.h ('k') | chrome/browser/search_engines/template_url_parser_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698