Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(36)

Side by Side Diff: chrome/browser/template_url_parser.cc

Issue 18263: Move search engines files into subdir (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/
Patch Set: Created 11 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « chrome/browser/template_url_parser.h ('k') | chrome/browser/template_url_parser_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chrome/browser/template_url_parser.h"
6
7 #include <map>
8 #include <vector>
9
10 #include "base/logging.h"
11 #include "base/scoped_ptr.h"
12 #include "base/string_util.h"
13 #include "chrome/browser/template_url.h"
14 #include "googleurl/src/gurl.h"
15 #include "libxml/parser.h"
16 #include "libxml/xmlwriter.h"
17
18 namespace {
19
// NOTE: libxml hands strings back as UTF-8. Code points 0-127 of UTF-8 are
// plain chars, so every name below is a char string. That lets the parser
// compare against libxml output directly, with no conversion to wide first.

// Element names that can appear in an OSD document.
const char kOpenSearchDescriptionElement[] = "OpenSearchDescription";
const char kFirefoxSearchDescriptionElement[] = "SearchPlugin";
const char kShortNameElement[] = "ShortName";
const char kDescriptionElement[] = "Description";
const char kURLElement[] = "Url";
const char kParamElement[] = "Param";
const char kImageElement[] = "Image";
const char kLanguageElement[] = "Language";
const char kInputEncodingElement[] = "InputEncoding";

// Attribute names read from Url nodes.
const char kURLTypeAttribute[] = "type";
const char kURLTemplateAttribute[] = "template";
const char kURLIndexOffsetAttribute[] = "indexOffset";
const char kURLPageOffsetAttribute[] = "pageOffset";

// Attribute names read from Image nodes.
const char kImageTypeAttribute[] = "type";
const char kImageWidthAttribute[] = "width";
const char kImageHeightAttribute[] = "height";

// Attribute names read from Param nodes (and the method attribute).
const char kParamNameAttribute[] = "name";
const char kParamValueAttribute[] = "value";
const char kParamMethodAttribute[] = "method";

// Mime type for search results.
const char kHTMLType[] = "text/html";

// Mime type for as-you-type suggestions.
const char kSuggestionType[] = "application/x-suggestions+json";

// Namespace identifier.
const char kOSDNS[] = "xmlns";

// The namespace for documents we understand.
const char kNameSpace[] = "http://a9.com/-/spec/opensearch/1.1/";
59
// Strips a namespace prefix from |name| in place, e.g. "os:Url" -> "Url".
// Everything up to and including the first ':' is removed; a name that
// contains no ':' is left untouched.
static void PruneNamespace(std::string* name) {
  const std::string::size_type colon = name->find(':');
  if (colon == std::string::npos)
    return;
  *name = name->substr(colon + 1);
}
66
67 //
68 // To minimize memory overhead while parsing, a SAX style parser is used.
69 // ParsingContext is used to maintain the state we're in the document
70 // while parsing.
71 class ParsingContext {
72 public:
73 // Enum of the known element types.
74 enum ElementType {
75 UNKNOWN,
76 OPEN_SEARCH_DESCRIPTION,
77 URL,
78 PARAM,
79 SHORT_NAME,
80 DESCRIPTION,
81 IMAGE,
82 LANGUAGE,
83 INPUT_ENCODING,
84 };
85
86 enum Method {
87 GET,
88 POST
89 };
90
91 // Key/value of a Param node.
92 typedef std::pair<std::string, std::string> Param;
93
94 ParsingContext(TemplateURLParser::ParameterFilter* parameter_filter,
95 TemplateURL* url)
96 : url_(url),
97 parameter_filter_(parameter_filter),
98 method_(GET),
99 suggestion_method_(GET),
100 is_suggest_url_(false),
101 derive_image_from_url_(false) {
102 if (kElementNameToElementTypeMap == NULL)
103 InitMapping();
104 }
105
106 // Invoked when an element starts.
107 void PushElement(const std::string& element) {
108 ElementType type;
109 if (kElementNameToElementTypeMap->find(element) ==
110 kElementNameToElementTypeMap->end()) {
111 type = UNKNOWN;
112 } else {
113 type = (*kElementNameToElementTypeMap)[element];
114 }
115 elements_.push_back(type);
116 }
117
118 void PopElement() {
119 elements_.pop_back();
120 }
121
122 // Returns the current ElementType.
123 ElementType GetKnownType() {
124 if (elements_.size() == 2 && elements_[0] == OPEN_SEARCH_DESCRIPTION)
125 return elements_[1];
126
127 // We only expect PARAM nodes under the Url node
128 if (elements_.size() == 3 && elements_[0] == OPEN_SEARCH_DESCRIPTION &&
129 elements_[1] == URL && elements_[2] == PARAM)
130 return PARAM;
131
132 return UNKNOWN;
133 }
134
135 TemplateURL* template_url() { return url_; }
136
137 void AddImageRef(const std::wstring& type, int width, int height) {
138 if (width > 0 && height > 0)
139 current_image_.reset(new TemplateURL::ImageRef(type, width, height));
140 }
141
142 void EndImage() {
143 current_image_.reset();
144 }
145
146 void SetImageURL(const std::wstring& url) {
147 if (current_image_.get()) {
148 current_image_->url = GURL(WideToUTF8(url));
149 url_->add_image_ref(*current_image_);
150 current_image_.reset();
151 }
152 }
153
154 void ResetString() {
155 string_.clear();
156 }
157
158 void AppendString(const std::wstring& string) {
159 string_ += string;
160 }
161
162 const std::wstring& GetString() {
163 return string_;
164 }
165
166 void ResetExtraParams() {
167 extra_params_.clear();
168 }
169
170 void AddExtraParams(const std::string& key, const std::string& value) {
171 if (parameter_filter_ && !parameter_filter_->KeepParameter(key, value))
172 return;
173 extra_params_.push_back(Param(key, value));
174 }
175
176 const std::vector<Param>& extra_params() const { return extra_params_; }
177
178 void set_is_suggestion(bool value) { is_suggest_url_ = value; }
179 bool is_suggestion() const { return is_suggest_url_; }
180
181 TemplateURLParser::ParameterFilter* parameter_filter() const {
182 return parameter_filter_;
183 }
184
185 void set_derive_image_from_url(bool derive_image_from_url) {
186 derive_image_from_url_ = derive_image_from_url;
187 }
188
189 void set_method(Method method) { method_ = method; }
190 Method method() { return method_; }
191
192 void set_suggestion_method(Method method) { suggestion_method_ = method; }
193 Method suggestion_method() { return suggestion_method_; }
194
195 // Builds the image URL from the Template search URL if no image URL has been
196 // set.
197 void DeriveImageFromURL() {
198 if (derive_image_from_url_ &&
199 url_->GetFavIconURL().is_empty() && url_->url()) {
200 GURL url(WideToUTF8(url_->url()->url())); // More url's please...
201 url_->SetFavIconURL(TemplateURL::GenerateFaviconURL(url));
202 }
203 }
204
205 private:
206 static void InitMapping() {
207 kElementNameToElementTypeMap = new std::map<std::string,ElementType>;
208 (*kElementNameToElementTypeMap)[kURLElement] = URL;
209 (*kElementNameToElementTypeMap)[kParamElement] = PARAM;
210 (*kElementNameToElementTypeMap)[kShortNameElement] = SHORT_NAME;
211 (*kElementNameToElementTypeMap)[kDescriptionElement] = DESCRIPTION;
212 (*kElementNameToElementTypeMap)[kImageElement] = IMAGE;
213 (*kElementNameToElementTypeMap)[kOpenSearchDescriptionElement] =
214 OPEN_SEARCH_DESCRIPTION;
215 (*kElementNameToElementTypeMap)[kFirefoxSearchDescriptionElement] =
216 OPEN_SEARCH_DESCRIPTION;
217 (*kElementNameToElementTypeMap)[kLanguageElement] =
218 LANGUAGE;
219 (*kElementNameToElementTypeMap)[kInputEncodingElement] =
220 INPUT_ENCODING;
221 }
222
223 // Key is UTF8 encoded.
224 static std::map<std::string,ElementType>* kElementNameToElementTypeMap;
225 // TemplateURL supplied to Read method. It's owned by the caller, so we
226 // don't need to free it.
227 TemplateURL* url_;
228 std::vector<ElementType> elements_;
229 scoped_ptr<TemplateURL::ImageRef> current_image_;
230
231 // Character content for the current element.
232 std::wstring string_;
233
234 TemplateURLParser::ParameterFilter* parameter_filter_;
235
236 // The list of parameters parsed in the Param nodes of a Url node.
237 std::vector<Param> extra_params_;
238
239 // The HTTP methods used.
240 Method method_;
241 Method suggestion_method_;
242
243 // If true, we are currently parsing a suggest URL, otherwise it is an HTML
244 // search. Note that we don't need a stack as Url nodes cannot be nested.
245 bool is_suggest_url_;
246
247 // Whether we should derive the image from the URL (when images are data
248 // URLs).
249 bool derive_image_from_url_;
250
251 DISALLOW_EVIL_CONSTRUCTORS(ParsingContext);
252 };
253
254 //static
255 std::map<std::string,ParsingContext::ElementType>*
256 ParsingContext::kElementNameToElementTypeMap = NULL;
257
258 std::wstring XMLCharToWide(const xmlChar* value) {
259 return UTF8ToWide(std::string((const char*)value));
260 }
261
262 std::wstring XMLCharToWide(const xmlChar* value, int length) {
263 return UTF8ToWide(std::string((const char*)value, length));
264 }
265
266 std::string XMLCharToString(const xmlChar* value) {
267 return std::string((const char*)value);
268 }
269
270 // Returns true if input_encoding contains a valid input encoding string. This
271 // doesn't verify that we have a valid encoding for the string, just that the
272 // string contains characters that constitute a valid input encoding.
273 bool IsValidEncodingString(const std::string& input_encoding) {
274 if (input_encoding.empty())
275 return false;
276
277 if (!IsAsciiAlpha(input_encoding[0]))
278 return false;
279
280 for (size_t i = 1, max = input_encoding.size(); i < max; ++i) {
281 char c = input_encoding[i];
282 if (!IsAsciiAlpha(c) && !IsAsciiDigit(c) && c != '.' && c != '_' &&
283 c != '-') {
284 return false;
285 }
286 }
287 return true;
288 }
289
290 void ParseURL(const xmlChar** atts, ParsingContext* context) {
291 if (!atts)
292 return;
293
294 TemplateURL* turl = context->template_url();
295 const xmlChar** attributes = atts;
296 std::wstring template_url;
297 bool is_post = false;
298 bool is_html_url = false;
299 bool is_suggest_url = false;
300 int index_offset = 1;
301 int page_offset = 1;
302
303 while (*attributes) {
304 std::string name(XMLCharToString(*attributes));
305 const xmlChar* value = attributes[1];
306 if (name == kURLTypeAttribute) {
307 std::string type = XMLCharToString(value);
308 is_html_url = (type == kHTMLType);
309 is_suggest_url = (type == kSuggestionType);
310 } else if (name == kURLTemplateAttribute) {
311 template_url = XMLCharToWide(value);
312 } else if (name == kURLIndexOffsetAttribute) {
313 index_offset = std::max(1, StringToInt(XMLCharToWide(value)));
314 } else if (name == kURLPageOffsetAttribute) {
315 page_offset = std::max(1, StringToInt(XMLCharToWide(value)));
316 } else if (name == kParamMethodAttribute) {
317 is_post = LowerCaseEqualsASCII(XMLCharToString(value), "post");
318 }
319 attributes += 2;
320 }
321 if (is_html_url) {
322 turl->SetURL(template_url, index_offset, page_offset);
323 context->set_is_suggestion(false);
324 if (is_post)
325 context->set_method(ParsingContext::POST);
326 } else if (is_suggest_url) {
327 turl->SetSuggestionsURL(template_url, index_offset, page_offset);
328 context->set_is_suggestion(true);
329 if (is_post)
330 context->set_suggestion_method(ParsingContext::POST);
331 }
332 }
333
334 void ParseImage(const xmlChar** atts, ParsingContext* context) {
335 if (!atts)
336 return;
337
338 const xmlChar** attributes = atts;
339 int width = 0;
340 int height = 0;
341 std::wstring type;
342 while (*attributes) {
343 std::string name(XMLCharToString(*attributes));
344 const xmlChar* value = attributes[1];
345 if (name == kImageTypeAttribute) {
346 type = XMLCharToWide(value);
347 } else if (name == kImageWidthAttribute) {
348 width = StringToInt(XMLCharToWide(value));
349 } else if (name == kImageHeightAttribute) {
350 height = StringToInt(XMLCharToWide(value));
351 }
352 attributes += 2;
353 }
354 if (width > 0 && height > 0 && !type.empty()) {
355 // Valid Image URL.
356 context->AddImageRef(type, width, height);
357 }
358 }
359
360 void ParseParam(const xmlChar** atts, ParsingContext* context) {
361 if (!atts)
362 return;
363
364 const xmlChar** attributes = atts;
365 std::wstring type;
366 std::string key, value;
367 while (*attributes) {
368 std::string name(XMLCharToString(*attributes));
369 const xmlChar* val = attributes[1];
370 if (name == kParamNameAttribute) {
371 key = XMLCharToString(val);
372 } else if (name == kParamValueAttribute) {
373 value = XMLCharToString(val);
374 }
375 attributes += 2;
376 }
377 if (!key.empty())
378 context->AddExtraParams(key, value);
379 }
380
// Appends one parameter to the query string |query|. A '&' separator is added
// first when |query| already has content. The parameter is written as
// "key=value", or as just "value" when |key| is empty.
static void AppendParamToQuery(const std::string& key,
                               const std::string& value,
                               std::string* query) {
  std::string& out = *query;
  if (!out.empty())
    out += "&";
  if (!key.empty()) {
    out += key;
    out += "=";
  }
  out += value;
}
392
393 void ProcessURLParams(ParsingContext* context) {
394 TemplateURL* t_url = context->template_url();
395 const TemplateURLRef* t_url_ref =
396 context->is_suggestion() ? t_url->suggestions_url() :
397 t_url->url();
398 if (!t_url_ref)
399 return;
400
401 if (!context->parameter_filter() && context->extra_params().empty())
402 return;
403
404 GURL url(WideToUTF8(t_url_ref->url()));
405 // If there is a parameter filter, parse the existing URL and remove any
406 // unwanted parameter.
407 TemplateURLParser::ParameterFilter* filter = context->parameter_filter();
408 std::string new_query;
409 bool modified = false;
410 if (filter) {
411 url_parse::Component query = url.parsed_for_possibly_invalid_spec().query;
412 url_parse::Component key, value;
413 const char* url_spec = url.spec().c_str();
414 while (url_parse::ExtractQueryKeyValue(url_spec, &query, &key, &value)) {
415 std::string key_str(url_spec, key.begin, key.len);
416 std::string value_str(url_spec, value.begin, value.len);
417 if (filter->KeepParameter(key_str, value_str)) {
418 AppendParamToQuery(key_str, value_str, &new_query);
419 } else {
420 modified = true;
421 }
422 }
423 }
424 if (!modified)
425 new_query = url.query();
426
427 // Add the extra parameters if any.
428 const std::vector<ParsingContext::Param>& params = context->extra_params();
429 if (!params.empty()) {
430 modified = true;
431 std::vector<ParsingContext::Param>::const_iterator iter;
432 for (iter = params.begin(); iter != params.end(); ++iter)
433 AppendParamToQuery(iter->first, iter->second, &new_query);
434 }
435
436 if (modified) {
437 GURL::Replacements repl;
438 repl.SetQueryStr(new_query);
439 url = url.ReplaceComponents(repl);
440 if (context->is_suggestion()) {
441 t_url->SetSuggestionsURL(UTF8ToWide(url.spec()),
442 t_url_ref->index_offset(),
443 t_url_ref->page_offset());
444 } else {
445 t_url->SetURL(UTF8ToWide(url.spec()),
446 t_url_ref->index_offset(),
447 t_url_ref->page_offset());
448 }
449 }
450 }
451
452 void StartElementImpl(void *ctx, const xmlChar *name, const xmlChar **atts) {
453 ParsingContext* context = reinterpret_cast<ParsingContext*>(ctx);
454 std::string node_name((const char*)name);
455 PruneNamespace(&node_name);
456 context->PushElement(node_name);
457 switch (context->GetKnownType()) {
458 case ParsingContext::URL:
459 context->ResetExtraParams();
460 ParseURL(atts, context);
461 break;
462 case ParsingContext::IMAGE:
463 ParseImage(atts, context);
464 break;
465 case ParsingContext::PARAM:
466 ParseParam(atts, context);
467 break;
468 default:
469 break;
470 }
471 context->ResetString();
472 }
473
474 void EndElementImpl(void *ctx, const xmlChar *name) {
475 ParsingContext* context = reinterpret_cast<ParsingContext*>(ctx);
476 switch (context->GetKnownType()) {
477 case ParsingContext::SHORT_NAME:
478 context->template_url()->set_short_name(context->GetString());
479 break;
480 case ParsingContext::DESCRIPTION:
481 context->template_url()->set_description(context->GetString());
482 break;
483 case ParsingContext::IMAGE: {
484 GURL image_url(WideToUTF8(context->GetString()));
485 if (image_url.SchemeIs("data")) {
486 // TODO (jcampan): bug 1169256: when dealing with data URL, we need to
487 // decode the data URL in the renderer. For now, we'll just point to the
488 // fav icon from the URL.
489 context->set_derive_image_from_url(true);
490 } else {
491 context->SetImageURL(context->GetString());
492 }
493 context->EndImage();
494 break;
495 }
496 case ParsingContext::LANGUAGE:
497 context->template_url()->add_language(context->GetString());
498 break;
499 case ParsingContext::INPUT_ENCODING: {
500 std::string input_encoding = WideToASCII(context->GetString());
501 if (IsValidEncodingString(input_encoding))
502 context->template_url()->add_input_encoding(input_encoding);
503 break;
504 }
505 case ParsingContext::URL:
506 ProcessURLParams(context);
507 break;
508 default:
509 break;
510 }
511 context->ResetString();
512 context->PopElement();
513 }
514
515 void CharactersImpl(void *ctx, const xmlChar *ch, int len) {
516 ParsingContext* context = reinterpret_cast<ParsingContext*>(ctx);
517 context->AppendString(XMLCharToWide(ch, len));
518 }
519
520 // Returns true if the ref is null, or the url wrapped by ref is
521 // valid with a spec of http/https.
522 bool IsHTTPRef(const TemplateURLRef* ref) {
523 if (ref == NULL)
524 return true;
525 GURL url(WideToUTF8(ref->url()));
526 return (url.is_valid() && (url.SchemeIs("http") || url.SchemeIs("https")));
527 }
528
529 // Returns true if the TemplateURL is legal. A legal TemplateURL is one
530 // where all URLs have a spec of http/https.
531 bool IsLegal(TemplateURL* url) {
532 if (!IsHTTPRef(url->url()) || !IsHTTPRef(url->suggestions_url()))
533 return false;
534 // Make sure all the image refs are legal.
535 const std::vector<TemplateURL::ImageRef>& image_refs = url->image_refs();
536 for (size_t i = 0; i < image_refs.size(); i++) {
537 GURL image_url(image_refs[i].url);
538 if (!image_url.is_valid() ||
539 !(image_url.SchemeIs("http") || image_url.SchemeIs("https"))) {
540 return false;
541 }
542 }
543 return true;
544 }
545
546 } // namespace
547
548 // static
549 bool TemplateURLParser::Parse(const unsigned char* data, size_t length,
550 TemplateURLParser::ParameterFilter* param_filter,
551 TemplateURL* url) {
552 DCHECK(url);
553 // xmlSubstituteEntitiesDefault(1) makes it so that &amp; isn't mapped to
554 // &#38; . Unfortunately xmlSubstituteEntitiesDefault effects global state.
555 // If this becomes problematic we'll need to provide our own entity
556 // type for &amp;, or strip out &#34; by hand after parsing.
557 int last_sub_entities_value = xmlSubstituteEntitiesDefault(1);
558 ParsingContext context(param_filter, url);
559 xmlSAXHandler sax_handler;
560 memset(&sax_handler, 0, sizeof(sax_handler));
561 sax_handler.startElement = &StartElementImpl;
562 sax_handler.endElement = &EndElementImpl;
563 sax_handler.characters = &CharactersImpl;
564 xmlSAXUserParseMemory(&sax_handler, &context,
565 reinterpret_cast<const char*>(data),
566 static_cast<int>(length));
567 xmlSubstituteEntitiesDefault(last_sub_entities_value);
568 // If the image was a data URL, use the favicon from the search URL instead.
569 // (see TODO inEndElementImpl()).
570 context.DeriveImageFromURL();
571
572 // TODO(jcampan): http://b/issue?id=1196285 we do not support search engines
573 // that use POST yet.
574 if (context.method() == ParsingContext::POST)
575 return false;
576 if (context.suggestion_method() == ParsingContext::POST)
577 url->SetSuggestionsURL(L"", 0, 0);
578
579 if (!url->short_name().empty() && !url->description().empty()) {
580 // So far so good, make sure the urls are http.
581 return IsLegal(url);
582 }
583 return false;
584 }
585
586
OLDNEW
« no previous file with comments | « chrome/browser/template_url_parser.h ('k') | chrome/browser/template_url_parser_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698