| Index: chrome_frame/html_utils.h | 
| =================================================================== | 
| --- chrome_frame/html_utils.h	(revision 0) | 
| +++ chrome_frame/html_utils.h	(revision 0) | 
| @@ -0,0 +1,120 @@ | 
| +// Copyright (c) 2009 The Chromium Authors. All rights reserved. | 
| +// Use of this source code is governed by a BSD-style license that can be | 
| +// found in the LICENSE file. | 
| + | 
| +// This file defines utility functions for working with html. | 
| + | 
| +#ifndef CHROME_FRAME_HTML_UTILS_H_ | 
| +#define CHROME_FRAME_HTML_UTILS_H_ | 
| + | 
| +#include <string> | 
| +#include <vector> | 
| + | 
| +#include "base/basictypes.h" | 
| +#include "testing/gtest/include/gtest/gtest_prod.h" | 
| + | 
| +// Forward declarations | 
| +class HtmlUtilUnittest; | 
| + | 
| +// | 
| +// Class designed to take a string of HTML and extract from it named | 
| +// attribute values from named tags. | 
| +// | 
| +// Caveat: this class currently doesn't handle multi-word UTF-16 encoded | 
| +// characters. "Doesn't handle" means that any data following such a | 
| +// character could possibly be misinterpreted. | 
| +// | 
| +class HTMLScanner { | 
| + public: | 
| +  typedef std::wstring::const_iterator StrPos; | 
| + | 
| +  // Pair of const_iterators (start_, end_) delimiting a span of html_string_. | 
| +  class StringRange { | 
| +    friend class HTMLScanner; | 
| +   public: | 
| +    StringRange(); | 
| +    StringRange(StrPos start, StrPos end); | 
| + | 
| +    bool LowerCaseEqualsASCII(const char* other) const; | 
| +    bool Equals(const wchar_t* other) const; | 
| + | 
| +    // Copies the data described by StringRange into destination. | 
| +    std::wstring Copy() const; | 
| + | 
| +    // If this StringRange represents a tag, this method extracts the name of | 
| +    // the tag and sticks it in tag_name. | 
| +    // Returns true if the tag name was successfully extracted. | 
| +    // Returns false if this string doesn't look like a valid tag. | 
| +    bool GetTagName(std::wstring* tag_name) const; | 
| + | 
| +    // From a given string range, uses a string tokenizer to extract the value | 
| +    // of the named attribute if a simple scan finds that the attribute name is | 
| +    // present. | 
| +    // | 
| +    // Returns true if the named attribute can be located and it has a value | 
| +    // which has been placed in attribute_value. | 
| +    // | 
| +    // Note that the attribute value is unquoted here as well, so that | 
| +    // GetTagAttribute(*<foo bar="baz">*, L"bar", *out_value*) will stick | 
| +    // 'baz' in out_value and not '"baz"'. | 
| +    // | 
| +    // Returns false if the named attribute is not present in the tag or if it | 
| +    // did not have a value. | 
| +    // | 
| +    bool GetTagAttribute(const wchar_t* attribute_name, | 
| +                         StringRange* attribute_value) const; | 
| + | 
| +    // Unquotes a StringRange by removing a matching pair of either ' or " | 
| +    // characters from the beginning and end of the string if present. | 
| +    // Returns true if string was modified, false otherwise. | 
| +    bool UnQuote(); | 
| +   private: | 
| +     StrPos start_; | 
| +     StrPos end_; | 
| +  }; | 
| + | 
| +  typedef std::vector<StringRange> StringRangeList; | 
| + | 
| +  // html_string must be a null-terminated string containing the HTML | 
| +  // to be scanned. | 
| +  explicit HTMLScanner(const wchar_t* html_string); | 
| + | 
| +  // Returns the set of ranges denoting HTML tags that match the given name. | 
| +  // If stop_tag_name is given, then as soon as a tag with this name is | 
| +  // encountered this method will return. | 
| +  void GetTagsByName(const wchar_t* name, StringRangeList* tag_list, | 
| +                     const wchar_t* stop_tag_name); | 
| + | 
| + private: | 
| +  friend class HtmlUtilUnittest; | 
| +  FRIEND_TEST(HtmlUtilUnittest, BasicTest); | 
| + | 
| +  // Given html_string which represents the remaining html range, this method | 
| +  // returns the next tag in tag and advances html_string to one character after | 
| +  // the end of tag. This method is intended to be called repeatedly to extract | 
| +  // all of the tags in sequence. | 
| +  // | 
| +  // Returns true if another tag was found and 'tag' was populated with a valid | 
| +  // range. | 
| +  // Returns false if we have reached the end of the html data. | 
| +  bool NextTag(StringRange* html_string, StringRange* tag); | 
| + | 
| +  // Returns true if c can be found in quotes_, false otherwise. | 
| +  bool IsQuote(wchar_t c); | 
| + | 
| +  // Returns true if pos refers to the last character in an HTML comment in a | 
| +  // string described by html_string, false otherwise. | 
| +  // For example with html_string describing <!-- foo> -->, pos must refer to | 
| +  // the last > for this method to return true. | 
| +  bool IsHTMLCommentClose(StringRange* html_string, StrPos pos); | 
| + | 
| +  // We store a (CollapsedWhitespace'd) copy of the html data. | 
| +  const std::wstring html_string_; | 
| + | 
| +  // Store the string of quote characters to avoid repeated construction. | 
| +  const std::wstring quotes_; | 
| + | 
| +  DISALLOW_COPY_AND_ASSIGN(HTMLScanner); | 
| +}; | 
| + | 
| +#endif  // CHROME_FRAME_HTML_UTILS_H_ | 
|  |