Index: chrome_frame/html_utils.h |
=================================================================== |
--- chrome_frame/html_utils.h (revision 0) |
+++ chrome_frame/html_utils.h (revision 0) |
@@ -0,0 +1,120 @@ |
+// Copyright (c) 2009 The Chromium Authors. All rights reserved. |
+// Use of this source code is governed by a BSD-style license that can be |
+// found in the LICENSE file. |
+ |
+// This file defines utility functions for working with html. |
+ |
+#ifndef CHROME_FRAME_HTML_UTILS_H_ |
+#define CHROME_FRAME_HTML_UTILS_H_ |
+ |
+#include <string> |
+#include <vector> |
+ |
+#include "base/basictypes.h" |
+#include "testing/gtest/include/gtest/gtest_prod.h" |
+ |
+// Forward declarations |
+class HtmlUtilUnittest; |
+ |
+// |
+// Class designed to take a string of HTML and extract from it named |
+// attribute values from named tags. |
+// |
+// Caveat: this class currently doesn't handle multi-word UTF-16 encoded |
+// characters. Doesn't handle implies that any data following such a |
+// character could possibly be misinterpreted. |
+// |
+class HTMLScanner { |
+ public: |
+ typedef std::wstring::const_iterator StrPos; |
+ |
+ // Structure maintaining const_iterators into html_string_. |
+ class StringRange { |
+ friend class HTMLScanner; |
+ public: |
+ StringRange(); |
+ StringRange(StrPos start, StrPos end); |
+ |
+ bool LowerCaseEqualsASCII(const char* other) const; |
+ bool Equals(const wchar_t* other) const; |
+ |
+ // Copies the data described by StringRange into destination. |
+ std::wstring Copy() const; |
+ |
+ // If this StringRange represents a tag, this method extracts the name of |
+ // the tag and sticks it in tag_name. |
+ // Returns true if the tag name was successfully extracted. |
+ // Returns false if this string doesn't look like a valid tag. |
+ bool GetTagName(std::wstring* tag_name) const; |
+ |
+ // From a given string range, uses a string tokenizer to extract the value |
+ // of the named attribute if a simple scan finds that the attribute name is |
+ // present. |
+ // |
+ // Returns true if the named attribute can be located and it has a value |
+ // which has been placed in attribute_value. |
+ // |
+ // Note that the attribute value is unquoted here as well, so that |
+ // GetTagAttribute(*<foo bar="baz">*, L"bar", *out_value*) will stick |
+ // 'bar' in out_value and not '"bar"'. |
+ // |
+ // Returns false if the named attribute is not present in the tag or if it |
+ // did not have a value. |
+ // |
+ bool GetTagAttribute(const wchar_t* attribute_name, |
+ StringRange* attribute_value) const; |
+ |
+ // Unquotes a StringRange by removing a matching pair of either ' or " |
+ // characters from the beginning and end of the string if present. |
+ // Returns true if string was modified, false otherwise. |
+ bool UnQuote(); |
+ private: |
+ StrPos start_; |
+ StrPos end_; |
+ }; |
+ |
+ typedef std::vector<StringRange> StringRangeList; |
+ |
+ // html_string must be a null-terminated string containing the HTML |
+ // to be scanned. |
+ explicit HTMLScanner(const wchar_t* html_string); |
+ |
+ // Returns the set of ranges denoting HTML tags that match the given name. |
+ // If stop_tag_name is given, then as soon as a tag with this name is |
+ // encountered this method will return. |
+ void GetTagsByName(const wchar_t* name, StringRangeList* tag_list, |
+ const wchar_t* stop_tag_name); |
+ |
+ private: |
+ friend class HtmlUtilUnittest; |
+ FRIEND_TEST(HtmlUtilUnittest, BasicTest); |
+ |
+ // Given html_string which represents the remaining html range, this method |
+ // returns the next tag in tag and advances html_string to one character after |
+ // the end of tag. This method is intended to be called repeatedly to extract |
+ // all of the tags in sequence. |
+ // |
+ // Returns true if another tag was found and 'tag' was populated with a valid |
+ // range. |
+ // Returns false if we have reached the end of the html data. |
+ bool NextTag(StringRange* html_string, StringRange* tag); |
+ |
+ // Returns true if c can be found in quotes_, false otherwise |
+ bool IsQuote(wchar_t c); |
+ |
+ // Returns true if pos refers to the last character in an HTML comment in a |
+ // string described by html_string, false otherwise. |
+ // For example with html_string describing <!-- foo> -->, pos must refer to |
+ // the last > for this method to return true. |
+ bool IsHTMLCommentClose(StringRange* html_string, StrPos pos); |
+ |
+ // We store a (CollapsedWhitespace'd) copy of the html data. |
+ const std::wstring html_string_; |
+ |
+ // Store the string of quote characters to avoid repeated construction. |
+ const std::wstring quotes_; |
+ |
+ DISALLOW_COPY_AND_ASSIGN(HTMLScanner); |
+}; |
+ |
+#endif // CHROME_FRAME_HTML_UTILS_H_ |