OLD | NEW |
(Empty) | |
| 1 // Copyright (c) 2009 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 |
| 5 // This file defines utility functions for working with html. |
| 6 |
| 7 #ifndef CHROME_FRAME_HTML_UTILS_H_ |
| 8 #define CHROME_FRAME_HTML_UTILS_H_ |
| 9 |
| 10 #include <string> |
| 11 #include <vector> |
| 12 |
| 13 #include "base/basictypes.h" |
| 14 #include "testing/gtest/include/gtest/gtest_prod.h" |
| 15 |
| 16 // Forward declarations |
| 17 class HtmlUtilUnittest; |
| 18 |
| 19 // |
| 20 // Class designed to take a string of HTML and extract from it named |
| 21 // attribute values from named tags. |
| 22 // |
| 23 // Caveat: this class currently doesn't handle multi-word UTF-16 encoded |
| 24 // characters (surrogate pairs). As a result, any data following such a |
| 25 // character could possibly be misinterpreted. |
| 26 // |
| 27 class HTMLScanner { |
| 28 public: |
| 29 typedef std::wstring::const_iterator StrPos; |
| 30 |
| 31 // Structure maintaining const_iterators into html_string_. |
| 32 class StringRange { |
| 33 friend class HTMLScanner; |
| 34 public: |
| 35 StringRange(); |
| 36 StringRange(StrPos start, StrPos end); |
| 37 |
| 38 bool LowerCaseEqualsASCII(const char* other) const; |
| 39 bool Equals(const wchar_t* other) const; |
| 40 |
| 41 // Returns a copy of the data described by this StringRange. |
| 42 std::wstring Copy() const; |
| 43 |
| 44 // If this StringRange represents a tag, this method extracts the name of |
| 45 // the tag and sticks it in tag_name. |
| 46 // Returns true if the tag name was successfully extracted. |
| 47 // Returns false if this string doesn't look like a valid tag. |
| 48 bool GetTagName(std::wstring* tag_name) const; |
| 49 |
| 50 // From a given string range, uses a string tokenizer to extract the value |
| 51 // of the named attribute if a simple scan finds that the attribute name is |
| 52 // present. |
| 53 // |
| 54 // Returns true if the named attribute can be located and it has a value |
| 55 // which has been placed in attribute_value. |
| 56 // |
| 57 // Note that the attribute value is unquoted here as well, so that |
| 58 // GetTagAttribute(*<foo bar="baz">*, L"bar", *out_value*) will stick |
| 59 // 'baz' in out_value and not '"baz"'. |
| 60 // |
| 61 // Returns false if the named attribute is not present in the tag or if it |
| 62 // did not have a value. |
| 63 // |
| 64 bool GetTagAttribute(const wchar_t* attribute_name, |
| 65 StringRange* attribute_value) const; |
| 66 |
| 67 // Unquotes a StringRange by removing a matching pair of either ' or " |
| 68 // characters from the beginning and end of the string if present. |
| 69 // Returns true if string was modified, false otherwise. |
| 70 bool UnQuote(); |
| 71 private: |
| 72 StrPos start_; |
| 73 StrPos end_; |
| 74 }; |
| 75 |
| 76 typedef std::vector<StringRange> StringRangeList; |
| 77 |
| 78 // html_string must be a null-terminated string containing the HTML |
| 79 // to be scanned. |
| 80 explicit HTMLScanner(const wchar_t* html_string); |
| 81 |
| 82 // Returns the set of ranges denoting HTML tags that match the given name. |
| 83 // If stop_tag_name is given, then as soon as a tag with this name is |
| 84 // encountered this method will return. |
| 85 void GetTagsByName(const wchar_t* name, StringRangeList* tag_list, |
| 86 const wchar_t* stop_tag_name); |
| 87 |
| 88 private: |
| 89 friend class HtmlUtilUnittest; |
| 90 FRIEND_TEST(HtmlUtilUnittest, BasicTest); |
| 91 |
| 92 // Given html_string which represents the remaining html range, this method |
| 93 // returns the next tag in tag and advances html_string to one character after |
| 94 // the end of tag. This method is intended to be called repeatedly to extract |
| 95 // all of the tags in sequence. |
| 96 // |
| 97 // Returns true if another tag was found and 'tag' was populated with a valid |
| 98 // range. |
| 99 // Returns false if we have reached the end of the html data. |
| 100 bool NextTag(StringRange* html_string, StringRange* tag); |
| 101 |
| 102 // Returns true if c can be found in quotes_, false otherwise. |
| 103 bool IsQuote(wchar_t c); |
| 104 |
| 105 // Returns true if pos refers to the last character in an HTML comment in a |
| 106 // string described by html_string, false otherwise. |
| 107 // For example with html_string describing <!-- foo> -->, pos must refer to |
| 108 // the last > for this method to return true. |
| 109 bool IsHTMLCommentClose(StringRange* html_string, StrPos pos); |
| 110 |
| 111 // We store a (CollapsedWhitespace'd) copy of the html data. |
| 112 const std::wstring html_string_; |
| 113 |
| 114 // Store the string of quote characters to avoid repeated construction. |
| 115 const std::wstring quotes_; |
| 116 |
| 117 DISALLOW_COPY_AND_ASSIGN(HTMLScanner); |
| 118 }; |
| 119 |
| 120 #endif // CHROME_FRAME_HTML_UTILS_H_ |
OLD | NEW |