chrome_frame/html_utils.cc - Issue 218019: Initial import of the Chrome Frame codebase. Integration in chrome.gyp coming...

Side by Side Diff: chrome_frame/html_utils.cc

Issue 218019: Initial import of the Chrome Frame codebase. Integration in chrome.gyp coming... (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src/

Patch Set: Created 11 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
(Empty)
	1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4 //

	5 #include "chrome_frame/html_utils.h"

	6

	7 #include "base/string_util.h"

	8 #include "base/string_tokenizer.h"

	9

	// Characters treated as quote delimiters. HTMLScanner's constructor copies
	// this string into its quotes_ member. Declared as a const array rather than
	// a mutable pointer so the constant cannot be re-seated at runtime.
	const wchar_t kQuotes[] = L"\"'";

	11

	12 HTMLScanner::StringRange::StringRange() {

	13 }

	14

	15 HTMLScanner::StringRange::StringRange(StrPos start, StrPos end)

	16 : start_(start), end_(end) {

	17 }

	18

	19 bool HTMLScanner::StringRange::LowerCaseEqualsASCII(const char* other) const {

	20 return ::LowerCaseEqualsASCII(start_, end_, other);

	21 }

	22

	23 bool HTMLScanner::StringRange::Equals(const wchar_t* other) const {

	24 int ret = wcsncmp(&start_[0], other, end_ - start_);

	25 if (ret == 0)

	26 ret = (other[end_ - start_] == L'\0') ? 0 : -1;

	27 return ret == 0;

	28 }

	29

	30 std::wstring HTMLScanner::StringRange::Copy() const {

	31 return std::wstring(start_, end_);

	32 }

	33

	34 bool HTMLScanner::StringRange::GetTagName(std::wstring* tag_name) const {

	35 if (*start_ != L'<') {

	36 LOG(ERROR) << "Badly formatted tag found";

	37 return false;

	38 }

	39

	40 StrPos name_start = start_;

	41 name_start++;

	42 while (name_start < end_ && IsWhitespace(*name_start))

	43 name_start++;

	44

	45 if (name_start >= end_) {

	46 // We seem to have a degenerate tag (i.e. < >). Return false here.

	47 return false;

	48 }

	49

	50 StrPos name_end = name_start + 1;

	51 while (name_end < end_ && !IsWhitespace(*name_end))

	52 name_end++;

	53

	54 if (name_end > end_) {

	55 // This looks like an improperly formatted tab ('<foo'). Return false here.

	56 return false;

	57 }

	58

	59 tag_name->assign(name_start, name_end);

	60 return true;

	61 }

	62

	63

	64 bool HTMLScanner::StringRange::GetTagAttribute(const wchar_t* attribute_name,

	65 StringRange* attribute_value) const {

	66 if (NULL == attribute_name \|\| NULL == attribute_value) {

	67 NOTREACHED();

	68 return false;

	69 }

	70

	71 // Use this so we can use the convenience method LowerCaseEqualsASCII()

	72 // from string_util.h.

	73 std::string search_name_ascii(WideToASCII(attribute_name));

	74

	75 WStringTokenizer tokenizer(start_, end_, L" =/");

	76 tokenizer.set_options(WStringTokenizer::RETURN_DELIMS);

	77

	78 // Set up the quote chars so that we get quoted attribute values as single

	79 // tokens.

	80 tokenizer.set_quote_chars(L"\"'");

	81

	82 const bool PARSE_STATE_NAME = true;

	83 const bool PARSE_STATE_VALUE = false;

	84 bool parse_state = PARSE_STATE_NAME;

	85

	86 // Used to skip the first token, which is the tag name.

	87 bool first_token_skipped = false;

	88

	89 // This is set during a loop iteration in which an '=' sign was spotted.

	90 // It is used to filter out degenerate tags such as:

	91 // <meta foo==bar>

	92 bool last_token_was_delim = false;

	93

	94 // Set this if the attribute name has been found that we might then

	95 // pick up the value in the next loop iteration.

	96 bool attribute_name_found = false;

	97

	98 while (tokenizer.GetNext()) {

	99 // If we have a whitespace delimiter, just keep going. Cases of this should

	100 // be reduced by the CollapseWhitespace call. If we have an '=' character,

	101 // we update our state and reiterate.

	102 if (tokenizer.token_is_delim()) {

	103 if (*tokenizer.token_begin() == L'=') {

	104 if (last_token_was_delim) {

	105 // Looks like we have a badly formed tag, just stop parsing now.

	106 return false;

	107 }

	108 parse_state = !parse_state;

	109 last_token_was_delim = true;

	110 }

	111 continue;

	112 }

	113

	114 last_token_was_delim = false;

	115

	116 // The first non-delimiter token is the tag name, which we don't want.

	117 if (!first_token_skipped) {

	118 first_token_skipped = true;

	119 continue;

	120 }

	121

	122 if (PARSE_STATE_NAME == parse_state) {

	123 // We have a tag name, check to see if it matches our target name:

	124 if (::LowerCaseEqualsASCII(tokenizer.token_begin(), tokenizer.token_end(),

	125 search_name_ascii.c_str())) {

	126 attribute_name_found = true;

	127 continue;

	128 }

	129 } else if (PARSE_STATE_VALUE == parse_state && attribute_name_found) {

	130 attribute_value->start_ = tokenizer.token_begin();

	131 attribute_value->end_ = tokenizer.token_end();

	132

	133 // Unquote the attribute value if need be.

	134 attribute_value->UnQuote();

	135

	136 return true;

	137 } else if (PARSE_STATE_VALUE == parse_state) {

	138 // If we haven't found the attribute name we want yet, ignore this token

	139 // and go back to looking for our name.

	140 parse_state = PARSE_STATE_NAME;

	141 }

	142 }

	143

	144 return false;

	145 }

	146

	147 bool HTMLScanner::StringRange::UnQuote() {

	148 if (start_ + 2 > end_) {

	149 // String's too short to be quoted, bail.

	150 return false;

	151 }

	152

	153 if ((start_ == L'\'' && (end_ - 1) == L'\'') \|\|

	154 (start_ == L'"' && (end_ - 1) == L'"')) {

	155 start_ = start_ + 1;

	156 end_ = end_ - 1;

	157 return true;

	158 }

	159

	160 return false;

	161 }

	162

	163 HTMLScanner::HTMLScanner(const wchar_t* html_string)

	164 : html_string_(CollapseWhitespace(html_string, true)),

	165 quotes_(kQuotes) {

	166 }

	167

	168 void HTMLScanner::GetTagsByName(const wchar_t* name, StringRangeList* tag_list,

	169 const wchar_t* stop_tag) {

	170 DCHECK(NULL != name);

	171 DCHECK(NULL != tag_list);

	172 DCHECK(NULL != stop_tag);

	173

	174 StringRange remaining_html(html_string_.begin(), html_string_.end());

	175

	176 std::wstring search_name(name);

	177 TrimWhitespace(search_name, TRIM_ALL, &search_name);

	178

	179 // Use this so we can use the convenience method LowerCaseEqualsASCII()

	180 // from string_util.h.

	181 std::string search_name_ascii(WideToASCII(search_name));

	182 std::string stop_tag_ascii(WideToASCII(stop_tag));

	183

	184 StringRange current_tag;

	185 std::wstring current_name;

	186 while (NextTag(&remaining_html, &current_tag)) {

	187 if (current_tag.GetTagName(&current_name)) {

	188 if (LowerCaseEqualsASCII(current_name, search_name_ascii.c_str())) {

	189 tag_list->push_back(current_tag);

	190 } else if (LowerCaseEqualsASCII(current_name, stop_tag_ascii.c_str())) {

	191 // We hit the stop tag so it's time to go home.

	192 break;

	193 }

	194 }

	195 }

	196 }

	197

	// Bookkeeping used by HTMLScanner::NextTag() while it scans a non-comment tag
	// for its closing '>'.
	struct ScanState {
	  // True while the scan position is inside a quoted string.
	  bool in_quote;
	  // True when the previous character inside a quoted string was a backslash,
	  // so the current character must not be treated as a closing quote.
	  bool in_escape;
	  // The character that opened the current quoted string; NextTag() only
	  // compares against it while in_quote is true.
	  wchar_t quote_char;

	  // All members are initialized so a fresh state never holds an
	  // indeterminate quote_char.
	  ScanState() : in_quote(false), in_escape(false), quote_char(L'\0') {}
	};

	204

	205 bool HTMLScanner::IsQuote(wchar_t c) {

	206 return quotes_.find(c) != std::wstring::npos;

	207 }

	208

	209 bool HTMLScanner::IsHTMLCommentClose(StringRange* html_string, StrPos pos) {

	210 if (pos < html_string->end_ && pos > html_string->start_ + 2 &&

	211 *pos == L'>') {

	212 return (pos-1) == L'-' && (pos-2) == L'-';

	213 }

	214 return false;

	215 }

	216

	217 bool HTMLScanner::NextTag(StringRange* html_string, StringRange* tag) {

	218 DCHECK(NULL != html_string);

	219 DCHECK(NULL != tag);

	220

	221 tag->start_ = html_string->start_;

	222 while (tag->start_ < html_string->end_ && *tag->start_ != L'<') {

	223 tag->start_++;

	224 }

	225

	226 // we went past the end of the string.

	227 if (tag->start_ >= html_string->end_) {

	228 return false;

	229 }

	230

	231 tag->end_ = tag->start_ + 1;

	232

	233 // Get the tag name to see if we are in an HTML comment. If we are, then

	234 // don't consider quotes. This should work for example:

	235 // <!-- foo ' --> <meta foo='bar'>

	236 std::wstring tag_name;

	237 StringRange start_range(tag->start_, html_string->end_);

	238 start_range.GetTagName(&tag_name);

	239 if (StartsWith(tag_name, L"!--", true)) {

	240 // We're inside a comment tag, keep going until we get out of it.

	241 while (tag->end_ < html_string->end_ &&

	242 !IsHTMLCommentClose(html_string, tag->end_)) {

	243 tag->end_++;

	244 }

	245 } else {

	246 // Properly handle quoted strings within non-comment tags by maintaining

	247 // some state while scanning. Specifically, we have to maintain state on

	248 // whether we are inside a string, what the string terminating character

	249 // will be and whether we are inside an escape sequence.

	250 ScanState state;

	251 while (tag->end_ < html_string->end_) {

	252 if (state.in_quote) {

	253 if (state.in_escape) {

	254 state.in_escape = false;

	255 } else if (*tag->end_ == '\\') {

	256 state.in_escape = true;

	257 } else if (*tag->end_ == state.quote_char) {

	258 state.in_quote = false;

	259 }

	260 } else {

	261 state.in_quote = IsQuote(state.quote_char = *tag->end_);

	262 }

	263

	264 if (!state.in_quote && *tag->end_ == L'>') {

	265 break;

	266 }

	267 tag->end_++;

	268 }

	269 }

	270

	271 // We hit the end_ but found no matching tag closure. Consider this an

	272 // incomplete tag and do not report it.

	273 if (tag->end_ >= html_string->end_)

	274 return false;

	275

	276 // Modify html_string to point to just beyond the end_ of the current tag.

	277 html_string->start_ = tag->end_ + 1;

	278

	279 return true;

	280 }

	281

OLD	NEW

« no previous file with comments | « chrome_frame/html_utils.h ('k') | chrome_frame/icu_stubs.cc » ('j') | no next file with comments »