android_webview/native/address_parser.cc - Issue 2863233002: [WebView] Move files from native to browser

Side by Side Diff: android_webview/native/address_parser.cc

Issue 2863233002: [WebView] Move files from native to browser (Closed)

Patch Set: Created 3 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
	(Empty)
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.

4

5 #include "android_webview/native/address_parser.h"

6

7 #include "android_webview/native/address_parser_internal.h"

8 #include "base/logging.h"

9 #include "base/strings/string_util.h"

10

11 namespace {

12

13 // Minimum number of words in an address after the house number

14 // before a state is expected to be found.

15 // A value too high can miss short addresses.

16 const size_t kMinAddressWords = 3;

17

18 // Maximum number of words allowed in an address between the house number

19 // and the state, both not included.

20 const size_t kMaxAddressWords = 12;

21

22 // Maximum number of lines allowed in an address between the house number

23 // and the state, both not included.

24 const size_t kMaxAddressLines = 5;

25

26 // Maximum length allowed for any address word between the house number

27 // and the state, both not included.

28 const size_t kMaxAddressNameWordLength = 25;

29

30 // Maximum number of words after the house number in which the location name

31 // should be found.

32 const size_t kMaxLocationNameDistance = 4;

33

34 // Additional characters used as new line delimiters.

35 const base::char16 kNewlineDelimiters[] = {

36 '\n', ',', '*',

37 0x2022, // Unicode bullet

38 0,

39 };

40

41 } // anonymous namespace

42

43 namespace android_webview {

44

45 namespace address_parser {

46

47 using namespace internal;

48

49 bool FindAddress(const base::string16& text, base::string16* address) {

50 size_t start, end;

51 if (FindAddress(text.begin(), text.end(), &start, &end)) {

52 size_t len = end >= start ? end - start : 0;

53 address->assign(text.substr(start, len));

54 return true;

55 }

56 return false;

57 }

58

59 bool FindAddress(const base::string16::const_iterator& begin,

60 const base::string16::const_iterator& end,

61 size_t* start_pos,

62 size_t* end_pos) {

63 HouseNumberParser house_number_parser;

64

65 // Keep going through the input string until a potential house number is

66 // detected. Start tokenizing the following words to find a valid

67 // street name within a word range. Then, find a state name followed

68 // by a valid zip code for that state. Also keep a look for any other

69 // possible house numbers to continue from in case of no match and for

70 // state names not followed by a zip code (e.g. New York, NY 10000).

71 const base::string16 newline_delimiters = kNewlineDelimiters;

72 const base::string16 delimiters = base::kWhitespaceUTF16 + newline_delimiters;

73 for (base::string16::const_iterator it = begin; it != end;) {

74 Word house_number;

75 if (!house_number_parser.Parse(it, end, &house_number))

76 return false;

77

78 String16Tokenizer tokenizer(house_number.end, end, delimiters);

79 tokenizer.set_options(String16Tokenizer::RETURN_DELIMS);

80

81 WordList words;

82 words.push_back(house_number);

83

84 bool found_location_name = false;

85 bool continue_on_house_number = true;

86 bool consecutive_house_numbers = true;

87 size_t next_house_number_word = 0;

88 size_t num_lines = 1;

89

90 // Don't include the house number in the word count.

91 size_t next_word = 1;

92 for (; next_word <= kMaxAddressWords + 1; ++next_word) {

93 // Extract a new word from the tokenizer.

94 if (next_word == words.size()) {

95 do {

96 if (!tokenizer.GetNext())

97 return false;

98

99 // Check the number of address lines.

100 if (tokenizer.token_is_delim() &&

101 newline_delimiters.find(*tokenizer.token_begin()) !=

102 base::string16::npos) {

103 ++num_lines;

104 }

105 } while (tokenizer.token_is_delim());

106

107 if (num_lines > kMaxAddressLines)

108 break;

109

110 words.push_back(Word(tokenizer.token_begin(), tokenizer.token_end()));

111 }

112

113 // Check the word length. If too long, don't try to continue from

114 // the next house number as no address can hold this word.

115 const Word& current_word = words[next_word];

116 DCHECK_GT(std::distance(current_word.begin, current_word.end), 0);

117 size_t current_word_length =

118 std::distance(current_word.begin, current_word.end);

119 if (current_word_length > kMaxAddressNameWordLength) {

120 continue_on_house_number = false;

121 break;

122 }

123

124 // Check if the new word is a valid house number.

125 if (house_number_parser.Parse(current_word.begin, current_word.end,

126 NULL)) {

127 // Increase the number of consecutive house numbers since the beginning.

128 if (consecutive_house_numbers) {

129 // Check if there is a new line between consecutive house numbers.

130 // This avoids false positives of the form "Cafe 21\n 750 Fifth Ave.."

131 if (num_lines > 1) {

132 next_house_number_word = next_word;

133 break;

134 }

135 }

136

137 // Keep the next candidate to resume parsing from in case of failure.

138 if (next_house_number_word == 0) {

139 next_house_number_word = next_word;

140 continue;

141 }

142 } else {

143 consecutive_house_numbers = false;

144 }

145

146 // Look for location names in the words after the house number.

147 // A range limitation is introduced to avoid matching

148 // anything that starts with a number before a legitimate address.

149 if (next_word <= kMaxLocationNameDistance &&

150 IsValidLocationName(current_word)) {

151 found_location_name = true;

152 continue;

153 }

154

155 // Don't count the house number.

156 if (next_word > kMinAddressWords) {

157 // Looking for the state is likely to add new words to the list while

158 // checking for multi-word state names.

159 size_t state_first_word = next_word;

160 size_t state_last_word, state_index;

161 if (FindStateStartingInWord(&words, state_first_word, &state_last_word,

162 &tokenizer, &state_index)) {

163 // A location name should have been found at this point.

164 if (!found_location_name)

165 break;

166

167 // Explicitly exclude "et al", as "al" is a valid state code.

168 if (current_word_length == 2 && words.size() > 2) {

169 const Word& previous_word = words[state_first_word - 1];

170 if (previous_word.end - previous_word.begin == 2 &&

171 base::LowerCaseEqualsASCII(

172 base::StringPiece16(previous_word.begin, previous_word.end),

173 "et") &&

174 base::LowerCaseEqualsASCII(

175 base::StringPiece16(current_word.begin, current_word.end),

176 "al"))

177 break;

178 }

179

180 // Extract one more word from the tokenizer if not already available.

181 size_t zip_word = state_last_word + 1;

182 if (zip_word == words.size()) {

183 do {

184 if (!tokenizer.GetNext()) {

185 // The address ends with a state name without a zip code. This

186 // is legal according to WebView#findAddress public

187 // documentation.

188 *start_pos = words[0].begin - begin;

189 *end_pos = words[state_last_word].end - begin;

190 return true;

191 }

192 } while (tokenizer.token_is_delim());

193 words.push_back(

194 Word(tokenizer.token_begin(), tokenizer.token_end()));

195 }

196

197 // Check the parsing validity and state range of the zip code.

198 next_word = state_last_word;

199 if (!IsZipValid(words[zip_word], state_index))

200 continue;

201

202 *start_pos = words[0].begin - begin;

203 *end_pos = words[zip_word].end - begin;

204 return true;

205 }

206 }

207 }

208

209 // Avoid skipping too many words because of a non-address number

210 // at the beginning of the contents to parse.

211 if (continue_on_house_number && next_house_number_word > 0) {

212 it = words[next_house_number_word].begin;

213 } else {

214 DCHECK(!words.empty());

215 next_word = std::min(next_word, words.size() - 1);

216 it = words[next_word].end;

217 }

218 }

219

220 return false;

221 }

222

223 } // namespace address_parser

224

225 } // namespace android_webview

OLD	NEW

« android_webview/BUILD.gn ('K') | « android_webview/native/address_parser.h ('k') | android_webview/native/address_parser_internal.h » ('j') | no next file with comments »