content/common/android/address_parser.cc - Issue 2803163002: Move address parser and prefixes to android_webview/.

Side by Side Diff: content/common/android/address_parser.cc

Issue 2803163002: Move address parser and prefixes to android_webview/. (Closed)

Patch Set: Bring back ContentViewStatics import Created 3 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
	(Empty)
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.

4

5 #include "content/common/android/address_parser.h"

6

7 #include "base/logging.h"

8 #include "base/strings/string_util.h"

9 #include "content/common/android/address_parser_internal.h"

10

11 namespace {

12

13 // Minimum number of words in an address after the house number

14 // before a state is expected to be found.

15 // A value too high can miss short addresses.

16 const size_t kMinAddressWords = 3;

17

18 // Maximum number of words allowed in an address between the house number

19 // and the state, both not included.

20 const size_t kMaxAddressWords = 12;

21

22 // Maximum number of lines allowed in an address between the house number

23 // and the state, both not included.

24 const size_t kMaxAddressLines = 5;

25

26 // Maximum length allowed for any address word between the house number

27 // and the state, both not included.

28 const size_t kMaxAddressNameWordLength = 25;

29

30 // Maximum number of words after the house number in which the location name

31 // should be found.

32 const size_t kMaxLocationNameDistance = 4;

33

34 // Additional characters used as new line delimiters.

35 const base::char16 kNewlineDelimiters[] = {

36 '\n',

37 ',',

38 '*',

39 0x2022, // Unicode bullet

40 0,

41 };

42

43 } // anonymous namespace

44

45 namespace content {

46

47 namespace address_parser {

48

49 using namespace internal;

50

51 bool FindAddress(const base::string16& text, base::string16* address) {

52 size_t start, end;

53 if (FindAddress(text.begin(), text.end(), &start, &end)) {

54 size_t len = end >= start ? end - start : 0;

55 address->assign(text.substr(start, len));

56 return true;

57 }

58 return false;

59 }

60

61 bool FindAddress(const base::string16::const_iterator& begin,

62 const base::string16::const_iterator& end,

63 size_t* start_pos,

64 size_t* end_pos) {

65 HouseNumberParser house_number_parser;

66

67 // Keep going through the input string until a potential house number is

68 // detected. Start tokenizing the following words to find a valid

69 // street name within a word range. Then, find a state name followed

70 // by a valid zip code for that state. Also keep a look for any other

71 // possible house numbers to continue from in case of no match and for

72 // state names not followed by a zip code (e.g. New York, NY 10000).

73 const base::string16 newline_delimiters = kNewlineDelimiters;

74 const base::string16 delimiters = base::kWhitespaceUTF16 + newline_delimiters;

75 for (base::string16::const_iterator it = begin; it != end; ) {

76 Word house_number;

77 if (!house_number_parser.Parse(it, end, &house_number))

78 return false;

79

80 String16Tokenizer tokenizer(house_number.end, end, delimiters);

81 tokenizer.set_options(String16Tokenizer::RETURN_DELIMS);

82

83 WordList words;

84 words.push_back(house_number);

85

86 bool found_location_name = false;

87 bool continue_on_house_number = true;

88 bool consecutive_house_numbers = true;

89 size_t next_house_number_word = 0;

90 size_t num_lines = 1;

91

92 // Don't include the house number in the word count.

93 size_t next_word = 1;

94 for (; next_word <= kMaxAddressWords + 1; ++next_word) {

95

96 // Extract a new word from the tokenizer.

97 if (next_word == words.size()) {

98 do {

99 if (!tokenizer.GetNext())

100 return false;

101

102 // Check the number of address lines.

103 if (tokenizer.token_is_delim() && newline_delimiters.find(

104 *tokenizer.token_begin()) != base::string16::npos) {

105 ++num_lines;

106 }

107 } while (tokenizer.token_is_delim());

108

109 if (num_lines > kMaxAddressLines)

110 break;

111

112 words.push_back(Word(tokenizer.token_begin(), tokenizer.token_end()));

113 }

114

115 // Check the word length. If too long, don't try to continue from

116 // the next house number as no address can hold this word.

117 const Word& current_word = words[next_word];

118 DCHECK_GT(std::distance(current_word.begin, current_word.end), 0);

119 size_t current_word_length = std::distance(

120 current_word.begin, current_word.end);

121 if (current_word_length > kMaxAddressNameWordLength) {

122 continue_on_house_number = false;

123 break;

124 }

125

126 // Check if the new word is a valid house number.

127 if (house_number_parser.Parse(current_word.begin, current_word.end,

128 NULL)) {

129 // Increase the number of consecutive house numbers since the beginning.

130 if (consecutive_house_numbers) {

131 // Check if there is a new line between consecutive house numbers.

132 // This avoids false positives of the form "Cafe 21\n 750 Fifth Ave.."

133 if (num_lines > 1) {

134 next_house_number_word = next_word;

135 break;

136 }

137 }

138

139 // Keep the next candidate to resume parsing from in case of failure.

140 if (next_house_number_word == 0) {

141 next_house_number_word = next_word;

142 continue;

143 }

144 } else {

145 consecutive_house_numbers = false;

146 }

147

148 // Look for location names in the words after the house number.

149 // A range limitation is introduced to avoid matching

150 // anything that starts with a number before a legitimate address.

151 if (next_word <= kMaxLocationNameDistance &&

152 IsValidLocationName(current_word)) {

153 found_location_name = true;

154 continue;

155 }

156

157 // Don't count the house number.

158 if (next_word > kMinAddressWords) {

159 // Looking for the state is likely to add new words to the list while

160 // checking for multi-word state names.

161 size_t state_first_word = next_word;

162 size_t state_last_word, state_index;

163 if (FindStateStartingInWord(&words, state_first_word, &state_last_word,

164 &tokenizer, &state_index)) {

165

166 // A location name should have been found at this point.

167 if (!found_location_name)

168 break;

169

170 // Explicitly exclude "et al", as "al" is a valid state code.

171 if (current_word_length == 2 && words.size() > 2) {

172 const Word& previous_word = words[state_first_word - 1];

173 if (previous_word.end - previous_word.begin == 2 &&

174 base::LowerCaseEqualsASCII(

175 base::StringPiece16(previous_word.begin, previous_word.end),

176 "et") &&

177 base::LowerCaseEqualsASCII(

178 base::StringPiece16(current_word.begin, current_word.end),

179 "al"))

180 break;

181 }

182

183 // Extract one more word from the tokenizer if not already available.

184 size_t zip_word = state_last_word + 1;

185 if (zip_word == words.size()) {

186 do {

187 if (!tokenizer.GetNext()) {

188 // The address ends with a state name without a zip code. This

189 // is legal according to WebView#findAddress public

190 // documentation.

191 *start_pos = words[0].begin - begin;

192 *end_pos = words[state_last_word].end - begin;

193 return true;

194 }

195 } while (tokenizer.token_is_delim());

196 words.push_back(Word(tokenizer.token_begin(),

197 tokenizer.token_end()));

198 }

199

200 // Check the parsing validity and state range of the zip code.

201 next_word = state_last_word;

202 if (!IsZipValid(words[zip_word], state_index))

203 continue;

204

205 *start_pos = words[0].begin - begin;

206 *end_pos = words[zip_word].end - begin;

207 return true;

208 }

209 }

210 }

211

212 // Avoid skipping too many words because of a non-address number

213 // at the beginning of the contents to parse.

214 if (continue_on_house_number && next_house_number_word > 0) {

215 it = words[next_house_number_word].begin;

216 } else {

217 DCHECK(!words.empty());

218 next_word = std::min(next_word, words.size() - 1);

219 it = words[next_word].end;

220 }

221 }

222

223 return false;

224 }

225

226 } // namespace address_parser

227

228 } // namespace content

OLD	NEW

« no previous file with comments | « content/common/android/address_parser.h ('k') | content/common/android/address_parser_internal.h » ('j') | no next file with comments »