OLD | NEW |
| (Empty) |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include "content/common/android/address_parser.h" | |
6 | |
7 #include "base/logging.h" | |
8 #include "base/strings/string_util.h" | |
9 #include "content/common/android/address_parser_internal.h" | |
10 | |
11 namespace { | |
12 | |
13 // Minimum number of words in an address after the house number | |
14 // before a state is expected to be found. | |
15 // A value too high can miss short addresses. | |
16 const size_t kMinAddressWords = 3; | |
17 | |
18 // Maximum number of words allowed in an address between the house number | |
19 // and the state, both not included. | |
20 const size_t kMaxAddressWords = 12; | |
21 | |
22 // Maximum number of lines allowed in an address between the house number | |
23 // and the state, both not included. | |
24 const size_t kMaxAddressLines = 5; | |
25 | |
26 // Maximum length allowed for any address word between the house number | |
27 // and the state, both not included. | |
28 const size_t kMaxAddressNameWordLength = 25; | |
29 | |
30 // Maximum number of words after the house number in which the location name | |
31 // should be found. | |
32 const size_t kMaxLocationNameDistance = 4; | |
33 | |
34 // Additional characters used as new line delimiters. | |
35 const base::char16 kNewlineDelimiters[] = { | |
36 '\n', | |
37 ',', | |
38 '*', | |
39 0x2022, // Unicode bullet | |
40 0, | |
41 }; | |
42 | |
43 } // anonymous namespace | |
44 | |
45 namespace content { | |
46 | |
47 namespace address_parser { | |
48 | |
49 using namespace internal; | |
50 | |
51 bool FindAddress(const base::string16& text, base::string16* address) { | |
52 size_t start, end; | |
53 if (FindAddress(text.begin(), text.end(), &start, &end)) { | |
54 size_t len = end >= start ? end - start : 0; | |
55 address->assign(text.substr(start, len)); | |
56 return true; | |
57 } | |
58 return false; | |
59 } | |
60 | |
61 bool FindAddress(const base::string16::const_iterator& begin, | |
62 const base::string16::const_iterator& end, | |
63 size_t* start_pos, | |
64 size_t* end_pos) { | |
65 HouseNumberParser house_number_parser; | |
66 | |
67 // Keep going through the input string until a potential house number is | |
68 // detected. Start tokenizing the following words to find a valid | |
69 // street name within a word range. Then, find a state name followed | |
70 // by a valid zip code for that state. Also keep a look for any other | |
71 // possible house numbers to continue from in case of no match and for | |
72 // state names not followed by a zip code (e.g. New York, NY 10000). | |
73 const base::string16 newline_delimiters = kNewlineDelimiters; | |
74 const base::string16 delimiters = base::kWhitespaceUTF16 + newline_delimiters; | |
75 for (base::string16::const_iterator it = begin; it != end; ) { | |
76 Word house_number; | |
77 if (!house_number_parser.Parse(it, end, &house_number)) | |
78 return false; | |
79 | |
80 String16Tokenizer tokenizer(house_number.end, end, delimiters); | |
81 tokenizer.set_options(String16Tokenizer::RETURN_DELIMS); | |
82 | |
83 WordList words; | |
84 words.push_back(house_number); | |
85 | |
86 bool found_location_name = false; | |
87 bool continue_on_house_number = true; | |
88 bool consecutive_house_numbers = true; | |
89 size_t next_house_number_word = 0; | |
90 size_t num_lines = 1; | |
91 | |
92 // Don't include the house number in the word count. | |
93 size_t next_word = 1; | |
94 for (; next_word <= kMaxAddressWords + 1; ++next_word) { | |
95 | |
96 // Extract a new word from the tokenizer. | |
97 if (next_word == words.size()) { | |
98 do { | |
99 if (!tokenizer.GetNext()) | |
100 return false; | |
101 | |
102 // Check the number of address lines. | |
103 if (tokenizer.token_is_delim() && newline_delimiters.find( | |
104 *tokenizer.token_begin()) != base::string16::npos) { | |
105 ++num_lines; | |
106 } | |
107 } while (tokenizer.token_is_delim()); | |
108 | |
109 if (num_lines > kMaxAddressLines) | |
110 break; | |
111 | |
112 words.push_back(Word(tokenizer.token_begin(), tokenizer.token_end())); | |
113 } | |
114 | |
115 // Check the word length. If too long, don't try to continue from | |
116 // the next house number as no address can hold this word. | |
117 const Word& current_word = words[next_word]; | |
118 DCHECK_GT(std::distance(current_word.begin, current_word.end), 0); | |
119 size_t current_word_length = std::distance( | |
120 current_word.begin, current_word.end); | |
121 if (current_word_length > kMaxAddressNameWordLength) { | |
122 continue_on_house_number = false; | |
123 break; | |
124 } | |
125 | |
126 // Check if the new word is a valid house number. | |
127 if (house_number_parser.Parse(current_word.begin, current_word.end, | |
128 NULL)) { | |
129 // Increase the number of consecutive house numbers since the beginning. | |
130 if (consecutive_house_numbers) { | |
131 // Check if there is a new line between consecutive house numbers. | |
132 // This avoids false positives of the form "Cafe 21\n 750 Fifth Ave.." | |
133 if (num_lines > 1) { | |
134 next_house_number_word = next_word; | |
135 break; | |
136 } | |
137 } | |
138 | |
139 // Keep the next candidate to resume parsing from in case of failure. | |
140 if (next_house_number_word == 0) { | |
141 next_house_number_word = next_word; | |
142 continue; | |
143 } | |
144 } else { | |
145 consecutive_house_numbers = false; | |
146 } | |
147 | |
148 // Look for location names in the words after the house number. | |
149 // A range limitation is introduced to avoid matching | |
150 // anything that starts with a number before a legitimate address. | |
151 if (next_word <= kMaxLocationNameDistance && | |
152 IsValidLocationName(current_word)) { | |
153 found_location_name = true; | |
154 continue; | |
155 } | |
156 | |
157 // Don't count the house number. | |
158 if (next_word > kMinAddressWords) { | |
159 // Looking for the state is likely to add new words to the list while | |
160 // checking for multi-word state names. | |
161 size_t state_first_word = next_word; | |
162 size_t state_last_word, state_index; | |
163 if (FindStateStartingInWord(&words, state_first_word, &state_last_word, | |
164 &tokenizer, &state_index)) { | |
165 | |
166 // A location name should have been found at this point. | |
167 if (!found_location_name) | |
168 break; | |
169 | |
170 // Explicitly exclude "et al", as "al" is a valid state code. | |
171 if (current_word_length == 2 && words.size() > 2) { | |
172 const Word& previous_word = words[state_first_word - 1]; | |
173 if (previous_word.end - previous_word.begin == 2 && | |
174 base::LowerCaseEqualsASCII( | |
175 base::StringPiece16(previous_word.begin, previous_word.end), | |
176 "et") && | |
177 base::LowerCaseEqualsASCII( | |
178 base::StringPiece16(current_word.begin, current_word.end), | |
179 "al")) | |
180 break; | |
181 } | |
182 | |
183 // Extract one more word from the tokenizer if not already available. | |
184 size_t zip_word = state_last_word + 1; | |
185 if (zip_word == words.size()) { | |
186 do { | |
187 if (!tokenizer.GetNext()) { | |
188 // The address ends with a state name without a zip code. This | |
189 // is legal according to WebView#findAddress public | |
190 // documentation. | |
191 *start_pos = words[0].begin - begin; | |
192 *end_pos = words[state_last_word].end - begin; | |
193 return true; | |
194 } | |
195 } while (tokenizer.token_is_delim()); | |
196 words.push_back(Word(tokenizer.token_begin(), | |
197 tokenizer.token_end())); | |
198 } | |
199 | |
200 // Check the parsing validity and state range of the zip code. | |
201 next_word = state_last_word; | |
202 if (!IsZipValid(words[zip_word], state_index)) | |
203 continue; | |
204 | |
205 *start_pos = words[0].begin - begin; | |
206 *end_pos = words[zip_word].end - begin; | |
207 return true; | |
208 } | |
209 } | |
210 } | |
211 | |
212 // Avoid skipping too many words because of a non-address number | |
213 // at the beginning of the contents to parse. | |
214 if (continue_on_house_number && next_house_number_word > 0) { | |
215 it = words[next_house_number_word].begin; | |
216 } else { | |
217 DCHECK(!words.empty()); | |
218 next_word = std::min(next_word, words.size() - 1); | |
219 it = words[next_word].end; | |
220 } | |
221 } | |
222 | |
223 return false; | |
224 } | |
225 | |
226 } // namespace address_parser | |
227 | |
228 } // namespace content | |
OLD | NEW |