OLD | NEW |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "content/common/android/address_parser.h" | 5 #include "android_webview/native/address_parser.h" |
6 | 6 |
| 7 #include "android_webview/native/address_parser_internal.h" |
7 #include "base/logging.h" | 8 #include "base/logging.h" |
8 #include "base/strings/string_util.h" | 9 #include "base/strings/string_util.h" |
9 #include "content/common/android/address_parser_internal.h" | |
10 | 10 |
11 namespace { | 11 namespace { |
12 | 12 |
13 // Minimum number of words in an address after the house number | 13 // Minimum number of words in an address after the house number |
14 // before a state is expected to be found. | 14 // before a state is expected to be found. |
15 // A value too high can miss short addresses. | 15 // A value too high can miss short addresses. |
16 const size_t kMinAddressWords = 3; | 16 const size_t kMinAddressWords = 3; |
17 | 17 |
18 // Maximum number of words allowed in an address between the house number | 18 // Maximum number of words allowed in an address between the house number |
19 // and the state, both not included. | 19 // and the state, both not included. |
20 const size_t kMaxAddressWords = 12; | 20 const size_t kMaxAddressWords = 12; |
21 | 21 |
22 // Maximum number of lines allowed in an address between the house number | 22 // Maximum number of lines allowed in an address between the house number |
23 // and the state, both not included. | 23 // and the state, both not included. |
24 const size_t kMaxAddressLines = 5; | 24 const size_t kMaxAddressLines = 5; |
25 | 25 |
26 // Maximum length allowed for any address word between the house number | 26 // Maximum length allowed for any address word between the house number |
27 // and the state, both not included. | 27 // and the state, both not included. |
28 const size_t kMaxAddressNameWordLength = 25; | 28 const size_t kMaxAddressNameWordLength = 25; |
29 | 29 |
30 // Maximum number of words after the house number in which the location name | 30 // Maximum number of words after the house number in which the location name |
31 // should be found. | 31 // should be found. |
32 const size_t kMaxLocationNameDistance = 4; | 32 const size_t kMaxLocationNameDistance = 4; |
33 | 33 |
34 // Additional characters used as new line delimiters. | 34 // Additional characters used as new line delimiters. |
// The array ends with an explicit 0 terminator: FindAddress() constructs a
// base::string16 directly from it, which needs a null-terminated sequence.
// These characters are also appended to base::kWhitespaceUTF16 there to form
// the tokenizer's full delimiter set.
35 const base::char16 kNewlineDelimiters[] = { | 35 const base::char16 kNewlineDelimiters[] = { |
36 '\n', | 36 '\n', ',', '*', |
37 ',', | 37 0x2022, // Unicode bullet |
38 '*', | 38 0, |
39 0x2022, // Unicode bullet | |
40 0, | |
41 }; | 39 }; |
42 | 40 |
43 } // anonymous namespace | 41 } // anonymous namespace |
44 | 42 |
45 namespace content { | 43 namespace android_webview { |
46 | 44 |
47 namespace address_parser { | 45 namespace address_parser { |
48 | 46 |
49 using namespace internal; | 47 using namespace internal; |
50 | 48 |
// Convenience overload working on a whole string.
//
// Runs the iterator-based FindAddress() over |text|. If it reports a match,
// the matched range is copied into |address| and true is returned. Otherwise
// false is returned and |address| is left untouched.
//
// |address| is dereferenced without a null check, so it must be non-null.
// The length is clamped to 0 when the reported end offset is smaller than the
// start offset, so the subtraction can never underflow.
51 bool FindAddress(const base::string16& text, base::string16* address) { | 49 bool FindAddress(const base::string16& text, base::string16* address) { |
52 size_t start, end; | 50 size_t start, end; |
53 if (FindAddress(text.begin(), text.end(), &start, &end)) { | 51 if (FindAddress(text.begin(), text.end(), &start, &end)) { |
54 size_t len = end >= start ? end - start : 0; | 52 size_t len = end >= start ? end - start : 0; |
55 address->assign(text.substr(start, len)); | 53 address->assign(text.substr(start, len)); |
56 return true; | 54 return true; |
57 } | 55 } |
58 return false; | 56 return false; |
59 } | 57 } |
60 | 58 |
// Searches the range [begin, end) for a postal address.
//
// Flow visible in this view: parse a house number, tokenize the words that
// follow it, then look for a state name (FindStateStartingInWord), optionally
// followed by a zip code that is validated with IsZipValid().
//
// On success returns true and stores offsets relative to |begin|:
//   |start_pos| - begin of the first word (the house number).
//   |end_pos|   - end of the last matched word: the state name when the input
//                 ends right after it, otherwise the zip code.
// Returns false when no house number can be parsed or when the tokenizer runs
// out of input before a match. The visible code does not write the
// out-parameters on those paths.
//
// NOTE(review): two regions of this function are elided by the diff viewer
// ("skipping N matching lines"), so this summary covers only what is shown.
61 bool FindAddress(const base::string16::const_iterator& begin, | 59 bool FindAddress(const base::string16::const_iterator& begin, |
62 const base::string16::const_iterator& end, | 60 const base::string16::const_iterator& end, |
63 size_t* start_pos, | 61 size_t* start_pos, |
64 size_t* end_pos) { | 62 size_t* end_pos) { |
65 HouseNumberParser house_number_parser; | 63 HouseNumberParser house_number_parser; |
66 | 64 |
67 // Keep going through the input string until a potential house number is | 65 // Keep going through the input string until a potential house number is |
68 // detected. Start tokenizing the following words to find a valid | 66 // detected. Start tokenizing the following words to find a valid |
69 // street name within a word range. Then, find a state name followed | 67 // street name within a word range. Then, find a state name followed |
70 // by a valid zip code for that state. Also keep a look for any other | 68 // by a valid zip code for that state. Also keep a look for any other |
71 // possible house numbers to continue from in case of no match and for | 69 // possible house numbers to continue from in case of no match and for |
72 // state names not followed by a zip code (e.g. New York, NY 10000). | 70 // state names not followed by a zip code (e.g. New York, NY 10000). |
73 const base::string16 newline_delimiters = kNewlineDelimiters; | 71 const base::string16 newline_delimiters = kNewlineDelimiters; |
74 const base::string16 delimiters = base::kWhitespaceUTF16 + newline_delimiters; | 72 const base::string16 delimiters = base::kWhitespaceUTF16 + newline_delimiters; |
75 for (base::string16::const_iterator it = begin; it != end; ) { | 73 for (base::string16::const_iterator it = begin; it != end;) { |
76 Word house_number; | 74 Word house_number; |
77 if (!house_number_parser.Parse(it, end, &house_number)) | 75 if (!house_number_parser.Parse(it, end, &house_number)) |
78 return false; | 76 return false; |
79 | 77 |
80 String16Tokenizer tokenizer(house_number.end, end, delimiters); | 78 String16Tokenizer tokenizer(house_number.end, end, delimiters); |
// Delimiters are returned as tokens so that the newline delimiters among them
// can be counted toward num_lines below.
81 tokenizer.set_options(String16Tokenizer::RETURN_DELIMS); | 79 tokenizer.set_options(String16Tokenizer::RETURN_DELIMS); |
82 | 80 |
83 WordList words; | 81 WordList words; |
84 words.push_back(house_number); | 82 words.push_back(house_number); |
85 | 83 |
86 bool found_location_name = false; | 84 bool found_location_name = false; |
87 bool continue_on_house_number = true; | 85 bool continue_on_house_number = true; |
88 bool consecutive_house_numbers = true; | 86 bool consecutive_house_numbers = true; |
89 size_t next_house_number_word = 0; | 87 size_t next_house_number_word = 0; |
90 size_t num_lines = 1; | 88 size_t num_lines = 1; |
91 | 89 |
92 // Don't include the house number in the word count. | 90 // Don't include the house number in the word count. |
93 size_t next_word = 1; | 91 size_t next_word = 1; |
94 for (; next_word <= kMaxAddressWords + 1; ++next_word) { | 92 for (; next_word <= kMaxAddressWords + 1; ++next_word) { |
95 | |
96 // Extract a new word from the tokenizer. | 93 // Extract a new word from the tokenizer. |
97 if (next_word == words.size()) { | 94 if (next_word == words.size()) { |
98 do { | 95 do { |
99 if (!tokenizer.GetNext()) | 96 if (!tokenizer.GetNext()) |
100 return false; | 97 return false; |
101 | 98 |
102 // Check the number of address lines. | 99 // Check the number of address lines. |
103 if (tokenizer.token_is_delim() && newline_delimiters.find( | 100 if (tokenizer.token_is_delim() && |
104 *tokenizer.token_begin()) != base::string16::npos) { | 101 newline_delimiters.find(*tokenizer.token_begin()) != |
| 102 base::string16::npos) { |
105 ++num_lines; | 103 ++num_lines; |
106 } | 104 } |
107 } while (tokenizer.token_is_delim()); | 105 } while (tokenizer.token_is_delim()); |
108 | 106 |
109 if (num_lines > kMaxAddressLines) | 107 if (num_lines > kMaxAddressLines) |
110 break; | 108 break; |
111 | 109 |
112 words.push_back(Word(tokenizer.token_begin(), tokenizer.token_end())); | 110 words.push_back(Word(tokenizer.token_begin(), tokenizer.token_end())); |
113 } | 111 } |
114 | 112 |
115 // Check the word length. If too long, don't try to continue from | 113 // Check the word length. If too long, don't try to continue from |
116 // the next house number as no address can hold this word. | 114 // the next house number as no address can hold this word. |
117 const Word& current_word = words[next_word]; | 115 const Word& current_word = words[next_word]; |
118 DCHECK_GT(std::distance(current_word.begin, current_word.end), 0); | 116 DCHECK_GT(std::distance(current_word.begin, current_word.end), 0); |
119 size_t current_word_length = std::distance( | 117 size_t current_word_length = |
120 current_word.begin, current_word.end); | 118 std::distance(current_word.begin, current_word.end); |
121 if (current_word_length > kMaxAddressNameWordLength) { | 119 if (current_word_length > kMaxAddressNameWordLength) { |
122 continue_on_house_number = false; | 120 continue_on_house_number = false; |
123 break; | 121 break; |
124 } | 122 } |
125 | 123 |
126 // Check if the new word is a valid house number. | 124 // Check if the new word is a valid house number. |
127 if (house_number_parser.Parse(current_word.begin, current_word.end, | 125 if (house_number_parser.Parse(current_word.begin, current_word.end, |
128 NULL)) { | 126 NULL)) { |
129 // Increase the number of consecutive house numbers since the beginning. | 127 // Increase the number of consecutive house numbers since the beginning. |
130 if (consecutive_house_numbers) { | 128 if (consecutive_house_numbers) { |
131 // Check if there is a new line between consecutive house numbers. | 129 // Check if there is a new line between consecutive house numbers. |
132 // This avoids false positives of the form "Cafe 21\n 750 Fifth Ave.." | 130 // This avoids false positives of the form "Cafe 21\n 750 Fifth Ave.." |
133 if (num_lines > 1) { | 131 if (num_lines > 1) { |
134 next_house_number_word = next_word; | 132 next_house_number_word = next_word; |
135 break; | 133 break; |
136 } | 134 } |
137 } | 135 } |
138 | 136 |
(...skipping 16 matching lines...) Expand all Loading... |
155 } | 153 } |
156 | 154 |
157 // Don't count the house number. | 155 // Don't count the house number. |
158 if (next_word > kMinAddressWords) { | 156 if (next_word > kMinAddressWords) { |
159 // Looking for the state is likely to add new words to the list while | 157 // Looking for the state is likely to add new words to the list while |
160 // checking for multi-word state names. | 158 // checking for multi-word state names. |
161 size_t state_first_word = next_word; | 159 size_t state_first_word = next_word; |
162 size_t state_last_word, state_index; | 160 size_t state_last_word, state_index; |
163 if (FindStateStartingInWord(&words, state_first_word, &state_last_word, | 161 if (FindStateStartingInWord(&words, state_first_word, &state_last_word, |
164 &tokenizer, &state_index)) { | 162 &tokenizer, &state_index)) { |
165 | |
166 // A location name should have been found at this point. | 163 // A location name should have been found at this point. |
167 if (!found_location_name) | 164 if (!found_location_name) |
168 break; | 165 break; |
169 | 166 |
170 // Explicitly exclude "et al", as "al" is a valid state code. | 167 // Explicitly exclude "et al", as "al" is a valid state code. |
171 if (current_word_length == 2 && words.size() > 2) { | 168 if (current_word_length == 2 && words.size() > 2) { |
172 const Word& previous_word = words[state_first_word - 1]; | 169 const Word& previous_word = words[state_first_word - 1]; |
173 if (previous_word.end - previous_word.begin == 2 && | 170 if (previous_word.end - previous_word.begin == 2 && |
174 base::LowerCaseEqualsASCII( | 171 base::LowerCaseEqualsASCII( |
175 base::StringPiece16(previous_word.begin, previous_word.end), | 172 base::StringPiece16(previous_word.begin, previous_word.end), |
176 "et") && | 173 "et") && |
177 base::LowerCaseEqualsASCII( | 174 base::LowerCaseEqualsASCII( |
178 base::StringPiece16(current_word.begin, current_word.end), | 175 base::StringPiece16(current_word.begin, current_word.end), |
179 "al")) | 176 "al")) |
180 break; | 177 break; |
181 } | 178 } |
182 | 179 |
183 // Extract one more word from the tokenizer if not already available. | 180 // Extract one more word from the tokenizer if not already available. |
184 size_t zip_word = state_last_word + 1; | 181 size_t zip_word = state_last_word + 1; |
185 if (zip_word == words.size()) { | 182 if (zip_word == words.size()) { |
186 do { | 183 do { |
187 if (!tokenizer.GetNext()) { | 184 if (!tokenizer.GetNext()) { |
188 // The address ends with a state name without a zip code. This | 185 // The address ends with a state name without a zip code. This |
189 // is legal according to WebView#findAddress public | 186 // is legal according to WebView#findAddress public |
190 // documentation. | 187 // documentation. |
191 *start_pos = words[0].begin - begin; | 188 *start_pos = words[0].begin - begin; |
192 *end_pos = words[state_last_word].end - begin; | 189 *end_pos = words[state_last_word].end - begin; |
193 return true; | 190 return true; |
194 } | 191 } |
195 } while (tokenizer.token_is_delim()); | 192 } while (tokenizer.token_is_delim()); |
196 words.push_back(Word(tokenizer.token_begin(), | 193 words.push_back( |
197 tokenizer.token_end())); | 194 Word(tokenizer.token_begin(), tokenizer.token_end())); |
198 } | 195 } |
199 | 196 |
200 // Check the parsing validity and state range of the zip code. | 197 // Check the parsing validity and state range of the zip code. |
// next_word is moved to the end of the state name before the zip check, so a
// rejected zip resumes scanning at the word right after the state.
// NOTE(review): presumably the `continue` targets the word loop above; part of
// that loop is elided in this view - confirm.
201 next_word = state_last_word; | 198 next_word = state_last_word; |
202 if (!IsZipValid(words[zip_word], state_index)) | 199 if (!IsZipValid(words[zip_word], state_index)) |
203 continue; | 200 continue; |
204 | 201 |
205 *start_pos = words[0].begin - begin; | 202 *start_pos = words[0].begin - begin; |
206 *end_pos = words[zip_word].end - begin; | 203 *end_pos = words[zip_word].end - begin; |
207 return true; | 204 return true; |
(...skipping 10 matching lines...) Expand all Loading... |
// NOTE(review): std::min is used below (and std::distance above), but neither
// <algorithm> nor <iterator> appears in the visible include list - confirm
// they are pulled in transitively or include them explicitly.
218 next_word = std::min(next_word, words.size() - 1); | 215 next_word = std::min(next_word, words.size() - 1); |
219 it = words[next_word].end; | 216 it = words[next_word].end; |
220 } | 217 } |
221 } | 218 } |
222 | 219 |
223 return false; | 220 return false; |
224 } | 221 } |
225 | 222 |
226 } // namespace address_parser | 223 } // namespace address_parser |
227 | 224 |
228 } // namespace content | 225 } // namespace android_webview |
OLD | NEW |