| OLD | NEW |
| (Empty) |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #include "android_webview/native/address_parser.h" | |
| 6 | |
| 7 #include "android_webview/native/address_parser_internal.h" | |
| 8 #include "base/logging.h" | |
| 9 #include "base/strings/string_util.h" | |
| 10 | |
| 11 namespace { | |
| 12 | |
| 13 // Minimum number of words in an address after the house number | |
| 14 // before a state is expected to be found. | |
| 15 // A value too high can miss short addresses. | |
| 16 const size_t kMinAddressWords = 3; | |
| 17 | |
| 18 // Maximum number of words allowed in an address between the house number | |
| 19 // and the state, both not included. | |
| 20 const size_t kMaxAddressWords = 12; | |
| 21 | |
| 22 // Maximum number of lines allowed in an address between the house number | |
| 23 // and the state, both not included. | |
| 24 const size_t kMaxAddressLines = 5; | |
| 25 | |
| 26 // Maximum length allowed for any address word between the house number | |
| 27 // and the state, both not included. | |
| 28 const size_t kMaxAddressNameWordLength = 25; | |
| 29 | |
| 30 // Maximum number of words after the house number in which the location name | |
| 31 // should be found. | |
| 32 const size_t kMaxLocationNameDistance = 4; | |
| 33 | |
| 34 // Additional characters used as new line delimiters. | |
| 35 const base::char16 kNewlineDelimiters[] = { | |
| 36 '\n', ',', '*', | |
| 37 0x2022, // Unicode bullet | |
| 38 0, | |
| 39 }; | |
| 40 | |
| 41 } // anonymous namespace | |
| 42 | |
| 43 namespace android_webview { | |
| 44 | |
| 45 namespace address_parser { | |
| 46 | |
| 47 using namespace internal; | |
| 48 | |
| 49 bool FindAddress(const base::string16& text, base::string16* address) { | |
| 50 size_t start, end; | |
| 51 if (FindAddress(text.begin(), text.end(), &start, &end)) { | |
| 52 size_t len = end >= start ? end - start : 0; | |
| 53 address->assign(text.substr(start, len)); | |
| 54 return true; | |
| 55 } | |
| 56 return false; | |
| 57 } | |
| 58 | |
| 59 bool FindAddress(const base::string16::const_iterator& begin, | |
| 60 const base::string16::const_iterator& end, | |
| 61 size_t* start_pos, | |
| 62 size_t* end_pos) { | |
| 63 HouseNumberParser house_number_parser; | |
| 64 | |
| 65 // Keep going through the input string until a potential house number is | |
| 66 // detected. Start tokenizing the following words to find a valid | |
| 67 // street name within a word range. Then, find a state name followed | |
| 68 // by a valid zip code for that state. Also keep a look for any other | |
| 69 // possible house numbers to continue from in case of no match and for | |
| 70 // state names not followed by a zip code (e.g. New York, NY 10000). | |
| 71 const base::string16 newline_delimiters = kNewlineDelimiters; | |
| 72 const base::string16 delimiters = base::kWhitespaceUTF16 + newline_delimiters; | |
| 73 for (base::string16::const_iterator it = begin; it != end;) { | |
| 74 Word house_number; | |
| 75 if (!house_number_parser.Parse(it, end, &house_number)) | |
| 76 return false; | |
| 77 | |
| 78 String16Tokenizer tokenizer(house_number.end, end, delimiters); | |
| 79 tokenizer.set_options(String16Tokenizer::RETURN_DELIMS); | |
| 80 | |
| 81 WordList words; | |
| 82 words.push_back(house_number); | |
| 83 | |
| 84 bool found_location_name = false; | |
| 85 bool continue_on_house_number = true; | |
| 86 bool consecutive_house_numbers = true; | |
| 87 size_t next_house_number_word = 0; | |
| 88 size_t num_lines = 1; | |
| 89 | |
| 90 // Don't include the house number in the word count. | |
| 91 size_t next_word = 1; | |
| 92 for (; next_word <= kMaxAddressWords + 1; ++next_word) { | |
| 93 // Extract a new word from the tokenizer. | |
| 94 if (next_word == words.size()) { | |
| 95 do { | |
| 96 if (!tokenizer.GetNext()) | |
| 97 return false; | |
| 98 | |
| 99 // Check the number of address lines. | |
| 100 if (tokenizer.token_is_delim() && | |
| 101 newline_delimiters.find(*tokenizer.token_begin()) != | |
| 102 base::string16::npos) { | |
| 103 ++num_lines; | |
| 104 } | |
| 105 } while (tokenizer.token_is_delim()); | |
| 106 | |
| 107 if (num_lines > kMaxAddressLines) | |
| 108 break; | |
| 109 | |
| 110 words.push_back(Word(tokenizer.token_begin(), tokenizer.token_end())); | |
| 111 } | |
| 112 | |
| 113 // Check the word length. If too long, don't try to continue from | |
| 114 // the next house number as no address can hold this word. | |
| 115 const Word& current_word = words[next_word]; | |
| 116 DCHECK_GT(std::distance(current_word.begin, current_word.end), 0); | |
| 117 size_t current_word_length = | |
| 118 std::distance(current_word.begin, current_word.end); | |
| 119 if (current_word_length > kMaxAddressNameWordLength) { | |
| 120 continue_on_house_number = false; | |
| 121 break; | |
| 122 } | |
| 123 | |
| 124 // Check if the new word is a valid house number. | |
| 125 if (house_number_parser.Parse(current_word.begin, current_word.end, | |
| 126 NULL)) { | |
| 127 // Increase the number of consecutive house numbers since the beginning. | |
| 128 if (consecutive_house_numbers) { | |
| 129 // Check if there is a new line between consecutive house numbers. | |
| 130 // This avoids false positives of the form "Cafe 21\n 750 Fifth Ave.." | |
| 131 if (num_lines > 1) { | |
| 132 next_house_number_word = next_word; | |
| 133 break; | |
| 134 } | |
| 135 } | |
| 136 | |
| 137 // Keep the next candidate to resume parsing from in case of failure. | |
| 138 if (next_house_number_word == 0) { | |
| 139 next_house_number_word = next_word; | |
| 140 continue; | |
| 141 } | |
| 142 } else { | |
| 143 consecutive_house_numbers = false; | |
| 144 } | |
| 145 | |
| 146 // Look for location names in the words after the house number. | |
| 147 // A range limitation is introduced to avoid matching | |
| 148 // anything that starts with a number before a legitimate address. | |
| 149 if (next_word <= kMaxLocationNameDistance && | |
| 150 IsValidLocationName(current_word)) { | |
| 151 found_location_name = true; | |
| 152 continue; | |
| 153 } | |
| 154 | |
| 155 // Don't count the house number. | |
| 156 if (next_word > kMinAddressWords) { | |
| 157 // Looking for the state is likely to add new words to the list while | |
| 158 // checking for multi-word state names. | |
| 159 size_t state_first_word = next_word; | |
| 160 size_t state_last_word, state_index; | |
| 161 if (FindStateStartingInWord(&words, state_first_word, &state_last_word, | |
| 162 &tokenizer, &state_index)) { | |
| 163 // A location name should have been found at this point. | |
| 164 if (!found_location_name) | |
| 165 break; | |
| 166 | |
| 167 // Explicitly exclude "et al", as "al" is a valid state code. | |
| 168 if (current_word_length == 2 && words.size() > 2) { | |
| 169 const Word& previous_word = words[state_first_word - 1]; | |
| 170 if (previous_word.end - previous_word.begin == 2 && | |
| 171 base::LowerCaseEqualsASCII( | |
| 172 base::StringPiece16(previous_word.begin, previous_word.end), | |
| 173 "et") && | |
| 174 base::LowerCaseEqualsASCII( | |
| 175 base::StringPiece16(current_word.begin, current_word.end), | |
| 176 "al")) | |
| 177 break; | |
| 178 } | |
| 179 | |
| 180 // Extract one more word from the tokenizer if not already available. | |
| 181 size_t zip_word = state_last_word + 1; | |
| 182 if (zip_word == words.size()) { | |
| 183 do { | |
| 184 if (!tokenizer.GetNext()) { | |
| 185 // The address ends with a state name without a zip code. This | |
| 186 // is legal according to WebView#findAddress public | |
| 187 // documentation. | |
| 188 *start_pos = words[0].begin - begin; | |
| 189 *end_pos = words[state_last_word].end - begin; | |
| 190 return true; | |
| 191 } | |
| 192 } while (tokenizer.token_is_delim()); | |
| 193 words.push_back( | |
| 194 Word(tokenizer.token_begin(), tokenizer.token_end())); | |
| 195 } | |
| 196 | |
| 197 // Check the parsing validity and state range of the zip code. | |
| 198 next_word = state_last_word; | |
| 199 if (!IsZipValid(words[zip_word], state_index)) | |
| 200 continue; | |
| 201 | |
| 202 *start_pos = words[0].begin - begin; | |
| 203 *end_pos = words[zip_word].end - begin; | |
| 204 return true; | |
| 205 } | |
| 206 } | |
| 207 } | |
| 208 | |
| 209 // Avoid skipping too many words because of a non-address number | |
| 210 // at the beginning of the contents to parse. | |
| 211 if (continue_on_house_number && next_house_number_word > 0) { | |
| 212 it = words[next_house_number_word].begin; | |
| 213 } else { | |
| 214 DCHECK(!words.empty()); | |
| 215 next_word = std::min(next_word, words.size() - 1); | |
| 216 it = words[next_word].end; | |
| 217 } | |
| 218 } | |
| 219 | |
| 220 return false; | |
| 221 } | |
| 222 | |
| 223 } // namespace address_parser | |
| 224 | |
| 225 } // namespace android_webview | |
| OLD | NEW |