Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(142)

Side by Side Diff: android_webview/native/address_parser.cc

Issue 2863233002: [WebView] Move files from native to browser (Closed)
Patch Set: Created 3 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "android_webview/native/address_parser.h"
6
7 #include "android_webview/native/address_parser_internal.h"
8 #include "base/logging.h"
9 #include "base/strings/string_util.h"
10
11 namespace {
12
13 // Minimum number of words in an address after the house number
14 // before a state is expected to be found.
15 // A value too high can miss short addresses.
16 const size_t kMinAddressWords = 3;
17
18 // Maximum number of words allowed in an address between the house number
19 // and the state, both not included.
20 const size_t kMaxAddressWords = 12;
21
22 // Maximum number of lines allowed in an address between the house number
23 // and the state, both not included.
24 const size_t kMaxAddressLines = 5;
25
26 // Maximum length allowed for any address word between the house number
27 // and the state, both not included.
28 const size_t kMaxAddressNameWordLength = 25;
29
30 // Maximum number of words after the house number in which the location name
31 // should be found.
32 const size_t kMaxLocationNameDistance = 4;
33
34 // Additional characters used as new line delimiters.
35 const base::char16 kNewlineDelimiters[] = {
36 '\n', ',', '*',
37 0x2022, // Unicode bullet
38 0,
39 };
40
41 } // anonymous namespace
42
43 namespace android_webview {
44
45 namespace address_parser {
46
47 using namespace internal;
48
49 bool FindAddress(const base::string16& text, base::string16* address) {
50 size_t start, end;
51 if (FindAddress(text.begin(), text.end(), &start, &end)) {
52 size_t len = end >= start ? end - start : 0;
53 address->assign(text.substr(start, len));
54 return true;
55 }
56 return false;
57 }
58
59 bool FindAddress(const base::string16::const_iterator& begin,
60 const base::string16::const_iterator& end,
61 size_t* start_pos,
62 size_t* end_pos) {
63 HouseNumberParser house_number_parser;
64
65 // Keep going through the input string until a potential house number is
66 // detected. Start tokenizing the following words to find a valid
67 // street name within a word range. Then, find a state name followed
68 // by a valid zip code for that state. Also keep a look for any other
69 // possible house numbers to continue from in case of no match and for
70 // state names not followed by a zip code (e.g. New York, NY 10000).
71 const base::string16 newline_delimiters = kNewlineDelimiters;
72 const base::string16 delimiters = base::kWhitespaceUTF16 + newline_delimiters;
73 for (base::string16::const_iterator it = begin; it != end;) {
74 Word house_number;
75 if (!house_number_parser.Parse(it, end, &house_number))
76 return false;
77
78 String16Tokenizer tokenizer(house_number.end, end, delimiters);
79 tokenizer.set_options(String16Tokenizer::RETURN_DELIMS);
80
81 WordList words;
82 words.push_back(house_number);
83
84 bool found_location_name = false;
85 bool continue_on_house_number = true;
86 bool consecutive_house_numbers = true;
87 size_t next_house_number_word = 0;
88 size_t num_lines = 1;
89
90 // Don't include the house number in the word count.
91 size_t next_word = 1;
92 for (; next_word <= kMaxAddressWords + 1; ++next_word) {
93 // Extract a new word from the tokenizer.
94 if (next_word == words.size()) {
95 do {
96 if (!tokenizer.GetNext())
97 return false;
98
99 // Check the number of address lines.
100 if (tokenizer.token_is_delim() &&
101 newline_delimiters.find(*tokenizer.token_begin()) !=
102 base::string16::npos) {
103 ++num_lines;
104 }
105 } while (tokenizer.token_is_delim());
106
107 if (num_lines > kMaxAddressLines)
108 break;
109
110 words.push_back(Word(tokenizer.token_begin(), tokenizer.token_end()));
111 }
112
113 // Check the word length. If too long, don't try to continue from
114 // the next house number as no address can hold this word.
115 const Word& current_word = words[next_word];
116 DCHECK_GT(std::distance(current_word.begin, current_word.end), 0);
117 size_t current_word_length =
118 std::distance(current_word.begin, current_word.end);
119 if (current_word_length > kMaxAddressNameWordLength) {
120 continue_on_house_number = false;
121 break;
122 }
123
124 // Check if the new word is a valid house number.
125 if (house_number_parser.Parse(current_word.begin, current_word.end,
126 NULL)) {
127 // Increase the number of consecutive house numbers since the beginning.
128 if (consecutive_house_numbers) {
129 // Check if there is a new line between consecutive house numbers.
130 // This avoids false positives of the form "Cafe 21\n 750 Fifth Ave.."
131 if (num_lines > 1) {
132 next_house_number_word = next_word;
133 break;
134 }
135 }
136
137 // Keep the next candidate to resume parsing from in case of failure.
138 if (next_house_number_word == 0) {
139 next_house_number_word = next_word;
140 continue;
141 }
142 } else {
143 consecutive_house_numbers = false;
144 }
145
146 // Look for location names in the words after the house number.
147 // A range limitation is introduced to avoid matching
148 // anything that starts with a number before a legitimate address.
149 if (next_word <= kMaxLocationNameDistance &&
150 IsValidLocationName(current_word)) {
151 found_location_name = true;
152 continue;
153 }
154
155 // Don't count the house number.
156 if (next_word > kMinAddressWords) {
157 // Looking for the state is likely to add new words to the list while
158 // checking for multi-word state names.
159 size_t state_first_word = next_word;
160 size_t state_last_word, state_index;
161 if (FindStateStartingInWord(&words, state_first_word, &state_last_word,
162 &tokenizer, &state_index)) {
163 // A location name should have been found at this point.
164 if (!found_location_name)
165 break;
166
167 // Explicitly exclude "et al", as "al" is a valid state code.
168 if (current_word_length == 2 && words.size() > 2) {
169 const Word& previous_word = words[state_first_word - 1];
170 if (previous_word.end - previous_word.begin == 2 &&
171 base::LowerCaseEqualsASCII(
172 base::StringPiece16(previous_word.begin, previous_word.end),
173 "et") &&
174 base::LowerCaseEqualsASCII(
175 base::StringPiece16(current_word.begin, current_word.end),
176 "al"))
177 break;
178 }
179
180 // Extract one more word from the tokenizer if not already available.
181 size_t zip_word = state_last_word + 1;
182 if (zip_word == words.size()) {
183 do {
184 if (!tokenizer.GetNext()) {
185 // The address ends with a state name without a zip code. This
186 // is legal according to WebView#findAddress public
187 // documentation.
188 *start_pos = words[0].begin - begin;
189 *end_pos = words[state_last_word].end - begin;
190 return true;
191 }
192 } while (tokenizer.token_is_delim());
193 words.push_back(
194 Word(tokenizer.token_begin(), tokenizer.token_end()));
195 }
196
197 // Check the parsing validity and state range of the zip code.
198 next_word = state_last_word;
199 if (!IsZipValid(words[zip_word], state_index))
200 continue;
201
202 *start_pos = words[0].begin - begin;
203 *end_pos = words[zip_word].end - begin;
204 return true;
205 }
206 }
207 }
208
209 // Avoid skipping too many words because of a non-address number
210 // at the beginning of the contents to parse.
211 if (continue_on_house_number && next_house_number_word > 0) {
212 it = words[next_house_number_word].begin;
213 } else {
214 DCHECK(!words.empty());
215 next_word = std::min(next_word, words.size() - 1);
216 it = words[next_word].end;
217 }
218 }
219
220 return false;
221 }
222
223 } // namespace address_parser
224
225 } // namespace android_webview
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698