Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(305)

Side by Side Diff: android_webview/native/address_parser.cc

Issue 2803163002: Move address parser and prefixes to android_webview/. (Closed)
Patch Set: Bring back ContentViewStatics import Created 3 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "content/common/android/address_parser.h" 5 #include "android_webview/native/address_parser.h"
6 6
7 #include "android_webview/native/address_parser_internal.h"
7 #include "base/logging.h" 8 #include "base/logging.h"
8 #include "base/strings/string_util.h" 9 #include "base/strings/string_util.h"
9 #include "content/common/android/address_parser_internal.h"
10 10
11 namespace { 11 namespace {
12 12
13 // Minimum number of words in an address after the house number 13 // Minimum number of words in an address after the house number
14 // before a state is expected to be found. 14 // before a state is expected to be found.
15 // A value too high can miss short addresses. 15 // A value too high can miss short addresses.
16 const size_t kMinAddressWords = 3; 16 const size_t kMinAddressWords = 3;
17 17
18 // Maximum number of words allowed in an address between the house number 18 // Maximum number of words allowed in an address between the house number
19 // and the state, both not included. 19 // and the state, both not included.
20 const size_t kMaxAddressWords = 12; 20 const size_t kMaxAddressWords = 12;
21 21
22 // Maximum number of lines allowed in an address between the house number 22 // Maximum number of lines allowed in an address between the house number
23 // and the state, both not included. 23 // and the state, both not included.
24 const size_t kMaxAddressLines = 5; 24 const size_t kMaxAddressLines = 5;
25 25
26 // Maximum length allowed for any address word between the house number 26 // Maximum length allowed for any address word between the house number
27 // and the state, both not included. 27 // and the state, both not included.
28 const size_t kMaxAddressNameWordLength = 25; 28 const size_t kMaxAddressNameWordLength = 25;
29 29
30 // Maximum number of words after the house number in which the location name 30 // Maximum number of words after the house number in which the location name
31 // should be found. 31 // should be found.
32 const size_t kMaxLocationNameDistance = 4; 32 const size_t kMaxLocationNameDistance = 4;
33 33
34 // Additional characters used as new line delimiters. 34 // Additional characters used as new line delimiters.
35 const base::char16 kNewlineDelimiters[] = { 35 const base::char16 kNewlineDelimiters[] = {
36 '\n', 36 '\n', ',', '*',
37 ',', 37 0x2022, // Unicode bullet
38 '*', 38 0,
39 0x2022, // Unicode bullet
40 0,
41 }; 39 };
42 40
43 } // anonymous namespace 41 } // anonymous namespace
44 42
45 namespace content { 43 namespace android_webview {
46 44
47 namespace address_parser { 45 namespace address_parser {
48 46
49 using namespace internal; 47 using namespace internal;
50 48
51 bool FindAddress(const base::string16& text, base::string16* address) { 49 bool FindAddress(const base::string16& text, base::string16* address) {
52 size_t start, end; 50 size_t start, end;
53 if (FindAddress(text.begin(), text.end(), &start, &end)) { 51 if (FindAddress(text.begin(), text.end(), &start, &end)) {
54 size_t len = end >= start ? end - start : 0; 52 size_t len = end >= start ? end - start : 0;
55 address->assign(text.substr(start, len)); 53 address->assign(text.substr(start, len));
56 return true; 54 return true;
57 } 55 }
58 return false; 56 return false;
59 } 57 }
60 58
61 bool FindAddress(const base::string16::const_iterator& begin, 59 bool FindAddress(const base::string16::const_iterator& begin,
62 const base::string16::const_iterator& end, 60 const base::string16::const_iterator& end,
63 size_t* start_pos, 61 size_t* start_pos,
64 size_t* end_pos) { 62 size_t* end_pos) {
65 HouseNumberParser house_number_parser; 63 HouseNumberParser house_number_parser;
66 64
67 // Keep going through the input string until a potential house number is 65 // Keep going through the input string until a potential house number is
68 // detected. Start tokenizing the following words to find a valid 66 // detected. Start tokenizing the following words to find a valid
69 // street name within a word range. Then, find a state name followed 67 // street name within a word range. Then, find a state name followed
70 // by a valid zip code for that state. Also keep a look for any other 68 // by a valid zip code for that state. Also keep a look for any other
71 // possible house numbers to continue from in case of no match and for 69 // possible house numbers to continue from in case of no match and for
72 // state names not followed by a zip code (e.g. New York, NY 10000). 70 // state names not followed by a zip code (e.g. New York, NY 10000).
73 const base::string16 newline_delimiters = kNewlineDelimiters; 71 const base::string16 newline_delimiters = kNewlineDelimiters;
74 const base::string16 delimiters = base::kWhitespaceUTF16 + newline_delimiters; 72 const base::string16 delimiters = base::kWhitespaceUTF16 + newline_delimiters;
75 for (base::string16::const_iterator it = begin; it != end; ) { 73 for (base::string16::const_iterator it = begin; it != end;) {
76 Word house_number; 74 Word house_number;
77 if (!house_number_parser.Parse(it, end, &house_number)) 75 if (!house_number_parser.Parse(it, end, &house_number))
78 return false; 76 return false;
79 77
80 String16Tokenizer tokenizer(house_number.end, end, delimiters); 78 String16Tokenizer tokenizer(house_number.end, end, delimiters);
81 tokenizer.set_options(String16Tokenizer::RETURN_DELIMS); 79 tokenizer.set_options(String16Tokenizer::RETURN_DELIMS);
82 80
83 WordList words; 81 WordList words;
84 words.push_back(house_number); 82 words.push_back(house_number);
85 83
86 bool found_location_name = false; 84 bool found_location_name = false;
87 bool continue_on_house_number = true; 85 bool continue_on_house_number = true;
88 bool consecutive_house_numbers = true; 86 bool consecutive_house_numbers = true;
89 size_t next_house_number_word = 0; 87 size_t next_house_number_word = 0;
90 size_t num_lines = 1; 88 size_t num_lines = 1;
91 89
92 // Don't include the house number in the word count. 90 // Don't include the house number in the word count.
93 size_t next_word = 1; 91 size_t next_word = 1;
94 for (; next_word <= kMaxAddressWords + 1; ++next_word) { 92 for (; next_word <= kMaxAddressWords + 1; ++next_word) {
95
96 // Extract a new word from the tokenizer. 93 // Extract a new word from the tokenizer.
97 if (next_word == words.size()) { 94 if (next_word == words.size()) {
98 do { 95 do {
99 if (!tokenizer.GetNext()) 96 if (!tokenizer.GetNext())
100 return false; 97 return false;
101 98
102 // Check the number of address lines. 99 // Check the number of address lines.
103 if (tokenizer.token_is_delim() && newline_delimiters.find( 100 if (tokenizer.token_is_delim() &&
104 *tokenizer.token_begin()) != base::string16::npos) { 101 newline_delimiters.find(*tokenizer.token_begin()) !=
102 base::string16::npos) {
105 ++num_lines; 103 ++num_lines;
106 } 104 }
107 } while (tokenizer.token_is_delim()); 105 } while (tokenizer.token_is_delim());
108 106
109 if (num_lines > kMaxAddressLines) 107 if (num_lines > kMaxAddressLines)
110 break; 108 break;
111 109
112 words.push_back(Word(tokenizer.token_begin(), tokenizer.token_end())); 110 words.push_back(Word(tokenizer.token_begin(), tokenizer.token_end()));
113 } 111 }
114 112
115 // Check the word length. If too long, don't try to continue from 113 // Check the word length. If too long, don't try to continue from
116 // the next house number as no address can hold this word. 114 // the next house number as no address can hold this word.
117 const Word& current_word = words[next_word]; 115 const Word& current_word = words[next_word];
118 DCHECK_GT(std::distance(current_word.begin, current_word.end), 0); 116 DCHECK_GT(std::distance(current_word.begin, current_word.end), 0);
119 size_t current_word_length = std::distance( 117 size_t current_word_length =
120 current_word.begin, current_word.end); 118 std::distance(current_word.begin, current_word.end);
121 if (current_word_length > kMaxAddressNameWordLength) { 119 if (current_word_length > kMaxAddressNameWordLength) {
122 continue_on_house_number = false; 120 continue_on_house_number = false;
123 break; 121 break;
124 } 122 }
125 123
126 // Check if the new word is a valid house number. 124 // Check if the new word is a valid house number.
127 if (house_number_parser.Parse(current_word.begin, current_word.end, 125 if (house_number_parser.Parse(current_word.begin, current_word.end,
128 NULL)) { 126 NULL)) {
129 // Increase the number of consecutive house numbers since the beginning. 127 // Increase the number of consecutive house numbers since the beginning.
130 if (consecutive_house_numbers) { 128 if (consecutive_house_numbers) {
131 // Check if there is a new line between consecutive house numbers. 129 // Check if there is a new line between consecutive house numbers.
132 // This avoids false positives of the form "Cafe 21\n 750 Fifth Ave.." 130 // This avoids false positives of the form "Cafe 21\n 750 Fifth Ave.."
133 if (num_lines > 1) { 131 if (num_lines > 1) {
134 next_house_number_word = next_word; 132 next_house_number_word = next_word;
135 break; 133 break;
136 } 134 }
137 } 135 }
138 136
(...skipping 16 matching lines...) Expand all
155 } 153 }
156 154
157 // Don't count the house number. 155 // Don't count the house number.
158 if (next_word > kMinAddressWords) { 156 if (next_word > kMinAddressWords) {
159 // Looking for the state is likely to add new words to the list while 157 // Looking for the state is likely to add new words to the list while
160 // checking for multi-word state names. 158 // checking for multi-word state names.
161 size_t state_first_word = next_word; 159 size_t state_first_word = next_word;
162 size_t state_last_word, state_index; 160 size_t state_last_word, state_index;
163 if (FindStateStartingInWord(&words, state_first_word, &state_last_word, 161 if (FindStateStartingInWord(&words, state_first_word, &state_last_word,
164 &tokenizer, &state_index)) { 162 &tokenizer, &state_index)) {
165
166 // A location name should have been found at this point. 163 // A location name should have been found at this point.
167 if (!found_location_name) 164 if (!found_location_name)
168 break; 165 break;
169 166
170 // Explicitly exclude "et al", as "al" is a valid state code. 167 // Explicitly exclude "et al", as "al" is a valid state code.
171 if (current_word_length == 2 && words.size() > 2) { 168 if (current_word_length == 2 && words.size() > 2) {
172 const Word& previous_word = words[state_first_word - 1]; 169 const Word& previous_word = words[state_first_word - 1];
173 if (previous_word.end - previous_word.begin == 2 && 170 if (previous_word.end - previous_word.begin == 2 &&
174 base::LowerCaseEqualsASCII( 171 base::LowerCaseEqualsASCII(
175 base::StringPiece16(previous_word.begin, previous_word.end), 172 base::StringPiece16(previous_word.begin, previous_word.end),
176 "et") && 173 "et") &&
177 base::LowerCaseEqualsASCII( 174 base::LowerCaseEqualsASCII(
178 base::StringPiece16(current_word.begin, current_word.end), 175 base::StringPiece16(current_word.begin, current_word.end),
179 "al")) 176 "al"))
180 break; 177 break;
181 } 178 }
182 179
183 // Extract one more word from the tokenizer if not already available. 180 // Extract one more word from the tokenizer if not already available.
184 size_t zip_word = state_last_word + 1; 181 size_t zip_word = state_last_word + 1;
185 if (zip_word == words.size()) { 182 if (zip_word == words.size()) {
186 do { 183 do {
187 if (!tokenizer.GetNext()) { 184 if (!tokenizer.GetNext()) {
188 // The address ends with a state name without a zip code. This 185 // The address ends with a state name without a zip code. This
189 // is legal according to WebView#findAddress public 186 // is legal according to WebView#findAddress public
190 // documentation. 187 // documentation.
191 *start_pos = words[0].begin - begin; 188 *start_pos = words[0].begin - begin;
192 *end_pos = words[state_last_word].end - begin; 189 *end_pos = words[state_last_word].end - begin;
193 return true; 190 return true;
194 } 191 }
195 } while (tokenizer.token_is_delim()); 192 } while (tokenizer.token_is_delim());
196 words.push_back(Word(tokenizer.token_begin(), 193 words.push_back(
197 tokenizer.token_end())); 194 Word(tokenizer.token_begin(), tokenizer.token_end()));
198 } 195 }
199 196
200 // Check the parsing validity and state range of the zip code. 197 // Check the parsing validity and state range of the zip code.
201 next_word = state_last_word; 198 next_word = state_last_word;
202 if (!IsZipValid(words[zip_word], state_index)) 199 if (!IsZipValid(words[zip_word], state_index))
203 continue; 200 continue;
204 201
205 *start_pos = words[0].begin - begin; 202 *start_pos = words[0].begin - begin;
206 *end_pos = words[zip_word].end - begin; 203 *end_pos = words[zip_word].end - begin;
207 return true; 204 return true;
(...skipping 10 matching lines...) Expand all
218 next_word = std::min(next_word, words.size() - 1); 215 next_word = std::min(next_word, words.size() - 1);
219 it = words[next_word].end; 216 it = words[next_word].end;
220 } 217 }
221 } 218 }
222 219
223 return false; 220 return false;
224 } 221 }
225 222
226 } // namespace address_parser 223 } // namespace address_parser
227 224
228 } // namespace content 225 } // namespace android_webview
OLDNEW
« no previous file with comments | « android_webview/native/address_parser.h ('k') | android_webview/native/address_parser_internal.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698