android_webview/native/address_parser.cc - Issue 2803163002: Move address parser and prefixes to android_webview/.

Side by Side Diff: android_webview/native/address_parser.cc

Issue 2803163002: Move address parser and prefixes to android_webview/. (Closed)

Patch Set: "Bring back ContentViewStatics import" (created 3 years, 8 months ago)

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "content/common/android/address_parser.h"	5 #include "android_webview/native/address_parser.h"

6	6

	7 #include "android_webview/native/address_parser_internal.h"

7 #include "base/logging.h"	8 #include "base/logging.h"

8 #include "base/strings/string_util.h"	9 #include "base/strings/string_util.h"

9 #include "content/common/android/address_parser_internal.h"

10	10

11 namespace {	11 namespace {

12	12

13 // Minimum number of words in an address after the house number	13 // Minimum number of words in an address after the house number

14 // before a state is expected to be found.	14 // before a state is expected to be found.

15 // A value too high can miss short addresses.	15 // A value too high can miss short addresses.

16 const size_t kMinAddressWords = 3;	16 const size_t kMinAddressWords = 3;

17	17

18 // Maximum number of words allowed in an address between the house number	18 // Maximum number of words allowed in an address between the house number

19 // and the state, both not included.	19 // and the state, both not included.

20 const size_t kMaxAddressWords = 12;	20 const size_t kMaxAddressWords = 12;

21	21

22 // Maximum number of lines allowed in an address between the house number	22 // Maximum number of lines allowed in an address between the house number

23 // and the state, both not included.	23 // and the state, both not included.

24 const size_t kMaxAddressLines = 5;	24 const size_t kMaxAddressLines = 5;

25	25

26 // Maximum length allowed for any address word between the house number	26 // Maximum length allowed for any address word between the house number

27 // and the state, both not included.	27 // and the state, both not included.

28 const size_t kMaxAddressNameWordLength = 25;	28 const size_t kMaxAddressNameWordLength = 25;

29	29

30 // Maximum number of words after the house number in which the location name	30 // Maximum number of words after the house number in which the location name

31 // should be found.	31 // should be found.

32 const size_t kMaxLocationNameDistance = 4;	32 const size_t kMaxLocationNameDistance = 4;

33	33

34 // Additional characters used as new line delimiters.	34 // Additional characters used as new line delimiters.

35 const base::char16 kNewlineDelimiters[] = {	35 const base::char16 kNewlineDelimiters[] = {

36 '\n',	36 '\n', ',', '*',

37 ',',	37 0x2022, // Unicode bullet

38 '*',	38 0,

39 0x2022, // Unicode bullet

40 0,

41 };	39 };

42	40

43 } // anonymous namespace	41 } // anonymous namespace

44	42

45 namespace content {	43 namespace android_webview {

46	44

47 namespace address_parser {	45 namespace address_parser {

48	46

49 using namespace internal;	47 using namespace internal;

50	48

51 bool FindAddress(const base::string16& text, base::string16* address) {	49 bool FindAddress(const base::string16& text, base::string16* address) {

52 size_t start, end;	50 size_t start, end;

53 if (FindAddress(text.begin(), text.end(), &start, &end)) {	51 if (FindAddress(text.begin(), text.end(), &start, &end)) {

54 size_t len = end >= start ? end - start : 0;	52 size_t len = end >= start ? end - start : 0;

55 address->assign(text.substr(start, len));	53 address->assign(text.substr(start, len));

56 return true;	54 return true;

57 }	55 }

58 return false;	56 return false;

59 }	57 }

60	58

61 bool FindAddress(const base::string16::const_iterator& begin,	59 bool FindAddress(const base::string16::const_iterator& begin,

62 const base::string16::const_iterator& end,	60 const base::string16::const_iterator& end,

63 size_t* start_pos,	61 size_t* start_pos,

64 size_t* end_pos) {	62 size_t* end_pos) {

65 HouseNumberParser house_number_parser;	63 HouseNumberParser house_number_parser;

66	64

67 // Keep going through the input string until a potential house number is	65 // Keep going through the input string until a potential house number is

68 // detected. Start tokenizing the following words to find a valid	66 // detected. Start tokenizing the following words to find a valid

69 // street name within a word range. Then, find a state name followed	67 // street name within a word range. Then, find a state name followed

70 // by a valid zip code for that state. Also keep a look for any other	68 // by a valid zip code for that state. Also keep a look for any other

71 // possible house numbers to continue from in case of no match and for	69 // possible house numbers to continue from in case of no match and for

72 // state names not followed by a zip code (e.g. New York, NY 10000).	70 // state names not followed by a zip code (e.g. New York, NY 10000).

73 const base::string16 newline_delimiters = kNewlineDelimiters;	71 const base::string16 newline_delimiters = kNewlineDelimiters;

74 const base::string16 delimiters = base::kWhitespaceUTF16 + newline_delimiters;	72 const base::string16 delimiters = base::kWhitespaceUTF16 + newline_delimiters;

75 for (base::string16::const_iterator it = begin; it != end; ) {	73 for (base::string16::const_iterator it = begin; it != end;) {

76 Word house_number;	74 Word house_number;

77 if (!house_number_parser.Parse(it, end, &house_number))	75 if (!house_number_parser.Parse(it, end, &house_number))

78 return false;	76 return false;

79	77

80 String16Tokenizer tokenizer(house_number.end, end, delimiters);	78 String16Tokenizer tokenizer(house_number.end, end, delimiters);

81 tokenizer.set_options(String16Tokenizer::RETURN_DELIMS);	79 tokenizer.set_options(String16Tokenizer::RETURN_DELIMS);

82	80

83 WordList words;	81 WordList words;

84 words.push_back(house_number);	82 words.push_back(house_number);

85	83

86 bool found_location_name = false;	84 bool found_location_name = false;

87 bool continue_on_house_number = true;	85 bool continue_on_house_number = true;

88 bool consecutive_house_numbers = true;	86 bool consecutive_house_numbers = true;

89 size_t next_house_number_word = 0;	87 size_t next_house_number_word = 0;

90 size_t num_lines = 1;	88 size_t num_lines = 1;

91	89

92 // Don't include the house number in the word count.	90 // Don't include the house number in the word count.

93 size_t next_word = 1;	91 size_t next_word = 1;

94 for (; next_word <= kMaxAddressWords + 1; ++next_word) {	92 for (; next_word <= kMaxAddressWords + 1; ++next_word) {

95

96 // Extract a new word from the tokenizer.	93 // Extract a new word from the tokenizer.

97 if (next_word == words.size()) {	94 if (next_word == words.size()) {

98 do {	95 do {

99 if (!tokenizer.GetNext())	96 if (!tokenizer.GetNext())

100 return false;	97 return false;

101	98

102 // Check the number of address lines.	99 // Check the number of address lines.

103 if (tokenizer.token_is_delim() && newline_delimiters.find(	100 if (tokenizer.token_is_delim() &&

104 *tokenizer.token_begin()) != base::string16::npos) {	101 newline_delimiters.find(*tokenizer.token_begin()) !=

	102 base::string16::npos) {

105 ++num_lines;	103 ++num_lines;

106 }	104 }

107 } while (tokenizer.token_is_delim());	105 } while (tokenizer.token_is_delim());

108	106

109 if (num_lines > kMaxAddressLines)	107 if (num_lines > kMaxAddressLines)

110 break;	108 break;

111	109

112 words.push_back(Word(tokenizer.token_begin(), tokenizer.token_end()));	110 words.push_back(Word(tokenizer.token_begin(), tokenizer.token_end()));

113 }	111 }

114	112

115 // Check the word length. If too long, don't try to continue from	113 // Check the word length. If too long, don't try to continue from

116 // the next house number as no address can hold this word.	114 // the next house number as no address can hold this word.

117 const Word& current_word = words[next_word];	115 const Word& current_word = words[next_word];

118 DCHECK_GT(std::distance(current_word.begin, current_word.end), 0);	116 DCHECK_GT(std::distance(current_word.begin, current_word.end), 0);

119 size_t current_word_length = std::distance(	117 size_t current_word_length =

120 current_word.begin, current_word.end);	118 std::distance(current_word.begin, current_word.end);

121 if (current_word_length > kMaxAddressNameWordLength) {	119 if (current_word_length > kMaxAddressNameWordLength) {

122 continue_on_house_number = false;	120 continue_on_house_number = false;

123 break;	121 break;

124 }	122 }

125	123

126 // Check if the new word is a valid house number.	124 // Check if the new word is a valid house number.

127 if (house_number_parser.Parse(current_word.begin, current_word.end,	125 if (house_number_parser.Parse(current_word.begin, current_word.end,

128 NULL)) {	126 NULL)) {

129 // Increase the number of consecutive house numbers since the beginning.	127 // Increase the number of consecutive house numbers since the beginning.

130 if (consecutive_house_numbers) {	128 if (consecutive_house_numbers) {

131 // Check if there is a new line between consecutive house numbers.	129 // Check if there is a new line between consecutive house numbers.

132 // This avoids false positives of the form "Cafe 21\n 750 Fifth Ave.."	130 // This avoids false positives of the form "Cafe 21\n 750 Fifth Ave.."

133 if (num_lines > 1) {	131 if (num_lines > 1) {

134 next_house_number_word = next_word;	132 next_house_number_word = next_word;

135 break;	133 break;

136 }	134 }

137 }	135 }

138	136

(...skipping 16 matching lines...)
155 }	153 }

156	154

157 // Don't count the house number.	155 // Don't count the house number.

158 if (next_word > kMinAddressWords) {	156 if (next_word > kMinAddressWords) {

159 // Looking for the state is likely to add new words to the list while	157 // Looking for the state is likely to add new words to the list while

160 // checking for multi-word state names.	158 // checking for multi-word state names.

161 size_t state_first_word = next_word;	159 size_t state_first_word = next_word;

162 size_t state_last_word, state_index;	160 size_t state_last_word, state_index;

163 if (FindStateStartingInWord(&words, state_first_word, &state_last_word,	161 if (FindStateStartingInWord(&words, state_first_word, &state_last_word,

164 &tokenizer, &state_index)) {	162 &tokenizer, &state_index)) {

165

166 // A location name should have been found at this point.	163 // A location name should have been found at this point.

167 if (!found_location_name)	164 if (!found_location_name)

168 break;	165 break;

169	166

170 // Explicitly exclude "et al", as "al" is a valid state code.	167 // Explicitly exclude "et al", as "al" is a valid state code.

171 if (current_word_length == 2 && words.size() > 2) {	168 if (current_word_length == 2 && words.size() > 2) {

172 const Word& previous_word = words[state_first_word - 1];	169 const Word& previous_word = words[state_first_word - 1];

173 if (previous_word.end - previous_word.begin == 2 &&	170 if (previous_word.end - previous_word.begin == 2 &&

174 base::LowerCaseEqualsASCII(	171 base::LowerCaseEqualsASCII(

175 base::StringPiece16(previous_word.begin, previous_word.end),	172 base::StringPiece16(previous_word.begin, previous_word.end),

176 "et") &&	173 "et") &&

177 base::LowerCaseEqualsASCII(	174 base::LowerCaseEqualsASCII(

178 base::StringPiece16(current_word.begin, current_word.end),	175 base::StringPiece16(current_word.begin, current_word.end),

179 "al"))	176 "al"))

180 break;	177 break;

181 }	178 }

182	179

183 // Extract one more word from the tokenizer if not already available.	180 // Extract one more word from the tokenizer if not already available.

184 size_t zip_word = state_last_word + 1;	181 size_t zip_word = state_last_word + 1;

185 if (zip_word == words.size()) {	182 if (zip_word == words.size()) {

186 do {	183 do {

187 if (!tokenizer.GetNext()) {	184 if (!tokenizer.GetNext()) {

188 // The address ends with a state name without a zip code. This	185 // The address ends with a state name without a zip code. This

189 // is legal according to WebView#findAddress public	186 // is legal according to WebView#findAddress public

190 // documentation.	187 // documentation.

191 *start_pos = words[0].begin - begin;	188 *start_pos = words[0].begin - begin;

192 *end_pos = words[state_last_word].end - begin;	189 *end_pos = words[state_last_word].end - begin;

193 return true;	190 return true;

194 }	191 }

195 } while (tokenizer.token_is_delim());	192 } while (tokenizer.token_is_delim());

196 words.push_back(Word(tokenizer.token_begin(),	193 words.push_back(

197 tokenizer.token_end()));	194 Word(tokenizer.token_begin(), tokenizer.token_end()));

198 }	195 }

199	196

200 // Check the parsing validity and state range of the zip code.	197 // Check the parsing validity and state range of the zip code.

201 next_word = state_last_word;	198 next_word = state_last_word;

202 if (!IsZipValid(words[zip_word], state_index))	199 if (!IsZipValid(words[zip_word], state_index))

203 continue;	200 continue;

204	201

205 *start_pos = words[0].begin - begin;	202 *start_pos = words[0].begin - begin;

206 *end_pos = words[zip_word].end - begin;	203 *end_pos = words[zip_word].end - begin;

207 return true;	204 return true;

(...skipping 10 matching lines...)
218 next_word = std::min(next_word, words.size() - 1);	215 next_word = std::min(next_word, words.size() - 1);

219 it = words[next_word].end;	216 it = words[next_word].end;

220 }	217 }

221 }	218 }

222	219

223 return false;	220 return false;

224 }	221 }

225	222

226 } // namespace address_parser	223 } // namespace address_parser

227	224

228 } // namespace content	225 } // namespace android_webview

OLD	NEW

« no previous file with comments | « android_webview/native/address_parser.h ('k') | android_webview/native/address_parser_internal.h » ('j') | no next file with comments »