chrome/tools/convert_dict/dic_reader.cc - Issue 11776032: Unit test for spellchecking 96- through 102-character words

Side by Side Diff: chrome/tools/convert_dict/dic_reader.cc

Issue 11776032: Unit test for spellchecking 96- through 102-character words (Closed) Base URL: http://git.chromium.org/chromium/src.git@master

Patch Set: Created 7 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "chrome/tools/convert_dict/dic_reader.h"	5 #include "chrome/tools/convert_dict/dic_reader.h"

6	6

7 #include <algorithm>	7 #include <algorithm>

8 #include <set>	8 #include <set>

9	9

10 #include "base/file_util.h"	10 #include "base/file_util.h"

(...skipping 99 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
110 // token. (It is attached to the first token if a word doesn't have affix	110 // token. (It is attached to the first token if a word doesn't have affix

111 // rules.)	111 // rules.)

112 size_t word_tab_offset = utf8word.find('\t');	112 size_t word_tab_offset = utf8word.find('\t');

113 if (word_tab_offset != std::string::npos)	113 if (word_tab_offset != std::string::npos)

114 utf8word = utf8word.substr(0, word_tab_offset);	114 utf8word = utf8word.substr(0, word_tab_offset);

115	115

116 WordSet::iterator found = word_set->find(utf8word);	116 WordSet::iterator found = word_set->find(utf8word);

117 std::set<int> affix_vector;	117 std::set<int> affix_vector;

118 affix_vector.insert(affix_index);	118 affix_vector.insert(affix_index);

119	119

120 if (found == word_set->end()) {	120 if (found == word_set->end())

121 word_set->insert(std::make_pair(utf8word, affix_vector));	121 word_set->insert(std::make_pair(utf8word, affix_vector));

122 } else {	122 else

123 // The affixes of the delta file should override those in the	123 found->second.insert(affix_index);
	groby-ooo-7-16 2013/01/10 01:35:46 I'm worried that the previous behavior was needed for other dictionaries. Have you checked if we _always_ want to merge affixes? (Also, we should leave a comment here that the merge is intentional, including an example of where it's needed, just for future generations.)
124 // dictionary file.

125 found->second.swap(affix_vector);

126 }

127 }	124 }

128	125

129 return true;	126 return true;

130 }	127 }

131	128

132 } // namespace	129 } // namespace

133	130

134 DicReader::DicReader(const FilePath& path) {	131 DicReader::DicReader(const FilePath& path) {

135 file_ = file_util::OpenFile(path, "r");	132 file_ = file_util::OpenFile(path, "r");

136	133

(...skipping 26 matching lines...) Expand all Loading...
163 aff_reader->encoding(), true))	160 aff_reader->encoding(), true))

164 return false;	161 return false;

165	162

166 // Add words from the .dic_delta file to the word set, if it exists.	163 // Add words from the .dic_delta file to the word set, if it exists.

167 // The first line is the first word to add. Word count line is not present.	164 // The first line is the first word to add. Word count line is not present.

168 // NOTE: These additional words should be encoded as UTF-8.	165 // NOTE: These additional words should be encoded as UTF-8.

169 if (additional_words_file_ != NULL) {	166 if (additional_words_file_ != NULL) {

170 PopulateWordSet(&word_set, additional_words_file_, aff_reader, "dic delta",	167 PopulateWordSet(&word_set, additional_words_file_, aff_reader, "dic delta",

171 "UTF-8", false);	168 "UTF-8", false);

172 }	169 }

173

174 // Make sure the words are sorted, they may be unsorted in the input.	170 // Make sure the words are sorted, they may be unsorted in the input.

175 for (WordSet::iterator word = word_set.begin(); word != word_set.end();	171 for (WordSet::iterator word = word_set.begin(); word != word_set.end();

176 ++word) {	172 ++word) {

177 std::vector<int> affixes;	173 std::vector<int> affixes;

178 for (std::set<int>::iterator aff = word->second.begin();	174 for (std::set<int>::iterator aff = word->second.begin();

179 aff != word->second.end(); ++aff)	175 aff != word->second.end(); ++aff)

180 affixes.push_back(*aff);	176 affixes.push_back(*aff);

181	177

182 // Double check that the affixes are sorted. This isn't strictly necessary	178 // Double check that the affixes are sorted. This isn't strictly necessary

183 // but it's nice for the file to have a fixed layout.	179 // but it's nice for the file to have a fixed layout.

184 std::sort(affixes.begin(), affixes.end());	180 std::sort(affixes.begin(), affixes.end());

	181 std::reverse(affixes.begin(), affixes.end());

185 words_.push_back(std::make_pair(word->first, affixes));	182 words_.push_back(std::make_pair(word->first, affixes));

186 }	183 }

187	184

188 // Double-check that the words are sorted.	185 // Double-check that the words are sorted.

189 std::sort(words_.begin(), words_.end());	186 std::sort(words_.begin(), words_.end());

190 return true;	187 return true;

191 }	188 }

192	189

193 } // namespace convert_dict	190 } // namespace convert_dict

OLD	NEW

« chrome/tools/convert_dict/aff_reader.cc ('K') | « chrome/tools/convert_dict/aff_reader.cc ('k') | no next file » | no next file with comments »