Chromium Code Reviews| Index: chrome/tools/convert_dict/dic_reader.cc |
| =================================================================== |
| --- chrome/tools/convert_dict/dic_reader.cc (revision 7485) |
| +++ chrome/tools/convert_dict/dic_reader.cc (working copy) |
| @@ -43,43 +43,42 @@ |
| output->push_back(line.substr(slash_index + 1)); |
| } |
| -} // namespace |
| +// This function reads words from a .dic file, or a .dic_delta file. Note that |
|
brettw
2008/12/29 22:25:22
Can you add "// This function reads words from a .
|
| +// we read 'all' the words in the file, irrespective of the word count given |
| +// in the first non-empty line of a .dic file. Also note that, for a .dic_delta |
| +// file, the first line actually does _not_ have the number of words. In order |
| +// to control this, we use the |file_has_word_count_in_the_first_line| |
| +// parameter to tell this function whether the first non-empty line in the file |
| +// contains the number of words or not. If it does, skip the first line. If it |
| +// does not, then the first line contains a word. |
| +bool PopulateWordSet(WordSet* word_set, FILE* file, AffReader* aff_reader, |
| + const char* file_type, |
| + bool file_has_word_count_in_the_first_line) { |
| + if (file == NULL) |
| + return false; |
| -DicReader::DicReader(const std::string& filename) { |
| - file_ = file_util::OpenFile(filename, "r"); |
| -} |
| + printf("Extracting words from %s file...\n", file_type); |
| -DicReader::~DicReader() { |
| - if (file_) |
| - file_util::CloseFile(file_); |
| -} |
| - |
| -bool DicReader::Read(AffReader* aff_reader) { |
| - if (!file_) |
| - return false; |
| - |
| - bool got_count = false; |
| int line_number = 0; |
| - |
| - WordSet word_set; |
| - while (!feof(file_)) { |
| - std::string line = ReadLine(file_); |
| + while (!feof(file)) { |
| + std::string line = ReadLine(file); |
| line_number++; |
| StripComment(&line); |
| if (line.empty()) |
| continue; |
| - if (!got_count) { |
| + if (file_has_word_count_in_the_first_line) { |
| // Skip the first nonempty line, this is the line count. We don't bother |
| // with it and just read all the lines. |
| - got_count = true; |
| + file_has_word_count_in_the_first_line = false; |
| continue; |
| } |
| std::vector<std::string> split; |
| SplitDicLine(line, &split); |
| if (split.size() == 0 || split.size() > 2) { |
| - printf("Line %d has extra slashes in the dic file\n", line_number); |
| + printf("Line %d has extra slashes in the %s file\n", line_number, |
| + file_type); |
| return false; |
| } |
| @@ -87,8 +86,8 @@ |
| // always use UTF-8 as the encoding to simplify life. |
| std::string utf8word; |
| if (!aff_reader->EncodingToUTF8(split[0], &utf8word)) { |
| - printf("Unable to convert line %d from %s to UTF-8 in the dic file\n", |
| - line_number, aff_reader->encoding()); |
| + printf("Unable to convert line %d from %s to UTF-8 in the %s file\n", |
| + line_number, aff_reader->encoding(), file_type); |
| return false; |
| } |
| @@ -109,16 +108,49 @@ |
| affix_index = aff_reader->GetAFIndexForAFString(split[1]); |
| } |
| - WordSet::iterator found = word_set.find(utf8word); |
| - if (found == word_set.end()) { |
| + WordSet::iterator found = word_set->find(utf8word); |
| + if (found == word_set->end()) { |
| std::set<int> affix_vector; |
| affix_vector.insert(affix_index); |
| - word_set.insert(std::make_pair(utf8word, affix_vector)); |
| + word_set->insert(std::make_pair(utf8word, affix_vector)); |
| } else { |
| found->second.insert(affix_index); |
| } |
| } |
| + return true; |
| +} |
| + |
| +} // namespace |
| + |
| +DicReader::DicReader(const std::string& filename) { |
| + file_ = file_util::OpenFile(filename, "r"); |
| + additional_words_file_ = file_util::OpenFile(filename + "_delta", "r"); |
| +} |
| + |
| +DicReader::~DicReader() { |
| + if (file_) |
| + file_util::CloseFile(file_); |
| + if (additional_words_file_) |
| + file_util::CloseFile(additional_words_file_); |
| +} |
| + |
| +bool DicReader::Read(AffReader* aff_reader) { |
| + if (!file_) |
| + return false; |
| + |
| + WordSet word_set; |
| + |
| + // Add words from the dic file to the word set. |
| + // Note that the first line is the word count in the file. |
| + if (!PopulateWordSet(&word_set, file_, aff_reader, "dic", true)) |
| + return false; |
| + |
| + // Add words from the dic delta file to the word set, if it exists. |
| + // The first line is the first word to add. Word count line is not present. |
| + PopulateWordSet(&word_set, additional_words_file_, aff_reader, "dic delta", |
|
brettw
2008/12/29 22:25:22
It seems like it would be better to not call the f
|
| + false); |
| + |
| // Make sure the words are sorted, they may be unsorted in the input. |
| for (WordSet::iterator word = word_set.begin(); word != word_set.end(); |
| ++word) { |