Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "chrome/tools/convert_dict/dic_reader.h" | 5 #include "chrome/tools/convert_dict/dic_reader.h" |
| 6 | 6 |
| 7 #include <algorithm> | 7 #include <algorithm> |
| 8 #include <set> | 8 #include <set> |
| 9 | 9 |
| 10 #include "base/file_util.h" | 10 #include "base/file_util.h" |
| (...skipping 25 matching lines...) Expand all Loading... | |
| 36 // convert all escaped slashes ("\/" sequences) to regular slashes. | 36 // convert all escaped slashes ("\/" sequences) to regular slashes. |
| 37 std::string word = line.substr(0, slash_index); | 37 std::string word = line.substr(0, slash_index); |
| 38 ReplaceSubstringsAfterOffset(&word, 0, "\\/", "/"); | 38 ReplaceSubstringsAfterOffset(&word, 0, "\\/", "/"); |
| 39 output->push_back(word); | 39 output->push_back(word); |
| 40 | 40 |
| 41 // Everything (if anything) after the slash is the second. | 41 // Everything (if anything) after the slash is the second. |
| 42 if (slash_index < line.size() - 1) | 42 if (slash_index < line.size() - 1) |
| 43 output->push_back(line.substr(slash_index + 1)); | 43 output->push_back(line.substr(slash_index + 1)); |
| 44 } | 44 } |
| 45 | 45 |
| 46 } // namespace | 46 // This function reads words from a .dic file, or a .dic_delta file. Note that |
|
brettw
2008/12/29 22:25:22
Can you add "// This function reads words from a .
| |
| 47 | 47 // we read 'all' the words in the file, irrespective of the word count given |
| 48 DicReader::DicReader(const std::string& filename) { | 48 // in the first non empty line of a .dic file. Also note that, for a .dic_delta |
| 49 file_ = file_util::OpenFile(filename, "r"); | 49 // file, the first line actually does _not_ have the number of words. In order |
| 50 } | 50 // to control this, we use the |file_has_word_count_in_the_first_line| |
| 51 | 51 // parameter to tell this method whether the first non empty line in the file |
| 52 DicReader::~DicReader() { | 52 // contains the number of words or not. If it does, skip the first line. If it |
| 53 if (file_) | 53 // does not, then the first line contains a word. |
| 54 file_util::CloseFile(file_); | 54 bool PopulateWordSet(WordSet* word_set, FILE* file, AffReader* aff_reader, |
| 55 } | 55 const char* file_type, |
| 56 | 56 bool file_has_word_count_in_the_first_line) { |
| 57 bool DicReader::Read(AffReader* aff_reader) { | 57 if (file == NULL) |
| 58 if (!file_) | |
| 59 return false; | 58 return false; |
| 60 | 59 |
| 61 bool got_count = false; | 60 printf("Extracting words from %s file...\n", file_type); |
| 61 | |
| 62 int line_number = 0; | 62 int line_number = 0; |
| 63 | 63 while (!feof(file)) { |
| 64 WordSet word_set; | 64 std::string line = ReadLine(file); |
| 65 while (!feof(file_)) { | |
| 66 std::string line = ReadLine(file_); | |
| 67 line_number++; | 65 line_number++; |
| 68 StripComment(&line); | 66 StripComment(&line); |
| 69 if (line.empty()) | 67 if (line.empty()) |
| 70 continue; | 68 continue; |
| 71 | 69 |
| 72 if (!got_count) { | 70 if (file_has_word_count_in_the_first_line) { |
| 73 // Skip the first nonempty line, this is the line count. We don't bother | 71 // Skip the first nonempty line, this is the line count. We don't bother |
| 74 // with it and just read all the lines. | 72 // with it and just read all the lines. |
| 75 got_count = true; | 73 file_has_word_count_in_the_first_line = false; |
| 76 continue; | 74 continue; |
| 77 } | 75 } |
| 78 | 76 |
| 79 std::vector<std::string> split; | 77 std::vector<std::string> split; |
| 80 SplitDicLine(line, &split); | 78 SplitDicLine(line, &split); |
| 81 if (split.size() == 0 || split.size() > 2) { | 79 if (split.size() == 0 || split.size() > 2) { |
| 82 printf("Line %d has extra slashes in the dic file\n", line_number); | 80 printf("Line %d has extra slashes in the %s file\n", line_number, |
| 81 file_type); | |
| 83 return false; | 82 return false; |
| 84 } | 83 } |
| 85 | 84 |
| 86 // The first part is the word, the second (optional) part is the affix. We | 85 // The first part is the word, the second (optional) part is the affix. We |
| 87 // always use UTF-8 as the encoding to simplify life. | 86 // always use UTF-8 as the encoding to simplify life. |
| 88 std::string utf8word; | 87 std::string utf8word; |
| 89 if (!aff_reader->EncodingToUTF8(split[0], &utf8word)) { | 88 if (!aff_reader->EncodingToUTF8(split[0], &utf8word)) { |
| 90 printf("Unable to convert line %d from %s to UTF-8 in the dic file\n", | 89 printf("Unable to convert line %d from %s to UTF-8 in the %s file\n", |
| 91 line_number, aff_reader->encoding()); | 90 line_number, aff_reader->encoding(), file_type); |
| 92 return false; | 91 return false; |
| 93 } | 92 } |
| 94 | 93 |
| 95 // We always convert the affix to an index. 0 means no affix. | 94 // We always convert the affix to an index. 0 means no affix. |
| 96 int affix_index = 0; | 95 int affix_index = 0; |
| 97 if (split.size() == 2) { | 96 if (split.size() == 2) { |
| 98 // Got a rule, which is the stuff after the slash. The line may also have | 97 // Got a rule, which is the stuff after the slash. The line may also have |
| 99 // an optional term separated by a tab. This is the morphological | 98 // an optional term separated by a tab. This is the morphological |
| 100 // description. We don't care about this (it is used in the tests to | 99 // description. We don't care about this (it is used in the tests to |
| 101 // generate a nice dump), so we remove it. | 100 // generate a nice dump), so we remove it. |
| 102 size_t split1_tab_offset = split[1].find('\t'); | 101 size_t split1_tab_offset = split[1].find('\t'); |
| 103 if (split1_tab_offset != std::string::npos) | 102 if (split1_tab_offset != std::string::npos) |
| 104 split[1] = split[1].substr(0, split1_tab_offset); | 103 split[1] = split[1].substr(0, split1_tab_offset); |
| 105 | 104 |
| 106 if (aff_reader->has_indexed_affixes()) | 105 if (aff_reader->has_indexed_affixes()) |
| 107 affix_index = atoi(split[1].c_str()); | 106 affix_index = atoi(split[1].c_str()); |
| 108 else | 107 else |
| 109 affix_index = aff_reader->GetAFIndexForAFString(split[1]); | 108 affix_index = aff_reader->GetAFIndexForAFString(split[1]); |
| 110 } | 109 } |
| 111 | 110 |
| 112 WordSet::iterator found = word_set.find(utf8word); | 111 WordSet::iterator found = word_set->find(utf8word); |
| 113 if (found == word_set.end()) { | 112 if (found == word_set->end()) { |
| 114 std::set<int> affix_vector; | 113 std::set<int> affix_vector; |
| 115 affix_vector.insert(affix_index); | 114 affix_vector.insert(affix_index); |
| 116 word_set.insert(std::make_pair(utf8word, affix_vector)); | 115 word_set->insert(std::make_pair(utf8word, affix_vector)); |
| 117 } else { | 116 } else { |
| 118 found->second.insert(affix_index); | 117 found->second.insert(affix_index); |
| 119 } | 118 } |
| 120 } | 119 } |
| 121 | 120 |
| 121 return true; | |
| 122 } | |
| 123 | |
| 124 } // namespace | |
| 125 | |
| 126 DicReader::DicReader(const std::string& filename) { | |
| 127 file_ = file_util::OpenFile(filename, "r"); | |
| 128 additional_words_file_ = file_util::OpenFile(filename + "_delta", "r"); | |
| 129 } | |
| 130 | |
| 131 DicReader::~DicReader() { | |
| 132 if (file_) | |
| 133 file_util::CloseFile(file_); | |
| 134 if (additional_words_file_) | |
| 135 file_util::CloseFile(additional_words_file_); | |
| 136 } | |
| 137 | |
| 138 bool DicReader::Read(AffReader* aff_reader) { | |
| 139 if (!file_) | |
| 140 return false; | |
| 141 | |
| 142 WordSet word_set; | |
| 143 | |
| 144 // Add words from the dic file to the word set. | |
| 145 // Note that the first line is the word count in the file. | |
| 146 if (!PopulateWordSet(&word_set, file_, aff_reader, "dic", true)) | |
| 147 return false; | |
| 148 | |
| 149 // Add words from the dic delta file to the word set, if it exists. | |
| 150 // The first line is the first word to add. Word count line is not present. | |
| 151 PopulateWordSet(&word_set, additional_words_file_, aff_reader, "dic delta", | |
|
brettw
2008/12/29 22:25:22
It seems like it would be better to not call the f
| |
| 152 false); | |
| 153 | |
| 122 // Make sure the words are sorted, they may be unsorted in the input. | 154 // Make sure the words are sorted, they may be unsorted in the input. |
| 123 for (WordSet::iterator word = word_set.begin(); word != word_set.end(); | 155 for (WordSet::iterator word = word_set.begin(); word != word_set.end(); |
| 124 ++word) { | 156 ++word) { |
| 125 std::vector<int> affixes; | 157 std::vector<int> affixes; |
| 126 for (std::set<int>::iterator aff = word->second.begin(); | 158 for (std::set<int>::iterator aff = word->second.begin(); |
| 127 aff != word->second.end(); ++aff) | 159 aff != word->second.end(); ++aff) |
| 128 affixes.push_back(*aff); | 160 affixes.push_back(*aff); |
| 129 | 161 |
| 130 // Double check that the affixes are sorted. This isn't strictly necessary | 162 // Double check that the affixes are sorted. This isn't strictly necessary |
| 131 // but it's nice for the file to have a fixed layout. | 163 // but it's nice for the file to have a fixed layout. |
| 132 std::sort(affixes.begin(), affixes.end()); | 164 std::sort(affixes.begin(), affixes.end()); |
| 133 words_.push_back(std::make_pair(word->first, affixes)); | 165 words_.push_back(std::make_pair(word->first, affixes)); |
| 134 } | 166 } |
| 135 | 167 |
| 136 // Double-check that the words are sorted. | 168 // Double-check that the words are sorted. |
| 137 std::sort(words_.begin(), words_.end()); | 169 std::sort(words_.begin(), words_.end()); |
| 138 return true; | 170 return true; |
| 139 } | 171 } |
| 140 | 172 |
| 141 } // namespace convert_dict | 173 } // namespace convert_dict |
| 142 | 174 |
| OLD | NEW |