OLD | NEW |
---|---|
1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "chrome/tools/convert_dict/dic_reader.h" | 5 #include "chrome/tools/convert_dict/dic_reader.h" |
6 | 6 |
7 #include <algorithm> | 7 #include <algorithm> |
8 #include <set> | 8 #include <set> |
9 | 9 |
10 #include "base/file_util.h" | 10 #include "base/file_util.h" |
(...skipping 25 matching lines...) Expand all Loading... | |
36 // convert all escaped slashes ("\/" sequences) to regular slashes. | 36 // convert all escaped slashes ("\/" sequences) to regular slashes. |
37 std::string word = line.substr(0, slash_index); | 37 std::string word = line.substr(0, slash_index); |
38 ReplaceSubstringsAfterOffset(&word, 0, "\\/", "/"); | 38 ReplaceSubstringsAfterOffset(&word, 0, "\\/", "/"); |
39 output->push_back(word); | 39 output->push_back(word); |
40 | 40 |
41 // Everything (if anything) after the slash is the second. | 41 // Everything (if anything) after the slash is the second. |
42 if (slash_index < line.size() - 1) | 42 if (slash_index < line.size() - 1) |
43 output->push_back(line.substr(slash_index + 1)); | 43 output->push_back(line.substr(slash_index + 1)); |
44 } | 44 } |
45 | 45 |
46 } // namespace | 46 // This function reads words from a .dic file, or a .dic_delta file. Note that |
brettw
2008/12/29 22:25:22
Can you add "// This function reads words from a .
| |
47 | 47 // we read 'all' the words in the file, irrespective of the word count given |
48 DicReader::DicReader(const std::string& filename) { | 48 // in the first non-empty line of a .dic file. Also note that, for a .dic_delta |
49 file_ = file_util::OpenFile(filename, "r"); | 49 // file, the first line actually does _not_ have the number of words. In order |
50 } | 50 // to control this, we use the |file_has_word_count_in_the_first_line| |
51 | 51 // parameter to tell this method whether the first non-empty line in the file |
52 DicReader::~DicReader() { | 52 // contains the number of words or not. If it does, skip the first line. If it |
53 if (file_) | 53 // does not, then the first line contains a word. |
54 file_util::CloseFile(file_); | 54 bool PopulateWordSet(WordSet* word_set, FILE* file, AffReader* aff_reader, |
55 } | 55 const char* file_type, |
56 | 56 bool file_has_word_count_in_the_first_line) { |
57 bool DicReader::Read(AffReader* aff_reader) { | 57 if (file == NULL) |
58 if (!file_) | |
59 return false; | 58 return false; |
60 | 59 |
61 bool got_count = false; | 60 printf("Extracting words from %s file...\n", file_type); |
61 | |
62 int line_number = 0; | 62 int line_number = 0; |
63 | 63 while (!feof(file)) { |
64 WordSet word_set; | 64 std::string line = ReadLine(file); |
65 while (!feof(file_)) { | |
66 std::string line = ReadLine(file_); | |
67 line_number++; | 65 line_number++; |
68 StripComment(&line); | 66 StripComment(&line); |
69 if (line.empty()) | 67 if (line.empty()) |
70 continue; | 68 continue; |
71 | 69 |
72 if (!got_count) { | 70 if (file_has_word_count_in_the_first_line) { |
73 // Skip the first nonempty line, this is the line count. We don't bother | 71 // Skip the first nonempty line, this is the line count. We don't bother |
74 // with it and just read all the lines. | 72 // with it and just read all the lines. |
75 got_count = true; | 73 file_has_word_count_in_the_first_line = false; |
76 continue; | 74 continue; |
77 } | 75 } |
78 | 76 |
79 std::vector<std::string> split; | 77 std::vector<std::string> split; |
80 SplitDicLine(line, &split); | 78 SplitDicLine(line, &split); |
81 if (split.size() == 0 || split.size() > 2) { | 79 if (split.size() == 0 || split.size() > 2) { |
82 printf("Line %d has extra slashes in the dic file\n", line_number); | 80 printf("Line %d has extra slashes in the %s file\n", line_number, |
81 file_type); | |
83 return false; | 82 return false; |
84 } | 83 } |
85 | 84 |
86 // The first part is the word, the second (optional) part is the affix. We | 85 // The first part is the word, the second (optional) part is the affix. We |
87 // always use UTF-8 as the encoding to simplify life. | 86 // always use UTF-8 as the encoding to simplify life. |
88 std::string utf8word; | 87 std::string utf8word; |
89 if (!aff_reader->EncodingToUTF8(split[0], &utf8word)) { | 88 if (!aff_reader->EncodingToUTF8(split[0], &utf8word)) { |
90 printf("Unable to convert line %d from %s to UTF-8 in the dic file\n", | 89 printf("Unable to convert line %d from %s to UTF-8 in the %s file\n", |
91 line_number, aff_reader->encoding()); | 90 line_number, aff_reader->encoding(), file_type); |
92 return false; | 91 return false; |
93 } | 92 } |
94 | 93 |
95 // We always convert the affix to an index. 0 means no affix. | 94 // We always convert the affix to an index. 0 means no affix. |
96 int affix_index = 0; | 95 int affix_index = 0; |
97 if (split.size() == 2) { | 96 if (split.size() == 2) { |
98 // Got a rule, which is the stuff after the slash. The line may also have | 97 // Got a rule, which is the stuff after the slash. The line may also have |
99 // an optional term separated by a tab. This is the morphological | 98 // an optional term separated by a tab. This is the morphological |
100 // description. We don't care about this (it is used in the tests to | 99 // description. We don't care about this (it is used in the tests to |
101 // generate a nice dump), so we remove it. | 100 // generate a nice dump), so we remove it. |
102 size_t split1_tab_offset = split[1].find('\t'); | 101 size_t split1_tab_offset = split[1].find('\t'); |
103 if (split1_tab_offset != std::string::npos) | 102 if (split1_tab_offset != std::string::npos) |
104 split[1] = split[1].substr(0, split1_tab_offset); | 103 split[1] = split[1].substr(0, split1_tab_offset); |
105 | 104 |
106 if (aff_reader->has_indexed_affixes()) | 105 if (aff_reader->has_indexed_affixes()) |
107 affix_index = atoi(split[1].c_str()); | 106 affix_index = atoi(split[1].c_str()); |
108 else | 107 else |
109 affix_index = aff_reader->GetAFIndexForAFString(split[1]); | 108 affix_index = aff_reader->GetAFIndexForAFString(split[1]); |
110 } | 109 } |
111 | 110 |
112 WordSet::iterator found = word_set.find(utf8word); | 111 WordSet::iterator found = word_set->find(utf8word); |
113 if (found == word_set.end()) { | 112 if (found == word_set->end()) { |
114 std::set<int> affix_vector; | 113 std::set<int> affix_vector; |
115 affix_vector.insert(affix_index); | 114 affix_vector.insert(affix_index); |
116 word_set.insert(std::make_pair(utf8word, affix_vector)); | 115 word_set->insert(std::make_pair(utf8word, affix_vector)); |
117 } else { | 116 } else { |
118 found->second.insert(affix_index); | 117 found->second.insert(affix_index); |
119 } | 118 } |
120 } | 119 } |
121 | 120 |
121 return true; | |
122 } | |
123 | |
124 } // namespace | |
125 | |
126 DicReader::DicReader(const std::string& filename) { | |
127 file_ = file_util::OpenFile(filename, "r"); | |
128 additional_words_file_ = file_util::OpenFile(filename + "_delta", "r"); | |
129 } | |
130 | |
131 DicReader::~DicReader() { | |
132 if (file_) | |
133 file_util::CloseFile(file_); | |
134 if (additional_words_file_) | |
135 file_util::CloseFile(additional_words_file_); | |
136 } | |
137 | |
138 bool DicReader::Read(AffReader* aff_reader) { | |
139 if (!file_) | |
140 return false; | |
141 | |
142 WordSet word_set; | |
143 | |
144 // Add words from the dic file to the word set. | |
145 // Note that the first line is the word count in the file. | |
146 if (!PopulateWordSet(&word_set, file_, aff_reader, "dic", true)) | |
147 return false; | |
148 | |
149 // Add words from the dic delta file to the word set, if it exists. | |
150 // The first line is the first word to add. Word count line is not present. | |
151 PopulateWordSet(&word_set, additional_words_file_, aff_reader, "dic delta", | |
brettw
2008/12/29 22:25:22
It seems like it would be better to not call the f
| |
152 false); | |
153 | |
122 // Make sure the words are sorted, they may be unsorted in the input. | 154 // Make sure the words are sorted, they may be unsorted in the input. |
123 for (WordSet::iterator word = word_set.begin(); word != word_set.end(); | 155 for (WordSet::iterator word = word_set.begin(); word != word_set.end(); |
124 ++word) { | 156 ++word) { |
125 std::vector<int> affixes; | 157 std::vector<int> affixes; |
126 for (std::set<int>::iterator aff = word->second.begin(); | 158 for (std::set<int>::iterator aff = word->second.begin(); |
127 aff != word->second.end(); ++aff) | 159 aff != word->second.end(); ++aff) |
128 affixes.push_back(*aff); | 160 affixes.push_back(*aff); |
129 | 161 |
130 // Double check that the affixes are sorted. This isn't strictly necessary | 162 // Double check that the affixes are sorted. This isn't strictly necessary |
131 // but it's nice for the file to have a fixed layout. | 163 // but it's nice for the file to have a fixed layout. |
132 std::sort(affixes.begin(), affixes.end()); | 164 std::sort(affixes.begin(), affixes.end()); |
133 words_.push_back(std::make_pair(word->first, affixes)); | 165 words_.push_back(std::make_pair(word->first, affixes)); |
134 } | 166 } |
135 | 167 |
136 // Double-check that the words are sorted. | 168 // Double-check that the words are sorted. |
137 std::sort(words_.begin(), words_.end()); | 169 std::sort(words_.begin(), words_.end()); |
138 return true; | 170 return true; |
139 } | 171 } |
140 | 172 |
141 } // namespace convert_dict | 173 } // namespace convert_dict |
142 | 174 |
OLD | NEW |