Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(233)

Side by Side Diff: chrome/tools/convert_dict/dic_reader.cc

Issue 14856: [chromium-reviews] Part 1 of 'Add common words for each language, and remove forbidden words'.... (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/
Patch Set: '' Created 11 years, 12 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « chrome/tools/convert_dict/dic_reader.h ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "chrome/tools/convert_dict/dic_reader.h" 5 #include "chrome/tools/convert_dict/dic_reader.h"
6 6
7 #include <algorithm> 7 #include <algorithm>
8 #include <set> 8 #include <set>
9 9
10 #include "base/file_util.h" 10 #include "base/file_util.h"
(...skipping 25 matching lines...) Expand all
36 // convert all escaped slashes ("\/" sequences) to regular slashes. 36 // convert all escaped slashes ("\/" sequences) to regular slashes.
37 std::string word = line.substr(0, slash_index); 37 std::string word = line.substr(0, slash_index);
38 ReplaceSubstringsAfterOffset(&word, 0, "\\/", "/"); 38 ReplaceSubstringsAfterOffset(&word, 0, "\\/", "/");
39 output->push_back(word); 39 output->push_back(word);
40 40
41 // Everything (if anything) after the slash is the second part. 41 // Everything (if anything) after the slash is the second part.
42 if (slash_index < line.size() - 1) 42 if (slash_index < line.size() - 1)
43 output->push_back(line.substr(slash_index + 1)); 43 output->push_back(line.substr(slash_index + 1));
44 } 44 }
45 45
46 } // namespace 46 // This function reads words from a .dic file, or a .dic_delta file. Note that
brettw 2008/12/29 22:25:22 Can you add "// This function reads words from a .
47 47 // we read 'all' the words in the file, irrespective of the word count given
48 DicReader::DicReader(const std::string& filename) { 48 // in the first non empty line of a .dic file. Also note that, for a .dic_delta
49 file_ = file_util::OpenFile(filename, "r"); 49 // file, the first line actually does _not_ have the number of words. In order
50 } 50 // to control this, we use the |file_has_word_count_in_the_first_line|
51 51 // parameter to tell this method whether the first non empty line in the file
52 DicReader::~DicReader() { 52 // contains the number of words or not. If it does, skip the first line. If it
53 if (file_) 53 // does not, then the first line contains a word.
54 file_util::CloseFile(file_); 54 bool PopulateWordSet(WordSet* word_set, FILE* file, AffReader* aff_reader,
55 } 55 const char* file_type,
56 56 bool file_has_word_count_in_the_first_line) {
57 bool DicReader::Read(AffReader* aff_reader) { 57 if (file == NULL)
58 if (!file_)
59 return false; 58 return false;
60 59
61 bool got_count = false; 60 printf("Extracting words from %s file...\n", file_type);
61
62 int line_number = 0; 62 int line_number = 0;
63 63 while (!feof(file)) {
64 WordSet word_set; 64 std::string line = ReadLine(file);
65 while (!feof(file_)) {
66 std::string line = ReadLine(file_);
67 line_number++; 65 line_number++;
68 StripComment(&line); 66 StripComment(&line);
69 if (line.empty()) 67 if (line.empty())
70 continue; 68 continue;
71 69
72 if (!got_count) { 70 if (file_has_word_count_in_the_first_line) {
73 // Skip the first nonempty line, this is the line count. We don't bother 71 // Skip the first nonempty line, this is the line count. We don't bother
74 // with it and just read all the lines. 72 // with it and just read all the lines.
75 got_count = true; 73 file_has_word_count_in_the_first_line = false;
76 continue; 74 continue;
77 } 75 }
78 76
79 std::vector<std::string> split; 77 std::vector<std::string> split;
80 SplitDicLine(line, &split); 78 SplitDicLine(line, &split);
81 if (split.size() == 0 || split.size() > 2) { 79 if (split.size() == 0 || split.size() > 2) {
82 printf("Line %d has extra slashes in the dic file\n", line_number); 80 printf("Line %d has extra slashes in the %s file\n", line_number,
81 file_type);
83 return false; 82 return false;
84 } 83 }
85 84
86 // The first part is the word, the second (optional) part is the affix. We 85 // The first part is the word, the second (optional) part is the affix. We
87 // always use UTF-8 as the encoding to simplify life. 86 // always use UTF-8 as the encoding to simplify life.
88 std::string utf8word; 87 std::string utf8word;
89 if (!aff_reader->EncodingToUTF8(split[0], &utf8word)) { 88 if (!aff_reader->EncodingToUTF8(split[0], &utf8word)) {
90 printf("Unable to convert line %d from %s to UTF-8 in the dic file\n", 89 printf("Unable to convert line %d from %s to UTF-8 in the %s file\n",
91 line_number, aff_reader->encoding()); 90 line_number, aff_reader->encoding(), file_type);
92 return false; 91 return false;
93 } 92 }
94 93
95 // We always convert the affix to an index. 0 means no affix. 94 // We always convert the affix to an index. 0 means no affix.
96 int affix_index = 0; 95 int affix_index = 0;
97 if (split.size() == 2) { 96 if (split.size() == 2) {
98 // Got a rule, which is the stuff after the slash. The line may also have 97 // Got a rule, which is the stuff after the slash. The line may also have
99 // an optional term separated by a tab. This is the morphological 98 // an optional term separated by a tab. This is the morphological
100 // description. We don't care about this (it is used in the tests to 99 // description. We don't care about this (it is used in the tests to
101 // generate a nice dump), so we remove it. 100 // generate a nice dump), so we remove it.
102 size_t split1_tab_offset = split[1].find('\t'); 101 size_t split1_tab_offset = split[1].find('\t');
103 if (split1_tab_offset != std::string::npos) 102 if (split1_tab_offset != std::string::npos)
104 split[1] = split[1].substr(0, split1_tab_offset); 103 split[1] = split[1].substr(0, split1_tab_offset);
105 104
106 if (aff_reader->has_indexed_affixes()) 105 if (aff_reader->has_indexed_affixes())
107 affix_index = atoi(split[1].c_str()); 106 affix_index = atoi(split[1].c_str());
108 else 107 else
109 affix_index = aff_reader->GetAFIndexForAFString(split[1]); 108 affix_index = aff_reader->GetAFIndexForAFString(split[1]);
110 } 109 }
111 110
112 WordSet::iterator found = word_set.find(utf8word); 111 WordSet::iterator found = word_set->find(utf8word);
113 if (found == word_set.end()) { 112 if (found == word_set->end()) {
114 std::set<int> affix_vector; 113 std::set<int> affix_vector;
115 affix_vector.insert(affix_index); 114 affix_vector.insert(affix_index);
116 word_set.insert(std::make_pair(utf8word, affix_vector)); 115 word_set->insert(std::make_pair(utf8word, affix_vector));
117 } else { 116 } else {
118 found->second.insert(affix_index); 117 found->second.insert(affix_index);
119 } 118 }
120 } 119 }
121 120
121 return true;
122 }
123
124 } // namespace
125
126 DicReader::DicReader(const std::string& filename) {
127 file_ = file_util::OpenFile(filename, "r");
128 additional_words_file_ = file_util::OpenFile(filename + "_delta", "r");
129 }
130
131 DicReader::~DicReader() {
132 if (file_)
133 file_util::CloseFile(file_);
134 if (additional_words_file_)
135 file_util::CloseFile(additional_words_file_);
136 }
137
138 bool DicReader::Read(AffReader* aff_reader) {
139 if (!file_)
140 return false;
141
142 WordSet word_set;
143
144 // Add words from the dic file to the word set.
145 // Note that the first line is the word count in the file.
146 if (!PopulateWordSet(&word_set, file_, aff_reader, "dic", true))
147 return false;
148
149 // Add words from the dic delta file to the word set, if it exists.
150 // The first line is the first word to add. Word count line is not present.
151 PopulateWordSet(&word_set, additional_words_file_, aff_reader, "dic delta",
brettw 2008/12/29 22:25:22 It seems like it would be better to not call the f
152 false);
153
122 // Make sure the words are sorted, they may be unsorted in the input. 154 // Make sure the words are sorted, they may be unsorted in the input.
123 for (WordSet::iterator word = word_set.begin(); word != word_set.end(); 155 for (WordSet::iterator word = word_set.begin(); word != word_set.end();
124 ++word) { 156 ++word) {
125 std::vector<int> affixes; 157 std::vector<int> affixes;
126 for (std::set<int>::iterator aff = word->second.begin(); 158 for (std::set<int>::iterator aff = word->second.begin();
127 aff != word->second.end(); ++aff) 159 aff != word->second.end(); ++aff)
128 affixes.push_back(*aff); 160 affixes.push_back(*aff);
129 161
130 // Double check that the affixes are sorted. This isn't strictly necessary 162 // Double check that the affixes are sorted. This isn't strictly necessary
131 // but it's nice for the file to have a fixed layout. 163 // but it's nice for the file to have a fixed layout.
132 std::sort(affixes.begin(), affixes.end()); 164 std::sort(affixes.begin(), affixes.end());
133 words_.push_back(std::make_pair(word->first, affixes)); 165 words_.push_back(std::make_pair(word->first, affixes));
134 } 166 }
135 167
136 // Double-check that the words are sorted. 168 // Double-check that the words are sorted.
137 std::sort(words_.begin(), words_.end()); 169 std::sort(words_.begin(), words_.end());
138 return true; 170 return true;
139 } 171 }
140 172
141 } // namespace convert_dict 173 } // namespace convert_dict
142 174
OLDNEW
« no previous file with comments | « chrome/tools/convert_dict/dic_reader.h ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698