Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(167)

Unified Diff: chrome/tools/convert_dict/dic_reader.cc

Issue 14856: [chromium-reviews] Part 1 of 'Add common words for each language, and remove forbidden words'.... (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/
Patch Set: '' Created 12 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « chrome/tools/convert_dict/dic_reader.h ('k') | no next file » | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: chrome/tools/convert_dict/dic_reader.cc
===================================================================
--- chrome/tools/convert_dict/dic_reader.cc (revision 7485)
+++ chrome/tools/convert_dict/dic_reader.cc (working copy)
@@ -43,43 +43,42 @@
output->push_back(line.substr(slash_index + 1));
}
-} // namespace
+// This function reads words from a .dic file, or a .dic_delta file. Note that
brettw 2008/12/29 22:25:22 Can you add "// This function reads words from a .
+// we read 'all' the words in the file, irrespective of the word count given
+// in the first non-empty line of a .dic file. Also note that, for a .dic_delta
+// file, the first line actually does _not_ have the number of words. In order
+// to control this, we use the |file_has_word_count_in_the_first_line|
+// parameter to tell this method whether the first non-empty line in the file
+// contains the number of words or not. If it does, skip the first line. If it
+// does not, then the first line contains a word.
+bool PopulateWordSet(WordSet* word_set, FILE* file, AffReader* aff_reader,
+ const char* file_type,
+ bool file_has_word_count_in_the_first_line) {
+ if (file == NULL)
+ return false;
-DicReader::DicReader(const std::string& filename) {
- file_ = file_util::OpenFile(filename, "r");
-}
+ printf("Extracting words from %s file...\n", file_type);
-DicReader::~DicReader() {
- if (file_)
- file_util::CloseFile(file_);
-}
-
-bool DicReader::Read(AffReader* aff_reader) {
- if (!file_)
- return false;
-
- bool got_count = false;
int line_number = 0;
-
- WordSet word_set;
- while (!feof(file_)) {
- std::string line = ReadLine(file_);
+ while (!feof(file)) {
+ std::string line = ReadLine(file);
line_number++;
StripComment(&line);
if (line.empty())
continue;
- if (!got_count) {
+ if (file_has_word_count_in_the_first_line) {
// Skip the first nonempty line, this is the line count. We don't bother
// with it and just read all the lines.
- got_count = true;
+ file_has_word_count_in_the_first_line = false;
continue;
}
std::vector<std::string> split;
SplitDicLine(line, &split);
if (split.size() == 0 || split.size() > 2) {
- printf("Line %d has extra slashes in the dic file\n", line_number);
+ printf("Line %d has extra slashes in the %s file\n", line_number,
+ file_type);
return false;
}
@@ -87,8 +86,8 @@
// always use UTF-8 as the encoding to simplify life.
std::string utf8word;
if (!aff_reader->EncodingToUTF8(split[0], &utf8word)) {
- printf("Unable to convert line %d from %s to UTF-8 in the dic file\n",
- line_number, aff_reader->encoding());
+ printf("Unable to convert line %d from %s to UTF-8 in the %s file\n",
+ line_number, aff_reader->encoding(), file_type);
return false;
}
@@ -109,16 +108,49 @@
affix_index = aff_reader->GetAFIndexForAFString(split[1]);
}
- WordSet::iterator found = word_set.find(utf8word);
- if (found == word_set.end()) {
+ WordSet::iterator found = word_set->find(utf8word);
+ if (found == word_set->end()) {
std::set<int> affix_vector;
affix_vector.insert(affix_index);
- word_set.insert(std::make_pair(utf8word, affix_vector));
+ word_set->insert(std::make_pair(utf8word, affix_vector));
} else {
found->second.insert(affix_index);
}
}
+ return true;
+}
+
+} // namespace
+
+DicReader::DicReader(const std::string& filename) {
+ file_ = file_util::OpenFile(filename, "r");
+ additional_words_file_ = file_util::OpenFile(filename + "_delta", "r");
+}
+
+DicReader::~DicReader() {
+ if (file_)
+ file_util::CloseFile(file_);
+ if (additional_words_file_)
+ file_util::CloseFile(additional_words_file_);
+}
+
+bool DicReader::Read(AffReader* aff_reader) {
+ if (!file_)
+ return false;
+
+ WordSet word_set;
+
+ // Add words from the dic file to the word set.
+ // Note that the first line is the word count in the file.
+ if (!PopulateWordSet(&word_set, file_, aff_reader, "dic", true))
+ return false;
+
+ // Add words from the dic delta file to the word set, if it exists.
+ // The first line is the first word to add. Word count line is not present.
+ PopulateWordSet(&word_set, additional_words_file_, aff_reader, "dic delta",
brettw 2008/12/29 22:25:22 It seems like it would be better to not call the f
+ false);
+
// Make sure the words are sorted, they may be unsorted in the input.
for (WordSet::iterator word = word_set.begin(); word != word_set.end();
++word) {
« no previous file with comments | « chrome/tools/convert_dict/dic_reader.h ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698