chrome/tools/convert_dict/dic_reader.cc - Issue 14856: [chromium-reviews] Part 1 of 'Add common words for each language, and remove forbidden words'....

Unified Diff: chrome/tools/convert_dict/dic_reader.cc

Issue 14856: [chromium-reviews] Part 1 of 'Add common words for each language, and remove forbidden words'.... (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/

Patch Set: '' Created 12 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: chrome/tools/convert_dict/dic_reader.cc

===================================================================

--- chrome/tools/convert_dict/dic_reader.cc (revision 7485)

+++ chrome/tools/convert_dict/dic_reader.cc (working copy)

@@ -43,43 +43,42 @@

output->push_back(line.substr(slash_index + 1));

}

-} // namespace

+// This function reads words from a .dic file, or a .dic_delta file. Note that

brettw 2008/12/29 22:25:22 Can you add "// This function reads words from a .

+// we read 'all' the words in the file, irrespective of the word count given

+// in the first non-empty line of a .dic file. Also note that, for a .dic_delta

+// file, the first line actually does _not_ have the number of words. In order

+// to control this, we use the |file_has_word_count_in_the_first_line|

+// parameter to tell this method whether the first non-empty line in the file

+// contains the number of words or not. If it does, skip the first line. If it

+// does not, then the first line contains a word.

+bool PopulateWordSet(WordSet* word_set, FILE* file, AffReader* aff_reader,

+ const char* file_type,

+ bool file_has_word_count_in_the_first_line) {

+ if (file == NULL)

+ return false;

-DicReader::DicReader(const std::string& filename) {

- file_ = file_util::OpenFile(filename, "r");

+ printf("Extracting words from %s file...\n", file_type);

-DicReader::~DicReader() {

- if (file_)

- file_util::CloseFile(file_);

-bool DicReader::Read(AffReader* aff_reader) {

- if (!file_)

- return false;

- bool got_count = false;

int line_number = 0;

- WordSet word_set;

- while (!feof(file_)) {

- std::string line = ReadLine(file_);

+ while (!feof(file)) {

+ std::string line = ReadLine(file);

line_number++;

StripComment(&line);

if (line.empty())

continue;

- if (!got_count) {

+ if (file_has_word_count_in_the_first_line) {

// Skip the first nonempty line, this is the line count. We don't bother

// with it and just read all the lines.

- got_count = true;

+ file_has_word_count_in_the_first_line = false;

continue;

}

std::vector<std::string> split;

SplitDicLine(line, &split);

if (split.size() == 0 || split.size() > 2) {

- printf("Line %d has extra slashes in the dic file\n", line_number);

+ printf("Line %d has extra slashes in the %s file\n", line_number,

+ file_type);

return false;

}

@@ -87,8 +86,8 @@

// always use UTF-8 as the encoding to simplify life.

std::string utf8word;

if (!aff_reader->EncodingToUTF8(split[0], &utf8word)) {

- printf("Unable to convert line %d from %s to UTF-8 in the dic file\n",

- line_number, aff_reader->encoding());

+ printf("Unable to convert line %d from %s to UTF-8 in the %s file\n",

+ line_number, aff_reader->encoding(), file_type);

return false;

}

@@ -109,16 +108,49 @@

affix_index = aff_reader->GetAFIndexForAFString(split[1]);

}

- WordSet::iterator found = word_set.find(utf8word);

- if (found == word_set.end()) {

+ WordSet::iterator found = word_set->find(utf8word);

+ if (found == word_set->end()) {

std::set<int> affix_vector;

affix_vector.insert(affix_index);

- word_set.insert(std::make_pair(utf8word, affix_vector));

+ word_set->insert(std::make_pair(utf8word, affix_vector));

} else {

found->second.insert(affix_index);

}

+ return true;

+} // namespace

+DicReader::DicReader(const std::string& filename) {

+ file_ = file_util::OpenFile(filename, "r");

+ additional_words_file_ = file_util::OpenFile(filename + "_delta", "r");

+DicReader::~DicReader() {

+ if (file_)

+ file_util::CloseFile(file_);

+ if (additional_words_file_)

+ file_util::CloseFile(additional_words_file_);

+bool DicReader::Read(AffReader* aff_reader) {

+ if (!file_)

+ return false;

+ WordSet word_set;

+ // Add words from the dic file to the word set.

+ // Note that the first line is the word count in the file.

+ if (!PopulateWordSet(&word_set, file_, aff_reader, "dic", true))

+ return false;

+ // Add words from the dic delta file to the word set, if it exists.

+ // The first line is the first word to add. Word count line is not present.

+ PopulateWordSet(&word_set, additional_words_file_, aff_reader, "dic delta",

brettw 2008/12/29 22:25:22 It seems like it would be better to not call the f

+ false);

// Make sure the words are sorted, they may be unsorted in the input.

for (WordSet::iterator word = word_set.begin(); word != word_set.end();

++word) {

« no previous file with comments | « chrome/tools/convert_dict/dic_reader.h ('k') | no next file » | no next file with comments »