chrome/tools/convert_dict/dic_reader.cc - Issue 14856: [chromium-reviews] Part 1 of 'Add common words for each language, and remove forbidden words'....

Side by Side Diff: chrome/tools/convert_dict/dic_reader.cc

Issue 14856: [chromium-reviews] Part 1 of 'Add common words for each language, and remove forbidden words'.... (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/

Patch Set: '' Created 11 years, 12 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "chrome/tools/convert_dict/dic_reader.h"	5 #include "chrome/tools/convert_dict/dic_reader.h"

6	6

7 #include <algorithm>	7 #include <algorithm>

8 #include <set>	8 #include <set>

9	9

10 #include "base/file_util.h"	10 #include "base/file_util.h"

(...skipping 25 matching lines...) Expand all Loading...
36 // convert all escaped slashes ("\/" sequences) to regular slashes.	36 // convert all escaped slashes ("\/" sequences) to regular slashes.

37 std::string word = line.substr(0, slash_index);	37 std::string word = line.substr(0, slash_index);

38 ReplaceSubstringsAfterOffset(&word, 0, "\\/", "/");	38 ReplaceSubstringsAfterOffset(&word, 0, "\\/", "/");

39 output->push_back(word);	39 output->push_back(word);

40	40

41 // Everything (if anything) after the slash is the second.	41 // Everything (if anything) after the slash is the second.

42 if (slash_index < line.size() - 1)	42 if (slash_index < line.size() - 1)

43 output->push_back(line.substr(slash_index + 1));	43 output->push_back(line.substr(slash_index + 1));

44 }	44 }

45	45

46 } // namespace	46 // This function reads words from a .dic file, or a .dic_delta file. Note that
	brettw 2008/12/29 22:25:22 Can you add "// This function reads words from a .dic file, or a .dic_delta file and appends it to the given set."
47	47 // we read 'all' the words in the file, irrespective of the word count given

48 DicReader::DicReader(const std::string& filename) {	48 // in the first non empty line of a .dic file. Also note that, for a .dic_delta

49 file_ = file_util::OpenFile(filename, "r");	49 // file, the first line actually does _not_ have the number of words. In order

50 }	50 // to control this, we use the |file_has_word_count_in_the_first_line|

51	51 // parameter to tell this method whether the first non empty line in the file

52 DicReader::~DicReader() {	52 // contains the number of words or not. If it does, skip the first line. If it

53 if (file_)	53 // does not, then the first line contains a word.

54 file_util::CloseFile(file_);	54 bool PopulateWordSet(WordSet* word_set, FILE* file, AffReader* aff_reader,

55 }	55 const char* file_type,

56	56 bool file_has_word_count_in_the_first_line) {

57 bool DicReader::Read(AffReader* aff_reader) {	57 if (file == NULL)

58 if (!file_)

59 return false;	58 return false;

60	59

61 bool got_count = false;	60 printf("Extracting words from %s file...\n", file_type);

	61

62 int line_number = 0;	62 int line_number = 0;

63	63 while (!feof(file)) {

64 WordSet word_set;	64 std::string line = ReadLine(file);

65 while (!feof(file_)) {

66 std::string line = ReadLine(file_);

67 line_number++;	65 line_number++;

68 StripComment(&line);	66 StripComment(&line);

69 if (line.empty())	67 if (line.empty())

70 continue;	68 continue;

71	69

72 if (!got_count) {	70 if (file_has_word_count_in_the_first_line) {

73 // Skip the first nonempty line, this is the line count. We don't bother	71 // Skip the first nonempty line, this is the line count. We don't bother

74 // with it and just read all the lines.	72 // with it and just read all the lines.

75 got_count = true;	73 file_has_word_count_in_the_first_line = false;

76 continue;	74 continue;

77 }	75 }

78	76

79 std::vector<std::string> split;	77 std::vector<std::string> split;

80 SplitDicLine(line, &split);	78 SplitDicLine(line, &split);

81 if (split.size() == 0 || split.size() > 2) {	79 if (split.size() == 0 || split.size() > 2) {

82 printf("Line %d has extra slashes in the dic file\n", line_number);	80 printf("Line %d has extra slashes in the %s file\n", line_number,

	81 file_type);

83 return false;	82 return false;

84 }	83 }

85	84

86 // The first part is the word, the second (optional) part is the affix. We	85 // The first part is the word, the second (optional) part is the affix. We

87 // always use UTF-8 as the encoding to simplify life.	86 // always use UTF-8 as the encoding to simplify life.

88 std::string utf8word;	87 std::string utf8word;

89 if (!aff_reader->EncodingToUTF8(split[0], &utf8word)) {	88 if (!aff_reader->EncodingToUTF8(split[0], &utf8word)) {

90 printf("Unable to convert line %d from %s to UTF-8 in the dic file\n",	89 printf("Unable to convert line %d from %s to UTF-8 in the %s file\n",

91 line_number, aff_reader->encoding());	90 line_number, aff_reader->encoding(), file_type);

92 return false;	91 return false;

93 }	92 }

94	93

95 // We always convert the affix to an index. 0 means no affix.	94 // We always convert the affix to an index. 0 means no affix.

96 int affix_index = 0;	95 int affix_index = 0;

97 if (split.size() == 2) {	96 if (split.size() == 2) {

98 // Got a rule, which is the stuff after the slash. The line may also have	97 // Got a rule, which is the stuff after the slash. The line may also have

99 // an optional term separated by a tab. This is the morphological	98 // an optional term separated by a tab. This is the morphological

100 // description. We don't care about this (it is used in the tests to	99 // description. We don't care about this (it is used in the tests to

101 // generate a nice dump), so we remove it.	100 // generate a nice dump), so we remove it.

102 size_t split1_tab_offset = split[1].find('\t');	101 size_t split1_tab_offset = split[1].find('\t');

103 if (split1_tab_offset != std::string::npos)	102 if (split1_tab_offset != std::string::npos)

104 split[1] = split[1].substr(0, split1_tab_offset);	103 split[1] = split[1].substr(0, split1_tab_offset);

105	104

106 if (aff_reader->has_indexed_affixes())	105 if (aff_reader->has_indexed_affixes())

107 affix_index = atoi(split[1].c_str());	106 affix_index = atoi(split[1].c_str());

108 else	107 else

109 affix_index = aff_reader->GetAFIndexForAFString(split[1]);	108 affix_index = aff_reader->GetAFIndexForAFString(split[1]);

110 }	109 }

111	110

112 WordSet::iterator found = word_set.find(utf8word);	111 WordSet::iterator found = word_set->find(utf8word);

113 if (found == word_set.end()) {	112 if (found == word_set->end()) {

114 std::set<int> affix_vector;	113 std::set<int> affix_vector;

115 affix_vector.insert(affix_index);	114 affix_vector.insert(affix_index);

116 word_set.insert(std::make_pair(utf8word, affix_vector));	115 word_set->insert(std::make_pair(utf8word, affix_vector));

117 } else {	116 } else {

118 found->second.insert(affix_index);	117 found->second.insert(affix_index);

119 }	118 }

120 }	119 }

121	120

	121 return true;

	122 }

	123

	124 } // namespace

	125

	126 DicReader::DicReader(const std::string& filename) {

	127 file_ = file_util::OpenFile(filename, "r");

	128 additional_words_file_ = file_util::OpenFile(filename + "_delta", "r");

	129 }

	130

	131 DicReader::~DicReader() {

	132 if (file_)

	133 file_util::CloseFile(file_);

	134 if (additional_words_file_)

	135 file_util::CloseFile(additional_words_file_);

	136 }

	137

	138 bool DicReader::Read(AffReader* aff_reader) {

	139 if (!file_)

	140 return false;

	141

	142 WordSet word_set;

	143

	144 // Add words from the dic file to the word set.

	145 // Note that the first line is the word count in the file.

	146 if (!PopulateWordSet(&word_set, file_, aff_reader, "dic", true))

	147 return false;

	148

	149 // Add words from the dic delta file to the word set, if it exists.

	150 // The first line is the first word to add. Word count line is not present.

	151 PopulateWordSet(&word_set, additional_words_file_, aff_reader, "dic delta",
	brettw 2008/12/29 22:25:22 It seems like it would be better to not call the function with a NULL file rather than checking at the beginning of the populate function.
	152 false);

	153

122 // Make sure the words are sorted, they may be unsorted in the input.	154 // Make sure the words are sorted, they may be unsorted in the input.

123 for (WordSet::iterator word = word_set.begin(); word != word_set.end();	155 for (WordSet::iterator word = word_set.begin(); word != word_set.end();

124 ++word) {	156 ++word) {

125 std::vector<int> affixes;	157 std::vector<int> affixes;

126 for (std::set<int>::iterator aff = word->second.begin();	158 for (std::set<int>::iterator aff = word->second.begin();

127 aff != word->second.end(); ++aff)	159 aff != word->second.end(); ++aff)

128 affixes.push_back(*aff);	160 affixes.push_back(*aff);

129	161

130 // Double check that the affixes are sorted. This isn't strictly necessary	162 // Double check that the affixes are sorted. This isn't strictly necessary

131 // but it's nice for the file to have a fixed layout.	163 // but it's nice for the file to have a fixed layout.

132 std::sort(affixes.begin(), affixes.end());	164 std::sort(affixes.begin(), affixes.end());

133 words_.push_back(std::make_pair(word->first, affixes));	165 words_.push_back(std::make_pair(word->first, affixes));

134 }	166 }

135	167

136 // Double-check that the words are sorted.	168 // Double-check that the words are sorted.

137 std::sort(words_.begin(), words_.end());	169 std::sort(words_.begin(), words_.end());

138 return true;	170 return true;

139 }	171 }

140	172

141 } // namespace convert_dict	173 } // namespace convert_dict

142	174

OLD	NEW

« no previous file with comments | « chrome/tools/convert_dict/dic_reader.h ('k') | no next file » | no next file with comments »