Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(488)

Side by Side Diff: chrome/renderer/hyphenator/hyphenator.cc

Issue 9545017: Adds a hy-phen-ator. (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/
Patch Set: Created 8 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
Property Changes:
Added: svn:eol-style
+ LF
OLDNEW
(Empty)
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
#include "chrome/renderer/hyphenator/hyphenator.h"

#include <stdint.h>

#include <string>

#include "base/file_util.h"
#include "base/logging.h"
#include "base/memory/scoped_ptr.h"
#include "base/string_util.h"
#include "base/utf_string_conversions.h"
#include "third_party/hyphen/hyphen.h"
15
16 namespace {
17
// Incrementally maps a position in a UTF-8 string to the corresponding
// position in the UTF-16 string converted from it. Feed the UTF-8 bytes to
// Append() one at a time, in order; position() then reports how many UTF-16
// code units are needed for the characters completed so far. The following
// snippet builds a comparable mapping, but it converts the same characters
// over and over again; this class keeps a running count instead.
//
//   scoped_array<size_t> position(new size_t[text.length()]);
//   for (size_t i = 0; i < text.length(); ++i)
//     position[i] = UTF8ToUTF16(text.substr(0, i)).length();
//
class UTF16Position {
 public:
  UTF16Position();
  ~UTF16Position();

  // Returns the number of UTF-16 code units counted so far.
  int position() const { return position_; }

  // Appends one byte of a UTF-8 sequence and updates the position. The
  // position grows by one when the byte completes a BMP character and by two
  // when it completes a non-BMP character (which needs a surrogate pair).
  // Returns true if |c| completed a character, and false if more bytes are
  // needed or the bytes seen so far do not form a sequence recognized here.
  bool Append(char c);

 private:
  // The number of converted UTF-16 code units.
  int position_;

  // The most recent bytes of the UTF-8 character being read, newest byte in
  // the low 8 bits. Holds 0xffffffff (never a valid UTF-8 sequence) while no
  // character is in progress.
  uint32_t buf_;
};

// Value of UTF16Position::buf_ when no UTF-8 character is being read.
const uint32_t kNoPendingBytes = 0xffffffff;

UTF16Position::UTF16Position()
    : position_(0),
      buf_(kNoPendingBytes) {
}

UTF16Position::~UTF16Position() {
}

bool UTF16Position::Append(char c) {
  // TODO: the code review suggested using ICU's U8_FWD_1 (unicode/utf8.h)
  // instead of this hand-written decoder.

  // Shift the buffered bytes up and append the new one. |char| may be a signed
  // type, so go through |unsigned char|: otherwise a byte >= 0x80 is
  // sign-extended to 0xffffffXX and the OR wipes out the buffered lead bytes,
  // which prevents every multi-byte sequence from being recognized.
  buf_ = (buf_ << 8) | static_cast<unsigned char>(c);

  int code_units;
  if ((buf_ & 0x80) == 0 ||               // 0xxxxxxx
      (buf_ & 0xe0c0) == 0xc080 ||        // 110xxxxx 10xxxxxx
      (buf_ & 0xf0c0c0) == 0xe08080) {    // 1110xxxx 10xxxxxx 10xxxxxx
    // A BMP character (U+0000,...,U+FFFF) takes one UTF-16 code unit.
    code_units = 1;
  } else if ((buf_ & 0xf8c0c0c0) == 0xf0808080) {
    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx: a character between U+10000 and
    // U+10FFFF takes two UTF-16 code units.
    code_units = 2;
  } else {
    // Either the character is not complete yet or the input is something
    // UTF-16 cannot represent; nothing is counted in both cases.
    return false;
  }

  position_ += code_units;
  buf_ = kNoPendingBytes;
  return true;
}
87
88 // A class that encapsulates a hyphenation query. THis class owns resources
tony 2012/03/05 18:59:00 Nit: THis -> This
Hironori Bono 2012/06/28 09:32:49 Done.
89 // temporarily needed for hyphenating one word, and deletes them when it is
90 // deleted as listed in the following snippet.
91 //
92 // void GetHyphenationPoints(HyphenDict* dict,
93 // const string16& word,
94 // std::vector<int>* hyphens) {
95 // scoped_ptr<Query> query(word);
96 // query.Hyphenate(dict, hyphens);
97 // }
98 //
99 class Query {
100 public:
101 explicit Query(const string16& word);
102 ~Query();
103
104 // Hyphenates a word with the specified dictionary. This function hyphenates
105 // the word provided to its constructor and returns a list of hyphenation
106 // points, positions where we can insert hyphens. The following snippet shows
107 // how to insert hyphens with hyphenation points returned by this function.
108 //
109 // std::vector<int> hyphens;
110 // GetHyphenationPoints(dict, word, &hyphens);
tony 2012/03/05 18:59:00 This snippet doesn't seem to match the class.
Hironori Bono 2012/06/28 09:32:49 Done. Thanks for noticing it. I have updated this
111 // for (std::vector<int>::const_reverse_iterator it = hyphens.rbegin();
112 // it != hyphens.rend(); ++it) {
113 // word.insert(*it, 1, '-');
114 // }
115 //
116 bool Hyphenate(HyphenDict* dictionary, std::vector<int>* hyphens);
117
118 private:
119 // A word to be hyphenated.
120 std::string word_;
tony 2012/03/05 18:59:00 Nit: Maybe name this word_utf8_ to make the encodi
Hironori Bono 2012/06/28 09:32:49 Done. Thanks for you suggestion.
121
122 // Return variables from the hyphen library.
123 scoped_array<char> hyphens_;
124 char** rep_;
125 int* pos_;
126 int* cut_;
127 };
128
129 Query::Query(const string16& word)
130 : rep_(NULL),
131 pos_(NULL),
132 cut_(NULL) {
133 // Remove trailing punctuation characters. WebKit does not remove these
134 // characters when it hyphenates a word. These characters prevent the hyphen
135 // library from applying some rules, i.e. they prevent the library from adding
136 // hyphens.
137 DCHECK(!word.empty());
138 static const char16 kPunctuationChars[] = {
139 '!', '"', '\'', ',', '.', '(', ')', ':', ';', '?', '\0',
140 };
141 size_t pos = word.find_last_of(kPunctuationChars);
142 size_t length = (pos != string16::npos) ? pos : word.length();
143 UTF16ToUTF8(word.c_str(), length, &word_);
144 hyphens_.reset(new char[word_.length() + 5]);
tony 2012/03/05 18:59:00 Why 5? Is that the max number of hyphens that will
Hironori Bono 2012/06/28 09:32:49 This is a magic number written in "hyphen.h" <http
145 }
146
147 Query::~Query() {
148 if (rep_) {
149 for (size_t i = 0; i < word_.length(); ++i) {
150 if (rep_[i])
151 free(rep_[i]);
152 }
153 free(rep_);
154 }
155 if (pos_)
156 free(pos_);
157 if (cut_)
158 free(cut_);
159 }
160
161 bool Query::Hyphenate(HyphenDict* dictionary, std::vector<int>* hyphens) {
162 DCHECK(dictionary);
163 DCHECK(hyphens);
164
165 int result = hnj_hyphen_hyphenate2(dictionary,
tony 2012/03/05 18:59:00 Nit: result -> error_code?
Hironori Bono 2012/06/28 09:32:49 Done.
166 word_.data(),
167 static_cast<int>(word_.length()),
168 hyphens_.get(),
169 NULL,
170 &rep_,
171 &pos_,
172 &cut_);
tony 2012/03/05 18:59:00 It looks like we can just pass in NULL since we do
Hironori Bono 2012/06/28 09:32:49 This function assumes these pointers are non-NULL
173 if (result)
174 return false;
175
176 // WebKit needs hyphenation points counted in UTF-16 characters. On the other
177 // hand, the hyphen library returns hyphenation points counted in UTF-8
178 // characters. We increamentally convert hyphenation points in UTF-8
179 // characters to hyphenation points in UTF-16 characters and write the
180 // converted hyphenation points to the output vector.
tony 2012/03/05 18:59:00 It would be a nice TODO to switch the hyphen libra
181 UTF16Position converter;
182 hyphens->clear();
183 for (size_t i = 0; i < word_.length(); ++i) {
184 converter.Append(word_[i]);
185 if (hyphens_[i] & 1)
tony 2012/03/05 18:59:00 hyphens and hyphens_ make this code hard to follow
Hironori Bono 2012/06/28 09:32:49 Done, I have renamed them. (I could not figure out
186 hyphens->push_back(converter.position());
187 }
188 return !hyphens->empty();
189 }
190
191 } // namespace
192
193 Hyphenator::Hyphenator(base::PlatformFile file)
194 : dictionary_(NULL),
195 rule_file_(file),
196 result_(0) {
197 }
198
199 Hyphenator::~Hyphenator() {
200 if (dictionary_)
201 hnj_hyphen_free(dictionary_);
202 }
203
204 bool Hyphenator::Initialize() {
205 if (dictionary_)
206 return true;
207
208 rule_map_.reset(new file_util::MemoryMappedFile);
209 if (!rule_map_->Initialize(rule_file_))
210 return false;
211
212 dictionary_ = hnj_hyphen_load(rule_map_->data(), rule_map_->length());
213 return !!dictionary_;
214 }
215
216 size_t Hyphenator::ComputeLastHyphenLocation(const string16& word,
217 size_t before_index) {
218 if (!dictionary_ || word.empty())
219 return 0;
220
221 // Call the hyphen library to get all hyphenation points, i.e. positions where
222 // we can insert hyphens. When WebKit finds a line-break, it calls this
223 // function twice or more with the same word to find the best hyphenation
224 // point. To avoid calling the hyphen library twice or more with the same
225 // word, we cache the last query.
226 if (word_ != word) {
227 word_ = word;
228 Query query(word);
229 result_ = query.Hyphenate(dictionary_, &hyphens_);
230 }
231 if (!result_)
232 return 0;
233 for (std::vector<int>::reverse_iterator it = hyphens_.rbegin();
234 it != hyphens_.rend(); ++it) {
235 if (static_cast<size_t>(*it) < before_index)
236 return *it;
237 }
238 return 0;
239 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698