Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(37)

Side by Side Diff: content/renderer/hyphenator/hyphenator.cc

Issue 9545017: Adds a hy-phen-ator. (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/
Patch Set: Created 8 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
Property Changes:
Added: svn:eol-style
+ LF
OLDNEW
(Empty)
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "content/renderer/hyphenator/hyphenator.h"
6
7 #include <string>
jam 2012/07/11 00:16:35 nit: not needed cause of all the string includes b
Hironori Bono 2012/07/13 08:11:35 Done. Thanks for catching it.
8
9 #include "base/file_util.h"
10 #include "base/logging.h"
11 #include "base/memory/scoped_ptr.h"
12 #include "base/string_util.h"
13 #include "base/utf_string_conversions.h"
14 #include "third_party/hyphen/hyphen.h"
15 #include "unicode/uscript.h"
16
17 namespace {
18
19 // A class that converts a sequence of UTF-8 characters to UTF-16 ones and holds
20 // only the length of converted UTF-16 characters. This class is used for
21 // creating a mapping from the position of a UTF-8 string to a position of a
22 // UTF-16 string without unnecessary conversions. Even though the following
23 // snippet produces the same mapping, it needs to convert same characters many
24 // times. This class incrementally counts the number of converted UTF-16
25 // characters to avoid this problem.
26 //
27 // scoped_array<size_t> position(new size_t[text.length()]);
28 // for (size_t i = 0; i < text.length(); ++i)
29 // position[i] = UTF8ToUTF16(text.substr(0, i)).length();
30 //
31 class UTF16TextLength {
32 public:
33 UTF16TextLength();
34 ~UTF16TextLength();
35
36 // Returns the current position.
37 int utf16_length() const { return utf16_length_; }
38
39 // Appends one UTF-8 character to this converter and advances the converted
40 // position. This converter increases the position by one when it finishes
41 // reading a BMP character and increases by two when it finish reading a
42 // non-BMP character.
43 void Append(char c);
44
45 private:
46 // The length of the converted UTF-16 text.
47 int utf16_length_;
48
49 // The buffer that stores UTF-8 characters being converted.
50 std::string utf8_text_;
51
52 DISALLOW_COPY_AND_ASSIGN(UTF16TextLength);
53 };
54
55 UTF16TextLength::UTF16TextLength()
56 : utf16_length_(0) {
57 }
58
59 UTF16TextLength::~UTF16TextLength() {
60 }
61
62 void UTF16TextLength::Append(char c) {
63 // Append the given character and try converting the UTF-8 characters in this
64 // buffer to Unicode codepoints. If this buffer includes a Unicode codepoint,
65 // get the number of UTF-16 characters representing this codepoint and advance
66 // the position.
67 int code = 0;
68 int index = 0;
69 utf8_text_.push_back(c);
70 U8_NEXT(utf8_text_.data(), index, static_cast<int>(utf8_text_.length()),
71 code);
72 if (code != U_SENTINEL) {
73 utf8_text_.clear();
74 utf16_length_ += U16_LENGTH(code);
75 }
76 }
77
78 // A class that encapsulates a hyphenation query. This class owns resources
79 // temporarily needed for hyphenating one word, and deletes them when it is
80 // deleted as listed in the following snippet.
81 //
82 // std::vector<int> hyphens;
83 // Query query(UTF8ToUTF16("hyphenate"));
84 // query.Hyphenate(dict, &hyphens);
85 //
86 class Query {
87 public:
88 explicit Query(const string16& word);
89 ~Query();
90
91 // Hyphenates a word with the specified dictionary. This function hyphenates
92 // the word provided to its constructor and returns a list of hyphenation
93 // points, positions where we can insert hyphens.
94 bool Hyphenate(HyphenDict* dictionary, std::vector<int>* hyphen_offsets);
95
96 private:
97 // A word to be hyphenated.
98 std::string word_utf8_;
99
100 // Return variables from the hyphen library.
101 scoped_array<char> hyphen_vector_;
102 char** rep_;
103 int* pos_;
104 int* cut_;
105
106 DISALLOW_COPY_AND_ASSIGN(Query);
107 };
108
109 Query::Query(const string16& word)
110 : rep_(NULL),
111 pos_(NULL),
112 cut_(NULL) {
113 // Remove trailing punctuation characters. WebKit does not remove these
114 // characters when it hyphenates a word. These characters prevent the hyphen
115 // library from applying some rules, i.e. they prevent the library from adding
116 // hyphens.
117 DCHECK(!word.empty());
118 const char16* data = word.data();
119 int length = static_cast<int>(word.length());
120 while (length > 0) {
121 int previous = length;
122 int code = 0;
123 U16_PREV(data, 0, previous, code);
124 UErrorCode error = U_ZERO_ERROR;
125 if (uscript_getScript(code, &error) != USCRIPT_COMMON)
126 break;
127 length = previous;
128 }
129 UTF16ToUTF8(word.c_str(), length, &word_utf8_);
130 // Create a hyphen vector used by hnj_hyphen_hyphenate2(). We allocate a
131 // buffer of |word_.length()| + 5 as written in Line 112 of
132 // <http://cs.chromium.org/src/third_party/hyphen/hyphen.h>.
133 hyphen_vector_.reset(new char[word_utf8_.length() + 5]);
134 }
135
136 Query::~Query() {
137 if (rep_) {
138 for (size_t i = 0; i < word_utf8_.length(); ++i) {
139 if (rep_[i])
140 free(rep_[i]);
141 }
142 free(rep_);
143 }
144 if (pos_)
145 free(pos_);
146 if (cut_)
147 free(cut_);
148 }
149
150 bool Query::Hyphenate(HyphenDict* dictionary,
151 std::vector<int>* hyphen_offsets) {
152 DCHECK(dictionary);
153 DCHECK(hyphen_offsets);
154
155 int error_code = hnj_hyphen_hyphenate2(dictionary,
156 word_utf8_.data(),
157 static_cast<int>(word_utf8_.length()),
158 hyphen_vector_.get(),
159 NULL,
160 &rep_,
161 &pos_,
162 &cut_);
163 if (error_code)
164 return false;
165
166 // WebKit needs hyphenation points counted in UTF-16 characters. On the other
167 // hand, the hyphen library returns hyphenation points counted in UTF-8
168 // characters. We increamentally convert hyphenation points in UTF-8
169 // characters to hyphenation points in UTF-16 characters and write the
170 // converted hyphenation points to the output vector.
171 UTF16TextLength text_length;
172 hyphen_offsets->clear();
173 for (size_t i = 0; i < word_utf8_.length(); ++i) {
174 text_length.Append(word_utf8_[i]);
175 if (hyphen_vector_[i] & 1)
176 hyphen_offsets->push_back(text_length.utf16_length());
177 }
178 return !hyphen_offsets->empty();
179 }
180
181 } // namespace
182
183 namespace content {
184
185 Hyphenator::Hyphenator(base::PlatformFile file)
186 : dictionary_(NULL),
187 rule_file_(file),
188 result_(0) {
189 }
190
191 Hyphenator::~Hyphenator() {
192 if (dictionary_)
193 hnj_hyphen_free(dictionary_);
194 }
195
196 bool Hyphenator::Initialize() {
197 if (dictionary_)
198 return true;
199
200 rule_map_.reset(new file_util::MemoryMappedFile);
201 if (!rule_map_->Initialize(rule_file_))
tony 2012/07/10 17:30:00 Oh, I see, rule_file_ is a PlatformFile which has
202 return false;
203
204 dictionary_ = hnj_hyphen_load(rule_map_->data(), rule_map_->length());
205 return !!dictionary_;
206 }
207
208 size_t Hyphenator::ComputeLastHyphenLocation(const string16& word,
209 size_t before_index) {
210 if (!dictionary_ || word.empty())
211 return 0;
212
213 // Call the hyphen library to get all hyphenation points, i.e. positions where
214 // we can insert hyphens. When WebKit finds a line-break, it calls this
215 // function twice or more with the same word to find the best hyphenation
216 // point. To avoid calling the hyphen library twice or more with the same
217 // word, we cache the last query.
218 if (word_ != word) {
219 word_ = word;
220 Query query(word);
221 result_ = query.Hyphenate(dictionary_, &hyphen_offsets_);
222 }
223 if (!result_)
224 return 0;
225 for (std::vector<int>::reverse_iterator it = hyphen_offsets_.rbegin();
226 it != hyphen_offsets_.rend(); ++it) {
227 if (static_cast<size_t>(*it) < before_index)
228 return *it;
229 }
230 return 0;
231 }
232
233 } // namespace content
OLDNEW
« no previous file with comments | « content/renderer/hyphenator/hyphenator.h ('k') | content/renderer/hyphenator/hyphenator_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698