Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(503)

Side by Side Diff: content/renderer/hyphenator/hyphenator.cc

Issue 9545017: Adds a hy-phen-ator. (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/
Patch Set: Created 8 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
Property Changes:
Added: svn:eol-style
+ LF
OLDNEW
(Empty)
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "content/renderer/hyphenator/hyphenator.h"
6
7 #include <string>
8
9 #include "base/file_util.h"
10 #include "base/logging.h"
11 #include "base/memory/scoped_ptr.h"
12 #include "base/string_util.h"
13 #include "base/utf_string_conversions.h"
14 #include "third_party/hyphen/hyphen.h"
15 #include "unicode/uscript.h"
16
17 namespace {
18
19 // A class that converts a sequence of UTF-8 characters to UTF-16 ones and holds
20 // only the length of converted UTF-16 characters. This class is used for
21 // creating a mapping from the position of a UTF-8 string to a position of a
22 // UTF-16 string without unnecessary conversions. The following snippet
23 // produces the same mapping, but it converts the same characters many times.
24 // This class incrementally counts the number of converted UTF-16 characters
25 // to avoid this problem.
26 //
27 // scoped_array<size_t> position(new size_t[text.length()]);
28 // for (size_t i = 0; i < text.length(); ++i)
29 // position[i] = UTF8ToUTF16(text.substr(0, i)).length();
30 //
class UTF16TextLength {
 public:
  UTF16TextLength();
  ~UTF16TextLength();

  // Returns the current position, i.e. the number of UTF-16 code units that
  // the UTF-8 bytes appended so far convert to.
  int utf16_length() const { return utf16_length_; }

  // Appends one UTF-8 byte to this converter and advances the converted
  // position. The position increases by one when this converter finishes
  // reading a BMP character and by two when it finishes reading a non-BMP
  // character; it does not change while a multi-byte sequence is incomplete.
  void Append(char c);

 private:
  // Copying is disallowed. These members are declared but intentionally left
  // undefined, which has the same effect as DISALLOW_COPY_AND_ASSIGN.
  UTF16TextLength(const UTF16TextLength&);
  void operator=(const UTF16TextLength&);

  // The length of the converted UTF-16 text.
  int utf16_length_;

  // The buffer that stores the bytes of the UTF-8 character being converted.
  std::string utf8_text_;
};
jochen (gone - plz use gerrit) 2012/07/02 15:33:01 DISALLOW_COPY_AND_ASSIGN
Hironori Bono 2012/07/10 09:22:09 Done. Thanks for noticing it.
52
53 UTF16TextLength::UTF16TextLength()
54 : utf16_length_(0) {
55 }
56
57 UTF16TextLength::~UTF16TextLength() {
58 }
59
60 void UTF16TextLength::Append(char c) {
61 // Append the given character and try converting the UTF-8 characters in this
62 // buffer to Unicode codepoints. If this buffer includes a Unicode codepoint,
63 // get the number of UTF-16 characters representing this codepoint and advance
64 // the position.
65 int code = 0;
66 int index = 0;
67 utf8_text_.push_back(c);
68 U8_NEXT(utf8_text_.data(), index, static_cast<int>(utf8_text_.length()),
69 code);
70 if (code != U_SENTINEL) {
71 utf8_text_.clear();
72 utf16_length_ += U16_LENGTH(code);
73 }
74 }
75
76 // A class that encapsulates a hyphenation query. This class owns resources
77 // temporarily needed for hyphenating one word, and deletes them when it is
78 // deleted as listed in the following snippet.
79 //
80 // std::vector<int> hyphens;
81 // Query query(UTF8ToUTF16("hyphenate"));
82 // query.Hyphenate(dict, &hyphens);
83 //
84 class Query {
85 public:
86 explicit Query(const string16& word);
87 ~Query();
88
89 // Hyphenates a word with the specified dictionary. This function hyphenates
90 // the word provided to its constructor and returns a list of hyphenation
91 // points, positions where we can insert hyphens. The following snippet shows
92 // how to insert hyphens with hyphenation points returned by this function.
93 //
94 // string16 word(UTF8ToUTF16("hyphenate"));
95 // std::vector<int> hyphens;
96 // Query query(word);
97 // query.Hyphenate(dict, &hyphens);
98 // for (std::vector<int>::const_reverse_iterator it = hyphens.rbegin();
99 // it != hyphens.rend(); ++it) {
100 // word.insert(*it, 1, '-');
101 // }
tony 2012/06/28 20:06:44 I don't think the code example is that useful as a
Hironori Bono 2012/07/10 09:22:09 Done. I have removed this redundant example.
102 //
103 bool Hyphenate(HyphenDict* dictionary, std::vector<int>* hyphen_offsets);
104
105 private:
106 // A word to be hyphenated.
107 std::string word_utf8_;
108
109 // Return variables from the hyphen library.
110 scoped_array<char> hyphen_vector_;
111 char** rep_;
112 int* pos_;
113 int* cut_;
114 };
jochen (gone - plz use gerrit) 2012/07/02 15:33:01 DISALLOW_COPY_AND_ASSIGN
Hironori Bono 2012/07/10 09:22:09 Done. Thanks for noticing it.
115
116 Query::Query(const string16& word)
117 : rep_(NULL),
118 pos_(NULL),
119 cut_(NULL) {
120 // Remove trailing punctuation characters. WebKit does not remove these
121 // characters when it hyphenates a word. These characters prevent the hyphen
122 // library from applying some rules, i.e. they prevent the library from adding
123 // hyphens.
124 DCHECK(!word.empty());
125 const char16* data = word.data();
126 int length = static_cast<int>(word.length());
127 while (length > 0) {
128 int previous = length;
129 int code = 0;
130 U16_PREV(data, 0, previous, code);
131 UErrorCode error = U_ZERO_ERROR;
132 if (uscript_getScript(code, &error) != USCRIPT_COMMON)
133 break;
134 length = previous;
135 }
136 UTF16ToUTF8(word.c_str(), length, &word_utf8_);
137 // Create a hyphen vector used by hnj_hyphen_hyphenate2(). We allocate a
138 // buffer of |word_.length()| + 5 as written in Line 112 of
139 // <http://cs.chromium.org/src/third_party/hyphen/hyphen.h>.
140 hyphen_vector_.reset(new char[word_utf8_.length() + 5]);
141 }
142
143 Query::~Query() {
144 if (rep_) {
145 for (size_t i = 0; i < word_utf8_.length(); ++i) {
146 if (rep_[i])
147 free(rep_[i]);
tony 2012/06/28 20:06:44 Nit: Should we be using hnj_free here and below?
Hironori Bono 2012/07/10 09:22:09 Thanks for your comment. The hyphen library uses h
148 }
149 free(rep_);
150 }
151 if (pos_)
152 free(pos_);
153 if (cut_)
154 free(cut_);
155 }
156
157 bool Query::Hyphenate(HyphenDict* dictionary,
158 std::vector<int>* hyphen_offsets) {
159 DCHECK(dictionary);
160 DCHECK(hyphen_offsets);
161
162 int error_code = hnj_hyphen_hyphenate2(dictionary,
163 word_utf8_.data(),
164 static_cast<int>(word_utf8_.length()),
165 hyphen_vector_.get(),
166 NULL,
167 &rep_,
168 &pos_,
169 &cut_);
170 if (error_code)
171 return false;
172
173 // WebKit needs hyphenation points counted in UTF-16 characters. On the other
174 // hand, the hyphen library returns hyphenation points counted in UTF-8
175 // characters. We increamentally convert hyphenation points in UTF-8
176 // characters to hyphenation points in UTF-16 characters and write the
177 // converted hyphenation points to the output vector.
178 UTF16TextLength text_length;
179 hyphen_offsets->clear();
180 for (size_t i = 0; i < word_utf8_.length(); ++i) {
181 text_length.Append(word_utf8_[i]);
182 if (hyphen_vector_[i] & 1)
183 hyphen_offsets->push_back(text_length.utf16_length());
184 }
185 return !hyphen_offsets->empty();
186 }
187
188 } // namespace
189
190 Hyphenator::Hyphenator(base::PlatformFile file)
191 : dictionary_(NULL),
192 rule_file_(file),
193 result_(0) {
194 }
195
196 Hyphenator::~Hyphenator() {
197 if (dictionary_)
198 hnj_hyphen_free(dictionary_);
199 }
200
201 bool Hyphenator::Initialize() {
202 if (dictionary_)
203 return true;
204
205 rule_map_.reset(new file_util::MemoryMappedFile);
206 if (!rule_map_->Initialize(rule_file_))
tony 2012/06/28 20:06:44 If this is in the sandboxed renderer process, how
Hironori Bono 2012/07/10 09:22:09 Our spellchecker opens dictionary in a browser pro
207 return false;
208
209 dictionary_ = hnj_hyphen_load(rule_map_->data(), rule_map_->length());
210 return !!dictionary_;
211 }
212
213 size_t Hyphenator::ComputeLastHyphenLocation(const string16& word,
214 size_t before_index) {
215 if (!dictionary_ || word.empty())
216 return 0;
217
218 // Call the hyphen library to get all hyphenation points, i.e. positions where
219 // we can insert hyphens. When WebKit finds a line-break, it calls this
220 // function twice or more with the same word to find the best hyphenation
221 // point. To avoid calling the hyphen library twice or more with the same
222 // word, we cache the last query.
223 if (word_ != word) {
224 word_ = word;
225 Query query(word);
226 result_ = query.Hyphenate(dictionary_, &hyphen_offsets_);
227 }
228 if (!result_)
229 return 0;
230 for (std::vector<int>::reverse_iterator it = hyphen_offsets_.rbegin();
231 it != hyphen_offsets_.rend(); ++it) {
232 if (static_cast<size_t>(*it) < before_index)
233 return *it;
234 }
235 return 0;
236 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698