Chromium Code Reviews
|
| OLD | NEW |
|---|---|
| (Empty) | |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #include "chrome/renderer/hyphenator/hyphenator.h" | |
| 6 | |
| 7 #include <string> | |
| 8 | |
| 9 #include "base/file_util.h" | |
| 10 #include "base/logging.h" | |
| 11 #include "base/memory/scoped_ptr.h" | |
| 12 #include "base/string_util.h" | |
| 13 #include "base/utf_string_conversions.h" | |
| 14 #include "third_party/hyphen/hyphen.h" | |
| 15 | |
| 16 namespace { | |
| 17 | |
| 18 // A class that converts a sequence of UTF-8 characters to UTF-16 ones and counts | |
| 19 // the length of converted UTF-16 characters. This class is used for creating a | |
| 20 // mapping from the position of a UTF-8 string to a position of a UTF-16 string | |
| 21 // converted from the UTF-8 string. Even though the following snippet produces | |
| 22 // the same mapping, it needs to convert the same characters many times. This class | |
| 23 // incrementally counts the number of converted UTF-16 characters to avoid | |
| 24 // unnecessary conversions. | |
| 25 // | |
| 26 // scoped_array<size_t> position(new size_t[text.length()]); | |
| 27 // for (size_t i = 0; i < text.length(); ++i) | |
| 28 // position[i] = UTF8ToUTF16(text.substr(0, i)).length(); | |
| 29 // | |
| 30 class UTF16Position { | |
| 31 public: | |
| 32 UTF16Position(); | |
| 33 ~UTF16Position(); | |
| 34 | |
| 35 // Returns the current position. | |
| 36 int position() const { return position_; } | |
| 37 | |
| 38 // Appends one UTF-8 character to this converter and updates the converted | |
| 39 // position. This converter increases the position by one when it finishes | |
| 40 // reading a BMP character and increases it by two when it finishes reading a | |
| 41 // non-BMP character. | |
| 42 bool Append(char c); | |
| 43 | |
| 44 private: | |
| 45 // The number of converted UTF-16 characters. | |
| 46 int position_; | |
| 47 | |
| 48 // A buffer that stores UTF-8 characters being converted. If this buffer does | |
| 49 // not have any characters being converted, this value becomes 0xffffffff. | |
| 50 // (This is an invalid UTF-8 character.) | |
| 51 uint32 buf_; | |
| 52 }; | |
| 53 | |
| 54 UTF16Position::UTF16Position() | |
| 55 : position_(0), | |
| 56 buf_(0xffffffff) { | |
| 57 } | |
| 58 | |
| 59 UTF16Position::~UTF16Position() { | |
| 60 } | |
| 61 | |
| 62 bool UTF16Position::Append(char c) { | |
|
tony
2012/03/05 18:59:00
Can we use U8_FWD_1 in unicode/utf8.h for this ins
Hironori Bono
2012/06/28 09:32:49
Thanks for your advice. Yes, it is better to use U
| |
| 63 // Rotate our character buffer and append the input character. | |
| 64 buf_ = (buf_ << 8) | c; | |
| 65 | |
| 66 // Increase the position by one when we finish reading a BMP character | |
| 67 // (U+0000,...,U+FFFF), because UTF-16 needs one character to represent a BMP | |
| 68 // character. | |
| 69 if ((buf_ & 0x80) == 0 || | |
| 70 (buf_ & 0xe0c0) == 0xc080 || | |
| 71 (buf_ & 0xf0c0c0) == 0xe08080) { | |
| 72 ++position_; | |
| 73 buf_ = 0xffffffff; | |
| 74 return true; | |
| 75 } | |
| 76 // UTF-16 needs two characters to represent a Unicode character between U+10000 | |
| 77 // and U+10FFFF. | |
| 78 if ((buf_ & 0xf8c0c0c0) == 0xf0808080) { | |
| 79 position_ += 2; | |
| 80 buf_ = 0xffffffff; | |
| 81 return true; | |
| 82 } | |
| 83 // Otherwise the buffer does not hold a complete character yet, or it holds an | |
| 84 // invalid one (UTF-16 cannot represent a character above U+10FFFF), so report that no character was completed. | |
| 85 return false; | |
| 86 } | |
| 87 | |
| 88 // A class that encapsulates a hyphenation query. THis class owns resources | |
|
tony
2012/03/05 18:59:00
Nit: THis -> This
Hironori Bono
2012/06/28 09:32:49
Done.
| |
| 89 // temporarily needed for hyphenating one word, and deletes them when it is | |
| 90 // deleted as listed in the following snippet. | |
| 91 // | |
| 92 // void GetHyphenationPoints(HyphenDict* dict, | |
| 93 // const string16& word, | |
| 94 // std::vector<int>* hyphens) { | |
| 95 // Query query(word); | |
| 96 // query.Hyphenate(dict, hyphens); | |
| 97 // } | |
| 98 // | |
| 99 class Query { | |
| 100 public: | |
| 101 explicit Query(const string16& word); | |
| 102 ~Query(); | |
| 103 | |
| 104 // Hyphenates a word with the specified dictionary. This function hyphenates | |
| 105 // the word provided to its constructor and returns a list of hyphenation | |
| 106 // points, positions where we can insert hyphens. The following snippet shows | |
| 107 // how to insert hyphens with hyphenation points returned by this function. | |
| 108 // | |
| 109 // std::vector<int> hyphens; | |
| 110 // GetHyphenationPoints(dict, word, &hyphens); | |
|
tony
2012/03/05 18:59:00
This snippet doesn't seem to match the class.
Hironori Bono
2012/06/28 09:32:49
Done. Thanks for noticing it. I have updated this
| |
| 111 // for (std::vector<int>::const_reverse_iterator it = hyphens.rbegin(); | |
| 112 // it != hyphens.rend(); ++it) { | |
| 113 // word.insert(*it, 1, '-'); | |
| 114 // } | |
| 115 // | |
| 116 bool Hyphenate(HyphenDict* dictionary, std::vector<int>* hyphens); | |
| 117 | |
| 118 private: | |
| 119 // A word to be hyphenated. | |
| 120 std::string word_; | |
|
tony
2012/03/05 18:59:00
Nit: Maybe name this word_utf8_ to make the encodi
Hironori Bono
2012/06/28 09:32:49
Done. Thanks for your suggestion.
| |
| 121 | |
| 122 // Return variables from the hyphen library. | |
| 123 scoped_array<char> hyphens_; | |
| 124 char** rep_; | |
| 125 int* pos_; | |
| 126 int* cut_; | |
| 127 }; | |
| 128 | |
| 129 Query::Query(const string16& word) | |
| 130 : rep_(NULL), | |
| 131 pos_(NULL), | |
| 132 cut_(NULL) { | |
| 133 // Remove trailing punctuation characters. WebKit does not remove these | |
| 134 // characters when it hyphenates a word. These characters prevent the hyphen | |
| 135 // library from applying some rules, i.e. they prevent the library from adding | |
| 136 // hyphens. | |
| 137 DCHECK(!word.empty()); | |
| 138 static const char16 kPunctuationChars[] = { | |
| 139 '!', '"', '\'', ',', '.', '(', ')', ':', ';', '?', '\0', | |
| 140 }; | |
| 141 size_t pos = word.find_last_of(kPunctuationChars); | |
| 142 size_t length = (pos != string16::npos) ? pos : word.length(); | |
| 143 UTF16ToUTF8(word.c_str(), length, &word_); | |
| 144 hyphens_.reset(new char[word_.length() + 5]); | |
|
tony
2012/03/05 18:59:00
Why 5? Is that the max number of hyphens that will
Hironori Bono
2012/06/28 09:32:49
This is a magic number written in "hyphen.h" <http
| |
| 145 } | |
| 146 | |
| 147 Query::~Query() { | |
| 148 if (rep_) { | |
| 149 for (size_t i = 0; i < word_.length(); ++i) { | |
| 150 if (rep_[i]) | |
| 151 free(rep_[i]); | |
| 152 } | |
| 153 free(rep_); | |
| 154 } | |
| 155 if (pos_) | |
| 156 free(pos_); | |
| 157 if (cut_) | |
| 158 free(cut_); | |
| 159 } | |
| 160 | |
| 161 bool Query::Hyphenate(HyphenDict* dictionary, std::vector<int>* hyphens) { | |
| 162 DCHECK(dictionary); | |
| 163 DCHECK(hyphens); | |
| 164 | |
| 165 int result = hnj_hyphen_hyphenate2(dictionary, | |
|
tony
2012/03/05 18:59:00
Nit: result -> error_code?
Hironori Bono
2012/06/28 09:32:49
Done.
| |
| 166 word_.data(), | |
| 167 static_cast<int>(word_.length()), | |
| 168 hyphens_.get(), | |
| 169 NULL, | |
| 170 &rep_, | |
| 171 &pos_, | |
| 172 &cut_); | |
|
tony
2012/03/05 18:59:00
It looks like we can just pass in NULL since we do
Hironori Bono
2012/06/28 09:32:49
This function assumes these pointers are non-NULL
| |
| 173 if (result) | |
| 174 return false; | |
| 175 | |
| 176 // WebKit needs hyphenation points counted in UTF-16 characters. On the other | |
| 177 // hand, the hyphen library returns hyphenation points counted in UTF-8 | |
| 178 // characters. We incrementally convert hyphenation points in UTF-8 | |
| 179 // characters to hyphenation points in UTF-16 characters and write the | |
| 180 // converted hyphenation points to the output vector. | |
|
tony
2012/03/05 18:59:00
It would be a nice TODO to switch the hyphen libra
| |
| 181 UTF16Position converter; | |
| 182 hyphens->clear(); | |
| 183 for (size_t i = 0; i < word_.length(); ++i) { | |
| 184 converter.Append(word_[i]); | |
| 185 if (hyphens_[i] & 1) | |
|
tony
2012/03/05 18:59:00
hyphens and hyphens_ make this code hard to follow
Hironori Bono
2012/06/28 09:32:49
Done, I have renamed them. (I could not figure out
| |
| 186 hyphens->push_back(converter.position()); | |
| 187 } | |
| 188 return !hyphens->empty(); | |
| 189 } | |
| 190 | |
| 191 } // namespace | |
| 192 | |
| 193 Hyphenator::Hyphenator(base::PlatformFile file) | |
| 194 : dictionary_(NULL), | |
| 195 rule_file_(file), | |
| 196 result_(0) { | |
| 197 } | |
| 198 | |
| 199 Hyphenator::~Hyphenator() { | |
| 200 if (dictionary_) | |
| 201 hnj_hyphen_free(dictionary_); | |
| 202 } | |
| 203 | |
| 204 bool Hyphenator::Initialize() { | |
| 205 if (dictionary_) | |
| 206 return true; | |
| 207 | |
| 208 rule_map_.reset(new file_util::MemoryMappedFile); | |
| 209 if (!rule_map_->Initialize(rule_file_)) | |
| 210 return false; | |
| 211 | |
| 212 dictionary_ = hnj_hyphen_load(rule_map_->data(), rule_map_->length()); | |
| 213 return !!dictionary_; | |
| 214 } | |
| 215 | |
| 216 size_t Hyphenator::ComputeLastHyphenLocation(const string16& word, | |
| 217 size_t before_index) { | |
| 218 if (!dictionary_ || word.empty()) | |
| 219 return 0; | |
| 220 | |
| 221 // Call the hyphen library to get all hyphenation points, i.e. positions where | |
| 222 // we can insert hyphens. When WebKit finds a line-break, it calls this | |
| 223 // function twice or more with the same word to find the best hyphenation | |
| 224 // point. To avoid calling the hyphen library twice or more with the same | |
| 225 // word, we cache the last query. | |
| 226 if (word_ != word) { | |
| 227 word_ = word; | |
| 228 Query query(word); | |
| 229 result_ = query.Hyphenate(dictionary_, &hyphens_); | |
| 230 } | |
| 231 if (!result_) | |
| 232 return 0; | |
| 233 for (std::vector<int>::reverse_iterator it = hyphens_.rbegin(); | |
| 234 it != hyphens_.rend(); ++it) { | |
| 235 if (static_cast<size_t>(*it) < before_index) | |
| 236 return *it; | |
| 237 } | |
| 238 return 0; | |
| 239 } | |
| OLD | NEW |