Chromium Code Reviews| Index: chrome/renderer/hyphenator/hyphenator.cc |
| =================================================================== |
| --- chrome/renderer/hyphenator/hyphenator.cc (revision 0) |
| +++ chrome/renderer/hyphenator/hyphenator.cc (revision 0) |
| @@ -0,0 +1,239 @@ |
| +// Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| +// Use of this source code is governed by a BSD-style license that can be |
| +// found in the LICENSE file. |
| + |
| +#include "chrome/renderer/hyphenator/hyphenator.h" |
| + |
| +#include <string> |
| + |
| +#include "base/file_util.h" |
| +#include "base/logging.h" |
| +#include "base/memory/scoped_ptr.h" |
| +#include "base/string_util.h" |
| +#include "base/utf_string_conversions.h" |
| +#include "third_party/hyphen/hyphen.h" |
| + |
| +namespace { |
| + |
| +// A class that converts a sequence of UTF-8 characters to UTF-16 ones and counts |
| +// the length of converted UTF-16 characters. This class is used for creating a |
| +// mapping from the position of a UTF-8 string to a position of a UTF-16 string |
| +// converted from the UTF-8 string. Even though the following snippet produces |
| +// the same mapping, it needs to convert same characters many times. This class |
| +// incrementally counts the number of converted UTF-16 characters to avoid |
| +// unnecessary conversions. |
| +// |
| +// scoped_array<size_t> position(new size_t[text.length()]); |
| +// for (size_t i = 0; i < text.length(); ++i) |
| +// position[i] = UTF8ToUTF16(text.substr(0, i)).length(); |
| +// |
| +class UTF16Position { |
| + public: |
| + UTF16Position(); |
| + ~UTF16Position(); |
| + |
| + // Returns the current position. |
| + int position() const { return position_; } |
| + |
| + // Appends one UTF-8 character to this converter and updates the converted |
| + // position. This converter increases the position by one when it finishes |
| + // reading a BMP character and increases by two when it finishes reading a |
| + // non-BMP character. |
| + bool Append(char c); |
| + |
| + private: |
| + // The number of converted UTF-16 characters. |
| + int position_; |
| + |
| + // A buffer that stores UTF-8 characters being converted. If this buffer does |
| + // not have any characters being converted, this value becomes 0xffffffff. |
| + // (This is an invalid UTF-8 character.) |
| + uint32 buf_; |
| +}; |
| + |
| +// Starts at position zero with the buffer set to its "empty" marker value. |
| +UTF16Position::UTF16Position() : position_(0), buf_(0xffffffff) {} |
| + |
| +// Nothing to release; this class holds only plain integer members. |
| +UTF16Position::~UTF16Position() {} |
| + |
| +// Feeds one UTF-8 byte to the converter. Returns true when |c| completes a |
| +// character (the position has been advanced by one or two UTF-16 units) and |
| +// false otherwise, in which case the position is left unchanged. |
| +bool UTF16Position::Append(char c) { |

tony
2012/03/05 18:59:00
Can we use U8_FWD_1 in unicode/utf8.h for this ins
Hironori Bono
2012/06/28 09:32:49
Thanks for your advice. Yes, it is better to use U
|
| + // Rotate our character buffer and append the input character. The byte is |
| + // converted through unsigned char first: where char is signed, a byte |
| + // >= 0x80 would otherwise be sign-extended to 0xFFFFFFxx and overwrite the |
| + // bytes already stored in the buffer. |
| + buf_ = (buf_ << 8) | static_cast<unsigned char>(c); |
| + |
| + // Increase the position by one when we finish reading a BMP character |
| + // (U+0000,...,U+FFFF), UTF-16 needs one character to represent a BMP |
| + // character. |
| + if ((buf_ & 0x80) == 0 || |
| + (buf_ & 0xe0c0) == 0xc080 || |
| + (buf_ & 0xf0c0c0) == 0xe08080) { |
| + ++position_; |
| + buf_ = 0xffffffff; |
| + return true; |
| + } |
| + // UTF-16 needs two characters to represent a Unicode character between |
| + // U+10000 and U+10FFFF. |
| + if ((buf_ & 0xf8c0c0c0) == 0xf0808080) { |
| + position_ += 2; |
| + buf_ = 0xffffffff; |
| + return true; |
| + } |
| + // The buffer does not hold a complete character yet: either we are in the |
| + // middle of a multi-byte sequence or the input is not a sequence UTF-16 can |
| + // represent. The position is not advanced. |
| + return false; |
| +} |
| + |
| +// A class that encapsulates a hyphenation query. This class owns resources |
|
tony
2012/03/05 18:59:00
Nit: THis -> This
Hironori Bono
2012/06/28 09:32:49
Done.
|
| +// temporarily needed for hyphenating one word, and deletes them when it is |
| +// deleted as listed in the following snippet. |
| +// |
| +// void GetHyphenationPoints(HyphenDict* dict, |
| +// const string16& word, |
| +// std::vector<int>* hyphens) { |
| +// Query query(word); |
| +// query.Hyphenate(dict, hyphens); |
| +// } |
| +// |
| +class Query { |
| + public: |
| + explicit Query(const string16& word); |
| + ~Query(); |
| + |
| + // Hyphenates a word with the specified dictionary. This function hyphenates |
| + // the word provided to its constructor and returns a list of hyphenation |
| + // points, positions where we can insert hyphens. The following snippet shows |
| + // how to insert hyphens with hyphenation points returned by this function. |
| + // |
| + // std::vector<int> hyphens; |
| + // GetHyphenationPoints(dict, word, &hyphens); |
|
tony
2012/03/05 18:59:00
This snippet doesn't seem to match the class.
Hironori Bono
2012/06/28 09:32:49
Done. Thanks for noticing it. I have updated this
|
| + // for (std::vector<int>::const_reverse_iterator it = hyphens.rbegin(); |
| + // it != hyphens.rend(); ++it) { |
| + // word.insert(*it, 1, '-'); |
| + // } |
| + // |
| + bool Hyphenate(HyphenDict* dictionary, std::vector<int>* hyphens); |
| + |
| + private: |
| + // A word to be hyphenated. |
| + std::string word_; |
|
tony
2012/03/05 18:59:00
Nit: Maybe name this word_utf8_ to make the encodi
Hironori Bono
2012/06/28 09:32:49
Done. Thanks for you suggestion.
|
| + |
| + // Return variables from the hyphen library. |
| + scoped_array<char> hyphens_; |
| + char** rep_; |
| + int* pos_; |
| + int* cut_; |
| +}; |
| + |
| +// Prepares a query for |word|: strips its trailing punctuation, stores the |
| +// remainder as UTF-8 and allocates the output buffer handed to the hyphen |
| +// library. |
| +Query::Query(const string16& word) |
| + : rep_(NULL), |
| + pos_(NULL), |
| + cut_(NULL) { |
| + // Remove trailing punctuation characters. WebKit does not remove these |
| + // characters when it hyphenates a word. These characters prevent the hyphen |
| + // library from applying some rules, i.e. they prevent the library from adding |
| + // hyphens. |
| + DCHECK(!word.empty()); |
| + static const char16 kPunctuationChars[] = { |
| + '!', '"', '\'', ',', '.', '(', ')', ':', ';', '?', '\0', |
| + }; |
| + // find_last_not_of() returns the index of the last character that is not |
| + // punctuation, so only the punctuation after it is dropped. find_last_of() |
| + // would instead cut the word at punctuation inside it ("don't" -> "don") and |
| + // strip only one character from a run of trailing punctuation. |
| + size_t pos = word.find_last_not_of(kPunctuationChars); |
| + size_t length = (pos != string16::npos) ? pos + 1 : 0; |
| + UTF16ToUTF8(word.c_str(), length, &word_); |
| + hyphens_.reset(new char[word_.length() + 5]); |

tony
2012/03/05 18:59:00
Why 5? Is that the max number of hyphens that will
Hironori Bono
2012/06/28 09:32:49
This is a magic number written in "hyphen.h" <http
|
| +} |
| + |
| +// Releases the buffers that Hyphenate() received from the hyphen library. |
| +// free() accepts NULL, so pointers that were never filled in need no guard. |
| +Query::~Query() { |
| + if (rep_) { |
| + // Free each per-character entry before the array that holds them. |
| + const size_t count = word_.length(); |
| + for (size_t index = 0; index < count; ++index) |
| + free(rep_[index]); |
| + free(rep_); |
| + } |
| + free(pos_); |
| + free(cut_); |
| +} |
| + |
| +bool Query::Hyphenate(HyphenDict* dictionary, std::vector<int>* hyphens) { |
| + DCHECK(dictionary); |
| + DCHECK(hyphens); |
| + |
| + int result = hnj_hyphen_hyphenate2(dictionary, |
|
tony
2012/03/05 18:59:00
Nit: result -> error_code?
Hironori Bono
2012/06/28 09:32:49
Done.
|
| + word_.data(), |
| + static_cast<int>(word_.length()), |
| + hyphens_.get(), |
| + NULL, |
| + &rep_, |
| + &pos_, |
| + &cut_); |
|
tony
2012/03/05 18:59:00
It looks like we can just pass in NULL since we do
Hironori Bono
2012/06/28 09:32:49
This function assumes these pointers are non-NULL
|
| + if (result) |
| + return false; |
| + |
| + // WebKit needs hyphenation points counted in UTF-16 characters. On the other |
| + // hand, the hyphen library returns hyphenation points counted in UTF-8 |
| + // characters. We incrementally convert hyphenation points in UTF-8 |
| + // characters to hyphenation points in UTF-16 characters and write the |
| + // converted hyphenation points to the output vector. |
|
tony
2012/03/05 18:59:00
It would be a nice TODO to switch the hyphen libra
|
| + UTF16Position converter; |
| + hyphens->clear(); |
| + for (size_t i = 0; i < word_.length(); ++i) { |
| + converter.Append(word_[i]); |
| + if (hyphens_[i] & 1) |
|
tony
2012/03/05 18:59:00
hyphens and hyphens_ make this code hard to follow
Hironori Bono
2012/06/28 09:32:49
Done, I have renamed them. (I could not figure out
|
| + hyphens->push_back(converter.position()); |
| + } |
| + return !hyphens->empty(); |
| +} |
| + |
| +} // namespace |
| + |
| +// Remembers the rule file; the dictionary itself is loaded by Initialize(). |
| +Hyphenator::Hyphenator(base::PlatformFile file) |
| + : dictionary_(NULL), rule_file_(file), result_(0) {} |
| + |
| +// Frees the hyphenation dictionary if one was loaded. |
| +Hyphenator::~Hyphenator() { |
| + if (dictionary_ != NULL) { |
| + hnj_hyphen_free(dictionary_); |
| + } |
| +} |
| + |
| +// Maps the rule file into memory and loads the hyphenation dictionary from it. |
| +// Returns true when a dictionary is available (including when it was already |
| +// loaded by an earlier call) and false when mapping or loading fails. |
| +bool Hyphenator::Initialize() { |
| + // A dictionary loaded by a previous call is reused as is. |
| + if (dictionary_ != NULL) |
| + return true; |
| + |
| + rule_map_.reset(new file_util::MemoryMappedFile); |
| + const bool mapped = rule_map_->Initialize(rule_file_); |
| + if (!mapped) |
| + return false; |
| + |
| + dictionary_ = hnj_hyphen_load(rule_map_->data(), rule_map_->length()); |
| + return dictionary_ != NULL; |
| +} |
| + |
| +// Returns the largest hyphenation point of |word| that is smaller than |
| +// |before_index|, or 0 when there is no dictionary, the word is empty, the word |
| +// cannot be hyphenated, or no hyphenation point lies before |before_index|. |
| +size_t Hyphenator::ComputeLastHyphenLocation(const string16& word, |
| + size_t before_index) { |
| + if (word.empty() || !dictionary_) |
| + return 0; |
| + |
| + // WebKit calls this function twice or more with the same word while it looks |
| + // for the best hyphenation point at a line-break. The last query is cached so |
| + // the hyphen library is called only when the word changes. |
| + if (word_ != word) { |
| + word_ = word; |
| + Query query(word); |
| + result_ = query.Hyphenate(dictionary_, &hyphens_); |
| + } |
| + if (!result_) |
| + return 0; |
| + |
| + // Walk the hyphenation points from the last one backwards and return the |
| + // first one that lies before |before_index|. |
| + for (size_t remaining = hyphens_.size(); remaining > 0; --remaining) { |
| + const int location = hyphens_[remaining - 1]; |
| + if (static_cast<size_t>(location) < before_index) |
| + return location; |
| + } |
| + return 0; |
| +} |
| Property changes on: chrome\renderer\hyphenator\hyphenator.cc |
| ___________________________________________________________________ |
| Added: svn:eol-style |
| + LF |