chrome/renderer/hyphenator/hyphenator.cc - Issue 9545017: Adds a hy-phen-ator.

Unified Diff: chrome/renderer/hyphenator/hyphenator.cc

Issue 9545017: Adds a hy-phen-ator. (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/

Patch Set: Created 8 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

« chrome/renderer/hyphenator/hyphenator.h ('K') | « chrome/renderer/hyphenator/hyphenator.h ('k') | chrome/renderer/hyphenator/hyphenator_unittest.cc » ('j') | third_party/hyphen/hyphen.gyp » ('J')
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: chrome/renderer/hyphenator/hyphenator.cc

===================================================================

--- chrome/renderer/hyphenator/hyphenator.cc (revision 0)

+++ chrome/renderer/hyphenator/hyphenator.cc (revision 0)

@@ -0,0 +1,239 @@

+// Use of this source code is governed by a BSD-style license that can be

+// found in the LICENSE file.

+#include "chrome/renderer/hyphenator/hyphenator.h"

+#include <string>

+#include "base/file_util.h"

+#include "base/logging.h"

+#include "base/memory/scoped_ptr.h"

+#include "base/string_util.h"

+#include "base/utf_string_conversions.h"

+#include "third_party/hyphen/hyphen.h"

+namespace {

+// A class that converts a sequence of UTF-8 characters to UTF-16 ones and counts

+// the length of converted UTF-16 characters. This class is used for creating a

+// mapping from the position of a UTF-8 string to a position of a UTF-16 string

+// converted from the UTF-8 string. Even though the following snippet produces

+// the same mapping, it needs to convert same characters many times. This class

+// incrementally counts the number of converted UTF-16 characters to avoid

+// unnecessary conversions.

+//

+// scoped_array<size_t> position(new size_t[text.length()]);

+// for (size_t i = 0; i < text.length(); ++i)

+// position[i] = UTF8ToUTF16(text.substr(0, i)).length();

+//

+class UTF16Position {

+ public:

+ UTF16Position();

+ ~UTF16Position();

+ // Returns the current position.

+ int position() const { return position_; }

+ // Appends one UTF-8 character to this converter and updates the converted

+ // position. This converter increases the position by one when it finishes

+ // reading a BMP character and increases by two when it finishes reading a

+ // non-BMP character.

+ bool Append(char c);

+ private:

+ // The number of converted UTF-16 characters.

+ int position_;

+ // A buffer that stores UTF-8 characters being converted. If this buffer does

+ // not have any characters being converted, this value becomes 0xffffffff.

+ // (This is an invalid UTF-8 character.)

+ uint32 buf_;

+};

+UTF16Position::UTF16Position()

+ : position_(0),

+ buf_(0xffffffff) {

+UTF16Position::~UTF16Position() {

+bool UTF16Position::Append(char c) {

tony 2012/03/05 18:59:00 Can we use U8_FWD_1 in unicode/utf8.h for this ins

Hironori Bono 2012/06/28 09:32:49 Thanks for your advice. Yes, it is better to use U

+ // Rotate our character buffer and append the input character.

+ buf_ = (buf_ << 8) | c;

+ // Increase the position by one when we finish reading a BMP character

+ // (U+0000,...,U+FFFF); UTF-16 needs one character to represent a BMP

+ // character.

+ if ((buf_ & 0x80) == 0 ||

+ (buf_ & 0xe0c0) == 0xc080 ||

+ (buf_ & 0xf0c0c0) == 0xe08080) {

+ ++position_;

+ buf_ = 0xffffffff;

+ return true;

+ }

+ // UTF-16 needs two characters to represent a Unicode character between U+10000

+ // and U+10FFFF.

+ if ((buf_ & 0xf8c0c0c0) == 0xf0808080) {

+ position_ += 2;

+ buf_ = 0xffffffff;

+ return true;

+ }

+ // UTF-16 cannot represent a Unicode character above U+10FFFF. We should abort

+ // this conversion.

+ return false;

+// A class that encapsulates a hyphenation query. This class owns resources

tony 2012/03/05 18:59:00 Nit: THis -> This

Hironori Bono 2012/06/28 09:32:49 Done.

+// temporarily needed for hyphenating one word, and deletes them when it is

+// deleted as listed in the following snippet.

+//

+// void GetHyphenationPoints(HyphenDict* dict,

+// const string16& word,

+// std::vector<int>* hyphens) {

+// Query query(word);

+// query.Hyphenate(dict, hyphens);

+// }

+//

+class Query {

+ public:

+ explicit Query(const string16& word);

+ ~Query();

+ // Hyphenates a word with the specified dictionary. This function hyphenates

+ // the word provided to its constructor and returns a list of hyphenation

+ // points, positions where we can insert hyphens. The following snippet shows

+ // how to insert hyphens with hyphenation points returned by this function.

+ //

+ // std::vector<int> hyphens;

+ // GetHyphenationPoints(dict, word, &hyphens);

tony 2012/03/05 18:59:00 This snippet doesn't seem to match the class.

Hironori Bono 2012/06/28 09:32:49 Done. Thanks for noticing it. I have updated this

+ // for (std::vector<int>::const_reverse_iterator it = hyphens.rbegin();

+ // it != hyphens.rend(); ++it) {

+ // word.insert(*it, 1, '-');

+ // }

+ //

+ bool Hyphenate(HyphenDict* dictionary, std::vector<int>* hyphens);

+ private:

+ // A word to be hyphenated.

+ std::string word_;

tony 2012/03/05 18:59:00 Nit: Maybe name this word_utf8_ to make the encodi

Hironori Bono 2012/06/28 09:32:49 Done. Thanks for you suggestion.

+ // Return variables from the hyphen library.

+ scoped_array<char> hyphens_;

+ char** rep_;

+ int* pos_;

+ int* cut_;

+};

+Query::Query(const string16& word)

+ : rep_(NULL),

+ pos_(NULL),

+ cut_(NULL) {

+ // Remove trailing punctuation characters. WebKit does not remove these

+ // characters when it hyphenates a word. These characters prevent the hyphen

+ // library from applying some rules, i.e. they prevent the library from adding

+ // hyphens.

+ DCHECK(!word.empty());

+ static const char16 kPunctuationChars[] = {

+ '!', '"', '\'', ',', '.', '(', ')', ':', ';', '?', '\0',

+ };

+ size_t pos = word.find_last_of(kPunctuationChars);

+ size_t length = (pos != string16::npos) ? pos : word.length();

+ UTF16ToUTF8(word.c_str(), length, &word_);

+ hyphens_.reset(new char[word_.length() + 5]);

tony 2012/03/05 18:59:00 Why 5? Is that the max number of hyphens that will

Hironori Bono 2012/06/28 09:32:49 This is a magic number written in "hyphen.h" <http

+Query::~Query() {

+ if (rep_) {

+ for (size_t i = 0; i < word_.length(); ++i) {

+ if (rep_[i])

+ free(rep_[i]);

+ }

+ free(rep_);

+ }

+ if (pos_)

+ free(pos_);

+ if (cut_)

+ free(cut_);

+bool Query::Hyphenate(HyphenDict* dictionary, std::vector<int>* hyphens) {

+ DCHECK(dictionary);

+ DCHECK(hyphens);

+ int result = hnj_hyphen_hyphenate2(dictionary,

tony 2012/03/05 18:59:00 Nit: result -> error_code?

Hironori Bono 2012/06/28 09:32:49 Done.

+ word_.data(),

+ static_cast<int>(word_.length()),

+ hyphens_.get(),

+ NULL,

+ &rep_,

+ &pos_,

+ &cut_);

tony 2012/03/05 18:59:00 It looks like we can just pass in NULL since we do

Hironori Bono 2012/06/28 09:32:49 This function assumes these pointers are non-NULL

+ if (result)

+ return false;

+ // WebKit needs hyphenation points counted in UTF-16 characters. On the other

+ // hand, the hyphen library returns hyphenation points counted in UTF-8

+ // characters. We incrementally convert hyphenation points in UTF-8

+ // characters to hyphenation points in UTF-16 characters and write the

+ // converted hyphenation points to the output vector.

tony 2012/03/05 18:59:00 It would be a nice TODO to switch the hyphen libra

+ UTF16Position converter;

+ hyphens->clear();

+ for (size_t i = 0; i < word_.length(); ++i) {

+ converter.Append(word_[i]);

+ if (hyphens_[i] & 1)

tony 2012/03/05 18:59:00 hyphens and hyphens_ make this code hard to follow

Hironori Bono 2012/06/28 09:32:49 Done, I have renamed them. (I could not figure out

+ hyphens->push_back(converter.position());

+ }

+ return !hyphens->empty();

+} // namespace

+Hyphenator::Hyphenator(base::PlatformFile file)

+ : dictionary_(NULL),

+ rule_file_(file),

+ result_(0) {

+Hyphenator::~Hyphenator() {

+ if (dictionary_)

+ hnj_hyphen_free(dictionary_);

+bool Hyphenator::Initialize() {

+ if (dictionary_)

+ return true;

+ rule_map_.reset(new file_util::MemoryMappedFile);

+ if (!rule_map_->Initialize(rule_file_))

+ return false;

+ dictionary_ = hnj_hyphen_load(rule_map_->data(), rule_map_->length());

+ return !!dictionary_;

+size_t Hyphenator::ComputeLastHyphenLocation(const string16& word,

+ size_t before_index) {

+ if (!dictionary_ || word.empty())

+ return 0;

+ // Call the hyphen library to get all hyphenation points, i.e. positions where

+ // we can insert hyphens. When WebKit finds a line-break, it calls this

+ // function twice or more with the same word to find the best hyphenation

+ // point. To avoid calling the hyphen library twice or more with the same

+ // word, we cache the last query.

+ if (word_ != word) {

+ word_ = word;

+ Query query(word);

+ result_ = query.Hyphenate(dictionary_, &hyphens_);

+ }

+ if (!result_)

+ return 0;

+ for (std::vector<int>::reverse_iterator it = hyphens_.rbegin();

+ it != hyphens_.rend(); ++it) {

+ if (static_cast<size_t>(*it) < before_index)

+ return *it;

+ }

+ return 0;

Property changes on: chrome\renderer\hyphenator\hyphenator.cc

___________________________________________________________________

Added: svn:eol-style

+ LF