content/renderer/hyphenator/hyphenator.cc - Issue 9545017: Adds a hy-phen-ator.

Unified Diff: content/renderer/hyphenator/hyphenator.cc

Issue 9545017: Adds a hy-phen-ator. (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/

Patch Set: Created 8 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

« content/renderer/hyphenator/hyphenator.h ('K') | « content/renderer/hyphenator/hyphenator.h ('k') | content/renderer/hyphenator/hyphenator_unittest.cc » ('j') | content/renderer/hyphenator/hyphenator_unittest.cc » ('J')
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: content/renderer/hyphenator/hyphenator.cc

===================================================================

--- content/renderer/hyphenator/hyphenator.cc (revision 0)

+++ content/renderer/hyphenator/hyphenator.cc (revision 0)

@@ -0,0 +1,236 @@

+// Use of this source code is governed by a BSD-style license that can be

+// found in the LICENSE file.

+#include "content/renderer/hyphenator/hyphenator.h"

+#include <string>

+#include "base/file_util.h"

+#include "base/logging.h"

+#include "base/memory/scoped_ptr.h"

+#include "base/string_util.h"

+#include "base/utf_string_conversions.h"

+#include "third_party/hyphen/hyphen.h"

+#include "unicode/uscript.h"

+namespace {

+// A class that converts a sequence of UTF-8 characters to UTF-16 ones and holds

+// only the length of converted UTF-16 characters. This class is used for

+// creating a mapping from the position of a UTF-8 string to a position of a

+// UTF-16 string without unnecessary conversions. Even though the following

+// snippet produces the same mapping, it needs to convert the same characters many

+// times. This class incrementally counts the number of converted UTF-16

+// characters to avoid this problem.

+//

+// scoped_array<size_t> position(new size_t[text.length()]);

+// for (size_t i = 0; i < text.length(); ++i)

+// position[i] = UTF8ToUTF16(text.substr(0, i)).length();

+//

+class UTF16TextLength {

+ public:

+ UTF16TextLength();

+ ~UTF16TextLength();

+ // Returns the number of UTF-16 code units counted so far.

+ int utf16_length() const { return utf16_length_; }

+ // Appends one byte of a UTF-8 sequence to this converter and advances the

+ // converted position once the buffered bytes decode to a code point. The

+ // position increases by one when the converter finishes reading a BMP

+ // character and by two when it finishes reading a non-BMP character.

+ void Append(char c);

+ private:

+ // The length of the converted UTF-16 text.

+ int utf16_length_;

+ // The buffer that stores the UTF-8 bytes that have not decoded to a code

+ // point yet. Append() clears it each time a code point is decoded.

+ std::string utf8_text_;

+};

jochen (gone - plz use gerrit) 2012/07/02 15:33:01 DISALLOW_COPY_AND_ASSIGN

Hironori Bono 2012/07/10 09:22:09 Done. Thanks for noticing it.

+// Starts counting from a UTF-16 length of zero.

+UTF16TextLength::UTF16TextLength()

+ : utf16_length_(0) {

+// Destructor. No explicit cleanup statements are shown for it here.

+UTF16TextLength::~UTF16TextLength() {

+void UTF16TextLength::Append(char c) {

+ // Append the given byte and try converting the UTF-8 bytes in this buffer to

+ // a Unicode codepoint. If this buffer includes a Unicode codepoint, get the

+ // number of UTF-16 characters representing this codepoint and advance the

+ // position. Otherwise the bytes stay buffered for the next call.

+ // NOTE(review): bytes that never decode are never dropped from the buffer;

+ // presumably callers only feed valid UTF-8 -- confirm.

+ int code = 0;

+ int index = 0;

+ utf8_text_.push_back(c);

+ U8_NEXT(utf8_text_.data(), index, static_cast<int>(utf8_text_.length()),

+ code);

+ // Anything other than U_SENTINEL is treated as a decoded code point.

+ if (code != U_SENTINEL) {

+ utf8_text_.clear();

+ utf16_length_ += U16_LENGTH(code);

+ }

+// A class that encapsulates a hyphenation query. This class owns resources

+// temporarily needed for hyphenating one word, and deletes them when it is

+// deleted as listed in the following snippet.

+//

+// std::vector<int> hyphens;

+// Query query(UTF8ToUTF16("hyphenate"));

+// query.Hyphenate(dict, &hyphens);

+//

+class Query {

+ public:

+ explicit Query(const string16& word);

+ ~Query();

+ // Hyphenates a word with the specified dictionary. This function hyphenates

+ // the word provided to its constructor and returns a list of hyphenation

+ // points, positions where we can insert hyphens. The following snippet shows

+ // how to insert hyphens with hyphenation points returned by this function.

+ //

+ // string16 word(UTF8ToUTF16("hyphenate"));

+ // std::vector<int> hyphens;

+ // Query query(word);

+ // query.Hyphenate(dict, &hyphens);

+ // for (std::vector<int>::const_reverse_iterator it = hyphens.rbegin();

+ // it != hyphens.rend(); ++it) {

+ // word.insert(*it, 1, '-');

+ // }

tony 2012/06/28 20:06:44 I don't think the code example is that useful as a

Hironori Bono 2012/07/10 09:22:09 Done. I have removed this redundant example.

+ //

+ // Returns true when at least one hyphenation point was written to

+ // |hyphen_offsets|, and false when the hyphen library reports an error or

+ // when no hyphenation point was found.

+ bool Hyphenate(HyphenDict* dictionary, std::vector<int>* hyphen_offsets);

+ private:

+ // The word to be hyphenated, converted to UTF-8 by the constructor after

+ // trailing USCRIPT_COMMON characters have been dropped.

+ std::string word_utf8_;

+ // Return variables from the hyphen library. |hyphen_vector_| is allocated by

+ // the constructor; |rep_|, |pos_| and |cut_| start as NULL, are passed by

+ // address to hnj_hyphen_hyphenate2(), and are released with free() by the

+ // destructor.

+ scoped_array<char> hyphen_vector_;

+ char** rep_;

+ int* pos_;

+ int* cut_;

+};

jochen (gone - plz use gerrit) 2012/07/02 15:33:01 DISALLOW_COPY_AND_ASSIGN

Hironori Bono 2012/07/10 09:22:09 Done. Thanks for noticing it.

+// Prepares |word| for hyphenation: strips trailing USCRIPT_COMMON characters,

+// converts the rest to UTF-8, and allocates the buffer for the hyphen library.

+Query::Query(const string16& word)

+ : rep_(NULL),

+ pos_(NULL),

+ cut_(NULL) {

+ // Remove trailing punctuation characters (characters whose script is

+ // USCRIPT_COMMON). WebKit does not remove these characters when it

+ // hyphenates a word. These characters prevent the hyphen library from

+ // applying some rules, i.e. they prevent the library from adding hyphens.

+ // NOTE(review): if every character is USCRIPT_COMMON, |length| reaches 0 and

+ // |word_utf8_| ends up empty.

+ DCHECK(!word.empty());

+ const char16* data = word.data();

+ int length = static_cast<int>(word.length());

+ while (length > 0) {

+ int previous = length;

+ int code = 0;

+ // Reads the code point that ends at |previous| and moves |previous| back to

+ // its start.

+ U16_PREV(data, 0, previous, code);

+ // NOTE(review): |error| is not inspected after the call.

+ UErrorCode error = U_ZERO_ERROR;

+ if (uscript_getScript(code, &error) != USCRIPT_COMMON)

+ break;

+ length = previous;

+ }

+ UTF16ToUTF8(word.c_str(), length, &word_utf8_);

+ // Create a hyphen vector used by hnj_hyphen_hyphenate2(). We allocate a

+ // buffer of |word_utf8_.length()| + 5 as written in Line 112 of

+ // <http://cs.chromium.org/src/third_party/hyphen/hyphen.h>.

+ hyphen_vector_.reset(new char[word_utf8_.length() + 5]);

+// Releases the arrays that hnj_hyphen_hyphenate2() stored in |rep_|, |pos_| and

+// |cut_|. All three are NULL when Hyphenate() was never called.

+Query::~Query() {

+ if (rep_) {

+ // Frees each non-NULL entry, one slot per byte of |word_utf8_|, and then the

+ // array itself.

+ for (size_t i = 0; i < word_utf8_.length(); ++i) {

+ if (rep_[i])

+ free(rep_[i]);

tony 2012/06/28 20:06:44 Nit: Should we be using hnj_free here and below?

Hironori Bono 2012/07/10 09:22:09 Thanks for your comment. The hyphen library uses h

+ }

+ free(rep_);

+ }

+ if (pos_)

+ free(pos_);

+ if (cut_)

+ free(cut_);

+// Runs the hyphen library on |word_utf8_| and writes the hyphenation points,

+// counted in UTF-16 code units, to |hyphen_offsets|. Returns false when the

+// library reports an error or when no hyphenation point is found.

+bool Query::Hyphenate(HyphenDict* dictionary,

+ std::vector<int>* hyphen_offsets) {

+ DCHECK(dictionary);

+ DCHECK(hyphen_offsets);

+ int error_code = hnj_hyphen_hyphenate2(dictionary,

+ word_utf8_.data(),

+ static_cast<int>(word_utf8_.length()),

+ hyphen_vector_.get(),

+ NULL,

+ &rep_,

+ &pos_,

+ &cut_);

+ // NOTE(review): on this error path |hyphen_offsets| is returned without being

+ // cleared.

+ if (error_code)

+ return false;

+ // WebKit needs hyphenation points counted in UTF-16 characters. On the other

+ // hand, the hyphen library returns hyphenation points counted in UTF-8

+ // characters. We incrementally convert hyphenation points in UTF-8

+ // characters to hyphenation points in UTF-16 characters and write the

+ // converted hyphenation points to the output vector.

+ UTF16TextLength text_length;

+ hyphen_offsets->clear();

+ for (size_t i = 0; i < word_utf8_.length(); ++i) {

+ text_length.Append(word_utf8_[i]);

+ // An odd value is treated as a hyphenation point after byte |i| --

+ // presumably the hyphen library's convention; confirm against hyphen.h.

+ if (hyphen_vector_[i] & 1)

+ hyphen_offsets->push_back(text_length.utf16_length());

+ }

+ return !hyphen_offsets->empty();

+} // namespace

+// Stores the handle of the rule file. No dictionary is loaded here;

+// |dictionary_| stays NULL until Initialize() loads it.

+Hyphenator::Hyphenator(base::PlatformFile file)

+ : dictionary_(NULL),

+ rule_file_(file),

+ result_(0) {

+// Frees the dictionary with hnj_hyphen_free() if one was loaded.

+Hyphenator::~Hyphenator() {

+ if (dictionary_)

+ hnj_hyphen_free(dictionary_);

+// Memory-maps the rule file and loads the hyphenation dictionary from it.

+// Returns true when a dictionary is available, including when one was already

+// loaded by an earlier call; returns false when mapping or loading fails.

+bool Hyphenator::Initialize() {

+ if (dictionary_)

+ return true;

+ rule_map_.reset(new file_util::MemoryMappedFile);

+ if (!rule_map_->Initialize(rule_file_))

tony 2012/06/28 20:06:44 If this is in the sandboxed renderer process, how

Hironori Bono 2012/07/10 09:22:09 Our spellchecker opens dictionary in a browser pro

+ return false;

+ dictionary_ = hnj_hyphen_load(rule_map_->data(), rule_map_->length());

+ return !!dictionary_;

+// Returns the last hyphenation point of |word| that is strictly less than

+// |before_index|, counted in UTF-16 code units. Returns 0 when no dictionary

+// is loaded, when |word| is empty, when hyphenation fails, or when no

+// hyphenation point lies before |before_index|.

+size_t Hyphenator::ComputeLastHyphenLocation(const string16& word,

+ size_t before_index) {

+ if (!dictionary_ || word.empty())

+ return 0;

+ // Call the hyphen library to get all hyphenation points, i.e. positions where

+ // we can insert hyphens. When WebKit finds a line-break, it calls this

+ // function twice or more with the same word to find the best hyphenation

+ // point. To avoid calling the hyphen library twice or more with the same

+ // word, we cache the last query.

+ if (word_ != word) {

+ word_ = word;

+ Query query(word);

+ result_ = query.Hyphenate(dictionary_, &hyphen_offsets_);

+ }

+ if (!result_)

+ return 0;

+ // Walk the cached offsets from the back and return the first one that is

+ // smaller than |before_index|.

+ for (std::vector<int>::reverse_iterator it = hyphen_offsets_.rbegin();

+ it != hyphen_offsets_.rend(); ++it) {

+ if (static_cast<size_t>(*it) < before_index)

+ return *it;

+ }

+ return 0;

Property changes on: content\renderer\hyphenator\hyphenator.cc

___________________________________________________________________

Added: svn:eol-style

+ LF