Index: content/renderer/hyphenator/hyphenator.cc |
=================================================================== |
--- content/renderer/hyphenator/hyphenator.cc (revision 0) |
+++ content/renderer/hyphenator/hyphenator.cc (revision 0) |
@@ -0,0 +1,236 @@ |
+// Copyright (c) 2012 The Chromium Authors. All rights reserved. |
+// Use of this source code is governed by a BSD-style license that can be |
+// found in the LICENSE file. |
+ |
+#include "content/renderer/hyphenator/hyphenator.h" |
+ |
+#include <string> |
+ |
+#include "base/file_util.h" |
+#include "base/logging.h" |
+#include "base/memory/scoped_ptr.h" |
+#include "base/string_util.h" |
+#include "base/utf_string_conversions.h" |
+#include "third_party/hyphen/hyphen.h" |
+#include "unicode/uscript.h" |
+ |
+namespace { |
+ |
+// A class that converts a sequence of UTF-8 characters to UTF-16 ones and holds |
+// only the length of converted UTF-16 characters. This class is used for |
+// creating a mapping from the position of a UTF-8 string to a position of a |
+// UTF-16 string without unnecessary conversions. Even though the following |
+// snippet produces the same mapping, it needs to convert same characters many |
+// times. This class incrementally counts the number of converted UTF-16 |
+// characters to avoid this problem. |
+// |
+// scoped_array<size_t> position(new size_t[text.length()]); |
+// for (size_t i = 0; i < text.length(); ++i) |
+// position[i] = UTF8ToUTF16(text.substr(0, i)).length(); |
+// |
+class UTF16TextLength { |
+ public: |
+ UTF16TextLength(); |
+ ~UTF16TextLength(); |
+ |
+ // Returns the current position. |
+ int utf16_length() const { return utf16_length_; } |
+ |
+ // Appends one UTF-8 character to this converter and advances the converted |
+ // position. This converter increases the position by one when it finishes |
+ // reading a BMP character and increases by two when it finish reading a |
+ // non-BMP character. |
+ void Append(char c); |
+ |
+ private: |
+ // The length of the converted UTF-16 text. |
+ int utf16_length_; |
+ |
+ // The buffer that stores UTF-8 characters being converted. |
+ std::string utf8_text_; |
+}; |
jochen (gone - plz use gerrit)
2012/07/02 15:33:01
DISALLOW_COPY_AND_ASSIGN
Hironori Bono
2012/07/10 09:22:09
Done. Thanks for noticing it.
|
+ |
+UTF16TextLength::UTF16TextLength() |
+ : utf16_length_(0) { |
+} |
+ |
+UTF16TextLength::~UTF16TextLength() { |
+} |
+ |
+void UTF16TextLength::Append(char c) { |
+ // Append the given character and try converting the UTF-8 characters in this |
+ // buffer to Unicode codepoints. If this buffer includes a Unicode codepoint, |
+ // get the number of UTF-16 characters representing this codepoint and advance |
+ // the position. |
+ int code = 0; |
+ int index = 0; |
+ utf8_text_.push_back(c); |
+ U8_NEXT(utf8_text_.data(), index, static_cast<int>(utf8_text_.length()), |
+ code); |
+ if (code != U_SENTINEL) { |
+ utf8_text_.clear(); |
+ utf16_length_ += U16_LENGTH(code); |
+ } |
+} |
+ |
+// A class that encapsulates a hyphenation query. This class owns resources |
+// temporarily needed for hyphenating one word, and deletes them when it is |
+// deleted as listed in the following snippet. |
+// |
+// std::vector<int> hyphens; |
+// QUery query(UTF8ToUTF16("hyphenate")); |
+// query.Hyphenate(dict, &hyphens); |
+// |
+class Query { |
+ public: |
+ explicit Query(const string16& word); |
+ ~Query(); |
+ |
+ // Hyphenates a word with the specified dictionary. This function hyphenates |
+ // the word provided to its constructor and returns a list of hyphenation |
+ // points, positions where we can insert hyphens. The following snippet shows |
+ // how to insert hyphens with hyphenation points returned by this function. |
+ // |
+ // string16 word(UTF8ToUTF16("hyphenate")); |
+ // std::vector<int> hyphens; |
+ // Query query(word); |
+ // query.Hyphenate(dict, &hyphens); |
+ // for (std::vector<int>::const_reverse_iterator it = hyphens.rbegin(); |
+ // it != hyphens.rend(); ++it) { |
+ // word.insert(*it, 1, '-'); |
+ // } |
tony
2012/06/28 20:06:44
I don't think the code example is that useful as a
Hironori Bono
2012/07/10 09:22:09
Done. I have removed this redundant example.
|
+ // |
+ bool Hyphenate(HyphenDict* dictionary, std::vector<int>* hyphen_offsets); |
+ |
+ private: |
+ // A word to be hyphenated. |
+ std::string word_utf8_; |
+ |
+ // Return variables from the hyphen library. |
+ scoped_array<char> hyphen_vector_; |
+ char** rep_; |
+ int* pos_; |
+ int* cut_; |
+}; |
jochen (gone - plz use gerrit)
2012/07/02 15:33:01
DISALLOW_COPY_AND_ASSIGN
Hironori Bono
2012/07/10 09:22:09
Done. Thanks for noticing it.
|
+ |
+Query::Query(const string16& word) |
+ : rep_(NULL), |
+ pos_(NULL), |
+ cut_(NULL) { |
+ // Remove trailing punctuation characters. WebKit does not remove these |
+ // characters when it hyphenates a word. These characters prevent the hyphen |
+ // library from applying some rules, i.e. they prevent the library from adding |
+ // hyphens. |
+ DCHECK(!word.empty()); |
+ const char16* data = word.data(); |
+ int length = static_cast<int>(word.length()); |
+ while (length > 0) { |
+ int previous = length; |
+ int code = 0; |
+ U16_PREV(data, 0, previous, code); |
+ UErrorCode error = U_ZERO_ERROR; |
+ if (uscript_getScript(code, &error) != USCRIPT_COMMON) |
+ break; |
+ length = previous; |
+ } |
+ UTF16ToUTF8(word.c_str(), length, &word_utf8_); |
+ // Create a hyphen vector used by hnj_hyphen_hyphenate2(). We allocate a |
+ // buffer of |word_.length()| + 5 as written in Line 112 of |
+ // <http://cs.chromium.org/src/third_party/hyphen/hyphen.h>. |
+ hyphen_vector_.reset(new char[word_utf8_.length() + 5]); |
+} |
+ |
+Query::~Query() { |
+ if (rep_) { |
+ for (size_t i = 0; i < word_utf8_.length(); ++i) { |
+ if (rep_[i]) |
+ free(rep_[i]); |
tony
2012/06/28 20:06:44
Nit: Should we be using hnj_free here and below?
Hironori Bono
2012/07/10 09:22:09
Thanks for your comment. The hyphen library uses h
|
+ } |
+ free(rep_); |
+ } |
+ if (pos_) |
+ free(pos_); |
+ if (cut_) |
+ free(cut_); |
+} |
+ |
+bool Query::Hyphenate(HyphenDict* dictionary, |
+ std::vector<int>* hyphen_offsets) { |
+ DCHECK(dictionary); |
+ DCHECK(hyphen_offsets); |
+ |
+ int error_code = hnj_hyphen_hyphenate2(dictionary, |
+ word_utf8_.data(), |
+ static_cast<int>(word_utf8_.length()), |
+ hyphen_vector_.get(), |
+ NULL, |
+ &rep_, |
+ &pos_, |
+ &cut_); |
+ if (error_code) |
+ return false; |
+ |
+ // WebKit needs hyphenation points counted in UTF-16 characters. On the other |
+ // hand, the hyphen library returns hyphenation points counted in UTF-8 |
+ // characters. We increamentally convert hyphenation points in UTF-8 |
+ // characters to hyphenation points in UTF-16 characters and write the |
+ // converted hyphenation points to the output vector. |
+ UTF16TextLength text_length; |
+ hyphen_offsets->clear(); |
+ for (size_t i = 0; i < word_utf8_.length(); ++i) { |
+ text_length.Append(word_utf8_[i]); |
+ if (hyphen_vector_[i] & 1) |
+ hyphen_offsets->push_back(text_length.utf16_length()); |
+ } |
+ return !hyphen_offsets->empty(); |
+} |
+ |
+} // namespace |
+ |
+Hyphenator::Hyphenator(base::PlatformFile file) |
+ : dictionary_(NULL), |
+ rule_file_(file), |
+ result_(0) { |
+} |
+ |
+Hyphenator::~Hyphenator() { |
+ if (dictionary_) |
+ hnj_hyphen_free(dictionary_); |
+} |
+ |
+bool Hyphenator::Initialize() { |
+ if (dictionary_) |
+ return true; |
+ |
+ rule_map_.reset(new file_util::MemoryMappedFile); |
+ if (!rule_map_->Initialize(rule_file_)) |
tony
2012/06/28 20:06:44
If this is in the sandboxed renderer process, how
Hironori Bono
2012/07/10 09:22:09
Our spellchecker opens dictionary in a browser pro
|
+ return false; |
+ |
+ dictionary_ = hnj_hyphen_load(rule_map_->data(), rule_map_->length()); |
+ return !!dictionary_; |
+} |
+ |
+size_t Hyphenator::ComputeLastHyphenLocation(const string16& word, |
+ size_t before_index) { |
+ if (!dictionary_ || word.empty()) |
+ return 0; |
+ |
+ // Call the hyphen library to get all hyphenation points, i.e. positions where |
+ // we can insert hyphens. When WebKit finds a line-break, it calls this |
+ // function twice or more with the same word to find the best hyphenation |
+ // point. To avoid calling the hyphen library twice or more with the same |
+ // word, we cache the last query. |
+ if (word_ != word) { |
+ word_ = word; |
+ Query query(word); |
+ result_ = query.Hyphenate(dictionary_, &hyphen_offsets_); |
+ } |
+ if (!result_) |
+ return 0; |
+ for (std::vector<int>::reverse_iterator it = hyphen_offsets_.rbegin(); |
+ it != hyphen_offsets_.rend(); ++it) { |
+ if (static_cast<size_t>(*it) < before_index) |
+ return *it; |
+ } |
+ return 0; |
+} |
Property changes on: content\renderer\hyphenator\hyphenator.cc |
___________________________________________________________________ |
Added: svn:eol-style |
+ LF |