Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(379)

Unified Diff: chrome/renderer/hyphenator/hyphenator.cc

Issue 9545017: Adds a hy-phen-ator. (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/
Patch Set: Created 8 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: chrome/renderer/hyphenator/hyphenator.cc
===================================================================
--- chrome/renderer/hyphenator/hyphenator.cc (revision 0)
+++ chrome/renderer/hyphenator/hyphenator.cc (revision 0)
@@ -0,0 +1,239 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "chrome/renderer/hyphenator/hyphenator.h"
+
+#include <string>
+
+#include "base/file_util.h"
+#include "base/logging.h"
+#include "base/memory/scoped_ptr.h"
+#include "base/string_util.h"
+#include "base/utf_string_conversions.h"
+#include "third_party/hyphen/hyphen.h"
+
+namespace {
+
+// A class that reads a UTF-8 string one byte at a time and counts how many
+// UTF-16 characters the characters read so far would occupy once converted.
+// This class is used for creating a mapping from a position in a UTF-8 string
+// to a position in the UTF-16 string converted from that UTF-8 string. Even
+// though the following snippet produces the same mapping, it needs to convert
+// the same characters many times. This class incrementally counts the number
+// of converted UTF-16 characters to avoid unnecessary conversions.
+//
+// scoped_array<size_t> position(new size_t[text.length()]);
+// for (size_t i = 0; i < text.length(); ++i)
+//   position[i] = UTF8ToUTF16(text.substr(0, i)).length();
+//
+class UTF16Position {
+ public:
+ UTF16Position();
+ ~UTF16Position();
+
+ // Returns the current position, i.e. the number of UTF-16 characters counted
+ // so far.
+ int position() const { return position_; }
+
+ // Appends one byte of a UTF-8 string to this converter and updates the
+ // converted position. This converter increases the position by one when it
+ // finishes reading a BMP character and increases it by two when it finishes
+ // reading a non-BMP character. Returns true when |c| completes a character
+ // (and the position has been advanced), false otherwise.
+ bool Append(char c);
+
+ private:
+ // The number of converted UTF-16 characters.
+ int position_;
+
+ // A buffer that stores the bytes of the UTF-8 character being read; each
+ // appended byte is shifted in at the low end. If this buffer does not have
+ // any character being read, this value becomes 0xffffffff. (This is an
+ // invalid UTF-8 character.)
+ uint32 buf_;
+};
+
+// Starts counting from position zero with no UTF-8 character in progress
+// (0xffffffff is the "empty buffer" marker).
+UTF16Position::UTF16Position() : position_(0), buf_(0xffffffff) {}
+
+// Nothing to release: this class only holds plain integer state.
+UTF16Position::~UTF16Position() {}
+
+// Feeds one byte of a UTF-8 string into the converter. Returns true when |c|
+// is the last byte of a character, in which case the position has been
+// advanced by the number of UTF-16 characters needed to represent it (one for
+// a BMP character, two for a non-BMP one). Returns false while a multi-byte
+// character is still incomplete or when the buffered bytes do not form a
+// recognized sequence.
+bool UTF16Position::Append(char c) {
tony 2012/03/05 18:59:00 Can we use U8_FWD_1 in unicode/utf8.h for this ins
Hironori Bono 2012/06/28 09:32:49 Thanks for your advice. Yes, it is better to use U
+  // Rotate our character buffer and append the input character. |c| must be
+  // widened as an unsigned value: on platforms where char is signed, a byte
+  // >= 0x80 would otherwise be sign-extended and overwrite the bytes already
+  // buffered with 1-bits, so no multi-byte character would ever match below.
+  buf_ = (buf_ << 8) | static_cast<unsigned char>(c);
+
+  // Increase the position by one when we finish reading a BMP character
+  // (U+0000,...,U+FFFF); UTF-16 needs one character to represent a BMP
+  // character. The three patterns are a one-byte (0xxxxxxx), a two-byte
+  // (110xxxxx 10xxxxxx) and a three-byte (1110xxxx 10xxxxxx 10xxxxxx) sequence.
+  if ((buf_ & 0x80) == 0 ||
+      (buf_ & 0xe0c0) == 0xc080 ||
+      (buf_ & 0xf0c0c0) == 0xe08080) {
+    ++position_;
+    buf_ = 0xffffffff;
+    return true;
+  }
+  // UTF-16 needs two characters to represent a Unicode character between
+  // U+10000 and U+10FFFF, which UTF-8 encodes as a four-byte sequence
+  // (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx).
+  if ((buf_ & 0xf8c0c0c0) == 0xf0808080) {
+    position_ += 2;
+    buf_ = 0xffffffff;
+    return true;
+  }
+  // No complete character yet: either more bytes of the current character are
+  // still to come, or the input is not a sequence this converter recognizes
+  // (e.g. an encoding of a value above U+10FFFF, which UTF-16 cannot
+  // represent). The position is left unchanged.
+  return false;
+}
+
+// A class that encapsulates a hyphenation query. This class owns resources
tony 2012/03/05 18:59:00 Nit: THis -> This
Hironori Bono 2012/06/28 09:32:49 Done.
+// temporarily needed for hyphenating one word, and deletes them when it is
+// deleted, as shown in the following snippet.
+//
+// void GetHyphenationPoints(HyphenDict* dict,
+//                           const string16& word,
+//                           std::vector<int>* hyphens) {
+//   Query query(word);
+//   query.Hyphenate(dict, hyphens);
+// }
+//
+class Query {
+ public:
+ explicit Query(const string16& word);
+ ~Query();
+
+ // Hyphenates a word with the specified dictionary. This function hyphenates
+ // the word provided to its constructor and writes a list of hyphenation
+ // points, positions where we can insert hyphens, to |hyphens|. Returns true
+ // if at least one hyphenation point was found; returns false if the hyphen
+ // library reported an error or no hyphenation point was found. The following
+ // snippet shows how to insert hyphens with the hyphenation points written by
+ // this function.
+ //
+ // std::vector<int> hyphens;
+ // Query query(word);
+ // query.Hyphenate(dict, &hyphens);
tony 2012/03/05 18:59:00 This snippet doesn't seem to match the class.
Hironori Bono 2012/06/28 09:32:49 Done. Thanks for noticing it. I have updated this
+ // for (std::vector<int>::const_reverse_iterator it = hyphens.rbegin();
+ //      it != hyphens.rend(); ++it) {
+ //   word.insert(*it, 1, '-');
+ // }
+ //
+ bool Hyphenate(HyphenDict* dictionary, std::vector<int>* hyphens);
+
+ private:
+ // A word to be hyphenated, encoded in UTF-8. (The constructor converts the
+ // UTF-16 input word to UTF-8.)
+ std::string word_;
tony 2012/03/05 18:59:00 Nit: Maybe name this word_utf8_ to make the encodi
Hironori Bono 2012/06/28 09:32:49 Done. Thanks for you suggestion.
+
+ // Return variables from the hyphen library. |hyphens_| is a buffer allocated
+ // by the constructor; |rep_|, |pos_| and |cut_| start as NULL, are handed to
+ // the library by Hyphenate(), and are released with free() by the destructor.
+ scoped_array<char> hyphens_;
+ char** rep_;
+ int* pos_;
+ int* cut_;
+};
+
+// Prepares a query for |word|: strips its trailing punctuation, converts the
+// remainder to UTF-8 and allocates the output buffer handed to the hyphen
+// library. |word| must not be empty.
+Query::Query(const string16& word)
+    : rep_(NULL),
+      pos_(NULL),
+      cut_(NULL) {
+  // Remove trailing punctuation characters. WebKit does not remove these
+  // characters when it hyphenates a word. These characters prevent the hyphen
+  // library from applying some rules, i.e. they prevent the library from adding
+  // hyphens.
+  DCHECK(!word.empty());
+  static const char16 kPunctuationChars[] = {
+    '!', '"', '\'', ',', '.', '(', ')', ':', ';', '?', '\0',
+  };
+  // Look for the last character that is NOT punctuation so that only the
+  // trailing run is dropped. (Searching for the last punctuation character
+  // instead would cut a word such as "don't" in the middle and would strip only
+  // one character from "end.)".) A word made solely of punctuation characters
+  // is kept as is so that |word_| never becomes empty.
+  const size_t last_kept = word.find_last_not_of(kPunctuationChars);
+  const size_t length =
+      (last_kept != string16::npos) ? last_kept + 1 : word.length();
+  UTF16ToUTF8(word.c_str(), length, &word_);
+  // The extra five bytes are required by the hyphen library for its output
+  // buffer (see the review reply below, which points at "hyphen.h").
+  hyphens_.reset(new char[word_.length() + 5]);
tony 2012/03/05 18:59:00 Why 5? Is that the max number of hyphens that will
Hironori Bono 2012/06/28 09:32:49 This is a magic number written in "hyphen.h" <http
+}
+
+// Releases the arrays handed to the hyphen library by Hyphenate(). They are
+// released with free(); |hyphens_| is a scoped_array and cleans up by itself.
+Query::~Query() {
+  // free() ignores NULL pointers, so these two need no guard.
+  free(cut_);
+  free(pos_);
+
+  if (!rep_)
+    return;
+
+  // |rep_| is treated as having one (possibly NULL) entry per byte of |word_|.
+  const size_t entry_count = word_.length();
+  for (size_t index = 0; index < entry_count; ++index)
+    free(rep_[index]);
+  free(rep_);
+}
+
+// Runs the hyphen library on |word_| and writes the hyphenation points found,
+// counted in UTF-16 characters, to |hyphens|. Returns false when the library
+// reports an error (|hyphens| is then left untouched) or when no hyphenation
+// point was found.
+bool Query::Hyphenate(HyphenDict* dictionary, std::vector<int>* hyphens) {
+  DCHECK(dictionary);
+  DCHECK(hyphens);
+
+  const int word_size = static_cast<int>(word_.length());
+  const int error_code = hnj_hyphen_hyphenate2(
tony 2012/03/05 18:59:00 Nit: result -> error_code?
Hironori Bono 2012/06/28 09:32:49 Done.
+      dictionary, word_.data(), word_size, hyphens_.get(), NULL,
+      &rep_, &pos_, &cut_);
tony 2012/03/05 18:59:00 It looks like we can just pass in NULL since we do
Hironori Bono 2012/06/28 09:32:49 This function assumes these pointers are non-NULL
+  if (error_code != 0)
+    return false;
+
+  // WebKit needs hyphenation points counted in UTF-16 characters. On the other
+  // hand, the hyphen library returns hyphenation points counted in UTF-8
+  // characters. We incrementally convert hyphenation points in UTF-8
+  // characters to hyphenation points in UTF-16 characters and write the
+  // converted hyphenation points to the output vector.
tony 2012/03/05 18:59:00 It would be a nice TODO to switch the hyphen libra
+  UTF16Position utf16_counter;
+  hyphens->clear();
+  const size_t byte_count = word_.length();
+  for (size_t index = 0; index != byte_count; ++index) {
+    utf16_counter.Append(word_[index]);
+    // An entry with its lowest bit set marks a hyphenation point.
+    const bool can_break_here = (hyphens_[index] & 1) != 0;
tony 2012/03/05 18:59:00 hyphens and hyphens_ make this code hard to follow
Hironori Bono 2012/06/28 09:32:49 Done, I have renamed them. (I could not figure out
+    if (can_break_here)
+      hyphens->push_back(utf16_counter.position());
+  }
+  return !hyphens->empty();
+}
+
+} // namespace
+
+// Remembers the rule file; the dictionary itself is not loaded until
+// Initialize() is called.
+Hyphenator::Hyphenator(base::PlatformFile file)
+    : dictionary_(NULL), rule_file_(file), result_(0) {}
+
+// Frees the dictionary if one has been loaded.
+Hyphenator::~Hyphenator() {
+  if (!dictionary_)
+    return;
+  hnj_hyphen_free(dictionary_);
+}
+
+// Maps the rule file into memory and loads the hyphenation dictionary from it.
+// Does nothing if a dictionary is already loaded. Returns true when a
+// dictionary is available afterwards.
+bool Hyphenator::Initialize() {
+  if (!dictionary_) {
+    rule_map_.reset(new file_util::MemoryMappedFile);
+    const bool mapped = rule_map_->Initialize(rule_file_);
+    if (!mapped)
+      return false;
+
+    dictionary_ = hnj_hyphen_load(rule_map_->data(), rule_map_->length());
+  }
+  return dictionary_ != NULL;
+}
+
+// Returns the last hyphenation point of |word| that lies before
+// |before_index|, or 0 if there is none (no dictionary loaded, empty word,
+// hyphenation failed, or no hyphenation point is small enough).
+size_t Hyphenator::ComputeLastHyphenLocation(const string16& word,
+                                             size_t before_index) {
+  if (word.empty() || !dictionary_)
+    return 0;
+
+  // Call the hyphen library to get all hyphenation points, i.e. positions where
+  // we can insert hyphens. When WebKit finds a line-break, it calls this
+  // function twice or more with the same word to find the best hyphenation
+  // point. To avoid calling the hyphen library twice or more with the same
+  // word, we cache the last query.
+  const bool is_cached = (word_ == word);
+  if (!is_cached) {
+    word_ = word;
+    Query query(word);
+    result_ = query.Hyphenate(dictionary_, &hyphens_);
+  }
+  if (!result_)
+    return 0;
+
+  // Walk the cached points from the back and stop at the first one that lies
+  // before |before_index|.
+  std::vector<int>::reverse_iterator candidate = hyphens_.rbegin();
+  while (candidate != hyphens_.rend() &&
+         static_cast<size_t>(*candidate) >= before_index) {
+    ++candidate;
+  }
+  if (candidate == hyphens_.rend())
+    return 0;
+  return *candidate;
+}
Property changes on: chrome\renderer\hyphenator\hyphenator.cc
___________________________________________________________________
Added: svn:eol-style
+ LF

Powered by Google App Engine
This is Rietveld 408576698