content/renderer/hyphenator/hyphenator.cc - Issue 9545017: Adds a hy-phen-ator.

Side by Side Diff: content/renderer/hyphenator/hyphenator.cc

Issue 9545017: Adds a hy-phen-ator. (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/

Patch Set: Created 8 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4

	5 #include "content/renderer/hyphenator/hyphenator.h"

	6

	7 #include <string>
	jam (John Abd-El-Malek) 2012/07/11 00:16:35: nit: not needed because of all the string includes below.
	Hironori Bono 2012/07/13 08:11:35: Done. Thanks for catching it.
	8

	9 #include "base/file_util.h"

	10 #include "base/logging.h"

	11 #include "base/memory/scoped_ptr.h"

	12 #include "base/string_util.h"

	13 #include "base/utf_string_conversions.h"

	14 #include "third_party/hyphen/hyphen.h"

	15 #include "unicode/uscript.h"

	16

	17 namespace {

	18

	19 // A class that converts a sequence of UTF-8 characters to UTF-16 ones and holds

	20 // only the length of converted UTF-16 characters. This class is used for

	21 // creating a mapping from the position of a UTF-8 string to a position of a

	22 // UTF-16 string without unnecessary conversions. Even though the following

	23 // snippet produces the same mapping, it needs to convert same characters many

	24 // times. This class incrementally counts the number of converted UTF-16

	25 // characters to avoid this problem.

	26 //

	27 // scoped_array<size_t> position(new size_t[text.length()]);

	28 // for (size_t i = 0; i < text.length(); ++i)

	29 // position[i] = UTF8ToUTF16(text.substr(0, i)).length();

	30 //

	31 class UTF16TextLength {

	32 public:

	33 UTF16TextLength();

	34 ~UTF16TextLength();

	35

	36 // Returns the current position.

	37 int utf16_length() const { return utf16_length_; }

	38

	39 // Appends one UTF-8 character to this converter and advances the converted

	40 // position. This converter increases the position by one when it finishes

	41 // reading a BMP character and increases by two when it finish reading a

	42 // non-BMP character.

	43 void Append(char c);

	44

	45 private:

	46 // The length of the converted UTF-16 text.

	47 int utf16_length_;

	48

	49 // The buffer that stores UTF-8 characters being converted.

	50 std::string utf8_text_;

	51

	52 DISALLOW_COPY_AND_ASSIGN(UTF16TextLength);

	53 };

	54

	55 UTF16TextLength::UTF16TextLength()

	56 : utf16_length_(0) {

	57 }

	58

	59 UTF16TextLength::~UTF16TextLength() {

	60 }

	61

	62 void UTF16TextLength::Append(char c) {

	63 // Append the given character and try converting the UTF-8 characters in this

	64 // buffer to Unicode codepoints. If this buffer includes a Unicode codepoint,

	65 // get the number of UTF-16 characters representing this codepoint and advance

	66 // the position.

	67 int code = 0;

	68 int index = 0;

	69 utf8_text_.push_back(c);

	70 U8_NEXT(utf8_text_.data(), index, static_cast<int>(utf8_text_.length()),

	71 code);

	72 if (code != U_SENTINEL) {

	73 utf8_text_.clear();

	74 utf16_length_ += U16_LENGTH(code);

	75 }

	76 }

	77

	78 // A class that encapsulates a hyphenation query. This class owns resources

	79 // temporarily needed for hyphenating one word, and deletes them when it is

	80 // deleted as listed in the following snippet.

	81 //

	82 // std::vector<int> hyphens;

	83 // QUery query(UTF8ToUTF16("hyphenate"));

	84 // query.Hyphenate(dict, &hyphens);

	85 //

	86 class Query {

	87 public:

	88 explicit Query(const string16& word);

	89 ~Query();

	90

	91 // Hyphenates a word with the specified dictionary. This function hyphenates

	92 // the word provided to its constructor and returns a list of hyphenation

	93 // points, positions where we can insert hyphens.

	94 bool Hyphenate(HyphenDict* dictionary, std::vector<int>* hyphen_offsets);

	95

	96 private:

	97 // A word to be hyphenated.

	98 std::string word_utf8_;

	99

	100 // Return variables from the hyphen library.

	101 scoped_array<char> hyphen_vector_;

	102 char** rep_;

	103 int* pos_;

	104 int* cut_;

	105

	106 DISALLOW_COPY_AND_ASSIGN(Query);

	107 };

	108

	109 Query::Query(const string16& word)

	110 : rep_(NULL),

	111 pos_(NULL),

	112 cut_(NULL) {

	113 // Remove trailing punctuation characters. WebKit does not remove these

	114 // characters when it hyphenates a word. These characters prevent the hyphen

	115 // library from applying some rules, i.e. they prevent the library from adding

	116 // hyphens.

	117 DCHECK(!word.empty());

	118 const char16* data = word.data();

	119 int length = static_cast<int>(word.length());

	120 while (length > 0) {

	121 int previous = length;

	122 int code = 0;

	123 U16_PREV(data, 0, previous, code);

	124 UErrorCode error = U_ZERO_ERROR;

	125 if (uscript_getScript(code, &error) != USCRIPT_COMMON)

	126 break;

	127 length = previous;

	128 }

	129 UTF16ToUTF8(word.c_str(), length, &word_utf8_);

	130 // Create a hyphen vector used by hnj_hyphen_hyphenate2(). We allocate a

	131 // buffer of \|word_.length()\| + 5 as written in Line 112 of

	132 // <http://cs.chromium.org/src/third_party/hyphen/hyphen.h>.

	133 hyphen_vector_.reset(new char[word_utf8_.length() + 5]);

	134 }

	135

	136 Query::~Query() {

	137 if (rep_) {

	138 for (size_t i = 0; i < word_utf8_.length(); ++i) {

	139 if (rep_[i])

	140 free(rep_[i]);

	141 }

	142 free(rep_);

	143 }

	144 if (pos_)

	145 free(pos_);

	146 if (cut_)

	147 free(cut_);

	148 }

	149

	150 bool Query::Hyphenate(HyphenDict* dictionary,

	151 std::vector<int>* hyphen_offsets) {

	152 DCHECK(dictionary);

	153 DCHECK(hyphen_offsets);

	154

	155 int error_code = hnj_hyphen_hyphenate2(dictionary,

	156 word_utf8_.data(),

	157 static_cast<int>(word_utf8_.length()),

	158 hyphen_vector_.get(),

	159 NULL,

	160 &rep_,

	161 &pos_,

	162 &cut_);

	163 if (error_code)

	164 return false;

	165

	166 // WebKit needs hyphenation points counted in UTF-16 characters. On the other

	167 // hand, the hyphen library returns hyphenation points counted in UTF-8

	168 // characters. We increamentally convert hyphenation points in UTF-8

	169 // characters to hyphenation points in UTF-16 characters and write the

	170 // converted hyphenation points to the output vector.

	171 UTF16TextLength text_length;

	172 hyphen_offsets->clear();

	173 for (size_t i = 0; i < word_utf8_.length(); ++i) {

	174 text_length.Append(word_utf8_[i]);

	175 if (hyphen_vector_[i] & 1)

	176 hyphen_offsets->push_back(text_length.utf16_length());

	177 }

	178 return !hyphen_offsets->empty();

	179 }

	180

	181 } // namespace

	182

	183 namespace content {

	184

	185 Hyphenator::Hyphenator(base::PlatformFile file)

	186 : dictionary_(NULL),

	187 rule_file_(file),

	188 result_(0) {

	189 }

	190

	191 Hyphenator::~Hyphenator() {

	192 if (dictionary_)

	193 hnj_hyphen_free(dictionary_);

	194 }

	195

	196 bool Hyphenator::Initialize() {

	197 if (dictionary_)

	198 return true;

	199

	200 rule_map_.reset(new file_util::MemoryMappedFile);

	201 if (!rule_map_->Initialize(rule_file_))
	tony 2012/07/10 17:30:00 Oh, I see, rule_file_ is a PlatformFile which has Oh, I see, rule_file_ is a PlatformFile which has already been opened. That works.
	202 return false;

	203

	204 dictionary_ = hnj_hyphen_load(rule_map_->data(), rule_map_->length());

	205 return !!dictionary_;

	206 }

	207

	208 size_t Hyphenator::ComputeLastHyphenLocation(const string16& word,

	209 size_t before_index) {

	210 if (!dictionary_ \|\| word.empty())

	211 return 0;

	212

	213 // Call the hyphen library to get all hyphenation points, i.e. positions where

	214 // we can insert hyphens. When WebKit finds a line-break, it calls this

	215 // function twice or more with the same word to find the best hyphenation

	216 // point. To avoid calling the hyphen library twice or more with the same

	217 // word, we cache the last query.

	218 if (word_ != word) {

	219 word_ = word;

	220 Query query(word);

	221 result_ = query.Hyphenate(dictionary_, &hyphen_offsets_);

	222 }

	223 if (!result_)

	224 return 0;

	225 for (std::vector<int>::reverse_iterator it = hyphen_offsets_.rbegin();

	226 it != hyphen_offsets_.rend(); ++it) {

	227 if (static_cast<size_t>(*it) < before_index)

	228 return *it;

	229 }

	230 return 0;

	231 }

	232

	233 } // namespace content

OLD	NEW

« no previous file with comments | « content/renderer/hyphenator/hyphenator.h ('k') | content/renderer/hyphenator/hyphenator_unittest.cc » ('j') | no next file with comments »