chrome/renderer/hyphenator/hyphenator.cc - Issue 9545017: Adds a hy-phen-ator.

Side by Side Diff: chrome/renderer/hyphenator/hyphenator.cc

Issue 9545017: Adds a hy-phen-ator. (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/

Patch Set: Created 8 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4

	5 #include "chrome/renderer/hyphenator/hyphenator.h"

	6

	7 #include <string>

	8

	9 #include "base/file_util.h"

	10 #include "base/logging.h"

	11 #include "base/memory/scoped_ptr.h"

	12 #include "base/string_util.h"

	13 #include "base/utf_string_conversions.h"

	14 #include "third_party/hyphen/hyphen.h"

	15

	16 namespace {

	17

	18 // A class that converts a sequence of UTF-8 characters to UTF-16 ones and counts

	19 // the length of converted UTF-16 characters. This class is used for creating a

	20 // mapping from the position of a UTF-8 string to a position of a UTF-16 string

	21 // converted from the UTF-8 string. Even though the following snippet produces

	22 // the same mapping, it needs to convert same characters many times. This class

	23 // incrementally counts the number of converted UTF-16 characters to avoid

	24 // unnecessary conversions.

	25 //

	26 // scoped_array<size_t> position(new size_t[text.length()]);

	27 // for (size_t i = 0; i < text.length(); ++i)

	28 // position[i] = UTF8ToUTF16(text.substr(0, i)).length();

	29 //

	30 class UTF16Position {

	31 public:

	32 UTF16Position();

	33 ~UTF16Position();

	34

	35 // Returns the current position.

	36 int position() const { return position_; }

	37

	38 // Appends one UTF-8 character to this converter and updates the converted

	39 // position. This converter increases the position by one when it finishes

	40 // reading a BMP character and increases by two when it finishes reading a

	41 // non-BMP character.

	42 bool Append(char c);

	43

	44 private:

	45 // The number of converted UTF-16 characters.

	46 int position_;

	47

	48 // A buffer that stores UTF-8 characters being converted. If this buffer does

	49 // not have any characters being converted, this value becomes 0xffffffff.

	50 // (This is an invalid UTF-8 character.)

	51 uint32 buf_;

	52 };

	53

	54 UTF16Position::UTF16Position()

	55 : position_(0),

	56 buf_(0xffffffff) {

	57 }

	58

	59 UTF16Position::~UTF16Position() {

	60 }

	61

	62 bool UTF16Position::Append(char c) {
	tony 2012/03/05 18:59:00 Can we use U8_FWD_1 in unicode/utf8.h for this instead? I guess we would need to use U16_IS_SINGLE from utf16.h to determine if we add 1 or 2 to position. Hironori Bono 2012/06/28 09:32:49 Thanks for your advice. Yes, it is better to use U8_ and U16_ macros. I have re-implemented this function to use them.
	63 // Rotate our character buffer and append the input character.

	64 buf_ = (buf_ << 8) | c;

	65

	66 // Increase the position by one when we finish reading a BMP character

	67 // (U+0000,...,U+FFFF), UTF-16 needs one character to represent a BMP

	68 // character.

	69 if ((buf_ & 0x80) == 0 ||

	70 (buf_ & 0xe0c0) == 0xc080 ||

	71 (buf_ & 0xf0c0c0) == 0xe08080) {

	72 ++position_;

	73 buf_ = 0xffffffff;

	74 return true;

	75 }

	76 // UTF-16 needs two characters to represent a Unicode character between U+10000

	77 // and U+10FFFF.

	78 if ((buf_ & 0xf8c0c0c0) == 0xf0808080) {

	79 position_ += 2;

	80 buf_ = 0xffffffff;

	81 return true;

	82 }

	83 // UTF-16 cannot represent a Unicode character above U+10FFFF. We should abort

	84 // this conversion.

	85 return false;

	86 }

	87

	88 // A class that encapsulates a hyphenation query. THis class owns resources
	tony 2012/03/05 18:59:00 Nit: THis -> This Hironori Bono 2012/06/28 09:32:49 On 2012/03/05 18:59:00, tony wrote: > Nit: THis -> This Done.
	89 // temporarily needed for hyphenating one word, and deletes them when it is

	90 // deleted as listed in the following snippet.

	91 //

	92 // void GetHyphenationPoints(HyphenDict* dict,

	93 // const string16& word,

	94 // std::vector<int>* hyphens) {

	95 // scoped_ptr<Query> query(word);

	96 // query.Hyphenate(dict, hyphens);

	97 // }

	98 //

	99 class Query {

	100 public:

	101 explicit Query(const string16& word);

	102 ~Query();

	103

	104 // Hyphenates a word with the specified dictionary. This function hyphenates

	105 // the word provided to its constructor and returns a list of hyphenation

	106 // points, positions where we can insert hyphens. The following snippet shows

	107 // how to insert hyphens with hyphenation points returned by this function.

	108 //

	109 // std::vector<int> hyphens;

	110 // GetHyphenationPoints(dict, word, &hyphens);
	tony 2012/03/05 18:59:00 This snippet doesn't seem to match the class. Hironori Bono 2012/06/28 09:32:49 On 2012/03/05 18:59:00, tony wrote: > This snippet doesn't seem to match the class. Done. Thanks for noticing it. I have updated this snippet.
	111 // for (std::vector<int>::const_reverse_iterator it = hyphens.rbegin();

	112 // it != hyphens.rend(); ++it) {

	113 // word.insert(*it, 1, '-');

	114 // }

	115 //

	116 bool Hyphenate(HyphenDict* dictionary, std::vector<int>* hyphens);

	117

	118 private:

	119 // A word to be hyphenated.

	120 std::string word_;
	tony 2012/03/05 18:59:00 Nit: Maybe name this word_utf8_ to make the encoding more clear? Hironori Bono 2012/06/28 09:32:49 On 2012/03/05 18:59:00, tony wrote: > Nit: Maybe name this word_utf8_ to make the encoding more clear? Done. Thanks for your suggestion.
	121

	122 // Return variables from the hyphen library.

	123 scoped_array<char> hyphens_;

	124 char** rep_;

	125 int* pos_;

	126 int* cut_;

	127 };

	128

	129 Query::Query(const string16& word)

	130 : rep_(NULL),

	131 pos_(NULL),

	132 cut_(NULL) {

	133 // Remove trailing punctuation characters. WebKit does not remove these

	134 // characters when it hyphenates a word. These characters prevent the hyphen

	135 // library from applying some rules, i.e. they prevent the library from adding

	136 // hyphens.

	137 DCHECK(!word.empty());

	138 static const char16 kPunctuationChars[] = {

	139 '!', '"', '\'', ',', '.', '(', ')', ':', ';', '?', '\0',

	140 };

	141 size_t pos = word.find_last_of(kPunctuationChars);

	142 size_t length = (pos != string16::npos) ? pos : word.length();

	143 UTF16ToUTF8(word.c_str(), length, &word_);

	144 hyphens_.reset(new char[word_.length() + 5]);
	tony 2012/03/05 18:59:00 Why 5? Is that the max number of hyphens that will be added? Can we make this number a constant? Hironori Bono 2012/06/28 09:32:49 On 2012/03/05 18:59:00, tony wrote: > Why 5? Is that the max number of hyphens that will be added? Can we make this > number a constant? This is a magic number written in "hyphen.h" <http://code.google.com/searchframe#OAMlx_jo-ck/src/third_party/hyphen/hyphen....>. I used this number as written in this comment since this header does not have macros for it. I have added a comment to describe it.
	145 }

	146

	147 Query::~Query() {

	148 if (rep_) {

	149 for (size_t i = 0; i < word_.length(); ++i) {

	150 if (rep_[i])

	151 free(rep_[i]);

	152 }

	153 free(rep_);

	154 }

	155 if (pos_)

	156 free(pos_);

	157 if (cut_)

	158 free(cut_);

	159 }

	160

	161 bool Query::Hyphenate(HyphenDict* dictionary, std::vector<int>* hyphens) {

	162 DCHECK(dictionary);

	163 DCHECK(hyphens);

	164

	165 int result = hnj_hyphen_hyphenate2(dictionary,
	tony 2012/03/05 18:59:00 Nit: result -> error_code? Hironori Bono 2012/06/28 09:32:49 On 2012/03/05 18:59:00, tony wrote: > Nit: result -> error_code? Done.
	166 word_.data(),

	167 static_cast<int>(word_.length()),

	168 hyphens_.get(),

	169 NULL,

	170 &rep_,

	171 &pos_,

	172 &cut_);
	tony 2012/03/05 18:59:00 It looks like we can just pass in NULL since we don't use rep_ pos_ or cut_. Then we don't have to worry about deleting the memory. Hironori Bono 2012/06/28 09:32:49 On 2012/03/05 18:59:00, tony wrote: > It looks like we can just pass in NULL since we don't use rep_ pos_ or cut_. > Then we don't have to worry about deleting the memory. This function assumes these pointers are non-NULL and it crashes when we use NULL for them.
	173 if (result)

	174 return false;

	175

	176 // WebKit needs hyphenation points counted in UTF-16 characters. On the other

	177 // hand, the hyphen library returns hyphenation points counted in UTF-8

	178 // characters. We incrementally convert hyphenation points in UTF-8

	179 // characters to hyphenation points in UTF-16 characters and write the

	180 // converted hyphenation points to the output vector.
	tony 2012/03/05 18:59:00 It would be a nice TODO to switch the hyphen library to use utf-16.
	181 UTF16Position converter;

	182 hyphens->clear();

	183 for (size_t i = 0; i < word_.length(); ++i) {

	184 converter.Append(word_[i]);

	185 if (hyphens_[i] & 1)
	tony 2012/03/05 18:59:00 hyphens and hyphens_ make this code hard to follow. Maybe hyphen_offsets and hyphen_vector? Hironori Bono 2012/06/28 09:32:49 On 2012/03/05 18:59:00, tony wrote: > hyphens and hyphens_ make this code hard to follow. Maybe hyphen_offsets and > hyphen_vector? Done, I have renamed them. (I could not figure out which was what when I updated this change.)
	186 hyphens->push_back(converter.position());

	187 }

	188 return !hyphens->empty();

	189 }

	190

	191 } // namespace

	192

	193 Hyphenator::Hyphenator(base::PlatformFile file)

	194 : dictionary_(NULL),

	195 rule_file_(file),

	196 result_(0) {

	197 }

	198

	199 Hyphenator::~Hyphenator() {

	200 if (dictionary_)

	201 hnj_hyphen_free(dictionary_);

	202 }

	203

	204 bool Hyphenator::Initialize() {

	205 if (dictionary_)

	206 return true;

	207

	208 rule_map_.reset(new file_util::MemoryMappedFile);

	209 if (!rule_map_->Initialize(rule_file_))

	210 return false;

	211

	212 dictionary_ = hnj_hyphen_load(rule_map_->data(), rule_map_->length());

	213 return !!dictionary_;

	214 }

	215

	216 size_t Hyphenator::ComputeLastHyphenLocation(const string16& word,

	217 size_t before_index) {

	218 if (!dictionary_ || word.empty())

	219 return 0;

	220

	221 // Call the hyphen library to get all hyphenation points, i.e. positions where

	222 // we can insert hyphens. When WebKit finds a line-break, it calls this

	223 // function twice or more with the same word to find the best hyphenation

	224 // point. To avoid calling the hyphen library twice or more with the same

	225 // word, we cache the last query.

	226 if (word_ != word) {

	227 word_ = word;

	228 Query query(word);

	229 result_ = query.Hyphenate(dictionary_, &hyphens_);

	230 }

	231 if (!result_)

	232 return 0;

	233 for (std::vector<int>::reverse_iterator it = hyphens_.rbegin();

	234 it != hyphens_.rend(); ++it) {

	235 if (static_cast<size_t>(*it) < before_index)

	236 return *it;

	237 }

	238 return 0;

	239 }

OLD	NEW