Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(100)

Side by Side Diff: third_party/WebKit/Source/platform/text/hyphenation/HyphenatorAOSP.cpp

Issue 2149803004: Add Minikin Hyphenation engine for Android (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Rename alias to fallback Created 4 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « third_party/WebKit/Source/platform/text/hyphenation/HyphenatorAOSP.h ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 /* ***** BEGIN LICENSE BLOCK *****
2 *
3 * Copyright (C) 2015 The Android Open Source Project
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 * ***** END LICENSE BLOCK ***** */
18
19 #include <vector>
20 #include <memory>
21 #include <algorithm>
22 #include <string>
23 #include <unicode/uchar.h>
24
25 // HACK: for reading pattern file
26 #include <fcntl.h>
27
28 #include "platform/text/hyphenation/HyphenatorAOSP.h"
29
30 using std::vector;
31
32 namespace android {
33
// U+00AD SOFT HYPHEN; hyphenateSoft() allows a break right after this code unit.
static constexpr uint16_t CHAR_SOFT_HYPHEN = 0x00AD;
35
36 // The following are structs that correspond to tables inside the hyb file format
37
// Alphabet table, format version 0, overlaid directly on the pattern data.
// Hyphenator::alphabetLookup() indexes data[] with (code unit - min_codepoint)
// and treats a stored 0 as "character is not part of the alphabet".
struct AlphabetTable0 {
    uint32_t version;
    uint32_t min_codepoint;  // lowest code point that has a slot in data[]
    uint32_t max_codepoint;  // exclusive: lookup rejects c >= max_codepoint
    uint8_t data[1];  // actually flexible array, size is known at runtime
};
44
// Alphabet table, format version 1, overlaid directly on the pattern data.
// Hyphenator::alphabetLookup() binary-searches data[], so the entries have to
// be stored in ascending order.
struct AlphabetTable1 {
    uint32_t version;
    uint32_t n_entries;  // number of elements in data[]
    uint32_t data[1];  // actually flexible array, size is known at runtime

    // An entry packs the code point into the bits above bit 10 and the
    // alphabet code into the low 11 bits.
    static constexpr uint32_t codepoint(uint32_t entry) { return entry >> 11; }
    static constexpr uint32_t value(uint32_t entry) { return entry & 0x7ff; }
};
53
// Trie table overlaid directly on the pattern data. The masks and shifts
// describe how Hyphenator::hyphenateFromCodes() unpacks one data[] entry.
struct Trie {
    uint32_t version;
    uint32_t char_mask;      // (entry & char_mask) is compared with the character code
    uint32_t link_shift;     // ((entry & link_mask) >> link_shift) is the next node index
    uint32_t link_mask;
    uint32_t pattern_shift;  // (entry >> pattern_shift) is an index into Pattern::data
    uint32_t n_entries;
    uint32_t data[1];  // actually flexible array, size is known at runtime
};
63
// Pattern table overlaid directly on the pattern data. Each data[] entry is a
// packed (length, shift, byte offset) triple; the bytes themselves live in a
// pool that starts pattern_offset bytes after the beginning of this table.
struct Pattern {
    uint32_t version;
    uint32_t n_entries;
    uint32_t pattern_offset;  // byte offset of the pool, relative to this struct
    uint32_t pattern_size;
    uint32_t data[1];  // actually flexible array, size is known at runtime

    // accessors
    // Top 6 bits: number of bytes stored for the pattern.
    static constexpr uint32_t len(uint32_t entry) { return entry >> 26; }
    // Next 6 bits: the shift (number of trailing zeros left out of the pool).
    static constexpr uint32_t shift(uint32_t entry) { return (entry >> 20) & 0x3f; }
    // Low 20 bits: offset of the pattern's first byte inside the pool.
    const uint8_t* buf(uint32_t entry) const {
        const uint8_t* base = reinterpret_cast<const uint8_t*>(this);
        return base + pattern_offset + (entry & 0xfffff);
    }
};
78
79 struct Header {
80 uint32_t magic;
81 uint32_t version;
82 uint32_t alphabet_offset;
83 uint32_t trie_offset;
84 uint32_t pattern_offset;
85 uint32_t file_size;
86
87 // accessors
88 const uint8_t* bytes() const { return reinterpret_cast<const uint8_t*>(this) ; }
89 uint32_t alphabetVersion() const {
90 return *reinterpret_cast<const uint32_t*>(bytes() + alphabet_offset);
91 }
92 const AlphabetTable0* alphabetTable0() const {
93 return reinterpret_cast<const AlphabetTable0*>(bytes() + alphabet_offset );
94 }
95 const AlphabetTable1* alphabetTable1() const {
96 return reinterpret_cast<const AlphabetTable1*>(bytes() + alphabet_offset );
97 }
98 const Trie* trieTable() const {
99 return reinterpret_cast<const Trie*>(bytes() + trie_offset);
100 }
101 const Pattern* patternTable() const {
102 return reinterpret_cast<const Pattern*>(bytes() + pattern_offset);
103 }
104 };
105
106 Hyphenator* Hyphenator::loadBinary(const uint8_t* patternData) {
107 Hyphenator* result = new Hyphenator;
108 result->patternData = patternData;
109 return result;
110 }
111
112 void Hyphenator::hyphenate(vector<uint8_t>* result, const uint16_t* word, size_t len) {
113 result->clear();
114 result->resize(len);
115 const size_t paddedLen = len + 2; // start and stop code each count for 1
116 if (patternData != nullptr &&
117 (int)len >= MIN_PREFIX + MIN_SUFFIX && paddedLen <= MAX_HYPHENATED_S IZE) {
118 uint16_t alpha_codes[MAX_HYPHENATED_SIZE];
119 if (alphabetLookup(alpha_codes, word, len)) {
120 hyphenateFromCodes(result->data(), alpha_codes, paddedLen);
121 return;
122 }
123 // TODO: try NFC normalization
124 // TODO: handle non-BMP Unicode (requires remapping of offsets)
125 }
126 hyphenateSoft(result->data(), word, len);
127 }
128
129 // If any soft hyphen is present in the word, use soft hyphens to decide hyphena tion,
130 // as recommended in UAX #14 (Use of Soft Hyphen)
131 void Hyphenator::hyphenateSoft(uint8_t* result, const uint16_t* word, size_t len ) {
132 result[0] = 0;
133 for (size_t i = 1; i < len; i++) {
134 result[i] = word[i - 1] == CHAR_SOFT_HYPHEN;
135 }
136 }
137
138 bool Hyphenator::alphabetLookup(uint16_t* alpha_codes, const uint16_t* word, siz e_t len) {
139 const Header* header = getHeader();
140 // TODO: check header magic
141 uint32_t alphabetVersion = header->alphabetVersion();
142 if (alphabetVersion == 0) {
143 const AlphabetTable0* alphabet = header->alphabetTable0();
144 uint32_t min_codepoint = alphabet->min_codepoint;
145 uint32_t max_codepoint = alphabet->max_codepoint;
146 alpha_codes[0] = 0; // word start
147 for (size_t i = 0; i < len; i++) {
148 uint16_t c = word[i];
149 if (c < min_codepoint || c >= max_codepoint) {
150 return false;
151 }
152 uint8_t code = alphabet->data[c - min_codepoint];
153 if (code == 0) {
154 return false;
155 }
156 alpha_codes[i + 1] = code;
157 }
158 alpha_codes[len + 1] = 0; // word termination
159 return true;
160 } else if (alphabetVersion == 1) {
161 const AlphabetTable1* alphabet = header->alphabetTable1();
162 size_t n_entries = alphabet->n_entries;
163 const uint32_t* begin = alphabet->data;
164 const uint32_t* end = begin + n_entries;
165 alpha_codes[0] = 0;
166 for (size_t i = 0; i < len; i++) {
167 uint16_t c = word[i];
168 auto p = std::lower_bound(begin, end, c << 11);
169 if (p == end) {
170 return false;
171 }
172 uint32_t entry = *p;
173 if (AlphabetTable1::codepoint(entry) != c) {
174 return false;
175 }
176 alpha_codes[i + 1] = AlphabetTable1::value(entry);
177 }
178 alpha_codes[len + 1] = 0;
179 return true;
180 }
181 return false;
182 }
183
184 /**
185 * Internal implementation, after conversion to codes. All case folding and norm alization
186 * has been done by now, and all characters have been found in the alphabet.
187 * Note: len here is the padded length including 0 codes at start and end.
188 **/
189 void Hyphenator::hyphenateFromCodes(uint8_t* result, const uint16_t* codes, size _t len) {
190 const Header* header = getHeader();
191 const Trie* trie = header->trieTable();
192 const Pattern* pattern = header->patternTable();
193 uint32_t char_mask = trie->char_mask;
194 uint32_t link_shift = trie->link_shift;
195 uint32_t link_mask = trie->link_mask;
196 uint32_t pattern_shift = trie->pattern_shift;
197 size_t maxOffset = len - MIN_SUFFIX - 1;
198 for (size_t i = 0; i < len - 1; i++) {
199 uint32_t node = 0; // index into Trie table
200 for (size_t j = i; j < len; j++) {
201 uint16_t c = codes[j];
202 uint32_t entry = trie->data[node + c];
203 if ((entry & char_mask) == c) {
204 node = (entry & link_mask) >> link_shift;
205 } else {
206 break;
207 }
208 uint32_t pat_ix = trie->data[node] >> pattern_shift;
209 // pat_ix contains a 3-tuple of length, shift (number of trailing ze ros), and an offset
210 // into the buf pool. This is the pattern for the substring (i..j) w e just matched,
211 // which we combine (via point-wise max) into the result vector.
212 if (pat_ix != 0) {
213 uint32_t pat_entry = pattern->data[pat_ix];
214 int pat_len = Pattern::len(pat_entry);
215 int pat_shift = Pattern::shift(pat_entry);
216 const uint8_t* pat_buf = pattern->buf(pat_entry);
217 int offset = j + 1 - (pat_len + pat_shift);
218 // offset is the index within result that lines up with the star t of pat_buf
219 int start = std::max(MIN_PREFIX - offset, 0);
220 int end = std::min(pat_len, (int)maxOffset - offset);
221 for (int k = start; k < end; k++) {
222 result[offset + k] = std::max(result[offset + k], pat_buf[k] );
223 }
224 }
225 }
226 }
227 // Since the above calculation does not modify values outside
228 // [MIN_PREFIX, len - MIN_SUFFIX], they are left as 0.
229 for (size_t i = MIN_PREFIX; i < maxOffset; i++) {
230 result[i] &= 1;
231 }
232 }
233
234 } // namespace android
OLDNEW
« no previous file with comments | « third_party/WebKit/Source/platform/text/hyphenation/HyphenatorAOSP.h ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698