OLD | NEW |
(Empty) | |
| 1 /* ***** BEGIN LICENSE BLOCK ***** |
| 2 * |
| 3 * Copyright (C) 2015 The Android Open Source Project |
| 4 * |
| 5 * Licensed under the Apache License, Version 2.0 (the "License"); |
| 6 * you may not use this file except in compliance with the License. |
| 7 * You may obtain a copy of the License at |
| 8 * |
| 9 * http://www.apache.org/licenses/LICENSE-2.0 |
| 10 * |
| 11 * Unless required by applicable law or agreed to in writing, software |
| 12 * distributed under the License is distributed on an "AS IS" BASIS, |
| 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 * See the License for the specific language governing permissions and |
| 15 * limitations under the License. |
| 16 * |
| 17 * ***** END LICENSE BLOCK ***** */ |
| 18 |
| 19 #include <vector> |
| 20 #include <memory> |
| 21 #include <algorithm> |
| 22 #include <string> |
| 23 #include <unicode/uchar.h> |
| 24 |
| 25 // HACK: for reading pattern file |
| 26 #include <fcntl.h> |
| 27 |
| 28 #include "platform/text/hyphenation/HyphenatorAOSP.h" |
| 29 |
| 30 using std::vector; |
| 31 |
| 32 namespace android { |
| 33 |
// U+00AD SOFT HYPHEN; compared against the UTF-16 code units of a word when
// falling back to soft-hyphen based hyphenation.
static constexpr uint16_t CHAR_SOFT_HYPHEN = 0x00AD;
| 35 |
| 36 // The following are structs that correspond to tables inside the hyb file forma
t |
| 37 |
// Alphabet table, version 0: one byte per code unit, indexed by
// (code unit - min_codepoint). The field order mirrors the binary pattern
// data, so it must not be changed.
struct AlphabetTable0 {
    // Table format version; alphabetLookup() uses this layout when it reads 0.
    uint32_t version;
    // Lowest code unit covered by `data` (inclusive).
    uint32_t min_codepoint;
    // Upper bound of the covered range; alphabetLookup() rejects c >= max_codepoint.
    uint32_t max_codepoint;
    // Alphabet code for each covered code unit; alphabetLookup() treats a value
    // of 0 as "not in the alphabet".
    uint8_t data[1];  // actually flexible array, size is known at runtime
};
| 44 |
// Alphabet table, version 1: `n_entries` packed 32-bit entries. Each entry
// stores a code point in its upper 21 bits and an alphabet code in its lower
// 11 bits. alphabetLookup() binary-searches `data`, so the entries are expected
// to be in ascending order. The field order mirrors the binary pattern data.
struct AlphabetTable1 {
    uint32_t version;
    uint32_t n_entries;
    uint32_t data[1];  // actually flexible array, size is known at runtime

    // Bit layout of a packed entry.
    enum : uint32_t {
        kValueBits = 11,     // width of the alphabet-code field
        kValueMask = 0x7ff,  // mask selecting the alphabet-code field
    };

    // Returns the code point stored in the upper bits of `entry`.
    static uint32_t codepoint(uint32_t entry) {
        return entry >> kValueBits;
    }

    // Returns the alphabet code stored in the lower 11 bits of `entry`.
    static uint32_t value(uint32_t entry) {
        return entry & kValueMask;
    }
};
| 53 |
// Trie table used by hyphenateFromCodes() to match patterns against the
// alphabet codes of a word. The field order mirrors the binary pattern data,
// so it must not be changed.
struct Trie {
    uint32_t version;
    // Mask selecting the character code of an entry; a transition is taken only
    // when (entry & char_mask) equals the code being matched.
    uint32_t char_mask;
    // Right shift applied after masking with link_mask to get the next node index.
    uint32_t link_shift;
    // Mask selecting the link (next node) field of an entry.
    uint32_t link_mask;
    // Right shift applied to data[node] to get the pattern index; 0 means the
    // node carries no pattern.
    uint32_t pattern_shift;
    // Not read by the lookup code in this file; presumably the number of
    // entries in `data` -- TODO confirm against the file format.
    uint32_t n_entries;
    // Packed trie entries, indexed by (node + character code).
    uint32_t data[1];  // actually flexible array, size is known at runtime
};
| 63 |
// Pattern table. Each element of `data` is a packed 32-bit entry:
//   bits 26..31  pattern length
//   bits 20..25  shift (number of trailing zeros, per hyphenateFromCodes())
//   bits  0..19  byte offset of the pattern inside the buffer pool
// The buffer pool starts `pattern_offset` bytes after the start of this struct.
// The field order mirrors the binary pattern data.
struct Pattern {
    uint32_t version;
    uint32_t n_entries;
    uint32_t pattern_offset;
    uint32_t pattern_size;
    uint32_t data[1];  // actually flexible array, size is known at runtime

    // Bit layout of a packed entry.
    enum : uint32_t {
        kLenShift = 26,
        kShiftShift = 20,
        kShiftMask = 0x3f,
        kPoolOffsetMask = 0xfffff,
    };

    // Returns the pattern length encoded in `entry`.
    static uint32_t len(uint32_t entry) {
        return entry >> kLenShift;
    }

    // Returns the shift value encoded in `entry`.
    static uint32_t shift(uint32_t entry) {
        return (entry >> kShiftShift) & kShiftMask;
    }

    // Returns a pointer to the first byte of the pattern that `entry` refers to.
    const uint8_t* buf(uint32_t entry) const {
        const uint8_t* const self = reinterpret_cast<const uint8_t*>(this);
        const uint32_t poolOffset = entry & kPoolOffsetMask;
        return self + pattern_offset + poolOffset;
    }
};
| 78 |
| 79 struct Header { |
| 80 uint32_t magic; |
| 81 uint32_t version; |
| 82 uint32_t alphabet_offset; |
| 83 uint32_t trie_offset; |
| 84 uint32_t pattern_offset; |
| 85 uint32_t file_size; |
| 86 |
| 87 // accessors |
| 88 const uint8_t* bytes() const { return reinterpret_cast<const uint8_t*>(this)
; } |
| 89 uint32_t alphabetVersion() const { |
| 90 return *reinterpret_cast<const uint32_t*>(bytes() + alphabet_offset); |
| 91 } |
| 92 const AlphabetTable0* alphabetTable0() const { |
| 93 return reinterpret_cast<const AlphabetTable0*>(bytes() + alphabet_offset
); |
| 94 } |
| 95 const AlphabetTable1* alphabetTable1() const { |
| 96 return reinterpret_cast<const AlphabetTable1*>(bytes() + alphabet_offset
); |
| 97 } |
| 98 const Trie* trieTable() const { |
| 99 return reinterpret_cast<const Trie*>(bytes() + trie_offset); |
| 100 } |
| 101 const Pattern* patternTable() const { |
| 102 return reinterpret_cast<const Pattern*>(bytes() + pattern_offset); |
| 103 } |
| 104 }; |
| 105 |
| 106 Hyphenator* Hyphenator::loadBinary(const uint8_t* patternData) { |
| 107 Hyphenator* result = new Hyphenator; |
| 108 result->patternData = patternData; |
| 109 return result; |
| 110 } |
| 111 |
| 112 void Hyphenator::hyphenate(vector<uint8_t>* result, const uint16_t* word, size_t
len) { |
| 113 result->clear(); |
| 114 result->resize(len); |
| 115 const size_t paddedLen = len + 2; // start and stop code each count for 1 |
| 116 if (patternData != nullptr && |
| 117 (int)len >= MIN_PREFIX + MIN_SUFFIX && paddedLen <= MAX_HYPHENATED_S
IZE) { |
| 118 uint16_t alpha_codes[MAX_HYPHENATED_SIZE]; |
| 119 if (alphabetLookup(alpha_codes, word, len)) { |
| 120 hyphenateFromCodes(result->data(), alpha_codes, paddedLen); |
| 121 return; |
| 122 } |
| 123 // TODO: try NFC normalization |
| 124 // TODO: handle non-BMP Unicode (requires remapping of offsets) |
| 125 } |
| 126 hyphenateSoft(result->data(), word, len); |
| 127 } |
| 128 |
| 129 // If any soft hyphen is present in the word, use soft hyphens to decide hyphena
tion, |
| 130 // as recommended in UAX #14 (Use of Soft Hyphen) |
| 131 void Hyphenator::hyphenateSoft(uint8_t* result, const uint16_t* word, size_t len
) { |
| 132 result[0] = 0; |
| 133 for (size_t i = 1; i < len; i++) { |
| 134 result[i] = word[i - 1] == CHAR_SOFT_HYPHEN; |
| 135 } |
| 136 } |
| 137 |
| 138 bool Hyphenator::alphabetLookup(uint16_t* alpha_codes, const uint16_t* word, siz
e_t len) { |
| 139 const Header* header = getHeader(); |
| 140 // TODO: check header magic |
| 141 uint32_t alphabetVersion = header->alphabetVersion(); |
| 142 if (alphabetVersion == 0) { |
| 143 const AlphabetTable0* alphabet = header->alphabetTable0(); |
| 144 uint32_t min_codepoint = alphabet->min_codepoint; |
| 145 uint32_t max_codepoint = alphabet->max_codepoint; |
| 146 alpha_codes[0] = 0; // word start |
| 147 for (size_t i = 0; i < len; i++) { |
| 148 uint16_t c = word[i]; |
| 149 if (c < min_codepoint || c >= max_codepoint) { |
| 150 return false; |
| 151 } |
| 152 uint8_t code = alphabet->data[c - min_codepoint]; |
| 153 if (code == 0) { |
| 154 return false; |
| 155 } |
| 156 alpha_codes[i + 1] = code; |
| 157 } |
| 158 alpha_codes[len + 1] = 0; // word termination |
| 159 return true; |
| 160 } else if (alphabetVersion == 1) { |
| 161 const AlphabetTable1* alphabet = header->alphabetTable1(); |
| 162 size_t n_entries = alphabet->n_entries; |
| 163 const uint32_t* begin = alphabet->data; |
| 164 const uint32_t* end = begin + n_entries; |
| 165 alpha_codes[0] = 0; |
| 166 for (size_t i = 0; i < len; i++) { |
| 167 uint16_t c = word[i]; |
| 168 auto p = std::lower_bound(begin, end, c << 11); |
| 169 if (p == end) { |
| 170 return false; |
| 171 } |
| 172 uint32_t entry = *p; |
| 173 if (AlphabetTable1::codepoint(entry) != c) { |
| 174 return false; |
| 175 } |
| 176 alpha_codes[i + 1] = AlphabetTable1::value(entry); |
| 177 } |
| 178 alpha_codes[len + 1] = 0; |
| 179 return true; |
| 180 } |
| 181 return false; |
| 182 } |
| 183 |
| 184 /** |
| 185 * Internal implementation, after conversion to codes. All case folding and norm
alization |
| 186 * has been done by now, and all characters have been found in the alphabet. |
| 187 * Note: len here is the padded length including 0 codes at start and end. |
| 188 **/ |
| 189 void Hyphenator::hyphenateFromCodes(uint8_t* result, const uint16_t* codes, size
_t len) { |
| 190 const Header* header = getHeader(); |
| 191 const Trie* trie = header->trieTable(); |
| 192 const Pattern* pattern = header->patternTable(); |
| 193 uint32_t char_mask = trie->char_mask; |
| 194 uint32_t link_shift = trie->link_shift; |
| 195 uint32_t link_mask = trie->link_mask; |
| 196 uint32_t pattern_shift = trie->pattern_shift; |
| 197 size_t maxOffset = len - MIN_SUFFIX - 1; |
| 198 for (size_t i = 0; i < len - 1; i++) { |
| 199 uint32_t node = 0; // index into Trie table |
| 200 for (size_t j = i; j < len; j++) { |
| 201 uint16_t c = codes[j]; |
| 202 uint32_t entry = trie->data[node + c]; |
| 203 if ((entry & char_mask) == c) { |
| 204 node = (entry & link_mask) >> link_shift; |
| 205 } else { |
| 206 break; |
| 207 } |
| 208 uint32_t pat_ix = trie->data[node] >> pattern_shift; |
| 209 // pat_ix contains a 3-tuple of length, shift (number of trailing ze
ros), and an offset |
| 210 // into the buf pool. This is the pattern for the substring (i..j) w
e just matched, |
| 211 // which we combine (via point-wise max) into the result vector. |
| 212 if (pat_ix != 0) { |
| 213 uint32_t pat_entry = pattern->data[pat_ix]; |
| 214 int pat_len = Pattern::len(pat_entry); |
| 215 int pat_shift = Pattern::shift(pat_entry); |
| 216 const uint8_t* pat_buf = pattern->buf(pat_entry); |
| 217 int offset = j + 1 - (pat_len + pat_shift); |
| 218 // offset is the index within result that lines up with the star
t of pat_buf |
| 219 int start = std::max(MIN_PREFIX - offset, 0); |
| 220 int end = std::min(pat_len, (int)maxOffset - offset); |
| 221 for (int k = start; k < end; k++) { |
| 222 result[offset + k] = std::max(result[offset + k], pat_buf[k]
); |
| 223 } |
| 224 } |
| 225 } |
| 226 } |
| 227 // Since the above calculation does not modify values outside |
| 228 // [MIN_PREFIX, len - MIN_SUFFIX], they are left as 0. |
| 229 for (size_t i = MIN_PREFIX; i < maxOffset; i++) { |
| 230 result[i] &= 1; |
| 231 } |
| 232 } |
| 233 |
| 234 } // namespace android |
OLD | NEW |