OLD | NEW |
---|---|
(Empty) | |
1 // Copyright 2015 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include "config.h" | |
6 #include "ScriptRunIterator.h" | |
7 | |
8 #include "platform/Logging.h" | |
9 #include "wtf/Threading.h" | |
10 | |
11 #include <ubidi_props.h> | |
12 | |
13 namespace blink { | |
14 | |
15 typedef ScriptData::PairedBracketType PairedBracketType; | |
16 | |
17 ScriptData::~ScriptData() | |
18 { | |
19 } | |
20 | |
21 const int ScriptData::kMaxScriptCount; | |
22 | |
23 void ICUScriptData::getScripts(UChar32 ch, Vector<UScriptCode>& dst) const | |
24 { | |
25 | |
26 UErrorCode status = U_ZERO_ERROR; | |
27 // Leave room to insert primary script. It's not strictly necessary but | |
28 // it ensures that the result won't ever be greater than kMaxScriptCount, | |
29 // which some client someday might expect. | |
30 dst.resize(kMaxScriptCount - 1); | |
31 // Note, ICU convention is to return the number of available items | |
32 // regardless of the capacity passed to the call. So count can be greater | |
33 // than dst->size(), if a later version of the unicode data has more | |
34 // than kMaxScriptCount items. | |
35 int count = uscript_getScriptExtensions( | |
36 ch, &dst[0], dst.size(), &status); | |
37 if (status == U_BUFFER_OVERFLOW_ERROR) { | |
38 // Allow this, we'll just use what we have. | |
39 WTF_LOG_ERROR("Exceeded maximum script count of %d for 0x%x", kMaxScript Count, ch); | |
40 count = dst.size(); | |
41 status = U_ZERO_ERROR; | |
42 } | |
43 UScriptCode primaryScript = uscript_getScript(ch, &status); | |
44 | |
45 if (U_FAILURE(status)) { | |
46 WTF_LOG_ERROR("Could not get icu script data: %d for 0x%x", status, ch); | |
47 dst.clear(); | |
48 return; | |
49 } | |
50 | |
51 dst.resize(count); | |
52 | |
53 if (primaryScript == dst.at(0)) { | |
54 // Only one script (might be common or inherited -- these are never in | |
55 // the extensions unless they're the only script), or extensions are in | |
56 // priority order already. | |
57 return; | |
58 } | |
59 | |
60 if (primaryScript > USCRIPT_INHERITED) { | |
61 // Not common or primary, with extensions that are not in order. We know | |
62 // the primary, so we insert it at the front and swap the previous front | |
63 // to somewhere else in the list. | |
64 auto it = std::find(dst.begin() + 1, dst.end(), primaryScript); | |
65 if (it == dst.end()) { | |
66 dst.append(primaryScript); | |
67 } | |
68 std::swap(*dst.begin(), *it); | |
69 return; | |
70 } | |
71 | |
72 if (primaryScript == USCRIPT_COMMON) { | |
73 if (count == 1) { | |
74 // Common with a preferred script. Keep common at head. | |
75 dst.prepend(primaryScript); | |
76 return; | |
77 } | |
78 | |
79 // Ignore common. Find the preferred script of the multiple scripts that | |
80 // remain, and ensure it is at the head. Just keep swapping them in, | |
81 // there aren't likely to be many. | |
82 for (size_t i = 1; i < dst.size(); ++i) { | |
83 if (dst.at(0) == USCRIPT_LATIN || dst.at(i) < dst.at(0)) { | |
84 std::swap(dst.at(0), dst.at(i)); | |
85 } | |
86 } | |
87 return; | |
88 } | |
89 | |
90 // The primary is inherited, and there are other scripts. Put inherited at | |
91 // the front, the true primary next, and then the others in random order. | |
92 dst.append(dst.at(0)); | |
93 dst.at(0) = primaryScript; | |
94 for (size_t i = 2; i < dst.size(); ++i) { | |
95 if (dst.at(1) == USCRIPT_LATIN || dst.at(i) < dst.at(1)) { | |
96 std::swap(dst.at(1), dst.at(i)); | |
97 } | |
98 } | |
99 } | |
100 | |
101 UChar32 ICUScriptData::getPairedBracket(UChar32 ch) const | |
102 { | |
103 return u_getBidiPairedBracket(ch); | |
104 } | |
105 | |
106 PairedBracketType ICUScriptData::getPairedBracketType(UChar32 ch) const | |
107 { | |
108 return static_cast<PairedBracketType>( | |
109 u_getIntPropertyValue(ch, UCHAR_BIDI_PAIRED_BRACKET_TYPE)); | |
110 } | |
111 | |
112 const ICUScriptData* ICUScriptData::instance() | |
113 { | |
114 AtomicallyInitializedStaticReference(const ICUScriptData, icuScriptDataInsta nce, (new ICUScriptData())); | |
115 return &icuScriptDataInstance; | |
116 } | |
117 | |
118 ScriptRunIterator::ScriptRunIterator(const UChar* text, size_t length, const Scr iptData* data) | |
119 : m_text(text) | |
120 , m_length(length) | |
121 , m_bracketsFixupDepth(0) | |
122 , | |
123 // The initial value of m_aheadCharacter is not used, use ffff to be distinc tive. | |
eae
2015/08/28 21:10:24
Why FFFF instead of 0?
| |
124 m_aheadCharacter(0xffff) | |
125 , m_aheadPos(0) | |
126 , m_commonPreferred(USCRIPT_COMMON) | |
127 , m_scriptData(data) | |
128 { | |
129 ASSERT(text); | |
130 ASSERT(data); | |
131 | |
132 if (m_aheadPos < m_length) { | |
133 m_currentSet.clear(); | |
134 m_currentSet.append(USCRIPT_COMMON); | |
eae
2015/08/28 21:10:24
Why would we append commonn to the set here?
| |
135 U16_NEXT(m_text, m_aheadPos, m_length, m_aheadCharacter); | |
136 m_scriptData->getScripts(m_aheadCharacter, m_aheadSet); | |
137 } | |
138 } | |
139 | |
140 ScriptRunIterator::ScriptRunIterator(const UChar* text, size_t length) | |
141 : ScriptRunIterator(text, length, ICUScriptData::instance()) | |
142 { | |
143 } | |
144 | |
145 bool ScriptRunIterator::consume(unsigned& limit, UScriptCode& script) | |
146 { | |
147 if (m_currentSet.isEmpty()) { | |
148 return false; | |
149 } | |
150 | |
151 size_t pos; | |
152 UChar32 ch; | |
153 while (fetch(&pos, &ch)) { | |
154 PairedBracketType paired_type = m_scriptData->getPairedBracketType(ch); | |
155 switch (paired_type) { | |
156 case PairedBracketType::OPEN: | |
157 openBracket(ch); | |
158 break; | |
159 case PairedBracketType::CLOSE: | |
160 closeBracket(ch); | |
161 break; | |
162 default: | |
163 break; | |
164 } | |
165 if (!mergeSets()) { | |
166 limit = pos; | |
167 script = resolveCurrentScript(); | |
168 fixupStack(script); | |
169 m_currentSet = m_nextSet; | |
170 return true; | |
171 } | |
172 } | |
173 | |
174 limit = m_length; | |
175 script = resolveCurrentScript(); | |
176 m_currentSet.clear(); | |
177 return true; | |
178 } | |
179 | |
180 void ScriptRunIterator::openBracket(UChar32 ch) | |
181 { | |
182 if (m_brackets.size() == kMaxBrackets) { | |
183 m_brackets.removeFirst(); | |
184 if (m_bracketsFixupDepth == kMaxBrackets) { | |
185 --m_bracketsFixupDepth; | |
186 } | |
187 } | |
188 m_brackets.append(BracketRec({ ch, USCRIPT_COMMON })); | |
189 ++m_bracketsFixupDepth; | |
190 } | |
191 | |
192 void ScriptRunIterator::closeBracket(UChar32 ch) | |
193 { | |
194 if (m_brackets.size() > 0) { | |
195 UChar32 target = m_scriptData->getPairedBracket(ch); | |
196 for (auto it = m_brackets.rbegin(); it != m_brackets.rend(); ++it) { | |
197 if (it->ch == target) { | |
198 // Have a match, use open paren's resolved script. | |
199 UScriptCode script = it->script; | |
200 m_nextSet.clear(); | |
201 m_nextSet.append(script); | |
202 | |
203 // And pop stack to this point. | |
204 int num_popped = std::distance(m_brackets.rbegin(), it); | |
205 // TODO: No resize operation in WTF::Deque? | |
206 for (int i = 0; i < num_popped; ++i) | |
207 m_brackets.removeLast(); | |
208 m_bracketsFixupDepth = std::max(0ul, m_bracketsFixupDepth - num_ popped); | |
209 return; | |
210 } | |
211 } | |
212 } | |
213 // leave stack alone, no match | |
214 } | |
215 | |
216 // Keep items in current_set that are in next_set. | |
217 // | |
218 // If the sets are disjoint, return false and leave current_set unchanged. Else | |
219 // return true and make current set the intersection. Make sure to maintain | |
220 // current priority script as priority if it remains, else retain next priority | |
221 // script if it remains. | |
222 // | |
223 // Also maintain a common preferred script. If current and next are both | |
224 // common, and there is no common preferred script and next has a preferred | |
225 // script, set the common preferred script to that of next. | |
226 bool ScriptRunIterator::mergeSets() | |
227 { | |
228 if (m_nextSet.isEmpty() || m_currentSet.isEmpty()) { | |
229 return false; | |
230 } | |
231 | |
232 auto cur_it = m_currentSet.begin(); | |
eae
2015/08/28 21:10:24
curIt, or better yet, currentScript
| |
233 auto cur_end = m_currentSet.end(); | |
234 // Most of the time, this is the only one. | |
235 // Advance the current iterator, we won't need to check it again later. | |
236 UScriptCode priority_script = *cur_it++; | |
237 | |
238 // If next is common or inherited, the only thing that might change | |
239 // is the common preferred script. | |
240 if (m_nextSet.at(0) <= USCRIPT_INHERITED) { | |
241 if (m_nextSet.size() == 2 && priority_script <= USCRIPT_INHERITED && m_c ommonPreferred == USCRIPT_COMMON) { | |
242 m_commonPreferred = m_nextSet.at(1); | |
243 } | |
244 return true; | |
245 } | |
246 | |
247 // If current is common or inherited, use the next script set. | |
248 if (priority_script <= USCRIPT_INHERITED) { | |
249 m_currentSet = m_nextSet; | |
250 return true; | |
251 } | |
252 | |
253 // Neither is common or inherited. If current is a singleton, | |
254 // just see if it exists in the next set. This is the common case. | |
255 auto next_it = m_nextSet.begin(); | |
256 auto next_end = m_nextSet.end(); | |
257 if (cur_it == cur_end) { | |
258 return std::find(next_it, next_end, priority_script) != next_end; | |
259 } | |
260 | |
261 // Establish the priority script, if we have one. | |
262 // First try current priority script. | |
263 bool have_priority = std::find(next_it, next_end, priority_script) | |
264 != next_end; | |
265 if (!have_priority) { | |
266 // So try next priority script. | |
267 // Skip the first current script, we already know it's not there. | |
268 // Advance the next iterator, later we won't need to check it again. | |
269 priority_script = *next_it++; | |
270 have_priority = std::find(cur_it, cur_end, priority_script) != cur_end; | |
271 } | |
272 | |
273 // Note that we can never write more scripts into the current vector than | |
274 // it already contains, so cur_write_it won't ever exceed the size/capacity. | |
275 auto cur_write_it = m_currentSet.begin(); | |
276 if (have_priority) { | |
277 // keep the priority script. | |
278 *cur_write_it++ = priority_script; | |
279 } | |
280 | |
281 if (next_it != next_end) { | |
282 // Iterate over the remaining current scripts, and keep them if | |
283 // they occur in the remaining next scripts. | |
284 while (cur_it != cur_end) { | |
285 UScriptCode sc = *cur_it++; | |
286 if (std::find(next_it, next_end, sc) != next_end) { | |
287 *cur_write_it++ = sc; | |
288 } | |
289 } | |
290 } | |
291 | |
292 // Only change current if the run continues. | |
293 int written = std::distance(m_currentSet.begin(), cur_write_it); | |
294 if (written > 0) { | |
295 m_currentSet.resize(written); | |
296 return true; | |
297 } | |
298 return false; | |
299 } | |
300 | |
301 // When we hit the end of the run, and resolve the script, we now know the | |
302 // resolved script of any open bracket that was pushed on the stack since | |
303 // the start of the run. Fixup depth records how many of these there | |
304 // were. We've maintained this count during pushes, and taken care to | |
305 // adjust it if the stack got overfull and open brackets were pushed off | |
306 // the bottom. This sets the script of the fixup_depth topmost entries of the | |
307 // stack to the resolved script. | |
308 void ScriptRunIterator::fixupStack(UScriptCode resolved_script) | |
309 { | |
310 if (m_bracketsFixupDepth > 0) { | |
311 if (m_bracketsFixupDepth > m_brackets.size()) { | |
312 // Should never happen unless someone breaks the code. | |
313 WTF_LOG_ERROR("Brackets fixup depth exceeds size of bracket vector." ); | |
314 m_bracketsFixupDepth = m_brackets.size(); | |
315 } | |
316 auto it = m_brackets.rbegin(); | |
317 for (size_t i = 0; i < m_bracketsFixupDepth; ++i) { | |
318 it->script = resolved_script; | |
319 ++it; | |
320 } | |
321 m_bracketsFixupDepth = 0; | |
322 } | |
323 } | |
324 | |
325 bool ScriptRunIterator::fetch(size_t* pos, UChar32* ch) | |
326 { | |
327 if (m_aheadPos > m_length) { | |
328 return false; | |
329 } | |
330 *pos = m_aheadPos - (m_aheadCharacter >= 0x10000 ? 2 : 1); | |
331 *ch = m_aheadCharacter; | |
332 | |
333 m_nextSet.swap(m_aheadSet); | |
334 if (m_aheadPos == m_length) { | |
335 // No more data to fetch, but last character still needs to be | |
336 // processed. Advance m_aheadPos so that next time we will know | |
337 // this has been done. | |
338 m_aheadPos++; | |
339 return true; | |
340 } | |
341 | |
342 U16_NEXT(m_text, m_aheadPos, m_length, m_aheadCharacter); | |
343 m_scriptData->getScripts(m_aheadCharacter, m_aheadSet); | |
344 if (m_aheadSet.isEmpty()) { | |
345 // No scripts for this character. This has already been logged, so | |
346 // we just terminate processing this text. | |
347 return false; | |
348 } | |
349 if (m_aheadSet[0] == USCRIPT_INHERITED && m_aheadSet.size() > 1) { | |
350 if (m_nextSet[0] == USCRIPT_COMMON) { | |
351 // Overwrite the next set with the non-inherited portion of the set. | |
352 m_nextSet = m_aheadSet; | |
353 m_nextSet.remove(0); | |
354 // Discard the remaining values, we'll inherit. | |
355 m_aheadSet.resize(1); | |
356 } | |
357 else { | |
358 // Else, this applies to anything. | |
359 m_aheadSet.resize(1); | |
360 } | |
361 } | |
362 return true; | |
363 } | |
364 | |
365 UScriptCode ScriptRunIterator::resolveCurrentScript() const | |
366 { | |
367 UScriptCode result = m_currentSet.at(0); | |
368 return result == USCRIPT_COMMON ? m_commonPreferred : result; | |
369 } | |
370 | |
371 } // namespace blink | |
OLD | NEW |