Chromium Code Reviews

Side by Side Diff: cc/resources/texture_compress/arm/atc_dxt_neon.cc

Issue 793693003: Tile Compression (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 6 years ago
1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // See the links below for detailed descriptions of the algorithms used.
6 // http://cbloomrants.blogspot.se/2008/12/12-08-08-dxtc-summary.html
7 // http://fgiesen.wordpress.com/2009/12/15/dxt5-alpha-block-index-determination
8
9 #if defined(__ARM_NEON__)
10
11 #include "cc/resources/texture_compress/arm/atc_dxt_neon.h"
12
13 #include <arm_neon.h>
14
15 #include "base/compiler_specific.h"
16 #include "base/logging.h"
17 #include "cc/resources/texture_compress/atc_dxt.h"
18
19 namespace cc {
20 namespace texture_compress {
21
22 struct TYPE_ATC_NEON : public TYPE_ATC {
23 typedef TYPE_ATC BASE_TYPE;
24 static const uint8x8_t kRemap;
25 static const uint64_t kProds[3];
26 };
27
28 struct TYPE_DXT_NEON : public TYPE_DXT {
29 typedef TYPE_DXT BASE_TYPE;
30 static const uint8x8_t kRemap;
31 static const int8x8_t kW1Table;
32 static const uint64_t kProds[3];
33 };
34
35 const uint8x8_t TYPE_ATC_NEON::kRemap = {0, 1, 0, 1, 2, 2, 3, 3};
36 const uint64_t TYPE_ATC_NEON::kProds[3] = {0x00010409, 0x09040100, 0x00020200};
37
38 const uint8x8_t TYPE_DXT_NEON::kRemap = {0, 2, 0, 2, 3, 3, 1, 1};
39 const int8x8_t TYPE_DXT_NEON::kW1Table = {3, 0, 2, 1, 0, 0, 0, 0};
40 const uint64_t TYPE_DXT_NEON::kProds[3] = {0x01040009, 0x04010900, 0x02020000};
41
42 // Number of passes made over the block to refine the base colors.
43 // Only applies to the high quality compression mode.
44 const int kNumRefinements = 2;
45
46 namespace {
47
48 template <typename T>
49 ALWAYS_INLINE int8x16_t DoW1TableLookup(uint8x16_t indices);
50
51 template <>
52 ALWAYS_INLINE int8x16_t DoW1TableLookup<TYPE_ATC_NEON>(uint8x16_t indices) {
53 // Take a shortcut for ATC which gives the same result as the table lookup.
54 // {0, 1, 2, 3} -> {3, 2, 1, 0}
55 return veorq_s8(vreinterpretq_s8_u8(indices), vdupq_n_s8(3));
56 }
57
58 template <>
59 ALWAYS_INLINE int8x16_t DoW1TableLookup<TYPE_DXT_NEON>(uint8x16_t indices) {
60 // Do table lookup for each color index.
61 return vcombine_s8(vtbl1_s8(TYPE_DXT_NEON::kW1Table,
62 vreinterpret_s8_u8(vget_low_u8(indices))),
63 vtbl1_s8(TYPE_DXT_NEON::kW1Table,
64 vreinterpret_s8_u8(vget_high_u8(indices))));
65 }
66
67 // Returns max and min base green colors matching the given single green color
68 // when solved via linear interpolation. Output format differs for ATC and DXT.
69 // See explicitly instantiated template functions below.
70 template <typename T>
71 ALWAYS_INLINE uint16_t MatchSingleGreenMax(int g);
72 template <typename T>
73 ALWAYS_INLINE uint16_t MatchSingleGreenMin(int g);
74
75 template <>
76 ALWAYS_INLINE uint16_t MatchSingleGreenMax<TYPE_ATC>(int g) {
77 return g_o_match56[g][0] << 1;
78 }
79
80 template <>
81 ALWAYS_INLINE uint16_t MatchSingleGreenMin<TYPE_ATC>(int g) {
82 return g_o_match56[g][1];
83 }
84
85 template <>
86 ALWAYS_INLINE uint16_t MatchSingleGreenMax<TYPE_DXT>(int g) {
87 return g_o_match66[g][0];
88 }
89
90 template <>
91 ALWAYS_INLINE uint16_t MatchSingleGreenMin<TYPE_DXT>(int g) {
92 return g_o_match66[g][1];
93 }
94
95 // This converts the output data to either ATC or DXT format.
96 // See explicitly instantiated template functions below.
97 template <typename T>
98 ALWAYS_INLINE void FormatFixup(uint16x4_t* base_colors, uint64x1_t* indices);
99
100 template <>
101 ALWAYS_INLINE void FormatFixup<TYPE_ATC_NEON>(uint16x4_t* base_colors,
102 uint64x1_t* indices) {
103 // First color in ATC format is 555.
104 *base_colors = vorr_u16(
105 vand_u16(*base_colors, vreinterpret_u16_u64(vdup_n_u64(0xffff001f))),
106 vshr_n_u16(
107 vand_u16(*base_colors, vreinterpret_u16_u64(vdup_n_u64(0x0000ffC0))),
108 1));
109 }
110
111 template <>
112 ALWAYS_INLINE void FormatFixup<TYPE_DXT_NEON>(uint16x4_t* base_colors,
113 uint64x1_t* indices) {
114 // Swap min/max colors if necessary.
115 uint16x4_t max = vdup_lane_u16(*base_colors, 0);
116 uint16x4_t min = vdup_lane_u16(*base_colors, 1);
117 uint16x4_t cmp = vclt_u16(max, min);
118 *base_colors =
119 vorr_u16(vand_u16(vbsl_u16(cmp, min, max),
120 vreinterpret_u16_u64(vdup_n_u64(0x0000ffff))),
121 vand_u16(vbsl_u16(cmp, max, min),
122 vreinterpret_u16_u64(vdup_n_u64(0xffff0000))));
123 *indices = vbsl_u64(vreinterpret_u64_u16(cmp),
124 veor_u64(*indices, vdup_n_u64(0x55555555)), *indices);
125 }
126
127 // Check if all the 8-bit elements in the given quad register are equal.
128 ALWAYS_INLINE bool ElementsEqual(uint8x16_t elements) {
129 uint8x16_t first = vdupq_lane_u8(vget_low_u8(elements), 0);
130 uint8x16_t eq = vceqq_u8(elements, first);
131 uint8x8_t tst = vand_u8(vget_low_u8(eq), vget_high_u8(eq));
132 return vget_lane_u64(vreinterpret_u64_u8(tst), 0) == 0xffffffffffffffff;
133 }
134
135 ALWAYS_INLINE bool Equal(uint8x16_t e1, uint8x16_t e2) {
136 uint8x16_t eq = vceqq_u8(e1, e2);
137 uint8x8_t tst = vand_u8(vget_low_u8(eq), vget_high_u8(eq));
138 return vget_lane_u64(vreinterpret_u64_u8(tst), 0) == 0xffffffffffffffff;
139 }
140
141 ALWAYS_INLINE bool Equal(uint16x8_t e1, uint16x8_t e2) {
142 uint16x8_t eq = vceqq_u16(e1, e2);
143 uint16x4_t tst = vand_u16(vget_low_u16(eq), vget_high_u16(eq));
144 return vget_lane_u64(vreinterpret_u64_u16(tst), 0) == 0xffffffffffffffff;
145 }
146
147 ALWAYS_INLINE bool Equal(uint16x4_t e1, uint16x4_t e2) {
148 uint16x4_t eq = vceq_u16(e1, e2);
149 return vget_lane_u64(vreinterpret_u64_u16(eq), 0) == 0xffffffffffffffff;
150 }
151
152 ALWAYS_INLINE int16x8x2_t ExpandRGBATo16(const uint8x16_t& channel) {
153 int16x8x2_t result;
154 result.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(channel)));
155 result.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(channel)));
156 return result;
157 }
158
159 ALWAYS_INLINE int32x4x4_t ExpandRGBATo32(const uint8x16_t& channel) {
160 uint16x8_t lo = vmovl_u8(vget_low_u8(channel));
161 uint16x8_t hi = vmovl_u8(vget_high_u8(channel));
162 int32x4x4_t result;
163 result.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(lo)));
164 result.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(lo)));
165 result.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(hi)));
166 result.val[3] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(hi)));
167 return result;
168 }
169
170 // NEON has no vector division instruction.
171 // Instead, the recommended approach is to refine a reciprocal estimate with
172 // Newton-Raphson steps to get a close approximation.
173 template <int REFINEMENT_STEPS>
174 ALWAYS_INLINE float32x4_t Divide(float32x4_t a, float32x4_t b) {
175 #ifdef VERIFY_RESULTS
176 ALIGNAS(8) float a_[4];
177 ALIGNAS(8) float b_[4];
178 vst1q_f32(a_, a);
179 vst1q_f32(b_, b);
180 for (int i = 0; i < 4; ++i)
181 a_[i] /= b_[i];
182 return vld1q_f32(a_);
183 #else
184 // Get an initial estimate of 1/b.
185 float32x4_t reciprocal = vrecpeq_f32(b);
186 // Use a number of Newton-Raphson steps to refine the estimate.
187 for (int i = 0; i < REFINEMENT_STEPS; ++i)
188 reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal);
189 // Calculate the final estimate.
190 return vmulq_f32(a, reciprocal);
191 #endif
192 }
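// For reference, a scalar sketch of the same Newton-Raphson reciprocal
// refinement (illustrative only; ApproximateDivide and
// RoughReciprocalEstimate are hypothetical names, not patch code):
//
//   float ApproximateDivide(float a, float b, int steps) {
//     float x = RoughReciprocalEstimate(b);  // stands in for vrecpeq_f32
//     for (int i = 0; i < steps; ++i)
//       x = x * (2.0f - b * x);              // vrecpsq_f32(b, x) == 2 - b * x
//     return a * x;                          // a / b ~= a * (1 / b)
//   }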
193
194 namespace vec_ops {
195
196 struct Max {
197 ALWAYS_INLINE int32x4_t Calc(int32x4_t a, int32x4_t b) {
198 return vmaxq_s32(a, b);
199 }
200
201 ALWAYS_INLINE uint32x4_t Calc(uint32x4_t a, uint32x4_t b) {
202 return vmaxq_u32(a, b);
203 }
204
205 ALWAYS_INLINE uint8x8_t Fold(uint8x8_t a, uint8x8_t b) {
206 return vpmax_u8(a, b);
207 }
208
209 ALWAYS_INLINE int32x2_t Fold(int32x2_t a, int32x2_t b) {
210 return vpmax_s32(a, b);
211 }
212
213 ALWAYS_INLINE uint32x2_t Fold(uint32x2_t a, uint32x2_t b) {
214 return vpmax_u32(a, b);
215 }
216 };
217
218 struct Min {
219 ALWAYS_INLINE int32x4_t Calc(int32x4_t a, int32x4_t b) {
220 return vminq_s32(a, b);
221 }
222
223 ALWAYS_INLINE uint32x4_t Calc(uint32x4_t a, uint32x4_t b) {
224 return vminq_u32(a, b);
225 }
226
227 ALWAYS_INLINE uint8x8_t Fold(uint8x8_t a, uint8x8_t b) {
228 return vpmin_u8(a, b);
229 }
230
231 ALWAYS_INLINE int32x2_t Fold(int32x2_t a, int32x2_t b) {
232 return vpmin_s32(a, b);
233 }
234
235 ALWAYS_INLINE uint32x2_t Fold(uint32x2_t a, uint32x2_t b) {
236 return vpmin_u32(a, b);
237 }
238 };
239
240 } // namespace vec_ops
241
242 template <typename Operator>
243 ALWAYS_INLINE uint8x8_t FoldRGBA(const uint8x16x4_t& src) {
244 Operator op;
245
246 // Fold each adjacent pair.
247 uint8x8_t r = op.Fold(vget_low_u8(src.val[0]), vget_high_u8(src.val[0]));
248 uint8x8_t g = op.Fold(vget_low_u8(src.val[1]), vget_high_u8(src.val[1]));
249 uint8x8_t b = op.Fold(vget_low_u8(src.val[2]), vget_high_u8(src.val[2]));
250 uint8x8_t a = op.Fold(vget_low_u8(src.val[3]), vget_high_u8(src.val[3]));
251
252 // Do both red and green channels at the same time.
253 uint8x8_t rg = op.Fold(r, g);
254
255 // Do both blue and alpha channels at the same time.
256 uint8x8_t ba = op.Fold(b, a);
257
258 // Do all the channels at the same time.
259 uint8x8_t rgba = op.Fold(rg, ba);
260
261 // Finally, one more pairwise fold finishes the reduction.
262 return op.Fold(rgba, rgba);
263 }
264
265 template <typename Operator>
266 ALWAYS_INLINE int32x2_t Fold(const int32x4x4_t& src) {
267 Operator op;
268
269 int32x4_t fold0 = op.Calc(src.val[0], src.val[1]);
270 int32x4_t fold1 = op.Calc(src.val[2], src.val[3]);
271 int32x4_t fold01 = op.Calc(fold0, fold1);
272 int32x2_t fold0123 = op.Fold(vget_low_s32(fold01), vget_high_s32(fold01));
273 return op.Fold(fold0123, vdup_n_s32(0));
274 }
275
276 template <typename Operator>
277 ALWAYS_INLINE uint32x2_t Fold(const uint32x4x4_t& src) {
278 Operator op;
279
280 uint32x4_t fold0 = op.Calc(src.val[0], src.val[1]);
281 uint32x4_t fold1 = op.Calc(src.val[2], src.val[3]);
282 uint32x4_t fold01 = op.Calc(fold0, fold1);
283 uint32x2_t fold0123 = op.Fold(vget_low_u32(fold01), vget_high_u32(fold01));
284 return op.Fold(fold0123, vdup_n_u32(0));
285 }
286
287 template <typename Operator>
288 ALWAYS_INLINE int32x4_t FoldDup(const int32x4x4_t& src) {
289 return vdupq_lane_s32(Fold<Operator>(src), 0);
290 }
291
292 ALWAYS_INLINE uint16x4_t SumRGB(const uint8x16x4_t& src) {
293 // Add up all red values for 16 pixels.
294 uint16x8_t r = vpaddlq_u8(src.val[0]);
295 uint16x4_t r2 = vpadd_u16(vget_low_u16(r), vget_high_u16(r));
296
297 // Add up all green values for 16 pixels.
298 uint16x8_t g = vpaddlq_u8(src.val[1]);
299 uint16x4_t g2 = vpadd_u16(vget_low_u16(g), vget_high_u16(g));
300
301 uint16x4_t rg = vpadd_u16(r2, g2);
302
303 // Add up all blue values for 16 pixels.
304 uint16x8_t b = vpaddlq_u8(src.val[2]);
305 uint16x4_t b2 = vpadd_u16(vget_low_u16(b), vget_high_u16(b));
306
307 uint16x4_t ba = vpadd_u16(b2, vdup_n_u16(0));
308
309 return vpadd_u16(rg, ba);
310 }
311
312 ALWAYS_INLINE int32x4_t SumRGB(const int16x8x4_t& src) {
313 // Add up all red values for 8 pixels.
314 int32x4_t r = vpaddlq_s16(src.val[0]);
315 int32x2_t r2 = vpadd_s32(vget_low_s32(r), vget_high_s32(r));
316
317 // Add up all green values for 8 pixels.
318 int32x4_t g = vpaddlq_s16(src.val[1]);
319 int32x2_t g2 = vpadd_s32(vget_low_s32(g), vget_high_s32(g));
320
321 int32x2_t rg = vpadd_s32(r2, g2);
322
323 // Add up all blue values for 8 pixels.
324 int32x4_t b = vpaddlq_s16(src.val[2]);
325 int32x2_t b2 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
326
327 int32x2_t ba = vpadd_s32(b2, vdup_n_s32(0));
328
329 return vcombine_s32(rg, ba);
330 }
331
332 ALWAYS_INLINE int32x4_t SumRGB(const int32x4x4_t& src) {
333 // Add up all red values for 8 pixels.
334 int32x2_t r = vmovn_s64(vpaddlq_s32(src.val[0]));
335
336 // Add up all green values for 8 pixels.
337 int32x2_t g = vmovn_s64(vpaddlq_s32(src.val[1]));
338
339 int32x2_t rg = vpadd_s32(r, g);
340
341 // Add up all blue values for 8 pixels.
342 int32x2_t b = vmovn_s64(vpaddlq_s32(src.val[2]));
343
344 int32x2_t ba = vpadd_s32(b, vdup_n_s32(0));
345
346 return vcombine_s32(rg, ba);
347 }
348
349 ALWAYS_INLINE int32x4_t DotProduct(int32x4_t r,
350 int32x4_t g,
351 int32x4_t b,
352 int32x4_t dir_r,
353 int32x4_t dir_g,
354 int32x4_t dir_b) {
355 // Multiply and accumulate each 32-bit element.
356 int32x4_t dots = vmulq_s32(r, dir_r);
357 dots = vmlaq_s32(dots, g, dir_g);
358 dots = vmlaq_s32(dots, b, dir_b);
359 return dots;
360 }
361
362 ALWAYS_INLINE int32x4x4_t CalculateDots(const int32x4x4_t& r,
363 const int32x4x4_t& g,
364 const int32x4x4_t& b,
365 const int32x4_t& v_vec) {
366 // Duplicate the red, green and blue components of the direction vector.
367 int32x4_t r_vec = vdupq_n_s32(vgetq_lane_s32(v_vec, 0));
368 int32x4_t g_vec = vdupq_n_s32(vgetq_lane_s32(v_vec, 1));
369 int32x4_t b_vec = vdupq_n_s32(vgetq_lane_s32(v_vec, 2));
370
371 int32x4x4_t result;
372 result.val[0] = DotProduct(r.val[0], g.val[0], b.val[0], r_vec, g_vec, b_vec);
373 result.val[1] = DotProduct(r.val[1], g.val[1], b.val[1], r_vec, g_vec, b_vec);
374 result.val[2] = DotProduct(r.val[2], g.val[2], b.val[2], r_vec, g_vec, b_vec);
375 result.val[3] = DotProduct(r.val[3], g.val[3], b.val[3], r_vec, g_vec, b_vec);
376 return result;
377 }
378
379 ALWAYS_INLINE uint16x8_t QuantizeTo565(uint8x8_t pixels) {
380 // in: [min_r min_g min_b 0 max_r max_g max_b 0]
381 // out: [min_r5 min_g6 min_b5 0][max_r5 max_g6 max_b5 0]
382
383 // Expand the components to unsigned 16-bit.
384 uint16x8_t pixels16 = vmovl_u8(pixels);
385
386 // {31, 63, 31, 0, 31, 63, 31, 0};
387 const uint16x8_t kMultiply = vreinterpretq_u16_u64(vdupq_n_u64(0x1f003f001f));
388 uint16x8_t pixel0 = vmulq_u16(pixels16, kMultiply);
389
390 // {128, 128, 128, 0, 128, 128, 128, 0};
391 const uint16x8_t kAdd = vreinterpretq_u16_u64(vdupq_n_u64(0x8000800080));
392 uint16x8_t pixel1 = vaddq_u16(pixel0, kAdd);
393
394 // Create a shifted copy.
395 uint16x8_t pixel2 = vsraq_n_u16(pixel1, pixel1, 8);
396
397 // Shift and return.
398 return vshrq_n_u16(pixel2, 8);
399 }
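// In scalar terms, the quantization above is roughly the following for a
// 5-bit channel (the 6-bit green channel uses 63 instead of 31). Illustrative
// sketch only; Quantize5 is a hypothetical helper, not patch code:
//
//   uint16_t Quantize5(uint16_t v) {  // v in [0, 255]
//     uint16_t q = v * 31 + 128;      // scale and add a rounding bias
//     return (q + (q >> 8)) >> 8;     // divide by 255 without a division
//   }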
400
401 // Combine the components of each base color into 16 bits.
402 ALWAYS_INLINE uint16x4_t PackBaseColors(uint16x8_t base_colors) {
403 // in: [max_r5 max_g6 max_b5 0][min_r5 min_g6 min_b5 0]
404 // out: [max_rgb565 min_rgb565 0 0]
405
406 // Swapping r and b channels to match Skia.
407 base_colors = vrev64q_u16(base_colors);
408 base_colors = vcombine_u16(
409 vext_u16(vget_low_u16(base_colors), vget_low_u16(base_colors), 1),
410 vext_u16(vget_high_u16(base_colors), vget_high_u16(base_colors), 1));
411
412 // Shift to pack RGB565 in 16-bit.
413 uint64x2_t r =
414 vshlq_u64(vreinterpretq_u64_u16(base_colors), vdupq_n_s64(-32));
415 uint64x2_t g =
416 vshlq_u64(vreinterpretq_u64_u16(base_colors), vdupq_n_s64(-11));
417 uint64x2_t b = vshlq_u64(vreinterpretq_u64_u16(base_colors), vdupq_n_s64(11));
418 uint64x2_t base_colors_16 = vorrq_u64(r, vorrq_u64(g, b));
419
420 // Shift to pack 16-bit base colors in 32-bit and return.
421 return vreinterpret_u16_u64(
422 vorr_u64(vshl_n_u64(vget_high_u64(base_colors_16), 16),
423 vand_u64(vget_low_u64(base_colors_16), vdup_n_u64(0xffff))));
424 }
425
426 // Combine the given color indices.
427 //
428 // Params:
429 // S Size of an index in bits.
430 // indices Indices to be combined. Each 8-bit element holds one index.
431 template <int S>
432 ALWAYS_INLINE uint64x1_t PackIndices(uint8x16_t indices) {
433 uint64x2_t ind = vshlq_n_u64(vreinterpretq_u64_u8(indices), 8 - S);
434 const uint64x2_t mask = vdupq_n_u64(0xff00000000000000);
435 uint64x2_t ind2 = vandq_u64(vshlq_n_u64(ind, 56), mask);
436 ind2 = vorrq_u64(vshrq_n_u64(ind2, S), vandq_u64(vshlq_n_u64(ind, 48), mask));
437 ind2 = vorrq_u64(vshrq_n_u64(ind2, S), vandq_u64(vshlq_n_u64(ind, 40), mask));
438 ind2 = vorrq_u64(vshrq_n_u64(ind2, S), vandq_u64(vshlq_n_u64(ind, 32), mask));
439 ind2 = vorrq_u64(vshrq_n_u64(ind2, S), vandq_u64(vshlq_n_u64(ind, 24), mask));
440 ind2 = vorrq_u64(vshrq_n_u64(ind2, S), vandq_u64(vshlq_n_u64(ind, 16), mask));
441 ind2 = vorrq_u64(vshrq_n_u64(ind2, S), vandq_u64(vshlq_n_u64(ind, 8), mask));
442 ind2 = vorrq_u64(vshrq_n_u64(ind2, S), vandq_u64(ind, mask));
443 return vshr_n_u64(
444 vorr_u64(vshr_n_u64(vget_low_u64(ind2), (8 * S)), vget_high_u64(ind2)),
445 64 - 16 * S);
446 }
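// Conceptually, PackIndices<S> performs the scalar loop below (illustrative
// only), placing pixel 0's index in the lowest S bits of the result:
//
//   // idx[] holds the 16 per-pixel indices, each already less than (1 << S).
//   uint64_t packed = 0;
//   for (int i = 0; i < 16; ++i)
//     packed |= static_cast<uint64_t>(idx[i]) << (i * S);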
447
448 ALWAYS_INLINE int32x4_t
449 CovarianceChannels(const int16x8x2_t& ch1, const int16x8x2_t& ch2) {
450 // Multiply and accumulate.
451 int32x4_t cov;
452 cov = vmull_s16(vget_low_s16(ch1.val[0]), vget_low_s16(ch2.val[0]));
453 cov = vmlal_s16(cov, vget_high_s16(ch1.val[0]), vget_high_s16(ch2.val[0]));
454 cov = vmlal_s16(cov, vget_low_s16(ch1.val[1]), vget_low_s16(ch2.val[1]));
455 cov = vmlal_s16(cov, vget_high_s16(ch1.val[1]), vget_high_s16(ch2.val[1]));
456 return cov;
457 }
458
459 ALWAYS_INLINE int32x4x2_t
460 Covariance(uint16x4_t average_rgb, const uint8x16x4_t& pixels_scattered) {
461 int16x8_t average_r = vreinterpretq_s16_u16(vdupq_lane_u16(average_rgb, 0));
462 int16x8_t average_g = vreinterpretq_s16_u16(vdupq_lane_u16(average_rgb, 1));
463 int16x8_t average_b = vreinterpretq_s16_u16(vdupq_lane_u16(average_rgb, 2));
464
465 // Subtract the average red from the red values.
466 int16x8x2_t diff_r;
467 diff_r.val[0] = vsubq_s16(
468 vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels_scattered.val[0]))),
469 average_r);
470 diff_r.val[1] = vsubq_s16(
471 vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels_scattered.val[0]))),
472 average_r);
473
474 // Subtract the average green from the green values.
475 int16x8x2_t diff_g;
476 diff_g.val[0] = vsubq_s16(
477 vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels_scattered.val[1]))),
478 average_g);
479 diff_g.val[1] = vsubq_s16(
480 vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels_scattered.val[1]))),
481 average_g);
482
483 // Subtract the average blue from the blue values.
484 int16x8x2_t diff_b;
485 diff_b.val[0] = vsubq_s16(
486 vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels_scattered.val[2]))),
487 average_b);
488 diff_b.val[1] = vsubq_s16(
489 vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels_scattered.val[2]))),
490 average_b);
491
492 int32x4x4_t cov1;
493 cov1.val[0] = CovarianceChannels(diff_r, diff_r);
494 cov1.val[1] = CovarianceChannels(diff_r, diff_g);
495 cov1.val[2] = CovarianceChannels(diff_r, diff_b);
496 cov1.val[3] = vdupq_n_s32(0);
497
498 int32x4x4_t cov2;
499 cov2.val[0] = CovarianceChannels(diff_g, diff_g);
500 cov2.val[1] = CovarianceChannels(diff_g, diff_b);
501 cov2.val[2] = CovarianceChannels(diff_b, diff_b);
502 cov2.val[3] = vdupq_n_s32(0);
503
504 int32x4x2_t covariance;
505 covariance.val[0] = SumRGB(cov1);
506 covariance.val[1] = SumRGB(cov2);
507 return covariance;
508 }
509
510 ALWAYS_INLINE uint32x2_t MaskOutPixel(const uint8x16x4_t& pixels_linear,
511 const int32x4x4_t& dots,
512 int32x4_t max_dot_vec) {
513 // Zero out all of the 16 pixels except those whose dot product matches exactly.
514 uint32x4x4_t pixels;
515 pixels.val[0] = vandq_u32(vceqq_s32(dots.val[0], max_dot_vec),
516 vreinterpretq_u32_u8(pixels_linear.val[0]));
517
518 pixels.val[1] = vandq_u32(vceqq_s32(dots.val[1], max_dot_vec),
519 vreinterpretq_u32_u8(pixels_linear.val[1]));
520
521 pixels.val[2] = vandq_u32(vceqq_s32(dots.val[2], max_dot_vec),
522 vreinterpretq_u32_u8(pixels_linear.val[2]));
523
524 pixels.val[3] = vandq_u32(vceqq_s32(dots.val[3], max_dot_vec),
525 vreinterpretq_u32_u8(pixels_linear.val[3]));
526
527 // Fold it down.
528 return Fold<vec_ops::Max>(pixels);
529 }
530
531 ALWAYS_INLINE uint16x8_t GetBaseColors(const uint8x16x4_t& pixels_linear,
532 const uint8x16x4_t& pixels_scattered,
533 int32x4_t dir) {
534 // Expand all pixels to signed 32-bit integers.
535 int32x4x4_t r = ExpandRGBATo32(pixels_scattered.val[0]);
536 int32x4x4_t g = ExpandRGBATo32(pixels_scattered.val[1]);
537 int32x4x4_t b = ExpandRGBATo32(pixels_scattered.val[2]);
538
539 int32x4x4_t dots = CalculateDots(r, g, b, dir);
540
541 // Select the pixel(s) whose dot product matches the max dot.
542 uint32x2_t max_pixel =
543 MaskOutPixel(pixels_linear, dots, FoldDup<vec_ops::Max>(dots));
544
545 // Select the pixel(s) whose dot product matches the min dot.
546 uint32x2_t min_pixel =
547 MaskOutPixel(pixels_linear, dots, FoldDup<vec_ops::Min>(dots));
548
549 return QuantizeTo565(
550 vreinterpret_u8_u32(vzip_u32(max_pixel, min_pixel).val[0]));
551 }
552
553 // Figure out the two base colors to use from a block of 16 pixels
554 // by Principal Component Analysis, mapping along the principal axis.
555 ALWAYS_INLINE uint16x8_t
556 OptimizeColorsBlock(const uint8x16x4_t& pixels_linear,
557 const uint8x16x4_t& pixels_scattered,
558 uint16x4_t sum_rgb,
559 uint8x8_t min_rgba,
560 uint8x8_t max_rgba) {
561 // min_rgba: [min_r min_g min_b min_a x x x x]
562 // max_rgba: [max_r max_g max_b max_a x x x x]
563
564 // Determine the color distribution. We already have the max and min; now we
565 // need the average of the 16 pixels, so divide sum_rgb by 16 with rounding.
566 uint16x4_t average_rgb = vrshr_n_u16(sum_rgb, 4);
567
568 // Determine covariance matrix.
569 int32x4x2_t covariance = Covariance(average_rgb, pixels_scattered);
570
571 // Convert covariance matrix to float, find principal axis via power
572 // iteration.
573 float32x4x2_t covariance_float;
574 const float32x4_t kInv255 = vdupq_n_f32(1.0f / 255.0f);
575 covariance_float.val[0] =
576 vmulq_f32(vcvtq_f32_s32(covariance.val[0]), kInv255);
577 covariance_float.val[1] =
578 vmulq_f32(vcvtq_f32_s32(covariance.val[1]), kInv255);
579
580 int16x4_t max_16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(max_rgba)));
581 int16x4_t min_16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(min_rgba)));
582 float32x4_t vf4 = vcvtq_f32_s32(vsubl_s16(max_16, min_16));
583
584 for (int i = 0; i < 4; ++i) {
585 float32x4_t vfr4 = vdupq_n_f32(vgetq_lane_f32(vf4, 0));
586 float32x4_t vfg4 = vdupq_n_f32(vgetq_lane_f32(vf4, 1));
587 float32x4_t vfb4 = vdupq_n_f32(vgetq_lane_f32(vf4, 2));
588
589 // from: [0 1 2 x] [3 4 5 x]
590 // to: [1 3 4 x]
591 float32x4_t cov_134 =
592 vextq_f32(covariance_float.val[1], covariance_float.val[1], 3);
593 cov_134 =
594 vsetq_lane_f32(vgetq_lane_f32(covariance_float.val[0], 1), cov_134, 0);
595
596 // from: [0 1 2 x] [3 4 5 x]
597 // to: [2 4 5 x]
598 float32x4_t cov_245 = vsetq_lane_f32(
599 vgetq_lane_f32(covariance_float.val[0], 2), covariance_float.val[1], 0);
600
601 vf4 = vmulq_f32(vfr4, covariance_float.val[0]);
602 vf4 = vmlaq_f32(vf4, vfg4, cov_134);
603 vf4 = vmlaq_f32(vf4, vfb4, cov_245);
604 }
605
606 float32x4_t magnitude = vabsq_f32(vf4);
607 magnitude = vsetq_lane_f32(0.0f, magnitude, 3); // Null out alpha.
608 float32x4_t mag4 = vdupq_lane_f32(
609 vpmax_f32(vpmax_f32(vget_low_f32(magnitude), vget_high_f32(magnitude)),
610 vdup_n_f32(0.0f)),
611 0);
612
613 const int32x4_t kLuminance = {299, 587, 114, 0};
614
615 // Note that this quite often means dividing by zero. The math still works
616 // when comparing with Inf though.
617 float32x4_t inv_magnitude = Divide<2>(vdupq_n_f32(512.0f), mag4);
618
619 int32x4_t vf4_mag = vcvtq_s32_f32(vmulq_f32(vf4, inv_magnitude));
620 int32x4_t v =
621 vbslq_s32(vcltq_f32(mag4, vdupq_n_f32(4.0f)), kLuminance, vf4_mag);
622
623 return GetBaseColors(pixels_linear, pixels_scattered, v);
624 }
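// For reference, one power-iteration step of the loop above in scalar form
// (illustrative only). The symmetric 3x3 covariance matrix is stored as
// [c0 c1 c2] in covariance_float.val[0] and [c3 c4 c5] in val[1]:
//
//   //      | c0 c1 c2 |
//   //  C = | c1 c3 c4 |,   v' = C * v
//   //      | c2 c4 c5 |
//   new_r = r * c0 + g * c1 + b * c2;
//   new_g = r * c1 + g * c3 + b * c4;
//   new_b = r * c2 + g * c4 + b * c5;
//
// Repeating this a few times drives v toward the principal axis of the
// color cloud.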
625
626 ALWAYS_INLINE uint16x8_t
627 GetApproximateBaseColors(const uint8x16x4_t& pixels_linear,
628 const uint8x16x4_t& pixels_scattered,
629 uint8x8_t min_rgba,
630 uint8x8_t max_rgba) {
631 // min_rgba: [min_r min_g min_b min_a x x x x]
632 // max_rgba: [max_r max_g max_b max_a x x x x]
633
634 // Get direction vector and expand to 32-bit.
635 int16x4_t max_16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(max_rgba)));
636 int16x4_t min_16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(min_rgba)));
637 int32x4_t v = vsubl_s16(max_16, min_16);
638
639 return GetBaseColors(pixels_linear, pixels_scattered, v);
640 }
641
642 // Take two base colors and generate 4 RGBX colors where:
643 // 0 = baseColor0
644 // 1 = baseColor1
645 // 2 = (2 * baseColor0 + baseColor1) / 3
646 // 3 = (2 * baseColor1 + baseColor0) / 3
647 ALWAYS_INLINE uint16x4x4_t EvalColors(const uint16x8_t& base_colors) {
648 // The base colors are expanded by reusing the top bits at the end. That makes
649 // sure that white is still white after being quantized and converted back.
650 //
651 // [(r<<3 | r>>2) (g<<2 | g>>4) (b<<3 | b>>2) 0]
652
653 // The upper shift values for each component.
654 // {3, 2, 3, 0, 3, 2, 3, 0};
655 const int16x8_t kShiftUp = vreinterpretq_s16_u64(vdupq_n_u64(0x300020003));
656 uint16x8_t pixels_up = vshlq_u16(base_colors, kShiftUp);
657 // [r0<<3 g0<<2 b0<<3 0] [r1<<3 g1<<2 b1<<3 0]
658
659 // The lower shift values for each component.
660 // Note that we need to use negative values to shift right.
661 // {-2, -4, -2, 0, -2, -4, -2, 0};
662 const int16x8_t kShiftDown =
663 vreinterpretq_s16_u64(vdupq_n_u64(0xfffefffcfffe));
664 uint16x8_t pixels_down = vshlq_u16(base_colors, kShiftDown);
665 // [r0>>2 g0>>4 b0>>2 0] [r1>>2 g1>>4 b1>>2 0]
666
667 uint16x8_t pixels = vorrq_u16(pixels_up, pixels_down);
668 // [(r0<<3 | r0>>2) (g0<<2 | g0>>4) (b0<<3 | b0>>2) 0]
669 // [(r1<<3 | r1>>2) (g1<<2 | g1>>4) (b1<<3 | b1>>2) 0]
670
671 // Linear interpolate the two other colors:
672 // (2 * max + min) / 3
673 // (2 * min + max) / 3
674
675 uint16x8_t pixels_mul2 = vaddq_u16(pixels, pixels);
676
677 uint16x8_t swapped = vreinterpretq_u16_u64(vextq_u64(
678 vreinterpretq_u64_u16(pixels), vreinterpretq_u64_u16(pixels), 1));
679 int16x8_t output = vreinterpretq_s16_u16(vaddq_u16(pixels_mul2, swapped));
680
681 // There's no division in NEON, but we can approximate x / 3 with
682 // "(x * ((1 << 16) / 3 + 1)) >> 16" instead.
683 output = vqdmulhq_s16(output, vdupq_n_s16(((1 << 16) / 3 + 1) >> 1));
684
685 uint16x4x4_t colors;
686 colors.val[0] = vget_low_u16(pixels);
687 colors.val[1] = vget_high_u16(pixels);
688 colors.val[2] = vreinterpret_u16_s16(vget_low_s16(output));
689 colors.val[3] = vreinterpret_u16_s16(vget_high_s16(output));
690 return colors;
691 }
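// Worked example of the fixed-point divide-by-three used above (illustrative
// only; DivideBy3 is a hypothetical helper). ((1 << 16) / 3 + 1) == 21846, and
// vqdmulhq_s16(x, 10923) computes (2 * x * 10923) >> 16 == (x * 21846) >> 16,
// which equals x / 3 for the small non-negative values involved here:
//
//   int16_t DivideBy3(int16_t x) {  // x in [0, 3 * 255]
//     return static_cast<int16_t>((x * 21846) >> 16);
//   }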
692
693 template <typename T>
694 ALWAYS_INLINE uint8x8_t GetRemapIndices(int32x4_t dots,
695 int32x4_t half_point,
696 int32x4_t c0_point,
697 int32x4_t c3_point) {
698 // bits = (dot < half_point ? 4 : 0)
699 // | (dot < c0_point ? 2 : 0)
700 // | (dot < c3_point ? 1 : 0)
701 int32x4_t cmp0 = vreinterpretq_s32_u32(
702 vandq_u32(vcgtq_s32(half_point, dots), vdupq_n_u32(4)));
703 int32x4_t cmp1 = vreinterpretq_s32_u32(
704 vandq_u32(vcgtq_s32(c0_point, dots), vdupq_n_u32(2)));
705 int32x4_t cmp2 = vreinterpretq_s32_u32(
706 vandq_u32(vcgtq_s32(c3_point, dots), vdupq_n_u32(1)));
707 int32x4_t bits = vorrq_s32(vorrq_s32(cmp0, cmp1), cmp2);
708
709 // Narrow it down to unsigned 8 bits and return.
710 return vqmovn_u16(vcombine_u16(vqmovun_s32(bits), vdup_n_u16(0)));
711 }
712
713 // dots: Dot products for each pixel.
714 // points: Crossover points.
715 template <typename T>
716 ALWAYS_INLINE uint8x16_t GetColorIndices(int32x4x4_t dots, int32x4_t points) {
717 // Crossover points for "best color in top half"/"best in bottom half" and
718 // the same inside that subinterval.
719 int32x4_t c0_point = vdupq_lane_s32(vget_low_s32(points), 1);
720 int32x4_t half_point = vdupq_lane_s32(vget_low_s32(points), 0);
721 int32x4_t c3_point = vdupq_lane_s32(vget_high_s32(points), 0);
722
723 // Get kRemap table indices.
724 uint8x8x4_t ind;
725 ind.val[0] = GetRemapIndices<T>(dots.val[0], half_point, c0_point, c3_point);
726 ind.val[1] = GetRemapIndices<T>(dots.val[1], half_point, c0_point, c3_point);
727 ind.val[2] = GetRemapIndices<T>(dots.val[2], half_point, c0_point, c3_point);
728 ind.val[3] = GetRemapIndices<T>(dots.val[3], half_point, c0_point, c3_point);
729
730 // Combine indices.
731 uint8x8_t indices_lo =
732 vreinterpret_u8_u32(vzip_u32(vreinterpret_u32_u8(ind.val[0]),
733 vreinterpret_u32_u8(ind.val[1])).val[0]);
734 uint8x8_t indices_hi =
735 vreinterpret_u8_u32(vzip_u32(vreinterpret_u32_u8(ind.val[2]),
736 vreinterpret_u32_u8(ind.val[3])).val[0]);
737 // Do table lookup and return 2-bit color indices.
738 return vcombine_u8(vtbl1_u8(T::kRemap, indices_lo),
739 vtbl1_u8(T::kRemap, indices_hi));
740 }
741
742 template <typename T>
743 ALWAYS_INLINE uint8x16_t
744 MatchColorsBlock(const uint8x16x4_t& pixels_scattered, uint16x4x4_t colors) {
745 // Get direction vector and expand to 32-bit.
746 int32x4_t dir = vsubl_s16(vreinterpret_s16_u16(colors.val[0]),
747 vreinterpret_s16_u16(colors.val[1]));
748 // Duplicate r g b elements of direction into different registers.
749 int32x4_t dir_r = vdupq_lane_s32(vget_low_s32(dir), 0);
750 int32x4_t dir_g = vdupq_lane_s32(vget_low_s32(dir), 1);
751 int32x4_t dir_b = vdupq_lane_s32(vget_high_s32(dir), 0);
752
753 // Transpose to separate red, green, blue and alpha channels into 4 different
754 // registers. Alpha is ignored.
755 uint16x4x2_t trn_lo = vtrn_u16(colors.val[0], colors.val[1]);
756 uint16x4x2_t trn_hi = vtrn_u16(colors.val[2], colors.val[3]);
757 uint32x4x2_t transposed_colors = vtrnq_u32(
758 vreinterpretq_u32_u16(vcombine_u16(trn_lo.val[0], trn_lo.val[1])),
759 vreinterpretq_u32_u16(vcombine_u16(trn_hi.val[0], trn_hi.val[1])));
760
761 // Expand to 32-bit.
762 int32x4_t colors_r =
763 vmovl_s16(vget_low_s16(vreinterpretq_s16_u32(transposed_colors.val[0])));
764 int32x4_t colors_g =
765 vmovl_s16(vget_high_s16(vreinterpretq_s16_u32(transposed_colors.val[0])));
766 int32x4_t colors_b =
767 vmovl_s16(vget_low_s16(vreinterpretq_s16_u32(transposed_colors.val[1])));
768
769 // Get dot products.
770 int32x4_t stops =
771 DotProduct(colors_r, colors_g, colors_b, dir_r, dir_g, dir_b);
772
773 // Build a register containing the 4th, 2nd and 3rd elements of stops,
774 // one per 32-bit element.
775 int32x4_t points1 = vsetq_lane_s32(vgetq_lane_s32(stops, 3), stops, 0);
776 // Build a register containing the 3rd, 4th and 1st elements of stops,
777 // one per 32-bit element.
778 int32x4_t points2 = vreinterpretq_s32_s64(
779 vextq_s64(vreinterpretq_s64_s32(stops), vreinterpretq_s64_s32(stops), 1));
780 // Add and divide by 2.
781 int32x4_t points = vshrq_n_s32(vaddq_s32(points1, points2), 1);
782
783 // Expand all pixels to signed 32-bit integers.
784 int32x4x4_t r = ExpandRGBATo32(pixels_scattered.val[0]);
785 int32x4x4_t g = ExpandRGBATo32(pixels_scattered.val[1]);
786 int32x4x4_t b = ExpandRGBATo32(pixels_scattered.val[2]);
787
788 int32x4x4_t dots = CalculateDots(r, g, b, dir);
789
790 // Get 2-bit color indices.
791 return GetColorIndices<T>(dots, points);
792 }
793
794 template <typename T>
795 ALWAYS_INLINE uint32x4x2_t DoProdsTableLookup(uint8x8_t indices) {
796 // Do a table lookup for each color index. The table entries are three bytes
797 // wide, so the lookup is done in three steps.
798 uint16x8_t lookup1 = vmovl_u8(vtbl1_u8(vcreate_u8(T::kProds[0]), indices));
799 uint16x8_t lookup2 = vmovl_u8(vtbl1_u8(vcreate_u8(T::kProds[1]), indices));
800 uint16x8_t lookup3 = vmovl_u8(vtbl1_u8(vcreate_u8(T::kProds[2]), indices));
801 // Expand to 32-bit.
802 uint32x4_t lookup1_lo = vmovl_u16(vget_low_u16(lookup1));
803 uint32x4_t lookup1_hi = vmovl_u16(vget_high_u16(lookup1));
804 uint32x4_t lookup2_lo = vmovl_u16(vget_low_u16(lookup2));
805 uint32x4_t lookup2_hi = vmovl_u16(vget_high_u16(lookup2));
806 uint32x4_t lookup3_lo = vmovl_u16(vget_low_u16(lookup3));
807 uint32x4_t lookup3_hi = vmovl_u16(vget_high_u16(lookup3));
808 // Combine results by shifting and or-ing to obtain the actual table value.
809 uint32x4x2_t result;
810 result.val[0] = vorrq_u32(lookup3_lo, vorrq_u32(vshlq_n_u32(lookup2_lo, 8),
811 vshlq_n_u32(lookup1_lo, 16)));
812 result.val[1] = vorrq_u32(lookup3_hi, vorrq_u32(vshlq_n_u32(lookup2_hi, 8),
813 vshlq_n_u32(lookup1_hi, 16)));
814 return result;
815 }
816
817 // Tries to optimize the base colors to suit the block contents better. Done
818 // by solving a least-squares system via the normal equations and Cramer's rule.
819 template <typename T>
820 ALWAYS_INLINE int RefineBlock(const uint8x16x4_t& pixels_scattered,
821 uint16x4_t sum_rgb,
822 uint16x8_t& base_colors,
823 uint8x16_t indices) {
824 uint16x8_t old_base_colors = base_colors;
825
826 if (ElementsEqual(indices)) { // Do all pixels have the same index?
827 // Yes, linear system would be singular; solve using optimal single-color
828 // match on average color.
829
830 // Get the average of the 16 pixels with rounding.
831 uint16x4_t average_rgb = vrshr_n_u16(sum_rgb, 4);
832
833 ALIGNAS(8) uint16_t rgb[4];
834 vst1_u16(rgb, average_rgb);
835 // Look up optimal values instead of trying to calculate.
836 uint16_t colors[8] = {g_o_match55[rgb[0]][0],
837 MatchSingleGreenMax<typename T::BASE_TYPE>(rgb[1]),
838 g_o_match55[rgb[2]][0],
839 0,
840 g_o_match55[rgb[0]][1],
841 MatchSingleGreenMin<typename T::BASE_TYPE>(rgb[1]),
842 g_o_match55[rgb[2]][1],
843 0};
844 base_colors = vld1q_u16(colors);
845 } else {
846 // Expand to 16-bit.
847 int16x8x2_t r = ExpandRGBATo16(pixels_scattered.val[0]);
848 int16x8x2_t g = ExpandRGBATo16(pixels_scattered.val[1]);
849 int16x8x2_t b = ExpandRGBATo16(pixels_scattered.val[2]);
850
851 // Do table lookup for each color index.
852 int8x16_t w1 = DoW1TableLookup<T>(indices);
853 // Expand to 16-bit.
854 int16x8_t w1_lo = vmovl_s8(vget_low_s8(w1));
855 int16x8_t w1_hi = vmovl_s8(vget_high_s8(w1));
856 // Multiply and accumulate.
857 int16x8x4_t at1_rgb;
858 at1_rgb.val[0] = vmulq_s16(w1_lo, r.val[0]);
859 at1_rgb.val[0] = vmlaq_s16(at1_rgb.val[0], w1_hi, r.val[1]);
860 at1_rgb.val[1] = vmulq_s16(w1_lo, g.val[0]);
861 at1_rgb.val[1] = vmlaq_s16(at1_rgb.val[1], w1_hi, g.val[1]);
862 at1_rgb.val[2] = vmulq_s16(w1_lo, b.val[0]);
863 at1_rgb.val[2] = vmlaq_s16(at1_rgb.val[2], w1_hi, b.val[1]);
864 // [r][g][b][]
865 int32x4_t at1 = SumRGB(at1_rgb);
866
867 // [r][g][b][]
868 int32x4_t at2 = vreinterpretq_s32_u32(vmovl_u16(sum_rgb));
869 // at2 = 3 * at2 - at1;
870 at2 = vsubq_s32(vmulq_s32(at2, vdupq_n_s32(3)), at1);
871
872 // Do table lookup for each color index.
873 uint32x4x2_t akku1 = DoProdsTableLookup<T>(vget_low_u8(indices));
874 uint32x4x2_t akku2 = DoProdsTableLookup<T>(vget_high_u8(indices));
875 uint32x4_t sum_akku = vaddq_u32(
876 vaddq_u32(vaddq_u32(akku1.val[0], akku1.val[1]), akku2.val[0]),
877 akku2.val[1]);
878 // Pairwise add and accumulate.
879 uint64x1_t akku = vpaddl_u32(vget_low_u32(sum_akku));
880 akku = vpadal_u32(akku, vget_high_u32(sum_akku));
881
882 // Extract solutions and decide solvability.
883
884 // [akku >> 16]x4
885 int32x4_t xx =
886 vdupq_lane_s32(vreinterpret_s32_u64(vshr_n_u64(akku, 16)), 0);
887 // [(akku >> 8) & 0xff]x4
888 const uint64x1_t kFF = vdup_n_u64(0xff);
889 int32x4_t yy = vdupq_lane_s32(
890 vreinterpret_s32_u64(vand_u64(vshr_n_u64(akku, 8), kFF)), 0);
891 // [akku & 0xff]x4
892 int32x4_t xy = vdupq_lane_s32(vreinterpret_s32_u64(vand_u64(akku, kFF)), 0);
893
894 // ((3.0f * 31.0f) / 255.0f) / (xx * yy - xy * xy)
895 float32x4_t frb = Divide<2>(
896 vdupq_n_f32((3.0f * 31.0f) / 255.0f),
897 vcvtq_f32_s32(vsubq_s32(vmulq_s32(xx, yy), vmulq_s32(xy, xy))));
898 // frb * 63.0f / 31.0f
899 float32x4_t fg = vmulq_f32(vmulq_f32(frb, vdupq_n_f32(63.0f)),
900 vdupq_n_f32(1.0f / 31.0f));
901
902 // Solve.
903
904 // [frb][fg][frb][]
905 float32x4_t frb_fg_frb = vsetq_lane_f32(vgetq_lane_f32(fg, 0), frb, 1);
906 // [31][63][31][]
907 const int32x4_t kClamp565_vec = {31, 63, 31, 0};
908
909 // (at1_r * yy - at2_r * xy) * frb + 0.5f
910 int32x4_t base0_rgb32 = vcvtq_s32_f32(vaddq_f32(
911 vmulq_f32(
912 vcvtq_f32_s32(vsubq_s32(vmulq_s32(at1, yy), vmulq_s32(at2, xy))),
913 frb_fg_frb),
914 vdupq_n_f32(0.5f)));
915 // Clamp and saturate.
916 uint16x4_t base0_rgb16 = vqmovun_s32(vbslq_s32(
917 vcgeq_s32(base0_rgb32, kClamp565_vec), kClamp565_vec, base0_rgb32));
918
919 // (at2_r * xx - at1_r * xy) * frb + 0.5f
920 int32x4_t base1_rgb32 = vcvtq_s32_f32(vaddq_f32(
921 vmulq_f32(
922 vcvtq_f32_s32(vsubq_s32(vmulq_s32(at2, xx), vmulq_s32(at1, xy))),
923 frb_fg_frb),
924 vdupq_n_f32(0.5f)));
925 // Clamp and saturate.
926 uint16x4_t base1_rgb16 = vqmovun_s32(vbslq_s32(
927 vcgeq_s32(base1_rgb32, kClamp565_vec), kClamp565_vec, base1_rgb32));
928
929 base_colors = vcombine_u16(base0_rgb16, base1_rgb16);
930 }
931
932 return !Equal(old_base_colors, base_colors);
933 }
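// For reference, the least-squares system RefineBlock solves, written out in
// scalar form (illustrative only). With w1 the per-pixel weight of base color
// 0 in thirds (from kW1Table) and w0 = 3 - w1, the model is
// pixel ~= (w1 * base0 + w0 * base1) / 3, and the normal equations give:
//
//   // xx = sum(w1 * w1), yy = sum(w0 * w0), xy = sum(w1 * w0)  (from kProds)
//   // at1 = sum(w1 * pixel), at2 = sum(w0 * pixel)
//   float det = xx * yy - xy * xy;
//   float base0 = 3.0f * (at1 * yy - at2 * xy) / det;
//   float base1 = 3.0f * (at2 * xx - at1 * xy) / det;
//
// The code above folds the factor of 3, the 1/det and the 255 -> 31 (or 63
// for green) quantization into the single frb/fg multiplier.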
934
935 template <typename T, Quality QUALITY>
936 ALWAYS_INLINE void CompressColorBlock(uint8_t* dst,
937 const uint8x16x4_t& pixels_linear,
938 const uint8x16x4_t& pixels_scattered,
939 uint8x8_t min_rgba,
940 uint8x8_t max_rgba) {
941 // Take a shortcut if the block is constant (disregarding alpha).
942 uint32_t min32 = vget_lane_u32(vreinterpret_u32_u8(min_rgba), 0);
943 uint32_t max32 = vget_lane_u32(vreinterpret_u32_u8(max_rgba), 0);
944 if ((min32 & 0x00ffffff) == (max32 & 0x00ffffff)) {
945 // Swapping r and b channels to match Skia.
946 int b = min32 & 0xff;
947 int g = (min32 >> 8) & 0xff;
948 int r = (min32 >> 16) & 0xff;
949
950 uint16_t max16 = MatchSingleColorMax<typename T::BASE_TYPE>(r, g, b);
951 uint16_t min16 = MatchSingleColorMin<typename T::BASE_TYPE>(r, g, b);
952 uint32_t indices = T::kConstantColorIndices;
953 FormatFixup_Generic<typename T::BASE_TYPE>(&max16, &min16, &indices);
954
955 uint32_t* dst32 = reinterpret_cast<uint32_t*>(dst);
956 dst32[0] = max16 | (min16 << 16);
957 dst32[1] = indices;
958 } else {
959 uint16x4_t sum_rgb;
960 uint16x8_t base_colors;
961
962 if (QUALITY == kQualityLow) {
963 base_colors = GetApproximateBaseColors(pixels_linear, pixels_scattered,
964 min_rgba, max_rgba);
965 } else {
966 sum_rgb = SumRGB(pixels_scattered);
967 // Do Principal Component Analysis and map along the principal axis.
968 base_colors = OptimizeColorsBlock(pixels_linear, pixels_scattered,
969 sum_rgb, min_rgba, max_rgba);
970 }
971
972 // Check if the two base colors are the same.
973 uint8x16_t indices;
974 if (!Equal(vget_low_u16(base_colors), vget_high_u16(base_colors))) {
975 // Calculate the two intermediate colors as well.
976 uint16x4x4_t colors = EvalColors(base_colors);
977
978 // Do a first pass to find good index candidates for all 16 of the pixels
979 // in the block.
980 indices = MatchColorsBlock<T>(pixels_scattered, colors);
981 } else {
982 // Any indices can be used here.
983 indices = vdupq_n_u8(0);
984 }
985
986 if (QUALITY == kQualityHigh) {
987 // Refine the base colors and indices multiple times if requested.
988 for (int i = 0; i < kNumRefinements; ++i) {
989 uint8x16_t lastIndices = indices;
990
991 if (RefineBlock<T>(pixels_scattered, sum_rgb, base_colors, indices)) {
992 if (!Equal(vget_low_u16(base_colors), vget_high_u16(base_colors))) {
993 uint16x4x4_t colors = EvalColors(base_colors);
994 indices = MatchColorsBlock<T>(pixels_scattered, colors);
995 } else {
996 // We ended up with two identical base colors, can't refine this
997 // further.
998 indices = vdupq_n_u8(0);
999 break;
1000 }
1001 }
1002
1003 if (Equal(indices, lastIndices)) {
1004 // There's no need to do another refinement pass if we didn't get any
1005 // improvements this pass.
1006 break;
1007 }
1008 }
1009 }
1010
1011 // Prepare the final block by converting the base colors to 16-bit and
1012 // packing the pixel indices.
1013 uint16x4_t base_colors_16 = PackBaseColors(base_colors);
1014 uint64x1_t indices_2x16 = PackIndices<2>(indices);
1015 FormatFixup<T>(&base_colors_16, &indices_2x16);
1016 uint64x1_t output = vorr_u64(vshl_n_u64(indices_2x16, 32),
1017 vreinterpret_u64_u16(base_colors_16));
1018 vst1_u64(reinterpret_cast<uint64_t*>(dst), output);
1019 }
1020 }
1021
1022 // alpha: 8x8-bit alpha values.
1023 // dist: Distance between max and min alpha in the color block.
1024 // bias: Rounding bias.
1025 ALWAYS_INLINE uint8x8_t
1026 GetAlphaIndices(uint8x8_t alpha, int16x8_t dist, int16x8_t bias) {
1027 // Expand to signed 16-bit.
1028 int16x8_t alpha_16 = vreinterpretq_s16_u16(vmovl_u8(alpha));
1029
1030 // Multiply each alpha value by 7 and add bias.
1031 int16x8_t a = vaddq_s16(vmulq_s16(alpha_16, vdupq_n_s16(7)), bias);
1032
1033 int16x8_t dist4 = vmulq_s16(dist, vdupq_n_s16(4));
1034 int16x8_t dist2 = vmulq_s16(dist, vdupq_n_s16(2));
1035
1036 // Select index. This is a "linear scale" lerp factor between 0 (val=min)
1037 // and 7 (val=max).
1038 // t = (a >= dist4) ? -1 : 0
1039 int16x8_t t =
1040 vandq_s16(vreinterpretq_s16_u16(vcgeq_s16(a, dist4)), vdupq_n_s16(-1));
1041 // ind1 = t & 4;
1042 int16x8_t ind1 = vandq_s16(t, vdupq_n_s16(4));
1043 // a1 = a - (dist4 & t);
1044 int16x8_t a1 = vsubq_s16(a, vandq_s16(dist4, t));
1045
1046 // t = (a1 >= dist2) ? -1 : 0;
1047 t = vandq_s16(vreinterpretq_s16_u16(vcgeq_s16(a1, dist2)), vdupq_n_s16(-1));
1048 // ind2 = t & 2;
1049 int16x8_t ind2 = vandq_s16(t, vdupq_n_s16(2));
1050 // a2 = a1 - (dist2 & t);
1051 int16x8_t a2 = vsubq_s16(a1, vandq_s16(dist2, t));
1052
1053 // ind3 = (a2 >= dist)
1054 int16x8_t ind3 =
1055 vandq_s16(vreinterpretq_s16_u16(vcgeq_s16(a2, dist)), vdupq_n_s16(1));
1056
1057 // indices = ind1 + ind2 + ind3
1058 int16x8_t indices = vaddq_s16(ind1, vaddq_s16(ind2, ind3));
1059
1060 // Turn linear scale into alpha index (0/1 are extremal pts).
1061 // ind = -indices & 7
1062 int16x8_t ind = vandq_s16(vnegq_s16(indices), vdupq_n_s16(7));
1063 // indices = ind ^ (2 > ind)
1064 indices = veorq_s16(
1065 ind, vandq_s16(vreinterpretq_s16_u16(vcgtq_s16(vdupq_n_s16(2), ind)),
1066 vdupq_n_s16(1)));
1067 // Narrow it down to unsigned 8 bits and return.
1068 return vqmovun_s16(indices);
1069 }
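// A scalar sketch of the index selection above for a single alpha value
// (illustrative only; see the fgiesen link at the top of the file for the
// derivation):
//
//   int a = alpha * 7 + bias;  // bias already has min * 7 subtracted
//   int ind = 0;
//   if (a >= dist * 4) { ind += 4; a -= dist * 4; }
//   if (a >= dist * 2) { ind += 2; a -= dist * 2; }
//   if (a >= dist)     { ind += 1; }
//   // Turn the linear 0..7 scale into a DXT5 alpha index, where indices
//   // 0 and 1 are the extremal points.
//   ind = -ind & 7;
//   if (ind < 2) ind ^= 1;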
1070
1071 ALWAYS_INLINE void CompressAlphaBlock(uint8_t* dst,
1072 uint8x16_t pixels_alpha,
1073 uint8x8_t min_rgba,
1074 uint8x8_t max_rgba) {
1075 // Take a shortcut if the block is constant.
1076 uint8_t min_alpha = vget_lane_u8(min_rgba, 3);
1077 uint8_t max_alpha = vget_lane_u8(max_rgba, 3);
1078 if (min_alpha == max_alpha) {
1079 dst[0] = max_alpha;
1080 dst[1] = min_alpha;
1081 // All indices are the same, any value will do.
1082 *reinterpret_cast<uint16_t*>(dst + 2) = 0;
1083 *reinterpret_cast<uint32_t*>(dst + 4) = 0;
1084 } else {
1085 // [max - min]x8
1086 int16x8_t dist = vdupq_lane_s16(
1087 vreinterpret_s16_u16(vget_low_u16(vsubl_u8(max_rgba, min_rgba))), 3);
1088 // bias = (dist < 8) ? (dist - 1) : (dist / 2 + 2)
1089 int16x8_t bias = vbslq_s16(vcltq_s16(dist, vdupq_n_s16(8)),
1090 vsubq_s16(dist, vdupq_n_s16(1)),
1091 vaddq_s16(vshrq_n_s16(dist, 1), vdupq_n_s16(2)));
1092 // bias -= min * 7;
1093 bias = vsubq_s16(
1094 bias,
1095 vmulq_s16(
1096 vdupq_lane_s16(
1097 vreinterpret_s16_u16(vget_low_u16(vmovl_u8(min_rgba))), 3),
1098 vdupq_n_s16(7)));
1099
1100 uint8x8_t indices_lo =
1101 GetAlphaIndices(vget_low_u8(pixels_alpha), dist, bias);
1102 uint8x8_t indices_hi =
1103 GetAlphaIndices(vget_high_u8(pixels_alpha), dist, bias);
1104
1105 // Prepare the final block by combining the base alpha values and packing
1106 // the alpha indices.
1107 uint8x8_t max_min_alpha = vzip_u8(max_rgba, min_rgba).val[0];
1108 uint64x1_t indices = PackIndices<3>(vcombine_u8(indices_lo, indices_hi));
1109 uint64x1_t output =
1110 vorr_u64(vshl_n_u64(indices, 16),
1111 vshr_n_u64(vreinterpret_u64_u8(max_min_alpha), 48));
1112 vst1_u64(reinterpret_cast<uint64_t*>(dst), output);
1113 }
1114 }
1115
1116 template <typename T, bool OPAQUE, Quality QUALITY>
1117 void CompressImage(const uint8_t* src, uint8_t* dst, int width, int height) {
1118 for (int y = 0; y < height; y += 4, src += width * 4 * 4) {
1119 for (int x = 0; x < width; x += 4) {
1120 // Load the four rows of pixels.
1121 uint8x16x4_t pixels_linear;
1122 pixels_linear.val[0] = vld1q_u8(src + (x + 0 * width) * 4);
1123 pixels_linear.val[1] = vld1q_u8(src + (x + 1 * width) * 4);
1124 pixels_linear.val[2] = vld1q_u8(src + (x + 2 * width) * 4);
1125 pixels_linear.val[3] = vld1q_u8(src + (x + 3 * width) * 4);
1126
1127 // Transpose/scatter the red, green, blue and alpha channels into
1128 // separate registers.
1129 ALIGNAS(8) uint8_t block[64];
1130 vst1q_u8(block + 0 * 16, pixels_linear.val[0]);
1131 vst1q_u8(block + 1 * 16, pixels_linear.val[1]);
1132 vst1q_u8(block + 2 * 16, pixels_linear.val[2]);
1133 vst1q_u8(block + 3 * 16, pixels_linear.val[3]);
1134 uint8x16x4_t pixels_scattered = vld4q_u8(block);
1135
1136 // We need the min and max values both to detect solid blocks and to
1137 // compute the base colors.
1138 uint8x8_t min_rgba = FoldRGBA<vec_ops::Min>(pixels_scattered);
1139 uint8x8_t max_rgba = FoldRGBA<vec_ops::Max>(pixels_scattered);
1140
1141 if (!OPAQUE) {
1142 CompressAlphaBlock(dst, pixels_scattered.val[3], min_rgba, max_rgba);
1143 dst += 8;
1144 }
1145
1146 CompressColorBlock<T, QUALITY>(dst, pixels_linear, pixels_scattered,
1147 min_rgba, max_rgba);
1148 dst += 8;
1149 }
1150 }
1151 }
1152
1153 } // namespace
1154
1155 void CompressATC_NEON(const uint8_t* src, uint8_t* dst, int width, int height) {
1156 CompressImage<TYPE_ATC_NEON, true, kQualityHigh>(src, dst, width, height);
1157 }
1158
1159 void CompressATCIA_NEON(const uint8_t* src,
1160 uint8_t* dst,
1161 int width,
1162 int height) {
1163 CompressImage<TYPE_ATC_NEON, false, kQualityHigh>(src, dst, width, height);
1164 }
1165
1166 void CompressDXT1_NEON(const uint8_t* src,
1167 uint8_t* dst,
1168 int width,
1169 int height) {
1170 CompressImage<TYPE_DXT_NEON, true, kQualityHigh>(src, dst, width, height);
1171 }
1172
1173 void CompressDXT5_NEON(const uint8_t* src,
1174 uint8_t* dst,
1175 int width,
1176 int height) {
1177 CompressImage<TYPE_DXT_NEON, false, kQualityHigh>(src, dst, width, height);
1178 }
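// A minimal usage sketch (illustrative only; the buffer-size notes are
// inferred from the loops above, not stated by the patch). CompressImage
// walks the source in 4x4 blocks, so width and height are expected to be
// multiples of 4; each block emits 8 bytes for ATC/DXT1 and 16 bytes for
// ATCIA/DXT5:
//
//   // src: width * height source pixels, 4 bytes each.
//   // dst: width * height bytes for DXT5 (1 byte per pixel),
//   //      width * height / 2 bytes for DXT1.
//   cc::texture_compress::CompressDXT5_NEON(src, dst, width, height);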
1179
1180 } // namespace texture_compress
1181 } // namespace cc
1182
1183 #endif // __ARM_NEON__
OLDNEW
« no previous file with comments | « cc/resources/texture_compress/arm/atc_dxt_neon.h ('k') | cc/resources/texture_compress/arm/etc1_neon.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698