// Copyright 2014 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// See the links below for detailed descriptions of the algorithms used.
// http://cbloomrants.blogspot.se/2008/12/12-08-08-dxtc-summary.html
// http://fgiesen.wordpress.com/2009/12/15/dxt5-alpha-block-index-determination

#if defined(__ARM_NEON__)

#include "cc/resources/texture_compress/arm/atc_dxt_neon.h"

#include <arm_neon.h>

#include "base/compiler_specific.h"
#include "base/logging.h"
#include "cc/resources/texture_compress/atc_dxt.h"

namespace cc {
namespace texture_compress {

struct TYPE_ATC_NEON : public TYPE_ATC {
  typedef TYPE_ATC BASE_TYPE;
  static const uint8x8_t kRemap;
  static const uint64_t kProds[3];
};

struct TYPE_DXT_NEON : public TYPE_DXT {
  typedef TYPE_DXT BASE_TYPE;
  static const uint8x8_t kRemap;
  static const int8x8_t kW1Table;
  static const uint64_t kProds[3];
};

const uint8x8_t TYPE_ATC_NEON::kRemap = {0, 1, 0, 1, 2, 2, 3, 3};
const uint64_t TYPE_ATC_NEON::kProds[3] = {0x00010409, 0x09040100, 0x00020200};

const uint8x8_t TYPE_DXT_NEON::kRemap = {0, 2, 0, 2, 3, 3, 1, 1};
const int8x8_t TYPE_DXT_NEON::kW1Table = {3, 0, 2, 1, 0, 0, 0, 0};
const uint64_t TYPE_DXT_NEON::kProds[3] = {0x01040009, 0x04010900, 0x02020000};
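
// kRemap maps the 3-bit comparison mask computed in GetRemapIndices below to
// the 2-bit encoded color index of the nearest palette color. kW1Table maps
// an encoded DXT index to its interpolation weight w1 (out of 3), and kProds
// holds the packed weight products used by the refinement solve; see the
// comments at DoProdsTableLookup and RefineBlock.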

// Number of passes over the block that are done to refine the base colors.
// Only applies to the high quality compression mode.
const int kNumRefinements = 2;

namespace {

template <typename T>
ALWAYS_INLINE int8x16_t DoW1TableLookup(uint8x16_t indices);

template <>
ALWAYS_INLINE int8x16_t DoW1TableLookup<TYPE_ATC_NEON>(uint8x16_t indices) {
  // Take a shortcut for ATC which gives the same result as the table lookup.
  // {0, 1, 2, 3} -> {3, 2, 1, 0}
  return veorq_s8(vreinterpretq_s8_u8(indices), vdupq_n_s8(3));
}

template <>
ALWAYS_INLINE int8x16_t DoW1TableLookup<TYPE_DXT_NEON>(uint8x16_t indices) {
  // Do table lookup for each color index.
  return vcombine_s8(vtbl1_s8(TYPE_DXT_NEON::kW1Table,
                              vreinterpret_s8_u8(vget_low_u8(indices))),
                     vtbl1_s8(TYPE_DXT_NEON::kW1Table,
                              vreinterpret_s8_u8(vget_high_u8(indices))));
}

// Returns max and min base green colors matching the given single green color
// when solved via linear interpolation. Output format differs for ATC and DXT.
// See explicitly instantiated template functions below.
template <typename T>
ALWAYS_INLINE uint16_t MatchSingleGreenMax(int g);
template <typename T>
ALWAYS_INLINE uint16_t MatchSingleGreenMin(int g);

template <>
ALWAYS_INLINE uint16_t MatchSingleGreenMax<TYPE_ATC>(int g) {
  return g_o_match56[g][0] << 1;
}

template <>
ALWAYS_INLINE uint16_t MatchSingleGreenMin<TYPE_ATC>(int g) {
  return g_o_match56[g][1];
}

template <>
ALWAYS_INLINE uint16_t MatchSingleGreenMax<TYPE_DXT>(int g) {
  return g_o_match66[g][0];
}

template <>
ALWAYS_INLINE uint16_t MatchSingleGreenMin<TYPE_DXT>(int g) {
  return g_o_match66[g][1];
}

// This converts the output data to either ATC or DXT format.
// See explicitly instantiated template functions below.
template <typename T>
ALWAYS_INLINE void FormatFixup(uint16x4_t* base_colors, uint64x1_t* indices);

template <>
ALWAYS_INLINE void FormatFixup<TYPE_ATC_NEON>(uint16x4_t* base_colors,
                                              uint64x1_t* indices) {
  // First color in ATC format is 555.
  *base_colors = vorr_u16(
      vand_u16(*base_colors, vreinterpret_u16_u64(vdup_n_u64(0xffff001f))),
      vshr_n_u16(
          vand_u16(*base_colors, vreinterpret_u16_u64(vdup_n_u64(0x0000ffC0))),
          1));
}

template <>
ALWAYS_INLINE void FormatFixup<TYPE_DXT_NEON>(uint16x4_t* base_colors,
                                              uint64x1_t* indices) {
  // Swap min/max colors if necessary.
  uint16x4_t max = vdup_lane_u16(*base_colors, 0);
  uint16x4_t min = vdup_lane_u16(*base_colors, 1);
  uint16x4_t cmp = vclt_u16(max, min);
  *base_colors =
      vorr_u16(vand_u16(vbsl_u16(cmp, min, max),
                        vreinterpret_u16_u64(vdup_n_u64(0x0000ffff))),
               vand_u16(vbsl_u16(cmp, max, min),
                        vreinterpret_u16_u64(vdup_n_u64(0xffff0000))));
  *indices = vbsl_u64(vreinterpret_u64_u16(cmp),
                      veor_u64(*indices, vdup_n_u64(0x55555555)), *indices);
}

// Check if all of the 8-bit elements in the given quad register are equal.
ALWAYS_INLINE bool ElementsEqual(uint8x16_t elements) {
  uint8x16_t first = vdupq_lane_u8(vget_low_u8(elements), 0);
  uint8x16_t eq = vceqq_u8(elements, first);
  uint8x8_t tst = vand_u8(vget_low_u8(eq), vget_high_u8(eq));
  return vget_lane_u64(vreinterpret_u64_u8(tst), 0) == 0xffffffffffffffff;
}

ALWAYS_INLINE bool Equal(uint8x16_t e1, uint8x16_t e2) {
  uint8x16_t eq = vceqq_u8(e1, e2);
  uint8x8_t tst = vand_u8(vget_low_u8(eq), vget_high_u8(eq));
  return vget_lane_u64(vreinterpret_u64_u8(tst), 0) == 0xffffffffffffffff;
}

ALWAYS_INLINE bool Equal(uint16x8_t e1, uint16x8_t e2) {
  uint16x8_t eq = vceqq_u16(e1, e2);
  uint16x4_t tst = vand_u16(vget_low_u16(eq), vget_high_u16(eq));
  return vget_lane_u64(vreinterpret_u64_u16(tst), 0) == 0xffffffffffffffff;
}

ALWAYS_INLINE bool Equal(uint16x4_t e1, uint16x4_t e2) {
  uint16x4_t eq = vceq_u16(e1, e2);
  return vget_lane_u64(vreinterpret_u64_u16(eq), 0) == 0xffffffffffffffff;
}

ALWAYS_INLINE int16x8x2_t ExpandRGBATo16(const uint8x16_t& channel) {
  int16x8x2_t result;
  result.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(channel)));
  result.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(channel)));
  return result;
}

ALWAYS_INLINE int32x4x4_t ExpandRGBATo32(const uint8x16_t& channel) {
  uint16x8_t lo = vmovl_u8(vget_low_u8(channel));
  uint16x8_t hi = vmovl_u8(vget_high_u8(channel));
  int32x4x4_t result;
  result.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(lo)));
  result.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(lo)));
  result.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(hi)));
  result.val[3] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(hi)));
  return result;
}

// NEON has no division instruction. The recommended approach is instead to
// refine a reciprocal estimate with Newton-Raphson steps to get a close
// approximation.
template <int REFINEMENT_STEPS>
ALWAYS_INLINE float32x4_t Divide(float32x4_t a, float32x4_t b) {
#ifdef VERIFY_RESULTS
  ALIGNAS(8) float a_[4];
  ALIGNAS(8) float b_[4];
  vst1q_f32(a_, a);
  vst1q_f32(b_, b);
  for (int i = 0; i < 4; ++i)
    a_[i] /= b_[i];
  return vld1q_f32(a_);
#else
  // Get an initial estimate of 1/b.
  float32x4_t reciprocal = vrecpeq_f32(b);
  // Use a number of Newton-Raphson steps to refine the estimate.
  for (int i = 0; i < REFINEMENT_STEPS; ++i)
    reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal);
  // Calculate the final estimate.
  return vmulq_f32(a, reciprocal);
#endif
}
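
// For reference, vrecpsq_f32(b, x) computes (2 - b * x), so each refinement
// step above is the Newton-Raphson reciprocal iteration x' = x * (2 - b * x).
// A scalar sketch of the same scheme (illustrative only; the estimate helper
// is hypothetical and stands in for vrecpeq_f32):
//
//   float ApproxDivide(float a, float b) {
//     float x = RoughReciprocalEstimate(b);  // ~8 bits of precision
//     x = x * (2.0f - b * x);                // first refinement step
//     x = x * (2.0f - b * x);                // second refinement step
//     return a * x;
//   }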

namespace vec_ops {

struct Max {
  ALWAYS_INLINE int32x4_t Calc(int32x4_t a, int32x4_t b) {
    return vmaxq_s32(a, b);
  }

  ALWAYS_INLINE uint32x4_t Calc(uint32x4_t a, uint32x4_t b) {
    return vmaxq_u32(a, b);
  }

  ALWAYS_INLINE uint8x8_t Fold(uint8x8_t a, uint8x8_t b) {
    return vpmax_u8(a, b);
  }

  ALWAYS_INLINE int32x2_t Fold(int32x2_t a, int32x2_t b) {
    return vpmax_s32(a, b);
  }

  ALWAYS_INLINE uint32x2_t Fold(uint32x2_t a, uint32x2_t b) {
    return vpmax_u32(a, b);
  }
};

struct Min {
  ALWAYS_INLINE int32x4_t Calc(int32x4_t a, int32x4_t b) {
    return vminq_s32(a, b);
  }

  ALWAYS_INLINE uint32x4_t Calc(uint32x4_t a, uint32x4_t b) {
    return vminq_u32(a, b);
  }

  ALWAYS_INLINE uint8x8_t Fold(uint8x8_t a, uint8x8_t b) {
    return vpmin_u8(a, b);
  }

  ALWAYS_INLINE int32x2_t Fold(int32x2_t a, int32x2_t b) {
    return vpmin_s32(a, b);
  }

  ALWAYS_INLINE uint32x2_t Fold(uint32x2_t a, uint32x2_t b) {
    return vpmin_u32(a, b);
  }
};

}  // namespace vec_ops

template <typename Operator>
ALWAYS_INLINE uint8x8_t FoldRGBA(const uint8x16x4_t& src) {
  Operator op;

  // Fold each adjacent pair.
  uint8x8_t r = op.Fold(vget_low_u8(src.val[0]), vget_high_u8(src.val[0]));
  uint8x8_t g = op.Fold(vget_low_u8(src.val[1]), vget_high_u8(src.val[1]));
  uint8x8_t b = op.Fold(vget_low_u8(src.val[2]), vget_high_u8(src.val[2]));
  uint8x8_t a = op.Fold(vget_low_u8(src.val[3]), vget_high_u8(src.val[3]));

  // Do both red and green channels at the same time.
  uint8x8_t rg = op.Fold(r, g);

  // Do both blue and alpha channels at the same time.
  uint8x8_t ba = op.Fold(b, a);

  // Do all the channels at the same time.
  uint8x8_t rgba = op.Fold(rg, ba);

  // Finally, one more pairwise fold gives the final per-channel reduction.
  return op.Fold(rgba, rgba);
}
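
// Lane walkthrough (using Max as the example): after the per-channel folds,
// r/g/b/a each hold 8 pairwise maxima; rg is then [r r r r g g g g], ba is
// [b b b b a a a a] and rgba is [r r g g b b a a], so the last fold yields
// [max_r max_g max_b max_a] duplicated in both halves of the register.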

template <typename Operator>
ALWAYS_INLINE int32x2_t Fold(const int32x4x4_t& src) {
  Operator op;

  int32x4_t fold0 = op.Calc(src.val[0], src.val[1]);
  int32x4_t fold1 = op.Calc(src.val[2], src.val[3]);
  int32x4_t fold01 = op.Calc(fold0, fold1);
  int32x2_t fold0123 = op.Fold(vget_low_s32(fold01), vget_high_s32(fold01));
  return op.Fold(fold0123, vdup_n_s32(0));
}

template <typename Operator>
ALWAYS_INLINE uint32x2_t Fold(const uint32x4x4_t& src) {
  Operator op;

  uint32x4_t fold0 = op.Calc(src.val[0], src.val[1]);
  uint32x4_t fold1 = op.Calc(src.val[2], src.val[3]);
  uint32x4_t fold01 = op.Calc(fold0, fold1);
  uint32x2_t fold0123 = op.Fold(vget_low_u32(fold01), vget_high_u32(fold01));
  return op.Fold(fold0123, vdup_n_u32(0));
}

template <typename Operator>
ALWAYS_INLINE int32x4_t FoldDup(const int32x4x4_t& src) {
  return vdupq_lane_s32(Fold<Operator>(src), 0);
}

ALWAYS_INLINE uint16x4_t SumRGB(const uint8x16x4_t& src) {
  // Add up all red values for 16 pixels.
  uint16x8_t r = vpaddlq_u8(src.val[0]);
  uint16x4_t r2 = vpadd_u16(vget_low_u16(r), vget_high_u16(r));

  // Add up all green values for 16 pixels.
  uint16x8_t g = vpaddlq_u8(src.val[1]);
  uint16x4_t g2 = vpadd_u16(vget_low_u16(g), vget_high_u16(g));

  uint16x4_t rg = vpadd_u16(r2, g2);

  // Add up all blue values for 16 pixels.
  uint16x8_t b = vpaddlq_u8(src.val[2]);
  uint16x4_t b2 = vpadd_u16(vget_low_u16(b), vget_high_u16(b));

  uint16x4_t ba = vpadd_u16(b2, vdup_n_u16(0));

  return vpadd_u16(rg, ba);
}
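
// The sums are built from widening pairwise adds: vpaddlq_u8 turns 16 bytes
// into 8 uint16 partial sums and each vpadd_u16 halves the lane count again,
// so the result is [sum_r sum_g sum_b 0]. Scalar equivalent for one channel
// (illustrative only):
//
//   uint16_t SumChannel(const uint8_t px[16]) {
//     uint16_t sum = 0;
//     for (int i = 0; i < 16; ++i)
//       sum += px[i];
//     return sum;
//   }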

ALWAYS_INLINE int32x4_t SumRGB(const int16x8x4_t& src) {
  // Add up all red values for 8 pixels.
  int32x4_t r = vpaddlq_s16(src.val[0]);
  int32x2_t r2 = vpadd_s32(vget_low_s32(r), vget_high_s32(r));

  // Add up all green values for 8 pixels.
  int32x4_t g = vpaddlq_s16(src.val[1]);
  int32x2_t g2 = vpadd_s32(vget_low_s32(g), vget_high_s32(g));

  int32x2_t rg = vpadd_s32(r2, g2);

  // Add up all blue values for 8 pixels.
  int32x4_t b = vpaddlq_s16(src.val[2]);
  int32x2_t b2 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));

  int32x2_t ba = vpadd_s32(b2, vdup_n_s32(0));

  return vcombine_s32(rg, ba);
}

ALWAYS_INLINE int32x4_t SumRGB(const int32x4x4_t& src) {
  // Pairwise-add the four red lanes into two partial sums.
  int32x2_t r = vmovn_s64(vpaddlq_s32(src.val[0]));

  // Pairwise-add the four green lanes into two partial sums.
  int32x2_t g = vmovn_s64(vpaddlq_s32(src.val[1]));

  // Complete the red and green sums.
  int32x2_t rg = vpadd_s32(r, g);

  // Pairwise-add the four blue lanes into two partial sums.
  int32x2_t b = vmovn_s64(vpaddlq_s32(src.val[2]));

  // Complete the blue sum.
  int32x2_t ba = vpadd_s32(b, vdup_n_s32(0));

  return vcombine_s32(rg, ba);
}

ALWAYS_INLINE int32x4_t DotProduct(int32x4_t r,
                                   int32x4_t g,
                                   int32x4_t b,
                                   int32x4_t dir_r,
                                   int32x4_t dir_g,
                                   int32x4_t dir_b) {
  // Multiply and accumulate each 32-bit lane.
  int32x4_t dots = vmulq_s32(r, dir_r);
  dots = vmlaq_s32(dots, g, dir_g);
  dots = vmlaq_s32(dots, b, dir_b);
  return dots;
}

ALWAYS_INLINE int32x4x4_t CalculateDots(const int32x4x4_t& r,
                                        const int32x4x4_t& g,
                                        const int32x4x4_t& b,
                                        const int32x4_t& v_vec) {
  // Duplicate the red, green and blue components of the direction vector.
  int32x4_t r_vec = vdupq_n_s32(vgetq_lane_s32(v_vec, 0));
  int32x4_t g_vec = vdupq_n_s32(vgetq_lane_s32(v_vec, 1));
  int32x4_t b_vec = vdupq_n_s32(vgetq_lane_s32(v_vec, 2));

  int32x4x4_t result;
  result.val[0] = DotProduct(r.val[0], g.val[0], b.val[0], r_vec, g_vec, b_vec);
  result.val[1] = DotProduct(r.val[1], g.val[1], b.val[1], r_vec, g_vec, b_vec);
  result.val[2] = DotProduct(r.val[2], g.val[2], b.val[2], r_vec, g_vec, b_vec);
  result.val[3] = DotProduct(r.val[3], g.val[3], b.val[3], r_vec, g_vec, b_vec);
  return result;
}

ALWAYS_INLINE uint16x8_t QuantizeTo565(uint8x8_t pixels) {
  // in:  [min_r min_g min_b 0 max_r max_g max_b 0]
  // out: [min_r5 min_g6 min_b5 0][max_r5 max_g6 max_b5 0]

  // Expand the components to unsigned 16-bit.
  uint16x8_t pixels16 = vmovl_u8(pixels);

  // {31, 63, 31, 0, 31, 63, 31, 0};
  const uint16x8_t kMultiply = vreinterpretq_u16_u64(vdupq_n_u64(0x1f003f001f));
  uint16x8_t pixel0 = vmulq_u16(pixels16, kMultiply);

  // {128, 128, 128, 0, 128, 128, 128, 0};
  const uint16x8_t kAdd = vreinterpretq_u16_u64(vdupq_n_u64(0x8000800080));
  uint16x8_t pixel1 = vaddq_u16(pixel0, kAdd);

  // Add a shifted copy to correct for dividing by 256 instead of 255.
  uint16x8_t pixel2 = vsraq_n_u16(pixel1, pixel1, 8);

  // Shift and return.
  return vshrq_n_u16(pixel2, 8);
}
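
// The multiply/add/shift sequence computes round(x * 31 / 255) (* 63 for
// green) without a division: adding t >> 8 to t before the final shift
// corrects for dividing by 256 instead of 255. Scalar equivalent for one
// 5-bit component (illustrative only):
//
//   uint16_t QuantizeTo5(uint8_t x) {
//     uint16_t t = x * 31 + 128;
//     return (t + (t >> 8)) >> 8;
//   }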

// Combine the components of the base colors into 16 bits.
ALWAYS_INLINE uint16x4_t PackBaseColors(uint16x8_t base_colors) {
  // in:  [max_r5 max_g6 max_b5 0][min_r5 min_g6 min_b5 0]
  // out: [max_rgb565 min_rgb565 0 0]

  // Swap the r and b channels to match Skia.
  base_colors = vrev64q_u16(base_colors);
  base_colors = vcombine_u16(
      vext_u16(vget_low_u16(base_colors), vget_low_u16(base_colors), 1),
      vext_u16(vget_high_u16(base_colors), vget_high_u16(base_colors), 1));

  // Shift to pack RGB565 in 16-bit.
  uint64x2_t r =
      vshlq_u64(vreinterpretq_u64_u16(base_colors), vdupq_n_s64(-32));
  uint64x2_t g =
      vshlq_u64(vreinterpretq_u64_u16(base_colors), vdupq_n_s64(-11));
  uint64x2_t b = vshlq_u64(vreinterpretq_u64_u16(base_colors), vdupq_n_s64(11));
  uint64x2_t base_colors_16 = vorrq_u64(r, vorrq_u64(g, b));

  // Shift to pack the 16-bit base colors in 32-bit and return.
  return vreinterpret_u16_u64(
      vorr_u64(vshl_n_u64(vget_high_u64(base_colors_16), 16),
               vand_u64(vget_low_u64(base_colors_16), vdup_n_u64(0xffff))));
}

// Combine the given color indices.
//
// Params:
//   S        Size of an index in bits.
//   indices  Indices to be combined. Each 8-bit element holds one index.
template <int S>
ALWAYS_INLINE uint64x1_t PackIndices(uint8x16_t indices) {
  uint64x2_t ind = vshlq_n_u64(vreinterpretq_u64_u8(indices), 8 - S);
  const uint64x2_t mask = vdupq_n_u64(0xff00000000000000);
  uint64x2_t ind2 = vandq_u64(vshlq_n_u64(ind, 56), mask);
  ind2 = vorrq_u64(vshrq_n_u64(ind2, S), vandq_u64(vshlq_n_u64(ind, 48), mask));
  ind2 = vorrq_u64(vshrq_n_u64(ind2, S), vandq_u64(vshlq_n_u64(ind, 40), mask));
  ind2 = vorrq_u64(vshrq_n_u64(ind2, S), vandq_u64(vshlq_n_u64(ind, 32), mask));
  ind2 = vorrq_u64(vshrq_n_u64(ind2, S), vandq_u64(vshlq_n_u64(ind, 24), mask));
  ind2 = vorrq_u64(vshrq_n_u64(ind2, S), vandq_u64(vshlq_n_u64(ind, 16), mask));
  ind2 = vorrq_u64(vshrq_n_u64(ind2, S), vandq_u64(vshlq_n_u64(ind, 8), mask));
  ind2 = vorrq_u64(vshrq_n_u64(ind2, S), vandq_u64(ind, mask));
  return vshr_n_u64(
      vorr_u64(vshr_n_u64(vget_low_u64(ind2), (8 * S)), vget_high_u64(ind2)),
      64 - 16 * S);
}
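
// Scalar equivalent of the packing above (illustrative only): concatenate
// the low S bits of each of the 16 byte-sized indices, with pixel 0 ending
// up in the least significant bits as the DXT/ATC layouts expect.
//
//   uint64_t PackIndicesScalar(const uint8_t ind[16], int s) {
//     uint64_t out = 0;
//     for (int i = 15; i >= 0; --i)
//       out = (out << s) | (ind[i] & ((1 << s) - 1));
//     return out;
//   }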

ALWAYS_INLINE int32x4_t
CovarianceChannels(const int16x8x2_t& ch1, const int16x8x2_t& ch2) {
  // Multiply and accumulate.
  int32x4_t cov;
  cov = vmull_s16(vget_low_s16(ch1.val[0]), vget_low_s16(ch2.val[0]));
  cov = vmlal_s16(cov, vget_high_s16(ch1.val[0]), vget_high_s16(ch2.val[0]));
  cov = vmlal_s16(cov, vget_low_s16(ch1.val[1]), vget_low_s16(ch2.val[1]));
  cov = vmlal_s16(cov, vget_high_s16(ch1.val[1]), vget_high_s16(ch2.val[1]));
  return cov;
}

ALWAYS_INLINE int32x4x2_t
Covariance(uint16x4_t average_rgb, const uint8x16x4_t& pixels_scattered) {
  int16x8_t average_r = vreinterpretq_s16_u16(vdupq_lane_u16(average_rgb, 0));
  int16x8_t average_g = vreinterpretq_s16_u16(vdupq_lane_u16(average_rgb, 1));
  int16x8_t average_b = vreinterpretq_s16_u16(vdupq_lane_u16(average_rgb, 2));

  // Subtract the average red from each red value.
  int16x8x2_t diff_r;
  diff_r.val[0] = vsubq_s16(
      vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels_scattered.val[0]))),
      average_r);
  diff_r.val[1] = vsubq_s16(
      vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels_scattered.val[0]))),
      average_r);

  // Subtract the average green from each green value.
  int16x8x2_t diff_g;
  diff_g.val[0] = vsubq_s16(
      vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels_scattered.val[1]))),
      average_g);
  diff_g.val[1] = vsubq_s16(
      vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels_scattered.val[1]))),
      average_g);

  // Subtract the average blue from each blue value.
  int16x8x2_t diff_b;
  diff_b.val[0] = vsubq_s16(
      vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels_scattered.val[2]))),
      average_b);
  diff_b.val[1] = vsubq_s16(
      vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels_scattered.val[2]))),
      average_b);

  int32x4x4_t cov1;
  cov1.val[0] = CovarianceChannels(diff_r, diff_r);
  cov1.val[1] = CovarianceChannels(diff_r, diff_g);
  cov1.val[2] = CovarianceChannels(diff_r, diff_b);
  cov1.val[3] = vdupq_n_s32(0);

  int32x4x4_t cov2;
  cov2.val[0] = CovarianceChannels(diff_g, diff_g);
  cov2.val[1] = CovarianceChannels(diff_g, diff_b);
  cov2.val[2] = CovarianceChannels(diff_b, diff_b);
  cov2.val[3] = vdupq_n_s32(0);

  int32x4x2_t covariance;
  covariance.val[0] = SumRGB(cov1);
  covariance.val[1] = SumRGB(cov2);
  return covariance;
}
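
// The 3x3 color covariance matrix is symmetric, so only its six unique
// entries are computed, packed as two vectors:
//   covariance.val[0] = [cov_rr cov_rg cov_rb 0]
//   covariance.val[1] = [cov_gg cov_gb cov_bb 0]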

ALWAYS_INLINE uint32x2_t MaskOutPixel(const uint8x16x4_t& pixels_linear,
                                      const int32x4x4_t& dots,
                                      int32x4_t max_dot_vec) {
  // Keep only the pixels whose dot product matches the given extreme value
  // exactly; all other lanes become zero.
  uint32x4x4_t pixels;
  pixels.val[0] = vandq_u32(vceqq_s32(dots.val[0], max_dot_vec),
                            vreinterpretq_u32_u8(pixels_linear.val[0]));

  pixels.val[1] = vandq_u32(vceqq_s32(dots.val[1], max_dot_vec),
                            vreinterpretq_u32_u8(pixels_linear.val[1]));

  pixels.val[2] = vandq_u32(vceqq_s32(dots.val[2], max_dot_vec),
                            vreinterpretq_u32_u8(pixels_linear.val[2]));

  pixels.val[3] = vandq_u32(vceqq_s32(dots.val[3], max_dot_vec),
                            vreinterpretq_u32_u8(pixels_linear.val[3]));

  // Fold it down.
  return Fold<vec_ops::Max>(pixels);
}

ALWAYS_INLINE uint16x8_t GetBaseColors(const uint8x16x4_t& pixels_linear,
                                       const uint8x16x4_t& pixels_scattered,
                                       int32x4_t dir) {
  // Expand all pixels to signed 32-bit integers.
  int32x4x4_t r = ExpandRGBATo32(pixels_scattered.val[0]);
  int32x4x4_t g = ExpandRGBATo32(pixels_scattered.val[1]);
  int32x4x4_t b = ExpandRGBATo32(pixels_scattered.val[2]);

  int32x4x4_t dots = CalculateDots(r, g, b, dir);

  // Pick out the pixel(s) that match the max dot.
  uint32x2_t max_pixel =
      MaskOutPixel(pixels_linear, dots, FoldDup<vec_ops::Max>(dots));

  // Pick out the pixel(s) that match the min dot.
  uint32x2_t min_pixel =
      MaskOutPixel(pixels_linear, dots, FoldDup<vec_ops::Min>(dots));

  return QuantizeTo565(
      vreinterpret_u8_u32(vzip_u32(max_pixel, min_pixel).val[0]));
}

// Figure out the two base colors to use from a block of 16 pixels
// by Principal Component Analysis, mapping along the principal axis.
ALWAYS_INLINE uint16x8_t
OptimizeColorsBlock(const uint8x16x4_t& pixels_linear,
                    const uint8x16x4_t& pixels_scattered,
                    uint16x4_t sum_rgb,
                    uint8x8_t min_rgba,
                    uint8x8_t max_rgba) {
  // min_rgba: [min_r min_g min_b min_a x x x x]
  // max_rgba: [max_r max_g max_b max_a x x x x]

  // Determine the color distribution. We already have the max and min, now
  // we need the average of the 16 pixels. Divide sum_rgb with rounding.
  uint16x4_t average_rgb = vrshr_n_u16(sum_rgb, 4);

  // Determine the covariance matrix.
  int32x4x2_t covariance = Covariance(average_rgb, pixels_scattered);

  // Convert the covariance matrix to float, then find the principal axis via
  // power iteration.
  float32x4x2_t covariance_float;
  const float32x4_t kInv255 = vdupq_n_f32(1.0f / 255.0f);
  covariance_float.val[0] =
      vmulq_f32(vcvtq_f32_s32(covariance.val[0]), kInv255);
  covariance_float.val[1] =
      vmulq_f32(vcvtq_f32_s32(covariance.val[1]), kInv255);

  int16x4_t max_16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(max_rgba)));
  int16x4_t min_16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(min_rgba)));
  float32x4_t vf4 = vcvtq_f32_s32(vsubl_s16(max_16, min_16));

  for (int i = 0; i < 4; ++i) {
    float32x4_t vfr4 = vdupq_n_f32(vgetq_lane_f32(vf4, 0));
    float32x4_t vfg4 = vdupq_n_f32(vgetq_lane_f32(vf4, 1));
    float32x4_t vfb4 = vdupq_n_f32(vgetq_lane_f32(vf4, 2));

    // from: [0 1 2 x] [3 4 5 x]
    // to:   [1 3 4 x]
    float32x4_t cov_134 =
        vextq_f32(covariance_float.val[1], covariance_float.val[1], 3);
    cov_134 =
        vsetq_lane_f32(vgetq_lane_f32(covariance_float.val[0], 1), cov_134, 0);

    // from: [0 1 2 x] [3 4 5 x]
    // to:   [2 4 5 x]
    float32x4_t cov_245 = vsetq_lane_f32(
        vgetq_lane_f32(covariance_float.val[0], 2), covariance_float.val[1], 0);

    vf4 = vmulq_f32(vfr4, covariance_float.val[0]);
    vf4 = vmlaq_f32(vf4, vfg4, cov_134);
    vf4 = vmlaq_f32(vf4, vfb4, cov_245);
  }

  float32x4_t magnitude = vabsq_f32(vf4);
  magnitude = vsetq_lane_f32(0.0f, magnitude, 3);  // Null out alpha.
  float32x4_t mag4 = vdupq_lane_f32(
      vpmax_f32(vpmax_f32(vget_low_f32(magnitude), vget_high_f32(magnitude)),
                vdup_n_f32(0.0f)),
      0);

  const int32x4_t kLuminance = {299, 587, 114, 0};

  // Note that this quite often means dividing by zero. The math still works
  // when comparing with Inf though.
  float32x4_t inv_magnitude = Divide<2>(vdupq_n_f32(512.0f), mag4);

  int32x4_t vf4_mag = vcvtq_s32_f32(vmulq_f32(vf4, inv_magnitude));
  int32x4_t v =
      vbslq_s32(vcltq_f32(mag4, vdupq_n_f32(4.0f)), kLuminance, vf4_mag);

  return GetBaseColors(pixels_linear, pixels_scattered, v);
}
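
// Power iteration in scalar form (illustrative only): starting from the
// max-min color difference, repeatedly multiplying by the covariance matrix
// makes the vector converge towards the principal eigenvector, the axis of
// greatest color variance. With cov[] = [rr rg rb gg gb bb]:
//
//   for (int i = 0; i < 4; ++i) {
//     float r = v[0] * cov[0] + v[1] * cov[1] + v[2] * cov[2];
//     float g = v[0] * cov[1] + v[1] * cov[3] + v[2] * cov[4];
//     float b = v[0] * cov[2] + v[1] * cov[4] + v[2] * cov[5];
//     v[0] = r;
//     v[1] = g;
//     v[2] = b;
//   }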

ALWAYS_INLINE uint16x8_t
GetApproximateBaseColors(const uint8x16x4_t& pixels_linear,
                         const uint8x16x4_t& pixels_scattered,
                         uint8x8_t min_rgba,
                         uint8x8_t max_rgba) {
  // min_rgba: [min_r min_g min_b min_a x x x x]
  // max_rgba: [max_r max_g max_b max_a x x x x]

  // Get direction vector and expand to 32-bit.
  int16x4_t max_16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(max_rgba)));
  int16x4_t min_16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(min_rgba)));
  int32x4_t v = vsubl_s16(max_16, min_16);

  return GetBaseColors(pixels_linear, pixels_scattered, v);
}

// Take two base colors and generate 4 RGBX colors where:
// 0 = baseColor0
// 1 = baseColor1
// 2 = (2 * baseColor0 + baseColor1) / 3
// 3 = (2 * baseColor1 + baseColor0) / 3
ALWAYS_INLINE uint16x4x4_t EvalColors(const uint16x8_t& base_colors) {
  // The base colors are expanded by reusing the top bits at the end. That
  // makes sure that white is still white after being quantized and converted
  // back.
  //
  // [(r<<3 | r>>2) (g<<2 | g>>4) (b<<3 | b>>2) 0]

  // The upper shift values for each component.
  // {3, 2, 3, 0, 3, 2, 3, 0};
  const int16x8_t kShiftUp = vreinterpretq_s16_u64(vdupq_n_u64(0x300020003));
  uint16x8_t pixels_up = vshlq_u16(base_colors, kShiftUp);
  // [r0<<3 g0<<2 b0<<3 0] [r1<<3 g1<<2 b1<<3 0]

  // The lower shift values for each component.
  // Note that we need to use negative values to shift right.
  // {-2, -4, -2, 0, -2, -4, -2, 0};
  const int16x8_t kShiftDown =
      vreinterpretq_s16_u64(vdupq_n_u64(0xfffefffcfffe));
  uint16x8_t pixels_down = vshlq_u16(base_colors, kShiftDown);
  // [r0>>2 g0>>4 b0>>2 0] [r1>>2 g1>>4 b1>>2 0]

  uint16x8_t pixels = vorrq_u16(pixels_up, pixels_down);
  // [(r0<<3 | r0>>2) (g0<<2 | g0>>4) (b0<<3 | b0>>2) 0]
  // [(r1<<3 | r1>>2) (g1<<2 | g1>>4) (b1<<3 | b1>>2) 0]

  // Linearly interpolate the two other colors:
  // (2 * max + min) / 3
  // (2 * min + max) / 3

  uint16x8_t pixels_mul2 = vaddq_u16(pixels, pixels);

  uint16x8_t swapped = vreinterpretq_u16_u64(vextq_u64(
      vreinterpretq_u64_u16(pixels), vreinterpretq_u64_u16(pixels), 1));
  int16x8_t output = vreinterpretq_s16_u16(vaddq_u16(pixels_mul2, swapped));

  // There's no division in NEON, but we can use "x * ((1 << 16) / 3 + 1)"
  // instead.
  output = vqdmulhq_s16(output, vdupq_n_s16(((1 << 16) / 3 + 1) >> 1));

  uint16x4x4_t colors;
  colors.val[0] = vget_low_u16(pixels);
  colors.val[1] = vget_high_u16(pixels);
  colors.val[2] = vreinterpret_u16_s16(vget_low_s16(output));
  colors.val[3] = vreinterpret_u16_s16(vget_high_s16(output));
  return colors;
}
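
// The fixed-point trick above: x / 3 == (x * 21846) >> 16 for the value
// range involved here, where 21846 == (1 << 16) / 3 + 1. vqdmulhq_s16
// computes (a * b * 2) >> 16, which is why the constant is halved first.
// Scalar equivalent (illustrative only):
//
//   uint16_t DivideBy3(uint16_t x) {  // x <= 3 * 255
//     return (uint32_t)(x * 21846) >> 16;
//   }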

template <typename T>
ALWAYS_INLINE uint8x8_t GetRemapIndices(int32x4_t dots,
                                        int32x4_t half_point,
                                        int32x4_t c0_point,
                                        int32x4_t c3_point) {
  // bits = (dot < half_point ? 4 : 0)
  //      | (dot < c0_point ? 2 : 0)
  //      | (dot < c3_point ? 1 : 0)
  int32x4_t cmp0 = vreinterpretq_s32_u32(
      vandq_u32(vcgtq_s32(half_point, dots), vdupq_n_u32(4)));
  int32x4_t cmp1 = vreinterpretq_s32_u32(
      vandq_u32(vcgtq_s32(c0_point, dots), vdupq_n_u32(2)));
  int32x4_t cmp2 = vreinterpretq_s32_u32(
      vandq_u32(vcgtq_s32(c3_point, dots), vdupq_n_u32(1)));
  int32x4_t bits = vorrq_s32(vorrq_s32(cmp0, cmp1), cmp2);

  // Narrow it down to unsigned 8 bits and return.
  return vqmovn_u16(vcombine_u16(vqmovun_s32(bits), vdup_n_u16(0)));
}

// dots:   Dot products for each pixel.
// points: Crossover points.
template <typename T>
ALWAYS_INLINE uint8x16_t GetColorIndices(int32x4x4_t dots, int32x4_t points) {
  // Crossover points for "best color in top half"/"best in bottom half" and
  // the same inside that subinterval.
  int32x4_t c0_point = vdupq_lane_s32(vget_low_s32(points), 1);
  int32x4_t half_point = vdupq_lane_s32(vget_low_s32(points), 0);
  int32x4_t c3_point = vdupq_lane_s32(vget_high_s32(points), 0);

  // Get kRemap table indices.
  uint8x8x4_t ind;
  ind.val[0] = GetRemapIndices<T>(dots.val[0], half_point, c0_point, c3_point);
  ind.val[1] = GetRemapIndices<T>(dots.val[1], half_point, c0_point, c3_point);
  ind.val[2] = GetRemapIndices<T>(dots.val[2], half_point, c0_point, c3_point);
  ind.val[3] = GetRemapIndices<T>(dots.val[3], half_point, c0_point, c3_point);

  // Combine indices.
  uint8x8_t indices_lo =
      vreinterpret_u8_u32(vzip_u32(vreinterpret_u32_u8(ind.val[0]),
                                   vreinterpret_u32_u8(ind.val[1])).val[0]);
  uint8x8_t indices_hi =
      vreinterpret_u8_u32(vzip_u32(vreinterpret_u32_u8(ind.val[2]),
                                   vreinterpret_u32_u8(ind.val[3])).val[0]);
  // Do table lookup and return 2-bit color indices.
  return vcombine_u8(vtbl1_u8(T::kRemap, indices_lo),
                     vtbl1_u8(T::kRemap, indices_hi));
}
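
// Since the three crossover points are ordered along the projection axis,
// only four of the eight possible comparison masks can occur (0, 1, 5 and 7);
// kRemap maps those to the encoded 2-bit index of the nearest palette color.
// Scalar form (illustrative only):
//
//   int bits = (dot < half_point ? 4 : 0) | (dot < c0_point ? 2 : 0) |
//              (dot < c3_point ? 1 : 0);
//   int index = kRemap[bits];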

template <typename T>
ALWAYS_INLINE uint8x16_t
MatchColorsBlock(const uint8x16x4_t& pixels_scattered, uint16x4x4_t colors) {
  // Get the direction vector and expand to 32-bit.
  int32x4_t dir = vsubl_s16(vreinterpret_s16_u16(colors.val[0]),
                            vreinterpret_s16_u16(colors.val[1]));
  // Duplicate the r, g and b elements of the direction into different
  // registers.
  int32x4_t dir_r = vdupq_lane_s32(vget_low_s32(dir), 0);
  int32x4_t dir_g = vdupq_lane_s32(vget_low_s32(dir), 1);
  int32x4_t dir_b = vdupq_lane_s32(vget_high_s32(dir), 0);

  // Transpose to separate the red, green, blue and alpha channels into 4
  // different registers. Alpha is ignored.
  uint16x4x2_t trn_lo = vtrn_u16(colors.val[0], colors.val[1]);
  uint16x4x2_t trn_hi = vtrn_u16(colors.val[2], colors.val[3]);
  uint32x4x2_t transposed_colors = vtrnq_u32(
      vreinterpretq_u32_u16(vcombine_u16(trn_lo.val[0], trn_lo.val[1])),
      vreinterpretq_u32_u16(vcombine_u16(trn_hi.val[0], trn_hi.val[1])));

  // Expand to 32-bit.
  int32x4_t colors_r =
      vmovl_s16(vget_low_s16(vreinterpretq_s16_u32(transposed_colors.val[0])));
  int32x4_t colors_g =
      vmovl_s16(vget_high_s16(vreinterpretq_s16_u32(transposed_colors.val[0])));
  int32x4_t colors_b =
      vmovl_s16(vget_low_s16(vreinterpretq_s16_u32(transposed_colors.val[1])));

  // Get the dot products of the palette colors.
  int32x4_t stops =
      DotProduct(colors_r, colors_g, colors_b, dir_r, dir_g, dir_b);

  // points1 = [s3 s1 s2 s3]: stops with its first element replaced by the
  // fourth.
  int32x4_t points1 = vsetq_lane_s32(vgetq_lane_s32(stops, 3), stops, 0);
  // points2 = [s2 s3 s0 s1]: stops rotated by two elements.
  int32x4_t points2 = vreinterpretq_s32_s64(
      vextq_s64(vreinterpretq_s64_s32(stops), vreinterpretq_s64_s32(stops), 1));
  // Average adjacent stops to get the crossover points.
  int32x4_t points = vshrq_n_s32(vaddq_s32(points1, points2), 1);

  // Expand all pixels to signed 32-bit integers.
  int32x4x4_t r = ExpandRGBATo32(pixels_scattered.val[0]);
  int32x4x4_t g = ExpandRGBATo32(pixels_scattered.val[1]);
  int32x4x4_t b = ExpandRGBATo32(pixels_scattered.val[2]);

  int32x4x4_t dots = CalculateDots(r, g, b, dir);

  // Get 2-bit color indices.
  return GetColorIndices<T>(dots, points);
}

template <typename T>
ALWAYS_INLINE uint32x4x2_t DoProdsTableLookup(uint8x8_t indices) {
  // Do a table lookup for each color index. Each table value is 3 bytes
  // wide, so the lookup is done one byte at a time in 3 steps.
  uint16x8_t lookup1 = vmovl_u8(vtbl1_u8(vcreate_u8(T::kProds[0]), indices));
  uint16x8_t lookup2 = vmovl_u8(vtbl1_u8(vcreate_u8(T::kProds[1]), indices));
  uint16x8_t lookup3 = vmovl_u8(vtbl1_u8(vcreate_u8(T::kProds[2]), indices));
  // Expand to 32-bit.
  uint32x4_t lookup1_lo = vmovl_u16(vget_low_u16(lookup1));
  uint32x4_t lookup1_hi = vmovl_u16(vget_high_u16(lookup1));
  uint32x4_t lookup2_lo = vmovl_u16(vget_low_u16(lookup2));
  uint32x4_t lookup2_hi = vmovl_u16(vget_high_u16(lookup2));
  uint32x4_t lookup3_lo = vmovl_u16(vget_low_u16(lookup3));
  uint32x4_t lookup3_hi = vmovl_u16(vget_high_u16(lookup3));
  // Combine the results by shifting and or-ing to obtain the actual table
  // value.
  uint32x4x2_t result;
  result.val[0] = vorrq_u32(lookup3_lo, vorrq_u32(vshlq_n_u32(lookup2_lo, 8),
                                                  vshlq_n_u32(lookup1_lo, 16)));
  result.val[1] = vorrq_u32(lookup3_hi, vorrq_u32(vshlq_n_u32(lookup2_hi, 8),
                                                  vshlq_n_u32(lookup1_hi, 16)));
  return result;
}
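
// Each reconstructed 24-bit table value packs the weight products needed by
// the least squares solve in RefineBlock as
//   (w1 * w1 << 16) | (w2 * w2 << 8) | (w1 * w2)
// where w1 is the index's interpolation weight (kW1Table) and w2 = 3 - w1.
// kProds[0..2] hold the high, middle and low bytes separately so that each
// byte can be fetched with a single vtbl1_u8. Accumulating the packed values
// over all 16 pixels cannot overflow a field, since the largest per-field
// total is 16 * 9 = 144 < 256.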

// Tries to optimize the base colors to suit the block contents better, by
// solving a least squares system via normal equations + Cramer's rule.
template <typename T>
ALWAYS_INLINE int RefineBlock(const uint8x16x4_t& pixels_scattered,
                              uint16x4_t sum_rgb,
                              uint16x8_t& base_colors,
                              uint8x16_t indices) {
  uint16x8_t old_base_colors = base_colors;

  if (ElementsEqual(indices)) {  // Do all pixels have the same index?
    // Yes, the linear system would be singular; solve using the optimal
    // single-color match on the average color.

    // Get the average of the 16 pixels with rounding.
    uint16x4_t average_rgb = vrshr_n_u16(sum_rgb, 4);

    ALIGNAS(8) uint16_t rgb[4];
    vst1_u16(rgb, average_rgb);
    // Look up optimal values instead of trying to calculate them.
    uint16_t colors[8] = {g_o_match55[rgb[0]][0],
                          MatchSingleGreenMax<typename T::BASE_TYPE>(rgb[1]),
                          g_o_match55[rgb[2]][0],
                          0,
                          g_o_match55[rgb[0]][1],
                          MatchSingleGreenMin<typename T::BASE_TYPE>(rgb[1]),
                          g_o_match55[rgb[2]][1],
                          0};
    base_colors = vld1q_u16(colors);
  } else {
    // Expand to 16-bit.
    int16x8x2_t r = ExpandRGBATo16(pixels_scattered.val[0]);
    int16x8x2_t g = ExpandRGBATo16(pixels_scattered.val[1]);
    int16x8x2_t b = ExpandRGBATo16(pixels_scattered.val[2]);

    // Do a table lookup for each color index.
    int8x16_t w1 = DoW1TableLookup<T>(indices);
    // Expand to 16-bit.
    int16x8_t w1_lo = vmovl_s8(vget_low_s8(w1));
    int16x8_t w1_hi = vmovl_s8(vget_high_s8(w1));
    // Multiply and accumulate.
    int16x8x4_t at1_rgb;
    at1_rgb.val[0] = vmulq_s16(w1_lo, r.val[0]);
    at1_rgb.val[0] = vmlaq_s16(at1_rgb.val[0], w1_hi, r.val[1]);
    at1_rgb.val[1] = vmulq_s16(w1_lo, g.val[0]);
    at1_rgb.val[1] = vmlaq_s16(at1_rgb.val[1], w1_hi, g.val[1]);
    at1_rgb.val[2] = vmulq_s16(w1_lo, b.val[0]);
    at1_rgb.val[2] = vmlaq_s16(at1_rgb.val[2], w1_hi, b.val[1]);
    // [r][g][b][]
    int32x4_t at1 = SumRGB(at1_rgb);

    // [r][g][b][]
    int32x4_t at2 = vreinterpretq_s32_u32(vmovl_u16(sum_rgb));
    // at2 = 3 * at2 - at1;
    at2 = vsubq_s32(vmulq_s32(at2, vdupq_n_s32(3)), at1);

    // Do a table lookup for each color index.
    uint32x4x2_t akku1 = DoProdsTableLookup<T>(vget_low_u8(indices));
    uint32x4x2_t akku2 = DoProdsTableLookup<T>(vget_high_u8(indices));
    uint32x4_t sum_akku = vaddq_u32(
        vaddq_u32(vaddq_u32(akku1.val[0], akku1.val[1]), akku2.val[0]),
        akku2.val[1]);
    // Pairwise add and accumulate.
    uint64x1_t akku = vpaddl_u32(vget_low_u32(sum_akku));
    akku = vpadal_u32(akku, vget_high_u32(sum_akku));

    // Extract solutions and decide solvability.

    // [akku >> 16]x4
    int32x4_t xx =
        vdupq_lane_s32(vreinterpret_s32_u64(vshr_n_u64(akku, 16)), 0);
    // [(akku >> 8) & 0xff]x4
    const uint64x1_t kFF = vdup_n_u64(0xff);
    int32x4_t yy = vdupq_lane_s32(
        vreinterpret_s32_u64(vand_u64(vshr_n_u64(akku, 8), kFF)), 0);
    // [akku & 0xff]x4
    int32x4_t xy = vdupq_lane_s32(vreinterpret_s32_u64(vand_u64(akku, kFF)), 0);

    // ((3.0f * 31.0f) / 255.0f) / (xx * yy - xy * xy)
    float32x4_t frb = Divide<2>(
        vdupq_n_f32((3.0f * 31.0f) / 255.0f),
        vcvtq_f32_s32(vsubq_s32(vmulq_s32(xx, yy), vmulq_s32(xy, xy))));
    // frb * 63.0f / 31.0f
    float32x4_t fg = vmulq_f32(vmulq_f32(frb, vdupq_n_f32(63.0f)),
                               vdupq_n_f32(1.0f / 31.0f));

    // Solve.

    // [frb][fg][frb][]
    float32x4_t frb_fg_frb = vsetq_lane_f32(vgetq_lane_f32(fg, 0), frb, 1);
    // [31][63][31][]
    const int32x4_t kClamp565_vec = {31, 63, 31, 0};

    // (at1_r * yy - at2_r * xy) * frb + 0.5f
    int32x4_t base0_rgb32 = vcvtq_s32_f32(vaddq_f32(
        vmulq_f32(
            vcvtq_f32_s32(vsubq_s32(vmulq_s32(at1, yy), vmulq_s32(at2, xy))),
            frb_fg_frb),
        vdupq_n_f32(0.5f)));
    // Clamp and saturate.
    uint16x4_t base0_rgb16 = vqmovun_s32(vbslq_s32(
        vcgeq_s32(base0_rgb32, kClamp565_vec), kClamp565_vec, base0_rgb32));

    // (at2_r * xx - at1_r * xy) * frb + 0.5f
    int32x4_t base1_rgb32 = vcvtq_s32_f32(vaddq_f32(
        vmulq_f32(
            vcvtq_f32_s32(vsubq_s32(vmulq_s32(at2, xx), vmulq_s32(at1, xy))),
            frb_fg_frb),
        vdupq_n_f32(0.5f)));
    // Clamp and saturate.
    uint16x4_t base1_rgb16 = vqmovun_s32(vbslq_s32(
        vcgeq_s32(base1_rgb32, kClamp565_vec), kClamp565_vec, base1_rgb32));

    base_colors = vcombine_u16(base0_rgb16, base1_rgb16);
  }

  return !Equal(old_base_colors, base_colors);
}
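
// Derivation sketch for the solve above (illustrative only): with pixel
// colors p_i and weights w1_i (w2_i = 3 - w1_i), minimizing
//   E = sum_i ((w1_i * c0 + w2_i * c1) / 3 - p_i)^2
// over the base colors c0 and c1 yields the 2x2 normal equations
//   [xx xy] [c0]   [at1]
//   [xy yy] [c1] = [at2]  (up to constant scale factors),
// with xx = sum w1_i^2, yy = sum w2_i^2, xy = sum w1_i * w2_i,
// at1 = sum w1_i * p_i and at2 = sum w2_i * p_i. Cramer's rule then gives
// c0 = (at1 * yy - at2 * xy) / (xx * yy - xy * xy) and symmetrically for c1;
// the factor of 3 and the 255 -> 31/63 range conversions are folded into
// frb and fg.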

template <typename T, Quality QUALITY>
ALWAYS_INLINE void CompressColorBlock(uint8_t* dst,
                                      const uint8x16x4_t& pixels_linear,
                                      const uint8x16x4_t& pixels_scattered,
                                      uint8x8_t min_rgba,
                                      uint8x8_t max_rgba) {
  // Take a shortcut if the block is constant (disregarding alpha).
  uint32_t min32 = vget_lane_u32(vreinterpret_u32_u8(min_rgba), 0);
  uint32_t max32 = vget_lane_u32(vreinterpret_u32_u8(max_rgba), 0);
  if ((min32 & 0x00ffffff) == (max32 & 0x00ffffff)) {
    // Swap the r and b channels to match Skia.
    int b = min32 & 0xff;
    int g = (min32 >> 8) & 0xff;
    int r = (min32 >> 16) & 0xff;

    uint16_t max16 = MatchSingleColorMax<typename T::BASE_TYPE>(r, g, b);
    uint16_t min16 = MatchSingleColorMin<typename T::BASE_TYPE>(r, g, b);
    uint32_t indices = T::kConstantColorIndices;
    FormatFixup_Generic<typename T::BASE_TYPE>(&max16, &min16, &indices);

    uint32_t* dst32 = reinterpret_cast<uint32_t*>(dst);
    dst32[0] = max16 | (min16 << 16);
    dst32[1] = indices;
  } else {
    uint16x4_t sum_rgb;
    uint16x8_t base_colors;

    if (QUALITY == kQualityLow) {
      base_colors = GetApproximateBaseColors(pixels_linear, pixels_scattered,
                                             min_rgba, max_rgba);
    } else {
      sum_rgb = SumRGB(pixels_scattered);
      // Do Principal Component Analysis and map along the principal axis.
      base_colors = OptimizeColorsBlock(pixels_linear, pixels_scattered,
                                        sum_rgb, min_rgba, max_rgba);
    }

    // Check if the two base colors are the same.
    uint8x16_t indices;
    if (!Equal(vget_low_u16(base_colors), vget_high_u16(base_colors))) {
      // Calculate the two intermediate colors as well.
      uint16x4x4_t colors = EvalColors(base_colors);

      // Do a first pass to find good index candidates for all 16 of the
      // pixels in the block.
      indices = MatchColorsBlock<T>(pixels_scattered, colors);
    } else {
      // Any indices can be used here.
      indices = vdupq_n_u8(0);
    }

    if (QUALITY == kQualityHigh) {
      // Refine the base colors and indices multiple times if requested.
      for (int i = 0; i < kNumRefinements; ++i) {
        uint8x16_t last_indices = indices;

        if (RefineBlock<T>(pixels_scattered, sum_rgb, base_colors, indices)) {
          if (!Equal(vget_low_u16(base_colors), vget_high_u16(base_colors))) {
            uint16x4x4_t colors = EvalColors(base_colors);
            indices = MatchColorsBlock<T>(pixels_scattered, colors);
          } else {
            // We ended up with two identical base colors; this can't be
            // refined any further.
            indices = vdupq_n_u8(0);
            break;
          }
        }

        if (Equal(indices, last_indices)) {
          // There's no need to do another refinement pass if we didn't get
          // any improvements this pass.
          break;
        }
      }
    }

    // Prepare the final block by converting the base colors to 16-bit and
    // packing the pixel indices.
    uint16x4_t base_colors_16 = PackBaseColors(base_colors);
    uint64x1_t indices_2x16 = PackIndices<2>(indices);
    FormatFixup<T>(&base_colors_16, &indices_2x16);
    uint64x1_t output = vorr_u64(vshl_n_u64(indices_2x16, 32),
                                 vreinterpret_u64_u16(base_colors_16));
    vst1_u64(reinterpret_cast<uint64_t*>(dst), output);
  }
}

// alpha: 8 x 8-bit alpha values.
// dist:  Distance between max and min alpha in the color block.
// bias:  Rounding bias.
ALWAYS_INLINE uint8x8_t
GetAlphaIndices(uint8x8_t alpha, int16x8_t dist, int16x8_t bias) {
  // Expand to signed 16-bit.
  int16x8_t alpha_16 = vreinterpretq_s16_u16(vmovl_u8(alpha));

  // Multiply each alpha value by 7 and add the bias.
  int16x8_t a = vaddq_s16(vmulq_s16(alpha_16, vdupq_n_s16(7)), bias);

  int16x8_t dist4 = vmulq_s16(dist, vdupq_n_s16(4));
  int16x8_t dist2 = vmulq_s16(dist, vdupq_n_s16(2));

  // Select an index. This is a "linear scale" lerp factor between 0
  // (val=min) and 7 (val=max).
  // t = (a >= dist4) ? -1 : 0
  int16x8_t t =
      vandq_s16(vreinterpretq_s16_u16(vcgeq_s16(a, dist4)), vdupq_n_s16(-1));
  // ind1 = t & 4;
  int16x8_t ind1 = vandq_s16(t, vdupq_n_s16(4));
  // a1 = a - (dist4 & t);
  int16x8_t a1 = vsubq_s16(a, vandq_s16(dist4, t));

  // t = (a1 >= dist2) ? -1 : 0;
  t = vandq_s16(vreinterpretq_s16_u16(vcgeq_s16(a1, dist2)), vdupq_n_s16(-1));
  // ind2 = t & 2;
  int16x8_t ind2 = vandq_s16(t, vdupq_n_s16(2));
  // a2 = a1 - (dist2 & t);
  int16x8_t a2 = vsubq_s16(a1, vandq_s16(dist2, t));

  // ind3 = (a2 >= dist)
  int16x8_t ind3 =
      vandq_s16(vreinterpretq_s16_u16(vcgeq_s16(a2, dist)), vdupq_n_s16(1));

  // indices = ind1 + ind2 + ind3
  int16x8_t indices = vaddq_s16(ind1, vaddq_s16(ind2, ind3));

  // Turn the linear scale into an alpha index (0/1 are the extremal points).
  // ind = -indices & 7
  int16x8_t ind = vandq_s16(vnegq_s16(indices), vdupq_n_s16(7));
  // indices = ind ^ (2 > ind)
  indices = veorq_s16(
      ind, vandq_s16(vreinterpretq_s16_u16(vcgtq_s16(vdupq_n_s16(2), ind)),
                     vdupq_n_s16(1)));
  // Narrow it down to unsigned 8 bits and return.
  return vqmovun_s16(indices);
}
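
// Scalar sketch of the branchless quantization above (illustrative only):
// the three subtract-and-compare rounds compute the 3-bit quotient
// q = (7 * alpha + bias) / dist one bit at a time, i.e. a linear 0..7 scale
// between min and max, which is then converted to the DXT5 index order
// where 0 encodes the max alpha and 1 the min alpha.
//
//   int q = (7 * alpha + bias) / dist;  // 0..7, 7 == max
//   int ind = (-q) & 7;                 // 7 -> 1, 6 -> 2, ..., 0 -> 0
//   ind ^= (2 > ind) ? 1 : 0;           // swap 0/1 into the extremal slots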

ALWAYS_INLINE void CompressAlphaBlock(uint8_t* dst,
                                      uint8x16_t pixels_alpha,
                                      uint8x8_t min_rgba,
                                      uint8x8_t max_rgba) {
  // Take a shortcut if the block is constant.
  uint8_t min_alpha = vget_lane_u8(min_rgba, 3);
  uint8_t max_alpha = vget_lane_u8(max_rgba, 3);
  if (min_alpha == max_alpha) {
    dst[0] = max_alpha;
    dst[1] = min_alpha;
    // All indices are the same, any value will do.
    *reinterpret_cast<uint16_t*>(dst + 2) = 0;
    *reinterpret_cast<uint32_t*>(dst + 4) = 0;
  } else {
    // [max - min]x8
    int16x8_t dist = vdupq_lane_s16(
        vreinterpret_s16_u16(vget_low_u16(vsubl_u8(max_rgba, min_rgba))), 3);
    // bias = (dist < 8) ? (dist - 1) : (dist / 2 + 2)
    int16x8_t bias = vbslq_s16(vcltq_s16(dist, vdupq_n_s16(8)),
                               vsubq_s16(dist, vdupq_n_s16(1)),
                               vaddq_s16(vshrq_n_s16(dist, 1), vdupq_n_s16(2)));
    // bias -= min * 7;
    bias = vsubq_s16(
        bias,
        vmulq_s16(
            vdupq_lane_s16(
                vreinterpret_s16_u16(vget_low_u16(vmovl_u8(min_rgba))), 3),
            vdupq_n_s16(7)));

    uint8x8_t indices_lo =
        GetAlphaIndices(vget_low_u8(pixels_alpha), dist, bias);
    uint8x8_t indices_hi =
        GetAlphaIndices(vget_high_u8(pixels_alpha), dist, bias);

    // Prepare the final block by combining the base alpha values and packing
    // the alpha indices.
    uint8x8_t max_min_alpha = vzip_u8(max_rgba, min_rgba).val[0];
    uint64x1_t indices = PackIndices<3>(vcombine_u8(indices_lo, indices_hi));
    uint64x1_t output =
        vorr_u64(vshl_n_u64(indices, 16),
                 vshr_n_u64(vreinterpret_u64_u8(max_min_alpha), 48));
    vst1_u64(reinterpret_cast<uint64_t*>(dst), output);
  }
}

template <typename T, bool OPAQUE, Quality QUALITY>
void CompressImage(const uint8_t* src, uint8_t* dst, int width, int height) {
  for (int y = 0; y < height; y += 4, src += width * 4 * 4) {
    for (int x = 0; x < width; x += 4) {
      // Load the four rows of pixels.
      uint8x16x4_t pixels_linear;
      pixels_linear.val[0] = vld1q_u8(src + (x + 0 * width) * 4);
      pixels_linear.val[1] = vld1q_u8(src + (x + 1 * width) * 4);
      pixels_linear.val[2] = vld1q_u8(src + (x + 2 * width) * 4);
      pixels_linear.val[3] = vld1q_u8(src + (x + 3 * width) * 4);

      // Transpose/scatter the red, green, blue and alpha channels into
      // separate registers.
      ALIGNAS(8) uint8_t block[64];
      vst1q_u8(block + 0 * 16, pixels_linear.val[0]);
      vst1q_u8(block + 1 * 16, pixels_linear.val[1]);
      vst1q_u8(block + 2 * 16, pixels_linear.val[2]);
      vst1q_u8(block + 3 * 16, pixels_linear.val[3]);
      uint8x16x4_t pixels_scattered = vld4q_u8(block);

      // We need the min and max values both to detect solid blocks and to
      // compute the base colors.
      uint8x8_t min_rgba = FoldRGBA<vec_ops::Min>(pixels_scattered);
      uint8x8_t max_rgba = FoldRGBA<vec_ops::Max>(pixels_scattered);

      if (!OPAQUE) {
        CompressAlphaBlock(dst, pixels_scattered.val[3], min_rgba, max_rgba);
        dst += 8;
      }

      CompressColorBlock<T, QUALITY>(dst, pixels_linear, pixels_scattered,
                                     min_rgba, max_rgba);
      dst += 8;
    }
  }
}

}  // namespace

void CompressATC_NEON(const uint8_t* src, uint8_t* dst, int width, int height) {
  CompressImage<TYPE_ATC_NEON, true, kQualityHigh>(src, dst, width, height);
}

void CompressATCIA_NEON(const uint8_t* src,
                        uint8_t* dst,
                        int width,
                        int height) {
  CompressImage<TYPE_ATC_NEON, false, kQualityHigh>(src, dst, width, height);
}

void CompressDXT1_NEON(const uint8_t* src,
                       uint8_t* dst,
                       int width,
                       int height) {
  CompressImage<TYPE_DXT_NEON, true, kQualityHigh>(src, dst, width, height);
}

void CompressDXT5_NEON(const uint8_t* src,
                       uint8_t* dst,
                       int width,
                       int height) {
  CompressImage<TYPE_DXT_NEON, false, kQualityHigh>(src, dst, width, height);
}
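
// Usage sketch (illustrative only): every entry point expects 4-byte-per-pixel
// input with width and height that are multiples of 4. ATC/DXT1 output is
// 8 bytes per 4x4 block (half a byte per pixel); ATCIA/DXT5 output is 16 bytes
// per block (one byte per pixel).
//
//   std::vector<uint8_t> compressed(width * height / 2);  // DXT1
//   CompressDXT1_NEON(pixels, compressed.data(), width, height);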

}  // namespace texture_compress
}  // namespace cc

#endif  // __ARM_NEON__