// Copyright 2014 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// See the links below for detailed descriptions of the algorithms used.
// http://cbloomrants.blogspot.se/2008/12/12-08-08-dxtc-summary.html
// http://fgiesen.wordpress.com/2009/12/15/dxt5-alpha-block-index-determination

#if defined(__ARM_NEON__)

#include "cc/resources/texture_compress/arm/atc_dxt_neon.h"

#include <arm_neon.h>

#include "base/compiler_specific.h"
#include "base/logging.h"
#include "cc/resources/texture_compress/atc_dxt.h"

namespace cc {
namespace texture_compress {

struct TYPE_ATC_NEON : public TYPE_ATC {
  typedef TYPE_ATC BASE_TYPE;
  static const uint8x8_t kRemap;
  static const uint64_t kProds[3];
};

struct TYPE_DXT_NEON : public TYPE_DXT {
  typedef TYPE_DXT BASE_TYPE;
  static const uint8x8_t kRemap;
  static const int8x8_t kW1Table;
  static const uint64_t kProds[3];
};

const uint8x8_t TYPE_ATC_NEON::kRemap = {0, 1, 0, 1, 2, 2, 3, 3};
const uint64_t TYPE_ATC_NEON::kProds[3] = {0x00010409, 0x09040100, 0x00020200};

const uint8x8_t TYPE_DXT_NEON::kRemap = {0, 2, 0, 2, 3, 3, 1, 1};
const int8x8_t TYPE_DXT_NEON::kW1Table = {3, 0, 2, 1, 0, 0, 0, 0};
const uint64_t TYPE_DXT_NEON::kProds[3] = {0x01040009, 0x04010900, 0x02020000};
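
// kRemap maps the 3-bit comparison mask computed in GetRemapIndices below to
// the 2-bit encoded color index of the nearest palette color. kW1Table maps
// an encoded DXT index to its interpolation weight w1 (out of 3), and kProds
// holds the packed weight products used by the refinement solve; see the
// comments at DoProdsTableLookup and RefineBlock.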

// Number of passes over the block that are done to refine the base colors.
// Only applies to the high quality compression mode.
const int kNumRefinements = 2;

namespace {

template <typename T>
ALWAYS_INLINE int8x16_t DoW1TableLookup(uint8x16_t indices);

template <>
ALWAYS_INLINE int8x16_t DoW1TableLookup<TYPE_ATC_NEON>(uint8x16_t indices) {
  // Take a shortcut for ATC which gives the same result as the table lookup.
  // {0, 1, 2, 3} -> {3, 2, 1, 0}
  return veorq_s8(vreinterpretq_s8_u8(indices), vdupq_n_s8(3));
}

template <>
ALWAYS_INLINE int8x16_t DoW1TableLookup<TYPE_DXT_NEON>(uint8x16_t indices) {
  // Do table lookup for each color index.
  return vcombine_s8(vtbl1_s8(TYPE_DXT_NEON::kW1Table,
                              vreinterpret_s8_u8(vget_low_u8(indices))),
                     vtbl1_s8(TYPE_DXT_NEON::kW1Table,
                              vreinterpret_s8_u8(vget_high_u8(indices))));
}

// Returns max and min base green colors matching the given single green color
// when solved via linear interpolation. Output format differs for ATC and DXT.
// See explicitly instantiated template functions below.
template <typename T>
ALWAYS_INLINE uint16_t MatchSingleGreenMax(int g);
template <typename T>
ALWAYS_INLINE uint16_t MatchSingleGreenMin(int g);

template <>
ALWAYS_INLINE uint16_t MatchSingleGreenMax<TYPE_ATC>(int g) {
  return g_o_match56[g][0] << 1;
}

template <>
ALWAYS_INLINE uint16_t MatchSingleGreenMin<TYPE_ATC>(int g) {
  return g_o_match56[g][1];
}

template <>
ALWAYS_INLINE uint16_t MatchSingleGreenMax<TYPE_DXT>(int g) {
  return g_o_match66[g][0];
}

template <>
ALWAYS_INLINE uint16_t MatchSingleGreenMin<TYPE_DXT>(int g) {
  return g_o_match66[g][1];
}

// This converts the output data to either ATC or DXT format.
// See explicitly instantiated template functions below.
template <typename T>
ALWAYS_INLINE void FormatFixup(uint16x4_t* base_colors, uint64x1_t* indices);

template <>
ALWAYS_INLINE void FormatFixup<TYPE_ATC_NEON>(uint16x4_t* base_colors,
                                              uint64x1_t* indices) {
  // First color in ATC format is 555.
  *base_colors = vorr_u16(
      vand_u16(*base_colors, vreinterpret_u16_u64(vdup_n_u64(0xffff001f))),
      vshr_n_u16(
          vand_u16(*base_colors, vreinterpret_u16_u64(vdup_n_u64(0x0000ffC0))),
          1));
}

template <>
ALWAYS_INLINE void FormatFixup<TYPE_DXT_NEON>(uint16x4_t* base_colors,
                                              uint64x1_t* indices) {
  // Swap min/max colors if necessary.
  uint16x4_t max = vdup_lane_u16(*base_colors, 0);
  uint16x4_t min = vdup_lane_u16(*base_colors, 1);
  uint16x4_t cmp = vclt_u16(max, min);
  *base_colors =
      vorr_u16(vand_u16(vbsl_u16(cmp, min, max),
                        vreinterpret_u16_u64(vdup_n_u64(0x0000ffff))),
               vand_u16(vbsl_u16(cmp, max, min),
                        vreinterpret_u16_u64(vdup_n_u64(0xffff0000))));
  *indices = vbsl_u64(vreinterpret_u64_u16(cmp),
                      veor_u64(*indices, vdup_n_u64(0x55555555)), *indices);
}

// Check if all of the 8-bit elements in the given quad register are equal.
ALWAYS_INLINE bool ElementsEqual(uint8x16_t elements) {
  uint8x16_t first = vdupq_lane_u8(vget_low_u8(elements), 0);
  uint8x16_t eq = vceqq_u8(elements, first);
  uint8x8_t tst = vand_u8(vget_low_u8(eq), vget_high_u8(eq));
  return vget_lane_u64(vreinterpret_u64_u8(tst), 0) == 0xffffffffffffffff;
}

ALWAYS_INLINE bool Equal(uint8x16_t e1, uint8x16_t e2) {
  uint8x16_t eq = vceqq_u8(e1, e2);
  uint8x8_t tst = vand_u8(vget_low_u8(eq), vget_high_u8(eq));
  return vget_lane_u64(vreinterpret_u64_u8(tst), 0) == 0xffffffffffffffff;
}

ALWAYS_INLINE bool Equal(uint16x8_t e1, uint16x8_t e2) {
  uint16x8_t eq = vceqq_u16(e1, e2);
  uint16x4_t tst = vand_u16(vget_low_u16(eq), vget_high_u16(eq));
  return vget_lane_u64(vreinterpret_u64_u16(tst), 0) == 0xffffffffffffffff;
}

ALWAYS_INLINE bool Equal(uint16x4_t e1, uint16x4_t e2) {
  uint16x4_t eq = vceq_u16(e1, e2);
  return vget_lane_u64(vreinterpret_u64_u16(eq), 0) == 0xffffffffffffffff;
}

ALWAYS_INLINE int16x8x2_t ExpandRGBATo16(const uint8x16_t& channel) {
  int16x8x2_t result;
  result.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(channel)));
  result.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(channel)));
  return result;
}

ALWAYS_INLINE int32x4x4_t ExpandRGBATo32(const uint8x16_t& channel) {
  uint16x8_t lo = vmovl_u8(vget_low_u8(channel));
  uint16x8_t hi = vmovl_u8(vget_high_u8(channel));
  int32x4x4_t result;
  result.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(lo)));
  result.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(lo)));
  result.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(hi)));
  result.val[3] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(hi)));
  return result;
}

// NEON has no division instruction. The recommended approach is instead to
// refine a reciprocal estimate with Newton-Raphson steps to get a close
// approximation.
template <int REFINEMENT_STEPS>
ALWAYS_INLINE float32x4_t Divide(float32x4_t a, float32x4_t b) {
#ifdef VERIFY_RESULTS
  ALIGNAS(8) float a_[4];
  ALIGNAS(8) float b_[4];
  vst1q_f32(a_, a);
  vst1q_f32(b_, b);
  for (int i = 0; i < 4; ++i)
    a_[i] /= b_[i];
  return vld1q_f32(a_);
#else
  // Get an initial estimate of 1/b.
  float32x4_t reciprocal = vrecpeq_f32(b);
  // Use a number of Newton-Raphson steps to refine the estimate.
  for (int i = 0; i < REFINEMENT_STEPS; ++i)
    reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal);
  // Calculate the final estimate.
  return vmulq_f32(a, reciprocal);
#endif
}
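
// For reference, vrecpsq_f32(b, x) computes (2 - b * x), so each refinement
// step above is the Newton-Raphson reciprocal iteration x' = x * (2 - b * x).
// A scalar sketch of the same scheme (illustrative only; the estimate helper
// is hypothetical and stands in for vrecpeq_f32):
//
//   float ApproxDivide(float a, float b) {
//     float x = RoughReciprocalEstimate(b);  // ~8 bits of precision
//     x = x * (2.0f - b * x);                // first refinement step
//     x = x * (2.0f - b * x);                // second refinement step
//     return a * x;
//   }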

namespace vec_ops {

struct Max {
  ALWAYS_INLINE int32x4_t Calc(int32x4_t a, int32x4_t b) {
    return vmaxq_s32(a, b);
  }

  ALWAYS_INLINE uint32x4_t Calc(uint32x4_t a, uint32x4_t b) {
    return vmaxq_u32(a, b);
  }

  ALWAYS_INLINE uint8x8_t Fold(uint8x8_t a, uint8x8_t b) {
    return vpmax_u8(a, b);
  }

  ALWAYS_INLINE int32x2_t Fold(int32x2_t a, int32x2_t b) {
    return vpmax_s32(a, b);
  }

  ALWAYS_INLINE uint32x2_t Fold(uint32x2_t a, uint32x2_t b) {
    return vpmax_u32(a, b);
  }
};

struct Min {
  ALWAYS_INLINE int32x4_t Calc(int32x4_t a, int32x4_t b) {
    return vminq_s32(a, b);
  }

  ALWAYS_INLINE uint32x4_t Calc(uint32x4_t a, uint32x4_t b) {
    return vminq_u32(a, b);
  }

  ALWAYS_INLINE uint8x8_t Fold(uint8x8_t a, uint8x8_t b) {
    return vpmin_u8(a, b);
  }

  ALWAYS_INLINE int32x2_t Fold(int32x2_t a, int32x2_t b) {
    return vpmin_s32(a, b);
  }

  ALWAYS_INLINE uint32x2_t Fold(uint32x2_t a, uint32x2_t b) {
    return vpmin_u32(a, b);
  }
};

}  // namespace vec_ops

template <typename Operator>
ALWAYS_INLINE uint8x8_t FoldRGBA(const uint8x16x4_t& src) {
  Operator op;

  // Fold each adjacent pair.
  uint8x8_t r = op.Fold(vget_low_u8(src.val[0]), vget_high_u8(src.val[0]));
  uint8x8_t g = op.Fold(vget_low_u8(src.val[1]), vget_high_u8(src.val[1]));
  uint8x8_t b = op.Fold(vget_low_u8(src.val[2]), vget_high_u8(src.val[2]));
  uint8x8_t a = op.Fold(vget_low_u8(src.val[3]), vget_high_u8(src.val[3]));

  // Do both red and green channels at the same time.
  uint8x8_t rg = op.Fold(r, g);

  // Do both blue and alpha channels at the same time.
  uint8x8_t ba = op.Fold(b, a);

  // Do all the channels at the same time.
  uint8x8_t rgba = op.Fold(rg, ba);

  // Finally, one more pairwise fold gives the final per-channel reduction.
  return op.Fold(rgba, rgba);
}
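
// Lane walkthrough (using Max as the example): after the per-channel folds,
// r/g/b/a each hold 8 pairwise maxima; rg is then [r r r r g g g g], ba is
// [b b b b a a a a] and rgba is [r r g g b b a a], so the last fold yields
// [max_r max_g max_b max_a] duplicated in both halves of the register.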

template <typename Operator>
ALWAYS_INLINE int32x2_t Fold(const int32x4x4_t& src) {
  Operator op;

  int32x4_t fold0 = op.Calc(src.val[0], src.val[1]);
  int32x4_t fold1 = op.Calc(src.val[2], src.val[3]);
  int32x4_t fold01 = op.Calc(fold0, fold1);
  int32x2_t fold0123 = op.Fold(vget_low_s32(fold01), vget_high_s32(fold01));
  return op.Fold(fold0123, vdup_n_s32(0));
}

template <typename Operator>
ALWAYS_INLINE uint32x2_t Fold(const uint32x4x4_t& src) {
  Operator op;

  uint32x4_t fold0 = op.Calc(src.val[0], src.val[1]);
  uint32x4_t fold1 = op.Calc(src.val[2], src.val[3]);
  uint32x4_t fold01 = op.Calc(fold0, fold1);
  uint32x2_t fold0123 = op.Fold(vget_low_u32(fold01), vget_high_u32(fold01));
  return op.Fold(fold0123, vdup_n_u32(0));
}

template <typename Operator>
ALWAYS_INLINE int32x4_t FoldDup(const int32x4x4_t& src) {
  return vdupq_lane_s32(Fold<Operator>(src), 0);
}

ALWAYS_INLINE uint16x4_t SumRGB(const uint8x16x4_t& src) {
  // Add up all red values for 16 pixels.
  uint16x8_t r = vpaddlq_u8(src.val[0]);
  uint16x4_t r2 = vpadd_u16(vget_low_u16(r), vget_high_u16(r));

  // Add up all green values for 16 pixels.
  uint16x8_t g = vpaddlq_u8(src.val[1]);
  uint16x4_t g2 = vpadd_u16(vget_low_u16(g), vget_high_u16(g));

  uint16x4_t rg = vpadd_u16(r2, g2);

  // Add up all blue values for 16 pixels.
  uint16x8_t b = vpaddlq_u8(src.val[2]);
  uint16x4_t b2 = vpadd_u16(vget_low_u16(b), vget_high_u16(b));

  uint16x4_t ba = vpadd_u16(b2, vdup_n_u16(0));

  return vpadd_u16(rg, ba);
}
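
// The sums are built from widening pairwise adds: vpaddlq_u8 turns 16 bytes
// into 8 uint16 partial sums and each vpadd_u16 halves the lane count again,
// so the result is [sum_r sum_g sum_b 0]. Scalar equivalent for one channel
// (illustrative only):
//
//   uint16_t SumChannel(const uint8_t px[16]) {
//     uint16_t sum = 0;
//     for (int i = 0; i < 16; ++i)
//       sum += px[i];
//     return sum;
//   }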

ALWAYS_INLINE int32x4_t SumRGB(const int16x8x4_t& src) {
  // Add up all red values for 8 pixels.
  int32x4_t r = vpaddlq_s16(src.val[0]);
  int32x2_t r2 = vpadd_s32(vget_low_s32(r), vget_high_s32(r));

  // Add up all green values for 8 pixels.
  int32x4_t g = vpaddlq_s16(src.val[1]);
  int32x2_t g2 = vpadd_s32(vget_low_s32(g), vget_high_s32(g));

  int32x2_t rg = vpadd_s32(r2, g2);

  // Add up all blue values for 8 pixels.
  int32x4_t b = vpaddlq_s16(src.val[2]);
  int32x2_t b2 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));

  int32x2_t ba = vpadd_s32(b2, vdup_n_s32(0));

  return vcombine_s32(rg, ba);
}

ALWAYS_INLINE int32x4_t SumRGB(const int32x4x4_t& src) {
  // Pairwise-add the four red lanes into two partial sums.
  int32x2_t r = vmovn_s64(vpaddlq_s32(src.val[0]));

  // Pairwise-add the four green lanes into two partial sums.
  int32x2_t g = vmovn_s64(vpaddlq_s32(src.val[1]));

  // Complete the red and green sums.
  int32x2_t rg = vpadd_s32(r, g);

  // Pairwise-add the four blue lanes into two partial sums.
  int32x2_t b = vmovn_s64(vpaddlq_s32(src.val[2]));

  // Complete the blue sum.
  int32x2_t ba = vpadd_s32(b, vdup_n_s32(0));

  return vcombine_s32(rg, ba);
}

ALWAYS_INLINE int32x4_t DotProduct(int32x4_t r,
                                   int32x4_t g,
                                   int32x4_t b,
                                   int32x4_t dir_r,
                                   int32x4_t dir_g,
                                   int32x4_t dir_b) {
  // Multiply and accumulate each 32-bit lane.
  int32x4_t dots = vmulq_s32(r, dir_r);
  dots = vmlaq_s32(dots, g, dir_g);
  dots = vmlaq_s32(dots, b, dir_b);
  return dots;
}

ALWAYS_INLINE int32x4x4_t CalculateDots(const int32x4x4_t& r,
                                        const int32x4x4_t& g,
                                        const int32x4x4_t& b,
                                        const int32x4_t& v_vec) {
  // Duplicate the red, green and blue components of the direction vector.
  int32x4_t r_vec = vdupq_n_s32(vgetq_lane_s32(v_vec, 0));
  int32x4_t g_vec = vdupq_n_s32(vgetq_lane_s32(v_vec, 1));
  int32x4_t b_vec = vdupq_n_s32(vgetq_lane_s32(v_vec, 2));

  int32x4x4_t result;
  result.val[0] = DotProduct(r.val[0], g.val[0], b.val[0], r_vec, g_vec, b_vec);
  result.val[1] = DotProduct(r.val[1], g.val[1], b.val[1], r_vec, g_vec, b_vec);
  result.val[2] = DotProduct(r.val[2], g.val[2], b.val[2], r_vec, g_vec, b_vec);
  result.val[3] = DotProduct(r.val[3], g.val[3], b.val[3], r_vec, g_vec, b_vec);
  return result;
}

ALWAYS_INLINE uint16x8_t QuantizeTo565(uint8x8_t pixels) {
  // in:  [min_r min_g min_b 0 max_r max_g max_b 0]
  // out: [min_r5 min_g6 min_b5 0][max_r5 max_g6 max_b5 0]

  // Expand the components to unsigned 16-bit.
  uint16x8_t pixels16 = vmovl_u8(pixels);

  // {31, 63, 31, 0, 31, 63, 31, 0};
  const uint16x8_t kMultiply = vreinterpretq_u16_u64(vdupq_n_u64(0x1f003f001f));
  uint16x8_t pixel0 = vmulq_u16(pixels16, kMultiply);

  // {128, 128, 128, 0, 128, 128, 128, 0};
  const uint16x8_t kAdd = vreinterpretq_u16_u64(vdupq_n_u64(0x8000800080));
  uint16x8_t pixel1 = vaddq_u16(pixel0, kAdd);

  // Add a shifted copy to correct for dividing by 256 instead of 255.
  uint16x8_t pixel2 = vsraq_n_u16(pixel1, pixel1, 8);

  // Shift and return.
  return vshrq_n_u16(pixel2, 8);
}
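
// The multiply/add/shift sequence computes round(x * 31 / 255) (* 63 for
// green) without a division: adding t >> 8 to t before the final shift
// corrects for dividing by 256 instead of 255. Scalar equivalent for one
// 5-bit component (illustrative only):
//
//   uint16_t QuantizeTo5(uint8_t x) {
//     uint16_t t = x * 31 + 128;
//     return (t + (t >> 8)) >> 8;
//   }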

// Combine the components of the base colors into 16 bits.
ALWAYS_INLINE uint16x4_t PackBaseColors(uint16x8_t base_colors) {
  // in:  [max_r5 max_g6 max_b5 0][min_r5 min_g6 min_b5 0]
  // out: [max_rgb565 min_rgb565 0 0]

  // Swap the r and b channels to match Skia.
  base_colors = vrev64q_u16(base_colors);
  base_colors = vcombine_u16(
      vext_u16(vget_low_u16(base_colors), vget_low_u16(base_colors), 1),
      vext_u16(vget_high_u16(base_colors), vget_high_u16(base_colors), 1));

  // Shift to pack RGB565 in 16-bit.
  uint64x2_t r =
      vshlq_u64(vreinterpretq_u64_u16(base_colors), vdupq_n_s64(-32));
  uint64x2_t g =
      vshlq_u64(vreinterpretq_u64_u16(base_colors), vdupq_n_s64(-11));
  uint64x2_t b = vshlq_u64(vreinterpretq_u64_u16(base_colors), vdupq_n_s64(11));
  uint64x2_t base_colors_16 = vorrq_u64(r, vorrq_u64(g, b));

  // Shift to pack the 16-bit base colors in 32-bit and return.
  return vreinterpret_u16_u64(
      vorr_u64(vshl_n_u64(vget_high_u64(base_colors_16), 16),
               vand_u64(vget_low_u64(base_colors_16), vdup_n_u64(0xffff))));
}

// Combine the given color indices.
//
// Params:
//   S        Size of an index in bits.
//   indices  Indices to be combined. Each 8-bit element holds one index.
template <int S>
ALWAYS_INLINE uint64x1_t PackIndices(uint8x16_t indices) {
  uint64x2_t ind = vshlq_n_u64(vreinterpretq_u64_u8(indices), 8 - S);
  const uint64x2_t mask = vdupq_n_u64(0xff00000000000000);
  uint64x2_t ind2 = vandq_u64(vshlq_n_u64(ind, 56), mask);
  ind2 = vorrq_u64(vshrq_n_u64(ind2, S), vandq_u64(vshlq_n_u64(ind, 48), mask));
  ind2 = vorrq_u64(vshrq_n_u64(ind2, S), vandq_u64(vshlq_n_u64(ind, 40), mask));
  ind2 = vorrq_u64(vshrq_n_u64(ind2, S), vandq_u64(vshlq_n_u64(ind, 32), mask));
  ind2 = vorrq_u64(vshrq_n_u64(ind2, S), vandq_u64(vshlq_n_u64(ind, 24), mask));
  ind2 = vorrq_u64(vshrq_n_u64(ind2, S), vandq_u64(vshlq_n_u64(ind, 16), mask));
  ind2 = vorrq_u64(vshrq_n_u64(ind2, S), vandq_u64(vshlq_n_u64(ind, 8), mask));
  ind2 = vorrq_u64(vshrq_n_u64(ind2, S), vandq_u64(ind, mask));
  return vshr_n_u64(
      vorr_u64(vshr_n_u64(vget_low_u64(ind2), (8 * S)), vget_high_u64(ind2)),
      64 - 16 * S);
}
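
// Scalar equivalent of the packing above (illustrative only): concatenate
// the low S bits of each of the 16 byte-sized indices, with pixel 0 ending
// up in the least significant bits as the DXT/ATC layouts expect.
//
//   uint64_t PackIndicesScalar(const uint8_t ind[16], int s) {
//     uint64_t out = 0;
//     for (int i = 15; i >= 0; --i)
//       out = (out << s) | (ind[i] & ((1 << s) - 1));
//     return out;
//   }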

ALWAYS_INLINE int32x4_t
CovarianceChannels(const int16x8x2_t& ch1, const int16x8x2_t& ch2) {
  // Multiply and accumulate.
  int32x4_t cov;
  cov = vmull_s16(vget_low_s16(ch1.val[0]), vget_low_s16(ch2.val[0]));
  cov = vmlal_s16(cov, vget_high_s16(ch1.val[0]), vget_high_s16(ch2.val[0]));
  cov = vmlal_s16(cov, vget_low_s16(ch1.val[1]), vget_low_s16(ch2.val[1]));
  cov = vmlal_s16(cov, vget_high_s16(ch1.val[1]), vget_high_s16(ch2.val[1]));
  return cov;
}

ALWAYS_INLINE int32x4x2_t
Covariance(uint16x4_t average_rgb, const uint8x16x4_t& pixels_scattered) {
  int16x8_t average_r = vreinterpretq_s16_u16(vdupq_lane_u16(average_rgb, 0));
  int16x8_t average_g = vreinterpretq_s16_u16(vdupq_lane_u16(average_rgb, 1));
  int16x8_t average_b = vreinterpretq_s16_u16(vdupq_lane_u16(average_rgb, 2));

  // Subtract the average red from each red value.
  int16x8x2_t diff_r;
  diff_r.val[0] = vsubq_s16(
      vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels_scattered.val[0]))),
      average_r);
  diff_r.val[1] = vsubq_s16(
      vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels_scattered.val[0]))),
      average_r);

  // Subtract the average green from each green value.
  int16x8x2_t diff_g;
  diff_g.val[0] = vsubq_s16(
      vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels_scattered.val[1]))),
      average_g);
  diff_g.val[1] = vsubq_s16(
      vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels_scattered.val[1]))),
      average_g);

  // Subtract the average blue from each blue value.
  int16x8x2_t diff_b;
  diff_b.val[0] = vsubq_s16(
      vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels_scattered.val[2]))),
      average_b);
  diff_b.val[1] = vsubq_s16(
      vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels_scattered.val[2]))),
      average_b);

  int32x4x4_t cov1;
  cov1.val[0] = CovarianceChannels(diff_r, diff_r);
  cov1.val[1] = CovarianceChannels(diff_r, diff_g);
  cov1.val[2] = CovarianceChannels(diff_r, diff_b);
  cov1.val[3] = vdupq_n_s32(0);

  int32x4x4_t cov2;
  cov2.val[0] = CovarianceChannels(diff_g, diff_g);
  cov2.val[1] = CovarianceChannels(diff_g, diff_b);
  cov2.val[2] = CovarianceChannels(diff_b, diff_b);
  cov2.val[3] = vdupq_n_s32(0);

  int32x4x2_t covariance;
  covariance.val[0] = SumRGB(cov1);
  covariance.val[1] = SumRGB(cov2);
  return covariance;
}
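
// The 3x3 color covariance matrix is symmetric, so only its six unique
// entries are computed, packed as two vectors:
//   covariance.val[0] = [cov_rr cov_rg cov_rb 0]
//   covariance.val[1] = [cov_gg cov_gb cov_bb 0]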

ALWAYS_INLINE uint32x2_t MaskOutPixel(const uint8x16x4_t& pixels_linear,
                                      const int32x4x4_t& dots,
                                      int32x4_t max_dot_vec) {
  // Keep only the pixels whose dot product matches the given extreme value
  // exactly; all other lanes become zero.
  uint32x4x4_t pixels;
  pixels.val[0] = vandq_u32(vceqq_s32(dots.val[0], max_dot_vec),
                            vreinterpretq_u32_u8(pixels_linear.val[0]));

  pixels.val[1] = vandq_u32(vceqq_s32(dots.val[1], max_dot_vec),
                            vreinterpretq_u32_u8(pixels_linear.val[1]));

  pixels.val[2] = vandq_u32(vceqq_s32(dots.val[2], max_dot_vec),
                            vreinterpretq_u32_u8(pixels_linear.val[2]));

  pixels.val[3] = vandq_u32(vceqq_s32(dots.val[3], max_dot_vec),
                            vreinterpretq_u32_u8(pixels_linear.val[3]));

  // Fold it down.
  return Fold<vec_ops::Max>(pixels);
}

ALWAYS_INLINE uint16x8_t GetBaseColors(const uint8x16x4_t& pixels_linear,
                                       const uint8x16x4_t& pixels_scattered,
                                       int32x4_t dir) {
  // Expand all pixels to signed 32-bit integers.
  int32x4x4_t r = ExpandRGBATo32(pixels_scattered.val[0]);
  int32x4x4_t g = ExpandRGBATo32(pixels_scattered.val[1]);
  int32x4x4_t b = ExpandRGBATo32(pixels_scattered.val[2]);

  int32x4x4_t dots = CalculateDots(r, g, b, dir);

  // Pick out the pixel(s) that match the max dot.
  uint32x2_t max_pixel =
      MaskOutPixel(pixels_linear, dots, FoldDup<vec_ops::Max>(dots));

  // Pick out the pixel(s) that match the min dot.
  uint32x2_t min_pixel =
      MaskOutPixel(pixels_linear, dots, FoldDup<vec_ops::Min>(dots));

  return QuantizeTo565(
      vreinterpret_u8_u32(vzip_u32(max_pixel, min_pixel).val[0]));
}

// Figure out the two base colors to use from a block of 16 pixels
// by Principal Component Analysis, mapping along the principal axis.
ALWAYS_INLINE uint16x8_t
OptimizeColorsBlock(const uint8x16x4_t& pixels_linear,
                    const uint8x16x4_t& pixels_scattered,
                    uint16x4_t sum_rgb,
                    uint8x8_t min_rgba,
                    uint8x8_t max_rgba) {
  // min_rgba: [min_r min_g min_b min_a x x x x]
  // max_rgba: [max_r max_g max_b max_a x x x x]

  // Determine the color distribution. We already have the max and min, now
  // we need the average of the 16 pixels. Divide sum_rgb with rounding.
  uint16x4_t average_rgb = vrshr_n_u16(sum_rgb, 4);

  // Determine the covariance matrix.
  int32x4x2_t covariance = Covariance(average_rgb, pixels_scattered);

  // Convert the covariance matrix to float, then find the principal axis via
  // power iteration.
  float32x4x2_t covariance_float;
  const float32x4_t kInv255 = vdupq_n_f32(1.0f / 255.0f);
  covariance_float.val[0] =
      vmulq_f32(vcvtq_f32_s32(covariance.val[0]), kInv255);
  covariance_float.val[1] =
      vmulq_f32(vcvtq_f32_s32(covariance.val[1]), kInv255);

  int16x4_t max_16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(max_rgba)));
  int16x4_t min_16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(min_rgba)));
  float32x4_t vf4 = vcvtq_f32_s32(vsubl_s16(max_16, min_16));

  for (int i = 0; i < 4; ++i) {
    float32x4_t vfr4 = vdupq_n_f32(vgetq_lane_f32(vf4, 0));
    float32x4_t vfg4 = vdupq_n_f32(vgetq_lane_f32(vf4, 1));
    float32x4_t vfb4 = vdupq_n_f32(vgetq_lane_f32(vf4, 2));

    // from: [0 1 2 x] [3 4 5 x]
    // to:   [1 3 4 x]
    float32x4_t cov_134 =
        vextq_f32(covariance_float.val[1], covariance_float.val[1], 3);
    cov_134 =
        vsetq_lane_f32(vgetq_lane_f32(covariance_float.val[0], 1), cov_134, 0);

    // from: [0 1 2 x] [3 4 5 x]
    // to:   [2 4 5 x]
    float32x4_t cov_245 = vsetq_lane_f32(
        vgetq_lane_f32(covariance_float.val[0], 2), covariance_float.val[1], 0);

    vf4 = vmulq_f32(vfr4, covariance_float.val[0]);
    vf4 = vmlaq_f32(vf4, vfg4, cov_134);
    vf4 = vmlaq_f32(vf4, vfb4, cov_245);
  }

  float32x4_t magnitude = vabsq_f32(vf4);
  magnitude = vsetq_lane_f32(0.0f, magnitude, 3);  // Null out alpha.
  float32x4_t mag4 = vdupq_lane_f32(
      vpmax_f32(vpmax_f32(vget_low_f32(magnitude), vget_high_f32(magnitude)),
                vdup_n_f32(0.0f)),
      0);

  const int32x4_t kLuminance = {299, 587, 114, 0};

  // Note that this quite often means dividing by zero. The math still works
  // when comparing with Inf though.
  float32x4_t inv_magnitude = Divide<2>(vdupq_n_f32(512.0f), mag4);

  int32x4_t vf4_mag = vcvtq_s32_f32(vmulq_f32(vf4, inv_magnitude));
  int32x4_t v =
      vbslq_s32(vcltq_f32(mag4, vdupq_n_f32(4.0f)), kLuminance, vf4_mag);

  return GetBaseColors(pixels_linear, pixels_scattered, v);
}
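
// Power iteration in scalar form (illustrative only): starting from the
// max-min color difference, repeatedly multiplying by the covariance matrix
// makes the vector converge towards the principal eigenvector, the axis of
// greatest color variance. With cov[] = [rr rg rb gg gb bb]:
//
//   for (int i = 0; i < 4; ++i) {
//     float r = v[0] * cov[0] + v[1] * cov[1] + v[2] * cov[2];
//     float g = v[0] * cov[1] + v[1] * cov[3] + v[2] * cov[4];
//     float b = v[0] * cov[2] + v[1] * cov[4] + v[2] * cov[5];
//     v[0] = r;
//     v[1] = g;
//     v[2] = b;
//   }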

ALWAYS_INLINE uint16x8_t
GetApproximateBaseColors(const uint8x16x4_t& pixels_linear,
                         const uint8x16x4_t& pixels_scattered,
                         uint8x8_t min_rgba,
                         uint8x8_t max_rgba) {
  // min_rgba: [min_r min_g min_b min_a x x x x]
  // max_rgba: [max_r max_g max_b max_a x x x x]

  // Get direction vector and expand to 32-bit.
  int16x4_t max_16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(max_rgba)));
  int16x4_t min_16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(min_rgba)));
  int32x4_t v = vsubl_s16(max_16, min_16);

  return GetBaseColors(pixels_linear, pixels_scattered, v);
}

// Take two base colors and generate 4 RGBX colors where:
// 0 = baseColor0
// 1 = baseColor1
// 2 = (2 * baseColor0 + baseColor1) / 3
// 3 = (2 * baseColor1 + baseColor0) / 3
ALWAYS_INLINE uint16x4x4_t EvalColors(const uint16x8_t& base_colors) {
  // The base colors are expanded by reusing the top bits at the end. That
  // makes sure that white is still white after being quantized and converted
  // back.
  //
  // [(r<<3 | r>>2) (g<<2 | g>>4) (b<<3 | b>>2) 0]

  // The upper shift values for each component.
  // {3, 2, 3, 0, 3, 2, 3, 0};
  const int16x8_t kShiftUp = vreinterpretq_s16_u64(vdupq_n_u64(0x300020003));
  uint16x8_t pixels_up = vshlq_u16(base_colors, kShiftUp);
  // [r0<<3 g0<<2 b0<<3 0] [r1<<3 g1<<2 b1<<3 0]

  // The lower shift values for each component.
  // Note that we need to use negative values to shift right.
  // {-2, -4, -2, 0, -2, -4, -2, 0};
  const int16x8_t kShiftDown =
      vreinterpretq_s16_u64(vdupq_n_u64(0xfffefffcfffe));
  uint16x8_t pixels_down = vshlq_u16(base_colors, kShiftDown);
  // [r0>>2 g0>>4 b0>>2 0] [r1>>2 g1>>4 b1>>2 0]

  uint16x8_t pixels = vorrq_u16(pixels_up, pixels_down);
  // [(r0<<3 | r0>>2) (g0<<2 | g0>>4) (b0<<3 | b0>>2) 0]
  // [(r1<<3 | r1>>2) (g1<<2 | g1>>4) (b1<<3 | b1>>2) 0]

  // Linearly interpolate the two other colors:
  // (2 * max + min) / 3
  // (2 * min + max) / 3

  uint16x8_t pixels_mul2 = vaddq_u16(pixels, pixels);

  uint16x8_t swapped = vreinterpretq_u16_u64(vextq_u64(
      vreinterpretq_u64_u16(pixels), vreinterpretq_u64_u16(pixels), 1));
  int16x8_t output = vreinterpretq_s16_u16(vaddq_u16(pixels_mul2, swapped));

  // There's no division in NEON, but we can use "x * ((1 << 16) / 3 + 1)"
  // instead.
  output = vqdmulhq_s16(output, vdupq_n_s16(((1 << 16) / 3 + 1) >> 1));

  uint16x4x4_t colors;
  colors.val[0] = vget_low_u16(pixels);
  colors.val[1] = vget_high_u16(pixels);
  colors.val[2] = vreinterpret_u16_s16(vget_low_s16(output));
  colors.val[3] = vreinterpret_u16_s16(vget_high_s16(output));
  return colors;
}
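
// The fixed-point trick above: x / 3 == (x * 21846) >> 16 for the value
// range involved here, where 21846 == (1 << 16) / 3 + 1. vqdmulhq_s16
// computes (a * b * 2) >> 16, which is why the constant is halved first.
// Scalar equivalent (illustrative only):
//
//   uint16_t DivideBy3(uint16_t x) {  // x <= 3 * 255
//     return (uint32_t)(x * 21846) >> 16;
//   }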

template <typename T>
ALWAYS_INLINE uint8x8_t GetRemapIndices(int32x4_t dots,
                                        int32x4_t half_point,
                                        int32x4_t c0_point,
                                        int32x4_t c3_point) {
  // bits = (dot < half_point ? 4 : 0)
  //      | (dot < c0_point ? 2 : 0)
  //      | (dot < c3_point ? 1 : 0)
  int32x4_t cmp0 = vreinterpretq_s32_u32(
      vandq_u32(vcgtq_s32(half_point, dots), vdupq_n_u32(4)));
  int32x4_t cmp1 = vreinterpretq_s32_u32(
      vandq_u32(vcgtq_s32(c0_point, dots), vdupq_n_u32(2)));
  int32x4_t cmp2 = vreinterpretq_s32_u32(
      vandq_u32(vcgtq_s32(c3_point, dots), vdupq_n_u32(1)));
  int32x4_t bits = vorrq_s32(vorrq_s32(cmp0, cmp1), cmp2);

  // Narrow it down to unsigned 8 bits and return.
  return vqmovn_u16(vcombine_u16(vqmovun_s32(bits), vdup_n_u16(0)));
}

// dots:   Dot products for each pixel.
// points: Crossover points.
template <typename T>
ALWAYS_INLINE uint8x16_t GetColorIndices(int32x4x4_t dots, int32x4_t points) {
  // Crossover points for "best color in top half"/"best in bottom half" and
  // the same inside that subinterval.
  int32x4_t c0_point = vdupq_lane_s32(vget_low_s32(points), 1);
  int32x4_t half_point = vdupq_lane_s32(vget_low_s32(points), 0);
  int32x4_t c3_point = vdupq_lane_s32(vget_high_s32(points), 0);

  // Get kRemap table indices.
  uint8x8x4_t ind;
  ind.val[0] = GetRemapIndices<T>(dots.val[0], half_point, c0_point, c3_point);
  ind.val[1] = GetRemapIndices<T>(dots.val[1], half_point, c0_point, c3_point);
  ind.val[2] = GetRemapIndices<T>(dots.val[2], half_point, c0_point, c3_point);
  ind.val[3] = GetRemapIndices<T>(dots.val[3], half_point, c0_point, c3_point);

  // Combine indices.
  uint8x8_t indices_lo =
      vreinterpret_u8_u32(vzip_u32(vreinterpret_u32_u8(ind.val[0]),
                                   vreinterpret_u32_u8(ind.val[1])).val[0]);
  uint8x8_t indices_hi =
      vreinterpret_u8_u32(vzip_u32(vreinterpret_u32_u8(ind.val[2]),
                                   vreinterpret_u32_u8(ind.val[3])).val[0]);
  // Do table lookup and return 2-bit color indices.
  return vcombine_u8(vtbl1_u8(T::kRemap, indices_lo),
                     vtbl1_u8(T::kRemap, indices_hi));
}
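
// Since the three crossover points are ordered along the projection axis,
// only four of the eight possible comparison masks can occur (0, 1, 5 and 7);
// kRemap maps those to the encoded 2-bit index of the nearest palette color.
// Scalar form (illustrative only):
//
//   int bits = (dot < half_point ? 4 : 0) | (dot < c0_point ? 2 : 0) |
//              (dot < c3_point ? 1 : 0);
//   int index = kRemap[bits];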

template <typename T>
ALWAYS_INLINE uint8x16_t
MatchColorsBlock(const uint8x16x4_t& pixels_scattered, uint16x4x4_t colors) {
  // Get the direction vector and expand to 32-bit.
  int32x4_t dir = vsubl_s16(vreinterpret_s16_u16(colors.val[0]),
                            vreinterpret_s16_u16(colors.val[1]));
  // Duplicate the r, g and b elements of the direction into different
  // registers.
  int32x4_t dir_r = vdupq_lane_s32(vget_low_s32(dir), 0);
  int32x4_t dir_g = vdupq_lane_s32(vget_low_s32(dir), 1);
  int32x4_t dir_b = vdupq_lane_s32(vget_high_s32(dir), 0);

  // Transpose to separate the red, green, blue and alpha channels into 4
  // different registers. Alpha is ignored.
  uint16x4x2_t trn_lo = vtrn_u16(colors.val[0], colors.val[1]);
  uint16x4x2_t trn_hi = vtrn_u16(colors.val[2], colors.val[3]);
  uint32x4x2_t transposed_colors = vtrnq_u32(
      vreinterpretq_u32_u16(vcombine_u16(trn_lo.val[0], trn_lo.val[1])),
      vreinterpretq_u32_u16(vcombine_u16(trn_hi.val[0], trn_hi.val[1])));

  // Expand to 32-bit.
  int32x4_t colors_r =
      vmovl_s16(vget_low_s16(vreinterpretq_s16_u32(transposed_colors.val[0])));
  int32x4_t colors_g =
      vmovl_s16(vget_high_s16(vreinterpretq_s16_u32(transposed_colors.val[0])));
  int32x4_t colors_b =
      vmovl_s16(vget_low_s16(vreinterpretq_s16_u32(transposed_colors.val[1])));

  // Get the dot products of the palette colors.
  int32x4_t stops =
      DotProduct(colors_r, colors_g, colors_b, dir_r, dir_g, dir_b);

  // points1 = [s3 s1 s2 s3]: stops with its first element replaced by the
  // fourth.
  int32x4_t points1 = vsetq_lane_s32(vgetq_lane_s32(stops, 3), stops, 0);
  // points2 = [s2 s3 s0 s1]: stops rotated by two elements.
  int32x4_t points2 = vreinterpretq_s32_s64(
      vextq_s64(vreinterpretq_s64_s32(stops), vreinterpretq_s64_s32(stops), 1));
  // Average adjacent stops to get the crossover points.
  int32x4_t points = vshrq_n_s32(vaddq_s32(points1, points2), 1);

  // Expand all pixels to signed 32-bit integers.
  int32x4x4_t r = ExpandRGBATo32(pixels_scattered.val[0]);
  int32x4x4_t g = ExpandRGBATo32(pixels_scattered.val[1]);
  int32x4x4_t b = ExpandRGBATo32(pixels_scattered.val[2]);

  int32x4x4_t dots = CalculateDots(r, g, b, dir);

  // Get 2-bit color indices.
  return GetColorIndices<T>(dots, points);
}

template <typename T>
ALWAYS_INLINE uint32x4x2_t DoProdsTableLookup(uint8x8_t indices) {
  // Do a table lookup for each color index. Each table value is 3 bytes
  // wide, so the lookup is done one byte at a time in 3 steps.
  uint16x8_t lookup1 = vmovl_u8(vtbl1_u8(vcreate_u8(T::kProds[0]), indices));
  uint16x8_t lookup2 = vmovl_u8(vtbl1_u8(vcreate_u8(T::kProds[1]), indices));
  uint16x8_t lookup3 = vmovl_u8(vtbl1_u8(vcreate_u8(T::kProds[2]), indices));
  // Expand to 32-bit.
  uint32x4_t lookup1_lo = vmovl_u16(vget_low_u16(lookup1));
  uint32x4_t lookup1_hi = vmovl_u16(vget_high_u16(lookup1));
  uint32x4_t lookup2_lo = vmovl_u16(vget_low_u16(lookup2));
  uint32x4_t lookup2_hi = vmovl_u16(vget_high_u16(lookup2));
  uint32x4_t lookup3_lo = vmovl_u16(vget_low_u16(lookup3));
  uint32x4_t lookup3_hi = vmovl_u16(vget_high_u16(lookup3));
  // Combine the results by shifting and or-ing to obtain the actual table
  // value.
  uint32x4x2_t result;
  result.val[0] = vorrq_u32(lookup3_lo, vorrq_u32(vshlq_n_u32(lookup2_lo, 8),
                                                  vshlq_n_u32(lookup1_lo, 16)));
  result.val[1] = vorrq_u32(lookup3_hi, vorrq_u32(vshlq_n_u32(lookup2_hi, 8),
                                                  vshlq_n_u32(lookup1_hi, 16)));
  return result;
}
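
// Each reconstructed 24-bit table value packs the weight products needed by
// the least squares solve in RefineBlock as
//   (w1 * w1 << 16) | (w2 * w2 << 8) | (w1 * w2)
// where w1 is the index's interpolation weight (kW1Table) and w2 = 3 - w1.
// kProds[0..2] hold the high, middle and low bytes separately so that each
// byte can be fetched with a single vtbl1_u8. Accumulating the packed values
// over all 16 pixels cannot overflow a field, since the largest per-field
// total is 16 * 9 = 144 < 256.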

// Tries to optimize the base colors to suit the block contents better, by
// solving a least squares system via normal equations + Cramer's rule.
template <typename T>
ALWAYS_INLINE int RefineBlock(const uint8x16x4_t& pixels_scattered,
                              uint16x4_t sum_rgb,
                              uint16x8_t& base_colors,
                              uint8x16_t indices) {
  uint16x8_t old_base_colors = base_colors;

  if (ElementsEqual(indices)) {  // Do all pixels have the same index?
    // Yes, the linear system would be singular; solve using the optimal
    // single-color match on the average color.

    // Get the average of the 16 pixels with rounding.
    uint16x4_t average_rgb = vrshr_n_u16(sum_rgb, 4);

    ALIGNAS(8) uint16_t rgb[4];
    vst1_u16(rgb, average_rgb);
    // Look up optimal values instead of trying to calculate them.
    uint16_t colors[8] = {g_o_match55[rgb[0]][0],
                          MatchSingleGreenMax<typename T::BASE_TYPE>(rgb[1]),
                          g_o_match55[rgb[2]][0],
                          0,
                          g_o_match55[rgb[0]][1],
                          MatchSingleGreenMin<typename T::BASE_TYPE>(rgb[1]),
                          g_o_match55[rgb[2]][1],
                          0};
    base_colors = vld1q_u16(colors);
  } else {
    // Expand to 16-bit.
    int16x8x2_t r = ExpandRGBATo16(pixels_scattered.val[0]);
    int16x8x2_t g = ExpandRGBATo16(pixels_scattered.val[1]);
    int16x8x2_t b = ExpandRGBATo16(pixels_scattered.val[2]);

    // Do a table lookup for each color index.
    int8x16_t w1 = DoW1TableLookup<T>(indices);
    // Expand to 16-bit.
    int16x8_t w1_lo = vmovl_s8(vget_low_s8(w1));
    int16x8_t w1_hi = vmovl_s8(vget_high_s8(w1));
    // Multiply and accumulate.
    int16x8x4_t at1_rgb;
    at1_rgb.val[0] = vmulq_s16(w1_lo, r.val[0]);
    at1_rgb.val[0] = vmlaq_s16(at1_rgb.val[0], w1_hi, r.val[1]);
    at1_rgb.val[1] = vmulq_s16(w1_lo, g.val[0]);
    at1_rgb.val[1] = vmlaq_s16(at1_rgb.val[1], w1_hi, g.val[1]);
    at1_rgb.val[2] = vmulq_s16(w1_lo, b.val[0]);
    at1_rgb.val[2] = vmlaq_s16(at1_rgb.val[2], w1_hi, b.val[1]);
    // [r][g][b][]
    int32x4_t at1 = SumRGB(at1_rgb);

    // [r][g][b][]
    int32x4_t at2 = vreinterpretq_s32_u32(vmovl_u16(sum_rgb));
    // at2 = 3 * at2 - at1;
    at2 = vsubq_s32(vmulq_s32(at2, vdupq_n_s32(3)), at1);

    // Do a table lookup for each color index.
    uint32x4x2_t akku1 = DoProdsTableLookup<T>(vget_low_u8(indices));
    uint32x4x2_t akku2 = DoProdsTableLookup<T>(vget_high_u8(indices));
    uint32x4_t sum_akku = vaddq_u32(
        vaddq_u32(vaddq_u32(akku1.val[0], akku1.val[1]), akku2.val[0]),
        akku2.val[1]);
    // Pairwise add and accumulate.
    uint64x1_t akku = vpaddl_u32(vget_low_u32(sum_akku));
    akku = vpadal_u32(akku, vget_high_u32(sum_akku));

    // Extract solutions and decide solvability.

    // [akku >> 16]x4
    int32x4_t xx =
        vdupq_lane_s32(vreinterpret_s32_u64(vshr_n_u64(akku, 16)), 0);
    // [(akku >> 8) & 0xff]x4
    const uint64x1_t kFF = vdup_n_u64(0xff);
    int32x4_t yy = vdupq_lane_s32(
        vreinterpret_s32_u64(vand_u64(vshr_n_u64(akku, 8), kFF)), 0);
    // [akku & 0xff]x4
    int32x4_t xy = vdupq_lane_s32(vreinterpret_s32_u64(vand_u64(akku, kFF)), 0);

    // ((3.0f * 31.0f) / 255.0f) / (xx * yy - xy * xy)
    float32x4_t frb = Divide<2>(
        vdupq_n_f32((3.0f * 31.0f) / 255.0f),
        vcvtq_f32_s32(vsubq_s32(vmulq_s32(xx, yy), vmulq_s32(xy, xy))));
    // frb * 63.0f / 31.0f
    float32x4_t fg = vmulq_f32(vmulq_f32(frb, vdupq_n_f32(63.0f)),
                               vdupq_n_f32(1.0f / 31.0f));

    // Solve.

    // [frb][fg][frb][]
    float32x4_t frb_fg_frb = vsetq_lane_f32(vgetq_lane_f32(fg, 0), frb, 1);
    // [31][63][31][]
    const int32x4_t kClamp565_vec = {31, 63, 31, 0};

    // (at1_r * yy - at2_r * xy) * frb + 0.5f
    int32x4_t base0_rgb32 = vcvtq_s32_f32(vaddq_f32(
        vmulq_f32(
            vcvtq_f32_s32(vsubq_s32(vmulq_s32(at1, yy), vmulq_s32(at2, xy))),
            frb_fg_frb),
        vdupq_n_f32(0.5f)));
    // Clamp and saturate.
    uint16x4_t base0_rgb16 = vqmovun_s32(vbslq_s32(
        vcgeq_s32(base0_rgb32, kClamp565_vec), kClamp565_vec, base0_rgb32));

    // (at2_r * xx - at1_r * xy) * frb + 0.5f
    int32x4_t base1_rgb32 = vcvtq_s32_f32(vaddq_f32(
        vmulq_f32(
            vcvtq_f32_s32(vsubq_s32(vmulq_s32(at2, xx), vmulq_s32(at1, xy))),
            frb_fg_frb),
        vdupq_n_f32(0.5f)));
    // Clamp and saturate.
    uint16x4_t base1_rgb16 = vqmovun_s32(vbslq_s32(
        vcgeq_s32(base1_rgb32, kClamp565_vec), kClamp565_vec, base1_rgb32));

    base_colors = vcombine_u16(base0_rgb16, base1_rgb16);
  }

  return !Equal(old_base_colors, base_colors);
}
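
// Derivation sketch for the solve above (illustrative only): with pixel
// colors p_i and weights w1_i (w2_i = 3 - w1_i), minimizing
//   E = sum_i ((w1_i * c0 + w2_i * c1) / 3 - p_i)^2
// over the base colors c0 and c1 yields the 2x2 normal equations
//   [xx xy] [c0]   [at1]
//   [xy yy] [c1] = [at2]  (up to constant scale factors),
// with xx = sum w1_i^2, yy = sum w2_i^2, xy = sum w1_i * w2_i,
// at1 = sum w1_i * p_i and at2 = sum w2_i * p_i. Cramer's rule then gives
// c0 = (at1 * yy - at2 * xy) / (xx * yy - xy * xy) and symmetrically for c1;
// the factor of 3 and the 255 -> 31/63 range conversions are folded into
// frb and fg.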

template <typename T, Quality QUALITY>
ALWAYS_INLINE void CompressColorBlock(uint8_t* dst,
                                      const uint8x16x4_t& pixels_linear,
                                      const uint8x16x4_t& pixels_scattered,
                                      uint8x8_t min_rgba,
                                      uint8x8_t max_rgba) {
  // Take a shortcut if the block is constant (disregarding alpha).
  uint32_t min32 = vget_lane_u32(vreinterpret_u32_u8(min_rgba), 0);
  uint32_t max32 = vget_lane_u32(vreinterpret_u32_u8(max_rgba), 0);
  if ((min32 & 0x00ffffff) == (max32 & 0x00ffffff)) {
    // Swap the r and b channels to match Skia.
    int b = min32 & 0xff;
    int g = (min32 >> 8) & 0xff;
    int r = (min32 >> 16) & 0xff;

    uint16_t max16 = MatchSingleColorMax<typename T::BASE_TYPE>(r, g, b);
    uint16_t min16 = MatchSingleColorMin<typename T::BASE_TYPE>(r, g, b);
    uint32_t indices = T::kConstantColorIndices;
    FormatFixup_Generic<typename T::BASE_TYPE>(&max16, &min16, &indices);

    uint32_t* dst32 = reinterpret_cast<uint32_t*>(dst);
    dst32[0] = max16 | (min16 << 16);
    dst32[1] = indices;
  } else {
    uint16x4_t sum_rgb;
    uint16x8_t base_colors;

    if (QUALITY == kQualityLow) {
      base_colors = GetApproximateBaseColors(pixels_linear, pixels_scattered,
                                             min_rgba, max_rgba);
    } else {
      sum_rgb = SumRGB(pixels_scattered);
      // Do Principal Component Analysis and map along the principal axis.
      base_colors = OptimizeColorsBlock(pixels_linear, pixels_scattered,
                                        sum_rgb, min_rgba, max_rgba);
    }

    // Check if the two base colors are the same.
    uint8x16_t indices;
    if (!Equal(vget_low_u16(base_colors), vget_high_u16(base_colors))) {
      // Calculate the two intermediate colors as well.
      uint16x4x4_t colors = EvalColors(base_colors);

      // Do a first pass to find good index candidates for all 16 of the
      // pixels in the block.
      indices = MatchColorsBlock<T>(pixels_scattered, colors);
    } else {
      // Any indices can be used here.
      indices = vdupq_n_u8(0);
    }

    if (QUALITY == kQualityHigh) {
      // Refine the base colors and indices multiple times if requested.
      for (int i = 0; i < kNumRefinements; ++i) {
        uint8x16_t last_indices = indices;

        if (RefineBlock<T>(pixels_scattered, sum_rgb, base_colors, indices)) {
          if (!Equal(vget_low_u16(base_colors), vget_high_u16(base_colors))) {
            uint16x4x4_t colors = EvalColors(base_colors);
            indices = MatchColorsBlock<T>(pixels_scattered, colors);
          } else {
            // We ended up with two identical base colors; this can't be
            // refined any further.
            indices = vdupq_n_u8(0);
            break;
          }
        }

        if (Equal(indices, last_indices)) {
          // There's no need to do another refinement pass if we didn't get
          // any improvements this pass.
          break;
        }
      }
    }

    // Prepare the final block by converting the base colors to 16-bit and
    // packing the pixel indices.
    uint16x4_t base_colors_16 = PackBaseColors(base_colors);
    uint64x1_t indices_2x16 = PackIndices<2>(indices);
    FormatFixup<T>(&base_colors_16, &indices_2x16);
    uint64x1_t output = vorr_u64(vshl_n_u64(indices_2x16, 32),
                                 vreinterpret_u64_u16(base_colors_16));
    vst1_u64(reinterpret_cast<uint64_t*>(dst), output);
  }
}

// alpha: 8 x 8-bit alpha values.
// dist:  Distance between max and min alpha in the color block.
// bias:  Rounding bias.
ALWAYS_INLINE uint8x8_t
GetAlphaIndices(uint8x8_t alpha, int16x8_t dist, int16x8_t bias) {
  // Expand to signed 16-bit.
  int16x8_t alpha_16 = vreinterpretq_s16_u16(vmovl_u8(alpha));

  // Multiply each alpha value by 7 and add the bias.
  int16x8_t a = vaddq_s16(vmulq_s16(alpha_16, vdupq_n_s16(7)), bias);

  int16x8_t dist4 = vmulq_s16(dist, vdupq_n_s16(4));
  int16x8_t dist2 = vmulq_s16(dist, vdupq_n_s16(2));

  // Select an index. This is a "linear scale" lerp factor between 0
  // (val=min) and 7 (val=max).
  // t = (a >= dist4) ? -1 : 0
  int16x8_t t =
      vandq_s16(vreinterpretq_s16_u16(vcgeq_s16(a, dist4)), vdupq_n_s16(-1));
  // ind1 = t & 4;
  int16x8_t ind1 = vandq_s16(t, vdupq_n_s16(4));
  // a1 = a - (dist4 & t);
  int16x8_t a1 = vsubq_s16(a, vandq_s16(dist4, t));

  // t = (a1 >= dist2) ? -1 : 0;
  t = vandq_s16(vreinterpretq_s16_u16(vcgeq_s16(a1, dist2)), vdupq_n_s16(-1));
  // ind2 = t & 2;
  int16x8_t ind2 = vandq_s16(t, vdupq_n_s16(2));
  // a2 = a1 - (dist2 & t);
  int16x8_t a2 = vsubq_s16(a1, vandq_s16(dist2, t));

  // ind3 = (a2 >= dist)
  int16x8_t ind3 =
      vandq_s16(vreinterpretq_s16_u16(vcgeq_s16(a2, dist)), vdupq_n_s16(1));

  // indices = ind1 + ind2 + ind3
  int16x8_t indices = vaddq_s16(ind1, vaddq_s16(ind2, ind3));

  // Turn the linear scale into an alpha index (0/1 are the extremal points).
  // ind = -indices & 7
  int16x8_t ind = vandq_s16(vnegq_s16(indices), vdupq_n_s16(7));
  // indices = ind ^ (2 > ind)
  indices = veorq_s16(
      ind, vandq_s16(vreinterpretq_s16_u16(vcgtq_s16(vdupq_n_s16(2), ind)),
                     vdupq_n_s16(1)));
  // Narrow it down to unsigned 8 bits and return.
  return vqmovun_s16(indices);
}
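
// Scalar sketch of the branchless quantization above (illustrative only):
// the three subtract-and-compare rounds compute the 3-bit quotient
// q = (7 * alpha + bias) / dist one bit at a time, i.e. a linear 0..7 scale
// between min and max, which is then converted to the DXT5 index order
// where 0 encodes the max alpha and 1 the min alpha.
//
//   int q = (7 * alpha + bias) / dist;  // 0..7, 7 == max
//   int ind = (-q) & 7;                 // 7 -> 1, 6 -> 2, ..., 0 -> 0
//   ind ^= (2 > ind) ? 1 : 0;           // swap 0/1 into the extremal slots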

ALWAYS_INLINE void CompressAlphaBlock(uint8_t* dst,
                                      uint8x16_t pixels_alpha,
                                      uint8x8_t min_rgba,
                                      uint8x8_t max_rgba) {
  // Take a shortcut if the block is constant.
  uint8_t min_alpha = vget_lane_u8(min_rgba, 3);
  uint8_t max_alpha = vget_lane_u8(max_rgba, 3);
  if (min_alpha == max_alpha) {
    dst[0] = max_alpha;
    dst[1] = min_alpha;
    // All indices are the same, any value will do.
    *reinterpret_cast<uint16_t*>(dst + 2) = 0;
    *reinterpret_cast<uint32_t*>(dst + 4) = 0;
  } else {
    // [max - min]x8
    int16x8_t dist = vdupq_lane_s16(
        vreinterpret_s16_u16(vget_low_u16(vsubl_u8(max_rgba, min_rgba))), 3);
    // bias = (dist < 8) ? (dist - 1) : (dist / 2 + 2)
    int16x8_t bias = vbslq_s16(vcltq_s16(dist, vdupq_n_s16(8)),
                               vsubq_s16(dist, vdupq_n_s16(1)),
                               vaddq_s16(vshrq_n_s16(dist, 1), vdupq_n_s16(2)));
    // bias -= min * 7;
    bias = vsubq_s16(
        bias,
        vmulq_s16(
            vdupq_lane_s16(
                vreinterpret_s16_u16(vget_low_u16(vmovl_u8(min_rgba))), 3),
            vdupq_n_s16(7)));

    uint8x8_t indices_lo =
        GetAlphaIndices(vget_low_u8(pixels_alpha), dist, bias);
    uint8x8_t indices_hi =
        GetAlphaIndices(vget_high_u8(pixels_alpha), dist, bias);

    // Prepare the final block by combining the base alpha values and packing
    // the alpha indices.
    uint8x8_t max_min_alpha = vzip_u8(max_rgba, min_rgba).val[0];
    uint64x1_t indices = PackIndices<3>(vcombine_u8(indices_lo, indices_hi));
    uint64x1_t output =
        vorr_u64(vshl_n_u64(indices, 16),
                 vshr_n_u64(vreinterpret_u64_u8(max_min_alpha), 48));
    vst1_u64(reinterpret_cast<uint64_t*>(dst), output);
  }
}

template <typename T, bool OPAQUE, Quality QUALITY>
void CompressImage(const uint8_t* src, uint8_t* dst, int width, int height) {
  for (int y = 0; y < height; y += 4, src += width * 4 * 4) {
    for (int x = 0; x < width; x += 4) {
      // Load the four rows of pixels.
      uint8x16x4_t pixels_linear;
      pixels_linear.val[0] = vld1q_u8(src + (x + 0 * width) * 4);
      pixels_linear.val[1] = vld1q_u8(src + (x + 1 * width) * 4);
      pixels_linear.val[2] = vld1q_u8(src + (x + 2 * width) * 4);
      pixels_linear.val[3] = vld1q_u8(src + (x + 3 * width) * 4);

      // Transpose/scatter the red, green, blue and alpha channels into
      // separate registers.
      ALIGNAS(8) uint8_t block[64];
      vst1q_u8(block + 0 * 16, pixels_linear.val[0]);
      vst1q_u8(block + 1 * 16, pixels_linear.val[1]);
      vst1q_u8(block + 2 * 16, pixels_linear.val[2]);
      vst1q_u8(block + 3 * 16, pixels_linear.val[3]);
      uint8x16x4_t pixels_scattered = vld4q_u8(block);

      // We need the min and max values both to detect solid blocks and to
      // compute the base colors.
      uint8x8_t min_rgba = FoldRGBA<vec_ops::Min>(pixels_scattered);
      uint8x8_t max_rgba = FoldRGBA<vec_ops::Max>(pixels_scattered);

      if (!OPAQUE) {
        CompressAlphaBlock(dst, pixels_scattered.val[3], min_rgba, max_rgba);
        dst += 8;
      }

      CompressColorBlock<T, QUALITY>(dst, pixels_linear, pixels_scattered,
                                     min_rgba, max_rgba);
      dst += 8;
    }
  }
}

}  // namespace

void CompressATC_NEON(const uint8_t* src, uint8_t* dst, int width, int height) {
  CompressImage<TYPE_ATC_NEON, true, kQualityHigh>(src, dst, width, height);
}

void CompressATCIA_NEON(const uint8_t* src,
                        uint8_t* dst,
                        int width,
                        int height) {
  CompressImage<TYPE_ATC_NEON, false, kQualityHigh>(src, dst, width, height);
}

void CompressDXT1_NEON(const uint8_t* src,
                       uint8_t* dst,
                       int width,
                       int height) {
  CompressImage<TYPE_DXT_NEON, true, kQualityHigh>(src, dst, width, height);
}

void CompressDXT5_NEON(const uint8_t* src,
                       uint8_t* dst,
                       int width,
                       int height) {
  CompressImage<TYPE_DXT_NEON, false, kQualityHigh>(src, dst, width, height);
}
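
// Usage sketch (illustrative only): every entry point expects 4-byte-per-pixel
// input with width and height that are multiples of 4. ATC/DXT1 output is
// 8 bytes per 4x4 block (half a byte per pixel); ATCIA/DXT5 output is 16 bytes
// per block (one byte per pixel).
//
//   std::vector<uint8_t> compressed(width * height / 2);  // DXT1
//   CompressDXT1_NEON(pixels, compressed.data(), width, height);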

}  // namespace texture_compress
}  // namespace cc

#endif  // __ARM_NEON__