OLD | NEW |
(Empty) | |
| 1 // Copyright 2014 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 |
| 5 // See the following specification for details on the ETC1 format: |
| 6 // https://www.khronos.org/registry/gles/extensions/OES/OES_compressed_ETC1_RGB8
_texture.txt |
| 7 |
| 8 #ifdef __ARM_NEON__ |
| 9 |
| 10 #include "cc/resources/texture_compress/arm/etc1_neon.h" |
| 11 |
| 12 #include <arm_neon.h> |
| 13 #include <limits> |
| 14 |
| 15 #include "base/compiler_specific.h" |
| 16 #include "base/logging.h" |
| 17 |
| 18 // GCC 4.6 suffers from a bug, raising an internal error when mixing |
| 19 // interleaved load instructions with linear load instructions. By fiddling |
| 20 // with variable declaration order this problem can be avoided which is done |
| 21 // when the following macro is defined. |
| 22 #if (__GNUC__ == 4) && (__GNUC_MINOR__ == 6) |
| 23 #define GCC46_INTERNAL_ERROR_WORKAROUND |
| 24 #endif |
| 25 |
| 26 namespace { |
| 27 |
// Clamps |val| to the inclusive range [min, max].
template <typename T>
inline T clamp(T val, T min, T max) {
  if (val < min)
    return min;
  if (val > max)
    return max;
  return val;
}
| 32 |
// Rounds an intensity value in [0, 255] to the nearest 5-bit value.
// The clamp is performed on an int intermediate: the original clamped via
// clamp<uint8_t>, converting the float to uint8_t *before* clamping, which is
// undefined behavior for out-of-range inputs.
inline uint8_t round_to_5_bits(float val) {
  int quantized = static_cast<int>(val * 31.0f / 255.0f + 0.5f);
  if (quantized < 0)
    quantized = 0;
  if (quantized > 31)
    quantized = 31;
  return static_cast<uint8_t>(quantized);
}
| 36 |
// Rounds an intensity value in [0, 255] to the nearest 4-bit value.
// The clamp is performed on an int intermediate: the original clamped via
// clamp<uint8_t>, converting the float to uint8_t *before* clamping, which is
// undefined behavior for out-of-range inputs.
inline uint8_t round_to_4_bits(float val) {
  int quantized = static_cast<int>(val * 15.0f / 255.0f + 0.5f);
  if (quantized < 0)
    quantized = 0;
  if (quantized > 15)
    quantized = 15;
  return static_cast<uint8_t>(quantized);
}
| 40 |
| 41 } // namespace |
| 42 |
| 43 namespace etc1_neon { |
| 44 |
// One BGRA8888 pixel. The anonymous struct, the components[] array and the
// packed 32-bit value all alias the same four bytes, so a pixel can be
// addressed by channel name, by channel index (0 = b, 1 = g, 2 = r, 3 = a)
// or as a single 32-bit word.
union Color {
  struct {
    uint8_t b;
    uint8_t g;
    uint8_t r;
    uint8_t a;
  };
  uint8_t components[4];
  uint32_t bits;
};
| 55 |
/*
 * Codeword tables. Each row holds the four luminance modifiers of one table.
 * See: Table 3.17.2
 *
 * NOTE(review): this scalar table appears unreferenced in this file — only
 * the NEON-optimized variant below is used. Confirm whether it is kept
 * intentionally (e.g. for documentation or future scalar fallback).
 */
static const int16_t g_codeword_tables[8][4] = {{-8, -2, 2, 8},
                                                {-17, -5, 5, 17},
                                                {-29, -9, 9, 29},
                                                {-42, -13, 13, 42},
                                                {-60, -18, 18, 60},
                                                {-80, -24, 24, 80},
                                                {-106, -33, 33, 106},
                                                {-183, -47, 47, 183}};
| 68 |
/*
 * NEON optimized codeword tables.
 *
 * Each row is one codeword table; each group of four int16 values is one
 * luminance modifier replicated across the B, G and R lanes with the alpha
 * lane set to 0. It allows for a single table entry to be loaded into a
 * 64-bit register without duplication and with the alpha channel already
 * cleared.
 *
 * See: Table 3.17.2
 */
ALIGNAS(8) static const int16_t g_codeword_tables_neon_opt[8][16] = {
    {-8, -8, -8, 0, -2, -2, -2, 0, 2, 2, 2, 0, 8, 8, 8, 0},
    {-17, -17, -17, 0, -5, -5, -5, 0, 5, 5, 5, 0, 17, 17, 17, 0},
    {-29, -29, -29, 0, -9, -9, -9, 0, 9, 9, 9, 0, 29, 29, 29, 0},
    {-42, -42, -42, 0, -13, -13, -13, 0, 13, 13, 13, 0, 42, 42, 42, 0},
    {-60, -60, -60, 0, -18, -18, -18, 0, 18, 18, 18, 0, 60, 60, 60, 0},
    {-80, -80, -80, 0, -24, -24, -24, 0, 24, 24, 24, 0, 80, 80, 80, 0},
    {-106, -106, -106, 0, -33, -33, -33, 0, 33, 33, 33, 0, 106, 106, 106, 0},
    {-183, -183, -183, 0, -47, -47, -47, 0, 47, 47, 47, 0, 183, 183, 183, 0}};
| 86 |
/*
 * Maps modifier indices to pixel index values.
 *
 * The modifier index is the position of an entry within a codeword table
 * (sorted ascending); the pixel index is the 2-bit (msb, lsb) value that is
 * stored in the block for that entry.
 *
 * See: Table 3.17.3
 */
static const uint8_t g_mod_to_pix[4] = {3, 2, 0, 1};
| 92 |
/*
 * The ETC1 specification index texels as follows:
 *
 * [a][e][i][m]     [ 0][ 4][ 8][12]
 * [b][f][j][n] <-> [ 1][ 5][ 9][13]
 * [c][g][k][o]     [ 2][ 6][10][14]
 * [d][h][l][p]     [ 3][ 7][11][15]
 *
 * However, when extracting sub blocks from BGRA data the natural array
 * indexing order ends up different:
 *
 * vertical0: [a][b][c][d]  horizontal0: [a][e][i][m]
 *            [e][f][g][h]               [b][f][j][n]
 * vertical1: [i][j][k][l]  horizontal1: [c][g][k][o]
 *            [m][n][o][p]               [d][h][l][p]
 *
 * In order to translate from the natural array indices in a sub block to the
 * indices (numbers) used by specification and hardware we use this table.
 * Rows 0/1 are the two vertical sub blocks, rows 2/3 the two horizontal ones.
 *
 * NOTE: Since we can efficiently transpose matrices using NEON we end up with
 * near perfect indexing for vertical sub blocks.
 */
static const uint8_t g_idx_to_num[4][8] = {
    {0, 1, 2, 3, 4, 5, 6, 7},        // Vertical block 0.
    {8, 9, 10, 11, 12, 13, 14, 15},  // Vertical block 1.
    {0, 4, 8, 12, 1, 5, 9, 13},      // Horizontal block 0.
    {2, 6, 10, 14, 3, 7, 11, 15}     // Horizontal block 1.
};
| 121 |
| 122 inline void WriteColors444(uint8_t* block, |
| 123 const Color& color0, |
| 124 const Color& color1) { |
| 125 block[0] = (color0.r & 0xf0) | (color1.r >> 4); |
| 126 block[1] = (color0.g & 0xf0) | (color1.g >> 4); |
| 127 block[2] = (color0.b & 0xf0) | (color1.b >> 4); |
| 128 } |
| 129 |
| 130 inline void WriteColors555(uint8_t* block, |
| 131 const Color& color0, |
| 132 const Color& color1) { |
| 133 // Table for conversion to 3-bit two complement format. |
| 134 static const uint8_t two_compl_trans_table[8] = { |
| 135 4, // -4 (100b) |
| 136 5, // -3 (101b) |
| 137 6, // -2 (110b) |
| 138 7, // -1 (111b) |
| 139 0, // 0 (000b) |
| 140 1, // 1 (001b) |
| 141 2, // 2 (010b) |
| 142 3, // 3 (011b) |
| 143 }; |
| 144 |
| 145 int16_t delta_r = static_cast<int16_t>(color1.r >> 3) - (color0.r >> 3); |
| 146 int16_t delta_g = static_cast<int16_t>(color1.g >> 3) - (color0.g >> 3); |
| 147 int16_t delta_b = static_cast<int16_t>(color1.b >> 3) - (color0.b >> 3); |
| 148 DCHECK(delta_r >= -4 && delta_r <= 3); |
| 149 DCHECK(delta_g >= -4 && delta_g <= 3); |
| 150 DCHECK(delta_b >= -4 && delta_b <= 3); |
| 151 |
| 152 block[0] = (color0.r & 0xf8) | two_compl_trans_table[delta_r + 4]; |
| 153 block[1] = (color0.g & 0xf8) | two_compl_trans_table[delta_g + 4]; |
| 154 block[2] = (color0.b & 0xf8) | two_compl_trans_table[delta_b + 4]; |
| 155 } |
| 156 |
| 157 inline void WriteCodewordTable(uint8_t* block, |
| 158 uint8_t sub_block_id, |
| 159 uint8_t table) { |
| 160 DCHECK_LT(sub_block_id, 2); |
| 161 DCHECK_LT(table, 8); |
| 162 |
| 163 uint8_t shift = (2 + (3 - sub_block_id * 3)); |
| 164 block[3] &= ~(0x07 << shift); |
| 165 block[3] |= table << shift; |
| 166 } |
| 167 |
// ORs the 32 pixel-index bits into bytes 4..7 of |block|, most significant
// byte first. The destination bytes are assumed to start out zero.
inline void WritePixelData(uint8_t* block, uint32_t pixel_data) {
  for (int i = 0; i < 4; ++i) {
    block[4 + i] |= static_cast<uint8_t>(pixel_data >> (24 - 8 * i));
  }
}
| 174 |
// Stores the flip flag in bit 0 of byte 3 of |block|.
inline void WriteFlip(uint8_t* block, bool flip) {
  uint8_t cleared = block[3] & ~0x01;
  block[3] = cleared | (flip ? 0x01 : 0x00);
}
| 179 |
// Stores the differential-mode flag in bit 1 of byte 3 of |block|.
inline void WriteDiff(uint8_t* block, bool diff) {
  uint8_t cleared = block[3] & ~0x02;
  block[3] = cleared | (diff ? 0x02 : 0x00);
}
| 184 |
| 185 /** |
| 186 * Compress and rounds BGR888 into BGR444. The resulting BGR444 color is |
| 187 * expanded to BGR888 as it would be in hardware after decompression. The |
| 188 * actual 444-bit data is available in the four most significant bits of each |
| 189 * channel. |
| 190 */ |
| 191 inline Color MakeColor444(const float* bgr) { |
| 192 uint8_t b4 = round_to_4_bits(bgr[0]); |
| 193 uint8_t g4 = round_to_4_bits(bgr[1]); |
| 194 uint8_t r4 = round_to_4_bits(bgr[2]); |
| 195 Color bgr444; |
| 196 bgr444.b = (b4 << 4) | b4; |
| 197 bgr444.g = (g4 << 4) | g4; |
| 198 bgr444.r = (r4 << 4) | r4; |
| 199 return bgr444; |
| 200 } |
| 201 |
| 202 /** |
| 203 * Compress and rounds BGR888 into BGR555. The resulting BGR555 color is |
| 204 * expanded to BGR888 as it would be in hardware after decompression. The |
| 205 * actual 555-bit data is available in the five most significant bits of each |
| 206 * channel. |
| 207 */ |
| 208 inline Color MakeColor555(const float* bgr) { |
| 209 uint8_t b5 = round_to_5_bits(bgr[0]); |
| 210 uint8_t g5 = round_to_5_bits(bgr[1]); |
| 211 uint8_t r5 = round_to_5_bits(bgr[2]); |
| 212 Color bgr555; |
| 213 bgr555.b = (b5 << 3) | (b5 >> 2); |
| 214 bgr555.g = (g5 << 3) | (g5 >> 2); |
| 215 bgr555.r = (r5 << 3) | (r5 >> 2); |
| 216 return bgr555; |
| 217 } |
| 218 |
| 219 /** |
| 220 * Calculates the error metric for two colors. A small error signals that the |
| 221 * colors are similar to each other, a large error the signals the opposite. |
| 222 * |
| 223 * IMPORTANT: This function call has been inlined and NEON optimized in the |
| 224 * ComputeLuminance() function. The inlined version should be kept |
| 225 * in sync with this function implementation. |
| 226 */ |
| 227 inline uint32_t GetColorError(const Color& u, const Color& v) { |
| 228 int delta_b = static_cast<int>(u.b) - v.b; |
| 229 int delta_g = static_cast<int>(u.g) - v.g; |
| 230 int delta_r = static_cast<int>(u.r) - v.r; |
| 231 return delta_b * delta_b + delta_g * delta_g + delta_r * delta_r; |
| 232 } |
| 233 |
/**
 * Computes the average color of the 8 texels at |src| and stores the B, G and
 * R channel averages, in that order, as floats in |avg_color_bgr|. The alpha
 * channel is ignored.
 */
void GetAverageColor(const Color* src, float* avg_color_bgr) {
  const uint8_t* src_ptr = reinterpret_cast<const uint8_t*>(src);
#ifdef GCC46_INTERNAL_ERROR_WORKAROUND
  uint8x8x4_t src0;
  src0 = vld4_u8(src_ptr);
#else
  // De-interleaving load: val[0] holds the 8 blue bytes, val[1] green,
  // val[2] red and val[3] alpha.
  uint8x8x4_t src0 = vld4_u8(src_ptr);
#endif

  // Pairwise widening adds reduce the 8 bytes of each channel to one 64-bit
  // sum.
  uint64x1_t sum_b0 = vpaddl_u32(vpaddl_u16(vpaddl_u8(src0.val[0])));
  uint64x1_t sum_g0 = vpaddl_u32(vpaddl_u16(vpaddl_u8(src0.val[1])));
  uint64x1_t sum_r0 = vpaddl_u32(vpaddl_u16(vpaddl_u8(src0.val[2])));

  ALIGNAS(8) uint64_t sum_b, sum_g, sum_r;
  vst1_u64(&sum_b, sum_b0);
  vst1_u64(&sum_g, sum_g0);
  vst1_u64(&sum_r, sum_r0);

  // Multiply by the reciprocal instead of dividing by the texel count (8).
  const float kInv8 = 1.0f / 8.0f;
  avg_color_bgr[0] = static_cast<float>(sum_b) * kInv8;
  avg_color_bgr[1] = static_cast<float>(sum_g) * kInv8;
  avg_color_bgr[2] = static_cast<float>(sum_r) * kInv8;
}
| 257 |
/**
 * Finds the codeword table and per-texel modifier indices that best
 * approximate the 8 source texels of one sub block, then writes the chosen
 * table index and the per-texel pixel index bits into |block|.
 *
 * |src| points at the sub block's 8 texels (alpha must be zero), |base| is
 * the sub block's decoded base color, |sub_block_id| selects which half of
 * the block header is written (0 or 1) and |idx_to_num_tab| maps the natural
 * array index of each texel to the texel number used by the specification
 * (a row of g_idx_to_num).
 */
void ComputeLuminance(uint8_t* block,
                      const Color* src,
                      const Color& base,
                      int sub_block_id,
                      const uint8_t* idx_to_num_tab) {
  uint32_t best_tbl_err = std::numeric_limits<uint32_t>::max();
  uint8_t best_tbl_idx = 0;
  // Best modifier index found for each texel under each table, so the winning
  // table's indices can be written out after the search.
  uint8_t best_mod_idxs[8][8];  // [table][texel]

  // Load immutable data that is shared through iteration.
  ALIGNAS(8) const int16_t base_color_ptr[4] = {base.b, base.g, base.r, 0x00};
  int16x8_t base_color =
      vcombine_s16(vld1_s16(base_color_ptr), vld1_s16(base_color_ptr));

  // Lane numbers 0..3, used below to recover which lane held the minimum.
  ALIGNAS(8) const uint32_t idx_mask_ptr[4] = {0x00, 0x01, 0x02, 0x03};
  uint32x4_t idx_mask = vld1q_u32(idx_mask_ptr);

  // Preload source color registers.
  uint8x16_t src_color[8];
  for (unsigned int i = 0; i < 8; ++i) {
    DCHECK_EQ(src[i].a, 0x00);
    const uint32_t* src_ptr = reinterpret_cast<const uint32_t*>(&src[i]);
    // Duplicate the texel into all four 32-bit lanes so it can be compared
    // against all four candidate colors at once.
    src_color[i] = vreinterpretq_u8_u32(vld1q_dup_u32(src_ptr));
  }

  // Try all codeword tables to find the one giving the best results for this
  // block.
  for (unsigned int tbl_idx = 0; tbl_idx < 8; ++tbl_idx) {
    uint32_t tbl_err = 0;

    // For the current table, compute the candidate color: base + lum for all
    // four luminance entries.
    const int16_t* lum_ptr = g_codeword_tables_neon_opt[tbl_idx];
    int16x8_t lum01 = vld1q_s16(lum_ptr);
    int16x8_t lum23 = vld1q_s16(lum_ptr + 8);

    int16x8_t color01 = vaddq_s16(base_color, lum01);
    int16x8_t color23 = vaddq_s16(base_color, lum23);

    // Clamp the candidate colors to [0, 255].
    color01 = vminq_s16(color01, vdupq_n_s16(255));
    color01 = vmaxq_s16(color01, vdupq_n_s16(0));
    color23 = vminq_s16(color23, vdupq_n_s16(255));
    color23 = vmaxq_s16(color23, vdupq_n_s16(0));

    // Narrow back to bytes: all four candidate colors in one register.
    uint8x16_t candidate_color =
        vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(color01)),
                    vmovn_u16(vreinterpretq_u16_s16(color23)));

    for (unsigned int i = 0; i < 8; ++i) {
      // Compute the squared distance between the source and candidate colors.
      // This is the NEON-inlined equivalent of GetColorError() for all four
      // candidates at once.
      uint8x16_t diff = vabdq_u8(src_color[i], candidate_color);
      uint8x8_t diff01 = vget_low_u8(diff);
      uint8x8_t diff23 = vget_high_u8(diff);

      uint16x8_t square01 = vmull_u8(diff01, diff01);
      uint16x8_t square23 = vmull_u8(diff23, diff23);

      uint32x4_t psum01 = vpaddlq_u16(square01);
      uint32x4_t psum23 = vpaddlq_u16(square23);
      uint32x2_t err01 = vpadd_u32(vget_low_u32(psum01), vget_high_u32(psum01));
      uint32x2_t err23 = vpadd_u32(vget_low_u32(psum23), vget_high_u32(psum23));
      uint32x4_t errs = vcombine_u32(err01, err23);

      // Find the minimum error.
      uint32x2_t min_err = vpmin_u32(err01, err23);
      min_err = vpmin_u32(min_err, min_err);

      // Find the modifier index which produced the minimum error. This is
      // essentially the lane number of the lane containing the minimum error.
      uint32x4_t min_mask = vceqq_u32(vcombine_u32(min_err, min_err), errs);
      uint32x4_t idxs = vbslq_u32(min_mask, idx_mask, vdupq_n_u32(0xffffffff));

      uint32x2_t min_idx = vpmin_u32(vget_low_u32(idxs), vget_high_u32(idxs));
      min_idx = vpmin_u32(min_idx, min_idx);

      uint32_t best_mod_err = vget_lane_u32(min_err, 0);
      uint32_t best_mod_idx = vget_lane_u32(min_idx, 0);

      best_mod_idxs[tbl_idx][i] = best_mod_idx;

      tbl_err += best_mod_err;
      if (tbl_err > best_tbl_err)
        break;  // We're already doing worse than the best table so skip.
    }

    if (tbl_err < best_tbl_err) {
      best_tbl_err = tbl_err;
      best_tbl_idx = tbl_idx;

      if (tbl_err == 0)
        break;  // We cannot do any better than this.
    }
  }

  WriteCodewordTable(block, sub_block_id, best_tbl_idx);

  uint32_t pix_data = 0;

  for (unsigned int i = 0; i < 8; ++i) {
    uint8_t mod_idx = best_mod_idxs[best_tbl_idx][i];
    uint8_t pix_idx = g_mod_to_pix[mod_idx];

    uint32_t lsb = pix_idx & 0x1;
    uint32_t msb = pix_idx >> 1;

    // Obtain the texel number as specified in the standard.
    int texel_num = idx_to_num_tab[i];
    // MSBs live in the upper 16 bits of the pixel index word, LSBs in the
    // lower 16 bits.
    pix_data |= msb << (texel_num + 16);
    pix_data |= lsb << (texel_num);
  }

  WritePixelData(block, pix_data);
}
| 372 |
/**
 * Compress a solid, single colored block.
 *
 * Writes the block in differential mode with a zero delta (both sub blocks
 * share the same 555 base color) and picks the single codeword table entry
 * closest to |src| for all 16 texels.
 */
void CompressSolidBlock(uint8_t* dst, const Color& src) {
  // Clear destination buffer so that we can "or" in the results.
  memset(dst, 0, 8);

  float src_color_float[3] = {static_cast<float>(src.b),
                              static_cast<float>(src.g),
                              static_cast<float>(src.r)};
  Color base = MakeColor555(src_color_float);

  WriteDiff(dst, true);
  WriteFlip(dst, false);
  WriteColors555(dst, base, base);

  uint32_t best_tbl_err = std::numeric_limits<uint32_t>::max();
  uint8_t best_tbl_idx = 0;
  uint8_t best_mod_idx = 0;

  // Load immutable data that is shared through iteration.
  ALIGNAS(8) const int16_t base_color_ptr[4] = {base.b, base.g, base.r, 0x00};
  int16x8_t base_color =
      vcombine_s16(vld1_s16(base_color_ptr), vld1_s16(base_color_ptr));

  // Lane numbers 0..3, used below to recover which lane held the minimum.
  ALIGNAS(8) const uint32_t idx_mask_ptr[4] = {0x00, 0x01, 0x02, 0x03};
  uint32x4_t idx_mask = vld1q_u32(idx_mask_ptr);

  // Preload source color registers.
  DCHECK_EQ(src.a, 0x00);
  // Duplicate the single source texel into all four 32-bit lanes so it can be
  // compared against all four candidate colors at once.
  uint8x16_t src_color = vreinterpretq_u8_u32(
      vld1q_dup_u32(reinterpret_cast<const uint32_t*>(&src)));

  // Try all codeword tables to find the one giving the best results for this
  // block.
  for (unsigned int tbl_idx = 0; tbl_idx < 8; ++tbl_idx) {
    // For the current table, compute the candidate color: base + lum for all
    // four luminance entries.
    const int16_t* lum_ptr = g_codeword_tables_neon_opt[tbl_idx];
    int16x8_t lum01 = vld1q_s16(lum_ptr);
    int16x8_t lum23 = vld1q_s16(lum_ptr + 8);

    int16x8_t color01 = vaddq_s16(base_color, lum01);
    int16x8_t color23 = vaddq_s16(base_color, lum23);

    // Clamp the candidate colors to [0, 255].
    color01 = vminq_s16(color01, vdupq_n_s16(255));
    color01 = vmaxq_s16(color01, vdupq_n_s16(0));
    color23 = vminq_s16(color23, vdupq_n_s16(255));
    color23 = vmaxq_s16(color23, vdupq_n_s16(0));

    // Narrow back to bytes: all four candidate colors in one register.
    uint8x16_t candidate_color =
        vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(color01)),
                    vmovn_u16(vreinterpretq_u16_s16(color23)));

    // Compute the squared distance between the source and candidate colors.
    uint8x16_t diff = vabdq_u8(src_color, candidate_color);
    uint8x8_t diff01 = vget_low_u8(diff);
    uint8x8_t diff23 = vget_high_u8(diff);

    uint16x8_t square01 = vmull_u8(diff01, diff01);
    uint16x8_t square23 = vmull_u8(diff23, diff23);

    uint32x4_t psum01 = vpaddlq_u16(square01);
    uint32x4_t psum23 = vpaddlq_u16(square23);
    uint32x2_t err01 = vpadd_u32(vget_low_u32(psum01), vget_high_u32(psum01));
    uint32x2_t err23 = vpadd_u32(vget_low_u32(psum23), vget_high_u32(psum23));
    uint32x4_t errs = vcombine_u32(err01, err23);

    // Find the minimum error.
    uint32x2_t min_err = vpmin_u32(err01, err23);
    min_err = vpmin_u32(min_err, min_err);

    // Find the modifier index which produced the minimum error. This is
    // essentially the lane number of the lane containing the minimum error.
    uint32x4_t min_mask = vceqq_u32(vcombine_u32(min_err, min_err), errs);
    uint32x4_t idxs = vbslq_u32(min_mask, idx_mask, vdupq_n_u32(0xffffffff));

    uint32x2_t min_idx = vpmin_u32(vget_low_u32(idxs), vget_high_u32(idxs));
    min_idx = vpmin_u32(min_idx, min_idx);

    uint32_t cur_best_mod_err = vget_lane_u32(min_err, 0);
    uint32_t cur_best_mod_idx = vget_lane_u32(min_idx, 0);

    // All 16 texels share the same color, so the table error is simply the
    // best single-entry error.
    uint32_t tbl_err = cur_best_mod_err;
    if (tbl_err < best_tbl_err) {
      best_tbl_err = tbl_err;
      best_tbl_idx = tbl_idx;
      best_mod_idx = cur_best_mod_idx;

      if (tbl_err == 0)
        break;  // We cannot do any better than this.
    }
  }

  WriteCodewordTable(dst, 0, best_tbl_idx);
  WriteCodewordTable(dst, 1, best_tbl_idx);

  uint8_t pix_idx = g_mod_to_pix[best_mod_idx];
  uint32_t lsb = pix_idx & 0x1;
  uint32_t msb = pix_idx >> 1;

  // Every texel gets the same pixel index.
  uint32_t pix_data = 0;
  for (unsigned int i = 0; i < 2; ++i) {
    for (unsigned int j = 0; j < 8; ++j) {
      // Obtain the texel number as specified in the standard.
      int texel_num = g_idx_to_num[i][j];
      pix_data |= msb << (texel_num + 16);
      pix_data |= lsb << (texel_num);
    }
  }

  WritePixelData(dst, pix_data);
}
| 487 |
| 488 void CompressBlock(uint8_t* dst, const Color* ver_src, const Color* hor_src) { |
| 489 ALIGNAS(8) const Color* sub_block_src[4] = { |
| 490 ver_src, ver_src + 8, hor_src, hor_src + 8}; |
| 491 |
| 492 Color sub_block_avg[4]; |
| 493 bool use_differential[2] = {true, true}; |
| 494 |
| 495 // Compute the average color for each sub block and determine if differential |
| 496 // coding can be used. |
| 497 for (unsigned int i = 0, j = 1; i < 4; i += 2, j += 2) { |
| 498 float avg_color_0[3]; |
| 499 GetAverageColor(sub_block_src[i], avg_color_0); |
| 500 Color avg_color_555_0 = MakeColor555(avg_color_0); |
| 501 |
| 502 float avg_color_1[3]; |
| 503 GetAverageColor(sub_block_src[j], avg_color_1); |
| 504 Color avg_color_555_1 = MakeColor555(avg_color_1); |
| 505 |
| 506 for (unsigned int light_idx = 0; light_idx < 3; ++light_idx) { |
| 507 int u = avg_color_555_0.components[light_idx] >> 3; |
| 508 int v = avg_color_555_1.components[light_idx] >> 3; |
| 509 |
| 510 int component_diff = v - u; |
| 511 if (component_diff < -4 || component_diff > 3) { |
| 512 use_differential[i / 2] = false; |
| 513 sub_block_avg[i] = MakeColor444(avg_color_0); |
| 514 sub_block_avg[j] = MakeColor444(avg_color_1); |
| 515 } else { |
| 516 sub_block_avg[i] = avg_color_555_0; |
| 517 sub_block_avg[j] = avg_color_555_1; |
| 518 } |
| 519 } |
| 520 } |
| 521 |
| 522 // Compute the error of each sub block before adjusting for luminance. These |
| 523 // error values are later used for determining if we should flip the sub |
| 524 // block or not. |
| 525 uint32_t sub_block_err[4] = {0}; |
| 526 for (unsigned int i = 0; i < 4; ++i) { |
| 527 for (unsigned int j = 0; j < 8; ++j) { |
| 528 sub_block_err[i] += GetColorError(sub_block_avg[i], sub_block_src[i][j]); |
| 529 } |
| 530 } |
| 531 |
| 532 bool flip = |
| 533 sub_block_err[2] + sub_block_err[3] < sub_block_err[0] + sub_block_err[1]; |
| 534 |
| 535 // Clear destination buffer so that we can "or" in the results. |
| 536 memset(dst, 0, 8); |
| 537 |
| 538 WriteDiff(dst, use_differential[!!flip]); |
| 539 WriteFlip(dst, flip); |
| 540 |
| 541 uint8_t sub_block_off_0 = flip ? 2 : 0; |
| 542 uint8_t sub_block_off_1 = sub_block_off_0 + 1; |
| 543 |
| 544 if (use_differential[!!flip]) { |
| 545 WriteColors555(dst, sub_block_avg[sub_block_off_0], |
| 546 sub_block_avg[sub_block_off_1]); |
| 547 } else { |
| 548 WriteColors444(dst, sub_block_avg[sub_block_off_0], |
| 549 sub_block_avg[sub_block_off_1]); |
| 550 } |
| 551 |
| 552 // Compute luminance for the first sub block. |
| 553 ComputeLuminance(dst, sub_block_src[sub_block_off_0], |
| 554 sub_block_avg[sub_block_off_0], 0, |
| 555 g_idx_to_num[sub_block_off_0]); |
| 556 // Compute luminance for the second sub block. |
| 557 ComputeLuminance(dst, sub_block_src[sub_block_off_1], |
| 558 sub_block_avg[sub_block_off_1], 1, |
| 559 g_idx_to_num[sub_block_off_1]); |
| 560 } |
| 561 |
/**
 * Compresses a |width| x |height| image of 4-byte texels at |src| into ETC1
 * at |dst| (8 bytes per 4x4 block, blocks emitted in row-major order). Width
 * and height must each be at least 4 and a multiple of 4. The alpha channel
 * is cleared before compression and does not affect the output.
 */
void CompressTexture(const uint8_t* src, uint8_t* dst, int width, int height) {
  DCHECK(width >= 4 && (width & 3) == 0);
  DCHECK(height >= 4 && (height & 3) == 0);

  ALIGNAS(8) uint32_t ver_blocks[16];
  ALIGNAS(8) uint32_t hor_blocks[16];

  // Mask for clearing the alpha channel.
  ALIGNAS(8) const uint32_t clear_mask_ptr[4] = {
      0xff000000, 0xff000000, 0xff000000, 0xff000000};
  uint32x4_t clear_mask = vld1q_u32(clear_mask_ptr);

  // Advance |src| by four rows per outer iteration (4 bytes per texel).
  for (int y = 0; y < height; y += 4, src += width * 4 * 4) {
    for (int x = 0; x < width; x += 4, dst += 8) {
      const uint32_t* row0 = reinterpret_cast<const uint32_t*>(src + x * 4);
      const uint32_t* row1 = row0 + width;
      const uint32_t* row2 = row1 + width;
      const uint32_t* row3 = row2 + width;

#ifdef GCC46_INTERNAL_ERROR_WORKAROUND
      uint32x4x4_t block_transposed;
#endif
      ALIGNAS(8) uint32x4_t block[4];
      block[0] = vld1q_u32(row0);
      block[1] = vld1q_u32(row1);
      block[2] = vld1q_u32(row2);
      block[3] = vld1q_u32(row3);

      // Clear alpha channel.
      for (unsigned int i = 0; i < 4; ++i) {
        block[i] = vbicq_u32(block[i], clear_mask);
      }

      // Check if the block is solid by comparing every texel against the
      // first one (alpha ignored on both sides).
      uint32x4_t solid = vbicq_u32(vdupq_n_u32(*row0), clear_mask);

      uint16x4_t eq0 = vmovn_u32(vceqq_u32(block[0], solid));
      uint16x4_t eq1 = vmovn_u32(vceqq_u32(block[1], solid));
      uint16x4_t eq2 = vmovn_u32(vceqq_u32(block[2], solid));
      uint16x4_t eq3 = vmovn_u32(vceqq_u32(block[3], solid));
      uint16x4_t tst = vand_u16(vand_u16(eq0, eq1), vand_u16(eq2, eq3));

      // All 64 bits are set iff all 16 comparisons matched.
      ALIGNAS(8) uint64_t solid_block_tst_bits;
      vst1_u64(&solid_block_tst_bits, vreinterpret_u64_u16(tst));

      if (solid_block_tst_bits == 0xffffffffffffffff) {
        CompressSolidBlock(dst, *reinterpret_cast<const Color*>(row0));
        continue;
      }

      vst1q_u32(hor_blocks, block[0]);
      vst1q_u32(hor_blocks + 4, block[1]);
      vst1q_u32(hor_blocks + 8, block[2]);
      vst1q_u32(hor_blocks + 12, block[3]);

      // Texel ordering according to specification:
      // [ 0][ 4][ 8][12]
      // [ 1][ 5][ 9][13]
      // [ 2][ 6][10][14]
      // [ 3][ 7][11][15]
      //
      // To access the vertical blocks using C-style indexing we
      // transpose the block:
      // [ 0][ 1][ 2][ 3]
      // [ 4][ 5][ 6][ 7]
      // [ 8][ 9][10][11]
      // [12][13][14][15]
#ifdef GCC46_INTERNAL_ERROR_WORKAROUND
      block_transposed = vld4q_u32(hor_blocks);
#else
      // The de-interleaving load performs the transpose.
      uint32x4x4_t block_transposed = vld4q_u32(hor_blocks);
#endif

      vst1q_u32(ver_blocks, block_transposed.val[0]);
      vst1q_u32(ver_blocks + 4, block_transposed.val[1]);
      vst1q_u32(ver_blocks + 8, block_transposed.val[2]);
      vst1q_u32(ver_blocks + 12, block_transposed.val[3]);

      CompressBlock(dst, reinterpret_cast<const Color*>(ver_blocks),
                    reinterpret_cast<const Color*>(hor_blocks));
    }
  }
}
| 645 |
| 646 } // namespace etc1_neon |
| 647 |
| 648 #endif // __ARM_NEON__ |
OLD | NEW |