OLD | NEW |
(Empty) | |
| 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 |
| 5 #include "texture_compressor_etc1_sse.h" |
| 6 |
| 7 #include <assert.h> |
| 8 #include <emmintrin.h> |
| 9 #include <stdio.h> |
| 10 #include <stdlib.h> |
| 11 #include <string.h> |
| 12 #include <time.h> |
| 13 #include <unistd.h> |
| 14 |
| 15 #include <cmath> |
| 16 #include <limits> |
| 17 #include <sstream> |
| 18 |
| 19 #include "base/compiler_specific.h" |
| 20 #include "base/logging.h" |
| 21 |
| 22 // Defining the following macro will cause the error metric function to weigh |
| 23 // each color channel differently depending on how the human eye can perceive |
| 24 // them. This can give a slight improvement in image quality at the cost of a |
| 25 // performance hit. |
| 26 // #define USE_PERCEIVED_ERROR_METRIC |
| 27 |
| 28 namespace { |
| 29 |
// Clamps |val| to the inclusive range [min, max].
template <typename T>
inline T clamp(T val, T min, T max) {
  if (val < min)
    return min;
  if (val > max)
    return max;
  return val;
}
| 34 |
| 35 inline uint8_t round_to_5_bits(float val) { |
| 36 return clamp<uint8_t>(val * 31.0f / 255.0f + 0.5f, 0, 31); |
| 37 } |
| 38 |
| 39 inline uint8_t round_to_4_bits(float val) { |
| 40 return clamp<uint8_t>(val * 15.0f / 255.0f + 0.5f, 0, 15); |
| 41 } |
| 42 |
// A 32-bit BGRA pixel viewable as named channels, as a 4-byte array, or as
// one 32-bit word. NOTE(review): reading a union member other than the one
// last written is formally UB in C++ (it is allowed in C); in practice it
// acts as a byte-level reinterpretation on the compilers this file targets.
// The value of |bits| also depends on byte order — presumably a
// little-endian target is assumed; confirm before relying on it.
union Color {
  struct BgraColorType {
    uint8_t b;  // Byte 0.
    uint8_t g;  // Byte 1.
    uint8_t r;  // Byte 2.
    uint8_t a;  // Byte 3.
  } channels;
  uint8_t components[4];
  uint32_t bits;
};
| 53 |
/*
 * Codeword tables.
 * See: Table 3.17.2
 *
 * Row k holds the four luminance modifiers of ETC1 codeword table k; each
 * modifier is added to a sub-block's base color to form one of the four
 * candidate colors (see ComputeLuminanceSSE). The 16-byte alignment keeps
 * each row in a single aligned chunk.
 */
static const int16_t g_codeword_tables[8][4]
    __attribute__((aligned(16))) = {{-8, -2, 2, 8},
                                    {-17, -5, 5, 17},
                                    {-29, -9, 9, 29},
                                    {-42, -13, 13, 42},
                                    {-60, -18, 18, 60},
                                    {-80, -24, 24, 80},
                                    {-106, -33, 33, 106},
                                    {-183, -47, 47, 183}};

/*
 * Maps modifier indices to pixel index values.
 * See: Table 3.17.3
 */
static const uint8_t g_mod_to_pix[4] = {3, 2, 0, 1};
| 73 |
/*
 * The ETC1 specification index texels as follows:
 *
 * [a][e][i][m]     [ 0][ 4][ 8][12]
 * [b][f][j][n] <-> [ 1][ 5][ 9][13]
 * [c][g][k][o]     [ 2][ 6][10][14]
 * [d][h][l][p]     [ 3][ 7][11][15]
 *
 * However, when extracting sub blocks from BGRA data the natural array
 * indexing order ends up different:
 *
 * vertical0: [a][e][b][f]  horizontal0: [a][e][i][m]
 *            [c][g][d][h]               [b][f][j][n]
 * vertical1: [i][m][j][n]  horizontal1: [c][g][k][o]
 *            [k][o][l][p]               [d][h][l][p]
 *
 * In order to translate from the natural array indices in a sub block to the
 * indices (number) used by specification and hardware we use this table.
 * The row to use is selected in CompressBlock from the flip decision
 * (rows 0-1 for the vertical split, rows 2-3 for the horizontal split).
 */
static const uint8_t g_idx_to_num[4][8] = {
    {0, 4, 1, 5, 2, 6, 3, 7},        // Vertical block 0.
    {8, 12, 9, 13, 10, 14, 11, 15},  // Vertical block 1.
    {0, 4, 8, 12, 1, 5, 9, 13},      // Horizontal block 0.
    {2, 6, 10, 14, 3, 7, 11, 15}     // Horizontal block 1.
};
| 99 |
| 100 inline void WriteColors444(uint8_t* block, |
| 101 const Color& color0, |
| 102 const Color& color1) { |
| 103 /* 0, 1, 2 - for ARM */ |
| 104 block[2] = (color0.channels.r & 0xf0) | (color1.channels.r >> 4); |
| 105 block[1] = (color0.channels.g & 0xf0) | (color1.channels.g >> 4); |
| 106 block[0] = (color0.channels.b & 0xf0) | (color1.channels.b >> 4); |
| 107 } |
| 108 |
| 109 inline void WriteColors555(uint8_t* block, |
| 110 const Color& color0, |
| 111 const Color& color1) { |
| 112 // Table for conversion to 3-bit two complement format. |
| 113 static const uint8_t two_compl_trans_table[8] = { |
| 114 4, // -4 (100b) |
| 115 5, // -3 (101b) |
| 116 6, // -2 (110b) |
| 117 7, // -1 (111b) |
| 118 0, // 0 (000b) |
| 119 1, // 1 (001b) |
| 120 2, // 2 (010b) |
| 121 3, // 3 (011b) |
| 122 }; |
| 123 |
| 124 int16_t delta_r = |
| 125 static_cast<int16_t>(color1.channels.r >> 3) - (color0.channels.r >> 3); |
| 126 int16_t delta_g = |
| 127 static_cast<int16_t>(color1.channels.g >> 3) - (color0.channels.g >> 3); |
| 128 int16_t delta_b = |
| 129 static_cast<int16_t>(color1.channels.b >> 3) - (color0.channels.b >> 3); |
| 130 DCHECK(delta_r >= -4 && delta_r <= 3); |
| 131 DCHECK(delta_g >= -4 && delta_g <= 3); |
| 132 DCHECK(delta_b >= -4 && delta_b <= 3); |
| 133 |
| 134 /* 0, 1, 2 - for ARM */ |
| 135 block[2] = (color0.channels.r & 0xf8) | two_compl_trans_table[delta_r + 4]; |
| 136 block[1] = (color0.channels.g & 0xf8) | two_compl_trans_table[delta_g + 4]; |
| 137 block[0] = (color0.channels.b & 0xf8) | two_compl_trans_table[delta_b + 4]; |
| 138 } |
| 139 |
| 140 inline void WriteCodewordTable(uint8_t* block, |
| 141 uint8_t sub_block_id, |
| 142 uint8_t table) { |
| 143 DCHECK_LT(sub_block_id, 2); |
| 144 DCHECK_LT(table, 8); |
| 145 |
| 146 uint8_t shift = (2 + (3 - sub_block_id * 3)); |
| 147 block[3] &= ~(0x07 << shift); |
| 148 block[3] |= table << shift; |
| 149 } |
| 150 |
// ORs the 32-bit pixel-index payload into bytes 4-7 of |block|,
// most-significant byte first. The caller is expected to have zeroed those
// bytes (CompressBlock memsets the block before writing).
inline void WritePixelData(uint8_t* block, uint32_t pixel_data) {
  for (int i = 0; i < 4; ++i)
    block[4 + i] |= static_cast<uint8_t>(pixel_data >> (8 * (3 - i)));
}
| 157 |
// Writes the flip bit (bit 0 of byte 3), which selects between the
// vertical and horizontal sub-block split (see g_idx_to_num / CompressBlock).
inline void WriteFlip(uint8_t* block, bool flip) {
  block[3] = (block[3] & ~0x01) | (flip ? 0x01 : 0x00);
}
| 162 |
// Writes the differential-mode bit (bit 1 of byte 3): set when the base
// colors are stored as 555 + delta rather than two 444 colors.
inline void WriteDiff(uint8_t* block, bool diff) {
  block[3] = (block[3] & ~0x02) | (diff ? 0x02 : 0x00);
}
| 167 |
| 168 /** |
| 169 * Compress and rounds BGR888 into BGR444. The resulting BGR444 color is |
| 170 * expanded to BGR888 as it would be in hardware after decompression. The |
| 171 * actual 444-bit data is available in the four most significant bits of each |
| 172 * channel. |
| 173 */ |
| 174 inline Color MakeColor444(const float* bgr) { |
| 175 uint8_t b4 = round_to_4_bits(bgr[0]); |
| 176 uint8_t g4 = round_to_4_bits(bgr[1]); |
| 177 uint8_t r4 = round_to_4_bits(bgr[2]); |
| 178 Color bgr444; |
| 179 bgr444.channels.b = (b4 << 4) | b4; |
| 180 bgr444.channels.g = (g4 << 4) | g4; |
| 181 bgr444.channels.r = (r4 << 4) | r4; |
| 182 bgr444.channels.a = 0x44; /* added by Radu */ |
| 183 return bgr444; |
| 184 } |
| 185 |
| 186 /** |
| 187 * Compress and rounds BGR888 into BGR555. The resulting BGR555 color is |
| 188 * expanded to BGR888 as it would be in hardware after decompression. The |
| 189 * actual 555-bit data is available in the five most significant bits of each |
| 190 * channel. |
| 191 */ |
| 192 inline Color MakeColor555(const float* bgr) { |
| 193 uint8_t b5 = round_to_5_bits(bgr[0]); |
| 194 uint8_t g5 = round_to_5_bits(bgr[1]); |
| 195 uint8_t r5 = round_to_5_bits(bgr[2]); |
| 196 Color bgr555; |
| 197 bgr555.channels.b = (b5 << 3) | (b5 >> 2); |
| 198 bgr555.channels.g = (g5 << 3) | (g5 >> 2); |
| 199 bgr555.channels.r = (r5 << 3) | (r5 >> 2); |
| 200 bgr555.channels.a = 0x55; /* added by Radu */ |
| 201 return bgr555; |
| 202 } |
| 203 |
| 204 /** |
| 205 * Constructs a color from a given base color and luminance value. |
| 206 */ |
| 207 inline Color MakeColor(const Color& base, int16_t lum) { |
| 208 int b = static_cast<int>(base.channels.b) + lum; |
| 209 int g = static_cast<int>(base.channels.g) + lum; |
| 210 int r = static_cast<int>(base.channels.r) + lum; |
| 211 Color color; |
| 212 color.channels.b = static_cast<uint8_t>(clamp(b, 0, 255)); |
| 213 color.channels.g = static_cast<uint8_t>(clamp(g, 0, 255)); |
| 214 color.channels.r = static_cast<uint8_t>(clamp(r, 0, 255)); |
| 215 return color; |
| 216 } |
| 217 |
| 218 /** |
| 219 * Calculates the error metric for two colors. A small error signals that the |
| 220 * colors are similar to each other, a large error the signals the opposite. |
| 221 */ |
| 222 inline uint32_t GetColorError(const Color& u, const Color& v) { |
| 223 #ifdef USE_PERCEIVED_ERROR_METRIC |
| 224 float delta_b = static_cast<float>(u.channels.b) - v.channels.b; |
| 225 float delta_g = static_cast<float>(u.channels.g) - v.channels.g; |
| 226 float delta_r = static_cast<float>(u.channels.r) - v.channels.r; |
| 227 return static_cast<uint32_t>(0.299f * delta_b * delta_b + |
| 228 0.587f * delta_g * delta_g + |
| 229 0.114f * delta_r * delta_r); |
| 230 #else |
| 231 int delta_b = static_cast<int>(u.channels.b) - v.channels.b; |
| 232 int delta_g = static_cast<int>(u.channels.g) - v.channels.g; |
| 233 int delta_r = static_cast<int>(u.channels.r) - v.channels.r; |
| 234 return delta_b * delta_b + delta_g * delta_g + delta_r * delta_r; |
| 235 #endif |
| 236 } |
| 237 |
/**************** START OF SSE CODE ******/

// Working state for one 4x4 block during SSE compression.
// NOTE(review): identifiers with a leading double underscore are reserved
// for the implementation in C++; renaming would be a file-wide change.
struct __sse_data {
  /* raw data */
  uint8_t* block;
  /* 8 bit packed values */
  __m128i* packed;
  /* 32 bit zero extended values - 4x4 arrays: each of the three channel
   * pointers refers to four __m128i registers, one row of four channel
   * values zero-extended to 32 bits (filled by UnpackBlock). */
  __m128i* blue;
  __m128i* green;
  __m128i* red;
  // __m128i *alpha;
};

/* commonly used registers */
// Namespace-scope __m128i constants require dynamic initialization; they
// are loaded from memory at each use.
static const __m128i __sse_zero = _mm_set1_epi32(0);
static const __m128i __sse_max_int = _mm_set1_epi32(0x7FFFFFFF);
| 255 |
/**
 * Per-lane (x + y) clamped to [0, 255], over four 32-bit lanes.
 *
 * SSE2 has no 32-bit min/max (those are SSE4.1), so the 16-bit variants are
 * used. That is exact here because every lane value stays well inside the
 * int16_t range (a 0-255 base color plus a codeword in [-183, 183]): the
 * high 16 bits of each lane are then just the sign extension of the low 16
 * bits, and per-half 16-bit min/max yields the same lane result as a true
 * 32-bit min/max would.
 *
 * Changed: build the zero operand in-register with _mm_setzero_si128()
 * instead of loading the dynamically-initialized file-level global
 * __sse_zero.
 */
inline __m128i AddAndClamp(const __m128i x, const __m128i y) {
  static const __m128i color_max = _mm_set1_epi32(0xFF);
  return _mm_max_epi16(_mm_setzero_si128(),
                       _mm_min_epi16(_mm_add_epi32(x, y), color_max));
}
| 261 |
/**
 * Per-lane squared difference (x - y)^2 over four 32-bit lanes.
 *
 * Uses 16-bit sub/mul because _mm_mullo_epi32 is SSE4.1 (see the original
 * note below). This stays exact as long as both inputs hold values in
 * [0, 255] zero-extended into 32-bit lanes: the 16-bit subtraction then
 * produces the exact signed difference in the low half of each lane with a
 * zero high half, and the squared difference (at most 255^2 = 65025) fits
 * in the low 16 bits, so each 32-bit lane ends up holding the exact square.
 */
inline __m128i GetColorErrorSSE(const __m128i x, const __m128i y) {
  /* changed from _mm_mullo_epi32 to _mm_mullo_epi16 */
  __m128i ret = _mm_sub_epi16(x, y);
  return _mm_mullo_epi16(ret, ret);
}
| 267 |
/**
 * Lane-wise sum of the three per-channel error vectors (32-bit lanes).
 */
inline __m128i AddChannelError(const __m128i x,
                               const __m128i y,
                               const __m128i z) {
  const __m128i xy = _mm_add_epi32(x, y);
  return _mm_add_epi32(xy, z);
}
| 273 |
/**
 * Sum of squared channel errors of all 16 pixels against the vertical
 * sub-block averages. The single register in each of |blue_avg| /
 * |green_avg| / |red_avg| carries both averages, laid out by CompressBlock
 * so its lanes line up with each data row's left/right half.
 */
inline uint32_t GetVerticalError(const __sse_data* data,
                                 const __m128i* blue_avg,
                                 const __m128i* green_avg,
                                 const __m128i* red_avg) {
  __m128i error = __sse_zero;

#pragma unroll
  for (int i = 0; i < 4; i++) {
    error = _mm_add_epi32(error, GetColorErrorSSE(data->blue[i], blue_avg[0]));
    error =
        _mm_add_epi32(error, GetColorErrorSSE(data->green[i], green_avg[0]));
    error = _mm_add_epi32(error, GetColorErrorSSE(data->red[i], red_avg[0]));
  }

  // Horizontal add of the four lanes: 0x4E swaps the 64-bit halves, 0xB1
  // swaps adjacent 32-bit lanes; after both adds every lane holds the total.
  error = _mm_add_epi32(error, _mm_shuffle_epi32(error, 0x4E));
  error = _mm_add_epi32(error, _mm_shuffle_epi32(error, 0xB1));

  return _mm_cvtsi128_si32(error);
}
| 293 |
/**
 * Sum of squared channel errors of all 16 pixels against the horizontal
 * sub-block averages: rows 0-1 are compared against |*_avg|[0], rows 2-3
 * against |*_avg|[1].
 */
inline uint32_t GetHorizontalError(const __sse_data* data,
                                   const __m128i* blue_avg,
                                   const __m128i* green_avg,
                                   const __m128i* red_avg) {
  __m128i error = __sse_zero;
  int first_index, second_index;

#pragma unroll
  for (int i = 0; i < 2; i++) {
    first_index = 2 * i;
    second_index = first_index + 1;

    error = _mm_add_epi32(
        error, GetColorErrorSSE(data->blue[first_index], blue_avg[i]));
    error = _mm_add_epi32(
        error, GetColorErrorSSE(data->blue[second_index], blue_avg[i]));
    error = _mm_add_epi32(
        error, GetColorErrorSSE(data->green[first_index], green_avg[i]));
    error = _mm_add_epi32(
        error, GetColorErrorSSE(data->green[second_index], green_avg[i]));
    error = _mm_add_epi32(error,
                          GetColorErrorSSE(data->red[first_index], red_avg[i]));
    error = _mm_add_epi32(
        error, GetColorErrorSSE(data->red[second_index], red_avg[i]));
  }

  // Horizontal add of the four lanes (see GetVerticalError).
  error = _mm_add_epi32(error, _mm_shuffle_epi32(error, 0x4E));
  error = _mm_add_epi32(error, _mm_shuffle_epi32(error, 0xB1));
  return _mm_cvtsi128_si32(error);
}
| 324 |
/*
 * Computes the average BGR color of each of the four possible sub-blocks
 * (two vertical halves, two horizontal halves) of the 4x4 block.
 *
 * |output| layout (matches the comment in CompressBlock):
 *   [0..2]  vertical sub-block 0  (B, G, R)
 *   [3..5]  vertical sub-block 1
 *   [6..8]  horizontal sub-block 0
 *   [9..11] horizontal sub-block 1
 *
 * |__sse_use_diff|[0] / [1] are set to true when the vertical / horizontal
 * average pair, quantized to 5 bits, has per-channel deltas inside [-4, 3]
 * and can therefore be encoded in ETC1 differential mode.
 */
inline void GetAvgColors(const __sse_data* data,
                         float* output,
                         bool* __sse_use_diff) {
  __m128i sum[2], tmp;

  // TODO(radu.velea): _mm_avg_epu8 on packed data maybe

  /* get avg red */
  /* [S0 S0 S1 S1] */
  // Sn is the sum of lane pair n over two rows; shuffle 0xB1 swaps adjacent
  // 32-bit lanes so each lane ends up holding its pair's sum.
  sum[0] = _mm_add_epi32(data->red[0], data->red[1]);
  sum[0] = _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0xB1));

  /* [S2 S2 S3 S3] */
  sum[1] = _mm_add_epi32(data->red[2], data->red[3]);
  sum[1] = _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0xB1));

  // Horizontal averages: all 8 values of rows 0-1, resp. rows 2-3
  // (0x4E swaps the 64-bit halves to finish the horizontal add).
  float hred[2], vred[2];
  hred[0] = (_mm_cvtsi128_si32(
                _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0x4E)))) /
            8.0f;
  hred[1] = (_mm_cvtsi128_si32(
                _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0x4E)))) /
            8.0f;

  // Vertical averages: lane 0 of |tmp| holds S0+S2 (left halves of all
  // rows), lane 2 holds S1+S3 (right halves).
  tmp = _mm_add_epi32(sum[0], sum[1]);
  vred[0] = (_mm_cvtsi128_si32(tmp)) / 8.0f;
  vred[1] = (_mm_cvtsi128_si32(_mm_shuffle_epi32(tmp, 0x2))) / 8.0f;

  /* get avg green */
  /* [S0 S0 S1 S1] */
  sum[0] = _mm_add_epi32(data->green[0], data->green[1]);
  sum[0] = _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0xB1));

  /* [S2 S2 S3 S3] */
  sum[1] = _mm_add_epi32(data->green[2], data->green[3]);
  sum[1] = _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0xB1));

  float hgreen[2], vgreen[2];
  hgreen[0] = (_mm_cvtsi128_si32(
                  _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0x4E)))) /
              8.0f;
  hgreen[1] = (_mm_cvtsi128_si32(
                  _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0x4E)))) /
              8.0f;

  tmp = _mm_add_epi32(sum[0], sum[1]);
  vgreen[0] = (_mm_cvtsi128_si32(tmp)) / 8.0f;
  vgreen[1] = (_mm_cvtsi128_si32(_mm_shuffle_epi32(tmp, 0x2))) / 8.0f;

  /* get avg blue */
  /* [S0 S0 S1 S1] */
  sum[0] = _mm_add_epi32(data->blue[0], data->blue[1]);
  sum[0] = _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0xB1));

  /* [S2 S2 S3 S3] */
  sum[1] = _mm_add_epi32(data->blue[2], data->blue[3]);
  sum[1] = _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0xB1));

  float hblue[2], vblue[2];
  hblue[0] = (_mm_cvtsi128_si32(
                 _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0x4E)))) /
             8.0f;
  hblue[1] = (_mm_cvtsi128_si32(
                 _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0x4E)))) /
             8.0f;

  tmp = _mm_add_epi32(sum[0], sum[1]);
  vblue[0] = (_mm_cvtsi128_si32(tmp)) / 8.0f;
  vblue[1] = (_mm_cvtsi128_si32(_mm_shuffle_epi32(tmp, 0x2))) / 8.0f;

  /* TODO(radu.velea): return int's instead of floats */
  output[0] = vblue[0];
  output[1] = vgreen[0];
  output[2] = vred[0];

  output[3] = vblue[1];
  output[4] = vgreen[1];
  output[5] = vred[1];

  output[6] = hblue[0];
  output[7] = hgreen[0];
  output[8] = hred[0];

  output[9] = hblue[1];
  output[10] = hgreen[1];
  output[11] = hred[1];

  // Differential-mode feasibility: quantize each average to 5 bits (same
  // scale-and-round as round_to_5_bits, truncated by _mm_cvttps_epi32) and
  // require that (second sub-block - first sub-block) fits the 3-bit
  // two's-complement range [-4, 3] in every channel.
  // (sic: "threashhold" is a typo for "threshold" kept from the original.)
  __m128i threashhold_upper = _mm_set1_epi32(3);
  __m128i threashhold_lower = _mm_set1_epi32(-4);

  __m128 factor_v = _mm_set1_ps(31.0f / 255.0f);
  __m128 rounding_v = _mm_set1_ps(0.5f);
  __m128 h_avg_0 = _mm_set_ps(hblue[0], hgreen[0], hred[0], 0);
  __m128 h_avg_1 = _mm_set_ps(hblue[1], hgreen[1], hred[1], 0);

  __m128 v_avg_0 = _mm_set_ps(vblue[0], vgreen[0], vred[0], 0);
  __m128 v_avg_1 = _mm_set_ps(vblue[1], vgreen[1], vred[1], 0);

  h_avg_0 = _mm_mul_ps(h_avg_0, factor_v);
  h_avg_1 = _mm_mul_ps(h_avg_1, factor_v);
  v_avg_0 = _mm_mul_ps(v_avg_0, factor_v);
  v_avg_1 = _mm_mul_ps(v_avg_1, factor_v);

  h_avg_0 = _mm_add_ps(h_avg_0, rounding_v);
  h_avg_1 = _mm_add_ps(h_avg_1, rounding_v);
  v_avg_0 = _mm_add_ps(v_avg_0, rounding_v);
  v_avg_1 = _mm_add_ps(v_avg_1, rounding_v);

  __m128i h_avg_0i = _mm_cvttps_epi32(h_avg_0);
  __m128i h_avg_1i = _mm_cvttps_epi32(h_avg_1);

  __m128i v_avg_0i = _mm_cvttps_epi32(v_avg_0);
  __m128i v_avg_1i = _mm_cvttps_epi32(v_avg_1);

  h_avg_0i = _mm_sub_epi32(h_avg_1i, h_avg_0i);
  v_avg_0i = _mm_sub_epi32(v_avg_1i, v_avg_0i);

  // movemask is zero only when no lane violates the bound.
  __sse_use_diff[0] =
      (0 == _mm_movemask_epi8(_mm_cmplt_epi32(v_avg_0i, threashhold_lower)));
  __sse_use_diff[0] &=
      (0 == _mm_movemask_epi8(_mm_cmpgt_epi32(v_avg_0i, threashhold_upper)));

  __sse_use_diff[1] =
      (0 == _mm_movemask_epi8(_mm_cmplt_epi32(h_avg_0i, threashhold_lower)));
  __sse_use_diff[1] &=
      (0 == _mm_movemask_epi8(_mm_cmpgt_epi32(h_avg_0i, threashhold_upper)));
}
| 452 |
/*
 * For one sub-block, selects the codeword table and per-pixel modifier
 * indices that minimize the summed squared BGR error against the actual
 * pixels, then writes the table bits and the pixel-index bits into |block|.
 *
 * |base| is the sub-block's quantized average color, |idx_to_num_tab| the
 * g_idx_to_num row translating array order into spec texel numbers, and
 * |data| holds the channel registers with this sub-block's pixels in
 * entries [2 * sub_block_id] and [2 * sub_block_id + 1].
 */
void ComputeLuminanceSSE(uint8_t* block,
                         const Color& base,
                         const int sub_block_id,
                         const uint8_t* idx_to_num_tab,
                         const __sse_data* data) {
  uint8_t my_best_tbl_idx = 0;
  uint32_t my_best_error = 0x7FFFFFFF;
  // Only the row belonging to the currently-best table is ever filled in;
  // reads below always index with my_best_tbl_idx, so the other
  // (uninitialized) rows are never read.
  uint8_t my_best_mod_idx[8][8];  // [table][texel]

  const __m128i base_blue = _mm_set1_epi32(base.channels.b);
  const __m128i base_green = _mm_set1_epi32(base.channels.g);
  const __m128i base_red = _mm_set1_epi32(base.channels.r);

  __m128i test_red, test_blue, test_green, tmp, tmp_blue, tmp_green, tmp_red;
  __m128i block_error, mask;

  /* this will have the minimum errors for each 4 pixels */
  __m128i first_half_min;
  __m128i second_half_min;

  /* this will have the matching table index combo for each 4 pixels */
  __m128i first_half_pattern;
  __m128i second_half_pattern;

  const __m128i first_blue_data_block = data->blue[2 * sub_block_id];
  const __m128i first_green_data_block = data->green[2 * sub_block_id];
  const __m128i first_red_data_block = data->red[2 * sub_block_id];

  const __m128i second_blue_data_block = data->blue[2 * sub_block_id + 1];
  const __m128i second_green_data_block = data->green[2 * sub_block_id + 1];
  const __m128i second_red_data_block = data->red[2 * sub_block_id + 1];

  uint32_t min;

  for (unsigned int tbl_idx = 0; tbl_idx < 8; ++tbl_idx) {
    tmp = _mm_set_epi32(
        g_codeword_tables[tbl_idx][3], g_codeword_tables[tbl_idx][2],
        g_codeword_tables[tbl_idx][1], g_codeword_tables[tbl_idx][0]);

    // The four candidate colors for this table, one per 32-bit lane.
    test_blue = AddAndClamp(tmp, base_blue);
    test_green = AddAndClamp(tmp, base_green);
    test_red = AddAndClamp(tmp, base_red);

    first_half_min = __sse_max_int;
    second_half_min = __sse_max_int;

    first_half_pattern = __sse_zero;
    second_half_pattern = __sse_zero;

// Each shuffle constant encodes, in 2-bit fields, which candidate lane is
// routed to which pixel lane; across these four constants every pixel lane
// is paired with every one of the four candidates exactly once, so each
// pixel independently gets to try all four modifiers.
#pragma unroll
    for (uint8_t imm8 :
         {0x1B, 0x4E, 0xB1, 0xE4}) { /* important they are sorted ascending */
      switch (imm8) {
        case 0x1B:
          tmp_blue = _mm_shuffle_epi32(test_blue, 0x1B);
          tmp_green = _mm_shuffle_epi32(test_green, 0x1B);
          tmp_red = _mm_shuffle_epi32(test_red, 0x1B);
          break;
        case 0x4E:
          tmp_blue = _mm_shuffle_epi32(test_blue, 0x4E);
          tmp_green = _mm_shuffle_epi32(test_green, 0x4E);
          tmp_red = _mm_shuffle_epi32(test_red, 0x4E);
          break;
        case 0xB1:
          tmp_blue = _mm_shuffle_epi32(test_blue, 0xB1);
          tmp_green = _mm_shuffle_epi32(test_green, 0xB1);
          tmp_red = _mm_shuffle_epi32(test_red, 0xB1);
          break;
        case 0xE4:
          tmp_blue = _mm_shuffle_epi32(test_blue, 0xE4);
          tmp_green = _mm_shuffle_epi32(test_green, 0xE4);
          tmp_red = _mm_shuffle_epi32(test_red, 0xE4);
          break;
        default:
          tmp_blue = test_blue;
          tmp_green = test_green;
          tmp_red = test_red;
      }

      // Broadcast the shuffle constant so the winning lane can remember
      // which routing produced its minimum error.
      tmp = _mm_set1_epi32(imm8);

      block_error =
          AddChannelError(GetColorErrorSSE(tmp_blue, first_blue_data_block),
                          GetColorErrorSSE(tmp_green, first_green_data_block),
                          GetColorErrorSSE(tmp_red, first_red_data_block));

      /* save winning pattern */
      // max() works as "replace on strict improvement" only because the
      // imm8 values are visited in ascending order (see comment above).
      first_half_pattern = _mm_max_epi16(
          first_half_pattern,
          _mm_and_si128(tmp, _mm_cmpgt_epi32(first_half_min, block_error)));
      /* should use _mm_min_epi32(first_half_min, block_error); otherwise
       * performance penalty */
      mask = _mm_cmplt_epi32(block_error, first_half_min);
      first_half_min = _mm_add_epi32(_mm_and_si128(mask, block_error),
                                     _mm_andnot_si128(mask, first_half_min));

      /* Second part of the block */
      block_error =
          AddChannelError(GetColorErrorSSE(tmp_blue, second_blue_data_block),
                          GetColorErrorSSE(tmp_green, second_green_data_block),
                          GetColorErrorSSE(tmp_red, second_red_data_block));

      /* save winning pattern */
      second_half_pattern = _mm_max_epi16(
          second_half_pattern,
          _mm_and_si128(tmp, _mm_cmpgt_epi32(second_half_min, block_error)));
      /* should use _mm_min_epi32(second_half_min, block_error); otherwise
       * performance penalty */
      mask = _mm_cmplt_epi32(block_error, second_half_min);
      second_half_min = _mm_add_epi32(_mm_and_si128(mask, block_error),
                                      _mm_andnot_si128(mask, second_half_min));
    }

    // Total error for this table: sum the 8 per-pixel minima.
    first_half_min = _mm_add_epi32(first_half_min, second_half_min);
    first_half_min =
        _mm_add_epi32(first_half_min, _mm_shuffle_epi32(first_half_min, 0x4E));
    first_half_min =
        _mm_add_epi32(first_half_min, _mm_shuffle_epi32(first_half_min, 0xB1));

    min = _mm_cvtsi128_si32(first_half_min);

    if (min < my_best_error) {
      my_best_tbl_idx = tbl_idx;
      my_best_error = min;
// NOTE(review): this conditional path uses _mm_extract_epi32, an SSE4.1
// intrinsic, inside an otherwise SSE2 file, and duplicates the scalar
// extraction done unconditionally below. Dead unless O3_OPTIMIZATION is
// defined.
#if O3_OPTIMIZATION
#pragma unroll
      for (int i = 0; i < 4; i++) {
        my_best_mod_idx[tbl_idx][i] =
            (_mm_extract_epi32(first_half_pattern, i) >> (2 * i)) & 3;
        my_best_mod_idx[tbl_idx][i + 4] =
            (_mm_extract_epi32(second_half_pattern, i) >> (2 * i)) & 3;
      }
#endif
      // _mm_shuffle_epi32
      // Decode the stored shuffle constant: lane i's chosen candidate index
      // is (pattern_lane_i >> (2 * i)) & 3.
      my_best_mod_idx[tbl_idx][0] =
          (_mm_cvtsi128_si32(first_half_pattern) >> (0)) & 3;
      my_best_mod_idx[tbl_idx][4] =
          (_mm_cvtsi128_si32(second_half_pattern) >> (0)) & 3;

      my_best_mod_idx[tbl_idx][1] =
          (_mm_cvtsi128_si32(_mm_shuffle_epi32(first_half_pattern, 0x1)) >>
           (2)) &
          3;
      my_best_mod_idx[tbl_idx][5] =
          (_mm_cvtsi128_si32(_mm_shuffle_epi32(second_half_pattern, 0x1)) >>
           (2)) &
          3;

      my_best_mod_idx[tbl_idx][2] =
          (_mm_cvtsi128_si32(_mm_shuffle_epi32(first_half_pattern, 0x2)) >>
           (4)) &
          3;
      my_best_mod_idx[tbl_idx][6] =
          (_mm_cvtsi128_si32(_mm_shuffle_epi32(second_half_pattern, 0x2)) >>
           (4)) &
          3;

      my_best_mod_idx[tbl_idx][3] =
          (_mm_cvtsi128_si32(_mm_shuffle_epi32(first_half_pattern, 0x3)) >>
           (6)) &
          3;
      my_best_mod_idx[tbl_idx][7] =
          (_mm_cvtsi128_si32(_mm_shuffle_epi32(second_half_pattern, 0x3)) >>
           (6)) &
          3;

      // A perfect match cannot be beaten; stop scanning tables.
      if (my_best_error == 0) {
        break;
      }
    }
  }

  WriteCodewordTable(block, sub_block_id, my_best_tbl_idx);

  uint32_t pix_data = 0;
  uint8_t mod_idx;
  uint8_t pix_idx;
  uint32_t lsb;
  uint32_t msb;
  int texel_num;

  // Translate each winning modifier index into the spec's pixel-index bits:
  // the msb plane lives in bits 16-31 of the payload, the lsb plane in
  // bits 0-15, addressed by texel number.
  for (unsigned int i = 0; i < 8; ++i) {
    mod_idx = my_best_mod_idx[my_best_tbl_idx][i];
    pix_idx = g_mod_to_pix[mod_idx];

    lsb = pix_idx & 0x1;
    msb = pix_idx >> 1;

    // Obtain the texel number as specified in the standard.
    texel_num = idx_to_num_tab[i];
    pix_data |= msb << (texel_num + 16);
    pix_data |= lsb << (texel_num);
  }

  WritePixelData(block, pix_data);
}
| 649 |
/*
 * Compresses one 4x4 block (channels pre-split into |data|) into the
 * 8-byte ETC1 block |dst|.
 *
 * Steps: average every candidate sub-block split; pick 555-differential or
 * 444-individual base-color encoding per split; choose the split (flip)
 * with the smaller squared error; write the header bits and base colors;
 * regroup the channel registers so each chosen sub-block occupies two
 * registers; finally pick codeword tables and pixel indices per sub-block.
 * Note: mutates data->blue/green/red in the non-flipped path.
 */
void CompressBlock(uint8_t* dst, __sse_data* data) {
  /* first 3 vertical 1, seconds 3 vertical 2, third 3 horizontal 1, last 3
   * horizontal 2 */
  float __sse_avg_colors[12] = {
      0,
  };
  bool use_differential[2] = {true, true};
  GetAvgColors(data, __sse_avg_colors, use_differential);
  Color sub_block_avg[4];

  /* TODO(radu.velea): remove floating point operations and use only int's +
   * normal
   * rounding and shifts */
  // sub_block_avg[0..1] = vertical pair, [2..3] = horizontal pair, each
  // quantized in whichever mode its pair supports.
  for (int i = 0, j = 1; i < 4; i += 2, j += 2) {
    if (use_differential[i / 2] == false) {
      sub_block_avg[i] = MakeColor444(&__sse_avg_colors[i * 3]);
      sub_block_avg[j] = MakeColor444(&__sse_avg_colors[j * 3]);
    } else {
      sub_block_avg[i] = MakeColor555(&__sse_avg_colors[i * 3]);
      sub_block_avg[j] = MakeColor555(&__sse_avg_colors[j * 3]);
    }
  }

  __m128i red_avg[2], green_avg[2], blue_avg[2];

  // TODO(radu.velea): perfect accuracy, maybe skip floating variables
  // Vertical averages: lanes (0,1) carry sub-block 0's average, lanes (2,3)
  // sub-block 1's, matching each data row's left/right half. The (int)
  // casts truncate — see the accuracy TODO above.
  blue_avg[0] =
      _mm_set_epi32((int)__sse_avg_colors[3], (int)__sse_avg_colors[3],
                    (int)__sse_avg_colors[0], (int)__sse_avg_colors[0]);

  green_avg[0] =
      _mm_set_epi32((int)__sse_avg_colors[4], (int)__sse_avg_colors[4],
                    (int)__sse_avg_colors[1], (int)__sse_avg_colors[1]);

  red_avg[0] =
      _mm_set_epi32((int)__sse_avg_colors[5], (int)__sse_avg_colors[5],
                    (int)__sse_avg_colors[2], (int)__sse_avg_colors[2]);

  uint32_t vertical_error =
      GetVerticalError(data, blue_avg, green_avg, red_avg);

  // TODO(radu.velea): perfect accuracy, maybe skip floating variables
  // Horizontal averages: one broadcast register per sub-block (rows 0-1 and
  // rows 2-3).
  blue_avg[0] = _mm_set1_epi32((int)__sse_avg_colors[6]);
  blue_avg[1] = _mm_set1_epi32((int)__sse_avg_colors[9]);

  green_avg[0] = _mm_set1_epi32((int)__sse_avg_colors[7]);
  green_avg[1] = _mm_set1_epi32((int)__sse_avg_colors[10]);

  red_avg[0] = _mm_set1_epi32((int)__sse_avg_colors[8]);
  red_avg[1] = _mm_set1_epi32((int)__sse_avg_colors[11]);

  uint32_t horizontal_error =
      GetHorizontalError(data, blue_avg, green_avg, red_avg);

  bool flip = horizontal_error < vertical_error;

  // Clear destination buffer so that we can "or" in the results.
  memset(dst, 0, 8);

  WriteDiff(dst, use_differential[!!flip]);
  WriteFlip(dst, flip);

  // flip selects the horizontal pair (sub_block_avg[2..3], g_idx_to_num
  // rows 2-3); otherwise the vertical pair is used.
  uint8_t sub_block_off_0 = flip ? 2 : 0;
  uint8_t sub_block_off_1 = sub_block_off_0 + 1;

  if (use_differential[!!flip]) {
    WriteColors555(dst, sub_block_avg[sub_block_off_0],
                   sub_block_avg[sub_block_off_1]);
  } else {
    WriteColors444(dst, sub_block_avg[sub_block_off_0],
                   sub_block_avg[sub_block_off_1]);
  }

  if (flip == false) {
    /* transpose vertical data into horizontal lines */
    // ComputeLuminanceSSE expects each sub-block in two consecutive
    // registers. For the vertical split the sub-blocks are the left/right
    // halves of every row, so the halves are regrouped here:
    // _mm_move_epi64 zeroes the upper 64 bits of a register, so each
    // _mm_add_epi32 below simply merges two disjoint half-rows. After the
    // loop plus the row swap that follows, registers 0-1 hold the left
    // halves (sub-block 0) and registers 2-3 the right halves.
    __m128i tmp;
#pragma unroll
    for (int i = 0; i < 4; i += 2) {
      tmp = data->blue[i];
      data->blue[i] = _mm_add_epi32(
          _mm_move_epi64(data->blue[i]),
          _mm_shuffle_epi32(_mm_move_epi64(data->blue[i + 1]), 0x4E));
      data->blue[i + 1] = _mm_add_epi32(
          _mm_move_epi64(_mm_shuffle_epi32(tmp, 0x4E)),
          _mm_shuffle_epi32(
              _mm_move_epi64(_mm_shuffle_epi32(data->blue[i + 1], 0x4E)),
              0x4E));

      tmp = data->green[i];
      data->green[i] = _mm_add_epi32(
          _mm_move_epi64(data->green[i]),
          _mm_shuffle_epi32(_mm_move_epi64(data->green[i + 1]), 0x4E));
      data->green[i + 1] = _mm_add_epi32(
          _mm_move_epi64(_mm_shuffle_epi32(tmp, 0x4E)),
          _mm_shuffle_epi32(
              _mm_move_epi64(_mm_shuffle_epi32(data->green[i + 1], 0x4E)),
              0x4E));

      tmp = data->red[i];
      data->red[i] = _mm_add_epi32(
          _mm_move_epi64(data->red[i]),
          _mm_shuffle_epi32(_mm_move_epi64(data->red[i + 1]), 0x4E));
      data->red[i + 1] = _mm_add_epi32(
          _mm_move_epi64(_mm_shuffle_epi32(tmp, 0x4E)),
          _mm_shuffle_epi32(
              _mm_move_epi64(_mm_shuffle_epi32(data->red[i + 1], 0x4E)), 0x4E));
    }

    tmp = data->blue[1];
    data->blue[1] = data->blue[2];
    data->blue[2] = tmp;

    tmp = data->green[1];
    data->green[1] = data->green[2];
    data->green[2] = tmp;

    tmp = data->red[1];
    data->red[1] = data->red[2];
    data->red[2] = tmp;
  }

  // Compute luminance for the first sub block.
  ComputeLuminanceSSE(dst, sub_block_avg[sub_block_off_0], 0,
                      g_idx_to_num[sub_block_off_0], data);
  // Compute luminance for the second sub block.
  ComputeLuminanceSSE(dst, sub_block_avg[sub_block_off_1], 1,
                      g_idx_to_num[sub_block_off_1], data);
}
| 778 |
// Copies a 4x4 block of 4-byte pixels out of an image |width| pixels wide
// into the contiguous 64-byte buffer |dst|, one 16-byte row at a time.
static void LegacyExtractBlock(uint8_t* dst, const uint8_t* src, int width) {
  const int kRowBytes = 4 * 4;          // 4 pixels per row, 4 bytes each.
  const int src_stride = width * 4;     // Source row pitch in bytes.
  for (int row = 0; row < 4; ++row)
    memcpy(dst + row * kRowBytes, src + row * src_stride, kRowBytes);
}
| 785 |
/*
 * Loads the 64-byte 4x4 pixel block and byte-transposes it so each output
 * register holds one byte plane of all 16 pixels: transposed[k] gathers
 * byte k of every 4-byte pixel (see the final layout in the trailing
 * comments). The caller then feeds these planes to UnpackBlock.
 */
inline void TransposeBlock(uint8_t* block, __m128i* transposed /* [4] */) {
  __m128i tmp3, tmp2, tmp1, tmp0;

  transposed[0] = _mm_loadu_si128((__m128i*)(block));  // a0,a1,a2,...a7, ...a15
  transposed[1] =
      _mm_loadu_si128((__m128i*)(block + 16));  // b0, b1,b2,...b7.... b15
  transposed[2] =
      _mm_loadu_si128((__m128i*)(block + 32));  // c0, c1,c2,...c7....c15
  transposed[3] =
      _mm_loadu_si128((__m128i*)(block + 48));  // d0,d1,d2,...d7....d15

  tmp0 = _mm_unpacklo_epi8(
      transposed[0], transposed[1]);  // a0,b0, a1,b1, a2,b2, a3,b3,....a7,b7
  tmp1 = _mm_unpacklo_epi8(
      transposed[2], transposed[3]);  // c0,d0, c1,d1, c2,d2, c3,d3,... c7,d7
  tmp2 = _mm_unpackhi_epi8(
      transposed[0],
      transposed[1]);  // a8,b8, a9,b9, a10,b10, a11,b11,...a15,b15
  tmp3 = _mm_unpackhi_epi8(
      transposed[2],
      transposed[3]);  // c8,d8, c9,d9, c10,d10, c11,d11,...c15,d15

  transposed[0] = _mm_unpacklo_epi8(
      tmp0, tmp2);  // a0,a8, b0,b8, a1,a9, b1,b9, ....a3,a11, b3,b11
  transposed[1] = _mm_unpackhi_epi8(
      tmp0, tmp2);  // a4,a12, b4,b12, a5,a13, b5,b13,....a7,a15,b7,b15
  transposed[2] =
      _mm_unpacklo_epi8(tmp1, tmp3);  // c0,c8, d0,d8, c1,c9, d1,d9.....d3,d11
  transposed[3] = _mm_unpackhi_epi8(
      tmp1, tmp3);  // c4,c12,d4,d12, c5,c13, d5,d13,....d7,d15

  tmp0 = _mm_unpacklo_epi32(transposed[0], transposed[2]);  // a0,a8, b0,b8,
                                                            // c0,c8, d0,d8,
                                                            // a1,a9, b1,b9,
                                                            // c1,c9, d1,d9
  tmp1 = _mm_unpackhi_epi32(transposed[0], transposed[2]);  // a2,a10, b2,b10,
                                                            // c2,c10, d2,d10,
                                                            // a3,a11, b3,b11,
                                                            // c3,c11, d3,d11
  tmp2 = _mm_unpacklo_epi32(transposed[1], transposed[3]);  // a4,a12, b4,b12,
                                                            // c4,c12, d4,d12,
                                                            // a5,a13, b5,b13,
                                                            // c5,c13, d5,d13,
  tmp3 = _mm_unpackhi_epi32(transposed[1],
                            transposed[3]);  // a6,a14, b6,b14, c6,c14, d6,d14,
                                             // a7,a15,b7,b15,c7,c15,d7,d15

  transposed[0] = _mm_unpacklo_epi8(tmp0, tmp2);  // a0,a4, a8, a12, b0,b4,
                                                  // b8,b12, c0,c4, c8, c12,
                                                  // d0,d4, d8, d12
  transposed[1] = _mm_unpackhi_epi8(tmp0, tmp2);  // a1,a5, a9, a13, b1,b5,
                                                  // b9,b13, c1,c5, c9, c13,
                                                  // d1,d5, d9, d13
  transposed[2] = _mm_unpacklo_epi8(tmp1, tmp3);  // a2,a6, a10,a14, b2,b6,
                                                  // b10,b14, c2,c6, c10,c14,
                                                  // d2,d6, d10,d14
  transposed[3] = _mm_unpackhi_epi8(tmp1, tmp3);  // a3,a7, a11,a15, b3,b7,
                                                  // b11,b15, c3,c7, c11,c15,
                                                  // d3,d7, d11,d15
}
| 846 |
// Widens one packed channel register (16 x uint8) into four registers of
// 4 x uint32 each by zero-extension: out[0] holds bytes 0-3, out[1] bytes
// 4-7, out[2] bytes 8-11 and out[3] bytes 12-15, in lane order.
inline void UnpackChannel(__m128i packed, __m128i* out) {
  const __m128i zero = _mm_setzero_si128();
  // First widen uint8 -> uint16, then uint16 -> uint32.
  const __m128i low = _mm_unpacklo_epi8(packed, zero);   // bytes 0-7 as u16
  const __m128i high = _mm_unpackhi_epi8(packed, zero);  // bytes 8-15 as u16
  out[0] = _mm_unpacklo_epi16(low, zero);
  out[1] = _mm_unpackhi_epi16(low, zero);
  out[2] = _mm_unpacklo_epi16(high, zero);
  out[3] = _mm_unpackhi_epi16(high, zero);
}

// Zero-extends the four channel-planar registers produced by the block
// transpose into per-channel arrays of four __m128i (32-bit lanes each).
// packed[0..3] feed |red|, |green|, |blue|, |alpha| in that order; the
// actual color meaning of each plane is determined by the caller's packing
// (the caller in this file passes blue/green/red swapped).
inline void UnpackBlock(__m128i* packed,
                        __m128i* red,
                        __m128i* green,
                        __m128i* blue,
                        __m128i* alpha) {
  UnpackChannel(packed[0], red);
  UnpackChannel(packed[1], green);
  UnpackChannel(packed[2], blue);
  UnpackChannel(packed[3], alpha);
}
| 895 |
// Returns 1 if every texel in the block has the same color, 0 otherwise.
// Only the first three channel planes (transposed[0..2]) are examined; the
// fourth plane is not part of the comparison.
inline int BlockIsConstant(const uint8_t* block, const __m128i* transposed) {
  for (int channel = 0; channel < 3; ++channel) {
    // Broadcast the first texel's value for this channel and compare it
    // against all 16 lanes of the channel plane at once.
    const __m128i reference = _mm_set1_epi8(block[channel]);
    const __m128i equal = _mm_cmpeq_epi8(transposed[channel], reference);
    // movemask gathers one bit per byte lane; 0xFFFF means all 16 matched.
    if (_mm_movemask_epi8(equal) != 0xFFFF)
      return 0;
  }
  return 1;
}
| 919 |
| 920 inline void CompressSolid(uint8_t* dst, uint8_t* block) { |
| 921 // Clear destination buffer so that we can "or" in the results. |
| 922 memset(dst, 0, 8); |
| 923 |
| 924 float src_color_float[3] = {static_cast<float>(block[0]), |
| 925 static_cast<float>(block[1]), |
| 926 static_cast<float>(block[2])}; |
| 927 Color base = MakeColor555(src_color_float); |
| 928 Color constant; |
| 929 constant.channels.b = block[0]; |
| 930 constant.channels.g = block[1]; |
| 931 constant.channels.r = block[2]; |
| 932 |
| 933 WriteDiff(dst, true); |
| 934 WriteFlip(dst, false); |
| 935 WriteColors555(dst, base, base); |
| 936 |
| 937 uint8_t best_tbl_idx = 0; |
| 938 uint8_t best_mod_idx = 0; |
| 939 uint32_t best_mod_err = std::numeric_limits<uint32_t>::max(); |
| 940 |
| 941 // Try all codeword tables to find the one giving the best results for this |
| 942 // block. |
| 943 for (unsigned int tbl_idx = 0; tbl_idx < 8; ++tbl_idx) { |
| 944 // Try all modifiers in the current table to find which one gives the |
| 945 // smallest error. |
| 946 for (unsigned int mod_idx = 0; mod_idx < 4; ++mod_idx) { |
| 947 int16_t lum = g_codeword_tables[tbl_idx][mod_idx]; |
| 948 const Color& color = MakeColor(base, lum); |
| 949 |
| 950 uint32_t mod_err = GetColorError(constant, color); |
| 951 if (mod_err < best_mod_err) { |
| 952 best_tbl_idx = tbl_idx; |
| 953 best_mod_idx = mod_idx; |
| 954 best_mod_err = mod_err; |
| 955 |
| 956 if (mod_err == 0) |
| 957 break; // We cannot do any better than this. |
| 958 } |
| 959 } |
| 960 |
| 961 if (best_mod_err == 0) |
| 962 break; |
| 963 } |
| 964 |
| 965 WriteCodewordTable(dst, 0, best_tbl_idx); |
| 966 WriteCodewordTable(dst, 1, best_tbl_idx); |
| 967 |
| 968 uint8_t pix_idx = g_mod_to_pix[best_mod_idx]; |
| 969 uint32_t lsb = pix_idx & 0x1; |
| 970 uint32_t msb = pix_idx >> 1; |
| 971 |
| 972 uint32_t pix_data = 0; |
| 973 for (unsigned int i = 0; i < 2; ++i) { |
| 974 for (unsigned int j = 0; j < 8; ++j) { |
| 975 // Obtain the texel number as specified in the standard. |
| 976 int texel_num = g_idx_to_num[i][j]; |
| 977 pix_data |= msb << (texel_num + 16); |
| 978 pix_data |= lsb << (texel_num); |
| 979 } |
| 980 } |
| 981 |
| 982 WritePixelData(dst, pix_data); |
| 983 } |
| 984 |
| 985 } // namespace |
| 986 |
| 987 namespace cc { |
| 988 |
| 989 void TextureCompressorETC1_SSE::Compress(const uint8_t* src, |
| 990 uint8_t* dst, |
| 991 int width, |
| 992 int height, |
| 993 Quality quality) { |
| 994 DCHECK(width >= 4 && (width & 3) == 0); |
| 995 DCHECK(height >= 4 && (height & 3) == 0); |
| 996 |
| 997 uint8_t block[64] __attribute__((aligned(16))); |
| 998 __m128i packed[4]; |
| 999 __m128i red[4], green[4], blue[4], alpha[4]; |
| 1000 __sse_data data; |
| 1001 |
| 1002 for (int y = 0; y < height; y += 4, src += width * 4 * 4) { |
| 1003 for (int x = 0; x < width; x += 4, dst += 8) { |
| 1004 /* SSE */ |
| 1005 LegacyExtractBlock(block, src + x * 4, width); |
| 1006 TransposeBlock(block, packed); |
| 1007 if (BlockIsConstant(block, packed) == 1) { |
| 1008 /* TODO(radu.velea): handle constant blocks in SSE */ |
| 1009 CompressSolid(dst, block); |
| 1010 } else { |
| 1011 UnpackBlock(packed, blue, green, red, alpha); |
| 1012 |
| 1013 data.block = block; |
| 1014 data.packed = packed; |
| 1015 data.red = red; |
| 1016 data.blue = blue; |
| 1017 data.green = green; |
| 1018 |
| 1019 CompressBlock(dst, &data); |
| 1020 } |
| 1021 } |
| 1022 } |
| 1023 } |
| 1024 |
| 1025 } // namespace cc |
OLD | NEW |