| OLD | NEW |
| (Empty) | |
| 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 |
| 5 #include "texture_compressor_etc1_sse.h" |
| 6 |
| 7 #include <assert.h> |
| 8 #include <emmintrin.h> |
| 9 #include <stdio.h> |
| 10 #include <stdlib.h> |
| 11 #include <string.h> |
| 12 #include <time.h> |
| 13 |
| 14 #include <cmath> |
| 15 #include <limits> |
| 16 #include <sstream> |
| 17 |
| 18 #include "base/compiler_specific.h" |
| 19 #include "base/logging.h" |
| 20 #include "cc/resources/texture_compressor_util.h" |
| 21 |
| 22 using namespace cc::texture_compress; |
| 23 |
| 24 namespace { |
| 25 |
/* Inflate an expected error to an early-out threshold: 1.5x the error plus a
 * constant slack of 384. Arguments are fully parenthesized so that expansion
 * is correct even for expressions containing low-precedence operators. */
#define ETC1_SET_ERROR(x) ((x) + (x) / 2 + 384)
| 27 |
/* Scratch view of one 4x4 texel block while it is being compressed.
 * Every pointer references caller-owned storage (see
 * TextureCompressorETC1SSE::Compress); this struct owns nothing. */
struct __sse_data {
  /* raw data */
  uint8_t* block;
  /* 8 bit packed values */
  __m128i* packed;
  /* 32 bit zero extended values - 4x4 arrays (one register per block row) */
  __m128i* blue;
  __m128i* green;
  __m128i* red;
  // __m128i *alpha;
};
| 39 |
/* commonly used registers */
/* All-zero lanes, and INT32_MAX in every lane (used as an "infinite"
 * initial error value in the search loops below). */
static const __m128i __sse_zero = _mm_set1_epi32(0);
static const __m128i __sse_max_int = _mm_set1_epi32(0x7FFFFFFF);
| 43 |
/* Per-lane (x + y) clamped to the displayable color range [0, 255].
 * The 16-bit add/min/max intrinsics are applied to 32-bit lanes; valid
 * inputs (zero-extended 8-bit colors plus codeword deltas) fit in the low 16
 * bits of each lane, and the max-with-zero step clamps negative sums and
 * clears any sign bits that a negative delta leaves in the upper half. */
inline __m128i AddAndClamp(const __m128i x, const __m128i y) {
  static const __m128i color_max = _mm_set1_epi32(0xFF);
  return _mm_max_epi16(__sse_zero,
                       _mm_min_epi16(_mm_add_epi16(x, y), color_max));
}
| 49 |
/* Squared per-lane difference: (x - y)^2 in each lane.
 * 16-bit multiplies are sufficient here (SSE2 has no 32-bit mullo): channel
 * values are small, so the squared difference fits in 16 bits and the upper
 * half of each 32-bit lane stays zero. */
inline __m128i GetColorErrorSSE(const __m128i x, const __m128i y) {
  const __m128i diff = _mm_sub_epi16(x, y);
  return _mm_mullo_epi16(diff, diff);
}
| 55 |
/* Sum of three per-lane 32-bit error vectors (one per color channel). */
inline __m128i AddChannelError(const __m128i x,
                               const __m128i y,
                               const __m128i z) {
  const __m128i xy = _mm_add_epi32(x, y);
  return _mm_add_epi32(xy, z);
}
| 61 |
/* Horizontal sum of the four 32-bit lanes of |x|. */
inline uint32_t SumSSE(const __m128i x) {
  /* Fold adjacent lanes first, then the two 64-bit halves; integer addition
   * wraps, so the fold order does not affect the result. */
  __m128i folded = _mm_add_epi32(x, _mm_shuffle_epi32(x, 0xB1));
  folded = _mm_add_epi32(folded, _mm_shuffle_epi32(folded, 0x4E));
  return _mm_cvtsi128_si32(folded);
}
| 68 |
| 69 inline uint32_t GetVerticalError(const __sse_data* data, |
| 70 const __m128i* blue_avg, |
| 71 const __m128i* green_avg, |
| 72 const __m128i* red_avg, |
| 73 uint32_t* verror) { |
| 74 __m128i error = __sse_zero; |
| 75 |
| 76 #pragma unroll |
| 77 for (int i = 0; i < 4; i++) { |
| 78 error = _mm_add_epi32(error, GetColorErrorSSE(data->blue[i], blue_avg[0])); |
| 79 error = |
| 80 _mm_add_epi32(error, GetColorErrorSSE(data->green[i], green_avg[0])); |
| 81 error = _mm_add_epi32(error, GetColorErrorSSE(data->red[i], red_avg[0])); |
| 82 } |
| 83 |
| 84 error = _mm_add_epi32(error, _mm_shuffle_epi32(error, 0x4E)); |
| 85 |
| 86 verror[0] = _mm_cvtsi128_si32(error); |
| 87 verror[1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(error, 0xB1)); |
| 88 |
| 89 return verror[0] + verror[1]; |
| 90 } |
| 91 |
| 92 inline uint32_t GetHorizontalError(const __sse_data* data, |
| 93 const __m128i* blue_avg, |
| 94 const __m128i* green_avg, |
| 95 const __m128i* red_avg, |
| 96 uint32_t* verror) { |
| 97 __m128i error = __sse_zero; |
| 98 int first_index, second_index; |
| 99 |
| 100 #pragma unroll |
| 101 for (int i = 0; i < 2; i++) { |
| 102 first_index = 2 * i; |
| 103 second_index = first_index + 1; |
| 104 |
| 105 error = _mm_add_epi32( |
| 106 error, GetColorErrorSSE(data->blue[first_index], blue_avg[i])); |
| 107 error = _mm_add_epi32( |
| 108 error, GetColorErrorSSE(data->blue[second_index], blue_avg[i])); |
| 109 error = _mm_add_epi32( |
| 110 error, GetColorErrorSSE(data->green[first_index], green_avg[i])); |
| 111 error = _mm_add_epi32( |
| 112 error, GetColorErrorSSE(data->green[second_index], green_avg[i])); |
| 113 error = _mm_add_epi32(error, |
| 114 GetColorErrorSSE(data->red[first_index], red_avg[i])); |
| 115 error = _mm_add_epi32( |
| 116 error, GetColorErrorSSE(data->red[second_index], red_avg[i])); |
| 117 } |
| 118 |
| 119 error = _mm_add_epi32(error, _mm_shuffle_epi32(error, 0x4E)); |
| 120 |
| 121 verror[0] = _mm_cvtsi128_si32(error); |
| 122 verror[1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(error, 0xB1)); |
| 123 |
| 124 return verror[0] + verror[1]; |
| 125 } |
| 126 |
/* Compute, in floating point, the average color of each of the four possible
 * ETC1 sub-blocks of the 4x4 block: the two halves of a vertical split
 * (left/right 2x4) and of a horizontal split (top/bottom 4x2).
 *
 * output layout (12 floats, blue/green/red order):
 *   [0..2]  vertical sub-block 0    [3..5]  vertical sub-block 1
 *   [6..8]  horizontal sub-block 0  [9..11] horizontal sub-block 1
 *
 * __sse_use_diff[0] (vertical) and [1] (horizontal) are set to true when the
 * corresponding pair of averages, quantized to 5 bits per channel, differ by
 * at most [-4, 3] per channel -- i.e. when ETC1 differential mode can encode
 * the pair. */
inline void GetAvgColors(const __sse_data* data,
                         float* output,
                         bool* __sse_use_diff) {
  __m128i sum[2], tmp;

  // TODO(radu.velea): _mm_avg_epu8 on packed data maybe

  /* get avg red */
  /* [S0 S0 S1 S1] -- adjacent-lane sums of rows 0+1 after the 0xB1 fold */
  sum[0] = _mm_add_epi32(data->red[0], data->red[1]);
  sum[0] = _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0xB1));

  /* [S2 S2 S3 S3] -- same for rows 2+3 */
  sum[1] = _mm_add_epi32(data->red[2], data->red[3]);
  sum[1] = _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0xB1));

  /* Each sub-block covers 8 texels, hence the division by 8. */
  float hred[2], vred[2];
  hred[0] = (_mm_cvtsi128_si32(
                _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0x4E)))) /
            8.0f;
  hred[1] = (_mm_cvtsi128_si32(
                _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0x4E)))) /
            8.0f;

  /* Lane 0 of |tmp| sums columns 0-1 over all rows (left sub-block); lane 2
   * (selected by the 0x2 shuffle) sums columns 2-3 (right sub-block). */
  tmp = _mm_add_epi32(sum[0], sum[1]);
  vred[0] = (_mm_cvtsi128_si32(tmp)) / 8.0f;
  vred[1] = (_mm_cvtsi128_si32(_mm_shuffle_epi32(tmp, 0x2))) / 8.0f;

  /* get avg green */
  /* [S0 S0 S1 S1] */
  sum[0] = _mm_add_epi32(data->green[0], data->green[1]);
  sum[0] = _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0xB1));

  /* [S2 S2 S3 S3] */
  sum[1] = _mm_add_epi32(data->green[2], data->green[3]);
  sum[1] = _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0xB1));

  float hgreen[2], vgreen[2];
  hgreen[0] = (_mm_cvtsi128_si32(
                  _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0x4E)))) /
              8.0f;
  hgreen[1] = (_mm_cvtsi128_si32(
                  _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0x4E)))) /
              8.0f;

  tmp = _mm_add_epi32(sum[0], sum[1]);
  vgreen[0] = (_mm_cvtsi128_si32(tmp)) / 8.0f;
  vgreen[1] = (_mm_cvtsi128_si32(_mm_shuffle_epi32(tmp, 0x2))) / 8.0f;

  /* get avg blue */
  /* [S0 S0 S1 S1] */
  sum[0] = _mm_add_epi32(data->blue[0], data->blue[1]);
  sum[0] = _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0xB1));

  /* [S2 S2 S3 S3] */
  sum[1] = _mm_add_epi32(data->blue[2], data->blue[3]);
  sum[1] = _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0xB1));

  float hblue[2], vblue[2];
  hblue[0] = (_mm_cvtsi128_si32(
                 _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0x4E)))) /
             8.0f;
  hblue[1] = (_mm_cvtsi128_si32(
                 _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0x4E)))) /
             8.0f;

  tmp = _mm_add_epi32(sum[0], sum[1]);
  vblue[0] = (_mm_cvtsi128_si32(tmp)) / 8.0f;
  vblue[1] = (_mm_cvtsi128_si32(_mm_shuffle_epi32(tmp, 0x2))) / 8.0f;

  /* TODO(radu.velea): return int's instead of floats */
  output[0] = vblue[0];
  output[1] = vgreen[0];
  output[2] = vred[0];

  output[3] = vblue[1];
  output[4] = vgreen[1];
  output[5] = vred[1];

  output[6] = hblue[0];
  output[7] = hgreen[0];
  output[8] = hred[0];

  output[9] = hblue[1];
  output[10] = hgreen[1];
  output[11] = hred[1];

  /* Differential mode stores the second base color as a signed 3-bit delta
   * per channel, so each 5-bit channel delta must lie in [-4, 3]. */
  __m128i threashhold_upper = _mm_set1_epi32(3);
  __m128i threashhold_lower = _mm_set1_epi32(-4);

  /* Quantize the averages to 5 bits per channel (x * 31/255, rounded to
   * nearest via +0.5 then truncation). */
  __m128 factor_v = _mm_set1_ps(31.0f / 255.0f);
  __m128 rounding_v = _mm_set1_ps(0.5f);
  __m128 h_avg_0 = _mm_set_ps(hblue[0], hgreen[0], hred[0], 0);
  __m128 h_avg_1 = _mm_set_ps(hblue[1], hgreen[1], hred[1], 0);

  __m128 v_avg_0 = _mm_set_ps(vblue[0], vgreen[0], vred[0], 0);
  __m128 v_avg_1 = _mm_set_ps(vblue[1], vgreen[1], vred[1], 0);

  h_avg_0 = _mm_mul_ps(h_avg_0, factor_v);
  h_avg_1 = _mm_mul_ps(h_avg_1, factor_v);
  v_avg_0 = _mm_mul_ps(v_avg_0, factor_v);
  v_avg_1 = _mm_mul_ps(v_avg_1, factor_v);

  h_avg_0 = _mm_add_ps(h_avg_0, rounding_v);
  h_avg_1 = _mm_add_ps(h_avg_1, rounding_v);
  v_avg_0 = _mm_add_ps(v_avg_0, rounding_v);
  v_avg_1 = _mm_add_ps(v_avg_1, rounding_v);

  __m128i h_avg_0i = _mm_cvttps_epi32(h_avg_0);
  __m128i h_avg_1i = _mm_cvttps_epi32(h_avg_1);

  __m128i v_avg_0i = _mm_cvttps_epi32(v_avg_0);
  __m128i v_avg_1i = _mm_cvttps_epi32(v_avg_1);

  /* Per-channel deltas between the two quantized sub-block averages. */
  h_avg_0i = _mm_sub_epi32(h_avg_1i, h_avg_0i);
  v_avg_0i = _mm_sub_epi32(v_avg_1i, v_avg_0i);

  /* Differential mode is usable iff no channel delta falls outside [-4, 3]:
   * both comparison masks must be empty. */
  __sse_use_diff[0] =
      (0 == _mm_movemask_epi8(_mm_cmplt_epi32(v_avg_0i, threashhold_lower)));
  __sse_use_diff[0] &=
      (0 == _mm_movemask_epi8(_mm_cmpgt_epi32(v_avg_0i, threashhold_upper)));

  __sse_use_diff[1] =
      (0 == _mm_movemask_epi8(_mm_cmplt_epi32(h_avg_0i, threashhold_lower)));
  __sse_use_diff[1] &=
      (0 == _mm_movemask_epi8(_mm_cmpgt_epi32(h_avg_0i, threashhold_upper)));
}
| 254 |
/* Choose the ETC1 codeword (luminance modifier) table and the per-texel
 * modifier indices that minimize squared RGB error for one 2x4 sub-block,
 * then write the table index and the pixel index bits into |block|.
 *
 * block          - 8-byte destination ETC1 block (already partially filled).
 * base           - quantized base color for this sub-block.
 * sub_block_id   - 0 or 1; selects which half of |data| is scored.
 * idx_to_num_tab - maps the 8 sub-block texel slots to ETC1 texel numbers.
 * data           - zero-extended per-channel values of the 4x4 block, laid
 *                  out so registers 2*id and 2*id+1 hold this sub-block.
 * expected_error - early-out threshold: the table scan is abandoned when the
 *                  error grows while already exceeding this value. */
void ComputeLuminance(uint8_t* block,
                      const Color& base,
                      const int sub_block_id,
                      const uint8_t* idx_to_num_tab,
                      const __sse_data* data,
                      const uint32_t expected_error) {
  uint8_t best_tbl_idx = 0;
  uint32_t best_error = 0x7FFFFFFF;
  uint8_t best_mod_idx[8][8];  // [table][texel]

  const __m128i base_blue = _mm_set1_epi32(base.channels.b);
  const __m128i base_green = _mm_set1_epi32(base.channels.g);
  const __m128i base_red = _mm_set1_epi32(base.channels.r);

  __m128i test_red, test_blue, test_green, tmp, tmp_blue, tmp_green, tmp_red;
  __m128i block_error, mask;

  /* this will have the minimum errors for each 4 pixels */
  __m128i first_half_min;
  __m128i second_half_min;

  /* this will have the matching table index combo for each 4 pixels */
  __m128i first_half_pattern;
  __m128i second_half_pattern;

  const __m128i first_blue_data_block = data->blue[2 * sub_block_id];
  const __m128i first_green_data_block = data->green[2 * sub_block_id];
  const __m128i first_red_data_block = data->red[2 * sub_block_id];

  const __m128i second_blue_data_block = data->blue[2 * sub_block_id + 1];
  const __m128i second_green_data_block = data->green[2 * sub_block_id + 1];
  const __m128i second_red_data_block = data->red[2 * sub_block_id + 1];

  uint32_t min;
  /* fail early to increase speed */
  long delta = INT32_MAX;
  uint32_t last_min = INT32_MAX;

  /* Each entry is an _mm_shuffle_epi32 immediate assigning the 4 codeword
   * values to the 4 lanes; the same byte doubles as the stored "pattern", so
   * the 2-bit field at position 2*i of a winning immediate is the codeword
   * index texel i used. Ascending order matters: a strictly better error
   * always carries a larger immediate, so _mm_max_epi16 keeps the latest
   * winner per lane. */
  const uint8_t shuffle_mask[] = {
      0x1B, 0x4E, 0xB1, 0xE4}; /* important they are sorted ascending */

  for (unsigned int tbl_idx = 0; tbl_idx < 8; ++tbl_idx) {
    /* Candidate colors: base color plus each of the table's 4 deltas. */
    tmp = _mm_set_epi32(
        g_codeword_tables[tbl_idx][3], g_codeword_tables[tbl_idx][2],
        g_codeword_tables[tbl_idx][1], g_codeword_tables[tbl_idx][0]);

    test_blue = AddAndClamp(tmp, base_blue);
    test_green = AddAndClamp(tmp, base_green);
    test_red = AddAndClamp(tmp, base_red);

    first_half_min = __sse_max_int;
    second_half_min = __sse_max_int;

    first_half_pattern = __sse_zero;
    second_half_pattern = __sse_zero;

#pragma unroll
    for (uint8_t imm8 : shuffle_mask) {
      /* _mm_shuffle_epi32 needs a compile-time immediate, hence the switch
       * over the candidate patterns. */
      switch (imm8) {
        case 0x1B:
          tmp_blue = _mm_shuffle_epi32(test_blue, 0x1B);
          tmp_green = _mm_shuffle_epi32(test_green, 0x1B);
          tmp_red = _mm_shuffle_epi32(test_red, 0x1B);
          break;
        case 0x4E:
          tmp_blue = _mm_shuffle_epi32(test_blue, 0x4E);
          tmp_green = _mm_shuffle_epi32(test_green, 0x4E);
          tmp_red = _mm_shuffle_epi32(test_red, 0x4E);
          break;
        case 0xB1:
          tmp_blue = _mm_shuffle_epi32(test_blue, 0xB1);
          tmp_green = _mm_shuffle_epi32(test_green, 0xB1);
          tmp_red = _mm_shuffle_epi32(test_red, 0xB1);
          break;
        case 0xE4:
          tmp_blue = _mm_shuffle_epi32(test_blue, 0xE4);
          tmp_green = _mm_shuffle_epi32(test_green, 0xE4);
          tmp_red = _mm_shuffle_epi32(test_red, 0xE4);
          break;
        default:
          tmp_blue = test_blue;
          tmp_green = test_green;
          tmp_red = test_red;
      }

      tmp = _mm_set1_epi32(imm8);

      block_error =
          AddChannelError(GetColorErrorSSE(tmp_blue, first_blue_data_block),
                          GetColorErrorSSE(tmp_green, first_green_data_block),
                          GetColorErrorSSE(tmp_red, first_red_data_block));

      /* save winning pattern */
      first_half_pattern = _mm_max_epi16(
          first_half_pattern,
          _mm_and_si128(tmp, _mm_cmpgt_epi32(first_half_min, block_error)));
      /* should use _mm_min_epi32(first_half_min, block_error); otherwise
       * performance penalty */
      /* (SSE2 lacks _mm_min_epi32, so the min is built from a compare mask.) */
      mask = _mm_cmplt_epi32(block_error, first_half_min);
      first_half_min = _mm_add_epi32(_mm_and_si128(mask, block_error),
                                     _mm_andnot_si128(mask, first_half_min));

      /* Second part of the block */
      block_error =
          AddChannelError(GetColorErrorSSE(tmp_blue, second_blue_data_block),
                          GetColorErrorSSE(tmp_green, second_green_data_block),
                          GetColorErrorSSE(tmp_red, second_red_data_block));

      /* save winning pattern */
      second_half_pattern = _mm_max_epi16(
          second_half_pattern,
          _mm_and_si128(tmp, _mm_cmpgt_epi32(second_half_min, block_error)));
      /* should use _mm_min_epi32(second_half_min, block_error); otherwise
       * performance penalty */
      mask = _mm_cmplt_epi32(block_error, second_half_min);
      second_half_min = _mm_add_epi32(_mm_and_si128(mask, block_error),
                                     _mm_andnot_si128(mask, second_half_min));
    }

    /* Total error for this table: horizontal sum over all 8 texels. */
    first_half_min = _mm_add_epi32(first_half_min, second_half_min);
    first_half_min =
        _mm_add_epi32(first_half_min, _mm_shuffle_epi32(first_half_min, 0x4E));
    first_half_min =
        _mm_add_epi32(first_half_min, _mm_shuffle_epi32(first_half_min, 0xB1));

    min = _mm_cvtsi128_si32(first_half_min);

    delta = min - last_min;
    last_min = min;

    if (min < best_error) {
      best_tbl_idx = tbl_idx;
      best_error = min;

      /* Decode each lane's winning shuffle immediate: the 2-bit field at bit
       * position 2*i selects the codeword index used by texel i. */
      best_mod_idx[tbl_idx][0] =
          (_mm_cvtsi128_si32(first_half_pattern) >> (0)) & 3;
      best_mod_idx[tbl_idx][4] =
          (_mm_cvtsi128_si32(second_half_pattern) >> (0)) & 3;

      best_mod_idx[tbl_idx][1] =
          (_mm_cvtsi128_si32(_mm_shuffle_epi32(first_half_pattern, 0x1)) >>
           (2)) &
          3;
      best_mod_idx[tbl_idx][5] =
          (_mm_cvtsi128_si32(_mm_shuffle_epi32(second_half_pattern, 0x1)) >>
           (2)) &
          3;

      best_mod_idx[tbl_idx][2] =
          (_mm_cvtsi128_si32(_mm_shuffle_epi32(first_half_pattern, 0x2)) >>
           (4)) &
          3;
      best_mod_idx[tbl_idx][6] =
          (_mm_cvtsi128_si32(_mm_shuffle_epi32(second_half_pattern, 0x2)) >>
           (4)) &
          3;

      best_mod_idx[tbl_idx][3] =
          (_mm_cvtsi128_si32(_mm_shuffle_epi32(first_half_pattern, 0x3)) >>
           (6)) &
          3;
      best_mod_idx[tbl_idx][7] =
          (_mm_cvtsi128_si32(_mm_shuffle_epi32(second_half_pattern, 0x3)) >>
           (6)) &
          3;

      if (best_error == 0) {
        break;
      }
    } else if (delta > 0 && expected_error < min) {
      /* error is growing and is well beyond expected error */
      break;
    }
  }

  WriteCodewordTable(block, sub_block_id, best_tbl_idx);

  uint32_t pix_data = 0;
  uint8_t mod_idx;
  uint8_t pix_idx;
  uint32_t lsb;
  uint32_t msb;
  int texel_num;

  /* Convert each texel's modifier index into the ETC1 msb/lsb bit pair and
   * place it at that texel's position in the 32-bit pixel index word. */
  for (unsigned int i = 0; i < 8; ++i) {
    mod_idx = best_mod_idx[best_tbl_idx][i];
    pix_idx = g_mod_to_pix[mod_idx];

    lsb = pix_idx & 0x1;
    msb = pix_idx >> 1;

    // Obtain the texel number as specified in the standard.
    texel_num = idx_to_num_tab[i];
    pix_data |= msb << (texel_num + 16);
    pix_data |= lsb << (texel_num);
  }

  WritePixelData(block, pix_data);
}
| 454 |
/* Compress one non-solid 4x4 block into 8 bytes at |dst|.
 * Picks the sub-block split (vertical or horizontal) with the lower error
 * against its average colors, picks 555 differential vs. 444 individual
 * color mode, writes the header bits, then runs the luminance-table search
 * for each sub-block. NOTE: mutates |data| in place when the vertical split
 * is chosen (channel rows are rearranged into sub-block order). */
void CompressBlock(uint8_t* dst, __sse_data* data) {
  /* first 3 vertical 1, seconds 3 vertical 2, third 3 horizontal 1, last 3
   * horizontal 2 */
  float __sse_avg_colors[12] = {
      0,
  };
  bool use_differential[2] = {true, true};
  GetAvgColors(data, __sse_avg_colors, use_differential);
  Color sub_block_avg[4];

  /* TODO(radu.velea): remove floating point operations and use only int's +
   * normal
   * rounding and shifts */
  /* Quantize the four sub-block averages: 444 when differential mode cannot
   * encode the pair, 555 when it can. */
  for (int i = 0, j = 1; i < 4; i += 2, j += 2) {
    if (use_differential[i / 2] == false) {
      sub_block_avg[i] = MakeColor444(&__sse_avg_colors[i * 3]);
      sub_block_avg[j] = MakeColor444(&__sse_avg_colors[j * 3]);
    } else {
      sub_block_avg[i] = MakeColor555(&__sse_avg_colors[i * 3]);
      sub_block_avg[j] = MakeColor555(&__sse_avg_colors[j * 3]);
    }
  }

  __m128i red_avg[2], green_avg[2], blue_avg[2];

  /* Vertical split: lanes 0-1 hold sub-block 0's average, lanes 2-3 hold
   * sub-block 1's, so one register covers a whole row. */
  // TODO(radu.velea): perfect accuracy, maybe skip floating variables
  blue_avg[0] =
      _mm_set_epi32((int)__sse_avg_colors[3], (int)__sse_avg_colors[3],
                    (int)__sse_avg_colors[0], (int)__sse_avg_colors[0]);

  green_avg[0] =
      _mm_set_epi32((int)__sse_avg_colors[4], (int)__sse_avg_colors[4],
                    (int)__sse_avg_colors[1], (int)__sse_avg_colors[1]);

  red_avg[0] =
      _mm_set_epi32((int)__sse_avg_colors[5], (int)__sse_avg_colors[5],
                    (int)__sse_avg_colors[2], (int)__sse_avg_colors[2]);

  uint32_t vertical_error[2];
  GetVerticalError(data, blue_avg, green_avg, red_avg, vertical_error);

  /* Horizontal split: one broadcast average per pair of rows. */
  // TODO(radu.velea): perfect accuracy, maybe skip floating variables
  blue_avg[0] = _mm_set1_epi32((int)__sse_avg_colors[6]);
  blue_avg[1] = _mm_set1_epi32((int)__sse_avg_colors[9]);

  green_avg[0] = _mm_set1_epi32((int)__sse_avg_colors[7]);
  green_avg[1] = _mm_set1_epi32((int)__sse_avg_colors[10]);

  red_avg[0] = _mm_set1_epi32((int)__sse_avg_colors[8]);
  red_avg[1] = _mm_set1_epi32((int)__sse_avg_colors[11]);

  uint32_t horizontal_error[2];
  GetHorizontalError(data, blue_avg, green_avg, red_avg, horizontal_error);

  /* flip == true selects the horizontal (top/bottom) split. */
  bool flip = (horizontal_error[0] + horizontal_error[1]) <
              (vertical_error[0] + vertical_error[1]);
  uint32_t* expected_errors = flip == true ? horizontal_error : vertical_error;

  // Clear destination buffer so that we can "or" in the results.
  memset(dst, 0, 8);

  WriteDiff(dst, use_differential[!!flip]);
  WriteFlip(dst, flip);

  /* sub_block_avg indices 0-1 are the vertical pair, 2-3 the horizontal. */
  uint8_t sub_block_off_0 = flip ? 2 : 0;
  uint8_t sub_block_off_1 = sub_block_off_0 + 1;

  if (use_differential[!!flip]) {
    WriteColors555(dst, sub_block_avg[sub_block_off_0],
                   sub_block_avg[sub_block_off_1]);
  } else {
    WriteColors444(dst, sub_block_avg[sub_block_off_0],
                   sub_block_avg[sub_block_off_1]);
  }

  if (flip == false) {
    /* transpose vertical data into horizontal lines */
    /* Rearrange each channel in place so registers 0-1 hold the left 2x4
     * sub-block and registers 2-3 the right one: combine the low (left) and
     * high (right) 64-bit halves of adjacent rows, then swap registers 1 and
     * 2. This matches the layout ComputeLuminance() expects. */
    __m128i tmp;
#pragma unroll
    for (int i = 0; i < 4; i += 2) {
      tmp = data->blue[i];
      data->blue[i] = _mm_add_epi32(
          _mm_move_epi64(data->blue[i]),
          _mm_shuffle_epi32(_mm_move_epi64(data->blue[i + 1]), 0x4E));
      data->blue[i + 1] = _mm_add_epi32(
          _mm_move_epi64(_mm_shuffle_epi32(tmp, 0x4E)),
          _mm_shuffle_epi32(
              _mm_move_epi64(_mm_shuffle_epi32(data->blue[i + 1], 0x4E)),
              0x4E));

      tmp = data->green[i];
      data->green[i] = _mm_add_epi32(
          _mm_move_epi64(data->green[i]),
          _mm_shuffle_epi32(_mm_move_epi64(data->green[i + 1]), 0x4E));
      data->green[i + 1] = _mm_add_epi32(
          _mm_move_epi64(_mm_shuffle_epi32(tmp, 0x4E)),
          _mm_shuffle_epi32(
              _mm_move_epi64(_mm_shuffle_epi32(data->green[i + 1], 0x4E)),
              0x4E));

      tmp = data->red[i];
      data->red[i] = _mm_add_epi32(
          _mm_move_epi64(data->red[i]),
          _mm_shuffle_epi32(_mm_move_epi64(data->red[i + 1]), 0x4E));
      data->red[i + 1] = _mm_add_epi32(
          _mm_move_epi64(_mm_shuffle_epi32(tmp, 0x4E)),
          _mm_shuffle_epi32(
              _mm_move_epi64(_mm_shuffle_epi32(data->red[i + 1], 0x4E)), 0x4E));
    }

    tmp = data->blue[1];
    data->blue[1] = data->blue[2];
    data->blue[2] = tmp;

    tmp = data->green[1];
    data->green[1] = data->green[2];
    data->green[2] = tmp;

    tmp = data->red[1];
    data->red[1] = data->red[2];
    data->red[2] = tmp;
  }

  // Compute luminance for the first sub block.
  ComputeLuminance(dst, sub_block_avg[sub_block_off_0], 0,
                   g_idx_to_num[sub_block_off_0], data,
                   ETC1_SET_ERROR(expected_errors[0]));
  // Compute luminance for the second sub block.
  ComputeLuminance(dst, sub_block_avg[sub_block_off_1], 1,
                   g_idx_to_num[sub_block_off_1], data,
                   ETC1_SET_ERROR(expected_errors[1]));
}
| 587 |
/* Copy a 4x4 block of 4-byte texels (16 bytes per row) out of an image that
 * is |width| pixels wide into the contiguous 64-byte buffer |dst|. */
static void ExtractBlock(uint8_t* dst, const uint8_t* src, int width) {
  const int src_stride = width * 4; /* bytes per image row */
  for (int row = 0; row < 4; ++row) {
    memcpy(dst + row * 16, src + row * src_stride, 16);
  }
}
| 594 |
/* Convert the 64-byte texel-major block (16 x 4-byte texels) to channel-major
 * form: transposed[c] receives the 16 bytes of channel c. Returns false --
 * leaving |transposed| holding the untransposed loads -- when all 16 texels
 * are identical, so the caller can take the solid-color fast path. */
inline bool TransposeBlock(uint8_t* block, __m128i* transposed /* [4] */) {
  __m128i tmp3, tmp2, tmp1, tmp0;
  /* Broadcast the first texel for the solid-block comparison below. */
  __m128i test_solid = _mm_set1_epi32(*((uint32_t*)block));
  uint16_t mask = 0xFFFF;

  transposed[0] = _mm_loadu_si128((__m128i*)(block));  // a0,a1,a2,...a7, ...a15
  transposed[1] =
      _mm_loadu_si128((__m128i*)(block + 16));  // b0, b1,b2,...b7.... b15
  transposed[2] =
      _mm_loadu_si128((__m128i*)(block + 32));  // c0, c1,c2,...c7....c15
  transposed[3] =
      _mm_loadu_si128((__m128i*)(block + 48));  // d0,d1,d2,...d7....d15

  /* |mask| stays 0xFFFF only if every byte of every row equals the
   * broadcast first texel. */
#pragma unroll
  for (int i = 0; i < 4; i++) {
    mask &= _mm_movemask_epi8(_mm_cmpeq_epi8(transposed[i], test_solid));
  }

  if (mask == 0xFFFF) {
    return false; /* block is solid, no need to do any more work */
  }

  /* Two rounds of byte interleaves followed by a 32-bit interleave and a
   * final byte interleave perform the 16x4 byte transpose. */
  tmp0 = _mm_unpacklo_epi8(
      transposed[0], transposed[1]);  // a0,b0, a1,b1, a2,b2, a3,b3,....a7,b7
  tmp1 = _mm_unpacklo_epi8(
      transposed[2], transposed[3]);  // c0,d0, c1,d1, c2,d2, c3,d3,... c7,d7
  tmp2 = _mm_unpackhi_epi8(
      transposed[0],
      transposed[1]);  // a8,b8, a9,b9, a10,b10, a11,b11,...a15,b15
  tmp3 = _mm_unpackhi_epi8(
      transposed[2],
      transposed[3]);  // c8,d8, c9,d9, c10,d10, c11,d11,...c15,d15

  transposed[0] = _mm_unpacklo_epi8(
      tmp0, tmp2);  // a0,a8, b0,b8, a1,a9, b1,b9, ....a3,a11, b3,b11
  transposed[1] = _mm_unpackhi_epi8(
      tmp0, tmp2);  // a4,a12, b4,b12, a5,a13, b5,b13,....a7,a15,b7,b15
  transposed[2] =
      _mm_unpacklo_epi8(tmp1, tmp3);  // c0,c8, d0,d8, c1,c9, d1,d9.....d3,d11
  transposed[3] = _mm_unpackhi_epi8(
      tmp1, tmp3);  // c4,c12,d4,d12, c5,c13, d5,d13,....d7,d15

  tmp0 = _mm_unpacklo_epi32(transposed[0], transposed[2]);  // a0,a8, b0,b8,
                                                            // c0,c8, d0,d8,
                                                            // a1,a9, b1,b9,
                                                            // c1,c9, d1,d9
  tmp1 = _mm_unpackhi_epi32(transposed[0], transposed[2]);  // a2,a10, b2,b10,
                                                            // c2,c10, d2,d10,
                                                            // a3,a11, b3,b11,
                                                            // c3,c11, d3,d11
  tmp2 = _mm_unpacklo_epi32(transposed[1], transposed[3]);  // a4,a12, b4,b12,
                                                            // c4,c12, d4,d12,
                                                            // a5,a13, b5,b13,
                                                            // c5,c13, d5,d13,
  tmp3 = _mm_unpackhi_epi32(transposed[1],
                            transposed[3]);  // a6,a14, b6,b14, c6,c14, d6,d14,
                                             // a7,a15,b7,b15,c7,c15,d7,d15

  transposed[0] = _mm_unpacklo_epi8(tmp0, tmp2);  // a0,a4, a8, a12, b0,b4,
                                                  // b8,b12, c0,c4, c8, c12,
                                                  // d0,d4, d8, d12
  transposed[1] = _mm_unpackhi_epi8(tmp0, tmp2);  // a1,a5, a9, a13, b1,b5,
                                                  // b9,b13, c1,c5, c9, c13,
                                                  // d1,d5, d9, d13
  transposed[2] = _mm_unpacklo_epi8(tmp1, tmp3);  // a2,a6, a10,a14, b2,b6,
                                                  // b10,b14, c2,c6, c10,c14,
                                                  // d2,d6, d10,d14
  transposed[3] = _mm_unpackhi_epi8(tmp1, tmp3);  // a3,a7, a11,a15, b3,b7,
                                                  // b11,b15, c3,c7, c11,c15,
                                                  // d3,d7, d11,d15
  return true;
}
| 667 |
/* Zero-extend the four channel-planar byte registers produced by
 * TransposeBlock into per-channel 4x4 arrays of 32-bit lanes: packed[c]
 * (16 bytes) becomes four registers of four zero-extended values in the
 * c-th output array. Outputs are filled in parameter order -- packed[0]
 * goes to |red|, packed[1] to |green|, and so on. */
inline void UnpackBlock(__m128i* packed,
                        __m128i* red,
                        __m128i* green,
                        __m128i* blue,
                        __m128i* alpha) {
  const __m128i zero = _mm_set1_epi8(0);
  __m128i* const out[4] = {red, green, blue, alpha};

#pragma unroll
  for (int c = 0; c < 4; c++) {
    /* bytes 0-7 and 8-15 widened to 16 bits... */
    const __m128i lo = _mm_unpacklo_epi8(packed[c], zero);
    const __m128i hi = _mm_unpackhi_epi8(packed[c], zero);

    /* ...then each 16-bit half widened again to 32 bits. */
    out[c][0] = _mm_unpacklo_epi16(lo, zero);
    out[c][1] = _mm_unpackhi_epi16(lo, zero);
    out[c][2] = _mm_unpacklo_epi16(hi, zero);
    out[c][3] = _mm_unpackhi_epi16(hi, zero);
  }
}
| 716 |
/* Fast path for a block whose 16 texels are all identical.
 * Encodes differential mode with both sub-block base colors equal to the
 * 555-quantized source color, then scans all 8 codeword tables x 4 modifiers
 * for the luminance step whose clamped result is closest to the exact source
 * color and applies that single modifier to every texel. */
inline void CompressSolid(uint8_t* dst, uint8_t* block) {
  // Clear destination buffer so that we can "or" in the results.
  memset(dst, 0, 8);

  const float src_color_float[3] = {static_cast<float>(block[0]),
                                    static_cast<float>(block[1]),
                                    static_cast<float>(block[2])};
  const Color base = MakeColor555(src_color_float);
  const __m128i base_v =
      _mm_set_epi32(0, base.channels.r, base.channels.g, base.channels.b);

  /* The exact (unquantized) source color, for error measurement. */
  const __m128i constant = _mm_set_epi32(0, block[2], block[1], block[0]);
  __m128i lum;
  __m128i colors[4];
  /* Mask keeping only the three color lanes; the top lane is unused. */
  static const __m128i rgb =
      _mm_set_epi32(0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);

  WriteDiff(dst, true);
  WriteFlip(dst, false);

  WriteColors555(dst, base, base);

  uint8_t best_tbl_idx = 0;
  uint8_t best_mod_idx = 0;
  uint32_t best_mod_err = INT32_MAX;

  for (unsigned int tbl_idx = 0; tbl_idx < 8; ++tbl_idx) {
    lum = _mm_set_epi32(
        g_codeword_tables[tbl_idx][3], g_codeword_tables[tbl_idx][2],
        g_codeword_tables[tbl_idx][1], g_codeword_tables[tbl_idx][0]);
    /* Candidate colors: base plus each of the table's four deltas (each
     * delta broadcast to all lanes by the shuffle). */
    colors[0] = AddAndClamp(base_v, _mm_shuffle_epi32(lum, 0x0));
    colors[1] = AddAndClamp(base_v, _mm_shuffle_epi32(lum, 0x55));
    colors[2] = AddAndClamp(base_v, _mm_shuffle_epi32(lum, 0xAA));
    colors[3] = AddAndClamp(base_v, _mm_shuffle_epi32(lum, 0xFF));
#pragma unroll
    for (int i = 0; i < 4; i++) {
      uint32_t mod_err =
          SumSSE(GetColorErrorSSE(constant, _mm_and_si128(colors[i], rgb)));
      colors[i] = _mm_and_si128(colors[i], rgb);
      if (mod_err < best_mod_err) {
        best_tbl_idx = tbl_idx;
        best_mod_idx = i;
        best_mod_err = mod_err;

        if (mod_err == 0) {
          /* Exits only the inner loop; remaining tables can never beat a
           * zero error, so best_* stays unchanged. */
          break;  // We cannot do any better than this.
        }
      }
    }
  }

  /* Both sub-blocks use the same table; every texel gets the same index. */
  WriteCodewordTable(dst, 0, best_tbl_idx);
  WriteCodewordTable(dst, 1, best_tbl_idx);

  uint8_t pix_idx = g_mod_to_pix[best_mod_idx];
  uint32_t lsb = pix_idx & 0x1;
  uint32_t msb = pix_idx >> 1;

  /* Replicate the msb/lsb pair into every texel position of the 32-bit
   * pixel index word. */
  uint32_t pix_data = 0;
  for (unsigned int i = 0; i < 2; ++i) {
    for (unsigned int j = 0; j < 8; ++j) {
      // Obtain the texel number as specified in the standard.
      int texel_num = g_idx_to_num[i][j];
      pix_data |= msb << (texel_num + 16);
      pix_data |= lsb << (texel_num);
    }
  }

  WritePixelData(dst, pix_data);
}
| 787 |
| 788 } // namespace |
| 789 |
| 790 namespace cc { |
| 791 |
/* Compress a 4-byte-per-pixel image into ETC1, one 4x4 block at a time,
 * emitting 8 bytes per block. Width and height must be positive multiples
 * of 4. |quality| is accepted for interface compatibility but is not used
 * by this SSE implementation. */
void TextureCompressorETC1SSE::Compress(const uint8_t* src,
                                        uint8_t* dst,
                                        int width,
                                        int height,
                                        Quality quality) {
  DCHECK(width >= 4 && (width & 3) == 0);
  DCHECK(height >= 4 && (height & 3) == 0);

  uint8_t block[64] __attribute__((aligned(16)));
  __m128i packed[4];
  __m128i red[4], green[4], blue[4], alpha[4];
  __sse_data data;

  for (int y = 0; y < height; y += 4, src += width * 4 * 4) {
    for (int x = 0; x < width; x += 4, dst += 8) {
      ExtractBlock(block, src + x * 4, width);
      /* TransposeBlock returns false for a solid block. */
      if (TransposeBlock(block, packed) == false) {
        CompressSolid(dst, block);
      } else {
        /* NOTE(review): the first unpacked channel is received as |blue|
         * here although UnpackBlock names its first output |red| --
         * presumably the source texels are BGRA-ordered; confirm against
         * callers before relying on the channel names. */
        UnpackBlock(packed, blue, green, red, alpha);

        data.block = block;
        data.packed = packed;
        data.red = red;
        data.blue = blue;
        data.green = green;

        CompressBlock(dst, &data);
      }
    }
  }
}
| 824 |
| 825 } // namespace cc |
| OLD | NEW |