cc/resources/texture_compressor_etc1_sse.cc - Issue 1096703002: Reland: Add ETC1 powered SSE encoder for tile texture compression

Side by Side Diff: cc/resources/texture_compressor_etc1_sse.cc

Issue 1096703002: Reland: Add ETC1 powered SSE encoder for tile texture compression (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Using ALIGNAS instead of attribute and removing pragmas Created 5 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 // Copyright 2015 The Chromium Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4

	5 #include "texture_compressor_etc1_sse.h"

	6

	7 #include <assert.h>

	8 #include <emmintrin.h>

	9 #include <stdio.h>

	10 #include <stdlib.h>

	11 #include <string.h>

	12 #include <time.h>

	13

	14 #include <cmath>

	15 #include <limits>

	16 #include <sstream>

	17

	18 #include "base/compiler_specific.h"

	19 #include "base/logging.h"

	20 #include "cc/resources/texture_compressor_util.h"

	21

	22 using namespace cc::texture_compress;

	23

	24 namespace {

	25

	// Threshold used to stop the codeword-table search early: x + x / 2 + 384.
	// The argument is fully parenthesized so compound expressions expand with the
	// intended precedence. Note that the argument is evaluated twice.
	#define ETC1_SET_ERROR(x) ((x) + (x) / 2 + 384)

	27

	// Working set for one block, shared by the SSE encoder stages below.
	struct __sse_data {
	  // Raw bytes of the block.
	  uint8_t* block;
	  // 8-bit packed values.
	  __m128i* packed;
	  // Per-channel values zero-extended to 32 bits. Each pointer addresses four
	  // vectors, i.e. a 4x4 array. (No alpha plane is carried here.)
	  __m128i* blue;
	  __m128i* green;
	  __m128i* red;
	};

	39

	/* Vector constants shared by the helpers below: all lanes zero, and all
	 * 32-bit lanes set to 0x7FFFFFFF. */
	static const __m128i __sse_zero = _mm_setzero_si128();
	static const __m128i __sse_max_int = _mm_set1_epi32(0x7FFFFFFF);

	43

	// Adds x and y and clamps the result between 0 and 0xFF.
	// The add, min and max are all performed with 16-bit lane intrinsics; the
	// callers in this file pass 32-bit lanes holding a channel value and a small
	// signed codeword offset.
	inline __m128i AddAndClamp(const __m128i x, const __m128i y) {
	  static const __m128i upper_bound = _mm_set1_epi32(0xFF);
	  const __m128i sum = _mm_add_epi16(x, y);
	  const __m128i capped = _mm_min_epi16(sum, upper_bound);
	  return _mm_max_epi16(_mm_setzero_si128(), capped);
	}

	49

	// Returns (x - y) * (x - y), computed with 16-bit subtract and 16-bit
	// low-half multiply (the original note records a switch from
	// _mm_mullo_epi32 to _mm_mullo_epi16).
	inline __m128i GetColorErrorSSE(const __m128i x, const __m128i y) {
	  const __m128i diff = _mm_sub_epi16(x, y);
	  return _mm_mullo_epi16(diff, diff);
	}

	55

	// Lane-wise 32-bit sum of three error vectors.
	inline __m128i AddChannelError(const __m128i x,
	                               const __m128i y,
	                               const __m128i z) {
	  const __m128i y_plus_z = _mm_add_epi32(y, z);
	  return _mm_add_epi32(x, y_plus_z);
	}

	61

	// Horizontal sum of the four 32-bit lanes of x.
	inline uint32_t SumSSE(const __m128i x) {
	  // Fold the upper 64 bits onto the lower 64 bits...
	  const __m128i halves = _mm_add_epi32(x, _mm_shuffle_epi32(x, 0x4E));
	  // ...then fold neighbouring lanes, leaving the total in lane 0.
	  const __m128i total =
	      _mm_add_epi32(halves, _mm_shuffle_epi32(halves, 0xB1));
	  return _mm_cvtsi128_si32(total);
	}

	68

	69 inline uint32_t GetVerticalError(const __sse_data* data,

	70 const __m128i* blue_avg,

	71 const __m128i* green_avg,

	72 const __m128i* red_avg,

	73 uint32_t* verror) {

	74 __m128i error = __sse_zero;

	75

	76 for (int i = 0; i < 4; i++) {

	77 error = _mm_add_epi32(error, GetColorErrorSSE(data->blue[i], blue_avg[0]));

	78 error =

	79 _mm_add_epi32(error, GetColorErrorSSE(data->green[i], green_avg[0]));

	80 error = _mm_add_epi32(error, GetColorErrorSSE(data->red[i], red_avg[0]));

	81 }

	82

	83 error = _mm_add_epi32(error, _mm_shuffle_epi32(error, 0x4E));

	84

	85 verror[0] = _mm_cvtsi128_si32(error);

	86 verror[1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(error, 0xB1));

	87

	88 return verror[0] + verror[1];

	89 }

	90

	91 inline uint32_t GetHorizontalError(const __sse_data* data,

	92 const __m128i* blue_avg,

	93 const __m128i* green_avg,

	94 const __m128i* red_avg,

	95 uint32_t* verror) {

	96 __m128i error = __sse_zero;

	97 int first_index, second_index;

	98

	99 for (int i = 0; i < 2; i++) {

	100 first_index = 2 * i;

	101 second_index = first_index + 1;

	102

	103 error = _mm_add_epi32(

	104 error, GetColorErrorSSE(data->blue[first_index], blue_avg[i]));

	105 error = _mm_add_epi32(

	106 error, GetColorErrorSSE(data->blue[second_index], blue_avg[i]));

	107 error = _mm_add_epi32(

	108 error, GetColorErrorSSE(data->green[first_index], green_avg[i]));

	109 error = _mm_add_epi32(

	110 error, GetColorErrorSSE(data->green[second_index], green_avg[i]));

	111 error = _mm_add_epi32(error,

	112 GetColorErrorSSE(data->red[first_index], red_avg[i]));

	113 error = _mm_add_epi32(

	114 error, GetColorErrorSSE(data->red[second_index], red_avg[i]));

	115 }

	116

	117 error = _mm_add_epi32(error, _mm_shuffle_epi32(error, 0x4E));

	118

	119 verror[0] = _mm_cvtsi128_si32(error);

	120 verror[1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(error, 0xB1));

	121

	122 return verror[0] + verror[1];

	123 }

	124

	125 inline void GetAvgColors(const __sse_data* data,

	126 float* output,

	127 bool* __sse_use_diff) {

	128 __m128i sum[2], tmp;

	129

	130 // TODO(radu.velea): _mm_avg_epu8 on packed data maybe

	131

	132 /* get avg red */

	133 /* [S0 S0 S1 S1] */

	134 sum[0] = _mm_add_epi32(data->red[0], data->red[1]);

	135 sum[0] = _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0xB1));

	136

	137 /* [S2 S2 S3 S3] */

	138 sum[1] = _mm_add_epi32(data->red[2], data->red[3]);

	139 sum[1] = _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0xB1));

	140

	141 float hred[2], vred[2];

	142 hred[0] = (_mm_cvtsi128_si32(

	143 _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0x4E)))) /

	144 8.0f;

	145 hred[1] = (_mm_cvtsi128_si32(

	146 _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0x4E)))) /

	147 8.0f;

	148

	149 tmp = _mm_add_epi32(sum[0], sum[1]);

	150 vred[0] = (_mm_cvtsi128_si32(tmp)) / 8.0f;

	151 vred[1] = (_mm_cvtsi128_si32(_mm_shuffle_epi32(tmp, 0x2))) / 8.0f;

	152

	153 /* get avg green */

	154 /* [S0 S0 S1 S1] */

	155 sum[0] = _mm_add_epi32(data->green[0], data->green[1]);

	156 sum[0] = _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0xB1));

	157

	158 /* [S2 S2 S3 S3] */

	159 sum[1] = _mm_add_epi32(data->green[2], data->green[3]);

	160 sum[1] = _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0xB1));

	161

	162 float hgreen[2], vgreen[2];

	163 hgreen[0] = (_mm_cvtsi128_si32(

	164 _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0x4E)))) /

	165 8.0f;

	166 hgreen[1] = (_mm_cvtsi128_si32(

	167 _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0x4E)))) /

	168 8.0f;

	169

	170 tmp = _mm_add_epi32(sum[0], sum[1]);

	171 vgreen[0] = (_mm_cvtsi128_si32(tmp)) / 8.0f;

	172 vgreen[1] = (_mm_cvtsi128_si32(_mm_shuffle_epi32(tmp, 0x2))) / 8.0f;

	173

	174 /* get avg blue */

	175 /* [S0 S0 S1 S1] */

	176 sum[0] = _mm_add_epi32(data->blue[0], data->blue[1]);

	177 sum[0] = _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0xB1));

	178

	179 /* [S2 S2 S3 S3] */

	180 sum[1] = _mm_add_epi32(data->blue[2], data->blue[3]);

	181 sum[1] = _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0xB1));

	182

	183 float hblue[2], vblue[2];

	184 hblue[0] = (_mm_cvtsi128_si32(

	185 _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0x4E)))) /

	186 8.0f;

	187 hblue[1] = (_mm_cvtsi128_si32(

	188 _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0x4E)))) /

	189 8.0f;

	190

	191 tmp = _mm_add_epi32(sum[0], sum[1]);

	192 vblue[0] = (_mm_cvtsi128_si32(tmp)) / 8.0f;

	193 vblue[1] = (_mm_cvtsi128_si32(_mm_shuffle_epi32(tmp, 0x2))) / 8.0f;

	194

	195 /* TODO(radu.velea): return int's instead of floats */

	196 output[0] = vblue[0];

	197 output[1] = vgreen[0];

	198 output[2] = vred[0];

	199

	200 output[3] = vblue[1];

	201 output[4] = vgreen[1];

	202 output[5] = vred[1];

	203

	204 output[6] = hblue[0];

	205 output[7] = hgreen[0];

	206 output[8] = hred[0];

	207

	208 output[9] = hblue[1];

	209 output[10] = hgreen[1];

	210 output[11] = hred[1];

	211

	212 __m128i threashhold_upper = _mm_set1_epi32(3);
	robert.bradford 2015/04/28 15:47:04 nit: spelling s/threashold/threshold/g nit: spelling s/threashold/threshold/g
	213 __m128i threashhold_lower = _mm_set1_epi32(-4);

	214

	215 __m128 factor_v = _mm_set1_ps(31.0f / 255.0f);

	216 __m128 rounding_v = _mm_set1_ps(0.5f);

	217 __m128 h_avg_0 = _mm_set_ps(hblue[0], hgreen[0], hred[0], 0);

	218 __m128 h_avg_1 = _mm_set_ps(hblue[1], hgreen[1], hred[1], 0);

	219

	220 __m128 v_avg_0 = _mm_set_ps(vblue[0], vgreen[0], vred[0], 0);

	221 __m128 v_avg_1 = _mm_set_ps(vblue[1], vgreen[1], vred[1], 0);

	222

	223 h_avg_0 = _mm_mul_ps(h_avg_0, factor_v);

	224 h_avg_1 = _mm_mul_ps(h_avg_1, factor_v);

	225 v_avg_0 = _mm_mul_ps(v_avg_0, factor_v);

	226 v_avg_1 = _mm_mul_ps(v_avg_1, factor_v);

	227

	228 h_avg_0 = _mm_add_ps(h_avg_0, rounding_v);

	229 h_avg_1 = _mm_add_ps(h_avg_1, rounding_v);

	230 v_avg_0 = _mm_add_ps(v_avg_0, rounding_v);

	231 v_avg_1 = _mm_add_ps(v_avg_1, rounding_v);

	232

	233 __m128i h_avg_0i = _mm_cvttps_epi32(h_avg_0);

	234 __m128i h_avg_1i = _mm_cvttps_epi32(h_avg_1);

	235

	236 __m128i v_avg_0i = _mm_cvttps_epi32(v_avg_0);

	237 __m128i v_avg_1i = _mm_cvttps_epi32(v_avg_1);

	238

	239 h_avg_0i = _mm_sub_epi32(h_avg_1i, h_avg_0i);

	240 v_avg_0i = _mm_sub_epi32(v_avg_1i, v_avg_0i);

	241

	242 __sse_use_diff[0] =

	243 (0 == _mm_movemask_epi8(_mm_cmplt_epi32(v_avg_0i, threashhold_lower)));

	244 __sse_use_diff[0] &=

	245 (0 == _mm_movemask_epi8(_mm_cmpgt_epi32(v_avg_0i, threashhold_upper)));

	246

	247 __sse_use_diff[1] =

	248 (0 == _mm_movemask_epi8(_mm_cmplt_epi32(h_avg_0i, threashhold_lower)));

	249 __sse_use_diff[1] &=

	250 (0 == _mm_movemask_epi8(_mm_cmpgt_epi32(h_avg_0i, threashhold_upper)));

	251 }

	252

	253 void ComputeLuminance(uint8_t* block,

	254 const Color& base,

	255 const int sub_block_id,

	256 const uint8_t* idx_to_num_tab,

	257 const __sse_data* data,

	258 const uint32_t expected_error) {

	259 uint8_t best_tbl_idx = 0;

	260 uint32_t best_error = 0x7FFFFFFF;

	261 uint8_t best_mod_idx[8][8]; // [table][texel]

	262

	263 const __m128i base_blue = _mm_set1_epi32(base.channels.b);

	264 const __m128i base_green = _mm_set1_epi32(base.channels.g);

	265 const __m128i base_red = _mm_set1_epi32(base.channels.r);

	266

	267 __m128i test_red, test_blue, test_green, tmp, tmp_blue, tmp_green, tmp_red;

	268 __m128i block_error, mask;

	269

	270 /* this will have the minimum errors for each 4 pixels */

	271 __m128i first_half_min;

	272 __m128i second_half_min;

	273

	274 /* this will have the matching table index combo for each 4 pixels */

	275 __m128i first_half_pattern;

	276 __m128i second_half_pattern;

	277

	278 const __m128i first_blue_data_block = data->blue[2 * sub_block_id];

	279 const __m128i first_green_data_block = data->green[2 * sub_block_id];

	280 const __m128i first_red_data_block = data->red[2 * sub_block_id];

	281

	282 const __m128i second_blue_data_block = data->blue[2 * sub_block_id + 1];

	283 const __m128i second_green_data_block = data->green[2 * sub_block_id + 1];

	284 const __m128i second_red_data_block = data->red[2 * sub_block_id + 1];

	285

	286 uint32_t min;

	287 /* fail early to increase speed */

	288 long delta = INT32_MAX;

	289 uint32_t last_min = INT32_MAX;

	290

	291 const uint8_t shuffle_mask[] = {

	292 0x1B, 0x4E, 0xB1, 0xE4}; /* important they are sorted ascending */

	293

	294 for (unsigned int tbl_idx = 0; tbl_idx < 8; ++tbl_idx) {

	295 tmp = _mm_set_epi32(

	296 g_codeword_tables[tbl_idx][3], g_codeword_tables[tbl_idx][2],

	297 g_codeword_tables[tbl_idx][1], g_codeword_tables[tbl_idx][0]);

	298

	299 test_blue = AddAndClamp(tmp, base_blue);

	300 test_green = AddAndClamp(tmp, base_green);

	301 test_red = AddAndClamp(tmp, base_red);

	302

	303 first_half_min = __sse_max_int;

	304 second_half_min = __sse_max_int;

	305

	306 first_half_pattern = __sse_zero;

	307 second_half_pattern = __sse_zero;

	308

	309 for (uint8_t imm8 : shuffle_mask) {

	310 switch (imm8) {

	311 case 0x1B:

	312 tmp_blue = _mm_shuffle_epi32(test_blue, 0x1B);

	313 tmp_green = _mm_shuffle_epi32(test_green, 0x1B);

	314 tmp_red = _mm_shuffle_epi32(test_red, 0x1B);

	315 break;

	316 case 0x4E:

	317 tmp_blue = _mm_shuffle_epi32(test_blue, 0x4E);

	318 tmp_green = _mm_shuffle_epi32(test_green, 0x4E);

	319 tmp_red = _mm_shuffle_epi32(test_red, 0x4E);

	320 break;

	321 case 0xB1:

	322 tmp_blue = _mm_shuffle_epi32(test_blue, 0xB1);

	323 tmp_green = _mm_shuffle_epi32(test_green, 0xB1);

	324 tmp_red = _mm_shuffle_epi32(test_red, 0xB1);

	325 break;

	326 case 0xE4:

	327 tmp_blue = _mm_shuffle_epi32(test_blue, 0xE4);

	328 tmp_green = _mm_shuffle_epi32(test_green, 0xE4);

	329 tmp_red = _mm_shuffle_epi32(test_red, 0xE4);

	330 break;

	331 default:

	332 tmp_blue = test_blue;

	333 tmp_green = test_green;

	334 tmp_red = test_red;

	335 }

	336

	337 tmp = _mm_set1_epi32(imm8);

	338

	339 block_error =

	340 AddChannelError(GetColorErrorSSE(tmp_blue, first_blue_data_block),

	341 GetColorErrorSSE(tmp_green, first_green_data_block),

	342 GetColorErrorSSE(tmp_red, first_red_data_block));

	343

	344 /* save winning pattern */

	345 first_half_pattern = _mm_max_epi16(

	346 first_half_pattern,

	347 _mm_and_si128(tmp, _mm_cmpgt_epi32(first_half_min, block_error)));

	348 /* should use _mm_min_epi32(first_half_min, block_error); otherwise

	349 * performance penalty */

	350 mask = _mm_cmplt_epi32(block_error, first_half_min);

	351 first_half_min = _mm_add_epi32(_mm_and_si128(mask, block_error),

	352 _mm_andnot_si128(mask, first_half_min));

	353

	354 /* Second part of the block */

	355 block_error =

	356 AddChannelError(GetColorErrorSSE(tmp_blue, second_blue_data_block),

	357 GetColorErrorSSE(tmp_green, second_green_data_block),

	358 GetColorErrorSSE(tmp_red, second_red_data_block));

	359

	360 /* save winning pattern */

	361 second_half_pattern = _mm_max_epi16(

	362 second_half_pattern,

	363 _mm_and_si128(tmp, _mm_cmpgt_epi32(second_half_min, block_error)));

	364 /* should use _mm_min_epi32(second_half_min, block_error); otherwise

	365 * performance penalty */

	366 mask = _mm_cmplt_epi32(block_error, second_half_min);

	367 second_half_min = _mm_add_epi32(_mm_and_si128(mask, block_error),

	368 _mm_andnot_si128(mask, second_half_min));

	369 }

	370

	371 first_half_min = _mm_add_epi32(first_half_min, second_half_min);

	372 first_half_min =

	373 _mm_add_epi32(first_half_min, _mm_shuffle_epi32(first_half_min, 0x4E));

	374 first_half_min =

	375 _mm_add_epi32(first_half_min, _mm_shuffle_epi32(first_half_min, 0xB1));

	376

	377 min = _mm_cvtsi128_si32(first_half_min);

	378

	379 delta = min - last_min;

	380 last_min = min;

	381

	382 if (min < best_error) {

	383 best_tbl_idx = tbl_idx;

	384 best_error = min;

	385

	386 best_mod_idx[tbl_idx][0] =

	387 (_mm_cvtsi128_si32(first_half_pattern) >> (0)) & 3;

	388 best_mod_idx[tbl_idx][4] =

	389 (_mm_cvtsi128_si32(second_half_pattern) >> (0)) & 3;

	390

	391 best_mod_idx[tbl_idx][1] =

	392 (_mm_cvtsi128_si32(_mm_shuffle_epi32(first_half_pattern, 0x1)) >>

	393 (2)) &

	394 3;

	395 best_mod_idx[tbl_idx][5] =

	396 (_mm_cvtsi128_si32(_mm_shuffle_epi32(second_half_pattern, 0x1)) >>

	397 (2)) &

	398 3;

	399

	400 best_mod_idx[tbl_idx][2] =

	401 (_mm_cvtsi128_si32(_mm_shuffle_epi32(first_half_pattern, 0x2)) >>

	402 (4)) &

	403 3;

	404 best_mod_idx[tbl_idx][6] =

	405 (_mm_cvtsi128_si32(_mm_shuffle_epi32(second_half_pattern, 0x2)) >>

	406 (4)) &

	407 3;

	408

	409 best_mod_idx[tbl_idx][3] =

	410 (_mm_cvtsi128_si32(_mm_shuffle_epi32(first_half_pattern, 0x3)) >>

	411 (6)) &

	412 3;

	413 best_mod_idx[tbl_idx][7] =

	414 (_mm_cvtsi128_si32(_mm_shuffle_epi32(second_half_pattern, 0x3)) >>

	415 (6)) &

	416 3;

	417

	418 if (best_error == 0) {

	419 break;

	420 }

	421 } else if (delta > 0 && expected_error < min) {

	422 /* error is growing and is well beyond expected error */

	423 break;

	424 }

	425 }

	426

	427 WriteCodewordTable(block, sub_block_id, best_tbl_idx);

	428

	429 uint32_t pix_data = 0;

	430 uint8_t mod_idx;

	431 uint8_t pix_idx;

	432 uint32_t lsb;

	433 uint32_t msb;

	434 int texel_num;

	435

	436 for (unsigned int i = 0; i < 8; ++i) {

	437 mod_idx = best_mod_idx[best_tbl_idx][i];

	438 pix_idx = g_mod_to_pix[mod_idx];

	439

	440 lsb = pix_idx & 0x1;

	441 msb = pix_idx >> 1;

	442

	443 // Obtain the texel number as specified in the standard.

	444 texel_num = idx_to_num_tab[i];

	445 pix_data \|= msb << (texel_num + 16);

	446 pix_data \|= lsb << (texel_num);

	447 }

	448

	449 WritePixelData(block, pix_data);

	450 }

	451

	452 void CompressBlock(uint8_t* dst, __sse_data* data) {

	453 /* first 3 vertical 1, seconds 3 vertical 2, third 3 horizontal 1, last 3

	454 * horizontal 2 */

	455 float __sse_avg_colors[12] = {

	456 0,

	457 };

	458 bool use_differential[2] = {true, true};

	459 GetAvgColors(data, __sse_avg_colors, use_differential);

	460 Color sub_block_avg[4];

	461

	462 /* TODO(radu.velea): remove floating point operations and use only int's +

	463 * normal

	464 * rounding and shifts */

	465 for (int i = 0, j = 1; i < 4; i += 2, j += 2) {

	466 if (use_differential[i / 2] == false) {

	467 sub_block_avg[i] = MakeColor444(&__sse_avg_colors[i * 3]);

	468 sub_block_avg[j] = MakeColor444(&__sse_avg_colors[j * 3]);

	469 } else {

	470 sub_block_avg[i] = MakeColor555(&__sse_avg_colors[i * 3]);

	471 sub_block_avg[j] = MakeColor555(&__sse_avg_colors[j * 3]);

	472 }

	473 }

	474

	475 __m128i red_avg[2], green_avg[2], blue_avg[2];

	476

	477 // TODO(radu.velea): perfect accuracy, maybe skip floating variables

	478 blue_avg[0] =

	479 _mm_set_epi32((int)__sse_avg_colors[3], (int)__sse_avg_colors[3],

	480 (int)__sse_avg_colors[0], (int)__sse_avg_colors[0]);

	481

	482 green_avg[0] =

	483 _mm_set_epi32((int)__sse_avg_colors[4], (int)__sse_avg_colors[4],

	484 (int)__sse_avg_colors[1], (int)__sse_avg_colors[1]);

	485

	486 red_avg[0] =

	487 _mm_set_epi32((int)__sse_avg_colors[5], (int)__sse_avg_colors[5],

	488 (int)__sse_avg_colors[2], (int)__sse_avg_colors[2]);

	489

	490 uint32_t vertical_error[2];

	491 GetVerticalError(data, blue_avg, green_avg, red_avg, vertical_error);

	492

	493 // TODO(radu.velea): perfect accuracy, maybe skip floating variables

	494 blue_avg[0] = _mm_set1_epi32((int)__sse_avg_colors[6]);

	495 blue_avg[1] = _mm_set1_epi32((int)__sse_avg_colors[9]);

	496

	497 green_avg[0] = _mm_set1_epi32((int)__sse_avg_colors[7]);

	498 green_avg[1] = _mm_set1_epi32((int)__sse_avg_colors[10]);

	499

	500 red_avg[0] = _mm_set1_epi32((int)__sse_avg_colors[8]);

	501 red_avg[1] = _mm_set1_epi32((int)__sse_avg_colors[11]);

	502

	503 uint32_t horizontal_error[2];

	504 GetHorizontalError(data, blue_avg, green_avg, red_avg, horizontal_error);

	505

	506 bool flip = (horizontal_error[0] + horizontal_error[1]) <

	507 (vertical_error[0] + vertical_error[1]);

	508 uint32_t* expected_errors = flip == true ? horizontal_error : vertical_error;

	509

	510 // Clear destination buffer so that we can "or" in the results.

	511 memset(dst, 0, 8);

	512

	513 WriteDiff(dst, use_differential[!!flip]);

	514 WriteFlip(dst, flip);

	515

	516 uint8_t sub_block_off_0 = flip ? 2 : 0;

	517 uint8_t sub_block_off_1 = sub_block_off_0 + 1;

	518

	519 if (use_differential[!!flip]) {

	520 WriteColors555(dst, sub_block_avg[sub_block_off_0],

	521 sub_block_avg[sub_block_off_1]);

	522 } else {

	523 WriteColors444(dst, sub_block_avg[sub_block_off_0],

	524 sub_block_avg[sub_block_off_1]);

	525 }

	526

	527 if (flip == false) {

	528 /* transpose vertical data into horizontal lines */

	529 __m128i tmp;

	530 for (int i = 0; i < 4; i += 2) {

	531 tmp = data->blue[i];

	532 data->blue[i] = _mm_add_epi32(

	533 _mm_move_epi64(data->blue[i]),

	534 _mm_shuffle_epi32(_mm_move_epi64(data->blue[i + 1]), 0x4E));

	535 data->blue[i + 1] = _mm_add_epi32(

	536 _mm_move_epi64(_mm_shuffle_epi32(tmp, 0x4E)),

	537 _mm_shuffle_epi32(

	538 _mm_move_epi64(_mm_shuffle_epi32(data->blue[i + 1], 0x4E)),

	539 0x4E));

	540

	541 tmp = data->green[i];

	542 data->green[i] = _mm_add_epi32(

	543 _mm_move_epi64(data->green[i]),

	544 _mm_shuffle_epi32(_mm_move_epi64(data->green[i + 1]), 0x4E));

	545 data->green[i + 1] = _mm_add_epi32(

	546 _mm_move_epi64(_mm_shuffle_epi32(tmp, 0x4E)),

	547 _mm_shuffle_epi32(

	548 _mm_move_epi64(_mm_shuffle_epi32(data->green[i + 1], 0x4E)),

	549 0x4E));

	550

	551 tmp = data->red[i];

	552 data->red[i] = _mm_add_epi32(

	553 _mm_move_epi64(data->red[i]),

	554 _mm_shuffle_epi32(_mm_move_epi64(data->red[i + 1]), 0x4E));

	555 data->red[i + 1] = _mm_add_epi32(

	556 _mm_move_epi64(_mm_shuffle_epi32(tmp, 0x4E)),

	557 _mm_shuffle_epi32(

	558 _mm_move_epi64(_mm_shuffle_epi32(data->red[i + 1], 0x4E)), 0x4E));

	559 }

	560

	561 tmp = data->blue[1];

	562 data->blue[1] = data->blue[2];

	563 data->blue[2] = tmp;

	564

	565 tmp = data->green[1];

	566 data->green[1] = data->green[2];

	567 data->green[2] = tmp;

	568

	569 tmp = data->red[1];

	570 data->red[1] = data->red[2];

	571 data->red[2] = tmp;

	572 }

	573

	574 // Compute luminance for the first sub block.

	575 ComputeLuminance(dst, sub_block_avg[sub_block_off_0], 0,

	576 g_idx_to_num[sub_block_off_0], data,

	577 ETC1_SET_ERROR(expected_errors[0]));

	578 // Compute luminance for the second sub block.

	579 ComputeLuminance(dst, sub_block_avg[sub_block_off_1], 1,

	580 g_idx_to_num[sub_block_off_1], data,

	581 ETC1_SET_ERROR(expected_errors[1]));

	582 }

	583

	// Copies four rows of 16 bytes each from src into dst (64 contiguous bytes).
	// Consecutive source rows are width * 4 bytes apart.
	static void ExtractBlock(uint8_t* dst, const uint8_t* src, int width) {
	  const int kRowBytes = 4 * 4;
	  const uint8_t* row = src;
	  for (int j = 0; j < 4; ++j, row += width * 4) {
	    memcpy(dst + j * kRowBytes, row, kRowBytes);
	  }
	}

	590

	// Loads the 64-byte block (four rows a, b, c, d of 16 bytes each) and
	// regroups the bytes by position within each 4-byte pixel.
	//
	// Returns false when every pixel equals the first one (solid block); in that
	// case |transposed| just holds the four rows as loaded. Otherwise returns
	// true and transposed[k] holds byte k of all 16 pixels, in row order.
	//
	// The solid test compares against the first pixel's VALUE. (It used to cast
	// the block pointer itself to an integer, so it compared against address
	// bits.)
	inline bool TransposeBlock(uint8_t* block, __m128i* transposed /* [4] */) {
	  __m128i tmp3, tmp2, tmp1, tmp0;

	  // memcpy reads the first pixel without alignment or aliasing concerns.
	  uint32_t first_pixel;
	  memcpy(&first_pixel, block, sizeof(first_pixel));
	  const __m128i test_solid = _mm_set1_epi32(static_cast<int>(first_pixel));
	  uint16_t mask = 0xFFFF;

	  transposed[0] = _mm_loadu_si128((__m128i*)(block));  // a0,a1,a2,...a7, ...a15
	  transposed[1] =
	      _mm_loadu_si128((__m128i*)(block + 16));  // b0, b1,b2,...b7.... b15
	  transposed[2] =
	      _mm_loadu_si128((__m128i*)(block + 32));  // c0, c1,c2,...c7....c15
	  transposed[3] =
	      _mm_loadu_si128((__m128i*)(block + 48));  // d0,d1,d2,...d7....d15

	  // A bit of |mask| survives only if that byte matches in all four rows.
	  for (int i = 0; i < 4; i++) {
	    mask &= _mm_movemask_epi8(_mm_cmpeq_epi8(transposed[i], test_solid));
	  }

	  if (mask == 0xFFFF) {
	    return false; /* block is solid, no need to do any more work */
	  }

	  tmp0 = _mm_unpacklo_epi8(
	      transposed[0], transposed[1]);  // a0,b0, a1,b1, a2,b2, a3,b3,....a7,b7
	  tmp1 = _mm_unpacklo_epi8(
	      transposed[2], transposed[3]);  // c0,d0, c1,d1, c2,d2, c3,d3,... c7,d7
	  tmp2 = _mm_unpackhi_epi8(
	      transposed[0],
	      transposed[1]);  // a8,b8, a9,b9, a10,b10, a11,b11,...a15,b15
	  tmp3 = _mm_unpackhi_epi8(
	      transposed[2],
	      transposed[3]);  // c8,d8, c9,d9, c10,d10, c11,d11,...c15,d15

	  transposed[0] = _mm_unpacklo_epi8(
	      tmp0, tmp2);  // a0,a8, b0,b8, a1,a9, b1,b9, ....a3,a11, b3,b11
	  transposed[1] = _mm_unpackhi_epi8(
	      tmp0, tmp2);  // a4,a12, b4,b12, a5,a13, b5,b13,....a7,a15,b7,b15
	  transposed[2] =
	      _mm_unpacklo_epi8(tmp1, tmp3);  // c0,c8, d0,d8, c1,c9, d1,d9.....d3,d11
	  transposed[3] = _mm_unpackhi_epi8(
	      tmp1, tmp3);  // c4,c12,d4,d12, c5,c13, d5,d13,....d7,d15

	  tmp0 = _mm_unpacklo_epi32(transposed[0], transposed[2]);  // a0,a8, b0,b8,
	                                                            // c0,c8, d0,d8,
	                                                            // a1,a9, b1,b9,
	                                                            // c1,c9, d1,d9
	  tmp1 = _mm_unpackhi_epi32(transposed[0], transposed[2]);  // a2,a10, b2,b10,
	                                                            // c2,c10, d2,d10,
	                                                            // a3,a11, b3,b11,
	                                                            // c3,c11, d3,d11
	  tmp2 = _mm_unpacklo_epi32(transposed[1], transposed[3]);  // a4,a12, b4,b12,
	                                                            // c4,c12, d4,d12,
	                                                            // a5,a13, b5,b13,
	                                                            // c5,c13, d5,d13,
	  tmp3 = _mm_unpackhi_epi32(transposed[1],
	                            transposed[3]);  // a6,a14, b6,b14, c6,c14, d6,d14,
	                                             // a7,a15,b7,b15,c7,c15,d7,d15

	  transposed[0] = _mm_unpacklo_epi8(tmp0, tmp2);  // a0,a4, a8, a12, b0,b4,
	                                                  // b8,b12, c0,c4, c8, c12,
	                                                  // d0,d4, d8, d12
	  transposed[1] = _mm_unpackhi_epi8(tmp0, tmp2);  // a1,a5, a9, a13, b1,b5,
	                                                  // b9,b13, c1,c5, c9, c13,
	                                                  // d1,d5, d9, d13
	  transposed[2] = _mm_unpacklo_epi8(tmp1, tmp3);  // a2,a6, a10,a14, b2,b6,
	                                                  // b10,b14, c2,c6, c10,c14,
	                                                  // d2,d6, d10,d14
	  transposed[3] = _mm_unpackhi_epi8(tmp1, tmp3);  // a3,a7, a11,a15, b3,b7,
	                                                  // b11,b15, c3,c7, c11,c15,
	                                                  // d3,d7, d11,d15
	  return true;
	}

	662

	// Zero-extends the 16 bytes of each packed[k] into four vectors of 32-bit
	// lanes: packed[0] -> red[0..3], packed[1] -> green[0..3],
	// packed[2] -> blue[0..3], packed[3] -> alpha[0..3]. Bytes 4*i .. 4*i+3 of
	// packed[k] end up in vector i of the matching output.
	inline void UnpackBlock(__m128i* packed,
	                        __m128i* red,
	                        __m128i* green,
	                        __m128i* blue,
	                        __m128i* alpha) {
	  const __m128i zero = _mm_set1_epi8(0);
	  __m128i* const planes[4] = {red, green, blue, alpha};

	  for (int k = 0; k < 4; ++k) {
	    // 8 -> 16 bits.
	    const __m128i lo16 = _mm_unpacklo_epi8(packed[k], zero);
	    const __m128i hi16 = _mm_unpackhi_epi8(packed[k], zero);

	    // 16 -> 32 bits.
	    __m128i* const out = planes[k];
	    out[0] = _mm_unpacklo_epi16(lo16, zero);
	    out[1] = _mm_unpackhi_epi16(lo16, zero);
	    out[2] = _mm_unpacklo_epi16(hi16, zero);
	    out[3] = _mm_unpackhi_epi16(hi16, zero);
	  }
	}

	711

	712 inline void CompressSolid(uint8_t* dst, uint8_t* block) {

	713 // Clear destination buffer so that we can "or" in the results.

	714 memset(dst, 0, 8);

	715

	716 const float src_color_float[3] = {static_cast<float>(block[0]),

	717 static_cast<float>(block[1]),

	718 static_cast<float>(block[2])};

	719 const Color base = MakeColor555(src_color_float);

	720 const __m128i base_v =

	721 _mm_set_epi32(0, base.channels.r, base.channels.g, base.channels.b);

	722

	723 const __m128i constant = _mm_set_epi32(0, block[2], block[1], block[0]);

	724 __m128i lum;

	725 __m128i colors[4];

	726 static const __m128i rgb =

	727 _mm_set_epi32(0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);

	728

	729 WriteDiff(dst, true);

	730 WriteFlip(dst, false);

	731

	732 WriteColors555(dst, base, base);

	733

	734 uint8_t best_tbl_idx = 0;

	735 uint8_t best_mod_idx = 0;

	736 uint32_t best_mod_err = INT32_MAX;

	737

	738 for (unsigned int tbl_idx = 0; tbl_idx < 8; ++tbl_idx) {

	739 lum = _mm_set_epi32(

	740 g_codeword_tables[tbl_idx][3], g_codeword_tables[tbl_idx][2],

	741 g_codeword_tables[tbl_idx][1], g_codeword_tables[tbl_idx][0]);

	742 colors[0] = AddAndClamp(base_v, _mm_shuffle_epi32(lum, 0x0));

	743 colors[1] = AddAndClamp(base_v, _mm_shuffle_epi32(lum, 0x55));

	744 colors[2] = AddAndClamp(base_v, _mm_shuffle_epi32(lum, 0xAA));

	745 colors[3] = AddAndClamp(base_v, _mm_shuffle_epi32(lum, 0xFF));

	746

	747 for (int i = 0; i < 4; i++) {

	748 uint32_t mod_err =

	749 SumSSE(GetColorErrorSSE(constant, _mm_and_si128(colors[i], rgb)));

	750 colors[i] = _mm_and_si128(colors[i], rgb);

	751 if (mod_err < best_mod_err) {

	752 best_tbl_idx = tbl_idx;

	753 best_mod_idx = i;

	754 best_mod_err = mod_err;

	755

	756 if (mod_err == 0) {

	757 break; // We cannot do any better than this.

	758 }

	759 }

	760 }

	761 }

	762

	763 WriteCodewordTable(dst, 0, best_tbl_idx);

	764 WriteCodewordTable(dst, 1, best_tbl_idx);

	765

	766 uint8_t pix_idx = g_mod_to_pix[best_mod_idx];

	767 uint32_t lsb = pix_idx & 0x1;

	768 uint32_t msb = pix_idx >> 1;

	769

	770 uint32_t pix_data = 0;

	771 for (unsigned int i = 0; i < 2; ++i) {

	772 for (unsigned int j = 0; j < 8; ++j) {

	773 // Obtain the texel number as specified in the standard.

	774 int texel_num = g_idx_to_num[i][j];

	775 pix_data \|= msb << (texel_num + 16);

	776 pix_data \|= lsb << (texel_num);

	777 }

	778 }

	779

	780 WritePixelData(dst, pix_data);

	781 }

	782

	783 } // namespace

	784

	785 namespace cc {

	786

	787 void TextureCompressorETC1SSE::Compress(const uint8_t* src,

	788 uint8_t* dst,

	789 int width,

	790 int height,

	791 Quality quality) {

	792 DCHECK(width >= 4 && (width & 3) == 0);

	793 DCHECK(height >= 4 && (height & 3) == 0);

	794

	795 ALIGNAS(16) uint8_t block[64];

	796 __m128i packed[4];

	797 __m128i red[4], green[4], blue[4], alpha[4];

	798 __sse_data data;

	799

	800 for (int y = 0; y < height; y += 4, src += width * 4 * 4) {

	801 for (int x = 0; x < width; x += 4, dst += 8) {

	802 ExtractBlock(block, src + x * 4, width);

	803 if (TransposeBlock(block, packed) == false) {

	804 CompressSolid(dst, block);

	805 } else {

	806 UnpackBlock(packed, blue, green, red, alpha);

	807

	808 data.block = block;

	809 data.packed = packed;

	810 data.red = red;

	811 data.blue = blue;

	812 data.green = green;

	813

	814 CompressBlock(dst, &data);

	815 }

	816 }

	817 }

	818 }

	819

	820 } // namespace cc

OLD	NEW

« no previous file with comments | « cc/resources/texture_compressor_etc1_sse.h ('k') | cc/resources/texture_compressor_util.h » ('j') | no next file with comments »