cc/resources/texture_compressor_etc1_sse.cc - Issue 1096703002: Reland: Add ETC1 powered SSE encoder for tile texture compression

Side by Side Diff: cc/resources/texture_compressor_etc1_sse.cc

Issue 1096703002: Reland: Add ETC1 powered SSE encoder for tile texture compression (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 5 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« cc/resources/texture_compressor.cc ('K') | « cc/resources/texture_compressor_etc1_sse.h ('k') | cc/resources/texture_compressor_unittest.cc » ('j') | cc/resources/texture_compressor_unittest.cc » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
(Empty)
	1 // Copyright 2015 The Chromium Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4

	5 #include "texture_compressor_etc1_sse.h"

	6

	7 #include <assert.h>

	8 #include <emmintrin.h>

	9 #include <stdio.h>

	10 #include <stdlib.h>

	11 #include <string.h>

	12 #include <time.h>

	13

	14 #include <cmath>

	15 #include <limits>

	16 #include <sstream>

	17

	18 #include "base/compiler_specific.h"

	19 #include "base/logging.h"

	20 #include "cc/resources/texture_compressor_util.h"

	21

	22 using namespace cc;

	23

	24 namespace {

	25

	/* Inflates an estimated sub-block error to x + x / 2 + 384. The result is
	 * handed to ComputeLuminance() as its early-exit threshold. Every use of the
	 * argument is parenthesized so that compound expressions expand correctly. */
	#define ETC1_SET_ERROR(x) ((x) + (x) / 2 + 384)

	27

	/* Working set for one block, shared by the compression helpers. The struct
	 * only stores pointers; it does not own any of the buffers. */
	struct __sse_data {
	  // Raw bytes of the block.
	  uint8_t* block;
	  // Values packed as 8 bits each.
	  __m128i* packed;
	  // One array per colour channel, values zero extended to 32 bits (4x4).
	  __m128i* blue;
	  __m128i* green;
	  __m128i* red;
	};

	39

	/* Constant registers shared by the helpers below: all elements zero, and
	 * all 32-bit elements set to the largest signed value. */
	static const __m128i __sse_zero = _mm_setzero_si128();
	static const __m128i __sse_max_int = _mm_set1_epi32(0x7FFFFFFF);

	43

	/* Adds x and y using 16-bit lane arithmetic, then limits every lane from
	 * above by the matching lane of a register holding 0xFF in each 32-bit
	 * element and from below by zero. */
	inline __m128i AddAndClamp(const __m128i x, const __m128i y) {
	  const __m128i lower_bound = _mm_setzero_si128();
	  const __m128i upper_bound = _mm_set1_epi32(0xFF);
	  const __m128i sum = _mm_add_epi16(x, y);
	  const __m128i capped = _mm_min_epi16(sum, upper_bound);
	  return _mm_max_epi16(lower_bound, capped);
	}

	49

	/* Returns the squared difference of x and y, computed per 16-bit lane
	 * (only the low 16 bits of each product are kept). */
	inline __m128i GetColorErrorSSE(const __m128i x, const __m128i y) {
	  const __m128i difference = _mm_sub_epi16(x, y);
	  const __m128i squared = _mm_mullo_epi16(difference, difference);
	  return squared;
	}

	55

	/* Adds the three registers element-wise as 32-bit integers. */
	inline __m128i AddChannelError(const __m128i x,
	                               const __m128i y,
	                               const __m128i z) {
	  const __m128i y_plus_z = _mm_add_epi32(y, z);
	  return _mm_add_epi32(x, y_plus_z);
	}

	61

	/* Returns the sum of the four 32-bit elements of x. */
	inline uint32_t SumSSE(const __m128i x) {
	  // 0x4E swaps the two 64-bit halves: elements 0 and 1 become x0+x2, x1+x3.
	  const __m128i halves = _mm_add_epi32(x, _mm_shuffle_epi32(x, 0x4E));
	  // 0xB1 swaps neighbouring elements, so element 0 ends up with the total.
	  const __m128i total =
	      _mm_add_epi32(halves, _mm_shuffle_epi32(halves, 0xB1));
	  return _mm_cvtsi128_si32(total);
	}

	68

	69 inline uint32_t GetVerticalError(const __sse_data* data,

	70 const __m128i* blue_avg,

	71 const __m128i* green_avg,

	72 const __m128i* red_avg,

	73 uint32_t* verror) {

	74 __m128i error = __sse_zero;

	75

	76 for (int i = 0; i < 4; i++) {

	77 error = _mm_add_epi32(error, GetColorErrorSSE(data->blue[i], blue_avg[0]));

	78 error =

	79 _mm_add_epi32(error, GetColorErrorSSE(data->green[i], green_avg[0]));

	80 error = _mm_add_epi32(error, GetColorErrorSSE(data->red[i], red_avg[0]));

	81 }

	82

	83 error = _mm_add_epi32(error, _mm_shuffle_epi32(error, 0x4E));

	84

	85 verror[0] = _mm_cvtsi128_si32(error);

	86 verror[1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(error, 0xB1));

	87

	88 return verror[0] + verror[1];

	89 }

	90

	91 inline uint32_t GetHorizontalError(const __sse_data* data,

	92 const __m128i* blue_avg,

	93 const __m128i* green_avg,

	94 const __m128i* red_avg,

	95 uint32_t* verror) {

	96 __m128i error = __sse_zero;

	97 int first_index, second_index;

	98

	99 for (int i = 0; i < 2; i++) {

	100 first_index = 2 * i;

	101 second_index = first_index + 1;

	102

	103 error = _mm_add_epi32(

	104 error, GetColorErrorSSE(data->blue[first_index], blue_avg[i]));

	105 error = _mm_add_epi32(

	106 error, GetColorErrorSSE(data->blue[second_index], blue_avg[i]));

	107 error = _mm_add_epi32(

	108 error, GetColorErrorSSE(data->green[first_index], green_avg[i]));

	109 error = _mm_add_epi32(

	110 error, GetColorErrorSSE(data->green[second_index], green_avg[i]));

	111 error = _mm_add_epi32(error,

	112 GetColorErrorSSE(data->red[first_index], red_avg[i]));

	113 error = _mm_add_epi32(

	114 error, GetColorErrorSSE(data->red[second_index], red_avg[i]));

	115 }

	116

	117 error = _mm_add_epi32(error, _mm_shuffle_epi32(error, 0x4E));

	118

	119 verror[0] = _mm_cvtsi128_si32(error);

	120 verror[1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(error, 0xB1));

	121

	122 return verror[0] + verror[1];

	123 }

	124

	125 inline void GetAvgColors(const __sse_data* data,

	126 float* output,

	127 bool* __sse_use_diff) {

	128 __m128i sum[2], tmp;

	129

	130 // TODO(radu.velea): _mm_avg_epu8 on packed data maybe

	131

	132 /* get avg red */

	133 /* [S0 S0 S1 S1] */

	134 sum[0] = _mm_add_epi32(data->red[0], data->red[1]);

	135 sum[0] = _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0xB1));

	136

	137 /* [S2 S2 S3 S3] */

	138 sum[1] = _mm_add_epi32(data->red[2], data->red[3]);

	139 sum[1] = _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0xB1));

	140

	141 float hred[2], vred[2];

	142 hred[0] = (_mm_cvtsi128_si32(

	143 _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0x4E)))) /

	144 8.0f;

	145 hred[1] = (_mm_cvtsi128_si32(

	146 _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0x4E)))) /

	147 8.0f;

	148

	149 tmp = _mm_add_epi32(sum[0], sum[1]);

	150 vred[0] = (_mm_cvtsi128_si32(tmp)) / 8.0f;

	151 vred[1] = (_mm_cvtsi128_si32(_mm_shuffle_epi32(tmp, 0x2))) / 8.0f;

	152

	153 /* get avg green */

	154 /* [S0 S0 S1 S1] */

	155 sum[0] = _mm_add_epi32(data->green[0], data->green[1]);

	156 sum[0] = _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0xB1));

	157

	158 /* [S2 S2 S3 S3] */

	159 sum[1] = _mm_add_epi32(data->green[2], data->green[3]);

	160 sum[1] = _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0xB1));

	161

	162 float hgreen[2], vgreen[2];

	163 hgreen[0] = (_mm_cvtsi128_si32(

	164 _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0x4E)))) /

	165 8.0f;

	166 hgreen[1] = (_mm_cvtsi128_si32(

	167 _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0x4E)))) /

	168 8.0f;

	169

	170 tmp = _mm_add_epi32(sum[0], sum[1]);

	171 vgreen[0] = (_mm_cvtsi128_si32(tmp)) / 8.0f;

	172 vgreen[1] = (_mm_cvtsi128_si32(_mm_shuffle_epi32(tmp, 0x2))) / 8.0f;

	173

	174 /* get avg blue */

	175 /* [S0 S0 S1 S1] */

	176 sum[0] = _mm_add_epi32(data->blue[0], data->blue[1]);

	177 sum[0] = _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0xB1));

	178

	179 /* [S2 S2 S3 S3] */

	180 sum[1] = _mm_add_epi32(data->blue[2], data->blue[3]);

	181 sum[1] = _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0xB1));

	182

	183 float hblue[2], vblue[2];

	184 hblue[0] = (_mm_cvtsi128_si32(

	185 _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0x4E)))) /

	186 8.0f;

	187 hblue[1] = (_mm_cvtsi128_si32(

	188 _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0x4E)))) /

	189 8.0f;

	190

	191 tmp = _mm_add_epi32(sum[0], sum[1]);

	192 vblue[0] = (_mm_cvtsi128_si32(tmp)) / 8.0f;

	193 vblue[1] = (_mm_cvtsi128_si32(_mm_shuffle_epi32(tmp, 0x2))) / 8.0f;

	194

	195 /* TODO(radu.velea): return int's instead of floats */

	196 output[0] = vblue[0];

	197 output[1] = vgreen[0];

	198 output[2] = vred[0];

	199

	200 output[3] = vblue[1];

	201 output[4] = vgreen[1];

	202 output[5] = vred[1];

	203

	204 output[6] = hblue[0];

	205 output[7] = hgreen[0];

	206 output[8] = hred[0];

	207

	208 output[9] = hblue[1];

	209 output[10] = hgreen[1];

	210 output[11] = hred[1];

	211

	212 __m128i threshold_upper = _mm_set1_epi32(3);

	213 __m128i threshold_lower = _mm_set1_epi32(-4);

	214

	215 __m128 factor_v = _mm_set1_ps(31.0f / 255.0f);

	216 __m128 rounding_v = _mm_set1_ps(0.5f);

	217 __m128 h_avg_0 = _mm_set_ps(hblue[0], hgreen[0], hred[0], 0);

	218 __m128 h_avg_1 = _mm_set_ps(hblue[1], hgreen[1], hred[1], 0);

	219

	220 __m128 v_avg_0 = _mm_set_ps(vblue[0], vgreen[0], vred[0], 0);

	221 __m128 v_avg_1 = _mm_set_ps(vblue[1], vgreen[1], vred[1], 0);

	222

	223 h_avg_0 = _mm_mul_ps(h_avg_0, factor_v);

	224 h_avg_1 = _mm_mul_ps(h_avg_1, factor_v);

	225 v_avg_0 = _mm_mul_ps(v_avg_0, factor_v);

	226 v_avg_1 = _mm_mul_ps(v_avg_1, factor_v);

	227

	228 h_avg_0 = _mm_add_ps(h_avg_0, rounding_v);

	229 h_avg_1 = _mm_add_ps(h_avg_1, rounding_v);

	230 v_avg_0 = _mm_add_ps(v_avg_0, rounding_v);

	231 v_avg_1 = _mm_add_ps(v_avg_1, rounding_v);

	232

	233 __m128i h_avg_0i = _mm_cvttps_epi32(h_avg_0);

	234 __m128i h_avg_1i = _mm_cvttps_epi32(h_avg_1);

	235

	236 __m128i v_avg_0i = _mm_cvttps_epi32(v_avg_0);

	237 __m128i v_avg_1i = _mm_cvttps_epi32(v_avg_1);

	238

	239 h_avg_0i = _mm_sub_epi32(h_avg_1i, h_avg_0i);

	240 v_avg_0i = _mm_sub_epi32(v_avg_1i, v_avg_0i);

	241

	242 __sse_use_diff[0] =

	243 (0 == _mm_movemask_epi8(_mm_cmplt_epi32(v_avg_0i, threshold_lower)));

	244 __sse_use_diff[0] &=

	245 (0 == _mm_movemask_epi8(_mm_cmpgt_epi32(v_avg_0i, threshold_upper)));

	246

	247 __sse_use_diff[1] =

	248 (0 == _mm_movemask_epi8(_mm_cmplt_epi32(h_avg_0i, threshold_lower)));

	249 __sse_use_diff[1] &=

	250 (0 == _mm_movemask_epi8(_mm_cmpgt_epi32(h_avg_0i, threshold_upper)));

	251 }

	252

	253 void ComputeLuminance(uint8_t* block,

	254 const Color& base,

	255 const int sub_block_id,

	256 const uint8_t* idx_to_num_tab,

	257 const __sse_data* data,

	258 const uint32_t expected_error) {

	259 uint8_t best_tbl_idx = 0;

	260 uint32_t best_error = 0x7FFFFFFF;

	261 uint8_t best_mod_idx[8][8]; // [table][texel]

	262

	263 const __m128i base_blue = _mm_set1_epi32(base.channels.b);

	264 const __m128i base_green = _mm_set1_epi32(base.channels.g);

	265 const __m128i base_red = _mm_set1_epi32(base.channels.r);

	266

	267 __m128i test_red, test_blue, test_green, tmp, tmp_blue, tmp_green, tmp_red;

	268 __m128i block_error, mask;

	269

	270 /* this will have the minimum errors for each 4 pixels */

	271 __m128i first_half_min;

	272 __m128i second_half_min;

	273

	274 /* this will have the matching table index combo for each 4 pixels */

	275 __m128i first_half_pattern;

	276 __m128i second_half_pattern;

	277

	278 const __m128i first_blue_data_block = data->blue[2 * sub_block_id];

	279 const __m128i first_green_data_block = data->green[2 * sub_block_id];

	280 const __m128i first_red_data_block = data->red[2 * sub_block_id];

	281

	282 const __m128i second_blue_data_block = data->blue[2 * sub_block_id + 1];

	283 const __m128i second_green_data_block = data->green[2 * sub_block_id + 1];

	284 const __m128i second_red_data_block = data->red[2 * sub_block_id + 1];

	285

	286 uint32_t min;

	287 /* fail early to increase speed */

	288 long delta = INT32_MAX;

	289 uint32_t last_min = INT32_MAX;

	290

	291 const uint8_t shuffle_mask[] = {

	292 0x1B, 0x4E, 0xB1, 0xE4}; /* important they are sorted ascending */

	293

	294 for (unsigned int tbl_idx = 0; tbl_idx < 8; ++tbl_idx) {

	295 tmp = _mm_set_epi32(

	296 g_codeword_tables[tbl_idx][3], g_codeword_tables[tbl_idx][2],

	297 g_codeword_tables[tbl_idx][1], g_codeword_tables[tbl_idx][0]);

	298

	299 test_blue = AddAndClamp(tmp, base_blue);

	300 test_green = AddAndClamp(tmp, base_green);

	301 test_red = AddAndClamp(tmp, base_red);

	302

	303 first_half_min = __sse_max_int;

	304 second_half_min = __sse_max_int;

	305

	306 first_half_pattern = __sse_zero;

	307 second_half_pattern = __sse_zero;

	308

	309 for (uint8_t imm8 : shuffle_mask) {

	310 switch (imm8) {

	311 case 0x1B:

	312 tmp_blue = _mm_shuffle_epi32(test_blue, 0x1B);

	313 tmp_green = _mm_shuffle_epi32(test_green, 0x1B);

	314 tmp_red = _mm_shuffle_epi32(test_red, 0x1B);

	315 break;

	316 case 0x4E:

	317 tmp_blue = _mm_shuffle_epi32(test_blue, 0x4E);

	318 tmp_green = _mm_shuffle_epi32(test_green, 0x4E);

	319 tmp_red = _mm_shuffle_epi32(test_red, 0x4E);

	320 break;

	321 case 0xB1:

	322 tmp_blue = _mm_shuffle_epi32(test_blue, 0xB1);

	323 tmp_green = _mm_shuffle_epi32(test_green, 0xB1);

	324 tmp_red = _mm_shuffle_epi32(test_red, 0xB1);

	325 break;

	326 case 0xE4:

	327 tmp_blue = _mm_shuffle_epi32(test_blue, 0xE4);

	328 tmp_green = _mm_shuffle_epi32(test_green, 0xE4);

	329 tmp_red = _mm_shuffle_epi32(test_red, 0xE4);

	330 break;

	331 default:

	332 tmp_blue = test_blue;

	333 tmp_green = test_green;

	334 tmp_red = test_red;

	335 }

	336

	337 tmp = _mm_set1_epi32(imm8);

	338

	339 block_error =

	340 AddChannelError(GetColorErrorSSE(tmp_blue, first_blue_data_block),

	341 GetColorErrorSSE(tmp_green, first_green_data_block),

	342 GetColorErrorSSE(tmp_red, first_red_data_block));

	343

	344 /* save winning pattern */

	345 first_half_pattern = _mm_max_epi16(

	346 first_half_pattern,

	347 _mm_and_si128(tmp, _mm_cmpgt_epi32(first_half_min, block_error)));

	348 /* should use _mm_min_epi32(first_half_min, block_error); otherwise

	349 * performance penalty */

	350 mask = _mm_cmplt_epi32(block_error, first_half_min);

	351 first_half_min = _mm_add_epi32(_mm_and_si128(mask, block_error),

	352 _mm_andnot_si128(mask, first_half_min));

	353

	354 /* Second part of the block */

	355 block_error =

	356 AddChannelError(GetColorErrorSSE(tmp_blue, second_blue_data_block),

	357 GetColorErrorSSE(tmp_green, second_green_data_block),

	358 GetColorErrorSSE(tmp_red, second_red_data_block));

	359

	360 /* save winning pattern */

	361 second_half_pattern = _mm_max_epi16(

	362 second_half_pattern,

	363 _mm_and_si128(tmp, _mm_cmpgt_epi32(second_half_min, block_error)));

	364 /* should use _mm_min_epi32(second_half_min, block_error); otherwise

	365 * performance penalty */

	366 mask = _mm_cmplt_epi32(block_error, second_half_min);

	367 second_half_min = _mm_add_epi32(_mm_and_si128(mask, block_error),

	368 _mm_andnot_si128(mask, second_half_min));

	369 }

	370

	371 first_half_min = _mm_add_epi32(first_half_min, second_half_min);

	372 first_half_min =

	373 _mm_add_epi32(first_half_min, _mm_shuffle_epi32(first_half_min, 0x4E));

	374 first_half_min =

	375 _mm_add_epi32(first_half_min, _mm_shuffle_epi32(first_half_min, 0xB1));

	376

	377 min = _mm_cvtsi128_si32(first_half_min);

	378

	379 delta = min - last_min;

	380 last_min = min;

	381

	382 if (min < best_error) {

	383 best_tbl_idx = tbl_idx;

	384 best_error = min;

	385

	386 best_mod_idx[tbl_idx][0] =

	387 (_mm_cvtsi128_si32(first_half_pattern) >> (0)) & 3;

	388 best_mod_idx[tbl_idx][4] =

	389 (_mm_cvtsi128_si32(second_half_pattern) >> (0)) & 3;

	390

	391 best_mod_idx[tbl_idx][1] =

	392 (_mm_cvtsi128_si32(_mm_shuffle_epi32(first_half_pattern, 0x1)) >>

	393 (2)) &

	394 3;

	395 best_mod_idx[tbl_idx][5] =

	396 (_mm_cvtsi128_si32(_mm_shuffle_epi32(second_half_pattern, 0x1)) >>

	397 (2)) &

	398 3;

	399

	400 best_mod_idx[tbl_idx][2] =

	401 (_mm_cvtsi128_si32(_mm_shuffle_epi32(first_half_pattern, 0x2)) >>

	402 (4)) &

	403 3;

	404 best_mod_idx[tbl_idx][6] =

	405 (_mm_cvtsi128_si32(_mm_shuffle_epi32(second_half_pattern, 0x2)) >>

	406 (4)) &

	407 3;

	408

	409 best_mod_idx[tbl_idx][3] =

	410 (_mm_cvtsi128_si32(_mm_shuffle_epi32(first_half_pattern, 0x3)) >>

	411 (6)) &

	412 3;

	413 best_mod_idx[tbl_idx][7] =

	414 (_mm_cvtsi128_si32(_mm_shuffle_epi32(second_half_pattern, 0x3)) >>

	415 (6)) &

	416 3;

	417

	418 if (best_error == 0) {

	419 break;

	420 }

	421 } else if (delta > 0 && expected_error < min) {

	422 /* error is growing and is well beyond expected error */

	423 break;

	424 }

	425 }

	426

	427 WriteCodewordTable(block, sub_block_id, best_tbl_idx);

	428

	429 uint32_t pix_data = 0;

	430 uint8_t mod_idx;

	431 uint8_t pix_idx;

	432 uint32_t lsb;

	433 uint32_t msb;

	434 int texel_num;

	435

	436 for (unsigned int i = 0; i < 8; ++i) {

	437 mod_idx = best_mod_idx[best_tbl_idx][i];

	438 pix_idx = g_mod_to_pix[mod_idx];

	439

	440 lsb = pix_idx & 0x1;

	441 msb = pix_idx >> 1;

	442

	443 // Obtain the texel number as specified in the standard.

	444 texel_num = idx_to_num_tab[i];

	445 pix_data \|= msb << (texel_num + 16);

	446 pix_data \|= lsb << (texel_num);

	447 }

	448

	449 WritePixelData(block, pix_data);

	450 }

	451

	452 void CompressBlock(uint8_t* dst, __sse_data* data) {

	453 /* first 3 vertical 1, seconds 3 vertical 2, third 3 horizontal 1, last 3

	454 * horizontal 2 */

	455 float __sse_avg_colors[12] = {

	456 0,

	457 };

	458 bool use_differential[2] = {true, true};

	459 GetAvgColors(data, __sse_avg_colors, use_differential);

	460 Color sub_block_avg[4];

	461

	462 /* TODO(radu.velea): remove floating point operations and use only int's +

	463 * normal

	464 * rounding and shifts */

	465 for (int i = 0, j = 1; i < 4; i += 2, j += 2) {

	466 if (use_differential[i / 2] == false) {

	467 sub_block_avg[i] = MakeColor444(&__sse_avg_colors[i * 3]);

	468 sub_block_avg[j] = MakeColor444(&__sse_avg_colors[j * 3]);

	469 } else {

	470 sub_block_avg[i] = MakeColor555(&__sse_avg_colors[i * 3]);

	471 sub_block_avg[j] = MakeColor555(&__sse_avg_colors[j * 3]);

	472 }

	473 }

	474

	475 __m128i red_avg[2], green_avg[2], blue_avg[2];

	476

	477 // TODO(radu.velea): perfect accuracy, maybe skip floating variables

	478 blue_avg[0] =

	479 _mm_set_epi32((int)__sse_avg_colors[3], (int)__sse_avg_colors[3],

	480 (int)__sse_avg_colors[0], (int)__sse_avg_colors[0]);

	481

	482 green_avg[0] =

	483 _mm_set_epi32((int)__sse_avg_colors[4], (int)__sse_avg_colors[4],

	484 (int)__sse_avg_colors[1], (int)__sse_avg_colors[1]);

	485

	486 red_avg[0] =

	487 _mm_set_epi32((int)__sse_avg_colors[5], (int)__sse_avg_colors[5],

	488 (int)__sse_avg_colors[2], (int)__sse_avg_colors[2]);

	489

	490 uint32_t vertical_error[2];

	491 GetVerticalError(data, blue_avg, green_avg, red_avg, vertical_error);

	492

	493 // TODO(radu.velea): perfect accuracy, maybe skip floating variables

	494 blue_avg[0] = _mm_set1_epi32((int)__sse_avg_colors[6]);

	495 blue_avg[1] = _mm_set1_epi32((int)__sse_avg_colors[9]);

	496

	497 green_avg[0] = _mm_set1_epi32((int)__sse_avg_colors[7]);

	498 green_avg[1] = _mm_set1_epi32((int)__sse_avg_colors[10]);

	499

	500 red_avg[0] = _mm_set1_epi32((int)__sse_avg_colors[8]);

	501 red_avg[1] = _mm_set1_epi32((int)__sse_avg_colors[11]);

	502

	503 uint32_t horizontal_error[2];

	504 GetHorizontalError(data, blue_avg, green_avg, red_avg, horizontal_error);

	505

	506 bool flip = (horizontal_error[0] + horizontal_error[1]) <

	507 (vertical_error[0] + vertical_error[1]);

	508 uint32_t* expected_errors = flip == true ? horizontal_error : vertical_error;

	509

	510 // Clear destination buffer so that we can "or" in the results.

	511 memset(dst, 0, 8);

	512

	513 WriteDiff(dst, use_differential[!!flip]);

	514 WriteFlip(dst, flip);

	515

	516 uint8_t sub_block_off_0 = flip ? 2 : 0;

	517 uint8_t sub_block_off_1 = sub_block_off_0 + 1;

	518

	519 if (use_differential[!!flip]) {

	520 WriteColors555(dst, sub_block_avg[sub_block_off_0],

	521 sub_block_avg[sub_block_off_1]);

	522 } else {

	523 WriteColors444(dst, sub_block_avg[sub_block_off_0],

	524 sub_block_avg[sub_block_off_1]);

	525 }

	526

	527 if (flip == false) {

	528 /* transpose vertical data into horizontal lines */

	529 __m128i tmp;

	530 for (int i = 0; i < 4; i += 2) {

	531 tmp = data->blue[i];

	532 data->blue[i] = _mm_add_epi32(

	533 _mm_move_epi64(data->blue[i]),

	534 _mm_shuffle_epi32(_mm_move_epi64(data->blue[i + 1]), 0x4E));

	535 data->blue[i + 1] = _mm_add_epi32(

	536 _mm_move_epi64(_mm_shuffle_epi32(tmp, 0x4E)),

	537 _mm_shuffle_epi32(

	538 _mm_move_epi64(_mm_shuffle_epi32(data->blue[i + 1], 0x4E)),

	539 0x4E));

	540

	541 tmp = data->green[i];

	542 data->green[i] = _mm_add_epi32(

	543 _mm_move_epi64(data->green[i]),

	544 _mm_shuffle_epi32(_mm_move_epi64(data->green[i + 1]), 0x4E));

	545 data->green[i + 1] = _mm_add_epi32(

	546 _mm_move_epi64(_mm_shuffle_epi32(tmp, 0x4E)),

	547 _mm_shuffle_epi32(

	548 _mm_move_epi64(_mm_shuffle_epi32(data->green[i + 1], 0x4E)),

	549 0x4E));

	550

	551 tmp = data->red[i];

	552 data->red[i] = _mm_add_epi32(

	553 _mm_move_epi64(data->red[i]),

	554 _mm_shuffle_epi32(_mm_move_epi64(data->red[i + 1]), 0x4E));

	555 data->red[i + 1] = _mm_add_epi32(

	556 _mm_move_epi64(_mm_shuffle_epi32(tmp, 0x4E)),

	557 _mm_shuffle_epi32(

	558 _mm_move_epi64(_mm_shuffle_epi32(data->red[i + 1], 0x4E)), 0x4E));

	559 }

	560

	561 tmp = data->blue[1];

	562 data->blue[1] = data->blue[2];

	563 data->blue[2] = tmp;

	564

	565 tmp = data->green[1];

	566 data->green[1] = data->green[2];

	567 data->green[2] = tmp;

	568

	569 tmp = data->red[1];

	570 data->red[1] = data->red[2];

	571 data->red[2] = tmp;

	572 }

	573

	574 // Compute luminance for the first sub block.

	575 ComputeLuminance(dst, sub_block_avg[sub_block_off_0], 0,

	576 g_idx_to_num[sub_block_off_0], data,

	577 ETC1_SET_ERROR(expected_errors[0]));

	578 // Compute luminance for the second sub block.

	579 ComputeLuminance(dst, sub_block_avg[sub_block_off_1], 1,

	580 g_idx_to_num[sub_block_off_1], data,

	581 ETC1_SET_ERROR(expected_errors[1]));

	582 }

	583

	/* Copies a 4x4 block of 4-byte pixels from an image that is width pixels
	 * wide into 64 contiguous bytes at dst. src points at the first pixel of the
	 * block. */
	static void ExtractBlock(uint8_t* dst, const uint8_t* src, int width) {
	  const int block_row_bytes = 4 * 4;
	  const int image_row_bytes = width * 4;
	  for (int row = 0; row < 4; ++row) {
	    memcpy(dst + row * block_row_bytes, src + row * image_row_bytes,
	           block_row_bytes);
	  }
	}

	590

	/* Transforms an incoming block of sixteen 4-byte pixels (64 bytes) into 4
	 * registers, each containing the data of a single channel: transposed[0]
	 * holds byte 0 of every pixel, transposed[1] byte 1, and so on. The values
	 * are packed as 8 bit unsigned values in the SSE registers.
	 *
	 * Before doing any work the block is checked for being solid, i.e. all
	 * sixteen pixels equal the first one. In that case false is returned and
	 * transposed holds the four raw 16-byte rows instead of channel planes.
	 * Otherwise true is returned.
	 *
	 * Fix over the previous version: the reference value for the solid check
	 * was built from the pointer itself instead of the first pixel it points
	 * to, so solid blocks were not detected. */
	static inline bool TransposeBlock(uint8_t* block,
	                                  __m128i* transposed /* [4] */) {
	  __m128i tmp3, tmp2, tmp1, tmp0;

	  // Read the first pixel with memcpy: no alignment or aliasing assumptions.
	  uint32_t first_pixel;
	  memcpy(&first_pixel, block, sizeof(first_pixel));
	  const __m128i test_solid = _mm_set1_epi32((int)first_pixel);
	  uint16_t mask = 0xFFFF;

	  // a0,a1,a2,...a7, ...a15
	  transposed[0] = _mm_loadu_si128((__m128i*)(block));
	  // b0, b1,b2,...b7.... b15
	  transposed[1] = _mm_loadu_si128((__m128i*)(block + 16));
	  // c0, c1,c2,...c7....c15
	  transposed[2] = _mm_loadu_si128((__m128i*)(block + 32));
	  // d0,d1,d2,...d7....d15
	  transposed[3] = _mm_loadu_si128((__m128i*)(block + 48));

	  // A mask bit stays set only if that byte matches in all four rows.
	  for (int i = 0; i < 4; i++) {
	    mask &= _mm_movemask_epi8(_mm_cmpeq_epi8(transposed[i], test_solid));
	  }

	  if (mask == 0xFFFF) {
	    return false; /* block is solid, no need to do any more work */
	  }

	  // a0,b0, a1,b1, a2,b2, a3,b3,....a7,b7
	  tmp0 = _mm_unpacklo_epi8(transposed[0], transposed[1]);
	  // c0,d0, c1,d1, c2,d2, c3,d3,... c7,d7
	  tmp1 = _mm_unpacklo_epi8(transposed[2], transposed[3]);
	  // a8,b8, a9,b9, a10,b10, a11,b11,...a15,b15
	  tmp2 = _mm_unpackhi_epi8(transposed[0], transposed[1]);
	  // c8,d8, c9,d9, c10,d10, c11,d11,...c15,d15
	  tmp3 = _mm_unpackhi_epi8(transposed[2], transposed[3]);

	  // a0,a8, b0,b8, a1,a9, b1,b9, ....a3,a11, b3,b11
	  transposed[0] = _mm_unpacklo_epi8(tmp0, tmp2);
	  // a4,a12, b4,b12, a5,a13, b5,b13,....a7,a15,b7,b15
	  transposed[1] = _mm_unpackhi_epi8(tmp0, tmp2);
	  // c0,c8, d0,d8, c1,c9, d1,d9.....d3,d11
	  transposed[2] = _mm_unpacklo_epi8(tmp1, tmp3);
	  // c4,c12,d4,d12, c5,c13, d5,d13,....d7,d15
	  transposed[3] = _mm_unpackhi_epi8(tmp1, tmp3);

	  // a0,a8, b0,b8, c0,c8, d0,d8, a1,a9, b1,b9, c1,c9, d1,d9
	  tmp0 = _mm_unpacklo_epi32(transposed[0], transposed[2]);
	  // a2,a10, b2,b10, c2,c10, d2,d10, a3,a11, b3,b11, c3,c11, d3,d11
	  tmp1 = _mm_unpackhi_epi32(transposed[0], transposed[2]);
	  // a4,a12, b4,b12, c4,c12, d4,d12, a5,a13, b5,b13, c5,c13, d5,d13
	  tmp2 = _mm_unpacklo_epi32(transposed[1], transposed[3]);
	  // a6,a14, b6,b14, c6,c14, d6,d14, a7,a15, b7,b15, c7,c15, d7,d15
	  tmp3 = _mm_unpackhi_epi32(transposed[1], transposed[3]);

	  // a0,a4, a8,a12, b0,b4, b8,b12, c0,c4, c8,c12, d0,d4, d8,d12
	  transposed[0] = _mm_unpacklo_epi8(tmp0, tmp2);
	  // a1,a5, a9,a13, b1,b5, b9,b13, c1,c5, c9,c13, d1,d5, d9,d13
	  transposed[1] = _mm_unpackhi_epi8(tmp0, tmp2);
	  // a2,a6, a10,a14, b2,b6, b10,b14, c2,c6, c10,c14, d2,d6, d10,d14
	  transposed[2] = _mm_unpacklo_epi8(tmp1, tmp3);
	  // a3,a7, a11,a15, b3,b7, b11,b15, c3,c7, c11,c15, d3,d7, d11,d15
	  transposed[3] = _mm_unpackhi_epi8(tmp1, tmp3);

	  return true;
	}

	659

	/* Widens four registers of sixteen 8-bit values into four arrays of four
	 * registers, each holding four zero-extended 32-bit values.
	 * packed[0] goes to red, packed[1] to green, packed[2] to blue and
	 * packed[3] to alpha; within an output array, register i receives bytes
	 * 4*i .. 4*i+3 of the source register.
	 * NOTE(review): the parameter names describe RGBA plane order; a caller
	 * with a different plane order must pass its arrays accordingly. */
	inline void UnpackBlock(__m128i* packed,
	                        __m128i* red,
	                        __m128i* green,
	                        __m128i* blue,
	                        __m128i* alpha) {
	  const __m128i zero = _mm_setzero_si128();
	  __m128i* const outputs[4] = {red, green, blue, alpha};

	  for (int channel = 0; channel < 4; ++channel) {
	    // 8 -> 16 bits: interleaving with zero bytes zero-extends each value.
	    const __m128i low_half = _mm_unpacklo_epi8(packed[channel], zero);
	    const __m128i high_half = _mm_unpackhi_epi8(packed[channel], zero);

	    // 16 -> 32 bits, same trick with zero words.
	    __m128i* out = outputs[channel];
	    out[0] = _mm_unpacklo_epi16(low_half, zero);
	    out[1] = _mm_unpackhi_epi16(low_half, zero);
	    out[2] = _mm_unpacklo_epi16(high_half, zero);
	    out[3] = _mm_unpackhi_epi16(high_half, zero);
	  }
	}

	708

	709 inline void CompressSolid(uint8_t* dst, uint8_t* block) {

	710 // Clear destination buffer so that we can "or" in the results.

	711 memset(dst, 0, 8);

	712

	713 const float src_color_float[3] = {static_cast<float>(block[0]),

	714 static_cast<float>(block[1]),

	715 static_cast<float>(block[2])};

	716 const Color base = MakeColor555(src_color_float);

	717 const __m128i base_v =

	718 _mm_set_epi32(0, base.channels.r, base.channels.g, base.channels.b);

	719

	720 const __m128i constant = _mm_set_epi32(0, block[2], block[1], block[0]);

	721 __m128i lum;

	722 __m128i colors[4];

	723 static const __m128i rgb =

	724 _mm_set_epi32(0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);

	725

	726 WriteDiff(dst, true);

	727 WriteFlip(dst, false);

	728

	729 WriteColors555(dst, base, base);

	730

	731 uint8_t best_tbl_idx = 0;

	732 uint8_t best_mod_idx = 0;

	733 uint32_t best_mod_err = INT32_MAX;

	734

	735 for (unsigned int tbl_idx = 0; tbl_idx < 8; ++tbl_idx) {

	736 lum = _mm_set_epi32(

	737 g_codeword_tables[tbl_idx][3], g_codeword_tables[tbl_idx][2],

	738 g_codeword_tables[tbl_idx][1], g_codeword_tables[tbl_idx][0]);

	739 colors[0] = AddAndClamp(base_v, _mm_shuffle_epi32(lum, 0x0));

	740 colors[1] = AddAndClamp(base_v, _mm_shuffle_epi32(lum, 0x55));

	741 colors[2] = AddAndClamp(base_v, _mm_shuffle_epi32(lum, 0xAA));

	742 colors[3] = AddAndClamp(base_v, _mm_shuffle_epi32(lum, 0xFF));

	743

	744 for (int i = 0; i < 4; i++) {

	745 uint32_t mod_err =

	746 SumSSE(GetColorErrorSSE(constant, _mm_and_si128(colors[i], rgb)));

	747 colors[i] = _mm_and_si128(colors[i], rgb);

	748 if (mod_err < best_mod_err) {

	749 best_tbl_idx = tbl_idx;

	750 best_mod_idx = i;

	751 best_mod_err = mod_err;

	752

	753 if (mod_err == 0) {

	754 break; // We cannot do any better than this.

	755 }

	756 }

	757 }

	758 }

	759

	760 WriteCodewordTable(dst, 0, best_tbl_idx);

	761 WriteCodewordTable(dst, 1, best_tbl_idx);

	762

	763 uint8_t pix_idx = g_mod_to_pix[best_mod_idx];

	764 uint32_t lsb = pix_idx & 0x1;

	765 uint32_t msb = pix_idx >> 1;

	766

	767 uint32_t pix_data = 0;

	768 for (unsigned int i = 0; i < 2; ++i) {

	769 for (unsigned int j = 0; j < 8; ++j) {

	770 // Obtain the texel number as specified in the standard.

	771 int texel_num = g_idx_to_num[i][j];

	772 pix_data \|= msb << (texel_num + 16);

	773 pix_data \|= lsb << (texel_num);

	774 }

	775 }

	776

	777 WritePixelData(dst, pix_data);

	778 }

	779

	780 } // namespace

	781

	782 namespace cc {

	783

	784 void TextureCompressorETC1SSE::Compress(const uint8_t* src,

	785 uint8_t* dst,

	786 int width,

	787 int height,

	788 Quality quality) {

	789 DCHECK(width >= 4 && (width & 3) == 0);

	790 DCHECK(height >= 4 && (height & 3) == 0);

	791

	792 ALIGNAS(16) uint8_t block[64];

	793 __m128i packed[4];

	794 __m128i red[4], green[4], blue[4], alpha[4];

	795 __sse_data data;

	796

	797 for (int y = 0; y < height; y += 4, src += width * 4 * 4) {

	798 for (int x = 0; x < width; x += 4, dst += 8) {

	799 ExtractBlock(block, src + x * 4, width);

	800 if (TransposeBlock(block, packed) == false) {

	801 CompressSolid(dst, block);

	802 } else {

	803 UnpackBlock(packed, blue, green, red, alpha);

	804

	805 data.block = block;

	806 data.packed = packed;

	807 data.red = red;

	808 data.blue = blue;

	809 data.green = green;

	810

	811 CompressBlock(dst, &data);

	812 }

	813 }

	814 }

	815 }

	816

	817 } // namespace cc

OLD	NEW