cc/resources/texture_compressor_etc1_sse.cc - Issue 1096703002: Reland: Add ETC1 powered SSE encoder for tile texture compression

Side by Side Diff: cc/resources/texture_compressor_etc1_sse.cc

Issue 1096703002: Reland: Add ETC1 powered SSE encoder for tile texture compression (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 5 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« cc/resources/texture_compressor_etc1.cc ('K') | « cc/resources/texture_compressor_etc1_sse.h ('k') | cc/resources/texture_compressor_etc1_unittest.cc » ('j') | cc/resources/texture_compressor_perftest.cc » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
(Empty)
	1 // Copyright 2015 The Chromium Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4

	5 #include "cc/resources/texture_compressor_etc1_sse.h"

	6

	7 #include <assert.h>

	8 #include <emmintrin.h>

	9 #include <cmath>

	10 #include <limits>

	11

	12 #include "base/compiler_specific.h"

	13 #include "base/logging.h"

	14 // using this header for common functions such as Color handling

	15 // and codeword table

	16 #include "cc/resources/texture_compressor_etc1.h"

	17

	18 namespace {

	19

	20 #define ETC1_SET_ERROR(x) (x + x / 2 + 384)
	reveman 2015/05/07 14:24:35 nit: inline function instead of macro
	radu.velea 2015/05/07 15:53:45 Done.
	21

	22 struct __sse_data {

	23 // raw data

	24 uint8_t* block;

	25 // 8 bit packed values

	26 __m128i* packed;

	27 // 32 bit zero extended values - 4x4 arrays

	28 __m128i* blue;

	29 __m128i* green;

	30 __m128i* red;

	31 };

	32

	33 // commonly used registers
	reveman 2015/05/07 14:24:35 nit: I don't feel too strongly about this but there's some inconsistency between comments in this patch. It's recommended to write comments as real sentences in chromium. First word capitalized and a period at the end. "// Commonly used registers." in this case. If you like to update all the comments in this patch to be more consistent in this regard then that's great but it's also fine with me to leave it as is. Up to you.
	radu.velea 2015/05/07 15:53:46 Done.
	34 static const __m128i __sse_zero = _mm_set1_epi32(0);

	35 static const __m128i __sse_max_int = _mm_set1_epi32(0x7FFFFFFF);

	36

	37 inline __m128i AddAndClamp(const __m128i x, const __m128i y) {

	38 static const __m128i color_max = _mm_set1_epi32(0xFF);

	39 return _mm_max_epi16(__sse_zero,

	40 _mm_min_epi16(_mm_add_epi16(x, y), color_max));

	41 }

	42

	43 inline __m128i GetColorErrorSSE(const __m128i x, const __m128i y) {

	44 // changed from _mm_mullo_epi32 (SSE4) to _mm_mullo_epi16 (SSE2)

	45 __m128i ret = _mm_sub_epi16(x, y);

	46 return _mm_mullo_epi16(ret, ret);

	47 }

	48

	49 inline __m128i AddChannelError(const __m128i x,

	50 const __m128i y,

	51 const __m128i z) {

	52 return _mm_add_epi32(x, _mm_add_epi32(y, z));

	53 }

	54

	55 inline uint32_t SumSSE(const __m128i x) {

	56 __m128i sum = _mm_add_epi32(x, _mm_shuffle_epi32(x, 0x4E));

	57 sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1));

	58

	59 return _mm_cvtsi128_si32(sum);

	60 }

	61

	62 inline uint32_t GetVerticalError(const __sse_data* data,

	63 const __m128i* blue_avg,

	64 const __m128i* green_avg,

	65 const __m128i* red_avg,

	66 uint32_t* verror) {

	67 __m128i error = __sse_zero;

	68

	69 for (int i = 0; i < 4; i++) {

	70 error = _mm_add_epi32(error, GetColorErrorSSE(data->blue[i], blue_avg[0]));

	71 error =

	72 _mm_add_epi32(error, GetColorErrorSSE(data->green[i], green_avg[0]));

	73 error = _mm_add_epi32(error, GetColorErrorSSE(data->red[i], red_avg[0]));

	74 }

	75

	76 error = _mm_add_epi32(error, _mm_shuffle_epi32(error, 0x4E));

	77

	78 verror[0] = _mm_cvtsi128_si32(error);

	79 verror[1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(error, 0xB1));

	80

	81 return verror[0] + verror[1];

	82 }

	83

	84 inline uint32_t GetHorizontalError(const __sse_data* data,

	85 const __m128i* blue_avg,

	86 const __m128i* green_avg,

	87 const __m128i* red_avg,

	88 uint32_t* verror) {

	89 __m128i error = __sse_zero;

	90 int first_index, second_index;

	91

	92 for (int i = 0; i < 2; i++) {

	93 first_index = 2 * i;

	94 second_index = first_index + 1;

	95

	96 error = _mm_add_epi32(

	97 error, GetColorErrorSSE(data->blue[first_index], blue_avg[i]));

	98 error = _mm_add_epi32(

	99 error, GetColorErrorSSE(data->blue[second_index], blue_avg[i]));

	100 error = _mm_add_epi32(

	101 error, GetColorErrorSSE(data->green[first_index], green_avg[i]));

	102 error = _mm_add_epi32(

	103 error, GetColorErrorSSE(data->green[second_index], green_avg[i]));

	104 error = _mm_add_epi32(error,

	105 GetColorErrorSSE(data->red[first_index], red_avg[i]));

	106 error = _mm_add_epi32(

	107 error, GetColorErrorSSE(data->red[second_index], red_avg[i]));

	108 }

	109

	110 error = _mm_add_epi32(error, _mm_shuffle_epi32(error, 0x4E));

	111

	112 verror[0] = _mm_cvtsi128_si32(error);

	113 verror[1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(error, 0xB1));

	114

	115 return verror[0] + verror[1];

	116 }

	117

	118 inline void GetAvgColors(const __sse_data* data,

	119 float* output,

	120 bool* __sse_use_diff) {

	121 __m128i sum[2], tmp;

	122

	123 // TODO(radu.velea): _mm_avg_epu8 on packed data maybe

	124

	125 // get avg red

	126 // [S0 S0 S1 S1]

	127 sum[0] = _mm_add_epi32(data->red[0], data->red[1]);

	128 sum[0] = _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0xB1));

	129

	130 // [S2 S2 S3 S3]

	131 sum[1] = _mm_add_epi32(data->red[2], data->red[3]);

	132 sum[1] = _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0xB1));

	133

	134 float hred[2], vred[2];

	135 hred[0] = (_mm_cvtsi128_si32(

	136 _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0x4E)))) /

	137 8.0f;

	138 hred[1] = (_mm_cvtsi128_si32(

	139 _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0x4E)))) /

	140 8.0f;

	141

	142 tmp = _mm_add_epi32(sum[0], sum[1]);

	143 vred[0] = (_mm_cvtsi128_si32(tmp)) / 8.0f;

	144 vred[1] = (_mm_cvtsi128_si32(_mm_shuffle_epi32(tmp, 0x2))) / 8.0f;

	145

	146 // get avg green

	147 // [S0 S0 S1 S1]

	148 sum[0] = _mm_add_epi32(data->green[0], data->green[1]);

	149 sum[0] = _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0xB1));

	150

	151 // [S2 S2 S3 S3]

	152 sum[1] = _mm_add_epi32(data->green[2], data->green[3]);

	153 sum[1] = _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0xB1));

	154

	155 float hgreen[2], vgreen[2];

	156 hgreen[0] = (_mm_cvtsi128_si32(

	157 _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0x4E)))) /

	158 8.0f;

	159 hgreen[1] = (_mm_cvtsi128_si32(

	160 _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0x4E)))) /

	161 8.0f;

	162

	163 tmp = _mm_add_epi32(sum[0], sum[1]);

	164 vgreen[0] = (_mm_cvtsi128_si32(tmp)) / 8.0f;

	165 vgreen[1] = (_mm_cvtsi128_si32(_mm_shuffle_epi32(tmp, 0x2))) / 8.0f;

	166

	167 // get avg blue

	168 // [S0 S0 S1 S1]

	169 sum[0] = _mm_add_epi32(data->blue[0], data->blue[1]);

	170 sum[0] = _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0xB1));

	171

	172 // [S2 S2 S3 S3]

	173 sum[1] = _mm_add_epi32(data->blue[2], data->blue[3]);

	174 sum[1] = _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0xB1));

	175

	176 float hblue[2], vblue[2];

	177 hblue[0] = (_mm_cvtsi128_si32(

	178 _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0x4E)))) /

	179 8.0f;

	180 hblue[1] = (_mm_cvtsi128_si32(

	181 _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0x4E)))) /

	182 8.0f;

	183

	184 tmp = _mm_add_epi32(sum[0], sum[1]);

	185 vblue[0] = (_mm_cvtsi128_si32(tmp)) / 8.0f;

	186 vblue[1] = (_mm_cvtsi128_si32(_mm_shuffle_epi32(tmp, 0x2))) / 8.0f;

	187

	188 // TODO(radu.velea): return int's instead of floats, based on Quality

	189 output[0] = vblue[0];

	190 output[1] = vgreen[0];

	191 output[2] = vred[0];

	192

	193 output[3] = vblue[1];

	194 output[4] = vgreen[1];

	195 output[5] = vred[1];

	196

	197 output[6] = hblue[0];

	198 output[7] = hgreen[0];

	199 output[8] = hred[0];

	200

	201 output[9] = hblue[1];

	202 output[10] = hgreen[1];

	203 output[11] = hred[1];

	204

	205 __m128i threshold_upper = _mm_set1_epi32(3);

	206 __m128i threshold_lower = _mm_set1_epi32(-4);

	207

	208 __m128 factor_v = _mm_set1_ps(31.0f / 255.0f);

	209 __m128 rounding_v = _mm_set1_ps(0.5f);

	210 __m128 h_avg_0 = _mm_set_ps(hblue[0], hgreen[0], hred[0], 0);

	211 __m128 h_avg_1 = _mm_set_ps(hblue[1], hgreen[1], hred[1], 0);

	212

	213 __m128 v_avg_0 = _mm_set_ps(vblue[0], vgreen[0], vred[0], 0);

	214 __m128 v_avg_1 = _mm_set_ps(vblue[1], vgreen[1], vred[1], 0);

	215

	216 h_avg_0 = _mm_mul_ps(h_avg_0, factor_v);

	217 h_avg_1 = _mm_mul_ps(h_avg_1, factor_v);

	218 v_avg_0 = _mm_mul_ps(v_avg_0, factor_v);

	219 v_avg_1 = _mm_mul_ps(v_avg_1, factor_v);

	220

	221 h_avg_0 = _mm_add_ps(h_avg_0, rounding_v);

	222 h_avg_1 = _mm_add_ps(h_avg_1, rounding_v);

	223 v_avg_0 = _mm_add_ps(v_avg_0, rounding_v);

	224 v_avg_1 = _mm_add_ps(v_avg_1, rounding_v);

	225

	226 __m128i h_avg_0i = _mm_cvttps_epi32(h_avg_0);

	227 __m128i h_avg_1i = _mm_cvttps_epi32(h_avg_1);

	228

	229 __m128i v_avg_0i = _mm_cvttps_epi32(v_avg_0);

	230 __m128i v_avg_1i = _mm_cvttps_epi32(v_avg_1);

	231

	232 h_avg_0i = _mm_sub_epi32(h_avg_1i, h_avg_0i);

	233 v_avg_0i = _mm_sub_epi32(v_avg_1i, v_avg_0i);

	234

	235 __sse_use_diff[0] =

	236 (0 == _mm_movemask_epi8(_mm_cmplt_epi32(v_avg_0i, threshold_lower)));

	237 __sse_use_diff[0] &=

	238 (0 == _mm_movemask_epi8(_mm_cmpgt_epi32(v_avg_0i, threshold_upper)));

	239

	240 __sse_use_diff[1] =

	241 (0 == _mm_movemask_epi8(_mm_cmplt_epi32(h_avg_0i, threshold_lower)));

	242 __sse_use_diff[1] &=

	243 (0 == _mm_movemask_epi8(_mm_cmpgt_epi32(h_avg_0i, threshold_upper)));

	244 }

	245

	246 void ComputeLuminance(uint8_t* block,

	247 const cc::Color& base,

	248 const int sub_block_id,

	249 const uint8_t* idx_to_num_tab,

	250 const __sse_data* data,

	251 const uint32_t expected_error) {

	252 uint8_t best_tbl_idx = 0;

	253 uint32_t best_error = 0x7FFFFFFF;

	254 uint8_t best_mod_idx[8][8]; // [table][texel]

	255

	256 const __m128i base_blue = _mm_set1_epi32(base.channels.b);

	257 const __m128i base_green = _mm_set1_epi32(base.channels.g);

	258 const __m128i base_red = _mm_set1_epi32(base.channels.r);

	259

	260 __m128i test_red, test_blue, test_green, tmp, tmp_blue, tmp_green, tmp_red;

	261 __m128i block_error, mask;

	262

	263 // this will have the minimum errors for each 4 pixels

	264 __m128i first_half_min;

	265 __m128i second_half_min;

	266

	267 // this will have the matching table index combo for each 4 pixels

	268 __m128i first_half_pattern;

	269 __m128i second_half_pattern;

	270

	271 const __m128i first_blue_data_block = data->blue[2 * sub_block_id];

	272 const __m128i first_green_data_block = data->green[2 * sub_block_id];

	273 const __m128i first_red_data_block = data->red[2 * sub_block_id];

	274

	275 const __m128i second_blue_data_block = data->blue[2 * sub_block_id + 1];

	276 const __m128i second_green_data_block = data->green[2 * sub_block_id + 1];

	277 const __m128i second_red_data_block = data->red[2 * sub_block_id + 1];

	278

	279 uint32_t min;

	280 // fail early to increase speed

	281 long delta = INT32_MAX;

	282 uint32_t last_min = INT32_MAX;

	283

	284 const uint8_t shuffle_mask[] = {

	285 0x1B, 0x4E, 0xB1, 0xE4}; // important they are sorted ascending

	286

	287 for (unsigned int tbl_idx = 0; tbl_idx < 8; ++tbl_idx) {

	288 tmp = _mm_set_epi32(

	289 cc::g_codeword_tables[tbl_idx][3], cc::g_codeword_tables[tbl_idx][2],

	290 cc::g_codeword_tables[tbl_idx][1], cc::g_codeword_tables[tbl_idx][0]);

	291

	292 test_blue = AddAndClamp(tmp, base_blue);

	293 test_green = AddAndClamp(tmp, base_green);

	294 test_red = AddAndClamp(tmp, base_red);

	295

	296 first_half_min = __sse_max_int;

	297 second_half_min = __sse_max_int;

	298

	299 first_half_pattern = __sse_zero;

	300 second_half_pattern = __sse_zero;

	301

	302 for (uint8_t imm8 : shuffle_mask) {

	303 switch (imm8) {

	304 case 0x1B:

	305 tmp_blue = _mm_shuffle_epi32(test_blue, 0x1B);

	306 tmp_green = _mm_shuffle_epi32(test_green, 0x1B);

	307 tmp_red = _mm_shuffle_epi32(test_red, 0x1B);

	308 break;

	309 case 0x4E:

	310 tmp_blue = _mm_shuffle_epi32(test_blue, 0x4E);

	311 tmp_green = _mm_shuffle_epi32(test_green, 0x4E);

	312 tmp_red = _mm_shuffle_epi32(test_red, 0x4E);

	313 break;

	314 case 0xB1:

	315 tmp_blue = _mm_shuffle_epi32(test_blue, 0xB1);

	316 tmp_green = _mm_shuffle_epi32(test_green, 0xB1);

	317 tmp_red = _mm_shuffle_epi32(test_red, 0xB1);

	318 break;

	319 case 0xE4:

	320 tmp_blue = _mm_shuffle_epi32(test_blue, 0xE4);

	321 tmp_green = _mm_shuffle_epi32(test_green, 0xE4);

	322 tmp_red = _mm_shuffle_epi32(test_red, 0xE4);

	323 break;

	324 default:

	325 tmp_blue = test_blue;

	326 tmp_green = test_green;

	327 tmp_red = test_red;

	328 }

	329

	330 tmp = _mm_set1_epi32(imm8);

	331

	332 block_error =

	333 AddChannelError(GetColorErrorSSE(tmp_blue, first_blue_data_block),

	334 GetColorErrorSSE(tmp_green, first_green_data_block),

	335 GetColorErrorSSE(tmp_red, first_red_data_block));

	336

	337 // save winning pattern

	338 first_half_pattern = _mm_max_epi16(

	339 first_half_pattern,

	340 _mm_and_si128(tmp, _mm_cmpgt_epi32(first_half_min, block_error)));

	341 // should use _mm_min_epi32(first_half_min, block_error); from SSE4

	342 // otherwise small performance penalty

	343 mask = _mm_cmplt_epi32(block_error, first_half_min);

	344 first_half_min = _mm_add_epi32(_mm_and_si128(mask, block_error),

	345 _mm_andnot_si128(mask, first_half_min));

	346

	347 // Second part of the block

	348 block_error =

	349 AddChannelError(GetColorErrorSSE(tmp_blue, second_blue_data_block),

	350 GetColorErrorSSE(tmp_green, second_green_data_block),

	351 GetColorErrorSSE(tmp_red, second_red_data_block));

	352

	353 // save winning pattern

	354 second_half_pattern = _mm_max_epi16(

	355 second_half_pattern,

	356 _mm_and_si128(tmp, _mm_cmpgt_epi32(second_half_min, block_error)));

	357 // should use _mm_min_epi32(second_half_min, block_error); from SSE4

	358 // otherwise performance penalty

	359 mask = _mm_cmplt_epi32(block_error, second_half_min);

	360 second_half_min = _mm_add_epi32(_mm_and_si128(mask, block_error),

	361 _mm_andnot_si128(mask, second_half_min));

	362 }

	363

	364 first_half_min = _mm_add_epi32(first_half_min, second_half_min);

	365 first_half_min =

	366 _mm_add_epi32(first_half_min, _mm_shuffle_epi32(first_half_min, 0x4E));

	367 first_half_min =

	368 _mm_add_epi32(first_half_min, _mm_shuffle_epi32(first_half_min, 0xB1));

	369

	370 min = _mm_cvtsi128_si32(first_half_min);

	371

	372 delta = min - last_min;

	373 last_min = min;

	374

	375 if (min < best_error) {

	376 best_tbl_idx = tbl_idx;

	377 best_error = min;

	378

	379 best_mod_idx[tbl_idx][0] =

	380 (_mm_cvtsi128_si32(first_half_pattern) >> (0)) & 3;

	381 best_mod_idx[tbl_idx][4] =

	382 (_mm_cvtsi128_si32(second_half_pattern) >> (0)) & 3;

	383

	384 best_mod_idx[tbl_idx][1] =

	385 (_mm_cvtsi128_si32(_mm_shuffle_epi32(first_half_pattern, 0x1)) >>

	386 (2)) &

	387 3;

	388 best_mod_idx[tbl_idx][5] =

	389 (_mm_cvtsi128_si32(_mm_shuffle_epi32(second_half_pattern, 0x1)) >>

	390 (2)) &

	391 3;

	392

	393 best_mod_idx[tbl_idx][2] =

	394 (_mm_cvtsi128_si32(_mm_shuffle_epi32(first_half_pattern, 0x2)) >>

	395 (4)) &

	396 3;

	397 best_mod_idx[tbl_idx][6] =

	398 (_mm_cvtsi128_si32(_mm_shuffle_epi32(second_half_pattern, 0x2)) >>

	399 (4)) &

	400 3;

	401

	402 best_mod_idx[tbl_idx][3] =

	403 (_mm_cvtsi128_si32(_mm_shuffle_epi32(first_half_pattern, 0x3)) >>

	404 (6)) &

	405 3;

	406 best_mod_idx[tbl_idx][7] =

	407 (_mm_cvtsi128_si32(_mm_shuffle_epi32(second_half_pattern, 0x3)) >>

	408 (6)) &

	409 3;

	410

	411 if (best_error == 0) {

	412 break;

	413 }

	414 } else if (delta > 0 && expected_error < min) {

	415 // error is growing and is well beyond expected error

	416 break;

	417 }

	418 }

	419

	420 cc::WriteCodewordTable(block, sub_block_id, best_tbl_idx);

	421

	422 uint32_t pix_data = 0;

	423 uint8_t mod_idx;

	424 uint8_t pix_idx;

	425 uint32_t lsb;

	426 uint32_t msb;

	427 int texel_num;

	428

	429 for (unsigned int i = 0; i < 8; ++i) {

	430 mod_idx = best_mod_idx[best_tbl_idx][i];

	431 pix_idx = cc::g_mod_to_pix[mod_idx];

	432

	433 lsb = pix_idx & 0x1;

	434 msb = pix_idx >> 1;

	435

	436 // Obtain the texel number as specified in the standard.

	437 texel_num = idx_to_num_tab[i];

	438 pix_data |= msb << (texel_num + 16);

	439 pix_data |= lsb << (texel_num);

	440 }

	441

	442 cc::WritePixelData(block, pix_data);

	443 }

	444

	445 void CompressBlock(uint8_t* dst, __sse_data* data) {

	446 // first 3 vertical 1, second 3 vertical 2, third 3 horizontal 1, last 3

	447 // horizontal 2

	448 float __sse_avg_colors[12] = {

	449 0,

	450 };

	451 bool use_differential[2] = {true, true};

	452 GetAvgColors(data, __sse_avg_colors, use_differential);

	453 cc::Color sub_block_avg[4];

	454

	455 // TODO(radu.velea): remove floating point operations and use only int's +

	456 // normal rounding and shifts for reduced Quality

	457 for (int i = 0, j = 1; i < 4; i += 2, j += 2) {

	458 if (use_differential[i / 2] == false) {

	459 sub_block_avg[i] = cc::MakeColor444(&__sse_avg_colors[i * 3]);

	460 sub_block_avg[j] = cc::MakeColor444(&__sse_avg_colors[j * 3]);

	461 } else {

	462 sub_block_avg[i] = cc::MakeColor555(&__sse_avg_colors[i * 3]);

	463 sub_block_avg[j] = cc::MakeColor555(&__sse_avg_colors[j * 3]);

	464 }

	465 }

	466

	467 __m128i red_avg[2], green_avg[2], blue_avg[2];

	468

	469 // TODO(radu.velea): perfect accuracy, maybe skip floating variables

	470 blue_avg[0] =

	471 _mm_set_epi32((int)__sse_avg_colors[3], (int)__sse_avg_colors[3],

	472 (int)__sse_avg_colors[0], (int)__sse_avg_colors[0]);
	reveman 2015/05/07 14:24:35 nit: please avoid c-style casts. static_cast<int>() instead
	radu.velea 2015/05/07 15:53:45 Done.
	473

	474 green_avg[0] =

	475 _mm_set_epi32((int)__sse_avg_colors[4], (int)__sse_avg_colors[4],

	476 (int)__sse_avg_colors[1], (int)__sse_avg_colors[1]);
	reveman 2015/05/07 14:24:35 nit: please avoid c-style casts. static_cast<int>() instead
	radu.velea 2015/05/07 15:53:46 Done.
	477

	478 red_avg[0] =

	479 _mm_set_epi32((int)__sse_avg_colors[5], (int)__sse_avg_colors[5],

	480 (int)__sse_avg_colors[2], (int)__sse_avg_colors[2]);
	reveman 2015/05/07 14:24:35 nit: please avoid c-style casts. static_cast<int>() instead
	radu.velea 2015/05/07 15:53:45 Done.
	481

	482 uint32_t vertical_error[2];

	483 GetVerticalError(data, blue_avg, green_avg, red_avg, vertical_error);

	484

	485 // TODO(radu.velea): perfect accuracy, maybe skip floating variables

	486 blue_avg[0] = _mm_set1_epi32((int)__sse_avg_colors[6]);

	487 blue_avg[1] = _mm_set1_epi32((int)__sse_avg_colors[9]);
	reveman 2015/05/07 14:24:35 nit: please avoid c-style casts. static_cast<int>() instead
	radu.velea 2015/05/07 15:53:45 Done.
	488

	489 green_avg[0] = _mm_set1_epi32((int)__sse_avg_colors[7]);

	490 green_avg[1] = _mm_set1_epi32((int)__sse_avg_colors[10]);
	reveman 2015/05/07 14:24:35 nit: please avoid c-style casts. static_cast<int>() instead
	radu.velea 2015/05/07 15:53:45 Done.
	491

	492 red_avg[0] = _mm_set1_epi32((int)__sse_avg_colors[8]);

	493 red_avg[1] = _mm_set1_epi32((int)__sse_avg_colors[11]);
	reveman 2015/05/07 14:24:35 nit: please avoid c-style casts. static_cast<int>() instead
	radu.velea 2015/05/07 15:53:45 Done.
	494

	495 uint32_t horizontal_error[2];

	496 GetHorizontalError(data, blue_avg, green_avg, red_avg, horizontal_error);

	497

	498 bool flip = (horizontal_error[0] + horizontal_error[1]) <

	499 (vertical_error[0] + vertical_error[1]);

	500 uint32_t* expected_errors = flip == true ? horizontal_error : vertical_error;
	reveman 2015/05/07 14:24:35 nit: s/flip == true/flip/
	radu.velea 2015/05/07 15:53:45 Done.
	501

	502 // Clear destination buffer so that we can "or" in the results.

	503 memset(dst, 0, 8);

	504

	505 cc::WriteDiff(dst, use_differential[!!flip]);

	506 cc::WriteFlip(dst, flip);

	507

	508 uint8_t sub_block_off_0 = flip ? 2 : 0;

	509 uint8_t sub_block_off_1 = sub_block_off_0 + 1;

	510

	511 if (use_differential[!!flip]) {

	512 cc::WriteColors555(dst, sub_block_avg[sub_block_off_0],

	513 sub_block_avg[sub_block_off_1]);

	514 } else {

	515 cc::WriteColors444(dst, sub_block_avg[sub_block_off_0],

	516 sub_block_avg[sub_block_off_1]);

	517 }

	518

	519 if (flip == false) {
	reveman 2015/05/07 14:24:35 nit: if (!flip)
	radu.velea 2015/05/07 15:53:45 Done.
	520 // transpose vertical data into horizontal lines

	521 __m128i tmp;

	522 for (int i = 0; i < 4; i += 2) {

	523 tmp = data->blue[i];

	524 data->blue[i] = _mm_add_epi32(

	525 _mm_move_epi64(data->blue[i]),

	526 _mm_shuffle_epi32(_mm_move_epi64(data->blue[i + 1]), 0x4E));

	527 data->blue[i + 1] = _mm_add_epi32(

	528 _mm_move_epi64(_mm_shuffle_epi32(tmp, 0x4E)),

	529 _mm_shuffle_epi32(

	530 _mm_move_epi64(_mm_shuffle_epi32(data->blue[i + 1], 0x4E)),

	531 0x4E));

	532

	533 tmp = data->green[i];

	534 data->green[i] = _mm_add_epi32(

	535 _mm_move_epi64(data->green[i]),

	536 _mm_shuffle_epi32(_mm_move_epi64(data->green[i + 1]), 0x4E));

	537 data->green[i + 1] = _mm_add_epi32(

	538 _mm_move_epi64(_mm_shuffle_epi32(tmp, 0x4E)),

	539 _mm_shuffle_epi32(

	540 _mm_move_epi64(_mm_shuffle_epi32(data->green[i + 1], 0x4E)),

	541 0x4E));

	542

	543 tmp = data->red[i];

	544 data->red[i] = _mm_add_epi32(

	545 _mm_move_epi64(data->red[i]),

	546 _mm_shuffle_epi32(_mm_move_epi64(data->red[i + 1]), 0x4E));

	547 data->red[i + 1] = _mm_add_epi32(

	548 _mm_move_epi64(_mm_shuffle_epi32(tmp, 0x4E)),

	549 _mm_shuffle_epi32(

	550 _mm_move_epi64(_mm_shuffle_epi32(data->red[i + 1], 0x4E)), 0x4E));

	551 }

	552

	553 tmp = data->blue[1];

	554 data->blue[1] = data->blue[2];

	555 data->blue[2] = tmp;

	556

	557 tmp = data->green[1];

	558 data->green[1] = data->green[2];

	559 data->green[2] = tmp;

	560

	561 tmp = data->red[1];

	562 data->red[1] = data->red[2];

	563 data->red[2] = tmp;

	564 }

	565

	566 // Compute luminance for the first sub block.

	567 ComputeLuminance(dst, sub_block_avg[sub_block_off_0], 0,

	568 cc::g_idx_to_num[sub_block_off_0], data,

	569 ETC1_SET_ERROR(expected_errors[0]));

	570 // Compute luminance for the second sub block.

	571 ComputeLuminance(dst, sub_block_avg[sub_block_off_1], 1,

	572 cc::g_idx_to_num[sub_block_off_1], data,

	573 ETC1_SET_ERROR(expected_errors[1]));

	574 }

	575

	576 static void ExtractBlock(uint8_t* dst, const uint8_t* src, int width) {

	577 for (int j = 0; j < 4; ++j) {

	578 memcpy(&dst[j * 4 * 4], src, 4 * 4);

	579 src += width * 4;

	580 }

	581 }

	582

	583 inline bool TransposeBlock(uint8_t* block, __m128i* transposed) {

	584 // This function transforms an incoming block of RGBA or GBRA pixels into 4

	585 // registers, each containing the data corresponding for a single channel.

	586 // Ex: transposed[0] will have all the R values for a RGBA block,

	587 // transposed[1] will have G, etc.

	588 // The values are packed as 8 bit unsigned values in the SSE registers.

	589

	590 // Before doing any work we check if the block is solid.

	591 __m128i tmp3, tmp2, tmp1, tmp0;

	592 __m128i test_solid = _mm_set1_epi32(*((uint32_t*)block));

	593 uint16_t mask = 0xFFFF;

	594

	595 // a0,a1,a2,...a7, ...a15

	596 transposed[0] = _mm_loadu_si128((__m128i*)(block));

	597 // b0, b1,b2,...b7.... b15

	598 transposed[1] = _mm_loadu_si128((__m128i*)(block + 16));

	599 // c0, c1,c2,...c7....c15

	600 transposed[2] = _mm_loadu_si128((__m128i*)(block + 32));

	601 // d0,d1,d2,...d7....d15

	602 transposed[3] = _mm_loadu_si128((__m128i*)(block + 48));

	603

	604 for (int i = 0; i < 4; i++) {

	605 mask &= _mm_movemask_epi8(_mm_cmpeq_epi8(transposed[i], test_solid));

	606 }

	607

	608 if (mask == 0xFFFF) {

	609 return false; // block is solid, no need to do any more work

	610 }

	611

	612 // a0,b0, a1,b1, a2,b2, a3,b3,....a7,b7

	613 tmp0 = _mm_unpacklo_epi8(transposed[0], transposed[1]);

	614 // c0,d0, c1,d1, c2,d2, c3,d3,... c7,d7

	615 tmp1 = _mm_unpacklo_epi8(transposed[2], transposed[3]);

	616 // a8,b8, a9,b9, a10,b10, a11,b11,...a15,b15

	617 tmp2 = _mm_unpackhi_epi8(transposed[0], transposed[1]);

	618 // c8,d8, c9,d9, c10,d10, c11,d11,...c15,d15

	619 tmp3 = _mm_unpackhi_epi8(transposed[2], transposed[3]);

	620

	621 // a0,a8, b0,b8, a1,a9, b1,b9, ....a3,a11, b3,b11

	622 transposed[0] = _mm_unpacklo_epi8(tmp0, tmp2);

	623 // a4,a12, b4,b12, a5,a13, b5,b13,....a7,a15,b7,b15

	624 transposed[1] = _mm_unpackhi_epi8(tmp0, tmp2);

	625 // c0,c8, d0,d8, c1,c9, d1,d9.....d3,d11

	626 transposed[2] = _mm_unpacklo_epi8(tmp1, tmp3);

	627 // c4,c12,d4,d12, c5,c13, d5,d13,....d7,d15

	628 transposed[3] = _mm_unpackhi_epi8(tmp1, tmp3);

	629

	630 // a0,a8, b0,b8, c0,c8, d0,d8, a1,a9, b1,b9, c1,c9, d1,d9

	631 tmp0 = _mm_unpacklo_epi32(transposed[0], transposed[2]);

	632 // a2,a10, b2,b10, c2,c10, d2,d10, a3,a11, b3,b11, c3,c11, d3,d11

	633 tmp1 = _mm_unpackhi_epi32(transposed[0], transposed[2]);

	634 // a4,a12, b4,b12, c4,c12, d4,d12, a5,a13, b5,b13, c5,c13, d5,d13

	635 tmp2 = _mm_unpacklo_epi32(transposed[1], transposed[3]);

	636 // a6,a14, b6,b14, c6,c14, d6,d14, a7,a15, b7,b15, c7,c15, d7,d15

	637 tmp3 = _mm_unpackhi_epi32(transposed[1], transposed[3]);

	638

	639 // a0,a4, a8,a12, b0,b4, b8,b12, c0,c4, c8,c12, d0,d4, d8,d12

	640 transposed[0] = _mm_unpacklo_epi8(tmp0, tmp2);

	641 // a1,a5, a9,a13, b1,b5, b9,b13, c1,c5, c9,c13, d1,d5, d9,d13

	642 transposed[1] = _mm_unpackhi_epi8(tmp0, tmp2);

	643 // a2,a6, a10,a14, b2,b6, b10,b14, c2,c6, c10,c14, d2,d6, d10,d14

	644 transposed[2] = _mm_unpacklo_epi8(tmp1, tmp3);

	645 // a3,a7, a11,a15, b3,b7, b11,b15, c3,c7, c11,c15, d3,d7, d11,d15

	646 transposed[3] = _mm_unpackhi_epi8(tmp1, tmp3);

	647

	648 return true;

	649 }

	650

	651 inline void UnpackBlock(__m128i* packed,

	652 __m128i* red,

	653 __m128i* green,

	654 __m128i* blue,

	655 __m128i* alpha) {

	656 const __m128i zero = _mm_set1_epi8(0);

	657 __m128i tmp_low, tmp_high;

	658

	659 // unpack red

	660 tmp_low = _mm_unpacklo_epi8(packed[0], zero);

	661 tmp_high = _mm_unpackhi_epi8(packed[0], zero);

	662

	663 red[0] = _mm_unpacklo_epi16(tmp_low, zero);

	664 red[1] = _mm_unpackhi_epi16(tmp_low, zero);

	665

	666 red[2] = _mm_unpacklo_epi16(tmp_high, zero);

	667 red[3] = _mm_unpackhi_epi16(tmp_high, zero);

	668

	669 // unpack green

	670 tmp_low = _mm_unpacklo_epi8(packed[1], zero);

	671 tmp_high = _mm_unpackhi_epi8(packed[1], zero);

	672

	673 green[0] = _mm_unpacklo_epi16(tmp_low, zero);

	674 green[1] = _mm_unpackhi_epi16(tmp_low, zero);

	675

	676 green[2] = _mm_unpacklo_epi16(tmp_high, zero);

	677 green[3] = _mm_unpackhi_epi16(tmp_high, zero);

	678

	679 // unpack blue

	680 tmp_low = _mm_unpacklo_epi8(packed[2], zero);

	681 tmp_high = _mm_unpackhi_epi8(packed[2], zero);

	682

	683 blue[0] = _mm_unpacklo_epi16(tmp_low, zero);

	684 blue[1] = _mm_unpackhi_epi16(tmp_low, zero);

	685

	686 blue[2] = _mm_unpacklo_epi16(tmp_high, zero);

	687 blue[3] = _mm_unpackhi_epi16(tmp_high, zero);

	688

	689 // unpack alpha - unused for ETC1

	690 tmp_low = _mm_unpacklo_epi8(packed[3], zero);

	691 tmp_high = _mm_unpackhi_epi8(packed[3], zero);

	692

	693 alpha[0] = _mm_unpacklo_epi16(tmp_low, zero);

	694 alpha[1] = _mm_unpackhi_epi16(tmp_low, zero);

	695

	696 alpha[2] = _mm_unpacklo_epi16(tmp_high, zero);

	697 alpha[3] = _mm_unpackhi_epi16(tmp_high, zero);

	698 }

	699

	700 inline void CompressSolid(uint8_t* dst, uint8_t* block) {

	701 // Clear destination buffer so that we can "or" in the results.

	702 memset(dst, 0, 8);

	703

	704 const float src_color_float[3] = {static_cast<float>(block[0]),

	705 static_cast<float>(block[1]),

	706 static_cast<float>(block[2])};

	707 const cc::Color base = cc::MakeColor555(src_color_float);

	708 const __m128i base_v =

	709 _mm_set_epi32(0, base.channels.r, base.channels.g, base.channels.b);

	710

	711 const __m128i constant = _mm_set_epi32(0, block[2], block[1], block[0]);

	712 __m128i lum;

	713 __m128i colors[4];

	714 static const __m128i rgb =

	715 _mm_set_epi32(0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);

	716

	717 cc::WriteDiff(dst, true);

	718 cc::WriteFlip(dst, false);

	719

	720 cc::WriteColors555(dst, base, base);

	721

	722 uint8_t best_tbl_idx = 0;

	723 uint8_t best_mod_idx = 0;

	724 uint32_t best_mod_err = INT32_MAX;

	725

	726 for (unsigned int tbl_idx = 0; tbl_idx < 8; ++tbl_idx) {

	727 lum = _mm_set_epi32(

	728 cc::g_codeword_tables[tbl_idx][3], cc::g_codeword_tables[tbl_idx][2],

	729 cc::g_codeword_tables[tbl_idx][1], cc::g_codeword_tables[tbl_idx][0]);

	730 colors[0] = AddAndClamp(base_v, _mm_shuffle_epi32(lum, 0x0));

	731 colors[1] = AddAndClamp(base_v, _mm_shuffle_epi32(lum, 0x55));

	732 colors[2] = AddAndClamp(base_v, _mm_shuffle_epi32(lum, 0xAA));

	733 colors[3] = AddAndClamp(base_v, _mm_shuffle_epi32(lum, 0xFF));

	734

	735 for (int i = 0; i < 4; i++) {

	736 uint32_t mod_err =

	737 SumSSE(GetColorErrorSSE(constant, _mm_and_si128(colors[i], rgb)));

	738 colors[i] = _mm_and_si128(colors[i], rgb);

	739 if (mod_err < best_mod_err) {

	740 best_tbl_idx = tbl_idx;

	741 best_mod_idx = i;

	742 best_mod_err = mod_err;

	743

	744 if (mod_err == 0) {

	745 break; // We cannot do any better than this.

	746 }

	747 }

	748 }

	749 }

	750

	751 cc::WriteCodewordTable(dst, 0, best_tbl_idx);

	752 cc::WriteCodewordTable(dst, 1, best_tbl_idx);

	753

	754 uint8_t pix_idx = cc::g_mod_to_pix[best_mod_idx];

	755 uint32_t lsb = pix_idx & 0x1;

	756 uint32_t msb = pix_idx >> 1;

	757

	758 uint32_t pix_data = 0;

	759 for (unsigned int i = 0; i < 2; ++i) {

	760 for (unsigned int j = 0; j < 8; ++j) {

	761 // Obtain the texel number as specified in the standard.

	762 int texel_num = cc::g_idx_to_num[i][j];

	763 pix_data |= msb << (texel_num + 16);

	764 pix_data |= lsb << (texel_num);

	765 }

	766 }

	767

	768 cc::WritePixelData(dst, pix_data);

	769 }

	770

	771 } // namespace

	772

	773 namespace cc {
	reveman 2015/05/07 14:24:35 nit: please move this up to line 17 just before "namespace {" and remove the cc:: prefix in the code above
	radu.velea 2015/05/07 15:53:45 Done.
	774

	775 void TextureCompressorETC1SSE::Compress(const uint8_t* src,

	776 uint8_t* dst,

	777 int width,

	778 int height,

	779 Quality quality) {

	780 DCHECK_GE(width, 4);

	781 DCHECK_EQ((width & 3), 0);

	782 DCHECK_GE(height, 4);

	783 DCHECK_EQ((height & 3), 0);

	784

	785 ALIGNAS(16) uint8_t block[64];

	786 __m128i packed[4];

	787 __m128i red[4], green[4], blue[4], alpha[4];

	788 __sse_data data;

	789

	790 for (int y = 0; y < height; y += 4, src += width * 4 * 4) {

	791 for (int x = 0; x < width; x += 4, dst += 8) {

	792 ExtractBlock(block, src + x * 4, width);

	793 if (TransposeBlock(block, packed) == false) {

	794 CompressSolid(dst, block);

	795 } else {

	796 UnpackBlock(packed, blue, green, red, alpha);

	797

	798 data.block = block;

	799 data.packed = packed;

	800 data.red = red;

	801 data.blue = blue;

	802 data.green = green;

	803

	804 CompressBlock(dst, &data);

	805 }

	806 }

	807 }

	808 }

	809

	810 } // namespace cc

OLD	NEW