cc/resources/texture_compressor_etc1_sse.cc - Issue 1096703002: Reland: Add ETC1 powered SSE encoder for tile texture compression

Side by Side Diff: cc/resources/texture_compressor_etc1_sse.cc

Issue 1096703002: Reland: Add ETC1 powered SSE encoder for tile texture compression (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Applying feedback Created 5 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« cc/resources/texture_compressor_etc1.h ('K') | « cc/resources/texture_compressor_etc1_sse.h ('k') | cc/resources/texture_compressor_etc1_unittest.cc » ('j') | cc/resources/texture_compressor_etc1_unittest.cc » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
(Empty)
	1 // Copyright 2015 The Chromium Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4

	5 #include "cc/resources/texture_compressor_etc1_sse.h"

	6

	7 #include <assert.h>

	8 #include <emmintrin.h>

	9 #include <cmath>

	10 #include <limits>

	11

	12 #include "base/compiler_specific.h"

	13 #include "base/logging.h"

	14 /* using this header for common functions such as Color handling

	15 * and codeword table

	16 */
	reveman 2015/05/06 18:44:01 nit: comment style nit: comment style radu.velea 2015/05/07 11:21:40 Done. Show quoted text On 2015/05/06 18:44:01, reveman wrote: > nit: comment style Done.
	17 #include "cc/resources/texture_compressor_etc1.h"

	18

	19 using namespace cc;

	20

	21 namespace {

	22

	23 #define ETC1_SET_ERROR(x) (x + x / 2 + 384)

	24

	25 struct __sse_data {

	26 /* raw data */
	reveman 2015/05/06 18:44:02 nit: comment style nit: comment style radu.velea 2015/05/07 11:21:40 Done. Show quoted text On 2015/05/06 18:44:02, reveman wrote: > nit: comment style Done.
	27 uint8_t* block;

	28 /* 8 bit packed values */
	reveman 2015/05/06 18:44:01 nit: comment style nit: comment style radu.velea 2015/05/07 11:21:40 Done. Show quoted text On 2015/05/06 18:44:01, reveman wrote: > nit: comment style Done.
	29 __m128i* packed;

	30 /* 32 bit zero extended values - 4x4 arrays */
	reveman 2015/05/06 18:44:01 nit: comment style nit: comment style radu.velea 2015/05/07 11:21:39 Done. Show quoted text On 2015/05/06 18:44:01, reveman wrote: > nit: comment style Done.
	31 __m128i* blue;

	32 __m128i* green;

	33 __m128i* red;

	34 // __m128i *alpha;
	reveman 2015/05/06 18:44:01 nit: comment style should this even be here? nit: comment style should this even be here? radu.velea 2015/05/07 11:21:39 This would be used when adding ETC2: http://en.wik Show quoted text On 2015/05/06 18:44:01, reveman wrote: > nit: comment style > > should this even be here? This would be used when adding ETC2: http://en.wikipedia.org/wiki/Ericsson_Texture_Compression#ETC2_and_EAC But for now I guess it's not needed and I removed it.
	35 };

	36

	37 /* commonly used registers */
	reveman 2015/05/06 18:44:01 nit: comment style nit: comment style radu.velea 2015/05/07 11:21:39 Done. Show quoted text On 2015/05/06 18:44:01, reveman wrote: > nit: comment style Done.
	38 static const __m128i __sse_zero = _mm_set1_epi32(0);

	39 static const __m128i __sse_max_int = _mm_set1_epi32(0x7FFFFFFF);

	40

	41 inline __m128i AddAndClamp(const __m128i x, const __m128i y) {

	42 static const __m128i color_max = _mm_set1_epi32(0xFF);

	43 return _mm_max_epi16(__sse_zero,

	44 _mm_min_epi16(_mm_add_epi16(x, y), color_max));

	45 }

	46

	47 inline __m128i GetColorErrorSSE(const __m128i x, const __m128i y) {

	48 /* changed from _mm_mullo_epi32 to _mm_mullo_epi16 */
	reveman 2015/05/06 18:44:02 nit: comment style nit: comment style radu.velea 2015/05/07 11:21:39 Done. Show quoted text On 2015/05/06 18:44:02, reveman wrote: > nit: comment style Done.
	49 __m128i ret = _mm_sub_epi16(x, y);

	50 return _mm_mullo_epi16(ret, ret);

	51 }

	52

	53 inline __m128i AddChannelError(const __m128i x,

	54 const __m128i y,

	55 const __m128i z) {

	56 return _mm_add_epi32(x, _mm_add_epi32(y, z));

	57 }

	58

	59 inline uint32_t SumSSE(const __m128i x) {

	60 __m128i sum = _mm_add_epi32(x, _mm_shuffle_epi32(x, 0x4E));

	61 sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1));

	62

	63 return _mm_cvtsi128_si32(sum);

	64 }

	65

	66 inline uint32_t GetVerticalError(const __sse_data* data,

	67 const __m128i* blue_avg,

	68 const __m128i* green_avg,

	69 const __m128i* red_avg,

	70 uint32_t* verror) {

	71 __m128i error = __sse_zero;

	72

	73 for (int i = 0; i < 4; i++) {

	74 error = _mm_add_epi32(error, GetColorErrorSSE(data->blue[i], blue_avg[0]));

	75 error =

	76 _mm_add_epi32(error, GetColorErrorSSE(data->green[i], green_avg[0]));

	77 error = _mm_add_epi32(error, GetColorErrorSSE(data->red[i], red_avg[0]));

	78 }

	79

	80 error = _mm_add_epi32(error, _mm_shuffle_epi32(error, 0x4E));

	81

	82 verror[0] = _mm_cvtsi128_si32(error);

	83 verror[1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(error, 0xB1));

	84

	85 return verror[0] + verror[1];

	86 }

	87

	88 inline uint32_t GetHorizontalError(const __sse_data* data,

	89 const __m128i* blue_avg,

	90 const __m128i* green_avg,

	91 const __m128i* red_avg,

	92 uint32_t* verror) {

	93 __m128i error = __sse_zero;

	94 int first_index, second_index;

	95

	96 for (int i = 0; i < 2; i++) {

	97 first_index = 2 * i;

	98 second_index = first_index + 1;

	99

	100 error = _mm_add_epi32(

	101 error, GetColorErrorSSE(data->blue[first_index], blue_avg[i]));

	102 error = _mm_add_epi32(

	103 error, GetColorErrorSSE(data->blue[second_index], blue_avg[i]));

	104 error = _mm_add_epi32(

	105 error, GetColorErrorSSE(data->green[first_index], green_avg[i]));

	106 error = _mm_add_epi32(

	107 error, GetColorErrorSSE(data->green[second_index], green_avg[i]));

	108 error = _mm_add_epi32(error,

	109 GetColorErrorSSE(data->red[first_index], red_avg[i]));

	110 error = _mm_add_epi32(

	111 error, GetColorErrorSSE(data->red[second_index], red_avg[i]));

	112 }

	113

	114 error = _mm_add_epi32(error, _mm_shuffle_epi32(error, 0x4E));

	115

	116 verror[0] = _mm_cvtsi128_si32(error);

	117 verror[1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(error, 0xB1));

	118

	119 return verror[0] + verror[1];

	120 }

	121

	122 inline void GetAvgColors(const __sse_data* data,

	123 float* output,

	124 bool* __sse_use_diff) {

	125 __m128i sum[2], tmp;

	126

	127 // TODO(radu.velea): _mm_avg_epu8 on packed data maybe

	128

	129 /* get avg red */

	130 /* [S0 S0 S1 S1] */
	reveman 2015/05/06 18:44:01 nit: comment style nit: comment style radu.velea 2015/05/07 11:21:40 Done. Show quoted text On 2015/05/06 18:44:01, reveman wrote: > nit: comment style Done.
	131 sum[0] = _mm_add_epi32(data->red[0], data->red[1]);

	132 sum[0] = _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0xB1));

	133

	134 /* [S2 S2 S3 S3] */
	reveman 2015/05/06 18:44:01 nit: comment style nit: comment style radu.velea 2015/05/07 11:21:39 Done. Show quoted text On 2015/05/06 18:44:01, reveman wrote: > nit: comment style Done.
	135 sum[1] = _mm_add_epi32(data->red[2], data->red[3]);

	136 sum[1] = _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0xB1));

	137

	138 float hred[2], vred[2];

	139 hred[0] = (_mm_cvtsi128_si32(

	140 _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0x4E)))) /

	141 8.0f;

	142 hred[1] = (_mm_cvtsi128_si32(

	143 _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0x4E)))) /

	144 8.0f;

	145

	146 tmp = _mm_add_epi32(sum[0], sum[1]);

	147 vred[0] = (_mm_cvtsi128_si32(tmp)) / 8.0f;

	148 vred[1] = (_mm_cvtsi128_si32(_mm_shuffle_epi32(tmp, 0x2))) / 8.0f;

	149

	150 /* get avg green */

	151 /* [S0 S0 S1 S1] */
	reveman 2015/05/06 18:44:01 nit: comment style nit: comment style radu.velea 2015/05/07 11:21:39 Done. Show quoted text On 2015/05/06 18:44:01, reveman wrote: > nit: comment style Done.
	152 sum[0] = _mm_add_epi32(data->green[0], data->green[1]);

	153 sum[0] = _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0xB1));

	154

	155 /* [S2 S2 S3 S3] */
	reveman 2015/05/06 18:44:01 nit: comment style nit: comment style radu.velea 2015/05/07 11:21:40 Done. Show quoted text On 2015/05/06 18:44:01, reveman wrote: > nit: comment style Done.
	156 sum[1] = _mm_add_epi32(data->green[2], data->green[3]);

	157 sum[1] = _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0xB1));

	158

	159 float hgreen[2], vgreen[2];

	160 hgreen[0] = (_mm_cvtsi128_si32(

	161 _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0x4E)))) /

	162 8.0f;

	163 hgreen[1] = (_mm_cvtsi128_si32(

	164 _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0x4E)))) /

	165 8.0f;

	166

	167 tmp = _mm_add_epi32(sum[0], sum[1]);

	168 vgreen[0] = (_mm_cvtsi128_si32(tmp)) / 8.0f;

	169 vgreen[1] = (_mm_cvtsi128_si32(_mm_shuffle_epi32(tmp, 0x2))) / 8.0f;

	170

	171 /* get avg blue */

	172 /* [S0 S0 S1 S1] */
	reveman 2015/05/06 18:44:01 nit: comment style nit: comment style radu.velea 2015/05/07 11:21:40 Done. Show quoted text On 2015/05/06 18:44:01, reveman wrote: > nit: comment style Done.
	173 sum[0] = _mm_add_epi32(data->blue[0], data->blue[1]);

	174 sum[0] = _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0xB1));

	175

	176 /* [S2 S2 S3 S3] */
	reveman 2015/05/06 18:44:02 nit: comment style nit: comment style radu.velea 2015/05/07 11:21:39 Done. Show quoted text On 2015/05/06 18:44:02, reveman wrote: > nit: comment style Done.
	177 sum[1] = _mm_add_epi32(data->blue[2], data->blue[3]);

	178 sum[1] = _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0xB1));

	179

	180 float hblue[2], vblue[2];

	181 hblue[0] = (_mm_cvtsi128_si32(

	182 _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0x4E)))) /

	183 8.0f;

	184 hblue[1] = (_mm_cvtsi128_si32(

	185 _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0x4E)))) /

	186 8.0f;

	187

	188 tmp = _mm_add_epi32(sum[0], sum[1]);

	189 vblue[0] = (_mm_cvtsi128_si32(tmp)) / 8.0f;

	190 vblue[1] = (_mm_cvtsi128_si32(_mm_shuffle_epi32(tmp, 0x2))) / 8.0f;

	191

	192 /* TODO(radu.velea): return int's instead of floats */
	reveman 2015/05/06 18:44:01 nit: comment style nit: comment style radu.velea 2015/05/07 11:21:40 Done. Show quoted text On 2015/05/06 18:44:01, reveman wrote: > nit: comment style Done.
	193 output[0] = vblue[0];

	194 output[1] = vgreen[0];

	195 output[2] = vred[0];

	196

	197 output[3] = vblue[1];

	198 output[4] = vgreen[1];

	199 output[5] = vred[1];

	200

	201 output[6] = hblue[0];

	202 output[7] = hgreen[0];

	203 output[8] = hred[0];

	204

	205 output[9] = hblue[1];

	206 output[10] = hgreen[1];

	207 output[11] = hred[1];

	208

	209 __m128i threshold_upper = _mm_set1_epi32(3);

	210 __m128i threshold_lower = _mm_set1_epi32(-4);

	211

	212 __m128 factor_v = _mm_set1_ps(31.0f / 255.0f);

	213 __m128 rounding_v = _mm_set1_ps(0.5f);

	214 __m128 h_avg_0 = _mm_set_ps(hblue[0], hgreen[0], hred[0], 0);

	215 __m128 h_avg_1 = _mm_set_ps(hblue[1], hgreen[1], hred[1], 0);

	216

	217 __m128 v_avg_0 = _mm_set_ps(vblue[0], vgreen[0], vred[0], 0);

	218 __m128 v_avg_1 = _mm_set_ps(vblue[1], vgreen[1], vred[1], 0);

	219

	220 h_avg_0 = _mm_mul_ps(h_avg_0, factor_v);

	221 h_avg_1 = _mm_mul_ps(h_avg_1, factor_v);

	222 v_avg_0 = _mm_mul_ps(v_avg_0, factor_v);

	223 v_avg_1 = _mm_mul_ps(v_avg_1, factor_v);

	224

	225 h_avg_0 = _mm_add_ps(h_avg_0, rounding_v);

	226 h_avg_1 = _mm_add_ps(h_avg_1, rounding_v);

	227 v_avg_0 = _mm_add_ps(v_avg_0, rounding_v);

	228 v_avg_1 = _mm_add_ps(v_avg_1, rounding_v);

	229

	230 __m128i h_avg_0i = _mm_cvttps_epi32(h_avg_0);

	231 __m128i h_avg_1i = _mm_cvttps_epi32(h_avg_1);

	232

	233 __m128i v_avg_0i = _mm_cvttps_epi32(v_avg_0);

	234 __m128i v_avg_1i = _mm_cvttps_epi32(v_avg_1);

	235

	236 h_avg_0i = _mm_sub_epi32(h_avg_1i, h_avg_0i);

	237 v_avg_0i = _mm_sub_epi32(v_avg_1i, v_avg_0i);

	238

	239 __sse_use_diff[0] =

	240 (0 == _mm_movemask_epi8(_mm_cmplt_epi32(v_avg_0i, threshold_lower)));

	241 __sse_use_diff[0] &=

	242 (0 == _mm_movemask_epi8(_mm_cmpgt_epi32(v_avg_0i, threshold_upper)));

	243

	244 __sse_use_diff[1] =

	245 (0 == _mm_movemask_epi8(_mm_cmplt_epi32(h_avg_0i, threshold_lower)));

	246 __sse_use_diff[1] &=

	247 (0 == _mm_movemask_epi8(_mm_cmpgt_epi32(h_avg_0i, threshold_upper)));

	248 }

	249

	250 void ComputeLuminance(uint8_t* block,

	251 const Color& base,

	252 const int sub_block_id,

	253 const uint8_t* idx_to_num_tab,

	254 const __sse_data* data,

	255 const uint32_t expected_error) {

	256 uint8_t best_tbl_idx = 0;

	257 uint32_t best_error = 0x7FFFFFFF;

	258 uint8_t best_mod_idx[8][8]; // [table][texel]

	259

	260 const __m128i base_blue = _mm_set1_epi32(base.channels.b);

	261 const __m128i base_green = _mm_set1_epi32(base.channels.g);

	262 const __m128i base_red = _mm_set1_epi32(base.channels.r);

	263

	264 __m128i test_red, test_blue, test_green, tmp, tmp_blue, tmp_green, tmp_red;

	265 __m128i block_error, mask;

	266

	267 /* this will have the minimum errors for each 4 pixels */
	reveman 2015/05/06 18:44:01 nit: comment style nit: comment style radu.velea 2015/05/07 11:21:39 Done. Show quoted text On 2015/05/06 18:44:01, reveman wrote: > nit: comment style Done.
	268 __m128i first_half_min;

	269 __m128i second_half_min;

	270

	271 /* this will have the matching table index combo for each 4 pixels */
	reveman 2015/05/06 18:44:01 nit: comment style nit: comment style radu.velea 2015/05/07 11:21:39 Done. Show quoted text On 2015/05/06 18:44:01, reveman wrote: > nit: comment style Done.
	272 __m128i first_half_pattern;

	273 __m128i second_half_pattern;

	274

	275 const __m128i first_blue_data_block = data->blue[2 * sub_block_id];

	276 const __m128i first_green_data_block = data->green[2 * sub_block_id];

	277 const __m128i first_red_data_block = data->red[2 * sub_block_id];

	278

	279 const __m128i second_blue_data_block = data->blue[2 * sub_block_id + 1];

	280 const __m128i second_green_data_block = data->green[2 * sub_block_id + 1];

	281 const __m128i second_red_data_block = data->red[2 * sub_block_id + 1];

	282

	283 uint32_t min;

	284 /* fail early to increase speed */
	reveman 2015/05/06 18:44:02 nit: comment style nit: comment style radu.velea 2015/05/07 11:21:40 Done. Show quoted text On 2015/05/06 18:44:02, reveman wrote: > nit: comment style Done.
	285 long delta = INT32_MAX;

	286 uint32_t last_min = INT32_MAX;

	287

	288 const uint8_t shuffle_mask[] = {

	289 0x1B, 0x4E, 0xB1, 0xE4}; /* important they are sorted ascending */
	reveman 2015/05/06 18:44:02 nit: comment style nit: comment style radu.velea 2015/05/07 11:21:39 Done. Show quoted text On 2015/05/06 18:44:02, reveman wrote: > nit: comment style Done.
	290

	291 for (unsigned int tbl_idx = 0; tbl_idx < 8; ++tbl_idx) {

	292 tmp = _mm_set_epi32(

	293 g_codeword_tables[tbl_idx][3], g_codeword_tables[tbl_idx][2],

	294 g_codeword_tables[tbl_idx][1], g_codeword_tables[tbl_idx][0]);

	295

	296 test_blue = AddAndClamp(tmp, base_blue);

	297 test_green = AddAndClamp(tmp, base_green);

	298 test_red = AddAndClamp(tmp, base_red);

	299

	300 first_half_min = __sse_max_int;

	301 second_half_min = __sse_max_int;

	302

	303 first_half_pattern = __sse_zero;

	304 second_half_pattern = __sse_zero;

	305

	306 for (uint8_t imm8 : shuffle_mask) {

	307 switch (imm8) {

	308 case 0x1B:

	309 tmp_blue = _mm_shuffle_epi32(test_blue, 0x1B);

	310 tmp_green = _mm_shuffle_epi32(test_green, 0x1B);

	311 tmp_red = _mm_shuffle_epi32(test_red, 0x1B);

	312 break;

	313 case 0x4E:

	314 tmp_blue = _mm_shuffle_epi32(test_blue, 0x4E);

	315 tmp_green = _mm_shuffle_epi32(test_green, 0x4E);

	316 tmp_red = _mm_shuffle_epi32(test_red, 0x4E);

	317 break;

	318 case 0xB1:

	319 tmp_blue = _mm_shuffle_epi32(test_blue, 0xB1);

	320 tmp_green = _mm_shuffle_epi32(test_green, 0xB1);

	321 tmp_red = _mm_shuffle_epi32(test_red, 0xB1);

	322 break;

	323 case 0xE4:

	324 tmp_blue = _mm_shuffle_epi32(test_blue, 0xE4);

	325 tmp_green = _mm_shuffle_epi32(test_green, 0xE4);

	326 tmp_red = _mm_shuffle_epi32(test_red, 0xE4);

	327 break;

	328 default:

	329 tmp_blue = test_blue;

	330 tmp_green = test_green;

	331 tmp_red = test_red;

	332 }

	333

	334 tmp = _mm_set1_epi32(imm8);

	335

	336 block_error =

	337 AddChannelError(GetColorErrorSSE(tmp_blue, first_blue_data_block),

	338 GetColorErrorSSE(tmp_green, first_green_data_block),

	339 GetColorErrorSSE(tmp_red, first_red_data_block));

	340

	341 /* save winning pattern */
	reveman 2015/05/06 18:44:02 nit: comment style nit: comment style radu.velea 2015/05/07 11:21:40 Done. Show quoted text On 2015/05/06 18:44:02, reveman wrote: > nit: comment style Done.
	342 first_half_pattern = _mm_max_epi16(

	343 first_half_pattern,

	344 _mm_and_si128(tmp, _mm_cmpgt_epi32(first_half_min, block_error)));

	345 /* should use _mm_min_epi32(first_half_min, block_error); otherwise

	346 * performance penalty */
	reveman 2015/05/06 18:44:00 nit: comment style nit: comment style radu.velea 2015/05/07 11:21:40 Done. Show quoted text On 2015/05/06 18:44:00, reveman wrote: > nit: comment style Done.
	347 mask = _mm_cmplt_epi32(block_error, first_half_min);

	348 first_half_min = _mm_add_epi32(_mm_and_si128(mask, block_error),

	349 _mm_andnot_si128(mask, first_half_min));

	350

	351 /* Second part of the block */
	reveman 2015/05/06 18:44:02 nit: comment style nit: comment style radu.velea 2015/05/07 11:21:41 Done. Show quoted text On 2015/05/06 18:44:02, reveman wrote: > nit: comment style Done.
	352 block_error =

	353 AddChannelError(GetColorErrorSSE(tmp_blue, second_blue_data_block),

	354 GetColorErrorSSE(tmp_green, second_green_data_block),

	355 GetColorErrorSSE(tmp_red, second_red_data_block));

	356

	357 /* save winning pattern */
	reveman 2015/05/06 18:44:01 nit: comment style nit: comment style radu.velea 2015/05/07 11:21:40 Done. Show quoted text On 2015/05/06 18:44:01, reveman wrote: > nit: comment style Done.
	358 second_half_pattern = _mm_max_epi16(

	359 second_half_pattern,

	360 _mm_and_si128(tmp, _mm_cmpgt_epi32(second_half_min, block_error)));

	361 /* should use _mm_min_epi32(second_half_min, block_error); otherwise

	362 * performance penalty */
	reveman 2015/05/06 18:44:02 nit: comment style nit: comment style radu.velea 2015/05/07 11:21:40 Done. Show quoted text On 2015/05/06 18:44:02, reveman wrote: > nit: comment style Done.
	363 mask = _mm_cmplt_epi32(block_error, second_half_min);

	364 second_half_min = _mm_add_epi32(_mm_and_si128(mask, block_error),

	365 _mm_andnot_si128(mask, second_half_min));

	366 }

	367

	368 first_half_min = _mm_add_epi32(first_half_min, second_half_min);

	369 first_half_min =

	370 _mm_add_epi32(first_half_min, _mm_shuffle_epi32(first_half_min, 0x4E));

	371 first_half_min =

	372 _mm_add_epi32(first_half_min, _mm_shuffle_epi32(first_half_min, 0xB1));

	373

	374 min = _mm_cvtsi128_si32(first_half_min);

	375

	376 delta = min - last_min;

	377 last_min = min;

	378

	379 if (min < best_error) {

	380 best_tbl_idx = tbl_idx;

	381 best_error = min;

	382

	383 best_mod_idx[tbl_idx][0] =

	384 (_mm_cvtsi128_si32(first_half_pattern) >> (0)) & 3;

	385 best_mod_idx[tbl_idx][4] =

	386 (_mm_cvtsi128_si32(second_half_pattern) >> (0)) & 3;

	387

	388 best_mod_idx[tbl_idx][1] =

	389 (_mm_cvtsi128_si32(_mm_shuffle_epi32(first_half_pattern, 0x1)) >>

	390 (2)) &

	391 3;

	392 best_mod_idx[tbl_idx][5] =

	393 (_mm_cvtsi128_si32(_mm_shuffle_epi32(second_half_pattern, 0x1)) >>

	394 (2)) &

	395 3;

	396

	397 best_mod_idx[tbl_idx][2] =

	398 (_mm_cvtsi128_si32(_mm_shuffle_epi32(first_half_pattern, 0x2)) >>

	399 (4)) &

	400 3;

	401 best_mod_idx[tbl_idx][6] =

	402 (_mm_cvtsi128_si32(_mm_shuffle_epi32(second_half_pattern, 0x2)) >>

	403 (4)) &

	404 3;

	405

	406 best_mod_idx[tbl_idx][3] =

	407 (_mm_cvtsi128_si32(_mm_shuffle_epi32(first_half_pattern, 0x3)) >>

	408 (6)) &

	409 3;

	410 best_mod_idx[tbl_idx][7] =

	411 (_mm_cvtsi128_si32(_mm_shuffle_epi32(second_half_pattern, 0x3)) >>

	412 (6)) &

	413 3;

	414

	415 if (best_error == 0) {

	416 break;

	417 }

	418 } else if (delta > 0 && expected_error < min) {

	419 /* error is growing and is well beyond expected error */
	reveman 2015/05/06 18:44:02 nit: comment style nit: comment style radu.velea 2015/05/07 11:21:39 Done. Show quoted text On 2015/05/06 18:44:02, reveman wrote: > nit: comment style Done.
	420 break;

	421 }

	422 }

	423

	424 WriteCodewordTable(block, sub_block_id, best_tbl_idx);

	425

	426 uint32_t pix_data = 0;

	427 uint8_t mod_idx;

	428 uint8_t pix_idx;

	429 uint32_t lsb;

	430 uint32_t msb;

	431 int texel_num;

	432

	433 for (unsigned int i = 0; i < 8; ++i) {

	434 mod_idx = best_mod_idx[best_tbl_idx][i];

	435 pix_idx = g_mod_to_pix[mod_idx];

	436

	437 lsb = pix_idx & 0x1;

	438 msb = pix_idx >> 1;

	439

	440 // Obtain the texel number as specified in the standard.

	441 texel_num = idx_to_num_tab[i];

	442 pix_data |= msb << (texel_num + 16);

	443 pix_data |= lsb << (texel_num);

	444 }

	445

	446 WritePixelData(block, pix_data);

	447 }

	448

	449 void CompressBlock(uint8_t* dst, __sse_data* data) {

	450 /* first 3 vertical 1, second 3 vertical 2, third 3 horizontal 1, last 3

	451 * horizontal 2 */
	reveman 2015/05/06 18:44:02 nit: comment style nit: comment style radu.velea 2015/05/07 11:21:39 Done. Show quoted text On 2015/05/06 18:44:02, reveman wrote: > nit: comment style Done.
	452 float __sse_avg_colors[12] = {

	453 0,

	454 };

	455 bool use_differential[2] = {true, true};

	456 GetAvgColors(data, __sse_avg_colors, use_differential);

	457 Color sub_block_avg[4];

	458

	459 /* TODO(radu.velea): remove floating point operations and use only int's +

	460 * normal

	461 * rounding and shifts */
	reveman 2015/05/06 18:44:01 nit: comment style and line wrapping nit: comment style and line wrapping radu.velea 2015/05/07 11:21:39 Done. Show quoted text On 2015/05/06 18:44:01, reveman wrote: > nit: comment style and line wrapping Done.
	462 for (int i = 0, j = 1; i < 4; i += 2, j += 2) {

	463 if (use_differential[i / 2] == false) {

	464 sub_block_avg[i] = MakeColor444(&__sse_avg_colors[i * 3]);

	465 sub_block_avg[j] = MakeColor444(&__sse_avg_colors[j * 3]);

	466 } else {

	467 sub_block_avg[i] = MakeColor555(&__sse_avg_colors[i * 3]);

	468 sub_block_avg[j] = MakeColor555(&__sse_avg_colors[j * 3]);

	469 }

	470 }

	471

	472 __m128i red_avg[2], green_avg[2], blue_avg[2];

	473

	474 // TODO(radu.velea): perfect accuracy, maybe skip floating variables

	475 blue_avg[0] =

	476 _mm_set_epi32((int)__sse_avg_colors[3], (int)__sse_avg_colors[3],

	477 (int)__sse_avg_colors[0], (int)__sse_avg_colors[0]);

	478

	479 green_avg[0] =

	480 _mm_set_epi32((int)__sse_avg_colors[4], (int)__sse_avg_colors[4],

	481 (int)__sse_avg_colors[1], (int)__sse_avg_colors[1]);

	482

	483 red_avg[0] =

	484 _mm_set_epi32((int)__sse_avg_colors[5], (int)__sse_avg_colors[5],

	485 (int)__sse_avg_colors[2], (int)__sse_avg_colors[2]);

	486

	487 uint32_t vertical_error[2];

	488 GetVerticalError(data, blue_avg, green_avg, red_avg, vertical_error);

	489

	490 // TODO(radu.velea): perfect accuracy, maybe skip floating variables

	491 blue_avg[0] = _mm_set1_epi32((int)__sse_avg_colors[6]);

	492 blue_avg[1] = _mm_set1_epi32((int)__sse_avg_colors[9]);

	493

	494 green_avg[0] = _mm_set1_epi32((int)__sse_avg_colors[7]);

	495 green_avg[1] = _mm_set1_epi32((int)__sse_avg_colors[10]);

	496

	497 red_avg[0] = _mm_set1_epi32((int)__sse_avg_colors[8]);

	498 red_avg[1] = _mm_set1_epi32((int)__sse_avg_colors[11]);

	499

	500 uint32_t horizontal_error[2];

	501 GetHorizontalError(data, blue_avg, green_avg, red_avg, horizontal_error);

	502

	503 bool flip = (horizontal_error[0] + horizontal_error[1]) <

	504 (vertical_error[0] + vertical_error[1]);

	505 uint32_t* expected_errors = flip == true ? horizontal_error : vertical_error;

	506

	507 // Clear destination buffer so that we can "or" in the results.

	508 memset(dst, 0, 8);

	509

	510 WriteDiff(dst, use_differential[!!flip]);

	511 WriteFlip(dst, flip);

	512

	513 uint8_t sub_block_off_0 = flip ? 2 : 0;

	514 uint8_t sub_block_off_1 = sub_block_off_0 + 1;

	515

	516 if (use_differential[!!flip]) {

	517 WriteColors555(dst, sub_block_avg[sub_block_off_0],

	518 sub_block_avg[sub_block_off_1]);

	519 } else {

	520 WriteColors444(dst, sub_block_avg[sub_block_off_0],

	521 sub_block_avg[sub_block_off_1]);

	522 }

	523

	524 if (flip == false) {

	525 /* transpose vertical data into horizontal lines */
	reveman 2015/05/06 18:44:00 nit: comment style nit: comment style radu.velea 2015/05/07 11:21:40 Done. Show quoted text On 2015/05/06 18:44:00, reveman wrote: > nit: comment style Done.
	526 __m128i tmp;

	527 for (int i = 0; i < 4; i += 2) {

	528 tmp = data->blue[i];

	529 data->blue[i] = _mm_add_epi32(

	530 _mm_move_epi64(data->blue[i]),

	531 _mm_shuffle_epi32(_mm_move_epi64(data->blue[i + 1]), 0x4E));

	532 data->blue[i + 1] = _mm_add_epi32(

	533 _mm_move_epi64(_mm_shuffle_epi32(tmp, 0x4E)),

	534 _mm_shuffle_epi32(

	535 _mm_move_epi64(_mm_shuffle_epi32(data->blue[i + 1], 0x4E)),

	536 0x4E));

	537

	538 tmp = data->green[i];

	539 data->green[i] = _mm_add_epi32(

	540 _mm_move_epi64(data->green[i]),

	541 _mm_shuffle_epi32(_mm_move_epi64(data->green[i + 1]), 0x4E));

	542 data->green[i + 1] = _mm_add_epi32(

	543 _mm_move_epi64(_mm_shuffle_epi32(tmp, 0x4E)),

	544 _mm_shuffle_epi32(

	545 _mm_move_epi64(_mm_shuffle_epi32(data->green[i + 1], 0x4E)),

	546 0x4E));

	547

	548 tmp = data->red[i];

	549 data->red[i] = _mm_add_epi32(

	550 _mm_move_epi64(data->red[i]),

	551 _mm_shuffle_epi32(_mm_move_epi64(data->red[i + 1]), 0x4E));

	552 data->red[i + 1] = _mm_add_epi32(

	553 _mm_move_epi64(_mm_shuffle_epi32(tmp, 0x4E)),

	554 _mm_shuffle_epi32(

	555 _mm_move_epi64(_mm_shuffle_epi32(data->red[i + 1], 0x4E)), 0x4E));

	556 }

	557

	558 tmp = data->blue[1];

	559 data->blue[1] = data->blue[2];

	560 data->blue[2] = tmp;

	561

	562 tmp = data->green[1];

	563 data->green[1] = data->green[2];

	564 data->green[2] = tmp;

	565

	566 tmp = data->red[1];

	567 data->red[1] = data->red[2];

	568 data->red[2] = tmp;

	569 }

	570

	571 // Compute luminance for the first sub block.

	572 ComputeLuminance(dst, sub_block_avg[sub_block_off_0], 0,

	573 g_idx_to_num[sub_block_off_0], data,

	574 ETC1_SET_ERROR(expected_errors[0]));

	575 // Compute luminance for the second sub block.

	576 ComputeLuminance(dst, sub_block_avg[sub_block_off_1], 1,

	577 g_idx_to_num[sub_block_off_1], data,

	578 ETC1_SET_ERROR(expected_errors[1]));

	579 }

	580

	581 static void ExtractBlock(uint8_t* dst, const uint8_t* src, int width) {

	582 for (int j = 0; j < 4; ++j) {

	583 memcpy(&dst[j * 4 * 4], src, 4 * 4);

	584 src += width * 4;

	585 }

	586 }

	587

	588 inline bool TransposeBlock(uint8_t* block, __m128i* transposed /* [4] */) {

	589 /* This function transforms an incoming block of RGBA or BGRA pixels into 4

	590 * registers, each containing the data corresponding for a single channel.

	591 * Ex: transposed[0] will have all the R values for a RGBA block,

	592 * transposed[1] will have G, etc.

	593 * The values are packed as 8 bit unsigned values in the SSE registers.

	594 *

	595 * Before doing any work we check if the block is solid.

	596 */
	reveman 2015/05/06 18:44:02 nit: comment style nit: comment style radu.velea 2015/05/07 11:21:40 Done. Show quoted text On 2015/05/06 18:44:02, reveman wrote: > nit: comment style Done.
	597 __m128i tmp3, tmp2, tmp1, tmp0;

	598 __m128i test_solid = _mm_set1_epi32(*((uint32_t*)block));

	599 uint16_t mask = 0xFFFF;

	600

	601 // a0,a1,a2,...a7, ...a15

	602 transposed[0] = _mm_loadu_si128((__m128i*)(block));

	603 // b0, b1,b2,...b7.... b15

	604 transposed[1] = _mm_loadu_si128((__m128i*)(block + 16));

	605 // c0, c1,c2,...c7....c15

	606 transposed[2] = _mm_loadu_si128((__m128i*)(block + 32));

	607 // d0,d1,d2,...d7....d15

	608 transposed[3] = _mm_loadu_si128((__m128i*)(block + 48));

	609

	610 for (int i = 0; i < 4; i++) {

	611 mask &= _mm_movemask_epi8(_mm_cmpeq_epi8(transposed[i], test_solid));

	612 }

	613

	614 if (mask == 0xFFFF) {

	615 return false; /* block is solid, no need to do any more work */
	reveman 2015/05/06 18:44:01 nit: comment style nit: comment style radu.velea 2015/05/07 11:21:40 Done. Show quoted text On 2015/05/06 18:44:01, reveman wrote: > nit: comment style Done.
	616 }

	617

	618 // a0,b0, a1,b1, a2,b2, a3,b3,....a7,b7

	619 tmp0 = _mm_unpacklo_epi8(transposed[0], transposed[1]);

	620 // c0,d0, c1,d1, c2,d2, c3,d3,... c7,d7

	621 tmp1 = _mm_unpacklo_epi8(transposed[2], transposed[3]);

	622 // a8,b8, a9,b9, a10,b10, a11,b11,...a15,b15

	623 tmp2 = _mm_unpackhi_epi8(transposed[0], transposed[1]);

	624 // c8,d8, c9,d9, c10,d10, c11,d11,...c15,d15

	625 tmp3 = _mm_unpackhi_epi8(transposed[2], transposed[3]);

	626

	627 // a0,a8, b0,b8, a1,a9, b1,b9, ....a3,a11, b3,b11

	628 transposed[0] = _mm_unpacklo_epi8(tmp0, tmp2);

	629 // a4,a12, b4,b12, a5,a13, b5,b13,....a7,a15,b7,b15

	630 transposed[1] = _mm_unpackhi_epi8(tmp0, tmp2);

	631 // c0,c8, d0,d8, c1,c9, d1,d9.....d3,d11

	632 transposed[2] = _mm_unpacklo_epi8(tmp1, tmp3);

	633 // c4,c12,d4,d12, c5,c13, d5,d13,....d7,d15

	634 transposed[3] = _mm_unpackhi_epi8(tmp1, tmp3);

	635

	636 // a0,a8, b0,b8, c0,c8, d0,d8, a1,a9, b1,b9, c1,c9, d1,d9

	637 tmp0 = _mm_unpacklo_epi32(transposed[0], transposed[2]);

	638 // a2,a10, b2,b10, c2,c10, d2,d10, a3,a11, b3,b11, c3,c11, d3,d11

	639 tmp1 = _mm_unpackhi_epi32(transposed[0], transposed[2]);

	640 // a4,a12, b4,b12, c4,c12, d4,d12, a5,a13, b5,b13, c5,c13, d5,d13

	641 tmp2 = _mm_unpacklo_epi32(transposed[1], transposed[3]);

	642 // a6,a14, b6,b14, c6,c14, d6,d14, a7,a15, b7,b15, c7,c15, d7,d15

	643 tmp3 = _mm_unpackhi_epi32(transposed[1], transposed[3]);

	644

	645 // a0,a4, a8,a12, b0,b4, b8,b12, c0,c4, c8,c12, d0,d4, d8,d12

	646 transposed[0] = _mm_unpacklo_epi8(tmp0, tmp2);

	647 // a1,a5, a9,a13, b1,b5, b9,b13, c1,c5, c9,c13, d1,d5, d9,d13

	648 transposed[1] = _mm_unpackhi_epi8(tmp0, tmp2);

	649 // a2,a6, a10,a14, b2,b6, b10,b14, c2,c6, c10,c14, d2,d6, d10,d14

	650 transposed[2] = _mm_unpacklo_epi8(tmp1, tmp3);

	651 // a3,a7, a11,a15, b3,b7, b11,b15, c3,c7, c11,c15, d3,d7, d11,d15

	652 transposed[3] = _mm_unpackhi_epi8(tmp1, tmp3);

	653

	654 return true;

	655 }

	656

	// Widens four packed 8-bit planes to 32 bits per value.
	//
	// Each of |packed[0..3]| holds sixteen 8-bit values. Every value is
	// zero-extended to 32 bits and stored, four values per register, into the
	// output that corresponds to its plane:
	//   packed[0] -> |red|, packed[1] -> |green|,
	//   packed[2] -> |blue|, packed[3] -> |alpha|.
	// Each output must have room for four __m128i; element k of an output holds
	// input bytes 4k..4k+3 of its plane, in order.
	inline void UnpackBlock(__m128i* packed,
	                        __m128i* red,
	                        __m128i* green,
	                        __m128i* blue,
	                        __m128i* alpha) {
	  const __m128i zero = _mm_set1_epi8(0);
	  __m128i* const planes[4] = {red, green, blue, alpha};

	  for (int plane = 0; plane < 4; ++plane) {
	    __m128i* const out = planes[plane];

	    // 8-bit -> 16-bit: interleaving with zero bytes zero-extends each value.
	    const __m128i lower_u16 = _mm_unpacklo_epi8(packed[plane], zero);
	    const __m128i upper_u16 = _mm_unpackhi_epi8(packed[plane], zero);

	    // 16-bit -> 32-bit, using the same interleave-with-zero trick.
	    out[0] = _mm_unpacklo_epi16(lower_u16, zero);
	    out[1] = _mm_unpackhi_epi16(lower_u16, zero);
	    out[2] = _mm_unpacklo_epi16(upper_u16, zero);
	    out[3] = _mm_unpackhi_epi16(upper_u16, zero);
	  }
	}

	705

	706 inline void CompressSolid(uint8_t* dst, uint8_t* block) {

	707 // Clear destination buffer so that we can "or" in the results.

	708 memset(dst, 0, 8);

	709

	710 const float src_color_float[3] = {static_cast<float>(block[0]),

	711 static_cast<float>(block[1]),

	712 static_cast<float>(block[2])};

	713 const Color base = MakeColor555(src_color_float);

	714 const __m128i base_v =

	715 _mm_set_epi32(0, base.channels.r, base.channels.g, base.channels.b);

	716

	717 const __m128i constant = _mm_set_epi32(0, block[2], block[1], block[0]);

	718 __m128i lum;

	719 __m128i colors[4];

	720 static const __m128i rgb =

	721 _mm_set_epi32(0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);

	722

	723 WriteDiff(dst, true);

	724 WriteFlip(dst, false);

	725

	726 WriteColors555(dst, base, base);

	727

	728 uint8_t best_tbl_idx = 0;

	729 uint8_t best_mod_idx = 0;

	730 uint32_t best_mod_err = INT32_MAX;

	731

	732 for (unsigned int tbl_idx = 0; tbl_idx < 8; ++tbl_idx) {

	733 lum = _mm_set_epi32(

	734 g_codeword_tables[tbl_idx][3], g_codeword_tables[tbl_idx][2],

	735 g_codeword_tables[tbl_idx][1], g_codeword_tables[tbl_idx][0]);

	736 colors[0] = AddAndClamp(base_v, _mm_shuffle_epi32(lum, 0x0));

	737 colors[1] = AddAndClamp(base_v, _mm_shuffle_epi32(lum, 0x55));

	738 colors[2] = AddAndClamp(base_v, _mm_shuffle_epi32(lum, 0xAA));

	739 colors[3] = AddAndClamp(base_v, _mm_shuffle_epi32(lum, 0xFF));

	740

	741 for (int i = 0; i < 4; i++) {

	742 uint32_t mod_err =

	743 SumSSE(GetColorErrorSSE(constant, _mm_and_si128(colors[i], rgb)));

	744 colors[i] = _mm_and_si128(colors[i], rgb);

	745 if (mod_err < best_mod_err) {

	746 best_tbl_idx = tbl_idx;

	747 best_mod_idx = i;

	748 best_mod_err = mod_err;

	749

	750 if (mod_err == 0) {

	751 break; // We cannot do any better than this.

	752 }

	753 }

	754 }

	755 }

	756

	757 WriteCodewordTable(dst, 0, best_tbl_idx);

	758 WriteCodewordTable(dst, 1, best_tbl_idx);

	759

	760 uint8_t pix_idx = g_mod_to_pix[best_mod_idx];

	761 uint32_t lsb = pix_idx & 0x1;

	762 uint32_t msb = pix_idx >> 1;

	763

	764 uint32_t pix_data = 0;

	765 for (unsigned int i = 0; i < 2; ++i) {

	766 for (unsigned int j = 0; j < 8; ++j) {

	767 // Obtain the texel number as specified in the standard.

	768 int texel_num = g_idx_to_num[i][j];

	769 pix_data \|= msb << (texel_num + 16);

	770 pix_data \|= lsb << (texel_num);

	771 }

	772 }

	773

	774 WritePixelData(dst, pix_data);

	775 }

	776

	777 } // namespace

	778

	779 namespace cc {

	780

	781 void TextureCompressorETC1SSE::Compress(const uint8_t* src,

	782 uint8_t* dst,

	783 int width,

	784 int height,

	785 Quality quality) {

	786 DCHECK(width >= 4 && (width & 3) == 0);
	reveman 2015/05/06 18:44:01 nit: two DCHECKs instead: DCHECK_GE(width, 4); DCH nit: two DCHECKs instead: DCHECK_GE(width, 4); DCHECK_EQ(width & 3, 0); radu.velea 2015/05/07 11:21:40 Done. Show quoted text On 2015/05/06 18:44:01, reveman wrote: > nit: two DCHECKs instead: > DCHECK_GE(width, 4); > DCHECK_EQ(width & 3, 0); Done.
	787 DCHECK(height >= 4 && (height & 3) == 0);
	reveman 2015/05/06 18:44:02 nit: two DCHECKs instead nit: two DCHECKs instead radu.velea 2015/05/07 11:21:39 Done. Show quoted text On 2015/05/06 18:44:02, reveman wrote: > nit: two DCHECKs instead Done. radu.velea 2015/05/07 11:21:41 Done. Show quoted text On 2015/05/06 18:44:02, reveman wrote: > nit: two DCHECKs instead Done.
	788

	789 ALIGNAS(16) uint8_t block[64];

	790 __m128i packed[4];

	791 __m128i red[4], green[4], blue[4], alpha[4];

	792 __sse_data data;

	793

	794 for (int y = 0; y < height; y += 4, src += width * 4 * 4) {

	795 for (int x = 0; x < width; x += 4, dst += 8) {

	796 ExtractBlock(block, src + x * 4, width);

	797 if (TransposeBlock(block, packed) == false) {

	798 CompressSolid(dst, block);

	799 } else {

	800 UnpackBlock(packed, blue, green, red, alpha);

	801

	802 data.block = block;

	803 data.packed = packed;

	804 data.red = red;

	805 data.blue = blue;

	806 data.green = green;

	807

	808 CompressBlock(dst, &data);

	809 }

	810 }

	811 }

	812 }

	813

	814 } // namespace cc

OLD	NEW