cc/resources/texture_compressor_etc1_sse.cc - Issue 1096703002: Reland: Add ETC1 powered SSE encoder for tile texture compression

Side by Side Diff: cc/resources/texture_compressor_etc1_sse.cc

Issue 1096703002: Reland: Add ETC1 powered SSE encoder for tile texture compression (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Updated comments, casts and other minor issues. Created 5 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 // Copyright 2015 The Chromium Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4

	5 #include "cc/resources/texture_compressor_etc1_sse.h"

	6

	7 #include <assert.h>
	reveman (2015/05/07 16:26:29): is this needed? radu.velea (2015/05/08 10:52:11), replying to reveman: Done.
	8 #include <emmintrin.h>

	9 #include <cmath>

	10 #include <limits>

	11

	12 #include "base/compiler_specific.h"

	13 #include "base/logging.h"

	14 // Using this header for common functions such as Color handling

	15 // and codeword table.

	16 #include "cc/resources/texture_compressor_etc1.h"

	17

	18 namespace cc {

	19

	20 namespace {

	21

	// Derives the error ceiling that ComputeLuminance uses to stop scanning the
	// ETC1 codeword tables early.
	//
	// The codeword tables are sorted ascending and the minimum error computed in
	// the ComputeLuminance main loop converges towards the best achievable value.
	// Once the error exceeds this threshold (the average error plus half of it
	// plus a fixed slack of 384) it no longer makes sense to iterate further
	// through the tables.
	inline uint32_t SetETC1MaxError(uint32_t avg_error) {
	  const uint32_t kSlack = 384;
	  const uint32_t half_of_avg = avg_error / 2;
	  return avg_error + half_of_avg + kSlack;
	}

	32

	// Bundles the different views of one 4x4 pixel block that the SSE encoder
	// passes between its helper functions. The struct does not own any of the
	// buffers it points to.
	struct __sse_data {
	  // Raw data of the block.
	  uint8_t* block;
	  // The block stored as 8 bit packed values.
	  __m128i* packed;
	  // One entry per colour channel; each holds the channel values zero
	  // extended to 32 bits and arranged as a 4x4 array.
	  __m128i* blue;
	  __m128i* green;
	  __m128i* red;
	};

	43

	// Registers shared by the helpers below: a vector holding the largest
	// positive 32 bit integer in every lane and an all-zero vector.
	static const __m128i __sse_max_int = _mm_set1_epi32(0x7FFFFFFF);
	static const __m128i __sse_zero = _mm_set1_epi32(0);

	47

	48 inline __m128i AddAndClamp(const __m128i x, const __m128i y) {

	49 static const __m128i color_max = _mm_set1_epi32(0xFF);

	50 return _mm_max_epi16(__sse_zero,

	51 _mm_min_epi16(_mm_add_epi16(x, y), color_max));

	52 }

	53

	// Returns the element-wise squared difference of |x| and |y|.
	//
	// The subtraction and the multiplication operate on 16 bit elements:
	// _mm_mullo_epi16 (SSE2) is used in place of _mm_mullo_epi32 (SSE4).
	inline __m128i GetColorErrorSSE(const __m128i x, const __m128i y) {
	  const __m128i difference = _mm_sub_epi16(x, y);
	  return _mm_mullo_epi16(difference, difference);
	}

	59

	// Sums three per-channel error vectors lane by lane (32 bit lanes).
	inline __m128i AddChannelError(const __m128i x,
	                               const __m128i y,
	                               const __m128i z) {
	  const __m128i y_plus_z = _mm_add_epi32(y, z);
	  return _mm_add_epi32(x, y_plus_z);
	}

	65

	// Returns the sum of the four 32 bit lanes of |x|.
	inline uint32_t SumSSE(const __m128i x) {
	  // Fold the upper 64 bits onto the lower 64 bits.
	  const __m128i halves = _mm_add_epi32(x, _mm_shuffle_epi32(x, 0x4E));
	  // Add neighbouring lanes so that every lane carries the total.
	  const __m128i total =
	      _mm_add_epi32(halves, _mm_shuffle_epi32(halves, 0xB1));
	  return _mm_cvtsi128_si32(total);
	}

	72

	73 inline uint32_t GetVerticalError(const __sse_data* data,

	74 const __m128i* blue_avg,

	75 const __m128i* green_avg,

	76 const __m128i* red_avg,

	77 uint32_t* verror) {

	78 __m128i error = __sse_zero;

	79

	80 for (int i = 0; i < 4; i++) {

	81 error = _mm_add_epi32(error, GetColorErrorSSE(data->blue[i], blue_avg[0]));

	82 error =

	83 _mm_add_epi32(error, GetColorErrorSSE(data->green[i], green_avg[0]));

	84 error = _mm_add_epi32(error, GetColorErrorSSE(data->red[i], red_avg[0]));

	85 }

	86

	87 error = _mm_add_epi32(error, _mm_shuffle_epi32(error, 0x4E));

	88

	89 verror[0] = _mm_cvtsi128_si32(error);

	90 verror[1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(error, 0xB1));

	91

	92 return verror[0] + verror[1];

	93 }

	94

	95 inline uint32_t GetHorizontalError(const __sse_data* data,

	96 const __m128i* blue_avg,

	97 const __m128i* green_avg,

	98 const __m128i* red_avg,

	99 uint32_t* verror) {

	100 __m128i error = __sse_zero;

	101 int first_index, second_index;

	102

	103 for (int i = 0; i < 2; i++) {

	104 first_index = 2 * i;

	105 second_index = first_index + 1;

	106

	107 error = _mm_add_epi32(

	108 error, GetColorErrorSSE(data->blue[first_index], blue_avg[i]));

	109 error = _mm_add_epi32(

	110 error, GetColorErrorSSE(data->blue[second_index], blue_avg[i]));

	111 error = _mm_add_epi32(

	112 error, GetColorErrorSSE(data->green[first_index], green_avg[i]));

	113 error = _mm_add_epi32(

	114 error, GetColorErrorSSE(data->green[second_index], green_avg[i]));

	115 error = _mm_add_epi32(error,

	116 GetColorErrorSSE(data->red[first_index], red_avg[i]));

	117 error = _mm_add_epi32(

	118 error, GetColorErrorSSE(data->red[second_index], red_avg[i]));

	119 }

	120

	121 error = _mm_add_epi32(error, _mm_shuffle_epi32(error, 0x4E));

	122

	123 verror[0] = _mm_cvtsi128_si32(error);

	124 verror[1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(error, 0xB1));

	125

	126 return verror[0] + verror[1];

	127 }

	128

	129 inline void GetAvgColors(const __sse_data* data,

	130 float* output,

	131 bool* __sse_use_diff) {

	132 __m128i sum[2], tmp;

	133

	134 // TODO(radu.velea): _mm_avg_epu8 on packed data maybe.

	135

	136 // Compute avg red value.

	137 // [S0 S0 S1 S1]

	138 sum[0] = _mm_add_epi32(data->red[0], data->red[1]);

	139 sum[0] = _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0xB1));

	140

	141 // [S2 S2 S3 S3]

	142 sum[1] = _mm_add_epi32(data->red[2], data->red[3]);

	143 sum[1] = _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0xB1));

	144

	145 float hred[2], vred[2];

	146 hred[0] = (_mm_cvtsi128_si32(

	147 _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0x4E)))) /

	148 8.0f;

	149 hred[1] = (_mm_cvtsi128_si32(

	150 _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0x4E)))) /

	151 8.0f;

	152

	153 tmp = _mm_add_epi32(sum[0], sum[1]);

	154 vred[0] = (_mm_cvtsi128_si32(tmp)) / 8.0f;

	155 vred[1] = (_mm_cvtsi128_si32(_mm_shuffle_epi32(tmp, 0x2))) / 8.0f;

	156

	157 // Compute avg green value.

	158 // [S0 S0 S1 S1]

	159 sum[0] = _mm_add_epi32(data->green[0], data->green[1]);

	160 sum[0] = _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0xB1));

	161

	162 // [S2 S2 S3 S3]

	163 sum[1] = _mm_add_epi32(data->green[2], data->green[3]);

	164 sum[1] = _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0xB1));

	165

	166 float hgreen[2], vgreen[2];

	167 hgreen[0] = (_mm_cvtsi128_si32(

	168 _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0x4E)))) /

	169 8.0f;

	170 hgreen[1] = (_mm_cvtsi128_si32(

	171 _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0x4E)))) /

	172 8.0f;

	173

	174 tmp = _mm_add_epi32(sum[0], sum[1]);

	175 vgreen[0] = (_mm_cvtsi128_si32(tmp)) / 8.0f;

	176 vgreen[1] = (_mm_cvtsi128_si32(_mm_shuffle_epi32(tmp, 0x2))) / 8.0f;

	177

	178 // Compute avg blue value.

	179 // [S0 S0 S1 S1]

	180 sum[0] = _mm_add_epi32(data->blue[0], data->blue[1]);

	181 sum[0] = _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0xB1));

	182

	183 // [S2 S2 S3 S3]

	184 sum[1] = _mm_add_epi32(data->blue[2], data->blue[3]);

	185 sum[1] = _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0xB1));

	186

	187 float hblue[2], vblue[2];

	188 hblue[0] = (_mm_cvtsi128_si32(

	189 _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0x4E)))) /

	190 8.0f;

	191 hblue[1] = (_mm_cvtsi128_si32(

	192 _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0x4E)))) /

	193 8.0f;

	194

	195 tmp = _mm_add_epi32(sum[0], sum[1]);

	196 vblue[0] = (_mm_cvtsi128_si32(tmp)) / 8.0f;

	197 vblue[1] = (_mm_cvtsi128_si32(_mm_shuffle_epi32(tmp, 0x2))) / 8.0f;

	198

	199 // TODO(radu.velea): Return int's instead of floats, based on Quality.

	200 output[0] = vblue[0];

	201 output[1] = vgreen[0];

	202 output[2] = vred[0];

	203

	204 output[3] = vblue[1];

	205 output[4] = vgreen[1];

	206 output[5] = vred[1];

	207

	208 output[6] = hblue[0];

	209 output[7] = hgreen[0];

	210 output[8] = hred[0];

	211

	212 output[9] = hblue[1];

	213 output[10] = hgreen[1];

	214 output[11] = hred[1];

	215

	216 __m128i threshold_upper = _mm_set1_epi32(3);

	217 __m128i threshold_lower = _mm_set1_epi32(-4);

	218

	219 __m128 factor_v = _mm_set1_ps(31.0f / 255.0f);

	220 __m128 rounding_v = _mm_set1_ps(0.5f);

	221 __m128 h_avg_0 = _mm_set_ps(hblue[0], hgreen[0], hred[0], 0);

	222 __m128 h_avg_1 = _mm_set_ps(hblue[1], hgreen[1], hred[1], 0);

	223

	224 __m128 v_avg_0 = _mm_set_ps(vblue[0], vgreen[0], vred[0], 0);

	225 __m128 v_avg_1 = _mm_set_ps(vblue[1], vgreen[1], vred[1], 0);

	226

	227 h_avg_0 = _mm_mul_ps(h_avg_0, factor_v);

	228 h_avg_1 = _mm_mul_ps(h_avg_1, factor_v);

	229 v_avg_0 = _mm_mul_ps(v_avg_0, factor_v);

	230 v_avg_1 = _mm_mul_ps(v_avg_1, factor_v);

	231

	232 h_avg_0 = _mm_add_ps(h_avg_0, rounding_v);

	233 h_avg_1 = _mm_add_ps(h_avg_1, rounding_v);

	234 v_avg_0 = _mm_add_ps(v_avg_0, rounding_v);

	235 v_avg_1 = _mm_add_ps(v_avg_1, rounding_v);

	236

	237 __m128i h_avg_0i = _mm_cvttps_epi32(h_avg_0);

	238 __m128i h_avg_1i = _mm_cvttps_epi32(h_avg_1);

	239

	240 __m128i v_avg_0i = _mm_cvttps_epi32(v_avg_0);

	241 __m128i v_avg_1i = _mm_cvttps_epi32(v_avg_1);

	242

	243 h_avg_0i = _mm_sub_epi32(h_avg_1i, h_avg_0i);

	244 v_avg_0i = _mm_sub_epi32(v_avg_1i, v_avg_0i);

	245

	246 __sse_use_diff[0] =

	247 (0 == _mm_movemask_epi8(_mm_cmplt_epi32(v_avg_0i, threshold_lower)));

	248 __sse_use_diff[0] &=

	249 (0 == _mm_movemask_epi8(_mm_cmpgt_epi32(v_avg_0i, threshold_upper)));

	250

	251 __sse_use_diff[1] =

	252 (0 == _mm_movemask_epi8(_mm_cmplt_epi32(h_avg_0i, threshold_lower)));

	253 __sse_use_diff[1] &=

	254 (0 == _mm_movemask_epi8(_mm_cmpgt_epi32(h_avg_0i, threshold_upper)));

	255 }

	256

	257 void ComputeLuminance(uint8_t* block,

	258 const Color& base,

	259 const int sub_block_id,

	260 const uint8_t* idx_to_num_tab,

	261 const __sse_data* data,

	262 const uint32_t expected_error) {

	263 uint8_t best_tbl_idx = 0;

	264 uint32_t best_error = 0x7FFFFFFF;

	265 uint8_t best_mod_idx[8][8]; // [table][texel]

	266

	267 const __m128i base_blue = _mm_set1_epi32(base.channels.b);

	268 const __m128i base_green = _mm_set1_epi32(base.channels.g);

	269 const __m128i base_red = _mm_set1_epi32(base.channels.r);

	270

	271 __m128i test_red, test_blue, test_green, tmp, tmp_blue, tmp_green, tmp_red;

	272 __m128i block_error, mask;

	273

	274 // This will have the minimum errors for each 4 pixels.

	275 __m128i first_half_min;

	276 __m128i second_half_min;

	277

	278 // This will have the matching table index combo for each 4 pixels.

	279 __m128i first_half_pattern;

	280 __m128i second_half_pattern;

	281

	282 const __m128i first_blue_data_block = data->blue[2 * sub_block_id];

	283 const __m128i first_green_data_block = data->green[2 * sub_block_id];

	284 const __m128i first_red_data_block = data->red[2 * sub_block_id];

	285

	286 const __m128i second_blue_data_block = data->blue[2 * sub_block_id + 1];

	287 const __m128i second_green_data_block = data->green[2 * sub_block_id + 1];

	288 const __m128i second_red_data_block = data->red[2 * sub_block_id + 1];

	289

	290 uint32_t min;

	291 // Fail early to increase speed.

	292 long delta = INT32_MAX;

	293 uint32_t last_min = INT32_MAX;

	294

	295 const uint8_t shuffle_mask[] = {

	296 0x1B, 0x4E, 0xB1, 0xE4}; // Important they are sorted ascending.

	297

	298 for (unsigned int tbl_idx = 0; tbl_idx < 8; ++tbl_idx) {

	299 tmp = _mm_set_epi32(

	300 g_codeword_tables[tbl_idx][3], g_codeword_tables[tbl_idx][2],

	301 g_codeword_tables[tbl_idx][1], g_codeword_tables[tbl_idx][0]);

	302

	303 test_blue = AddAndClamp(tmp, base_blue);

	304 test_green = AddAndClamp(tmp, base_green);

	305 test_red = AddAndClamp(tmp, base_red);

	306

	307 first_half_min = __sse_max_int;

	308 second_half_min = __sse_max_int;

	309

	310 first_half_pattern = __sse_zero;

	311 second_half_pattern = __sse_zero;

	312

	313 for (uint8_t imm8 : shuffle_mask) {

	314 switch (imm8) {

	315 case 0x1B:

	316 tmp_blue = _mm_shuffle_epi32(test_blue, 0x1B);

	317 tmp_green = _mm_shuffle_epi32(test_green, 0x1B);

	318 tmp_red = _mm_shuffle_epi32(test_red, 0x1B);

	319 break;

	320 case 0x4E:

	321 tmp_blue = _mm_shuffle_epi32(test_blue, 0x4E);

	322 tmp_green = _mm_shuffle_epi32(test_green, 0x4E);

	323 tmp_red = _mm_shuffle_epi32(test_red, 0x4E);

	324 break;

	325 case 0xB1:

	326 tmp_blue = _mm_shuffle_epi32(test_blue, 0xB1);

	327 tmp_green = _mm_shuffle_epi32(test_green, 0xB1);

	328 tmp_red = _mm_shuffle_epi32(test_red, 0xB1);

	329 break;

	330 case 0xE4:

	331 tmp_blue = _mm_shuffle_epi32(test_blue, 0xE4);

	332 tmp_green = _mm_shuffle_epi32(test_green, 0xE4);

	333 tmp_red = _mm_shuffle_epi32(test_red, 0xE4);

	334 break;

	335 default:

	336 tmp_blue = test_blue;

	337 tmp_green = test_green;

	338 tmp_red = test_red;

	339 }

	340

	341 tmp = _mm_set1_epi32(imm8);

	342

	343 block_error =

	344 AddChannelError(GetColorErrorSSE(tmp_blue, first_blue_data_block),

	345 GetColorErrorSSE(tmp_green, first_green_data_block),

	346 GetColorErrorSSE(tmp_red, first_red_data_block));

	347

	348 // Save winning pattern.

	349 first_half_pattern = _mm_max_epi16(

	350 first_half_pattern,

	351 _mm_and_si128(tmp, _mm_cmpgt_epi32(first_half_min, block_error)));

	352 // Should use _mm_min_epi32(first_half_min, block_error); from SSE4

	353 // otherwise we have a small performance penalty.

	354 mask = _mm_cmplt_epi32(block_error, first_half_min);

	355 first_half_min = _mm_add_epi32(_mm_and_si128(mask, block_error),

	356 _mm_andnot_si128(mask, first_half_min));

	357

	358 // Compute second part of the block.

	359 block_error =

	360 AddChannelError(GetColorErrorSSE(tmp_blue, second_blue_data_block),

	361 GetColorErrorSSE(tmp_green, second_green_data_block),

	362 GetColorErrorSSE(tmp_red, second_red_data_block));

	363

	364 // Save winning pattern.

	365 second_half_pattern = _mm_max_epi16(

	366 second_half_pattern,

	367 _mm_and_si128(tmp, _mm_cmpgt_epi32(second_half_min, block_error)));

	368 // Should use _mm_min_epi32(second_half_min, block_error); from SSE4

	369 // otherwise we have a small performance penalty.

	370 mask = _mm_cmplt_epi32(block_error, second_half_min);

	371 second_half_min = _mm_add_epi32(_mm_and_si128(mask, block_error),

	372 _mm_andnot_si128(mask, second_half_min));

	373 }

	374

	375 first_half_min = _mm_add_epi32(first_half_min, second_half_min);

	376 first_half_min =

	377 _mm_add_epi32(first_half_min, _mm_shuffle_epi32(first_half_min, 0x4E));

	378 first_half_min =

	379 _mm_add_epi32(first_half_min, _mm_shuffle_epi32(first_half_min, 0xB1));

	380

	381 min = _mm_cvtsi128_si32(first_half_min);

	382

	383 delta = min - last_min;

	384 last_min = min;

	385

	386 if (min < best_error) {

	387 best_tbl_idx = tbl_idx;

	388 best_error = min;

	389

	390 best_mod_idx[tbl_idx][0] =

	391 (_mm_cvtsi128_si32(first_half_pattern) >> (0)) & 3;

	392 best_mod_idx[tbl_idx][4] =

	393 (_mm_cvtsi128_si32(second_half_pattern) >> (0)) & 3;

	394

	395 best_mod_idx[tbl_idx][1] =

	396 (_mm_cvtsi128_si32(_mm_shuffle_epi32(first_half_pattern, 0x1)) >>

	397 (2)) &

	398 3;

	399 best_mod_idx[tbl_idx][5] =

	400 (_mm_cvtsi128_si32(_mm_shuffle_epi32(second_half_pattern, 0x1)) >>

	401 (2)) &

	402 3;

	403

	404 best_mod_idx[tbl_idx][2] =

	405 (_mm_cvtsi128_si32(_mm_shuffle_epi32(first_half_pattern, 0x2)) >>

	406 (4)) &

	407 3;

	408 best_mod_idx[tbl_idx][6] =

	409 (_mm_cvtsi128_si32(_mm_shuffle_epi32(second_half_pattern, 0x2)) >>

	410 (4)) &

	411 3;

	412

	413 best_mod_idx[tbl_idx][3] =

	414 (_mm_cvtsi128_si32(_mm_shuffle_epi32(first_half_pattern, 0x3)) >>

	415 (6)) &

	416 3;

	417 best_mod_idx[tbl_idx][7] =

	418 (_mm_cvtsi128_si32(_mm_shuffle_epi32(second_half_pattern, 0x3)) >>

	419 (6)) &

	420 3;

	421

	422 if (best_error == 0) {

	423 break;

	424 }

	425 } else if (delta > 0 && expected_error < min) {

	426 // The error is growing and is well beyond expected threshold.

	427 break;

	428 }

	429 }

	430

	431 WriteCodewordTable(block, sub_block_id, best_tbl_idx);

	432

	433 uint32_t pix_data = 0;

	434 uint8_t mod_idx;

	435 uint8_t pix_idx;

	436 uint32_t lsb;

	437 uint32_t msb;

	438 int texel_num;

	439

	440 for (unsigned int i = 0; i < 8; ++i) {

	441 mod_idx = best_mod_idx[best_tbl_idx][i];

	442 pix_idx = g_mod_to_pix[mod_idx];

	443

	444 lsb = pix_idx & 0x1;

	445 msb = pix_idx >> 1;

	446

	447 // Obtain the texel number as specified in the standard.

	448 texel_num = idx_to_num_tab[i];

	449 pix_data \|= msb << (texel_num + 16);

	450 pix_data \|= lsb << (texel_num);

	451 }

	452

	453 WritePixelData(block, pix_data);

	454 }

	455

	456 void CompressBlock(uint8_t* dst, __sse_data* data) {

	457 // First 3 values are for vertical 1, second 3 vertical 2, third 3 horizontal

	458 // 1, last 3

	459 // horizontal 2.

	460 float __sse_avg_colors[12] = {

	461 0,

	462 };

	463 bool use_differential[2] = {true, true};

	464 GetAvgColors(data, __sse_avg_colors, use_differential);

	465 Color sub_block_avg[4];

	466

	467 // TODO(radu.velea): Remove floating point operations and use only int's +

	468 // normal rounding and shifts for reduced Quality.

	469 for (int i = 0, j = 1; i < 4; i += 2, j += 2) {

	470 if (use_differential[i / 2] == false) {

	471 sub_block_avg[i] = MakeColor444(&__sse_avg_colors[i * 3]);

	472 sub_block_avg[j] = MakeColor444(&__sse_avg_colors[j * 3]);

	473 } else {

	474 sub_block_avg[i] = MakeColor555(&__sse_avg_colors[i * 3]);

	475 sub_block_avg[j] = MakeColor555(&__sse_avg_colors[j * 3]);

	476 }

	477 }

	478

	479 __m128i red_avg[2], green_avg[2], blue_avg[2];

	480

	481 // TODO(radu.velea): Perfect accuracy, maybe skip floating variables.

	482 blue_avg[0] = _mm_set_epi32(static_cast<int>(__sse_avg_colors[3]),

	483 static_cast<int>(__sse_avg_colors[3]),

	484 static_cast<int>(__sse_avg_colors[0]),

	485 static_cast<int>(__sse_avg_colors[0]));

	486

	487 green_avg[0] = _mm_set_epi32(static_cast<int>(__sse_avg_colors[4]),

	488 static_cast<int>(__sse_avg_colors[4]),

	489 static_cast<int>(__sse_avg_colors[1]),

	490 static_cast<int>(__sse_avg_colors[1]));

	491

	492 red_avg[0] = _mm_set_epi32(static_cast<int>(__sse_avg_colors[5]),

	493 static_cast<int>(__sse_avg_colors[5]),

	494 static_cast<int>(__sse_avg_colors[2]),

	495 static_cast<int>(__sse_avg_colors[2]));

	496

	497 uint32_t vertical_error[2];

	498 GetVerticalError(data, blue_avg, green_avg, red_avg, vertical_error);

	499

	500 // TODO(radu.velea): Perfect accuracy, maybe skip floating variables.

	501 blue_avg[0] = _mm_set1_epi32(static_cast<int>(__sse_avg_colors[6]));

	502 blue_avg[1] = _mm_set1_epi32(static_cast<int>(__sse_avg_colors[9]));

	503

	504 green_avg[0] = _mm_set1_epi32(static_cast<int>(__sse_avg_colors[7]));

	505 green_avg[1] = _mm_set1_epi32(static_cast<int>(__sse_avg_colors[10]));

	506

	507 red_avg[0] = _mm_set1_epi32(static_cast<int>(__sse_avg_colors[8]));

	508 red_avg[1] = _mm_set1_epi32(static_cast<int>(__sse_avg_colors[11]));

	509

	510 uint32_t horizontal_error[2];

	511 GetHorizontalError(data, blue_avg, green_avg, red_avg, horizontal_error);

	512

	513 bool flip = (horizontal_error[0] + horizontal_error[1]) <

	514 (vertical_error[0] + vertical_error[1]);

	515 uint32_t* expected_errors = flip ? horizontal_error : vertical_error;

	516

	517 // Clear destination buffer so that we can "or" in the results.

	518 memset(dst, 0, 8);

	519

	520 WriteDiff(dst, use_differential[!!flip]);

	521 WriteFlip(dst, flip);

	522

	523 uint8_t sub_block_off_0 = flip ? 2 : 0;

	524 uint8_t sub_block_off_1 = sub_block_off_0 + 1;

	525

	526 if (use_differential[!!flip]) {

	527 WriteColors555(dst, sub_block_avg[sub_block_off_0],

	528 sub_block_avg[sub_block_off_1]);

	529 } else {

	530 WriteColors444(dst, sub_block_avg[sub_block_off_0],

	531 sub_block_avg[sub_block_off_1]);

	532 }

	533

	534 if (!flip) {

	535 // Transpose vertical data into horizontal lines.

	536 __m128i tmp;

	537 for (int i = 0; i < 4; i += 2) {

	538 tmp = data->blue[i];

	539 data->blue[i] = _mm_add_epi32(

	540 _mm_move_epi64(data->blue[i]),

	541 _mm_shuffle_epi32(_mm_move_epi64(data->blue[i + 1]), 0x4E));

	542 data->blue[i + 1] = _mm_add_epi32(

	543 _mm_move_epi64(_mm_shuffle_epi32(tmp, 0x4E)),

	544 _mm_shuffle_epi32(

	545 _mm_move_epi64(_mm_shuffle_epi32(data->blue[i + 1], 0x4E)),

	546 0x4E));

	547

	548 tmp = data->green[i];

	549 data->green[i] = _mm_add_epi32(

	550 _mm_move_epi64(data->green[i]),

	551 _mm_shuffle_epi32(_mm_move_epi64(data->green[i + 1]), 0x4E));

	552 data->green[i + 1] = _mm_add_epi32(

	553 _mm_move_epi64(_mm_shuffle_epi32(tmp, 0x4E)),

	554 _mm_shuffle_epi32(

	555 _mm_move_epi64(_mm_shuffle_epi32(data->green[i + 1], 0x4E)),

	556 0x4E));

	557

	558 tmp = data->red[i];

	559 data->red[i] = _mm_add_epi32(

	560 _mm_move_epi64(data->red[i]),

	561 _mm_shuffle_epi32(_mm_move_epi64(data->red[i + 1]), 0x4E));

	562 data->red[i + 1] = _mm_add_epi32(

	563 _mm_move_epi64(_mm_shuffle_epi32(tmp, 0x4E)),

	564 _mm_shuffle_epi32(

	565 _mm_move_epi64(_mm_shuffle_epi32(data->red[i + 1], 0x4E)), 0x4E));

	566 }

	567

	568 tmp = data->blue[1];

	569 data->blue[1] = data->blue[2];

	570 data->blue[2] = tmp;

	571

	572 tmp = data->green[1];

	573 data->green[1] = data->green[2];

	574 data->green[2] = tmp;

	575

	576 tmp = data->red[1];

	577 data->red[1] = data->red[2];

	578 data->red[2] = tmp;

	579 }

	580

	581 // Compute luminance for the first sub block.

	582 ComputeLuminance(dst, sub_block_avg[sub_block_off_0], 0,

	583 g_idx_to_num[sub_block_off_0], data,

	584 SetETC1MaxError(expected_errors[0]));

	585 // Compute luminance for the second sub block.

	586 ComputeLuminance(dst, sub_block_avg[sub_block_off_1], 1,

	587 g_idx_to_num[sub_block_off_1], data,

	588 SetETC1MaxError(expected_errors[1]));

	589 }

	590

	// Copies a 4x4 block of 4-byte pixels from |src| into the contiguous 64 byte
	// buffer |dst|. |width| is the width of the source image in pixels, so
	// consecutive rows of the block are width * 4 bytes apart in |src|.
	static void ExtractBlock(uint8_t* dst, const uint8_t* src, int width) {
	  const int kBlockRowBytes = 4 * 4;
	  const int source_stride = width * 4;

	  uint8_t* out = dst;
	  for (int row = 0; row < 4; ++row, out += kBlockRowBytes) {
	    memcpy(out, src, kBlockRowBytes);
	    src += source_stride;
	  }
	}

	597

	// Transforms an incoming 4x4 block of 4-channel, 8 bit per channel pixels
	// (e.g. RGBA or BGRA) into 4 registers, each containing the data of a single
	// channel for all 16 pixels.
	// Ex: transposed[0] will have all the R values for a RGBA block,
	// transposed[1] will have G, etc.
	// The values are packed as 8 bit unsigned values in the SSE registers.
	//
	// Returns false, without transposing, when the block is solid (all 16
	// pixels are identical); in that case |transposed| holds the four
	// untransposed rows. Returns true otherwise.
	inline bool TransposeBlock(uint8_t* block, __m128i* transposed) {
	  __m128i tmp3, tmp2, tmp1, tmp0;

	  // a0,a1,a2,...a7, ...a15
	  transposed[0] = _mm_loadu_si128(reinterpret_cast<const __m128i*>(block));
	  // b0, b1,b2,...b7.... b15
	  transposed[1] =
	      _mm_loadu_si128(reinterpret_cast<const __m128i*>(block + 16));
	  // c0, c1,c2,...c7....c15
	  transposed[2] =
	      _mm_loadu_si128(reinterpret_cast<const __m128i*>(block + 32));
	  // d0,d1,d2,...d7....d15
	  transposed[3] =
	      _mm_loadu_si128(reinterpret_cast<const __m128i*>(block + 48));

	  // Before doing any more work we check if the block is solid. The first
	  // pixel (lane 0 of the first row) is broadcast to all lanes and compared
	  // against every row. The pixel value has to be used here, not the address
	  // of the block.
	  const __m128i test_solid = _mm_shuffle_epi32(transposed[0], 0x00);
	  uint16_t mask = 0xFFFF;
	  for (int i = 0; i < 4; i++) {
	    mask &= _mm_movemask_epi8(_mm_cmpeq_epi8(transposed[i], test_solid));
	  }

	  if (mask == 0xFFFF) {
	    // Block is solid, no need to do any more work.
	    return false;
	  }

	  // a0,b0, a1,b1, a2,b2, a3,b3,....a7,b7
	  tmp0 = _mm_unpacklo_epi8(transposed[0], transposed[1]);
	  // c0,d0, c1,d1, c2,d2, c3,d3,... c7,d7
	  tmp1 = _mm_unpacklo_epi8(transposed[2], transposed[3]);
	  // a8,b8, a9,b9, a10,b10, a11,b11,...a15,b15
	  tmp2 = _mm_unpackhi_epi8(transposed[0], transposed[1]);
	  // c8,d8, c9,d9, c10,d10, c11,d11,...c15,d15
	  tmp3 = _mm_unpackhi_epi8(transposed[2], transposed[3]);

	  // a0,a8, b0,b8, a1,a9, b1,b9, ....a3,a11, b3,b11
	  transposed[0] = _mm_unpacklo_epi8(tmp0, tmp2);
	  // a4,a12, b4,b12, a5,a13, b5,b13,....a7,a15,b7,b15
	  transposed[1] = _mm_unpackhi_epi8(tmp0, tmp2);
	  // c0,c8, d0,d8, c1,c9, d1,d9.....d3,d11
	  transposed[2] = _mm_unpacklo_epi8(tmp1, tmp3);
	  // c4,c12,d4,d12, c5,c13, d5,d13,....d7,d15
	  transposed[3] = _mm_unpackhi_epi8(tmp1, tmp3);

	  // a0,a8, b0,b8, c0,c8, d0,d8, a1,a9, b1,b9, c1,c9, d1,d9
	  tmp0 = _mm_unpacklo_epi32(transposed[0], transposed[2]);
	  // a2,a10, b2,b10, c2,c10, d2,d10, a3,a11, b3,b11, c3,c11, d3,d11
	  tmp1 = _mm_unpackhi_epi32(transposed[0], transposed[2]);
	  // a4,a12, b4,b12, c4,c12, d4,d12, a5,a13, b5,b13, c5,c13, d5,d13
	  tmp2 = _mm_unpacklo_epi32(transposed[1], transposed[3]);
	  // a6,a14, b6,b14, c6,c14, d6,d14, a7,a15, b7,b15, c7,c15, d7,d15
	  tmp3 = _mm_unpackhi_epi32(transposed[1], transposed[3]);

	  // a0,a4, a8,a12, b0,b4, b8,b12, c0,c4, c8,c12, d0,d4, d8,d12
	  transposed[0] = _mm_unpacklo_epi8(tmp0, tmp2);
	  // a1,a5, a9,a13, b1,b5, b9,b13, c1,c5, c9,c13, d1,d5, d9,d13
	  transposed[1] = _mm_unpackhi_epi8(tmp0, tmp2);
	  // a2,a6, a10,a14, b2,b6, b10,b14, c2,c6, c10,c14, d2,d6, d10,d14
	  transposed[2] = _mm_unpacklo_epi8(tmp1, tmp3);
	  // a3,a7, a11,a15, b3,b7, b11,b15, c3,c7, c11,c15, d3,d7, d11,d15
	  transposed[3] = _mm_unpackhi_epi8(tmp1, tmp3);

	  return true;
	}

	666

	// Zero extends the 16 packed 8 bit values of every register in |packed| to
	// 32 bits, producing four registers of four values each per channel.
	//
	// packed[0] is expanded into |red|, packed[1] into |green|, packed[2] into
	// |blue| and packed[3] into |alpha| (alpha is unused for ETC1).
	inline void UnpackBlock(__m128i* packed,
	                        __m128i* red,
	                        __m128i* green,
	                        __m128i* blue,
	                        __m128i* alpha) {
	  const __m128i zero = _mm_set1_epi8(0);

	  // Widens 16 bytes to 16 dwords: 8 bit -> 16 bit -> 32 bit, interleaving
	  // with zero at every step.
	  const auto widen = [&zero](const __m128i source, __m128i* destination) {
	    const __m128i low_words = _mm_unpacklo_epi8(source, zero);
	    const __m128i high_words = _mm_unpackhi_epi8(source, zero);

	    destination[0] = _mm_unpacklo_epi16(low_words, zero);
	    destination[1] = _mm_unpackhi_epi16(low_words, zero);

	    destination[2] = _mm_unpacklo_epi16(high_words, zero);
	    destination[3] = _mm_unpackhi_epi16(high_words, zero);
	  };

	  widen(packed[0], red);
	  widen(packed[1], green);
	  widen(packed[2], blue);
	  // Alpha - unused for ETC1.
	  widen(packed[3], alpha);
	}

	715

	716 inline void CompressSolid(uint8_t* dst, uint8_t* block) {

	717 // Clear destination buffer so that we can "or" in the results.

	718 memset(dst, 0, 8);

	719

	720 const float src_color_float[3] = {static_cast<float>(block[0]),

	721 static_cast<float>(block[1]),

	722 static_cast<float>(block[2])};

	723 const Color base = MakeColor555(src_color_float);

	724 const __m128i base_v =

	725 _mm_set_epi32(0, base.channels.r, base.channels.g, base.channels.b);

	726

	727 const __m128i constant = _mm_set_epi32(0, block[2], block[1], block[0]);

	728 __m128i lum;

	729 __m128i colors[4];

	730 static const __m128i rgb =

	731 _mm_set_epi32(0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);

	732

	733 WriteDiff(dst, true);

	734 WriteFlip(dst, false);

	735

	736 WriteColors555(dst, base, base);

	737

	738 uint8_t best_tbl_idx = 0;

	739 uint8_t best_mod_idx = 0;

	740 uint32_t best_mod_err = INT32_MAX;

	741

	742 for (unsigned int tbl_idx = 0; tbl_idx < 8; ++tbl_idx) {

	743 lum = _mm_set_epi32(

	744 g_codeword_tables[tbl_idx][3], g_codeword_tables[tbl_idx][2],

	745 g_codeword_tables[tbl_idx][1], g_codeword_tables[tbl_idx][0]);

	746 colors[0] = AddAndClamp(base_v, _mm_shuffle_epi32(lum, 0x0));

	747 colors[1] = AddAndClamp(base_v, _mm_shuffle_epi32(lum, 0x55));

	748 colors[2] = AddAndClamp(base_v, _mm_shuffle_epi32(lum, 0xAA));

	749 colors[3] = AddAndClamp(base_v, _mm_shuffle_epi32(lum, 0xFF));

	750

	751 for (int i = 0; i < 4; i++) {

	752 uint32_t mod_err =

	753 SumSSE(GetColorErrorSSE(constant, _mm_and_si128(colors[i], rgb)));

	754 colors[i] = _mm_and_si128(colors[i], rgb);

	755 if (mod_err < best_mod_err) {

	756 best_tbl_idx = tbl_idx;

	757 best_mod_idx = i;

	758 best_mod_err = mod_err;

	759

	760 if (mod_err == 0) {

	761 break; // We cannot do any better than this.

	762 }

	763 }

	764 }

	765 }

	766

	767 WriteCodewordTable(dst, 0, best_tbl_idx);

	768 WriteCodewordTable(dst, 1, best_tbl_idx);

	769

	770 uint8_t pix_idx = g_mod_to_pix[best_mod_idx];

	771 uint32_t lsb = pix_idx & 0x1;

	772 uint32_t msb = pix_idx >> 1;

	773

	774 uint32_t pix_data = 0;

	775 for (unsigned int i = 0; i < 2; ++i) {

	776 for (unsigned int j = 0; j < 8; ++j) {

	777 // Obtain the texel number as specified in the standard.

	778 int texel_num = g_idx_to_num[i][j];

	779 pix_data \|= msb << (texel_num + 16);

	780 pix_data \|= lsb << (texel_num);

	781 }

	782 }

	783

	784 WritePixelData(dst, pix_data);

	785 }

	786

	787 } // namespace

	788

	789 void TextureCompressorETC1SSE::Compress(const uint8_t* src,

	790 uint8_t* dst,

	791 int width,

	792 int height,

	793 Quality quality) {

	794 DCHECK_GE(width, 4);

	795 DCHECK_EQ((width & 3), 0);

	796 DCHECK_GE(height, 4);

	797 DCHECK_EQ((height & 3), 0);

	798

	799 ALIGNAS(16) uint8_t block[64];

	800 __m128i packed[4];

	801 __m128i red[4], green[4], blue[4], alpha[4];

	802 __sse_data data;

	803

	804 for (int y = 0; y < height; y += 4, src += width * 4 * 4) {

	805 for (int x = 0; x < width; x += 4, dst += 8) {

	806 ExtractBlock(block, src + x * 4, width);

	807 if (TransposeBlock(block, packed) == false) {

	808 CompressSolid(dst, block);

	809 } else {

	810 UnpackBlock(packed, blue, green, red, alpha);

	811

	812 data.block = block;

	813 data.packed = packed;

	814 data.red = red;

	815 data.blue = blue;

	816 data.green = green;

	817

	818 CompressBlock(dst, &data);

	819 }

	820 }

	821 }

	822 }

	823

	824 } // namespace cc

OLD	NEW

« cc/resources/texture_compressor.cc ('K') | « cc/resources/texture_compressor_etc1_sse.h ('k') | cc/resources/texture_compressor_etc1_unittest.cc » ('j') | no next file with comments »