Chromium Code Reviews

Side by Side Diff: cc/resources/texture_compressor_etc1_sse.cc

Issue 1096703002: Reland: Add ETC1 powered SSE encoder for tile texture compression (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 5 years, 7 months ago
1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "cc/resources/texture_compressor_etc1_sse.h"
6
7 #include <assert.h>
8 #include <emmintrin.h>
9 #include <cmath>
10 #include <limits>
11
12 #include "base/compiler_specific.h"
13 #include "base/logging.h"
14 // This header is used for common functions such as Color handling
15 // and the codeword tables.
16 #include "cc/resources/texture_compressor_etc1.h"
17
18 namespace {
19
20 #define ETC1_SET_ERROR(x) (x + x / 2 + 384)
reveman 2015/05/07 14:24:35 nit: inline function instead of macro
radu.velea 2015/05/07 15:53:45 Done.
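For reference, the inline-function form suggested in the comment above could look like the following sketch (the function name is illustrative, not part of the patch):

inline uint32_t SetETC1Error(uint32_t error) {
  // Same expression as the macro: pad the measured error by 50% plus a
  // constant before it is used as an early-exit threshold.
  return error + error / 2 + 384;
}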
21
22 struct __sse_data {
23 // raw data
24 uint8_t* block;
25 // 8 bit packed values
26 __m128i* packed;
27 // 32 bit zero extended values - 4x4 arrays
28 __m128i* blue;
29 __m128i* green;
30 __m128i* red;
31 };
32
33 // commonly used registers
reveman 2015/05/07 14:24:35 nit: I don't feel too strongly about this but ther
radu.velea 2015/05/07 15:53:46 Done.
34 static const __m128i __sse_zero = _mm_set1_epi32(0);
35 static const __m128i __sse_max_int = _mm_set1_epi32(0x7FFFFFFF);
36
37 inline __m128i AddAndClamp(const __m128i x, const __m128i y) {
38 static const __m128i color_max = _mm_set1_epi32(0xFF);
39 return _mm_max_epi16(__sse_zero,
40 _mm_min_epi16(_mm_add_epi16(x, y), color_max));
41 }
42
43 inline __m128i GetColorErrorSSE(const __m128i x, const __m128i y) {
44 // changed from _mm_mullo_epi32 (SSE4) to _mm_mullo_epi16 (SSE2)
45 __m128i ret = _mm_sub_epi16(x, y);
46 return _mm_mullo_epi16(ret, ret);
47 }
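A note on the SSE2 substitution above: each 32-bit lane holds a zero-extended 8-bit channel value, so the difference fits in the low 16 bits of the lane and its square is at most 255 * 255 = 65025, which _mm_mullo_epi16 returns exactly while the upper 16 bits of the lane stay zero. A scalar sketch of what one lane computes:

inline uint32_t ScalarColorError(uint32_t x, uint32_t y) {
  // x and y are 8-bit channel values zero-extended to 32 bits.
  int32_t d = static_cast<int32_t>(x) - static_cast<int32_t>(y);
  return static_cast<uint32_t>(d * d);  // <= 65025, fits in 16 bits
}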
48
49 inline __m128i AddChannelError(const __m128i x,
50 const __m128i y,
51 const __m128i z) {
52 return _mm_add_epi32(x, _mm_add_epi32(y, z));
53 }
54
55 inline uint32_t SumSSE(const __m128i x) {
56 __m128i sum = _mm_add_epi32(x, _mm_shuffle_epi32(x, 0x4E));
57 sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1));
58
59 return _mm_cvtsi128_si32(sum);
60 }
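The two shuffles above implement a horizontal add: 0x4E swaps the two 64-bit halves and 0xB1 swaps adjacent 32-bit lanes, so after both additions every lane holds the total and lane 0 is extracted. A scalar equivalent, for clarity:

inline uint32_t ScalarSum(const uint32_t x[4]) {
  return x[0] + x[1] + x[2] + x[3];
}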
61
62 inline uint32_t GetVerticalError(const __sse_data* data,
63 const __m128i* blue_avg,
64 const __m128i* green_avg,
65 const __m128i* red_avg,
66 uint32_t* verror) {
67 __m128i error = __sse_zero;
68
69 for (int i = 0; i < 4; i++) {
70 error = _mm_add_epi32(error, GetColorErrorSSE(data->blue[i], blue_avg[0]));
71 error =
72 _mm_add_epi32(error, GetColorErrorSSE(data->green[i], green_avg[0]));
73 error = _mm_add_epi32(error, GetColorErrorSSE(data->red[i], red_avg[0]));
74 }
75
76 error = _mm_add_epi32(error, _mm_shuffle_epi32(error, 0x4E));
77
78 verror[0] = _mm_cvtsi128_si32(error);
79 verror[1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(error, 0xB1));
80
81 return verror[0] + verror[1];
82 }
83
84 inline uint32_t GetHorizontalError(const __sse_data* data,
85 const __m128i* blue_avg,
86 const __m128i* green_avg,
87 const __m128i* red_avg,
88 uint32_t* verror) {
89 __m128i error = __sse_zero;
90 int first_index, second_index;
91
92 for (int i = 0; i < 2; i++) {
93 first_index = 2 * i;
94 second_index = first_index + 1;
95
96 error = _mm_add_epi32(
97 error, GetColorErrorSSE(data->blue[first_index], blue_avg[i]));
98 error = _mm_add_epi32(
99 error, GetColorErrorSSE(data->blue[second_index], blue_avg[i]));
100 error = _mm_add_epi32(
101 error, GetColorErrorSSE(data->green[first_index], green_avg[i]));
102 error = _mm_add_epi32(
103 error, GetColorErrorSSE(data->green[second_index], green_avg[i]));
104 error = _mm_add_epi32(error,
105 GetColorErrorSSE(data->red[first_index], red_avg[i]));
106 error = _mm_add_epi32(
107 error, GetColorErrorSSE(data->red[second_index], red_avg[i]));
108 }
109
110 error = _mm_add_epi32(error, _mm_shuffle_epi32(error, 0x4E));
111
112 verror[0] = _mm_cvtsi128_si32(error);
113 verror[1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(error, 0xB1));
114
115 return verror[0] + verror[1];
116 }
117
118 inline void GetAvgColors(const __sse_data* data,
119 float* output,
120 bool* __sse_use_diff) {
121 __m128i sum[2], tmp;
122
123 // TODO(radu.velea): maybe use _mm_avg_epu8 on packed data.
124
125 // get avg red
126 // [S0 S0 S1 S1]
127 sum[0] = _mm_add_epi32(data->red[0], data->red[1]);
128 sum[0] = _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0xB1));
129
130 // [S2 S2 S3 S3]
131 sum[1] = _mm_add_epi32(data->red[2], data->red[3]);
132 sum[1] = _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0xB1));
133
134 float hred[2], vred[2];
135 hred[0] = (_mm_cvtsi128_si32(
136 _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0x4E)))) /
137 8.0f;
138 hred[1] = (_mm_cvtsi128_si32(
139 _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0x4E)))) /
140 8.0f;
141
142 tmp = _mm_add_epi32(sum[0], sum[1]);
143 vred[0] = (_mm_cvtsi128_si32(tmp)) / 8.0f;
144 vred[1] = (_mm_cvtsi128_si32(_mm_shuffle_epi32(tmp, 0x2))) / 8.0f;
145
146 // get avg green
147 // [S0 S0 S1 S1]
148 sum[0] = _mm_add_epi32(data->green[0], data->green[1]);
149 sum[0] = _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0xB1));
150
151 // [S2 S2 S3 S3]
152 sum[1] = _mm_add_epi32(data->green[2], data->green[3]);
153 sum[1] = _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0xB1));
154
155 float hgreen[2], vgreen[2];
156 hgreen[0] = (_mm_cvtsi128_si32(
157 _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0x4E)))) /
158 8.0f;
159 hgreen[1] = (_mm_cvtsi128_si32(
160 _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0x4E)))) /
161 8.0f;
162
163 tmp = _mm_add_epi32(sum[0], sum[1]);
164 vgreen[0] = (_mm_cvtsi128_si32(tmp)) / 8.0f;
165 vgreen[1] = (_mm_cvtsi128_si32(_mm_shuffle_epi32(tmp, 0x2))) / 8.0f;
166
167 // get avg blue
168 // [S0 S0 S1 S1]
169 sum[0] = _mm_add_epi32(data->blue[0], data->blue[1]);
170 sum[0] = _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0xB1));
171
172 // [S2 S2 S3 S3]
173 sum[1] = _mm_add_epi32(data->blue[2], data->blue[3]);
174 sum[1] = _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0xB1));
175
176 float hblue[2], vblue[2];
177 hblue[0] = (_mm_cvtsi128_si32(
178 _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0x4E)))) /
179 8.0f;
180 hblue[1] = (_mm_cvtsi128_si32(
181 _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0x4E)))) /
182 8.0f;
183
184 tmp = _mm_add_epi32(sum[0], sum[1]);
185 vblue[0] = (_mm_cvtsi128_si32(tmp)) / 8.0f;
186 vblue[1] = (_mm_cvtsi128_si32(_mm_shuffle_epi32(tmp, 0x2))) / 8.0f;
187
188 // TODO(radu.velea): return ints instead of floats, based on Quality.
189 output[0] = vblue[0];
190 output[1] = vgreen[0];
191 output[2] = vred[0];
192
193 output[3] = vblue[1];
194 output[4] = vgreen[1];
195 output[5] = vred[1];
196
197 output[6] = hblue[0];
198 output[7] = hgreen[0];
199 output[8] = hred[0];
200
201 output[9] = hblue[1];
202 output[10] = hgreen[1];
203 output[11] = hred[1];
204
205 __m128i threshold_upper = _mm_set1_epi32(3);
206 __m128i threshold_lower = _mm_set1_epi32(-4);
207
208 __m128 factor_v = _mm_set1_ps(31.0f / 255.0f);
209 __m128 rounding_v = _mm_set1_ps(0.5f);
210 __m128 h_avg_0 = _mm_set_ps(hblue[0], hgreen[0], hred[0], 0);
211 __m128 h_avg_1 = _mm_set_ps(hblue[1], hgreen[1], hred[1], 0);
212
213 __m128 v_avg_0 = _mm_set_ps(vblue[0], vgreen[0], vred[0], 0);
214 __m128 v_avg_1 = _mm_set_ps(vblue[1], vgreen[1], vred[1], 0);
215
216 h_avg_0 = _mm_mul_ps(h_avg_0, factor_v);
217 h_avg_1 = _mm_mul_ps(h_avg_1, factor_v);
218 v_avg_0 = _mm_mul_ps(v_avg_0, factor_v);
219 v_avg_1 = _mm_mul_ps(v_avg_1, factor_v);
220
221 h_avg_0 = _mm_add_ps(h_avg_0, rounding_v);
222 h_avg_1 = _mm_add_ps(h_avg_1, rounding_v);
223 v_avg_0 = _mm_add_ps(v_avg_0, rounding_v);
224 v_avg_1 = _mm_add_ps(v_avg_1, rounding_v);
225
226 __m128i h_avg_0i = _mm_cvttps_epi32(h_avg_0);
227 __m128i h_avg_1i = _mm_cvttps_epi32(h_avg_1);
228
229 __m128i v_avg_0i = _mm_cvttps_epi32(v_avg_0);
230 __m128i v_avg_1i = _mm_cvttps_epi32(v_avg_1);
231
232 h_avg_0i = _mm_sub_epi32(h_avg_1i, h_avg_0i);
233 v_avg_0i = _mm_sub_epi32(v_avg_1i, v_avg_0i);
234
235 __sse_use_diff[0] =
236 (0 == _mm_movemask_epi8(_mm_cmplt_epi32(v_avg_0i, threshold_lower)));
237 __sse_use_diff[0] &=
238 (0 == _mm_movemask_epi8(_mm_cmpgt_epi32(v_avg_0i, threshold_upper)));
239
240 __sse_use_diff[1] =
241 (0 == _mm_movemask_epi8(_mm_cmplt_epi32(h_avg_0i, threshold_lower)));
242 __sse_use_diff[1] &=
243 (0 == _mm_movemask_epi8(_mm_cmpgt_epi32(h_avg_0i, threshold_upper)));
244 }
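The tail of GetAvgColors decides whether ETC1 differential mode is usable. In differential mode the second base color is stored as a 3-bit signed delta per 5-bit channel, so after quantizing the averages to 5 bits (the 31/255 factor plus 0.5 rounding) every channel delta must fall in [-4, 3]. A scalar sketch of the per-channel test:

inline bool ChannelAllowsDifferentialMode(float avg0, float avg1) {
  // Quantize both averages to 5 bits, as the SSE path does with factor_v.
  int q0 = static_cast<int>(avg0 * 31.0f / 255.0f + 0.5f);
  int q1 = static_cast<int>(avg1 * 31.0f / 255.0f + 0.5f);
  int delta = q1 - q0;
  return delta >= -4 && delta <= 3;
}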
245
246 void ComputeLuminance(uint8_t* block,
247 const cc::Color& base,
248 const int sub_block_id,
249 const uint8_t* idx_to_num_tab,
250 const __sse_data* data,
251 const uint32_t expected_error) {
252 uint8_t best_tbl_idx = 0;
253 uint32_t best_error = 0x7FFFFFFF;
254 uint8_t best_mod_idx[8][8]; // [table][texel]
255
256 const __m128i base_blue = _mm_set1_epi32(base.channels.b);
257 const __m128i base_green = _mm_set1_epi32(base.channels.g);
258 const __m128i base_red = _mm_set1_epi32(base.channels.r);
259
260 __m128i test_red, test_blue, test_green, tmp, tmp_blue, tmp_green, tmp_red;
261 __m128i block_error, mask;
262
263 // These hold the minimum errors for each group of 4 pixels.
264 __m128i first_half_min;
265 __m128i second_half_min;
266
267 // These hold the matching table index combo for each group of 4 pixels.
268 __m128i first_half_pattern;
269 __m128i second_half_pattern;
270
271 const __m128i first_blue_data_block = data->blue[2 * sub_block_id];
272 const __m128i first_green_data_block = data->green[2 * sub_block_id];
273 const __m128i first_red_data_block = data->red[2 * sub_block_id];
274
275 const __m128i second_blue_data_block = data->blue[2 * sub_block_id + 1];
276 const __m128i second_green_data_block = data->green[2 * sub_block_id + 1];
277 const __m128i second_red_data_block = data->red[2 * sub_block_id + 1];
278
279 uint32_t min;
280 // fail early to increase speed
281 long delta = INT32_MAX;
282 uint32_t last_min = INT32_MAX;
283
284 const uint8_t shuffle_mask[] = {
285 0x1B, 0x4E, 0xB1, 0xE4};  // important: these are sorted in ascending order
286
287 for (unsigned int tbl_idx = 0; tbl_idx < 8; ++tbl_idx) {
288 tmp = _mm_set_epi32(
289 cc::g_codeword_tables[tbl_idx][3], cc::g_codeword_tables[tbl_idx][2],
290 cc::g_codeword_tables[tbl_idx][1], cc::g_codeword_tables[tbl_idx][0]);
291
292 test_blue = AddAndClamp(tmp, base_blue);
293 test_green = AddAndClamp(tmp, base_green);
294 test_red = AddAndClamp(tmp, base_red);
295
296 first_half_min = __sse_max_int;
297 second_half_min = __sse_max_int;
298
299 first_half_pattern = __sse_zero;
300 second_half_pattern = __sse_zero;
301
302 for (uint8_t imm8 : shuffle_mask) {
303 switch (imm8) {
304 case 0x1B:
305 tmp_blue = _mm_shuffle_epi32(test_blue, 0x1B);
306 tmp_green = _mm_shuffle_epi32(test_green, 0x1B);
307 tmp_red = _mm_shuffle_epi32(test_red, 0x1B);
308 break;
309 case 0x4E:
310 tmp_blue = _mm_shuffle_epi32(test_blue, 0x4E);
311 tmp_green = _mm_shuffle_epi32(test_green, 0x4E);
312 tmp_red = _mm_shuffle_epi32(test_red, 0x4E);
313 break;
314 case 0xB1:
315 tmp_blue = _mm_shuffle_epi32(test_blue, 0xB1);
316 tmp_green = _mm_shuffle_epi32(test_green, 0xB1);
317 tmp_red = _mm_shuffle_epi32(test_red, 0xB1);
318 break;
319 case 0xE4:
320 tmp_blue = _mm_shuffle_epi32(test_blue, 0xE4);
321 tmp_green = _mm_shuffle_epi32(test_green, 0xE4);
322 tmp_red = _mm_shuffle_epi32(test_red, 0xE4);
323 break;
324 default:
325 tmp_blue = test_blue;
326 tmp_green = test_green;
327 tmp_red = test_red;
328 }
329
330 tmp = _mm_set1_epi32(imm8);
331
332 block_error =
333 AddChannelError(GetColorErrorSSE(tmp_blue, first_blue_data_block),
334 GetColorErrorSSE(tmp_green, first_green_data_block),
335 GetColorErrorSSE(tmp_red, first_red_data_block));
336
337 // save winning pattern
338 first_half_pattern = _mm_max_epi16(
339 first_half_pattern,
340 _mm_and_si128(tmp, _mm_cmpgt_epi32(first_half_min, block_error)));
341 // _mm_min_epi32(first_half_min, block_error) from SSE4 would be preferable
342 // here; emulating it with SSE2 costs a small performance penalty.
343 mask = _mm_cmplt_epi32(block_error, first_half_min);
344 first_half_min = _mm_add_epi32(_mm_and_si128(mask, block_error),
345 _mm_andnot_si128(mask, first_half_min));
346
347 // Second part of the block
348 block_error =
349 AddChannelError(GetColorErrorSSE(tmp_blue, second_blue_data_block),
350 GetColorErrorSSE(tmp_green, second_green_data_block),
351 GetColorErrorSSE(tmp_red, second_red_data_block));
352
353 // save winning pattern
354 second_half_pattern = _mm_max_epi16(
355 second_half_pattern,
356 _mm_and_si128(tmp, _mm_cmpgt_epi32(second_half_min, block_error)));
357 // _mm_min_epi32(second_half_min, block_error) from SSE4 would be preferable
358 // here; emulating it with SSE2 costs a small performance penalty.
359 mask = _mm_cmplt_epi32(block_error, second_half_min);
360 second_half_min = _mm_add_epi32(_mm_and_si128(mask, block_error),
361 _mm_andnot_si128(mask, second_half_min));
362 }
363
364 first_half_min = _mm_add_epi32(first_half_min, second_half_min);
365 first_half_min =
366 _mm_add_epi32(first_half_min, _mm_shuffle_epi32(first_half_min, 0x4E));
367 first_half_min =
368 _mm_add_epi32(first_half_min, _mm_shuffle_epi32(first_half_min, 0xB1));
369
370 min = _mm_cvtsi128_si32(first_half_min);
371
372 delta = min - last_min;
373 last_min = min;
374
375 if (min < best_error) {
376 best_tbl_idx = tbl_idx;
377 best_error = min;
378
379 best_mod_idx[tbl_idx][0] =
380 (_mm_cvtsi128_si32(first_half_pattern) >> (0)) & 3;
381 best_mod_idx[tbl_idx][4] =
382 (_mm_cvtsi128_si32(second_half_pattern) >> (0)) & 3;
383
384 best_mod_idx[tbl_idx][1] =
385 (_mm_cvtsi128_si32(_mm_shuffle_epi32(first_half_pattern, 0x1)) >>
386 (2)) &
387 3;
388 best_mod_idx[tbl_idx][5] =
389 (_mm_cvtsi128_si32(_mm_shuffle_epi32(second_half_pattern, 0x1)) >>
390 (2)) &
391 3;
392
393 best_mod_idx[tbl_idx][2] =
394 (_mm_cvtsi128_si32(_mm_shuffle_epi32(first_half_pattern, 0x2)) >>
395 (4)) &
396 3;
397 best_mod_idx[tbl_idx][6] =
398 (_mm_cvtsi128_si32(_mm_shuffle_epi32(second_half_pattern, 0x2)) >>
399 (4)) &
400 3;
401
402 best_mod_idx[tbl_idx][3] =
403 (_mm_cvtsi128_si32(_mm_shuffle_epi32(first_half_pattern, 0x3)) >>
404 (6)) &
405 3;
406 best_mod_idx[tbl_idx][7] =
407 (_mm_cvtsi128_si32(_mm_shuffle_epi32(second_half_pattern, 0x3)) >>
408 (6)) &
409 3;
410
411 if (best_error == 0) {
412 break;
413 }
414 } else if (delta > 0 && expected_error < min) {
415 // error is growing and is well beyond expected error
416 break;
417 }
418 }
419
420 cc::WriteCodewordTable(block, sub_block_id, best_tbl_idx);
421
422 uint32_t pix_data = 0;
423 uint8_t mod_idx;
424 uint8_t pix_idx;
425 uint32_t lsb;
426 uint32_t msb;
427 int texel_num;
428
429 for (unsigned int i = 0; i < 8; ++i) {
430 mod_idx = best_mod_idx[best_tbl_idx][i];
431 pix_idx = cc::g_mod_to_pix[mod_idx];
432
433 lsb = pix_idx & 0x1;
434 msb = pix_idx >> 1;
435
436 // Obtain the texel number as specified in the standard.
437 texel_num = idx_to_num_tab[i];
438 pix_data |= msb << (texel_num + 16);
439 pix_data |= lsb << (texel_num);
440 }
441
442 cc::WritePixelData(block, pix_data);
443 }
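The pixel-index packing in the loop above follows the ETC1 layout: each texel's 2-bit pixel index is split so that its MSB lands in the high 16 bits of the pixel data word and its LSB in the low 16 bits, at the bit position given by the texel number. A sketch of the per-texel step:

inline void SetPixelIndex(uint32_t* pix_data, int texel_num, uint8_t pix_idx) {
  uint32_t lsb = pix_idx & 0x1;
  uint32_t msb = pix_idx >> 1;
  *pix_data |= msb << (texel_num + 16);
  *pix_data |= lsb << texel_num;
}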
444
445 void CompressBlock(uint8_t* dst, __sse_data* data) {
446 // __sse_avg_colors layout: first 3 values (b, g, r) for vertical sub-block 1,
447 // next 3 for vertical sub-block 2, then the same triples for horizontal sub-blocks 1 and 2.
448 float __sse_avg_colors[12] = {
449 0,
450 };
451 bool use_differential[2] = {true, true};
452 GetAvgColors(data, __sse_avg_colors, use_differential);
453 cc::Color sub_block_avg[4];
454
455 // TODO(radu.velea): remove floating-point operations and use only ints with
456 // normal rounding and shifts for reduced Quality.
457 for (int i = 0, j = 1; i < 4; i += 2, j += 2) {
458 if (use_differential[i / 2] == false) {
459 sub_block_avg[i] = cc::MakeColor444(&__sse_avg_colors[i * 3]);
460 sub_block_avg[j] = cc::MakeColor444(&__sse_avg_colors[j * 3]);
461 } else {
462 sub_block_avg[i] = cc::MakeColor555(&__sse_avg_colors[i * 3]);
463 sub_block_avg[j] = cc::MakeColor555(&__sse_avg_colors[j * 3]);
464 }
465 }
466
467 __m128i red_avg[2], green_avg[2], blue_avg[2];
468
469 // TODO(radu.velea): for perfect accuracy, maybe skip floating-point variables.
470 blue_avg[0] =
471 _mm_set_epi32((int)__sse_avg_colors[3], (int)__sse_avg_colors[3],
472 (int)__sse_avg_colors[0], (int)__sse_avg_colors[0]);
reveman 2015/05/07 14:24:35 nit: please avoid c-style casts. static_cast<int>(
radu.velea 2015/05/07 15:53:45 Done.
473
474 green_avg[0] =
475 _mm_set_epi32((int)__sse_avg_colors[4], (int)__sse_avg_colors[4],
476 (int)__sse_avg_colors[1], (int)__sse_avg_colors[1]);
reveman 2015/05/07 14:24:35 nit: please avoid c-style casts. static_cast<int>(
radu.velea 2015/05/07 15:53:46 Done.
477
478 red_avg[0] =
479 _mm_set_epi32((int)__sse_avg_colors[5], (int)__sse_avg_colors[5],
480 (int)__sse_avg_colors[2], (int)__sse_avg_colors[2]);
reveman 2015/05/07 14:24:35 nit: please avoid c-style casts. static_cast<int>(
radu.velea 2015/05/07 15:53:45 Done.
481
482 uint32_t vertical_error[2];
483 GetVerticalError(data, blue_avg, green_avg, red_avg, vertical_error);
484
485 // TODO(radu.velea): for perfect accuracy, maybe skip floating-point variables.
486 blue_avg[0] = _mm_set1_epi32((int)__sse_avg_colors[6]);
487 blue_avg[1] = _mm_set1_epi32((int)__sse_avg_colors[9]);
reveman 2015/05/07 14:24:35 nit: please avoid c-style casts. static_cast<int>(
radu.velea 2015/05/07 15:53:45 Done.
488
489 green_avg[0] = _mm_set1_epi32((int)__sse_avg_colors[7]);
490 green_avg[1] = _mm_set1_epi32((int)__sse_avg_colors[10]);
reveman 2015/05/07 14:24:35 nit: please avoid c-style casts. static_cast<int>(
radu.velea 2015/05/07 15:53:45 Done.
491
492 red_avg[0] = _mm_set1_epi32((int)__sse_avg_colors[8]);
493 red_avg[1] = _mm_set1_epi32((int)__sse_avg_colors[11]);
reveman 2015/05/07 14:24:35 nit: please avoid c-style casts. static_cast<int>(
radu.velea 2015/05/07 15:53:45 Done.
494
495 uint32_t horizontal_error[2];
496 GetHorizontalError(data, blue_avg, green_avg, red_avg, horizontal_error);
497
498 bool flip = (horizontal_error[0] + horizontal_error[1]) <
499 (vertical_error[0] + vertical_error[1]);
500 uint32_t* expected_errors = flip == true ? horizontal_error : vertical_error;
reveman 2015/05/07 14:24:35 nit: s/flip == true/flip/
radu.velea 2015/05/07 15:53:45 Done.
501
502 // Clear destination buffer so that we can "or" in the results.
503 memset(dst, 0, 8);
504
505 cc::WriteDiff(dst, use_differential[!!flip]);
506 cc::WriteFlip(dst, flip);
507
508 uint8_t sub_block_off_0 = flip ? 2 : 0;
509 uint8_t sub_block_off_1 = sub_block_off_0 + 1;
510
511 if (use_differential[!!flip]) {
512 cc::WriteColors555(dst, sub_block_avg[sub_block_off_0],
513 sub_block_avg[sub_block_off_1]);
514 } else {
515 cc::WriteColors444(dst, sub_block_avg[sub_block_off_0],
516 sub_block_avg[sub_block_off_1]);
517 }
518
519 if (flip == false) {
reveman 2015/05/07 14:24:35 nit: if (!flip)
radu.velea 2015/05/07 15:53:45 Done.
520 // transpose vertical data into horizontal lines
521 __m128i tmp;
522 for (int i = 0; i < 4; i += 2) {
523 tmp = data->blue[i];
524 data->blue[i] = _mm_add_epi32(
525 _mm_move_epi64(data->blue[i]),
526 _mm_shuffle_epi32(_mm_move_epi64(data->blue[i + 1]), 0x4E));
527 data->blue[i + 1] = _mm_add_epi32(
528 _mm_move_epi64(_mm_shuffle_epi32(tmp, 0x4E)),
529 _mm_shuffle_epi32(
530 _mm_move_epi64(_mm_shuffle_epi32(data->blue[i + 1], 0x4E)),
531 0x4E));
532
533 tmp = data->green[i];
534 data->green[i] = _mm_add_epi32(
535 _mm_move_epi64(data->green[i]),
536 _mm_shuffle_epi32(_mm_move_epi64(data->green[i + 1]), 0x4E));
537 data->green[i + 1] = _mm_add_epi32(
538 _mm_move_epi64(_mm_shuffle_epi32(tmp, 0x4E)),
539 _mm_shuffle_epi32(
540 _mm_move_epi64(_mm_shuffle_epi32(data->green[i + 1], 0x4E)),
541 0x4E));
542
543 tmp = data->red[i];
544 data->red[i] = _mm_add_epi32(
545 _mm_move_epi64(data->red[i]),
546 _mm_shuffle_epi32(_mm_move_epi64(data->red[i + 1]), 0x4E));
547 data->red[i + 1] = _mm_add_epi32(
548 _mm_move_epi64(_mm_shuffle_epi32(tmp, 0x4E)),
549 _mm_shuffle_epi32(
550 _mm_move_epi64(_mm_shuffle_epi32(data->red[i + 1], 0x4E)), 0x4E));
551 }
552
553 tmp = data->blue[1];
554 data->blue[1] = data->blue[2];
555 data->blue[2] = tmp;
556
557 tmp = data->green[1];
558 data->green[1] = data->green[2];
559 data->green[2] = tmp;
560
561 tmp = data->red[1];
562 data->red[1] = data->red[2];
563 data->red[2] = tmp;
564 }
565
566 // Compute luminance for the first sub block.
567 ComputeLuminance(dst, sub_block_avg[sub_block_off_0], 0,
568 cc::g_idx_to_num[sub_block_off_0], data,
569 ETC1_SET_ERROR(expected_errors[0]));
570 // Compute luminance for the second sub block.
571 ComputeLuminance(dst, sub_block_avg[sub_block_off_1], 1,
572 cc::g_idx_to_num[sub_block_off_1], data,
573 ETC1_SET_ERROR(expected_errors[1]));
574 }
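For clarity, the flip decision earlier in CompressBlock amounts to the following: ETC1 splits the 4x4 block into two sub-blocks, either two 2x4 halves side by side (flip = 0) or two 4x2 halves on top of each other (flip = 1), and the orientation with the smaller accumulated average-color error wins. A scalar sketch:

inline bool ShouldFlip(const uint32_t horizontal_error[2],
                       const uint32_t vertical_error[2]) {
  return horizontal_error[0] + horizontal_error[1] <
         vertical_error[0] + vertical_error[1];
}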
575
576 static void ExtractBlock(uint8_t* dst, const uint8_t* src, int width) {
577 for (int j = 0; j < 4; ++j) {
578 memcpy(&dst[j * 4 * 4], src, 4 * 4);
579 src += width * 4;
580 }
581 }
582
583 inline bool TransposeBlock(uint8_t* block, __m128i* transposed) {
584 // This function transforms an incoming block of RGBA or BGRA pixels into 4
585 // registers, each containing the data corresponding to a single channel.
586 // Ex: transposed[0] will have all the R values for an RGBA block,
587 // transposed[1] will have G, etc.
588 // The values are packed as 8 bit unsigned values in the SSE registers.
589
590 // Before doing any work we check if the block is solid.
591 __m128i tmp3, tmp2, tmp1, tmp0;
592 __m128i test_solid = _mm_set1_epi32(*((uint32_t*)block));
593 uint16_t mask = 0xFFFF;
594
595 // a0,a1,a2,...a7, ...a15
596 transposed[0] = _mm_loadu_si128((__m128i*)(block));
597 // b0, b1,b2,...b7.... b15
598 transposed[1] = _mm_loadu_si128((__m128i*)(block + 16));
599 // c0, c1,c2,...c7....c15
600 transposed[2] = _mm_loadu_si128((__m128i*)(block + 32));
601 // d0,d1,d2,...d7....d15
602 transposed[3] = _mm_loadu_si128((__m128i*)(block + 48));
603
604 for (int i = 0; i < 4; i++) {
605 mask &= _mm_movemask_epi8(_mm_cmpeq_epi8(transposed[i], test_solid));
606 }
607
608 if (mask == 0xFFFF) {
609 return false; // block is solid, no need to do any more work
610 }
611
612 // a0,b0, a1,b1, a2,b2, a3,b3,....a7,b7
613 tmp0 = _mm_unpacklo_epi8(transposed[0], transposed[1]);
614 // c0,d0, c1,d1, c2,d2, c3,d3,... c7,d7
615 tmp1 = _mm_unpacklo_epi8(transposed[2], transposed[3]);
616 // a8,b8, a9,b9, a10,b10, a11,b11,...a15,b15
617 tmp2 = _mm_unpackhi_epi8(transposed[0], transposed[1]);
618 // c8,d8, c9,d9, c10,d10, c11,d11,...c15,d15
619 tmp3 = _mm_unpackhi_epi8(transposed[2], transposed[3]);
620
621 // a0,a8, b0,b8, a1,a9, b1,b9, ....a3,a11, b3,b11
622 transposed[0] = _mm_unpacklo_epi8(tmp0, tmp2);
623 // a4,a12, b4,b12, a5,a13, b5,b13,....a7,a15,b7,b15
624 transposed[1] = _mm_unpackhi_epi8(tmp0, tmp2);
625 // c0,c8, d0,d8, c1,c9, d1,d9.....d3,d11
626 transposed[2] = _mm_unpacklo_epi8(tmp1, tmp3);
627 // c4,c12,d4,d12, c5,c13, d5,d13,....d7,d15
628 transposed[3] = _mm_unpackhi_epi8(tmp1, tmp3);
629
630 // a0,a8, b0,b8, c0,c8, d0,d8, a1,a9, b1,b9, c1,c9, d1,d9
631 tmp0 = _mm_unpacklo_epi32(transposed[0], transposed[2]);
632 // a2,a10, b2,b10, c2,c10, d2,d10, a3,a11, b3,b11, c3,c11, d3,d11
633 tmp1 = _mm_unpackhi_epi32(transposed[0], transposed[2]);
634 // a4,a12, b4,b12, c4,c12, d4,d12, a5,a13, b5,b13, c5,c13, d5,d13
635 tmp2 = _mm_unpacklo_epi32(transposed[1], transposed[3]);
636 // a6,a14, b6,b14, c6,c14, d6,d14, a7,a15, b7,b15, c7,c15, d7,d15
637 tmp3 = _mm_unpackhi_epi32(transposed[1], transposed[3]);
638
639 // a0,a4, a8,a12, b0,b4, b8,b12, c0,c4, c8,c12, d0,d4, d8,d12
640 transposed[0] = _mm_unpacklo_epi8(tmp0, tmp2);
641 // a1,a5, a9,a13, b1,b5, b9,b13, c1,c5, c9,c13, d1,d5, d9,d13
642 transposed[1] = _mm_unpackhi_epi8(tmp0, tmp2);
643 // a2,a6, a10,a14, b2,b6, b10,b14, c2,c6, c10,c14, d2,d6, d10,d14
644 transposed[2] = _mm_unpacklo_epi8(tmp1, tmp3);
645 // a3,a7, a11,a15, b3,b7, b11,b15, c3,c7, c11,c15, d3,d7, d11,d15
646 transposed[3] = _mm_unpackhi_epi8(tmp1, tmp3);
647
648 return true;
649 }
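The early-out at the top of TransposeBlock broadcasts the first pixel and compares all 64 bytes against it; if every byte matches, the block is a single color and the cheaper CompressSolid path is taken. A scalar sketch of that check (memcpy is used to avoid the aliasing cast):

inline bool IsSolidBlock(const uint8_t* block) {
  uint32_t first;
  memcpy(&first, block, sizeof(first));
  for (int i = 1; i < 16; ++i) {
    uint32_t pixel;
    memcpy(&pixel, block + i * 4, sizeof(pixel));
    if (pixel != first)
      return false;
  }
  return true;
}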
650
651 inline void UnpackBlock(__m128i* packed,
652 __m128i* red,
653 __m128i* green,
654 __m128i* blue,
655 __m128i* alpha) {
656 const __m128i zero = _mm_set1_epi8(0);
657 __m128i tmp_low, tmp_high;
658
659 // unpack red
660 tmp_low = _mm_unpacklo_epi8(packed[0], zero);
661 tmp_high = _mm_unpackhi_epi8(packed[0], zero);
662
663 red[0] = _mm_unpacklo_epi16(tmp_low, zero);
664 red[1] = _mm_unpackhi_epi16(tmp_low, zero);
665
666 red[2] = _mm_unpacklo_epi16(tmp_high, zero);
667 red[3] = _mm_unpackhi_epi16(tmp_high, zero);
668
669 // unpack green
670 tmp_low = _mm_unpacklo_epi8(packed[1], zero);
671 tmp_high = _mm_unpackhi_epi8(packed[1], zero);
672
673 green[0] = _mm_unpacklo_epi16(tmp_low, zero);
674 green[1] = _mm_unpackhi_epi16(tmp_low, zero);
675
676 green[2] = _mm_unpacklo_epi16(tmp_high, zero);
677 green[3] = _mm_unpackhi_epi16(tmp_high, zero);
678
679 // unpack blue
680 tmp_low = _mm_unpacklo_epi8(packed[2], zero);
681 tmp_high = _mm_unpackhi_epi8(packed[2], zero);
682
683 blue[0] = _mm_unpacklo_epi16(tmp_low, zero);
684 blue[1] = _mm_unpackhi_epi16(tmp_low, zero);
685
686 blue[2] = _mm_unpacklo_epi16(tmp_high, zero);
687 blue[3] = _mm_unpackhi_epi16(tmp_high, zero);
688
689 // unpack alpha - unused for ETC1
690 tmp_low = _mm_unpacklo_epi8(packed[3], zero);
691 tmp_high = _mm_unpackhi_epi8(packed[3], zero);
692
693 alpha[0] = _mm_unpacklo_epi16(tmp_low, zero);
694 alpha[1] = _mm_unpackhi_epi16(tmp_low, zero);
695
696 alpha[2] = _mm_unpacklo_epi16(tmp_high, zero);
697 alpha[3] = _mm_unpackhi_epi16(tmp_high, zero);
698 }
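UnpackBlock widens each channel from 16 packed 8-bit values to four registers of four zero-extended 32-bit values by interleaving with zero twice (8-bit to 16-bit, then 16-bit to 32-bit). A scalar view of what happens to one channel:

inline void ScalarUnpackChannel(const uint8_t packed[16], uint32_t out[16]) {
  for (int i = 0; i < 16; ++i)
    out[i] = packed[i];  // zero-extend each 8-bit value to 32 bits
}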
699
700 inline void CompressSolid(uint8_t* dst, uint8_t* block) {
701 // Clear destination buffer so that we can "or" in the results.
702 memset(dst, 0, 8);
703
704 const float src_color_float[3] = {static_cast<float>(block[0]),
705 static_cast<float>(block[1]),
706 static_cast<float>(block[2])};
707 const cc::Color base = cc::MakeColor555(src_color_float);
708 const __m128i base_v =
709 _mm_set_epi32(0, base.channels.r, base.channels.g, base.channels.b);
710
711 const __m128i constant = _mm_set_epi32(0, block[2], block[1], block[0]);
712 __m128i lum;
713 __m128i colors[4];
714 static const __m128i rgb =
715 _mm_set_epi32(0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
716
717 cc::WriteDiff(dst, true);
718 cc::WriteFlip(dst, false);
719
720 cc::WriteColors555(dst, base, base);
721
722 uint8_t best_tbl_idx = 0;
723 uint8_t best_mod_idx = 0;
724 uint32_t best_mod_err = INT32_MAX;
725
726 for (unsigned int tbl_idx = 0; tbl_idx < 8; ++tbl_idx) {
727 lum = _mm_set_epi32(
728 cc::g_codeword_tables[tbl_idx][3], cc::g_codeword_tables[tbl_idx][2],
729 cc::g_codeword_tables[tbl_idx][1], cc::g_codeword_tables[tbl_idx][0]);
730 colors[0] = AddAndClamp(base_v, _mm_shuffle_epi32(lum, 0x0));
731 colors[1] = AddAndClamp(base_v, _mm_shuffle_epi32(lum, 0x55));
732 colors[2] = AddAndClamp(base_v, _mm_shuffle_epi32(lum, 0xAA));
733 colors[3] = AddAndClamp(base_v, _mm_shuffle_epi32(lum, 0xFF));
734
735 for (int i = 0; i < 4; i++) {
736 uint32_t mod_err =
737 SumSSE(GetColorErrorSSE(constant, _mm_and_si128(colors[i], rgb)));
738 colors[i] = _mm_and_si128(colors[i], rgb);
739 if (mod_err < best_mod_err) {
740 best_tbl_idx = tbl_idx;
741 best_mod_idx = i;
742 best_mod_err = mod_err;
743
744 if (mod_err == 0) {
745 break; // We cannot do any better than this.
746 }
747 }
748 }
749 }
750
751 cc::WriteCodewordTable(dst, 0, best_tbl_idx);
752 cc::WriteCodewordTable(dst, 1, best_tbl_idx);
753
754 uint8_t pix_idx = cc::g_mod_to_pix[best_mod_idx];
755 uint32_t lsb = pix_idx & 0x1;
756 uint32_t msb = pix_idx >> 1;
757
758 uint32_t pix_data = 0;
759 for (unsigned int i = 0; i < 2; ++i) {
760 for (unsigned int j = 0; j < 8; ++j) {
761 // Obtain the texel number as specified in the standard.
762 int texel_num = cc::g_idx_to_num[i][j];
763 pix_data |= msb << (texel_num + 16);
764 pix_data |= lsb << (texel_num);
765 }
766 }
767
768 cc::WritePixelData(dst, pix_data);
769 }
770
771 } // namespace
772
773 namespace cc {
reveman 2015/05/07 14:24:35 nit: please move this up to line 17 just before "n
radu.velea 2015/05/07 15:53:45 Done.
774
775 void TextureCompressorETC1SSE::Compress(const uint8_t* src,
776 uint8_t* dst,
777 int width,
778 int height,
779 Quality quality) {
780 DCHECK_GE(width, 4);
781 DCHECK_EQ((width & 3), 0);
782 DCHECK_GE(height, 4);
783 DCHECK_EQ((height & 3), 0);
784
785 ALIGNAS(16) uint8_t block[64];
786 __m128i packed[4];
787 __m128i red[4], green[4], blue[4], alpha[4];
788 __sse_data data;
789
790 for (int y = 0; y < height; y += 4, src += width * 4 * 4) {
791 for (int x = 0; x < width; x += 4, dst += 8) {
792 ExtractBlock(block, src + x * 4, width);
793 if (TransposeBlock(block, packed) == false) {
794 CompressSolid(dst, block);
795 } else {
796 UnpackBlock(packed, blue, green, red, alpha);
797
798 data.block = block;
799 data.packed = packed;
800 data.red = red;
801 data.blue = blue;
802 data.green = green;
803
804 CompressBlock(dst, &data);
805 }
806 }
807 }
808 }
809
810 } // namespace cc
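A minimal usage sketch, assuming the compressor is default-constructible and is used directly rather than obtained through the usual TextureCompressor factory; the quality constant name is also an assumption. Source data is RGBA with width and height that are multiples of 4, and every 4x4 block yields 8 bytes of ETC1 output:

void CompressImage(const uint8_t* rgba, uint8_t* etc1, int width, int height) {
  // Output buffer must hold (width / 4) * (height / 4) * 8 bytes.
  cc::TextureCompressorETC1SSE compressor;
  compressor.Compress(rgba, etc1, width, height,
                      cc::TextureCompressor::kQualityHigh);
}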