Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1196)

Side by Side Diff: cc/resources/texture_compressor_etc1_sse.cc

Issue 1096703002: Reland: Add ETC1 powered SSE encoder for tile texture compression (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Updated comments, casts and other minor issues. Created 5 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "cc/resources/texture_compressor_etc1_sse.h"
6
7 #include <assert.h>
reveman 2015/05/07 16:26:29 is this needed?
radu.velea 2015/05/08 10:52:11 Done.
8 #include <emmintrin.h>
9 #include <cmath>
10 #include <limits>
11
12 #include "base/compiler_specific.h"
13 #include "base/logging.h"
14 // Using this header for common functions such as Color handling
15 // and codeword table.
16 #include "cc/resources/texture_compressor_etc1.h"
17
18 namespace cc {
19
20 namespace {
21
// Turns the average error of a sub-block into the cut-off used while scanning
// the codeword tables.
//
// The ETC1 codeword table is sorted ascending, and the minimum error found in
// the ComputeLuminance main loop converges towards the best achievable value.
// Once the error exceeds this threshold there is no point in iterating further
// through the table.
inline uint32_t SetETC1MaxError(uint32_t avg_error) {
  const uint32_t half_of_avg = avg_error / 2;
  return avg_error + half_of_avg + 384;
}
32
// Views onto the different representations of a single 4x4 block. The struct
// only holds pointers; the storage they point at is provided by the caller.
struct __sse_data {
  // Raw block bytes.
  uint8_t* block;
  // Values packed as 8 bits each.
  __m128i* packed;
  // One 4x4 array per color channel, each value zero extended to 32 bits.
  __m128i* blue;
  __m128i* green;
  __m128i* red;
};
43
// Constant vectors that several routines in this file share: all lanes zero,
// and all 32-bit lanes set to the largest positive signed value.
static const __m128i __sse_zero = _mm_setzero_si128();
static const __m128i __sse_max_int = _mm_set1_epi32(INT32_MAX);
47
// Adds |x| and |y| and clamps the sum to [0, 255].
//
// The arithmetic is done on signed 16-bit lanes (SSE2 has no 32-bit min/max).
// For 32-bit lanes holding sign-extended values whose sum fits in 16 bits,
// the upper half of each lane ends up as zero, so the result is also a valid
// clamped 32-bit value.
inline __m128i AddAndClamp(const __m128i x, const __m128i y) {
  const __m128i upper_bound = _mm_set1_epi32(0xFF);
  const __m128i sum = _mm_add_epi16(x, y);
  const __m128i capped = _mm_min_epi16(sum, upper_bound);
  return _mm_max_epi16(_mm_setzero_si128(), capped);
}
53
// Returns the squared difference of |x| and |y|, computed on 16-bit lanes.
// _mm_mullo_epi16 (SSE2) is used instead of _mm_mullo_epi32 (SSE4); for 32-bit
// lanes holding values in [0, 255] the square still fits in the low 16 bits
// and the high 16 bits stay zero.
inline __m128i GetColorErrorSSE(const __m128i x, const __m128i y) {
  const __m128i difference = _mm_sub_epi16(x, y);
  return _mm_mullo_epi16(difference, difference);
}
59
// Lane-wise 32-bit sum of the three per-channel error vectors.
inline __m128i AddChannelError(const __m128i x,
                               const __m128i y,
                               const __m128i z) {
  const __m128i y_plus_z = _mm_add_epi32(y, z);
  return _mm_add_epi32(x, y_plus_z);
}
65
// Horizontal sum of the four 32-bit lanes of |x|.
inline uint32_t SumSSE(const __m128i x) {
  // Fold the upper 64 bits onto the lower 64 bits...
  const __m128i folded_halves = _mm_add_epi32(x, _mm_shuffle_epi32(x, 0x4E));
  // ...then fold neighbouring lanes so that lane 0 holds the full total.
  const __m128i total = _mm_add_epi32(
      folded_halves, _mm_shuffle_epi32(folded_halves, 0xB1));
  return _mm_cvtsi128_si32(total);
}
72
73 inline uint32_t GetVerticalError(const __sse_data* data,
74 const __m128i* blue_avg,
75 const __m128i* green_avg,
76 const __m128i* red_avg,
77 uint32_t* verror) {
78 __m128i error = __sse_zero;
79
80 for (int i = 0; i < 4; i++) {
81 error = _mm_add_epi32(error, GetColorErrorSSE(data->blue[i], blue_avg[0]));
82 error =
83 _mm_add_epi32(error, GetColorErrorSSE(data->green[i], green_avg[0]));
84 error = _mm_add_epi32(error, GetColorErrorSSE(data->red[i], red_avg[0]));
85 }
86
87 error = _mm_add_epi32(error, _mm_shuffle_epi32(error, 0x4E));
88
89 verror[0] = _mm_cvtsi128_si32(error);
90 verror[1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(error, 0xB1));
91
92 return verror[0] + verror[1];
93 }
94
95 inline uint32_t GetHorizontalError(const __sse_data* data,
96 const __m128i* blue_avg,
97 const __m128i* green_avg,
98 const __m128i* red_avg,
99 uint32_t* verror) {
100 __m128i error = __sse_zero;
101 int first_index, second_index;
102
103 for (int i = 0; i < 2; i++) {
104 first_index = 2 * i;
105 second_index = first_index + 1;
106
107 error = _mm_add_epi32(
108 error, GetColorErrorSSE(data->blue[first_index], blue_avg[i]));
109 error = _mm_add_epi32(
110 error, GetColorErrorSSE(data->blue[second_index], blue_avg[i]));
111 error = _mm_add_epi32(
112 error, GetColorErrorSSE(data->green[first_index], green_avg[i]));
113 error = _mm_add_epi32(
114 error, GetColorErrorSSE(data->green[second_index], green_avg[i]));
115 error = _mm_add_epi32(error,
116 GetColorErrorSSE(data->red[first_index], red_avg[i]));
117 error = _mm_add_epi32(
118 error, GetColorErrorSSE(data->red[second_index], red_avg[i]));
119 }
120
121 error = _mm_add_epi32(error, _mm_shuffle_epi32(error, 0x4E));
122
123 verror[0] = _mm_cvtsi128_si32(error);
124 verror[1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(error, 0xB1));
125
126 return verror[0] + verror[1];
127 }
128
129 inline void GetAvgColors(const __sse_data* data,
130 float* output,
131 bool* __sse_use_diff) {
132 __m128i sum[2], tmp;
133
134 // TODO(radu.velea): _mm_avg_epu8 on packed data maybe.
135
136 // Compute avg red value.
137 // [S0 S0 S1 S1]
138 sum[0] = _mm_add_epi32(data->red[0], data->red[1]);
139 sum[0] = _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0xB1));
140
141 // [S2 S2 S3 S3]
142 sum[1] = _mm_add_epi32(data->red[2], data->red[3]);
143 sum[1] = _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0xB1));
144
145 float hred[2], vred[2];
146 hred[0] = (_mm_cvtsi128_si32(
147 _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0x4E)))) /
148 8.0f;
149 hred[1] = (_mm_cvtsi128_si32(
150 _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0x4E)))) /
151 8.0f;
152
153 tmp = _mm_add_epi32(sum[0], sum[1]);
154 vred[0] = (_mm_cvtsi128_si32(tmp)) / 8.0f;
155 vred[1] = (_mm_cvtsi128_si32(_mm_shuffle_epi32(tmp, 0x2))) / 8.0f;
156
157 // Compute avg green value.
158 // [S0 S0 S1 S1]
159 sum[0] = _mm_add_epi32(data->green[0], data->green[1]);
160 sum[0] = _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0xB1));
161
162 // [S2 S2 S3 S3]
163 sum[1] = _mm_add_epi32(data->green[2], data->green[3]);
164 sum[1] = _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0xB1));
165
166 float hgreen[2], vgreen[2];
167 hgreen[0] = (_mm_cvtsi128_si32(
168 _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0x4E)))) /
169 8.0f;
170 hgreen[1] = (_mm_cvtsi128_si32(
171 _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0x4E)))) /
172 8.0f;
173
174 tmp = _mm_add_epi32(sum[0], sum[1]);
175 vgreen[0] = (_mm_cvtsi128_si32(tmp)) / 8.0f;
176 vgreen[1] = (_mm_cvtsi128_si32(_mm_shuffle_epi32(tmp, 0x2))) / 8.0f;
177
178 // Compute avg blue value.
179 // [S0 S0 S1 S1]
180 sum[0] = _mm_add_epi32(data->blue[0], data->blue[1]);
181 sum[0] = _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0xB1));
182
183 // [S2 S2 S3 S3]
184 sum[1] = _mm_add_epi32(data->blue[2], data->blue[3]);
185 sum[1] = _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0xB1));
186
187 float hblue[2], vblue[2];
188 hblue[0] = (_mm_cvtsi128_si32(
189 _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0x4E)))) /
190 8.0f;
191 hblue[1] = (_mm_cvtsi128_si32(
192 _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0x4E)))) /
193 8.0f;
194
195 tmp = _mm_add_epi32(sum[0], sum[1]);
196 vblue[0] = (_mm_cvtsi128_si32(tmp)) / 8.0f;
197 vblue[1] = (_mm_cvtsi128_si32(_mm_shuffle_epi32(tmp, 0x2))) / 8.0f;
198
199 // TODO(radu.velea): Return int's instead of floats, based on Quality.
200 output[0] = vblue[0];
201 output[1] = vgreen[0];
202 output[2] = vred[0];
203
204 output[3] = vblue[1];
205 output[4] = vgreen[1];
206 output[5] = vred[1];
207
208 output[6] = hblue[0];
209 output[7] = hgreen[0];
210 output[8] = hred[0];
211
212 output[9] = hblue[1];
213 output[10] = hgreen[1];
214 output[11] = hred[1];
215
216 __m128i threshold_upper = _mm_set1_epi32(3);
217 __m128i threshold_lower = _mm_set1_epi32(-4);
218
219 __m128 factor_v = _mm_set1_ps(31.0f / 255.0f);
220 __m128 rounding_v = _mm_set1_ps(0.5f);
221 __m128 h_avg_0 = _mm_set_ps(hblue[0], hgreen[0], hred[0], 0);
222 __m128 h_avg_1 = _mm_set_ps(hblue[1], hgreen[1], hred[1], 0);
223
224 __m128 v_avg_0 = _mm_set_ps(vblue[0], vgreen[0], vred[0], 0);
225 __m128 v_avg_1 = _mm_set_ps(vblue[1], vgreen[1], vred[1], 0);
226
227 h_avg_0 = _mm_mul_ps(h_avg_0, factor_v);
228 h_avg_1 = _mm_mul_ps(h_avg_1, factor_v);
229 v_avg_0 = _mm_mul_ps(v_avg_0, factor_v);
230 v_avg_1 = _mm_mul_ps(v_avg_1, factor_v);
231
232 h_avg_0 = _mm_add_ps(h_avg_0, rounding_v);
233 h_avg_1 = _mm_add_ps(h_avg_1, rounding_v);
234 v_avg_0 = _mm_add_ps(v_avg_0, rounding_v);
235 v_avg_1 = _mm_add_ps(v_avg_1, rounding_v);
236
237 __m128i h_avg_0i = _mm_cvttps_epi32(h_avg_0);
238 __m128i h_avg_1i = _mm_cvttps_epi32(h_avg_1);
239
240 __m128i v_avg_0i = _mm_cvttps_epi32(v_avg_0);
241 __m128i v_avg_1i = _mm_cvttps_epi32(v_avg_1);
242
243 h_avg_0i = _mm_sub_epi32(h_avg_1i, h_avg_0i);
244 v_avg_0i = _mm_sub_epi32(v_avg_1i, v_avg_0i);
245
246 __sse_use_diff[0] =
247 (0 == _mm_movemask_epi8(_mm_cmplt_epi32(v_avg_0i, threshold_lower)));
248 __sse_use_diff[0] &=
249 (0 == _mm_movemask_epi8(_mm_cmpgt_epi32(v_avg_0i, threshold_upper)));
250
251 __sse_use_diff[1] =
252 (0 == _mm_movemask_epi8(_mm_cmplt_epi32(h_avg_0i, threshold_lower)));
253 __sse_use_diff[1] &=
254 (0 == _mm_movemask_epi8(_mm_cmpgt_epi32(h_avg_0i, threshold_upper)));
255 }
256
257 void ComputeLuminance(uint8_t* block,
258 const Color& base,
259 const int sub_block_id,
260 const uint8_t* idx_to_num_tab,
261 const __sse_data* data,
262 const uint32_t expected_error) {
263 uint8_t best_tbl_idx = 0;
264 uint32_t best_error = 0x7FFFFFFF;
265 uint8_t best_mod_idx[8][8]; // [table][texel]
266
267 const __m128i base_blue = _mm_set1_epi32(base.channels.b);
268 const __m128i base_green = _mm_set1_epi32(base.channels.g);
269 const __m128i base_red = _mm_set1_epi32(base.channels.r);
270
271 __m128i test_red, test_blue, test_green, tmp, tmp_blue, tmp_green, tmp_red;
272 __m128i block_error, mask;
273
274 // This will have the minimum errors for each 4 pixels.
275 __m128i first_half_min;
276 __m128i second_half_min;
277
278 // This will have the matching table index combo for each 4 pixels.
279 __m128i first_half_pattern;
280 __m128i second_half_pattern;
281
282 const __m128i first_blue_data_block = data->blue[2 * sub_block_id];
283 const __m128i first_green_data_block = data->green[2 * sub_block_id];
284 const __m128i first_red_data_block = data->red[2 * sub_block_id];
285
286 const __m128i second_blue_data_block = data->blue[2 * sub_block_id + 1];
287 const __m128i second_green_data_block = data->green[2 * sub_block_id + 1];
288 const __m128i second_red_data_block = data->red[2 * sub_block_id + 1];
289
290 uint32_t min;
291 // Fail early to increase speed.
292 long delta = INT32_MAX;
293 uint32_t last_min = INT32_MAX;
294
295 const uint8_t shuffle_mask[] = {
296 0x1B, 0x4E, 0xB1, 0xE4}; // Important they are sorted ascending.
297
298 for (unsigned int tbl_idx = 0; tbl_idx < 8; ++tbl_idx) {
299 tmp = _mm_set_epi32(
300 g_codeword_tables[tbl_idx][3], g_codeword_tables[tbl_idx][2],
301 g_codeword_tables[tbl_idx][1], g_codeword_tables[tbl_idx][0]);
302
303 test_blue = AddAndClamp(tmp, base_blue);
304 test_green = AddAndClamp(tmp, base_green);
305 test_red = AddAndClamp(tmp, base_red);
306
307 first_half_min = __sse_max_int;
308 second_half_min = __sse_max_int;
309
310 first_half_pattern = __sse_zero;
311 second_half_pattern = __sse_zero;
312
313 for (uint8_t imm8 : shuffle_mask) {
314 switch (imm8) {
315 case 0x1B:
316 tmp_blue = _mm_shuffle_epi32(test_blue, 0x1B);
317 tmp_green = _mm_shuffle_epi32(test_green, 0x1B);
318 tmp_red = _mm_shuffle_epi32(test_red, 0x1B);
319 break;
320 case 0x4E:
321 tmp_blue = _mm_shuffle_epi32(test_blue, 0x4E);
322 tmp_green = _mm_shuffle_epi32(test_green, 0x4E);
323 tmp_red = _mm_shuffle_epi32(test_red, 0x4E);
324 break;
325 case 0xB1:
326 tmp_blue = _mm_shuffle_epi32(test_blue, 0xB1);
327 tmp_green = _mm_shuffle_epi32(test_green, 0xB1);
328 tmp_red = _mm_shuffle_epi32(test_red, 0xB1);
329 break;
330 case 0xE4:
331 tmp_blue = _mm_shuffle_epi32(test_blue, 0xE4);
332 tmp_green = _mm_shuffle_epi32(test_green, 0xE4);
333 tmp_red = _mm_shuffle_epi32(test_red, 0xE4);
334 break;
335 default:
336 tmp_blue = test_blue;
337 tmp_green = test_green;
338 tmp_red = test_red;
339 }
340
341 tmp = _mm_set1_epi32(imm8);
342
343 block_error =
344 AddChannelError(GetColorErrorSSE(tmp_blue, first_blue_data_block),
345 GetColorErrorSSE(tmp_green, first_green_data_block),
346 GetColorErrorSSE(tmp_red, first_red_data_block));
347
348 // Save winning pattern.
349 first_half_pattern = _mm_max_epi16(
350 first_half_pattern,
351 _mm_and_si128(tmp, _mm_cmpgt_epi32(first_half_min, block_error)));
352 // Should use _mm_min_epi32(first_half_min, block_error); from SSE4
353 // otherwise we have a small performance penalty.
354 mask = _mm_cmplt_epi32(block_error, first_half_min);
355 first_half_min = _mm_add_epi32(_mm_and_si128(mask, block_error),
356 _mm_andnot_si128(mask, first_half_min));
357
358 // Compute second part of the block.
359 block_error =
360 AddChannelError(GetColorErrorSSE(tmp_blue, second_blue_data_block),
361 GetColorErrorSSE(tmp_green, second_green_data_block),
362 GetColorErrorSSE(tmp_red, second_red_data_block));
363
364 // Save winning pattern.
365 second_half_pattern = _mm_max_epi16(
366 second_half_pattern,
367 _mm_and_si128(tmp, _mm_cmpgt_epi32(second_half_min, block_error)));
368 // Should use _mm_min_epi32(second_half_min, block_error); from SSE4
369 // otherwise we have a small performance penalty.
370 mask = _mm_cmplt_epi32(block_error, second_half_min);
371 second_half_min = _mm_add_epi32(_mm_and_si128(mask, block_error),
372 _mm_andnot_si128(mask, second_half_min));
373 }
374
375 first_half_min = _mm_add_epi32(first_half_min, second_half_min);
376 first_half_min =
377 _mm_add_epi32(first_half_min, _mm_shuffle_epi32(first_half_min, 0x4E));
378 first_half_min =
379 _mm_add_epi32(first_half_min, _mm_shuffle_epi32(first_half_min, 0xB1));
380
381 min = _mm_cvtsi128_si32(first_half_min);
382
383 delta = min - last_min;
384 last_min = min;
385
386 if (min < best_error) {
387 best_tbl_idx = tbl_idx;
388 best_error = min;
389
390 best_mod_idx[tbl_idx][0] =
391 (_mm_cvtsi128_si32(first_half_pattern) >> (0)) & 3;
392 best_mod_idx[tbl_idx][4] =
393 (_mm_cvtsi128_si32(second_half_pattern) >> (0)) & 3;
394
395 best_mod_idx[tbl_idx][1] =
396 (_mm_cvtsi128_si32(_mm_shuffle_epi32(first_half_pattern, 0x1)) >>
397 (2)) &
398 3;
399 best_mod_idx[tbl_idx][5] =
400 (_mm_cvtsi128_si32(_mm_shuffle_epi32(second_half_pattern, 0x1)) >>
401 (2)) &
402 3;
403
404 best_mod_idx[tbl_idx][2] =
405 (_mm_cvtsi128_si32(_mm_shuffle_epi32(first_half_pattern, 0x2)) >>
406 (4)) &
407 3;
408 best_mod_idx[tbl_idx][6] =
409 (_mm_cvtsi128_si32(_mm_shuffle_epi32(second_half_pattern, 0x2)) >>
410 (4)) &
411 3;
412
413 best_mod_idx[tbl_idx][3] =
414 (_mm_cvtsi128_si32(_mm_shuffle_epi32(first_half_pattern, 0x3)) >>
415 (6)) &
416 3;
417 best_mod_idx[tbl_idx][7] =
418 (_mm_cvtsi128_si32(_mm_shuffle_epi32(second_half_pattern, 0x3)) >>
419 (6)) &
420 3;
421
422 if (best_error == 0) {
423 break;
424 }
425 } else if (delta > 0 && expected_error < min) {
426 // The error is growing and is well beyond expected threshold.
427 break;
428 }
429 }
430
431 WriteCodewordTable(block, sub_block_id, best_tbl_idx);
432
433 uint32_t pix_data = 0;
434 uint8_t mod_idx;
435 uint8_t pix_idx;
436 uint32_t lsb;
437 uint32_t msb;
438 int texel_num;
439
440 for (unsigned int i = 0; i < 8; ++i) {
441 mod_idx = best_mod_idx[best_tbl_idx][i];
442 pix_idx = g_mod_to_pix[mod_idx];
443
444 lsb = pix_idx & 0x1;
445 msb = pix_idx >> 1;
446
447 // Obtain the texel number as specified in the standard.
448 texel_num = idx_to_num_tab[i];
449 pix_data |= msb << (texel_num + 16);
450 pix_data |= lsb << (texel_num);
451 }
452
453 WritePixelData(block, pix_data);
454 }
455
456 void CompressBlock(uint8_t* dst, __sse_data* data) {
457 // First 3 values are for vertical 1, second 3 vertical 2, third 3 horizontal
458 // 1, last 3
459 // horizontal 2.
460 float __sse_avg_colors[12] = {
461 0,
462 };
463 bool use_differential[2] = {true, true};
464 GetAvgColors(data, __sse_avg_colors, use_differential);
465 Color sub_block_avg[4];
466
467 // TODO(radu.velea): Remove floating point operations and use only int's +
468 // normal rounding and shifts for reduced Quality.
469 for (int i = 0, j = 1; i < 4; i += 2, j += 2) {
470 if (use_differential[i / 2] == false) {
471 sub_block_avg[i] = MakeColor444(&__sse_avg_colors[i * 3]);
472 sub_block_avg[j] = MakeColor444(&__sse_avg_colors[j * 3]);
473 } else {
474 sub_block_avg[i] = MakeColor555(&__sse_avg_colors[i * 3]);
475 sub_block_avg[j] = MakeColor555(&__sse_avg_colors[j * 3]);
476 }
477 }
478
479 __m128i red_avg[2], green_avg[2], blue_avg[2];
480
481 // TODO(radu.velea): Perfect accuracy, maybe skip floating variables.
482 blue_avg[0] = _mm_set_epi32(static_cast<int>(__sse_avg_colors[3]),
483 static_cast<int>(__sse_avg_colors[3]),
484 static_cast<int>(__sse_avg_colors[0]),
485 static_cast<int>(__sse_avg_colors[0]));
486
487 green_avg[0] = _mm_set_epi32(static_cast<int>(__sse_avg_colors[4]),
488 static_cast<int>(__sse_avg_colors[4]),
489 static_cast<int>(__sse_avg_colors[1]),
490 static_cast<int>(__sse_avg_colors[1]));
491
492 red_avg[0] = _mm_set_epi32(static_cast<int>(__sse_avg_colors[5]),
493 static_cast<int>(__sse_avg_colors[5]),
494 static_cast<int>(__sse_avg_colors[2]),
495 static_cast<int>(__sse_avg_colors[2]));
496
497 uint32_t vertical_error[2];
498 GetVerticalError(data, blue_avg, green_avg, red_avg, vertical_error);
499
500 // TODO(radu.velea): Perfect accuracy, maybe skip floating variables.
501 blue_avg[0] = _mm_set1_epi32(static_cast<int>(__sse_avg_colors[6]));
502 blue_avg[1] = _mm_set1_epi32(static_cast<int>(__sse_avg_colors[9]));
503
504 green_avg[0] = _mm_set1_epi32(static_cast<int>(__sse_avg_colors[7]));
505 green_avg[1] = _mm_set1_epi32(static_cast<int>(__sse_avg_colors[10]));
506
507 red_avg[0] = _mm_set1_epi32(static_cast<int>(__sse_avg_colors[8]));
508 red_avg[1] = _mm_set1_epi32(static_cast<int>(__sse_avg_colors[11]));
509
510 uint32_t horizontal_error[2];
511 GetHorizontalError(data, blue_avg, green_avg, red_avg, horizontal_error);
512
513 bool flip = (horizontal_error[0] + horizontal_error[1]) <
514 (vertical_error[0] + vertical_error[1]);
515 uint32_t* expected_errors = flip ? horizontal_error : vertical_error;
516
517 // Clear destination buffer so that we can "or" in the results.
518 memset(dst, 0, 8);
519
520 WriteDiff(dst, use_differential[!!flip]);
521 WriteFlip(dst, flip);
522
523 uint8_t sub_block_off_0 = flip ? 2 : 0;
524 uint8_t sub_block_off_1 = sub_block_off_0 + 1;
525
526 if (use_differential[!!flip]) {
527 WriteColors555(dst, sub_block_avg[sub_block_off_0],
528 sub_block_avg[sub_block_off_1]);
529 } else {
530 WriteColors444(dst, sub_block_avg[sub_block_off_0],
531 sub_block_avg[sub_block_off_1]);
532 }
533
534 if (!flip) {
535 // Transpose vertical data into horizontal lines.
536 __m128i tmp;
537 for (int i = 0; i < 4; i += 2) {
538 tmp = data->blue[i];
539 data->blue[i] = _mm_add_epi32(
540 _mm_move_epi64(data->blue[i]),
541 _mm_shuffle_epi32(_mm_move_epi64(data->blue[i + 1]), 0x4E));
542 data->blue[i + 1] = _mm_add_epi32(
543 _mm_move_epi64(_mm_shuffle_epi32(tmp, 0x4E)),
544 _mm_shuffle_epi32(
545 _mm_move_epi64(_mm_shuffle_epi32(data->blue[i + 1], 0x4E)),
546 0x4E));
547
548 tmp = data->green[i];
549 data->green[i] = _mm_add_epi32(
550 _mm_move_epi64(data->green[i]),
551 _mm_shuffle_epi32(_mm_move_epi64(data->green[i + 1]), 0x4E));
552 data->green[i + 1] = _mm_add_epi32(
553 _mm_move_epi64(_mm_shuffle_epi32(tmp, 0x4E)),
554 _mm_shuffle_epi32(
555 _mm_move_epi64(_mm_shuffle_epi32(data->green[i + 1], 0x4E)),
556 0x4E));
557
558 tmp = data->red[i];
559 data->red[i] = _mm_add_epi32(
560 _mm_move_epi64(data->red[i]),
561 _mm_shuffle_epi32(_mm_move_epi64(data->red[i + 1]), 0x4E));
562 data->red[i + 1] = _mm_add_epi32(
563 _mm_move_epi64(_mm_shuffle_epi32(tmp, 0x4E)),
564 _mm_shuffle_epi32(
565 _mm_move_epi64(_mm_shuffle_epi32(data->red[i + 1], 0x4E)), 0x4E));
566 }
567
568 tmp = data->blue[1];
569 data->blue[1] = data->blue[2];
570 data->blue[2] = tmp;
571
572 tmp = data->green[1];
573 data->green[1] = data->green[2];
574 data->green[2] = tmp;
575
576 tmp = data->red[1];
577 data->red[1] = data->red[2];
578 data->red[2] = tmp;
579 }
580
581 // Compute luminance for the first sub block.
582 ComputeLuminance(dst, sub_block_avg[sub_block_off_0], 0,
583 g_idx_to_num[sub_block_off_0], data,
584 SetETC1MaxError(expected_errors[0]));
585 // Compute luminance for the second sub block.
586 ComputeLuminance(dst, sub_block_avg[sub_block_off_1], 1,
587 g_idx_to_num[sub_block_off_1], data,
588 SetETC1MaxError(expected_errors[1]));
589 }
590
// Copies a 4x4 block of 4-byte pixels out of an image that is |width| pixels
// wide. |src| points at the top-left pixel of the block; |dst| receives the
// 64 bytes of the block, row after row.
static void ExtractBlock(uint8_t* dst, const uint8_t* src, int width) {
  const int block_row_bytes = 4 * 4;
  const int image_row_bytes = width * 4;
  for (int row = 0; row < 4; ++row) {
    memcpy(dst + row * block_row_bytes, src + row * image_row_bytes,
           block_row_bytes);
  }
}
597
// Transforms an incoming block of 16 four-byte pixels (e.g. RGBA or BGRA) into
// 4 registers, each containing the data of a single channel.
// Ex: transposed[0] will have all the R values for a RGBA block,
// transposed[1] will have G, etc.
// The values are packed as 8 bit unsigned values in the SSE registers, in the
// same pixel order as |block|.
//
// Returns false, without transposing, when all 16 pixels are identical (a
// solid block); in that case |transposed| just holds the four loaded rows.
// Returns true otherwise.
inline bool TransposeBlock(uint8_t* block, __m128i* transposed) {
  // a0,a1,a2,...a7, ...a15
  transposed[0] = _mm_loadu_si128(reinterpret_cast<const __m128i*>(block));
  // b0, b1,b2,...b7.... b15
  transposed[1] =
      _mm_loadu_si128(reinterpret_cast<const __m128i*>(block + 16));
  // c0, c1,c2,...c7....c15
  transposed[2] =
      _mm_loadu_si128(reinterpret_cast<const __m128i*>(block + 32));
  // d0,d1,d2,...d7....d15
  transposed[3] =
      _mm_loadu_si128(reinterpret_cast<const __m128i*>(block + 48));

  // Before doing any work we check if the block is solid. The first pixel is
  // broadcast from the row that was just loaded, which avoids reading the byte
  // buffer through a uint32_t pointer.
  const __m128i test_solid = _mm_shuffle_epi32(transposed[0], 0x00);
  int mask = 0xFFFF;
  for (int i = 0; i < 4; i++) {
    mask &= _mm_movemask_epi8(_mm_cmpeq_epi8(transposed[i], test_solid));
  }

  if (mask == 0xFFFF) {
    // Block is solid, no need to do any more work.
    return false;
  }

  // a0,b0, a1,b1, a2,b2, a3,b3,....a7,b7
  __m128i tmp0 = _mm_unpacklo_epi8(transposed[0], transposed[1]);
  // c0,d0, c1,d1, c2,d2, c3,d3,... c7,d7
  __m128i tmp1 = _mm_unpacklo_epi8(transposed[2], transposed[3]);
  // a8,b8, a9,b9, a10,b10, a11,b11,...a15,b15
  __m128i tmp2 = _mm_unpackhi_epi8(transposed[0], transposed[1]);
  // c8,d8, c9,d9, c10,d10, c11,d11,...c15,d15
  __m128i tmp3 = _mm_unpackhi_epi8(transposed[2], transposed[3]);

  // a0,a8, b0,b8, a1,a9, b1,b9, ....a3,a11, b3,b11
  transposed[0] = _mm_unpacklo_epi8(tmp0, tmp2);
  // a4,a12, b4,b12, a5,a13, b5,b13,....a7,a15,b7,b15
  transposed[1] = _mm_unpackhi_epi8(tmp0, tmp2);
  // c0,c8, d0,d8, c1,c9, d1,d9.....d3,d11
  transposed[2] = _mm_unpacklo_epi8(tmp1, tmp3);
  // c4,c12,d4,d12, c5,c13, d5,d13,....d7,d15
  transposed[3] = _mm_unpackhi_epi8(tmp1, tmp3);

  // a0,a8, b0,b8, c0,c8, d0,d8, a1,a9, b1,b9, c1,c9, d1,d9
  tmp0 = _mm_unpacklo_epi32(transposed[0], transposed[2]);
  // a2,a10, b2,b10, c2,c10, d2,d10, a3,a11, b3,b11, c3,c11, d3,d11
  tmp1 = _mm_unpackhi_epi32(transposed[0], transposed[2]);
  // a4,a12, b4,b12, c4,c12, d4,d12, a5,a13, b5,b13, c5,c13, d5,d13
  tmp2 = _mm_unpacklo_epi32(transposed[1], transposed[3]);
  // a6,a14, b6,b14, c6,c14, d6,d14, a7,a15, b7,b15, c7,c15, d7,d15
  tmp3 = _mm_unpackhi_epi32(transposed[1], transposed[3]);

  // a0,a4, a8,a12, b0,b4, b8,b12, c0,c4, c8,c12, d0,d4, d8,d12
  transposed[0] = _mm_unpacklo_epi8(tmp0, tmp2);
  // a1,a5, a9,a13, b1,b5, b9,b13, c1,c5, c9,c13, d1,d5, d9,d13
  transposed[1] = _mm_unpackhi_epi8(tmp0, tmp2);
  // a2,a6, a10,a14, b2,b6, b10,b14, c2,c6, c10,c14, d2,d6, d10,d14
  transposed[2] = _mm_unpacklo_epi8(tmp1, tmp3);
  // a3,a7, a11,a15, b3,b7, b11,b15, c3,c7, c11,c15, d3,d7, d11,d15
  transposed[3] = _mm_unpackhi_epi8(tmp1, tmp3);

  return true;
}
666
// Widens the four packed 8-bit channel registers into 32-bit values.
//
// packed[0], packed[1], packed[2] and packed[3] are expanded into |red|,
// |green|, |blue| and |alpha| respectively. Each output is an array of four
// vectors; vector q holds bytes 4q .. 4q+3 of the packed register, zero
// extended to 32 bits. The fourth channel is expanded as well although ETC1
// does not use alpha.
inline void UnpackBlock(__m128i* packed,
                        __m128i* red,
                        __m128i* green,
                        __m128i* blue,
                        __m128i* alpha) {
  const __m128i zero = _mm_setzero_si128();
  __m128i* const outputs[4] = {red, green, blue, alpha};

  for (int channel = 0; channel < 4; ++channel) {
    // 8 -> 16 bits.
    const __m128i low_half = _mm_unpacklo_epi8(packed[channel], zero);
    const __m128i high_half = _mm_unpackhi_epi8(packed[channel], zero);

    // 16 -> 32 bits.
    __m128i* const out = outputs[channel];
    out[0] = _mm_unpacklo_epi16(low_half, zero);
    out[1] = _mm_unpackhi_epi16(low_half, zero);
    out[2] = _mm_unpacklo_epi16(high_half, zero);
    out[3] = _mm_unpackhi_epi16(high_half, zero);
  }
}
715
716 inline void CompressSolid(uint8_t* dst, uint8_t* block) {
717 // Clear destination buffer so that we can "or" in the results.
718 memset(dst, 0, 8);
719
720 const float src_color_float[3] = {static_cast<float>(block[0]),
721 static_cast<float>(block[1]),
722 static_cast<float>(block[2])};
723 const Color base = MakeColor555(src_color_float);
724 const __m128i base_v =
725 _mm_set_epi32(0, base.channels.r, base.channels.g, base.channels.b);
726
727 const __m128i constant = _mm_set_epi32(0, block[2], block[1], block[0]);
728 __m128i lum;
729 __m128i colors[4];
730 static const __m128i rgb =
731 _mm_set_epi32(0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
732
733 WriteDiff(dst, true);
734 WriteFlip(dst, false);
735
736 WriteColors555(dst, base, base);
737
738 uint8_t best_tbl_idx = 0;
739 uint8_t best_mod_idx = 0;
740 uint32_t best_mod_err = INT32_MAX;
741
742 for (unsigned int tbl_idx = 0; tbl_idx < 8; ++tbl_idx) {
743 lum = _mm_set_epi32(
744 g_codeword_tables[tbl_idx][3], g_codeword_tables[tbl_idx][2],
745 g_codeword_tables[tbl_idx][1], g_codeword_tables[tbl_idx][0]);
746 colors[0] = AddAndClamp(base_v, _mm_shuffle_epi32(lum, 0x0));
747 colors[1] = AddAndClamp(base_v, _mm_shuffle_epi32(lum, 0x55));
748 colors[2] = AddAndClamp(base_v, _mm_shuffle_epi32(lum, 0xAA));
749 colors[3] = AddAndClamp(base_v, _mm_shuffle_epi32(lum, 0xFF));
750
751 for (int i = 0; i < 4; i++) {
752 uint32_t mod_err =
753 SumSSE(GetColorErrorSSE(constant, _mm_and_si128(colors[i], rgb)));
754 colors[i] = _mm_and_si128(colors[i], rgb);
755 if (mod_err < best_mod_err) {
756 best_tbl_idx = tbl_idx;
757 best_mod_idx = i;
758 best_mod_err = mod_err;
759
760 if (mod_err == 0) {
761 break; // We cannot do any better than this.
762 }
763 }
764 }
765 }
766
767 WriteCodewordTable(dst, 0, best_tbl_idx);
768 WriteCodewordTable(dst, 1, best_tbl_idx);
769
770 uint8_t pix_idx = g_mod_to_pix[best_mod_idx];
771 uint32_t lsb = pix_idx & 0x1;
772 uint32_t msb = pix_idx >> 1;
773
774 uint32_t pix_data = 0;
775 for (unsigned int i = 0; i < 2; ++i) {
776 for (unsigned int j = 0; j < 8; ++j) {
777 // Obtain the texel number as specified in the standard.
778 int texel_num = g_idx_to_num[i][j];
779 pix_data |= msb << (texel_num + 16);
780 pix_data |= lsb << (texel_num);
781 }
782 }
783
784 WritePixelData(dst, pix_data);
785 }
786
787 } // namespace
788
789 void TextureCompressorETC1SSE::Compress(const uint8_t* src,
790 uint8_t* dst,
791 int width,
792 int height,
793 Quality quality) {
794 DCHECK_GE(width, 4);
795 DCHECK_EQ((width & 3), 0);
796 DCHECK_GE(height, 4);
797 DCHECK_EQ((height & 3), 0);
798
799 ALIGNAS(16) uint8_t block[64];
800 __m128i packed[4];
801 __m128i red[4], green[4], blue[4], alpha[4];
802 __sse_data data;
803
804 for (int y = 0; y < height; y += 4, src += width * 4 * 4) {
805 for (int x = 0; x < width; x += 4, dst += 8) {
806 ExtractBlock(block, src + x * 4, width);
807 if (TransposeBlock(block, packed) == false) {
808 CompressSolid(dst, block);
809 } else {
810 UnpackBlock(packed, blue, green, red, alpha);
811
812 data.block = block;
813 data.packed = packed;
814 data.red = red;
815 data.blue = blue;
816 data.green = green;
817
818 CompressBlock(dst, &data);
819 }
820 }
821 }
822 }
823
824 } // namespace cc
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698