Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(14)

Side by Side Diff: cc/resources/texture_compressor_etc1_sse.cc

Issue 1096703002: Reland: Add ETC1 powered SSE encoder for tile texture compression (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: --no-find-copies Created 5 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "texture_compressor_etc1_sse.h"
6
7 #include <assert.h>
8 #include <smmintrin.h>
9 #include <stdio.h>
10 #include <stdlib.h>
11 #include <string.h>
12 #include <time.h>
13 #include <unistd.h>
14
15 #include <cmath>
16 #include <limits>
17 #include <sstream>
18
19 #include "base/compiler_specific.h"
20 #include "base/logging.h"
21
22 // Defining the following macro will cause the error metric function to weigh
23 // each color channel differently depending on how the human eye can perceive
24 // them. This can give a slight improvement in image quality at the cost of a
25 // performance hit.
26 // #define USE_PERCEIVED_ERROR_METRIC
27
28 namespace {
29
// Returns |val| limited to the closed range [min, max]: values below |min|
// yield |min|, values above |max| yield |max|, everything else is returned
// unchanged.
template <typename T>
inline T clamp(T val, T min, T max) {
  if (val < min)
    return min;
  if (val > max)
    return max;
  return val;
}
34
// Scales an 8-bit channel value (0..255, given as a float) to 5 bits (0..31),
// rounding to nearest.
//
// The range check is done on the float *before* narrowing to uint8_t:
// converting an out-of-range (or NaN) float to an integer type is undefined
// behaviour, so clamping after the conversion is too late. For inputs in
// [0, 255] the result is identical to the truncating conversion.
inline uint8_t round_to_5_bits(float val) {
  const float scaled = val * 31.0f / 255.0f + 0.5f;
  if (!(scaled > 0.0f))  // Negative, zero or NaN.
    return 0;
  if (scaled >= 31.0f)
    return 31;
  return static_cast<uint8_t>(scaled);
}
38
// Scales an 8-bit channel value (0..255, given as a float) to 4 bits (0..15),
// rounding to nearest.
//
// The range check is done on the float *before* narrowing to uint8_t:
// converting an out-of-range (or NaN) float to an integer type is undefined
// behaviour, so clamping after the conversion is too late. For inputs in
// [0, 255] the result is identical to the truncating conversion.
inline uint8_t round_to_4_bits(float val) {
  const float scaled = val * 15.0f / 255.0f + 0.5f;
  if (!(scaled > 0.0f))  // Negative, zero or NaN.
    return 0;
  if (scaled >= 15.0f)
    return 15;
  return static_cast<uint8_t>(scaled);
}
42
// One pixel, exposed through three overlapping views of the same storage:
// named channels, an indexable byte array, and a single 32-bit word.
43 union Color {
// Named view. Members are declared in b, g, r, a order.
44 struct BgraColorType {
45 uint8_t b;
46 uint8_t g;
47 uint8_t r;
48 uint8_t a;
49 } channels;
// Indexable view of the same four bytes.
50 uint8_t components[4];
// All four bytes as one 32-bit value.
51 uint32_t bits;
52 };
53
54 /*
55 * Codeword tables.
56 * See: Table 3.17.2
57 */
// Eight rows of four signed offsets each. The encoder below indexes this as
// [table index][modifier index] and adds the selected offset to every channel
// of a sub-block's base color. Declared 16-byte aligned.
58 static const int16_t g_codeword_tables[8][4]
59 __attribute__((aligned(16))) = {{-8, -2, 2, 8},
60 {-17, -5, 5, 17},
61 {-29, -9, 9, 29},
62 {-42, -13, 13, 42},
63 {-60, -18, 18, 60},
64 {-80, -24, 24, 80},
65 {-106, -33, 33, 106},
66 {-183, -47, 47, 183}};
67
68 /*
69 * Maps modifier indices to pixel index values.
70 * See: Table 3.17.3
71 */
// Indexed by a column of g_codeword_tables (0..3); yields the 2-bit value that
// the encoder splits into an lsb and an msb when packing the pixel data.
72 static const uint8_t g_mod_to_pix[4] = {3, 2, 0, 1};
73
74 /*
75 * The ETC1 specification index texels as follows:
76 *
77 * [a][e][i][m] [ 0][ 4][ 8][12]
78 * [b][f][j][n] <-> [ 1][ 5][ 9][13]
79 * [c][g][k][o] [ 2][ 6][10][14]
80 * [d][h][l][p] [ 3][ 7][11][15]
81 *
82 * However, when extracting sub blocks from BGRA data the natural array
83 * indexing order ends up different:
84 *
85 * vertical0: [a][e][b][f] horizontal0: [a][e][i][m]
86 * [c][g][d][h] [b][f][j][n]
87 * vertical1: [i][m][j][n] horizontal1: [c][g][k][o]
88 * [k][o][l][p] [d][h][l][p]
89 *
90 * In order to translate from the natural array indices in a sub block to the
91 * indices (number) used by specification and hardware we use this table.
92 */
// Row = sub-block (two vertical, then two horizontal); column = position of a
// texel inside that sub-block's 8-element array; value = texel number 0..15
// used as the bit position when the pixel data is packed.
93 static const uint8_t g_idx_to_num[4][8] = {
94 {0, 4, 1, 5, 2, 6, 3, 7}, // Vertical block 0.
95 {8, 12, 9, 13, 10, 14, 11, 15}, // Vertical block 1.
96 {0, 4, 8, 12, 1, 5, 9, 13}, // Horizontal block 0.
97 {2, 6, 10, 14, 3, 7, 11, 15} // Horizontal block 1.
98 };
99
100 inline void WriteColors444(uint8_t* block,
101 const Color& color0,
102 const Color& color1) {
103 /* 0, 1, 2 - for ARM */
adrian.belgun 2015/04/17 14:02:58 Please check image channel order for input. This
104 block[2] = (color0.channels.r & 0xf0) | (color1.channels.r >> 4);
105 block[1] = (color0.channels.g & 0xf0) | (color1.channels.g >> 4);
106 block[0] = (color0.channels.b & 0xf0) | (color1.channels.b >> 4);
107 }
108
109 inline void WriteColors555(uint8_t* block,
110 const Color& color0,
111 const Color& color1) {
112 // Table for conversion to 3-bit two complement format.
113 static const uint8_t two_compl_trans_table[8] = {
114 4, // -4 (100b)
115 5, // -3 (101b)
116 6, // -2 (110b)
117 7, // -1 (111b)
118 0, // 0 (000b)
119 1, // 1 (001b)
120 2, // 2 (010b)
121 3, // 3 (011b)
122 };
123
124 int16_t delta_r =
125 static_cast<int16_t>(color1.channels.r >> 3) - (color0.channels.r >> 3);
126 int16_t delta_g =
127 static_cast<int16_t>(color1.channels.g >> 3) - (color0.channels.g >> 3);
128 int16_t delta_b =
129 static_cast<int16_t>(color1.channels.b >> 3) - (color0.channels.b >> 3);
130 DCHECK(delta_r >= -4 && delta_r <= 3);
131 DCHECK(delta_g >= -4 && delta_g <= 3);
132 DCHECK(delta_b >= -4 && delta_b <= 3);
133
134 /* 0, 1, 2 - for ARM */
adrian.belgun 2015/04/17 14:02:58 Same comments as for :103.
135 block[2] = (color0.channels.r & 0xf8) | two_compl_trans_table[delta_r + 4];
136 block[1] = (color0.channels.g & 0xf8) | two_compl_trans_table[delta_g + 4];
137 block[0] = (color0.channels.b & 0xf8) | two_compl_trans_table[delta_b + 4];
138 }
139
140 inline void WriteCodewordTable(uint8_t* block,
141 uint8_t sub_block_id,
142 uint8_t table) {
143 DCHECK_LT(sub_block_id, 2);
144 DCHECK_LT(table, 8);
145
146 uint8_t shift = (2 + (3 - sub_block_id * 3));
147 block[3] &= ~(0x07 << shift);
148 block[3] |= table << shift;
149 }
150
// ORs the 32-bit |pixel_data| into bytes 4..7 of |block|, most significant
// byte first. Existing bits in those bytes are kept, so the destination is
// expected to be cleared before the first call.
inline void WritePixelData(uint8_t* block, uint32_t pixel_data) {
  for (int byte = 0; byte < 4; ++byte) {
    const int shift = 24 - 8 * byte;
    block[4 + byte] |= (pixel_data >> shift) & 0xff;
  }
}
157
// Sets bit 0 of byte 3 of |block| to |flip|, leaving the other bits untouched.
inline void WriteFlip(uint8_t* block, bool flip) {
  const int other_bits = block[3] & ~0x01;
  block[3] = other_bits | (flip ? 0x01 : 0x00);
}
162
// Sets bit 1 of byte 3 of |block| to |diff|, leaving the other bits untouched.
inline void WriteDiff(uint8_t* block, bool diff) {
  const int other_bits = block[3] & ~0x02;
  block[3] = other_bits | (diff ? 0x02 : 0x00);
}
167
168 /**
169 * Compress and rounds BGR888 into BGR444. The resulting BGR444 color is
170 * expanded to BGR888 as it would be in hardware after decompression. The
171 * actual 444-bit data is available in the four most significant bits of each
172 * channel.
173 */
174 inline Color MakeColor444(const float* bgr) {
175 uint8_t b4 = round_to_4_bits(bgr[0]);
176 uint8_t g4 = round_to_4_bits(bgr[1]);
177 uint8_t r4 = round_to_4_bits(bgr[2]);
178 Color bgr444;
179 bgr444.channels.b = (b4 << 4) | b4;
180 bgr444.channels.g = (g4 << 4) | g4;
181 bgr444.channels.r = (r4 << 4) | r4;
182 bgr444.channels.a = 0x44; /* added by Radu */
183 return bgr444;
184 }
185
186 /**
187 * Compress and rounds BGR888 into BGR555. The resulting BGR555 color is
188 * expanded to BGR888 as it would be in hardware after decompression. The
189 * actual 555-bit data is available in the five most significant bits of each
190 * channel.
191 */
192 inline Color MakeColor555(const float* bgr) {
193 uint8_t b5 = round_to_5_bits(bgr[0]);
194 uint8_t g5 = round_to_5_bits(bgr[1]);
195 uint8_t r5 = round_to_5_bits(bgr[2]);
196 Color bgr555;
197 bgr555.channels.b = (b5 << 3) | (b5 >> 2);
198 bgr555.channels.g = (g5 << 3) | (g5 >> 2);
199 bgr555.channels.r = (r5 << 3) | (r5 >> 2);
200 bgr555.channels.a = 0x55; /* added by Radu */
201 return bgr555;
202 }
203
204 /**
205 * Constructs a color from a given base color and luminance value.
206 */
207 inline Color MakeColor(const Color& base, int16_t lum) {
208 int b = static_cast<int>(base.channels.b) + lum;
209 int g = static_cast<int>(base.channels.g) + lum;
210 int r = static_cast<int>(base.channels.r) + lum;
211 Color color;
212 color.channels.b = static_cast<uint8_t>(clamp(b, 0, 255));
213 color.channels.g = static_cast<uint8_t>(clamp(g, 0, 255));
214 color.channels.r = static_cast<uint8_t>(clamp(r, 0, 255));
215 return color;
216 }
217
218 /**
219 * Calculates the error metric for two colors. A small error signals that the
220 * colors are similar to each other, a large error the signals the opposite.
221 */
222 inline uint32_t GetColorError(const Color& u, const Color& v) {
223 #ifdef USE_PERCEIVED_ERROR_METRIC
224 float delta_b = static_cast<float>(u.channels.b) - v.channels.b;
225 float delta_g = static_cast<float>(u.channels.g) - v.channels.g;
226 float delta_r = static_cast<float>(u.channels.r) - v.channels.r;
227 return static_cast<uint32_t>(0.299f * delta_b * delta_b +
228 0.587f * delta_g * delta_g +
229 0.114f * delta_r * delta_r);
230 #else
231 int delta_b = static_cast<int>(u.channels.b) - v.channels.b;
232 int delta_g = static_cast<int>(u.channels.g) - v.channels.g;
233 int delta_r = static_cast<int>(u.channels.r) - v.channels.r;
234 return delta_b * delta_b + delta_g * delta_g + delta_r * delta_r;
235 #endif
236 }
237
238 /**************************************** START OF SSE CODE
adrian.belgun 2015/04/17 14:02:58 Use only one line here. Reduce number of stars.
239 * ***************************************/
240
// Bundle of pointers describing one 4x4 block in the forms the SSE routines
// consume. The struct only stores the pointers; Compress() points them at its
// own local arrays.
// NOTE(review): identifiers containing a double underscore are reserved for
// the implementation in C++; renaming would have to touch every user.
241 struct __sse_data {
242 /* raw data */
243 uint8_t* block;
244 /* 8 bit packed values */
245 __m128i* packed;
246 /* 32 bit zero extended values - 4x4 arrays */
// Each of these is indexed [0..3] by the functions below, one register of
// four 32-bit lanes per index.
247 __m128i* blue;
248 __m128i* green;
249 __m128i* red;
250 // __m128i *alpha;
251 };
252
253 /* commonly used registers */
// All four 32-bit lanes equal to 0; used as the initial value of accumulators.
254 static const __m128i __sse_zero = _mm_set1_epi32(0);
// All four 32-bit lanes equal to 0x7FFFFFFF; used as the initial value when
// searching for a minimum error.
255 static const __m128i __sse_max_int = _mm_set1_epi32(0x7FFFFFFF);
256
// Lane-wise x + y on four signed 32-bit integers, with every sum saturated to
// the 8-bit channel range [0, 255].
inline __m128i AddAndClamp(const __m128i x, const __m128i y) {
  const __m128i channel_min = _mm_setzero_si128();
  const __m128i channel_max = _mm_set1_epi32(0xFF);
  const __m128i sum = _mm_add_epi32(x, y);
  const __m128i capped = _mm_min_epi32(sum, channel_max);
  return _mm_max_epi32(channel_min, capped);
}
262
263 inline __m128i GetColorErrorSSE(const __m128i x, const __m128i y) {
264 __m128i ret = _mm_sub_epi32(x, y);
265 return _mm_mullo_epi32(ret, ret);
266 }
267
// Lane-wise sum x + y + z of four 32-bit integers; used to combine the
// per-channel errors of a pixel into one value.
inline __m128i AddChannelError(const __m128i x,
                               const __m128i y,
                               const __m128i z) {
  const __m128i yz = _mm_add_epi32(y, z);
  return _mm_add_epi32(x, yz);
}
273 /*
274 inline void ShuffleImm(__m128i *src, __m128i *dest, int size, uint8_t notimm) {
275 switch(notimm) {
276 case 0x1B:
277 for (int i = 0; i < size; i++) {
adrian.belgun 2015/04/17 14:02:58 Braces are optional for single-statement loops. Co
278 dest[i] = _mm_shuffle_epi32(src[i], 0x1B);
279 }
280 break;
281 case 0x4E:
282 for (int i = 0; i < size; i++) {
283 dest[i] = _mm_shuffle_epi32(src[i], 0x4E);
284 }
285 break;
286 case 0xB1:
287 for (int i = 0; i < size; i++) {
288 dest[i] = _mm_shuffle_epi32(src[i], 0xB1);
289 }
290 break;
291 case 0xE4:
292 for (int i = 0; i < size; i++) {
293 dest[i] = _mm_shuffle_epi32(src[i], 0xE4);
294 }
295 break;
296 default:
297 for (int i = 0; i < size; i++) {
298 dest[i] = src[i];
299 }
300 };
301 }
302 */
303 inline uint32_t GetVerticalError(const __sse_data* data,
304 const __m128i* blue_avg,
305 const __m128i* green_avg,
306 const __m128i* red_avg) {
307 __m128i error = __sse_zero;
308
309 #pragma unroll
310 for (int i = 0; i < 4; i++) {
311 error = _mm_add_epi32(error, GetColorErrorSSE(data->blue[i], blue_avg[0]));
312 error =
313 _mm_add_epi32(error, GetColorErrorSSE(data->green[i], green_avg[0]));
314 error = _mm_add_epi32(error, GetColorErrorSSE(data->red[i], red_avg[0]));
315 }
316
317 error = _mm_add_epi32(error, _mm_shuffle_epi32(error, 0x4E));
318 error = _mm_add_epi32(error, _mm_shuffle_epi32(error, 0xB1));
319
320 return _mm_cvtsi128_si32(error);
321 }
322
323 inline uint32_t GetHorizontalError(const __sse_data* data,
324 const __m128i* blue_avg,
325 const __m128i* green_avg,
326 const __m128i* red_avg) {
327 __m128i error = __sse_zero;
328 int first_index, second_index;
329
330 #pragma unroll
331 for (int i = 0; i < 2; i++) {
332 first_index = 2 * i;
333 second_index = first_index + 1;
334
335 error = _mm_add_epi32(
336 error, GetColorErrorSSE(data->blue[first_index], blue_avg[i]));
337 error = _mm_add_epi32(
338 error, GetColorErrorSSE(data->blue[second_index], blue_avg[i]));
339 error = _mm_add_epi32(
340 error, GetColorErrorSSE(data->green[first_index], green_avg[i]));
341 error = _mm_add_epi32(
342 error, GetColorErrorSSE(data->green[second_index], green_avg[i]));
343 error = _mm_add_epi32(error,
344 GetColorErrorSSE(data->red[first_index], red_avg[i]));
345 error = _mm_add_epi32(
346 error, GetColorErrorSSE(data->red[second_index], red_avg[i]));
347 }
348
349 error = _mm_add_epi32(error, _mm_shuffle_epi32(error, 0x4E));
350 error = _mm_add_epi32(error, _mm_shuffle_epi32(error, 0xB1));
351 return _mm_cvtsi128_si32(error);
352 }
353
354 inline void GetAvgColors(const __sse_data* data,
355 float* output,
356 bool* __sse_use_diff) {
357 __m128i sum[2], tmp;
358
359 // TODO(radu.velea): _mm_avg_epu8 on packed data maybe
360
361 /* get avg red */
362 /* [S0 S0 S1 S1] */
363 sum[0] = _mm_add_epi32(data->red[0], data->red[1]);
364 sum[0] = _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0xB1));
365
366 /* [S2 S2 S3 S3] */
367 sum[1] = _mm_add_epi32(data->red[2], data->red[3]);
368 sum[1] = _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0xB1));
369
370 float hred[2], vred[2];
371 hred[0] = (_mm_cvtsi128_si32(
372 _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0x4E)))) /
373 8.0f;
374 hred[1] = (_mm_cvtsi128_si32(
375 _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0x4E)))) /
376 8.0f;
377
378 tmp = _mm_add_epi32(sum[0], sum[1]);
379 vred[0] = (_mm_cvtsi128_si32(tmp)) / 8.0f;
380 vred[1] = (_mm_extract_epi32(tmp, 2)) / 8.0f;
381
382 /* get avg green */
383 /* [S0 S0 S1 S1] */
384 sum[0] = _mm_add_epi32(data->green[0], data->green[1]);
385 sum[0] = _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0xB1));
386
387 /* [S2 S2 S3 S3] */
388 sum[1] = _mm_add_epi32(data->green[2], data->green[3]);
389 sum[1] = _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0xB1));
390
391 float hgreen[2], vgreen[2];
392 hgreen[0] = (_mm_cvtsi128_si32(
393 _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0x4E)))) /
394 8.0f;
395 hgreen[1] = (_mm_cvtsi128_si32(
396 _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0x4E)))) /
397 8.0f;
398
399 tmp = _mm_add_epi32(sum[0], sum[1]);
400 vgreen[0] = (_mm_cvtsi128_si32(tmp)) / 8.0f;
401 vgreen[1] = (_mm_extract_epi32(tmp, 2)) / 8.0f;
402
403 /* get avg blue */
404 /* [S0 S0 S1 S1] */
405 sum[0] = _mm_add_epi32(data->blue[0], data->blue[1]);
406 sum[0] = _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0xB1));
407
408 /* [S2 S2 S3 S3] */
409 sum[1] = _mm_add_epi32(data->blue[2], data->blue[3]);
410 sum[1] = _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0xB1));
411
412 float hblue[2], vblue[2];
413 hblue[0] = (_mm_cvtsi128_si32(
414 _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0x4E)))) /
415 8.0f;
416 hblue[1] = (_mm_cvtsi128_si32(
417 _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0x4E)))) /
418 8.0f;
419
420 tmp = _mm_add_epi32(sum[0], sum[1]);
421 vblue[0] = (_mm_cvtsi128_si32(tmp)) / 8.0f;
422 vblue[1] = (_mm_extract_epi32(tmp, 2)) / 8.0f;
423
424 /* TODO(radu.velea): return int's instead of floats */
425 output[0] = vblue[0];
426 output[1] = vgreen[0];
427 output[2] = vred[0];
428
429 output[3] = vblue[1];
430 output[4] = vgreen[1];
431 output[5] = vred[1];
432
433 output[6] = hblue[0];
434 output[7] = hgreen[0];
435 output[8] = hred[0];
436
437 output[9] = hblue[1];
438 output[10] = hgreen[1];
439 output[11] = hred[1];
440
441 __m128i threashhold_upper = _mm_set1_epi32(3);
442 __m128i threashhold_lower = _mm_set1_epi32(-4);
443
444 __m128 factor_v = _mm_set1_ps(31.0f / 255.0f);
445 __m128 rounding_v = _mm_set1_ps(0.5f);
446 __m128 h_avg_0 = _mm_set_ps(hblue[0], hgreen[0], hred[0], 0);
447 __m128 h_avg_1 = _mm_set_ps(hblue[1], hgreen[1], hred[1], 0);
448
449 __m128 v_avg_0 = _mm_set_ps(vblue[0], vgreen[0], vred[0], 0);
450 __m128 v_avg_1 = _mm_set_ps(vblue[1], vgreen[1], vred[1], 0);
451
452 h_avg_0 = _mm_mul_ps(h_avg_0, factor_v);
453 h_avg_1 = _mm_mul_ps(h_avg_1, factor_v);
454 v_avg_0 = _mm_mul_ps(v_avg_0, factor_v);
455 v_avg_1 = _mm_mul_ps(v_avg_1, factor_v);
456
457 h_avg_0 = _mm_add_ps(h_avg_0, rounding_v);
458 h_avg_1 = _mm_add_ps(h_avg_1, rounding_v);
459 v_avg_0 = _mm_add_ps(v_avg_0, rounding_v);
460 v_avg_1 = _mm_add_ps(v_avg_1, rounding_v);
461
462 __m128i h_avg_0i = _mm_cvttps_epi32(h_avg_0);
463 __m128i h_avg_1i = _mm_cvttps_epi32(h_avg_1);
464
465 __m128i v_avg_0i = _mm_cvttps_epi32(v_avg_0);
466 __m128i v_avg_1i = _mm_cvttps_epi32(v_avg_1);
467
468 h_avg_0i = _mm_sub_epi32(h_avg_1i, h_avg_0i);
469 v_avg_0i = _mm_sub_epi32(v_avg_1i, v_avg_0i);
470
471 __sse_use_diff[0] =
472 (0 == _mm_movemask_epi8(_mm_cmplt_epi32(v_avg_0i, threashhold_lower)));
473 __sse_use_diff[0] &=
474 (0 == _mm_movemask_epi8(_mm_cmpgt_epi32(v_avg_0i, threashhold_upper)));
475
476 __sse_use_diff[1] =
477 (0 == _mm_movemask_epi8(_mm_cmplt_epi32(h_avg_0i, threashhold_lower)));
478 __sse_use_diff[1] &=
479 (0 == _mm_movemask_epi8(_mm_cmpgt_epi32(h_avg_0i, threashhold_upper)));
480 }
481
482 void ComputeLuminanceSSE(uint8_t* block,
483 const Color& base,
484 const int sub_block_id,
485 const uint8_t* idx_to_num_tab,
486 const __sse_data* data) {
487 uint8_t my_best_tbl_idx = 0;
488 uint32_t my_best_error = 0x7FFFFFFF;
489 uint8_t my_best_mod_idx[8][8]; // [table][texel]
490
491 const __m128i base_blue = _mm_set1_epi32(base.channels.b);
492 const __m128i base_green = _mm_set1_epi32(base.channels.g);
493 const __m128i base_red = _mm_set1_epi32(base.channels.r);
494
495 __m128i test_red, test_blue, test_green, tmp, tmp_blue, tmp_green, tmp_red;
496 __m128i block_error;
497
498 /* this will have the minimum errors for each 4 pixels */
499 __m128i first_half_min;
500 __m128i second_half_min;
501
502 /* this will have the matching table index combo for each 4 pixels */
503 __m128i first_half_pattern;
504 __m128i second_half_pattern;
505
506 const __m128i first_blue_data_block = data->blue[2 * sub_block_id];
507 const __m128i first_green_data_block = data->green[2 * sub_block_id];
508 const __m128i first_red_data_block = data->red[2 * sub_block_id];
509
510 const __m128i second_blue_data_block = data->blue[2 * sub_block_id + 1];
511 const __m128i second_green_data_block = data->green[2 * sub_block_id + 1];
512 const __m128i second_red_data_block = data->red[2 * sub_block_id + 1];
513
514 uint32_t min;
515
516 #define ELEMENT_1 3, 2, 1, 0
517 #define ELEMENT_2 7, 6, 5, 4
518 #define ELEMENT_3 11, 10, 9, 8
519 #define ELEMENT_4 15, 14, 13, 12
520
521 static const __m128i mask_extended[4] = {
522 _mm_set_epi8(ELEMENT_1, ELEMENT_2, ELEMENT_3, ELEMENT_4),
523 _mm_set_epi8(ELEMENT_2, ELEMENT_1, ELEMENT_4, ELEMENT_3),
524 _mm_set_epi8(ELEMENT_3, ELEMENT_4, ELEMENT_1, ELEMENT_2),
525 _mm_set_epi8(ELEMENT_4, ELEMENT_3, ELEMENT_2, ELEMENT_1)};
526
527 static const __m128i mask_imm[4] = {_mm_set1_epi32(0x1B),
528 _mm_set1_epi32(0x4E),
529 _mm_set1_epi32(0xB1),
530 _mm_set1_epi32(0xE4)};
531
532 for (unsigned int tbl_idx = 0; tbl_idx < 8; ++tbl_idx) {
533 tmp = _mm_set_epi32(
534 g_codeword_tables[tbl_idx][3], g_codeword_tables[tbl_idx][2],
535 g_codeword_tables[tbl_idx][1], g_codeword_tables[tbl_idx][0]);
536
537 test_blue = AddAndClamp(tmp, base_blue);
538 test_green = AddAndClamp(tmp, base_green);
539 test_red = AddAndClamp(tmp, base_red);
540
541 first_half_min = __sse_max_int;
542 second_half_min = __sse_max_int;
543
544 first_half_pattern = __sse_zero;
545 second_half_pattern = __sse_zero;
546
547 #pragma unroll
548 for (int i = 0; i < 4; i++) {
549 tmp_blue = _mm_shuffle_epi8(test_blue, mask_extended[i]);
550 tmp_green = _mm_shuffle_epi8(test_green, mask_extended[i]);
551 tmp_red = _mm_shuffle_epi8(test_red, mask_extended[i]);
552
553 block_error =
554 AddChannelError(GetColorErrorSSE(tmp_blue, first_blue_data_block),
555 GetColorErrorSSE(tmp_green, first_green_data_block),
556 GetColorErrorSSE(tmp_red, first_red_data_block));
557
558 /* save winning pattern */
559 first_half_pattern = _mm_max_epi32(
560 first_half_pattern,
561 _mm_and_si128(mask_imm[i],
562 _mm_cmpgt_epi32(first_half_min, block_error)));
563 first_half_min = _mm_min_epi32(first_half_min, block_error);
564
565 /* Second part of the block */
566 block_error =
567 AddChannelError(GetColorErrorSSE(tmp_blue, second_blue_data_block),
568 GetColorErrorSSE(tmp_green, second_green_data_block),
569 GetColorErrorSSE(tmp_red, second_red_data_block));
570
571 /* save winning pattern */
572 second_half_pattern = _mm_max_epi32(
573 second_half_pattern,
574 _mm_and_si128(mask_imm[i],
575 _mm_cmpgt_epi32(second_half_min, block_error)));
576 second_half_min = _mm_min_epi32(second_half_min, block_error);
577 }
578
579 first_half_min = _mm_add_epi32(first_half_min, second_half_min);
580 first_half_min =
581 _mm_add_epi32(first_half_min, _mm_shuffle_epi32(first_half_min, 0x4E));
582 first_half_min =
583 _mm_add_epi32(first_half_min, _mm_shuffle_epi32(first_half_min, 0xB1));
584
585 min = _mm_cvtsi128_si32(first_half_min);
586
587 if (min < my_best_error) {
588 my_best_tbl_idx = tbl_idx;
589 my_best_error = min;
590 #if O3_OPTIMIZATION
591 #pragma unroll
592 for (int i = 0; i < 4; i++) {
593 my_best_mod_idx[tbl_idx][i] =
594 (_mm_extract_epi32(first_half_pattern, i) >> (2 * i)) & 3;
595 my_best_mod_idx[tbl_idx][i + 4] =
596 (_mm_extract_epi32(second_half_pattern, i) >> (2 * i)) & 3;
597 }
598 #endif
599 my_best_mod_idx[tbl_idx][0] =
600 (_mm_extract_epi32(first_half_pattern, 0) >> (0)) & 3;
601 my_best_mod_idx[tbl_idx][4] =
602 (_mm_extract_epi32(second_half_pattern, 0) >> (0)) & 3;
603
604 my_best_mod_idx[tbl_idx][1] =
605 (_mm_extract_epi32(first_half_pattern, 1) >> (2)) & 3;
606 my_best_mod_idx[tbl_idx][5] =
607 (_mm_extract_epi32(second_half_pattern, 1) >> (2)) & 3;
608
609 my_best_mod_idx[tbl_idx][2] =
610 (_mm_extract_epi32(first_half_pattern, 2) >> (4)) & 3;
611 my_best_mod_idx[tbl_idx][6] =
612 (_mm_extract_epi32(second_half_pattern, 2) >> (4)) & 3;
613
614 my_best_mod_idx[tbl_idx][3] =
615 (_mm_extract_epi32(first_half_pattern, 3) >> (6)) & 3;
616 my_best_mod_idx[tbl_idx][7] =
617 (_mm_extract_epi32(second_half_pattern, 3) >> (6)) & 3;
618
619 if (my_best_error == 0) {
620 break;
621 }
622 }
623 }
624
625 WriteCodewordTable(block, sub_block_id, my_best_tbl_idx);
626
627 uint32_t pix_data = 0;
628 uint8_t mod_idx;
629 uint8_t pix_idx;
630 uint32_t lsb;
631 uint32_t msb;
632 int texel_num;
633
634 for (unsigned int i = 0; i < 8; ++i) {
635 mod_idx = my_best_mod_idx[my_best_tbl_idx][i];
636 pix_idx = g_mod_to_pix[mod_idx];
637
638 lsb = pix_idx & 0x1;
639 msb = pix_idx >> 1;
640
641 // Obtain the texel number as specified in the standard.
642 texel_num = idx_to_num_tab[i];
643 pix_data |= msb << (texel_num + 16);
644 pix_data |= lsb << (texel_num);
645 }
646
647 WritePixelData(block, pix_data);
648 }
649
650 void CompressBlock(uint8_t* dst, __sse_data* data) {
651 /* first 3 vertical 1, seconds 3 vertical 2, third 3 horizontal 1, last 3
652 * horizontal 2 */
653 float __sse_avg_colors[12] = {
654 0,
655 };
656 bool use_differential[2] = {true, true};
657 GetAvgColors(data, __sse_avg_colors, use_differential);
658 Color sub_block_avg[4];
659
660 /* TODO(radu.velea): remove floating point operations and use only int's +
661 * normal
662 * rounding and shifts */
663 for (int i = 0, j = 1; i < 4; i += 2, j += 2) {
664 if (use_differential[i / 2] == false) {
665 sub_block_avg[i] = MakeColor444(&__sse_avg_colors[i * 3]);
666 sub_block_avg[j] = MakeColor444(&__sse_avg_colors[j * 3]);
667 } else {
668 sub_block_avg[i] = MakeColor555(&__sse_avg_colors[i * 3]);
669 sub_block_avg[j] = MakeColor555(&__sse_avg_colors[j * 3]);
670 }
671 }
672
673 __m128i red_avg[2], green_avg[2], blue_avg[2];
674
675 // TODO(radu.velea): perfect accuracy, maybe skip floating variables
676 blue_avg[0] =
677 _mm_set_epi32((int)__sse_avg_colors[3], (int)__sse_avg_colors[3],
678 (int)__sse_avg_colors[0], (int)__sse_avg_colors[0]);
679
680 green_avg[0] =
681 _mm_set_epi32((int)__sse_avg_colors[4], (int)__sse_avg_colors[4],
682 (int)__sse_avg_colors[1], (int)__sse_avg_colors[1]);
683
684 red_avg[0] =
685 _mm_set_epi32((int)__sse_avg_colors[5], (int)__sse_avg_colors[5],
686 (int)__sse_avg_colors[2], (int)__sse_avg_colors[2]);
687
688 uint32_t vertical_error =
689 GetVerticalError(data, blue_avg, green_avg, red_avg);
690
691 // TODO(radu.velea): perfect accuracy, maybe skip floating variables
692 blue_avg[0] = _mm_set1_epi32((int)__sse_avg_colors[6]);
693 blue_avg[1] = _mm_set1_epi32((int)__sse_avg_colors[9]);
694
695 green_avg[0] = _mm_set1_epi32((int)__sse_avg_colors[7]);
696 green_avg[1] = _mm_set1_epi32((int)__sse_avg_colors[10]);
697
698 red_avg[0] = _mm_set1_epi32((int)__sse_avg_colors[8]);
699 red_avg[1] = _mm_set1_epi32((int)__sse_avg_colors[11]);
700
701 uint32_t horizontal_error =
702 GetHorizontalError(data, blue_avg, green_avg, red_avg);
703
704 bool flip = horizontal_error < vertical_error;
705
706 // Clear destination buffer so that we can "or" in the results.
707 memset(dst, 0, 8);
708
709 WriteDiff(dst, use_differential[!!flip]);
710 WriteFlip(dst, flip);
711
712 uint8_t sub_block_off_0 = flip ? 2 : 0;
713 uint8_t sub_block_off_1 = sub_block_off_0 + 1;
714
715 if (use_differential[!!flip]) {
716 WriteColors555(dst, sub_block_avg[sub_block_off_0],
717 sub_block_avg[sub_block_off_1]);
718 } else {
719 WriteColors444(dst, sub_block_avg[sub_block_off_0],
720 sub_block_avg[sub_block_off_1]);
721 }
722
723 if (flip == false) {
724 /* transpose vertical data into horizontal lines */
725 __m128i tmp;
726 #pragma unroll
727 for (int i = 0; i < 4; i += 2) {
728 tmp = data->blue[i];
729 data->blue[i] = _mm_add_epi32(
730 _mm_move_epi64(data->blue[i]),
731 _mm_shuffle_epi32(_mm_move_epi64(data->blue[i + 1]), 0x4E));
732 data->blue[i + 1] = _mm_add_epi32(
733 _mm_move_epi64(_mm_shuffle_epi32(tmp, 0x4E)),
734 _mm_shuffle_epi32(
735 _mm_move_epi64(_mm_shuffle_epi32(data->blue[i + 1], 0x4E)),
736 0x4E));
737
738 tmp = data->green[i];
739 data->green[i] = _mm_add_epi32(
740 _mm_move_epi64(data->green[i]),
741 _mm_shuffle_epi32(_mm_move_epi64(data->green[i + 1]), 0x4E));
742 data->green[i + 1] = _mm_add_epi32(
743 _mm_move_epi64(_mm_shuffle_epi32(tmp, 0x4E)),
744 _mm_shuffle_epi32(
745 _mm_move_epi64(_mm_shuffle_epi32(data->green[i + 1], 0x4E)),
746 0x4E));
747
748 tmp = data->red[i];
749 data->red[i] = _mm_add_epi32(
750 _mm_move_epi64(data->red[i]),
751 _mm_shuffle_epi32(_mm_move_epi64(data->red[i + 1]), 0x4E));
752 data->red[i + 1] = _mm_add_epi32(
753 _mm_move_epi64(_mm_shuffle_epi32(tmp, 0x4E)),
754 _mm_shuffle_epi32(
755 _mm_move_epi64(_mm_shuffle_epi32(data->red[i + 1], 0x4E)), 0x4E));
756 }
757
758 tmp = data->blue[1];
759 data->blue[1] = data->blue[2];
760 data->blue[2] = tmp;
761
762 tmp = data->green[1];
763 data->green[1] = data->green[2];
764 data->green[2] = tmp;
765
766 tmp = data->red[1];
767 data->red[1] = data->red[2];
768 data->red[2] = tmp;
769 }
770
771 // Compute luminance for the first sub block.
772 ComputeLuminanceSSE(dst, sub_block_avg[sub_block_off_0], 0,
773 g_idx_to_num[sub_block_off_0], data);
774 // Compute luminance for the second sub block.
775 ComputeLuminanceSSE(dst, sub_block_avg[sub_block_off_1], 1,
776 g_idx_to_num[sub_block_off_1], data);
777 }
778
// Copies a 4x4 block of 4-byte pixels out of an image into the contiguous
// 64-byte buffer |dst|. |src| points at the block's top-left pixel and |width|
// is the image width in pixels, so consecutive rows are width * 4 bytes apart.
static void LegacyExtractBlock(uint8_t* dst, const uint8_t* src, int width) {
  const int kRowBytes = 4 * 4;
  const uint8_t* row_start = src;
  uint8_t* out = dst;
  for (int row = 0; row < 4; ++row) {
    memcpy(out, row_start, kRowBytes);
    out += kRowBytes;
    row_start += width * 4;
  }
}
785
// Converts a 4x4 block of interleaved 4-byte pixels (64 bytes at |block|) into
// four planes: on return transposed[c] holds byte c of all 16 pixels, in the
// same pixel order as the input (element k comes from block[4 * k + c]).
//
// This is a 16x4 byte transpose done with four rounds of unpack/interleave.
// Below, a/b/c/d name the four 16-byte rows and the digit the byte index
// within a row.
inline void TransposeBlock(uint8_t* block, __m128i* transposed /* [4] */) {
  const __m128i* rows = reinterpret_cast<const __m128i*>(block);
  const __m128i row_a = _mm_loadu_si128(rows + 0);  // a0 a1 ... a15
  const __m128i row_b = _mm_loadu_si128(rows + 1);  // b0 b1 ... b15
  const __m128i row_c = _mm_loadu_si128(rows + 2);  // c0 c1 ... c15
  const __m128i row_d = _mm_loadu_si128(rows + 3);  // d0 d1 ... d15

  // Round 1: interleave bytes of row pairs.
  const __m128i ab_lo = _mm_unpacklo_epi8(row_a, row_b);  // a0 b0 ... a7 b7
  const __m128i cd_lo = _mm_unpacklo_epi8(row_c, row_d);  // c0 d0 ... c7 d7
  const __m128i ab_hi = _mm_unpackhi_epi8(row_a, row_b);  // a8 b8 ... a15 b15
  const __m128i cd_hi = _mm_unpackhi_epi8(row_c, row_d);  // c8 d8 ... c15 d15

  // Round 2: pair byte k with byte k + 8 of the same row.
  // ab_0: a0 a8 b0 b8 a1 a9 b1 b9 ... a3 a11 b3 b11
  const __m128i ab_0 = _mm_unpacklo_epi8(ab_lo, ab_hi);
  // ab_1: a4 a12 b4 b12 ... a7 a15 b7 b15
  const __m128i ab_1 = _mm_unpackhi_epi8(ab_lo, ab_hi);
  // cd_0 / cd_1: the same layout for rows c and d.
  const __m128i cd_0 = _mm_unpacklo_epi8(cd_lo, cd_hi);
  const __m128i cd_1 = _mm_unpackhi_epi8(cd_lo, cd_hi);

  // Round 3: interleave 32-bit groups so that all four rows meet.
  // q0: a0 a8 b0 b8 | c0 c8 d0 d8 | a1 a9 b1 b9 | c1 c9 d1 d9
  const __m128i q0 = _mm_unpacklo_epi32(ab_0, cd_0);
  // q1: the same layout for byte indices 2/10 and 3/11.
  const __m128i q1 = _mm_unpackhi_epi32(ab_0, cd_0);
  // q2: the same layout for byte indices 4/12 and 5/13.
  const __m128i q2 = _mm_unpacklo_epi32(ab_1, cd_1);
  // q3: the same layout for byte indices 6/14 and 7/15.
  const __m128i q3 = _mm_unpackhi_epi32(ab_1, cd_1);

  // Round 4: final byte interleave yields one plane per register, e.g.
  // transposed[0] = a0 a4 a8 a12 b0 b4 b8 b12 c0 c4 c8 c12 d0 d4 d8 d12.
  transposed[0] = _mm_unpacklo_epi8(q0, q2);
  transposed[1] = _mm_unpackhi_epi8(q0, q2);
  transposed[2] = _mm_unpacklo_epi8(q1, q3);
  transposed[3] = _mm_unpackhi_epi8(q1, q3);
}
846
// Zero-extends the 16 bytes of each packed plane to 32 bits. packed[0] is
// expanded into |red|, packed[1] into |green|, packed[2] into |blue| and
// packed[3] into |alpha|; each output receives four registers of four values,
// in the original element order.
//
// Note: the parameter names only describe which plane index feeds which
// output. Compress() passes its blue array as |red| and its red array as
// |blue|.
inline void UnpackBlock(__m128i* packed,
                        __m128i* red,
                        __m128i* green,
                        __m128i* blue,
                        __m128i* alpha) {
  const __m128i zero = _mm_set1_epi8(0);
  __m128i* const outputs[4] = {red, green, blue, alpha};

  for (int plane = 0; plane < 4; ++plane) {
    // 8 -> 16 bits.
    const __m128i low_words = _mm_unpacklo_epi8(packed[plane], zero);
    const __m128i high_words = _mm_unpackhi_epi8(packed[plane], zero);

    // 16 -> 32 bits.
    __m128i* out = outputs[plane];
    out[0] = _mm_unpacklo_epi16(low_words, zero);
    out[1] = _mm_unpackhi_epi16(low_words, zero);
    out[2] = _mm_unpacklo_epi16(high_words, zero);
    out[3] = _mm_unpackhi_epi16(high_words, zero);
  }
}
895
// Returns 1 when all 16 bytes of transposed[0], transposed[1] and
// transposed[2] equal block[0], block[1] and block[2] respectively, and 0
// otherwise. transposed[3] is not examined.
inline int BlockIsConstant(const uint8_t* block, const __m128i* transposed) {
  for (int plane = 0; plane < 3; ++plane) {
    const __m128i reference = _mm_set1_epi8(block[plane]);
    const __m128i matches = _mm_cmpeq_epi8(transposed[plane], reference);
    // One mask bit per byte: 0xFFFF means all 16 bytes matched.
    if (_mm_movemask_epi8(matches) != 0xFFFF)
      return 0;
  }
  return 1;
}
919
920 inline void CompressSolid(uint8_t* dst, uint8_t* block) {
921 // Clear destination buffer so that we can "or" in the results.
922 memset(dst, 0, 8);
923
924 float src_color_float[3] = {static_cast<float>(block[0]),
925 static_cast<float>(block[1]),
926 static_cast<float>(block[2])};
927 Color base = MakeColor555(src_color_float);
928 Color constant;
929 constant.channels.b = block[0];
930 constant.channels.g = block[1];
931 constant.channels.r = block[2];
932
933 WriteDiff(dst, true);
934 WriteFlip(dst, false);
935 WriteColors555(dst, base, base);
936
937 uint8_t best_tbl_idx = 0;
938 uint8_t best_mod_idx = 0;
939 uint32_t best_mod_err = std::numeric_limits<uint32_t>::max();
940
941 // Try all codeword tables to find the one giving the best results for this
942 // block.
943 for (unsigned int tbl_idx = 0; tbl_idx < 8; ++tbl_idx) {
944 // Try all modifiers in the current table to find which one gives the
945 // smallest error.
946 for (unsigned int mod_idx = 0; mod_idx < 4; ++mod_idx) {
947 int16_t lum = g_codeword_tables[tbl_idx][mod_idx];
948 const Color& color = MakeColor(base, lum);
949
950 uint32_t mod_err = GetColorError(constant, color);
951 if (mod_err < best_mod_err) {
952 best_tbl_idx = tbl_idx;
953 best_mod_idx = mod_idx;
954 best_mod_err = mod_err;
955
956 if (mod_err == 0)
957 break; // We cannot do any better than this.
958 }
959 }
960
961 if (best_mod_err == 0)
962 break;
963 }
964
965 WriteCodewordTable(dst, 0, best_tbl_idx);
966 WriteCodewordTable(dst, 1, best_tbl_idx);
967
968 uint8_t pix_idx = g_mod_to_pix[best_mod_idx];
969 uint32_t lsb = pix_idx & 0x1;
970 uint32_t msb = pix_idx >> 1;
971
972 uint32_t pix_data = 0;
973 for (unsigned int i = 0; i < 2; ++i) {
974 for (unsigned int j = 0; j < 8; ++j) {
975 // Obtain the texel number as specified in the standard.
976 int texel_num = g_idx_to_num[i][j];
977 pix_data |= msb << (texel_num + 16);
978 pix_data |= lsb << (texel_num);
979 }
980 }
981
982 WritePixelData(dst, pix_data);
983 }
984
985 } // namespace
986
987 namespace cc {
988
989 void TextureCompressorETC1_SSE::Compress(const uint8_t* src,
990 uint8_t* dst,
991 int width,
992 int height,
993 Quality quality) {
994 DCHECK(width >= 4 && (width & 3) == 0);
995 DCHECK(height >= 4 && (height & 3) == 0);
996
997 uint8_t block[64] __attribute__((aligned(16)));
998 __m128i packed[4];
999 __m128i red[4], green[4], blue[4], alpha[4];
1000 __sse_data data;
1001
1002 for (int y = 0; y < height; y += 4, src += width * 4 * 4) {
1003 for (int x = 0; x < width; x += 4, dst += 8) {
1004 /* SSE */
1005 LegacyExtractBlock(block, src + x * 4, width);
1006 TransposeBlock(block, packed);
1007 if (BlockIsConstant(block, packed) == 1) {
1008 /* TODO(radu.velea): handle constant blocks in SSE */
1009 CompressSolid(dst, block);
1010 } else {
1011 UnpackBlock(packed, blue, green, red, alpha);
1012
1013 data.block = block;
1014 data.packed = packed;
1015 data.red = red;
1016 data.blue = blue;
1017 data.green = green;
1018
1019 CompressBlock(dst, &data);
1020 }
1021 }
1022 }
1023 }
1024
1025 } // namespace cc
OLDNEW
« cc/resources/texture_compressor.cc ('K') | « cc/resources/texture_compressor_etc1_sse.h ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698