OLD | NEW |
---|---|
(Empty) | |
1 // Copyright 2015 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include "texture_compressor_etc1_sse.h" | |
6 | |
7 #include <assert.h> | |
8 #include <smmintrin.h> | |
9 #include <stdio.h> | |
10 #include <stdlib.h> | |
11 #include <string.h> | |
12 #include <time.h> | |
13 #include <unistd.h> | |
14 | |
15 #include <cmath> | |
16 #include <limits> | |
17 #include <sstream> | |
18 | |
19 #include "base/compiler_specific.h" | |
20 #include "base/logging.h" | |
21 | |
22 // Defining the following macro will cause the error metric function to weigh | |
23 // each color channel differently depending on how the human eye can perceive | |
24 // them. This can give a slight improvement in image quality at the cost of a | |
25 // performance hit. | |
26 // #define USE_PERCEIVED_ERROR_METRIC | |
27 | |
28 namespace { | |
29 | |
// Restricts |val| to the closed range [min, max].
template <typename T>
inline T clamp(T val, T min, T max) {
  return val < min ? min : (val > max ? max : val);
}

// Quantizes a channel value (nominally 0..255) to 5 bits, rounding to the
// nearest step. The result is always within [0, 31].
inline uint8_t round_to_5_bits(float val) {
  // Clamp while the value is still a float: narrowing an out-of-range float
  // to uint8_t is undefined behavior, so the clamp has to precede the cast.
  const float scaled = val * 31.0f / 255.0f + 0.5f;
  return static_cast<uint8_t>(clamp(scaled, 0.0f, 31.0f));
}

// Quantizes a channel value (nominally 0..255) to 4 bits, rounding to the
// nearest step. The result is always within [0, 15].
inline uint8_t round_to_4_bits(float val) {
  // Same ordering as round_to_5_bits: clamp first, then narrow.
  const float scaled = val * 15.0f / 255.0f + 0.5f;
  return static_cast<uint8_t>(clamp(scaled, 0.0f, 15.0f));
}
42 | |
// A 32-bit color that can be viewed as named channels, as a byte array, or as
// one packed word.
43 union Color { | |
// Named byte channels, declared in b, g, r, a order.
44 struct BgraColorType { | |
45 uint8_t b; | |
46 uint8_t g; | |
47 uint8_t r; | |
48 uint8_t a; | |
49 } channels; | |
// The same four bytes viewed as an array.
50 uint8_t components[4]; | |
// The same four bytes viewed as a single 32-bit value.
51 uint32_t bits; | |
52 }; | |
53 | |
// One row per codeword table (8 tables); each row lists its four luminance
// modifiers in ascending order. The array is declared with 16-byte alignment.
54 /* | |
55 * Codeword tables. | |
56 * See: Table 3.17.2 | |
57 */ | |
58 static const int16_t g_codeword_tables[8][4] | |
59 __attribute__((aligned(16))) = {{-8, -2, 2, 8}, | |
60 {-17, -5, 5, 17}, | |
61 {-29, -9, 9, 29}, | |
62 {-42, -13, 13, 42}, | |
63 {-60, -18, 18, 60}, | |
64 {-80, -24, 24, 80}, | |
65 {-106, -33, 33, 106}, | |
66 {-183, -47, 47, 183}}; | |
67 | |
// Indexed by a modifier's position (0..3) within a g_codeword_tables row; the
// value is the 2-bit pixel index that ComputeLuminanceSSE splits into an msb
// and an lsb before packing.
68 /* | |
69 * Maps modifier indices to pixel index values. | |
70 * See: Table 3.17.3 | |
71 */ | |
72 static const uint8_t g_mod_to_pix[4] = {3, 2, 0, 1}; | |
73 | |
// Row selection: CompressBlock passes row 0/1 for the two vertical sub blocks
// and row 2/3 for the two horizontal sub blocks. Each row has one entry per
// texel of the sub block (8 entries); the entry is used as a bit position
// when the pixel indices are packed.
74 /* | |
75 * The ETC1 specification index texels as follows: | |
76 * | |
77 * [a][e][i][m] [ 0][ 4][ 8][12] | |
78 * [b][f][j][n] <-> [ 1][ 5][ 9][13] | |
79 * [c][g][k][o] [ 2][ 6][10][14] | |
80 * [d][h][l][p] [ 3][ 7][11][15] | |
81 * | |
82 * However, when extracting sub blocks from BGRA data the natural array | |
83 * indexing order ends up different: | |
84 * | |
85 * vertical0: [a][e][b][f] horizontal0: [a][e][i][m] | |
86 * [c][g][d][h] [b][f][j][n] | |
87 * vertical1: [i][m][j][n] horizontal1: [c][g][k][o] | |
88 * [k][o][l][p] [d][h][l][p] | |
89 * | |
90 * In order to translate from the natural array indices in a sub block to the | |
91 * indices (number) used by specification and hardware we use this table. | |
92 */ | |
93 static const uint8_t g_idx_to_num[4][8] = { | |
94 {0, 4, 1, 5, 2, 6, 3, 7}, // Vertical block 0. | |
95 {8, 12, 9, 13, 10, 14, 11, 15}, // Vertical block 1. | |
96 {0, 4, 8, 12, 1, 5, 9, 13}, // Horizontal block 0. | |
97 {2, 6, 10, 14, 3, 7, 11, 15} // Horizontal block 1. | |
98 }; | |
99 | |
100 inline void WriteColors444(uint8_t* block, | |
101 const Color& color0, | |
102 const Color& color1) { | |
103 /* 0, 1, 2 - for ARM */ | |
adrian.belgun
2015/04/17 14:02:58
Please check image channel order for input.
This
| |
104 block[2] = (color0.channels.r & 0xf0) | (color1.channels.r >> 4); | |
105 block[1] = (color0.channels.g & 0xf0) | (color1.channels.g >> 4); | |
106 block[0] = (color0.channels.b & 0xf0) | (color1.channels.b >> 4); | |
107 } | |
108 | |
109 inline void WriteColors555(uint8_t* block, | |
110 const Color& color0, | |
111 const Color& color1) { | |
112 // Table for conversion to 3-bit two complement format. | |
113 static const uint8_t two_compl_trans_table[8] = { | |
114 4, // -4 (100b) | |
115 5, // -3 (101b) | |
116 6, // -2 (110b) | |
117 7, // -1 (111b) | |
118 0, // 0 (000b) | |
119 1, // 1 (001b) | |
120 2, // 2 (010b) | |
121 3, // 3 (011b) | |
122 }; | |
123 | |
124 int16_t delta_r = | |
125 static_cast<int16_t>(color1.channels.r >> 3) - (color0.channels.r >> 3); | |
126 int16_t delta_g = | |
127 static_cast<int16_t>(color1.channels.g >> 3) - (color0.channels.g >> 3); | |
128 int16_t delta_b = | |
129 static_cast<int16_t>(color1.channels.b >> 3) - (color0.channels.b >> 3); | |
130 DCHECK(delta_r >= -4 && delta_r <= 3); | |
131 DCHECK(delta_g >= -4 && delta_g <= 3); | |
132 DCHECK(delta_b >= -4 && delta_b <= 3); | |
133 | |
134 /* 0, 1, 2 - for ARM */ | |
adrian.belgun
2015/04/17 14:02:58
Same comments as for :103.
| |
135 block[2] = (color0.channels.r & 0xf8) | two_compl_trans_table[delta_r + 4]; | |
136 block[1] = (color0.channels.g & 0xf8) | two_compl_trans_table[delta_g + 4]; | |
137 block[0] = (color0.channels.b & 0xf8) | two_compl_trans_table[delta_b + 4]; | |
138 } | |
139 | |
140 inline void WriteCodewordTable(uint8_t* block, | |
141 uint8_t sub_block_id, | |
142 uint8_t table) { | |
143 DCHECK_LT(sub_block_id, 2); | |
144 DCHECK_LT(table, 8); | |
145 | |
146 uint8_t shift = (2 + (3 - sub_block_id * 3)); | |
147 block[3] &= ~(0x07 << shift); | |
148 block[3] |= table << shift; | |
149 } | |
150 | |
// ORs the 32-bit |pixel_data| word into block[4..7], most significant byte
// first. Existing bits in those bytes are preserved.
inline void WritePixelData(uint8_t* block, uint32_t pixel_data) {
  for (int byte = 0; byte < 4; ++byte) {
    const int shift = 24 - 8 * byte;
    block[4 + byte] |= static_cast<uint8_t>((pixel_data >> shift) & 0xff);
  }
}
157 | |
// Sets bit 0 of block[3] to |flip|, leaving the other bits unchanged.
inline void WriteFlip(uint8_t* block, bool flip) {
  const uint8_t flip_bit = flip ? 0x01 : 0x00;
  block[3] = static_cast<uint8_t>((block[3] & 0xfe) | flip_bit);
}
162 | |
// Sets bit 1 of block[3] to |diff|, leaving the other bits unchanged.
inline void WriteDiff(uint8_t* block, bool diff) {
  const uint8_t diff_bit = diff ? 0x02 : 0x00;
  block[3] = static_cast<uint8_t>((block[3] & 0xfd) | diff_bit);
}
167 | |
168 /** | |
169 * Compress and rounds BGR888 into BGR444. The resulting BGR444 color is | |
170 * expanded to BGR888 as it would be in hardware after decompression. The | |
171 * actual 444-bit data is available in the four most significant bits of each | |
172 * channel. | |
173 */ | |
174 inline Color MakeColor444(const float* bgr) { | |
175 uint8_t b4 = round_to_4_bits(bgr[0]); | |
176 uint8_t g4 = round_to_4_bits(bgr[1]); | |
177 uint8_t r4 = round_to_4_bits(bgr[2]); | |
178 Color bgr444; | |
179 bgr444.channels.b = (b4 << 4) | b4; | |
180 bgr444.channels.g = (g4 << 4) | g4; | |
181 bgr444.channels.r = (r4 << 4) | r4; | |
182 bgr444.channels.a = 0x44; /* added by Radu */ | |
183 return bgr444; | |
184 } | |
185 | |
186 /** | |
187 * Compress and rounds BGR888 into BGR555. The resulting BGR555 color is | |
188 * expanded to BGR888 as it would be in hardware after decompression. The | |
189 * actual 555-bit data is available in the five most significant bits of each | |
190 * channel. | |
191 */ | |
192 inline Color MakeColor555(const float* bgr) { | |
193 uint8_t b5 = round_to_5_bits(bgr[0]); | |
194 uint8_t g5 = round_to_5_bits(bgr[1]); | |
195 uint8_t r5 = round_to_5_bits(bgr[2]); | |
196 Color bgr555; | |
197 bgr555.channels.b = (b5 << 3) | (b5 >> 2); | |
198 bgr555.channels.g = (g5 << 3) | (g5 >> 2); | |
199 bgr555.channels.r = (r5 << 3) | (r5 >> 2); | |
200 bgr555.channels.a = 0x55; /* added by Radu */ | |
201 return bgr555; | |
202 } | |
203 | |
204 /** | |
205 * Constructs a color from a given base color and luminance value. | |
206 */ | |
207 inline Color MakeColor(const Color& base, int16_t lum) { | |
208 int b = static_cast<int>(base.channels.b) + lum; | |
209 int g = static_cast<int>(base.channels.g) + lum; | |
210 int r = static_cast<int>(base.channels.r) + lum; | |
211 Color color; | |
212 color.channels.b = static_cast<uint8_t>(clamp(b, 0, 255)); | |
213 color.channels.g = static_cast<uint8_t>(clamp(g, 0, 255)); | |
214 color.channels.r = static_cast<uint8_t>(clamp(r, 0, 255)); | |
215 return color; | |
216 } | |
217 | |
218 /** | |
219 * Calculates the error metric for two colors. A small error signals that the | |
220 * colors are similar to each other, a large error the signals the opposite. | |
221 */ | |
222 inline uint32_t GetColorError(const Color& u, const Color& v) { | |
223 #ifdef USE_PERCEIVED_ERROR_METRIC | |
224 float delta_b = static_cast<float>(u.channels.b) - v.channels.b; | |
225 float delta_g = static_cast<float>(u.channels.g) - v.channels.g; | |
226 float delta_r = static_cast<float>(u.channels.r) - v.channels.r; | |
227 return static_cast<uint32_t>(0.299f * delta_b * delta_b + | |
228 0.587f * delta_g * delta_g + | |
229 0.114f * delta_r * delta_r); | |
230 #else | |
231 int delta_b = static_cast<int>(u.channels.b) - v.channels.b; | |
232 int delta_g = static_cast<int>(u.channels.g) - v.channels.g; | |
233 int delta_r = static_cast<int>(u.channels.r) - v.channels.r; | |
234 return delta_b * delta_b + delta_g * delta_g + delta_r * delta_r; | |
235 #endif | |
236 } | |
237 | |
238 /**************************************** START OF SSE CODE | |
adrian.belgun
2015/04/17 14:02:58
Use only one line here. Reduce number of stars.
| |
239 * ***************************************/ | |
240 | |
// Bundle of pointers to the working buffers used while compressing one block.
// The struct only stores the pointers.
// NOTE(review): identifiers that begin with a double underscore are reserved
// for the implementation in C++; consider renaming in a follow-up.
241 struct __sse_data { | |
242 /* raw data */ | |
243 uint8_t* block; | |
244 /* 8 bit packed values */ | |
245 __m128i* packed; | |
246 /* 32 bit zero extended values - 4x4 arrays */ | |
// The helpers in this file index each of these three arrays with 0..3.
247 __m128i* blue; | |
248 __m128i* green; | |
249 __m128i* red; | |
250 // __m128i *alpha; | |
251 }; | |
252 | |
253 /* commonly used registers */ | |
// All four 32-bit lanes set to 0.
254 static const __m128i __sse_zero = _mm_set1_epi32(0); | |
// All four 32-bit lanes set to 0x7FFFFFFF; ComputeLuminanceSSE uses it as the
// starting value of its running per-lane minimums.
255 static const __m128i __sse_max_int = _mm_set1_epi32(0x7FFFFFFF); | |
256 | |
257 inline __m128i AddAndClamp(const __m128i x, const __m128i y) { | |
258 static const __m128i color_max = _mm_set1_epi32(0xFF); | |
259 return _mm_max_epi32(__sse_zero, | |
260 _mm_min_epi32(_mm_add_epi32(x, y), color_max)); | |
261 } | |
262 | |
// Returns the squared difference (x - y)^2 for each of the four 32-bit lanes.
inline __m128i GetColorErrorSSE(const __m128i x, const __m128i y) {
  const __m128i difference = _mm_sub_epi32(x, y);
  const __m128i squared = _mm_mullo_epi32(difference, difference);
  return squared;
}
267 | |
// Returns the lane-wise sum x + y + z of three vectors of 32-bit integers.
inline __m128i AddChannelError(const __m128i x,
                               const __m128i y,
                               const __m128i z) {
  const __m128i y_plus_z = _mm_add_epi32(y, z);
  return _mm_add_epi32(x, y_plus_z);
}
273 /* | |
274 inline void ShuffleImm(__m128i *src, __m128i *dest, int size, uint8_t notimm) { | |
275 switch(notimm) { | |
276 case 0x1B: | |
277 for (int i = 0; i < size; i++) { | |
adrian.belgun
2015/04/17 14:02:58
Braces are optional for single-statement loops. Co
| |
278 dest[i] = _mm_shuffle_epi32(src[i], 0x1B); | |
279 } | |
280 break; | |
281 case 0x4E: | |
282 for (int i = 0; i < size; i++) { | |
283 dest[i] = _mm_shuffle_epi32(src[i], 0x4E); | |
284 } | |
285 break; | |
286 case 0xB1: | |
287 for (int i = 0; i < size; i++) { | |
288 dest[i] = _mm_shuffle_epi32(src[i], 0xB1); | |
289 } | |
290 break; | |
291 case 0xE4: | |
292 for (int i = 0; i < size; i++) { | |
293 dest[i] = _mm_shuffle_epi32(src[i], 0xE4); | |
294 } | |
295 break; | |
296 default: | |
297 for (int i = 0; i < size; i++) { | |
298 dest[i] = src[i]; | |
299 } | |
300 }; | |
301 } | |
302 */ | |
303 inline uint32_t GetVerticalError(const __sse_data* data, | |
304 const __m128i* blue_avg, | |
305 const __m128i* green_avg, | |
306 const __m128i* red_avg) { | |
307 __m128i error = __sse_zero; | |
308 | |
309 #pragma unroll | |
310 for (int i = 0; i < 4; i++) { | |
311 error = _mm_add_epi32(error, GetColorErrorSSE(data->blue[i], blue_avg[0])); | |
312 error = | |
313 _mm_add_epi32(error, GetColorErrorSSE(data->green[i], green_avg[0])); | |
314 error = _mm_add_epi32(error, GetColorErrorSSE(data->red[i], red_avg[0])); | |
315 } | |
316 | |
317 error = _mm_add_epi32(error, _mm_shuffle_epi32(error, 0x4E)); | |
318 error = _mm_add_epi32(error, _mm_shuffle_epi32(error, 0xB1)); | |
319 | |
320 return _mm_cvtsi128_si32(error); | |
321 } | |
322 | |
323 inline uint32_t GetHorizontalError(const __sse_data* data, | |
324 const __m128i* blue_avg, | |
325 const __m128i* green_avg, | |
326 const __m128i* red_avg) { | |
327 __m128i error = __sse_zero; | |
328 int first_index, second_index; | |
329 | |
330 #pragma unroll | |
331 for (int i = 0; i < 2; i++) { | |
332 first_index = 2 * i; | |
333 second_index = first_index + 1; | |
334 | |
335 error = _mm_add_epi32( | |
336 error, GetColorErrorSSE(data->blue[first_index], blue_avg[i])); | |
337 error = _mm_add_epi32( | |
338 error, GetColorErrorSSE(data->blue[second_index], blue_avg[i])); | |
339 error = _mm_add_epi32( | |
340 error, GetColorErrorSSE(data->green[first_index], green_avg[i])); | |
341 error = _mm_add_epi32( | |
342 error, GetColorErrorSSE(data->green[second_index], green_avg[i])); | |
343 error = _mm_add_epi32(error, | |
344 GetColorErrorSSE(data->red[first_index], red_avg[i])); | |
345 error = _mm_add_epi32( | |
346 error, GetColorErrorSSE(data->red[second_index], red_avg[i])); | |
347 } | |
348 | |
349 error = _mm_add_epi32(error, _mm_shuffle_epi32(error, 0x4E)); | |
350 error = _mm_add_epi32(error, _mm_shuffle_epi32(error, 0xB1)); | |
351 return _mm_cvtsi128_si32(error); | |
352 } | |
353 | |
// Computes the average color of every candidate sub block and decides, for
// each split direction, whether the two averages are close enough for
// differential encoding.
//
// data:   channel vectors; red/green/blue[0..3] are read, each holding four
//         32-bit values.
// output: receives 12 floats as four (blue, green, red) triples:
//           [0..2]  average over lanes 0-1 of all four vectors ("v" 0)
//           [3..5]  average over lanes 2-3 of all four vectors ("v" 1)
//           [6..8]  average over all lanes of vectors 0-1      ("h" 0)
//           [9..11] average over all lanes of vectors 2-3      ("h" 1)
//         CompressBlock treats the "v" triples as the vertical split and the
//         "h" triples as the horizontal split.
// __sse_use_diff: receives two flags, [0] for the "v" pair and [1] for the
//         "h" pair. A flag is true when, after quantizing both averages to
//         5 bits, every channel delta (second minus first) lies in [-4, 3].
354 inline void GetAvgColors(const __sse_data* data, | |
355 float* output, | |
356 bool* __sse_use_diff) { | |
357 __m128i sum[2], tmp; | |
358 | |
359 // TODO(radu.velea): _mm_avg_epu8 on packed data maybe | |
360 | |
361 /* get avg red */ | |
362 /* [S0 S0 S1 S1] */ | |
// Shuffle 0xB1 swaps neighbouring lanes, so after the second add each lane
// pair holds the sum of that pair across vectors 0 and 1.
363 sum[0] = _mm_add_epi32(data->red[0], data->red[1]); | |
364 sum[0] = _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0xB1)); | |
365 | |
366 /* [S2 S2 S3 S3] */ | |
367 sum[1] = _mm_add_epi32(data->red[2], data->red[3]); | |
368 sum[1] = _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0xB1)); | |
369 | |
// Shuffle 0x4E swaps the 64-bit halves; adding it gives the total of all eight
// values of the two vectors in lane 0, hence the division by 8.
370 float hred[2], vred[2]; | |
371 hred[0] = (_mm_cvtsi128_si32( | |
372 _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0x4E)))) / | |
373 8.0f; | |
374 hred[1] = (_mm_cvtsi128_si32( | |
375 _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0x4E)))) / | |
376 8.0f; | |
377 | |
// Lane 0 of tmp is the sum of lanes 0-1 over all four vectors, lane 2 the sum
// of lanes 2-3 over all four vectors (eight values each).
378 tmp = _mm_add_epi32(sum[0], sum[1]); | |
379 vred[0] = (_mm_cvtsi128_si32(tmp)) / 8.0f; | |
380 vred[1] = (_mm_extract_epi32(tmp, 2)) / 8.0f; | |
381 | |
// Green and blue below repeat the exact sequence used for red.
382 /* get avg green */ | |
383 /* [S0 S0 S1 S1] */ | |
384 sum[0] = _mm_add_epi32(data->green[0], data->green[1]); | |
385 sum[0] = _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0xB1)); | |
386 | |
387 /* [S2 S2 S3 S3] */ | |
388 sum[1] = _mm_add_epi32(data->green[2], data->green[3]); | |
389 sum[1] = _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0xB1)); | |
390 | |
391 float hgreen[2], vgreen[2]; | |
392 hgreen[0] = (_mm_cvtsi128_si32( | |
393 _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0x4E)))) / | |
394 8.0f; | |
395 hgreen[1] = (_mm_cvtsi128_si32( | |
396 _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0x4E)))) / | |
397 8.0f; | |
398 | |
399 tmp = _mm_add_epi32(sum[0], sum[1]); | |
400 vgreen[0] = (_mm_cvtsi128_si32(tmp)) / 8.0f; | |
401 vgreen[1] = (_mm_extract_epi32(tmp, 2)) / 8.0f; | |
402 | |
403 /* get avg blue */ | |
404 /* [S0 S0 S1 S1] */ | |
405 sum[0] = _mm_add_epi32(data->blue[0], data->blue[1]); | |
406 sum[0] = _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0xB1)); | |
407 | |
408 /* [S2 S2 S3 S3] */ | |
409 sum[1] = _mm_add_epi32(data->blue[2], data->blue[3]); | |
410 sum[1] = _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0xB1)); | |
411 | |
412 float hblue[2], vblue[2]; | |
413 hblue[0] = (_mm_cvtsi128_si32( | |
414 _mm_add_epi32(sum[0], _mm_shuffle_epi32(sum[0], 0x4E)))) / | |
415 8.0f; | |
416 hblue[1] = (_mm_cvtsi128_si32( | |
417 _mm_add_epi32(sum[1], _mm_shuffle_epi32(sum[1], 0x4E)))) / | |
418 8.0f; | |
419 | |
420 tmp = _mm_add_epi32(sum[0], sum[1]); | |
421 vblue[0] = (_mm_cvtsi128_si32(tmp)) / 8.0f; | |
422 vblue[1] = (_mm_extract_epi32(tmp, 2)) / 8.0f; | |
423 | |
// Publish the averages in the layout described in the header comment.
424 /* TODO(radu.velea): return int's instead of floats */ | |
425 output[0] = vblue[0]; | |
426 output[1] = vgreen[0]; | |
427 output[2] = vred[0]; | |
428 | |
429 output[3] = vblue[1]; | |
430 output[4] = vgreen[1]; | |
431 output[5] = vred[1]; | |
432 | |
433 output[6] = hblue[0]; | |
434 output[7] = hgreen[0]; | |
435 output[8] = hred[0]; | |
436 | |
437 output[9] = hblue[1]; | |
438 output[10] = hgreen[1]; | |
439 output[11] = hred[1]; | |
440 | |
// Inclusive bounds of the delta range accepted for differential encoding.
441 __m128i threashhold_upper = _mm_set1_epi32(3); | |
442 __m128i threashhold_lower = _mm_set1_epi32(-4); | |
443 | |
// Quantize all averages to 5 bits at once: value * 31 / 255 + 0.5, truncated.
// The lowest lane of every vector is set to 0, so its delta is always 0 and
// can never fail the range test below.
444 __m128 factor_v = _mm_set1_ps(31.0f / 255.0f); | |
445 __m128 rounding_v = _mm_set1_ps(0.5f); | |
446 __m128 h_avg_0 = _mm_set_ps(hblue[0], hgreen[0], hred[0], 0); | |
447 __m128 h_avg_1 = _mm_set_ps(hblue[1], hgreen[1], hred[1], 0); | |
448 | |
449 __m128 v_avg_0 = _mm_set_ps(vblue[0], vgreen[0], vred[0], 0); | |
450 __m128 v_avg_1 = _mm_set_ps(vblue[1], vgreen[1], vred[1], 0); | |
451 | |
452 h_avg_0 = _mm_mul_ps(h_avg_0, factor_v); | |
453 h_avg_1 = _mm_mul_ps(h_avg_1, factor_v); | |
454 v_avg_0 = _mm_mul_ps(v_avg_0, factor_v); | |
455 v_avg_1 = _mm_mul_ps(v_avg_1, factor_v); | |
456 | |
457 h_avg_0 = _mm_add_ps(h_avg_0, rounding_v); | |
458 h_avg_1 = _mm_add_ps(h_avg_1, rounding_v); | |
459 v_avg_0 = _mm_add_ps(v_avg_0, rounding_v); | |
460 v_avg_1 = _mm_add_ps(v_avg_1, rounding_v); | |
461 | |
// Truncating conversion to 32-bit integers.
462 __m128i h_avg_0i = _mm_cvttps_epi32(h_avg_0); | |
463 __m128i h_avg_1i = _mm_cvttps_epi32(h_avg_1); | |
464 | |
465 __m128i v_avg_0i = _mm_cvttps_epi32(v_avg_0); | |
466 __m128i v_avg_1i = _mm_cvttps_epi32(v_avg_1); | |
467 | |
// From here on h_avg_0i / v_avg_0i hold the per-channel deltas
// (second sub block minus first sub block).
468 h_avg_0i = _mm_sub_epi32(h_avg_1i, h_avg_0i); | |
469 v_avg_0i = _mm_sub_epi32(v_avg_1i, v_avg_0i); | |
470 | |
// A zero movemask means no lane satisfied the comparison, i.e. no delta is
// below -4 (first test) or above 3 (second test).
471 __sse_use_diff[0] = | |
472 (0 == _mm_movemask_epi8(_mm_cmplt_epi32(v_avg_0i, threashhold_lower))); | |
473 __sse_use_diff[0] &= | |
474 (0 == _mm_movemask_epi8(_mm_cmpgt_epi32(v_avg_0i, threashhold_upper))); | |
475 | |
476 __sse_use_diff[1] = | |
477 (0 == _mm_movemask_epi8(_mm_cmplt_epi32(h_avg_0i, threashhold_lower))); | |
478 __sse_use_diff[1] &= | |
479 (0 == _mm_movemask_epi8(_mm_cmpgt_epi32(h_avg_0i, threashhold_upper))); | |
480 } | |
481 | |
// Chooses the codeword table and the per-texel modifiers for one sub block and
// writes both into |block|.
//
// For each of the 8 codeword tables, every texel of the sub block is paired
// with the modifier that gives the smallest squared blue+green+red error; the
// table whose summed error is smallest wins. On ties the lower table index is
// kept, and the search stops as soon as a table with error 0 is found.
//
// block:          destination; the table index goes to block[3] through
//                 WriteCodewordTable and the pixel bits are ORed into
//                 block[4..7] through WritePixelData.
// base:           base color that the table modifiers are added to.
// sub_block_id:   0 or 1; selects vectors [2 * id] and [2 * id + 1] of each
//                 channel in |data| (8 texels).
// idx_to_num_tab: 8 entries mapping a texel's position in the sub block to
//                 the bit position used when packing its pixel index.
// data:           channel vectors of the block (read only).
482 void ComputeLuminanceSSE(uint8_t* block, | |
483 const Color& base, | |
484 const int sub_block_id, | |
485 const uint8_t* idx_to_num_tab, | |
486 const __sse_data* data) { | |
487 uint8_t my_best_tbl_idx = 0; | |
488 uint32_t my_best_error = 0x7FFFFFFF; | |
// Only the row of the currently best table is ever filled in and read back.
489 uint8_t my_best_mod_idx[8][8]; // [table][texel] | |
490 | |
491 const __m128i base_blue = _mm_set1_epi32(base.channels.b); | |
492 const __m128i base_green = _mm_set1_epi32(base.channels.g); | |
493 const __m128i base_red = _mm_set1_epi32(base.channels.r); | |
494 | |
495 __m128i test_red, test_blue, test_green, tmp, tmp_blue, tmp_green, tmp_red; | |
496 __m128i block_error; | |
497 | |
498 /* this will have the minimum errors for each 4 pixels */ | |
499 __m128i first_half_min; | |
500 __m128i second_half_min; | |
501 | |
502 /* this will have the matching table index combo for each 4 pixels */ | |
503 __m128i first_half_pattern; | |
504 __m128i second_half_pattern; | |
505 | |
506 const __m128i first_blue_data_block = data->blue[2 * sub_block_id]; | |
507 const __m128i first_green_data_block = data->green[2 * sub_block_id]; | |
508 const __m128i first_red_data_block = data->red[2 * sub_block_id]; | |
509 | |
510 const __m128i second_blue_data_block = data->blue[2 * sub_block_id + 1]; | |
511 const __m128i second_green_data_block = data->green[2 * sub_block_id + 1]; | |
512 const __m128i second_red_data_block = data->red[2 * sub_block_id + 1]; | |
513 | |
514 uint32_t min; | |
515 | |
// Byte indices of the four 32-bit lanes of a vector (lane 0 .. lane 3), listed
// high byte first as _mm_set_epi8 expects. These macros are not #undef'd.
516 #define ELEMENT_1 3, 2, 1, 0 | |
517 #define ELEMENT_2 7, 6, 5, 4 | |
518 #define ELEMENT_3 11, 10, 9, 8 | |
519 #define ELEMENT_4 15, 14, 13, 12 | |
520 | |
// Four byte-shuffle masks that permute the four candidate colors so that,
// over the four iterations, every lane gets to see every modifier once.
521 static const __m128i mask_extended[4] = { | |
522 _mm_set_epi8(ELEMENT_1, ELEMENT_2, ELEMENT_3, ELEMENT_4), | |
523 _mm_set_epi8(ELEMENT_2, ELEMENT_1, ELEMENT_4, ELEMENT_3), | |
524 _mm_set_epi8(ELEMENT_3, ELEMENT_4, ELEMENT_1, ELEMENT_2), | |
525 _mm_set_epi8(ELEMENT_4, ELEMENT_3, ELEMENT_2, ELEMENT_1)}; | |
526 | |
// Companion codes for mask_extended: in code i, bits (2 * lane, 2 * lane + 1)
// are the modifier index that mask_extended[i] places in that lane. The codes
// are strictly increasing with i, which the max-based update below relies on.
527 static const __m128i mask_imm[4] = {_mm_set1_epi32(0x1B), | |
528 _mm_set1_epi32(0x4E), | |
529 _mm_set1_epi32(0xB1), | |
530 _mm_set1_epi32(0xE4)}; | |
531 | |
532 for (unsigned int tbl_idx = 0; tbl_idx < 8; ++tbl_idx) { | |
// Lane k of tmp holds modifier k of this table.
533 tmp = _mm_set_epi32( | |
534 g_codeword_tables[tbl_idx][3], g_codeword_tables[tbl_idx][2], | |
535 g_codeword_tables[tbl_idx][1], g_codeword_tables[tbl_idx][0]); | |
536 | |
// The four candidate colors of this table, one per lane, clamped to [0, 255].
537 test_blue = AddAndClamp(tmp, base_blue); | |
538 test_green = AddAndClamp(tmp, base_green); | |
539 test_red = AddAndClamp(tmp, base_red); | |
540 | |
541 first_half_min = __sse_max_int; | |
542 second_half_min = __sse_max_int; | |
543 | |
544 first_half_pattern = __sse_zero; | |
545 second_half_pattern = __sse_zero; | |
546 | |
547 #pragma unroll | |
548 for (int i = 0; i < 4; i++) { | |
549 tmp_blue = _mm_shuffle_epi8(test_blue, mask_extended[i]); | |
550 tmp_green = _mm_shuffle_epi8(test_green, mask_extended[i]); | |
551 tmp_red = _mm_shuffle_epi8(test_red, mask_extended[i]); | |
552 | |
553 block_error = | |
554 AddChannelError(GetColorErrorSSE(tmp_blue, first_blue_data_block), | |
555 GetColorErrorSSE(tmp_green, first_green_data_block), | |
556 GetColorErrorSSE(tmp_red, first_red_data_block)); | |
557 | |
// In lanes where this permutation strictly improved the minimum, the AND
// yields mask_imm[i]; elsewhere it yields 0. Because later codes are larger,
// the max keeps the code of the most recent improvement for each lane.
558 /* save winning pattern */ | |
559 first_half_pattern = _mm_max_epi32( | |
560 first_half_pattern, | |
561 _mm_and_si128(mask_imm[i], | |
562 _mm_cmpgt_epi32(first_half_min, block_error))); | |
563 first_half_min = _mm_min_epi32(first_half_min, block_error); | |
564 | |
565 /* Second part of the block */ | |
566 block_error = | |
567 AddChannelError(GetColorErrorSSE(tmp_blue, second_blue_data_block), | |
568 GetColorErrorSSE(tmp_green, second_green_data_block), | |
569 GetColorErrorSSE(tmp_red, second_red_data_block)); | |
570 | |
571 /* save winning pattern */ | |
572 second_half_pattern = _mm_max_epi32( | |
573 second_half_pattern, | |
574 _mm_and_si128(mask_imm[i], | |
575 _mm_cmpgt_epi32(second_half_min, block_error))); | |
576 second_half_min = _mm_min_epi32(second_half_min, block_error); | |
577 } | |
578 | |
// Sum the eight per-texel minimums into lane 0: add both halves, then reduce
// across lanes with the 0x4E (swap 64-bit halves) and 0xB1 (swap neighbours)
// shuffles.
579 first_half_min = _mm_add_epi32(first_half_min, second_half_min); | |
580 first_half_min = | |
581 _mm_add_epi32(first_half_min, _mm_shuffle_epi32(first_half_min, 0x4E)); | |
582 first_half_min = | |
583 _mm_add_epi32(first_half_min, _mm_shuffle_epi32(first_half_min, 0xB1)); | |
584 | |
585 min = _mm_cvtsi128_si32(first_half_min); | |
586 | |
587 if (min < my_best_error) { | |
588 my_best_tbl_idx = tbl_idx; | |
589 my_best_error = min; | |
// NOTE(review): O3_OPTIMIZATION is not defined anywhere in the visible source,
// so this loop is presumably compiled out; the unrolled statements after the
// #endif compute the same eight values.
590 #if O3_OPTIMIZATION | |
591 #pragma unroll | |
592 for (int i = 0; i < 4; i++) { | |
593 my_best_mod_idx[tbl_idx][i] = | |
594 (_mm_extract_epi32(first_half_pattern, i) >> (2 * i)) & 3; | |
595 my_best_mod_idx[tbl_idx][i + 4] = | |
596 (_mm_extract_epi32(second_half_pattern, i) >> (2 * i)) & 3; | |
597 } | |
598 #endif | |
// Decode the winning code of each lane: lane k keeps its modifier index in
// bits (2k, 2k + 1). Texels 0..3 come from the first vector, 4..7 from the
// second.
599 my_best_mod_idx[tbl_idx][0] = | |
600 (_mm_extract_epi32(first_half_pattern, 0) >> (0)) & 3; | |
601 my_best_mod_idx[tbl_idx][4] = | |
602 (_mm_extract_epi32(second_half_pattern, 0) >> (0)) & 3; | |
603 | |
604 my_best_mod_idx[tbl_idx][1] = | |
605 (_mm_extract_epi32(first_half_pattern, 1) >> (2)) & 3; | |
606 my_best_mod_idx[tbl_idx][5] = | |
607 (_mm_extract_epi32(second_half_pattern, 1) >> (2)) & 3; | |
608 | |
609 my_best_mod_idx[tbl_idx][2] = | |
610 (_mm_extract_epi32(first_half_pattern, 2) >> (4)) & 3; | |
611 my_best_mod_idx[tbl_idx][6] = | |
612 (_mm_extract_epi32(second_half_pattern, 2) >> (4)) & 3; | |
613 | |
614 my_best_mod_idx[tbl_idx][3] = | |
615 (_mm_extract_epi32(first_half_pattern, 3) >> (6)) & 3; | |
616 my_best_mod_idx[tbl_idx][7] = | |
617 (_mm_extract_epi32(second_half_pattern, 3) >> (6)) & 3; | |
618 | |
// A perfect match cannot be improved on, so stop searching.
619 if (my_best_error == 0) { | |
620 break; | |
621 } | |
622 } | |
623 } | |
624 | |
625 WriteCodewordTable(block, sub_block_id, my_best_tbl_idx); | |
626 | |
627 uint32_t pix_data = 0; | |
628 uint8_t mod_idx; | |
629 uint8_t pix_idx; | |
630 uint32_t lsb; | |
631 uint32_t msb; | |
632 int texel_num; | |
633 | |
// Pack the 2-bit pixel index of every texel: its low bit goes to bit
// texel_num of pix_data and its high bit to bit texel_num + 16.
634 for (unsigned int i = 0; i < 8; ++i) { | |
635 mod_idx = my_best_mod_idx[my_best_tbl_idx][i]; | |
636 pix_idx = g_mod_to_pix[mod_idx]; | |
637 | |
638 lsb = pix_idx & 0x1; | |
639 msb = pix_idx >> 1; | |
640 | |
641 // Obtain the texel number as specified in the standard. | |
642 texel_num = idx_to_num_tab[i]; | |
643 pix_data |= msb << (texel_num + 16); | |
644 pix_data |= lsb << (texel_num); | |
645 } | |
646 | |
647 WritePixelData(block, pix_data); | |
648 } | |
649 | |
// Encodes one block into the 8 bytes at |dst|.
//
// Steps: compute the sub block averages for both split directions, quantize
// them (555 when differential encoding is possible for that direction, 444
// otherwise), pick the direction with the smaller squared error against the
// averages, write the diff/flip bits and base colors, then let
// ComputeLuminanceSSE fill in the codeword tables and pixel indices for both
// sub blocks.
//
// dst:  destination; its first 8 bytes are cleared and then written.
// data: channel vectors of the block. NOTE: when the vertical split is chosen
//       (flip == false) the blue/green/red vectors are rearranged in place.
650 void CompressBlock(uint8_t* dst, __sse_data* data) { | |
651 /* first 3 vertical 1, seconds 3 vertical 2, third 3 horizontal 1, last 3 | |
652 * horizontal 2 */ | |
653 float __sse_avg_colors[12] = { | |
654 0, | |
655 }; | |
// use_differential[0] applies to the vertical split, [1] to the horizontal.
656 bool use_differential[2] = {true, true}; | |
657 GetAvgColors(data, __sse_avg_colors, use_differential); | |
// Quantized averages: [0], [1] are the vertical pair, [2], [3] the horizontal.
658 Color sub_block_avg[4]; | |
659 | |
660 /* TODO(radu.velea): remove floating point operations and use only int's + | |
661 * normal | |
662 * rounding and shifts */ | |
// Each iteration handles one split direction (i / 2) and both of its sub
// blocks (i and j = i + 1).
663 for (int i = 0, j = 1; i < 4; i += 2, j += 2) { | |
664 if (use_differential[i / 2] == false) { | |
665 sub_block_avg[i] = MakeColor444(&__sse_avg_colors[i * 3]); | |
666 sub_block_avg[j] = MakeColor444(&__sse_avg_colors[j * 3]); | |
667 } else { | |
668 sub_block_avg[i] = MakeColor555(&__sse_avg_colors[i * 3]); | |
669 sub_block_avg[j] = MakeColor555(&__sse_avg_colors[j * 3]); | |
670 } | |
671 } | |
672 | |
673 __m128i red_avg[2], green_avg[2], blue_avg[2]; | |
674 | |
// Vertical averages, truncated to int: lanes 0-1 carry the first sub block's
// average and lanes 2-3 the second's (_mm_set_epi32 lists the highest lane
// first).
675 // TODO(radu.velea): perfect accuracy, maybe skip floating variables | |
676 blue_avg[0] = | |
677 _mm_set_epi32((int)__sse_avg_colors[3], (int)__sse_avg_colors[3], | |
678 (int)__sse_avg_colors[0], (int)__sse_avg_colors[0]); | |
679 | |
680 green_avg[0] = | |
681 _mm_set_epi32((int)__sse_avg_colors[4], (int)__sse_avg_colors[4], | |
682 (int)__sse_avg_colors[1], (int)__sse_avg_colors[1]); | |
683 | |
684 red_avg[0] = | |
685 _mm_set_epi32((int)__sse_avg_colors[5], (int)__sse_avg_colors[5], | |
686 (int)__sse_avg_colors[2], (int)__sse_avg_colors[2]); | |
687 | |
688 uint32_t vertical_error = | |
689 GetVerticalError(data, blue_avg, green_avg, red_avg); | |
690 | |
// Horizontal averages, truncated to int: element [0] is broadcast from the
// first sub block's average and element [1] from the second's.
691 // TODO(radu.velea): perfect accuracy, maybe skip floating variables | |
692 blue_avg[0] = _mm_set1_epi32((int)__sse_avg_colors[6]); | |
693 blue_avg[1] = _mm_set1_epi32((int)__sse_avg_colors[9]); | |
694 | |
695 green_avg[0] = _mm_set1_epi32((int)__sse_avg_colors[7]); | |
696 green_avg[1] = _mm_set1_epi32((int)__sse_avg_colors[10]); | |
697 | |
698 red_avg[0] = _mm_set1_epi32((int)__sse_avg_colors[8]); | |
699 red_avg[1] = _mm_set1_epi32((int)__sse_avg_colors[11]); | |
700 | |
701 uint32_t horizontal_error = | |
702 GetHorizontalError(data, blue_avg, green_avg, red_avg); | |
703 | |
// flip == true selects the horizontal split; on a tie the vertical split is
// kept.
704 bool flip = horizontal_error < vertical_error; | |
705 | |
706 // Clear destination buffer so that we can "or" in the results. | |
707 memset(dst, 0, 8); | |
708 | |
709 WriteDiff(dst, use_differential[!!flip]); | |
710 WriteFlip(dst, flip); | |
711 | |
// Index of the chosen direction's first entry in sub_block_avg and of its row
// pair in g_idx_to_num.
712 uint8_t sub_block_off_0 = flip ? 2 : 0; | |
713 uint8_t sub_block_off_1 = sub_block_off_0 + 1; | |
714 | |
715 if (use_differential[!!flip]) { | |
716 WriteColors555(dst, sub_block_avg[sub_block_off_0], | |
717 sub_block_avg[sub_block_off_1]); | |
718 } else { | |
719 WriteColors444(dst, sub_block_avg[sub_block_off_0], | |
720 sub_block_avg[sub_block_off_1]); | |
721 } | |
722 | |
723 if (flip == false) { | |
// ComputeLuminanceSSE reads vectors [2 * id] and [2 * id + 1] for sub block
// id. For the vertical split the data is regrouped so that vectors 0-1 hold
// lanes 0-1 of all four original vectors and vectors 2-3 hold lanes 2-3.
724 /* transpose vertical data into horizontal lines */ | |
725 __m128i tmp; | |
726 #pragma unroll | |
727 for (int i = 0; i < 4; i += 2) { | |
// _mm_move_epi64 keeps the low 64 bits and zeroes the high 64 bits, so each
// add below merges two disjoint halves: vector i becomes
// [low half of i | low half of i + 1] and vector i + 1 becomes
// [high half of i | high half of i + 1].
728 tmp = data->blue[i]; | |
729 data->blue[i] = _mm_add_epi32( | |
730 _mm_move_epi64(data->blue[i]), | |
731 _mm_shuffle_epi32(_mm_move_epi64(data->blue[i + 1]), 0x4E)); | |
732 data->blue[i + 1] = _mm_add_epi32( | |
733 _mm_move_epi64(_mm_shuffle_epi32(tmp, 0x4E)), | |
734 _mm_shuffle_epi32( | |
735 _mm_move_epi64(_mm_shuffle_epi32(data->blue[i + 1], 0x4E)), | |
736 0x4E)); | |
737 | |
738 tmp = data->green[i]; | |
739 data->green[i] = _mm_add_epi32( | |
740 _mm_move_epi64(data->green[i]), | |
741 _mm_shuffle_epi32(_mm_move_epi64(data->green[i + 1]), 0x4E)); | |
742 data->green[i + 1] = _mm_add_epi32( | |
743 _mm_move_epi64(_mm_shuffle_epi32(tmp, 0x4E)), | |
744 _mm_shuffle_epi32( | |
745 _mm_move_epi64(_mm_shuffle_epi32(data->green[i + 1], 0x4E)), | |
746 0x4E)); | |
747 | |
748 tmp = data->red[i]; | |
749 data->red[i] = _mm_add_epi32( | |
750 _mm_move_epi64(data->red[i]), | |
751 _mm_shuffle_epi32(_mm_move_epi64(data->red[i + 1]), 0x4E)); | |
752 data->red[i + 1] = _mm_add_epi32( | |
753 _mm_move_epi64(_mm_shuffle_epi32(tmp, 0x4E)), | |
754 _mm_shuffle_epi32( | |
755 _mm_move_epi64(_mm_shuffle_epi32(data->red[i + 1], 0x4E)), 0x4E)); | |
756 } | |
757 | |
// Swap vectors 1 and 2 so that both "low half" vectors come first.
758 tmp = data->blue[1]; | |
759 data->blue[1] = data->blue[2]; | |
760 data->blue[2] = tmp; | |
761 | |
762 tmp = data->green[1]; | |
763 data->green[1] = data->green[2]; | |
764 data->green[2] = tmp; | |
765 | |
766 tmp = data->red[1]; | |
767 data->red[1] = data->red[2]; | |
768 data->red[2] = tmp; | |
769 } | |
770 | |
771 // Compute luminance for the first sub block. | |
772 ComputeLuminanceSSE(dst, sub_block_avg[sub_block_off_0], 0, | |
773 g_idx_to_num[sub_block_off_0], data); | |
774 // Compute luminance for the second sub block. | |
775 ComputeLuminanceSSE(dst, sub_block_avg[sub_block_off_1], 1, | |
776 g_idx_to_num[sub_block_off_1], data); | |
777 } | |
778 | |
// Copies a 4x4 block of 4-byte pixels out of a larger image into a contiguous
// 64-byte buffer. |src| points at the block's first pixel, |width| is the
// image width in pixels, and |dst| receives the four 16-byte rows back to
// back.
static void LegacyExtractBlock(uint8_t* dst, const uint8_t* src, int width) {
  const int kRowBytes = 4 * 4;
  const uint8_t* src_row = src;
  uint8_t* dst_row = dst;
  for (int remaining = 4; remaining > 0; --remaining) {
    memcpy(dst_row, src_row, kRowBytes);
    dst_row += kRowBytes;
    src_row += width * 4;
  }
}
785 | |
// Rearranges 64 bytes of interleaved pixel data into four planar vectors.
//
// block:      64 readable bytes, loaded as four unaligned 16-byte rows (called
//             a, b, c, d in the lane comments below); |block| is not written.
// transposed: array of 4 vectors. On return transposed[k] holds byte k of
//             every 4-byte group, ordered row a, then b, c, d, with four
//             groups per row (for example transposed[0] = a0, a4, a8, a12,
//             b0, b4, ...).
786 inline void TransposeBlock(uint8_t* block, __m128i* transposed /* [4] */) { | |
787 __m128i tmp3, tmp2, tmp1, tmp0; | |
788 | |
adrian.belgun
2015/04/17 14:02:58
I think something went wrong with 'git cl format'
| |
789 transposed[0] = _mm_loadu_si128((__m128i*)(block)); // a0,a1,a2,...a7, ...a15 | |
790 transposed[1] = | |
791 _mm_loadu_si128((__m128i*)(block + 16)); // b0, b1,b2,...b7.... b15 | |
792 transposed[2] = | |
793 _mm_loadu_si128((__m128i*)(block + 32)); // c0, c1,c2,...c7....c15 | |
794 transposed[3] = | |
795 _mm_loadu_si128((__m128i*)(block + 48)); // d0,d1,d2,...d7....d15 | |
796 | |
// Round 1: interleave the bytes of rows a/b and of rows c/d.
797 tmp0 = _mm_unpacklo_epi8( | |
798 transposed[0], transposed[1]); // a0,b0, a1,b1, a2,b2, a3,b3,....a7,b7 | |
799 tmp1 = _mm_unpacklo_epi8( | |
800 transposed[2], transposed[3]); // c0,d0, c1,d1, c2,d2, c3,d3,... c7,d7 | |
801 tmp2 = _mm_unpackhi_epi8( | |
802 transposed[0], | |
803 transposed[1]); // a8,b8, a9,b9, a10,b10, a11,b11,...a15,b15 | |
804 tmp3 = _mm_unpackhi_epi8( | |
805 transposed[2], | |
806 transposed[3]); // c8,d8, c9,d9, c10,d10, c11,d11,...c15,d15 | |
807 | |
// Round 2: interleave the low and high halves produced by round 1.
808 transposed[0] = _mm_unpacklo_epi8( | |
809 tmp0, tmp2); // a0,a8, b0,b8, a1,a9, b1,b9, ....a3,a11, b3,b11 | |
810 transposed[1] = _mm_unpackhi_epi8( | |
811 tmp0, tmp2); // a4,a12, b4,b12, a5,a13, b5,b13,....a7,a15,b7,b15 | |
812 transposed[2] = | |
813 _mm_unpacklo_epi8(tmp1, tmp3); // c0,c8, d0,d8, c1,c9, d1,d9.....d3,d11 | |
814 transposed[3] = _mm_unpackhi_epi8( | |
815 tmp1, tmp3); // c4,c12,d4,d12, c5,c13, d5,d13,....d7,d15 | |
816 | |
// Round 3: interleave 32-bit groups so the a/b and c/d parts sit side by side.
817 tmp0 = _mm_unpacklo_epi32(transposed[0], transposed[2]); // a0,a8, b0,b8, | |
818 // c0,c8, d0,d8, | |
819 // a1,a9, b1,b9, | |
820 // c1,c9, d1,d9 | |
821 tmp1 = _mm_unpackhi_epi32(transposed[0], transposed[2]); // a2,a10, b2,b10, | |
822 // c2,c10, d2,d10, | |
823 // a3,a11, b3,b11, | |
824 // c3,c11, d3,d11 | |
825 tmp2 = _mm_unpacklo_epi32(transposed[1], transposed[3]); // a4,a12, b4,b12, | |
826 // c4,c12, d4,d12, | |
827 // a5,a13, b5,b13, | |
828 // c5,c13, d5,d13, | |
829 tmp3 = _mm_unpackhi_epi32(transposed[1], | |
830 transposed[3]); // a6,a14, b6,b14, c6,c14, d6,d14, | |
831 // a7,a15,b7,b15,c7,c15,d7,d15 | |
832 | |
// Round 4: a final byte interleave leaves one byte position per output vector.
833 transposed[0] = _mm_unpacklo_epi8(tmp0, tmp2); // a0,a4, a8, a12, b0,b4, | |
834 // b8,b12, c0,c4, c8, c12, | |
835 // d0,d4, d8, d12 | |
836 transposed[1] = _mm_unpackhi_epi8(tmp0, tmp2); // a1,a5, a9, a13, b1,b5, | |
837 // b9,b13, c1,c5, c9, c13, | |
838 // d1,d5, d9, d13 | |
839 transposed[2] = _mm_unpacklo_epi8(tmp1, tmp3); // a2,a6, a10,a14, b2,b6, | |
840 // b10,b14, c2,c6, c10,c14, | |
841 // d2,d6, d10,d14 | |
842 transposed[3] = _mm_unpackhi_epi8(tmp1, tmp3); // a3,a7, a11,a15, b3,b7, | |
843 // b11,b15, c3,c7, c11,c15, | |
844 // d3,d7, d11,d15 | |
845 } | |
846 | |
// Zero-extends the sixteen 8-bit values held in each of the four registers of
// |packed| to 32-bit lanes.
//
// packed[0] is expanded into |red|, packed[1] into |green|, packed[2] into
// |blue| and packed[3] into |alpha|. Every output array receives four
// registers: element 0 holds input bytes 0-3, element 1 bytes 4-7, element 2
// bytes 8-11 and element 3 bytes 12-15.
inline void UnpackBlock(__m128i* packed,
                        __m128i* red,
                        __m128i* green,
                        __m128i* blue,
                        __m128i* alpha) {
  const __m128i zero = _mm_set1_epi8(0);
  __m128i* const outputs[4] = {red, green, blue, alpha};

  for (int channel = 0; channel < 4; ++channel) {
    const __m128i source = packed[channel];
    __m128i* const widened = outputs[channel];

    // First step: interleave with zero to go from 8-bit to 16-bit values.
    const __m128i lower_words = _mm_unpacklo_epi8(source, zero);
    const __m128i upper_words = _mm_unpackhi_epi8(source, zero);

    // Second step: interleave with zero again to reach 32-bit lanes.
    widened[0] = _mm_unpacklo_epi16(lower_words, zero);
    widened[1] = _mm_unpackhi_epi16(lower_words, zero);
    widened[2] = _mm_unpacklo_epi16(upper_words, zero);
    widened[3] = _mm_unpackhi_epi16(upper_words, zero);
  }
}
895 | |
// Reports whether the first three channel planes of a block are uniform.
//
// For each channel c in 0..2, all sixteen bytes of transposed[c] are compared
// against block[c]. Returns 1 when every byte matches in all three planes and
// 0 as soon as a plane contains a differing byte. transposed[3] is never
// examined.
inline int BlockIsConstant(const uint8_t* block, const __m128i* transposed) {
  for (int channel = 0; channel < 3; ++channel) {
    const __m128i reference = _mm_set1_epi8(block[channel]);
    const __m128i matches = _mm_cmpeq_epi8(transposed[channel], reference);

    // movemask gathers one bit per byte; 0xFFFF means all sixteen compared
    // equal.
    if (_mm_movemask_epi8(matches) != 0xFFFF)
      return 0;
  }

  return 1;
}
919 | |
920 inline void CompressSolid(uint8_t* dst, uint8_t* block) { | |
921 // Clear destination buffer so that we can "or" in the results. | |
922 memset(dst, 0, 8); | |
923 | |
924 float src_color_float[3] = {static_cast<float>(block[0]), | |
925 static_cast<float>(block[1]), | |
926 static_cast<float>(block[2])}; | |
927 Color base = MakeColor555(src_color_float); | |
928 Color constant; | |
929 constant.channels.b = block[0]; | |
930 constant.channels.g = block[1]; | |
931 constant.channels.r = block[2]; | |
932 | |
933 WriteDiff(dst, true); | |
934 WriteFlip(dst, false); | |
935 WriteColors555(dst, base, base); | |
936 | |
937 uint8_t best_tbl_idx = 0; | |
938 uint8_t best_mod_idx = 0; | |
939 uint32_t best_mod_err = std::numeric_limits<uint32_t>::max(); | |
940 | |
941 // Try all codeword tables to find the one giving the best results for this | |
942 // block. | |
943 for (unsigned int tbl_idx = 0; tbl_idx < 8; ++tbl_idx) { | |
944 // Try all modifiers in the current table to find which one gives the | |
945 // smallest error. | |
946 for (unsigned int mod_idx = 0; mod_idx < 4; ++mod_idx) { | |
947 int16_t lum = g_codeword_tables[tbl_idx][mod_idx]; | |
948 const Color& color = MakeColor(base, lum); | |
949 | |
950 uint32_t mod_err = GetColorError(constant, color); | |
951 if (mod_err < best_mod_err) { | |
952 best_tbl_idx = tbl_idx; | |
953 best_mod_idx = mod_idx; | |
954 best_mod_err = mod_err; | |
955 | |
956 if (mod_err == 0) | |
957 break; // We cannot do any better than this. | |
958 } | |
959 } | |
960 | |
961 if (best_mod_err == 0) | |
962 break; | |
963 } | |
964 | |
965 WriteCodewordTable(dst, 0, best_tbl_idx); | |
966 WriteCodewordTable(dst, 1, best_tbl_idx); | |
967 | |
968 uint8_t pix_idx = g_mod_to_pix[best_mod_idx]; | |
969 uint32_t lsb = pix_idx & 0x1; | |
970 uint32_t msb = pix_idx >> 1; | |
971 | |
972 uint32_t pix_data = 0; | |
973 for (unsigned int i = 0; i < 2; ++i) { | |
974 for (unsigned int j = 0; j < 8; ++j) { | |
975 // Obtain the texel number as specified in the standard. | |
976 int texel_num = g_idx_to_num[i][j]; | |
977 pix_data |= msb << (texel_num + 16); | |
978 pix_data |= lsb << (texel_num); | |
979 } | |
980 } | |
981 | |
982 WritePixelData(dst, pix_data); | |
983 } | |
984 | |
985 } // namespace | |
986 | |
987 namespace cc { | |
988 | |
989 void TextureCompressorETC1_SSE::Compress(const uint8_t* src, | |
990 uint8_t* dst, | |
991 int width, | |
992 int height, | |
993 Quality quality) { | |
994 DCHECK(width >= 4 && (width & 3) == 0); | |
995 DCHECK(height >= 4 && (height & 3) == 0); | |
996 | |
997 uint8_t block[64] __attribute__((aligned(16))); | |
998 __m128i packed[4]; | |
999 __m128i red[4], green[4], blue[4], alpha[4]; | |
1000 __sse_data data; | |
1001 | |
1002 for (int y = 0; y < height; y += 4, src += width * 4 * 4) { | |
1003 for (int x = 0; x < width; x += 4, dst += 8) { | |
1004 /* SSE */ | |
1005 LegacyExtractBlock(block, src + x * 4, width); | |
1006 TransposeBlock(block, packed); | |
1007 if (BlockIsConstant(block, packed) == 1) { | |
1008 /* TODO(radu.velea): handle constant blocks in SSE */ | |
1009 CompressSolid(dst, block); | |
1010 } else { | |
1011 UnpackBlock(packed, blue, green, red, alpha); | |
1012 | |
1013 data.block = block; | |
1014 data.packed = packed; | |
1015 data.red = red; | |
1016 data.blue = blue; | |
1017 data.green = green; | |
1018 | |
1019 CompressBlock(dst, &data); | |
1020 } | |
1021 } | |
1022 } | |
1023 } | |
1024 | |
1025 } // namespace cc | |
OLD | NEW |