Chromium Code Reviews

Side by Side Diff: third_party/qcms/src/transform-sse2.c

Issue 2014023003: Add exact version of qcms used by Chrome for testing and comparison (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: Created 4 years, 7 months ago
1 // qcms
2 // Copyright (C) 2009 Mozilla Foundation
3 // Copyright (C) 2015 Intel Corporation
4 //
5 // Permission is hereby granted, free of charge, to any person obtaining
6 // a copy of this software and associated documentation files (the "Software"),
7 // to deal in the Software without restriction, including without limitation
8 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 // and/or sell copies of the Software, and to permit persons to whom the Software
10 // is furnished to do so, subject to the following conditions:
11 //
12 // The above copyright notice and this permission notice shall be included in
13 // all copies or substantial portions of the Software.
14 //
15 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
17 // THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
23 #include <emmintrin.h>
24
25 #include "qcmsint.h"
26
27 /* pre-shuffled: just load these into XMM reg instead of load-scalar/shufps sequence */
28 #define FLOATSCALE (float)(PRECACHE_OUTPUT_SIZE - 1)
29 #define CLAMPMAXVAL 1.0f
30
31 static const ALIGN float floatScaleX4[4] =
32 { FLOATSCALE, FLOATSCALE, FLOATSCALE, FLOATSCALE};
33 static const ALIGN float clampMaxValueX4[4] =
34 { CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL};
35
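The "pre-shuffled" comment above means the scale and clamp constants are kept as aligned 4-wide arrays so that a single _mm_load_ps fills an XMM register, instead of loading a scalar and broadcasting it with shufps. A minimal sketch of the two alternatives, using only the constants defined above (the helper names are hypothetical, for illustration only):

static __m128 load_scale_preshuffled(void)
{
    return _mm_load_ps(floatScaleX4);           /* one aligned 16-byte load */
}

static __m128 load_scale_scalar_shuffle(void)
{
    __m128 v = _mm_load_ss(&floatScaleX4[0]);   /* load one float into lane 0 */
    return _mm_shuffle_ps(v, v, 0);             /* broadcast lane 0 to all lanes */
}

Both produce { FLOATSCALE, FLOATSCALE, FLOATSCALE, FLOATSCALE }; the first keeps the extra shuffle out of the hot loop.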
36 void qcms_transform_data_rgb_out_lut_sse2(qcms_transform *transform,
37 unsigned char *src,
38 unsigned char *dest,
39 size_t length,
40 qcms_format_type output_format)
41 {
42 unsigned int i;
43 float (*mat)[4] = transform->matrix;
44 char input_back[32];
45 /* Ensure we have a buffer that's 16 byte aligned regardless of the original
46 * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
47 * because they don't work on stack variables. gcc 4.4 does do the right thing
48 * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
49 float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
50 /* share input and output locations to save having to keep the
51 * locations in separate registers */
52 uint32_t const * output = (uint32_t*)input;
53
54 /* deref *transform now to avoid it in loop */
55 const float *igtbl_r = transform->input_gamma_table_r;
56 const float *igtbl_g = transform->input_gamma_table_g;
57 const float *igtbl_b = transform->input_gamma_table_b;
58
59 /* deref *transform now to avoid it in loop */
60 const uint8_t *otdata_r = &transform->output_table_r->data[0];
61 const uint8_t *otdata_g = &transform->output_table_g->data[0];
62 const uint8_t *otdata_b = &transform->output_table_b->data[0];
63
64 /* input matrix values never change */
65 const __m128 mat0 = _mm_load_ps(mat[0]);
66 const __m128 mat1 = _mm_load_ps(mat[1]);
67 const __m128 mat2 = _mm_load_ps(mat[2]);
68
69 /* these values don't change, either */
70 const __m128 max = _mm_load_ps(clampMaxValueX4);
71 const __m128 min = _mm_setzero_ps();
72 const __m128 scale = _mm_load_ps(floatScaleX4);
73
74 /* working variables */
75 __m128 vec_r, vec_g, vec_b, result;
76 const int r_out = output_format.r;
77 const int b_out = output_format.b;
78
79 /* CYA */
80 if (!length)
81 return;
82
83 /* one pixel is handled outside of the loop */
84 length--;
85
86 /* setup for transforming 1st pixel */
87 vec_r = _mm_load_ss(&igtbl_r[src[0]]);
88 vec_g = _mm_load_ss(&igtbl_g[src[1]]);
89 vec_b = _mm_load_ss(&igtbl_b[src[2]]);
90 src += 3;
91
92 /* transform all but final pixel */
93
94 for (i=0; i<length; i++)
95 {
96 /* position values from gamma tables */
97 vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
98 vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
99 vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
100
101 /* gamma * matrix */
102 vec_r = _mm_mul_ps(vec_r, mat0);
103 vec_g = _mm_mul_ps(vec_g, mat1);
104 vec_b = _mm_mul_ps(vec_b, mat2);
105
106 /* crunch, crunch, crunch */
107 vec_r = _mm_add_ps(vec_g, _mm_add_ps(vec_r, vec_b));
108 vec_r = _mm_max_ps(min, vec_r);
109 vec_r = _mm_min_ps(max, vec_r);
110 result = _mm_mul_ps(vec_r, scale);
111
112 /* store calc'd output tables indices */
113 _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));
114
115 /* load for next loop while store completes */
116 vec_r = _mm_load_ss(&igtbl_r[src[0]]);
117 vec_g = _mm_load_ss(&igtbl_g[src[1]]);
118 vec_b = _mm_load_ss(&igtbl_b[src[2]]);
119 src += 3;
120
121 /* use calc'd indices to output RGB values */
122 dest[r_out] = otdata_r[output[0]];
123 dest[1] = otdata_g[output[1]];
124 dest[b_out] = otdata_b[output[2]];
125 dest += 3;
126 }
127
128 /* handle final (maybe only) pixel */
129
130 vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
131 vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
132 vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
133
134 vec_r = _mm_mul_ps(vec_r, mat0);
135 vec_g = _mm_mul_ps(vec_g, mat1);
136 vec_b = _mm_mul_ps(vec_b, mat2);
137
138 vec_r = _mm_add_ps(vec_g, _mm_add_ps(vec_r, vec_b));
139 vec_r = _mm_max_ps(min, vec_r);
140 vec_r = _mm_min_ps(max, vec_r);
141 result = _mm_mul_ps(vec_r, scale);
142
143 _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));
144
145 dest[r_out] = otdata_r[output[0]];
146 dest[1] = otdata_g[output[1]];
147 dest[b_out] = otdata_b[output[2]];
148 }
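For reference, a scalar sketch of what the vectorized loop above computes per pixel: a gamma-table lookup per input channel, a 3x4 matrix multiply done as three broadcast multiplies plus two adds, a clamp to [0, 1], a scale into the precache range, and a per-channel output-table lookup. The helper below is hypothetical and ignores the software pipelining of loads and stores; it only assumes the transform fields already dereferenced above.

static void transform_pixel_sketch(qcms_transform *t,
                                   const unsigned char *src,  /* 3 bytes: R, G, B */
                                   unsigned char *dest,
                                   int r_out, int b_out)
{
    float (*mat)[4] = t->matrix;
    float in_r = t->input_gamma_table_r[src[0]];
    float in_g = t->input_gamma_table_g[src[1]];
    float in_b = t->input_gamma_table_b[src[2]];
    float out[3];
    int c;

    for (c = 0; c < 3; c++) {
        /* lane c of vec_r*mat0 + vec_g*mat1 + vec_b*mat2 */
        float v = in_r * mat[0][c] + in_g * mat[1][c] + in_b * mat[2][c];
        if (v < 0.0f) v = 0.0f;            /* _mm_max_ps(min, ...) */
        if (v > 1.0f) v = 1.0f;            /* _mm_min_ps(max, ...) */
        out[c] = v * FLOATSCALE;           /* scale into the precache table */
    }

    /* _mm_cvtps_epi32 rounds to nearest; +0.5f is a close scalar stand-in */
    dest[r_out] = t->output_table_r->data[(int)(out[0] + 0.5f)];
    dest[1]     = t->output_table_g->data[(int)(out[1] + 0.5f)];
    dest[b_out] = t->output_table_b->data[(int)(out[2] + 0.5f)];
}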
149
150 void qcms_transform_data_rgba_out_lut_sse2(qcms_transform *transform,
151 unsigned char *src,
152 unsigned char *dest,
153 size_t length,
154 qcms_format_type output_format)
155 {
156 unsigned int i;
157 float (*mat)[4] = transform->matrix;
158 char input_back[32];
159 /* Ensure we have a buffer that's 16 byte aligned regardless of the original
160 * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
161 * because they don't work on stack variables. gcc 4.4 does do the right thing
162 * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
163 float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
164 /* share input and output locations to save having to keep the
165 * locations in separate registers */
166 uint32_t const * output = (uint32_t*)input;
167
168 /* deref *transform now to avoid it in loop */
169 const float *igtbl_r = transform->input_gamma_table_r;
170 const float *igtbl_g = transform->input_gamma_table_g;
171 const float *igtbl_b = transform->input_gamma_table_b;
172
173 /* deref *transform now to avoid it in loop */
174 const uint8_t *otdata_r = &transform->output_table_r->data[0];
175 const uint8_t *otdata_g = &transform->output_table_g->data[0];
176 const uint8_t *otdata_b = &transform->output_table_b->data[0];
177
178 /* input matrix values never change */
179 const __m128 mat0 = _mm_load_ps(mat[0]);
180 const __m128 mat1 = _mm_load_ps(mat[1]);
181 const __m128 mat2 = _mm_load_ps(mat[2]);
182
183 /* these values don't change, either */
184 const __m128 max = _mm_load_ps(clampMaxValueX4);
185 const __m128 min = _mm_setzero_ps();
186 const __m128 scale = _mm_load_ps(floatScaleX4);
187
188 /* working variables */
189 __m128 vec_r, vec_g, vec_b, result;
190 const int r_out = output_format.r;
191 const int b_out = output_format.b;
192 unsigned char alpha;
193
194 /* CYA */
195 if (!length)
196 return;
197
198 /* one pixel is handled outside of the loop */
199 length--;
200
201 /* setup for transforming 1st pixel */
202 vec_r = _mm_load_ss(&igtbl_r[src[0]]);
203 vec_g = _mm_load_ss(&igtbl_g[src[1]]);
204 vec_b = _mm_load_ss(&igtbl_b[src[2]]);
205 alpha = src[3];
206 src += 4;
207
208 /* transform all but final pixel */
209
210 for (i=0; i<length; i++)
211 {
212 /* position values from gamma tables */
213 vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
214 vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
215 vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
216
217 /* gamma * matrix */
218 vec_r = _mm_mul_ps(vec_r, mat0);
219 vec_g = _mm_mul_ps(vec_g, mat1);
220 vec_b = _mm_mul_ps(vec_b, mat2);
221
222 /* store alpha for this pixel; load alpha for next */
223 dest[3] = alpha;
224 alpha = src[3];
225
226 /* crunch, crunch, crunch */
227 vec_r = _mm_add_ps(vec_g, _mm_add_ps(vec_r, vec_b));
228 vec_r = _mm_max_ps(min, vec_r);
229 vec_r = _mm_min_ps(max, vec_r);
230 result = _mm_mul_ps(vec_r, scale);
231
232 /* store calc'd output tables indices */
233 _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));
234
235 /* load gamma values for next loop while store completes */
236 vec_r = _mm_load_ss(&igtbl_r[src[0]]);
237 vec_g = _mm_load_ss(&igtbl_g[src[1]]);
238 vec_b = _mm_load_ss(&igtbl_b[src[2]]);
239 src += 4;
240
241 /* use calc'd indices to output RGB values */
242 dest[r_out] = otdata_r[output[0]];
243 dest[1] = otdata_g[output[1]];
244 dest[b_out] = otdata_b[output[2]];
245 dest += 4;
246 }
247
248 /* handle final (maybe only) pixel */
249
250 vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
251 vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
252 vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
253
254 vec_r = _mm_mul_ps(vec_r, mat0);
255 vec_g = _mm_mul_ps(vec_g, mat1);
256 vec_b = _mm_mul_ps(vec_b, mat2);
257
258 dest[3] = alpha;
259
260 vec_r = _mm_add_ps(vec_g, _mm_add_ps(vec_r, vec_b));
261 vec_r = _mm_max_ps(min, vec_r);
262 vec_r = _mm_min_ps(max, vec_r);
263 result = _mm_mul_ps(vec_r, scale);
264
265 _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));
266
267 dest[r_out] = otdata_r[output[0]];
268 dest[1] = otdata_g[output[1]];
269 dest[b_out] = otdata_b[output[2]];
270 }
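The 16-byte alignment trick at the top of both functions above over-allocates a 32-byte char buffer and masks an address in its interior down to a 16-byte boundary, which is what the (uintptr_t)&input_back[16] & ~0xf expression does. A minimal sketch of the arithmetic (hypothetical helper; uintptr_t is already in scope here, since the code above uses it):

static float *align16_sketch(char buf[32])
{
    /* &buf[16] is 16 bytes into the buffer; rounding it down to a 16-byte
     * boundary moves it back by at most 15 bytes, so at least 16 usable
     * bytes always remain before the end of the buffer. */
    uintptr_t addr = (uintptr_t)&buf[16];
    return (float *)(addr & ~(uintptr_t)0xf);   /* clear the low four bits */
}

The functions above then use that one aligned slot as both the _mm_store_si128 destination and, through the aliased uint32_t pointer, the source of the output-table indices.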
271
272 static inline __m128i __mm_swizzle_epi32(__m128i value, int bgra)
273 {
274 return bgra ? _mm_shuffle_epi32(value, _MM_SHUFFLE(0, 1, 2, 3)) :
275 _mm_shuffle_epi32(value, _MM_SHUFFLE(0, 3, 2, 1)) ;
276 }
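In the loop below, lane 0 of the integer result holds alpha and lanes 1 through 3 hold R, G, B, so _MM_SHUFFLE(0, 3, 2, 1) reorders the lanes to R, G, B, A while _MM_SHUFFLE(0, 1, 2, 3) reverses them to B, G, R, A. A small usage sketch of how the swizzle plus the two saturating packs collapse those lanes into four bytes (hypothetical helper; it assumes the channel values already fit in 0..255, as they do after the clamp and scale below):

static uint32_t pack_pixel_sketch(int bgra, int a, int r, int g, int b)
{
    __m128i v = _mm_setr_epi32(a, r, g, b);  /* lanes { A, R, G, B }, as in the loop */
    v = __mm_swizzle_epi32(v, bgra);         /* -> { R, G, B, A } or { B, G, R, A } */
    v = _mm_packus_epi16(v, v);              /* each dword (0..255) sits in its low word; pack words to bytes */
    v = _mm_packus_epi16(v, v);              /* pack again: the four channel bytes land in the low dword */
    return (uint32_t)_mm_cvtsi128_si32(v);   /* matches the *(uint32_t *)dest store below */
}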
277
278 void qcms_transform_data_tetra_clut_rgba_sse2(qcms_transform *transform,
279 unsigned char *src,
280 unsigned char *dest,
281 size_t length,
282 qcms_format_type output_format)
283 {
284 const int bgra = output_format.r;
285
286 size_t i;
287
288 const int xy_len_3 = 3 * 1;
289 const int x_len_3 = 3 * transform->grid_size;
290 const int len_3 = x_len_3 * transform->grid_size;
291
292 const __m128 __255 = _mm_set1_ps(255.0f);
293 const __m128 __one = _mm_set1_ps(1.0f);
294 const __m128 __000 = _mm_setzero_ps();
295
296 const float* r_table = transform->r_clut;
297 const float* g_table = transform->g_clut;
298 const float* b_table = transform->b_clut;
299
300 int i3, i2, i1, i0;
301
302 __m128 c3;
303 __m128 c2;
304 __m128 c1;
305 __m128 c0;
306
307 if (!(transform->transform_flags & TRANSFORM_FLAG_CLUT_CACHE))
308 qcms_transform_build_clut_cache(transform);
309
310 for (i = 0; i < length; ++i) {
311 unsigned char in_r = *src++;
312 unsigned char in_g = *src++;
313 unsigned char in_b = *src++;
314
315 // initialize the output result with the alpha channel only
316
317 __m128i result = _mm_setr_epi32(*src++, 0, 0, 0);
318
319 // get the input point r.xyz relative to the subcube origin
320
321 float rx = transform->r_cache[in_r];
322 float ry = transform->r_cache[in_g];
323 float rz = transform->r_cache[in_b];
324
325 // load and LUT scale the subcube maximum vertex
326
327 int xn = transform->ceil_cache[in_r] * len_3;
328 int yn = transform->ceil_cache[in_g] * x_len_3;
329 int zn = transform->ceil_cache[in_b] * xy_len_3;
330
331 // load and LUT scale the subcube origin vertex
332
333 int x0 = transform->floor_cache[in_r] * len_3;
334 int y0 = transform->floor_cache[in_g] * x_len_3;
335 int z0 = transform->floor_cache[in_b] * xy_len_3;
336
337 // tetrahedral interpolate the input color r.xyz
338
339 #define TETRA_LOOKUP_CLUT(i3, i2, i1, i0) \
340 c0 = _mm_set_ps(b_table[i0], g_table[i0], r_table[i0], 0.f), \
341 c1 = _mm_set_ps(b_table[i1], g_table[i1], r_table[i1], 0.f), \
342 c2 = _mm_set_ps(b_table[i2], g_table[i2], r_table[i2], 0.f), \
343 c3 = _mm_set_ps(b_table[i3], g_table[i3], r_table[i3], 0.f)
344
345 i0 = x0 + y0 + z0;
346
347 if (rx >= ry) {
348
349 if (ry >= rz) { // rx >= ry && ry >= rz
350
351 i3 = yn + (i1 = xn);
352 i1 += i0 - x0;
353 i2 = i3 + z0;
354 i3 += zn;
355
356 TETRA_LOOKUP_CLUT(i3, i2, i1, i0);
357
358 c3 = _mm_sub_ps(c3, c2);
359 c2 = _mm_sub_ps(c2, c1);
360 c1 = _mm_sub_ps(c1, c0);
361
362 } else if (rx >= rz) { // rx >= rz && rz >= ry
363
364 i3 = zn + (i1 = xn);
365 i1 += i0 - x0;
366 i2 = i3 + yn;
367 i3 += y0;
368
369 TETRA_LOOKUP_CLUT(i3, i2, i1, i0);
370
371 c2 = _mm_sub_ps(c2, c3);
372 c3 = _mm_sub_ps(c3, c1);
373 c1 = _mm_sub_ps(c1, c0);
374
375 } else { // rz > rx && rx >= ry
376
377 i2 = xn + (i3 = zn);
378 i3 += i0 - z0;
379 i1 = i2 + y0;
380 i2 += yn;
381
382 TETRA_LOOKUP_CLUT(i3, i2, i1, i0);
383
384 c2 = _mm_sub_ps(c2, c1);
385 c1 = _mm_sub_ps(c1, c3);
386 c3 = _mm_sub_ps(c3, c0);
387 }
388 } else {
389
390 if (rx >= rz) { // ry > rx && rx >= rz
391
392 i3 = xn + (i2 = yn);
393 i2 += i0 - y0;
394 i1 = i3 + z0;
395 i3 += zn;
396
397 TETRA_LOOKUP_CLUT(i3, i2, i1, i0);
398
399 c3 = _mm_sub_ps(c3, c1);
400 c1 = _mm_sub_ps(c1, c2);
401 c2 = _mm_sub_ps(c2, c0);
402
403 } else if (ry >= rz) { // ry >= rz && rz > rx
404
405 i3 = zn + (i2 = yn);
406 i2 += i0 - y0;
407 i1 = i3 + xn;
408 i3 += x0;
409
410 TETRA_LOOKUP_CLUT(i3, i2, i1, i0);
411
412 c1 = _mm_sub_ps(c1, c3);
413 c3 = _mm_sub_ps(c3, c2);
414 c2 = _mm_sub_ps(c2, c0);
415
416 } else { // rz > ry && ry > rx
417
418 i2 = yn + (i3 = zn);
419 i3 += i0 - z0;
420 i1 = i2 + xn;
421 i2 += x0;
422
423 TETRA_LOOKUP_CLUT(i3, i2, i1, i0);
424
425 c1 = _mm_sub_ps(c1, c2);
426 c2 = _mm_sub_ps(c2, c3);
427 c3 = _mm_sub_ps(c3, c0);
428 }
429 }
430
431 // output.xyz = column_matrix(c1, c2, c3) x r.xyz + c0.xyz
432
433 c0 = _mm_add_ps(c0, _mm_mul_ps(c1, _mm_set1_ps(rx)));
434 c0 = _mm_add_ps(c0, _mm_mul_ps(c2, _mm_set1_ps(ry)));
435 c0 = _mm_add_ps(c0, _mm_mul_ps(c3, _mm_set1_ps(rz)));
436
437 // clamp to [0.0..1.0], then scale by 255
438
439 c0 = _mm_max_ps(c0, __000);
440 c0 = _mm_min_ps(c0, __one);
441 c0 = _mm_mul_ps(c0, __255);
442
443 // int(c0) with float rounding, add alpha
444
445 result = _mm_add_epi32(result, _mm_cvtps_epi32(c0));
446
447 // swizzle and repack in result low bytes
448
449 result = __mm_swizzle_epi32(result, bgra);
450 result = _mm_packus_epi16(result, result);
451 result = _mm_packus_epi16(result, result);
452
453 // store into uint32_t* pixel destination
454
455 *(uint32_t *)dest = _mm_cvtsi128_si32(result);
456 dest += 4;
457 }
458 }
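The branchy index arithmetic above selects one of the six tetrahedra that partition each CLUT subcube, chosen by the ordering of rx, ry and rz, and then evaluates c0 + c1*rx + c2*ry + c3*rz, where c1, c2 and c3 are differences of adjacent subcube vertices along the chosen path. A scalar sketch of one channel for the first branch (rx >= ry >= rz); the c000..c111 vertex names are illustrative notation, not identifiers from the source:

static float tetra_channel_sketch(float c000, float c100, float c110, float c111,
                                  float rx, float ry, float rz)
{
    /* deltas along the walk x -> y -> z, matching the _mm_sub_ps chain in the
     * rx >= ry >= rz branch above */
    float c0 = c000;
    float c1 = c100 - c000;
    float c2 = c110 - c100;
    float c3 = c111 - c110;
    return c0 + c1 * rx + c2 * ry + c3 * rz;   /* exact at all four vertices */
}

The other five branches apply the same idea with the walk order permuted, which is why each one pairs a different vertex difference with each of c1, c2 and c3.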