Chromium Code Reviews

Side by Side Diff: third_party/qcms/src/transform-sse2.c

Issue 2014023003: Add exact version of qcms used by Chrome for testing and comparison (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: Created 4 years, 7 months ago
1 // qcms
2 // Copyright (C) 2009 Mozilla Foundation
3 // Copyright (C) 2015 Intel Corporation
4 //
5 // Permission is hereby granted, free of charge, to any person obtaining
6 // a copy of this software and associated documentation files (the "Software"),
7 // to deal in the Software without restriction, including without limitation
8 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 // and/or sell copies of the Software, and to permit persons to whom the Software
10 // is furnished to do so, subject to the following conditions:
11 //
12 // The above copyright notice and this permission notice shall be included in
13 // all copies or substantial portions of the Software.
14 //
15 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
17 // THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
23 #include <emmintrin.h>
24
25 #include "qcmsint.h"
26
27 /* pre-shuffled: just load these into XMM reg instead of load-scalar/shufps sequence */
28 #define FLOATSCALE (float)(PRECACHE_OUTPUT_SIZE - 1)
29 #define CLAMPMAXVAL 1.0f
30
31 static const ALIGN float floatScaleX4[4] =
32 { FLOATSCALE, FLOATSCALE, FLOATSCALE, FLOATSCALE};
33 static const ALIGN float clampMaxValueX4[4] =
34 { CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL};
35
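The "pre-shuffled" comment above means the scale and clamp constants are kept as aligned 4-wide arrays so that a single _mm_load_ps fills an XMM register, instead of loading a scalar and broadcasting it with shufps. A minimal sketch of the two alternatives, using only the constants defined above (the helper names are hypothetical, for illustration only):

static __m128 load_scale_preshuffled(void)
{
    return _mm_load_ps(floatScaleX4);           /* one aligned 16-byte load */
}

static __m128 load_scale_scalar_shuffle(void)
{
    __m128 v = _mm_load_ss(&floatScaleX4[0]);   /* load one float into lane 0 */
    return _mm_shuffle_ps(v, v, 0);             /* broadcast lane 0 to all lanes */
}

Both produce { FLOATSCALE, FLOATSCALE, FLOATSCALE, FLOATSCALE }; the first keeps the extra shuffle out of the hot loop.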
36 void qcms_transform_data_rgb_out_lut_sse2(qcms_transform *transform,
37 unsigned char *src,
38 unsigned char *dest,
39 size_t length,
40 qcms_format_type output_format)
41 {
42 unsigned int i;
43 float (*mat)[4] = transform->matrix;
44 char input_back[32];
45 /* Ensure we have a buffer that's 16 byte aligned regardless of the original
46 * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
47 * because they don't work on stack variables. gcc 4.4 does do the right thing
48 * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
49 float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
50 /* share input and output locations to save having to keep the
51 * locations in separate registers */
52 uint32_t const * output = (uint32_t*)input;
53
54 /* deref *transform now to avoid it in loop */
55 const float *igtbl_r = transform->input_gamma_table_r;
56 const float *igtbl_g = transform->input_gamma_table_g;
57 const float *igtbl_b = transform->input_gamma_table_b;
58
59 /* deref *transform now to avoid it in loop */
60 const uint8_t *otdata_r = &transform->output_table_r->data[0];
61 const uint8_t *otdata_g = &transform->output_table_g->data[0];
62 const uint8_t *otdata_b = &transform->output_table_b->data[0];
63
64 /* input matrix values never change */
65 const __m128 mat0 = _mm_load_ps(mat[0]);
66 const __m128 mat1 = _mm_load_ps(mat[1]);
67 const __m128 mat2 = _mm_load_ps(mat[2]);
68
69 /* these values don't change, either */
70 const __m128 max = _mm_load_ps(clampMaxValueX4);
71 const __m128 min = _mm_setzero_ps();
72 const __m128 scale = _mm_load_ps(floatScaleX4);
73
74 /* working variables */
75 __m128 vec_r, vec_g, vec_b, result;
76 const int r_out = output_format.r;
77 const int b_out = output_format.b;
78
79 /* CYA */
80 if (!length)
81 return;
82
83 /* one pixel is handled outside of the loop */
84 length--;
85
86 /* setup for transforming 1st pixel */
87 vec_r = _mm_load_ss(&igtbl_r[src[0]]);
88 vec_g = _mm_load_ss(&igtbl_g[src[1]]);
89 vec_b = _mm_load_ss(&igtbl_b[src[2]]);
90 src += 3;
91
92 /* transform all but final pixel */
93
94 for (i=0; i<length; i++)
95 {
96 /* position values from gamma tables */
97 vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
98 vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
99 vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
100
101 /* gamma * matrix */
102 vec_r = _mm_mul_ps(vec_r, mat0);
103 vec_g = _mm_mul_ps(vec_g, mat1);
104 vec_b = _mm_mul_ps(vec_b, mat2);
105
106 /* crunch, crunch, crunch */
107 vec_r = _mm_add_ps(vec_g, _mm_add_ps(vec_r, vec_b));
108 vec_r = _mm_max_ps(min, vec_r);
109 vec_r = _mm_min_ps(max, vec_r);
110 result = _mm_mul_ps(vec_r, scale);
111
112 /* store calc'd output tables indices */
113 _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));
114
115 /* load for next loop while store completes */
116 vec_r = _mm_load_ss(&igtbl_r[src[0]]);
117 vec_g = _mm_load_ss(&igtbl_g[src[1]]);
118 vec_b = _mm_load_ss(&igtbl_b[src[2]]);
119 src += 3;
120
121 /* use calc'd indices to output RGB values */
122 dest[r_out] = otdata_r[output[0]];
123 dest[1] = otdata_g[output[1]];
124 dest[b_out] = otdata_b[output[2]];
125 dest += 3;
126 }
127
128 /* handle final (maybe only) pixel */
129
130 vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
131 vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
132 vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
133
134 vec_r = _mm_mul_ps(vec_r, mat0);
135 vec_g = _mm_mul_ps(vec_g, mat1);
136 vec_b = _mm_mul_ps(vec_b, mat2);
137
138 vec_r = _mm_add_ps(vec_g, _mm_add_ps(vec_r, vec_b));
139 vec_r = _mm_max_ps(min, vec_r);
140 vec_r = _mm_min_ps(max, vec_r);
141 result = _mm_mul_ps(vec_r, scale);
142
143 _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));
144
145 dest[r_out] = otdata_r[output[0]];
146 dest[1] = otdata_g[output[1]];
147 dest[b_out] = otdata_b[output[2]];
148 }
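For reference, a scalar sketch of what the vectorized loop above computes per pixel: a gamma-table lookup per input channel, a 3x4 matrix multiply done as three broadcast multiplies plus two adds, a clamp to [0, 1], a scale into the precache range, and a per-channel output-table lookup. The helper below is hypothetical and ignores the software pipelining of loads and stores; it only assumes the transform fields already dereferenced above.

static void transform_pixel_sketch(qcms_transform *t,
                                   const unsigned char *src,  /* 3 bytes: R, G, B */
                                   unsigned char *dest,
                                   int r_out, int b_out)
{
    float (*mat)[4] = t->matrix;
    float in_r = t->input_gamma_table_r[src[0]];
    float in_g = t->input_gamma_table_g[src[1]];
    float in_b = t->input_gamma_table_b[src[2]];
    float out[3];
    int c;

    for (c = 0; c < 3; c++) {
        /* lane c of vec_r*mat0 + vec_g*mat1 + vec_b*mat2 */
        float v = in_r * mat[0][c] + in_g * mat[1][c] + in_b * mat[2][c];
        if (v < 0.0f) v = 0.0f;            /* _mm_max_ps(min, ...) */
        if (v > 1.0f) v = 1.0f;            /* _mm_min_ps(max, ...) */
        out[c] = v * FLOATSCALE;           /* scale into the precache table */
    }

    /* _mm_cvtps_epi32 rounds to nearest; +0.5f is a close scalar stand-in */
    dest[r_out] = t->output_table_r->data[(int)(out[0] + 0.5f)];
    dest[1]     = t->output_table_g->data[(int)(out[1] + 0.5f)];
    dest[b_out] = t->output_table_b->data[(int)(out[2] + 0.5f)];
}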
149
150 void qcms_transform_data_rgba_out_lut_sse2(qcms_transform *transform,
151 unsigned char *src,
152 unsigned char *dest,
153 size_t length,
154 qcms_format_type output_format)
155 {
156 unsigned int i;
157 float (*mat)[4] = transform->matrix;
158 char input_back[32];
159 /* Ensure we have a buffer that's 16 byte aligned regardless of the original
160 * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
161 * because they don't work on stack variables. gcc 4.4 does do the right thing
162 * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
163 float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
164 /* share input and output locations to save having to keep the
165 * locations in separate registers */
166 uint32_t const * output = (uint32_t*)input;
167
168 /* deref *transform now to avoid it in loop */
169 const float *igtbl_r = transform->input_gamma_table_r;
170 const float *igtbl_g = transform->input_gamma_table_g;
171 const float *igtbl_b = transform->input_gamma_table_b;
172
173 /* deref *transform now to avoid it in loop */
174 const uint8_t *otdata_r = &transform->output_table_r->data[0];
175 const uint8_t *otdata_g = &transform->output_table_g->data[0];
176 const uint8_t *otdata_b = &transform->output_table_b->data[0];
177
178 /* input matrix values never change */
179 const __m128 mat0 = _mm_load_ps(mat[0]);
180 const __m128 mat1 = _mm_load_ps(mat[1]);
181 const __m128 mat2 = _mm_load_ps(mat[2]);
182
183 /* these values don't change, either */
184 const __m128 max = _mm_load_ps(clampMaxValueX4);
185 const __m128 min = _mm_setzero_ps();
186 const __m128 scale = _mm_load_ps(floatScaleX4);
187
188 /* working variables */
189 __m128 vec_r, vec_g, vec_b, result;
190 const int r_out = output_format.r;
191 const int b_out = output_format.b;
192 unsigned char alpha;
193
194 /* CYA */
195 if (!length)
196 return;
197
198 /* one pixel is handled outside of the loop */
199 length--;
200
201 /* setup for transforming 1st pixel */
202 vec_r = _mm_load_ss(&igtbl_r[src[0]]);
203 vec_g = _mm_load_ss(&igtbl_g[src[1]]);
204 vec_b = _mm_load_ss(&igtbl_b[src[2]]);
205 alpha = src[3];
206 src += 4;
207
208 /* transform all but final pixel */
209
210 for (i=0; i<length; i++)
211 {
212 /* position values from gamma tables */
213 vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
214 vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
215 vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
216
217 /* gamma * matrix */
218 vec_r = _mm_mul_ps(vec_r, mat0);
219 vec_g = _mm_mul_ps(vec_g, mat1);
220 vec_b = _mm_mul_ps(vec_b, mat2);
221
222 /* store alpha for this pixel; load alpha for next */
223 dest[3] = alpha;
224 alpha = src[3];
225
226 /* crunch, crunch, crunch */
227 vec_r = _mm_add_ps(vec_g, _mm_add_ps(vec_r, vec_b));
228 vec_r = _mm_max_ps(min, vec_r);
229 vec_r = _mm_min_ps(max, vec_r);
230 result = _mm_mul_ps(vec_r, scale);
231
232 /* store calc'd output tables indices */
233 _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));
234
235 /* load gamma values for next loop while store completes */
236 vec_r = _mm_load_ss(&igtbl_r[src[0]]);
237 vec_g = _mm_load_ss(&igtbl_g[src[1]]);
238 vec_b = _mm_load_ss(&igtbl_b[src[2]]);
239 src += 4;
240
241 /* use calc'd indices to output RGB values */
242 dest[r_out] = otdata_r[output[0]];
243 dest[1] = otdata_g[output[1]];
244 dest[b_out] = otdata_b[output[2]];
245 dest += 4;
246 }
247
248 /* handle final (maybe only) pixel */
249
250 vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
251 vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
252 vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
253
254 vec_r = _mm_mul_ps(vec_r, mat0);
255 vec_g = _mm_mul_ps(vec_g, mat1);
256 vec_b = _mm_mul_ps(vec_b, mat2);
257
258 dest[3] = alpha;
259
260 vec_r = _mm_add_ps(vec_g, _mm_add_ps(vec_r, vec_b));
261 vec_r = _mm_max_ps(min, vec_r);
262 vec_r = _mm_min_ps(max, vec_r);
263 result = _mm_mul_ps(vec_r, scale);
264
265 _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));
266
267 dest[r_out] = otdata_r[output[0]];
268 dest[1] = otdata_g[output[1]];
269 dest[b_out] = otdata_b[output[2]];
270 }
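The 16-byte alignment trick at the top of both functions above over-allocates a 32-byte char buffer and masks an address in its interior down to a 16-byte boundary, which is what the (uintptr_t)&input_back[16] & ~0xf expression does. A minimal sketch of the arithmetic (hypothetical helper; uintptr_t is already in scope here, since the code above uses it):

static float *align16_sketch(char buf[32])
{
    /* &buf[16] is 16 bytes into the buffer; rounding it down to a 16-byte
     * boundary moves it back by at most 15 bytes, so at least 16 usable
     * bytes always remain before the end of the buffer. */
    uintptr_t addr = (uintptr_t)&buf[16];
    return (float *)(addr & ~(uintptr_t)0xf);   /* clear the low four bits */
}

The functions above then use that one aligned slot as both the _mm_store_si128 destination and, through the aliased uint32_t pointer, the source of the output-table indices.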
271
272 static inline __m128i __mm_swizzle_epi32(__m128i value, int bgra)
273 {
274 return bgra ? _mm_shuffle_epi32(value, _MM_SHUFFLE(0, 1, 2, 3)) :
275 _mm_shuffle_epi32(value, _MM_SHUFFLE(0, 3, 2, 1)) ;
276 }
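In the loop below, lane 0 of the integer result holds alpha and lanes 1 through 3 hold R, G, B, so _MM_SHUFFLE(0, 3, 2, 1) reorders the lanes to R, G, B, A while _MM_SHUFFLE(0, 1, 2, 3) reverses them to B, G, R, A. A small usage sketch of how the swizzle plus the two saturating packs collapse those lanes into four bytes (hypothetical helper; it assumes the channel values already fit in 0..255, as they do after the clamp and scale below):

static uint32_t pack_pixel_sketch(int bgra, int a, int r, int g, int b)
{
    __m128i v = _mm_setr_epi32(a, r, g, b);  /* lanes { A, R, G, B }, as in the loop */
    v = __mm_swizzle_epi32(v, bgra);         /* -> { R, G, B, A } or { B, G, R, A } */
    v = _mm_packus_epi16(v, v);              /* each dword (0..255) sits in its low word; pack words to bytes */
    v = _mm_packus_epi16(v, v);              /* pack again: the four channel bytes land in the low dword */
    return (uint32_t)_mm_cvtsi128_si32(v);   /* matches the *(uint32_t *)dest store below */
}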
277
278 void qcms_transform_data_tetra_clut_rgba_sse2(qcms_transform *transform,
279 unsigned char *src,
280 unsigned char *dest,
281 size_t length,
282 qcms_format_type output_format)
283 {
284 const int bgra = output_format.r;
285
286 size_t i;
287
288 const int xy_len_3 = 3 * 1;
289 const int x_len_3 = 3 * transform->grid_size;
290 const int len_3 = x_len_3 * transform->grid_size;
291
292 const __m128 __255 = _mm_set1_ps(255.0f);
293 const __m128 __one = _mm_set1_ps(1.0f);
294 const __m128 __000 = _mm_setzero_ps();
295
296 const float* r_table = transform->r_clut;
297 const float* g_table = transform->g_clut;
298 const float* b_table = transform->b_clut;
299
300 int i3, i2, i1, i0;
301
302 __m128 c3;
303 __m128 c2;
304 __m128 c1;
305 __m128 c0;
306
307 if (!(transform->transform_flags & TRANSFORM_FLAG_CLUT_CACHE))
308 qcms_transform_build_clut_cache(transform);
309
310 for (i = 0; i < length; ++i) {
311 unsigned char in_r = *src++;
312 unsigned char in_g = *src++;
313 unsigned char in_b = *src++;
314
315 // initialize the output result with the alpha channel only
316
317 __m128i result = _mm_setr_epi32(*src++, 0, 0, 0);
318
319 // get the input point r.xyz relative to the subcube origin
320
321 float rx = transform->r_cache[in_r];
322 float ry = transform->r_cache[in_g];
323 float rz = transform->r_cache[in_b];
324
325 // load and LUT scale the subcube maximum vertex
326
327 int xn = transform->ceil_cache[in_r] * len_3;
328 int yn = transform->ceil_cache[in_g] * x_len_3;
329 int zn = transform->ceil_cache[in_b] * xy_len_3;
330
331 // load and LUT scale the subcube origin vertex
332
333 int x0 = transform->floor_cache[in_r] * len_3;
334 int y0 = transform->floor_cache[in_g] * x_len_3;
335 int z0 = transform->floor_cache[in_b] * xy_len_3;
336
337 // tetrahedral interpolate the input color r.xyz
338
339 #define TETRA_LOOKUP_CLUT(i3, i2, i1, i0) \
340 c0 = _mm_set_ps(b_table[i0], g_table[i0], r_table[i0], 0.f), \
341 c1 = _mm_set_ps(b_table[i1], g_table[i1], r_table[i1], 0.f), \
342 c2 = _mm_set_ps(b_table[i2], g_table[i2], r_table[i2], 0.f), \
343 c3 = _mm_set_ps(b_table[i3], g_table[i3], r_table[i3], 0.f)
344
345 i0 = x0 + y0 + z0;
346
347 if (rx >= ry) {
348
349 if (ry >= rz) { // rx >= ry && ry >= rz
350
351 i3 = yn + (i1 = xn);
352 i1 += i0 - x0;
353 i2 = i3 + z0;
354 i3 += zn;
355
356 TETRA_LOOKUP_CLUT(i3, i2, i1, i0);
357
358 c3 = _mm_sub_ps(c3, c2);
359 c2 = _mm_sub_ps(c2, c1);
360 c1 = _mm_sub_ps(c1, c0);
361
362 } else if (rx >= rz) { // rx >= rz && rz >= ry
363
364 i3 = zn + (i1 = xn);
365 i1 += i0 - x0;
366 i2 = i3 + yn;
367 i3 += y0;
368
369 TETRA_LOOKUP_CLUT(i3, i2, i1, i0);
370
371 c2 = _mm_sub_ps(c2, c3);
372 c3 = _mm_sub_ps(c3, c1);
373 c1 = _mm_sub_ps(c1, c0);
374
375 } else { // rz > rx && rx >= ry
376
377 i2 = xn + (i3 = zn);
378 i3 += i0 - z0;
379 i1 = i2 + y0;
380 i2 += yn;
381
382 TETRA_LOOKUP_CLUT(i3, i2, i1, i0);
383
384 c2 = _mm_sub_ps(c2, c1);
385 c1 = _mm_sub_ps(c1, c3);
386 c3 = _mm_sub_ps(c3, c0);
387 }
388 } else {
389
390 if (rx >= rz) { // ry > rx && rx >= rz
391
392 i3 = xn + (i2 = yn);
393 i2 += i0 - y0;
394 i1 = i3 + z0;
395 i3 += zn;
396
397 TETRA_LOOKUP_CLUT(i3, i2, i1, i0);
398
399 c3 = _mm_sub_ps(c3, c1);
400 c1 = _mm_sub_ps(c1, c2);
401 c2 = _mm_sub_ps(c2, c0);
402
403 } else if (ry >= rz) { // ry >= rz && rz > rx
404
405 i3 = zn + (i2 = yn);
406 i2 += i0 - y0;
407 i1 = i3 + xn;
408 i3 += x0;
409
410 TETRA_LOOKUP_CLUT(i3, i2, i1, i0);
411
412 c1 = _mm_sub_ps(c1, c3);
413 c3 = _mm_sub_ps(c3, c2);
414 c2 = _mm_sub_ps(c2, c0);
415
416 } else { // rz > ry && ry > rx
417
418 i2 = yn + (i3 = zn);
419 i3 += i0 - z0;
420 i1 = i2 + xn;
421 i2 += x0;
422
423 TETRA_LOOKUP_CLUT(i3, i2, i1, i0);
424
425 c1 = _mm_sub_ps(c1, c2);
426 c2 = _mm_sub_ps(c2, c3);
427 c3 = _mm_sub_ps(c3, c0);
428 }
429 }
430
431 // output.xyz = column_matrix(c1, c2, c3) x r.xyz + c0.xyz
432
433 c0 = _mm_add_ps(c0, _mm_mul_ps(c1, _mm_set1_ps(rx)));
434 c0 = _mm_add_ps(c0, _mm_mul_ps(c2, _mm_set1_ps(ry)));
435 c0 = _mm_add_ps(c0, _mm_mul_ps(c3, _mm_set1_ps(rz)));
436
437 // clamp to [0.0..1.0], then scale by 255
438
439 c0 = _mm_max_ps(c0, __000);
440 c0 = _mm_min_ps(c0, __one);
441 c0 = _mm_mul_ps(c0, __255);
442
443 // int(c0) with float rounding, add alpha
444
445 result = _mm_add_epi32(result, _mm_cvtps_epi32(c0));
446
447 // swizzle and repack in result low bytes
448
449 result = __mm_swizzle_epi32(result, bgra);
450 result = _mm_packus_epi16(result, result);
451 result = _mm_packus_epi16(result, result);
452
453 // store into uint32_t* pixel destination
454
455 *(uint32_t *)dest = _mm_cvtsi128_si32(result);
456 dest += 4;
457 }
458 }
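The branchy index arithmetic above selects one of the six tetrahedra that partition each CLUT subcube, chosen by the ordering of rx, ry and rz, and then evaluates c0 + c1*rx + c2*ry + c3*rz, where c1, c2 and c3 are differences of adjacent subcube vertices along the chosen path. A scalar sketch of one channel for the first branch (rx >= ry >= rz); the c000..c111 vertex names are illustrative notation, not identifiers from the source:

static float tetra_channel_sketch(float c000, float c100, float c110, float c111,
                                  float rx, float ry, float rz)
{
    /* deltas along the walk x -> y -> z, matching the _mm_sub_ps chain in the
     * rx >= ry >= rz branch above */
    float c0 = c000;
    float c1 = c100 - c000;
    float c2 = c110 - c100;
    float c3 = c111 - c110;
    return c0 + c1 * rx + c2 * ry + c3 * rz;   /* exact at all four vertices */
}

The other five branches apply the same idea with the walk order permuted, which is why each one pairs a different vertex difference with each of c1, c2 and c3.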