Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 // qcms | 1 // qcms |
| 2 // Copyright (C) 2009 Mozilla Foundation | 2 // Copyright (C) 2009 Mozilla Foundation |
| 3 // Copyright (C) 2015 Intel Corporation | 3 // Copyright (C) 2015 Intel Corporation |
| 4 // | 4 // |
| 5 // Permission is hereby granted, free of charge, to any person obtaining | 5 // Permission is hereby granted, free of charge, to any person obtaining |
| 6 // a copy of this software and associated documentation files (the "Software"), | 6 // a copy of this software and associated documentation files (the "Software"), |
| 7 // to deal in the Software without restriction, including without limitation | 7 // to deal in the Software without restriction, including without limitation |
| 8 // the rights to use, copy, modify, merge, publish, distribute, sublicense, | 8 // the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| 9 // and/or sell copies of the Software, and to permit persons to whom the Software | 9 // and/or sell copies of the Software, and to permit persons to whom the Software |
| 10 // is furnished to do so, subject to the following conditions: | 10 // is furnished to do so, subject to the following conditions: |
| 11 // | 11 // |
| 12 // The above copyright notice and this permission notice shall be included in | 12 // The above copyright notice and this permission notice shall be included in |
| 13 // all copies or substantial portions of the Software. | 13 // all copies or substantial portions of the Software. |
| 14 // | 14 // |
| 15 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | 15 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
| 16 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO | 16 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO |
| 17 // THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | 17 // THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
| 18 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE | 18 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE |
| 19 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION | 19 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION |
| 20 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION | 20 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION |
| 21 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | 21 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
| 22 | 22 |
| 23 #include <emmintrin.h> | 23 #include <emmintrin.h> |
| 24 | 24 |
| 25 #include "qcmsint.h" | 25 #include "qcmsint.h" |
| 26 | 26 |
| 27 /* pre-shuffled: just load these into XMM reg instead of load-scalar/shufps sequence */ | 27 /* pre-shuffled: just load these into XMM reg instead of load-scalar/shufps sequence */ |
| 28 #define FLOATSCALE (float)(PRECACHE_OUTPUT_SIZE) | 28 #define FLOATSCALE (float)(PRECACHE_OUTPUT_SIZE - 1) |
| 29 #define CLAMPMAXVAL ( ((float) (PRECACHE_OUTPUT_SIZE - 1)) / PRECACHE_OUTPUT_SIZE ) | 29 #define CLAMPMAXVAL 1.0f |
| | 30 |
| 30 static const ALIGN float floatScaleX4[4] = | 31 static const ALIGN float floatScaleX4[4] = |
| 31 { FLOATSCALE, FLOATSCALE, FLOATSCALE, FLOATSCALE}; | 32 { FLOATSCALE, FLOATSCALE, FLOATSCALE, FLOATSCALE}; |
| 32 static const ALIGN float clampMaxValueX4[4] = | 33 static const ALIGN float clampMaxValueX4[4] = |
| 33 { CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL}; | 34 { CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL}; |
| 34 | 35 |
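
This constants change is the heart of the patch: the old code scaled by the full table size and used a fractional clamp of (PRECACHE_OUTPUT_SIZE - 1) / PRECACHE_OUTPUT_SIZE to keep the rounded result in range, while the new code clamps at exactly 1.0f and scales by PRECACHE_OUTPUT_SIZE - 1, so an input of 1.0 lands on the last valid LUT index directly. A minimal scalar sketch of the two index computations, assuming a hypothetical table size (the real value lives in qcmsint.h) and approximating the SSE2 conversions:

```c
#include <stdint.h>

#define PRECACHE_OUTPUT_SIZE 8192 /* assumed for illustration; see qcmsint.h */

/* old path: clamp just below 1.0, scale by the full table size, then
   round (+0.5f is a close scalar stand-in for _mm_cvtps_epi32, which
   actually rounds to nearest even) */
static uint32_t old_index(float v)
{
    const float clamp_max =
        ((float)(PRECACHE_OUTPUT_SIZE - 1)) / PRECACHE_OUTPUT_SIZE;
    if (v > clamp_max)
        v = clamp_max;
    return (uint32_t)(v * PRECACHE_OUTPUT_SIZE + 0.5f);
}

/* new path: clamp at exactly 1.0, scale by size - 1, then truncate
   (matching _mm_cvttps_epi32); v == 1.0 hits the last index exactly */
static uint32_t new_index(float v)
{
    if (v > 1.0f)
        v = 1.0f;
    return (uint32_t)(v * (PRECACHE_OUTPUT_SIZE - 1));
}
```

For non-negative inputs both forms stay within [0, PRECACHE_OUTPUT_SIZE - 1], but the clamp-at-1.0-and-truncate form distributes inputs across the table the same way as the scalar transform path, which is what the patch is after.
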
| 35 void qcms_transform_data_rgb_out_lut_sse2(qcms_transform *transform, | 36 void qcms_transform_data_rgb_out_lut_sse2(qcms_transform *transform, |
| 36 unsigned char *src, | 37 unsigned char *src, |
| 37 unsigned char *dest, | 38 unsigned char *dest, |
| 38 size_t length, | 39 size_t length, |
| 39 qcms_format_type output_format) | 40 qcms_format_type output_format) |
| (...skipping 56 matching lines...) | |
| 96 vec_r = _mm_shuffle_ps(vec_r, vec_r, 0); | 97 vec_r = _mm_shuffle_ps(vec_r, vec_r, 0); |
| 97 vec_g = _mm_shuffle_ps(vec_g, vec_g, 0); | 98 vec_g = _mm_shuffle_ps(vec_g, vec_g, 0); |
| 98 vec_b = _mm_shuffle_ps(vec_b, vec_b, 0); | 99 vec_b = _mm_shuffle_ps(vec_b, vec_b, 0); |
| 99 | 100 |
| 100 /* gamma * matrix */ | 101 /* gamma * matrix */ |
| 101 vec_r = _mm_mul_ps(vec_r, mat0); | 102 vec_r = _mm_mul_ps(vec_r, mat0); |
| 102 vec_g = _mm_mul_ps(vec_g, mat1); | 103 vec_g = _mm_mul_ps(vec_g, mat1); |
| 103 vec_b = _mm_mul_ps(vec_b, mat2); | 104 vec_b = _mm_mul_ps(vec_b, mat2); |
| 104 | 105 |
| 105 /* crunch, crunch, crunch */ | 106 /* crunch, crunch, crunch */ |
| 106 vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b)); | 107 vec_r = _mm_add_ps(vec_g, _mm_add_ps(vec_r, vec_b)); |
| 107 vec_r = _mm_max_ps(min, vec_r); | 108 vec_r = _mm_max_ps(min, vec_r); |
| 108 vec_r = _mm_min_ps(max, vec_r); | 109 vec_r = _mm_min_ps(max, vec_r); |
| 109 result = _mm_mul_ps(vec_r, scale); | 110 result = _mm_mul_ps(vec_r, scale); |
| 110 | 111 |
| 111 /* store calc'd output tables indices */ | 112 /* store calc'd output tables indices */ |
| 112 _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result)); | 113 _mm_store_si128((__m128i*)output, _mm_cvttps_epi32(result)); |
| 113 | 114 |
| 114 /* load for next loop while store completes */ | 115 /* load for next loop while store completes */ |
| 115 vec_r = _mm_load_ss(&igtbl_r[src[0]]); | 116 vec_r = _mm_load_ss(&igtbl_r[src[0]]); |
| 116 vec_g = _mm_load_ss(&igtbl_g[src[1]]); | 117 vec_g = _mm_load_ss(&igtbl_g[src[1]]); |
| 117 vec_b = _mm_load_ss(&igtbl_b[src[2]]); | 118 vec_b = _mm_load_ss(&igtbl_b[src[2]]); |
| 118 src += 3; | 119 src += 3; |
| 119 | 120 |
| 120 /* use calc'd indices to output RGB values */ | 121 /* use calc'd indices to output RGB values */ |
| 121 dest[r_out] = otdata_r[output[0]]; | 122 dest[r_out] = otdata_r[output[0]]; |
| 122 dest[1] = otdata_g[output[1]]; | 123 dest[1] = otdata_g[output[1]]; |
| 123 dest[b_out] = otdata_b[output[2]]; | 124 dest[b_out] = otdata_b[output[2]]; |
| 124 dest += 3; | 125 dest += 3; |
| 125 } | 126 } |
| 126 | 127 |
| 127 /* handle final (maybe only) pixel */ | 128 /* handle final (maybe only) pixel */ |
| 128 | 129 |
| 129 vec_r = _mm_shuffle_ps(vec_r, vec_r, 0); | 130 vec_r = _mm_shuffle_ps(vec_r, vec_r, 0); |
| 130 vec_g = _mm_shuffle_ps(vec_g, vec_g, 0); | 131 vec_g = _mm_shuffle_ps(vec_g, vec_g, 0); |
| 131 vec_b = _mm_shuffle_ps(vec_b, vec_b, 0); | 132 vec_b = _mm_shuffle_ps(vec_b, vec_b, 0); |
| 132 | 133 |
| 133 vec_r = _mm_mul_ps(vec_r, mat0); | 134 vec_r = _mm_mul_ps(vec_r, mat0); |
| 134 vec_g = _mm_mul_ps(vec_g, mat1); | 135 vec_g = _mm_mul_ps(vec_g, mat1); |
| 135 vec_b = _mm_mul_ps(vec_b, mat2); | 136 vec_b = _mm_mul_ps(vec_b, mat2); |
| 136 | 137 |
| 137 vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b)); | 138 vec_r = _mm_add_ps(vec_g, _mm_add_ps(vec_r, vec_b)); |
| 138 vec_r = _mm_max_ps(min, vec_r); | 139 vec_r = _mm_max_ps(min, vec_r); |
| 139 vec_r = _mm_min_ps(max, vec_r); | 140 vec_r = _mm_min_ps(max, vec_r); |
| 140 result = _mm_mul_ps(vec_r, scale); | 141 result = _mm_mul_ps(vec_r, scale); |
| 141 | 142 |
| 142 _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result)); | 143 _mm_store_si128((__m128i*)output, _mm_cvttps_epi32(result)); |
| 143 | 144 |
| 144 dest[r_out] = otdata_r[output[0]]; | 145 dest[r_out] = otdata_r[output[0]]; |
| 145 dest[1] = otdata_g[output[1]]; | 146 dest[1] = otdata_g[output[1]]; |
| 146 dest[b_out] = otdata_b[output[2]]; | 147 dest[b_out] = otdata_b[output[2]]; |
| 147 } | 148 } |
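
One subtlety worth noting in both loops above: the summation order changed from vec_r + (vec_g + vec_b) to vec_g + (vec_r + vec_b). Float addition is not associative, so the two orders can differ in the last bit; presumably the new order mirrors the scalar path so both paths compute identical indices. A contrived standalone illustration (values invented to expose the effect):

```c
#include <stdio.h>

int main(void)
{
    /* channel values chosen so the two groupings round differently */
    float r = 1.0f, g = 1e8f, b = -1e8f;

    /* old order: g + b is exactly 0, so r survives */
    printf("r + (g + b) = %g\n", r + (g + b)); /* 1 */

    /* new order: r is absorbed when added to -1e8f first */
    printf("g + (r + b) = %g\n", g + (r + b)); /* 0 */
    return 0;
}
```
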
| 148 | 149 |
| 149 void qcms_transform_data_rgba_out_lut_sse2(qcms_transform *transform, | 150 void qcms_transform_data_rgba_out_lut_sse2(qcms_transform *transform, |
| 150 unsigned char *src, | 151 unsigned char *src, |
| 151 unsigned char *dest, | 152 unsigned char *dest, |
| 152 size_t length, | 153 size_t length, |
| (...skipping 63 matching lines...) | |
| 216 /* gamma * matrix */ | 217 /* gamma * matrix */ |
| 217 vec_r = _mm_mul_ps(vec_r, mat0); | 218 vec_r = _mm_mul_ps(vec_r, mat0); |
| 218 vec_g = _mm_mul_ps(vec_g, mat1); | 219 vec_g = _mm_mul_ps(vec_g, mat1); |
| 219 vec_b = _mm_mul_ps(vec_b, mat2); | 220 vec_b = _mm_mul_ps(vec_b, mat2); |
| 220 | 221 |
| 221 /* store alpha for this pixel; load alpha for next */ | 222 /* store alpha for this pixel; load alpha for next */ |
| 222 dest[3] = alpha; | 223 dest[3] = alpha; |
| 223 alpha = src[3]; | 224 alpha = src[3]; |
| 224 | 225 |
| 225 /* crunch, crunch, crunch */ | 226 /* crunch, crunch, crunch */ |
| 226 vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b)); | 227 vec_r = _mm_add_ps(vec_g, _mm_add_ps(vec_r, vec_b)); |
| 227 vec_r = _mm_max_ps(min, vec_r); | 228 vec_r = _mm_max_ps(min, vec_r); |
| 228 vec_r = _mm_min_ps(max, vec_r); | 229 vec_r = _mm_min_ps(max, vec_r); |
| 229 result = _mm_mul_ps(vec_r, scale); | 230 result = _mm_mul_ps(vec_r, scale); |
| 230 | 231 |
| 231 /* store calc'd output tables indices */ | 232 /* store calc'd output tables indices */ |
| 232 _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result)); | 233 _mm_store_si128((__m128i*)output, _mm_cvttps_epi32(result)); |
Noel Gordon 2015/11/04 13:01:28:
Now we match the s/w case, one question: if you ro

radu.velea 2015/11/04 13:12:19:
Using SSE round (_mm_cvtps_epi32) generates the fo
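
The reviewer exchange above concerns exactly this conversion choice: _mm_cvtps_epi32 rounds to the nearest integer (ties to even under the default MXCSR rounding mode), whereas _mm_cvttps_epi32 truncates toward zero like a C integer cast. A minimal standalone sketch of where the two disagree:

```c
#include <emmintrin.h>
#include <stdio.h>

int main(void)
{
    /* lanes, low to high: 0.6, 1.5, 1.9, 2.5 */
    __m128 v = _mm_set_ps(2.5f, 1.9f, 1.5f, 0.6f);
    int rounded[4], truncated[4];

    /* round to nearest, ties to even (default rounding mode) */
    _mm_storeu_si128((__m128i *)rounded, _mm_cvtps_epi32(v));
    /* truncate toward zero, like a C (int) cast */
    _mm_storeu_si128((__m128i *)truncated, _mm_cvttps_epi32(v));

    printf("cvtps:  %d %d %d %d\n",
           rounded[0], rounded[1], rounded[2], rounded[3]);     /* 1 2 2 2 */
    printf("cvttps: %d %d %d %d\n",
           truncated[0], truncated[1], truncated[2], truncated[3]); /* 0 1 1 2 */
    return 0;
}
```
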
| 233 | 234 |
| 234 /* load gamma values for next loop while store completes */ | 235 /* load gamma values for next loop while store completes */ |
| 235 vec_r = _mm_load_ss(&igtbl_r[src[0]]); | 236 vec_r = _mm_load_ss(&igtbl_r[src[0]]); |
| 236 vec_g = _mm_load_ss(&igtbl_g[src[1]]); | 237 vec_g = _mm_load_ss(&igtbl_g[src[1]]); |
| 237 vec_b = _mm_load_ss(&igtbl_b[src[2]]); | 238 vec_b = _mm_load_ss(&igtbl_b[src[2]]); |
| 238 src += 4; | 239 src += 4; |
| 239 | 240 |
| 240 /* use calc'd indices to output RGB values */ | 241 /* use calc'd indices to output RGB values */ |
| 241 dest[r_out] = otdata_r[output[0]]; | 242 dest[r_out] = otdata_r[output[0]]; |
| 242 dest[1] = otdata_g[output[1]]; | 243 dest[1] = otdata_g[output[1]]; |
| 243 dest[b_out] = otdata_b[output[2]]; | 244 dest[b_out] = otdata_b[output[2]]; |
| 244 dest += 4; | 245 dest += 4; |
| 245 } | 246 } |
| 246 | 247 |
| 247 /* handle final (maybe only) pixel */ | 248 /* handle final (maybe only) pixel */ |
| 248 | 249 |
| 249 vec_r = _mm_shuffle_ps(vec_r, vec_r, 0); | 250 vec_r = _mm_shuffle_ps(vec_r, vec_r, 0); |
| 250 vec_g = _mm_shuffle_ps(vec_g, vec_g, 0); | 251 vec_g = _mm_shuffle_ps(vec_g, vec_g, 0); |
| 251 vec_b = _mm_shuffle_ps(vec_b, vec_b, 0); | 252 vec_b = _mm_shuffle_ps(vec_b, vec_b, 0); |
| 252 | 253 |
| 253 vec_r = _mm_mul_ps(vec_r, mat0); | 254 vec_r = _mm_mul_ps(vec_r, mat0); |
| 254 vec_g = _mm_mul_ps(vec_g, mat1); | 255 vec_g = _mm_mul_ps(vec_g, mat1); |
| 255 vec_b = _mm_mul_ps(vec_b, mat2); | 256 vec_b = _mm_mul_ps(vec_b, mat2); |
| 256 | 257 |
| 257 dest[3] = alpha; | 258 dest[3] = alpha; |
| 258 | 259 |
| 259 vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b)); | 260 vec_r = _mm_add_ps(vec_g, _mm_add_ps(vec_r, vec_b)); |
| 260 vec_r = _mm_max_ps(min, vec_r); | 261 vec_r = _mm_max_ps(min, vec_r); |
| 261 vec_r = _mm_min_ps(max, vec_r); | 262 vec_r = _mm_min_ps(max, vec_r); |
| 262 result = _mm_mul_ps(vec_r, scale); | 263 result = _mm_mul_ps(vec_r, scale); |
| 263 | 264 |
| 264 _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result)); | 265 _mm_store_si128((__m128i*)output, _mm_cvttps_epi32(result)); |
| 265 | 266 |
| 266 dest[r_out] = otdata_r[output[0]]; | 267 dest[r_out] = otdata_r[output[0]]; |
| 267 dest[1] = otdata_g[output[1]]; | 268 dest[1] = otdata_g[output[1]]; |
| 268 dest[b_out] = otdata_b[output[2]]; | 269 dest[b_out] = otdata_b[output[2]]; |
| 269 } | 270 } |
| 270 | 271 |
| 271 static inline __m128i __mm_swizzle_epi32(__m128i value, int bgra) | 272 static inline __m128i __mm_swizzle_epi32(__m128i value, int bgra) |
| 272 { | 273 { |
| 273 return bgra ? _mm_shuffle_epi32(value, _MM_SHUFFLE(0, 1, 2, 3)) : | 274 return bgra ? _mm_shuffle_epi32(value, _MM_SHUFFLE(0, 1, 2, 3)) : |
| 274 _mm_shuffle_epi32(value, _MM_SHUFFLE(0, 3, 2, 1)) ; | 275 _mm_shuffle_epi32(value, _MM_SHUFFLE(0, 3, 2, 1)) ; |
| (...skipping 173 matching lines...) | |
| 448 result = __mm_swizzle_epi32(result, bgra); | 449 result = __mm_swizzle_epi32(result, bgra); |
| 449 result = _mm_packus_epi16(result, result); | 450 result = _mm_packus_epi16(result, result); |
| 450 result = _mm_packus_epi16(result, result); | 451 result = _mm_packus_epi16(result, result); |
| 451 | 452 |
| 452 // store into uint32_t* pixel destination | 453 // store into uint32_t* pixel destination |
| 453 | 454 |
| 454 *(uint32_t *)dest = _mm_cvtsi128_si32(result); | 455 *(uint32_t *)dest = _mm_cvtsi128_si32(result); |
| 455 dest += 4; | 456 dest += 4; |
| 456 } | 457 } |
| 457 } | 458 } |
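
As a closing note on the packing tail: __mm_swizzle_epi32 drives _mm_shuffle_epi32 with _MM_SHUFFLE(z, y, x, w), which packs four 2-bit source-lane selectors, highest destination lane first, into the shuffle immediate. A small standalone sketch of the two selectors used above (lane values invented for illustration):

```c
#include <emmintrin.h>
#include <stdio.h>

int main(void)
{
    /* lanes, low to high: 10, 20, 30, 40 */
    __m128i value = _mm_set_epi32(40, 30, 20, 10);
    int out[4];

    /* _MM_SHUFFLE(0, 1, 2, 3) selects source lanes 3,2,1,0 for
       destination lanes 0,1,2,3: a full dword reversal (bgra branch) */
    _mm_storeu_si128((__m128i *)out,
                     _mm_shuffle_epi32(value, _MM_SHUFFLE(0, 1, 2, 3)));
    printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); /* 40 30 20 10 */

    /* _MM_SHUFFLE(0, 3, 2, 1) rotates lanes 1..3 down one slot and
       moves lane 0 to the top (rgba branch) */
    _mm_storeu_si128((__m128i *)out,
                     _mm_shuffle_epi32(value, _MM_SHUFFLE(0, 3, 2, 1)));
    printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); /* 20 30 40 10 */
    return 0;
}
```

The two saturating _mm_packus_epi16 calls then narrow the swizzled 32-bit lanes down to bytes before the single-pixel store via _mm_cvtsi128_si32.
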