OLD | NEW |
---|---|
1 /* | 1 /* |
2 * Copyright 2016 Google Inc. | 2 * Copyright 2016 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #ifndef SkColorXform_opts_DEFINED | 8 #ifndef SkColorXform_opts_DEFINED |
9 #define SkColorXform_opts_DEFINED | 9 #define SkColorXform_opts_DEFINED |
10 | 10 |
(...skipping 149 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
160 __m128 x64 = _mm_rsqrt_ps(x32); | 160 __m128 x64 = _mm_rsqrt_ps(x32); |
161 | 161 |
162 // x^(+29/64) = x^(+1/2) * x^(-1/32) * x^(-1/64) | 162 // x^(+29/64) = x^(+1/2) * x^(-1/32) * x^(-1/64) |
163 // Note that we also scale to the 0-255 range. | 163 // Note that we also scale to the 0-255 range. |
164 // These terms can be combined more minimally with 3 muls and 1 reciprocal. However, this | 164 // These terms can be combined more minimally with 3 muls and 1 reciprocal. However, this |
165 // is faster, because it allows us to start the muls in parallel with the rsqrts. | 165 // is faster, because it allows us to start the muls in parallel with the rsqrts. |
166 __m128 scale = _mm_set1_ps(255.0f); | 166 __m128 scale = _mm_set1_ps(255.0f); |
167 return _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(scale, _mm_rcp_ps(x2)), x32), _mm_rcp_ps(x64)); | 167 return _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(scale, _mm_rcp_ps(x2)), x32), _mm_rcp_ps(x64)); |
168 } | 168 } |
169 | 169 |
170 static __m128 if_then_else(__m128 mask, __m128 a, __m128 b) { | |
171 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41 | |
172 return _mm_blendv_ps(b, a, mask); | |
173 #else | |
174 return _mm_or_ps(_mm_and_ps(a, mask), _mm_andnot_ps(mask, b)); | |
mtklein
2016/06/17 17:19:40
and(mask, a) or andnot(mask, b)
msarett
2016/06/17 20:10:11
Let's just use Sk4f haha :).
| |
175 #endif | |
176 } | |
177 | |
178 // Below is a good approximation of the sRGB gamma curve (within 1 when scaled to 8-bit pixels). | |
179 // For 0.00000f <= x < 0.00349f, 12.92 * x | |
180 // For 0.00349f <= x <= 1.00000f, 0.679*(x.^0.5) + 0.423*x.^(0.25) - 0.101 | |
181 // Note that the intersection was selected to be a point where both functions produce the | |
182 // same pixel value when rounded. | |
183 static __m128 linear_to_srgb(__m128 x) { | |
184 __m128 rsqrt = _mm_rsqrt_ps(x); | |
185 __m128 sqrt = _mm_rcp_ps(rsqrt); | |
186 __m128 ftrt = _mm_rsqrt_ps(rsqrt); | |
187 | |
188 __m128 hi = _mm_add_ps(_mm_add_ps( _mm_set1_ps(-0.101115084998961f * 255.0f), | |
189 _mm_mul_ps(sqrt, _mm_set1_ps(+0.678513029959381f * 255.0f))), | |
190 _mm_mul_ps(ftrt, _mm_set1_ps(+0.422602055039580f * 255.0f))); | |
191 | |
192 __m128 lo = _mm_mul_ps(x, _mm_set1_ps(12.92f * 255.0f)); | |
193 | |
194 __m128 mask = _mm_cmplt_ps(x, _mm_set1_ps(0.00349f)); | |
195 return if_then_else(mask, lo, hi); | |
196 } | |
197 | |
170 static __m128 clamp_0_to_255(__m128 x) { | 198 static __m128 clamp_0_to_255(__m128 x) { |
171 // The order of the arguments is important here. We want to make sure that NaN | 199 // The order of the arguments is important here. We want to make sure that NaN |
172 // clamps to zero. Note that max(NaN, 0) = 0, while max(0, NaN) = NaN. | 200 // clamps to zero. Note that max(NaN, 0) = 0, while max(0, NaN) = NaN. |
173 return _mm_min_ps(_mm_max_ps(x, _mm_setzero_ps()), _mm_set1_ps(255.0f)); | 201 return _mm_min_ps(_mm_max_ps(x, _mm_setzero_ps()), _mm_set1_ps(255.0f)); |
174 } | 202 } |
175 | 203 |
176 template <const float (&linear_from_curve)[256]> | 204 template <const float (&linear_from_curve)[256], __m128 (*linear_to_curve)(__m128)> |
177 static void color_xform_RGB1(uint32_t* dst, const uint32_t* src, int len, | 205 static void color_xform_RGB1(uint32_t* dst, const uint32_t* src, int len, |
178 const float matrix[16]) { | 206 const float matrix[16]) { |
179 // Load transformation matrix. | 207 // Load transformation matrix. |
180 __m128 rXgXbX = _mm_loadu_ps(&matrix[0]); | 208 __m128 rXgXbX = _mm_loadu_ps(&matrix[0]); |
181 __m128 rYgYbY = _mm_loadu_ps(&matrix[4]); | 209 __m128 rYgYbY = _mm_loadu_ps(&matrix[4]); |
182 __m128 rZgZbZ = _mm_loadu_ps(&matrix[8]); | 210 __m128 rZgZbZ = _mm_loadu_ps(&matrix[8]); |
183 | 211 |
184 while (len >= 4) { | 212 while (len >= 4) { |
185 // Convert to linear. The look-up table has perfect accuracy. | 213 // Convert to linear. The look-up table has perfect accuracy. |
186 __m128 reds = _mm_setr_ps(linear_from_curve[(src[0] >> 0) & 0xFF], | 214 __m128 reds = _mm_setr_ps(linear_from_curve[(src[0] >> 0) & 0xFF], |
(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
221 __m128 bX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0xAA); | 249 __m128 bX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0xAA); |
222 __m128 bY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0xAA); | 250 __m128 bY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0xAA); |
223 __m128 bZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0xAA); | 251 __m128 bZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0xAA); |
224 | 252 |
225 // dstBlues = bX * reds + bY * greens + bZ * blues | 253 // dstBlues = bX * reds + bY * greens + bZ * blues |
226 __m128 dstBlues = _mm_mul_ps(reds, bX); | 254 __m128 dstBlues = _mm_mul_ps(reds, bX); |
227 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(greens, bY)); | 255 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(greens, bY)); |
228 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(blues, bZ)); | 256 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(blues, bZ)); |
229 | 257 |
230 // Convert to dst gamma. | 258 // Convert to dst gamma. |
231 dstReds = linear_to_2dot2(dstReds); | 259 dstReds = linear_to_curve(dstReds); |
232 dstGreens = linear_to_2dot2(dstGreens); | 260 dstGreens = linear_to_curve(dstGreens); |
233 dstBlues = linear_to_2dot2(dstBlues); | 261 dstBlues = linear_to_curve(dstBlues); |
234 | 262 |
235 // Clamp floats. | 263 // Clamp floats. |
236 dstReds = clamp_0_to_255(dstReds); | 264 dstReds = clamp_0_to_255(dstReds); |
237 dstGreens = clamp_0_to_255(dstGreens); | 265 dstGreens = clamp_0_to_255(dstGreens); |
238 dstBlues = clamp_0_to_255(dstBlues); | 266 dstBlues = clamp_0_to_255(dstBlues); |
239 | 267 |
240 // Convert to bytes and store to memory. | 268 // Convert to bytes and store to memory. |
241 __m128i rgba = _mm_set1_epi32(0xFF000000); | 269 __m128i rgba = _mm_set1_epi32(0xFF000000); |
242 rgba = _mm_or_si128(rgba, _mm_cvtps_epi32(dstReds) ); | 270 rgba = _mm_or_si128(rgba, _mm_cvtps_epi32(dstReds) ); |
243 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens), 8) ); | 271 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens), 8) ); |
(...skipping 10 matching lines...) Expand all Loading... | |
254 __m128 r = _mm_set1_ps(linear_from_curve[(src[0] >> 0) & 0xFF]), | 282 __m128 r = _mm_set1_ps(linear_from_curve[(src[0] >> 0) & 0xFF]), |
255 g = _mm_set1_ps(linear_from_curve[(src[0] >> 8) & 0xFF]), | 283 g = _mm_set1_ps(linear_from_curve[(src[0] >> 8) & 0xFF]), |
256 b = _mm_set1_ps(linear_from_curve[(src[0] >> 16) & 0xFF]); | 284 b = _mm_set1_ps(linear_from_curve[(src[0] >> 16) & 0xFF]); |
257 | 285 |
258 // Apply the transformation matrix to dst gamut. | 286 // Apply the transformation matrix to dst gamut. |
259 __m128 dstPixel = _mm_mul_ps(r, rXgXbX); | 287 __m128 dstPixel = _mm_mul_ps(r, rXgXbX); |
260 dstPixel = _mm_add_ps(dstPixel, _mm_mul_ps(g, rYgYbY)); | 288 dstPixel = _mm_add_ps(dstPixel, _mm_mul_ps(g, rYgYbY)); |
261 dstPixel = _mm_add_ps(dstPixel, _mm_mul_ps(b, rZgZbZ)); | 289 dstPixel = _mm_add_ps(dstPixel, _mm_mul_ps(b, rZgZbZ)); |
262 | 290 |
263 // Convert to dst gamma. | 291 // Convert to dst gamma. |
264 dstPixel = linear_to_2dot2(dstPixel); | 292 dstPixel = linear_to_curve(dstPixel); |
265 | 293 |
266 // Clamp floats to 0-255 range. | 294 // Clamp floats to 0-255 range. |
267 dstPixel = clamp_0_to_255(dstPixel); | 295 dstPixel = clamp_0_to_255(dstPixel); |
268 | 296 |
269 // Convert to bytes and store to memory. | 297 // Convert to bytes and store to memory. |
270 __m128i dstInts = _mm_cvtps_epi32(dstPixel); | 298 __m128i dstInts = _mm_cvtps_epi32(dstPixel); |
271 __m128i dstBytes = _mm_packus_epi16(_mm_packus_epi16(dstInts, dstInts), dstInts); | 299 __m128i dstBytes = _mm_packus_epi16(_mm_packus_epi16(dstInts, dstInts), dstInts); |
272 dstBytes = _mm_or_si128(_mm_set1_epi32(0xFF000000), dstBytes); | 300 dstBytes = _mm_or_si128(_mm_set1_epi32(0xFF000000), dstBytes); |
273 _mm_store_ss((float*) dst, _mm_castsi128_ps(dstBytes)); | 301 _mm_store_ss((float*) dst, _mm_castsi128_ps(dstBytes)); |
274 | 302 |
275 dst += 1; | 303 dst += 1; |
276 src += 1; | 304 src += 1; |
277 len -= 1; | 305 len -= 1; |
278 } | 306 } |
279 } | 307 } |
280 | 308 |
281 #else | 309 #else |
282 | 310 |
311 static float linear_to_2dot2(float v) { | |
312 return powf(v, (1.0f / 2.2f)) * 255.0f; | |
313 } | |
314 | |
315 static float linear_to_srgb(float v) { | |
316 if (v < 0.0031308f) { | |
317 return (12.92f * v) * 255.0f; | |
318 } else { | |
319 return ((1.055f * powf(v, (1.0f / 2.4f))) - 0.055f) * 255.0f; | |
320 } | |
321 } | |
322 | |
283 static uint8_t clamp_float_to_byte(float v) { | 323 static uint8_t clamp_float_to_byte(float v) { |
284 // The ordering of the logic is a little strange here in order | 324 // The ordering of the logic is a little strange here in order |
285 // to make sure we convert NaNs to 0. | 325 // to make sure we convert NaNs to 0. |
286 if (v >= 254.5f) { | 326 if (v >= 254.5f) { |
287 return 255; | 327 return 255; |
288 } else if (v >= 0.5f) { | 328 } else if (v >= 0.5f) { |
289 return (uint8_t) (v + 0.5f); | 329 return (uint8_t) (v + 0.5f); |
290 } else { | 330 } else { |
291 return 0; | 331 return 0; |
292 } | 332 } |
293 } | 333 } |
294 | 334 |
295 template <const float (&linear_from_curve)[256]> | 335 template <const float (&linear_from_curve)[256], float(*linear_to_curve)(float)> |
296 static void color_xform_RGB1(uint32_t* dst, const uint32_t* src, int len, | 336 static void color_xform_RGB1(uint32_t* dst, const uint32_t* src, int len, |
297 const float matrix[16]) { | 337 const float matrix[16]) { |
298 while (len-- > 0) { | 338 while (len-- > 0) { |
299 // Convert to linear. | 339 // Convert to linear. |
300 float srcFloats[3]; | 340 float srcFloats[3]; |
301 srcFloats[0] = linear_from_curve[(*src >> 0) & 0xFF]; | 341 srcFloats[0] = linear_from_curve[(*src >> 0) & 0xFF]; |
302 srcFloats[1] = linear_from_curve[(*src >> 8) & 0xFF]; | 342 srcFloats[1] = linear_from_curve[(*src >> 8) & 0xFF]; |
303 srcFloats[2] = linear_from_curve[(*src >> 16) & 0xFF]; | 343 srcFloats[2] = linear_from_curve[(*src >> 16) & 0xFF]; |
304 | 344 |
305 // Convert to dst gamut. | 345 // Convert to dst gamut. |
306 float dstFloats[3]; | 346 float dstFloats[3]; |
307 dstFloats[0] = srcFloats[0] * matrix[0] + srcFloats[1] * matrix[4] + | 347 dstFloats[0] = srcFloats[0] * matrix[0] + srcFloats[1] * matrix[4] + |
308 srcFloats[2] * matrix[8]; | 348 srcFloats[2] * matrix[8]; |
309 dstFloats[1] = srcFloats[0] * matrix[1] + srcFloats[1] * matrix[5] + | 349 dstFloats[1] = srcFloats[0] * matrix[1] + srcFloats[1] * matrix[5] + |
310 srcFloats[2] * matrix[9]; | 350 srcFloats[2] * matrix[9]; |
311 dstFloats[2] = srcFloats[0] * matrix[2] + srcFloats[1] * matrix[6] + | 351 dstFloats[2] = srcFloats[0] * matrix[2] + srcFloats[1] * matrix[6] + |
312 srcFloats[2] * matrix[10]; | 352 srcFloats[2] * matrix[10]; |
313 | 353 |
314 // Convert to dst gamma. | 354 // Convert to dst gamma. |
315 // Note: pow is really, really slow. We will suffer when SSE2 is not supported. | 355 // Note: pow is really, really slow. We will suffer when SSE2 is not supported. |
316 dstFloats[0] = powf(dstFloats[0], (1/2.2f)) * 255.0f; | 356 dstFloats[0] = linear_to_curve(dstFloats[0]); |
317 dstFloats[1] = powf(dstFloats[1], (1/2.2f)) * 255.0f; | 357 dstFloats[1] = linear_to_curve(dstFloats[1]); |
318 dstFloats[2] = powf(dstFloats[2], (1/2.2f)) * 255.0f; | 358 dstFloats[2] = linear_to_curve(dstFloats[2]); |
319 | 359 |
320 *dst = (0xFF << 24) | | 360 *dst = (0xFF << 24) | |
321 (clamp_float_to_byte(dstFloats[2]) << 16) | | 361 (clamp_float_to_byte(dstFloats[2]) << 16) | |
322 (clamp_float_to_byte(dstFloats[1]) << 8) | | 362 (clamp_float_to_byte(dstFloats[1]) << 8) | |
323 (clamp_float_to_byte(dstFloats[0]) << 0); | 363 (clamp_float_to_byte(dstFloats[0]) << 0); |
324 | 364 |
325 dst++; | 365 dst++; |
326 src++; | 366 src++; |
327 } | 367 } |
328 } | 368 } |
329 | 369 |
330 #endif | 370 #endif |
331 | 371 |
332 static void color_xform_RGB1_srgb_to_2dot2(uint32_t* dst, const uint32_t* src, int len, | 372 static void color_xform_RGB1_srgb_to_2dot2(uint32_t* dst, const uint32_t* src, int len, |
333 const float matrix[16]) { | 373 const float matrix[16]) { |
334 color_xform_RGB1<linear_from_srgb>(dst, src, len, matrix); | 374 color_xform_RGB1<linear_from_srgb, linear_to_2dot2>(dst, src, len, matrix); |
335 } | 375 } |
336 | 376 |
337 static void color_xform_RGB1_2dot2_to_2dot2(uint32_t* dst, const uint32_t* src, int len, | 377 static void color_xform_RGB1_2dot2_to_2dot2(uint32_t* dst, const uint32_t* src, int len, |
338 const float matrix[16]) { | 378 const float matrix[16]) { |
339 color_xform_RGB1<linear_from_2dot2>(dst, src, len, matrix); | 379 color_xform_RGB1<linear_from_2dot2, linear_to_2dot2>(dst, src, len, matrix); |
380 } | |
381 | |
382 static void color_xform_RGB1_srgb_to_srgb(uint32_t* dst, const uint32_t* src, int len, | |
383 const float matrix[16]) { | |
384 color_xform_RGB1<linear_from_srgb, linear_to_srgb>(dst, src, len, matrix); | |
385 } | |
386 | |
387 static void color_xform_RGB1_2dot2_to_srgb(uint32_t* dst, const uint32_t* src, int len, | |
388 const float matrix[16]) { | |
389 color_xform_RGB1<linear_from_2dot2, linear_to_srgb>(dst, src, len, matrix); | |
340 } | 390 } |
341 | 391 |
342 } | 392 } |
343 | 393 |
344 #endif // SkColorXform_opts_DEFINED | 394 #endif // SkColorXform_opts_DEFINED |
OLD | NEW |