Chromium Code Reviews| Index: src/opts/SkColorXform_opts.h |
| diff --git a/src/opts/SkColorXform_opts.h b/src/opts/SkColorXform_opts.h |
| new file mode 100644 |
| index 0000000000000000000000000000000000000000..fda6425558bb9e240be7a02a53953033befdcaf4 |
| --- /dev/null |
| +++ b/src/opts/SkColorXform_opts.h |
| @@ -0,0 +1,176 @@ |
| +/* |
| + * Copyright 2016 Google Inc. |
| + * |
| + * Use of this source code is governed by a BSD-style license that can be |
| + * found in the LICENSE file. |
| + */ |
| + |
| +#ifndef SkColorXform_opts_DEFINED |
| +#define SkColorXform_opts_DEFINED |
| + |
| +#include "SkColorPriv.h" |
| + |
| +namespace SK_OPTS_NS { |
| + |
| +// Portable (scalar) color transform for |len| pixels read from src and written to dst. |
| +// |
| +// Each source pixel holds R in bits 0-7, G in bits 8-15, B in bits 16-23 and A in bits 24-31. |
| +// Per pixel, R, G and B are: |
| +//   1. squared (a cheap stand-in for the 2.2 gamma curve), |
| +//   2. multiplied by |matrix|, where matrix[0..2], matrix[4..6] and matrix[8..10] weight the |
| +//      source R, G and B, and matrix[12..14] are added as offsets (other entries are unused), |
| +//   3. square-rooted (back to the approximate dst gamma) and clamped to [0, 255]. |
| +// Alpha is copied through unchanged and the result is packed with SkPackARGB32NoCheck(). |
| +static void color_xform_2Dot2_RGBA_to_8888_portable(uint32_t* dst, const uint32_t* src, int len, |
| +                                                    const float matrix[16]) { |
| +    while (len-- > 0) { |
| +        float srcFloats[3]; |
| +        srcFloats[0] = (float) ((*src >>  0) & 0xFF); |
| +        srcFloats[1] = (float) ((*src >>  8) & 0xFF); |
| +        srcFloats[2] = (float) ((*src >> 16) & 0xFF); |
| + |
| +        // Convert to linear. |
| +        // TODO (msarett): |
| +        // We should use X^2.2 here instead of X^2.  What is the impact on correctness? |
| +        // We should be able to get closer to 2.2 at a small performance cost. |
| +        srcFloats[0] = srcFloats[0] * srcFloats[0]; |
| +        srcFloats[1] = srcFloats[1] * srcFloats[1]; |
| +        srcFloats[2] = srcFloats[2] * srcFloats[2]; |
| + |
| +        // Convert to dst gamut. |
| +        float dstFloats[3]; |
| +        // TODO (msarett): matrix[12], matrix[13], and matrix[14] are almost always zero. |
| +        // Should we have another optimized path that avoids the extra addition when they |
| +        // are zero? |
| +        dstFloats[0] = srcFloats[0] * matrix[0] + srcFloats[1] * matrix[4] + |
| +                       srcFloats[2] * matrix[8] + matrix[12]; |
| +        dstFloats[1] = srcFloats[0] * matrix[1] + srcFloats[1] * matrix[5] + |
| +                       srcFloats[2] * matrix[9] + matrix[13]; |
| +        dstFloats[2] = srcFloats[0] * matrix[2] + srcFloats[1] * matrix[6] + |
| +                       srcFloats[2] * matrix[10] + matrix[14]; |
| + |
| +        // Convert to dst gamma and clamp to [0, 255]. |
| +        // TODO (msarett): |
| +        // We should use X^(1/2.2) here instead of X^(1/2).  What is the impact on correctness? |
| +        // We should be able to get closer to (1/2.2) at a small performance cost. |
| +        // |
| +        // Clamping is required when converting wider gamuts to smaller gamuts: the matrix can |
| +        // produce negative or > 255 results, and converting those to uint8_t is not well defined. |

mtklein
2016/06/08 01:41:40
I think clamping is always the right choice when c
msarett
2016/06/08 13:48:28
Yeah I think it's fine (and in many cases necessar
|
| +        // TODO (msarett): Clamping is not the right choice for all rendering intents.  Should we |
| +        // consider the rendering intent? |
| +        uint8_t dstBytes[3]; |
| +        for (int i = 0; i < 3; i++) { |
| +            // Negative values and NaN both fail the "> 0" test and become 0, which also keeps |
| +            // sqrtf() inside its domain. |
| +            float value = (dstFloats[i] > 0.0f) ? sqrtf(dstFloats[i]) : 0.0f; |
| +            if (value > 255.0f) { |
| +                value = 255.0f; |
| +            } |
| +            // Round to nearest. |
| +            dstBytes[i] = (uint8_t) (value + 0.5f); |
| +        } |
| + |
| +        // Each output channel gets its own transformed value; alpha passes through untouched. |
| +        *dst = SkPackARGB32NoCheck(((*src >> 24) & 0xFF), |
| +                                   dstBytes[0], |
| +                                   dstBytes[1], |
| +                                   dstBytes[2]); |
| + |
| +        dst++; |
| +        src++; |
| +    } |
| +} |
| + |
| +#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
| + |
| +// SSE2 color transform for |len| pixels read from src and written to dst. |
| +// |
| +// Pixels are handled four at a time: R, G and B (bits 0-7, 8-15 and 16-23 of each source pixel) |
| +// are squared, multiplied by |matrix| (matrix[12..14] are added as offsets), square-rooted and |
| +// clamped to [0, 255].  Alpha (bits 24-31) is copied through unchanged.  The output channel order |
| +// is selected by SK_PMCOLOR_IS_RGBA.  Any remaining pixels (fewer than four) are handed to |
| +// color_xform_2Dot2_RGBA_to_8888_portable(). |
| +static void color_xform_2Dot2_RGBA_to_8888(uint32_t* dst, const uint32_t* src, int len, |
| +                                           const float matrix[16]) { |
| +    // Create masks. |
| +    __m128i byteMask  = _mm_set1_epi32(0xFF); |
| +    __m128i alphaMask = _mm_set1_epi32(0xFF000000); |
| + |
| +    // Bounds used to clamp the output channels to the range of a byte. |
| +    const __m128 zeros   = _mm_setzero_ps(); |
| +    const __m128 maxByte = _mm_set1_ps(255.0f); |
| + |
| +    // Load transformation matrix. |
| +    __m128 rXgXbX = _mm_loadu_ps(&matrix[0]); |
| +    __m128 rYgYbY = _mm_loadu_ps(&matrix[4]); |
| +    __m128 rZgZbZ = _mm_loadu_ps(&matrix[8]); |
| +    __m128 rQgQbQ = _mm_loadu_ps(&matrix[12]); |
| + |
| +    while (len >= 4) { |
| +        // Load 4 pixels and convert them to floats. |
| +        __m128i rgba = _mm_loadu_si128((const __m128i*) src); |
| +        __m128 reds   = _mm_cvtepi32_ps(_mm_and_si128(               rgba,      byteMask)); |
| +        __m128 greens = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(rgba,  8), byteMask)); |
| +        __m128 blues  = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(rgba, 16), byteMask)); |
| + |
| +        // Convert to linear. |
| +        // FIXME (msarett): |
| +        // Should we be more accurate? |

mtklein
2016/06/08 01:41:40
Luckily we've been working on just this problem!
msarett
2016/06/08 13:48:28
Very cool, sounds good. I plan to start by landin
|
| +        reds   = _mm_mul_ps(reds, reds); |
| +        greens = _mm_mul_ps(greens, greens); |
| +        blues  = _mm_mul_ps(blues, blues); |
| + |
| +        // Apply the transformation matrix to dst gamut. |
| +        // FIXME (msarett): |
| +        // rQ, gQ, and bQ are almost always zero.  Can we save a couple instructions? |

mtklein
2016/06/08 01:41:40
Seems worthwhile, given you can do the check once
msarett
2016/06/08 13:48:28
This actually improves performance more than I sus
|
| + |
| +        // Broadcast rX, rY, rZ, and rQ, each across all four lanes of its own vector. |

mtklein
2016/06/08 01:41:41
I think I get what you're saying here, but this do
msarett
2016/06/08 13:48:28
Yes that's better :).
|
| +        __m128 rX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x00); |

mtklein
2016/06/08 01:41:41
I sort of like to write this sort of shuffle as 0x
msarett
2016/06/08 13:48:28
sgtm
|
| +        __m128 rY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x00); |
| +        __m128 rZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x00); |
| +        __m128 rQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0x00); |
| + |
| +        // dstReds = rX * reds + rY * greens + rZ * blues + rQ |
| +        __m128 dstReds = _mm_mul_ps(reds, rX); |
| +        dstReds = _mm_add_ps(dstReds, _mm_mul_ps(greens, rY)); |
| +        dstReds = _mm_add_ps(dstReds, _mm_mul_ps(blues, rZ)); |
| +        dstReds = _mm_add_ps(dstReds, rQ); |
| + |
| +        // Broadcast gX, gY, gZ, and gQ, each across all four lanes of its own vector. |
| +        __m128 gX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x55); |
| +        __m128 gY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x55); |
| +        __m128 gZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x55); |
| +        __m128 gQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0x55); |
| + |
| +        // dstGreens = gX * reds + gY * greens + gZ * blues + gQ |
| +        __m128 dstGreens = _mm_mul_ps(reds, gX); |

mtklein
2016/06/08 01:41:41
This mul-add chain makes me think we should follow
msarett
2016/06/08 13:48:28
Yes! I had the same thought.
|
| +        dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(greens, gY)); |
| +        dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(blues, gZ)); |
| +        dstGreens = _mm_add_ps(dstGreens, gQ); |
| + |
| +        // Broadcast bX, bY, bZ, and bQ, each across all four lanes of its own vector. |
| +        __m128 bX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0xAA); |
| +        __m128 bY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0xAA); |
| +        __m128 bZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0xAA); |
| +        __m128 bQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0xAA); |
| + |
| +        // dstBlues = bX * reds + bY * greens + bZ * blues + bQ |
| +        __m128 dstBlues = _mm_mul_ps(reds, bX); |
| +        dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(greens, bY)); |
| +        dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(blues, bZ)); |
| +        dstBlues = _mm_add_ps(dstBlues, bQ); |
| + |
| +        // Convert to dst gamma and clamp to [0, 255]. |
| +        // The max() runs first so the square root never sees a negative input; with the value |
| +        // as the first operand and zero as the second, a NaN lane also comes out as zero. |
| +        // FIXME (msarett): |
| +        // Should we be more accurate? |
| +        dstReds   = _mm_min_ps(_mm_sqrt_ps(_mm_max_ps(dstReds, zeros)), maxByte); |

mtklein
2016/06/08 01:41:40
You might want to try _mm_rcp_ps(_mm_rsqrt_ps(...)
msarett
2016/06/08 13:48:28
Great! Drops about 10us.
|
| +        dstGreens = _mm_min_ps(_mm_sqrt_ps(_mm_max_ps(dstGreens, zeros)), maxByte); |
| +        dstBlues  = _mm_min_ps(_mm_sqrt_ps(_mm_max_ps(dstBlues, zeros)), maxByte); |
| + |
| +        // Convert to bytes and store to memory. |
| +        // The channels were clamped to [0, 255] above, so every converted lane fits in one byte |
| +        // and the shifts/ORs below cannot spill into a neighbouring channel or into alpha. |

mtklein
2016/06/08 01:41:40
Definitely. Any overflow here will clobber neighb
msarett
2016/06/08 13:48:28
Done.
|
| +        rgba = _mm_and_si128(alphaMask, rgba); |
| +#ifdef SK_PMCOLOR_IS_RGBA |
| +        rgba = _mm_or_si128(rgba,                _mm_cvtps_epi32(dstReds)       ); |
| +        rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens),  8)); |
| +        rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstBlues),  16)); |
| +#else |
| +        rgba = _mm_or_si128(rgba,                _mm_cvtps_epi32(dstBlues)      ); |
| +        rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens),  8)); |
| +        rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstReds),   16)); |
| +#endif |
| +        _mm_storeu_si128((__m128i*) dst, rgba); |
| + |
| +        dst += 4; |
| +        src += 4; |
| +        len -= 4; |
| +    } |
| + |
| +    // Fewer than four pixels remain; finish them one at a time. |
| +    color_xform_2Dot2_RGBA_to_8888_portable(dst, src, len, matrix); |
| +} |
| + |
| +#else |
| + |
| +// Fallback used when SK_CPU_SSE_LEVEL is below SK_CPU_SSE_LEVEL_SSE2: same signature as the |
| +// SSE2 version, but simply forwards every pixel to the portable implementation. |
| +static void color_xform_2Dot2_RGBA_to_8888(uint32_t* dst, const uint32_t* src, int len, |
| + const float matrix[16]) { |
| + color_xform_2Dot2_RGBA_to_8888_portable(dst, src, len, matrix); |
| +} |
| + |
| +#endif |
| + |
| +} |
| + |
| +#endif // SkColorXform_opts_DEFINED |