src/opts/SkColorXform_opts.h - Issue 2046013002: Optimize color xforms with 2.2 gammas for SSE2

Unified Diff: src/opts/SkColorXform_opts.h

Issue 2046013002: Optimize color xforms with 2.2 gammas for SSE2 (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: Created 4 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/opts/SkColorXform_opts.h

diff --git a/src/opts/SkColorXform_opts.h b/src/opts/SkColorXform_opts.h

new file mode 100644

index 0000000000000000000000000000000000000000..fda6425558bb9e240be7a02a53953033befdcaf4

--- /dev/null

+++ b/src/opts/SkColorXform_opts.h

@@ -0,0 +1,176 @@

+/*

+ *

+ * Use of this source code is governed by a BSD-style license that can be

+ * found in the LICENSE file.

+ */

+#ifndef SkColorXform_opts_DEFINED

+#define SkColorXform_opts_DEFINED

+#include "SkColorPriv.h"

+namespace SK_OPTS_NS {

+static void color_xform_2Dot2_RGBA_to_8888_portable(uint32_t* dst, const uint32_t* src, int len,

+ const float matrix[16]) {

+ while (len-- > 0) {

+ float srcFloats[3];

+ srcFloats[0] = (float) ((*src >> 0) & 0xFF);

+ srcFloats[1] = (float) ((*src >> 8) & 0xFF);

+ srcFloats[2] = (float) ((*src >> 16) & 0xFF);

+ // Convert to linear.

+ // TODO (msarett):

+ // We should use X^2.2 here instead of X^2. What is the impact on correctness?

+ // We should be able to get closer to 2.2 at a small performance cost.

+ srcFloats[0] = srcFloats[0] * srcFloats[0];

+ srcFloats[1] = srcFloats[1] * srcFloats[1];

+ srcFloats[2] = srcFloats[2] * srcFloats[2];

+ // Convert to dst gamut.

+ float dstFloats[3];

+ // TODO (msarett): matrix[12], matrix[13], and matrix[14] are almost always zero.

+ // Should we have another optimized path that avoids the extra addition when they

+ // are zero?

+ dstFloats[0] = srcFloats[0] * matrix[0] + srcFloats[1] * matrix[4] +

+ srcFloats[2] * matrix[8] + matrix[12];

+ dstFloats[1] = srcFloats[0] * matrix[1] + srcFloats[1] * matrix[5] +

+ srcFloats[2] * matrix[9] + matrix[13];

+ dstFloats[2] = srcFloats[0] * matrix[2] + srcFloats[1] * matrix[6] +

+ srcFloats[2] * matrix[10] + matrix[14];

+ // Convert to dst gamma.

+ // TODO (msarett):

+ // We should use X^(1/2.2) here instead of X^(1/2). What is the impact on correctness?

+ // We should be able to get closer to (1/2.2) at a small performance cost.

+ dstFloats[0] = sqrtf(dstFloats[0]);

+ dstFloats[1] = sqrtf(dstFloats[1]);

+ dstFloats[2] = sqrtf(dstFloats[2]);

+ // FIXME (msarett)

+ // We should clamp here, but I intentionally don't. Just curious to see what happens,

+ // since I think it's unnecessary for most of my current tests. It will definitely be

+ // necessary when converting wider gamuts to smaller gamuts. Should we always clamp,

+ // or is there a performance reason to clamp only when necessary? Also, clamping is not

mtklein 2016/06/08 01:41:40 I think clamping is always the right choice when c

msarett 2016/06/08 13:48:28 Yeah I think it's fine (and in many cases necessar

+ // the right choice for all rendering intents. Should we consider the rendering intent?

+ *dst = SkPackARGB32NoCheck(((*src >> 24) & 0xFF),

+ (uint8_t) (dstFloats[0] + 0.5f),

+ (uint8_t) (dstFloats[0] + 0.5f));

+ dst++;

+ src++;

+ }

+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2

+static void color_xform_2Dot2_RGBA_to_8888(uint32_t* dst, const uint32_t* src, int len,

+ const float matrix[16]) {

+ // Create masks.

+ __m128i byteMask = _mm_set1_epi32(0xFF);

+ __m128i alphaMask = _mm_set1_epi32(0xFF000000);

+ // Load transformation matrix.

+ __m128 rXgXbX = _mm_loadu_ps(&matrix[0]);

+ __m128 rYgYbY = _mm_loadu_ps(&matrix[4]);

+ __m128 rZgZbZ = _mm_loadu_ps(&matrix[8]);

+ __m128 rQgQbQ = _mm_loadu_ps(&matrix[12]);

+ while (len >= 4) {

+ // Load 4 pixels and convert them to floats.

+ __m128i rgba = _mm_loadu_si128((const __m128i*) src);

+ __m128 reds = _mm_cvtepi32_ps(_mm_and_si128( rgba, byteMask));

+ __m128 greens = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(rgba, 8), byteMask));

+ __m128 blues = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(rgba, 16), byteMask));

+ // Convert to linear.

+ // FIXME (msarett):

+ // Should we be more accurate?

mtklein 2016/06/08 01:41:40 Luckily we've been working on just this problem!

msarett 2016/06/08 13:48:28 Very cool, sounds good. I plan to start by landin

+ reds = _mm_mul_ps(reds, reds);

+ greens = _mm_mul_ps(greens, greens);

+ blues = _mm_mul_ps(blues, blues);

+ // Apply the transformation matrix to dst gamut.

+ // FIXME (msarett):

+ // rQ, gQ, and bQ are almost always zero. Can we save a couple instructions?

mtklein 2016/06/08 01:41:40 Seems worthwhile, given you can do the check once

msarett 2016/06/08 13:48:28 This actually improves performance more than I sus

+ // Copy each of rX, rY, rZ, and rQ across its own vector.

mtklein 2016/06/08 01:41:41 I think I get what you're saying here, but this do

msarett 2016/06/08 13:48:28 Yes that's better :).

+ __m128 rX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0);

mtklein 2016/06/08 01:41:41 I sort of like to write this sort of shuffle as 0x

msarett 2016/06/08 13:48:28 sgtm

+ __m128 rY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0);

+ __m128 rZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0);

+ __m128 rQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0);

+ // dstReds = rX * reds + rY * greens + rZ * blues + rQ

+ __m128 dstReds = _mm_mul_ps(reds, rX);

+ dstReds = _mm_add_ps(dstReds, _mm_mul_ps(greens, rY));

+ dstReds = _mm_add_ps(dstReds, _mm_mul_ps(blues, rZ));

+ dstReds = _mm_add_ps(dstReds, rQ);

+ // Copy each of gX, gY, gZ, and gQ across its own vector.

+ __m128 gX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x55);

+ __m128 gY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x55);

+ __m128 gZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x55);

+ __m128 gQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0x55);

+ // dstGreens = gX * reds + gY * greens + gZ * blues + gQ

+ __m128 dstGreens = _mm_mul_ps(reds, gX);

mtklein 2016/06/08 01:41:41 This mul-add chain makes me think we should follow

msarett 2016/06/08 13:48:28 Yes! I had the same thought.

+ dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(greens, gY));

+ dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(blues, gZ));

+ dstGreens = _mm_add_ps(dstGreens, gQ);

+ // Copy each of bX, bY, bZ, and bQ across its own vector.

+ __m128 bX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0xAA);

+ __m128 bY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0xAA);

+ __m128 bZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0xAA);

+ __m128 bQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0xAA);

+ // dstBlues = bX * reds + bY * greens + bZ * blues + bQ

+ __m128 dstBlues = _mm_mul_ps(reds, bX);

+ dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(greens, bY));

+ dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(blues, bZ));

+ dstBlues = _mm_add_ps(dstBlues, bQ);

+ // Convert to dst gamma.

+ // FIXME (msarett):

+ // Should we be more accurate?

+ dstReds = _mm_sqrt_ps(dstReds);

mtklein 2016/06/08 01:41:40 You might want to try _mm_rcp_ps(_mm_rsqrt_ps(...)

msarett 2016/06/08 13:48:28 Great! Drops about 10us.

+ dstGreens = _mm_sqrt_ps(dstGreens);

+ dstBlues = _mm_sqrt_ps(dstBlues);

+ // Convert to bytes and store to memory.

+ // FIXME (msarett):

+ // We need to clamp.

mtklein 2016/06/08 01:41:40 Definitely. Any overflow here will clobber neighb

msarett 2016/06/08 13:48:28 Done.

+ rgba = _mm_and_si128(alphaMask, rgba);

+#ifdef SK_PMCOLOR_IS_RGBA

+ rgba = _mm_or_si128(rgba, _mm_cvtps_epi32(dstReds) );

+ rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens), 8));

+ rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstBlues), 16));

+#else

+ rgba = _mm_or_si128(rgba, _mm_cvtps_epi32(dstBlues) );

+ rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens), 8));

+ rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstReds), 16));

+#endif

+ _mm_storeu_si128((__m128i*) dst, rgba);

+ dst += 4;

+ src += 4;

+ len -= 4;

+ }

+ color_xform_2Dot2_RGBA_to_8888_portable(dst, src, len, matrix);

+#else

+static void color_xform_2Dot2_RGBA_to_8888(uint32_t* dst, const uint32_t* src, int len,

+ const float matrix[16]) {

+ color_xform_2Dot2_RGBA_to_8888_portable(dst, src, len, matrix);

+#endif

+#endif // SkColorXform_opts_DEFINED

« src/core/SkOpts.h ('K') | « src/core/SkOpts.cpp ('k') | src/opts/SkOpts_ssse3.cpp » ('j') | src/opts/SkOpts_ssse3.cpp » ('J')