src/opts/SkColorXform_opts.h - Issue 2046013002: Optimize color xforms with 2.2 gammas for SSE2

Side by Side Diff: src/opts/SkColorXform_opts.h

Issue 2046013002: Optimize color xforms with 2.2 gammas for SSE2 (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: Created 4 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 /*

	2 * Copyright 2016 Google Inc.

	3 *

	4 * Use of this source code is governed by a BSD-style license that can be

	5 * found in the LICENSE file.

	6 */

	7

	8 #ifndef SkColorXform_opts_DEFINED

	9 #define SkColorXform_opts_DEFINED

	10

	11 #include "SkColorPriv.h"

	12

	13 namespace SK_OPTS_NS {

	14

	15 static void color_xform_2Dot2_RGBA_to_8888_portable(uint32_t* dst, const uint32_t* src, int len,

	16 const float matrix[16]) {

	17 while (len-- > 0) {

	18 float srcFloats[3];

	19 srcFloats[0] = (float) ((*src >> 0) & 0xFF);

	20 srcFloats[1] = (float) ((*src >> 8) & 0xFF);

	21 srcFloats[2] = (float) ((*src >> 16) & 0xFF);

	22

	23 // Convert to linear.

	24 // TODO (msarett):

	25 // We should use X^2.2 here instead of X^2. What is the impact on correctness?

	26 // We should be able to get closer to 2.2 at a small performance cost.

	27 srcFloats[0] = srcFloats[0] * srcFloats[0];

	28 srcFloats[1] = srcFloats[1] * srcFloats[1];

	29 srcFloats[2] = srcFloats[2] * srcFloats[2];

	30

	31 // Convert to dst gamut.

	32 float dstFloats[3];

	33 // TODO (msarett): matrix[12], matrix[13], and matrix[14] are almost always zero.

	34 // Should we have another optimized path that avoids the extra addition when they

	35 // are zero?

	36 dstFloats[0] = srcFloats[0] * matrix[0] + srcFloats[1] * matrix[4] +

	37 srcFloats[2] * matrix[8] + matrix[12];

	38 dstFloats[1] = srcFloats[0] * matrix[1] + srcFloats[1] * matrix[5] +

	39 srcFloats[2] * matrix[9] + matrix[13];

	40 dstFloats[2] = srcFloats[0] * matrix[2] + srcFloats[1] * matrix[6] +

	41 srcFloats[2] * matrix[10] + matrix[14];

	42

	43 // Convert to dst gamma.

	44 // TODO (msarett):

	45 // We should use X^(1/2.2) here instead of X^(1/2). What is the impact on correctness?

	46 // We should be able to get closer to (1/2.2) at a small performance cost.

	47 dstFloats[0] = sqrtf(dstFloats[0]);

	48 dstFloats[1] = sqrtf(dstFloats[1]);

	49 dstFloats[2] = sqrtf(dstFloats[2]);

	50

	51 // FIXME (msarett)

	52 // We should clamp here, but I intentionally don't. Just curious to see what happens,

	53 // since I think it's unnecessary for most of my current tests. It will definitely be

	54 // necessary when converting wider gamuts to smaller gamuts. Should we always clamp,

	55 // or is there a performance reason to clamp only when necessary? Also, clamping is not
	mtklein 2016/06/08 01:41:40 I think clamping is always the right choice when c I think clamping is always the right choice when converting down to bytes. If you're keeping these in float or half, sure, you can get philosophical about whether or when to clamp. But it really makes no sense to claim you're converting 256.3 to a byte. msarett 2016/06/08 13:48:28 Yeah I think it's fine (and in many cases necessar Show quoted text On 2016/06/08 01:41:40, mtklein wrote: > I think clamping is always the right choice when converting down to bytes. If > you're keeping these in float or half, sure, you can get philosophical about > whether or when to clamp. But it really makes no sense to claim you're > converting 256.3 to a byte. Yeah I think it's fine (and in many cases necessary) to clamp. I was suggesting that, in some cases, the form of the srcToDst matrix might guarantee that the final result will be between 0 and 255. I've added clamping. It adds about 20us to xform time.
	56 // the right choice for all rendering intents. Should we consider the rendering intent?

	57 *dst = SkPackARGB32NoCheck(((*src >> 24) & 0xFF),

	58 (uint8_t) (dstFloats[0] + 0.5f),

	59 (uint8_t) (dstFloats[0] + 0.5f),

	60 (uint8_t) (dstFloats[0] + 0.5f));

	61

	62 dst++;

	63 src++;

	64 }

	65 }

	66

	67 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2

	68

	69 static void color_xform_2Dot2_RGBA_to_8888(uint32_t* dst, const uint32_t* src, int len,

	70 const float matrix[16]) {

	71 // Create masks.

	72 __m128i byteMask = _mm_set1_epi32(0xFF);

	73 __m128i alphaMask = _mm_set1_epi32(0xFF000000);

	74

	75 // Load transformation matrix.

	76 __m128 rXgXbX = _mm_loadu_ps(&matrix[0]);

	77 __m128 rYgYbY = _mm_loadu_ps(&matrix[4]);

	78 __m128 rZgZbZ = _mm_loadu_ps(&matrix[8]);

	79 __m128 rQgQbQ = _mm_loadu_ps(&matrix[12]);

	80

	81 while (len >= 4) {

	82 // Load 4 pixels and convert them to floats.

	83 __m128i rgba = _mm_loadu_si128((const __m128i*) src);

	84 __m128 reds = _mm_cvtepi32_ps(_mm_and_si128( rgba, byteMask));

	85 __m128 greens = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(rgba, 8), byteMask));

	86 __m128 blues = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(rgba, 16), byteMask));

	87

	88 // Convert to linear.

	89 // FIXME (msarett):

	90 // Should we be more accurate?
	mtklein 2016/06/08 01:41:40 Luckily we've been working on just this problem! Luckily we've been working on just this problem! Grab me tomorrow and I can walk you through all the options for speed/accuracy tradeoffs. Brian Osman will be able to give us the exact constants once we pick a particular formula skeletons. msarett 2016/06/08 13:48:28 Very cool, sounds good. I plan to start by landin Show quoted text On 2016/06/08 01:41:40, mtklein wrote: > Luckily we've been working on just this problem! Grab me tomorrow and I can > walk you through all the options for speed/accuracy tradeoffs. Brian Osman will > be able to give us the exact constants once we pick a particular formula > skeletons. Very cool, sounds good. I plan to start by landing this (to see diffs on Gold), then to start to mess with the formulas until we are at least as accurate as QCMS.
	91 reds = _mm_mul_ps(reds, reds);

	92 greens = _mm_mul_ps(greens, greens);

	93 blues = _mm_mul_ps(blues, blues);

	94

	95 // Apply the transformation matrix to dst gamut.

	96 // FIXME (msarett):

	97 // rQ, gQ, and bQ are almost always zero. Can we save a couple instructions?
	mtklein 2016/06/08 01:41:40 Seems worthwhile, given you can do the check once Seems worthwhile, given you can do the check once outside the pixel loop. msarett 2016/06/08 13:48:28 This actually improves performance more than I sus Show quoted text On 2016/06/08 01:41:40, mtklein wrote: > Seems worthwhile, given you can do the check once outside the pixel loop. This actually improves performance more than I suspected. Minus about 25us. Maybe it allows us to do a better job of keeping these constants in the registers. I think I'll leave the TODO and handle in a follow-up. I think we'll need to use a different proc (or template this one)...
	98

	99 // Copy rX, rY, rZ, and rQ across their vector own vectors.
	mtklein 2016/06/08 01:41:41 I think I get what you're saying here, but this do I think I get what you're saying here, but this doesn't strike me as well-formed English. // Splat rX, rY, rZ, and rQ each across a register. ? msarett 2016/06/08 13:48:28 Yes that's better :). Show quoted text On 2016/06/08 01:41:41, mtklein wrote: > I think I get what you're saying here, but this doesn't strike me as well-formed > English. > > // Splat rX, rY, rZ, and rQ each across a register. > > ? Yes that's better :).
	100 __m128 rX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0);
	mtklein 2016/06/08 01:41:41 I sort of like to write this sort of shuffle as 0x I sort of like to write this sort of shuffle as 0x00 as a reminder it's a bit pattern. And of course I'll prefer 0b00000000 once C++14 rolls around. :) msarett 2016/06/08 13:48:28 sgtm Show quoted text On 2016/06/08 01:41:41, mtklein wrote: > I sort of like to write this sort of shuffle as 0x00 as a reminder it's a bit > pattern. And of course I'll prefer 0b00000000 once C++14 rolls around. :) sgtm
	101 __m128 rY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0);

	102 __m128 rZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0);

	103 __m128 rQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0);

	104

	105 // dstReds = rX * reds + rY * greens + rZ * blues + rQ

	106 __m128 dstReds = _mm_mul_ps(reds, rX);

	107 dstReds = _mm_add_ps(dstReds, _mm_mul_ps(greens, rY));

	108 dstReds = _mm_add_ps(dstReds, _mm_mul_ps(blues, rZ));

	109 dstReds = _mm_add_ps(dstReds, rQ);

	110

	111 // Copy gX, gY, gZ, and gQ across their vector own vectors.

	112 __m128 gX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x55);

	113 __m128 gY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x55);

	114 __m128 gZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x55);

	115 __m128 gQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0x55);

	116

	117 // dstGreens = gX * reds + gY * greens + gZ * blues + gQ

	118 __m128 dstGreens = _mm_mul_ps(reds, gX);
	mtklein 2016/06/08 01:41:41 This mul-add chain makes me think we should follow This mul-add chain makes me think we should follow up with versions using AVX and/or AVX+FMA (≈AVX2). msarett 2016/06/08 13:48:28 Yes! I had the same thought. Show quoted text On 2016/06/08 01:41:41, mtklein wrote: > This mul-add chain makes me think we should follow up with versions using AVX > and/or AVX+FMA (≈AVX2). Yes! I had the same thought.
	119 dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(greens, gY));

	120 dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(blues, gZ));

	121 dstGreens = _mm_add_ps(dstGreens, gQ);

	122

	123 // Copy bX, bY, bZ, and bQ across their vector own vectors.

	124 __m128 bX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0xAA);

	125 __m128 bY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0xAA);

	126 __m128 bZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0xAA);

	127 __m128 bQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0xAA);

	128

	129 // dstBlues = bX * reds + bY * greens + bZ * blues + bQ

	130 __m128 dstBlues = _mm_mul_ps(reds, bX);

	131 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(greens, bY));

	132 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(blues, bZ));

	133 dstBlues = _mm_add_ps(dstBlues, bQ);

	134

	135 // Convert to dst gamma.

	136 // FIXME (msarett):

	137 // Should we be more accurate?

	138 dstReds = _mm_sqrt_ps(dstReds);
	mtklein 2016/06/08 01:41:40 You might want to try _mm_rcp_ps(_mm_rsqrt_ps(...) You might want to try _mm_rcp_ps(_mm_rsqrt_ps(...)) as a faster sqrt. It should be precise to 11 bits, and very noticeably faster, at least 2x and on some machines up to something like 7x. msarett 2016/06/08 13:48:28 Great! Drops about 10us. Show quoted text On 2016/06/08 01:41:40, mtklein wrote: > You might want to try _mm_rcp_ps(_mm_rsqrt_ps(...)) as a faster sqrt. It should > be precise to 11 bits, and very noticeably faster, at least 2x and on some > machines up to something like 7x. Great! Drops about 10us.
	139 dstGreens = _mm_sqrt_ps(dstGreens);

	140 dstBlues = _mm_sqrt_ps(dstBlues);

	141

	142 // Convert to bytes and store to memory.

	143 // FIXME (msarett):

	144 // We need to clamp.
	mtklein 2016/06/08 01:41:40 Definitely. Any overflow here will clobber neighb Definitely. Any overflow here will clobber neighboring channels. msarett 2016/06/08 13:48:28 Done. Show quoted text On 2016/06/08 01:41:40, mtklein wrote: > Definitely. Any overflow here will clobber neighboring channels. Done.
	145 rgba = _mm_and_si128(alphaMask, rgba);

	146 #ifdef SK_PMCOLOR_IS_RGBA

	147 rgba = _mm_or_si128(rgba, _mm_cvtps_epi32(dstReds) );

	148 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens), 8) );

	149 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstBlues), 16) );

	150 #else

	151 rgba = _mm_or_si128(rgba, _mm_cvtps_epi32(dstBlues) );

	152 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens), 8) );

	153 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstReds), 16) );

	154 #endif

	155 _mm_storeu_si128((__m128i*) dst, rgba);

	156

	157 dst += 4;

	158 src += 4;

	159 len -= 4;

	160 }

	161

	162 color_xform_2Dot2_RGBA_to_8888_portable(dst, src, len, matrix);

	163 }

	164

	165 #else

	166

	167 static void color_xform_2Dot2_RGBA_to_8888(uint32_t* dst, const uint32_t* src, int len,

	168 const float matrix[16]) {

	169 color_xform_2Dot2_RGBA_to_8888_portable(dst, src, len, matrix);

	170 }

	171

	172 #endif

	173

	174 }

	175

	176 #endif // SkColorXform_opts_DEFINED

OLD	NEW

« src/core/SkOpts.h ('K') | « src/core/SkOpts.cpp ('k') | src/opts/SkOpts_ssse3.cpp » ('j') | src/opts/SkOpts_ssse3.cpp » ('J')