src/opts/SkColorXform_opts.h - Issue 2046013002: Optimize color xforms with 2.2 gammas for SSE2

Side by Side Diff: src/opts/SkColorXform_opts.h

Issue 2046013002: Optimize color xforms with 2.2 gammas for SSE2 (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: Response to comments Created 4 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 /*

	2 * Copyright 2016 Google Inc.

	3 *

	4 * Use of this source code is governed by a BSD-style license that can be

	5 * found in the LICENSE file.

	6 */

	7

	8 #ifndef SkColorXform_opts_DEFINED

	9 #define SkColorXform_opts_DEFINED

	10

	11 #include "SkColorPriv.h"

	12

	13 namespace SK_OPTS_NS {

	14

	// Rounds a float that is already on the 0-255 scale to the nearest byte,
	// saturating at both ends: anything below 0.5 becomes 0 and anything at or
	// above 254.5 becomes 255. NaN is treated as "below range" and becomes 0.
	//
	// Review thread attached to this function:
	//   scroggo (2016/06/08 17:08:34): This method is slightly different than the
	//     other one (in SkColorSpaceXform), despite having the same name. The
	//     other one multiplies by 255 first, and compares against 255 instead of
	//     254.5. Are they intended to be different? If so, maybe give them
	//     different names that clarify the difference?
	//   msarett (2016/06/08 17:23:15): Done. I think the version with 254.5 and
	//     0.5 is "better", because we will be less likely to need to cast float
	//     to int. I'll update both versions to behave like that. The other
	//     difference is that here we assume that the float corresponds to 0-255
	//     (while the other assumes 0-1). I'm still figuring out whether we need
	//     to normalize to 0-1 or not. It may depend on how we decide to
	//     approximate the gamma curve. I've changed the name in one of the
	//     functions to reference "normalized floats". It's likely that both
	//     implementations will keep changing.
	static uint8_t clamp_float_to_byte(float v) {
	    // Deliberately a negated comparison: every comparison against NaN is
	    // false, so NaN ends up here instead of reaching the float-to-integer
	    // cast below, which is undefined for values that do not fit in the
	    // destination type.
	    if (!(v >= 0.5f)) {
	        return 0;
	    }
	    if (v >= 254.5f) {
	        return 255;
	    }
	    // 0.5 <= v < 254.5, so v + 0.5f truncates to a value in [1, 254].
	    return (uint8_t) (v + 0.5f);
	}

	24

	25 static void color_xform_2Dot2_RGBA_to_8888_portable(uint32_t* dst, const uint32_ t* src, int len,

	26 const float matrix[16]) {

	27 while (len-- > 0) {

	28 float srcFloats[3];

	29 srcFloats[0] = (float) ((*src >> 0) & 0xFF);

	30 srcFloats[1] = (float) ((*src >> 8) & 0xFF);

	31 srcFloats[2] = (float) ((*src >> 16) & 0xFF);

	32

	33 // Convert to linear.

	34 // TODO (msarett):

	35 // We should use X^2.2 here instead of X^2. What is the impact on corre ctness?

	36 // We should be able to get closer to 2.2 at a small performance cost.

	37 srcFloats[0] = srcFloats[0] * srcFloats[0];

	38 srcFloats[1] = srcFloats[1] * srcFloats[1];

	39 srcFloats[2] = srcFloats[2] * srcFloats[2];

	40

	41 // Convert to dst gamut.

	42 float dstFloats[3];

	43 // TODO (msarett): matrix[12], matrix[13], and matrix[14] are almost alw ays zero.

	44 // Should we have another optimized path that avoids the extra addition when they

	45 // are zero?

	46 dstFloats[0] = srcFloats[0] * matrix[0] + srcFloats[1] * matrix[4] +

	47 srcFloats[2] * matrix[8] + matrix[12];

	48 dstFloats[1] = srcFloats[0] * matrix[1] + srcFloats[1] * matrix[5] +

	49 srcFloats[2] * matrix[9] + matrix[13];

	50 dstFloats[2] = srcFloats[0] * matrix[2] + srcFloats[1] * matrix[6] +

	51 srcFloats[2] * matrix[10] + matrix[14];

	52

	53 // Convert to dst gamma.

	54 // TODO (msarett):

	55 // We should use X^(1/2.2) here instead of X^(1/2). What is the impact on correctness?

	56 // We should be able to get closer to (1/2.2) at a small performance cos t.

	57 dstFloats[0] = sqrtf(dstFloats[0]);

	58 dstFloats[1] = sqrtf(dstFloats[1]);

	59 dstFloats[2] = sqrtf(dstFloats[2]);

	60

	61 dst = SkPackARGB32NoCheck(((src >> 24) & 0xFF),

	62 clamp_float_to_byte(dstFloats[0]),

	63 clamp_float_to_byte(dstFloats[1]),

	64 clamp_float_to_byte(dstFloats[2]));

	65

	66 dst++;

	67 src++;

	68 }

	69 }

	70

	71 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2

	72

	73 static void color_xform_2Dot2_RGBA_to_8888(uint32_t* dst, const uint32_t* src, i nt len,

	74 const float matrix[16]) {

	75 // Load transformation matrix.

	76 __m128 rXgXbX = _mm_loadu_ps(&matrix[0]);

	77 __m128 rYgYbY = _mm_loadu_ps(&matrix[4]);

	78 __m128 rZgZbZ = _mm_loadu_ps(&matrix[8]);

	79 __m128 rQgQbQ = _mm_loadu_ps(&matrix[12]);

	80

	81 while (len >= 4) {

	82 // Load 4 pixels and convert them to floats.

	83 __m128i rgba = _mm_loadu_si128((const __m128i*) src);

	84 __m128i byteMask = _mm_set1_epi32(0xFF);

	85 __m128 reds = _mm_cvtepi32_ps(_mm_and_si128( rgba, byteMask));

	86 __m128 greens = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(rgba, 8), byteMask));

	87 __m128 blues = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(rgba, 16), byteMask));

	88

	89 // Convert to linear.

	90 // FIXME (msarett):

	91 // Should we be more accurate?

	92 reds = _mm_mul_ps(reds, reds);

	93 greens = _mm_mul_ps(greens, greens);

	94 blues = _mm_mul_ps(blues, blues);

	95

	96 // Apply the transformation matrix to dst gamut.

	97 // FIXME (msarett):

	98 // rQ, gQ, and bQ are almost always zero. Can we save a couple instruct ions?

	99

	100 // Splat rX, rY, rZ, and rQ each across a register.

	101 __m128 rX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x00);

	102 __m128 rY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x00);

	103 __m128 rZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x00);

	104 __m128 rQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0x00);

	105

	106 // dstReds = rX * reds + rY * greens + rZ * blues + rQ

	107 __m128 dstReds = _mm_mul_ps(reds, rX);

	108 dstReds = _mm_add_ps(dstReds, _mm_mul_ps(greens, rY));

	109 dstReds = _mm_add_ps(dstReds, _mm_mul_ps(blues, rZ));

	110 dstReds = _mm_add_ps(dstReds, rQ);

	111

	112 // Splat gX, gY, gZ, and gQ each across a register.

	113 __m128 gX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x55);

	114 __m128 gY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x55);

	115 __m128 gZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x55);

	116 __m128 gQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0x55);

	117

	118 // dstGreens = gX * reds + gY * greens + gZ * blues + gQ

	119 __m128 dstGreens = _mm_mul_ps(reds, gX);

	120 dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(greens, gY));

	121 dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(blues, gZ));

	122 dstGreens = _mm_add_ps(dstGreens, gQ);

	123

	124 // Splat bX, bY, bZ, and bQ each across a register.

	125 __m128 bX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0xAA);

	126 __m128 bY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0xAA);

	127 __m128 bZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0xAA);

	128 __m128 bQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0xAA);

	129

	130 // dstBlues = bX * reds + bY * greens + bZ * blues + bQ

	131 __m128 dstBlues = _mm_mul_ps(reds, bX);

	132 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(greens, bY));

	133 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(blues, bZ));

	134 dstBlues = _mm_add_ps(dstBlues, bQ);

	135

	136 // Convert to dst gamma.

	137 // Note that the reciprocal of the reciprocal sqrt, is just a fast sqrt.

	138 // FIXME (msarett):

	139 // Should we be more accurate?

	140 dstReds = _mm_rcp_ps(_mm_rsqrt_ps(dstReds));

	141 dstGreens = _mm_rcp_ps(_mm_rsqrt_ps(dstGreens));

	142 dstBlues = _mm_rcp_ps(_mm_rsqrt_ps(dstBlues));

	143

	144 // Clamp floats to 0-255 range.

	145 dstReds = _mm_min_ps(_mm_max_ps(dstReds, _mm_setzero_ps()), _mm_set1 _ps(255.0f));

	146 dstGreens = _mm_min_ps(_mm_max_ps(dstGreens, _mm_setzero_ps()), _mm_set1 _ps(255.0f));

	147 dstBlues = _mm_min_ps(_mm_max_ps(dstBlues, _mm_setzero_ps()), _mm_set1 _ps(255.0f));

	148

	149 // Convert to bytes and store to memory.

	150 rgba = _mm_and_si128(_mm_set1_epi32(0xFF000000), rgba);

	151 #ifdef SK_PMCOLOR_IS_RGBA

	152 rgba = _mm_or_si128(rgba, _mm_cvtps_epi32(dstReds) );

	153 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens), 8) );

	154 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstBlues), 16) );

	155 #else

	156 rgba = _mm_or_si128(rgba, _mm_cvtps_epi32(dstBlues) );

	157 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens), 8) );

	158 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstReds), 16) );

	159 #endif

	160 _mm_storeu_si128((__m128i*) dst, rgba);

	161

	162 dst += 4;

	163 src += 4;

	164 len -= 4;

	165 }

	166

	167 color_xform_2Dot2_RGBA_to_8888_portable(dst, src, len, matrix);

	168 }

	169

	170 #else

	171

	172 static void color_xform_2Dot2_RGBA_to_8888(uint32_t* dst, const uint32_t* src, i nt len,

	173 const float matrix[16]) {

	174 color_xform_2Dot2_RGBA_to_8888_portable(dst, src, len, matrix);

	175 }

	176

	177 #endif

	178

	179 }

	180

	181 #endif // SkColorXform_opts_DEFINED

OLD	NEW

« no previous file with comments | « src/core/SkOpts.cpp ('k') | no next file » | no next file with comments »