src/opts/SkColorXform_opts.h - Issue 2046013002: Optimize color xforms with 2.2 gammas for SSE2

Side by Side Diff: src/opts/SkColorXform_opts.h

Issue 2046013002: Optimize color xforms with 2.2 gammas for SSE2 (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: Response to comments Created 4 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 /*

	2 * Copyright 2016 Google Inc.

	3 *

	4 * Use of this source code is governed by a BSD-style license that can be

	5 * found in the LICENSE file.

	6 */

	7

	8 #ifndef SkColorXform_opts_DEFINED

	9 #define SkColorXform_opts_DEFINED

	10

	11 #include "SkColorPriv.h"

	12

	13 namespace SK_OPTS_NS {

	14

	// Rounds |v| to the nearest integer and saturates the result to the range [0, 255].
	//
	// Anything below 0.5 maps to 0 and anything at or above 254.5 maps to 255.  NaN also
	// maps to 0: converting NaN to an integer type is undefined behavior, and callers can
	// produce NaN (for example by taking the square root of a negative value).
	static uint8_t clamp_float_to_byte(float v) {
	    // Deliberately written as a negated comparison: every ordered comparison with
	    // NaN is false, so !(v >= 0.5f) is true for NaN as well as for small values.
	    if (!(v >= 0.5f)) {
	        return 0;
	    }
	    if (v >= 254.5f) {
	        return 255;
	    }
	    // Here 0.5 <= v < 254.5, so v + 0.5 lies in [1, 255) and the cast is well defined.
	    return (uint8_t) (v + 0.5f);
	}

	24

	25 static void color_xform_2Dot2_RGBA_to_8888_portable(uint32_t* dst, const uint32_ t* src, int len,

	26 const float matrix[16]) {

	27 while (len-- > 0) {

	28 float srcFloats[3];

	29 srcFloats[0] = (float) ((*src >> 0) & 0xFF);

	30 srcFloats[1] = (float) ((*src >> 8) & 0xFF);

	31 srcFloats[2] = (float) ((*src >> 16) & 0xFF);

	32

	33 // Convert to linear.

	34 // TODO (msarett):

	35 // We should use X^2.2 here instead of X^2. What is the impact on corre ctness?

	36 // We should be able to get closer to 2.2 at a small performance cost.

	37 srcFloats[0] = srcFloats[0] * srcFloats[0];

	38 srcFloats[1] = srcFloats[1] * srcFloats[1];

	39 srcFloats[2] = srcFloats[2] * srcFloats[2];

	40

	41 // Convert to dst gamut.

	42 float dstFloats[3];

	43 // TODO (msarett): matrix[12], matrix[13], and matrix[14] are almost alw ays zero.

	44 // Should we have another optimized path that avoids the extra addition when they

	45 // are zero?

	46 dstFloats[0] = srcFloats[0] * matrix[0] + srcFloats[1] * matrix[4] +

	47 srcFloats[2] * matrix[8] + matrix[12];

	48 dstFloats[1] = srcFloats[0] * matrix[1] + srcFloats[1] * matrix[5] +

	49 srcFloats[2] * matrix[9] + matrix[13];

	50 dstFloats[2] = srcFloats[0] * matrix[2] + srcFloats[1] * matrix[6] +

	51 srcFloats[2] * matrix[10] + matrix[14];

	52

	53 // Convert to dst gamma.

	54 // TODO (msarett):

	55 // We should use X^(1/2.2) here instead of X^(1/2). What is the impact on correctness?

	56 // We should be able to get closer to (1/2.2) at a small performance cos t.

	57 dstFloats[0] = sqrtf(dstFloats[0]);

	58 dstFloats[1] = sqrtf(dstFloats[1]);

	59 dstFloats[2] = sqrtf(dstFloats[2]);

	60

	61 dst = SkPackARGB32NoCheck(((src >> 24) & 0xFF),

	62 clamp_float_to_byte(dstFloats[0]),

	63 clamp_float_to_byte(dstFloats[1]),

	64 clamp_float_to_byte(dstFloats[2]));

	65

	66 dst++;

	67 src++;

	68 }

	69 }

	70

	71 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2

	72

	73 static void color_xform_2Dot2_RGBA_to_8888(uint32_t* dst, const uint32_t* src, i nt len,

	74 const float matrix[16]) {

	75 // Load transformation matrix.

	76 __m128 rXgXbX = _mm_loadu_ps(&matrix[0]);

	77 __m128 rYgYbY = _mm_loadu_ps(&matrix[4]);

	78 __m128 rZgZbZ = _mm_loadu_ps(&matrix[8]);

	79 __m128 rQgQbQ = _mm_loadu_ps(&matrix[12]);

	80

	81 while (len >= 4) {

	82 // Load 4 pixels and convert them to floats.

	83 __m128i rgba = _mm_loadu_si128((const __m128i*) src);

	84 __m128i byteMask = _mm_set1_epi32(0xFF);

	85 __m128 reds = _mm_cvtepi32_ps(_mm_and_si128( rgba, byteMask));

	86 __m128 greens = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(rgba, 8), byteMask));

	87 __m128 blues = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(rgba, 16), byteMask));

	88

	89 // Convert to linear.

	90 // FIXME (msarett):

	91 // Should we be more accurate?

	92 reds = _mm_mul_ps(reds, reds);

	93 greens = _mm_mul_ps(greens, greens);

	94 blues = _mm_mul_ps(blues, blues);

	95

	96 // Apply the transformation matrix to dst gamut.

	97 // FIXME (msarett):

	98 // rQ, gQ, and bQ are almost always zero. Can we save a couple instruct ions?

	99

	100 // Splat rX, rY, rZ, and rQ each across a register.

	101 __m128 rX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x00);

	102 __m128 rY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x00);

	103 __m128 rZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x00);

	104 __m128 rQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0x00);

	105

	106 // dstReds = rX * reds + rY * greens + rZ * blues + rQ

	107 __m128 dstReds = _mm_mul_ps(reds, rX);

	108 dstReds = _mm_add_ps(dstReds, _mm_mul_ps(greens, rY));

	109 dstReds = _mm_add_ps(dstReds, _mm_mul_ps(blues, rZ));

	110 dstReds = _mm_add_ps(dstReds, rQ);

	111

	112 // Splat gX, gY, gZ, and gQ each across a register.

	113 __m128 gX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x55);

	114 __m128 gY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x55);

	115 __m128 gZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x55);

	116 __m128 gQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0x55);

	117

	118 // dstGreens = gX * reds + gY * greens + gZ * blues + gQ

	119 __m128 dstGreens = _mm_mul_ps(reds, gX);

	120 dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(greens, gY));

	121 dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(blues, gZ));

	122 dstGreens = _mm_add_ps(dstGreens, gQ);

	123

	124 // Splat bX, bY, bZ, and bQ each across a register.

	125 __m128 bX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0xAA);

	126 __m128 bY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0xAA);

	127 __m128 bZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0xAA);

	128 __m128 bQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0xAA);

	129

	130 // dstBlues = bX * reds + bY * greens + bZ * blues + bQ

	131 __m128 dstBlues = _mm_mul_ps(reds, bX);

	132 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(greens, bY));

	133 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(blues, bZ));

	134 dstBlues = _mm_add_ps(dstBlues, bQ);

	135

	136 // Convert to dst gamma.

	137 // Note that the reciprocal of the reciprocal sqrt, is just a fast sqrt.

	138 // FIXME (msarett):

	139 // Should we be more accurate?

	140 dstReds = _mm_rcp_ps(_mm_rsqrt_ps(dstReds));

	141 dstGreens = _mm_rcp_ps(_mm_rsqrt_ps(dstGreens));

	142 dstBlues = _mm_rcp_ps(_mm_rsqrt_ps(dstBlues));

	143

	144 // Clamp floats to 0-255 range.

	145 dstReds = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(dstReds, _mm_set1_ ps(255.0f)));

	146 dstGreens = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(dstGreens, _mm_set1_ ps(255.0f)));

	147 dstBlues = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(dstBlues, _mm_set1_ ps(255.0f)));

	148

	149 // Convert to bytes and store to memory.

	150 rgba = _mm_and_si128(_mm_set1_epi32(0xFF000000), rgba);

	151 #ifdef SK_PMCOLOR_IS_RGBA

	152 rgba = _mm_or_si128(rgba, _mm_cvtps_epi32(dstReds) );

	153 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens), 8) );

	154 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstBlues), 16) );

	155 #else

	156 rgba = _mm_or_si128(rgba, _mm_cvtps_epi32(dstBlues) );

	157 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens), 8) );

	158 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstReds), 16) );

	159 #endif

	160 _mm_storeu_si128((__m128i*) dst, rgba);

	161

	162 dst += 4;

	163 src += 4;

	164 len -= 4;

	165 }

	166

	167 color_xform_2Dot2_RGBA_to_8888_portable(dst, src, len, matrix);

	168 }

	169

	170 #else

	171

	172 static void color_xform_2Dot2_RGBA_to_8888(uint32_t* dst, const uint32_t* src, i nt len,

	173 const float matrix[16]) {

	174 color_xform_2Dot2_RGBA_to_8888_portable(dst, src, len, matrix);

	175 }

	176

	177 #endif

	178

	179 }

	180

	181 #endif // SkColorXform_opts_DEFINED

OLD	NEW

« no previous file with comments | « src/core/SkOpts.cpp ('k') | no next file » | no next file with comments »