Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(105)

Side by Side Diff: src/opts/SkColorXform_opts.h

Issue 2046013002: Optimize color xforms with 2.2 gammas for SSE2 (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: Created 4 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 /*
2 * Copyright 2016 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8 #ifndef SkColorXform_opts_DEFINED
9 #define SkColorXform_opts_DEFINED
10
11 #include "SkColorPriv.h"
12
13 namespace SK_OPTS_NS {
14
15 static void color_xform_2Dot2_RGBA_to_8888_portable(uint32_t* dst, const uint32_ t* src, int len,
16 const float matrix[16]) {
17 while (len-- > 0) {
18 float srcFloats[3];
19 srcFloats[0] = (float) ((*src >> 0) & 0xFF);
20 srcFloats[1] = (float) ((*src >> 8) & 0xFF);
21 srcFloats[2] = (float) ((*src >> 16) & 0xFF);
22
23 // Convert to linear.
24 // TODO (msarett):
25 // We should use X^2.2 here instead of X^2. What is the impact on corre ctness?
26 // We should be able to get closer to 2.2 at a small performance cost.
27 srcFloats[0] = srcFloats[0] * srcFloats[0];
28 srcFloats[1] = srcFloats[1] * srcFloats[1];
29 srcFloats[2] = srcFloats[2] * srcFloats[2];
30
31 // Convert to dst gamut.
32 float dstFloats[3];
33 // TODO (msarett): matrix[12], matrix[13], and matrix[14] are almost alw ays zero.
34 // Should we have another optimized path that avoids the extra addition when they
35 // are zero?
36 dstFloats[0] = srcFloats[0] * matrix[0] + srcFloats[1] * matrix[4] +
37 srcFloats[2] * matrix[8] + matrix[12];
38 dstFloats[1] = srcFloats[0] * matrix[1] + srcFloats[1] * matrix[5] +
39 srcFloats[2] * matrix[9] + matrix[13];
40 dstFloats[2] = srcFloats[0] * matrix[2] + srcFloats[1] * matrix[6] +
41 srcFloats[2] * matrix[10] + matrix[14];
42
43 // Convert to dst gamma.
44 // TODO (msarett):
45 // We should use X^(1/2.2) here instead of X^(1/2). What is the impact on correctness?
46 // We should be able to get closer to (1/2.2) at a small performance cos t.
47 dstFloats[0] = sqrtf(dstFloats[0]);
48 dstFloats[1] = sqrtf(dstFloats[1]);
49 dstFloats[2] = sqrtf(dstFloats[2]);
50
51 // FIXME (msarett)
52 // We should clamp here, but I intentionally don't. Just curious to see what happens,
53 // since I think it's unnecessary for most of my current tests. It will definitely be
54 // necessary when converting wider gamuts to smaller gamuts. Should we always clamp,
55 // or is there a performance reason to clamp only when necessary? Also, clamping is not
mtklein 2016/06/08 01:41:40 I think clamping is always the right choice when c
msarett 2016/06/08 13:48:28 Yeah I think it's fine (and in many cases necessar
56 // the right choice for all rendering intents. Should we consider the r endering intent?
57 *dst = SkPackARGB32NoCheck(((*src >> 24) & 0xFF),
58 (uint8_t) (dstFloats[0] + 0.5f),
59 (uint8_t) (dstFloats[0] + 0.5f),
60 (uint8_t) (dstFloats[0] + 0.5f));
61
62 dst++;
63 src++;
64 }
65 }
66
67 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
68
69 static void color_xform_2Dot2_RGBA_to_8888(uint32_t* dst, const uint32_t* src, i nt len,
70 const float matrix[16]) {
71 // Create masks.
72 __m128i byteMask = _mm_set1_epi32(0xFF);
73 __m128i alphaMask = _mm_set1_epi32(0xFF000000);
74
75 // Load transformation matrix.
76 __m128 rXgXbX = _mm_loadu_ps(&matrix[0]);
77 __m128 rYgYbY = _mm_loadu_ps(&matrix[4]);
78 __m128 rZgZbZ = _mm_loadu_ps(&matrix[8]);
79 __m128 rQgQbQ = _mm_loadu_ps(&matrix[12]);
80
81 while (len >= 4) {
82 // Load 4 pixels and convert them to floats.
83 __m128i rgba = _mm_loadu_si128((const __m128i*) src);
84 __m128 reds = _mm_cvtepi32_ps(_mm_and_si128( rgba, byteMask));
85 __m128 greens = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(rgba, 8), byteMask));
86 __m128 blues = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(rgba, 16), byteMask));
87
88 // Convert to linear.
89 // FIXME (msarett):
90 // Should we be more accurate?
mtklein 2016/06/08 01:41:40 Luckily we've been working on just this problem!
msarett 2016/06/08 13:48:28 Very cool, sounds good. I plan to start by landin
91 reds = _mm_mul_ps(reds, reds);
92 greens = _mm_mul_ps(greens, greens);
93 blues = _mm_mul_ps(blues, blues);
94
95 // Apply the transformation matrix to dst gamut.
96 // FIXME (msarett):
97 // rQ, gQ, and bQ are almost always zero. Can we save a couple instruct ions?
mtklein 2016/06/08 01:41:40 Seems worthwhile, given you can do the check once
msarett 2016/06/08 13:48:28 This actually improves performance more than I sus
98
99 // Copy rX, rY, rZ, and rQ across their vector own vectors.
mtklein 2016/06/08 01:41:41 I think I get what you're saying here, but this do
msarett 2016/06/08 13:48:28 Yes that's better :).
100 __m128 rX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0);
mtklein 2016/06/08 01:41:41 I sort of like to write this sort of shuffle as 0x
msarett 2016/06/08 13:48:28 sgtm
101 __m128 rY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0);
102 __m128 rZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0);
103 __m128 rQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0);
104
105 // dstReds = rX * reds + rY * greens + rZ * blues + rQ
106 __m128 dstReds = _mm_mul_ps(reds, rX);
107 dstReds = _mm_add_ps(dstReds, _mm_mul_ps(greens, rY));
108 dstReds = _mm_add_ps(dstReds, _mm_mul_ps(blues, rZ));
109 dstReds = _mm_add_ps(dstReds, rQ);
110
111 // Copy gX, gY, gZ, and gQ across their vector own vectors.
112 __m128 gX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x55);
113 __m128 gY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x55);
114 __m128 gZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x55);
115 __m128 gQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0x55);
116
117 // dstGreens = gX * reds + gY * greens + gZ * blues + gQ
118 __m128 dstGreens = _mm_mul_ps(reds, gX);
mtklein 2016/06/08 01:41:41 This mul-add chain makes me think we should follow
msarett 2016/06/08 13:48:28 Yes! I had the same thought.
119 dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(greens, gY));
120 dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(blues, gZ));
121 dstGreens = _mm_add_ps(dstGreens, gQ);
122
123 // Copy bX, bY, bZ, and bQ across their vector own vectors.
124 __m128 bX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0xAA);
125 __m128 bY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0xAA);
126 __m128 bZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0xAA);
127 __m128 bQ = _mm_shuffle_ps(rQgQbQ, rQgQbQ, 0xAA);
128
129 // dstBlues = bX * reds + bY * greens + bZ * blues + bQ
130 __m128 dstBlues = _mm_mul_ps(reds, bX);
131 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(greens, bY));
132 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(blues, bZ));
133 dstBlues = _mm_add_ps(dstBlues, bQ);
134
135 // Convert to dst gamma.
136 // FIXME (msarett):
137 // Should we be more accurate?
138 dstReds = _mm_sqrt_ps(dstReds);
mtklein 2016/06/08 01:41:40 You might want to try _mm_rcp_ps(_mm_rsqrt_ps(...)
msarett 2016/06/08 13:48:28 Great! Drops about 10us.
139 dstGreens = _mm_sqrt_ps(dstGreens);
140 dstBlues = _mm_sqrt_ps(dstBlues);
141
142 // Convert to bytes and store to memory.
143 // FIXME (msarett):
144 // We need to clamp.
mtklein 2016/06/08 01:41:40 Definitely. Any overflow here will clobber neighb
msarett 2016/06/08 13:48:28 Done.
145 rgba = _mm_and_si128(alphaMask, rgba);
146 #ifdef SK_PMCOLOR_IS_RGBA
147 rgba = _mm_or_si128(rgba, _mm_cvtps_epi32(dstReds) );
148 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens), 8) );
149 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstBlues), 16) );
150 #else
151 rgba = _mm_or_si128(rgba, _mm_cvtps_epi32(dstBlues) );
152 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens), 8) );
153 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstReds), 16) );
154 #endif
155 _mm_storeu_si128((__m128i*) dst, rgba);
156
157 dst += 4;
158 src += 4;
159 len -= 4;
160 }
161
162 color_xform_2Dot2_RGBA_to_8888_portable(dst, src, len, matrix);
163 }
164
165 #else
166
167 static void color_xform_2Dot2_RGBA_to_8888(uint32_t* dst, const uint32_t* src, i nt len,
168 const float matrix[16]) {
169 color_xform_2Dot2_RGBA_to_8888_portable(dst, src, len, matrix);
170 }
171
172 #endif
173
174 }
175
176 #endif // SkColorXform_opts_DEFINED
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698