OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2016 Google Inc. | 2 * Copyright 2016 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #ifndef SkColorXform_opts_DEFINED | 8 #ifndef SkColorXform_opts_DEFINED |
9 #define SkColorXform_opts_DEFINED | 9 #define SkColorXform_opts_DEFINED |
10 | 10 |
| 11 #include "SkNx.h" |
11 #include "SkColorPriv.h" | 12 #include "SkColorPriv.h" |
12 | 13 |
13 namespace SK_OPTS_NS { | 14 namespace SK_OPTS_NS { |
14 | 15 |
15 extern const float linear_from_srgb[256] = { | 16 extern const float linear_from_srgb[256] = { |
16 0.000000000000000000f, 0.000303526983548838f, 0.000607053967097675f, 0.0
00910580950646513f, | 17 0.000000000000000000f, 0.000303526983548838f, 0.000607053967097675f, 0.0
00910580950646513f, |
17 0.001214107934195350f, 0.001517634917744190f, 0.001821161901293030f, 0.0
02124688884841860f, | 18 0.001214107934195350f, 0.001517634917744190f, 0.001821161901293030f, 0.0
02124688884841860f, |
18 0.002428215868390700f, 0.002731742851939540f, 0.003034518678424960f, 0.0
03346535763899160f, | 19 0.002428215868390700f, 0.002731742851939540f, 0.003034518678424960f, 0.0
03346535763899160f, |
19 0.003676507324047440f, 0.004024717018496310f, 0.004391442037410290f, 0.0
04776953480693730f, | 20 0.003676507324047440f, 0.004024717018496310f, 0.004391442037410290f, 0.0
04776953480693730f, |
20 0.005181516702338390f, 0.005605391624202720f, 0.006048833022857060f, 0.0
06512090792594470f, | 21 0.005181516702338390f, 0.005605391624202720f, 0.006048833022857060f, 0.0
06512090792594470f, |
(...skipping 118 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
139 0.751895080583051000f, 0.759299550695091000f, 0.766743616862161000f, 0.7
74227314218442000f, | 140 0.751895080583051000f, 0.759299550695091000f, 0.766743616862161000f, 0.7
74227314218442000f, |
140 0.781750677773962000f, 0.789313742415586000f, 0.796916542907978000f, 0.8
04559113894567000f, | 141 0.781750677773962000f, 0.789313742415586000f, 0.796916542907978000f, 0.8
04559113894567000f, |
141 0.812241489898490000f, 0.819963705323528000f, 0.827725794455034000f, 0.8
35527791460841000f, | 142 0.812241489898490000f, 0.819963705323528000f, 0.827725794455034000f, 0.8
35527791460841000f, |
142 0.843369730392169000f, 0.851251645184515000f, 0.859173569658532000f, 0.8
67135537520905000f, | 143 0.843369730392169000f, 0.851251645184515000f, 0.859173569658532000f, 0.8
67135537520905000f, |
143 0.875137582365205000f, 0.883179737672745000f, 0.891262036813419000f, 0.8
99384513046529000f, | 144 0.875137582365205000f, 0.883179737672745000f, 0.891262036813419000f, 0.8
99384513046529000f, |
144 0.907547199521614000f, 0.915750129279253000f, 0.923993335251873000f, 0.9
32276850264543000f, | 145 0.907547199521614000f, 0.915750129279253000f, 0.923993335251873000f, 0.9
32276850264543000f, |
145 0.940600707035753000f, 0.948964938178195000f, 0.957369576199527000f, 0.9
65814653503130000f, | 146 0.940600707035753000f, 0.948964938178195000f, 0.957369576199527000f, 0.9
65814653503130000f, |
146 0.974300202388861000f, 0.982826255053791000f, 0.991392843592940000f, 1.0
00000000000000000f, | 147 0.974300202388861000f, 0.982826255053791000f, 0.991392843592940000f, 1.0
00000000000000000f, |
147 }; | 148 }; |
148 | 149 |
149 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 | 150 static Sk4f linear_to_2dot2(const Sk4f& x) { |
| 151 // x^(29/64) is a very good approximation of the true value, x^(1/2.2). |
| 152 auto x2 = x.rsqrt(), // x^(-1/2) |
| 153 x32 = x2.rsqrt().rsqrt().rsqrt().rsqrt(), // x^(-1/32) |
| 154 x64 = x32.rsqrt(); // x^(+1/64) |
150 | 155 |
151 // x^(29/64) is a very good approximation of the true value, x^(1/2.2). | 156 // 29 = 32 - 2 - 1 |
152 static __m128 linear_to_2dot2(__m128 x) { | 157 return 255.0f * x2.invert() * x32 * x64.invert(); |
153 // x^(-1/2) | |
154 __m128 x2 = _mm_rsqrt_ps(x); | |
155 | |
156 // x^(-1/32) | |
157 __m128 x32 = _mm_rsqrt_ps(_mm_rsqrt_ps(_mm_rsqrt_ps(_mm_rsqrt_ps(x2)))); | |
158 | |
159 // x^(+1/64) | |
160 __m128 x64 = _mm_rsqrt_ps(x32); | |
161 | |
162 // x^(+29/64) = x^(+1/2) * x^(-1/32) * x^(-1/64) | |
163 // Note that we also scale to the 0-255 range. | |
164 // These terms can be combined more minimally with 3 muls and 1 reciprocal.
However, this | |
165 // is faster, because it allows us to start the muls in parallel with the rs
qrts. | |
166 __m128 scale = _mm_set1_ps(255.0f); | |
167 return _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(scale, _mm_rcp_ps(x2)), x32), _mm_rc
p_ps(x64)); | |
168 } | 158 } |
169 | 159 |
170 static __m128 clamp_0_to_255(__m128 x) { | 160 static Sk4f clamp_0_to_255(const Sk4f& x) { |
171 // The order of the arguments is important here. We want to make sure that
NaN | 161 // The order of the arguments is important here. We want to make sure that
NaN |
172 // clamps to zero. Note that max(NaN, 0) = 0, while max(0, NaN) = NaN. | 162 // clamps to zero. Note that max(NaN, 0) = 0, while max(0, NaN) = NaN. |
173 return _mm_min_ps(_mm_max_ps(x, _mm_setzero_ps()), _mm_set1_ps(255.0f)); | 163 return Sk4f::Min(Sk4f::Max(x, 0.0f), 255.0f); |
174 } | 164 } |
175 | 165 |
176 template <const float (&linear_from_curve)[256]> | 166 template <const float (&linear_from_curve)[256]> |
177 static void color_xform_RGB1(uint32_t* dst, const uint32_t* src, int len, | 167 static void color_xform_RGB1(uint32_t* dst, const uint32_t* src, int len, |
178 const float matrix[16]) { | 168 const float matrix[16]) { |
179 // Load transformation matrix. | 169 // Load transformation matrix. |
180 __m128 rXgXbX = _mm_loadu_ps(&matrix[0]); | 170 auto rXgXbX = Sk4f::Load(matrix + 0), |
181 __m128 rYgYbY = _mm_loadu_ps(&matrix[4]); | 171 rYgYbY = Sk4f::Load(matrix + 4), |
182 __m128 rZgZbZ = _mm_loadu_ps(&matrix[8]); | 172 rZgZbZ = Sk4f::Load(matrix + 8); |
183 | 173 |
184 while (len >= 4) { | 174 while (len >= 4) { |
185 // Convert to linear. The look-up table has perfect accuracy. | 175 // Convert to linear. The look-up table has perfect accuracy. |
186 __m128 reds = _mm_setr_ps(linear_from_curve[(src[0] >> 0) & 0xFF], | 176 auto reds = Sk4f{linear_from_curve[(src[0] >> 0) & 0xFF], |
187 linear_from_curve[(src[1] >> 0) & 0xFF], | 177 linear_from_curve[(src[1] >> 0) & 0xFF], |
188 linear_from_curve[(src[2] >> 0) & 0xFF], | 178 linear_from_curve[(src[2] >> 0) & 0xFF], |
189 linear_from_curve[(src[3] >> 0) & 0xFF]); | 179 linear_from_curve[(src[3] >> 0) & 0xFF]}; |
190 __m128 greens = _mm_setr_ps(linear_from_curve[(src[0] >> 8) & 0xFF], | 180 auto greens = Sk4f{linear_from_curve[(src[0] >> 8) & 0xFF], |
191 linear_from_curve[(src[1] >> 8) & 0xFF], | 181 linear_from_curve[(src[1] >> 8) & 0xFF], |
192 linear_from_curve[(src[2] >> 8) & 0xFF], | 182 linear_from_curve[(src[2] >> 8) & 0xFF], |
193 linear_from_curve[(src[3] >> 8) & 0xFF]); | 183 linear_from_curve[(src[3] >> 8) & 0xFF]}; |
194 __m128 blues = _mm_setr_ps(linear_from_curve[(src[0] >> 16) & 0xFF], | 184 auto blues = Sk4f{linear_from_curve[(src[0] >> 16) & 0xFF], |
195 linear_from_curve[(src[1] >> 16) & 0xFF], | 185 linear_from_curve[(src[1] >> 16) & 0xFF], |
196 linear_from_curve[(src[2] >> 16) & 0xFF], | 186 linear_from_curve[(src[2] >> 16) & 0xFF], |
197 linear_from_curve[(src[3] >> 16) & 0xFF]); | 187 linear_from_curve[(src[3] >> 16) & 0xFF]}; |
198 | 188 |
199 // Apply the transformation matrix to dst gamut. | 189 // Apply the transformation matrix to dst gamut. |
200 // Splat rX, rY, and rZ each across a register. | 190 auto dstReds = rXgXbX[0]*reds + rYgYbY[0]*greens + rZgZbZ[0]*blues, |
201 __m128 rX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x00); | 191 dstGreens = rXgXbX[1]*reds + rYgYbY[1]*greens + rZgZbZ[1]*blues, |
202 __m128 rY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x00); | 192 dstBlues = rXgXbX[2]*reds + rYgYbY[2]*greens + rZgZbZ[2]*blues; |
203 __m128 rZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x00); | |
204 | |
205 // dstReds = rX * reds + rY * greens + rZ * blues | |
206 __m128 dstReds = _mm_mul_ps(reds, rX); | |
207 dstReds = _mm_add_ps(dstReds, _mm_mul_ps(greens, rY)); | |
208 dstReds = _mm_add_ps(dstReds, _mm_mul_ps(blues, rZ)); | |
209 | |
210 // Splat gX, gY, and gZ each across a register. | |
211 __m128 gX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x55); | |
212 __m128 gY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x55); | |
213 __m128 gZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x55); | |
214 | |
215 // dstGreens = gX * reds + gY * greens + gZ * blues | |
216 __m128 dstGreens = _mm_mul_ps(reds, gX); | |
217 dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(greens, gY)); | |
218 dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(blues, gZ)); | |
219 | |
220 // Splat bX, bY, and bZ each across a register. | |
221 __m128 bX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0xAA); | |
222 __m128 bY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0xAA); | |
223 __m128 bZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0xAA); | |
224 | |
225 // dstBlues = bX * reds + bY * greens + bZ * blues | |
226 __m128 dstBlues = _mm_mul_ps(reds, bX); | |
227 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(greens, bY)); | |
228 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(blues, bZ)); | |
229 | 193 |
230 // Convert to dst gamma. | 194 // Convert to dst gamma. |
231 dstReds = linear_to_2dot2(dstReds); | 195 dstReds = linear_to_2dot2(dstReds); |
232 dstGreens = linear_to_2dot2(dstGreens); | 196 dstGreens = linear_to_2dot2(dstGreens); |
233 dstBlues = linear_to_2dot2(dstBlues); | 197 dstBlues = linear_to_2dot2(dstBlues); |
234 | 198 |
235 // Clamp floats. | 199 // Clamp floats to byte range. |
236 dstReds = clamp_0_to_255(dstReds); | 200 dstReds = clamp_0_to_255(dstReds); |
237 dstGreens = clamp_0_to_255(dstGreens); | 201 dstGreens = clamp_0_to_255(dstGreens); |
238 dstBlues = clamp_0_to_255(dstBlues); | 202 dstBlues = clamp_0_to_255(dstBlues); |
239 | 203 |
240 // Convert to bytes and store to memory. | 204 // Convert to bytes and store to memory. |
241 __m128i rgba = _mm_set1_epi32(0xFF000000); | 205 auto rgba = (Sk4i{(int)0xFF000000} ) |
242 rgba = _mm_or_si128(rgba, _mm_cvtps_epi32(dstReds)
); | 206 | (SkNx_cast<int>(dstReds) ) |
243 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens), 8)
); | 207 | (SkNx_cast<int>(dstGreens) << 8) |
244 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstBlues), 16)
); | 208 | (SkNx_cast<int>(dstBlues) << 16); |
245 _mm_storeu_si128((__m128i*) dst, rgba); | 209 rgba.store(dst); |
246 | 210 |
247 dst += 4; | 211 dst += 4; |
248 src += 4; | 212 src += 4; |
249 len -= 4; | 213 len -= 4; |
250 } | 214 } |
251 | 215 |
252 while (len > 0) { | 216 while (len > 0) { |
253 // Splat the red, green, and blue components. | 217 // Splat r,g,b across a register each. |
254 __m128 r = _mm_set1_ps(linear_from_curve[(src[0] >> 0) & 0xFF]), | 218 auto r = Sk4f{linear_from_curve[(*src >> 0) & 0xFF]}, |
255 g = _mm_set1_ps(linear_from_curve[(src[0] >> 8) & 0xFF]), | 219 g = Sk4f{linear_from_curve[(*src >> 8) & 0xFF]}, |
256 b = _mm_set1_ps(linear_from_curve[(src[0] >> 16) & 0xFF]); | 220 b = Sk4f{linear_from_curve[(*src >> 16) & 0xFF]}; |
257 | 221 |
258 // Apply the transformation matrix to dst gamut. | 222 // Apply transformation matrix to dst gamut. |
259 __m128 dstPixel = _mm_mul_ps(r, rXgXbX); | 223 auto dstPixel = rXgXbX*r + rYgYbY*g + rZgZbZ*b; |
260 dstPixel = _mm_add_ps(dstPixel, _mm_mul_ps(g, rYgYbY)); | |
261 dstPixel = _mm_add_ps(dstPixel, _mm_mul_ps(b, rZgZbZ)); | |
262 | 224 |
263 // Convert to dst gamma. | 225 // Convert to dst gamma. |
264 dstPixel = linear_to_2dot2(dstPixel); | 226 dstPixel = linear_to_2dot2(dstPixel); |
265 | 227 |
266 // Clamp floats to 0-255 range. | 228 // Clamp floats to byte range. |
267 dstPixel = clamp_0_to_255(dstPixel); | 229 dstPixel = clamp_0_to_255(dstPixel); |
268 | 230 |
269 // Convert to bytes and store to memory. | 231 // Convert to bytes and store to memory. |
270 __m128i dstInts = _mm_cvtps_epi32(dstPixel); | 232 uint32_t rgba; |
271 __m128i dstBytes = _mm_packus_epi16(_mm_packus_epi16(dstInts, dstInts),
dstInts); | 233 SkNx_cast<uint8_t>(dstPixel).store(&rgba); |
272 dstBytes = _mm_or_si128(_mm_set1_epi32(0xFF000000), dstBytes); | 234 rgba |= 0xFF000000; |
273 _mm_store_ss((float*) dst, _mm_castsi128_ps(dstBytes)); | 235 *dst = rgba; |
274 | 236 |
275 dst += 1; | 237 dst += 1; |
276 src += 1; | 238 src += 1; |
277 len -= 1; | 239 len -= 1; |
278 } | 240 } |
279 } | 241 } |
280 | 242 |
281 #else | |
282 | |
283 static uint8_t clamp_float_to_byte(float v) { | |
284 // The ordering of the logic is a little strange here in order | |
285 // to make sure we convert NaNs to 0. | |
286 if (v >= 254.5f) { | |
287 return 255; | |
288 } else if (v >= 0.5f) { | |
289 return (uint8_t) (v + 0.5f); | |
290 } else { | |
291 return 0; | |
292 } | |
293 } | |
294 | |
295 template <const float (&linear_from_curve)[256]> | |
296 static void color_xform_RGB1(uint32_t* dst, const uint32_t* src, int len, | |
297 const float matrix[16]) { | |
298 while (len-- > 0) { | |
299 // Convert to linear. | |
300 float srcFloats[3]; | |
301 srcFloats[0] = linear_from_curve[(*src >> 0) & 0xFF]; | |
302 srcFloats[1] = linear_from_curve[(*src >> 8) & 0xFF]; | |
303 srcFloats[2] = linear_from_curve[(*src >> 16) & 0xFF]; | |
304 | |
305 // Convert to dst gamut. | |
306 float dstFloats[3]; | |
307 dstFloats[0] = srcFloats[0] * matrix[0] + srcFloats[1] * matrix[4] + | |
308 srcFloats[2] * matrix[8]; | |
309 dstFloats[1] = srcFloats[0] * matrix[1] + srcFloats[1] * matrix[5] + | |
310 srcFloats[2] * matrix[9]; | |
311 dstFloats[2] = srcFloats[0] * matrix[2] + srcFloats[1] * matrix[6] + | |
312 srcFloats[2] * matrix[10]; | |
313 | |
314 // Convert to dst gamma. | |
315 // Note: pow is really, really slow. We will suffer when SSE2 is not su
pported. | |
316 dstFloats[0] = powf(dstFloats[0], (1/2.2f)) * 255.0f; | |
317 dstFloats[1] = powf(dstFloats[1], (1/2.2f)) * 255.0f; | |
318 dstFloats[2] = powf(dstFloats[2], (1/2.2f)) * 255.0f; | |
319 | |
320 *dst = (0xFF << 24) | | |
321 (clamp_float_to_byte(dstFloats[2]) << 16) | | |
322 (clamp_float_to_byte(dstFloats[1]) << 8) | | |
323 (clamp_float_to_byte(dstFloats[0]) << 0); | |
324 | |
325 dst++; | |
326 src++; | |
327 } | |
328 } | |
329 | |
330 #endif | |
331 | |
332 static void color_xform_RGB1_srgb_to_2dot2(uint32_t* dst, const uint32_t* src, i
nt len, | 243 static void color_xform_RGB1_srgb_to_2dot2(uint32_t* dst, const uint32_t* src, i
nt len, |
333 const float matrix[16]) { | 244 const float matrix[16]) { |
334 color_xform_RGB1<linear_from_srgb>(dst, src, len, matrix); | 245 color_xform_RGB1<linear_from_srgb>(dst, src, len, matrix); |
335 } | 246 } |
336 | 247 |
337 static void color_xform_RGB1_2dot2_to_2dot2(uint32_t* dst, const uint32_t* src,
int len, | 248 static void color_xform_RGB1_2dot2_to_2dot2(uint32_t* dst, const uint32_t* src,
int len, |
338 const float matrix[16]) { | 249 const float matrix[16]) { |
339 color_xform_RGB1<linear_from_2dot2>(dst, src, len, matrix); | 250 color_xform_RGB1<linear_from_2dot2>(dst, src, len, matrix); |
340 } | 251 } |
341 | 252 |
342 } | 253 } // namespace SK_OPTS_NS |
343 | 254 |
344 #endif // SkColorXform_opts_DEFINED | 255 #endif // SkColorXform_opts_DEFINED |
OLD | NEW |