| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2016 Google Inc. | 2 * Copyright 2016 Google Inc. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #ifndef SkColorXform_opts_DEFINED | 8 #ifndef SkColorXform_opts_DEFINED |
| 9 #define SkColorXform_opts_DEFINED | 9 #define SkColorXform_opts_DEFINED |
| 10 | 10 |
| 11 #include "SkNx.h" |
| 11 #include "SkColorPriv.h" | 12 #include "SkColorPriv.h" |
| 12 | 13 |
| 13 namespace SK_OPTS_NS { | 14 namespace SK_OPTS_NS { |
| 14 | 15 |
| 15 extern const float linear_from_srgb[256] = { | 16 extern const float linear_from_srgb[256] = { |
| 16 0.000000000000000000f, 0.000303526983548838f, 0.000607053967097675f, 0.0
00910580950646513f, | 17 0.000000000000000000f, 0.000303526983548838f, 0.000607053967097675f, 0.0
00910580950646513f, |
| 17 0.001214107934195350f, 0.001517634917744190f, 0.001821161901293030f, 0.0
02124688884841860f, | 18 0.001214107934195350f, 0.001517634917744190f, 0.001821161901293030f, 0.0
02124688884841860f, |
| 18 0.002428215868390700f, 0.002731742851939540f, 0.003034518678424960f, 0.0
03346535763899160f, | 19 0.002428215868390700f, 0.002731742851939540f, 0.003034518678424960f, 0.0
03346535763899160f, |
| 19 0.003676507324047440f, 0.004024717018496310f, 0.004391442037410290f, 0.0
04776953480693730f, | 20 0.003676507324047440f, 0.004024717018496310f, 0.004391442037410290f, 0.0
04776953480693730f, |
| 20 0.005181516702338390f, 0.005605391624202720f, 0.006048833022857060f, 0.0
06512090792594470f, | 21 0.005181516702338390f, 0.005605391624202720f, 0.006048833022857060f, 0.0
06512090792594470f, |
| (...skipping 118 matching lines...) |
| 139 0.751895080583051000f, 0.759299550695091000f, 0.766743616862161000f, 0.7
74227314218442000f, | 140 0.751895080583051000f, 0.759299550695091000f, 0.766743616862161000f, 0.7
74227314218442000f, |
| 140 0.781750677773962000f, 0.789313742415586000f, 0.796916542907978000f, 0.8
04559113894567000f, | 141 0.781750677773962000f, 0.789313742415586000f, 0.796916542907978000f, 0.8
04559113894567000f, |
| 141 0.812241489898490000f, 0.819963705323528000f, 0.827725794455034000f, 0.8
35527791460841000f, | 142 0.812241489898490000f, 0.819963705323528000f, 0.827725794455034000f, 0.8
35527791460841000f, |
| 142 0.843369730392169000f, 0.851251645184515000f, 0.859173569658532000f, 0.8
67135537520905000f, | 143 0.843369730392169000f, 0.851251645184515000f, 0.859173569658532000f, 0.8
67135537520905000f, |
| 143 0.875137582365205000f, 0.883179737672745000f, 0.891262036813419000f, 0.8
99384513046529000f, | 144 0.875137582365205000f, 0.883179737672745000f, 0.891262036813419000f, 0.8
99384513046529000f, |
| 144 0.907547199521614000f, 0.915750129279253000f, 0.923993335251873000f, 0.9
32276850264543000f, | 145 0.907547199521614000f, 0.915750129279253000f, 0.923993335251873000f, 0.9
32276850264543000f, |
| 145 0.940600707035753000f, 0.948964938178195000f, 0.957369576199527000f, 0.9
65814653503130000f, | 146 0.940600707035753000f, 0.948964938178195000f, 0.957369576199527000f, 0.9
65814653503130000f, |
| 146 0.974300202388861000f, 0.982826255053791000f, 0.991392843592940000f, 1.0
00000000000000000f, | 147 0.974300202388861000f, 0.982826255053791000f, 0.991392843592940000f, 1.0
00000000000000000f, |
| 147 }; | 148 }; |
| 148 | 149 |
| 149 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 | 150 static Sk4f linear_to_2dot2(const Sk4f& x) { |
| 151 // x^(29/64) is a very good approximation of the true value, x^(1/2.2). |
| 152 auto x2 = x.rsqrt(), // x^(-1/2) |
| 153 x32 = x2.rsqrt().rsqrt().rsqrt().rsqrt(), // x^(-1/32) |
| 154 x64 = x32.rsqrt(); // x^(+1/64) |
| 150 | 155 |
| 151 // x^(29/64) is a very good approximation of the true value, x^(1/2.2). | 156 // 29 = 32 - 2 - 1 |
| 152 static __m128 linear_to_2dot2(__m128 x) { | 157 return 255.0f * x2.invert() * x32 * x64.invert(); |
| 153 // x^(-1/2) | |
| 154 __m128 x2 = _mm_rsqrt_ps(x); | |
| 155 | |
| 156 // x^(-1/32) | |
| 157 __m128 x32 = _mm_rsqrt_ps(_mm_rsqrt_ps(_mm_rsqrt_ps(_mm_rsqrt_ps(x2)))); | |
| 158 | |
| 159 // x^(+1/64) | |
| 160 __m128 x64 = _mm_rsqrt_ps(x32); | |
| 161 | |
| 162 // x^(+29/64) = x^(+1/2) * x^(-1/32) * x^(-1/64) | |
| 163 // Note that we also scale to the 0-255 range. | |
| 164 // These terms can be combined more minimally with 3 muls and 1 reciprocal. However, this | |
| 165 // is faster, because it allows us to start the muls in parallel with the rsqrts. | |
| 166 __m128 scale = _mm_set1_ps(255.0f); | |
| 167 return _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(scale, _mm_rcp_ps(x2)), x32), _mm_rc
p_ps(x64)); | |
| 168 } | 158 } |
| 169 | 159 |
| 170 static __m128 clamp_0_to_255(__m128 x) { | 160 static Sk4f clamp_0_to_255(const Sk4f& x) { |
| 171 // The order of the arguments is important here. We want to make sure that NaN | 161 // The order of the arguments is important here. We want to make sure that NaN |
| 172 // clamps to zero. Note that max(NaN, 0) = 0, while max(0, NaN) = NaN. | 162 // clamps to zero. Note that max(NaN, 0) = 0, while max(0, NaN) = NaN. |
| 173 return _mm_min_ps(_mm_max_ps(x, _mm_setzero_ps()), _mm_set1_ps(255.0f)); | 163 return Sk4f::Min(Sk4f::Max(x, 0.0f), 255.0f); |
| 174 } | 164 } |
| 175 | 165 |
| 176 template <const float (&linear_from_curve)[256]> | 166 template <const float (&linear_from_curve)[256]> |
| 177 static void color_xform_RGB1(uint32_t* dst, const uint32_t* src, int len, | 167 static void color_xform_RGB1(uint32_t* dst, const uint32_t* src, int len, |
| 178 const float matrix[16]) { | 168 const float matrix[16]) { |
| 179 // Load transformation matrix. | 169 // Load transformation matrix. |
| 180 __m128 rXgXbX = _mm_loadu_ps(&matrix[0]); | 170 auto rXgXbX = Sk4f::Load(matrix + 0), |
| 181 __m128 rYgYbY = _mm_loadu_ps(&matrix[4]); | 171 rYgYbY = Sk4f::Load(matrix + 4), |
| 182 __m128 rZgZbZ = _mm_loadu_ps(&matrix[8]); | 172 rZgZbZ = Sk4f::Load(matrix + 8); |
| 183 | 173 |
| 184 while (len >= 4) { | 174 while (len >= 4) { |
| 185 // Convert to linear. The look-up table has perfect accuracy. | 175 // Convert to linear. The look-up table has perfect accuracy. |
| 186 __m128 reds = _mm_setr_ps(linear_from_curve[(src[0] >> 0) & 0xFF], | 176 auto reds = Sk4f{linear_from_curve[(src[0] >> 0) & 0xFF], |
| 187 linear_from_curve[(src[1] >> 0) & 0xFF], | 177 linear_from_curve[(src[1] >> 0) & 0xFF], |
| 188 linear_from_curve[(src[2] >> 0) & 0xFF], | 178 linear_from_curve[(src[2] >> 0) & 0xFF], |
| 189 linear_from_curve[(src[3] >> 0) & 0xFF]); | 179 linear_from_curve[(src[3] >> 0) & 0xFF]}; |
| 190 __m128 greens = _mm_setr_ps(linear_from_curve[(src[0] >> 8) & 0xFF], | 180 auto greens = Sk4f{linear_from_curve[(src[0] >> 8) & 0xFF], |
| 191 linear_from_curve[(src[1] >> 8) & 0xFF], | 181 linear_from_curve[(src[1] >> 8) & 0xFF], |
| 192 linear_from_curve[(src[2] >> 8) & 0xFF], | 182 linear_from_curve[(src[2] >> 8) & 0xFF], |
| 193 linear_from_curve[(src[3] >> 8) & 0xFF]); | 183 linear_from_curve[(src[3] >> 8) & 0xFF]}; |
| 194 __m128 blues = _mm_setr_ps(linear_from_curve[(src[0] >> 16) & 0xFF], | 184 auto blues = Sk4f{linear_from_curve[(src[0] >> 16) & 0xFF], |
| 195 linear_from_curve[(src[1] >> 16) & 0xFF], | 185 linear_from_curve[(src[1] >> 16) & 0xFF], |
| 196 linear_from_curve[(src[2] >> 16) & 0xFF], | 186 linear_from_curve[(src[2] >> 16) & 0xFF], |
| 197 linear_from_curve[(src[3] >> 16) & 0xFF]); | 187 linear_from_curve[(src[3] >> 16) & 0xFF]}; |
| 198 | 188 |
| 199 // Apply the transformation matrix to dst gamut. | 189 // Apply the transformation matrix to dst gamut. |
| 200 // Splat rX, rY, and rZ each across a register. | 190 auto dstReds = rXgXbX[0]*reds + rYgYbY[0]*greens + rZgZbZ[0]*blues, |
| 201 __m128 rX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x00); | 191 dstGreens = rXgXbX[1]*reds + rYgYbY[1]*greens + rZgZbZ[1]*blues, |
| 202 __m128 rY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x00); | 192 dstBlues = rXgXbX[2]*reds + rYgYbY[2]*greens + rZgZbZ[2]*blues; |
| 203 __m128 rZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x00); | |
| 204 | |
| 205 // dstReds = rX * reds + rY * greens + rZ * blues | |
| 206 __m128 dstReds = _mm_mul_ps(reds, rX); | |
| 207 dstReds = _mm_add_ps(dstReds, _mm_mul_ps(greens, rY)); | |
| 208 dstReds = _mm_add_ps(dstReds, _mm_mul_ps(blues, rZ)); | |
| 209 | |
| 210 // Splat gX, gY, and gZ each across a register. | |
| 211 __m128 gX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0x55); | |
| 212 __m128 gY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0x55); | |
| 213 __m128 gZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0x55); | |
| 214 | |
| 215 // dstGreens = gX * reds + gY * greens + gZ * blues | |
| 216 __m128 dstGreens = _mm_mul_ps(reds, gX); | |
| 217 dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(greens, gY)); | |
| 218 dstGreens = _mm_add_ps(dstGreens, _mm_mul_ps(blues, gZ)); | |
| 219 | |
| 220 // Splat bX, bY, and bZ each across a register. | |
| 221 __m128 bX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0xAA); | |
| 222 __m128 bY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0xAA); | |
| 223 __m128 bZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0xAA); | |
| 224 | |
| 225 // dstBlues = bX * reds + bY * greens + bZ * blues | |
| 226 __m128 dstBlues = _mm_mul_ps(reds, bX); | |
| 227 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(greens, bY)); | |
| 228 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(blues, bZ)); | |
| 229 | 193 |
| 230 // Convert to dst gamma. | 194 // Convert to dst gamma. |
| 231 dstReds = linear_to_2dot2(dstReds); | 195 dstReds = linear_to_2dot2(dstReds); |
| 232 dstGreens = linear_to_2dot2(dstGreens); | 196 dstGreens = linear_to_2dot2(dstGreens); |
| 233 dstBlues = linear_to_2dot2(dstBlues); | 197 dstBlues = linear_to_2dot2(dstBlues); |
| 234 | 198 |
| 235 // Clamp floats. | 199 // Clamp floats to byte range. |
| 236 dstReds = clamp_0_to_255(dstReds); | 200 dstReds = clamp_0_to_255(dstReds); |
| 237 dstGreens = clamp_0_to_255(dstGreens); | 201 dstGreens = clamp_0_to_255(dstGreens); |
| 238 dstBlues = clamp_0_to_255(dstBlues); | 202 dstBlues = clamp_0_to_255(dstBlues); |
| 239 | 203 |
| 240 // Convert to bytes and store to memory. | 204 // Convert to bytes and store to memory. |
| 241 __m128i rgba = _mm_set1_epi32(0xFF000000); | 205 auto rgba = (Sk4i{(int)0xFF000000} ) |
| 242 rgba = _mm_or_si128(rgba, _mm_cvtps_epi32(dstReds)
); | 206 | (SkNx_cast<int>(dstReds) ) |
| 243 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens), 8)
); | 207 | (SkNx_cast<int>(dstGreens) << 8) |
| 244 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstBlues), 16)
); | 208 | (SkNx_cast<int>(dstBlues) << 16); |
| 245 _mm_storeu_si128((__m128i*) dst, rgba); | 209 rgba.store(dst); |
| 246 | 210 |
| 247 dst += 4; | 211 dst += 4; |
| 248 src += 4; | 212 src += 4; |
| 249 len -= 4; | 213 len -= 4; |
| 250 } | 214 } |
| 251 | 215 |
| 252 while (len > 0) { | 216 while (len > 0) { |
| 253 // Splat the red, green, and blue components. | 217 // Splat r,g,b across a register each. |
| 254 __m128 r = _mm_set1_ps(linear_from_curve[(src[0] >> 0) & 0xFF]), | 218 auto r = Sk4f{linear_from_curve[(*src >> 0) & 0xFF]}, |
| 255 g = _mm_set1_ps(linear_from_curve[(src[0] >> 8) & 0xFF]), | 219 g = Sk4f{linear_from_curve[(*src >> 8) & 0xFF]}, |
| 256 b = _mm_set1_ps(linear_from_curve[(src[0] >> 16) & 0xFF]); | 220 b = Sk4f{linear_from_curve[(*src >> 16) & 0xFF]}; |
| 257 | 221 |
| 258 // Apply the transformation matrix to dst gamut. | 222 // Apply transformation matrix to dst gamut. |
| 259 __m128 dstPixel = _mm_mul_ps(r, rXgXbX); | 223 auto dstPixel = rXgXbX*r + rYgYbY*g + rZgZbZ*b; |
| 260 dstPixel = _mm_add_ps(dstPixel, _mm_mul_ps(g, rYgYbY)); | |
| 261 dstPixel = _mm_add_ps(dstPixel, _mm_mul_ps(b, rZgZbZ)); | |
| 262 | 224 |
| 263 // Convert to dst gamma. | 225 // Convert to dst gamma. |
| 264 dstPixel = linear_to_2dot2(dstPixel); | 226 dstPixel = linear_to_2dot2(dstPixel); |
| 265 | 227 |
| 266 // Clamp floats to 0-255 range. | 228 // Clamp floats to byte range. |
| 267 dstPixel = clamp_0_to_255(dstPixel); | 229 dstPixel = clamp_0_to_255(dstPixel); |
| 268 | 230 |
| 269 // Convert to bytes and store to memory. | 231 // Convert to bytes and store to memory. |
| 270 __m128i dstInts = _mm_cvtps_epi32(dstPixel); | 232 uint32_t rgba; |
| 271 __m128i dstBytes = _mm_packus_epi16(_mm_packus_epi16(dstInts, dstInts),
dstInts); | 233 SkNx_cast<uint8_t>(dstPixel).store(&rgba); |
| 272 dstBytes = _mm_or_si128(_mm_set1_epi32(0xFF000000), dstBytes); | 234 rgba |= 0xFF000000; |
| 273 _mm_store_ss((float*) dst, _mm_castsi128_ps(dstBytes)); | 235 *dst = rgba; |
| 274 | 236 |
| 275 dst += 1; | 237 dst += 1; |
| 276 src += 1; | 238 src += 1; |
| 277 len -= 1; | 239 len -= 1; |
| 278 } | 240 } |
| 279 } | 241 } |
| 280 | 242 |
| 281 #else | |
| 282 | |
| 283 static uint8_t clamp_float_to_byte(float v) { | |
| 284 // The ordering of the logic is a little strange here in order | |
| 285 // to make sure we convert NaNs to 0. | |
| 286 if (v >= 254.5f) { | |
| 287 return 255; | |
| 288 } else if (v >= 0.5f) { | |
| 289 return (uint8_t) (v + 0.5f); | |
| 290 } else { | |
| 291 return 0; | |
| 292 } | |
| 293 } | |
| 294 | |
| 295 template <const float (&linear_from_curve)[256]> | |
| 296 static void color_xform_RGB1(uint32_t* dst, const uint32_t* src, int len, | |
| 297 const float matrix[16]) { | |
| 298 while (len-- > 0) { | |
| 299 // Convert to linear. | |
| 300 float srcFloats[3]; | |
| 301 srcFloats[0] = linear_from_curve[(*src >> 0) & 0xFF]; | |
| 302 srcFloats[1] = linear_from_curve[(*src >> 8) & 0xFF]; | |
| 303 srcFloats[2] = linear_from_curve[(*src >> 16) & 0xFF]; | |
| 304 | |
| 305 // Convert to dst gamut. | |
| 306 float dstFloats[3]; | |
| 307 dstFloats[0] = srcFloats[0] * matrix[0] + srcFloats[1] * matrix[4] + | |
| 308 srcFloats[2] * matrix[8]; | |
| 309 dstFloats[1] = srcFloats[0] * matrix[1] + srcFloats[1] * matrix[5] + | |
| 310 srcFloats[2] * matrix[9]; | |
| 311 dstFloats[2] = srcFloats[0] * matrix[2] + srcFloats[1] * matrix[6] + | |
| 312 srcFloats[2] * matrix[10]; | |
| 313 | |
| 314 // Convert to dst gamma. | |
| 315 // Note: pow is really, really slow. We will suffer when SSE2 is not supported. | |
| 316 dstFloats[0] = powf(dstFloats[0], (1/2.2f)) * 255.0f; | |
| 317 dstFloats[1] = powf(dstFloats[1], (1/2.2f)) * 255.0f; | |
| 318 dstFloats[2] = powf(dstFloats[2], (1/2.2f)) * 255.0f; | |
| 319 | |
| 320 *dst = (0xFF << 24) | | |
| 321 (clamp_float_to_byte(dstFloats[2]) << 16) | | |
| 322 (clamp_float_to_byte(dstFloats[1]) << 8) | | |
| 323 (clamp_float_to_byte(dstFloats[0]) << 0); | |
| 324 | |
| 325 dst++; | |
| 326 src++; | |
| 327 } | |
| 328 } | |
| 329 | |
| 330 #endif | |
| 331 | |
| 332 static void color_xform_RGB1_srgb_to_2dot2(uint32_t* dst, const uint32_t* src, i
nt len, | 243 static void color_xform_RGB1_srgb_to_2dot2(uint32_t* dst, const uint32_t* src, i
nt len, |
| 333 const float matrix[16]) { | 244 const float matrix[16]) { |
| 334 color_xform_RGB1<linear_from_srgb>(dst, src, len, matrix); | 245 color_xform_RGB1<linear_from_srgb>(dst, src, len, matrix); |
| 335 } | 246 } |
| 336 | 247 |
| 337 static void color_xform_RGB1_2dot2_to_2dot2(uint32_t* dst, const uint32_t* src,
int len, | 248 static void color_xform_RGB1_2dot2_to_2dot2(uint32_t* dst, const uint32_t* src,
int len, |
| 338 const float matrix[16]) { | 249 const float matrix[16]) { |
| 339 color_xform_RGB1<linear_from_2dot2>(dst, src, len, matrix); | 250 color_xform_RGB1<linear_from_2dot2>(dst, src, len, matrix); |
| 340 } | 251 } |
| 341 | 252 |
| 342 } | 253 } // namespace SK_OPTS_NS |
| 343 | 254 |
| 344 #endif // SkColorXform_opts_DEFINED | 255 #endif // SkColorXform_opts_DEFINED |
| OLD | NEW |