Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1)

Side by Side Diff: src/opts/SkColorXform_opts.h

Issue 2078623002: Support sRGB dsts in opt code (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: Created 4 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/core/SkOpts.cpp ('k') | src/opts/SkOpts_sse41.cpp » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2016 Google Inc. 2 * Copyright 2016 Google Inc.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 #ifndef SkColorXform_opts_DEFINED 8 #ifndef SkColorXform_opts_DEFINED
9 #define SkColorXform_opts_DEFINED 9 #define SkColorXform_opts_DEFINED
10 10
(...skipping 149 matching lines...) Expand 10 before | Expand all | Expand 10 after
160 __m128 x64 = _mm_rsqrt_ps(x32); 160 __m128 x64 = _mm_rsqrt_ps(x32);
161 161
162 // x^(+29/64) = x^(+1/2) * x^(-1/32) * x^(-1/64) 162 // x^(+29/64) = x^(+1/2) * x^(-1/32) * x^(-1/64)
163 // Note that we also scale to the 0-255 range. 163 // Note that we also scale to the 0-255 range.
164 // These terms can be combined more minimally with 3 muls and 1 reciprocal. However, this 164 // These terms can be combined more minimally with 3 muls and 1 reciprocal. However, this
165 // is faster, because it allows us to start the muls in parallel with the rsqrts. 165 // is faster, because it allows us to start the muls in parallel with the rsqrts.
166 __m128 scale = _mm_set1_ps(255.0f); 166 __m128 scale = _mm_set1_ps(255.0f);
167 return _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(scale, _mm_rcp_ps(x2)), x32), _mm_rcp_ps(x64)); 167 return _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(scale, _mm_rcp_ps(x2)), x32), _mm_rcp_ps(x64));
168 } 168 }
169 169
170 static __m128 if_then_else(__m128 mask, __m128 a, __m128 b) {
171 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
172 return _mm_blendv_ps(b, a, mask);
173 #else
174 return _mm_or_ps(_mm_and_ps(a, mask), _mm_andnot_ps(b, mask));
mtklein 2016/06/17 17:19:40 and(mask, a) or andnot(mask, b)
msarett 2016/06/17 20:10:11 Let's just use Sk4f haha :).
175 #endif
176 }
177
178 // Below is a good approximation of the sRGB gamma curve (within 1 when scaled to 8-bit pixels).
179 // For 0.00000f <= x < 0.00349f, 12.92 * x
180 // For 0.00349f <= x <= 1.00000f, 0.679*(x.^0.5) + 0.423*x.^(0.25) - 0.101
181 // Note that the intersection was selected to be a point where both functions produce the
182 // same pixel value when rounded.
183 static __m128 linear_to_srgb(__m128 x) {
184 __m128 rsqrt = _mm_rsqrt_ps(x);
185 __m128 sqrt = _mm_rcp_ps(rsqrt);
186 __m128 ftrt = _mm_rsqrt_ps(rsqrt);
187
188 __m128 hi = _mm_add_ps(_mm_add_ps( _mm_set1_ps(-0.101115084998961f * 255.0f),
189 _mm_mul_ps(sqrt, _mm_set1_ps(+0.678513029959381f * 255.0f))),
190 _mm_mul_ps(ftrt, _mm_set1_ps(+0.422602055039580f * 255.0f)));
191
192 __m128 lo = _mm_mul_ps(x, _mm_set1_ps(12.92f * 255.0f));
193
194 __m128 mask = _mm_cmplt_ps(x, _mm_set1_ps(0.00349f));
195 return if_then_else(mask, lo, hi);
196 }
197
170 static __m128 clamp_0_to_255(__m128 x) { 198 static __m128 clamp_0_to_255(__m128 x) {
171 // The order of the arguments is important here. We want to make sure that NaN 199 // The order of the arguments is important here. We want to make sure that NaN
172 // clamps to zero. Note that max(NaN, 0) = 0, while max(0, NaN) = NaN. 200 // clamps to zero. Note that max(NaN, 0) = 0, while max(0, NaN) = NaN.
173 return _mm_min_ps(_mm_max_ps(x, _mm_setzero_ps()), _mm_set1_ps(255.0f)); 201 return _mm_min_ps(_mm_max_ps(x, _mm_setzero_ps()), _mm_set1_ps(255.0f));
174 } 202 }
175 203
176 template <const float (&linear_from_curve)[256]> 204 template <const float (&linear_from_curve)[256], __m128 (*linear_to_curve)(__m128)>
177 static void color_xform_RGB1(uint32_t* dst, const uint32_t* src, int len, 205 static void color_xform_RGB1(uint32_t* dst, const uint32_t* src, int len,
178 const float matrix[16]) { 206 const float matrix[16]) {
179 // Load transformation matrix. 207 // Load transformation matrix.
180 __m128 rXgXbX = _mm_loadu_ps(&matrix[0]); 208 __m128 rXgXbX = _mm_loadu_ps(&matrix[0]);
181 __m128 rYgYbY = _mm_loadu_ps(&matrix[4]); 209 __m128 rYgYbY = _mm_loadu_ps(&matrix[4]);
182 __m128 rZgZbZ = _mm_loadu_ps(&matrix[8]); 210 __m128 rZgZbZ = _mm_loadu_ps(&matrix[8]);
183 211
184 while (len >= 4) { 212 while (len >= 4) {
185 // Convert to linear. The look-up table has perfect accuracy. 213 // Convert to linear. The look-up table has perfect accuracy.
186 __m128 reds = _mm_setr_ps(linear_from_curve[(src[0] >> 0) & 0xFF], 214 __m128 reds = _mm_setr_ps(linear_from_curve[(src[0] >> 0) & 0xFF],
(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after
221 __m128 bX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0xAA); 249 __m128 bX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0xAA);
222 __m128 bY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0xAA); 250 __m128 bY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0xAA);
223 __m128 bZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0xAA); 251 __m128 bZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0xAA);
224 252
225 // dstBlues = bX * reds + bY * greens + bZ * blues 253 // dstBlues = bX * reds + bY * greens + bZ * blues
226 __m128 dstBlues = _mm_mul_ps(reds, bX); 254 __m128 dstBlues = _mm_mul_ps(reds, bX);
227 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(greens, bY)); 255 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(greens, bY));
228 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(blues, bZ)); 256 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(blues, bZ));
229 257
230 // Convert to dst gamma. 258 // Convert to dst gamma.
231 dstReds = linear_to_2dot2(dstReds); 259 dstReds = linear_to_curve(dstReds);
232 dstGreens = linear_to_2dot2(dstGreens); 260 dstGreens = linear_to_curve(dstGreens);
233 dstBlues = linear_to_2dot2(dstBlues); 261 dstBlues = linear_to_curve(dstBlues);
234 262
235 // Clamp floats. 263 // Clamp floats.
236 dstReds = clamp_0_to_255(dstReds); 264 dstReds = clamp_0_to_255(dstReds);
237 dstGreens = clamp_0_to_255(dstGreens); 265 dstGreens = clamp_0_to_255(dstGreens);
238 dstBlues = clamp_0_to_255(dstBlues); 266 dstBlues = clamp_0_to_255(dstBlues);
239 267
240 // Convert to bytes and store to memory. 268 // Convert to bytes and store to memory.
241 __m128i rgba = _mm_set1_epi32(0xFF000000); 269 __m128i rgba = _mm_set1_epi32(0xFF000000);
242 rgba = _mm_or_si128(rgba, _mm_cvtps_epi32(dstReds) ); 270 rgba = _mm_or_si128(rgba, _mm_cvtps_epi32(dstReds) );
243 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens), 8) ); 271 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens), 8) );
(...skipping 10 matching lines...) Expand all
254 __m128 r = _mm_set1_ps(linear_from_curve[(src[0] >> 0) & 0xFF]), 282 __m128 r = _mm_set1_ps(linear_from_curve[(src[0] >> 0) & 0xFF]),
255 g = _mm_set1_ps(linear_from_curve[(src[0] >> 8) & 0xFF]), 283 g = _mm_set1_ps(linear_from_curve[(src[0] >> 8) & 0xFF]),
256 b = _mm_set1_ps(linear_from_curve[(src[0] >> 16) & 0xFF]); 284 b = _mm_set1_ps(linear_from_curve[(src[0] >> 16) & 0xFF]);
257 285
258 // Apply the transformation matrix to dst gamut. 286 // Apply the transformation matrix to dst gamut.
259 __m128 dstPixel = _mm_mul_ps(r, rXgXbX); 287 __m128 dstPixel = _mm_mul_ps(r, rXgXbX);
260 dstPixel = _mm_add_ps(dstPixel, _mm_mul_ps(g, rYgYbY)); 288 dstPixel = _mm_add_ps(dstPixel, _mm_mul_ps(g, rYgYbY));
261 dstPixel = _mm_add_ps(dstPixel, _mm_mul_ps(b, rZgZbZ)); 289 dstPixel = _mm_add_ps(dstPixel, _mm_mul_ps(b, rZgZbZ));
262 290
263 // Convert to dst gamma. 291 // Convert to dst gamma.
264 dstPixel = linear_to_2dot2(dstPixel); 292 dstPixel = linear_to_curve(dstPixel);
265 293
266 // Clamp floats to 0-255 range. 294 // Clamp floats to 0-255 range.
267 dstPixel = clamp_0_to_255(dstPixel); 295 dstPixel = clamp_0_to_255(dstPixel);
268 296
269 // Convert to bytes and store to memory. 297 // Convert to bytes and store to memory.
270 __m128i dstInts = _mm_cvtps_epi32(dstPixel); 298 __m128i dstInts = _mm_cvtps_epi32(dstPixel);
271 __m128i dstBytes = _mm_packus_epi16(_mm_packus_epi16(dstInts, dstInts), dstInts); 299 __m128i dstBytes = _mm_packus_epi16(_mm_packus_epi16(dstInts, dstInts), dstInts);
272 dstBytes = _mm_or_si128(_mm_set1_epi32(0xFF000000), dstBytes); 300 dstBytes = _mm_or_si128(_mm_set1_epi32(0xFF000000), dstBytes);
273 _mm_store_ss((float*) dst, _mm_castsi128_ps(dstBytes)); 301 _mm_store_ss((float*) dst, _mm_castsi128_ps(dstBytes));
274 302
275 dst += 1; 303 dst += 1;
276 src += 1; 304 src += 1;
277 len -= 1; 305 len -= 1;
278 } 306 }
279 } 307 }
280 308
281 #else 309 #else
282 310
311 static float linear_to_2dot2(float v) {
312 return powf(v, (1.0f / 2.2f)) * 255.0f;
313 }
314
315 static void linear_to_srgb(float v) {
316 if (v < 0031308f) {
317 return (12.92f * v) * 255.0f;
318 } else {
319 return ((1.055f * powf(v, (1.0f / 2.4f))) - 0.055f) * 255.0f;
320 }
321 }
322
283 static uint8_t clamp_float_to_byte(float v) { 323 static uint8_t clamp_float_to_byte(float v) {
284 // The ordering of the logic is a little strange here in order 324 // The ordering of the logic is a little strange here in order
285 // to make sure we convert NaNs to 0. 325 // to make sure we convert NaNs to 0.
286 if (v >= 254.5f) { 326 if (v >= 254.5f) {
287 return 255; 327 return 255;
288 } else if (v >= 0.5f) { 328 } else if (v >= 0.5f) {
289 return (uint8_t) (v + 0.5f); 329 return (uint8_t) (v + 0.5f);
290 } else { 330 } else {
291 return 0; 331 return 0;
292 } 332 }
293 } 333 }
294 334
295 template <const float (&linear_from_curve)[256]> 335 template <const float (&linear_from_curve)[256], float(*linear_to_curve)(float)>
296 static void color_xform_RGB1(uint32_t* dst, const uint32_t* src, int len, 336 static void color_xform_RGB1(uint32_t* dst, const uint32_t* src, int len,
297 const float matrix[16]) { 337 const float matrix[16]) {
298 while (len-- > 0) { 338 while (len-- > 0) {
299 // Convert to linear. 339 // Convert to linear.
300 float srcFloats[3]; 340 float srcFloats[3];
301 srcFloats[0] = linear_from_curve[(*src >> 0) & 0xFF]; 341 srcFloats[0] = linear_from_curve[(*src >> 0) & 0xFF];
302 srcFloats[1] = linear_from_curve[(*src >> 8) & 0xFF]; 342 srcFloats[1] = linear_from_curve[(*src >> 8) & 0xFF];
303 srcFloats[2] = linear_from_curve[(*src >> 16) & 0xFF]; 343 srcFloats[2] = linear_from_curve[(*src >> 16) & 0xFF];
304 344
305 // Convert to dst gamut. 345 // Convert to dst gamut.
306 float dstFloats[3]; 346 float dstFloats[3];
307 dstFloats[0] = srcFloats[0] * matrix[0] + srcFloats[1] * matrix[4] + 347 dstFloats[0] = srcFloats[0] * matrix[0] + srcFloats[1] * matrix[4] +
308 srcFloats[2] * matrix[8]; 348 srcFloats[2] * matrix[8];
309 dstFloats[1] = srcFloats[0] * matrix[1] + srcFloats[1] * matrix[5] + 349 dstFloats[1] = srcFloats[0] * matrix[1] + srcFloats[1] * matrix[5] +
310 srcFloats[2] * matrix[9]; 350 srcFloats[2] * matrix[9];
311 dstFloats[2] = srcFloats[0] * matrix[2] + srcFloats[1] * matrix[6] + 351 dstFloats[2] = srcFloats[0] * matrix[2] + srcFloats[1] * matrix[6] +
312 srcFloats[2] * matrix[10]; 352 srcFloats[2] * matrix[10];
313 353
314 // Convert to dst gamma. 354 // Convert to dst gamma.
315 // Note: pow is really, really slow. We will suffer when SSE2 is not supported. 355 // Note: pow is really, really slow. We will suffer when SSE2 is not supported.
316 dstFloats[0] = powf(dstFloats[0], (1/2.2f)) * 255.0f; 356 dstFloats[0] = linear_to_curve(dstFloats[0]);
317 dstFloats[1] = powf(dstFloats[1], (1/2.2f)) * 255.0f; 357 dstFloats[1] = linear_to_curve(dstFloats[1]);
318 dstFloats[2] = powf(dstFloats[2], (1/2.2f)) * 255.0f; 358 dstFloats[2] = linear_to_curve(dstFloats[2]);
319 359
320 *dst = (0xFF << 24) | 360 *dst = (0xFF << 24) |
321 (clamp_float_to_byte(dstFloats[2]) << 16) | 361 (clamp_float_to_byte(dstFloats[2]) << 16) |
322 (clamp_float_to_byte(dstFloats[1]) << 8) | 362 (clamp_float_to_byte(dstFloats[1]) << 8) |
323 (clamp_float_to_byte(dstFloats[0]) << 0); 363 (clamp_float_to_byte(dstFloats[0]) << 0);
324 364
325 dst++; 365 dst++;
326 src++; 366 src++;
327 } 367 }
328 } 368 }
329 369
330 #endif 370 #endif
331 371
332 static void color_xform_RGB1_srgb_to_2dot2(uint32_t* dst, const uint32_t* src, int len, 372 static void color_xform_RGB1_srgb_to_2dot2(uint32_t* dst, const uint32_t* src, int len,
333 const float matrix[16]) { 373 const float matrix[16]) {
334 color_xform_RGB1<linear_from_srgb>(dst, src, len, matrix); 374 color_xform_RGB1<linear_from_srgb, linear_to_2dot2>(dst, src, len, matrix);
335 } 375 }
336 376
337 static void color_xform_RGB1_2dot2_to_2dot2(uint32_t* dst, const uint32_t* src, int len, 377 static void color_xform_RGB1_2dot2_to_2dot2(uint32_t* dst, const uint32_t* src, int len,
338 const float matrix[16]) { 378 const float matrix[16]) {
339 color_xform_RGB1<linear_from_2dot2>(dst, src, len, matrix); 379 color_xform_RGB1<linear_from_2dot2, linear_to_2dot2>(dst, src, len, matrix);
380 }
381
382 static void color_xform_RGB1_srgb_to_srgb(uint32_t* dst, const uint32_t* src, int len,
383 const float matrix[16]) {
384 color_xform_RGB1<linear_from_srgb, linear_to_srgb>(dst, src, len, matrix);
385 }
386
387 static void color_xform_RGB1_2dot2_to_srgb(uint32_t* dst, const uint32_t* src, int len,
388 const float matrix[16]) {
389 color_xform_RGB1<linear_from_2dot2, linear_to_srgb>(dst, src, len, matrix);
340 } 390 }
341 391
342 } 392 }
343 393
344 #endif // SkColorXform_opts_DEFINED 394 #endif // SkColorXform_opts_DEFINED
OLDNEW
« no previous file with comments | « src/core/SkOpts.cpp ('k') | src/opts/SkOpts_sse41.cpp » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698