src/opts/SkColorXform_opts.h - Issue 2078623002: Support sRGB dsts in opt code

Side by Side Diff: src/opts/SkColorXform_opts.h

Issue 2078623002: Support sRGB dsts in opt code (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: Created 4 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 /*	1 /*

2 * Copyright 2016 Google Inc.	2 * Copyright 2016 Google Inc.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #ifndef SkColorXform_opts_DEFINED	8 #ifndef SkColorXform_opts_DEFINED

9 #define SkColorXform_opts_DEFINED	9 #define SkColorXform_opts_DEFINED

10	10

(...skipping 149 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
160 __m128 x64 = _mm_rsqrt_ps(x32);	160 __m128 x64 = _mm_rsqrt_ps(x32);

161	161

162 // x^(+29/64) = x^(+1/2) * x^(-1/32) * x^(-1/64)	162 // x^(+29/64) = x^(+1/2) * x^(-1/32) * x^(-1/64)

163 // Note that we also scale to the 0-255 range.	163 // Note that we also scale to the 0-255 range.

164 // These terms can be combined more minimally with 3 muls and 1 reciprocal. However, this	164 // These terms can be combined more minimally with 3 muls and 1 reciprocal. However, this

165 // is faster, because it allows us to start the muls in parallel with the rsqrts.	165 // is faster, because it allows us to start the muls in parallel with the rsqrts.

166 __m128 scale = _mm_set1_ps(255.0f);	166 __m128 scale = _mm_set1_ps(255.0f);

167 return _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(scale, _mm_rcp_ps(x2)), x32), _mm_rcp_ps(x64));	167 return _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(scale, _mm_rcp_ps(x2)), x32), _mm_rcp_ps(x64));

168 }	168 }

169	169

	170 static __m128 if_then_else(__m128 mask, __m128 a, __m128 b) {

	171 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41

	172 return _mm_blendv_ps(b, a, mask);

	173 #else

	174 return _mm_or_ps(_mm_and_ps(a, mask), _mm_andnot_ps(b, mask));
	mtklein 2016/06/17 17:19:40: and(mask, a) or andnot(mask, b)
	msarett 2016/06/17 20:10:11: On 2016/06/17 17:19:40, mtklein wrote: > and(mask, a) or andnot(mask, b) -- Let's just use Sk4f haha :).
	175 #endif

	176 }

	177

	178 // Below is a good approximation of the sRGB gamma curve (within 1 when scaled to 8-bit pixels).

	179 // For 0.00000f <= x < 0.00349f, 12.92 * x

	180 // For 0.00349f <= x <= 1.00000f, 0.679(x.^0.5) + 0.423x.^(0.25) - 0.101

	181 // Note that the intersection was selected to be a point where both functions produce the

	182 // same pixel value when rounded.

	183 static __m128 linear_to_srgb(__m128 x) {

	184 __m128 rsqrt = _mm_rsqrt_ps(x);

	185 __m128 sqrt = _mm_rcp_ps(rsqrt);

	186 __m128 ftrt = _mm_rsqrt_ps(rsqrt);

	187

	188 __m128 hi = _mm_add_ps(_mm_add_ps( _mm_set1_ps(-0.101115084998961f * 255.0f),

	189 _mm_mul_ps(sqrt, _mm_set1_ps(+0.678513029959381f * 255.0f))),

	190 _mm_mul_ps(ftrt, _mm_set1_ps(+0.422602055039580f * 255.0f)));

	191

	192 __m128 lo = _mm_mul_ps(x, _mm_set1_ps(12.92f * 255.0f));

	193

	194 __m128 mask = _mm_cmplt_ps(x, _mm_set1_ps(0.00349f));

	195 return if_then_else(mask, lo, hi);

	196 }

	197

170 static __m128 clamp_0_to_255(__m128 x) {	198 static __m128 clamp_0_to_255(__m128 x) {

171 // The order of the arguments is important here. We want to make sure that NaN	199 // The order of the arguments is important here. We want to make sure that NaN

172 // clamps to zero. Note that max(NaN, 0) = 0, while max(0, NaN) = NaN.	200 // clamps to zero. Note that max(NaN, 0) = 0, while max(0, NaN) = NaN.

173 return _mm_min_ps(_mm_max_ps(x, _mm_setzero_ps()), _mm_set1_ps(255.0f));	201 return _mm_min_ps(_mm_max_ps(x, _mm_setzero_ps()), _mm_set1_ps(255.0f));

174 }	202 }

175	203

176 template <const float (&linear_from_curve)[256]>	204 template <const float (&linear_from_curve)[256], __m128 (*linear_to_curve)(__m128)>

177 static void color_xform_RGB1(uint32_t* dst, const uint32_t* src, int len,	205 static void color_xform_RGB1(uint32_t* dst, const uint32_t* src, int len,

178 const float matrix[16]) {	206 const float matrix[16]) {

179 // Load transformation matrix.	207 // Load transformation matrix.

180 __m128 rXgXbX = _mm_loadu_ps(&matrix[0]);	208 __m128 rXgXbX = _mm_loadu_ps(&matrix[0]);

181 __m128 rYgYbY = _mm_loadu_ps(&matrix[4]);	209 __m128 rYgYbY = _mm_loadu_ps(&matrix[4]);

182 __m128 rZgZbZ = _mm_loadu_ps(&matrix[8]);	210 __m128 rZgZbZ = _mm_loadu_ps(&matrix[8]);

183	211

184 while (len >= 4) {	212 while (len >= 4) {

185 // Convert to linear. The look-up table has perfect accuracy.	213 // Convert to linear. The look-up table has perfect accuracy.

186 __m128 reds = _mm_setr_ps(linear_from_curve[(src[0] >> 0) & 0xFF],	214 __m128 reds = _mm_setr_ps(linear_from_curve[(src[0] >> 0) & 0xFF],

(...skipping 34 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
221 __m128 bX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0xAA);	249 __m128 bX = _mm_shuffle_ps(rXgXbX, rXgXbX, 0xAA);

222 __m128 bY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0xAA);	250 __m128 bY = _mm_shuffle_ps(rYgYbY, rYgYbY, 0xAA);

223 __m128 bZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0xAA);	251 __m128 bZ = _mm_shuffle_ps(rZgZbZ, rZgZbZ, 0xAA);

224	252

225 // dstBlues = bX * reds + bY * greens + bZ * blues	253 // dstBlues = bX * reds + bY * greens + bZ * blues

226 __m128 dstBlues = _mm_mul_ps(reds, bX);	254 __m128 dstBlues = _mm_mul_ps(reds, bX);

227 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(greens, bY));	255 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(greens, bY));

228 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(blues, bZ));	256 dstBlues = _mm_add_ps(dstBlues, _mm_mul_ps(blues, bZ));

229	257

230 // Convert to dst gamma.	258 // Convert to dst gamma.

231 dstReds = linear_to_2dot2(dstReds);	259 dstReds = linear_to_curve(dstReds);

232 dstGreens = linear_to_2dot2(dstGreens);	260 dstGreens = linear_to_curve(dstGreens);

233 dstBlues = linear_to_2dot2(dstBlues);	261 dstBlues = linear_to_curve(dstBlues);

234	262

235 // Clamp floats.	263 // Clamp floats.

236 dstReds = clamp_0_to_255(dstReds);	264 dstReds = clamp_0_to_255(dstReds);

237 dstGreens = clamp_0_to_255(dstGreens);	265 dstGreens = clamp_0_to_255(dstGreens);

238 dstBlues = clamp_0_to_255(dstBlues);	266 dstBlues = clamp_0_to_255(dstBlues);

239	267

240 // Convert to bytes and store to memory.	268 // Convert to bytes and store to memory.

241 __m128i rgba = _mm_set1_epi32(0xFF000000);	269 __m128i rgba = _mm_set1_epi32(0xFF000000);

242 rgba = _mm_or_si128(rgba, _mm_cvtps_epi32(dstReds) );	270 rgba = _mm_or_si128(rgba, _mm_cvtps_epi32(dstReds) );

243 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens), 8) );	271 rgba = _mm_or_si128(rgba, _mm_slli_epi32(_mm_cvtps_epi32(dstGreens), 8) );

(...skipping 10 matching lines...) Expand all Loading...
254 __m128 r = _mm_set1_ps(linear_from_curve[(src[0] >> 0) & 0xFF]),	282 __m128 r = _mm_set1_ps(linear_from_curve[(src[0] >> 0) & 0xFF]),

255 g = _mm_set1_ps(linear_from_curve[(src[0] >> 8) & 0xFF]),	283 g = _mm_set1_ps(linear_from_curve[(src[0] >> 8) & 0xFF]),

256 b = _mm_set1_ps(linear_from_curve[(src[0] >> 16) & 0xFF]);	284 b = _mm_set1_ps(linear_from_curve[(src[0] >> 16) & 0xFF]);

257	285

258 // Apply the transformation matrix to dst gamut.	286 // Apply the transformation matrix to dst gamut.

259 __m128 dstPixel = _mm_mul_ps(r, rXgXbX);	287 __m128 dstPixel = _mm_mul_ps(r, rXgXbX);

260 dstPixel = _mm_add_ps(dstPixel, _mm_mul_ps(g, rYgYbY));	288 dstPixel = _mm_add_ps(dstPixel, _mm_mul_ps(g, rYgYbY));

261 dstPixel = _mm_add_ps(dstPixel, _mm_mul_ps(b, rZgZbZ));	289 dstPixel = _mm_add_ps(dstPixel, _mm_mul_ps(b, rZgZbZ));

262	290

263 // Convert to dst gamma.	291 // Convert to dst gamma.

264 dstPixel = linear_to_2dot2(dstPixel);	292 dstPixel = linear_to_curve(dstPixel);

265	293

266 // Clamp floats to 0-255 range.	294 // Clamp floats to 0-255 range.

267 dstPixel = clamp_0_to_255(dstPixel);	295 dstPixel = clamp_0_to_255(dstPixel);

268	296

269 // Convert to bytes and store to memory.	297 // Convert to bytes and store to memory.

270 __m128i dstInts = _mm_cvtps_epi32(dstPixel);	298 __m128i dstInts = _mm_cvtps_epi32(dstPixel);

271 __m128i dstBytes = _mm_packus_epi16(_mm_packus_epi16(dstInts, dstInts), dstInts);	299 __m128i dstBytes = _mm_packus_epi16(_mm_packus_epi16(dstInts, dstInts), dstInts);

272 dstBytes = _mm_or_si128(_mm_set1_epi32(0xFF000000), dstBytes);	300 dstBytes = _mm_or_si128(_mm_set1_epi32(0xFF000000), dstBytes);

273 _mm_store_ss((float*) dst, _mm_castsi128_ps(dstBytes));	301 _mm_store_ss((float*) dst, _mm_castsi128_ps(dstBytes));

274	302

275 dst += 1;	303 dst += 1;

276 src += 1;	304 src += 1;

277 len -= 1;	305 len -= 1;

278 }	306 }

279 }	307 }

280	308

281 #else	309 #else

282	310

	311 static float linear_to_2dot2(float v) {

	312 return powf(v, (1.0f / 2.2f)) * 255.0f;

	313 }

	314

	315 static void linear_to_srgb(float v) {

	316 if (v < 0031308f) {

	317 return (12.92f * v) * 255.0f;

	318 } else {

	319 return ((1.055f * powf(v, (1.0f / 2.4f))) - 0.055f) * 255.0f;

	320 }

	321 }

	322

283 static uint8_t clamp_float_to_byte(float v) {	323 static uint8_t clamp_float_to_byte(float v) {

284 // The ordering of the logic is a little strange here in order	324 // The ordering of the logic is a little strange here in order

285 // to make sure we convert NaNs to 0.	325 // to make sure we convert NaNs to 0.

286 if (v >= 254.5f) {	326 if (v >= 254.5f) {

287 return 255;	327 return 255;

288 } else if (v >= 0.5f) {	328 } else if (v >= 0.5f) {

289 return (uint8_t) (v + 0.5f);	329 return (uint8_t) (v + 0.5f);

290 } else {	330 } else {

291 return 0;	331 return 0;

292 }	332 }

293 }	333 }

294	334

295 template <const float (&linear_from_curve)[256]>	335 template <const float (&linear_from_curve)[256], float(*linear_to_curve)(float)>

296 static void color_xform_RGB1(uint32_t* dst, const uint32_t* src, int len,	336 static void color_xform_RGB1(uint32_t* dst, const uint32_t* src, int len,

297 const float matrix[16]) {	337 const float matrix[16]) {

298 while (len-- > 0) {	338 while (len-- > 0) {

299 // Convert to linear.	339 // Convert to linear.

300 float srcFloats[3];	340 float srcFloats[3];

301 srcFloats[0] = linear_from_curve[(*src >> 0) & 0xFF];	341 srcFloats[0] = linear_from_curve[(*src >> 0) & 0xFF];

302 srcFloats[1] = linear_from_curve[(*src >> 8) & 0xFF];	342 srcFloats[1] = linear_from_curve[(*src >> 8) & 0xFF];

303 srcFloats[2] = linear_from_curve[(*src >> 16) & 0xFF];	343 srcFloats[2] = linear_from_curve[(*src >> 16) & 0xFF];

304	344

305 // Convert to dst gamut.	345 // Convert to dst gamut.

306 float dstFloats[3];	346 float dstFloats[3];

307 dstFloats[0] = srcFloats[0] * matrix[0] + srcFloats[1] * matrix[4] +	347 dstFloats[0] = srcFloats[0] * matrix[0] + srcFloats[1] * matrix[4] +

308 srcFloats[2] * matrix[8];	348 srcFloats[2] * matrix[8];

309 dstFloats[1] = srcFloats[0] * matrix[1] + srcFloats[1] * matrix[5] +	349 dstFloats[1] = srcFloats[0] * matrix[1] + srcFloats[1] * matrix[5] +

310 srcFloats[2] * matrix[9];	350 srcFloats[2] * matrix[9];

311 dstFloats[2] = srcFloats[0] * matrix[2] + srcFloats[1] * matrix[6] +	351 dstFloats[2] = srcFloats[0] * matrix[2] + srcFloats[1] * matrix[6] +

312 srcFloats[2] * matrix[10];	352 srcFloats[2] * matrix[10];

313	353

314 // Convert to dst gamma.	354 // Convert to dst gamma.

315 // Note: pow is really, really slow. We will suffer when SSE2 is not supported.	355 // Note: pow is really, really slow. We will suffer when SSE2 is not supported.

316 dstFloats[0] = powf(dstFloats[0], (1/2.2f)) * 255.0f;	356 dstFloats[0] = linear_to_curve(dstFloats[0]);

317 dstFloats[1] = powf(dstFloats[1], (1/2.2f)) * 255.0f;	357 dstFloats[1] = linear_to_curve(dstFloats[1]);

318 dstFloats[2] = powf(dstFloats[2], (1/2.2f)) * 255.0f;	358 dstFloats[2] = linear_to_curve(dstFloats[2]);

319	359

320 *dst = (0xFF << 24) |	360 *dst = (0xFF << 24) |

321 (clamp_float_to_byte(dstFloats[2]) << 16) |	361 (clamp_float_to_byte(dstFloats[2]) << 16) |

322 (clamp_float_to_byte(dstFloats[1]) << 8) |	362 (clamp_float_to_byte(dstFloats[1]) << 8) |

323 (clamp_float_to_byte(dstFloats[0]) << 0);	363 (clamp_float_to_byte(dstFloats[0]) << 0);

324	364

325 dst++;	365 dst++;

326 src++;	366 src++;

327 }	367 }

328 }	368 }

329	369

330 #endif	370 #endif

331	371

332 static void color_xform_RGB1_srgb_to_2dot2(uint32_t* dst, const uint32_t* src, int len,	372 static void color_xform_RGB1_srgb_to_2dot2(uint32_t* dst, const uint32_t* src, int len,

333 const float matrix[16]) {	373 const float matrix[16]) {

334 color_xform_RGB1<linear_from_srgb>(dst, src, len, matrix);	374 color_xform_RGB1<linear_from_srgb, linear_to_2dot2>(dst, src, len, matrix);

335 }	375 }

336	376

337 static void color_xform_RGB1_2dot2_to_2dot2(uint32_t* dst, const uint32_t* src, int len,	377 static void color_xform_RGB1_2dot2_to_2dot2(uint32_t* dst, const uint32_t* src, int len,

338 const float matrix[16]) {	378 const float matrix[16]) {

339 color_xform_RGB1<linear_from_2dot2>(dst, src, len, matrix);	379 color_xform_RGB1<linear_from_2dot2, linear_to_2dot2>(dst, src, len, matrix);

	380 }

	381

	382 static void color_xform_RGB1_srgb_to_srgb(uint32_t* dst, const uint32_t* src, int len,

	383 const float matrix[16]) {

	384 color_xform_RGB1<linear_from_srgb, linear_to_srgb>(dst, src, len, matrix);

	385 }

	386

	387 static void color_xform_RGB1_2dot2_to_srgb(uint32_t* dst, const uint32_t* src, int len,

	388 const float matrix[16]) {

	389 color_xform_RGB1<linear_from_2dot2, linear_to_srgb>(dst, src, len, matrix);

340 }	390 }

341	391

342 }	392 }

343	393

344 #endif // SkColorXform_opts_DEFINED	394 #endif // SkColorXform_opts_DEFINED

OLD	NEW

« no previous file with comments | « src/core/SkOpts.cpp ('k') | src/opts/SkOpts_sse41.cpp » ('j') | no next file with comments »