src/opts/SkRasterPipeline_opts.h - Issue 2449243003: Initial implementation of a SkColorSpace_A2B xform

Side by Side Diff: src/opts/SkRasterPipeline_opts.h

Issue 2449243003: Initial implementation of a SkColorSpace_A2B xform (Closed)

Patch Set: updated implementation to use SkRasterPipeline Created 4 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 /*	1 /*

2 * Copyright 2016 Google Inc.	2 * Copyright 2016 Google Inc.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #ifndef SkRasterPipeline_opts_DEFINED	8 #ifndef SkRasterPipeline_opts_DEFINED

9 #define SkRasterPipeline_opts_DEFINED	9 #define SkRasterPipeline_opts_DEFINED

10	10

11 #include "SkColorPriv.h"	11 #include "SkColorPriv.h"

	12 #include "SkColorSpace_Base.h"

12 #include "SkHalf.h"	13 #include "SkHalf.h"

	14 #include "SkMatrix44.h"

13 #include "SkPM4f.h"	15 #include "SkPM4f.h"

14 #include "SkPM4fPriv.h"	16 #include "SkPM4fPriv.h"

15 #include "SkRasterPipeline.h"	17 #include "SkRasterPipeline.h"

16 #include "SkSRGB.h"	18 #include "SkSRGB.h"

17 #include "SkUtils.h"	19 #include "SkUtils.h"

18 #include <utility>	20 #include <utility>

19	21

20 namespace {	22 namespace {

21	23

22 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2	24 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2

(...skipping 104 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
127 SkNf dr, SkNf dg, SkNf db, SkNf da) { \	129 SkNf dr, SkNf dg, SkNf db, SkNf da) { \

128 r = name##_kernel(r,a,dr,da); \	130 r = name##_kernel(r,a,dr,da); \

129 g = name##_kernel(g,a,dg,da); \	131 g = name##_kernel(g,a,dg,da); \

130 b = name##_kernel(b,a,db,da); \	132 b = name##_kernel(b,a,db,da); \

131 a = a + (da * (1.0f-a)); \	133 a = a + (da * (1.0f-a)); \

132 next(st, x,tail, r,g,b,a, dr,dg,db,da); \	134 next(st, x,tail, r,g,b,a, dr,dg,db,da); \

133 } \	135 } \

134 static SK_ALWAYS_INLINE SkNf name##_kernel(const SkNf& s, const SkNf& sa, \	136 static SK_ALWAYS_INLINE SkNf name##_kernel(const SkNf& s, const SkNf& sa, \

135 const SkNf& d, const SkNf& da)	137 const SkNf& d, const SkNf& da)

136	138

	139

	140 #define GAMMA_STAGE(name) \
	msarett1 2016/11/09 00:01:05 Instead of this, I think I would prefer 6 normal s Instead of this, I think I would prefer 6 normal stages. Each of the stages can call a helper function to apply the gamma. So the code that actually applies the gamma is shared. mtklein_C 2016/11/09 11:04:36 I think you mean 3 normal stages? Each STAGE invo Show quoted text On 2016/11/09 at 00:01:05, msarett1 wrote: > Instead of this, I think I would prefer 6 normal stages. Each of the stages can call a helper function to apply the gamma. So the code that actually applies the gamma is shared. I think you mean 3 normal stages? Each STAGE invocation handles Body/Tail pair. Yeah, none of the GAMMA_STAGEs look like they're going to perform significantly differently from serial code, so we're probably not going to mind the gamma being a function call. Might be kind of fun to see just how general-purpose we can make these: STAGE(fn_r) { auto fn = (const std::function<float(float)>)ctx; float result[N]; for (int i = 0; i < N; i++) { result[i] = (fn)(r[i]); } r = SkNf::Load(result); } STAGE(fn_g) { ... } STAGE(fn_b) { ... } STAGE(fn_a) { ... } raftias 2016/11/10 21:36:06 I did this with fn_1_r/g/b. If we add in specific Show quoted text On 2016/11/09 11:04:36, mtklein_C wrote: > On 2016/11/09 at 00:01:05, msarett1 wrote: > > Instead of this, I think I would prefer 6 normal stages. Each of the stages > can call a helper function to apply the gamma. So the code that actually > applies the gamma is shared. > > I think you mean 3 normal stages? Each STAGE invocation handles Body/Tail pair. > > Yeah, none of the GAMMA_STAGEs look like they're going to perform significantly > differently from serial code, so we're probably not going to mind the gamma > being a function call. 
Might be kind of fun to see just how general-purpose we > can make these: > > STAGE(fn_r) { > auto fn = (const std::function<float(float)>)ctx; > > float result[N]; > for (int i = 0; i < N; i++) { > result[i] = (fn)(r[i]); > } > r = SkNf::Load(result); > } > STAGE(fn_g) { ... } > STAGE(fn_b) { ... } > STAGE(fn_a) { ... } I did this with fn_1_r/g/b. If we add in specific functions for other gamma types (sRGB/2.2) that have a vectorized implementation we could do an efficient fn_n_r/g/b that uses those I guess.
	141 static SK_ALWAYS_INLINE SkNf name##_kernel(void* ctx, SkNf& s); \

	142 SI void SK_VECTORCALL name##_r(BodyStage* st, size_t x, \

	143 SkNf r, SkNf g, SkNf b, SkNf a, \

	144 SkNf dr, SkNf dg, SkNf db, SkNf da) { \

	145 r = name##_kernel(st->ctx, r); \

	146 next(st, x, r,g,b,a, dr,dg,db,da); \

	147 } \

	148 SI void SK_VECTORCALL name##_r(TailStage* st, size_t x, size_t tail, \

	149 SkNf r, SkNf g, SkNf b, SkNf a, \

	150 SkNf dr, SkNf dg, SkNf db, SkNf da) { \

	151 r = name##_kernel(st->ctx, r); \

	152 next(st, x,tail, r,g,b,a, dr,dg,db,da); \

	153 } \

	154 SI void SK_VECTORCALL name##_g(BodyStage* st, size_t x, \

	155 SkNf r, SkNf g, SkNf b, SkNf a, \

	156 SkNf dr, SkNf dg, SkNf db, SkNf da) { \

	157 g = name##_kernel(st->ctx, g); \

	158 next(st, x, r,g,b,a, dr,dg,db,da); \

	159 } \

	160 SI void SK_VECTORCALL name##_g(TailStage* st, size_t x, size_t tail, \

	161 SkNf r, SkNf g, SkNf b, SkNf a, \

	162 SkNf dr, SkNf dg, SkNf db, SkNf da) { \

	163 g = name##_kernel(st->ctx, g); \

	164 next(st, x,tail, r,g,b,a, dr,dg,db,da); \

	165 } \

	166 SI void SK_VECTORCALL name##_b(BodyStage* st, size_t x, \

	167 SkNf r, SkNf g, SkNf b, SkNf a, \

	168 SkNf dr, SkNf dg, SkNf db, SkNf da) { \

	169 b = name##_kernel(st->ctx, b); \

	170 next(st, x, r,g,b,a, dr,dg,db,da); \

	171 } \

	172 SI void SK_VECTORCALL name##_b(TailStage* st, size_t x, size_t tail, \

	173 SkNf r, SkNf g, SkNf b, SkNf a, \

	174 SkNf dr, SkNf dg, SkNf db, SkNf da) { \

	175 b = name##_kernel(st->ctx, b); \

	176 next(st, x,tail, r,g,b,a, dr,dg,db,da); \

	177 } \

	178 SI void SK_VECTORCALL name##_a(BodyStage* st, size_t x, \
	msarett1 2016/11/09 00:01:05 All we need to do with "a" is load it and store. All we need to do with "a" is load it and store. The transformation happens on r, g, and b. Perhaps this is for when we abuse "a" for CMYK? In that case, let's not make this change until we are adding CMYK support. raftias 2016/11/10 21:36:07 It was indeed for CMYK/etc support. Show quoted text On 2016/11/09 00:01:05, msarett1 wrote: > All we need to do with "a" is load it and store. The transformation happens on > r, g, and b. > > Perhaps this is for when we abuse "a" for CMYK? In that case, let's not make > this change until we are adding CMYK support. It was indeed for CMYK/etc support.
	179 SkNf r, SkNf g, SkNf b, SkNf a, \

	180 SkNf dr, SkNf dg, SkNf db, SkNf da) { \

	181 a = name##_kernel(st->ctx, a); \

	182 next(st, x, r,g,b,a, dr,dg,db,da); \

	183 } \

	184 SI void SK_VECTORCALL name##_a(TailStage* st, size_t x, size_t tail, \

	185 SkNf r, SkNf g, SkNf b, SkNf a, \

	186 SkNf dr, SkNf dg, SkNf db, SkNf da) { \

	187 a = name##_kernel(st->ctx, a); \

	188 next(st, x,tail, r,g,b,a, dr,dg,db,da); \

	189 } \

	190 static SK_ALWAYS_INLINE SkNf name##_kernel(void* ctx, SkNf& s)

	191

137 SI SkNf inv(const SkNf& x) { return 1.0f - x; }	192 SI SkNf inv(const SkNf& x) { return 1.0f - x; }

138	193

139 SI SkNf lerp(const SkNf& from, const SkNf& to, const SkNf& cov) {	194 SI SkNf lerp(const SkNf& from, const SkNf& to, const SkNf& cov) {

140 return SkNx_fma(to-from, cov, from);	195 return SkNx_fma(to-from, cov, from);

141 }	196 }

142	197

143 template <bool kIsTail, typename T>	198 template <bool kIsTail, typename T>

144 SI SkNx<N,T> load(size_t tail, const T* src) {	199 SI SkNx<N,T> load(size_t tail, const T* src) {

145 SkASSERT(kIsTail == (tail > 0));	200 SkASSERT(kIsTail == (tail > 0));

146 // TODO: maskload for 32- and 64-bit T	201 // TODO: maskload for 32- and 64-bit T

(...skipping 277 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
424 }	479 }

425	480

426 STAGE(store_srgb, false) {	481 STAGE(store_srgb, false) {

427 auto ptr = (uint32_t*)ctx + x;	482 auto ptr = (uint32_t*)ctx + x;

428 store<kIsTail>(tail, ( sk_linear_to_srgb(r) << SK_R32_SHIFT	483 store<kIsTail>(tail, ( sk_linear_to_srgb(r) << SK_R32_SHIFT

429 \| sk_linear_to_srgb(g) << SK_G32_SHIFT	484 \| sk_linear_to_srgb(g) << SK_G32_SHIFT

430 \| sk_linear_to_srgb(b) << SK_B32_SHIFT	485 \| sk_linear_to_srgb(b) << SK_B32_SHIFT

431 \| SkNx_cast<int>(0.5f + 255.0f * a) << SK_A32_SHIFT), ( int*)ptr);	486 \| SkNx_cast<int>(0.5f + 255.0f * a) << SK_A32_SHIFT), ( int*)ptr);

432 }	487 }

433	488

	489 STAGE(load_s_linear_rgba, true) {
	msarett1 2016/11/09 00:01:05 nit: Follow style conventions from above Use whit nit: Follow style conventions from above Use whitespace to line things up 0xff instead of 0xFF mtklein_C 2016/11/09 11:04:36 Let's call these _8888. That's our common shortha Let's call these _8888. That's our common shorthand for gamma=1 8-bit component colors.
	490 auto ptr = (const uint32_t*)ctx + x;

	491

	492 auto px = load<kIsTail>(tail, ptr);

	493 auto to_int = [](const SkNx<N, uint32_t>& v) { return SkNi::Load(&v); };

	494 r = (1/255.0f)*SkNx_cast<float>(to_int((px >> 0) & 0xFF));

	495 g = (1/255.0f)*SkNx_cast<float>(to_int((px >> 8) & 0xFF));

	496 b = (1/255.0f)*SkNx_cast<float>(to_int((px >> 16) & 0xFF));

	497 a = (1/255.0f)*SkNx_cast<float>(to_int(px >> 24));

	498 }

	499

	500 STAGE(load_s_linear_bgra, true) {
	mtklein_C 2016/11/09 11:04:36 How about we write everything in terms of rgba, an How about we write everything in terms of rgba, and use an extra swap_rb stage if we need to handle bgra. Something like this? STAGE(swap_rb) { SkTSwap(r,b); } STAGE(load_s_8888, true) { ... } STAGE(store_8888, false) { ... } raftias 2016/11/10 21:36:07 I that before (with that exact name, even), then t Show quoted text On 2016/11/09 11:04:36, mtklein_C wrote: > How about we write everything in terms of rgba, and use an extra swap_rb stage > if we need to handle bgra. Something like this? > > STAGE(swap_rb) { > SkTSwap(r,b); > } > > STAGE(load_s_8888, true) { ... } > STAGE(store_8888, false) { ... } I that before (with that exact name, even), then talked to Matt and removed it. I could add it back, though. msarett1 2016/11/11 14:36:51 Let's defer to Mike on this one. lgtm, as is. Show quoted text On 2016/11/10 21:36:07, raftias wrote: > On 2016/11/09 11:04:36, mtklein_C wrote: > > How about we write everything in terms of rgba, and use an extra swap_rb stage > > if we need to handle bgra. Something like this? > > > > STAGE(swap_rb) { > > SkTSwap(r,b); > > } > > > > STAGE(load_s_8888, true) { ... } > > STAGE(store_8888, false) { ... } > > I that before (with that exact name, even), then talked to Matt and removed it. > I could add it back, though. Let's defer to Mike on this one. lgtm, as is.
	501 auto ptr = (const uint32_t*)ctx + x;

	502

	503 auto px = load<kIsTail>(tail, ptr);

	504 auto to_int = [](const SkNx<N, uint32_t>& v) { return SkNi::Load(&v); };

	505 r = (1/255.0f)*SkNx_cast<float>(to_int((px >> 16) & 0xFF));

	506 g = (1/255.0f)*SkNx_cast<float>(to_int((px >> 8) & 0xFF));

	507 b = (1/255.0f)*SkNx_cast<float>(to_int((px >> 0) & 0xFF));

	508 a = (1/255.0f)*SkNx_cast<float>(to_int((px >> 24)));

	509 }

	510

	511 // Clamp colors into [0,1] premul (e.g. just before storing back to memory).
	raftias 2016/11/08 21:19:58 I noticed when I pulled before uploading that this I noticed when I pulled before uploading that this was recently removed from the other store functions - however without it I get really psychedelic results from over(or under?)flow. msarett1 2016/11/09 00:01:05 I believe the idea is to not waste time clamping w Show quoted text On 2016/11/08 21:19:58, raftias wrote: > I noticed when I pulled before uploading that this was recently removed from the > other store functions - however without it I get really psychedelic results from > over(or under?)flow. I believe the idea is to not waste time clamping when we are sure that 0-1 input values guarantee 0-1 output values. When we are not sure, we should clamp. mtklein_C 2016/11/09 11:04:36 This has now been split into two stages, clamp_0 a Show quoted text On 2016/11/09 at 00:01:05, msarett1 wrote: > On 2016/11/08 21:19:58, raftias wrote: > > I noticed when I pulled before uploading that this was recently removed from the > > other store functions - however without it I get really psychedelic results from > > over(or under?)flow. > > I believe the idea is to not waste time clamping when we are sure that 0-1 input values guarantee 0-1 output values. When we are not sure, we should clamp. This has now been split into two stages, clamp_0 and clamp_a. We noticed that in most pipelines clamping was not required, and often when it was, only in one direction. As stages, we can add one, both, or neither as needed.
	512 SI void clamp_01_premul(SkNf& r, SkNf& g, SkNf& b, SkNf& a) {

	513 a = SkNf::Max(a, 0.0f);

	514 r = SkNf::Max(r, 0.0f);

	515 g = SkNf::Max(g, 0.0f);

	516 b = SkNf::Max(b, 0.0f);

	517

	518 a = SkNf::Min(a, 1.0f);

	519 r = SkNf::Min(r, a);

	520 g = SkNf::Min(g, a);

	521 b = SkNf::Min(b, a);

	522 }

	523

	524 STAGE(store_linear_rgba, false) {

	525 clamp_01_premul(r,g,b,a);

	526 auto ptr = (uint32_t*)ctx + x;

	527 store<kIsTail>(tail, ( SkNx_cast<int>(255.0f * r + 0.5f) << 0
	msarett1 2016/11/09 00:01:05 I don't think you need the "+ 0.5f" terms. I thin I don't think you need the "+ 0.5f" terms. I think Mike is just adding those above to avoid weird rounding issues where alpha ends up greater than r, g, or b? In that case, definitely remove from r, g, and b - and I think from a as well. mtklein_C 2016/11/09 11:04:36 No, we're doing that to round to the nearest byte Show quoted text On 2016/11/09 at 00:01:05, msarett1 wrote: > I don't think you need the "+ 0.5f" terms. I think Mike is just adding those above to avoid weird rounding issues where alpha ends up greater than r, g, or b? > > In that case, definitely remove from r, g, and b - and I think from a as well. No, we're doing that to round to the nearest byte rather than truncate. I think it's right to leave them all in.
	528 \| SkNx_cast<int>(255.0f * g + 0.5f) << 8

	529 \| SkNx_cast<int>(255.0f * b + 0.5f) << 16

	530 \| SkNx_cast<int>(255.0f * a + 0.5f) << 24 ), (int*)ptr) ;

	531 }

	532

	533 STAGE(store_linear_bgra, false) {

	534 clamp_01_premul(r,g,b,a);

	535 auto ptr = (uint32_t*)ctx + x;

	536 store<kIsTail>(tail, ( SkNx_cast<int>(255.0f * r + 0.5f) << 16

	537 \| SkNx_cast<int>(255.0f * g + 0.5f) << 8

	538 \| SkNx_cast<int>(255.0f * b + 0.5f) << 0

	539 \| SkNx_cast<int>(255.0f * a + 0.5f) << 24 ), (int*)ptr) ;

	540 }

	541

434 RGBA_XFERMODE(clear) { return 0.0f; }	542 RGBA_XFERMODE(clear) { return 0.0f; }

435 //RGBA_XFERMODE(src) { return s; } // This would be a no-op stage, so we just omit it.	543 //RGBA_XFERMODE(src) { return s; } // This would be a no-op stage, so we just omit it.

436 RGBA_XFERMODE(dst) { return d; }	544 RGBA_XFERMODE(dst) { return d; }

437	545

438 RGBA_XFERMODE(srcatop) { return s*da + d*inv(sa); }	546 RGBA_XFERMODE(srcatop) { return s*da + d*inv(sa); }

439 RGBA_XFERMODE(srcin) { return s * da; }	547 RGBA_XFERMODE(srcin) { return s * da; }

440 RGBA_XFERMODE(srcout) { return s * inv(da); }	548 RGBA_XFERMODE(srcout) { return s * inv(da); }

441 RGBA_XFERMODE(srcover) { return SkNx_fma(d, inv(sa), s); }	549 RGBA_XFERMODE(srcover) { return SkNx_fma(d, inv(sa), s); }

442 RGBA_XFERMODE(dstatop) { return srcatop_kernel(d,da,s,sa); }	550 RGBA_XFERMODE(dstatop) { return srcatop_kernel(d,da,s,sa); }

443 RGBA_XFERMODE(dstin) { return srcin_kernel (d,da,s,sa); }	551 RGBA_XFERMODE(dstin) { return srcin_kernel (d,da,s,sa); }

(...skipping 39 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
483 liteDst = m.rsqrt().invert() - m, // Used in case 3.	591 liteDst = m.rsqrt().invert() - m, // Used in case 3.

484 liteSrc = d*sa + da*(s2 - sa) * (4.0f*d <= da).thenElse(darkDst, liteDst); // 2 or 3?	592 liteSrc = d*sa + da*(s2 - sa) * (4.0f*d <= da).thenElse(darkDst, liteDst); // 2 or 3?

485 return s*inv(da) + d*inv(sa) + (s2 <= sa).thenElse(darkSrc, liteSrc); // 1 or (2 or 3)?	593 return s*inv(da) + d*inv(sa) + (s2 <= sa).thenElse(darkSrc, liteSrc); // 1 or (2 or 3)?

486 }	594 }

487	595

488 STAGE(luminance_to_alpha, true) {	596 STAGE(luminance_to_alpha, true) {

489 a = SK_LUM_COEFF_R*r + SK_LUM_COEFF_G*g + SK_LUM_COEFF_B*b;	597 a = SK_LUM_COEFF_R*r + SK_LUM_COEFF_G*g + SK_LUM_COEFF_B*b;

490 r = g = b = 0;	598 r = g = b = 0;

491 }	599 }

492	600

	601 STAGE(matrix_4x4, true) {

	602 const SkMatrix44& mat = *(const SkMatrix44*)ctx;
	mtklein_C 2016/11/09 11:04:36 I'd like matrix_4x4 and matrix_4x5 to look and beh I'd like matrix_4x4 and matrix_4x5 to look and behave as similarly as possible, so that the differences they do have stand out. Can just take a column-major float[16] here, swap the order of coefficients in the fma()s in one of the two functions, etc? I wouldn't mind splitting matrix_4x5 into matrix_4x4 and translate_4x1. raftias 2016/11/10 21:36:06 They were just different since I had written it an Show quoted text On 2016/11/09 11:04:36, mtklein_C wrote: > I'd like matrix_4x4 and matrix_4x5 to look and behave as similarly as possible, > so that the differences they do have stand out. Can just take a column-major > float[16] here, swap the order of coefficients in the fma()s in one of the two > functions, etc? > > I wouldn't mind splitting matrix_4x5 into matrix_4x4 and translate_4x1. They were just different since I had written it and didn't see 4x5 until I merged right before submitting the CL. I could do that though.
	603 auto fma = [](const SkNf& f, const SkNf& m, const SkNf& a) { return SkNx_fma (f,m,a); };

	604 dr = fma(mat.get(0, 0),r, fma(mat.get(0, 1),g, fma(mat.get(0, 2),b, mat.get( 0, 3)*a)));
	msarett1 2016/11/09 00:01:05 No need for "a". Actually I think we don't want No need for "a". Actually I think we don't want it for sure. We're probably just lucky that it's always 1 here. mtklein_C 2016/11/09 11:04:36 If we don't a here, we can't really call this sta Show quoted text On 2016/11/09 at 00:01:05, msarett1 wrote: > No need for "a". Actually I think we don't want it for sure. We're probably just lucky that it's always 1 here. If we don't a here, we can't really call this stage matrix_4x4. raftias* 2016/11/10 21:36:06 It's 3x4 now. Show quoted text On 2016/11/09 11:04:36, mtklein_C wrote: > On 2016/11/09 at 00:01:05, msarett1 wrote: > > No need for "a". Actually I think we don't want it for sure. We're probably > just lucky that it's always 1 here. > > If we don't a here, we can't really call this stage matrix_4x4. It's 3x4 now.
	605 dg = fma(mat.get(1, 0),r, fma(mat.get(1, 1),g, fma(mat.get(1, 2),b, mat.get( 1, 3)*a)));

	606 db = fma(mat.get(2, 0),r, fma(mat.get(2, 1),g, fma(mat.get(2, 2),b, mat.get( 2, 3)*a)));
	msarett1 2016/11/09 00:01:05 Mike, is it ok that we're destructive to dr, dg, d Mike, is it ok that we're destructive to dr, dg, db, da here? mtklein_C 2016/11/09 11:04:36 The pedantic answer is that that depends what you' Show quoted text On 2016/11/09 at 00:01:05, msarett1 wrote: > Mike, is it ok that we're destructive to dr, dg, db, da here? The pedantic answer is that that depends what you're doing with them later... it may be okay for this particular use case, but it sure limits other use of matrix_4x4 if it always destroys those values. Generally there's no need to do this. It's almost always better to make new names like we do in matrix_4x5. There are usually plenty of temporary registers available. Sometimes calling convention limitations mean {dr,dg,db,da} live on the stack while there are several registers available as temporaries within a function. But the most important reason not to do this is it reads weird. We should only change dr if the code's purpose is to change dr. If we write code like this, it looks like we're intentionally leaving these values in both {r,g,b,a} and {dr,dg,db,da}. raftias 2016/11/10 21:36:06 I'll remove these and put them in temporaries. I j Show quoted text On 2016/11/09 11:04:36, mtklein_C wrote: > On 2016/11/09 at 00:01:05, msarett1 wrote: > > Mike, is it ok that we're destructive to dr, dg, db, da here? > > The pedantic answer is that that depends what you're doing with them later... it > may be okay for this particular use case, but it sure limits other use of > matrix_4x4 if it always destroys those values. > > Generally there's no need to do this. It's almost always better to make new > names like we do in matrix_4x5. There are usually plenty of temporary registers > available. Sometimes calling convention limitations mean {dr,dg,db,da} live on > the stack while there are several registers available as temporaries within a > function. > > But the most important reason not to do this is it reads weird. 
We should only > change dr if the code's purpose is to change dr. If we write code like this, it > looks like we're intentionally leaving these values in both {r,g,b,a} and > {dr,dg,db,da}. I'll remove these and put them in temporaries. I just thought we were low on registers due to a comment near the top of the file about Windows and that these would already be in registers.
	607 da = fma(mat.get(3, 0),r, fma(mat.get(3, 1),g, fma(mat.get(3, 2),b, mat.get( 3, 3)*a)));

	608 r = dr;

	609 g = dg;

	610 b = db;

	611 a = da;

	612 }

	613

493 STAGE(matrix_4x5, true) {	614 STAGE(matrix_4x5, true) {

494 auto m = (const float*)ctx;	615 auto m = (const float*)ctx;

495	616

496 auto fma = [](const SkNf& f, const SkNf& m, const SkNf& a) { return SkNx_fma (f,m,a); };	617 auto fma = [](const SkNf& f, const SkNf& m, const SkNf& a) { return SkNx_fma (f,m,a); };

497 auto R = fma(r,m[0], fma(g,m[4], fma(b,m[ 8], fma(a,m[12], m[16])))),	618 auto R = fma(r,m[0], fma(g,m[4], fma(b,m[ 8], fma(a,m[12], m[16])))),

498 G = fma(r,m[1], fma(g,m[5], fma(b,m[ 9], fma(a,m[13], m[17])))),	619 G = fma(r,m[1], fma(g,m[5], fma(b,m[ 9], fma(a,m[13], m[17])))),

499 B = fma(r,m[2], fma(g,m[6], fma(b,m[10], fma(a,m[14], m[18])))),	620 B = fma(r,m[2], fma(g,m[6], fma(b,m[10], fma(a,m[14], m[18])))),

500 A = fma(r,m[3], fma(g,m[7], fma(b,m[11], fma(a,m[15], m[19]))));	621 A = fma(r,m[3], fma(g,m[7], fma(b,m[11], fma(a,m[15], m[19]))));

501 r = R;	622 r = R;

502 g = G;	623 g = G;

503 b = B;	624 b = B;

504 a = A;	625 a = A;

505 }	626 }

506	627

	628 static inline Sk4f powNf(const Sk4f& x, float exp) {
	mtklein_C 2016/11/09 11:04:36 Generally this file writes static inline as SI. I Generally this file writes static inline as SI. It's not important. (It _is_ important that we mark these functions as both static and inline; SI is there to make it easy.) raftias 2016/11/10 21:36:06 Acknowledged. Show quoted text On 2016/11/09 11:04:36, mtklein_C wrote: > Generally this file writes static inline as SI. It's not important. (It _is_ > important that we mark these functions as both static and inline; SI is there to > make it easy.) Acknowledged.
	629 return Sk4f{::powf(x[0], exp), ::powf(x[1], exp), ::powf(x[2], exp), ::powf( x[3], exp)};

	630 }

	631

	632 static inline Sk8f powNf(const Sk8f& x, float exp) {

	633 return Sk8f{::powf(x[0], exp), ::powf(x[1], exp), ::powf(x[2], exp), ::powf( x[3], exp),

	634 ::powf(x[4], exp), ::powf(x[5], exp), ::powf(x[6], exp), ::powf( x[7], exp)};

	635 }

	636

	637 GAMMA_STAGE(param_gamma) {

	638 const SkColorSpaceTransferFn& gamma = *(const SkColorSpaceTransferFn*)ctx;

	639 return (s <= gamma.fD).thenElse(gamma.fE * s + gamma.fF,
	msarett1 2016/11/09 00:01:05 nit: < instead of <= nit: < instead of <= raftias 2016/11/10 21:36:06 Done. Show quoted text On 2016/11/09 00:01:05, msarett1 wrote: > nit: < instead of <= Done.
	640 powNf(s * gamma.fA + gamma.fB, gamma.fG) + gamma.fC);

	641 }

	642

	643 static constexpr float kGammaTableSize = 1024;

	644

	645 GAMMA_STAGE(table_gamma) {

	646 constexpr float maxIndex = kGammaTableSize - 1;

	647 const float* gammaTables = (const float*)ctx;
	mtklein_C 2016/11/09 11:04:36 This name makes it seem like we're going to be usi This name makes it seem like we're going to be using different tables. There's only one table here right? auto table = (const float*)ctx; raftias 2016/11/10 21:36:06 Acknowledged. Show quoted text On 2016/11/09 11:04:36, mtklein_C wrote: > This name makes it seem like we're going to be using different tables. There's > only one table here right? > > auto table = (const float*)ctx; Acknowledged.
	648 s = SkNf::Min(SkNf::Max(maxIndex * s, 0.f), maxIndex);
	mtklein_C 2016/11/09 11:04:36 If we're not going to source the 1024 constant fro If we're not going to source the 1024 constant from somewhere, I'd rather just go fully nameless: s = SkNx::Max(0, SkNf::Min(s, 1)) * 1023; raftias 2016/11/10 21:36:06 ApplyTable stores it now. Show quoted text On 2016/11/09 11:04:36, mtklein_C wrote: > If we're not going to source the 1024 constant from somewhere, I'd rather just > go fully nameless: > > s = SkNx::Max(0, SkNf::Min(s, 1)) * 1023; ApplyTable stores it now.
	649 float result[N];

	650 for (int i = 0; i < N; ++i) {

	651 result[i] = gammaTables[lrintf(s[i])];

	652 }

	653 return SkNf::Load(result);

	654 }

	655

	656 static inline void interp_3d_clut(float dst[3], float src[3], const SkColorLookUpTable* colorLUT) {
	msarett1 2016/11/09 00:01:05 This maybe does not need to belong in this file. This maybe does not need to belong in this file. Possibly should be SkColorSpaceXformPriv.h. Mike? mtklein_C 2016/11/09 11:04:36 Why don't we make this a normal, separately-compil Show quoted text On 2016/11/09 at 00:01:05, msarett1 wrote: > This maybe does not need to belong in this file. Possibly should be SkColorSpaceXformPriv.h. Mike? Why don't we make this a normal, separately-compiled method of SkColorLookupTable? It's giant, serial, and we're going to call it a loop serially around that. Inlining it can't be important... raftias 2016/11/10 21:36:06 Done. Show quoted text On 2016/11/09 11:04:36, mtklein_C wrote: > On 2016/11/09 at 00:01:05, msarett1 wrote: > > This maybe does not need to belong in this file. Possibly should be > SkColorSpaceXformPriv.h. Mike? > > Why don't we make this a normal, separately-compiled method of > SkColorLookupTable? > > It's giant, serial, and we're going to call it a loop serially around that. > Inlining it can't be important... Done.
	657 // Call the src components x, y, and z.

	658 uint8_t maxX = colorLUT->fGridPoints[0] - 1;

	659 uint8_t maxY = colorLUT->fGridPoints[1] - 1;

	660 uint8_t maxZ = colorLUT->fGridPoints[2] - 1;

	661

	662 // An approximate index into each of the three dimensions of the table.

	663 float x = src[0] * maxX;

	664 float y = src[1] * maxY;

	665 float z = src[2] * maxZ;

	666

	667 // This gives us the low index for our interpolation.

	668 int ix = sk_float_floor2int(x);

	669 int iy = sk_float_floor2int(y);

	670 int iz = sk_float_floor2int(z);

	671

	672 // Make sure the low index is not also the max index.

	673 ix = (maxX == ix) ? ix - 1 : ix;

	674 iy = (maxY == iy) ? iy - 1 : iy;

	675 iz = (maxZ == iz) ? iz - 1 : iz;

	676

	677 // Weighting factors for the interpolation.

	678 float diffX = x - ix;

	679 float diffY = y - iy;

	680 float diffZ = z - iz;

	681

	682 // Constants to help us navigate the 3D table.

	683 // Ex: Assume x = a, y = b, z = c.

	684 // table[a * n001 + b * n010 + c * n100] logically equals table[a][b][c] .

	685 const int n000 = 0;

	686 const int n001 = 3 * colorLUT->fGridPoints[1] * colorLUT->fGridPoints[2];

	687 const int n010 = 3 * colorLUT->fGridPoints[2];

	688 const int n011 = n001 + n010;

	689 const int n100 = 3;

	690 const int n101 = n100 + n001;

	691 const int n110 = n100 + n010;

	692 const int n111 = n110 + n001;

	693

	694 // Base ptr into the table.

	695 const float* ptr = &(colorLUT->table()[ix*n001 + iy*n010 + iz*n100]);

	696

	697 // The code below performs a tetrahedral interpolation for each of the three

	698 // dst components. Once the tetrahedron containing the interpolation point is

	699 // identified, the interpolation is a weighted sum of grid values at the

	700 // vertices of the tetrahedron. The claim is that tetrahedral interpolation

	701 // provides a more accurate color conversion.

	702 // blogs.mathworks.com/steve/2006/11/24/tetrahedral-interpolation-for-colorspace-conversion/

	703 //

	704 // I have one test image, and visually I can't tell the difference between

	705 // tetrahedral and trilinear interpolation. In terms of computation, the

	706 // tetrahedral code requires more branches but less computation. The

	707 // SampleICC library provides an option for the client to choose either

	708 // tetrahedral or trilinear.

	709 for (int i = 0; i < 3; i++) {

	710 if (diffZ < diffY) {

	711 if (diffZ < diffX) {

	712 dst[i] = (ptr[n000] + diffZ * (ptr[n110] - ptr[n010]) +

	713 diffY * (ptr[n010] - ptr[n000]) +

	714 diffX * (ptr[n111] - ptr[n110]));

	715 } else if (diffY < diffX) {

	716 dst[i] = (ptr[n000] + diffZ * (ptr[n111] - ptr[n011]) +

	717 diffY * (ptr[n011] - ptr[n001]) +

	718 diffX * (ptr[n001] - ptr[n000]));

	719 } else {

	720 dst[i] = (ptr[n000] + diffZ * (ptr[n111] - ptr[n011]) +

	721 diffY * (ptr[n010] - ptr[n000]) +

	722 diffX * (ptr[n011] - ptr[n010]));

	723 }

	724 } else {

	725 if (diffZ < diffX) {

	726 dst[i] = (ptr[n000] + diffZ * (ptr[n101] - ptr[n001]) +

	727 diffY * (ptr[n111] - ptr[n101]) +

	728 diffX * (ptr[n001] - ptr[n000]));

	729 } else if (diffY < diffX) {

	730 dst[i] = (ptr[n000] + diffZ * (ptr[n100] - ptr[n000]) +

	731 diffY * (ptr[n111] - ptr[n101]) +

	732 diffX * (ptr[n101] - ptr[n100]));

	733 } else {

	734 dst[i] = (ptr[n000] + diffZ * (ptr[n100] - ptr[n000]) +

	735 diffY * (ptr[n110] - ptr[n100]) +

	736 diffX * (ptr[n111] - ptr[n110]));

	737 }

	738 }

	739

	740 // Increment the table ptr in order to handle the next component.

	741 // Note that this is how the table is designed: all of nXXX

	742 // variables are multiples of 3 because there are 3 output

	743 // components.

	744 ptr++;

	745 }

	746 }

	747

	748 STAGE(clut, true) {
	mtklein_C 2016/11/09 11:04:36 how about color_lookup_table? how about color_lookup_table? raftias 2016/11/10 21:36:06 Done. Show quoted text On 2016/11/09 11:04:36, mtklein_C wrote: > how about color_lookup_table? Done.
	749 const SkColorLookUpTable* colorLUT = (const SkColorLookUpTable*)ctx;
	mtklein_C 2016/11/09 11:04:36 Side note: it's going to drive me nuts that we cap Side note: it's going to drive me nuts that we capitalized SkColorLookUpTable this way. It's a Lookup Table, not a Look Up Table... raftias 2016/11/10 21:36:06 I didn't name it, but my guess is that it's becaus Show quoted text On 2016/11/09 11:04:36, mtklein_C wrote: > Side note: it's going to drive me nuts that we capitalized SkColorLookUpTable > this way. It's a Lookup Table, not a Look Up Table... I didn't name it, but my guess is that it's because everywhere in the ICC it's referred to shorthand as a CLUT. msarett1 2016/11/11 14:36:51 I don't feel strongly about the name. Feel free t Show quoted text On 2016/11/10 21:36:06, raftias wrote: > On 2016/11/09 11:04:36, mtklein_C wrote: > > Side note: it's going to drive me nuts that we capitalized SkColorLookUpTable > > this way. It's a Lookup Table, not a Look Up Table... > > I didn't name it, but my guess is that it's because everywhere in the ICC it's > referred to shorthand as a CLUT. I don't feel strongly about the name. Feel free to rename it (or not) in this CL.
	750 float rgb[3];

	751 alignas(alignof(SkNf)) float result[3][N];
	mtklein_C 2016/11/09 11:04:36 Let's drop the alignment business. SkNf::Load() should always work. raftias 2016/11/10 21:36:06 On 2016/11/09 11:04:36, mtklein_C wrote: > Let's drop the alignment business. SkNf::Load() should always work. Done.
	752 for (int i = 0; i < N; ++i) {

	753 rgb[0] = r[i];

	754 rgb[1] = g[i];

	755 rgb[2] = b[i];

	756 interp_3d_clut(rgb, rgb, colorLUT);

	757 result[0][i] = rgb[0];

	758 result[1][i] = rgb[1];

	759 result[2][i] = rgb[2];

	760 }

	761 r = SkNf::Load(result[0]);

	762 g = SkNf::Load(result[1]);

	763 b = SkNf::Load(result[2]);

	764 }

	765

	766 STAGE(labtoxyz, true) {
	raftias 2016/11/08 21:19:58 I think this can be expressed as a matrix_4x4 followed by a param_gamma, however this would be slower (powf(x, 3), other stuff) and possibly less clear, but would reduce the total number of raster pipeline stock stages. What would you recommend? mtklein_C 2016/11/09 11:04:36 On 2016/11/08 at 21:19:58, raftias wrote: > I think this can be expressed as a matrix_4x4 followed by a param_gamma, however this would be slower (powf(x, 3), other stuff) and possibly less clear, but would reduce the total number of raster pipeline stock stages. What would you recommend? I think this is clearer as its own stage. It's probably best to put some underscores in the name there so people don't have to guess where the word divisions are. lab_to_xyz? raftias 2016/11/10 21:36:06 On 2016/11/09 11:04:36, mtklein_C wrote: > On 2016/11/08 at 21:19:58, raftias wrote: > > I think this can be expressed as a matrix_4x4 followed by a param_gamma, > however this would be slower (powf(x, 3), other stuff) and possibly less clear, > but would reduce the total number of raster pipeline stock stages. What would > you recommend? > > I think this is clearer as its own stage. > > It's probably best to put some underscores in the name there so people don't > have to guess where the word divisions are. lab_to_xyz? Done.
	767 const auto lab_l = r * 100.f;

	768 const auto lab_a = g * 255.f - 128.f;

	769 const auto lab_b = b * 255.f - 128.f;

	770 auto Y = (lab_l + 16.f) * (1.f/116.f);

	771 auto X = lab_a * (1.f/500.f) + Y;
	mtklein_C 2016/11/09 11:04:36 One .f is plenty to get these solidly as float constants. I like it to be the denominator (1/500.0f) so that the fraction is readable. I don't have much of an opinion on whether or not to write .0f or .f, but I would like this file to be consistent. Please either change all this new code to use .0f, or change all the existing float constants to .f. raftias 2016/11/10 21:36:06 On 2016/11/09 11:04:36, mtklein_C wrote: > One .f is plenty to get these solidly as float constants. I like it to be the > denominator (1/500.0f) so that the fraction is readable. > > I don't have much of an opinion on whether or not to write .0f or .f, but I > would like this file to be consistent. Please either change all this new code > to use .0f, or change all the existing float constants to .f. Acknowledged.
	772 auto Z = Y - (lab_b * (1.f/200.f));

	773

	774 auto cubed = XXX;

	775 X = (cubed > 0.008856f).thenElse(cubed, (X - (16.f/116.f)) * (1.f/7.787f));

	776 cubed = YYY;
	mtklein_C 2016/11/09 11:04:36 At a glance it looks like cubed must be accumulating some value that affects X, then Y, then Z in turn, where really they're all independent. This sort of code is usually clearer if you don't change values. Let's write auto X3 = XXX, Y3 = YYY, Z3 = ZZZ; X = (X3 > ...) ...; Y = (Y3 > ...) ...; Z = (Z3 > ...) ...; This way all the parallel operations are visibly parallel. If you write code in this style, you will rarely find yourself marking things as const. Everything's const and it doesn't need to be mentioned. raftias 2016/11/10 21:36:06 On 2016/11/09 11:04:36, mtklein_C wrote: > At a glance it looks like cubed must be accumulating some value that affects X, > then Y, then Z in turn, where really they're all independent. > > This sort of code is usually clearer if you don't change values. Let's write > > auto X3 = XXX, > Y3 = YYY, > Z3 = ZZZ; > X = (X3 > ...) ...; > Y = (Y3 > ...) ...; > Z = (Z3 > ...) ...; > > This way all the parallel operations are visibly parallel. > > If you write code in this style, you will rarely find yourself marking things as > const. Everything's const and it doesn't need to be mentioned. Done.
	777 Y = (cubed > 0.008856f).thenElse(cubed, (Y - (16.f/116.f)) * (1.f/7.787f));

	778 cubed = ZZZ;

	779 Z = (cubed > 0.008856f).thenElse(cubed, (Z - (16.f/116.f)) * (1.f/7.787f));

	780

	781 // adjust to D50 illuminant

	782 X *= 0.96422f;

	783 Y *= 1.00000f;

	784 Z *= 0.82521f;

	785

	786 r = X;

	787 g = Y;

	788 b = Z;

	789 }

	790

507 template <typename Fn>	791 template <typename Fn>

508 SI Fn enum_to_Fn(SkRasterPipeline::StockStage st) {	792 SI Fn enum_to_Fn(SkRasterPipeline::StockStage st) {

509 switch (st) {	793 switch (st) {

510 #define M(stage) case SkRasterPipeline::stage: return stage;	794 #define M(stage) case SkRasterPipeline::stage: return stage;

511 SK_RASTER_PIPELINE_STAGES(M)	795 SK_RASTER_PIPELINE_STAGES(M)

512 #undef M	796 #undef M

513 }	797 }

514 SkASSERT(false);	798 SkASSERT(false);

515 return just_return;	799 return just_return;

516 }	800 }

(...skipping 80 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
597 }	881 }

598	882

599 } // namespace SK_OPTS_NS	883 } // namespace SK_OPTS_NS

600	884

601 #undef SI	885 #undef SI

602 #undef STAGE	886 #undef STAGE

603 #undef RGBA_XFERMODE	887 #undef RGBA_XFERMODE

604 #undef RGB_XFERMODE	888 #undef RGB_XFERMODE

605	889

606 #endif//SkRasterPipeline_opts_DEFINED	890 #endif//SkRasterPipeline_opts_DEFINED

OLD	NEW

« src/core/SkColorSpace_A2B.h ('K') | « src/core/SkRasterPipeline.h ('k') | tests/ColorSpaceXformTest.cpp » ('j') | no next file with comments »