src/opts/SkXfermode_opts.h - Issue 1432903002: float xfermodes (burn, dodge, softlight) in Sk8f, possibly using AVX.

Side by Side Diff: src/opts/SkXfermode_opts.h

Issue 1432903002: float xfermodes (burn, dodge, softlight) in Sk8f, possibly using AVX. (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: 1.0f/255 Created 5 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright 2015 Google Inc.	2 * Copyright 2015 Google Inc.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #ifndef Sk4pxXfermode_DEFINED	8 #ifndef Sk4pxXfermode_DEFINED

9 #define Sk4pxXfermode_DEFINED	9 #define Sk4pxXfermode_DEFINED

10	10

(...skipping 91 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
102 dsa = (d*sa).div255();	102 dsa = (d*sa).div255();

103	103

104 auto srcover = s + (d * sa.inv()).div255(),	104 auto srcover = s + (d * sa.inv()).div255(),

105 dstover = d + (s * da.inv()).div255();	105 dstover = d + (s * da.inv()).div255();

106 auto alphas = srcover,	106 auto alphas = srcover,

107 colors = (dsa < sda).thenElse(srcover, dstover);	107 colors = (dsa < sda).thenElse(srcover, dstover);

108 return alphas.zeroColors() + colors.zeroAlphas();	108 return alphas.zeroColors() + colors.zeroAlphas();

109 }	109 }

110 #undef XFERMODE	110 #undef XFERMODE

111	111

112 // Some xfermodes use math like divide or sqrt that's best done in floats 1 pixel at a time.	112 // Some xfermodes use math like divide or sqrt that's best done in floats.

113 #define XFERMODE(Name) static Sk4f SK_VECTORCALL Name(Sk4f d, Sk4f s)	113 // We write it generically, then call it 1 or 2 pixels at a time (T == Sk4f or Sk8f).

	114 #define XFERMODE(Name) struct Name { template <typename T> T operator()(const T&, const T&); }; \

	115 template <typename T> T Name::operator()(const T& d, const T& s)

114	116

	117 static_assert(SK_A32_SHIFT == 24, "");

115 static inline Sk4f a_rgb(const Sk4f& a, const Sk4f& rgb) {	118 static inline Sk4f a_rgb(const Sk4f& a, const Sk4f& rgb) {

116 static_assert(SK_A32_SHIFT == 24, "");

117 return a * Sk4f(0,0,0,1) + rgb * Sk4f(1,1,1,0);	119 return a * Sk4f(0,0,0,1) + rgb * Sk4f(1,1,1,0);

118 }	120 }

119 static inline Sk4f alphas(const Sk4f& f) {	121 static inline Sk8f a_rgb(const Sk8f& a, const Sk8f& rgb) {

120 return SkNx_dup<SK_A32_SHIFT/8>(f);	122 // TODO: SkNx_blend<0,0,0,1,0,0,0,1>(a, rgb) to let us use _mm256_blend_ps?

	123 return a * Sk8f(0,0,0,1,0,0,0,1) + rgb * Sk8f(1,1,1,0,1,1,1,0);

121 }	124 }

	125 static inline Sk4f alphas(const Sk4f& f) { return SkNx_shuffle<3,3,3,3> (f); }

	126 static inline Sk8f alphas(const Sk8f& f) { return SkNx_shuffle<3,3,3,3,7,7,7,7>(f); }
	msarett 2015/11/09 23:25:06 Where is shuffle defined for AVX? Oh, it looks li Where is shuffle defined for AVX? Oh, it looks like it uses SkNx's default implementation with kth(). Is a platform specific implementation a TODO? Or can we not do better than the default? mtklein 2015/11/10 00:22:05 Right, default implementation. This compiles into Show quoted text On 2015/11/09 at 23:25:06, msarett wrote: > Where is shuffle defined for AVX? > > Oh, it looks like it uses SkNx's default implementation with kth(). Is a platform specific implementation a TODO? Or can we not do better than the default? Right, default implementation. This compiles into one instruction, vpermilps $255, %ymmN, %ymmM. (Shuffle element 3 into each of lanes 0,1,2,3 and element 3+4=7 into 4,5,6,7). Pretty hard to beat. msarett 2015/11/10 14:54:16 Great, cool instruction! I'm impressed by the com Show quoted text On 2015/11/10 00:22:05, mtklein wrote: > On 2015/11/09 at 23:25:06, msarett wrote: > > Where is shuffle defined for AVX? > > > > Oh, it looks like it uses SkNx's default implementation with kth(). Is a > platform specific implementation a TODO? Or can we not do better than the > default? > > Right, default implementation. This compiles into one instruction, vpermilps > $255, %ymmN, %ymmM. (Shuffle element 3 into each of lanes 0,1,2,3 and element > 3+4=7 into 4,5,6,7). Pretty hard to beat. Great, cool instruction! I'm impressed by the compiler. Is having the shared code the benefit of obscuring this? mtklein 2015/11/11 19:27:13 Yeah, Clang's pretty impressive. Mostly SkNx_shuf Show quoted text On 2015/11/10 at 14:54:16, msarett wrote: > On 2015/11/10 00:22:05, mtklein wrote: > > On 2015/11/09 at 23:25:06, msarett wrote: > > > Where is shuffle defined for AVX? > > > > > > Oh, it looks like it uses SkNx's default implementation with kth(). Is a > > platform specific implementation a TODO? Or can we not do better than the > > default? > > > > Right, default implementation. 
This compiles into one instruction, vpermilps > > $255, %ymmN, %ymmM. (Shuffle element 3 into each of lanes 0,1,2,3 and element > > 3+4=7 into 4,5,6,7). Pretty hard to beat. > > Great, cool instruction! > > I'm impressed by the compiler. Is having the shared code the benefit of obscuring this? Yeah, Clang's pretty impressive. Mostly SkNx_shuffle works the way it does in case we find things the compiler can't quite do best. There have been a couple shuffles we think we can do better on NEON, for instance, though none are currently landed. SkNx_shuffle also gives us a point where we can hook in and help out less smart compilers, not to name names.
122	127

123 XFERMODE(ColorDodge) {	128 XFERMODE(ColorDodge) {

124 auto sa = alphas(s),	129 auto sa = alphas(s),

125 da = alphas(d),	130 da = alphas(d),

126 isa = Sk4f(1)-sa,	131 isa = T(1)-sa,

127 ida = Sk4f(1)-da;	132 ida = T(1)-da;

128	133

129 auto srcover = s + d*isa,	134 auto srcover = s + d*isa,

130 dstover = d + s*ida,	135 dstover = d + s*ida,

131 otherwise = sa * Sk4f::Min(da, (d*sa)*(sa-s).approxInvert()) + s*ida + d*isa;	136 otherwise = sa * T::Min(da, (d*sa)*(sa-s).approxInvert()) + s*ida + d*isa;

132	137

133 // Order matters here, preferring d==0 over s==sa.	138 // Order matters here, preferring d==0 over s==sa.

134 auto colors = (d == Sk4f(0)).thenElse(dstover,	139 auto colors = (d == 0).thenElse(dstover,

135 (s == sa).thenElse(srcover,	140 (s == sa).thenElse(srcover,

136 otherwise));	141 otherwise));

137 return a_rgb(srcover, colors);	142 return a_rgb(srcover, colors);

138 }	143 }

139 XFERMODE(ColorBurn) {	144 XFERMODE(ColorBurn) {

140 auto sa = alphas(s),	145 auto sa = alphas(s),

141 da = alphas(d),	146 da = alphas(d),

142 isa = Sk4f(1)-sa,	147 isa = T(1)-sa,

143 ida = Sk4f(1)-da;	148 ida = T(1)-da;

144	149

145 auto srcover = s + d*isa,	150 auto srcover = s + d*isa,

146 dstover = d + s*ida,	151 dstover = d + s*ida,

147 otherwise = sa*(da-Sk4f::Min(da, (da-d)*sa*s.approxInvert())) + s*ida + d*isa;	152 otherwise = sa*(da-T::Min(da, (da-d)*sa*s.approxInvert())) + s*ida + d*isa;

148	153

149 // Order matters here, preferring d==da over s==0.	154 // Order matters here, preferring d==da over s==0.

150 auto colors = (d == da).thenElse(dstover,	155 auto colors = (d == da).thenElse(dstover,

151 (s == Sk4f(0)).thenElse(srcover,	156 (s == 0).thenElse(srcover,

152 otherwise));	157 otherwise));

153 return a_rgb(srcover, colors);	158 return a_rgb(srcover, colors);

154 }	159 }

155 XFERMODE(SoftLight) {	160 XFERMODE(SoftLight) {

156 auto sa = alphas(s),	161 auto sa = alphas(s),

157 da = alphas(d),	162 da = alphas(d),

158 isa = Sk4f(1)-sa,	163 isa = T(1)-sa,

159 ida = Sk4f(1)-da;	164 ida = T(1)-da;

160	165

161 // Some common terms.	166 // Some common terms.

162 auto m = (da > Sk4f(0)).thenElse(d / da, Sk4f(0)),	167 auto m = (da > 0).thenElse(d / da, 0),

163 s2 = Sk4f(2)*s,	168 s2 = s*2,

164 m4 = Sk4f(4)*m;	169 m4 = m*4;

165	170

166 // The logic forks three ways:	171 // The logic forks three ways:

167 // 1. dark src?	172 // 1. dark src?

168 // 2. light src, dark dst?	173 // 2. light src, dark dst?

169 // 3. light src, light dst?	174 // 3. light src, light dst?

170 auto darkSrc = d*(sa + (s2 - sa)*(Sk4f(1) - m)), // Used in case 1.	175 auto darkSrc = d*(sa + (s2 - sa)*(T(1) - m)), // Used in case 1.

171 darkDst = (m4*m4 + m4)*(m - Sk4f(1)) + Sk4f(7)*m, // Used in case 2.	176 darkDst = (m4*m4 + m4)*(m - 1) + m*7, // Used in case 2.

172 liteDst = m.sqrt() - m, // Used in case 3.	177 liteDst = m.sqrt() - m, // Used in case 3.

173 liteSrc = d*sa + da*(s2-sa)*(Sk4f(4)*d <= da).thenElse(darkDst, liteDst); // Case 2 or 3?	178 liteSrc = d*sa + da*(s2-sa)*(d*4 <= da).thenElse(darkDst, liteDst); // Case 2 or 3?

174	179

175 auto alpha = s + d*isa;	180 auto alpha = s + d*isa;

176 auto colors = s*ida + d*isa + (s2 <= sa).thenElse(darkSrc, liteSrc); // Case 1 or 2/3?	181 auto colors = s*ida + d*isa + (s2 <= sa).thenElse(darkSrc, liteSrc); // Case 1 or 2/3?

177	182

178 return a_rgb(alpha, colors);	183 return a_rgb(alpha, colors);

179 }	184 }

180 #undef XFERMODE	185 #undef XFERMODE

181	186

182 // A reasonable fallback mode for doing AA is to simply apply the transfermode first,	187 // A reasonable fallback mode for doing AA is to simply apply the transfermode first,

183 // then linearly interpolate the AA.	188 // then linearly interpolate the AA.

184 template <Sk4px (SK_VECTORCALL *Mode)(Sk4px, Sk4px)>	189 template <Sk4px (SK_VECTORCALL *Mode)(Sk4px, Sk4px)>

185 static Sk4px SK_VECTORCALL xfer_aa(Sk4px s, Sk4px d, Sk4px aa) {	190 static Sk4px SK_VECTORCALL xfer_aa(Sk4px s, Sk4px d, Sk4px aa) {

186 Sk4px bw = Mode(s, d);	191 Sk4px bw = Mode(s, d);

(...skipping 46 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
233 });	238 });

234 }	239 }

235 }	240 }

236	241

237 private:	242 private:

238 Proc4 fProc4;	243 Proc4 fProc4;

239 AAProc4 fAAProc4;	244 AAProc4 fAAProc4;

240 typedef SkProcCoeffXfermode INHERITED;	245 typedef SkProcCoeffXfermode INHERITED;

241 };	246 };

242	247

243 class Sk4fXfermode : public SkProcCoeffXfermode {	248 template <typename BlendFn>

	249 class FloatXfermode : public SkProcCoeffXfermode {

244 public:	250 public:

245 typedef Sk4f (SK_VECTORCALL *ProcF)(Sk4f, Sk4f);	251 FloatXfermode(const ProcCoeff& rec, SkXfermode::Mode mode)

246 Sk4fXfermode(const ProcCoeff& rec, SkXfermode::Mode mode, ProcF procf)	252 : INHERITED(rec, mode) {}

247 : INHERITED(rec, mode)

248 , fProcF(procf) {}

249	253

250 void xfer32(SkPMColor dst[], const SkPMColor src[], int n, const SkAlpha aa[]) const override {	254 void xfer32(SkPMColor dst[], const SkPMColor src[], int n, const SkAlpha aa[]) const override {
	msarett 2015/11/09 23:25:06 It looks like you've deleted some private helper f It looks like you've deleted some private helper functions and pulled them into the implementation of this function? Which allows you to efficiently combine Sk8f and Sk4f? mtklein 2015/11/10 00:22:05 Yep, just moving things around. Show quoted text On 2015/11/09 at 23:25:06, msarett wrote: > It looks like you've deleted some private helper functions and pulled them into the implementation of this function? Which allows you to efficiently combine Sk8f and Sk4f? Yep, just moving things around. msarett 2015/11/10 14:54:16 Acknowledged. Show quoted text On 2015/11/10 00:22:05, mtklein wrote: > On 2015/11/09 at 23:25:06, msarett wrote: > > It looks like you've deleted some private helper functions and pulled them > into the implementation of this function? Which allows you to efficiently > combine Sk8f and Sk4f? > > Yep, just moving things around. Acknowledged.
251 for (int i = 0; i < n; i++) {	255 BlendFn blend;

252 dst[i] = aa ? this->xfer32(dst[i], src[i], aa[i])	256 while (n >= 2) {

253 : this->xfer32(dst[i], src[i]);	257 auto d = Sk8f::FromBytes((const uint8_t*)dst) * (1.0f/255),
	msarett 2015/11/09 23:25:06 nit: Any reason not to write 1.0f / 255.0f? nit: Any reason not to write 1.0f / 255.0f? mtklein 2015/11/10 00:22:05 Nope, either way works the same and gets to the sa Show quoted text On 2015/11/09 at 23:25:06, msarett wrote: > nit: Any reason not to write 1.0f / 255.0f? Nope, either way works the same and gets to the same compile time constant. I'd write 1.0/255, but MSVC complains that 1.0/255 != 1.0f/255. Gotta get an 'f' in there somewhere.
	258 s = Sk8f::FromBytes((const uint8_t*)src) * (1.0f/255),

	259 b = blend(d, s);

	260 if (aa) {

	261 auto a255 = Sk8f(aa[0],aa[0],aa[0],aa[0], aa[1],aa[1],aa[1],aa[1]);

	262 (b*a255 + d*(Sk8f(255)-a255) + 0.5).toBytes((uint8_t*)dst);

	263 aa += 2;

	264 } else {

	265 (b * 255 + 0.5).toBytes((uint8_t*)dst);

	266 }

	267 dst += 2;

	268 src += 2;

	269 n -= 2;

	270 }

	271 if (n) {

	272 auto d = Sk4f::FromBytes((const uint8_t*)dst) * (1.0f/255),

	273 s = Sk4f::FromBytes((const uint8_t*)src) * (1.0f/255),

	274 b = blend(d, s);

	275 if (aa) {

	276 auto a255 = Sk4f(aa[0],aa[0],aa[0],aa[0]);

	277 (b*a255 + d*(Sk4f(255)-a255) + 0.5).toBytes((uint8_t*)dst);

	278 aa++;

	279 } else {

	280 (b * 255 + 0.5).toBytes((uint8_t*)dst);

	281 }

254 }	282 }

255 }	283 }

256	284

257 void xfer16(uint16_t dst[], const SkPMColor src[], int n, const SkAlpha aa[]) const override {	285 void xfer16(uint16_t dst[], const SkPMColor src[], int n, const SkAlpha aa[]) const override {

258 for (int i = 0; i < n; i++) {	286 for (int i = 0; i < n; i++) {

259 SkPMColor dst32 = SkPixel16ToPixel32(dst[i]);	287 SkPMColor dst32 = SkPixel16ToPixel32(dst[i]); // Convert dst up to 8888.

260 dst32 = aa ? this->xfer32(dst32, src[i], aa[i])	288 this->xfer32(&dst32, src+i, 1, aa ? aa+i : nullptr); // Blend 1 pixel.

261 : this->xfer32(dst32, src[i]);	289 dst[i] = SkPixel32ToPixel16(dst32); // Repack dst to 565 and store.
	msarett 2015/11/09 23:25:06 This seems slow? Although not different from befo This seems slow? Although not different from before. mtklein 2015/11/10 00:22:05 Right. No different from before. Just always 1 p Show quoted text On 2015/11/09 at 23:25:06, msarett wrote: > This seems slow? Although not different from before. Right. No different from before. Just always 1 pixel at a time. Seems like a fine compromise for 565.
262 dst[i] = SkPixel32ToPixel16(dst32);

263 }	290 }

264 }	291 }

265	292

266 private:	293 private:

267 static Sk4f Load(SkPMColor c) {

268 return Sk4f::FromBytes((uint8_t*)&c) * Sk4f(1.0f/255);

269 }

270 static SkPMColor Round(const Sk4f& f) {

271 SkPMColor c;

272 (f * Sk4f(255) + Sk4f(0.5f)).toBytes((uint8_t*)&c);

273 return c;

274 }

275 inline SkPMColor xfer32(SkPMColor dst, SkPMColor src) const {

276 return Round(fProcF(Load(dst), Load(src)));

277 }

278

279 inline SkPMColor xfer32(SkPMColor dst, SkPMColor src, SkAlpha aa) const {

280 Sk4f s(Load(src)),

281 d(Load(dst)),

282 b(fProcF(d,s));

283 // We do aa in full float precision before going back down to bytes, because we can!

284 Sk4f a = Sk4f(aa) * Sk4f(1.0f/255);

285 b = b*a + d*(Sk4f(1)-a);

286 return Round(b);

287 }

288

289 ProcF fProcF;

290 typedef SkProcCoeffXfermode INHERITED;	294 typedef SkProcCoeffXfermode INHERITED;

291 };	295 };

292	296

293 } // namespace	297 } // namespace

294	298

295 namespace SK_OPTS_NS {	299 namespace SK_OPTS_NS {

296	300

297 static SkXfermode* create_xfermode(const ProcCoeff& rec, SkXfermode::Mode mode) {	301 static SkXfermode* create_xfermode(const ProcCoeff& rec, SkXfermode::Mode mode) {

298 switch (mode) {	302 switch (mode) {

299 #define CASE(Mode) \	303 #define CASE(Mode) \

(...skipping 16 matching lines...) Expand all Loading...
316 CASE(Multiply);	320 CASE(Multiply);

317 CASE(Difference);	321 CASE(Difference);

318 CASE(Exclusion);	322 CASE(Exclusion);

319 CASE(HardLight);	323 CASE(HardLight);

320 CASE(Overlay);	324 CASE(Overlay);

321 CASE(Darken);	325 CASE(Darken);

322 CASE(Lighten);	326 CASE(Lighten);

323 #undef CASE	327 #undef CASE

324	328

325 #define CASE(Mode) \	329 #define CASE(Mode) \

326 case SkXfermode::k##Mode##_Mode: return new Sk4fXfermode(rec, mode, &Mode)	330 case SkXfermode::k##Mode##_Mode: return new FloatXfermode<Mode>(rec, mode)

327 CASE(ColorDodge);	331 CASE(ColorDodge);

328 CASE(ColorBurn);	332 CASE(ColorBurn);

329 CASE(SoftLight);	333 CASE(SoftLight);

330 #undef CASE	334 #undef CASE

331	335

332 default: break;	336 default: break;

333 }	337 }

334 return nullptr;	338 return nullptr;

335 }	339 }

336	340

337 } // namespace SK_OPTS_NS	341 } // namespace SK_OPTS_NS

338	342

339 #endif//Sk4pxXfermode_DEFINED	343 #endif//Sk4pxXfermode_DEFINED

OLD	NEW

« src/opts/SkNx_avx.h ('K') | « src/opts/SkOpts_avx.cpp ('k') | no next file » | no next file with comments »