OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2016 Google Inc. | 2 * Copyright 2016 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #include "SkPM4fPriv.h" | 8 #include "SkPM4fPriv.h" |
9 #include "SkUtils.h" | 9 #include "SkUtils.h" |
10 #include "SkXfermode.h" | 10 #include "SkXfermode.h" |
(...skipping 160 matching lines...)
171 if (D == kLinear_Dst) { | 171 if (D == kLinear_Dst) { |
172 // operate in bias-255 space for src and dst | 172 // operate in bias-255 space for src and dst |
173 const Sk4f& s4_255 = s4 * Sk4f(255); | 173 const Sk4f& s4_255 = s4 * Sk4f(255); |
174 while (count >= 4) { | 174 while (count >= 4) { |
175 Sk4f aa4 = SkNx_cast<float>(Sk4b::Load(aa)) * Sk4f(1/255.f); | 175 Sk4f aa4 = SkNx_cast<float>(Sk4b::Load(aa)) * Sk4f(1/255.f); |
176 Sk4f r0 = lerp(s4_255, to_4f(dst[0]), Sk4f(aa4[0])) + Sk4f(0.5f); | 176 Sk4f r0 = lerp(s4_255, to_4f(dst[0]), Sk4f(aa4[0])) + Sk4f(0.5f); |
177 Sk4f r1 = lerp(s4_255, to_4f(dst[1]), Sk4f(aa4[1])) + Sk4f(0.5f); | 177 Sk4f r1 = lerp(s4_255, to_4f(dst[1]), Sk4f(aa4[1])) + Sk4f(0.5f); |
178 Sk4f r2 = lerp(s4_255, to_4f(dst[2]), Sk4f(aa4[2])) + Sk4f(0.5f); | 178 Sk4f r2 = lerp(s4_255, to_4f(dst[2]), Sk4f(aa4[2])) + Sk4f(0.5f); |
179 Sk4f r3 = lerp(s4_255, to_4f(dst[3]), Sk4f(aa4[3])) + Sk4f(0.5f); | 179 Sk4f r3 = lerp(s4_255, to_4f(dst[3]), Sk4f(aa4[3])) + Sk4f(0.5f); |
180 Sk4f_ToBytes((uint8_t*)dst, r0, r1, r2, r3); | 180 Sk4f_ToBytes((uint8_t*)dst, r0, r1, r2, r3); |
181 | 181 |
182 dst += 4; | 182 dst += 4; |
183 aa += 4; | 183 aa += 4; |
184 count -= 4; | 184 count -= 4; |
185 } | 185 } |
186 } else { // kSRGB | 186 } else { // kSRGB |
187 while (count >= 4) { | 187 while (count >= 4) { |
188 Sk4f aa4 = SkNx_cast<float>(Sk4b::Load(aa)) * Sk4f(1/255.0f); | 188 Sk4f aa4 = SkNx_cast<float>(Sk4b::Load(aa)) * Sk4f(1/255.0f); |
189 | 189 |
190 /* If we ever natively support convert 255_linear -> 255_srgb, then perhaps | 190 /* If we ever natively support convert 255_linear -> 255_srgb, then perhaps |
191 * it would be faster (and possibly allow more code sharing with kLinear) to | 191 * it would be faster (and possibly allow more code sharing with kLinear) to |
192 * stay in that space. | 192 * stay in that space. |
193 */ | 193 */ |
194 Sk4f r0 = lerp(s4, load_dst<D>(dst[0]), Sk4f(aa4[0])); | 194 Sk4f r0 = lerp(s4, load_dst<D>(dst[0]), Sk4f(aa4[0])); |
195 Sk4f r1 = lerp(s4, load_dst<D>(dst[1]), Sk4f(aa4[1])); | 195 Sk4f r1 = lerp(s4, load_dst<D>(dst[1]), Sk4f(aa4[1])); |
196 Sk4f r2 = lerp(s4, load_dst<D>(dst[2]), Sk4f(aa4[2])); | 196 Sk4f r2 = lerp(s4, load_dst<D>(dst[2]), Sk4f(aa4[2])); |
197 Sk4f r3 = lerp(s4, load_dst<D>(dst[3]), Sk4f(aa4[3])); | 197 Sk4f r3 = lerp(s4, load_dst<D>(dst[3]), Sk4f(aa4[3])); |
198 Sk4f_ToBytes((uint8_t*)dst, | 198 Sk4f_ToBytes((uint8_t*)dst, |
199 linear_unit_to_srgb_255f(r0), | 199 linear_unit_to_srgb_255f(r0), |
200 linear_unit_to_srgb_255f(r1), | 200 linear_unit_to_srgb_255f(r1), |
201 linear_unit_to_srgb_255f(r2), | 201 linear_unit_to_srgb_255f(r2), |
202 linear_unit_to_srgb_255f(r3)); | 202 linear_unit_to_srgb_255f(r3)); |
203 | 203 |
204 dst += 4; | 204 dst += 4; |
205 aa += 4; | 205 aa += 4; |
206 count -= 4; | 206 count -= 4; |
207 } | 207 } |
208 } | 208 } |
209 for (int i = 0; i < count; ++i) { | 209 for (int i = 0; i < count; ++i) { |
210 unsigned a = aa[i]; | 210 unsigned a = aa[i]; |
211 Sk4f d4 = load_dst<D>(dst[i]); | 211 Sk4f d4 = load_dst<D>(dst[i]); |
212 dst[i] = store_dst<D>(lerp(s4, d4, a)); | 212 dst[i] = store_dst<D>(lerp(s4, d4, a)); |
213 } | 213 } |
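
Both unrolled loops above and the scalar tail compute the same per-pixel coverage lerp; only the working space differs (bias-255 linear vs. unit-float sRGB, where the + Sk4f(0.5f) in the linear path rounds before the float-to-byte truncation in Sk4f_ToBytes). A minimal scalar sketch of that blend, assuming unit-range float pixels; lerp_pixel is a hypothetical helper, not part of this CL:

    // r = s*aa + d*(1 - aa) per channel, with 8-bit coverage mapped to [0,1].
    // Sketch only; the loops above do this four pixels at a time with Sk4f.
    static void lerp_pixel(float r[4], const float s[4], const float d[4], uint8_t aa8) {
        const float aa = aa8 * (1 / 255.0f);
        for (int c = 0; c < 4; ++c) {
            r[c] = s[c] * aa + d[c] * (1 - aa);
        }
    }
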
(...skipping 12 matching lines...)
226 /////////////////////////////////////////////////////////////////////////////////////////////////// | 226 /////////////////////////////////////////////////////////////////////////////////////////////////// |
227 | 227 |
228 static void dst(const SkXfermode*, uint32_t dst[], const SkPM4f[], int count, const SkAlpha aa[]) {} | 228 static void dst(const SkXfermode*, uint32_t dst[], const SkPM4f[], int count, const SkAlpha aa[]) {} |
229 | 229 |
230 const SkXfermode::D32Proc gProcs_Dst[] = { | 230 const SkXfermode::D32Proc gProcs_Dst[] = { |
231 dst, dst, dst, dst, dst, dst, dst, dst, | 231 dst, dst, dst, dst, dst, dst, dst, dst, |
232 }; | 232 }; |
233 | 233 |
234 /////////////////////////////////////////////////////////////////////////////////////////////////// | 234 /////////////////////////////////////////////////////////////////////////////////////////////////// |
235 | 235 |
| 236 |
| 237 static void srcover_n_srgb_bw(uint32_t dst[], const SkPM4f src[], int count) { |
| 238 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 // For _mm_shuffle_epi8 |
| 239 while (count >= 4) { |
| 240 // Load 4 sRGB RGBA/BGRA 8888 dst pixels. |
| 241 // We'll write most of this as if they're RGBA, and just swizzle the src pixels to match. |
| 242 __m128i d4 = _mm_loadu_si128((const __m128i*)dst); |
| 243 |
| 244 // Transpose into planar and convert each plane to float. |
| 245 auto _ = ~0; // Shuffles in a zero byte. |
| 246 auto dr = _mm_cvtepi32_ps( |
| 247 _mm_shuffle_epi8(d4, _mm_setr_epi8(0,_,_,_, 4,_,_,_, 8,_,_,_,12,_,_,_))); |
| 248 auto dg = _mm_cvtepi32_ps( |
| 249 _mm_shuffle_epi8(d4, _mm_setr_epi8(1,_,_,_, 5,_,_,_, 9,_,_,_,13,_,_,_))); |
| 250 auto db = _mm_cvtepi32_ps( |
| 251 _mm_shuffle_epi8(d4, _mm_setr_epi8(2,_,_,_, 6,_,_,_,10,_,_,_,14,_,_,_))); |
| 252 auto da = _mm_cvtepi32_ps( |
| 253 _mm_shuffle_epi8(d4, _mm_setr_epi8(3,_,_,_, 7,_,_,_,11,_,_,_,15,_,_,_))); |
| 254 |
| 255 // Scale to [0,1]. |
| 256 dr = _mm_mul_ps(dr, _mm_set1_ps(1/255.0f)); |
| 257 dg = _mm_mul_ps(dg, _mm_set1_ps(1/255.0f)); |
| 258 db = _mm_mul_ps(db, _mm_set1_ps(1/255.0f)); |
| 259 da = _mm_mul_ps(da, _mm_set1_ps(1/255.0f)); |
| 260 |
| 261 // Apply approximate sRGB gamma correction to convert to linear (as if gamma were 2). |
| 262 dr = _mm_mul_ps(dr, dr); |
| 263 dg = _mm_mul_ps(dg, dg); |
| 264 db = _mm_mul_ps(db, db); |
| 265 |
| 266 // Load 4 linear float src pixels. |
| 267 auto s0 = _mm_loadu_ps(src[0].fVec), |
| 268 s1 = _mm_loadu_ps(src[1].fVec), |
| 269 s2 = _mm_loadu_ps(src[2].fVec), |
| 270 s3 = _mm_loadu_ps(src[3].fVec); |
| 271 |
| 272 // Transpose src pixels to planar too, and give the registers better names. |
| 273 _MM_TRANSPOSE4_PS(s0, s1, s2, s3); |
| 274 auto sr = s0, |
| 275 sg = s1, |
| 276 sb = s2, |
| 277 sa = s3; |
| 278 |
| 279 // Match color order with destination, if necessary. |
| 280 #if defined(SK_PMCOLOR_IS_BGRA) |
| 281 SkTSwap(sr, sb); |
| 282 #endif |
| 283 |
| 284 // Now, the meat of what we wanted to do... perform the srcover blend. |
| 285 auto invSA = _mm_sub_ps(_mm_set1_ps(1), sa); |
| 286 auto r = _mm_add_ps(sr, _mm_mul_ps(dr, invSA)), |
| 287 g = _mm_add_ps(sg, _mm_mul_ps(dg, invSA)), |
| 288 b = _mm_add_ps(sb, _mm_mul_ps(db, invSA)), |
| 289 a = _mm_add_ps(sa, _mm_mul_ps(da, invSA)); |
| 290 |
| 291 // Convert back to sRGB and [0,255], again approximating sRGB as gamma == 2. |
| 292 r = _mm_mul_ps(_mm_sqrt_ps(r), _mm_set1_ps(255)); |
| 293 g = _mm_mul_ps(_mm_sqrt_ps(g), _mm_set1_ps(255)); |
| 294 b = _mm_mul_ps(_mm_sqrt_ps(b), _mm_set1_ps(255)); |
| 295 a = _mm_mul_ps( (a), _mm_set1_ps(255)); |
| 296 |
| 297 // Convert to int (with rounding) and pack back down to planar 8-bit. |
| 298 __m128i x = _mm_packus_epi16(_mm_packus_epi16(_mm_cvtps_epi32(r), _mm_cvtps_epi32(g)), |
| 299 _mm_packus_epi16(_mm_cvtps_epi32(b), _mm_cvtps_epi32(a))); |
| 300 |
| 301 // Transpose back to interlaced RGBA and write back to dst. |
| 302 x = _mm_shuffle_epi8(x, _mm_setr_epi8(0, 4, 8, 12, |
| 303 1, 5, 9, 13, |
| 304 2, 6, 10, 14, |
| 305 3, 7, 11, 15)); |
| 306 _mm_storeu_si128((__m128i*)dst, x); |
| 307 |
| 308 count -= 4; |
| 309 dst += 4; |
| 310 src += 4; |
| 311 } |
| 312 #endif |
| 313 // This should look just like the non-specialized case in srcover_n. |
| 314 for (int i = 0; i < count; ++i) { |
| 315 Sk4f s4 = src[i].to4f_pmorder(); |
| 316 Sk4f d4 = load_dst<kSRGB_Dst>(dst[i]); |
| 317 Sk4f r4 = s4 + d4 * Sk4f(1 - get_alpha(s4)); |
| 318 dst[i] = store_dst<kSRGB_Dst>(r4); |
| 319 } |
| 320 } |
| 321 |
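
For review, this is the per-channel math the SSSE3 block above vectorizes, with the sRGB transfer curve approximated as gamma 2 (decode by squaring, re-encode by square root; alpha is blended without the curve, matching the lone non-sqrt line). A sketch under those assumptions; srcover_channel_g2 is a hypothetical helper, not part of this CL:

    #include <math.h>    // sqrtf
    #include <stdint.h>

    // d_linear = (d/255)^2             decode dst, gamma == 2
    // r        = s + d_linear*(1 - sa) srcover in linear space
    // byte     = round(sqrt(r) * 255)  re-encode
    static uint8_t srcover_channel_g2(float s, float sa, uint8_t d8) {
        float d = (d8 * (1 / 255.0f)) * (d8 * (1 / 255.0f));
        float r = s + d * (1 - sa);
        return (uint8_t)(sqrtf(r) * 255.0f + 0.5f);
    }
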
236 template <DstType D> void srcover_n(const SkXfermode*, uint32_t dst[], | 322 template <DstType D> void srcover_n(const SkXfermode*, uint32_t dst[], |
237 const SkPM4f src[], int count, const SkAlpha aa[]) { | 323 const SkPM4f src[], int count, const SkAlpha aa[]) { |
238 if (aa) { | 324 if (aa) { |
239 for (int i = 0; i < count; ++i) { | 325 for (int i = 0; i < count; ++i) { |
240 unsigned a = aa[i]; | 326 unsigned a = aa[i]; |
241 if (0 == a) { | 327 if (0 == a) { |
242 continue; | 328 continue; |
243 } | 329 } |
244 Sk4f s4 = src[i].to4f_pmorder(); | 330 Sk4f s4 = src[i].to4f_pmorder(); |
245 Sk4f d4 = load_dst<D>(dst[i]); | 331 Sk4f d4 = load_dst<D>(dst[i]); |
246 if (a != 0xFF) { | 332 if (a != 0xFF) { |
247 s4 = scale_by_coverage(s4, a); | 333 s4 = scale_by_coverage(s4, a); |
248 } | 334 } |
249 Sk4f r4 = s4 + d4 * Sk4f(1 - get_alpha(s4)); | 335 Sk4f r4 = s4 + d4 * Sk4f(1 - get_alpha(s4)); |
250 dst[i] = store_dst<D>(r4); | 336 dst[i] = store_dst<D>(r4); |
251 } | 337 } |
252 } else { | 338 } else { |
253 for (int i = 0; i < count; ++i) { | 339 if (D == kSRGB_Dst) { |
254 Sk4f s4 = src[i].to4f_pmorder(); | 340 srcover_n_srgb_bw(dst, src, count); |
255 Sk4f d4 = load_dst<D>(dst[i]); | 341 } else { |
256 Sk4f r4 = s4 + d4 * Sk4f(1 - get_alpha(s4)); | 342 for (int i = 0; i < count; ++i) { |
257 dst[i] = store_dst<D>(r4); | 343 Sk4f s4 = src[i].to4f_pmorder(); |
| 344 Sk4f d4 = load_dst<D>(dst[i]); |
| 345 Sk4f r4 = s4 + d4 * Sk4f(1 - get_alpha(s4)); |
| 346 dst[i] = store_dst<D>(r4); |
| 347 } |
258 } | 348 } |
259 } | 349 } |
260 } | 350 } |
261 | 351 |
262 static void srcover_linear_dst_1(const SkXfermode*, uint32_t dst[], | 352 static void srcover_linear_dst_1(const SkXfermode*, uint32_t dst[], |
263 const SkPM4f* src, int count, const SkAlpha aa[]) { | 353 const SkPM4f* src, int count, const SkAlpha aa[]) { |
264 const Sk4f s4 = src->to4f_pmorder(); | 354 const Sk4f s4 = src->to4f_pmorder(); |
265 const Sk4f dst_scale = Sk4f(1 - get_alpha(s4)); | 355 const Sk4f dst_scale = Sk4f(1 - get_alpha(s4)); |
266 | 356 |
267 if (aa) { | 357 if (aa) { |
268 for (int i = 0; i < count; ++i) { | 358 for (int i = 0; i < count; ++i) { |
269 unsigned a = aa[i]; | 359 unsigned a = aa[i]; |
270 if (0 == a) { | 360 if (0 == a) { |
271 continue; | 361 continue; |
272 } | 362 } |
273 Sk4f d4 = Sk4f_fromL32(dst[i]); | 363 Sk4f d4 = Sk4f_fromL32(dst[i]); |
274 Sk4f r4; | 364 Sk4f r4; |
275 if (a != 0xFF) { | 365 if (a != 0xFF) { |
276 Sk4f s4_aa = scale_by_coverage(s4, a); | 366 Sk4f s4_aa = scale_by_coverage(s4, a); |
(...skipping 112 matching lines...)
389 Sk4i rgbi = Sk4i(SkGetPackedR16(rgb), SkGetPackedG16(rgb), SkGetPackedB16(rgb), 0); | 479 Sk4i rgbi = Sk4i(SkGetPackedR16(rgb), SkGetPackedG16(rgb), SkGetPackedB16(rgb), 0); |
390 #else | 480 #else |
391 Sk4i rgbi = Sk4i(SkGetPackedB16(rgb), SkGetPackedG16(rgb), SkGetPackedR16(rgb), 0); | 481 Sk4i rgbi = Sk4i(SkGetPackedB16(rgb), SkGetPackedG16(rgb), SkGetPackedR16(rgb), 0); |
392 #endif | 482 #endif |
393 return SkNx_cast<float>(rgbi) * Sk4f(1.0f/31, 1.0f/63, 1.0f/31, 0); | 483 return SkNx_cast<float>(rgbi) * Sk4f(1.0f/31, 1.0f/63, 1.0f/31, 0); |
394 } | 484 } |
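
lcd16_to_unit_4f above expands one 565 LCD coverage value into per-subpixel coverage in [0,1], normalizing each field by its maximum (31, 63, 31); the #else branch only swaps the R and B lanes for BGRA. A scalar equivalent, assuming the standard 565 bit layout that SkGetPackedR16/G16/B16 decode; lcd16_to_unit is a hypothetical name:

    #include <stdint.h>

    // e.g. rgb == 0xFFFF -> (1, 1, 1, 0); rgb == 0x07E0 (green only) -> (0, 1, 0, 0).
    static void lcd16_to_unit(uint16_t rgb, float out[4]) {
        out[0] = ((rgb >> 11) & 31) * (1 / 31.0f);  // R: bits 11-15
        out[1] = ((rgb >>  5) & 63) * (1 / 63.0f);  // G: bits 5-10
        out[2] = ( rgb        & 31) * (1 / 31.0f);  // B: bits 0-4
        out[3] = 0;                                 // alpha lane unused here
    }
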
395 | 485 |
396 template <DstType D> | 486 template <DstType D> |
397 void src_1_lcd(uint32_t dst[], const SkPM4f* src, int count, const uint16_t lcd[]) { | 487 void src_1_lcd(uint32_t dst[], const SkPM4f* src, int count, const uint16_t lcd[]) { |
398 const Sk4f s4 = Sk4f::Load(src->fVec); | 488 const Sk4f s4 = Sk4f::Load(src->fVec); |
399 | 489 |
400 if (D == kLinear_Dst) { | 490 if (D == kLinear_Dst) { |
401 // operate in bias-255 space for src and dst | 491 // operate in bias-255 space for src and dst |
402 const Sk4f s4bias = s4 * Sk4f(255); | 492 const Sk4f s4bias = s4 * Sk4f(255); |
403 for (int i = 0; i < count; ++i) { | 493 for (int i = 0; i < count; ++i) { |
404 uint16_t rgb = lcd[i]; | 494 uint16_t rgb = lcd[i]; |
405 if (0 == rgb) { | 495 if (0 == rgb) { |
406 continue; | 496 continue; |
407 } | 497 } |
408 Sk4f d4bias = to_4f(dst[i]); | 498 Sk4f d4bias = to_4f(dst[i]); |
409 dst[i] = to_4b(lerp(s4bias, d4bias, lcd16_to_unit_4f(rgb))) | (SK_A32_MASK << SK_A32_SHIFT); | 499 dst[i] = to_4b(lerp(s4bias, d4bias, lcd16_to_unit_4f(rgb))) | (SK_A32_MASK << SK_A32_SHIFT); |
(...skipping 62 matching lines...)
472 | 562 |
473 const LCD32Proc procs[] = { | 563 const LCD32Proc procs[] = { |
474 srcover_n_lcd<kSRGB_Dst>, src_n_lcd<kSRGB_Dst>, | 564 srcover_n_lcd<kSRGB_Dst>, src_n_lcd<kSRGB_Dst>, |
475 srcover_1_lcd<kSRGB_Dst>, src_1_lcd<kSRGB_Dst>, | 565 srcover_1_lcd<kSRGB_Dst>, src_1_lcd<kSRGB_Dst>, |
476 | 566 |
477 srcover_n_lcd<kLinear_Dst>, src_n_lcd<kLinear_Dst>, | 567 srcover_n_lcd<kLinear_Dst>, src_n_lcd<kLinear_Dst>, |
478 srcover_1_lcd<kLinear_Dst>, src_1_lcd<kLinear_Dst>, | 568 srcover_1_lcd<kLinear_Dst>, src_1_lcd<kLinear_Dst>, |
479 }; | 569 }; |
480 return procs[flags]; | 570 return procs[flags]; |
481 } | 571 } |
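
procs[flags] indexes the eight-entry table directly, so the table order implies the flag layout below. This is an inference from this snippet alone (the real flag constants are defined outside this diff):

    // flags & 1 : 0 -> srcover_*_lcd        1 -> src_*_lcd
    // flags & 2 : 0 -> *_n_lcd (src array)  1 -> *_1_lcd (single src)
    // flags & 4 : 0 -> kSRGB_Dst            1 -> kLinear_Dst
    // e.g. flags == 6 selects srcover_1_lcd<kLinear_Dst>.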