OLD | NEW |
---|---|
1 /* | 1 /* |
2 * Copyright 2016 Google Inc. | 2 * Copyright 2016 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #ifndef SkSwizzler_opts_DEFINED | 8 #ifndef SkSwizzler_opts_DEFINED |
9 #define SkSwizzler_opts_DEFINED | 9 #define SkSwizzler_opts_DEFINED |
10 | 10 |
(...skipping 385 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
396 static void grayA_to_RGBA(uint32_t dst[], const void* src, int count) { | 396 static void grayA_to_RGBA(uint32_t dst[], const void* src, int count) { |
397 expand_grayA<false>(dst, src, count); | 397 expand_grayA<false>(dst, src, count); |
398 } | 398 } |
399 | 399 |
400 static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) { | 400 static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) { |
401 expand_grayA<true>(dst, src, count); | 401 expand_grayA<true>(dst, src, count); |
402 } | 402 } |
403 | 403 |
404 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 | 404 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 |
405 | 405 |
406 // Scale a byte by another. | |
407 // Inputs are stored in 16-bit lanes, but are not larger than 8-bits. | |
408 static __m128i scale(__m128i x, __m128i y) { | |
409 const __m128i _128 = _mm_set1_epi16(128); | |
410 const __m128i _257 = _mm_set1_epi16(257); | |
411 | |
412 // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255. | |
413 return _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(x, y), _128), _257); | |
414 } | |
415 | |
406 template <bool kSwapRB> | 416 template <bool kSwapRB> |
407 static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) { | 417 static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) { |
408 auto src = (const uint32_t*)vsrc; | 418 auto src = (const uint32_t*)vsrc; |
409 | 419 |
410 auto premul8 = [](__m128i* lo, __m128i* hi) { | 420 auto premul8 = [](__m128i* lo, __m128i* hi) { |
411 const __m128i zeros = _mm_setzero_si128(); | 421 const __m128i zeros = _mm_setzero_si128(); |
412 const __m128i _128 = _mm_set1_epi16(128); | |
413 const __m128i _257 = _mm_set1_epi16(257); | |
414 __m128i planar; | 422 __m128i planar; |
415 if (kSwapRB) { | 423 if (kSwapRB) { |
416 planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15); | 424 planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15); |
417 } else { | 425 } else { |
418 planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15); | 426 planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15); |
419 } | 427 } |
420 | 428 |
421 // Swizzle the pixels to 8-bit planar. | 429 // Swizzle the pixels to 8-bit planar. |
422 *lo = _mm_shuffle_epi8(*lo, planar); // rrrrgggg bbbbaaaa | 430 *lo = _mm_shuffle_epi8(*lo, planar); // rrrrgggg bbbbaaaa |
423 *hi = _mm_shuffle_epi8(*hi, planar); // RRRRGGGG BBBBAAAA | 431 *hi = _mm_shuffle_epi8(*hi, planar); // RRRRGGGG BBBBAAAA |
424 __m128i rg = _mm_unpacklo_epi32(*lo, *hi), // rrrrRRRR ggggGGGG | 432 __m128i rg = _mm_unpacklo_epi32(*lo, *hi), // rrrrRRRR ggggGGGG |
425 ba = _mm_unpackhi_epi32(*lo, *hi); // bbbbBBBB aaaaAAAA | 433 ba = _mm_unpackhi_epi32(*lo, *hi); // bbbbBBBB aaaaAAAA |
426 | 434 |
427 // Unpack to 16-bit planar. | 435 // Unpack to 16-bit planar. |
428 __m128i r = _mm_unpacklo_epi8(rg, zeros), // r_r_r_r_ R_R_R_R_ | 436 __m128i r = _mm_unpacklo_epi8(rg, zeros), // r_r_r_r_ R_R_R_R_ |
429 g = _mm_unpackhi_epi8(rg, zeros), // g_g_g_g_ G_G_G_G_ | 437 g = _mm_unpackhi_epi8(rg, zeros), // g_g_g_g_ G_G_G_G_ |
430 b = _mm_unpacklo_epi8(ba, zeros), // b_b_b_b_ B_B_B_B_ | 438 b = _mm_unpacklo_epi8(ba, zeros), // b_b_b_b_ B_B_B_B_ |
431 a = _mm_unpackhi_epi8(ba, zeros); // a_a_a_a_ A_A_A_A_ | 439 a = _mm_unpackhi_epi8(ba, zeros); // a_a_a_a_ A_A_A_A_ |
432 | 440 |
433 // Premultiply! (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255. | 441 // Premultiply! |
434 r = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(r, a), _128), _257); | 442 r = scale(r, a); |
435 g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(g, a), _128), _257); | 443 g = scale(g, a); |
436 b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(b, a), _128), _257); | 444 b = scale(b, a); |
437 | 445 |
438 // Repack into interlaced pixels. | 446 // Repack into interlaced pixels. |
439 rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)); // rgrgrgrg RGRGRGRG | 447 rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)); // rgrgrgrg RGRGRGRG |
440 ba = _mm_or_si128(b, _mm_slli_epi16(a, 8)); // babababa BABABABA | 448 ba = _mm_or_si128(b, _mm_slli_epi16(a, 8)); // babababa BABABABA |
441 *lo = _mm_unpacklo_epi16(rg, ba); // rgbargba rgbargba | 449 *lo = _mm_unpacklo_epi16(rg, ba); // rgbargba rgbargba |
442 *hi = _mm_unpackhi_epi16(rg, ba); // RGBARGBA RGBARGBA | 450 *hi = _mm_unpackhi_epi16(rg, ba); // RGBARGBA RGBARGBA |
443 }; | 451 }; |
444 | 452 |
445 while (count >= 8) { | 453 while (count >= 8) { |
446 __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)), | 454 __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)), |
(...skipping 118 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
565 _mm_storeu_si128((__m128i*) (dst + 12), ggga3); | 573 _mm_storeu_si128((__m128i*) (dst + 12), ggga3); |
566 | 574 |
567 src += 16; | 575 src += 16; |
568 dst += 16; | 576 dst += 16; |
569 count -= 16; | 577 count -= 16; |
570 } | 578 } |
571 | 579 |
572 gray_to_RGB1_portable(dst, src, count); | 580 gray_to_RGB1_portable(dst, src, count); |
573 } | 581 } |
574 | 582 |
575 static void grayA_to_RGBA(uint32_t dst[], const void* src, int count) { | 583 static void grayA_to_RGBA(uint32_t dst[], const void* vsrc, int count) { |
msarett
2016/02/03 22:23:35
pshufb was a little slower on mobile, but closer t
| |
584 const uint8_t* src = (const uint8_t*) vsrc; | |
585 | |
586 const __m128i mask = _mm_set1_epi16(0x00FF); | |
587 while (count >= 8) { | |
588 __m128i ga = _mm_loadu_si128((const __m128i*) src); | |
589 | |
590 __m128i gg = _mm_or_si128(_mm_and_si128(ga, mask), _mm_slli_epi16(ga, 8)); |
mtklein
2016/02/03 22:46:32
Might be easier to digest all together as
__m128i
msarett
2016/02/03 22:59:50
Done.
| |
591 | |
592 __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga); | |
593 __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga); | |
594 | |
595 _mm_storeu_si128((__m128i*) (dst + 0), ggga_lo); | |
596 _mm_storeu_si128((__m128i*) (dst + 4), ggga_hi); | |
597 | |
598 src += 8*2; | |
599 dst += 8; | |
600 count -= 8; | |
601 } | |
602 | |
576 grayA_to_RGBA_portable(dst, src, count); | 603 grayA_to_RGBA_portable(dst, src, count); |
577 } | 604 } |
578 | 605 |
579 static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) { | 606 static void grayA_to_rgbA(uint32_t dst[], const void* vsrc, int count) { |
607 const uint8_t* src = (const uint8_t*) vsrc; | |
608 | |
609 const __m128i mask = _mm_set1_epi16(0x00FF); | |
610 while (count >= 8) { | |
611 __m128i grayA = _mm_loadu_si128((const __m128i*) src); | |
612 | |
613 __m128i g0 = _mm_and_si128(grayA, mask); | |
mtklein
2016/02/03 22:46:32
ditto here, moving the mask inline,
__m128i g0 =
msarett
2016/02/03 22:59:50
Done.
| |
614 __m128i a0 = _mm_srli_epi16(grayA, 8); | |
615 | |
616 // Premultiply | |
617 g0 = scale(g0, a0); | |
618 | |
619 __m128i gg = _mm_or_si128(g0, _mm_slli_epi16(g0, 8)); | |
620 __m128i ga = _mm_or_si128(g0, _mm_slli_epi16(a0, 8)); | |
621 | |
622 | |
623 __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga); | |
624 __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga); | |
625 | |
626 _mm_storeu_si128((__m128i*) (dst + 0), ggga_lo); | |
627 _mm_storeu_si128((__m128i*) (dst + 4), ggga_hi); | |
628 | |
629 src += 8*2; | |
630 dst += 8; | |
631 count -= 8; | |
632 } | |
633 | |
580 grayA_to_rgbA_portable(dst, src, count); | 634 grayA_to_rgbA_portable(dst, src, count); |
581 } | 635 } |
582 | 636 |
583 #else | 637 #else |
584 | 638 |
585 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) { | 639 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) { |
586 RGBA_to_rgbA_portable(dst, src, count); | 640 RGBA_to_rgbA_portable(dst, src, count); |
587 } | 641 } |
588 | 642 |
589 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) { | 643 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) { |
(...skipping 22 matching lines...) Expand all Loading... | |
612 | 666 |
613 static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) { | 667 static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) { |
614 grayA_to_rgbA_portable(dst, src, count); | 668 grayA_to_rgbA_portable(dst, src, count); |
615 } | 669 } |
616 | 670 |
617 #endif | 671 #endif |
618 | 672 |
619 } | 673 } |
620 | 674 |
621 #endif // SkSwizzler_opts_DEFINED | 675 #endif // SkSwizzler_opts_DEFINED |
OLD | NEW |