Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 /* | 1 /* |
| 2 * Copyright 2016 Google Inc. | 2 * Copyright 2016 Google Inc. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #ifndef SkSwizzler_opts_DEFINED | 8 #ifndef SkSwizzler_opts_DEFINED |
| 9 #define SkSwizzler_opts_DEFINED | 9 #define SkSwizzler_opts_DEFINED |
| 10 | 10 |
| (...skipping 385 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 396 static void grayA_to_RGBA(uint32_t dst[], const void* src, int count) { | 396 static void grayA_to_RGBA(uint32_t dst[], const void* src, int count) { |
| 397 expand_grayA<false>(dst, src, count); | 397 expand_grayA<false>(dst, src, count); |
| 398 } | 398 } |
| 399 | 399 |
| 400 static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) { | 400 static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) { |
| 401 expand_grayA<true>(dst, src, count); | 401 expand_grayA<true>(dst, src, count); |
| 402 } | 402 } |
| 403 | 403 |
| 404 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 | 404 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 |
| 405 | 405 |
| 406 // Scale a byte by another. | |
| 407 // Inputs are stored in 16-bit lanes, but are not larger than 8-bits. | |
| 408 static __m128i scale(__m128i x, __m128i y) { | |
| 409 const __m128i _128 = _mm_set1_epi16(128); | |
| 410 const __m128i _257 = _mm_set1_epi16(257); | |
| 411 | |
| 412 // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255. | |
| 413 return _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(x, y), _128), _257); | |
| 414 } | |
| 415 | |
| 406 template <bool kSwapRB> | 416 template <bool kSwapRB> |
| 407 static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) { | 417 static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) { |
| 408 auto src = (const uint32_t*)vsrc; | 418 auto src = (const uint32_t*)vsrc; |
| 409 | 419 |
| 410 auto premul8 = [](__m128i* lo, __m128i* hi) { | 420 auto premul8 = [](__m128i* lo, __m128i* hi) { |
| 411 const __m128i zeros = _mm_setzero_si128(); | 421 const __m128i zeros = _mm_setzero_si128(); |
| 412 const __m128i _128 = _mm_set1_epi16(128); | |
| 413 const __m128i _257 = _mm_set1_epi16(257); | |
| 414 __m128i planar; | 422 __m128i planar; |
| 415 if (kSwapRB) { | 423 if (kSwapRB) { |
| 416 planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15); | 424 planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15); |
| 417 } else { | 425 } else { |
| 418 planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15); | 426 planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15); |
| 419 } | 427 } |
| 420 | 428 |
| 421 // Swizzle the pixels to 8-bit planar. | 429 // Swizzle the pixels to 8-bit planar. |
| 422 *lo = _mm_shuffle_epi8(*lo, planar); // rrrrgggg bb bbaaaa | 430 *lo = _mm_shuffle_epi8(*lo, planar); // rrrrgggg bb bbaaaa |
| 423 *hi = _mm_shuffle_epi8(*hi, planar); // RRRRGGGG BB BBAAAA | 431 *hi = _mm_shuffle_epi8(*hi, planar); // RRRRGGGG BB BBAAAA |
| 424 __m128i rg = _mm_unpacklo_epi32(*lo, *hi), // rrrrRRRR gg ggGGGG | 432 __m128i rg = _mm_unpacklo_epi32(*lo, *hi), // rrrrRRRR gg ggGGGG |
| 425 ba = _mm_unpackhi_epi32(*lo, *hi); // bbbbBBBB aa aaAAAA | 433 ba = _mm_unpackhi_epi32(*lo, *hi); // bbbbBBBB aa aaAAAA |
| 426 | 434 |
| 427 // Unpack to 16-bit planar. | 435 // Unpack to 16-bit planar. |
| 428 __m128i r = _mm_unpacklo_epi8(rg, zeros), // r_r_r_r_ R_ R_R_R_ | 436 __m128i r = _mm_unpacklo_epi8(rg, zeros), // r_r_r_r_ R_ R_R_R_ |
| 429 g = _mm_unpackhi_epi8(rg, zeros), // g_g_g_g_ G_ G_G_G_ | 437 g = _mm_unpackhi_epi8(rg, zeros), // g_g_g_g_ G_ G_G_G_ |
| 430 b = _mm_unpacklo_epi8(ba, zeros), // b_b_b_b_ B_ B_B_B_ | 438 b = _mm_unpacklo_epi8(ba, zeros), // b_b_b_b_ B_ B_B_B_ |
| 431 a = _mm_unpackhi_epi8(ba, zeros); // a_a_a_a_ A_ A_A_A_ | 439 a = _mm_unpackhi_epi8(ba, zeros); // a_a_a_a_ A_ A_A_A_ |
| 432 | 440 |
| 433 // Premultiply! (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255. | 441 // Premultiply! |
| 434 r = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(r, a), _128), _257); | 442 r = scale(r, a); |
| 435 g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(g, a), _128), _257); | 443 g = scale(g, a); |
| 436 b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(b, a), _128), _257); | 444 b = scale(b, a); |
| 437 | 445 |
| 438 // Repack into interlaced pixels. | 446 // Repack into interlaced pixels. |
| 439 rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)); // rgrgrgrg RG RGRGRG | 447 rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)); // rgrgrgrg RG RGRGRG |
| 440 ba = _mm_or_si128(b, _mm_slli_epi16(a, 8)); // babababa BA BABABA | 448 ba = _mm_or_si128(b, _mm_slli_epi16(a, 8)); // babababa BA BABABA |
| 441 *lo = _mm_unpacklo_epi16(rg, ba); // rgbargba rg bargba | 449 *lo = _mm_unpacklo_epi16(rg, ba); // rgbargba rg bargba |
| 442 *hi = _mm_unpackhi_epi16(rg, ba); // RGBARGBA RG BARGBA | 450 *hi = _mm_unpackhi_epi16(rg, ba); // RGBARGBA RG BARGBA |
| 443 }; | 451 }; |
| 444 | 452 |
| 445 while (count >= 8) { | 453 while (count >= 8) { |
| 446 __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)), | 454 __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)), |
| (...skipping 118 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 565 _mm_storeu_si128((__m128i*) (dst + 12), ggga3); | 573 _mm_storeu_si128((__m128i*) (dst + 12), ggga3); |
| 566 | 574 |
| 567 src += 16; | 575 src += 16; |
| 568 dst += 16; | 576 dst += 16; |
| 569 count -= 16; | 577 count -= 16; |
| 570 } | 578 } |
| 571 | 579 |
| 572 gray_to_RGB1_portable(dst, src, count); | 580 gray_to_RGB1_portable(dst, src, count); |
| 573 } | 581 } |
| 574 | 582 |
| 575 static void grayA_to_RGBA(uint32_t dst[], const void* src, int count) { | 583 static void grayA_to_RGBA(uint32_t dst[], const void* vsrc, int count) { |
|
msarett
2016/02/03 22:23:35
pshufb was a little slower on mobile, but closer t… [remainder of comment truncated in this capture]
| |
| 584 const uint8_t* src = (const uint8_t*) vsrc; | |
| 585 | |
| 586 const __m128i mask = _mm_set1_epi16(0x00FF); | |
| 587 while (count >= 8) { | |
| 588 __m128i ga = _mm_loadu_si128((const __m128i*) src); | |
| 589 | |
| 590 __m128i gg = _mm_or_si128(_mm_and_si128(ga, mask), _mm_slli_epi16(ga, 8) ); | |
|
mtklein
2016/02/03 22:46:32
Might be easier to digest all together as
__m128i … [suggested code truncated in this capture]
msarett
2016/02/03 22:59:50
Done.
| |
| 591 | |
| 592 __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga); | |
| 593 __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga); | |
| 594 | |
| 595 _mm_storeu_si128((__m128i*) (dst + 0), ggga_lo); | |
| 596 _mm_storeu_si128((__m128i*) (dst + 4), ggga_hi); | |
| 597 | |
| 598 src += 8*2; | |
| 599 dst += 8; | |
| 600 count -= 8; | |
| 601 } | |
| 602 | |
| 576 grayA_to_RGBA_portable(dst, src, count); | 603 grayA_to_RGBA_portable(dst, src, count); |
| 577 } | 604 } |
| 578 | 605 |
| 579 static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) { | 606 static void grayA_to_rgbA(uint32_t dst[], const void* vsrc, int count) { |
| 607 const uint8_t* src = (const uint8_t*) vsrc; | |
| 608 | |
| 609 const __m128i mask = _mm_set1_epi16(0x00FF); | |
| 610 while (count >= 8) { | |
| 611 __m128i grayA = _mm_loadu_si128((const __m128i*) src); | |
| 612 | |
| 613 __m128i g0 = _mm_and_si128(grayA, mask); | |
|
mtklein
2016/02/03 22:46:32
Ditto here, moving the mask inline:
__m128i g0 = … [suggested code truncated in this capture]
msarett
2016/02/03 22:59:50
Done.
| |
| 614 __m128i a0 = _mm_srli_epi16(grayA, 8); | |
| 615 | |
| 616 // Premultiply | |
| 617 g0 = scale(g0, a0); | |
| 618 | |
| 619 __m128i gg = _mm_or_si128(g0, _mm_slli_epi16(g0, 8)); | |
| 620 __m128i ga = _mm_or_si128(g0, _mm_slli_epi16(a0, 8)); | |
| 621 | |
| 622 | |
| 623 __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga); | |
| 624 __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga); | |
| 625 | |
| 626 _mm_storeu_si128((__m128i*) (dst + 0), ggga_lo); | |
| 627 _mm_storeu_si128((__m128i*) (dst + 4), ggga_hi); | |
| 628 | |
| 629 src += 8*2; | |
| 630 dst += 8; | |
| 631 count -= 8; | |
| 632 } | |
| 633 | |
| 580 grayA_to_rgbA_portable(dst, src, count); | 634 grayA_to_rgbA_portable(dst, src, count); |
| 581 } | 635 } |
| 582 | 636 |
| 583 #else | 637 #else |
| 584 | 638 |
| 585 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) { | 639 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) { |
| 586 RGBA_to_rgbA_portable(dst, src, count); | 640 RGBA_to_rgbA_portable(dst, src, count); |
| 587 } | 641 } |
| 588 | 642 |
| 589 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) { | 643 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) { |
| (...skipping 22 matching lines...) Expand all Loading... | |
| 612 | 666 |
| 613 static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) { | 667 static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) { |
| 614 grayA_to_rgbA_portable(dst, src, count); | 668 grayA_to_rgbA_portable(dst, src, count); |
| 615 } | 669 } |
| 616 | 670 |
| 617 #endif | 671 #endif |
| 618 | 672 |
| 619 } | 673 } |
| 620 | 674 |
| 621 #endif // SkSwizzler_opts_DEFINED | 675 #endif // SkSwizzler_opts_DEFINED |
| OLD | NEW |