OLD | NEW |
---|---|
1 /* | 1 /* |
2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 | 8 |
9 #include "SkBlitRow_opts_SSE2.h" | 9 #include "SkBlitRow_opts_SSE2.h" |
10 #include "SkBitmapProcState_opts_SSE2.h" | 10 #include "SkBitmapProcState_opts_SSE2.h" |
(...skipping 526 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
537 #endif | 537 #endif |
538 | 538 |
539 #if SK_B16x5_B32x5_SHIFT == 0 | 539 #if SK_B16x5_B32x5_SHIFT == 0 |
540 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x) | 540 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x) |
541 #elif SK_B16x5_B32x5_SHIFT > 0 | 541 #elif SK_B16x5_B32x5_SHIFT > 0 |
542 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT)) | 542 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT)) |
543 #else | 543 #else |
544 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT)) | 544 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT)) |
545 #endif | 545 #endif |
546 | 546 |
547 static __m128i SkBlendLCD16_SSE2(__m128i &srci, __m128i &dst, | 547 static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst, |
548 __m128i &mask, __m128i &scale) { | 548 __m128i &mask, __m128i &srcA) { |
549 // In the following comments, the components of src, dst and mask are | |
550 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked | |
551 // by an R, G, B, or A suffix. Components of one of the four pixels that | |
552 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for | |
alokp
2013/06/26 22:42:11
IMO positional suffix is not necessary because you
ernstm
2013/06/26 23:12:16
I had added the positional suffix for the cases wh
| |
553 // example is the blue channel of the second destination pixel. Memory | |
554 // layout is shown for an ARGB byte order in a color value. | |
555 | |
556 // src and srcA store 8-bit values interleaved with zeros. | |
557 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) | |
558 // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0, | |
559 // srcA, 0, srcA, 0, srcA, 0, srcA, 0) | |
560 // mask stores 16-bit values (compressed three channels) interleaved with zeros. | |
561 // Lo and Hi denote the low and high bytes of a 16-bit value, respectively. | |
562 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, | |
563 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) | |
564 | |
549 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. | 565 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. |
566 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0) | |
550 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask), | 567 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask), |
551 _mm_set1_epi32(0x1F << SK_R32_SHIFT)); | 568 _mm_set1_epi32(0x1F << SK_R32_SHIFT)); |
552 | 569 |
570 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0) | |
553 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask), | 571 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask), |
554 _mm_set1_epi32(0x1F << SK_G32_SHIFT)); | 572 _mm_set1_epi32(0x1F << SK_G32_SHIFT)); |
555 | 573 |
574 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B) | |
556 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask), | 575 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask), |
557 _mm_set1_epi32(0x1F << SK_B32_SHIFT)); | 576 _mm_set1_epi32(0x1F << SK_B32_SHIFT)); |
558 | 577 |
559 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) | 578 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) |
579 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an | |
580 // 8-bit position | |
581 // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B, | |
582 // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B) | |
560 mask = _mm_or_si128(_mm_or_si128(r, g), b); | 583 mask = _mm_or_si128(_mm_or_si128(r, g), b); |
561 | 584 |
562 // Interleave R,G,B into the lower byte of word. | 585 // Interleave R,G,B into the lower byte of word. |
586 // i.e. split the sixteen 8-bit values from mask into two sets of eight | |
587 // 16-bit values, padded by zero. | |
563 __m128i maskLo, maskHi; | 588 __m128i maskLo, maskHi; |
589 // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0) | |
564 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); | 590 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); |
591 // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0) | |
565 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); | 592 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); |
566 | 593 |
567 // Upscale to 0..32 | 594 // Upscale from 0..31 to 0..32 |
595 // (allows to replace division by right-shift further down) | |
596 // Right-shift each component by 4 and add the result back to that component, | |
597 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32 | |
568 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); | 598 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); |
569 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); | 599 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); |
570 | 600 |
571 maskLo = _mm_mullo_epi16(maskLo, scale); | 601 // Multiply each component of maskLo and maskHi by srcA |
572 maskHi = _mm_mullo_epi16(maskHi, scale); | 602 maskLo = _mm_mullo_epi16(maskLo, srcA); |
573 | 603 maskHi = _mm_mullo_epi16(maskHi, srcA); |
604 | |
605 // Right-shift mask components by 8 (divide by 256) | |
574 maskLo = _mm_srli_epi16(maskLo, 8); | 606 maskLo = _mm_srli_epi16(maskLo, 8); |
575 maskHi = _mm_srli_epi16(maskHi, 8); | 607 maskHi = _mm_srli_epi16(maskHi, 8); |
576 | 608 |
577 // Interleave R,G,B into the lower byte of the word. | 609 // Interleave R,G,B into the lower byte of the word |
610 // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0) | |
578 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); | 611 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); |
612 // dstHi = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0) | |
579 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); | 613 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); |
580 | 614 |
581 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(srci, dstLo)); | 615 // mask = (src - dst) * mask |
582 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(srci, dstHi)); | 616 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo)); |
583 | 617 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi)); |
618 | |
619 // mask = (src - dst) * mask >> 5 | |
584 maskLo = _mm_srai_epi16(maskLo, 5); | 620 maskLo = _mm_srai_epi16(maskLo, 5); |
585 maskHi = _mm_srai_epi16(maskHi, 5); | 621 maskHi = _mm_srai_epi16(maskHi, 5); |
586 | 622 |
587 // Add two pixels into result. | 623 // Add two pixels into result. |
624 // result = dst + ((src - dst) * mask >> 5) | |
588 __m128i resultLo = _mm_add_epi16(dstLo, maskLo); | 625 __m128i resultLo = _mm_add_epi16(dstLo, maskLo); |
589 __m128i resultHi = _mm_add_epi16(dstHi, maskHi); | 626 __m128i resultHi = _mm_add_epi16(dstHi, maskHi); |
590 | 627 |
591 // Pack into 4 32bit dst pixels | 628 // Pack into 4 32bit dst pixels. |
629 // resultLo and resultHi contain eight 16-bit components (two pixels) each. | |
630 // Merge into one SSE register with sixteen 8-bit values (four pixels), | |
631 // clamping to 255 if necessary. | |
592 return _mm_packus_epi16(resultLo, resultHi); | 632 return _mm_packus_epi16(resultLo, resultHi); |
593 } | 633 } |
594 | 634 |
595 static __m128i SkBlendLCD16Opaque_SSE2(__m128i &srci, __m128i &dst, | 635 static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst, |
596 __m128i &mask) { | 636 __m128i &mask) { |
637 // In the following comments, the components of src, dst and mask are | |
638 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked | |
639 // by an R, G, B, or A suffix. Components of one of the four pixels that | |
640 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for | |
641 // example is the blue channel of the second destination pixel. Memory | |
642 // layout is shown for an ARGB byte order in a color value. | |
643 | |
644 // src stores 8-bit values interleaved with zeros. | |
645 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) | |
646 // mask stores 16-bit values (shown as high and low bytes) interleaved with | |
647 // zeros | |
648 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, | |
649 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) | |
650 | |
597 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. | 651 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. |
652 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0) | |
598 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask), | 653 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask), |
599 _mm_set1_epi32(0x1F << SK_R32_SHIFT)); | 654 _mm_set1_epi32(0x1F << SK_R32_SHIFT)); |
600 | 655 |
656 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0) | |
601 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask), | 657 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask), |
602 _mm_set1_epi32(0x1F << SK_G32_SHIFT)); | 658 _mm_set1_epi32(0x1F << SK_G32_SHIFT)); |
603 | 659 |
660 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B) | |
604 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask), | 661 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask), |
605 _mm_set1_epi32(0x1F << SK_B32_SHIFT)); | 662 _mm_set1_epi32(0x1F << SK_B32_SHIFT)); |
606 | 663 |
607 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) | 664 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) |
665 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an | |
666 // 8-bit position | |
667 // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B, | |
668 // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B) | |
608 mask = _mm_or_si128(_mm_or_si128(r, g), b); | 669 mask = _mm_or_si128(_mm_or_si128(r, g), b); |
609 | 670 |
610 // Interleave R,G,B into the lower byte of word. | 671 // Interleave R,G,B into the lower byte of word. |
672 // i.e. split the sixteen 8-bit values from mask into two sets of eight | |
673 // 16-bit values, padded by zero. | |
611 __m128i maskLo, maskHi; | 674 __m128i maskLo, maskHi; |
675 // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0) | |
612 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); | 676 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); |
677 // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0) | |
613 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); | 678 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); |
614 | 679 |
615 // Upscale to 0..32 | 680 // Upscale from 0..31 to 0..32 |
681 // (allows to replace division by right-shift further down) | |
682 // Right-shift each component by 4 and add the result back to that component, | |
683 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32 | |
616 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); | 684 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); |
617 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); | 685 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); |
618 | 686 |
619 // Interleave R,G,B into the lower byte of the word. | 687 // Interleave R,G,B into the lower byte of the word |
688 // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0) | |
620 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); | 689 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); |
690 // dstHi = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0) | |
621 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); | 691 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); |
622 | 692 |
623 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(srci, dstLo)); | 693 // mask = (src - dst) * mask |
624 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(srci, dstHi)); | 694 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo)); |
625 | 695 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi)); |
696 | |
697 // mask = (src - dst) * mask >> 5 | |
626 maskLo = _mm_srai_epi16(maskLo, 5); | 698 maskLo = _mm_srai_epi16(maskLo, 5); |
627 maskHi = _mm_srai_epi16(maskHi, 5); | 699 maskHi = _mm_srai_epi16(maskHi, 5); |
628 | 700 |
629 // Add two pixels into result. | 701 // Add two pixels into result. |
702 // result = dst + ((src - dst) * mask >> 5) | |
630 __m128i resultLo = _mm_add_epi16(dstLo, maskLo); | 703 __m128i resultLo = _mm_add_epi16(dstLo, maskLo); |
631 __m128i resultHi = _mm_add_epi16(dstHi, maskHi); | 704 __m128i resultHi = _mm_add_epi16(dstHi, maskHi); |
632 | 705 |
633 // Pack into 4 32bit dst pixels and force opaque. | 706 // Pack into 4 32bit dst pixels and force opaque. |
707 // resultLo and resultHi contain eight 16-bit components (two pixels) each. | |
708 // Merge into one SSE register with sixteen 8-bit values (four pixels), | |
709 // clamping to 255 if necessary. Set alpha components to 0xFF. | |
634 return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi), | 710 return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi), |
635 _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT)); | 711 _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT)); |
636 } | 712 } |
637 | 713 |
638 void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t src[], | 714 void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[], |
639 SkColor color, int width, SkPMColor) { | 715 SkColor src, int width, SkPMColor) { |
640 if (width <= 0) { | 716 if (width <= 0) { |
641 return; | 717 return; |
642 } | 718 } |
643 | 719 |
644 int srcA = SkColorGetA(color); | 720 int srcA = SkColorGetA(src); |
645 int srcR = SkColorGetR(color); | 721 int srcR = SkColorGetR(src); |
646 int srcG = SkColorGetG(color); | 722 int srcG = SkColorGetG(src); |
647 int srcB = SkColorGetB(color); | 723 int srcB = SkColorGetB(src); |
648 | 724 |
649 srcA = SkAlpha255To256(srcA); | 725 srcA = SkAlpha255To256(srcA); |
650 | 726 |
651 if (width >= 4) { | 727 if (width >= 4) { |
652 SkASSERT(((size_t)dst & 0x03) == 0); | 728 SkASSERT(((size_t)dst & 0x03) == 0); |
653 while (((size_t)dst & 0x0F) != 0) { | 729 while (((size_t)dst & 0x0F) != 0) { |
654 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *src); | 730 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask); |
655 src++; | 731 mask++; |
656 dst++; | 732 dst++; |
657 width--; | 733 width--; |
658 } | 734 } |
659 | 735 |
660 __m128i *d = reinterpret_cast<__m128i*>(dst); | 736 __m128i *d = reinterpret_cast<__m128i*>(dst); |
661 __m128i srci = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); | 737 // Set alpha to 0xFF and replicate source four times in SSE register. |
662 srci = _mm_unpacklo_epi8(srci, _mm_setzero_si128()); | 738 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); |
663 __m128i scale = _mm_set1_epi16(srcA); | 739 // Interleave with zeros to get two sets of four 16-bit values. |
740 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128()); | |
741 // Set srcA_sse to contain eight copies of srcA, padded with zero. | |
742 // src_sse = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) | |
743 __m128i srcA_sse = _mm_set1_epi16(srcA); | |
664 while (width >= 4) { | 744 while (width >= 4) { |
665 __m128i dst_pixel = _mm_load_si128(d); | 745 // Load four destination pixels into dst_sse. |
666 __m128i mask_pixel = _mm_loadl_epi64( | 746 __m128i dst_sse = _mm_load_si128(d); |
667 reinterpret_cast<const __m128i*>(src)); | 747 // Load four 16-bit masks into lower half of mask_sse. |
668 | 748 __m128i mask_sse = _mm_loadl_epi64( |
669 // Check whether mask_pixels are equal to 0 and get the highest bit | 749 reinterpret_cast<const __m128i*>(mask)); |
670 // of each byte of result, if mask pixes are all zero, we will get | 750 |
751 // Check whether masks are equal to 0 and get the highest bit | |
752 // of each byte of result, if masks are all zero, we will get | |
671 // pack_cmp to 0xFFFF | 753 // pack_cmp to 0xFFFF |
672 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_pixel, | 754 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse, |
673 _mm_setzero_si128())); | 755 _mm_setzero_si128())); |
674 | 756 |
675 // if mask pixels are not all zero, we will blend the dst pixels | 757 // if mask pixels are not all zero, we will blend the dst pixels |
676 if (pack_cmp != 0xFFFF) { | 758 if (pack_cmp != 0xFFFF) { |
677 // Unpack 4 16bit mask pixels to | 759 // Unpack 4 16bit mask pixels to |
678 // (p0, 0, p1, 0, p2, 0, p3, 0) | 760 // (m0, 0, m1, 0, m2, 0, m3, 0) |
679 mask_pixel = _mm_unpacklo_epi16(mask_pixel, | 761 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, |
680 _mm_setzero_si128()); | 762 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) |
763 mask_sse = _mm_unpacklo_epi16(mask_sse, | |
764 _mm_setzero_si128()); | |
681 | 765 |
682 // Process 4 32bit dst pixels | 766 // Process 4 32bit dst pixels |
683 __m128i result = SkBlendLCD16_SSE2(srci, dst_pixel, | 767 __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse, |
684 mask_pixel, scale); | 768 mask_sse, srcA_sse); |
685 _mm_store_si128(d, result); | 769 _mm_store_si128(d, result); |
686 } | 770 } |
687 | 771 |
688 d++; | 772 d++; |
689 src += 4; | 773 mask += 4; |
690 width -= 4; | 774 width -= 4; |
691 } | 775 } |
692 | 776 |
693 dst = reinterpret_cast<SkPMColor*>(d); | 777 dst = reinterpret_cast<SkPMColor*>(d); |
694 } | 778 } |
695 | 779 |
696 while (width > 0) { | 780 while (width > 0) { |
697 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *src); | 781 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask); |
698 src++; | 782 mask++; |
699 dst++; | 783 dst++; |
700 width--; | 784 width--; |
701 } | 785 } |
702 } | 786 } |
703 | 787 |
704 void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t src[], | 788 void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[], |
705 SkColor color, int width, SkPMColor opaqueDst) { | 789 SkColor src, int width, SkPMColor opaqueDst) { |
706 if (width <= 0) { | 790 if (width <= 0) { |
707 return; | 791 return; |
708 } | 792 } |
709 | 793 |
710 int srcR = SkColorGetR(color); | 794 int srcR = SkColorGetR(src); |
711 int srcG = SkColorGetG(color); | 795 int srcG = SkColorGetG(src); |
712 int srcB = SkColorGetB(color); | 796 int srcB = SkColorGetB(src); |
713 | 797 |
714 if (width >= 4) { | 798 if (width >= 4) { |
715 SkASSERT(((size_t)dst & 0x03) == 0); | 799 SkASSERT(((size_t)dst & 0x03) == 0); |
716 while (((size_t)dst & 0x0F) != 0) { | 800 while (((size_t)dst & 0x0F) != 0) { |
717 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *src, opaqueDst); | 801 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst); |
718 src++; | 802 mask++; |
719 dst++; | 803 dst++; |
720 width--; | 804 width--; |
721 } | 805 } |
722 | 806 |
723 __m128i *d = reinterpret_cast<__m128i*>(dst); | 807 __m128i *d = reinterpret_cast<__m128i*>(dst); |
724 __m128i srci = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); | 808 // Set alpha to 0xFF and replicate source four times in SSE register. |
725 srci = _mm_unpacklo_epi8(srci, _mm_setzero_si128()); | 809 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); |
810 // Interleave with zeros to get two sets of four 16-bit values. | |
811 // src_sse = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) | |
812 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128()); | |
726 while (width >= 4) { | 813 while (width >= 4) { |
727 __m128i dst_pixel = _mm_load_si128(d); | 814 // Load four destination pixels into dst_sse. |
728 __m128i mask_pixel = _mm_loadl_epi64( | 815 __m128i dst_sse = _mm_load_si128(d); |
729 reinterpret_cast<const __m128i*>(src)); | 816 // Load four 16-bit masks into lower half of mask_sse. |
817 __m128i mask_sse = _mm_loadl_epi64( | |
818 reinterpret_cast<const __m128i*>(mask)); | |
730 | 819 |
731 // Check whether mask_pixels are equal to 0 and get the highest bit | 820 // Check whether masks are equal to 0 and get the highest bit |
732 // of each byte of result, if mask pixes are all zero, we will get | 821 // of each byte of result, if masks are all zero, we will get |
733 // pack_cmp to 0xFFFF | 822 // pack_cmp to 0xFFFF |
734 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_pixel, | 823 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse, |
735 _mm_setzero_si128())); | 824 _mm_setzero_si128())); |
736 | 825 |
737 // if mask pixels are not all zero, we will blend the dst pixels | 826 // if mask pixels are not all zero, we will blend the dst pixels |
738 if (pack_cmp != 0xFFFF) { | 827 if (pack_cmp != 0xFFFF) { |
739 // Unpack 4 16bit mask pixels to | 828 // Unpack 4 16bit mask pixels to |
740 // (p0, 0, p1, 0, p2, 0, p3, 0) | 829 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, |
741 mask_pixel = _mm_unpacklo_epi16(mask_pixel, | 830 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) |
742 _mm_setzero_si128()); | 831 mask_sse = _mm_unpacklo_epi16(mask_sse, |
832 _mm_setzero_si128()); | |
743 | 833 |
744 // Process 4 32bit dst pixels | 834 // Process 4 32bit dst pixels |
745 __m128i result = SkBlendLCD16Opaque_SSE2(srci, dst_pixel, | 835 __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse, |
746 mask_pixel); | 836 mask_sse); |
747 _mm_store_si128(d, result); | 837 _mm_store_si128(d, result); |
748 } | 838 } |
749 | 839 |
750 d++; | 840 d++; |
751 src += 4; | 841 mask += 4; |
752 width -= 4; | 842 width -= 4; |
753 } | 843 } |
754 | 844 |
755 dst = reinterpret_cast<SkPMColor*>(d); | 845 dst = reinterpret_cast<SkPMColor*>(d); |
756 } | 846 } |
757 | 847 |
758 while (width > 0) { | 848 while (width > 0) { |
759 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *src, opaqueDst); | 849 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst); |
760 src++; | 850 mask++; |
761 dst++; | 851 dst++; |
762 width--; | 852 width--; |
763 } | 853 } |
764 } | 854 } |
OLD | NEW |