Chromium Code Reviews| OLD | NEW |
|---|---|
| 1 /* | 1 /* |
| 2 * Copyright 2012 The Android Open Source Project | 2 * Copyright 2012 The Android Open Source Project |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 | 8 |
| 9 #include "SkBlitRow_opts_SSE2.h" | 9 #include "SkBlitRow_opts_SSE2.h" |
| 10 #include "SkBitmapProcState_opts_SSE2.h" | 10 #include "SkBitmapProcState_opts_SSE2.h" |
| (...skipping 526 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 537 #endif | 537 #endif |
| 538 | 538 |
| 539 #if SK_B16x5_B32x5_SHIFT == 0 | 539 #if SK_B16x5_B32x5_SHIFT == 0 |
| 540 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x) | 540 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x) |
| 541 #elif SK_B16x5_B32x5_SHIFT > 0 | 541 #elif SK_B16x5_B32x5_SHIFT > 0 |
| 542 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT)) | 542 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT)) |
| 543 #else | 543 #else |
| 544 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT)) | 544 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT)) |
| 545 #endif | 545 #endif |
| 546 | 546 |
| 547 static __m128i SkBlendLCD16_SSE2(__m128i &srci, __m128i &dst, | 547 static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst, |
| 548 __m128i &mask, __m128i &scale) { | 548 __m128i &mask, __m128i &srcA) { |
| 549 // In the following comments, the components of src, dst and mask are | |
| 550 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked | |
| 551 // by an R, G, B, or A suffix. Components of one of the four pixels that | |
| 552 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for | |
|
alokp
2013/06/26 22:42:11
IMO positional suffix is not necessary because you
ernstm
2013/06/26 23:12:16
I had added the positional suffix for the cases wh
| |
| 553 // example is the blue channel of the second destination pixel. Memory | |
| 554 // layout is shown for an ARGB byte order in a color value. | |
| 555 | |
| 556 // src and srcA store 8-bit values interleaved with zeros. | |
| 557 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) | |
| 558 // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0, | |
| 559 // srcA, 0, srcA, 0, srcA, 0, srcA, 0) | |
| 560 // mask stores 16-bit values (compressed three channels) interleaved with zeros. | |
| 561 // Lo and Hi denote the low and high bytes of a 16-bit value, respectively. | |
| 562 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, | |
| 563 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) | |
| 564 | |
| 549 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. | 565 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. |
| 566 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0) | |
| 550 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask), | 567 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask), |
| 551 _mm_set1_epi32(0x1F << SK_R32_SHIFT)); | 568 _mm_set1_epi32(0x1F << SK_R32_SHIFT)); |
| 552 | 569 |
| 570 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0) | |
| 553 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask), | 571 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask), |
| 554 _mm_set1_epi32(0x1F << SK_G32_SHIFT)); | 572 _mm_set1_epi32(0x1F << SK_G32_SHIFT)); |
| 555 | 573 |
| 574 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B) | |
| 556 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask), | 575 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask), |
| 557 _mm_set1_epi32(0x1F << SK_B32_SHIFT)); | 576 _mm_set1_epi32(0x1F << SK_B32_SHIFT)); |
| 558 | 577 |
| 559 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) | 578 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) |
| 579 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an | |
| 580 // 8-bit position | |
| 581 // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B, | |
| 582 // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B) | |
| 560 mask = _mm_or_si128(_mm_or_si128(r, g), b); | 583 mask = _mm_or_si128(_mm_or_si128(r, g), b); |
| 561 | 584 |
| 562 // Interleave R,G,B into the lower byte of word. | 585 // Interleave R,G,B into the lower byte of word. |
| 586 // i.e. split the sixteen 8-bit values from mask into two sets of eight | |
| 587 // 16-bit values, padded by zero. | |
| 563 __m128i maskLo, maskHi; | 588 __m128i maskLo, maskHi; |
| 589 // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0) | |
| 564 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); | 590 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); |
| 591 // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0) | |
| 565 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); | 592 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); |
| 566 | 593 |
| 567 // Upscale to 0..32 | 594 // Upscale from 0..31 to 0..32 |
| 595 // (allows replacing the division with a right shift further down) | |
| 596 // Right-shift each component by 4 and add the result back to that component, | |
| 597 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32 | |
| 568 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); | 598 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); |
| 569 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); | 599 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); |
| 570 | 600 |
| 571 maskLo = _mm_mullo_epi16(maskLo, scale); | 601 // Multiply each component of maskLo and maskHi by srcA |
| 572 maskHi = _mm_mullo_epi16(maskHi, scale); | 602 maskLo = _mm_mullo_epi16(maskLo, srcA); |
| 573 | 603 maskHi = _mm_mullo_epi16(maskHi, srcA); |
| 604 | |
| 605 // Right shift mask components by 8 (divide by 256) | |
| 574 maskLo = _mm_srli_epi16(maskLo, 8); | 606 maskLo = _mm_srli_epi16(maskLo, 8); |
| 575 maskHi = _mm_srli_epi16(maskHi, 8); | 607 maskHi = _mm_srli_epi16(maskHi, 8); |
| 576 | 608 |
| 577 // Interleave R,G,B into the lower byte of the word. | 609 // Interleave R,G,B into the lower byte of the word |
| 610 // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0) | |
| 578 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); | 611 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); |
| 612 // dstHi = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0) | |
| 579 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); | 613 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); |
| 580 | 614 |
| 581 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(srci, dstLo)); | 615 // mask = (src - dst) * mask |
| 582 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(srci, dstHi)); | 616 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo)); |
| 583 | 617 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi)); |
| 618 | |
| 619 // mask = (src - dst) * mask >> 5 | |
| 584 maskLo = _mm_srai_epi16(maskLo, 5); | 620 maskLo = _mm_srai_epi16(maskLo, 5); |
| 585 maskHi = _mm_srai_epi16(maskHi, 5); | 621 maskHi = _mm_srai_epi16(maskHi, 5); |
| 586 | 622 |
| 587 // Add two pixels into result. | 623 // Add two pixels into result. |
| 624 // result = dst + ((src - dst) * mask >> 5) | |
| 588 __m128i resultLo = _mm_add_epi16(dstLo, maskLo); | 625 __m128i resultLo = _mm_add_epi16(dstLo, maskLo); |
| 589 __m128i resultHi = _mm_add_epi16(dstHi, maskHi); | 626 __m128i resultHi = _mm_add_epi16(dstHi, maskHi); |
| 590 | 627 |
| 591 // Pack into 4 32bit dst pixels | 628 // Pack into 4 32bit dst pixels. |
| 629 // resultLo and resultHi contain eight 16-bit components (two pixels) each. | |
| 630 // Merge into one SSE register with sixteen 8-bit values (four pixels), | |
| 631 // clamping to 255 if necessary. | |
| 592 return _mm_packus_epi16(resultLo, resultHi); | 632 return _mm_packus_epi16(resultLo, resultHi); |
| 593 } | 633 } |
| 594 | 634 |
| 595 static __m128i SkBlendLCD16Opaque_SSE2(__m128i &srci, __m128i &dst, | 635 static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst, |
| 596 __m128i &mask) { | 636 __m128i &mask) { |
| 637 // In the following comments, the components of src, dst and mask are | |
| 638 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked | |
| 639 // by an R, G, B, or A suffix. Components of one of the four pixels that | |
| 640 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for | |
| 641 // example is the blue channel of the second destination pixel. Memory | |
| 642 // layout is shown for an ARGB byte order in a color value. | |
| 643 | |
| 644 // src stores 8-bit values interleaved with zeros. | |
| 645 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) | |
| 646 // mask stores 16-bit values (shown as high and low bytes) interleaved with | |
| 647 // zeros | |
| 648 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, | |
| 649 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) | |
| 650 | |
| 597 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. | 651 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. |
| 652 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0) | |
| 598 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask), | 653 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask), |
| 599 _mm_set1_epi32(0x1F << SK_R32_SHIFT)); | 654 _mm_set1_epi32(0x1F << SK_R32_SHIFT)); |
| 600 | 655 |
| 656 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0) | |
| 601 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask), | 657 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask), |
| 602 _mm_set1_epi32(0x1F << SK_G32_SHIFT)); | 658 _mm_set1_epi32(0x1F << SK_G32_SHIFT)); |
| 603 | 659 |
| 660 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B) | |
| 604 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask), | 661 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask), |
| 605 _mm_set1_epi32(0x1F << SK_B32_SHIFT)); | 662 _mm_set1_epi32(0x1F << SK_B32_SHIFT)); |
| 606 | 663 |
| 607 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) | 664 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) |
| 665 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an | |
| 666 // 8-bit position | |
| 667 // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B, | |
| 668 // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B) | |
| 608 mask = _mm_or_si128(_mm_or_si128(r, g), b); | 669 mask = _mm_or_si128(_mm_or_si128(r, g), b); |
| 609 | 670 |
| 610 // Interleave R,G,B into the lower byte of word. | 671 // Interleave R,G,B into the lower byte of word. |
| 672 // i.e. split the sixteen 8-bit values from mask into two sets of eight | |
| 673 // 16-bit values, padded by zero. | |
| 611 __m128i maskLo, maskHi; | 674 __m128i maskLo, maskHi; |
| 675 // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0) | |
| 612 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); | 676 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); |
| 677 // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0) | |
| 613 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); | 678 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); |
| 614 | 679 |
| 615 // Upscale to 0..32 | 680 // Upscale from 0..31 to 0..32 |
| 681 // (allows replacing the division with a right shift further down) | |
| 682 // Right-shift each component by 4 and add the result back to that component, | |
| 683 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32 | |
| 616 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); | 684 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); |
| 617 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); | 685 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); |
| 618 | 686 |
| 619 // Interleave R,G,B into the lower byte of the word. | 687 // Interleave R,G,B into the lower byte of the word |
| 688 // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0) | |
| 620 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); | 689 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); |
| 690 // dstHi = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0) | |
| 621 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); | 691 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); |
| 622 | 692 |
| 623 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(srci, dstLo)); | 693 // mask = (src - dst) * mask |
| 624 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(srci, dstHi)); | 694 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo)); |
| 625 | 695 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi)); |
| 696 | |
| 697 // mask = (src - dst) * mask >> 5 | |
| 626 maskLo = _mm_srai_epi16(maskLo, 5); | 698 maskLo = _mm_srai_epi16(maskLo, 5); |
| 627 maskHi = _mm_srai_epi16(maskHi, 5); | 699 maskHi = _mm_srai_epi16(maskHi, 5); |
| 628 | 700 |
| 629 // Add two pixels into result. | 701 // Add two pixels into result. |
| 702 // result = dst + ((src - dst) * mask >> 5) | |
| 630 __m128i resultLo = _mm_add_epi16(dstLo, maskLo); | 703 __m128i resultLo = _mm_add_epi16(dstLo, maskLo); |
| 631 __m128i resultHi = _mm_add_epi16(dstHi, maskHi); | 704 __m128i resultHi = _mm_add_epi16(dstHi, maskHi); |
| 632 | 705 |
| 633 // Pack into 4 32bit dst pixels and force opaque. | 706 // Pack into 4 32bit dst pixels and force opaque. |
| 707 // resultLo and resultHi contain eight 16-bit components (two pixels) each. | |
| 708 // Merge into one SSE register with sixteen 8-bit values (four pixels), | |
| 709 // clamping to 255 if necessary. Set alpha components to 0xFF. | |
| 634 return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi), | 710 return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi), |
| 635 _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT)); | 711 _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT)); |
| 636 } | 712 } |
| 637 | 713 |
| 638 void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t src[], | 714 void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[], |
| 639 SkColor color, int width, SkPMColor) { | 715 SkColor src, int width, SkPMColor) { |
| 640 if (width <= 0) { | 716 if (width <= 0) { |
| 641 return; | 717 return; |
| 642 } | 718 } |
| 643 | 719 |
| 644 int srcA = SkColorGetA(color); | 720 int srcA = SkColorGetA(src); |
| 645 int srcR = SkColorGetR(color); | 721 int srcR = SkColorGetR(src); |
| 646 int srcG = SkColorGetG(color); | 722 int srcG = SkColorGetG(src); |
| 647 int srcB = SkColorGetB(color); | 723 int srcB = SkColorGetB(src); |
| 648 | 724 |
| 649 srcA = SkAlpha255To256(srcA); | 725 srcA = SkAlpha255To256(srcA); |
| 650 | 726 |
| 651 if (width >= 4) { | 727 if (width >= 4) { |
| 652 SkASSERT(((size_t)dst & 0x03) == 0); | 728 SkASSERT(((size_t)dst & 0x03) == 0); |
| 653 while (((size_t)dst & 0x0F) != 0) { | 729 while (((size_t)dst & 0x0F) != 0) { |
| 654 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *src); | 730 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask); |
| 655 src++; | 731 mask++; |
| 656 dst++; | 732 dst++; |
| 657 width--; | 733 width--; |
| 658 } | 734 } |
| 659 | 735 |
| 660 __m128i *d = reinterpret_cast<__m128i*>(dst); | 736 __m128i *d = reinterpret_cast<__m128i*>(dst); |
| 661 __m128i srci = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); | 737 // Set alpha to 0xFF and replicate source four times in SSE register. |
| 662 srci = _mm_unpacklo_epi8(srci, _mm_setzero_si128()); | 738 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); |
| 663 __m128i scale = _mm_set1_epi16(srcA); | 739 // Interleave with zeros to get two sets of four 16-bit values. |
| 740 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128()); | |
| 741 // Set srcA_sse to contain eight copies of srcA, padded with zero. | |
| 742 // src_sse = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) | |
| 743 __m128i srcA_sse = _mm_set1_epi16(srcA); | |
| 664 while (width >= 4) { | 744 while (width >= 4) { |
| 665 __m128i dst_pixel = _mm_load_si128(d); | 745 // Load four destination pixels into dst_sse. |
| 666 __m128i mask_pixel = _mm_loadl_epi64( | 746 __m128i dst_sse = _mm_load_si128(d); |
| 667 reinterpret_cast<const __m128i*>(src)); | 747 // Load four 16-bit masks into lower half of mask_sse. |
| 668 | 748 __m128i mask_sse = _mm_loadl_epi64( |
| 669 // Check whether mask_pixels are equal to 0 and get the highest bit | 749 reinterpret_cast<const __m128i*>(mask)); |
| 670 // of each byte of result, if mask pixes are all zero, we will get | 750 |
| 751 // Check whether masks are equal to 0 and get the highest bit | |
| 752 // of each byte of result, if masks are all zero, we will get | |
| 671 // pack_cmp to 0xFFFF | 753 // pack_cmp to 0xFFFF |
| 672 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_pixel, | 754 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse, |
| 673 _mm_setzero_si128())); | 755 _mm_setzero_si128())); |
| 674 | 756 |
| 675 // if mask pixels are not all zero, we will blend the dst pixels | 757 // if mask pixels are not all zero, we will blend the dst pixels |
| 676 if (pack_cmp != 0xFFFF) { | 758 if (pack_cmp != 0xFFFF) { |
| 677 // Unpack 4 16bit mask pixels to | 759 // Unpack 4 16bit mask pixels to |
| 678 // (p0, 0, p1, 0, p2, 0, p3, 0) | 760 // (m0, 0, m1, 0, m2, 0, m3, 0) |
| 679 mask_pixel = _mm_unpacklo_epi16(mask_pixel, | 761 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, |
| 680 _mm_setzero_si128()); | 762 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) |
| 763 mask_sse = _mm_unpacklo_epi16(mask_sse, | |
| 764 _mm_setzero_si128()); | |
| 681 | 765 |
| 682 // Process 4 32bit dst pixels | 766 // Process 4 32bit dst pixels |
| 683 __m128i result = SkBlendLCD16_SSE2(srci, dst_pixel, | 767 __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse, |
| 684 mask_pixel, scale); | 768 mask_sse, srcA_sse); |
| 685 _mm_store_si128(d, result); | 769 _mm_store_si128(d, result); |
| 686 } | 770 } |
| 687 | 771 |
| 688 d++; | 772 d++; |
| 689 src += 4; | 773 mask += 4; |
| 690 width -= 4; | 774 width -= 4; |
| 691 } | 775 } |
| 692 | 776 |
| 693 dst = reinterpret_cast<SkPMColor*>(d); | 777 dst = reinterpret_cast<SkPMColor*>(d); |
| 694 } | 778 } |
| 695 | 779 |
| 696 while (width > 0) { | 780 while (width > 0) { |
| 697 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *src); | 781 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask); |
| 698 src++; | 782 mask++; |
| 699 dst++; | 783 dst++; |
| 700 width--; | 784 width--; |
| 701 } | 785 } |
| 702 } | 786 } |
| 703 | 787 |
| 704 void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t src[], | 788 void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[], |
| 705 SkColor color, int width, SkPMColor opaqueDst) { | 789 SkColor src, int width, SkPMColor opaqueDst) { |
| 706 if (width <= 0) { | 790 if (width <= 0) { |
| 707 return; | 791 return; |
| 708 } | 792 } |
| 709 | 793 |
| 710 int srcR = SkColorGetR(color); | 794 int srcR = SkColorGetR(src); |
| 711 int srcG = SkColorGetG(color); | 795 int srcG = SkColorGetG(src); |
| 712 int srcB = SkColorGetB(color); | 796 int srcB = SkColorGetB(src); |
| 713 | 797 |
| 714 if (width >= 4) { | 798 if (width >= 4) { |
| 715 SkASSERT(((size_t)dst & 0x03) == 0); | 799 SkASSERT(((size_t)dst & 0x03) == 0); |
| 716 while (((size_t)dst & 0x0F) != 0) { | 800 while (((size_t)dst & 0x0F) != 0) { |
| 717 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *src, opaqueDst); | 801 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst); |
| 718 src++; | 802 mask++; |
| 719 dst++; | 803 dst++; |
| 720 width--; | 804 width--; |
| 721 } | 805 } |
| 722 | 806 |
| 723 __m128i *d = reinterpret_cast<__m128i*>(dst); | 807 __m128i *d = reinterpret_cast<__m128i*>(dst); |
| 724 __m128i srci = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); | 808 // Set alpha to 0xFF and replicate source four times in SSE register. |
| 725 srci = _mm_unpacklo_epi8(srci, _mm_setzero_si128()); | 809 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); |
| 810 // Interleave with zeros to get two sets of four 16-bit values. | |
| 811 // src_sse = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) | |
| 812 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128()); | |
| 726 while (width >= 4) { | 813 while (width >= 4) { |
| 727 __m128i dst_pixel = _mm_load_si128(d); | 814 // Load four destination pixels into dst_sse. |
| 728 __m128i mask_pixel = _mm_loadl_epi64( | 815 __m128i dst_sse = _mm_load_si128(d); |
| 729 reinterpret_cast<const __m128i*>(src)); | 816 // Load four 16-bit masks into lower half of mask_sse. |
| 817 __m128i mask_sse = _mm_loadl_epi64( | |
| 818 reinterpret_cast<const __m128i*>(mask)); | |
| 730 | 819 |
| 731 // Check whether mask_pixels are equal to 0 and get the highest bit | 820 // Check whether masks are equal to 0 and get the highest bit |
| 732 // of each byte of result, if mask pixes are all zero, we will get | 821 // of each byte of result, if masks are all zero, we will get |
| 733 // pack_cmp to 0xFFFF | 822 // pack_cmp to 0xFFFF |
| 734 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_pixel, | 823 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse, |
| 735 _mm_setzero_si128())); | 824 _mm_setzero_si128())); |
| 736 | 825 |
| 737 // if mask pixels are not all zero, we will blend the dst pixels | 826 // if mask pixels are not all zero, we will blend the dst pixels |
| 738 if (pack_cmp != 0xFFFF) { | 827 if (pack_cmp != 0xFFFF) { |
| 739 // Unpack 4 16bit mask pixels to | 828 // Unpack 4 16bit mask pixels to |
| 740 // (p0, 0, p1, 0, p2, 0, p3, 0) | 829 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, |
| 741 mask_pixel = _mm_unpacklo_epi16(mask_pixel, | 830 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) |
| 742 _mm_setzero_si128()); | 831 mask_sse = _mm_unpacklo_epi16(mask_sse, |
| 832 _mm_setzero_si128()); | |
| 743 | 833 |
| 744 // Process 4 32bit dst pixels | 834 // Process 4 32bit dst pixels |
| 745 __m128i result = SkBlendLCD16Opaque_SSE2(srci, dst_pixel, | 835 __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse, |
| 746 mask_pixel); | 836 mask_sse); |
| 747 _mm_store_si128(d, result); | 837 _mm_store_si128(d, result); |
| 748 } | 838 } |
| 749 | 839 |
| 750 d++; | 840 d++; |
| 751 src += 4; | 841 mask += 4; |
| 752 width -= 4; | 842 width -= 4; |
| 753 } | 843 } |
| 754 | 844 |
| 755 dst = reinterpret_cast<SkPMColor*>(d); | 845 dst = reinterpret_cast<SkPMColor*>(d); |
| 756 } | 846 } |
| 757 | 847 |
| 758 while (width > 0) { | 848 while (width > 0) { |
| 759 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *src, opaqueDst); | 849 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst); |
| 760 src++; | 850 mask++; |
| 761 dst++; | 851 dst++; |
| 762 width--; | 852 width--; |
| 763 } | 853 } |
| 764 } | 854 } |
| OLD | NEW |