src/opts/SkBlitRow_opts_SSE2.cpp - Issue 17847010: Commented SSE blend functions and cleaned-up variable naming.

Side by Side Diff: src/opts/SkBlitRow_opts_SSE2.cpp

Issue 17847010: Commented SSE blend functions and cleaned-up variable naming. (Closed) Base URL: https://skia.googlecode.com/svn/trunk

Patch Set: Created 7 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 /*	1 /*

2 * Copyright 2012 The Android Open Source Project	2 * Copyright 2012 The Android Open Source Project

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8	8

9 #include "SkBlitRow_opts_SSE2.h"	9 #include "SkBlitRow_opts_SSE2.h"

10 #include "SkBitmapProcState_opts_SSE2.h"	10 #include "SkBitmapProcState_opts_SSE2.h"

(...skipping 526 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
537 #endif	537 #endif

538	538

539 #if SK_B16x5_B32x5_SHIFT == 0	539 #if SK_B16x5_B32x5_SHIFT == 0

540 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)	540 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)

541 #elif SK_B16x5_B32x5_SHIFT > 0	541 #elif SK_B16x5_B32x5_SHIFT > 0

542 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))	542 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))

543 #else	543 #else

544 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))	544 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))

545 #endif	545 #endif

546	546

547 static __m128i SkBlendLCD16_SSE2(__m128i &srci, __m128i &dst,	547 static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,

548 __m128i &mask, __m128i &scale) {	548 __m128i &mask, __m128i &srcA) {

	549 // In the following comments, the components of src, dst and mask are

	550 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked

	551 // by an R, G, B, or A suffix. Components of one of the four pixels that

	552 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
	alokp 2013/06/26 22:42:11 IMO positional suffix is not necessary because you already document the full layout of __m128. ernstm 2013/06/26 23:12:16 On 2013/06/26 22:42:11, Alok Priyadarshi wrote: > IMO positional suffix is not necessary because you already document the full layout of __m128. I had added the positional suffix for the cases where the four pixels are split into two SSE registers, e.g. dstLo and dstHi.
	553 // example is the blue channel of the second destination pixel. Memory

	554 // layout is shown for an ARGB byte order in a color value.

	555

	556 // src and srcA store 8-bit values interleaved with zeros.

	557 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)

	558 // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,

	559 // srcA, 0, srcA, 0, srcA, 0, srcA, 0)

	560 // mask stores 16-bit values (compressed three channels) interleaved with zeros.

	561 // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.

	562 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,

	563 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)

	564

549 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.	565 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.

	566 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)

550 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),	567 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),

551 _mm_set1_epi32(0x1F << SK_R32_SHIFT));	568 _mm_set1_epi32(0x1F << SK_R32_SHIFT));

552	569

	570 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)

553 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),	571 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),

554 _mm_set1_epi32(0x1F << SK_G32_SHIFT));	572 _mm_set1_epi32(0x1F << SK_G32_SHIFT));

555	573

	574 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)

556 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),	575 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),

557 _mm_set1_epi32(0x1F << SK_B32_SHIFT));	576 _mm_set1_epi32(0x1F << SK_B32_SHIFT));

558	577

559 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)	578 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)

	579 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an

	580 // 8-bit position

	581 // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,

	582 // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B)

560 mask = _mm_or_si128(_mm_or_si128(r, g), b);	583 mask = _mm_or_si128(_mm_or_si128(r, g), b);

561	584

562 // Interleave R,G,B into the lower byte of word.	585 // Interleave R,G,B into the lower byte of word.

	586 // i.e. split the sixteen 8-bit values from mask into two sets of eight

	587 // 16-bit values, padded by zero.

563 __m128i maskLo, maskHi;	588 __m128i maskLo, maskHi;

	589 // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)

564 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());	590 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());

	591 // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)

565 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());	592 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());

566	593

567 // Upscale to 0..32	594 // Upscale from 0..31 to 0..32

	595 // (allows replacing division with a right-shift further down)

	596 // Right-shift each component by 4 and add the result back to that component,

	597 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32

568 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));	598 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));

569 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));	599 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));

570	600

571 maskLo = _mm_mullo_epi16(maskLo, scale);	601 // Multiply each component of maskLo and maskHi by srcA

572 maskHi = _mm_mullo_epi16(maskHi, scale);	602 maskLo = _mm_mullo_epi16(maskLo, srcA);

573	603 maskHi = _mm_mullo_epi16(maskHi, srcA);

	604

	605 // Right shift mask components by 8 (divide by 256)

574 maskLo = _mm_srli_epi16(maskLo, 8);	606 maskLo = _mm_srli_epi16(maskLo, 8);

575 maskHi = _mm_srli_epi16(maskHi, 8);	607 maskHi = _mm_srli_epi16(maskHi, 8);

576	608

577 // Interleave R,G,B into the lower byte of the word.	609 // Interleave R,G,B into the lower byte of the word

	610 // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)

578 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());	611 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());

	612 // dstHi = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)

579 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());	613 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());

580	614

581 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(srci, dstLo));	615 // mask = (src - dst) * mask

582 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(srci, dstHi));	616 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));

583	617 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));

	618

	619 // mask = (src - dst) * mask >> 5

584 maskLo = _mm_srai_epi16(maskLo, 5);	620 maskLo = _mm_srai_epi16(maskLo, 5);

585 maskHi = _mm_srai_epi16(maskHi, 5);	621 maskHi = _mm_srai_epi16(maskHi, 5);

586	622

587 // Add two pixels into result.	623 // Add two pixels into result.

	624 // result = dst + ((src - dst) * mask >> 5)

588 __m128i resultLo = _mm_add_epi16(dstLo, maskLo);	625 __m128i resultLo = _mm_add_epi16(dstLo, maskLo);

589 __m128i resultHi = _mm_add_epi16(dstHi, maskHi);	626 __m128i resultHi = _mm_add_epi16(dstHi, maskHi);

590	627

591 // Pack into 4 32bit dst pixels	628 // Pack into 4 32bit dst pixels.

	629 // resultLo and resultHi contain eight 16-bit components (two pixels) each.

	630 // Merge into one SSE register with sixteen 8-bit values (four pixels),

	631 // clamping to 255 if necessary.

592 return _mm_packus_epi16(resultLo, resultHi);	632 return _mm_packus_epi16(resultLo, resultHi);

593 }	633 }

594	634

595 static __m128i SkBlendLCD16Opaque_SSE2(__m128i &srci, __m128i &dst,	635 static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,

596 __m128i &mask) {	636 __m128i &mask) {

	637 // In the following comments, the components of src, dst and mask are

	638 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked

	639 // by an R, G, B, or A suffix. Components of one of the four pixels that

	640 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for

	641 // example is the blue channel of the second destination pixel. Memory

	642 // layout is shown for an ARGB byte order in a color value.

	643

	644 // src stores 8-bit values interleaved with zeros.

	645 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)

	646 // mask stores 16-bit values (shown as high and low bytes) interleaved with

	647 // zeros

	648 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,

	649 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)

	650

597 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.	651 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.

	652 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)

598 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),	653 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),

599 _mm_set1_epi32(0x1F << SK_R32_SHIFT));	654 _mm_set1_epi32(0x1F << SK_R32_SHIFT));

600	655

	656 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)

601 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),	657 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),

602 _mm_set1_epi32(0x1F << SK_G32_SHIFT));	658 _mm_set1_epi32(0x1F << SK_G32_SHIFT));

603	659

	660 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)

604 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),	661 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),

605 _mm_set1_epi32(0x1F << SK_B32_SHIFT));	662 _mm_set1_epi32(0x1F << SK_B32_SHIFT));

606	663

607 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)	664 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)

	665 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an

	666 // 8-bit position

	667 // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,

	668 // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B)

608 mask = _mm_or_si128(_mm_or_si128(r, g), b);	669 mask = _mm_or_si128(_mm_or_si128(r, g), b);

609	670

610 // Interleave R,G,B into the lower byte of word.	671 // Interleave R,G,B into the lower byte of word.

	672 // i.e. split the sixteen 8-bit values from mask into two sets of eight

	673 // 16-bit values, padded by zero.

611 __m128i maskLo, maskHi;	674 __m128i maskLo, maskHi;

	675 // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)

612 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());	676 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());

	677 // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)

613 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());	678 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());

614	679

615 // Upscale to 0..32	680 // Upscale from 0..31 to 0..32

	681 // (allows replacing division with a right-shift further down)

	682 // Right-shift each component by 4 and add the result back to that component,

	683 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32

616 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));	684 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));

617 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));	685 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));

618	686

619 // Interleave R,G,B into the lower byte of the word.	687 // Interleave R,G,B into the lower byte of the word

	688 // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)

620 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());	689 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());

	690 // dstHi = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)

621 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());	691 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());

622	692

623 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(srci, dstLo));	693 // mask = (src - dst) * mask

624 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(srci, dstHi));	694 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));

625	695 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));

	696

	697 // mask = (src - dst) * mask >> 5

626 maskLo = _mm_srai_epi16(maskLo, 5);	698 maskLo = _mm_srai_epi16(maskLo, 5);

627 maskHi = _mm_srai_epi16(maskHi, 5);	699 maskHi = _mm_srai_epi16(maskHi, 5);

628	700

629 // Add two pixels into result.	701 // Add two pixels into result.

	702 // result = dst + ((src - dst) * mask >> 5)

630 __m128i resultLo = _mm_add_epi16(dstLo, maskLo);	703 __m128i resultLo = _mm_add_epi16(dstLo, maskLo);

631 __m128i resultHi = _mm_add_epi16(dstHi, maskHi);	704 __m128i resultHi = _mm_add_epi16(dstHi, maskHi);

632	705

633 // Pack into 4 32bit dst pixels and force opaque.	706 // Pack into 4 32bit dst pixels and force opaque.

	707 // resultLo and resultHi contain eight 16-bit components (two pixels) each.

	708 // Merge into one SSE register with sixteen 8-bit values (four pixels),

	709 // clamping to 255 if necessary. Set alpha components to 0xFF.

634 return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),	710 return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),

635 _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));	711 _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));

636 }	712 }

637	713

638 void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t src[],	714 void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],

639 SkColor color, int width, SkPMColor) {	715 SkColor src, int width, SkPMColor) {

640 if (width <= 0) {	716 if (width <= 0) {

641 return;	717 return;

642 }	718 }

643	719

644 int srcA = SkColorGetA(color);	720 int srcA = SkColorGetA(src);

645 int srcR = SkColorGetR(color);	721 int srcR = SkColorGetR(src);

646 int srcG = SkColorGetG(color);	722 int srcG = SkColorGetG(src);

647 int srcB = SkColorGetB(color);	723 int srcB = SkColorGetB(src);

648	724

649 srcA = SkAlpha255To256(srcA);	725 srcA = SkAlpha255To256(srcA);

650	726

651 if (width >= 4) {	727 if (width >= 4) {

652 SkASSERT(((size_t)dst & 0x03) == 0);	728 SkASSERT(((size_t)dst & 0x03) == 0);

653 while (((size_t)dst & 0x0F) != 0) {	729 while (((size_t)dst & 0x0F) != 0) {

654 dst = SkBlendLCD16(srcA, srcR, srcG, srcB, dst, *src);	730 dst = SkBlendLCD16(srcA, srcR, srcG, srcB, dst, *mask);

655 src++;	731 mask++;

656 dst++;	732 dst++;

657 width--;	733 width--;

658 }	734 }

659	735

660 __m128i *d = reinterpret_cast<__m128i*>(dst);	736 __m128i *d = reinterpret_cast<__m128i*>(dst);

661 __m128i srci = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));	737 // Set alpha to 0xFF and replicate source four times in SSE register.

662 srci = _mm_unpacklo_epi8(srci, _mm_setzero_si128());	738 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));

663 __m128i scale = _mm_set1_epi16(srcA);	739 // Interleave with zeros to get two sets of four 16-bit values.

	740 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());

	741 // Set srcA_sse to contain eight copies of srcA, padded with zero.

	742 // src_sse = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0 )

	743 __m128i srcA_sse = _mm_set1_epi16(srcA);

664 while (width >= 4) {	744 while (width >= 4) {

665 __m128i dst_pixel = _mm_load_si128(d);	745 // Load four destination pixels into dst_sse.

666 __m128i mask_pixel = _mm_loadl_epi64(	746 __m128i dst_sse = _mm_load_si128(d);

667 reinterpret_cast<const __m128i*>(src));	747 // Load four 16-bit masks into lower half of mask_sse.

668	748 __m128i mask_sse = _mm_loadl_epi64(

669 // Check whether mask_pixels are equal to 0 and get the highest bit	749 reinterpret_cast<const __m128i*>(mask));

670 // of each byte of result, if mask pixes are all zero, we will get	750

	751 // Check whether masks are equal to 0 and get the highest bit

	752 // of each byte of result, if masks are all zero, we will get

671 // pack_cmp to 0xFFFF	753 // pack_cmp to 0xFFFF

672 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_pixel,	754 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,

673 _mm_setzero_si128()));	755 _mm_setzero_si128()));

674	756

675 // if mask pixels are not all zero, we will blend the dst pixels	757 // if mask pixels are not all zero, we will blend the dst pixels

676 if (pack_cmp != 0xFFFF) {	758 if (pack_cmp != 0xFFFF) {

677 // Unpack 4 16bit mask pixels to	759 // Unpack 4 16bit mask pixels to

678 // (p0, 0, p1, 0, p2, 0, p3, 0)	760 // (m0, 0, m1, 0, m2, 0, m3, 0)

679 mask_pixel = _mm_unpacklo_epi16(mask_pixel,	761 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,

680 _mm_setzero_si128());	762 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)

	763 mask_sse = _mm_unpacklo_epi16(mask_sse,

	764 _mm_setzero_si128());

681	765

682 // Process 4 32bit dst pixels	766 // Process 4 32bit dst pixels

683 __m128i result = SkBlendLCD16_SSE2(srci, dst_pixel,	767 __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,

684 mask_pixel, scale);	768 mask_sse, srcA_sse);

685 _mm_store_si128(d, result);	769 _mm_store_si128(d, result);

686 }	770 }

687	771

688 d++;	772 d++;

689 src += 4;	773 mask += 4;

690 width -= 4;	774 width -= 4;

691 }	775 }

692	776

693 dst = reinterpret_cast<SkPMColor*>(d);	777 dst = reinterpret_cast<SkPMColor*>(d);

694 }	778 }

695	779

696 while (width > 0) {	780 while (width > 0) {

697 dst = SkBlendLCD16(srcA, srcR, srcG, srcB, dst, *src);	781 dst = SkBlendLCD16(srcA, srcR, srcG, srcB, dst, *mask);

698 src++;	782 mask++;

699 dst++;	783 dst++;

700 width--;	784 width--;

701 }	785 }

702 }	786 }

703	787

704 void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t src[],	788 void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],

705 SkColor color, int width, SkPMColor opaqueDst) {	789 SkColor src, int width, SkPMColor opaqueDst) {

706 if (width <= 0) {	790 if (width <= 0) {

707 return;	791 return;

708 }	792 }

709	793

710 int srcR = SkColorGetR(color);	794 int srcR = SkColorGetR(src);

711 int srcG = SkColorGetG(color);	795 int srcG = SkColorGetG(src);

712 int srcB = SkColorGetB(color);	796 int srcB = SkColorGetB(src);

713	797

714 if (width >= 4) {	798 if (width >= 4) {

715 SkASSERT(((size_t)dst & 0x03) == 0);	799 SkASSERT(((size_t)dst & 0x03) == 0);

716 while (((size_t)dst & 0x0F) != 0) {	800 while (((size_t)dst & 0x0F) != 0) {

717 dst = SkBlendLCD16Opaque(srcR, srcG, srcB, dst, *src, opaqueDst);	801 dst = SkBlendLCD16Opaque(srcR, srcG, srcB, dst, *mask, opaqueDst);

718 src++;	802 mask++;

719 dst++;	803 dst++;

720 width--;	804 width--;

721 }	805 }

722	806

723 __m128i *d = reinterpret_cast<__m128i*>(dst);	807 __m128i *d = reinterpret_cast<__m128i*>(dst);

724 __m128i srci = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));	808 // Set alpha to 0xFF and replicate source four times in SSE register.

725 srci = _mm_unpacklo_epi8(srci, _mm_setzero_si128());	809 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));

	810 // Interleave with zeros to get two sets of four 16-bit values.

	811 // src_sse = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0 )

	812 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());

726 while (width >= 4) {	813 while (width >= 4) {

727 __m128i dst_pixel = _mm_load_si128(d);	814 // Load four destination pixels into dst_sse.

728 __m128i mask_pixel = _mm_loadl_epi64(	815 __m128i dst_sse = _mm_load_si128(d);

729 reinterpret_cast<const __m128i*>(src));	816 // Load four 16-bit masks into lower half of mask_sse.

	817 __m128i mask_sse = _mm_loadl_epi64(

	818 reinterpret_cast<const __m128i*>(mask));

730	819

731 // Check whether mask_pixels are equal to 0 and get the highest bit	820 // Check whether masks are equal to 0 and get the highest bit

732 // of each byte of result, if mask pixes are all zero, we will get	821 // of each byte of result, if masks are all zero, we will get

733 // pack_cmp to 0xFFFF	822 // pack_cmp to 0xFFFF

734 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_pixel,	823 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,

735 _mm_setzero_si128()));	824 _mm_setzero_si128()));

736	825

737 // if mask pixels are not all zero, we will blend the dst pixels	826 // if mask pixels are not all zero, we will blend the dst pixels

738 if (pack_cmp != 0xFFFF) {	827 if (pack_cmp != 0xFFFF) {

739 // Unpack 4 16bit mask pixels to	828 // Unpack 4 16bit mask pixels to

740 // (p0, 0, p1, 0, p2, 0, p3, 0)	829 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,

741 mask_pixel = _mm_unpacklo_epi16(mask_pixel,	830 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)

742 _mm_setzero_si128());	831 mask_sse = _mm_unpacklo_epi16(mask_sse,

	832 _mm_setzero_si128());

743	833

744 // Process 4 32bit dst pixels	834 // Process 4 32bit dst pixels

745 __m128i result = SkBlendLCD16Opaque_SSE2(srci, dst_pixel,	835 __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,

746 mask_pixel);	836 mask_sse);

747 _mm_store_si128(d, result);	837 _mm_store_si128(d, result);

748 }	838 }

749	839

750 d++;	840 d++;

751 src += 4;	841 mask += 4;

752 width -= 4;	842 width -= 4;

753 }	843 }

754	844

755 dst = reinterpret_cast<SkPMColor*>(d);	845 dst = reinterpret_cast<SkPMColor*>(d);

756 }	846 }

757	847

758 while (width > 0) {	848 while (width > 0) {

759 dst = SkBlendLCD16Opaque(srcR, srcG, srcB, dst, *src, opaqueDst);	849 dst = SkBlendLCD16Opaque(srcR, srcG, srcB, dst, *mask, opaqueDst);

760 src++;	850 mask++;

761 dst++;	851 dst++;

762 width--;	852 width--;

763 }	853 }

764 }	854 }

OLD	NEW

« no previous file with comments | « include/core/SkColorPriv.h ('k') | no next file » | no next file with comments »