| Index: source/row_gcc.cc
|
| diff --git a/source/row_gcc.cc b/source/row_gcc.cc
|
| index bf9ddde42c3a4c0e759d831f90c06996b49e8419..86810514d1573cef1f6ffe991002d0888ecba2e8 100644
|
| --- a/source/row_gcc.cc
|
| +++ b/source/row_gcc.cc
|
| @@ -2860,6 +2860,47 @@ void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) {
|
| }
|
| #endif // HAS_ARGBEXTRACTALPHAROW_SSE2
|
|
|
| +#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
|
| +static const uvec8 kShuffleAlphaShort_AVX2 = {
|
| + 3u, 128u, 128u, 128u, 7u, 128u, 128u, 128u,
|
| + 11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u
|
| +};
|
| +
|
| +void ARGBExtractAlphaRow_AVX2(const uint8* src_argb, uint8* dst_a, int width) {
|
| + asm volatile (
|
| + "vmovdqa %3,%%ymm4 \n"
|
| + "vbroadcastf128 %4,%%ymm5 \n"
|
| + LABELALIGN
|
| + "1: \n"
|
| + "vmovdqu " MEMACCESS(0) ", %%ymm0 \n"
|
| + "vmovdqu " MEMACCESS2(0x20, 0) ", %%ymm1 \n"
|
| + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0
|
| + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
|
| + "vmovdqu " MEMACCESS2(0x40, 0) ", %%ymm2 \n"
|
| + "vmovdqu " MEMACCESS2(0x60, 0) ", %%ymm3 \n"
|
| + "lea " MEMLEA(0x80, 0) ", %0 \n"
|
| + "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates
|
| + "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
|
| + "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
|
| + "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates
|
| + "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
|
| + "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate.
|
| + "vmovdqu %%ymm0," MEMACCESS(1) " \n"
|
| + "lea " MEMLEA(0x20,1) ",%1 \n"
|
| + "sub $0x20, %2 \n"
|
| + "jg 1b \n"
|
| + "vzeroupper \n"
|
| + : "+r"(src_argb), // %0
|
| + "+r"(dst_a), // %1
|
| + "+rm"(width) // %2
|
| + : "m"(kPermdARGBToY_AVX), // %3
|
| + "m"(kShuffleAlphaShort_AVX2) // %4
|
| + : "memory", "cc"
|
| + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
|
| + );
|
| +}
|
| +#endif // HAS_ARGBEXTRACTALPHAROW_AVX2
|
| +
|
| #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
|
| // width in pixels
|
| void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
|
|
|