OLD | NEW |
1 // VERSION 2 | 1 // VERSION 2 |
2 /* | 2 /* |
3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
4 * | 4 * |
5 * Use of this source code is governed by a BSD-style license | 5 * Use of this source code is governed by a BSD-style license |
6 * that can be found in the LICENSE file in the root of the source | 6 * that can be found in the LICENSE file in the root of the source |
7 * tree. An additional intellectual property rights grant can be found | 7 * tree. An additional intellectual property rights grant can be found |
8 * in the file PATENTS. All contributing project authors may | 8 * in the file PATENTS. All contributing project authors may |
9 * be found in the AUTHORS file in the root of the source tree. | 9 * be found in the AUTHORS file in the root of the source tree. |
10 */ | 10 */ |
(...skipping 1793 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1804 } | 1804 } |
1805 #endif // HAS_I422ALPHATOARGBROW_SSSE3 | 1805 #endif // HAS_I422ALPHATOARGBROW_SSSE3 |
1806 | 1806 |
1807 #ifdef HAS_I411TOARGBROW_SSSE3 | 1807 #ifdef HAS_I411TOARGBROW_SSSE3 |
1808 void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, | 1808 void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, |
1809 const uint8* u_buf, | 1809 const uint8* u_buf, |
1810 const uint8* v_buf, | 1810 const uint8* v_buf, |
1811 uint8* dst_argb, | 1811 uint8* dst_argb, |
1812 const struct YuvConstants* yuvconstants, | 1812 const struct YuvConstants* yuvconstants, |
1813 int width) { | 1813 int width) { |
1814 int temp = 0; | 1814 int temp; |
1815 asm volatile ( | 1815 asm volatile ( |
1816 YUVTORGB_SETUP(yuvconstants) | 1816 YUVTORGB_SETUP(yuvconstants) |
1817 "sub %[u_buf],%[v_buf] \n" | 1817 "sub %[u_buf],%[v_buf] \n" |
1818 "pcmpeqb %%xmm5,%%xmm5 \n" | 1818 "pcmpeqb %%xmm5,%%xmm5 \n" |
1819 LABELALIGN | 1819 LABELALIGN |
1820 "1: \n" | 1820 "1: \n" |
1821 READYUV411_TEMP | 1821 READYUV411_TEMP |
1822 YUVTORGB(yuvconstants) | 1822 YUVTORGB(yuvconstants) |
1823 STOREARGB | 1823 STOREARGB |
1824 "subl $0x8,%[width] \n" | 1824 "subl $0x8,%[width] \n" |
1825 "jg 1b \n" | 1825 "jg 1b \n" |
1826 : [y_buf]"+r"(y_buf), // %[y_buf] | 1826 : [y_buf]"+r"(y_buf), // %[y_buf] |
1827 [u_buf]"+r"(u_buf), // %[u_buf] | 1827 [u_buf]"+r"(u_buf), // %[u_buf] |
1828 [v_buf]"+r"(v_buf), // %[v_buf] | 1828 [v_buf]"+r"(v_buf), // %[v_buf] |
1829 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 1829 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
1830 [temp]"+r"(temp), // %[temp] | 1830 [temp]"=&r"(temp), // %[temp] |
1831 #if defined(__i386__) && defined(__pic__) | 1831 #if defined(__i386__) && defined(__pic__) |
1832 [width]"+m"(width) // %[width] | 1832 [width]"+m"(width) // %[width] |
1833 #else | 1833 #else |
1834 [width]"+rm"(width) // %[width] | 1834 [width]"+rm"(width) // %[width] |
1835 #endif | 1835 #endif |
1836 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | 1836 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
1837 : "memory", "cc", NACL_R14 YUVTORGB_REGS | 1837 : "memory", "cc", NACL_R14 YUVTORGB_REGS |
1838 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 1838 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
1839 ); | 1839 ); |
1840 } | 1840 } |
1841 #endif | 1841 #endif |
1842 | 1842 |
1843 void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, | 1843 void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, |
1844 const uint8* uv_buf, | 1844 const uint8* uv_buf, |
(...skipping 1880 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3725 : "memory", "cc" | 3725 : "memory", "cc" |
3726 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" | 3726 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" |
3727 ); | 3727 ); |
3728 } | 3728 } |
3729 #endif // HAS_ARGBATTENUATEROW_AVX2 | 3729 #endif // HAS_ARGBATTENUATEROW_AVX2 |
3730 | 3730 |
3731 #ifdef HAS_ARGBUNATTENUATEROW_SSE2 | 3731 #ifdef HAS_ARGBUNATTENUATEROW_SSE2 |
3732 // Unattenuate 4 pixels at a time. | 3732 // Unattenuate 4 pixels at a time. |
3733 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, | 3733 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, |
3734 int width) { | 3734 int width) { |
3735 uintptr_t alpha = 0; | 3735 uintptr_t alpha; |
3736 asm volatile ( | 3736 asm volatile ( |
3737 // 4 pixel loop. | 3737 // 4 pixel loop. |
3738 LABELALIGN | 3738 LABELALIGN |
3739 "1: \n" | 3739 "1: \n" |
3740 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 3740 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
3741 "movzb " MEMACCESS2(0x03,0) ",%3 \n" | 3741 "movzb " MEMACCESS2(0x03,0) ",%3 \n" |
3742 "punpcklbw %%xmm0,%%xmm0 \n" | 3742 "punpcklbw %%xmm0,%%xmm0 \n" |
3743 MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2 | 3743 MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2 |
3744 "movzb " MEMACCESS2(0x07,0) ",%3 \n" | 3744 "movzb " MEMACCESS2(0x07,0) ",%3 \n" |
3745 MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3 | 3745 MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3 |
(...skipping 10 matching lines...) Expand all Loading... |
3756 "pshuflw $0x40,%%xmm2,%%xmm2 \n" | 3756 "pshuflw $0x40,%%xmm2,%%xmm2 \n" |
3757 "pshuflw $0x40,%%xmm3,%%xmm3 \n" | 3757 "pshuflw $0x40,%%xmm3,%%xmm3 \n" |
3758 "movlhps %%xmm3,%%xmm2 \n" | 3758 "movlhps %%xmm3,%%xmm2 \n" |
3759 "pmulhuw %%xmm2,%%xmm1 \n" | 3759 "pmulhuw %%xmm2,%%xmm1 \n" |
3760 "lea " MEMLEA(0x10,0) ",%0 \n" | 3760 "lea " MEMLEA(0x10,0) ",%0 \n" |
3761 "packuswb %%xmm1,%%xmm0 \n" | 3761 "packuswb %%xmm1,%%xmm0 \n" |
3762 "movdqu %%xmm0," MEMACCESS(1) " \n" | 3762 "movdqu %%xmm0," MEMACCESS(1) " \n" |
3763 "lea " MEMLEA(0x10,1) ",%1 \n" | 3763 "lea " MEMLEA(0x10,1) ",%1 \n" |
3764 "sub $0x4,%2 \n" | 3764 "sub $0x4,%2 \n" |
3765 "jg 1b \n" | 3765 "jg 1b \n" |
3766 : "+r"(src_argb), // %0 | 3766 : "+r"(src_argb), // %0 |
3767 "+r"(dst_argb), // %1 | 3767 "+r"(dst_argb), // %1 |
3768 "+r"(width), // %2 | 3768 "+r"(width), // %2 |
3769 "+r"(alpha) // %3 | 3769 "=&r"(alpha) // %3 |
3770 : "r"(fixed_invtbl8) // %4 | 3770 : "r"(fixed_invtbl8) // %4 |
3771 : "memory", "cc", NACL_R14 | 3771 : "memory", "cc", NACL_R14 |
3772 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 3772 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
3773 ); | 3773 ); |
3774 } | 3774 } |
3775 #endif // HAS_ARGBUNATTENUATEROW_SSE2 | 3775 #endif // HAS_ARGBUNATTENUATEROW_SSE2 |
3776 | 3776 |
3777 #ifdef HAS_ARGBUNATTENUATEROW_AVX2 | 3777 #ifdef HAS_ARGBUNATTENUATEROW_AVX2 |
3778 // Shuffle table duplicating alpha. | 3778 // Shuffle table duplicating alpha. |
3779 static const uvec8 kUnattenShuffleAlpha_AVX2 = { | 3779 static const uvec8 kUnattenShuffleAlpha_AVX2 = { |
3780 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u | 3780 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u |
3781 }; | 3781 }; |
3782 // Unattenuate 8 pixels at a time. | 3782 // Unattenuate 8 pixels at a time. |
3783 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, | 3783 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, |
3784 int width) { | 3784 int width) { |
3785 uintptr_t alpha = 0; | 3785 uintptr_t alpha; |
3786 asm volatile ( | 3786 asm volatile ( |
3787 "sub %0,%1 \n" | 3787 "sub %0,%1 \n" |
3788 "vbroadcastf128 %5,%%ymm5 \n" | 3788 "vbroadcastf128 %5,%%ymm5 \n" |
3789 | 3789 |
3790 // 8 pixel loop. | 3790 // 8 pixel loop. |
3791 LABELALIGN | 3791 LABELALIGN |
3792 "1: \n" | 3792 "1: \n" |
3793 // replace VPGATHER | 3793 // replace VPGATHER |
3794 "movzb " MEMACCESS2(0x03,0) ",%3 \n" | 3794 "movzb " MEMACCESS2(0x03,0) ",%3 \n" |
3795 MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0 | 3795 MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0 |
(...skipping 28 matching lines...) Expand all Loading... |
3824 "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" | 3824 "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" |
3825 "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" | 3825 "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" |
3826 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" | 3826 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" |
3827 "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" | 3827 "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" |
3828 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" | 3828 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" |
3829 MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1) | 3829 MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1) |
3830 "lea " MEMLEA(0x20,0) ",%0 \n" | 3830 "lea " MEMLEA(0x20,0) ",%0 \n" |
3831 "sub $0x8,%2 \n" | 3831 "sub $0x8,%2 \n" |
3832 "jg 1b \n" | 3832 "jg 1b \n" |
3833 "vzeroupper \n" | 3833 "vzeroupper \n" |
3834 : "+r"(src_argb), // %0 | 3834 : "+r"(src_argb), // %0 |
3835 "+r"(dst_argb), // %1 | 3835 "+r"(dst_argb), // %1 |
3836 "+r"(width), // %2 | 3836 "+r"(width), // %2 |
3837 "+r"(alpha) // %3 | 3837 "=&r"(alpha) // %3 |
3838 : "r"(fixed_invtbl8), // %4 | 3838 : "r"(fixed_invtbl8), // %4 |
3839 "m"(kUnattenShuffleAlpha_AVX2) // %5 | 3839 "m"(kUnattenShuffleAlpha_AVX2) // %5 |
3840 : "memory", "cc", NACL_R14 | 3840 : "memory", "cc", NACL_R14 |
3841 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | 3841 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" |
3842 ); | 3842 ); |
3843 } | 3843 } |
3844 #endif // HAS_ARGBUNATTENUATEROW_AVX2 | 3844 #endif // HAS_ARGBUNATTENUATEROW_AVX2 |
3845 | 3845 |
3846 #ifdef HAS_ARGBGRAYROW_SSSE3 | 3846 #ifdef HAS_ARGBGRAYROW_SSSE3 |
3847 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels | 3847 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels |
(...skipping 904 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4752 ); | 4752 ); |
4753 } | 4753 } |
4754 #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 | 4754 #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 |
4755 | 4755 |
4756 #ifdef HAS_ARGBAFFINEROW_SSE2 | 4756 #ifdef HAS_ARGBAFFINEROW_SSE2 |
4757 // Copy ARGB pixels from source image with slope to a row of destination. | 4757 // Copy ARGB pixels from source image with slope to a row of destination. |
4758 LIBYUV_API | 4758 LIBYUV_API |
4759 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, | 4759 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, |
4760 uint8* dst_argb, const float* src_dudv, int width) { | 4760 uint8* dst_argb, const float* src_dudv, int width) { |
4761 intptr_t src_argb_stride_temp = src_argb_stride; | 4761 intptr_t src_argb_stride_temp = src_argb_stride; |
4762 intptr_t temp = 0; | 4762 intptr_t temp; |
4763 asm volatile ( | 4763 asm volatile ( |
4764 "movq " MEMACCESS(3) ",%%xmm2 \n" | 4764 "movq " MEMACCESS(3) ",%%xmm2 \n" |
4765 "movq " MEMACCESS2(0x08,3) ",%%xmm7 \n" | 4765 "movq " MEMACCESS2(0x08,3) ",%%xmm7 \n" |
4766 "shl $0x10,%1 \n" | 4766 "shl $0x10,%1 \n" |
4767 "add $0x4,%1 \n" | 4767 "add $0x4,%1 \n" |
4768 "movd %1,%%xmm5 \n" | 4768 "movd %1,%%xmm5 \n" |
4769 "sub $0x4,%4 \n" | 4769 "sub $0x4,%4 \n" |
4770 "jl 49f \n" | 4770 "jl 49f \n" |
4771 | 4771 |
4772 "pshufd $0x44,%%xmm7,%%xmm7 \n" | 4772 "pshufd $0x44,%%xmm7,%%xmm7 \n" |
(...skipping 51 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4824 "movd %%xmm0," MEMACCESS(2) " \n" | 4824 "movd %%xmm0," MEMACCESS(2) " \n" |
4825 "lea " MEMLEA(0x04,2) ",%2 \n" | 4825 "lea " MEMLEA(0x04,2) ",%2 \n" |
4826 "sub $0x1,%4 \n" | 4826 "sub $0x1,%4 \n" |
4827 "jge 10b \n" | 4827 "jge 10b \n" |
4828 "19: \n" | 4828 "19: \n" |
4829 : "+r"(src_argb), // %0 | 4829 : "+r"(src_argb), // %0 |
4830 "+r"(src_argb_stride_temp), // %1 | 4830 "+r"(src_argb_stride_temp), // %1 |
4831 "+r"(dst_argb), // %2 | 4831 "+r"(dst_argb), // %2 |
4832 "+r"(src_dudv), // %3 | 4832 "+r"(src_dudv), // %3 |
4833 "+rm"(width), // %4 | 4833 "+rm"(width), // %4 |
4834 "+r"(temp) // %5 | 4834 "=&r"(temp) // %5 |
4835 : | 4835 : |
4836 : "memory", "cc", NACL_R14 | 4836 : "memory", "cc", NACL_R14 |
4837 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | 4837 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" |
4838 ); | 4838 ); |
4839 } | 4839 } |
4840 #endif // HAS_ARGBAFFINEROW_SSE2 | 4840 #endif // HAS_ARGBAFFINEROW_SSE2 |
4841 | 4841 |
4842 #ifdef HAS_INTERPOLATEROW_SSSE3 | 4842 #ifdef HAS_INTERPOLATEROW_SSSE3 |
4843 // Bilinear filter 16x2 -> 16x1 | 4843 // Bilinear filter 16x2 -> 16x1 |
4844 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, | 4844 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, |
(...skipping 205 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
5050 : "memory", "cc" | 5050 : "memory", "cc" |
5051 , "xmm0", "xmm1", "xmm5" | 5051 , "xmm0", "xmm1", "xmm5" |
5052 ); | 5052 ); |
5053 } | 5053 } |
5054 #endif // HAS_ARGBSHUFFLEROW_AVX2 | 5054 #endif // HAS_ARGBSHUFFLEROW_AVX2 |
5055 | 5055 |
5056 #ifdef HAS_ARGBSHUFFLEROW_SSE2 | 5056 #ifdef HAS_ARGBSHUFFLEROW_SSE2 |
5057 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. | 5057 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. |
5058 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, | 5058 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, |
5059 const uint8* shuffler, int width) { | 5059 const uint8* shuffler, int width) { |
5060 uintptr_t pixel_temp = 0u; | 5060 uintptr_t pixel_temp; |
5061 asm volatile ( | 5061 asm volatile ( |
5062 "pxor %%xmm5,%%xmm5 \n" | 5062 "pxor %%xmm5,%%xmm5 \n" |
5063 "mov " MEMACCESS(4) ",%k2 \n" | 5063 "mov " MEMACCESS(4) ",%k2 \n" |
5064 "cmp $0x3000102,%k2 \n" | 5064 "cmp $0x3000102,%k2 \n" |
5065 "je 3012f \n" | 5065 "je 3012f \n" |
5066 "cmp $0x10203,%k2 \n" | 5066 "cmp $0x10203,%k2 \n" |
5067 "je 123f \n" | 5067 "je 123f \n" |
5068 "cmp $0x30201,%k2 \n" | 5068 "cmp $0x30201,%k2 \n" |
5069 "je 321f \n" | 5069 "je 321f \n" |
5070 "cmp $0x2010003,%k2 \n" | 5070 "cmp $0x2010003,%k2 \n" |
(...skipping 84 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
5155 "pshuflw $0xc6,%%xmm0,%%xmm0 \n" | 5155 "pshuflw $0xc6,%%xmm0,%%xmm0 \n" |
5156 "pshufhw $0xc6,%%xmm1,%%xmm1 \n" | 5156 "pshufhw $0xc6,%%xmm1,%%xmm1 \n" |
5157 "pshuflw $0xc6,%%xmm1,%%xmm1 \n" | 5157 "pshuflw $0xc6,%%xmm1,%%xmm1 \n" |
5158 "packuswb %%xmm1,%%xmm0 \n" | 5158 "packuswb %%xmm1,%%xmm0 \n" |
5159 "movdqu %%xmm0," MEMACCESS(1) " \n" | 5159 "movdqu %%xmm0," MEMACCESS(1) " \n" |
5160 "lea " MEMLEA(0x10,1) ",%1 \n" | 5160 "lea " MEMLEA(0x10,1) ",%1 \n" |
5161 "sub $0x4,%3 \n" | 5161 "sub $0x4,%3 \n" |
5162 "jg 3012b \n" | 5162 "jg 3012b \n" |
5163 | 5163 |
5164 "99: \n" | 5164 "99: \n" |
5165 : "+r"(src_argb), // %0 | 5165 : "+r"(src_argb), // %0 |
5166 "+r"(dst_argb), // %1 | 5166 "+r"(dst_argb), // %1 |
5167 "+d"(pixel_temp), // %2 | 5167 "=&d"(pixel_temp), // %2 |
5168 "+r"(width) // %3 | 5168 "+r"(width) // %3 |
5169 : "r"(shuffler) // %4 | 5169 : "r"(shuffler) // %4 |
5170 : "memory", "cc", NACL_R14 | 5170 : "memory", "cc", NACL_R14 |
5171 "xmm0", "xmm1", "xmm5" | 5171 "xmm0", "xmm1", "xmm5" |
5172 ); | 5172 ); |
5173 } | 5173 } |
5174 #endif // HAS_ARGBSHUFFLEROW_SSE2 | 5174 #endif // HAS_ARGBSHUFFLEROW_SSE2 |
5175 | 5175 |
5176 #ifdef HAS_I422TOYUY2ROW_SSE2 | 5176 #ifdef HAS_I422TOYUY2ROW_SSE2 |
5177 void I422ToYUY2Row_SSE2(const uint8* src_y, | 5177 void I422ToYUY2Row_SSE2(const uint8* src_y, |
5178 const uint8* src_u, | 5178 const uint8* src_u, |
5179 const uint8* src_v, | 5179 const uint8* src_v, |
(...skipping 156 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
5336 : "memory", "cc", | 5336 : "memory", "cc", |
5337 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | 5337 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" |
5338 ); | 5338 ); |
5339 } | 5339 } |
5340 #endif // HAS_ARGBPOLYNOMIALROW_AVX2 | 5340 #endif // HAS_ARGBPOLYNOMIALROW_AVX2 |
5341 | 5341 |
5342 #ifdef HAS_ARGBCOLORTABLEROW_X86 | 5342 #ifdef HAS_ARGBCOLORTABLEROW_X86 |
5343 // Transform ARGB pixels with color table. | 5343 // Transform ARGB pixels with color table. |
5344 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, | 5344 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, |
5345 int width) { | 5345 int width) { |
5346 uintptr_t pixel_temp = 0u; | 5346 uintptr_t pixel_temp; |
5347 asm volatile ( | 5347 asm volatile ( |
5348 // 1 pixel loop. | 5348 // 1 pixel loop. |
5349 LABELALIGN | 5349 LABELALIGN |
5350 "1: \n" | 5350 "1: \n" |
5351 "movzb " MEMACCESS(0) ",%1 \n" | 5351 "movzb " MEMACCESS(0) ",%1 \n" |
5352 "lea " MEMLEA(0x4,0) ",%0 \n" | 5352 "lea " MEMLEA(0x4,0) ",%0 \n" |
5353 MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1 | 5353 MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1 |
5354 "mov %b1," MEMACCESS2(-0x4,0) " \n" | 5354 "mov %b1," MEMACCESS2(-0x4,0) " \n" |
5355 "movzb " MEMACCESS2(-0x3,0) ",%1 \n" | 5355 "movzb " MEMACCESS2(-0x3,0) ",%1 \n" |
5356 MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1 | 5356 MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1 |
5357 "mov %b1," MEMACCESS2(-0x3,0) " \n" | 5357 "mov %b1," MEMACCESS2(-0x3,0) " \n" |
5358 "movzb " MEMACCESS2(-0x2,0) ",%1 \n" | 5358 "movzb " MEMACCESS2(-0x2,0) ",%1 \n" |
5359 MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1 | 5359 MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1 |
5360 "mov %b1," MEMACCESS2(-0x2,0) " \n" | 5360 "mov %b1," MEMACCESS2(-0x2,0) " \n" |
5361 "movzb " MEMACCESS2(-0x1,0) ",%1 \n" | 5361 "movzb " MEMACCESS2(-0x1,0) ",%1 \n" |
5362 MEMOPARG(movzb,0x03,3,1,4,1) " \n" // movzb 0x3(%3,%1,4),%1 | 5362 MEMOPARG(movzb,0x03,3,1,4,1) " \n" // movzb 0x3(%3,%1,4),%1 |
5363 "mov %b1," MEMACCESS2(-0x1,0) " \n" | 5363 "mov %b1," MEMACCESS2(-0x1,0) " \n" |
5364 "dec %2 \n" | 5364 "dec %2 \n" |
5365 "jg 1b \n" | 5365 "jg 1b \n" |
5366 : "+r"(dst_argb), // %0 | 5366 : "+r"(dst_argb), // %0 |
5367 "+d"(pixel_temp), // %1 | 5367 "=&d"(pixel_temp), // %1 |
5368 "+r"(width) // %2 | 5368 "+r"(width) // %2 |
5369 : "r"(table_argb) // %3 | 5369 : "r"(table_argb) // %3 |
5370 : "memory", "cc"); | 5370 : "memory", "cc"); |
5371 } | 5371 } |
5372 #endif // HAS_ARGBCOLORTABLEROW_X86 | 5372 #endif // HAS_ARGBCOLORTABLEROW_X86 |
5373 | 5373 |
5374 #ifdef HAS_RGBCOLORTABLEROW_X86 | 5374 #ifdef HAS_RGBCOLORTABLEROW_X86 |
5375 // Transform RGB pixels with color table. | 5375 // Transform RGB pixels with color table. |
5376 void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { | 5376 void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { |
5377 uintptr_t pixel_temp = 0u; | 5377 uintptr_t pixel_temp; |
5378 asm volatile ( | 5378 asm volatile ( |
5379 // 1 pixel loop. | 5379 // 1 pixel loop. |
5380 LABELALIGN | 5380 LABELALIGN |
5381 "1: \n" | 5381 "1: \n" |
5382 "movzb " MEMACCESS(0) ",%1 \n" | 5382 "movzb " MEMACCESS(0) ",%1 \n" |
5383 "lea " MEMLEA(0x4,0) ",%0 \n" | 5383 "lea " MEMLEA(0x4,0) ",%0 \n" |
5384 MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1 | 5384 MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1 |
5385 "mov %b1," MEMACCESS2(-0x4,0) " \n" | 5385 "mov %b1," MEMACCESS2(-0x4,0) " \n" |
5386 "movzb " MEMACCESS2(-0x3,0) ",%1 \n" | 5386 "movzb " MEMACCESS2(-0x3,0) ",%1 \n" |
5387 MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1 | 5387 MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1 |
5388 "mov %b1," MEMACCESS2(-0x3,0) " \n" | 5388 "mov %b1," MEMACCESS2(-0x3,0) " \n" |
5389 "movzb " MEMACCESS2(-0x2,0) ",%1 \n" | 5389 "movzb " MEMACCESS2(-0x2,0) ",%1 \n" |
5390 MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1 | 5390 MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1 |
5391 "mov %b1," MEMACCESS2(-0x2,0) " \n" | 5391 "mov %b1," MEMACCESS2(-0x2,0) " \n" |
5392 "dec %2 \n" | 5392 "dec %2 \n" |
5393 "jg 1b \n" | 5393 "jg 1b \n" |
5394 : "+r"(dst_argb), // %0 | 5394 : "+r"(dst_argb), // %0 |
5395 "+d"(pixel_temp), // %1 | 5395 "=&d"(pixel_temp), // %1 |
5396 "+r"(width) // %2 | 5396 "+r"(width) // %2 |
5397 : "r"(table_argb) // %3 | 5397 : "r"(table_argb) // %3 |
5398 : "memory", "cc"); | 5398 : "memory", "cc"); |
5399 } | 5399 } |
5400 #endif // HAS_RGBCOLORTABLEROW_X86 | 5400 #endif // HAS_RGBCOLORTABLEROW_X86 |
5401 | 5401 |
5402 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 5402 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
5403 // Transform RGB pixels with luma table. | 5403 // Transform RGB pixels with luma table. |
5404 void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, | 5404 void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, |
5405 int width, | 5405 int width, |
5406 const uint8* luma, uint32 lumacoeff) { | 5406 const uint8* luma, uint32 lumacoeff) { |
5407 uintptr_t pixel_temp = 0u; | 5407 uintptr_t pixel_temp; |
5408 uintptr_t table_temp = 0u; | 5408 uintptr_t table_temp; |
5409 asm volatile ( | 5409 asm volatile ( |
5410 "movd %6,%%xmm3 \n" | 5410 "movd %6,%%xmm3 \n" |
5411 "pshufd $0x0,%%xmm3,%%xmm3 \n" | 5411 "pshufd $0x0,%%xmm3,%%xmm3 \n" |
5412 "pcmpeqb %%xmm4,%%xmm4 \n" | 5412 "pcmpeqb %%xmm4,%%xmm4 \n" |
5413 "psllw $0x8,%%xmm4 \n" | 5413 "psllw $0x8,%%xmm4 \n" |
5414 "pxor %%xmm5,%%xmm5 \n" | 5414 "pxor %%xmm5,%%xmm5 \n" |
5415 | 5415 |
5416 // 4 pixel loop. | 5416 // 4 pixel loop. |
5417 LABELALIGN | 5417 LABELALIGN |
5418 "1: \n" | 5418 "1: \n" |
(...skipping 61 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
5480 "mov %b0," MEMACCESS2(0xd,3) " \n" | 5480 "mov %b0," MEMACCESS2(0xd,3) " \n" |
5481 "movzb " MEMACCESS2(0xe,2) ",%0 \n" | 5481 "movzb " MEMACCESS2(0xe,2) ",%0 \n" |
5482 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 | 5482 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 |
5483 "mov %b0," MEMACCESS2(0xe,3) " \n" | 5483 "mov %b0," MEMACCESS2(0xe,3) " \n" |
5484 "movzb " MEMACCESS2(0xf,2) ",%0 \n" | 5484 "movzb " MEMACCESS2(0xf,2) ",%0 \n" |
5485 "mov %b0," MEMACCESS2(0xf,3) " \n" | 5485 "mov %b0," MEMACCESS2(0xf,3) " \n" |
5486 "lea " MEMLEA(0x10,2) ",%2 \n" | 5486 "lea " MEMLEA(0x10,2) ",%2 \n" |
5487 "lea " MEMLEA(0x10,3) ",%3 \n" | 5487 "lea " MEMLEA(0x10,3) ",%3 \n" |
5488 "sub $0x4,%4 \n" | 5488 "sub $0x4,%4 \n" |
5489 "jg 1b \n" | 5489 "jg 1b \n" |
5490 : "+d"(pixel_temp), // %0 | 5490 : "=&d"(pixel_temp), // %0 |
5491 "+a"(table_temp), // %1 | 5491 "=&a"(table_temp), // %1 |
5492 "+r"(src_argb), // %2 | 5492 "+r"(src_argb), // %2 |
5493 "+r"(dst_argb), // %3 | 5493 "+r"(dst_argb), // %3 |
5494 "+rm"(width) // %4 | 5494 "+rm"(width) // %4 |
5495 : "r"(luma), // %5 | 5495 : "r"(luma), // %5 |
5496 "rm"(lumacoeff) // %6 | 5496 "rm"(lumacoeff) // %6 |
5497 : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5" | 5497 : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5" |
5498 ); | 5498 ); |
5499 } | 5499 } |
5500 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 5500 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
5501 | 5501 |
5502 #endif // defined(__x86_64__) || defined(__i386__) | 5502 #endif // defined(__x86_64__) || defined(__i386__) |
5503 | 5503 |
5504 #ifdef __cplusplus | 5504 #ifdef __cplusplus |
5505 } // extern "C" | 5505 } // extern "C" |
5506 } // namespace libyuv | 5506 } // namespace libyuv |
5507 #endif | 5507 #endif |
OLD | NEW |