Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(83)

Side by Side Diff: source/row_gcc.cc

Issue 1377053003: remove sse2 functions that also have ssse3 (Closed) Base URL: https://chromium.googlesource.com/libyuv/libyuv@master
Patch Set: lint warning fixes Created 5 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // VERSION 2 1 // VERSION 2
2 /* 2 /*
3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. 3 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
4 * 4 *
5 * Use of this source code is governed by a BSD-style license 5 * Use of this source code is governed by a BSD-style license
6 * that can be found in the LICENSE file in the root of the source 6 * that can be found in the LICENSE file in the root of the source
7 * tree. An additional intellectual property rights grant can be found 7 * tree. An additional intellectual property rights grant can be found
8 * in the file PATENTS. All contributing project authors may 8 * in the file PATENTS. All contributing project authors may
9 * be found in the AUTHORS file in the root of the source tree. 9 * be found in the AUTHORS file in the root of the source tree.
10 */ 10 */
(...skipping 1661 matching lines...) Expand 10 before | Expand all | Expand 10 after
1672 "jg 1b \n" 1672 "jg 1b \n"
1673 : [y_buf]"+r"(y_buf), // %[y_buf] 1673 : [y_buf]"+r"(y_buf), // %[y_buf]
1674 [u_buf]"+r"(u_buf), // %[u_buf] 1674 [u_buf]"+r"(u_buf), // %[u_buf]
1675 [v_buf]"+r"(v_buf), // %[v_buf] 1675 [v_buf]"+r"(v_buf), // %[v_buf]
1676 [a_buf]"+r"(a_buf), // %[a_buf] 1676 [a_buf]"+r"(a_buf), // %[a_buf]
1677 [dst_argb]"+r"(dst_argb), // %[dst_argb] 1677 [dst_argb]"+r"(dst_argb), // %[dst_argb]
1678 #if defined(__i386__) && defined(__pic__) 1678 #if defined(__i386__) && defined(__pic__)
1679 [width]"+m"(width) // %[width] 1679 [width]"+m"(width) // %[width]
1680 #else 1680 #else
1681 [width]"+rm"(width) // %[width] 1681 [width]"+rm"(width) // %[width]
1682 #endif 1682 #endif
1683 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] 1683 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
1684 : "memory", "cc", NACL_R14 1684 : "memory", "cc", NACL_R14
1685 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1685 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1686 ); 1686 );
1687 } 1687 }
1688 1688
1689 void OMITFP I422AlphaToABGRRow_SSSE3(const uint8* y_buf, 1689 void OMITFP I422AlphaToABGRRow_SSSE3(const uint8* y_buf,
1690 const uint8* u_buf, 1690 const uint8* u_buf,
1691 const uint8* v_buf, 1691 const uint8* v_buf,
1692 const uint8* a_buf, 1692 const uint8* a_buf,
(...skipping 11 matching lines...) Expand all
1704 "jg 1b \n" 1704 "jg 1b \n"
1705 : [y_buf]"+r"(y_buf), // %[y_buf] 1705 : [y_buf]"+r"(y_buf), // %[y_buf]
1706 [u_buf]"+r"(u_buf), // %[u_buf] 1706 [u_buf]"+r"(u_buf), // %[u_buf]
1707 [v_buf]"+r"(v_buf), // %[v_buf] 1707 [v_buf]"+r"(v_buf), // %[v_buf]
1708 [a_buf]"+r"(a_buf), // %[a_buf] 1708 [a_buf]"+r"(a_buf), // %[a_buf]
1709 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] 1709 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr]
1710 #if defined(__i386__) && defined(__pic__) 1710 #if defined(__i386__) && defined(__pic__)
1711 [width]"+m"(width) // %[width] 1711 [width]"+m"(width) // %[width]
1712 #else 1712 #else
1713 [width]"+rm"(width) // %[width] 1713 [width]"+rm"(width) // %[width]
1714 #endif 1714 #endif
1715 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] 1715 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
1716 : "memory", "cc", NACL_R14 1716 : "memory", "cc", NACL_R14
1717 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1717 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1718 ); 1718 );
1719 } 1719 }
1720 1720
1721 void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, 1721 void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
1722 const uint8* u_buf, 1722 const uint8* u_buf,
1723 const uint8* v_buf, 1723 const uint8* v_buf,
1724 uint8* dst_argb, 1724 uint8* dst_argb,
(...skipping 255 matching lines...) Expand 10 before | Expand all | Expand 10 after
1980 1980
1981 // Store 16 ARGB values. 1981 // Store 16 ARGB values.
1982 #define STOREARGB_AVX2 \ 1982 #define STOREARGB_AVX2 \
1983 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ 1983 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
1984 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ 1984 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
1985 "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \ 1985 "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \
1986 "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ 1986 "vpermq $0xd8,%%ymm2,%%ymm2 \n" \
1987 "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \ 1987 "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \
1988 "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \ 1988 "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \
1989 "vmovdqu %%ymm1," MEMACCESS([dst_argb]) " \n" \ 1989 "vmovdqu %%ymm1," MEMACCESS([dst_argb]) " \n" \
1990 "vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) " \n" \ 1990 "vmovdqu %%ymm0," MEMACCESS2(0x20, [dst_argb]) " \n" \
1991 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" 1991 "lea " MEMLEA(0x40, [dst_argb]) ", %[dst_argb] \n"
1992 1992
1993 // Store 16 ABGR values. 1993 // Store 16 ABGR values.
1994 #define STOREABGR_AVX2 \ 1994 #define STOREABGR_AVX2 \
1995 "vpunpcklbw %%ymm1,%%ymm2,%%ymm1 \n" \ 1995 "vpunpcklbw %%ymm1,%%ymm2,%%ymm1 \n" \
1996 "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ 1996 "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
1997 "vpunpcklbw %%ymm5,%%ymm0,%%ymm2 \n" \ 1997 "vpunpcklbw %%ymm5,%%ymm0,%%ymm2 \n" \
1998 "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ 1998 "vpermq $0xd8,%%ymm2,%%ymm2 \n" \
1999 "vpunpcklwd %%ymm2,%%ymm1,%%ymm0 \n" \ 1999 "vpunpcklwd %%ymm2,%%ymm1,%%ymm0 \n" \
2000 "vpunpckhwd %%ymm2,%%ymm1,%%ymm1 \n" \ 2000 "vpunpckhwd %%ymm2,%%ymm1,%%ymm1 \n" \
2001 "vmovdqu %%ymm0," MEMACCESS([dst_abgr]) " \n" \ 2001 "vmovdqu %%ymm0," MEMACCESS([dst_abgr]) " \n" \
2002 "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_abgr]) " \n" \ 2002 "vmovdqu %%ymm1," MEMACCESS2(0x20, [dst_abgr]) " \n" \
2003 "lea " MEMLEA(0x40,[dst_abgr]) ",%[dst_abgr] \n" 2003 "lea " MEMLEA(0x40, [dst_abgr]) ", %[dst_abgr] \n"
2004 2004
2005 #if defined(HAS_I422TOBGRAROW_AVX2) 2005 #if defined(HAS_I422TOBGRAROW_AVX2)
2006 // 16 pixels 2006 // 16 pixels
2007 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). 2007 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes).
2008 void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf, 2008 void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf,
2009 const uint8* u_buf, 2009 const uint8* u_buf,
2010 const uint8* v_buf, 2010 const uint8* v_buf,
2011 uint8* dst_bgra, 2011 uint8* dst_bgra,
2012 struct YuvConstants* yuvconstants, 2012 struct YuvConstants* yuvconstants,
2013 int width) { 2013 int width) {
2014 asm volatile ( 2014 asm volatile (
2015 "sub %[u_buf],%[v_buf] \n" 2015 "sub %[u_buf],%[v_buf] \n"
2016 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 2016 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2017 LABELALIGN 2017 LABELALIGN
2018 "1: \n" 2018 "1: \n"
2019 READYUV422_AVX2 2019 READYUV422_AVX2
2020 YUVTORGB_AVX2(yuvconstants) 2020 YUVTORGB_AVX2(yuvconstants)
2021 2021
2022 // Step 3: Weave into BGRA 2022 // Step 3: Weave into BGRA
2023 "vpunpcklbw %%ymm0,%%ymm1,%%ymm1 \n" // GB 2023 "vpunpcklbw %%ymm0,%%ymm1,%%ymm1 \n" // GB
2024 "vpermq $0xd8,%%ymm1,%%ymm1 \n" 2024 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
2025 "vpunpcklbw %%ymm2,%%ymm5,%%ymm2 \n" // AR 2025 "vpunpcklbw %%ymm2,%%ymm5,%%ymm2 \n" // AR
2026 "vpermq $0xd8,%%ymm2,%%ymm2 \n" 2026 "vpermq $0xd8,%%ymm2,%%ymm2 \n"
2027 "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" // ARGB first 8 pixels 2027 "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" // ARGB first 8 pixels
2028 "vpunpckhwd %%ymm1,%%ymm2,%%ymm2 \n" // ARGB next 8 pixels 2028 "vpunpckhwd %%ymm1,%%ymm2,%%ymm2 \n" // ARGB next 8 pixels
2029
2030 "vmovdqu %%ymm0," MEMACCESS([dst_bgra]) "\n" 2029 "vmovdqu %%ymm0," MEMACCESS([dst_bgra]) "\n"
2031 "vmovdqu %%ymm2," MEMACCESS2(0x20,[dst_bgra]) "\n" 2030 "vmovdqu %%ymm2," MEMACCESS2(0x20,[dst_bgra]) "\n"
2032 "lea " MEMLEA(0x40,[dst_bgra]) ",%[dst_bgra] \n" 2031 "lea " MEMLEA(0x40,[dst_bgra]) ",%[dst_bgra] \n"
2033 "sub $0x10,%[width] \n" 2032 "sub $0x10,%[width] \n"
2034 "jg 1b \n" 2033 "jg 1b \n"
2035 "vzeroupper \n" 2034 "vzeroupper \n"
2036 : [y_buf]"+r"(y_buf), // %[y_buf] 2035 : [y_buf]"+r"(y_buf), // %[y_buf]
2037 [u_buf]"+r"(u_buf), // %[u_buf] 2036 [u_buf]"+r"(u_buf), // %[u_buf]
2038 [v_buf]"+r"(v_buf), // %[v_buf] 2037 [v_buf]"+r"(v_buf), // %[v_buf]
2039 [dst_bgra]"+r"(dst_bgra), // %[dst_bgra] 2038 [dst_bgra]"+r"(dst_bgra), // %[dst_bgra]
(...skipping 59 matching lines...) Expand 10 before | Expand all | Expand 10 after
2099 "vzeroupper \n" 2098 "vzeroupper \n"
2100 : [y_buf]"+r"(y_buf), // %[y_buf] 2099 : [y_buf]"+r"(y_buf), // %[y_buf]
2101 [u_buf]"+r"(u_buf), // %[u_buf] 2100 [u_buf]"+r"(u_buf), // %[u_buf]
2102 [v_buf]"+r"(v_buf), // %[v_buf] 2101 [v_buf]"+r"(v_buf), // %[v_buf]
2103 [a_buf]"+r"(a_buf), // %[a_buf] 2102 [a_buf]"+r"(a_buf), // %[a_buf]
2104 [dst_argb]"+r"(dst_argb), // %[dst_argb] 2103 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2105 #if defined(__i386__) && defined(__pic__) 2104 #if defined(__i386__) && defined(__pic__)
2106 [width]"+m"(width) // %[width] 2105 [width]"+m"(width) // %[width]
2107 #else 2106 #else
2108 [width]"+rm"(width) // %[width] 2107 [width]"+rm"(width) // %[width]
2109 #endif 2108 #endif
2110 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] 2109 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2111 : "memory", "cc", NACL_R14 2110 : "memory", "cc", NACL_R14
2112 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 2111 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2113 ); 2112 );
2114 } 2113 }
2115 #endif // HAS_I422ALPHATOARGBROW_AVX2 2114 #endif // HAS_I422ALPHATOARGBROW_AVX2
2116 2115
2117 #if defined(HAS_I422ALPHATOABGRROW_AVX2) 2116 #if defined(HAS_I422ALPHATOABGRROW_AVX2)
2118 // 16 pixels 2117 // 16 pixels
2119 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ABGR. 2118 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ABGR.
(...skipping 16 matching lines...) Expand all
2136 "vzeroupper \n" 2135 "vzeroupper \n"
2137 : [y_buf]"+r"(y_buf), // %[y_buf] 2136 : [y_buf]"+r"(y_buf), // %[y_buf]
2138 [u_buf]"+r"(u_buf), // %[u_buf] 2137 [u_buf]"+r"(u_buf), // %[u_buf]
2139 [v_buf]"+r"(v_buf), // %[v_buf] 2138 [v_buf]"+r"(v_buf), // %[v_buf]
2140 [a_buf]"+r"(a_buf), // %[a_buf] 2139 [a_buf]"+r"(a_buf), // %[a_buf]
2141 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] 2140 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr]
2142 #if defined(__i386__) && defined(__pic__) 2141 #if defined(__i386__) && defined(__pic__)
2143 [width]"+m"(width) // %[width] 2142 [width]"+m"(width) // %[width]
2144 #else 2143 #else
2145 [width]"+rm"(width) // %[width] 2144 [width]"+rm"(width) // %[width]
2146 #endif 2145 #endif
2147 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] 2146 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2148 : "memory", "cc", NACL_R14 2147 : "memory", "cc", NACL_R14
2149 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 2148 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2150 ); 2149 );
2151 } 2150 }
2152 #endif // HAS_I422ALPHATOABGRROW_AVX2 2151 #endif // HAS_I422ALPHATOABGRROW_AVX2
2153 2152
2154 #if defined(HAS_I422TOABGRROW_AVX2) 2153 #if defined(HAS_I422TOABGRROW_AVX2)
2155 // 16 pixels 2154 // 16 pixels
2156 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). 2155 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).
(...skipping 69 matching lines...) Expand 10 before | Expand all | Expand 10 after
2226 #endif // HAS_I422TORGBAROW_AVX2 2225 #endif // HAS_I422TORGBAROW_AVX2
2227 2226
2228 #if defined(HAS_NV12TOARGBROW_AVX2) 2227 #if defined(HAS_NV12TOARGBROW_AVX2)
2229 // 16 pixels. 2228 // 16 pixels.
2230 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 2229 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2231 void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf, 2230 void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf,
2232 const uint8* uv_buf, 2231 const uint8* uv_buf,
2233 uint8* dst_argb, 2232 uint8* dst_argb,
2234 struct YuvConstants* yuvconstants, 2233 struct YuvConstants* yuvconstants,
2235 int width) { 2234 int width) {
2236
2237 asm volatile ( 2235 asm volatile (
2238 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 2236 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2239 LABELALIGN 2237 LABELALIGN
2240 "1: \n" 2238 "1: \n"
2241 READNV12_AVX2 2239 READNV12_AVX2
2242 YUVTORGB_AVX2(yuvconstants) 2240 YUVTORGB_AVX2(yuvconstants)
2243 STOREARGB_AVX2 2241 STOREARGB_AVX2
2244 "sub $0x10,%[width] \n" 2242 "sub $0x10,%[width] \n"
2245 "jg 1b \n" 2243 "jg 1b \n"
2246 "vzeroupper \n" 2244 "vzeroupper \n"
2247 : [y_buf]"+r"(y_buf), // %[y_buf] 2245 : [y_buf]"+r"(y_buf), // %[y_buf]
2248 [uv_buf]"+r"(uv_buf), // %[uv_buf] 2246 [uv_buf]"+r"(uv_buf), // %[uv_buf]
2249 [dst_argb]"+r"(dst_argb), // %[dst_argb] 2247 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2250 [width]"+rm"(width) // %[width] 2248 [width]"+rm"(width) // %[width]
2251 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] 2249 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2252 // Does not use r14. 2250 // Does not use r14.
2253 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 2251 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2254 ); 2252 );
2255 } 2253 }
2256 #endif  // HAS_NV12TOARGBROW_AVX2 2254 #endif  // HAS_NV12TOARGBROW_AVX2
2257 2255
2258 2256
2259 #if defined(HAS_YUY2TOARGBROW_AVX2) 2257 #if defined(HAS_YUY2TOARGBROW_AVX2)
2260 // 16 pixels. 2258 // 16 pixels.
2261 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). 2259 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
2262 void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf, 2260 void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf,
2263 uint8* dst_argb, 2261 uint8* dst_argb,
2264 struct YuvConstants* yuvconstants, 2262 struct YuvConstants* yuvconstants,
2265 int width) { 2263 int width) {
2266
2267 asm volatile ( 2264 asm volatile (
2268 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 2265 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2269 LABELALIGN 2266 LABELALIGN
2270 "1: \n" 2267 "1: \n"
2271 READYUY2_AVX2 2268 READYUY2_AVX2
2272 YUVTORGB_AVX2(yuvconstants) 2269 YUVTORGB_AVX2(yuvconstants)
2273 STOREARGB_AVX2 2270 STOREARGB_AVX2
2274 "sub $0x10,%[width] \n" 2271 "sub $0x10,%[width] \n"
2275 "jg 1b \n" 2272 "jg 1b \n"
2276 "vzeroupper \n" 2273 "vzeroupper \n"
2277 : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf] 2274 : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
2278 [dst_argb]"+r"(dst_argb), // %[dst_argb] 2275 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2279 [width]"+rm"(width) // %[width] 2276 [width]"+rm"(width) // %[width]
2280 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] 2277 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
2281 [kShuffleYUY2Y]"m"(kShuffleYUY2Y), 2278 [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
2282 [kShuffleYUY2UV]"m"(kShuffleYUY2UV) 2279 [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
2283 // Does not use r14. 2280 // Does not use r14.
2284 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 2281 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2285 ); 2282 );
2286 } 2283 }
2287 #endif // HAS_YUY2TOARGBROW_AVX2 2284 #endif // HAS_YUY2TOARGBROW_AVX2
2288 2285
2289 #if defined(HAS_UYVYTOARGBROW_AVX2) 2286 #if defined(HAS_UYVYTOARGBROW_AVX2)
2290 // 16 pixels. 2287 // 16 pixels.
2291 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes). 2288 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
2292 void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf, 2289 void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf,
2293 uint8* dst_argb, 2290 uint8* dst_argb,
2294 struct YuvConstants* yuvconstants, 2291 struct YuvConstants* yuvconstants,
2295 int width) { 2292 int width) {
2296
2297 asm volatile ( 2293 asm volatile (
2298 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 2294 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2299 LABELALIGN 2295 LABELALIGN
2300 "1: \n" 2296 "1: \n"
2301 READUYVY_AVX2 2297 READUYVY_AVX2
2302 YUVTORGB_AVX2(yuvconstants) 2298 YUVTORGB_AVX2(yuvconstants)
2303 STOREARGB_AVX2 2299 STOREARGB_AVX2
2304 "sub $0x10,%[width] \n" 2300 "sub $0x10,%[width] \n"
2305 "jg 1b \n" 2301 "jg 1b \n"
2306 "vzeroupper \n" 2302 "vzeroupper \n"
(...skipping 147 matching lines...) Expand 10 before | Expand all | Expand 10 after
2454 : "+r"(src), // %0 2450 : "+r"(src), // %0
2455 "+r"(dst), // %1 2451 "+r"(dst), // %1
2456 "+r"(temp_width) // %2 2452 "+r"(temp_width) // %2
2457 : "m"(kShuffleMirror) // %3 2453 : "m"(kShuffleMirror) // %3
2458 : "memory", "cc", NACL_R14 2454 : "memory", "cc", NACL_R14
2459 "xmm0", "xmm5" 2455 "xmm0", "xmm5"
2460 ); 2456 );
2461 } 2457 }
2462 #endif // HAS_MIRRORROW_AVX2 2458 #endif // HAS_MIRRORROW_AVX2
2463 2459
2464 #ifdef HAS_MIRRORROW_SSE2
2465 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
2466 intptr_t temp_width = (intptr_t)(width);
2467 asm volatile (
2468 LABELALIGN
2469 "1: \n"
2470 MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0
2471 "movdqa %%xmm0,%%xmm1 \n"
2472 "psllw $0x8,%%xmm0 \n"
2473 "psrlw $0x8,%%xmm1 \n"
2474 "por %%xmm1,%%xmm0 \n"
2475 "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
2476 "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
2477 "pshufd $0x4e,%%xmm0,%%xmm0 \n"
2478 "movdqu %%xmm0," MEMACCESS(1) " \n"
2479 "lea " MEMLEA(0x10,1)",%1 \n"
2480 "sub $0x10,%2 \n"
2481 "jg 1b \n"
2482 : "+r"(src), // %0
2483 "+r"(dst), // %1
2484 "+r"(temp_width) // %2
2485 :
2486 : "memory", "cc", NACL_R14
2487 "xmm0", "xmm1"
2488 );
2489 }
2490 #endif // HAS_MIRRORROW_SSE2
2491
2492 #ifdef HAS_MIRRORROW_UV_SSSE3 2460 #ifdef HAS_MIRRORROW_UV_SSSE3
2493 // Shuffle table for reversing the bytes of UV channels. 2461 // Shuffle table for reversing the bytes of UV channels.
2494 static uvec8 kShuffleMirrorUV = { 2462 static uvec8 kShuffleMirrorUV = {
2495 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u 2463 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
2496 }; 2464 };
2497 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, 2465 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
2498 int width) { 2466 int width) {
2499 intptr_t temp_width = (intptr_t)(width); 2467 intptr_t temp_width = (intptr_t)(width);
2500 asm volatile ( 2468 asm volatile (
2501 "movdqa %4,%%xmm1 \n" 2469 "movdqa %4,%%xmm1 \n"
(...skipping 824 matching lines...) Expand 10 before | Expand all | Expand 10 after
3326 "+r"(dst_u), // %1 3294 "+r"(dst_u), // %1
3327 "+r"(dst_v), // %2 3295 "+r"(dst_v), // %2
3328 "+r"(pix) // %3 3296 "+r"(pix) // %3
3329 : 3297 :
3330 : "memory", "cc", NACL_R14 3298 : "memory", "cc", NACL_R14
3331 "xmm0", "xmm1", "xmm5" 3299 "xmm0", "xmm1", "xmm5"
3332 ); 3300 );
3333 } 3301 }
3334 #endif // HAS_YUY2TOYROW_AVX2 3302 #endif // HAS_YUY2TOYROW_AVX2
3335 3303
3336 #ifdef HAS_ARGBBLENDROW_SSE2
3337 // Blend 4 pixels at a time.
3338 void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
3339 uint8* dst_argb, int width) {
3340 asm volatile (
3341 "pcmpeqb %%xmm7,%%xmm7 \n"
3342 "psrlw $0xf,%%xmm7 \n"
3343 "pcmpeqb %%xmm6,%%xmm6 \n"
3344 "psrlw $0x8,%%xmm6 \n"
3345 "pcmpeqb %%xmm5,%%xmm5 \n"
3346 "psllw $0x8,%%xmm5 \n"
3347 "pcmpeqb %%xmm4,%%xmm4 \n"
3348 "pslld $0x18,%%xmm4 \n"
3349 "sub $0x4,%3 \n"
3350 "jl 49f \n"
3351
3352 // 4 pixel loop.
3353 LABELALIGN
3354 "41: \n"
3355 "movdqu " MEMACCESS(0) ",%%xmm3 \n"
3356 "lea " MEMLEA(0x10,0) ",%0 \n"
3357 "movdqa %%xmm3,%%xmm0 \n"
3358 "pxor %%xmm4,%%xmm3 \n"
3359 "movdqu " MEMACCESS(1) ",%%xmm2 \n"
3360 "psrlw $0x8,%%xmm3 \n"
3361 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3362 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
3363 "pand %%xmm6,%%xmm2 \n"
3364 "paddw %%xmm7,%%xmm3 \n"
3365 "pmullw %%xmm3,%%xmm2 \n"
3366 "movdqu " MEMACCESS(1) ",%%xmm1 \n"
3367 "lea " MEMLEA(0x10,1) ",%1 \n"
3368 "psrlw $0x8,%%xmm1 \n"
3369 "por %%xmm4,%%xmm0 \n"
3370 "pmullw %%xmm3,%%xmm1 \n"
3371 "psrlw $0x8,%%xmm2 \n"
3372 "paddusb %%xmm2,%%xmm0 \n"
3373 "pand %%xmm5,%%xmm1 \n"
3374 "paddusb %%xmm1,%%xmm0 \n"
3375 "movdqu %%xmm0," MEMACCESS(2) " \n"
3376 "lea " MEMLEA(0x10,2) ",%2 \n"
3377 "sub $0x4,%3 \n"
3378 "jge 41b \n"
3379
3380 "49: \n"
3381 "add $0x3,%3 \n"
3382 "jl 99f \n"
3383
3384 // 1 pixel loop.
3385 "91: \n"
3386 "movd " MEMACCESS(0) ",%%xmm3 \n"
3387 "lea " MEMLEA(0x4,0) ",%0 \n"
3388 "movdqa %%xmm3,%%xmm0 \n"
3389 "pxor %%xmm4,%%xmm3 \n"
3390 "movd " MEMACCESS(1) ",%%xmm2 \n"
3391 "psrlw $0x8,%%xmm3 \n"
3392 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3393 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
3394 "pand %%xmm6,%%xmm2 \n"
3395 "paddw %%xmm7,%%xmm3 \n"
3396 "pmullw %%xmm3,%%xmm2 \n"
3397 "movd " MEMACCESS(1) ",%%xmm1 \n"
3398 "lea " MEMLEA(0x4,1) ",%1 \n"
3399 "psrlw $0x8,%%xmm1 \n"
3400 "por %%xmm4,%%xmm0 \n"
3401 "pmullw %%xmm3,%%xmm1 \n"
3402 "psrlw $0x8,%%xmm2 \n"
3403 "paddusb %%xmm2,%%xmm0 \n"
3404 "pand %%xmm5,%%xmm1 \n"
3405 "paddusb %%xmm1,%%xmm0 \n"
3406 "movd %%xmm0," MEMACCESS(2) " \n"
3407 "lea " MEMLEA(0x4,2) ",%2 \n"
3408 "sub $0x1,%3 \n"
3409 "jge 91b \n"
3410 "99: \n"
3411 : "+r"(src_argb0), // %0
3412 "+r"(src_argb1), // %1
3413 "+r"(dst_argb), // %2
3414 "+r"(width) // %3
3415 :
3416 : "memory", "cc"
3417 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3418 );
3419 }
3420 #endif // HAS_ARGBBLENDROW_SSE2
3421
3422 #ifdef HAS_ARGBBLENDROW_SSSE3 3304 #ifdef HAS_ARGBBLENDROW_SSSE3
3423 // Shuffle table for isolating alpha. 3305 // Shuffle table for isolating alpha.
3424 static uvec8 kShuffleAlpha = { 3306 static uvec8 kShuffleAlpha = {
3425 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, 3307 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
3426 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 3308 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
3427 }; 3309 };
3428 3310
3429 // Blend 8 pixels at a time 3311 // Blend 8 pixels at a time
3430 // Shuffle table for reversing the bytes.
3431
3432 // Same as SSE2, but replaces
3433 // psrlw xmm3, 8 // alpha
3434 // pshufhw xmm3, xmm3,0F5h // 8 alpha words
3435 // pshuflw xmm3, xmm3,0F5h
3436 // with..
3437 // pshufb xmm3, kShuffleAlpha // alpha
3438
3439 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, 3312 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
3440 uint8* dst_argb, int width) { 3313 uint8* dst_argb, int width) {
3441 asm volatile ( 3314 asm volatile (
3442 "pcmpeqb %%xmm7,%%xmm7 \n" 3315 "pcmpeqb %%xmm7,%%xmm7 \n"
3443 "psrlw $0xf,%%xmm7 \n" 3316 "psrlw $0xf,%%xmm7 \n"
3444 "pcmpeqb %%xmm6,%%xmm6 \n" 3317 "pcmpeqb %%xmm6,%%xmm6 \n"
3445 "psrlw $0x8,%%xmm6 \n" 3318 "psrlw $0x8,%%xmm6 \n"
3446 "pcmpeqb %%xmm5,%%xmm5 \n" 3319 "pcmpeqb %%xmm5,%%xmm5 \n"
3447 "psllw $0x8,%%xmm5 \n" 3320 "psllw $0x8,%%xmm5 \n"
3448 "pcmpeqb %%xmm4,%%xmm4 \n" 3321 "pcmpeqb %%xmm4,%%xmm4 \n"
(...skipping 60 matching lines...) Expand 10 before | Expand all | Expand 10 after
3509 "+r"(src_argb1), // %1 3382 "+r"(src_argb1), // %1
3510 "+r"(dst_argb), // %2 3383 "+r"(dst_argb), // %2
3511 "+r"(width) // %3 3384 "+r"(width) // %3
3512 : "m"(kShuffleAlpha) // %4 3385 : "m"(kShuffleAlpha) // %4
3513 : "memory", "cc" 3386 : "memory", "cc"
3514 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 3387 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3515 ); 3388 );
3516 } 3389 }
3517 #endif // HAS_ARGBBLENDROW_SSSE3 3390 #endif // HAS_ARGBBLENDROW_SSSE3
3518 3391
3519 #ifdef HAS_ARGBATTENUATEROW_SSE2
3520 // Attenuate 4 pixels at a time.
3521 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
3522 asm volatile (
3523 "pcmpeqb %%xmm4,%%xmm4 \n"
3524 "pslld $0x18,%%xmm4 \n"
3525 "pcmpeqb %%xmm5,%%xmm5 \n"
3526 "psrld $0x8,%%xmm5 \n"
3527
3528 // 4 pixel loop.
3529 LABELALIGN
3530 "1: \n"
3531 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3532 "punpcklbw %%xmm0,%%xmm0 \n"
3533 "pshufhw $0xff,%%xmm0,%%xmm2 \n"
3534 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
3535 "pmulhuw %%xmm2,%%xmm0 \n"
3536 "movdqu " MEMACCESS(0) ",%%xmm1 \n"
3537 "punpckhbw %%xmm1,%%xmm1 \n"
3538 "pshufhw $0xff,%%xmm1,%%xmm2 \n"
3539 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
3540 "pmulhuw %%xmm2,%%xmm1 \n"
3541 "movdqu " MEMACCESS(0) ",%%xmm2 \n"
3542 "lea " MEMLEA(0x10,0) ",%0 \n"
3543 "psrlw $0x8,%%xmm0 \n"
3544 "pand %%xmm4,%%xmm2 \n"
3545 "psrlw $0x8,%%xmm1 \n"
3546 "packuswb %%xmm1,%%xmm0 \n"
3547 "pand %%xmm5,%%xmm0 \n"
3548 "por %%xmm2,%%xmm0 \n"
3549 "movdqu %%xmm0," MEMACCESS(1) " \n"
3550 "lea " MEMLEA(0x10,1) ",%1 \n"
3551 "sub $0x4,%2 \n"
3552 "jg 1b \n"
3553 : "+r"(src_argb), // %0
3554 "+r"(dst_argb), // %1
3555 "+r"(width) // %2
3556 :
3557 : "memory", "cc"
3558 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3559 );
3560 }
3561 #endif // HAS_ARGBATTENUATEROW_SSE2
3562
3563 #ifdef HAS_ARGBATTENUATEROW_SSSE3 3392 #ifdef HAS_ARGBATTENUATEROW_SSSE3
3564 // Shuffle table duplicating alpha 3393 // Shuffle table duplicating alpha
3565 static uvec8 kShuffleAlpha0 = { 3394 static uvec8 kShuffleAlpha0 = {
3566 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u 3395 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u
3567 }; 3396 };
3568 static uvec8 kShuffleAlpha1 = { 3397 static uvec8 kShuffleAlpha1 = {
3569 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, 3398 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
3570 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u 3399 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u
3571 }; 3400 };
3572 // Attenuate 4 pixels at a time. 3401 // Attenuate 4 pixels at a time.
(...skipping 2010 matching lines...) Expand 10 before | Expand all | Expand 10 after
5583 ); 5412 );
5584 } 5413 }
5585 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 5414 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
5586 5415
5587 #endif // defined(__x86_64__) || defined(__i386__) 5416 #endif // defined(__x86_64__) || defined(__i386__)
5588 5417
5589 #ifdef __cplusplus 5418 #ifdef __cplusplus
5590 } // extern "C" 5419 } // extern "C"
5591 } // namespace libyuv 5420 } // namespace libyuv
5592 #endif 5421 #endif
OLDNEW
« include/libyuv/row.h ('K') | « source/row_any.cc ('k') | source/row_win.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698