source/row_gcc.cc - Issue 1377053003: remove sse2 functions that also have ssse3

Side by Side Diff: source/row_gcc.cc

Issue 1377053003: remove sse2 functions that also have ssse3 (Closed) Base URL: https://chromium.googlesource.com/libyuv/libyuv@master

Patch Set: lint warning fixes Created 5 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // VERSION 2	1 // VERSION 2

2 /*	2 /*

3 * Copyright 2011 The LibYuv Project Authors. All rights reserved.	3 * Copyright 2011 The LibYuv Project Authors. All rights reserved.

4 *	4 *

5 * Use of this source code is governed by a BSD-style license	5 * Use of this source code is governed by a BSD-style license

6 * that can be found in the LICENSE file in the root of the source	6 * that can be found in the LICENSE file in the root of the source

7 * tree. An additional intellectual property rights grant can be found	7 * tree. An additional intellectual property rights grant can be found

8 * in the file PATENTS. All contributing project authors may	8 * in the file PATENTS. All contributing project authors may

9 * be found in the AUTHORS file in the root of the source tree.	9 * be found in the AUTHORS file in the root of the source tree.

10 */	10 */

(...skipping 1661 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
1672 "jg 1b \n"	1672 "jg 1b \n"

1673 : [y_buf]"+r"(y_buf), // %[y_buf]	1673 : [y_buf]"+r"(y_buf), // %[y_buf]

1674 [u_buf]"+r"(u_buf), // %[u_buf]	1674 [u_buf]"+r"(u_buf), // %[u_buf]

1675 [v_buf]"+r"(v_buf), // %[v_buf]	1675 [v_buf]"+r"(v_buf), // %[v_buf]

1676 [a_buf]"+r"(a_buf), // %[a_buf]	1676 [a_buf]"+r"(a_buf), // %[a_buf]

1677 [dst_argb]"+r"(dst_argb), // %[dst_argb]	1677 [dst_argb]"+r"(dst_argb), // %[dst_argb]

1678 #if defined(__i386__) && defined(__pic__)	1678 #if defined(__i386__) && defined(__pic__)

1679 [width]"+m"(width) // %[width]	1679 [width]"+m"(width) // %[width]

1680 #else	1680 #else

1681 [width]"+rm"(width) // %[width]	1681 [width]"+rm"(width) // %[width]

1682 #endif	1682 #endif

1683 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]	1683 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]

1684 : "memory", "cc", NACL_R14	1684 : "memory", "cc", NACL_R14

1685 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"	1685 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"

1686 );	1686 );

1687 }	1687 }

1688	1688

1689 void OMITFP I422AlphaToABGRRow_SSSE3(const uint8* y_buf,	1689 void OMITFP I422AlphaToABGRRow_SSSE3(const uint8* y_buf,

1690 const uint8* u_buf,	1690 const uint8* u_buf,

1691 const uint8* v_buf,	1691 const uint8* v_buf,

1692 const uint8* a_buf,	1692 const uint8* a_buf,

(...skipping 11 matching lines...) Expand all Loading...
1704 "jg 1b \n"	1704 "jg 1b \n"

1705 : [y_buf]"+r"(y_buf), // %[y_buf]	1705 : [y_buf]"+r"(y_buf), // %[y_buf]

1706 [u_buf]"+r"(u_buf), // %[u_buf]	1706 [u_buf]"+r"(u_buf), // %[u_buf]

1707 [v_buf]"+r"(v_buf), // %[v_buf]	1707 [v_buf]"+r"(v_buf), // %[v_buf]

1708 [a_buf]"+r"(a_buf), // %[a_buf]	1708 [a_buf]"+r"(a_buf), // %[a_buf]

1709 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr]	1709 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr]

1710 #if defined(__i386__) && defined(__pic__)	1710 #if defined(__i386__) && defined(__pic__)

1711 [width]"+m"(width) // %[width]	1711 [width]"+m"(width) // %[width]

1712 #else	1712 #else

1713 [width]"+rm"(width) // %[width]	1713 [width]"+rm"(width) // %[width]

1714 #endif	1714 #endif

1715 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]	1715 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]

1716 : "memory", "cc", NACL_R14	1716 : "memory", "cc", NACL_R14

1717 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"	1717 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"

1718 );	1718 );

1719 }	1719 }

1720	1720

1721 void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,	1721 void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,

1722 const uint8* u_buf,	1722 const uint8* u_buf,

1723 const uint8* v_buf,	1723 const uint8* v_buf,

1724 uint8* dst_argb,	1724 uint8* dst_argb,

(...skipping 255 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
1980	1980

1981 // Store 16 ARGB values.	1981 // Store 16 ARGB values.

1982 #define STOREARGB_AVX2 \	1982 #define STOREARGB_AVX2 \

1983 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \	1983 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \

1984 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \	1984 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \

1985 "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \	1985 "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \

1986 "vpermq $0xd8,%%ymm2,%%ymm2 \n" \	1986 "vpermq $0xd8,%%ymm2,%%ymm2 \n" \

1987 "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \	1987 "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \

1988 "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \	1988 "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \

1989 "vmovdqu %%ymm1," MEMACCESS([dst_argb]) " \n" \	1989 "vmovdqu %%ymm1," MEMACCESS([dst_argb]) " \n" \

1990 "vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) " \n" \	1990 "vmovdqu %%ymm0," MEMACCESS2(0x20, [dst_argb]) " \n" \

1991 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"	1991 "lea " MEMLEA(0x40, [dst_argb]) ", %[dst_argb] \n"

1992	1992

1993 // Store 16 ABGR values.	1993 // Store 16 ABGR values.

1994 #define STOREABGR_AVX2 \	1994 #define STOREABGR_AVX2 \

1995 "vpunpcklbw %%ymm1,%%ymm2,%%ymm1 \n" \	1995 "vpunpcklbw %%ymm1,%%ymm2,%%ymm1 \n" \

1996 "vpermq $0xd8,%%ymm1,%%ymm1 \n" \	1996 "vpermq $0xd8,%%ymm1,%%ymm1 \n" \

1997 "vpunpcklbw %%ymm5,%%ymm0,%%ymm2 \n" \	1997 "vpunpcklbw %%ymm5,%%ymm0,%%ymm2 \n" \

1998 "vpermq $0xd8,%%ymm2,%%ymm2 \n" \	1998 "vpermq $0xd8,%%ymm2,%%ymm2 \n" \

1999 "vpunpcklwd %%ymm2,%%ymm1,%%ymm0 \n" \	1999 "vpunpcklwd %%ymm2,%%ymm1,%%ymm0 \n" \

2000 "vpunpckhwd %%ymm2,%%ymm1,%%ymm1 \n" \	2000 "vpunpckhwd %%ymm2,%%ymm1,%%ymm1 \n" \

2001 "vmovdqu %%ymm0," MEMACCESS([dst_abgr]) " \n" \	2001 "vmovdqu %%ymm0," MEMACCESS([dst_abgr]) " \n" \

2002 "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_abgr]) " \n" \	2002 "vmovdqu %%ymm1," MEMACCESS2(0x20, [dst_abgr]) " \n" \

2003 "lea " MEMLEA(0x40,[dst_abgr]) ",%[dst_abgr] \n"	2003 "lea " MEMLEA(0x40, [dst_abgr]) ", %[dst_abgr] \n"

2004	2004

2005 #if defined(HAS_I422TOBGRAROW_AVX2)	2005 #if defined(HAS_I422TOBGRAROW_AVX2)

2006 // 16 pixels	2006 // 16 pixels

2007 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes).	2007 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes).

2008 void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf,	2008 void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf,

2009 const uint8* u_buf,	2009 const uint8* u_buf,

2010 const uint8* v_buf,	2010 const uint8* v_buf,

2011 uint8* dst_bgra,	2011 uint8* dst_bgra,

2012 struct YuvConstants* yuvconstants,	2012 struct YuvConstants* yuvconstants,

2013 int width) {	2013 int width) {

2014 asm volatile (	2014 asm volatile (

2015 "sub %[u_buf],%[v_buf] \n"	2015 "sub %[u_buf],%[v_buf] \n"

2016 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"	2016 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"

2017 LABELALIGN	2017 LABELALIGN

2018 "1: \n"	2018 "1: \n"

2019 READYUV422_AVX2	2019 READYUV422_AVX2

2020 YUVTORGB_AVX2(yuvconstants)	2020 YUVTORGB_AVX2(yuvconstants)

2021	2021

2022 // Step 3: Weave into BGRA	2022 // Step 3: Weave into BGRA

2023 "vpunpcklbw %%ymm0,%%ymm1,%%ymm1 \n" // GB	2023 "vpunpcklbw %%ymm0,%%ymm1,%%ymm1 \n" // GB

2024 "vpermq $0xd8,%%ymm1,%%ymm1 \n"	2024 "vpermq $0xd8,%%ymm1,%%ymm1 \n"

2025 "vpunpcklbw %%ymm2,%%ymm5,%%ymm2 \n" // AR	2025 "vpunpcklbw %%ymm2,%%ymm5,%%ymm2 \n" // AR

2026 "vpermq $0xd8,%%ymm2,%%ymm2 \n"	2026 "vpermq $0xd8,%%ymm2,%%ymm2 \n"

2027 "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" // ARGB first 8 pixels	2027 "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" // ARGB first 8 pixels

2028 "vpunpckhwd %%ymm1,%%ymm2,%%ymm2 \n" // ARGB next 8 pixels	2028 "vpunpckhwd %%ymm1,%%ymm2,%%ymm2 \n" // ARGB next 8 pixels

2029

2030 "vmovdqu %%ymm0," MEMACCESS([dst_bgra]) "\n"	2029 "vmovdqu %%ymm0," MEMACCESS([dst_bgra]) "\n"

2031 "vmovdqu %%ymm2," MEMACCESS2(0x20,[dst_bgra]) "\n"	2030 "vmovdqu %%ymm2," MEMACCESS2(0x20,[dst_bgra]) "\n"

2032 "lea " MEMLEA(0x40,[dst_bgra]) ",%[dst_bgra] \n"	2031 "lea " MEMLEA(0x40,[dst_bgra]) ",%[dst_bgra] \n"

2033 "sub $0x10,%[width] \n"	2032 "sub $0x10,%[width] \n"

2034 "jg 1b \n"	2033 "jg 1b \n"

2035 "vzeroupper \n"	2034 "vzeroupper \n"

2036 : [y_buf]"+r"(y_buf), // %[y_buf]	2035 : [y_buf]"+r"(y_buf), // %[y_buf]

2037 [u_buf]"+r"(u_buf), // %[u_buf]	2036 [u_buf]"+r"(u_buf), // %[u_buf]

2038 [v_buf]"+r"(v_buf), // %[v_buf]	2037 [v_buf]"+r"(v_buf), // %[v_buf]

2039 [dst_bgra]"+r"(dst_bgra), // %[dst_bgra]	2038 [dst_bgra]"+r"(dst_bgra), // %[dst_bgra]

(...skipping 59 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
2099 "vzeroupper \n"	2098 "vzeroupper \n"

2100 : [y_buf]"+r"(y_buf), // %[y_buf]	2099 : [y_buf]"+r"(y_buf), // %[y_buf]

2101 [u_buf]"+r"(u_buf), // %[u_buf]	2100 [u_buf]"+r"(u_buf), // %[u_buf]

2102 [v_buf]"+r"(v_buf), // %[v_buf]	2101 [v_buf]"+r"(v_buf), // %[v_buf]

2103 [a_buf]"+r"(a_buf), // %[a_buf]	2102 [a_buf]"+r"(a_buf), // %[a_buf]

2104 [dst_argb]"+r"(dst_argb), // %[dst_argb]	2103 [dst_argb]"+r"(dst_argb), // %[dst_argb]

2105 #if defined(__i386__) && defined(__pic__)	2104 #if defined(__i386__) && defined(__pic__)

2106 [width]"+m"(width) // %[width]	2105 [width]"+m"(width) // %[width]

2107 #else	2106 #else

2108 [width]"+rm"(width) // %[width]	2107 [width]"+rm"(width) // %[width]

2109 #endif	2108 #endif

2110 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]	2109 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]

2111 : "memory", "cc", NACL_R14	2110 : "memory", "cc", NACL_R14

2112 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"	2111 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"

2113 );	2112 );

2114 }	2113 }

2115 #endif // HAS_I422ALPHATOARGBROW_AVX2	2114 #endif // HAS_I422ALPHATOARGBROW_AVX2

2116	2115

2117 #if defined(HAS_I422ALPHATOABGRROW_AVX2)	2116 #if defined(HAS_I422ALPHATOABGRROW_AVX2)

2118 // 16 pixels	2117 // 16 pixels

2119 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ABGR.	2118 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ABGR.

(...skipping 16 matching lines...) Expand all Loading...
2136 "vzeroupper \n"	2135 "vzeroupper \n"

2137 : [y_buf]"+r"(y_buf), // %[y_buf]	2136 : [y_buf]"+r"(y_buf), // %[y_buf]

2138 [u_buf]"+r"(u_buf), // %[u_buf]	2137 [u_buf]"+r"(u_buf), // %[u_buf]

2139 [v_buf]"+r"(v_buf), // %[v_buf]	2138 [v_buf]"+r"(v_buf), // %[v_buf]

2140 [a_buf]"+r"(a_buf), // %[a_buf]	2139 [a_buf]"+r"(a_buf), // %[a_buf]

2141 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr]	2140 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr]

2142 #if defined(__i386__) && defined(__pic__)	2141 #if defined(__i386__) && defined(__pic__)

2143 [width]"+m"(width) // %[width]	2142 [width]"+m"(width) // %[width]

2144 #else	2143 #else

2145 [width]"+rm"(width) // %[width]	2144 [width]"+rm"(width) // %[width]

2146 #endif	2145 #endif

2147 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]	2146 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]

2148 : "memory", "cc", NACL_R14	2147 : "memory", "cc", NACL_R14

2149 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"	2148 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"

2150 );	2149 );

2151 }	2150 }

2152 #endif // HAS_I422ALPHATOABGRROW_AVX2	2151 #endif // HAS_I422ALPHATOABGRROW_AVX2

2153	2152

2154 #if defined(HAS_I422TOABGRROW_AVX2)	2153 #if defined(HAS_I422TOABGRROW_AVX2)

2155 // 16 pixels	2154 // 16 pixels

2156 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).	2155 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).

(...skipping 69 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
2226 #endif // HAS_I422TORGBAROW_AVX2	2225 #endif // HAS_I422TORGBAROW_AVX2

2227	2226

2228 #if defined(HAS_NV12TOARGBROW_AVX2)	2227 #if defined(HAS_NV12TOARGBROW_AVX2)

2229 // 16 pixels.	2228 // 16 pixels.

2230 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).	2229 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).

2231 void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf,	2230 void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf,

2232 const uint8* uv_buf,	2231 const uint8* uv_buf,

2233 uint8* dst_argb,	2232 uint8* dst_argb,

2234 struct YuvConstants* yuvconstants,	2233 struct YuvConstants* yuvconstants,

2235 int width) {	2234 int width) {

2236

2237 asm volatile (	2235 asm volatile (

2238 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"	2236 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"

2239 LABELALIGN	2237 LABELALIGN

2240 "1: \n"	2238 "1: \n"

2241 READNV12_AVX2	2239 READNV12_AVX2

2242 YUVTORGB_AVX2(yuvconstants)	2240 YUVTORGB_AVX2(yuvconstants)

2243 STOREARGB_AVX2	2241 STOREARGB_AVX2

2244 "sub $0x10,%[width] \n"	2242 "sub $0x10,%[width] \n"

2245 "jg 1b \n"	2243 "jg 1b \n"

2246 "vzeroupper \n"	2244 "vzeroupper \n"

2247 : [y_buf]"+r"(y_buf), // %[y_buf]	2245 : [y_buf]"+r"(y_buf), // %[y_buf]

2248 [uv_buf]"+r"(uv_buf), // %[uv_buf]	2246 [uv_buf]"+r"(uv_buf), // %[uv_buf]

2249 [dst_argb]"+r"(dst_argb), // %[dst_argb]	2247 [dst_argb]"+r"(dst_argb), // %[dst_argb]

2250 [width]"+rm"(width) // %[width]	2248 [width]"+rm"(width) // %[width]

2251 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]	2249 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]

2252 // Does not use r14.	2250 // Does not use r14.

2253 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"	2251 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"

2254 );	2252 );

2255 }	2253 }

2256 #endif // HAS_NV12TOARGBROW_AVX2	2254 #endif // HAS_NV12TOARGBROW_AVX2

2257	2255

2258	2256

2259 #if defined(HAS_YUY2TOARGBROW_AVX2)	2257 #if defined(HAS_YUY2TOARGBROW_AVX2)

2260 // 16 pixels.	2258 // 16 pixels.

2261 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).	2259 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).

2262 void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf,	2260 void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf,

2263 uint8* dst_argb,	2261 uint8* dst_argb,

2264 struct YuvConstants* yuvconstants,	2262 struct YuvConstants* yuvconstants,

2265 int width) {	2263 int width) {

2266

2267 asm volatile (	2264 asm volatile (

2268 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"	2265 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"

2269 LABELALIGN	2266 LABELALIGN

2270 "1: \n"	2267 "1: \n"

2271 READYUY2_AVX2	2268 READYUY2_AVX2

2272 YUVTORGB_AVX2(yuvconstants)	2269 YUVTORGB_AVX2(yuvconstants)

2273 STOREARGB_AVX2	2270 STOREARGB_AVX2

2274 "sub $0x10,%[width] \n"	2271 "sub $0x10,%[width] \n"

2275 "jg 1b \n"	2272 "jg 1b \n"

2276 "vzeroupper \n"	2273 "vzeroupper \n"

2277 : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]	2274 : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]

2278 [dst_argb]"+r"(dst_argb), // %[dst_argb]	2275 [dst_argb]"+r"(dst_argb), // %[dst_argb]

2279 [width]"+rm"(width) // %[width]	2276 [width]"+rm"(width) // %[width]

2280 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]	2277 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]

2281 [kShuffleYUY2Y]"m"(kShuffleYUY2Y),	2278 [kShuffleYUY2Y]"m"(kShuffleYUY2Y),

2282 [kShuffleYUY2UV]"m"(kShuffleYUY2UV)	2279 [kShuffleYUY2UV]"m"(kShuffleYUY2UV)

2283 // Does not use r14.	2280 // Does not use r14.

2284 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"	2281 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"

2285 );	2282 );

2286 }	2283 }

2287 #endif // HAS_YUY2TOARGBROW_AVX2	2284 #endif // HAS_YUY2TOARGBROW_AVX2

2288	2285

2289 #if defined(HAS_UYVYTOARGBROW_AVX2)	2286 #if defined(HAS_UYVYTOARGBROW_AVX2)

2290 // 16 pixels.	2287 // 16 pixels.

2291 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).	2288 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).

2292 void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf,	2289 void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf,

2293 uint8* dst_argb,	2290 uint8* dst_argb,

2294 struct YuvConstants* yuvconstants,	2291 struct YuvConstants* yuvconstants,

2295 int width) {	2292 int width) {

2296

2297 asm volatile (	2293 asm volatile (

2298 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"	2294 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"

2299 LABELALIGN	2295 LABELALIGN

2300 "1: \n"	2296 "1: \n"

2301 READUYVY_AVX2	2297 READUYVY_AVX2

2302 YUVTORGB_AVX2(yuvconstants)	2298 YUVTORGB_AVX2(yuvconstants)

2303 STOREARGB_AVX2	2299 STOREARGB_AVX2

2304 "sub $0x10,%[width] \n"	2300 "sub $0x10,%[width] \n"

2305 "jg 1b \n"	2301 "jg 1b \n"

2306 "vzeroupper \n"	2302 "vzeroupper \n"

(...skipping 147 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
2454 : "+r"(src), // %0	2450 : "+r"(src), // %0

2455 "+r"(dst), // %1	2451 "+r"(dst), // %1

2456 "+r"(temp_width) // %2	2452 "+r"(temp_width) // %2

2457 : "m"(kShuffleMirror) // %3	2453 : "m"(kShuffleMirror) // %3

2458 : "memory", "cc", NACL_R14	2454 : "memory", "cc", NACL_R14

2459 "xmm0", "xmm5"	2455 "xmm0", "xmm5"

2460 );	2456 );

2461 }	2457 }

2462 #endif // HAS_MIRRORROW_AVX2	2458 #endif // HAS_MIRRORROW_AVX2

2463	2459

2464 #ifdef HAS_MIRRORROW_SSE2

2465 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {

2466 intptr_t temp_width = (intptr_t)(width);

2467 asm volatile (

2468 LABELALIGN

2469 "1: \n"

2470 MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0

2471 "movdqa %%xmm0,%%xmm1 \n"

2472 "psllw $0x8,%%xmm0 \n"

2473 "psrlw $0x8,%%xmm1 \n"

2474 "por %%xmm1,%%xmm0 \n"

2475 "pshuflw $0x1b,%%xmm0,%%xmm0 \n"

2476 "pshufhw $0x1b,%%xmm0,%%xmm0 \n"

2477 "pshufd $0x4e,%%xmm0,%%xmm0 \n"

2478 "movdqu %%xmm0," MEMACCESS(1) " \n"

2479 "lea " MEMLEA(0x10,1)",%1 \n"

2480 "sub $0x10,%2 \n"

2481 "jg 1b \n"

2482 : "+r"(src), // %0

2483 "+r"(dst), // %1

2484 "+r"(temp_width) // %2

2485 :

2486 : "memory", "cc", NACL_R14

2487 "xmm0", "xmm1"

2488 );

2489 }

2490 #endif // HAS_MIRRORROW_SSE2

2491

2492 #ifdef HAS_MIRRORROW_UV_SSSE3	2460 #ifdef HAS_MIRRORROW_UV_SSSE3

2493 // Shuffle table for reversing the bytes of UV channels.	2461 // Shuffle table for reversing the bytes of UV channels.

2494 static uvec8 kShuffleMirrorUV = {	2462 static uvec8 kShuffleMirrorUV = {

2495 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u	2463 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u

2496 };	2464 };

2497 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,	2465 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,

2498 int width) {	2466 int width) {

2499 intptr_t temp_width = (intptr_t)(width);	2467 intptr_t temp_width = (intptr_t)(width);

2500 asm volatile (	2468 asm volatile (

2501 "movdqa %4,%%xmm1 \n"	2469 "movdqa %4,%%xmm1 \n"

(...skipping 824 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
3326 "+r"(dst_u), // %1	3294 "+r"(dst_u), // %1

3327 "+r"(dst_v), // %2	3295 "+r"(dst_v), // %2

3328 "+r"(pix) // %3	3296 "+r"(pix) // %3

3329 :	3297 :

3330 : "memory", "cc", NACL_R14	3298 : "memory", "cc", NACL_R14

3331 "xmm0", "xmm1", "xmm5"	3299 "xmm0", "xmm1", "xmm5"

3332 );	3300 );

3333 }	3301 }

3334 #endif // HAS_YUY2TOYROW_AVX2	3302 #endif // HAS_YUY2TOYROW_AVX2

3335	3303

3336 #ifdef HAS_ARGBBLENDROW_SSE2

3337 // Blend 4 pixels at a time.

3338 void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,

3339 uint8* dst_argb, int width) {

3340 asm volatile (

3341 "pcmpeqb %%xmm7,%%xmm7 \n"

3342 "psrlw $0xf,%%xmm7 \n"

3343 "pcmpeqb %%xmm6,%%xmm6 \n"

3344 "psrlw $0x8,%%xmm6 \n"

3345 "pcmpeqb %%xmm5,%%xmm5 \n"

3346 "psllw $0x8,%%xmm5 \n"

3347 "pcmpeqb %%xmm4,%%xmm4 \n"

3348 "pslld $0x18,%%xmm4 \n"

3349 "sub $0x4,%3 \n"

3350 "jl 49f \n"

3351

3352 // 4 pixel loop.

3353 LABELALIGN

3354 "41: \n"

3355 "movdqu " MEMACCESS(0) ",%%xmm3 \n"

3356 "lea " MEMLEA(0x10,0) ",%0 \n"

3357 "movdqa %%xmm3,%%xmm0 \n"

3358 "pxor %%xmm4,%%xmm3 \n"

3359 "movdqu " MEMACCESS(1) ",%%xmm2 \n"

3360 "psrlw $0x8,%%xmm3 \n"

3361 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"

3362 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"

3363 "pand %%xmm6,%%xmm2 \n"

3364 "paddw %%xmm7,%%xmm3 \n"

3365 "pmullw %%xmm3,%%xmm2 \n"

3366 "movdqu " MEMACCESS(1) ",%%xmm1 \n"

3367 "lea " MEMLEA(0x10,1) ",%1 \n"

3368 "psrlw $0x8,%%xmm1 \n"

3369 "por %%xmm4,%%xmm0 \n"

3370 "pmullw %%xmm3,%%xmm1 \n"

3371 "psrlw $0x8,%%xmm2 \n"

3372 "paddusb %%xmm2,%%xmm0 \n"

3373 "pand %%xmm5,%%xmm1 \n"

3374 "paddusb %%xmm1,%%xmm0 \n"

3375 "movdqu %%xmm0," MEMACCESS(2) " \n"

3376 "lea " MEMLEA(0x10,2) ",%2 \n"

3377 "sub $0x4,%3 \n"

3378 "jge 41b \n"

3379

3380 "49: \n"

3381 "add $0x3,%3 \n"

3382 "jl 99f \n"

3383

3384 // 1 pixel loop.

3385 "91: \n"

3386 "movd " MEMACCESS(0) ",%%xmm3 \n"

3387 "lea " MEMLEA(0x4,0) ",%0 \n"

3388 "movdqa %%xmm3,%%xmm0 \n"

3389 "pxor %%xmm4,%%xmm3 \n"

3390 "movd " MEMACCESS(1) ",%%xmm2 \n"

3391 "psrlw $0x8,%%xmm3 \n"

3392 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"

3393 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"

3394 "pand %%xmm6,%%xmm2 \n"

3395 "paddw %%xmm7,%%xmm3 \n"

3396 "pmullw %%xmm3,%%xmm2 \n"

3397 "movd " MEMACCESS(1) ",%%xmm1 \n"

3398 "lea " MEMLEA(0x4,1) ",%1 \n"

3399 "psrlw $0x8,%%xmm1 \n"

3400 "por %%xmm4,%%xmm0 \n"

3401 "pmullw %%xmm3,%%xmm1 \n"

3402 "psrlw $0x8,%%xmm2 \n"

3403 "paddusb %%xmm2,%%xmm0 \n"

3404 "pand %%xmm5,%%xmm1 \n"

3405 "paddusb %%xmm1,%%xmm0 \n"

3406 "movd %%xmm0," MEMACCESS(2) " \n"

3407 "lea " MEMLEA(0x4,2) ",%2 \n"

3408 "sub $0x1,%3 \n"

3409 "jge 91b \n"

3410 "99: \n"

3411 : "+r"(src_argb0), // %0

3412 "+r"(src_argb1), // %1

3413 "+r"(dst_argb), // %2

3414 "+r"(width) // %3

3415 :

3416 : "memory", "cc"

3417 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"

3418 );

3419 }

3420 #endif // HAS_ARGBBLENDROW_SSE2

3421

3422 #ifdef HAS_ARGBBLENDROW_SSSE3	3304 #ifdef HAS_ARGBBLENDROW_SSSE3

3423 // Shuffle table for isolating alpha.	3305 // Shuffle table for isolating alpha.

3424 static uvec8 kShuffleAlpha = {	3306 static uvec8 kShuffleAlpha = {

3425 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,	3307 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,

3426 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80	3308 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80

3427 };	3309 };

3428	3310

3429 // Blend 8 pixels at a time	3311 // Blend 8 pixels at a time

3430 // Shuffle table for reversing the bytes.

3431

3432 // Same as SSE2, but replaces

3433 // psrlw xmm3, 8 // alpha

3434 // pshufhw xmm3, xmm3,0F5h // 8 alpha words

3435 // pshuflw xmm3, xmm3,0F5h

3436 // with..

3437 // pshufb xmm3, kShuffleAlpha // alpha

3438

3439 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,	3312 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,

3440 uint8* dst_argb, int width) {	3313 uint8* dst_argb, int width) {

3441 asm volatile (	3314 asm volatile (

3442 "pcmpeqb %%xmm7,%%xmm7 \n"	3315 "pcmpeqb %%xmm7,%%xmm7 \n"

3443 "psrlw $0xf,%%xmm7 \n"	3316 "psrlw $0xf,%%xmm7 \n"

3444 "pcmpeqb %%xmm6,%%xmm6 \n"	3317 "pcmpeqb %%xmm6,%%xmm6 \n"

3445 "psrlw $0x8,%%xmm6 \n"	3318 "psrlw $0x8,%%xmm6 \n"

3446 "pcmpeqb %%xmm5,%%xmm5 \n"	3319 "pcmpeqb %%xmm5,%%xmm5 \n"

3447 "psllw $0x8,%%xmm5 \n"	3320 "psllw $0x8,%%xmm5 \n"

3448 "pcmpeqb %%xmm4,%%xmm4 \n"	3321 "pcmpeqb %%xmm4,%%xmm4 \n"

(...skipping 60 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
3509 "+r"(src_argb1), // %1	3382 "+r"(src_argb1), // %1

3510 "+r"(dst_argb), // %2	3383 "+r"(dst_argb), // %2

3511 "+r"(width) // %3	3384 "+r"(width) // %3

3512 : "m"(kShuffleAlpha) // %4	3385 : "m"(kShuffleAlpha) // %4

3513 : "memory", "cc"	3386 : "memory", "cc"

3514 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"	3387 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"

3515 );	3388 );

3516 }	3389 }

3517 #endif // HAS_ARGBBLENDROW_SSSE3	3390 #endif // HAS_ARGBBLENDROW_SSSE3

3518	3391

3519 #ifdef HAS_ARGBATTENUATEROW_SSE2

3520 // Attenuate 4 pixels at a time.

3521 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {

3522 asm volatile (

3523 "pcmpeqb %%xmm4,%%xmm4 \n"

3524 "pslld $0x18,%%xmm4 \n"

3525 "pcmpeqb %%xmm5,%%xmm5 \n"

3526 "psrld $0x8,%%xmm5 \n"

3527

3528 // 4 pixel loop.

3529 LABELALIGN

3530 "1: \n"

3531 "movdqu " MEMACCESS(0) ",%%xmm0 \n"

3532 "punpcklbw %%xmm0,%%xmm0 \n"

3533 "pshufhw $0xff,%%xmm0,%%xmm2 \n"

3534 "pshuflw $0xff,%%xmm2,%%xmm2 \n"

3535 "pmulhuw %%xmm2,%%xmm0 \n"

3536 "movdqu " MEMACCESS(0) ",%%xmm1 \n"

3537 "punpckhbw %%xmm1,%%xmm1 \n"

3538 "pshufhw $0xff,%%xmm1,%%xmm2 \n"

3539 "pshuflw $0xff,%%xmm2,%%xmm2 \n"

3540 "pmulhuw %%xmm2,%%xmm1 \n"

3541 "movdqu " MEMACCESS(0) ",%%xmm2 \n"

3542 "lea " MEMLEA(0x10,0) ",%0 \n"

3543 "psrlw $0x8,%%xmm0 \n"

3544 "pand %%xmm4,%%xmm2 \n"

3545 "psrlw $0x8,%%xmm1 \n"

3546 "packuswb %%xmm1,%%xmm0 \n"

3547 "pand %%xmm5,%%xmm0 \n"

3548 "por %%xmm2,%%xmm0 \n"

3549 "movdqu %%xmm0," MEMACCESS(1) " \n"

3550 "lea " MEMLEA(0x10,1) ",%1 \n"

3551 "sub $0x4,%2 \n"

3552 "jg 1b \n"

3553 : "+r"(src_argb), // %0

3554 "+r"(dst_argb), // %1

3555 "+r"(width) // %2

3556 :

3557 : "memory", "cc"

3558 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"

3559 );

3560 }

3561 #endif // HAS_ARGBATTENUATEROW_SSE2

3562

3563 #ifdef HAS_ARGBATTENUATEROW_SSSE3	3392 #ifdef HAS_ARGBATTENUATEROW_SSSE3

3564 // Shuffle table duplicating alpha	3393 // Shuffle table duplicating alpha

3565 static uvec8 kShuffleAlpha0 = {	3394 static uvec8 kShuffleAlpha0 = {

3566 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u	3395 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u

3567 };	3396 };

3568 static uvec8 kShuffleAlpha1 = {	3397 static uvec8 kShuffleAlpha1 = {

3569 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,	3398 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,

3570 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u	3399 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u

3571 };	3400 };

3572 // Attenuate 4 pixels at a time.	3401 // Attenuate 4 pixels at a time.

(...skipping 2010 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
5583 );	5412 );

5584 }	5413 }

5585 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3	5414 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3

5586	5415

5587 #endif // defined(__x86_64__) || defined(__i386__)	5416 #endif // defined(__x86_64__) || defined(__i386__)

5588	5417

5589 #ifdef __cplusplus	5418 #ifdef __cplusplus

5590 } // extern "C"	5419 } // extern "C"

5591 } // namespace libyuv	5420 } // namespace libyuv

5592 #endif	5421 #endif

OLD	NEW

« include/libyuv/row.h ('K') | « source/row_any.cc ('k') | source/row_win.cc » ('j') | no next file with comments »