OLD | NEW |
1 // VERSION 2 | 1 // VERSION 2 |
2 /* | 2 /* |
3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
4 * | 4 * |
5 * Use of this source code is governed by a BSD-style license | 5 * Use of this source code is governed by a BSD-style license |
6 * that can be found in the LICENSE file in the root of the source | 6 * that can be found in the LICENSE file in the root of the source |
7 * tree. An additional intellectual property rights grant can be found | 7 * tree. An additional intellectual property rights grant can be found |
8 * in the file PATENTS. All contributing project authors may | 8 * in the file PATENTS. All contributing project authors may |
9 * be found in the AUTHORS file in the root of the source tree. | 9 * be found in the AUTHORS file in the root of the source tree. |
10 */ | 10 */ |
(...skipping 1661 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1672 "jg 1b \n" | 1672 "jg 1b \n" |
1673 : [y_buf]"+r"(y_buf), // %[y_buf] | 1673 : [y_buf]"+r"(y_buf), // %[y_buf] |
1674 [u_buf]"+r"(u_buf), // %[u_buf] | 1674 [u_buf]"+r"(u_buf), // %[u_buf] |
1675 [v_buf]"+r"(v_buf), // %[v_buf] | 1675 [v_buf]"+r"(v_buf), // %[v_buf] |
1676 [a_buf]"+r"(a_buf), // %[a_buf] | 1676 [a_buf]"+r"(a_buf), // %[a_buf] |
1677 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 1677 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
1678 #if defined(__i386__) && defined(__pic__) | 1678 #if defined(__i386__) && defined(__pic__) |
1679 [width]"+m"(width) // %[width] | 1679 [width]"+m"(width) // %[width] |
1680 #else | 1680 #else |
1681 [width]"+rm"(width) // %[width] | 1681 [width]"+rm"(width) // %[width] |
1682 #endif | 1682 #endif |
1683 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | 1683 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
1684 : "memory", "cc", NACL_R14 | 1684 : "memory", "cc", NACL_R14 |
1685 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 1685 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
1686 ); | 1686 ); |
1687 } | 1687 } |
1688 | 1688 |
1689 void OMITFP I422AlphaToABGRRow_SSSE3(const uint8* y_buf, | 1689 void OMITFP I422AlphaToABGRRow_SSSE3(const uint8* y_buf, |
1690 const uint8* u_buf, | 1690 const uint8* u_buf, |
1691 const uint8* v_buf, | 1691 const uint8* v_buf, |
1692 const uint8* a_buf, | 1692 const uint8* a_buf, |
(...skipping 11 matching lines...) Expand all Loading... |
1704 "jg 1b \n" | 1704 "jg 1b \n" |
1705 : [y_buf]"+r"(y_buf), // %[y_buf] | 1705 : [y_buf]"+r"(y_buf), // %[y_buf] |
1706 [u_buf]"+r"(u_buf), // %[u_buf] | 1706 [u_buf]"+r"(u_buf), // %[u_buf] |
1707 [v_buf]"+r"(v_buf), // %[v_buf] | 1707 [v_buf]"+r"(v_buf), // %[v_buf] |
1708 [a_buf]"+r"(a_buf), // %[a_buf] | 1708 [a_buf]"+r"(a_buf), // %[a_buf] |
1709 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] | 1709 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] |
1710 #if defined(__i386__) && defined(__pic__) | 1710 #if defined(__i386__) && defined(__pic__) |
1711 [width]"+m"(width) // %[width] | 1711 [width]"+m"(width) // %[width] |
1712 #else | 1712 #else |
1713 [width]"+rm"(width) // %[width] | 1713 [width]"+rm"(width) // %[width] |
1714 #endif | 1714 #endif |
1715 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | 1715 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
1716 : "memory", "cc", NACL_R14 | 1716 : "memory", "cc", NACL_R14 |
1717 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 1717 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
1718 ); | 1718 ); |
1719 } | 1719 } |
1720 | 1720 |
1721 void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, | 1721 void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, |
1722 const uint8* u_buf, | 1722 const uint8* u_buf, |
1723 const uint8* v_buf, | 1723 const uint8* v_buf, |
1724 uint8* dst_argb, | 1724 uint8* dst_argb, |
(...skipping 255 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1980 | 1980 |
1981 // Store 16 ARGB values. | 1981 // Store 16 ARGB values. |
1982 #define STOREARGB_AVX2 \ | 1982 #define STOREARGB_AVX2 \ |
1983 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ | 1983 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ |
1984 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ | 1984 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ |
1985 "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \ | 1985 "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \ |
1986 "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ | 1986 "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ |
1987 "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \ | 1987 "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \ |
1988 "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \ | 1988 "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \ |
1989 "vmovdqu %%ymm1," MEMACCESS([dst_argb]) " \n" \ | 1989 "vmovdqu %%ymm1," MEMACCESS([dst_argb]) " \n" \ |
1990 "vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) " \n" \ | 1990 "vmovdqu %%ymm0," MEMACCESS2(0x20, [dst_argb]) " \n" \ |
1991 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" | 1991 "lea " MEMLEA(0x40, [dst_argb]) ", %[dst_argb] \n" |
1992 | 1992 |
1993 // Store 16 ABGR values. | 1993 // Store 16 ABGR values. |
1994 #define STOREABGR_AVX2 \ | 1994 #define STOREABGR_AVX2 \ |
1995 "vpunpcklbw %%ymm1,%%ymm2,%%ymm1 \n" \ | 1995 "vpunpcklbw %%ymm1,%%ymm2,%%ymm1 \n" \ |
1996 "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ | 1996 "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ |
1997 "vpunpcklbw %%ymm5,%%ymm0,%%ymm2 \n" \ | 1997 "vpunpcklbw %%ymm5,%%ymm0,%%ymm2 \n" \ |
1998 "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ | 1998 "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ |
1999 "vpunpcklwd %%ymm2,%%ymm1,%%ymm0 \n" \ | 1999 "vpunpcklwd %%ymm2,%%ymm1,%%ymm0 \n" \ |
2000 "vpunpckhwd %%ymm2,%%ymm1,%%ymm1 \n" \ | 2000 "vpunpckhwd %%ymm2,%%ymm1,%%ymm1 \n" \ |
2001 "vmovdqu %%ymm0," MEMACCESS([dst_abgr]) " \n" \ | 2001 "vmovdqu %%ymm0," MEMACCESS([dst_abgr]) " \n" \ |
2002 "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_abgr]) " \n" \ | 2002 "vmovdqu %%ymm1," MEMACCESS2(0x20, [dst_abgr]) " \n" \ |
2003 "lea " MEMLEA(0x40,[dst_abgr]) ",%[dst_abgr] \n" | 2003 "lea " MEMLEA(0x40, [dst_abgr]) ", %[dst_abgr] \n" |
2004 | 2004 |
2005 #if defined(HAS_I422TOBGRAROW_AVX2) | 2005 #if defined(HAS_I422TOBGRAROW_AVX2) |
2006 // 16 pixels | 2006 // 16 pixels |
2007 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). | 2007 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). |
2008 void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf, | 2008 void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf, |
2009 const uint8* u_buf, | 2009 const uint8* u_buf, |
2010 const uint8* v_buf, | 2010 const uint8* v_buf, |
2011 uint8* dst_bgra, | 2011 uint8* dst_bgra, |
2012 struct YuvConstants* yuvconstants, | 2012 struct YuvConstants* yuvconstants, |
2013 int width) { | 2013 int width) { |
2014 asm volatile ( | 2014 asm volatile ( |
2015 "sub %[u_buf],%[v_buf] \n" | 2015 "sub %[u_buf],%[v_buf] \n" |
2016 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 2016 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
2017 LABELALIGN | 2017 LABELALIGN |
2018 "1: \n" | 2018 "1: \n" |
2019 READYUV422_AVX2 | 2019 READYUV422_AVX2 |
2020 YUVTORGB_AVX2(yuvconstants) | 2020 YUVTORGB_AVX2(yuvconstants) |
2021 | 2021 |
2022 // Step 3: Weave into BGRA | 2022 // Step 3: Weave into BGRA |
2023 "vpunpcklbw %%ymm0,%%ymm1,%%ymm1 \n" // GB | 2023 "vpunpcklbw %%ymm0,%%ymm1,%%ymm1 \n" // GB |
2024 "vpermq $0xd8,%%ymm1,%%ymm1 \n" | 2024 "vpermq $0xd8,%%ymm1,%%ymm1 \n" |
2025 "vpunpcklbw %%ymm2,%%ymm5,%%ymm2 \n" // AR | 2025 "vpunpcklbw %%ymm2,%%ymm5,%%ymm2 \n" // AR |
2026 "vpermq $0xd8,%%ymm2,%%ymm2 \n" | 2026 "vpermq $0xd8,%%ymm2,%%ymm2 \n" |
2027 "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" // ARGB first 8 pixels | 2027 "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" // ARGB first 8 pixels |
2028 "vpunpckhwd %%ymm1,%%ymm2,%%ymm2 \n" // ARGB next 8 pixels | 2028 "vpunpckhwd %%ymm1,%%ymm2,%%ymm2 \n" // ARGB next 8 pixels |
2029 | |
2030 "vmovdqu %%ymm0," MEMACCESS([dst_bgra]) "\n" | 2029 "vmovdqu %%ymm0," MEMACCESS([dst_bgra]) "\n" |
2031 "vmovdqu %%ymm2," MEMACCESS2(0x20,[dst_bgra]) "\n" | 2030 "vmovdqu %%ymm2," MEMACCESS2(0x20,[dst_bgra]) "\n" |
2032 "lea " MEMLEA(0x40,[dst_bgra]) ",%[dst_bgra] \n" | 2031 "lea " MEMLEA(0x40,[dst_bgra]) ",%[dst_bgra] \n" |
2033 "sub $0x10,%[width] \n" | 2032 "sub $0x10,%[width] \n" |
2034 "jg 1b \n" | 2033 "jg 1b \n" |
2035 "vzeroupper \n" | 2034 "vzeroupper \n" |
2036 : [y_buf]"+r"(y_buf), // %[y_buf] | 2035 : [y_buf]"+r"(y_buf), // %[y_buf] |
2037 [u_buf]"+r"(u_buf), // %[u_buf] | 2036 [u_buf]"+r"(u_buf), // %[u_buf] |
2038 [v_buf]"+r"(v_buf), // %[v_buf] | 2037 [v_buf]"+r"(v_buf), // %[v_buf] |
2039 [dst_bgra]"+r"(dst_bgra), // %[dst_bgra] | 2038 [dst_bgra]"+r"(dst_bgra), // %[dst_bgra] |
(...skipping 59 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2099 "vzeroupper \n" | 2098 "vzeroupper \n" |
2100 : [y_buf]"+r"(y_buf), // %[y_buf] | 2099 : [y_buf]"+r"(y_buf), // %[y_buf] |
2101 [u_buf]"+r"(u_buf), // %[u_buf] | 2100 [u_buf]"+r"(u_buf), // %[u_buf] |
2102 [v_buf]"+r"(v_buf), // %[v_buf] | 2101 [v_buf]"+r"(v_buf), // %[v_buf] |
2103 [a_buf]"+r"(a_buf), // %[a_buf] | 2102 [a_buf]"+r"(a_buf), // %[a_buf] |
2104 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 2103 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
2105 #if defined(__i386__) && defined(__pic__) | 2104 #if defined(__i386__) && defined(__pic__) |
2106 [width]"+m"(width) // %[width] | 2105 [width]"+m"(width) // %[width] |
2107 #else | 2106 #else |
2108 [width]"+rm"(width) // %[width] | 2107 [width]"+rm"(width) // %[width] |
2109 #endif | 2108 #endif |
2110 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | 2109 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
2111 : "memory", "cc", NACL_R14 | 2110 : "memory", "cc", NACL_R14 |
2112 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 2111 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
2113 ); | 2112 ); |
2114 } | 2113 } |
2115 #endif // HAS_I422ALPHATOARGBROW_AVX2 | 2114 #endif // HAS_I422ALPHATOARGBROW_AVX2 |
2116 | 2115 |
2117 #if defined(HAS_I422ALPHATOABGRROW_AVX2) | 2116 #if defined(HAS_I422ALPHATOABGRROW_AVX2) |
2118 // 16 pixels | 2117 // 16 pixels |
2119 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ABGR. | 2118 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ABGR. |
(...skipping 16 matching lines...) Expand all Loading... |
2136 "vzeroupper \n" | 2135 "vzeroupper \n" |
2137 : [y_buf]"+r"(y_buf), // %[y_buf] | 2136 : [y_buf]"+r"(y_buf), // %[y_buf] |
2138 [u_buf]"+r"(u_buf), // %[u_buf] | 2137 [u_buf]"+r"(u_buf), // %[u_buf] |
2139 [v_buf]"+r"(v_buf), // %[v_buf] | 2138 [v_buf]"+r"(v_buf), // %[v_buf] |
2140 [a_buf]"+r"(a_buf), // %[a_buf] | 2139 [a_buf]"+r"(a_buf), // %[a_buf] |
2141 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] | 2140 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] |
2142 #if defined(__i386__) && defined(__pic__) | 2141 #if defined(__i386__) && defined(__pic__) |
2143 [width]"+m"(width) // %[width] | 2142 [width]"+m"(width) // %[width] |
2144 #else | 2143 #else |
2145 [width]"+rm"(width) // %[width] | 2144 [width]"+rm"(width) // %[width] |
2146 #endif | 2145 #endif |
2147 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | 2146 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
2148 : "memory", "cc", NACL_R14 | 2147 : "memory", "cc", NACL_R14 |
2149 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 2148 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
2150 ); | 2149 ); |
2151 } | 2150 } |
2152 #endif // HAS_I422ALPHATOABGRROW_AVX2 | 2151 #endif // HAS_I422ALPHATOABGRROW_AVX2 |
2153 | 2152 |
2154 #if defined(HAS_I422TOABGRROW_AVX2) | 2153 #if defined(HAS_I422TOABGRROW_AVX2) |
2155 // 16 pixels | 2154 // 16 pixels |
2156 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). | 2155 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). |
(...skipping 69 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2226 #endif // HAS_I422TORGBAROW_AVX2 | 2225 #endif // HAS_I422TORGBAROW_AVX2 |
2227 | 2226 |
2228 #if defined(HAS_NV12TOARGBROW_AVX2) | 2227 #if defined(HAS_NV12TOARGBROW_AVX2) |
2229 // 16 pixels. | 2228 // 16 pixels. |
2230 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). | 2229 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
2231 void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf, | 2230 void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf, |
2232 const uint8* uv_buf, | 2231 const uint8* uv_buf, |
2233 uint8* dst_argb, | 2232 uint8* dst_argb, |
2234 struct YuvConstants* yuvconstants, | 2233 struct YuvConstants* yuvconstants, |
2235 int width) { | 2234 int width) { |
2236 | |
2237 asm volatile ( | 2235 asm volatile ( |
2238 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 2236 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
2239 LABELALIGN | 2237 LABELALIGN |
2240 "1: \n" | 2238 "1: \n" |
2241 READNV12_AVX2 | 2239 READNV12_AVX2 |
2242 YUVTORGB_AVX2(yuvconstants) | 2240 YUVTORGB_AVX2(yuvconstants) |
2243 STOREARGB_AVX2 | 2241 STOREARGB_AVX2 |
2244 "sub $0x10,%[width] \n" | 2242 "sub $0x10,%[width] \n" |
2245 "jg 1b \n" | 2243 "jg 1b \n" |
2246 "vzeroupper \n" | 2244 "vzeroupper \n" |
2247 : [y_buf]"+r"(y_buf), // %[y_buf] | 2245 : [y_buf]"+r"(y_buf), // %[y_buf] |
2248 [uv_buf]"+r"(uv_buf), // %[uv_buf] | 2246 [uv_buf]"+r"(uv_buf), // %[uv_buf] |
2249 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 2247 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
2250 [width]"+rm"(width) // %[width] | 2248 [width]"+rm"(width) // %[width] |
2251 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | 2249 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
2252 // Does not use r14. | 2250 // Does not use r14. |
2253 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 2251 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
2254 ); | 2252 ); |
2255 } | 2253 } |
2256 #endif // HAS_NV12TOARGBROW_AVX2 | 2254 #endif // HAS_NV12TOARGBROW_AVX2 |
2257 | 2255 |
2258 | 2256 |
2259 #if defined(HAS_YUY2TOARGBROW_AVX2) | 2257 #if defined(HAS_YUY2TOARGBROW_AVX2) |
2260 // 16 pixels. | 2258 // 16 pixels. |
2261 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). | 2259 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). |
2262 void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf, | 2260 void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf, |
2263 uint8* dst_argb, | 2261 uint8* dst_argb, |
2264 struct YuvConstants* yuvconstants, | 2262 struct YuvConstants* yuvconstants, |
2265 int width) { | 2263 int width) { |
2266 | |
2267 asm volatile ( | 2264 asm volatile ( |
2268 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 2265 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
2269 LABELALIGN | 2266 LABELALIGN |
2270 "1: \n" | 2267 "1: \n" |
2271 READYUY2_AVX2 | 2268 READYUY2_AVX2 |
2272 YUVTORGB_AVX2(yuvconstants) | 2269 YUVTORGB_AVX2(yuvconstants) |
2273 STOREARGB_AVX2 | 2270 STOREARGB_AVX2 |
2274 "sub $0x10,%[width] \n" | 2271 "sub $0x10,%[width] \n" |
2275 "jg 1b \n" | 2272 "jg 1b \n" |
2276 "vzeroupper \n" | 2273 "vzeroupper \n" |
2277 : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf] | 2274 : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf] |
2278 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 2275 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
2279 [width]"+rm"(width) // %[width] | 2276 [width]"+rm"(width) // %[width] |
2280 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] | 2277 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] |
2281 [kShuffleYUY2Y]"m"(kShuffleYUY2Y), | 2278 [kShuffleYUY2Y]"m"(kShuffleYUY2Y), |
2282 [kShuffleYUY2UV]"m"(kShuffleYUY2UV) | 2279 [kShuffleYUY2UV]"m"(kShuffleYUY2UV) |
2283 // Does not use r14. | 2280 // Does not use r14. |
2284 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 2281 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
2285 ); | 2282 ); |
2286 } | 2283 } |
2287 #endif // HAS_YUY2TOARGBROW_AVX2 | 2284 #endif // HAS_YUY2TOARGBROW_AVX2 |
2288 | 2285 |
2289 #if defined(HAS_UYVYTOARGBROW_AVX2) | 2286 #if defined(HAS_UYVYTOARGBROW_AVX2) |
2290 // 16 pixels. | 2287 // 16 pixels. |
2291 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes). | 2288 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes). |
2292 void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf, | 2289 void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf, |
2293 uint8* dst_argb, | 2290 uint8* dst_argb, |
2294 struct YuvConstants* yuvconstants, | 2291 struct YuvConstants* yuvconstants, |
2295 int width) { | 2292 int width) { |
2296 | |
2297 asm volatile ( | 2293 asm volatile ( |
2298 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 2294 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
2299 LABELALIGN | 2295 LABELALIGN |
2300 "1: \n" | 2296 "1: \n" |
2301 READUYVY_AVX2 | 2297 READUYVY_AVX2 |
2302 YUVTORGB_AVX2(yuvconstants) | 2298 YUVTORGB_AVX2(yuvconstants) |
2303 STOREARGB_AVX2 | 2299 STOREARGB_AVX2 |
2304 "sub $0x10,%[width] \n" | 2300 "sub $0x10,%[width] \n" |
2305 "jg 1b \n" | 2301 "jg 1b \n" |
2306 "vzeroupper \n" | 2302 "vzeroupper \n" |
(...skipping 147 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2454 : "+r"(src), // %0 | 2450 : "+r"(src), // %0 |
2455 "+r"(dst), // %1 | 2451 "+r"(dst), // %1 |
2456 "+r"(temp_width) // %2 | 2452 "+r"(temp_width) // %2 |
2457 : "m"(kShuffleMirror) // %3 | 2453 : "m"(kShuffleMirror) // %3 |
2458 : "memory", "cc", NACL_R14 | 2454 : "memory", "cc", NACL_R14 |
2459 "xmm0", "xmm5" | 2455 "xmm0", "xmm5" |
2460 ); | 2456 ); |
2461 } | 2457 } |
2462 #endif // HAS_MIRRORROW_AVX2 | 2458 #endif // HAS_MIRRORROW_AVX2 |
2463 | 2459 |
2464 #ifdef HAS_MIRRORROW_SSE2 | |
2465 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { | |
2466 intptr_t temp_width = (intptr_t)(width); | |
2467 asm volatile ( | |
2468 LABELALIGN | |
2469 "1: \n" | |
2470 MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0 | |
2471 "movdqa %%xmm0,%%xmm1 \n" | |
2472 "psllw $0x8,%%xmm0 \n" | |
2473 "psrlw $0x8,%%xmm1 \n" | |
2474 "por %%xmm1,%%xmm0 \n" | |
2475 "pshuflw $0x1b,%%xmm0,%%xmm0 \n" | |
2476 "pshufhw $0x1b,%%xmm0,%%xmm0 \n" | |
2477 "pshufd $0x4e,%%xmm0,%%xmm0 \n" | |
2478 "movdqu %%xmm0," MEMACCESS(1) " \n" | |
2479 "lea " MEMLEA(0x10,1)",%1 \n" | |
2480 "sub $0x10,%2 \n" | |
2481 "jg 1b \n" | |
2482 : "+r"(src), // %0 | |
2483 "+r"(dst), // %1 | |
2484 "+r"(temp_width) // %2 | |
2485 : | |
2486 : "memory", "cc", NACL_R14 | |
2487 "xmm0", "xmm1" | |
2488 ); | |
2489 } | |
2490 #endif // HAS_MIRRORROW_SSE2 | |
2491 | |
2492 #ifdef HAS_MIRRORROW_UV_SSSE3 | 2460 #ifdef HAS_MIRRORROW_UV_SSSE3 |
2493 // Shuffle table for reversing the bytes of UV channels. | 2461 // Shuffle table for reversing the bytes of UV channels. |
2494 static uvec8 kShuffleMirrorUV = { | 2462 static uvec8 kShuffleMirrorUV = { |
2495 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u | 2463 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u |
2496 }; | 2464 }; |
2497 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, | 2465 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, |
2498 int width) { | 2466 int width) { |
2499 intptr_t temp_width = (intptr_t)(width); | 2467 intptr_t temp_width = (intptr_t)(width); |
2500 asm volatile ( | 2468 asm volatile ( |
2501 "movdqa %4,%%xmm1 \n" | 2469 "movdqa %4,%%xmm1 \n" |
(...skipping 824 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3326 "+r"(dst_u), // %1 | 3294 "+r"(dst_u), // %1 |
3327 "+r"(dst_v), // %2 | 3295 "+r"(dst_v), // %2 |
3328 "+r"(pix) // %3 | 3296 "+r"(pix) // %3 |
3329 : | 3297 : |
3330 : "memory", "cc", NACL_R14 | 3298 : "memory", "cc", NACL_R14 |
3331 "xmm0", "xmm1", "xmm5" | 3299 "xmm0", "xmm1", "xmm5" |
3332 ); | 3300 ); |
3333 } | 3301 } |
3334 #endif // HAS_YUY2TOYROW_AVX2 | 3302 #endif // HAS_YUY2TOYROW_AVX2 |
3335 | 3303 |
3336 #ifdef HAS_ARGBBLENDROW_SSE2 | |
3337 // Blend 4 pixels at a time, then 1 pixel at a time for the remainder. | |
3338 void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, | |
3339 uint8* dst_argb, int width) { | |
3340 asm volatile ( | |
3341 "pcmpeqb %%xmm7,%%xmm7 \n" | |
3342 "psrlw $0xf,%%xmm7 \n" | |
3343 "pcmpeqb %%xmm6,%%xmm6 \n" | |
3344 "psrlw $0x8,%%xmm6 \n" | |
3345 "pcmpeqb %%xmm5,%%xmm5 \n" | |
3346 "psllw $0x8,%%xmm5 \n" | |
3347 "pcmpeqb %%xmm4,%%xmm4 \n" | |
3348 "pslld $0x18,%%xmm4 \n" | |
3349 "sub $0x4,%3 \n" | |
3350 "jl 49f \n" | |
3351 | |
3352 // 4 pixel loop. | |
3353 LABELALIGN | |
3354 "41: \n" | |
3355 "movdqu " MEMACCESS(0) ",%%xmm3 \n" | |
3356 "lea " MEMLEA(0x10,0) ",%0 \n" | |
3357 "movdqa %%xmm3,%%xmm0 \n" | |
3358 "pxor %%xmm4,%%xmm3 \n" | |
3359 "movdqu " MEMACCESS(1) ",%%xmm2 \n" | |
3360 "psrlw $0x8,%%xmm3 \n" | |
3361 "pshufhw $0xf5,%%xmm3,%%xmm3 \n" | |
3362 "pshuflw $0xf5,%%xmm3,%%xmm3 \n" | |
3363 "pand %%xmm6,%%xmm2 \n" | |
3364 "paddw %%xmm7,%%xmm3 \n" | |
3365 "pmullw %%xmm3,%%xmm2 \n" | |
3366 "movdqu " MEMACCESS(1) ",%%xmm1 \n" | |
3367 "lea " MEMLEA(0x10,1) ",%1 \n" | |
3368 "psrlw $0x8,%%xmm1 \n" | |
3369 "por %%xmm4,%%xmm0 \n" | |
3370 "pmullw %%xmm3,%%xmm1 \n" | |
3371 "psrlw $0x8,%%xmm2 \n" | |
3372 "paddusb %%xmm2,%%xmm0 \n" | |
3373 "pand %%xmm5,%%xmm1 \n" | |
3374 "paddusb %%xmm1,%%xmm0 \n" | |
3375 "movdqu %%xmm0," MEMACCESS(2) " \n" | |
3376 "lea " MEMLEA(0x10,2) ",%2 \n" | |
3377 "sub $0x4,%3 \n" | |
3378 "jge 41b \n" | |
3379 | |
3380 "49: \n" | |
3381 "add $0x3,%3 \n" | |
3382 "jl 99f \n" | |
3383 | |
3384 // 1 pixel loop. | |
3385 "91: \n" | |
3386 "movd " MEMACCESS(0) ",%%xmm3 \n" | |
3387 "lea " MEMLEA(0x4,0) ",%0 \n" | |
3388 "movdqa %%xmm3,%%xmm0 \n" | |
3389 "pxor %%xmm4,%%xmm3 \n" | |
3390 "movd " MEMACCESS(1) ",%%xmm2 \n" | |
3391 "psrlw $0x8,%%xmm3 \n" | |
3392 "pshufhw $0xf5,%%xmm3,%%xmm3 \n" | |
3393 "pshuflw $0xf5,%%xmm3,%%xmm3 \n" | |
3394 "pand %%xmm6,%%xmm2 \n" | |
3395 "paddw %%xmm7,%%xmm3 \n" | |
3396 "pmullw %%xmm3,%%xmm2 \n" | |
3397 "movd " MEMACCESS(1) ",%%xmm1 \n" | |
3398 "lea " MEMLEA(0x4,1) ",%1 \n" | |
3399 "psrlw $0x8,%%xmm1 \n" | |
3400 "por %%xmm4,%%xmm0 \n" | |
3401 "pmullw %%xmm3,%%xmm1 \n" | |
3402 "psrlw $0x8,%%xmm2 \n" | |
3403 "paddusb %%xmm2,%%xmm0 \n" | |
3404 "pand %%xmm5,%%xmm1 \n" | |
3405 "paddusb %%xmm1,%%xmm0 \n" | |
3406 "movd %%xmm0," MEMACCESS(2) " \n" | |
3407 "lea " MEMLEA(0x4,2) ",%2 \n" | |
3408 "sub $0x1,%3 \n" | |
3409 "jge 91b \n" | |
3410 "99: \n" | |
3411 : "+r"(src_argb0), // %0 | |
3412 "+r"(src_argb1), // %1 | |
3413 "+r"(dst_argb), // %2 | |
3414 "+r"(width) // %3 | |
3415 : | |
3416 : "memory", "cc" | |
3417 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | |
3418 ); | |
3419 } | |
3420 #endif // HAS_ARGBBLENDROW_SSE2 | |
3421 | |
3422 #ifdef HAS_ARGBBLENDROW_SSSE3 | 3304 #ifdef HAS_ARGBBLENDROW_SSSE3 |
3423 // Shuffle table for isolating alpha. | 3305 // Shuffle table for isolating alpha. |
3424 static uvec8 kShuffleAlpha = { | 3306 static uvec8 kShuffleAlpha = { |
3425 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, | 3307 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, |
3426 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 | 3308 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 |
3427 }; | 3309 }; |
3428 | 3310 |
3429 // Blend 8 pixels at a time | 3311 // Blend 8 pixels at a time |
3430 // Shuffle table for reversing the bytes. | |
3431 | |
3432 // Same as SSE2, but replaces | |
3433 // psrlw xmm3, 8 // alpha | |
3434 // pshufhw xmm3, xmm3,0F5h // 8 alpha words | |
3435 // pshuflw xmm3, xmm3,0F5h | |
3436 // with.. | |
3437 // pshufb xmm3, kShuffleAlpha // alpha | |
3438 | |
3439 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, | 3312 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, |
3440 uint8* dst_argb, int width) { | 3313 uint8* dst_argb, int width) { |
3441 asm volatile ( | 3314 asm volatile ( |
3442 "pcmpeqb %%xmm7,%%xmm7 \n" | 3315 "pcmpeqb %%xmm7,%%xmm7 \n" |
3443 "psrlw $0xf,%%xmm7 \n" | 3316 "psrlw $0xf,%%xmm7 \n" |
3444 "pcmpeqb %%xmm6,%%xmm6 \n" | 3317 "pcmpeqb %%xmm6,%%xmm6 \n" |
3445 "psrlw $0x8,%%xmm6 \n" | 3318 "psrlw $0x8,%%xmm6 \n" |
3446 "pcmpeqb %%xmm5,%%xmm5 \n" | 3319 "pcmpeqb %%xmm5,%%xmm5 \n" |
3447 "psllw $0x8,%%xmm5 \n" | 3320 "psllw $0x8,%%xmm5 \n" |
3448 "pcmpeqb %%xmm4,%%xmm4 \n" | 3321 "pcmpeqb %%xmm4,%%xmm4 \n" |
(...skipping 60 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3509 "+r"(src_argb1), // %1 | 3382 "+r"(src_argb1), // %1 |
3510 "+r"(dst_argb), // %2 | 3383 "+r"(dst_argb), // %2 |
3511 "+r"(width) // %3 | 3384 "+r"(width) // %3 |
3512 : "m"(kShuffleAlpha) // %4 | 3385 : "m"(kShuffleAlpha) // %4 |
3513 : "memory", "cc" | 3386 : "memory", "cc" |
3514 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | 3387 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" |
3515 ); | 3388 ); |
3516 } | 3389 } |
3517 #endif // HAS_ARGBBLENDROW_SSSE3 | 3390 #endif // HAS_ARGBBLENDROW_SSSE3 |
3518 | 3391 |
3519 #ifdef HAS_ARGBATTENUATEROW_SSE2 | |
3520 // Attenuate 4 pixels at a time. | |
3521 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { | |
3522 asm volatile ( | |
3523 "pcmpeqb %%xmm4,%%xmm4 \n" | |
3524 "pslld $0x18,%%xmm4 \n" | |
3525 "pcmpeqb %%xmm5,%%xmm5 \n" | |
3526 "psrld $0x8,%%xmm5 \n" | |
3527 | |
3528 // 4 pixel loop. | |
3529 LABELALIGN | |
3530 "1: \n" | |
3531 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
3532 "punpcklbw %%xmm0,%%xmm0 \n" | |
3533 "pshufhw $0xff,%%xmm0,%%xmm2 \n" | |
3534 "pshuflw $0xff,%%xmm2,%%xmm2 \n" | |
3535 "pmulhuw %%xmm2,%%xmm0 \n" | |
3536 "movdqu " MEMACCESS(0) ",%%xmm1 \n" | |
3537 "punpckhbw %%xmm1,%%xmm1 \n" | |
3538 "pshufhw $0xff,%%xmm1,%%xmm2 \n" | |
3539 "pshuflw $0xff,%%xmm2,%%xmm2 \n" | |
3540 "pmulhuw %%xmm2,%%xmm1 \n" | |
3541 "movdqu " MEMACCESS(0) ",%%xmm2 \n" | |
3542 "lea " MEMLEA(0x10,0) ",%0 \n" | |
3543 "psrlw $0x8,%%xmm0 \n" | |
3544 "pand %%xmm4,%%xmm2 \n" | |
3545 "psrlw $0x8,%%xmm1 \n" | |
3546 "packuswb %%xmm1,%%xmm0 \n" | |
3547 "pand %%xmm5,%%xmm0 \n" | |
3548 "por %%xmm2,%%xmm0 \n" | |
3549 "movdqu %%xmm0," MEMACCESS(1) " \n" | |
3550 "lea " MEMLEA(0x10,1) ",%1 \n" | |
3551 "sub $0x4,%2 \n" | |
3552 "jg 1b \n" | |
3553 : "+r"(src_argb), // %0 | |
3554 "+r"(dst_argb), // %1 | |
3555 "+r"(width) // %2 | |
3556 : | |
3557 : "memory", "cc" | |
3558 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | |
3559 ); | |
3560 } | |
3561 #endif // HAS_ARGBATTENUATEROW_SSE2 | |
3562 | |
3563 #ifdef HAS_ARGBATTENUATEROW_SSSE3 | 3392 #ifdef HAS_ARGBATTENUATEROW_SSSE3 |
3564 // Shuffle table duplicating alpha | 3393 // Shuffle table duplicating alpha |
3565 static uvec8 kShuffleAlpha0 = { | 3394 static uvec8 kShuffleAlpha0 = { |
3566 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u | 3395 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u |
3567 }; | 3396 }; |
3568 static uvec8 kShuffleAlpha1 = { | 3397 static uvec8 kShuffleAlpha1 = { |
3569 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, | 3398 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, |
3570 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u | 3399 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u |
3571 }; | 3400 }; |
3572 // Attenuate 4 pixels at a time. | 3401 // Attenuate 4 pixels at a time. |
(...skipping 2010 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
5583 ); | 5412 ); |
5584 } | 5413 } |
5585 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 5414 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
5586 | 5415 |
5587 #endif // defined(__x86_64__) || defined(__i386__) | 5416 #endif // defined(__x86_64__) || defined(__i386__) |
5588 | 5417 |
5589 #ifdef __cplusplus | 5418 #ifdef __cplusplus |
5590 } // extern "C" | 5419 } // extern "C" |
5591 } // namespace libyuv | 5420 } // namespace libyuv |
5592 #endif | 5421 #endif |
OLD | NEW |