| OLD | NEW |
| 1 // VERSION 2 | 1 // VERSION 2 |
| 2 /* | 2 /* |
| 3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
| 4 * | 4 * |
| 5 * Use of this source code is governed by a BSD-style license | 5 * Use of this source code is governed by a BSD-style license |
| 6 * that can be found in the LICENSE file in the root of the source | 6 * that can be found in the LICENSE file in the root of the source |
| 7 * tree. An additional intellectual property rights grant can be found | 7 * tree. An additional intellectual property rights grant can be found |
| 8 * in the file PATENTS. All contributing project authors may | 8 * in the file PATENTS. All contributing project authors may |
| 9 * be found in the AUTHORS file in the root of the source tree. | 9 * be found in the AUTHORS file in the root of the source tree. |
| 10 */ | 10 */ |
| (...skipping 1661 matching lines...) | |
| 1672 "jg 1b \n" | 1672 "jg 1b \n" |
| 1673 : [y_buf]"+r"(y_buf), // %[y_buf] | 1673 : [y_buf]"+r"(y_buf), // %[y_buf] |
| 1674 [u_buf]"+r"(u_buf), // %[u_buf] | 1674 [u_buf]"+r"(u_buf), // %[u_buf] |
| 1675 [v_buf]"+r"(v_buf), // %[v_buf] | 1675 [v_buf]"+r"(v_buf), // %[v_buf] |
| 1676 [a_buf]"+r"(a_buf), // %[a_buf] | 1676 [a_buf]"+r"(a_buf), // %[a_buf] |
| 1677 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 1677 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
| 1678 #if defined(__i386__) && defined(__pic__) | 1678 #if defined(__i386__) && defined(__pic__) |
| 1679 [width]"+m"(width) // %[width] | 1679 [width]"+m"(width) // %[width] |
| 1680 #else | 1680 #else |
| 1681 [width]"+rm"(width) // %[width] | 1681 [width]"+rm"(width) // %[width] |
| 1682 #endif | 1682 #endif |
| 1683 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | 1683 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
| 1684 : "memory", "cc", NACL_R14 | 1684 : "memory", "cc", NACL_R14 |
| 1685 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 1685 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
| 1686 ); | 1686 ); |
| 1687 } | 1687 } |
| 1688 | 1688 |
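A note on the `#if defined(__i386__) && defined(__pic__)` pattern repeated in these operand lists: 32-bit PIC code reserves %ebx for the GOT pointer, so letting GCC place `width` in a register can exhaust the register file; the `"+m"` constraint forces it to memory instead of the usual `"+rm"`. A minimal, illustrative sketch of the constraint forms (x86 GCC/Clang only; not libyuv code):

```cpp
#include <stdio.h>

int main() {
  int width = 16;
  // "+rm" lets the compiler choose a register or memory for %0.
  // The row functions above narrow this to "+m" under i386 PIC so the
  // operand is guaranteed to be a memory slot, freeing one more register.
  asm volatile(
      "subl $0x1,%0                    \n"  // same sub/jg idiom as the loops
      : "+rm"(width));
  printf("width = %d\n", width);  // prints 15
  return 0;
}
```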
| 1689 void OMITFP I422AlphaToABGRRow_SSSE3(const uint8* y_buf, | 1689 void OMITFP I422AlphaToABGRRow_SSSE3(const uint8* y_buf, |
| 1690 const uint8* u_buf, | 1690 const uint8* u_buf, |
| 1691 const uint8* v_buf, | 1691 const uint8* v_buf, |
| 1692 const uint8* a_buf, | 1692 const uint8* a_buf, |
| (...skipping 11 matching lines...) | |
| 1704 "jg 1b \n" | 1704 "jg 1b \n" |
| 1705 : [y_buf]"+r"(y_buf), // %[y_buf] | 1705 : [y_buf]"+r"(y_buf), // %[y_buf] |
| 1706 [u_buf]"+r"(u_buf), // %[u_buf] | 1706 [u_buf]"+r"(u_buf), // %[u_buf] |
| 1707 [v_buf]"+r"(v_buf), // %[v_buf] | 1707 [v_buf]"+r"(v_buf), // %[v_buf] |
| 1708 [a_buf]"+r"(a_buf), // %[a_buf] | 1708 [a_buf]"+r"(a_buf), // %[a_buf] |
| 1709 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] | 1709 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] |
| 1710 #if defined(__i386__) && defined(__pic__) | 1710 #if defined(__i386__) && defined(__pic__) |
| 1711 [width]"+m"(width) // %[width] | 1711 [width]"+m"(width) // %[width] |
| 1712 #else | 1712 #else |
| 1713 [width]"+rm"(width) // %[width] | 1713 [width]"+rm"(width) // %[width] |
| 1714 #endif | 1714 #endif |
| 1715 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | 1715 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
| 1716 : "memory", "cc", NACL_R14 | 1716 : "memory", "cc", NACL_R14 |
| 1717 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 1717 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
| 1718 ); | 1718 ); |
| 1719 } | 1719 } |
| 1720 | 1720 |
| 1721 void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, | 1721 void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, |
| 1722 const uint8* u_buf, | 1722 const uint8* u_buf, |
| 1723 const uint8* v_buf, | 1723 const uint8* v_buf, |
| 1724 uint8* dst_argb, | 1724 uint8* dst_argb, |
| (...skipping 255 matching lines...) | |
| 1980 | 1980 |
| 1981 // Store 16 ARGB values. | 1981 // Store 16 ARGB values. |
| 1982 #define STOREARGB_AVX2 \ | 1982 #define STOREARGB_AVX2 \ |
| 1983 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ | 1983 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ |
| 1984 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ | 1984 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ |
| 1985 "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \ | 1985 "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \ |
| 1986 "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ | 1986 "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ |
| 1987 "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \ | 1987 "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \ |
| 1988 "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \ | 1988 "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \ |
| 1989 "vmovdqu %%ymm1," MEMACCESS([dst_argb]) " \n" \ | 1989 "vmovdqu %%ymm1," MEMACCESS([dst_argb]) " \n" \ |
| 1990 "vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) " \n" \ | 1990 "vmovdqu %%ymm0," MEMACCESS2(0x20, [dst_argb]) " \n" \ |
| 1991 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" | 1991 "lea " MEMLEA(0x40, [dst_argb]) ", %[dst_argb] \n" |
| 1992 | 1992 |
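For readers following the interleave: after YUVTORGB_AVX2 the planes sit in ymm0 = B, ymm1 = G, ymm2 = R, with ymm5 holding 0xff alpha, and the two unpack/permute pairs weave them into byte order B,G,R,A (libyuv ARGB). A scalar restatement of what STOREARGB_AVX2 writes, offered as a sketch rather than the actual C fallback:

```cpp
#include <stdint.h>

// Interleave separate B, G, R planes plus constant alpha into ARGB
// (memory order B,G,R,A), 'width' pixels. The vpermq $0xd8 steps in the
// macro exist only to undo the lane-split of the 256-bit unpacks.
static void StoreARGB_Scalar(const uint8_t* b, const uint8_t* g,
                             const uint8_t* r, uint8_t* dst_argb,
                             int width) {
  for (int i = 0; i < width; ++i) {
    dst_argb[4 * i + 0] = b[i];   // vpunpcklbw ymm1,ymm0: B,G byte pairs
    dst_argb[4 * i + 1] = g[i];
    dst_argb[4 * i + 2] = r[i];   // vpunpcklbw ymm5,ymm2: R,A byte pairs
    dst_argb[4 * i + 3] = 0xff;   // ymm5 = all ones from vpcmpeqb
  }
}
```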
| 1993 // Store 16 ABGR values. | 1993 // Store 16 ABGR values. |
| 1994 #define STOREABGR_AVX2 \ | 1994 #define STOREABGR_AVX2 \ |
| 1995 "vpunpcklbw %%ymm1,%%ymm2,%%ymm1 \n" \ | 1995 "vpunpcklbw %%ymm1,%%ymm2,%%ymm1 \n" \ |
| 1996 "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ | 1996 "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ |
| 1997 "vpunpcklbw %%ymm5,%%ymm0,%%ymm2 \n" \ | 1997 "vpunpcklbw %%ymm5,%%ymm0,%%ymm2 \n" \ |
| 1998 "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ | 1998 "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ |
| 1999 "vpunpcklwd %%ymm2,%%ymm1,%%ymm0 \n" \ | 1999 "vpunpcklwd %%ymm2,%%ymm1,%%ymm0 \n" \ |
| 2000 "vpunpckhwd %%ymm2,%%ymm1,%%ymm1 \n" \ | 2000 "vpunpckhwd %%ymm2,%%ymm1,%%ymm1 \n" \ |
| 2001 "vmovdqu %%ymm0," MEMACCESS([dst_abgr]) " \n" \ | 2001 "vmovdqu %%ymm0," MEMACCESS([dst_abgr]) " \n" \ |
| 2002 "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_abgr]) " \n" \ | 2002 "vmovdqu %%ymm1," MEMACCESS2(0x20, [dst_abgr]) " \n" \ |
| 2003 "lea " MEMLEA(0x40,[dst_abgr]) ",%[dst_abgr] \n" | 2003 "lea " MEMLEA(0x40, [dst_abgr]) ", %[dst_abgr] \n" |
| 2004 | 2004 |
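All of the row functions below funnel through YUVTORGB_AVX2, which applies a fixed-point YCbCr-to-RGB matrix taken from `struct YuvConstants`. As a rough model only (the exact constants and bias handling live in the YuvConstants tables, not in this file), a BT.601 studio-swing conversion in 8.8 fixed point looks like:

```cpp
#include <stdint.h>

static uint8_t Clamp255(int v) { return v < 0 ? 0 : (v > 255 ? 255 : v); }

// Illustrative BT.601 constants (1.164, 1.596, 0.391, 0.813, 2.018
// scaled by 256); the SIMD kernels load their equivalents from
// yuvconstants rather than hard-coding them.
static void YuvPixel_Sketch(uint8_t y, uint8_t u, uint8_t v,
                            uint8_t* b, uint8_t* g, uint8_t* r) {
  int y1 = (y - 16) * 298;  // luma scale
  *b = Clamp255((y1 + 516 * (u - 128) + 128) >> 8);
  *g = Clamp255((y1 - 100 * (u - 128) - 208 * (v - 128) + 128) >> 8);
  *r = Clamp255((y1 + 409 * (v - 128) + 128) >> 8);
}
```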
| 2005 #if defined(HAS_I422TOBGRAROW_AVX2) | 2005 #if defined(HAS_I422TOBGRAROW_AVX2) |
| 2006 // 16 pixels | 2006 // 16 pixels |
| 2007 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). | 2007 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). |
| 2008 void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf, | 2008 void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf, |
| 2009 const uint8* u_buf, | 2009 const uint8* u_buf, |
| 2010 const uint8* v_buf, | 2010 const uint8* v_buf, |
| 2011 uint8* dst_bgra, | 2011 uint8* dst_bgra, |
| 2012 struct YuvConstants* yuvconstants, | 2012 struct YuvConstants* yuvconstants, |
| 2013 int width) { | 2013 int width) { |
| 2014 asm volatile ( | 2014 asm volatile ( |
| 2015 "sub %[u_buf],%[v_buf] \n" | 2015 "sub %[u_buf],%[v_buf] \n" |
| 2016 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 2016 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
| 2017 LABELALIGN | 2017 LABELALIGN |
| 2018 "1: \n" | 2018 "1: \n" |
| 2019 READYUV422_AVX2 | 2019 READYUV422_AVX2 |
| 2020 YUVTORGB_AVX2(yuvconstants) | 2020 YUVTORGB_AVX2(yuvconstants) |
| 2021 | 2021 |
| 2022 // Step 3: Weave into BGRA | 2022 // Step 3: Weave into BGRA |
| 2023 "vpunpcklbw %%ymm0,%%ymm1,%%ymm1 \n" // GB | 2023 "vpunpcklbw %%ymm0,%%ymm1,%%ymm1 \n" // GB |
| 2024 "vpermq $0xd8,%%ymm1,%%ymm1 \n" | 2024 "vpermq $0xd8,%%ymm1,%%ymm1 \n" |
| 2025 "vpunpcklbw %%ymm2,%%ymm5,%%ymm2 \n" // AR | 2025 "vpunpcklbw %%ymm2,%%ymm5,%%ymm2 \n" // AR |
| 2026 "vpermq $0xd8,%%ymm2,%%ymm2 \n" | 2026 "vpermq $0xd8,%%ymm2,%%ymm2 \n" |
| 2027 "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" // ARGB first 8 pixels | 2027 "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" // ARGB first 8 pixels |
| 2028 "vpunpckhwd %%ymm1,%%ymm2,%%ymm2 \n" // ARGB next 8 pixels | 2028 "vpunpckhwd %%ymm1,%%ymm2,%%ymm2 \n" // ARGB next 8 pixels |
| 2029 | |
| 2030 "vmovdqu %%ymm0," MEMACCESS([dst_bgra]) "\n" | 2029 "vmovdqu %%ymm0," MEMACCESS([dst_bgra]) "\n" |
| 2031 "vmovdqu %%ymm2," MEMACCESS2(0x20,[dst_bgra]) "\n" | 2030 "vmovdqu %%ymm2," MEMACCESS2(0x20,[dst_bgra]) "\n" |
| 2032 "lea " MEMLEA(0x40,[dst_bgra]) ",%[dst_bgra] \n" | 2031 "lea " MEMLEA(0x40,[dst_bgra]) ",%[dst_bgra] \n" |
| 2033 "sub $0x10,%[width] \n" | 2032 "sub $0x10,%[width] \n" |
| 2034 "jg 1b \n" | 2033 "jg 1b \n" |
| 2035 "vzeroupper \n" | 2034 "vzeroupper \n" |
| 2036 : [y_buf]"+r"(y_buf), // %[y_buf] | 2035 : [y_buf]"+r"(y_buf), // %[y_buf] |
| 2037 [u_buf]"+r"(u_buf), // %[u_buf] | 2036 [u_buf]"+r"(u_buf), // %[u_buf] |
| 2038 [v_buf]"+r"(v_buf), // %[v_buf] | 2037 [v_buf]"+r"(v_buf), // %[v_buf] |
| 2039 [dst_bgra]"+r"(dst_bgra), // %[dst_bgra] | 2038 [dst_bgra]"+r"(dst_bgra), // %[dst_bgra] |
| (...skipping 59 matching lines...) | |
| 2099 "vzeroupper \n" | 2098 "vzeroupper \n" |
| 2100 : [y_buf]"+r"(y_buf), // %[y_buf] | 2099 : [y_buf]"+r"(y_buf), // %[y_buf] |
| 2101 [u_buf]"+r"(u_buf), // %[u_buf] | 2100 [u_buf]"+r"(u_buf), // %[u_buf] |
| 2102 [v_buf]"+r"(v_buf), // %[v_buf] | 2101 [v_buf]"+r"(v_buf), // %[v_buf] |
| 2103 [a_buf]"+r"(a_buf), // %[a_buf] | 2102 [a_buf]"+r"(a_buf), // %[a_buf] |
| 2104 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 2103 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
| 2105 #if defined(__i386__) && defined(__pic__) | 2104 #if defined(__i386__) && defined(__pic__) |
| 2106 [width]"+m"(width) // %[width] | 2105 [width]"+m"(width) // %[width] |
| 2107 #else | 2106 #else |
| 2108 [width]"+rm"(width) // %[width] | 2107 [width]"+rm"(width) // %[width] |
| 2109 #endif | 2108 #endif |
| 2110 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | 2109 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
| 2111 : "memory", "cc", NACL_R14 | 2110 : "memory", "cc", NACL_R14 |
| 2112 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 2111 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
| 2113 ); | 2112 ); |
| 2114 } | 2113 } |
| 2115 #endif // HAS_I422ALPHATOARGBROW_AVX2 | 2114 #endif // HAS_I422ALPHATOARGBROW_AVX2 |
| 2116 | 2115 |
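The "8 UV values upsampled to 16 UV" phrasing in these headers refers to 4:2:2 chroma: one U,V pair is shared by two Y samples, and the READYUV422 macros duplicate each chroma byte with an unpack. A plain-C sketch of that duplication (illustrative helper, not a libyuv entry point):

```cpp
#include <stdint.h>

// Nearest-neighbor 4:2:2 chroma upsample: 8 U,V pairs -> 16 samples,
// matching the punpcklwd-style duplication in the READYUV422 macros.
static void UpsampleUV422_Sketch(const uint8_t* u, const uint8_t* v,
                                 uint8_t* u16, uint8_t* v16, int pairs) {
  for (int i = 0; i < pairs; ++i) {
    u16[2 * i + 0] = u16[2 * i + 1] = u[i];
    v16[2 * i + 0] = v16[2 * i + 1] = v[i];
  }
}
```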
| 2117 #if defined(HAS_I422ALPHATOABGRROW_AVX2) | 2116 #if defined(HAS_I422ALPHATOABGRROW_AVX2) |
| 2118 // 16 pixels | 2117 // 16 pixels |
| 2119 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ABGR. | 2118 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ABGR. |
| (...skipping 16 matching lines...) | |
| 2136 "vzeroupper \n" | 2135 "vzeroupper \n" |
| 2137 : [y_buf]"+r"(y_buf), // %[y_buf] | 2136 : [y_buf]"+r"(y_buf), // %[y_buf] |
| 2138 [u_buf]"+r"(u_buf), // %[u_buf] | 2137 [u_buf]"+r"(u_buf), // %[u_buf] |
| 2139 [v_buf]"+r"(v_buf), // %[v_buf] | 2138 [v_buf]"+r"(v_buf), // %[v_buf] |
| 2140 [a_buf]"+r"(a_buf), // %[a_buf] | 2139 [a_buf]"+r"(a_buf), // %[a_buf] |
| 2141 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] | 2140 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] |
| 2142 #if defined(__i386__) && defined(__pic__) | 2141 #if defined(__i386__) && defined(__pic__) |
| 2143 [width]"+m"(width) // %[width] | 2142 [width]"+m"(width) // %[width] |
| 2144 #else | 2143 #else |
| 2145 [width]"+rm"(width) // %[width] | 2144 [width]"+rm"(width) // %[width] |
| 2146 #endif | 2145 #endif |
| 2147 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | 2146 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
| 2148 : "memory", "cc", NACL_R14 | 2147 : "memory", "cc", NACL_R14 |
| 2149 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 2148 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
| 2150 ); | 2149 ); |
| 2151 } | 2150 } |
| 2152 #endif // HAS_I422ALPHATOABGRROW_AVX2 | 2151 #endif // HAS_I422ALPHATOABGRROW_AVX2 |
| 2153 | 2152 |
| 2154 #if defined(HAS_I422TOABGRROW_AVX2) | 2153 #if defined(HAS_I422TOABGRROW_AVX2) |
| 2155 // 16 pixels | 2154 // 16 pixels |
| 2156 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). | 2155 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). |
| (...skipping 69 matching lines...) | |
| 2226 #endif // HAS_I422TORGBAROW_AVX2 | 2225 #endif // HAS_I422TORGBAROW_AVX2 |
| 2227 | 2226 |
| 2228 #if defined(HAS_NV12TOARGBROW_AVX2) | 2227 #if defined(HAS_NV12TOARGBROW_AVX2) |
| 2229 // 16 pixels. | 2228 // 16 pixels. |
| 2230 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). | 2229 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
| 2231 void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf, | 2230 void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf, |
| 2232 const uint8* uv_buf, | 2231 const uint8* uv_buf, |
| 2233 uint8* dst_argb, | 2232 uint8* dst_argb, |
| 2234 struct YuvConstants* yuvconstants, | 2233 struct YuvConstants* yuvconstants, |
| 2235 int width) { | 2234 int width) { |
| 2236 | |
| 2237 asm volatile ( | 2235 asm volatile ( |
| 2238 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 2236 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
| 2239 LABELALIGN | 2237 LABELALIGN |
| 2240 "1: \n" | 2238 "1: \n" |
| 2241 READNV12_AVX2 | 2239 READNV12_AVX2 |
| 2242 YUVTORGB_AVX2(yuvconstants) | 2240 YUVTORGB_AVX2(yuvconstants) |
| 2243 STOREARGB_AVX2 | 2241 STOREARGB_AVX2 |
| 2244 "sub $0x10,%[width] \n" | 2242 "sub $0x10,%[width] \n" |
| 2245 "jg 1b \n" | 2243 "jg 1b \n" |
| 2246 "vzeroupper \n" | 2244 "vzeroupper \n" |
| 2247 : [y_buf]"+r"(y_buf), // %[y_buf] | 2245 : [y_buf]"+r"(y_buf), // %[y_buf] |
| 2248 [uv_buf]"+r"(uv_buf), // %[uv_buf] | 2246 [uv_buf]"+r"(uv_buf), // %[uv_buf] |
| 2249 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 2247 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
| 2250 [width]"+rm"(width) // %[width] | 2248 [width]"+rm"(width) // %[width] |
| 2251 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | 2249 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
| 2252 // Does not use r14. | 2250 // Does not use r14. |
| 2253 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 2251 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
| 2254 ); | 2252 ); |
| 2255 } | 2253 } |
| 2256 #endif // HAS_NV12TOARGBROW_AVX2 | 2254 #endif // HAS_NV12TOARGBROW_AVX2 |
| 2257 | 2255 |
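NV12 differs from the planar I422 readers above only in chroma layout: a single plane of interleaved U,V bytes at half horizontal resolution. A small sketch of the layout READNV12_AVX2 assumes (hypothetical helper name, not a libyuv function):

```cpp
#include <stdint.h>

// NV12: Y plane followed by one UV plane of interleaved bytes.
// U comes first in each pair; V-first would be NV21.
static void SplitNV12UV_Sketch(const uint8_t* uv_buf, uint8_t* u,
                               uint8_t* v, int pairs) {
  for (int i = 0; i < pairs; ++i) {
    u[i] = uv_buf[2 * i + 0];
    v[i] = uv_buf[2 * i + 1];
  }
}
```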
| 2258 | 2256 |
| 2259 #if defined(HAS_YUY2TOARGBROW_AVX2) | 2257 #if defined(HAS_YUY2TOARGBROW_AVX2) |
| 2260 // 16 pixels. | 2258 // 16 pixels. |
| 2261 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). | 2259 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). |
| 2262 void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf, | 2260 void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf, |
| 2263 uint8* dst_argb, | 2261 uint8* dst_argb, |
| 2264 struct YuvConstants* yuvconstants, | 2262 struct YuvConstants* yuvconstants, |
| 2265 int width) { | 2263 int width) { |
| 2266 | |
| 2267 asm volatile ( | 2264 asm volatile ( |
| 2268 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 2265 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
| 2269 LABELALIGN | 2266 LABELALIGN |
| 2270 "1: \n" | 2267 "1: \n" |
| 2271 READYUY2_AVX2 | 2268 READYUY2_AVX2 |
| 2272 YUVTORGB_AVX2(yuvconstants) | 2269 YUVTORGB_AVX2(yuvconstants) |
| 2273 STOREARGB_AVX2 | 2270 STOREARGB_AVX2 |
| 2274 "sub $0x10,%[width] \n" | 2271 "sub $0x10,%[width] \n" |
| 2275 "jg 1b \n" | 2272 "jg 1b \n" |
| 2276 "vzeroupper \n" | 2273 "vzeroupper \n" |
| 2277 : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf] | 2274 : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf] |
| 2278 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 2275 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
| 2279 [width]"+rm"(width) // %[width] | 2276 [width]"+rm"(width) // %[width] |
| 2280 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] | 2277 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] |
| 2281 [kShuffleYUY2Y]"m"(kShuffleYUY2Y), | 2278 [kShuffleYUY2Y]"m"(kShuffleYUY2Y), |
| 2282 [kShuffleYUY2UV]"m"(kShuffleYUY2UV) | 2279 [kShuffleYUY2UV]"m"(kShuffleYUY2UV) |
| 2283 // Does not use r14. | 2280 // Does not use r14. |
| 2284 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 2281 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
| 2285 ); | 2282 ); |
| 2286 } | 2283 } |
| 2287 #endif // HAS_YUY2TOARGBROW_AVX2 | 2284 #endif // HAS_YUY2TOARGBROW_AVX2 |
| 2288 | 2285 |
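YUY2 is fully packed: each 4-byte group Y0,U0,Y1,V0 carries two pixels, which is why the function binds the kShuffleYUY2Y and kShuffleYUY2UV tables — READYUY2_AVX2 uses vpshufb with them to peel Y and UV apart. The equivalent scalar split, as an illustrative sketch:

```cpp
#include <stdint.h>

// Deinterleave YUY2 (Y0 U0 Y1 V0 ...) into Y, U, V. 'pairs' counts
// 2-pixel groups, so 16 pixels = 8 pairs = 32 input bytes.
static void SplitYUY2_Sketch(const uint8_t* yuy2, uint8_t* y, uint8_t* u,
                             uint8_t* v, int pairs) {
  for (int i = 0; i < pairs; ++i) {
    y[2 * i + 0] = yuy2[4 * i + 0];
    u[i]         = yuy2[4 * i + 1];
    y[2 * i + 1] = yuy2[4 * i + 2];
    v[i]         = yuy2[4 * i + 3];
  }
}
```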
| 2289 #if defined(HAS_UYVYTOARGBROW_AVX2) | 2286 #if defined(HAS_UYVYTOARGBROW_AVX2) |
| 2290 // 16 pixels. | 2287 // 16 pixels. |
| 2291 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes). | 2288 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes). |
| 2292 void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf, | 2289 void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf, |
| 2293 uint8* dst_argb, | 2290 uint8* dst_argb, |
| 2294 struct YuvConstants* yuvconstants, | 2291 struct YuvConstants* yuvconstants, |
| 2295 int width) { | 2292 int width) { |
| 2296 | |
| 2297 asm volatile ( | 2293 asm volatile ( |
| 2298 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 2294 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
| 2299 LABELALIGN | 2295 LABELALIGN |
| 2300 "1: \n" | 2296 "1: \n" |
| 2301 READUYVY_AVX2 | 2297 READUYVY_AVX2 |
| 2302 YUVTORGB_AVX2(yuvconstants) | 2298 YUVTORGB_AVX2(yuvconstants) |
| 2303 STOREARGB_AVX2 | 2299 STOREARGB_AVX2 |
| 2304 "sub $0x10,%[width] \n" | 2300 "sub $0x10,%[width] \n" |
| 2305 "jg 1b \n" | 2301 "jg 1b \n" |
| 2306 "vzeroupper \n" | 2302 "vzeroupper \n" |
| (...skipping 147 matching lines...) | |
| 2454 : "+r"(src), // %0 | 2450 : "+r"(src), // %0 |
| 2455 "+r"(dst), // %1 | 2451 "+r"(dst), // %1 |
| 2456 "+r"(temp_width) // %2 | 2452 "+r"(temp_width) // %2 |
| 2457 : "m"(kShuffleMirror) // %3 | 2453 : "m"(kShuffleMirror) // %3 |
| 2458 : "memory", "cc", NACL_R14 | 2454 : "memory", "cc", NACL_R14 |
| 2459 "xmm0", "xmm5" | 2455 "xmm0", "xmm5" |
| 2460 ); | 2456 ); |
| 2461 } | 2457 } |
| 2462 #endif // HAS_MIRRORROW_AVX2 | 2458 #endif // HAS_MIRRORROW_AVX2 |
| 2463 | 2459 |
| 2464 #ifdef HAS_MIRRORROW_SSE2 | |
| 2465 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { | |
| 2466 intptr_t temp_width = (intptr_t)(width); | |
| 2467 asm volatile ( | |
| 2468 LABELALIGN | |
| 2469 "1: \n" | |
| 2470 MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0 | |
| 2471 "movdqa %%xmm0,%%xmm1 \n" | |
| 2472 "psllw $0x8,%%xmm0 \n" | |
| 2473 "psrlw $0x8,%%xmm1 \n" | |
| 2474 "por %%xmm1,%%xmm0 \n" | |
| 2475 "pshuflw $0x1b,%%xmm0,%%xmm0 \n" | |
| 2476 "pshufhw $0x1b,%%xmm0,%%xmm0 \n" | |
| 2477 "pshufd $0x4e,%%xmm0,%%xmm0 \n" | |
| 2478 "movdqu %%xmm0," MEMACCESS(1) " \n" | |
| 2479 "lea " MEMLEA(0x10,1)",%1 \n" | |
| 2480 "sub $0x10,%2 \n" | |
| 2481 "jg 1b \n" | |
| 2482 : "+r"(src), // %0 | |
| 2483 "+r"(dst), // %1 | |
| 2484 "+r"(temp_width) // %2 | |
| 2485 : | |
| 2486 : "memory", "cc", NACL_R14 | |
| 2487 "xmm0", "xmm1" | |
| 2488 ); | |
| 2489 } | |
| 2490 #endif // HAS_MIRRORROW_SSE2 | |
| 2491 | |
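The deleted MirrorRow_SSE2 above built a 16-byte reversal out of word shifts plus pshuflw/pshufhw/pshufd; the surviving SSSE3/AVX2 paths do the same with a single pshufb table (kShuffleMirror). Behaviorally, all of them reduce to the trivial scalar loop, shown here as a reference sketch:

```cpp
#include <stdint.h>

// Write the source row back-to-front. The SIMD versions process 16 or 32
// bytes per iteration by reading from the end of the row and reversing
// each block in registers.
static void MirrorRow_Sketch(const uint8_t* src, uint8_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    dst[i] = src[width - 1 - i];
  }
}
```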
| 2492 #ifdef HAS_MIRRORROW_UV_SSSE3 | 2460 #ifdef HAS_MIRRORROW_UV_SSSE3 |
| 2493 // Shuffle table for reversing the bytes of UV channels. | 2461 // Shuffle table for reversing the bytes of UV channels. |
| 2494 static uvec8 kShuffleMirrorUV = { | 2462 static uvec8 kShuffleMirrorUV = { |
| 2495 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u | 2463 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u |
| 2496 }; | 2464 }; |
| 2497 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, | 2465 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, |
| 2498 int width) { | 2466 int width) { |
| 2499 intptr_t temp_width = (intptr_t)(width); | 2467 intptr_t temp_width = (intptr_t)(width); |
| 2500 asm volatile ( | 2468 asm volatile ( |
| 2501 "movdqa %4,%%xmm1 \n" | 2469 "movdqa %4,%%xmm1 \n" |
| (...skipping 824 matching lines...) | |
| 3326 "+r"(dst_u), // %1 | 3294 "+r"(dst_u), // %1 |
| 3327 "+r"(dst_v), // %2 | 3295 "+r"(dst_v), // %2 |
| 3328 "+r"(pix) // %3 | 3296 "+r"(pix) // %3 |
| 3329 : | 3297 : |
| 3330 : "memory", "cc", NACL_R14 | 3298 : "memory", "cc", NACL_R14 |
| 3331 "xmm0", "xmm1", "xmm5" | 3299 "xmm0", "xmm1", "xmm5" |
| 3332 ); | 3300 ); |
| 3333 } | 3301 } |
| 3334 #endif // HAS_YUY2TOYROW_AVX2 | 3302 #endif // HAS_YUY2TOYROW_AVX2 |
| 3335 | 3303 |
| 3336 #ifdef HAS_ARGBBLENDROW_SSE2 | |
| 3337 // Blend 8 pixels at a time. | |
| 3338 void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, | |
| 3339 uint8* dst_argb, int width) { | |
| 3340 asm volatile ( | |
| 3341 "pcmpeqb %%xmm7,%%xmm7 \n" | |
| 3342 "psrlw $0xf,%%xmm7 \n" | |
| 3343 "pcmpeqb %%xmm6,%%xmm6 \n" | |
| 3344 "psrlw $0x8,%%xmm6 \n" | |
| 3345 "pcmpeqb %%xmm5,%%xmm5 \n" | |
| 3346 "psllw $0x8,%%xmm5 \n" | |
| 3347 "pcmpeqb %%xmm4,%%xmm4 \n" | |
| 3348 "pslld $0x18,%%xmm4 \n" | |
| 3349 "sub $0x4,%3 \n" | |
| 3350 "jl 49f \n" | |
| 3351 | |
| 3352 // 4 pixel loop. | |
| 3353 LABELALIGN | |
| 3354 "41: \n" | |
| 3355 "movdqu " MEMACCESS(0) ",%%xmm3 \n" | |
| 3356 "lea " MEMLEA(0x10,0) ",%0 \n" | |
| 3357 "movdqa %%xmm3,%%xmm0 \n" | |
| 3358 "pxor %%xmm4,%%xmm3 \n" | |
| 3359 "movdqu " MEMACCESS(1) ",%%xmm2 \n" | |
| 3360 "psrlw $0x8,%%xmm3 \n" | |
| 3361 "pshufhw $0xf5,%%xmm3,%%xmm3 \n" | |
| 3362 "pshuflw $0xf5,%%xmm3,%%xmm3 \n" | |
| 3363 "pand %%xmm6,%%xmm2 \n" | |
| 3364 "paddw %%xmm7,%%xmm3 \n" | |
| 3365 "pmullw %%xmm3,%%xmm2 \n" | |
| 3366 "movdqu " MEMACCESS(1) ",%%xmm1 \n" | |
| 3367 "lea " MEMLEA(0x10,1) ",%1 \n" | |
| 3368 "psrlw $0x8,%%xmm1 \n" | |
| 3369 "por %%xmm4,%%xmm0 \n" | |
| 3370 "pmullw %%xmm3,%%xmm1 \n" | |
| 3371 "psrlw $0x8,%%xmm2 \n" | |
| 3372 "paddusb %%xmm2,%%xmm0 \n" | |
| 3373 "pand %%xmm5,%%xmm1 \n" | |
| 3374 "paddusb %%xmm1,%%xmm0 \n" | |
| 3375 "movdqu %%xmm0," MEMACCESS(2) " \n" | |
| 3376 "lea " MEMLEA(0x10,2) ",%2 \n" | |
| 3377 "sub $0x4,%3 \n" | |
| 3378 "jge 41b \n" | |
| 3379 | |
| 3380 "49: \n" | |
| 3381 "add $0x3,%3 \n" | |
| 3382 "jl 99f \n" | |
| 3383 | |
| 3384 // 1 pixel loop. | |
| 3385 "91: \n" | |
| 3386 "movd " MEMACCESS(0) ",%%xmm3 \n" | |
| 3387 "lea " MEMLEA(0x4,0) ",%0 \n" | |
| 3388 "movdqa %%xmm3,%%xmm0 \n" | |
| 3389 "pxor %%xmm4,%%xmm3 \n" | |
| 3390 "movd " MEMACCESS(1) ",%%xmm2 \n" | |
| 3391 "psrlw $0x8,%%xmm3 \n" | |
| 3392 "pshufhw $0xf5,%%xmm3,%%xmm3 \n" | |
| 3393 "pshuflw $0xf5,%%xmm3,%%xmm3 \n" | |
| 3394 "pand %%xmm6,%%xmm2 \n" | |
| 3395 "paddw %%xmm7,%%xmm3 \n" | |
| 3396 "pmullw %%xmm3,%%xmm2 \n" | |
| 3397 "movd " MEMACCESS(1) ",%%xmm1 \n" | |
| 3398 "lea " MEMLEA(0x4,1) ",%1 \n" | |
| 3399 "psrlw $0x8,%%xmm1 \n" | |
| 3400 "por %%xmm4,%%xmm0 \n" | |
| 3401 "pmullw %%xmm3,%%xmm1 \n" | |
| 3402 "psrlw $0x8,%%xmm2 \n" | |
| 3403 "paddusb %%xmm2,%%xmm0 \n" | |
| 3404 "pand %%xmm5,%%xmm1 \n" | |
| 3405 "paddusb %%xmm1,%%xmm0 \n" | |
| 3406 "movd %%xmm0," MEMACCESS(2) " \n" | |
| 3407 "lea " MEMLEA(0x4,2) ",%2 \n" | |
| 3408 "sub $0x1,%3 \n" | |
| 3409 "jge 91b \n" | |
| 3410 "99: \n" | |
| 3411 : "+r"(src_argb0), // %0 | |
| 3412 "+r"(src_argb1), // %1 | |
| 3413 "+r"(dst_argb), // %2 | |
| 3414 "+r"(width) // %3 | |
| 3415 : | |
| 3416 : "memory", "cc" | |
| 3417 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | |
| 3418 ); | |
| 3419 } | |
| 3420 #endif // HAS_ARGBBLENDROW_SSE2 | |
| 3421 | |
| 3422 #ifdef HAS_ARGBBLENDROW_SSSE3 | 3304 #ifdef HAS_ARGBBLENDROW_SSSE3 |
| 3423 // Shuffle table for isolating alpha. | 3305 // Shuffle table for isolating alpha. |
| 3424 static uvec8 kShuffleAlpha = { | 3306 static uvec8 kShuffleAlpha = { |
| 3425 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, | 3307 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, |
| 3426 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 | 3308 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 |
| 3427 }; | 3309 }; |
| 3428 | 3310 |
| 3429 // Blend 8 pixels at a time | 3311 // Blend 8 pixels at a time |
| 3430 // Shuffle table for reversing the bytes. | |
| 3431 | |
| 3432 // Same as SSE2, but replaces | |
| 3433 // psrlw xmm3, 8 // alpha | |
| 3434 // pshufhw xmm3, xmm3,0F5h // 8 alpha words | |
| 3435 // pshuflw xmm3, xmm3,0F5h | |
| 3436 // with.. | |
| 3437 // pshufb xmm3, kShuffleAlpha // alpha | |
| 3438 | |
| 3439 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, | 3312 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, |
| 3440 uint8* dst_argb, int width) { | 3313 uint8* dst_argb, int width) { |
| 3441 asm volatile ( | 3314 asm volatile ( |
| 3442 "pcmpeqb %%xmm7,%%xmm7 \n" | 3315 "pcmpeqb %%xmm7,%%xmm7 \n" |
| 3443 "psrlw $0xf,%%xmm7 \n" | 3316 "psrlw $0xf,%%xmm7 \n" |
| 3444 "pcmpeqb %%xmm6,%%xmm6 \n" | 3317 "pcmpeqb %%xmm6,%%xmm6 \n" |
| 3445 "psrlw $0x8,%%xmm6 \n" | 3318 "psrlw $0x8,%%xmm6 \n" |
| 3446 "pcmpeqb %%xmm5,%%xmm5 \n" | 3319 "pcmpeqb %%xmm5,%%xmm5 \n" |
| 3447 "psllw $0x8,%%xmm5 \n" | 3320 "psllw $0x8,%%xmm5 \n" |
| 3448 "pcmpeqb %%xmm4,%%xmm4 \n" | 3321 "pcmpeqb %%xmm4,%%xmm4 \n" |
| (...skipping 60 matching lines...) | |
| 3509 "+r"(src_argb1), // %1 | 3382 "+r"(src_argb1), // %1 |
| 3510 "+r"(dst_argb), // %2 | 3383 "+r"(dst_argb), // %2 |
| 3511 "+r"(width) // %3 | 3384 "+r"(width) // %3 |
| 3512 : "m"(kShuffleAlpha) // %4 | 3385 : "m"(kShuffleAlpha) // %4 |
| 3513 : "memory", "cc" | 3386 : "memory", "cc" |
| 3514 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | 3387 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" |
| 3515 ); | 3388 ); |
| 3516 } | 3389 } |
| 3517 #endif // HAS_ARGBBLENDROW_SSSE3 | 3390 #endif // HAS_ARGBBLENDROW_SSSE3 |
| 3518 | 3391 |
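Both blend rows (the deleted SSE2 one and the SSSE3 one kept here) compute a source-over blend driven by src_argb0's alpha: pxor against the alpha mask yields 255 - a, paddw of 1 makes 256 - a, pmullw/psrlw apply it to src_argb1, paddusb adds with saturation, and por forces the output alpha opaque. In scalar form, as a model of the math rather than libyuv's actual C fallback:

```cpp
#include <stdint.h>

static uint8_t SatAdd8(uint32_t a, uint32_t b) {  // paddusb, per byte
  uint32_t s = a + b;
  return (uint8_t)(s > 255 ? 255 : s);
}

// dst = src0 + src1 * (256 - alpha0) / 256, output alpha forced to 0xff.
static void ARGBBlendPixel_Sketch(const uint8_t s0[4], const uint8_t s1[4],
                                  uint8_t d[4]) {
  uint32_t f = 256 - s0[3];                 // pxor mask, then paddw xmm7
  d[0] = SatAdd8(s0[0], (s1[0] * f) >> 8);  // blue
  d[1] = SatAdd8(s0[1], (s1[1] * f) >> 8);  // green
  d[2] = SatAdd8(s0[2], (s1[2] * f) >> 8);  // red
  d[3] = 0xff;                              // por with the alpha mask
}
```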
| 3519 #ifdef HAS_ARGBATTENUATEROW_SSE2 | |
| 3520 // Attenuate 4 pixels at a time. | |
| 3521 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { | |
| 3522 asm volatile ( | |
| 3523 "pcmpeqb %%xmm4,%%xmm4 \n" | |
| 3524 "pslld $0x18,%%xmm4 \n" | |
| 3525 "pcmpeqb %%xmm5,%%xmm5 \n" | |
| 3526 "psrld $0x8,%%xmm5 \n" | |
| 3527 | |
| 3528 // 4 pixel loop. | |
| 3529 LABELALIGN | |
| 3530 "1: \n" | |
| 3531 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | |
| 3532 "punpcklbw %%xmm0,%%xmm0 \n" | |
| 3533 "pshufhw $0xff,%%xmm0,%%xmm2 \n" | |
| 3534 "pshuflw $0xff,%%xmm2,%%xmm2 \n" | |
| 3535 "pmulhuw %%xmm2,%%xmm0 \n" | |
| 3536 "movdqu " MEMACCESS(0) ",%%xmm1 \n" | |
| 3537 "punpckhbw %%xmm1,%%xmm1 \n" | |
| 3538 "pshufhw $0xff,%%xmm1,%%xmm2 \n" | |
| 3539 "pshuflw $0xff,%%xmm2,%%xmm2 \n" | |
| 3540 "pmulhuw %%xmm2,%%xmm1 \n" | |
| 3541 "movdqu " MEMACCESS(0) ",%%xmm2 \n" | |
| 3542 "lea " MEMLEA(0x10,0) ",%0 \n" | |
| 3543 "psrlw $0x8,%%xmm0 \n" | |
| 3544 "pand %%xmm4,%%xmm2 \n" | |
| 3545 "psrlw $0x8,%%xmm1 \n" | |
| 3546 "packuswb %%xmm1,%%xmm0 \n" | |
| 3547 "pand %%xmm5,%%xmm0 \n" | |
| 3548 "por %%xmm2,%%xmm0 \n" | |
| 3549 "movdqu %%xmm0," MEMACCESS(1) " \n" | |
| 3550 "lea " MEMLEA(0x10,1) ",%1 \n" | |
| 3551 "sub $0x4,%2 \n" | |
| 3552 "jg 1b \n" | |
| 3553 : "+r"(src_argb), // %0 | |
| 3554 "+r"(dst_argb), // %1 | |
| 3555 "+r"(width) // %2 | |
| 3556 : | |
| 3557 : "memory", "cc" | |
| 3558 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | |
| 3559 ); | |
| 3560 } | |
| 3561 #endif // HAS_ARGBATTENUATEROW_SSE2 | |
| 3562 | |
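ARGBAttenuate premultiplies: each color channel is scaled by the pixel's own alpha while alpha itself is preserved. The SSE2 version deleted above and the SSSE3 one that follows both do this with a widening unpack and pmulhuw; a scalar approximation of the result (hedged: the SIMD fixed-point rounding can differ by one from an exact multiply-by-a/255):

```cpp
#include <stdint.h>

// Premultiply: c' ~= c * a / 255. The SIMD rows duplicate each byte into
// a 16-bit value (punpcklbw x,x) and take the high half of the product
// (pmulhuw), which approximates this rounded division.
static void ARGBAttenuatePixel_Sketch(const uint8_t s[4], uint8_t d[4]) {
  uint32_t a = s[3];
  d[0] = (uint8_t)((s[0] * a + 127) / 255);  // blue
  d[1] = (uint8_t)((s[1] * a + 127) / 255);  // green
  d[2] = (uint8_t)((s[2] * a + 127) / 255);  // red
  d[3] = s[3];                               // alpha kept (pand/por path)
}
```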
| 3563 #ifdef HAS_ARGBATTENUATEROW_SSSE3 | 3392 #ifdef HAS_ARGBATTENUATEROW_SSSE3 |
| 3564 // Shuffle table duplicating alpha | 3393 // Shuffle table duplicating alpha |
| 3565 static uvec8 kShuffleAlpha0 = { | 3394 static uvec8 kShuffleAlpha0 = { |
| 3566 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u | 3395 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u |
| 3567 }; | 3396 }; |
| 3568 static uvec8 kShuffleAlpha1 = { | 3397 static uvec8 kShuffleAlpha1 = { |
| 3569 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, | 3398 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, |
| 3570 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u | 3399 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u |
| 3571 }; | 3400 }; |
| 3572 // Attenuate 4 pixels at a time. | 3401 // Attenuate 4 pixels at a time. |
| (...skipping 2010 matching lines...) | |
| 5583 ); | 5412 ); |
| 5584 } | 5413 } |
| 5585 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 5414 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
| 5586 | 5415 |
| 5587 #endif // defined(__x86_64__) || defined(__i386__) | 5416 #endif // defined(__x86_64__) || defined(__i386__) |
| 5588 | 5417 |
| 5589 #ifdef __cplusplus | 5418 #ifdef __cplusplus |
| 5590 } // extern "C" | 5419 } // extern "C" |
| 5591 } // namespace libyuv | 5420 } // namespace libyuv |
| 5592 #endif | 5421 #endif |