| OLD | NEW |
| 1 // VERSION 2 | 1 // VERSION 2 |
| 2 /* | 2 /* |
| 3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
| 4 * | 4 * |
| 5 * Use of this source code is governed by a BSD-style license | 5 * Use of this source code is governed by a BSD-style license |
| 6 * that can be found in the LICENSE file in the root of the source | 6 * that can be found in the LICENSE file in the root of the source |
| 7 * tree. An additional intellectual property rights grant can be found | 7 * tree. An additional intellectual property rights grant can be found |
| 8 * in the file PATENTS. All contributing project authors may | 8 * in the file PATENTS. All contributing project authors may |
| 9 * be found in the AUTHORS file in the root of the source tree. | 9 * be found in the AUTHORS file in the root of the source tree. |
| 10 */ | 10 */ |
| (...skipping 1508 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1519 MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ | 1519 MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ |
| 1520 "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \ | 1520 "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \ |
| 1521 "punpcklbw %%xmm1,%%xmm0 \n" \ | 1521 "punpcklbw %%xmm1,%%xmm0 \n" \ |
| 1522 "punpcklwd %%xmm0,%%xmm0 \n" \ | 1522 "punpcklwd %%xmm0,%%xmm0 \n" \ |
| 1523 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ | 1523 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
| 1524 "punpcklbw %%xmm4,%%xmm4 \n" \ | 1524 "punpcklbw %%xmm4,%%xmm4 \n" \ |
| 1525 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \ | 1525 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \ |
| 1526 "movq " MEMACCESS([a_buf]) ",%%xmm5 \n" \ | 1526 "movq " MEMACCESS([a_buf]) ",%%xmm5 \n" \ |
| 1527 "lea " MEMLEA(0x8, [a_buf]) ",%[a_buf] \n" | 1527 "lea " MEMLEA(0x8, [a_buf]) ",%[a_buf] \n" |
| 1528 | 1528 |
| 1529 // Read 2 UV from 411, upsample to 8 UV. | |
| 1530 // reading 4 bytes is an msan violation. | |
| 1531 // "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" | |
| 1532 // MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) | |
| 1533 // pinsrw fails with drmemory | |
| 1534 // __asm pinsrw xmm0, [esi], 0 /* U */ | |
| 1535 // __asm pinsrw xmm1, [esi + edi], 0 /* V */ | |
// READYUV411_TEMP: inline-asm fragment that loads one group of 8 pixels for
// the 4:1:1 path. It expects the enclosing asm statement to provide operands
// named [y_buf], [u_buf], [v_buf] and a general-purpose scratch [temp].
// Steps, in instruction order:
//   - movzwl + movd: read a 16-bit word (2 U bytes) from [u_buf], zero-extend
//     it into %[temp], then move it into xmm0. The comments preceding this
//     macro say a 4-byte read is an msan violation and that pinsrw fails
//     under drmemory, which is why the load goes through %[temp].
//   - MEMOPARG(movzwl, ...) + movd: the same 2-byte load for V into xmm1,
//     addressed with u_buf as base and v_buf as index (scale 1).
//     NOTE(review): MEMOPARG's expansion is not visible here; this presumably
//     relies on the caller having executed `sub %[u_buf],%[v_buf]` first so
//     that v_buf holds the V-minus-U offset - confirm.
//   - lea: advance u_buf by 2 bytes.
//   - punpcklbw: interleave the U and V bytes into UV pairs in xmm0.
//   - punpcklwd, punpckldq: duplicate each 16-bit UV pair twice and then
//     twice again, so 2 UV pairs become 8 UV pairs.
//   - movq + punpcklbw: load 8 Y bytes into xmm4 and duplicate each byte.
//   - lea: advance y_buf by 8 bytes.
// Registers written: xmm0, xmm1, xmm4 and %[temp].
| 1536 #define READYUV411_TEMP \ | |
| 1537 "movzwl " MEMACCESS([u_buf]) ",%[temp] \n" \ | |
| 1538 "movd %[temp],%%xmm0 \n" \ | |
| 1539 MEMOPARG(movzwl, 0x00, [u_buf], [v_buf], 1, [temp]) " \n" \ | |
| 1540 "movd %[temp],%%xmm1 \n" \ | |
| 1541 "lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \ | |
| 1542 "punpcklbw %%xmm1,%%xmm0 \n" \ | |
| 1543 "punpcklwd %%xmm0,%%xmm0 \n" \ | |
| 1544 "punpckldq %%xmm0,%%xmm0 \n" \ | |
| 1545 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ | |
| 1546 "punpcklbw %%xmm4,%%xmm4 \n" \ | |
| 1547 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" | |
| 1548 | |
| 1549 // Read 4 UV from NV12, upsample to 8 UV | 1529 // Read 4 UV from NV12, upsample to 8 UV |
// READNV12: inline-asm fragment that loads one group of 8 pixels from a Y
// plane plus an interleaved UV plane. It expects the enclosing asm statement
// to provide operands named [y_buf] and [uv_buf].
// Steps, in instruction order:
//   - movq: load 8 bytes (4 interleaved UV pairs) from [uv_buf] into xmm0.
//   - lea: advance uv_buf by 8 bytes.
//   - punpcklwd xmm0,xmm0: duplicate each 16-bit UV pair, giving 8 UV pairs.
//   - movq + punpcklbw xmm4,xmm4: load 8 Y bytes into xmm4 and duplicate
//     each byte.
//   - lea: advance y_buf by 8 bytes.
// Registers written: xmm0, xmm4.
| 1550 #define READNV12 \ | 1530 #define READNV12 \ |
| 1551 "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ | 1531 "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ |
| 1552 "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \ | 1532 "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \ |
| 1553 "punpcklwd %%xmm0,%%xmm0 \n" \ | 1533 "punpcklwd %%xmm0,%%xmm0 \n" \ |
| 1554 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ | 1534 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
| 1555 "punpcklbw %%xmm4,%%xmm4 \n" \ | 1535 "punpcklbw %%xmm4,%%xmm4 \n" \ |
| 1556 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" | 1536 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" |
| 1557 | 1537 |
| 1558 // Read 4 VU from NV21, upsample to 8 UV | 1538 // Read 4 VU from NV21, upsample to 8 UV |
| (...skipping 238 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1797 #else | 1777 #else |
| 1798 [width]"+rm"(width) // %[width] | 1778 [width]"+rm"(width) // %[width] |
| 1799 #endif | 1779 #endif |
| 1800 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | 1780 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
| 1801 : "memory", "cc", NACL_R14 YUVTORGB_REGS | 1781 : "memory", "cc", NACL_R14 YUVTORGB_REGS |
| 1802 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 1782 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
| 1803 ); | 1783 ); |
| 1804 } | 1784 } |
| 1805 #endif // HAS_I422ALPHATOARGBROW_SSSE3 | 1785 #endif // HAS_I422ALPHATOARGBROW_SSSE3 |
| 1806 | 1786 |
// I411ToARGBRow_SSSE3: row function built from a single asm statement that
// consumes Y/U/V input through READYUV411_TEMP and writes through STOREARGB,
// 8 pixels per loop iteration. Compiled only when HAS_I411TOARGBROW_SSSE3 is
// defined.
//
// Parameters:
//   y_buf, u_buf, v_buf - input plane pointers; all three are "+r" operands
//                         and are modified by the asm.
//   dst_argb            - output pointer ("+r" operand).
//   yuvconstants        - passed as an input operand to YUVTORGB_SETUP and
//                         YUVTORGB (their expansions are not visible here).
//   width               - pixel count; decremented by 8 per iteration.
//
// Flow:
//   - `sub %[u_buf],%[v_buf]` turns v_buf into an offset from u_buf, so the
//     read macro can address V relative to U and only u_buf is advanced.
//   - `pcmpeqb %%xmm5,%%xmm5` sets every bit of xmm5.
//     NOTE(review): presumably the 0xff alpha consumed by STOREARGB - confirm.
//   - Loop at label 1: read, convert, store, `subl $0x8,%[width]`, then
//     `jg 1b` repeats while the remaining width is greater than zero. The
//     body therefore runs at least once and always handles a full group of 8.
//     NOTE(review): callers presumably handle widths that are not a multiple
//     of 8 elsewhere - confirm.
//
// Operand notes:
//   - temp is a write-only, early-clobber register ("=&r") used purely as
//     scratch by READYUV411_TEMP.
//   - On i386 PIC builds width is forced to memory ("+m"); otherwise it may
//     be a register or memory ("+rm").
//     NOTE(review): presumably to relieve register pressure on 32-bit PIC -
//     confirm.
| 1807 #ifdef HAS_I411TOARGBROW_SSSE3 | |
| 1808 void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, | |
| 1809 const uint8* u_buf, | |
| 1810 const uint8* v_buf, | |
| 1811 uint8* dst_argb, | |
| 1812 const struct YuvConstants* yuvconstants, | |
| 1813 int width) { | |
| 1814 int temp; | |
| 1815 asm volatile ( | |
| 1816 YUVTORGB_SETUP(yuvconstants) | |
| 1817 "sub %[u_buf],%[v_buf] \n" | |
| 1818 "pcmpeqb %%xmm5,%%xmm5 \n" | |
| 1819 LABELALIGN | |
| 1820 "1: \n" | |
| 1821 READYUV411_TEMP | |
| 1822 YUVTORGB(yuvconstants) | |
| 1823 STOREARGB | |
| 1824 "subl $0x8,%[width] \n" | |
| 1825 "jg 1b \n" | |
| 1826 : [y_buf]"+r"(y_buf), // %[y_buf] | |
| 1827 [u_buf]"+r"(u_buf), // %[u_buf] | |
| 1828 [v_buf]"+r"(v_buf), // %[v_buf] | |
| 1829 [dst_argb]"+r"(dst_argb), // %[dst_argb] | |
| 1830 [temp]"=&r"(temp), // %[temp] | |
| 1831 #if defined(__i386__) && defined(__pic__) | |
| 1832 [width]"+m"(width) // %[width] | |
| 1833 #else | |
| 1834 [width]"+rm"(width) // %[width] | |
| 1835 #endif | |
| 1836 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | |
| 1837 : "memory", "cc", NACL_R14 YUVTORGB_REGS | |
| 1838 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | |
| 1839 ); | |
| 1840 } | |
| 1841 #endif | |
| 1842 | |
| 1843 void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, | 1787 void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, |
| 1844 const uint8* uv_buf, | 1788 const uint8* uv_buf, |
| 1845 uint8* dst_argb, | 1789 uint8* dst_argb, |
| 1846 const struct YuvConstants* yuvconstants, | 1790 const struct YuvConstants* yuvconstants, |
| 1847 int width) { | 1791 int width) { |
| 1848 asm volatile ( | 1792 asm volatile ( |
| 1849 YUVTORGB_SETUP(yuvconstants) | 1793 YUVTORGB_SETUP(yuvconstants) |
| 1850 "pcmpeqb %%xmm5,%%xmm5 \n" | 1794 "pcmpeqb %%xmm5,%%xmm5 \n" |
| 1851 LABELALIGN | 1795 LABELALIGN |
| 1852 "1: \n" | 1796 "1: \n" |
| (...skipping 153 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2006 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ | 1950 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ |
| 2007 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ | 1951 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ |
| 2008 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ | 1952 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
| 2009 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ | 1953 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ |
| 2010 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ | 1954 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ |
| 2011 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \ | 1955 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \ |
| 2012 "vmovdqu " MEMACCESS([a_buf]) ",%%xmm5 \n" \ | 1956 "vmovdqu " MEMACCESS([a_buf]) ",%%xmm5 \n" \ |
| 2013 "vpermq $0xd8,%%ymm5,%%ymm5 \n" \ | 1957 "vpermq $0xd8,%%ymm5,%%ymm5 \n" \ |
| 2014 "lea " MEMLEA(0x10, [a_buf]) ",%[a_buf] \n" | 1958 "lea " MEMLEA(0x10, [a_buf]) ",%[a_buf] \n" |
| 2015 | 1959 |
| 2016 // Read 4 UV from 411, upsample to 16 UV. | |
// READYUV411_AVX2: inline-asm fragment that loads one group of 16 pixels for
// the 4:1:1 path. It expects the enclosing asm statement to provide operands
// named [y_buf], [u_buf] and [v_buf].
// Steps, in instruction order:
//   - vmovd: load 4 U bytes from [u_buf] into xmm0.
//   - MEMOPREG(vmovd, ...): load 4 V bytes into xmm1, addressed with u_buf as
//     base and v_buf as index (scale 1).
//     NOTE(review): MEMOPREG's expansion is not visible here; this presumably
//     relies on the caller having executed `sub %[u_buf],%[v_buf]` first so
//     that v_buf holds the V-minus-U offset - confirm.
//   - lea: advance u_buf by 4 bytes.
//   - vpunpcklbw: interleave U and V bytes into UV pairs.
//   - vpunpcklwd: duplicate each 16-bit UV pair (4 pairs -> 8 pairs).
//   - vpermq $0xd8: reorder the 64-bit quarters to 0,2,1,3 so that the next
//     unpack, which works within each 128-bit lane, keeps pixels in order.
//   - vpunpckldq: duplicate each 32-bit group again (8 pairs -> 16 pairs).
//   - vmovdqu + vpermq $0xd8 + vpunpcklbw: load 16 Y bytes into xmm4, spread
//     them across both lanes, and duplicate each byte.
//   - lea: advance y_buf by 16 bytes.
// Registers written: ymm0, ymm1 (low part), ymm4.
| 2017 #define READYUV411_AVX2 \ | |
| 2018 "vmovd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ | |
| 2019 MEMOPREG(vmovd, 0x00, [u_buf], [v_buf], 1, xmm1) \ | |
| 2020 "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \ | |
| 2021 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ | |
| 2022 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ | |
| 2023 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ | |
| 2024 "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" \ | |
| 2025 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ | |
| 2026 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ | |
| 2027 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ | |
| 2028 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" | |
| 2029 | |
| 2030 // Read 8 UV from NV12, upsample to 16 UV. | 1960 // Read 8 UV from NV12, upsample to 16 UV. |
// READNV12_AVX2: inline-asm fragment that loads one group of 16 pixels from a
// Y plane plus an interleaved UV plane. It expects the enclosing asm
// statement to provide operands named [y_buf] and [uv_buf].
// Steps, in instruction order:
//   - vmovdqu: load 16 bytes (8 interleaved UV pairs) from [uv_buf] into xmm0.
//   - lea: advance uv_buf by 16 bytes.
//   - vpermq $0xd8: reorder the 64-bit quarters to 0,2,1,3 so that the next
//     unpack, which works within each 128-bit lane, keeps pixels in order.
//   - vpunpcklwd: duplicate each 16-bit UV pair (8 pairs -> 16 pairs).
//   - vmovdqu + vpermq $0xd8 + vpunpcklbw: load 16 Y bytes into xmm4, spread
//     them across both lanes, and duplicate each byte.
//   - lea: advance y_buf by 16 bytes.
// Registers written: ymm0, ymm4.
| 2031 #define READNV12_AVX2 \ | 1961 #define READNV12_AVX2 \ |
| 2032 "vmovdqu " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ | 1962 "vmovdqu " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ |
| 2033 "lea " MEMLEA(0x10, [uv_buf]) ",%[uv_buf] \n" \ | 1963 "lea " MEMLEA(0x10, [uv_buf]) ",%[uv_buf] \n" \ |
| 2034 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ | 1964 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ |
| 2035 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ | 1965 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ |
| 2036 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ | 1966 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
| 2037 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ | 1967 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ |
| 2038 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ | 1968 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ |
| 2039 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" | 1969 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" |
| (...skipping 116 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2156 [v_buf]"+r"(v_buf), // %[v_buf] | 2086 [v_buf]"+r"(v_buf), // %[v_buf] |
| 2157 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 2087 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
| 2158 [width]"+rm"(width) // %[width] | 2088 [width]"+rm"(width) // %[width] |
| 2159 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | 2089 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
| 2160 : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 | 2090 : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 |
| 2161 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 2091 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
| 2162 ); | 2092 ); |
| 2163 } | 2093 } |
| 2164 #endif // HAS_I444TOARGBROW_AVX2 | 2094 #endif // HAS_I444TOARGBROW_AVX2 |
| 2165 | 2095 |
// I411ToARGBRow_AVX2: row function built from a single asm statement that
// consumes Y/U/V input through READYUV411_AVX2 and writes through
// STOREARGB_AVX2, 16 pixels per loop iteration. Compiled only when
// HAS_I411TOARGBROW_AVX2 is defined.
//
// Parameters:
//   y_buf, u_buf, v_buf - input plane pointers; all three are "+r" operands
//                         and are modified by the asm.
//   dst_argb            - output pointer ("+r" operand).
//   yuvconstants        - passed as an input operand to YUVTORGB_SETUP_AVX2
//                         and YUVTORGB_AVX2 (expansions not visible here).
//   width               - pixel count ("+rm"); decremented by 16 per
//                         iteration.
//
// Flow:
//   - `sub %[u_buf],%[v_buf]` turns v_buf into an offset from u_buf, so the
//     read macro can address V relative to U and only u_buf is advanced.
//   - `vpcmpeqb %%ymm5,%%ymm5,%%ymm5` sets every bit of ymm5.
//     NOTE(review): presumably the 0xff alpha consumed by STOREARGB_AVX2 -
//     confirm.
//   - Loop at label 1: read, convert, store, `sub $0x10,%[width]`, then
//     `jg 1b` repeats while the remaining width is greater than zero. The
//     body therefore runs at least once and always handles a full group of
//     16.
//     NOTE(review): callers presumably handle widths that are not a multiple
//     of 16 elsewhere - confirm.
//   - `vzeroupper` clears the upper halves of the ymm registers before
//     returning, the usual way to avoid AVX-to-SSE transition penalties.
| 2166 #ifdef HAS_I411TOARGBROW_AVX2 | |
| 2167 // 16 pixels | |
| 2168 // 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). | |
| 2169 void OMITFP I411ToARGBRow_AVX2(const uint8* y_buf, | |
| 2170 const uint8* u_buf, | |
| 2171 const uint8* v_buf, | |
| 2172 uint8* dst_argb, | |
| 2173 const struct YuvConstants* yuvconstants, | |
| 2174 int width) { | |
| 2175 asm volatile ( | |
| 2176 YUVTORGB_SETUP_AVX2(yuvconstants) | |
| 2177 "sub %[u_buf],%[v_buf] \n" | |
| 2178 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | |
| 2179 LABELALIGN | |
| 2180 "1: \n" | |
| 2181 READYUV411_AVX2 | |
| 2182 YUVTORGB_AVX2(yuvconstants) | |
| 2183 STOREARGB_AVX2 | |
| 2184 "sub $0x10,%[width] \n" | |
| 2185 "jg 1b \n" | |
| 2186 "vzeroupper \n" | |
| 2187 : [y_buf]"+r"(y_buf), // %[y_buf] | |
| 2188 [u_buf]"+r"(u_buf), // %[u_buf] | |
| 2189 [v_buf]"+r"(v_buf), // %[v_buf] | |
| 2190 [dst_argb]"+r"(dst_argb), // %[dst_argb] | |
| 2191 [width]"+rm"(width) // %[width] | |
| 2192 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | |
| 2193 : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 | |
| 2194 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | |
| 2195 ); | |
| 2196 } | |
| 2197 #endif // HAS_I411TOARGBROW_AVX2 | |
| 2198 | |
| 2199 #if defined(HAS_I422TOARGBROW_AVX2) | 2096 #if defined(HAS_I422TOARGBROW_AVX2) |
| 2200 // 16 pixels | 2097 // 16 pixels |
| 2201 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). | 2098 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
| 2202 void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf, | 2099 void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf, |
| 2203 const uint8* u_buf, | 2100 const uint8* u_buf, |
| 2204 const uint8* v_buf, | 2101 const uint8* v_buf, |
| 2205 uint8* dst_argb, | 2102 uint8* dst_argb, |
| 2206 const struct YuvConstants* yuvconstants, | 2103 const struct YuvConstants* yuvconstants, |
| 2207 int width) { | 2104 int width) { |
| 2208 asm volatile ( | 2105 asm volatile ( |
| (...skipping 3385 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 5594 ); | 5491 ); |
| 5595 } | 5492 } |
| 5596 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 5493 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
| 5597 | 5494 |
| 5598 #endif // defined(__x86_64__) || defined(__i386__) | 5495 #endif // defined(__x86_64__) || defined(__i386__) |
| 5599 | 5496 |
| 5600 #ifdef __cplusplus | 5497 #ifdef __cplusplus |
| 5601 } // extern "C" | 5498 } // extern "C" |
| 5602 } // namespace libyuv | 5499 } // namespace libyuv |
| 5603 #endif | 5500 #endif |
| OLD | NEW |