OLD | NEW |
1 // VERSION 2 | 1 // VERSION 2 |
2 /* | 2 /* |
3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
4 * | 4 * |
5 * Use of this source code is governed by a BSD-style license | 5 * Use of this source code is governed by a BSD-style license |
6 * that can be found in the LICENSE file in the root of the source | 6 * that can be found in the LICENSE file in the root of the source |
7 * tree. An additional intellectual property rights grant can be found | 7 * tree. An additional intellectual property rights grant can be found |
8 * in the file PATENTS. All contributing project authors may | 8 * in the file PATENTS. All contributing project authors may |
9 * be found in the AUTHORS file in the root of the source tree. | 9 * be found in the AUTHORS file in the root of the source tree. |
10 */ | 10 */ |
(...skipping 1508 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1519 MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ | 1519 MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ |
1520 "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \ | 1520 "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \ |
1521 "punpcklbw %%xmm1,%%xmm0 \n" \ | 1521 "punpcklbw %%xmm1,%%xmm0 \n" \ |
1522 "punpcklwd %%xmm0,%%xmm0 \n" \ | 1522 "punpcklwd %%xmm0,%%xmm0 \n" \ |
1523 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ | 1523 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
1524 "punpcklbw %%xmm4,%%xmm4 \n" \ | 1524 "punpcklbw %%xmm4,%%xmm4 \n" \ |
1525 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \ | 1525 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \ |
1526 "movq " MEMACCESS([a_buf]) ",%%xmm5 \n" \ | 1526 "movq " MEMACCESS([a_buf]) ",%%xmm5 \n" \ |
1527 "lea " MEMLEA(0x8, [a_buf]) ",%[a_buf] \n" | 1527 "lea " MEMLEA(0x8, [a_buf]) ",%[a_buf] \n" |
1528 | 1528 |
1529 // Read 2 UV from 411, upsample to 8 UV. | |
1530 // reading 4 bytes is an msan violation. | |
1531 // "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" | |
1532 // MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) | |
1533 // pinsrw fails with drmemory | |
1534 // __asm pinsrw xmm0, [esi], 0 /* U */ | |
1535 // __asm pinsrw xmm1, [esi + edi], 0 /* V */ | |
// READYUV411_TEMP loads only 2 bytes from each chroma plane: movzwl
// zero-extends a 16-bit load into the scratch operand %[temp], and movd then
// moves it into xmm0 (U) / xmm1 (V). This replaces the 4-byte movd and the
// pinsrw variants that are kept above as commented-out alternatives.
// The V load goes through MEMOPARG with [u_buf] and [v_buf] as its two
// address operands; presumably this addresses u_buf + v_buf, which matches the
// user in this file that subtracts u_buf from v_buf before the loop -- confirm
// against the MEMOPARG definition.
// Shuffle sequence on xmm0:
//   punpcklbw  interleaves the U and V bytes into UV pairs,
//   punpcklwd  duplicates each 16-bit UV pair,
//   punpckldq  duplicates each 32-bit group again,
// so each of the 2 UV pairs ends up replicated 4 times (8 UV pairs total).
// xmm4 receives 8 Y bytes, each duplicated into both halves of a 16-bit word.
// Each expansion advances u_buf by 2 bytes and y_buf by 8 bytes.
1536 #define READYUV411_TEMP \ | |
1537 "movzwl " MEMACCESS([u_buf]) ",%[temp] \n" \ | |
1538 "movd %[temp],%%xmm0 \n" \ | |
1539 MEMOPARG(movzwl, 0x00, [u_buf], [v_buf], 1, [temp]) " \n" \ | |
1540 "movd %[temp],%%xmm1 \n" \ | |
1541 "lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \ | |
1542 "punpcklbw %%xmm1,%%xmm0 \n" \ | |
1543 "punpcklwd %%xmm0,%%xmm0 \n" \ | |
1544 "punpckldq %%xmm0,%%xmm0 \n" \ | |
1545 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ | |
1546 "punpcklbw %%xmm4,%%xmm4 \n" \ | |
1547 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" | |
1548 | |
1549 // Read 4 UV from NV12, upsample to 8 UV | 1529 // Read 4 UV from NV12, upsample to 8 UV |
// The single movq fetches 8 bytes from uv_buf into xmm0 (the 4 UV pairs named
// in the line above), and uv_buf is then advanced by 8 bytes.
// punpcklwd with itself duplicates every 16-bit element, so each UV pair
// appears twice in xmm0 (8 UV pairs).
// xmm4 receives 8 Y bytes; punpcklbw with itself duplicates every byte, and
// y_buf is advanced by 8 bytes.
1550 #define READNV12 \ | 1530 #define READNV12 \ |
1551 "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ | 1531 "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ |
1552 "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \ | 1532 "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \ |
1553 "punpcklwd %%xmm0,%%xmm0 \n" \ | 1533 "punpcklwd %%xmm0,%%xmm0 \n" \ |
1554 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ | 1534 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
1555 "punpcklbw %%xmm4,%%xmm4 \n" \ | 1535 "punpcklbw %%xmm4,%%xmm4 \n" \ |
1556 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" | 1536 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" |
1557 | 1537 |
1558 // Read 4 VU from NV21, upsample to 8 UV | 1538 // Read 4 VU from NV21, upsample to 8 UV |
(...skipping 238 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1797 #else | 1777 #else |
1798 [width]"+rm"(width) // %[width] | 1778 [width]"+rm"(width) // %[width] |
1799 #endif | 1779 #endif |
1800 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | 1780 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
1801 : "memory", "cc", NACL_R14 YUVTORGB_REGS | 1781 : "memory", "cc", NACL_R14 YUVTORGB_REGS |
1802 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 1782 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
1803 ); | 1783 ); |
1804 } | 1784 } |
1805 #endif // HAS_I422ALPHATOARGBROW_SSSE3 | 1785 #endif // HAS_I422ALPHATOARGBROW_SSSE3 |
1806 | 1786 |
1807 #ifdef HAS_I411TOARGBROW_SSSE3 | |
// Converts one row of planar Y/U/V input (I411, per the function name) to
// ARGB using inline assembly, 8 pixels per loop pass.
//   y_buf, u_buf, v_buf : source planes; all three are read-write asm
//                         operands and are advanced/modified by the loop.
//   dst_argb            : destination row, written by STOREARGB.
//   yuvconstants        : conversion constants handed to YUVTORGB_SETUP and
//                         YUVTORGB.
//   width               : remaining pixel count; decremented by 8 per pass.
// The loop body runs before the first width test, so at least 8 pixels are
// always processed. NOTE(review): callers presumably pass a positive multiple
// of 8 -- confirm against the dispatch code.
1808 void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, | |
1809 const uint8* u_buf, | |
1810 const uint8* v_buf, | |
1811 uint8* dst_argb, | |
1812 const struct YuvConstants* yuvconstants, | |
1813 int width) { | |
// Scratch general-purpose register used by READYUV411_TEMP for its 16-bit
// chroma loads; it is write-only from the compiler's point of view.
1814 int temp; | |
1815 asm volatile ( | |
1816 YUVTORGB_SETUP(yuvconstants) | |
// Turn v_buf into (v_buf - u_buf) so the V plane can be reached from u_buf
// with a base + index operand inside READYUV411_TEMP.
1817 "sub %[u_buf],%[v_buf] \n" | |
// Set every bit of xmm5. NOTE(review): presumably the opaque alpha consumed
// by STOREARGB -- confirm against that macro.
1818 "pcmpeqb %%xmm5,%%xmm5 \n" | |
1819 LABELALIGN | |
1820 "1: \n" | |
1821 READYUV411_TEMP | |
1822 YUVTORGB(yuvconstants) | |
1823 STOREARGB | |
// 8 pixels were consumed; keep looping while the remaining width is > 0.
1824 "subl $0x8,%[width] \n" | |
1825 "jg 1b \n" | |
1826 : [y_buf]"+r"(y_buf), // %[y_buf] | |
1827 [u_buf]"+r"(u_buf), // %[u_buf] | |
1828 [v_buf]"+r"(v_buf), // %[v_buf] | |
1829 [dst_argb]"+r"(dst_argb), // %[dst_argb] | |
// "=&r": early-clobber output, so temp never shares a register with an input.
1830 [temp]"=&r"(temp), // %[temp] | |
// On 32-bit PIC builds width is forced into memory; presumably this is to
// relieve register pressure -- confirm.
1831 #if defined(__i386__) && defined(__pic__) | |
1832 [width]"+m"(width) // %[width] | |
1833 #else | |
1834 [width]"+rm"(width) // %[width] | |
1835 #endif | |
1836 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | |
1837 : "memory", "cc", NACL_R14 YUVTORGB_REGS | |
1838 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | |
1839 ); | |
1840 } | |
1841 #endif | |
1842 | |
1843 void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, | 1787 void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, |
1844 const uint8* uv_buf, | 1788 const uint8* uv_buf, |
1845 uint8* dst_argb, | 1789 uint8* dst_argb, |
1846 const struct YuvConstants* yuvconstants, | 1790 const struct YuvConstants* yuvconstants, |
1847 int width) { | 1791 int width) { |
1848 asm volatile ( | 1792 asm volatile ( |
1849 YUVTORGB_SETUP(yuvconstants) | 1793 YUVTORGB_SETUP(yuvconstants) |
1850 "pcmpeqb %%xmm5,%%xmm5 \n" | 1794 "pcmpeqb %%xmm5,%%xmm5 \n" |
1851 LABELALIGN | 1795 LABELALIGN |
1852 "1: \n" | 1796 "1: \n" |
(...skipping 153 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2006 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ | 1950 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ |
2007 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ | 1951 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ |
2008 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ | 1952 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
2009 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ | 1953 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ |
2010 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ | 1954 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ |
2011 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \ | 1955 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \ |
2012 "vmovdqu " MEMACCESS([a_buf]) ",%%xmm5 \n" \ | 1956 "vmovdqu " MEMACCESS([a_buf]) ",%%xmm5 \n" \ |
2013 "vpermq $0xd8,%%ymm5,%%ymm5 \n" \ | 1957 "vpermq $0xd8,%%ymm5,%%ymm5 \n" \ |
2014 "lea " MEMLEA(0x10, [a_buf]) ",%[a_buf] \n" | 1958 "lea " MEMLEA(0x10, [a_buf]) ",%[a_buf] \n" |
2015 | 1959 |
2016 // Read 4 UV from 411, upsample to 16 UV. | |
// vmovd loads 4 bytes of U from u_buf into xmm0; the MEMOPREG line loads 4
// bytes into xmm1 using [u_buf] and [v_buf] as its address operands
// (presumably u_buf + v_buf, i.e. the V plane when v_buf holds the V - U
// offset -- confirm against the MEMOPREG definition). u_buf then advances by
// 4 bytes.
// Shuffle sequence on ymm0:
//   vpunpcklbw        interleaves U and V bytes into 4 UV pairs,
//   vpunpcklwd        duplicates each 16-bit UV pair,
//   vpermq $0xd8      moves the second 64-bit element into the upper
//                     128-bit lane (element order 0,2,1,3),
//   vpunpckldq        duplicates each 32-bit group within each lane,
// leaving every UV pair replicated 4 times (16 UV pairs).
// ymm4 receives 16 Y bytes, split across the two lanes by vpermq and then
// byte-duplicated by vpunpcklbw; y_buf advances by 16 bytes.
2017 #define READYUV411_AVX2 \ | |
2018 "vmovd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ | |
2019 MEMOPREG(vmovd, 0x00, [u_buf], [v_buf], 1, xmm1) \ | |
2020 "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \ | |
2021 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ | |
2022 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ | |
2023 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ | |
2024 "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" \ | |
2025 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ | |
2026 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ | |
2027 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ | |
2028 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" | |
2029 | |
2030 // Read 8 UV from NV12, upsample to 16 UV. | 1960 // Read 8 UV from NV12, upsample to 16 UV. |
// vmovdqu loads 16 bytes from uv_buf into xmm0 (the 8 UV pairs named in the
// line above) and uv_buf advances by 16 bytes.
// vpermq $0xd8 reorders the 64-bit elements as 0,2,1,3 so that each 128-bit
// lane holds 4 UV pairs in its low half; vpunpcklwd with itself then
// duplicates every 16-bit pair, giving 16 UV pairs in ymm0.
// ymm4 receives 16 Y bytes, spread over both lanes by the same vpermq and
// byte-duplicated by vpunpcklbw; y_buf advances by 16 bytes.
2031 #define READNV12_AVX2 \ | 1961 #define READNV12_AVX2 \ |
2032 "vmovdqu " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ | 1962 "vmovdqu " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ |
2033 "lea " MEMLEA(0x10, [uv_buf]) ",%[uv_buf] \n" \ | 1963 "lea " MEMLEA(0x10, [uv_buf]) ",%[uv_buf] \n" \ |
2034 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ | 1964 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ |
2035 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ | 1965 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ |
2036 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ | 1966 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
2037 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ | 1967 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ |
2038 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ | 1968 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ |
2039 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" | 1969 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" |
(...skipping 116 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2156 [v_buf]"+r"(v_buf), // %[v_buf] | 2086 [v_buf]"+r"(v_buf), // %[v_buf] |
2157 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 2087 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
2158 [width]"+rm"(width) // %[width] | 2088 [width]"+rm"(width) // %[width] |
2159 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | 2089 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
2160 : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 | 2090 : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 |
2161 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 2091 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
2162 ); | 2092 ); |
2163 } | 2093 } |
2164 #endif // HAS_I444TOARGBROW_AVX2 | 2094 #endif // HAS_I444TOARGBROW_AVX2 |
2165 | 2095 |
2166 #ifdef HAS_I411TOARGBROW_AVX2 | |
2167 // 16 pixels | |
2168 // 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). | |
// Parameters:
//   y_buf, u_buf, v_buf : source planes; read-write asm operands that the
//                         loop advances/modifies.
//   dst_argb            : destination row, written by STOREARGB_AVX2.
//   yuvconstants        : conversion constants handed to YUVTORGB_SETUP_AVX2
//                         and YUVTORGB_AVX2.
//   width               : remaining pixel count; decremented by 16 per pass.
// The loop body runs before the first width test, so at least 16 pixels are
// always processed. NOTE(review): callers presumably pass a positive multiple
// of 16 -- confirm against the dispatch code.
2169 void OMITFP I411ToARGBRow_AVX2(const uint8* y_buf, | |
2170 const uint8* u_buf, | |
2171 const uint8* v_buf, | |
2172 uint8* dst_argb, | |
2173 const struct YuvConstants* yuvconstants, | |
2174 int width) { | |
2175 asm volatile ( | |
2176 YUVTORGB_SETUP_AVX2(yuvconstants) | |
// Turn v_buf into (v_buf - u_buf) so READYUV411_AVX2 can reach the V plane
// from u_buf with a base + index operand.
2177 "sub %[u_buf],%[v_buf] \n" | |
// Set every bit of ymm5. NOTE(review): presumably the opaque alpha consumed
// by STOREARGB_AVX2 -- confirm against that macro.
2178 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | |
2179 LABELALIGN | |
2180 "1: \n" | |
2181 READYUV411_AVX2 | |
2182 YUVTORGB_AVX2(yuvconstants) | |
2183 STOREARGB_AVX2 | |
// 16 pixels were consumed; keep looping while the remaining width is > 0.
2184 "sub $0x10,%[width] \n" | |
2185 "jg 1b \n" | |
// Clear the upper halves of the ymm registers before returning to
// non-AVX code.
2186 "vzeroupper \n" | |
2187 : [y_buf]"+r"(y_buf), // %[y_buf] | |
2188 [u_buf]"+r"(u_buf), // %[u_buf] | |
2189 [v_buf]"+r"(v_buf), // %[v_buf] | |
2190 [dst_argb]"+r"(dst_argb), // %[dst_argb] | |
2191 [width]"+rm"(width) // %[width] | |
2192 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | |
2193 : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 | |
2194 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | |
2195 ); | |
2196 } | |
2197 #endif // HAS_I411TOARGBROW_AVX2 | |
2198 | |
2199 #if defined(HAS_I422TOARGBROW_AVX2) | 2096 #if defined(HAS_I422TOARGBROW_AVX2) |
2200 // 16 pixels | 2097 // 16 pixels |
2201 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). | 2098 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
2202 void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf, | 2099 void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf, |
2203 const uint8* u_buf, | 2100 const uint8* u_buf, |
2204 const uint8* v_buf, | 2101 const uint8* v_buf, |
2205 uint8* dst_argb, | 2102 uint8* dst_argb, |
2206 const struct YuvConstants* yuvconstants, | 2103 const struct YuvConstants* yuvconstants, |
2207 int width) { | 2104 int width) { |
2208 asm volatile ( | 2105 asm volatile ( |
(...skipping 3385 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
5594 ); | 5491 ); |
5595 } | 5492 } |
5596 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 5493 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
5597 | 5494 |
5598 #endif // defined(__x86_64__) || defined(__i386__) | 5495 #endif // defined(__x86_64__) || defined(__i386__) |
5599 | 5496 |
5600 #ifdef __cplusplus | 5497 #ifdef __cplusplus |
5601 } // extern "C" | 5498 } // extern "C" |
5602 } // namespace libyuv | 5499 } // namespace libyuv |
5603 #endif | 5500 #endif |
OLD | NEW |