OLD | NEW |
1 // VERSION 2 | 1 // VERSION 2 |
2 /* | 2 /* |
3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
4 * | 4 * |
5 * Use of this source code is governed by a BSD-style license | 5 * Use of this source code is governed by a BSD-style license |
6 * that can be found in the LICENSE file in the root of the source | 6 * that can be found in the LICENSE file in the root of the source |
7 * tree. An additional intellectual property rights grant can be found | 7 * tree. An additional intellectual property rights grant can be found |
8 * in the file PATENTS. All contributing project authors may | 8 * in the file PATENTS. All contributing project authors may |
9 * be found in the AUTHORS file in the root of the source tree. | 9 * be found in the AUTHORS file in the root of the source tree. |
10 */ | 10 */ |
(...skipping 1809 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1820 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | 1820 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
1821 : "memory", "cc", NACL_R14 | 1821 : "memory", "cc", NACL_R14 |
1822 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 1822 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
1823 ); | 1823 ); |
1824 } | 1824 } |
1825 | 1825 |
1826 #endif // HAS_I422TOARGBROW_SSSE3 | 1826 #endif // HAS_I422TOARGBROW_SSSE3 |
1827 | 1827 |
1828 // Read 8 UV from 422, upsample to 16 UV. | 1828 // Read 8 UV from 422, upsample to 16 UV. |
1829 #define READYUV422_AVX2 \ | 1829 #define READYUV422_AVX2 \ |
1830 "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ | 1830 "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ |
1831 MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \ | 1831 MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \ |
1832 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ | 1832 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ |
1833 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ | 1833 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ |
1834 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ | 1834 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ |
1835 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ | 1835 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ |
1836 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ | 1836 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
1837 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ | 1837 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ |
1838 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ | 1838 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ |
1839 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" | 1839 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" |
1840 | 1840 |
1841 // Read 4 YUY2 with 8 Y and update 4 UV to 8 UV. | 1841 // Read 8 UV from NV12, upsample to 16 UV. |
| 1842 #define READNV12_AVX2 \ |
| 1843 "vmovdqu " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ |
| 1844 "lea " MEMLEA(0x16, [uv_buf]) ",%[uv_buf] \n" \ |
| 1845 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ |
| 1846 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ |
| 1847 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
| 1848 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ |
| 1849 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ |
| 1850 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" |
| 1851 |
| 1852 // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV. |
1842 #define READYUY2_AVX2 \ | 1853 #define READYUY2_AVX2 \ |
1843 "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm4 \n" \ | 1854 "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm4 \n" \ |
1844 "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \ | 1855 "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \ |
1845 "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm0 \n" \ | 1856 "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm0 \n" \ |
1846 "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \ | 1857 "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \ |
1847 "lea " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf] \n" | 1858 "lea " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf] \n" |
1848 | 1859 |
1849 // Read 4 UYVY with 8 Y and update 4 UV to 8 UV. | 1860 // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV. |
1850 #define READUYVY_AVX2 \ | 1861 #define READUYVY_AVX2 \ |
1851 "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm4 \n" \ | 1862 "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm4 \n" \ |
1852 "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \ | 1863 "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \ |
1853 "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm0 \n" \ | 1864 "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm0 \n" \ |
1854 "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \ | 1865 "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \ |
1855 "lea " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf] \n" | 1866 "lea " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf] \n" |
1856 | 1867 |
1857 // Convert 16 pixels: 16 UV and 16 Y. | 1868 // Convert 16 pixels: 16 UV and 16 Y. |
1858 #define YUVTORGB_AVX2(YuvConstants) \ | 1869 #define YUVTORGB_AVX2(yuvconstants) \ |
1859 "vpmaddubsw " MEMACCESS2(64, [YuvConstants]) ",%%ymm0,%%ymm2 \n" \ | 1870 "vpmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2 \n" \ |
1860 "vpmaddubsw " MEMACCESS2(32, [YuvConstants]) ",%%ymm0,%%ymm1 \n" \ | 1871 "vpmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%ymm0,%%ymm1 \n" \ |
1861 "vpmaddubsw " MEMACCESS([YuvConstants]) ",%%ymm0,%%ymm0 \n" \ | 1872 "vpmaddubsw " MEMACCESS([yuvconstants]) ",%%ymm0,%%ymm0 \n" \ |
1862 "vmovdqu " MEMACCESS2(160, [YuvConstants]) ",%%ymm3 \n" \ | 1873 "vmovdqu " MEMACCESS2(160, [yuvconstants]) ",%%ymm3 \n" \ |
1863 "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \ | 1874 "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \ |
1864 "vmovdqu " MEMACCESS2(128, [YuvConstants]) ",%%ymm3 \n" \ | 1875 "vmovdqu " MEMACCESS2(128, [yuvconstants]) ",%%ymm3 \n" \ |
1865 "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \ | 1876 "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \ |
1866 "vmovdqu " MEMACCESS2(96, [YuvConstants]) ",%%ymm3 \n" \ | 1877 "vmovdqu " MEMACCESS2(96, [yuvconstants]) ",%%ymm3 \n" \ |
1867 "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \ | 1878 "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \ |
1868 "vpmulhuw " MEMACCESS2(192, [YuvConstants]) ",%%ymm4,%%ymm4 \n" \ | 1879 "vpmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%ymm4,%%ymm4 \n" \ |
1869 "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ | 1880 "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ |
1870 "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ | 1881 "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ |
1871 "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \ | 1882 "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \ |
1872 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ | 1883 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ |
1873 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ | 1884 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ |
1874 "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ | 1885 "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ |
1875 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ | 1886 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ |
1876 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ | 1887 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ |
1877 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" | 1888 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" |
1878 | 1889 |
(...skipping 161 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2040 [v_buf]"+r"(v_buf), // %[v_buf] | 2051 [v_buf]"+r"(v_buf), // %[v_buf] |
2041 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 2052 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
2042 [width]"+rm"(width) // %[width] | 2053 [width]"+rm"(width) // %[width] |
2043 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | 2054 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
2044 : "memory", "cc", NACL_R14 | 2055 : "memory", "cc", NACL_R14 |
2045 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 2056 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
2046 ); | 2057 ); |
2047 } | 2058 } |
2048 #endif // HAS_I422TORGBAROW_AVX2 | 2059 #endif // HAS_I422TORGBAROW_AVX2 |
2049 | 2060 |
| 2061 #if defined(HAS_NV12TOARGBROW_AVX2) |
| 2062 // 16 pixels. |
| 2063 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
| 2064 void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf, |
| 2065 const uint8* uv_buf, |
| 2066 uint8* dst_argb, |
| 2067 struct YuvConstants* yuvconstants, |
| 2068 int width) { |
| 2069 |
| 2070 asm volatile ( |
| 2071 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
| 2072 LABELALIGN |
| 2073 "1: \n" |
| 2074 READNV12_AVX2 |
| 2075 YUVTORGB_AVX2(yuvconstants) |
| 2076 STOREARGB_AVX2 |
| 2077 "sub $0x10,%[width] \n" |
| 2078 "jg 1b \n" |
| 2079 "vzeroupper \n" |
| 2080 : [y_buf]"+r"(y_buf), // %[y_buf] |
| 2081 [uv_buf]"+r"(uv_buf), // %[uv_buf] |
| 2082 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
| 2083 [width]"+rm"(width) // %[width] |
| 2084 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
| 2085 // Does not use r14. |
| 2086 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
| 2087 ); |
| 2088 } |
 | 2089 #endif // HAS_NV12TOARGBROW_AVX2 |
| 2090 |
| 2091 |
2050 #if defined(HAS_YUY2TOARGBROW_AVX2) | 2092 #if defined(HAS_YUY2TOARGBROW_AVX2) |
2051 // 16 pixels. | 2093 // 16 pixels. |
2052 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). | 2094 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). |
2053 void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf, | 2095 void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf, |
2054 uint8* dst_argb, | 2096 uint8* dst_argb, |
2055 struct YuvConstants* yuvconstants, | 2097 struct YuvConstants* yuvconstants, |
2056 int width) { | 2098 int width) { |
2057 | 2099 |
2058 asm volatile ( | 2100 asm volatile ( |
2059 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 2101 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
(...skipping 3314 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
5374 ); | 5416 ); |
5375 } | 5417 } |
5376 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 5418 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
5377 | 5419 |
5378 #endif // defined(__x86_64__) || defined(__i386__) | 5420 #endif // defined(__x86_64__) || defined(__i386__) |
5379 | 5421 |
5380 #ifdef __cplusplus | 5422 #ifdef __cplusplus |
5381 } // extern "C" | 5423 } // extern "C" |
5382 } // namespace libyuv | 5424 } // namespace libyuv |
5383 #endif | 5425 #endif |
OLD | NEW |