Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(2)

Side by Side Diff: source/row_gcc.cc

Issue 1364913002: NV12ToARGB_AVX2 ported to gcc (Closed) Base URL: https://chromium.googlesource.com/libyuv/libyuv@master
Patch Set: nv12 macro fixed Created 5 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « include/libyuv/row.h ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // VERSION 2 1 // VERSION 2
2 /* 2 /*
3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. 3 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
4 * 4 *
5 * Use of this source code is governed by a BSD-style license 5 * Use of this source code is governed by a BSD-style license
6 * that can be found in the LICENSE file in the root of the source 6 * that can be found in the LICENSE file in the root of the source
7 * tree. An additional intellectual property rights grant can be found 7 * tree. An additional intellectual property rights grant can be found
8 * in the file PATENTS. All contributing project authors may 8 * in the file PATENTS. All contributing project authors may
9 * be found in the AUTHORS file in the root of the source tree. 9 * be found in the AUTHORS file in the root of the source tree.
10 */ 10 */
(...skipping 1809 matching lines...) Expand 10 before | Expand all | Expand 10 after
1820 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] 1820 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
1821 : "memory", "cc", NACL_R14 1821 : "memory", "cc", NACL_R14
1822 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1822 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1823 ); 1823 );
1824 } 1824 }
1825 1825
1826 #endif // HAS_I422TOARGBROW_SSSE3 1826 #endif // HAS_I422TOARGBROW_SSSE3
1827 1827
1828 // Read 8 UV from 422, upsample to 16 UV. 1828 // Read 8 UV from 422, upsample to 16 UV.
1829 #define READYUV422_AVX2 \ 1829 #define READYUV422_AVX2 \
1830 "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ 1830 "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
1831 MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \ 1831 MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \
1832 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ 1832 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
1833 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ 1833 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
1834 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ 1834 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
1835 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ 1835 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
1836 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ 1836 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
1837 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ 1837 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
1838 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ 1838 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
1839 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" 1839 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
1840 1840
1841 // Read 4 YUY2 with 8 Y and update 4 UV to 8 UV. 1841 // Read 8 UV from NV12, upsample to 16 UV.
1842 #define READNV12_AVX2 \
1843 "vmovdqu " MEMACCESS([uv_buf]) ",%%xmm0 \n" /* load 16 bytes of interleaved UV */ \
1844 "lea " MEMLEA(0x10, [uv_buf]) ",%[uv_buf] \n" /* advance by the 16 bytes just read (was 0x16 = 22) */ \
1845 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
1846 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" /* duplicate each UV word: 8 UV -> 16 UV */ \
1847 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" /* load 16 Y bytes */ \
1848 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
1849 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
1850 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
1851
1852 // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
1842 #define READYUY2_AVX2 \ 1853 #define READYUY2_AVX2 \
1843 "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm4 \n" \ 1854 "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm4 \n" \
1844 "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \ 1855 "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \
1845 "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm0 \n" \ 1856 "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm0 \n" \
1846 "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \ 1857 "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \
1847 "lea " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf] \n" 1858 "lea " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf] \n"
1848 1859
1849 // Read 4 UYVY with 8 Y and update 4 UV to 8 UV. 1860 // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
1850 #define READUYVY_AVX2 \ 1861 #define READUYVY_AVX2 \
1851 "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm4 \n" \ 1862 "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm4 \n" \
1852 "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \ 1863 "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \
1853 "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm0 \n" \ 1864 "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm0 \n" \
1854 "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \ 1865 "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \
1855 "lea " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf] \n" 1866 "lea " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf] \n"
1856 1867
1857 // Convert 16 pixels: 16 UV and 16 Y. 1868 // Convert 16 pixels: 16 UV and 16 Y.
1858 #define YUVTORGB_AVX2(YuvConstants) \ 1869 #define YUVTORGB_AVX2(yuvconstants) \
1859 "vpmaddubsw " MEMACCESS2(64, [YuvConstants]) ",%%ymm0,%%ymm2 \n" \ 1870 "vpmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2 \n" \
1860 "vpmaddubsw " MEMACCESS2(32, [YuvConstants]) ",%%ymm0,%%ymm1 \n" \ 1871 "vpmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%ymm0,%%ymm1 \n" \
1861 "vpmaddubsw " MEMACCESS([YuvConstants]) ",%%ymm0,%%ymm0 \n" \ 1872 "vpmaddubsw " MEMACCESS([yuvconstants]) ",%%ymm0,%%ymm0 \n" \
1862 "vmovdqu " MEMACCESS2(160, [YuvConstants]) ",%%ymm3 \n" \ 1873 "vmovdqu " MEMACCESS2(160, [yuvconstants]) ",%%ymm3 \n" \
1863 "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \ 1874 "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \
1864 "vmovdqu " MEMACCESS2(128, [YuvConstants]) ",%%ymm3 \n" \ 1875 "vmovdqu " MEMACCESS2(128, [yuvconstants]) ",%%ymm3 \n" \
1865 "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \ 1876 "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \
1866 "vmovdqu " MEMACCESS2(96, [YuvConstants]) ",%%ymm3 \n" \ 1877 "vmovdqu " MEMACCESS2(96, [yuvconstants]) ",%%ymm3 \n" \
1867 "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \ 1878 "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \
1868 "vpmulhuw " MEMACCESS2(192, [YuvConstants]) ",%%ymm4,%%ymm4 \n" \ 1879 "vpmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%ymm4,%%ymm4 \n" \
1869 "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ 1880 "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
1870 "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ 1881 "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
1871 "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \ 1882 "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \
1872 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ 1883 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
1873 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ 1884 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
1874 "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ 1885 "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
1875 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ 1886 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
1876 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ 1887 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
1877 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" 1888 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
1878 1889
(...skipping 161 matching lines...) Expand 10 before | Expand all | Expand 10 after
2040 [v_buf]"+r"(v_buf), // %[v_buf] 2051 [v_buf]"+r"(v_buf), // %[v_buf]
2041 [dst_argb]"+r"(dst_argb), // %[dst_argb] 2052 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2042 [width]"+rm"(width) // %[width] 2053 [width]"+rm"(width) // %[width]
2043 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] 2054 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2044 : "memory", "cc", NACL_R14 2055 : "memory", "cc", NACL_R14
2045 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 2056 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2046 ); 2057 );
2047 } 2058 }
2048 #endif // HAS_I422TORGBAROW_AVX2 2059 #endif // HAS_I422TORGBAROW_AVX2
2049 2060
2061 #if defined(HAS_NV12TOARGBROW_AVX2)
2062 // Converts a row of NV12 (Y plane plus interleaved UV plane) to ARGB, 16 pixels per loop pass.
2063 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2064 void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf,
2065 const uint8* uv_buf,
2066 uint8* dst_argb,
2067 struct YuvConstants* yuvconstants,
2068 int width) {
2069
2070 asm volatile (
2071 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // ymm5 = all ones; presumably the alpha source for STOREARGB_AVX2 (confirm).
2072 LABELALIGN
2073 "1: \n"
2074 READNV12_AVX2
2075 YUVTORGB_AVX2(yuvconstants)
2076 STOREARGB_AVX2
2077 "sub $0x10,%[width] \n" // 16 pixels handled per pass.
2078 "jg 1b \n" // Repeat while the remaining width is positive.
2079 "vzeroupper \n"
2080 : [y_buf]"+r"(y_buf), // %[y_buf]
2081 [uv_buf]"+r"(uv_buf), // %[uv_buf]
2082 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2083 [width]"+rm"(width) // %[width]
2084 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2085 // Does not use r14, so NACL_R14 is not in the clobber list.
2086 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2087 );
2088 }
2089 #endif // HAS_NV12TOARGBROW_AVX2
2090
2091
2050 #if defined(HAS_YUY2TOARGBROW_AVX2) 2092 #if defined(HAS_YUY2TOARGBROW_AVX2)
2051 // 16 pixels. 2093 // 16 pixels.
2052 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). 2094 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
2053 void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf, 2095 void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf,
2054 uint8* dst_argb, 2096 uint8* dst_argb,
2055 struct YuvConstants* yuvconstants, 2097 struct YuvConstants* yuvconstants,
2056 int width) { 2098 int width) {
2057 2099
2058 asm volatile ( 2100 asm volatile (
2059 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 2101 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
(...skipping 3314 matching lines...) Expand 10 before | Expand all | Expand 10 after
5374 ); 5416 );
5375 } 5417 }
5376 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 5418 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
5377 5419
5378 #endif // defined(__x86_64__) || defined(__i386__) 5420 #endif // defined(__x86_64__) || defined(__i386__)
5379 5421
5380 #ifdef __cplusplus 5422 #ifdef __cplusplus
5381 } // extern "C" 5423 } // extern "C"
5382 } // namespace libyuv 5424 } // namespace libyuv
5383 #endif 5425 #endif
OLDNEW
« no previous file with comments | « include/libyuv/row.h ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698