OLD | NEW |
1 // VERSION 2 | 1 // VERSION 2 |
2 /* | 2 /* |
3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
4 * | 4 * |
5 * Use of this source code is governed by a BSD-style license | 5 * Use of this source code is governed by a BSD-style license |
6 * that can be found in the LICENSE file in the root of the source | 6 * that can be found in the LICENSE file in the root of the source |
7 * tree. An additional intellectual property rights grant can be found | 7 * tree. An additional intellectual property rights grant can be found |
8 * in the file PATENTS. All contributing project authors may | 8 * in the file PATENTS. All contributing project authors may |
9 * be found in the AUTHORS file in the root of the source tree. | 9 * be found in the AUTHORS file in the root of the source tree. |
10 */ | 10 */ |
(...skipping 146 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
// Byte-shuffle control table. Each 16-entry row selects the odd source bytes
// (1, 3, ..., 15) and emits every selected byte twice; both rows are identical.
// NOTE(review): in UYVY byte order the odd bytes are presumably the Y samples,
// and the two identical rows presumably feed the two 128-bit lanes of a
// 256-bit shuffle - confirm against the users of this table (not visible here).
157 static const lvec8 kShuffleUYVYY = { | 157 static const lvec8 kShuffleUYVYY = { |
158 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15, | 158 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15, |
159 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15 | 159 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15 |
160 }; | 160 }; |
161 | 161 |
// Byte-shuffle control table. Each 16-entry row selects the even source byte
// pairs (0,2), (4,6), (8,10), (12,14) and emits every pair twice in a row,
// which doubles the number of UV pairs; both rows are identical.
162 // UYVY shuf 8 UV to 16 UV. | 162 // UYVY shuf 8 UV to 16 UV. |
163 static const lvec8 kShuffleUYVYUV = { | 163 static const lvec8 kShuffleUYVYUV = { |
164 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14, | 164 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14, |
165 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14 | 165 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14 |
166 }; | 166 }; |
| 167 |
// Byte-shuffle control table used as the pshufb / vpshufb control operand by
// READNV21 and READNV21_AVX2. Each 16-entry row reads only source bytes 0..7:
// every byte pair is swapped (1,0 / 3,2 / 5,4 / 7,6) and the swapped pair is
// emitted twice, so a V,U pair becomes U,V,U,V. Both rows are identical.
| 168 // NV21 shuf 8 VU to 16 UV. |
| 169 static const lvec8 kShuffleNV21 = { |
| 170 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, |
| 171 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, |
| 172 }; |
167 #endif // HAS_RGB24TOARGBROW_SSSE3 | 173 #endif // HAS_RGB24TOARGBROW_SSSE3 |
168 | 174 |
169 #ifdef HAS_J400TOARGBROW_SSE2 | 175 #ifdef HAS_J400TOARGBROW_SSE2 |
170 void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { | 176 void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { |
171 asm volatile ( | 177 asm volatile ( |
172 "pcmpeqb %%xmm5,%%xmm5 \n" | 178 "pcmpeqb %%xmm5,%%xmm5 \n" |
173 "pslld $0x18,%%xmm5 \n" | 179 "pslld $0x18,%%xmm5 \n" |
174 LABELALIGN | 180 LABELALIGN |
175 "1: \n" | 181 "1: \n" |
176 "movq " MEMACCESS(0) ",%%xmm0 \n" | 182 "movq " MEMACCESS(0) ",%%xmm0 \n" |
(...skipping 1214 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1391 | 1397 |
// Inline-asm fragment (AT&T operand order) for use inside a row loop.
//   movq      : loads 8 bytes from [uv_buf] into the low half of xmm0.
//   lea       : advances uv_buf by 8.
//   punpcklwd : interleaves xmm0 with itself, duplicating each 16-bit word of
//               the low half (every 2-byte UV pair appears twice).
//   movq      : loads 8 bytes from [y_buf] into the low half of xmm4.
//   punpcklbw : interleaves xmm4 with itself, duplicating every Y byte.
//   lea       : advances y_buf by 8.
// Expects asm operands named [uv_buf] and [y_buf]; leaves results in xmm0
// and xmm4. MEMACCESS / MEMLEA are defined elsewhere in the file; presumably
// they expand to a plain memory operand and a base+displacement address.
1392 // Read 4 UV from NV12, upsample to 8 UV | 1398 // Read 4 UV from NV12, upsample to 8 UV |
1393 #define READNV12 \ | 1399 #define READNV12 \ |
1394 "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ | 1400 "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ |
1395 "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \ | 1401 "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \ |
1396 "punpcklwd %%xmm0,%%xmm0 \n" \ | 1402 "punpcklwd %%xmm0,%%xmm0 \n" \ |
1397 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ | 1403 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
1398 "punpcklbw %%xmm4,%%xmm4 \n" \ | 1404 "punpcklbw %%xmm4,%%xmm4 \n" \ |
1399 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" | 1405 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" |
1400 | 1406 |
// Inline-asm fragment (AT&T operand order) for use inside a row loop.
//   movq      : loads 8 bytes from [vu_buf] into the low half of xmm0.
//   lea       : advances vu_buf by 8.
//   pshufb    : reorders xmm0 with the kShuffleNV21 control bytes, which swap
//               each V,U byte pair to U,V and emit the swapped pair twice.
//   movq      : loads 8 bytes from [y_buf] into the low half of xmm4.
//   punpcklbw : interleaves xmm4 with itself, duplicating every Y byte.
//   lea       : advances y_buf by 8.
// Expects asm operands named [vu_buf], [y_buf] and [kShuffleNV21]; leaves
// results in xmm0 and xmm4, the same registers READNV12 fills.
| 1407 // Read 4 VU from NV21, upsample to 8 UV |
| 1408 #define READNV21 \ |
| 1409 "movq " MEMACCESS([vu_buf]) ",%%xmm0 \n" \ |
| 1410 "lea " MEMLEA(0x8, [vu_buf]) ",%[vu_buf] \n" \ |
| 1411 "pshufb %[kShuffleNV21], %%xmm0 \n" \ |
| 1412 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
| 1413 "punpcklbw %%xmm4,%%xmm4 \n" \ |
| 1414 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" |
| 1415 |
// Inline-asm fragment (AT&T operand order) for use inside a row loop.
// The same 16 bytes at [yuy2_buf] are loaded twice with unaligned loads:
//   xmm4 <- load, then pshufb with the kShuffleYUY2Y control bytes;
//   xmm0 <- load, then pshufb with the kShuffleYUY2UV control bytes.
// yuy2_buf is then advanced by 0x10 (16) bytes.
// Expects asm operands named [yuy2_buf], [kShuffleYUY2Y] and [kShuffleYUY2UV].
// NOTE(review): the two shuffle tables are not visible in this view;
// presumably they extract the Y bytes and the upsampled UV bytes - confirm.
1401 // Read 4 YUY2 with 8 Y and update 4 UV to 8 UV. | 1416 // Read 4 YUY2 with 8 Y and update 4 UV to 8 UV. |
1402 #define READYUY2 \ | 1417 #define READYUY2 \ |
1403 "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm4 \n" \ | 1418 "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm4 \n" \ |
1404 "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \ | 1419 "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \ |
1405 "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm0 \n" \ | 1420 "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm0 \n" \ |
1406 "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \ | 1421 "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \ |
1407 "lea " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf] \n" | 1422 "lea " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf] \n" |
1408 | 1423 |
1409 // Read 4 UYVY with 8 Y and update 4 UV to 8 UV. | 1424 // Read 4 UYVY with 8 Y and update 4 UV to 8 UV. |
1410 #define READUYVY \ | 1425 #define READUYVY \ |
(...skipping 351 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1762 : [y_buf]"+r"(y_buf), // %[y_buf] | 1777 : [y_buf]"+r"(y_buf), // %[y_buf] |
1763 [uv_buf]"+r"(uv_buf), // %[uv_buf] | 1778 [uv_buf]"+r"(uv_buf), // %[uv_buf] |
1764 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 1779 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
1765 [width]"+rm"(width) // %[width] | 1780 [width]"+rm"(width) // %[width] |
1766 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | 1781 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
1767 // Does not use r14. | 1782 // Does not use r14. |
1768 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 1783 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
1769 ); | 1784 ); |
1770 } | 1785 } |
1771 | 1786 |
// Converts one row of NV21 (a Y plane plus an interleaved VU plane) to ARGB
// using SSSE3 inline assembly.
//
//   y_buf        - source Y bytes; advanced by 8 per loop iteration.
//   vu_buf       - source interleaved VU bytes; advanced by 8 per iteration.
//   dst_argb     - destination ARGB pixels, written by STOREARGB.
//   yuvconstants - conversion constants, passed in a register to YUVTORGB.
//   width        - pixel count; decremented by 8 per iteration.
//
// xmm5 is set to all ones before the loop (presumably the alpha source for
// STOREARGB - confirm in that macro). The loop body runs before the counter is
// tested and repeats while the remaining width is > 0, so every call handles
// at least 8 pixels and a width that is not a multiple of 8 is rounded up;
// callers must supply buffers sized accordingly.
// YUVTORGB and STOREARGB are defined elsewhere in the file.
| 1787 void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf, |
| 1788 const uint8* vu_buf, |
| 1789 uint8* dst_argb, |
| 1790 struct YuvConstants* yuvconstants, |
| 1791 int width) { |
| 1792 asm volatile ( |
| 1793 "pcmpeqb %%xmm5,%%xmm5 \n" |
| 1794 LABELALIGN |
| 1795 "1: \n" |
| 1796 READNV21 |
| 1797 YUVTORGB(yuvconstants) |
| 1798 STOREARGB |
| 1799 "sub $0x8,%[width] \n" |
| 1800 "jg 1b \n" |
| 1801 : [y_buf]"+r"(y_buf), // %[y_buf] |
| 1802 [vu_buf]"+r"(vu_buf), // %[vu_buf] |
| 1803 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
| 1804 [width]"+rm"(width) // %[width] |
| 1805 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] |
| 1806 [kShuffleNV21]"m"(kShuffleNV21) |
| 1807 // Does not use r14. |
| 1808 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
| 1809 ); |
| 1810 } |
| 1811 |
1772 void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf, | 1812 void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf, |
1773 uint8* dst_argb, | 1813 uint8* dst_argb, |
1774 struct YuvConstants* yuvconstants, | 1814 struct YuvConstants* yuvconstants, |
1775 int width) { | 1815 int width) { |
1776 asm volatile ( | 1816 asm volatile ( |
1777 "pcmpeqb %%xmm5,%%xmm5 \n" | 1817 "pcmpeqb %%xmm5,%%xmm5 \n" |
1778 LABELALIGN | 1818 LABELALIGN |
1779 "1: \n" | 1819 "1: \n" |
1780 READYUY2 | 1820 READYUY2 |
1781 YUVTORGB(yuvconstants) | 1821 YUVTORGB(yuvconstants) |
(...skipping 151 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
// Inline-asm fragment (AT&T operand order) for use inside an AVX2 row loop.
//   vmovdqu (xmm0) : loads 16 bytes from [uv_buf].
//   lea            : advances uv_buf by 0x10 (16).
//   vpermq $0xd8   : reorders the four 64-bit quarters of ymm0 to 0,2,1,3 so
//                    each 128-bit lane holds 8 of the loaded bytes in its low
//                    half.
//   vpunpcklwd     : per lane, interleaves the low half with itself,
//                    duplicating each 16-bit word (every UV pair twice).
//   vmovdqu (xmm4) : loads 16 bytes from [y_buf].
//   vpermq $0xd8   : same quarter reordering for the Y bytes.
//   vpunpcklbw     : per lane, duplicates every Y byte.
//   lea            : advances y_buf by 0x10 (16).
// Expects asm operands named [uv_buf] and [y_buf]; leaves results in ymm0
// and ymm4.
1933 #define READNV12_AVX2 \ | 1973 #define READNV12_AVX2 \ |
1934 "vmovdqu " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ | 1974 "vmovdqu " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ |
1935 "lea " MEMLEA(0x10, [uv_buf]) ",%[uv_buf] \n" \ | 1975 "lea " MEMLEA(0x10, [uv_buf]) ",%[uv_buf] \n" \ |
1936 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ | 1976 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ |
1937 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ | 1977 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ |
1938 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ | 1978 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
1939 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ | 1979 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ |
1940 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ | 1980 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ |
1941 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" | 1981 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" |
1942 | 1982 |
// Inline-asm fragment (AT&T operand order) for use inside an AVX2 row loop.
//   vmovdqu (xmm0) : loads 16 bytes from [vu_buf].
//   lea            : advances vu_buf by 0x10 (16).
//   vpermq $0xd8   : reorders the four 64-bit quarters of ymm0 to 0,2,1,3 so
//                    each 128-bit lane holds 8 of the loaded bytes in its low
//                    half.
//   vpshufb        : per lane, applies the kShuffleNV21 control bytes, which
//                    read only bytes 0..7, swap each V,U pair to U,V and emit
//                    the swapped pair twice.
//   vmovdqu (xmm4) : loads 16 bytes from [y_buf].
//   vpermq $0xd8   : same quarter reordering for the Y bytes.
//   vpunpcklbw     : per lane, duplicates every Y byte.
//   lea            : advances y_buf by 0x10 (16).
// Expects asm operands named [vu_buf], [y_buf] and [kShuffleNV21]; leaves
// results in ymm0 and ymm4, the same registers READNV12_AVX2 fills.
| 1983 // Read 8 VU from NV21, upsample to 16 UV. |
| 1984 #define READNV21_AVX2 \ |
| 1985 "vmovdqu " MEMACCESS([vu_buf]) ",%%xmm0 \n" \ |
| 1986 "lea " MEMLEA(0x10, [vu_buf]) ",%[vu_buf] \n" \ |
| 1987 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ |
| 1988 "vpshufb %[kShuffleNV21], %%ymm0, %%ymm0 \n" \ |
| 1989 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
| 1990 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ |
| 1991 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ |
| 1992 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" |
| 1993 |
// Inline-asm fragment (AT&T operand order) for use inside an AVX2 row loop.
// The same 32 bytes at [yuy2_buf] are loaded twice with unaligned loads:
//   ymm4 <- load, then vpshufb with the kShuffleYUY2Y control bytes;
//   ymm0 <- load, then vpshufb with the kShuffleYUY2UV control bytes.
// yuy2_buf is then advanced by 0x20 (32) bytes.
// Expects asm operands named [yuy2_buf], [kShuffleYUY2Y] and [kShuffleYUY2UV].
// NOTE(review): the two shuffle tables are not visible in this view;
// presumably they extract the Y bytes and the upsampled UV bytes - confirm.
1943 // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV. | 1994 // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV. |
1944 #define READYUY2_AVX2 \ | 1995 #define READYUY2_AVX2 \ |
1945 "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm4 \n" \ | 1996 "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm4 \n" \ |
1946 "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \ | 1997 "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \ |
1947 "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm0 \n" \ | 1998 "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm0 \n" \ |
1948 "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \ | 1999 "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \ |
1949 "lea " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf] \n" | 2000 "lea " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf] \n" |
1950 | 2001 |
1951 // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV. | 2002 // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV. |
1952 #define READUYVY_AVX2 \ | 2003 #define READUYVY_AVX2 \ |
(...skipping 291 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2244 "vzeroupper \n" | 2295 "vzeroupper \n" |
2245 : [y_buf]"+r"(y_buf), // %[y_buf] | 2296 : [y_buf]"+r"(y_buf), // %[y_buf] |
2246 [uv_buf]"+r"(uv_buf), // %[uv_buf] | 2297 [uv_buf]"+r"(uv_buf), // %[uv_buf] |
2247 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 2298 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
2248 [width]"+rm"(width) // %[width] | 2299 [width]"+rm"(width) // %[width] |
2249 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | 2300 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
2250 // Does not use r14. | 2301 // Does not use r14. |
2251 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 2302 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
2252 ); | 2303 ); |
2253 } | 2304 } |
2254 #endif // HAS_YUY2TOARGBROW_AVX2 | 2305 #endif // HAS_NV12TOARGBROW_AVX2 |
2255 | 2306 |
| 2307 #if defined(HAS_NV21TOARGBROW_AVX2) |
| 2308 // 16 pixels. |
| 2309 // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
// Converts one row of NV21 (a Y plane plus an interleaved VU plane) to ARGB
// using AVX2 inline assembly.
//
//   y_buf        - source Y bytes; advanced by 16 per loop iteration.
//   vu_buf       - source interleaved VU bytes; advanced by 16 per iteration.
//   dst_argb     - destination ARGB pixels, written by STOREARGB_AVX2.
//   yuvconstants - conversion constants, passed in a register to YUVTORGB_AVX2.
//   width        - pixel count; decremented by 16 per iteration.
//
// ymm5 is set to all ones before the loop (presumably the alpha source for
// STOREARGB_AVX2 - confirm in that macro). The loop body runs before the
// counter is tested and repeats while the remaining width is > 0, so every
// call handles at least 16 pixels and a width that is not a multiple of 16 is
// rounded up; callers must supply buffers sized accordingly.
// vzeroupper is issued after the loop to clear the upper halves of the ymm
// registers before returning to non-AVX code.
// YUVTORGB_AVX2 and STOREARGB_AVX2 are defined elsewhere in the file.
| 2310 void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf, |
| 2311 const uint8* vu_buf, |
| 2312 uint8* dst_argb, |
| 2313 struct YuvConstants* yuvconstants, |
| 2314 int width) { |
| 2315 asm volatile ( |
| 2316 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
| 2317 LABELALIGN |
| 2318 "1: \n" |
| 2319 READNV21_AVX2 |
| 2320 YUVTORGB_AVX2(yuvconstants) |
| 2321 STOREARGB_AVX2 |
| 2322 "sub $0x10,%[width] \n" |
| 2323 "jg 1b \n" |
| 2324 "vzeroupper \n" |
| 2325 : [y_buf]"+r"(y_buf), // %[y_buf] |
| 2326 [vu_buf]"+r"(vu_buf), // %[vu_buf] |
| 2327 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
| 2328 [width]"+rm"(width) // %[width] |
| 2329 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] |
| 2330 [kShuffleNV21]"m"(kShuffleNV21) |
| 2331 // Does not use r14. |
| 2332 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
| 2333 ); |
| 2334 } |
| 2335 #endif // HAS_NV21TOARGBROW_AVX2 |
2256 | 2336 |
2257 #if defined(HAS_YUY2TOARGBROW_AVX2) | 2337 #if defined(HAS_YUY2TOARGBROW_AVX2) |
2258 // 16 pixels. | 2338 // 16 pixels. |
2259 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). | 2339 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). |
2260 void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf, | 2340 void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf, |
2261 uint8* dst_argb, | 2341 uint8* dst_argb, |
2262 struct YuvConstants* yuvconstants, | 2342 struct YuvConstants* yuvconstants, |
2263 int width) { | 2343 int width) { |
2264 asm volatile ( | 2344 asm volatile ( |
2265 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 2345 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
(...skipping 3146 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
5412 ); | 5492 ); |
5413 } | 5493 } |
5414 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 5494 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
5415 | 5495 |
5416 #endif // defined(__x86_64__) || defined(__i386__) | 5496 #endif // defined(__x86_64__) || defined(__i386__) |
5417 | 5497 |
5418 #ifdef __cplusplus | 5498 #ifdef __cplusplus |
5419 } // extern "C" | 5499 } // extern "C" |
5420 } // namespace libyuv | 5500 } // namespace libyuv |
5421 #endif | 5501 #endif |
OLD | NEW |