Chromium Code Reviews

Side by Side Diff: source/row_gcc.cc

Issue 1372653003: avx2 I422AlphaToARGB (Closed) Base URL: https://chromium.googlesource.com/libyuv/libyuv@master
Patch Set: GCC AVX2 use storeABGR Created 5 years, 2 months ago
1 // VERSION 2 1 // VERSION 2
2 /* 2 /*
3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. 3 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
4 * 4 *
5 * Use of this source code is governed by a BSD-style license 5 * Use of this source code is governed by a BSD-style license
6 * that can be found in the LICENSE file in the root of the source 6 * that can be found in the LICENSE file in the root of the source
7 * tree. An additional intellectual property rights grant can be found 7 * tree. An additional intellectual property rights grant can be found
8 * in the file PATENTS. All contributing project authors may 8 * in the file PATENTS. All contributing project authors may
9 * be found in the AUTHORS file in the root of the source tree. 9 * be found in the AUTHORS file in the root of the source tree.
10 */ 10 */
(...skipping 1346 matching lines...)
1357 #define READYUV422 \ 1357 #define READYUV422 \
1358 "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ 1358 "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
1359 MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ 1359 MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
1360 "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \ 1360 "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
1361 "punpcklbw %%xmm1,%%xmm0 \n" \ 1361 "punpcklbw %%xmm1,%%xmm0 \n" \
1362 "punpcklwd %%xmm0,%%xmm0 \n" \ 1362 "punpcklwd %%xmm0,%%xmm0 \n" \
1363 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ 1363 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
1364 "punpcklbw %%xmm4,%%xmm4 \n" \ 1364 "punpcklbw %%xmm4,%%xmm4 \n" \
1365 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" 1365 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
1366 1366
1367 // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
1368 #define READYUVA422 \
1369 "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
1370 MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
1371 "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
1372 "punpcklbw %%xmm1,%%xmm0 \n" \
1373 "punpcklwd %%xmm0,%%xmm0 \n" \
1374 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
1375 "punpcklbw %%xmm4,%%xmm4 \n" \
1376 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \
1377 "movq " MEMACCESS([a_buf]) ",%%xmm5 \n" \
1378 "lea " MEMLEA(0x8, [a_buf]) ",%[a_buf] \n"
1379
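Reviewer note: a scalar model of what one READYUVA422 iteration leaves in the registers, for readers not fluent in the unpack idioms. Register roles match READYUV422 (xmm0 = upsampled UV pairs, xmm4 = duplicated Y); the movq/lea on a_buf is the only addition. The helper name and layout below are illustrative, not part of the patch.

#include <stdint.h>

// Models one 8-pixel READYUVA422 pass: 4 U + 4 V bytes are interleaved and
// each U,V pair is duplicated for the two pixels it covers; 8 Y bytes are
// each duplicated into both bytes of a 16-bit lane; 8 alpha bytes load as-is.
static void ReadYuvA422_Model(const uint8_t* u, const uint8_t* v,
                              const uint8_t* y, const uint8_t* a,
                              uint8_t uv[16], uint8_t yy[16], uint8_t alpha[8]) {
  for (int i = 0; i < 4; ++i) {     // punpcklbw + punpcklwd on xmm0/xmm1
    uv[4 * i + 0] = u[i];
    uv[4 * i + 1] = v[i];
    uv[4 * i + 2] = u[i];
    uv[4 * i + 3] = v[i];
  }
  for (int i = 0; i < 8; ++i) {     // punpcklbw xmm4,xmm4 and movq into xmm5
    yy[2 * i + 0] = y[i];
    yy[2 * i + 1] = y[i];
    alpha[i] = a[i];
  }
}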
1367 // Read 2 UV from 411, upsample to 8 UV 1380 // Read 2 UV from 411, upsample to 8 UV
1368 #define READYUV411 \ 1381 #define READYUV411 \
1369 "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ 1382 "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
1370 MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ 1383 MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
1371 "lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \ 1384 "lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \
1372 "punpcklbw %%xmm1,%%xmm0 \n" \ 1385 "punpcklbw %%xmm1,%%xmm0 \n" \
1373 "punpcklwd %%xmm0,%%xmm0 \n" \ 1386 "punpcklwd %%xmm0,%%xmm0 \n" \
1374 "punpckldq %%xmm0,%%xmm0 \n" \ 1387 "punpckldq %%xmm0,%%xmm0 \n" \
1375 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ 1388 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
1376 "punpcklbw %%xmm4,%%xmm4 \n" \ 1389 "punpcklbw %%xmm4,%%xmm4 \n" \
(...skipping 42 matching lines...)
1419 "paddsw %%xmm4,%%xmm0 \n" \ 1432 "paddsw %%xmm4,%%xmm0 \n" \
1420 "paddsw %%xmm4,%%xmm1 \n" \ 1433 "paddsw %%xmm4,%%xmm1 \n" \
1421 "paddsw %%xmm4,%%xmm2 \n" \ 1434 "paddsw %%xmm4,%%xmm2 \n" \
1422 "psraw $0x6,%%xmm0 \n" \ 1435 "psraw $0x6,%%xmm0 \n" \
1423 "psraw $0x6,%%xmm1 \n" \ 1436 "psraw $0x6,%%xmm1 \n" \
1424 "psraw $0x6,%%xmm2 \n" \ 1437 "psraw $0x6,%%xmm2 \n" \
1425 "packuswb %%xmm0,%%xmm0 \n" \ 1438 "packuswb %%xmm0,%%xmm0 \n" \
1426 "packuswb %%xmm1,%%xmm1 \n" \ 1439 "packuswb %%xmm1,%%xmm1 \n" \
1427 "packuswb %%xmm2,%%xmm2 \n" 1440 "packuswb %%xmm2,%%xmm2 \n"
1428 1441
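Reviewer note: the tail of YUVTORGB visible above is the fixed-point finish shared by all the converters here. A one-line sketch of the per-channel math, assuming 6-bit fixed point as implied by the psraw $0x6 (the actual coefficients come from YuvConstants and the elided multiply/bias steps):

// Illustrative only: each of B/G/R ends up as clamp((uv_term + y_term) >> 6),
// with paddsw doing the add, psraw $0x6 the shift, and packuswb the 0..255
// saturation.
static inline unsigned char Clamp6(int uv_term, int y_term) {
  int v = (uv_term + y_term) >> 6;
  return (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);
}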
1429 // Store 8 ARGB values. Assumes XMM5 is set. 1442 // Store 8 ARGB values.
1430 #define STOREARGB \ 1443 #define STOREARGB \
1431 "punpcklbw %%xmm1,%%xmm0 \n" \ 1444 "punpcklbw %%xmm1,%%xmm0 \n" \
1432 "punpcklbw %%xmm5,%%xmm2 \n" \ 1445 "punpcklbw %%xmm5,%%xmm2 \n" \
1433 "movdqa %%xmm0,%%xmm1 \n" \ 1446 "movdqa %%xmm0,%%xmm1 \n" \
1434 "punpcklwd %%xmm2,%%xmm0 \n" \ 1447 "punpcklwd %%xmm2,%%xmm0 \n" \
1435 "punpckhwd %%xmm2,%%xmm1 \n" \ 1448 "punpckhwd %%xmm2,%%xmm1 \n" \
1436 "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \ 1449 "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \
1437 "movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \ 1450 "movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \
1438 "lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n" 1451 "lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n"
1439 1452
1440 // Store 8 BGRA values. 1453 // Store 8 BGRA values.
1441 #define STOREBGRA \ 1454 #define STOREBGRA \
1442 "pcmpeqb %%xmm5,%%xmm5 \n" \ 1455 "pcmpeqb %%xmm5,%%xmm5 \n" \
1443 "punpcklbw %%xmm0,%%xmm1 \n" \ 1456 "punpcklbw %%xmm0,%%xmm1 \n" \
1444 "punpcklbw %%xmm2,%%xmm5 \n" \ 1457 "punpcklbw %%xmm2,%%xmm5 \n" \
1445 "movdqa %%xmm5,%%xmm0 \n" \ 1458 "movdqa %%xmm5,%%xmm0 \n" \
1446 "punpcklwd %%xmm1,%%xmm5 \n" \ 1459 "punpcklwd %%xmm1,%%xmm5 \n" \
1447 "punpckhwd %%xmm1,%%xmm0 \n" \ 1460 "punpckhwd %%xmm1,%%xmm0 \n" \
1448 "movdqu %%xmm5," MEMACCESS([dst_bgra]) " \n" \ 1461 "movdqu %%xmm5," MEMACCESS([dst_bgra]) " \n" \
1449 "movdqu %%xmm0," MEMACCESS2(0x10, [dst_bgra]) " \n" \ 1462 "movdqu %%xmm0," MEMACCESS2(0x10, [dst_bgra]) " \n" \
1450 "lea " MEMLEA(0x20, [dst_bgra]) ", %[dst_bgra] \n" 1463 "lea " MEMLEA(0x20, [dst_bgra]) ", %[dst_bgra] \n"
1451 1464
1452 // Store 8 ABGR values. Assumes XMM5 is set. 1465 // Store 8 ABGR values.
1453 #define STOREABGR \ 1466 #define STOREABGR \
1454 "punpcklbw %%xmm1,%%xmm2 \n" \ 1467 "punpcklbw %%xmm1,%%xmm2 \n" \
1455 "punpcklbw %%xmm5,%%xmm0 \n" \ 1468 "punpcklbw %%xmm5,%%xmm0 \n" \
1456 "movdqa %%xmm2,%%xmm1 \n" \ 1469 "movdqa %%xmm2,%%xmm1 \n" \
1457 "punpcklwd %%xmm0,%%xmm2 \n" \ 1470 "punpcklwd %%xmm0,%%xmm2 \n" \
1458 "punpckhwd %%xmm0,%%xmm1 \n" \ 1471 "punpckhwd %%xmm0,%%xmm1 \n" \
1459 "movdqu %%xmm2," MEMACCESS([dst_abgr]) " \n" \ 1472 "movdqu %%xmm2," MEMACCESS([dst_abgr]) " \n" \
1460 "movdqu %%xmm1," MEMACCESS2(0x10, [dst_abgr]) " \n" \ 1473 "movdqu %%xmm1," MEMACCESS2(0x10, [dst_abgr]) " \n" \
1461 "lea " MEMLEA(0x20, [dst_abgr]) ", %[dst_abgr] \n" 1474 "lea " MEMLEA(0x20, [dst_abgr]) ", %[dst_abgr] \n"
1462 1475
1463 // Store 8 RGBA values. Assumes XMM5 is set. 1476 // Store 8 RGBA values.
1464 #define STORERGBA \ 1477 #define STORERGBA \
1465 "pcmpeqb %%xmm5,%%xmm5 \n" \ 1478 "pcmpeqb %%xmm5,%%xmm5 \n" \
1466 "punpcklbw %%xmm2,%%xmm1 \n" \ 1479 "punpcklbw %%xmm2,%%xmm1 \n" \
1467 "punpcklbw %%xmm0,%%xmm5 \n" \ 1480 "punpcklbw %%xmm0,%%xmm5 \n" \
1468 "movdqa %%xmm5,%%xmm0 \n" \ 1481 "movdqa %%xmm5,%%xmm0 \n" \
1469 "punpcklwd %%xmm1,%%xmm5 \n" \ 1482 "punpcklwd %%xmm1,%%xmm5 \n" \
1470 "punpckhwd %%xmm1,%%xmm0 \n" \ 1483 "punpckhwd %%xmm1,%%xmm0 \n" \
1471 "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \ 1484 "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \
1472 "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \ 1485 "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \
1473 "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n" 1486 "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n"
(...skipping 162 matching lines...)
1636 [u_buf]"+r"(u_buf), // %[u_buf] 1649 [u_buf]"+r"(u_buf), // %[u_buf]
1637 [v_buf]"+r"(v_buf), // %[v_buf] 1650 [v_buf]"+r"(v_buf), // %[v_buf]
1638 [dst_argb]"+r"(dst_argb), // %[dst_argb] 1651 [dst_argb]"+r"(dst_argb), // %[dst_argb]
1639 [width]"+rm"(width) // %[width] 1652 [width]"+rm"(width) // %[width]
1640 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] 1653 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
1641 : "memory", "cc", NACL_R14 1654 : "memory", "cc", NACL_R14
1642 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1655 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1643 ); 1656 );
1644 } 1657 }
1645 1658
1659 void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
1660 const uint8* u_buf,
1661 const uint8* v_buf,
1662 const uint8* a_buf,
1663 uint8* dst_argb,
1664 struct YuvConstants* yuvconstants,
1665 int width) {
1666 asm volatile (
1667 "sub %[u_buf],%[v_buf] \n"
1668 LABELALIGN
1669 "1: \n"
1670 READYUVA422
1671 YUVTORGB(yuvconstants)
1672 STOREARGB
1673 "sub $0x8,%[width] \n"
1674 "jg 1b \n"
1675 : [y_buf]"+r"(y_buf), // %[y_buf]
1676 [u_buf]"+r"(u_buf), // %[u_buf]
1677 [v_buf]"+r"(v_buf), // %[v_buf]
1678 [a_buf]"+r"(a_buf), // %[a_buf]
1679 [dst_argb]"+r"(dst_argb), // %[dst_argb]
1680 [width]"+rm"(width) // %[width]
1681 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
1682 : "memory", "cc", NACL_R14
1683 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1684 );
1685 }
1686
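Reviewer note: for reviewing the new SSSE3 path, a plain-C reference of what one I422AlphaToARGB row computes. The BT.601 studio-swing coefficients below are an assumption used only for illustration; the real values come in through the yuvconstants argument, and the SIMD routine processes 8 pixels per iteration.

#include <stdint.h>

static uint8_t ClampByte(int v) { return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v; }

// Y and A are per pixel, U and V are per 2 pixels (422); output is ARGB in
// libyuv's memory byte order B,G,R,A.  Illustrative coefficients only.
static void I422AlphaToARGBRow_Sketch(const uint8_t* y, const uint8_t* u,
                                      const uint8_t* v, const uint8_t* a,
                                      uint8_t* dst, int width) {
  for (int x = 0; x < width; ++x) {
    int c = (y[x] - 16) * 298;
    int d = u[x / 2] - 128;
    int e = v[x / 2] - 128;
    dst[4 * x + 0] = ClampByte((c + 516 * d + 128) >> 8);            // B
    dst[4 * x + 1] = ClampByte((c - 100 * d - 208 * e + 128) >> 8);  // G
    dst[4 * x + 2] = ClampByte((c + 409 * e + 128) >> 8);            // R
    dst[4 * x + 3] = a[x];                                           // A
  }
}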
1687 void OMITFP I422AlphaToABGRRow_SSSE3(const uint8* y_buf,
1688 const uint8* u_buf,
1689 const uint8* v_buf,
1690 const uint8* a_buf,
1691 uint8* dst_abgr,
1692 struct YuvConstants* yuvconstants,
1693 int width) {
1694 asm volatile (
1695 "sub %[u_buf],%[v_buf] \n"
1696 LABELALIGN
1697 "1: \n"
1698 READYUVA422
1699 YUVTORGB(yuvconstants)
1700 STOREABGR
1701 "sub $0x8,%[width] \n"
1702 "jg 1b \n"
1703 : [y_buf]"+r"(y_buf), // %[y_buf]
1704 [u_buf]"+r"(u_buf), // %[u_buf]
1705 [v_buf]"+r"(v_buf), // %[v_buf]
1706 [a_buf]"+r"(a_buf), // %[a_buf]
1707 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr]
1708 [width]"+rm"(width) // %[width]
1709 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
1710 : "memory", "cc", NACL_R14
1711 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1712 );
1713 }
1714
1646 void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, 1715 void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
1647 const uint8* u_buf, 1716 const uint8* u_buf,
1648 const uint8* v_buf, 1717 const uint8* v_buf,
1649 uint8* dst_argb, 1718 uint8* dst_argb,
1650 struct YuvConstants* yuvconstants, 1719 struct YuvConstants* yuvconstants,
1651 int width) { 1720 int width) {
1652 asm volatile ( 1721 asm volatile (
1653 "sub %[u_buf],%[v_buf] \n" 1722 "sub %[u_buf],%[v_buf] \n"
1654 "pcmpeqb %%xmm5,%%xmm5 \n" 1723 "pcmpeqb %%xmm5,%%xmm5 \n"
1655 LABELALIGN 1724 LABELALIGN
(...skipping 175 matching lines...)
1831 MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \ 1900 MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \
1832 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ 1901 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
1833 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ 1902 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
1834 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ 1903 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
1835 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ 1904 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
1836 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ 1905 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
1837 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ 1906 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
1838 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ 1907 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
1839 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" 1908 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
1840 1909
1910 // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
1911 #define READYUVA422_AVX2 \
1912 "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
1913 MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \
1914 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
1915 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
1916 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
1917 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
1918 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
1919 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
1920 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
1921 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \
1922 "vmovdqu " MEMACCESS([a_buf]) ",%%xmm5 \n" \
1923 "vpermq $0xd8,%%ymm5,%%ymm5 \n" \
1924 "lea " MEMLEA(0x10, [a_buf]) ",%[a_buf] \n"
1925
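Reviewer note: the part of READYUVA422_AVX2 that is easy to misread is the vpermq $0xd8 after each 128-bit load; it re-orders the four 64-bit lanes so the later in-lane vpunpck* instructions pair bytes 0-7 with pixels 0-7 and bytes 8-15 with pixels 8-15. A small model of that immediate, for illustration only:

#include <stdint.h>

// vpermq $0xd8 selects 64-bit lanes [0, 2, 1, 3]: the two halves of a 128-bit
// load end up in the low half of each 128-bit lane of the ymm register.
static void Vpermq_0xd8_Model(const uint64_t src[4], uint64_t dst[4]) {
  dst[0] = src[0];
  dst[1] = src[2];
  dst[2] = src[1];
  dst[3] = src[3];
}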
1841 // Read 8 UV from NV12, upsample to 16 UV. 1926 // Read 8 UV from NV12, upsample to 16 UV.
1842 #define READNV12_AVX2 \ 1927 #define READNV12_AVX2 \
1843 "vmovdqu " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ 1928 "vmovdqu " MEMACCESS([uv_buf]) ",%%xmm0 \n" \
1844 "lea " MEMLEA(0x10, [uv_buf]) ",%[uv_buf] \n" \ 1929 "lea " MEMLEA(0x10, [uv_buf]) ",%[uv_buf] \n" \
1845 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ 1930 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
1846 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ 1931 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
1847 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ 1932 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
1848 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ 1933 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
1849 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ 1934 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
1850 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" 1935 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
(...skipping 29 matching lines...)
1880 "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ 1965 "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
1881 "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ 1966 "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
1882 "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \ 1967 "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \
1883 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ 1968 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
1884 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ 1969 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
1885 "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ 1970 "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
1886 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ 1971 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
1887 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ 1972 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
1888 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" 1973 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
1889 1974
1890 // Store 16 ARGB values. Assumes XMM5 is set. 1975 // Store 16 ARGB values.
1891 #define STOREARGB_AVX2 \ 1976 #define STOREARGB_AVX2 \
1892 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ 1977 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
1893 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ 1978 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
1894 "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \ 1979 "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \
1895 "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ 1980 "vpermq $0xd8,%%ymm2,%%ymm2 \n" \
1896 "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \ 1981 "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \
1897 "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \ 1982 "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \
1898 "vmovdqu %%ymm1," MEMACCESS([dst_argb]) " \n" \ 1983 "vmovdqu %%ymm1," MEMACCESS([dst_argb]) " \n" \
1899 "vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) " \n" \ 1984 "vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) " \n" \
1900 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" 1985 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
1901 1986
1987 // Store 16 ABGR values.
1988 #define STOREABGR_AVX2 \
1989 "vpunpcklbw %%ymm1,%%ymm2,%%ymm1 \n" \
1990 "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
1991 "vpunpcklbw %%ymm5,%%ymm0,%%ymm2 \n" \
1992 "vpermq $0xd8,%%ymm2,%%ymm2 \n" \
1993 "vpunpcklwd %%ymm2,%%ymm1,%%ymm0 \n" \
1994 "vpunpckhwd %%ymm2,%%ymm1,%%ymm1 \n" \
1995 "vmovdqu %%ymm0," MEMACCESS([dst_abgr]) " \n" \
1996 "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_abgr]) " \n" \
1997 "lea " MEMLEA(0x40,[dst_abgr]) ",%[dst_abgr] \n"
1998
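Reviewer note: STOREABGR_AVX2 is the shared store the patch-set title refers to; it produces the same per-pixel byte order as the SSE2 STOREABGR but for 16 pixels (64 bytes) per iteration, and it replaces the weave that used to be written inline in I422ToABGRRow_AVX2 further down. Its net effect, modeled scalar-wise (the vpermq steps only exist so the in-lane unpacks come out in this contiguous order), assuming the same register roles as above (ymm0 = B, ymm1 = G, ymm2 = R, ymm5 = A):

#include <stdint.h>

// 16 pixels in, 64 bytes out, memory order R,G,B,A per pixel.
static void StoreAbgr16_Model(const uint8_t b[16], const uint8_t g[16],
                              const uint8_t r[16], const uint8_t a[16],
                              uint8_t dst[64]) {
  for (int i = 0; i < 16; ++i) {
    dst[4 * i + 0] = r[i];
    dst[4 * i + 1] = g[i];
    dst[4 * i + 2] = b[i];
    dst[4 * i + 3] = a[i];
  }
}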
1902 #if defined(HAS_I422TOBGRAROW_AVX2) 1999 #if defined(HAS_I422TOBGRAROW_AVX2)
1903 // 16 pixels 2000 // 16 pixels
1904 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). 2001 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes).
1905 void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf, 2002 void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf,
1906 const uint8* u_buf, 2003 const uint8* u_buf,
1907 const uint8* v_buf, 2004 const uint8* v_buf,
1908 uint8* dst_bgra, 2005 uint8* dst_bgra,
1909 struct YuvConstants* yuvconstants, 2006 struct YuvConstants* yuvconstants,
1910 int width) { 2007 int width) {
1911 asm volatile ( 2008 asm volatile (
(...skipping 55 matching lines...)
1967 [v_buf]"+r"(v_buf), // %[v_buf] 2064 [v_buf]"+r"(v_buf), // %[v_buf]
1968 [dst_argb]"+r"(dst_argb), // %[dst_argb] 2065 [dst_argb]"+r"(dst_argb), // %[dst_argb]
1969 [width]"+rm"(width) // %[width] 2066 [width]"+rm"(width) // %[width]
1970 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] 2067 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
1971 : "memory", "cc", NACL_R14 2068 : "memory", "cc", NACL_R14
1972 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 2069 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1973 ); 2070 );
1974 } 2071 }
1975 #endif // HAS_I422TOARGBROW_AVX2 2072 #endif // HAS_I422TOARGBROW_AVX2
1976 2073
2074 #if defined(HAS_I422ALPHATOARGBROW_AVX2)
2075 // 16 pixels
2076 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
2077 void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf,
2078 const uint8* u_buf,
2079 const uint8* v_buf,
2080 const uint8* a_buf,
2081 uint8* dst_argb,
2082 struct YuvConstants* yuvconstants,
2083 int width) {
2084 asm volatile (
2085 "sub %[u_buf],%[v_buf] \n"
2086 LABELALIGN
2087 "1: \n"
2088 READYUVA422_AVX2
2089 YUVTORGB_AVX2(yuvconstants)
2090 STOREARGB_AVX2
2091 "sub $0x10,%[width] \n"
2092 "jg 1b \n"
2093 "vzeroupper \n"
2094 : [y_buf]"+r"(y_buf), // %[y_buf]
2095 [u_buf]"+r"(u_buf), // %[u_buf]
2096 [v_buf]"+r"(v_buf), // %[v_buf]
2097 [a_buf]"+r"(a_buf), // %[a_buf]
2098 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2099 [width]"+rm"(width) // %[width]
2100 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2101 : "memory", "cc", NACL_R14
2102 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2103 );
2104 }
2105 #endif // HAS_I422ALPHATOARGBROW_AVX2
2106
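Reviewer note: the SSSE3 variants consume 8 pixels per iteration and the AVX2 variants 16, so callers have to deal with widths that are not a multiple of the step. A hypothetical dispatch sketch, only to illustrate that granularity; the wrapper and the scalar fallback named below are assumptions and not part of this diff (remainder handling normally lives in the row_any.cc wrappers):

#include <stdint.h>

struct YuvConstants;  // defined in row.h; opaque here

typedef void (*I422AlphaRowFn)(const uint8_t* y_buf, const uint8_t* u_buf,
                               const uint8_t* v_buf, const uint8_t* a_buf,
                               uint8_t* dst, struct YuvConstants* yuvconstants,
                               int width);

// Prefer the AVX2 row when the width divides into whole 16-pixel iterations,
// then SSSE3 for whole 8-pixel iterations, else a scalar row.
static I422AlphaRowFn PickI422AlphaRow(int width, int has_avx2, int has_ssse3,
                                       I422AlphaRowFn avx2, I422AlphaRowFn ssse3,
                                       I422AlphaRowFn scalar) {
  if (has_avx2 && (width % 16) == 0) return avx2;
  if (has_ssse3 && (width % 8) == 0) return ssse3;
  return scalar;
}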
2107 #if defined(HAS_I422ALPHATOABGRROW_AVX2)
2108 // 16 pixels
2109 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ABGR.
2110 void OMITFP I422AlphaToABGRRow_AVX2(const uint8* y_buf,
2111 const uint8* u_buf,
2112 const uint8* v_buf,
2113 const uint8* a_buf,
2114 uint8* dst_abgr,
2115 struct YuvConstants* yuvconstants,
2116 int width) {
2117 asm volatile (
2118 "sub %[u_buf],%[v_buf] \n"
2119 LABELALIGN
2120 "1: \n"
2121 READYUVA422_AVX2
2122 YUVTORGB_AVX2(yuvconstants)
2123 STOREABGR_AVX2
2124 "sub $0x10,%[width] \n"
2125 "jg 1b \n"
2126 "vzeroupper \n"
2127 : [y_buf]"+r"(y_buf), // %[y_buf]
2128 [u_buf]"+r"(u_buf), // %[u_buf]
2129 [v_buf]"+r"(v_buf), // %[v_buf]
2130 [a_buf]"+r"(a_buf), // %[a_buf]
2131 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr]
2132 [width]"+rm"(width) // %[width]
2133 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2134 : "memory", "cc", NACL_R14
2135 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2136 );
2137 }
2138 #endif // HAS_I422ALPHATOABGRROW_AVX2
2139
1977 #if defined(HAS_I422TOABGRROW_AVX2) 2140 #if defined(HAS_I422TOABGRROW_AVX2)
1978 // 16 pixels 2141 // 16 pixels
1979 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). 2142 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).
1980 void OMITFP I422ToABGRRow_AVX2(const uint8* y_buf, 2143 void OMITFP I422ToABGRRow_AVX2(const uint8* y_buf,
1981 const uint8* u_buf, 2144 const uint8* u_buf,
1982 const uint8* v_buf, 2145 const uint8* v_buf,
1983 uint8* dst_argb, 2146 uint8* dst_abgr,
1984 struct YuvConstants* yuvconstants, 2147 struct YuvConstants* yuvconstants,
1985 int width) { 2148 int width) {
1986 asm volatile ( 2149 asm volatile (
1987 "sub %[u_buf],%[v_buf] \n" 2150 "sub %[u_buf],%[v_buf] \n"
1988 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 2151 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
1989 LABELALIGN 2152 LABELALIGN
1990 "1: \n" 2153 "1: \n"
1991 READYUV422_AVX2 2154 READYUV422_AVX2
1992 YUVTORGB_AVX2(yuvconstants) 2155 YUVTORGB_AVX2(yuvconstants)
1993 2156 STOREABGR_AVX2
1994 // Step 3: Weave into ABGR
1995 "vpunpcklbw %%ymm1,%%ymm2,%%ymm1 \n" // RG
1996 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
1997 "vpunpcklbw %%ymm5,%%ymm0,%%ymm2 \n" // BA
1998 "vpermq $0xd8,%%ymm2,%%ymm2 \n"
1999 "vpunpcklwd %%ymm2,%%ymm1,%%ymm0 \n" // RGBA first 8 pixels
2000 "vpunpckhwd %%ymm2,%%ymm1,%%ymm1 \n" // RGBA next 8 pixels
2001 "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n"
2002 "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
2003 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
2004 "sub $0x10,%[width] \n" 2157 "sub $0x10,%[width] \n"
2005 "jg 1b \n" 2158 "jg 1b \n"
2006 "vzeroupper \n" 2159 "vzeroupper \n"
2007 : [y_buf]"+r"(y_buf), // %[y_buf] 2160 : [y_buf]"+r"(y_buf), // %[y_buf]
2008 [u_buf]"+r"(u_buf), // %[u_buf] 2161 [u_buf]"+r"(u_buf), // %[u_buf]
2009 [v_buf]"+r"(v_buf), // %[v_buf] 2162 [v_buf]"+r"(v_buf), // %[v_buf]
2010 [dst_argb]"+r"(dst_argb), // %[dst_argb] 2163 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr]
2011 [width]"+rm"(width) // %[width] 2164 [width]"+rm"(width) // %[width]
2012 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] 2165 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2013 : "memory", "cc", NACL_R14 2166 : "memory", "cc", NACL_R14
2014 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 2167 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2015 ); 2168 );
2016 } 2169 }
2017 #endif // HAS_I422TOABGRROW_AVX2 2170 #endif // HAS_I422TOABGRROW_AVX2
2018 2171
2019 #if defined(HAS_I422TORGBAROW_AVX2) 2172 #if defined(HAS_I422TORGBAROW_AVX2)
2020 // 16 pixels 2173 // 16 pixels
(...skipping 3395 matching lines...)
5416 ); 5569 );
5417 } 5570 }
5418 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 5571 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
5419 5572
5420 #endif // defined(__x86_64__) || defined(__i386__) 5573 #endif // defined(__x86_64__) || defined(__i386__)
5421 5574
5422 #ifdef __cplusplus 5575 #ifdef __cplusplus
5423 } // extern "C" 5576 } // extern "C"
5424 } // namespace libyuv 5577 } // namespace libyuv
5425 #endif 5578 #endif