OLD | NEW |
1 // VERSION 2 | 1 // VERSION 2 |
2 /* | 2 /* |
3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
4 * | 4 * |
5 * Use of this source code is governed by a BSD-style license | 5 * Use of this source code is governed by a BSD-style license |
6 * that can be found in the LICENSE file in the root of the source | 6 * that can be found in the LICENSE file in the root of the source |
7 * tree. An additional intellectual property rights grant can be found | 7 * tree. An additional intellectual property rights grant can be found |
8 * in the file PATENTS. All contributing project authors may | 8 * in the file PATENTS. All contributing project authors may |
9 * be found in the AUTHORS file in the root of the source tree. | 9 * be found in the AUTHORS file in the root of the source tree. |
10 */ | 10 */ |
(...skipping 1346 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
// READYUV422: inline-asm fragment that fetches the inputs for one group of
// 8 pixels (unchanged between the OLD and NEW columns).
//  - movd reads 4 bytes at [u_buf] into xmm0; the MEMOPREG movd reads 4 bytes
//    into xmm1 addressed through both [u_buf] and [v_buf]. Presumably this is
//    base + index with v_buf holding the (v - u) offset that the row
//    functions create with their leading sub -- confirm against MEMOPREG.
//  - u_buf then advances by 4.
//  - punpcklbw interleaves the xmm0 and xmm1 bytes (U0 V0 U1 V1 ...), and
//    punpcklwd on itself repeats every 16-bit UV pair, giving 8 pairs in xmm0.
//  - movq reads 8 Y bytes into xmm4, punpcklbw on itself doubles each byte
//    into a 16-bit word, and y_buf advances by 8.
1357 #define READYUV422 \ | 1357 #define READYUV422 \ |
1358 "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ | 1358 "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ |
1359 MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ | 1359 MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ |
1360 "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \ | 1360 "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \ |
1361 "punpcklbw %%xmm1,%%xmm0 \n" \ | 1361 "punpcklbw %%xmm1,%%xmm0 \n" \ |
1362 "punpcklwd %%xmm0,%%xmm0 \n" \ | 1362 "punpcklwd %%xmm0,%%xmm0 \n" \ |
1363 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ | 1363 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
1364 "punpcklbw %%xmm4,%%xmm4 \n" \ | 1364 "punpcklbw %%xmm4,%%xmm4 \n" \ |
1365 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" | 1365 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" |
1366 | 1366 |
// READYUVA422 (added in the NEW column): the same instruction sequence as
// READYUV422 followed by an alpha fetch.
//  - The first eight asm lines load 4 U + 4 V bytes, expand them to 8 UV
//    pairs in xmm0, load 8 Y bytes doubled into words in xmm4, and advance
//    u_buf by 4 and y_buf by 8.
//  - The final movq reads 8 bytes at [a_buf] into xmm5 and a_buf advances
//    by 8. xmm5 is overwritten on every expansion, so any value placed in it
//    before the loop does not survive.
//  - Requires an [a_buf] asm operand in addition to [y_buf]/[u_buf]/[v_buf].
| 1367 // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. |
| 1368 #define READYUVA422 \ |
| 1369 "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ |
| 1370 MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ |
| 1371 "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \ |
| 1372 "punpcklbw %%xmm1,%%xmm0 \n" \ |
| 1373 "punpcklwd %%xmm0,%%xmm0 \n" \ |
| 1374 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
| 1375 "punpcklbw %%xmm4,%%xmm4 \n" \ |
| 1376 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \ |
| 1377 "movq " MEMACCESS([a_buf]) ",%%xmm5 \n" \ |
| 1378 "lea " MEMLEA(0x8, [a_buf]) ",%[a_buf] \n" |
| 1379 |
1367 // Read 2 UV from 411, upsample to 8 UV | 1380 // Read 2 UV from 411, upsample to 8 UV |
1368 #define READYUV411 \ | 1381 #define READYUV411 \ |
1369 "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ | 1382 "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ |
1370 MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ | 1383 MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ |
1371 "lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \ | 1384 "lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \ |
1372 "punpcklbw %%xmm1,%%xmm0 \n" \ | 1385 "punpcklbw %%xmm1,%%xmm0 \n" \ |
1373 "punpcklwd %%xmm0,%%xmm0 \n" \ | 1386 "punpcklwd %%xmm0,%%xmm0 \n" \ |
1374 "punpckldq %%xmm0,%%xmm0 \n" \ | 1387 "punpckldq %%xmm0,%%xmm0 \n" \ |
1375 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ | 1388 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
1376 "punpcklbw %%xmm4,%%xmm4 \n" \ | 1389 "punpcklbw %%xmm4,%%xmm4 \n" \ |
(...skipping 42 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1419 "paddsw %%xmm4,%%xmm0 \n" \ | 1432 "paddsw %%xmm4,%%xmm0 \n" \ |
1420 "paddsw %%xmm4,%%xmm1 \n" \ | 1433 "paddsw %%xmm4,%%xmm1 \n" \ |
1421 "paddsw %%xmm4,%%xmm2 \n" \ | 1434 "paddsw %%xmm4,%%xmm2 \n" \ |
1422 "psraw $0x6,%%xmm0 \n" \ | 1435 "psraw $0x6,%%xmm0 \n" \ |
1423 "psraw $0x6,%%xmm1 \n" \ | 1436 "psraw $0x6,%%xmm1 \n" \ |
1424 "psraw $0x6,%%xmm2 \n" \ | 1437 "psraw $0x6,%%xmm2 \n" \ |
1425 "packuswb %%xmm0,%%xmm0 \n" \ | 1438 "packuswb %%xmm0,%%xmm0 \n" \ |
1426 "packuswb %%xmm1,%%xmm1 \n" \ | 1439 "packuswb %%xmm1,%%xmm1 \n" \ |
1427 "packuswb %%xmm2,%%xmm2 \n" | 1440 "packuswb %%xmm2,%%xmm2 \n" |
1428 | 1441 |
// STOREARGB: weaves three colour registers and xmm5 into 8 four-byte pixels
// and writes them to [dst_argb].
//  - xmm0/xmm1 and xmm2/xmm5 are byte-interleaved; the two results are then
//    word-interleaved into a low half (xmm0) and a high half (xmm1).
//  - Resulting byte order per pixel in memory: xmm0, xmm1, xmm2, xmm5
//    (presumably B, G, R, A -- confirm against the conversion macro).
//  - xmm5 is only read here. The NEW column drops the "Assumes XMM5 is set"
//    remark, but the requirement remains: the caller must have loaded xmm5,
//    either with all-ones before the loop or with alpha bytes via READYUVA422.
//  - Two unaligned 16-byte stores, then dst_argb advances by 0x20.
//  - Clobbers xmm0, xmm1 and xmm2.
1429 // Store 8 ARGB values. Assumes XMM5 is set. | 1442 // Store 8 ARGB values. |
1430 #define STOREARGB \ | 1443 #define STOREARGB \ |
1431 "punpcklbw %%xmm1,%%xmm0 \n" \ | 1444 "punpcklbw %%xmm1,%%xmm0 \n" \ |
1432 "punpcklbw %%xmm5,%%xmm2 \n" \ | 1445 "punpcklbw %%xmm5,%%xmm2 \n" \ |
1433 "movdqa %%xmm0,%%xmm1 \n" \ | 1446 "movdqa %%xmm0,%%xmm1 \n" \ |
1434 "punpcklwd %%xmm2,%%xmm0 \n" \ | 1447 "punpcklwd %%xmm2,%%xmm0 \n" \ |
1435 "punpckhwd %%xmm2,%%xmm1 \n" \ | 1448 "punpckhwd %%xmm2,%%xmm1 \n" \ |
1436 "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \ | 1449 "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \ |
1437 "movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \ | 1450 "movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \ |
1438 "lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n" | 1451 "lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n" |
1439 | 1452 |
// STOREBGRA: weaves three colour registers and a constant 0xFF byte into 8
// four-byte pixels and writes them to [dst_bgra].
//  - pcmpeqb on xmm5 with itself sets every byte to 0xFF; this macro creates
//    its own alpha and does not depend on the caller having set xmm5.
//  - Resulting byte order per pixel in memory: 0xFF, xmm2, xmm1, xmm0
//    (presumably A, R, G, B -- confirm against the conversion macro).
//  - xmm5 is used as the weave destination, so it is clobbered along with
//    xmm0 and xmm1; that is why it is regenerated on every expansion.
//  - Two unaligned 16-byte stores, then dst_bgra advances by 0x20.
1440 // Store 8 BGRA values. | 1453 // Store 8 BGRA values. |
1441 #define STOREBGRA \ | 1454 #define STOREBGRA \ |
1442 "pcmpeqb %%xmm5,%%xmm5 \n" \ | 1455 "pcmpeqb %%xmm5,%%xmm5 \n" \ |
1443 "punpcklbw %%xmm0,%%xmm1 \n" \ | 1456 "punpcklbw %%xmm0,%%xmm1 \n" \ |
1444 "punpcklbw %%xmm2,%%xmm5 \n" \ | 1457 "punpcklbw %%xmm2,%%xmm5 \n" \ |
1445 "movdqa %%xmm5,%%xmm0 \n" \ | 1458 "movdqa %%xmm5,%%xmm0 \n" \ |
1446 "punpcklwd %%xmm1,%%xmm5 \n" \ | 1459 "punpcklwd %%xmm1,%%xmm5 \n" \ |
1447 "punpckhwd %%xmm1,%%xmm0 \n" \ | 1460 "punpckhwd %%xmm1,%%xmm0 \n" \ |
1448 "movdqu %%xmm5," MEMACCESS([dst_bgra]) " \n" \ | 1461 "movdqu %%xmm5," MEMACCESS([dst_bgra]) " \n" \ |
1449 "movdqu %%xmm0," MEMACCESS2(0x10, [dst_bgra]) " \n" \ | 1462 "movdqu %%xmm0," MEMACCESS2(0x10, [dst_bgra]) " \n" \ |
1450 "lea " MEMLEA(0x20, [dst_bgra]) ", %[dst_bgra] \n" | 1463 "lea " MEMLEA(0x20, [dst_bgra]) ", %[dst_bgra] \n" |
1451 | 1464 |
// STOREABGR: weaves three colour registers and xmm5 into 8 four-byte pixels
// and writes them to [dst_abgr].
//  - xmm2/xmm1 and xmm0/xmm5 are byte-interleaved; the two results are then
//    word-interleaved into a low half (xmm2) and a high half (xmm1).
//  - Resulting byte order per pixel in memory: xmm2, xmm1, xmm0, xmm5
//    (presumably R, G, B, A -- confirm against the conversion macro).
//  - xmm5 is only read here. The NEW column drops the "Assumes XMM5 is set"
//    remark, but the caller must still have loaded xmm5 (all-ones before the
//    loop, or alpha bytes via READYUVA422).
//  - Two unaligned 16-byte stores, then dst_abgr advances by 0x20.
//  - Clobbers xmm0, xmm1 and xmm2.
1452 // Store 8 ABGR values. Assumes XMM5 is set. | 1465 // Store 8 ABGR values. |
1453 #define STOREABGR \ | 1466 #define STOREABGR \ |
1454 "punpcklbw %%xmm1,%%xmm2 \n" \ | 1467 "punpcklbw %%xmm1,%%xmm2 \n" \ |
1455 "punpcklbw %%xmm5,%%xmm0 \n" \ | 1468 "punpcklbw %%xmm5,%%xmm0 \n" \ |
1456 "movdqa %%xmm2,%%xmm1 \n" \ | 1469 "movdqa %%xmm2,%%xmm1 \n" \ |
1457 "punpcklwd %%xmm0,%%xmm2 \n" \ | 1470 "punpcklwd %%xmm0,%%xmm2 \n" \ |
1458 "punpckhwd %%xmm0,%%xmm1 \n" \ | 1471 "punpckhwd %%xmm0,%%xmm1 \n" \ |
1459 "movdqu %%xmm2," MEMACCESS([dst_abgr]) " \n" \ | 1472 "movdqu %%xmm2," MEMACCESS([dst_abgr]) " \n" \ |
1460 "movdqu %%xmm1," MEMACCESS2(0x10, [dst_abgr]) " \n" \ | 1473 "movdqu %%xmm1," MEMACCESS2(0x10, [dst_abgr]) " \n" \ |
1461 "lea " MEMLEA(0x20, [dst_abgr]) ", %[dst_abgr] \n" | 1474 "lea " MEMLEA(0x20, [dst_abgr]) ", %[dst_abgr] \n" |
1462 | 1475 |
// STORERGBA: weaves three colour registers and a constant 0xFF byte into 8
// four-byte pixels and writes them to [dst_rgba].
//  - pcmpeqb on xmm5 with itself sets every byte to 0xFF, so the OLD remark
//    "Assumes XMM5 is set" did not apply to this macro; the NEW column
//    removes it.
//  - Resulting byte order per pixel in memory: 0xFF, xmm0, xmm1, xmm2
//    (presumably A, B, G, R -- confirm against the conversion macro).
//  - xmm5 is used as the weave destination, so it is clobbered along with
//    xmm0 and xmm1.
//  - Two unaligned 16-byte stores, then dst_rgba advances by 0x20.
1463 // Store 8 RGBA values. Assumes XMM5 is set. | 1476 // Store 8 RGBA values. |
1464 #define STORERGBA \ | 1477 #define STORERGBA \ |
1465 "pcmpeqb %%xmm5,%%xmm5 \n" \ | 1478 "pcmpeqb %%xmm5,%%xmm5 \n" \ |
1466 "punpcklbw %%xmm2,%%xmm1 \n" \ | 1479 "punpcklbw %%xmm2,%%xmm1 \n" \ |
1467 "punpcklbw %%xmm0,%%xmm5 \n" \ | 1480 "punpcklbw %%xmm0,%%xmm5 \n" \ |
1468 "movdqa %%xmm5,%%xmm0 \n" \ | 1481 "movdqa %%xmm5,%%xmm0 \n" \ |
1469 "punpcklwd %%xmm1,%%xmm5 \n" \ | 1482 "punpcklwd %%xmm1,%%xmm5 \n" \ |
1470 "punpckhwd %%xmm1,%%xmm0 \n" \ | 1483 "punpckhwd %%xmm1,%%xmm0 \n" \ |
1471 "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \ | 1484 "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \ |
1472 "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \ | 1485 "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \ |
1473 "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n" | 1486 "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n" |
(...skipping 162 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1636 [u_buf]"+r"(u_buf), // %[u_buf] | 1649 [u_buf]"+r"(u_buf), // %[u_buf] |
1637 [v_buf]"+r"(v_buf), // %[v_buf] | 1650 [v_buf]"+r"(v_buf), // %[v_buf] |
1638 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 1651 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
1639 [width]"+rm"(width) // %[width] | 1652 [width]"+rm"(width) // %[width] |
1640 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | 1653 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
1641 : "memory", "cc", NACL_R14 | 1654 : "memory", "cc", NACL_R14 |
1642 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 1655 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
1643 ); | 1656 ); |
1644 } | 1657 } |
1645 | 1658 |
// I422AlphaToARGBRow_SSSE3 (added in the NEW column): converts one row of
// planar Y, U, V plus a separate alpha plane into 4-byte pixels, 8 pixels per
// loop iteration, using READYUVA422 -> YUVTORGB -> STOREARGB.
//   y_buf, a_buf - 8 bytes consumed per iteration.
//   u_buf, v_buf - 4 bytes consumed from each per iteration. The leading sub
//                  turns v_buf into the offset (v_buf - u_buf) so that only
//                  u_buf has to be advanced inside the loop.
//   dst_argb     - 32 bytes written per iteration.
//   yuvconstants - handed to YUVTORGB (its use is not visible in this chunk).
//   width        - pixel count, reduced by 8 after every iteration.
// The loop body runs before the width test (sub then jg), so at least one
// full group of 8 pixels is always processed and a width that is not a
// multiple of 8 is rounded up. Presumably callers pad the buffers or handle
// the remainder elsewhere -- confirm.
// There is no pcmpeqb on xmm5 before the loop (compare I411ToARGBRow_SSSE3):
// xmm5 is reloaded from a_buf on every iteration.
// NOTE(review): five read-write register pointers plus the register input
// yuvconstants need six general-purpose registers; this file also targets
// __i386__, where that is tight -- confirm it builds there (e.g. with PIC).
| 1659 void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf, |
| 1660 const uint8* u_buf, |
| 1661 const uint8* v_buf, |
| 1662 const uint8* a_buf, |
| 1663 uint8* dst_argb, |
| 1664 struct YuvConstants* yuvconstants, |
| 1665 int width) { |
| 1666 asm volatile ( |
| 1667 "sub %[u_buf],%[v_buf] \n" |
| 1668 LABELALIGN |
| 1669 "1: \n" |
| 1670 READYUVA422 |
| 1671 YUVTORGB(yuvconstants) |
| 1672 STOREARGB |
| 1673 "sub $0x8,%[width] \n" |
| 1674 "jg 1b \n" |
| 1675 : [y_buf]"+r"(y_buf), // %[y_buf] |
| 1676 [u_buf]"+r"(u_buf), // %[u_buf] |
| 1677 [v_buf]"+r"(v_buf), // %[v_buf] |
| 1678 [a_buf]"+r"(a_buf), // %[a_buf] |
| 1679 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
| 1680 [width]"+rm"(width) // %[width] |
| 1681 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
| 1682 : "memory", "cc", NACL_R14 |
| 1683 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
| 1684 ); |
| 1685 } |
| 1686 |
// I422AlphaToABGRRow_SSSE3 (added in the NEW column): same structure as
// I422AlphaToARGBRow_SSSE3 but finishes each iteration with STOREABGR, so the
// colour bytes are written to dst_abgr in the opposite order.
//   y_buf, a_buf - 8 bytes consumed per iteration.
//   u_buf, v_buf - 4 bytes consumed from each per iteration. The leading sub
//                  turns v_buf into the offset (v_buf - u_buf).
//   dst_abgr     - 32 bytes written per iteration.
//   yuvconstants - handed to YUVTORGB (its use is not visible in this chunk).
//   width        - pixel count, reduced by 8 after every iteration.
// The loop body runs before the width test, so whole groups of 8 pixels are
// always processed, including when width is not a multiple of 8.
// xmm5 (alpha) is reloaded from a_buf by READYUVA422 on every iteration.
// NOTE(review): needs six general-purpose registers for its operands; this
// file also targets __i386__ -- confirm it builds there.
| 1687 void OMITFP I422AlphaToABGRRow_SSSE3(const uint8* y_buf, |
| 1688 const uint8* u_buf, |
| 1689 const uint8* v_buf, |
| 1690 const uint8* a_buf, |
| 1691 uint8* dst_abgr, |
| 1692 struct YuvConstants* yuvconstants, |
| 1693 int width) { |
| 1694 asm volatile ( |
| 1695 "sub %[u_buf],%[v_buf] \n" |
| 1696 LABELALIGN |
| 1697 "1: \n" |
| 1698 READYUVA422 |
| 1699 YUVTORGB(yuvconstants) |
| 1700 STOREABGR |
| 1701 "sub $0x8,%[width] \n" |
| 1702 "jg 1b \n" |
| 1703 : [y_buf]"+r"(y_buf), // %[y_buf] |
| 1704 [u_buf]"+r"(u_buf), // %[u_buf] |
| 1705 [v_buf]"+r"(v_buf), // %[v_buf] |
| 1706 [a_buf]"+r"(a_buf), // %[a_buf] |
| 1707 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] |
| 1708 [width]"+rm"(width) // %[width] |
| 1709 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
| 1710 : "memory", "cc", NACL_R14 |
| 1711 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
| 1712 ); |
| 1713 } |
| 1714 |
1646 void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, | 1715 void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, |
1647 const uint8* u_buf, | 1716 const uint8* u_buf, |
1648 const uint8* v_buf, | 1717 const uint8* v_buf, |
1649 uint8* dst_argb, | 1718 uint8* dst_argb, |
1650 struct YuvConstants* yuvconstants, | 1719 struct YuvConstants* yuvconstants, |
1651 int width) { | 1720 int width) { |
1652 asm volatile ( | 1721 asm volatile ( |
1653 "sub %[u_buf],%[v_buf] \n" | 1722 "sub %[u_buf],%[v_buf] \n" |
1654 "pcmpeqb %%xmm5,%%xmm5 \n" | 1723 "pcmpeqb %%xmm5,%%xmm5 \n" |
1655 LABELALIGN | 1724 LABELALIGN |
(...skipping 175 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1831 MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \ | 1900 MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \ |
1832 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ | 1901 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ |
1833 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ | 1902 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ |
1834 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ | 1903 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ |
1835 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ | 1904 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ |
1836 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ | 1905 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
1837 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ | 1906 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ |
1838 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ | 1907 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ |
1839 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" | 1908 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" |
1840 | 1909 |
// READYUVA422_AVX2 (added in the NEW column): fetches the inputs for one
// group of 16 pixels, including alpha.
//  - vmovq reads 8 bytes at [u_buf] into xmm0; the MEMOPREG vmovq reads 8
//    bytes into xmm1 addressed through [u_buf] and [v_buf] (presumably base +
//    offset -- confirm against MEMOPREG). u_buf then advances by 8.
//  - vpunpcklbw interleaves the U and V bytes; vpermq with 0xd8 swaps the two
//    middle quadwords, presumably so the following in-lane vpunpcklwd sees
//    half of the pairs in each 128-bit lane; vpunpcklwd repeats each UV pair.
//  - vmovdqu reads 16 Y bytes into xmm4, which are spread across both lanes
//    by vpermq and doubled into 16-bit words by vpunpcklbw; y_buf advances
//    by 0x10.
//  - vmovdqu reads 16 bytes at [a_buf] into xmm5 and vpermq spreads them
//    across both lanes of ymm5; a_buf advances by 0x10. ymm5 is overwritten
//    on every expansion.
| 1910 // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha. |
| 1911 #define READYUVA422_AVX2 \ |
| 1912 "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ |
| 1913 MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \ |
| 1914 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ |
| 1915 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ |
| 1916 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ |
| 1917 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ |
| 1918 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
| 1919 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ |
| 1920 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ |
| 1921 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \ |
| 1922 "vmovdqu " MEMACCESS([a_buf]) ",%%xmm5 \n" \ |
| 1923 "vpermq $0xd8,%%ymm5,%%ymm5 \n" \ |
| 1924 "lea " MEMLEA(0x10, [a_buf]) ",%[a_buf] \n" |
| 1925 |
// READNV12_AVX2: fetches the inputs for one group of 16 pixels when the
// chroma comes from a single [uv_buf] pointer (unchanged between OLD and NEW).
//  - vmovdqu reads 16 bytes at [uv_buf] into xmm0 (8 two-byte UV pairs per
//    the comment below) and uv_buf advances by 0x10.
//  - vpermq with 0xd8 swaps the two middle quadwords and vpunpcklwd on itself
//    then repeats every 16-bit pair, giving 16 pairs in ymm0.
//  - vmovdqu reads 16 Y bytes into xmm4, vpermq spreads them across both
//    lanes, vpunpcklbw on itself doubles each byte into a 16-bit word, and
//    y_buf advances by 0x10.
//  - No byte-interleave step is needed here, unlike the planar 422 readers.
1841 // Read 8 UV from NV12, upsample to 16 UV. | 1926 // Read 8 UV from NV12, upsample to 16 UV. |
1842 #define READNV12_AVX2 \ | 1927 #define READNV12_AVX2 \ |
1843 "vmovdqu " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ | 1928 "vmovdqu " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ |
1844 "lea " MEMLEA(0x10, [uv_buf]) ",%[uv_buf] \n" \ | 1929 "lea " MEMLEA(0x10, [uv_buf]) ",%[uv_buf] \n" \ |
1845 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ | 1930 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ |
1846 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ | 1931 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ |
1847 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ | 1932 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
1848 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ | 1933 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ |
1849 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ | 1934 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ |
1850 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" | 1935 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" |
(...skipping 29 matching lines...) Expand all Loading... |
1880 "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ | 1965 "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ |
1881 "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ | 1966 "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ |
1882 "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \ | 1967 "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \ |
1883 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ | 1968 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ |
1884 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ | 1969 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ |
1885 "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ | 1970 "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ |
1886 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ | 1971 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ |
1887 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ | 1972 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ |
1888 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" | 1973 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" |
1889 | 1974 |
// STOREARGB_AVX2: weaves three colour registers and ymm5 into 16 four-byte
// pixels and writes them to [dst_argb].
//  - ymm0/ymm1 and ymm2/ymm5 are byte-interleaved, each followed by a vpermq
//    with 0xd8 that swaps the two middle quadwords (the unpack instructions
//    work within each 128-bit lane).
//  - vpunpcklwd and vpunpckhwd then word-interleave the two results into
//    ymm1 and ymm0; ymm1 is stored first, ymm0 at offset 0x20.
//  - Per-pixel byte order follows the operand order ymm0, ymm1, ymm2, ymm5
//    (presumably B, G, R, A -- confirm against the conversion macro).
//  - ymm5 is only read here. The NEW column drops the "Assumes XMM5 is set"
//    remark, but the caller must still have loaded ymm5 (all-ones before the
//    loop, or alpha bytes via READYUVA422_AVX2).
//  - dst_argb advances by 0x40. Clobbers ymm0, ymm1 and ymm2.
1890 // Store 16 ARGB values. Assumes XMM5 is set. | 1975 // Store 16 ARGB values. |
1891 #define STOREARGB_AVX2 \ | 1976 #define STOREARGB_AVX2 \ |
1892 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ | 1977 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ |
1893 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ | 1978 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ |
1894 "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \ | 1979 "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \ |
1895 "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ | 1980 "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ |
1896 "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \ | 1981 "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \ |
1897 "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \ | 1982 "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \ |
1898 "vmovdqu %%ymm1," MEMACCESS([dst_argb]) " \n" \ | 1983 "vmovdqu %%ymm1," MEMACCESS([dst_argb]) " \n" \ |
1899 "vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) " \n" \ | 1984 "vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) " \n" \ |
1900 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" | 1985 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" |
1901 | 1986 |
// STOREABGR_AVX2 (added in the NEW column): the instruction sequence that the
// OLD column had inline inside I422ToABGRRow_AVX2, hoisted into a macro and
// retargeted from [dst_argb] to [dst_abgr].
//  - ymm2/ymm1 are byte-interleaved into ymm1 and ymm0/ymm5 into ymm2 (the
//    old inline comments label these RG and BA), each followed by a vpermq
//    with 0xd8 that swaps the two middle quadwords.
//  - vpunpcklwd produces the first 8 pixels in ymm0 and vpunpckhwd the next
//    8 in ymm1; they are stored at offsets 0 and 0x20.
//  - ymm5 is only read here, so the caller must have loaded it (all-ones
//    before the loop, or alpha bytes via READYUVA422_AVX2).
//  - dst_abgr advances by 0x40. Clobbers ymm0, ymm1 and ymm2.
| 1987 // Store 16 ABGR values. |
| 1988 #define STOREABGR_AVX2 \ |
| 1989 "vpunpcklbw %%ymm1,%%ymm2,%%ymm1 \n" \ |
| 1990 "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ |
| 1991 "vpunpcklbw %%ymm5,%%ymm0,%%ymm2 \n" \ |
| 1992 "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ |
| 1993 "vpunpcklwd %%ymm2,%%ymm1,%%ymm0 \n" \ |
| 1994 "vpunpckhwd %%ymm2,%%ymm1,%%ymm1 \n" \ |
| 1995 "vmovdqu %%ymm0," MEMACCESS([dst_abgr]) " \n" \ |
| 1996 "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_abgr]) " \n" \ |
| 1997 "lea " MEMLEA(0x40,[dst_abgr]) ",%[dst_abgr] \n" |
| 1998 |
1902 #if defined(HAS_I422TOBGRAROW_AVX2) | 1999 #if defined(HAS_I422TOBGRAROW_AVX2) |
1903 // 16 pixels | 2000 // 16 pixels |
1904 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). | 2001 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). |
1905 void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf, | 2002 void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf, |
1906 const uint8* u_buf, | 2003 const uint8* u_buf, |
1907 const uint8* v_buf, | 2004 const uint8* v_buf, |
1908 uint8* dst_bgra, | 2005 uint8* dst_bgra, |
1909 struct YuvConstants* yuvconstants, | 2006 struct YuvConstants* yuvconstants, |
1910 int width) { | 2007 int width) { |
1911 asm volatile ( | 2008 asm volatile ( |
(...skipping 55 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1967 [v_buf]"+r"(v_buf), // %[v_buf] | 2064 [v_buf]"+r"(v_buf), // %[v_buf] |
1968 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 2065 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
1969 [width]"+rm"(width) // %[width] | 2066 [width]"+rm"(width) // %[width] |
1970 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | 2067 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
1971 : "memory", "cc", NACL_R14 | 2068 : "memory", "cc", NACL_R14 |
1972 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 2069 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
1973 ); | 2070 ); |
1974 } | 2071 } |
1975 #endif // HAS_I422TOARGBROW_AVX2 | 2072 #endif // HAS_I422TOARGBROW_AVX2 |
1976 | 2073 |
// I422AlphaToARGBRow_AVX2 (added in the NEW column, compiled only when
// HAS_I422ALPHATOARGBROW_AVX2 is defined): converts one row of planar Y, U, V
// plus a separate alpha plane into 4-byte pixels, 16 pixels per iteration,
// using READYUVA422_AVX2 -> YUVTORGB_AVX2 -> STOREARGB_AVX2.
//   y_buf, a_buf - 16 bytes consumed per iteration.
//   u_buf, v_buf - 8 bytes consumed from each per iteration. The leading sub
//                  turns v_buf into the offset (v_buf - u_buf).
//   dst_argb     - 64 bytes written per iteration.
//   yuvconstants - handed to YUVTORGB_AVX2 (use not visible in this chunk).
//   width        - pixel count, reduced by 0x10 after every iteration.
// The loop body runs before the width test, so whole groups of 16 pixels are
// always processed, including when width is not a multiple of 16.
// There is no vpcmpeqb on ymm5 before the loop (compare I422ToABGRRow_AVX2):
// ymm5 is reloaded from a_buf on every iteration.
// vzeroupper is issued once, after the loop exits.
| 2074 #if defined(HAS_I422ALPHATOARGBROW_AVX2) |
| 2075 // 16 pixels |
| 2076 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB. |
| 2077 void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf, |
| 2078 const uint8* u_buf, |
| 2079 const uint8* v_buf, |
| 2080 const uint8* a_buf, |
| 2081 uint8* dst_argb, |
| 2082 struct YuvConstants* yuvconstants, |
| 2083 int width) { |
| 2084 asm volatile ( |
| 2085 "sub %[u_buf],%[v_buf] \n" |
| 2086 LABELALIGN |
| 2087 "1: \n" |
| 2088 READYUVA422_AVX2 |
| 2089 YUVTORGB_AVX2(yuvconstants) |
| 2090 STOREARGB_AVX2 |
| 2091 "sub $0x10,%[width] \n" |
| 2092 "jg 1b \n" |
| 2093 "vzeroupper \n" |
| 2094 : [y_buf]"+r"(y_buf), // %[y_buf] |
| 2095 [u_buf]"+r"(u_buf), // %[u_buf] |
| 2096 [v_buf]"+r"(v_buf), // %[v_buf] |
| 2097 [a_buf]"+r"(a_buf), // %[a_buf] |
| 2098 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
| 2099 [width]"+rm"(width) // %[width] |
| 2100 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
| 2101 : "memory", "cc", NACL_R14 |
| 2102 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
| 2103 ); |
| 2104 } |
| 2105 #endif // HAS_I422ALPHATOARGBROW_AVX2 |
| 2106 |
// I422AlphaToABGRRow_AVX2 (added in the NEW column, compiled only when
// HAS_I422ALPHATOABGRROW_AVX2 is defined): same structure as
// I422AlphaToARGBRow_AVX2 but finishes each iteration with STOREABGR_AVX2.
//   y_buf, a_buf - 16 bytes consumed per iteration.
//   u_buf, v_buf - 8 bytes consumed from each per iteration. The leading sub
//                  turns v_buf into the offset (v_buf - u_buf).
//   dst_abgr     - 64 bytes written per iteration.
//   yuvconstants - handed to YUVTORGB_AVX2 (use not visible in this chunk).
//   width        - pixel count, reduced by 0x10 after every iteration.
// The loop body runs before the width test, so whole groups of 16 pixels are
// always processed, including when width is not a multiple of 16.
// ymm5 (alpha) is reloaded from a_buf by READYUVA422_AVX2 on every iteration.
// vzeroupper is issued once, after the loop exits.
| 2107 #if defined(HAS_I422ALPHATOABGRROW_AVX2) |
| 2108 // 16 pixels |
| 2109 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ABGR. |
| 2110 void OMITFP I422AlphaToABGRRow_AVX2(const uint8* y_buf, |
| 2111 const uint8* u_buf, |
| 2112 const uint8* v_buf, |
| 2113 const uint8* a_buf, |
| 2114 uint8* dst_abgr, |
| 2115 struct YuvConstants* yuvconstants, |
| 2116 int width) { |
| 2117 asm volatile ( |
| 2118 "sub %[u_buf],%[v_buf] \n" |
| 2119 LABELALIGN |
| 2120 "1: \n" |
| 2121 READYUVA422_AVX2 |
| 2122 YUVTORGB_AVX2(yuvconstants) |
| 2123 STOREABGR_AVX2 |
| 2124 "sub $0x10,%[width] \n" |
| 2125 "jg 1b \n" |
| 2126 "vzeroupper \n" |
| 2127 : [y_buf]"+r"(y_buf), // %[y_buf] |
| 2128 [u_buf]"+r"(u_buf), // %[u_buf] |
| 2129 [v_buf]"+r"(v_buf), // %[v_buf] |
| 2130 [a_buf]"+r"(a_buf), // %[a_buf] |
| 2131 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] |
| 2132 [width]"+rm"(width) // %[width] |
| 2133 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
| 2134 : "memory", "cc", NACL_R14 |
| 2135 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
| 2136 ); |
| 2137 } |
| 2138 #endif // HAS_I422ALPHATOABGRROW_AVX2 |
| 2139 |
// I422ToABGRRow_AVX2 (compiled only when HAS_I422TOABGRROW_AVX2 is defined):
// converts one row of planar Y, U, V into 4-byte pixels with constant alpha,
// 16 pixels per iteration.
// What this hunk changes between the columns:
//  - the destination parameter and its asm operand are renamed from dst_argb
//    to dst_abgr;
//  - the inline "Step 3: Weave into ABGR" sequence of the OLD column is
//    replaced by one STOREABGR_AVX2 expansion in the NEW column. The removed
//    lines match that macro instruction for instruction apart from the
//    operand name.
// Behaviour visible in both columns:
//  - the leading sub turns v_buf into the offset (v_buf - u_buf);
//  - vpcmpeqb sets every byte of ymm5 to 0xFF once, before the loop, and the
//    weave uses ymm5 as its fourth byte lane (labelled A in the old inline
//    comments), so every pixel gets the same alpha byte;
//  - the loop body runs before the width test, so whole groups of 16 pixels
//    are always processed, including when width is not a multiple of 16;
//  - vzeroupper is issued once, after the loop exits.
1977 #if defined(HAS_I422TOABGRROW_AVX2) | 2140 #if defined(HAS_I422TOABGRROW_AVX2) |
1978 // 16 pixels | 2141 // 16 pixels |
1979 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). | 2142 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). |
1980 void OMITFP I422ToABGRRow_AVX2(const uint8* y_buf, | 2143 void OMITFP I422ToABGRRow_AVX2(const uint8* y_buf, |
1981 const uint8* u_buf, | 2144 const uint8* u_buf, |
1982 const uint8* v_buf, | 2145 const uint8* v_buf, |
1983 uint8* dst_argb, | 2146 uint8* dst_abgr, |
1984 struct YuvConstants* yuvconstants, | 2147 struct YuvConstants* yuvconstants, |
1985 int width) { | 2148 int width) { |
1986 asm volatile ( | 2149 asm volatile ( |
1987 "sub %[u_buf],%[v_buf] \n" | 2150 "sub %[u_buf],%[v_buf] \n" |
1988 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 2151 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
1989 LABELALIGN | 2152 LABELALIGN |
1990 "1: \n" | 2153 "1: \n" |
1991 READYUV422_AVX2 | 2154 READYUV422_AVX2 |
1992 YUVTORGB_AVX2(yuvconstants) | 2155 YUVTORGB_AVX2(yuvconstants) |
1993 | 2156 STOREABGR_AVX2 |
1994 // Step 3: Weave into ABGR | |
1995 "vpunpcklbw %%ymm1,%%ymm2,%%ymm1 \n" // RG | |
1996 "vpermq $0xd8,%%ymm1,%%ymm1 \n" | |
1997 "vpunpcklbw %%ymm5,%%ymm0,%%ymm2 \n" // BA | |
1998 "vpermq $0xd8,%%ymm2,%%ymm2 \n" | |
1999 "vpunpcklwd %%ymm2,%%ymm1,%%ymm0 \n" // RGBA first 8 pixels | |
2000 "vpunpckhwd %%ymm2,%%ymm1,%%ymm1 \n" // RGBA next 8 pixels | |
2001 "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n" | |
2002 "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n" | |
2003 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" | |
2004 "sub $0x10,%[width] \n" | 2157 "sub $0x10,%[width] \n" |
2005 "jg 1b \n" | 2158 "jg 1b \n" |
2006 "vzeroupper \n" | 2159 "vzeroupper \n" |
2007 : [y_buf]"+r"(y_buf), // %[y_buf] | 2160 : [y_buf]"+r"(y_buf), // %[y_buf] |
2008 [u_buf]"+r"(u_buf), // %[u_buf] | 2161 [u_buf]"+r"(u_buf), // %[u_buf] |
2009 [v_buf]"+r"(v_buf), // %[v_buf] | 2162 [v_buf]"+r"(v_buf), // %[v_buf] |
2010 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 2163 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] |
2011 [width]"+rm"(width) // %[width] | 2164 [width]"+rm"(width) // %[width] |
2012 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | 2165 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
2013 : "memory", "cc", NACL_R14 | 2166 : "memory", "cc", NACL_R14 |
2014 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 2167 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
2015 ); | 2168 ); |
2016 } | 2169 } |
2017 #endif // HAS_I422TOABGRROW_AVX2 | 2170 #endif // HAS_I422TOABGRROW_AVX2 |
2018 | 2171 |
2019 #if defined(HAS_I422TORGBAROW_AVX2) | 2172 #if defined(HAS_I422TORGBAROW_AVX2) |
2020 // 16 pixels | 2173 // 16 pixels |
(...skipping 3395 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
5416 ); | 5569 ); |
5417 } | 5570 } |
5418 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 5571 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
5419 | 5572 |
5420 #endif // defined(__x86_64__) || defined(__i386__) | 5573 #endif // defined(__x86_64__) || defined(__i386__) |
5421 | 5574 |
5422 #ifdef __cplusplus | 5575 #ifdef __cplusplus |
5423 } // extern "C" | 5576 } // extern "C" |
5424 } // namespace libyuv | 5577 } // namespace libyuv |
5425 #endif | 5578 #endif |
OLD | NEW |