OLD | NEW |
1 // VERSION 2 | 1 // VERSION 2 |
2 /* | 2 /* |
3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
4 * | 4 * |
5 * Use of this source code is governed by a BSD-style license | 5 * Use of this source code is governed by a BSD-style license |
6 * that can be found in the LICENSE file in the root of the source | 6 * that can be found in the LICENSE file in the root of the source |
7 * tree. An additional intellectual property rights grant can be found | 7 * tree. An additional intellectual property rights grant can be found |
8 * in the file PATENTS. All contributing project authors may | 8 * in the file PATENTS. All contributing project authors may |
9 * be found in the AUTHORS file in the root of the source tree. | 9 * be found in the AUTHORS file in the root of the source tree. |
10 */ | 10 */ |
(...skipping 1546 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1557 "lea " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf] \n" | 1557 "lea " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf] \n" |
1558 | 1558 |
// READUYVY: read 4 UYVY groups (8 Y) and expand their 4 UV pairs to 8 UV.
//
// The same 16 source bytes are loaded twice because pshufb overwrites its
// destination register:
//   xmm4 <- source bytes rearranged by kShuffleUYVYY  (Y, going by the name)
//   xmm0 <- source bytes rearranged by kShuffleUYVYUV (UV, going by the name)
// Afterwards uyvy_buf is advanced by 0x10 bytes.
//
// The enclosing asm statement must provide the operands [uyvy_buf],
// [kShuffleUYVYY] and [kShuffleUYVYUV].
#define READUYVY                                                               \
    "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm4 \n"                               \
    "pshufb %[kShuffleUYVYY], %%xmm4 \n"                                       \
    "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm0 \n"                               \
    "pshufb %[kShuffleUYVYUV], %%xmm0 \n"                                      \
    "lea " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf] \n"
1566 | 1566 |
// YUV -> RGB core for the SSSE3 row functions.
//
// Three cooperating macros are defined for each target:
//   YUVTORGB_SETUP(yuvconstants)  emitted once, before the pixel loop.
//   YUVTORGB(yuvconstants)        emitted inside the loop; converts 8 pixels
//                                 (8 UV and 8 Y).
//   YUVTORGB_REGS                 extra entries for the asm clobber list; when
//                                 non-empty it ends with a trailing comma so
//                                 it can be pasted in front of further names.
//
// Register contract of YUVTORGB (both variants):
//   in : xmm0 = packed UV bytes, xmm4 = Y values
//   out: xmm0, xmm1, xmm2 = the three colour channels, saturated to unsigned
//        bytes by packuswb (consumed by the STORE* macros)
//   scratch: xmm3; xmm4 is overwritten with the scaled Y term.
//
// Layout of *yuvconstants as used here (byte offsets):
//     0 / 32 / 64   coefficients multiplied with UV via pmaddubsw
//    96 / 128 / 160 values the UV products are subtracted from
//   192             multiplier applied to Y via pmulhuw

#if defined(__x86_64__)

// 64-bit: load the seven constant vectors into xmm8..xmm14 once, so the loop
// body works register-to-register.
#define YUVTORGB_SETUP(yuvconstants)                                           \
    "movdqa " MEMACCESS([yuvconstants]) ",%%xmm8 \n"                           \
    "movdqa " MEMACCESS2(32, [yuvconstants]) ",%%xmm9 \n"                      \
    "movdqa " MEMACCESS2(64, [yuvconstants]) ",%%xmm10 \n"                     \
    "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm11 \n"                     \
    "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm12 \n"                    \
    "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm13 \n"                    \
    "movdqa " MEMACCESS2(192, [yuvconstants]) ",%%xmm14 \n"

// Convert 8 pixels: 8 UV and 8 Y (constants taken from xmm8..xmm14).
#define YUVTORGB(yuvconstants)                                                 \
    "movdqa %%xmm0,%%xmm1 \n"        /* three working copies of UV */          \
    "movdqa %%xmm0,%%xmm2 \n"                                                  \
    "movdqa %%xmm0,%%xmm3 \n"                                                  \
    "movdqa %%xmm11,%%xmm0 \n"       /* channel 0: const - UV * coeff */       \
    "pmaddubsw %%xmm8,%%xmm1 \n"                                               \
    "psubw %%xmm1,%%xmm0 \n"                                                   \
    "movdqa %%xmm12,%%xmm1 \n"       /* channel 1 */                           \
    "pmaddubsw %%xmm9,%%xmm2 \n"                                               \
    "psubw %%xmm2,%%xmm1 \n"                                                   \
    "movdqa %%xmm13,%%xmm2 \n"       /* channel 2 */                           \
    "pmaddubsw %%xmm10,%%xmm3 \n"                                              \
    "psubw %%xmm3,%%xmm2 \n"                                                   \
    "pmulhuw %%xmm14,%%xmm4 \n"      /* scaled Y term */                       \
    "paddsw %%xmm4,%%xmm0 \n"        /* add Y to every channel */              \
    "paddsw %%xmm4,%%xmm1 \n"                                                  \
    "paddsw %%xmm4,%%xmm2 \n"                                                  \
    "psraw $0x6,%%xmm0 \n"           /* drop 6 fractional bits */              \
    "psraw $0x6,%%xmm1 \n"                                                     \
    "psraw $0x6,%%xmm2 \n"                                                     \
    "packuswb %%xmm0,%%xmm0 \n"      /* saturate to unsigned bytes */          \
    "packuswb %%xmm1,%%xmm1 \n"                                                \
    "packuswb %%xmm2,%%xmm2 \n"

// Registers written by YUVTORGB_SETUP; must appear in the clobber list.
#define YUVTORGB_REGS                                                          \
    "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",

#else

// Other targets: nothing is preloaded; YUVTORGB reads every constant straight
// from memory through [yuvconstants].
#define YUVTORGB_SETUP(yuvconstants)

// Convert 8 pixels: 8 UV and 8 Y
#define YUVTORGB(yuvconstants)                                                 \
    "movdqa %%xmm0,%%xmm1 \n"                                                  \
    "movdqa %%xmm0,%%xmm2 \n"                                                  \
    "movdqa %%xmm0,%%xmm3 \n"                                                  \
    "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm0 \n"                      \
    "pmaddubsw " MEMACCESS([yuvconstants]) ",%%xmm1 \n"                        \
    "psubw %%xmm1,%%xmm0 \n"                                                   \
    "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm1 \n"                     \
    "pmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%xmm2 \n"                   \
    "psubw %%xmm2,%%xmm1 \n"                                                   \
    "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm2 \n"                     \
    "pmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%xmm3 \n"                   \
    "psubw %%xmm3,%%xmm2 \n"                                                   \
    "pmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%xmm4 \n"                    \
    "paddsw %%xmm4,%%xmm0 \n"                                                  \
    "paddsw %%xmm4,%%xmm1 \n"                                                  \
    "paddsw %%xmm4,%%xmm2 \n"                                                  \
    "psraw $0x6,%%xmm0 \n"                                                     \
    "psraw $0x6,%%xmm1 \n"                                                     \
    "psraw $0x6,%%xmm2 \n"                                                     \
    "packuswb %%xmm0,%%xmm0 \n"                                                \
    "packuswb %%xmm1,%%xmm1 \n"                                                \
    "packuswb %%xmm2,%%xmm2 \n"

// No additional registers are clobbered in this configuration.
#define YUVTORGB_REGS

#endif
1591 | 1631 |
1592 // Store 8 ARGB values. | 1632 // Store 8 ARGB values. |
1593 #define STOREARGB \ | 1633 #define STOREARGB \ |
1594 "punpcklbw %%xmm1,%%xmm0 \n" \ | 1634 "punpcklbw %%xmm1,%%xmm0 \n" \ |
1595 "punpcklbw %%xmm5,%%xmm2 \n" \ | 1635 "punpcklbw %%xmm5,%%xmm2 \n" \ |
1596 "movdqa %%xmm0,%%xmm1 \n" \ | 1636 "movdqa %%xmm0,%%xmm1 \n" \ |
1597 "punpcklwd %%xmm2,%%xmm0 \n" \ | 1637 "punpcklwd %%xmm2,%%xmm0 \n" \ |
1598 "punpckhwd %%xmm2,%%xmm1 \n" \ | 1638 "punpckhwd %%xmm2,%%xmm1 \n" \ |
1599 "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \ | 1639 "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \ |
1600 "movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \ | 1640 "movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \ |
(...skipping 11 matching lines...) Expand all Loading... |
1612 "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \ | 1652 "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \ |
1613 "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n" | 1653 "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n" |
1614 | 1654 |
1615 void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, | 1655 void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, |
1616 const uint8* u_buf, | 1656 const uint8* u_buf, |
1617 const uint8* v_buf, | 1657 const uint8* v_buf, |
1618 uint8* dst_argb, | 1658 uint8* dst_argb, |
1619 const struct YuvConstants* yuvconstants, | 1659 const struct YuvConstants* yuvconstants, |
1620 int width) { | 1660 int width) { |
1621 asm volatile ( | 1661 asm volatile ( |
| 1662 YUVTORGB_SETUP(yuvconstants) |
1622 "sub %[u_buf],%[v_buf] \n" | 1663 "sub %[u_buf],%[v_buf] \n" |
1623 "pcmpeqb %%xmm5,%%xmm5 \n" | 1664 "pcmpeqb %%xmm5,%%xmm5 \n" |
1624 LABELALIGN | 1665 LABELALIGN |
1625 "1: \n" | 1666 "1: \n" |
1626 READYUV444 | 1667 READYUV444 |
1627 YUVTORGB(yuvconstants) | 1668 YUVTORGB(yuvconstants) |
1628 STOREARGB | 1669 STOREARGB |
1629 "sub $0x8,%[width] \n" | 1670 "sub $0x8,%[width] \n" |
1630 "jg 1b \n" | 1671 "jg 1b \n" |
1631 : [y_buf]"+r"(y_buf), // %[y_buf] | 1672 : [y_buf]"+r"(y_buf), // %[y_buf] |
1632 [u_buf]"+r"(u_buf), // %[u_buf] | 1673 [u_buf]"+r"(u_buf), // %[u_buf] |
1633 [v_buf]"+r"(v_buf), // %[v_buf] | 1674 [v_buf]"+r"(v_buf), // %[v_buf] |
1634 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 1675 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
1635 [width]"+rm"(width) // %[width] | 1676 [width]"+rm"(width) // %[width] |
1636 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | 1677 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
1637 : "memory", "cc", NACL_R14 | 1678 : "memory", "cc", NACL_R14 YUVTORGB_REGS |
1638 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 1679 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
1639 ); | 1680 ); |
1640 } | 1681 } |
1641 | 1682 |
1642 void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, | 1683 void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, |
1643 const uint8* u_buf, | 1684 const uint8* u_buf, |
1644 const uint8* v_buf, | 1685 const uint8* v_buf, |
1645 uint8* dst_rgb24, | 1686 uint8* dst_rgb24, |
1646 const struct YuvConstants* yuvconstants, | 1687 const struct YuvConstants* yuvconstants, |
1647 int width) { | 1688 int width) { |
1648 asm volatile ( | 1689 asm volatile ( |
| 1690 YUVTORGB_SETUP(yuvconstants) |
1649 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" | 1691 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" |
1650 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" | 1692 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" |
1651 "sub %[u_buf],%[v_buf] \n" | 1693 "sub %[u_buf],%[v_buf] \n" |
1652 LABELALIGN | 1694 LABELALIGN |
1653 "1: \n" | 1695 "1: \n" |
1654 READYUV422 | 1696 READYUV422 |
1655 YUVTORGB(yuvconstants) | 1697 YUVTORGB(yuvconstants) |
1656 "punpcklbw %%xmm1,%%xmm0 \n" | 1698 "punpcklbw %%xmm1,%%xmm0 \n" |
1657 "punpcklbw %%xmm2,%%xmm2 \n" | 1699 "punpcklbw %%xmm2,%%xmm2 \n" |
1658 "movdqa %%xmm0,%%xmm1 \n" | 1700 "movdqa %%xmm0,%%xmm1 \n" |
(...skipping 12 matching lines...) Expand all Loading... |
1671 [v_buf]"+r"(v_buf), // %[v_buf] | 1713 [v_buf]"+r"(v_buf), // %[v_buf] |
1672 [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24] | 1714 [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24] |
1673 #if defined(__i386__) && defined(__pic__) | 1715 #if defined(__i386__) && defined(__pic__) |
1674 [width]"+m"(width) // %[width] | 1716 [width]"+m"(width) // %[width] |
1675 #else | 1717 #else |
1676 [width]"+rm"(width) // %[width] | 1718 [width]"+rm"(width) // %[width] |
1677 #endif | 1719 #endif |
1678 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] | 1720 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] |
1679 [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0), | 1721 [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0), |
1680 [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24) | 1722 [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24) |
1681 : "memory", "cc", NACL_R14 | 1723 : "memory", "cc", NACL_R14 YUVTORGB_REGS |
1682 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" | 1724 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" |
1683 ); | 1725 ); |
1684 } | 1726 } |
1685 | 1727 |
1686 void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, | 1728 void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, |
1687 const uint8* u_buf, | 1729 const uint8* u_buf, |
1688 const uint8* v_buf, | 1730 const uint8* v_buf, |
1689 uint8* dst_argb, | 1731 uint8* dst_argb, |
1690 const struct YuvConstants* yuvconstants, | 1732 const struct YuvConstants* yuvconstants, |
1691 int width) { | 1733 int width) { |
1692 asm volatile ( | 1734 asm volatile ( |
| 1735 YUVTORGB_SETUP(yuvconstants) |
1693 "sub %[u_buf],%[v_buf] \n" | 1736 "sub %[u_buf],%[v_buf] \n" |
1694 "pcmpeqb %%xmm5,%%xmm5 \n" | 1737 "pcmpeqb %%xmm5,%%xmm5 \n" |
1695 LABELALIGN | 1738 LABELALIGN |
1696 "1: \n" | 1739 "1: \n" |
1697 READYUV422 | 1740 READYUV422 |
1698 YUVTORGB(yuvconstants) | 1741 YUVTORGB(yuvconstants) |
1699 STOREARGB | 1742 STOREARGB |
1700 "sub $0x8,%[width] \n" | 1743 "sub $0x8,%[width] \n" |
1701 "jg 1b \n" | 1744 "jg 1b \n" |
1702 : [y_buf]"+r"(y_buf), // %[y_buf] | 1745 : [y_buf]"+r"(y_buf), // %[y_buf] |
1703 [u_buf]"+r"(u_buf), // %[u_buf] | 1746 [u_buf]"+r"(u_buf), // %[u_buf] |
1704 [v_buf]"+r"(v_buf), // %[v_buf] | 1747 [v_buf]"+r"(v_buf), // %[v_buf] |
1705 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 1748 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
1706 [width]"+rm"(width) // %[width] | 1749 [width]"+rm"(width) // %[width] |
1707 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | 1750 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
1708 : "memory", "cc", NACL_R14 | 1751 : "memory", "cc", NACL_R14 YUVTORGB_REGS |
1709 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 1752 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
1710 ); | 1753 ); |
1711 } | 1754 } |
1712 | 1755 |
#ifdef HAS_I422ALPHATOARGBROW_SSSE3
// Convert a row of planar Y/U/V plus a separate alpha plane (read by
// READYUVA422) to ARGB with SSSE3.
//
//   y_buf, u_buf, v_buf, a_buf  source planes.
//   dst_argb                    destination; 32 bytes stored per iteration.
//   yuvconstants                conversion constants, see YUVTORGB.
//   width                       pixel count; 8 pixels per iteration, looping
//                               while the remaining count is > 0.
//
// Unlike the opaque converters, xmm5 is not preset to all ones here;
// presumably READYUVA422 fills it from a_buf before STOREARGB uses it.
void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
                                     const uint8* u_buf,
                                     const uint8* v_buf,
                                     const uint8* a_buf,
                                     uint8* dst_argb,
                                     const struct YuvConstants* yuvconstants,
                                     int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    // v_buf becomes an offset relative to u_buf.
    "sub %[u_buf],%[v_buf] \n"
    LABELALIGN
    "1: \n"
    READYUVA422
    YUVTORGB(yuvconstants)
    STOREARGB
    // 'l' suffix is required: width may be a memory operand (see below).
    "subl $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),        // %[y_buf]
    [u_buf]"+r"(u_buf),        // %[u_buf]
    [v_buf]"+r"(v_buf),        // %[v_buf]
    [a_buf]"+r"(a_buf),        // %[a_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
#if defined(__i386__) && defined(__pic__)
    // 32-bit PIC builds keep width in memory (presumably because too few
    // general registers remain for the six pointer operands).
    [width]"+m"(width)         // %[width]
#else
    [width]"+rm"(width)        // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_I422ALPHATOARGBROW_SSSE3
1746 | 1790 |
1747 void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, | 1791 void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, |
1748 const uint8* u_buf, | 1792 const uint8* u_buf, |
1749 const uint8* v_buf, | 1793 const uint8* v_buf, |
1750 uint8* dst_argb, | 1794 uint8* dst_argb, |
1751 const struct YuvConstants* yuvconstants, | 1795 const struct YuvConstants* yuvconstants, |
1752 int width) { | 1796 int width) { |
1753 asm volatile ( | 1797 asm volatile ( |
| 1798 YUVTORGB_SETUP(yuvconstants) |
1754 "sub %[u_buf],%[v_buf] \n" | 1799 "sub %[u_buf],%[v_buf] \n" |
1755 "pcmpeqb %%xmm5,%%xmm5 \n" | 1800 "pcmpeqb %%xmm5,%%xmm5 \n" |
1756 LABELALIGN | 1801 LABELALIGN |
1757 "1: \n" | 1802 "1: \n" |
1758 READYUV411 | 1803 READYUV411 |
1759 YUVTORGB(yuvconstants) | 1804 YUVTORGB(yuvconstants) |
1760 STOREARGB | 1805 STOREARGB |
1761 "sub $0x8,%[width] \n" | 1806 "sub $0x8,%[width] \n" |
1762 "jg 1b \n" | 1807 "jg 1b \n" |
1763 : [y_buf]"+r"(y_buf), // %[y_buf] | 1808 : [y_buf]"+r"(y_buf), // %[y_buf] |
1764 [u_buf]"+r"(u_buf), // %[u_buf] | 1809 [u_buf]"+r"(u_buf), // %[u_buf] |
1765 [v_buf]"+r"(v_buf), // %[v_buf] | 1810 [v_buf]"+r"(v_buf), // %[v_buf] |
1766 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 1811 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
1767 [width]"+rm"(width) // %[width] | 1812 [width]"+rm"(width) // %[width] |
1768 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | 1813 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
1769 : "memory", "cc", NACL_R14 | 1814 : "memory", "cc", NACL_R14 YUVTORGB_REGS |
1770 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 1815 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
1771 ); | 1816 ); |
1772 } | 1817 } |
1773 | 1818 |
1774 void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, | 1819 void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, |
1775 const uint8* uv_buf, | 1820 const uint8* uv_buf, |
1776 uint8* dst_argb, | 1821 uint8* dst_argb, |
1777 const struct YuvConstants* yuvconstants, | 1822 const struct YuvConstants* yuvconstants, |
1778 int width) { | 1823 int width) { |
1779 asm volatile ( | 1824 asm volatile ( |
| 1825 YUVTORGB_SETUP(yuvconstants) |
1780 "pcmpeqb %%xmm5,%%xmm5 \n" | 1826 "pcmpeqb %%xmm5,%%xmm5 \n" |
1781 LABELALIGN | 1827 LABELALIGN |
1782 "1: \n" | 1828 "1: \n" |
1783 READNV12 | 1829 READNV12 |
1784 YUVTORGB(yuvconstants) | 1830 YUVTORGB(yuvconstants) |
1785 STOREARGB | 1831 STOREARGB |
1786 "sub $0x8,%[width] \n" | 1832 "sub $0x8,%[width] \n" |
1787 "jg 1b \n" | 1833 "jg 1b \n" |
1788 : [y_buf]"+r"(y_buf), // %[y_buf] | 1834 : [y_buf]"+r"(y_buf), // %[y_buf] |
1789 [uv_buf]"+r"(uv_buf), // %[uv_buf] | 1835 [uv_buf]"+r"(uv_buf), // %[uv_buf] |
1790 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 1836 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
1791 [width]"+rm"(width) // %[width] | 1837 [width]"+rm"(width) // %[width] |
1792 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | 1838 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
1793 // Does not use r14. | 1839 : "memory", "cc", YUVTORGB_REGS // Does not use r14. |
1794 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 1840 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
1795 ); | 1841 ); |
1796 } | 1842 } |
1797 | 1843 |
1798 void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf, | 1844 void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf, |
1799 const uint8* vu_buf, | 1845 const uint8* vu_buf, |
1800 uint8* dst_argb, | 1846 uint8* dst_argb, |
1801 const struct YuvConstants* yuvconstants, | 1847 const struct YuvConstants* yuvconstants, |
1802 int width) { | 1848 int width) { |
1803 asm volatile ( | 1849 asm volatile ( |
| 1850 YUVTORGB_SETUP(yuvconstants) |
1804 "pcmpeqb %%xmm5,%%xmm5 \n" | 1851 "pcmpeqb %%xmm5,%%xmm5 \n" |
1805 LABELALIGN | 1852 LABELALIGN |
1806 "1: \n" | 1853 "1: \n" |
1807 READNV21 | 1854 READNV21 |
1808 YUVTORGB(yuvconstants) | 1855 YUVTORGB(yuvconstants) |
1809 STOREARGB | 1856 STOREARGB |
1810 "sub $0x8,%[width] \n" | 1857 "sub $0x8,%[width] \n" |
1811 "jg 1b \n" | 1858 "jg 1b \n" |
1812 : [y_buf]"+r"(y_buf), // %[y_buf] | 1859 : [y_buf]"+r"(y_buf), // %[y_buf] |
1813 [vu_buf]"+r"(vu_buf), // %[vu_buf] | 1860 [vu_buf]"+r"(vu_buf), // %[vu_buf] |
1814 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 1861 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
1815 [width]"+rm"(width) // %[width] | 1862 [width]"+rm"(width) // %[width] |
1816 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] | 1863 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] |
1817 [kShuffleNV21]"m"(kShuffleNV21) | 1864 [kShuffleNV21]"m"(kShuffleNV21) |
1818 // Does not use r14. | 1865 : "memory", "cc", YUVTORGB_REGS // Does not use r14. |
1819 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 1866 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
1820 ); | 1867 ); |
1821 } | 1868 } |
1822 | 1869 |
1823 void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf, | 1870 void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf, |
1824 uint8* dst_argb, | 1871 uint8* dst_argb, |
1825 const struct YuvConstants* yuvconstants, | 1872 const struct YuvConstants* yuvconstants, |
1826 int width) { | 1873 int width) { |
1827 asm volatile ( | 1874 asm volatile ( |
| 1875 YUVTORGB_SETUP(yuvconstants) |
1828 "pcmpeqb %%xmm5,%%xmm5 \n" | 1876 "pcmpeqb %%xmm5,%%xmm5 \n" |
1829 LABELALIGN | 1877 LABELALIGN |
1830 "1: \n" | 1878 "1: \n" |
1831 READYUY2 | 1879 READYUY2 |
1832 YUVTORGB(yuvconstants) | 1880 YUVTORGB(yuvconstants) |
1833 STOREARGB | 1881 STOREARGB |
1834 "sub $0x8,%[width] \n" | 1882 "sub $0x8,%[width] \n" |
1835 "jg 1b \n" | 1883 "jg 1b \n" |
1836 : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf] | 1884 : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf] |
1837 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 1885 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
1838 [width]"+rm"(width) // %[width] | 1886 [width]"+rm"(width) // %[width] |
1839 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] | 1887 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] |
1840 [kShuffleYUY2Y]"m"(kShuffleYUY2Y), | 1888 [kShuffleYUY2Y]"m"(kShuffleYUY2Y), |
1841 [kShuffleYUY2UV]"m"(kShuffleYUY2UV) | 1889 [kShuffleYUY2UV]"m"(kShuffleYUY2UV) |
1842 // Does not use r14. | 1890 : "memory", "cc", YUVTORGB_REGS // Does not use r14. |
1843 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 1891 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
1844 ); | 1892 ); |
1845 } | 1893 } |
1846 | 1894 |
1847 void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf, | 1895 void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf, |
1848 uint8* dst_argb, | 1896 uint8* dst_argb, |
1849 const struct YuvConstants* yuvconstants, | 1897 const struct YuvConstants* yuvconstants, |
1850 int width) { | 1898 int width) { |
1851 asm volatile ( | 1899 asm volatile ( |
| 1900 YUVTORGB_SETUP(yuvconstants) |
1852 "pcmpeqb %%xmm5,%%xmm5 \n" | 1901 "pcmpeqb %%xmm5,%%xmm5 \n" |
1853 LABELALIGN | 1902 LABELALIGN |
1854 "1: \n" | 1903 "1: \n" |
1855 READUYVY | 1904 READUYVY |
1856 YUVTORGB(yuvconstants) | 1905 YUVTORGB(yuvconstants) |
1857 STOREARGB | 1906 STOREARGB |
1858 "sub $0x8,%[width] \n" | 1907 "sub $0x8,%[width] \n" |
1859 "jg 1b \n" | 1908 "jg 1b \n" |
1860 : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf] | 1909 : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf] |
1861 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 1910 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
1862 [width]"+rm"(width) // %[width] | 1911 [width]"+rm"(width) // %[width] |
1863 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] | 1912 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] |
1864 [kShuffleUYVYY]"m"(kShuffleUYVYY), | 1913 [kShuffleUYVYY]"m"(kShuffleUYVYY), |
1865 [kShuffleUYVYUV]"m"(kShuffleUYVYUV) | 1914 [kShuffleUYVYUV]"m"(kShuffleUYVYUV) |
1866 // Does not use r14. | 1915 : "memory", "cc", YUVTORGB_REGS // Does not use r14. |
1867 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 1916 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
1868 ); | 1917 ); |
1869 } | 1918 } |
1870 | 1919 |
1871 void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, | 1920 void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, |
1872 const uint8* u_buf, | 1921 const uint8* u_buf, |
1873 const uint8* v_buf, | 1922 const uint8* v_buf, |
1874 uint8* dst_rgba, | 1923 uint8* dst_rgba, |
1875 const struct YuvConstants* yuvconstants, | 1924 const struct YuvConstants* yuvconstants, |
1876 int width) { | 1925 int width) { |
1877 asm volatile ( | 1926 asm volatile ( |
| 1927 YUVTORGB_SETUP(yuvconstants) |
1878 "sub %[u_buf],%[v_buf] \n" | 1928 "sub %[u_buf],%[v_buf] \n" |
1879 "pcmpeqb %%xmm5,%%xmm5 \n" | 1929 "pcmpeqb %%xmm5,%%xmm5 \n" |
1880 LABELALIGN | 1930 LABELALIGN |
1881 "1: \n" | 1931 "1: \n" |
1882 READYUV422 | 1932 READYUV422 |
1883 YUVTORGB(yuvconstants) | 1933 YUVTORGB(yuvconstants) |
1884 STORERGBA | 1934 STORERGBA |
1885 "sub $0x8,%[width] \n" | 1935 "sub $0x8,%[width] \n" |
1886 "jg 1b \n" | 1936 "jg 1b \n" |
1887 : [y_buf]"+r"(y_buf), // %[y_buf] | 1937 : [y_buf]"+r"(y_buf), // %[y_buf] |
1888 [u_buf]"+r"(u_buf), // %[u_buf] | 1938 [u_buf]"+r"(u_buf), // %[u_buf] |
1889 [v_buf]"+r"(v_buf), // %[v_buf] | 1939 [v_buf]"+r"(v_buf), // %[v_buf] |
1890 [dst_rgba]"+r"(dst_rgba), // %[dst_rgba] | 1940 [dst_rgba]"+r"(dst_rgba), // %[dst_rgba] |
1891 [width]"+rm"(width) // %[width] | 1941 [width]"+rm"(width) // %[width] |
1892 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | 1942 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
1893 : "memory", "cc", NACL_R14 | 1943 : "memory", "cc", NACL_R14 YUVTORGB_REGS |
1894 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 1944 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
1895 ); | 1945 ); |
1896 } | 1946 } |
1897 | 1947 |
1898 #endif // HAS_I422TOARGBROW_SSSE3 | 1948 #endif // HAS_I422TOARGBROW_SSSE3 |
1899 | 1949 |
1900 // Read 8 UV from 422, upsample to 16 UV. | 1950 // Read 8 UV from 422, upsample to 16 UV. |
1901 #define READYUV422_AVX2 \ | 1951 #define READYUV422_AVX2 \ |
1902 "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ | 1952 "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ |
1903 MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \ | 1953 MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \ |
(...skipping 53 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1957 "lea " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf] \n" | 2007 "lea " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf] \n" |
1958 | 2008 |
// READUYVY_AVX2: read 8 UYVY groups (16 Y) and upsample their 8 UV pairs to
// 16 UV.
//
// The same 32 source bytes are loaded twice and shuffled two ways:
//   ymm4 <- source bytes rearranged by kShuffleUYVYY  (Y, going by the name)
//   ymm0 <- source bytes rearranged by kShuffleUYVYUV (UV, going by the name)
// Afterwards uyvy_buf is advanced by 0x20 bytes.
//
// The enclosing asm statement must provide the operands [uyvy_buf],
// [kShuffleUYVYY] and [kShuffleUYVYUV].
#define READUYVY_AVX2                                                          \
    "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm4 \n"                              \
    "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n"                              \
    "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm0 \n"                              \
    "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n"                             \
    "lea " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf] \n"
1966 | 2016 |
// YUV -> RGB core for the AVX2 row functions.
//
// Three cooperating macros are defined for each target:
//   YUVTORGB_SETUP_AVX2(yuvconstants)  emitted once, before the pixel loop.
//   YUVTORGB_AVX2(yuvconstants)        emitted inside the loop; converts
//                                      16 pixels (16 UV and 16 Y).
//   YUVTORGB_REGS_AVX2                 extra clobber-list entries; when
//                                      non-empty it ends with a trailing comma.
//
// Register contract of YUVTORGB_AVX2 (both variants):
//   in : ymm0 = packed UV bytes, ymm4 = Y values
//   out: ymm0, ymm1, ymm2 = the three colour channels, saturated to unsigned
//        bytes by vpackuswb; ymm4 is overwritten with the scaled Y term.
// The constants are read from *yuvconstants at byte offsets 0/32/64
// (vpmaddubsw coefficients), 96/128/160 (values the products are subtracted
// from) and 192 (vpmulhuw multiplier for Y).

#if defined(__x86_64__)

// 64-bit: load the seven constant vectors into ymm8..ymm14 once.
#define YUVTORGB_SETUP_AVX2(yuvconstants)                                      \
    "vmovdqa " MEMACCESS([yuvconstants]) ",%%ymm8 \n"                          \
    "vmovdqa " MEMACCESS2(32, [yuvconstants]) ",%%ymm9 \n"                     \
    "vmovdqa " MEMACCESS2(64, [yuvconstants]) ",%%ymm10 \n"                    \
    "vmovdqa " MEMACCESS2(96, [yuvconstants]) ",%%ymm11 \n"                    \
    "vmovdqa " MEMACCESS2(128, [yuvconstants]) ",%%ymm12 \n"                   \
    "vmovdqa " MEMACCESS2(160, [yuvconstants]) ",%%ymm13 \n"                   \
    "vmovdqa " MEMACCESS2(192, [yuvconstants]) ",%%ymm14 \n"

// Convert 16 pixels: 16 UV and 16 Y (constants taken from ymm8..ymm14).
#define YUVTORGB_AVX2(yuvconstants)                                            \
    "vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n"                                      \
    "vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n"                                       \
    "vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n"                                       \
    "vpsubw %%ymm2,%%ymm13,%%ymm2 \n"                                          \
    "vpsubw %%ymm1,%%ymm12,%%ymm1 \n"                                          \
    "vpsubw %%ymm0,%%ymm11,%%ymm0 \n"                                          \
    "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n"                                        \
    "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n"                                          \
    "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n"                                          \
    "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n"                                          \
    "vpsraw $0x6,%%ymm0,%%ymm0 \n"                                             \
    "vpsraw $0x6,%%ymm1,%%ymm1 \n"                                             \
    "vpsraw $0x6,%%ymm2,%%ymm2 \n"                                             \
    "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"                                        \
    "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"                                        \
    "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"

// Registers written by YUVTORGB_SETUP_AVX2, listed under their xmm names for
// the clobber list.
#define YUVTORGB_REGS_AVX2                                                     \
    "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",

#else

// Other targets: nothing is preloaded; every constant is read from memory and
// ymm3 is used as a temporary for the subtraction constants.
#define YUVTORGB_SETUP_AVX2(yuvconstants)

// Convert 16 pixels: 16 UV and 16 Y.
#define YUVTORGB_AVX2(yuvconstants)                                            \
    "vpmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2 \n"           \
    "vpmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%ymm0,%%ymm1 \n"           \
    "vpmaddubsw " MEMACCESS([yuvconstants]) ",%%ymm0,%%ymm0 \n"                \
    "vmovdqu " MEMACCESS2(160, [yuvconstants]) ",%%ymm3 \n"                    \
    "vpsubw %%ymm2,%%ymm3,%%ymm2 \n"                                           \
    "vmovdqu " MEMACCESS2(128, [yuvconstants]) ",%%ymm3 \n"                    \
    "vpsubw %%ymm1,%%ymm3,%%ymm1 \n"                                           \
    "vmovdqu " MEMACCESS2(96, [yuvconstants]) ",%%ymm3 \n"                     \
    "vpsubw %%ymm0,%%ymm3,%%ymm0 \n"                                           \
    "vpmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%ymm4,%%ymm4 \n"            \
    "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n"                                          \
    "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n"                                          \
    "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n"                                          \
    "vpsraw $0x6,%%ymm0,%%ymm0 \n"                                             \
    "vpsraw $0x6,%%ymm1,%%ymm1 \n"                                             \
    "vpsraw $0x6,%%ymm2,%%ymm2 \n"                                             \
    "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"                                        \
    "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"                                        \
    "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"

// No additional registers are clobbered in this configuration.
#define YUVTORGB_REGS_AVX2

#endif
1988 | 2069 |
// STOREARGB_AVX2: store 16 ARGB values (0x40 bytes) and advance dst_argb.
//
// Inputs are the byte-packed channels in ymm0, ymm1, ymm2 and ymm5. Bytes of
// ymm0/ymm1 and of ymm2/ymm5 are interleaved into 16-bit pairs, then the two
// pair streams are interleaved into 32-bit pixels. The vpermq $0xd8 steps
// reorder the 64-bit quarters after each byte unpack (presumably to undo the
// per-128-bit-lane behaviour of the unpack instructions so pixels come out in
// order).
// The enclosing asm statement must provide the operand [dst_argb].
#define STOREARGB_AVX2                                                         \
    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"                                       \
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"                                            \
    "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n"                                       \
    "vpermq $0xd8,%%ymm2,%%ymm2 \n"                                            \
    "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n"                                       \
    "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n"                                       \
    "vmovdqu %%ymm1," MEMACCESS([dst_argb]) " \n"                              \
    "vmovdqu %%ymm0," MEMACCESS2(0x20, [dst_argb]) " \n"                       \
    "lea " MEMLEA(0x40, [dst_argb]) ", %[dst_argb] \n"
2000 | 2081 |
#if defined(HAS_I422TOARGBROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
//
//   y_buf, u_buf, v_buf  source planes.
//   dst_argb             destination; 64 bytes are stored per iteration.
//   yuvconstants         conversion constants, see YUVTORGB_AVX2.
//   width                pixel count. Sixteen pixels are handled per iteration
//                        and the loop runs while the remaining count is > 0,
//                        so a width that is not a multiple of 16 is rounded up.
void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    // v_buf becomes an offset relative to u_buf; READYUV422_AVX2 addresses
    // the V plane as u_buf + v_buf.
    "sub %[u_buf],%[v_buf] \n"
    // ymm5 = all ones; STOREARGB_AVX2 interleaves it as the fourth byte of
    // every pixel.
    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
    LABELALIGN
    "1: \n"
    READYUV422_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    // Explicit 'l' suffix: width is "+rm" and may be bound to memory, where
    // the operand size would otherwise be ambiguous to the assembler.
    "subl $0x10,%[width] \n"
    "jg 1b \n"
    // Clear the upper ymm halves before returning to non-VEX code.
    "vzeroupper \n"
  : [y_buf]"+r"(y_buf),        // %[y_buf]
    [u_buf]"+r"(u_buf),        // %[u_buf]
    [v_buf]"+r"(v_buf),        // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)        // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_I422TOARGBROW_AVX2
2032 | 2114 |
2033 #if defined(HAS_I422ALPHATOARGBROW_AVX2) | 2115 #if defined(HAS_I422ALPHATOARGBROW_AVX2) |
2034 // 16 pixels | 2116 // 16 pixels |
2035 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB. | 2117 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB. |
2036 void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf, | 2118 void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf, |
2037 const uint8* u_buf, | 2119 const uint8* u_buf, |
2038 const uint8* v_buf, | 2120 const uint8* v_buf, |
2039 const uint8* a_buf, | 2121 const uint8* a_buf, |
2040 uint8* dst_argb, | 2122 uint8* dst_argb, |
2041 const struct YuvConstants* yuvconstants, | 2123 const struct YuvConstants* yuvconstants, |
2042 int width) { | 2124 int width) { |
2043 asm volatile ( | 2125 asm volatile ( |
| 2126 YUVTORGB_SETUP_AVX2(yuvconstants) |
2044 "sub %[u_buf],%[v_buf] \n" | 2127 "sub %[u_buf],%[v_buf] \n" |
2045 LABELALIGN | 2128 LABELALIGN |
2046 "1: \n" | 2129 "1: \n" |
2047 READYUVA422_AVX2 | 2130 READYUVA422_AVX2 |
2048 YUVTORGB_AVX2(yuvconstants) | 2131 YUVTORGB_AVX2(yuvconstants) |
2049 STOREARGB_AVX2 | 2132 STOREARGB_AVX2 |
2050 "subl $0x10,%[width] \n" | 2133 "subl $0x10,%[width] \n" |
2051 "jg 1b \n" | 2134 "jg 1b \n" |
2052 "vzeroupper \n" | 2135 "vzeroupper \n" |
2053 : [y_buf]"+r"(y_buf), // %[y_buf] | 2136 : [y_buf]"+r"(y_buf), // %[y_buf] |
2054 [u_buf]"+r"(u_buf), // %[u_buf] | 2137 [u_buf]"+r"(u_buf), // %[u_buf] |
2055 [v_buf]"+r"(v_buf), // %[v_buf] | 2138 [v_buf]"+r"(v_buf), // %[v_buf] |
2056 [a_buf]"+r"(a_buf), // %[a_buf] | 2139 [a_buf]"+r"(a_buf), // %[a_buf] |
2057 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 2140 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
2058 #if defined(__i386__) && defined(__pic__) | 2141 #if defined(__i386__) && defined(__pic__) |
2059 [width]"+m"(width) // %[width] | 2142 [width]"+m"(width) // %[width] |
2060 #else | 2143 #else |
2061 [width]"+rm"(width) // %[width] | 2144 [width]"+rm"(width) // %[width] |
2062 #endif | 2145 #endif |
2063 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | 2146 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
2064 : "memory", "cc", NACL_R14 | 2147 : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 |
2065 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 2148 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
2066 ); | 2149 ); |
2067 } | 2150 } |
2068 #endif // HAS_I422ALPHATOARGBROW_AVX2 | 2151 #endif // HAS_I422ALPHATOARGBROW_AVX2 |
2069 | 2152 |
2070 #if defined(HAS_I422TORGBAROW_AVX2) | 2153 #if defined(HAS_I422TORGBAROW_AVX2) |
2071 // 16 pixels | 2154 // 16 pixels |
2072 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). | 2155 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). |
2073 void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf, | 2156 void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf, |
2074 const uint8* u_buf, | 2157 const uint8* u_buf, |
2075 const uint8* v_buf, | 2158 const uint8* v_buf, |
2076 uint8* dst_argb, | 2159 uint8* dst_argb, |
2077 const struct YuvConstants* yuvconstants, | 2160 const struct YuvConstants* yuvconstants, |
2078 int width) { | 2161 int width) { |
2079 asm volatile ( | 2162 asm volatile ( |
| 2163 YUVTORGB_SETUP_AVX2(yuvconstants) |
2080 "sub %[u_buf],%[v_buf] \n" | 2164 "sub %[u_buf],%[v_buf] \n" |
2081 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 2165 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
2082 LABELALIGN | 2166 LABELALIGN |
2083 "1: \n" | 2167 "1: \n" |
2084 READYUV422_AVX2 | 2168 READYUV422_AVX2 |
2085 YUVTORGB_AVX2(yuvconstants) | 2169 YUVTORGB_AVX2(yuvconstants) |
2086 | 2170 |
2087 // Step 3: Weave into RGBA | 2171 // Step 3: Weave into RGBA |
2088 "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" | 2172 "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" |
2089 "vpermq $0xd8,%%ymm1,%%ymm1 \n" | 2173 "vpermq $0xd8,%%ymm1,%%ymm1 \n" |
2090 "vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n" | 2174 "vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n" |
2091 "vpermq $0xd8,%%ymm2,%%ymm2 \n" | 2175 "vpermq $0xd8,%%ymm2,%%ymm2 \n" |
2092 "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" | 2176 "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" |
2093 "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n" | 2177 "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n" |
2094 "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n" | 2178 "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n" |
2095 "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n" | 2179 "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n" |
2096 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" | 2180 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" |
2097 "sub $0x10,%[width] \n" | 2181 "sub $0x10,%[width] \n" |
2098 "jg 1b \n" | 2182 "jg 1b \n" |
2099 "vzeroupper \n" | 2183 "vzeroupper \n" |
2100 : [y_buf]"+r"(y_buf), // %[y_buf] | 2184 : [y_buf]"+r"(y_buf), // %[y_buf] |
2101 [u_buf]"+r"(u_buf), // %[u_buf] | 2185 [u_buf]"+r"(u_buf), // %[u_buf] |
2102 [v_buf]"+r"(v_buf), // %[v_buf] | 2186 [v_buf]"+r"(v_buf), // %[v_buf] |
2103 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 2187 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
2104 [width]"+rm"(width) // %[width] | 2188 [width]"+rm"(width) // %[width] |
2105 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | 2189 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
2106 : "memory", "cc", NACL_R14 | 2190 : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 |
2107 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 2191 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
2108 ); | 2192 ); |
2109 } | 2193 } |
2110 #endif // HAS_I422TORGBAROW_AVX2 | 2194 #endif // HAS_I422TORGBAROW_AVX2 |
2111 | 2195 |
2112 #if defined(HAS_NV12TOARGBROW_AVX2) | 2196 #if defined(HAS_NV12TOARGBROW_AVX2) |
2113 // 16 pixels. | 2197 // 16 pixels. |
2114 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). | 2198 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
2115 void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf, | 2199 void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf, |
2116 const uint8* uv_buf, | 2200 const uint8* uv_buf, |
2117 uint8* dst_argb, | 2201 uint8* dst_argb, |
2118 const struct YuvConstants* yuvconstants, | 2202 const struct YuvConstants* yuvconstants, |
2119 int width) { | 2203 int width) { |
2120 asm volatile ( | 2204 asm volatile ( |
| 2205 YUVTORGB_SETUP_AVX2(yuvconstants) |
2121 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 2206 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
2122 LABELALIGN | 2207 LABELALIGN |
2123 "1: \n" | 2208 "1: \n" |
2124 READNV12_AVX2 | 2209 READNV12_AVX2 |
2125 YUVTORGB_AVX2(yuvconstants) | 2210 YUVTORGB_AVX2(yuvconstants) |
2126 STOREARGB_AVX2 | 2211 STOREARGB_AVX2 |
2127 "sub $0x10,%[width] \n" | 2212 "sub $0x10,%[width] \n" |
2128 "jg 1b \n" | 2213 "jg 1b \n" |
2129 "vzeroupper \n" | 2214 "vzeroupper \n" |
2130 : [y_buf]"+r"(y_buf), // %[y_buf] | 2215 : [y_buf]"+r"(y_buf), // %[y_buf] |
2131 [uv_buf]"+r"(uv_buf), // %[uv_buf] | 2216 [uv_buf]"+r"(uv_buf), // %[uv_buf] |
2132 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 2217 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
2133 [width]"+rm"(width) // %[width] | 2218 [width]"+rm"(width) // %[width] |
2134 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | 2219 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
2135 // Does not use r14. | 2220 : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. |
2136 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 2221 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
2137 ); | 2222 ); |
2138 } | 2223 } |
2139 #endif // HAS_NV12TOARGBROW_AVX2 | 2224 #endif // HAS_NV12TOARGBROW_AVX2 |
2140 | 2225 |
2141 #if defined(HAS_NV21TOARGBROW_AVX2) | 2226 #if defined(HAS_NV21TOARGBROW_AVX2) |
2142 // 16 pixels. | 2227 // 16 pixels. |
2143 // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). | 2228 // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
2144 void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf, | 2229 void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf, |
2145 const uint8* vu_buf, | 2230 const uint8* vu_buf, |
2146 uint8* dst_argb, | 2231 uint8* dst_argb, |
2147 const struct YuvConstants* yuvconstants, | 2232 const struct YuvConstants* yuvconstants, |
2148 int width) { | 2233 int width) { |
2149 asm volatile ( | 2234 asm volatile ( |
| 2235 YUVTORGB_SETUP_AVX2(yuvconstants) |
2150 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 2236 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
2151 LABELALIGN | 2237 LABELALIGN |
2152 "1: \n" | 2238 "1: \n" |
2153 READNV21_AVX2 | 2239 READNV21_AVX2 |
2154 YUVTORGB_AVX2(yuvconstants) | 2240 YUVTORGB_AVX2(yuvconstants) |
2155 STOREARGB_AVX2 | 2241 STOREARGB_AVX2 |
2156 "sub $0x10,%[width] \n" | 2242 "sub $0x10,%[width] \n" |
2157 "jg 1b \n" | 2243 "jg 1b \n" |
2158 "vzeroupper \n" | 2244 "vzeroupper \n" |
2159 : [y_buf]"+r"(y_buf), // %[y_buf] | 2245 : [y_buf]"+r"(y_buf), // %[y_buf] |
2160 [vu_buf]"+r"(vu_buf), // %[vu_buf] | 2246 [vu_buf]"+r"(vu_buf), // %[vu_buf] |
2161 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 2247 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
2162 [width]"+rm"(width) // %[width] | 2248 [width]"+rm"(width) // %[width] |
2163 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] | 2249 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] |
2164 [kShuffleNV21]"m"(kShuffleNV21) | 2250 [kShuffleNV21]"m"(kShuffleNV21) |
2165 // Does not use r14. | 2251 : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. |
2166 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 2252 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
2167 ); | 2253 ); |
2168 } | 2254 } |
2169 #endif // HAS_NV21TOARGBROW_AVX2 | 2255 #endif // HAS_NV21TOARGBROW_AVX2 |
2170 | 2256 |
2171 #if defined(HAS_YUY2TOARGBROW_AVX2) | 2257 #if defined(HAS_YUY2TOARGBROW_AVX2) |
2172 // 16 pixels. | 2258 // 16 pixels. |
2173 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). | 2259 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). |
2174 void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf, | 2260 void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf, |
2175 uint8* dst_argb, | 2261 uint8* dst_argb, |
2176 const struct YuvConstants* yuvconstants, | 2262 const struct YuvConstants* yuvconstants, |
2177 int width) { | 2263 int width) { |
2178 asm volatile ( | 2264 asm volatile ( |
| 2265 YUVTORGB_SETUP_AVX2(yuvconstants) |
2179 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 2266 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
2180 LABELALIGN | 2267 LABELALIGN |
2181 "1: \n" | 2268 "1: \n" |
2182 READYUY2_AVX2 | 2269 READYUY2_AVX2 |
2183 YUVTORGB_AVX2(yuvconstants) | 2270 YUVTORGB_AVX2(yuvconstants) |
2184 STOREARGB_AVX2 | 2271 STOREARGB_AVX2 |
2185 "sub $0x10,%[width] \n" | 2272 "sub $0x10,%[width] \n" |
2186 "jg 1b \n" | 2273 "jg 1b \n" |
2187 "vzeroupper \n" | 2274 "vzeroupper \n" |
2188 : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf] | 2275 : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf] |
2189 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 2276 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
2190 [width]"+rm"(width) // %[width] | 2277 [width]"+rm"(width) // %[width] |
2191 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] | 2278 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] |
2192 [kShuffleYUY2Y]"m"(kShuffleYUY2Y), | 2279 [kShuffleYUY2Y]"m"(kShuffleYUY2Y), |
2193 [kShuffleYUY2UV]"m"(kShuffleYUY2UV) | 2280 [kShuffleYUY2UV]"m"(kShuffleYUY2UV) |
2194 // Does not use r14. | 2281 : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. |
2195 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 2282 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
2196 ); | 2283 ); |
2197 } | 2284 } |
2198 #endif // HAS_YUY2TOARGBROW_AVX2 | 2285 #endif // HAS_YUY2TOARGBROW_AVX2 |
2199 | 2286 |
2200 #if defined(HAS_UYVYTOARGBROW_AVX2) | 2287 #if defined(HAS_UYVYTOARGBROW_AVX2) |
2201 // 16 pixels. | 2288 // 16 pixels. |
2202 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes). | 2289 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes). |
2203 void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf, | 2290 void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf, |
2204 uint8* dst_argb, | 2291 uint8* dst_argb, |
2205 const struct YuvConstants* yuvconstants, | 2292 const struct YuvConstants* yuvconstants, |
2206 int width) { | 2293 int width) { |
2207 asm volatile ( | 2294 asm volatile ( |
| 2295 YUVTORGB_SETUP_AVX2(yuvconstants) |
2208 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 2296 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
2209 LABELALIGN | 2297 LABELALIGN |
2210 "1: \n" | 2298 "1: \n" |
2211 READUYVY_AVX2 | 2299 READUYVY_AVX2 |
2212 YUVTORGB_AVX2(yuvconstants) | 2300 YUVTORGB_AVX2(yuvconstants) |
2213 STOREARGB_AVX2 | 2301 STOREARGB_AVX2 |
2214 "sub $0x10,%[width] \n" | 2302 "sub $0x10,%[width] \n" |
2215 "jg 1b \n" | 2303 "jg 1b \n" |
2216 "vzeroupper \n" | 2304 "vzeroupper \n" |
2217 : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf] | 2305 : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf] |
2218 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 2306 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
2219 [width]"+rm"(width) // %[width] | 2307 [width]"+rm"(width) // %[width] |
2220 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] | 2308 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] |
2221 [kShuffleUYVYY]"m"(kShuffleUYVYY), | 2309 [kShuffleUYVYY]"m"(kShuffleUYVYY), |
2222 [kShuffleUYVYUV]"m"(kShuffleUYVYUV) | 2310 [kShuffleUYVYUV]"m"(kShuffleUYVYUV) |
2223 // Does not use r14. | 2311 : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. |
2224 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 2312 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
2225 ); | 2313 ); |
2226 } | 2314 } |
2227 #endif // HAS_UYVYTOARGBROW_AVX2 | 2315 #endif // HAS_UYVYTOARGBROW_AVX2 |
2228 | 2316 |
2229 #ifdef HAS_I400TOARGBROW_SSE2 | 2317 #ifdef HAS_I400TOARGBROW_SSE2 |
2230 void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) { | 2318 void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) { |
2231 asm volatile ( | 2319 asm volatile ( |
2232 "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 | 2320 "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 |
2233 "movd %%eax,%%xmm2 \n" | 2321 "movd %%eax,%%xmm2 \n" |
2234 "pshufd $0x0,%%xmm2,%%xmm2 \n" | 2322 "pshufd $0x0,%%xmm2,%%xmm2 \n" |
(...skipping 3093 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
5328 ); | 5416 ); |
5329 } | 5417 } |
5330 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 5418 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
5331 | 5419 |
5332 #endif // defined(__x86_64__) || defined(__i386__) | 5420 #endif // defined(__x86_64__) || defined(__i386__) |
5333 | 5421 |
5334 #ifdef __cplusplus | 5422 #ifdef __cplusplus |
5335 } // extern "C" | 5423 } // extern "C" |
5336 } // namespace libyuv | 5424 } // namespace libyuv |
5337 #endif | 5425 #endif |
OLD | NEW |