Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(238)

Side by Side Diff: source/row_gcc.cc

Issue 1407353010: YUV to RGB for x64 use registers instead of memory. (Closed) Base URL: https://chromium.googlesource.com/libyuv/libyuv@master
Patch Set: bump version Created 5 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « include/libyuv/version.h ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD | NEW
1 // VERSION 2 1 // VERSION 2
2 /* 2 /*
3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. 3 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
4 * 4 *
5 * Use of this source code is governed by a BSD-style license 5 * Use of this source code is governed by a BSD-style license
6 * that can be found in the LICENSE file in the root of the source 6 * that can be found in the LICENSE file in the root of the source
7 * tree. An additional intellectual property rights grant can be found 7 * tree. An additional intellectual property rights grant can be found
8 * in the file PATENTS. All contributing project authors may 8 * in the file PATENTS. All contributing project authors may
9 * be found in the AUTHORS file in the root of the source tree. 9 * be found in the AUTHORS file in the root of the source tree.
10 */ 10 */
(...skipping 1546 matching lines...) Expand 10 before | Expand all | Expand 10 after
1557 "lea " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf] \n" 1557 "lea " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf] \n"
1558 1558
1559 // Read 4 UYVY with 8 Y and update 4 UV to 8 UV. 1559 // Read 4 UYVY with 8 Y and update 4 UV to 8 UV.
1560 #define READUYVY \ 1560 #define READUYVY \
1561 "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm4 \n" \ 1561 "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm4 \n" \
1562 "pshufb %[kShuffleUYVYY], %%xmm4 \n" \ 1562 "pshufb %[kShuffleUYVYY], %%xmm4 \n" \
1563 "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm0 \n" \ 1563 "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm0 \n" \
1564 "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \ 1564 "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \
1565 "lea " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf] \n" 1565 "lea " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf] \n"
1566 1566
1567 #if defined(__x86_64__)
1568 #define YUVTORGB_SETUP(yuvconstants) \
1569 "movdqa " MEMACCESS([yuvconstants]) ",%%xmm8 \n" \
1570 "movdqa " MEMACCESS2(32, [yuvconstants]) ",%%xmm9 \n" \
1571 "movdqa " MEMACCESS2(64, [yuvconstants]) ",%%xmm10 \n" \
1572 "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm11 \n" \
1573 "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm12 \n" \
1574 "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm13 \n" \
1575 "movdqa " MEMACCESS2(192, [yuvconstants]) ",%%xmm14 \n"
1567 // Convert 8 pixels: 8 UV and 8 Y 1576 // Convert 8 pixels: 8 UV and 8 Y
1568 #define YUVTORGB(yuvconstants) \ 1577 #define YUVTORGB(yuvconstants) \
1569 "movdqa %%xmm0,%%xmm1 \n" \ 1578 "movdqa %%xmm0,%%xmm1 \n" \
1579 "movdqa %%xmm0,%%xmm2 \n" \
1580 "movdqa %%xmm0,%%xmm3 \n" \
1581 "movdqa %%xmm11,%%xmm0 \n" \
1582 "pmaddubsw %%xmm8,%%xmm1 \n" \
1583 "psubw %%xmm1,%%xmm0 \n" \
1584 "movdqa %%xmm12,%%xmm1 \n" \
1585 "pmaddubsw %%xmm9,%%xmm2 \n" \
1586 "psubw %%xmm2,%%xmm1 \n" \
1587 "movdqa %%xmm13,%%xmm2 \n" \
1588 "pmaddubsw %%xmm10,%%xmm3 \n" \
1589 "psubw %%xmm3,%%xmm2 \n" \
1590 "pmulhuw %%xmm14,%%xmm4 \n" \
1591 "paddsw %%xmm4,%%xmm0 \n" \
1592 "paddsw %%xmm4,%%xmm1 \n" \
1593 "paddsw %%xmm4,%%xmm2 \n" \
1594 "psraw $0x6,%%xmm0 \n" \
1595 "psraw $0x6,%%xmm1 \n" \
1596 "psraw $0x6,%%xmm2 \n" \
1597 "packuswb %%xmm0,%%xmm0 \n" \
1598 "packuswb %%xmm1,%%xmm1 \n" \
1599 "packuswb %%xmm2,%%xmm2 \n"
1600 #define YUVTORGB_REGS \
1601 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
1602
1603 #else
1604 #define YUVTORGB_SETUP(yuvconstants)
1605 // Convert 8 pixels: 8 UV and 8 Y
1606 #define YUVTORGB(yuvconstants) \
1607 "movdqa %%xmm0,%%xmm1 \n" \
1570 "movdqa %%xmm0,%%xmm2 \n" \ 1608 "movdqa %%xmm0,%%xmm2 \n" \
1571 "movdqa %%xmm0,%%xmm3 \n" \ 1609 "movdqa %%xmm0,%%xmm3 \n" \
1572 "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm0 \n" \ 1610 "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm0 \n" \
1573 "pmaddubsw " MEMACCESS([yuvconstants]) ",%%xmm1 \n" \ 1611 "pmaddubsw " MEMACCESS([yuvconstants]) ",%%xmm1 \n" \
1574 "psubw %%xmm1,%%xmm0 \n" \ 1612 "psubw %%xmm1,%%xmm0 \n" \
1575 "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm1 \n" \ 1613 "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm1 \n" \
1576 "pmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%xmm2 \n" \ 1614 "pmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%xmm2 \n" \
1577 "psubw %%xmm2,%%xmm1 \n" \ 1615 "psubw %%xmm2,%%xmm1 \n" \
1578 "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm2 \n" \ 1616 "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm2 \n" \
1579 "pmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%xmm3 \n" \ 1617 "pmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%xmm3 \n" \
1580 "psubw %%xmm3,%%xmm2 \n" \ 1618 "psubw %%xmm3,%%xmm2 \n" \
1581 "pmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%xmm4 \n" \ 1619 "pmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%xmm4 \n" \
1582 "paddsw %%xmm4,%%xmm0 \n" \ 1620 "paddsw %%xmm4,%%xmm0 \n" \
1583 "paddsw %%xmm4,%%xmm1 \n" \ 1621 "paddsw %%xmm4,%%xmm1 \n" \
1584 "paddsw %%xmm4,%%xmm2 \n" \ 1622 "paddsw %%xmm4,%%xmm2 \n" \
1585 "psraw $0x6,%%xmm0 \n" \ 1623 "psraw $0x6,%%xmm0 \n" \
1586 "psraw $0x6,%%xmm1 \n" \ 1624 "psraw $0x6,%%xmm1 \n" \
1587 "psraw $0x6,%%xmm2 \n" \ 1625 "psraw $0x6,%%xmm2 \n" \
1588 "packuswb %%xmm0,%%xmm0 \n" \ 1626 "packuswb %%xmm0,%%xmm0 \n" \
1589 "packuswb %%xmm1,%%xmm1 \n" \ 1627 "packuswb %%xmm1,%%xmm1 \n" \
1590 "packuswb %%xmm2,%%xmm2 \n" 1628 "packuswb %%xmm2,%%xmm2 \n"
1629 #define YUVTORGB_REGS
1630 #endif
1591 1631
1592 // Store 8 ARGB values. 1632 // Store 8 ARGB values.
1593 #define STOREARGB \ 1633 #define STOREARGB \
1594 "punpcklbw %%xmm1,%%xmm0 \n" \ 1634 "punpcklbw %%xmm1,%%xmm0 \n" \
1595 "punpcklbw %%xmm5,%%xmm2 \n" \ 1635 "punpcklbw %%xmm5,%%xmm2 \n" \
1596 "movdqa %%xmm0,%%xmm1 \n" \ 1636 "movdqa %%xmm0,%%xmm1 \n" \
1597 "punpcklwd %%xmm2,%%xmm0 \n" \ 1637 "punpcklwd %%xmm2,%%xmm0 \n" \
1598 "punpckhwd %%xmm2,%%xmm1 \n" \ 1638 "punpckhwd %%xmm2,%%xmm1 \n" \
1599 "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \ 1639 "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \
1600 "movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \ 1640 "movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \
(...skipping 11 matching lines...) Expand all
1612 "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \ 1652 "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \
1613 "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n" 1653 "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n"
1614 1654
1615 void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, 1655 void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
1616 const uint8* u_buf, 1656 const uint8* u_buf,
1617 const uint8* v_buf, 1657 const uint8* v_buf,
1618 uint8* dst_argb, 1658 uint8* dst_argb,
1619 const struct YuvConstants* yuvconstants, 1659 const struct YuvConstants* yuvconstants,
1620 int width) { 1660 int width) {
1621 asm volatile ( 1661 asm volatile (
1662 YUVTORGB_SETUP(yuvconstants)
1622 "sub %[u_buf],%[v_buf] \n" 1663 "sub %[u_buf],%[v_buf] \n"
1623 "pcmpeqb %%xmm5,%%xmm5 \n" 1664 "pcmpeqb %%xmm5,%%xmm5 \n"
1624 LABELALIGN 1665 LABELALIGN
1625 "1: \n" 1666 "1: \n"
1626 READYUV444 1667 READYUV444
1627 YUVTORGB(yuvconstants) 1668 YUVTORGB(yuvconstants)
1628 STOREARGB 1669 STOREARGB
1629 "sub $0x8,%[width] \n" 1670 "sub $0x8,%[width] \n"
1630 "jg 1b \n" 1671 "jg 1b \n"
1631 : [y_buf]"+r"(y_buf), // %[y_buf] 1672 : [y_buf]"+r"(y_buf), // %[y_buf]
1632 [u_buf]"+r"(u_buf), // %[u_buf] 1673 [u_buf]"+r"(u_buf), // %[u_buf]
1633 [v_buf]"+r"(v_buf), // %[v_buf] 1674 [v_buf]"+r"(v_buf), // %[v_buf]
1634 [dst_argb]"+r"(dst_argb), // %[dst_argb] 1675 [dst_argb]"+r"(dst_argb), // %[dst_argb]
1635 [width]"+rm"(width) // %[width] 1676 [width]"+rm"(width) // %[width]
1636 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] 1677 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
1637 : "memory", "cc", NACL_R14 1678 : "memory", "cc", NACL_R14 YUVTORGB_REGS
1638 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1679 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1639 ); 1680 );
1640 } 1681 }
1641 1682
1642 void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, 1683 void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
1643 const uint8* u_buf, 1684 const uint8* u_buf,
1644 const uint8* v_buf, 1685 const uint8* v_buf,
1645 uint8* dst_rgb24, 1686 uint8* dst_rgb24,
1646 const struct YuvConstants* yuvconstants, 1687 const struct YuvConstants* yuvconstants,
1647 int width) { 1688 int width) {
1648 asm volatile ( 1689 asm volatile (
1690 YUVTORGB_SETUP(yuvconstants)
1649 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" 1691 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
1650 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" 1692 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
1651 "sub %[u_buf],%[v_buf] \n" 1693 "sub %[u_buf],%[v_buf] \n"
1652 LABELALIGN 1694 LABELALIGN
1653 "1: \n" 1695 "1: \n"
1654 READYUV422 1696 READYUV422
1655 YUVTORGB(yuvconstants) 1697 YUVTORGB(yuvconstants)
1656 "punpcklbw %%xmm1,%%xmm0 \n" 1698 "punpcklbw %%xmm1,%%xmm0 \n"
1657 "punpcklbw %%xmm2,%%xmm2 \n" 1699 "punpcklbw %%xmm2,%%xmm2 \n"
1658 "movdqa %%xmm0,%%xmm1 \n" 1700 "movdqa %%xmm0,%%xmm1 \n"
(...skipping 12 matching lines...) Expand all
1671 [v_buf]"+r"(v_buf), // %[v_buf] 1713 [v_buf]"+r"(v_buf), // %[v_buf]
1672 [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24] 1714 [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24]
1673 #if defined(__i386__) && defined(__pic__) 1715 #if defined(__i386__) && defined(__pic__)
1674 [width]"+m"(width) // %[width] 1716 [width]"+m"(width) // %[width]
1675 #else 1717 #else
1676 [width]"+rm"(width) // %[width] 1718 [width]"+rm"(width) // %[width]
1677 #endif 1719 #endif
1678 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] 1720 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
1679 [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0), 1721 [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
1680 [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24) 1722 [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
1681 : "memory", "cc", NACL_R14 1723 : "memory", "cc", NACL_R14 YUVTORGB_REGS
1682 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" 1724 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
1683 ); 1725 );
1684 } 1726 }
1685 1727
1686 void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, 1728 void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
1687 const uint8* u_buf, 1729 const uint8* u_buf,
1688 const uint8* v_buf, 1730 const uint8* v_buf,
1689 uint8* dst_argb, 1731 uint8* dst_argb,
1690 const struct YuvConstants* yuvconstants, 1732 const struct YuvConstants* yuvconstants,
1691 int width) { 1733 int width) {
1692 asm volatile ( 1734 asm volatile (
1735 YUVTORGB_SETUP(yuvconstants)
1693 "sub %[u_buf],%[v_buf] \n" 1736 "sub %[u_buf],%[v_buf] \n"
1694 "pcmpeqb %%xmm5,%%xmm5 \n" 1737 "pcmpeqb %%xmm5,%%xmm5 \n"
1695 LABELALIGN 1738 LABELALIGN
1696 "1: \n" 1739 "1: \n"
1697 READYUV422 1740 READYUV422
1698 YUVTORGB(yuvconstants) 1741 YUVTORGB(yuvconstants)
1699 STOREARGB 1742 STOREARGB
1700 "sub $0x8,%[width] \n" 1743 "sub $0x8,%[width] \n"
1701 "jg 1b \n" 1744 "jg 1b \n"
1702 : [y_buf]"+r"(y_buf), // %[y_buf] 1745 : [y_buf]"+r"(y_buf), // %[y_buf]
1703 [u_buf]"+r"(u_buf), // %[u_buf] 1746 [u_buf]"+r"(u_buf), // %[u_buf]
1704 [v_buf]"+r"(v_buf), // %[v_buf] 1747 [v_buf]"+r"(v_buf), // %[v_buf]
1705 [dst_argb]"+r"(dst_argb), // %[dst_argb] 1748 [dst_argb]"+r"(dst_argb), // %[dst_argb]
1706 [width]"+rm"(width) // %[width] 1749 [width]"+rm"(width) // %[width]
1707 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] 1750 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
1708 : "memory", "cc", NACL_R14 1751 : "memory", "cc", NACL_R14 YUVTORGB_REGS
1709 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1752 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1710 ); 1753 );
1711 } 1754 }
1712 1755
1713 #ifdef HAS_I422ALPHATOARGBROW_SSSE3 1756 #ifdef HAS_I422ALPHATOARGBROW_SSSE3
1714 void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf, 1757 void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
1715 const uint8* u_buf, 1758 const uint8* u_buf,
1716 const uint8* v_buf, 1759 const uint8* v_buf,
1717 const uint8* a_buf, 1760 const uint8* a_buf,
1718 uint8* dst_argb, 1761 uint8* dst_argb,
1719 const struct YuvConstants* yuvconstants, 1762 const struct YuvConstants* yuvconstants,
1720 int width) { 1763 int width) {
1721 asm volatile ( 1764 asm volatile (
1765 YUVTORGB_SETUP(yuvconstants)
1722 "sub %[u_buf],%[v_buf] \n" 1766 "sub %[u_buf],%[v_buf] \n"
1723 LABELALIGN 1767 LABELALIGN
1724 "1: \n" 1768 "1: \n"
1725 READYUVA422 1769 READYUVA422
1726 YUVTORGB(yuvconstants) 1770 YUVTORGB(yuvconstants)
1727 STOREARGB 1771 STOREARGB
1728 "subl $0x8,%[width] \n" 1772 "subl $0x8,%[width] \n"
1729 "jg 1b \n" 1773 "jg 1b \n"
1730 : [y_buf]"+r"(y_buf), // %[y_buf] 1774 : [y_buf]"+r"(y_buf), // %[y_buf]
1731 [u_buf]"+r"(u_buf), // %[u_buf] 1775 [u_buf]"+r"(u_buf), // %[u_buf]
1732 [v_buf]"+r"(v_buf), // %[v_buf] 1776 [v_buf]"+r"(v_buf), // %[v_buf]
1733 [a_buf]"+r"(a_buf), // %[a_buf] 1777 [a_buf]"+r"(a_buf), // %[a_buf]
1734 [dst_argb]"+r"(dst_argb), // %[dst_argb] 1778 [dst_argb]"+r"(dst_argb), // %[dst_argb]
1735 #if defined(__i386__) && defined(__pic__) 1779 #if defined(__i386__) && defined(__pic__)
1736 [width]"+m"(width) // %[width] 1780 [width]"+m"(width) // %[width]
1737 #else 1781 #else
1738 [width]"+rm"(width) // %[width] 1782 [width]"+rm"(width) // %[width]
1739 #endif 1783 #endif
1740 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] 1784 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
1741 : "memory", "cc", NACL_R14 1785 : "memory", "cc", NACL_R14 YUVTORGB_REGS
1742 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1786 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1743 ); 1787 );
1744 } 1788 }
1745 #endif // HAS_I422ALPHATOARGBROW_SSSE3 1789 #endif // HAS_I422ALPHATOARGBROW_SSSE3
1746 1790
1747 void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, 1791 void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
1748 const uint8* u_buf, 1792 const uint8* u_buf,
1749 const uint8* v_buf, 1793 const uint8* v_buf,
1750 uint8* dst_argb, 1794 uint8* dst_argb,
1751 const struct YuvConstants* yuvconstants, 1795 const struct YuvConstants* yuvconstants,
1752 int width) { 1796 int width) {
1753 asm volatile ( 1797 asm volatile (
1798 YUVTORGB_SETUP(yuvconstants)
1754 "sub %[u_buf],%[v_buf] \n" 1799 "sub %[u_buf],%[v_buf] \n"
1755 "pcmpeqb %%xmm5,%%xmm5 \n" 1800 "pcmpeqb %%xmm5,%%xmm5 \n"
1756 LABELALIGN 1801 LABELALIGN
1757 "1: \n" 1802 "1: \n"
1758 READYUV411 1803 READYUV411
1759 YUVTORGB(yuvconstants) 1804 YUVTORGB(yuvconstants)
1760 STOREARGB 1805 STOREARGB
1761 "sub $0x8,%[width] \n" 1806 "sub $0x8,%[width] \n"
1762 "jg 1b \n" 1807 "jg 1b \n"
1763 : [y_buf]"+r"(y_buf), // %[y_buf] 1808 : [y_buf]"+r"(y_buf), // %[y_buf]
1764 [u_buf]"+r"(u_buf), // %[u_buf] 1809 [u_buf]"+r"(u_buf), // %[u_buf]
1765 [v_buf]"+r"(v_buf), // %[v_buf] 1810 [v_buf]"+r"(v_buf), // %[v_buf]
1766 [dst_argb]"+r"(dst_argb), // %[dst_argb] 1811 [dst_argb]"+r"(dst_argb), // %[dst_argb]
1767 [width]"+rm"(width) // %[width] 1812 [width]"+rm"(width) // %[width]
1768 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] 1813 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
1769 : "memory", "cc", NACL_R14 1814 : "memory", "cc", NACL_R14 YUVTORGB_REGS
1770 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1815 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1771 ); 1816 );
1772 } 1817 }
1773 1818
1774 void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, 1819 void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
1775 const uint8* uv_buf, 1820 const uint8* uv_buf,
1776 uint8* dst_argb, 1821 uint8* dst_argb,
1777 const struct YuvConstants* yuvconstants, 1822 const struct YuvConstants* yuvconstants,
1778 int width) { 1823 int width) {
1779 asm volatile ( 1824 asm volatile (
1825 YUVTORGB_SETUP(yuvconstants)
1780 "pcmpeqb %%xmm5,%%xmm5 \n" 1826 "pcmpeqb %%xmm5,%%xmm5 \n"
1781 LABELALIGN 1827 LABELALIGN
1782 "1: \n" 1828 "1: \n"
1783 READNV12 1829 READNV12
1784 YUVTORGB(yuvconstants) 1830 YUVTORGB(yuvconstants)
1785 STOREARGB 1831 STOREARGB
1786 "sub $0x8,%[width] \n" 1832 "sub $0x8,%[width] \n"
1787 "jg 1b \n" 1833 "jg 1b \n"
1788 : [y_buf]"+r"(y_buf), // %[y_buf] 1834 : [y_buf]"+r"(y_buf), // %[y_buf]
1789 [uv_buf]"+r"(uv_buf), // %[uv_buf] 1835 [uv_buf]"+r"(uv_buf), // %[uv_buf]
1790 [dst_argb]"+r"(dst_argb), // %[dst_argb] 1836 [dst_argb]"+r"(dst_argb), // %[dst_argb]
1791 [width]"+rm"(width) // %[width] 1837 [width]"+rm"(width) // %[width]
1792 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] 1838 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
1793 // Does not use r14. 1839 : "memory", "cc", YUVTORGB_REGS // Does not use r14.
1794 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1840 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1795 ); 1841 );
1796 } 1842 }
1797 1843
1798 void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf, 1844 void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
1799 const uint8* vu_buf, 1845 const uint8* vu_buf,
1800 uint8* dst_argb, 1846 uint8* dst_argb,
1801 const struct YuvConstants* yuvconstants, 1847 const struct YuvConstants* yuvconstants,
1802 int width) { 1848 int width) {
1803 asm volatile ( 1849 asm volatile (
1850 YUVTORGB_SETUP(yuvconstants)
1804 "pcmpeqb %%xmm5,%%xmm5 \n" 1851 "pcmpeqb %%xmm5,%%xmm5 \n"
1805 LABELALIGN 1852 LABELALIGN
1806 "1: \n" 1853 "1: \n"
1807 READNV21 1854 READNV21
1808 YUVTORGB(yuvconstants) 1855 YUVTORGB(yuvconstants)
1809 STOREARGB 1856 STOREARGB
1810 "sub $0x8,%[width] \n" 1857 "sub $0x8,%[width] \n"
1811 "jg 1b \n" 1858 "jg 1b \n"
1812 : [y_buf]"+r"(y_buf), // %[y_buf] 1859 : [y_buf]"+r"(y_buf), // %[y_buf]
1813 [vu_buf]"+r"(vu_buf), // %[vu_buf] 1860 [vu_buf]"+r"(vu_buf), // %[vu_buf]
1814 [dst_argb]"+r"(dst_argb), // %[dst_argb] 1861 [dst_argb]"+r"(dst_argb), // %[dst_argb]
1815 [width]"+rm"(width) // %[width] 1862 [width]"+rm"(width) // %[width]
1816 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] 1863 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
1817 [kShuffleNV21]"m"(kShuffleNV21) 1864 [kShuffleNV21]"m"(kShuffleNV21)
1818 // Does not use r14. 1865 : "memory", "cc", YUVTORGB_REGS // Does not use r14.
1819 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1866 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1820 ); 1867 );
1821 } 1868 }
1822 1869
1823 void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf, 1870 void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf,
1824 uint8* dst_argb, 1871 uint8* dst_argb,
1825 const struct YuvConstants* yuvconstants, 1872 const struct YuvConstants* yuvconstants,
1826 int width) { 1873 int width) {
1827 asm volatile ( 1874 asm volatile (
1875 YUVTORGB_SETUP(yuvconstants)
1828 "pcmpeqb %%xmm5,%%xmm5 \n" 1876 "pcmpeqb %%xmm5,%%xmm5 \n"
1829 LABELALIGN 1877 LABELALIGN
1830 "1: \n" 1878 "1: \n"
1831 READYUY2 1879 READYUY2
1832 YUVTORGB(yuvconstants) 1880 YUVTORGB(yuvconstants)
1833 STOREARGB 1881 STOREARGB
1834 "sub $0x8,%[width] \n" 1882 "sub $0x8,%[width] \n"
1835 "jg 1b \n" 1883 "jg 1b \n"
1836 : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf] 1884 : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
1837 [dst_argb]"+r"(dst_argb), // %[dst_argb] 1885 [dst_argb]"+r"(dst_argb), // %[dst_argb]
1838 [width]"+rm"(width) // %[width] 1886 [width]"+rm"(width) // %[width]
1839 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] 1887 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
1840 [kShuffleYUY2Y]"m"(kShuffleYUY2Y), 1888 [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
1841 [kShuffleYUY2UV]"m"(kShuffleYUY2UV) 1889 [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
1842 // Does not use r14. 1890 : "memory", "cc", YUVTORGB_REGS // Does not use r14.
1843 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1891 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1844 ); 1892 );
1845 } 1893 }
1846 1894
1847 void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf, 1895 void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf,
1848 uint8* dst_argb, 1896 uint8* dst_argb,
1849 const struct YuvConstants* yuvconstants, 1897 const struct YuvConstants* yuvconstants,
1850 int width) { 1898 int width) {
1851 asm volatile ( 1899 asm volatile (
1900 YUVTORGB_SETUP(yuvconstants)
1852 "pcmpeqb %%xmm5,%%xmm5 \n" 1901 "pcmpeqb %%xmm5,%%xmm5 \n"
1853 LABELALIGN 1902 LABELALIGN
1854 "1: \n" 1903 "1: \n"
1855 READUYVY 1904 READUYVY
1856 YUVTORGB(yuvconstants) 1905 YUVTORGB(yuvconstants)
1857 STOREARGB 1906 STOREARGB
1858 "sub $0x8,%[width] \n" 1907 "sub $0x8,%[width] \n"
1859 "jg 1b \n" 1908 "jg 1b \n"
1860 : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf] 1909 : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
1861 [dst_argb]"+r"(dst_argb), // %[dst_argb] 1910 [dst_argb]"+r"(dst_argb), // %[dst_argb]
1862 [width]"+rm"(width) // %[width] 1911 [width]"+rm"(width) // %[width]
1863 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] 1912 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
1864 [kShuffleUYVYY]"m"(kShuffleUYVYY), 1913 [kShuffleUYVYY]"m"(kShuffleUYVYY),
1865 [kShuffleUYVYUV]"m"(kShuffleUYVYUV) 1914 [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
1866 // Does not use r14. 1915 : "memory", "cc", YUVTORGB_REGS // Does not use r14.
1867 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1916 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1868 ); 1917 );
1869 } 1918 }
1870 1919
1871 void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, 1920 void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
1872 const uint8* u_buf, 1921 const uint8* u_buf,
1873 const uint8* v_buf, 1922 const uint8* v_buf,
1874 uint8* dst_rgba, 1923 uint8* dst_rgba,
1875 const struct YuvConstants* yuvconstants, 1924 const struct YuvConstants* yuvconstants,
1876 int width) { 1925 int width) {
1877 asm volatile ( 1926 asm volatile (
1927 YUVTORGB_SETUP(yuvconstants)
1878 "sub %[u_buf],%[v_buf] \n" 1928 "sub %[u_buf],%[v_buf] \n"
1879 "pcmpeqb %%xmm5,%%xmm5 \n" 1929 "pcmpeqb %%xmm5,%%xmm5 \n"
1880 LABELALIGN 1930 LABELALIGN
1881 "1: \n" 1931 "1: \n"
1882 READYUV422 1932 READYUV422
1883 YUVTORGB(yuvconstants) 1933 YUVTORGB(yuvconstants)
1884 STORERGBA 1934 STORERGBA
1885 "sub $0x8,%[width] \n" 1935 "sub $0x8,%[width] \n"
1886 "jg 1b \n" 1936 "jg 1b \n"
1887 : [y_buf]"+r"(y_buf), // %[y_buf] 1937 : [y_buf]"+r"(y_buf), // %[y_buf]
1888 [u_buf]"+r"(u_buf), // %[u_buf] 1938 [u_buf]"+r"(u_buf), // %[u_buf]
1889 [v_buf]"+r"(v_buf), // %[v_buf] 1939 [v_buf]"+r"(v_buf), // %[v_buf]
1890 [dst_rgba]"+r"(dst_rgba), // %[dst_rgba] 1940 [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
1891 [width]"+rm"(width) // %[width] 1941 [width]"+rm"(width) // %[width]
1892 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] 1942 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
1893 : "memory", "cc", NACL_R14 1943 : "memory", "cc", NACL_R14 YUVTORGB_REGS
1894 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1944 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1895 ); 1945 );
1896 } 1946 }
1897 1947
1898 #endif // HAS_I422TOARGBROW_SSSE3 1948 #endif // HAS_I422TOARGBROW_SSSE3
1899 1949
1900 // Read 8 UV from 422, upsample to 16 UV. 1950 // Read 8 UV from 422, upsample to 16 UV.
1901 #define READYUV422_AVX2 \ 1951 #define READYUV422_AVX2 \
1902 "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ 1952 "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
1903 MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \ 1953 MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \
(...skipping 53 matching lines...) Expand 10 before | Expand all | Expand 10 after
1957 "lea " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf] \n" 2007 "lea " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf] \n"
1958 2008
1959 // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV. 2009 // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
1960 #define READUYVY_AVX2 \ 2010 #define READUYVY_AVX2 \
1961 "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm4 \n" \ 2011 "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm4 \n" \
1962 "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \ 2012 "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \
1963 "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm0 \n" \ 2013 "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm0 \n" \
1964 "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \ 2014 "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \
1965 "lea " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf] \n" 2015 "lea " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf] \n"
1966 2016
1967 // Convert 16 pixels: 16 UV and 16 Y. 2017 #if defined(__x86_64__)
2018 #define YUVTORGB_SETUP_AVX2(yuvconstants) \
2019 "vmovdqa " MEMACCESS([yuvconstants]) ",%%ymm8 \n" \
2020 "vmovdqa " MEMACCESS2(32, [yuvconstants]) ",%%ymm9 \n" \
2021 "vmovdqa " MEMACCESS2(64, [yuvconstants]) ",%%ymm10 \n" \
2022 "vmovdqa " MEMACCESS2(96, [yuvconstants]) ",%%ymm11 \n" \
2023 "vmovdqa " MEMACCESS2(128, [yuvconstants]) ",%%ymm12 \n" \
2024 "vmovdqa " MEMACCESS2(160, [yuvconstants]) ",%%ymm13 \n" \
2025 "vmovdqa " MEMACCESS2(192, [yuvconstants]) ",%%ymm14 \n"
2026 #define YUVTORGB_AVX2(yuvconstants) \
2027 "vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \
2028 "vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \
2029 "vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \
2030 "vpsubw %%ymm2,%%ymm13,%%ymm2 \n" \
2031 "vpsubw %%ymm1,%%ymm12,%%ymm1 \n" \
2032 "vpsubw %%ymm0,%%ymm11,%%ymm0 \n" \
2033 "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \
2034 "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
2035 "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
2036 "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \
2037 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
2038 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
2039 "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
2040 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
2041 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
2042 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
2043 #define YUVTORGB_REGS_AVX2 \
2044 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
2045 #else// Convert 16 pixels: 16 UV and 16 Y.
2046 #define YUVTORGB_SETUP_AVX2(yuvconstants)
1968 #define YUVTORGB_AVX2(yuvconstants) \ 2047 #define YUVTORGB_AVX2(yuvconstants) \
1969 "vpmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2 \n" \ 2048 "vpmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2 \n" \
1970 "vpmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%ymm0,%%ymm1 \n" \ 2049 "vpmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%ymm0,%%ymm1 \n" \
1971 "vpmaddubsw " MEMACCESS([yuvconstants]) ",%%ymm0,%%ymm0 \n" \ 2050 "vpmaddubsw " MEMACCESS([yuvconstants]) ",%%ymm0,%%ymm0 \n" \
1972 "vmovdqu " MEMACCESS2(160, [yuvconstants]) ",%%ymm3 \n" \ 2051 "vmovdqu " MEMACCESS2(160, [yuvconstants]) ",%%ymm3 \n" \
1973 "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \ 2052 "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \
1974 "vmovdqu " MEMACCESS2(128, [yuvconstants]) ",%%ymm3 \n" \ 2053 "vmovdqu " MEMACCESS2(128, [yuvconstants]) ",%%ymm3 \n" \
1975 "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \ 2054 "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \
1976 "vmovdqu " MEMACCESS2(96, [yuvconstants]) ",%%ymm3 \n" \ 2055 "vmovdqu " MEMACCESS2(96, [yuvconstants]) ",%%ymm3 \n" \
1977 "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \ 2056 "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \
1978 "vpmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%ymm4,%%ymm4 \n" \ 2057 "vpmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%ymm4,%%ymm4 \n" \
1979 "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ 2058 "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
1980 "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ 2059 "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
1981 "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \ 2060 "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \
1982 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ 2061 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
1983 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ 2062 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
1984 "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ 2063 "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
1985 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ 2064 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
1986 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ 2065 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
1987 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" 2066 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
2067 #define YUVTORGB_REGS_AVX2
2068 #endif
1988 2069
1989 // Store 16 ARGB values. 2070 // Store 16 ARGB values.
1990 #define STOREARGB_AVX2 \ 2071 #define STOREARGB_AVX2 \
1991 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ 2072 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
1992 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ 2073 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
1993 "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \ 2074 "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \
1994 "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ 2075 "vpermq $0xd8,%%ymm2,%%ymm2 \n" \
1995 "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \ 2076 "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \
1996 "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \ 2077 "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \
1997 "vmovdqu %%ymm1," MEMACCESS([dst_argb]) " \n" \ 2078 "vmovdqu %%ymm1," MEMACCESS([dst_argb]) " \n" \
1998 "vmovdqu %%ymm0," MEMACCESS2(0x20, [dst_argb]) " \n" \ 2079 "vmovdqu %%ymm0," MEMACCESS2(0x20, [dst_argb]) " \n" \
1999 "lea " MEMLEA(0x40, [dst_argb]) ", %[dst_argb] \n" 2080 "lea " MEMLEA(0x40, [dst_argb]) ", %[dst_argb] \n"
2000 2081
2001 #if defined(HAS_I422TOARGBROW_AVX2) 2082 #if defined(HAS_I422TOARGBROW_AVX2)
2002 // 16 pixels 2083 // 16 pixels
2003 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 2084 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2004 void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf, 2085 void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
2005 const uint8* u_buf, 2086 const uint8* u_buf,
2006 const uint8* v_buf, 2087 const uint8* v_buf,
2007 uint8* dst_argb, 2088 uint8* dst_argb,
2008 const struct YuvConstants* yuvconstants, 2089 const struct YuvConstants* yuvconstants,
2009 int width) { 2090 int width) {
2010 asm volatile ( 2091 asm volatile (
2092 YUVTORGB_SETUP_AVX2(yuvconstants)
2011 "sub %[u_buf],%[v_buf] \n" 2093 "sub %[u_buf],%[v_buf] \n"
2012 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 2094 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2013 LABELALIGN 2095 LABELALIGN
2014 "1: \n" 2096 "1: \n"
2015 READYUV422_AVX2 2097 READYUV422_AVX2
2016 YUVTORGB_AVX2(yuvconstants) 2098 YUVTORGB_AVX2(yuvconstants)
2017 STOREARGB_AVX2 2099 STOREARGB_AVX2
2018 "sub $0x10,%[width] \n" 2100 "sub $0x10,%[width] \n"
2019 "jg 1b \n" 2101 "jg 1b \n"
2020 "vzeroupper \n" 2102 "vzeroupper \n"
2021 : [y_buf]"+r"(y_buf), // %[y_buf] 2103 : [y_buf]"+r"(y_buf), // %[y_buf]
2022 [u_buf]"+r"(u_buf), // %[u_buf] 2104 [u_buf]"+r"(u_buf), // %[u_buf]
2023 [v_buf]"+r"(v_buf), // %[v_buf] 2105 [v_buf]"+r"(v_buf), // %[v_buf]
2024 [dst_argb]"+r"(dst_argb), // %[dst_argb] 2106 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2025 [width]"+rm"(width) // %[width] 2107 [width]"+rm"(width) // %[width]
2026 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] 2108 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2027 : "memory", "cc", NACL_R14 2109 : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
2028 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 2110 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2029 ); 2111 );
2030 } 2112 }
2031 #endif // HAS_I422TOARGBROW_AVX2 2113 #endif // HAS_I422TOARGBROW_AVX2
2032 2114
2033 #if defined(HAS_I422ALPHATOARGBROW_AVX2) 2115 #if defined(HAS_I422ALPHATOARGBROW_AVX2)
2034 // 16 pixels 2116 // 16 pixels
2035 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB. 2117 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
2036 void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf, 2118 void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf,
2037 const uint8* u_buf, 2119 const uint8* u_buf,
2038 const uint8* v_buf, 2120 const uint8* v_buf,
2039 const uint8* a_buf, 2121 const uint8* a_buf,
2040 uint8* dst_argb, 2122 uint8* dst_argb,
2041 const struct YuvConstants* yuvconstants, 2123 const struct YuvConstants* yuvconstants,
2042 int width) { 2124 int width) {
2043 asm volatile ( 2125 asm volatile (
2126 YUVTORGB_SETUP_AVX2(yuvconstants)
2044 "sub %[u_buf],%[v_buf] \n" 2127 "sub %[u_buf],%[v_buf] \n"
2045 LABELALIGN 2128 LABELALIGN
2046 "1: \n" 2129 "1: \n"
2047 READYUVA422_AVX2 2130 READYUVA422_AVX2
2048 YUVTORGB_AVX2(yuvconstants) 2131 YUVTORGB_AVX2(yuvconstants)
2049 STOREARGB_AVX2 2132 STOREARGB_AVX2
2050 "subl $0x10,%[width] \n" 2133 "subl $0x10,%[width] \n"
2051 "jg 1b \n" 2134 "jg 1b \n"
2052 "vzeroupper \n" 2135 "vzeroupper \n"
2053 : [y_buf]"+r"(y_buf), // %[y_buf] 2136 : [y_buf]"+r"(y_buf), // %[y_buf]
2054 [u_buf]"+r"(u_buf), // %[u_buf] 2137 [u_buf]"+r"(u_buf), // %[u_buf]
2055 [v_buf]"+r"(v_buf), // %[v_buf] 2138 [v_buf]"+r"(v_buf), // %[v_buf]
2056 [a_buf]"+r"(a_buf), // %[a_buf] 2139 [a_buf]"+r"(a_buf), // %[a_buf]
2057 [dst_argb]"+r"(dst_argb), // %[dst_argb] 2140 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2058 #if defined(__i386__) && defined(__pic__) 2141 #if defined(__i386__) && defined(__pic__)
2059 [width]"+m"(width) // %[width] 2142 [width]"+m"(width) // %[width]
2060 #else 2143 #else
2061 [width]"+rm"(width) // %[width] 2144 [width]"+rm"(width) // %[width]
2062 #endif 2145 #endif
2063 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] 2146 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2064 : "memory", "cc", NACL_R14 2147 : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
2065 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 2148 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2066 ); 2149 );
2067 } 2150 }
2068 #endif // HAS_I422ALPHATOARGBROW_AVX2 2151 #endif // HAS_I422ALPHATOARGBROW_AVX2
2069 2152
2070 #if defined(HAS_I422TORGBAROW_AVX2) 2153 #if defined(HAS_I422TORGBAROW_AVX2)
2071 // 16 pixels 2154 // 16 pixels
2072 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). 2155 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
2073 void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf, 2156 void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
2074 const uint8* u_buf, 2157 const uint8* u_buf,
2075 const uint8* v_buf, 2158 const uint8* v_buf,
2076 uint8* dst_argb, 2159 uint8* dst_argb,
2077 const struct YuvConstants* yuvconstants, 2160 const struct YuvConstants* yuvconstants,
2078 int width) { 2161 int width) {
2079 asm volatile ( 2162 asm volatile (
2163 YUVTORGB_SETUP_AVX2(yuvconstants)
2080 "sub %[u_buf],%[v_buf] \n" 2164 "sub %[u_buf],%[v_buf] \n"
2081 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 2165 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2082 LABELALIGN 2166 LABELALIGN
2083 "1: \n" 2167 "1: \n"
2084 READYUV422_AVX2 2168 READYUV422_AVX2
2085 YUVTORGB_AVX2(yuvconstants) 2169 YUVTORGB_AVX2(yuvconstants)
2086 2170
2087 // Step 3: Weave into RGBA 2171 // Step 3: Weave into RGBA
2088 "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" 2172 "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
2089 "vpermq $0xd8,%%ymm1,%%ymm1 \n" 2173 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
2090 "vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n" 2174 "vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n"
2091 "vpermq $0xd8,%%ymm2,%%ymm2 \n" 2175 "vpermq $0xd8,%%ymm2,%%ymm2 \n"
2092 "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" 2176 "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n"
2093 "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n" 2177 "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n"
2094 "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n" 2178 "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n"
2095 "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n" 2179 "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
2096 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" 2180 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
2097 "sub $0x10,%[width] \n" 2181 "sub $0x10,%[width] \n"
2098 "jg 1b \n" 2182 "jg 1b \n"
2099 "vzeroupper \n" 2183 "vzeroupper \n"
2100 : [y_buf]"+r"(y_buf), // %[y_buf] 2184 : [y_buf]"+r"(y_buf), // %[y_buf]
2101 [u_buf]"+r"(u_buf), // %[u_buf] 2185 [u_buf]"+r"(u_buf), // %[u_buf]
2102 [v_buf]"+r"(v_buf), // %[v_buf] 2186 [v_buf]"+r"(v_buf), // %[v_buf]
2103 [dst_argb]"+r"(dst_argb), // %[dst_argb] 2187 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2104 [width]"+rm"(width) // %[width] 2188 [width]"+rm"(width) // %[width]
2105 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] 2189 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2106 : "memory", "cc", NACL_R14 2190 : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
2107 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 2191 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2108 ); 2192 );
2109 } 2193 }
2110 #endif // HAS_I422TORGBAROW_AVX2 2194 #endif // HAS_I422TORGBAROW_AVX2
2111 2195
2112 #if defined(HAS_NV12TOARGBROW_AVX2) 2196 #if defined(HAS_NV12TOARGBROW_AVX2)
2113 // 16 pixels. 2197 // 16 pixels.
2114 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 2198 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2115 void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf, 2199 void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf,
2116 const uint8* uv_buf, 2200 const uint8* uv_buf,
2117 uint8* dst_argb, 2201 uint8* dst_argb,
2118 const struct YuvConstants* yuvconstants, 2202 const struct YuvConstants* yuvconstants,
2119 int width) { 2203 int width) {
2120 asm volatile ( 2204 asm volatile (
2205 YUVTORGB_SETUP_AVX2(yuvconstants)
2121 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 2206 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2122 LABELALIGN 2207 LABELALIGN
2123 "1: \n" 2208 "1: \n"
2124 READNV12_AVX2 2209 READNV12_AVX2
2125 YUVTORGB_AVX2(yuvconstants) 2210 YUVTORGB_AVX2(yuvconstants)
2126 STOREARGB_AVX2 2211 STOREARGB_AVX2
2127 "sub $0x10,%[width] \n" 2212 "sub $0x10,%[width] \n"
2128 "jg 1b \n" 2213 "jg 1b \n"
2129 "vzeroupper \n" 2214 "vzeroupper \n"
2130 : [y_buf]"+r"(y_buf), // %[y_buf] 2215 : [y_buf]"+r"(y_buf), // %[y_buf]
2131 [uv_buf]"+r"(uv_buf), // %[uv_buf] 2216 [uv_buf]"+r"(uv_buf), // %[uv_buf]
2132 [dst_argb]"+r"(dst_argb), // %[dst_argb] 2217 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2133 [width]"+rm"(width) // %[width] 2218 [width]"+rm"(width) // %[width]
2134 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] 2219 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2135 // Does not use r14. 2220 : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
2136 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 2221 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2137 ); 2222 );
2138 } 2223 }
2139 #endif // HAS_NV12TOARGBROW_AVX2 2224 #endif // HAS_NV12TOARGBROW_AVX2
2140 2225
2141 #if defined(HAS_NV21TOARGBROW_AVX2) 2226 #if defined(HAS_NV21TOARGBROW_AVX2)
2142 // 16 pixels. 2227 // 16 pixels.
2143 // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 2228 // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2144 void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf, 2229 void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf,
2145 const uint8* vu_buf, 2230 const uint8* vu_buf,
2146 uint8* dst_argb, 2231 uint8* dst_argb,
2147 const struct YuvConstants* yuvconstants, 2232 const struct YuvConstants* yuvconstants,
2148 int width) { 2233 int width) {
2149 asm volatile ( 2234 asm volatile (
2235 YUVTORGB_SETUP_AVX2(yuvconstants)
2150 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 2236 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2151 LABELALIGN 2237 LABELALIGN
2152 "1: \n" 2238 "1: \n"
2153 READNV21_AVX2 2239 READNV21_AVX2
2154 YUVTORGB_AVX2(yuvconstants) 2240 YUVTORGB_AVX2(yuvconstants)
2155 STOREARGB_AVX2 2241 STOREARGB_AVX2
2156 "sub $0x10,%[width] \n" 2242 "sub $0x10,%[width] \n"
2157 "jg 1b \n" 2243 "jg 1b \n"
2158 "vzeroupper \n" 2244 "vzeroupper \n"
2159 : [y_buf]"+r"(y_buf), // %[y_buf] 2245 : [y_buf]"+r"(y_buf), // %[y_buf]
2160 [vu_buf]"+r"(vu_buf), // %[vu_buf] 2246 [vu_buf]"+r"(vu_buf), // %[vu_buf]
2161 [dst_argb]"+r"(dst_argb), // %[dst_argb] 2247 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2162 [width]"+rm"(width) // %[width] 2248 [width]"+rm"(width) // %[width]
2163 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] 2249 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
2164 [kShuffleNV21]"m"(kShuffleNV21) 2250 [kShuffleNV21]"m"(kShuffleNV21)
2165 // Does not use r14. 2251 : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
2166 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 2252 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2167 ); 2253 );
2168 } 2254 }
2169 #endif // HAS_NV21TOARGBROW_AVX2 2255 #endif // HAS_NV21TOARGBROW_AVX2
2170 2256
2171 #if defined(HAS_YUY2TOARGBROW_AVX2) 2257 #if defined(HAS_YUY2TOARGBROW_AVX2)
2172 // 16 pixels. 2258 // 16 pixels.
2173 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). 2259 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
2174 void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf, 2260 void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf,
2175 uint8* dst_argb, 2261 uint8* dst_argb,
2176 const struct YuvConstants* yuvconstants, 2262 const struct YuvConstants* yuvconstants,
2177 int width) { 2263 int width) {
2178 asm volatile ( 2264 asm volatile (
2265 YUVTORGB_SETUP_AVX2(yuvconstants)
2179 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 2266 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2180 LABELALIGN 2267 LABELALIGN
2181 "1: \n" 2268 "1: \n"
2182 READYUY2_AVX2 2269 READYUY2_AVX2
2183 YUVTORGB_AVX2(yuvconstants) 2270 YUVTORGB_AVX2(yuvconstants)
2184 STOREARGB_AVX2 2271 STOREARGB_AVX2
2185 "sub $0x10,%[width] \n" 2272 "sub $0x10,%[width] \n"
2186 "jg 1b \n" 2273 "jg 1b \n"
2187 "vzeroupper \n" 2274 "vzeroupper \n"
2188 : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf] 2275 : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
2189 [dst_argb]"+r"(dst_argb), // %[dst_argb] 2276 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2190 [width]"+rm"(width) // %[width] 2277 [width]"+rm"(width) // %[width]
2191 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] 2278 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
2192 [kShuffleYUY2Y]"m"(kShuffleYUY2Y), 2279 [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
2193 [kShuffleYUY2UV]"m"(kShuffleYUY2UV) 2280 [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
2194 // Does not use r14. 2281 : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
2195 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 2282 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2196 ); 2283 );
2197 } 2284 }
2198 #endif // HAS_YUY2TOARGBROW_AVX2 2285 #endif // HAS_YUY2TOARGBROW_AVX2
2199 2286
2200 #if defined(HAS_UYVYTOARGBROW_AVX2) 2287 #if defined(HAS_UYVYTOARGBROW_AVX2)
2201 // 16 pixels. 2288 // 16 pixels.
2202 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes). 2289 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
2203 void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf, 2290 void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf,
2204 uint8* dst_argb, 2291 uint8* dst_argb,
2205 const struct YuvConstants* yuvconstants, 2292 const struct YuvConstants* yuvconstants,
2206 int width) { 2293 int width) {
2207 asm volatile ( 2294 asm volatile (
2295 YUVTORGB_SETUP_AVX2(yuvconstants)
2208 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 2296 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2209 LABELALIGN 2297 LABELALIGN
2210 "1: \n" 2298 "1: \n"
2211 READUYVY_AVX2 2299 READUYVY_AVX2
2212 YUVTORGB_AVX2(yuvconstants) 2300 YUVTORGB_AVX2(yuvconstants)
2213 STOREARGB_AVX2 2301 STOREARGB_AVX2
2214 "sub $0x10,%[width] \n" 2302 "sub $0x10,%[width] \n"
2215 "jg 1b \n" 2303 "jg 1b \n"
2216 "vzeroupper \n" 2304 "vzeroupper \n"
2217 : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf] 2305 : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
2218 [dst_argb]"+r"(dst_argb), // %[dst_argb] 2306 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2219 [width]"+rm"(width) // %[width] 2307 [width]"+rm"(width) // %[width]
2220 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] 2308 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
2221 [kShuffleUYVYY]"m"(kShuffleUYVYY), 2309 [kShuffleUYVYY]"m"(kShuffleUYVYY),
2222 [kShuffleUYVYUV]"m"(kShuffleUYVYUV) 2310 [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
2223 // Does not use r14. 2311 : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
2224 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 2312 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2225 ); 2313 );
2226 } 2314 }
2227 #endif // HAS_UYVYTOARGBROW_AVX2 2315 #endif // HAS_UYVYTOARGBROW_AVX2
2228 2316
2229 #ifdef HAS_I400TOARGBROW_SSE2 2317 #ifdef HAS_I400TOARGBROW_SSE2
2230 void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) { 2318 void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
2231 asm volatile ( 2319 asm volatile (
2232 "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 2320 "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164
2233 "movd %%eax,%%xmm2 \n" 2321 "movd %%eax,%%xmm2 \n"
2234 "pshufd $0x0,%%xmm2,%%xmm2 \n" 2322 "pshufd $0x0,%%xmm2,%%xmm2 \n"
(...skipping 3093 matching lines...) Expand 10 before | Expand all | Expand 10 after
5328 ); 5416 );
5329 } 5417 }
5330 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 5418 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
5331 5419
5332 #endif // defined(__x86_64__) || defined(__i386__) 5420 #endif // defined(__x86_64__) || defined(__i386__)
5333 5421
5334 #ifdef __cplusplus 5422 #ifdef __cplusplus
5335 } // extern "C" 5423 } // extern "C"
5336 } // namespace libyuv 5424 } // namespace libyuv
5337 #endif 5425 #endif
OLDNEW
« no previous file with comments | « include/libyuv/version.h ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698