Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(362)

Side by Side Diff: src/IceTargetLoweringX8632.cpp

Issue 1146803002: Subzero: Strength-reduce mul by certain constants. (Closed) Base URL: https://chromium.googlesource.com/native_client/pnacl-subzero.git@master
Patch Set: Fill out the cross test. Cleanup. Created 5 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/IceTargetLoweringX8632.h ('k') | tests_lit/assembler/x86/immediate_encodings.ll » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 //===- subzero/src/IceTargetLoweringX8632.cpp - x86-32 lowering -----------===// 1 //===- subzero/src/IceTargetLoweringX8632.cpp - x86-32 lowering -----------===//
2 // 2 //
3 // The Subzero Code Generator 3 // The Subzero Code Generator
4 // 4 //
5 // This file is distributed under the University of Illinois Open Source 5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details. 6 // License. See LICENSE.TXT for details.
7 // 7 //
8 //===----------------------------------------------------------------------===// 8 //===----------------------------------------------------------------------===//
9 // 9 //
10 // This file implements the TargetLoweringX8632 class, which 10 // This file implements the TargetLoweringX8632 class, which
(...skipping 1269 matching lines...) Expand 10 before | Expand all | Expand 10 after
1280 // multiple of the required alignment at runtime. 1280 // multiple of the required alignment at runtime.
1281 Variable *T = makeReg(IceType_i32); 1281 Variable *T = makeReg(IceType_i32);
1282 _mov(T, TotalSize); 1282 _mov(T, TotalSize);
1283 _add(T, Ctx->getConstantInt32(Alignment - 1)); 1283 _add(T, Ctx->getConstantInt32(Alignment - 1));
1284 _and(T, Ctx->getConstantInt32(-Alignment)); 1284 _and(T, Ctx->getConstantInt32(-Alignment));
1285 _sub(esp, T); 1285 _sub(esp, T);
1286 } 1286 }
1287 _mov(Dest, esp); 1287 _mov(Dest, esp);
1288 } 1288 }
1289 1289
1290 // Strength-reduce scalar integer multiplication by a constant (for
1291 // i32 or narrower) for certain constants. The lea instruction can be
1292 // used to multiply by 3, 5, or 9, and the lsh instruction can be used
1293 // to multiply by powers of 2. These can be combined such that
1294 // e.g. multiplying by 100 can be done as 2 lea-based multiplies by 5,
1295 // combined with left-shifting by 2.
1296 bool TargetX8632::optimizeScalarMul(Variable *Dest, Operand *Src0,
1297 int32_t Src1) {
1298 // Disable this optimization for Om1 and O0, just to keep things
1299 // simple there.
1300 if (Ctx->getFlags().getOptLevel() < Opt_1)
1301 return false;
1302 Variable *T = nullptr;
1303 if (Src1 == -1) {
1304 _mov(T, Src0);
1305 _neg(T);
1306 _mov(Dest, T);
1307 return true;
1308 }
1309 if (Src1 == 1) {
1310 _mov(T, Src0);
1311 _mov(Dest, T);
1312 return true;
1313 }
1314 if (Src1 <= 0)
John 2015/06/10 17:27:52 Just throwing an idea... would it work to invert S
Jim Stichnoth 2015/06/12 17:31:47 Done. Though I suppress that when the constant is
1315 return false;
1316 uint32_t Count9 = 0;
1317 uint32_t Count5 = 0;
1318 uint32_t Count3 = 0;
1319 uint32_t Count2 = 0;
1320 uint32_t CountOps = 0;
1321 while (Src1 > 1) {
1322 if (Src1 % 9 == 0) {
1323 ++CountOps;
1324 ++Count9;
1325 Src1 /= 9;
1326 } else if (Src1 % 5 == 0) {
1327 ++CountOps;
1328 ++Count5;
1329 Src1 /= 5;
1330 } else if (Src1 % 3 == 0) {
1331 ++CountOps;
1332 ++Count3;
1333 Src1 /= 3;
1334 } else if (Src1 % 2 == 0) {
1335 if (Count2 == 0)
1336 ++CountOps;
1337 ++Count2;
1338 Src1 /= 2;
1339 } else {
1340 return false;
1341 }
1342 }
1343 // Limit the number of lea/shl operations for a single multiply, to
1344 // a somewhat arbitrary choice of 5.
1345 const uint32_t MaxOpsForOptimizedMul = 5;
jvoung (off chromium) 2015/06/10 18:12:27 FWIW, some of the agner tables say mul r/i is abou
John 2015/06/10 23:47:19 This little command echo "int v(int i) { return i
Jim Stichnoth 2015/06/12 17:31:47 OK. I reduced the threshold to 3.
1346 if (CountOps > MaxOpsForOptimizedMul)
1347 return false;
1348 _mov(T, Src0);
1349 Constant *Zero = Ctx->getConstantZero(IceType_i32);
1350 for (uint32_t i = 0; i < Count9; ++i) {
1351 const uint16_t Shift = 3; // log2(9-1)
1352 _lea(T, OperandX8632Mem::create(Func, IceType_void, T, Zero, T, Shift));
1353 _set_dest_nonkillable();
1354 }
1355 for (uint32_t i = 0; i < Count5; ++i) {
1356 const uint16_t Shift = 2; // log2(5-1)
1357 _lea(T, OperandX8632Mem::create(Func, IceType_void, T, Zero, T, Shift));
1358 _set_dest_nonkillable();
1359 }
1360 for (uint32_t i = 0; i < Count3; ++i) {
1361 const uint16_t Shift = 1; // log2(3-1)
1362 _lea(T, OperandX8632Mem::create(Func, IceType_void, T, Zero, T, Shift));
1363 _set_dest_nonkillable();
1364 }
1365 if (Count2) {
1366 _shl(T, Ctx->getConstantInt(Dest->getType(), Count2));
1367 }
1368 _mov(Dest, T);
1369 return true;
1370 }
1371
1290 void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) { 1372 void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) {
1291 Variable *Dest = Inst->getDest(); 1373 Variable *Dest = Inst->getDest();
1292 Operand *Src0 = legalize(Inst->getSrc(0)); 1374 Operand *Src0 = legalize(Inst->getSrc(0));
1293 Operand *Src1 = legalize(Inst->getSrc(1)); 1375 Operand *Src1 = legalize(Inst->getSrc(1));
1294 if (Inst->isCommutative()) { 1376 if (Inst->isCommutative()) {
1295 if (!llvm::isa<Variable>(Src0) && llvm::isa<Variable>(Src1)) 1377 if (!llvm::isa<Variable>(Src0) && llvm::isa<Variable>(Src1))
1296 std::swap(Src0, Src1); 1378 std::swap(Src0, Src1);
1379 if (llvm::isa<Constant>(Src0) && !llvm::isa<Constant>(Src1))
1380 std::swap(Src0, Src1);
1297 } 1381 }
1298 if (Dest->getType() == IceType_i64) { 1382 if (Dest->getType() == IceType_i64) {
1299 Variable *DestLo = llvm::cast<Variable>(loOperand(Dest)); 1383 Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
1300 Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest)); 1384 Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
1301 Operand *Src0Lo = loOperand(Src0); 1385 Operand *Src0Lo = loOperand(Src0);
1302 Operand *Src0Hi = hiOperand(Src0); 1386 Operand *Src0Hi = hiOperand(Src0);
1303 Operand *Src1Lo = loOperand(Src1); 1387 Operand *Src1Lo = loOperand(Src1);
1304 Operand *Src1Hi = hiOperand(Src1); 1388 Operand *Src1Hi = hiOperand(Src1);
1305 Variable *T_Lo = nullptr, *T_Hi = nullptr; 1389 Variable *T_Lo = nullptr, *T_Hi = nullptr;
1306 switch (Inst->getOp()) { 1390 switch (Inst->getOp()) {
(...skipping 207 matching lines...) Expand 10 before | Expand all | Expand 10 after
1514 lowerCall(Call); 1598 lowerCall(Call);
1515 } break; 1599 } break;
1516 case InstArithmetic::Fadd: 1600 case InstArithmetic::Fadd:
1517 case InstArithmetic::Fsub: 1601 case InstArithmetic::Fsub:
1518 case InstArithmetic::Fmul: 1602 case InstArithmetic::Fmul:
1519 case InstArithmetic::Fdiv: 1603 case InstArithmetic::Fdiv:
1520 case InstArithmetic::Frem: 1604 case InstArithmetic::Frem:
1521 llvm_unreachable("FP instruction with i64 type"); 1605 llvm_unreachable("FP instruction with i64 type");
1522 break; 1606 break;
1523 } 1607 }
1524 } else if (isVectorType(Dest->getType())) { 1608 return;
1609 }
1610 if (isVectorType(Dest->getType())) {
1525 // TODO: Trap on integer divide and integer modulo by zero. 1611 // TODO: Trap on integer divide and integer modulo by zero.
1526 // See: https://code.google.com/p/nativeclient/issues/detail?id=3899 1612 // See: https://code.google.com/p/nativeclient/issues/detail?id=3899
1527 if (llvm::isa<OperandX8632Mem>(Src1)) 1613 if (llvm::isa<OperandX8632Mem>(Src1))
1528 Src1 = legalizeToVar(Src1); 1614 Src1 = legalizeToVar(Src1);
1529 switch (Inst->getOp()) { 1615 switch (Inst->getOp()) {
1530 case InstArithmetic::_num: 1616 case InstArithmetic::_num:
1531 llvm_unreachable("Unknown arithmetic operator"); 1617 llvm_unreachable("Unknown arithmetic operator");
1532 break; 1618 break;
1533 case InstArithmetic::Add: { 1619 case InstArithmetic::Add: {
1534 Variable *T = makeReg(Dest->getType()); 1620 Variable *T = makeReg(Dest->getType());
(...skipping 108 matching lines...) Expand 10 before | Expand all | Expand 10 after
1643 case InstArithmetic::Fdiv: { 1729 case InstArithmetic::Fdiv: {
1644 Variable *T = makeReg(Dest->getType()); 1730 Variable *T = makeReg(Dest->getType());
1645 _movp(T, Src0); 1731 _movp(T, Src0);
1646 _divps(T, Src1); 1732 _divps(T, Src1);
1647 _movp(Dest, T); 1733 _movp(Dest, T);
1648 } break; 1734 } break;
1649 case InstArithmetic::Frem: 1735 case InstArithmetic::Frem:
1650 scalarizeArithmetic(Inst->getOp(), Dest, Src0, Src1); 1736 scalarizeArithmetic(Inst->getOp(), Dest, Src0, Src1);
1651 break; 1737 break;
1652 } 1738 }
1653 } else { // Dest->getType() is non-i64 scalar 1739 return;
1740 }
1741 { // TODO(stichnot): Remove this level of {}. (It's still there to
1742 // make the diffs easier to read.)
1654 Variable *T_edx = nullptr; 1743 Variable *T_edx = nullptr;
1655 Variable *T = nullptr; 1744 Variable *T = nullptr;
1656 switch (Inst->getOp()) { 1745 switch (Inst->getOp()) {
1657 case InstArithmetic::_num: 1746 case InstArithmetic::_num:
1658 llvm_unreachable("Unknown arithmetic operator"); 1747 llvm_unreachable("Unknown arithmetic operator");
1659 break; 1748 break;
1660 case InstArithmetic::Add: 1749 case InstArithmetic::Add:
1661 _mov(T, Src0); 1750 _mov(T, Src0);
1662 _add(T, Src1); 1751 _add(T, Src1);
1663 _mov(Dest, T); 1752 _mov(Dest, T);
(...skipping 12 matching lines...) Expand all
1676 _mov(T, Src0); 1765 _mov(T, Src0);
1677 _xor(T, Src1); 1766 _xor(T, Src1);
1678 _mov(Dest, T); 1767 _mov(Dest, T);
1679 break; 1768 break;
1680 case InstArithmetic::Sub: 1769 case InstArithmetic::Sub:
1681 _mov(T, Src0); 1770 _mov(T, Src0);
1682 _sub(T, Src1); 1771 _sub(T, Src1);
1683 _mov(Dest, T); 1772 _mov(Dest, T);
1684 break; 1773 break;
1685 case InstArithmetic::Mul: 1774 case InstArithmetic::Mul:
1686 // TODO: Optimize for llvm::isa<Constant>(Src1) 1775 if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
1687 // TODO: Strength-reduce multiplications by a constant, 1776 if (optimizeScalarMul(Dest, Src0, C->getValue()))
1688 // particularly -1 and powers of 2. Advanced: use lea to 1777 return;
1689 // multiply by 3, 5, 9. 1778 }
1690 //
1691 // The 8-bit version of imul only allows the form "imul r/m8" 1779 // The 8-bit version of imul only allows the form "imul r/m8"
1692 // where T must be in eax. 1780 // where T must be in eax.
1693 if (isByteSizedArithType(Dest->getType())) { 1781 if (isByteSizedArithType(Dest->getType())) {
1694 _mov(T, Src0, RegX8632::Reg_eax); 1782 _mov(T, Src0, RegX8632::Reg_eax);
1695 Src1 = legalize(Src1, Legal_Reg | Legal_Mem); 1783 Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
1696 } else { 1784 } else {
1697 _mov(T, Src0); 1785 _mov(T, Src0);
1698 } 1786 }
1699 _imul(T, Src1); 1787 _imul(T, Src1);
1700 _mov(Dest, T); 1788 _mov(Dest, T);
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after
1733 _mov(Dest, T); 1821 _mov(Dest, T);
1734 } else { 1822 } else {
1735 Constant *Zero = Ctx->getConstantZero(IceType_i32); 1823 Constant *Zero = Ctx->getConstantZero(IceType_i32);
1736 _mov(T, Src0, RegX8632::Reg_eax); 1824 _mov(T, Src0, RegX8632::Reg_eax);
1737 _mov(T_edx, Zero, RegX8632::Reg_edx); 1825 _mov(T_edx, Zero, RegX8632::Reg_edx);
1738 _div(T, Src1, T_edx); 1826 _div(T, Src1, T_edx);
1739 _mov(Dest, T); 1827 _mov(Dest, T);
1740 } 1828 }
1741 break; 1829 break;
1742 case InstArithmetic::Sdiv: 1830 case InstArithmetic::Sdiv:
1831 // TODO(stichnot): Enable this after doing better performance
1832 // and cross testing.
1833 if (false && Ctx->getFlags().getOptLevel() >= Opt_1) {
1834 // Optimize division by constant power of 2, but not for Om1
1835 // or O0, just to keep things simple there.
1836 if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
1837 int32_t Divisor = C->getValue();
1838 uint32_t UDivisor = static_cast<uint32_t>(Divisor);
1839 if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) {
1840 uint32_t LogDiv = llvm::Log2_32(UDivisor);
1841 Type Ty = Dest->getType();
1842 // LLVM does the following for dest=src/(1<<log):
1843 // t=src
1844 // sar t,typewidth-1 // -1 if src is negative, 0 if not
1845 // shr t,typewidth-log
1846 // add t,src
1847 // sar t,log
1848 // dest=t
1849 uint32_t TypeWidth = X86_CHAR_BIT * typeWidthInBytes(Ty);
1850 _mov(T, Src0);
1851 // If for some reason we are dividing by 1, just treat it
1852 // like an assignment.
1853 if (LogDiv > 0) {
1854 // The initial sar is unnecessary when dividing by 2.
1855 if (LogDiv > 1)
1856 _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1));
1857 _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv));
1858 _add(T, Src0);
1859 _sar(T, Ctx->getConstantInt(Ty, LogDiv));
1860 }
1861 _mov(Dest, T);
1862 return;
1863 }
1864 }
1865 }
1743 Src1 = legalize(Src1, Legal_Reg | Legal_Mem); 1866 Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
1744 if (isByteSizedArithType(Dest->getType())) { 1867 if (isByteSizedArithType(Dest->getType())) {
1745 _mov(T, Src0, RegX8632::Reg_eax); 1868 _mov(T, Src0, RegX8632::Reg_eax);
1746 _cbwdq(T, T); 1869 _cbwdq(T, T);
1747 _idiv(T, Src1, T); 1870 _idiv(T, Src1, T);
1748 _mov(Dest, T); 1871 _mov(Dest, T);
1749 } else { 1872 } else {
1750 T_edx = makeReg(IceType_i32, RegX8632::Reg_edx); 1873 T_edx = makeReg(IceType_i32, RegX8632::Reg_edx);
1751 _mov(T, Src0, RegX8632::Reg_eax); 1874 _mov(T, Src0, RegX8632::Reg_eax);
1752 _cbwdq(T_edx, T); 1875 _cbwdq(T_edx, T);
(...skipping 12 matching lines...) Expand all
1765 _mov(Dest, T_ah); 1888 _mov(Dest, T_ah);
1766 } else { 1889 } else {
1767 Constant *Zero = Ctx->getConstantZero(IceType_i32); 1890 Constant *Zero = Ctx->getConstantZero(IceType_i32);
1768 _mov(T_edx, Zero, RegX8632::Reg_edx); 1891 _mov(T_edx, Zero, RegX8632::Reg_edx);
1769 _mov(T, Src0, RegX8632::Reg_eax); 1892 _mov(T, Src0, RegX8632::Reg_eax);
1770 _div(T_edx, Src1, T); 1893 _div(T_edx, Src1, T);
1771 _mov(Dest, T_edx); 1894 _mov(Dest, T_edx);
1772 } 1895 }
1773 break; 1896 break;
1774 case InstArithmetic::Srem: 1897 case InstArithmetic::Srem:
1898 // TODO(stichnot): Enable this after doing better performance
1899 // and cross testing.
1900 if (false && Ctx->getFlags().getOptLevel() >= Opt_1) {
1901 // Optimize mod by constant power of 2, but not for Om1 or O0,
1902 // just to keep things simple there.
1903 if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
1904 int32_t Divisor = C->getValue();
1905 uint32_t UDivisor = static_cast<uint32_t>(Divisor);
1906 if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) {
1907 uint32_t LogDiv = llvm::Log2_32(UDivisor);
1908 Type Ty = Dest->getType();
1909 // LLVM does the following for dest=src%(1<<log):
1910 // t=src
1911 // sar t,typewidth-1 // -1 if src is negative, 0 if not
1912 // shr t,typewidth-log
1913 // add t,src
1914 // and t, -(1<<log)
1915 // sub t,src
1916 // neg t
1917 // dest=t
1918 uint32_t TypeWidth = X86_CHAR_BIT * typeWidthInBytes(Ty);
1919 // If for some reason we are dividing by 1, just assign 0.
1920 if (LogDiv == 0) {
1921 _mov(Dest, Ctx->getConstantZero(Ty));
1922 return;
1923 }
1924 _mov(T, Src0);
1925 // The initial sar is unnecessary when dividing by 2.
1926 if (LogDiv > 1)
1927 _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1));
1928 _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv));
1929 _add(T, Src0);
1930 _and(T, Ctx->getConstantInt(Ty, -(1 << LogDiv)));
1931 _sub(T, Src0);
1932 _neg(T);
1933 _mov(Dest, T);
1934 return;
1935 }
1936 }
1937 }
1775 Src1 = legalize(Src1, Legal_Reg | Legal_Mem); 1938 Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
1776 if (isByteSizedArithType(Dest->getType())) { 1939 if (isByteSizedArithType(Dest->getType())) {
1777 Variable *T_ah = makeReg(IceType_i8, RegX8632::Reg_ah); 1940 Variable *T_ah = makeReg(IceType_i8, RegX8632::Reg_ah);
1778 _mov(T, Src0, RegX8632::Reg_eax); 1941 _mov(T, Src0, RegX8632::Reg_eax);
1779 _cbwdq(T, T); 1942 _cbwdq(T, T);
1780 Context.insert(InstFakeDef::create(Func, T_ah)); 1943 Context.insert(InstFakeDef::create(Func, T_ah));
1781 _idiv(T_ah, Src1, T); 1944 _idiv(T_ah, Src1, T);
1782 _mov(Dest, T_ah); 1945 _mov(Dest, T_ah);
1783 } else { 1946 } else {
1784 T_edx = makeReg(IceType_i32, RegX8632::Reg_edx); 1947 T_edx = makeReg(IceType_i32, RegX8632::Reg_edx);
(...skipping 25 matching lines...) Expand all
1810 break; 1973 break;
1811 case InstArithmetic::Frem: { 1974 case InstArithmetic::Frem: {
1812 const SizeT MaxSrcs = 2; 1975 const SizeT MaxSrcs = 2;
1813 Type Ty = Dest->getType(); 1976 Type Ty = Dest->getType();
1814 InstCall *Call = 1977 InstCall *Call =
1815 makeHelperCall(isFloat32Asserting32Or64(Ty) ? H_frem_f32 : H_frem_f64, 1978 makeHelperCall(isFloat32Asserting32Or64(Ty) ? H_frem_f32 : H_frem_f64,
1816 Dest, MaxSrcs); 1979 Dest, MaxSrcs);
1817 Call->addArg(Src0); 1980 Call->addArg(Src0);
1818 Call->addArg(Src1); 1981 Call->addArg(Src1);
1819 return lowerCall(Call); 1982 return lowerCall(Call);
1820 } break; 1983 }
1821 } 1984 }
1822 } 1985 }
1823 } 1986 }
1824 1987
1825 void TargetX8632::lowerAssign(const InstAssign *Inst) { 1988 void TargetX8632::lowerAssign(const InstAssign *Inst) {
1826 Variable *Dest = Inst->getDest(); 1989 Variable *Dest = Inst->getDest();
1827 Operand *Src0 = Inst->getSrc(0); 1990 Operand *Src0 = Inst->getSrc(0);
1828 assert(Dest->getType() == Src0->getType()); 1991 assert(Dest->getType() == Src0->getType());
1829 if (Dest->getType() == IceType_i64) { 1992 if (Dest->getType() == IceType_i64) {
1830 Src0 = legalize(Src0); 1993 Src0 = legalize(Src0);
(...skipping 3180 matching lines...) Expand 10 before | Expand all | Expand 10 after
5011 case FT_Asm: 5174 case FT_Asm:
5012 case FT_Iasm: { 5175 case FT_Iasm: {
5013 OstreamLocker L(Ctx); 5176 OstreamLocker L(Ctx);
5014 emitConstantPool<PoolTypeConverter<float>>(Ctx); 5177 emitConstantPool<PoolTypeConverter<float>>(Ctx);
5015 emitConstantPool<PoolTypeConverter<double>>(Ctx); 5178 emitConstantPool<PoolTypeConverter<double>>(Ctx);
5016 } break; 5179 } break;
5017 } 5180 }
5018 } 5181 }
5019 5182
5020 } // end of namespace Ice 5183 } // end of namespace Ice
OLDNEW
« no previous file with comments | « src/IceTargetLoweringX8632.h ('k') | tests_lit/assembler/x86/immediate_encodings.ll » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698