Chromium Code Reviews| Index: src/IceTargetLoweringX8632.cpp |
| diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp |
| index b8b633ac27bb1e9cafc320706d8eb76505be55c8..a9a7acd2364c57388bc38e4ecec47d58af2b6385 100644 |
| --- a/src/IceTargetLoweringX8632.cpp |
| +++ b/src/IceTargetLoweringX8632.cpp |
| @@ -1287,6 +1287,88 @@ void TargetX8632::lowerAlloca(const InstAlloca *Inst) { |
| _mov(Dest, esp); |
| } |
| +// Strength-reduce scalar integer multiplication by a constant (for |
| +// i32 or narrower) for certain constants. The lea instruction can be |
| +// used to multiply by 3, 5, or 9, and the lsh instruction can be used |
| +// to multiply by powers of 2. These can be combined such that |
| +// e.g. multiplying by 100 can be done as 2 lea-based multiplies by 5, |
| +// combined with left-shifting by 2. |
| +bool TargetX8632::optimizeScalarMul(Variable *Dest, Operand *Src0, |
| + int32_t Src1) { |
| + // Disable this optimization for Om1 and O0, just to keep things |
| + // simple there. |
| + if (Ctx->getFlags().getOptLevel() < Opt_1) |
| + return false; |
| + Variable *T = nullptr; |
| + if (Src1 == -1) { |
| + _mov(T, Src0); |
| + _neg(T); |
| + _mov(Dest, T); |
| + return true; |
| + } |
| + if (Src1 == 1) { |
| + _mov(T, Src0); |
| + _mov(Dest, T); |
| + return true; |
| + } |
| + if (Src1 <= 0) |
|
John
2015/06/10 17:27:52
Just throwing an idea... would it work to invert S
Jim Stichnoth
2015/06/12 17:31:47
Done. Though I suppress that when the constant is
|
| + return false; |
| + uint32_t Count9 = 0; |
| + uint32_t Count5 = 0; |
| + uint32_t Count3 = 0; |
| + uint32_t Count2 = 0; |
| + uint32_t CountOps = 0; |
| + while (Src1 > 1) { |
| + if (Src1 % 9 == 0) { |
| + ++CountOps; |
| + ++Count9; |
| + Src1 /= 9; |
| + } else if (Src1 % 5 == 0) { |
| + ++CountOps; |
| + ++Count5; |
| + Src1 /= 5; |
| + } else if (Src1 % 3 == 0) { |
| + ++CountOps; |
| + ++Count3; |
| + Src1 /= 3; |
| + } else if (Src1 % 2 == 0) { |
| + if (Count2 == 0) |
| + ++CountOps; |
| + ++Count2; |
| + Src1 /= 2; |
| + } else { |
| + return false; |
| + } |
| + } |
| + // Limit the number of lea/shl operations for a single multiply, to |
| + // a somewhat arbitrary choice of 5. |
| + const uint32_t MaxOpsForOptimizedMul = 5; |
|
jvoung (off chromium)
2015/06/10 18:12:27
FWIW, some of the agner tables say mul r/i is abou
John
2015/06/10 23:47:19
This little command
echo "int v(int i) { return i
Jim Stichnoth
2015/06/12 17:31:47
OK. I reduced the threshold to 3.
|
| + if (CountOps > MaxOpsForOptimizedMul) |
| + return false; |
| + _mov(T, Src0); |
| + Constant *Zero = Ctx->getConstantZero(IceType_i32); |
| + for (uint32_t i = 0; i < Count9; ++i) { |
| + const uint16_t Shift = 3; // log2(9-1) |
| + _lea(T, OperandX8632Mem::create(Func, IceType_void, T, Zero, T, Shift)); |
| + _set_dest_nonkillable(); |
| + } |
| + for (uint32_t i = 0; i < Count5; ++i) { |
| + const uint16_t Shift = 2; // log2(5-1) |
| + _lea(T, OperandX8632Mem::create(Func, IceType_void, T, Zero, T, Shift)); |
| + _set_dest_nonkillable(); |
| + } |
| + for (uint32_t i = 0; i < Count3; ++i) { |
| + const uint16_t Shift = 1; // log2(3-1) |
| + _lea(T, OperandX8632Mem::create(Func, IceType_void, T, Zero, T, Shift)); |
| + _set_dest_nonkillable(); |
| + } |
| + if (Count2) { |
| + _shl(T, Ctx->getConstantInt(Dest->getType(), Count2)); |
| + } |
| + _mov(Dest, T); |
| + return true; |
| +} |
| + |
| void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) { |
| Variable *Dest = Inst->getDest(); |
| Operand *Src0 = legalize(Inst->getSrc(0)); |
| @@ -1294,6 +1376,8 @@ void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) { |
| if (Inst->isCommutative()) { |
| if (!llvm::isa<Variable>(Src0) && llvm::isa<Variable>(Src1)) |
| std::swap(Src0, Src1); |
| + if (llvm::isa<Constant>(Src0) && !llvm::isa<Constant>(Src1)) |
| + std::swap(Src0, Src1); |
| } |
| if (Dest->getType() == IceType_i64) { |
| Variable *DestLo = llvm::cast<Variable>(loOperand(Dest)); |
| @@ -1521,7 +1605,9 @@ void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) { |
| llvm_unreachable("FP instruction with i64 type"); |
| break; |
| } |
| - } else if (isVectorType(Dest->getType())) { |
| + return; |
| + } |
| + if (isVectorType(Dest->getType())) { |
| // TODO: Trap on integer divide and integer modulo by zero. |
| // See: https://code.google.com/p/nativeclient/issues/detail?id=3899 |
| if (llvm::isa<OperandX8632Mem>(Src1)) |
| @@ -1650,7 +1736,10 @@ void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) { |
| scalarizeArithmetic(Inst->getOp(), Dest, Src0, Src1); |
| break; |
| } |
| - } else { // Dest->getType() is non-i64 scalar |
| + return; |
| + } |
| + { // TODO(stichnot): Remove this level of {}. (It's still there to |
| + // make the diffs easier to read.) |
| Variable *T_edx = nullptr; |
| Variable *T = nullptr; |
| switch (Inst->getOp()) { |
| @@ -1683,11 +1772,10 @@ void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) { |
| _mov(Dest, T); |
| break; |
| case InstArithmetic::Mul: |
| - // TODO: Optimize for llvm::isa<Constant>(Src1) |
| - // TODO: Strength-reduce multiplications by a constant, |
| - // particularly -1 and powers of 2. Advanced: use lea to |
| - // multiply by 3, 5, 9. |
| - // |
| + if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) { |
| + if (optimizeScalarMul(Dest, Src0, C->getValue())) |
| + return; |
| + } |
| // The 8-bit version of imul only allows the form "imul r/m8" |
| // where T must be in eax. |
| if (isByteSizedArithType(Dest->getType())) { |
| @@ -1740,6 +1828,41 @@ void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) { |
| } |
| break; |
| case InstArithmetic::Sdiv: |
| + // TODO(stichnot): Enable this after doing better performance |
| + // and cross testing. |
| + if (false && Ctx->getFlags().getOptLevel() >= Opt_1) { |
| + // Optimize division by constant power of 2, but not for Om1 |
| + // or O0, just to keep things simple there. |
| + if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) { |
| + int32_t Divisor = C->getValue(); |
| + uint32_t UDivisor = static_cast<uint32_t>(Divisor); |
| + if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) { |
| + uint32_t LogDiv = llvm::Log2_32(UDivisor); |
| + Type Ty = Dest->getType(); |
| + // LLVM does the following for dest=src/(1<<log): |
| + // t=src |
| + // sar t,typewidth-1 // -1 if src is negative, 0 if not |
| + // shr t,typewidth-log |
| + // add t,src |
| + // sar t,log |
| + // dest=t |
| + uint32_t TypeWidth = X86_CHAR_BIT * typeWidthInBytes(Ty); |
| + _mov(T, Src0); |
| + // If for some reason we are dividing by 1, just treat it |
| + // like an assignment. |
| + if (LogDiv > 0) { |
| + // The initial sar is unnecessary when dividing by 2. |
| + if (LogDiv > 1) |
| + _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1)); |
| + _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv)); |
| + _add(T, Src0); |
| + _sar(T, Ctx->getConstantInt(Ty, LogDiv)); |
| + } |
| + _mov(Dest, T); |
| + return; |
| + } |
| + } |
| + } |
| Src1 = legalize(Src1, Legal_Reg | Legal_Mem); |
| if (isByteSizedArithType(Dest->getType())) { |
| _mov(T, Src0, RegX8632::Reg_eax); |
| @@ -1772,6 +1895,46 @@ void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) { |
| } |
| break; |
| case InstArithmetic::Srem: |
| + // TODO(stichnot): Enable this after doing better performance |
| + // and cross testing. |
| + if (false && Ctx->getFlags().getOptLevel() >= Opt_1) { |
| + // Optimize mod by constant power of 2, but not for Om1 or O0, |
| + // just to keep things simple there. |
| + if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) { |
| + int32_t Divisor = C->getValue(); |
| + uint32_t UDivisor = static_cast<uint32_t>(Divisor); |
| + if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) { |
| + uint32_t LogDiv = llvm::Log2_32(UDivisor); |
| + Type Ty = Dest->getType(); |
| + // LLVM does the following for dest=src%(1<<log): |
| + // t=src |
| + // sar t,typewidth-1 // -1 if src is negative, 0 if not |
| + // shr t,typewidth-log |
| + // add t,src |
| + // and t, -(1<<log) |
| + // sub t,src |
| + // neg t |
| + // dest=t |
| + uint32_t TypeWidth = X86_CHAR_BIT * typeWidthInBytes(Ty); |
| + // If for some reason we are dividing by 1, just assign 0. |
| + if (LogDiv == 0) { |
| + _mov(Dest, Ctx->getConstantZero(Ty)); |
| + return; |
| + } |
| + _mov(T, Src0); |
| + // The initial sar is unnecessary when dividing by 2. |
| + if (LogDiv > 1) |
| + _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1)); |
| + _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv)); |
| + _add(T, Src0); |
| + _and(T, Ctx->getConstantInt(Ty, -(1 << LogDiv))); |
| + _sub(T, Src0); |
| + _neg(T); |
| + _mov(Dest, T); |
| + return; |
| + } |
| + } |
| + } |
| Src1 = legalize(Src1, Legal_Reg | Legal_Mem); |
| if (isByteSizedArithType(Dest->getType())) { |
| Variable *T_ah = makeReg(IceType_i8, RegX8632::Reg_ah); |
| @@ -1817,7 +1980,7 @@ void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) { |
| Call->addArg(Src0); |
| Call->addArg(Src1); |
| return lowerCall(Call); |
| - } break; |
| + } |
| } |
| } |
| } |