Index: src/IceTargetLoweringX8632.cpp |
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp |
index 506b315079909210d3b1f7c9dc1fbee21e0e53ea..f6cd26f64f5299550de9cc42d20ee07c8f68e400 100644 |
--- a/src/IceTargetLoweringX8632.cpp |
+++ b/src/IceTargetLoweringX8632.cpp |
@@ -1287,6 +1287,102 @@ void TargetX8632::lowerAlloca(const InstAlloca *Inst) { |
_mov(Dest, esp); |
} |
+// Strength-reduce scalar integer multiplication by a constant (for |
+// i32 or narrower) for certain constants. The lea instruction can be |
+// used to multiply by 3, 5, or 9, and the lsh instruction can be used |
+// to multiply by powers of 2. These can be combined such that |
+// e.g. multiplying by 100 can be done as 2 lea-based multiplies by 5, |
+// combined with left-shifting by 2. |
+bool TargetX8632::optimizeScalarMul(Variable *Dest, Operand *Src0, |
+ int32_t Src1) { |
+ // Disable this optimization for Om1 and O0, just to keep things |
+ // simple there. |
+ if (Ctx->getFlags().getOptLevel() < Opt_1) |
+ return false; |
+ Type Ty = Dest->getType(); |
+ Variable *T = nullptr; |
+ if (Src1 == -1) { |
+ _mov(T, Src0); |
+ _neg(T); |
+ _mov(Dest, T); |
+ return true; |
+ } |
+ if (Src1 == 0) { |
+ _mov(Dest, Ctx->getConstantZero(Ty)); |
+ return true; |
+ } |
+ if (Src1 == 1) { |
+ _mov(T, Src0); |
+ _mov(Dest, T); |
+ return true; |
+ } |
+ // Don't bother with the edge case where Src1 == MININT. |
+ if (Src1 == -Src1) |
+ return false; |
+ const bool Src1IsNegative = Src1 < 0; |
+ if (Src1IsNegative) |
+ Src1 = -Src1; |
+ uint32_t Count9 = 0; |
+ uint32_t Count5 = 0; |
+ uint32_t Count3 = 0; |
+ uint32_t Count2 = 0; |
+ uint32_t CountOps = 0; |
+ while (Src1 > 1) { |
+ if (Src1 % 9 == 0) { |
+ ++CountOps; |
+ ++Count9; |
+ Src1 /= 9; |
+ } else if (Src1 % 5 == 0) { |
+ ++CountOps; |
+ ++Count5; |
+ Src1 /= 5; |
+ } else if (Src1 % 3 == 0) { |
+ ++CountOps; |
+ ++Count3; |
+ Src1 /= 3; |
+ } else if (Src1 % 2 == 0) { |
+ if (Count2 == 0) |
+ ++CountOps; |
+ ++Count2; |
+ Src1 /= 2; |
+ } else { |
+ return false; |
+ } |
+ } |
+ // Lea optimization only works for i16 and i32 types, not i8. |
+ if (Ty != IceType_i16 && Ty != IceType_i32 && (Count3 || Count5 || Count9)) |
+ return false; |
+ // Limit the number of lea/shl operations for a single multiply, to |
+ // a somewhat arbitrary choice of 3. |
+ const uint32_t MaxOpsForOptimizedMul = 3; |
+ if (CountOps > MaxOpsForOptimizedMul) |
+ return false; |
+ _mov(T, Src0); |
+ Constant *Zero = Ctx->getConstantZero(IceType_i32); |
+ for (uint32_t i = 0; i < Count9; ++i) { |
+ const uint16_t Shift = 3; // log2(9-1) |
+ _lea(T, OperandX8632Mem::create(Func, IceType_void, T, Zero, T, Shift)); |
+ _set_dest_nonkillable(); |
+ } |
+ for (uint32_t i = 0; i < Count5; ++i) { |
+ const uint16_t Shift = 2; // log2(5-1) |
+ _lea(T, OperandX8632Mem::create(Func, IceType_void, T, Zero, T, Shift)); |
+ _set_dest_nonkillable(); |
+ } |
+ for (uint32_t i = 0; i < Count3; ++i) { |
+ const uint16_t Shift = 1; // log2(3-1) |
+ _lea(T, OperandX8632Mem::create(Func, IceType_void, T, Zero, T, Shift)); |
+ _set_dest_nonkillable(); |
+ } |
+ if (Count2) { |
+ _shl(T, Ctx->getConstantInt(Ty, Count2)); |
+ } |
+ if (Src1IsNegative) |
+ _neg(T); |
+ _mov(Dest, T); |
+ return true; |
+} |
+ |
void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) { |
Variable *Dest = Inst->getDest(); |
Operand *Src0 = legalize(Inst->getSrc(0)); |
@@ -1294,6 +1390,8 @@ void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) { |
if (Inst->isCommutative()) { |
if (!llvm::isa<Variable>(Src0) && llvm::isa<Variable>(Src1)) |
std::swap(Src0, Src1); |
+ if (llvm::isa<Constant>(Src0) && !llvm::isa<Constant>(Src1)) |
+ std::swap(Src0, Src1); |
} |
if (Dest->getType() == IceType_i64) { |
Variable *DestLo = llvm::cast<Variable>(loOperand(Dest)); |
@@ -1521,7 +1619,9 @@ void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) { |
llvm_unreachable("FP instruction with i64 type"); |
break; |
} |
- } else if (isVectorType(Dest->getType())) { |
+ return; |
+ } |
+ if (isVectorType(Dest->getType())) { |
// TODO: Trap on integer divide and integer modulo by zero. |
// See: https://code.google.com/p/nativeclient/issues/detail?id=3899 |
if (llvm::isa<OperandX8632Mem>(Src1)) |
@@ -1650,175 +1750,248 @@ void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) { |
scalarizeArithmetic(Inst->getOp(), Dest, Src0, Src1); |
break; |
} |
- } else { // Dest->getType() is non-i64 scalar |
- Variable *T_edx = nullptr; |
- Variable *T = nullptr; |
- switch (Inst->getOp()) { |
- case InstArithmetic::_num: |
- llvm_unreachable("Unknown arithmetic operator"); |
- break; |
- case InstArithmetic::Add: |
- _mov(T, Src0); |
- _add(T, Src1); |
- _mov(Dest, T); |
- break; |
- case InstArithmetic::And: |
- _mov(T, Src0); |
- _and(T, Src1); |
- _mov(Dest, T); |
- break; |
- case InstArithmetic::Or: |
- _mov(T, Src0); |
- _or(T, Src1); |
- _mov(Dest, T); |
- break; |
- case InstArithmetic::Xor: |
+ return; |
+ } |
+ Variable *T_edx = nullptr; |
+ Variable *T = nullptr; |
+ switch (Inst->getOp()) { |
+ case InstArithmetic::_num: |
+ llvm_unreachable("Unknown arithmetic operator"); |
+ break; |
+ case InstArithmetic::Add: |
+ _mov(T, Src0); |
+ _add(T, Src1); |
+ _mov(Dest, T); |
+ break; |
+ case InstArithmetic::And: |
+ _mov(T, Src0); |
+ _and(T, Src1); |
+ _mov(Dest, T); |
+ break; |
+ case InstArithmetic::Or: |
+ _mov(T, Src0); |
+ _or(T, Src1); |
+ _mov(Dest, T); |
+ break; |
+ case InstArithmetic::Xor: |
+ _mov(T, Src0); |
+ _xor(T, Src1); |
+ _mov(Dest, T); |
+ break; |
+ case InstArithmetic::Sub: |
+ _mov(T, Src0); |
+ _sub(T, Src1); |
+ _mov(Dest, T); |
+ break; |
+ case InstArithmetic::Mul: |
+ if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) { |
+ if (optimizeScalarMul(Dest, Src0, C->getValue())) |
+ return; |
+ } |
+ // The 8-bit version of imul only allows the form "imul r/m8" |
+ // where T must be in eax. |
+ if (isByteSizedArithType(Dest->getType())) { |
+ _mov(T, Src0, RegX8632::Reg_eax); |
+ Src1 = legalize(Src1, Legal_Reg | Legal_Mem); |
+ } else { |
_mov(T, Src0); |
- _xor(T, Src1); |
+ } |
+ _imul(T, Src1); |
+ _mov(Dest, T); |
+ break; |
+ case InstArithmetic::Shl: |
+ _mov(T, Src0); |
+ if (!llvm::isa<Constant>(Src1)) |
+ Src1 = legalizeToVar(Src1, RegX8632::Reg_ecx); |
+ _shl(T, Src1); |
+ _mov(Dest, T); |
+ break; |
+ case InstArithmetic::Lshr: |
+ _mov(T, Src0); |
+ if (!llvm::isa<Constant>(Src1)) |
+ Src1 = legalizeToVar(Src1, RegX8632::Reg_ecx); |
+ _shr(T, Src1); |
+ _mov(Dest, T); |
+ break; |
+ case InstArithmetic::Ashr: |
+ _mov(T, Src0); |
+ if (!llvm::isa<Constant>(Src1)) |
+ Src1 = legalizeToVar(Src1, RegX8632::Reg_ecx); |
+ _sar(T, Src1); |
+ _mov(Dest, T); |
+ break; |
+ case InstArithmetic::Udiv: |
+ // div and idiv are the few arithmetic operators that do not allow |
+ // immediates as the operand. |
+ Src1 = legalize(Src1, Legal_Reg | Legal_Mem); |
+ if (isByteSizedArithType(Dest->getType())) { |
+ Variable *T_ah = nullptr; |
+ Constant *Zero = Ctx->getConstantZero(IceType_i8); |
+ _mov(T, Src0, RegX8632::Reg_eax); |
+ _mov(T_ah, Zero, RegX8632::Reg_ah); |
+ _div(T, Src1, T_ah); |
_mov(Dest, T); |
- break; |
- case InstArithmetic::Sub: |
- _mov(T, Src0); |
- _sub(T, Src1); |
+ } else { |
+ Constant *Zero = Ctx->getConstantZero(IceType_i32); |
+ _mov(T, Src0, RegX8632::Reg_eax); |
+ _mov(T_edx, Zero, RegX8632::Reg_edx); |
+ _div(T, Src1, T_edx); |
_mov(Dest, T); |
- break; |
- case InstArithmetic::Mul: |
- // TODO: Optimize for llvm::isa<Constant>(Src1) |
- // TODO: Strength-reduce multiplications by a constant, |
- // particularly -1 and powers of 2. Advanced: use lea to |
- // multiply by 3, 5, 9. |
- // |
- // The 8-bit version of imul only allows the form "imul r/m8" |
- // where T must be in eax. |
- if (isByteSizedArithType(Dest->getType())) { |
- _mov(T, Src0, RegX8632::Reg_eax); |
- Src1 = legalize(Src1, Legal_Reg | Legal_Mem); |
- } else { |
- _mov(T, Src0); |
+ } |
+ break; |
+ case InstArithmetic::Sdiv: |
+ // TODO(stichnot): Enable this after doing better performance |
+ // and cross testing. |
+ if (false && Ctx->getFlags().getOptLevel() >= Opt_1) { |
+ // Optimize division by constant power of 2, but not for Om1 |
+ // or O0, just to keep things simple there. |
+ if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) { |
+ int32_t Divisor = C->getValue(); |
+ uint32_t UDivisor = static_cast<uint32_t>(Divisor); |
+ if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) { |
+ uint32_t LogDiv = llvm::Log2_32(UDivisor); |
+ Type Ty = Dest->getType(); |
+ // LLVM does the following for dest=src/(1<<log): |
+ // t=src |
+ // sar t,typewidth-1 // -1 if src is negative, 0 if not |
+ // shr t,typewidth-log |
+ // add t,src |
+ // sar t,log |
+ // dest=t |
+ uint32_t TypeWidth = X86_CHAR_BIT * typeWidthInBytes(Ty); |
+ _mov(T, Src0); |
+ // If for some reason we are dividing by 1, just treat it |
+ // like an assignment. |
+ if (LogDiv > 0) { |
+ // The initial sar is unnecessary when dividing by 2. |
+ if (LogDiv > 1) |
+ _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1)); |
+ _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv)); |
+ _add(T, Src0); |
+ _sar(T, Ctx->getConstantInt(Ty, LogDiv)); |
+ } |
+ _mov(Dest, T); |
+ return; |
+ } |
} |
- _imul(T, Src1); |
- _mov(Dest, T); |
- break; |
- case InstArithmetic::Shl: |
- _mov(T, Src0); |
- if (!llvm::isa<Constant>(Src1)) |
- Src1 = legalizeToVar(Src1, RegX8632::Reg_ecx); |
- _shl(T, Src1); |
- _mov(Dest, T); |
- break; |
- case InstArithmetic::Lshr: |
- _mov(T, Src0); |
- if (!llvm::isa<Constant>(Src1)) |
- Src1 = legalizeToVar(Src1, RegX8632::Reg_ecx); |
- _shr(T, Src1); |
+ } |
+ Src1 = legalize(Src1, Legal_Reg | Legal_Mem); |
+ if (isByteSizedArithType(Dest->getType())) { |
+ _mov(T, Src0, RegX8632::Reg_eax); |
+ _cbwdq(T, T); |
+ _idiv(T, Src1, T); |
_mov(Dest, T); |
- break; |
- case InstArithmetic::Ashr: |
- _mov(T, Src0); |
- if (!llvm::isa<Constant>(Src1)) |
- Src1 = legalizeToVar(Src1, RegX8632::Reg_ecx); |
- _sar(T, Src1); |
+ } else { |
+ T_edx = makeReg(IceType_i32, RegX8632::Reg_edx); |
+ _mov(T, Src0, RegX8632::Reg_eax); |
+ _cbwdq(T_edx, T); |
+ _idiv(T, Src1, T_edx); |
_mov(Dest, T); |
- break; |
- case InstArithmetic::Udiv: |
- // div and idiv are the few arithmetic operators that do not allow |
- // immediates as the operand. |
- Src1 = legalize(Src1, Legal_Reg | Legal_Mem); |
- if (isByteSizedArithType(Dest->getType())) { |
- Variable *T_ah = nullptr; |
- Constant *Zero = Ctx->getConstantZero(IceType_i8); |
- _mov(T, Src0, RegX8632::Reg_eax); |
- _mov(T_ah, Zero, RegX8632::Reg_ah); |
- _div(T, Src1, T_ah); |
- _mov(Dest, T); |
- } else { |
- Constant *Zero = Ctx->getConstantZero(IceType_i32); |
- _mov(T, Src0, RegX8632::Reg_eax); |
- _mov(T_edx, Zero, RegX8632::Reg_edx); |
- _div(T, Src1, T_edx); |
- _mov(Dest, T); |
- } |
- break; |
- case InstArithmetic::Sdiv: |
- Src1 = legalize(Src1, Legal_Reg | Legal_Mem); |
- if (isByteSizedArithType(Dest->getType())) { |
- _mov(T, Src0, RegX8632::Reg_eax); |
- _cbwdq(T, T); |
- _idiv(T, Src1, T); |
- _mov(Dest, T); |
- } else { |
- T_edx = makeReg(IceType_i32, RegX8632::Reg_edx); |
- _mov(T, Src0, RegX8632::Reg_eax); |
- _cbwdq(T_edx, T); |
- _idiv(T, Src1, T_edx); |
- _mov(Dest, T); |
- } |
- break; |
- case InstArithmetic::Urem: |
- Src1 = legalize(Src1, Legal_Reg | Legal_Mem); |
- if (isByteSizedArithType(Dest->getType())) { |
- Variable *T_ah = nullptr; |
- Constant *Zero = Ctx->getConstantZero(IceType_i8); |
- _mov(T, Src0, RegX8632::Reg_eax); |
- _mov(T_ah, Zero, RegX8632::Reg_ah); |
- _div(T_ah, Src1, T); |
- _mov(Dest, T_ah); |
- } else { |
- Constant *Zero = Ctx->getConstantZero(IceType_i32); |
- _mov(T_edx, Zero, RegX8632::Reg_edx); |
- _mov(T, Src0, RegX8632::Reg_eax); |
- _div(T_edx, Src1, T); |
- _mov(Dest, T_edx); |
- } |
- break; |
- case InstArithmetic::Srem: |
- Src1 = legalize(Src1, Legal_Reg | Legal_Mem); |
- if (isByteSizedArithType(Dest->getType())) { |
- Variable *T_ah = makeReg(IceType_i8, RegX8632::Reg_ah); |
- _mov(T, Src0, RegX8632::Reg_eax); |
- _cbwdq(T, T); |
- Context.insert(InstFakeDef::create(Func, T_ah)); |
- _idiv(T_ah, Src1, T); |
- _mov(Dest, T_ah); |
- } else { |
- T_edx = makeReg(IceType_i32, RegX8632::Reg_edx); |
- _mov(T, Src0, RegX8632::Reg_eax); |
- _cbwdq(T_edx, T); |
- _idiv(T_edx, Src1, T); |
- _mov(Dest, T_edx); |
+ } |
+ break; |
+ case InstArithmetic::Urem: |
+ Src1 = legalize(Src1, Legal_Reg | Legal_Mem); |
+ if (isByteSizedArithType(Dest->getType())) { |
+ Variable *T_ah = nullptr; |
+ Constant *Zero = Ctx->getConstantZero(IceType_i8); |
+ _mov(T, Src0, RegX8632::Reg_eax); |
+ _mov(T_ah, Zero, RegX8632::Reg_ah); |
+ _div(T_ah, Src1, T); |
+ _mov(Dest, T_ah); |
+ } else { |
+ Constant *Zero = Ctx->getConstantZero(IceType_i32); |
+ _mov(T_edx, Zero, RegX8632::Reg_edx); |
+ _mov(T, Src0, RegX8632::Reg_eax); |
+ _div(T_edx, Src1, T); |
+ _mov(Dest, T_edx); |
+ } |
+ break; |
+ case InstArithmetic::Srem: |
+ // TODO(stichnot): Enable this after doing better performance |
+ // and cross testing. |
+ if (false && Ctx->getFlags().getOptLevel() >= Opt_1) { |
+ // Optimize mod by constant power of 2, but not for Om1 or O0, |
+ // just to keep things simple there. |
+ if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) { |
+ int32_t Divisor = C->getValue(); |
+ uint32_t UDivisor = static_cast<uint32_t>(Divisor); |
+ if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) { |
+ uint32_t LogDiv = llvm::Log2_32(UDivisor); |
+ Type Ty = Dest->getType(); |
+ // LLVM does the following for dest=src%(1<<log): |
+ // t=src |
+ // sar t,typewidth-1 // -1 if src is negative, 0 if not |
+ // shr t,typewidth-log |
+ // add t,src |
+ // and t, -(1<<log) |
+ // sub t,src |
+ // neg t |
+ // dest=t |
+ uint32_t TypeWidth = X86_CHAR_BIT * typeWidthInBytes(Ty); |
+ // If for some reason we are dividing by 1, just assign 0. |
+ if (LogDiv == 0) { |
+ _mov(Dest, Ctx->getConstantZero(Ty)); |
+ return; |
+ } |
+ _mov(T, Src0); |
+ // The initial sar is unnecessary when dividing by 2. |
+ if (LogDiv > 1) |
+ _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1)); |
+ _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv)); |
+ _add(T, Src0); |
+ _and(T, Ctx->getConstantInt(Ty, -(1 << LogDiv))); |
+ _sub(T, Src0); |
+ _neg(T); |
+ _mov(Dest, T); |
+ return; |
+ } |
} |
- break; |
- case InstArithmetic::Fadd: |
- _mov(T, Src0); |
- _addss(T, Src1); |
- _mov(Dest, T); |
- break; |
- case InstArithmetic::Fsub: |
- _mov(T, Src0); |
- _subss(T, Src1); |
- _mov(Dest, T); |
- break; |
- case InstArithmetic::Fmul: |
- _mov(T, Src0); |
- _mulss(T, Src1); |
- _mov(Dest, T); |
- break; |
- case InstArithmetic::Fdiv: |
- _mov(T, Src0); |
- _divss(T, Src1); |
- _mov(Dest, T); |
- break; |
- case InstArithmetic::Frem: { |
- const SizeT MaxSrcs = 2; |
- Type Ty = Dest->getType(); |
- InstCall *Call = |
- makeHelperCall(isFloat32Asserting32Or64(Ty) ? H_frem_f32 : H_frem_f64, |
- Dest, MaxSrcs); |
- Call->addArg(Src0); |
- Call->addArg(Src1); |
- return lowerCall(Call); |
- } break; |
} |
+ Src1 = legalize(Src1, Legal_Reg | Legal_Mem); |
+ if (isByteSizedArithType(Dest->getType())) { |
+ Variable *T_ah = makeReg(IceType_i8, RegX8632::Reg_ah); |
+ _mov(T, Src0, RegX8632::Reg_eax); |
+ _cbwdq(T, T); |
+ Context.insert(InstFakeDef::create(Func, T_ah)); |
+ _idiv(T_ah, Src1, T); |
+ _mov(Dest, T_ah); |
+ } else { |
+ T_edx = makeReg(IceType_i32, RegX8632::Reg_edx); |
+ _mov(T, Src0, RegX8632::Reg_eax); |
+ _cbwdq(T_edx, T); |
+ _idiv(T_edx, Src1, T); |
+ _mov(Dest, T_edx); |
+ } |
+ break; |
+ case InstArithmetic::Fadd: |
+ _mov(T, Src0); |
+ _addss(T, Src1); |
+ _mov(Dest, T); |
+ break; |
+ case InstArithmetic::Fsub: |
+ _mov(T, Src0); |
+ _subss(T, Src1); |
+ _mov(Dest, T); |
+ break; |
+ case InstArithmetic::Fmul: |
+ _mov(T, Src0); |
+ _mulss(T, Src1); |
+ _mov(Dest, T); |
+ break; |
+ case InstArithmetic::Fdiv: |
+ _mov(T, Src0); |
+ _divss(T, Src1); |
+ _mov(Dest, T); |
+ break; |
+ case InstArithmetic::Frem: { |
+ const SizeT MaxSrcs = 2; |
+ Type Ty = Dest->getType(); |
+ InstCall *Call = makeHelperCall( |
+ isFloat32Asserting32Or64(Ty) ? H_frem_f32 : H_frem_f64, Dest, MaxSrcs); |
+ Call->addArg(Src0); |
+ Call->addArg(Src1); |
+ return lowerCall(Call); |
+ } |
} |
} |
@@ -3119,10 +3292,11 @@ void TargetX8632::lowerIntrinsicCall(const InstIntrinsicCall *Instr) { |
Func->setError("Unexpected memory ordering for AtomicRMW"); |
return; |
} |
- lowerAtomicRMW(Instr->getDest(), |
- static_cast<uint32_t>(llvm::cast<ConstantInteger32>( |
- Instr->getArg(0))->getValue()), |
- Instr->getArg(1), Instr->getArg(2)); |
+ lowerAtomicRMW( |
+ Instr->getDest(), |
+ static_cast<uint32_t>( |
+ llvm::cast<ConstantInteger32>(Instr->getArg(0))->getValue()), |
+ Instr->getArg(1), Instr->getArg(2)); |
return; |
case Intrinsics::AtomicStore: { |
if (!Intrinsics::isMemoryOrderValid( |