Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(31)

Unified Diff: src/IceTargetLoweringX8632.cpp

Issue 1146803002: Subzero: Strength-reduce mul by certain constants. (Closed) Base URL: https://chromium.googlesource.com/native_client/pnacl-subzero.git@master
Patch Set: Remove a TODO Created 5 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « src/IceTargetLoweringX8632.h ('k') | tests_lit/assembler/x86/immediate_encodings.ll » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: src/IceTargetLoweringX8632.cpp
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 506b315079909210d3b1f7c9dc1fbee21e0e53ea..f6cd26f64f5299550de9cc42d20ee07c8f68e400 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -1287,6 +1287,102 @@ void TargetX8632::lowerAlloca(const InstAlloca *Inst) {
_mov(Dest, esp);
}
+// Strength-reduce scalar integer multiplication by a constant (for
+// i32 or narrower) for certain constants. The lea instruction can be
+// used to multiply by 3, 5, or 9, and the lsh instruction can be used
+// to multiply by powers of 2. These can be combined such that
+// e.g. multiplying by 100 can be done as 2 lea-based multiplies by 5,
+// combined with left-shifting by 2.
+bool TargetX8632::optimizeScalarMul(Variable *Dest, Operand *Src0,
+ int32_t Src1) {
+ // Disable this optimization for Om1 and O0, just to keep things
+ // simple there.
+ if (Ctx->getFlags().getOptLevel() < Opt_1)
+ return false;
+ Type Ty = Dest->getType();
+ Variable *T = nullptr;
+ if (Src1 == -1) {
+ _mov(T, Src0);
+ _neg(T);
+ _mov(Dest, T);
+ return true;
+ }
+ if (Src1 == 0) {
+ _mov(Dest, Ctx->getConstantZero(Ty));
+ return true;
+ }
+ if (Src1 == 1) {
+ _mov(T, Src0);
+ _mov(Dest, T);
+ return true;
+ }
+ // Don't bother with the edge case where Src1 == MININT.
+ if (Src1 == -Src1)
+ return false;
+ const bool Src1IsNegative = Src1 < 0;
+ if (Src1IsNegative)
+ Src1 = -Src1;
+ uint32_t Count9 = 0;
+ uint32_t Count5 = 0;
+ uint32_t Count3 = 0;
+ uint32_t Count2 = 0;
+ uint32_t CountOps = 0;
+ while (Src1 > 1) {
+ if (Src1 % 9 == 0) {
+ ++CountOps;
+ ++Count9;
+ Src1 /= 9;
+ } else if (Src1 % 5 == 0) {
+ ++CountOps;
+ ++Count5;
+ Src1 /= 5;
+ } else if (Src1 % 3 == 0) {
+ ++CountOps;
+ ++Count3;
+ Src1 /= 3;
+ } else if (Src1 % 2 == 0) {
+ if (Count2 == 0)
+ ++CountOps;
+ ++Count2;
+ Src1 /= 2;
+ } else {
+ return false;
+ }
+ }
+ // Lea optimization only works for i16 and i32 types, not i8.
+ if (Ty != IceType_i16 && Ty != IceType_i32 && (Count3 || Count5 || Count9))
+ return false;
+ // Limit the number of lea/shl operations for a single multiply, to
+ // a somewhat arbitrary choice of 3.
+ const uint32_t MaxOpsForOptimizedMul = 3;
+ if (CountOps > MaxOpsForOptimizedMul)
+ return false;
+ _mov(T, Src0);
+ Constant *Zero = Ctx->getConstantZero(IceType_i32);
+ for (uint32_t i = 0; i < Count9; ++i) {
+ const uint16_t Shift = 3; // log2(9-1)
+ _lea(T, OperandX8632Mem::create(Func, IceType_void, T, Zero, T, Shift));
+ _set_dest_nonkillable();
+ }
+ for (uint32_t i = 0; i < Count5; ++i) {
+ const uint16_t Shift = 2; // log2(5-1)
+ _lea(T, OperandX8632Mem::create(Func, IceType_void, T, Zero, T, Shift));
+ _set_dest_nonkillable();
+ }
+ for (uint32_t i = 0; i < Count3; ++i) {
+ const uint16_t Shift = 1; // log2(3-1)
+ _lea(T, OperandX8632Mem::create(Func, IceType_void, T, Zero, T, Shift));
+ _set_dest_nonkillable();
+ }
+ if (Count2) {
+ _shl(T, Ctx->getConstantInt(Ty, Count2));
+ }
+ if (Src1IsNegative)
+ _neg(T);
+ _mov(Dest, T);
+ return true;
+}
+
void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) {
Variable *Dest = Inst->getDest();
Operand *Src0 = legalize(Inst->getSrc(0));
@@ -1294,6 +1390,8 @@ void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) {
if (Inst->isCommutative()) {
if (!llvm::isa<Variable>(Src0) && llvm::isa<Variable>(Src1))
std::swap(Src0, Src1);
+ if (llvm::isa<Constant>(Src0) && !llvm::isa<Constant>(Src1))
+ std::swap(Src0, Src1);
}
if (Dest->getType() == IceType_i64) {
Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
@@ -1521,7 +1619,9 @@ void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) {
llvm_unreachable("FP instruction with i64 type");
break;
}
- } else if (isVectorType(Dest->getType())) {
+ return;
+ }
+ if (isVectorType(Dest->getType())) {
// TODO: Trap on integer divide and integer modulo by zero.
// See: https://code.google.com/p/nativeclient/issues/detail?id=3899
if (llvm::isa<OperandX8632Mem>(Src1))
@@ -1650,175 +1750,248 @@ void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) {
scalarizeArithmetic(Inst->getOp(), Dest, Src0, Src1);
break;
}
- } else { // Dest->getType() is non-i64 scalar
- Variable *T_edx = nullptr;
- Variable *T = nullptr;
- switch (Inst->getOp()) {
- case InstArithmetic::_num:
- llvm_unreachable("Unknown arithmetic operator");
- break;
- case InstArithmetic::Add:
- _mov(T, Src0);
- _add(T, Src1);
- _mov(Dest, T);
- break;
- case InstArithmetic::And:
- _mov(T, Src0);
- _and(T, Src1);
- _mov(Dest, T);
- break;
- case InstArithmetic::Or:
- _mov(T, Src0);
- _or(T, Src1);
- _mov(Dest, T);
- break;
- case InstArithmetic::Xor:
+ return;
+ }
+ Variable *T_edx = nullptr;
+ Variable *T = nullptr;
+ switch (Inst->getOp()) {
+ case InstArithmetic::_num:
+ llvm_unreachable("Unknown arithmetic operator");
+ break;
+ case InstArithmetic::Add:
+ _mov(T, Src0);
+ _add(T, Src1);
+ _mov(Dest, T);
+ break;
+ case InstArithmetic::And:
+ _mov(T, Src0);
+ _and(T, Src1);
+ _mov(Dest, T);
+ break;
+ case InstArithmetic::Or:
+ _mov(T, Src0);
+ _or(T, Src1);
+ _mov(Dest, T);
+ break;
+ case InstArithmetic::Xor:
+ _mov(T, Src0);
+ _xor(T, Src1);
+ _mov(Dest, T);
+ break;
+ case InstArithmetic::Sub:
+ _mov(T, Src0);
+ _sub(T, Src1);
+ _mov(Dest, T);
+ break;
+ case InstArithmetic::Mul:
+ if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
+ if (optimizeScalarMul(Dest, Src0, C->getValue()))
+ return;
+ }
+ // The 8-bit version of imul only allows the form "imul r/m8"
+ // where T must be in eax.
+ if (isByteSizedArithType(Dest->getType())) {
+ _mov(T, Src0, RegX8632::Reg_eax);
+ Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
+ } else {
_mov(T, Src0);
- _xor(T, Src1);
+ }
+ _imul(T, Src1);
+ _mov(Dest, T);
+ break;
+ case InstArithmetic::Shl:
+ _mov(T, Src0);
+ if (!llvm::isa<Constant>(Src1))
+ Src1 = legalizeToVar(Src1, RegX8632::Reg_ecx);
+ _shl(T, Src1);
+ _mov(Dest, T);
+ break;
+ case InstArithmetic::Lshr:
+ _mov(T, Src0);
+ if (!llvm::isa<Constant>(Src1))
+ Src1 = legalizeToVar(Src1, RegX8632::Reg_ecx);
+ _shr(T, Src1);
+ _mov(Dest, T);
+ break;
+ case InstArithmetic::Ashr:
+ _mov(T, Src0);
+ if (!llvm::isa<Constant>(Src1))
+ Src1 = legalizeToVar(Src1, RegX8632::Reg_ecx);
+ _sar(T, Src1);
+ _mov(Dest, T);
+ break;
+ case InstArithmetic::Udiv:
+ // div and idiv are the few arithmetic operators that do not allow
+ // immediates as the operand.
+ Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
+ if (isByteSizedArithType(Dest->getType())) {
+ Variable *T_ah = nullptr;
+ Constant *Zero = Ctx->getConstantZero(IceType_i8);
+ _mov(T, Src0, RegX8632::Reg_eax);
+ _mov(T_ah, Zero, RegX8632::Reg_ah);
+ _div(T, Src1, T_ah);
_mov(Dest, T);
- break;
- case InstArithmetic::Sub:
- _mov(T, Src0);
- _sub(T, Src1);
+ } else {
+ Constant *Zero = Ctx->getConstantZero(IceType_i32);
+ _mov(T, Src0, RegX8632::Reg_eax);
+ _mov(T_edx, Zero, RegX8632::Reg_edx);
+ _div(T, Src1, T_edx);
_mov(Dest, T);
- break;
- case InstArithmetic::Mul:
- // TODO: Optimize for llvm::isa<Constant>(Src1)
- // TODO: Strength-reduce multiplications by a constant,
- // particularly -1 and powers of 2. Advanced: use lea to
- // multiply by 3, 5, 9.
- //
- // The 8-bit version of imul only allows the form "imul r/m8"
- // where T must be in eax.
- if (isByteSizedArithType(Dest->getType())) {
- _mov(T, Src0, RegX8632::Reg_eax);
- Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
- } else {
- _mov(T, Src0);
+ }
+ break;
+ case InstArithmetic::Sdiv:
+ // TODO(stichnot): Enable this after doing better performance
+ // and cross testing.
+ if (false && Ctx->getFlags().getOptLevel() >= Opt_1) {
+ // Optimize division by constant power of 2, but not for Om1
+ // or O0, just to keep things simple there.
+ if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
+ int32_t Divisor = C->getValue();
+ uint32_t UDivisor = static_cast<uint32_t>(Divisor);
+ if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) {
+ uint32_t LogDiv = llvm::Log2_32(UDivisor);
+ Type Ty = Dest->getType();
+ // LLVM does the following for dest=src/(1<<log):
+ // t=src
+ // sar t,typewidth-1 // -1 if src is negative, 0 if not
+ // shr t,typewidth-log
+ // add t,src
+ // sar t,log
+ // dest=t
+ uint32_t TypeWidth = X86_CHAR_BIT * typeWidthInBytes(Ty);
+ _mov(T, Src0);
+ // If for some reason we are dividing by 1, just treat it
+ // like an assignment.
+ if (LogDiv > 0) {
+ // The initial sar is unnecessary when dividing by 2.
+ if (LogDiv > 1)
+ _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1));
+ _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv));
+ _add(T, Src0);
+ _sar(T, Ctx->getConstantInt(Ty, LogDiv));
+ }
+ _mov(Dest, T);
+ return;
+ }
}
- _imul(T, Src1);
- _mov(Dest, T);
- break;
- case InstArithmetic::Shl:
- _mov(T, Src0);
- if (!llvm::isa<Constant>(Src1))
- Src1 = legalizeToVar(Src1, RegX8632::Reg_ecx);
- _shl(T, Src1);
- _mov(Dest, T);
- break;
- case InstArithmetic::Lshr:
- _mov(T, Src0);
- if (!llvm::isa<Constant>(Src1))
- Src1 = legalizeToVar(Src1, RegX8632::Reg_ecx);
- _shr(T, Src1);
+ }
+ Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
+ if (isByteSizedArithType(Dest->getType())) {
+ _mov(T, Src0, RegX8632::Reg_eax);
+ _cbwdq(T, T);
+ _idiv(T, Src1, T);
_mov(Dest, T);
- break;
- case InstArithmetic::Ashr:
- _mov(T, Src0);
- if (!llvm::isa<Constant>(Src1))
- Src1 = legalizeToVar(Src1, RegX8632::Reg_ecx);
- _sar(T, Src1);
+ } else {
+ T_edx = makeReg(IceType_i32, RegX8632::Reg_edx);
+ _mov(T, Src0, RegX8632::Reg_eax);
+ _cbwdq(T_edx, T);
+ _idiv(T, Src1, T_edx);
_mov(Dest, T);
- break;
- case InstArithmetic::Udiv:
- // div and idiv are the few arithmetic operators that do not allow
- // immediates as the operand.
- Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
- if (isByteSizedArithType(Dest->getType())) {
- Variable *T_ah = nullptr;
- Constant *Zero = Ctx->getConstantZero(IceType_i8);
- _mov(T, Src0, RegX8632::Reg_eax);
- _mov(T_ah, Zero, RegX8632::Reg_ah);
- _div(T, Src1, T_ah);
- _mov(Dest, T);
- } else {
- Constant *Zero = Ctx->getConstantZero(IceType_i32);
- _mov(T, Src0, RegX8632::Reg_eax);
- _mov(T_edx, Zero, RegX8632::Reg_edx);
- _div(T, Src1, T_edx);
- _mov(Dest, T);
- }
- break;
- case InstArithmetic::Sdiv:
- Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
- if (isByteSizedArithType(Dest->getType())) {
- _mov(T, Src0, RegX8632::Reg_eax);
- _cbwdq(T, T);
- _idiv(T, Src1, T);
- _mov(Dest, T);
- } else {
- T_edx = makeReg(IceType_i32, RegX8632::Reg_edx);
- _mov(T, Src0, RegX8632::Reg_eax);
- _cbwdq(T_edx, T);
- _idiv(T, Src1, T_edx);
- _mov(Dest, T);
- }
- break;
- case InstArithmetic::Urem:
- Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
- if (isByteSizedArithType(Dest->getType())) {
- Variable *T_ah = nullptr;
- Constant *Zero = Ctx->getConstantZero(IceType_i8);
- _mov(T, Src0, RegX8632::Reg_eax);
- _mov(T_ah, Zero, RegX8632::Reg_ah);
- _div(T_ah, Src1, T);
- _mov(Dest, T_ah);
- } else {
- Constant *Zero = Ctx->getConstantZero(IceType_i32);
- _mov(T_edx, Zero, RegX8632::Reg_edx);
- _mov(T, Src0, RegX8632::Reg_eax);
- _div(T_edx, Src1, T);
- _mov(Dest, T_edx);
- }
- break;
- case InstArithmetic::Srem:
- Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
- if (isByteSizedArithType(Dest->getType())) {
- Variable *T_ah = makeReg(IceType_i8, RegX8632::Reg_ah);
- _mov(T, Src0, RegX8632::Reg_eax);
- _cbwdq(T, T);
- Context.insert(InstFakeDef::create(Func, T_ah));
- _idiv(T_ah, Src1, T);
- _mov(Dest, T_ah);
- } else {
- T_edx = makeReg(IceType_i32, RegX8632::Reg_edx);
- _mov(T, Src0, RegX8632::Reg_eax);
- _cbwdq(T_edx, T);
- _idiv(T_edx, Src1, T);
- _mov(Dest, T_edx);
+ }
+ break;
+ case InstArithmetic::Urem:
+ Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
+ if (isByteSizedArithType(Dest->getType())) {
+ Variable *T_ah = nullptr;
+ Constant *Zero = Ctx->getConstantZero(IceType_i8);
+ _mov(T, Src0, RegX8632::Reg_eax);
+ _mov(T_ah, Zero, RegX8632::Reg_ah);
+ _div(T_ah, Src1, T);
+ _mov(Dest, T_ah);
+ } else {
+ Constant *Zero = Ctx->getConstantZero(IceType_i32);
+ _mov(T_edx, Zero, RegX8632::Reg_edx);
+ _mov(T, Src0, RegX8632::Reg_eax);
+ _div(T_edx, Src1, T);
+ _mov(Dest, T_edx);
+ }
+ break;
+ case InstArithmetic::Srem:
+ // TODO(stichnot): Enable this after doing better performance
+ // and cross testing.
+ if (false && Ctx->getFlags().getOptLevel() >= Opt_1) {
+ // Optimize mod by constant power of 2, but not for Om1 or O0,
+ // just to keep things simple there.
+ if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
+ int32_t Divisor = C->getValue();
+ uint32_t UDivisor = static_cast<uint32_t>(Divisor);
+ if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) {
+ uint32_t LogDiv = llvm::Log2_32(UDivisor);
+ Type Ty = Dest->getType();
+ // LLVM does the following for dest=src%(1<<log):
+ // t=src
+ // sar t,typewidth-1 // -1 if src is negative, 0 if not
+ // shr t,typewidth-log
+ // add t,src
+ // and t, -(1<<log)
+ // sub t,src
+ // neg t
+ // dest=t
+ uint32_t TypeWidth = X86_CHAR_BIT * typeWidthInBytes(Ty);
+ // If for some reason we are dividing by 1, just assign 0.
+ if (LogDiv == 0) {
+ _mov(Dest, Ctx->getConstantZero(Ty));
+ return;
+ }
+ _mov(T, Src0);
+ // The initial sar is unnecessary when dividing by 2.
+ if (LogDiv > 1)
+ _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1));
+ _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv));
+ _add(T, Src0);
+ _and(T, Ctx->getConstantInt(Ty, -(1 << LogDiv)));
+ _sub(T, Src0);
+ _neg(T);
+ _mov(Dest, T);
+ return;
+ }
}
- break;
- case InstArithmetic::Fadd:
- _mov(T, Src0);
- _addss(T, Src1);
- _mov(Dest, T);
- break;
- case InstArithmetic::Fsub:
- _mov(T, Src0);
- _subss(T, Src1);
- _mov(Dest, T);
- break;
- case InstArithmetic::Fmul:
- _mov(T, Src0);
- _mulss(T, Src1);
- _mov(Dest, T);
- break;
- case InstArithmetic::Fdiv:
- _mov(T, Src0);
- _divss(T, Src1);
- _mov(Dest, T);
- break;
- case InstArithmetic::Frem: {
- const SizeT MaxSrcs = 2;
- Type Ty = Dest->getType();
- InstCall *Call =
- makeHelperCall(isFloat32Asserting32Or64(Ty) ? H_frem_f32 : H_frem_f64,
- Dest, MaxSrcs);
- Call->addArg(Src0);
- Call->addArg(Src1);
- return lowerCall(Call);
- } break;
}
+ Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
+ if (isByteSizedArithType(Dest->getType())) {
+ Variable *T_ah = makeReg(IceType_i8, RegX8632::Reg_ah);
+ _mov(T, Src0, RegX8632::Reg_eax);
+ _cbwdq(T, T);
+ Context.insert(InstFakeDef::create(Func, T_ah));
+ _idiv(T_ah, Src1, T);
+ _mov(Dest, T_ah);
+ } else {
+ T_edx = makeReg(IceType_i32, RegX8632::Reg_edx);
+ _mov(T, Src0, RegX8632::Reg_eax);
+ _cbwdq(T_edx, T);
+ _idiv(T_edx, Src1, T);
+ _mov(Dest, T_edx);
+ }
+ break;
+ case InstArithmetic::Fadd:
+ _mov(T, Src0);
+ _addss(T, Src1);
+ _mov(Dest, T);
+ break;
+ case InstArithmetic::Fsub:
+ _mov(T, Src0);
+ _subss(T, Src1);
+ _mov(Dest, T);
+ break;
+ case InstArithmetic::Fmul:
+ _mov(T, Src0);
+ _mulss(T, Src1);
+ _mov(Dest, T);
+ break;
+ case InstArithmetic::Fdiv:
+ _mov(T, Src0);
+ _divss(T, Src1);
+ _mov(Dest, T);
+ break;
+ case InstArithmetic::Frem: {
+ const SizeT MaxSrcs = 2;
+ Type Ty = Dest->getType();
+ InstCall *Call = makeHelperCall(
+ isFloat32Asserting32Or64(Ty) ? H_frem_f32 : H_frem_f64, Dest, MaxSrcs);
+ Call->addArg(Src0);
+ Call->addArg(Src1);
+ return lowerCall(Call);
+ }
}
}
@@ -3119,10 +3292,11 @@ void TargetX8632::lowerIntrinsicCall(const InstIntrinsicCall *Instr) {
Func->setError("Unexpected memory ordering for AtomicRMW");
return;
}
- lowerAtomicRMW(Instr->getDest(),
- static_cast<uint32_t>(llvm::cast<ConstantInteger32>(
- Instr->getArg(0))->getValue()),
- Instr->getArg(1), Instr->getArg(2));
+ lowerAtomicRMW(
+ Instr->getDest(),
+ static_cast<uint32_t>(
+ llvm::cast<ConstantInteger32>(Instr->getArg(0))->getValue()),
+ Instr->getArg(1), Instr->getArg(2));
return;
case Intrinsics::AtomicStore: {
if (!Intrinsics::isMemoryOrderValid(
« no previous file with comments | « src/IceTargetLoweringX8632.h ('k') | tests_lit/assembler/x86/immediate_encodings.ll » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698