src/IceTargetLoweringX8632.cpp - Issue 1146803002: Subzero: Strength-reduce mul by certain constants.

Unified Diff: src/IceTargetLoweringX8632.cpp

Issue 1146803002: Subzero: Strength-reduce mul by certain constants. (Closed) Base URL: https://chromium.googlesource.com/native_client/pnacl-subzero.git@master

Patch Set: Fill out the cross test. Cleanup. Created 5 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/IceTargetLoweringX8632.cpp

diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp

index b8b633ac27bb1e9cafc320706d8eb76505be55c8..a9a7acd2364c57388bc38e4ecec47d58af2b6385 100644

--- a/src/IceTargetLoweringX8632.cpp

+++ b/src/IceTargetLoweringX8632.cpp

@@ -1287,6 +1287,88 @@ void TargetX8632::lowerAlloca(const InstAlloca *Inst) {

_mov(Dest, esp);

}

+// Strength-reduce scalar integer multiplication by a constant (for

+// i32 or narrower) for certain constants. The lea instruction can be

+// used to multiply by 3, 5, or 9, and the lsh instruction can be used

+// to multiply by powers of 2. These can be combined such that

+// e.g. multiplying by 100 can be done as 2 lea-based multiplies by 5,

+// combined with left-shifting by 2.

+bool TargetX8632::optimizeScalarMul(Variable *Dest, Operand *Src0,

+ int32_t Src1) {

+ // Disable this optimization for Om1 and O0, just to keep things

+ // simple there.

+ if (Ctx->getFlags().getOptLevel() < Opt_1)

+ return false;

+ Variable *T = nullptr;

+ if (Src1 == -1) {

+ _mov(T, Src0);

+ _neg(T);

+ _mov(Dest, T);

+ return true;

+ }

+ if (Src1 == 1) {

+ _mov(T, Src0);

+ _mov(Dest, T);

+ return true;

+ }

+ if (Src1 <= 0)

John 2015/06/10 17:27:52 Just throwing an idea... would it work to invert S

Jim Stichnoth 2015/06/12 17:31:47 Done. Though I suppress that when the constant is

+ return false;

+ uint32_t Count9 = 0;

+ uint32_t Count5 = 0;

+ uint32_t Count3 = 0;

+ uint32_t Count2 = 0;

+ uint32_t CountOps = 0;

+ while (Src1 > 1) {

+ if (Src1 % 9 == 0) {

+ ++CountOps;

+ ++Count9;

+ Src1 /= 9;

+ } else if (Src1 % 5 == 0) {

+ ++CountOps;

+ ++Count5;

+ Src1 /= 5;

+ } else if (Src1 % 3 == 0) {

+ ++CountOps;

+ ++Count3;

+ Src1 /= 3;

+ } else if (Src1 % 2 == 0) {

+ if (Count2 == 0)

+ ++CountOps;

+ ++Count2;

+ Src1 /= 2;

+ } else {

+ return false;

+ }

+ // Limit the number of lea/shl operations for a single multiply, to

+ // a somewhat arbitrary choice of 5.

+ const uint32_t MaxOpsForOptimizedMul = 5;

jvoung (off chromium) 2015/06/10 18:12:27 FWIW, some of the agner tables say mul r/i is abou

John 2015/06/10 23:47:19 This little command echo "int v(int i) { return i

Jim Stichnoth 2015/06/12 17:31:47 OK. I reduced the threshold to 3.

+ if (CountOps > MaxOpsForOptimizedMul)

+ return false;

+ _mov(T, Src0);

+ Constant *Zero = Ctx->getConstantZero(IceType_i32);

+ for (uint32_t i = 0; i < Count9; ++i) {

+ const uint16_t Shift = 3; // log2(9-1)

+ _lea(T, OperandX8632Mem::create(Func, IceType_void, T, Zero, T, Shift));

+ _set_dest_nonkillable();

+ }

+ for (uint32_t i = 0; i < Count5; ++i) {

+ const uint16_t Shift = 2; // log2(5-1)

+ _lea(T, OperandX8632Mem::create(Func, IceType_void, T, Zero, T, Shift));

+ _set_dest_nonkillable();

+ }

+ for (uint32_t i = 0; i < Count3; ++i) {

+ const uint16_t Shift = 1; // log2(3-1)

+ _lea(T, OperandX8632Mem::create(Func, IceType_void, T, Zero, T, Shift));

+ _set_dest_nonkillable();

+ }

+ if (Count2) {

+ _shl(T, Ctx->getConstantInt(Dest->getType(), Count2));

+ }

+ _mov(Dest, T);

+ return true;

void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) {

Variable *Dest = Inst->getDest();

Operand *Src0 = legalize(Inst->getSrc(0));

@@ -1294,6 +1376,8 @@ void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) {

if (Inst->isCommutative()) {

if (!llvm::isa<Variable>(Src0) && llvm::isa<Variable>(Src1))

std::swap(Src0, Src1);

+ if (llvm::isa<Constant>(Src0) && !llvm::isa<Constant>(Src1))

+ std::swap(Src0, Src1);

}

if (Dest->getType() == IceType_i64) {

Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));

@@ -1521,7 +1605,9 @@ void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) {

llvm_unreachable("FP instruction with i64 type");

break;

}

- } else if (isVectorType(Dest->getType())) {

+ return;

+ }

+ if (isVectorType(Dest->getType())) {

// TODO: Trap on integer divide and integer modulo by zero.

// See: https://code.google.com/p/nativeclient/issues/detail?id=3899

if (llvm::isa<OperandX8632Mem>(Src1))

@@ -1650,7 +1736,10 @@ void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) {

scalarizeArithmetic(Inst->getOp(), Dest, Src0, Src1);

break;

}

- } else { // Dest->getType() is non-i64 scalar

+ return;

+ }

+ { // TODO(stichnot): Remove this level of {}. (It's still there to

+ // make the diffs easier to read.)

Variable *T_edx = nullptr;

Variable *T = nullptr;

switch (Inst->getOp()) {

@@ -1683,11 +1772,10 @@ void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) {

_mov(Dest, T);

break;

case InstArithmetic::Mul:

- // TODO: Optimize for llvm::isa<Constant>(Src1)

- // TODO: Strength-reduce multiplications by a constant,

- // particularly -1 and powers of 2. Advanced: use lea to

- // multiply by 3, 5, 9.

- //

+ if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {

+ if (optimizeScalarMul(Dest, Src0, C->getValue()))

+ return;

+ }

// The 8-bit version of imul only allows the form "imul r/m8"

// where T must be in eax.

if (isByteSizedArithType(Dest->getType())) {

@@ -1740,6 +1828,41 @@ void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) {

}

break;

case InstArithmetic::Sdiv:

+ // TODO(stichnot): Enable this after doing better performance

+ // and cross testing.

+ if (false && Ctx->getFlags().getOptLevel() >= Opt_1) {

+ // Optimize division by constant power of 2, but not for Om1

+ // or O0, just to keep things simple there.

+ if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {

+ int32_t Divisor = C->getValue();

+ uint32_t UDivisor = static_cast<uint32_t>(Divisor);

+ if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) {

+ uint32_t LogDiv = llvm::Log2_32(UDivisor);

+ Type Ty = Dest->getType();

+ // LLVM does the following for dest=src/(1<<log):

+ // t=src

+ // sar t,typewidth-1 // -1 if src is negative, 0 if not

+ // shr t,typewidth-log

+ // add t,src

+ // sar t,log

+ // dest=t

+ uint32_t TypeWidth = X86_CHAR_BIT * typeWidthInBytes(Ty);

+ _mov(T, Src0);

+ // If for some reason we are dividing by 1, just treat it

+ // like an assignment.

+ if (LogDiv > 0) {

+ // The initial sar is unnecessary when dividing by 2.

+ if (LogDiv > 1)

+ _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1));

+ _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv));

+ _add(T, Src0);

+ _sar(T, Ctx->getConstantInt(Ty, LogDiv));

+ }

+ _mov(Dest, T);

+ return;

+ }

Src1 = legalize(Src1, Legal_Reg | Legal_Mem);

if (isByteSizedArithType(Dest->getType())) {

_mov(T, Src0, RegX8632::Reg_eax);

@@ -1772,6 +1895,46 @@ void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) {

}

break;

case InstArithmetic::Srem:

+ // TODO(stichnot): Enable this after doing better performance

+ // and cross testing.

+ if (false && Ctx->getFlags().getOptLevel() >= Opt_1) {

+ // Optimize mod by constant power of 2, but not for Om1 or O0,

+ // just to keep things simple there.

+ if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {

+ int32_t Divisor = C->getValue();

+ uint32_t UDivisor = static_cast<uint32_t>(Divisor);

+ if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) {

+ uint32_t LogDiv = llvm::Log2_32(UDivisor);

+ Type Ty = Dest->getType();

+ // LLVM does the following for dest=src%(1<<log):

+ // t=src

+ // sar t,typewidth-1 // -1 if src is negative, 0 if not

+ // shr t,typewidth-log

+ // add t,src

+ // and t, -(1<<log)

+ // sub t,src

+ // neg t

+ // dest=t

+ uint32_t TypeWidth = X86_CHAR_BIT * typeWidthInBytes(Ty);

+ // If for some reason we are dividing by 1, just assign 0.

+ if (LogDiv == 0) {

+ _mov(Dest, Ctx->getConstantZero(Ty));

+ return;

+ }

+ _mov(T, Src0);

+ // The initial sar is unnecessary when dividing by 2.

+ if (LogDiv > 1)

+ _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1));

+ _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv));

+ _add(T, Src0);

+ _and(T, Ctx->getConstantInt(Ty, -(1 << LogDiv)));

+ _sub(T, Src0);

+ _neg(T);

+ _mov(Dest, T);

+ return;

+ }

Src1 = legalize(Src1, Legal_Reg | Legal_Mem);

if (isByteSizedArithType(Dest->getType())) {

Variable *T_ah = makeReg(IceType_i8, RegX8632::Reg_ah);

@@ -1817,7 +1980,7 @@ void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) {

Call->addArg(Src0);

Call->addArg(Src1);

return lowerCall(Call);

- } break;

+ }

}

« no previous file with comments | « src/IceTargetLoweringX8632.h ('k') | tests_lit/assembler/x86/immediate_encodings.ll » ('j') | no next file with comments »