src/IceTargetLoweringX8632.cpp - Issue 390443005: Lower bitmanip intrinsics, assuming absence of BMI/SSE4.2 for now.

Unified Diff: src/IceTargetLoweringX8632.cpp

Issue 390443005: Lower bitmanip intrinsics, assuming absence of BMI/SSE4.2 for now. (Closed) Base URL: https://chromium.googlesource.com/native_client/pnacl-subzero.git@master

Patch Set: stuff Created 6 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/IceTargetLoweringX8632.cpp

diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp

index bf11573b69c4e9b0ed1c084a35150bd887550607..90d25332bccd7f01ceffcba635431ea0aecf21ce 100644

--- a/src/IceTargetLoweringX8632.cpp

+++ b/src/IceTargetLoweringX8632.cpp

@@ -39,7 +39,7 @@ namespace {

const struct TableFcmp_ {

uint32_t Default;

bool SwapOperands;

- InstX8632Br::BrCond C1, C2;

+ InstX8632::BrCond C1, C2;

} TableFcmp[] = {

#define X(val, dflt, swap, C1, C2) \

{ dflt, swap, InstX8632Br::C1, InstX8632Br::C2 } \

@@ -54,7 +54,7 @@ const size_t TableFcmpSize = llvm::array_lengthof(TableFcmp);

// x86 conditional branch instruction.

const struct TableIcmp32_ {

- InstX8632Br::BrCond Mapping;

+ InstX8632::BrCond Mapping;

} TableIcmp32[] = {

#define X(val, C_32, C1_64, C2_64, C3_64) \

{ InstX8632Br::C_32 } \

@@ -69,7 +69,7 @@ const size_t TableIcmp32Size = llvm::array_lengthof(TableIcmp32);

// conditional branches are needed. For the other conditions, three separate

// conditional branches are needed.

const struct TableIcmp64_ {

- InstX8632Br::BrCond C1, C2, C3;

+ InstX8632::BrCond C1, C2, C3;

} TableIcmp64[] = {

#define X(val, C_32, C1_64, C2_64, C3_64) \

{ InstX8632Br::C1_64, InstX8632Br::C2_64, InstX8632Br::C3_64 } \

@@ -79,7 +79,7 @@ const struct TableIcmp64_ {

};

const size_t TableIcmp64Size = llvm::array_lengthof(TableIcmp64);

-InstX8632Br::BrCond getIcmp32Mapping(InstIcmp::ICond Cond) {

+InstX8632::BrCond getIcmp32Mapping(InstIcmp::ICond Cond) {

size_t Index = static_cast<size_t>(Cond);

assert(Index < TableIcmp32Size);

return TableIcmp32[Index].Mapping;

@@ -2108,11 +2108,36 @@ void TargetX8632::lowerIntrinsicCall(const InstIntrinsicCall *Instr) {

return;

}

case Intrinsics::Bswap:

+ Func->setError("Unhandled intrinsic");

+ return;

+ case Intrinsics::Ctpop: {

+ Variable *Dest = Instr->getDest();

+ Operand *Val = Instr->getArg(0);

+ InstCall *Call = makeHelperCall(Val->getType() == IceType_i32 ?

+ "__popcountsi2" : "__popcountdi2", Dest, 1);

+ Call->addArg(Val);

+ lowerCall(Call);

+ // The popcount helpers always return 32-bit values, while the intrinsic's

+ // signature matches the native POPCNT instruction and fills a 64-bit reg

+ // (in 64-bit mode). Thus, clear the upper bits of the dest just in case

+ // the user doesn't do that in the IR. If the user does that in the IR,

+ // then this zeroing instruction is dead and gets optimized out.

+ if (Val->getType() == IceType_i64) {

+ Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));

+ Constant *Zero = Ctx->getConstantZero(IceType_i32);

+ _mov(DestHi, Zero);

+ }

+ return;

+ }

case Intrinsics::Ctlz:

- case Intrinsics::Ctpop:

+ // The "is zero undef" parameter is ignored and we always return

+ // a well-defined value.

+ lowerCtlz(Instr->getDest(), Instr->getArg(0));

+ return;

case Intrinsics::Cttz:

- // TODO(jvoung): fill it in.

- Func->setError("Unhandled intrinsic");

+ // The "is zero undef" parameter is ignored and we always return

+ // a well-defined value.

+ lowerCttz(Instr->getDest(), Instr->getArg(0));

return;

case Intrinsics::Longjmp: {

InstCall *Call = makeHelperCall("longjmp", NULL, 2);

@@ -2407,6 +2432,98 @@ void TargetX8632::expandAtomicRMWAsCmpxchg(LowerBinOp Op_Lo, LowerBinOp Op_Hi,

_mov(Dest, T_eax);

}

+// We could do constant folding here, but that should have

+// been done by the front-end/middle-end optimizations.

+void TargetX8632::lowerCtlz(Variable *Dest, Operand *Val) {

+ // TODO(jvoung): Determine if the user CPU supports LZCNT (BMI).

+ // Then the instructions will handle the Val == 0 case much more simply

+ // and won't require conversion from bit position to number of zeros.

+ //

+ // Otherwise:

+ // bsr IF_NOT_ZERO, Val

+ // mov DEST, 63

+ // cmovne DEST, IF_NOT_ZERO

Jim Stichnoth 2014/07/14 23:20:45 Might want to document that the cmov instruction r

jvoung (off chromium) 2014/07/15 21:30:23 Done. I think the cvt instructions and some/most

+ // xor DEST, 31

+ //

+ // The xor DEST, 31 converts a bit position to the number of leading zeros.

+ // E.g., for 000... 00001100, bsr will say that the most significant bit

+ // set is at position 3, while the number of leading zeros is 28. Xor is

+ // like (31 - N) for N <= 31, and converts 63 to 32 (for the all-zeros case).

+ //

+ // Similar for 64-bit, but start w/ speculating that the upper 32 bits

+ // are all zero, and compute the result for that case (checking the lower

+ // 32 bits). Then actually compute the result for the upper bits and

+ // cmov in the result from the lower computation if the earlier speculation

+ // was correct.

+ Type Ty = Val->getType();

+ assert(Ty == IceType_i32 || Ty == IceType_i64);

+ Val = legalize(Val);

+ Operand *FirstVal = Ty == IceType_i32 ? Val : loOperand(Val);

+ Variable *T = makeReg(IceType_i32);

+ _bsr(T, FirstVal);

+ Variable *T_Dest = makeReg(IceType_i32);

+ Constant *SixtyThree = Ctx->getConstantInt(IceType_i32, 63);

+ Constant *ThirtyOne = Ctx->getConstantInt(IceType_i32, 31);

+ _mov(T_Dest, SixtyThree);

+ _cmov(T_Dest, T, InstX8632::Br_ne);

+ _xor(T_Dest, ThirtyOne);

+ if (Ty == IceType_i32) {

+ _mov(Dest, T_Dest);

+ return;

+ }

+ Constant *ThirtyTwo = Ctx->getConstantInt(IceType_i32, 32);

+ _add(T_Dest, ThirtyTwo);

+ Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));

+ Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));

+ // Will be using "test" on this, so we need a registerized version.

+ Variable *HigherVar = legalizeToVar(hiOperand(Val));

+ Variable *T_Dest2 = makeReg(IceType_i32);

+ _bsr(T_Dest2, HigherVar);

+ _xor(T_Dest2, ThirtyOne);

+ _test(HigherVar, HigherVar);

+ _cmov(T_Dest2, T_Dest, InstX8632::Br_e);

+ _mov(DestLo, T_Dest2);

+ _mov(DestHi, Ctx->getConstantZero(IceType_i32));

+void TargetX8632::lowerCttz(Variable *Dest, Operand *Val) {

+ // TODO(jvoung): Determine if the user CPU supports TZCNT (BMI).

Jim Stichnoth 2014/07/14 23:20:45 Would it make sense to refactor this and lowerCtlz

jvoung (off chromium) 2014/07/15 21:30:23 Hmm, I tried that in the new patch set. There are

+ // Then the instructions will handle the Val == 0 case much more simply.

+ //

+ // Otherwise:

+ // bsf IF_NOT_ZERO, Val

+ // mov IF_ZERO, 32

+ // cmovne IF_ZERO, IF_NOT_ZERO

+ //

+ // Similar for 64-bit, but start w/ speculating that the bottom 32 bits

+ // are all zero.

+ Type Ty = Val->getType();

+ assert(Ty == IceType_i32 || Ty == IceType_i64);

+ Val = legalize(Val);

+ Operand *FirstVal = Ty == IceType_i32 ? Val : hiOperand(Val);

+ Variable *T = makeReg(IceType_i32);

+ _bsf(T, FirstVal);

+ Variable *T_Dest = makeReg(IceType_i32);

+ Constant *ThirtyTwo = Ctx->getConstantInt(IceType_i32, 32);

+ _mov(T_Dest, ThirtyTwo);

+ _cmov(T_Dest, T, InstX8632::Br_ne);

+ if (Ty == IceType_i32) {

+ _mov(Dest, T_Dest);

+ return;

+ }

+ _add(T_Dest, ThirtyTwo);

+ Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));

+ Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));

+ // Will be using "test" on this, so we need a registerized version.

+ Variable *LowerVar = legalizeToVar(loOperand(Val));

+ Variable *T_Dest2 = makeReg(IceType_i32);

+ _bsf(T_Dest2, LowerVar);

+ _test(LowerVar, LowerVar);

+ _cmov(T_Dest2, T_Dest, InstX8632::Br_e);

+ _mov(DestLo, T_Dest2);

+ _mov(DestHi, Ctx->getConstantZero(IceType_i32));

namespace {

bool isAdd(const Inst *Inst) {

« src/IceInstX8632.cpp ('K') | « src/IceTargetLoweringX8632.h ('k') | tests_lit/llvm2ice_tests/nacl-other-intrinsics.ll » ('j') | tests_lit/llvm2ice_tests/nacl-other-intrinsics.ll » ('J')