src/IceTargetLoweringX8632.cpp - Issue 427843002: Subzero: Add support for SSE4.1 instructions.

Unified Diff: src/IceTargetLoweringX8632.cpp

Issue 427843002: Subzero: Add support for SSE4.1 instructions. (Closed) Base URL: https://gerrit.chromium.org/gerrit/p/native_client/pnacl-subzero.git@master

Patch Set: Fix an empty line that was deleted Created 6 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/IceTargetLoweringX8632.cpp

diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp

index 00db25a5a29823b3411b6b030f5fd174184aeb55..cc6f2226a96984d3eb4e61c04dc33fa32b780a93 100644

--- a/src/IceTargetLoweringX8632.cpp

+++ b/src/IceTargetLoweringX8632.cpp

@@ -22,6 +22,7 @@

#include "IceOperand.h"

#include "IceTargetLoweringX8632.def"

#include "IceTargetLoweringX8632.h"

+#include "llvm/Support/CommandLine.h"

namespace Ice {

@@ -123,6 +124,17 @@ const unsigned X86_MAX_XMM_ARGS = 4;

// The number of bits in a byte

const unsigned X86_CHAR_BIT = 8;

+// Instruction set options

+namespace cl = ::llvm::cl;

+cl::opt<TargetX8632::X86InstructionSet> CLInstructionSet(

+ "mattr", cl::desc("X86 target attributes"),

+ cl::init(TargetX8632::SSE2),

+ cl::values(

+ clEnumValN(TargetX8632::SSE2, "sse2",

+ "Enable SSE2 instructions (default)"),

+ clEnumValN(TargetX8632::SSE4_1, "sse4.1",

+ "Enable SSE 4.1 instructions"), clEnumValEnd));

// Return a string representation of the type that is suitable for use

// in an identifier.

IceString typeIdentString(const Type Ty) {

@@ -234,8 +246,9 @@ void __attribute__((unused)) xMacroIntegrityCheck() {

} // end of anonymous namespace

TargetX8632::TargetX8632(Cfg *Func)

- : TargetLowering(Func), IsEbpBasedFrame(false), FrameSizeLocals(0),

- LocalsSizeBytes(0), NextLabelNumber(0), ComputedLiveRanges(false),

+ : TargetLowering(Func), InstructionSet(CLInstructionSet),

+ IsEbpBasedFrame(false), FrameSizeLocals(0), LocalsSizeBytes(0),

+ NextLabelNumber(0), ComputedLiveRanges(false),

PhysicalRegisters(VarList(Reg_NUM)) {

// TODO: Don't initialize IntegerRegisters and friends every time.

// Instead, initialize in some sort of static initializer for the

@@ -1228,7 +1241,16 @@ void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) {

_movp(Dest, T);

} break;

case InstArithmetic::Mul: {

- if (Dest->getType() == IceType_v4i32) {

+ bool TypesAreValidForPmull =

+ Dest->getType() == IceType_v4i32 || Dest->getType() == IceType_v8i16;

+ bool InstructionSetIsValidForPmull =

+ Dest->getType() == IceType_v8i16 || InstructionSet >= SSE4_1;

+ if (TypesAreValidForPmull && InstructionSetIsValidForPmull) {

+ Variable *T = makeReg(Dest->getType());

+ _movp(T, Src0);

+ _pmull(T, legalizeToVar(Src1));

+ _movp(Dest, T);

+ } else if (Dest->getType() == IceType_v4i32) {

// Lowering sequence:

// Note: The mask arguments have index 0 on the left.

@@ -1243,8 +1265,6 @@ void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) {

// shufps T1, T2, {0,2,0,2}

// pshufd T4, T1, {0,2,1,3}

// movups Dest, T4

- //

- // TODO(wala): SSE4.1 has pmulld.

// Mask that directs pshufd to create a vector with entries

// Src[1, 0, 3, 0]

@@ -1273,11 +1293,6 @@ void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) {

_shufps(T1, T2, Ctx->getConstantInt(IceType_i8, Mask0202));

_pshufd(T4, T1, Ctx->getConstantInt(IceType_i8, Mask0213));

_movp(Dest, T4);

- } else if (Dest->getType() == IceType_v8i16) {

- Variable *T = makeReg(IceType_v8i16);

- _movp(T, Src0);

- _pmullw(T, legalizeToVar(Src1));

- _movp(Dest, T);

} else {

assert(Dest->getType() == IceType_v16i8);

// Sz_mul_v16i8

@@ -2155,10 +2170,15 @@ void TargetX8632::lowerExtractElement(const InstExtractElement *Inst) {

Variable *ExtractedElement = makeReg(InVectorElementTy);

// TODO(wala): Determine the best lowering sequences for each type.

- if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {

- // Lower extractelement operations where the element is 32 bits

- // wide with pshufd.

- // TODO(wala): SSE4.1 has extractps and pextrd

+ bool CanUsePextr =

+ Ty == IceType_v8i16 || Ty == IceType_v8i1 || InstructionSet >= SSE4_1;

+ if (CanUsePextr && Ty != IceType_v4f32) {

+ // Use pextrb, pextrw, or pextrd.

+ Constant *Mask = Ctx->getConstantInt(IceType_i8, Index);

+ Variable *SourceVectR = legalizeToVar(SourceVectOperand);

+ _pextr(ExtractedElement, SourceVectR, Mask);

+ } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {

+ // Use pshufd and movd/movss.

// ALIGNHACK: Force vector operands to registers in instructions that

// require aligned memory operands until support for stack alignment

@@ -2187,13 +2207,9 @@ void TargetX8632::lowerExtractElement(const InstExtractElement *Inst) {

_movss(ExtractedElement, T);

}

#undef ALIGN_HACK

- } else if (Ty == IceType_v8i16 || Ty == IceType_v8i1) {

- Constant *Mask = Ctx->getConstantInt(IceType_i8, Index);

- _pextrw(ExtractedElement, legalizeToVar(SourceVectOperand), Mask);

} else {

assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);

// Spill the value to a stack slot and do the extraction in memory.

- // TODO(wala): SSE4.1 has pextrb.

// TODO(wala): use legalize(SourceVectOperand, Legal_Mem) when

// support for legalizing to mem is implemented.

@@ -2539,10 +2555,18 @@ void TargetX8632::lowerInsertElement(const InstInsertElement *Inst) {

ElementToInsert = Expanded;

}

- if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {

- // Lower insertelement with 32-bit wide elements using shufps or

- // movss.

- // TODO(wala): SSE4.1 has pinsrd and insertps.

+ if (Ty == IceType_v8i16 || Ty == IceType_v8i1 || InstructionSet >= SSE4_1) {

+ // Use insertps, pinsrb, pinsrw, or pinsrd.

+ Operand *Element = legalize(ElementToInsert, Legal_Mem | Legal_Reg);

+ Variable *T = makeReg(Ty);

+ _movp(T, SourceVectOperand);

+ if (Ty == IceType_v4f32)

+ _insertps(T, Element, Ctx->getConstantInt(IceType_i8, Index << 4));

+ else

+ _pinsr(T, Element, Ctx->getConstantInt(IceType_i8, Index));

+ _movp(Inst->getDest(), T);

+ } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {

+ // Use shufps or movss.

Variable *Element = NULL;

if (InVectorElementTy == IceType_f32) {

// Element will be in an XMM register since it is floating point.

@@ -2607,17 +2631,10 @@ void TargetX8632::lowerInsertElement(const InstInsertElement *Inst) {

_movp(Inst->getDest(), T);

}

#undef ALIGN_HACK

- } else if (Ty == IceType_v8i16 || Ty == IceType_v8i1) {

- Operand *Element = legalize(ElementToInsert, Legal_Mem | Legal_Reg);

- Variable *T = makeReg(Ty);

- _movp(T, SourceVectOperand);

- _pinsrw(T, Element, Ctx->getConstantInt(IceType_i8, Index));

- _movp(Inst->getDest(), T);

} else {

assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);

// Spill the value to a stack slot and perform the insertion in

// memory.

- // TODO(wala): SSE4.1 has pinsrb.

// TODO(wala): use legalize(SourceVectOperand, Legal_Mem) when

// support for legalizing to mem is implemented.

@@ -3551,11 +3568,42 @@ void TargetX8632::lowerSelect(const InstSelect *Inst) {

Operand *Condition = Inst->getCondition();

if (isVectorType(Dest->getType())) {

- // a=d?b:c ==> d=sext(d); a=(b&d)|(c&~d)

- // TODO(wala): SSE4.1 has blendvps and pblendvb. SSE4.1 also has

- // blendps and pblendw for constant condition operands.

Type SrcTy = SrcT->getType();

Variable *T = makeReg(SrcTy);

+ // ALIGNHACK: Until stack alignment support is implemented, vector

+ // instructions need to have vector operands in registers. Once

+ // there is support for stack alignment, LEGAL_HACK can be removed.

+#define LEGAL_HACK(Vect) legalizeToVar((Vect))

+ if (InstructionSet >= SSE4_1) {

+ // TODO(wala): If the condition operand is a constant, use blendps

+ // or pblendw.

+ //

+ // Use blendvps or pblendvb to implement select.

+ if (SrcTy == IceType_v4i1 || SrcTy == IceType_v4i32 ||

+ SrcTy == IceType_v4f32) {

+ Variable *xmm0 = makeReg(IceType_v4i32, Reg_xmm0);

+ _movp(xmm0, Condition);

+ _psll(xmm0, Ctx->getConstantInt(IceType_i8, 31));

+ _movp(T, SrcF);

+ _blendvps(T, LEGAL_HACK(SrcT), xmm0);

+ _movp(Dest, T);

+ } else {

+ assert(typeNumElements(SrcTy) == 8 || typeNumElements(SrcTy) == 16);

+ Type SignExtTy = Condition->getType() == IceType_v8i1 ? IceType_v8i16

+ : IceType_v16i8;

+ Variable *xmm0 = makeReg(SignExtTy, Reg_xmm0);

+ lowerCast(InstCast::create(Func, InstCast::Sext, xmm0, Condition));

+ _movp(T, SrcF);

+ _pblendvb(T, LEGAL_HACK(SrcT), xmm0);

+ _movp(Dest, T);

+ }

+ return;

+ }

+ // Lower select without SSE4.1:

+ // a=d?b:c ==>

+ // if elementtype(d) != i1:

+ // d=sext(d);

+ // a=(b&d)|(c&~d);

Variable *T2 = makeReg(SrcTy);

// Sign extend the condition operand if applicable.

if (SrcTy == IceType_v4f32) {

@@ -3568,11 +3616,6 @@ void TargetX8632::lowerSelect(const InstSelect *Inst) {

} else {

_movp(T, Condition);

}

- // ALIGNHACK: Until stack alignment support is implemented, the

- // bitwise vector instructions need to have both operands in

- // registers. Once there is support for stack alignment, LEGAL_HACK

- // can be removed.

-#define LEGAL_HACK(Vect) legalizeToVar((Vect))

_movp(T2, T);

_pand(T, LEGAL_HACK(SrcT));

_pandn(T2, LEGAL_HACK(SrcF));

« no previous file with comments | « src/IceTargetLoweringX8632.h ('k') | tests_lit/llvm2ice_tests/vector-arith.ll » ('j') | no next file with comments »