OLD | NEW |
1 //===- subzero/src/IceTargetLoweringX8632.cpp - x86-32 lowering -----------===// | 1 //===- subzero/src/IceTargetLoweringX8632.cpp - x86-32 lowering -----------===// |
2 // | 2 // |
3 // The Subzero Code Generator | 3 // The Subzero Code Generator |
4 // | 4 // |
5 // This file is distributed under the University of Illinois Open Source | 5 // This file is distributed under the University of Illinois Open Source |
6 // License. See LICENSE.TXT for details. | 6 // License. See LICENSE.TXT for details. |
7 // | 7 // |
8 //===----------------------------------------------------------------------===// | 8 //===----------------------------------------------------------------------===// |
9 // | 9 // |
10 // This file implements the TargetLoweringX8632 class, which | 10 // This file implements the TargetLoweringX8632 class, which |
11 // consists almost entirely of the lowering sequence for each | 11 // consists almost entirely of the lowering sequence for each |
12 // high-level instruction. It also implements | 12 // high-level instruction. It also implements |
13 // TargetX8632Fast::postLower() which does the simplest possible | 13 // TargetX8632Fast::postLower() which does the simplest possible |
14 // register allocation for the "fast" target. | 14 // register allocation for the "fast" target. |
15 // | 15 // |
16 //===----------------------------------------------------------------------===// | 16 //===----------------------------------------------------------------------===// |
17 | 17 |
18 #include "IceDefs.h" | 18 #include "IceDefs.h" |
19 #include "IceCfg.h" | 19 #include "IceCfg.h" |
20 #include "IceCfgNode.h" | 20 #include "IceCfgNode.h" |
21 #include "IceInstX8632.h" | 21 #include "IceInstX8632.h" |
22 #include "IceOperand.h" | 22 #include "IceOperand.h" |
23 #include "IceTargetLoweringX8632.def" | 23 #include "IceTargetLoweringX8632.def" |
24 #include "IceTargetLoweringX8632.h" | 24 #include "IceTargetLoweringX8632.h" |
| 25 #include "llvm/Support/CommandLine.h" |
25 | 26 |
26 namespace Ice { | 27 namespace Ice { |
27 | 28 |
28 namespace { | 29 namespace { |
29 | 30 |
30 // The following table summarizes the logic for lowering the fcmp | 31 // The following table summarizes the logic for lowering the fcmp |
31 // instruction. There is one table entry for each of the 16 conditions. | 32 // instruction. There is one table entry for each of the 16 conditions. |
32 // | 33 // |
33 // The first four columns describe the case when the operands are | 34 // The first four columns describe the case when the operands are |
34 // floating point scalar values. A comment in lowerFcmp() describes the | 35 // floating point scalar values. A comment in lowerFcmp() describes the |
(...skipping 81 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
116 size_t Index = static_cast<size_t>(Ty); | 117 size_t Index = static_cast<size_t>(Ty); |
117 assert(Index < TableTypeX8632AttributesSize); | 118 assert(Index < TableTypeX8632AttributesSize); |
118 return TableTypeX8632Attributes[Ty].InVectorElementType; | 119 return TableTypeX8632Attributes[Ty].InVectorElementType; |
119 } | 120 } |
120 | 121 |
121 // The maximum number of arguments to pass in XMM registers | 122 // The maximum number of arguments to pass in XMM registers |
122 const unsigned X86_MAX_XMM_ARGS = 4; | 123 const unsigned X86_MAX_XMM_ARGS = 4; |
123 // The number of bits in a byte | 124 // The number of bits in a byte |
124 const unsigned X86_CHAR_BIT = 8; | 125 const unsigned X86_CHAR_BIT = 8; |
125 | 126 |
| 127 // Instruction set options |
| 128 namespace cl = ::llvm::cl; |
| 129 cl::opt<TargetX8632::X86InstructionSet> CLInstructionSet( |
| 130 "mattr", cl::desc("X86 target attributes"), |
| 131 cl::init(TargetX8632::SSE2), |
| 132 cl::values( |
| 133 clEnumValN(TargetX8632::SSE2, "sse2", |
| 134 "Enable SSE2 instructions (default)"), |
| 135 clEnumValN(TargetX8632::SSE4_1, "sse4.1", |
| 136 "Enable SSE 4.1 instructions"), clEnumValEnd)); |
| 137 |
126 // Return a string representation of the type that is suitable for use | 138 // Return a string representation of the type that is suitable for use |
127 // in an identifier. | 139 // in an identifier. |
128 IceString typeIdentString(const Type Ty) { | 140 IceString typeIdentString(const Type Ty) { |
129 IceString Str; | 141 IceString Str; |
130 llvm::raw_string_ostream BaseOS(Str); | 142 llvm::raw_string_ostream BaseOS(Str); |
131 if (isVectorType(Ty)) { | 143 if (isVectorType(Ty)) { |
132 BaseOS << "v" << typeNumElements(Ty) << typeElementType(Ty); | 144 BaseOS << "v" << typeNumElements(Ty) << typeElementType(Ty); |
133 } else { | 145 } else { |
134 BaseOS << Ty; | 146 BaseOS << Ty; |
135 } | 147 } |
(...skipping 91 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
227 #define X(tag, size, align, elts, elty, str) \ | 239 #define X(tag, size, align, elts, elty, str) \ |
228 STATIC_ASSERT(_table1_##tag == _table2_##tag); | 240 STATIC_ASSERT(_table1_##tag == _table2_##tag); |
229 ICETYPE_TABLE; | 241 ICETYPE_TABLE; |
230 #undef X | 242 #undef X |
231 } | 243 } |
232 } | 244 } |
233 | 245 |
234 } // end of anonymous namespace | 246 } // end of anonymous namespace |
235 | 247 |
236 TargetX8632::TargetX8632(Cfg *Func) | 248 TargetX8632::TargetX8632(Cfg *Func) |
237 : TargetLowering(Func), IsEbpBasedFrame(false), FrameSizeLocals(0), | 249 : TargetLowering(Func), InstructionSet(CLInstructionSet), |
238 LocalsSizeBytes(0), NextLabelNumber(0), ComputedLiveRanges(false), | 250 IsEbpBasedFrame(false), FrameSizeLocals(0), LocalsSizeBytes(0), |
| 251 NextLabelNumber(0), ComputedLiveRanges(false), |
239 PhysicalRegisters(VarList(Reg_NUM)) { | 252 PhysicalRegisters(VarList(Reg_NUM)) { |
240 // TODO: Don't initialize IntegerRegisters and friends every time. | 253 // TODO: Don't initialize IntegerRegisters and friends every time. |
241 // Instead, initialize in some sort of static initializer for the | 254 // Instead, initialize in some sort of static initializer for the |
242 // class. | 255 // class. |
243 llvm::SmallBitVector IntegerRegisters(Reg_NUM); | 256 llvm::SmallBitVector IntegerRegisters(Reg_NUM); |
244 llvm::SmallBitVector IntegerRegistersI8(Reg_NUM); | 257 llvm::SmallBitVector IntegerRegistersI8(Reg_NUM); |
245 llvm::SmallBitVector FloatRegisters(Reg_NUM); | 258 llvm::SmallBitVector FloatRegisters(Reg_NUM); |
246 llvm::SmallBitVector VectorRegisters(Reg_NUM); | 259 llvm::SmallBitVector VectorRegisters(Reg_NUM); |
247 llvm::SmallBitVector InvalidRegisters(Reg_NUM); | 260 llvm::SmallBitVector InvalidRegisters(Reg_NUM); |
248 ScratchRegs.resize(Reg_NUM); | 261 ScratchRegs.resize(Reg_NUM); |
(...skipping 972 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1221 _pxor(T, LEGAL_HACK(Src1)); | 1234 _pxor(T, LEGAL_HACK(Src1)); |
1222 _movp(Dest, T); | 1235 _movp(Dest, T); |
1223 } break; | 1236 } break; |
1224 case InstArithmetic::Sub: { | 1237 case InstArithmetic::Sub: { |
1225 Variable *T = makeReg(Dest->getType()); | 1238 Variable *T = makeReg(Dest->getType()); |
1226 _movp(T, Src0); | 1239 _movp(T, Src0); |
1227 _psub(T, LEGAL_HACK(Src1)); | 1240 _psub(T, LEGAL_HACK(Src1)); |
1228 _movp(Dest, T); | 1241 _movp(Dest, T); |
1229 } break; | 1242 } break; |
1230 case InstArithmetic::Mul: { | 1243 case InstArithmetic::Mul: { |
1231 if (Dest->getType() == IceType_v4i32) { | 1244 if (Dest->getType() == IceType_v8i16 || |
| 1245 (InstructionSet >= SSE4_1 && Dest->getType() == IceType_v4i32)) { |
| 1246 Variable *T = makeReg(Dest->getType()); |
| 1247 _movp(T, Src0); |
| 1248 _pmull(T, legalizeToVar(Src1)); |
| 1249 _movp(Dest, T); |
| 1250 } else if (Dest->getType() == IceType_v4i32) { |
1232 // Lowering sequence: | 1251 // Lowering sequence: |
1233 // Note: The mask arguments have index 0 on the left. | 1252 // Note: The mask arguments have index 0 on the left. |
1234 // | 1253 // |
1235 // movups T1, Src0 | 1254 // movups T1, Src0 |
1236 // pshufd T2, Src0, {1,0,3,0} | 1255 // pshufd T2, Src0, {1,0,3,0} |
1237 // pshufd T3, Src1, {1,0,3,0} | 1256 // pshufd T3, Src1, {1,0,3,0} |
1238 // # T1 = {Src0[0] * Src1[0], Src0[2] * Src1[2]} | 1257 // # T1 = {Src0[0] * Src1[0], Src0[2] * Src1[2]} |
1239 // pmuludq T1, Src1 | 1258 // pmuludq T1, Src1 |
1240 // # T2 = {Src0[1] * Src1[1], Src0[3] * Src1[3]} | 1259 // # T2 = {Src0[1] * Src1[1], Src0[3] * Src1[3]} |
1241 // pmuludq T2, T3 | 1260 // pmuludq T2, T3 |
1242 // # T1 = {lo(T1[0]), lo(T1[2]), lo(T2[0]), lo(T2[2])} | 1261 // # T1 = {lo(T1[0]), lo(T1[2]), lo(T2[0]), lo(T2[2])} |
1243 // shufps T1, T2, {0,2,0,2} | 1262 // shufps T1, T2, {0,2,0,2} |
1244 // pshufd T4, T1, {0,2,1,3} | 1263 // pshufd T4, T1, {0,2,1,3} |
1245 // movups Dest, T4 | 1264 // movups Dest, T4 |
1246 // | |
1247 // TODO(wala): SSE4.1 has pmulld. | |
1248 | 1265 |
1249 // Mask that directs pshufd to create a vector with entries | 1266 // Mask that directs pshufd to create a vector with entries |
1250 // Src[1, 0, 3, 0] | 1267 // Src[1, 0, 3, 0] |
1251 const unsigned Constant1030 = 0x31; | 1268 const unsigned Constant1030 = 0x31; |
1252 Constant *Mask1030 = Ctx->getConstantInt(IceType_i8, Constant1030); | 1269 Constant *Mask1030 = Ctx->getConstantInt(IceType_i8, Constant1030); |
1253 // Mask that directs shufps to create a vector with entries | 1270 // Mask that directs shufps to create a vector with entries |
1254 // Dest[0, 2], Src[0, 2] | 1271 // Dest[0, 2], Src[0, 2] |
1255 const unsigned Mask0202 = 0x88; | 1272 const unsigned Mask0202 = 0x88; |
1256 // Mask that directs pshufd to create a vector with entries | 1273 // Mask that directs pshufd to create a vector with entries |
1257 // Src[0, 2, 1, 3] | 1274 // Src[0, 2, 1, 3] |
1258 const unsigned Mask0213 = 0xd8; | 1275 const unsigned Mask0213 = 0xd8; |
1259 Variable *T1 = makeReg(IceType_v4i32); | 1276 Variable *T1 = makeReg(IceType_v4i32); |
1260 Variable *T2 = makeReg(IceType_v4i32); | 1277 Variable *T2 = makeReg(IceType_v4i32); |
1261 Variable *T3 = makeReg(IceType_v4i32); | 1278 Variable *T3 = makeReg(IceType_v4i32); |
1262 Variable *T4 = makeReg(IceType_v4i32); | 1279 Variable *T4 = makeReg(IceType_v4i32); |
1263 _movp(T1, Src0); | 1280 _movp(T1, Src0); |
1264 // TODO(wala): ALIGNHACK: Replace Src0R with Src0 and Src1R | 1281 // TODO(wala): ALIGNHACK: Replace Src0R with Src0 and Src1R |
1265 // with Src1 after stack operand alignment support is | 1282 // with Src1 after stack operand alignment support is |
1266 // implemented. | 1283 // implemented. |
1267 Variable *Src0R = LEGAL_HACK(Src0); | 1284 Variable *Src0R = LEGAL_HACK(Src0); |
1268 Variable *Src1R = LEGAL_HACK(Src1); | 1285 Variable *Src1R = LEGAL_HACK(Src1); |
1269 _pshufd(T2, Src0R, Mask1030); | 1286 _pshufd(T2, Src0R, Mask1030); |
1270 _pshufd(T3, Src1R, Mask1030); | 1287 _pshufd(T3, Src1R, Mask1030); |
1271 _pmuludq(T1, Src1R); | 1288 _pmuludq(T1, Src1R); |
1272 _pmuludq(T2, T3); | 1289 _pmuludq(T2, T3); |
1273 _shufps(T1, T2, Ctx->getConstantInt(IceType_i8, Mask0202)); | 1290 _shufps(T1, T2, Ctx->getConstantInt(IceType_i8, Mask0202)); |
1274 _pshufd(T4, T1, Ctx->getConstantInt(IceType_i8, Mask0213)); | 1291 _pshufd(T4, T1, Ctx->getConstantInt(IceType_i8, Mask0213)); |
1275 _movp(Dest, T4); | 1292 _movp(Dest, T4); |
1276 } else if (Dest->getType() == IceType_v8i16) { | |
1277 Variable *T = makeReg(IceType_v8i16); | |
1278 _movp(T, Src0); | |
1279 _pmullw(T, legalizeToVar(Src1)); | |
1280 _movp(Dest, T); | |
1281 } else { | 1293 } else { |
1282 assert(Dest->getType() == IceType_v16i8); | 1294 assert(Dest->getType() == IceType_v16i8); |
1283 // Sz_mul_v16i8 | 1295 // Sz_mul_v16i8 |
1284 const IceString Helper = "Sz_mul_v16i8"; | 1296 const IceString Helper = "Sz_mul_v16i8"; |
1285 const SizeT MaxSrcs = 2; | 1297 const SizeT MaxSrcs = 2; |
1286 InstCall *Call = makeHelperCall(Helper, Dest, MaxSrcs); | 1298 InstCall *Call = makeHelperCall(Helper, Dest, MaxSrcs); |
1287 Call->addArg(Src0); | 1299 Call->addArg(Src0); |
1288 Call->addArg(Src1); | 1300 Call->addArg(Src1); |
1289 lowerCall(Call); | 1301 lowerCall(Call); |
1290 } | 1302 } |
(...skipping 857 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2148 // Only constant indices are allowed in PNaCl IR. | 2160 // Only constant indices are allowed in PNaCl IR. |
2149 assert(ElementIndex); | 2161 assert(ElementIndex); |
2150 | 2162 |
2151 unsigned Index = ElementIndex->getValue(); | 2163 unsigned Index = ElementIndex->getValue(); |
2152 Type Ty = SourceVectOperand->getType(); | 2164 Type Ty = SourceVectOperand->getType(); |
2153 Type ElementTy = typeElementType(Ty); | 2165 Type ElementTy = typeElementType(Ty); |
2154 Type InVectorElementTy = getInVectorElementType(Ty); | 2166 Type InVectorElementTy = getInVectorElementType(Ty); |
2155 Variable *ExtractedElement = makeReg(InVectorElementTy); | 2167 Variable *ExtractedElement = makeReg(InVectorElementTy); |
2156 | 2168 |
2157 // TODO(wala): Determine the best lowering sequences for each type. | 2169 // TODO(wala): Determine the best lowering sequences for each type. |
2158 if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) { | 2170 if (Ty == IceType_v8i16 || Ty == IceType_v8i1 || |
2159 // Lower extractelement operations where the element is 32 bits | 2171 (InstructionSet >= SSE4_1 && Ty != IceType_v4f32)) { |
2160 // wide with pshufd. | 2172 // Use pextrb, pextrw, or pextrd. |
2161 // TODO(wala): SSE4.1 has extractps and pextrd | 2173 Constant *Mask = Ctx->getConstantInt(IceType_i8, Index); |
| 2174 Variable *SourceVectR = legalizeToVar(SourceVectOperand); |
| 2175 _pextr(ExtractedElement, SourceVectR, Mask); |
| 2176 } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) { |
| 2177 // Use pshufd and movd/movss. |
2162 // | 2178 // |
2163 // ALIGNHACK: Force vector operands to registers in instructions that | 2179 // ALIGNHACK: Force vector operands to registers in instructions that |
2164 // require aligned memory operands until support for stack alignment | 2180 // require aligned memory operands until support for stack alignment |
2165 // is implemented. | 2181 // is implemented. |
2166 #define ALIGN_HACK(Vect) legalizeToVar((Vect)) | 2182 #define ALIGN_HACK(Vect) legalizeToVar((Vect)) |
2167 Variable *T = NULL; | 2183 Variable *T = NULL; |
2168 if (Index) { | 2184 if (Index) { |
2169 // The shuffle only needs to occur if the element to be extracted | 2185 // The shuffle only needs to occur if the element to be extracted |
2170 // is not at the lowest index. | 2186 // is not at the lowest index. |
2171 Constant *Mask = Ctx->getConstantInt(IceType_i8, Index); | 2187 Constant *Mask = Ctx->getConstantInt(IceType_i8, Index); |
2172 T = makeReg(Ty); | 2188 T = makeReg(Ty); |
2173 _pshufd(T, ALIGN_HACK(SourceVectOperand), Mask); | 2189 _pshufd(T, ALIGN_HACK(SourceVectOperand), Mask); |
2174 } else { | 2190 } else { |
2175 T = legalizeToVar(SourceVectOperand); | 2191 T = legalizeToVar(SourceVectOperand); |
2176 } | 2192 } |
2177 | 2193 |
2178 if (InVectorElementTy == IceType_i32) { | 2194 if (InVectorElementTy == IceType_i32) { |
2179 _movd(ExtractedElement, T); | 2195 _movd(ExtractedElement, T); |
2180 } else { // Ty == IceType_f32 | 2196 } else { // Ty == IceType_f32 |
2181 // TODO(wala): _movss is only used here because _mov does not | 2197 // TODO(wala): _movss is only used here because _mov does not |
2182 // allow a vector source and a scalar destination. _mov should be | 2198 // allow a vector source and a scalar destination. _mov should be |
2183 // able to be used here. | 2199 // able to be used here. |
2184 // _movss is a binary instruction, so the FakeDef is needed to | 2200 // _movss is a binary instruction, so the FakeDef is needed to |
2185 // keep the live range analysis consistent. | 2201 // keep the live range analysis consistent. |
2186 Context.insert(InstFakeDef::create(Func, ExtractedElement)); | 2202 Context.insert(InstFakeDef::create(Func, ExtractedElement)); |
2187 _movss(ExtractedElement, T); | 2203 _movss(ExtractedElement, T); |
2188 } | 2204 } |
2189 #undef ALIGN_HACK | 2205 #undef ALIGN_HACK |
2190 } else if (Ty == IceType_v8i16 || Ty == IceType_v8i1) { | |
2191 Constant *Mask = Ctx->getConstantInt(IceType_i8, Index); | |
2192 _pextrw(ExtractedElement, legalizeToVar(SourceVectOperand), Mask); | |
2193 } else { | 2206 } else { |
2194 assert(Ty == IceType_v16i8 || Ty == IceType_v16i1); | 2207 assert(Ty == IceType_v16i8 || Ty == IceType_v16i1); |
2195 // Spill the value to a stack slot and do the extraction in memory. | 2208 // Spill the value to a stack slot and do the extraction in memory. |
2196 // TODO(wala): SSE4.1 has pextrb. | |
2197 // | 2209 // |
2198 // TODO(wala): use legalize(SourceVectOperand, Legal_Mem) when | 2210 // TODO(wala): use legalize(SourceVectOperand, Legal_Mem) when |
2199 // support for legalizing to mem is implemented. | 2211 // support for legalizing to mem is implemented. |
2200 Variable *Slot = Func->makeVariable(Ty, Context.getNode()); | 2212 Variable *Slot = Func->makeVariable(Ty, Context.getNode()); |
2201 Slot->setWeight(RegWeight::Zero); | 2213 Slot->setWeight(RegWeight::Zero); |
2202 _movp(Slot, legalizeToVar(SourceVectOperand)); | 2214 _movp(Slot, legalizeToVar(SourceVectOperand)); |
2203 | 2215 |
2204 // Compute the location of the element in memory. | 2216 // Compute the location of the element in memory. |
2205 unsigned Offset = Index * typeWidthInBytes(InVectorElementTy); | 2217 unsigned Offset = Index * typeWidthInBytes(InVectorElementTy); |
2206 OperandX8632Mem *Loc = | 2218 OperandX8632Mem *Loc = |
(...skipping 325 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2532 // Expand the element to the appropriate size for it to be inserted | 2544 // Expand the element to the appropriate size for it to be inserted |
2533 // in the vector. | 2545 // in the vector. |
2534 Variable *Expanded = | 2546 Variable *Expanded = |
2535 Func->makeVariable(InVectorElementTy, Context.getNode()); | 2547 Func->makeVariable(InVectorElementTy, Context.getNode()); |
2536 InstCast *Cast = | 2548 InstCast *Cast = |
2537 InstCast::create(Func, InstCast::Zext, Expanded, ElementToInsert); | 2549 InstCast::create(Func, InstCast::Zext, Expanded, ElementToInsert); |
2538 lowerCast(Cast); | 2550 lowerCast(Cast); |
2539 ElementToInsert = Expanded; | 2551 ElementToInsert = Expanded; |
2540 } | 2552 } |
2541 | 2553 |
2542 if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) { | 2554 if (Ty == IceType_v8i16 || Ty == IceType_v8i1 || InstructionSet >= SSE4_1) { |
2543 // Lower insertelement with 32-bit wide elements using shufps or | 2555 // Use insertps, pinsrb, pinsrw, or pinsrd. |
2544 // movss. | 2556 Operand *Element = legalize(ElementToInsert, Legal_Mem | Legal_Reg); |
2545 // TODO(wala): SSE4.1 has pinsrd and insertps. | 2557 Variable *T = makeReg(Ty); |
| 2558 _movp(T, SourceVectOperand); |
| 2559 if (Ty == IceType_v4f32) |
| 2560 _insertps(T, Element, Ctx->getConstantInt(IceType_i8, Index << 4)); |
| 2561 else |
| 2562 _pinsr(T, Element, Ctx->getConstantInt(IceType_i8, Index)); |
| 2563 _movp(Inst->getDest(), T); |
| 2564 } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) { |
| 2565 // Use shufps or movss. |
2546 Variable *Element = NULL; | 2566 Variable *Element = NULL; |
2547 if (InVectorElementTy == IceType_f32) { | 2567 if (InVectorElementTy == IceType_f32) { |
2548 // Element will be in an XMM register since it is floating point. | 2568 // Element will be in an XMM register since it is floating point. |
2549 Element = legalizeToVar(ElementToInsert); | 2569 Element = legalizeToVar(ElementToInsert); |
2550 } else { | 2570 } else { |
2551 // Copy an integer to an XMM register. | 2571 // Copy an integer to an XMM register. |
2552 Operand *T = legalize(ElementToInsert, Legal_Reg | Legal_Mem); | 2572 Operand *T = legalize(ElementToInsert, Legal_Reg | Legal_Mem); |
2553 Element = makeReg(Ty); | 2573 Element = makeReg(Ty); |
2554 _movd(Element, T); | 2574 _movd(Element, T); |
2555 } | 2575 } |
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2600 _shufps(Element, SourceVectOperand, Mask2Constant); | 2620 _shufps(Element, SourceVectOperand, Mask2Constant); |
2601 _movp(Inst->getDest(), Element); | 2621 _movp(Inst->getDest(), Element); |
2602 } else { | 2622 } else { |
2603 Variable *T = makeReg(Ty); | 2623 Variable *T = makeReg(Ty); |
2604 _movp(T, SourceVectOperand); | 2624 _movp(T, SourceVectOperand); |
2605 _shufps(Element, T, Mask1Constant); | 2625 _shufps(Element, T, Mask1Constant); |
2606 _shufps(T, Element, Mask2Constant); | 2626 _shufps(T, Element, Mask2Constant); |
2607 _movp(Inst->getDest(), T); | 2627 _movp(Inst->getDest(), T); |
2608 } | 2628 } |
2609 #undef ALIGN_HACK | 2629 #undef ALIGN_HACK |
2610 } else if (Ty == IceType_v8i16 || Ty == IceType_v8i1) { | |
2611 Operand *Element = legalize(ElementToInsert, Legal_Mem | Legal_Reg); | |
2612 Variable *T = makeReg(Ty); | |
2613 _movp(T, SourceVectOperand); | |
2614 _pinsrw(T, Element, Ctx->getConstantInt(IceType_i8, Index)); | |
2615 _movp(Inst->getDest(), T); | |
2616 } else { | 2630 } else { |
2617 assert(Ty == IceType_v16i8 || Ty == IceType_v16i1); | 2631 assert(Ty == IceType_v16i8 || Ty == IceType_v16i1); |
2618 // Spill the value to a stack slot and perform the insertion in | 2632 // Spill the value to a stack slot and perform the insertion in |
2619 // memory. | 2633 // memory. |
2620 // TODO(wala): SSE4.1 has pinsrb. | |
2621 // | 2634 // |
2622 // TODO(wala): use legalize(SourceVectOperand, Legal_Mem) when | 2635 // TODO(wala): use legalize(SourceVectOperand, Legal_Mem) when |
2623 // support for legalizing to mem is implemented. | 2636 // support for legalizing to mem is implemented. |
2624 Variable *Slot = Func->makeVariable(Ty, Context.getNode()); | 2637 Variable *Slot = Func->makeVariable(Ty, Context.getNode()); |
2625 Slot->setWeight(RegWeight::Zero); | 2638 Slot->setWeight(RegWeight::Zero); |
2626 _movp(Slot, legalizeToVar(SourceVectOperand)); | 2639 _movp(Slot, legalizeToVar(SourceVectOperand)); |
2627 | 2640 |
2628 // Compute the location of the position to insert in memory. | 2641 // Compute the location of the position to insert in memory. |
2629 unsigned Offset = Index * typeWidthInBytes(InVectorElementTy); | 2642 unsigned Offset = Index * typeWidthInBytes(InVectorElementTy); |
2630 OperandX8632Mem *Loc = | 2643 OperandX8632Mem *Loc = |
(...skipping 840 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3471 Context.insert(InstFakeUse::create(Func, esp)); | 3484 Context.insert(InstFakeUse::create(Func, esp)); |
3472 } | 3485 } |
3473 | 3486 |
3474 void TargetX8632::lowerSelect(const InstSelect *Inst) { | 3487 void TargetX8632::lowerSelect(const InstSelect *Inst) { |
3475 Variable *Dest = Inst->getDest(); | 3488 Variable *Dest = Inst->getDest(); |
3476 Operand *SrcT = Inst->getTrueOperand(); | 3489 Operand *SrcT = Inst->getTrueOperand(); |
3477 Operand *SrcF = Inst->getFalseOperand(); | 3490 Operand *SrcF = Inst->getFalseOperand(); |
3478 Operand *Condition = Inst->getCondition(); | 3491 Operand *Condition = Inst->getCondition(); |
3479 | 3492 |
3480 if (isVectorType(Dest->getType())) { | 3493 if (isVectorType(Dest->getType())) { |
3481 // a=d?b:c ==> d=sext(d); a=(b&d)|(c&~d) | |
3482 // TODO(wala): SSE4.1 has blendvps and pblendvb. SSE4.1 also has | |
3483 // blendps and pblendw for constant condition operands. | |
3484 Type SrcTy = SrcT->getType(); | 3494 Type SrcTy = SrcT->getType(); |
3485 Variable *T = makeReg(SrcTy); | 3495 Variable *T = makeReg(SrcTy); |
| 3496 // ALIGNHACK: Until stack alignment support is implemented, vector |
| 3497 // instructions need to have vector operands in registers. Once |
| 3498 // there is support for stack alignment, LEGAL_HACK can be removed. |
| 3499 #define LEGAL_HACK(Vect) legalizeToVar((Vect)) |
| 3500 if (InstructionSet >= SSE4_1) { |
| 3501 // TODO(wala): If the condition operand is a constant, use blendps |
| 3502 // or pblendw. |
| 3503 // |
| 3504 // Use blendvps or pblendvb to implement select. |
| 3505 if (SrcTy == IceType_v4i1 || SrcTy == IceType_v4i32 || |
| 3506 SrcTy == IceType_v4f32) { |
| 3507 Variable *xmm0 = makeReg(IceType_v4i32, Reg_xmm0); |
| 3508 _movp(xmm0, Condition); |
| 3509 _psll(xmm0, Ctx->getConstantInt(IceType_i8, 31)); |
| 3510 _movp(T, SrcF); |
| 3511 _blendvps(T, LEGAL_HACK(SrcT), xmm0); |
| 3512 _movp(Dest, T); |
| 3513 } else { |
| 3514 assert(typeNumElements(SrcTy) == 8 || typeNumElements(SrcTy) == 16); |
| 3515 Type SignExtTy = Condition->getType() == IceType_v8i1 ? IceType_v8i16 |
| 3516 : IceType_v16i8; |
| 3517 Variable *xmm0 = makeReg(SignExtTy, Reg_xmm0); |
| 3518 lowerCast(InstCast::create(Func, InstCast::Sext, xmm0, Condition)); |
| 3519 _movp(T, SrcF); |
| 3520 _pblendvb(T, LEGAL_HACK(SrcT), xmm0); |
| 3521 _movp(Dest, T); |
| 3522 } |
| 3523 return; |
| 3524 } |
| 3525 // Lower select without SSE4.1: |
| 3526 // a=d?b:c ==> |
| 3527 // if elementtype(d) != i1: |
| 3528 // d=sext(d); |
| 3529 // a=(b&d)|(c&~d); |
3486 Variable *T2 = makeReg(SrcTy); | 3530 Variable *T2 = makeReg(SrcTy); |
3487 // Sign extend the condition operand if applicable. | 3531 // Sign extend the condition operand if applicable. |
3488 if (SrcTy == IceType_v4f32) { | 3532 if (SrcTy == IceType_v4f32) { |
3489 // The sext operation takes only integer arguments. | 3533 // The sext operation takes only integer arguments. |
3490 Variable *T3 = Func->makeVariable(IceType_v4i32, Context.getNode()); | 3534 Variable *T3 = Func->makeVariable(IceType_v4i32, Context.getNode()); |
3491 lowerCast(InstCast::create(Func, InstCast::Sext, T3, Condition)); | 3535 lowerCast(InstCast::create(Func, InstCast::Sext, T3, Condition)); |
3492 _movp(T, T3); | 3536 _movp(T, T3); |
3493 } else if (typeElementType(SrcTy) != IceType_i1) { | 3537 } else if (typeElementType(SrcTy) != IceType_i1) { |
3494 lowerCast(InstCast::create(Func, InstCast::Sext, T, Condition)); | 3538 lowerCast(InstCast::create(Func, InstCast::Sext, T, Condition)); |
3495 } else { | 3539 } else { |
3496 _movp(T, Condition); | 3540 _movp(T, Condition); |
3497 } | 3541 } |
3498 // ALIGNHACK: Until stack alignment support is implemented, the | |
3499 // bitwise vector instructions need to have both operands in | |
3500 // registers. Once there is support for stack alignment, LEGAL_HACK | |
3501 // can be removed. | |
3502 #define LEGAL_HACK(Vect) legalizeToVar((Vect)) | |
3503 _movp(T2, T); | 3542 _movp(T2, T); |
3504 _pand(T, LEGAL_HACK(SrcT)); | 3543 _pand(T, LEGAL_HACK(SrcT)); |
3505 _pandn(T2, LEGAL_HACK(SrcF)); | 3544 _pandn(T2, LEGAL_HACK(SrcF)); |
3506 _por(T, T2); | 3545 _por(T, T2); |
3507 _movp(Dest, T); | 3546 _movp(Dest, T); |
3508 #undef LEGAL_HACK | 3547 #undef LEGAL_HACK |
3509 | 3548 |
3510 return; | 3549 return; |
3511 } | 3550 } |
3512 | 3551 |
(...skipping 504 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4017 for (SizeT i = 0; i < Size; ++i) { | 4056 for (SizeT i = 0; i < Size; ++i) { |
4018 Str << "\t.byte\t" << (((unsigned)Data[i]) & 0xff) << "\n"; | 4057 Str << "\t.byte\t" << (((unsigned)Data[i]) & 0xff) << "\n"; |
4019 } | 4058 } |
4020 Str << "\t.size\t" << MangledName << ", " << Size << "\n"; | 4059 Str << "\t.size\t" << MangledName << ", " << Size << "\n"; |
4021 } | 4060 } |
4022 Str << "\t" << (IsInternal ? ".local" : ".global") << "\t" << MangledName | 4061 Str << "\t" << (IsInternal ? ".local" : ".global") << "\t" << MangledName |
4023 << "\n"; | 4062 << "\n"; |
4024 } | 4063 } |
4025 | 4064 |
4026 } // end of namespace Ice | 4065 } // end of namespace Ice |
OLD | NEW |