src/IceTargetLoweringX8632.cpp - Issue 427843002: Subzero: Add support for SSE4.1 instructions.

Side by Side Diff: src/IceTargetLoweringX8632.cpp

Issue 427843002: Subzero: Add support for SSE4.1 instructions. (Closed) Base URL: https://gerrit.chromium.org/gerrit/p/native_client/pnacl-subzero.git@master

Patch Set: Fix an empty line that was deleted Created 6 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 //===- subzero/src/IceTargetLoweringX8632.cpp - x86-32 lowering -----------===//	1 //===- subzero/src/IceTargetLoweringX8632.cpp - x86-32 lowering -----------===//

2 //	2 //

3 // The Subzero Code Generator	3 // The Subzero Code Generator

4 //	4 //

5 // This file is distributed under the University of Illinois Open Source	5 // This file is distributed under the University of Illinois Open Source

6 // License. See LICENSE.TXT for details.	6 // License. See LICENSE.TXT for details.

7 //	7 //

8 //===----------------------------------------------------------------------===//	8 //===----------------------------------------------------------------------===//

9 //	9 //

10 // This file implements the TargetLoweringX8632 class, which	10 // This file implements the TargetLoweringX8632 class, which

11 // consists almost entirely of the lowering sequence for each	11 // consists almost entirely of the lowering sequence for each

12 // high-level instruction. It also implements	12 // high-level instruction. It also implements

13 // TargetX8632Fast::postLower() which does the simplest possible	13 // TargetX8632Fast::postLower() which does the simplest possible

14 // register allocation for the "fast" target.	14 // register allocation for the "fast" target.

15 //	15 //

16 //===----------------------------------------------------------------------===//	16 //===----------------------------------------------------------------------===//

17	17

18 #include "IceDefs.h"	18 #include "IceDefs.h"

19 #include "IceCfg.h"	19 #include "IceCfg.h"

20 #include "IceCfgNode.h"	20 #include "IceCfgNode.h"

21 #include "IceInstX8632.h"	21 #include "IceInstX8632.h"

22 #include "IceOperand.h"	22 #include "IceOperand.h"

23 #include "IceTargetLoweringX8632.def"	23 #include "IceTargetLoweringX8632.def"

24 #include "IceTargetLoweringX8632.h"	24 #include "IceTargetLoweringX8632.h"

	25 #include "llvm/Support/CommandLine.h"

25	26

26 namespace Ice {	27 namespace Ice {

27	28

28 namespace {	29 namespace {

29	30

30 // The following table summarizes the logic for lowering the fcmp	31 // The following table summarizes the logic for lowering the fcmp

31 // instruction. There is one table entry for each of the 16 conditions.	32 // instruction. There is one table entry for each of the 16 conditions.

32 //	33 //

33 // The first four columns describe the case when the operands are	34 // The first four columns describe the case when the operands are

34 // floating point scalar values. A comment in lowerFcmp() describes the	35 // floating point scalar values. A comment in lowerFcmp() describes the

(...skipping 81 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
116 size_t Index = static_cast<size_t>(Ty);	117 size_t Index = static_cast<size_t>(Ty);

117 assert(Index < TableTypeX8632AttributesSize);	118 assert(Index < TableTypeX8632AttributesSize);

118 return TableTypeX8632Attributes[Ty].InVectorElementType;	119 return TableTypeX8632Attributes[Ty].InVectorElementType;

119 }	120 }

120	121

121 // The maximum number of arguments to pass in XMM registers	122 // The maximum number of arguments to pass in XMM registers

122 const unsigned X86_MAX_XMM_ARGS = 4;	123 const unsigned X86_MAX_XMM_ARGS = 4;

123 // The number of bits in a byte	124 // The number of bits in a byte

124 const unsigned X86_CHAR_BIT = 8;	125 const unsigned X86_CHAR_BIT = 8;

125	126

	127 // Instruction set options

	128 namespace cl = ::llvm::cl;

	129 cl::opt<TargetX8632::X86InstructionSet> CLInstructionSet(

	130 "mattr", cl::desc("X86 target attributes"),

	131 cl::init(TargetX8632::SSE2),

	132 cl::values(

	133 clEnumValN(TargetX8632::SSE2, "sse2",

	134 "Enable SSE2 instructions (default)"),

	135 clEnumValN(TargetX8632::SSE4_1, "sse4.1",

	136 "Enable SSE 4.1 instructions"), clEnumValEnd));

	137

126 // Return a string representation of the type that is suitable for use	138 // Return a string representation of the type that is suitable for use

127 // in an identifier.	139 // in an identifier.

128 IceString typeIdentString(const Type Ty) {	140 IceString typeIdentString(const Type Ty) {

129 IceString Str;	141 IceString Str;

130 llvm::raw_string_ostream BaseOS(Str);	142 llvm::raw_string_ostream BaseOS(Str);

131 if (isVectorType(Ty)) {	143 if (isVectorType(Ty)) {

132 BaseOS << "v" << typeNumElements(Ty) << typeElementType(Ty);	144 BaseOS << "v" << typeNumElements(Ty) << typeElementType(Ty);

133 } else {	145 } else {

134 BaseOS << Ty;	146 BaseOS << Ty;

135 }	147 }

(...skipping 91 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
227 #define X(tag, size, align, elts, elty, str) \	239 #define X(tag, size, align, elts, elty, str) \

228 STATIC_ASSERT(_table1_##tag == _table2_##tag);	240 STATIC_ASSERT(_table1_##tag == _table2_##tag);

229 ICETYPE_TABLE;	241 ICETYPE_TABLE;

230 #undef X	242 #undef X

231 }	243 }

232 }	244 }

233	245

234 } // end of anonymous namespace	246 } // end of anonymous namespace

235	247

236 TargetX8632::TargetX8632(Cfg *Func)	248 TargetX8632::TargetX8632(Cfg *Func)

237 : TargetLowering(Func), IsEbpBasedFrame(false), FrameSizeLocals(0),	249 : TargetLowering(Func), InstructionSet(CLInstructionSet),

238 LocalsSizeBytes(0), NextLabelNumber(0), ComputedLiveRanges(false),	250 IsEbpBasedFrame(false), FrameSizeLocals(0), LocalsSizeBytes(0),

	251 NextLabelNumber(0), ComputedLiveRanges(false),

239 PhysicalRegisters(VarList(Reg_NUM)) {	252 PhysicalRegisters(VarList(Reg_NUM)) {

240 // TODO: Don't initialize IntegerRegisters and friends every time.	253 // TODO: Don't initialize IntegerRegisters and friends every time.

241 // Instead, initialize in some sort of static initializer for the	254 // Instead, initialize in some sort of static initializer for the

242 // class.	255 // class.

243 llvm::SmallBitVector IntegerRegisters(Reg_NUM);	256 llvm::SmallBitVector IntegerRegisters(Reg_NUM);

244 llvm::SmallBitVector IntegerRegistersI8(Reg_NUM);	257 llvm::SmallBitVector IntegerRegistersI8(Reg_NUM);

245 llvm::SmallBitVector FloatRegisters(Reg_NUM);	258 llvm::SmallBitVector FloatRegisters(Reg_NUM);

246 llvm::SmallBitVector VectorRegisters(Reg_NUM);	259 llvm::SmallBitVector VectorRegisters(Reg_NUM);

247 llvm::SmallBitVector InvalidRegisters(Reg_NUM);	260 llvm::SmallBitVector InvalidRegisters(Reg_NUM);

248 ScratchRegs.resize(Reg_NUM);	261 ScratchRegs.resize(Reg_NUM);

(...skipping 972 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1221 _pxor(T, LEGAL_HACK(Src1));	1234 _pxor(T, LEGAL_HACK(Src1));

1222 _movp(Dest, T);	1235 _movp(Dest, T);

1223 } break;	1236 } break;

1224 case InstArithmetic::Sub: {	1237 case InstArithmetic::Sub: {

1225 Variable *T = makeReg(Dest->getType());	1238 Variable *T = makeReg(Dest->getType());

1226 _movp(T, Src0);	1239 _movp(T, Src0);

1227 _psub(T, LEGAL_HACK(Src1));	1240 _psub(T, LEGAL_HACK(Src1));

1228 _movp(Dest, T);	1241 _movp(Dest, T);

1229 } break;	1242 } break;

1230 case InstArithmetic::Mul: {	1243 case InstArithmetic::Mul: {

1231 if (Dest->getType() == IceType_v4i32) {	1244 bool TypesAreValidForPmull =

	1245 Dest->getType() == IceType_v4i32 \|\| Dest->getType() == IceType_v8i16;

	1246 bool InstructionSetIsValidForPmull =

	1247 Dest->getType() == IceType_v8i16 \|\| InstructionSet >= SSE4_1;

	1248 if (TypesAreValidForPmull && InstructionSetIsValidForPmull) {

	1249 Variable *T = makeReg(Dest->getType());

	1250 _movp(T, Src0);

	1251 _pmull(T, legalizeToVar(Src1));

	1252 _movp(Dest, T);

	1253 } else if (Dest->getType() == IceType_v4i32) {

1232 // Lowering sequence:	1254 // Lowering sequence:

1233 // Note: The mask arguments have index 0 on the left.	1255 // Note: The mask arguments have index 0 on the left.

1234 //	1256 //

1235 // movups T1, Src0	1257 // movups T1, Src0

1236 // pshufd T2, Src0, {1,0,3,0}	1258 // pshufd T2, Src0, {1,0,3,0}

1237 // pshufd T3, Src1, {1,0,3,0}	1259 // pshufd T3, Src1, {1,0,3,0}

1238 // # T1 = {Src0[0] * Src1[0], Src0[2] * Src1[2]}	1260 // # T1 = {Src0[0] * Src1[0], Src0[2] * Src1[2]}

1239 // pmuludq T1, Src1	1261 // pmuludq T1, Src1

1240 // # T2 = {Src0[1] * Src1[1], Src0[3] * Src1[3]}	1262 // # T2 = {Src0[1] * Src1[1], Src0[3] * Src1[3]}

1241 // pmuludq T2, T3	1263 // pmuludq T2, T3

1242 // # T1 = {lo(T1[0]), lo(T1[2]), lo(T2[0]), lo(T2[2])}	1264 // # T1 = {lo(T1[0]), lo(T1[2]), lo(T2[0]), lo(T2[2])}

1243 // shufps T1, T2, {0,2,0,2}	1265 // shufps T1, T2, {0,2,0,2}

1244 // pshufd T4, T1, {0,2,1,3}	1266 // pshufd T4, T1, {0,2,1,3}

1245 // movups Dest, T4	1267 // movups Dest, T4

1246 //

1247 // TODO(wala): SSE4.1 has pmulld.

1248	1268

1249 // Mask that directs pshufd to create a vector with entries	1269 // Mask that directs pshufd to create a vector with entries

1250 // Src[1, 0, 3, 0]	1270 // Src[1, 0, 3, 0]

1251 const unsigned Constant1030 = 0x31;	1271 const unsigned Constant1030 = 0x31;

1252 Constant *Mask1030 = Ctx->getConstantInt(IceType_i8, Constant1030);	1272 Constant *Mask1030 = Ctx->getConstantInt(IceType_i8, Constant1030);

1253 // Mask that directs shufps to create a vector with entries	1273 // Mask that directs shufps to create a vector with entries

1254 // Dest[0, 2], Src[0, 2]	1274 // Dest[0, 2], Src[0, 2]

1255 const unsigned Mask0202 = 0x88;	1275 const unsigned Mask0202 = 0x88;

1256 // Mask that directs pshufd to create a vector with entries	1276 // Mask that directs pshufd to create a vector with entries

1257 // Src[0, 2, 1, 3]	1277 // Src[0, 2, 1, 3]

1258 const unsigned Mask0213 = 0xd8;	1278 const unsigned Mask0213 = 0xd8;

1259 Variable *T1 = makeReg(IceType_v4i32);	1279 Variable *T1 = makeReg(IceType_v4i32);

1260 Variable *T2 = makeReg(IceType_v4i32);	1280 Variable *T2 = makeReg(IceType_v4i32);

1261 Variable *T3 = makeReg(IceType_v4i32);	1281 Variable *T3 = makeReg(IceType_v4i32);

1262 Variable *T4 = makeReg(IceType_v4i32);	1282 Variable *T4 = makeReg(IceType_v4i32);

1263 _movp(T1, Src0);	1283 _movp(T1, Src0);

1264 // TODO(wala): ALIGNHACK: Replace Src0R with Src0 and Src1R	1284 // TODO(wala): ALIGNHACK: Replace Src0R with Src0 and Src1R

1265 // with Src1 after stack operand alignment support is	1285 // with Src1 after stack operand alignment support is

1266 // implemented.	1286 // implemented.

1267 Variable *Src0R = LEGAL_HACK(Src0);	1287 Variable *Src0R = LEGAL_HACK(Src0);

1268 Variable *Src1R = LEGAL_HACK(Src1);	1288 Variable *Src1R = LEGAL_HACK(Src1);

1269 _pshufd(T2, Src0R, Mask1030);	1289 _pshufd(T2, Src0R, Mask1030);

1270 _pshufd(T3, Src1R, Mask1030);	1290 _pshufd(T3, Src1R, Mask1030);

1271 _pmuludq(T1, Src1R);	1291 _pmuludq(T1, Src1R);

1272 _pmuludq(T2, T3);	1292 _pmuludq(T2, T3);

1273 _shufps(T1, T2, Ctx->getConstantInt(IceType_i8, Mask0202));	1293 _shufps(T1, T2, Ctx->getConstantInt(IceType_i8, Mask0202));

1274 _pshufd(T4, T1, Ctx->getConstantInt(IceType_i8, Mask0213));	1294 _pshufd(T4, T1, Ctx->getConstantInt(IceType_i8, Mask0213));

1275 _movp(Dest, T4);	1295 _movp(Dest, T4);

1276 } else if (Dest->getType() == IceType_v8i16) {

1277 Variable *T = makeReg(IceType_v8i16);

1278 _movp(T, Src0);

1279 _pmullw(T, legalizeToVar(Src1));

1280 _movp(Dest, T);

1281 } else {	1296 } else {

1282 assert(Dest->getType() == IceType_v16i8);	1297 assert(Dest->getType() == IceType_v16i8);

1283 // Sz_mul_v16i8	1298 // Sz_mul_v16i8

1284 const IceString Helper = "Sz_mul_v16i8";	1299 const IceString Helper = "Sz_mul_v16i8";

1285 const SizeT MaxSrcs = 2;	1300 const SizeT MaxSrcs = 2;

1286 InstCall *Call = makeHelperCall(Helper, Dest, MaxSrcs);	1301 InstCall *Call = makeHelperCall(Helper, Dest, MaxSrcs);

1287 Call->addArg(Src0);	1302 Call->addArg(Src0);

1288 Call->addArg(Src1);	1303 Call->addArg(Src1);

1289 lowerCall(Call);	1304 lowerCall(Call);

1290 }	1305 }

(...skipping 857 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2148 // Only constant indices are allowed in PNaCl IR.	2163 // Only constant indices are allowed in PNaCl IR.

2149 assert(ElementIndex);	2164 assert(ElementIndex);

2150	2165

2151 unsigned Index = ElementIndex->getValue();	2166 unsigned Index = ElementIndex->getValue();

2152 Type Ty = SourceVectOperand->getType();	2167 Type Ty = SourceVectOperand->getType();

2153 Type ElementTy = typeElementType(Ty);	2168 Type ElementTy = typeElementType(Ty);

2154 Type InVectorElementTy = getInVectorElementType(Ty);	2169 Type InVectorElementTy = getInVectorElementType(Ty);

2155 Variable *ExtractedElement = makeReg(InVectorElementTy);	2170 Variable *ExtractedElement = makeReg(InVectorElementTy);

2156	2171

2157 // TODO(wala): Determine the best lowering sequences for each type.	2172 // TODO(wala): Determine the best lowering sequences for each type.

2158 if (Ty == IceType_v4i32 \|\| Ty == IceType_v4f32 \|\| Ty == IceType_v4i1) {	2173 bool CanUsePextr =

2159 // Lower extractelement operations where the element is 32 bits	2174 Ty == IceType_v8i16 \|\| Ty == IceType_v8i1 \|\| InstructionSet >= SSE4_1;

2160 // wide with pshufd.	2175 if (CanUsePextr && Ty != IceType_v4f32) {

2161 // TODO(wala): SSE4.1 has extractps and pextrd	2176 // Use pextrb, pextrw, or pextrd.

	2177 Constant *Mask = Ctx->getConstantInt(IceType_i8, Index);

	2178 Variable *SourceVectR = legalizeToVar(SourceVectOperand);

	2179 _pextr(ExtractedElement, SourceVectR, Mask);

	2180 } else if (Ty == IceType_v4i32 \|\| Ty == IceType_v4f32 \|\| Ty == IceType_v4i1) {

	2181 // Use pshufd and movd/movss.

2162 //	2182 //

2163 // ALIGNHACK: Force vector operands to registers in instructions that	2183 // ALIGNHACK: Force vector operands to registers in instructions that

2164 // require aligned memory operands until support for stack alignment	2184 // require aligned memory operands until support for stack alignment

2165 // is implemented.	2185 // is implemented.

2166 #define ALIGN_HACK(Vect) legalizeToVar((Vect))	2186 #define ALIGN_HACK(Vect) legalizeToVar((Vect))

2167 Variable *T = NULL;	2187 Variable *T = NULL;

2168 if (Index) {	2188 if (Index) {

2169 // The shuffle only needs to occur if the element to be extracted	2189 // The shuffle only needs to occur if the element to be extracted

2170 // is not at the lowest index.	2190 // is not at the lowest index.

2171 Constant *Mask = Ctx->getConstantInt(IceType_i8, Index);	2191 Constant *Mask = Ctx->getConstantInt(IceType_i8, Index);

2172 T = makeReg(Ty);	2192 T = makeReg(Ty);

2173 _pshufd(T, ALIGN_HACK(SourceVectOperand), Mask);	2193 _pshufd(T, ALIGN_HACK(SourceVectOperand), Mask);

2174 } else {	2194 } else {

2175 T = legalizeToVar(SourceVectOperand);	2195 T = legalizeToVar(SourceVectOperand);

2176 }	2196 }

2177	2197

2178 if (InVectorElementTy == IceType_i32) {	2198 if (InVectorElementTy == IceType_i32) {

2179 _movd(ExtractedElement, T);	2199 _movd(ExtractedElement, T);

2180 } else { // InVectorElementTy == IceType_f32	2200 } else { // InVectorElementTy == IceType_f32

2181 // TODO(wala): _movss is only used here because _mov does not	2201 // TODO(wala): _movss is only used here because _mov does not

2182 // allow a vector source and a scalar destination. _mov should be	2202 // allow a vector source and a scalar destination. _mov should be

2183 // able to be used here.	2203 // able to be used here.

2184 // _movss is a binary instruction, so the FakeDef is needed to	2204 // _movss is a binary instruction, so the FakeDef is needed to

2185 // keep the live range analysis consistent.	2205 // keep the live range analysis consistent.

2186 Context.insert(InstFakeDef::create(Func, ExtractedElement));	2206 Context.insert(InstFakeDef::create(Func, ExtractedElement));

2187 _movss(ExtractedElement, T);	2207 _movss(ExtractedElement, T);

2188 }	2208 }

2189 #undef ALIGN_HACK	2209 #undef ALIGN_HACK

2190 } else if (Ty == IceType_v8i16 \|\| Ty == IceType_v8i1) {

2191 Constant *Mask = Ctx->getConstantInt(IceType_i8, Index);

2192 _pextrw(ExtractedElement, legalizeToVar(SourceVectOperand), Mask);

2193 } else {	2210 } else {

2194 assert(Ty == IceType_v16i8 \|\| Ty == IceType_v16i1);	2211 assert(Ty == IceType_v16i8 \|\| Ty == IceType_v16i1);

2195 // Spill the value to a stack slot and do the extraction in memory.	2212 // Spill the value to a stack slot and do the extraction in memory.

2196 // TODO(wala): SSE4.1 has pextrb.

2197 //	2213 //

2198 // TODO(wala): use legalize(SourceVectOperand, Legal_Mem) when	2214 // TODO(wala): use legalize(SourceVectOperand, Legal_Mem) when

2199 // support for legalizing to mem is implemented.	2215 // support for legalizing to mem is implemented.

2200 Variable *Slot = Func->makeVariable(Ty, Context.getNode());	2216 Variable *Slot = Func->makeVariable(Ty, Context.getNode());

2201 Slot->setWeight(RegWeight::Zero);	2217 Slot->setWeight(RegWeight::Zero);

2202 _movp(Slot, legalizeToVar(SourceVectOperand));	2218 _movp(Slot, legalizeToVar(SourceVectOperand));

2203	2219

2204 // Compute the location of the element in memory.	2220 // Compute the location of the element in memory.

2205 unsigned Offset = Index * typeWidthInBytes(InVectorElementTy);	2221 unsigned Offset = Index * typeWidthInBytes(InVectorElementTy);

2206 OperandX8632Mem *Loc =	2222 OperandX8632Mem *Loc =

(...skipping 325 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2532 // Expand the element to the appropriate size for it to be inserted	2548 // Expand the element to the appropriate size for it to be inserted

2533 // in the vector.	2549 // in the vector.

2534 Variable *Expanded =	2550 Variable *Expanded =

2535 Func->makeVariable(InVectorElementTy, Context.getNode());	2551 Func->makeVariable(InVectorElementTy, Context.getNode());

2536 InstCast *Cast =	2552 InstCast *Cast =

2537 InstCast::create(Func, InstCast::Zext, Expanded, ElementToInsert);	2553 InstCast::create(Func, InstCast::Zext, Expanded, ElementToInsert);

2538 lowerCast(Cast);	2554 lowerCast(Cast);

2539 ElementToInsert = Expanded;	2555 ElementToInsert = Expanded;

2540 }	2556 }

2541	2557

2542 if (Ty == IceType_v4i32 \|\| Ty == IceType_v4f32 \|\| Ty == IceType_v4i1) {	2558 if (Ty == IceType_v8i16 \|\| Ty == IceType_v8i1 \|\| InstructionSet >= SSE4_1) {

2543 // Lower insertelement with 32-bit wide elements using shufps or	2559 // Use insertps, pinsrb, pinsrw, or pinsrd.

2544 // movss.	2560 Operand *Element = legalize(ElementToInsert, Legal_Mem \| Legal_Reg);

2545 // TODO(wala): SSE4.1 has pinsrd and insertps.	2561 Variable *T = makeReg(Ty);

	2562 _movp(T, SourceVectOperand);

	2563 if (Ty == IceType_v4f32)

	2564 _insertps(T, Element, Ctx->getConstantInt(IceType_i8, Index << 4));

	2565 else

	2566 _pinsr(T, Element, Ctx->getConstantInt(IceType_i8, Index));

	2567 _movp(Inst->getDest(), T);

	2568 } else if (Ty == IceType_v4i32 \|\| Ty == IceType_v4f32 \|\| Ty == IceType_v4i1) {

	2569 // Use shufps or movss.

2546 Variable *Element = NULL;	2570 Variable *Element = NULL;

2547 if (InVectorElementTy == IceType_f32) {	2571 if (InVectorElementTy == IceType_f32) {

2548 // Element will be in an XMM register since it is floating point.	2572 // Element will be in an XMM register since it is floating point.

2549 Element = legalizeToVar(ElementToInsert);	2573 Element = legalizeToVar(ElementToInsert);

2550 } else {	2574 } else {

2551 // Copy an integer to an XMM register.	2575 // Copy an integer to an XMM register.

2552 Operand *T = legalize(ElementToInsert, Legal_Reg \| Legal_Mem);	2576 Operand *T = legalize(ElementToInsert, Legal_Reg \| Legal_Mem);

2553 Element = makeReg(Ty);	2577 Element = makeReg(Ty);

2554 _movd(Element, T);	2578 _movd(Element, T);

2555 }	2579 }

(...skipping 44 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2600 _shufps(Element, SourceVectOperand, Mask2Constant);	2624 _shufps(Element, SourceVectOperand, Mask2Constant);

2601 _movp(Inst->getDest(), Element);	2625 _movp(Inst->getDest(), Element);

2602 } else {	2626 } else {

2603 Variable *T = makeReg(Ty);	2627 Variable *T = makeReg(Ty);

2604 _movp(T, SourceVectOperand);	2628 _movp(T, SourceVectOperand);

2605 _shufps(Element, T, Mask1Constant);	2629 _shufps(Element, T, Mask1Constant);

2606 _shufps(T, Element, Mask2Constant);	2630 _shufps(T, Element, Mask2Constant);

2607 _movp(Inst->getDest(), T);	2631 _movp(Inst->getDest(), T);

2608 }	2632 }

2609 #undef ALIGN_HACK	2633 #undef ALIGN_HACK

2610 } else if (Ty == IceType_v8i16 \|\| Ty == IceType_v8i1) {

2611 Operand *Element = legalize(ElementToInsert, Legal_Mem \| Legal_Reg);

2612 Variable *T = makeReg(Ty);

2613 _movp(T, SourceVectOperand);

2614 _pinsrw(T, Element, Ctx->getConstantInt(IceType_i8, Index));

2615 _movp(Inst->getDest(), T);

2616 } else {	2634 } else {

2617 assert(Ty == IceType_v16i8 \|\| Ty == IceType_v16i1);	2635 assert(Ty == IceType_v16i8 \|\| Ty == IceType_v16i1);

2618 // Spill the value to a stack slot and perform the insertion in	2636 // Spill the value to a stack slot and perform the insertion in

2619 // memory.	2637 // memory.

2620 // TODO(wala): SSE4.1 has pinsrb.

2621 //	2638 //

2622 // TODO(wala): use legalize(SourceVectOperand, Legal_Mem) when	2639 // TODO(wala): use legalize(SourceVectOperand, Legal_Mem) when

2623 // support for legalizing to mem is implemented.	2640 // support for legalizing to mem is implemented.

2624 Variable *Slot = Func->makeVariable(Ty, Context.getNode());	2641 Variable *Slot = Func->makeVariable(Ty, Context.getNode());

2625 Slot->setWeight(RegWeight::Zero);	2642 Slot->setWeight(RegWeight::Zero);

2626 _movp(Slot, legalizeToVar(SourceVectOperand));	2643 _movp(Slot, legalizeToVar(SourceVectOperand));

2627	2644

2628 // Compute the location of the position to insert in memory.	2645 // Compute the location of the position to insert in memory.

2629 unsigned Offset = Index * typeWidthInBytes(InVectorElementTy);	2646 unsigned Offset = Index * typeWidthInBytes(InVectorElementTy);

2630 OperandX8632Mem *Loc =	2647 OperandX8632Mem *Loc =

(...skipping 913 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
3544 Context.insert(InstFakeUse::create(Func, esp));	3561 Context.insert(InstFakeUse::create(Func, esp));

3545 }	3562 }

3546	3563

3547 void TargetX8632::lowerSelect(const InstSelect *Inst) {	3564 void TargetX8632::lowerSelect(const InstSelect *Inst) {

3548 Variable *Dest = Inst->getDest();	3565 Variable *Dest = Inst->getDest();

3549 Operand *SrcT = Inst->getTrueOperand();	3566 Operand *SrcT = Inst->getTrueOperand();

3550 Operand *SrcF = Inst->getFalseOperand();	3567 Operand *SrcF = Inst->getFalseOperand();

3551 Operand *Condition = Inst->getCondition();	3568 Operand *Condition = Inst->getCondition();

3552	3569

3553 if (isVectorType(Dest->getType())) {	3570 if (isVectorType(Dest->getType())) {

3554 // a=d?b:c ==> d=sext(d); a=(b&d)|(c&~d)

3555 // TODO(wala): SSE4.1 has blendvps and pblendvb. SSE4.1 also has

3556 // blendps and pblendw for constant condition operands.

3557 Type SrcTy = SrcT->getType();	3571 Type SrcTy = SrcT->getType();

3558 Variable *T = makeReg(SrcTy);	3572 Variable *T = makeReg(SrcTy);

	3573 // ALIGNHACK: Until stack alignment support is implemented, vector

	3574 // instructions need to have vector operands in registers. Once

	3575 // there is support for stack alignment, LEGAL_HACK can be removed.

	3576 #define LEGAL_HACK(Vect) legalizeToVar((Vect))

	3577 if (InstructionSet >= SSE4_1) {

	3578 // TODO(wala): If the condition operand is a constant, use blendps

	3579 // or pblendw.

	3580 //

	3581 // Use blendvps or pblendvb to implement select.

	3582 if (SrcTy == IceType_v4i1 \|\| SrcTy == IceType_v4i32 \|\|

	3583 SrcTy == IceType_v4f32) {

	3584 Variable *xmm0 = makeReg(IceType_v4i32, Reg_xmm0);

	3585 _movp(xmm0, Condition);

	3586 _psll(xmm0, Ctx->getConstantInt(IceType_i8, 31));

	3587 _movp(T, SrcF);

	3588 _blendvps(T, LEGAL_HACK(SrcT), xmm0);

	3589 _movp(Dest, T);

	3590 } else {

	3591 assert(typeNumElements(SrcTy) == 8 \|\| typeNumElements(SrcTy) == 16);

	3592 Type SignExtTy = Condition->getType() == IceType_v8i1 ? IceType_v8i16

	3593 : IceType_v16i8;

	3594 Variable *xmm0 = makeReg(SignExtTy, Reg_xmm0);

	3595 lowerCast(InstCast::create(Func, InstCast::Sext, xmm0, Condition));

	3596 _movp(T, SrcF);

	3597 _pblendvb(T, LEGAL_HACK(SrcT), xmm0);

	3598 _movp(Dest, T);

	3599 }

	3600 return;

	3601 }

	3602 // Lower select without SSE4.1:

	3603 // a=d?b:c ==>

	3604 // if elementtype(d) != i1:

	3605 // d=sext(d);

	3606 // a=(b&d)|(c&~d);

3559 Variable *T2 = makeReg(SrcTy);	3607 Variable *T2 = makeReg(SrcTy);

3560 // Sign extend the condition operand if applicable.	3608 // Sign extend the condition operand if applicable.

3561 if (SrcTy == IceType_v4f32) {	3609 if (SrcTy == IceType_v4f32) {

3562 // The sext operation takes only integer arguments.	3610 // The sext operation takes only integer arguments.

3563 Variable *T3 = Func->makeVariable(IceType_v4i32, Context.getNode());	3611 Variable *T3 = Func->makeVariable(IceType_v4i32, Context.getNode());

3564 lowerCast(InstCast::create(Func, InstCast::Sext, T3, Condition));	3612 lowerCast(InstCast::create(Func, InstCast::Sext, T3, Condition));

3565 _movp(T, T3);	3613 _movp(T, T3);

3566 } else if (typeElementType(SrcTy) != IceType_i1) {	3614 } else if (typeElementType(SrcTy) != IceType_i1) {

3567 lowerCast(InstCast::create(Func, InstCast::Sext, T, Condition));	3615 lowerCast(InstCast::create(Func, InstCast::Sext, T, Condition));

3568 } else {	3616 } else {

3569 _movp(T, Condition);	3617 _movp(T, Condition);

3570 }	3618 }

3571 // ALIGNHACK: Until stack alignment support is implemented, the

3572 // bitwise vector instructions need to have both operands in

3573 // registers. Once there is support for stack alignment, LEGAL_HACK

3574 // can be removed.

3575 #define LEGAL_HACK(Vect) legalizeToVar((Vect))

3576 _movp(T2, T);	3619 _movp(T2, T);

3577 _pand(T, LEGAL_HACK(SrcT));	3620 _pand(T, LEGAL_HACK(SrcT));

3578 _pandn(T2, LEGAL_HACK(SrcF));	3621 _pandn(T2, LEGAL_HACK(SrcF));

3579 _por(T, T2);	3622 _por(T, T2);

3580 _movp(Dest, T);	3623 _movp(Dest, T);

3581 #undef LEGAL_HACK	3624 #undef LEGAL_HACK

3582	3625

3583 return;	3626 return;

3584 }	3627 }

3585	3628

(...skipping 504 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
4090 for (SizeT i = 0; i < Size; ++i) {	4133 for (SizeT i = 0; i < Size; ++i) {

4091 Str << "\t.byte\t" << (((unsigned)Data[i]) & 0xff) << "\n";	4134 Str << "\t.byte\t" << (((unsigned)Data[i]) & 0xff) << "\n";

4092 }	4135 }

4093 Str << "\t.size\t" << MangledName << ", " << Size << "\n";	4136 Str << "\t.size\t" << MangledName << ", " << Size << "\n";

4094 }	4137 }

4095 Str << "\t" << (IsInternal ? ".local" : ".global") << "\t" << MangledName	4138 Str << "\t" << (IsInternal ? ".local" : ".global") << "\t" << MangledName

4096 << "\n";	4139 << "\n";

4097 }	4140 }

4098	4141

4099 } // end of namespace Ice	4142 } // end of namespace Ice

OLD	NEW

« no previous file with comments | « src/IceTargetLoweringX8632.h ('k') | tests_lit/llvm2ice_tests/vector-arith.ll » ('j') | no next file with comments »