Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1108)

Side by Side Diff: src/IceTargetLoweringX8632.cpp

Issue 427843002: Subzero: Add support for SSE4.1 instructions. (Closed) Base URL: https://gerrit.chromium.org/gerrit/p/native_client/pnacl-subzero.git@master
Patch Set: Fix an empty line that was deleted Created 6 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/IceTargetLoweringX8632.h ('k') | tests_lit/llvm2ice_tests/vector-arith.ll » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 //===- subzero/src/IceTargetLoweringX8632.cpp - x86-32 lowering -----------===// 1 //===- subzero/src/IceTargetLoweringX8632.cpp - x86-32 lowering -----------===//
2 // 2 //
3 // The Subzero Code Generator 3 // The Subzero Code Generator
4 // 4 //
5 // This file is distributed under the University of Illinois Open Source 5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details. 6 // License. See LICENSE.TXT for details.
7 // 7 //
8 //===----------------------------------------------------------------------===// 8 //===----------------------------------------------------------------------===//
9 // 9 //
10 // This file implements the TargetLoweringX8632 class, which 10 // This file implements the TargetLoweringX8632 class, which
11 // consists almost entirely of the lowering sequence for each 11 // consists almost entirely of the lowering sequence for each
12 // high-level instruction. It also implements 12 // high-level instruction. It also implements
13 // TargetX8632Fast::postLower() which does the simplest possible 13 // TargetX8632Fast::postLower() which does the simplest possible
14 // register allocation for the "fast" target. 14 // register allocation for the "fast" target.
15 // 15 //
16 //===----------------------------------------------------------------------===// 16 //===----------------------------------------------------------------------===//
17 17
18 #include "IceDefs.h" 18 #include "IceDefs.h"
19 #include "IceCfg.h" 19 #include "IceCfg.h"
20 #include "IceCfgNode.h" 20 #include "IceCfgNode.h"
21 #include "IceInstX8632.h" 21 #include "IceInstX8632.h"
22 #include "IceOperand.h" 22 #include "IceOperand.h"
23 #include "IceTargetLoweringX8632.def" 23 #include "IceTargetLoweringX8632.def"
24 #include "IceTargetLoweringX8632.h" 24 #include "IceTargetLoweringX8632.h"
25 #include "llvm/Support/CommandLine.h"
25 26
26 namespace Ice { 27 namespace Ice {
27 28
28 namespace { 29 namespace {
29 30
30 // The following table summarizes the logic for lowering the fcmp 31 // The following table summarizes the logic for lowering the fcmp
31 // instruction. There is one table entry for each of the 16 conditions. 32 // instruction. There is one table entry for each of the 16 conditions.
32 // 33 //
33 // The first four columns describe the case when the operands are 34 // The first four columns describe the case when the operands are
34 // floating point scalar values. A comment in lowerFcmp() describes the 35 // floating point scalar values. A comment in lowerFcmp() describes the
(...skipping 81 matching lines...) Expand 10 before | Expand all | Expand 10 after
116 size_t Index = static_cast<size_t>(Ty); 117 size_t Index = static_cast<size_t>(Ty);
117 assert(Index < TableTypeX8632AttributesSize); 118 assert(Index < TableTypeX8632AttributesSize);
118 return TableTypeX8632Attributes[Ty].InVectorElementType; 119 return TableTypeX8632Attributes[Ty].InVectorElementType;
119 } 120 }
120 121
121 // The maximum number of arguments to pass in XMM registers 122 // The maximum number of arguments to pass in XMM registers
122 const unsigned X86_MAX_XMM_ARGS = 4; 123 const unsigned X86_MAX_XMM_ARGS = 4;
123 // The number of bits in a byte 124 // The number of bits in a byte
124 const unsigned X86_CHAR_BIT = 8; 125 const unsigned X86_CHAR_BIT = 8;
125 126
127 // Instruction set options
128 namespace cl = ::llvm::cl;
129 cl::opt<TargetX8632::X86InstructionSet> CLInstructionSet(
130 "mattr", cl::desc("X86 target attributes"),
131 cl::init(TargetX8632::SSE2),
132 cl::values(
133 clEnumValN(TargetX8632::SSE2, "sse2",
134 "Enable SSE2 instructions (default)"),
135 clEnumValN(TargetX8632::SSE4_1, "sse4.1",
136 "Enable SSE 4.1 instructions"), clEnumValEnd));
137
126 // Return a string representation of the type that is suitable for use 138 // Return a string representation of the type that is suitable for use
127 // in an identifier. 139 // in an identifier.
128 IceString typeIdentString(const Type Ty) { 140 IceString typeIdentString(const Type Ty) {
129 IceString Str; 141 IceString Str;
130 llvm::raw_string_ostream BaseOS(Str); 142 llvm::raw_string_ostream BaseOS(Str);
131 if (isVectorType(Ty)) { 143 if (isVectorType(Ty)) {
132 BaseOS << "v" << typeNumElements(Ty) << typeElementType(Ty); 144 BaseOS << "v" << typeNumElements(Ty) << typeElementType(Ty);
133 } else { 145 } else {
134 BaseOS << Ty; 146 BaseOS << Ty;
135 } 147 }
(...skipping 91 matching lines...) Expand 10 before | Expand all | Expand 10 after
227 #define X(tag, size, align, elts, elty, str) \ 239 #define X(tag, size, align, elts, elty, str) \
228 STATIC_ASSERT(_table1_##tag == _table2_##tag); 240 STATIC_ASSERT(_table1_##tag == _table2_##tag);
229 ICETYPE_TABLE; 241 ICETYPE_TABLE;
230 #undef X 242 #undef X
231 } 243 }
232 } 244 }
233 245
234 } // end of anonymous namespace 246 } // end of anonymous namespace
235 247
236 TargetX8632::TargetX8632(Cfg *Func) 248 TargetX8632::TargetX8632(Cfg *Func)
237 : TargetLowering(Func), IsEbpBasedFrame(false), FrameSizeLocals(0), 249 : TargetLowering(Func), InstructionSet(CLInstructionSet),
238 LocalsSizeBytes(0), NextLabelNumber(0), ComputedLiveRanges(false), 250 IsEbpBasedFrame(false), FrameSizeLocals(0), LocalsSizeBytes(0),
251 NextLabelNumber(0), ComputedLiveRanges(false),
239 PhysicalRegisters(VarList(Reg_NUM)) { 252 PhysicalRegisters(VarList(Reg_NUM)) {
240 // TODO: Don't initialize IntegerRegisters and friends every time. 253 // TODO: Don't initialize IntegerRegisters and friends every time.
241 // Instead, initialize in some sort of static initializer for the 254 // Instead, initialize in some sort of static initializer for the
242 // class. 255 // class.
243 llvm::SmallBitVector IntegerRegisters(Reg_NUM); 256 llvm::SmallBitVector IntegerRegisters(Reg_NUM);
244 llvm::SmallBitVector IntegerRegistersI8(Reg_NUM); 257 llvm::SmallBitVector IntegerRegistersI8(Reg_NUM);
245 llvm::SmallBitVector FloatRegisters(Reg_NUM); 258 llvm::SmallBitVector FloatRegisters(Reg_NUM);
246 llvm::SmallBitVector VectorRegisters(Reg_NUM); 259 llvm::SmallBitVector VectorRegisters(Reg_NUM);
247 llvm::SmallBitVector InvalidRegisters(Reg_NUM); 260 llvm::SmallBitVector InvalidRegisters(Reg_NUM);
248 ScratchRegs.resize(Reg_NUM); 261 ScratchRegs.resize(Reg_NUM);
(...skipping 972 matching lines...) Expand 10 before | Expand all | Expand 10 after
1221 _pxor(T, LEGAL_HACK(Src1)); 1234 _pxor(T, LEGAL_HACK(Src1));
1222 _movp(Dest, T); 1235 _movp(Dest, T);
1223 } break; 1236 } break;
1224 case InstArithmetic::Sub: { 1237 case InstArithmetic::Sub: {
1225 Variable *T = makeReg(Dest->getType()); 1238 Variable *T = makeReg(Dest->getType());
1226 _movp(T, Src0); 1239 _movp(T, Src0);
1227 _psub(T, LEGAL_HACK(Src1)); 1240 _psub(T, LEGAL_HACK(Src1));
1228 _movp(Dest, T); 1241 _movp(Dest, T);
1229 } break; 1242 } break;
1230 case InstArithmetic::Mul: { 1243 case InstArithmetic::Mul: {
1231 if (Dest->getType() == IceType_v4i32) { 1244 bool TypesAreValidForPmull =
1245 Dest->getType() == IceType_v4i32 || Dest->getType() == IceType_v8i16;
1246 bool InstructionSetIsValidForPmull =
1247 Dest->getType() == IceType_v8i16 || InstructionSet >= SSE4_1;
1248 if (TypesAreValidForPmull && InstructionSetIsValidForPmull) {
1249 Variable *T = makeReg(Dest->getType());
1250 _movp(T, Src0);
1251 _pmull(T, legalizeToVar(Src1));
1252 _movp(Dest, T);
1253 } else if (Dest->getType() == IceType_v4i32) {
1232 // Lowering sequence: 1254 // Lowering sequence:
1233 // Note: The mask arguments have index 0 on the left. 1255 // Note: The mask arguments have index 0 on the left.
1234 // 1256 //
1235 // movups T1, Src0 1257 // movups T1, Src0
1236 // pshufd T2, Src0, {1,0,3,0} 1258 // pshufd T2, Src0, {1,0,3,0}
1237 // pshufd T3, Src1, {1,0,3,0} 1259 // pshufd T3, Src1, {1,0,3,0}
1238 // # T1 = {Src0[0] * Src1[0], Src0[2] * Src1[2]} 1260 // # T1 = {Src0[0] * Src1[0], Src0[2] * Src1[2]}
1239 // pmuludq T1, Src1 1261 // pmuludq T1, Src1
1240 // # T2 = {Src0[1] * Src1[1], Src0[3] * Src1[3]} 1262 // # T2 = {Src0[1] * Src1[1], Src0[3] * Src1[3]}
1241 // pmuludq T2, T3 1263 // pmuludq T2, T3
1242 // # T1 = {lo(T1[0]), lo(T1[2]), lo(T2[0]), lo(T2[2])} 1264 // # T1 = {lo(T1[0]), lo(T1[2]), lo(T2[0]), lo(T2[2])}
1243 // shufps T1, T2, {0,2,0,2} 1265 // shufps T1, T2, {0,2,0,2}
1244 // pshufd T4, T1, {0,2,1,3} 1266 // pshufd T4, T1, {0,2,1,3}
1245 // movups Dest, T4 1267 // movups Dest, T4
1246 //
1247 // TODO(wala): SSE4.1 has pmulld.
1248 1268
1249 // Mask that directs pshufd to create a vector with entries 1269 // Mask that directs pshufd to create a vector with entries
1250 // Src[1, 0, 3, 0] 1270 // Src[1, 0, 3, 0]
1251 const unsigned Constant1030 = 0x31; 1271 const unsigned Constant1030 = 0x31;
1252 Constant *Mask1030 = Ctx->getConstantInt(IceType_i8, Constant1030); 1272 Constant *Mask1030 = Ctx->getConstantInt(IceType_i8, Constant1030);
1253 // Mask that directs shufps to create a vector with entries 1273 // Mask that directs shufps to create a vector with entries
1254 // Dest[0, 2], Src[0, 2] 1274 // Dest[0, 2], Src[0, 2]
1255 const unsigned Mask0202 = 0x88; 1275 const unsigned Mask0202 = 0x88;
1256 // Mask that directs pshufd to create a vector with entries 1276 // Mask that directs pshufd to create a vector with entries
1257 // Src[0, 2, 1, 3] 1277 // Src[0, 2, 1, 3]
1258 const unsigned Mask0213 = 0xd8; 1278 const unsigned Mask0213 = 0xd8;
1259 Variable *T1 = makeReg(IceType_v4i32); 1279 Variable *T1 = makeReg(IceType_v4i32);
1260 Variable *T2 = makeReg(IceType_v4i32); 1280 Variable *T2 = makeReg(IceType_v4i32);
1261 Variable *T3 = makeReg(IceType_v4i32); 1281 Variable *T3 = makeReg(IceType_v4i32);
1262 Variable *T4 = makeReg(IceType_v4i32); 1282 Variable *T4 = makeReg(IceType_v4i32);
1263 _movp(T1, Src0); 1283 _movp(T1, Src0);
1264 // TODO(wala): ALIGNHACK: Replace Src0R with Src0 and Src1R 1284 // TODO(wala): ALIGNHACK: Replace Src0R with Src0 and Src1R
1265 // with Src1 after stack operand alignment support is 1285 // with Src1 after stack operand alignment support is
1266 // implemented. 1286 // implemented.
1267 Variable *Src0R = LEGAL_HACK(Src0); 1287 Variable *Src0R = LEGAL_HACK(Src0);
1268 Variable *Src1R = LEGAL_HACK(Src1); 1288 Variable *Src1R = LEGAL_HACK(Src1);
1269 _pshufd(T2, Src0R, Mask1030); 1289 _pshufd(T2, Src0R, Mask1030);
1270 _pshufd(T3, Src1R, Mask1030); 1290 _pshufd(T3, Src1R, Mask1030);
1271 _pmuludq(T1, Src1R); 1291 _pmuludq(T1, Src1R);
1272 _pmuludq(T2, T3); 1292 _pmuludq(T2, T3);
1273 _shufps(T1, T2, Ctx->getConstantInt(IceType_i8, Mask0202)); 1293 _shufps(T1, T2, Ctx->getConstantInt(IceType_i8, Mask0202));
1274 _pshufd(T4, T1, Ctx->getConstantInt(IceType_i8, Mask0213)); 1294 _pshufd(T4, T1, Ctx->getConstantInt(IceType_i8, Mask0213));
1275 _movp(Dest, T4); 1295 _movp(Dest, T4);
1276 } else if (Dest->getType() == IceType_v8i16) {
1277 Variable *T = makeReg(IceType_v8i16);
1278 _movp(T, Src0);
1279 _pmullw(T, legalizeToVar(Src1));
1280 _movp(Dest, T);
1281 } else { 1296 } else {
1282 assert(Dest->getType() == IceType_v16i8); 1297 assert(Dest->getType() == IceType_v16i8);
1283 // Sz_mul_v16i8 1298 // Sz_mul_v16i8
1284 const IceString Helper = "Sz_mul_v16i8"; 1299 const IceString Helper = "Sz_mul_v16i8";
1285 const SizeT MaxSrcs = 2; 1300 const SizeT MaxSrcs = 2;
1286 InstCall *Call = makeHelperCall(Helper, Dest, MaxSrcs); 1301 InstCall *Call = makeHelperCall(Helper, Dest, MaxSrcs);
1287 Call->addArg(Src0); 1302 Call->addArg(Src0);
1288 Call->addArg(Src1); 1303 Call->addArg(Src1);
1289 lowerCall(Call); 1304 lowerCall(Call);
1290 } 1305 }
(...skipping 857 matching lines...) Expand 10 before | Expand all | Expand 10 after
2148 // Only constant indices are allowed in PNaCl IR. 2163 // Only constant indices are allowed in PNaCl IR.
2149 assert(ElementIndex); 2164 assert(ElementIndex);
2150 2165
2151 unsigned Index = ElementIndex->getValue(); 2166 unsigned Index = ElementIndex->getValue();
2152 Type Ty = SourceVectOperand->getType(); 2167 Type Ty = SourceVectOperand->getType();
2153 Type ElementTy = typeElementType(Ty); 2168 Type ElementTy = typeElementType(Ty);
2154 Type InVectorElementTy = getInVectorElementType(Ty); 2169 Type InVectorElementTy = getInVectorElementType(Ty);
2155 Variable *ExtractedElement = makeReg(InVectorElementTy); 2170 Variable *ExtractedElement = makeReg(InVectorElementTy);
2156 2171
2157 // TODO(wala): Determine the best lowering sequences for each type. 2172 // TODO(wala): Determine the best lowering sequences for each type.
2158 if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) { 2173 bool CanUsePextr =
2159 // Lower extractelement operations where the element is 32 bits 2174 Ty == IceType_v8i16 || Ty == IceType_v8i1 || InstructionSet >= SSE4_1;
2160 // wide with pshufd. 2175 if (CanUsePextr && Ty != IceType_v4f32) {
2161 // TODO(wala): SSE4.1 has extractps and pextrd 2176 // Use pextrb, pextrw, or pextrd.
2177 Constant *Mask = Ctx->getConstantInt(IceType_i8, Index);
2178 Variable *SourceVectR = legalizeToVar(SourceVectOperand);
2179 _pextr(ExtractedElement, SourceVectR, Mask);
2180 } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
2181 // Use pshufd and movd/movss.
2162 // 2182 //
2163 // ALIGNHACK: Force vector operands to registers in instructions that 2183 // ALIGNHACK: Force vector operands to registers in instructions that
2164 // require aligned memory operands until support for stack alignment 2184 // require aligned memory operands until support for stack alignment
2165 // is implemented. 2185 // is implemented.
2166 #define ALIGN_HACK(Vect) legalizeToVar((Vect)) 2186 #define ALIGN_HACK(Vect) legalizeToVar((Vect))
2167 Variable *T = NULL; 2187 Variable *T = NULL;
2168 if (Index) { 2188 if (Index) {
2169 // The shuffle only needs to occur if the element to be extracted 2189 // The shuffle only needs to occur if the element to be extracted
2170 // is not at the lowest index. 2190 // is not at the lowest index.
2171 Constant *Mask = Ctx->getConstantInt(IceType_i8, Index); 2191 Constant *Mask = Ctx->getConstantInt(IceType_i8, Index);
2172 T = makeReg(Ty); 2192 T = makeReg(Ty);
2173 _pshufd(T, ALIGN_HACK(SourceVectOperand), Mask); 2193 _pshufd(T, ALIGN_HACK(SourceVectOperand), Mask);
2174 } else { 2194 } else {
2175 T = legalizeToVar(SourceVectOperand); 2195 T = legalizeToVar(SourceVectOperand);
2176 } 2196 }
2177 2197
2178 if (InVectorElementTy == IceType_i32) { 2198 if (InVectorElementTy == IceType_i32) {
2179 _movd(ExtractedElement, T); 2199 _movd(ExtractedElement, T);
2180 } else { // InVectorElementTy == IceType_f32 2200 } else { // InVectorElementTy == IceType_f32
2181 // TODO(wala): _movss is only used here because _mov does not 2201 // TODO(wala): _movss is only used here because _mov does not
2182 // allow a vector source and a scalar destination. _mov should be 2202 // allow a vector source and a scalar destination. _mov should be
2183 // able to be used here. 2203 // able to be used here.
2184 // _movss is a binary instruction, so the FakeDef is needed to 2204 // _movss is a binary instruction, so the FakeDef is needed to
2185 // keep the live range analysis consistent. 2205 // keep the live range analysis consistent.
2186 Context.insert(InstFakeDef::create(Func, ExtractedElement)); 2206 Context.insert(InstFakeDef::create(Func, ExtractedElement));
2187 _movss(ExtractedElement, T); 2207 _movss(ExtractedElement, T);
2188 } 2208 }
2189 #undef ALIGN_HACK 2209 #undef ALIGN_HACK
2190 } else if (Ty == IceType_v8i16 || Ty == IceType_v8i1) {
2191 Constant *Mask = Ctx->getConstantInt(IceType_i8, Index);
2192 _pextrw(ExtractedElement, legalizeToVar(SourceVectOperand), Mask);
2193 } else { 2210 } else {
2194 assert(Ty == IceType_v16i8 || Ty == IceType_v16i1); 2211 assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
2195 // Spill the value to a stack slot and do the extraction in memory. 2212 // Spill the value to a stack slot and do the extraction in memory.
2196 // TODO(wala): SSE4.1 has pextrb.
2197 // 2213 //
2198 // TODO(wala): use legalize(SourceVectOperand, Legal_Mem) when 2214 // TODO(wala): use legalize(SourceVectOperand, Legal_Mem) when
2199 // support for legalizing to mem is implemented. 2215 // support for legalizing to mem is implemented.
2200 Variable *Slot = Func->makeVariable(Ty, Context.getNode()); 2216 Variable *Slot = Func->makeVariable(Ty, Context.getNode());
2201 Slot->setWeight(RegWeight::Zero); 2217 Slot->setWeight(RegWeight::Zero);
2202 _movp(Slot, legalizeToVar(SourceVectOperand)); 2218 _movp(Slot, legalizeToVar(SourceVectOperand));
2203 2219
2204 // Compute the location of the element in memory. 2220 // Compute the location of the element in memory.
2205 unsigned Offset = Index * typeWidthInBytes(InVectorElementTy); 2221 unsigned Offset = Index * typeWidthInBytes(InVectorElementTy);
2206 OperandX8632Mem *Loc = 2222 OperandX8632Mem *Loc =
(...skipping 325 matching lines...) Expand 10 before | Expand all | Expand 10 after
2532 // Expand the element to the appropriate size for it to be inserted 2548 // Expand the element to the appropriate size for it to be inserted
2533 // in the vector. 2549 // in the vector.
2534 Variable *Expanded = 2550 Variable *Expanded =
2535 Func->makeVariable(InVectorElementTy, Context.getNode()); 2551 Func->makeVariable(InVectorElementTy, Context.getNode());
2536 InstCast *Cast = 2552 InstCast *Cast =
2537 InstCast::create(Func, InstCast::Zext, Expanded, ElementToInsert); 2553 InstCast::create(Func, InstCast::Zext, Expanded, ElementToInsert);
2538 lowerCast(Cast); 2554 lowerCast(Cast);
2539 ElementToInsert = Expanded; 2555 ElementToInsert = Expanded;
2540 } 2556 }
2541 2557
2542 if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) { 2558 if (Ty == IceType_v8i16 || Ty == IceType_v8i1 || InstructionSet >= SSE4_1) {
2543 // Lower insertelement with 32-bit wide elements using shufps or 2559 // Use insertps, pinsrb, pinsrw, or pinsrd.
2544 // movss. 2560 Operand *Element = legalize(ElementToInsert, Legal_Mem | Legal_Reg);
2545 // TODO(wala): SSE4.1 has pinsrd and insertps. 2561 Variable *T = makeReg(Ty);
2562 _movp(T, SourceVectOperand);
2563 if (Ty == IceType_v4f32)
2564 _insertps(T, Element, Ctx->getConstantInt(IceType_i8, Index << 4));
2565 else
2566 _pinsr(T, Element, Ctx->getConstantInt(IceType_i8, Index));
2567 _movp(Inst->getDest(), T);
2568 } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
2569 // Use shufps or movss.
2546 Variable *Element = NULL; 2570 Variable *Element = NULL;
2547 if (InVectorElementTy == IceType_f32) { 2571 if (InVectorElementTy == IceType_f32) {
2548 // Element will be in an XMM register since it is floating point. 2572 // Element will be in an XMM register since it is floating point.
2549 Element = legalizeToVar(ElementToInsert); 2573 Element = legalizeToVar(ElementToInsert);
2550 } else { 2574 } else {
2551 // Copy an integer to an XMM register. 2575 // Copy an integer to an XMM register.
2552 Operand *T = legalize(ElementToInsert, Legal_Reg | Legal_Mem); 2576 Operand *T = legalize(ElementToInsert, Legal_Reg | Legal_Mem);
2553 Element = makeReg(Ty); 2577 Element = makeReg(Ty);
2554 _movd(Element, T); 2578 _movd(Element, T);
2555 } 2579 }
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after
2600 _shufps(Element, SourceVectOperand, Mask2Constant); 2624 _shufps(Element, SourceVectOperand, Mask2Constant);
2601 _movp(Inst->getDest(), Element); 2625 _movp(Inst->getDest(), Element);
2602 } else { 2626 } else {
2603 Variable *T = makeReg(Ty); 2627 Variable *T = makeReg(Ty);
2604 _movp(T, SourceVectOperand); 2628 _movp(T, SourceVectOperand);
2605 _shufps(Element, T, Mask1Constant); 2629 _shufps(Element, T, Mask1Constant);
2606 _shufps(T, Element, Mask2Constant); 2630 _shufps(T, Element, Mask2Constant);
2607 _movp(Inst->getDest(), T); 2631 _movp(Inst->getDest(), T);
2608 } 2632 }
2609 #undef ALIGN_HACK 2633 #undef ALIGN_HACK
2610 } else if (Ty == IceType_v8i16 || Ty == IceType_v8i1) {
2611 Operand *Element = legalize(ElementToInsert, Legal_Mem | Legal_Reg);
2612 Variable *T = makeReg(Ty);
2613 _movp(T, SourceVectOperand);
2614 _pinsrw(T, Element, Ctx->getConstantInt(IceType_i8, Index));
2615 _movp(Inst->getDest(), T);
2616 } else { 2634 } else {
2617 assert(Ty == IceType_v16i8 || Ty == IceType_v16i1); 2635 assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
2618 // Spill the value to a stack slot and perform the insertion in 2636 // Spill the value to a stack slot and perform the insertion in
2619 // memory. 2637 // memory.
2620 // TODO(wala): SSE4.1 has pinsrb.
2621 // 2638 //
2622 // TODO(wala): use legalize(SourceVectOperand, Legal_Mem) when 2639 // TODO(wala): use legalize(SourceVectOperand, Legal_Mem) when
2623 // support for legalizing to mem is implemented. 2640 // support for legalizing to mem is implemented.
2624 Variable *Slot = Func->makeVariable(Ty, Context.getNode()); 2641 Variable *Slot = Func->makeVariable(Ty, Context.getNode());
2625 Slot->setWeight(RegWeight::Zero); 2642 Slot->setWeight(RegWeight::Zero);
2626 _movp(Slot, legalizeToVar(SourceVectOperand)); 2643 _movp(Slot, legalizeToVar(SourceVectOperand));
2627 2644
2628 // Compute the location of the position to insert in memory. 2645 // Compute the location of the position to insert in memory.
2629 unsigned Offset = Index * typeWidthInBytes(InVectorElementTy); 2646 unsigned Offset = Index * typeWidthInBytes(InVectorElementTy);
2630 OperandX8632Mem *Loc = 2647 OperandX8632Mem *Loc =
(...skipping 913 matching lines...) Expand 10 before | Expand all | Expand 10 after
3544 Context.insert(InstFakeUse::create(Func, esp)); 3561 Context.insert(InstFakeUse::create(Func, esp));
3545 } 3562 }
3546 3563
3547 void TargetX8632::lowerSelect(const InstSelect *Inst) { 3564 void TargetX8632::lowerSelect(const InstSelect *Inst) {
3548 Variable *Dest = Inst->getDest(); 3565 Variable *Dest = Inst->getDest();
3549 Operand *SrcT = Inst->getTrueOperand(); 3566 Operand *SrcT = Inst->getTrueOperand();
3550 Operand *SrcF = Inst->getFalseOperand(); 3567 Operand *SrcF = Inst->getFalseOperand();
3551 Operand *Condition = Inst->getCondition(); 3568 Operand *Condition = Inst->getCondition();
3552 3569
3553 if (isVectorType(Dest->getType())) { 3570 if (isVectorType(Dest->getType())) {
3554 // a=d?b:c ==> d=sext(d); a=(b&d)|(c&~d)
3555 // TODO(wala): SSE4.1 has blendvps and pblendvb. SSE4.1 also has
3556 // blendps and pblendw for constant condition operands.
3557 Type SrcTy = SrcT->getType(); 3571 Type SrcTy = SrcT->getType();
3558 Variable *T = makeReg(SrcTy); 3572 Variable *T = makeReg(SrcTy);
3573 // ALIGNHACK: Until stack alignment support is implemented, vector
3574 // instructions need to have vector operands in registers. Once
3575 // there is support for stack alignment, LEGAL_HACK can be removed.
3576 #define LEGAL_HACK(Vect) legalizeToVar((Vect))
3577 if (InstructionSet >= SSE4_1) {
3578 // TODO(wala): If the condition operand is a constant, use blendps
3579 // or pblendw.
3580 //
3581 // Use blendvps or pblendvb to implement select.
3582 if (SrcTy == IceType_v4i1 || SrcTy == IceType_v4i32 ||
3583 SrcTy == IceType_v4f32) {
3584 Variable *xmm0 = makeReg(IceType_v4i32, Reg_xmm0);
3585 _movp(xmm0, Condition);
3586 _psll(xmm0, Ctx->getConstantInt(IceType_i8, 31));
3587 _movp(T, SrcF);
3588 _blendvps(T, LEGAL_HACK(SrcT), xmm0);
3589 _movp(Dest, T);
3590 } else {
3591 assert(typeNumElements(SrcTy) == 8 || typeNumElements(SrcTy) == 16);
3592 Type SignExtTy = Condition->getType() == IceType_v8i1 ? IceType_v8i16
3593 : IceType_v16i8;
3594 Variable *xmm0 = makeReg(SignExtTy, Reg_xmm0);
3595 lowerCast(InstCast::create(Func, InstCast::Sext, xmm0, Condition));
3596 _movp(T, SrcF);
3597 _pblendvb(T, LEGAL_HACK(SrcT), xmm0);
3598 _movp(Dest, T);
3599 }
3600 return;
3601 }
3602 // Lower select without SSE4.1:
3603 // a=d?b:c ==>
3604 // if elementtype(b) != i1:
3605 // d=sext(d);
3606 // a=(b&d)|(c&~d);
3559 Variable *T2 = makeReg(SrcTy); 3607 Variable *T2 = makeReg(SrcTy);
3560 // Sign extend the condition operand if applicable. 3608 // Sign extend the condition operand if applicable.
3561 if (SrcTy == IceType_v4f32) { 3609 if (SrcTy == IceType_v4f32) {
3562 // The sext operation takes only integer arguments. 3610 // The sext operation takes only integer arguments.
3563 Variable *T3 = Func->makeVariable(IceType_v4i32, Context.getNode()); 3611 Variable *T3 = Func->makeVariable(IceType_v4i32, Context.getNode());
3564 lowerCast(InstCast::create(Func, InstCast::Sext, T3, Condition)); 3612 lowerCast(InstCast::create(Func, InstCast::Sext, T3, Condition));
3565 _movp(T, T3); 3613 _movp(T, T3);
3566 } else if (typeElementType(SrcTy) != IceType_i1) { 3614 } else if (typeElementType(SrcTy) != IceType_i1) {
3567 lowerCast(InstCast::create(Func, InstCast::Sext, T, Condition)); 3615 lowerCast(InstCast::create(Func, InstCast::Sext, T, Condition));
3568 } else { 3616 } else {
3569 _movp(T, Condition); 3617 _movp(T, Condition);
3570 } 3618 }
3571 // ALIGNHACK: Until stack alignment support is implemented, the
3572 // bitwise vector instructions need to have both operands in
3573 // registers. Once there is support for stack alignment, LEGAL_HACK
3574 // can be removed.
3575 #define LEGAL_HACK(Vect) legalizeToVar((Vect))
3576 _movp(T2, T); 3619 _movp(T2, T);
3577 _pand(T, LEGAL_HACK(SrcT)); 3620 _pand(T, LEGAL_HACK(SrcT));
3578 _pandn(T2, LEGAL_HACK(SrcF)); 3621 _pandn(T2, LEGAL_HACK(SrcF));
3579 _por(T, T2); 3622 _por(T, T2);
3580 _movp(Dest, T); 3623 _movp(Dest, T);
3581 #undef LEGAL_HACK 3624 #undef LEGAL_HACK
3582 3625
3583 return; 3626 return;
3584 } 3627 }
3585 3628
(...skipping 504 matching lines...) Expand 10 before | Expand all | Expand 10 after
4090 for (SizeT i = 0; i < Size; ++i) { 4133 for (SizeT i = 0; i < Size; ++i) {
4091 Str << "\t.byte\t" << (((unsigned)Data[i]) & 0xff) << "\n"; 4134 Str << "\t.byte\t" << (((unsigned)Data[i]) & 0xff) << "\n";
4092 } 4135 }
4093 Str << "\t.size\t" << MangledName << ", " << Size << "\n"; 4136 Str << "\t.size\t" << MangledName << ", " << Size << "\n";
4094 } 4137 }
4095 Str << "\t" << (IsInternal ? ".local" : ".global") << "\t" << MangledName 4138 Str << "\t" << (IsInternal ? ".local" : ".global") << "\t" << MangledName
4096 << "\n"; 4139 << "\n";
4097 } 4140 }
4098 4141
4099 } // end of namespace Ice 4142 } // end of namespace Ice
OLDNEW
« no previous file with comments | « src/IceTargetLoweringX8632.h ('k') | tests_lit/llvm2ice_tests/vector-arith.ll » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698