Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(206)

Side by Side Diff: src/IceTargetLoweringX8632.cpp

Issue 427843002: Subzero: Add support for SSE4.1 instructions. (Closed) Base URL: https://gerrit.chromium.org/gerrit/p/native_client/pnacl-subzero.git@master
Patch Set: 1) Fix compilation 2) Fuse conditions in mul lowering to avoid code duplication Created 6 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 //===- subzero/src/IceTargetLoweringX8632.cpp - x86-32 lowering -----------===// 1 //===- subzero/src/IceTargetLoweringX8632.cpp - x86-32 lowering -----------===//
2 // 2 //
3 // The Subzero Code Generator 3 // The Subzero Code Generator
4 // 4 //
5 // This file is distributed under the University of Illinois Open Source 5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details. 6 // License. See LICENSE.TXT for details.
7 // 7 //
8 //===----------------------------------------------------------------------===// 8 //===----------------------------------------------------------------------===//
9 // 9 //
10 // This file implements the TargetLoweringX8632 class, which 10 // This file implements the TargetLoweringX8632 class, which
11 // consists almost entirely of the lowering sequence for each 11 // consists almost entirely of the lowering sequence for each
12 // high-level instruction. It also implements 12 // high-level instruction. It also implements
13 // TargetX8632Fast::postLower() which does the simplest possible 13 // TargetX8632Fast::postLower() which does the simplest possible
14 // register allocation for the "fast" target. 14 // register allocation for the "fast" target.
15 // 15 //
16 //===----------------------------------------------------------------------===// 16 //===----------------------------------------------------------------------===//
17 17
18 #include "IceDefs.h" 18 #include "IceDefs.h"
19 #include "IceCfg.h" 19 #include "IceCfg.h"
20 #include "IceCfgNode.h" 20 #include "IceCfgNode.h"
21 #include "IceInstX8632.h" 21 #include "IceInstX8632.h"
22 #include "IceOperand.h" 22 #include "IceOperand.h"
23 #include "IceTargetLoweringX8632.def" 23 #include "IceTargetLoweringX8632.def"
24 #include "IceTargetLoweringX8632.h" 24 #include "IceTargetLoweringX8632.h"
25 #include "llvm/Support/CommandLine.h"
25 26
26 namespace Ice { 27 namespace Ice {
27 28
28 namespace { 29 namespace {
29 30
30 // The following table summarizes the logic for lowering the fcmp 31 // The following table summarizes the logic for lowering the fcmp
31 // instruction. There is one table entry for each of the 16 conditions. 32 // instruction. There is one table entry for each of the 16 conditions.
32 // 33 //
33 // The first four columns describe the case when the operands are 34 // The first four columns describe the case when the operands are
34 // floating point scalar values. A comment in lowerFcmp() describes the 35 // floating point scalar values. A comment in lowerFcmp() describes the
(...skipping 81 matching lines...) Expand 10 before | Expand all | Expand 10 after
116 size_t Index = static_cast<size_t>(Ty); 117 size_t Index = static_cast<size_t>(Ty);
117 assert(Index < TableTypeX8632AttributesSize); 118 assert(Index < TableTypeX8632AttributesSize);
118 return TableTypeX8632Attributes[Ty].InVectorElementType; 119 return TableTypeX8632Attributes[Ty].InVectorElementType;
119 } 120 }
120 121
121 // The maximum number of arguments to pass in XMM registers 122 // The maximum number of arguments to pass in XMM registers
122 const unsigned X86_MAX_XMM_ARGS = 4; 123 const unsigned X86_MAX_XMM_ARGS = 4;
123 // The number of bits in a byte 124 // The number of bits in a byte
124 const unsigned X86_CHAR_BIT = 8; 125 const unsigned X86_CHAR_BIT = 8;
125 126
127 // Instruction set options
128 namespace cl = ::llvm::cl;
129 cl::opt<TargetX8632::X86InstructionSet> CLInstructionSet(
130 "mattr", cl::desc("X86 target attributes"),
131 cl::init(TargetX8632::SSE2),
132 cl::values(
133 clEnumValN(TargetX8632::SSE2, "sse2",
134 "Enable SSE2 instructions (default)"),
135 clEnumValN(TargetX8632::SSE4_1, "sse4.1",
136 "Enable SSE 4.1 instructions"), clEnumValEnd));
137
126 // Return a string representation of the type that is suitable for use 138 // Return a string representation of the type that is suitable for use
127 // in an identifier. 139 // in an identifier.
128 IceString typeIdentString(const Type Ty) { 140 IceString typeIdentString(const Type Ty) {
129 IceString Str; 141 IceString Str;
130 llvm::raw_string_ostream BaseOS(Str); 142 llvm::raw_string_ostream BaseOS(Str);
131 if (isVectorType(Ty)) { 143 if (isVectorType(Ty)) {
132 BaseOS << "v" << typeNumElements(Ty) << typeElementType(Ty); 144 BaseOS << "v" << typeNumElements(Ty) << typeElementType(Ty);
133 } else { 145 } else {
134 BaseOS << Ty; 146 BaseOS << Ty;
135 } 147 }
(...skipping 91 matching lines...) Expand 10 before | Expand all | Expand 10 after
227 #define X(tag, size, align, elts, elty, str) \ 239 #define X(tag, size, align, elts, elty, str) \
228 STATIC_ASSERT(_table1_##tag == _table2_##tag); 240 STATIC_ASSERT(_table1_##tag == _table2_##tag);
229 ICETYPE_TABLE; 241 ICETYPE_TABLE;
230 #undef X 242 #undef X
231 } 243 }
232 } 244 }
233 245
234 } // end of anonymous namespace 246 } // end of anonymous namespace
235 247
236 TargetX8632::TargetX8632(Cfg *Func) 248 TargetX8632::TargetX8632(Cfg *Func)
237 : TargetLowering(Func), IsEbpBasedFrame(false), FrameSizeLocals(0), 249 : TargetLowering(Func), InstructionSet(CLInstructionSet),
238 LocalsSizeBytes(0), NextLabelNumber(0), ComputedLiveRanges(false), 250 IsEbpBasedFrame(false), FrameSizeLocals(0), LocalsSizeBytes(0),
251 NextLabelNumber(0), ComputedLiveRanges(false),
239 PhysicalRegisters(VarList(Reg_NUM)) { 252 PhysicalRegisters(VarList(Reg_NUM)) {
240 // TODO: Don't initialize IntegerRegisters and friends every time. 253 // TODO: Don't initialize IntegerRegisters and friends every time.
241 // Instead, initialize in some sort of static initializer for the 254 // Instead, initialize in some sort of static initializer for the
242 // class. 255 // class.
243 llvm::SmallBitVector IntegerRegisters(Reg_NUM); 256 llvm::SmallBitVector IntegerRegisters(Reg_NUM);
244 llvm::SmallBitVector IntegerRegistersI8(Reg_NUM); 257 llvm::SmallBitVector IntegerRegistersI8(Reg_NUM);
245 llvm::SmallBitVector FloatRegisters(Reg_NUM); 258 llvm::SmallBitVector FloatRegisters(Reg_NUM);
246 llvm::SmallBitVector VectorRegisters(Reg_NUM); 259 llvm::SmallBitVector VectorRegisters(Reg_NUM);
247 llvm::SmallBitVector InvalidRegisters(Reg_NUM); 260 llvm::SmallBitVector InvalidRegisters(Reg_NUM);
248 ScratchRegs.resize(Reg_NUM); 261 ScratchRegs.resize(Reg_NUM);
(...skipping 972 matching lines...) Expand 10 before | Expand all | Expand 10 after
1221 _pxor(T, LEGAL_HACK(Src1)); 1234 _pxor(T, LEGAL_HACK(Src1));
1222 _movp(Dest, T); 1235 _movp(Dest, T);
1223 } break; 1236 } break;
1224 case InstArithmetic::Sub: { 1237 case InstArithmetic::Sub: {
1225 Variable *T = makeReg(Dest->getType()); 1238 Variable *T = makeReg(Dest->getType());
1226 _movp(T, Src0); 1239 _movp(T, Src0);
1227 _psub(T, LEGAL_HACK(Src1)); 1240 _psub(T, LEGAL_HACK(Src1));
1228 _movp(Dest, T); 1241 _movp(Dest, T);
1229 } break; 1242 } break;
1230 case InstArithmetic::Mul: { 1243 case InstArithmetic::Mul: {
1231 if (Dest->getType() == IceType_v4i32) { 1244 if (Dest->getType() == IceType_v8i16 ||
1245 (InstructionSet >= SSE4_1 && Dest->getType() == IceType_v4i32)) {
1246 Variable *T = makeReg(Dest->getType());
1247 _movp(T, Src0);
1248 _pmull(T, legalizeToVar(Src1));
1249 _movp(Dest, T);
1250 } else if (Dest->getType() == IceType_v4i32) {
1232 // Lowering sequence: 1251 // Lowering sequence:
1233 // Note: The mask arguments have index 0 on the left. 1252 // Note: The mask arguments have index 0 on the left.
1234 // 1253 //
1235 // movups T1, Src0 1254 // movups T1, Src0
1236 // pshufd T2, Src0, {1,0,3,0} 1255 // pshufd T2, Src0, {1,0,3,0}
1237 // pshufd T3, Src1, {1,0,3,0} 1256 // pshufd T3, Src1, {1,0,3,0}
1238 // # T1 = {Src0[0] * Src1[0], Src0[2] * Src1[2]} 1257 // # T1 = {Src0[0] * Src1[0], Src0[2] * Src1[2]}
1239 // pmuludq T1, Src1 1258 // pmuludq T1, Src1
1240 // # T2 = {Src0[1] * Src1[1], Src0[3] * Src1[3]} 1259 // # T2 = {Src0[1] * Src1[1], Src0[3] * Src1[3]}
1241 // pmuludq T2, T3 1260 // pmuludq T2, T3
1242 // # T1 = {lo(T1[0]), lo(T1[2]), lo(T2[0]), lo(T2[2])} 1261 // # T1 = {lo(T1[0]), lo(T1[2]), lo(T2[0]), lo(T2[2])}
1243 // shufps T1, T2, {0,2,0,2} 1262 // shufps T1, T2, {0,2,0,2}
1244 // pshufd T4, T1, {0,2,1,3} 1263 // pshufd T4, T1, {0,2,1,3}
1245 // movups Dest, T4 1264 // movups Dest, T4
1246 //
1247 // TODO(wala): SSE4.1 has pmulld.
1248 1265
1249 // Mask that directs pshufd to create a vector with entries 1266 // Mask that directs pshufd to create a vector with entries
1250 // Src[1, 0, 3, 0] 1267 // Src[1, 0, 3, 0]
1251 const unsigned Constant1030 = 0x31; 1268 const unsigned Constant1030 = 0x31;
1252 Constant *Mask1030 = Ctx->getConstantInt(IceType_i8, Constant1030); 1269 Constant *Mask1030 = Ctx->getConstantInt(IceType_i8, Constant1030);
1253 // Mask that directs shufps to create a vector with entries 1270 // Mask that directs shufps to create a vector with entries
1254 // Dest[0, 2], Src[0, 2] 1271 // Dest[0, 2], Src[0, 2]
1255 const unsigned Mask0202 = 0x88; 1272 const unsigned Mask0202 = 0x88;
1256 // Mask that directs pshufd to create a vector with entries 1273 // Mask that directs pshufd to create a vector with entries
1257 // Src[0, 2, 1, 3] 1274 // Src[0, 2, 1, 3]
1258 const unsigned Mask0213 = 0xd8; 1275 const unsigned Mask0213 = 0xd8;
1259 Variable *T1 = makeReg(IceType_v4i32); 1276 Variable *T1 = makeReg(IceType_v4i32);
1260 Variable *T2 = makeReg(IceType_v4i32); 1277 Variable *T2 = makeReg(IceType_v4i32);
1261 Variable *T3 = makeReg(IceType_v4i32); 1278 Variable *T3 = makeReg(IceType_v4i32);
1262 Variable *T4 = makeReg(IceType_v4i32); 1279 Variable *T4 = makeReg(IceType_v4i32);
1263 _movp(T1, Src0); 1280 _movp(T1, Src0);
1264 // TODO(wala): ALIGNHACK: Replace Src0R with Src0 and Src1R 1281 // TODO(wala): ALIGNHACK: Replace Src0R with Src0 and Src1R
1265 // with Src1 after stack operand alignment support is 1282 // with Src1 after stack operand alignment support is
1266 // implemented. 1283 // implemented.
1267 Variable *Src0R = LEGAL_HACK(Src0); 1284 Variable *Src0R = LEGAL_HACK(Src0);
1268 Variable *Src1R = LEGAL_HACK(Src1); 1285 Variable *Src1R = LEGAL_HACK(Src1);
1269 _pshufd(T2, Src0R, Mask1030); 1286 _pshufd(T2, Src0R, Mask1030);
1270 _pshufd(T3, Src1R, Mask1030); 1287 _pshufd(T3, Src1R, Mask1030);
1271 _pmuludq(T1, Src1R); 1288 _pmuludq(T1, Src1R);
1272 _pmuludq(T2, T3); 1289 _pmuludq(T2, T3);
1273 _shufps(T1, T2, Ctx->getConstantInt(IceType_i8, Mask0202)); 1290 _shufps(T1, T2, Ctx->getConstantInt(IceType_i8, Mask0202));
1274 _pshufd(T4, T1, Ctx->getConstantInt(IceType_i8, Mask0213)); 1291 _pshufd(T4, T1, Ctx->getConstantInt(IceType_i8, Mask0213));
1275 _movp(Dest, T4); 1292 _movp(Dest, T4);
1276 } else if (Dest->getType() == IceType_v8i16) {
1277 Variable *T = makeReg(IceType_v8i16);
1278 _movp(T, Src0);
1279 _pmullw(T, legalizeToVar(Src1));
1280 _movp(Dest, T);
1281 } else { 1293 } else {
1282 assert(Dest->getType() == IceType_v16i8); 1294 assert(Dest->getType() == IceType_v16i8);
1283 // Sz_mul_v16i8 1295 // Sz_mul_v16i8
1284 const IceString Helper = "Sz_mul_v16i8"; 1296 const IceString Helper = "Sz_mul_v16i8";
1285 const SizeT MaxSrcs = 2; 1297 const SizeT MaxSrcs = 2;
1286 InstCall *Call = makeHelperCall(Helper, Dest, MaxSrcs); 1298 InstCall *Call = makeHelperCall(Helper, Dest, MaxSrcs);
1287 Call->addArg(Src0); 1299 Call->addArg(Src0);
1288 Call->addArg(Src1); 1300 Call->addArg(Src1);
1289 lowerCall(Call); 1301 lowerCall(Call);
1290 } 1302 }
(...skipping 857 matching lines...) Expand 10 before | Expand all | Expand 10 after
2148 // Only constant indices are allowed in PNaCl IR. 2160 // Only constant indices are allowed in PNaCl IR.
2149 assert(ElementIndex); 2161 assert(ElementIndex);
2150 2162
2151 unsigned Index = ElementIndex->getValue(); 2163 unsigned Index = ElementIndex->getValue();
2152 Type Ty = SourceVectOperand->getType(); 2164 Type Ty = SourceVectOperand->getType();
2153 Type ElementTy = typeElementType(Ty); 2165 Type ElementTy = typeElementType(Ty);
2154 Type InVectorElementTy = getInVectorElementType(Ty); 2166 Type InVectorElementTy = getInVectorElementType(Ty);
2155 Variable *ExtractedElement = makeReg(InVectorElementTy); 2167 Variable *ExtractedElement = makeReg(InVectorElementTy);
2156 2168
2157 // TODO(wala): Determine the best lowering sequences for each type. 2169 // TODO(wala): Determine the best lowering sequences for each type.
2158 if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) { 2170 if (Ty == IceType_v8i16 || Ty == IceType_v8i1 ||
2159 // Lower extractelement operations where the element is 32 bits 2171 (InstructionSet >= SSE4_1 && Ty != IceType_v4f32)) {
2160 // wide with pshufd. 2172 // Use pextrb, pextrw, or pextrd.
2161 // TODO(wala): SSE4.1 has extractps and pextrd 2173 Constant *Mask = Ctx->getConstantInt(IceType_i8, Index);
2174 Variable *SourceVectR = legalizeToVar(SourceVectOperand);
2175 _pextr(ExtractedElement, SourceVectR, Mask);
2176 } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
2177 // Use pshufd and movd/movss.
2162 // 2178 //
2163 // ALIGNHACK: Force vector operands to registers in instructions that 2179 // ALIGNHACK: Force vector operands to registers in instructions that
2164 // require aligned memory operands until support for stack alignment 2180 // require aligned memory operands until support for stack alignment
2165 // is implemented. 2181 // is implemented.
2166 #define ALIGN_HACK(Vect) legalizeToVar((Vect)) 2182 #define ALIGN_HACK(Vect) legalizeToVar((Vect))
2167 Variable *T = NULL; 2183 Variable *T = NULL;
2168 if (Index) { 2184 if (Index) {
2169 // The shuffle only needs to occur if the element to be extracted 2185 // The shuffle only needs to occur if the element to be extracted
2170 // is not at the lowest index. 2186 // is not at the lowest index.
2171 Constant *Mask = Ctx->getConstantInt(IceType_i8, Index); 2187 Constant *Mask = Ctx->getConstantInt(IceType_i8, Index);
2172 T = makeReg(Ty); 2188 T = makeReg(Ty);
2173 _pshufd(T, ALIGN_HACK(SourceVectOperand), Mask); 2189 _pshufd(T, ALIGN_HACK(SourceVectOperand), Mask);
2174 } else { 2190 } else {
2175 T = legalizeToVar(SourceVectOperand); 2191 T = legalizeToVar(SourceVectOperand);
2176 } 2192 }
2177 2193
2178 if (InVectorElementTy == IceType_i32) { 2194 if (InVectorElementTy == IceType_i32) {
2179 _movd(ExtractedElement, T); 2195 _movd(ExtractedElement, T);
2180 } else { // InVectorElementTy == IceType_f32 2196 } else { // InVectorElementTy == IceType_f32
2181 // TODO(wala): _movss is only used here because _mov does not 2197 // TODO(wala): _movss is only used here because _mov does not
2182 // allow a vector source and a scalar destination. _mov should be 2198 // allow a vector source and a scalar destination. _mov should be
2183 // able to be used here. 2199 // able to be used here.
2184 // _movss is a binary instruction, so the FakeDef is needed to 2200 // _movss is a binary instruction, so the FakeDef is needed to
2185 // keep the live range analysis consistent. 2201 // keep the live range analysis consistent.
2186 Context.insert(InstFakeDef::create(Func, ExtractedElement)); 2202 Context.insert(InstFakeDef::create(Func, ExtractedElement));
2187 _movss(ExtractedElement, T); 2203 _movss(ExtractedElement, T);
2188 } 2204 }
2189 #undef ALIGN_HACK 2205 #undef ALIGN_HACK
2190 } else if (Ty == IceType_v8i16 || Ty == IceType_v8i1) {
2191 Constant *Mask = Ctx->getConstantInt(IceType_i8, Index);
2192 _pextrw(ExtractedElement, legalizeToVar(SourceVectOperand), Mask);
2193 } else { 2206 } else {
2194 assert(Ty == IceType_v16i8 || Ty == IceType_v16i1); 2207 assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
2195 // Spill the value to a stack slot and do the extraction in memory. 2208 // Spill the value to a stack slot and do the extraction in memory.
2196 // TODO(wala): SSE4.1 has pextrb.
2197 // 2209 //
2198 // TODO(wala): use legalize(SourceVectOperand, Legal_Mem) when 2210 // TODO(wala): use legalize(SourceVectOperand, Legal_Mem) when
2199 // support for legalizing to mem is implemented. 2211 // support for legalizing to mem is implemented.
2200 Variable *Slot = Func->makeVariable(Ty, Context.getNode()); 2212 Variable *Slot = Func->makeVariable(Ty, Context.getNode());
2201 Slot->setWeight(RegWeight::Zero); 2213 Slot->setWeight(RegWeight::Zero);
2202 _movp(Slot, legalizeToVar(SourceVectOperand)); 2214 _movp(Slot, legalizeToVar(SourceVectOperand));
2203 2215
2204 // Compute the location of the element in memory. 2216 // Compute the location of the element in memory.
2205 unsigned Offset = Index * typeWidthInBytes(InVectorElementTy); 2217 unsigned Offset = Index * typeWidthInBytes(InVectorElementTy);
2206 OperandX8632Mem *Loc = 2218 OperandX8632Mem *Loc =
(...skipping 325 matching lines...) Expand 10 before | Expand all | Expand 10 after
2532 // Expand the element to the appropriate size for it to be inserted 2544 // Expand the element to the appropriate size for it to be inserted
2533 // in the vector. 2545 // in the vector.
2534 Variable *Expanded = 2546 Variable *Expanded =
2535 Func->makeVariable(InVectorElementTy, Context.getNode()); 2547 Func->makeVariable(InVectorElementTy, Context.getNode());
2536 InstCast *Cast = 2548 InstCast *Cast =
2537 InstCast::create(Func, InstCast::Zext, Expanded, ElementToInsert); 2549 InstCast::create(Func, InstCast::Zext, Expanded, ElementToInsert);
2538 lowerCast(Cast); 2550 lowerCast(Cast);
2539 ElementToInsert = Expanded; 2551 ElementToInsert = Expanded;
2540 } 2552 }
2541 2553
2542 if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) { 2554 if (Ty == IceType_v8i16 || Ty == IceType_v8i1 || InstructionSet >= SSE4_1) {
2543 // Lower insertelement with 32-bit wide elements using shufps or 2555 // Use insertps, pinsrb, pinsrw, or pinsrd.
2544 // movss. 2556 Operand *Element = legalize(ElementToInsert, Legal_Mem | Legal_Reg);
2545 // TODO(wala): SSE4.1 has pinsrd and insertps. 2557 Variable *T = makeReg(Ty);
2558 _movp(T, SourceVectOperand);
2559 if (Ty == IceType_v4f32)
2560 _insertps(T, Element, Ctx->getConstantInt(IceType_i8, Index << 4));
2561 else
2562 _pinsr(T, Element, Ctx->getConstantInt(IceType_i8, Index));
2563 _movp(Inst->getDest(), T);
2564 } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
2565 // Use shufps or movss.
2546 Variable *Element = NULL; 2566 Variable *Element = NULL;
2547 if (InVectorElementTy == IceType_f32) { 2567 if (InVectorElementTy == IceType_f32) {
2548 // Element will be in an XMM register since it is floating point. 2568 // Element will be in an XMM register since it is floating point.
2549 Element = legalizeToVar(ElementToInsert); 2569 Element = legalizeToVar(ElementToInsert);
2550 } else { 2570 } else {
2551 // Copy an integer to an XMM register. 2571 // Copy an integer to an XMM register.
2552 Operand *T = legalize(ElementToInsert, Legal_Reg | Legal_Mem); 2572 Operand *T = legalize(ElementToInsert, Legal_Reg | Legal_Mem);
2553 Element = makeReg(Ty); 2573 Element = makeReg(Ty);
2554 _movd(Element, T); 2574 _movd(Element, T);
2555 } 2575 }
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after
2600 _shufps(Element, SourceVectOperand, Mask2Constant); 2620 _shufps(Element, SourceVectOperand, Mask2Constant);
2601 _movp(Inst->getDest(), Element); 2621 _movp(Inst->getDest(), Element);
2602 } else { 2622 } else {
2603 Variable *T = makeReg(Ty); 2623 Variable *T = makeReg(Ty);
2604 _movp(T, SourceVectOperand); 2624 _movp(T, SourceVectOperand);
2605 _shufps(Element, T, Mask1Constant); 2625 _shufps(Element, T, Mask1Constant);
2606 _shufps(T, Element, Mask2Constant); 2626 _shufps(T, Element, Mask2Constant);
2607 _movp(Inst->getDest(), T); 2627 _movp(Inst->getDest(), T);
2608 } 2628 }
2609 #undef ALIGN_HACK 2629 #undef ALIGN_HACK
2610 } else if (Ty == IceType_v8i16 || Ty == IceType_v8i1) {
2611 Operand *Element = legalize(ElementToInsert, Legal_Mem | Legal_Reg);
2612 Variable *T = makeReg(Ty);
2613 _movp(T, SourceVectOperand);
2614 _pinsrw(T, Element, Ctx->getConstantInt(IceType_i8, Index));
2615 _movp(Inst->getDest(), T);
2616 } else { 2630 } else {
2617 assert(Ty == IceType_v16i8 || Ty == IceType_v16i1); 2631 assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
2618 // Spill the value to a stack slot and perform the insertion in 2632 // Spill the value to a stack slot and perform the insertion in
2619 // memory. 2633 // memory.
2620 // TODO(wala): SSE4.1 has pinsrb.
2621 // 2634 //
2622 // TODO(wala): use legalize(SourceVectOperand, Legal_Mem) when 2635 // TODO(wala): use legalize(SourceVectOperand, Legal_Mem) when
2623 // support for legalizing to mem is implemented. 2636 // support for legalizing to mem is implemented.
2624 Variable *Slot = Func->makeVariable(Ty, Context.getNode()); 2637 Variable *Slot = Func->makeVariable(Ty, Context.getNode());
2625 Slot->setWeight(RegWeight::Zero); 2638 Slot->setWeight(RegWeight::Zero);
2626 _movp(Slot, legalizeToVar(SourceVectOperand)); 2639 _movp(Slot, legalizeToVar(SourceVectOperand));
2627 2640
2628 // Compute the location of the position to insert in memory. 2641 // Compute the location of the position to insert in memory.
2629 unsigned Offset = Index * typeWidthInBytes(InVectorElementTy); 2642 unsigned Offset = Index * typeWidthInBytes(InVectorElementTy);
2630 OperandX8632Mem *Loc = 2643 OperandX8632Mem *Loc =
(...skipping 840 matching lines...) Expand 10 before | Expand all | Expand 10 after
3471 Context.insert(InstFakeUse::create(Func, esp)); 3484 Context.insert(InstFakeUse::create(Func, esp));
3472 } 3485 }
3473 3486
3474 void TargetX8632::lowerSelect(const InstSelect *Inst) { 3487 void TargetX8632::lowerSelect(const InstSelect *Inst) {
3475 Variable *Dest = Inst->getDest(); 3488 Variable *Dest = Inst->getDest();
3476 Operand *SrcT = Inst->getTrueOperand(); 3489 Operand *SrcT = Inst->getTrueOperand();
3477 Operand *SrcF = Inst->getFalseOperand(); 3490 Operand *SrcF = Inst->getFalseOperand();
3478 Operand *Condition = Inst->getCondition(); 3491 Operand *Condition = Inst->getCondition();
3479 3492
3480 if (isVectorType(Dest->getType())) { 3493 if (isVectorType(Dest->getType())) {
3481 // a=d?b:c ==> d=sext(d); a=(b&d)|(c&~d)
3482 // TODO(wala): SSE4.1 has blendvps and pblendvb. SSE4.1 also has
3483 // blendps and pblendw for constant condition operands.
3484 Type SrcTy = SrcT->getType(); 3494 Type SrcTy = SrcT->getType();
3485 Variable *T = makeReg(SrcTy); 3495 Variable *T = makeReg(SrcTy);
3496 // ALIGNHACK: Until stack alignment support is implemented, vector
3497 // instructions need to have vector operands in registers. Once
3498 // there is support for stack alignment, LEGAL_HACK can be removed.
3499 #define LEGAL_HACK(Vect) legalizeToVar((Vect))
3500 if (InstructionSet >= SSE4_1) {
3501 // TODO(wala): If the condition operand is a constant, use blendps
3502 // or pblendw.
3503 //
3504 // Use blendvps or pblendvb to implement select.
3505 if (SrcTy == IceType_v4i1 || SrcTy == IceType_v4i32 ||
3506 SrcTy == IceType_v4f32) {
3507 Variable *xmm0 = makeReg(IceType_v4i32, Reg_xmm0);
3508 _movp(xmm0, Condition);
3509 _psll(xmm0, Ctx->getConstantInt(IceType_i8, 31));
3510 _movp(T, SrcF);
3511 _blendvps(T, LEGAL_HACK(SrcT), xmm0);
3512 _movp(Dest, T);
3513 } else {
3514 assert(typeNumElements(SrcTy) == 8 || typeNumElements(SrcTy) == 16);
3515 Type SignExtTy = Condition->getType() == IceType_v8i1 ? IceType_v8i16
3516 : IceType_v16i8;
3517 Variable *xmm0 = makeReg(SignExtTy, Reg_xmm0);
3518 lowerCast(InstCast::create(Func, InstCast::Sext, xmm0, Condition));
3519 _movp(T, SrcF);
3520 _pblendvb(T, LEGAL_HACK(SrcT), xmm0);
3521 _movp(Dest, T);
3522 }
3523 return;
3524 }
3525 // Lower select without SSE4.1:
3526 // a=d?b:c ==>
3527 // if elementtype(d) != i1:
3528 // d=sext(d);
3529 // a=(b&d)|(c&~d);
3486 Variable *T2 = makeReg(SrcTy); 3530 Variable *T2 = makeReg(SrcTy);
3487 // Sign extend the condition operand if applicable. 3531 // Sign extend the condition operand if applicable.
3488 if (SrcTy == IceType_v4f32) { 3532 if (SrcTy == IceType_v4f32) {
3489 // The sext operation takes only integer arguments. 3533 // The sext operation takes only integer arguments.
3490 Variable *T3 = Func->makeVariable(IceType_v4i32, Context.getNode()); 3534 Variable *T3 = Func->makeVariable(IceType_v4i32, Context.getNode());
3491 lowerCast(InstCast::create(Func, InstCast::Sext, T3, Condition)); 3535 lowerCast(InstCast::create(Func, InstCast::Sext, T3, Condition));
3492 _movp(T, T3); 3536 _movp(T, T3);
3493 } else if (typeElementType(SrcTy) != IceType_i1) { 3537 } else if (typeElementType(SrcTy) != IceType_i1) {
3494 lowerCast(InstCast::create(Func, InstCast::Sext, T, Condition)); 3538 lowerCast(InstCast::create(Func, InstCast::Sext, T, Condition));
3495 } else { 3539 } else {
3496 _movp(T, Condition); 3540 _movp(T, Condition);
3497 } 3541 }
3498 // ALIGNHACK: Until stack alignment support is implemented, the
3499 // bitwise vector instructions need to have both operands in
3500 // registers. Once there is support for stack alignment, LEGAL_HACK
3501 // can be removed.
3502 #define LEGAL_HACK(Vect) legalizeToVar((Vect))
3503 _movp(T2, T); 3542 _movp(T2, T);
3504 _pand(T, LEGAL_HACK(SrcT)); 3543 _pand(T, LEGAL_HACK(SrcT));
3505 _pandn(T2, LEGAL_HACK(SrcF)); 3544 _pandn(T2, LEGAL_HACK(SrcF));
3506 _por(T, T2); 3545 _por(T, T2);
3507 _movp(Dest, T); 3546 _movp(Dest, T);
3508 #undef LEGAL_HACK 3547 #undef LEGAL_HACK
3509 3548
3510 return; 3549 return;
3511 } 3550 }
3512 3551
(...skipping 504 matching lines...) Expand 10 before | Expand all | Expand 10 after
4017 for (SizeT i = 0; i < Size; ++i) { 4056 for (SizeT i = 0; i < Size; ++i) {
4018 Str << "\t.byte\t" << (((unsigned)Data[i]) & 0xff) << "\n"; 4057 Str << "\t.byte\t" << (((unsigned)Data[i]) & 0xff) << "\n";
4019 } 4058 }
4020 Str << "\t.size\t" << MangledName << ", " << Size << "\n"; 4059 Str << "\t.size\t" << MangledName << ", " << Size << "\n";
4021 } 4060 }
4022 Str << "\t" << (IsInternal ? ".local" : ".global") << "\t" << MangledName 4061 Str << "\t" << (IsInternal ? ".local" : ".global") << "\t" << MangledName
4023 << "\n"; 4062 << "\n";
4024 } 4063 }
4025 4064
4026 } // end of namespace Ice 4065 } // end of namespace Ice
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698