OLD | NEW |
1 //===- subzero/src/IceTargetLoweringX8632.cpp - x86-32 lowering -----------===// | 1 //===- subzero/src/IceTargetLoweringX8632.cpp - x86-32 lowering -----------===// |
2 // | 2 // |
3 // The Subzero Code Generator | 3 // The Subzero Code Generator |
4 // | 4 // |
5 // This file is distributed under the University of Illinois Open Source | 5 // This file is distributed under the University of Illinois Open Source |
6 // License. See LICENSE.TXT for details. | 6 // License. See LICENSE.TXT for details. |
7 // | 7 // |
8 //===----------------------------------------------------------------------===// | 8 //===----------------------------------------------------------------------===// |
9 // | 9 // |
10 // This file implements the TargetLoweringX8632 class, which | 10 // This file implements the TargetLoweringX8632 class, which |
11 // consists almost entirely of the lowering sequence for each | 11 // consists almost entirely of the lowering sequence for each |
12 // high-level instruction. It also implements | 12 // high-level instruction. It also implements |
13 // TargetX8632Fast::postLower() which does the simplest possible | 13 // TargetX8632Fast::postLower() which does the simplest possible |
14 // register allocation for the "fast" target. | 14 // register allocation for the "fast" target. |
15 // | 15 // |
16 //===----------------------------------------------------------------------===// | 16 //===----------------------------------------------------------------------===// |
17 | 17 |
18 #include "IceDefs.h" | 18 #include "IceDefs.h" |
19 #include "IceCfg.h" | 19 #include "IceCfg.h" |
20 #include "IceCfgNode.h" | 20 #include "IceCfgNode.h" |
21 #include "IceInstX8632.h" | 21 #include "IceInstX8632.h" |
22 #include "IceOperand.h" | 22 #include "IceOperand.h" |
23 #include "IceTargetLoweringX8632.def" | 23 #include "IceTargetLoweringX8632.def" |
24 #include "IceTargetLoweringX8632.h" | 24 #include "IceTargetLoweringX8632.h" |
| 25 #include "llvm/Support/CommandLine.h" |
25 | 26 |
26 namespace Ice { | 27 namespace Ice { |
27 | 28 |
28 namespace { | 29 namespace { |
29 | 30 |
30 // The following table summarizes the logic for lowering the fcmp | 31 // The following table summarizes the logic for lowering the fcmp |
31 // instruction. There is one table entry for each of the 16 conditions. | 32 // instruction. There is one table entry for each of the 16 conditions. |
32 // | 33 // |
33 // The first four columns describe the case when the operands are | 34 // The first four columns describe the case when the operands are |
34 // floating point scalar values. A comment in lowerFcmp() describes the | 35 // floating point scalar values. A comment in lowerFcmp() describes the |
(...skipping 81 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
116 size_t Index = static_cast<size_t>(Ty); | 117 size_t Index = static_cast<size_t>(Ty); |
117 assert(Index < TableTypeX8632AttributesSize); | 118 assert(Index < TableTypeX8632AttributesSize); |
118 return TableTypeX8632Attributes[Ty].InVectorElementType; | 119 return TableTypeX8632Attributes[Ty].InVectorElementType; |
119 } | 120 } |
120 | 121 |
121 // The maximum number of arguments to pass in XMM registers | 122 // The maximum number of arguments to pass in XMM registers |
122 const unsigned X86_MAX_XMM_ARGS = 4; | 123 const unsigned X86_MAX_XMM_ARGS = 4; |
123 // The number of bits in a byte | 124 // The number of bits in a byte |
124 const unsigned X86_CHAR_BIT = 8; | 125 const unsigned X86_CHAR_BIT = 8; |
125 | 126 |
| 127 // Instruction set options |
| 128 namespace cl = ::llvm::cl; |
| 129 cl::opt<TargetX8632::X86InstructionSet> CLInstructionSet( |
| 130 "mattr", cl::desc("X86 target attributes"), |
| 131 cl::init(TargetX8632::SSE2), |
| 132 cl::values( |
| 133 clEnumValN(TargetX8632::SSE2, "sse2", |
| 134 "Enable SSE2 instructions (default)"), |
| 135 clEnumValN(TargetX8632::SSE4_1, "sse4.1", |
| 136 "Enable SSE 4.1 instructions"), clEnumValEnd)); |
| 137 |
126 // Return a string representation of the type that is suitable for use | 138 // Return a string representation of the type that is suitable for use |
127 // in an identifier. | 139 // in an identifier. |
128 IceString typeIdentString(const Type Ty) { | 140 IceString typeIdentString(const Type Ty) { |
129 IceString Str; | 141 IceString Str; |
130 llvm::raw_string_ostream BaseOS(Str); | 142 llvm::raw_string_ostream BaseOS(Str); |
131 if (isVectorType(Ty)) { | 143 if (isVectorType(Ty)) { |
132 BaseOS << "v" << typeNumElements(Ty) << typeElementType(Ty); | 144 BaseOS << "v" << typeNumElements(Ty) << typeElementType(Ty); |
133 } else { | 145 } else { |
134 BaseOS << Ty; | 146 BaseOS << Ty; |
135 } | 147 } |
(...skipping 91 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
227 #define X(tag, size, align, elts, elty, str) \ | 239 #define X(tag, size, align, elts, elty, str) \ |
228 STATIC_ASSERT(_table1_##tag == _table2_##tag); | 240 STATIC_ASSERT(_table1_##tag == _table2_##tag); |
229 ICETYPE_TABLE; | 241 ICETYPE_TABLE; |
230 #undef X | 242 #undef X |
231 } | 243 } |
232 } | 244 } |
233 | 245 |
234 } // end of anonymous namespace | 246 } // end of anonymous namespace |
235 | 247 |
236 TargetX8632::TargetX8632(Cfg *Func) | 248 TargetX8632::TargetX8632(Cfg *Func) |
237 : TargetLowering(Func), IsEbpBasedFrame(false), FrameSizeLocals(0), | 249 : TargetLowering(Func), InstructionSet(CLInstructionSet), |
238 LocalsSizeBytes(0), NextLabelNumber(0), ComputedLiveRanges(false), | 250 IsEbpBasedFrame(false), FrameSizeLocals(0), LocalsSizeBytes(0), |
| 251 NextLabelNumber(0), ComputedLiveRanges(false), |
239 PhysicalRegisters(VarList(Reg_NUM)) { | 252 PhysicalRegisters(VarList(Reg_NUM)) { |
240 // TODO: Don't initialize IntegerRegisters and friends every time. | 253 // TODO: Don't initialize IntegerRegisters and friends every time. |
241 // Instead, initialize in some sort of static initializer for the | 254 // Instead, initialize in some sort of static initializer for the |
242 // class. | 255 // class. |
243 llvm::SmallBitVector IntegerRegisters(Reg_NUM); | 256 llvm::SmallBitVector IntegerRegisters(Reg_NUM); |
244 llvm::SmallBitVector IntegerRegistersI8(Reg_NUM); | 257 llvm::SmallBitVector IntegerRegistersI8(Reg_NUM); |
245 llvm::SmallBitVector FloatRegisters(Reg_NUM); | 258 llvm::SmallBitVector FloatRegisters(Reg_NUM); |
246 llvm::SmallBitVector VectorRegisters(Reg_NUM); | 259 llvm::SmallBitVector VectorRegisters(Reg_NUM); |
247 llvm::SmallBitVector InvalidRegisters(Reg_NUM); | 260 llvm::SmallBitVector InvalidRegisters(Reg_NUM); |
248 ScratchRegs.resize(Reg_NUM); | 261 ScratchRegs.resize(Reg_NUM); |
(...skipping 972 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1221 _pxor(T, LEGAL_HACK(Src1)); | 1234 _pxor(T, LEGAL_HACK(Src1)); |
1222 _movp(Dest, T); | 1235 _movp(Dest, T); |
1223 } break; | 1236 } break; |
1224 case InstArithmetic::Sub: { | 1237 case InstArithmetic::Sub: { |
1225 Variable *T = makeReg(Dest->getType()); | 1238 Variable *T = makeReg(Dest->getType()); |
1226 _movp(T, Src0); | 1239 _movp(T, Src0); |
1227 _psub(T, LEGAL_HACK(Src1)); | 1240 _psub(T, LEGAL_HACK(Src1)); |
1228 _movp(Dest, T); | 1241 _movp(Dest, T); |
1229 } break; | 1242 } break; |
1230 case InstArithmetic::Mul: { | 1243 case InstArithmetic::Mul: { |
1231 if (Dest->getType() == IceType_v4i32) { | 1244 if (Dest->getType() == IceType_v8i16 || |
| 1245 (InstructionSet >= SSE4_1 && Dest->getType() == IceType_v4i32)) { |
| 1246 Variable *T = makeReg(Dest->getType()); |
| 1247 _movp(T, Src0); |
| 1248 _pmull(T, legalizeToVar(Src1)); |
| 1249 _movp(Dest, T); |
| 1250 } else if (Dest->getType() == IceType_v4i32) { |
1232 // Lowering sequence: | 1251 // Lowering sequence: |
1233 // Note: The mask arguments have index 0 on the left. | 1252 // Note: The mask arguments have index 0 on the left. |
1234 // | 1253 // |
1235 // movups T1, Src0 | 1254 // movups T1, Src0 |
1236 // pshufd T2, Src0, {1,0,3,0} | 1255 // pshufd T2, Src0, {1,0,3,0} |
1237 // pshufd T3, Src1, {1,0,3,0} | 1256 // pshufd T3, Src1, {1,0,3,0} |
1238 // # T1 = {Src0[0] * Src1[0], Src0[2] * Src1[2]} | 1257 // # T1 = {Src0[0] * Src1[0], Src0[2] * Src1[2]} |
1239 // pmuludq T1, Src1 | 1258 // pmuludq T1, Src1 |
1240 // # T2 = {Src0[1] * Src1[1], Src0[3] * Src1[3]} | 1259 // # T2 = {Src0[1] * Src1[1], Src0[3] * Src1[3]} |
1241 // pmuludq T2, T3 | 1260 // pmuludq T2, T3 |
1242 // # T1 = {lo(T1[0]), lo(T1[2]), lo(T2[0]), lo(T2[2])} | 1261 // # T1 = {lo(T1[0]), lo(T1[2]), lo(T2[0]), lo(T2[2])} |
1243 // shufps T1, T2, {0,2,0,2} | 1262 // shufps T1, T2, {0,2,0,2} |
1244 // pshufd T4, T1, {0,2,1,3} | 1263 // pshufd T4, T1, {0,2,1,3} |
1245 // movups Dest, T4 | 1264 // movups Dest, T4 |
1246 // | |
1247 // TODO(wala): SSE4.1 has pmulld. | |
1248 | 1265 |
1249 // Mask that directs pshufd to create a vector with entries | 1266 // Mask that directs pshufd to create a vector with entries |
1250 // Src[1, 0, 3, 0] | 1267 // Src[1, 0, 3, 0] |
1251 const unsigned Constant1030 = 0x31; | 1268 const unsigned Constant1030 = 0x31; |
1252 Constant *Mask1030 = Ctx->getConstantInt(IceType_i8, Constant1030); | 1269 Constant *Mask1030 = Ctx->getConstantInt(IceType_i8, Constant1030); |
1253 // Mask that directs shufps to create a vector with entries | 1270 // Mask that directs shufps to create a vector with entries |
1254 // Dest[0, 2], Src[0, 2] | 1271 // Dest[0, 2], Src[0, 2] |
1255 const unsigned Mask0202 = 0x88; | 1272 const unsigned Mask0202 = 0x88; |
1256 // Mask that directs pshufd to create a vector with entries | 1273 // Mask that directs pshufd to create a vector with entries |
1257 // Src[0, 2, 1, 3] | 1274 // Src[0, 2, 1, 3] |
1258 const unsigned Mask0213 = 0xd8; | 1275 const unsigned Mask0213 = 0xd8; |
1259 Variable *T1 = makeReg(IceType_v4i32); | 1276 Variable *T1 = makeReg(IceType_v4i32); |
1260 Variable *T2 = makeReg(IceType_v4i32); | 1277 Variable *T2 = makeReg(IceType_v4i32); |
1261 Variable *T3 = makeReg(IceType_v4i32); | 1278 Variable *T3 = makeReg(IceType_v4i32); |
1262 Variable *T4 = makeReg(IceType_v4i32); | 1279 Variable *T4 = makeReg(IceType_v4i32); |
1263 _movp(T1, Src0); | 1280 _movp(T1, Src0); |
1264 // TODO(wala): ALIGNHACK: Replace Src0R with Src0 and Src1R | 1281 // TODO(wala): ALIGNHACK: Replace Src0R with Src0 and Src1R |
1265 // with Src1 after stack operand alignment support is | 1282 // with Src1 after stack operand alignment support is |
1266 // implemented. | 1283 // implemented. |
1267 Variable *Src0R = LEGAL_HACK(Src0); | 1284 Variable *Src0R = LEGAL_HACK(Src0); |
1268 Variable *Src1R = LEGAL_HACK(Src1); | 1285 Variable *Src1R = LEGAL_HACK(Src1); |
1269 _pshufd(T2, Src0R, Mask1030); | 1286 _pshufd(T2, Src0R, Mask1030); |
1270 _pshufd(T3, Src1R, Mask1030); | 1287 _pshufd(T3, Src1R, Mask1030); |
1271 _pmuludq(T1, Src1R); | 1288 _pmuludq(T1, Src1R); |
1272 _pmuludq(T2, T3); | 1289 _pmuludq(T2, T3); |
1273 _shufps(T1, T2, Ctx->getConstantInt(IceType_i8, Mask0202)); | 1290 _shufps(T1, T2, Ctx->getConstantInt(IceType_i8, Mask0202)); |
1274 _pshufd(T4, T1, Ctx->getConstantInt(IceType_i8, Mask0213)); | 1291 _pshufd(T4, T1, Ctx->getConstantInt(IceType_i8, Mask0213)); |
1275 _movp(Dest, T4); | 1292 _movp(Dest, T4); |
1276 } else if (Dest->getType() == IceType_v8i16) { | |
1277 Variable *T = makeReg(IceType_v8i16); | |
1278 _movp(T, Src0); | |
1279 _pmullw(T, legalizeToVar(Src1)); | |
1280 _movp(Dest, T); | |
1281 } else { | 1293 } else { |
1282 assert(Dest->getType() == IceType_v16i8); | 1294 assert(Dest->getType() == IceType_v16i8); |
1283 // Sz_mul_v16i8 | 1295 // Sz_mul_v16i8 |
1284 const IceString Helper = "Sz_mul_v16i8"; | 1296 const IceString Helper = "Sz_mul_v16i8"; |
1285 const SizeT MaxSrcs = 2; | 1297 const SizeT MaxSrcs = 2; |
1286 InstCall *Call = makeHelperCall(Helper, Dest, MaxSrcs); | 1298 InstCall *Call = makeHelperCall(Helper, Dest, MaxSrcs); |
1287 Call->addArg(Src0); | 1299 Call->addArg(Src0); |
1288 Call->addArg(Src1); | 1300 Call->addArg(Src1); |
1289 lowerCall(Call); | 1301 lowerCall(Call); |
1290 } | 1302 } |
(...skipping 857 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2148 // Only constant indices are allowed in PNaCl IR. | 2160 // Only constant indices are allowed in PNaCl IR. |
2149 assert(ElementIndex); | 2161 assert(ElementIndex); |
2150 | 2162 |
2151 unsigned Index = ElementIndex->getValue(); | 2163 unsigned Index = ElementIndex->getValue(); |
2152 Type Ty = SourceVectOperand->getType(); | 2164 Type Ty = SourceVectOperand->getType(); |
2153 Type ElementTy = typeElementType(Ty); | 2165 Type ElementTy = typeElementType(Ty); |
2154 Type InVectorElementTy = getInVectorElementType(Ty); | 2166 Type InVectorElementTy = getInVectorElementType(Ty); |
2155 Variable *ExtractedElement = makeReg(InVectorElementTy); | 2167 Variable *ExtractedElement = makeReg(InVectorElementTy); |
2156 | 2168 |
2157 // TODO(wala): Determine the best lowering sequences for each type. | 2169 // TODO(wala): Determine the best lowering sequences for each type. |
2158 if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) { | 2170 if (Ty == IceType_v8i16 || Ty == IceType_v8i1 || |
2159 // Lower extractelement operations where the element is 32 bits | 2171 (InstructionSet >= SSE4_1 && Ty != IceType_v4f32)) { |
2160 // wide with pshufd. | 2172 // Use pextrb, pextrw, or pextrd. |
2161 // TODO(wala): SSE4.1 has extractps and pextrd | 2173 Constant *Mask = Ctx->getConstantInt(IceType_i8, Index); |
| 2174 Variable *SourceVectR = legalizeToVar(SourceVectOperand); |
| 2175 _pextr(ExtractedElement, SourceVectR, Mask); |
| 2176 } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) { |
| 2177 // Use pshufd and movd/movss. |
2162 // | 2178 // |
2163 // ALIGNHACK: Force vector operands to registers in instructions that | 2179 // ALIGNHACK: Force vector operands to registers in instructions that |
2164 // require aligned memory operands until support for stack alignment | 2180 // require aligned memory operands until support for stack alignment |
2165 // is implemented. | 2181 // is implemented. |
2166 #define ALIGN_HACK(Vect) legalizeToVar((Vect)) | 2182 #define ALIGN_HACK(Vect) legalizeToVar((Vect)) |
2167 Variable *T = NULL; | 2183 Variable *T = NULL; |
2168 if (Index) { | 2184 if (Index) { |
2169 // The shuffle only needs to occur if the element to be extracted | 2185 // The shuffle only needs to occur if the element to be extracted |
2170 // is not at the lowest index. | 2186 // is not at the lowest index. |
2171 Constant *Mask = Ctx->getConstantInt(IceType_i8, Index); | 2187 Constant *Mask = Ctx->getConstantInt(IceType_i8, Index); |
2172 T = makeReg(Ty); | 2188 T = makeReg(Ty); |
2173 _pshufd(T, ALIGN_HACK(SourceVectOperand), Mask); | 2189 _pshufd(T, ALIGN_HACK(SourceVectOperand), Mask); |
2174 } else { | 2190 } else { |
2175 T = legalizeToVar(SourceVectOperand); | 2191 T = legalizeToVar(SourceVectOperand); |
2176 } | 2192 } |
2177 | 2193 |
2178 if (InVectorElementTy == IceType_i32) { | 2194 if (InVectorElementTy == IceType_i32) { |
2179 _movd(ExtractedElement, T); | 2195 _movd(ExtractedElement, T); |
2180 } else { // Ty == IceType_f32 | 2196 } else { // Ty == IceType_f32 |
2181 // TODO(wala): _movss is only used here because _mov does not | 2197 // TODO(wala): _movss is only used here because _mov does not |
2182 // allow a vector source and a scalar destination. _mov should be | 2198 // allow a vector source and a scalar destination. _mov should be |
2183 // able to be used here. | 2199 // able to be used here. |
2184 // _movss is a binary instruction, so the FakeDef is needed to | 2200 // _movss is a binary instruction, so the FakeDef is needed to |
2185 // keep the live range analysis consistent. | 2201 // keep the live range analysis consistent. |
2186 Context.insert(InstFakeDef::create(Func, ExtractedElement)); | 2202 Context.insert(InstFakeDef::create(Func, ExtractedElement)); |
2187 _movss(ExtractedElement, T); | 2203 _movss(ExtractedElement, T); |
2188 } | 2204 } |
2189 #undef ALIGN_HACK | 2205 #undef ALIGN_HACK |
2190 } else if (Ty == IceType_v8i16 || Ty == IceType_v8i1) { | |
2191 Constant *Mask = Ctx->getConstantInt(IceType_i8, Index); | |
2192 _pextrw(ExtractedElement, legalizeToVar(SourceVectOperand), Mask); | |
2193 } else { | 2206 } else { |
2194 assert(Ty == IceType_v16i8 || Ty == IceType_v16i1); | 2207 assert(Ty == IceType_v16i8 || Ty == IceType_v16i1); |
2195 // Spill the value to a stack slot and do the extraction in memory. | 2208 // Spill the value to a stack slot and do the extraction in memory. |
2196 // TODO(wala): SSE4.1 has pextrb. | |
2197 // | 2209 // |
2198 // TODO(wala): use legalize(SourceVectOperand, Legal_Mem) when | 2210 // TODO(wala): use legalize(SourceVectOperand, Legal_Mem) when |
2199 // support for legalizing to mem is implemented. | 2211 // support for legalizing to mem is implemented. |
2200 Variable *Slot = Func->makeVariable(Ty, Context.getNode()); | 2212 Variable *Slot = Func->makeVariable(Ty, Context.getNode()); |
2201 Slot->setWeight(RegWeight::Zero); | 2213 Slot->setWeight(RegWeight::Zero); |
2202 _movp(Slot, legalizeToVar(SourceVectOperand)); | 2214 _movp(Slot, legalizeToVar(SourceVectOperand)); |
2203 | 2215 |
2204 // Compute the location of the element in memory. | 2216 // Compute the location of the element in memory. |
2205 unsigned Offset = Index * typeWidthInBytes(InVectorElementTy); | 2217 unsigned Offset = Index * typeWidthInBytes(InVectorElementTy); |
2206 OperandX8632Mem *Loc = | 2218 OperandX8632Mem *Loc = |
(...skipping 325 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2532 // Expand the element to the appropriate size for it to be inserted | 2544 // Expand the element to the appropriate size for it to be inserted |
2533 // in the vector. | 2545 // in the vector. |
2534 Variable *Expanded = | 2546 Variable *Expanded = |
2535 Func->makeVariable(InVectorElementTy, Context.getNode()); | 2547 Func->makeVariable(InVectorElementTy, Context.getNode()); |
2536 InstCast *Cast = | 2548 InstCast *Cast = |
2537 InstCast::create(Func, InstCast::Zext, Expanded, ElementToInsert); | 2549 InstCast::create(Func, InstCast::Zext, Expanded, ElementToInsert); |
2538 lowerCast(Cast); | 2550 lowerCast(Cast); |
2539 ElementToInsert = Expanded; | 2551 ElementToInsert = Expanded; |
2540 } | 2552 } |
2541 | 2553 |
2542 if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) { | 2554 if (Ty == IceType_v8i16 || Ty == IceType_v8i1 || InstructionSet >= SSE4_1) { |
2543 // Lower insertelement with 32-bit wide elements using shufps or | 2555 // Use insertps, pinsrb, pinsrw, or pinsrd. |
2544 // movss. | 2556 Operand *Element = legalize(ElementToInsert, Legal_Mem | Legal_Reg); |
2545 // TODO(wala): SSE4.1 has pinsrd and insertps. | 2557 Variable *T = makeReg(Ty); |
| 2558 _movp(T, SourceVectOperand); |
| 2559 if (Ty == IceType_v4f32) |
| 2560 _insertps(T, Element, Ctx->getConstantInt(IceType_i8, Index << 4)); |
| 2561 else |
| 2562 _pinsr(T, Element, Ctx->getConstantInt(IceType_i8, Index)); |
| 2563 _movp(Inst->getDest(), T); |
| 2564 } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) { |
| 2565 // Use shufps or movss. |
2546 Variable *Element = NULL; | 2566 Variable *Element = NULL; |
2547 if (InVectorElementTy == IceType_f32) { | 2567 if (InVectorElementTy == IceType_f32) { |
2548 // Element will be in an XMM register since it is floating point. | 2568 // Element will be in an XMM register since it is floating point. |
2549 Element = legalizeToVar(ElementToInsert); | 2569 Element = legalizeToVar(ElementToInsert); |
2550 } else { | 2570 } else { |
2551 // Copy an integer to an XMM register. | 2571 // Copy an integer to an XMM register. |
2552 Operand *T = legalize(ElementToInsert, Legal_Reg | Legal_Mem); | 2572 Operand *T = legalize(ElementToInsert, Legal_Reg | Legal_Mem); |
2553 Element = makeReg(Ty); | 2573 Element = makeReg(Ty); |
2554 _movd(Element, T); | 2574 _movd(Element, T); |
2555 } | 2575 } |
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2600 _shufps(Element, SourceVectOperand, Mask2Constant); | 2620 _shufps(Element, SourceVectOperand, Mask2Constant); |
2601 _movp(Inst->getDest(), Element); | 2621 _movp(Inst->getDest(), Element); |
2602 } else { | 2622 } else { |
2603 Variable *T = makeReg(Ty); | 2623 Variable *T = makeReg(Ty); |
2604 _movp(T, SourceVectOperand); | 2624 _movp(T, SourceVectOperand); |
2605 _shufps(Element, T, Mask1Constant); | 2625 _shufps(Element, T, Mask1Constant); |
2606 _shufps(T, Element, Mask2Constant); | 2626 _shufps(T, Element, Mask2Constant); |
2607 _movp(Inst->getDest(), T); | 2627 _movp(Inst->getDest(), T); |
2608 } | 2628 } |
2609 #undef ALIGN_HACK | 2629 #undef ALIGN_HACK |
2610 } else if (Ty == IceType_v8i16 || Ty == IceType_v8i1) { | |
2611 Operand *Element = legalize(ElementToInsert, Legal_Mem | Legal_Reg); | |
2612 Variable *T = makeReg(Ty); | |
2613 _movp(T, SourceVectOperand); | |
2614 _pinsrw(T, Element, Ctx->getConstantInt(IceType_i8, Index)); | |
2615 _movp(Inst->getDest(), T); | |
2616 } else { | 2630 } else { |
2617 assert(Ty == IceType_v16i8 || Ty == IceType_v16i1); | 2631 assert(Ty == IceType_v16i8 || Ty == IceType_v16i1); |
2618 // Spill the value to a stack slot and perform the insertion in | 2632 // Spill the value to a stack slot and perform the insertion in |
2619 // memory. | 2633 // memory. |
2620 // TODO(wala): SSE4.1 has pinsrb. | |
2621 // | 2634 // |
2622 // TODO(wala): use legalize(SourceVectOperand, Legal_Mem) when | 2635 // TODO(wala): use legalize(SourceVectOperand, Legal_Mem) when |
2623 // support for legalizing to mem is implemented. | 2636 // support for legalizing to mem is implemented. |
2624 Variable *Slot = Func->makeVariable(Ty, Context.getNode()); | 2637 Variable *Slot = Func->makeVariable(Ty, Context.getNode()); |
2625 Slot->setWeight(RegWeight::Zero); | 2638 Slot->setWeight(RegWeight::Zero); |
2626 _movp(Slot, legalizeToVar(SourceVectOperand)); | 2639 _movp(Slot, legalizeToVar(SourceVectOperand)); |
2627 | 2640 |
2628 // Compute the location of the position to insert in memory. | 2641 // Compute the location of the position to insert in memory. |
2629 unsigned Offset = Index * typeWidthInBytes(InVectorElementTy); | 2642 unsigned Offset = Index * typeWidthInBytes(InVectorElementTy); |
2630 OperandX8632Mem *Loc = | 2643 OperandX8632Mem *Loc = |
(...skipping 840 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3471 Context.insert(InstFakeUse::create(Func, esp)); | 3484 Context.insert(InstFakeUse::create(Func, esp)); |
3472 } | 3485 } |
3473 | 3486 |
3474 void TargetX8632::lowerSelect(const InstSelect *Inst) { | 3487 void TargetX8632::lowerSelect(const InstSelect *Inst) { |
3475 Variable *Dest = Inst->getDest(); | 3488 Variable *Dest = Inst->getDest(); |
3476 Operand *SrcT = Inst->getTrueOperand(); | 3489 Operand *SrcT = Inst->getTrueOperand(); |
3477 Operand *SrcF = Inst->getFalseOperand(); | 3490 Operand *SrcF = Inst->getFalseOperand(); |
3478 Operand *Condition = Inst->getCondition(); | 3491 Operand *Condition = Inst->getCondition(); |
3479 | 3492 |
3480 if (isVectorType(Dest->getType())) { | 3493 if (isVectorType(Dest->getType())) { |
3481 // a=d?b:c ==> d=sext(d); a=(b&d)|(c&~d) | |
3482 // TODO(wala): SSE4.1 has blendvps and pblendvb. SSE4.1 also has | |
3483 // blendps and pblendw for constant condition operands. | |
3484 Type SrcTy = SrcT->getType(); | 3494 Type SrcTy = SrcT->getType(); |
3485 Variable *T = makeReg(SrcTy); | 3495 Variable *T = makeReg(SrcTy); |
| 3496 // ALIGNHACK: Until stack alignment support is implemented, vector |
| 3497 // instructions need to have vector operands in registers. Once |
| 3498 // there is support for stack alignment, LEGAL_HACK can be removed. |
| 3499 #define LEGAL_HACK(Vect) legalizeToVar((Vect)) |
| 3500 if (InstructionSet >= SSE4_1) { |
| 3501 // TODO(wala): If the condition operand is a constant, use blendps |
| 3502 // or pblendw. |
| 3503 // |
| 3504 // Use blendvps or pblendvb to implement select. |
| 3505 if (SrcTy == IceType_v4i1 || SrcTy == IceType_v4i32 || |
| 3506 SrcTy == IceType_v4f32) { |
| 3507 Variable *xmm0 = makeReg(IceType_v4i32, Reg_xmm0); |
| 3508 _movp(xmm0, Condition); |
| 3509 _psll(xmm0, Ctx->getConstantInt(IceType_i8, 31)); |
| 3510 _movp(T, SrcF); |
| 3511 _blendvps(T, LEGAL_HACK(SrcT), xmm0); |
| 3512 _movp(Dest, T); |
| 3513 } else { |
| 3514 assert(typeNumElements(SrcTy) == 8 || typeNumElements(SrcTy) == 16); |
| 3515 Type SignExtTy = Condition->getType() == IceType_v8i1 ? IceType_v8i16 |
| 3516 : IceType_v16i8; |
| 3517 Variable *xmm0 = makeReg(SignExtTy, Reg_xmm0); |
| 3518 lowerCast(InstCast::create(Func, InstCast::Sext, xmm0, Condition)); |
| 3519 _movp(T, SrcF); |
| 3520 _pblendvb(T, LEGAL_HACK(SrcT), xmm0); |
| 3521 _movp(Dest, T); |
| 3522 } |
| 3523 return; |
| 3524 } |
| 3525 // Lower select without SSE4.1: |
| 3526 // a=d?b:c ==> |
| 3527 // if elementtype(d) != i1: |
| 3528 // d=sext(d); |
| 3529 // a=(b&d)|(c&~d); |
3486 Variable *T2 = makeReg(SrcTy); | 3530 Variable *T2 = makeReg(SrcTy); |
3487 // Sign extend the condition operand if applicable. | 3531 // Sign extend the condition operand if applicable. |
3488 if (SrcTy == IceType_v4f32) { | 3532 if (SrcTy == IceType_v4f32) { |
3489 // The sext operation takes only integer arguments. | 3533 // The sext operation takes only integer arguments. |
3490 Variable *T3 = Func->makeVariable(IceType_v4i32, Context.getNode()); | 3534 Variable *T3 = Func->makeVariable(IceType_v4i32, Context.getNode()); |
3491 lowerCast(InstCast::create(Func, InstCast::Sext, T3, Condition)); | 3535 lowerCast(InstCast::create(Func, InstCast::Sext, T3, Condition)); |
3492 _movp(T, T3); | 3536 _movp(T, T3); |
3493 } else if (typeElementType(SrcTy) != IceType_i1) { | 3537 } else if (typeElementType(SrcTy) != IceType_i1) { |
3494 lowerCast(InstCast::create(Func, InstCast::Sext, T, Condition)); | 3538 lowerCast(InstCast::create(Func, InstCast::Sext, T, Condition)); |
3495 } else { | 3539 } else { |
3496 _movp(T, Condition); | 3540 _movp(T, Condition); |
3497 } | 3541 } |
3498 // ALIGNHACK: Until stack alignment support is implemented, the | |
3499 // bitwise vector instructions need to have both operands in | |
3500 // registers. Once there is support for stack alignment, LEGAL_HACK | |
3501 // can be removed. | |
3502 #define LEGAL_HACK(Vect) legalizeToVar((Vect)) | |
3503 _movp(T2, T); | 3542 _movp(T2, T); |
3504 _pand(T, LEGAL_HACK(SrcT)); | 3543 _pand(T, LEGAL_HACK(SrcT)); |
3505 _pandn(T2, LEGAL_HACK(SrcF)); | 3544 _pandn(T2, LEGAL_HACK(SrcF)); |
3506 _por(T, T2); | 3545 _por(T, T2); |
3507 _movp(Dest, T); | 3546 _movp(Dest, T); |
3508 #undef LEGAL_HACK | 3547 #undef LEGAL_HACK |
3509 | 3548 |
3510 return; | 3549 return; |
3511 } | 3550 } |
3512 | 3551 |
(...skipping 504 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4017 for (SizeT i = 0; i < Size; ++i) { | 4056 for (SizeT i = 0; i < Size; ++i) { |
4018 Str << "\t.byte\t" << (((unsigned)Data[i]) & 0xff) << "\n"; | 4057 Str << "\t.byte\t" << (((unsigned)Data[i]) & 0xff) << "\n"; |
4019 } | 4058 } |
4020 Str << "\t.size\t" << MangledName << ", " << Size << "\n"; | 4059 Str << "\t.size\t" << MangledName << ", " << Size << "\n"; |
4021 } | 4060 } |
4022 Str << "\t" << (IsInternal ? ".local" : ".global") << "\t" << MangledName | 4061 Str << "\t" << (IsInternal ? ".local" : ".global") << "\t" << MangledName |
4023 << "\n"; | 4062 << "\n"; |
4024 } | 4063 } |
4025 | 4064 |
4026 } // end of namespace Ice | 4065 } // end of namespace Ice |
OLD | NEW |