OLD | NEW |
1 //===- subzero/src/IceTargetLoweringX8632.cpp - x86-32 lowering -----------===// | 1 //===- subzero/src/IceTargetLoweringX8632.cpp - x86-32 lowering -----------===// |
2 // | 2 // |
3 // The Subzero Code Generator | 3 // The Subzero Code Generator |
4 // | 4 // |
5 // This file is distributed under the University of Illinois Open Source | 5 // This file is distributed under the University of Illinois Open Source |
6 // License. See LICENSE.TXT for details. | 6 // License. See LICENSE.TXT for details. |
7 // | 7 // |
8 //===----------------------------------------------------------------------===// | 8 //===----------------------------------------------------------------------===// |
9 // | 9 // |
10 // This file implements the TargetLoweringX8632 class, which | 10 // This file implements the TargetLoweringX8632 class, which |
11 // consists almost entirely of the lowering sequence for each | 11 // consists almost entirely of the lowering sequence for each |
12 // high-level instruction. It also implements | 12 // high-level instruction. It also implements |
13 // TargetX8632Fast::postLower() which does the simplest possible | 13 // TargetX8632Fast::postLower() which does the simplest possible |
14 // register allocation for the "fast" target. | 14 // register allocation for the "fast" target. |
15 // | 15 // |
16 //===----------------------------------------------------------------------===// | 16 //===----------------------------------------------------------------------===// |
17 | 17 |
18 #include "IceDefs.h" | 18 #include "IceDefs.h" |
19 #include "IceCfg.h" | 19 #include "IceCfg.h" |
20 #include "IceCfgNode.h" | 20 #include "IceCfgNode.h" |
21 #include "IceInstX8632.h" | 21 #include "IceInstX8632.h" |
22 #include "IceOperand.h" | 22 #include "IceOperand.h" |
23 #include "IceTargetLoweringX8632.def" | 23 #include "IceTargetLoweringX8632.def" |
24 #include "IceTargetLoweringX8632.h" | 24 #include "IceTargetLoweringX8632.h" |
| 25 #include "llvm/Support/CommandLine.h" |
25 | 26 |
26 namespace Ice { | 27 namespace Ice { |
27 | 28 |
28 namespace { | 29 namespace { |
29 | 30 |
30 // The following table summarizes the logic for lowering the fcmp | 31 // The following table summarizes the logic for lowering the fcmp |
31 // instruction. There is one table entry for each of the 16 conditions. | 32 // instruction. There is one table entry for each of the 16 conditions. |
32 // | 33 // |
33 // The first four columns describe the case when the operands are | 34 // The first four columns describe the case when the operands are |
34 // floating point scalar values. A comment in lowerFcmp() describes the | 35 // floating point scalar values. A comment in lowerFcmp() describes the |
(...skipping 81 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
116 size_t Index = static_cast<size_t>(Ty); | 117 size_t Index = static_cast<size_t>(Ty); |
117 assert(Index < TableTypeX8632AttributesSize); | 118 assert(Index < TableTypeX8632AttributesSize); |
118 return TableTypeX8632Attributes[Ty].InVectorElementType; | 119 return TableTypeX8632Attributes[Ty].InVectorElementType; |
119 } | 120 } |
120 | 121 |
121 // The maximum number of arguments to pass in XMM registers | 122 // The maximum number of arguments to pass in XMM registers |
122 const unsigned X86_MAX_XMM_ARGS = 4; | 123 const unsigned X86_MAX_XMM_ARGS = 4; |
123 // The number of bits in a byte | 124 // The number of bits in a byte |
124 const unsigned X86_CHAR_BIT = 8; | 125 const unsigned X86_CHAR_BIT = 8; |
125 | 126 |
| 127 // Instruction set options |
| 128 namespace cl = ::llvm::cl; |
| 129 cl::opt<TargetX8632::X86InstructionSet> CLInstructionSet( |
| 130 "mattr", cl::desc("X86 target attributes"), |
| 131 cl::init(TargetX8632::SSE2), |
| 132 cl::values( |
| 133 clEnumValN(TargetX8632::SSE2, "sse2", |
| 134 "Enable SSE2 instructions (default)"), |
| 135 clEnumValN(TargetX8632::SSE4_1, "sse4.1", |
| 136 "Enable SSE 4.1 instructions"), clEnumValEnd)); |
| 137 |
126 // Return a string representation of the type that is suitable for use | 138 // Return a string representation of the type that is suitable for use |
127 // in an identifier. | 139 // in an identifier. |
128 IceString typeIdentString(const Type Ty) { | 140 IceString typeIdentString(const Type Ty) { |
129 IceString Str; | 141 IceString Str; |
130 llvm::raw_string_ostream BaseOS(Str); | 142 llvm::raw_string_ostream BaseOS(Str); |
131 if (isVectorType(Ty)) { | 143 if (isVectorType(Ty)) { |
132 BaseOS << "v" << typeNumElements(Ty) << typeElementType(Ty); | 144 BaseOS << "v" << typeNumElements(Ty) << typeElementType(Ty); |
133 } else { | 145 } else { |
134 BaseOS << Ty; | 146 BaseOS << Ty; |
135 } | 147 } |
(...skipping 91 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
227 #define X(tag, size, align, elts, elty, str) \ | 239 #define X(tag, size, align, elts, elty, str) \ |
228 STATIC_ASSERT(_table1_##tag == _table2_##tag); | 240 STATIC_ASSERT(_table1_##tag == _table2_##tag); |
229 ICETYPE_TABLE; | 241 ICETYPE_TABLE; |
230 #undef X | 242 #undef X |
231 } | 243 } |
232 } | 244 } |
233 | 245 |
234 } // end of anonymous namespace | 246 } // end of anonymous namespace |
235 | 247 |
236 TargetX8632::TargetX8632(Cfg *Func) | 248 TargetX8632::TargetX8632(Cfg *Func) |
237 : TargetLowering(Func), IsEbpBasedFrame(false), FrameSizeLocals(0), | 249 : TargetLowering(Func), InstructionSet(CLInstructionSet), |
238 LocalsSizeBytes(0), NextLabelNumber(0), ComputedLiveRanges(false), | 250 IsEbpBasedFrame(false), FrameSizeLocals(0), LocalsSizeBytes(0), |
| 251 NextLabelNumber(0), ComputedLiveRanges(false), |
239 PhysicalRegisters(VarList(Reg_NUM)) { | 252 PhysicalRegisters(VarList(Reg_NUM)) { |
240 // TODO: Don't initialize IntegerRegisters and friends every time. | 253 // TODO: Don't initialize IntegerRegisters and friends every time. |
241 // Instead, initialize in some sort of static initializer for the | 254 // Instead, initialize in some sort of static initializer for the |
242 // class. | 255 // class. |
243 llvm::SmallBitVector IntegerRegisters(Reg_NUM); | 256 llvm::SmallBitVector IntegerRegisters(Reg_NUM); |
244 llvm::SmallBitVector IntegerRegistersI8(Reg_NUM); | 257 llvm::SmallBitVector IntegerRegistersI8(Reg_NUM); |
245 llvm::SmallBitVector FloatRegisters(Reg_NUM); | 258 llvm::SmallBitVector FloatRegisters(Reg_NUM); |
246 llvm::SmallBitVector VectorRegisters(Reg_NUM); | 259 llvm::SmallBitVector VectorRegisters(Reg_NUM); |
247 llvm::SmallBitVector InvalidRegisters(Reg_NUM); | 260 llvm::SmallBitVector InvalidRegisters(Reg_NUM); |
248 ScratchRegs.resize(Reg_NUM); | 261 ScratchRegs.resize(Reg_NUM); |
(...skipping 972 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1221 _pxor(T, LEGAL_HACK(Src1)); | 1234 _pxor(T, LEGAL_HACK(Src1)); |
1222 _movp(Dest, T); | 1235 _movp(Dest, T); |
1223 } break; | 1236 } break; |
1224 case InstArithmetic::Sub: { | 1237 case InstArithmetic::Sub: { |
1225 Variable *T = makeReg(Dest->getType()); | 1238 Variable *T = makeReg(Dest->getType()); |
1226 _movp(T, Src0); | 1239 _movp(T, Src0); |
1227 _psub(T, LEGAL_HACK(Src1)); | 1240 _psub(T, LEGAL_HACK(Src1)); |
1228 _movp(Dest, T); | 1241 _movp(Dest, T); |
1229 } break; | 1242 } break; |
1230 case InstArithmetic::Mul: { | 1243 case InstArithmetic::Mul: { |
1231 if (Dest->getType() == IceType_v4i32) { | 1244 bool TypesAreValidForPmull = |
| 1245 Dest->getType() == IceType_v4i32 || Dest->getType() == IceType_v8i16; |
| 1246 bool InstructionSetIsValidForPmull = |
| 1247 Dest->getType() == IceType_v8i16 || InstructionSet >= SSE4_1; |
| 1248 if (TypesAreValidForPmull && InstructionSetIsValidForPmull) { |
| 1249 Variable *T = makeReg(Dest->getType()); |
| 1250 _movp(T, Src0); |
| 1251 _pmull(T, legalizeToVar(Src1)); |
| 1252 _movp(Dest, T); |
| 1253 } else if (Dest->getType() == IceType_v4i32) { |
1232 // Lowering sequence: | 1254 // Lowering sequence: |
1233 // Note: The mask arguments have index 0 on the left. | 1255 // Note: The mask arguments have index 0 on the left. |
1234 // | 1256 // |
1235 // movups T1, Src0 | 1257 // movups T1, Src0 |
1236 // pshufd T2, Src0, {1,0,3,0} | 1258 // pshufd T2, Src0, {1,0,3,0} |
1237 // pshufd T3, Src1, {1,0,3,0} | 1259 // pshufd T3, Src1, {1,0,3,0} |
1238 // # T1 = {Src0[0] * Src1[0], Src0[2] * Src1[2]} | 1260 // # T1 = {Src0[0] * Src1[0], Src0[2] * Src1[2]} |
1239 // pmuludq T1, Src1 | 1261 // pmuludq T1, Src1 |
1240 // # T2 = {Src0[1] * Src1[1], Src0[3] * Src1[3]} | 1262 // # T2 = {Src0[1] * Src1[1], Src0[3] * Src1[3]} |
1241 // pmuludq T2, T3 | 1263 // pmuludq T2, T3 |
1242 // # T1 = {lo(T1[0]), lo(T1[2]), lo(T2[0]), lo(T2[2])} | 1264 // # T1 = {lo(T1[0]), lo(T1[2]), lo(T2[0]), lo(T2[2])} |
1243 // shufps T1, T2, {0,2,0,2} | 1265 // shufps T1, T2, {0,2,0,2} |
1244 // pshufd T4, T1, {0,2,1,3} | 1266 // pshufd T4, T1, {0,2,1,3} |
1245 // movups Dest, T4 | 1267 // movups Dest, T4 |
1246 // | |
1247 // TODO(wala): SSE4.1 has pmulld. | |
1248 | 1268 |
1249 // Mask that directs pshufd to create a vector with entries | 1269 // Mask that directs pshufd to create a vector with entries |
1250 // Src[1, 0, 3, 0] | 1270 // Src[1, 0, 3, 0] |
1251 const unsigned Constant1030 = 0x31; | 1271 const unsigned Constant1030 = 0x31; |
1252 Constant *Mask1030 = Ctx->getConstantInt(IceType_i8, Constant1030); | 1272 Constant *Mask1030 = Ctx->getConstantInt(IceType_i8, Constant1030); |
1253 // Mask that directs shufps to create a vector with entries | 1273 // Mask that directs shufps to create a vector with entries |
1254 // Dest[0, 2], Src[0, 2] | 1274 // Dest[0, 2], Src[0, 2] |
1255 const unsigned Mask0202 = 0x88; | 1275 const unsigned Mask0202 = 0x88; |
1256 // Mask that directs pshufd to create a vector with entries | 1276 // Mask that directs pshufd to create a vector with entries |
1257 // Src[0, 2, 1, 3] | 1277 // Src[0, 2, 1, 3] |
1258 const unsigned Mask0213 = 0xd8; | 1278 const unsigned Mask0213 = 0xd8; |
1259 Variable *T1 = makeReg(IceType_v4i32); | 1279 Variable *T1 = makeReg(IceType_v4i32); |
1260 Variable *T2 = makeReg(IceType_v4i32); | 1280 Variable *T2 = makeReg(IceType_v4i32); |
1261 Variable *T3 = makeReg(IceType_v4i32); | 1281 Variable *T3 = makeReg(IceType_v4i32); |
1262 Variable *T4 = makeReg(IceType_v4i32); | 1282 Variable *T4 = makeReg(IceType_v4i32); |
1263 _movp(T1, Src0); | 1283 _movp(T1, Src0); |
1264 // TODO(wala): ALIGNHACK: Replace Src0R with Src0 and Src1R | 1284 // TODO(wala): ALIGNHACK: Replace Src0R with Src0 and Src1R |
1265 // with Src1 after stack operand alignment support is | 1285 // with Src1 after stack operand alignment support is |
1266 // implemented. | 1286 // implemented. |
1267 Variable *Src0R = LEGAL_HACK(Src0); | 1287 Variable *Src0R = LEGAL_HACK(Src0); |
1268 Variable *Src1R = LEGAL_HACK(Src1); | 1288 Variable *Src1R = LEGAL_HACK(Src1); |
1269 _pshufd(T2, Src0R, Mask1030); | 1289 _pshufd(T2, Src0R, Mask1030); |
1270 _pshufd(T3, Src1R, Mask1030); | 1290 _pshufd(T3, Src1R, Mask1030); |
1271 _pmuludq(T1, Src1R); | 1291 _pmuludq(T1, Src1R); |
1272 _pmuludq(T2, T3); | 1292 _pmuludq(T2, T3); |
1273 _shufps(T1, T2, Ctx->getConstantInt(IceType_i8, Mask0202)); | 1293 _shufps(T1, T2, Ctx->getConstantInt(IceType_i8, Mask0202)); |
1274 _pshufd(T4, T1, Ctx->getConstantInt(IceType_i8, Mask0213)); | 1294 _pshufd(T4, T1, Ctx->getConstantInt(IceType_i8, Mask0213)); |
1275 _movp(Dest, T4); | 1295 _movp(Dest, T4); |
1276 } else if (Dest->getType() == IceType_v8i16) { | |
1277 Variable *T = makeReg(IceType_v8i16); | |
1278 _movp(T, Src0); | |
1279 _pmullw(T, legalizeToVar(Src1)); | |
1280 _movp(Dest, T); | |
1281 } else { | 1296 } else { |
1282 assert(Dest->getType() == IceType_v16i8); | 1297 assert(Dest->getType() == IceType_v16i8); |
1283 // Sz_mul_v16i8 | 1298 // Sz_mul_v16i8 |
1284 const IceString Helper = "Sz_mul_v16i8"; | 1299 const IceString Helper = "Sz_mul_v16i8"; |
1285 const SizeT MaxSrcs = 2; | 1300 const SizeT MaxSrcs = 2; |
1286 InstCall *Call = makeHelperCall(Helper, Dest, MaxSrcs); | 1301 InstCall *Call = makeHelperCall(Helper, Dest, MaxSrcs); |
1287 Call->addArg(Src0); | 1302 Call->addArg(Src0); |
1288 Call->addArg(Src1); | 1303 Call->addArg(Src1); |
1289 lowerCall(Call); | 1304 lowerCall(Call); |
1290 } | 1305 } |
(...skipping 857 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2148 // Only constant indices are allowed in PNaCl IR. | 2163 // Only constant indices are allowed in PNaCl IR. |
2149 assert(ElementIndex); | 2164 assert(ElementIndex); |
2150 | 2165 |
2151 unsigned Index = ElementIndex->getValue(); | 2166 unsigned Index = ElementIndex->getValue(); |
2152 Type Ty = SourceVectOperand->getType(); | 2167 Type Ty = SourceVectOperand->getType(); |
2153 Type ElementTy = typeElementType(Ty); | 2168 Type ElementTy = typeElementType(Ty); |
2154 Type InVectorElementTy = getInVectorElementType(Ty); | 2169 Type InVectorElementTy = getInVectorElementType(Ty); |
2155 Variable *ExtractedElement = makeReg(InVectorElementTy); | 2170 Variable *ExtractedElement = makeReg(InVectorElementTy); |
2156 | 2171 |
2157 // TODO(wala): Determine the best lowering sequences for each type. | 2172 // TODO(wala): Determine the best lowering sequences for each type. |
2158 if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) { | 2173 bool CanUsePextr = |
2159 // Lower extractelement operations where the element is 32 bits | 2174 Ty == IceType_v8i16 || Ty == IceType_v8i1 || InstructionSet >= SSE4_1; |
2160 // wide with pshufd. | 2175 if (CanUsePextr && Ty != IceType_v4f32) { |
2161 // TODO(wala): SSE4.1 has extractps and pextrd | 2176 // Use pextrb, pextrw, or pextrd. |
| 2177 Constant *Mask = Ctx->getConstantInt(IceType_i8, Index); |
| 2178 Variable *SourceVectR = legalizeToVar(SourceVectOperand); |
| 2179 _pextr(ExtractedElement, SourceVectR, Mask); |
| 2180 } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) { |
| 2181 // Use pshufd and movd/movss. |
2162 // | 2182 // |
2163 // ALIGNHACK: Force vector operands to registers in instructions that | 2183 // ALIGNHACK: Force vector operands to registers in instructions that |
2164 // require aligned memory operands until support for stack alignment | 2184 // require aligned memory operands until support for stack alignment |
2165 // is implemented. | 2185 // is implemented. |
2166 #define ALIGN_HACK(Vect) legalizeToVar((Vect)) | 2186 #define ALIGN_HACK(Vect) legalizeToVar((Vect)) |
2167 Variable *T = NULL; | 2187 Variable *T = NULL; |
2168 if (Index) { | 2188 if (Index) { |
2169 // The shuffle only needs to occur if the element to be extracted | 2189 // The shuffle only needs to occur if the element to be extracted |
2170 // is not at the lowest index. | 2190 // is not at the lowest index. |
2171 Constant *Mask = Ctx->getConstantInt(IceType_i8, Index); | 2191 Constant *Mask = Ctx->getConstantInt(IceType_i8, Index); |
2172 T = makeReg(Ty); | 2192 T = makeReg(Ty); |
2173 _pshufd(T, ALIGN_HACK(SourceVectOperand), Mask); | 2193 _pshufd(T, ALIGN_HACK(SourceVectOperand), Mask); |
2174 } else { | 2194 } else { |
2175 T = legalizeToVar(SourceVectOperand); | 2195 T = legalizeToVar(SourceVectOperand); |
2176 } | 2196 } |
2177 | 2197 |
2178 if (InVectorElementTy == IceType_i32) { | 2198 if (InVectorElementTy == IceType_i32) { |
2179 _movd(ExtractedElement, T); | 2199 _movd(ExtractedElement, T); |
2180 } else { // Ty == IceType_f32 | 2200 } else { // Ty == IceType_f32 |
2181 // TODO(wala): _movss is only used here because _mov does not | 2201 // TODO(wala): _movss is only used here because _mov does not |
2182 // allow a vector source and a scalar destination. _mov should be | 2202 // allow a vector source and a scalar destination. _mov should be |
2183 // able to be used here. | 2203 // able to be used here. |
2184 // _movss is a binary instruction, so the FakeDef is needed to | 2204 // _movss is a binary instruction, so the FakeDef is needed to |
2185 // keep the live range analysis consistent. | 2205 // keep the live range analysis consistent. |
2186 Context.insert(InstFakeDef::create(Func, ExtractedElement)); | 2206 Context.insert(InstFakeDef::create(Func, ExtractedElement)); |
2187 _movss(ExtractedElement, T); | 2207 _movss(ExtractedElement, T); |
2188 } | 2208 } |
2189 #undef ALIGN_HACK | 2209 #undef ALIGN_HACK |
2190 } else if (Ty == IceType_v8i16 || Ty == IceType_v8i1) { | |
2191 Constant *Mask = Ctx->getConstantInt(IceType_i8, Index); | |
2192 _pextrw(ExtractedElement, legalizeToVar(SourceVectOperand), Mask); | |
2193 } else { | 2210 } else { |
2194 assert(Ty == IceType_v16i8 || Ty == IceType_v16i1); | 2211 assert(Ty == IceType_v16i8 || Ty == IceType_v16i1); |
2195 // Spill the value to a stack slot and do the extraction in memory. | 2212 // Spill the value to a stack slot and do the extraction in memory. |
2196 // TODO(wala): SSE4.1 has pextrb. | |
2197 // | 2213 // |
2198 // TODO(wala): use legalize(SourceVectOperand, Legal_Mem) when | 2214 // TODO(wala): use legalize(SourceVectOperand, Legal_Mem) when |
2199 // support for legalizing to mem is implemented. | 2215 // support for legalizing to mem is implemented. |
2200 Variable *Slot = Func->makeVariable(Ty, Context.getNode()); | 2216 Variable *Slot = Func->makeVariable(Ty, Context.getNode()); |
2201 Slot->setWeight(RegWeight::Zero); | 2217 Slot->setWeight(RegWeight::Zero); |
2202 _movp(Slot, legalizeToVar(SourceVectOperand)); | 2218 _movp(Slot, legalizeToVar(SourceVectOperand)); |
2203 | 2219 |
2204 // Compute the location of the element in memory. | 2220 // Compute the location of the element in memory. |
2205 unsigned Offset = Index * typeWidthInBytes(InVectorElementTy); | 2221 unsigned Offset = Index * typeWidthInBytes(InVectorElementTy); |
2206 OperandX8632Mem *Loc = | 2222 OperandX8632Mem *Loc = |
(...skipping 325 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2532 // Expand the element to the appropriate size for it to be inserted | 2548 // Expand the element to the appropriate size for it to be inserted |
2533 // in the vector. | 2549 // in the vector. |
2534 Variable *Expanded = | 2550 Variable *Expanded = |
2535 Func->makeVariable(InVectorElementTy, Context.getNode()); | 2551 Func->makeVariable(InVectorElementTy, Context.getNode()); |
2536 InstCast *Cast = | 2552 InstCast *Cast = |
2537 InstCast::create(Func, InstCast::Zext, Expanded, ElementToInsert); | 2553 InstCast::create(Func, InstCast::Zext, Expanded, ElementToInsert); |
2538 lowerCast(Cast); | 2554 lowerCast(Cast); |
2539 ElementToInsert = Expanded; | 2555 ElementToInsert = Expanded; |
2540 } | 2556 } |
2541 | 2557 |
2542 if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) { | 2558 if (Ty == IceType_v8i16 || Ty == IceType_v8i1 || InstructionSet >= SSE4_1) { |
2543 // Lower insertelement with 32-bit wide elements using shufps or | 2559 // Use insertps, pinsrb, pinsrw, or pinsrd. |
2544 // movss. | 2560 Operand *Element = legalize(ElementToInsert, Legal_Mem | Legal_Reg); |
2545 // TODO(wala): SSE4.1 has pinsrd and insertps. | 2561 Variable *T = makeReg(Ty); |
| 2562 _movp(T, SourceVectOperand); |
| 2563 if (Ty == IceType_v4f32) |
| 2564 _insertps(T, Element, Ctx->getConstantInt(IceType_i8, Index << 4)); |
| 2565 else |
| 2566 _pinsr(T, Element, Ctx->getConstantInt(IceType_i8, Index)); |
| 2567 _movp(Inst->getDest(), T); |
| 2568 } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) { |
| 2569 // Use shufps or movss. |
2546 Variable *Element = NULL; | 2570 Variable *Element = NULL; |
2547 if (InVectorElementTy == IceType_f32) { | 2571 if (InVectorElementTy == IceType_f32) { |
2548 // Element will be in an XMM register since it is floating point. | 2572 // Element will be in an XMM register since it is floating point. |
2549 Element = legalizeToVar(ElementToInsert); | 2573 Element = legalizeToVar(ElementToInsert); |
2550 } else { | 2574 } else { |
2551 // Copy an integer to an XMM register. | 2575 // Copy an integer to an XMM register. |
2552 Operand *T = legalize(ElementToInsert, Legal_Reg | Legal_Mem); | 2576 Operand *T = legalize(ElementToInsert, Legal_Reg | Legal_Mem); |
2553 Element = makeReg(Ty); | 2577 Element = makeReg(Ty); |
2554 _movd(Element, T); | 2578 _movd(Element, T); |
2555 } | 2579 } |
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2600 _shufps(Element, SourceVectOperand, Mask2Constant); | 2624 _shufps(Element, SourceVectOperand, Mask2Constant); |
2601 _movp(Inst->getDest(), Element); | 2625 _movp(Inst->getDest(), Element); |
2602 } else { | 2626 } else { |
2603 Variable *T = makeReg(Ty); | 2627 Variable *T = makeReg(Ty); |
2604 _movp(T, SourceVectOperand); | 2628 _movp(T, SourceVectOperand); |
2605 _shufps(Element, T, Mask1Constant); | 2629 _shufps(Element, T, Mask1Constant); |
2606 _shufps(T, Element, Mask2Constant); | 2630 _shufps(T, Element, Mask2Constant); |
2607 _movp(Inst->getDest(), T); | 2631 _movp(Inst->getDest(), T); |
2608 } | 2632 } |
2609 #undef ALIGN_HACK | 2633 #undef ALIGN_HACK |
2610 } else if (Ty == IceType_v8i16 || Ty == IceType_v8i1) { | |
2611 Operand *Element = legalize(ElementToInsert, Legal_Mem | Legal_Reg); | |
2612 Variable *T = makeReg(Ty); | |
2613 _movp(T, SourceVectOperand); | |
2614 _pinsrw(T, Element, Ctx->getConstantInt(IceType_i8, Index)); | |
2615 _movp(Inst->getDest(), T); | |
2616 } else { | 2634 } else { |
2617 assert(Ty == IceType_v16i8 || Ty == IceType_v16i1); | 2635 assert(Ty == IceType_v16i8 || Ty == IceType_v16i1); |
2618 // Spill the value to a stack slot and perform the insertion in | 2636 // Spill the value to a stack slot and perform the insertion in |
2619 // memory. | 2637 // memory. |
2620 // TODO(wala): SSE4.1 has pinsrb. | |
2621 // | 2638 // |
2622 // TODO(wala): use legalize(SourceVectOperand, Legal_Mem) when | 2639 // TODO(wala): use legalize(SourceVectOperand, Legal_Mem) when |
2623 // support for legalizing to mem is implemented. | 2640 // support for legalizing to mem is implemented. |
2624 Variable *Slot = Func->makeVariable(Ty, Context.getNode()); | 2641 Variable *Slot = Func->makeVariable(Ty, Context.getNode()); |
2625 Slot->setWeight(RegWeight::Zero); | 2642 Slot->setWeight(RegWeight::Zero); |
2626 _movp(Slot, legalizeToVar(SourceVectOperand)); | 2643 _movp(Slot, legalizeToVar(SourceVectOperand)); |
2627 | 2644 |
2628 // Compute the location of the position to insert in memory. | 2645 // Compute the location of the position to insert in memory. |
2629 unsigned Offset = Index * typeWidthInBytes(InVectorElementTy); | 2646 unsigned Offset = Index * typeWidthInBytes(InVectorElementTy); |
2630 OperandX8632Mem *Loc = | 2647 OperandX8632Mem *Loc = |
(...skipping 913 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3544 Context.insert(InstFakeUse::create(Func, esp)); | 3561 Context.insert(InstFakeUse::create(Func, esp)); |
3545 } | 3562 } |
3546 | 3563 |
3547 void TargetX8632::lowerSelect(const InstSelect *Inst) { | 3564 void TargetX8632::lowerSelect(const InstSelect *Inst) { |
3548 Variable *Dest = Inst->getDest(); | 3565 Variable *Dest = Inst->getDest(); |
3549 Operand *SrcT = Inst->getTrueOperand(); | 3566 Operand *SrcT = Inst->getTrueOperand(); |
3550 Operand *SrcF = Inst->getFalseOperand(); | 3567 Operand *SrcF = Inst->getFalseOperand(); |
3551 Operand *Condition = Inst->getCondition(); | 3568 Operand *Condition = Inst->getCondition(); |
3552 | 3569 |
3553 if (isVectorType(Dest->getType())) { | 3570 if (isVectorType(Dest->getType())) { |
3554 // a=d?b:c ==> d=sext(d); a=(b&d)|(c&~d) | |
3555 // TODO(wala): SSE4.1 has blendvps and pblendvb. SSE4.1 also has | |
3556 // blendps and pblendw for constant condition operands. | |
3557 Type SrcTy = SrcT->getType(); | 3571 Type SrcTy = SrcT->getType(); |
3558 Variable *T = makeReg(SrcTy); | 3572 Variable *T = makeReg(SrcTy); |
| 3573 // ALIGNHACK: Until stack alignment support is implemented, vector |
| 3574 // instructions need to have vector operands in registers. Once |
| 3575 // there is support for stack alignment, LEGAL_HACK can be removed. |
| 3576 #define LEGAL_HACK(Vect) legalizeToVar((Vect)) |
| 3577 if (InstructionSet >= SSE4_1) { |
| 3578 // TODO(wala): If the condition operand is a constant, use blendps |
| 3579 // or pblendw. |
| 3580 // |
| 3581 // Use blendvps or pblendvb to implement select. |
| 3582 if (SrcTy == IceType_v4i1 || SrcTy == IceType_v4i32 || |
| 3583 SrcTy == IceType_v4f32) { |
| 3584 Variable *xmm0 = makeReg(IceType_v4i32, Reg_xmm0); |
| 3585 _movp(xmm0, Condition); |
| 3586 _psll(xmm0, Ctx->getConstantInt(IceType_i8, 31)); |
| 3587 _movp(T, SrcF); |
| 3588 _blendvps(T, LEGAL_HACK(SrcT), xmm0); |
| 3589 _movp(Dest, T); |
| 3590 } else { |
| 3591 assert(typeNumElements(SrcTy) == 8 || typeNumElements(SrcTy) == 16); |
| 3592 Type SignExtTy = Condition->getType() == IceType_v8i1 ? IceType_v8i16 |
| 3593 : IceType_v16i8; |
| 3594 Variable *xmm0 = makeReg(SignExtTy, Reg_xmm0); |
| 3595 lowerCast(InstCast::create(Func, InstCast::Sext, xmm0, Condition)); |
| 3596 _movp(T, SrcF); |
| 3597 _pblendvb(T, LEGAL_HACK(SrcT), xmm0); |
| 3598 _movp(Dest, T); |
| 3599 } |
| 3600 return; |
| 3601 } |
| 3602 // Lower select without SSE4.1: |
| 3603 // a=d?b:c ==> |
| 3604 // if elementtype(d) != i1: |
| 3605 // d=sext(d); |
| 3606 // a=(b&d)|(c&~d); |
3559 Variable *T2 = makeReg(SrcTy); | 3607 Variable *T2 = makeReg(SrcTy); |
3560 // Sign extend the condition operand if applicable. | 3608 // Sign extend the condition operand if applicable. |
3561 if (SrcTy == IceType_v4f32) { | 3609 if (SrcTy == IceType_v4f32) { |
3562 // The sext operation takes only integer arguments. | 3610 // The sext operation takes only integer arguments. |
3563 Variable *T3 = Func->makeVariable(IceType_v4i32, Context.getNode()); | 3611 Variable *T3 = Func->makeVariable(IceType_v4i32, Context.getNode()); |
3564 lowerCast(InstCast::create(Func, InstCast::Sext, T3, Condition)); | 3612 lowerCast(InstCast::create(Func, InstCast::Sext, T3, Condition)); |
3565 _movp(T, T3); | 3613 _movp(T, T3); |
3566 } else if (typeElementType(SrcTy) != IceType_i1) { | 3614 } else if (typeElementType(SrcTy) != IceType_i1) { |
3567 lowerCast(InstCast::create(Func, InstCast::Sext, T, Condition)); | 3615 lowerCast(InstCast::create(Func, InstCast::Sext, T, Condition)); |
3568 } else { | 3616 } else { |
3569 _movp(T, Condition); | 3617 _movp(T, Condition); |
3570 } | 3618 } |
3571 // ALIGNHACK: Until stack alignment support is implemented, the | |
3572 // bitwise vector instructions need to have both operands in | |
3573 // registers. Once there is support for stack alignment, LEGAL_HACK | |
3574 // can be removed. | |
3575 #define LEGAL_HACK(Vect) legalizeToVar((Vect)) | |
3576 _movp(T2, T); | 3619 _movp(T2, T); |
3577 _pand(T, LEGAL_HACK(SrcT)); | 3620 _pand(T, LEGAL_HACK(SrcT)); |
3578 _pandn(T2, LEGAL_HACK(SrcF)); | 3621 _pandn(T2, LEGAL_HACK(SrcF)); |
3579 _por(T, T2); | 3622 _por(T, T2); |
3580 _movp(Dest, T); | 3623 _movp(Dest, T); |
3581 #undef LEGAL_HACK | 3624 #undef LEGAL_HACK |
3582 | 3625 |
3583 return; | 3626 return; |
3584 } | 3627 } |
3585 | 3628 |
(...skipping 504 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4090 for (SizeT i = 0; i < Size; ++i) { | 4133 for (SizeT i = 0; i < Size; ++i) { |
4091 Str << "\t.byte\t" << (((unsigned)Data[i]) & 0xff) << "\n"; | 4134 Str << "\t.byte\t" << (((unsigned)Data[i]) & 0xff) << "\n"; |
4092 } | 4135 } |
4093 Str << "\t.size\t" << MangledName << ", " << Size << "\n"; | 4136 Str << "\t.size\t" << MangledName << ", " << Size << "\n"; |
4094 } | 4137 } |
4095 Str << "\t" << (IsInternal ? ".local" : ".global") << "\t" << MangledName | 4138 Str << "\t" << (IsInternal ? ".local" : ".global") << "\t" << MangledName |
4096 << "\n"; | 4139 << "\n"; |
4097 } | 4140 } |
4098 | 4141 |
4099 } // end of namespace Ice | 4142 } // end of namespace Ice |
OLD | NEW |