| OLD | NEW |
| 1 //===- subzero/src/IceTargetLoweringX8632.cpp - x86-32 lowering -----------===// | 1 //===- subzero/src/IceTargetLoweringX8632.cpp - x86-32 lowering -----------===// |
| 2 // | 2 // |
| 3 // The Subzero Code Generator | 3 // The Subzero Code Generator |
| 4 // | 4 // |
| 5 // This file is distributed under the University of Illinois Open Source | 5 // This file is distributed under the University of Illinois Open Source |
| 6 // License. See LICENSE.TXT for details. | 6 // License. See LICENSE.TXT for details. |
| 7 // | 7 // |
| 8 //===----------------------------------------------------------------------===// | 8 //===----------------------------------------------------------------------===// |
| 9 // | 9 // |
| 10 // This file implements the TargetLoweringX8632 class, which | 10 // This file implements the TargetLoweringX8632 class, which |
| 11 // consists almost entirely of the lowering sequence for each | 11 // consists almost entirely of the lowering sequence for each |
| 12 // high-level instruction. It also implements | 12 // high-level instruction. It also implements |
| 13 // TargetX8632Fast::postLower() which does the simplest possible | 13 // TargetX8632Fast::postLower() which does the simplest possible |
| 14 // register allocation for the "fast" target. | 14 // register allocation for the "fast" target. |
| 15 // | 15 // |
| 16 //===----------------------------------------------------------------------===// | 16 //===----------------------------------------------------------------------===// |
| 17 | 17 |
| 18 #include "IceDefs.h" | 18 #include "IceDefs.h" |
| 19 #include "IceCfg.h" | 19 #include "IceCfg.h" |
| 20 #include "IceCfgNode.h" | 20 #include "IceCfgNode.h" |
| 21 #include "IceInstX8632.h" | 21 #include "IceInstX8632.h" |
| 22 #include "IceOperand.h" | 22 #include "IceOperand.h" |
| 23 #include "IceTargetLoweringX8632.def" | 23 #include "IceTargetLoweringX8632.def" |
| 24 #include "IceTargetLoweringX8632.h" | 24 #include "IceTargetLoweringX8632.h" |
| 25 #include "llvm/Support/CommandLine.h" |
| 25 | 26 |
| 26 namespace Ice { | 27 namespace Ice { |
| 27 | 28 |
| 28 namespace { | 29 namespace { |
| 29 | 30 |
| 30 // The following table summarizes the logic for lowering the fcmp | 31 // The following table summarizes the logic for lowering the fcmp |
| 31 // instruction. There is one table entry for each of the 16 conditions. | 32 // instruction. There is one table entry for each of the 16 conditions. |
| 32 // | 33 // |
| 33 // The first four columns describe the case when the operands are | 34 // The first four columns describe the case when the operands are |
| 34 // floating point scalar values. A comment in lowerFcmp() describes the | 35 // floating point scalar values. A comment in lowerFcmp() describes the |
| (...skipping 81 matching lines...) |
| 116 size_t Index = static_cast<size_t>(Ty); | 117 size_t Index = static_cast<size_t>(Ty); |
| 117 assert(Index < TableTypeX8632AttributesSize); | 118 assert(Index < TableTypeX8632AttributesSize); |
| 118 return TableTypeX8632Attributes[Ty].InVectorElementType; | 119 return TableTypeX8632Attributes[Ty].InVectorElementType; |
| 119 } | 120 } |
| 120 | 121 |
| 121 // The maximum number of arguments to pass in XMM registers | 122 // The maximum number of arguments to pass in XMM registers |
| 122 const unsigned X86_MAX_XMM_ARGS = 4; | 123 const unsigned X86_MAX_XMM_ARGS = 4; |
| 123 // The number of bits in a byte | 124 // The number of bits in a byte |
| 124 const unsigned X86_CHAR_BIT = 8; | 125 const unsigned X86_CHAR_BIT = 8; |
| 125 | 126 |
| 127 // Instruction set options |
| 128 namespace cl = ::llvm::cl; |
| 129 cl::opt<TargetX8632::X86InstructionSet> CLInstructionSet( |
| 130 "mattr", cl::desc("X86 target attributes"), |
| 131 cl::init(TargetX8632::SSE2), |
| 132 cl::values( |
| 133 clEnumValN(TargetX8632::SSE2, "sse2", |
| 134 "Enable SSE2 instructions (default)"), |
| 135 clEnumValN(TargetX8632::SSE4_1, "sse4.1", |
| 136 "Enable SSE 4.1 instructions"), clEnumValEnd)); |
| 137 |
| 126 // Return a string representation of the type that is suitable for use | 138 // Return a string representation of the type that is suitable for use |
| 127 // in an identifier. | 139 // in an identifier. |
| 128 IceString typeIdentString(const Type Ty) { | 140 IceString typeIdentString(const Type Ty) { |
| 129 IceString Str; | 141 IceString Str; |
| 130 llvm::raw_string_ostream BaseOS(Str); | 142 llvm::raw_string_ostream BaseOS(Str); |
| 131 if (isVectorType(Ty)) { | 143 if (isVectorType(Ty)) { |
| 132 BaseOS << "v" << typeNumElements(Ty) << typeElementType(Ty); | 144 BaseOS << "v" << typeNumElements(Ty) << typeElementType(Ty); |
| 133 } else { | 145 } else { |
| 134 BaseOS << Ty; | 146 BaseOS << Ty; |
| 135 } | 147 } |
| (...skipping 91 matching lines...) |
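The identifier built by typeIdentString above is what gives vector helper calls names such as Sz_mul_v16i8 later in this file. A standalone sketch of the same naming scheme (illustrative only, not the Subzero implementation; it assumes the elided tail of the function simply returns the accumulated string):

    #include <sstream>
    #include <string>

    // Hypothetical stand-in for typeIdentString: "v" + element count +
    // element type name for vectors, the plain type name for scalars.
    std::string typeIdent(bool IsVector, unsigned NumElements,
                          const std::string &ElementName) {
      std::ostringstream OS;
      if (IsVector)
        OS << "v" << NumElements << ElementName;
      else
        OS << ElementName;
      return OS.str();
    }

    // typeIdent(true, 16, "i8") == "v16i8", matching the Sz_mul_v16i8 helper.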
| 227 #define X(tag, size, align, elts, elty, str) \ | 239 #define X(tag, size, align, elts, elty, str) \ |
| 228 STATIC_ASSERT(_table1_##tag == _table2_##tag); | 240 STATIC_ASSERT(_table1_##tag == _table2_##tag); |
| 229 ICETYPE_TABLE; | 241 ICETYPE_TABLE; |
| 230 #undef X | 242 #undef X |
| 231 } | 243 } |
| 232 } | 244 } |
| 233 | 245 |
| 234 } // end of anonymous namespace | 246 } // end of anonymous namespace |
| 235 | 247 |
| 236 TargetX8632::TargetX8632(Cfg *Func) | 248 TargetX8632::TargetX8632(Cfg *Func) |
| 237 : TargetLowering(Func), IsEbpBasedFrame(false), FrameSizeLocals(0), | 249 : TargetLowering(Func), InstructionSet(CLInstructionSet), |
| 238 LocalsSizeBytes(0), NextLabelNumber(0), ComputedLiveRanges(false), | 250 IsEbpBasedFrame(false), FrameSizeLocals(0), LocalsSizeBytes(0), |
| 251 NextLabelNumber(0), ComputedLiveRanges(false), |
| 239 PhysicalRegisters(VarList(Reg_NUM)) { | 252 PhysicalRegisters(VarList(Reg_NUM)) { |
| 240 // TODO: Don't initialize IntegerRegisters and friends every time. | 253 // TODO: Don't initialize IntegerRegisters and friends every time. |
| 241 // Instead, initialize in some sort of static initializer for the | 254 // Instead, initialize in some sort of static initializer for the |
| 242 // class. | 255 // class. |
| 243 llvm::SmallBitVector IntegerRegisters(Reg_NUM); | 256 llvm::SmallBitVector IntegerRegisters(Reg_NUM); |
| 244 llvm::SmallBitVector IntegerRegistersI8(Reg_NUM); | 257 llvm::SmallBitVector IntegerRegistersI8(Reg_NUM); |
| 245 llvm::SmallBitVector FloatRegisters(Reg_NUM); | 258 llvm::SmallBitVector FloatRegisters(Reg_NUM); |
| 246 llvm::SmallBitVector VectorRegisters(Reg_NUM); | 259 llvm::SmallBitVector VectorRegisters(Reg_NUM); |
| 247 llvm::SmallBitVector InvalidRegisters(Reg_NUM); | 260 llvm::SmallBitVector InvalidRegisters(Reg_NUM); |
| 248 ScratchRegs.resize(Reg_NUM); | 261 ScratchRegs.resize(Reg_NUM); |
| (...skipping 972 matching lines...) |
| 1221 _pxor(T, LEGAL_HACK(Src1)); | 1234 _pxor(T, LEGAL_HACK(Src1)); |
| 1222 _movp(Dest, T); | 1235 _movp(Dest, T); |
| 1223 } break; | 1236 } break; |
| 1224 case InstArithmetic::Sub: { | 1237 case InstArithmetic::Sub: { |
| 1225 Variable *T = makeReg(Dest->getType()); | 1238 Variable *T = makeReg(Dest->getType()); |
| 1226 _movp(T, Src0); | 1239 _movp(T, Src0); |
| 1227 _psub(T, LEGAL_HACK(Src1)); | 1240 _psub(T, LEGAL_HACK(Src1)); |
| 1228 _movp(Dest, T); | 1241 _movp(Dest, T); |
| 1229 } break; | 1242 } break; |
| 1230 case InstArithmetic::Mul: { | 1243 case InstArithmetic::Mul: { |
| 1231 if (Dest->getType() == IceType_v4i32) { | 1244 bool TypesAreValidForPmull = |
| 1245 Dest->getType() == IceType_v4i32 || Dest->getType() == IceType_v8i16; |
| 1246 bool InstructionSetIsValidForPmull = |
| 1247 Dest->getType() == IceType_v8i16 || InstructionSet >= SSE4_1; |
| 1248 if (TypesAreValidForPmull && InstructionSetIsValidForPmull) { |
| 1249 Variable *T = makeReg(Dest->getType()); |
| 1250 _movp(T, Src0); |
| 1251 _pmull(T, legalizeToVar(Src1)); |
| 1252 _movp(Dest, T); |
| 1253 } else if (Dest->getType() == IceType_v4i32) { |
| 1232 // Lowering sequence: | 1254 // Lowering sequence: |
| 1233 // Note: The mask arguments have index 0 on the left. | 1255 // Note: The mask arguments have index 0 on the left. |
| 1234 // | 1256 // |
| 1235 // movups T1, Src0 | 1257 // movups T1, Src0 |
| 1236 // pshufd T2, Src0, {1,0,3,0} | 1258 // pshufd T2, Src0, {1,0,3,0} |
| 1237 // pshufd T3, Src1, {1,0,3,0} | 1259 // pshufd T3, Src1, {1,0,3,0} |
| 1238 // # T1 = {Src0[0] * Src1[0], Src0[2] * Src1[2]} | 1260 // # T1 = {Src0[0] * Src1[0], Src0[2] * Src1[2]} |
| 1239 // pmuludq T1, Src1 | 1261 // pmuludq T1, Src1 |
| 1240 // # T2 = {Src0[1] * Src1[1], Src0[3] * Src1[3]} | 1262 // # T2 = {Src0[1] * Src1[1], Src0[3] * Src1[3]} |
| 1241 // pmuludq T2, T3 | 1263 // pmuludq T2, T3 |
| 1242 // # T1 = {lo(T1[0]), lo(T1[2]), lo(T2[0]), lo(T2[2])} | 1264 // # T1 = {lo(T1[0]), lo(T1[2]), lo(T2[0]), lo(T2[2])} |
| 1243 // shufps T1, T2, {0,2,0,2} | 1265 // shufps T1, T2, {0,2,0,2} |
| 1244 // pshufd T4, T1, {0,2,1,3} | 1266 // pshufd T4, T1, {0,2,1,3} |
| 1245 // movups Dest, T4 | 1267 // movups Dest, T4 |
| 1246 // | |
| 1247 // TODO(wala): SSE4.1 has pmulld. | |
| 1248 | 1268 |
| 1249 // Mask that directs pshufd to create a vector with entries | 1269 // Mask that directs pshufd to create a vector with entries |
| 1250 // Src[1, 0, 3, 0] | 1270 // Src[1, 0, 3, 0] |
| 1251 const unsigned Constant1030 = 0x31; | 1271 const unsigned Constant1030 = 0x31; |
| 1252 Constant *Mask1030 = Ctx->getConstantInt(IceType_i8, Constant1030); | 1272 Constant *Mask1030 = Ctx->getConstantInt(IceType_i8, Constant1030); |
| 1253 // Mask that directs shufps to create a vector with entries | 1273 // Mask that directs shufps to create a vector with entries |
| 1254 // Dest[0, 2], Src[0, 2] | 1274 // Dest[0, 2], Src[0, 2] |
| 1255 const unsigned Mask0202 = 0x88; | 1275 const unsigned Mask0202 = 0x88; |
| 1256 // Mask that directs pshufd to create a vector with entries | 1276 // Mask that directs pshufd to create a vector with entries |
| 1257 // Src[0, 2, 1, 3] | 1277 // Src[0, 2, 1, 3] |
| 1258 const unsigned Mask0213 = 0xd8; | 1278 const unsigned Mask0213 = 0xd8; |
| 1259 Variable *T1 = makeReg(IceType_v4i32); | 1279 Variable *T1 = makeReg(IceType_v4i32); |
| 1260 Variable *T2 = makeReg(IceType_v4i32); | 1280 Variable *T2 = makeReg(IceType_v4i32); |
| 1261 Variable *T3 = makeReg(IceType_v4i32); | 1281 Variable *T3 = makeReg(IceType_v4i32); |
| 1262 Variable *T4 = makeReg(IceType_v4i32); | 1282 Variable *T4 = makeReg(IceType_v4i32); |
| 1263 _movp(T1, Src0); | 1283 _movp(T1, Src0); |
| 1264 // TODO(wala): ALIGNHACK: Replace Src0R with Src0 and Src1R | 1284 // TODO(wala): ALIGNHACK: Replace Src0R with Src0 and Src1R |
| 1265 // with Src1 after stack operand alignment support is | 1285 // with Src1 after stack operand alignment support is |
| 1266 // implemented. | 1286 // implemented. |
| 1267 Variable *Src0R = LEGAL_HACK(Src0); | 1287 Variable *Src0R = LEGAL_HACK(Src0); |
| 1268 Variable *Src1R = LEGAL_HACK(Src1); | 1288 Variable *Src1R = LEGAL_HACK(Src1); |
| 1269 _pshufd(T2, Src0R, Mask1030); | 1289 _pshufd(T2, Src0R, Mask1030); |
| 1270 _pshufd(T3, Src1R, Mask1030); | 1290 _pshufd(T3, Src1R, Mask1030); |
| 1271 _pmuludq(T1, Src1R); | 1291 _pmuludq(T1, Src1R); |
| 1272 _pmuludq(T2, T3); | 1292 _pmuludq(T2, T3); |
| 1273 _shufps(T1, T2, Ctx->getConstantInt(IceType_i8, Mask0202)); | 1293 _shufps(T1, T2, Ctx->getConstantInt(IceType_i8, Mask0202)); |
| 1274 _pshufd(T4, T1, Ctx->getConstantInt(IceType_i8, Mask0213)); | 1294 _pshufd(T4, T1, Ctx->getConstantInt(IceType_i8, Mask0213)); |
| 1275 _movp(Dest, T4); | 1295 _movp(Dest, T4); |
| 1276 } else if (Dest->getType() == IceType_v8i16) { | |
| 1277 Variable *T = makeReg(IceType_v8i16); | |
| 1278 _movp(T, Src0); | |
| 1279 _pmullw(T, legalizeToVar(Src1)); | |
| 1280 _movp(Dest, T); | |
| 1281 } else { | 1296 } else { |
| 1282 assert(Dest->getType() == IceType_v16i8); | 1297 assert(Dest->getType() == IceType_v16i8); |
| 1283 // Sz_mul_v16i8 | 1298 // Sz_mul_v16i8 |
| 1284 const IceString Helper = "Sz_mul_v16i8"; | 1299 const IceString Helper = "Sz_mul_v16i8"; |
| 1285 const SizeT MaxSrcs = 2; | 1300 const SizeT MaxSrcs = 2; |
| 1286 InstCall *Call = makeHelperCall(Helper, Dest, MaxSrcs); | 1301 InstCall *Call = makeHelperCall(Helper, Dest, MaxSrcs); |
| 1287 Call->addArg(Src0); | 1302 Call->addArg(Src0); |
| 1288 Call->addArg(Src1); | 1303 Call->addArg(Src1); |
| 1289 lowerCall(Call); | 1304 lowerCall(Call); |
| 1290 } | 1305 } |
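The Mul lowering above now has three paths: pmullw for v8i16 (plain SSE2), the new SSE4.1 shortcut for v4i32 (presumably pmulld via _pmull), and the pmuludq/shuffle emulation otherwise. A sketch of the same three paths in intrinsics form (illustrative only, not Subzero code; the SSE2 fallback uses a common equivalent of the shuffle sequence in the comment above rather than the exact pshufd/shufps order):

    #include <emmintrin.h> // SSE2: _mm_mullo_epi16, _mm_mul_epu32
    #include <smmintrin.h> // SSE4.1: _mm_mullo_epi32 (pmulld)

    __m128i mulV8i16(__m128i A, __m128i B) { return _mm_mullo_epi16(A, B); }

    __m128i mulV4i32SSE41(__m128i A, __m128i B) { return _mm_mullo_epi32(A, B); }

    __m128i mulV4i32SSE2(__m128i A, __m128i B) {
      // pmuludq multiplies lanes 0 and 2 into 64-bit products; shifting the
      // inputs right by 4 bytes does the same for lanes 1 and 3.
      __m128i Even = _mm_mul_epu32(A, B);
      __m128i Odd = _mm_mul_epu32(_mm_srli_si128(A, 4), _mm_srli_si128(B, 4));
      // Gather the low 32 bits of each product back into lane order.
      return _mm_unpacklo_epi32(_mm_shuffle_epi32(Even, _MM_SHUFFLE(0, 0, 2, 0)),
                                _mm_shuffle_epi32(Odd, _MM_SHUFFLE(0, 0, 2, 0)));
    }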
| (...skipping 857 matching lines...) |
| 2148 // Only constant indices are allowed in PNaCl IR. | 2163 // Only constant indices are allowed in PNaCl IR. |
| 2149 assert(ElementIndex); | 2164 assert(ElementIndex); |
| 2150 | 2165 |
| 2151 unsigned Index = ElementIndex->getValue(); | 2166 unsigned Index = ElementIndex->getValue(); |
| 2152 Type Ty = SourceVectOperand->getType(); | 2167 Type Ty = SourceVectOperand->getType(); |
| 2153 Type ElementTy = typeElementType(Ty); | 2168 Type ElementTy = typeElementType(Ty); |
| 2154 Type InVectorElementTy = getInVectorElementType(Ty); | 2169 Type InVectorElementTy = getInVectorElementType(Ty); |
| 2155 Variable *ExtractedElement = makeReg(InVectorElementTy); | 2170 Variable *ExtractedElement = makeReg(InVectorElementTy); |
| 2156 | 2171 |
| 2157 // TODO(wala): Determine the best lowering sequences for each type. | 2172 // TODO(wala): Determine the best lowering sequences for each type. |
| 2158 if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) { | 2173 bool CanUsePextr = |
| 2159 // Lower extractelement operations where the element is 32 bits | 2174 Ty == IceType_v8i16 || Ty == IceType_v8i1 || InstructionSet >= SSE4_1; |
| 2160 // wide with pshufd. | 2175 if (CanUsePextr && Ty != IceType_v4f32) { |
| 2161 // TODO(wala): SSE4.1 has extractps and pextrd | 2176 // Use pextrb, pextrw, or pextrd. |
| 2177 Constant *Mask = Ctx->getConstantInt(IceType_i8, Index); |
| 2178 Variable *SourceVectR = legalizeToVar(SourceVectOperand); |
| 2179 _pextr(ExtractedElement, SourceVectR, Mask); |
| 2180 } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) { |
| 2181 // Use pshufd and movd/movss. |
| 2162 // | 2182 // |
| 2163 // ALIGNHACK: Force vector operands to registers in instructions that | 2183 // ALIGNHACK: Force vector operands to registers in instructions that |
| 2164 // require aligned memory operands until support for stack alignment | 2184 // require aligned memory operands until support for stack alignment |
| 2165 // is implemented. | 2185 // is implemented. |
| 2166 #define ALIGN_HACK(Vect) legalizeToVar((Vect)) | 2186 #define ALIGN_HACK(Vect) legalizeToVar((Vect)) |
| 2167 Variable *T = NULL; | 2187 Variable *T = NULL; |
| 2168 if (Index) { | 2188 if (Index) { |
| 2169 // The shuffle only needs to occur if the element to be extracted | 2189 // The shuffle only needs to occur if the element to be extracted |
| 2170 // is not at the lowest index. | 2190 // is not at the lowest index. |
| 2171 Constant *Mask = Ctx->getConstantInt(IceType_i8, Index); | 2191 Constant *Mask = Ctx->getConstantInt(IceType_i8, Index); |
| 2172 T = makeReg(Ty); | 2192 T = makeReg(Ty); |
| 2173 _pshufd(T, ALIGN_HACK(SourceVectOperand), Mask); | 2193 _pshufd(T, ALIGN_HACK(SourceVectOperand), Mask); |
| 2174 } else { | 2194 } else { |
| 2175 T = legalizeToVar(SourceVectOperand); | 2195 T = legalizeToVar(SourceVectOperand); |
| 2176 } | 2196 } |
| 2177 | 2197 |
| 2178 if (InVectorElementTy == IceType_i32) { | 2198 if (InVectorElementTy == IceType_i32) { |
| 2179 _movd(ExtractedElement, T); | 2199 _movd(ExtractedElement, T); |
| 2180 } else { // Ty == IceType_f32 | 2200 } else { // Ty == IceType_f32 |
| 2181 // TODO(wala): _movss is only used here because _mov does not | 2201 // TODO(wala): _movss is only used here because _mov does not |
| 2182 // allow a vector source and a scalar destination. _mov should be | 2202 // allow a vector source and a scalar destination. _mov should be |
| 2183 // able to be used here. | 2203 // able to be used here. |
| 2184 // _movss is a binary instruction, so the FakeDef is needed to | 2204 // _movss is a binary instruction, so the FakeDef is needed to |
| 2185 // keep the live range analysis consistent. | 2205 // keep the live range analysis consistent. |
| 2186 Context.insert(InstFakeDef::create(Func, ExtractedElement)); | 2206 Context.insert(InstFakeDef::create(Func, ExtractedElement)); |
| 2187 _movss(ExtractedElement, T); | 2207 _movss(ExtractedElement, T); |
| 2188 } | 2208 } |
| 2189 #undef ALIGN_HACK | 2209 #undef ALIGN_HACK |
| 2190 } else if (Ty == IceType_v8i16 || Ty == IceType_v8i1) { | |
| 2191 Constant *Mask = Ctx->getConstantInt(IceType_i8, Index); | |
| 2192 _pextrw(ExtractedElement, legalizeToVar(SourceVectOperand), Mask); | |
| 2193 } else { | 2210 } else { |
| 2194 assert(Ty == IceType_v16i8 || Ty == IceType_v16i1); | 2211 assert(Ty == IceType_v16i8 || Ty == IceType_v16i1); |
| 2195 // Spill the value to a stack slot and do the extraction in memory. | 2212 // Spill the value to a stack slot and do the extraction in memory. |
| 2196 // TODO(wala): SSE4.1 has pextrb. | |
| 2197 // | 2213 // |
| 2198 // TODO(wala): use legalize(SourceVectOperand, Legal_Mem) when | 2214 // TODO(wala): use legalize(SourceVectOperand, Legal_Mem) when |
| 2199 // support for legalizing to mem is implemented. | 2215 // support for legalizing to mem is implemented. |
| 2200 Variable *Slot = Func->makeVariable(Ty, Context.getNode()); | 2216 Variable *Slot = Func->makeVariable(Ty, Context.getNode()); |
| 2201 Slot->setWeight(RegWeight::Zero); | 2217 Slot->setWeight(RegWeight::Zero); |
| 2202 _movp(Slot, legalizeToVar(SourceVectOperand)); | 2218 _movp(Slot, legalizeToVar(SourceVectOperand)); |
| 2203 | 2219 |
| 2204 // Compute the location of the element in memory. | 2220 // Compute the location of the element in memory. |
| 2205 unsigned Offset = Index * typeWidthInBytes(InVectorElementTy); | 2221 unsigned Offset = Index * typeWidthInBytes(InVectorElementTy); |
| 2206 OperandX8632Mem *Loc = | 2222 OperandX8632Mem *Loc = |
| (...skipping 325 matching lines...) |
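The CanUsePextr test in the extractelement lowering above reflects that pextrw already exists in SSE2, while pextrb and pextrd arrived with SSE4.1. The same distinction in intrinsics form (a sketch, not Subzero code):

    #include <emmintrin.h> // SSE2: _mm_extract_epi16 (pextrw)
    #include <smmintrin.h> // SSE4.1: _mm_extract_epi8 / _mm_extract_epi32

    int extractWord3(__m128i V) { return _mm_extract_epi16(V, 3); }  // SSE2
    int extractByte5(__m128i V) { return _mm_extract_epi8(V, 5); }   // SSE4.1
    int extractDword2(__m128i V) { return _mm_extract_epi32(V, 2); } // SSE4.1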
| 2532 // Expand the element to the appropriate size for it to be inserted | 2548 // Expand the element to the appropriate size for it to be inserted |
| 2533 // in the vector. | 2549 // in the vector. |
| 2534 Variable *Expanded = | 2550 Variable *Expanded = |
| 2535 Func->makeVariable(InVectorElementTy, Context.getNode()); | 2551 Func->makeVariable(InVectorElementTy, Context.getNode()); |
| 2536 InstCast *Cast = | 2552 InstCast *Cast = |
| 2537 InstCast::create(Func, InstCast::Zext, Expanded, ElementToInsert); | 2553 InstCast::create(Func, InstCast::Zext, Expanded, ElementToInsert); |
| 2538 lowerCast(Cast); | 2554 lowerCast(Cast); |
| 2539 ElementToInsert = Expanded; | 2555 ElementToInsert = Expanded; |
| 2540 } | 2556 } |
| 2541 | 2557 |
| 2542 if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) { | 2558 if (Ty == IceType_v8i16 || Ty == IceType_v8i1 || InstructionSet >= SSE4_1) { |
| 2543 // Lower insertelement with 32-bit wide elements using shufps or | 2559 // Use insertps, pinsrb, pinsrw, or pinsrd. |
| 2544 // movss. | 2560 Operand *Element = legalize(ElementToInsert, Legal_Mem | Legal_Reg); |
| 2545 // TODO(wala): SSE4.1 has pinsrd and insertps. | 2561 Variable *T = makeReg(Ty); |
| 2562 _movp(T, SourceVectOperand); |
| 2563 if (Ty == IceType_v4f32) |
| 2564 _insertps(T, Element, Ctx->getConstantInt(IceType_i8, Index << 4)); |
| 2565 else |
| 2566 _pinsr(T, Element, Ctx->getConstantInt(IceType_i8, Index)); |
| 2567 _movp(Inst->getDest(), T); |
| 2568 } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) { |
| 2569 // Use shufps or movss. |
| 2546 Variable *Element = NULL; | 2570 Variable *Element = NULL; |
| 2547 if (InVectorElementTy == IceType_f32) { | 2571 if (InVectorElementTy == IceType_f32) { |
| 2548 // Element will be in an XMM register since it is floating point. | 2572 // Element will be in an XMM register since it is floating point. |
| 2549 Element = legalizeToVar(ElementToInsert); | 2573 Element = legalizeToVar(ElementToInsert); |
| 2550 } else { | 2574 } else { |
| 2551 // Copy an integer to an XMM register. | 2575 // Copy an integer to an XMM register. |
| 2552 Operand *T = legalize(ElementToInsert, Legal_Reg | Legal_Mem); | 2576 Operand *T = legalize(ElementToInsert, Legal_Reg | Legal_Mem); |
| 2553 Element = makeReg(Ty); | 2577 Element = makeReg(Ty); |
| 2554 _movd(Element, T); | 2578 _movd(Element, T); |
| 2555 } | 2579 } |
| (...skipping 44 matching lines...) |
| 2600 _shufps(Element, SourceVectOperand, Mask2Constant); | 2624 _shufps(Element, SourceVectOperand, Mask2Constant); |
| 2601 _movp(Inst->getDest(), Element); | 2625 _movp(Inst->getDest(), Element); |
| 2602 } else { | 2626 } else { |
| 2603 Variable *T = makeReg(Ty); | 2627 Variable *T = makeReg(Ty); |
| 2604 _movp(T, SourceVectOperand); | 2628 _movp(T, SourceVectOperand); |
| 2605 _shufps(Element, T, Mask1Constant); | 2629 _shufps(Element, T, Mask1Constant); |
| 2606 _shufps(T, Element, Mask2Constant); | 2630 _shufps(T, Element, Mask2Constant); |
| 2607 _movp(Inst->getDest(), T); | 2631 _movp(Inst->getDest(), T); |
| 2608 } | 2632 } |
| 2609 #undef ALIGN_HACK | 2633 #undef ALIGN_HACK |
| 2610 } else if (Ty == IceType_v8i16 || Ty == IceType_v8i1) { | |
| 2611 Operand *Element = legalize(ElementToInsert, Legal_Mem | Legal_Reg); | |
| 2612 Variable *T = makeReg(Ty); | |
| 2613 _movp(T, SourceVectOperand); | |
| 2614 _pinsrw(T, Element, Ctx->getConstantInt(IceType_i8, Index)); | |
| 2615 _movp(Inst->getDest(), T); | |
| 2616 } else { | 2634 } else { |
| 2617 assert(Ty == IceType_v16i8 || Ty == IceType_v16i1); | 2635 assert(Ty == IceType_v16i8 || Ty == IceType_v16i1); |
| 2618 // Spill the value to a stack slot and perform the insertion in | 2636 // Spill the value to a stack slot and perform the insertion in |
| 2619 // memory. | 2637 // memory. |
| 2620 // TODO(wala): SSE4.1 has pinsrb. | |
| 2621 // | 2638 // |
| 2622 // TODO(wala): use legalize(SourceVectOperand, Legal_Mem) when | 2639 // TODO(wala): use legalize(SourceVectOperand, Legal_Mem) when |
| 2623 // support for legalizing to mem is implemented. | 2640 // support for legalizing to mem is implemented. |
| 2624 Variable *Slot = Func->makeVariable(Ty, Context.getNode()); | 2641 Variable *Slot = Func->makeVariable(Ty, Context.getNode()); |
| 2625 Slot->setWeight(RegWeight::Zero); | 2642 Slot->setWeight(RegWeight::Zero); |
| 2626 _movp(Slot, legalizeToVar(SourceVectOperand)); | 2643 _movp(Slot, legalizeToVar(SourceVectOperand)); |
| 2627 | 2644 |
| 2628 // Compute the location of the position to insert in memory. | 2645 // Compute the location of the position to insert in memory. |
| 2629 unsigned Offset = Index * typeWidthInBytes(InVectorElementTy); | 2646 unsigned Offset = Index * typeWidthInBytes(InVectorElementTy); |
| 2630 OperandX8632Mem *Loc = | 2647 OperandX8632Mem *Loc = |
| (...skipping 913 matching lines...) |
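The insertelement path above mirrors the extract case: pinsrw is plain SSE2, while pinsrb, pinsrd, and insertps need SSE4.1. The Index << 4 passed with insertps is the immediate's destination-lane field (bits [5:4]). A brief intrinsics sketch of both forms (illustrative only, not Subzero code):

    #include <emmintrin.h> // SSE2: _mm_insert_epi16 (pinsrw)
    #include <smmintrin.h> // SSE4.1: _mm_insert_ps, _mm_insert_epi8/epi32

    __m128i insertWord3(__m128i Dest, int Elem) {
      return _mm_insert_epi16(Dest, Elem, 3); // pinsrw: already in SSE2
    }

    __m128 insertLane2(__m128 Dest, __m128 Elem) {
      // insertps immediate: source lane in bits [7:6], destination lane in
      // [5:4], zero mask in [3:0]; 2 << 4 copies Elem[0] into Dest lane 2.
      return _mm_insert_ps(Dest, Elem, 2 << 4);
    }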
| 3544 Context.insert(InstFakeUse::create(Func, esp)); | 3561 Context.insert(InstFakeUse::create(Func, esp)); |
| 3545 } | 3562 } |
| 3546 | 3563 |
| 3547 void TargetX8632::lowerSelect(const InstSelect *Inst) { | 3564 void TargetX8632::lowerSelect(const InstSelect *Inst) { |
| 3548 Variable *Dest = Inst->getDest(); | 3565 Variable *Dest = Inst->getDest(); |
| 3549 Operand *SrcT = Inst->getTrueOperand(); | 3566 Operand *SrcT = Inst->getTrueOperand(); |
| 3550 Operand *SrcF = Inst->getFalseOperand(); | 3567 Operand *SrcF = Inst->getFalseOperand(); |
| 3551 Operand *Condition = Inst->getCondition(); | 3568 Operand *Condition = Inst->getCondition(); |
| 3552 | 3569 |
| 3553 if (isVectorType(Dest->getType())) { | 3570 if (isVectorType(Dest->getType())) { |
| 3554 // a=d?b:c ==> d=sext(d); a=(b&d)|(c&~d) | |
| 3555 // TODO(wala): SSE4.1 has blendvps and pblendvb. SSE4.1 also has | |
| 3556 // blendps and pblendw for constant condition operands. | |
| 3557 Type SrcTy = SrcT->getType(); | 3571 Type SrcTy = SrcT->getType(); |
| 3558 Variable *T = makeReg(SrcTy); | 3572 Variable *T = makeReg(SrcTy); |
| 3573 // ALIGNHACK: Until stack alignment support is implemented, vector |
| 3574 // instructions need to have vector operands in registers. Once |
| 3575 // there is support for stack alignment, LEGAL_HACK can be removed. |
| 3576 #define LEGAL_HACK(Vect) legalizeToVar((Vect)) |
| 3577 if (InstructionSet >= SSE4_1) { |
| 3578 // TODO(wala): If the condition operand is a constant, use blendps |
| 3579 // or pblendw. |
| 3580 // |
| 3581 // Use blendvps or pblendvb to implement select. |
| 3582 if (SrcTy == IceType_v4i1 || SrcTy == IceType_v4i32 || |
| 3583 SrcTy == IceType_v4f32) { |
| 3584 Variable *xmm0 = makeReg(IceType_v4i32, Reg_xmm0); |
| 3585 _movp(xmm0, Condition); |
| 3586 _psll(xmm0, Ctx->getConstantInt(IceType_i8, 31)); |
| 3587 _movp(T, SrcF); |
| 3588 _blendvps(T, LEGAL_HACK(SrcT), xmm0); |
| 3589 _movp(Dest, T); |
| 3590 } else { |
| 3591 assert(typeNumElements(SrcTy) == 8 || typeNumElements(SrcTy) == 16); |
| 3592 Type SignExtTy = Condition->getType() == IceType_v8i1 ? IceType_v8i16 |
| 3593 : IceType_v16i8; |
| 3594 Variable *xmm0 = makeReg(SignExtTy, Reg_xmm0); |
| 3595 lowerCast(InstCast::create(Func, InstCast::Sext, xmm0, Condition)); |
| 3596 _movp(T, SrcF); |
| 3597 _pblendvb(T, LEGAL_HACK(SrcT), xmm0); |
| 3598 _movp(Dest, T); |
| 3599 } |
| 3600 return; |
| 3601 } |
| 3602 // Lower select without SSE4.1: |
| 3603 // a=d?b:c ==> |
| 3604 // if elementtype(d) != i1: |
| 3605 // d=sext(d); |
| 3606 // a=(b&d)|(c&~d); |
| 3559 Variable *T2 = makeReg(SrcTy); | 3607 Variable *T2 = makeReg(SrcTy); |
| 3560 // Sign extend the condition operand if applicable. | 3608 // Sign extend the condition operand if applicable. |
| 3561 if (SrcTy == IceType_v4f32) { | 3609 if (SrcTy == IceType_v4f32) { |
| 3562 // The sext operation takes only integer arguments. | 3610 // The sext operation takes only integer arguments. |
| 3563 Variable *T3 = Func->makeVariable(IceType_v4i32, Context.getNode()); | 3611 Variable *T3 = Func->makeVariable(IceType_v4i32, Context.getNode()); |
| 3564 lowerCast(InstCast::create(Func, InstCast::Sext, T3, Condition)); | 3612 lowerCast(InstCast::create(Func, InstCast::Sext, T3, Condition)); |
| 3565 _movp(T, T3); | 3613 _movp(T, T3); |
| 3566 } else if (typeElementType(SrcTy) != IceType_i1) { | 3614 } else if (typeElementType(SrcTy) != IceType_i1) { |
| 3567 lowerCast(InstCast::create(Func, InstCast::Sext, T, Condition)); | 3615 lowerCast(InstCast::create(Func, InstCast::Sext, T, Condition)); |
| 3568 } else { | 3616 } else { |
| 3569 _movp(T, Condition); | 3617 _movp(T, Condition); |
| 3570 } | 3618 } |
| 3571 // ALIGNHACK: Until stack alignment support is implemented, the | |
| 3572 // bitwise vector instructions need to have both operands in | |
| 3573 // registers. Once there is support for stack alignment, LEGAL_HACK | |
| 3574 // can be removed. | |
| 3575 #define LEGAL_HACK(Vect) legalizeToVar((Vect)) | |
| 3576 _movp(T2, T); | 3619 _movp(T2, T); |
| 3577 _pand(T, LEGAL_HACK(SrcT)); | 3620 _pand(T, LEGAL_HACK(SrcT)); |
| 3578 _pandn(T2, LEGAL_HACK(SrcF)); | 3621 _pandn(T2, LEGAL_HACK(SrcF)); |
| 3579 _por(T, T2); | 3622 _por(T, T2); |
| 3580 _movp(Dest, T); | 3623 _movp(Dest, T); |
| 3581 #undef LEGAL_HACK | 3624 #undef LEGAL_HACK |
| 3582 | 3625 |
| 3583 return; | 3626 return; |
| 3584 } | 3627 } |
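The two vector-select lowerings above come down to the same per-lane rule: with SSE4.1, blendvps/pblendvb pick the true operand wherever the mask's sign bit is set (hence the psll by 31 on the i1 condition), while without SSE4.1 the sign-extended condition is used as a full mask via pand/pandn/por. A scalar model of one 32-bit lane (a sketch, not Subzero code):

    #include <cstdint>

    // SSE2 path: sign-extend the 1-bit condition to an all-ones/all-zeros
    // mask, then select as (T & mask) | (F & ~mask).
    uint32_t selectLaneSSE2(uint32_t CondBit, uint32_t T, uint32_t F) {
      uint32_t Mask = CondBit ? 0xFFFFFFFFu : 0u; // sext
      return (T & Mask) | (F & ~Mask);            // pand / pandn / por
    }

    // SSE4.1 path: psll by 31 moves the condition into the sign bit, and
    // blendvps keeps the false operand unless that sign bit is set.
    uint32_t selectLaneSSE41(uint32_t CondBit, uint32_t T, uint32_t F) {
      uint32_t Shifted = CondBit << 31;
      return (Shifted & 0x80000000u) ? T : F;
    }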
| 3585 | 3628 |
| (...skipping 504 matching lines...) |
| 4090 for (SizeT i = 0; i < Size; ++i) { | 4133 for (SizeT i = 0; i < Size; ++i) { |
| 4091 Str << "\t.byte\t" << (((unsigned)Data[i]) & 0xff) << "\n"; | 4134 Str << "\t.byte\t" << (((unsigned)Data[i]) & 0xff) << "\n"; |
| 4092 } | 4135 } |
| 4093 Str << "\t.size\t" << MangledName << ", " << Size << "\n"; | 4136 Str << "\t.size\t" << MangledName << ", " << Size << "\n"; |
| 4094 } | 4137 } |
| 4095 Str << "\t" << (IsInternal ? ".local" : ".global") << "\t" << MangledName | 4138 Str << "\t" << (IsInternal ? ".local" : ".global") << "\t" << MangledName |
| 4096 << "\n"; | 4139 << "\n"; |
| 4097 } | 4140 } |
| 4098 | 4141 |
| 4099 } // end of namespace Ice | 4142 } // end of namespace Ice |