OLD | NEW |
---|---|
1 //===- subzero/src/IceTargetLoweringX86BaseImpl.h - x86 lowering -*- C++ -*-==// | 1 //===- subzero/src/IceTargetLoweringX86BaseImpl.h - x86 lowering -*- C++ -*-==// |
2 // | 2 // |
3 // The Subzero Code Generator | 3 // The Subzero Code Generator |
4 // | 4 // |
5 // This file is distributed under the University of Illinois Open Source | 5 // This file is distributed under the University of Illinois Open Source |
6 // License. See LICENSE.TXT for details. | 6 // License. See LICENSE.TXT for details. |
7 // | 7 // |
8 //===----------------------------------------------------------------------===// | 8 //===----------------------------------------------------------------------===// |
9 /// | 9 /// |
10 /// \file | 10 /// \file |
11 /// This file implements the TargetLoweringX86Base class, which | 11 /// This file implements the TargetLoweringX86Base class, which consists almost |
12 /// consists almost entirely of the lowering sequence for each | 12 /// entirely of the lowering sequence for each high-level instruction. |
13 /// high-level instruction. | |
14 /// | 13 /// |
15 //===----------------------------------------------------------------------===// | 14 //===----------------------------------------------------------------------===// |
16 | 15 |
17 #ifndef SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H | 16 #ifndef SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H |
18 #define SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H | 17 #define SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H |
19 | 18 |
20 #include "IceCfg.h" | 19 #include "IceCfg.h" |
21 #include "IceCfgNode.h" | 20 #include "IceCfgNode.h" |
22 #include "IceClFlags.h" | 21 #include "IceClFlags.h" |
23 #include "IceDefs.h" | 22 #include "IceDefs.h" |
(...skipping 32 matching lines...)
56 | 55 |
57 public: | 56 public: |
58 BoolFoldingEntry() = default; | 57 BoolFoldingEntry() = default; |
59 explicit BoolFoldingEntry(Inst *I); | 58 explicit BoolFoldingEntry(Inst *I); |
60 BoolFoldingEntry &operator=(const BoolFoldingEntry &) = default; | 59 BoolFoldingEntry &operator=(const BoolFoldingEntry &) = default; |
61 /// Instr is the instruction producing the i1-type variable of interest. | 60 /// Instr is the instruction producing the i1-type variable of interest. |
62 Inst *Instr = nullptr; | 61 Inst *Instr = nullptr; |
63 /// IsComplex is the cached result of BoolFolding::hasComplexLowering(Instr). | 62 /// IsComplex is the cached result of BoolFolding::hasComplexLowering(Instr). |
64 bool IsComplex = false; | 63 bool IsComplex = false; |
65 /// IsLiveOut is initialized conservatively to true, and is set to false when | 64 /// IsLiveOut is initialized conservatively to true, and is set to false when |
66 /// we encounter an instruction that ends Var's live range. We disable the | 65 /// we encounter an instruction that ends Var's live range. We disable the |
67 /// folding optimization when Var is live beyond this basic block. Note that | 66 /// folding optimization when Var is live beyond this basic block. Note that |
68 /// if liveness analysis is not performed (e.g. in Om1 mode), IsLiveOut will | 67 /// if liveness analysis is not performed (e.g. in Om1 mode), IsLiveOut will |
69 /// always be true and the folding optimization will never be performed. | 68 /// always be true and the folding optimization will never be performed. |
70 bool IsLiveOut = true; | 69 bool IsLiveOut = true; |
71 // NumUses counts the number of times Var is used as a source operand in the | 70 // NumUses counts the number of times Var is used as a source operand in the |
72 // basic block. If IsComplex is true and there is more than one use of Var, | 71 // basic block. If IsComplex is true and there is more than one use of Var, |
73 // then the folding optimization is disabled for Var. | 72 // then the folding optimization is disabled for Var. |
74 uint32_t NumUses = 0; | 73 uint32_t NumUses = 0; |
75 }; | 74 }; |
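Taken together, these fields gate the optimization. A minimal sketch of the checks they imply (hypothetical helper, template parameters omitted; the real logic is spread across BoolFolding's methods):

    // Illustrative only: mirrors the documented field semantics.
    static bool isFoldingCandidate(const BoolFoldingEntry &Entry) {
      if (Entry.Instr == nullptr) // previously marked invalid
        return false;
      if (Entry.IsLiveOut)        // dest may be live beyond this basic block
        return false;
      if (Entry.IsComplex && Entry.NumUses > 1) // would duplicate branches
        return false;
      return true;
    }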
76 | 75 |
77 template <class MachineTraits> class BoolFolding { | 76 template <class MachineTraits> class BoolFolding { |
78 public: | 77 public: |
79 enum BoolFoldingProducerKind { | 78 enum BoolFoldingProducerKind { |
80 PK_None, | 79 PK_None, |
81 // TODO(jpp): PK_Icmp32 is no longer meaningful. Rename to PK_IcmpNative. | 80 // TODO(jpp): PK_Icmp32 is no longer meaningful. Rename to PK_IcmpNative. |
82 PK_Icmp32, | 81 PK_Icmp32, |
(...skipping 76 matching lines...)
159 case InstCast::Zext: | 158 case InstCast::Zext: |
160 return CK_Zext; | 159 return CK_Zext; |
161 } | 160 } |
162 } | 161 } |
163 return CK_None; | 162 return CK_None; |
164 } | 163 } |
165 | 164 |
166 /// Returns true if the producing instruction has a "complex" lowering sequence. | 165 /// Returns true if the producing instruction has a "complex" lowering sequence. |
167 /// This generally means that its lowering sequence requires more than one | 166 /// This generally means that its lowering sequence requires more than one |
168 /// conditional branch, namely 64-bit integer compares and some floating-point | 167 /// conditional branch, namely 64-bit integer compares and some floating-point |
169 /// compares. When this is true, and there is more than one consumer, we prefer | 168 /// compares. When this is true, and there is more than one consumer, we prefer |
170 /// to disable the folding optimization because doing so minimizes branches. | 169 /// to disable the folding optimization because doing so minimizes branches. |
171 template <class MachineTraits> | 170 template <class MachineTraits> |
172 bool BoolFolding<MachineTraits>::hasComplexLowering(const Inst *Instr) { | 171 bool BoolFolding<MachineTraits>::hasComplexLowering(const Inst *Instr) { |
173 switch (getProducerKind(Instr)) { | 172 switch (getProducerKind(Instr)) { |
174 default: | 173 default: |
175 return false; | 174 return false; |
176 case PK_Icmp64: | 175 case PK_Icmp64: |
177 return true; | 176 return true; |
178 case PK_Fcmp: | 177 case PK_Fcmp: |
179 return MachineTraits::TableFcmp[llvm::cast<InstFcmp>(Instr)->getCondition()] | 178 return MachineTraits::TableFcmp[llvm::cast<InstFcmp>(Instr)->getCondition()] |
(...skipping 35 matching lines...)
215 } | 214 } |
216 for (auto &I : Producers) { | 215 for (auto &I : Producers) { |
217 // Ignore entries previously marked invalid. | 216 // Ignore entries previously marked invalid. |
218 if (I.second.Instr == nullptr) | 217 if (I.second.Instr == nullptr) |
219 continue; | 218 continue; |
220 // Disable the producer if its dest may be live beyond this block. | 219 // Disable the producer if its dest may be live beyond this block. |
221 if (I.second.IsLiveOut) { | 220 if (I.second.IsLiveOut) { |
222 setInvalid(I.first); | 221 setInvalid(I.first); |
223 continue; | 222 continue; |
224 } | 223 } |
225 // Mark as "dead" rather than outright deleting. This is so that other | 224 // Mark as "dead" rather than outright deleting. This is so that other |
226 // peephole style optimizations during or before lowering have access to | 225 // peephole style optimizations during or before lowering have access to |
227 // this instruction in undeleted form. See for example | 226 // this instruction in undeleted form. See for example |
228 // tryOptimizedCmpxchgCmpBr(). | 227 // tryOptimizedCmpxchgCmpBr(). |
229 I.second.Instr->setDead(); | 228 I.second.Instr->setDead(); |
230 } | 229 } |
231 } | 230 } |
232 | 231 |
233 template <class MachineTraits> | 232 template <class MachineTraits> |
234 const Inst * | 233 const Inst * |
235 BoolFolding<MachineTraits>::getProducerFor(const Operand *Opnd) const { | 234 BoolFolding<MachineTraits>::getProducerFor(const Operand *Opnd) const { |
236 auto *Var = llvm::dyn_cast<const Variable>(Opnd); | 235 auto *Var = llvm::dyn_cast<const Variable>(Opnd); |
237 if (Var == nullptr) | 236 if (Var == nullptr) |
(...skipping 58 matching lines...)
296 if (Func->hasError()) | 295 if (Func->hasError()) |
297 return; | 296 return; |
298 Func->deletePhis(); | 297 Func->deletePhis(); |
299 if (Func->hasError()) | 298 if (Func->hasError()) |
300 return; | 299 return; |
301 Func->dump("After Phi lowering"); | 300 Func->dump("After Phi lowering"); |
302 } | 301 } |
303 | 302 |
304 // Run this early so it can be used to focus optimizations on potentially hot | 303 // Run this early so it can be used to focus optimizations on potentially hot |
305 // code. | 304 // code. |
306 // TODO(stichnot,ascull): currently only used for regalloc, not expensive high | 305 // TODO(stichnot,ascull): currently only used for regalloc, not |
307 // level optimizations which could be focused on potentially hot code. | 306 // expensive high level optimizations which could be focused on potentially |
| 307 // hot code. |
308 Func->computeLoopNestDepth(); | 308 Func->computeLoopNestDepth(); |
309 Func->dump("After loop nest depth analysis"); | 309 Func->dump("After loop nest depth analysis"); |
310 | 310 |
311 // Address mode optimization. | 311 // Address mode optimization. |
312 Func->getVMetadata()->init(VMK_SingleDefs); | 312 Func->getVMetadata()->init(VMK_SingleDefs); |
313 Func->doAddressOpt(); | 313 Func->doAddressOpt(); |
314 | 314 |
315 // Find read-modify-write opportunities. Do this after address mode | 315 // Find read-modify-write opportunities. Do this after address mode |
316 // optimization so that doAddressOpt() doesn't need to be applied to RMW | 316 // optimization so that doAddressOpt() doesn't need to be applied to RMW |
317 // instructions as well. | 317 // instructions as well. |
318 findRMW(); | 318 findRMW(); |
319 Func->dump("After RMW transform"); | 319 Func->dump("After RMW transform"); |
320 | 320 |
321 // Argument lowering | 321 // Argument lowering |
322 Func->doArgLowering(); | 322 Func->doArgLowering(); |
323 | 323 |
324 // Target lowering. This requires liveness analysis for some parts of the | 324 // Target lowering. This requires liveness analysis for some parts of the |
325 // lowering decisions, such as compare/branch fusing. If non-lightweight | 325 // lowering decisions, such as compare/branch fusing. If non-lightweight |
326 // liveness analysis is used, the instructions need to be renumbered first. | 326 // liveness analysis is used, the instructions need to be renumbered first. |
327 // TODO: This renumbering should only be necessary if we're actually | 327 // TODO: This renumbering should only be necessary if we're actually |
328 // calculating live intervals, which we only do for register allocation. | 328 // calculating live intervals, which we only do for register allocation. |
329 Func->renumberInstructions(); | 329 Func->renumberInstructions(); |
330 if (Func->hasError()) | 330 if (Func->hasError()) |
331 return; | 331 return; |
332 | 332 |
333 // TODO: It should be sufficient to use the fastest liveness calculation, i.e. | 333 // TODO: It should be sufficient to use the fastest liveness calculation, |
334 // livenessLightweight(). However, for some reason that slows down the rest | 334 // i.e. livenessLightweight(). However, for some reason that slows down the |
335 // of the translation. Investigate. | 335 // rest of the translation. Investigate. |
336 Func->liveness(Liveness_Basic); | 336 Func->liveness(Liveness_Basic); |
337 if (Func->hasError()) | 337 if (Func->hasError()) |
338 return; | 338 return; |
339 Func->dump("After x86 address mode opt"); | 339 Func->dump("After x86 address mode opt"); |
340 | 340 |
341 // Disable constant blinding or pooling for load optimization. | 341 // Disable constant blinding or pooling for load optimization. |
342 { | 342 { |
343 BoolFlagSaver B(RandomizationPoolingPaused, true); | 343 BoolFlagSaver B(RandomizationPoolingPaused, true); |
344 doLoadOpt(); | 344 doLoadOpt(); |
345 } | 345 } |
346 Func->genCode(); | 346 Func->genCode(); |
347 if (Func->hasError()) | 347 if (Func->hasError()) |
348 return; | 348 return; |
349 Func->dump("After x86 codegen"); | 349 Func->dump("After x86 codegen"); |
350 | 350 |
351 // Register allocation. This requires instruction renumbering and full | 351 // Register allocation. This requires instruction renumbering and full |
352 // liveness analysis. Loops must be identified before liveness so variable | 352 // liveness analysis. Loops must be identified before liveness so variable |
353 // use weights are correct. | 353 // use weights are correct. |
354 Func->renumberInstructions(); | 354 Func->renumberInstructions(); |
355 if (Func->hasError()) | 355 if (Func->hasError()) |
356 return; | 356 return; |
357 Func->liveness(Liveness_Intervals); | 357 Func->liveness(Liveness_Intervals); |
358 if (Func->hasError()) | 358 if (Func->hasError()) |
359 return; | 359 return; |
360 // Validate the live range computations. The expensive validation call is | 360 // Validate the live range computations. The expensive validation call is |
361 // deliberately only made when assertions are enabled. | 361 // deliberately only made when assertions are enabled. |
362 assert(Func->validateLiveness()); | 362 assert(Func->validateLiveness()); |
363 // The post-codegen dump is done here, after liveness analysis and associated | 363 // The post-codegen dump is done here, after liveness analysis and associated |
364 // cleanup, to make the dump cleaner and more useful. | 364 // cleanup, to make the dump cleaner and more useful. |
365 Func->dump("After initial x8632 codegen"); | 365 Func->dump("After initial x8632 codegen"); |
366 Func->getVMetadata()->init(VMK_All); | 366 Func->getVMetadata()->init(VMK_All); |
367 regAlloc(RAK_Global); | 367 regAlloc(RAK_Global); |
368 if (Func->hasError()) | 368 if (Func->hasError()) |
369 return; | 369 return; |
370 Func->dump("After linear scan regalloc"); | 370 Func->dump("After linear scan regalloc"); |
371 | 371 |
372 if (Ctx->getFlags().getPhiEdgeSplit()) { | 372 if (Ctx->getFlags().getPhiEdgeSplit()) { |
373 Func->advancedPhiLowering(); | 373 Func->advancedPhiLowering(); |
374 Func->dump("After advanced Phi lowering"); | 374 Func->dump("After advanced Phi lowering"); |
375 } | 375 } |
376 | 376 |
377 // Stack frame mapping. | 377 // Stack frame mapping. |
378 Func->genFrame(); | 378 Func->genFrame(); |
379 if (Func->hasError()) | 379 if (Func->hasError()) |
380 return; | 380 return; |
381 Func->dump("After stack frame mapping"); | 381 Func->dump("After stack frame mapping"); |
382 | 382 |
383 Func->contractEmptyNodes(); | 383 Func->contractEmptyNodes(); |
384 Func->reorderNodes(); | 384 Func->reorderNodes(); |
385 | 385 |
386 // Shuffle basic block order if -reorder-basic-blocks is enabled. | 386 // Shuffle basic block order if -reorder-basic-blocks is enabled. |
387 Func->shuffleNodes(); | 387 Func->shuffleNodes(); |
388 | 388 |
389 // Branch optimization. This needs to be done just before code emission. In | 389 // Branch optimization. This needs to be done just before code emission. In |
390 // particular, no transformations that insert or reorder CfgNodes should be | 390 // particular, no transformations that insert or reorder CfgNodes should be |
391 // done after branch optimization. We go ahead and do it before nop insertion | 391 // done after branch optimization. We go ahead and do it before nop insertion |
392 // to reduce the amount of work needed to search for opportunities. | 392 // to reduce the amount of work needed to search for opportunities. |
393 Func->doBranchOpt(); | 393 Func->doBranchOpt(); |
394 Func->dump("After branch optimization"); | 394 Func->dump("After branch optimization"); |
395 | 395 |
396 // Nop insertion if -nop-insertion is enabled. | 396 // Nop insertion if -nop-insertion is enabled. |
397 Func->doNopInsertion(); | 397 Func->doNopInsertion(); |
398 | 398 |
399 // Mark nodes that require sandbox alignment. | 399 // Mark nodes that require sandbox alignment. |
400 if (Ctx->getFlags().getUseSandboxing()) | 400 if (Ctx->getFlags().getUseSandboxing()) |
401 Func->markNodesForSandboxing(); | 401 Func->markNodesForSandboxing(); |
(...skipping 86 matching lines...)
488 } | 488 } |
489 return false; | 489 return false; |
490 } | 490 } |
491 | 491 |
492 template <class Machine> void TargetX86Base<Machine>::findRMW() { | 492 template <class Machine> void TargetX86Base<Machine>::findRMW() { |
493 Func->dump("Before RMW"); | 493 Func->dump("Before RMW"); |
494 OstreamLocker L(Func->getContext()); | 494 OstreamLocker L(Func->getContext()); |
495 Ostream &Str = Func->getContext()->getStrDump(); | 495 Ostream &Str = Func->getContext()->getStrDump(); |
496 for (CfgNode *Node : Func->getNodes()) { | 496 for (CfgNode *Node : Func->getNodes()) { |
497 // Walk through the instructions, considering each sequence of 3 | 497 // Walk through the instructions, considering each sequence of 3 |
498 // instructions, and look for the particular RMW pattern. Note that this | 498 // instructions, and look for the particular RMW pattern. Note that this |
499 // search can be "broken" (false negatives) if there are intervening deleted | 499 // search can be "broken" (false negatives) if there are intervening |
500 // instructions, or intervening instructions that could be safely moved out | 500 // deleted instructions, or intervening instructions that could be safely |
501 // of the way to reveal an RMW pattern. | 501 // moved out of the way to reveal an RMW pattern. |
502 auto E = Node->getInsts().end(); | 502 auto E = Node->getInsts().end(); |
503 auto I1 = E, I2 = E, I3 = Node->getInsts().begin(); | 503 auto I1 = E, I2 = E, I3 = Node->getInsts().begin(); |
504 for (; I3 != E; I1 = I2, I2 = I3, ++I3) { | 504 for (; I3 != E; I1 = I2, I2 = I3, ++I3) { |
505 // Make I3 skip over deleted instructions. | 505 // Make I3 skip over deleted instructions. |
506 while (I3 != E && I3->isDeleted()) | 506 while (I3 != E && I3->isDeleted()) |
507 ++I3; | 507 ++I3; |
508 if (I1 == E || I2 == E || I3 == E) | 508 if (I1 == E || I2 == E || I3 == E) |
509 continue; | 509 continue; |
510 assert(!I1->isDeleted()); | 510 assert(!I1->isDeleted()); |
511 assert(!I2->isDeleted()); | 511 assert(!I2->isDeleted()); |
512 assert(!I3->isDeleted()); | 512 assert(!I3->isDeleted()); |
513 if (auto *Load = llvm::dyn_cast<InstLoad>(I1)) { | 513 if (auto *Load = llvm::dyn_cast<InstLoad>(I1)) { |
514 if (auto *Arith = llvm::dyn_cast<InstArithmetic>(I2)) { | 514 if (auto *Arith = llvm::dyn_cast<InstArithmetic>(I2)) { |
515 if (auto *Store = llvm::dyn_cast<InstStore>(I3)) { | 515 if (auto *Store = llvm::dyn_cast<InstStore>(I3)) { |
516 // Look for: | 516 // Look for: |
517 // a = Load addr | 517 // a = Load addr |
518 // b = <op> a, other | 518 // b = <op> a, other |
519 // Store b, addr | 519 // Store b, addr |
520 // Change to: | 520 // Change to: |
521 // a = Load addr | 521 // a = Load addr |
522 // b = <op> a, other | 522 // b = <op> a, other |
523 // x = FakeDef | 523 // x = FakeDef |
524 // RMW <op>, addr, other, x | 524 // RMW <op>, addr, other, x |
525 // b = Store b, addr, x | 525 // b = Store b, addr, x |
526 // Note that inferTwoAddress() makes sure setDestNonKillable() gets | 526 // Note that inferTwoAddress() makes sure setDestNonKillable() gets |
527 // called on the updated Store instruction, to avoid liveness | 527 // called on the updated Store instruction, to avoid liveness |
528 // problems later. | 528 // problems later. |
529 // | 529 // |
530 // With this transformation, the Store instruction acquires a Dest | 530 // With this transformation, the Store instruction acquires a Dest |
531 // variable and is now subject to dead code elimination if there are | 531 // variable and is now subject to dead code elimination if there |
532 // no more uses of "b". Variable "x" is a beacon for determining | 532 // are no more uses of "b". Variable "x" is a beacon for |
533 // whether the Store instruction gets dead-code eliminated. If the | 533 // determining whether the Store instruction gets dead-code |
534 // Store instruction is eliminated, then it must be the case that | 534 // eliminated. If the Store instruction is eliminated, then it |
535 // the RMW instruction ends x's live range, and therefore the RMW | 535 // must be the case that the RMW instruction ends x's live range, |
536 // instruction will be retained and later lowered. On the other | 536 // and therefore the RMW instruction will be retained and later |
537 // hand, if the RMW instruction does not end x's live range, then | 537 // lowered. On the other hand, if the RMW instruction does not end |
538 // the Store instruction must still be present, and therefore the | 538 // x's live range, then the Store instruction must still be |
539 // RMW instruction is ignored during lowering because it is | 539 // present, and therefore the RMW instruction is ignored during |
540 // redundant with the Store instruction. | 540 // lowering because it is redundant with the Store instruction. |
541 // | 541 // |
542 // Note that if "a" has further uses, the RMW transformation may | 542 // Note that if "a" has further uses, the RMW transformation may |
543 // still trigger, resulting in two loads and one store, which is | 543 // still trigger, resulting in two loads and one store, which is |
544 // worse than the original one load and one store. However, this is | 544 // worse than the original one load and one store. However, this |
545 // probably rare, and caching probably keeps it just as fast. | 545 // is probably rare, and caching probably keeps it just as fast. |
546 if (!isSameMemAddressOperand<Machine>(Load->getSourceAddress(), | 546 if (!isSameMemAddressOperand<Machine>(Load->getSourceAddress(), |
547 Store->getAddr())) | 547 Store->getAddr())) |
548 continue; | 548 continue; |
549 Operand *ArithSrcFromLoad = Arith->getSrc(0); | 549 Operand *ArithSrcFromLoad = Arith->getSrc(0); |
550 Operand *ArithSrcOther = Arith->getSrc(1); | 550 Operand *ArithSrcOther = Arith->getSrc(1); |
551 if (ArithSrcFromLoad != Load->getDest()) { | 551 if (ArithSrcFromLoad != Load->getDest()) { |
552 if (!Arith->isCommutative() || ArithSrcOther != Load->getDest()) | 552 if (!Arith->isCommutative() || ArithSrcOther != Load->getDest()) |
553 continue; | 553 continue; |
554 std::swap(ArithSrcFromLoad, ArithSrcOther); | 554 std::swap(ArithSrcFromLoad, ArithSrcOther); |
555 } | 555 } |
(...skipping 26 matching lines...)
582 } | 582 } |
583 | 583 |
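A concrete instance of the rewrite described in the comments above (names and constant illustrative): lowering `*p += 42` starts as

    a = Load p
    b = add a, 42
    Store b, p

and becomes

    a = Load p
    b = add a, 42
    x = FakeDef
    RMW add, p, 42, x
    b = Store b, p, x

If `b` has no further uses, the Store is dead-code eliminated, the RMW instruction then ends x's live range and is later lowered to something like `add [p], 42`; otherwise the Store survives and the RMW is ignored as redundant.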
584 // Converts a ConstantInteger32 operand into its constant value, or | 584 // Converts a ConstantInteger32 operand into its constant value, or |
585 // MemoryOrderInvalid if the operand is not a ConstantInteger32. | 585 // MemoryOrderInvalid if the operand is not a ConstantInteger32. |
586 inline uint64_t getConstantMemoryOrder(Operand *Opnd) { | 586 inline uint64_t getConstantMemoryOrder(Operand *Opnd) { |
587 if (auto Integer = llvm::dyn_cast<ConstantInteger32>(Opnd)) | 587 if (auto Integer = llvm::dyn_cast<ConstantInteger32>(Opnd)) |
588 return Integer->getValue(); | 588 return Integer->getValue(); |
589 return Intrinsics::MemoryOrderInvalid; | 589 return Intrinsics::MemoryOrderInvalid; |
590 } | 590 } |
591 | 591 |
592 /// Determines whether the dest of a Load instruction can be folded | 592 /// Determines whether the dest of a Load instruction can be folded into one of |
593 /// into one of the src operands of a 2-operand instruction. This is | 593 /// the src operands of a 2-operand instruction. This is true as long as the |
594 /// true as long as the load dest matches exactly one of the binary | 594 /// load dest matches exactly one of the binary instruction's src operands. |
595 /// instruction's src operands. Replaces Src0 or Src1 with LoadSrc if | 595 /// Replaces Src0 or Src1 with LoadSrc if the answer is true. |
596 /// the answer is true. | |
597 inline bool canFoldLoadIntoBinaryInst(Operand *LoadSrc, Variable *LoadDest, | 596 inline bool canFoldLoadIntoBinaryInst(Operand *LoadSrc, Variable *LoadDest, |
598 Operand *&Src0, Operand *&Src1) { | 597 Operand *&Src0, Operand *&Src1) { |
599 if (Src0 == LoadDest && Src1 != LoadDest) { | 598 if (Src0 == LoadDest && Src1 != LoadDest) { |
600 Src0 = LoadSrc; | 599 Src0 = LoadSrc; |
601 return true; | 600 return true; |
602 } | 601 } |
603 if (Src0 != LoadDest && Src1 == LoadDest) { | 602 if (Src0 != LoadDest && Src1 == LoadDest) { |
604 Src1 = LoadSrc; | 603 Src1 = LoadSrc; |
605 return true; | 604 return true; |
606 } | 605 } |
607 return false; | 606 return false; |
608 } | 607 } |
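Note the "exactly one" requirement: if both srcs are the load dest, e.g.

    a = load p
    b = add a, a   // would need [p] in both operand slots

neither branch matches and the helper returns false, so no fold happens.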
609 | 608 |
610 template <class Machine> void TargetX86Base<Machine>::doLoadOpt() { | 609 template <class Machine> void TargetX86Base<Machine>::doLoadOpt() { |
611 for (CfgNode *Node : Func->getNodes()) { | 610 for (CfgNode *Node : Func->getNodes()) { |
612 Context.init(Node); | 611 Context.init(Node); |
613 while (!Context.atEnd()) { | 612 while (!Context.atEnd()) { |
614 Variable *LoadDest = nullptr; | 613 Variable *LoadDest = nullptr; |
615 Operand *LoadSrc = nullptr; | 614 Operand *LoadSrc = nullptr; |
616 Inst *CurInst = Context.getCur(); | 615 Inst *CurInst = Context.getCur(); |
617 Inst *Next = Context.getNextInst(); | 616 Inst *Next = Context.getNextInst(); |
618 // Determine whether the current instruction is a Load | 617 // Determine whether the current instruction is a Load instruction or |
619 // instruction or equivalent. | 618 // equivalent. |
620 if (auto *Load = llvm::dyn_cast<InstLoad>(CurInst)) { | 619 if (auto *Load = llvm::dyn_cast<InstLoad>(CurInst)) { |
621 // An InstLoad always qualifies. | 620 // An InstLoad always qualifies. |
622 LoadDest = Load->getDest(); | 621 LoadDest = Load->getDest(); |
623 const bool DoLegalize = false; | 622 const bool DoLegalize = false; |
624 LoadSrc = formMemoryOperand(Load->getSourceAddress(), | 623 LoadSrc = formMemoryOperand(Load->getSourceAddress(), |
625 LoadDest->getType(), DoLegalize); | 624 LoadDest->getType(), DoLegalize); |
626 } else if (auto *Intrin = llvm::dyn_cast<InstIntrinsicCall>(CurInst)) { | 625 } else if (auto *Intrin = llvm::dyn_cast<InstIntrinsicCall>(CurInst)) { |
627 // An AtomicLoad intrinsic qualifies as long as it has a valid | 626 // An AtomicLoad intrinsic qualifies as long as it has a valid memory |
628 // memory ordering, and can be implemented in a single | 627 // ordering, and can be implemented in a single instruction (i.e., not |
629 // instruction (i.e., not i64 on x86-32). | 628 // i64 on x86-32). |
630 Intrinsics::IntrinsicID ID = Intrin->getIntrinsicInfo().ID; | 629 Intrinsics::IntrinsicID ID = Intrin->getIntrinsicInfo().ID; |
631 if (ID == Intrinsics::AtomicLoad && | 630 if (ID == Intrinsics::AtomicLoad && |
632 (Traits::Is64Bit || Intrin->getDest()->getType() != IceType_i64) && | 631 (Traits::Is64Bit || Intrin->getDest()->getType() != IceType_i64) && |
633 Intrinsics::isMemoryOrderValid( | 632 Intrinsics::isMemoryOrderValid( |
634 ID, getConstantMemoryOrder(Intrin->getArg(1)))) { | 633 ID, getConstantMemoryOrder(Intrin->getArg(1)))) { |
635 LoadDest = Intrin->getDest(); | 634 LoadDest = Intrin->getDest(); |
636 const bool DoLegalize = false; | 635 const bool DoLegalize = false; |
637 LoadSrc = formMemoryOperand(Intrin->getArg(0), LoadDest->getType(), | 636 LoadSrc = formMemoryOperand(Intrin->getArg(0), LoadDest->getType(), |
638 DoLegalize); | 637 DoLegalize); |
639 } | 638 } |
640 } | 639 } |
641 // A Load instruction can be folded into the following | 640 // A Load instruction can be folded into the following instruction only |
642 // instruction only if the following instruction ends the Load's | 641 // if the following instruction ends the Load's Dest variable's live |
643 // Dest variable's live range. | 642 // range. |
644 if (LoadDest && Next && Next->isLastUse(LoadDest)) { | 643 if (LoadDest && Next && Next->isLastUse(LoadDest)) { |
645 assert(LoadSrc); | 644 assert(LoadSrc); |
646 Inst *NewInst = nullptr; | 645 Inst *NewInst = nullptr; |
647 if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Next)) { | 646 if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Next)) { |
648 Operand *Src0 = Arith->getSrc(0); | 647 Operand *Src0 = Arith->getSrc(0); |
649 Operand *Src1 = Arith->getSrc(1); | 648 Operand *Src1 = Arith->getSrc(1); |
650 if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) { | 649 if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) { |
651 NewInst = InstArithmetic::create(Func, Arith->getOp(), | 650 NewInst = InstArithmetic::create(Func, Arith->getOp(), |
652 Arith->getDest(), Src0, Src1); | 651 Arith->getDest(), Src0, Src1); |
653 } | 652 } |
(...skipping 12 matching lines...)
666 Fcmp->getDest(), Src0, Src1); | 665 Fcmp->getDest(), Src0, Src1); |
667 } | 666 } |
668 } else if (auto *Select = llvm::dyn_cast<InstSelect>(Next)) { | 667 } else if (auto *Select = llvm::dyn_cast<InstSelect>(Next)) { |
669 Operand *Src0 = Select->getTrueOperand(); | 668 Operand *Src0 = Select->getTrueOperand(); |
670 Operand *Src1 = Select->getFalseOperand(); | 669 Operand *Src1 = Select->getFalseOperand(); |
671 if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) { | 670 if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) { |
672 NewInst = InstSelect::create(Func, Select->getDest(), | 671 NewInst = InstSelect::create(Func, Select->getDest(), |
673 Select->getCondition(), Src0, Src1); | 672 Select->getCondition(), Src0, Src1); |
674 } | 673 } |
675 } else if (auto *Cast = llvm::dyn_cast<InstCast>(Next)) { | 674 } else if (auto *Cast = llvm::dyn_cast<InstCast>(Next)) { |
676 // The load dest can always be folded into a Cast | 675 // The load dest can always be folded into a Cast instruction. |
677 // instruction. | |
678 Variable *Src0 = llvm::dyn_cast<Variable>(Cast->getSrc(0)); | 676 Variable *Src0 = llvm::dyn_cast<Variable>(Cast->getSrc(0)); |
679 if (Src0 == LoadDest) { | 677 if (Src0 == LoadDest) { |
680 NewInst = InstCast::create(Func, Cast->getCastKind(), | 678 NewInst = InstCast::create(Func, Cast->getCastKind(), |
681 Cast->getDest(), LoadSrc); | 679 Cast->getDest(), LoadSrc); |
682 } | 680 } |
683 } | 681 } |
684 if (NewInst) { | 682 if (NewInst) { |
685 CurInst->setDeleted(); | 683 CurInst->setDeleted(); |
686 Next->setDeleted(); | 684 Next->setDeleted(); |
687 Context.insert(NewInst); | 685 Context.insert(NewInst); |
688 // Update NewInst->LiveRangesEnded so that target lowering | 686 // Update NewInst->LiveRangesEnded so that target lowering may |
689 // may benefit. Also update NewInst->HasSideEffects. | 687 // benefit. Also update NewInst->HasSideEffects. |
690 NewInst->spliceLivenessInfo(Next, CurInst); | 688 NewInst->spliceLivenessInfo(Next, CurInst); |
691 } | 689 } |
692 } | 690 } |
693 Context.advanceCur(); | 691 Context.advanceCur(); |
694 Context.advanceNext(); | 692 Context.advanceNext(); |
695 } | 693 } |
696 } | 694 } |
697 Func->dump("After load optimization"); | 695 Func->dump("After load optimization"); |
698 } | 696 } |
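The net effect on a made-up snippet: when the load's dest dies at the very next instruction, the pair

    x = load p
    y = add x, 3   ; last use of x
      =>
    y = add [p], 3 ; load folded into a memory operand of the add

with spliceLivenessInfo() carrying the live-range-ending information over to the combined instruction.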
699 | 697 |
(...skipping 14 matching lines...)
714 if (Ty == IceType_void) | 712 if (Ty == IceType_void) |
715 Ty = IceType_i32; | 713 Ty = IceType_i32; |
716 if (PhysicalRegisters[Ty].empty()) | 714 if (PhysicalRegisters[Ty].empty()) |
717 PhysicalRegisters[Ty].resize(Traits::RegisterSet::Reg_NUM); | 715 PhysicalRegisters[Ty].resize(Traits::RegisterSet::Reg_NUM); |
718 assert(RegNum < PhysicalRegisters[Ty].size()); | 716 assert(RegNum < PhysicalRegisters[Ty].size()); |
719 Variable *Reg = PhysicalRegisters[Ty][RegNum]; | 717 Variable *Reg = PhysicalRegisters[Ty][RegNum]; |
720 if (Reg == nullptr) { | 718 if (Reg == nullptr) { |
721 Reg = Func->makeVariable(Ty); | 719 Reg = Func->makeVariable(Ty); |
722 Reg->setRegNum(RegNum); | 720 Reg->setRegNum(RegNum); |
723 PhysicalRegisters[Ty][RegNum] = Reg; | 721 PhysicalRegisters[Ty][RegNum] = Reg; |
724 // Specially mark esp as an "argument" so that it is considered | 722 // Specially mark esp as an "argument" so that it is considered live upon |
725 // live upon function entry. | 723 // function entry. |
726 if (RegNum == Traits::RegisterSet::Reg_esp) { | 724 if (RegNum == Traits::RegisterSet::Reg_esp) { |
727 Func->addImplicitArg(Reg); | 725 Func->addImplicitArg(Reg); |
728 Reg->setIgnoreLiveness(); | 726 Reg->setIgnoreLiveness(); |
729 } | 727 } |
730 } | 728 } |
731 return Reg; | 729 return Reg; |
732 } | 730 } |
733 | 731 |
734 template <class Machine> | 732 template <class Machine> |
735 IceString TargetX86Base<Machine>::getRegName(SizeT RegNum, Type Ty) const { | 733 IceString TargetX86Base<Machine>::getRegName(SizeT RegNum, Type Ty) const { |
(...skipping 39 matching lines...)
775 BaseRegNum = getFrameOrStackReg(); | 773 BaseRegNum = getFrameOrStackReg(); |
776 if (!hasFramePointer()) | 774 if (!hasFramePointer()) |
777 Offset += getStackAdjustment(); | 775 Offset += getStackAdjustment(); |
778 } | 776 } |
779 return typename Traits::Address( | 777 return typename Traits::Address( |
780 Traits::RegisterSet::getEncodedGPR(BaseRegNum), Offset); | 778 Traits::RegisterSet::getEncodedGPR(BaseRegNum), Offset); |
781 } | 779 } |
782 | 780 |
783 /// Helper function for addProlog(). | 781 /// Helper function for addProlog(). |
784 /// | 782 /// |
785 /// This assumes Arg is an argument passed on the stack. This sets the | 783 /// This assumes Arg is an argument passed on the stack. This sets the frame |
786 /// frame offset for Arg and updates InArgsSizeBytes according to Arg's | 784 /// offset for Arg and updates InArgsSizeBytes according to Arg's width. For an |
787 /// width. For an I64 arg that has been split into Lo and Hi components, | 785 /// I64 arg that has been split into Lo and Hi components, it calls itself |
788 /// it calls itself recursively on the components, taking care to handle | 786 /// recursively on the components, taking care to handle Lo first because of the |
789 /// Lo first because of the little-endian architecture. Lastly, this | 787 /// little-endian architecture. Lastly, this function generates an instruction |
790 /// function generates an instruction to copy Arg into its assigned | 788 /// to copy Arg into its assigned register if applicable. |
791 /// register if applicable. | |
792 template <class Machine> | 789 template <class Machine> |
793 void TargetX86Base<Machine>::finishArgumentLowering(Variable *Arg, | 790 void TargetX86Base<Machine>::finishArgumentLowering(Variable *Arg, |
794 Variable *FramePtr, | 791 Variable *FramePtr, |
795 size_t BasicFrameOffset, | 792 size_t BasicFrameOffset, |
796 size_t &InArgsSizeBytes) { | 793 size_t &InArgsSizeBytes) { |
797 Variable *Lo = Arg->getLo(); | 794 Variable *Lo = Arg->getLo(); |
798 Variable *Hi = Arg->getHi(); | 795 Variable *Hi = Arg->getHi(); |
799 Type Ty = Arg->getType(); | 796 Type Ty = Arg->getType(); |
800 if (!Traits::Is64Bit && Lo && Hi && Ty == IceType_i64) { | 797 if (!Traits::Is64Bit && Lo && Hi && Ty == IceType_i64) { |
801 assert(Lo->getType() != IceType_i64); // don't want infinite recursion | 798 assert(Lo->getType() != IceType_i64); // don't want infinite recursion |
(...skipping 10 matching lines...) Expand all Loading... | |
812 if (Arg->hasReg()) { | 809 if (Arg->hasReg()) { |
813 assert(Ty != IceType_i64 || Traits::Is64Bit); | 810 assert(Ty != IceType_i64 || Traits::Is64Bit); |
814 typename Traits::X86OperandMem *Mem = Traits::X86OperandMem::create( | 811 typename Traits::X86OperandMem *Mem = Traits::X86OperandMem::create( |
815 Func, Ty, FramePtr, Ctx->getConstantInt32(Arg->getStackOffset())); | 812 Func, Ty, FramePtr, Ctx->getConstantInt32(Arg->getStackOffset())); |
816 if (isVectorType(Arg->getType())) { | 813 if (isVectorType(Arg->getType())) { |
817 _movp(Arg, Mem); | 814 _movp(Arg, Mem); |
818 } else { | 815 } else { |
819 _mov(Arg, Mem); | 816 _mov(Arg, Mem); |
820 } | 817 } |
821 // This argument-copying instruction uses an explicit Traits::X86OperandMem | 818 // This argument-copying instruction uses an explicit Traits::X86OperandMem |
822 // operand instead of a Variable, so its fill-from-stack operation has to be | 819 // operand instead of a Variable, so its fill-from-stack operation has to |
823 // tracked separately for statistics. | 820 // be tracked separately for statistics. |
824 Ctx->statsUpdateFills(); | 821 Ctx->statsUpdateFills(); |
825 } | 822 } |
826 } | 823 } |
827 | 824 |
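A worked layout for the recursive i64 case on x86-32 (offsets illustrative): if InArgsSizeBytes is 16 when the argument is reached, Lo is handled first and each i32 half advances InArgsSizeBytes by 4, so

    // Hypothetical in-args layout, InArgsSizeBytes == 16 on entry:
    //   Arg.Lo -> [FramePtr + BasicFrameOffset + 16]
    //   Arg.Hi -> [FramePtr + BasicFrameOffset + 20]

matching the little-endian low-half-first ordering noted above.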
828 template <class Machine> Type TargetX86Base<Machine>::stackSlotType() { | 825 template <class Machine> Type TargetX86Base<Machine>::stackSlotType() { |
829 return Traits::WordType; | 826 return Traits::WordType; |
830 } | 827 } |
831 | 828 |
832 template <class Machine> | 829 template <class Machine> |
833 template <typename T> | 830 template <typename T> |
834 typename std::enable_if<!T::Is64Bit, void>::type | 831 typename std::enable_if<!T::Is64Bit, void>::type |
835 TargetX86Base<Machine>::split64(Variable *Var) { | 832 TargetX86Base<Machine>::split64(Variable *Var) { |
836 switch (Var->getType()) { | 833 switch (Var->getType()) { |
837 default: | 834 default: |
838 return; | 835 return; |
839 case IceType_i64: | 836 case IceType_i64: |
840 // TODO: Only consider F64 if we need to push each half when | 837 // TODO: Only consider F64 if we need to push each half when passing as an |
841 // passing as an argument to a function call. Note that each half | 838 // argument to a function call. Note that each half is still typed as I32. |
842 // is still typed as I32. | |
843 case IceType_f64: | 839 case IceType_f64: |
844 break; | 840 break; |
845 } | 841 } |
846 Variable *Lo = Var->getLo(); | 842 Variable *Lo = Var->getLo(); |
847 Variable *Hi = Var->getHi(); | 843 Variable *Hi = Var->getHi(); |
848 if (Lo) { | 844 if (Lo) { |
849 assert(Hi); | 845 assert(Hi); |
850 return; | 846 return; |
851 } | 847 } |
852 assert(Hi == nullptr); | 848 assert(Hi == nullptr); |
(...skipping 86 matching lines...)
939 template <class Machine> | 935 template <class Machine> |
940 llvm::SmallBitVector | 936 llvm::SmallBitVector |
941 TargetX86Base<Machine>::getRegisterSet(RegSetMask Include, | 937 TargetX86Base<Machine>::getRegisterSet(RegSetMask Include, |
942 RegSetMask Exclude) const { | 938 RegSetMask Exclude) const { |
943 return Traits::getRegisterSet(Include, Exclude); | 939 return Traits::getRegisterSet(Include, Exclude); |
944 } | 940 } |
945 | 941 |
946 template <class Machine> | 942 template <class Machine> |
947 void TargetX86Base<Machine>::lowerAlloca(const InstAlloca *Inst) { | 943 void TargetX86Base<Machine>::lowerAlloca(const InstAlloca *Inst) { |
948 IsEbpBasedFrame = true; | 944 IsEbpBasedFrame = true; |
949 // Conservatively require the stack to be aligned. Some stack | 945 // Conservatively require the stack to be aligned. Some stack adjustment |
950 // adjustment operations implemented below assume that the stack is | 946 // operations implemented below assume that the stack is aligned before the |
951 // aligned before the alloca. All the alloca code ensures that the | 947 // alloca. All the alloca code ensures that the stack alignment is preserved |
952 // stack alignment is preserved after the alloca. The stack alignment | 948 // after the alloca. The stack alignment restriction can be relaxed in some |
953 // restriction can be relaxed in some cases. | 949 // cases. |
954 NeedsStackAlignment = true; | 950 NeedsStackAlignment = true; |
955 | 951 |
956 // TODO(stichnot): minimize the number of adjustments of esp, etc. | 952 // TODO(stichnot): minimize the number of adjustments of esp, etc. |
957 Variable *esp = getPhysicalRegister(Traits::RegisterSet::Reg_esp); | 953 Variable *esp = getPhysicalRegister(Traits::RegisterSet::Reg_esp); |
958 Operand *TotalSize = legalize(Inst->getSizeInBytes()); | 954 Operand *TotalSize = legalize(Inst->getSizeInBytes()); |
959 Variable *Dest = Inst->getDest(); | 955 Variable *Dest = Inst->getDest(); |
960 uint32_t AlignmentParam = Inst->getAlignInBytes(); | 956 uint32_t AlignmentParam = Inst->getAlignInBytes(); |
961 // For default align=0, set it to the real value 1, to avoid any | 957 // For default align=0, set it to the real value 1, to avoid any |
962 // bit-manipulation problems below. | 958 // bit-manipulation problems below. |
963 AlignmentParam = std::max(AlignmentParam, 1u); | 959 AlignmentParam = std::max(AlignmentParam, 1u); |
964 | 960 |
965 // LLVM enforces power of 2 alignment. | 961 // LLVM enforces power of 2 alignment. |
966 assert(llvm::isPowerOf2_32(AlignmentParam)); | 962 assert(llvm::isPowerOf2_32(AlignmentParam)); |
967 assert(llvm::isPowerOf2_32(Traits::X86_STACK_ALIGNMENT_BYTES)); | 963 assert(llvm::isPowerOf2_32(Traits::X86_STACK_ALIGNMENT_BYTES)); |
968 | 964 |
969 uint32_t Alignment = | 965 uint32_t Alignment = |
970 std::max(AlignmentParam, Traits::X86_STACK_ALIGNMENT_BYTES); | 966 std::max(AlignmentParam, Traits::X86_STACK_ALIGNMENT_BYTES); |
971 if (Alignment > Traits::X86_STACK_ALIGNMENT_BYTES) { | 967 if (Alignment > Traits::X86_STACK_ALIGNMENT_BYTES) { |
972 _and(esp, Ctx->getConstantInt32(-Alignment)); | 968 _and(esp, Ctx->getConstantInt32(-Alignment)); |
973 } | 969 } |
974 if (const auto *ConstantTotalSize = | 970 if (const auto *ConstantTotalSize = |
975 llvm::dyn_cast<ConstantInteger32>(TotalSize)) { | 971 llvm::dyn_cast<ConstantInteger32>(TotalSize)) { |
976 uint32_t Value = ConstantTotalSize->getValue(); | 972 uint32_t Value = ConstantTotalSize->getValue(); |
977 Value = Utils::applyAlignment(Value, Alignment); | 973 Value = Utils::applyAlignment(Value, Alignment); |
978 _sub(esp, Ctx->getConstantInt32(Value)); | 974 _sub(esp, Ctx->getConstantInt32(Value)); |
979 } else { | 975 } else { |
980 // Non-constant sizes need to be adjusted to the next highest | 976 // Non-constant sizes need to be adjusted to the next highest multiple of |
981 // multiple of the required alignment at runtime. | 977 // the required alignment at runtime. |
982 Variable *T = makeReg(IceType_i32); | 978 Variable *T = makeReg(IceType_i32); |
983 _mov(T, TotalSize); | 979 _mov(T, TotalSize); |
984 _add(T, Ctx->getConstantInt32(Alignment - 1)); | 980 _add(T, Ctx->getConstantInt32(Alignment - 1)); |
985 _and(T, Ctx->getConstantInt32(-Alignment)); | 981 _and(T, Ctx->getConstantInt32(-Alignment)); |
986 _sub(esp, T); | 982 _sub(esp, T); |
987 } | 983 } |
988 _mov(Dest, esp); | 984 _mov(Dest, esp); |
989 } | 985 } |
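Worked numbers for both paths (assuming Traits::X86_STACK_ALIGNMENT_BYTES == 16):

    // Constant size: alloca of 12 bytes with align 8
    //   Alignment = max(8, 16) = 16
    //   Value     = applyAlignment(12, 16) = 16   -> sub esp, 16
    // Non-constant size N:
    //   T = (N + 15) & -16                        -> add, and, then sub esp, T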
990 | 986 |
991 /// Strength-reduce scalar integer multiplication by a constant (for | 987 /// Strength-reduce scalar integer multiplication by a constant (for i32 or |
992 /// i32 or narrower) for certain constants. The lea instruction can be | 988 /// narrower) for certain constants. The lea instruction can be used to multiply |
993 /// used to multiply by 3, 5, or 9, and the shl instruction can be used | 989 /// by 3, 5, or 9, and the shl instruction can be used to multiply by powers of |
994 /// to multiply by powers of 2. These can be combined such that | 990 /// 2. These can be combined such that e.g. multiplying by 100 can be done as 2 |
995 /// e.g. multiplying by 100 can be done as 2 lea-based multiplies by 5, | 991 /// lea-based multiplies by 5, combined with left-shifting by 2. |
996 /// combined with left-shifting by 2. | |
997 template <class Machine> | 992 template <class Machine> |
998 bool TargetX86Base<Machine>::optimizeScalarMul(Variable *Dest, Operand *Src0, | 993 bool TargetX86Base<Machine>::optimizeScalarMul(Variable *Dest, Operand *Src0, |
999 int32_t Src1) { | 994 int32_t Src1) { |
1000 // Disable this optimization for Om1 and O0, just to keep things | 995 // Disable this optimization for Om1 and O0, just to keep things simple |
1001 // simple there. | 996 // there. |
1002 if (Ctx->getFlags().getOptLevel() < Opt_1) | 997 if (Ctx->getFlags().getOptLevel() < Opt_1) |
1003 return false; | 998 return false; |
1004 Type Ty = Dest->getType(); | 999 Type Ty = Dest->getType(); |
1005 Variable *T = nullptr; | 1000 Variable *T = nullptr; |
1006 if (Src1 == -1) { | 1001 if (Src1 == -1) { |
1007 _mov(T, Src0); | 1002 _mov(T, Src0); |
1008 _neg(T); | 1003 _neg(T); |
1009 _mov(Dest, T); | 1004 _mov(Dest, T); |
1010 return true; | 1005 return true; |
1011 } | 1006 } |
(...skipping 35 matching lines...)
1047 ++CountOps; | 1042 ++CountOps; |
1048 ++Count2; | 1043 ++Count2; |
1049 Src1 /= 2; | 1044 Src1 /= 2; |
1050 } else { | 1045 } else { |
1051 return false; | 1046 return false; |
1052 } | 1047 } |
1053 } | 1048 } |
1054 // Lea optimization only works for i16 and i32 types, not i8. | 1049 // Lea optimization only works for i16 and i32 types, not i8. |
1055 if (Ty != IceType_i16 && Ty != IceType_i32 && (Count3 || Count5 || Count9)) | 1050 if (Ty != IceType_i16 && Ty != IceType_i32 && (Count3 || Count5 || Count9)) |
1056 return false; | 1051 return false; |
1057 // Limit the number of lea/shl operations for a single multiply, to | 1052 // Limit the number of lea/shl operations for a single multiply, to a |
1058 // a somewhat arbitrary choice of 3. | 1053 // somewhat arbitrary choice of 3. |
1059 const uint32_t MaxOpsForOptimizedMul = 3; | 1054 const uint32_t MaxOpsForOptimizedMul = 3; |
1060 if (CountOps > MaxOpsForOptimizedMul) | 1055 if (CountOps > MaxOpsForOptimizedMul) |
1061 return false; | 1056 return false; |
1062 _mov(T, Src0); | 1057 _mov(T, Src0); |
1063 Constant *Zero = Ctx->getConstantZero(IceType_i32); | 1058 Constant *Zero = Ctx->getConstantZero(IceType_i32); |
1064 for (uint32_t i = 0; i < Count9; ++i) { | 1059 for (uint32_t i = 0; i < Count9; ++i) { |
1065 const uint16_t Shift = 3; // log2(9-1) | 1060 const uint16_t Shift = 3; // log2(9-1) |
1066 _lea(T, | 1061 _lea(T, |
1067 Traits::X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift)); | 1062 Traits::X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift)); |
1068 _set_dest_nonkillable(); | 1063 _set_dest_nonkillable(); |
(...skipping 25 matching lines...)
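A worked instance of the strength reduction documented above (illustrative): Src1 = 45 factors as 9 * 5, so CountOps = 2 and the emitted sequence is two lea-based multiplies:

    lea T, [T + T*8]   ; T *= 9  (Shift = 3 = log2(9-1))
    lea T, [T + T*4]   ; T *= 5  (Shift = 2 = log2(5-1))

Note that by the factorization loop's accounting, 100 = 5 * 5 * 2 * 2 charges four ops and exceeds MaxOpsForOptimizedMul, so the 2-lea-plus-shift sequence for 100 is possible in principle but not emitted under the current limit.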
1094 Operand *Src0 = legalize(Inst->getSrc(0)); | 1089 Operand *Src0 = legalize(Inst->getSrc(0)); |
1095 Operand *Src1 = legalize(Inst->getSrc(1)); | 1090 Operand *Src1 = legalize(Inst->getSrc(1)); |
1096 if (Inst->isCommutative()) { | 1091 if (Inst->isCommutative()) { |
1097 if (!llvm::isa<Variable>(Src0) && llvm::isa<Variable>(Src1)) | 1092 if (!llvm::isa<Variable>(Src0) && llvm::isa<Variable>(Src1)) |
1098 std::swap(Src0, Src1); | 1093 std::swap(Src0, Src1); |
1099 if (llvm::isa<Constant>(Src0) && !llvm::isa<Constant>(Src1)) | 1094 if (llvm::isa<Constant>(Src0) && !llvm::isa<Constant>(Src1)) |
1100 std::swap(Src0, Src1); | 1095 std::swap(Src0, Src1); |
1101 } | 1096 } |
1102 if (!Traits::Is64Bit && Dest->getType() == IceType_i64) { | 1097 if (!Traits::Is64Bit && Dest->getType() == IceType_i64) { |
1103 // These x86-32 helper-call-involved instructions are lowered in this | 1098 // These x86-32 helper-call-involved instructions are lowered in this |
1104 // separate switch. This is because loOperand() and hiOperand() | 1099 // separate switch. This is because loOperand() and hiOperand() may insert |
1105 // may insert redundant instructions for constant blinding and | 1100 // redundant instructions for constant blinding and pooling. Such redundant |
1106 // pooling. Such redundant instructions will fail liveness analysis | 1101 // instructions will fail liveness analysis under -Om1 setting. And, |
1107 // under -Om1 setting. And, actually these arguments do not need | 1102 // actually these arguments do not need to be processed with loOperand() |
1108 // to be processed with loOperand() and hiOperand() to be used. | 1103 // and hiOperand() to be used. |
1109 switch (Inst->getOp()) { | 1104 switch (Inst->getOp()) { |
1110 case InstArithmetic::Udiv: { | 1105 case InstArithmetic::Udiv: { |
1111 const SizeT MaxSrcs = 2; | 1106 const SizeT MaxSrcs = 2; |
1112 InstCall *Call = makeHelperCall(H_udiv_i64, Dest, MaxSrcs); | 1107 InstCall *Call = makeHelperCall(H_udiv_i64, Dest, MaxSrcs); |
1113 Call->addArg(Inst->getSrc(0)); | 1108 Call->addArg(Inst->getSrc(0)); |
1114 Call->addArg(Inst->getSrc(1)); | 1109 Call->addArg(Inst->getSrc(1)); |
1115 lowerCall(Call); | 1110 lowerCall(Call); |
1116 return; | 1111 return; |
1117 } | 1112 } |
1118 case InstArithmetic::Sdiv: { | 1113 case InstArithmetic::Sdiv: { |
(...skipping 90 matching lines...)
1209 // t4.hi += t2 | 1204 // t4.hi += t2 |
1210 // a.hi = t4.hi | 1205 // a.hi = t4.hi |
1211 // The mul instruction cannot take an immediate operand. | 1206 // The mul instruction cannot take an immediate operand. |
1212 Src1Lo = legalize(Src1Lo, Legal_Reg | Legal_Mem); | 1207 Src1Lo = legalize(Src1Lo, Legal_Reg | Legal_Mem); |
1213 _mov(T_1, Src0Hi); | 1208 _mov(T_1, Src0Hi); |
1214 _imul(T_1, Src1Lo); | 1209 _imul(T_1, Src1Lo); |
1215 _mov(T_2, Src1Hi); | 1210 _mov(T_2, Src1Hi); |
1216 _imul(T_2, Src0Lo); | 1211 _imul(T_2, Src0Lo); |
1217 _mov(T_3, Src0Lo, Traits::RegisterSet::Reg_eax); | 1212 _mov(T_3, Src0Lo, Traits::RegisterSet::Reg_eax); |
1218 _mul(T_4Lo, T_3, Src1Lo); | 1213 _mul(T_4Lo, T_3, Src1Lo); |
1219 // The mul instruction produces two dest variables, edx:eax. We | 1214 // The mul instruction produces two dest variables, edx:eax. We create a |
1220 // create a fake definition of edx to account for this. | 1215 // fake definition of edx to account for this. |
1221 Context.insert(InstFakeDef::create(Func, T_4Hi, T_4Lo)); | 1216 Context.insert(InstFakeDef::create(Func, T_4Hi, T_4Lo)); |
1222 _mov(DestLo, T_4Lo); | 1217 _mov(DestLo, T_4Lo); |
1223 _add(T_4Hi, T_1); | 1218 _add(T_4Hi, T_1); |
1224 _add(T_4Hi, T_2); | 1219 _add(T_4Hi, T_2); |
1225 _mov(DestHi, T_4Hi); | 1220 _mov(DestHi, T_4Hi); |
1226 } break; | 1221 } break; |
1227 case InstArithmetic::Shl: { | 1222 case InstArithmetic::Shl: { |
1228 // TODO: Refactor the similarities between Shl, Lshr, and Ashr. | 1223 // TODO: Refactor the similarities between Shl, Lshr, and Ashr. |
1229 // gcc does the following: | 1224 // gcc does the following: |
1230 // a=b<<c ==> | 1225 // a=b<<c ==> |
(...skipping 15 matching lines...)
1246 Constant *Zero = Ctx->getConstantZero(IceType_i32); | 1241 Constant *Zero = Ctx->getConstantZero(IceType_i32); |
1247 typename Traits::Insts::Label *Label = | 1242 typename Traits::Insts::Label *Label = |
1248 Traits::Insts::Label::create(Func, this); | 1243 Traits::Insts::Label::create(Func, this); |
1249 _mov(T_1, Src1Lo, Traits::RegisterSet::Reg_ecx); | 1244 _mov(T_1, Src1Lo, Traits::RegisterSet::Reg_ecx); |
1250 _mov(T_2, Src0Lo); | 1245 _mov(T_2, Src0Lo); |
1251 _mov(T_3, Src0Hi); | 1246 _mov(T_3, Src0Hi); |
1252 _shld(T_3, T_2, T_1); | 1247 _shld(T_3, T_2, T_1); |
1253 _shl(T_2, T_1); | 1248 _shl(T_2, T_1); |
1254 _test(T_1, BitTest); | 1249 _test(T_1, BitTest); |
1255 _br(Traits::Cond::Br_e, Label); | 1250 _br(Traits::Cond::Br_e, Label); |
1256 // T_2 and T_3 are being assigned again because of the | 1251 // T_2 and T_3 are being assigned again because of the intra-block |
1257 // intra-block control flow, so we need the _mov_nonkillable | 1252 // control flow, so we need the _mov_nonkillable variant to avoid |
1258 // variant to avoid liveness problems. | 1253 // liveness problems. |
1259 _mov_nonkillable(T_3, T_2); | 1254 _mov_nonkillable(T_3, T_2); |
1260 _mov_nonkillable(T_2, Zero); | 1255 _mov_nonkillable(T_2, Zero); |
1261 Context.insert(Label); | 1256 Context.insert(Label); |
1262 _mov(DestLo, T_2); | 1257 _mov(DestLo, T_2); |
1263 _mov(DestHi, T_3); | 1258 _mov(DestHi, T_3); |
1264 } break; | 1259 } break; |
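A trace of the Shl sequence for a made-up count c = 33:

    t3 = (b.hi << 1) | (b.lo >> 31)  ; shld, count taken mod 32
    t2 = b.lo << 1                   ; shl
    ; bit 5 of the count is set, so the Br_e branch is not taken:
    t3 = t2
    t2 = 0
    ; a.hi = b.lo << 1, a.lo = 0 -- correct for a 64-bit shift by 33

Lshr and Ashr below follow the same shape with the roles of the two halves swapped.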
1265 case InstArithmetic::Lshr: { | 1260 case InstArithmetic::Lshr: { |
1266 // a=b>>c (unsigned) ==> | 1261 // a=b>>c (unsigned) ==> |
1267 // t1:ecx = c.lo & 0xff | 1262 // t1:ecx = c.lo & 0xff |
1268 // t2 = b.lo | 1263 // t2 = b.lo |
(...skipping 13 matching lines...)
1282 Constant *Zero = Ctx->getConstantZero(IceType_i32); | 1277 Constant *Zero = Ctx->getConstantZero(IceType_i32); |
1283 typename Traits::Insts::Label *Label = | 1278 typename Traits::Insts::Label *Label = |
1284 Traits::Insts::Label::create(Func, this); | 1279 Traits::Insts::Label::create(Func, this); |
1285 _mov(T_1, Src1Lo, Traits::RegisterSet::Reg_ecx); | 1280 _mov(T_1, Src1Lo, Traits::RegisterSet::Reg_ecx); |
1286 _mov(T_2, Src0Lo); | 1281 _mov(T_2, Src0Lo); |
1287 _mov(T_3, Src0Hi); | 1282 _mov(T_3, Src0Hi); |
1288 _shrd(T_2, T_3, T_1); | 1283 _shrd(T_2, T_3, T_1); |
1289 _shr(T_3, T_1); | 1284 _shr(T_3, T_1); |
1290 _test(T_1, BitTest); | 1285 _test(T_1, BitTest); |
1291 _br(Traits::Cond::Br_e, Label); | 1286 _br(Traits::Cond::Br_e, Label); |
1292 // T_2 and T_3 are being assigned again because of the | 1287 // T_2 and T_3 are being assigned again because of the intra-block |
1293 // intra-block control flow, so we need the _mov_nonkillable | 1288 // control flow, so we need the _mov_nonkillable variant to avoid |
1294 // variant to avoid liveness problems. | 1289 // liveness problems. |
1295 _mov_nonkillable(T_2, T_3); | 1290 _mov_nonkillable(T_2, T_3); |
1296 _mov_nonkillable(T_3, Zero); | 1291 _mov_nonkillable(T_3, Zero); |
1297 Context.insert(Label); | 1292 Context.insert(Label); |
1298 _mov(DestLo, T_2); | 1293 _mov(DestLo, T_2); |
1299 _mov(DestHi, T_3); | 1294 _mov(DestHi, T_3); |
1300 } break; | 1295 } break; |
1301 case InstArithmetic::Ashr: { | 1296 case InstArithmetic::Ashr: { |
1302 // a=b>>c (signed) ==> | 1297 // a=b>>c (signed) ==> |
1303 // t1:ecx = c.lo & 0xff | 1298 // t1:ecx = c.lo & 0xff |
1304 // t2 = b.lo | 1299 // t2 = b.lo |
(...skipping 13 matching lines...)
1318 Constant *SignExtend = Ctx->getConstantInt32(0x1f); | 1313 Constant *SignExtend = Ctx->getConstantInt32(0x1f); |
1319 typename Traits::Insts::Label *Label = | 1314 typename Traits::Insts::Label *Label = |
1320 Traits::Insts::Label::create(Func, this); | 1315 Traits::Insts::Label::create(Func, this); |
1321 _mov(T_1, Src1Lo, Traits::RegisterSet::Reg_ecx); | 1316 _mov(T_1, Src1Lo, Traits::RegisterSet::Reg_ecx); |
1322 _mov(T_2, Src0Lo); | 1317 _mov(T_2, Src0Lo); |
1323 _mov(T_3, Src0Hi); | 1318 _mov(T_3, Src0Hi); |
1324 _shrd(T_2, T_3, T_1); | 1319 _shrd(T_2, T_3, T_1); |
1325 _sar(T_3, T_1); | 1320 _sar(T_3, T_1); |
1326 _test(T_1, BitTest); | 1321 _test(T_1, BitTest); |
1327 _br(Traits::Cond::Br_e, Label); | 1322 _br(Traits::Cond::Br_e, Label); |
1328 // T_2 and T_3 are being assigned again because of the | 1323 // T_2 and T_3 are being assigned again because of the intra-block |
1329 // intra-block control flow, so T_2 needs the _mov_nonkillable | 1324 // control flow, so T_2 needs the _mov_nonkillable variant to avoid |
1330 // variant to avoid liveness problems. T_3 doesn't need special | 1325 // liveness problems. T_3 doesn't need special treatment because it is |
1331 // treatment because it is reassigned via _sar instead of _mov. | 1326 // reassigned via _sar instead of _mov. |
1332 _mov_nonkillable(T_2, T_3); | 1327 _mov_nonkillable(T_2, T_3); |
1333 _sar(T_3, SignExtend); | 1328 _sar(T_3, SignExtend); |
1334 Context.insert(Label); | 1329 Context.insert(Label); |
1335 _mov(DestLo, T_2); | 1330 _mov(DestLo, T_2); |
1336 _mov(DestHi, T_3); | 1331 _mov(DestHi, T_3); |
1337 } break; | 1332 } break; |
1338 case InstArithmetic::Fadd: | 1333 case InstArithmetic::Fadd: |
1339 case InstArithmetic::Fsub: | 1334 case InstArithmetic::Fsub: |
1340 case InstArithmetic::Fmul: | 1335 case InstArithmetic::Fmul: |
1341 case InstArithmetic::Fdiv: | 1336 case InstArithmetic::Fdiv: |
1342 case InstArithmetic::Frem: | 1337 case InstArithmetic::Frem: |
1343 llvm_unreachable("FP instruction with i64 type"); | 1338 llvm_unreachable("FP instruction with i64 type"); |
1344 break; | 1339 break; |
1345 case InstArithmetic::Udiv: | 1340 case InstArithmetic::Udiv: |
1346 case InstArithmetic::Sdiv: | 1341 case InstArithmetic::Sdiv: |
1347 case InstArithmetic::Urem: | 1342 case InstArithmetic::Urem: |
1348 case InstArithmetic::Srem: | 1343 case InstArithmetic::Srem: |
1349 llvm_unreachable("Call-helper-involved instruction for i64 type \ | 1344 llvm_unreachable("Call-helper-involved instruction for i64 type \ |
1350 should have already been handled before"); | 1345 should have already been handled before"); |
1351 break; | 1346 break; |
1352 } | 1347 } |
1353 return; | 1348 return; |
1354 } | 1349 } |
1355 if (isVectorType(Dest->getType())) { | 1350 if (isVectorType(Dest->getType())) { |
1356 // TODO: Trap on integer divide and integer modulo by zero. | 1351 // TODO: Trap on integer divide and integer modulo by zero. See: |
1357 // See: https://code.google.com/p/nativeclient/issues/detail?id=3899 | 1352 // https://code.google.com/p/nativeclient/issues/detail?id=3899 |
1358 if (llvm::isa<typename Traits::X86OperandMem>(Src1)) | 1353 if (llvm::isa<typename Traits::X86OperandMem>(Src1)) |
1359 Src1 = legalizeToReg(Src1); | 1354 Src1 = legalizeToReg(Src1); |
1360 switch (Inst->getOp()) { | 1355 switch (Inst->getOp()) { |
1361 case InstArithmetic::_num: | 1356 case InstArithmetic::_num: |
1362 llvm_unreachable("Unknown arithmetic operator"); | 1357 llvm_unreachable("Unknown arithmetic operator"); |
1363 break; | 1358 break; |
1364 case InstArithmetic::Add: { | 1359 case InstArithmetic::Add: { |
1365 Variable *T = makeReg(Dest->getType()); | 1360 Variable *T = makeReg(Dest->getType()); |
1366 _movp(T, Src0); | 1361 _movp(T, Src0); |
1367 _padd(T, Src1); | 1362 _padd(T, Src1); |
(...skipping 144 matching lines...) | |
1512 case InstArithmetic::Sub: | 1507 case InstArithmetic::Sub: |
1513 _mov(T, Src0); | 1508 _mov(T, Src0); |
1514 _sub(T, Src1); | 1509 _sub(T, Src1); |
1515 _mov(Dest, T); | 1510 _mov(Dest, T); |
1516 break; | 1511 break; |
1517 case InstArithmetic::Mul: | 1512 case InstArithmetic::Mul: |
1518 if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) { | 1513 if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) { |
1519 if (optimizeScalarMul(Dest, Src0, C->getValue())) | 1514 if (optimizeScalarMul(Dest, Src0, C->getValue())) |
1520 return; | 1515 return; |
1521 } | 1516 } |
1522 // The 8-bit version of imul only allows the form "imul r/m8" | 1517 // The 8-bit version of imul only allows the form "imul r/m8" where T must |
1523 // where T must be in eax. | 1518 // be in eax. |
1524 if (isByteSizedArithType(Dest->getType())) { | 1519 if (isByteSizedArithType(Dest->getType())) { |
1525 _mov(T, Src0, Traits::RegisterSet::Reg_eax); | 1520 _mov(T, Src0, Traits::RegisterSet::Reg_eax); |
1526 Src1 = legalize(Src1, Legal_Reg | Legal_Mem); | 1521 Src1 = legalize(Src1, Legal_Reg | Legal_Mem); |
1527 } else { | 1522 } else { |
1528 _mov(T, Src0); | 1523 _mov(T, Src0); |
1529 } | 1524 } |
1530 _imul(T, Src1); | 1525 _imul(T, Src1); |
1531 _mov(Dest, T); | 1526 _mov(Dest, T); |
1532 break; | 1527 break; |
1533 case InstArithmetic::Shl: | 1528 case InstArithmetic::Shl: |
(...skipping 39 matching lines...) | |
1573 Context.insert(InstFakeUse::create(Func, T_eax)); | 1568 Context.insert(InstFakeUse::create(Func, T_eax)); |
1574 } else { | 1569 } else { |
1575 Constant *Zero = Ctx->getConstantZero(IceType_i32); | 1570 Constant *Zero = Ctx->getConstantZero(IceType_i32); |
1576 _mov(T, Src0, Traits::RegisterSet::Reg_eax); | 1571 _mov(T, Src0, Traits::RegisterSet::Reg_eax); |
1577 _mov(T_edx, Zero, Traits::RegisterSet::Reg_edx); | 1572 _mov(T_edx, Zero, Traits::RegisterSet::Reg_edx); |
1578 _div(T, Src1, T_edx); | 1573 _div(T, Src1, T_edx); |
1579 _mov(Dest, T); | 1574 _mov(Dest, T); |
1580 } | 1575 } |
1581 break; | 1576 break; |
1582 case InstArithmetic::Sdiv: | 1577 case InstArithmetic::Sdiv: |
1583 // TODO(stichnot): Enable this after doing better performance | 1578 // TODO(stichnot): Enable this after doing better performance and cross |
1584 // and cross testing. | 1579 // testing. |
1585 if (false && Ctx->getFlags().getOptLevel() >= Opt_1) { | 1580 if (false && Ctx->getFlags().getOptLevel() >= Opt_1) { |
1586 // Optimize division by constant power of 2, but not for Om1 | 1581 // Optimize division by constant power of 2, but not for Om1 or O0, just |
1587 // or O0, just to keep things simple there. | 1582 // to keep things simple there. |
1588 if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) { | 1583 if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) { |
1589 int32_t Divisor = C->getValue(); | 1584 int32_t Divisor = C->getValue(); |
1590 uint32_t UDivisor = static_cast<uint32_t>(Divisor); | 1585 uint32_t UDivisor = static_cast<uint32_t>(Divisor); |
1591 if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) { | 1586 if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) { |
1592 uint32_t LogDiv = llvm::Log2_32(UDivisor); | 1587 uint32_t LogDiv = llvm::Log2_32(UDivisor); |
1593 Type Ty = Dest->getType(); | 1588 Type Ty = Dest->getType(); |
1594 // LLVM does the following for dest=src/(1<<log): | 1589 // LLVM does the following for dest=src/(1<<log): |
1595 // t=src | 1590 // t=src |
1596 // sar t,typewidth-1 // -1 if src is negative, 0 if not | 1591 // sar t,typewidth-1 // -1 if src is negative, 0 if not |
1597 // shr t,typewidth-log | 1592 // shr t,typewidth-log |
1598 // add t,src | 1593 // add t,src |
1599 // sar t,log | 1594 // sar t,log |
1600 // dest=t | 1595 // dest=t |
1601 uint32_t TypeWidth = Traits::X86_CHAR_BIT * typeWidthInBytes(Ty); | 1596 uint32_t TypeWidth = Traits::X86_CHAR_BIT * typeWidthInBytes(Ty); |
1602 _mov(T, Src0); | 1597 _mov(T, Src0); |
1603 // If for some reason we are dividing by 1, just treat it | 1598 // If for some reason we are dividing by 1, just treat it like an |
1604 // like an assignment. | 1599 // assignment. |
1605 if (LogDiv > 0) { | 1600 if (LogDiv > 0) { |
1606 // The initial sar is unnecessary when dividing by 2. | 1601 // The initial sar is unnecessary when dividing by 2. |
1607 if (LogDiv > 1) | 1602 if (LogDiv > 1) |
1608 _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1)); | 1603 _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1)); |
1609 _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv)); | 1604 _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv)); |
1610 _add(T, Src0); | 1605 _add(T, Src0); |
1611 _sar(T, Ctx->getConstantInt(Ty, LogDiv)); | 1606 _sar(T, Ctx->getConstantInt(Ty, LogDiv)); |
1612 } | 1607 } |
1613 _mov(Dest, T); | 1608 _mov(Dest, T); |
1614 return; | 1609 return; |
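The quoted sequence can be spelled out as scalar C++ for a 32-bit operand. A sketch under two assumptions: Log is in [1, 31] (LogDiv == 0 is handled above as a plain assignment), and >> on a negative int is an arithmetic shift.

    #include <cstdint>

    // Dest = Src / (1 << Log), rounding toward zero, with no branch on Src.
    static int32_t sdivPow2(int32_t Src, uint32_t Log) {
      int32_t T = Src;
      if (Log > 1)
        T >>= 31;                               // sar t,typewidth-1: -1 or 0
      T = int32_t(uint32_t(T) >> (32 - Log));   // shr t,typewidth-log
      T += Src;                                 // add t,src: bias negatives up
      T >>= Log;                                // sar t,log
      return T;
    }

For example, sdivPow2(-5, 2) biases -5 by 3 to -2 and shifts to -1, matching -5/4 with truncation toward zero, where a bare arithmetic shift would give -2.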
(...skipping 34 matching lines...) | |
1649 } else { | 1644 } else { |
1650 Constant *Zero = Ctx->getConstantZero(IceType_i32); | 1645 Constant *Zero = Ctx->getConstantZero(IceType_i32); |
1651 T_edx = makeReg(Dest->getType(), Traits::RegisterSet::Reg_edx); | 1646 T_edx = makeReg(Dest->getType(), Traits::RegisterSet::Reg_edx); |
1652 _mov(T_edx, Zero); | 1647 _mov(T_edx, Zero); |
1653 _mov(T, Src0, Traits::RegisterSet::Reg_eax); | 1648 _mov(T, Src0, Traits::RegisterSet::Reg_eax); |
1654 _div(T_edx, Src1, T); | 1649 _div(T_edx, Src1, T); |
1655 _mov(Dest, T_edx); | 1650 _mov(Dest, T_edx); |
1656 } | 1651 } |
1657 break; | 1652 break; |
1658 case InstArithmetic::Srem: | 1653 case InstArithmetic::Srem: |
1659 // TODO(stichnot): Enable this after doing better performance | 1654 // TODO(stichnot): Enable this after doing better performance and cross |
1660 // and cross testing. | 1655 // testing. |
1661 if (false && Ctx->getFlags().getOptLevel() >= Opt_1) { | 1656 if (false && Ctx->getFlags().getOptLevel() >= Opt_1) { |
1662 // Optimize mod by constant power of 2, but not for Om1 or O0, | 1657 // Optimize mod by constant power of 2, but not for Om1 or O0, just to |
1663 // just to keep things simple there. | 1658 // keep things simple there. |
1664 if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) { | 1659 if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) { |
1665 int32_t Divisor = C->getValue(); | 1660 int32_t Divisor = C->getValue(); |
1666 uint32_t UDivisor = static_cast<uint32_t>(Divisor); | 1661 uint32_t UDivisor = static_cast<uint32_t>(Divisor); |
1667 if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) { | 1662 if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) { |
1668 uint32_t LogDiv = llvm::Log2_32(UDivisor); | 1663 uint32_t LogDiv = llvm::Log2_32(UDivisor); |
1669 Type Ty = Dest->getType(); | 1664 Type Ty = Dest->getType(); |
1670 // LLVM does the following for dest=src%(1<<log): | 1665 // LLVM does the following for dest=src%(1<<log): |
1671 // t=src | 1666 // t=src |
1672 // sar t,typewidth-1 // -1 if src is negative, 0 if not | 1667 // sar t,typewidth-1 // -1 if src is negative, 0 if not |
1673 // shr t,typewidth-log | 1668 // shr t,typewidth-log |
(...skipping 96 matching lines...) | |
1770 _mov(T_Hi, Src0Hi); | 1765 _mov(T_Hi, Src0Hi); |
1771 _mov(DestHi, T_Hi); | 1766 _mov(DestHi, T_Hi); |
1772 } else { | 1767 } else { |
1773 Operand *Src0Legal; | 1768 Operand *Src0Legal; |
1774 if (Dest->hasReg()) { | 1769 if (Dest->hasReg()) { |
1775 // If Dest already has a physical register, then only basic legalization | 1770 // If Dest already has a physical register, then only basic legalization |
1776 // is needed, as the source operand can be a register, immediate, or | 1771 // is needed, as the source operand can be a register, immediate, or |
1777 // memory. | 1772 // memory. |
1778 Src0Legal = legalize(Src0); | 1773 Src0Legal = legalize(Src0); |
1779 } else { | 1774 } else { |
1780 // If Dest could be a stack operand, then RI must be a physical | 1775 // If Dest could be a stack operand, then RI must be a physical register |
1781 // register or a scalar integer immediate. | 1776 // or a scalar integer immediate. |
1782 Src0Legal = legalize(Src0, Legal_Reg | Legal_Imm); | 1777 Src0Legal = legalize(Src0, Legal_Reg | Legal_Imm); |
1783 } | 1778 } |
1784 if (isVectorType(Dest->getType())) | 1779 if (isVectorType(Dest->getType())) |
1785 _movp(Dest, Src0Legal); | 1780 _movp(Dest, Src0Legal); |
1786 else | 1781 else |
1787 _mov(Dest, Src0Legal); | 1782 _mov(Dest, Src0Legal); |
1788 } | 1783 } |
1789 } | 1784 } |
1790 | 1785 |
1791 template <class Machine> | 1786 template <class Machine> |
1792 void TargetX86Base<Machine>::lowerBr(const InstBr *Inst) { | 1787 void TargetX86Base<Machine>::lowerBr(const InstBr *Inst) { |
1793 if (Inst->isUnconditional()) { | 1788 if (Inst->isUnconditional()) { |
1794 _br(Inst->getTargetUnconditional()); | 1789 _br(Inst->getTargetUnconditional()); |
1795 return; | 1790 return; |
1796 } | 1791 } |
1797 Operand *Cond = Inst->getCondition(); | 1792 Operand *Cond = Inst->getCondition(); |
1798 | 1793 |
1799 // Handle folding opportunities. | 1794 // Handle folding opportunities. |
1800 if (const class Inst *Producer = FoldingInfo.getProducerFor(Cond)) { | 1795 if (const class Inst *Producer = FoldingInfo.getProducerFor(Cond)) { |
1801 assert(Producer->isDeleted()); | 1796 assert(Producer->isDeleted()); |
1802 switch (BoolFolding::getProducerKind(Producer)) { | 1797 switch (BoolFolding::getProducerKind(Producer)) { |
1803 default: | 1798 default: |
1804 break; | 1799 break; |
1805 case BoolFolding::PK_Icmp32: { | 1800 case BoolFolding::PK_Icmp32: { |
1806 // TODO(stichnot): Refactor similarities between this block and | 1801 // TODO(stichnot): Refactor similarities between this block and the |
1807 // the corresponding code in lowerIcmp(). | 1802 // corresponding code in lowerIcmp(). |
1808 auto *Cmp = llvm::dyn_cast<InstIcmp>(Producer); | 1803 auto *Cmp = llvm::dyn_cast<InstIcmp>(Producer); |
1809 Operand *Src0 = Producer->getSrc(0); | 1804 Operand *Src0 = Producer->getSrc(0); |
1810 Operand *Src1 = legalize(Producer->getSrc(1)); | 1805 Operand *Src1 = legalize(Producer->getSrc(1)); |
1811 Operand *Src0RM = legalizeSrc0ForCmp(Src0, Src1); | 1806 Operand *Src0RM = legalizeSrc0ForCmp(Src0, Src1); |
1812 _cmp(Src0RM, Src1); | 1807 _cmp(Src0RM, Src1); |
1813 _br(Traits::getIcmp32Mapping(Cmp->getCondition()), Inst->getTargetTrue(), | 1808 _br(Traits::getIcmp32Mapping(Cmp->getCondition()), Inst->getTargetTrue(), |
1814 Inst->getTargetFalse()); | 1809 Inst->getTargetFalse()); |
1815 return; | 1810 return; |
1816 } | 1811 } |
1817 } | 1812 } |
(...skipping 10 matching lines...) | |
1828 // a = cast(b) ==> t=cast(b); a=t; (link t->b, link a->t, no overlap) | 1823 // a = cast(b) ==> t=cast(b); a=t; (link t->b, link a->t, no overlap) |
1829 InstCast::OpKind CastKind = Inst->getCastKind(); | 1824 InstCast::OpKind CastKind = Inst->getCastKind(); |
1830 Variable *Dest = Inst->getDest(); | 1825 Variable *Dest = Inst->getDest(); |
1831 switch (CastKind) { | 1826 switch (CastKind) { |
1832 default: | 1827 default: |
1833 Func->setError("Cast type not supported"); | 1828 Func->setError("Cast type not supported"); |
1834 return; | 1829 return; |
1835 case InstCast::Sext: { | 1830 case InstCast::Sext: { |
1836 // Src0RM is the source operand legalized to physical register or memory, | 1831 // Src0RM is the source operand legalized to physical register or memory, |
1837 // but not immediate, since the relevant x86 native instructions don't | 1832 // but not immediate, since the relevant x86 native instructions don't |
1838 // allow an immediate operand. If the operand is an immediate, we could | 1833 // allow an immediate operand. If the operand is an immediate, we could |
1839 // consider computing the strength-reduced result at translation time, | 1834 // consider computing the strength-reduced result at translation time, but |
1840 // but we're unlikely to see something like that in the bitcode that | 1835 // we're unlikely to see something like that in the bitcode that the |
1841 // the optimizer wouldn't have already taken care of. | 1836 // optimizer wouldn't have already taken care of. |
1842 Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem); | 1837 Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem); |
1843 if (isVectorType(Dest->getType())) { | 1838 if (isVectorType(Dest->getType())) { |
1844 Type DestTy = Dest->getType(); | 1839 Type DestTy = Dest->getType(); |
1845 if (DestTy == IceType_v16i8) { | 1840 if (DestTy == IceType_v16i8) { |
1846 // onemask = materialize(1,1,...); dst = (src & onemask) > 0 | 1841 // onemask = materialize(1,1,...); dst = (src & onemask) > 0 |
1847 Variable *OneMask = makeVectorOfOnes(Dest->getType()); | 1842 Variable *OneMask = makeVectorOfOnes(Dest->getType()); |
1848 Variable *T = makeReg(DestTy); | 1843 Variable *T = makeReg(DestTy); |
1849 _movp(T, Src0RM); | 1844 _movp(T, Src0RM); |
1850 _pand(T, OneMask); | 1845 _pand(T, OneMask); |
1851 Variable *Zeros = makeVectorOfZeros(Dest->getType()); | 1846 Variable *Zeros = makeVectorOfZeros(Dest->getType()); |
(...skipping 39 matching lines...) | |
1891 // sar t1, dst_bitwidth - 1 | 1886 // sar t1, dst_bitwidth - 1 |
1892 // dst = t1 | 1887 // dst = t1 |
1893 size_t DestBits = | 1888 size_t DestBits = |
1894 Traits::X86_CHAR_BIT * typeWidthInBytes(Dest->getType()); | 1889 Traits::X86_CHAR_BIT * typeWidthInBytes(Dest->getType()); |
1895 Constant *ShiftAmount = Ctx->getConstantInt32(DestBits - 1); | 1890 Constant *ShiftAmount = Ctx->getConstantInt32(DestBits - 1); |
1896 Variable *T = makeReg(Dest->getType()); | 1891 Variable *T = makeReg(Dest->getType()); |
1897 if (typeWidthInBytes(Dest->getType()) <= | 1892 if (typeWidthInBytes(Dest->getType()) <= |
1898 typeWidthInBytes(Src0RM->getType())) { | 1893 typeWidthInBytes(Src0RM->getType())) { |
1899 _mov(T, Src0RM); | 1894 _mov(T, Src0RM); |
1900 } else { | 1895 } else { |
1901 // Widen the source using movsx or movzx. (It doesn't matter | 1896 // Widen the source using movsx or movzx. (It doesn't matter which one, |
1902 // which one, since the following shl/sar overwrite the bits.) | 1897 // since the following shl/sar overwrite the bits.) |
1903 _movzx(T, Src0RM); | 1898 _movzx(T, Src0RM); |
1904 } | 1899 } |
1905 _shl(T, ShiftAmount); | 1900 _shl(T, ShiftAmount); |
1906 _sar(T, ShiftAmount); | 1901 _sar(T, ShiftAmount); |
1907 _mov(Dest, T); | 1902 _mov(Dest, T); |
1908 } else { | 1903 } else { |
1909 // t1 = movsx src; dst = t1 | 1904 // t1 = movsx src; dst = t1 |
1910 Variable *T = makeReg(Dest->getType()); | 1905 Variable *T = makeReg(Dest->getType()); |
1911 _movsx(T, Src0RM); | 1906 _movsx(T, Src0RM); |
1912 _mov(Dest, T); | 1907 _mov(Dest, T); |
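The i1 branch of this is the classic shift trick; a hedged one-function sketch (again assuming arithmetic >> on negative values):

    #include <cstdint>

    // Sign-extend bit 0 of B: returns 0 or -1. Any garbage above bit 0 is
    // discarded by the left shift, which is why movsx vs. movzx is moot.
    static int32_t sextI1(uint32_t B) {
      uint32_t T = B << 31;     // _shl(T, ShiftAmount)
      return int32_t(T) >> 31;  // _sar(T, ShiftAmount)
    }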
(...skipping 90 matching lines...) | |
2003 if (isVectorType(Dest->getType())) { | 1998 if (isVectorType(Dest->getType())) { |
2004 assert(Dest->getType() == IceType_v4i32 && | 1999 assert(Dest->getType() == IceType_v4i32 && |
2005 Inst->getSrc(0)->getType() == IceType_v4f32); | 2000 Inst->getSrc(0)->getType() == IceType_v4f32); |
2006 Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem); | 2001 Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem); |
2007 if (llvm::isa<typename Traits::X86OperandMem>(Src0RM)) | 2002 if (llvm::isa<typename Traits::X86OperandMem>(Src0RM)) |
2008 Src0RM = legalizeToReg(Src0RM); | 2003 Src0RM = legalizeToReg(Src0RM); |
2009 Variable *T = makeReg(Dest->getType()); | 2004 Variable *T = makeReg(Dest->getType()); |
2010 _cvt(T, Src0RM, Traits::Insts::Cvt::Tps2dq); | 2005 _cvt(T, Src0RM, Traits::Insts::Cvt::Tps2dq); |
2011 _movp(Dest, T); | 2006 _movp(Dest, T); |
2012 } else if (!Traits::Is64Bit && Dest->getType() == IceType_i64) { | 2007 } else if (!Traits::Is64Bit && Dest->getType() == IceType_i64) { |
2013 // Use a helper for converting floating-point values to 64-bit | 2008 // Use a helper for converting floating-point values to 64-bit integers. |
2014 // integers. SSE2 appears to have no way to convert from xmm | 2009 // SSE2 appears to have no way to convert from xmm registers to something |
2015 // registers to something like the edx:eax register pair, and | 2010 // like the edx:eax register pair, and gcc and clang both want to use x87 |
2016 // gcc and clang both want to use x87 instructions complete with | 2011 // instructions complete with temporary manipulation of the status word. |
2017 // temporary manipulation of the status word. This helper is | 2012 // This helper is not needed for x86-64. |
2018 // not needed for x86-64. | |
2019 split64(Dest); | 2013 split64(Dest); |
2020 const SizeT MaxSrcs = 1; | 2014 const SizeT MaxSrcs = 1; |
2021 Type SrcType = Inst->getSrc(0)->getType(); | 2015 Type SrcType = Inst->getSrc(0)->getType(); |
2022 InstCall *Call = | 2016 InstCall *Call = |
2023 makeHelperCall(isFloat32Asserting32Or64(SrcType) ? H_fptosi_f32_i64 | 2017 makeHelperCall(isFloat32Asserting32Or64(SrcType) ? H_fptosi_f32_i64 |
2024 : H_fptosi_f64_i64, | 2018 : H_fptosi_f64_i64, |
2025 Dest, MaxSrcs); | 2019 Dest, MaxSrcs); |
2026 Call->addArg(Inst->getSrc(0)); | 2020 Call->addArg(Inst->getSrc(0)); |
2027 lowerCall(Call); | 2021 lowerCall(Call); |
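Semantically the helper only has to perform the conversion SSE2 cannot express; a sketch of what a routine like H_fptosi_f64_i64 computes (the real helper's name mangling and ABI are Subzero-internal):

    #include <cstdint>

    static int64_t fptosiF64I64(double X) {
      return static_cast<int64_t>(X); // truncate toward zero
    }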
2028 } else { | 2022 } else { |
(...skipping 114 matching lines...) | |
2143 Operand *Src0 = Inst->getSrc(0); | 2137 Operand *Src0 = Inst->getSrc(0); |
2144 if (isVectorType(Src0->getType())) { | 2138 if (isVectorType(Src0->getType())) { |
2145 assert(Dest->getType() == IceType_v4f32 && | 2139 assert(Dest->getType() == IceType_v4f32 && |
2146 Src0->getType() == IceType_v4i32); | 2140 Src0->getType() == IceType_v4i32); |
2147 const SizeT MaxSrcs = 1; | 2141 const SizeT MaxSrcs = 1; |
2148 InstCall *Call = makeHelperCall(H_uitofp_4xi32_4xf32, Dest, MaxSrcs); | 2142 InstCall *Call = makeHelperCall(H_uitofp_4xi32_4xf32, Dest, MaxSrcs); |
2149 Call->addArg(Src0); | 2143 Call->addArg(Src0); |
2150 lowerCall(Call); | 2144 lowerCall(Call); |
2151 } else if (Src0->getType() == IceType_i64 || | 2145 } else if (Src0->getType() == IceType_i64 || |
2152 (!Traits::Is64Bit && Src0->getType() == IceType_i32)) { | 2146 (!Traits::Is64Bit && Src0->getType() == IceType_i32)) { |
2153 // Use a helper for x86-32 and x86-64. Also use a helper for | 2147 // Use a helper for x86-32 and x86-64. Also use a helper for i32 on |
2154 // i32 on x86-32. | 2148 // x86-32. |
2155 const SizeT MaxSrcs = 1; | 2149 const SizeT MaxSrcs = 1; |
2156 Type DestType = Dest->getType(); | 2150 Type DestType = Dest->getType(); |
2157 IceString TargetString; | 2151 IceString TargetString; |
2158 if (isInt32Asserting32Or64(Src0->getType())) { | 2152 if (isInt32Asserting32Or64(Src0->getType())) { |
2159 TargetString = isFloat32Asserting32Or64(DestType) ? H_uitofp_i32_f32 | 2153 TargetString = isFloat32Asserting32Or64(DestType) ? H_uitofp_i32_f32 |
2160 : H_uitofp_i32_f64; | 2154 : H_uitofp_i32_f64; |
2161 } else { | 2155 } else { |
2162 TargetString = isFloat32Asserting32Or64(DestType) ? H_uitofp_i64_f32 | 2156 TargetString = isFloat32Asserting32Or64(DestType) ? H_uitofp_i64_f32 |
2163 : H_uitofp_i64_f64; | 2157 : H_uitofp_i64_f64; |
2164 } | 2158 } |
(...skipping 113 matching lines...) | |
2278 _mov(DestLo, T_Lo); | 2272 _mov(DestLo, T_Lo); |
2279 _mov(T_Hi, SpillHi); | 2273 _mov(T_Hi, SpillHi); |
2280 _mov(DestHi, T_Hi); | 2274 _mov(DestHi, T_Hi); |
2281 } | 2275 } |
2282 } break; | 2276 } break; |
2283 case IceType_f64: { | 2277 case IceType_f64: { |
2284 assert(Src0->getType() == IceType_i64); | 2278 assert(Src0->getType() == IceType_i64); |
2285 if (Traits::Is64Bit) { | 2279 if (Traits::Is64Bit) { |
2286 Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); | 2280 Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); |
2287 Variable *T = makeReg(IceType_f64); | 2281 Variable *T = makeReg(IceType_f64); |
2288 // Movd requires its fp argument (in this case, the bitcast destination) | 2282 // Movd requires its fp argument (in this case, the bitcast |
2289 // to be an xmm register. | 2283 // destination) to be an xmm register. |
2290 T->setMustHaveReg(); | 2284 T->setMustHaveReg(); |
2291 _movd(T, Src0RM); | 2285 _movd(T, Src0RM); |
2292 _mov(Dest, T); | 2286 _mov(Dest, T); |
2293 } else { | 2287 } else { |
2294 Src0 = legalize(Src0); | 2288 Src0 = legalize(Src0); |
2295 if (llvm::isa<typename Traits::X86OperandMem>(Src0)) { | 2289 if (llvm::isa<typename Traits::X86OperandMem>(Src0)) { |
2296 Variable *T = Func->makeVariable(Dest->getType()); | 2290 Variable *T = Func->makeVariable(Dest->getType()); |
2297 _movq(T, Src0); | 2291 _movq(T, Src0); |
2298 _movq(Dest, T); | 2292 _movq(Dest, T); |
2299 break; | 2293 break; |
(...skipping 11 matching lines...) | |
2311 Variable *Spill = SpillVar; | 2305 Variable *Spill = SpillVar; |
2312 Spill->setMustNotHaveReg(); | 2306 Spill->setMustNotHaveReg(); |
2313 | 2307 |
2314 Variable *T_Lo = nullptr, *T_Hi = nullptr; | 2308 Variable *T_Lo = nullptr, *T_Hi = nullptr; |
2315 typename Traits::VariableSplit *SpillLo = Traits::VariableSplit::create( | 2309 typename Traits::VariableSplit *SpillLo = Traits::VariableSplit::create( |
2316 Func, Spill, Traits::VariableSplit::Low); | 2310 Func, Spill, Traits::VariableSplit::Low); |
2317 typename Traits::VariableSplit *SpillHi = Traits::VariableSplit::create( | 2311 typename Traits::VariableSplit *SpillHi = Traits::VariableSplit::create( |
2318 Func, Spill, Traits::VariableSplit::High); | 2312 Func, Spill, Traits::VariableSplit::High); |
2319 _mov(T_Lo, loOperand(Src0)); | 2313 _mov(T_Lo, loOperand(Src0)); |
2320 // Technically, the Spill is defined after the _store happens, but | 2314 // Technically, the Spill is defined after the _store happens, but |
2321 // SpillLo is considered a "use" of Spill so define Spill before it | 2315 // SpillLo is considered a "use" of Spill so define Spill before it is |
2322 // is used. | 2316 // used. |
2323 Context.insert(InstFakeDef::create(Func, Spill)); | 2317 Context.insert(InstFakeDef::create(Func, Spill)); |
2324 _store(T_Lo, SpillLo); | 2318 _store(T_Lo, SpillLo); |
2325 _mov(T_Hi, hiOperand(Src0)); | 2319 _mov(T_Hi, hiOperand(Src0)); |
2326 _store(T_Hi, SpillHi); | 2320 _store(T_Hi, SpillHi); |
2327 _movq(Dest, Spill); | 2321 _movq(Dest, Spill); |
2328 } | 2322 } |
2329 } break; | 2323 } break; |
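The net effect of the spill-and-movq dance on the 32-bit path is an ordinary bit-for-bit reinterpretation; a sketch of the equivalent C++, where memcpy stands in for the two 32-bit stores plus the movq reload:

    #include <cstdint>
    #include <cstring>

    static double bitcastI64ToF64(uint64_t V) {
      double D;
      std::memcpy(&D, &V, sizeof(D)); // store lo/hi halves, reload as f64
      return D;
    }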
2330 case IceType_v8i1: { | 2324 case IceType_v8i1: { |
2331 assert(Src0->getType() == IceType_i8); | 2325 assert(Src0->getType() == IceType_i8); |
2332 InstCall *Call = makeHelperCall(H_bitcast_i8_8xi1, Dest, 1); | 2326 InstCall *Call = makeHelperCall(H_bitcast_i8_8xi1, Dest, 1); |
(...skipping 44 matching lines...) | |
2377 InstructionSet >= Traits::SSE4_1; | 2371 InstructionSet >= Traits::SSE4_1; |
2378 if (CanUsePextr && Ty != IceType_v4f32) { | 2372 if (CanUsePextr && Ty != IceType_v4f32) { |
2379 // Use pextrb, pextrw, or pextrd. | 2373 // Use pextrb, pextrw, or pextrd. |
2380 Constant *Mask = Ctx->getConstantInt32(Index); | 2374 Constant *Mask = Ctx->getConstantInt32(Index); |
2381 Variable *SourceVectR = legalizeToReg(SourceVectNotLegalized); | 2375 Variable *SourceVectR = legalizeToReg(SourceVectNotLegalized); |
2382 _pextr(ExtractedElementR, SourceVectR, Mask); | 2376 _pextr(ExtractedElementR, SourceVectR, Mask); |
2383 } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) { | 2377 } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) { |
2384 // Use pshufd and movd/movss. | 2378 // Use pshufd and movd/movss. |
2385 Variable *T = nullptr; | 2379 Variable *T = nullptr; |
2386 if (Index) { | 2380 if (Index) { |
2387 // The shuffle only needs to occur if the element to be extracted | 2381 // The shuffle only needs to occur if the element to be extracted is not |
2388 // is not at the lowest index. | 2382 // at the lowest index. |
2389 Constant *Mask = Ctx->getConstantInt32(Index); | 2383 Constant *Mask = Ctx->getConstantInt32(Index); |
2390 T = makeReg(Ty); | 2384 T = makeReg(Ty); |
2391 _pshufd(T, legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem), Mask); | 2385 _pshufd(T, legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem), Mask); |
2392 } else { | 2386 } else { |
2393 T = legalizeToReg(SourceVectNotLegalized); | 2387 T = legalizeToReg(SourceVectNotLegalized); |
2394 } | 2388 } |
2395 | 2389 |
2396 if (InVectorElementTy == IceType_i32) { | 2390 if (InVectorElementTy == IceType_i32) { |
2397 _movd(ExtractedElementR, T); | 2391 _movd(ExtractedElementR, T); |
2398 } else { // Ty == IceType_f32 | 2392 } else { // Ty == IceType_f32 |
2399 // TODO(wala): _movss is only used here because _mov does not | 2393 // TODO(wala): _movss is only used here because _mov does not allow a |
2400 // allow a vector source and a scalar destination. _mov should be | 2394 // vector source and a scalar destination. _mov should be able to be |
2401 // able to be used here. | 2395 // used here. |
2402 // _movss is a binary instruction, so the FakeDef is needed to | 2396 // _movss is a binary instruction, so the FakeDef is needed to keep the |
2403 // keep the live range analysis consistent. | 2397 // live range analysis consistent. |
2404 Context.insert(InstFakeDef::create(Func, ExtractedElementR)); | 2398 Context.insert(InstFakeDef::create(Func, ExtractedElementR)); |
2405 _movss(ExtractedElementR, T); | 2399 _movss(ExtractedElementR, T); |
2406 } | 2400 } |
2407 } else { | 2401 } else { |
2408 assert(Ty == IceType_v16i8 || Ty == IceType_v16i1); | 2402 assert(Ty == IceType_v16i8 || Ty == IceType_v16i1); |
2409 // Spill the value to a stack slot and do the extraction in memory. | 2403 // Spill the value to a stack slot and do the extraction in memory. |
2410 // | 2404 // |
2411 // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when | 2405 // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when support |
2412 // support for legalizing to mem is implemented. | 2406 // for legalizing to mem is implemented. |
2413 Variable *Slot = Func->makeVariable(Ty); | 2407 Variable *Slot = Func->makeVariable(Ty); |
2414 Slot->setMustNotHaveReg(); | 2408 Slot->setMustNotHaveReg(); |
2415 _movp(Slot, legalizeToReg(SourceVectNotLegalized)); | 2409 _movp(Slot, legalizeToReg(SourceVectNotLegalized)); |
2416 | 2410 |
2417 // Compute the location of the element in memory. | 2411 // Compute the location of the element in memory. |
2418 unsigned Offset = Index * typeWidthInBytes(InVectorElementTy); | 2412 unsigned Offset = Index * typeWidthInBytes(InVectorElementTy); |
2419 typename Traits::X86OperandMem *Loc = | 2413 typename Traits::X86OperandMem *Loc = |
2420 getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset); | 2414 getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset); |
2421 _mov(ExtractedElementR, Loc); | 2415 _mov(ExtractedElementR, Loc); |
2422 } | 2416 } |
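The stack-slot path can be pictured with SSE2 intrinsics; a sketch, not Subzero code (assumes <emmintrin.h> and an in-range index):

    #include <emmintrin.h>
    #include <cstdint>

    static uint8_t extractV16I8(__m128i Vec, unsigned Index) {
      alignas(16) uint8_t Slot[16];
      _mm_store_si128(reinterpret_cast<__m128i *>(Slot), Vec); // _movp(Slot, ...)
      return Slot[Index & 15]; // _mov(ExtractedElementR, Loc) at Slot+Offset
    }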
(...skipping 159 matching lines...) | |
2582 Src0 = NewSrc0; | 2576 Src0 = NewSrc0; |
2583 Src1 = NewSrc1; | 2577 Src1 = NewSrc1; |
2584 Ty = NewTy; | 2578 Ty = NewTy; |
2585 } | 2579 } |
2586 | 2580 |
2587 InstIcmp::ICond Condition = Inst->getCondition(); | 2581 InstIcmp::ICond Condition = Inst->getCondition(); |
2588 | 2582 |
2589 Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); | 2583 Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); |
2590 Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem); | 2584 Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem); |
2591 | 2585 |
2592 // SSE2 only has signed comparison operations. Transform unsigned | 2586 // SSE2 only has signed comparison operations. Transform unsigned inputs in |
2593 // inputs in a manner that allows for the use of signed comparison | 2587 // a manner that allows for the use of signed comparison operations by |
2594 // operations by flipping the high order bits. | 2588 // flipping the high order bits. |
2595 if (Condition == InstIcmp::Ugt || Condition == InstIcmp::Uge || | 2589 if (Condition == InstIcmp::Ugt || Condition == InstIcmp::Uge || |
2596 Condition == InstIcmp::Ult || Condition == InstIcmp::Ule) { | 2590 Condition == InstIcmp::Ult || Condition == InstIcmp::Ule) { |
2597 Variable *T0 = makeReg(Ty); | 2591 Variable *T0 = makeReg(Ty); |
2598 Variable *T1 = makeReg(Ty); | 2592 Variable *T1 = makeReg(Ty); |
2599 Variable *HighOrderBits = makeVectorOfHighOrderBits(Ty); | 2593 Variable *HighOrderBits = makeVectorOfHighOrderBits(Ty); |
2600 _movp(T0, Src0RM); | 2594 _movp(T0, Src0RM); |
2601 _pxor(T0, HighOrderBits); | 2595 _pxor(T0, HighOrderBits); |
2602 _movp(T1, Src1RM); | 2596 _movp(T1, Src1RM); |
2603 _pxor(T1, HighOrderBits); | 2597 _pxor(T1, HighOrderBits); |
2604 Src0RM = T0; | 2598 Src0RM = T0; |
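Per lane, the transformation is the standard sign-bit flip; a scalar sketch for one i32 lane (0x80000000 is the constant that makeVectorOfHighOrderBits() broadcasts):

    #include <cstdint>

    // Unsigned a < b computed with a signed compare after xor-ing sign bits.
    static bool ult32(uint32_t A, uint32_t B) {
      return int32_t(A ^ 0x80000000u) < int32_t(B ^ 0x80000000u);
    }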
(...skipping 114 matching lines...) | |
2719 // Only constant indices are allowed in PNaCl IR. | 2713 // Only constant indices are allowed in PNaCl IR. |
2720 assert(ElementIndex); | 2714 assert(ElementIndex); |
2721 unsigned Index = ElementIndex->getValue(); | 2715 unsigned Index = ElementIndex->getValue(); |
2722 assert(Index < typeNumElements(SourceVectNotLegalized->getType())); | 2716 assert(Index < typeNumElements(SourceVectNotLegalized->getType())); |
2723 | 2717 |
2724 Type Ty = SourceVectNotLegalized->getType(); | 2718 Type Ty = SourceVectNotLegalized->getType(); |
2725 Type ElementTy = typeElementType(Ty); | 2719 Type ElementTy = typeElementType(Ty); |
2726 Type InVectorElementTy = Traits::getInVectorElementType(Ty); | 2720 Type InVectorElementTy = Traits::getInVectorElementType(Ty); |
2727 | 2721 |
2728 if (ElementTy == IceType_i1) { | 2722 if (ElementTy == IceType_i1) { |
2729 // Expand the element to the appropriate size for it to be inserted | 2723 // Expand the element to the appropriate size for it to be inserted in the |
2730 // in the vector. | 2724 // vector. |
2731 Variable *Expanded = Func->makeVariable(InVectorElementTy); | 2725 Variable *Expanded = Func->makeVariable(InVectorElementTy); |
2732 InstCast *Cast = InstCast::create(Func, InstCast::Zext, Expanded, | 2726 InstCast *Cast = InstCast::create(Func, InstCast::Zext, Expanded, |
2733 ElementToInsertNotLegalized); | 2727 ElementToInsertNotLegalized); |
2734 lowerCast(Cast); | 2728 lowerCast(Cast); |
2735 ElementToInsertNotLegalized = Expanded; | 2729 ElementToInsertNotLegalized = Expanded; |
2736 } | 2730 } |
2737 | 2731 |
2738 if (Ty == IceType_v8i16 || Ty == IceType_v8i1 || | 2732 if (Ty == IceType_v8i16 || Ty == IceType_v8i1 || |
2739 InstructionSet >= Traits::SSE4_1) { | 2733 InstructionSet >= Traits::SSE4_1) { |
2740 // Use insertps, pinsrb, pinsrw, or pinsrd. | 2734 // Use insertps, pinsrb, pinsrw, or pinsrd. |
(...skipping 25 matching lines...) | |
2766 } | 2760 } |
2767 | 2761 |
2768 if (Index == 0) { | 2762 if (Index == 0) { |
2769 Variable *T = makeReg(Ty); | 2763 Variable *T = makeReg(Ty); |
2770 _movp(T, SourceVectRM); | 2764 _movp(T, SourceVectRM); |
2771 _movss(T, ElementR); | 2765 _movss(T, ElementR); |
2772 _movp(Inst->getDest(), T); | 2766 _movp(Inst->getDest(), T); |
2773 return; | 2767 return; |
2774 } | 2768 } |
2775 | 2769 |
2776 // shufps treats the source and desination operands as vectors of | 2770 // shufps treats the source and desination operands as vectors of four |
Jim Stichnoth 2015/09/16 00:01:29: destination
ascull 2015/09/16 18:30:09: Done.
2777 // four doublewords. The destination's two high doublewords are | 2771 // doublewords. The destination's two high doublewords are selected from |
2778 // selected from the source operand and the two low doublewords are | 2772 // the source operand and the two low doublewords are selected from the |
2779 // selected from the (original value of) the destination operand. | 2773 // (original value of) the destination operand. An insertelement operation |
2780 // An insertelement operation can be effected with a sequence of two | 2774 // can be effected with a sequence of two shufps operations with |
2781 // shufps operations with appropriate masks. In all cases below, | 2775 // appropriate masks. In all cases below, Element[0] is being inserted |
2782 // Element[0] is being inserted into SourceVectOperand. Indices are | 2776 // into SourceVectOperand. Indices are ordered from left to right. |
2783 // ordered from left to right. | |
2784 // | 2777 // |
2785 // insertelement into index 1 (result is stored in ElementR): | 2778 // insertelement into index 1 (result is stored in ElementR): |
2786 // ElementR := ElementR[0, 0] SourceVectRM[0, 0] | 2779 // ElementR := ElementR[0, 0] SourceVectRM[0, 0] |
2787 // ElementR := ElementR[3, 0] SourceVectRM[2, 3] | 2780 // ElementR := ElementR[3, 0] SourceVectRM[2, 3] |
2788 // | 2781 // |
2789 // insertelement into index 2 (result is stored in T): | 2782 // insertelement into index 2 (result is stored in T): |
2790 // T := SourceVectRM | 2783 // T := SourceVectRM |
2791 // ElementR := ElementR[0, 0] T[0, 3] | 2784 // ElementR := ElementR[0, 0] T[0, 3] |
2792 // T := T[0, 1] ElementR[0, 3] | 2785 // T := T[0, 1] ElementR[0, 3] |
2793 // | 2786 // |
(...skipping 13 matching lines...) | |
2807 _movp(Inst->getDest(), ElementR); | 2800 _movp(Inst->getDest(), ElementR); |
2808 } else { | 2801 } else { |
2809 Variable *T = makeReg(Ty); | 2802 Variable *T = makeReg(Ty); |
2810 _movp(T, SourceVectRM); | 2803 _movp(T, SourceVectRM); |
2811 _shufps(ElementR, T, Mask1Constant); | 2804 _shufps(ElementR, T, Mask1Constant); |
2812 _shufps(T, ElementR, Mask2Constant); | 2805 _shufps(T, ElementR, Mask2Constant); |
2813 _movp(Inst->getDest(), T); | 2806 _movp(Inst->getDest(), T); |
2814 } | 2807 } |
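A scalar model of shufps itself may help in decoding the masks; a sketch using the machine imm8 encoding, two bits per result lane (note the index lists in the comments above read left to right, the reverse of this encoding):

    #include <cstring>

    // The two low doublewords of the result come from Dst, the two high
    // ones from Src; each 2-bit field of Imm selects a source lane.
    static void shufpsModel(float Dst[4], const float Src[4], unsigned Imm) {
      float R[4];
      R[0] = Dst[Imm & 3];
      R[1] = Dst[(Imm >> 2) & 3];
      R[2] = Src[(Imm >> 4) & 3];
      R[3] = Src[(Imm >> 6) & 3];
      std::memcpy(Dst, R, sizeof(R));
    }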
2815 } else { | 2808 } else { |
2816 assert(Ty == IceType_v16i8 || Ty == IceType_v16i1); | 2809 assert(Ty == IceType_v16i8 || Ty == IceType_v16i1); |
2817 // Spill the value to a stack slot and perform the insertion in | 2810 // Spill the value to a stack slot and perform the insertion in memory. |
2818 // memory. | |
2819 // | 2811 // |
2820 // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when | 2812 // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when support |
2821 // support for legalizing to mem is implemented. | 2813 // for legalizing to mem is implemented. |
2822 Variable *Slot = Func->makeVariable(Ty); | 2814 Variable *Slot = Func->makeVariable(Ty); |
2823 Slot->setMustNotHaveReg(); | 2815 Slot->setMustNotHaveReg(); |
2824 _movp(Slot, legalizeToReg(SourceVectNotLegalized)); | 2816 _movp(Slot, legalizeToReg(SourceVectNotLegalized)); |
2825 | 2817 |
2826 // Compute the location of the position to insert in memory. | 2818 // Compute the location of the position to insert in memory. |
2827 unsigned Offset = Index * typeWidthInBytes(InVectorElementTy); | 2819 unsigned Offset = Index * typeWidthInBytes(InVectorElementTy); |
2828 typename Traits::X86OperandMem *Loc = | 2820 typename Traits::X86OperandMem *Loc = |
2829 getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset); | 2821 getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset); |
2830 _store(legalizeToReg(ElementToInsertNotLegalized), Loc); | 2822 _store(legalizeToReg(ElementToInsertNotLegalized), Loc); |
2831 | 2823 |
(...skipping 25 matching lines...) | |
2857 } | 2849 } |
2858 case Intrinsics::AtomicFence: | 2850 case Intrinsics::AtomicFence: |
2859 if (!Intrinsics::isMemoryOrderValid( | 2851 if (!Intrinsics::isMemoryOrderValid( |
2860 ID, getConstantMemoryOrder(Instr->getArg(0)))) { | 2852 ID, getConstantMemoryOrder(Instr->getArg(0)))) { |
2861 Func->setError("Unexpected memory ordering for AtomicFence"); | 2853 Func->setError("Unexpected memory ordering for AtomicFence"); |
2862 return; | 2854 return; |
2863 } | 2855 } |
2864 _mfence(); | 2856 _mfence(); |
2865 return; | 2857 return; |
2866 case Intrinsics::AtomicFenceAll: | 2858 case Intrinsics::AtomicFenceAll: |
2867 // NOTE: FenceAll should prevent any load/store from being moved | 2859 // NOTE: FenceAll should prevent any load/store from being moved across the |
2868 // across the fence (both atomic and non-atomic). The InstX8632Mfence | 2860 // fence (both atomic and non-atomic). The InstX8632Mfence instruction is |
2869 // instruction is currently marked coarsely as "HasSideEffects". | 2861 // currently marked coarsely as "HasSideEffects". |
2870 _mfence(); | 2862 _mfence(); |
2871 return; | 2863 return; |
2872 case Intrinsics::AtomicIsLockFree: { | 2864 case Intrinsics::AtomicIsLockFree: { |
2873 // X86 is always lock free for 8/16/32/64 bit accesses. | 2865 // X86 is always lock free for 8/16/32/64 bit accesses. |
2874 // TODO(jvoung): Since the result is constant when given a constant | 2866 // TODO(jvoung): Since the result is constant when given a constant byte |
2875 // byte size, this opens up DCE opportunities. | 2867 // size, this opens up DCE opportunities. |
2876 Operand *ByteSize = Instr->getArg(0); | 2868 Operand *ByteSize = Instr->getArg(0); |
2877 Variable *Dest = Instr->getDest(); | 2869 Variable *Dest = Instr->getDest(); |
2878 if (ConstantInteger32 *CI = llvm::dyn_cast<ConstantInteger32>(ByteSize)) { | 2870 if (ConstantInteger32 *CI = llvm::dyn_cast<ConstantInteger32>(ByteSize)) { |
2879 Constant *Result; | 2871 Constant *Result; |
2880 switch (CI->getValue()) { | 2872 switch (CI->getValue()) { |
2881 default: | 2873 default: |
2882 // Some x86-64 processors support the cmpxchg16b intruction, which | 2874 // Some x86-64 processors support the cmpxchg16b intruction, which can |
Jim Stichnoth 2015/09/16 00:01:29: instruction
ascull 2015/09/16 18:30:09: Done.
2883 // can make 16-byte operations lock free (when used with the LOCK | 2875 // make 16-byte operations lock free (when used with the LOCK prefix). |
2884 // prefix). However, that's not supported in 32-bit mode, so just | 2876 // However, that's not supported in 32-bit mode, so just return 0 even |
2885 // return 0 even for large sizes. | 2877 // for large sizes. |
2886 Result = Ctx->getConstantZero(IceType_i32); | 2878 Result = Ctx->getConstantZero(IceType_i32); |
2887 break; | 2879 break; |
2888 case 1: | 2880 case 1: |
2889 case 2: | 2881 case 2: |
2890 case 4: | 2882 case 4: |
2891 case 8: | 2883 case 8: |
2892 Result = Ctx->getConstantInt32(1); | 2884 Result = Ctx->getConstantInt32(1); |
2893 break; | 2885 break; |
2894 } | 2886 } |
2895 _mov(Dest, Result); | 2887 _mov(Dest, Result); |
2896 return; | 2888 return; |
2897 } | 2889 } |
2898 // The PNaCl ABI requires the byte size to be a compile-time constant. | 2890 // The PNaCl ABI requires the byte size to be a compile-time constant. |
2899 Func->setError("AtomicIsLockFree byte size should be compile-time const"); | 2891 Func->setError("AtomicIsLockFree byte size should be compile-time const"); |
2900 return; | 2892 return; |
2901 } | 2893 } |
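The constant folding above is equivalent to this sketch of the x86-32 answer:

    #include <cstdint>

    // 1 for naturally-sized accesses; 0 otherwise, since cmpxchg16b is
    // unavailable in 32-bit mode.
    static int atomicIsLockFree(uint32_t ByteSize) {
      switch (ByteSize) {
      case 1: case 2: case 4: case 8:
        return 1;
      default:
        return 0;
      }
    }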
2902 case Intrinsics::AtomicLoad: { | 2894 case Intrinsics::AtomicLoad: { |
2903 // We require the memory address to be naturally aligned. | 2895 // We require the memory address to be naturally aligned. Given that is the |
2904 // Given that is the case, then normal loads are atomic. | 2896 // case, then normal loads are atomic. |
2905 if (!Intrinsics::isMemoryOrderValid( | 2897 if (!Intrinsics::isMemoryOrderValid( |
2906 ID, getConstantMemoryOrder(Instr->getArg(1)))) { | 2898 ID, getConstantMemoryOrder(Instr->getArg(1)))) { |
2907 Func->setError("Unexpected memory ordering for AtomicLoad"); | 2899 Func->setError("Unexpected memory ordering for AtomicLoad"); |
2908 return; | 2900 return; |
2909 } | 2901 } |
2910 Variable *Dest = Instr->getDest(); | 2902 Variable *Dest = Instr->getDest(); |
2911 if (!Traits::Is64Bit && Dest->getType() == IceType_i64) { | 2903 if (!Traits::Is64Bit && Dest->getType() == IceType_i64) { |
2912 // Follow what GCC does and use a movq instead of what lowerLoad() | 2904 // Follow what GCC does and use a movq instead of what lowerLoad() |
2913 // normally does (split the load into two). | 2905 // normally does (split the load into two). Thus, this skips |
2914 // Thus, this skips load/arithmetic op folding. Load/arithmetic folding | 2906 // load/arithmetic op folding. Load/arithmetic folding can't happen |
2915 // can't happen anyway, since this is x86-32 and integer arithmetic only | 2907 // anyway, since this is x86-32 and integer arithmetic only happens on |
2916 // happens on 32-bit quantities. | 2908 // 32-bit quantities. |
2917 Variable *T = makeReg(IceType_f64); | 2909 Variable *T = makeReg(IceType_f64); |
2918 typename Traits::X86OperandMem *Addr = | 2910 typename Traits::X86OperandMem *Addr = |
2919 formMemoryOperand(Instr->getArg(0), IceType_f64); | 2911 formMemoryOperand(Instr->getArg(0), IceType_f64); |
2920 _movq(T, Addr); | 2912 _movq(T, Addr); |
2921 // Then cast the bits back out of the XMM register to the i64 Dest. | 2913 // Then cast the bits back out of the XMM register to the i64 Dest. |
2922 InstCast *Cast = InstCast::create(Func, InstCast::Bitcast, Dest, T); | 2914 InstCast *Cast = InstCast::create(Func, InstCast::Bitcast, Dest, T); |
2923 lowerCast(Cast); | 2915 lowerCast(Cast); |
2924 // Make sure that the atomic load isn't elided when unused. | 2916 // Make sure that the atomic load isn't elided when unused. |
2925 Context.insert(InstFakeUse::create(Func, Dest->getLo())); | 2917 Context.insert(InstFakeUse::create(Func, Dest->getLo())); |
2926 Context.insert(InstFakeUse::create(Func, Dest->getHi())); | 2918 Context.insert(InstFakeUse::create(Func, Dest->getHi())); |
2927 return; | 2919 return; |
2928 } | 2920 } |
2929 InstLoad *Load = InstLoad::create(Func, Dest, Instr->getArg(0)); | 2921 InstLoad *Load = InstLoad::create(Func, Dest, Instr->getArg(0)); |
2930 lowerLoad(Load); | 2922 lowerLoad(Load); |
2931 // Make sure the atomic load isn't elided when unused, by adding a FakeUse. | 2923 // Make sure the atomic load isn't elided when unused, by adding a FakeUse. |
2932 // Since lowerLoad may fuse the load w/ an arithmetic instruction, | 2924 // Since lowerLoad may fuse the load w/ an arithmetic instruction, insert |
2933 // insert the FakeUse on the last-inserted instruction's dest. | 2925 // the FakeUse on the last-inserted instruction's dest. |
2934 Context.insert( | 2926 Context.insert( |
2935 InstFakeUse::create(Func, Context.getLastInserted()->getDest())); | 2927 InstFakeUse::create(Func, Context.getLastInserted()->getDest())); |
2936 return; | 2928 return; |
2937 } | 2929 } |
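The movq idea corresponds to this SSE2 intrinsics sketch (not Subzero code; atomicity relies on the pointer being naturally aligned, as required above):

    #include <emmintrin.h>
    #include <cstdint>

    static int64_t atomicLoadI64(const int64_t *Ptr) {
      // movq xmm, m64: one indivisible 64-bit load.
      __m128i T = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(Ptr));
      int64_t Result;
      _mm_storel_epi64(reinterpret_cast<__m128i *>(&Result), T); // movq m64, xmm
      return Result;
    }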
2938 case Intrinsics::AtomicRMW: | 2930 case Intrinsics::AtomicRMW: |
2939 if (!Intrinsics::isMemoryOrderValid( | 2931 if (!Intrinsics::isMemoryOrderValid( |
2940 ID, getConstantMemoryOrder(Instr->getArg(3)))) { | 2932 ID, getConstantMemoryOrder(Instr->getArg(3)))) { |
2941 Func->setError("Unexpected memory ordering for AtomicRMW"); | 2933 Func->setError("Unexpected memory ordering for AtomicRMW"); |
2942 return; | 2934 return; |
2943 } | 2935 } |
2944 lowerAtomicRMW( | 2936 lowerAtomicRMW( |
2945 Instr->getDest(), | 2937 Instr->getDest(), |
2946 static_cast<uint32_t>( | 2938 static_cast<uint32_t>( |
2947 llvm::cast<ConstantInteger32>(Instr->getArg(0))->getValue()), | 2939 llvm::cast<ConstantInteger32>(Instr->getArg(0))->getValue()), |
2948 Instr->getArg(1), Instr->getArg(2)); | 2940 Instr->getArg(1), Instr->getArg(2)); |
2949 return; | 2941 return; |
2950 case Intrinsics::AtomicStore: { | 2942 case Intrinsics::AtomicStore: { |
2951 if (!Intrinsics::isMemoryOrderValid( | 2943 if (!Intrinsics::isMemoryOrderValid( |
2952 ID, getConstantMemoryOrder(Instr->getArg(2)))) { | 2944 ID, getConstantMemoryOrder(Instr->getArg(2)))) { |
2953 Func->setError("Unexpected memory ordering for AtomicStore"); | 2945 Func->setError("Unexpected memory ordering for AtomicStore"); |
2954 return; | 2946 return; |
2955 } | 2947 } |
2956 // We require the memory address to be naturally aligned. | 2948 // We require the memory address to be naturally aligned. Given that is the |
2957 // Given that is the case, then normal stores are atomic. | 2949 // case, then normal stores are atomic. Add a fence after the store to make |
2958 // Add a fence after the store to make it visible. | 2950 // it visible. |
2959 Operand *Value = Instr->getArg(0); | 2951 Operand *Value = Instr->getArg(0); |
2960 Operand *Ptr = Instr->getArg(1); | 2952 Operand *Ptr = Instr->getArg(1); |
2961 if (!Traits::Is64Bit && Value->getType() == IceType_i64) { | 2953 if (!Traits::Is64Bit && Value->getType() == IceType_i64) { |
2962 // Use a movq instead of what lowerStore() normally does | 2954 // Use a movq instead of what lowerStore() normally does (split the store |
2963 // (split the store into two), following what GCC does. | 2955 // into two), following what GCC does. Cast the bits from int to an xmm |
2964 // Cast the bits from int to an xmm register first. | 2956 // register first. |
2965 Variable *T = makeReg(IceType_f64); | 2957 Variable *T = makeReg(IceType_f64); |
2966 InstCast *Cast = InstCast::create(Func, InstCast::Bitcast, T, Value); | 2958 InstCast *Cast = InstCast::create(Func, InstCast::Bitcast, T, Value); |
2967 lowerCast(Cast); | 2959 lowerCast(Cast); |
2968 // Then store XMM w/ a movq. | 2960 // Then store XMM w/ a movq. |
2969 typename Traits::X86OperandMem *Addr = | 2961 typename Traits::X86OperandMem *Addr = |
2970 formMemoryOperand(Ptr, IceType_f64); | 2962 formMemoryOperand(Ptr, IceType_f64); |
2971 _storeq(T, Addr); | 2963 _storeq(T, Addr); |
2972 _mfence(); | 2964 _mfence(); |
2973 return; | 2965 return; |
2974 } | 2966 } |
2975 InstStore *Store = InstStore::create(Func, Value, Ptr); | 2967 InstStore *Store = InstStore::create(Func, Value, Ptr); |
2976 lowerStore(Store); | 2968 lowerStore(Store); |
2977 _mfence(); | 2969 _mfence(); |
2978 return; | 2970 return; |
2979 } | 2971 } |
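And the mirror-image store, again as a hedged SSE2 sketch:

    #include <emmintrin.h>
    #include <cstdint>

    static void atomicStoreI64(int64_t *Ptr, int64_t Value) {
      __m128i T = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&Value));
      _mm_storel_epi64(reinterpret_cast<__m128i *>(Ptr), T); // _storeq(T, Addr)
      _mm_mfence();                                          // _mfence()
    }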
2980 case Intrinsics::Bswap: { | 2972 case Intrinsics::Bswap: { |
2981 Variable *Dest = Instr->getDest(); | 2973 Variable *Dest = Instr->getDest(); |
2982 Operand *Val = Instr->getArg(0); | 2974 Operand *Val = Instr->getArg(0); |
2983 // In 32-bit mode, bswap only works on 32-bit arguments, and the | 2975 // In 32-bit mode, bswap only works on 32-bit arguments, and the argument |
2984 // argument must be a register. Use rotate left for 16-bit bswap. | 2976 // must be a register. Use rotate left for 16-bit bswap. |
2985 if (!Traits::Is64Bit && Val->getType() == IceType_i64) { | 2977 if (!Traits::Is64Bit && Val->getType() == IceType_i64) { |
2986 Val = legalizeUndef(Val); | 2978 Val = legalizeUndef(Val); |
2987 Variable *T_Lo = legalizeToReg(loOperand(Val)); | 2979 Variable *T_Lo = legalizeToReg(loOperand(Val)); |
2988 Variable *T_Hi = legalizeToReg(hiOperand(Val)); | 2980 Variable *T_Hi = legalizeToReg(hiOperand(Val)); |
2989 Variable *DestLo = llvm::cast<Variable>(loOperand(Dest)); | 2981 Variable *DestLo = llvm::cast<Variable>(loOperand(Dest)); |
2990 Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest)); | 2982 Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest)); |
2991 _bswap(T_Lo); | 2983 _bswap(T_Lo); |
2992 _bswap(T_Hi); | 2984 _bswap(T_Hi); |
2993 _mov(DestLo, T_Hi); | 2985 _mov(DestLo, T_Hi); |
2994 _mov(DestHi, T_Lo); | 2986 _mov(DestHi, T_Lo); |
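Composing the i64 byte swap from two 32-bit bswaps, as a sketch (__builtin_bswap32 is the GCC/Clang builtin standing in for the bswap instruction):

    #include <cstdint>

    static uint64_t bswap64(uint64_t V) {
      uint32_t Lo = uint32_t(V);        // T_Lo
      uint32_t Hi = uint32_t(V >> 32);  // T_Hi
      // The swapped halves trade places: DestLo gets T_Hi, DestHi gets T_Lo.
      return (uint64_t(__builtin_bswap32(Lo)) << 32) | __builtin_bswap32(Hi);
    }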
(...skipping 68 matching lines...) | |
3063 // another 64-bit wide.) | 3055 // another 64-bit wide.) |
3064 Variable *T_1 = makeReg(IceType_i32); | 3056 Variable *T_1 = makeReg(IceType_i32); |
3065 _mov(T_1, T); | 3057 _mov(T_1, T); |
3066 Variable *T_2 = makeReg(IceType_i64); | 3058 Variable *T_2 = makeReg(IceType_i64); |
3067 _movzx(T_2, T_1); | 3059 _movzx(T_2, T_1); |
3068 _mov(Dest, T_2); | 3060 _mov(Dest, T_2); |
3069 } | 3061 } |
3070 return; | 3062 return; |
3071 } | 3063 } |
3072 case Intrinsics::Ctlz: { | 3064 case Intrinsics::Ctlz: { |
3073 // The "is zero undef" parameter is ignored and we always return | 3065 // The "is zero undef" parameter is ignored and we always return a |
3074 // a well-defined value. | 3066 // well-defined value. |
3075 Operand *Val = legalize(Instr->getArg(0)); | 3067 Operand *Val = legalize(Instr->getArg(0)); |
3076 Operand *FirstVal; | 3068 Operand *FirstVal; |
3077 Operand *SecondVal = nullptr; | 3069 Operand *SecondVal = nullptr; |
3078 if (!Traits::Is64Bit && Val->getType() == IceType_i64) { | 3070 if (!Traits::Is64Bit && Val->getType() == IceType_i64) { |
3079 FirstVal = loOperand(Val); | 3071 FirstVal = loOperand(Val); |
3080 SecondVal = hiOperand(Val); | 3072 SecondVal = hiOperand(Val); |
3081 } else { | 3073 } else { |
3082 FirstVal = Val; | 3074 FirstVal = Val; |
3083 } | 3075 } |
3084 const bool IsCttz = false; | 3076 const bool IsCttz = false; |
3085 lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal, | 3077 lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal, |
3086 SecondVal); | 3078 SecondVal); |
3087 return; | 3079 return; |
3088 } | 3080 } |
3089 case Intrinsics::Cttz: { | 3081 case Intrinsics::Cttz: { |
3090 // The "is zero undef" parameter is ignored and we always return | 3082 // The "is zero undef" parameter is ignored and we always return a |
3091 // a well-defined value. | 3083 // well-defined value. |
3092 Operand *Val = legalize(Instr->getArg(0)); | 3084 Operand *Val = legalize(Instr->getArg(0)); |
3093 Operand *FirstVal; | 3085 Operand *FirstVal; |
3094 Operand *SecondVal = nullptr; | 3086 Operand *SecondVal = nullptr; |
3095 if (!Traits::Is64Bit && Val->getType() == IceType_i64) { | 3087 if (!Traits::Is64Bit && Val->getType() == IceType_i64) { |
3096 FirstVal = hiOperand(Val); | 3088 FirstVal = hiOperand(Val); |
3097 SecondVal = loOperand(Val); | 3089 SecondVal = loOperand(Val); |
3098 } else { | 3090 } else { |
3099 FirstVal = Val; | 3091 FirstVal = Val; |
3100 } | 3092 } |
3101 const bool IsCttz = true; | 3093 const bool IsCttz = true; |
3102 lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal, | 3094 lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal, |
3103 SecondVal); | 3095 SecondVal); |
3104 return; | 3096 return; |
3105 } | 3097 } |
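Whichever half lowerCountZeros() treats as FirstVal, the arithmetic being lowered composes as follows; a sketch where ctlz32/cttz32 are assumed 32-bit primitives returning 32 for a zero input (matching the well-defined-for-zero promise above):

    #include <cstdint>

    static unsigned ctlz32(uint32_t X) { return X ? unsigned(__builtin_clz(X)) : 32; }
    static unsigned cttz32(uint32_t X) { return X ? unsigned(__builtin_ctz(X)) : 32; }

    static unsigned ctlz64(uint32_t Lo, uint32_t Hi) {
      return Hi ? ctlz32(Hi) : 32 + ctlz32(Lo); // leading: high half first
    }
    static unsigned cttz64(uint32_t Lo, uint32_t Hi) {
      return Lo ? cttz32(Lo) : 32 + cttz32(Hi); // trailing: low half first
    }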
3106 case Intrinsics::Fabs: { | 3098 case Intrinsics::Fabs: { |
3107 Operand *Src = legalize(Instr->getArg(0)); | 3099 Operand *Src = legalize(Instr->getArg(0)); |
3108 Type Ty = Src->getType(); | 3100 Type Ty = Src->getType(); |
3109 Variable *Dest = Instr->getDest(); | 3101 Variable *Dest = Instr->getDest(); |
3110 Variable *T = makeVectorOfFabsMask(Ty); | 3102 Variable *T = makeVectorOfFabsMask(Ty); |
3111 // The pand instruction operates on an m128 memory operand, so if | 3103 // The pand instruction operates on an m128 memory operand, so if Src is an |
3112 // Src is an f32 or f64, we need to make sure it's in a register. | 3104 // f32 or f64, we need to make sure it's in a register. |
3113 if (isVectorType(Ty)) { | 3105 if (isVectorType(Ty)) { |
3114 if (llvm::isa<typename Traits::X86OperandMem>(Src)) | 3106 if (llvm::isa<typename Traits::X86OperandMem>(Src)) |
3115 Src = legalizeToReg(Src); | 3107 Src = legalizeToReg(Src); |
3116 } else { | 3108 } else { |
3117 Src = legalizeToReg(Src); | 3109 Src = legalizeToReg(Src); |
3118 } | 3110 } |
3119 _pand(T, Src); | 3111 _pand(T, Src); |
3120 if (isVectorType(Ty)) | 3112 if (isVectorType(Ty)) |
3121 _movp(Dest, T); | 3113 _movp(Dest, T); |
3122 else | 3114 else |
(...skipping 564 matching lines...) | |
3687 | 3679 |
3688 Variable *SrcBase = legalizeToReg(Src); | 3680 Variable *SrcBase = legalizeToReg(Src); |
3689 Variable *DestBase = legalizeToReg(Dest); | 3681 Variable *DestBase = legalizeToReg(Dest); |
3690 | 3682 |
3691 std::tuple<Type, Constant *, Variable *> | 3683 std::tuple<Type, Constant *, Variable *> |
3692 Moves[Traits::MEMMOVE_UNROLL_LIMIT]; | 3684 Moves[Traits::MEMMOVE_UNROLL_LIMIT]; |
3693 Constant *Offset; | 3685 Constant *Offset; |
3694 Variable *Reg; | 3686 Variable *Reg; |
3695 | 3687 |
3696 // Copy the data into registers as the source and destination could overlap | 3688 // Copy the data into registers as the source and destination could overlap |
3697 // so make sure not to clobber the memory. This also means overlapping moves | 3689 // so make sure not to clobber the memory. This also means overlapping |
3698 // can be used as we are taking a safe snapshot of the memory. | 3690 // moves can be used as we are taking a safe snapshot of the memory. |
3699 Type Ty = largestTypeInSize(CountValue); | 3691 Type Ty = largestTypeInSize(CountValue); |
3700 uint32_t TyWidth = typeWidthInBytes(Ty); | 3692 uint32_t TyWidth = typeWidthInBytes(Ty); |
3701 | 3693 |
3702 uint32_t RemainingBytes = CountValue; | 3694 uint32_t RemainingBytes = CountValue; |
3703 int32_t OffsetAmt = (CountValue & ~(TyWidth - 1)) - TyWidth; | 3695 int32_t OffsetAmt = (CountValue & ~(TyWidth - 1)) - TyWidth; |
3704 size_t N = 0; | 3696 size_t N = 0; |
3705 while (RemainingBytes >= TyWidth) { | 3697 while (RemainingBytes >= TyWidth) { |
3706 assert(N <= Traits::MEMMOVE_UNROLL_LIMIT); | 3698 assert(N <= Traits::MEMMOVE_UNROLL_LIMIT); |
3707 Offset = Ctx->getConstantInt32(OffsetAmt); | 3699 Offset = Ctx->getConstantInt32(OffsetAmt); |
3708 Reg = makeReg(Ty); | 3700 Reg = makeReg(Ty); |
(...skipping 180 matching lines...) | |
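The property the unrolled loop depends on is loads-before-stores; a compact sketch of that snapshot discipline (Tmp stands in for the register set, and its size bound for MEMMOVE_UNROLL_LIMIT):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Correct even when [Src, Src+N) and [Dest, Dest+N) overlap, because
    // every byte is read before any byte is written.
    static void memmoveSmall(uint8_t *Dest, const uint8_t *Src, size_t N) {
      uint8_t Tmp[64];
      // assumes N <= sizeof(Tmp); the lowering guarantees the analogous bound
      std::memcpy(Tmp, Src, N);
      std::memcpy(Dest, Tmp, N);
    }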
3889 Str << ", Index="; | 3881 Str << ", Index="; |
3890 if (Index) | 3882 if (Index) |
3891 Index->dump(Func); | 3883 Index->dump(Func); |
3892 else | 3884 else |
3893 Str << "<null>"; | 3885 Str << "<null>"; |
3894 Str << ", Shift=" << Shift << ", Offset=" << Offset << "\n"; | 3886 Str << ", Shift=" << Shift << ", Offset=" << Offset << "\n"; |
3895 } | 3887 } |
3896 | 3888 |
3897 inline bool matchTransitiveAssign(const VariablesMetadata *VMetadata, | 3889 inline bool matchTransitiveAssign(const VariablesMetadata *VMetadata, |
3898 Variable *&Var, const Inst *&Reason) { | 3890 Variable *&Var, const Inst *&Reason) { |
3899 // Var originates from Var=SrcVar ==> | 3891 // Var originates from Var=SrcVar ==> set Var:=SrcVar |
3900 // set Var:=SrcVar | |
3901 if (Var == nullptr) | 3892 if (Var == nullptr) |
3902 return false; | 3893 return false; |
3903 if (const Inst *VarAssign = VMetadata->getSingleDefinition(Var)) { | 3894 if (const Inst *VarAssign = VMetadata->getSingleDefinition(Var)) { |
3904 assert(!VMetadata->isMultiDef(Var)); | 3895 assert(!VMetadata->isMultiDef(Var)); |
3905 if (llvm::isa<InstAssign>(VarAssign)) { | 3896 if (llvm::isa<InstAssign>(VarAssign)) { |
3906 Operand *SrcOp = VarAssign->getSrc(0); | 3897 Operand *SrcOp = VarAssign->getSrc(0); |
3907 assert(SrcOp); | 3898 assert(SrcOp); |
3908 if (Variable *SrcVar = llvm::dyn_cast<Variable>(SrcOp)) { | 3899 if (Variable *SrcVar = llvm::dyn_cast<Variable>(SrcOp)) { |
3909 if (!VMetadata->isMultiDef(SrcVar) && | 3900 if (!VMetadata->isMultiDef(SrcVar) && |
3910 // TODO: ensure SrcVar stays single-BB | 3901 // TODO: ensure SrcVar stays single-BB |
(...skipping 141 matching lines...) | |
4052 Func->resetCurrentNode(); | 4043 Func->resetCurrentNode(); |
4053 if (Func->isVerbose(IceV_AddrOpt)) { | 4044 if (Func->isVerbose(IceV_AddrOpt)) { |
4054 OstreamLocker L(Func->getContext()); | 4045 OstreamLocker L(Func->getContext()); |
4055 Ostream &Str = Func->getContext()->getStrDump(); | 4046 Ostream &Str = Func->getContext()->getStrDump(); |
4056 Str << "\nStarting computeAddressOpt for instruction:\n "; | 4047 Str << "\nStarting computeAddressOpt for instruction:\n "; |
4057 Instr->dumpDecorated(Func); | 4048 Instr->dumpDecorated(Func); |
4058 } | 4049 } |
4059 (void)Offset; // TODO: pattern-match for non-zero offsets. | 4050 (void)Offset; // TODO: pattern-match for non-zero offsets. |
4060 if (Base == nullptr) | 4051 if (Base == nullptr) |
4061 return; | 4052 return; |
4062 // If the Base has more than one use or is live across multiple | 4053 // If the Base has more than one use or is live across multiple blocks, then |
4063 // blocks, then don't go further. Alternatively (?), never consider | 4054 // don't go further. Alternatively (?), never consider a transformation that |
4064 // a transformation that would change a variable that is currently | 4055 // would change a variable that is currently *not* live across basic block |
4065 // *not* live across basic block boundaries into one that *is*. | 4056 // boundaries into one that *is*. |
4066 if (Func->getVMetadata()->isMultiBlock(Base) /* || Base->getUseCount() > 1*/) | 4057 if (Func->getVMetadata()->isMultiBlock(Base) /* || Base->getUseCount() > 1*/) |
4067 return; | 4058 return; |
4068 | 4059 |
4069 const VariablesMetadata *VMetadata = Func->getVMetadata(); | 4060 const VariablesMetadata *VMetadata = Func->getVMetadata(); |
4070 bool Continue = true; | 4061 bool Continue = true; |
4071 while (Continue) { | 4062 while (Continue) { |
4072 const Inst *Reason = nullptr; | 4063 const Inst *Reason = nullptr; |
4073 if (matchTransitiveAssign(VMetadata, Base, Reason) || | 4064 if (matchTransitiveAssign(VMetadata, Base, Reason) || |
4074 matchTransitiveAssign(VMetadata, Index, Reason) || | 4065 matchTransitiveAssign(VMetadata, Index, Reason) || |
4075 matchCombinedBaseIndex(VMetadata, Base, Index, Shift, Reason) || | 4066 matchCombinedBaseIndex(VMetadata, Base, Index, Shift, Reason) || |
(...skipping 87 matching lines...)
4163 Operand *SrcT = Inst->getTrueOperand(); | 4154 Operand *SrcT = Inst->getTrueOperand(); |
4164 Operand *SrcF = Inst->getFalseOperand(); | 4155 Operand *SrcF = Inst->getFalseOperand(); |
4165 Operand *Condition = Inst->getCondition(); | 4156 Operand *Condition = Inst->getCondition(); |
4166 | 4157 |
4167 if (isVectorType(DestTy)) { | 4158 if (isVectorType(DestTy)) { |
4168 Type SrcTy = SrcT->getType(); | 4159 Type SrcTy = SrcT->getType(); |
4169 Variable *T = makeReg(SrcTy); | 4160 Variable *T = makeReg(SrcTy); |
4170 Operand *SrcTRM = legalize(SrcT, Legal_Reg | Legal_Mem); | 4161 Operand *SrcTRM = legalize(SrcT, Legal_Reg | Legal_Mem); |
4171 Operand *SrcFRM = legalize(SrcF, Legal_Reg | Legal_Mem); | 4162 Operand *SrcFRM = legalize(SrcF, Legal_Reg | Legal_Mem); |
4172 if (InstructionSet >= Traits::SSE4_1) { | 4163 if (InstructionSet >= Traits::SSE4_1) { |
4173 // TODO(wala): If the condition operand is a constant, use blendps | 4164 // TODO(wala): If the condition operand is a constant, use blendps or |
4174 // or pblendw. | 4165 // pblendw. |
4175 // | 4166 // |
4176 // Use blendvps or pblendvb to implement select. | 4167 // Use blendvps or pblendvb to implement select. |
4177 if (SrcTy == IceType_v4i1 || SrcTy == IceType_v4i32 || | 4168 if (SrcTy == IceType_v4i1 || SrcTy == IceType_v4i32 || |
4178 SrcTy == IceType_v4f32) { | 4169 SrcTy == IceType_v4f32) { |
4179 Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem); | 4170 Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem); |
4180 Variable *xmm0 = makeReg(IceType_v4i32, Traits::RegisterSet::Reg_xmm0); | 4171 Variable *xmm0 = makeReg(IceType_v4i32, Traits::RegisterSet::Reg_xmm0); |
4181 _movp(xmm0, ConditionRM); | 4172 _movp(xmm0, ConditionRM); |
4182 _psll(xmm0, Ctx->getConstantInt8(31)); | 4173 _psll(xmm0, Ctx->getConstantInt8(31)); |
4183 _movp(T, SrcFRM); | 4174 _movp(T, SrcFRM); |
4184 _blendvps(T, SrcTRM, xmm0); | 4175 _blendvps(T, SrcTRM, xmm0); |
(...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
4241 } | 4232 } |
4242 if (CmpOpnd0 == nullptr) { | 4233 if (CmpOpnd0 == nullptr) { |
4243 CmpOpnd0 = legalize(Condition, Legal_Reg | Legal_Mem); | 4234 CmpOpnd0 = legalize(Condition, Legal_Reg | Legal_Mem); |
4244 CmpOpnd1 = Ctx->getConstantZero(IceType_i32); | 4235 CmpOpnd1 = Ctx->getConstantZero(IceType_i32); |
4245 } | 4236 } |
4246 assert(CmpOpnd0); | 4237 assert(CmpOpnd0); |
4247 assert(CmpOpnd1); | 4238 assert(CmpOpnd1); |
4248 | 4239 |
4249 _cmp(CmpOpnd0, CmpOpnd1); | 4240 _cmp(CmpOpnd0, CmpOpnd1); |
4250 if (typeWidthInBytes(DestTy) == 1 || isFloatingType(DestTy)) { | 4241 if (typeWidthInBytes(DestTy) == 1 || isFloatingType(DestTy)) { |
4251 // The cmov instruction doesn't allow 8-bit or FP operands, so | 4242 // The cmov instruction doesn't allow 8-bit or FP operands, so we need |
4252 // we need explicit control flow. | 4243 // explicit control flow. |
4253 // d=cmp e,f; a=d?b:c ==> cmp e,f; a=b; jne L1; a=c; L1: | 4244 // d=cmp e,f; a=d?b:c ==> cmp e,f; a=b; jne L1; a=c; L1: |
4254 typename Traits::Insts::Label *Label = | 4245 typename Traits::Insts::Label *Label = |
4255 Traits::Insts::Label::create(Func, this); | 4246 Traits::Insts::Label::create(Func, this); |
4256 SrcT = legalize(SrcT, Legal_Reg | Legal_Imm); | 4247 SrcT = legalize(SrcT, Legal_Reg | Legal_Imm); |
4257 _mov(Dest, SrcT); | 4248 _mov(Dest, SrcT); |
4258 _br(Cond, Label); | 4249 _br(Cond, Label); |
4259 SrcF = legalize(SrcF, Legal_Reg | Legal_Imm); | 4250 SrcF = legalize(SrcF, Legal_Reg | Legal_Imm); |
4260 _mov_nonkillable(Dest, SrcF); | 4251 _mov_nonkillable(Dest, SrcF); |
4261 Context.insert(Label); | 4252 Context.insert(Label); |
4262 return; | 4253 return; |
4263 } | 4254 } |
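In C terms, the branch-based lowering just emitted behaves like the sketch below (hypothetical helper, not Subzero API); cmov has no 8-bit form, hence the explicit jump:

    #include <cstdint>

    // Sketch of "cmp e,f; a=b; jne L1; a=c; L1:" for an i8 select.
    inline uint8_t select8(bool Cond, uint8_t SrcT, uint8_t SrcF) {
      uint8_t A = SrcT; // mov Dest, SrcT
      if (!Cond)        // _br(Cond, Label) skips the next mov when Cond holds
        A = SrcF;       // _mov_nonkillable(Dest, SrcF)
      return A;         // Label:
    }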
4264 // mov t, SrcF; cmov_cond t, SrcT; mov dest, t | 4255 // mov t, SrcF; cmov_cond t, SrcT; mov dest, t |
4265 // But if SrcT is immediate, we might be able to do better, as | 4256 // But if SrcT is immediate, we might be able to do better, as the cmov |
4266 // the cmov instruction doesn't allow an immediate operand: | 4257 // instruction doesn't allow an immediate operand: |
4267 // mov t, SrcT; cmov_!cond t, SrcF; mov dest, t | 4258 // mov t, SrcT; cmov_!cond t, SrcF; mov dest, t |
4268 if (llvm::isa<Constant>(SrcT) && !llvm::isa<Constant>(SrcF)) { | 4259 if (llvm::isa<Constant>(SrcT) && !llvm::isa<Constant>(SrcF)) { |
4269 std::swap(SrcT, SrcF); | 4260 std::swap(SrcT, SrcF); |
4270 Cond = InstX86Base<Machine>::getOppositeCondition(Cond); | 4261 Cond = InstX86Base<Machine>::getOppositeCondition(Cond); |
4271 } | 4262 } |
4272 if (!Traits::Is64Bit && DestTy == IceType_i64) { | 4263 if (!Traits::Is64Bit && DestTy == IceType_i64) { |
4273 SrcT = legalizeUndef(SrcT); | 4264 SrcT = legalizeUndef(SrcT); |
4274 SrcF = legalizeUndef(SrcF); | 4265 SrcF = legalizeUndef(SrcF); |
4275 // Set the low portion. | 4266 // Set the low portion. |
4276 Variable *DestLo = llvm::cast<Variable>(loOperand(Dest)); | 4267 Variable *DestLo = llvm::cast<Variable>(loOperand(Dest)); |
(...skipping 339 matching lines...)
4616 | 4607 |
4617 lowerAssign(InstAssign::create(Func, Dest, T)); | 4608 lowerAssign(InstAssign::create(Func, Dest, T)); |
4618 } | 4609 } |
4619 | 4610 |
4620 /// The following pattern occurs often in lowered C and C++ code: | 4611 /// The following pattern occurs often in lowered C and C++ code: |
4621 /// | 4612 /// |
4622 /// %cmp = fcmp/icmp pred <n x ty> %src0, %src1 | 4613 /// %cmp = fcmp/icmp pred <n x ty> %src0, %src1 |
4623 /// %cmp.ext = sext <n x i1> %cmp to <n x ty> | 4614 /// %cmp.ext = sext <n x i1> %cmp to <n x ty> |
4624 /// | 4615 /// |
4625 /// We can eliminate the sext operation by copying the result of pcmpeqd, | 4616 /// We can eliminate the sext operation by copying the result of pcmpeqd, |
4626 /// pcmpgtd, or cmpps (which produce sign extended results) to the result | 4617 /// pcmpgtd, or cmpps (which produce sign extended results) to the result of the |
4627 /// of the sext operation. | 4618 /// sext operation. |
4628 template <class Machine> | 4619 template <class Machine> |
4629 void TargetX86Base<Machine>::eliminateNextVectorSextInstruction( | 4620 void TargetX86Base<Machine>::eliminateNextVectorSextInstruction( |
4630 Variable *SignExtendedResult) { | 4621 Variable *SignExtendedResult) { |
4631 if (InstCast *NextCast = | 4622 if (InstCast *NextCast = |
4632 llvm::dyn_cast_or_null<InstCast>(Context.getNextInst())) { | 4623 llvm::dyn_cast_or_null<InstCast>(Context.getNextInst())) { |
4633 if (NextCast->getCastKind() == InstCast::Sext && | 4624 if (NextCast->getCastKind() == InstCast::Sext && |
4634 NextCast->getSrc(0) == SignExtendedResult) { | 4625 NextCast->getSrc(0) == SignExtendedResult) { |
4635 NextCast->setDeleted(); | 4626 NextCast->setDeleted(); |
4636 _movp(NextCast->getDest(), legalizeToReg(SignExtendedResult)); | 4627 _movp(NextCast->getDest(), legalizeToReg(SignExtendedResult)); |
4637 // Skip over the instruction. | 4628 // Skip over the instruction. |
4638 Context.advanceNext(); | 4629 Context.advanceNext(); |
4639 } | 4630 } |
4640 } | 4631 } |
4641 } | 4632 } |
4642 | 4633 |
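Why the sext is redundant is easiest to see with intrinsics: a packed compare already produces 0 or all-ones per lane, i.e. a sign-extended i1 vector. A small sketch (assumes SSE2):

    #include <emmintrin.h>

    // pcmpgtd lanes are 0x00000000 or 0xFFFFFFFF, so a following
    // "sext <4 x i1> to <4 x i32>" adds no information and can be elided.
    inline __m128i cmpAndSext(__m128i A, __m128i B) {
      __m128i Cmp = _mm_cmpgt_epi32(A, B); // lanes already sign-extended
      return Cmp;                          // the sext would be a plain copy
    }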
4643 template <class Machine> | 4634 template <class Machine> |
4644 void TargetX86Base<Machine>::lowerUnreachable( | 4635 void TargetX86Base<Machine>::lowerUnreachable( |
4645 const InstUnreachable * /*Inst*/) { | 4636 const InstUnreachable * /*Inst*/) { |
4646 _ud2(); | 4637 _ud2(); |
4647 } | 4638 } |
4648 | 4639 |
4649 template <class Machine> | 4640 template <class Machine> |
4650 void TargetX86Base<Machine>::lowerRMW( | 4641 void TargetX86Base<Machine>::lowerRMW( |
4651 const typename Traits::Insts::FakeRMW *RMW) { | 4642 const typename Traits::Insts::FakeRMW *RMW) { |
4652 // If the beacon variable's live range does not end in this | 4643 // If the beacon variable's live range does not end in this instruction, then |
4653 // instruction, then it must end in the modified Store instruction | 4644 // it must end in the modified Store instruction that follows. This means |
4654 // that follows. This means that the original Store instruction is | 4645 // that the original Store instruction is still there, either because the |
4655 // still there, either because the value being stored is used beyond | 4646 // value being stored is used beyond the Store instruction, or because dead |
4656 // the Store instruction, or because dead code elimination did not | 4647 // code elimination did not happen. In either case, we cancel RMW lowering |
4657 // happen. In either case, we cancel RMW lowering (and the caller | 4648 // (and the caller deletes the RMW instruction). |
4658 // deletes the RMW instruction). | |
4659 if (!RMW->isLastUse(RMW->getBeacon())) | 4649 if (!RMW->isLastUse(RMW->getBeacon())) |
4660 return; | 4650 return; |
4661 Operand *Src = RMW->getData(); | 4651 Operand *Src = RMW->getData(); |
4662 Type Ty = Src->getType(); | 4652 Type Ty = Src->getType(); |
4663 typename Traits::X86OperandMem *Addr = formMemoryOperand(RMW->getAddr(), Ty); | 4653 typename Traits::X86OperandMem *Addr = formMemoryOperand(RMW->getAddr(), Ty); |
4664 if (!Traits::Is64Bit && Ty == IceType_i64) { | 4654 if (!Traits::Is64Bit && Ty == IceType_i64) { |
4665 Src = legalizeUndef(Src); | 4655 Src = legalizeUndef(Src); |
4666 Operand *SrcLo = legalize(loOperand(Src), Legal_Reg | Legal_Imm); | 4656 Operand *SrcLo = legalize(loOperand(Src), Legal_Reg | Legal_Imm); |
4667 Operand *SrcHi = legalize(hiOperand(Src), Legal_Reg | Legal_Imm); | 4657 Operand *SrcHi = legalize(hiOperand(Src), Legal_Reg | Legal_Imm); |
4668 typename Traits::X86OperandMem *AddrLo = | 4658 typename Traits::X86OperandMem *AddrLo = |
(...skipping 60 matching lines...)
4729 template <class Machine> | 4719 template <class Machine> |
4730 void TargetX86Base<Machine>::lowerOther(const Inst *Instr) { | 4720 void TargetX86Base<Machine>::lowerOther(const Inst *Instr) { |
4731 if (const auto *RMW = | 4721 if (const auto *RMW = |
4732 llvm::dyn_cast<typename Traits::Insts::FakeRMW>(Instr)) { | 4722 llvm::dyn_cast<typename Traits::Insts::FakeRMW>(Instr)) { |
4733 lowerRMW(RMW); | 4723 lowerRMW(RMW); |
4734 } else { | 4724 } else { |
4735 TargetLowering::lowerOther(Instr); | 4725 TargetLowering::lowerOther(Instr); |
4736 } | 4726 } |
4737 } | 4727 } |
4738 | 4728 |
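For context, the RMW (read-modify-write) pattern recognized by lowerRMW corresponds to C code like the following, where the load, op, and store can fold into a single memory-operand instruction (illustrative, not the Subzero API):

    #include <cstdint>

    // Without folding: mov reg, [p]; add reg, x; mov [p], reg.
    // With folding:    add DWORD PTR [p], x
    inline void rmwAdd(uint32_t *P, uint32_t X) {
      *P += X; // candidate for a single add-to-memory instruction
    }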
4739 /// Turn an i64 Phi instruction into a pair of i32 Phi instructions, to | 4729 /// Turn an i64 Phi instruction into a pair of i32 Phi instructions, to preserve |
4740 /// preserve integrity of liveness analysis. Undef values are also | 4730 /// integrity of liveness analysis. Undef values are also turned into zeroes, |
4741 /// turned into zeroes, since loOperand() and hiOperand() don't expect | 4731 /// since loOperand() and hiOperand() don't expect Undef input. |
4742 /// Undef input. | |
4743 template <class Machine> void TargetX86Base<Machine>::prelowerPhis() { | 4732 template <class Machine> void TargetX86Base<Machine>::prelowerPhis() { |
4744 if (Traits::Is64Bit) { | 4733 if (Traits::Is64Bit) { |
4745 // On x86-64 we don't need to prelower phis -- the architecture can handle | 4734 // On x86-64 we don't need to prelower phis -- the architecture can handle |
4746     // 64-bit integers natively. | 4735     // 64-bit integers natively. |
4747 return; | 4736 return; |
4748 } | 4737 } |
4749 | 4738 |
4750   // Pause constant blinding or pooling; blinding or pooling will be done | 4739   // Pause constant blinding or pooling; blinding or pooling will be done later |
4751   // later during phi lowering assignments. | 4740   // during phi lowering assignments. |
4752 BoolFlagSaver B(RandomizationPoolingPaused, true); | 4741 BoolFlagSaver B(RandomizationPoolingPaused, true); |
4753 PhiLowering::prelowerPhis32Bit<TargetX86Base<Machine>>( | 4742 PhiLowering::prelowerPhis32Bit<TargetX86Base<Machine>>( |
4754 this, Context.getNode(), Func); | 4743 this, Context.getNode(), Func); |
4755 } | 4744 } |
4756 | 4745 |
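The lo/hi split applied to i64 phi operands follows the usual 32-bit decomposition; a minimal sketch of the arithmetic (names are illustrative, not Subzero's):

    #include <cstdint>

    struct SplitI64 {
      uint32_t Lo, Hi;
    };

    // An i64 value becomes two independent i32 halves, each getting its own
    // i32 phi instruction.
    inline SplitI64 splitI64(uint64_t V) {
      return {static_cast<uint32_t>(V), static_cast<uint32_t>(V >> 32)};
    }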
4757 // There is no support for loading or emitting vector constants, so the | 4746 // There is no support for loading or emitting vector constants, so the vector |
4758 // vector values returned from makeVectorOfZeros, makeVectorOfOnes, | 4747 // values returned from makeVectorOfZeros, makeVectorOfOnes, etc. are |
4759 // etc. are initialized with register operations. | 4748 // initialized with register operations. |
4760 // | 4749 // |
4761 // TODO(wala): Add limited support for vector constants so that | 4750 // TODO(wala): Add limited support for vector constants so that complex |
4762 // complex initialization in registers is unnecessary. | 4751 // initialization in registers is unnecessary. |
4763 | 4752 |
4764 template <class Machine> | 4753 template <class Machine> |
4765 Variable *TargetX86Base<Machine>::makeVectorOfZeros(Type Ty, int32_t RegNum) { | 4754 Variable *TargetX86Base<Machine>::makeVectorOfZeros(Type Ty, int32_t RegNum) { |
4766 Variable *Reg = makeReg(Ty, RegNum); | 4755 Variable *Reg = makeReg(Ty, RegNum); |
4767 // Insert a FakeDef, since otherwise the live range of Reg might | 4756 // Insert a FakeDef, since otherwise the live range of Reg might be |
4768 // be overestimated. | 4757 // overestimated. |
4769 Context.insert(InstFakeDef::create(Func, Reg)); | 4758 Context.insert(InstFakeDef::create(Func, Reg)); |
4770 _pxor(Reg, Reg); | 4759 _pxor(Reg, Reg); |
4771 return Reg; | 4760 return Reg; |
4772 } | 4761 } |
4773 | 4762 |
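The pxor reg,reg idiom is the standard dependency-free way to zero a vector register; the intrinsics equivalent is (assumes SSE2):

    #include <emmintrin.h>

    // _mm_setzero_si128() conventionally compiles to "pxor xmm, xmm".
    inline __m128i vectorOfZeros() { return _mm_setzero_si128(); }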
4774 template <class Machine> | 4763 template <class Machine> |
4775 Variable *TargetX86Base<Machine>::makeVectorOfMinusOnes(Type Ty, | 4764 Variable *TargetX86Base<Machine>::makeVectorOfMinusOnes(Type Ty, |
4776 int32_t RegNum) { | 4765 int32_t RegNum) { |
4777 Variable *MinusOnes = makeReg(Ty, RegNum); | 4766 Variable *MinusOnes = makeReg(Ty, RegNum); |
4778 // Insert a FakeDef so the live range of MinusOnes is not overestimated. | 4767 // Insert a FakeDef so the live range of MinusOnes is not overestimated. |
(...skipping 25 matching lines...)
4804 // SSE has no left shift operation for vectors of 8 bit integers. | 4793 // SSE has no left shift operation for vectors of 8 bit integers. |
4805 const uint32_t HIGH_ORDER_BITS_MASK = 0x80808080; | 4794 const uint32_t HIGH_ORDER_BITS_MASK = 0x80808080; |
4806 Constant *ConstantMask = Ctx->getConstantInt32(HIGH_ORDER_BITS_MASK); | 4795 Constant *ConstantMask = Ctx->getConstantInt32(HIGH_ORDER_BITS_MASK); |
4807 Variable *Reg = makeReg(Ty, RegNum); | 4796 Variable *Reg = makeReg(Ty, RegNum); |
4808 _movd(Reg, legalize(ConstantMask, Legal_Reg | Legal_Mem)); | 4797 _movd(Reg, legalize(ConstantMask, Legal_Reg | Legal_Mem)); |
4809 _pshufd(Reg, Reg, Ctx->getConstantZero(IceType_i8)); | 4798 _pshufd(Reg, Reg, Ctx->getConstantZero(IceType_i8)); |
4810 return Reg; | 4799 return Reg; |
4811 } | 4800 } |
4812 } | 4801 } |
4813 | 4802 |
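The movd + pshufd sequence broadcasts the 32-bit mask into every lane; an intrinsics sketch of the same idea (assumes SSE2):

    #include <emmintrin.h>

    // movd places 0x80808080 in lane 0; pshufd with imm8=0 replicates lane 0
    // across all four lanes.
    inline __m128i highOrderBitsMask() {
      __m128i M = _mm_cvtsi32_si128(static_cast<int>(0x80808080u)); // movd
      return _mm_shuffle_epi32(M, 0);                               // pshufd $0
    }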
4814 /// Construct a mask in a register that can be and'ed with a | 4803 /// Construct a mask in a register that can be and'ed with a floating-point |
4815 /// floating-point value to mask off its sign bit. The value will be | 4804 /// value to mask off its sign bit. The value will be <4 x 0x7fffffff> for f32 |
4816 /// <4 x 0x7fffffff> for f32 and v4f32, and <2 x 0x7fffffffffffffff> | 4805 /// and v4f32, and <2 x 0x7fffffffffffffff> for f64. Construct it as vector of |
4817 /// for f64. Construct it as vector of ones logically right shifted | 4806 /// ones logically right shifted one bit. |
4818 /// one bit. TODO(stichnot): Fix the wala TODO above, to represent | 4807 // TODO(stichnot): Fix the wala TODO above, to represent vector |
4819 /// vector constants in memory. | 4808 // constants in memory. |
4820 template <class Machine> | 4809 template <class Machine> |
4821 Variable *TargetX86Base<Machine>::makeVectorOfFabsMask(Type Ty, | 4810 Variable *TargetX86Base<Machine>::makeVectorOfFabsMask(Type Ty, |
4822 int32_t RegNum) { | 4811 int32_t RegNum) { |
4823 Variable *Reg = makeVectorOfMinusOnes(Ty, RegNum); | 4812 Variable *Reg = makeVectorOfMinusOnes(Ty, RegNum); |
4824 _psrl(Reg, Ctx->getConstantInt8(1)); | 4813 _psrl(Reg, Ctx->getConstantInt8(1)); |
4825 return Reg; | 4814 return Reg; |
4826 } | 4815 } |
4827 | 4816 |
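The all-ones-shifted-right trick avoids needing a memory constant; a hedged f32 sketch with intrinsics (assumes SSE2):

    #include <emmintrin.h>

    // pcmpeqd x,x yields all ones; psrld $1 turns each lane into 0x7fffffff.
    // and'ing with that mask clears the sign bit, i.e. a lane-wise fabs.
    inline __m128 fabsV4f32(__m128 V) {
      __m128i Ones = _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128());
      __m128i Mask = _mm_srli_epi32(Ones, 1); // <4 x 0x7fffffff>
      return _mm_and_ps(V, _mm_castsi128_ps(Mask));
    }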
4828 template <class Machine> | 4817 template <class Machine> |
4829 typename TargetX86Base<Machine>::Traits::X86OperandMem * | 4818 typename TargetX86Base<Machine>::Traits::X86OperandMem * |
4830 TargetX86Base<Machine>::getMemoryOperandForStackSlot(Type Ty, Variable *Slot, | 4819 TargetX86Base<Machine>::getMemoryOperandForStackSlot(Type Ty, Variable *Slot, |
4831 uint32_t Offset) { | 4820 uint32_t Offset) { |
4832 // Ensure that Loc is a stack slot. | 4821 // Ensure that Loc is a stack slot. |
4833 assert(Slot->mustNotHaveReg()); | 4822 assert(Slot->mustNotHaveReg()); |
4834 assert(Slot->getRegNum() == Variable::NoRegister); | 4823 assert(Slot->getRegNum() == Variable::NoRegister); |
4835 // Compute the location of Loc in memory. | 4824 // Compute the location of Loc in memory. |
4836   // TODO(wala,stichnot): lea should not be required. The address of | 4825   // TODO(wala,stichnot): lea should not be required. The address of the |
4837   // the stack slot is known at compile time (although not until after | 4826   // stack slot is known at compile time (although not until after |
4838   // addProlog()). | 4827   // addProlog()). |
4839 const Type PointerType = IceType_i32; | 4828 const Type PointerType = IceType_i32; |
4840 Variable *Loc = makeReg(PointerType); | 4829 Variable *Loc = makeReg(PointerType); |
4841 _lea(Loc, Slot); | 4830 _lea(Loc, Slot); |
4842 Constant *ConstantOffset = Ctx->getConstantInt32(Offset); | 4831 Constant *ConstantOffset = Ctx->getConstantInt32(Offset); |
4843 return Traits::X86OperandMem::create(Func, Ty, Loc, ConstantOffset); | 4832 return Traits::X86OperandMem::create(Func, Ty, Loc, ConstantOffset); |
4844 } | 4833 } |
4845 | 4834 |
4846 /// Helper for legalize() to emit the right code to lower an operand to a | 4835 /// Helper for legalize() to emit the right code to lower an operand to a |
4847 /// register of the appropriate type. | 4836 /// register of the appropriate type. |
4848 template <class Machine> | 4837 template <class Machine> |
4849 Variable *TargetX86Base<Machine>::copyToReg(Operand *Src, int32_t RegNum) { | 4838 Variable *TargetX86Base<Machine>::copyToReg(Operand *Src, int32_t RegNum) { |
4850 Type Ty = Src->getType(); | 4839 Type Ty = Src->getType(); |
4851 Variable *Reg = makeReg(Ty, RegNum); | 4840 Variable *Reg = makeReg(Ty, RegNum); |
4852 if (isVectorType(Ty)) { | 4841 if (isVectorType(Ty)) { |
4853 _movp(Reg, Src); | 4842 _movp(Reg, Src); |
4854 } else { | 4843 } else { |
4855 _mov(Reg, Src); | 4844 _mov(Reg, Src); |
4856 } | 4845 } |
4857 return Reg; | 4846 return Reg; |
4858 } | 4847 } |
4859 | 4848 |
4860 template <class Machine> | 4849 template <class Machine> |
4861 Operand *TargetX86Base<Machine>::legalize(Operand *From, LegalMask Allowed, | 4850 Operand *TargetX86Base<Machine>::legalize(Operand *From, LegalMask Allowed, |
4862 int32_t RegNum) { | 4851 int32_t RegNum) { |
4863 Type Ty = From->getType(); | 4852 Type Ty = From->getType(); |
4864 // Assert that a physical register is allowed. To date, all calls | 4853 // Assert that a physical register is allowed. To date, all calls to |
4865 // to legalize() allow a physical register. If a physical register | 4854 // legalize() allow a physical register. If a physical register needs to be |
4866 // needs to be explicitly disallowed, then new code will need to be | 4855 // explicitly disallowed, then new code will need to be written to force a |
4867 // written to force a spill. | 4856 // spill. |
4868 assert(Allowed & Legal_Reg); | 4857 assert(Allowed & Legal_Reg); |
4869 // If we're asking for a specific physical register, make sure we're | 4858 // If we're asking for a specific physical register, make sure we're not |
4870 // not allowing any other operand kinds. (This could be future | 4859 // allowing any other operand kinds. (This could be future work, e.g. allow |
4871 // work, e.g. allow the shl shift amount to be either an immediate | 4860 // the shl shift amount to be either an immediate or in ecx.) |
4872 // or in ecx.) | |
4873 assert(RegNum == Variable::NoRegister || Allowed == Legal_Reg); | 4861 assert(RegNum == Variable::NoRegister || Allowed == Legal_Reg); |
4874 | 4862 |
4875 if (auto Mem = llvm::dyn_cast<typename Traits::X86OperandMem>(From)) { | 4863 if (auto Mem = llvm::dyn_cast<typename Traits::X86OperandMem>(From)) { |
4876 // Before doing anything with a Mem operand, we need to ensure | 4864 // Before doing anything with a Mem operand, we need to ensure that the |
4877 // that the Base and Index components are in physical registers. | 4865 // Base and Index components are in physical registers. |
4878 Variable *Base = Mem->getBase(); | 4866 Variable *Base = Mem->getBase(); |
4879 Variable *Index = Mem->getIndex(); | 4867 Variable *Index = Mem->getIndex(); |
4880 Variable *RegBase = nullptr; | 4868 Variable *RegBase = nullptr; |
4881 Variable *RegIndex = nullptr; | 4869 Variable *RegIndex = nullptr; |
4882 if (Base) { | 4870 if (Base) { |
4883 RegBase = legalizeToReg(Base); | 4871 RegBase = legalizeToReg(Base); |
4884 } | 4872 } |
4885 if (Index) { | 4873 if (Index) { |
4886 RegIndex = legalizeToReg(Index); | 4874 RegIndex = legalizeToReg(Index); |
4887 } | 4875 } |
(...skipping 24 matching lines...)
4912 // If the operand is a 64 bit constant integer we need to legalize it to a | 4900 // If the operand is a 64 bit constant integer we need to legalize it to a |
4913 // register in x86-64. | 4901 // register in x86-64. |
4914 if (Traits::Is64Bit) { | 4902 if (Traits::Is64Bit) { |
4915 if (llvm::isa<ConstantInteger64>(Const)) { | 4903 if (llvm::isa<ConstantInteger64>(Const)) { |
4916 Variable *V = copyToReg(Const, RegNum); | 4904 Variable *V = copyToReg(Const, RegNum); |
4917 V->setMustHaveReg(); | 4905 V->setMustHaveReg(); |
4918 return V; | 4906 return V; |
4919 } | 4907 } |
4920 } | 4908 } |
4921 | 4909 |
4922   // If the operand is a 32 bit constant integer, we should check | 4910   // If the operand is a 32 bit constant integer, we should check whether we |
4923   // whether we need to randomize it or pool it. | 4911   // need to randomize it or pool it. |
4924 if (ConstantInteger32 *C = llvm::dyn_cast<ConstantInteger32>(Const)) { | 4912 if (ConstantInteger32 *C = llvm::dyn_cast<ConstantInteger32>(Const)) { |
4925 Operand *NewConst = randomizeOrPoolImmediate(C, RegNum); | 4913 Operand *NewConst = randomizeOrPoolImmediate(C, RegNum); |
4926 if (NewConst != Const) { | 4914 if (NewConst != Const) { |
4927 return NewConst; | 4915 return NewConst; |
4928 } | 4916 } |
4929 } | 4917 } |
4930 | 4918 |
4931 // Convert a scalar floating point constant into an explicit | 4919 // Convert a scalar floating point constant into an explicit memory |
4932 // memory operand. | 4920 // operand. |
4933 if (isScalarFloatingType(Ty)) { | 4921 if (isScalarFloatingType(Ty)) { |
4934 Variable *Base = nullptr; | 4922 Variable *Base = nullptr; |
4935 std::string Buffer; | 4923 std::string Buffer; |
4936 llvm::raw_string_ostream StrBuf(Buffer); | 4924 llvm::raw_string_ostream StrBuf(Buffer); |
4937 llvm::cast<Constant>(From)->emitPoolLabel(StrBuf); | 4925 llvm::cast<Constant>(From)->emitPoolLabel(StrBuf); |
4938 llvm::cast<Constant>(From)->setShouldBePooled(true); | 4926 llvm::cast<Constant>(From)->setShouldBePooled(true); |
4939 Constant *Offset = Ctx->getConstantSym(0, StrBuf.str(), true); | 4927 Constant *Offset = Ctx->getConstantSym(0, StrBuf.str(), true); |
4940 From = Traits::X86OperandMem::create(Func, Ty, Base, Offset); | 4928 From = Traits::X86OperandMem::create(Func, Ty, Base, Offset); |
4941 } | 4929 } |
4942 bool NeedsReg = false; | 4930 bool NeedsReg = false; |
4943 if (!(Allowed & Legal_Imm) && !isScalarFloatingType(Ty)) | 4931 if (!(Allowed & Legal_Imm) && !isScalarFloatingType(Ty)) |
4944 // Immediate specifically not allowed | 4932 // Immediate specifically not allowed |
4945 NeedsReg = true; | 4933 NeedsReg = true; |
4946 if (!(Allowed & Legal_Mem) && isScalarFloatingType(Ty)) | 4934 if (!(Allowed & Legal_Mem) && isScalarFloatingType(Ty)) |
4947 // On x86, FP constants are lowered to mem operands. | 4935 // On x86, FP constants are lowered to mem operands. |
4948 NeedsReg = true; | 4936 NeedsReg = true; |
4949 if (NeedsReg) { | 4937 if (NeedsReg) { |
4950 From = copyToReg(From, RegNum); | 4938 From = copyToReg(From, RegNum); |
4951 } | 4939 } |
4952 return From; | 4940 return From; |
4953 } | 4941 } |
4954 if (auto Var = llvm::dyn_cast<Variable>(From)) { | 4942 if (auto Var = llvm::dyn_cast<Variable>(From)) { |
4955 // Check if the variable is guaranteed a physical register. This | 4943 // Check if the variable is guaranteed a physical register. This can happen |
4956 // can happen either when the variable is pre-colored or when it is | 4944 // either when the variable is pre-colored or when it is assigned infinite |
4957 // assigned infinite weight. | 4945 // weight. |
4958 bool MustHaveRegister = (Var->hasReg() || Var->mustHaveReg()); | 4946 bool MustHaveRegister = (Var->hasReg() || Var->mustHaveReg()); |
4959 // We need a new physical register for the operand if: | 4947 // We need a new physical register for the operand if: |
4960 // Mem is not allowed and Var isn't guaranteed a physical | 4948 // Mem is not allowed and Var isn't guaranteed a physical |
4961 // register, or | 4949 // register, or |
4962 // RegNum is required and Var->getRegNum() doesn't match. | 4950 // RegNum is required and Var->getRegNum() doesn't match. |
4963 if ((!(Allowed & Legal_Mem) && !MustHaveRegister) || | 4951 if ((!(Allowed & Legal_Mem) && !MustHaveRegister) || |
4964 (RegNum != Variable::NoRegister && RegNum != Var->getRegNum())) { | 4952 (RegNum != Variable::NoRegister && RegNum != Var->getRegNum())) { |
4965 From = copyToReg(From, RegNum); | 4953 From = copyToReg(From, RegNum); |
4966 } | 4954 } |
4967 return From; | 4955 return From; |
4968 } | 4956 } |
4969 llvm_unreachable("Unhandled operand kind in legalize()"); | 4957 llvm_unreachable("Unhandled operand kind in legalize()"); |
4970 return From; | 4958 return From; |
4971 } | 4959 } |
4972 | 4960 |
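The Allowed mask composes as a plain bitset; a minimal sketch of how such checks read (the flag names and values here are assumptions mirroring the Legal_* constants, not copied from the source):

    // Hypothetical mirror of the LegalMask scheme consumed by legalize().
    enum LegalFlags : unsigned {
      LegalReg = 1u << 0, // a physical register is allowed
      LegalImm = 1u << 1, // an immediate is allowed
      LegalMem = 1u << 2, // a memory operand is allowed
    };

    inline bool allows(unsigned Mask, LegalFlags F) { return (Mask & F) != 0; }
    // e.g. allows(LegalReg | LegalMem, LegalImm) is false => copy to register.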
4973 /// Provide a trivial wrapper to legalize() for this common usage. | 4961 /// Provide a trivial wrapper to legalize() for this common usage. |
4974 template <class Machine> | 4962 template <class Machine> |
4975 Variable *TargetX86Base<Machine>::legalizeToReg(Operand *From, int32_t RegNum) { | 4963 Variable *TargetX86Base<Machine>::legalizeToReg(Operand *From, int32_t RegNum) { |
4976 return llvm::cast<Variable>(legalize(From, Legal_Reg, RegNum)); | 4964 return llvm::cast<Variable>(legalize(From, Legal_Reg, RegNum)); |
4977 } | 4965 } |
4978 | 4966 |
4979 /// Legalize undef values to concrete values. | 4967 /// Legalize undef values to concrete values. |
4980 template <class Machine> | 4968 template <class Machine> |
4981 Operand *TargetX86Base<Machine>::legalizeUndef(Operand *From, int32_t RegNum) { | 4969 Operand *TargetX86Base<Machine>::legalizeUndef(Operand *From, int32_t RegNum) { |
4982 Type Ty = From->getType(); | 4970 Type Ty = From->getType(); |
4983 if (llvm::isa<ConstantUndef>(From)) { | 4971 if (llvm::isa<ConstantUndef>(From)) { |
4984 // Lower undefs to zero. Another option is to lower undefs to an | 4972 // Lower undefs to zero. Another option is to lower undefs to an |
4985 // uninitialized register; however, using an uninitialized register | 4973 // uninitialized register; however, using an uninitialized register results |
4986 // results in less predictable code. | 4974 // in less predictable code. |
4987 // | 4975 // |
4988 // If in the future the implementation is changed to lower undef | 4976 // If in the future the implementation is changed to lower undef values to |
4989 // values to uninitialized registers, a FakeDef will be needed: | 4977 // uninitialized registers, a FakeDef will be needed: |
4990 // Context.insert(InstFakeDef::create(Func, Reg)); | 4978 // Context.insert(InstFakeDef::create(Func, Reg)); |
4991 // This is in order to ensure that the live range of Reg is not | 4979 // This is in order to ensure that the live range of Reg is not |
4992 // overestimated. If the constant being lowered is a 64 bit value, | 4980 // overestimated. If the constant being lowered is a 64 bit value, then |
4993 // then the result should be split and the lo and hi components will | 4981 // the result should be split and the lo and hi components will need to go |
4994 // need to go in uninitialized registers. | 4982 // in uninitialized registers. |
4995 if (isVectorType(Ty)) | 4983 if (isVectorType(Ty)) |
4996 return makeVectorOfZeros(Ty, RegNum); | 4984 return makeVectorOfZeros(Ty, RegNum); |
4997 return Ctx->getConstantZero(Ty); | 4985 return Ctx->getConstantZero(Ty); |
4998 } | 4986 } |
4999 return From; | 4987 return From; |
5000 } | 4988 } |
5001 | 4989 |
5002 /// For the cmp instruction, if Src1 is an immediate, or known to be a | 4990 /// For the cmp instruction, if Src1 is an immediate, or known to be a physical |
5003 /// physical register, we can allow Src0 to be a memory operand. | 4991 /// register, we can allow Src0 to be a memory operand. Otherwise, Src0 must be |
5004 /// Otherwise, Src0 must be copied into a physical register. | 4992 /// copied into a physical register. (Actually, either Src0 or Src1 can be |
5005 /// (Actually, either Src0 or Src1 can be chosen for the physical | 4993 /// chosen for the physical register, but unfortunately we have to commit to one |
5006 /// register, but unfortunately we have to commit to one or the other | 4994 /// or the other before register allocation.) |
5007 /// before register allocation.) | |
5008 template <class Machine> | 4995 template <class Machine> |
5009 Operand *TargetX86Base<Machine>::legalizeSrc0ForCmp(Operand *Src0, | 4996 Operand *TargetX86Base<Machine>::legalizeSrc0ForCmp(Operand *Src0, |
5010 Operand *Src1) { | 4997 Operand *Src1) { |
5011 bool IsSrc1ImmOrReg = false; | 4998 bool IsSrc1ImmOrReg = false; |
5012 if (llvm::isa<Constant>(Src1)) { | 4999 if (llvm::isa<Constant>(Src1)) { |
5013 IsSrc1ImmOrReg = true; | 5000 IsSrc1ImmOrReg = true; |
5014 } else if (auto *Var = llvm::dyn_cast<Variable>(Src1)) { | 5001 } else if (auto *Var = llvm::dyn_cast<Variable>(Src1)) { |
5015 if (Var->hasReg()) | 5002 if (Var->hasReg()) |
5016 IsSrc1ImmOrReg = true; | 5003 IsSrc1ImmOrReg = true; |
5017 } | 5004 } |
5018 return legalize(Src0, IsSrc1ImmOrReg ? (Legal_Reg | Legal_Mem) : Legal_Reg); | 5005 return legalize(Src0, IsSrc1ImmOrReg ? (Legal_Reg | Legal_Mem) : Legal_Reg); |
5019 } | 5006 } |
5020 | 5007 |
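The rule encodes x86's one-memory-operand restriction on cmp; a tiny decision sketch (hypothetical helper):

    // cmp [mem], [mem] is not encodable, so Src0 may stay in memory only when
    // Src1 is already an immediate or is guaranteed a register.
    inline bool src0MayBeMemory(bool Src1IsImm, bool Src1HasReg) {
      return Src1IsImm || Src1HasReg;
    }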
5021 template <class Machine> | 5008 template <class Machine> |
5022 typename TargetX86Base<Machine>::Traits::X86OperandMem * | 5009 typename TargetX86Base<Machine>::Traits::X86OperandMem * |
5023 TargetX86Base<Machine>::formMemoryOperand(Operand *Opnd, Type Ty, | 5010 TargetX86Base<Machine>::formMemoryOperand(Operand *Opnd, Type Ty, |
5024 bool DoLegalize) { | 5011 bool DoLegalize) { |
5025 auto *Mem = llvm::dyn_cast<typename Traits::X86OperandMem>(Opnd); | 5012 auto *Mem = llvm::dyn_cast<typename Traits::X86OperandMem>(Opnd); |
5026 // It may be the case that address mode optimization already creates an | 5013 // It may be the case that address mode optimization already creates an |
5027 // Traits::X86OperandMem, so in that case it wouldn't need another level of | 5014 // Traits::X86OperandMem, so in that case it wouldn't need another level of |
5028 // transformation. | 5015 // transformation. |
5029 if (!Mem) { | 5016 if (!Mem) { |
5030 Variable *Base = llvm::dyn_cast<Variable>(Opnd); | 5017 Variable *Base = llvm::dyn_cast<Variable>(Opnd); |
5031 Constant *Offset = llvm::dyn_cast<Constant>(Opnd); | 5018 Constant *Offset = llvm::dyn_cast<Constant>(Opnd); |
5032 assert(Base || Offset); | 5019 assert(Base || Offset); |
5033 if (Offset) { | 5020 if (Offset) { |
5034       // During memory operand building, we do not blind or pool the | 5021       // During memory operand building, we do not blind or pool the constant |
5035       // constant offset; we will work on the whole memory operand as one | 5022       // offset; we will work on the whole memory operand as one entity later, |
5036       // entity later, which saves one instruction. By turning blinding | 5023       // which saves one instruction. By turning blinding and pooling off, |
5037       // and pooling off, we guarantee legalize(Offset) will return a | 5024       // we guarantee legalize(Offset) will return a Constant*. |
5038       // Constant*. |
5039 { | 5025 { |
5040 BoolFlagSaver B(RandomizationPoolingPaused, true); | 5026 BoolFlagSaver B(RandomizationPoolingPaused, true); |
5041 | 5027 |
5042 Offset = llvm::cast<Constant>(legalize(Offset)); | 5028 Offset = llvm::cast<Constant>(legalize(Offset)); |
5043 } | 5029 } |
5044 | 5030 |
5045 assert(llvm::isa<ConstantInteger32>(Offset) || | 5031 assert(llvm::isa<ConstantInteger32>(Offset) || |
5046 llvm::isa<ConstantRelocatable>(Offset)); | 5032 llvm::isa<ConstantRelocatable>(Offset)); |
5047 } | 5033 } |
5048 Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset); | 5034 Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset); |
5049 } | 5035 } |
5050   // Do legalization, which includes randomization/pooling, | 5036   // Do legalization, which includes randomization/pooling, or do just the |
5051   // or do just the randomization/pooling. | 5037   // randomization/pooling. |
5052 return llvm::cast<typename Traits::X86OperandMem>( | 5038 return llvm::cast<typename Traits::X86OperandMem>( |
5053 DoLegalize ? legalize(Mem) : randomizeOrPoolImmediate(Mem)); | 5039 DoLegalize ? legalize(Mem) : randomizeOrPoolImmediate(Mem)); |
5054 } | 5040 } |
5055 | 5041 |
5056 template <class Machine> | 5042 template <class Machine> |
5057 Variable *TargetX86Base<Machine>::makeReg(Type Type, int32_t RegNum) { | 5043 Variable *TargetX86Base<Machine>::makeReg(Type Type, int32_t RegNum) { |
5058 // There aren't any 64-bit integer registers for x86-32. | 5044 // There aren't any 64-bit integer registers for x86-32. |
5059 assert(Traits::Is64Bit || Type != IceType_i64); | 5045 assert(Traits::Is64Bit || Type != IceType_i64); |
5060 Variable *Reg = Func->makeVariable(Type); | 5046 Variable *Reg = Func->makeVariable(Type); |
5061 if (RegNum == Variable::NoRegister) | 5047 if (RegNum == Variable::NoRegister) |
(...skipping 102 matching lines...)
5164 if (Ctx->getFlags().getRandomizeAndPoolImmediatesOption() == | 5150 if (Ctx->getFlags().getRandomizeAndPoolImmediatesOption() == |
5165 RPI_Randomize) { | 5151 RPI_Randomize) { |
5166 // blind the constant | 5152 // blind the constant |
5167 // FROM: | 5153 // FROM: |
5168 // imm | 5154 // imm |
5169 // TO: | 5155 // TO: |
5170 // insert: mov imm+cookie, Reg | 5156 // insert: mov imm+cookie, Reg |
5171 // insert: lea -cookie[Reg], Reg | 5157 // insert: lea -cookie[Reg], Reg |
5172 // => Reg | 5158 // => Reg |
5173 // If we have already assigned a phy register, we must come from | 5159 // If we have already assigned a phy register, we must come from |
5174 // andvancedPhiLowering()=>lowerAssign(). In this case we should reuse | 5160 // andvancedPhiLowering()=>lowerAssign(). In this case we should reuse |
Jim Stichnoth 2015/09/16 00:01:29: advancedPhiLowering
ascull 2015/09/16 18:30:09: Done.
5175     // the assigned register as this assignment is the start of its use-def | 5161     // the assigned register as this assignment is the start of its use-def |
5176     // chain. So we add the RegNum argument here. | 5162     // chain. So we add the RegNum argument here. Note we use the 'lea' |
5177     // Note we use the 'lea' instruction instead of 'xor' to avoid | 5163     // instruction instead of 'xor' to avoid affecting the flags. |
5178     // affecting the flags. |
5179 Variable *Reg = makeReg(IceType_i32, RegNum); | 5164 Variable *Reg = makeReg(IceType_i32, RegNum); |
5180 ConstantInteger32 *Integer = llvm::cast<ConstantInteger32>(Immediate); | 5165 ConstantInteger32 *Integer = llvm::cast<ConstantInteger32>(Immediate); |
5181 uint32_t Value = Integer->getValue(); | 5166 uint32_t Value = Integer->getValue(); |
5182 uint32_t Cookie = Func->getConstantBlindingCookie(); | 5167 uint32_t Cookie = Func->getConstantBlindingCookie(); |
5183 _mov(Reg, Ctx->getConstantInt(IceType_i32, Cookie + Value)); | 5168 _mov(Reg, Ctx->getConstantInt(IceType_i32, Cookie + Value)); |
5184 Constant *Offset = Ctx->getConstantInt(IceType_i32, 0 - Cookie); | 5169 Constant *Offset = Ctx->getConstantInt(IceType_i32, 0 - Cookie); |
5185 _lea(Reg, Traits::X86OperandMem::create(Func, IceType_i32, Reg, Offset, | 5170 _lea(Reg, Traits::X86OperandMem::create(Func, IceType_i32, Reg, Offset, |
5186 nullptr, 0)); | 5171 nullptr, 0)); |
5187 // make sure liveness analysis won't kill this variable, otherwise a | 5172 // make sure liveness analysis won't kill this variable, otherwise a |
5188 // liveness assertion will be triggered. | 5173 // liveness assertion will be triggered. |
5189 _set_dest_nonkillable(); | 5174 _set_dest_nonkillable(); |
5190 if (Immediate->getType() != IceType_i32) { | 5175 if (Immediate->getType() != IceType_i32) { |
5191 Variable *TruncReg = makeReg(Immediate->getType(), RegNum); | 5176 Variable *TruncReg = makeReg(Immediate->getType(), RegNum); |
5192 _mov(TruncReg, Reg); | 5177 _mov(TruncReg, Reg); |
5193 return TruncReg; | 5178 return TruncReg; |
5194 } | 5179 } |
5195 return Reg; | 5180 return Reg; |
5196 } | 5181 } |
5197 if (Ctx->getFlags().getRandomizeAndPoolImmediatesOption() == RPI_Pool) { | 5182 if (Ctx->getFlags().getRandomizeAndPoolImmediatesOption() == RPI_Pool) { |
5198 // pool the constant | 5183 // pool the constant |
5199 // FROM: | 5184 // FROM: |
5200 // imm | 5185 // imm |
5201 // TO: | 5186 // TO: |
5202 // insert: mov $label, Reg | 5187 // insert: mov $label, Reg |
5203 // => Reg | 5188 // => Reg |
5204 assert(Ctx->getFlags().getRandomizeAndPoolImmediatesOption() == RPI_Pool); | 5189 assert(Ctx->getFlags().getRandomizeAndPoolImmediatesOption() == RPI_Pool); |
5205 Immediate->setShouldBePooled(true); | 5190 Immediate->setShouldBePooled(true); |
5206 // if we have already assigned a phy register, we must come from | 5191 // if we have already assigned a phy register, we must come from |
5207 // andvancedPhiLowering()=>lowerAssign(). In this case we should reuse | 5192 // andvancedPhiLowering()=>lowerAssign(). In this case we should reuse |
Jim Stichnoth 2015/09/16 00:01:29: advancedPhiLowering
ascull 2015/09/16 18:30:09: Done.
5208     // the assigned register as this assignment is the start of its use-def | 5193     // the assigned register as this assignment is the start of its use-def |
5209     // chain. So we add the RegNum argument here. | 5194     // chain. So we add the RegNum argument here. |
5210 Variable *Reg = makeReg(Immediate->getType(), RegNum); | 5195 Variable *Reg = makeReg(Immediate->getType(), RegNum); |
5211 IceString Label; | 5196 IceString Label; |
5212 llvm::raw_string_ostream Label_stream(Label); | 5197 llvm::raw_string_ostream Label_stream(Label); |
5213 Immediate->emitPoolLabel(Label_stream); | 5198 Immediate->emitPoolLabel(Label_stream); |
5214 const RelocOffsetT Offset = 0; | 5199 const RelocOffsetT Offset = 0; |
5215 const bool SuppressMangling = true; | 5200 const bool SuppressMangling = true; |
5216 Constant *Symbol = | 5201 Constant *Symbol = |
5217 Ctx->getConstantSym(Offset, Label_stream.str(), SuppressMangling); | 5202 Ctx->getConstantSym(Offset, Label_stream.str(), SuppressMangling); |
(...skipping 13 matching lines...)
5231 typename TargetX86Base<Machine>::Traits::X86OperandMem * | 5216 typename TargetX86Base<Machine>::Traits::X86OperandMem * |
5232 TargetX86Base<Machine>::randomizeOrPoolImmediate( | 5217 TargetX86Base<Machine>::randomizeOrPoolImmediate( |
5233 typename Traits::X86OperandMem *MemOperand, int32_t RegNum) { | 5218 typename Traits::X86OperandMem *MemOperand, int32_t RegNum) { |
5234 assert(MemOperand); | 5219 assert(MemOperand); |
5235 if (Ctx->getFlags().getRandomizeAndPoolImmediatesOption() == RPI_None || | 5220 if (Ctx->getFlags().getRandomizeAndPoolImmediatesOption() == RPI_None || |
5236 RandomizationPoolingPaused == true) { | 5221 RandomizationPoolingPaused == true) { |
5237     // immediate randomization/pooling is turned off | 5222     // immediate randomization/pooling is turned off |
5238 return MemOperand; | 5223 return MemOperand; |
5239 } | 5224 } |
5240 | 5225 |
5241 // If this memory operand is already a randommized one, we do | 5226 // If this memory operand is already a randommized one, we do not randomize |
Jim Stichnoth 2015/09/16 00:01:29: randomized
ascull 2015/09/16 18:30:09: Done.
5242 // not randomize it again. | 5227 // it again. |
5243 if (MemOperand->getRandomized()) | 5228 if (MemOperand->getRandomized()) |
5244 return MemOperand; | 5229 return MemOperand; |
5245 | 5230 |
5246 if (Constant *C = llvm::dyn_cast_or_null<Constant>(MemOperand->getOffset())) { | 5231 if (Constant *C = llvm::dyn_cast_or_null<Constant>(MemOperand->getOffset())) { |
5247 if (C->shouldBeRandomizedOrPooled(Ctx)) { | 5232 if (C->shouldBeRandomizedOrPooled(Ctx)) { |
5248 // The offset of this mem operand should be blinded or pooled | 5233 // The offset of this mem operand should be blinded or pooled |
5249 Ctx->statsUpdateRPImms(); | 5234 Ctx->statsUpdateRPImms(); |
5250 if (Ctx->getFlags().getRandomizeAndPoolImmediatesOption() == | 5235 if (Ctx->getFlags().getRandomizeAndPoolImmediatesOption() == |
5251 RPI_Randomize) { | 5236 RPI_Randomize) { |
5252 // blind the constant offset | 5237 // blind the constant offset |
(...skipping 14 matching lines...)
5267 typename Traits::X86OperandMem *TempMemOperand = | 5252 typename Traits::X86OperandMem *TempMemOperand = |
5268 Traits::X86OperandMem::create(Func, MemOperand->getType(), | 5253 Traits::X86OperandMem::create(Func, MemOperand->getType(), |
5269 MemOperand->getBase(), Mask1); | 5254 MemOperand->getBase(), Mask1); |
5270 // If we have already assigned a physical register, we must come from | 5255 // If we have already assigned a physical register, we must come from |
5271 // advancedPhiLowering()=>lowerAssign(). In this case we should reuse | 5256 // advancedPhiLowering()=>lowerAssign(). In this case we should reuse |
5272       // the assigned register as this assignment is the start of its | 5257       // the assigned register as this assignment is the start of its |
5273 // use-def chain. So we add RegNum argument here. | 5258 // use-def chain. So we add RegNum argument here. |
5274 Variable *RegTemp = makeReg(MemOperand->getOffset()->getType(), RegNum); | 5259 Variable *RegTemp = makeReg(MemOperand->getOffset()->getType(), RegNum); |
5275 _lea(RegTemp, TempMemOperand); | 5260 _lea(RegTemp, TempMemOperand); |
5276       // As the source operand doesn't use the dstreg, we don't need to | 5261       // As the source operand doesn't use the dstreg, we don't need to add |
5277       // add _set_dest_nonkillable(). | 5262       // _set_dest_nonkillable(). But if we use the same Dest Reg, that is, |
5278       // But if we use the same Dest Reg, that is, with RegNum assigned, | 5263       // with RegNum assigned, we should add this _set_dest_nonkillable(). |
5279       // we should add this _set_dest_nonkillable(). |
5280 if (RegNum != Variable::NoRegister) | 5264 if (RegNum != Variable::NoRegister) |
5281 _set_dest_nonkillable(); | 5265 _set_dest_nonkillable(); |
5282 | 5266 |
5283 typename Traits::X86OperandMem *NewMemOperand = | 5267 typename Traits::X86OperandMem *NewMemOperand = |
5284 Traits::X86OperandMem::create(Func, MemOperand->getType(), RegTemp, | 5268 Traits::X86OperandMem::create(Func, MemOperand->getType(), RegTemp, |
5285 Mask2, MemOperand->getIndex(), | 5269 Mask2, MemOperand->getIndex(), |
5286 MemOperand->getShift(), | 5270 MemOperand->getShift(), |
5287 MemOperand->getSegmentRegister()); | 5271 MemOperand->getSegmentRegister()); |
5288 | 5272 |
5289 // Label this memory operand as randomized, so we won't randomize it | 5273 // Label this memory operand as randomized, so we won't randomize it |
5290 // again in case we call legalize() multiple times on this memory | 5274 // again in case we call legalize() multiple times on this memory |
5291 // operand. | 5275 // operand. |
5292 NewMemOperand->setRandomized(true); | 5276 NewMemOperand->setRandomized(true); |
5293 return NewMemOperand; | 5277 return NewMemOperand; |
5294 } | 5278 } |
5295 if (Ctx->getFlags().getRandomizeAndPoolImmediatesOption() == RPI_Pool) { | 5279 if (Ctx->getFlags().getRandomizeAndPoolImmediatesOption() == RPI_Pool) { |
5296 // pool the constant offset | 5280 // pool the constant offset |
5297 // FROM: | 5281 // FROM: |
5298 // offset[base, index, shift] | 5282 // offset[base, index, shift] |
5299 // TO: | 5283 // TO: |
5300 // insert: mov $label, RegTemp | 5284 // insert: mov $label, RegTemp |
5301 // insert: lea [base, RegTemp], RegTemp | 5285 // insert: lea [base, RegTemp], RegTemp |
5302 // =>[RegTemp, index, shift] | 5286 // =>[RegTemp, index, shift] |
5303 assert(Ctx->getFlags().getRandomizeAndPoolImmediatesOption() == | 5287 assert(Ctx->getFlags().getRandomizeAndPoolImmediatesOption() == |
5304 RPI_Pool); | 5288 RPI_Pool); |
5305         // Memory operands should never appear as source operands in phi | 5289         // Memory operands should never appear as source operands in phi lowering |
5306         // lowering assignments, so there is no need to reuse any | 5290         // assignments, so there is no need to reuse any registers here. For |
5307         // registers here. For phi lowering, we should not ask for new | 5291         // phi lowering, we should not ask for new physical registers in |
5308         // physical registers in general. However, if we do meet a memory | 5292         // general. However, if we do meet a memory operand during phi lowering, |
5309         // operand during phi lowering, we should not blind or pool the | 5293         // we should not blind or pool the immediates for now. |
5310         // immediates for now. |
5311 if (RegNum != Variable::NoRegister) | 5294 if (RegNum != Variable::NoRegister) |
5312 return MemOperand; | 5295 return MemOperand; |
5313 Variable *RegTemp = makeReg(IceType_i32); | 5296 Variable *RegTemp = makeReg(IceType_i32); |
5314 IceString Label; | 5297 IceString Label; |
5315 llvm::raw_string_ostream Label_stream(Label); | 5298 llvm::raw_string_ostream Label_stream(Label); |
5316 MemOperand->getOffset()->emitPoolLabel(Label_stream); | 5299 MemOperand->getOffset()->emitPoolLabel(Label_stream); |
5317 MemOperand->getOffset()->setShouldBePooled(true); | 5300 MemOperand->getOffset()->setShouldBePooled(true); |
5318 const RelocOffsetT SymOffset = 0; | 5301 const RelocOffsetT SymOffset = 0; |
5319 bool SuppressMangling = true; | 5302 bool SuppressMangling = true; |
5320 Constant *Symbol = Ctx->getConstantSym(SymOffset, Label_stream.str(), | 5303 Constant *Symbol = Ctx->getConstantSym(SymOffset, Label_stream.str(), |
(...skipping 25 matching lines...)
5346 } | 5329 } |
5347   // The offset is not eligible for blinding or pooling, so return the | 5330   // The offset is not eligible for blinding or pooling, so return the |
5348   // original mem operand. | 5331   // original mem operand. |
5349 return MemOperand; | 5332 return MemOperand; |
5350 } | 5333 } |
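The blinding transformation above relies on modular wrap-around: adding the cookie at mov time and subtracting it in the lea displacement recovers the original immediate. A quick check of the identity (illustrative):

    #include <cstdint>

    // mov imm+cookie, Reg; lea -cookie[Reg], Reg  ==> Reg holds imm again,
    // since uint32_t arithmetic is mod 2^32 for any cookie value.
    inline uint32_t blindAndRecover(uint32_t Imm, uint32_t Cookie) {
      uint32_t Reg = Imm + Cookie; // only the blinded value appears inline
      return Reg - Cookie;         // the lea displacement undoes it
    }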
5351 | 5334 |
5352 } // end of namespace X86Internal | 5335 } // end of namespace X86Internal |
5353 } // end of namespace Ice | 5336 } // end of namespace Ice |
5354 | 5337 |
5355 #endif // SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H | 5338 #endif // SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H |