OLD | NEW |
1 //===- subzero/src/IceTargetLoweringX86BaseImpl.h - x86 lowering -*- C++ -*-==// | 1 //===- subzero/src/IceTargetLoweringX86BaseImpl.h - x86 lowering -*- C++ -*-==// |
2 // | 2 // |
3 // The Subzero Code Generator | 3 // The Subzero Code Generator |
4 // | 4 // |
5 // This file is distributed under the University of Illinois Open Source | 5 // This file is distributed under the University of Illinois Open Source |
6 // License. See LICENSE.TXT for details. | 6 // License. See LICENSE.TXT for details. |
7 // | 7 // |
8 //===----------------------------------------------------------------------===// | 8 //===----------------------------------------------------------------------===// |
9 /// | 9 /// |
10 /// \file | 10 /// \file |
11 /// This file implements the TargetLoweringX86Base class, which | 11 /// This file implements the TargetLoweringX86Base class, which consists almost |
12 /// consists almost entirely of the lowering sequence for each | 12 /// entirely of the lowering sequence for each high-level instruction. |
13 /// high-level instruction. | |
14 /// | 13 /// |
15 //===----------------------------------------------------------------------===// | 14 //===----------------------------------------------------------------------===// |
16 | 15 |
17 #ifndef SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H | 16 #ifndef SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H |
18 #define SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H | 17 #define SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H |
19 | 18 |
20 #include "IceCfg.h" | 19 #include "IceCfg.h" |
21 #include "IceCfgNode.h" | 20 #include "IceCfgNode.h" |
22 #include "IceClFlags.h" | 21 #include "IceClFlags.h" |
23 #include "IceDefs.h" | 22 #include "IceDefs.h" |
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
56 | 55 |
57 public: | 56 public: |
58 BoolFoldingEntry() = default; | 57 BoolFoldingEntry() = default; |
59 explicit BoolFoldingEntry(Inst *I); | 58 explicit BoolFoldingEntry(Inst *I); |
60 BoolFoldingEntry &operator=(const BoolFoldingEntry &) = default; | 59 BoolFoldingEntry &operator=(const BoolFoldingEntry &) = default; |
61 /// Instr is the instruction producing the i1-type variable of interest. | 60 /// Instr is the instruction producing the i1-type variable of interest. |
62 Inst *Instr = nullptr; | 61 Inst *Instr = nullptr; |
63 /// IsComplex is the cached result of BoolFolding::hasComplexLowering(Instr). | 62 /// IsComplex is the cached result of BoolFolding::hasComplexLowering(Instr). |
64 bool IsComplex = false; | 63 bool IsComplex = false; |
65 /// IsLiveOut is initialized conservatively to true, and is set to false when | 64 /// IsLiveOut is initialized conservatively to true, and is set to false when |
66 /// we encounter an instruction that ends Var's live range. We disable the | 65 /// we encounter an instruction that ends Var's live range. We disable the |
67 /// folding optimization when Var is live beyond this basic block. Note that | 66 /// folding optimization when Var is live beyond this basic block. Note that |
68 /// if liveness analysis is not performed (e.g. in Om1 mode), IsLiveOut will | 67 /// if liveness analysis is not performed (e.g. in Om1 mode), IsLiveOut will |
69 /// always be true and the folding optimization will never be performed. | 68 /// always be true and the folding optimization will never be performed. |
70 bool IsLiveOut = true; | 69 bool IsLiveOut = true; |
71 // NumUses counts the number of times Var is used as a source operand in the | 70 // NumUses counts the number of times Var is used as a source operand in the |
72 // basic block. If IsComplex is true and there is more than one use of Var, | 71 // basic block. If IsComplex is true and there is more than one use of Var, |
73 // then the folding optimization is disabled for Var. | 72 // then the folding optimization is disabled for Var. |
74 uint32_t NumUses = 0; | 73 uint32_t NumUses = 0; |
75 }; | 74 }; |
76 | 75 |
77 template <class MachineTraits> class BoolFolding { | 76 template <class MachineTraits> class BoolFolding { |
78 public: | 77 public: |
79 enum BoolFoldingProducerKind { | 78 enum BoolFoldingProducerKind { |
80 PK_None, | 79 PK_None, |
81 // TODO(jpp): PK_Icmp32 is no longer meaningful. Rename to PK_IcmpNative. | 80 // TODO(jpp): PK_Icmp32 is no longer meaningful. Rename to PK_IcmpNative. |
82 PK_Icmp32, | 81 PK_Icmp32, |
(...skipping 76 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
159 case InstCast::Zext: | 158 case InstCast::Zext: |
160 return CK_Zext; | 159 return CK_Zext; |
161 } | 160 } |
162 } | 161 } |
163 return CK_None; | 162 return CK_None; |
164 } | 163 } |
165 | 164 |
166 /// Returns true if the producing instruction has a "complex" lowering sequence. | 165 /// Returns true if the producing instruction has a "complex" lowering sequence. |
167 /// This generally means that its lowering sequence requires more than one | 166 /// This generally means that its lowering sequence requires more than one |
168 /// conditional branch, namely 64-bit integer compares and some floating-point | 167 /// conditional branch, namely 64-bit integer compares and some floating-point |
169 /// compares. When this is true, and there is more than one consumer, we prefer | 168 /// compares. When this is true, and there is more than one consumer, we prefer |
170 /// to disable the folding optimization because it minimizes branches. | 169 /// to disable the folding optimization because it minimizes branches. |
171 template <class MachineTraits> | 170 template <class MachineTraits> |
172 bool BoolFolding<MachineTraits>::hasComplexLowering(const Inst *Instr) { | 171 bool BoolFolding<MachineTraits>::hasComplexLowering(const Inst *Instr) { |
173 switch (getProducerKind(Instr)) { | 172 switch (getProducerKind(Instr)) { |
174 default: | 173 default: |
175 return false; | 174 return false; |
176 case PK_Icmp64: | 175 case PK_Icmp64: |
177 return true; | 176 return true; |
178 case PK_Fcmp: | 177 case PK_Fcmp: |
179 return MachineTraits::TableFcmp[llvm::cast<InstFcmp>(Instr)->getCondition()] | 178 return MachineTraits::TableFcmp[llvm::cast<InstFcmp>(Instr)->getCondition()] |
(...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
215 } | 214 } |
216 for (auto &I : Producers) { | 215 for (auto &I : Producers) { |
217 // Ignore entries previously marked invalid. | 216 // Ignore entries previously marked invalid. |
218 if (I.second.Instr == nullptr) | 217 if (I.second.Instr == nullptr) |
219 continue; | 218 continue; |
220 // Disable the producer if its dest may be live beyond this block. | 219 // Disable the producer if its dest may be live beyond this block. |
221 if (I.second.IsLiveOut) { | 220 if (I.second.IsLiveOut) { |
222 setInvalid(I.first); | 221 setInvalid(I.first); |
223 continue; | 222 continue; |
224 } | 223 } |
225 // Mark as "dead" rather than outright deleting. This is so that other | 224 // Mark as "dead" rather than outright deleting. This is so that other |
226 // peephole style optimizations during or before lowering have access to | 225 // peephole style optimizations during or before lowering have access to |
227 // this instruction in undeleted form. See for example | 226 // this instruction in undeleted form. See for example |
228 // tryOptimizedCmpxchgCmpBr(). | 227 // tryOptimizedCmpxchgCmpBr(). |
229 I.second.Instr->setDead(); | 228 I.second.Instr->setDead(); |
230 } | 229 } |
231 } | 230 } |
232 | 231 |
233 template <class MachineTraits> | 232 template <class MachineTraits> |
234 const Inst * | 233 const Inst * |
235 BoolFolding<MachineTraits>::getProducerFor(const Operand *Opnd) const { | 234 BoolFolding<MachineTraits>::getProducerFor(const Operand *Opnd) const { |
236 auto *Var = llvm::dyn_cast<const Variable>(Opnd); | 235 auto *Var = llvm::dyn_cast<const Variable>(Opnd); |
237 if (Var == nullptr) | 236 if (Var == nullptr) |
(...skipping 58 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
296 if (Func->hasError()) | 295 if (Func->hasError()) |
297 return; | 296 return; |
298 Func->deletePhis(); | 297 Func->deletePhis(); |
299 if (Func->hasError()) | 298 if (Func->hasError()) |
300 return; | 299 return; |
301 Func->dump("After Phi lowering"); | 300 Func->dump("After Phi lowering"); |
302 } | 301 } |
303 | 302 |
304 // Run this early so it can be used to focus optimizations on potentially hot | 303 // Run this early so it can be used to focus optimizations on potentially hot |
305 // code. | 304 // code. |
306 // TODO(stichnot,ascull): currently only used for regalloc not expensive high | 305 // TODO(stichnot,ascull): currently only used for regalloc not |
307 // level optimizations which could be focused on potentially hot code. | 306 // expensive high level optimizations which could be focused on potentially |
| 307 // hot code. |
308 Func->computeLoopNestDepth(); | 308 Func->computeLoopNestDepth(); |
309 Func->dump("After loop nest depth analysis"); | 309 Func->dump("After loop nest depth analysis"); |
310 | 310 |
311 // Address mode optimization. | 311 // Address mode optimization. |
312 Func->getVMetadata()->init(VMK_SingleDefs); | 312 Func->getVMetadata()->init(VMK_SingleDefs); |
313 Func->doAddressOpt(); | 313 Func->doAddressOpt(); |
314 | 314 |
315 // Find read-modify-write opportunities. Do this after address mode | 315 // Find read-modify-write opportunities. Do this after address mode |
316 // optimization so that doAddressOpt() doesn't need to be applied to RMW | 316 // optimization so that doAddressOpt() doesn't need to be applied to RMW |
317 // instructions as well. | 317 // instructions as well. |
318 findRMW(); | 318 findRMW(); |
319 Func->dump("After RMW transform"); | 319 Func->dump("After RMW transform"); |
320 | 320 |
321 // Argument lowering | 321 // Argument lowering |
322 Func->doArgLowering(); | 322 Func->doArgLowering(); |
323 | 323 |
324 // Target lowering. This requires liveness analysis for some parts of the | 324 // Target lowering. This requires liveness analysis for some parts of the |
325 // lowering decisions, such as compare/branch fusing. If non-lightweight | 325 // lowering decisions, such as compare/branch fusing. If non-lightweight |
326 // liveness analysis is used, the instructions need to be renumbered first | 326 // liveness analysis is used, the instructions need to be renumbered first |
327 // TODO: This renumbering should only be necessary if we're actually | 327 // TODO: This renumbering should only be necessary if we're actually |
328 // calculating live intervals, which we only do for register allocation. | 328 // calculating live intervals, which we only do for register allocation. |
329 Func->renumberInstructions(); | 329 Func->renumberInstructions(); |
330 if (Func->hasError()) | 330 if (Func->hasError()) |
331 return; | 331 return; |
332 | 332 |
333 // TODO: It should be sufficient to use the fastest liveness calculation, i.e. | 333 // TODO: It should be sufficient to use the fastest liveness calculation, |
334 // livenessLightweight(). However, for some reason that slows down the rest | 334 // i.e. livenessLightweight(). However, for some reason that slows down the |
335 // of the translation. Investigate. | 335 // rest of the translation. Investigate. |
336 Func->liveness(Liveness_Basic); | 336 Func->liveness(Liveness_Basic); |
337 if (Func->hasError()) | 337 if (Func->hasError()) |
338 return; | 338 return; |
339 Func->dump("After x86 address mode opt"); | 339 Func->dump("After x86 address mode opt"); |
340 | 340 |
341 // Disable constant blinding or pooling for load optimization. | 341 // Disable constant blinding or pooling for load optimization. |
342 { | 342 { |
343 BoolFlagSaver B(RandomizationPoolingPaused, true); | 343 BoolFlagSaver B(RandomizationPoolingPaused, true); |
344 doLoadOpt(); | 344 doLoadOpt(); |
345 } | 345 } |
346 Func->genCode(); | 346 Func->genCode(); |
347 if (Func->hasError()) | 347 if (Func->hasError()) |
348 return; | 348 return; |
349 Func->dump("After x86 codegen"); | 349 Func->dump("After x86 codegen"); |
350 | 350 |
351 // Register allocation. This requires instruction renumbering and full | 351 // Register allocation. This requires instruction renumbering and full |
352 // liveness analysis. Loops must be identified before liveness so variable | 352 // liveness analysis. Loops must be identified before liveness so variable |
353 // use weights are correct. | 353 // use weights are correct. |
354 Func->renumberInstructions(); | 354 Func->renumberInstructions(); |
355 if (Func->hasError()) | 355 if (Func->hasError()) |
356 return; | 356 return; |
357 Func->liveness(Liveness_Intervals); | 357 Func->liveness(Liveness_Intervals); |
358 if (Func->hasError()) | 358 if (Func->hasError()) |
359 return; | 359 return; |
360 // Validate the live range computations. The expensive validation call is | 360 // Validate the live range computations. The expensive validation call is |
361 // deliberately only made when assertions are enabled. | 361 // deliberately only made when assertions are enabled. |
362 assert(Func->validateLiveness()); | 362 assert(Func->validateLiveness()); |
363 // The post-codegen dump is done here, after liveness analysis and associated | 363 // The post-codegen dump is done here, after liveness analysis and associated |
364 // cleanup, to make the dump cleaner and more useful. | 364 // cleanup, to make the dump cleaner and more useful. |
365 Func->dump("After initial x8632 codegen"); | 365 Func->dump("After initial x8632 codegen"); |
366 Func->getVMetadata()->init(VMK_All); | 366 Func->getVMetadata()->init(VMK_All); |
367 regAlloc(RAK_Global); | 367 regAlloc(RAK_Global); |
368 if (Func->hasError()) | 368 if (Func->hasError()) |
369 return; | 369 return; |
370 Func->dump("After linear scan regalloc"); | 370 Func->dump("After linear scan regalloc"); |
371 | 371 |
372 if (Ctx->getFlags().getPhiEdgeSplit()) { | 372 if (Ctx->getFlags().getPhiEdgeSplit()) { |
373 Func->advancedPhiLowering(); | 373 Func->advancedPhiLowering(); |
374 Func->dump("After advanced Phi lowering"); | 374 Func->dump("After advanced Phi lowering"); |
375 } | 375 } |
376 | 376 |
377 // Stack frame mapping. | 377 // Stack frame mapping. |
378 Func->genFrame(); | 378 Func->genFrame(); |
379 if (Func->hasError()) | 379 if (Func->hasError()) |
380 return; | 380 return; |
381 Func->dump("After stack frame mapping"); | 381 Func->dump("After stack frame mapping"); |
382 | 382 |
383 Func->contractEmptyNodes(); | 383 Func->contractEmptyNodes(); |
384 Func->reorderNodes(); | 384 Func->reorderNodes(); |
385 | 385 |
386 // Shuffle basic block order if -reorder-basic-blocks is enabled. | 386 // Shuffle basic block order if -reorder-basic-blocks is enabled. |
387 Func->shuffleNodes(); | 387 Func->shuffleNodes(); |
388 | 388 |
389 // Branch optimization. This needs to be done just before code emission. In | 389 // Branch optimization. This needs to be done just before code emission. In |
390 // particular, no transformations that insert or reorder CfgNodes should be | 390 // particular, no transformations that insert or reorder CfgNodes should be |
391 // done after branch optimization. We go ahead and do it before nop insertion | 391 // done after branch optimization. We go ahead and do it before nop insertion |
392 // to reduce the amount of work needed for searching for opportunities. | 392 // to reduce the amount of work needed for searching for opportunities. |
393 Func->doBranchOpt(); | 393 Func->doBranchOpt(); |
394 Func->dump("After branch optimization"); | 394 Func->dump("After branch optimization"); |
395 | 395 |
396 // Nop insertion if -nop-insertion is enabled. | 396 // Nop insertion if -nop-insertion is enabled. |
397 Func->doNopInsertion(); | 397 Func->doNopInsertion(); |
398 | 398 |
399 // Mark nodes that require sandbox alignment | 399 // Mark nodes that require sandbox alignment |
400 if (Ctx->getFlags().getUseSandboxing()) | 400 if (Ctx->getFlags().getUseSandboxing()) |
401 Func->markNodesForSandboxing(); | 401 Func->markNodesForSandboxing(); |
(...skipping 86 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
488 } | 488 } |
489 return false; | 489 return false; |
490 } | 490 } |
491 | 491 |
492 template <class Machine> void TargetX86Base<Machine>::findRMW() { | 492 template <class Machine> void TargetX86Base<Machine>::findRMW() { |
493 Func->dump("Before RMW"); | 493 Func->dump("Before RMW"); |
494 OstreamLocker L(Func->getContext()); | 494 OstreamLocker L(Func->getContext()); |
495 Ostream &Str = Func->getContext()->getStrDump(); | 495 Ostream &Str = Func->getContext()->getStrDump(); |
496 for (CfgNode *Node : Func->getNodes()) { | 496 for (CfgNode *Node : Func->getNodes()) { |
497 // Walk through the instructions, considering each sequence of 3 | 497 // Walk through the instructions, considering each sequence of 3 |
498 // instructions, and look for the particular RMW pattern. Note that this | 498 // instructions, and look for the particular RMW pattern. Note that this |
499 // search can be "broken" (false negatives) if there are intervening deleted | 499 // search can be "broken" (false negatives) if there are intervening |
500 // instructions, or intervening instructions that could be safely moved out | 500 // deleted instructions, or intervening instructions that could be safely |
501 // of the way to reveal an RMW pattern. | 501 // moved out of the way to reveal an RMW pattern. |
502 auto E = Node->getInsts().end(); | 502 auto E = Node->getInsts().end(); |
503 auto I1 = E, I2 = E, I3 = Node->getInsts().begin(); | 503 auto I1 = E, I2 = E, I3 = Node->getInsts().begin(); |
504 for (; I3 != E; I1 = I2, I2 = I3, ++I3) { | 504 for (; I3 != E; I1 = I2, I2 = I3, ++I3) { |
505 // Make I3 skip over deleted instructions. | 505 // Make I3 skip over deleted instructions. |
506 while (I3 != E && I3->isDeleted()) | 506 while (I3 != E && I3->isDeleted()) |
507 ++I3; | 507 ++I3; |
508 if (I1 == E || I2 == E || I3 == E) | 508 if (I1 == E || I2 == E || I3 == E) |
509 continue; | 509 continue; |
510 assert(!I1->isDeleted()); | 510 assert(!I1->isDeleted()); |
511 assert(!I2->isDeleted()); | 511 assert(!I2->isDeleted()); |
512 assert(!I3->isDeleted()); | 512 assert(!I3->isDeleted()); |
513 if (auto *Load = llvm::dyn_cast<InstLoad>(I1)) { | 513 if (auto *Load = llvm::dyn_cast<InstLoad>(I1)) { |
514 if (auto *Arith = llvm::dyn_cast<InstArithmetic>(I2)) { | 514 if (auto *Arith = llvm::dyn_cast<InstArithmetic>(I2)) { |
515 if (auto *Store = llvm::dyn_cast<InstStore>(I3)) { | 515 if (auto *Store = llvm::dyn_cast<InstStore>(I3)) { |
516 // Look for: | 516 // Look for: |
517 // a = Load addr | 517 // a = Load addr |
518 // b = <op> a, other | 518 // b = <op> a, other |
519 // Store b, addr | 519 // Store b, addr |
520 // Change to: | 520 // Change to: |
521 // a = Load addr | 521 // a = Load addr |
522 // b = <op> a, other | 522 // b = <op> a, other |
523 // x = FakeDef | 523 // x = FakeDef |
524 // RMW <op>, addr, other, x | 524 // RMW <op>, addr, other, x |
525 // b = Store b, addr, x | 525 // b = Store b, addr, x |
526 // Note that inferTwoAddress() makes sure setDestNonKillable() gets | 526 // Note that inferTwoAddress() makes sure setDestNonKillable() gets |
527 // called on the updated Store instruction, to avoid liveness | 527 // called on the updated Store instruction, to avoid liveness |
528 // problems later. | 528 // problems later. |
529 // | 529 // |
530 // With this transformation, the Store instruction acquires a Dest | 530 // With this transformation, the Store instruction acquires a Dest |
531 // variable and is now subject to dead code elimination if there are | 531 // variable and is now subject to dead code elimination if there |
532 // no more uses of "b". Variable "x" is a beacon for determining | 532 // are no more uses of "b". Variable "x" is a beacon for |
533 // whether the Store instruction gets dead-code eliminated. If the | 533 // determining whether the Store instruction gets dead-code |
534 // Store instruction is eliminated, then it must be the case that | 534 // eliminated. If the Store instruction is eliminated, then it |
535 // the RMW instruction ends x's live range, and therefore the RMW | 535 // must be the case that the RMW instruction ends x's live range, |
536 // instruction will be retained and later lowered. On the other | 536 // and therefore the RMW instruction will be retained and later |
537 // hand, if the RMW instruction does not end x's live range, then | 537 // lowered. On the other hand, if the RMW instruction does not end |
538 // the Store instruction must still be present, and therefore the | 538 // x's live range, then the Store instruction must still be |
539 // RMW instruction is ignored during lowering because it is | 539 // present, and therefore the RMW instruction is ignored during |
540 // redundant with the Store instruction. | 540 // lowering because it is redundant with the Store instruction. |
541 // | 541 // |
542 // Note that if "a" has further uses, the RMW transformation may | 542 // Note that if "a" has further uses, the RMW transformation may |
543 // still trigger, resulting in two loads and one store, which is | 543 // still trigger, resulting in two loads and one store, which is |
544 // worse than the original one load and one store. However, this is | 544 // worse than the original one load and one store. However, this |
545 // probably rare, and caching probably keeps it just as fast. | 545 // is probably rare, and caching probably keeps it just as fast. |
546 if (!isSameMemAddressOperand<Machine>(Load->getSourceAddress(), | 546 if (!isSameMemAddressOperand<Machine>(Load->getSourceAddress(), |
547 Store->getAddr())) | 547 Store->getAddr())) |
548 continue; | 548 continue; |
549 Operand *ArithSrcFromLoad = Arith->getSrc(0); | 549 Operand *ArithSrcFromLoad = Arith->getSrc(0); |
550 Operand *ArithSrcOther = Arith->getSrc(1); | 550 Operand *ArithSrcOther = Arith->getSrc(1); |
551 if (ArithSrcFromLoad != Load->getDest()) { | 551 if (ArithSrcFromLoad != Load->getDest()) { |
552 if (!Arith->isCommutative() || ArithSrcOther != Load->getDest()) | 552 if (!Arith->isCommutative() || ArithSrcOther != Load->getDest()) |
553 continue; | 553 continue; |
554 std::swap(ArithSrcFromLoad, ArithSrcOther); | 554 std::swap(ArithSrcFromLoad, ArithSrcOther); |
555 } | 555 } |
(...skipping 26 matching lines...) Expand all Loading... |
582 } | 582 } |
583 | 583 |
584 // Converts a ConstantInteger32 operand into its constant value, or | 584 // Converts a ConstantInteger32 operand into its constant value, or |
585 // MemoryOrderInvalid if the operand is not a ConstantInteger32. | 585 // MemoryOrderInvalid if the operand is not a ConstantInteger32. |
586 inline uint64_t getConstantMemoryOrder(Operand *Opnd) { | 586 inline uint64_t getConstantMemoryOrder(Operand *Opnd) { |
587 if (auto Integer = llvm::dyn_cast<ConstantInteger32>(Opnd)) | 587 if (auto Integer = llvm::dyn_cast<ConstantInteger32>(Opnd)) |
588 return Integer->getValue(); | 588 return Integer->getValue(); |
589 return Intrinsics::MemoryOrderInvalid; | 589 return Intrinsics::MemoryOrderInvalid; |
590 } | 590 } |
591 | 591 |
592 /// Determines whether the dest of a Load instruction can be folded | 592 /// Determines whether the dest of a Load instruction can be folded into one of |
593 /// into one of the src operands of a 2-operand instruction. This is | 593 /// the src operands of a 2-operand instruction. This is true as long as the |
594 /// true as long as the load dest matches exactly one of the binary | 594 /// load dest matches exactly one of the binary instruction's src operands. |
595 /// instruction's src operands. Replaces Src0 or Src1 with LoadSrc if | 595 /// Replaces Src0 or Src1 with LoadSrc if the answer is true. |
596 /// the answer is true. | |
597 inline bool canFoldLoadIntoBinaryInst(Operand *LoadSrc, Variable *LoadDest, | 596 inline bool canFoldLoadIntoBinaryInst(Operand *LoadSrc, Variable *LoadDest, |
598 Operand *&Src0, Operand *&Src1) { | 597 Operand *&Src0, Operand *&Src1) { |
599 if (Src0 == LoadDest && Src1 != LoadDest) { | 598 if (Src0 == LoadDest && Src1 != LoadDest) { |
600 Src0 = LoadSrc; | 599 Src0 = LoadSrc; |
601 return true; | 600 return true; |
602 } | 601 } |
603 if (Src0 != LoadDest && Src1 == LoadDest) { | 602 if (Src0 != LoadDest && Src1 == LoadDest) { |
604 Src1 = LoadSrc; | 603 Src1 = LoadSrc; |
605 return true; | 604 return true; |
606 } | 605 } |
607 return false; | 606 return false; |
608 } | 607 } |
609 | 608 |
610 template <class Machine> void TargetX86Base<Machine>::doLoadOpt() { | 609 template <class Machine> void TargetX86Base<Machine>::doLoadOpt() { |
611 for (CfgNode *Node : Func->getNodes()) { | 610 for (CfgNode *Node : Func->getNodes()) { |
612 Context.init(Node); | 611 Context.init(Node); |
613 while (!Context.atEnd()) { | 612 while (!Context.atEnd()) { |
614 Variable *LoadDest = nullptr; | 613 Variable *LoadDest = nullptr; |
615 Operand *LoadSrc = nullptr; | 614 Operand *LoadSrc = nullptr; |
616 Inst *CurInst = Context.getCur(); | 615 Inst *CurInst = Context.getCur(); |
617 Inst *Next = Context.getNextInst(); | 616 Inst *Next = Context.getNextInst(); |
618 // Determine whether the current instruction is a Load | 617 // Determine whether the current instruction is a Load instruction or |
619 // instruction or equivalent. | 618 // equivalent. |
620 if (auto *Load = llvm::dyn_cast<InstLoad>(CurInst)) { | 619 if (auto *Load = llvm::dyn_cast<InstLoad>(CurInst)) { |
621 // An InstLoad always qualifies. | 620 // An InstLoad always qualifies. |
622 LoadDest = Load->getDest(); | 621 LoadDest = Load->getDest(); |
623 const bool DoLegalize = false; | 622 const bool DoLegalize = false; |
624 LoadSrc = formMemoryOperand(Load->getSourceAddress(), | 623 LoadSrc = formMemoryOperand(Load->getSourceAddress(), |
625 LoadDest->getType(), DoLegalize); | 624 LoadDest->getType(), DoLegalize); |
626 } else if (auto *Intrin = llvm::dyn_cast<InstIntrinsicCall>(CurInst)) { | 625 } else if (auto *Intrin = llvm::dyn_cast<InstIntrinsicCall>(CurInst)) { |
627 // An AtomicLoad intrinsic qualifies as long as it has a valid | 626 // An AtomicLoad intrinsic qualifies as long as it has a valid memory |
628 // memory ordering, and can be implemented in a single | 627 // ordering, and can be implemented in a single instruction (i.e., not |
629 // instruction (i.e., not i64 on x86-32). | 628 // i64 on x86-32). |
630 Intrinsics::IntrinsicID ID = Intrin->getIntrinsicInfo().ID; | 629 Intrinsics::IntrinsicID ID = Intrin->getIntrinsicInfo().ID; |
631 if (ID == Intrinsics::AtomicLoad && | 630 if (ID == Intrinsics::AtomicLoad && |
632 (Traits::Is64Bit || Intrin->getDest()->getType() != IceType_i64) && | 631 (Traits::Is64Bit || Intrin->getDest()->getType() != IceType_i64) && |
633 Intrinsics::isMemoryOrderValid( | 632 Intrinsics::isMemoryOrderValid( |
634 ID, getConstantMemoryOrder(Intrin->getArg(1)))) { | 633 ID, getConstantMemoryOrder(Intrin->getArg(1)))) { |
635 LoadDest = Intrin->getDest(); | 634 LoadDest = Intrin->getDest(); |
636 const bool DoLegalize = false; | 635 const bool DoLegalize = false; |
637 LoadSrc = formMemoryOperand(Intrin->getArg(0), LoadDest->getType(), | 636 LoadSrc = formMemoryOperand(Intrin->getArg(0), LoadDest->getType(), |
638 DoLegalize); | 637 DoLegalize); |
639 } | 638 } |
640 } | 639 } |
641 // A Load instruction can be folded into the following | 640 // A Load instruction can be folded into the following instruction only |
642 // instruction only if the following instruction ends the Load's | 641 // if the following instruction ends the Load's Dest variable's live |
643 // Dest variable's live range. | 642 // range. |
644 if (LoadDest && Next && Next->isLastUse(LoadDest)) { | 643 if (LoadDest && Next && Next->isLastUse(LoadDest)) { |
645 assert(LoadSrc); | 644 assert(LoadSrc); |
646 Inst *NewInst = nullptr; | 645 Inst *NewInst = nullptr; |
647 if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Next)) { | 646 if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Next)) { |
648 Operand *Src0 = Arith->getSrc(0); | 647 Operand *Src0 = Arith->getSrc(0); |
649 Operand *Src1 = Arith->getSrc(1); | 648 Operand *Src1 = Arith->getSrc(1); |
650 if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) { | 649 if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) { |
651 NewInst = InstArithmetic::create(Func, Arith->getOp(), | 650 NewInst = InstArithmetic::create(Func, Arith->getOp(), |
652 Arith->getDest(), Src0, Src1); | 651 Arith->getDest(), Src0, Src1); |
653 } | 652 } |
(...skipping 12 matching lines...) Expand all Loading... |
666 Fcmp->getDest(), Src0, Src1); | 665 Fcmp->getDest(), Src0, Src1); |
667 } | 666 } |
668 } else if (auto *Select = llvm::dyn_cast<InstSelect>(Next)) { | 667 } else if (auto *Select = llvm::dyn_cast<InstSelect>(Next)) { |
669 Operand *Src0 = Select->getTrueOperand(); | 668 Operand *Src0 = Select->getTrueOperand(); |
670 Operand *Src1 = Select->getFalseOperand(); | 669 Operand *Src1 = Select->getFalseOperand(); |
671 if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) { | 670 if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) { |
672 NewInst = InstSelect::create(Func, Select->getDest(), | 671 NewInst = InstSelect::create(Func, Select->getDest(), |
673 Select->getCondition(), Src0, Src1); | 672 Select->getCondition(), Src0, Src1); |
674 } | 673 } |
675 } else if (auto *Cast = llvm::dyn_cast<InstCast>(Next)) { | 674 } else if (auto *Cast = llvm::dyn_cast<InstCast>(Next)) { |
676 // The load dest can always be folded into a Cast | 675 // The load dest can always be folded into a Cast instruction. |
677 // instruction. | |
678 Variable *Src0 = llvm::dyn_cast<Variable>(Cast->getSrc(0)); | 676 Variable *Src0 = llvm::dyn_cast<Variable>(Cast->getSrc(0)); |
679 if (Src0 == LoadDest) { | 677 if (Src0 == LoadDest) { |
680 NewInst = InstCast::create(Func, Cast->getCastKind(), | 678 NewInst = InstCast::create(Func, Cast->getCastKind(), |
681 Cast->getDest(), LoadSrc); | 679 Cast->getDest(), LoadSrc); |
682 } | 680 } |
683 } | 681 } |
684 if (NewInst) { | 682 if (NewInst) { |
685 CurInst->setDeleted(); | 683 CurInst->setDeleted(); |
686 Next->setDeleted(); | 684 Next->setDeleted(); |
687 Context.insert(NewInst); | 685 Context.insert(NewInst); |
688 // Update NewInst->LiveRangesEnded so that target lowering | 686 // Update NewInst->LiveRangesEnded so that target lowering may |
689 // may benefit. Also update NewInst->HasSideEffects. | 687 // benefit. Also update NewInst->HasSideEffects. |
690 NewInst->spliceLivenessInfo(Next, CurInst); | 688 NewInst->spliceLivenessInfo(Next, CurInst); |
691 } | 689 } |
692 } | 690 } |
693 Context.advanceCur(); | 691 Context.advanceCur(); |
694 Context.advanceNext(); | 692 Context.advanceNext(); |
695 } | 693 } |
696 } | 694 } |
697 Func->dump("After load optimization"); | 695 Func->dump("After load optimization"); |
698 } | 696 } |
699 | 697 |
(...skipping 14 matching lines...) Expand all Loading... |
714 if (Ty == IceType_void) | 712 if (Ty == IceType_void) |
715 Ty = IceType_i32; | 713 Ty = IceType_i32; |
716 if (PhysicalRegisters[Ty].empty()) | 714 if (PhysicalRegisters[Ty].empty()) |
717 PhysicalRegisters[Ty].resize(Traits::RegisterSet::Reg_NUM); | 715 PhysicalRegisters[Ty].resize(Traits::RegisterSet::Reg_NUM); |
718 assert(RegNum < PhysicalRegisters[Ty].size()); | 716 assert(RegNum < PhysicalRegisters[Ty].size()); |
719 Variable *Reg = PhysicalRegisters[Ty][RegNum]; | 717 Variable *Reg = PhysicalRegisters[Ty][RegNum]; |
720 if (Reg == nullptr) { | 718 if (Reg == nullptr) { |
721 Reg = Func->makeVariable(Ty); | 719 Reg = Func->makeVariable(Ty); |
722 Reg->setRegNum(RegNum); | 720 Reg->setRegNum(RegNum); |
723 PhysicalRegisters[Ty][RegNum] = Reg; | 721 PhysicalRegisters[Ty][RegNum] = Reg; |
724 // Specially mark esp as an "argument" so that it is considered | 722 // Specially mark esp as an "argument" so that it is considered live upon |
725 // live upon function entry. | 723 // function entry. |
726 if (RegNum == Traits::RegisterSet::Reg_esp) { | 724 if (RegNum == Traits::RegisterSet::Reg_esp) { |
727 Func->addImplicitArg(Reg); | 725 Func->addImplicitArg(Reg); |
728 Reg->setIgnoreLiveness(); | 726 Reg->setIgnoreLiveness(); |
729 } | 727 } |
730 } | 728 } |
731 return Reg; | 729 return Reg; |
732 } | 730 } |
733 | 731 |
734 template <class Machine> | 732 template <class Machine> |
735 IceString TargetX86Base<Machine>::getRegName(SizeT RegNum, Type Ty) const { | 733 IceString TargetX86Base<Machine>::getRegName(SizeT RegNum, Type Ty) const { |
(...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
775 BaseRegNum = getFrameOrStackReg(); | 773 BaseRegNum = getFrameOrStackReg(); |
776 if (!hasFramePointer()) | 774 if (!hasFramePointer()) |
777 Offset += getStackAdjustment(); | 775 Offset += getStackAdjustment(); |
778 } | 776 } |
779 return typename Traits::Address( | 777 return typename Traits::Address( |
780 Traits::RegisterSet::getEncodedGPR(BaseRegNum), Offset); | 778 Traits::RegisterSet::getEncodedGPR(BaseRegNum), Offset); |
781 } | 779 } |
782 | 780 |
783 /// Helper function for addProlog(). | 781 /// Helper function for addProlog(). |
784 /// | 782 /// |
785 /// This assumes Arg is an argument passed on the stack. This sets the | 783 /// This assumes Arg is an argument passed on the stack. This sets the frame |
786 /// frame offset for Arg and updates InArgsSizeBytes according to Arg's | 784 /// offset for Arg and updates InArgsSizeBytes according to Arg's width. For an |
787 /// width. For an I64 arg that has been split into Lo and Hi components, | 785 /// I64 arg that has been split into Lo and Hi components, it calls itself |
788 /// it calls itself recursively on the components, taking care to handle | 786 /// recursively on the components, taking care to handle Lo first because of the |
789 /// Lo first because of the little-endian architecture. Lastly, this | 787 /// little-endian architecture. Lastly, this function generates an instruction |
790 /// function generates an instruction to copy Arg into its assigned | 788 /// to copy Arg into its assigned register if applicable. |
791 /// register if applicable. | |
792 template <class Machine> | 789 template <class Machine> |
793 void TargetX86Base<Machine>::finishArgumentLowering(Variable *Arg, | 790 void TargetX86Base<Machine>::finishArgumentLowering(Variable *Arg, |
794 Variable *FramePtr, | 791 Variable *FramePtr, |
795 size_t BasicFrameOffset, | 792 size_t BasicFrameOffset, |
796 size_t &InArgsSizeBytes) { | 793 size_t &InArgsSizeBytes) { |
797 Variable *Lo = Arg->getLo(); | 794 Variable *Lo = Arg->getLo(); |
798 Variable *Hi = Arg->getHi(); | 795 Variable *Hi = Arg->getHi(); |
799 Type Ty = Arg->getType(); | 796 Type Ty = Arg->getType(); |
800 if (!Traits::Is64Bit && Lo && Hi && Ty == IceType_i64) { | 797 if (!Traits::Is64Bit && Lo && Hi && Ty == IceType_i64) { |
801 assert(Lo->getType() != IceType_i64); // don't want infinite recursion | 798 assert(Lo->getType() != IceType_i64); // don't want infinite recursion |
(...skipping 10 matching lines...) Expand all Loading... |
812 if (Arg->hasReg()) { | 809 if (Arg->hasReg()) { |
813 assert(Ty != IceType_i64 || Traits::Is64Bit); | 810 assert(Ty != IceType_i64 || Traits::Is64Bit); |
814 typename Traits::X86OperandMem *Mem = Traits::X86OperandMem::create( | 811 typename Traits::X86OperandMem *Mem = Traits::X86OperandMem::create( |
815 Func, Ty, FramePtr, Ctx->getConstantInt32(Arg->getStackOffset())); | 812 Func, Ty, FramePtr, Ctx->getConstantInt32(Arg->getStackOffset())); |
816 if (isVectorType(Arg->getType())) { | 813 if (isVectorType(Arg->getType())) { |
817 _movp(Arg, Mem); | 814 _movp(Arg, Mem); |
818 } else { | 815 } else { |
819 _mov(Arg, Mem); | 816 _mov(Arg, Mem); |
820 } | 817 } |
821 // This argument-copying instruction uses an explicit Traits::X86OperandMem | 818 // This argument-copying instruction uses an explicit Traits::X86OperandMem |
822 // operand instead of a Variable, so its fill-from-stack operation has to be | 819 // operand instead of a Variable, so its fill-from-stack operation has to |
823 // tracked separately for statistics. | 820 // be tracked separately for statistics. |
824 Ctx->statsUpdateFills(); | 821 Ctx->statsUpdateFills(); |
825 } | 822 } |
826 } | 823 } |
827 | 824 |
828 template <class Machine> Type TargetX86Base<Machine>::stackSlotType() { | 825 template <class Machine> Type TargetX86Base<Machine>::stackSlotType() { |
829 return Traits::WordType; | 826 return Traits::WordType; |
830 } | 827 } |
831 | 828 |
832 template <class Machine> | 829 template <class Machine> |
833 template <typename T> | 830 template <typename T> |
834 typename std::enable_if<!T::Is64Bit, void>::type | 831 typename std::enable_if<!T::Is64Bit, void>::type |
835 TargetX86Base<Machine>::split64(Variable *Var) { | 832 TargetX86Base<Machine>::split64(Variable *Var) { |
836 switch (Var->getType()) { | 833 switch (Var->getType()) { |
837 default: | 834 default: |
838 return; | 835 return; |
839 case IceType_i64: | 836 case IceType_i64: |
840 // TODO: Only consider F64 if we need to push each half when | 837 // TODO: Only consider F64 if we need to push each half when passing as an |
841 // passing as an argument to a function call. Note that each half | 838 // argument to a function call. Note that each half is still typed as I32. |
842 // is still typed as I32. | |
843 case IceType_f64: | 839 case IceType_f64: |
844 break; | 840 break; |
845 } | 841 } |
846 Variable *Lo = Var->getLo(); | 842 Variable *Lo = Var->getLo(); |
847 Variable *Hi = Var->getHi(); | 843 Variable *Hi = Var->getHi(); |
848 if (Lo) { | 844 if (Lo) { |
849 assert(Hi); | 845 assert(Hi); |
850 return; | 846 return; |
851 } | 847 } |
852 assert(Hi == nullptr); | 848 assert(Hi == nullptr); |
(...skipping 86 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
939 template <class Machine> | 935 template <class Machine> |
940 llvm::SmallBitVector | 936 llvm::SmallBitVector |
941 TargetX86Base<Machine>::getRegisterSet(RegSetMask Include, | 937 TargetX86Base<Machine>::getRegisterSet(RegSetMask Include, |
942 RegSetMask Exclude) const { | 938 RegSetMask Exclude) const { |
943 return Traits::getRegisterSet(Include, Exclude); | 939 return Traits::getRegisterSet(Include, Exclude); |
944 } | 940 } |
945 | 941 |
/// Lowers an InstAlloca: over-aligns esp when the requested alignment exceeds
/// the base stack alignment, subtracts the (rounded-up) allocation size from
/// esp, and assigns the resulting esp value to the instruction's dest.
template <class Machine>
void TargetX86Base<Machine>::lowerAlloca(const InstAlloca *Inst) {
  // esp is about to be adjusted dynamically, so mark the frame ebp-based
  // (presumably so locals are addressed off the frame pointer instead —
  // consistent with the flag name).
  IsEbpBasedFrame = true;
  // Conservatively require the stack to be aligned. Some stack adjustment
  // operations implemented below assume that the stack is aligned before the
  // alloca. All the alloca code ensures that the stack alignment is preserved
  // after the alloca. The stack alignment restriction can be relaxed in some
  // cases.
  NeedsStackAlignment = true;

  // TODO(stichnot): minimize the number of adjustments of esp, etc.
  Variable *esp = getPhysicalRegister(Traits::RegisterSet::Reg_esp);
  Operand *TotalSize = legalize(Inst->getSizeInBytes());
  Variable *Dest = Inst->getDest();
  uint32_t AlignmentParam = Inst->getAlignInBytes();
  // For default align=0, set it to the real value 1, to avoid any
  // bit-manipulation problems below.
  AlignmentParam = std::max(AlignmentParam, 1u);

  // LLVM enforces power of 2 alignment.
  assert(llvm::isPowerOf2_32(AlignmentParam));
  assert(llvm::isPowerOf2_32(Traits::X86_STACK_ALIGNMENT_BYTES));

  // Effective alignment is the larger of the requested alignment and the
  // baseline stack alignment.
  uint32_t Alignment =
      std::max(AlignmentParam, Traits::X86_STACK_ALIGNMENT_BYTES);
  if (Alignment > Traits::X86_STACK_ALIGNMENT_BYTES) {
    // Alignment is a power of 2 (asserted above), so -Alignment is a mask
    // with the low log2(Alignment) bits clear; and-ing rounds esp down.
    _and(esp, Ctx->getConstantInt32(-Alignment));
  }
  if (const auto *ConstantTotalSize =
          llvm::dyn_cast<ConstantInteger32>(TotalSize)) {
    // Constant size: round up to the alignment at compile time and subtract
    // directly from esp.
    uint32_t Value = ConstantTotalSize->getValue();
    Value = Utils::applyAlignment(Value, Alignment);
    _sub(esp, Ctx->getConstantInt32(Value));
  } else {
    // Non-constant sizes need to be adjusted to the next highest multiple of
    // the required alignment at runtime: T = (size + Alignment-1) & -Alignment.
    Variable *T = makeReg(IceType_i32);
    _mov(T, TotalSize);
    _add(T, Ctx->getConstantInt32(Alignment - 1));
    _and(T, Ctx->getConstantInt32(-Alignment));
    _sub(esp, T);
  }
  // The allocated region starts at the new esp; that address is the result.
  _mov(Dest, esp);
}
990 | 986 |
991 /// Strength-reduce scalar integer multiplication by a constant (for | 987 /// Strength-reduce scalar integer multiplication by a constant (for i32 or |
992 /// i32 or narrower) for certain constants. The lea instruction can be | 988 /// narrower) for certain constants. The lea instruction can be used to multiply |
993 /// used to multiply by 3, 5, or 9, and the shl instruction can be used | 989 /// by 3, 5, or 9, and the shl instruction can be used to multiply by powers of
994 /// to multiply by powers of 2. These can be combined such that | 990 /// 2. These can be combined such that e.g. multiplying by 100 can be done as 2 |
995 /// e.g. multiplying by 100 can be done as 2 lea-based multiplies by 5, | 991 /// lea-based multiplies by 5, combined with left-shifting by 2. |
996 /// combined with left-shifting by 2. | |
997 template <class Machine> | 992 template <class Machine> |
998 bool TargetX86Base<Machine>::optimizeScalarMul(Variable *Dest, Operand *Src0, | 993 bool TargetX86Base<Machine>::optimizeScalarMul(Variable *Dest, Operand *Src0, |
999 int32_t Src1) { | 994 int32_t Src1) { |
1000 // Disable this optimization for Om1 and O0, just to keep things | 995 // Disable this optimization for Om1 and O0, just to keep things simple |
1001 // simple there. | 996 // there. |
1002 if (Ctx->getFlags().getOptLevel() < Opt_1) | 997 if (Ctx->getFlags().getOptLevel() < Opt_1) |
1003 return false; | 998 return false; |
1004 Type Ty = Dest->getType(); | 999 Type Ty = Dest->getType(); |
1005 Variable *T = nullptr; | 1000 Variable *T = nullptr; |
1006 if (Src1 == -1) { | 1001 if (Src1 == -1) { |
1007 _mov(T, Src0); | 1002 _mov(T, Src0); |
1008 _neg(T); | 1003 _neg(T); |
1009 _mov(Dest, T); | 1004 _mov(Dest, T); |
1010 return true; | 1005 return true; |
1011 } | 1006 } |
(...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1047 ++CountOps; | 1042 ++CountOps; |
1048 ++Count2; | 1043 ++Count2; |
1049 Src1 /= 2; | 1044 Src1 /= 2; |
1050 } else { | 1045 } else { |
1051 return false; | 1046 return false; |
1052 } | 1047 } |
1053 } | 1048 } |
1054 // Lea optimization only works for i16 and i32 types, not i8. | 1049 // Lea optimization only works for i16 and i32 types, not i8. |
1055 if (Ty != IceType_i16 && Ty != IceType_i32 && (Count3 || Count5 || Count9)) | 1050 if (Ty != IceType_i16 && Ty != IceType_i32 && (Count3 || Count5 || Count9)) |
1056 return false; | 1051 return false; |
1057 // Limit the number of lea/shl operations for a single multiply, to | 1052 // Limit the number of lea/shl operations for a single multiply, to a |
1058 // a somewhat arbitrary choice of 3. | 1053 // somewhat arbitrary choice of 3. |
1059 const uint32_t MaxOpsForOptimizedMul = 3; | 1054 const uint32_t MaxOpsForOptimizedMul = 3; |
1060 if (CountOps > MaxOpsForOptimizedMul) | 1055 if (CountOps > MaxOpsForOptimizedMul) |
1061 return false; | 1056 return false; |
1062 _mov(T, Src0); | 1057 _mov(T, Src0); |
1063 Constant *Zero = Ctx->getConstantZero(IceType_i32); | 1058 Constant *Zero = Ctx->getConstantZero(IceType_i32); |
1064 for (uint32_t i = 0; i < Count9; ++i) { | 1059 for (uint32_t i = 0; i < Count9; ++i) { |
1065 const uint16_t Shift = 3; // log2(9-1) | 1060 const uint16_t Shift = 3; // log2(9-1) |
1066 _lea(T, | 1061 _lea(T, |
1067 Traits::X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift)); | 1062 Traits::X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift)); |
1068 _set_dest_nonkillable(); | 1063 _set_dest_nonkillable(); |
(...skipping 25 matching lines...) Expand all Loading... |
1094 Operand *Src0 = legalize(Inst->getSrc(0)); | 1089 Operand *Src0 = legalize(Inst->getSrc(0)); |
1095 Operand *Src1 = legalize(Inst->getSrc(1)); | 1090 Operand *Src1 = legalize(Inst->getSrc(1)); |
1096 if (Inst->isCommutative()) { | 1091 if (Inst->isCommutative()) { |
1097 if (!llvm::isa<Variable>(Src0) && llvm::isa<Variable>(Src1)) | 1092 if (!llvm::isa<Variable>(Src0) && llvm::isa<Variable>(Src1)) |
1098 std::swap(Src0, Src1); | 1093 std::swap(Src0, Src1); |
1099 if (llvm::isa<Constant>(Src0) && !llvm::isa<Constant>(Src1)) | 1094 if (llvm::isa<Constant>(Src0) && !llvm::isa<Constant>(Src1)) |
1100 std::swap(Src0, Src1); | 1095 std::swap(Src0, Src1); |
1101 } | 1096 } |
1102 if (!Traits::Is64Bit && Dest->getType() == IceType_i64) { | 1097 if (!Traits::Is64Bit && Dest->getType() == IceType_i64) { |
1103 // These x86-32 helper-call-involved instructions are lowered in this | 1098 // These x86-32 helper-call-involved instructions are lowered in this |
1104 // separate switch. This is because loOperand() and hiOperand() | 1099 // separate switch. This is because loOperand() and hiOperand() may insert |
1105 // may insert redundant instructions for constant blinding and | 1100 // redundant instructions for constant blinding and pooling. Such redundant |
1106 // pooling. Such redundant instructions will fail liveness analysis | 1101 // instructions will fail liveness analysis under -Om1 setting. And, |
1107 // under -Om1 setting. And, actually these arguments do not need | 1102 // actually these arguments do not need to be processed with loOperand() |
1108 // to be processed with loOperand() and hiOperand() to be used. | 1103 // and hiOperand() to be used. |
1109 switch (Inst->getOp()) { | 1104 switch (Inst->getOp()) { |
1110 case InstArithmetic::Udiv: { | 1105 case InstArithmetic::Udiv: { |
1111 const SizeT MaxSrcs = 2; | 1106 const SizeT MaxSrcs = 2; |
1112 InstCall *Call = makeHelperCall(H_udiv_i64, Dest, MaxSrcs); | 1107 InstCall *Call = makeHelperCall(H_udiv_i64, Dest, MaxSrcs); |
1113 Call->addArg(Inst->getSrc(0)); | 1108 Call->addArg(Inst->getSrc(0)); |
1114 Call->addArg(Inst->getSrc(1)); | 1109 Call->addArg(Inst->getSrc(1)); |
1115 lowerCall(Call); | 1110 lowerCall(Call); |
1116 return; | 1111 return; |
1117 } | 1112 } |
1118 case InstArithmetic::Sdiv: { | 1113 case InstArithmetic::Sdiv: { |
(...skipping 90 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1209 // t4.hi += t2 | 1204 // t4.hi += t2 |
1210 // a.hi = t4.hi | 1205 // a.hi = t4.hi |
1211 // The mul instruction cannot take an immediate operand. | 1206 // The mul instruction cannot take an immediate operand. |
1212 Src1Lo = legalize(Src1Lo, Legal_Reg | Legal_Mem); | 1207 Src1Lo = legalize(Src1Lo, Legal_Reg | Legal_Mem); |
1213 _mov(T_1, Src0Hi); | 1208 _mov(T_1, Src0Hi); |
1214 _imul(T_1, Src1Lo); | 1209 _imul(T_1, Src1Lo); |
1215 _mov(T_2, Src1Hi); | 1210 _mov(T_2, Src1Hi); |
1216 _imul(T_2, Src0Lo); | 1211 _imul(T_2, Src0Lo); |
1217 _mov(T_3, Src0Lo, Traits::RegisterSet::Reg_eax); | 1212 _mov(T_3, Src0Lo, Traits::RegisterSet::Reg_eax); |
1218 _mul(T_4Lo, T_3, Src1Lo); | 1213 _mul(T_4Lo, T_3, Src1Lo); |
1219 // The mul instruction produces two dest variables, edx:eax. We | 1214 // The mul instruction produces two dest variables, edx:eax. We create a |
1220 // create a fake definition of edx to account for this. | 1215 // fake definition of edx to account for this. |
1221 Context.insert(InstFakeDef::create(Func, T_4Hi, T_4Lo)); | 1216 Context.insert(InstFakeDef::create(Func, T_4Hi, T_4Lo)); |
1222 _mov(DestLo, T_4Lo); | 1217 _mov(DestLo, T_4Lo); |
1223 _add(T_4Hi, T_1); | 1218 _add(T_4Hi, T_1); |
1224 _add(T_4Hi, T_2); | 1219 _add(T_4Hi, T_2); |
1225 _mov(DestHi, T_4Hi); | 1220 _mov(DestHi, T_4Hi); |
1226 } break; | 1221 } break; |
1227 case InstArithmetic::Shl: { | 1222 case InstArithmetic::Shl: { |
1228 // TODO: Refactor the similarities between Shl, Lshr, and Ashr. | 1223 // TODO: Refactor the similarities between Shl, Lshr, and Ashr. |
1229 // gcc does the following: | 1224 // gcc does the following: |
1230 // a=b<<c ==> | 1225 // a=b<<c ==> |
(...skipping 15 matching lines...) Expand all Loading... |
1246 Constant *Zero = Ctx->getConstantZero(IceType_i32); | 1241 Constant *Zero = Ctx->getConstantZero(IceType_i32); |
1247 typename Traits::Insts::Label *Label = | 1242 typename Traits::Insts::Label *Label = |
1248 Traits::Insts::Label::create(Func, this); | 1243 Traits::Insts::Label::create(Func, this); |
1249 _mov(T_1, Src1Lo, Traits::RegisterSet::Reg_ecx); | 1244 _mov(T_1, Src1Lo, Traits::RegisterSet::Reg_ecx); |
1250 _mov(T_2, Src0Lo); | 1245 _mov(T_2, Src0Lo); |
1251 _mov(T_3, Src0Hi); | 1246 _mov(T_3, Src0Hi); |
1252 _shld(T_3, T_2, T_1); | 1247 _shld(T_3, T_2, T_1); |
1253 _shl(T_2, T_1); | 1248 _shl(T_2, T_1); |
1254 _test(T_1, BitTest); | 1249 _test(T_1, BitTest); |
1255 _br(Traits::Cond::Br_e, Label); | 1250 _br(Traits::Cond::Br_e, Label); |
1256 // T_2 and T_3 are being assigned again because of the | 1251 // T_2 and T_3 are being assigned again because of the intra-block |
1257 // intra-block control flow, so we need the _mov_nonkillable | 1252 // control flow, so we need the _mov_nonkillable variant to avoid |
1258 // variant to avoid liveness problems. | 1253 // liveness problems. |
1259 _mov_nonkillable(T_3, T_2); | 1254 _mov_nonkillable(T_3, T_2); |
1260 _mov_nonkillable(T_2, Zero); | 1255 _mov_nonkillable(T_2, Zero); |
1261 Context.insert(Label); | 1256 Context.insert(Label); |
1262 _mov(DestLo, T_2); | 1257 _mov(DestLo, T_2); |
1263 _mov(DestHi, T_3); | 1258 _mov(DestHi, T_3); |
1264 } break; | 1259 } break; |
1265 case InstArithmetic::Lshr: { | 1260 case InstArithmetic::Lshr: { |
1266 // a=b>>c (unsigned) ==> | 1261 // a=b>>c (unsigned) ==> |
1267 // t1:ecx = c.lo & 0xff | 1262 // t1:ecx = c.lo & 0xff |
1268 // t2 = b.lo | 1263 // t2 = b.lo |
(...skipping 13 matching lines...) Expand all Loading... |
1282 Constant *Zero = Ctx->getConstantZero(IceType_i32); | 1277 Constant *Zero = Ctx->getConstantZero(IceType_i32); |
1283 typename Traits::Insts::Label *Label = | 1278 typename Traits::Insts::Label *Label = |
1284 Traits::Insts::Label::create(Func, this); | 1279 Traits::Insts::Label::create(Func, this); |
1285 _mov(T_1, Src1Lo, Traits::RegisterSet::Reg_ecx); | 1280 _mov(T_1, Src1Lo, Traits::RegisterSet::Reg_ecx); |
1286 _mov(T_2, Src0Lo); | 1281 _mov(T_2, Src0Lo); |
1287 _mov(T_3, Src0Hi); | 1282 _mov(T_3, Src0Hi); |
1288 _shrd(T_2, T_3, T_1); | 1283 _shrd(T_2, T_3, T_1); |
1289 _shr(T_3, T_1); | 1284 _shr(T_3, T_1); |
1290 _test(T_1, BitTest); | 1285 _test(T_1, BitTest); |
1291 _br(Traits::Cond::Br_e, Label); | 1286 _br(Traits::Cond::Br_e, Label); |
1292 // T_2 and T_3 are being assigned again because of the | 1287 // T_2 and T_3 are being assigned again because of the intra-block |
1293 // intra-block control flow, so we need the _mov_nonkillable | 1288 // control flow, so we need the _mov_nonkillable variant to avoid |
1294 // variant to avoid liveness problems. | 1289 // liveness problems. |
1295 _mov_nonkillable(T_2, T_3); | 1290 _mov_nonkillable(T_2, T_3); |
1296 _mov_nonkillable(T_3, Zero); | 1291 _mov_nonkillable(T_3, Zero); |
1297 Context.insert(Label); | 1292 Context.insert(Label); |
1298 _mov(DestLo, T_2); | 1293 _mov(DestLo, T_2); |
1299 _mov(DestHi, T_3); | 1294 _mov(DestHi, T_3); |
1300 } break; | 1295 } break; |
1301 case InstArithmetic::Ashr: { | 1296 case InstArithmetic::Ashr: { |
1302 // a=b>>c (signed) ==> | 1297 // a=b>>c (signed) ==> |
1303 // t1:ecx = c.lo & 0xff | 1298 // t1:ecx = c.lo & 0xff |
1304 // t2 = b.lo | 1299 // t2 = b.lo |
(...skipping 13 matching lines...) Expand all Loading... |
1318 Constant *SignExtend = Ctx->getConstantInt32(0x1f); | 1313 Constant *SignExtend = Ctx->getConstantInt32(0x1f); |
1319 typename Traits::Insts::Label *Label = | 1314 typename Traits::Insts::Label *Label = |
1320 Traits::Insts::Label::create(Func, this); | 1315 Traits::Insts::Label::create(Func, this); |
1321 _mov(T_1, Src1Lo, Traits::RegisterSet::Reg_ecx); | 1316 _mov(T_1, Src1Lo, Traits::RegisterSet::Reg_ecx); |
1322 _mov(T_2, Src0Lo); | 1317 _mov(T_2, Src0Lo); |
1323 _mov(T_3, Src0Hi); | 1318 _mov(T_3, Src0Hi); |
1324 _shrd(T_2, T_3, T_1); | 1319 _shrd(T_2, T_3, T_1); |
1325 _sar(T_3, T_1); | 1320 _sar(T_3, T_1); |
1326 _test(T_1, BitTest); | 1321 _test(T_1, BitTest); |
1327 _br(Traits::Cond::Br_e, Label); | 1322 _br(Traits::Cond::Br_e, Label); |
1328 // T_2 and T_3 are being assigned again because of the | 1323 // T_2 and T_3 are being assigned again because of the intra-block |
1329 // intra-block control flow, so T_2 needs the _mov_nonkillable | 1324 // control flow, so T_2 needs the _mov_nonkillable variant to avoid |
1330 // variant to avoid liveness problems. T_3 doesn't need special | 1325 // liveness problems. T_3 doesn't need special treatment because it is |
1331 // treatment because it is reassigned via _sar instead of _mov. | 1326 // reassigned via _sar instead of _mov. |
1332 _mov_nonkillable(T_2, T_3); | 1327 _mov_nonkillable(T_2, T_3); |
1333 _sar(T_3, SignExtend); | 1328 _sar(T_3, SignExtend); |
1334 Context.insert(Label); | 1329 Context.insert(Label); |
1335 _mov(DestLo, T_2); | 1330 _mov(DestLo, T_2); |
1336 _mov(DestHi, T_3); | 1331 _mov(DestHi, T_3); |
1337 } break; | 1332 } break; |
1338 case InstArithmetic::Fadd: | 1333 case InstArithmetic::Fadd: |
1339 case InstArithmetic::Fsub: | 1334 case InstArithmetic::Fsub: |
1340 case InstArithmetic::Fmul: | 1335 case InstArithmetic::Fmul: |
1341 case InstArithmetic::Fdiv: | 1336 case InstArithmetic::Fdiv: |
1342 case InstArithmetic::Frem: | 1337 case InstArithmetic::Frem: |
1343 llvm_unreachable("FP instruction with i64 type"); | 1338 llvm_unreachable("FP instruction with i64 type"); |
1344 break; | 1339 break; |
1345 case InstArithmetic::Udiv: | 1340 case InstArithmetic::Udiv: |
1346 case InstArithmetic::Sdiv: | 1341 case InstArithmetic::Sdiv: |
1347 case InstArithmetic::Urem: | 1342 case InstArithmetic::Urem: |
1348 case InstArithmetic::Srem: | 1343 case InstArithmetic::Srem: |
1349 llvm_unreachable("Call-helper-involved instruction for i64 type \ | 1344 llvm_unreachable("Call-helper-involved instruction for i64 type \ |
1350 should have already been handled before"); | 1345 should have already been handled before"); |
1351 break; | 1346 break; |
1352 } | 1347 } |
1353 return; | 1348 return; |
1354 } | 1349 } |
1355 if (isVectorType(Dest->getType())) { | 1350 if (isVectorType(Dest->getType())) { |
1356 // TODO: Trap on integer divide and integer modulo by zero. | 1351 // TODO: Trap on integer divide and integer modulo by zero. See: |
1357 // See: https://code.google.com/p/nativeclient/issues/detail?id=3899 | 1352 // https://code.google.com/p/nativeclient/issues/detail?id=3899 |
1358 if (llvm::isa<typename Traits::X86OperandMem>(Src1)) | 1353 if (llvm::isa<typename Traits::X86OperandMem>(Src1)) |
1359 Src1 = legalizeToReg(Src1); | 1354 Src1 = legalizeToReg(Src1); |
1360 switch (Inst->getOp()) { | 1355 switch (Inst->getOp()) { |
1361 case InstArithmetic::_num: | 1356 case InstArithmetic::_num: |
1362 llvm_unreachable("Unknown arithmetic operator"); | 1357 llvm_unreachable("Unknown arithmetic operator"); |
1363 break; | 1358 break; |
1364 case InstArithmetic::Add: { | 1359 case InstArithmetic::Add: { |
1365 Variable *T = makeReg(Dest->getType()); | 1360 Variable *T = makeReg(Dest->getType()); |
1366 _movp(T, Src0); | 1361 _movp(T, Src0); |
1367 _padd(T, Src1); | 1362 _padd(T, Src1); |
(...skipping 144 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1512 case InstArithmetic::Sub: | 1507 case InstArithmetic::Sub: |
1513 _mov(T, Src0); | 1508 _mov(T, Src0); |
1514 _sub(T, Src1); | 1509 _sub(T, Src1); |
1515 _mov(Dest, T); | 1510 _mov(Dest, T); |
1516 break; | 1511 break; |
1517 case InstArithmetic::Mul: | 1512 case InstArithmetic::Mul: |
1518 if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) { | 1513 if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) { |
1519 if (optimizeScalarMul(Dest, Src0, C->getValue())) | 1514 if (optimizeScalarMul(Dest, Src0, C->getValue())) |
1520 return; | 1515 return; |
1521 } | 1516 } |
1522 // The 8-bit version of imul only allows the form "imul r/m8" | 1517 // The 8-bit version of imul only allows the form "imul r/m8" where T must |
1523 // where T must be in eax. | 1518 // be in eax. |
1524 if (isByteSizedArithType(Dest->getType())) { | 1519 if (isByteSizedArithType(Dest->getType())) { |
1525 _mov(T, Src0, Traits::RegisterSet::Reg_eax); | 1520 _mov(T, Src0, Traits::RegisterSet::Reg_eax); |
1526 Src1 = legalize(Src1, Legal_Reg | Legal_Mem); | 1521 Src1 = legalize(Src1, Legal_Reg | Legal_Mem); |
1527 } else { | 1522 } else { |
1528 _mov(T, Src0); | 1523 _mov(T, Src0); |
1529 } | 1524 } |
1530 _imul(T, Src1); | 1525 _imul(T, Src1); |
1531 _mov(Dest, T); | 1526 _mov(Dest, T); |
1532 break; | 1527 break; |
1533 case InstArithmetic::Shl: | 1528 case InstArithmetic::Shl: |
(...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1573 Context.insert(InstFakeUse::create(Func, T_eax)); | 1568 Context.insert(InstFakeUse::create(Func, T_eax)); |
1574 } else { | 1569 } else { |
1575 Constant *Zero = Ctx->getConstantZero(IceType_i32); | 1570 Constant *Zero = Ctx->getConstantZero(IceType_i32); |
1576 _mov(T, Src0, Traits::RegisterSet::Reg_eax); | 1571 _mov(T, Src0, Traits::RegisterSet::Reg_eax); |
1577 _mov(T_edx, Zero, Traits::RegisterSet::Reg_edx); | 1572 _mov(T_edx, Zero, Traits::RegisterSet::Reg_edx); |
1578 _div(T, Src1, T_edx); | 1573 _div(T, Src1, T_edx); |
1579 _mov(Dest, T); | 1574 _mov(Dest, T); |
1580 } | 1575 } |
1581 break; | 1576 break; |
1582 case InstArithmetic::Sdiv: | 1577 case InstArithmetic::Sdiv: |
1583 // TODO(stichnot): Enable this after doing better performance | 1578 // TODO(stichnot): Enable this after doing better performance and cross |
1584 // and cross testing. | 1579 // testing. |
1585 if (false && Ctx->getFlags().getOptLevel() >= Opt_1) { | 1580 if (false && Ctx->getFlags().getOptLevel() >= Opt_1) { |
1586 // Optimize division by constant power of 2, but not for Om1 | 1581 // Optimize division by constant power of 2, but not for Om1 or O0, just |
1587 // or O0, just to keep things simple there. | 1582 // to keep things simple there. |
1588 if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) { | 1583 if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) { |
1589 int32_t Divisor = C->getValue(); | 1584 int32_t Divisor = C->getValue(); |
1590 uint32_t UDivisor = static_cast<uint32_t>(Divisor); | 1585 uint32_t UDivisor = static_cast<uint32_t>(Divisor); |
1591 if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) { | 1586 if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) { |
1592 uint32_t LogDiv = llvm::Log2_32(UDivisor); | 1587 uint32_t LogDiv = llvm::Log2_32(UDivisor); |
1593 Type Ty = Dest->getType(); | 1588 Type Ty = Dest->getType(); |
1594 // LLVM does the following for dest=src/(1<<log): | 1589 // LLVM does the following for dest=src/(1<<log): |
1595 // t=src | 1590 // t=src |
1596 // sar t,typewidth-1 // -1 if src is negative, 0 if not | 1591 // sar t,typewidth-1 // -1 if src is negative, 0 if not |
1597 // shr t,typewidth-log | 1592 // shr t,typewidth-log |
1598 // add t,src | 1593 // add t,src |
1599 // sar t,log | 1594 // sar t,log |
1600 // dest=t | 1595 // dest=t |
1601 uint32_t TypeWidth = Traits::X86_CHAR_BIT * typeWidthInBytes(Ty); | 1596 uint32_t TypeWidth = Traits::X86_CHAR_BIT * typeWidthInBytes(Ty); |
1602 _mov(T, Src0); | 1597 _mov(T, Src0); |
1603 // If for some reason we are dividing by 1, just treat it | 1598 // If for some reason we are dividing by 1, just treat it like an |
1604 // like an assignment. | 1599 // assignment. |
1605 if (LogDiv > 0) { | 1600 if (LogDiv > 0) { |
1606 // The initial sar is unnecessary when dividing by 2. | 1601 // The initial sar is unnecessary when dividing by 2. |
1607 if (LogDiv > 1) | 1602 if (LogDiv > 1) |
1608 _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1)); | 1603 _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1)); |
1609 _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv)); | 1604 _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv)); |
1610 _add(T, Src0); | 1605 _add(T, Src0); |
1611 _sar(T, Ctx->getConstantInt(Ty, LogDiv)); | 1606 _sar(T, Ctx->getConstantInt(Ty, LogDiv)); |
1612 } | 1607 } |
1613 _mov(Dest, T); | 1608 _mov(Dest, T); |
1614 return; | 1609 return; |
(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1649 } else { | 1644 } else { |
1650 Constant *Zero = Ctx->getConstantZero(IceType_i32); | 1645 Constant *Zero = Ctx->getConstantZero(IceType_i32); |
1651 T_edx = makeReg(Dest->getType(), Traits::RegisterSet::Reg_edx); | 1646 T_edx = makeReg(Dest->getType(), Traits::RegisterSet::Reg_edx); |
1652 _mov(T_edx, Zero); | 1647 _mov(T_edx, Zero); |
1653 _mov(T, Src0, Traits::RegisterSet::Reg_eax); | 1648 _mov(T, Src0, Traits::RegisterSet::Reg_eax); |
1654 _div(T_edx, Src1, T); | 1649 _div(T_edx, Src1, T); |
1655 _mov(Dest, T_edx); | 1650 _mov(Dest, T_edx); |
1656 } | 1651 } |
1657 break; | 1652 break; |
1658 case InstArithmetic::Srem: | 1653 case InstArithmetic::Srem: |
1659 // TODO(stichnot): Enable this after doing better performance | 1654 // TODO(stichnot): Enable this after doing better performance and cross |
1660 // and cross testing. | 1655 // testing. |
1661 if (false && Ctx->getFlags().getOptLevel() >= Opt_1) { | 1656 if (false && Ctx->getFlags().getOptLevel() >= Opt_1) { |
1662 // Optimize mod by constant power of 2, but not for Om1 or O0, | 1657 // Optimize mod by constant power of 2, but not for Om1 or O0, just to |
1663 // just to keep things simple there. | 1658 // keep things simple there. |
1664 if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) { | 1659 if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) { |
1665 int32_t Divisor = C->getValue(); | 1660 int32_t Divisor = C->getValue(); |
1666 uint32_t UDivisor = static_cast<uint32_t>(Divisor); | 1661 uint32_t UDivisor = static_cast<uint32_t>(Divisor); |
1667 if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) { | 1662 if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) { |
1668 uint32_t LogDiv = llvm::Log2_32(UDivisor); | 1663 uint32_t LogDiv = llvm::Log2_32(UDivisor); |
1669 Type Ty = Dest->getType(); | 1664 Type Ty = Dest->getType(); |
1670 // LLVM does the following for dest=src%(1<<log): | 1665 // LLVM does the following for dest=src%(1<<log): |
1671 // t=src | 1666 // t=src |
1672 // sar t,typewidth-1 // -1 if src is negative, 0 if not | 1667 // sar t,typewidth-1 // -1 if src is negative, 0 if not |
1673 // shr t,typewidth-log | 1668 // shr t,typewidth-log |
(...skipping 96 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1770 _mov(T_Hi, Src0Hi); | 1765 _mov(T_Hi, Src0Hi); |
1771 _mov(DestHi, T_Hi); | 1766 _mov(DestHi, T_Hi); |
1772 } else { | 1767 } else { |
1773 Operand *Src0Legal; | 1768 Operand *Src0Legal; |
1774 if (Dest->hasReg()) { | 1769 if (Dest->hasReg()) { |
1775 // If Dest already has a physical register, then only basic legalization | 1770 // If Dest already has a physical register, then only basic legalization |
1776 // is needed, as the source operand can be a register, immediate, or | 1771 // is needed, as the source operand can be a register, immediate, or |
1777 // memory. | 1772 // memory. |
1778 Src0Legal = legalize(Src0); | 1773 Src0Legal = legalize(Src0); |
1779 } else { | 1774 } else { |
1780 // If Dest could be a stack operand, then RI must be a physical | 1775 // If Dest could be a stack operand, then RI must be a physical register |
1781 // register or a scalar integer immediate. | 1776 // or a scalar integer immediate. |
1782 Src0Legal = legalize(Src0, Legal_Reg | Legal_Imm); | 1777 Src0Legal = legalize(Src0, Legal_Reg | Legal_Imm); |
1783 } | 1778 } |
1784 if (isVectorType(Dest->getType())) | 1779 if (isVectorType(Dest->getType())) |
1785 _movp(Dest, Src0Legal); | 1780 _movp(Dest, Src0Legal); |
1786 else | 1781 else |
1787 _mov(Dest, Src0Legal); | 1782 _mov(Dest, Src0Legal); |
1788 } | 1783 } |
1789 } | 1784 } |
1790 | 1785 |
1791 template <class Machine> | 1786 template <class Machine> |
1792 void TargetX86Base<Machine>::lowerBr(const InstBr *Inst) { | 1787 void TargetX86Base<Machine>::lowerBr(const InstBr *Inst) { |
1793 if (Inst->isUnconditional()) { | 1788 if (Inst->isUnconditional()) { |
1794 _br(Inst->getTargetUnconditional()); | 1789 _br(Inst->getTargetUnconditional()); |
1795 return; | 1790 return; |
1796 } | 1791 } |
1797 Operand *Cond = Inst->getCondition(); | 1792 Operand *Cond = Inst->getCondition(); |
1798 | 1793 |
1799 // Handle folding opportunities. | 1794 // Handle folding opportunities. |
1800 if (const class Inst *Producer = FoldingInfo.getProducerFor(Cond)) { | 1795 if (const class Inst *Producer = FoldingInfo.getProducerFor(Cond)) { |
1801 assert(Producer->isDeleted()); | 1796 assert(Producer->isDeleted()); |
1802 switch (BoolFolding::getProducerKind(Producer)) { | 1797 switch (BoolFolding::getProducerKind(Producer)) { |
1803 default: | 1798 default: |
1804 break; | 1799 break; |
1805 case BoolFolding::PK_Icmp32: { | 1800 case BoolFolding::PK_Icmp32: { |
1806 // TODO(stichnot): Refactor similarities between this block and | 1801 // TODO(stichnot): Refactor similarities between this block and the |
1807 // the corresponding code in lowerIcmp(). | 1802 // corresponding code in lowerIcmp(). |
1808 auto *Cmp = llvm::dyn_cast<InstIcmp>(Producer); | 1803 auto *Cmp = llvm::dyn_cast<InstIcmp>(Producer); |
1809 Operand *Src0 = Producer->getSrc(0); | 1804 Operand *Src0 = Producer->getSrc(0); |
1810 Operand *Src1 = legalize(Producer->getSrc(1)); | 1805 Operand *Src1 = legalize(Producer->getSrc(1)); |
1811 Operand *Src0RM = legalizeSrc0ForCmp(Src0, Src1); | 1806 Operand *Src0RM = legalizeSrc0ForCmp(Src0, Src1); |
1812 _cmp(Src0RM, Src1); | 1807 _cmp(Src0RM, Src1); |
1813 _br(Traits::getIcmp32Mapping(Cmp->getCondition()), Inst->getTargetTrue(), | 1808 _br(Traits::getIcmp32Mapping(Cmp->getCondition()), Inst->getTargetTrue(), |
1814 Inst->getTargetFalse()); | 1809 Inst->getTargetFalse()); |
1815 return; | 1810 return; |
1816 } | 1811 } |
1817 } | 1812 } |
(...skipping 10 matching lines...) Expand all Loading... |
1828 // a = cast(b) ==> t=cast(b); a=t; (link t->b, link a->t, no overlap) | 1823 // a = cast(b) ==> t=cast(b); a=t; (link t->b, link a->t, no overlap) |
1829 InstCast::OpKind CastKind = Inst->getCastKind(); | 1824 InstCast::OpKind CastKind = Inst->getCastKind(); |
1830 Variable *Dest = Inst->getDest(); | 1825 Variable *Dest = Inst->getDest(); |
1831 switch (CastKind) { | 1826 switch (CastKind) { |
1832 default: | 1827 default: |
1833 Func->setError("Cast type not supported"); | 1828 Func->setError("Cast type not supported"); |
1834 return; | 1829 return; |
1835 case InstCast::Sext: { | 1830 case InstCast::Sext: { |
1836 // Src0RM is the source operand legalized to physical register or memory, | 1831 // Src0RM is the source operand legalized to physical register or memory, |
1837 // but not immediate, since the relevant x86 native instructions don't | 1832 // but not immediate, since the relevant x86 native instructions don't |
1838 // allow an immediate operand. If the operand is an immediate, we could | 1833 // allow an immediate operand. If the operand is an immediate, we could |
1839 // consider computing the strength-reduced result at translation time, | 1834 // consider computing the strength-reduced result at translation time, but |
1840 // but we're unlikely to see something like that in the bitcode that | 1835 // we're unlikely to see something like that in the bitcode that the |
1841 // the optimizer wouldn't have already taken care of. | 1836 // optimizer wouldn't have already taken care of. |
1842 Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem); | 1837 Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem); |
1843 if (isVectorType(Dest->getType())) { | 1838 if (isVectorType(Dest->getType())) { |
1844 Type DestTy = Dest->getType(); | 1839 Type DestTy = Dest->getType(); |
1845 if (DestTy == IceType_v16i8) { | 1840 if (DestTy == IceType_v16i8) { |
1846 // onemask = materialize(1,1,...); dst = (src & onemask) > 0 | 1841 // onemask = materialize(1,1,...); dst = (src & onemask) > 0 |
1847 Variable *OneMask = makeVectorOfOnes(Dest->getType()); | 1842 Variable *OneMask = makeVectorOfOnes(Dest->getType()); |
1848 Variable *T = makeReg(DestTy); | 1843 Variable *T = makeReg(DestTy); |
1849 _movp(T, Src0RM); | 1844 _movp(T, Src0RM); |
1850 _pand(T, OneMask); | 1845 _pand(T, OneMask); |
1851 Variable *Zeros = makeVectorOfZeros(Dest->getType()); | 1846 Variable *Zeros = makeVectorOfZeros(Dest->getType()); |
(...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1891 // sar t1, dst_bitwidth - 1 | 1886 // sar t1, dst_bitwidth - 1 |
1892 // dst = t1 | 1887 // dst = t1 |
1893 size_t DestBits = | 1888 size_t DestBits = |
1894 Traits::X86_CHAR_BIT * typeWidthInBytes(Dest->getType()); | 1889 Traits::X86_CHAR_BIT * typeWidthInBytes(Dest->getType()); |
1895 Constant *ShiftAmount = Ctx->getConstantInt32(DestBits - 1); | 1890 Constant *ShiftAmount = Ctx->getConstantInt32(DestBits - 1); |
1896 Variable *T = makeReg(Dest->getType()); | 1891 Variable *T = makeReg(Dest->getType()); |
1897 if (typeWidthInBytes(Dest->getType()) <= | 1892 if (typeWidthInBytes(Dest->getType()) <= |
1898 typeWidthInBytes(Src0RM->getType())) { | 1893 typeWidthInBytes(Src0RM->getType())) { |
1899 _mov(T, Src0RM); | 1894 _mov(T, Src0RM); |
1900 } else { | 1895 } else { |
1901 // Widen the source using movsx or movzx. (It doesn't matter | 1896 // Widen the source using movsx or movzx. (It doesn't matter which one, |
1902 // which one, since the following shl/sar overwrite the bits.) | 1897 // since the following shl/sar overwrite the bits.) |
1903 _movzx(T, Src0RM); | 1898 _movzx(T, Src0RM); |
1904 } | 1899 } |
1905 _shl(T, ShiftAmount); | 1900 _shl(T, ShiftAmount); |
1906 _sar(T, ShiftAmount); | 1901 _sar(T, ShiftAmount); |
1907 _mov(Dest, T); | 1902 _mov(Dest, T); |
1908 } else { | 1903 } else { |
1909 // t1 = movsx src; dst = t1 | 1904 // t1 = movsx src; dst = t1 |
1910 Variable *T = makeReg(Dest->getType()); | 1905 Variable *T = makeReg(Dest->getType()); |
1911 _movsx(T, Src0RM); | 1906 _movsx(T, Src0RM); |
1912 _mov(Dest, T); | 1907 _mov(Dest, T); |
(...skipping 90 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2003 if (isVectorType(Dest->getType())) { | 1998 if (isVectorType(Dest->getType())) { |
2004 assert(Dest->getType() == IceType_v4i32 && | 1999 assert(Dest->getType() == IceType_v4i32 && |
2005 Inst->getSrc(0)->getType() == IceType_v4f32); | 2000 Inst->getSrc(0)->getType() == IceType_v4f32); |
2006 Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem); | 2001 Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem); |
2007 if (llvm::isa<typename Traits::X86OperandMem>(Src0RM)) | 2002 if (llvm::isa<typename Traits::X86OperandMem>(Src0RM)) |
2008 Src0RM = legalizeToReg(Src0RM); | 2003 Src0RM = legalizeToReg(Src0RM); |
2009 Variable *T = makeReg(Dest->getType()); | 2004 Variable *T = makeReg(Dest->getType()); |
2010 _cvt(T, Src0RM, Traits::Insts::Cvt::Tps2dq); | 2005 _cvt(T, Src0RM, Traits::Insts::Cvt::Tps2dq); |
2011 _movp(Dest, T); | 2006 _movp(Dest, T); |
2012 } else if (!Traits::Is64Bit && Dest->getType() == IceType_i64) { | 2007 } else if (!Traits::Is64Bit && Dest->getType() == IceType_i64) { |
2013 // Use a helper for converting floating-point values to 64-bit | 2008 // Use a helper for converting floating-point values to 64-bit integers. |
2014 // integers. SSE2 appears to have no way to convert from xmm | 2009 // SSE2 appears to have no way to convert from xmm registers to something |
2015 // registers to something like the edx:eax register pair, and | 2010 // like the edx:eax register pair, and gcc and clang both want to use x87 |
2016 // gcc and clang both want to use x87 instructions complete with | 2011 // instructions complete with temporary manipulation of the status word. |
2017 // temporary manipulation of the status word. This helper is | 2012 // This helper is not needed for x86-64. |
2018 // not needed for x86-64. | |
2019 split64(Dest); | 2013 split64(Dest); |
2020 const SizeT MaxSrcs = 1; | 2014 const SizeT MaxSrcs = 1; |
2021 Type SrcType = Inst->getSrc(0)->getType(); | 2015 Type SrcType = Inst->getSrc(0)->getType(); |
2022 InstCall *Call = | 2016 InstCall *Call = |
2023 makeHelperCall(isFloat32Asserting32Or64(SrcType) ? H_fptosi_f32_i64 | 2017 makeHelperCall(isFloat32Asserting32Or64(SrcType) ? H_fptosi_f32_i64 |
2024 : H_fptosi_f64_i64, | 2018 : H_fptosi_f64_i64, |
2025 Dest, MaxSrcs); | 2019 Dest, MaxSrcs); |
2026 Call->addArg(Inst->getSrc(0)); | 2020 Call->addArg(Inst->getSrc(0)); |
2027 lowerCall(Call); | 2021 lowerCall(Call); |
2028 } else { | 2022 } else { |
(...skipping 114 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2143 Operand *Src0 = Inst->getSrc(0); | 2137 Operand *Src0 = Inst->getSrc(0); |
2144 if (isVectorType(Src0->getType())) { | 2138 if (isVectorType(Src0->getType())) { |
2145 assert(Dest->getType() == IceType_v4f32 && | 2139 assert(Dest->getType() == IceType_v4f32 && |
2146 Src0->getType() == IceType_v4i32); | 2140 Src0->getType() == IceType_v4i32); |
2147 const SizeT MaxSrcs = 1; | 2141 const SizeT MaxSrcs = 1; |
2148 InstCall *Call = makeHelperCall(H_uitofp_4xi32_4xf32, Dest, MaxSrcs); | 2142 InstCall *Call = makeHelperCall(H_uitofp_4xi32_4xf32, Dest, MaxSrcs); |
2149 Call->addArg(Src0); | 2143 Call->addArg(Src0); |
2150 lowerCall(Call); | 2144 lowerCall(Call); |
2151 } else if (Src0->getType() == IceType_i64 || | 2145 } else if (Src0->getType() == IceType_i64 || |
2152 (!Traits::Is64Bit && Src0->getType() == IceType_i32)) { | 2146 (!Traits::Is64Bit && Src0->getType() == IceType_i32)) { |
2153 // Use a helper for x86-32 and x86-64. Also use a helper for | 2147 // Use a helper for x86-32 and x86-64. Also use a helper for i32 on |
2154 // i32 on x86-32. | 2148 // x86-32. |
2155 const SizeT MaxSrcs = 1; | 2149 const SizeT MaxSrcs = 1; |
2156 Type DestType = Dest->getType(); | 2150 Type DestType = Dest->getType(); |
2157 IceString TargetString; | 2151 IceString TargetString; |
2158 if (isInt32Asserting32Or64(Src0->getType())) { | 2152 if (isInt32Asserting32Or64(Src0->getType())) { |
2159 TargetString = isFloat32Asserting32Or64(DestType) ? H_uitofp_i32_f32 | 2153 TargetString = isFloat32Asserting32Or64(DestType) ? H_uitofp_i32_f32 |
2160 : H_uitofp_i32_f64; | 2154 : H_uitofp_i32_f64; |
2161 } else { | 2155 } else { |
2162 TargetString = isFloat32Asserting32Or64(DestType) ? H_uitofp_i64_f32 | 2156 TargetString = isFloat32Asserting32Or64(DestType) ? H_uitofp_i64_f32 |
2163 : H_uitofp_i64_f64; | 2157 : H_uitofp_i64_f64; |
2164 } | 2158 } |
(...skipping 113 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2278 _mov(DestLo, T_Lo); | 2272 _mov(DestLo, T_Lo); |
2279 _mov(T_Hi, SpillHi); | 2273 _mov(T_Hi, SpillHi); |
2280 _mov(DestHi, T_Hi); | 2274 _mov(DestHi, T_Hi); |
2281 } | 2275 } |
2282 } break; | 2276 } break; |
2283 case IceType_f64: { | 2277 case IceType_f64: { |
2284 assert(Src0->getType() == IceType_i64); | 2278 assert(Src0->getType() == IceType_i64); |
2285 if (Traits::Is64Bit) { | 2279 if (Traits::Is64Bit) { |
2286 Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); | 2280 Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); |
2287 Variable *T = makeReg(IceType_f64); | 2281 Variable *T = makeReg(IceType_f64); |
2288 // Movd requires its fp argument (in this case, the bitcast destination) | 2282 // Movd requires its fp argument (in this case, the bitcast |
2289 // to be an xmm register. | 2283 // destination) to be an xmm register. |
2290 T->setMustHaveReg(); | 2284 T->setMustHaveReg(); |
2291 _movd(T, Src0RM); | 2285 _movd(T, Src0RM); |
2292 _mov(Dest, T); | 2286 _mov(Dest, T); |
2293 } else { | 2287 } else { |
2294 Src0 = legalize(Src0); | 2288 Src0 = legalize(Src0); |
2295 if (llvm::isa<typename Traits::X86OperandMem>(Src0)) { | 2289 if (llvm::isa<typename Traits::X86OperandMem>(Src0)) { |
2296 Variable *T = Func->makeVariable(Dest->getType()); | 2290 Variable *T = Func->makeVariable(Dest->getType()); |
2297 _movq(T, Src0); | 2291 _movq(T, Src0); |
2298 _movq(Dest, T); | 2292 _movq(Dest, T); |
2299 break; | 2293 break; |
(...skipping 11 matching lines...) Expand all Loading... |
2311 Variable *Spill = SpillVar; | 2305 Variable *Spill = SpillVar; |
2312 Spill->setMustNotHaveReg(); | 2306 Spill->setMustNotHaveReg(); |
2313 | 2307 |
2314 Variable *T_Lo = nullptr, *T_Hi = nullptr; | 2308 Variable *T_Lo = nullptr, *T_Hi = nullptr; |
2315 typename Traits::VariableSplit *SpillLo = Traits::VariableSplit::create( | 2309 typename Traits::VariableSplit *SpillLo = Traits::VariableSplit::create( |
2316 Func, Spill, Traits::VariableSplit::Low); | 2310 Func, Spill, Traits::VariableSplit::Low); |
2317 typename Traits::VariableSplit *SpillHi = Traits::VariableSplit::create( | 2311 typename Traits::VariableSplit *SpillHi = Traits::VariableSplit::create( |
2318 Func, Spill, Traits::VariableSplit::High); | 2312 Func, Spill, Traits::VariableSplit::High); |
2319 _mov(T_Lo, loOperand(Src0)); | 2313 _mov(T_Lo, loOperand(Src0)); |
2320 // Technically, the Spill is defined after the _store happens, but | 2314 // Technically, the Spill is defined after the _store happens, but |
2321 // SpillLo is considered a "use" of Spill so define Spill before it | 2315 // SpillLo is considered a "use" of Spill so define Spill before it is |
2322 // is used. | 2316 // used. |
2323 Context.insert(InstFakeDef::create(Func, Spill)); | 2317 Context.insert(InstFakeDef::create(Func, Spill)); |
2324 _store(T_Lo, SpillLo); | 2318 _store(T_Lo, SpillLo); |
2325 _mov(T_Hi, hiOperand(Src0)); | 2319 _mov(T_Hi, hiOperand(Src0)); |
2326 _store(T_Hi, SpillHi); | 2320 _store(T_Hi, SpillHi); |
2327 _movq(Dest, Spill); | 2321 _movq(Dest, Spill); |
2328 } | 2322 } |
2329 } break; | 2323 } break; |
2330 case IceType_v8i1: { | 2324 case IceType_v8i1: { |
2331 assert(Src0->getType() == IceType_i8); | 2325 assert(Src0->getType() == IceType_i8); |
2332 InstCall *Call = makeHelperCall(H_bitcast_i8_8xi1, Dest, 1); | 2326 InstCall *Call = makeHelperCall(H_bitcast_i8_8xi1, Dest, 1); |
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2377 InstructionSet >= Traits::SSE4_1; | 2371 InstructionSet >= Traits::SSE4_1; |
2378 if (CanUsePextr && Ty != IceType_v4f32) { | 2372 if (CanUsePextr && Ty != IceType_v4f32) { |
2379 // Use pextrb, pextrw, or pextrd. | 2373 // Use pextrb, pextrw, or pextrd. |
2380 Constant *Mask = Ctx->getConstantInt32(Index); | 2374 Constant *Mask = Ctx->getConstantInt32(Index); |
2381 Variable *SourceVectR = legalizeToReg(SourceVectNotLegalized); | 2375 Variable *SourceVectR = legalizeToReg(SourceVectNotLegalized); |
2382 _pextr(ExtractedElementR, SourceVectR, Mask); | 2376 _pextr(ExtractedElementR, SourceVectR, Mask); |
2383 } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) { | 2377 } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) { |
2384 // Use pshufd and movd/movss. | 2378 // Use pshufd and movd/movss. |
2385 Variable *T = nullptr; | 2379 Variable *T = nullptr; |
2386 if (Index) { | 2380 if (Index) { |
2387 // The shuffle only needs to occur if the element to be extracted | 2381 // The shuffle only needs to occur if the element to be extracted is not |
2388 // is not at the lowest index. | 2382 // at the lowest index. |
2389 Constant *Mask = Ctx->getConstantInt32(Index); | 2383 Constant *Mask = Ctx->getConstantInt32(Index); |
2390 T = makeReg(Ty); | 2384 T = makeReg(Ty); |
2391 _pshufd(T, legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem), Mask); | 2385 _pshufd(T, legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem), Mask); |
2392 } else { | 2386 } else { |
2393 T = legalizeToReg(SourceVectNotLegalized); | 2387 T = legalizeToReg(SourceVectNotLegalized); |
2394 } | 2388 } |
2395 | 2389 |
2396 if (InVectorElementTy == IceType_i32) { | 2390 if (InVectorElementTy == IceType_i32) { |
2397 _movd(ExtractedElementR, T); | 2391 _movd(ExtractedElementR, T); |
2398 } else { // Ty == IceType_f32 | 2392 } else { // Ty == IceType_f32 |
2399 // TODO(wala): _movss is only used here because _mov does not | 2393 // TODO(wala): _movss is only used here because _mov does not allow a |
2400 // allow a vector source and a scalar destination. _mov should be | 2394 // vector source and a scalar destination. _mov should be able to be |
2401 // able to be used here. | 2395 // used here. |
2402 // _movss is a binary instruction, so the FakeDef is needed to | 2396 // _movss is a binary instruction, so the FakeDef is needed to keep the |
2403 // keep the live range analysis consistent. | 2397 // live range analysis consistent. |
2404 Context.insert(InstFakeDef::create(Func, ExtractedElementR)); | 2398 Context.insert(InstFakeDef::create(Func, ExtractedElementR)); |
2405 _movss(ExtractedElementR, T); | 2399 _movss(ExtractedElementR, T); |
2406 } | 2400 } |
2407 } else { | 2401 } else { |
2408 assert(Ty == IceType_v16i8 || Ty == IceType_v16i1); | 2402 assert(Ty == IceType_v16i8 || Ty == IceType_v16i1); |
2409 // Spill the value to a stack slot and do the extraction in memory. | 2403 // Spill the value to a stack slot and do the extraction in memory. |
2410 // | 2404 // |
2411 // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when | 2405 // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when support |
2412 // support for legalizing to mem is implemented. | 2406 // for legalizing to mem is implemented. |
2413 Variable *Slot = Func->makeVariable(Ty); | 2407 Variable *Slot = Func->makeVariable(Ty); |
2414 Slot->setMustNotHaveReg(); | 2408 Slot->setMustNotHaveReg(); |
2415 _movp(Slot, legalizeToReg(SourceVectNotLegalized)); | 2409 _movp(Slot, legalizeToReg(SourceVectNotLegalized)); |
2416 | 2410 |
2417 // Compute the location of the element in memory. | 2411 // Compute the location of the element in memory. |
2418 unsigned Offset = Index * typeWidthInBytes(InVectorElementTy); | 2412 unsigned Offset = Index * typeWidthInBytes(InVectorElementTy); |
2419 typename Traits::X86OperandMem *Loc = | 2413 typename Traits::X86OperandMem *Loc = |
2420 getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset); | 2414 getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset); |
2421 _mov(ExtractedElementR, Loc); | 2415 _mov(ExtractedElementR, Loc); |
2422 } | 2416 } |
(...skipping 159 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2582 Src0 = NewSrc0; | 2576 Src0 = NewSrc0; |
2583 Src1 = NewSrc1; | 2577 Src1 = NewSrc1; |
2584 Ty = NewTy; | 2578 Ty = NewTy; |
2585 } | 2579 } |
2586 | 2580 |
2587 InstIcmp::ICond Condition = Inst->getCondition(); | 2581 InstIcmp::ICond Condition = Inst->getCondition(); |
2588 | 2582 |
2589 Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); | 2583 Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); |
2590 Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem); | 2584 Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem); |
2591 | 2585 |
2592 // SSE2 only has signed comparison operations. Transform unsigned | 2586 // SSE2 only has signed comparison operations. Transform unsigned inputs in |
2593 // inputs in a manner that allows for the use of signed comparison | 2587 // a manner that allows for the use of signed comparison operations by |
2594 // operations by flipping the high order bits. | 2588 // flipping the high order bits. |
2595 if (Condition == InstIcmp::Ugt || Condition == InstIcmp::Uge || | 2589 if (Condition == InstIcmp::Ugt || Condition == InstIcmp::Uge || |
2596 Condition == InstIcmp::Ult || Condition == InstIcmp::Ule) { | 2590 Condition == InstIcmp::Ult || Condition == InstIcmp::Ule) { |
2597 Variable *T0 = makeReg(Ty); | 2591 Variable *T0 = makeReg(Ty); |
2598 Variable *T1 = makeReg(Ty); | 2592 Variable *T1 = makeReg(Ty); |
2599 Variable *HighOrderBits = makeVectorOfHighOrderBits(Ty); | 2593 Variable *HighOrderBits = makeVectorOfHighOrderBits(Ty); |
2600 _movp(T0, Src0RM); | 2594 _movp(T0, Src0RM); |
2601 _pxor(T0, HighOrderBits); | 2595 _pxor(T0, HighOrderBits); |
2602 _movp(T1, Src1RM); | 2596 _movp(T1, Src1RM); |
2603 _pxor(T1, HighOrderBits); | 2597 _pxor(T1, HighOrderBits); |
2604 Src0RM = T0; | 2598 Src0RM = T0; |
(...skipping 114 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2719 // Only constant indices are allowed in PNaCl IR. | 2713 // Only constant indices are allowed in PNaCl IR. |
2720 assert(ElementIndex); | 2714 assert(ElementIndex); |
2721 unsigned Index = ElementIndex->getValue(); | 2715 unsigned Index = ElementIndex->getValue(); |
2722 assert(Index < typeNumElements(SourceVectNotLegalized->getType())); | 2716 assert(Index < typeNumElements(SourceVectNotLegalized->getType())); |
2723 | 2717 |
2724 Type Ty = SourceVectNotLegalized->getType(); | 2718 Type Ty = SourceVectNotLegalized->getType(); |
2725 Type ElementTy = typeElementType(Ty); | 2719 Type ElementTy = typeElementType(Ty); |
2726 Type InVectorElementTy = Traits::getInVectorElementType(Ty); | 2720 Type InVectorElementTy = Traits::getInVectorElementType(Ty); |
2727 | 2721 |
2728 if (ElementTy == IceType_i1) { | 2722 if (ElementTy == IceType_i1) { |
2729 // Expand the element to the appropriate size for it to be inserted | 2723 // Expand the element to the appropriate size for it to be inserted in the |
2730 // in the vector. | 2724 // vector. |
2731 Variable *Expanded = Func->makeVariable(InVectorElementTy); | 2725 Variable *Expanded = Func->makeVariable(InVectorElementTy); |
2732 InstCast *Cast = InstCast::create(Func, InstCast::Zext, Expanded, | 2726 InstCast *Cast = InstCast::create(Func, InstCast::Zext, Expanded, |
2733 ElementToInsertNotLegalized); | 2727 ElementToInsertNotLegalized); |
2734 lowerCast(Cast); | 2728 lowerCast(Cast); |
2735 ElementToInsertNotLegalized = Expanded; | 2729 ElementToInsertNotLegalized = Expanded; |
2736 } | 2730 } |
2737 | 2731 |
2738 if (Ty == IceType_v8i16 || Ty == IceType_v8i1 || | 2732 if (Ty == IceType_v8i16 || Ty == IceType_v8i1 || |
2739 InstructionSet >= Traits::SSE4_1) { | 2733 InstructionSet >= Traits::SSE4_1) { |
2740 // Use insertps, pinsrb, pinsrw, or pinsrd. | 2734 // Use insertps, pinsrb, pinsrw, or pinsrd. |
(...skipping 25 matching lines...) Expand all Loading... |
2766 } | 2760 } |
2767 | 2761 |
2768 if (Index == 0) { | 2762 if (Index == 0) { |
2769 Variable *T = makeReg(Ty); | 2763 Variable *T = makeReg(Ty); |
2770 _movp(T, SourceVectRM); | 2764 _movp(T, SourceVectRM); |
2771 _movss(T, ElementR); | 2765 _movss(T, ElementR); |
2772 _movp(Inst->getDest(), T); | 2766 _movp(Inst->getDest(), T); |
2773 return; | 2767 return; |
2774 } | 2768 } |
2775 | 2769 |
2776 // shufps treats the source and desination operands as vectors of | 2770 // shufps treats the source and destination operands as vectors of four |
2777 // four doublewords. The destination's two high doublewords are | 2771 // doublewords. The destination's two high doublewords are selected from |
2778 // selected from the source operand and the two low doublewords are | 2772 // the source operand and the two low doublewords are selected from the |
2779 // selected from the (original value of) the destination operand. | 2773 // (original value of) the destination operand. An insertelement operation |
2780 // An insertelement operation can be effected with a sequence of two | 2774 // can be effected with a sequence of two shufps operations with |
2781 // shufps operations with appropriate masks. In all cases below, | 2775 // appropriate masks. In all cases below, Element[0] is being inserted into |
2782 // Element[0] is being inserted into SourceVectOperand. Indices are | 2776 // SourceVectOperand. Indices are ordered from left to right. |
2783 // ordered from left to right. | |
2784 // | 2777 // |
2785 // insertelement into index 1 (result is stored in ElementR): | 2778 // insertelement into index 1 (result is stored in ElementR): |
2786 // ElementR := ElementR[0, 0] SourceVectRM[0, 0] | 2779 // ElementR := ElementR[0, 0] SourceVectRM[0, 0] |
2787 // ElementR := ElementR[3, 0] SourceVectRM[2, 3] | 2780 // ElementR := ElementR[3, 0] SourceVectRM[2, 3] |
2788 // | 2781 // |
2789 // insertelement into index 2 (result is stored in T): | 2782 // insertelement into index 2 (result is stored in T): |
2790 // T := SourceVectRM | 2783 // T := SourceVectRM |
2791 // ElementR := ElementR[0, 0] T[0, 3] | 2784 // ElementR := ElementR[0, 0] T[0, 3] |
2792 // T := T[0, 1] ElementR[0, 3] | 2785 // T := T[0, 1] ElementR[0, 3] |
2793 // | 2786 // |
(...skipping 13 matching lines...) Expand all Loading... |
2807 _movp(Inst->getDest(), ElementR); | 2800 _movp(Inst->getDest(), ElementR); |
2808 } else { | 2801 } else { |
2809 Variable *T = makeReg(Ty); | 2802 Variable *T = makeReg(Ty); |
2810 _movp(T, SourceVectRM); | 2803 _movp(T, SourceVectRM); |
2811 _shufps(ElementR, T, Mask1Constant); | 2804 _shufps(ElementR, T, Mask1Constant); |
2812 _shufps(T, ElementR, Mask2Constant); | 2805 _shufps(T, ElementR, Mask2Constant); |
2813 _movp(Inst->getDest(), T); | 2806 _movp(Inst->getDest(), T); |
2814 } | 2807 } |
2815 } else { | 2808 } else { |
2816 assert(Ty == IceType_v16i8 || Ty == IceType_v16i1); | 2809 assert(Ty == IceType_v16i8 || Ty == IceType_v16i1); |
2817 // Spill the value to a stack slot and perform the insertion in | 2810 // Spill the value to a stack slot and perform the insertion in memory. |
2818 // memory. | |
2819 // | 2811 // |
2820 // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when | 2812 // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when support |
2821 // support for legalizing to mem is implemented. | 2813 // for legalizing to mem is implemented. |
2822 Variable *Slot = Func->makeVariable(Ty); | 2814 Variable *Slot = Func->makeVariable(Ty); |
2823 Slot->setMustNotHaveReg(); | 2815 Slot->setMustNotHaveReg(); |
2824 _movp(Slot, legalizeToReg(SourceVectNotLegalized)); | 2816 _movp(Slot, legalizeToReg(SourceVectNotLegalized)); |
2825 | 2817 |
2826 // Compute the location of the position to insert in memory. | 2818 // Compute the location of the position to insert in memory. |
2827 unsigned Offset = Index * typeWidthInBytes(InVectorElementTy); | 2819 unsigned Offset = Index * typeWidthInBytes(InVectorElementTy); |
2828 typename Traits::X86OperandMem *Loc = | 2820 typename Traits::X86OperandMem *Loc = |
2829 getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset); | 2821 getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset); |
2830 _store(legalizeToReg(ElementToInsertNotLegalized), Loc); | 2822 _store(legalizeToReg(ElementToInsertNotLegalized), Loc); |
2831 | 2823 |
(...skipping 25 matching lines...) Expand all Loading... |
2857 } | 2849 } |
2858 case Intrinsics::AtomicFence: | 2850 case Intrinsics::AtomicFence: |
2859 if (!Intrinsics::isMemoryOrderValid( | 2851 if (!Intrinsics::isMemoryOrderValid( |
2860 ID, getConstantMemoryOrder(Instr->getArg(0)))) { | 2852 ID, getConstantMemoryOrder(Instr->getArg(0)))) { |
2861 Func->setError("Unexpected memory ordering for AtomicFence"); | 2853 Func->setError("Unexpected memory ordering for AtomicFence"); |
2862 return; | 2854 return; |
2863 } | 2855 } |
2864 _mfence(); | 2856 _mfence(); |
2865 return; | 2857 return; |
2866 case Intrinsics::AtomicFenceAll: | 2858 case Intrinsics::AtomicFenceAll: |
2867 // NOTE: FenceAll should prevent and load/store from being moved | 2859 // NOTE: FenceAll should prevent and load/store from being moved across the |
2868 // across the fence (both atomic and non-atomic). The InstX8632Mfence | 2860 // fence (both atomic and non-atomic). The InstX8632Mfence instruction is |
2869 // instruction is currently marked coarsely as "HasSideEffects". | 2861 // currently marked coarsely as "HasSideEffects". |
2870 _mfence(); | 2862 _mfence(); |
2871 return; | 2863 return; |
2872 case Intrinsics::AtomicIsLockFree: { | 2864 case Intrinsics::AtomicIsLockFree: { |
2873 // X86 is always lock free for 8/16/32/64 bit accesses. | 2865 // X86 is always lock free for 8/16/32/64 bit accesses. |
2874 // TODO(jvoung): Since the result is constant when given a constant | 2866 // TODO(jvoung): Since the result is constant when given a constant byte |
2875 // byte size, this opens up DCE opportunities. | 2867 // size, this opens up DCE opportunities. |
2876 Operand *ByteSize = Instr->getArg(0); | 2868 Operand *ByteSize = Instr->getArg(0); |
2877 Variable *Dest = Instr->getDest(); | 2869 Variable *Dest = Instr->getDest(); |
2878 if (ConstantInteger32 *CI = llvm::dyn_cast<ConstantInteger32>(ByteSize)) { | 2870 if (ConstantInteger32 *CI = llvm::dyn_cast<ConstantInteger32>(ByteSize)) { |
2879 Constant *Result; | 2871 Constant *Result; |
2880 switch (CI->getValue()) { | 2872 switch (CI->getValue()) { |
2881 default: | 2873 default: |
2882 // Some x86-64 processors support the cmpxchg16b intruction, which | 2874 // Some x86-64 processors support the cmpxchg16b instruction, which can |
2883 // can make 16-byte operations lock free (when used with the LOCK | 2875 // make 16-byte operations lock free (when used with the LOCK prefix). |
2884 // prefix). However, that's not supported in 32-bit mode, so just | 2876 // However, that's not supported in 32-bit mode, so just return 0 even |
2885 // return 0 even for large sizes. | 2877 // for large sizes. |
2886 Result = Ctx->getConstantZero(IceType_i32); | 2878 Result = Ctx->getConstantZero(IceType_i32); |
2887 break; | 2879 break; |
2888 case 1: | 2880 case 1: |
2889 case 2: | 2881 case 2: |
2890 case 4: | 2882 case 4: |
2891 case 8: | 2883 case 8: |
2892 Result = Ctx->getConstantInt32(1); | 2884 Result = Ctx->getConstantInt32(1); |
2893 break; | 2885 break; |
2894 } | 2886 } |
2895 _mov(Dest, Result); | 2887 _mov(Dest, Result); |
2896 return; | 2888 return; |
2897 } | 2889 } |
2898 // The PNaCl ABI requires the byte size to be a compile-time constant. | 2890 // The PNaCl ABI requires the byte size to be a compile-time constant. |
2899 Func->setError("AtomicIsLockFree byte size should be compile-time const"); | 2891 Func->setError("AtomicIsLockFree byte size should be compile-time const"); |
2900 return; | 2892 return; |
2901 } | 2893 } |
2902 case Intrinsics::AtomicLoad: { | 2894 case Intrinsics::AtomicLoad: { |
2903 // We require the memory address to be naturally aligned. | 2895 // We require the memory address to be naturally aligned. Given that is the |
2904 // Given that is the case, then normal loads are atomic. | 2896 // case, then normal loads are atomic. |
2905 if (!Intrinsics::isMemoryOrderValid( | 2897 if (!Intrinsics::isMemoryOrderValid( |
2906 ID, getConstantMemoryOrder(Instr->getArg(1)))) { | 2898 ID, getConstantMemoryOrder(Instr->getArg(1)))) { |
2907 Func->setError("Unexpected memory ordering for AtomicLoad"); | 2899 Func->setError("Unexpected memory ordering for AtomicLoad"); |
2908 return; | 2900 return; |
2909 } | 2901 } |
2910 Variable *Dest = Instr->getDest(); | 2902 Variable *Dest = Instr->getDest(); |
2911 if (!Traits::Is64Bit && Dest->getType() == IceType_i64) { | 2903 if (!Traits::Is64Bit && Dest->getType() == IceType_i64) { |
2912 // Follow what GCC does and use a movq instead of what lowerLoad() | 2904 // Follow what GCC does and use a movq instead of what lowerLoad() |
2913 // normally does (split the load into two). | 2905 // normally does (split the load into two). Thus, this skips |
2914 // Thus, this skips load/arithmetic op folding. Load/arithmetic folding | 2906 // load/arithmetic op folding. Load/arithmetic folding can't happen |
2915 // can't happen anyway, since this is x86-32 and integer arithmetic only | 2907 // anyway, since this is x86-32 and integer arithmetic only happens on |
2916 // happens on 32-bit quantities. | 2908 // 32-bit quantities. |
2917 Variable *T = makeReg(IceType_f64); | 2909 Variable *T = makeReg(IceType_f64); |
2918 typename Traits::X86OperandMem *Addr = | 2910 typename Traits::X86OperandMem *Addr = |
2919 formMemoryOperand(Instr->getArg(0), IceType_f64); | 2911 formMemoryOperand(Instr->getArg(0), IceType_f64); |
2920 _movq(T, Addr); | 2912 _movq(T, Addr); |
2921 // Then cast the bits back out of the XMM register to the i64 Dest. | 2913 // Then cast the bits back out of the XMM register to the i64 Dest. |
2922 InstCast *Cast = InstCast::create(Func, InstCast::Bitcast, Dest, T); | 2914 InstCast *Cast = InstCast::create(Func, InstCast::Bitcast, Dest, T); |
2923 lowerCast(Cast); | 2915 lowerCast(Cast); |
2924 // Make sure that the atomic load isn't elided when unused. | 2916 // Make sure that the atomic load isn't elided when unused. |
2925 Context.insert(InstFakeUse::create(Func, Dest->getLo())); | 2917 Context.insert(InstFakeUse::create(Func, Dest->getLo())); |
2926 Context.insert(InstFakeUse::create(Func, Dest->getHi())); | 2918 Context.insert(InstFakeUse::create(Func, Dest->getHi())); |
2927 return; | 2919 return; |
2928 } | 2920 } |
2929 InstLoad *Load = InstLoad::create(Func, Dest, Instr->getArg(0)); | 2921 InstLoad *Load = InstLoad::create(Func, Dest, Instr->getArg(0)); |
2930 lowerLoad(Load); | 2922 lowerLoad(Load); |
2931 // Make sure the atomic load isn't elided when unused, by adding a FakeUse. | 2923 // Make sure the atomic load isn't elided when unused, by adding a FakeUse. |
2932 // Since lowerLoad may fuse the load w/ an arithmetic instruction, | 2924 // Since lowerLoad may fuse the load w/ an arithmetic instruction, insert |
2933 // insert the FakeUse on the last-inserted instruction's dest. | 2925 // the FakeUse on the last-inserted instruction's dest. |
2934 Context.insert( | 2926 Context.insert( |
2935 InstFakeUse::create(Func, Context.getLastInserted()->getDest())); | 2927 InstFakeUse::create(Func, Context.getLastInserted()->getDest())); |
2936 return; | 2928 return; |
2937 } | 2929 } |
2938 case Intrinsics::AtomicRMW: | 2930 case Intrinsics::AtomicRMW: |
2939 if (!Intrinsics::isMemoryOrderValid( | 2931 if (!Intrinsics::isMemoryOrderValid( |
2940 ID, getConstantMemoryOrder(Instr->getArg(3)))) { | 2932 ID, getConstantMemoryOrder(Instr->getArg(3)))) { |
2941 Func->setError("Unexpected memory ordering for AtomicRMW"); | 2933 Func->setError("Unexpected memory ordering for AtomicRMW"); |
2942 return; | 2934 return; |
2943 } | 2935 } |
2944 lowerAtomicRMW( | 2936 lowerAtomicRMW( |
2945 Instr->getDest(), | 2937 Instr->getDest(), |
2946 static_cast<uint32_t>( | 2938 static_cast<uint32_t>( |
2947 llvm::cast<ConstantInteger32>(Instr->getArg(0))->getValue()), | 2939 llvm::cast<ConstantInteger32>(Instr->getArg(0))->getValue()), |
2948 Instr->getArg(1), Instr->getArg(2)); | 2940 Instr->getArg(1), Instr->getArg(2)); |
2949 return; | 2941 return; |
2950 case Intrinsics::AtomicStore: { | 2942 case Intrinsics::AtomicStore: { |
2951 if (!Intrinsics::isMemoryOrderValid( | 2943 if (!Intrinsics::isMemoryOrderValid( |
2952 ID, getConstantMemoryOrder(Instr->getArg(2)))) { | 2944 ID, getConstantMemoryOrder(Instr->getArg(2)))) { |
2953 Func->setError("Unexpected memory ordering for AtomicStore"); | 2945 Func->setError("Unexpected memory ordering for AtomicStore"); |
2954 return; | 2946 return; |
2955 } | 2947 } |
2956 // We require the memory address to be naturally aligned. | 2948 // We require the memory address to be naturally aligned. Given that is the |
2957 // Given that is the case, then normal stores are atomic. | 2949 // case, then normal stores are atomic. Add a fence after the store to make |
2958 // Add a fence after the store to make it visible. | 2950 // it visible. |
2959 Operand *Value = Instr->getArg(0); | 2951 Operand *Value = Instr->getArg(0); |
2960 Operand *Ptr = Instr->getArg(1); | 2952 Operand *Ptr = Instr->getArg(1); |
2961 if (!Traits::Is64Bit && Value->getType() == IceType_i64) { | 2953 if (!Traits::Is64Bit && Value->getType() == IceType_i64) { |
2962 // Use a movq instead of what lowerStore() normally does | 2954 // Use a movq instead of what lowerStore() normally does (split the store |
2963 // (split the store into two), following what GCC does. | 2955 // into two), following what GCC does. Cast the bits from int -> to an |
2964 // Cast the bits from int -> to an xmm register first. | 2956 // xmm register first. |
2965 Variable *T = makeReg(IceType_f64); | 2957 Variable *T = makeReg(IceType_f64); |
2966 InstCast *Cast = InstCast::create(Func, InstCast::Bitcast, T, Value); | 2958 InstCast *Cast = InstCast::create(Func, InstCast::Bitcast, T, Value); |
2967 lowerCast(Cast); | 2959 lowerCast(Cast); |
2968 // Then store XMM w/ a movq. | 2960 // Then store XMM w/ a movq. |
2969 typename Traits::X86OperandMem *Addr = | 2961 typename Traits::X86OperandMem *Addr = |
2970 formMemoryOperand(Ptr, IceType_f64); | 2962 formMemoryOperand(Ptr, IceType_f64); |
2971 _storeq(T, Addr); | 2963 _storeq(T, Addr); |
2972 _mfence(); | 2964 _mfence(); |
2973 return; | 2965 return; |
2974 } | 2966 } |
2975 InstStore *Store = InstStore::create(Func, Value, Ptr); | 2967 InstStore *Store = InstStore::create(Func, Value, Ptr); |
2976 lowerStore(Store); | 2968 lowerStore(Store); |
2977 _mfence(); | 2969 _mfence(); |
2978 return; | 2970 return; |
2979 } | 2971 } |
2980 case Intrinsics::Bswap: { | 2972 case Intrinsics::Bswap: { |
2981 Variable *Dest = Instr->getDest(); | 2973 Variable *Dest = Instr->getDest(); |
2982 Operand *Val = Instr->getArg(0); | 2974 Operand *Val = Instr->getArg(0); |
2983 // In 32-bit mode, bswap only works on 32-bit arguments, and the | 2975 // In 32-bit mode, bswap only works on 32-bit arguments, and the argument |
2984 // argument must be a register. Use rotate left for 16-bit bswap. | 2976 // must be a register. Use rotate left for 16-bit bswap. |
2985 if (!Traits::Is64Bit && Val->getType() == IceType_i64) { | 2977 if (!Traits::Is64Bit && Val->getType() == IceType_i64) { |
2986 Val = legalizeUndef(Val); | 2978 Val = legalizeUndef(Val); |
2987 Variable *T_Lo = legalizeToReg(loOperand(Val)); | 2979 Variable *T_Lo = legalizeToReg(loOperand(Val)); |
2988 Variable *T_Hi = legalizeToReg(hiOperand(Val)); | 2980 Variable *T_Hi = legalizeToReg(hiOperand(Val)); |
2989 Variable *DestLo = llvm::cast<Variable>(loOperand(Dest)); | 2981 Variable *DestLo = llvm::cast<Variable>(loOperand(Dest)); |
2990 Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest)); | 2982 Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest)); |
2991 _bswap(T_Lo); | 2983 _bswap(T_Lo); |
2992 _bswap(T_Hi); | 2984 _bswap(T_Hi); |
2993 _mov(DestLo, T_Hi); | 2985 _mov(DestLo, T_Hi); |
2994 _mov(DestHi, T_Lo); | 2986 _mov(DestHi, T_Lo); |
(...skipping 68 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3063 // another 64-bit wide.) | 3055 // another 64-bit wide.) |
3064 Variable *T_1 = makeReg(IceType_i32); | 3056 Variable *T_1 = makeReg(IceType_i32); |
3065 _mov(T_1, T); | 3057 _mov(T_1, T); |
3066 Variable *T_2 = makeReg(IceType_i64); | 3058 Variable *T_2 = makeReg(IceType_i64); |
3067 _movzx(T_2, T_1); | 3059 _movzx(T_2, T_1); |
3068 _mov(Dest, T_2); | 3060 _mov(Dest, T_2); |
3069 } | 3061 } |
3070 return; | 3062 return; |
3071 } | 3063 } |
3072 case Intrinsics::Ctlz: { | 3064 case Intrinsics::Ctlz: { |
3073 // The "is zero undef" parameter is ignored and we always return | 3065 // The "is zero undef" parameter is ignored and we always return a |
3074 // a well-defined value. | 3066 // well-defined value. |
3075 Operand *Val = legalize(Instr->getArg(0)); | 3067 Operand *Val = legalize(Instr->getArg(0)); |
3076 Operand *FirstVal; | 3068 Operand *FirstVal; |
3077 Operand *SecondVal = nullptr; | 3069 Operand *SecondVal = nullptr; |
3078 if (!Traits::Is64Bit && Val->getType() == IceType_i64) { | 3070 if (!Traits::Is64Bit && Val->getType() == IceType_i64) { |
3079 FirstVal = loOperand(Val); | 3071 FirstVal = loOperand(Val); |
3080 SecondVal = hiOperand(Val); | 3072 SecondVal = hiOperand(Val); |
3081 } else { | 3073 } else { |
3082 FirstVal = Val; | 3074 FirstVal = Val; |
3083 } | 3075 } |
3084 const bool IsCttz = false; | 3076 const bool IsCttz = false; |
3085 lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal, | 3077 lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal, |
3086 SecondVal); | 3078 SecondVal); |
3087 return; | 3079 return; |
3088 } | 3080 } |
3089 case Intrinsics::Cttz: { | 3081 case Intrinsics::Cttz: { |
3090 // The "is zero undef" parameter is ignored and we always return | 3082 // The "is zero undef" parameter is ignored and we always return a |
3091 // a well-defined value. | 3083 // well-defined value. |
3092 Operand *Val = legalize(Instr->getArg(0)); | 3084 Operand *Val = legalize(Instr->getArg(0)); |
3093 Operand *FirstVal; | 3085 Operand *FirstVal; |
3094 Operand *SecondVal = nullptr; | 3086 Operand *SecondVal = nullptr; |
3095 if (!Traits::Is64Bit && Val->getType() == IceType_i64) { | 3087 if (!Traits::Is64Bit && Val->getType() == IceType_i64) { |
3096 FirstVal = hiOperand(Val); | 3088 FirstVal = hiOperand(Val); |
3097 SecondVal = loOperand(Val); | 3089 SecondVal = loOperand(Val); |
3098 } else { | 3090 } else { |
3099 FirstVal = Val; | 3091 FirstVal = Val; |
3100 } | 3092 } |
3101 const bool IsCttz = true; | 3093 const bool IsCttz = true; |
3102 lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal, | 3094 lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal, |
3103 SecondVal); | 3095 SecondVal); |
3104 return; | 3096 return; |
3105 } | 3097 } |
3106 case Intrinsics::Fabs: { | 3098 case Intrinsics::Fabs: { |
3107 Operand *Src = legalize(Instr->getArg(0)); | 3099 Operand *Src = legalize(Instr->getArg(0)); |
3108 Type Ty = Src->getType(); | 3100 Type Ty = Src->getType(); |
3109 Variable *Dest = Instr->getDest(); | 3101 Variable *Dest = Instr->getDest(); |
3110 Variable *T = makeVectorOfFabsMask(Ty); | 3102 Variable *T = makeVectorOfFabsMask(Ty); |
3111 // The pand instruction operates on an m128 memory operand, so if | 3103 // The pand instruction operates on an m128 memory operand, so if Src is an |
3112 // Src is an f32 or f64, we need to make sure it's in a register. | 3104 // f32 or f64, we need to make sure it's in a register. |
3113 if (isVectorType(Ty)) { | 3105 if (isVectorType(Ty)) { |
3114 if (llvm::isa<typename Traits::X86OperandMem>(Src)) | 3106 if (llvm::isa<typename Traits::X86OperandMem>(Src)) |
3115 Src = legalizeToReg(Src); | 3107 Src = legalizeToReg(Src); |
3116 } else { | 3108 } else { |
3117 Src = legalizeToReg(Src); | 3109 Src = legalizeToReg(Src); |
3118 } | 3110 } |
3119 _pand(T, Src); | 3111 _pand(T, Src); |
3120 if (isVectorType(Ty)) | 3112 if (isVectorType(Ty)) |
3121 _movp(Dest, T); | 3113 _movp(Dest, T); |
3122 else | 3114 else |
(...skipping 564 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3687 | 3679 |
3688 Variable *SrcBase = legalizeToReg(Src); | 3680 Variable *SrcBase = legalizeToReg(Src); |
3689 Variable *DestBase = legalizeToReg(Dest); | 3681 Variable *DestBase = legalizeToReg(Dest); |
3690 | 3682 |
3691 std::tuple<Type, Constant *, Variable *> | 3683 std::tuple<Type, Constant *, Variable *> |
3692 Moves[Traits::MEMMOVE_UNROLL_LIMIT]; | 3684 Moves[Traits::MEMMOVE_UNROLL_LIMIT]; |
3693 Constant *Offset; | 3685 Constant *Offset; |
3694 Variable *Reg; | 3686 Variable *Reg; |
3695 | 3687 |
3696 // Copy the data into registers as the source and destination could overlap | 3688 // Copy the data into registers as the source and destination could overlap |
3697 // so make sure not to clobber the memory. This also means overlapping moves | 3689 // so make sure not to clobber the memory. This also means overlapping |
3698 // can be used as we are taking a safe snapshot of the memory. | 3690 // moves can be used as we are taking a safe snapshot of the memory. |
3699 Type Ty = largestTypeInSize(CountValue); | 3691 Type Ty = largestTypeInSize(CountValue); |
3700 uint32_t TyWidth = typeWidthInBytes(Ty); | 3692 uint32_t TyWidth = typeWidthInBytes(Ty); |
3701 | 3693 |
3702 uint32_t RemainingBytes = CountValue; | 3694 uint32_t RemainingBytes = CountValue; |
3703 int32_t OffsetAmt = (CountValue & ~(TyWidth - 1)) - TyWidth; | 3695 int32_t OffsetAmt = (CountValue & ~(TyWidth - 1)) - TyWidth; |
3704 size_t N = 0; | 3696 size_t N = 0; |
3705 while (RemainingBytes >= TyWidth) { | 3697 while (RemainingBytes >= TyWidth) { |
3706 assert(N <= Traits::MEMMOVE_UNROLL_LIMIT); | 3698 assert(N <= Traits::MEMMOVE_UNROLL_LIMIT); |
3707 Offset = Ctx->getConstantInt32(OffsetAmt); | 3699 Offset = Ctx->getConstantInt32(OffsetAmt); |
3708 Reg = makeReg(Ty); | 3700 Reg = makeReg(Ty); |
(...skipping 180 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3889 Str << ", Index="; | 3881 Str << ", Index="; |
3890 if (Index) | 3882 if (Index) |
3891 Index->dump(Func); | 3883 Index->dump(Func); |
3892 else | 3884 else |
3893 Str << "<null>"; | 3885 Str << "<null>"; |
3894 Str << ", Shift=" << Shift << ", Offset=" << Offset << "\n"; | 3886 Str << ", Shift=" << Shift << ", Offset=" << Offset << "\n"; |
3895 } | 3887 } |
3896 | 3888 |
3897 inline bool matchTransitiveAssign(const VariablesMetadata *VMetadata, | 3889 inline bool matchTransitiveAssign(const VariablesMetadata *VMetadata, |
3898 Variable *&Var, const Inst *&Reason) { | 3890 Variable *&Var, const Inst *&Reason) { |
3899 // Var originates from Var=SrcVar ==> | 3891 // Var originates from Var=SrcVar ==> set Var:=SrcVar |
3900 // set Var:=SrcVar | |
3901 if (Var == nullptr) | 3892 if (Var == nullptr) |
3902 return false; | 3893 return false; |
3903 if (const Inst *VarAssign = VMetadata->getSingleDefinition(Var)) { | 3894 if (const Inst *VarAssign = VMetadata->getSingleDefinition(Var)) { |
3904 assert(!VMetadata->isMultiDef(Var)); | 3895 assert(!VMetadata->isMultiDef(Var)); |
3905 if (llvm::isa<InstAssign>(VarAssign)) { | 3896 if (llvm::isa<InstAssign>(VarAssign)) { |
3906 Operand *SrcOp = VarAssign->getSrc(0); | 3897 Operand *SrcOp = VarAssign->getSrc(0); |
3907 assert(SrcOp); | 3898 assert(SrcOp); |
3908 if (Variable *SrcVar = llvm::dyn_cast<Variable>(SrcOp)) { | 3899 if (Variable *SrcVar = llvm::dyn_cast<Variable>(SrcOp)) { |
3909 if (!VMetadata->isMultiDef(SrcVar) && | 3900 if (!VMetadata->isMultiDef(SrcVar) && |
3910 // TODO: ensure SrcVar stays single-BB | 3901 // TODO: ensure SrcVar stays single-BB |
(...skipping 141 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4052 Func->resetCurrentNode(); | 4043 Func->resetCurrentNode(); |
4053 if (Func->isVerbose(IceV_AddrOpt)) { | 4044 if (Func->isVerbose(IceV_AddrOpt)) { |
4054 OstreamLocker L(Func->getContext()); | 4045 OstreamLocker L(Func->getContext()); |
4055 Ostream &Str = Func->getContext()->getStrDump(); | 4046 Ostream &Str = Func->getContext()->getStrDump(); |
4056 Str << "\nStarting computeAddressOpt for instruction:\n "; | 4047 Str << "\nStarting computeAddressOpt for instruction:\n "; |
4057 Instr->dumpDecorated(Func); | 4048 Instr->dumpDecorated(Func); |
4058 } | 4049 } |
4059 (void)Offset; // TODO: pattern-match for non-zero offsets. | 4050 (void)Offset; // TODO: pattern-match for non-zero offsets. |
4060 if (Base == nullptr) | 4051 if (Base == nullptr) |
4061 return; | 4052 return; |
4062 // If the Base has more than one use or is live across multiple | 4053 // If the Base has more than one use or is live across multiple blocks, then |
4063 // blocks, then don't go further. Alternatively (?), never consider | 4054 // don't go further. Alternatively (?), never consider a transformation that |
4064 // a transformation that would change a variable that is currently | 4055 // would change a variable that is currently *not* live across basic block |
4065 // *not* live across basic block boundaries into one that *is*. | 4056 // boundaries into one that *is*. |
4066 if (Func->getVMetadata()->isMultiBlock(Base) /* || Base->getUseCount() > 1*/) | 4057 if (Func->getVMetadata()->isMultiBlock(Base) /* || Base->getUseCount() > 1*/) |
4067 return; | 4058 return; |
4068 | 4059 |
4069 const bool MockBounds = Func->getContext()->getFlags().getMockBoundsCheck(); | 4060 const bool MockBounds = Func->getContext()->getFlags().getMockBoundsCheck(); |
4070 const VariablesMetadata *VMetadata = Func->getVMetadata(); | 4061 const VariablesMetadata *VMetadata = Func->getVMetadata(); |
4071 bool Continue = true; | 4062 bool Continue = true; |
4072 while (Continue) { | 4063 while (Continue) { |
4073 const Inst *Reason = nullptr; | 4064 const Inst *Reason = nullptr; |
4074 if (matchTransitiveAssign(VMetadata, Base, Reason) || | 4065 if (matchTransitiveAssign(VMetadata, Base, Reason) || |
4075 matchTransitiveAssign(VMetadata, Index, Reason) || | 4066 matchTransitiveAssign(VMetadata, Index, Reason) || |
(...skipping 149 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4225 Operand *SrcT = Inst->getTrueOperand(); | 4216 Operand *SrcT = Inst->getTrueOperand(); |
4226 Operand *SrcF = Inst->getFalseOperand(); | 4217 Operand *SrcF = Inst->getFalseOperand(); |
4227 Operand *Condition = Inst->getCondition(); | 4218 Operand *Condition = Inst->getCondition(); |
4228 | 4219 |
4229 if (isVectorType(DestTy)) { | 4220 if (isVectorType(DestTy)) { |
4230 Type SrcTy = SrcT->getType(); | 4221 Type SrcTy = SrcT->getType(); |
4231 Variable *T = makeReg(SrcTy); | 4222 Variable *T = makeReg(SrcTy); |
4232 Operand *SrcTRM = legalize(SrcT, Legal_Reg | Legal_Mem); | 4223 Operand *SrcTRM = legalize(SrcT, Legal_Reg | Legal_Mem); |
4233 Operand *SrcFRM = legalize(SrcF, Legal_Reg | Legal_Mem); | 4224 Operand *SrcFRM = legalize(SrcF, Legal_Reg | Legal_Mem); |
4234 if (InstructionSet >= Traits::SSE4_1) { | 4225 if (InstructionSet >= Traits::SSE4_1) { |
4235 // TODO(wala): If the condition operand is a constant, use blendps | 4226 // TODO(wala): If the condition operand is a constant, use blendps or |
4236 // or pblendw. | 4227 // pblendw. |
4237 // | 4228 // |
4238 // Use blendvps or pblendvb to implement select. | 4229 // Use blendvps or pblendvb to implement select. |
4239 if (SrcTy == IceType_v4i1 || SrcTy == IceType_v4i32 || | 4230 if (SrcTy == IceType_v4i1 || SrcTy == IceType_v4i32 || |
4240 SrcTy == IceType_v4f32) { | 4231 SrcTy == IceType_v4f32) { |
4241 Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem); | 4232 Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem); |
4242 Variable *xmm0 = makeReg(IceType_v4i32, Traits::RegisterSet::Reg_xmm0); | 4233 Variable *xmm0 = makeReg(IceType_v4i32, Traits::RegisterSet::Reg_xmm0); |
4243 _movp(xmm0, ConditionRM); | 4234 _movp(xmm0, ConditionRM); |
4244 _psll(xmm0, Ctx->getConstantInt8(31)); | 4235 _psll(xmm0, Ctx->getConstantInt8(31)); |
4245 _movp(T, SrcFRM); | 4236 _movp(T, SrcFRM); |
4246 _blendvps(T, SrcTRM, xmm0); | 4237 _blendvps(T, SrcTRM, xmm0); |
(...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4303 } | 4294 } |
4304 if (CmpOpnd0 == nullptr) { | 4295 if (CmpOpnd0 == nullptr) { |
4305 CmpOpnd0 = legalize(Condition, Legal_Reg | Legal_Mem); | 4296 CmpOpnd0 = legalize(Condition, Legal_Reg | Legal_Mem); |
4306 CmpOpnd1 = Ctx->getConstantZero(IceType_i32); | 4297 CmpOpnd1 = Ctx->getConstantZero(IceType_i32); |
4307 } | 4298 } |
4308 assert(CmpOpnd0); | 4299 assert(CmpOpnd0); |
4309 assert(CmpOpnd1); | 4300 assert(CmpOpnd1); |
4310 | 4301 |
4311 _cmp(CmpOpnd0, CmpOpnd1); | 4302 _cmp(CmpOpnd0, CmpOpnd1); |
4312 if (typeWidthInBytes(DestTy) == 1 || isFloatingType(DestTy)) { | 4303 if (typeWidthInBytes(DestTy) == 1 || isFloatingType(DestTy)) { |
4313 // The cmov instruction doesn't allow 8-bit or FP operands, so | 4304 // The cmov instruction doesn't allow 8-bit or FP operands, so we need |
4314 // we need explicit control flow. | 4305 // explicit control flow. |
4315 // d=cmp e,f; a=d?b:c ==> cmp e,f; a=b; jne L1; a=c; L1: | 4306 // d=cmp e,f; a=d?b:c ==> cmp e,f; a=b; jne L1; a=c; L1: |
4316 typename Traits::Insts::Label *Label = | 4307 typename Traits::Insts::Label *Label = |
4317 Traits::Insts::Label::create(Func, this); | 4308 Traits::Insts::Label::create(Func, this); |
4318 SrcT = legalize(SrcT, Legal_Reg | Legal_Imm); | 4309 SrcT = legalize(SrcT, Legal_Reg | Legal_Imm); |
4319 _mov(Dest, SrcT); | 4310 _mov(Dest, SrcT); |
4320 _br(Cond, Label); | 4311 _br(Cond, Label); |
4321 SrcF = legalize(SrcF, Legal_Reg | Legal_Imm); | 4312 SrcF = legalize(SrcF, Legal_Reg | Legal_Imm); |
4322 _mov_nonkillable(Dest, SrcF); | 4313 _mov_nonkillable(Dest, SrcF); |
4323 Context.insert(Label); | 4314 Context.insert(Label); |
4324 return; | 4315 return; |
4325 } | 4316 } |
4326 // mov t, SrcF; cmov_cond t, SrcT; mov dest, t | 4317 // mov t, SrcF; cmov_cond t, SrcT; mov dest, t |
4327 // But if SrcT is immediate, we might be able to do better, as | 4318 // But if SrcT is immediate, we might be able to do better, as the cmov |
4328 // the cmov instruction doesn't allow an immediate operand: | 4319 // instruction doesn't allow an immediate operand: |
4329 // mov t, SrcT; cmov_!cond t, SrcF; mov dest, t | 4320 // mov t, SrcT; cmov_!cond t, SrcF; mov dest, t |
4330 if (llvm::isa<Constant>(SrcT) && !llvm::isa<Constant>(SrcF)) { | 4321 if (llvm::isa<Constant>(SrcT) && !llvm::isa<Constant>(SrcF)) { |
4331 std::swap(SrcT, SrcF); | 4322 std::swap(SrcT, SrcF); |
4332 Cond = InstX86Base<Machine>::getOppositeCondition(Cond); | 4323 Cond = InstX86Base<Machine>::getOppositeCondition(Cond); |
4333 } | 4324 } |
4334 if (!Traits::Is64Bit && DestTy == IceType_i64) { | 4325 if (!Traits::Is64Bit && DestTy == IceType_i64) { |
4335 SrcT = legalizeUndef(SrcT); | 4326 SrcT = legalizeUndef(SrcT); |
4336 SrcF = legalizeUndef(SrcF); | 4327 SrcF = legalizeUndef(SrcF); |
4337 // Set the low portion. | 4328 // Set the low portion. |
4338 Variable *DestLo = llvm::cast<Variable>(loOperand(Dest)); | 4329 Variable *DestLo = llvm::cast<Variable>(loOperand(Dest)); |
(...skipping 340 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4679 | 4670 |
4680 lowerAssign(InstAssign::create(Func, Dest, T)); | 4671 lowerAssign(InstAssign::create(Func, Dest, T)); |
4681 } | 4672 } |
4682 | 4673 |
4683 /// The following pattern occurs often in lowered C and C++ code: | 4674 /// The following pattern occurs often in lowered C and C++ code: |
4684 /// | 4675 /// |
4685 /// %cmp = fcmp/icmp pred <n x ty> %src0, %src1 | 4676 /// %cmp = fcmp/icmp pred <n x ty> %src0, %src1 |
4686 /// %cmp.ext = sext <n x i1> %cmp to <n x ty> | 4677 /// %cmp.ext = sext <n x i1> %cmp to <n x ty> |
4687 /// | 4678 /// |
4688 /// We can eliminate the sext operation by copying the result of pcmpeqd, | 4679 /// We can eliminate the sext operation by copying the result of pcmpeqd, |
4689 /// pcmpgtd, or cmpps (which produce sign extended results) to the result | 4680 /// pcmpgtd, or cmpps (which produce sign extended results) to the result of the |
4690 /// of the sext operation. | 4681 /// sext operation. |
4691 template <class Machine> | 4682 template <class Machine> |
4692 void TargetX86Base<Machine>::eliminateNextVectorSextInstruction( | 4683 void TargetX86Base<Machine>::eliminateNextVectorSextInstruction( |
4693 Variable *SignExtendedResult) { | 4684 Variable *SignExtendedResult) { |
4694 if (InstCast *NextCast = | 4685 if (InstCast *NextCast = |
4695 llvm::dyn_cast_or_null<InstCast>(Context.getNextInst())) { | 4686 llvm::dyn_cast_or_null<InstCast>(Context.getNextInst())) { |
4696 if (NextCast->getCastKind() == InstCast::Sext && | 4687 if (NextCast->getCastKind() == InstCast::Sext && |
4697 NextCast->getSrc(0) == SignExtendedResult) { | 4688 NextCast->getSrc(0) == SignExtendedResult) { |
4698 NextCast->setDeleted(); | 4689 NextCast->setDeleted(); |
4699 _movp(NextCast->getDest(), legalizeToReg(SignExtendedResult)); | 4690 _movp(NextCast->getDest(), legalizeToReg(SignExtendedResult)); |
4700 // Skip over the instruction. | 4691 // Skip over the instruction. |
4701 Context.advanceNext(); | 4692 Context.advanceNext(); |
4702 } | 4693 } |
4703 } | 4694 } |
4704 } | 4695 } |
4705 | 4696 |
4706 template <class Machine> | 4697 template <class Machine> |
4707 void TargetX86Base<Machine>::lowerUnreachable( | 4698 void TargetX86Base<Machine>::lowerUnreachable( |
4708 const InstUnreachable * /*Inst*/) { | 4699 const InstUnreachable * /*Inst*/) { |
4709 _ud2(); | 4700 _ud2(); |
4710 } | 4701 } |
4711 | 4702 |
4712 template <class Machine> | 4703 template <class Machine> |
4713 void TargetX86Base<Machine>::lowerRMW( | 4704 void TargetX86Base<Machine>::lowerRMW( |
4714 const typename Traits::Insts::FakeRMW *RMW) { | 4705 const typename Traits::Insts::FakeRMW *RMW) { |
4715 // If the beacon variable's live range does not end in this | 4706 // If the beacon variable's live range does not end in this instruction, then |
4716 // instruction, then it must end in the modified Store instruction | 4707 // it must end in the modified Store instruction that follows. This means |
4717 // that follows. This means that the original Store instruction is | 4708 // that the original Store instruction is still there, either because the |
4718 // still there, either because the value being stored is used beyond | 4709 // value being stored is used beyond the Store instruction, or because dead |
4719 // the Store instruction, or because dead code elimination did not | 4710 // code elimination did not happen. In either case, we cancel RMW lowering |
4720 // happen. In either case, we cancel RMW lowering (and the caller | 4711 // (and the caller deletes the RMW instruction). |
4721 // deletes the RMW instruction). | |
4722 if (!RMW->isLastUse(RMW->getBeacon())) | 4712 if (!RMW->isLastUse(RMW->getBeacon())) |
4723 return; | 4713 return; |
4724 Operand *Src = RMW->getData(); | 4714 Operand *Src = RMW->getData(); |
4725 Type Ty = Src->getType(); | 4715 Type Ty = Src->getType(); |
4726 typename Traits::X86OperandMem *Addr = formMemoryOperand(RMW->getAddr(), Ty); | 4716 typename Traits::X86OperandMem *Addr = formMemoryOperand(RMW->getAddr(), Ty); |
4727 doMockBoundsCheck(Addr); | 4717 doMockBoundsCheck(Addr); |
4728 if (!Traits::Is64Bit && Ty == IceType_i64) { | 4718 if (!Traits::Is64Bit && Ty == IceType_i64) { |
4729 Src = legalizeUndef(Src); | 4719 Src = legalizeUndef(Src); |
4730 Operand *SrcLo = legalize(loOperand(Src), Legal_Reg | Legal_Imm); | 4720 Operand *SrcLo = legalize(loOperand(Src), Legal_Reg | Legal_Imm); |
4731 Operand *SrcHi = legalize(hiOperand(Src), Legal_Reg | Legal_Imm); | 4721 Operand *SrcHi = legalize(hiOperand(Src), Legal_Reg | Legal_Imm); |
(...skipping 61 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4793 template <class Machine> | 4783 template <class Machine> |
4794 void TargetX86Base<Machine>::lowerOther(const Inst *Instr) { | 4784 void TargetX86Base<Machine>::lowerOther(const Inst *Instr) { |
4795 if (const auto *RMW = | 4785 if (const auto *RMW = |
4796 llvm::dyn_cast<typename Traits::Insts::FakeRMW>(Instr)) { | 4786 llvm::dyn_cast<typename Traits::Insts::FakeRMW>(Instr)) { |
4797 lowerRMW(RMW); | 4787 lowerRMW(RMW); |
4798 } else { | 4788 } else { |
4799 TargetLowering::lowerOther(Instr); | 4789 TargetLowering::lowerOther(Instr); |
4800 } | 4790 } |
4801 } | 4791 } |
4802 | 4792 |
4803 /// Turn an i64 Phi instruction into a pair of i32 Phi instructions, to | 4793 /// Turn an i64 Phi instruction into a pair of i32 Phi instructions, to preserve |
4804 /// preserve integrity of liveness analysis. Undef values are also | 4794 /// integrity of liveness analysis. Undef values are also turned into zeroes, |
4805 /// turned into zeroes, since loOperand() and hiOperand() don't expect | 4795 /// since loOperand() and hiOperand() don't expect Undef input. |
4806 /// Undef input. | |
4807 template <class Machine> void TargetX86Base<Machine>::prelowerPhis() { | 4796 template <class Machine> void TargetX86Base<Machine>::prelowerPhis() { |
4808 if (Traits::Is64Bit) { | 4797 if (Traits::Is64Bit) { |
4809 // On x86-64 we don't need to prelower phis -- the architecture can handle | 4798 // On x86-64 we don't need to prelower phis -- the architecture can handle |
4810 // 64-bit integer natively. | 4799 // 64-bit integer natively. |
4811 return; | 4800 return; |
4812 } | 4801 } |
4813 | 4802 |
4814 // Pause constant blinding or pooling, blinding or pooling will be done | 4803 // Pause constant blinding or pooling, blinding or pooling will be done later |
4815 // later during phi lowering assignments | 4804 // during phi lowering assignments |
4816 BoolFlagSaver B(RandomizationPoolingPaused, true); | 4805 BoolFlagSaver B(RandomizationPoolingPaused, true); |
4817 PhiLowering::prelowerPhis32Bit<TargetX86Base<Machine>>( | 4806 PhiLowering::prelowerPhis32Bit<TargetX86Base<Machine>>( |
4818 this, Context.getNode(), Func); | 4807 this, Context.getNode(), Func); |
4819 } | 4808 } |
4820 | 4809 |
4821 // There is no support for loading or emitting vector constants, so the | 4810 // There is no support for loading or emitting vector constants, so the vector |
4822 // vector values returned from makeVectorOfZeros, makeVectorOfOnes, | 4811 // values returned from makeVectorOfZeros, makeVectorOfOnes, etc. are |
4823 // etc. are initialized with register operations. | 4812 // initialized with register operations. |
4824 // | 4813 // |
4825 // TODO(wala): Add limited support for vector constants so that | 4814 // TODO(wala): Add limited support for vector constants so that complex |
4826 // complex initialization in registers is unnecessary. | 4815 // initialization in registers is unnecessary. |
4827 | 4816 |
4828 template <class Machine> | 4817 template <class Machine> |
4829 Variable *TargetX86Base<Machine>::makeVectorOfZeros(Type Ty, int32_t RegNum) { | 4818 Variable *TargetX86Base<Machine>::makeVectorOfZeros(Type Ty, int32_t RegNum) { |
4830 Variable *Reg = makeReg(Ty, RegNum); | 4819 Variable *Reg = makeReg(Ty, RegNum); |
4831 // Insert a FakeDef, since otherwise the live range of Reg might | 4820 // Insert a FakeDef, since otherwise the live range of Reg might be |
4832 // be overestimated. | 4821 // overestimated. |
4833 Context.insert(InstFakeDef::create(Func, Reg)); | 4822 Context.insert(InstFakeDef::create(Func, Reg)); |
4834 _pxor(Reg, Reg); | 4823 _pxor(Reg, Reg); |
4835 return Reg; | 4824 return Reg; |
4836 } | 4825 } |
4837 | 4826 |
4838 template <class Machine> | 4827 template <class Machine> |
4839 Variable *TargetX86Base<Machine>::makeVectorOfMinusOnes(Type Ty, | 4828 Variable *TargetX86Base<Machine>::makeVectorOfMinusOnes(Type Ty, |
4840 int32_t RegNum) { | 4829 int32_t RegNum) { |
4841 Variable *MinusOnes = makeReg(Ty, RegNum); | 4830 Variable *MinusOnes = makeReg(Ty, RegNum); |
4842 // Insert a FakeDef so the live range of MinusOnes is not overestimated. | 4831 // Insert a FakeDef so the live range of MinusOnes is not overestimated. |
(...skipping 25 matching lines...) Expand all Loading... |
4868 // SSE has no left shift operation for vectors of 8 bit integers. | 4857 // SSE has no left shift operation for vectors of 8 bit integers. |
4869 const uint32_t HIGH_ORDER_BITS_MASK = 0x80808080; | 4858 const uint32_t HIGH_ORDER_BITS_MASK = 0x80808080; |
4870 Constant *ConstantMask = Ctx->getConstantInt32(HIGH_ORDER_BITS_MASK); | 4859 Constant *ConstantMask = Ctx->getConstantInt32(HIGH_ORDER_BITS_MASK); |
4871 Variable *Reg = makeReg(Ty, RegNum); | 4860 Variable *Reg = makeReg(Ty, RegNum); |
4872 _movd(Reg, legalize(ConstantMask, Legal_Reg | Legal_Mem)); | 4861 _movd(Reg, legalize(ConstantMask, Legal_Reg | Legal_Mem)); |
4873 _pshufd(Reg, Reg, Ctx->getConstantZero(IceType_i8)); | 4862 _pshufd(Reg, Reg, Ctx->getConstantZero(IceType_i8)); |
4874 return Reg; | 4863 return Reg; |
4875 } | 4864 } |
4876 } | 4865 } |
4877 | 4866 |
4878 /// Construct a mask in a register that can be and'ed with a | 4867 /// Construct a mask in a register that can be and'ed with a floating-point |
4879 /// floating-point value to mask off its sign bit. The value will be | 4868 /// value to mask off its sign bit. The value will be <4 x 0x7fffffff> for f32 |
4880 /// <4 x 0x7fffffff> for f32 and v4f32, and <2 x 0x7fffffffffffffff> | 4869 /// and v4f32, and <2 x 0x7fffffffffffffff> for f64. Construct it as vector of |
4881 /// for f64. Construct it as vector of ones logically right shifted | 4870 /// ones logically right shifted one bit. |
4882 /// one bit. TODO(stichnot): Fix the wala TODO above, to represent | 4871 // TODO(stichnot): Fix the wala TODO above, to represent vector
4883 /// vector constants in memory. | 4872 // constants in memory.
4884 template <class Machine> | 4873 template <class Machine> |
4885 Variable *TargetX86Base<Machine>::makeVectorOfFabsMask(Type Ty, | 4874 Variable *TargetX86Base<Machine>::makeVectorOfFabsMask(Type Ty, |
4886 int32_t RegNum) { | 4875 int32_t RegNum) { |
4887 Variable *Reg = makeVectorOfMinusOnes(Ty, RegNum); | 4876 Variable *Reg = makeVectorOfMinusOnes(Ty, RegNum); |
4888 _psrl(Reg, Ctx->getConstantInt8(1)); | 4877 _psrl(Reg, Ctx->getConstantInt8(1)); |
4889 return Reg; | 4878 return Reg; |
4890 } | 4879 } |
4891 | 4880 |
4892 template <class Machine> | 4881 template <class Machine> |
4893 typename TargetX86Base<Machine>::Traits::X86OperandMem * | 4882 typename TargetX86Base<Machine>::Traits::X86OperandMem * |
4894 TargetX86Base<Machine>::getMemoryOperandForStackSlot(Type Ty, Variable *Slot, | 4883 TargetX86Base<Machine>::getMemoryOperandForStackSlot(Type Ty, Variable *Slot, |
4895 uint32_t Offset) { | 4884 uint32_t Offset) { |
4896 // Ensure that Loc is a stack slot. | 4885 // Ensure that Loc is a stack slot. |
4897 assert(Slot->mustNotHaveReg()); | 4886 assert(Slot->mustNotHaveReg()); |
4898 assert(Slot->getRegNum() == Variable::NoRegister); | 4887 assert(Slot->getRegNum() == Variable::NoRegister); |
4899 // Compute the location of Loc in memory. | 4888 // Compute the location of Loc in memory. |
4900 // TODO(wala,stichnot): lea should not be required. The address of | 4889 // TODO(wala,stichnot): lea should not |
4901 // the stack slot is known at compile time (although not until after | 4890 // be required. The address of the stack slot is known at compile time |
4902 // addProlog()). | 4891 // (although not until after addProlog()). |
4903 const Type PointerType = IceType_i32; | 4892 const Type PointerType = IceType_i32; |
4904 Variable *Loc = makeReg(PointerType); | 4893 Variable *Loc = makeReg(PointerType); |
4905 _lea(Loc, Slot); | 4894 _lea(Loc, Slot); |
4906 Constant *ConstantOffset = Ctx->getConstantInt32(Offset); | 4895 Constant *ConstantOffset = Ctx->getConstantInt32(Offset); |
4907 return Traits::X86OperandMem::create(Func, Ty, Loc, ConstantOffset); | 4896 return Traits::X86OperandMem::create(Func, Ty, Loc, ConstantOffset); |
4908 } | 4897 } |
4909 | 4898 |
4910 /// Helper for legalize() to emit the right code to lower an operand to a | 4899 /// Helper for legalize() to emit the right code to lower an operand to a |
4911 /// register of the appropriate type. | 4900 /// register of the appropriate type. |
4912 template <class Machine> | 4901 template <class Machine> |
4913 Variable *TargetX86Base<Machine>::copyToReg(Operand *Src, int32_t RegNum) { | 4902 Variable *TargetX86Base<Machine>::copyToReg(Operand *Src, int32_t RegNum) { |
4914 Type Ty = Src->getType(); | 4903 Type Ty = Src->getType(); |
4915 Variable *Reg = makeReg(Ty, RegNum); | 4904 Variable *Reg = makeReg(Ty, RegNum); |
4916 if (isVectorType(Ty)) { | 4905 if (isVectorType(Ty)) { |
4917 _movp(Reg, Src); | 4906 _movp(Reg, Src); |
4918 } else { | 4907 } else { |
4919 _mov(Reg, Src); | 4908 _mov(Reg, Src); |
4920 } | 4909 } |
4921 return Reg; | 4910 return Reg; |
4922 } | 4911 } |
4923 | 4912 |
4924 template <class Machine> | 4913 template <class Machine> |
4925 Operand *TargetX86Base<Machine>::legalize(Operand *From, LegalMask Allowed, | 4914 Operand *TargetX86Base<Machine>::legalize(Operand *From, LegalMask Allowed, |
4926 int32_t RegNum) { | 4915 int32_t RegNum) { |
4927 Type Ty = From->getType(); | 4916 Type Ty = From->getType(); |
4928 // Assert that a physical register is allowed. To date, all calls | 4917 // Assert that a physical register is allowed. To date, all calls to |
4929 // to legalize() allow a physical register. If a physical register | 4918 // legalize() allow a physical register. If a physical register needs to be |
4930 // needs to be explicitly disallowed, then new code will need to be | 4919 // explicitly disallowed, then new code will need to be written to force a |
4931 // written to force a spill. | 4920 // spill. |
4932 assert(Allowed & Legal_Reg); | 4921 assert(Allowed & Legal_Reg); |
4933 // If we're asking for a specific physical register, make sure we're | 4922 // If we're asking for a specific physical register, make sure we're not |
4934 // not allowing any other operand kinds. (This could be future | 4923 // allowing any other operand kinds. (This could be future work, e.g. allow |
4935 // work, e.g. allow the shl shift amount to be either an immediate | 4924 // the shl shift amount to be either an immediate or in ecx.) |
4936 // or in ecx.) | |
4937 assert(RegNum == Variable::NoRegister || Allowed == Legal_Reg); | 4925 assert(RegNum == Variable::NoRegister || Allowed == Legal_Reg); |
4938 | 4926 |
4939 if (auto Mem = llvm::dyn_cast<typename Traits::X86OperandMem>(From)) { | 4927 if (auto Mem = llvm::dyn_cast<typename Traits::X86OperandMem>(From)) { |
4940 // Before doing anything with a Mem operand, we need to ensure | 4928 // Before doing anything with a Mem operand, we need to ensure that the |
4941 // that the Base and Index components are in physical registers. | 4929 // Base and Index components are in physical registers. |
4942 Variable *Base = Mem->getBase(); | 4930 Variable *Base = Mem->getBase(); |
4943 Variable *Index = Mem->getIndex(); | 4931 Variable *Index = Mem->getIndex(); |
4944 Variable *RegBase = nullptr; | 4932 Variable *RegBase = nullptr; |
4945 Variable *RegIndex = nullptr; | 4933 Variable *RegIndex = nullptr; |
4946 if (Base) { | 4934 if (Base) { |
4947 RegBase = legalizeToReg(Base); | 4935 RegBase = legalizeToReg(Base); |
4948 } | 4936 } |
4949 if (Index) { | 4937 if (Index) { |
4950 RegIndex = legalizeToReg(Index); | 4938 RegIndex = legalizeToReg(Index); |
4951 } | 4939 } |
(...skipping 24 matching lines...) Expand all Loading... |
4976 // If the operand is a 64 bit constant integer we need to legalize it to a | 4964 // If the operand is a 64 bit constant integer we need to legalize it to a |
4977 // register in x86-64. | 4965 // register in x86-64. |
4978 if (Traits::Is64Bit) { | 4966 if (Traits::Is64Bit) { |
4979 if (llvm::isa<ConstantInteger64>(Const)) { | 4967 if (llvm::isa<ConstantInteger64>(Const)) { |
4980 Variable *V = copyToReg(Const, RegNum); | 4968 Variable *V = copyToReg(Const, RegNum); |
4981 V->setMustHaveReg(); | 4969 V->setMustHaveReg(); |
4982 return V; | 4970 return V; |
4983 } | 4971 } |
4984 } | 4972 } |
4985 | 4973 |
4986 // If the operand is an 32 bit constant integer, we should check | 4974 // If the operand is a 32 bit constant integer, we should check whether we
4987 // whether we need to randomize it or pool it. | 4975 // need to randomize it or pool it. |
4988 if (ConstantInteger32 *C = llvm::dyn_cast<ConstantInteger32>(Const)) { | 4976 if (ConstantInteger32 *C = llvm::dyn_cast<ConstantInteger32>(Const)) { |
4989 Operand *NewConst = randomizeOrPoolImmediate(C, RegNum); | 4977 Operand *NewConst = randomizeOrPoolImmediate(C, RegNum); |
4990 if (NewConst != Const) { | 4978 if (NewConst != Const) { |
4991 return NewConst; | 4979 return NewConst; |
4992 } | 4980 } |
4993 } | 4981 } |
4994 | 4982 |
4995 // Convert a scalar floating point constant into an explicit | 4983 // Convert a scalar floating point constant into an explicit memory |
4996 // memory operand. | 4984 // operand. |
4997 if (isScalarFloatingType(Ty)) { | 4985 if (isScalarFloatingType(Ty)) { |
4998 Variable *Base = nullptr; | 4986 Variable *Base = nullptr; |
4999 std::string Buffer; | 4987 std::string Buffer; |
5000 llvm::raw_string_ostream StrBuf(Buffer); | 4988 llvm::raw_string_ostream StrBuf(Buffer); |
5001 llvm::cast<Constant>(From)->emitPoolLabel(StrBuf); | 4989 llvm::cast<Constant>(From)->emitPoolLabel(StrBuf); |
5002 llvm::cast<Constant>(From)->setShouldBePooled(true); | 4990 llvm::cast<Constant>(From)->setShouldBePooled(true); |
5003 Constant *Offset = Ctx->getConstantSym(0, StrBuf.str(), true); | 4991 Constant *Offset = Ctx->getConstantSym(0, StrBuf.str(), true); |
5004 From = Traits::X86OperandMem::create(Func, Ty, Base, Offset); | 4992 From = Traits::X86OperandMem::create(Func, Ty, Base, Offset); |
5005 } | 4993 } |
5006 bool NeedsReg = false; | 4994 bool NeedsReg = false; |
5007 if (!(Allowed & Legal_Imm) && !isScalarFloatingType(Ty)) | 4995 if (!(Allowed & Legal_Imm) && !isScalarFloatingType(Ty)) |
5008 // Immediate specifically not allowed | 4996 // Immediate specifically not allowed |
5009 NeedsReg = true; | 4997 NeedsReg = true; |
5010 if (!(Allowed & Legal_Mem) && isScalarFloatingType(Ty)) | 4998 if (!(Allowed & Legal_Mem) && isScalarFloatingType(Ty)) |
5011 // On x86, FP constants are lowered to mem operands. | 4999 // On x86, FP constants are lowered to mem operands. |
5012 NeedsReg = true; | 5000 NeedsReg = true; |
5013 if (NeedsReg) { | 5001 if (NeedsReg) { |
5014 From = copyToReg(From, RegNum); | 5002 From = copyToReg(From, RegNum); |
5015 } | 5003 } |
5016 return From; | 5004 return From; |
5017 } | 5005 } |
5018 if (auto Var = llvm::dyn_cast<Variable>(From)) { | 5006 if (auto Var = llvm::dyn_cast<Variable>(From)) { |
5019 // Check if the variable is guaranteed a physical register. This | 5007 // Check if the variable is guaranteed a physical register. This can happen |
5020 // can happen either when the variable is pre-colored or when it is | 5008 // either when the variable is pre-colored or when it is assigned infinite |
5021 // assigned infinite weight. | 5009 // weight. |
5022 bool MustHaveRegister = (Var->hasReg() || Var->mustHaveReg()); | 5010 bool MustHaveRegister = (Var->hasReg() || Var->mustHaveReg()); |
5023 // We need a new physical register for the operand if: | 5011 // We need a new physical register for the operand if: |
5024 // Mem is not allowed and Var isn't guaranteed a physical | 5012 // Mem is not allowed and Var isn't guaranteed a physical |
5025 // register, or | 5013 // register, or |
5026 // RegNum is required and Var->getRegNum() doesn't match. | 5014 // RegNum is required and Var->getRegNum() doesn't match. |
5027 if ((!(Allowed & Legal_Mem) && !MustHaveRegister) || | 5015 if ((!(Allowed & Legal_Mem) && !MustHaveRegister) || |
5028 (RegNum != Variable::NoRegister && RegNum != Var->getRegNum())) { | 5016 (RegNum != Variable::NoRegister && RegNum != Var->getRegNum())) { |
5029 From = copyToReg(From, RegNum); | 5017 From = copyToReg(From, RegNum); |
5030 } | 5018 } |
5031 return From; | 5019 return From; |
5032 } | 5020 } |
5033 llvm_unreachable("Unhandled operand kind in legalize()"); | 5021 llvm_unreachable("Unhandled operand kind in legalize()"); |
5034 return From; | 5022 return From; |
5035 } | 5023 } |
5036 | 5024 |
5037 /// Provide a trivial wrapper to legalize() for this common usage. | 5025 /// Provide a trivial wrapper to legalize() for this common usage. |
5038 template <class Machine> | 5026 template <class Machine> |
5039 Variable *TargetX86Base<Machine>::legalizeToReg(Operand *From, int32_t RegNum) { | 5027 Variable *TargetX86Base<Machine>::legalizeToReg(Operand *From, int32_t RegNum) { |
5040 return llvm::cast<Variable>(legalize(From, Legal_Reg, RegNum)); | 5028 return llvm::cast<Variable>(legalize(From, Legal_Reg, RegNum)); |
5041 } | 5029 } |
5042 | 5030 |
5043 /// Legalize undef values to concrete values. | 5031 /// Legalize undef values to concrete values. |
5044 template <class Machine> | 5032 template <class Machine> |
5045 Operand *TargetX86Base<Machine>::legalizeUndef(Operand *From, int32_t RegNum) { | 5033 Operand *TargetX86Base<Machine>::legalizeUndef(Operand *From, int32_t RegNum) { |
5046 Type Ty = From->getType(); | 5034 Type Ty = From->getType(); |
5047 if (llvm::isa<ConstantUndef>(From)) { | 5035 if (llvm::isa<ConstantUndef>(From)) { |
5048 // Lower undefs to zero. Another option is to lower undefs to an | 5036 // Lower undefs to zero. Another option is to lower undefs to an |
5049 // uninitialized register; however, using an uninitialized register | 5037 // uninitialized register; however, using an uninitialized register results |
5050 // results in less predictable code. | 5038 // in less predictable code. |
5051 // | 5039 // |
5052 // If in the future the implementation is changed to lower undef | 5040 // If in the future the implementation is changed to lower undef values to |
5053 // values to uninitialized registers, a FakeDef will be needed: | 5041 // uninitialized registers, a FakeDef will be needed: |
5054 // Context.insert(InstFakeDef::create(Func, Reg)); | 5042 // Context.insert(InstFakeDef::create(Func, Reg)); |
5055 // This is in order to ensure that the live range of Reg is not | 5043 // This is in order to ensure that the live range of Reg is not |
5056 // overestimated. If the constant being lowered is a 64 bit value, | 5044 // overestimated. If the constant being lowered is a 64 bit value, then |
5057 // then the result should be split and the lo and hi components will | 5045 // the result should be split and the lo and hi components will need to go |
5058 // need to go in uninitialized registers. | 5046 // in uninitialized registers. |
5059 if (isVectorType(Ty)) | 5047 if (isVectorType(Ty)) |
5060 return makeVectorOfZeros(Ty, RegNum); | 5048 return makeVectorOfZeros(Ty, RegNum); |
5061 return Ctx->getConstantZero(Ty); | 5049 return Ctx->getConstantZero(Ty); |
5062 } | 5050 } |
5063 return From; | 5051 return From; |
5064 } | 5052 } |
5065 | 5053 |
5066 /// For the cmp instruction, if Src1 is an immediate, or known to be a | 5054 /// For the cmp instruction, if Src1 is an immediate, or known to be a physical |
5067 /// physical register, we can allow Src0 to be a memory operand. | 5055 /// register, we can allow Src0 to be a memory operand. Otherwise, Src0 must be |
5068 /// Otherwise, Src0 must be copied into a physical register. | 5056 /// copied into a physical register. (Actually, either Src0 or Src1 can be |
5069 /// (Actually, either Src0 or Src1 can be chosen for the physical | 5057 /// chosen for the physical register, but unfortunately we have to commit to one |
5070 /// register, but unfortunately we have to commit to one or the other | 5058 /// or the other before register allocation.) |
5071 /// before register allocation.) | |
5072 template <class Machine> | 5059 template <class Machine> |
5073 Operand *TargetX86Base<Machine>::legalizeSrc0ForCmp(Operand *Src0, | 5060 Operand *TargetX86Base<Machine>::legalizeSrc0ForCmp(Operand *Src0, |
5074 Operand *Src1) { | 5061 Operand *Src1) { |
5075 bool IsSrc1ImmOrReg = false; | 5062 bool IsSrc1ImmOrReg = false; |
5076 if (llvm::isa<Constant>(Src1)) { | 5063 if (llvm::isa<Constant>(Src1)) { |
5077 IsSrc1ImmOrReg = true; | 5064 IsSrc1ImmOrReg = true; |
5078 } else if (auto *Var = llvm::dyn_cast<Variable>(Src1)) { | 5065 } else if (auto *Var = llvm::dyn_cast<Variable>(Src1)) { |
5079 if (Var->hasReg()) | 5066 if (Var->hasReg()) |
5080 IsSrc1ImmOrReg = true; | 5067 IsSrc1ImmOrReg = true; |
5081 } | 5068 } |
5082 return legalize(Src0, IsSrc1ImmOrReg ? (Legal_Reg | Legal_Mem) : Legal_Reg); | 5069 return legalize(Src0, IsSrc1ImmOrReg ? (Legal_Reg | Legal_Mem) : Legal_Reg); |
5083 } | 5070 } |
5084 | 5071 |
5085 template <class Machine> | 5072 template <class Machine> |
5086 typename TargetX86Base<Machine>::Traits::X86OperandMem * | 5073 typename TargetX86Base<Machine>::Traits::X86OperandMem * |
5087 TargetX86Base<Machine>::formMemoryOperand(Operand *Opnd, Type Ty, | 5074 TargetX86Base<Machine>::formMemoryOperand(Operand *Opnd, Type Ty, |
5088 bool DoLegalize) { | 5075 bool DoLegalize) { |
5089 auto *Mem = llvm::dyn_cast<typename Traits::X86OperandMem>(Opnd); | 5076 auto *Mem = llvm::dyn_cast<typename Traits::X86OperandMem>(Opnd); |
5090 // It may be the case that address mode optimization already creates an | 5077 // It may be the case that address mode optimization already creates an |
5091 // Traits::X86OperandMem, so in that case it wouldn't need another level of | 5078 // Traits::X86OperandMem, so in that case it wouldn't need another level of |
5092 // transformation. | 5079 // transformation. |
5093 if (!Mem) { | 5080 if (!Mem) { |
5094 Variable *Base = llvm::dyn_cast<Variable>(Opnd); | 5081 Variable *Base = llvm::dyn_cast<Variable>(Opnd); |
5095 Constant *Offset = llvm::dyn_cast<Constant>(Opnd); | 5082 Constant *Offset = llvm::dyn_cast<Constant>(Opnd); |
5096 assert(Base || Offset); | 5083 assert(Base || Offset); |
5097 if (Offset) { | 5084 if (Offset) { |
5098 // During memory operand building, we do not blind or pool | 5085 // During memory operand building, we do not blind or pool the constant |
5099 // the constant offset, we will work on the whole memory | 5086 // offset, we will work on the whole memory operand later as one entity,
5100 // operand later as one entity later, this save one instruction. | 5087 // which saves one instruction. By turning blinding and pooling off,
5101 // By turning blinding and pooling off, we guarantee | 5088 // we guarantee legalize(Offset) will return a Constant*. |
5102 // legalize(Offset) will return a Constant*. | |
5103 { | 5089 { |
5104 BoolFlagSaver B(RandomizationPoolingPaused, true); | 5090 BoolFlagSaver B(RandomizationPoolingPaused, true); |
5105 | 5091 |
5106 Offset = llvm::cast<Constant>(legalize(Offset)); | 5092 Offset = llvm::cast<Constant>(legalize(Offset)); |
5107 } | 5093 } |
5108 | 5094 |
5109 assert(llvm::isa<ConstantInteger32>(Offset) || | 5095 assert(llvm::isa<ConstantInteger32>(Offset) || |
5110 llvm::isa<ConstantRelocatable>(Offset)); | 5096 llvm::isa<ConstantRelocatable>(Offset)); |
5111 } | 5097 } |
5112 Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset); | 5098 Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset); |
5113 } | 5099 } |
5114 // Do legalization, which contains randomization/pooling | 5100 // Either do full legalization (which includes randomization/pooling)
5115 // or do randomization/pooling. | 5101 // or only randomize/pool the immediate.
5116 return llvm::cast<typename Traits::X86OperandMem>( | 5102 return llvm::cast<typename Traits::X86OperandMem>( |
5117 DoLegalize ? legalize(Mem) : randomizeOrPoolImmediate(Mem)); | 5103 DoLegalize ? legalize(Mem) : randomizeOrPoolImmediate(Mem)); |
5118 } | 5104 } |
5119 | 5105 |
5120 template <class Machine> | 5106 template <class Machine> |
5121 Variable *TargetX86Base<Machine>::makeReg(Type Type, int32_t RegNum) { | 5107 Variable *TargetX86Base<Machine>::makeReg(Type Type, int32_t RegNum) { |
5122 // There aren't any 64-bit integer registers for x86-32. | 5108 // There aren't any 64-bit integer registers for x86-32. |
5123 assert(Traits::Is64Bit || Type != IceType_i64); | 5109 assert(Traits::Is64Bit || Type != IceType_i64); |
5124 Variable *Reg = Func->makeVariable(Type); | 5110 Variable *Reg = Func->makeVariable(Type); |
5125 if (RegNum == Variable::NoRegister) | 5111 if (RegNum == Variable::NoRegister) |
(...skipping 102 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
5228 if (Ctx->getFlags().getRandomizeAndPoolImmediatesOption() == | 5214 if (Ctx->getFlags().getRandomizeAndPoolImmediatesOption() == |
5229 RPI_Randomize) { | 5215 RPI_Randomize) { |
5230 // blind the constant | 5216 // blind the constant |
5231 // FROM: | 5217 // FROM: |
5232 // imm | 5218 // imm |
5233 // TO: | 5219 // TO: |
5234 // insert: mov imm+cookie, Reg | 5220 // insert: mov imm+cookie, Reg |
5235 // insert: lea -cookie[Reg], Reg | 5221 // insert: lea -cookie[Reg], Reg |
5236 // => Reg | 5222 // => Reg |
5237 // If we have already assigned a phy register, we must come from | 5223 // If we have already assigned a phy register, we must come from |
5238 // andvancedPhiLowering()=>lowerAssign(). In this case we should reuse | 5224 // advancedPhiLowering()=>lowerAssign(). In this case we should reuse the |
5239 // the assigned register as this assignment is that start of its use-def | 5225 // assigned register as this assignment is the start of its use-def
5240 // chain. So we add RegNum argument here. | 5226 // chain. So we add RegNum argument here. Note we use 'lea' instruction |
5241 // Note we use 'lea' instruction instead of 'xor' to avoid affecting | 5227 // instead of 'xor' to avoid affecting the flags. |
5242 // the flags. | |
5243 Variable *Reg = makeReg(IceType_i32, RegNum); | 5228 Variable *Reg = makeReg(IceType_i32, RegNum); |
5244 ConstantInteger32 *Integer = llvm::cast<ConstantInteger32>(Immediate); | 5229 ConstantInteger32 *Integer = llvm::cast<ConstantInteger32>(Immediate); |
5245 uint32_t Value = Integer->getValue(); | 5230 uint32_t Value = Integer->getValue(); |
5246 uint32_t Cookie = Func->getConstantBlindingCookie(); | 5231 uint32_t Cookie = Func->getConstantBlindingCookie(); |
5247 _mov(Reg, Ctx->getConstantInt(IceType_i32, Cookie + Value)); | 5232 _mov(Reg, Ctx->getConstantInt(IceType_i32, Cookie + Value)); |
5248 Constant *Offset = Ctx->getConstantInt(IceType_i32, 0 - Cookie); | 5233 Constant *Offset = Ctx->getConstantInt(IceType_i32, 0 - Cookie); |
5249 _lea(Reg, Traits::X86OperandMem::create(Func, IceType_i32, Reg, Offset, | 5234 _lea(Reg, Traits::X86OperandMem::create(Func, IceType_i32, Reg, Offset, |
5250 nullptr, 0)); | 5235 nullptr, 0)); |
5251 // make sure liveness analysis won't kill this variable, otherwise a | 5236 // make sure liveness analysis won't kill this variable, otherwise a |
5252 // liveness assertion will be triggered. | 5237 // liveness assertion will be triggered. |
5253 _set_dest_nonkillable(); | 5238 _set_dest_nonkillable(); |
5254 if (Immediate->getType() != IceType_i32) { | 5239 if (Immediate->getType() != IceType_i32) { |
5255 Variable *TruncReg = makeReg(Immediate->getType(), RegNum); | 5240 Variable *TruncReg = makeReg(Immediate->getType(), RegNum); |
5256 _mov(TruncReg, Reg); | 5241 _mov(TruncReg, Reg); |
5257 return TruncReg; | 5242 return TruncReg; |
5258 } | 5243 } |
5259 return Reg; | 5244 return Reg; |
5260 } | 5245 } |
5261 if (Ctx->getFlags().getRandomizeAndPoolImmediatesOption() == RPI_Pool) { | 5246 if (Ctx->getFlags().getRandomizeAndPoolImmediatesOption() == RPI_Pool) { |
5262 // pool the constant | 5247 // pool the constant |
5263 // FROM: | 5248 // FROM: |
5264 // imm | 5249 // imm |
5265 // TO: | 5250 // TO: |
5266 // insert: mov $label, Reg | 5251 // insert: mov $label, Reg |
5267 // => Reg | 5252 // => Reg |
5268 assert(Ctx->getFlags().getRandomizeAndPoolImmediatesOption() == RPI_Pool); | 5253 assert(Ctx->getFlags().getRandomizeAndPoolImmediatesOption() == RPI_Pool); |
5269 Immediate->setShouldBePooled(true); | 5254 Immediate->setShouldBePooled(true); |
5270 // if we have already assigned a phy register, we must come from | 5255 // if we have already assigned a phy register, we must come from |
5271 // andvancedPhiLowering()=>lowerAssign(). In this case we should reuse | 5256 // advancedPhiLowering()=>lowerAssign(). In this case we should reuse the |
5272 // the assigned register as this assignment is that start of its use-def | 5257 // assigned register as this assignment is the start of its use-def
5273 // chain. So we add RegNum argument here. | 5258 // chain. So we add RegNum argument here. |
5274 Variable *Reg = makeReg(Immediate->getType(), RegNum); | 5259 Variable *Reg = makeReg(Immediate->getType(), RegNum); |
5275 IceString Label; | 5260 IceString Label; |
5276 llvm::raw_string_ostream Label_stream(Label); | 5261 llvm::raw_string_ostream Label_stream(Label); |
5277 Immediate->emitPoolLabel(Label_stream); | 5262 Immediate->emitPoolLabel(Label_stream); |
5278 const RelocOffsetT Offset = 0; | 5263 const RelocOffsetT Offset = 0; |
5279 const bool SuppressMangling = true; | 5264 const bool SuppressMangling = true; |
5280 Constant *Symbol = | 5265 Constant *Symbol = |
5281 Ctx->getConstantSym(Offset, Label_stream.str(), SuppressMangling); | 5266 Ctx->getConstantSym(Offset, Label_stream.str(), SuppressMangling); |
5282 typename Traits::X86OperandMem *MemOperand = | 5267 typename Traits::X86OperandMem *MemOperand = |
(...skipping 12 matching lines...) Expand all Loading... |
5295 typename TargetX86Base<Machine>::Traits::X86OperandMem * | 5280 typename TargetX86Base<Machine>::Traits::X86OperandMem * |
5296 TargetX86Base<Machine>::randomizeOrPoolImmediate( | 5281 TargetX86Base<Machine>::randomizeOrPoolImmediate( |
5297 typename Traits::X86OperandMem *MemOperand, int32_t RegNum) { | 5282 typename Traits::X86OperandMem *MemOperand, int32_t RegNum) { |
5298 assert(MemOperand); | 5283 assert(MemOperand); |
5299 if (Ctx->getFlags().getRandomizeAndPoolImmediatesOption() == RPI_None || | 5284 if (Ctx->getFlags().getRandomizeAndPoolImmediatesOption() == RPI_None || |
5300 RandomizationPoolingPaused == true) { | 5285 RandomizationPoolingPaused == true) { |
5301 // immediates randomization/pooling is turned off | 5286 // immediates randomization/pooling is turned off |
5302 return MemOperand; | 5287 return MemOperand; |
5303 } | 5288 } |
5304 | 5289 |
5305 // If this memory operand is already a randommized one, we do | 5290 // If this memory operand is already a randomized one, we do not randomize it |
5306 // not randomize it again. | 5291 // again. |
5307 if (MemOperand->getRandomized()) | 5292 if (MemOperand->getRandomized()) |
5308 return MemOperand; | 5293 return MemOperand; |
5309 | 5294 |
5310 if (Constant *C = llvm::dyn_cast_or_null<Constant>(MemOperand->getOffset())) { | 5295 if (Constant *C = llvm::dyn_cast_or_null<Constant>(MemOperand->getOffset())) { |
5311 if (C->shouldBeRandomizedOrPooled(Ctx)) { | 5296 if (C->shouldBeRandomizedOrPooled(Ctx)) { |
5312 // The offset of this mem operand should be blinded or pooled | 5297 // The offset of this mem operand should be blinded or pooled |
5313 Ctx->statsUpdateRPImms(); | 5298 Ctx->statsUpdateRPImms(); |
5314 if (Ctx->getFlags().getRandomizeAndPoolImmediatesOption() == | 5299 if (Ctx->getFlags().getRandomizeAndPoolImmediatesOption() == |
5315 RPI_Randomize) { | 5300 RPI_Randomize) { |
5316 // blind the constant offset | 5301 // blind the constant offset |
(...skipping 14 matching lines...) Expand all Loading... |
5331 typename Traits::X86OperandMem *TempMemOperand = | 5316 typename Traits::X86OperandMem *TempMemOperand = |
5332 Traits::X86OperandMem::create(Func, MemOperand->getType(), | 5317 Traits::X86OperandMem::create(Func, MemOperand->getType(), |
5333 MemOperand->getBase(), Mask1); | 5318 MemOperand->getBase(), Mask1); |
5334 // If we have already assigned a physical register, we must come from | 5319 // If we have already assigned a physical register, we must come from |
5335 // advancedPhiLowering()=>lowerAssign(). In this case we should reuse | 5320 // advancedPhiLowering()=>lowerAssign(). In this case we should reuse |
5336 // the assigned register as this assignment is that start of its | 5321 // the assigned register as this assignment is the start of its
5337 // use-def chain. So we add RegNum argument here. | 5322 // use-def chain. So we add RegNum argument here. |
5338 Variable *RegTemp = makeReg(MemOperand->getOffset()->getType(), RegNum); | 5323 Variable *RegTemp = makeReg(MemOperand->getOffset()->getType(), RegNum); |
5339 _lea(RegTemp, TempMemOperand); | 5324 _lea(RegTemp, TempMemOperand); |
5340 // As source operand doesn't use the dstreg, we don't need to add | 5325 // As source operand doesn't use the dstreg, we don't need to add |
5341 // _set_dest_nonkillable(). | 5326 // _set_dest_nonkillable(). But if we use the same Dest Reg, that is, |
5342 // But if we use the same Dest Reg, that is, with RegNum | 5327 // with RegNum assigned, we should add this _set_dest_nonkillable() |
5343 // assigned, we should add this _set_dest_nonkillable() | |
5344 if (RegNum != Variable::NoRegister) | 5328 if (RegNum != Variable::NoRegister) |
5345 _set_dest_nonkillable(); | 5329 _set_dest_nonkillable(); |
5346 | 5330 |
5347 typename Traits::X86OperandMem *NewMemOperand = | 5331 typename Traits::X86OperandMem *NewMemOperand = |
5348 Traits::X86OperandMem::create(Func, MemOperand->getType(), RegTemp, | 5332 Traits::X86OperandMem::create(Func, MemOperand->getType(), RegTemp, |
5349 Mask2, MemOperand->getIndex(), | 5333 Mask2, MemOperand->getIndex(), |
5350 MemOperand->getShift(), | 5334 MemOperand->getShift(), |
5351 MemOperand->getSegmentRegister()); | 5335 MemOperand->getSegmentRegister()); |
5352 | 5336 |
5353 // Label this memory operand as randomized, so we won't randomize it | 5337 // Label this memory operand as randomized, so we won't randomize it |
5354 // again in case we call legalize() multiple times on this memory | 5338 // again in case we call legalize() multiple times on this memory |
5355 // operand. | 5339 // operand. |
5356 NewMemOperand->setRandomized(true); | 5340 NewMemOperand->setRandomized(true); |
5357 return NewMemOperand; | 5341 return NewMemOperand; |
5358 } | 5342 } |
5359 if (Ctx->getFlags().getRandomizeAndPoolImmediatesOption() == RPI_Pool) { | 5343 if (Ctx->getFlags().getRandomizeAndPoolImmediatesOption() == RPI_Pool) { |
5360 // pool the constant offset | 5344 // pool the constant offset |
5361 // FROM: | 5345 // FROM: |
5362 // offset[base, index, shift] | 5346 // offset[base, index, shift] |
5363 // TO: | 5347 // TO: |
5364 // insert: mov $label, RegTemp | 5348 // insert: mov $label, RegTemp |
5365 // insert: lea [base, RegTemp], RegTemp | 5349 // insert: lea [base, RegTemp], RegTemp |
5366 // =>[RegTemp, index, shift] | 5350 // =>[RegTemp, index, shift] |
5367 assert(Ctx->getFlags().getRandomizeAndPoolImmediatesOption() == | 5351 assert(Ctx->getFlags().getRandomizeAndPoolImmediatesOption() == |
5368 RPI_Pool); | 5352 RPI_Pool); |
5369 // Memory operand should never exist as source operands in phi | 5353 // Memory operand should never exist as source operands in phi lowering |
5370 // lowering assignments, so there is no need to reuse any registers | 5354 // assignments, so there is no need to reuse any registers here. For |
5371 // here. For phi lowering, we should not ask for new physical | 5355 // phi lowering, we should not ask for new physical registers in |
5372 // registers in general. | 5356 // general. However, if we do meet Memory Operand during phi lowering, |
5373 // However, if we do meet Memory Operand during phi lowering, we | 5357 // we should not blind or pool the immediates for now. |
5374 // should not blind or pool the immediates for now. | |
5375 if (RegNum != Variable::NoRegister) | 5358 if (RegNum != Variable::NoRegister) |
5376 return MemOperand; | 5359 return MemOperand; |
5377 Variable *RegTemp = makeReg(IceType_i32); | 5360 Variable *RegTemp = makeReg(IceType_i32); |
5378 IceString Label; | 5361 IceString Label; |
5379 llvm::raw_string_ostream Label_stream(Label); | 5362 llvm::raw_string_ostream Label_stream(Label); |
5380 MemOperand->getOffset()->emitPoolLabel(Label_stream); | 5363 MemOperand->getOffset()->emitPoolLabel(Label_stream); |
5381 MemOperand->getOffset()->setShouldBePooled(true); | 5364 MemOperand->getOffset()->setShouldBePooled(true); |
5382 const RelocOffsetT SymOffset = 0; | 5365 const RelocOffsetT SymOffset = 0; |
5383 bool SuppressMangling = true; | 5366 bool SuppressMangling = true; |
5384 Constant *Symbol = Ctx->getConstantSym(SymOffset, Label_stream.str(), | 5367 Constant *Symbol = Ctx->getConstantSym(SymOffset, Label_stream.str(), |
(...skipping 25 matching lines...) Expand all Loading... |
5410 } | 5393 } |
5411 // the offset is not eligible for blinding or pooling, return the original | 5394 // the offset is not eligible for blinding or pooling, return the original |
5412 // mem operand | 5395 // mem operand |
5413 return MemOperand; | 5396 return MemOperand; |
5414 } | 5397 } |
5415 | 5398 |
5416 } // end of namespace X86Internal | 5399 } // end of namespace X86Internal |
5417 } // end of namespace Ice | 5400 } // end of namespace Ice |
5418 | 5401 |
5419 #endif // SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H | 5402 #endif // SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H |
OLD | NEW |