OLD | NEW |
1 //===- subzero/src/IceTargetLoweringX8632.cpp - x86-32 lowering -----------===// | 1 //===- subzero/src/IceTargetLoweringX8632.cpp - x86-32 lowering -----------===// |
2 // | 2 // |
3 // The Subzero Code Generator | 3 // The Subzero Code Generator |
4 // | 4 // |
5 // This file is distributed under the University of Illinois Open Source | 5 // This file is distributed under the University of Illinois Open Source |
6 // License. See LICENSE.TXT for details. | 6 // License. See LICENSE.TXT for details. |
7 // | 7 // |
8 //===----------------------------------------------------------------------===// | 8 //===----------------------------------------------------------------------===// |
9 // | 9 // |
10 // This file implements the TargetLoweringX8632 class, which | 10 // This file implements the TargetLoweringX8632 class, which |
11 // consists almost entirely of the lowering sequence for each | 11 // consists almost entirely of the lowering sequence for each |
12 // high-level instruction. It also implements | 12 // high-level instruction. It also implements |
13 // TargetX8632Fast::postLower() which does the simplest possible | 13 // TargetX8632Fast::postLower() which does the simplest possible |
14 // register allocation for the "fast" target. | 14 // register allocation for the "fast" target. |
15 // | 15 // |
16 //===----------------------------------------------------------------------===// | 16 //===----------------------------------------------------------------------===// |
17 | 17 |
18 #include "IceDefs.h" | 18 #include "IceDefs.h" |
19 #include "IceCfg.h" | 19 #include "IceCfg.h" |
20 #include "IceCfgNode.h" | 20 #include "IceCfgNode.h" |
21 #include "IceInstX8632.h" | 21 #include "IceInstX8632.h" |
22 #include "IceOperand.h" | 22 #include "IceOperand.h" |
23 #include "IceTargetLoweringX8632.def" | 23 #include "IceTargetLoweringX8632.def" |
24 #include "IceTargetLoweringX8632.h" | 24 #include "IceTargetLoweringX8632.h" |
25 #include "llvm/Support/CommandLine.h" | 25 #include "llvm/Support/CommandLine.h" |
26 | 26 |
| 27 #include <strings.h> |
| 28 |
27 namespace Ice { | 29 namespace Ice { |
28 | 30 |
29 namespace { | 31 namespace { |
30 | 32 |
31 // The following table summarizes the logic for lowering the fcmp | 33 // The following table summarizes the logic for lowering the fcmp |
32 // instruction. There is one table entry for each of the 16 conditions. | 34 // instruction. There is one table entry for each of the 16 conditions. |
33 // | 35 // |
34 // The first four columns describe the case when the operands are | 36 // The first four columns describe the case when the operands are |
35 // floating point scalar values. A comment in lowerFcmp() describes the | 37 // floating point scalar values. A comment in lowerFcmp() describes the |
36 // lowering template. In the most general case, there is a compare | 38 // lowering template. In the most general case, there is a compare |
(...skipping 84 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
121 } | 123 } |
122 | 124 |
123 // The maximum number of arguments to pass in XMM registers | 125 // The maximum number of arguments to pass in XMM registers |
124 const uint32_t X86_MAX_XMM_ARGS = 4; | 126 const uint32_t X86_MAX_XMM_ARGS = 4; |
125 // The number of bits in a byte | 127 // The number of bits in a byte |
126 const uint32_t X86_CHAR_BIT = 8; | 128 const uint32_t X86_CHAR_BIT = 8; |
127 // Stack alignment | 129 // Stack alignment |
128 const uint32_t X86_STACK_ALIGNMENT_BYTES = 16; | 130 const uint32_t X86_STACK_ALIGNMENT_BYTES = 16; |
129 // Size of the return address on the stack | 131 // Size of the return address on the stack |
130 const uint32_t X86_RET_IP_SIZE_BYTES = 4; | 132 const uint32_t X86_RET_IP_SIZE_BYTES = 4; |
| 133 // The base 2 logarithm of the width in bytes of the smallest stack slot |
| 134 const uint32_t X86_LOG2_OF_MIN_STACK_SLOT_SIZE = 2; |
| 135 // The base 2 logarithm of the width in bytes of the largest stack slot |
| 136 const uint32_t X86_LOG2_OF_MAX_STACK_SLOT_SIZE = 4; |
131 | 137 |
132 // Value is a size in bytes. Return Value adjusted to the next highest | 138 // Value and Alignment are in bytes. Return Value adjusted to the next |
133 // multiple of the stack alignment. | 139 // highest multiple of Alignment. |
| 140 uint32_t applyAlignment(uint32_t Value, uint32_t Alignment) { |
| 141 // Alignment must be a power of 2. |
| 142 assert((Alignment & (Alignment - 1)) == 0); |
| 143 return (Value + Alignment - 1) & -Alignment; |
| 144 } |
| 145 |
| 146 // Value is in bytes. Return Value adjusted to the next highest multiple |
| 147 // of the stack alignment. |
134 uint32_t applyStackAlignment(uint32_t Value) { | 148 uint32_t applyStackAlignment(uint32_t Value) { |
135 // power of 2 | 149 return applyAlignment(Value, X86_STACK_ALIGNMENT_BYTES); |
136 assert((X86_STACK_ALIGNMENT_BYTES & (X86_STACK_ALIGNMENT_BYTES - 1)) == 0); | |
137 return (Value + X86_STACK_ALIGNMENT_BYTES - 1) & -X86_STACK_ALIGNMENT_BYTES; | |
138 } | 150 } |
139 | 151 |
140 // Instruction set options | 152 // Instruction set options |
141 namespace cl = ::llvm::cl; | 153 namespace cl = ::llvm::cl; |
142 cl::opt<TargetX8632::X86InstructionSet> CLInstructionSet( | 154 cl::opt<TargetX8632::X86InstructionSet> CLInstructionSet( |
143 "mattr", cl::desc("X86 target attributes"), | 155 "mattr", cl::desc("X86 target attributes"), |
144 cl::init(TargetX8632::SSE2), | 156 cl::init(TargetX8632::SSE2), |
145 cl::values( | 157 cl::values( |
146 clEnumValN(TargetX8632::SSE2, "sse2", | 158 clEnumValN(TargetX8632::SSE2, "sse2", |
147 "Enable SSE2 instructions (default)"), | 159 "Enable SSE2 instructions (default)"), |
(...skipping 93 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
241 ICETYPE_TABLE; | 253 ICETYPE_TABLE; |
242 #undef X | 254 #undef X |
243 } | 255 } |
244 } | 256 } |
245 | 257 |
246 } // end of anonymous namespace | 258 } // end of anonymous namespace |
247 | 259 |
248 TargetX8632::TargetX8632(Cfg *Func) | 260 TargetX8632::TargetX8632(Cfg *Func) |
249 : TargetLowering(Func), InstructionSet(CLInstructionSet), | 261 : TargetLowering(Func), InstructionSet(CLInstructionSet), |
250 IsEbpBasedFrame(false), NeedsStackAlignment(false), FrameSizeLocals(0), | 262 IsEbpBasedFrame(false), NeedsStackAlignment(false), FrameSizeLocals(0), |
251 LocalsSizeBytes(0), NextLabelNumber(0), ComputedLiveRanges(false), | 263 SpillAreaSizeBytes(0), NextLabelNumber(0), ComputedLiveRanges(false), |
252 PhysicalRegisters(VarList(Reg_NUM)) { | 264 PhysicalRegisters(VarList(Reg_NUM)) { |
253 // TODO: Don't initialize IntegerRegisters and friends every time. | 265 // TODO: Don't initialize IntegerRegisters and friends every time. |
254 // Instead, initialize in some sort of static initializer for the | 266 // Instead, initialize in some sort of static initializer for the |
255 // class. | 267 // class. |
256 llvm::SmallBitVector IntegerRegisters(Reg_NUM); | 268 llvm::SmallBitVector IntegerRegisters(Reg_NUM); |
257 llvm::SmallBitVector IntegerRegistersI8(Reg_NUM); | 269 llvm::SmallBitVector IntegerRegistersI8(Reg_NUM); |
258 llvm::SmallBitVector FloatRegisters(Reg_NUM); | 270 llvm::SmallBitVector FloatRegisters(Reg_NUM); |
259 llvm::SmallBitVector VectorRegisters(Reg_NUM); | 271 llvm::SmallBitVector VectorRegisters(Reg_NUM); |
260 llvm::SmallBitVector InvalidRegisters(Reg_NUM); | 272 llvm::SmallBitVector InvalidRegisters(Reg_NUM); |
261 ScratchRegs.resize(Reg_NUM); | 273 ScratchRegs.resize(Reg_NUM); |
(...skipping 251 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
513 Variable *RegisterArg = Func->makeVariable(Ty, DefNode, Name); | 525 Variable *RegisterArg = Func->makeVariable(Ty, DefNode, Name); |
514 RegisterArg->setRegNum(RegNum); | 526 RegisterArg->setRegNum(RegNum); |
515 RegisterArg->setIsArg(Func); | 527 RegisterArg->setIsArg(Func); |
516 Arg->setIsArg(Func, false); | 528 Arg->setIsArg(Func, false); |
517 | 529 |
518 Args[I] = RegisterArg; | 530 Args[I] = RegisterArg; |
519 Context.insert(InstAssign::create(Func, Arg, RegisterArg)); | 531 Context.insert(InstAssign::create(Func, Arg, RegisterArg)); |
520 } | 532 } |
521 } | 533 } |
522 | 534 |
| 535 void TargetX8632::sortByAlignment(VarList &Dest, const VarList &Source) const { |
| 536 // Partition the variables into buckets by the base-2 log of their |
| 537 // width in bytes, then emit the buckets from largest to smallest. |
| 538 const SizeT NumBuckets = |
| 539 X86_LOG2_OF_MAX_STACK_SLOT_SIZE - X86_LOG2_OF_MIN_STACK_SLOT_SIZE + 1; |
| 540 VarList Buckets[NumBuckets]; |
| 541 |
| 542 for (VarList::const_iterator I = Source.begin(), E = Source.end(); I != E; |
| 543 ++I) { |
| 544 Variable *Var = *I; |
| 545 uint32_t NaturalAlignment = typeWidthInBytesOnStack(Var->getType()); |
| 546 SizeT LogNaturalAlignment = ffs(NaturalAlignment) - 1; |
| 547 assert(LogNaturalAlignment >= X86_LOG2_OF_MIN_STACK_SLOT_SIZE); |
| 548 assert(LogNaturalAlignment <= X86_LOG2_OF_MAX_STACK_SLOT_SIZE); |
| 549 SizeT BucketIndex = LogNaturalAlignment - X86_LOG2_OF_MIN_STACK_SLOT_SIZE; |
| 550 Buckets[BucketIndex].push_back(Var); |
| 551 } |
| 552 |
| 553 for (SizeT I = 0, E = NumBuckets; I < E; ++I) { |
| 554 VarList &List = Buckets[NumBuckets - I - 1]; |
| 555 Dest.insert(Dest.end(), List.begin(), List.end()); |
| 556 } |
| 557 } |
| 558 |
523 // Helper function for addProlog(). | 559 // Helper function for addProlog(). |
524 // | 560 // |
525 // This assumes Arg is an argument passed on the stack. This sets the | 561 // This assumes Arg is an argument passed on the stack. This sets the |
526 // frame offset for Arg and updates InArgsSizeBytes according to Arg's | 562 // frame offset for Arg and updates InArgsSizeBytes according to Arg's |
527 // width. For an I64 arg that has been split into Lo and Hi components, | 563 // width. For an I64 arg that has been split into Lo and Hi components, |
528 // it calls itself recursively on the components, taking care to handle | 564 // it calls itself recursively on the components, taking care to handle |
529 // Lo first because of the little-endian architecture. Lastly, this | 565 // Lo first because of the little-endian architecture. Lastly, this |
530 // function generates an instruction to copy Arg into its assigned | 566 // function generates an instruction to copy Arg into its assigned |
531 // register if applicable. | 567 // register if applicable. |
532 void TargetX8632::finishArgumentLowering(Variable *Arg, Variable *FramePtr, | 568 void TargetX8632::finishArgumentLowering(Variable *Arg, Variable *FramePtr, |
(...skipping 23 matching lines...) Expand all Loading... |
556 _movp(Arg, Mem); | 592 _movp(Arg, Mem); |
557 } else { | 593 } else { |
558 _mov(Arg, Mem); | 594 _mov(Arg, Mem); |
559 } | 595 } |
560 } | 596 } |
561 } | 597 } |
562 | 598 |
563 Type TargetX8632::stackSlotType() { return IceType_i32; } | 599 Type TargetX8632::stackSlotType() { return IceType_i32; } |
564 | 600 |
565 void TargetX8632::addProlog(CfgNode *Node) { | 601 void TargetX8632::addProlog(CfgNode *Node) { |
| 602 // Stack frame layout: |
| 603 // |
| 604 // +------------------------+ |
| 605 // | 1. return address | |
| 606 // +------------------------+ |
| 607 // | 2. preserved registers | |
| 608 // +------------------------+ |
| 609 // | 3. padding | |
| 610 // +------------------------+ |
| 611 // | 4. global spill area | |
| 612 // +------------------------+ |
| 613 // | 5. padding | |
| 614 // +------------------------+ |
| 615 // | 6. local spill area | |
| 616 // +------------------------+ |
| 617 // | 7. padding | |
| 618 // +------------------------+ |
| 619 // | 8. allocas | |
| 620 // +------------------------+ |
| 621 // |
| 622 // The following variables record the size in bytes of the given areas: |
| 623 // * X86_RET_IP_SIZE_BYTES: area 1 |
| 624 // * PreservedRegsSizeBytes: area 2 |
| 625 // * SpillAreaPaddingBytes: area 3 |
| 626 // * GlobalsSize: area 4 |
| 627 // * GlobalsAndSubsequentPaddingSize: areas 4 - 5 |
| 628 // * LocalsSpillAreaSize: area 6 |
| 629 // * SpillAreaSizeBytes: areas 3 - 7 |
| 630 |
566 // If SimpleCoalescing is false, each variable without a register | 631 // If SimpleCoalescing is false, each variable without a register |
567 // gets its own unique stack slot, which leads to large stack | 632 // gets its own unique stack slot, which leads to large stack |
568 // frames. If SimpleCoalescing is true, then each "global" variable | 633 // frames. If SimpleCoalescing is true, then each "global" variable |
569 // without a register gets its own slot, but "local" variable slots | 634 // without a register gets its own slot, but "local" variable slots |
570 // are reused across basic blocks. E.g., if A and B are local to | 635 // are reused across basic blocks. E.g., if A and B are local to |
571 // block 1 and C is local to block 2, then C may share a slot with A | 636 // block 1 and C is local to block 2, then C may share a slot with A |
572 // or B. | 637 // or B. |
573 const bool SimpleCoalescing = true; | 638 const bool SimpleCoalescing = true; |
574 size_t InArgsSizeBytes = 0; | 639 size_t InArgsSizeBytes = 0; |
575 size_t PreservedRegsSizeBytes = 0; | 640 size_t PreservedRegsSizeBytes = 0; |
576 LocalsSizeBytes = 0; | 641 SpillAreaSizeBytes = 0; |
577 Context.init(Node); | 642 Context.init(Node); |
578 Context.setInsertPoint(Context.getCur()); | 643 Context.setInsertPoint(Context.getCur()); |
579 | 644 |
580 // Determine stack frame offsets for each Variable without a | 645 // Determine stack frame offsets for each Variable without a |
581 // register assignment. This can be done as one variable per stack | 646 // register assignment. This can be done as one variable per stack |
582 // slot. Or, do coalescing by running the register allocator again | 647 // slot. Or, do coalescing by running the register allocator again |
583 // with an infinite set of registers (as a side effect, this gives | 648 // with an infinite set of registers (as a side effect, this gives |
584 // variables a second chance at physical register assignment). | 649 // variables a second chance at physical register assignment). |
585 // | 650 // |
586 // A middle ground approach is to leverage sparsity and allocate one | 651 // A middle ground approach is to leverage sparsity and allocate one |
587 // block of space on the frame for globals (variables with | 652 // block of space on the frame for globals (variables with |
588 // multi-block lifetime), and one block to share for locals | 653 // multi-block lifetime), and one block to share for locals |
589 // (single-block lifetime). | 654 // (single-block lifetime). |
590 | 655 |
591 llvm::SmallBitVector CalleeSaves = | 656 llvm::SmallBitVector CalleeSaves = |
592 getRegisterSet(RegSet_CalleeSave, RegSet_None); | 657 getRegisterSet(RegSet_CalleeSave, RegSet_None); |
593 | 658 |
594 size_t GlobalsSize = 0; | 659 size_t GlobalsSize = 0; |
595 std::vector<size_t> LocalsSize(Func->getNumNodes()); | 660 std::vector<size_t> LocalsSize(Func->getNumNodes()); |
596 | 661 |
597 // Prepass. Compute RegsUsed, PreservedRegsSizeBytes, and | 662 // Prepass. Compute RegsUsed, PreservedRegsSizeBytes, and |
598 // LocalsSizeBytes. | 663 // SpillAreaSizeBytes. |
599 RegsUsed = llvm::SmallBitVector(CalleeSaves.size()); | 664 RegsUsed = llvm::SmallBitVector(CalleeSaves.size()); |
600 const VarList &Variables = Func->getVariables(); | 665 const VarList &Variables = Func->getVariables(); |
601 const VarList &Args = Func->getArgs(); | 666 const VarList &Args = Func->getArgs(); |
| 667 VarList SpilledVariables, SortedSpilledVariables, |
| 668 VariablesLinkedToSpillSplots; |
| 669 |
| 670 // If there is a separate locals area, this specifies the alignment |
| 671 // for it. |
| 672 uint32_t LocalsSlotsAlignmentBytes = 0; |
| 673 // The entire spill locations area gets aligned to the largest natural |
| 674 // alignment of the variables that have a spill slot. |
| 675 uint32_t SpillAreaAlignmentBytes = 0; |
602 for (VarList::const_iterator I = Variables.begin(), E = Variables.end(); | 676 for (VarList::const_iterator I = Variables.begin(), E = Variables.end(); |
603 I != E; ++I) { | 677 I != E; ++I) { |
604 Variable *Var = *I; | 678 Variable *Var = *I; |
605 if (Var->hasReg()) { | 679 if (Var->hasReg()) { |
606 RegsUsed[Var->getRegNum()] = true; | 680 RegsUsed[Var->getRegNum()] = true; |
607 continue; | 681 continue; |
608 } | 682 } |
609 // An argument either does not need a stack slot (if passed in a | 683 // An argument either does not need a stack slot (if passed in a |
610 // register) or already has one (if passed on the stack). | 684 // register) or already has one (if passed on the stack). |
611 if (Var->getIsArg()) | 685 if (Var->getIsArg()) |
612 continue; | 686 continue; |
613 // An unreferenced variable doesn't need a stack slot. | 687 // An unreferenced variable doesn't need a stack slot. |
614 if (ComputedLiveRanges && Var->getLiveRange().isEmpty()) | 688 if (ComputedLiveRanges && Var->getLiveRange().isEmpty()) |
615 continue; | 689 continue; |
616 // A spill slot linked to a variable with a stack slot should reuse | 690 // A spill slot linked to a variable with a stack slot should reuse |
617 // that stack slot. | 691 // that stack slot. |
618 if (Var->getWeight() == RegWeight::Zero && Var->getRegisterOverlap()) { | 692 if (Var->getWeight() == RegWeight::Zero && Var->getRegisterOverlap()) { |
619 if (Variable *Linked = Var->getPreferredRegister()) { | 693 if (Variable *Linked = Var->getPreferredRegister()) { |
620 if (!Linked->hasReg()) | 694 if (!Linked->hasReg()) { |
| 695 VariablesLinkedToSpillSplots.push_back(Var); |
621 continue; | 696 continue; |
| 697 } |
622 } | 698 } |
623 } | 699 } |
| 700 SpilledVariables.push_back(Var); |
| 701 } |
| 702 |
| 703 SortedSpilledVariables.reserve(SpilledVariables.size()); |
| 704 sortByAlignment(SortedSpilledVariables, SpilledVariables); |
| 705 for (VarList::const_iterator I = SortedSpilledVariables.begin(), |
| 706 E = SortedSpilledVariables.end(); |
| 707 I != E; ++I) { |
| 708 Variable *Var = *I; |
624 size_t Increment = typeWidthInBytesOnStack(Var->getType()); | 709 size_t Increment = typeWidthInBytesOnStack(Var->getType()); |
| 710 if (!SpillAreaAlignmentBytes) |
| 711 SpillAreaAlignmentBytes = Increment; |
625 if (SimpleCoalescing) { | 712 if (SimpleCoalescing) { |
626 if (Var->isMultiblockLife()) { | 713 if (Var->isMultiblockLife()) { |
627 GlobalsSize += Increment; | 714 GlobalsSize += Increment; |
628 } else { | 715 } else { |
629 SizeT NodeIndex = Var->getLocalUseNode()->getIndex(); | 716 SizeT NodeIndex = Var->getLocalUseNode()->getIndex(); |
630 LocalsSize[NodeIndex] += Increment; | 717 LocalsSize[NodeIndex] += Increment; |
631 if (LocalsSize[NodeIndex] > LocalsSizeBytes) | 718 if (LocalsSize[NodeIndex] > SpillAreaSizeBytes) |
632 LocalsSizeBytes = LocalsSize[NodeIndex]; | 719 SpillAreaSizeBytes = LocalsSize[NodeIndex]; |
| 720 if (!LocalsSlotsAlignmentBytes) |
| 721 LocalsSlotsAlignmentBytes = Increment; |
633 } | 722 } |
634 } else { | 723 } else { |
635 LocalsSizeBytes += Increment; | 724 SpillAreaSizeBytes += Increment; |
636 } | 725 } |
637 } | 726 } |
638 LocalsSizeBytes += GlobalsSize; | 727 uint32_t LocalsSpillAreaSize = SpillAreaSizeBytes; |
| 728 |
| 729 SpillAreaSizeBytes += GlobalsSize; |
639 | 730 |
640 // Add push instructions for preserved registers. | 731 // Add push instructions for preserved registers. |
641 for (SizeT i = 0; i < CalleeSaves.size(); ++i) { | 732 for (SizeT i = 0; i < CalleeSaves.size(); ++i) { |
642 if (CalleeSaves[i] && RegsUsed[i]) { | 733 if (CalleeSaves[i] && RegsUsed[i]) { |
643 PreservedRegsSizeBytes += 4; | 734 PreservedRegsSizeBytes += 4; |
644 const bool SuppressStackAdjustment = true; | 735 const bool SuppressStackAdjustment = true; |
645 _push(getPhysicalRegister(i), SuppressStackAdjustment); | 736 _push(getPhysicalRegister(i), SuppressStackAdjustment); |
646 } | 737 } |
647 } | 738 } |
648 | 739 |
649 // Generate "push ebp; mov ebp, esp" | 740 // Generate "push ebp; mov ebp, esp" |
650 if (IsEbpBasedFrame) { | 741 if (IsEbpBasedFrame) { |
651 assert((RegsUsed & getRegisterSet(RegSet_FramePointer, RegSet_None)) | 742 assert((RegsUsed & getRegisterSet(RegSet_FramePointer, RegSet_None)) |
652 .count() == 0); | 743 .count() == 0); |
653 PreservedRegsSizeBytes += 4; | 744 PreservedRegsSizeBytes += 4; |
654 Variable *ebp = getPhysicalRegister(Reg_ebp); | 745 Variable *ebp = getPhysicalRegister(Reg_ebp); |
655 Variable *esp = getPhysicalRegister(Reg_esp); | 746 Variable *esp = getPhysicalRegister(Reg_esp); |
656 const bool SuppressStackAdjustment = true; | 747 const bool SuppressStackAdjustment = true; |
657 _push(ebp, SuppressStackAdjustment); | 748 _push(ebp, SuppressStackAdjustment); |
658 _mov(ebp, esp); | 749 _mov(ebp, esp); |
659 } | 750 } |
660 | 751 |
661 if (NeedsStackAlignment) { | 752 // Align the variables area. SpillAreaPaddingBytes is the size of |
662 uint32_t StackSize = applyStackAlignment( | 753 // the region after the preserved registers and before the spill |
663 X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes + LocalsSizeBytes); | 754 // areas. |
664 LocalsSizeBytes = | 755 uint32_t SpillAreaPaddingBytes = 0; |
665 StackSize - X86_RET_IP_SIZE_BYTES - PreservedRegsSizeBytes; | 756 if (SpillAreaAlignmentBytes) { |
| 757 assert(SpillAreaAlignmentBytes <= X86_STACK_ALIGNMENT_BYTES); |
| 758 uint32_t PaddingStart = X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes; |
| 759 uint32_t SpillAreaStart = |
| 760 applyAlignment(PaddingStart, SpillAreaAlignmentBytes); |
| 761 SpillAreaPaddingBytes = SpillAreaStart - PaddingStart; |
| 762 SpillAreaSizeBytes += SpillAreaPaddingBytes; |
666 } | 763 } |
667 | 764 |
668 // Generate "sub esp, LocalsSizeBytes" | 765 // If there are separate globals and locals areas, make sure the |
669 if (LocalsSizeBytes) | 766 // locals area is aligned by padding the end of the globals area. |
| 767 uint32_t GlobalsAndSubsequentPaddingSize = GlobalsSize; |
| 768 if (LocalsSlotsAlignmentBytes) { |
| 769 assert(LocalsSlotsAlignmentBytes <= SpillAreaAlignmentBytes); |
| 770 GlobalsAndSubsequentPaddingSize = |
| 771 applyAlignment(GlobalsSize, LocalsSlotsAlignmentBytes); |
| 772 SpillAreaSizeBytes += GlobalsAndSubsequentPaddingSize - GlobalsSize; |
| 773 } |
| 774 |
| 775 // Align esp if necessary. |
| 776 if (NeedsStackAlignment) { |
| 777 uint32_t StackOffset = X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes; |
| 778 uint32_t StackSize = applyStackAlignment(StackOffset + SpillAreaSizeBytes); |
| 779 SpillAreaSizeBytes = StackSize - StackOffset; |
| 780 } |
| 781 |
| 782 // Generate "sub esp, SpillAreaSizeBytes" |
| 783 if (SpillAreaSizeBytes) |
670 _sub(getPhysicalRegister(Reg_esp), | 784 _sub(getPhysicalRegister(Reg_esp), |
671 Ctx->getConstantInt(IceType_i32, LocalsSizeBytes)); | 785 Ctx->getConstantInt(IceType_i32, SpillAreaSizeBytes)); |
672 | 786 |
673 resetStackAdjustment(); | 787 resetStackAdjustment(); |
674 | 788 |
675 // Fill in stack offsets for stack args, and copy args into registers | 789 // Fill in stack offsets for stack args, and copy args into registers |
676 // for those that were register-allocated. Args are pushed right to | 790 // for those that were register-allocated. Args are pushed right to |
677 // left, so Arg[0] is closest to the stack/frame pointer. | 791 // left, so Arg[0] is closest to the stack/frame pointer. |
678 Variable *FramePtr = getPhysicalRegister(getFrameOrStackReg()); | 792 Variable *FramePtr = getPhysicalRegister(getFrameOrStackReg()); |
679 size_t BasicFrameOffset = PreservedRegsSizeBytes + X86_RET_IP_SIZE_BYTES; | 793 size_t BasicFrameOffset = PreservedRegsSizeBytes + X86_RET_IP_SIZE_BYTES; |
680 if (!IsEbpBasedFrame) | 794 if (!IsEbpBasedFrame) |
681 BasicFrameOffset += LocalsSizeBytes; | 795 BasicFrameOffset += SpillAreaSizeBytes; |
682 | 796 |
683 unsigned NumXmmArgs = 0; | 797 unsigned NumXmmArgs = 0; |
684 for (SizeT i = 0; i < Args.size(); ++i) { | 798 for (SizeT i = 0; i < Args.size(); ++i) { |
685 Variable *Arg = Args[i]; | 799 Variable *Arg = Args[i]; |
686 // Skip arguments passed in registers. | 800 // Skip arguments passed in registers. |
687 if (isVectorType(Arg->getType()) && NumXmmArgs < X86_MAX_XMM_ARGS) { | 801 if (isVectorType(Arg->getType()) && NumXmmArgs < X86_MAX_XMM_ARGS) { |
688 ++NumXmmArgs; | 802 ++NumXmmArgs; |
689 continue; | 803 continue; |
690 } | 804 } |
691 finishArgumentLowering(Arg, FramePtr, BasicFrameOffset, InArgsSizeBytes); | 805 finishArgumentLowering(Arg, FramePtr, BasicFrameOffset, InArgsSizeBytes); |
692 } | 806 } |
693 | 807 |
694 // Fill in stack offsets for locals. | 808 // Fill in stack offsets for locals. |
695 size_t TotalGlobalsSize = GlobalsSize; | 809 size_t GlobalsSpaceUsed = SpillAreaPaddingBytes; |
696 GlobalsSize = 0; | |
697 LocalsSize.assign(LocalsSize.size(), 0); | 810 LocalsSize.assign(LocalsSize.size(), 0); |
698 size_t NextStackOffset = 0; | 811 size_t NextStackOffset = GlobalsSpaceUsed; |
699 for (VarList::const_iterator I = Variables.begin(), E = Variables.end(); | 812 for (VarList::const_iterator I = SortedSpilledVariables.begin(), |
| 813 E = SortedSpilledVariables.end(); |
700 I != E; ++I) { | 814 I != E; ++I) { |
701 Variable *Var = *I; | 815 Variable *Var = *I; |
702 if (Var->hasReg()) { | |
703 RegsUsed[Var->getRegNum()] = true; | |
704 continue; | |
705 } | |
706 if (Var->getIsArg()) | |
707 continue; | |
708 if (ComputedLiveRanges && Var->getLiveRange().isEmpty()) | |
709 continue; | |
710 if (Var->getWeight() == RegWeight::Zero && Var->getRegisterOverlap()) { | |
711 if (Variable *Linked = Var->getPreferredRegister()) { | |
712 if (!Linked->hasReg()) { | |
713 // TODO: Make sure Linked has already been assigned a stack | |
714 // slot. | |
715 Var->setStackOffset(Linked->getStackOffset()); | |
716 continue; | |
717 } | |
718 } | |
719 } | |
720 size_t Increment = typeWidthInBytesOnStack(Var->getType()); | 816 size_t Increment = typeWidthInBytesOnStack(Var->getType()); |
721 if (SimpleCoalescing) { | 817 if (SimpleCoalescing) { |
722 if (Var->isMultiblockLife()) { | 818 if (Var->isMultiblockLife()) { |
723 GlobalsSize += Increment; | 819 GlobalsSpaceUsed += Increment; |
724 NextStackOffset = GlobalsSize; | 820 NextStackOffset = GlobalsSpaceUsed; |
725 } else { | 821 } else { |
726 SizeT NodeIndex = Var->getLocalUseNode()->getIndex(); | 822 SizeT NodeIndex = Var->getLocalUseNode()->getIndex(); |
727 LocalsSize[NodeIndex] += Increment; | 823 LocalsSize[NodeIndex] += Increment; |
728 NextStackOffset = TotalGlobalsSize + LocalsSize[NodeIndex]; | 824 NextStackOffset = SpillAreaPaddingBytes + |
| 825 GlobalsAndSubsequentPaddingSize + |
| 826 LocalsSize[NodeIndex]; |
729 } | 827 } |
730 } else { | 828 } else { |
731 NextStackOffset += Increment; | 829 NextStackOffset += Increment; |
732 } | 830 } |
733 if (IsEbpBasedFrame) | 831 if (IsEbpBasedFrame) |
734 Var->setStackOffset(-NextStackOffset); | 832 Var->setStackOffset(-NextStackOffset); |
735 else | 833 else |
736 Var->setStackOffset(LocalsSizeBytes - NextStackOffset); | 834 Var->setStackOffset(SpillAreaSizeBytes - NextStackOffset); |
737 } | 835 } |
738 this->FrameSizeLocals = NextStackOffset; | 836 this->FrameSizeLocals = NextStackOffset - SpillAreaPaddingBytes; |
739 this->HasComputedFrame = true; | 837 this->HasComputedFrame = true; |
740 | 838 |
| 839 // Assign stack offsets to variables that have been linked to spilled |
| 840 // variables. |
| 841 for (VarList::const_iterator I = VariablesLinkedToSpillSplots.begin(), |
| 842 E = VariablesLinkedToSpillSplots.end(); |
| 843 I != E; ++I) { |
| 844 Variable *Var = *I; |
| 845 Variable *Linked = Var->getPreferredRegister(); |
| 846 Var->setStackOffset(Linked->getStackOffset()); |
| 847 } |
| 848 |
741 if (Func->getContext()->isVerbose(IceV_Frame)) { | 849 if (Func->getContext()->isVerbose(IceV_Frame)) { |
742 Func->getContext()->getStrDump() << "LocalsSizeBytes=" << LocalsSizeBytes | 850 Ostream &Str = Func->getContext()->getStrDump(); |
743 << "\n" | 851 |
744 << "InArgsSizeBytes=" << InArgsSizeBytes | 852 Str << "Stack layout:\n"; |
745 << "\n" | 853 uint32_t EspAdjustmentPaddingSize = |
746 << "PreservedRegsSizeBytes=" | 854 SpillAreaSizeBytes - LocalsSpillAreaSize - |
747 << PreservedRegsSizeBytes << "\n"; | 855 GlobalsAndSubsequentPaddingSize - SpillAreaPaddingBytes; |
| 856 Str << " in-args = " << InArgsSizeBytes << " bytes\n" |
| 857 << " return address = " << X86_RET_IP_SIZE_BYTES << " bytes\n" |
| 858 << " preserved registers = " << PreservedRegsSizeBytes << " bytes\n" |
| 859 << " spill area padding = " << SpillAreaPaddingBytes << " bytes\n" |
| 860 << " globals spill area = " << GlobalsSize << " bytes\n" |
| 861 << " globals-locals spill areas intermediate padding = " |
| 862 << GlobalsAndSubsequentPaddingSize - GlobalsSize << " bytes\n" |
| 863 << " locals spill area = " << LocalsSpillAreaSize << " bytes\n" |
| 864 << " esp alignment padding = " << EspAdjustmentPaddingSize |
| 865 << " bytes\n"; |
| 866 |
| 867 Str << "Stack details:\n" |
| 868 << " esp adjustment = " << SpillAreaSizeBytes << " bytes\n" |
| 869 << " spill area alignment = " << SpillAreaAlignmentBytes << " bytes\n" |
| 870 << " locals spill area alignment = " << LocalsSlotsAlignmentBytes |
| 871 << " bytes\n" |
| 872 << " is ebp based = " << IsEbpBasedFrame << "\n"; |
748 } | 873 } |
749 } | 874 } |
750 | 875 |
751 void TargetX8632::addEpilog(CfgNode *Node) { | 876 void TargetX8632::addEpilog(CfgNode *Node) { |
752 InstList &Insts = Node->getInsts(); | 877 InstList &Insts = Node->getInsts(); |
753 InstList::reverse_iterator RI, E; | 878 InstList::reverse_iterator RI, E; |
754 for (RI = Insts.rbegin(), E = Insts.rend(); RI != E; ++RI) { | 879 for (RI = Insts.rbegin(), E = Insts.rend(); RI != E; ++RI) { |
755 if (llvm::isa<InstX8632Ret>(*RI)) | 880 if (llvm::isa<InstX8632Ret>(*RI)) |
756 break; | 881 break; |
757 } | 882 } |
758 if (RI == E) | 883 if (RI == E) |
759 return; | 884 return; |
760 | 885 |
761 // Convert the reverse_iterator position into its corresponding | 886 // Convert the reverse_iterator position into its corresponding |
762 // (forward) iterator position. | 887 // (forward) iterator position. |
763 InstList::iterator InsertPoint = RI.base(); | 888 InstList::iterator InsertPoint = RI.base(); |
764 --InsertPoint; | 889 --InsertPoint; |
765 Context.init(Node); | 890 Context.init(Node); |
766 Context.setInsertPoint(InsertPoint); | 891 Context.setInsertPoint(InsertPoint); |
767 | 892 |
768 Variable *esp = getPhysicalRegister(Reg_esp); | 893 Variable *esp = getPhysicalRegister(Reg_esp); |
769 if (IsEbpBasedFrame) { | 894 if (IsEbpBasedFrame) { |
770 Variable *ebp = getPhysicalRegister(Reg_ebp); | 895 Variable *ebp = getPhysicalRegister(Reg_ebp); |
771 _mov(esp, ebp); | 896 _mov(esp, ebp); |
772 _pop(ebp); | 897 _pop(ebp); |
773 } else { | 898 } else { |
774 // add esp, LocalsSizeBytes | 899 // add esp, SpillAreaSizeBytes |
775 if (LocalsSizeBytes) | 900 if (SpillAreaSizeBytes) |
776 _add(esp, Ctx->getConstantInt(IceType_i32, LocalsSizeBytes)); | 901 _add(esp, Ctx->getConstantInt(IceType_i32, SpillAreaSizeBytes)); |
777 } | 902 } |
778 | 903 |
779 // Add pop instructions for preserved registers. | 904 // Add pop instructions for preserved registers. |
780 llvm::SmallBitVector CalleeSaves = | 905 llvm::SmallBitVector CalleeSaves = |
781 getRegisterSet(RegSet_CalleeSave, RegSet_None); | 906 getRegisterSet(RegSet_CalleeSave, RegSet_None); |
782 for (SizeT i = 0; i < CalleeSaves.size(); ++i) { | 907 for (SizeT i = 0; i < CalleeSaves.size(); ++i) { |
783 SizeT j = CalleeSaves.size() - i - 1; | 908 SizeT j = CalleeSaves.size() - i - 1; |
784 if (j == Reg_ebp && IsEbpBasedFrame) | 909 if (j == Reg_ebp && IsEbpBasedFrame) |
785 continue; | 910 continue; |
786 if (CalleeSaves[j] && RegsUsed[j]) { | 911 if (CalleeSaves[j] && RegsUsed[j]) { |
(...skipping 197 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
984 assert((AlignmentParam & (AlignmentParam - 1)) == 0); | 1109 assert((AlignmentParam & (AlignmentParam - 1)) == 0); |
985 assert((X86_STACK_ALIGNMENT_BYTES & (X86_STACK_ALIGNMENT_BYTES - 1)) == 0); | 1110 assert((X86_STACK_ALIGNMENT_BYTES & (X86_STACK_ALIGNMENT_BYTES - 1)) == 0); |
986 | 1111 |
987 uint32_t Alignment = std::max(AlignmentParam, X86_STACK_ALIGNMENT_BYTES); | 1112 uint32_t Alignment = std::max(AlignmentParam, X86_STACK_ALIGNMENT_BYTES); |
988 if (Alignment > X86_STACK_ALIGNMENT_BYTES) { | 1113 if (Alignment > X86_STACK_ALIGNMENT_BYTES) { |
989 _and(esp, Ctx->getConstantInt(IceType_i32, -Alignment)); | 1114 _and(esp, Ctx->getConstantInt(IceType_i32, -Alignment)); |
990 } | 1115 } |
991 if (ConstantInteger *ConstantTotalSize = | 1116 if (ConstantInteger *ConstantTotalSize = |
992 llvm::dyn_cast<ConstantInteger>(TotalSize)) { | 1117 llvm::dyn_cast<ConstantInteger>(TotalSize)) { |
993 uint32_t Value = ConstantTotalSize->getValue(); | 1118 uint32_t Value = ConstantTotalSize->getValue(); |
994 // Round Value up to the next highest multiple of the alignment. | 1119 Value = applyAlignment(Value, Alignment); |
995 Value = (Value + Alignment - 1) & -Alignment; | |
996 _sub(esp, Ctx->getConstantInt(IceType_i32, Value)); | 1120 _sub(esp, Ctx->getConstantInt(IceType_i32, Value)); |
997 } else { | 1121 } else { |
998 // Non-constant sizes need to be adjusted to the next highest | 1122 // Non-constant sizes need to be adjusted to the next highest |
999 // multiple of the required alignment at runtime. | 1123 // multiple of the required alignment at runtime. |
1000 Variable *T = makeReg(IceType_i32); | 1124 Variable *T = makeReg(IceType_i32); |
1001 _mov(T, TotalSize); | 1125 _mov(T, TotalSize); |
1002 _add(T, Ctx->getConstantInt(IceType_i32, Alignment - 1)); | 1126 _add(T, Ctx->getConstantInt(IceType_i32, Alignment - 1)); |
1003 _and(T, Ctx->getConstantInt(IceType_i32, -Alignment)); | 1127 _and(T, Ctx->getConstantInt(IceType_i32, -Alignment)); |
1004 _sub(esp, T); | 1128 _sub(esp, T); |
1005 } | 1129 } |
(...skipping 226 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1232 case InstArithmetic::Fsub: | 1356 case InstArithmetic::Fsub: |
1233 case InstArithmetic::Fmul: | 1357 case InstArithmetic::Fmul: |
1234 case InstArithmetic::Fdiv: | 1358 case InstArithmetic::Fdiv: |
1235 case InstArithmetic::Frem: | 1359 case InstArithmetic::Frem: |
1236 llvm_unreachable("FP instruction with i64 type"); | 1360 llvm_unreachable("FP instruction with i64 type"); |
1237 break; | 1361 break; |
1238 } | 1362 } |
1239 } else if (isVectorType(Dest->getType())) { | 1363 } else if (isVectorType(Dest->getType())) { |
1240 // TODO: Trap on integer divide and integer modulo by zero. | 1364 // TODO: Trap on integer divide and integer modulo by zero. |
1241 // See: https://code.google.com/p/nativeclient/issues/detail?id=3899 | 1365 // See: https://code.google.com/p/nativeclient/issues/detail?id=3899 |
1242 // | |
1243 // TODO(wala): ALIGNHACK: All vector arithmetic is currently done in | |
1244 // registers. This is a workaround of the fact that there is no | |
1245 // support for aligning stack operands. Once there is support, | |
1246 // remove LEGAL_HACK. | |
1247 #define LEGAL_HACK(s) legalizeToVar((s)) | |
1248 switch (Inst->getOp()) { | 1366 switch (Inst->getOp()) { |
1249 case InstArithmetic::_num: | 1367 case InstArithmetic::_num: |
1250 llvm_unreachable("Unknown arithmetic operator"); | 1368 llvm_unreachable("Unknown arithmetic operator"); |
1251 break; | 1369 break; |
1252 case InstArithmetic::Add: { | 1370 case InstArithmetic::Add: { |
1253 Variable *T = makeReg(Dest->getType()); | 1371 Variable *T = makeReg(Dest->getType()); |
1254 _movp(T, Src0); | 1372 _movp(T, Src0); |
1255 _padd(T, LEGAL_HACK(Src1)); | 1373 _padd(T, Src1); |
1256 _movp(Dest, T); | 1374 _movp(Dest, T); |
1257 } break; | 1375 } break; |
1258 case InstArithmetic::And: { | 1376 case InstArithmetic::And: { |
1259 Variable *T = makeReg(Dest->getType()); | 1377 Variable *T = makeReg(Dest->getType()); |
1260 _movp(T, Src0); | 1378 _movp(T, Src0); |
1261 _pand(T, LEGAL_HACK(Src1)); | 1379 _pand(T, Src1); |
1262 _movp(Dest, T); | 1380 _movp(Dest, T); |
1263 } break; | 1381 } break; |
1264 case InstArithmetic::Or: { | 1382 case InstArithmetic::Or: { |
1265 Variable *T = makeReg(Dest->getType()); | 1383 Variable *T = makeReg(Dest->getType()); |
1266 _movp(T, Src0); | 1384 _movp(T, Src0); |
1267 _por(T, LEGAL_HACK(Src1)); | 1385 _por(T, Src1); |
1268 _movp(Dest, T); | 1386 _movp(Dest, T); |
1269 } break; | 1387 } break; |
1270 case InstArithmetic::Xor: { | 1388 case InstArithmetic::Xor: { |
1271 Variable *T = makeReg(Dest->getType()); | 1389 Variable *T = makeReg(Dest->getType()); |
1272 _movp(T, Src0); | 1390 _movp(T, Src0); |
1273 _pxor(T, LEGAL_HACK(Src1)); | 1391 _pxor(T, Src1); |
1274 _movp(Dest, T); | 1392 _movp(Dest, T); |
1275 } break; | 1393 } break; |
1276 case InstArithmetic::Sub: { | 1394 case InstArithmetic::Sub: { |
1277 Variable *T = makeReg(Dest->getType()); | 1395 Variable *T = makeReg(Dest->getType()); |
1278 _movp(T, Src0); | 1396 _movp(T, Src0); |
1279 _psub(T, LEGAL_HACK(Src1)); | 1397 _psub(T, Src1); |
1280 _movp(Dest, T); | 1398 _movp(Dest, T); |
1281 } break; | 1399 } break; |
1282 case InstArithmetic::Mul: { | 1400 case InstArithmetic::Mul: { |
1283 bool TypesAreValidForPmull = | 1401 bool TypesAreValidForPmull = |
1284 Dest->getType() == IceType_v4i32 || Dest->getType() == IceType_v8i16; | 1402 Dest->getType() == IceType_v4i32 || Dest->getType() == IceType_v8i16; |
1285 bool InstructionSetIsValidForPmull = | 1403 bool InstructionSetIsValidForPmull = |
1286 Dest->getType() == IceType_v8i16 || InstructionSet >= SSE4_1; | 1404 Dest->getType() == IceType_v8i16 || InstructionSet >= SSE4_1; |
1287 if (TypesAreValidForPmull && InstructionSetIsValidForPmull) { | 1405 if (TypesAreValidForPmull && InstructionSetIsValidForPmull) { |
1288 Variable *T = makeReg(Dest->getType()); | 1406 Variable *T = makeReg(Dest->getType()); |
1289 _movp(T, Src0); | 1407 _movp(T, Src0); |
1290 _pmull(T, LEGAL_HACK(Src1)); | 1408 _pmull(T, Src1); |
1291 _movp(Dest, T); | 1409 _movp(Dest, T); |
1292 } else if (Dest->getType() == IceType_v4i32) { | 1410 } else if (Dest->getType() == IceType_v4i32) { |
1293 // Lowering sequence: | 1411 // Lowering sequence: |
1294 // Note: The mask arguments have index 0 on the left. | 1412 // Note: The mask arguments have index 0 on the left. |
1295 // | 1413 // |
1296 // movups T1, Src0 | 1414 // movups T1, Src0 |
1297 // pshufd T2, Src0, {1,0,3,0} | 1415 // pshufd T2, Src0, {1,0,3,0} |
1298 // pshufd T3, Src1, {1,0,3,0} | 1416 // pshufd T3, Src1, {1,0,3,0} |
1299 // # T1 = {Src0[0] * Src1[0], Src0[2] * Src1[2]} | 1417 // # T1 = {Src0[0] * Src1[0], Src0[2] * Src1[2]} |
1300 // pmuludq T1, Src1 | 1418 // pmuludq T1, Src1 |
(...skipping 12 matching lines...) Expand all Loading... |
1313 // Dest[0, 2], Src[0, 2] | 1431 // Dest[0, 2], Src[0, 2] |
1314 const unsigned Mask0202 = 0x88; | 1432 const unsigned Mask0202 = 0x88; |
1315 // Mask that directs pshufd to create a vector with entries | 1433 // Mask that directs pshufd to create a vector with entries |
1316 // Src[0, 2, 1, 3] | 1434 // Src[0, 2, 1, 3] |
1317 const unsigned Mask0213 = 0xd8; | 1435 const unsigned Mask0213 = 0xd8; |
1318 Variable *T1 = makeReg(IceType_v4i32); | 1436 Variable *T1 = makeReg(IceType_v4i32); |
1319 Variable *T2 = makeReg(IceType_v4i32); | 1437 Variable *T2 = makeReg(IceType_v4i32); |
1320 Variable *T3 = makeReg(IceType_v4i32); | 1438 Variable *T3 = makeReg(IceType_v4i32); |
1321 Variable *T4 = makeReg(IceType_v4i32); | 1439 Variable *T4 = makeReg(IceType_v4i32); |
1322 _movp(T1, Src0); | 1440 _movp(T1, Src0); |
1323 // TODO(wala): ALIGHNHACK: Replace Src0R with Src0 and Src1R | 1441 _pshufd(T2, Src0, Mask1030); |
1324 // with Src1 after stack operand alignment support is | 1442 _pshufd(T3, Src1, Mask1030); |
1325 // implemented. | 1443 _pmuludq(T1, Src1); |
1326 Variable *Src0R = LEGAL_HACK(Src0); | |
1327 Variable *Src1R = LEGAL_HACK(Src1); | |
1328 _pshufd(T2, Src0R, Mask1030); | |
1329 _pshufd(T3, Src1R, Mask1030); | |
1330 _pmuludq(T1, Src1R); | |
1331 _pmuludq(T2, T3); | 1444 _pmuludq(T2, T3); |
1332 _shufps(T1, T2, Ctx->getConstantInt(IceType_i8, Mask0202)); | 1445 _shufps(T1, T2, Ctx->getConstantInt(IceType_i8, Mask0202)); |
1333 _pshufd(T4, T1, Ctx->getConstantInt(IceType_i8, Mask0213)); | 1446 _pshufd(T4, T1, Ctx->getConstantInt(IceType_i8, Mask0213)); |
1334 _movp(Dest, T4); | 1447 _movp(Dest, T4); |
1335 } else { | 1448 } else { |
1336 assert(Dest->getType() == IceType_v16i8); | 1449 assert(Dest->getType() == IceType_v16i8); |
1337 scalarizeArithmetic(Inst->getOp(), Dest, Src0, Src1); | 1450 scalarizeArithmetic(Inst->getOp(), Dest, Src0, Src1); |
1338 } | 1451 } |
1339 } break; | 1452 } break; |
1340 case InstArithmetic::Shl: | 1453 case InstArithmetic::Shl: |
1341 case InstArithmetic::Lshr: | 1454 case InstArithmetic::Lshr: |
1342 case InstArithmetic::Ashr: | 1455 case InstArithmetic::Ashr: |
1343 case InstArithmetic::Udiv: | 1456 case InstArithmetic::Udiv: |
1344 case InstArithmetic::Urem: | 1457 case InstArithmetic::Urem: |
1345 case InstArithmetic::Sdiv: | 1458 case InstArithmetic::Sdiv: |
1346 case InstArithmetic::Srem: | 1459 case InstArithmetic::Srem: |
1347 scalarizeArithmetic(Inst->getOp(), Dest, Src0, Src1); | 1460 scalarizeArithmetic(Inst->getOp(), Dest, Src0, Src1); |
1348 break; | 1461 break; |
1349 case InstArithmetic::Fadd: { | 1462 case InstArithmetic::Fadd: { |
1350 Variable *T = makeReg(Dest->getType()); | 1463 Variable *T = makeReg(Dest->getType()); |
1351 _movp(T, Src0); | 1464 _movp(T, Src0); |
1352 _addps(T, LEGAL_HACK(Src1)); | 1465 _addps(T, Src1); |
1353 _movp(Dest, T); | 1466 _movp(Dest, T); |
1354 } break; | 1467 } break; |
1355 case InstArithmetic::Fsub: { | 1468 case InstArithmetic::Fsub: { |
1356 Variable *T = makeReg(Dest->getType()); | 1469 Variable *T = makeReg(Dest->getType()); |
1357 _movp(T, Src0); | 1470 _movp(T, Src0); |
1358 _subps(T, LEGAL_HACK(Src1)); | 1471 _subps(T, Src1); |
1359 _movp(Dest, T); | 1472 _movp(Dest, T); |
1360 } break; | 1473 } break; |
1361 case InstArithmetic::Fmul: { | 1474 case InstArithmetic::Fmul: { |
1362 Variable *T = makeReg(Dest->getType()); | 1475 Variable *T = makeReg(Dest->getType()); |
1363 _movp(T, Src0); | 1476 _movp(T, Src0); |
1364 _mulps(T, LEGAL_HACK(Src1)); | 1477 _mulps(T, Src1); |
1365 _movp(Dest, T); | 1478 _movp(Dest, T); |
1366 } break; | 1479 } break; |
1367 case InstArithmetic::Fdiv: { | 1480 case InstArithmetic::Fdiv: { |
1368 Variable *T = makeReg(Dest->getType()); | 1481 Variable *T = makeReg(Dest->getType()); |
1369 _movp(T, Src0); | 1482 _movp(T, Src0); |
1370 _divps(T, LEGAL_HACK(Src1)); | 1483 _divps(T, Src1); |
1371 _movp(Dest, T); | 1484 _movp(Dest, T); |
1372 } break; | 1485 } break; |
1373 case InstArithmetic::Frem: | 1486 case InstArithmetic::Frem: |
1374 scalarizeArithmetic(Inst->getOp(), Dest, Src0, Src1); | 1487 scalarizeArithmetic(Inst->getOp(), Dest, Src0, Src1); |
1375 break; | 1488 break; |
1376 } | 1489 } |
1377 #undef LEGAL_HACK | |
1378 } else { // Dest->getType() is non-i64 scalar | 1490 } else { // Dest->getType() is non-i64 scalar |
1379 Variable *T_edx = NULL; | 1491 Variable *T_edx = NULL; |
1380 Variable *T = NULL; | 1492 Variable *T = NULL; |
1381 switch (Inst->getOp()) { | 1493 switch (Inst->getOp()) { |
1382 case InstArithmetic::_num: | 1494 case InstArithmetic::_num: |
1383 llvm_unreachable("Unknown arithmetic operator"); | 1495 llvm_unreachable("Unknown arithmetic operator"); |
1384 break; | 1496 break; |
1385 case InstArithmetic::Add: | 1497 case InstArithmetic::Add: |
1386 _mov(T, Src0); | 1498 _mov(T, Src0); |
1387 _add(T, Src1); | 1499 _add(T, Src1); |
(...skipping 804 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2192 // TODO(wala): Determine the best lowering sequences for each type. | 2304 // TODO(wala): Determine the best lowering sequences for each type. |
2193 bool CanUsePextr = | 2305 bool CanUsePextr = |
2194 Ty == IceType_v8i16 || Ty == IceType_v8i1 || InstructionSet >= SSE4_1; | 2306 Ty == IceType_v8i16 || Ty == IceType_v8i1 || InstructionSet >= SSE4_1; |
2195 if (CanUsePextr && Ty != IceType_v4f32) { | 2307 if (CanUsePextr && Ty != IceType_v4f32) { |
2196 // Use pextrb, pextrw, or pextrd. | 2308 // Use pextrb, pextrw, or pextrd. |
2197 Constant *Mask = Ctx->getConstantInt(IceType_i8, Index); | 2309 Constant *Mask = Ctx->getConstantInt(IceType_i8, Index); |
2198 Variable *SourceVectR = legalizeToVar(SourceVectNotLegalized); | 2310 Variable *SourceVectR = legalizeToVar(SourceVectNotLegalized); |
2199 _pextr(ExtractedElementR, SourceVectR, Mask); | 2311 _pextr(ExtractedElementR, SourceVectR, Mask); |
2200 } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) { | 2312 } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) { |
2201 // Use pshufd and movd/movss. | 2313 // Use pshufd and movd/movss. |
2202 // | |
2203 // ALIGNHACK: Force vector operands to registers in instructions | |
2204 // that require aligned memory operands until support for data | |
2205 // alignment is implemented. | |
2206 #define ALIGN_HACK(Vect) legalizeToVar((Vect)) | |
2207 Operand *SourceVectRM = | |
2208 legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem); | |
2209 Variable *T = NULL; | 2314 Variable *T = NULL; |
2210 if (Index) { | 2315 if (Index) { |
2211 // The shuffle only needs to occur if the element to be extracted | 2316 // The shuffle only needs to occur if the element to be extracted |
2212 // is not at the lowest index. | 2317 // is not at the lowest index. |
2213 Constant *Mask = Ctx->getConstantInt(IceType_i8, Index); | 2318 Constant *Mask = Ctx->getConstantInt(IceType_i8, Index); |
2214 T = makeReg(Ty); | 2319 T = makeReg(Ty); |
2215 _pshufd(T, ALIGN_HACK(SourceVectRM), Mask); | 2320 _pshufd(T, legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem), Mask); |
2216 } else { | 2321 } else { |
2217 T = ALIGN_HACK(SourceVectRM); | 2322 T = legalizeToVar(SourceVectNotLegalized); |
2218 } | 2323 } |
2219 | 2324 |
2220 if (InVectorElementTy == IceType_i32) { | 2325 if (InVectorElementTy == IceType_i32) { |
2221 _movd(ExtractedElementR, T); | 2326 _movd(ExtractedElementR, T); |
2222 } else { // Ty == Icetype_f32 | 2327 } else { // Ty == Icetype_f32 |
2223 // TODO(wala): _movss is only used here because _mov does not | 2328 // TODO(wala): _movss is only used here because _mov does not |
2224 // allow a vector source and a scalar destination. _mov should be | 2329 // allow a vector source and a scalar destination. _mov should be |
2225 // able to be used here. | 2330 // able to be used here. |
2226 // _movss is a binary instruction, so the FakeDef is needed to | 2331 // _movss is a binary instruction, so the FakeDef is needed to |
2227 // keep the live range analysis consistent. | 2332 // keep the live range analysis consistent. |
2228 Context.insert(InstFakeDef::create(Func, ExtractedElementR)); | 2333 Context.insert(InstFakeDef::create(Func, ExtractedElementR)); |
2229 _movss(ExtractedElementR, T); | 2334 _movss(ExtractedElementR, T); |
2230 } | 2335 } |
2231 #undef ALIGN_HACK | |
2232 } else { | 2336 } else { |
2233 assert(Ty == IceType_v16i8 || Ty == IceType_v16i1); | 2337 assert(Ty == IceType_v16i8 || Ty == IceType_v16i1); |
2234 // Spill the value to a stack slot and do the extraction in memory. | 2338 // Spill the value to a stack slot and do the extraction in memory. |
2235 // | 2339 // |
2236 // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when | 2340 // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when |
2237 // support for legalizing to mem is implemented. | 2341 // support for legalizing to mem is implemented. |
2238 Variable *Slot = Func->makeVariable(Ty, Context.getNode()); | 2342 Variable *Slot = Func->makeVariable(Ty, Context.getNode()); |
2239 Slot->setWeight(RegWeight::Zero); | 2343 Slot->setWeight(RegWeight::Zero); |
2240 _movp(Slot, legalizeToVar(SourceVectNotLegalized)); | 2344 _movp(Slot, legalizeToVar(SourceVectNotLegalized)); |
2241 | 2345 |
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2280 | 2384 |
2281 if (Condition == InstFcmp::True) { | 2385 if (Condition == InstFcmp::True) { |
2282 // makeVectorOfOnes() requires an integer vector type. | 2386 // makeVectorOfOnes() requires an integer vector type. |
2283 T = makeVectorOfMinusOnes(IceType_v4i32); | 2387 T = makeVectorOfMinusOnes(IceType_v4i32); |
2284 } else if (Condition == InstFcmp::False) { | 2388 } else if (Condition == InstFcmp::False) { |
2285 T = makeVectorOfZeros(Dest->getType()); | 2389 T = makeVectorOfZeros(Dest->getType()); |
2286 } else { | 2390 } else { |
2287 Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); | 2391 Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); |
2288 Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem); | 2392 Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem); |
2289 | 2393 |
2290 // ALIGNHACK: Without support for data alignment, both operands to | |
2291 // cmpps need to be forced into registers. Once support for data | |
2292 // alignment is implemented, remove LEGAL_HACK. | |
2293 #define LEGAL_HACK(Vect) legalizeToVar((Vect)) | |
2294 switch (Condition) { | 2394 switch (Condition) { |
2295 default: { | 2395 default: { |
2296 InstX8632Cmpps::CmppsCond Predicate = TableFcmp[Index].Predicate; | 2396 InstX8632Cmpps::CmppsCond Predicate = TableFcmp[Index].Predicate; |
2297 assert(Predicate != InstX8632Cmpps::Cmpps_Invalid); | 2397 assert(Predicate != InstX8632Cmpps::Cmpps_Invalid); |
2298 T = makeReg(Src0RM->getType()); | 2398 T = makeReg(Src0RM->getType()); |
2299 _movp(T, Src0RM); | 2399 _movp(T, Src0RM); |
2300 _cmpps(T, LEGAL_HACK(Src1RM), Predicate); | 2400 _cmpps(T, Src1RM, Predicate); |
2301 } break; | 2401 } break; |
2302 case InstFcmp::One: { | 2402 case InstFcmp::One: { |
2303 // Check both unequal and ordered. | 2403 // Check both unequal and ordered. |
2304 T = makeReg(Src0RM->getType()); | 2404 T = makeReg(Src0RM->getType()); |
2305 Variable *T2 = makeReg(Src0RM->getType()); | 2405 Variable *T2 = makeReg(Src0RM->getType()); |
2306 Src1RM = LEGAL_HACK(Src1RM); | |
2307 _movp(T, Src0RM); | 2406 _movp(T, Src0RM); |
2308 _cmpps(T, Src1RM, InstX8632Cmpps::Cmpps_neq); | 2407 _cmpps(T, Src1RM, InstX8632Cmpps::Cmpps_neq); |
2309 _movp(T2, Src0RM); | 2408 _movp(T2, Src0RM); |
2310 _cmpps(T2, Src1RM, InstX8632Cmpps::Cmpps_ord); | 2409 _cmpps(T2, Src1RM, InstX8632Cmpps::Cmpps_ord); |
2311 _pand(T, T2); | 2410 _pand(T, T2); |
2312 } break; | 2411 } break; |
2313 case InstFcmp::Ueq: { | 2412 case InstFcmp::Ueq: { |
2314 // Check both equal or unordered. | 2413 // Check both equal or unordered. |
2315 T = makeReg(Src0RM->getType()); | 2414 T = makeReg(Src0RM->getType()); |
2316 Variable *T2 = makeReg(Src0RM->getType()); | 2415 Variable *T2 = makeReg(Src0RM->getType()); |
2317 Src1RM = LEGAL_HACK(Src1RM); | |
2318 _movp(T, Src0RM); | 2416 _movp(T, Src0RM); |
2319 _cmpps(T, Src1RM, InstX8632Cmpps::Cmpps_eq); | 2417 _cmpps(T, Src1RM, InstX8632Cmpps::Cmpps_eq); |
2320 _movp(T2, Src0RM); | 2418 _movp(T2, Src0RM); |
2321 _cmpps(T2, Src1RM, InstX8632Cmpps::Cmpps_unord); | 2419 _cmpps(T2, Src1RM, InstX8632Cmpps::Cmpps_unord); |
2322 _por(T, T2); | 2420 _por(T, T2); |
2323 } break; | 2421 } break; |
2324 } | 2422 } |
2325 #undef LEGAL_HACK | |
2326 } | 2423 } |
2327 | 2424 |
2328 _movp(Dest, T); | 2425 _movp(Dest, T); |
2329 eliminateNextVectorSextInstruction(Dest); | 2426 eliminateNextVectorSextInstruction(Dest); |
2330 return; | 2427 return; |
2331 } | 2428 } |
2332 | 2429 |
2333 // Lowering a = fcmp cond, b, c | 2430 // Lowering a = fcmp cond, b, c |
2334 // ucomiss b, c /* only if C1 != Br_None */ | 2431 // ucomiss b, c /* only if C1 != Br_None */ |
2335 // /* but swap b,c order if SwapOperands==true */ | 2432 // /* but swap b,c order if SwapOperands==true */ |
(...skipping 84 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2420 Variable *T1 = makeReg(Ty); | 2517 Variable *T1 = makeReg(Ty); |
2421 Variable *HighOrderBits = makeVectorOfHighOrderBits(Ty); | 2518 Variable *HighOrderBits = makeVectorOfHighOrderBits(Ty); |
2422 _movp(T0, Src0RM); | 2519 _movp(T0, Src0RM); |
2423 _pxor(T0, HighOrderBits); | 2520 _pxor(T0, HighOrderBits); |
2424 _movp(T1, Src1RM); | 2521 _movp(T1, Src1RM); |
2425 _pxor(T1, HighOrderBits); | 2522 _pxor(T1, HighOrderBits); |
2426 Src0RM = T0; | 2523 Src0RM = T0; |
2427 Src1RM = T1; | 2524 Src1RM = T1; |
2428 } | 2525 } |
2429 | 2526 |
2430 // TODO: ALIGNHACK: Both operands to compare instructions need to be | |
2431 // in registers until data alignment support is implemented. Once | |
2432 // there is support for data alignment, LEGAL_HACK can be removed. | |
2433 #define LEGAL_HACK(Vect) legalizeToVar((Vect)) | |
2434 Variable *T = makeReg(Ty); | 2527 Variable *T = makeReg(Ty); |
2435 switch (Condition) { | 2528 switch (Condition) { |
2436 default: | 2529 default: |
2437 llvm_unreachable("unexpected condition"); | 2530 llvm_unreachable("unexpected condition"); |
2438 break; | 2531 break; |
2439 case InstIcmp::Eq: { | 2532 case InstIcmp::Eq: { |
2440 _movp(T, Src0RM); | 2533 _movp(T, Src0RM); |
2441 _pcmpeq(T, LEGAL_HACK(Src1RM)); | 2534 _pcmpeq(T, Src1RM); |
2442 } break; | 2535 } break; |
2443 case InstIcmp::Ne: { | 2536 case InstIcmp::Ne: { |
2444 _movp(T, Src0RM); | 2537 _movp(T, Src0RM); |
2445 _pcmpeq(T, LEGAL_HACK(Src1RM)); | 2538 _pcmpeq(T, Src1RM); |
2446 Variable *MinusOne = makeVectorOfMinusOnes(Ty); | 2539 Variable *MinusOne = makeVectorOfMinusOnes(Ty); |
2447 _pxor(T, MinusOne); | 2540 _pxor(T, MinusOne); |
2448 } break; | 2541 } break; |
2449 case InstIcmp::Ugt: | 2542 case InstIcmp::Ugt: |
2450 case InstIcmp::Sgt: { | 2543 case InstIcmp::Sgt: { |
2451 _movp(T, Src0RM); | 2544 _movp(T, Src0RM); |
2452 _pcmpgt(T, LEGAL_HACK(Src1RM)); | 2545 _pcmpgt(T, Src1RM); |
2453 } break; | 2546 } break; |
2454 case InstIcmp::Uge: | 2547 case InstIcmp::Uge: |
2455 case InstIcmp::Sge: { | 2548 case InstIcmp::Sge: { |
2456 // !(Src1RM > Src0RM) | 2549 // !(Src1RM > Src0RM) |
2457 _movp(T, Src1RM); | 2550 _movp(T, Src1RM); |
2458 _pcmpgt(T, LEGAL_HACK(Src0RM)); | 2551 _pcmpgt(T, Src0RM); |
2459 Variable *MinusOne = makeVectorOfMinusOnes(Ty); | 2552 Variable *MinusOne = makeVectorOfMinusOnes(Ty); |
2460 _pxor(T, MinusOne); | 2553 _pxor(T, MinusOne); |
2461 } break; | 2554 } break; |
2462 case InstIcmp::Ult: | 2555 case InstIcmp::Ult: |
2463 case InstIcmp::Slt: { | 2556 case InstIcmp::Slt: { |
2464 _movp(T, Src1RM); | 2557 _movp(T, Src1RM); |
2465 _pcmpgt(T, LEGAL_HACK(Src0RM)); | 2558 _pcmpgt(T, Src0RM); |
2466 } break; | 2559 } break; |
2467 case InstIcmp::Ule: | 2560 case InstIcmp::Ule: |
2468 case InstIcmp::Sle: { | 2561 case InstIcmp::Sle: { |
2469 // !(Src0RM > Src1RM) | 2562 // !(Src0RM > Src1RM) |
2470 _movp(T, Src0RM); | 2563 _movp(T, Src0RM); |
2471 _pcmpgt(T, LEGAL_HACK(Src1RM)); | 2564 _pcmpgt(T, Src1RM); |
2472 Variable *MinusOne = makeVectorOfMinusOnes(Ty); | 2565 Variable *MinusOne = makeVectorOfMinusOnes(Ty); |
2473 _pxor(T, MinusOne); | 2566 _pxor(T, MinusOne); |
2474 } break; | 2567 } break; |
2475 } | 2568 } |
2476 #undef LEGAL_HACK | |
2477 | 2569 |
2478 _movp(Dest, T); | 2570 _movp(Dest, T); |
2479 eliminateNextVectorSextInstruction(Dest); | 2571 eliminateNextVectorSextInstruction(Dest); |
2480 return; | 2572 return; |
2481 } | 2573 } |
2482 | 2574 |
2483 // If Src1 is an immediate, or known to be a physical register, we can | 2575 // If Src1 is an immediate, or known to be a physical register, we can |
2484 // allow Src0 to be a memory operand. Otherwise, Src0 must be copied into | 2576 // allow Src0 to be a memory operand. Otherwise, Src0 must be copied into |
2485 // a physical register. (Actually, either Src0 or Src1 can be chosen for | 2577 // a physical register. (Actually, either Src0 or Src1 can be chosen for |
2486 // the physical register, but unfortunately we have to commit to one or | 2578 // the physical register, but unfortunately we have to commit to one or |
(...skipping 155 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2642 // insertelement into index 3 (result is stored in T): | 2734 // insertelement into index 3 (result is stored in T): |
2643 // T := SourceVectRM | 2735 // T := SourceVectRM |
2644 // ElementR := ElementR[0, 0] T[0, 2] | 2736 // ElementR := ElementR[0, 0] T[0, 2] |
2645 // T := T[0, 1] ElementR[3, 0] | 2737 // T := T[0, 1] ElementR[3, 0] |
2646 const unsigned char Mask1[3] = {0, 192, 128}; | 2738 const unsigned char Mask1[3] = {0, 192, 128}; |
2647 const unsigned char Mask2[3] = {227, 196, 52}; | 2739 const unsigned char Mask2[3] = {227, 196, 52}; |
2648 | 2740 |
2649 Constant *Mask1Constant = Ctx->getConstantInt(IceType_i8, Mask1[Index - 1]); | 2741 Constant *Mask1Constant = Ctx->getConstantInt(IceType_i8, Mask1[Index - 1]); |
2650 Constant *Mask2Constant = Ctx->getConstantInt(IceType_i8, Mask2[Index - 1]); | 2742 Constant *Mask2Constant = Ctx->getConstantInt(IceType_i8, Mask2[Index - 1]); |
2651 | 2743 |
2652 // ALIGNHACK: Force vector operands to registers in instructions | |
2653 // that require aligned memory operands until support for data | |
2654 // alignment is implemented. | |
2655 #define ALIGN_HACK(Vect) legalizeToVar((Vect)) | |
2656 if (Index == 1) { | 2744 if (Index == 1) { |
2657 SourceVectRM = ALIGN_HACK(SourceVectRM); | |
2658 _shufps(ElementR, SourceVectRM, Mask1Constant); | 2745 _shufps(ElementR, SourceVectRM, Mask1Constant); |
2659 _shufps(ElementR, SourceVectRM, Mask2Constant); | 2746 _shufps(ElementR, SourceVectRM, Mask2Constant); |
2660 _movp(Inst->getDest(), ElementR); | 2747 _movp(Inst->getDest(), ElementR); |
2661 } else { | 2748 } else { |
2662 Variable *T = makeReg(Ty); | 2749 Variable *T = makeReg(Ty); |
2663 _movp(T, SourceVectRM); | 2750 _movp(T, SourceVectRM); |
2664 _shufps(ElementR, T, Mask1Constant); | 2751 _shufps(ElementR, T, Mask1Constant); |
2665 _shufps(T, ElementR, Mask2Constant); | 2752 _shufps(T, ElementR, Mask2Constant); |
2666 _movp(Inst->getDest(), T); | 2753 _movp(Inst->getDest(), T); |
2667 } | 2754 } |
2668 #undef ALIGN_HACK | |
2669 } else { | 2755 } else { |
2670 assert(Ty == IceType_v16i8 || Ty == IceType_v16i1); | 2756 assert(Ty == IceType_v16i8 || Ty == IceType_v16i1); |
2671 // Spill the value to a stack slot and perform the insertion in | 2757 // Spill the value to a stack slot and perform the insertion in |
2672 // memory. | 2758 // memory. |
2673 // | 2759 // |
2674 // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when | 2760 // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when |
2675 // support for legalizing to mem is implemented. | 2761 // support for legalizing to mem is implemented. |
2676 Variable *Slot = Func->makeVariable(Ty, Context.getNode()); | 2762 Variable *Slot = Func->makeVariable(Ty, Context.getNode()); |
2677 Slot->setWeight(RegWeight::Zero); | 2763 Slot->setWeight(RegWeight::Zero); |
2678 _movp(Slot, legalizeToVar(SourceVectNotLegalized)); | 2764 _movp(Slot, legalizeToVar(SourceVectNotLegalized)); |
(...skipping 941 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3620 Variable *Dest = Inst->getDest(); | 3706 Variable *Dest = Inst->getDest(); |
3621 Operand *SrcT = Inst->getTrueOperand(); | 3707 Operand *SrcT = Inst->getTrueOperand(); |
3622 Operand *SrcF = Inst->getFalseOperand(); | 3708 Operand *SrcF = Inst->getFalseOperand(); |
3623 Operand *Condition = Inst->getCondition(); | 3709 Operand *Condition = Inst->getCondition(); |
3624 | 3710 |
3625 if (isVectorType(Dest->getType())) { | 3711 if (isVectorType(Dest->getType())) { |
3626 Type SrcTy = SrcT->getType(); | 3712 Type SrcTy = SrcT->getType(); |
3627 Variable *T = makeReg(SrcTy); | 3713 Variable *T = makeReg(SrcTy); |
3628 Operand *SrcTRM = legalize(SrcT, Legal_Reg | Legal_Mem); | 3714 Operand *SrcTRM = legalize(SrcT, Legal_Reg | Legal_Mem); |
3629 Operand *SrcFRM = legalize(SrcF, Legal_Reg | Legal_Mem); | 3715 Operand *SrcFRM = legalize(SrcF, Legal_Reg | Legal_Mem); |
3630 // ALIGNHACK: Until data alignment support is implemented, vector | |
3631 // instructions need to have vector operands in registers. Once | |
3632 // there is support for data alignment, LEGAL_HACK can be removed. | |
3633 #define LEGAL_HACK(Vect) legalizeToVar((Vect)) | |
3634 if (InstructionSet >= SSE4_1) { | 3716 if (InstructionSet >= SSE4_1) { |
3635 // TODO(wala): If the condition operand is a constant, use blendps | 3717 // TODO(wala): If the condition operand is a constant, use blendps |
3636 // or pblendw. | 3718 // or pblendw. |
3637 // | 3719 // |
3638 // Use blendvps or pblendvb to implement select. | 3720 // Use blendvps or pblendvb to implement select. |
3639 if (SrcTy == IceType_v4i1 || SrcTy == IceType_v4i32 || | 3721 if (SrcTy == IceType_v4i1 || SrcTy == IceType_v4i32 || |
3640 SrcTy == IceType_v4f32) { | 3722 SrcTy == IceType_v4f32) { |
3641 Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem); | 3723 Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem); |
3642 Variable *xmm0 = makeReg(IceType_v4i32, Reg_xmm0); | 3724 Variable *xmm0 = makeReg(IceType_v4i32, Reg_xmm0); |
3643 _movp(xmm0, ConditionRM); | 3725 _movp(xmm0, ConditionRM); |
3644 _psll(xmm0, Ctx->getConstantInt(IceType_i8, 31)); | 3726 _psll(xmm0, Ctx->getConstantInt(IceType_i8, 31)); |
3645 _movp(T, SrcFRM); | 3727 _movp(T, SrcFRM); |
3646 _blendvps(T, LEGAL_HACK(SrcTRM), xmm0); | 3728 _blendvps(T, SrcTRM, xmm0); |
3647 _movp(Dest, T); | 3729 _movp(Dest, T); |
3648 } else { | 3730 } else { |
3649 assert(typeNumElements(SrcTy) == 8 || typeNumElements(SrcTy) == 16); | 3731 assert(typeNumElements(SrcTy) == 8 || typeNumElements(SrcTy) == 16); |
3650 Type SignExtTy = Condition->getType() == IceType_v8i1 ? IceType_v8i16 | 3732 Type SignExtTy = Condition->getType() == IceType_v8i1 ? IceType_v8i16 |
3651 : IceType_v16i8; | 3733 : IceType_v16i8; |
3652 Variable *xmm0 = makeReg(SignExtTy, Reg_xmm0); | 3734 Variable *xmm0 = makeReg(SignExtTy, Reg_xmm0); |
3653 lowerCast(InstCast::create(Func, InstCast::Sext, xmm0, Condition)); | 3735 lowerCast(InstCast::create(Func, InstCast::Sext, xmm0, Condition)); |
3654 _movp(T, SrcFRM); | 3736 _movp(T, SrcFRM); |
3655 _pblendvb(T, LEGAL_HACK(SrcTRM), xmm0); | 3737 _pblendvb(T, SrcTRM, xmm0); |
3656 _movp(Dest, T); | 3738 _movp(Dest, T); |
3657 } | 3739 } |
3658 return; | 3740 return; |
3659 } | 3741 } |
3660 // Lower select without SSE4.1: | 3742 // Lower select without SSE4.1: |
3661 // a=d?b:c ==> | 3743 // a=d?b:c ==> |
3662 // if elementtype(d) != i1: | 3744 // if elementtype(d) != i1: |
3663 // d=sext(d); | 3745 // d=sext(d); |
3664 // a=(b&d)|(c&~d); | 3746 // a=(b&d)|(c&~d); |
3665 Variable *T2 = makeReg(SrcTy); | 3747 Variable *T2 = makeReg(SrcTy); |
3666 // Sign extend the condition operand if applicable. | 3748 // Sign extend the condition operand if applicable. |
3667 if (SrcTy == IceType_v4f32) { | 3749 if (SrcTy == IceType_v4f32) { |
3668 // The sext operation takes only integer arguments. | 3750 // The sext operation takes only integer arguments. |
3669 Variable *T3 = Func->makeVariable(IceType_v4i32, Context.getNode()); | 3751 Variable *T3 = Func->makeVariable(IceType_v4i32, Context.getNode()); |
3670 lowerCast(InstCast::create(Func, InstCast::Sext, T3, Condition)); | 3752 lowerCast(InstCast::create(Func, InstCast::Sext, T3, Condition)); |
3671 _movp(T, T3); | 3753 _movp(T, T3); |
3672 } else if (typeElementType(SrcTy) != IceType_i1) { | 3754 } else if (typeElementType(SrcTy) != IceType_i1) { |
3673 lowerCast(InstCast::create(Func, InstCast::Sext, T, Condition)); | 3755 lowerCast(InstCast::create(Func, InstCast::Sext, T, Condition)); |
3674 } else { | 3756 } else { |
3675 Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem); | 3757 Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem); |
3676 _movp(T, ConditionRM); | 3758 _movp(T, ConditionRM); |
3677 } | 3759 } |
3678 _movp(T2, T); | 3760 _movp(T2, T); |
3679 _pand(T, LEGAL_HACK(SrcTRM)); | 3761 _pand(T, SrcTRM); |
3680 _pandn(T2, LEGAL_HACK(SrcFRM)); | 3762 _pandn(T2, SrcFRM); |
3681 _por(T, T2); | 3763 _por(T, T2); |
3682 _movp(Dest, T); | 3764 _movp(Dest, T); |
3683 #undef LEGAL_HACK | |
3684 | 3765 |
3685 return; | 3766 return; |
3686 } | 3767 } |
3687 | 3768 |
3688 // a=d?b:c ==> cmp d,0; a=b; jne L1; FakeUse(a); a=c; L1: | 3769 // a=d?b:c ==> cmp d,0; a=b; jne L1; FakeUse(a); a=c; L1: |
3689 Operand *ConditionRMI = legalize(Condition); | 3770 Operand *ConditionRMI = legalize(Condition); |
3690 Constant *Zero = Ctx->getConstantZero(IceType_i32); | 3771 Constant *Zero = Ctx->getConstantZero(IceType_i32); |
3691 InstX8632Label *Label = InstX8632Label::create(Func, this); | 3772 InstX8632Label *Label = InstX8632Label::create(Func, this); |
3692 | 3773 |
3693 if (Dest->getType() == IceType_i64) { | 3774 if (Dest->getType() == IceType_i64) { |
(...skipping 542 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4236 for (SizeT i = 0; i < Size; ++i) { | 4317 for (SizeT i = 0; i < Size; ++i) { |
4237 Str << "\t.byte\t" << (((unsigned)Data[i]) & 0xff) << "\n"; | 4318 Str << "\t.byte\t" << (((unsigned)Data[i]) & 0xff) << "\n"; |
4238 } | 4319 } |
4239 Str << "\t.size\t" << MangledName << ", " << Size << "\n"; | 4320 Str << "\t.size\t" << MangledName << ", " << Size << "\n"; |
4240 } | 4321 } |
4241 Str << "\t" << (IsInternal ? ".local" : ".global") << "\t" << MangledName | 4322 Str << "\t" << (IsInternal ? ".local" : ".global") << "\t" << MangledName |
4242 << "\n"; | 4323 << "\n"; |
4243 } | 4324 } |
4244 | 4325 |
4245 } // end of namespace Ice | 4326 } // end of namespace Ice |
OLD | NEW |