Index: src/IceTargetLoweringX8632.cpp |
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp |
index 7630d37624bdd787fcc3f6ce7b2b69913c4fb210..bdaf9346e592cb847fc56d77a9f4b0848dc4a07c 100644 |
--- a/src/IceTargetLoweringX8632.cpp |
+++ b/src/IceTargetLoweringX8632.cpp |
@@ -85,6 +85,9 @@ InstX8632Br::BrCond getIcmp32Mapping(InstIcmp::ICond Cond) { |
return TableIcmp32[Index].Mapping; |
} |
+// Maximum number of vector-typed arguments passed in XMM registers. |
+const unsigned X86_MAX_XMM_ARGS = 4; |
+ |
// In some cases, there are x-macros tables for both high-level and |
// low-level instructions/operands that use the same enum key value. |
// The tables are kept separate to maintain a proper separation |
@@ -247,6 +250,11 @@ void TargetX8632::translateO2() { |
Func->doAddressOpt(); |
T_doAddressOpt.printElapsedUs(Context, "doAddressOpt()"); |
+ // Argument lowering |
+ Timer T_argLowering; |
+ Func->doArgLowering(); |
+ T_argLowering.printElapsedUs(Context, "lowerArguments()"); |
+ |
// Target lowering. This requires liveness analysis for some parts |
// of the lowering decisions, such as compare/branch fusing. If |
// non-lightweight liveness analysis is used, the instructions need |
@@ -258,6 +266,7 @@ void TargetX8632::translateO2() { |
if (Func->hasError()) |
return; |
T_renumber1.printElapsedUs(Context, "renumberInstructions()"); |
+ |
// TODO: It should be sufficient to use the fastest liveness |
// calculation, i.e. livenessLightweight(). However, for some |
// reason that slows down the rest of the translation. Investigate. |
@@ -267,6 +276,7 @@ void TargetX8632::translateO2() { |
return; |
T_liveness1.printElapsedUs(Context, "liveness()"); |
Func->dump("After x86 address mode opt"); |
+ |
Timer T_genCode; |
Func->genCode(); |
if (Func->hasError()) |
@@ -329,6 +339,10 @@ void TargetX8632::translateOm1() { |
T_deletePhis.printElapsedUs(Context, "deletePhis()"); |
Func->dump("After Phi lowering"); |
+ Timer T_argLowering; |
+ Func->doArgLowering(); |
+ T_argLowering.printElapsedUs(Context, "lowerArguments()"); |
+ |
Timer T_genCode; |
Func->genCode(); |
if (Func->hasError()) |
@@ -412,34 +426,74 @@ void TargetX8632::emitVariable(const Variable *Var, const Cfg *Func) const { |
Str << "]"; |
} |
-// Helper function for addProlog(). Sets the frame offset for Arg, |
-// updates InArgsSizeBytes according to Arg's width, and generates an |
-// instruction to copy Arg into its assigned register if applicable. |
-// For an I64 arg that has been split into Lo and Hi components, it |
-// calls itself recursively on the components, taking care to handle |
-// Lo first because of the little-endian architecture. |
-void TargetX8632::setArgOffsetAndCopy(Variable *Arg, Variable *FramePtr, |
- size_t BasicFrameOffset, |
- size_t &InArgsSizeBytes) { |
+void TargetX8632::lowerArguments() { |
+ VarList &Args = Func->getArgs(); |
+ // The first X86_MAX_XMM_ARGS (four) arguments of vector type, |
+ // regardless of their position relative to the other arguments in the |
+ // argument list, are passed in registers xmm0 - xmm3. |
+ unsigned NumXmmArgs = 0; |
+ |
+ Context.init(Func->getEntryNode()); |
+ Context.setInsertPoint(Context.getCur()); |
+ |
+ for (SizeT I = 0, E = Args.size(); I < E && NumXmmArgs < X86_MAX_XMM_ARGS; |
+ ++I) { |
+ Variable *Arg = Args[I]; |
+ Type Ty = Arg->getType(); |
+ if (!isVectorType(Ty)) |
+ continue; |
+ // Replace Arg in the argument list with the home register. Then |
+ // insert an assignment into the entry node that copies the home |
+ // register to the assigned location of Arg. |
+ int32_t RegNum = Reg_xmm0 + NumXmmArgs; |
+ ++NumXmmArgs; |
+ IceString Name = "home_reg:" + Arg->getName(); |
+ const CfgNode *DefNode = NULL; |
+ Variable *RegisterArg = Func->makeVariable(Ty, DefNode, Name); |
+ RegisterArg->setRegNum(RegNum); |
+ RegisterArg->setIsArg(Func); |
+ Arg->setIsArg(Func, false); |
+ |
+ Args[I] = RegisterArg; |
+ Context.insert(InstAssign::create(Func, Arg, RegisterArg)); |
+ } |
+} |
+ |
+// Helper function for addProlog(). |
+// |
+// This assumes Arg is an argument passed on the stack. This sets the |
+// frame offset for Arg and advances InArgsSizeBytes by Arg's on-stack |
+// width. For an I64 arg that has been split into Lo and Hi components, |
+// it calls itself recursively on the components, taking care to handle |
+// Lo first because of the little-endian architecture. Lastly, if Arg |
+// was assigned a register, this function generates an instruction to |
+// load Arg from its stack slot (movp for vector types, mov otherwise). |
+void TargetX8632::finishArgumentLowering(Variable *Arg, Variable *FramePtr, |
+ size_t BasicFrameOffset, |
+ size_t &InArgsSizeBytes) { |
Variable *Lo = Arg->getLo(); |
Variable *Hi = Arg->getHi(); |
Type Ty = Arg->getType(); |
if (Lo && Hi && Ty == IceType_i64) { |
assert(Lo->getType() != IceType_i64); // don't want infinite recursion |
assert(Hi->getType() != IceType_i64); // don't want infinite recursion |
- setArgOffsetAndCopy(Lo, FramePtr, BasicFrameOffset, InArgsSizeBytes); |
- setArgOffsetAndCopy(Hi, FramePtr, BasicFrameOffset, InArgsSizeBytes); |
+ finishArgumentLowering(Lo, FramePtr, BasicFrameOffset, InArgsSizeBytes); |
+ finishArgumentLowering(Hi, FramePtr, BasicFrameOffset, InArgsSizeBytes); |
return; |
} |
Arg->setStackOffset(BasicFrameOffset + InArgsSizeBytes); |
+ InArgsSizeBytes += typeWidthInBytesOnStack(Ty); |
if (Arg->hasReg()) { |
assert(Ty != IceType_i64); |
OperandX8632Mem *Mem = OperandX8632Mem::create( |
Func, Ty, FramePtr, |
Ctx->getConstantInt(IceType_i32, Arg->getStackOffset())); |
- _mov(Arg, Mem); |
+ if (isVectorType(Arg->getType())) { |
+ _movp(Arg, Mem); |
+ } else { |
+ _mov(Arg, Mem); |
+ } |
} |
- InArgsSizeBytes += typeWidthInBytesOnStack(Ty); |
} |
Type TargetX8632::stackSlotType() { return IceType_i32; } |
@@ -489,7 +543,8 @@ void TargetX8632::addProlog(CfgNode *Node) { |
RegsUsed[Var->getRegNum()] = true; |
continue; |
} |
- // An argument passed on the stack already has a stack slot. |
+ // An argument either does not need a stack slot (if passed in a |
+ // register) or already has one (if passed on the stack). |
if (Var->getIsArg()) |
continue; |
// An unreferenced variable doesn't need a stack slot. |
@@ -547,23 +602,23 @@ void TargetX8632::addProlog(CfgNode *Node) { |
resetStackAdjustment(); |
- // Fill in stack offsets for args, and copy args into registers for |
- // those that were register-allocated. Args are pushed right to |
+ // Fill in stack offsets for stack args, and copy args into registers |
+ // for those that were register-allocated. Args are pushed right to |
// left, so Arg[0] is closest to the stack/frame pointer. |
- // |
- // TODO: Make this right for different width args, calling |
- // conventions, etc. For one thing, args passed in registers will |
- // need to be copied/shuffled to their home registers (the |
- // RegManager code may have some permutation logic to leverage), |
- // and if they have no home register, home space will need to be |
- // allocated on the stack to copy into. |
Variable *FramePtr = getPhysicalRegister(getFrameOrStackReg()); |
size_t BasicFrameOffset = PreservedRegsSizeBytes + RetIpSizeBytes; |
if (!IsEbpBasedFrame) |
BasicFrameOffset += LocalsSizeBytes; |
+ |
+ unsigned NumXmmArgs = 0; |
for (SizeT i = 0; i < Args.size(); ++i) { |
Variable *Arg = Args[i]; |
- setArgOffsetAndCopy(Arg, FramePtr, BasicFrameOffset, InArgsSizeBytes); |
+ // Skip arguments passed in registers. |
+ if (isVectorType(Arg->getType()) && NumXmmArgs < X86_MAX_XMM_ARGS) { |
+ ++NumXmmArgs; |
+ continue; |
+ } |
+ finishArgumentLowering(Arg, FramePtr, BasicFrameOffset, InArgsSizeBytes); |
} |
// Fill in stack offsets for locals. |
@@ -1253,7 +1308,10 @@ void TargetX8632::lowerAssign(const InstAssign *Inst) { |
const bool AllowOverlap = true; |
// RI is either a physical register or an immediate. |
Operand *RI = legalize(Src0, Legal_Reg | Legal_Imm, AllowOverlap); |
- _mov(Dest, RI); |
+ if (isVectorType(Dest->getType())) |
+ _movp(Dest, RI); |
+ else |
+ _mov(Dest, RI); |
} |
} |
@@ -1269,31 +1327,44 @@ void TargetX8632::lowerBr(const InstBr *Inst) { |
} |
void TargetX8632::lowerCall(const InstCall *Instr) { |
- // Generate a sequence of push instructions, pushing right to left, |
- // keeping track of stack offsets in case a push involves a stack |
- // operand and we are using an esp-based frame. |
+ // Classify each argument operand according to the location where the |
+ // argument is passed. |
+ OperandList XmmArgs; |
+ OperandList StackArgs; |
+ for (SizeT i = 0, NumArgs = Instr->getNumArgs(); i < NumArgs; ++i) { |
+ Operand *Arg = Instr->getArg(i); |
+ if (isVectorType(Arg->getType()) && XmmArgs.size() < X86_MAX_XMM_ARGS) { |
+ XmmArgs.push_back(Arg); |
+ } else { |
+ StackArgs.push_back(Arg); |
+ } |
+ } |
+ // For stack arguments, generate a sequence of push instructions, |
+ // pushing right to left, keeping track of stack offsets in case a |
+ // push involves a stack operand and we are using an esp-based frame. |
uint32_t StackOffset = 0; |
+ // TODO: Consolidate the stack adjustment for function calls by |
+ // reserving enough space for the arguments only once. |
+ // |
// TODO: If for some reason the call instruction gets dead-code |
// eliminated after lowering, we would need to ensure that the |
// pre-call push instructions and the post-call esp adjustment get |
// eliminated as well. |
- for (SizeT NumArgs = Instr->getNumArgs(), i = 0; i < NumArgs; ++i) { |
- Operand *Arg = legalize(Instr->getArg(NumArgs - i - 1)); |
+ for (OperandList::reverse_iterator I = StackArgs.rbegin(), |
+ E = StackArgs.rend(); I != E; ++I) { |
+ Operand *Arg = legalize(*I); |
if (Arg->getType() == IceType_i64) { |
_push(hiOperand(Arg)); |
_push(loOperand(Arg)); |
- } else if (Arg->getType() == IceType_f64) { |
- // If the Arg turns out to be a memory operand, we need to push |
- // 8 bytes, which requires two push instructions. This ends up |
- // being somewhat clumsy in the current IR, so we use a |
- // workaround. Force the operand into a (xmm) register, and |
- // then push the register. An xmm register push is actually not |
- // possible in x86, but the Push instruction emitter handles |
- // this by decrementing the stack pointer and directly writing |
- // the xmm register value. |
- Variable *T = NULL; |
- _mov(T, Arg); |
- _push(T); |
+ } else if (Arg->getType() == IceType_f64 || isVectorType(Arg->getType())) { |
+ // If the Arg turns out to be a memory operand, more than one push |
+ // instruction is required. This ends up being somewhat clumsy in |
+ // the current IR, so we use a workaround. Force the operand into |
+ // a (xmm) register, and then push the register. An xmm register |
+ // push is actually not possible in x86, but the Push instruction |
+ // emitter handles this by decrementing the stack pointer and |
+ // directly writing the xmm register value. |
+ _push(legalize(Arg, Legal_Reg)); |
} else { |
// Otherwise PNaCl requires parameter types to be at least 32-bits. |
assert(Arg->getType() == IceType_f32 || Arg->getType() == IceType_i32); |
@@ -1301,11 +1372,28 @@ void TargetX8632::lowerCall(const InstCall *Instr) { |
} |
StackOffset += typeWidthInBytesOnStack(Arg->getType()); |
} |
+ // Copy arguments to be passed in registers to the appropriate |
+ // registers. |
+ // TODO: Investigate the impact of lowering arguments passed in |
+ // registers after lowering stack arguments as opposed to the other |
+ // way around. Lowering register arguments after stack arguments may |
+ // reduce register pressure. On the other hand, lowering register |
+ // arguments first (before stack arguments) may result in more compact |
+ // code, as the memory operand displacements may end up being smaller |
+ // before any stack adjustment is done. |
+ for (SizeT i = 0, NumXmmArgs = XmmArgs.size(); i < NumXmmArgs; ++i) { |
+ Variable *Reg = legalizeToVar(XmmArgs[i], false, Reg_xmm0 + i); |
+ // Generate a FakeUse of register arguments so that they do not get |
+ // dead code eliminated as a result of the FakeKill of scratch |
+ // registers after the call. |
+ Context.insert(InstFakeUse::create(Func, Reg)); |
+ } |
// Generate the call instruction. Assign its result to a temporary |
// with high register allocation weight. |
Variable *Dest = Instr->getDest(); |
- Variable *eax = NULL; // doubles as RegLo as necessary |
- Variable *edx = NULL; |
+ // ReturnReg doubles as ReturnRegLo as necessary. |
+ Variable *ReturnReg = NULL; |
+ Variable *ReturnRegHi = NULL; |
if (Dest) { |
switch (Dest->getType()) { |
case IceType_NUM: |
@@ -1317,16 +1405,16 @@ void TargetX8632::lowerCall(const InstCall *Instr) { |
case IceType_i8: |
case IceType_i16: |
case IceType_i32: |
- eax = makeReg(Dest->getType(), Reg_eax); |
+ ReturnReg = makeReg(Dest->getType(), Reg_eax); |
break; |
case IceType_i64: |
- eax = makeReg(IceType_i32, Reg_eax); |
- edx = makeReg(IceType_i32, Reg_edx); |
+ ReturnReg = makeReg(IceType_i32, Reg_eax); |
+ ReturnRegHi = makeReg(IceType_i32, Reg_edx); |
break; |
case IceType_f32: |
case IceType_f64: |
- // Leave eax==edx==NULL, and capture the result with the fstp |
- // instruction. |
+ // Leave ReturnReg==ReturnRegHi==NULL, and capture the result with |
+ // the fstp instruction. |
break; |
case IceType_v4i1: |
case IceType_v8i1: |
@@ -1334,24 +1422,18 @@ void TargetX8632::lowerCall(const InstCall *Instr) { |
case IceType_v16i8: |
case IceType_v8i16: |
case IceType_v4i32: |
- case IceType_v4f32: { |
- // TODO(wala): Handle return values of vector type in the caller. |
- IceString Ty; |
- llvm::raw_string_ostream BaseOS(Ty); |
- Ostream OS(&BaseOS); |
- OS << Dest->getType(); |
- Func->setError("Unhandled dest type: " + BaseOS.str()); |
- return; |
- } |
+ case IceType_v4f32: |
+ ReturnReg = makeReg(Dest->getType(), Reg_xmm0); |
+ break; |
} |
} |
// TODO(stichnot): LEAHACK: remove Legal_All (and use default) once |
// a proper emitter is used. |
Operand *CallTarget = legalize(Instr->getCallTarget(), Legal_All); |
- Inst *NewCall = InstX8632Call::create(Func, eax, CallTarget); |
+ Inst *NewCall = InstX8632Call::create(Func, ReturnReg, CallTarget); |
Context.insert(NewCall); |
- if (edx) |
- Context.insert(InstFakeDef::create(Func, edx)); |
+ if (ReturnRegHi) |
+ Context.insert(InstFakeDef::create(Func, ReturnRegHi)); |
// Add the appropriate offset to esp. |
if (StackOffset) { |
@@ -1368,34 +1450,42 @@ void TargetX8632::lowerCall(const InstCall *Instr) { |
Context.insert(InstFakeKill::create(Func, KilledRegs, NewCall)); |
// Generate a FakeUse to keep the call live if necessary. |
- if (Instr->hasSideEffects() && eax) { |
- Inst *FakeUse = InstFakeUse::create(Func, eax); |
+ if (Instr->hasSideEffects() && ReturnReg) { |
+ Inst *FakeUse = InstFakeUse::create(Func, ReturnReg); |
Context.insert(FakeUse); |
} |
+ |
+ if (!Dest) |
+ return; |
- // Generate Dest=eax assignment. |
- if (Dest && eax) { |
- if (edx) { |
+ // Assign the result of the call to Dest. |
+ if (ReturnReg) { |
+ if (ReturnRegHi) { |
+ assert(Dest->getType() == IceType_i64); |
split64(Dest); |
Variable *DestLo = Dest->getLo(); |
Variable *DestHi = Dest->getHi(); |
- DestLo->setPreferredRegister(eax, false); |
- DestHi->setPreferredRegister(edx, false); |
- _mov(DestLo, eax); |
- _mov(DestHi, edx); |
+ DestLo->setPreferredRegister(ReturnReg, false); |
+ DestHi->setPreferredRegister(ReturnRegHi, false); |
+ _mov(DestLo, ReturnReg); |
+ _mov(DestHi, ReturnRegHi); |
} else { |
- Dest->setPreferredRegister(eax, false); |
- _mov(Dest, eax); |
+ assert(Dest->getType() == IceType_i32 || Dest->getType() == IceType_i16 || |
+ Dest->getType() == IceType_i8 || Dest->getType() == IceType_i1 || |
+ isVectorType(Dest->getType())); |
+ Dest->setPreferredRegister(ReturnReg, false); |
+ if (isVectorType(Dest->getType())) { |
+ _movp(Dest, ReturnReg); |
+ } else { |
+ _mov(Dest, ReturnReg); |
+ } |
} |
- } |
- |
- // Special treatment for an FP function which returns its result in |
- // st(0). |
- if (Dest && |
- (Dest->getType() == IceType_f32 || Dest->getType() == IceType_f64)) { |
+ } else if (Dest->getType() == IceType_f32 || Dest->getType() == IceType_f64) { |
+ // Special treatment for an FP function which returns its result in |
+ // st(0). |
_fstp(Dest); |
- // If Dest ends up being a physical xmm register, the fstp emit |
- // code will route st(0) through a temporary stack slot. |
+ // If Dest ends up being a physical xmm register, the fstp emit code |
+ // will route st(0) through a temporary stack slot. |
} |
} |