Chromium Code Reviews| Index: src/IceTargetLoweringX8632.cpp |
| diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp |
| index 7630d37624bdd787fcc3f6ce7b2b69913c4fb210..188c6da3541e636c069c77d985b756c60802c1c5 100644 |
| --- a/src/IceTargetLoweringX8632.cpp |
| +++ b/src/IceTargetLoweringX8632.cpp |
| @@ -247,6 +247,11 @@ void TargetX8632::translateO2() { |
| Func->doAddressOpt(); |
| T_doAddressOpt.printElapsedUs(Context, "doAddressOpt()"); |
| + // Argument lowering |
| + Timer T_argLowering; |
| + Func->doArgLowering(); |
| + T_argLowering.printElapsedUs(Context, "lowerArguments()"); |
| + |
| // Target lowering. This requires liveness analysis for some parts |
| // of the lowering decisions, such as compare/branch fusing. If |
| // non-lightweight liveness analysis is used, the instructions need |
| @@ -258,6 +263,7 @@ void TargetX8632::translateO2() { |
| if (Func->hasError()) |
| return; |
| T_renumber1.printElapsedUs(Context, "renumberInstructions()"); |
| + |
| // TODO: It should be sufficient to use the fastest liveness |
| // calculation, i.e. livenessLightweight(). However, for some |
| // reason that slows down the rest of the translation. Investigate. |
| @@ -267,6 +273,7 @@ void TargetX8632::translateO2() { |
| return; |
| T_liveness1.printElapsedUs(Context, "liveness()"); |
| Func->dump("After x86 address mode opt"); |
| + |
| Timer T_genCode; |
| Func->genCode(); |
| if (Func->hasError()) |
| @@ -329,6 +336,10 @@ void TargetX8632::translateOm1() { |
| T_deletePhis.printElapsedUs(Context, "deletePhis()"); |
| Func->dump("After Phi lowering"); |
| + Timer T_argLowering; |
| + Func->doArgLowering(); |
| + T_argLowering.printElapsedUs(Context, "lowerArguments()"); |
| + |
| Timer T_genCode; |
| Func->genCode(); |
| if (Func->hasError()) |
| @@ -412,34 +423,73 @@ void TargetX8632::emitVariable(const Variable *Var, const Cfg *Func) const { |
| Str << "]"; |
| } |
| -// Helper function for addProlog(). Sets the frame offset for Arg, |
| -// updates InArgsSizeBytes according to Arg's width, and generates an |
| -// instruction to copy Arg into its assigned register if applicable. |
| -// For an I64 arg that has been split into Lo and Hi components, it |
| -// calls itself recursively on the components, taking care to handle |
| -// Lo first because of the little-endian architecture. |
| -void TargetX8632::setArgOffsetAndCopy(Variable *Arg, Variable *FramePtr, |
| - size_t BasicFrameOffset, |
| - size_t &InArgsSizeBytes) { |
| +void TargetX8632::lowerArguments() { |
| + VarList &Args = Func->getArgs(); |
| + // The first four arguments of vector type, regardless of their |
| + // position relative to the other arguments in the argument list, are |
| + // passed in registers xmm0 - xmm3. |
| + unsigned NumXmmArgs = 0; |
| + |
| + Context.init(Func->getEntryNode()); |
| + Context.setInsertPoint(Context.getCur()); |
| + |
| + for (SizeT I = 0, E = Args.size(); I < E && NumXmmArgs < 4; ++I) { |
|
jvoung (off chromium)
2014/07/09 04:26:53
Might be worth making the "4" a symbolic constant
Jim Stichnoth
2014/07/09 16:56:13
I agree, except that between x86-32 and x86-64 low… [comment preview truncated]
wala
2014/07/09 19:05:12
Done.
|
| + Variable *Arg = Args[I]; |
| + Type Ty = Arg->getType(); |
| + if (!isVectorType(Ty)) |
| + continue; |
| + // Replace Arg in the argument list with the home register. Then |
| + // generate an instruction in the prolog to copy the home register |
| + // to the assigned location of Arg. |
| + int32_t RegNum = Reg_xmm0 + NumXmmArgs; |
| + ++NumXmmArgs; |
| + IceString Name = "home_reg:" + Arg->getName(); |
| + const CfgNode *DefNode = NULL; |
| + Variable *RegisterArg = Func->makeVariable(Ty, DefNode, Name); |
| + RegisterArg->setRegNum(RegNum); |
| + RegisterArg->setIsArg(Func); |
| + Arg->setIsArg(Func, false); |
| + |
| + Args[I] = RegisterArg; |
| + Context.insert(InstAssign::create(Func, Arg, RegisterArg)); |
| + } |
| +} |
| + |
| +// Helper function for addProlog(). |
| +// |
| +// This assumes Arg is an argument passed on the stack. This sets the |
| +// frame offset for Arg and updates InArgsSizeBytes according to Arg's |
| +// width. For an I64 arg that has been split into Lo and Hi components, |
| +// it calls itself recursively on the components, taking care to handle |
| +// Lo first because of the little-endian architecture. Lastly, this |
| +// function generates an instruction to copy Arg into its assigned |
| +// register if applicable. |
| +void TargetX8632::finishArgumentLowering(Variable *Arg, Variable *FramePtr, |
| + size_t BasicFrameOffset, |
| + size_t &InArgsSizeBytes) { |
| Variable *Lo = Arg->getLo(); |
| Variable *Hi = Arg->getHi(); |
| Type Ty = Arg->getType(); |
| if (Lo && Hi && Ty == IceType_i64) { |
| assert(Lo->getType() != IceType_i64); // don't want infinite recursion |
| assert(Hi->getType() != IceType_i64); // don't want infinite recursion |
| - setArgOffsetAndCopy(Lo, FramePtr, BasicFrameOffset, InArgsSizeBytes); |
| - setArgOffsetAndCopy(Hi, FramePtr, BasicFrameOffset, InArgsSizeBytes); |
| + finishArgumentLowering(Lo, FramePtr, BasicFrameOffset, InArgsSizeBytes); |
| + finishArgumentLowering(Hi, FramePtr, BasicFrameOffset, InArgsSizeBytes); |
| return; |
| } |
| Arg->setStackOffset(BasicFrameOffset + InArgsSizeBytes); |
| + InArgsSizeBytes += typeWidthInBytesOnStack(Ty); |
| if (Arg->hasReg()) { |
| assert(Ty != IceType_i64); |
| OperandX8632Mem *Mem = OperandX8632Mem::create( |
| Func, Ty, FramePtr, |
| Ctx->getConstantInt(IceType_i32, Arg->getStackOffset())); |
| - _mov(Arg, Mem); |
| + if (isVectorType(Arg->getType())) { |
| + _movp(Arg, Mem); |
| + } else { |
| + _mov(Arg, Mem); |
| + } |
| } |
| - InArgsSizeBytes += typeWidthInBytesOnStack(Ty); |
| } |
| Type TargetX8632::stackSlotType() { return IceType_i32; } |
| @@ -489,7 +539,8 @@ void TargetX8632::addProlog(CfgNode *Node) { |
| RegsUsed[Var->getRegNum()] = true; |
| continue; |
| } |
| - // An argument passed on the stack already has a stack slot. |
| + // An argument either does not need a stack slot (if passed in a |
| + // register) or already has one (if passed on the stack). |
| if (Var->getIsArg()) |
| continue; |
| // An unreferenced variable doesn't need a stack slot. |
| @@ -547,23 +598,23 @@ void TargetX8632::addProlog(CfgNode *Node) { |
| resetStackAdjustment(); |
| - // Fill in stack offsets for args, and copy args into registers for |
| - // those that were register-allocated. Args are pushed right to |
| + // Fill in stack offsets for stack args, and copy args into registers |
| + // for those that were register-allocated. Args are pushed right to |
| // left, so Arg[0] is closest to the stack/frame pointer. |
| - // |
| - // TODO: Make this right for different width args, calling |
| - // conventions, etc. For one thing, args passed in registers will |
| - // need to be copied/shuffled to their home registers (the |
| - // RegManager code may have some permutation logic to leverage), |
| - // and if they have no home register, home space will need to be |
| - // allocated on the stack to copy into. |
| Variable *FramePtr = getPhysicalRegister(getFrameOrStackReg()); |
| size_t BasicFrameOffset = PreservedRegsSizeBytes + RetIpSizeBytes; |
| if (!IsEbpBasedFrame) |
| BasicFrameOffset += LocalsSizeBytes; |
| + |
| + unsigned NumXmmArgs = 0; |
| for (SizeT i = 0; i < Args.size(); ++i) { |
| Variable *Arg = Args[i]; |
| - setArgOffsetAndCopy(Arg, FramePtr, BasicFrameOffset, InArgsSizeBytes); |
| + // Skip arguments passed in registers. |
| + if (isVectorType(Arg->getType()) && NumXmmArgs < 4) { |
| + ++NumXmmArgs; |
| + continue; |
| + } |
| + finishArgumentLowering(Arg, FramePtr, BasicFrameOffset, InArgsSizeBytes); |
| } |
| // Fill in stack offsets for locals. |
| @@ -1250,10 +1301,13 @@ void TargetX8632::lowerAssign(const InstAssign *Inst) { |
| _mov(T_Hi, Src0Hi); |
| _mov(DestHi, T_Hi); |
| } else { |
| - const bool AllowOverlap = true; |
| + const bool AllowOverlap = false; |
| // RI is either a physical register or an immediate. |
| Operand *RI = legalize(Src0, Legal_Reg | Legal_Imm, AllowOverlap); |
| - _mov(Dest, RI); |
| + if (isVectorType(Dest->getType())) |
| + _movp(Dest, RI); |
| + else |
| + _mov(Dest, RI); |
| } |
| } |
| @@ -1269,16 +1323,21 @@ void TargetX8632::lowerBr(const InstBr *Inst) { |
| } |
| void TargetX8632::lowerCall(const InstCall *Instr) { |
| - // Generate a sequence of push instructions, pushing right to left, |
| - // keeping track of stack offsets in case a push involves a stack |
| - // operand and we are using an esp-based frame. |
| + // For stack arguments, generate a sequence of push instructions, |
| + // pushing right to left, keeping track of stack offsets in case a |
| + // push involves a stack operand and we are using an esp-based frame. |
| uint32_t StackOffset = 0; |
| + // Keep track of the number of xmm registers that get used to pass |
| + // arguments. |
| + unsigned NumXmmArgs = 0; |
| + VarList RegisterArgs; |
| // TODO: If for some reason the call instruction gets dead-code |
| // eliminated after lowering, we would need to ensure that the |
| // pre-call push instructions and the post-call esp adjustment get |
| // eliminated as well. |
| for (SizeT NumArgs = Instr->getNumArgs(), i = 0; i < NumArgs; ++i) { |
| Operand *Arg = legalize(Instr->getArg(NumArgs - i - 1)); |
| + bool ArgInRegister = false; |
| if (Arg->getType() == IceType_i64) { |
| _push(hiOperand(Arg)); |
| _push(loOperand(Arg)); |
| @@ -1294,18 +1353,45 @@ void TargetX8632::lowerCall(const InstCall *Instr) { |
| Variable *T = NULL; |
| _mov(T, Arg); |
| _push(T); |
| + } else if (isVectorType(Arg->getType())) { |
| + if (NumXmmArgs < 4) { |
| + Variable *Reg = legalizeToVar(Arg, false, Reg_xmm0 + NumXmmArgs); |
| + ++NumXmmArgs; |
| + ArgInRegister = true; |
| + RegisterArgs.push_back(Reg); |
| + } else { |
| + // sub esp, 16 |
| + // movups [esp], legalize_to_reg(Arg) |
| + Variable *esp = getPhysicalRegister(Reg_esp); |
| + size_t Width = typeWidthInBytesOnStack(Arg->getType()); |
| + _sub(esp, Ctx->getConstantInt(IceType_i8, Width)); |
| + Constant *Zero = Ctx->getConstantZero(IceType_i8); |
| + OperandX8632Mem *Dest = |
| + OperandX8632Mem::create(Func, Arg->getType(), esp, Zero); |
| + _storep(legalize(Arg, Legal_Reg), Dest); |
| + } |
| } else { |
| // Otherwise PNaCl requires parameter types to be at least 32-bits. |
| assert(Arg->getType() == IceType_f32 || Arg->getType() == IceType_i32); |
| _push(Arg); |
| } |
| - StackOffset += typeWidthInBytesOnStack(Arg->getType()); |
| + if (!ArgInRegister) { |
| + StackOffset += typeWidthInBytesOnStack(Arg->getType()); |
| + } |
| + } |
| + // Generate a FakeUse of all register arguments so that they do not |
| + // get dead code eliminated as a result of the FakeKill of scratch |
| + // registers after the call. |
|
Jim Stichnoth
2014/07/09 16:56:13
I think it wouldn't hurt to assert(RegisterArgs.si… [comment preview truncated]
wala
2014/07/09 19:05:12
Unnecessary in the revised code.
|
| + for (VarList::const_iterator I = RegisterArgs.begin(), E = RegisterArgs.end(); |
| + I != E; ++I) { |
| + Context.insert(InstFakeUse::create(Func, *I)); |
| } |
| // Generate the call instruction. Assign its result to a temporary |
| // with high register allocation weight. |
| Variable *Dest = Instr->getDest(); |
| - Variable *eax = NULL; // doubles as RegLo as necessary |
| - Variable *edx = NULL; |
| + // ReturnReg doubles as ReturnRegLo as necessary. |
| + Variable *ReturnReg = NULL; |
| + Variable *ReturnRegHi = NULL; |
| if (Dest) { |
| switch (Dest->getType()) { |
| case IceType_NUM: |
| @@ -1317,16 +1403,16 @@ void TargetX8632::lowerCall(const InstCall *Instr) { |
| case IceType_i8: |
| case IceType_i16: |
| case IceType_i32: |
| - eax = makeReg(Dest->getType(), Reg_eax); |
| + ReturnReg = makeReg(Dest->getType(), Reg_eax); |
| break; |
| case IceType_i64: |
| - eax = makeReg(IceType_i32, Reg_eax); |
| - edx = makeReg(IceType_i32, Reg_edx); |
| + ReturnReg = makeReg(IceType_i32, Reg_eax); |
| + ReturnRegHi = makeReg(IceType_i32, Reg_edx); |
| break; |
| case IceType_f32: |
| case IceType_f64: |
| - // Leave eax==edx==NULL, and capture the result with the fstp |
| - // instruction. |
| + // Leave ReturnReg==ReturnRegHi==NULL, and capture the result with |
| + // the fstp instruction. |
| break; |
| case IceType_v4i1: |
| case IceType_v8i1: |
| @@ -1334,24 +1420,18 @@ void TargetX8632::lowerCall(const InstCall *Instr) { |
| case IceType_v16i8: |
| case IceType_v8i16: |
| case IceType_v4i32: |
| - case IceType_v4f32: { |
| - // TODO(wala): Handle return values of vector type in the caller. |
| - IceString Ty; |
| - llvm::raw_string_ostream BaseOS(Ty); |
| - Ostream OS(&BaseOS); |
| - OS << Dest->getType(); |
| - Func->setError("Unhandled dest type: " + BaseOS.str()); |
| - return; |
| - } |
| + case IceType_v4f32: |
| + ReturnReg = makeReg(Dest->getType(), Reg_xmm0); |
| + break; |
| } |
| } |
| // TODO(stichnot): LEAHACK: remove Legal_All (and use default) once |
| // a proper emitter is used. |
| Operand *CallTarget = legalize(Instr->getCallTarget(), Legal_All); |
| - Inst *NewCall = InstX8632Call::create(Func, eax, CallTarget); |
| + Inst *NewCall = InstX8632Call::create(Func, ReturnReg, CallTarget); |
| Context.insert(NewCall); |
| - if (edx) |
| - Context.insert(InstFakeDef::create(Func, edx)); |
| + if (ReturnRegHi) |
| + Context.insert(InstFakeDef::create(Func, ReturnRegHi)); |
| // Add the appropriate offset to esp. |
| if (StackOffset) { |
| @@ -1368,34 +1448,41 @@ void TargetX8632::lowerCall(const InstCall *Instr) { |
| Context.insert(InstFakeKill::create(Func, KilledRegs, NewCall)); |
| // Generate a FakeUse to keep the call live if necessary. |
| - if (Instr->hasSideEffects() && eax) { |
| - Inst *FakeUse = InstFakeUse::create(Func, eax); |
| + if (Instr->hasSideEffects() && ReturnReg) { |
| + Inst *FakeUse = InstFakeUse::create(Func, ReturnReg); |
| Context.insert(FakeUse); |
| } |
| - // Generate Dest=eax assignment. |
| - if (Dest && eax) { |
| - if (edx) { |
| + // Assign the result of the call to Dest. |
| + if (!Dest) { |
| + return; |
| + } else if (ReturnReg) { |
|
Jim Stichnoth
2014/07/09 16:56:13
Don't use "else if" here.
http://llvm.org/docs/CodingStandards.html#don-t-use-else-after-a-return
wala
2014/07/09 19:05:12
Done.
|
| + if (ReturnRegHi) { |
| + assert(Dest->getType() == IceType_i64); |
| split64(Dest); |
| Variable *DestLo = Dest->getLo(); |
| Variable *DestHi = Dest->getHi(); |
| - DestLo->setPreferredRegister(eax, false); |
| - DestHi->setPreferredRegister(edx, false); |
| - _mov(DestLo, eax); |
| - _mov(DestHi, edx); |
| + DestLo->setPreferredRegister(ReturnReg, false); |
| + DestHi->setPreferredRegister(ReturnRegHi, false); |
| + _mov(DestLo, ReturnReg); |
| + _mov(DestHi, ReturnRegHi); |
| } else { |
| - Dest->setPreferredRegister(eax, false); |
| - _mov(Dest, eax); |
| + assert(Dest->getType() == IceType_i32 || Dest->getType() == IceType_i16 || |
| + Dest->getType() == IceType_i8 || Dest->getType() == IceType_i1 || |
| + isVectorType(Dest->getType())); |
| + Dest->setPreferredRegister(ReturnReg, false); |
| + if (isVectorType(Dest->getType())) { |
| + _movp(Dest, ReturnReg); |
| + } else { |
| + _mov(Dest, ReturnReg); |
| + } |
| } |
| - } |
| - |
| - // Special treatment for an FP function which returns its result in |
| - // st(0). |
| - if (Dest && |
| - (Dest->getType() == IceType_f32 || Dest->getType() == IceType_f64)) { |
| + } else if (Dest->getType() == IceType_f32 || Dest->getType() == IceType_f64) { |
| + // Special treatment for an FP function which returns its result in |
| + // st(0). |
| _fstp(Dest); |
| - // If Dest ends up being a physical xmm register, the fstp emit |
| - // code will route st(0) through a temporary stack slot. |
| + // If Dest ends up being a physical xmm register, the fstp emit code |
| + // will route st(0) through a temporary stack slot. |
| } |
| } |