Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(13)

Unified Diff: src/IceTargetLoweringX8632.cpp

Issue 372113005: Add support for passing and returning vectors in accordance with the x86 calling convention. (Closed) Base URL: https://gerrit.chromium.org/gerrit/p/native_client/pnacl-subzero.git@master
Patch Set: Reset AllowOverlap, add comment on lowerCall() strategies, and use X86_MAX_XMM_ARGS where appropria… Created 6 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « src/IceTargetLoweringX8632.h ('k') | tests_lit/llvm2ice_tests/vector-arg.ll » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: src/IceTargetLoweringX8632.cpp
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 7630d37624bdd787fcc3f6ce7b2b69913c4fb210..bdaf9346e592cb847fc56d77a9f4b0848dc4a07c 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -85,6 +85,9 @@ InstX8632Br::BrCond getIcmp32Mapping(InstIcmp::ICond Cond) {
return TableIcmp32[Index].Mapping;
}
+// The maximum number of arguments to pass in XMM registers
+const unsigned X86_MAX_XMM_ARGS = 4;
+
// In some cases, there are x-macros tables for both high-level and
// low-level instructions/operands that use the same enum key value.
// The tables are kept separate to maintain a proper separation
@@ -247,6 +250,11 @@ void TargetX8632::translateO2() {
Func->doAddressOpt();
T_doAddressOpt.printElapsedUs(Context, "doAddressOpt()");
+ // Argument lowering
+ Timer T_argLowering;
+ Func->doArgLowering();
+ T_argLowering.printElapsedUs(Context, "lowerArguments()");
+
// Target lowering. This requires liveness analysis for some parts
// of the lowering decisions, such as compare/branch fusing. If
// non-lightweight liveness analysis is used, the instructions need
@@ -258,6 +266,7 @@ void TargetX8632::translateO2() {
if (Func->hasError())
return;
T_renumber1.printElapsedUs(Context, "renumberInstructions()");
+
// TODO: It should be sufficient to use the fastest liveness
// calculation, i.e. livenessLightweight(). However, for some
// reason that slows down the rest of the translation. Investigate.
@@ -267,6 +276,7 @@ void TargetX8632::translateO2() {
return;
T_liveness1.printElapsedUs(Context, "liveness()");
Func->dump("After x86 address mode opt");
+
Timer T_genCode;
Func->genCode();
if (Func->hasError())
@@ -329,6 +339,10 @@ void TargetX8632::translateOm1() {
T_deletePhis.printElapsedUs(Context, "deletePhis()");
Func->dump("After Phi lowering");
+ Timer T_argLowering;
+ Func->doArgLowering();
+ T_argLowering.printElapsedUs(Context, "lowerArguments()");
+
Timer T_genCode;
Func->genCode();
if (Func->hasError())
@@ -412,34 +426,74 @@ void TargetX8632::emitVariable(const Variable *Var, const Cfg *Func) const {
Str << "]";
}
-// Helper function for addProlog(). Sets the frame offset for Arg,
-// updates InArgsSizeBytes according to Arg's width, and generates an
-// instruction to copy Arg into its assigned register if applicable.
-// For an I64 arg that has been split into Lo and Hi components, it
-// calls itself recursively on the components, taking care to handle
-// Lo first because of the little-endian architecture.
-void TargetX8632::setArgOffsetAndCopy(Variable *Arg, Variable *FramePtr,
- size_t BasicFrameOffset,
- size_t &InArgsSizeBytes) {
+void TargetX8632::lowerArguments() {
+ VarList &Args = Func->getArgs();
+ // The first four arguments of vector type, regardless of their
+ // position relative to the other arguments in the argument list, are
+ // passed in registers xmm0 - xmm3.
+ unsigned NumXmmArgs = 0;
+
+ Context.init(Func->getEntryNode());
+ Context.setInsertPoint(Context.getCur());
+
+ for (SizeT I = 0, E = Args.size(); I < E && NumXmmArgs < X86_MAX_XMM_ARGS;
+ ++I) {
+ Variable *Arg = Args[I];
+ Type Ty = Arg->getType();
+ if (!isVectorType(Ty))
+ continue;
+ // Replace Arg in the argument list with the home register. Then
+ // generate an instruction in the prolog to copy the home register
+ // to the assigned location of Arg.
+ int32_t RegNum = Reg_xmm0 + NumXmmArgs;
+ ++NumXmmArgs;
+ IceString Name = "home_reg:" + Arg->getName();
+ const CfgNode *DefNode = NULL;
+ Variable *RegisterArg = Func->makeVariable(Ty, DefNode, Name);
+ RegisterArg->setRegNum(RegNum);
+ RegisterArg->setIsArg(Func);
+ Arg->setIsArg(Func, false);
+
+ Args[I] = RegisterArg;
+ Context.insert(InstAssign::create(Func, Arg, RegisterArg));
+ }
+}
+
+// Helper function for addProlog().
+//
+// This assumes Arg is an argument passed on the stack. This sets the
+// frame offset for Arg and updates InArgsSizeBytes according to Arg's
+// width. For an I64 arg that has been split into Lo and Hi components,
+// it calls itself recursively on the components, taking care to handle
+// Lo first because of the little-endian architecture. Lastly, this
+// function generates an instruction to copy Arg into its assigned
+// register if applicable.
+void TargetX8632::finishArgumentLowering(Variable *Arg, Variable *FramePtr,
+ size_t BasicFrameOffset,
+ size_t &InArgsSizeBytes) {
Variable *Lo = Arg->getLo();
Variable *Hi = Arg->getHi();
Type Ty = Arg->getType();
if (Lo && Hi && Ty == IceType_i64) {
assert(Lo->getType() != IceType_i64); // don't want infinite recursion
assert(Hi->getType() != IceType_i64); // don't want infinite recursion
- setArgOffsetAndCopy(Lo, FramePtr, BasicFrameOffset, InArgsSizeBytes);
- setArgOffsetAndCopy(Hi, FramePtr, BasicFrameOffset, InArgsSizeBytes);
+ finishArgumentLowering(Lo, FramePtr, BasicFrameOffset, InArgsSizeBytes);
+ finishArgumentLowering(Hi, FramePtr, BasicFrameOffset, InArgsSizeBytes);
return;
}
Arg->setStackOffset(BasicFrameOffset + InArgsSizeBytes);
+ InArgsSizeBytes += typeWidthInBytesOnStack(Ty);
if (Arg->hasReg()) {
assert(Ty != IceType_i64);
OperandX8632Mem *Mem = OperandX8632Mem::create(
Func, Ty, FramePtr,
Ctx->getConstantInt(IceType_i32, Arg->getStackOffset()));
- _mov(Arg, Mem);
+ if (isVectorType(Arg->getType())) {
+ _movp(Arg, Mem);
+ } else {
+ _mov(Arg, Mem);
+ }
}
- InArgsSizeBytes += typeWidthInBytesOnStack(Ty);
}
Type TargetX8632::stackSlotType() { return IceType_i32; }
@@ -489,7 +543,8 @@ void TargetX8632::addProlog(CfgNode *Node) {
RegsUsed[Var->getRegNum()] = true;
continue;
}
- // An argument passed on the stack already has a stack slot.
+ // An argument either does not need a stack slot (if passed in a
+ // register) or already has one (if passed on the stack).
if (Var->getIsArg())
continue;
// An unreferenced variable doesn't need a stack slot.
@@ -547,23 +602,23 @@ void TargetX8632::addProlog(CfgNode *Node) {
resetStackAdjustment();
- // Fill in stack offsets for args, and copy args into registers for
- // those that were register-allocated. Args are pushed right to
+ // Fill in stack offsets for stack args, and copy args into registers
+ // for those that were register-allocated. Args are pushed right to
// left, so Arg[0] is closest to the stack/frame pointer.
- //
- // TODO: Make this right for different width args, calling
- // conventions, etc. For one thing, args passed in registers will
- // need to be copied/shuffled to their home registers (the
- // RegManager code may have some permutation logic to leverage),
- // and if they have no home register, home space will need to be
- // allocated on the stack to copy into.
Variable *FramePtr = getPhysicalRegister(getFrameOrStackReg());
size_t BasicFrameOffset = PreservedRegsSizeBytes + RetIpSizeBytes;
if (!IsEbpBasedFrame)
BasicFrameOffset += LocalsSizeBytes;
+
+ unsigned NumXmmArgs = 0;
for (SizeT i = 0; i < Args.size(); ++i) {
Variable *Arg = Args[i];
- setArgOffsetAndCopy(Arg, FramePtr, BasicFrameOffset, InArgsSizeBytes);
+ // Skip arguments passed in registers.
+ if (isVectorType(Arg->getType()) && NumXmmArgs < X86_MAX_XMM_ARGS) {
+ ++NumXmmArgs;
+ continue;
+ }
+ finishArgumentLowering(Arg, FramePtr, BasicFrameOffset, InArgsSizeBytes);
}
// Fill in stack offsets for locals.
@@ -1253,7 +1308,10 @@ void TargetX8632::lowerAssign(const InstAssign *Inst) {
const bool AllowOverlap = true;
// RI is either a physical register or an immediate.
Operand *RI = legalize(Src0, Legal_Reg | Legal_Imm, AllowOverlap);
- _mov(Dest, RI);
+ if (isVectorType(Dest->getType()))
+ _movp(Dest, RI);
+ else
+ _mov(Dest, RI);
}
}
@@ -1269,31 +1327,44 @@ void TargetX8632::lowerBr(const InstBr *Inst) {
}
void TargetX8632::lowerCall(const InstCall *Instr) {
- // Generate a sequence of push instructions, pushing right to left,
- // keeping track of stack offsets in case a push involves a stack
- // operand and we are using an esp-based frame.
+ // Classify each argument operand according to the location where the
+ // argument is passed.
+ OperandList XmmArgs;
+ OperandList StackArgs;
+ for (SizeT i = 0, NumArgs = Instr->getNumArgs(); i < NumArgs; ++i) {
+ Operand *Arg = Instr->getArg(i);
+ if (isVectorType(Arg->getType()) && XmmArgs.size() < X86_MAX_XMM_ARGS) {
+ XmmArgs.push_back(Arg);
+ } else {
+ StackArgs.push_back(Arg);
+ }
+ }
+ // For stack arguments, generate a sequence of push instructions,
+ // pushing right to left, keeping track of stack offsets in case a
+ // push involves a stack operand and we are using an esp-based frame.
uint32_t StackOffset = 0;
+ // TODO: Consolidate the stack adjustment for function calls by
+ // reserving enough space for the arguments only once.
+ //
// TODO: If for some reason the call instruction gets dead-code
// eliminated after lowering, we would need to ensure that the
// pre-call push instructions and the post-call esp adjustment get
// eliminated as well.
- for (SizeT NumArgs = Instr->getNumArgs(), i = 0; i < NumArgs; ++i) {
- Operand *Arg = legalize(Instr->getArg(NumArgs - i - 1));
+ for (OperandList::reverse_iterator I = StackArgs.rbegin(),
+ E = StackArgs.rend(); I != E; ++I) {
+ Operand *Arg = legalize(*I);
if (Arg->getType() == IceType_i64) {
_push(hiOperand(Arg));
_push(loOperand(Arg));
- } else if (Arg->getType() == IceType_f64) {
- // If the Arg turns out to be a memory operand, we need to push
- // 8 bytes, which requires two push instructions. This ends up
- // being somewhat clumsy in the current IR, so we use a
- // workaround. Force the operand into a (xmm) register, and
- // then push the register. An xmm register push is actually not
- // possible in x86, but the Push instruction emitter handles
- // this by decrementing the stack pointer and directly writing
- // the xmm register value.
- Variable *T = NULL;
- _mov(T, Arg);
- _push(T);
+ } else if (Arg->getType() == IceType_f64 || isVectorType(Arg->getType())) {
+ // If the Arg turns out to be a memory operand, more than one push
+ // instruction is required. This ends up being somewhat clumsy in
+ // the current IR, so we use a workaround. Force the operand into
+ // a (xmm) register, and then push the register. An xmm register
+ // push is actually not possible in x86, but the Push instruction
+ // emitter handles this by decrementing the stack pointer and
+ // directly writing the xmm register value.
+ _push(legalize(Arg, Legal_Reg));
} else {
// Otherwise PNaCl requires parameter types to be at least 32-bits.
assert(Arg->getType() == IceType_f32 || Arg->getType() == IceType_i32);
@@ -1301,11 +1372,28 @@ void TargetX8632::lowerCall(const InstCall *Instr) {
}
StackOffset += typeWidthInBytesOnStack(Arg->getType());
}
+ // Copy arguments to be passed in registers to the appropriate
+ // registers.
+ // TODO: Investigate the impact of lowering arguments passed in
+ // registers after lowering stack arguments as opposed to the other
+ // way around. Lowering register arguments after stack arguments may
+ // reduce register pressure. On the other hand, lowering register
+ // arguments first (before stack arguments) may result in more compact
+ // code, as the memory operand displacements may end up being smaller
+ // before any stack adjustment is done.
+ for (SizeT i = 0, NumXmmArgs = XmmArgs.size(); i < NumXmmArgs; ++i) {
+ Variable *Reg = legalizeToVar(XmmArgs[i], false, Reg_xmm0 + i);
+ // Generate a FakeUse of register arguments so that they do not get
+ // dead code eliminated as a result of the FakeKill of scratch
+ // registers after the call.
+ Context.insert(InstFakeUse::create(Func, Reg));
+ }
// Generate the call instruction. Assign its result to a temporary
// with high register allocation weight.
Variable *Dest = Instr->getDest();
- Variable *eax = NULL; // doubles as RegLo as necessary
- Variable *edx = NULL;
+ // ReturnReg doubles as ReturnRegLo as necessary.
+ Variable *ReturnReg = NULL;
+ Variable *ReturnRegHi = NULL;
if (Dest) {
switch (Dest->getType()) {
case IceType_NUM:
@@ -1317,16 +1405,16 @@ void TargetX8632::lowerCall(const InstCall *Instr) {
case IceType_i8:
case IceType_i16:
case IceType_i32:
- eax = makeReg(Dest->getType(), Reg_eax);
+ ReturnReg = makeReg(Dest->getType(), Reg_eax);
break;
case IceType_i64:
- eax = makeReg(IceType_i32, Reg_eax);
- edx = makeReg(IceType_i32, Reg_edx);
+ ReturnReg = makeReg(IceType_i32, Reg_eax);
+ ReturnRegHi = makeReg(IceType_i32, Reg_edx);
break;
case IceType_f32:
case IceType_f64:
- // Leave eax==edx==NULL, and capture the result with the fstp
- // instruction.
+ // Leave ReturnReg==ReturnRegHi==NULL, and capture the result with
+ // the fstp instruction.
break;
case IceType_v4i1:
case IceType_v8i1:
@@ -1334,24 +1422,18 @@ void TargetX8632::lowerCall(const InstCall *Instr) {
case IceType_v16i8:
case IceType_v8i16:
case IceType_v4i32:
- case IceType_v4f32: {
- // TODO(wala): Handle return values of vector type in the caller.
- IceString Ty;
- llvm::raw_string_ostream BaseOS(Ty);
- Ostream OS(&BaseOS);
- OS << Dest->getType();
- Func->setError("Unhandled dest type: " + BaseOS.str());
- return;
- }
+ case IceType_v4f32:
+ ReturnReg = makeReg(Dest->getType(), Reg_xmm0);
+ break;
}
}
// TODO(stichnot): LEAHACK: remove Legal_All (and use default) once
// a proper emitter is used.
Operand *CallTarget = legalize(Instr->getCallTarget(), Legal_All);
- Inst *NewCall = InstX8632Call::create(Func, eax, CallTarget);
+ Inst *NewCall = InstX8632Call::create(Func, ReturnReg, CallTarget);
Context.insert(NewCall);
- if (edx)
- Context.insert(InstFakeDef::create(Func, edx));
+ if (ReturnRegHi)
+ Context.insert(InstFakeDef::create(Func, ReturnRegHi));
// Add the appropriate offset to esp.
if (StackOffset) {
@@ -1368,34 +1450,42 @@ void TargetX8632::lowerCall(const InstCall *Instr) {
Context.insert(InstFakeKill::create(Func, KilledRegs, NewCall));
// Generate a FakeUse to keep the call live if necessary.
- if (Instr->hasSideEffects() && eax) {
- Inst *FakeUse = InstFakeUse::create(Func, eax);
+ if (Instr->hasSideEffects() && ReturnReg) {
+ Inst *FakeUse = InstFakeUse::create(Func, ReturnReg);
Context.insert(FakeUse);
}
+
+ if (!Dest)
+ return;
- // Generate Dest=eax assignment.
- if (Dest && eax) {
- if (edx) {
+ // Assign the result of the call to Dest.
+ if (ReturnReg) {
+ if (ReturnRegHi) {
+ assert(Dest->getType() == IceType_i64);
split64(Dest);
Variable *DestLo = Dest->getLo();
Variable *DestHi = Dest->getHi();
- DestLo->setPreferredRegister(eax, false);
- DestHi->setPreferredRegister(edx, false);
- _mov(DestLo, eax);
- _mov(DestHi, edx);
+ DestLo->setPreferredRegister(ReturnReg, false);
+ DestHi->setPreferredRegister(ReturnRegHi, false);
+ _mov(DestLo, ReturnReg);
+ _mov(DestHi, ReturnRegHi);
} else {
- Dest->setPreferredRegister(eax, false);
- _mov(Dest, eax);
+ assert(Dest->getType() == IceType_i32 || Dest->getType() == IceType_i16 ||
+ Dest->getType() == IceType_i8 || Dest->getType() == IceType_i1 ||
+ isVectorType(Dest->getType()));
+ Dest->setPreferredRegister(ReturnReg, false);
+ if (isVectorType(Dest->getType())) {
+ _movp(Dest, ReturnReg);
+ } else {
+ _mov(Dest, ReturnReg);
+ }
}
- }
-
- // Special treatment for an FP function which returns its result in
- // st(0).
- if (Dest &&
- (Dest->getType() == IceType_f32 || Dest->getType() == IceType_f64)) {
+ } else if (Dest->getType() == IceType_f32 || Dest->getType() == IceType_f64) {
+ // Special treatment for an FP function which returns its result in
+ // st(0).
_fstp(Dest);
- // If Dest ends up being a physical xmm register, the fstp emit
- // code will route st(0) through a temporary stack slot.
+ // If Dest ends up being a physical xmm register, the fstp emit code
+ // will route st(0) through a temporary stack slot.
}
}
« no previous file with comments | « src/IceTargetLoweringX8632.h ('k') | tests_lit/llvm2ice_tests/vector-arg.ll » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698