Index: src/IceTargetLoweringX86BaseImpl.h
diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h
index 2b94df0799dbd9ce97815b48218d29b857f5733a..cb62a78fe358209e0f6447494d90c728474b9fbc 100644
--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -1135,8 +1135,8 @@ bool TargetX86Base<TraitsType>::optimizeScalarMul(Variable *Dest, Operand *Src0,
       return false;
     }
   }
-  // Lea optimization only works for i16 and i32 types, not i8.
-  if (Ty != IceType_i16 && Ty != IceType_i32 && (Count3 || Count5 || Count9))
+  // Lea optimization only works for i32 type, not i8 or i16.
Jim Stichnoth
2016/01/12 14:54:02
What's the story with i64 under x86-64? Ask John,
sehr
2016/01/12 19:01:19
Not supported either. I changed the comment.
+  if (Ty != IceType_i32 && (Count3 || Count5 || Count9))
     return false;
   // Limit the number of lea/shl operations for a single multiply, to a
   // somewhat arbitrary choice of 3.
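For context on the counts above: lea reg, [reg + reg*2/4/8] computes x*3, x*5, and x*9 in a single instruction, so optimizeScalarMul factors the constant multiplier into shifts (Count2) and those lea steps. The i32 restriction exists because lea has no 8-bit form, and per the review thread the i64 case under x86-64 is not handled either. A minimal sketch of the factoring, with a hypothetical helper name (not Subzero's actual code):

```cpp
#include <cstdint>
#include <cstdio>

// Hypothetical sketch of the decomposition behind Count2/Count3/Count5/
// Count9: a multiply by Src1 becomes Count2 shl-by-1 steps plus lea steps
// computing x*3, x*5, and x*9. Returns false when Src1 has a factor that
// none of those instructions can absorb.
static bool decomposeMultiplier(uint32_t Src1, uint32_t &Count2,
                                uint32_t &Count3, uint32_t &Count5,
                                uint32_t &Count9) {
  Count2 = Count3 = Count5 = Count9 = 0;
  if (Src1 == 0)
    return false;
  while (Src1 > 1) {
    if (Src1 % 2 == 0) {
      ++Count2; // shl by 1
      Src1 /= 2;
    } else if (Src1 % 9 == 0) {
      ++Count9; // lea reg, [reg + reg*8]
      Src1 /= 9;
    } else if (Src1 % 5 == 0) {
      ++Count5; // lea reg, [reg + reg*4]
      Src1 /= 5;
    } else if (Src1 % 3 == 0) {
      ++Count3; // lea reg, [reg + reg*2]
      Src1 /= 3;
    } else {
      return false; // e.g. a factor of 7: fall back to a real imul
    }
  }
  return true;
}

int main() {
  uint32_t C2, C3, C5, C9;
  if (decomposeMultiplier(90, C2, C3, C5, C9)) // 90 = 2 * 9 * 5
    std::printf("shl:%u x3:%u x5:%u x9:%u\n", C2, C3, C5, C9);
}
```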
@@ -5590,6 +5590,7 @@ void TargetX86Base<TraitsType>::genTargetHelperCallFor(Inst *Instr) {
     Variable *Dest = Cast->getDest();
     const Type DestTy = Dest->getType();
     const char *HelperName = nullptr;
+    Variable *CallDest = Dest;
     switch (CastKind) {
     default:
       return;
@@ -5655,10 +5656,12 @@ void TargetX86Base<TraitsType>::genTargetHelperCallFor(Inst *Instr) {
     case IceType_i8:
       assert(Src0->getType() == IceType_v8i1);
       HelperName = H_bitcast_8xi1_i8;
+      CallDest = Func->makeVariable(IceType_i32);
      break;
     case IceType_i16:
       assert(Src0->getType() == IceType_v16i1);
       HelperName = H_bitcast_16xi1_i16;
+      CallDest = Func->makeVariable(IceType_i32);
      break;
     case IceType_v8i1: {
       assert(Src0->getType() == IceType_i8);
@@ -5680,10 +5683,14 @@ void TargetX86Base<TraitsType>::genTargetHelperCallFor(Inst *Instr) {
     } break;
     }
     constexpr SizeT MaxSrcs = 1;
-    InstCall *Call = makeHelperCall(HelperName, Dest, MaxSrcs);
+    InstCall *Call = makeHelperCall(HelperName, CallDest, MaxSrcs);
     Call->addArg(Src0);
     StackArgumentsSize = getCallStackArgumentsSizeBytes(Call);
     Context.insert(Call);
+    // Sometimes we need to widen the call result from i8 or i16 to i32 to
Jim Stichnoth
2016/01/12 14:54:02
This is a little confusing because we're actually
sehr
2016/01/12 19:01:19
Done.
+    // maintain ABI compatibility. If so, insert a truncation after the call.
+    if (CallDest != Dest)
Jim Stichnoth
2016/01/12 14:54:02
I think it would be marginally more future-proof i
sehr
2016/01/12 19:01:19
Done.
+      Context.insert<InstCast>(InstCast::Trunc, Dest, CallDest);
     Cast->setDeleted();
   } else if (auto *Intrinsic = llvm::dyn_cast<InstIntrinsicCall>(Instr)) {
     std::vector<Type> ArgTypes;
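The effect of the CallDest change: a helper whose natural result type is i8 or i16 now defines an i32 temporary, and an explicit truncation narrows it into the original Dest. A hedged C++ analogue of that pattern; the helper name, its signature, and the one-boolean-per-byte vector encoding are all assumptions for illustration:

```cpp
#include <cstdint>
#include <cstdio>

// Stub standing in for the runtime helper; assumed to pack the low bit of
// each of the 8 bytes of a v8i1-like vector into the low byte of the
// result. Note the declared return type is a full i32 even though only
// the low 8 bits are meaningful, mirroring the ABI requirement above.
extern "C" uint32_t bitcast_8xi1_i8_helper(uint64_t Vec) {
  uint32_t Result = 0;
  for (int I = 0; I < 8; ++I)
    Result |= static_cast<uint32_t>((Vec >> (I * 8)) & 1) << I;
  return Result;
}

int main() {
  // The call defines an i32 temporary (CallDest)...
  uint32_t CallDest = bitcast_8xi1_i8_helper(0x0101010101010101ull);
  // ...and an explicit truncation narrows it to the original i8 Dest,
  // mirroring the InstCast::Trunc inserted after the helper call.
  uint8_t Dest = static_cast<uint8_t>(CallDest);
  std::printf("%u\n", Dest); // prints 255
}
```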
@@ -5841,7 +5848,14 @@ Variable *TargetX86Base<TraitsType>::makeVectorOfMinusOnes(Type Ty,
   Variable *MinusOnes = makeReg(Ty, RegNum);
   // Insert a FakeDef so the live range of MinusOnes is not overestimated.
   Context.insert<InstFakeDef>(MinusOnes);
-  _pcmpeq(MinusOnes, MinusOnes);
+  if (Ty == IceType_f64)
+    // Making a vector of minus ones of type f64 is currently only used for
+    // the fabs intrinsic. Creating this mask at f64 element width would take
+    // pcmpeqq, which requires SSE4.1; since we're just building an all-ones
+    // mask, pcmpeqd does the same job and only requires SSE2.
+    _pcmpeq(MinusOnes, MinusOnes, IceType_f32);
+  else
+    _pcmpeq(MinusOnes, MinusOnes);
   return MinusOnes;
 }
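For reference, the trick in the last hunk sketched with SSE intrinsics: pcmpeqd (_mm_cmpeq_epi32, SSE2) comparing a register with itself already yields all ones in every bit, so the element width is irrelevant for a pure bit mask and pcmpeqq (SSE4.1) is unnecessary. A standalone illustration of how such a mask serves the fabs lowering; this is a hypothetical sketch, not Subzero's actual instruction sequence:

```cpp
#include <cstdio>
#include <emmintrin.h> // SSE2 only; no SSE4.1 required

static __m128d fabsPacked(__m128d X) {
  __m128i Zero = _mm_setzero_si128();
  // pcmpeqd: 0 == 0 in every 32-bit lane produces all ones.
  __m128i MinusOnes = _mm_cmpeq_epi32(Zero, Zero);
  // Shift each 64-bit lane right by one to clear the sign bit: 0x7FFF...F.
  __m128i AbsMask = _mm_srli_epi64(MinusOnes, 1);
  return _mm_and_pd(X, _mm_castsi128_pd(AbsMask));
}

int main() {
  __m128d V = _mm_set_pd(-2.5, 3.0);
  double Out[2];
  _mm_storeu_pd(Out, fabsPacked(V));
  std::printf("%g %g\n", Out[0], Out[1]); // prints 3 2.5
}
```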