Chromium Code Reviews

Unified Diff: src/IceTargetLoweringX86BaseImpl.h

Issue 1278173009: Inline memmove for small constant sizes and refactor memcpy and memset. (Closed) Base URL: https://chromium.googlesource.com/native_client/pnacl-subzero.git@master
Patch Set: Fix memmove and add tests (Created 5 years, 4 months ago)
Index: src/IceTargetLoweringX86BaseImpl.h
diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h
index bb26090629253933c550f046e073b7b14c076ef3..41eec9e45601d2516761b1fbb8e8018ea058291b 100644
--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -3034,11 +3034,7 @@ void TargetX86Base<Machine>::lowerIntrinsicCall(
return;
}
case Intrinsics::Memmove: {
- InstCall *Call = makeHelperCall(H_call_memmove, nullptr, 3);
- Call->addArg(Instr->getArg(0));
- Call->addArg(Instr->getArg(1));
- Call->addArg(Instr->getArg(2));
- lowerCall(Call);
+ lowerMemmove(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
return;
}
case Intrinsics::Memset: {
@@ -3481,15 +3477,48 @@ void TargetX86Base<Machine>::lowerCountZeros(bool Cttz, Type Ty, Variable *Dest,
}
template <class Machine>
+void TargetX86Base<Machine>::typedLoad(Type Ty, Variable *Dest, Variable *Base,
+ Constant *Offset) {
+ auto *Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset);
+
+ if (isVectorType(Ty))
+ _movp(Dest, Mem);
+ else if (Ty == IceType_f64)
+ _movq(Dest, Mem);
+ else
+ _mov(Dest, Mem);
+}
+
+template <class Machine>
+void TargetX86Base<Machine>::typedStore(Type Ty, Variable *Value,
+ Variable *Base, Constant *Offset) {
+ auto *Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset);
+
+ if (isVectorType(Ty))
+ _storep(Value, Mem);
+ else if (Ty == IceType_f64)
+ _storeq(Value, Mem);
+ else
+ _store(Value, Mem);
+}
+
+template <class Machine>
+void TargetX86Base<Machine>::copyMemory(Type Ty, Variable *Dest, Variable *Src,
+ int32_t OffsetAmt) {
+ Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
+ // TODO(ascull): this or add nullptr test to _movp, _movq
+ Variable *Data = makeReg(Ty);
+
+ typedLoad(Ty, Data, Src, Offset);
+ typedStore(Ty, Data, Dest, Offset);
+}
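typedLoad and typedStore pick the move flavour purely from the chunk type: vector chunks go through _movp, IceType_f64 chunks through _movq (so an 8-byte chunk can travel through an XMM register even on x86-32), and everything else uses a scalar _mov. A minimal standalone sketch of that dispatch, with a hypothetical ChunkType enum and moveMnemonic helper standing in for the real IceType_* values and emit methods:

#include <cstdio>

// Hypothetical stand-ins for the IceType_* values the helpers care about.
enum class ChunkType { I8, I16, I32, F64, V16I8 };

// Same selection as typedLoad/typedStore: vector -> movups-style copy,
// f64 -> movq through an XMM register, otherwise a plain scalar mov.
static const char *moveMnemonic(ChunkType Ty) {
  switch (Ty) {
  case ChunkType::V16I8:
    return "movups";
  case ChunkType::F64:
    return "movq";
  default:
    return "mov";
  }
}

int main() {
  printf("16-byte chunk: %s\n", moveMnemonic(ChunkType::V16I8));
  printf(" 8-byte chunk: %s\n", moveMnemonic(ChunkType::F64));
  printf(" 4-byte chunk: %s\n", moveMnemonic(ChunkType::I32));
}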
+
+template <class Machine>
void TargetX86Base<Machine>::lowerMemcpy(Operand *Dest, Operand *Src,
Operand *Count) {
// There is a load and store for each chunk in the unroll
constexpr uint32_t UNROLL_LIMIT = 8;
constexpr uint32_t BytesPerStorep = 16;
- constexpr uint32_t BytesPerStoreq = 8;
- constexpr uint32_t BytesPerStorei32 = 4;
- constexpr uint32_t BytesPerStorei16 = 2;
- constexpr uint32_t BytesPerStorei8 = 1;
// Check if the operands are constants
const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
@@ -3504,85 +3533,105 @@ void TargetX86Base<Machine>::lowerMemcpy(Operand *Dest, Operand *Src,
Variable *SrcBase = legalizeToReg(Src);
Variable *DestBase = legalizeToReg(Dest);
- auto lowerCopy = [this, DestBase, SrcBase](Type Ty, uint32_t OffsetAmt) {
- Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
- // TODO(ascull): this or add nullptr test to _movp, _movq
- Variable *Data = makeReg(Ty);
+ // Find the largest type that can be used and use it as much as possible in
+ // reverse order. Then handle any remainder with overlapping copies. Since
+ // the remainder will be at the end, there will be reduced pressure on the
+ // memory unit as the accesses to the same memory are far apart.
+ Type Ty = largestTypeInSize(CountValue);
+ uint32_t TyWidth = typeWidthInBytes(Ty);
+
+ uint32_t RemainingBytes = CountValue;
+ int32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth;
+ while (RemainingBytes >= TyWidth) {
+ copyMemory(Ty, DestBase, SrcBase, Offset);
+ RemainingBytes -= TyWidth;
+ Offset -= TyWidth;
+ }
- // TODO(ascull): is 64-bit better with vector or scalar movq?
- auto *SrcMem = Traits::X86OperandMem::create(Func, Ty, SrcBase, Offset);
- if (isVectorType(Ty))
- _movp(Data, SrcMem);
- else if (Ty == IceType_f64)
- _movq(Data, SrcMem);
- else
- _mov(Data, SrcMem);
+ if (RemainingBytes == 0)
+ return;
- auto *DestMem = Traits::X86OperandMem::create(Func, Ty, DestBase, Offset);
- if (isVectorType(Ty))
- _storep(Data, DestMem);
- else if (Ty == IceType_f64)
- _storeq(Data, DestMem);
- else
- _store(Data, DestMem);
- };
+ // Lower the remaining bytes. Adjust to larger types in order to make use
+ // of overlaps in the copies.
+ Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes);
+ Offset = CountValue - typeWidthInBytes(LeftOverTy);
+ copyMemory(LeftOverTy, DestBase, SrcBase, Offset);
+ return;
+ }
- // Lowers the assignment to the remaining bytes. Assumes the original size
- // was large enough to allow for overlaps.
- auto lowerLeftOvers = [this, lowerCopy, CountValue](uint32_t Size) {
- if (Size > BytesPerStoreq) {
- lowerCopy(IceType_v16i8, CountValue - BytesPerStorep);
- } else if (Size > BytesPerStorei32) {
- lowerCopy(IceType_f64, CountValue - BytesPerStoreq);
- } else if (Size > BytesPerStorei16) {
- lowerCopy(IceType_i32, CountValue - BytesPerStorei32);
- } else if (Size > BytesPerStorei8) {
- lowerCopy(IceType_i16, CountValue - BytesPerStorei16);
- } else if (Size == BytesPerStorei8) {
- lowerCopy(IceType_i8, CountValue - BytesPerStorei8);
- }
- };
+ // Fall back on a function call
+ InstCall *Call = makeHelperCall(H_call_memcpy, nullptr, 3);
+ Call->addArg(Dest);
+ Call->addArg(Src);
+ Call->addArg(Count);
+ lowerCall(Call);
+}
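To make the new chunking concrete, here is a standalone sketch (not Subzero code; largestWidth, coveringWidth and the plan printing are invented for illustration) that computes the same copy plan lowerMemcpy emits for a constant count: full-width chunks from back to front, then one widened, overlapping chunk for the tail.

#include <cstdint>
#include <cstdio>

// Widths corresponding to TypeForSize: i8, i16, i32, i64/f64, v16i8.
static uint32_t largestWidth(uint32_t Size) {
  uint32_t W = 1;
  while (W * 2 <= Size && W < 16)
    W *= 2;
  return W;
}

// Smallest power-of-two width that covers Size (capped at 16 bytes).
static uint32_t coveringWidth(uint32_t Size) {
  uint32_t W = 1;
  while (W < Size && W < 16)
    W *= 2;
  return W;
}

int main() {
  const uint32_t Count = 21;                  // example constant memcpy size
  const uint32_t Width = largestWidth(Count); // 16 for Count = 21
  uint32_t Remaining = Count;
  int32_t Offset = (Count & ~(Width - 1)) - Width;
  while (Remaining >= Width) {                // full chunks, back to front
    printf("copy %u bytes at offset %d\n", Width, Offset);
    Remaining -= Width;
    Offset -= Width;
  }
  if (Remaining != 0) {                       // one overlapping copy for the tail
    const uint32_t TailWidth = coveringWidth(Remaining);
    printf("copy %u bytes at offset %d (overlaps)\n", TailWidth,
           (int32_t)(Count - TailWidth));
  }
  // For Count = 21 this prints a 16-byte copy at offset 0 and an 8-byte copy
  // at offset 13, whose first three bytes overlap data already copied.
}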
- if (CountValue >= BytesPerStorep) {
- // Use large vector operations
- for (uint32_t N = CountValue & 0xFFFFFFF0; N != 0;) {
- N -= BytesPerStorep;
- lowerCopy(IceType_v16i8, N);
- }
- lowerLeftOvers(CountValue & 0xF);
- return;
- }
+template <class Machine>
+void TargetX86Base<Machine>::lowerMemmove(Operand *Dest, Operand *Src,
+ Operand *Count) {
+ // There is a load and store for each chunk in the unroll
+ constexpr uint32_t UNROLL_LIMIT = 8; // 32-bit has 8 xmm registers
+ constexpr uint32_t BytesPerStorep = 16;
- // Too small to use large vector operations so use small ones instead
- if (CountValue >= BytesPerStoreq) {
- lowerCopy(IceType_f64, 0);
- lowerLeftOvers(CountValue - BytesPerStoreq);
+ // Check if the operands are constants
+ const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
+ const bool IsCountConst = CountConst != nullptr;
+ const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
+
+ if (IsCountConst && CountValue <= BytesPerStorep * UNROLL_LIMIT) {
+ // Unlikely, but nothing to do if it does happen
+ if (CountValue == 0)
return;
+
+ Variable *SrcBase = legalizeToReg(Src);
+ Variable *DestBase = legalizeToReg(Dest);
+
+ std::tuple<Type, Constant *, Variable *> Moves[UNROLL_LIMIT];
+ Constant *Offset;
+ Variable *Reg;
+
+ // Copy the data into registers as the source and destination could overlap,
+ // so make sure not to clobber the memory. This also means overlapping moves
+ // can be used as we are taking a safe snapshot of the memory.
+ Type Ty = largestTypeInSize(CountValue);
+ uint32_t TyWidth = typeWidthInBytes(Ty);
+
+ uint32_t RemainingBytes = CountValue;
+ int32_t OffsetAmt = (CountValue & ~(TyWidth - 1)) - TyWidth;
+ size_t N = 0;
+ while (RemainingBytes >= TyWidth) {
+ assert(N <= UNROLL_LIMIT);
+ Offset = Ctx->getConstantInt32(OffsetAmt);
+ Reg = makeReg(Ty);
+ typedLoad(Ty, Reg, SrcBase, Offset);
+ RemainingBytes -= TyWidth;
+ OffsetAmt -= TyWidth;
+ Moves[N++] = std::make_tuple(Ty, Offset, Reg);
}
- // Too small for vector operations so use scalar ones
- if (CountValue >= BytesPerStorei32) {
- lowerCopy(IceType_i32, 0);
- lowerLeftOvers(CountValue - BytesPerStorei32);
- return;
+ if (RemainingBytes != 0) {
+ // Lower the remaining bytes. Adjust to larger types in order to make use
+ // of overlaps in the copies.
+ assert(N <= UNROLL_LIMIT);
+ Ty = firstTypeThatFitsSize(RemainingBytes);
+ Offset = Ctx->getConstantInt32(CountValue - typeWidthInBytes(Ty));
+ Reg = makeReg(Ty);
+ typedLoad(Ty, Reg, SrcBase, Offset);
+ Moves[N++] = std::make_tuple(Ty, Offset, Reg);
}
- // 3 is the awkward size as it is too small for the vector or 32-bit
- // operations and will not work with lowerLeftOvers as there is no valid
- // overlap.
- if (CountValue == 3) {
- lowerCopy(IceType_i16, 0);
- lowerCopy(IceType_i8, 2);
- return;
+ // Copy the data out into the destination memory
+ for (size_t i = 0; i < N; ++i) {
+ std::tie(Ty, Offset, Reg) = Moves[i];
+ typedStore(Ty, Reg, DestBase, Offset);
}
- // 1 or 2 can be done in a single scalar copy
- lowerLeftOvers(CountValue);
return;
}
// Fall back on a function call
- InstCall *Call = makeHelperCall(H_call_memcpy, nullptr, 3);
+ InstCall *Call = makeHelperCall(H_call_memmove, nullptr, 3);
Call->addArg(Dest);
Call->addArg(Src);
Call->addArg(Count);
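The memmove unroll is safe for overlapping operands because every chunk is loaded into a register before any store is emitted; Moves[] records the snapshot and the second loop writes it back out. A minimal behavioural analogue, with a scratch buffer standing in for the register file (illustration only, not the generated code):

#include <cstdint>
#include <cstdio>
#include <cstring>

// Two-phase copy: read everything before writing anything, which is what
// lowerMemmove does with its array of snapshot registers.
static void moveBytes(uint8_t *Dst, const uint8_t *Src, uint32_t Count) {
  uint8_t Snapshot[16 * 8];        // BytesPerStorep * UNROLL_LIMIT bytes
  memcpy(Snapshot, Src, Count);    // phase 1: loads only (the "registers")
  memcpy(Dst, Snapshot, Count);    // phase 2: stores only
}

int main() {
  char Buf[] = "abcdefgh";
  // Overlapping move: shift the first six bytes two places to the right.
  moveBytes((uint8_t *)Buf + 2, (uint8_t *)Buf, 6);
  printf("%s\n", Buf); // prints "ababcdef", matching memmove semantics
}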
@@ -3596,8 +3645,6 @@ void TargetX86Base<Machine>::lowerMemset(Operand *Dest, Operand *Val,
constexpr uint32_t BytesPerStorep = 16;
constexpr uint32_t BytesPerStoreq = 8;
constexpr uint32_t BytesPerStorei32 = 4;
- constexpr uint32_t BytesPerStorei16 = 2;
- constexpr uint32_t BytesPerStorei8 = 1;
assert(Val->getType() == IceType_i8);
// Check if the operands are constants
@@ -3617,12 +3664,12 @@ void TargetX86Base<Machine>::lowerMemset(Operand *Dest, Operand *Val,
// eax, ax and al.
if (IsCountConst && IsValConst) {
Variable *Base = nullptr;
+ Variable *VecReg = nullptr;
const uint32_t SpreadValue =
(ValValue << 24) | (ValValue << 16) | (ValValue << 8) | ValValue;
- Variable *VecReg = nullptr;
auto lowerSet = [this, &Base, SpreadValue, &VecReg](Type Ty,
- uint32_t OffsetAmt) {
+ uint32_t OffsetAmt) {
assert(Base != nullptr);
Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
@@ -3631,7 +3678,7 @@ void TargetX86Base<Machine>::lowerMemset(Operand *Dest, Operand *Val,
if (isVectorType(Ty)) {
assert(VecReg != nullptr);
_storep(VecReg, Mem);
- } else if (Ty == IceType_i64) {
+ } else if (Ty == IceType_f64) {
assert(VecReg != nullptr);
_storeq(VecReg, Mem);
} else {
@@ -3639,63 +3686,45 @@ void TargetX86Base<Machine>::lowerMemset(Operand *Dest, Operand *Val,
}
};
- // Lowers the assignment to the remaining bytes. Assumes the original size
- // was large enough to allow for overlaps.
- auto lowerLeftOvers = [this, lowerSet, CountValue](uint32_t Size) {
- if (Size > BytesPerStoreq) {
- lowerSet(IceType_v16i8, CountValue - BytesPerStorep);
- } else if (Size > BytesPerStorei32) {
- lowerSet(IceType_i64, CountValue - BytesPerStoreq);
- } else if (Size > BytesPerStorei16) {
- lowerSet(IceType_i32, CountValue - BytesPerStorei32);
- } else if (Size > BytesPerStorei8) {
- lowerSet(IceType_i16, CountValue - BytesPerStorei16);
- } else if (Size == BytesPerStorei8) {
- lowerSet(IceType_i8, CountValue - BytesPerStorei8);
- }
- };
-
- // When the value is zero it can be loaded into a vector register cheaply
- // using the xor trick.
+ // Find the largest type that can be used and use it as much as possible in
+ // reverse order. Then handle any remainder with overlapping copies. Since
+ // the remainder will be at the end, there will be reduced pressure on the
+ // memory unit as the accesses to the same memory are far apart.
+ Type Ty;
if (ValValue == 0 && CountValue >= BytesPerStoreq &&
CountValue <= BytesPerStorep * UNROLL_LIMIT) {
+ // When the value is zero it can be loaded into a vector register cheaply
+ // using the xor trick.
Base = legalizeToReg(Dest);
VecReg = makeVectorOfZeros(IceType_v16i8);
+ Ty = largestTypeInSize(CountValue);
+ } else if (CountValue <= BytesPerStorei32 * UNROLL_LIMIT) {
+ // When the value is non-zero or the count is small we can't use vector
+ // instructions so we are limited to 32-bit stores.
+ Base = legalizeToReg(Dest);
+ constexpr uint32_t MaxSize = 4;
+ Ty = largestTypeInSize(CountValue, MaxSize);
+ }
- // Too small to use large vector operations so use small ones instead
- if (CountValue < BytesPerStorep) {
- lowerSet(IceType_i64, 0);
- lowerLeftOvers(CountValue - BytesPerStoreq);
- return;
- }
-
- // Use large vector operations
- for (uint32_t N = CountValue & 0xFFFFFFF0; N != 0;) {
- N -= 16;
- lowerSet(IceType_v16i8, N);
+ if (Base) {
+ uint32_t TyWidth = typeWidthInBytes(Ty);
+
+ uint32_t RemainingBytes = CountValue;
+ uint32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth;
+ while (RemainingBytes >= TyWidth) {
+ lowerSet(Ty, Offset);
+ RemainingBytes -= TyWidth;
+ Offset -= TyWidth;
}
- lowerLeftOvers(CountValue & 0xF);
- return;
- }
- // TODO(ascull): load val into reg and select subregs e.g. eax, ax, al?
- if (CountValue <= BytesPerStorei32 * UNROLL_LIMIT) {
- Base = legalizeToReg(Dest);
- // 3 is the awkward size as it is too small for the vector or 32-bit
- // operations and will not work with lowerLeftOvers as there is no valid
- // overlap.
- if (CountValue == 3) {
- lowerSet(IceType_i16, 0);
- lowerSet(IceType_i8, 2);
+ if (RemainingBytes == 0)
return;
- }
- // TODO(ascull); 64-bit can do better with 64-bit mov
- for (uint32_t N = CountValue & 0xFFFFFFFC; N != 0;) {
- N -= 4;
- lowerSet(IceType_i32, N);
- }
- lowerLeftOvers(CountValue & 0x3);
+ // Lower the remaining bytes. Adjust to larger types in order to make use
+ // of overlaps in the copies.
+ Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes);
+ Offset = CountValue - typeWidthInBytes(LeftOverTy);
+ lowerSet(LeftOverTy, Offset);
return;
}
}
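lowerMemset first replicates the i8 fill value across a 32-bit immediate (SpreadValue) and, when the value is zero, into a whole XMM register via makeVectorOfZeros, then picks between 16-byte vector stores and 32-bit scalar stores. A standalone sketch of the replication and of the strategy choice, with the thresholds taken from the constants above and everything else invented for illustration:

#include <cstdint>
#include <cstdio>

// Replicate an 8-bit fill value across 32 bits, like lowerMemset's SpreadValue.
static uint32_t spread(uint8_t Val) {
  return ((uint32_t)Val << 24) | ((uint32_t)Val << 16) |
         ((uint32_t)Val << 8) | Val;
}

int main() {
  printf("spread(0xAB) = 0x%08x\n", spread(0xAB)); // 0xabababab

  // Strategy choice mirrored from the lowering above (descriptive only):
  // zeroing a large enough block can use 16-byte stores of a pxor'd XMM
  // register; any other small constant case is limited to 32-bit stores.
  const uint32_t Count = 40;
  const uint8_t Val = 0;
  if (Val == 0 && Count >= 8 && Count <= 16 * 8)
    printf("use a zeroed xmm register, up to 16-byte stores\n");
  else if (Count <= 4 * 8)
    printf("use 32-bit stores of 0x%08x\n", spread(Val));
  else
    printf("fall back on a call to memset\n");
}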
@@ -4916,6 +4945,34 @@ Variable *TargetX86Base<Machine>::makeReg(Type Type, int32_t RegNum) {
return Reg;
}
+template <class Machine>
+const Type TargetX86Base<Machine>::TypeForSize[] = {
+ IceType_i8, IceType_i16, IceType_i32,
+ (Traits::Is64Bit ? IceType_i64 : IceType_f64), IceType_v16i8};
+template <class Machine>
+Type TargetX86Base<Machine>::largestTypeInSize(uint32_t Size,
+ uint32_t MaxSize) {
+ assert(Size != 0);
+ uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined);
+ uint32_t MaxIndex = MaxSize != NoSizeLimit
+ ? llvm::findLastSet(MaxSize, llvm::ZB_Undefined)
+ : llvm::array_lengthof(TypeForSize) - 1;
+ return TypeForSize[std::min(TyIndex, MaxIndex)];
+}
+
+template <class Machine>
+Type TargetX86Base<Machine>::firstTypeThatFitsSize(uint32_t Size,
+ uint32_t MaxSize) {
+ assert(Size != 0);
+ uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined);
+ if (!llvm::isPowerOf2_32(Size))
+ ++TyIndex;
+ uint32_t MaxIndex = MaxSize != NoSizeLimit
+ ? llvm::findLastSet(MaxSize, llvm::ZB_Undefined)
+ : llvm::array_lengthof(TypeForSize) - 1;
+ return TypeForSize[std::min(TyIndex, MaxIndex)];
+}
+
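Both helpers index TypeForSize by floor(log2(Size)) (llvm::findLastSet returns the position of the highest set bit): largestTypeInSize rounds down to the widest type that does not exceed Size, while firstTypeThatFitsSize rounds up whenever Size is not a power of two. A portable sketch of the same indexing over the matching byte widths (helper names invented for illustration):

#include <cassert>
#include <cstdint>

// Byte widths matching TypeForSize: i8, i16, i32, i64/f64, v16i8.
static const uint32_t WidthForIndex[] = {1, 2, 4, 8, 16};

static uint32_t floorLog2(uint32_t X) { // analogue of llvm::findLastSet
  uint32_t I = 0;
  while (X >>= 1)
    ++I;
  return I;
}

static uint32_t largestWidthInSize(uint32_t Size) {
  uint32_t Index = floorLog2(Size);
  if (Index > 4)
    Index = 4; // clamp to v16i8, as the unlimited MaxIndex does
  return WidthForIndex[Index];
}

static uint32_t firstWidthThatFitsSize(uint32_t Size) {
  uint32_t Index = floorLog2(Size);
  if ((Size & (Size - 1)) != 0) // not a power of two: round up one type
    ++Index;
  if (Index > 4)
    Index = 4;
  return WidthForIndex[Index];
}

int main() {
  assert(largestWidthInSize(5) == 4);     // i32 is the widest that fits in 5
  assert(firstWidthThatFitsSize(5) == 8); // an 8-byte chunk covers 5 bytes
  assert(largestWidthInSize(16) == 16);   // exactly one v16i8
  assert(firstWidthThatFitsSize(16) == 16);
  return 0;
}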
template <class Machine> void TargetX86Base<Machine>::postLower() {
if (Ctx->getFlags().getOptLevel() == Opt_m1)
return;
