Chromium Code Reviews| Index: src/IceTargetLoweringX86BaseImpl.h |
| diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h |
| index bb26090629253933c550f046e073b7b14c076ef3..e8971f175cb3fdc610730b014cc6ba4205894ecb 100644 |
| --- a/src/IceTargetLoweringX86BaseImpl.h |
| +++ b/src/IceTargetLoweringX86BaseImpl.h |
| @@ -3034,11 +3034,7 @@ void TargetX86Base<Machine>::lowerIntrinsicCall( |
| return; |
| } |
| case Intrinsics::Memmove: { |
| - InstCall *Call = makeHelperCall(H_call_memmove, nullptr, 3); |
| - Call->addArg(Instr->getArg(0)); |
| - Call->addArg(Instr->getArg(1)); |
| - Call->addArg(Instr->getArg(2)); |
| - lowerCall(Call); |
| + lowerMemmove(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2)); |
| return; |
| } |
| case Intrinsics::Memset: { |
| @@ -3481,22 +3477,57 @@ void TargetX86Base<Machine>::lowerCountZeros(bool Cttz, Type Ty, Variable *Dest, |
| } |
| template <class Machine> |
| +void TargetX86Base<Machine>::typedLoad(Type Ty, Variable *Dest, Variable *Base, |
| + Constant *Offset) { |
| + auto *Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset); |
| + |
| + if (isVectorType(Ty)) |
| + _movp(Dest, Mem); |
| + else if (Ty == IceType_f64) |
| + _movq(Dest, Mem); |
| + else |
| + _mov(Dest, Mem); |
| +} |
| + |
| +template <class Machine> |
| +void TargetX86Base<Machine>::typedStore(Type Ty, Variable *Value, |
| + Variable *Base, Constant *Offset) { |
| + auto *Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset); |
| + |
| + if (isVectorType(Ty)) |
| + _storep(Value, Mem); |
| + else if (Ty == IceType_f64) |
| + _storeq(Value, Mem); |
| + else |
| + _store(Value, Mem); |
| +} |
| + |
| +template <class Machine> |
| +void TargetX86Base<Machine>::copyMemory(Type Ty, Variable *Dest, Variable *Src, |
| + int32_t OffsetAmt) { |
| + Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr; |
| + // TODO(ascull): this or add nullptr test to _movp, _movq |
| + Variable *Data = makeReg(Ty); |
| + |
| + typedLoad(Ty, Data, Src, Offset); |
| + typedStore(Ty, Data, Dest, Offset); |
| +} |
| + |
| +template <class Machine> |
| void TargetX86Base<Machine>::lowerMemcpy(Operand *Dest, Operand *Src, |
| Operand *Count) { |
| // There is a load and store for each chunk in the unroll |
| constexpr uint32_t UNROLL_LIMIT = 8; |
| constexpr uint32_t BytesPerStorep = 16; |
| - constexpr uint32_t BytesPerStoreq = 8; |
| - constexpr uint32_t BytesPerStorei32 = 4; |
| - constexpr uint32_t BytesPerStorei16 = 2; |
| - constexpr uint32_t BytesPerStorei8 = 1; |
| // Check if the operands are constants |
| const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count); |
| const bool IsCountConst = CountConst != nullptr; |
| const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0; |
| - if (IsCountConst && CountValue <= BytesPerStorep * UNROLL_LIMIT) { |
| + if ((Ctx->getFlags().getOptLevel() >= Opt_1 || |
|
Jim Stichnoth
2015/08/18 17:11:28
Refactor this new condition into a function in an
ascull
2015/08/18 18:38:01
I put this in TargetLowering as it needs access to
John
2015/08/18 18:51:39
I second the motion, but don't create an anonymous
ascull
2015/08/18 19:03:10
I don't think static works either as it needs acce
John
2015/08/18 19:34:07
that's why you'd pass it as a parameter. :)
|
| + Ctx->getFlags().getForceMemIntrinOpt()) && |
| + IsCountConst && CountValue <= BytesPerStorep * UNROLL_LIMIT) { |
| // Unlikely, but nothing to do if it does happen |
| if (CountValue == 0) |
| return; |
| @@ -3504,85 +3535,107 @@ void TargetX86Base<Machine>::lowerMemcpy(Operand *Dest, Operand *Src, |
| Variable *SrcBase = legalizeToReg(Src); |
| Variable *DestBase = legalizeToReg(Dest); |
| - auto lowerCopy = [this, DestBase, SrcBase](Type Ty, uint32_t OffsetAmt) { |
| - Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr; |
| - // TODO(ascull): this or add nullptr test to _movp, _movq |
| - Variable *Data = makeReg(Ty); |
| + // Find the largest type that can be used and use it as much as possible in |
| + // reverse order. Then handle any remainder with overlapping copies. Since |
| + // the remainder will be at the end, there will be reduced pressure on the |
| + // memory unit as the accesses to the same memory are far apart. |
| + Type Ty = largestTypeInSize(CountValue); |
| + uint32_t TyWidth = typeWidthInBytes(Ty); |
| + |
| + uint32_t RemainingBytes = CountValue; |
| + int32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth; |
| + while (RemainingBytes >= TyWidth) { |
| + copyMemory(Ty, DestBase, SrcBase, Offset); |
| + RemainingBytes -= TyWidth; |
| + Offset -= TyWidth; |
| + } |
| - // TODO(ascull): is 64-bit better with vector or scalar movq? |
| - auto *SrcMem = Traits::X86OperandMem::create(Func, Ty, SrcBase, Offset); |
| - if (isVectorType(Ty)) |
| - _movp(Data, SrcMem); |
| - else if (Ty == IceType_f64) |
| - _movq(Data, SrcMem); |
| - else |
| - _mov(Data, SrcMem); |
| + if (RemainingBytes == 0) |
| + return; |
| - auto *DestMem = Traits::X86OperandMem::create(Func, Ty, DestBase, Offset); |
| - if (isVectorType(Ty)) |
| - _storep(Data, DestMem); |
| - else if (Ty == IceType_f64) |
| - _storeq(Data, DestMem); |
| - else |
| - _store(Data, DestMem); |
| - }; |
| + // Lower the remaining bytes. Adjust to larger types in order to make use |
| + // of overlaps in the copies. |
| + Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes); |
| + Offset = CountValue - typeWidthInBytes(LeftOverTy); |
| + copyMemory(LeftOverTy, DestBase, SrcBase, Offset); |
| + return; |
| + } |
| - // Lowers the assignment to the remaining bytes. Assumes the original size |
| - // was large enough to allow for overlaps. |
| - auto lowerLeftOvers = [this, lowerCopy, CountValue](uint32_t Size) { |
| - if (Size > BytesPerStoreq) { |
| - lowerCopy(IceType_v16i8, CountValue - BytesPerStorep); |
| - } else if (Size > BytesPerStorei32) { |
| - lowerCopy(IceType_f64, CountValue - BytesPerStoreq); |
| - } else if (Size > BytesPerStorei16) { |
| - lowerCopy(IceType_i32, CountValue - BytesPerStorei32); |
| - } else if (Size > BytesPerStorei8) { |
| - lowerCopy(IceType_i16, CountValue - BytesPerStorei16); |
| - } else if (Size == BytesPerStorei8) { |
| - lowerCopy(IceType_i8, CountValue - BytesPerStorei8); |
| - } |
| - }; |
| + // Fall back on a function call |
| + InstCall *Call = makeHelperCall(H_call_memcpy, nullptr, 3); |
| + Call->addArg(Dest); |
| + Call->addArg(Src); |
| + Call->addArg(Count); |
| + lowerCall(Call); |
| +} |
| - if (CountValue >= BytesPerStorep) { |
| - // Use large vector operations |
| - for (uint32_t N = CountValue & 0xFFFFFFF0; N != 0;) { |
| - N -= BytesPerStorep; |
| - lowerCopy(IceType_v16i8, N); |
| - } |
| - lowerLeftOvers(CountValue & 0xF); |
| - return; |
| - } |
| +template <class Machine> |
| +void TargetX86Base<Machine>::lowerMemmove(Operand *Dest, Operand *Src, |
| + Operand *Count) { |
| + // There is a load and store for each chunk in the unroll |
| + constexpr uint32_t UNROLL_LIMIT = 8; // 32-bit has 8 xmm registers |
| + constexpr uint32_t BytesPerStorep = 16; |
| - // Too small to use large vector operations so use small ones instead |
| - if (CountValue >= BytesPerStoreq) { |
| - lowerCopy(IceType_f64, 0); |
| - lowerLeftOvers(CountValue - BytesPerStoreq); |
| + // Check if the operands are constants |
| + const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count); |
| + const bool IsCountConst = CountConst != nullptr; |
| + const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0; |
| + |
| + if ((Ctx->getFlags().getOptLevel() >= Opt_1 || |
| + Ctx->getFlags().getForceMemIntrinOpt()) && |
| + IsCountConst && CountValue <= BytesPerStorep * UNROLL_LIMIT) { |
| + // Unlikely, but nothing to do if it does happen |
| + if (CountValue == 0) |
| return; |
| + |
| + Variable *SrcBase = legalizeToReg(Src); |
| + Variable *DestBase = legalizeToReg(Dest); |
| + |
| + std::tuple<Type, Constant *, Variable *> Moves[UNROLL_LIMIT]; |
| + Constant *Offset; |
| + Variable *Reg; |
| + |
| + // Copy the data into registers as the source and destination could overlap |
| + // so make sure not to clobber the memory. This also means overlapping moves |
|
jvoung (off chromium)
2015/08/18 17:18:22
"surenot" -> "sure not"
ascull
2015/08/18 18:38:01
Done.
|
| + // can be used as we are taking a safe snapshot of the memory. |
| + Type Ty = largestTypeInSize(CountValue); |
| + uint32_t TyWidth = typeWidthInBytes(Ty); |
| + |
| + uint32_t RemainingBytes = CountValue; |
| + int32_t OffsetAmt = (CountValue & ~(TyWidth - 1)) - TyWidth; |
| + size_t N = 0; |
| + while (RemainingBytes >= TyWidth) { |
| + assert(N <= UNROLL_LIMIT); |
| + Offset = Ctx->getConstantInt32(OffsetAmt); |
| + Reg = makeReg(Ty); |
| + typedLoad(Ty, Reg, SrcBase, Offset); |
| + RemainingBytes -= TyWidth; |
| + OffsetAmt -= TyWidth; |
| + Moves[N++] = std::make_tuple(Ty, Offset, Reg); |
| } |
| - // Too small for vector operations so use scalar ones |
| - if (CountValue >= BytesPerStorei32) { |
| - lowerCopy(IceType_i32, 0); |
| - lowerLeftOvers(CountValue - BytesPerStorei32); |
| - return; |
| + if (RemainingBytes != 0) { |
| + // Lower the remaining bytes. Adjust to larger types in order to make use |
| + // of overlaps in the copies. |
| + assert(N <= UNROLL_LIMIT); |
| + Ty = firstTypeThatFitsSize(RemainingBytes); |
| + Offset = Ctx->getConstantInt32(CountValue - typeWidthInBytes(Ty)); |
| + Reg = makeReg(Ty); |
| + typedLoad(Ty, Reg, SrcBase, Offset); |
| + Moves[N++] = std::make_tuple(Ty, Offset, Reg); |
| } |
| - // 3 is the awkward size as it is too small for the vector or 32-bit |
| - // operations and will not work with lowerLeftOvers as there is no valid |
| - // overlap. |
| - if (CountValue == 3) { |
| - lowerCopy(IceType_i16, 0); |
| - lowerCopy(IceType_i8, 2); |
| - return; |
| + // Copy the data out into the destination memory |
| + for (size_t i = 0; i < N; ++i) { |
| + std::tie(Ty, Offset, Reg) = Moves[i]; |
| + typedStore(Ty, Reg, DestBase, Offset); |
| } |
| - // 1 or 2 can be done in a single scalar copy |
| - lowerLeftOvers(CountValue); |
| return; |
| } |
| // Fall back on a function call |
| - InstCall *Call = makeHelperCall(H_call_memcpy, nullptr, 3); |
| + InstCall *Call = makeHelperCall(H_call_memmove, nullptr, 3); |
| Call->addArg(Dest); |
| Call->addArg(Src); |
| Call->addArg(Count); |
| @@ -3596,8 +3649,6 @@ void TargetX86Base<Machine>::lowerMemset(Operand *Dest, Operand *Val, |
| constexpr uint32_t BytesPerStorep = 16; |
| constexpr uint32_t BytesPerStoreq = 8; |
| constexpr uint32_t BytesPerStorei32 = 4; |
| - constexpr uint32_t BytesPerStorei16 = 2; |
| - constexpr uint32_t BytesPerStorei8 = 1; |
| assert(Val->getType() == IceType_i8); |
| // Check if the operands are constants |
| @@ -3615,14 +3666,16 @@ void TargetX86Base<Machine>::lowerMemset(Operand *Dest, Operand *Val, |
| // TODO(ascull): if the count is constant but val is not it would be possible |
| // to inline by spreading the value across 4 bytes and accessing subregs e.g. |
| // eax, ax and al. |
| - if (IsCountConst && IsValConst) { |
| + if ((Ctx->getFlags().getOptLevel() >= Opt_1 || |
| + Ctx->getFlags().getForceMemIntrinOpt()) && |
| + IsCountConst && IsValConst) { |
| Variable *Base = nullptr; |
| + Variable *VecReg = nullptr; |
| const uint32_t SpreadValue = |
| (ValValue << 24) | (ValValue << 16) | (ValValue << 8) | ValValue; |
| - Variable *VecReg = nullptr; |
| auto lowerSet = [this, &Base, SpreadValue, &VecReg](Type Ty, |
| - uint32_t OffsetAmt) { |
| + uint32_t OffsetAmt) { |
| assert(Base != nullptr); |
| Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr; |
| @@ -3631,7 +3684,7 @@ void TargetX86Base<Machine>::lowerMemset(Operand *Dest, Operand *Val, |
| if (isVectorType(Ty)) { |
| assert(VecReg != nullptr); |
| _storep(VecReg, Mem); |
| - } else if (Ty == IceType_i64) { |
| + } else if (Ty == IceType_f64) { |
| assert(VecReg != nullptr); |
| _storeq(VecReg, Mem); |
| } else { |
| @@ -3639,63 +3692,45 @@ void TargetX86Base<Machine>::lowerMemset(Operand *Dest, Operand *Val, |
| } |
| }; |
| - // Lowers the assignment to the remaining bytes. Assumes the original size |
| - // was large enough to allow for overlaps. |
| - auto lowerLeftOvers = [this, lowerSet, CountValue](uint32_t Size) { |
| - if (Size > BytesPerStoreq) { |
| - lowerSet(IceType_v16i8, CountValue - BytesPerStorep); |
| - } else if (Size > BytesPerStorei32) { |
| - lowerSet(IceType_i64, CountValue - BytesPerStoreq); |
| - } else if (Size > BytesPerStorei16) { |
| - lowerSet(IceType_i32, CountValue - BytesPerStorei32); |
| - } else if (Size > BytesPerStorei8) { |
| - lowerSet(IceType_i16, CountValue - BytesPerStorei16); |
| - } else if (Size == BytesPerStorei8) { |
| - lowerSet(IceType_i8, CountValue - BytesPerStorei8); |
| - } |
| - }; |
| - |
| - // When the value is zero it can be loaded into a vector register cheaply |
| - // using the xor trick. |
| + // Find the largest type that can be used and use it as much as possible in |
| + // reverse order. Then handle any remainder with overlapping copies. Since |
| + // the remainder will be at the end, there will be reduced pressure on the |
| + // memory unit as the accesses to the same memory are far apart. |
| + Type Ty; |
| if (ValValue == 0 && CountValue >= BytesPerStoreq && |
| CountValue <= BytesPerStorep * UNROLL_LIMIT) { |
| + // When the value is zero it can be loaded into a vector register cheaply |
| + // using the xor trick. |
| Base = legalizeToReg(Dest); |
| VecReg = makeVectorOfZeros(IceType_v16i8); |
| + Ty = largestTypeInSize(CountValue); |
| + } else if (CountValue <= BytesPerStorei32 * UNROLL_LIMIT) { |
| + // When the value is non-zero or the count is small we can't use vector |
| + // instructions so are limited to 32-bit stores. |
|
jvoung (off chromium)
2015/08/18 17:18:22
"so are limited" ?
ascull
2015/08/18 18:38:01
Done.
|
| + Base = legalizeToReg(Dest); |
| + constexpr uint32_t MaxSize = 4; |
| + Ty = largestTypeInSize(CountValue, MaxSize); |
| + } |
| - // Too small to use large vector operations so use small ones instead |
| - if (CountValue < BytesPerStorep) { |
| - lowerSet(IceType_i64, 0); |
| - lowerLeftOvers(CountValue - BytesPerStoreq); |
| - return; |
| - } |
| - |
| - // Use large vector operations |
| - for (uint32_t N = CountValue & 0xFFFFFFF0; N != 0;) { |
| - N -= 16; |
| - lowerSet(IceType_v16i8, N); |
| + if (Base) { |
| + uint32_t TyWidth = typeWidthInBytes(Ty); |
| + |
| + uint32_t RemainingBytes = CountValue; |
| + uint32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth; |
| + while (RemainingBytes >= TyWidth) { |
| + lowerSet(Ty, Offset); |
| + RemainingBytes -= TyWidth; |
| + Offset -= TyWidth; |
| } |
| - lowerLeftOvers(CountValue & 0xF); |
| - return; |
| - } |
| - // TODO(ascull): load val into reg and select subregs e.g. eax, ax, al? |
| - if (CountValue <= BytesPerStorei32 * UNROLL_LIMIT) { |
| - Base = legalizeToReg(Dest); |
| - // 3 is the awkward size as it is too small for the vector or 32-bit |
| - // operations and will not work with lowerLeftOvers as there is no valid |
| - // overlap. |
| - if (CountValue == 3) { |
| - lowerSet(IceType_i16, 0); |
| - lowerSet(IceType_i8, 2); |
| + if (RemainingBytes == 0) |
| return; |
| - } |
| - // TODO(ascull); 64-bit can do better with 64-bit mov |
| - for (uint32_t N = CountValue & 0xFFFFFFFC; N != 0;) { |
| - N -= 4; |
| - lowerSet(IceType_i32, N); |
| - } |
| - lowerLeftOvers(CountValue & 0x3); |
| + // Lower the remaining bytes. Adjust to larger types in order to make use |
| + // of overlaps in the copies. |
| + Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes); |
| + Offset = CountValue - typeWidthInBytes(LeftOverTy); |
| + lowerSet(LeftOverTy, Offset); |
| return; |
| } |
| } |
| @@ -4916,6 +4951,34 @@ Variable *TargetX86Base<Machine>::makeReg(Type Type, int32_t RegNum) { |
| return Reg; |
| } |
| +template <class Machine> |
| +const Type TargetX86Base<Machine>::TypeForSize[] = { |
| + IceType_i8, IceType_i16, IceType_i32, |
| + (Traits::Is64Bit ? IceType_i64 : IceType_f64), IceType_v16i8}; |
| +template <class Machine> |
| +Type TargetX86Base<Machine>::largestTypeInSize(uint32_t Size, |
| + uint32_t MaxSize) { |
| + assert(Size != 0); |
| + uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined); |
| + uint32_t MaxIndex = MaxSize != NoSizeLimit |
| + ? llvm::findLastSet(MaxSize, llvm::ZB_Undefined) |
| + : llvm::array_lengthof(TypeForSize) - 1; |
| + return TypeForSize[std::min(TyIndex, MaxIndex)]; |
| +} |
| + |
| +template <class Machine> |
| +Type TargetX86Base<Machine>::firstTypeThatFitsSize(uint32_t Size, |
| + uint32_t MaxSize) { |
| + assert(Size != 0); |
| + uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined); |
| + if (!llvm::isPowerOf2_32(Size)) |
| + ++TyIndex; |
| + uint32_t MaxIndex = MaxSize != NoSizeLimit |
| + ? llvm::findLastSet(MaxSize, llvm::ZB_Undefined) |
| + : llvm::array_lengthof(TypeForSize) - 1; |
| + return TypeForSize[std::min(TyIndex, MaxIndex)]; |
| +} |
| + |
| template <class Machine> void TargetX86Base<Machine>::postLower() { |
| if (Ctx->getFlags().getOptLevel() == Opt_m1) |
| return; |