Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(317)

Side by Side Diff: src/IceTargetLoweringX86BaseImpl.h

Issue 1255053008: Inline memset when there is a constant value and count. (Closed) Base URL: https://chromium.googlesource.com/native_client/pnacl-subzero.git@master
Patch Set: Created 5 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/IceTargetLoweringX86Base.h ('k') | tests_lit/llvm2ice_tests/nacl-mem-intrinsics.ll » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD | NEW
1 //===- subzero/src/IceTargetLoweringX86BaseImpl.h - x86 lowering -*- C++ -*-==// 1 //===- subzero/src/IceTargetLoweringX86BaseImpl.h - x86 lowering -*- C++ -*-==//
2 // 2 //
3 // The Subzero Code Generator 3 // The Subzero Code Generator
4 // 4 //
5 // This file is distributed under the University of Illinois Open Source 5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details. 6 // License. See LICENSE.TXT for details.
7 // 7 //
8 //===----------------------------------------------------------------------===// 8 //===----------------------------------------------------------------------===//
9 /// 9 ///
10 /// \file 10 /// \file
(...skipping 3528 matching lines...) Expand 10 before | Expand all | Expand 10 after
3539 } 3539 }
3540 case Intrinsics::Memmove: { 3540 case Intrinsics::Memmove: {
3541 InstCall *Call = makeHelperCall(H_call_memmove, nullptr, 3); 3541 InstCall *Call = makeHelperCall(H_call_memmove, nullptr, 3);
3542 Call->addArg(Instr->getArg(0)); 3542 Call->addArg(Instr->getArg(0));
3543 Call->addArg(Instr->getArg(1)); 3543 Call->addArg(Instr->getArg(1));
3544 Call->addArg(Instr->getArg(2)); 3544 Call->addArg(Instr->getArg(2));
3545 lowerCall(Call); 3545 lowerCall(Call);
3546 return; 3546 return;
3547 } 3547 }
3548 case Intrinsics::Memset: { 3548 case Intrinsics::Memset: {
3549 // The value operand needs to be extended to a stack slot size because the 3549 lowerMemset(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
3550 // PNaCl ABI requires arguments to be at least 32 bits wide.
3551 Operand *ValOp = Instr->getArg(1);
3552 assert(ValOp->getType() == IceType_i8);
3553 Variable *ValExt = Func->makeVariable(stackSlotType());
3554 lowerCast(InstCast::create(Func, InstCast::Zext, ValExt, ValOp));
3555 InstCall *Call = makeHelperCall(H_call_memset, nullptr, 3);
3556 Call->addArg(Instr->getArg(0));
3557 Call->addArg(ValExt);
3558 Call->addArg(Instr->getArg(2));
3559 lowerCall(Call);
3560 return; 3550 return;
3561 } 3551 }
3562 case Intrinsics::NaClReadTP: { 3552 case Intrinsics::NaClReadTP: {
3563 if (Ctx->getFlags().getUseSandboxing()) { 3553 if (Ctx->getFlags().getUseSandboxing()) {
3564 Operand *Src = dispatchToConcrete(&Machine::createNaClReadTPSrcOperand); 3554 Operand *Src = dispatchToConcrete(&Machine::createNaClReadTPSrcOperand);
3565 Variable *Dest = Instr->getDest(); 3555 Variable *Dest = Instr->getDest();
3566 Variable *T = nullptr; 3556 Variable *T = nullptr;
3567 _mov(T, Src); 3557 _mov(T, Src);
3568 _mov(Dest, T); 3558 _mov(Dest, T);
3569 } else { 3559 } else {
(...skipping 418 matching lines...) Expand 10 before | Expand all | Expand 10 after
3988 _bsr(T_Dest2, SecondVar); 3978 _bsr(T_Dest2, SecondVar);
3989 _xor(T_Dest2, ThirtyOne); 3979 _xor(T_Dest2, ThirtyOne);
3990 } 3980 }
3991 _test(SecondVar, SecondVar); 3981 _test(SecondVar, SecondVar);
3992 _cmov(T_Dest2, T_Dest, Traits::Cond::Br_e); 3982 _cmov(T_Dest2, T_Dest, Traits::Cond::Br_e);
3993 _mov(DestLo, T_Dest2); 3983 _mov(DestLo, T_Dest2);
3994 _mov(DestHi, Ctx->getConstantZero(IceType_i32)); 3984 _mov(DestHi, Ctx->getConstantZero(IceType_i32));
3995 } 3985 }
3996 3986
3997 template <class Machine> 3987 template <class Machine>
3988 void TargetX86Base<Machine>::lowerMemset(Operand *Dest, Operand *Val,
3989 Operand *Count) {
3990 constexpr uint32_t UNROLL_LIMIT = 16;
3991 assert(Val->getType() == IceType_i8);
3992
3993 // Check if the operands are constants
3994 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
3995 const auto *ValConst = llvm::dyn_cast<const ConstantInteger32>(Val);
3996 const bool IsCountConst = CountConst != nullptr;
3997 const bool IsValConst = ValConst != nullptr;
3998 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
3999 const uint32_t ValValue = IsValConst ? ValConst->getValue() : 0;
4000
4001 // Unlikely, but nothing to do if it does happen
4002 if (IsCountConst && CountValue == 0)
4003 return;
4004
4005 // TODO(ascull): if the count is constant but val is not it would be possible
4006 // to inline by spreading the value across 4 bytes and accessing subregs e.g.
4007 // eax, ax and al.
4008 if (IsCountConst && IsValConst) {
4009 Variable *Base = legalizeToReg(Dest);
4010
4011 // 3 is the awkward size as it is too small for the vector or 32-bit
4012 // operations and will not work with lowerLeftOvers as there is no valid
4013 // overlap.
4014 if (CountValue == 3) {
4015 Constant *Offset = nullptr;
4016 auto *Mem =
4017 Traits::X86OperandMem::create(Func, IceType_i16, Base, Offset);
4018 _store(Ctx->getConstantInt16((ValValue << 8) | ValValue), Mem);
4019
4020 Offset = Ctx->getConstantInt8(2);
4021 Mem = Traits::X86OperandMem::create(Func, IceType_i8, Base, Offset);
4022 _store(Ctx->getConstantInt8(ValValue), Mem);
4023 return;
4024 }
4025
4026 // Lowers the assignment to the remaining bytes. Assumes the original size
4027 // was large enough to allow for overlaps.
4028 auto lowerLeftOvers = [this, Base, CountValue](
4029 uint32_t SpreadValue, uint32_t Size, Variable *VecReg) {
4030 auto lowerStoreSpreadValue =
4031 [this, Base, CountValue, SpreadValue](Type Ty) {
4032 Constant *Offset =
4033 Ctx->getConstantInt32(CountValue - typeWidthInBytes(Ty));
4034 auto *Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset);
4035 _store(Ctx->getConstantInt(Ty, SpreadValue), Mem);
4036 };
4037
4038 if (Size > 8) {
4039 assert(VecReg != nullptr);
4040 Constant *Offset = Ctx->getConstantInt32(CountValue - 16);
4041 auto *Mem = Traits::X86OperandMem::create(Func, VecReg->getType(), Base,
4042 Offset);
4043 _storep(VecReg, Mem);
4044 } else if (Size > 4) {
4045 assert(VecReg != nullptr);
4046 Constant *Offset = Ctx->getConstantInt32(CountValue - 8);
4047 auto *Mem =
4048 Traits::X86OperandMem::create(Func, IceType_i64, Base, Offset);
4049 _storeq(VecReg, Mem);
4050 } else if (Size > 2) {
4051 lowerStoreSpreadValue(IceType_i32);
4052 } else if (Size > 1) {
4053 lowerStoreSpreadValue(IceType_i16);
4054 } else if (Size == 1) {
4055 lowerStoreSpreadValue(IceType_i8);
4056 }
4057 };
4058
4059 // When the value is zero it can be loaded into a register cheaply using
4060 // the xor trick.
4061 if (ValValue == 0 && CountValue >= 8 && CountValue <= 16 * UNROLL_LIMIT) {
jvoung (off chromium) 2015/08/04 21:17:36 Would it make sense/look good to "document" the "1
4062 Variable *Zero = makeVectorOfZeros(IceType_v16i8);
4063
4064 // Too small to use large vector operations so use small ones instead
4065 if (CountValue < 16) {
4066 Constant *Offset = nullptr;
4067 auto *Mem =
4068 Traits::X86OperandMem::create(Func, IceType_i64, Base, Offset);
4069 _storeq(Zero, Mem);
4070 lowerLeftOvers(0, CountValue - 8, Zero);
4071 return;
4072 }
4073
4074 assert(CountValue >= 16);
4075 // Use large vector operations
4076 for (uint32_t N = CountValue & 0xFFFFFFF0; N != 0;) {
4077 N -= 16;
4078 Constant *Offset = Ctx->getConstantInt32(N);
4079 auto *Mem =
4080 Traits::X86OperandMem::create(Func, Zero->getType(), Base, Offset);
4081 _storep(Zero, Mem);
4082 }
4083 uint32_t LeftOver = CountValue & 0xF;
4084 lowerLeftOvers(0, LeftOver, Zero);
4085 return;
4086 }
4087
4088 // TODO(ascull): load val into reg and select subregs e.g. eax, ax, al?
4089 if (CountValue <= 4 * UNROLL_LIMIT) {
4090 // TODO(ascull): 64-bit can do better with 64-bit mov
4091 uint32_t SpreadValue =
4092 (ValValue << 24) | (ValValue << 16) | (ValValue << 8) | ValValue;
4093 if (CountValue >= 4) {
4094 Constant *ValueConst = Ctx->getConstantInt32(SpreadValue);
4095 for (uint32_t N = CountValue & 0xFFFFFFFC; N != 0;) {
4096 N -= 4;
4097 Constant *Offset = Ctx->getConstantInt32(N);
4098 auto *Mem =
4099 Traits::X86OperandMem::create(Func, IceType_i32, Base, Offset);
4100 _store(ValueConst, Mem);
4101 }
4102 }
4103 uint32_t LeftOver = CountValue & 0x3;
4104 lowerLeftOvers(SpreadValue, LeftOver, nullptr);
4105 return;
4106 }
4107 }
4108
4109 // Fall back on calling the memset function. The value operand needs to be
4110 // extended to a stack slot size because the PNaCl ABI requires arguments to
4111 // be at least 32 bits wide.
4112 Operand *ValExt;
4113 if (IsValConst) {
4114 ValExt = Ctx->getConstantInt(stackSlotType(), ValValue);
4115 } else {
4116 Variable *ValExtVar = Func->makeVariable(stackSlotType());
4117 lowerCast(InstCast::create(Func, InstCast::Zext, ValExtVar, Val));
4118 ValExt = ValExtVar;
4119 }
4120 InstCall *Call = makeHelperCall(H_call_memset, nullptr, 3);
4121 Call->addArg(Dest);
4122 Call->addArg(ValExt);
4123 Call->addArg(Count);
4124 lowerCall(Call);
4125 }
4126
4127 template <class Machine>
3998 void TargetX86Base<Machine>::lowerIndirectJump(Variable *Target) { 4128 void TargetX86Base<Machine>::lowerIndirectJump(Variable *Target) {
3999 const bool NeedSandboxing = Ctx->getFlags().getUseSandboxing(); 4129 const bool NeedSandboxing = Ctx->getFlags().getUseSandboxing();
4000 if (NeedSandboxing) { 4130 if (NeedSandboxing) {
4001 _bundle_lock(); 4131 _bundle_lock();
4002 const SizeT BundleSize = 4132 const SizeT BundleSize =
4003 1 << Func->getAssembler<>()->getBundleAlignLog2Bytes(); 4133 1 << Func->getAssembler<>()->getBundleAlignLog2Bytes();
4004 _and(Target, Ctx->getConstantInt32(~(BundleSize - 1))); 4134 _and(Target, Ctx->getConstantInt32(~(BundleSize - 1)));
4005 } 4135 }
4006 _jmp(Target); 4136 _jmp(Target);
4007 if (NeedSandboxing) 4137 if (NeedSandboxing)
(...skipping 1464 matching lines...) Expand 10 before | Expand all | Expand 10 after
5472 } 5602 }
5473 // the offset is not eligible for blinding or pooling, return the original 5603 // the offset is not eligible for blinding or pooling, return the original
5474 // mem operand 5604 // mem operand
5475 return MemOperand; 5605 return MemOperand;
5476 } 5606 }
5477 5607
5478 } // end of namespace X86Internal 5608 } // end of namespace X86Internal
5479 } // end of namespace Ice 5609 } // end of namespace Ice
5480 5610
5481 #endif // SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H 5611 #endif // SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H
OLD | NEW
« no previous file with comments | « src/IceTargetLoweringX86Base.h ('k') | tests_lit/llvm2ice_tests/nacl-mem-intrinsics.ll » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698