Index: lib/Target/R600/AMDILPeepholeOptimizer.cpp |
diff --git a/lib/Target/R600/AMDILPeepholeOptimizer.cpp b/lib/Target/R600/AMDILPeepholeOptimizer.cpp |
new file mode 100644 |
index 0000000000000000000000000000000000000000..3a28038666f716f3d43dad7c66677fac670d9289 |
--- /dev/null |
+++ b/lib/Target/R600/AMDILPeepholeOptimizer.cpp |
@@ -0,0 +1,1215 @@ |
+//===-- AMDILPeepholeOptimizer.cpp - AMDGPU Peephole optimizations ---------===// |
+// |
+// The LLVM Compiler Infrastructure |
+// |
+// This file is distributed under the University of Illinois Open Source |
+// License. See LICENSE.TXT for details. |
+// |
+/// \file |
+//==-----------------------------------------------------------------------===// |
+ |
+#define DEBUG_TYPE "PeepholeOpt" |
+#ifdef DEBUG |
+#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) |
+#else |
+#define DEBUGME 0 |
+#endif |
+ |
+#include "AMDILDevices.h" |
+#include "AMDGPUInstrInfo.h" |
+#include "llvm/ADT/Statistic.h" |
+#include "llvm/ADT/StringExtras.h" |
+#include "llvm/ADT/StringRef.h" |
+#include "llvm/ADT/Twine.h" |
+#include "llvm/IR/Constants.h" |
+#include "llvm/CodeGen/MachineFunction.h" |
+#include "llvm/CodeGen/MachineFunctionAnalysis.h" |
+#include "llvm/IR/Function.h" |
+#include "llvm/IR/Instructions.h" |
+#include "llvm/IR/Module.h" |
+#include "llvm/Support/Debug.h" |
+#include "llvm/Support/MathExtras.h" |
+ |
+#include <sstream> |
+ |
+#if 0 |
+STATISTIC(PointerAssignments, "Number of dynamic pointer "
+          "assignments discovered");
+STATISTIC(PointerSubtract, "Number of pointer subtractions discovered"); |
+#endif |
+ |
+using namespace llvm; |
+// The peephole optimization pass is used to do simple last-minute optimizations
+// that are required for correct code or to remove redundant functions.
+namespace { |
+ |
+class OpaqueType; |
+ |
+class LLVM_LIBRARY_VISIBILITY AMDGPUPeepholeOpt : public FunctionPass { |
+public: |
+ TargetMachine &TM; |
+ static char ID; |
+ AMDGPUPeepholeOpt(TargetMachine &tm); |
+ ~AMDGPUPeepholeOpt(); |
+ const char *getPassName() const; |
+ bool runOnFunction(Function &F); |
+ bool doInitialization(Module &M); |
+ bool doFinalization(Module &M); |
+ void getAnalysisUsage(AnalysisUsage &AU) const; |
+protected: |
+private: |
+ // Function to initiate all of the instruction level optimizations. |
+ bool instLevelOptimizations(BasicBlock::iterator *inst); |
+  // Quick check to see if we need to dump all of the pointers into the
+  // arena. If so, then all pointers are set to exist in the arena. This is a
+  // workaround for aliasing of pointers in a struct/union.
+ bool dumpAllIntoArena(Function &F); |
+  // Because we don't want to invalidate any pointers while inside
+  // safeNestedForEach, atomic conversions are pushed to a vector and handled
+  // later. This function does the conversions if required.
+ void doAtomicConversionIfNeeded(Function &F); |
+  // Because __amdil_is_constant cannot be properly evaluated if
+  // optimizations are disabled, the calls are placed in a vector
+  // and evaluated after the __amdil_image* functions are evaluated,
+  // which should allow the __amdil_is_constant function to be
+  // evaluated correctly.
+ void doIsConstCallConversionIfNeeded(); |
+ bool mChanged; |
+ bool mDebug; |
+ bool mConvertAtomics; |
+ CodeGenOpt::Level optLevel; |
+ // Run a series of tests to see if we can optimize a CALL instruction. |
+ bool optimizeCallInst(BasicBlock::iterator *bbb); |
+ // A peephole optimization to optimize bit extract sequences. |
+ bool optimizeBitExtract(Instruction *inst); |
+ // A peephole optimization to optimize bit insert sequences. |
+ bool optimizeBitInsert(Instruction *inst); |
+ bool setupBitInsert(Instruction *base, |
+ Instruction *&src, |
+ Constant *&mask, |
+ Constant *&shift); |
+ // Expand the bit field insert instruction on versions of OpenCL that |
+ // don't support it. |
+ bool expandBFI(CallInst *CI); |
+  // Expand the bit field mask instruction on versions of OpenCL that
+ // don't support it. |
+ bool expandBFM(CallInst *CI); |
+  // 7XX and 8XX hardware does not have 24-bit signed operations, so in that
+  // case they need to be expanded. These functions check for 24-bit functions
+  // and then expand them.
+ bool isSigned24BitOps(CallInst *CI); |
+ void expandSigned24BitOps(CallInst *CI); |
+  // One optimization that can occur is that if the required workgroup size is
+  // specified, then the result of get_local_size is known at compile time and
+  // can be returned accordingly.
+ bool isRWGLocalOpt(CallInst *CI); |
+  // On Northern Islands cards the division is slightly less accurate than on
+  // previous generations, so the more accurate divide must be kept there. On
+  // all other cards the accurate divide can be translated to a normal divide.
+ bool convertAccurateDivide(CallInst *CI); |
+ void expandAccurateDivide(CallInst *CI); |
+ // If the alignment is set incorrectly, it can produce really inefficient |
+ // code. This checks for this scenario and fixes it if possible. |
+ bool correctMisalignedMemOp(Instruction *inst); |
+ |
+  // If we are in no-opt mode, then we need to make sure that
+  // local samplers are properly propagated, as constant propagation
+  // doesn't occur and we need to know the value of kernel-defined
+  // samplers at compile time.
+ bool propagateSamplerInst(CallInst *CI); |
+ |
+ // Helper functions |
+ |
+  // Group of functions that recursively calculate the size of a structure based
+  // on its sub-types.
+ size_t getTypeSize(Type * const T, bool dereferencePtr = false); |
+ size_t getTypeSize(StructType * const ST, bool dereferencePtr = false); |
+ size_t getTypeSize(IntegerType * const IT, bool dereferencePtr = false); |
+  size_t getTypeSize(FunctionType * const FT, bool dereferencePtr = false);
+ size_t getTypeSize(ArrayType * const AT, bool dereferencePtr = false); |
+ size_t getTypeSize(VectorType * const VT, bool dereferencePtr = false); |
+ size_t getTypeSize(PointerType * const PT, bool dereferencePtr = false); |
+ size_t getTypeSize(OpaqueType * const OT, bool dereferencePtr = false); |
+ |
+ LLVMContext *mCTX; |
+ Function *mF; |
+ const AMDGPUSubtarget *mSTM; |
+ SmallVector< std::pair<CallInst *, Function *>, 16> atomicFuncs; |
+ SmallVector<CallInst *, 16> isConstVec; |
+}; // class AMDGPUPeepholeOpt |
+char AMDGPUPeepholeOpt::ID = 0;
+ |
+// A template function that has two levels of looping before calling the |
+// function with a pointer to the current iterator. |
+template<class InputIterator, class SecondIterator, class Function> |
+Function safeNestedForEach(InputIterator First, InputIterator Last, |
+ SecondIterator S, Function F) { |
+ for ( ; First != Last; ++First) { |
+ SecondIterator sf, sl; |
+ for (sf = First->begin(), sl = First->end(); |
+ sf != sl; ) { |
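+      // The callback reports true when it has already advanced (or erased)
+      // the current iterator, so we only step forward when it returns false.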
+ if (!F(&sf)) { |
+ ++sf; |
+ } |
+ } |
+ } |
+ return F; |
+} |
+ |
+} // anonymous namespace |
+ |
+namespace llvm { |
+ FunctionPass * |
+ createAMDGPUPeepholeOpt(TargetMachine &tm) { |
+ return new AMDGPUPeepholeOpt(tm); |
+ } |
+} // llvm namespace |
+ |
+AMDGPUPeepholeOpt::AMDGPUPeepholeOpt(TargetMachine &tm) |
+ : FunctionPass(ID), TM(tm) { |
+ mDebug = DEBUGME; |
+ optLevel = TM.getOptLevel(); |
+ |
+} |
+ |
+AMDGPUPeepholeOpt::~AMDGPUPeepholeOpt() { |
+} |
+ |
+const char * |
+AMDGPUPeepholeOpt::getPassName() const { |
+ return "AMDGPU PeepHole Optimization Pass"; |
+} |
+ |
+bool |
+containsPointerType(Type *Ty) { |
+ if (!Ty) { |
+ return false; |
+ } |
+ switch(Ty->getTypeID()) { |
+ default: |
+ return false; |
+ case Type::StructTyID: { |
+ const StructType *ST = dyn_cast<StructType>(Ty); |
+ for (StructType::element_iterator stb = ST->element_begin(), |
+ ste = ST->element_end(); stb != ste; ++stb) { |
+ if (!containsPointerType(*stb)) { |
+ continue; |
+ } |
+ return true; |
+ } |
+ break; |
+ } |
+ case Type::VectorTyID: |
+ case Type::ArrayTyID: |
+ return containsPointerType(dyn_cast<SequentialType>(Ty)->getElementType()); |
+ case Type::PointerTyID: |
+ return true; |
+ }; |
+ return false; |
+} |
+ |
+bool |
+AMDGPUPeepholeOpt::dumpAllIntoArena(Function &F) { |
+ bool dumpAll = false; |
+ for (Function::const_arg_iterator cab = F.arg_begin(), |
+ cae = F.arg_end(); cab != cae; ++cab) { |
+ const Argument *arg = cab; |
+ const PointerType *PT = dyn_cast<PointerType>(arg->getType()); |
+ if (!PT) { |
+ continue; |
+ } |
+ Type *DereferencedType = PT->getElementType(); |
+    if (!dyn_cast<StructType>(DereferencedType)) {
+ continue; |
+ } |
+ if (!containsPointerType(DereferencedType)) { |
+ continue; |
+ } |
+ // FIXME: Because a pointer inside of a struct/union may be aliased to |
+ // another pointer we need to take the conservative approach and place all |
+ // pointers into the arena until more advanced detection is implemented. |
+ dumpAll = true; |
+ } |
+ return dumpAll; |
+} |
+void |
+AMDGPUPeepholeOpt::doIsConstCallConversionIfNeeded() { |
+ if (isConstVec.empty()) { |
+ return; |
+ } |
+ for (unsigned x = 0, y = isConstVec.size(); x < y; ++x) { |
+ CallInst *CI = isConstVec[x]; |
+ Constant *CV = dyn_cast<Constant>(CI->getOperand(0)); |
+ Type *aType = Type::getInt32Ty(*mCTX); |
+ Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1) |
+ : ConstantInt::get(aType, 0); |
+ CI->replaceAllUsesWith(Val); |
+ CI->eraseFromParent(); |
+ } |
+ isConstVec.clear(); |
+} |
+void |
+AMDGPUPeepholeOpt::doAtomicConversionIfNeeded(Function &F) { |
+ // Don't do anything if we don't have any atomic operations. |
+ if (atomicFuncs.empty()) { |
+ return; |
+ } |
+ // Change the function name for the atomic if it is required |
+ uint32_t size = atomicFuncs.size(); |
+ for (uint32_t x = 0; x < size; ++x) { |
+ atomicFuncs[x].first->setOperand( |
+ atomicFuncs[x].first->getNumOperands()-1, |
+ atomicFuncs[x].second); |
+ |
+ } |
+ mChanged = true; |
+ if (mConvertAtomics) { |
+ return; |
+ } |
+} |
+ |
+bool |
+AMDGPUPeepholeOpt::runOnFunction(Function &MF) { |
+ mChanged = false; |
+ mF = &MF; |
+ mSTM = &TM.getSubtarget<AMDGPUSubtarget>(); |
+ if (mDebug) { |
+ MF.dump(); |
+ } |
+ mCTX = &MF.getType()->getContext(); |
+ mConvertAtomics = true; |
+ safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(), |
+ std::bind1st(std::mem_fun(&AMDGPUPeepholeOpt::instLevelOptimizations), |
+ this)); |
+ |
+ doAtomicConversionIfNeeded(MF); |
+ doIsConstCallConversionIfNeeded(); |
+ |
+ if (mDebug) { |
+ MF.dump(); |
+ } |
+ return mChanged; |
+} |
+ |
+bool |
+AMDGPUPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb) { |
+ Instruction *inst = (*bbb); |
+ CallInst *CI = dyn_cast<CallInst>(inst); |
+ if (!CI) { |
+ return false; |
+ } |
+ if (isSigned24BitOps(CI)) { |
+ expandSigned24BitOps(CI); |
+ ++(*bbb); |
+ CI->eraseFromParent(); |
+ return true; |
+ } |
+ if (propagateSamplerInst(CI)) { |
+ return false; |
+ } |
+ if (expandBFI(CI) || expandBFM(CI)) { |
+ ++(*bbb); |
+ CI->eraseFromParent(); |
+ return true; |
+ } |
+ if (convertAccurateDivide(CI)) { |
+ expandAccurateDivide(CI); |
+ ++(*bbb); |
+ CI->eraseFromParent(); |
+ return true; |
+ } |
+ |
+ StringRef calleeName = CI->getOperand(CI->getNumOperands()-1)->getName(); |
+ if (calleeName.startswith("__amdil_is_constant")) { |
+    // If we do not have optimizations, then this
+    // cannot be properly evaluated, so the call
+    // instruction is added to a vector and processed
+    // at the end of processing, after the
+    // samplers have been correctly handled.
+ if (optLevel == CodeGenOpt::None) { |
+ isConstVec.push_back(CI); |
+ return false; |
+ } else { |
+ Constant *CV = dyn_cast<Constant>(CI->getOperand(0)); |
+ Type *aType = Type::getInt32Ty(*mCTX); |
+ Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1) |
+ : ConstantInt::get(aType, 0); |
+ CI->replaceAllUsesWith(Val); |
+ ++(*bbb); |
+ CI->eraseFromParent(); |
+ return true; |
+ } |
+ } |
+ |
+ if (calleeName.equals("__amdil_is_asic_id_i32")) { |
+ ConstantInt *CV = dyn_cast<ConstantInt>(CI->getOperand(0)); |
+ Type *aType = Type::getInt32Ty(*mCTX); |
+ Value *Val = CV; |
+ if (Val) { |
+ Val = ConstantInt::get(aType, |
+ mSTM->device()->getDeviceFlag() & CV->getZExtValue()); |
+ } else { |
+ Val = ConstantInt::get(aType, 0); |
+ } |
+ CI->replaceAllUsesWith(Val); |
+ ++(*bbb); |
+ CI->eraseFromParent(); |
+ return true; |
+ } |
+ Function *F = dyn_cast<Function>(CI->getOperand(CI->getNumOperands()-1)); |
+ if (!F) { |
+ return false; |
+ } |
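+  // Result-less, non-exchange atomics can be redirected to a '_noret' variant
+  // below; e.g. (hypothetical builtin name) an unused call to __atom_add_gi32
+  // would be rewritten to call __atom_add_gi32_noret.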
+ if (F->getName().startswith("__atom") && !CI->getNumUses() |
+ && F->getName().find("_xchg") == StringRef::npos) { |
+ std::string buffer(F->getName().str() + "_noret"); |
+ F = dyn_cast<Function>( |
+ F->getParent()->getOrInsertFunction(buffer, F->getFunctionType())); |
+ atomicFuncs.push_back(std::make_pair(CI, F)); |
+ } |
+ |
+ if (!mSTM->device()->isSupported(AMDGPUDeviceInfo::ArenaSegment) |
+ && !mSTM->device()->isSupported(AMDGPUDeviceInfo::MultiUAV)) { |
+ return false; |
+ } |
+ if (!mConvertAtomics) { |
+ return false; |
+ } |
+ StringRef name = F->getName(); |
+ if (name.startswith("__atom") && name.find("_g") != StringRef::npos) { |
+ mConvertAtomics = false; |
+ } |
+ return false; |
+} |
+ |
+bool |
+AMDGPUPeepholeOpt::setupBitInsert(Instruction *base, |
+ Instruction *&src, |
+ Constant *&mask, |
+ Constant *&shift) { |
+ if (!base) { |
+ if (mDebug) { |
+ dbgs() << "Null pointer passed into function.\n"; |
+ } |
+ return false; |
+ } |
+ bool andOp = false; |
+ if (base->getOpcode() == Instruction::Shl) { |
+ shift = dyn_cast<Constant>(base->getOperand(1)); |
+ } else if (base->getOpcode() == Instruction::And) { |
+ mask = dyn_cast<Constant>(base->getOperand(1)); |
+ andOp = true; |
+ } else { |
+ if (mDebug) { |
+ dbgs() << "Failed setup with no Shl or And instruction on base opcode!\n"; |
+ } |
+    // If the base is neither a Shl nor an And, we don't fit any of the patterns above.
+ return false; |
+ } |
+ src = dyn_cast<Instruction>(base->getOperand(0)); |
+ if (!src) { |
+ if (mDebug) { |
+ dbgs() << "Failed setup since the base operand is not an instruction!\n"; |
+ } |
+ return false; |
+ } |
+ // If we find an 'and' operation, then we don't need to |
+ // find the next operation as we already know the |
+ // bits that are valid at this point. |
+ if (andOp) { |
+ return true; |
+ } |
+ if (src->getOpcode() == Instruction::Shl && !shift) { |
+ shift = dyn_cast<Constant>(src->getOperand(1)); |
+ src = dyn_cast<Instruction>(src->getOperand(0)); |
+ } else if (src->getOpcode() == Instruction::And && !mask) { |
+ mask = dyn_cast<Constant>(src->getOperand(1)); |
+ } |
+ if (!mask && !shift) { |
+ if (mDebug) { |
+ dbgs() << "Failed setup since both mask and shift are NULL!\n"; |
+ } |
+ // Did not find a constant mask or a shift. |
+ return false; |
+ } |
+ return true; |
+} |
+bool |
+AMDGPUPeepholeOpt::optimizeBitInsert(Instruction *inst) { |
+ if (!inst) { |
+ return false; |
+ } |
+ if (!inst->isBinaryOp()) { |
+ return false; |
+ } |
+ if (inst->getOpcode() != Instruction::Or) { |
+ return false; |
+ } |
+ if (optLevel == CodeGenOpt::None) { |
+ return false; |
+ } |
+ // We want to do an optimization on a sequence of ops that in the end equals a |
+ // single ISA instruction. |
+  // The base pattern for this optimization is: ((A & B) << C) | ((D & E) << F)
+ // Some simplified versions of this pattern are as follows: |
+ // (A & B) | (D & E) when B & E == 0 && C == 0 && F == 0 |
+ // ((A & B) << C) | (D & E) when B ^ E == 0 && (1 << C) >= E |
+ // (A & B) | ((D & E) << F) when B ^ E == 0 && (1 << F) >= B |
+ // (A & B) | (D << F) when (1 << F) >= B |
+ // (A << C) | (D & E) when (1 << C) >= E |
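+  // Illustrative case (hypothetical operands): (A & 0xFF) | ((D << 8) & 0xFF00)
+  // writes to disjoint bit ranges, so it can be collapsed into a single
+  // ubit_insert with a width of 8 and an offset of 8.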
+ if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) { |
+ // The HD4XXX hardware doesn't support the ubit_insert instruction. |
+ return false; |
+ } |
+ Type *aType = inst->getType(); |
+ bool isVector = aType->isVectorTy(); |
+ int numEle = 1; |
+ // This optimization only works on 32bit integers. |
+ if (aType->getScalarType() |
+ != Type::getInt32Ty(inst->getContext())) { |
+ return false; |
+ } |
+ if (isVector) { |
+ const VectorType *VT = dyn_cast<VectorType>(aType); |
+ numEle = VT->getNumElements(); |
+    // We currently cannot support more than 4 elements in an intrinsic and we
+    // cannot support Vec3 types.
+ if (numEle > 4 || numEle == 3) { |
+ return false; |
+ } |
+ } |
+ // TODO: Handle vectors. |
+ if (isVector) { |
+ if (mDebug) { |
+ dbgs() << "!!! Vectors are not supported yet!\n"; |
+ } |
+ return false; |
+ } |
+ Instruction *LHSSrc = NULL, *RHSSrc = NULL; |
+ Constant *LHSMask = NULL, *RHSMask = NULL; |
+ Constant *LHSShift = NULL, *RHSShift = NULL; |
+ Instruction *LHS = dyn_cast<Instruction>(inst->getOperand(0)); |
+ Instruction *RHS = dyn_cast<Instruction>(inst->getOperand(1)); |
+ if (!setupBitInsert(LHS, LHSSrc, LHSMask, LHSShift)) { |
+ if (mDebug) { |
+ dbgs() << "Found an OR Operation that failed setup!\n"; |
+ inst->dump(); |
+ if (LHS) { LHS->dump(); } |
+ if (LHSSrc) { LHSSrc->dump(); } |
+ if (LHSMask) { LHSMask->dump(); } |
+ if (LHSShift) { LHSShift->dump(); } |
+ } |
+ // There was an issue with the setup for BitInsert. |
+ return false; |
+ } |
+ if (!setupBitInsert(RHS, RHSSrc, RHSMask, RHSShift)) { |
+ if (mDebug) { |
+ dbgs() << "Found an OR Operation that failed setup!\n"; |
+ inst->dump(); |
+ if (RHS) { RHS->dump(); } |
+ if (RHSSrc) { RHSSrc->dump(); } |
+ if (RHSMask) { RHSMask->dump(); } |
+ if (RHSShift) { RHSShift->dump(); } |
+ } |
+ // There was an issue with the setup for BitInsert. |
+ return false; |
+ } |
+ if (mDebug) { |
+ dbgs() << "Found an OR operation that can possible be optimized to ubit insert!\n"; |
+ dbgs() << "Op: "; inst->dump(); |
+ dbgs() << "LHS: "; if (LHS) { LHS->dump(); } else { dbgs() << "(None)\n"; } |
+ dbgs() << "LHS Src: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(None)\n"; } |
+ dbgs() << "LHS Mask: "; if (LHSMask) { LHSMask->dump(); } else { dbgs() << "(None)\n"; } |
+ dbgs() << "LHS Shift: "; if (LHSShift) { LHSShift->dump(); } else { dbgs() << "(None)\n"; } |
+ dbgs() << "RHS: "; if (RHS) { RHS->dump(); } else { dbgs() << "(None)\n"; } |
+ dbgs() << "RHS Src: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(None)\n"; } |
+ dbgs() << "RHS Mask: "; if (RHSMask) { RHSMask->dump(); } else { dbgs() << "(None)\n"; } |
+ dbgs() << "RHS Shift: "; if (RHSShift) { RHSShift->dump(); } else { dbgs() << "(None)\n"; } |
+ } |
+ Constant *offset = NULL; |
+ Constant *width = NULL; |
+ uint32_t lhsMaskVal = 0, rhsMaskVal = 0; |
+ uint32_t lhsShiftVal = 0, rhsShiftVal = 0; |
+ uint32_t lhsMaskWidth = 0, rhsMaskWidth = 0; |
+ uint32_t lhsMaskOffset = 0, rhsMaskOffset = 0; |
+ lhsMaskVal = (LHSMask |
+ ? dyn_cast<ConstantInt>(LHSMask)->getZExtValue() : 0); |
+ rhsMaskVal = (RHSMask |
+ ? dyn_cast<ConstantInt>(RHSMask)->getZExtValue() : 0); |
+ lhsShiftVal = (LHSShift |
+ ? dyn_cast<ConstantInt>(LHSShift)->getZExtValue() : 0); |
+ rhsShiftVal = (RHSShift |
+ ? dyn_cast<ConstantInt>(RHSShift)->getZExtValue() : 0); |
+ lhsMaskWidth = lhsMaskVal ? CountPopulation_32(lhsMaskVal) : 32 - lhsShiftVal; |
+ rhsMaskWidth = rhsMaskVal ? CountPopulation_32(rhsMaskVal) : 32 - rhsShiftVal; |
+ lhsMaskOffset = lhsMaskVal ? CountTrailingZeros_32(lhsMaskVal) : lhsShiftVal; |
+ rhsMaskOffset = rhsMaskVal ? CountTrailingZeros_32(rhsMaskVal) : rhsShiftVal; |
+  // TODO: Handle the case of A & B | D & ~B (i.e. inverted masks).
+ if ((lhsMaskVal || rhsMaskVal) && !(lhsMaskVal ^ rhsMaskVal)) { |
+ return false; |
+ } |
+ if (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset)) { |
+ offset = ConstantInt::get(aType, lhsMaskOffset, false); |
+ width = ConstantInt::get(aType, lhsMaskWidth, false); |
+ RHSSrc = RHS; |
+ if (!isMask_32(lhsMaskVal) && !isShiftedMask_32(lhsMaskVal)) { |
+ return false; |
+ } |
+ if (!LHSShift) { |
+ LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset, |
+ "MaskShr", LHS); |
+ } else if (lhsShiftVal != lhsMaskOffset) { |
+ LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset, |
+ "MaskShr", LHS); |
+ } |
+ if (mDebug) { |
+ dbgs() << "Optimizing LHS!\n"; |
+ } |
+ } else if (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset)) { |
+ offset = ConstantInt::get(aType, rhsMaskOffset, false); |
+ width = ConstantInt::get(aType, rhsMaskWidth, false); |
+ LHSSrc = RHSSrc; |
+ RHSSrc = LHS; |
+ if (!isMask_32(rhsMaskVal) && !isShiftedMask_32(rhsMaskVal)) { |
+ return false; |
+ } |
+ if (!RHSShift) { |
+ LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset, |
+ "MaskShr", RHS); |
+ } else if (rhsShiftVal != rhsMaskOffset) { |
+ LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset, |
+ "MaskShr", RHS); |
+ } |
+ if (mDebug) { |
+ dbgs() << "Optimizing RHS!\n"; |
+ } |
+ } else { |
+ if (mDebug) { |
+ dbgs() << "Failed constraint 3!\n"; |
+ } |
+ return false; |
+ } |
+ if (mDebug) { |
+ dbgs() << "Width: "; if (width) { width->dump(); } else { dbgs() << "(0)\n"; } |
+ dbgs() << "Offset: "; if (offset) { offset->dump(); } else { dbgs() << "(0)\n"; } |
+ dbgs() << "LHSSrc: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(0)\n"; } |
+ dbgs() << "RHSSrc: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(0)\n"; } |
+ } |
+ if (!offset || !width) { |
+ if (mDebug) { |
+ dbgs() << "Either width or offset are NULL, failed detection!\n"; |
+ } |
+ return false; |
+ } |
+  // Let's create the function signature.
+ std::vector<Type *> callTypes; |
+ callTypes.push_back(aType); |
+ callTypes.push_back(aType); |
+ callTypes.push_back(aType); |
+ callTypes.push_back(aType); |
+ FunctionType *funcType = FunctionType::get(aType, callTypes, false); |
+ std::string name = "__amdil_ubit_insert"; |
+ if (isVector) { name += "_v" + itostr(numEle) + "u32"; } else { name += "_u32"; } |
+ Function *Func = |
+ dyn_cast<Function>(inst->getParent()->getParent()->getParent()-> |
+ getOrInsertFunction(StringRef(name), funcType)); |
+ Value *Operands[4] = { |
+ width, |
+ offset, |
+ LHSSrc, |
+ RHSSrc |
+ }; |
+ CallInst *CI = CallInst::Create(Func, Operands, "BitInsertOpt"); |
+ if (mDebug) { |
+ dbgs() << "Old Inst: "; |
+ inst->dump(); |
+ dbgs() << "New Inst: "; |
+ CI->dump(); |
+ dbgs() << "\n\n"; |
+ } |
+ CI->insertBefore(inst); |
+ inst->replaceAllUsesWith(CI); |
+ return true; |
+} |
+ |
+bool |
+AMDGPUPeepholeOpt::optimizeBitExtract(Instruction *inst) { |
+ if (!inst) { |
+ return false; |
+ } |
+ if (!inst->isBinaryOp()) { |
+ return false; |
+ } |
+ if (inst->getOpcode() != Instruction::And) { |
+ return false; |
+ } |
+ if (optLevel == CodeGenOpt::None) { |
+ return false; |
+ } |
+ // We want to do some simple optimizations on Shift right/And patterns. The |
+ // basic optimization is to turn (A >> B) & C where A is a 32bit type, B is a |
+ // value smaller than 32 and C is a mask. If C is a constant value, then the |
+ // following transformation can occur. For signed integers, it turns into the |
+ // function call dst = __amdil_ibit_extract(log2(C), B, A) For unsigned |
+ // integers, it turns into the function call dst = |
+ // __amdil_ubit_extract(log2(C), B, A) The function __amdil_[u|i]bit_extract |
+ // can be found in Section 7.9 of the ATI IL spec of the stream SDK for |
+ // Evergreen hardware. |
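+  // Illustrative case (hypothetical operands): (A >> 4) & 0xFF extracts an
+  // 8-bit field starting at bit 4, so it maps onto a single bit-extract call.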
+ if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) { |
+ // This does not work on HD4XXX hardware. |
+ return false; |
+ } |
+ Type *aType = inst->getType(); |
+ bool isVector = aType->isVectorTy(); |
+ |
+ // XXX Support vector types |
+ if (isVector) { |
+ return false; |
+ } |
+ int numEle = 1; |
+ // This only works on 32bit integers |
+ if (aType->getScalarType() |
+ != Type::getInt32Ty(inst->getContext())) { |
+ return false; |
+ } |
+ if (isVector) { |
+ const VectorType *VT = dyn_cast<VectorType>(aType); |
+ numEle = VT->getNumElements(); |
+    // We currently cannot support more than 4 elements in an intrinsic and we
+    // cannot support Vec3 types.
+ if (numEle > 4 || numEle == 3) { |
+ return false; |
+ } |
+ } |
+ BinaryOperator *ShiftInst = dyn_cast<BinaryOperator>(inst->getOperand(0)); |
+ // If the first operand is not a shift instruction, then we can return as it |
+ // doesn't match this pattern. |
+ if (!ShiftInst || !ShiftInst->isShift()) { |
+ return false; |
+ } |
+  // If it is a shift left, then it doesn't match this pattern.
+ if (ShiftInst->getOpcode() == Instruction::Shl) { |
+ return false; |
+ } |
+ bool isSigned = ShiftInst->isArithmeticShift(); |
+ Constant *AndMask = dyn_cast<Constant>(inst->getOperand(1)); |
+ Constant *ShrVal = dyn_cast<Constant>(ShiftInst->getOperand(1)); |
+  // Let's make sure that the shift value and the and mask are constant integers.
+ if (!AndMask || !ShrVal) { |
+ return false; |
+ } |
+ Constant *newMaskConst; |
+ Constant *shiftValConst; |
+ if (isVector) { |
+ // Handle the vector case |
+ std::vector<Constant *> maskVals; |
+ std::vector<Constant *> shiftVals; |
+ ConstantVector *AndMaskVec = dyn_cast<ConstantVector>(AndMask); |
+ ConstantVector *ShrValVec = dyn_cast<ConstantVector>(ShrVal); |
+ Type *scalarType = AndMaskVec->getType()->getScalarType(); |
+ assert(AndMaskVec->getNumOperands() == |
+ ShrValVec->getNumOperands() && "cannot have a " |
+ "combination where the number of elements to a " |
+ "shift and an and are different!"); |
+ for (size_t x = 0, y = AndMaskVec->getNumOperands(); x < y; ++x) { |
+ ConstantInt *AndCI = dyn_cast<ConstantInt>(AndMaskVec->getOperand(x)); |
+ ConstantInt *ShiftIC = dyn_cast<ConstantInt>(ShrValVec->getOperand(x)); |
+ if (!AndCI || !ShiftIC) { |
+ return false; |
+ } |
+ uint32_t maskVal = (uint32_t)AndCI->getZExtValue(); |
+ if (!isMask_32(maskVal)) { |
+ return false; |
+ } |
+ maskVal = (uint32_t)CountTrailingOnes_32(maskVal); |
+ uint32_t shiftVal = (uint32_t)ShiftIC->getZExtValue(); |
+ // If the mask or shiftval is greater than the bitcount, then break out. |
+ if (maskVal >= 32 || shiftVal >= 32) { |
+ return false; |
+ } |
+      // If the mask val is greater than the number of original bits left
+ // then this optimization is invalid. |
+ if (maskVal > (32 - shiftVal)) { |
+ return false; |
+ } |
+ maskVals.push_back(ConstantInt::get(scalarType, maskVal, isSigned)); |
+ shiftVals.push_back(ConstantInt::get(scalarType, shiftVal, isSigned)); |
+ } |
+ newMaskConst = ConstantVector::get(maskVals); |
+ shiftValConst = ConstantVector::get(shiftVals); |
+ } else { |
+ // Handle the scalar case |
+ uint32_t maskVal = (uint32_t)dyn_cast<ConstantInt>(AndMask)->getZExtValue(); |
+ // This must be a mask value where all lower bits are set to 1 and then any |
+ // bit higher is set to 0. |
+ if (!isMask_32(maskVal)) { |
+ return false; |
+ } |
+ maskVal = (uint32_t)CountTrailingOnes_32(maskVal); |
+    // Count the number of bits set in the mask; this is the width of the
+    // resulting bit field that is extracted from the source value.
+ uint32_t shiftVal = (uint32_t)dyn_cast<ConstantInt>(ShrVal)->getZExtValue(); |
+ // If the mask or shift val is greater than the bitcount, then break out. |
+ if (maskVal >= 32 || shiftVal >= 32) { |
+ return false; |
+ } |
+    // If the mask val is greater than the number of original bits left, then
+ // this optimization is invalid. |
+ if (maskVal > (32 - shiftVal)) { |
+ return false; |
+ } |
+ newMaskConst = ConstantInt::get(aType, maskVal, isSigned); |
+ shiftValConst = ConstantInt::get(aType, shiftVal, isSigned); |
+ } |
+  // Let's create the function signature.
+ std::vector<Type *> callTypes; |
+ callTypes.push_back(aType); |
+ callTypes.push_back(aType); |
+ callTypes.push_back(aType); |
+ FunctionType *funcType = FunctionType::get(aType, callTypes, false); |
+ std::string name = "llvm.AMDGPU.bit.extract.u32"; |
+ if (isVector) { |
+ name += ".v" + itostr(numEle) + "i32"; |
+ } else { |
+ name += "."; |
+ } |
+  // Let's create the function.
+ Function *Func = |
+ dyn_cast<Function>(inst->getParent()->getParent()->getParent()-> |
+ getOrInsertFunction(StringRef(name), funcType)); |
+ Value *Operands[3] = { |
+ ShiftInst->getOperand(0), |
+ shiftValConst, |
+ newMaskConst |
+ }; |
+  // Let's create the call with the operands.
+ CallInst *CI = CallInst::Create(Func, Operands, "ByteExtractOpt"); |
+ CI->setDoesNotAccessMemory(); |
+ CI->insertBefore(inst); |
+ inst->replaceAllUsesWith(CI); |
+ return true; |
+} |
+ |
+bool |
+AMDGPUPeepholeOpt::expandBFI(CallInst *CI) { |
+ if (!CI) { |
+ return false; |
+ } |
+ Value *LHS = CI->getOperand(CI->getNumOperands() - 1); |
+ if (!LHS->getName().startswith("__amdil_bfi")) { |
+ return false; |
+ } |
+ Type* type = CI->getOperand(0)->getType(); |
+ Constant *negOneConst = NULL; |
+ if (type->isVectorTy()) { |
+ std::vector<Constant *> negOneVals; |
+ negOneConst = ConstantInt::get(CI->getContext(), |
+ APInt(32, StringRef("-1"), 10)); |
+ for (size_t x = 0, |
+ y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) { |
+ negOneVals.push_back(negOneConst); |
+ } |
+ negOneConst = ConstantVector::get(negOneVals); |
+ } else { |
+ negOneConst = ConstantInt::get(CI->getContext(), |
+ APInt(32, StringRef("-1"), 10)); |
+ } |
+ // __amdil_bfi => (A & B) | (~A & C) |
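+  // Illustrative case (hypothetical operands): with A = 0xF0 the result takes
+  // bits 4..7 from B and every other bit from C.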
+ BinaryOperator *lhs = |
+ BinaryOperator::Create(Instruction::And, CI->getOperand(0), |
+ CI->getOperand(1), "bfi_and", CI); |
+ BinaryOperator *rhs = |
+ BinaryOperator::Create(Instruction::Xor, CI->getOperand(0), negOneConst, |
+ "bfi_not", CI); |
+ rhs = BinaryOperator::Create(Instruction::And, rhs, CI->getOperand(2), |
+ "bfi_and", CI); |
+ lhs = BinaryOperator::Create(Instruction::Or, lhs, rhs, "bfi_or", CI); |
+ CI->replaceAllUsesWith(lhs); |
+ return true; |
+} |
+ |
+bool |
+AMDGPUPeepholeOpt::expandBFM(CallInst *CI) { |
+ if (!CI) { |
+ return false; |
+ } |
+ Value *LHS = CI->getOperand(CI->getNumOperands() - 1); |
+ if (!LHS->getName().startswith("__amdil_bfm")) { |
+ return false; |
+ } |
+ // __amdil_bfm => ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1f) |
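+  // Illustrative case (hypothetical operands): src0 = 8 and src1 = 4 yield
+  // ((1 << 8) - 1) << 4 = 0x00000FF0, a field of eight 1 bits at offset 4.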
+ Constant *newMaskConst = NULL; |
+ Constant *newShiftConst = NULL; |
+ Type* type = CI->getOperand(0)->getType(); |
+ if (type->isVectorTy()) { |
+ std::vector<Constant*> newMaskVals, newShiftVals; |
+ newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F); |
+ newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1); |
+ for (size_t x = 0, |
+ y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) { |
+ newMaskVals.push_back(newMaskConst); |
+ newShiftVals.push_back(newShiftConst); |
+ } |
+ newMaskConst = ConstantVector::get(newMaskVals); |
+ newShiftConst = ConstantVector::get(newShiftVals); |
+ } else { |
+ newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F); |
+ newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1); |
+ } |
+ BinaryOperator *lhs = |
+ BinaryOperator::Create(Instruction::And, CI->getOperand(0), |
+ newMaskConst, "bfm_mask", CI); |
+ lhs = BinaryOperator::Create(Instruction::Shl, newShiftConst, |
+ lhs, "bfm_shl", CI); |
+ lhs = BinaryOperator::Create(Instruction::Sub, lhs, |
+ newShiftConst, "bfm_sub", CI); |
+ BinaryOperator *rhs = |
+ BinaryOperator::Create(Instruction::And, CI->getOperand(1), |
+ newMaskConst, "bfm_mask", CI); |
+ lhs = BinaryOperator::Create(Instruction::Shl, lhs, rhs, "bfm_shl", CI); |
+ CI->replaceAllUsesWith(lhs); |
+ return true; |
+} |
+ |
+bool |
+AMDGPUPeepholeOpt::instLevelOptimizations(BasicBlock::iterator *bbb) { |
+ Instruction *inst = (*bbb); |
+ if (optimizeCallInst(bbb)) { |
+ return true; |
+ } |
+ if (optimizeBitExtract(inst)) { |
+ return false; |
+ } |
+ if (optimizeBitInsert(inst)) { |
+ return false; |
+ } |
+ if (correctMisalignedMemOp(inst)) { |
+ return false; |
+ } |
+ return false; |
+} |
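+// Illustrative case (hypothetical IR): a load of a struct type annotated with
+// 'align 2', where the struct is larger than 2 bytes, gets its alignment reset
+// to 0 (the ABI default) below.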
+bool |
+AMDGPUPeepholeOpt::correctMisalignedMemOp(Instruction *inst) { |
+ LoadInst *linst = dyn_cast<LoadInst>(inst); |
+ StoreInst *sinst = dyn_cast<StoreInst>(inst); |
+ unsigned alignment; |
+ Type* Ty = inst->getType(); |
+ if (linst) { |
+ alignment = linst->getAlignment(); |
+ Ty = inst->getType(); |
+ } else if (sinst) { |
+ alignment = sinst->getAlignment(); |
+ Ty = sinst->getValueOperand()->getType(); |
+ } else { |
+ return false; |
+ } |
+ unsigned size = getTypeSize(Ty); |
+  if (size <= alignment) {
+ return false; |
+ } |
+ if (!Ty->isStructTy()) { |
+ return false; |
+ } |
+ if (alignment < 4) { |
+ if (linst) { |
+ linst->setAlignment(0); |
+ return true; |
+ } else if (sinst) { |
+ sinst->setAlignment(0); |
+ return true; |
+ } |
+ } |
+ return false; |
+} |
+bool |
+AMDGPUPeepholeOpt::isSigned24BitOps(CallInst *CI) { |
+ if (!CI) { |
+ return false; |
+ } |
+ Value *LHS = CI->getOperand(CI->getNumOperands() - 1); |
+ std::string namePrefix = LHS->getName().substr(0, 14); |
+ if (namePrefix != "__amdil_imad24" && namePrefix != "__amdil_imul24" |
+      && namePrefix != "__amdil_imul24_high") {
+ return false; |
+ } |
+ if (mSTM->device()->usesHardware(AMDGPUDeviceInfo::Signed24BitOps)) { |
+ return false; |
+ } |
+ return true; |
+} |
+ |
+void |
+AMDGPUPeepholeOpt::expandSigned24BitOps(CallInst *CI) { |
+ assert(isSigned24BitOps(CI) && "Must be a " |
+ "signed 24 bit operation to call this function!"); |
+ Value *LHS = CI->getOperand(CI->getNumOperands()-1); |
+  // On 7XX and 8XX we do not have signed 24-bit operations, so we need to
+  // expand them to the following:
+  // imul24 turns into a 32-bit imul
+  // imad24 turns into a 32-bit imad
+  // imul24_high turns into a 32-bit imul_high
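+  // Illustrative IR (hypothetical values):
+  //   %r = call i32 @__amdil_imul24(i32 %a, i32 %b)
+  // simply becomes %r = mul i32 %a, %b on these targets.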
+ if (LHS->getName().substr(0, 14) == "__amdil_imad24") { |
+ Type *aType = CI->getOperand(0)->getType(); |
+ bool isVector = aType->isVectorTy(); |
+ int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1; |
+ std::vector<Type*> callTypes; |
+ callTypes.push_back(CI->getOperand(0)->getType()); |
+ callTypes.push_back(CI->getOperand(1)->getType()); |
+ callTypes.push_back(CI->getOperand(2)->getType()); |
+ FunctionType *funcType = |
+ FunctionType::get(CI->getOperand(0)->getType(), callTypes, false); |
+ std::string name = "__amdil_imad"; |
+ if (isVector) { |
+ name += "_v" + itostr(numEle) + "i32"; |
+ } else { |
+ name += "_i32"; |
+ } |
+ Function *Func = dyn_cast<Function>( |
+ CI->getParent()->getParent()->getParent()-> |
+ getOrInsertFunction(StringRef(name), funcType)); |
+ Value *Operands[3] = { |
+ CI->getOperand(0), |
+ CI->getOperand(1), |
+ CI->getOperand(2) |
+ }; |
+ CallInst *nCI = CallInst::Create(Func, Operands, "imad24"); |
+ nCI->insertBefore(CI); |
+ CI->replaceAllUsesWith(nCI); |
+ } else if (LHS->getName().substr(0, 14) == "__amdil_imul24") { |
+ BinaryOperator *mulOp = |
+ BinaryOperator::Create(Instruction::Mul, CI->getOperand(0), |
+ CI->getOperand(1), "imul24", CI); |
+ CI->replaceAllUsesWith(mulOp); |
+ } else if (LHS->getName().substr(0, 19) == "__amdil_imul24_high") { |
+ Type *aType = CI->getOperand(0)->getType(); |
+ |
+ bool isVector = aType->isVectorTy(); |
+ int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1; |
+ std::vector<Type*> callTypes; |
+ callTypes.push_back(CI->getOperand(0)->getType()); |
+ callTypes.push_back(CI->getOperand(1)->getType()); |
+ FunctionType *funcType = |
+ FunctionType::get(CI->getOperand(0)->getType(), callTypes, false); |
+ std::string name = "__amdil_imul_high"; |
+ if (isVector) { |
+ name += "_v" + itostr(numEle) + "i32"; |
+ } else { |
+ name += "_i32"; |
+ } |
+ Function *Func = dyn_cast<Function>( |
+ CI->getParent()->getParent()->getParent()-> |
+ getOrInsertFunction(StringRef(name), funcType)); |
+ Value *Operands[2] = { |
+ CI->getOperand(0), |
+ CI->getOperand(1) |
+ }; |
+ CallInst *nCI = CallInst::Create(Func, Operands, "imul24_high"); |
+ nCI->insertBefore(CI); |
+ CI->replaceAllUsesWith(nCI); |
+ } |
+} |
+ |
+bool |
+AMDGPUPeepholeOpt::isRWGLocalOpt(CallInst *CI) { |
+ return (CI != NULL |
+ && CI->getOperand(CI->getNumOperands() - 1)->getName() |
+ == "__amdil_get_local_size_int"); |
+} |
+ |
+bool |
+AMDGPUPeepholeOpt::convertAccurateDivide(CallInst *CI) { |
+ if (!CI) { |
+ return false; |
+ } |
+ if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD6XXX |
+ && (mSTM->getDeviceName() == "cayman")) { |
+ return false; |
+ } |
+ return CI->getOperand(CI->getNumOperands() - 1)->getName().substr(0, 20) |
+ == "__amdil_improved_div"; |
+} |
+ |
+void |
+AMDGPUPeepholeOpt::expandAccurateDivide(CallInst *CI) { |
+ assert(convertAccurateDivide(CI) |
+ && "expanding accurate divide can only happen if it is expandable!"); |
+ BinaryOperator *divOp = |
+ BinaryOperator::Create(Instruction::FDiv, CI->getOperand(0), |
+ CI->getOperand(1), "fdiv32", CI); |
+ CI->replaceAllUsesWith(divOp); |
+} |
+ |
+bool |
+AMDGPUPeepholeOpt::propagateSamplerInst(CallInst *CI) { |
+ if (optLevel != CodeGenOpt::None) { |
+ return false; |
+ } |
+ |
+ if (!CI) { |
+ return false; |
+ } |
+ |
+  unsigned funcNameIdx = CI->getNumOperands() - 1;
+ StringRef calleeName = CI->getOperand(funcNameIdx)->getName(); |
+ if (calleeName != "__amdil_image2d_read_norm" |
+ && calleeName != "__amdil_image2d_read_unnorm" |
+ && calleeName != "__amdil_image3d_read_norm" |
+ && calleeName != "__amdil_image3d_read_unnorm") { |
+ return false; |
+ } |
+ |
+  unsigned samplerIdx = 1;
+ Value *sampler = CI->getOperand(samplerIdx); |
+ LoadInst *lInst = dyn_cast<LoadInst>(sampler); |
+ if (!lInst) { |
+ return false; |
+ } |
+ |
+ if (lInst->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { |
+ return false; |
+ } |
+ |
+ GlobalVariable *gv = dyn_cast<GlobalVariable>(lInst->getPointerOperand()); |
+ // If we are loading from what is not a global value, then we |
+ // fail and return. |
+ if (!gv) { |
+ return false; |
+ } |
+ |
+  // If we don't have an initializer, or the initializer is not a
+  // 32-bit integer, we fail.
+ if (!gv->hasInitializer() |
+ || !gv->getInitializer()->getType()->isIntegerTy(32)) { |
+ return false; |
+ } |
+ |
+  // Now that we have the global variable initializer, let's replace
+  // all uses of the load instruction with the samplerVal and
+  // reparse the __amdil_is_constant() function.
+ Constant *samplerVal = gv->getInitializer(); |
+ lInst->replaceAllUsesWith(samplerVal); |
+ return true; |
+} |
+ |
+bool |
+AMDGPUPeepholeOpt::doInitialization(Module &M) { |
+ return false; |
+} |
+ |
+bool |
+AMDGPUPeepholeOpt::doFinalization(Module &M) { |
+ return false; |
+} |
+ |
+void |
+AMDGPUPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const { |
+ AU.addRequired<MachineFunctionAnalysis>(); |
+ FunctionPass::getAnalysisUsage(AU); |
+ AU.setPreservesAll(); |
+} |
+ |
+size_t AMDGPUPeepholeOpt::getTypeSize(Type * const T, bool dereferencePtr) { |
+ size_t size = 0; |
+ if (!T) { |
+ return size; |
+ } |
+ switch (T->getTypeID()) { |
+ case Type::X86_FP80TyID: |
+ case Type::FP128TyID: |
+ case Type::PPC_FP128TyID: |
+ case Type::LabelTyID: |
+ assert(0 && "These types are not supported by this backend"); |
+ default: |
+ case Type::FloatTyID: |
+ case Type::DoubleTyID: |
+ size = T->getPrimitiveSizeInBits() >> 3; |
+ break; |
+ case Type::PointerTyID: |
+ size = getTypeSize(dyn_cast<PointerType>(T), dereferencePtr); |
+ break; |
+ case Type::IntegerTyID: |
+ size = getTypeSize(dyn_cast<IntegerType>(T), dereferencePtr); |
+ break; |
+ case Type::StructTyID: |
+ size = getTypeSize(dyn_cast<StructType>(T), dereferencePtr); |
+ break; |
+ case Type::ArrayTyID: |
+ size = getTypeSize(dyn_cast<ArrayType>(T), dereferencePtr); |
+ break; |
+ case Type::FunctionTyID: |
+ size = getTypeSize(dyn_cast<FunctionType>(T), dereferencePtr); |
+ break; |
+ case Type::VectorTyID: |
+ size = getTypeSize(dyn_cast<VectorType>(T), dereferencePtr); |
+ break; |
+ }; |
+ return size; |
+} |
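+// Illustrative behavior (hypothetical inputs): a struct of { i32, float } sums
+// to 8 bytes (padding is not accounted for) and a <4 x i32> vector reports 16.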
+ |
+size_t AMDGPUPeepholeOpt::getTypeSize(StructType * const ST, |
+ bool dereferencePtr) { |
+ size_t size = 0; |
+ if (!ST) { |
+ return size; |
+ } |
+ Type *curType; |
+ StructType::element_iterator eib; |
+ StructType::element_iterator eie; |
+ for (eib = ST->element_begin(), eie = ST->element_end(); eib != eie; ++eib) { |
+ curType = *eib; |
+ size += getTypeSize(curType, dereferencePtr); |
+ } |
+ return size; |
+} |
+ |
+size_t AMDGPUPeepholeOpt::getTypeSize(IntegerType * const IT, |
+ bool dereferencePtr) { |
+ return IT ? (IT->getBitWidth() >> 3) : 0; |
+} |
+ |
+size_t AMDGPUPeepholeOpt::getTypeSize(FunctionType * const FT, |
+ bool dereferencePtr) { |
+ assert(0 && "Should not be able to calculate the size of an function type"); |
+ return 0; |
+} |
+ |
+size_t AMDGPUPeepholeOpt::getTypeSize(ArrayType * const AT, |
+ bool dereferencePtr) { |
+ return (size_t)(AT ? (getTypeSize(AT->getElementType(), |
+ dereferencePtr) * AT->getNumElements()) |
+ : 0); |
+} |
+ |
+size_t AMDGPUPeepholeOpt::getTypeSize(VectorType * const VT, |
+ bool dereferencePtr) { |
+ return VT ? (VT->getBitWidth() >> 3) : 0; |
+} |
+ |
+size_t AMDGPUPeepholeOpt::getTypeSize(PointerType * const PT, |
+ bool dereferencePtr) { |
+ if (!PT) { |
+ return 0; |
+ } |
+ Type *CT = PT->getElementType(); |
+ if (CT->getTypeID() == Type::StructTyID && |
+ PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { |
+ return getTypeSize(dyn_cast<StructType>(CT)); |
+ } else if (dereferencePtr) { |
+ size_t size = 0; |
+ for (size_t x = 0, y = PT->getNumContainedTypes(); x < y; ++x) { |
+ size += getTypeSize(PT->getContainedType(x), dereferencePtr); |
+ } |
+ return size; |
+ } else { |
+ return 4; |
+ } |
+} |
+ |
+size_t AMDGPUPeepholeOpt::getTypeSize(OpaqueType * const OT, |
+ bool dereferencePtr) { |
+ //assert(0 && "Should not be able to calculate the size of an opaque type"); |
+ return 4; |
+} |