OLD | NEW |
(Empty) | |
| 1 //===-- AMDILPeepholeOptimizer.cpp - AMDGPU Peephole optimizations ---------===// |
| 2 // |
| 3 // The LLVM Compiler Infrastructure |
| 4 // |
| 5 // This file is distributed under the University of Illinois Open Source |
| 6 // License. See LICENSE.TXT for details. |
| 7 // |
| 8 /// \file |
| 9 //==-----------------------------------------------------------------------===// |
| 10 |
| 11 #define DEBUG_TYPE "PeepholeOpt" |
| 12 #ifdef DEBUG |
| 13 #define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) |
| 14 #else |
| 15 #define DEBUGME 0 |
| 16 #endif |
| 17 |
| 18 #include "AMDILDevices.h" |
| 19 #include "AMDGPUInstrInfo.h" |
| 20 #include "llvm/ADT/Statistic.h" |
| 21 #include "llvm/ADT/StringExtras.h" |
| 22 #include "llvm/ADT/StringRef.h" |
| 23 #include "llvm/ADT/Twine.h" |
| 24 #include "llvm/IR/Constants.h" |
| 25 #include "llvm/CodeGen/MachineFunction.h" |
| 26 #include "llvm/CodeGen/MachineFunctionAnalysis.h" |
| 27 #include "llvm/IR/Function.h" |
| 28 #include "llvm/IR/Instructions.h" |
| 29 #include "llvm/IR/Module.h" |
| 30 #include "llvm/Support/Debug.h" |
| 31 #include "llvm/Support/MathExtras.h" |
| 32 |
| 33 #include <sstream> |
| 34 |
| 35 #if 0 |
| 36 STATISTIC(PointerAssignments, "Number of dynamic pointer " |
| 37 "assigments discovered"); |
| 38 STATISTIC(PointerSubtract, "Number of pointer subtractions discovered"); |
| 39 #endif |
| 40 |
| 41 using namespace llvm; |
| 42 // The Peephole optimization pass is used to do simple last-minute optimizations |
| 43 // that are required for correct code or to remove redundant functions. |
| 44 namespace { |
| 45 |
| 46 class OpaqueType; |
| 47 |
| 48 class LLVM_LIBRARY_VISIBILITY AMDGPUPeepholeOpt : public FunctionPass { |
| 49 public: |
| 50 TargetMachine &TM; |
| 51 static char ID; |
| 52 AMDGPUPeepholeOpt(TargetMachine &tm); |
| 53 ~AMDGPUPeepholeOpt(); |
| 54 const char *getPassName() const; |
| 55 bool runOnFunction(Function &F); |
| 56 bool doInitialization(Module &M); |
| 57 bool doFinalization(Module &M); |
| 58 void getAnalysisUsage(AnalysisUsage &AU) const; |
| 59 protected: |
| 60 private: |
| 61 // Function to initiate all of the instruction level optimizations. |
| 62 bool instLevelOptimizations(BasicBlock::iterator *inst); |
| 63 // Quick check to see if we need to dump all of the pointers into the |
| 64 // arena. If this is correct, then we set all pointers to exist in the arena. This |
| 65 // is a workaround for aliasing of pointers in a struct/union. |
| 66 bool dumpAllIntoArena(Function &F); |
| 67 // Because I don't want to invalidate any pointers while in the |
| 68 // safeNestedForEach function, I push atomic conversions to a vector and handle |
| 69 // them later. This function does the conversions if required. |
| 70 void doAtomicConversionIfNeeded(Function &F); |
| 71 // Because __amdil_is_constant cannot be properly evaluated if |
| 72 // optimizations are disabled, the calls are placed in a vector |
| 73 // and evaluated after the __amdil_image* functions are evaluated, |
| 74 // which should allow the __amdil_is_constant function to be |
| 75 // evaluated correctly. |
| 76 void doIsConstCallConversionIfNeeded(); |
| 77 bool mChanged; |
| 78 bool mDebug; |
| 79 bool mConvertAtomics; |
| 80 CodeGenOpt::Level optLevel; |
| 81 // Run a series of tests to see if we can optimize a CALL instruction. |
| 82 bool optimizeCallInst(BasicBlock::iterator *bbb); |
| 83 // A peephole optimization to optimize bit extract sequences. |
| 84 bool optimizeBitExtract(Instruction *inst); |
| 85 // A peephole optimization to optimize bit insert sequences. |
| 86 bool optimizeBitInsert(Instruction *inst); |
| 87 bool setupBitInsert(Instruction *base, |
| 88 Instruction *&src, |
| 89 Constant *&mask, |
| 90 Constant *&shift); |
| 91 // Expand the bit field insert instruction on versions of OpenCL that |
| 92 // don't support it. |
| 93 bool expandBFI(CallInst *CI); |
| 94 // Expand the bit field mask instruction on version of OpenCL that |
| 95 // don't support it. |
| 96 bool expandBFM(CallInst *CI); |
| 97 // On 7XX and 8XX hardware, we do not have 24-bit signed operations, so in |
| 98 // this case we need to expand them. These functions check for 24-bit functions |
| 99 // and then expand them. |
| 100 bool isSigned24BitOps(CallInst *CI); |
| 101 void expandSigned24BitOps(CallInst *CI); |
| 102 // One optimization that can occur is that if the required workgroup size is |
| 103 // specified then the result of get_local_size is known at compile time and |
| 104 // can be returned accordingly. |
| 105 bool isRWGLocalOpt(CallInst *CI); |
| 106 // On Northern Islands cards, the division is slightly less accurate than on |
| 107 // previous generations, so we need to utilize a more accurate division. On |
| 108 // all other cards we can translate the accurate divide to a normal divide. |
| 109 bool convertAccurateDivide(CallInst *CI); |
| 110 void expandAccurateDivide(CallInst *CI); |
| 111 // If the alignment is set incorrectly, it can produce really inefficient |
| 112 // code. This checks for this scenario and fixes it if possible. |
| 113 bool correctMisalignedMemOp(Instruction *inst); |
| 114 |
| 115 // If we are in no-opt mode, then we need to make sure that |
| 116 // local samplers are properly propagated, as constant propagation |
| 117 // doesn't occur and we need to know the value of kernel-defined |
| 118 // samplers at compile time. |
| 119 bool propagateSamplerInst(CallInst *CI); |
| 120 |
| 121 // Helper functions |
| 122 |
| 123 // Group of functions that recursively calculate the size of a structure based |
| 124 // on its sub-types. |
| 125 size_t getTypeSize(Type * const T, bool dereferencePtr = false); |
| 126 size_t getTypeSize(StructType * const ST, bool dereferencePtr = false); |
| 127 size_t getTypeSize(IntegerType * const IT, bool dereferencePtr = false); |
| 128 size_t getTypeSize(FunctionType * const FT, bool dereferencePtr = false); |
| 129 size_t getTypeSize(ArrayType * const AT, bool dereferencePtr = false); |
| 130 size_t getTypeSize(VectorType * const VT, bool dereferencePtr = false); |
| 131 size_t getTypeSize(PointerType * const PT, bool dereferencePtr = false); |
| 132 size_t getTypeSize(OpaqueType * const OT, bool dereferencePtr = false); |
| 133 |
| 134 LLVMContext *mCTX; |
| 135 Function *mF; |
| 136 const AMDGPUSubtarget *mSTM; |
| 137 SmallVector< std::pair<CallInst *, Function *>, 16> atomicFuncs; |
| 138 SmallVector<CallInst *, 16> isConstVec; |
| 139 }; // class AMDGPUPeepholeOpt |
| 140 char AMDGPUPeepholeOpt::ID = 0; |
| 141 |
| 142 // A template function that has two levels of looping before calling the |
| 143 // function with a pointer to the current iterator. |
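| // Note: the functor is expected to return true only when it has already |
| // advanced (or erased past) the iterator itself; when it returns false, the |
| // loop advances sf, so each element is visited exactly once. |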
| 144 template<class InputIterator, class SecondIterator, class Function> |
| 145 Function safeNestedForEach(InputIterator First, InputIterator Last, |
| 146 SecondIterator S, Function F) { |
| 147 for ( ; First != Last; ++First) { |
| 148 SecondIterator sf, sl; |
| 149 for (sf = First->begin(), sl = First->end(); |
| 150 sf != sl; ) { |
| 151 if (!F(&sf)) { |
| 152 ++sf; |
| 153 } |
| 154 } |
| 155 } |
| 156 return F; |
| 157 } |
| 158 |
| 159 } // anonymous namespace |
| 160 |
| 161 namespace llvm { |
| 162 FunctionPass * |
| 163 createAMDGPUPeepholeOpt(TargetMachine &tm) { |
| 164 return new AMDGPUPeepholeOpt(tm); |
| 165 } |
| 166 } // llvm namespace |
| 167 |
| 168 AMDGPUPeepholeOpt::AMDGPUPeepholeOpt(TargetMachine &tm) |
| 169 : FunctionPass(ID), TM(tm) { |
| 170 mDebug = DEBUGME; |
| 171 optLevel = TM.getOptLevel(); |
| 172 |
| 173 } |
| 174 |
| 175 AMDGPUPeepholeOpt::~AMDGPUPeepholeOpt() { |
| 176 } |
| 177 |
| 178 const char * |
| 179 AMDGPUPeepholeOpt::getPassName() const { |
| 180 return "AMDGPU PeepHole Optimization Pass"; |
| 181 } |
| 182 |
| 183 bool |
| 184 containsPointerType(Type *Ty) { |
| 185 if (!Ty) { |
| 186 return false; |
| 187 } |
| 188 switch(Ty->getTypeID()) { |
| 189 default: |
| 190 return false; |
| 191 case Type::StructTyID: { |
| 192 const StructType *ST = dyn_cast<StructType>(Ty); |
| 193 for (StructType::element_iterator stb = ST->element_begin(), |
| 194 ste = ST->element_end(); stb != ste; ++stb) { |
| 195 if (!containsPointerType(*stb)) { |
| 196 continue; |
| 197 } |
| 198 return true; |
| 199 } |
| 200 break; |
| 201 } |
| 202 case Type::VectorTyID: |
| 203 case Type::ArrayTyID: |
| 204 return containsPointerType(dyn_cast<SequentialType>(Ty)->getElementType()); |
| 205 case Type::PointerTyID: |
| 206 return true; |
| 207 }; |
| 208 return false; |
| 209 } |
| 210 |
| 211 bool |
| 212 AMDGPUPeepholeOpt::dumpAllIntoArena(Function &F) { |
| 213 bool dumpAll = false; |
| 214 for (Function::const_arg_iterator cab = F.arg_begin(), |
| 215 cae = F.arg_end(); cab != cae; ++cab) { |
| 216 const Argument *arg = cab; |
| 217 const PointerType *PT = dyn_cast<PointerType>(arg->getType()); |
| 218 if (!PT) { |
| 219 continue; |
| 220 } |
| 221 Type *DereferencedType = PT->getElementType(); |
| 222 if (!dyn_cast<StructType>(DereferencedType) |
| 223 ) { |
| 224 continue; |
| 225 } |
| 226 if (!containsPointerType(DereferencedType)) { |
| 227 continue; |
| 228 } |
| 229 // FIXME: Because a pointer inside of a struct/union may be aliased to |
| 230 // another pointer we need to take the conservative approach and place all |
| 231 // pointers into the arena until more advanced detection is implemented. |
| 232 dumpAll = true; |
| 233 } |
| 234 return dumpAll; |
| 235 } |
| 236 void |
| 237 AMDGPUPeepholeOpt::doIsConstCallConversionIfNeeded() { |
| 238 if (isConstVec.empty()) { |
| 239 return; |
| 240 } |
| 241 for (unsigned x = 0, y = isConstVec.size(); x < y; ++x) { |
| 242 CallInst *CI = isConstVec[x]; |
| 243 Constant *CV = dyn_cast<Constant>(CI->getOperand(0)); |
| 244 Type *aType = Type::getInt32Ty(*mCTX); |
| 245 Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1) |
| 246 : ConstantInt::get(aType, 0); |
| 247 CI->replaceAllUsesWith(Val); |
| 248 CI->eraseFromParent(); |
| 249 } |
| 250 isConstVec.clear(); |
| 251 } |
| 252 void |
| 253 AMDGPUPeepholeOpt::doAtomicConversionIfNeeded(Function &F) { |
| 254 // Don't do anything if we don't have any atomic operations. |
| 255 if (atomicFuncs.empty()) { |
| 256 return; |
| 257 } |
| 258 // Change the function name for the atomic if it is required |
| 259 uint32_t size = atomicFuncs.size(); |
| 260 for (uint32_t x = 0; x < size; ++x) { |
| 261 atomicFuncs[x].first->setOperand( |
| 262 atomicFuncs[x].first->getNumOperands()-1, |
| 263 atomicFuncs[x].second); |
| 264 |
| 265 } |
| 266 mChanged = true; |
| 267 if (mConvertAtomics) { |
| 268 return; |
| 269 } |
| 270 } |
| 271 |
| 272 bool |
| 273 AMDGPUPeepholeOpt::runOnFunction(Function &MF) { |
| 274 mChanged = false; |
| 275 mF = &MF; |
| 276 mSTM = &TM.getSubtarget<AMDGPUSubtarget>(); |
| 277 if (mDebug) { |
| 278 MF.dump(); |
| 279 } |
| 280 mCTX = &MF.getType()->getContext(); |
| 281 mConvertAtomics = true; |
| 282 safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(), |
| 283 std::bind1st(std::mem_fun(&AMDGPUPeepholeOpt::instLevelOptimizations), |
| 284 this)); |
| 285 |
| 286 doAtomicConversionIfNeeded(MF); |
| 287 doIsConstCallConversionIfNeeded(); |
| 288 |
| 289 if (mDebug) { |
| 290 MF.dump(); |
| 291 } |
| 292 return mChanged; |
| 293 } |
| 294 |
| 295 bool |
| 296 AMDGPUPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb) { |
| 297 Instruction *inst = (*bbb); |
| 298 CallInst *CI = dyn_cast<CallInst>(inst); |
| 299 if (!CI) { |
| 300 return false; |
| 301 } |
| 302 if (isSigned24BitOps(CI)) { |
| 303 expandSigned24BitOps(CI); |
| 304 ++(*bbb); |
| 305 CI->eraseFromParent(); |
| 306 return true; |
| 307 } |
| 308 if (propagateSamplerInst(CI)) { |
| 309 return false; |
| 310 } |
| 311 if (expandBFI(CI) || expandBFM(CI)) { |
| 312 ++(*bbb); |
| 313 CI->eraseFromParent(); |
| 314 return true; |
| 315 } |
| 316 if (convertAccurateDivide(CI)) { |
| 317 expandAccurateDivide(CI); |
| 318 ++(*bbb); |
| 319 CI->eraseFromParent(); |
| 320 return true; |
| 321 } |
| 322 |
| 323 StringRef calleeName = CI->getOperand(CI->getNumOperands()-1)->getName(); |
| 324 if (calleeName.startswith("__amdil_is_constant")) { |
| 325 // If we do not have optimizations, then this |
| 326 // cannot be properly evaluated, so we add the |
| 327 // call instruction to a vector and process |
| 328 // them at the end of processing after the |
| 329 // samplers have been correctly handled. |
| 330 if (optLevel == CodeGenOpt::None) { |
| 331 isConstVec.push_back(CI); |
| 332 return false; |
| 333 } else { |
| 334 Constant *CV = dyn_cast<Constant>(CI->getOperand(0)); |
| 335 Type *aType = Type::getInt32Ty(*mCTX); |
| 336 Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1) |
| 337 : ConstantInt::get(aType, 0); |
| 338 CI->replaceAllUsesWith(Val); |
| 339 ++(*bbb); |
| 340 CI->eraseFromParent(); |
| 341 return true; |
| 342 } |
| 343 } |
| 344 |
| 345 if (calleeName.equals("__amdil_is_asic_id_i32")) { |
| 346 ConstantInt *CV = dyn_cast<ConstantInt>(CI->getOperand(0)); |
| 347 Type *aType = Type::getInt32Ty(*mCTX); |
| 348 Value *Val = CV; |
| 349 if (Val) { |
| 350 Val = ConstantInt::get(aType, |
| 351 mSTM->device()->getDeviceFlag() & CV->getZExtValue()); |
| 352 } else { |
| 353 Val = ConstantInt::get(aType, 0); |
| 354 } |
| 355 CI->replaceAllUsesWith(Val); |
| 356 ++(*bbb); |
| 357 CI->eraseFromParent(); |
| 358 return true; |
| 359 } |
| 360 Function *F = dyn_cast<Function>(CI->getOperand(CI->getNumOperands()-1)); |
| 361 if (!F) { |
| 362 return false; |
| 363 } |
| 364 if (F->getName().startswith("__atom") && !CI->getNumUses() |
| 365 && F->getName().find("_xchg") == StringRef::npos) { |
| 366 std::string buffer(F->getName().str() + "_noret"); |
| 367 F = dyn_cast<Function>( |
| 368 F->getParent()->getOrInsertFunction(buffer, F->getFunctionType())); |
| 369 atomicFuncs.push_back(std::make_pair(CI, F)); |
| 370 } |
| 371 |
| 372 if (!mSTM->device()->isSupported(AMDGPUDeviceInfo::ArenaSegment) |
| 373 && !mSTM->device()->isSupported(AMDGPUDeviceInfo::MultiUAV)) { |
| 374 return false; |
| 375 } |
| 376 if (!mConvertAtomics) { |
| 377 return false; |
| 378 } |
| 379 StringRef name = F->getName(); |
| 380 if (name.startswith("__atom") && name.find("_g") != StringRef::npos) { |
| 381 mConvertAtomics = false; |
| 382 } |
| 383 return false; |
| 384 } |
| 385 |
| 386 bool |
| 387 AMDGPUPeepholeOpt::setupBitInsert(Instruction *base, |
| 388 Instruction *&src, |
| 389 Constant *&mask, |
| 390 Constant *&shift) { |
| 391 if (!base) { |
| 392 if (mDebug) { |
| 393 dbgs() << "Null pointer passed into function.\n"; |
| 394 } |
| 395 return false; |
| 396 } |
| 397 bool andOp = false; |
| 398 if (base->getOpcode() == Instruction::Shl) { |
| 399 shift = dyn_cast<Constant>(base->getOperand(1)); |
| 400 } else if (base->getOpcode() == Instruction::And) { |
| 401 mask = dyn_cast<Constant>(base->getOperand(1)); |
| 402 andOp = true; |
| 403 } else { |
| 404 if (mDebug) { |
| 405 dbgs() << "Failed setup with no Shl or And instruction on base opcode!\n"; |
| 406 } |
| 407 // If the base is neither a Shl nor an And, we don't fit any of the patterns above. |
| 408 return false; |
| 409 } |
| 410 src = dyn_cast<Instruction>(base->getOperand(0)); |
| 411 if (!src) { |
| 412 if (mDebug) { |
| 413 dbgs() << "Failed setup since the base operand is not an instruction!\n"; |
| 414 } |
| 415 return false; |
| 416 } |
| 417 // If we find an 'and' operation, then we don't need to |
| 418 // find the next operation as we already know the |
| 419 // bits that are valid at this point. |
| 420 if (andOp) { |
| 421 return true; |
| 422 } |
| 423 if (src->getOpcode() == Instruction::Shl && !shift) { |
| 424 shift = dyn_cast<Constant>(src->getOperand(1)); |
| 425 src = dyn_cast<Instruction>(src->getOperand(0)); |
| 426 } else if (src->getOpcode() == Instruction::And && !mask) { |
| 427 mask = dyn_cast<Constant>(src->getOperand(1)); |
| 428 } |
| 429 if (!mask && !shift) { |
| 430 if (mDebug) { |
| 431 dbgs() << "Failed setup since both mask and shift are NULL!\n"; |
| 432 } |
| 433 // Did not find a constant mask or a shift. |
| 434 return false; |
| 435 } |
| 436 return true; |
| 437 } |
| 438 bool |
| 439 AMDGPUPeepholeOpt::optimizeBitInsert(Instruction *inst) { |
| 440 if (!inst) { |
| 441 return false; |
| 442 } |
| 443 if (!inst->isBinaryOp()) { |
| 444 return false; |
| 445 } |
| 446 if (inst->getOpcode() != Instruction::Or) { |
| 447 return false; |
| 448 } |
| 449 if (optLevel == CodeGenOpt::None) { |
| 450 return false; |
| 451 } |
| 452 // We want to do an optimization on a sequence of ops that in the end equals a |
| 453 // single ISA instruction. |
| 454 // The base pattern for this optimization is - ((A & B) << C) | ((D & E) << F) |
| 455 // Some simplified versions of this pattern are as follows: |
| 456 // (A & B) | (D & E) when B & E == 0 && C == 0 && F == 0 |
| 457 // ((A & B) << C) | (D & E) when B ^ E == 0 && (1 << C) >= E |
| 458 // (A & B) | ((D & E) << F) when B ^ E == 0 && (1 << F) >= B |
| 459 // (A & B) | (D << F) when (1 << F) >= B |
| 460 // (A << C) | (D & E) when (1 << C) >= E |
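| // As a concrete sketch of the first simplified form: (A & 0xFF00) | (D & 0xFF) |
| // has disjoint masks, so it can collapse into a single ubit_insert with |
| // width = 8 and offset = 8 (the operands are assembled further below). |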
| 461 if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) { |
| 462 // The HD4XXX hardware doesn't support the ubit_insert instruction. |
| 463 return false; |
| 464 } |
| 465 Type *aType = inst->getType(); |
| 466 bool isVector = aType->isVectorTy(); |
| 467 int numEle = 1; |
| 468 // This optimization only works on 32bit integers. |
| 469 if (aType->getScalarType() |
| 470 != Type::getInt32Ty(inst->getContext())) { |
| 471 return false; |
| 472 } |
| 473 if (isVector) { |
| 474 const VectorType *VT = dyn_cast<VectorType>(aType); |
| 475 numEle = VT->getNumElements(); |
| 476 // We currently cannot support more than 4 elements in an intrinsic and we |
| 477 // cannot support Vec3 types. |
| 478 if (numEle > 4 || numEle == 3) { |
| 479 return false; |
| 480 } |
| 481 } |
| 482 // TODO: Handle vectors. |
| 483 if (isVector) { |
| 484 if (mDebug) { |
| 485 dbgs() << "!!! Vectors are not supported yet!\n"; |
| 486 } |
| 487 return false; |
| 488 } |
| 489 Instruction *LHSSrc = NULL, *RHSSrc = NULL; |
| 490 Constant *LHSMask = NULL, *RHSMask = NULL; |
| 491 Constant *LHSShift = NULL, *RHSShift = NULL; |
| 492 Instruction *LHS = dyn_cast<Instruction>(inst->getOperand(0)); |
| 493 Instruction *RHS = dyn_cast<Instruction>(inst->getOperand(1)); |
| 494 if (!setupBitInsert(LHS, LHSSrc, LHSMask, LHSShift)) { |
| 495 if (mDebug) { |
| 496 dbgs() << "Found an OR Operation that failed setup!\n"; |
| 497 inst->dump(); |
| 498 if (LHS) { LHS->dump(); } |
| 499 if (LHSSrc) { LHSSrc->dump(); } |
| 500 if (LHSMask) { LHSMask->dump(); } |
| 501 if (LHSShift) { LHSShift->dump(); } |
| 502 } |
| 503 // There was an issue with the setup for BitInsert. |
| 504 return false; |
| 505 } |
| 506 if (!setupBitInsert(RHS, RHSSrc, RHSMask, RHSShift)) { |
| 507 if (mDebug) { |
| 508 dbgs() << "Found an OR Operation that failed setup!\n"; |
| 509 inst->dump(); |
| 510 if (RHS) { RHS->dump(); } |
| 511 if (RHSSrc) { RHSSrc->dump(); } |
| 512 if (RHSMask) { RHSMask->dump(); } |
| 513 if (RHSShift) { RHSShift->dump(); } |
| 514 } |
| 515 // There was an issue with the setup for BitInsert. |
| 516 return false; |
| 517 } |
| 518 if (mDebug) { |
| 519 dbgs() << "Found an OR operation that can possible be optimized to ubit inse
rt!\n"; |
| 520 dbgs() << "Op: "; inst->dump(); |
| 521 dbgs() << "LHS: "; if (LHS) { LHS->dump(); } else { dbgs() << "(None)\
n"; } |
| 522 dbgs() << "LHS Src: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(
None)\n"; } |
| 523 dbgs() << "LHS Mask: "; if (LHSMask) { LHSMask->dump(); } else { dbgs() <<
"(None)\n"; } |
| 524 dbgs() << "LHS Shift: "; if (LHSShift) { LHSShift->dump(); } else { dbgs() <
< "(None)\n"; } |
| 525 dbgs() << "RHS: "; if (RHS) { RHS->dump(); } else { dbgs() << "(None)\
n"; } |
| 526 dbgs() << "RHS Src: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(
None)\n"; } |
| 527 dbgs() << "RHS Mask: "; if (RHSMask) { RHSMask->dump(); } else { dbgs() <<
"(None)\n"; } |
| 528 dbgs() << "RHS Shift: "; if (RHSShift) { RHSShift->dump(); } else { dbgs() <
< "(None)\n"; } |
| 529 } |
| 530 Constant *offset = NULL; |
| 531 Constant *width = NULL; |
| 532 uint32_t lhsMaskVal = 0, rhsMaskVal = 0; |
| 533 uint32_t lhsShiftVal = 0, rhsShiftVal = 0; |
| 534 uint32_t lhsMaskWidth = 0, rhsMaskWidth = 0; |
| 535 uint32_t lhsMaskOffset = 0, rhsMaskOffset = 0; |
| 536 lhsMaskVal = (LHSMask |
| 537 ? dyn_cast<ConstantInt>(LHSMask)->getZExtValue() : 0); |
| 538 rhsMaskVal = (RHSMask |
| 539 ? dyn_cast<ConstantInt>(RHSMask)->getZExtValue() : 0); |
| 540 lhsShiftVal = (LHSShift |
| 541 ? dyn_cast<ConstantInt>(LHSShift)->getZExtValue() : 0); |
| 542 rhsShiftVal = (RHSShift |
| 543 ? dyn_cast<ConstantInt>(RHSShift)->getZExtValue() : 0); |
| 544 lhsMaskWidth = lhsMaskVal ? CountPopulation_32(lhsMaskVal) : 32 - lhsShiftVal; |
| 545 rhsMaskWidth = rhsMaskVal ? CountPopulation_32(rhsMaskVal) : 32 - rhsShiftVal; |
| 546 lhsMaskOffset = lhsMaskVal ? CountTrailingZeros_32(lhsMaskVal) : lhsShiftVal; |
| 547 rhsMaskOffset = rhsMaskVal ? CountTrailingZeros_32(rhsMaskVal) : rhsShiftVal; |
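| // e.g. a mask of 0x0FF0 yields width = 8 (population count) and offset = 4 |
| // (trailing zeros); with no mask, the field is assumed to run from the shift |
| // amount up to bit 31. |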
| 548 // TODO: Handle the case of A & B | D & ~B (i.e. inverted masks). |
| 549 if ((lhsMaskVal || rhsMaskVal) && !(lhsMaskVal ^ rhsMaskVal)) { |
| 550 return false; |
| 551 } |
| 552 if (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset)) { |
| 553 offset = ConstantInt::get(aType, lhsMaskOffset, false); |
| 554 width = ConstantInt::get(aType, lhsMaskWidth, false); |
| 555 RHSSrc = RHS; |
| 556 if (!isMask_32(lhsMaskVal) && !isShiftedMask_32(lhsMaskVal)) { |
| 557 return false; |
| 558 } |
| 559 if (!LHSShift) { |
| 560 LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset, |
| 561 "MaskShr", LHS); |
| 562 } else if (lhsShiftVal != lhsMaskOffset) { |
| 563 LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset, |
| 564 "MaskShr", LHS); |
| 565 } |
| 566 if (mDebug) { |
| 567 dbgs() << "Optimizing LHS!\n"; |
| 568 } |
| 569 } else if (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset)) { |
| 570 offset = ConstantInt::get(aType, rhsMaskOffset, false); |
| 571 width = ConstantInt::get(aType, rhsMaskWidth, false); |
| 572 LHSSrc = RHSSrc; |
| 573 RHSSrc = LHS; |
| 574 if (!isMask_32(rhsMaskVal) && !isShiftedMask_32(rhsMaskVal)) { |
| 575 return false; |
| 576 } |
| 577 if (!RHSShift) { |
| 578 LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset, |
| 579 "MaskShr", RHS); |
| 580 } else if (rhsShiftVal != rhsMaskOffset) { |
| 581 LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset, |
| 582 "MaskShr", RHS); |
| 583 } |
| 584 if (mDebug) { |
| 585 dbgs() << "Optimizing RHS!\n"; |
| 586 } |
| 587 } else { |
| 588 if (mDebug) { |
| 589 dbgs() << "Failed constraint 3!\n"; |
| 590 } |
| 591 return false; |
| 592 } |
| 593 if (mDebug) { |
| 594 dbgs() << "Width: "; if (width) { width->dump(); } else { dbgs() << "(0)\n"
; } |
| 595 dbgs() << "Offset: "; if (offset) { offset->dump(); } else { dbgs() << "(0)\
n"; } |
| 596 dbgs() << "LHSSrc: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(0)\
n"; } |
| 597 dbgs() << "RHSSrc: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(0)\
n"; } |
| 598 } |
| 599 if (!offset || !width) { |
| 600 if (mDebug) { |
| 601 dbgs() << "Either width or offset are NULL, failed detection!\n"; |
| 602 } |
| 603 return false; |
| 604 } |
| 605 // Let's create the function signature. |
| 606 std::vector<Type *> callTypes; |
| 607 callTypes.push_back(aType); |
| 608 callTypes.push_back(aType); |
| 609 callTypes.push_back(aType); |
| 610 callTypes.push_back(aType); |
| 611 FunctionType *funcType = FunctionType::get(aType, callTypes, false); |
| 612 std::string name = "__amdil_ubit_insert"; |
| 613 if (isVector) { name += "_v" + itostr(numEle) + "u32"; } else { name += "_u32"; } |
| 614 Function *Func = |
| 615 dyn_cast<Function>(inst->getParent()->getParent()->getParent()-> |
| 616 getOrInsertFunction(StringRef(name), funcType)); |
| 617 Value *Operands[4] = { |
| 618 width, |
| 619 offset, |
| 620 LHSSrc, |
| 621 RHSSrc |
| 622 }; |
| 623 CallInst *CI = CallInst::Create(Func, Operands, "BitInsertOpt"); |
| 624 if (mDebug) { |
| 625 dbgs() << "Old Inst: "; |
| 626 inst->dump(); |
| 627 dbgs() << "New Inst: "; |
| 628 CI->dump(); |
| 629 dbgs() << "\n\n"; |
| 630 } |
| 631 CI->insertBefore(inst); |
| 632 inst->replaceAllUsesWith(CI); |
| 633 return true; |
| 634 } |
| 635 |
| 636 bool |
| 637 AMDGPUPeepholeOpt::optimizeBitExtract(Instruction *inst) { |
| 638 if (!inst) { |
| 639 return false; |
| 640 } |
| 641 if (!inst->isBinaryOp()) { |
| 642 return false; |
| 643 } |
| 644 if (inst->getOpcode() != Instruction::And) { |
| 645 return false; |
| 646 } |
| 647 if (optLevel == CodeGenOpt::None) { |
| 648 return false; |
| 649 } |
| 650 // We want to do some simple optimizations on Shift right/And patterns. The |
| 651 // basic optimization is to turn (A >> B) & C where A is a 32bit type, B is a |
| 652 // value smaller than 32 and C is a mask. If C is a constant value, then the |
| 653 // following transformation can occur. For signed integers, it turns into the |
| 654 // function call dst = __amdil_ibit_extract(log2(C), B, A). For unsigned |
| 655 // integers, it turns into the function call dst = |
| 656 // __amdil_ubit_extract(log2(C), B, A). The function __amdil_[u|i]bit_extract |
| 657 // can be found in Section 7.9 of the ATI IL spec of the stream SDK for |
| 658 // Evergreen hardware. |
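| // For example, (A >> 8) & 0xFF extracts the 8-bit field that starts at bit 8 |
| // of A, so the LShr/And pair becomes one bit-extract call with a width of 8 |
| // and a shift of 8. |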
| 659 if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) { |
| 660 // This does not work on HD4XXX hardware. |
| 661 return false; |
| 662 } |
| 663 Type *aType = inst->getType(); |
| 664 bool isVector = aType->isVectorTy(); |
| 665 |
| 666 // XXX Support vector types |
| 667 if (isVector) { |
| 668 return false; |
| 669 } |
| 670 int numEle = 1; |
| 671 // This only works on 32bit integers |
| 672 if (aType->getScalarType() |
| 673 != Type::getInt32Ty(inst->getContext())) { |
| 674 return false; |
| 675 } |
| 676 if (isVector) { |
| 677 const VectorType *VT = dyn_cast<VectorType>(aType); |
| 678 numEle = VT->getNumElements(); |
| 679 // We currently cannot support more than 4 elements in an intrinsic and we |
| 680 // cannot support Vec3 types. |
| 681 if (numEle > 4 || numEle == 3) { |
| 682 return false; |
| 683 } |
| 684 } |
| 685 BinaryOperator *ShiftInst = dyn_cast<BinaryOperator>(inst->getOperand(0)); |
| 686 // If the first operand is not a shift instruction, then we can return as it |
| 687 // doesn't match this pattern. |
| 688 if (!ShiftInst || !ShiftInst->isShift()) { |
| 689 return false; |
| 690 } |
| 691 // If we are a shift left, then we don't match this pattern. |
| 692 if (ShiftInst->getOpcode() == Instruction::Shl) { |
| 693 return false; |
| 694 } |
| 695 bool isSigned = ShiftInst->isArithmeticShift(); |
| 696 Constant *AndMask = dyn_cast<Constant>(inst->getOperand(1)); |
| 697 Constant *ShrVal = dyn_cast<Constant>(ShiftInst->getOperand(1)); |
| 698 // Let's make sure that the shift value and the And mask are constant integers. |
| 699 if (!AndMask || !ShrVal) { |
| 700 return false; |
| 701 } |
| 702 Constant *newMaskConst; |
| 703 Constant *shiftValConst; |
| 704 if (isVector) { |
| 705 // Handle the vector case |
| 706 std::vector<Constant *> maskVals; |
| 707 std::vector<Constant *> shiftVals; |
| 708 ConstantVector *AndMaskVec = dyn_cast<ConstantVector>(AndMask); |
| 709 ConstantVector *ShrValVec = dyn_cast<ConstantVector>(ShrVal); |
| 710 Type *scalarType = AndMaskVec->getType()->getScalarType(); |
| 711 assert(AndMaskVec->getNumOperands() == |
| 712 ShrValVec->getNumOperands() && "cannot have a " |
| 713 "combination where the number of elements to a " |
| 714 "shift and an and are different!"); |
| 715 for (size_t x = 0, y = AndMaskVec->getNumOperands(); x < y; ++x) { |
| 716 ConstantInt *AndCI = dyn_cast<ConstantInt>(AndMaskVec->getOperand(x)); |
| 717 ConstantInt *ShiftIC = dyn_cast<ConstantInt>(ShrValVec->getOperand(x)); |
| 718 if (!AndCI || !ShiftIC) { |
| 719 return false; |
| 720 } |
| 721 uint32_t maskVal = (uint32_t)AndCI->getZExtValue(); |
| 722 if (!isMask_32(maskVal)) { |
| 723 return false; |
| 724 } |
| 725 maskVal = (uint32_t)CountTrailingOnes_32(maskVal); |
| 726 uint32_t shiftVal = (uint32_t)ShiftIC->getZExtValue(); |
| 727 // If the mask or shiftval is greater than the bitcount, then break out. |
| 728 if (maskVal >= 32 || shiftVal >= 32) { |
| 729 return false; |
| 730 } |
| 731 // If the mask val is greater than the number of original bits left |
| 732 // then this optimization is invalid. |
| 733 if (maskVal > (32 - shiftVal)) { |
| 734 return false; |
| 735 } |
| 736 maskVals.push_back(ConstantInt::get(scalarType, maskVal, isSigned)); |
| 737 shiftVals.push_back(ConstantInt::get(scalarType, shiftVal, isSigned)); |
| 738 } |
| 739 newMaskConst = ConstantVector::get(maskVals); |
| 740 shiftValConst = ConstantVector::get(shiftVals); |
| 741 } else { |
| 742 // Handle the scalar case |
| 743 uint32_t maskVal = (uint32_t)dyn_cast<ConstantInt>(AndMask)->getZExtValue(); |
| 744 // This must be a mask value where all lower bits are set to 1 and then any |
| 745 // bit higher is set to 0. |
| 746 if (!isMask_32(maskVal)) { |
| 747 return false; |
| 748 } |
| 749 maskVal = (uint32_t)CountTrailingOnes_32(maskVal); |
| 750 // Count the number of bits set in the mask; this is the width of the |
| 751 // resulting bit set that is extracted from the source value. |
| 752 uint32_t shiftVal = (uint32_t)dyn_cast<ConstantInt>(ShrVal)->getZExtValue(); |
| 753 // If the mask or shift val is greater than the bitcount, then break out. |
| 754 if (maskVal >= 32 || shiftVal >= 32) { |
| 755 return false; |
| 756 } |
| 757 // If the mask val is greater than the number of original bits left then |
| 758 // this optimization is invalid. |
| 759 if (maskVal > (32 - shiftVal)) { |
| 760 return false; |
| 761 } |
| 762 newMaskConst = ConstantInt::get(aType, maskVal, isSigned); |
| 763 shiftValConst = ConstantInt::get(aType, shiftVal, isSigned); |
| 764 } |
| 765 // Let's create the function signature. |
| 766 std::vector<Type *> callTypes; |
| 767 callTypes.push_back(aType); |
| 768 callTypes.push_back(aType); |
| 769 callTypes.push_back(aType); |
| 770 FunctionType *funcType = FunctionType::get(aType, callTypes, false); |
| 771 std::string name = "llvm.AMDGPU.bit.extract.u32"; |
| 772 if (isVector) { |
| 773 name += ".v" + itostr(numEle) + "i32"; |
| 774 } else { |
| 775 name += "."; |
| 776 } |
| 777 // Let's create the function. |
| 778 Function *Func = |
| 779 dyn_cast<Function>(inst->getParent()->getParent()->getParent()-> |
| 780 getOrInsertFunction(StringRef(name), funcType)); |
| 781 Value *Operands[3] = { |
| 782 ShiftInst->getOperand(0), |
| 783 shiftValConst, |
| 784 newMaskConst |
| 785 }; |
| 786 // Let's create the Call with the operands. |
| 787 CallInst *CI = CallInst::Create(Func, Operands, "ByteExtractOpt"); |
| 788 CI->setDoesNotAccessMemory(); |
| 789 CI->insertBefore(inst); |
| 790 inst->replaceAllUsesWith(CI); |
| 791 return true; |
| 792 } |
| 793 |
| 794 bool |
| 795 AMDGPUPeepholeOpt::expandBFI(CallInst *CI) { |
| 796 if (!CI) { |
| 797 return false; |
| 798 } |
| 799 Value *LHS = CI->getOperand(CI->getNumOperands() - 1); |
| 800 if (!LHS->getName().startswith("__amdil_bfi")) { |
| 801 return false; |
| 802 } |
| 803 Type* type = CI->getOperand(0)->getType(); |
| 804 Constant *negOneConst = NULL; |
| 805 if (type->isVectorTy()) { |
| 806 std::vector<Constant *> negOneVals; |
| 807 negOneConst = ConstantInt::get(CI->getContext(), |
| 808 APInt(32, StringRef("-1"), 10)); |
| 809 for (size_t x = 0, |
| 810 y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) { |
| 811 negOneVals.push_back(negOneConst); |
| 812 } |
| 813 negOneConst = ConstantVector::get(negOneVals); |
| 814 } else { |
| 815 negOneConst = ConstantInt::get(CI->getContext(), |
| 816 APInt(32, StringRef("-1"), 10)); |
| 817 } |
| 818 // __amdil_bfi => (A & B) | (~A & C) |
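| // In other words, each result bit is taken from B where the corresponding bit |
| // of A is set and from C where it is clear (a per-bit select keyed by A). |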
| 819 BinaryOperator *lhs = |
| 820 BinaryOperator::Create(Instruction::And, CI->getOperand(0), |
| 821 CI->getOperand(1), "bfi_and", CI); |
| 822 BinaryOperator *rhs = |
| 823 BinaryOperator::Create(Instruction::Xor, CI->getOperand(0), negOneConst, |
| 824 "bfi_not", CI); |
| 825 rhs = BinaryOperator::Create(Instruction::And, rhs, CI->getOperand(2), |
| 826 "bfi_and", CI); |
| 827 lhs = BinaryOperator::Create(Instruction::Or, lhs, rhs, "bfi_or", CI); |
| 828 CI->replaceAllUsesWith(lhs); |
| 829 return true; |
| 830 } |
| 831 |
| 832 bool |
| 833 AMDGPUPeepholeOpt::expandBFM(CallInst *CI) { |
| 834 if (!CI) { |
| 835 return false; |
| 836 } |
| 837 Value *LHS = CI->getOperand(CI->getNumOperands() - 1); |
| 838 if (!LHS->getName().startswith("__amdil_bfm")) { |
| 839 return false; |
| 840 } |
| 841 // __amdil_bfm => ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1f) |
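| // For example, src0 == 4 and src1 == 8 give ((1 << 4) - 1) << 8 == 0xF00, |
| // i.e. a 4-bit mask placed at bit offset 8. |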
| 842 Constant *newMaskConst = NULL; |
| 843 Constant *newShiftConst = NULL; |
| 844 Type* type = CI->getOperand(0)->getType(); |
| 845 if (type->isVectorTy()) { |
| 846 std::vector<Constant*> newMaskVals, newShiftVals; |
| 847 newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F); |
| 848 newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1); |
| 849 for (size_t x = 0, |
| 850 y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) { |
| 851 newMaskVals.push_back(newMaskConst); |
| 852 newShiftVals.push_back(newShiftConst); |
| 853 } |
| 854 newMaskConst = ConstantVector::get(newMaskVals); |
| 855 newShiftConst = ConstantVector::get(newShiftVals); |
| 856 } else { |
| 857 newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F); |
| 858 newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1); |
| 859 } |
| 860 BinaryOperator *lhs = |
| 861 BinaryOperator::Create(Instruction::And, CI->getOperand(0), |
| 862 newMaskConst, "bfm_mask", CI); |
| 863 lhs = BinaryOperator::Create(Instruction::Shl, newShiftConst, |
| 864 lhs, "bfm_shl", CI); |
| 865 lhs = BinaryOperator::Create(Instruction::Sub, lhs, |
| 866 newShiftConst, "bfm_sub", CI); |
| 867 BinaryOperator *rhs = |
| 868 BinaryOperator::Create(Instruction::And, CI->getOperand(1), |
| 869 newMaskConst, "bfm_mask", CI); |
| 870 lhs = BinaryOperator::Create(Instruction::Shl, lhs, rhs, "bfm_shl", CI); |
| 871 CI->replaceAllUsesWith(lhs); |
| 872 return true; |
| 873 } |
| 874 |
| 875 bool |
| 876 AMDGPUPeepholeOpt::instLevelOptimizations(BasicBlock::iterator *bbb) { |
| 877 Instruction *inst = (*bbb); |
| 878 if (optimizeCallInst(bbb)) { |
| 879 return true; |
| 880 } |
| 881 if (optimizeBitExtract(inst)) { |
| 882 return false; |
| 883 } |
| 884 if (optimizeBitInsert(inst)) { |
| 885 return false; |
| 886 } |
| 887 if (correctMisalignedMemOp(inst)) { |
| 888 return false; |
| 889 } |
| 890 return false; |
| 891 } |
| 892 bool |
| 893 AMDGPUPeepholeOpt::correctMisalignedMemOp(Instruction *inst) { |
| 894 LoadInst *linst = dyn_cast<LoadInst>(inst); |
| 895 StoreInst *sinst = dyn_cast<StoreInst>(inst); |
| 896 unsigned alignment; |
| 897 Type* Ty = inst->getType(); |
| 898 if (linst) { |
| 899 alignment = linst->getAlignment(); |
| 900 Ty = inst->getType(); |
| 901 } else if (sinst) { |
| 902 alignment = sinst->getAlignment(); |
| 903 Ty = sinst->getValueOperand()->getType(); |
| 904 } else { |
| 905 return false; |
| 906 } |
| 907 unsigned size = getTypeSize(Ty); |
| 908 if (size == alignment || size < alignment) { |
| 909 return false; |
| 910 } |
| 911 if (!Ty->isStructTy()) { |
| 912 return false; |
| 913 } |
| 914 if (alignment < 4) { |
| 915 if (linst) { |
| 916 linst->setAlignment(0); |
| 917 return true; |
| 918 } else if (sinst) { |
| 919 sinst->setAlignment(0); |
| 920 return true; |
| 921 } |
| 922 } |
| 923 return false; |
| 924 } |
| 925 bool |
| 926 AMDGPUPeepholeOpt::isSigned24BitOps(CallInst *CI) { |
| 927 if (!CI) { |
| 928 return false; |
| 929 } |
| 930 Value *LHS = CI->getOperand(CI->getNumOperands() - 1); |
| 931 std::string namePrefix = LHS->getName().substr(0, 14); |
| 932 if (namePrefix != "__amdil_imad24" && namePrefix != "__amdil_imul24" |
| 933 && namePrefix != "__amdil__imul24_high") { |
| 934 return false; |
| 935 } |
| 936 if (mSTM->device()->usesHardware(AMDGPUDeviceInfo::Signed24BitOps)) { |
| 937 return false; |
| 938 } |
| 939 return true; |
| 940 } |
| 941 |
| 942 void |
| 943 AMDGPUPeepholeOpt::expandSigned24BitOps(CallInst *CI) { |
| 944 assert(isSigned24BitOps(CI) && "Must be a " |
| 945 "signed 24 bit operation to call this function!"); |
| 946 Value *LHS = CI->getOperand(CI->getNumOperands()-1); |
| 947 // On 7XX and 8XX we do not have signed 24bit, so we need to |
| 948 // expand it to the following: |
| 949 // imul24 turns into 32bit imul |
| 950 // imad24 turns into 32bit imad |
| 951 // imul24_high turns into 32bit imulhigh |
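| // For example, a call to __amdil_imad24(a, b, c) is re-emitted below as a call |
| // to the 32-bit __amdil_imad_i32(a, b, c), while __amdil_imul24(a, b) is |
| // lowered to a plain 32-bit mul instruction. |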
| 952 if (LHS->getName().substr(0, 14) == "__amdil_imad24") { |
| 953 Type *aType = CI->getOperand(0)->getType(); |
| 954 bool isVector = aType->isVectorTy(); |
| 955 int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1; |
| 956 std::vector<Type*> callTypes; |
| 957 callTypes.push_back(CI->getOperand(0)->getType()); |
| 958 callTypes.push_back(CI->getOperand(1)->getType()); |
| 959 callTypes.push_back(CI->getOperand(2)->getType()); |
| 960 FunctionType *funcType = |
| 961 FunctionType::get(CI->getOperand(0)->getType(), callTypes, false); |
| 962 std::string name = "__amdil_imad"; |
| 963 if (isVector) { |
| 964 name += "_v" + itostr(numEle) + "i32"; |
| 965 } else { |
| 966 name += "_i32"; |
| 967 } |
| 968 Function *Func = dyn_cast<Function>( |
| 969 CI->getParent()->getParent()->getParent()-> |
| 970 getOrInsertFunction(StringRef(name), funcType)); |
| 971 Value *Operands[3] = { |
| 972 CI->getOperand(0), |
| 973 CI->getOperand(1), |
| 974 CI->getOperand(2) |
| 975 }; |
| 976 CallInst *nCI = CallInst::Create(Func, Operands, "imad24"); |
| 977 nCI->insertBefore(CI); |
| 978 CI->replaceAllUsesWith(nCI); |
| 979 } else if (LHS->getName().substr(0, 14) == "__amdil_imul24") { |
| 980 BinaryOperator *mulOp = |
| 981 BinaryOperator::Create(Instruction::Mul, CI->getOperand(0), |
| 982 CI->getOperand(1), "imul24", CI); |
| 983 CI->replaceAllUsesWith(mulOp); |
| 984 } else if (LHS->getName().substr(0, 19) == "__amdil_imul24_high") { |
| 985 Type *aType = CI->getOperand(0)->getType(); |
| 986 |
| 987 bool isVector = aType->isVectorTy(); |
| 988 int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1; |
| 989 std::vector<Type*> callTypes; |
| 990 callTypes.push_back(CI->getOperand(0)->getType()); |
| 991 callTypes.push_back(CI->getOperand(1)->getType()); |
| 992 FunctionType *funcType = |
| 993 FunctionType::get(CI->getOperand(0)->getType(), callTypes, false); |
| 994 std::string name = "__amdil_imul_high"; |
| 995 if (isVector) { |
| 996 name += "_v" + itostr(numEle) + "i32"; |
| 997 } else { |
| 998 name += "_i32"; |
| 999 } |
| 1000 Function *Func = dyn_cast<Function>( |
| 1001 CI->getParent()->getParent()->getParent()-> |
| 1002 getOrInsertFunction(StringRef(name), funcType)); |
| 1003 Value *Operands[2] = { |
| 1004 CI->getOperand(0), |
| 1005 CI->getOperand(1) |
| 1006 }; |
| 1007 CallInst *nCI = CallInst::Create(Func, Operands, "imul24_high"); |
| 1008 nCI->insertBefore(CI); |
| 1009 CI->replaceAllUsesWith(nCI); |
| 1010 } |
| 1011 } |
| 1012 |
| 1013 bool |
| 1014 AMDGPUPeepholeOpt::isRWGLocalOpt(CallInst *CI) { |
| 1015 return (CI != NULL |
| 1016 && CI->getOperand(CI->getNumOperands() - 1)->getName() |
| 1017 == "__amdil_get_local_size_int"); |
| 1018 } |
| 1019 |
| 1020 bool |
| 1021 AMDGPUPeepholeOpt::convertAccurateDivide(CallInst *CI) { |
| 1022 if (!CI) { |
| 1023 return false; |
| 1024 } |
| 1025 if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD6XXX |
| 1026 && (mSTM->getDeviceName() == "cayman")) { |
| 1027 return false; |
| 1028 } |
| 1029 return CI->getOperand(CI->getNumOperands() - 1)->getName().substr(0, 20) |
| 1030 == "__amdil_improved_div"; |
| 1031 } |
| 1032 |
| 1033 void |
| 1034 AMDGPUPeepholeOpt::expandAccurateDivide(CallInst *CI) { |
| 1035 assert(convertAccurateDivide(CI) |
| 1036 && "expanding accurate divide can only happen if it is expandable!"); |
| 1037 BinaryOperator *divOp = |
| 1038 BinaryOperator::Create(Instruction::FDiv, CI->getOperand(0), |
| 1039 CI->getOperand(1), "fdiv32", CI); |
| 1040 CI->replaceAllUsesWith(divOp); |
| 1041 } |
| 1042 |
| 1043 bool |
| 1044 AMDGPUPeepholeOpt::propagateSamplerInst(CallInst *CI) { |
| 1045 if (optLevel != CodeGenOpt::None) { |
| 1046 return false; |
| 1047 } |
| 1048 |
| 1049 if (!CI) { |
| 1050 return false; |
| 1051 } |
| 1052 |
| 1053 unsigned funcNameIdx = 0; |
| 1054 funcNameIdx = CI->getNumOperands() - 1; |
| 1055 StringRef calleeName = CI->getOperand(funcNameIdx)->getName(); |
| 1056 if (calleeName != "__amdil_image2d_read_norm" |
| 1057 && calleeName != "__amdil_image2d_read_unnorm" |
| 1058 && calleeName != "__amdil_image3d_read_norm" |
| 1059 && calleeName != "__amdil_image3d_read_unnorm") { |
| 1060 return false; |
| 1061 } |
| 1062 |
| 1063 unsigned samplerIdx = 2; |
| 1064 samplerIdx = 1; |
| 1065 Value *sampler = CI->getOperand(samplerIdx); |
| 1066 LoadInst *lInst = dyn_cast<LoadInst>(sampler); |
| 1067 if (!lInst) { |
| 1068 return false; |
| 1069 } |
| 1070 |
| 1071 if (lInst->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { |
| 1072 return false; |
| 1073 } |
| 1074 |
| 1075 GlobalVariable *gv = dyn_cast<GlobalVariable>(lInst->getPointerOperand()); |
| 1076 // If we are loading from what is not a global value, then we |
| 1077 // fail and return. |
| 1078 if (!gv) { |
| 1079 return false; |
| 1080 } |
| 1081 |
| 1082 // If we don't have an initializer or we have an initializer and |
| 1083 // the initializer is not a 32bit integer, we fail. |
| 1084 if (!gv->hasInitializer() |
| 1085 || !gv->getInitializer()->getType()->isIntegerTy(32)) { |
| 1086 return false; |
| 1087 } |
| 1088 |
| 1089 // Now that we have the global variable initializer, let's replace |
| 1090 // all uses of the load instruction with the samplerVal and |
| 1091 // reparse the __amdil_is_constant() function. |
| 1092 Constant *samplerVal = gv->getInitializer(); |
| 1093 lInst->replaceAllUsesWith(samplerVal); |
| 1094 return true; |
| 1095 } |
| 1096 |
| 1097 bool |
| 1098 AMDGPUPeepholeOpt::doInitialization(Module &M) { |
| 1099 return false; |
| 1100 } |
| 1101 |
| 1102 bool |
| 1103 AMDGPUPeepholeOpt::doFinalization(Module &M) { |
| 1104 return false; |
| 1105 } |
| 1106 |
| 1107 void |
| 1108 AMDGPUPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const { |
| 1109 AU.addRequired<MachineFunctionAnalysis>(); |
| 1110 FunctionPass::getAnalysisUsage(AU); |
| 1111 AU.setPreservesAll(); |
| 1112 } |
| 1113 |
| 1114 size_t AMDGPUPeepholeOpt::getTypeSize(Type * const T, bool dereferencePtr) { |
| 1115 size_t size = 0; |
| 1116 if (!T) { |
| 1117 return size; |
| 1118 } |
| 1119 switch (T->getTypeID()) { |
| 1120 case Type::X86_FP80TyID: |
| 1121 case Type::FP128TyID: |
| 1122 case Type::PPC_FP128TyID: |
| 1123 case Type::LabelTyID: |
| 1124 assert(0 && "These types are not supported by this backend"); |
| 1125 default: |
| 1126 case Type::FloatTyID: |
| 1127 case Type::DoubleTyID: |
| 1128 size = T->getPrimitiveSizeInBits() >> 3; |
| 1129 break; |
| 1130 case Type::PointerTyID: |
| 1131 size = getTypeSize(dyn_cast<PointerType>(T), dereferencePtr); |
| 1132 break; |
| 1133 case Type::IntegerTyID: |
| 1134 size = getTypeSize(dyn_cast<IntegerType>(T), dereferencePtr); |
| 1135 break; |
| 1136 case Type::StructTyID: |
| 1137 size = getTypeSize(dyn_cast<StructType>(T), dereferencePtr); |
| 1138 break; |
| 1139 case Type::ArrayTyID: |
| 1140 size = getTypeSize(dyn_cast<ArrayType>(T), dereferencePtr); |
| 1141 break; |
| 1142 case Type::FunctionTyID: |
| 1143 size = getTypeSize(dyn_cast<FunctionType>(T), dereferencePtr); |
| 1144 break; |
| 1145 case Type::VectorTyID: |
| 1146 size = getTypeSize(dyn_cast<VectorType>(T), dereferencePtr); |
| 1147 break; |
| 1148 }; |
| 1149 return size; |
| 1150 } |
| 1151 |
| 1152 size_t AMDGPUPeepholeOpt::getTypeSize(StructType * const ST, |
| 1153 bool dereferencePtr) { |
| 1154 size_t size = 0; |
| 1155 if (!ST) { |
| 1156 return size; |
| 1157 } |
| 1158 Type *curType; |
| 1159 StructType::element_iterator eib; |
| 1160 StructType::element_iterator eie; |
| 1161 for (eib = ST->element_begin(), eie = ST->element_end(); eib != eie; ++eib) { |
| 1162 curType = *eib; |
| 1163 size += getTypeSize(curType, dereferencePtr); |
| 1164 } |
| 1165 return size; |
| 1166 } |
| 1167 |
| 1168 size_t AMDGPUPeepholeOpt::getTypeSize(IntegerType * const IT, |
| 1169 bool dereferencePtr) { |
| 1170 return IT ? (IT->getBitWidth() >> 3) : 0; |
| 1171 } |
| 1172 |
| 1173 size_t AMDGPUPeepholeOpt::getTypeSize(FunctionType * const FT, |
| 1174 bool dereferencePtr) { |
| 1175 assert(0 && "Should not be able to calculate the size of an function type"); |
| 1176 return 0; |
| 1177 } |
| 1178 |
| 1179 size_t AMDGPUPeepholeOpt::getTypeSize(ArrayType * const AT, |
| 1180 bool dereferencePtr) { |
| 1181 return (size_t)(AT ? (getTypeSize(AT->getElementType(), |
| 1182 dereferencePtr) * AT->getNumElements()) |
| 1183 : 0); |
| 1184 } |
| 1185 |
| 1186 size_t AMDGPUPeepholeOpt::getTypeSize(VectorType * const VT, |
| 1187 bool dereferencePtr) { |
| 1188 return VT ? (VT->getBitWidth() >> 3) : 0; |
| 1189 } |
| 1190 |
| 1191 size_t AMDGPUPeepholeOpt::getTypeSize(PointerType * const PT, |
| 1192 bool dereferencePtr) { |
| 1193 if (!PT) { |
| 1194 return 0; |
| 1195 } |
| 1196 Type *CT = PT->getElementType(); |
| 1197 if (CT->getTypeID() == Type::StructTyID && |
| 1198 PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { |
| 1199 return getTypeSize(dyn_cast<StructType>(CT)); |
| 1200 } else if (dereferencePtr) { |
| 1201 size_t size = 0; |
| 1202 for (size_t x = 0, y = PT->getNumContainedTypes(); x < y; ++x) { |
| 1203 size += getTypeSize(PT->getContainedType(x), dereferencePtr); |
| 1204 } |
| 1205 return size; |
| 1206 } else { |
| 1207 return 4; |
| 1208 } |
| 1209 } |
| 1210 |
| 1211 size_t AMDGPUPeepholeOpt::getTypeSize(OpaqueType * const OT, |
| 1212 bool dereferencePtr) { |
| 1213 //assert(0 && "Should not be able to calculate the size of an opaque type"); |
| 1214 return 4; |
| 1215 } |