Index: tools/kilobench/kilobench.cpp
diff --git a/tools/kilobench/kilobench.cpp b/tools/kilobench/kilobench.cpp
index 8c844f47e0169ac26f72cae33d22b03fb52d8e50..1b9cb34de0fe74d0eb38a818847248e62f65a9e4 100644
--- a/tools/kilobench/kilobench.cpp
+++ b/tools/kilobench/kilobench.cpp
@@ -14,10 +14,15 @@
 #include "SkStream.h"
 #include "SkSurface.h"
 #include "SkTime.h"
+#include "SkTLList.h"
+#include "SkThreadUtils.h"
 #include "Stats.h"
 #include "Timer.h"
 #include "VisualSKPBench.h"
 #include "gl/GrGLDefines.h"
+#include "../private/SkMutex.h"
+#include "../private/SkSemaphore.h"
+#include "../private/SkGpuFenceSync.h"
 // posix only for now
 #include <unistd.h>
@@ -34,7 +39,6 @@
 #include "SkImageDecoder.h"
 __SK_FORCE_IMAGE_DECODER_LINKING;
-
 static const int kAutoTuneLoops = 0;
 static const int kDefaultLoops =
@@ -68,6 +72,8 @@ DEFINE_int32(maxLoops, 1000000, "Never run a bench more times than this.");
 DEFINE_int32(loops, kDefaultLoops, loops_help_txt().c_str());
 DEFINE_double(gpuMs, 5, "Target bench time in millseconds for GPU.");
 DEFINE_string2(writePath, w, "", "If set, write bitmaps here as .pngs.");
+DEFINE_bool(useBackgroundThread, true, "If false, kilobench will time cpu / gpu work together");
+DEFINE_bool(useMultiProcess, true, "If false, kilobench will run all tests in one process");
 static SkString humanize(double ms) {
     return HumanizeMs(ms);
@@ -146,25 +152,29 @@ private:
 struct GPUTarget {
     void setup() {
-        this->gl->makeCurrent();
+        fGL->makeCurrent();
         // Make sure we're done with whatever came before.
-        SK_GL(*this->gl, Finish());
+        SK_GL(*fGL, Finish());
     }
     SkCanvas* beginTiming(SkCanvas* canvas) { return canvas; }
-    void endTiming() {
-        if (this->gl) {
-            SK_GL(*this->gl, Flush());
-            this->gl->swapBuffers();
+    void endTiming(bool usePlatformSwapBuffers) {
+        if (fGL) {
+            SK_GL(*fGL, Flush());
+            if (usePlatformSwapBuffers) {
+                fGL->swapBuffers();
+            } else {
+                fGL->waitOnSyncOrSwap();
+            }
         }
     }
-    void fence() {
-        SK_GL(*this->gl, Finish());
+    void finish() {
+        SK_GL(*fGL, Finish());
     }
     bool needsFrameTiming(int* maxFrameLag) const {
-        if (!this->gl->getMaxGpuFrameLag(maxFrameLag)) {
+        if (!fGL->getMaxGpuFrameLag(maxFrameLag)) {
            // Frame lag is unknown.
            *maxFrameLag = FLAGS_gpuFrameLag;
        }
@@ -182,24 +192,24 @@ struct GPUTarget {
         uint32_t flags = useDfText ? SkSurfaceProps::kUseDeviceIndependentFonts_Flag :
                                      0;
         SkSurfaceProps props(flags, SkSurfaceProps::kLegacyFontHost_InitType);
-        this->surface.reset(SkSurface::NewRenderTarget(context,
-                                                       SkSurface::kNo_Budgeted, info,
-                                                       numSamples, &props));
-        this->gl = factory->getContextInfo(ctxType, ctxOptions).fGLContext;
-        if (!this->surface.get()) {
+        fSurface.reset(SkSurface::NewRenderTarget(context,
+                                                  SkSurface::kNo_Budgeted, info,
+                                                  numSamples, &props));
+        fGL = factory->getContextInfo(ctxType, ctxOptions).fGLContext;
+        if (!fSurface.get()) {
             return false;
         }
         // Kilobench should only be used on platforms with fence sync support
-        SkASSERT(this->gl->fenceSyncSupport());
+        SkASSERT(fGL->fenceSyncSupport());
         return true;
     }
     SkCanvas* getCanvas() const {
-        if (!surface.get()) {
+        if (!fSurface.get()) {
             return nullptr;
         }
-        return surface->getCanvas();
+        return fSurface->getCanvas();
     }
     bool capturePixels(SkBitmap* bmp) {
@@ -215,10 +225,11 @@ struct GPUTarget {
         return true;
     }
+    SkGLContext* gl() { return fGL; }
+
 private:
-    //const Config config;
-    SkGLContext* gl;
-    SkAutoTDelete<SkSurface> surface;
+    SkGLContext* fGL;
+    SkAutoTDelete<SkSurface> fSurface;
 };
 static bool write_canvas_png(GPUTarget* target, const SkString& filename) {
@@ -276,24 +287,159 @@ static int clamp_loops(int loops) {
 }
 static double now_ms() { return SkTime::GetNSecs() * 1e-6; }
-static double time(int loops, Benchmark* bench, GPUTarget* target) {
-    SkCanvas* canvas = target->getCanvas();
-    if (canvas) {
-        canvas->clear(SK_ColorWHITE);
+
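+// TimingThread measures GPU frame times on a background thread. The thread creates a GL context
+// that shares display lists with the main context, then waits on fence syncs that the main thread
+// inserts at the start and end of each timed frame.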
+struct TimingThread {
+    TimingThread(SkGLContext* mainContext)
+        : fFenceSync(mainContext->fenceSync())
+        , fMainContext(mainContext)
+        , fDone(false) {}
+
+    static void Loop(void* data) {
+        TimingThread* timingThread = reinterpret_cast<TimingThread*>(data);
+        timingThread->timingLoop();
+    }
+
+    // To ensure waiting for the sync actually does something, we check to make sure that we
+    // exceed some small value
+    const double kMinElapsed = 1e-6;
+    bool sanity(double start) const {
+        double elapsed = now_ms() - start;
+        return elapsed > kMinElapsed;
+    }
+
+    void waitFence(SkPlatformGpuFence sync) {
+        SkDEBUGCODE(double start = now_ms());
+        fFenceSync->waitFence(sync, false);
+        SkASSERT(sanity(start));
+    }
+
+    void timingLoop() {
+        // Create a context which shares display lists with the main thread
+        SkAutoTDelete<SkGLContext> glContext(SkCreatePlatformGLContext(kNone_GrGLStandard,
+                                                                       fMainContext));
+        glContext->makeCurrent();
+
+        // Basic timing methodology is:
+        // 1) Wait on semaphore until main thread indicates it's time to start timing the frame
+        // 2) Wait on frame start sync, record time. This is the start of the frame.
+        // 3) Wait on semaphore until main thread indicates it's time to finish timing the frame
+        // 4) Wait on frame end sync, record time. FrameEndTime - FrameStartTime = frame time
+        // 5) Wait on semaphore until main thread indicates we should time the next frame or quit
+        while (true) {
+            fSemaphore.wait();
+
+            // get start sync
+            SkPlatformGpuFence startSync = this->popStartSync();
+
+            // wait on sync
+            this->waitFence(startSync);
+            double start = kilobench::now_ms();
+
+            // do we want to sleep here?
+            // wait for end sync
+            fSemaphore.wait();
+
+            // get end sync
+            SkPlatformGpuFence endSync = this->popEndSync();
+
+            // wait on sync
+            this->waitFence(endSync);
+            double elapsed = kilobench::now_ms() - start;
+
+            // No mutex needed, client won't touch timings until we're done
+            fTimings.push_back(elapsed);
+
+            // clean up fences
+            fFenceSync->deleteFence(startSync);
+            fFenceSync->deleteFence(endSync);
+
+            fSemaphore.wait();
+            if (this->isDone()) {
+                break;
+            }
+        }
+    }
+
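+    // The main thread pushes a fence sync at the start and at the end of the timed frame; the
+    // timing thread pops and waits on them in order. Each queue has its own mutex, and the
+    // semaphore is signalled once per push (and once more by setDone()).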
+    void pushStartSync() { this->pushSync(&fFrameStartSyncs, &fFrameStartSyncsMutex); }
+
+    SkPlatformGpuFence popStartSync() {
+        return this->popSync(&fFrameStartSyncs, &fFrameStartSyncsMutex);
+    }
+
+    void pushEndSync() { this->pushSync(&fFrameEndSyncs, &fFrameEndSyncsMutex); }
+
+    SkPlatformGpuFence popEndSync() { return this->popSync(&fFrameEndSyncs, &fFrameEndSyncsMutex); }
+
+    void setDone() {
+        SkAutoMutexAcquire done(fDoneMutex);
+        fDone = true;
+        fSemaphore.signal();
+    }
+
+    typedef SkTLList<SkPlatformGpuFence, 1> SyncQueue;
+
+    void pushSync(SyncQueue* queue, SkMutex* mutex) {
+        SkAutoMutexAcquire am(mutex);
+        *queue->addToHead() = fFenceSync->insertFence();
+        fSemaphore.signal();
+    }
+
+    SkPlatformGpuFence popSync(SyncQueue* queue, SkMutex* mutex) {
+        SkAutoMutexAcquire am(mutex);
+        SkPlatformGpuFence sync = *queue->head();
+        queue->popHead();
+        return sync;
+    }
+
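+    // Only report done once the main thread has called setDone() and every pending start sync
+    // has been consumed, so the last frame still gets timed.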
+    bool isDone() {
+        SkAutoMutexAcquire am1(fFrameStartSyncsMutex);
+        SkAutoMutexAcquire done(fDoneMutex);
+        if (fDone && fFrameStartSyncs.isEmpty()) {
+            return true;
+        } else {
+            return false;
+        }
     }
+
+    const SkTArray<double>& timings() const { SkASSERT(fDone); return fTimings; }
+
+private:
+    SkGpuFenceSync* fFenceSync;
+    SkSemaphore fSemaphore;
+    SkMutex fFrameStartSyncsMutex;
+    SyncQueue fFrameStartSyncs;
+    SkMutex fFrameEndSyncsMutex;
+    SyncQueue fFrameEndSyncs;
+    SkTArray<double> fTimings;
+    SkMutex fDoneMutex;
+    SkGLContext* fMainContext;
+    bool fDone;
+};
+
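+// Times one sample on the calling thread. When a TimingThread is supplied, fence syncs are pushed
+// around the draw so the background thread can measure the GPU frame time independently of the
+// cpu time returned here.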
+static double time(int loops, Benchmark* bench, GPUTarget* target, TimingThread* timingThread) {
+    SkCanvas* canvas = target->getCanvas();
+    canvas->clear(SK_ColorWHITE);
     bench->preDraw(canvas);
+
+    if (timingThread) {
+        timingThread->pushStartSync();
+    }
     double start = now_ms();
     canvas = target->beginTiming(canvas);
     bench->draw(loops, canvas);
-    if (canvas) {
-        canvas->flush();
-    }
-    target->endTiming();
+    canvas->flush();
+    target->endTiming(timingThread ? true : false);
+
     double elapsed = now_ms() - start;
+    if (timingThread) {
+        timingThread->pushEndSync();
+        timingThread->setDone();
+    }
     bench->postDraw(canvas);
     return elapsed;
 }
+// TODO For now we don't use the background timing thread to tune loops
 static int setup_gpu_bench(GPUTarget* target, Benchmark* bench, int maxGpuFrameLag) {
     // First, figure out how many loops it'll take to get a frame up to FLAGS_gpuMs.
     int loops = bench->calculateLoops(FLAGS_loops);
@@ -310,7 +456,7 @@ static int setup_gpu_bench(GPUTarget* target, Benchmark* bench, int maxGpuFrameL
             // If the GPU lets frames lag at all, we need to make sure we're timing
             // _this_ round, not still timing last round.
             for (int i = 0; i < maxGpuFrameLag; i++) {
-                elapsed = time(loops, bench, target);
+                elapsed = time(loops, bench, target, nullptr);
             }
         } while (elapsed < FLAGS_gpuMs);
@@ -319,7 +465,7 @@ static int setup_gpu_bench(GPUTarget* target, Benchmark* bench, int maxGpuFrameL
         loops = clamp_loops(loops);
         // Make sure we're not still timing our calibration.
-        target->fence();
+        target->finish();
     } else {
         loops = detect_forever_loops(loops);
     }
@@ -327,7 +473,7 @@ static int setup_gpu_bench(GPUTarget* target, Benchmark* bench, int maxGpuFrameL
     // Pretty much the same deal as the calibration: do some warmup to make
     // sure we're timing steady-state pipelined frames.
     for (int i = 0; i < maxGpuFrameLag - 1; i++) {
-        time(loops, bench, target);
+        time(loops, bench, target, nullptr);
     }
     return loops;
@@ -351,13 +497,14 @@ struct AutoSetupContextBenchAndTarget {
     int getLoops() { return setup_gpu_bench(&fTarget, fBenchmark, fMaxFrameLag); }
-    double timeSample(int loops) {
+    double timeSample(int loops, TimingThread* timingThread) {
         for (int i = 0; i < fMaxFrameLag; i++) {
-            time(loops, fBenchmark, &fTarget);
+            time(loops, fBenchmark, &fTarget, timingThread);
         }
-        return time(loops, fBenchmark, &fTarget) / loops;
+        return time(loops, fBenchmark, &fTarget, timingThread) / loops;
     }
+
     void teardownBench() { fBenchmark->perCanvasPostDraw(fCanvas); }
     SkAutoTDelete<GrContextFactory> fCtxFactory;
@@ -381,9 +528,32 @@ int setup_loops(Benchmark* bench) {
     return loops;
 }
-double time_sample(Benchmark* bench, int loops) {
+struct Sample {
+    double fCpu;
+    double fGpu;
+};
+
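+// Returns the per-loop cpu time and, when FLAGS_useBackgroundThread is set, the minimum frame
+// time recorded by the background TimingThread.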
+Sample time_sample(Benchmark* bench, int loops) {
     AutoSetupContextBenchAndTarget ascbt(bench);
-    double sample = ascbt.timeSample(loops);
+
+    Sample sample;
+    if (FLAGS_useBackgroundThread) {
+        TimingThread timingThread(ascbt.fTarget.gl());
+        SkAutoTDelete<SkThread> nativeThread(new SkThread(TimingThread::Loop, &timingThread));
+        nativeThread->start();
+        sample.fCpu = ascbt.timeSample(loops, &timingThread);
+        nativeThread->join();
+
+        // Use the minimum recorded frame time as the gpu sample
+        double min = SK_ScalarMax;
+        for (int i = 0; i < timingThread.timings().count(); i++) {
+            min = SkTMin(min, timingThread.timings()[i]);
+        }
+        sample.fGpu = min;
+    } else {
+        sample.fCpu = ascbt.timeSample(loops, nullptr);
+    }
+
     ascbt.teardownBench();
     return sample;
@@ -393,6 +563,24 @@ double time_sample(Benchmark* bench, int loops) {
 static const int kOutResultSize = 1024;
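+// Prints one tab-separated result row, tagging the bench name with the given modifier
+// ("cpu" or "gpu").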
+void printResult(const SkTArray<double>& samples, int loops, const char* name, const char* mod) {
+    SkString newName(name);
+    newName.appendf("_%s", mod);
+    Stats stats(samples);
+    const double stddev_percent = 100 * sqrt(stats.var) / stats.mean;
+    SkDebugf("%d\t%s\t%s\t%s\t%s\t%.0f%%\t%s\t%s\t%s\n"
+            , loops
+            , HUMANIZE(stats.min)
+            , HUMANIZE(stats.median)
+            , HUMANIZE(stats.mean)
+            , HUMANIZE(stats.max)
+            , stddev_percent
+            , stats.plot.c_str()
+            , "gpu"
+            , newName.c_str()
+            );
+}
+
 int kilobench_main() {
     kilobench::BenchmarkStream benchStream;
@@ -407,60 +595,63 @@ int kilobench_main() {
     while (Benchmark* b = benchStream.next()) {
         SkAutoTDelete<Benchmark> bench(b);
-        int loops;
-        SkTArray<double> samples;
+        int loops = 1;
+        SkTArray<double> cpuSamples;
+        SkTArray<double> gpuSamples;
         for (int i = 0; i < FLAGS_samples + 1; i++) {
             // We fork off a new process to setup the grcontext and run the test while we wait
-            int childPid = fork();
-            if (childPid > 0) {
-                char result[kOutResultSize];
-                if (read(descriptors[0], result, kOutResultSize) < 0) {
-                    SkFAIL("Failed to read from pipe\n");
-                }
-
-                // if samples == 0 then parse # of loops
-                // else parse float
-                if (i == 0) {
-                    sscanf(result, "%d", &loops);
+            if (FLAGS_useMultiProcess) {
+                int childPid = fork();
+                if (childPid > 0) {
+                    char result[kOutResultSize];
+                    if (read(descriptors[0], result, kOutResultSize) < 0) {
+                        SkFAIL("Failed to read from pipe\n");
+                    }
+
+                    // if i == 0 then parse # of loops
+                    // else parse the cpu / gpu samples
+                    if (i == 0) {
+                        sscanf(result, "%d", &loops);
+                    } else {
+                        sscanf(result, "%lf %lf", &cpuSamples.push_back(),
+                               &gpuSamples.push_back());
+                    }
+
+                    // wait until exit
+                    int status;
+                    waitpid(childPid, &status, 0);
+                } else if (0 == childPid) {
+                    char result[kOutResultSize];
+                    if (i == 0) {
+                        sprintf(result, "%d", kilobench::setup_loops(bench));
+                    } else {
+                        kilobench::Sample sample = kilobench::time_sample(bench, loops);
+                        sprintf(result, "%lf %lf", sample.fCpu, sample.fGpu);
+                    }
+
+                    // Make sure to write the null terminator
+                    if (write(descriptors[1], result, strlen(result) + 1) < 0) {
+                        SkFAIL("Failed to write to pipe\n");
+                    }
+                    return 0;
                 } else {
-                    sscanf(result, "%lf", &samples.push_back());
+                    SkFAIL("Fork failed\n");
                 }
-
-                // wait until exit
-                int status;
-                waitpid(childPid, &status, 0);
-            } else if (0 == childPid) {
-                char result[kOutResultSize];
+            } else {
                 if (i == 0) {
-                    sprintf(result, "%d", kilobench::setup_loops(bench));
+                    loops = kilobench::setup_loops(bench);
                 } else {
-                    sprintf(result, "%lf", kilobench::time_sample(bench, loops));
+                    kilobench::Sample sample = kilobench::time_sample(bench, loops);
+                    cpuSamples.push_back(sample.fCpu);
+                    gpuSamples.push_back(sample.fGpu);
                 }
-
-                // Make sure to write the null terminator
-                if (write(descriptors[1], result, strlen(result) + 1) < 0) {
-                    SkFAIL("Failed to write to pipe\n");
-                }
-                return 0;
-            } else {
-                SkFAIL("Fork failed\n");
             }
         }
-        Stats stats(samples);
-        const double stddev_percent = 100 * sqrt(stats.var) / stats.mean;
-        SkDebugf("%d\t%s\t%s\t%s\t%s\t%.0f%%\t%s\t%s\t%s\n"
-                , loops
-                , HUMANIZE(stats.min)
-                , HUMANIZE(stats.median)
-                , HUMANIZE(stats.mean)
-                , HUMANIZE(stats.max)
-                , stddev_percent
-                , stats.plot.c_str()
-                , "gpu"
-                , bench->getUniqueName()
-                );
-
+        printResult(cpuSamples, loops, bench->getUniqueName(), "cpu");
+        if (FLAGS_useBackgroundThread) {
+            printResult(gpuSamples, loops, bench->getUniqueName(), "gpu");
+        }
     }
     return 0;
 }