Index: tools/kilobench/kilobench.cpp
diff --git a/tools/kilobench/kilobench.cpp b/tools/kilobench/kilobench.cpp
index 8c844f47e0169ac26f72cae33d22b03fb52d8e50..1b9cb34de0fe74d0eb38a818847248e62f65a9e4 100644
--- a/tools/kilobench/kilobench.cpp
+++ b/tools/kilobench/kilobench.cpp
@@ -14,10 +14,15 @@
 #include "SkStream.h"
 #include "SkSurface.h"
 #include "SkTime.h"
+#include "SkTLList.h"
+#include "SkThreadUtils.h"
 #include "Stats.h"
 #include "Timer.h"
 #include "VisualSKPBench.h"
 #include "gl/GrGLDefines.h"
+#include "../private/SkMutex.h"
+#include "../private/SkSemaphore.h"
+#include "../private/SkGpuFenceSync.h"
 // posix only for now
 #include <unistd.h>
@@ -34,7 +39,6 @@
 #include "SkImageDecoder.h"
 __SK_FORCE_IMAGE_DECODER_LINKING;
-
 static const int kAutoTuneLoops = 0;
 static const int kDefaultLoops =
@@ -68,6 +72,8 @@ DEFINE_int32(maxLoops, 1000000, "Never run a bench more times than this.");
 DEFINE_int32(loops, kDefaultLoops, loops_help_txt().c_str());
 DEFINE_double(gpuMs, 5, "Target bench time in millseconds for GPU.");
 DEFINE_string2(writePath, w, "", "If set, write bitmaps here as .pngs.");
+DEFINE_bool(useBackgroundThread, true, "If false, kilobench will time cpu / gpu work together");
+DEFINE_bool(useMultiProcess, true, "If false, kilobench will run all tests in one process");
 static SkString humanize(double ms) {
     return HumanizeMs(ms);
@@ -146,25 +152,29 @@ private:
 struct GPUTarget {
     void setup() {
-        this->gl->makeCurrent();
+        fGL->makeCurrent();
         // Make sure we're done with whatever came before.
-        SK_GL(*this->gl, Finish());
+        SK_GL(*fGL, Finish());
     }
     SkCanvas* beginTiming(SkCanvas* canvas) { return canvas; }
-    void endTiming() {
-        if (this->gl) {
-            SK_GL(*this->gl, Flush());
-            this->gl->swapBuffers();
+    void endTiming(bool usePlatformSwapBuffers) {
+        if (fGL) {
+            SK_GL(*fGL, Flush());
+            if (usePlatformSwapBuffers) {
+                fGL->swapBuffers();
+            } else {
+                fGL->waitOnSyncOrSwap();
+            }
         }
     }
-    void fence() {
-        SK_GL(*this->gl, Finish());
+    void finish() {
+        SK_GL(*fGL, Finish());
     }
     bool needsFrameTiming(int* maxFrameLag) const {
-        if (!this->gl->getMaxGpuFrameLag(maxFrameLag)) {
+        if (!fGL->getMaxGpuFrameLag(maxFrameLag)) {
            // Frame lag is unknown.
            *maxFrameLag = FLAGS_gpuFrameLag;
        }
@@ -182,24 +192,24 @@ struct GPUTarget {
         uint32_t flags = useDfText ? SkSurfaceProps::kUseDeviceIndependentFonts_Flag :
                                      0;
         SkSurfaceProps props(flags, SkSurfaceProps::kLegacyFontHost_InitType);
-        this->surface.reset(SkSurface::NewRenderTarget(context,
-                                                       SkSurface::kNo_Budgeted, info,
-                                                       numSamples, &props));
-        this->gl = factory->getContextInfo(ctxType, ctxOptions).fGLContext;
-        if (!this->surface.get()) {
+        fSurface.reset(SkSurface::NewRenderTarget(context,
+                                                  SkSurface::kNo_Budgeted, info,
+                                                  numSamples, &props));
+        fGL = factory->getContextInfo(ctxType, ctxOptions).fGLContext;
+        if (!fSurface.get()) {
             return false;
         }
         // Kilobench should only be used on platforms with fence sync support
-        SkASSERT(this->gl->fenceSyncSupport());
+        SkASSERT(fGL->fenceSyncSupport());
         return true;
     }
     SkCanvas* getCanvas() const {
-        if (!surface.get()) {
+        if (!fSurface.get()) {
             return nullptr;
         }
-        return surface->getCanvas();
+        return fSurface->getCanvas();
     }
     bool capturePixels(SkBitmap* bmp) {
@@ -215,10 +225,11 @@ struct GPUTarget {
         return true;
     }
+    SkGLContext* gl() { return fGL; }
+
 private:
-    //const Config config;
-    SkGLContext* gl;
-    SkAutoTDelete<SkSurface> surface;
+    SkGLContext* fGL;
+    SkAutoTDelete<SkSurface> fSurface;
 };
 static bool write_canvas_png(GPUTarget* target, const SkString& filename) {
@@ -276,24 +287,159 @@ static int clamp_loops(int loops) {
 }
 static double now_ms() { return SkTime::GetNSecs() * 1e-6; }
-static double time(int loops, Benchmark* bench, GPUTarget* target) {
-    SkCanvas* canvas = target->getCanvas();
-    if (canvas) {
-        canvas->clear(SK_ColorWHITE);
+
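+// TimingThread measures GPU frame times on a background thread. The thread creates a GL context
+// that shares display lists with the main context, then waits on fence syncs that the main thread
+// inserts at the start and end of each timed frame.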
+struct TimingThread {
+    TimingThread(SkGLContext* mainContext)
+        : fFenceSync(mainContext->fenceSync())
+        , fMainContext(mainContext)
+        , fDone(false) {}
+
+    static void Loop(void* data) {
+        TimingThread* timingThread = reinterpret_cast<TimingThread*>(data);
+        timingThread->timingLoop();
+    }
+
+    // To ensure waiting for the sync actually does something, we check to make sure that we
+    // exceed some small value
+    const double kMinElapsed = 1e-6;
+    bool sanity(double start) const {
+        double elapsed = now_ms() - start;
+        return elapsed > kMinElapsed;
+    }
+
+    void waitFence(SkPlatformGpuFence sync) {
+        SkDEBUGCODE(double start = now_ms());
+        fFenceSync->waitFence(sync, false);
+        SkASSERT(sanity(start));
+    }
+
+    void timingLoop() {
+        // Create a context which shares display lists with the main thread
+        SkAutoTDelete<SkGLContext> glContext(SkCreatePlatformGLContext(kNone_GrGLStandard,
+                                                                       fMainContext));
+        glContext->makeCurrent();
+
+        // Basic timing methodology is:
+        // 1) Wait on semaphore until main thread indicates it's time to start timing the frame
+        // 2) Wait on frame start sync, record time. This is the start of the frame.
+        // 3) Wait on semaphore until main thread indicates it's time to finish timing the frame
+        // 4) Wait on frame end sync, record time. FrameEndTime - FrameStartTime = frame time
+        // 5) Wait on semaphore until main thread indicates we should time the next frame or quit
+        while (true) {
+            fSemaphore.wait();
+
+            // get start sync
+            SkPlatformGpuFence startSync = this->popStartSync();
+
+            // wait on sync
+            this->waitFence(startSync);
+            double start = kilobench::now_ms();
+
+            // do we want to sleep here?
+            // wait for end sync
+            fSemaphore.wait();
+
+            // get end sync
+            SkPlatformGpuFence endSync = this->popEndSync();
+
+            // wait on sync
+            this->waitFence(endSync);
+            double elapsed = kilobench::now_ms() - start;
+
+            // No mutex needed, client won't touch timings until we're done
+            fTimings.push_back(elapsed);
+
+            // clean up fences
+            fFenceSync->deleteFence(startSync);
+            fFenceSync->deleteFence(endSync);
+
+            fSemaphore.wait();
+            if (this->isDone()) {
+                break;
+            }
+        }
+    }
+
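+    // The main thread pushes a fence sync at the start and at the end of the timed frame; the
+    // timing thread pops and waits on them in order. Each queue has its own mutex, and the
+    // semaphore is signalled once per push (and once more by setDone()).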
+    void pushStartSync() { this->pushSync(&fFrameStartSyncs, &fFrameStartSyncsMutex); }
+
+    SkPlatformGpuFence popStartSync() {
+        return this->popSync(&fFrameStartSyncs, &fFrameStartSyncsMutex);
+    }
+
+    void pushEndSync() { this->pushSync(&fFrameEndSyncs, &fFrameEndSyncsMutex); }
+
+    SkPlatformGpuFence popEndSync() { return this->popSync(&fFrameEndSyncs, &fFrameEndSyncsMutex); }
+
+    void setDone() {
+        SkAutoMutexAcquire done(fDoneMutex);
+        fDone = true;
+        fSemaphore.signal();
+    }
+
+    typedef SkTLList<SkPlatformGpuFence, 1> SyncQueue;
+
+    void pushSync(SyncQueue* queue, SkMutex* mutex) {
+        SkAutoMutexAcquire am(mutex);
+        *queue->addToHead() = fFenceSync->insertFence();
+        fSemaphore.signal();
+    }
+
+    SkPlatformGpuFence popSync(SyncQueue* queue, SkMutex* mutex) {
+        SkAutoMutexAcquire am(mutex);
+        SkPlatformGpuFence sync = *queue->head();
+        queue->popHead();
+        return sync;
+    }
+
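+    // Only report done once the main thread has called setDone() and every pending start sync
+    // has been consumed, so the last frame still gets timed.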
+    bool isDone() {
+        SkAutoMutexAcquire am1(fFrameStartSyncsMutex);
+        SkAutoMutexAcquire done(fDoneMutex);
+        if (fDone && fFrameStartSyncs.isEmpty()) {
+            return true;
+        } else {
+            return false;
+        }
     }
+
+    const SkTArray<double>& timings() const { SkASSERT(fDone); return fTimings; }
+
+private:
+    SkGpuFenceSync* fFenceSync;
+    SkSemaphore fSemaphore;
+    SkMutex fFrameStartSyncsMutex;
+    SyncQueue fFrameStartSyncs;
+    SkMutex fFrameEndSyncsMutex;
+    SyncQueue fFrameEndSyncs;
+    SkTArray<double> fTimings;
+    SkMutex fDoneMutex;
+    SkGLContext* fMainContext;
+    bool fDone;
+};
+
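+// Times one sample on the calling thread. When a TimingThread is supplied, fence syncs are pushed
+// around the draw so the background thread can measure the GPU frame time independently of the
+// cpu time returned here.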
+static double time(int loops, Benchmark* bench, GPUTarget* target, TimingThread* timingThread) {
+    SkCanvas* canvas = target->getCanvas();
+    canvas->clear(SK_ColorWHITE);
     bench->preDraw(canvas);
+
+    if (timingThread) {
+        timingThread->pushStartSync();
+    }
     double start = now_ms();
     canvas = target->beginTiming(canvas);
     bench->draw(loops, canvas);
-    if (canvas) {
-        canvas->flush();
-    }
-    target->endTiming();
+    canvas->flush();
+    target->endTiming(timingThread ? true : false);
+
     double elapsed = now_ms() - start;
+    if (timingThread) {
+        timingThread->pushEndSync();
+        timingThread->setDone();
+    }
     bench->postDraw(canvas);
     return elapsed;
 }
+// TODO For now we don't use the background timing thread to tune loops
 static int setup_gpu_bench(GPUTarget* target, Benchmark* bench, int maxGpuFrameLag) {
     // First, figure out how many loops it'll take to get a frame up to FLAGS_gpuMs.
     int loops = bench->calculateLoops(FLAGS_loops);
@@ -310,7 +456,7 @@ static int setup_gpu_bench(GPUTarget* target, Benchmark* bench, int maxGpuFrameL
             // If the GPU lets frames lag at all, we need to make sure we're timing
             // _this_ round, not still timing last round.
             for (int i = 0; i < maxGpuFrameLag; i++) {
-                elapsed = time(loops, bench, target);
+                elapsed = time(loops, bench, target, nullptr);
             }
         } while (elapsed < FLAGS_gpuMs);
@@ -319,7 +465,7 @@ static int setup_gpu_bench(GPUTarget* target, Benchmark* bench, int maxGpuFrameL
         loops = clamp_loops(loops);
         // Make sure we're not still timing our calibration.
-        target->fence();
+        target->finish();
     } else {
         loops = detect_forever_loops(loops);
     }
@@ -327,7 +473,7 @@ static int setup_gpu_bench(GPUTarget* target, Benchmark* bench, int maxGpuFrameL
     // Pretty much the same deal as the calibration: do some warmup to make
     // sure we're timing steady-state pipelined frames.
     for (int i = 0; i < maxGpuFrameLag - 1; i++) {
-        time(loops, bench, target);
+        time(loops, bench, target, nullptr);
     }
     return loops;
@@ -351,13 +497,14 @@ struct AutoSetupContextBenchAndTarget {
     int getLoops() { return setup_gpu_bench(&fTarget, fBenchmark, fMaxFrameLag); }
-    double timeSample(int loops) {
+    double timeSample(int loops, TimingThread* timingThread) {
         for (int i = 0; i < fMaxFrameLag; i++) {
-            time(loops, fBenchmark, &fTarget);
+            time(loops, fBenchmark, &fTarget, timingThread);
         }
-        return time(loops, fBenchmark, &fTarget) / loops;
+        return time(loops, fBenchmark, &fTarget, timingThread) / loops;
     }
+
     void teardownBench() { fBenchmark->perCanvasPostDraw(fCanvas); }
     SkAutoTDelete<GrContextFactory> fCtxFactory;
@@ -381,9 +528,32 @@ int setup_loops(Benchmark* bench) {
     return loops;
 }
-double time_sample(Benchmark* bench, int loops) {
+struct Sample {
+    double fCpu;
+    double fGpu;
+};
+
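+// Returns the per-loop cpu time and, when FLAGS_useBackgroundThread is set, the minimum frame
+// time recorded by the background TimingThread.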
+Sample time_sample(Benchmark* bench, int loops) {
     AutoSetupContextBenchAndTarget ascbt(bench);
-    double sample = ascbt.timeSample(loops);
+
+    Sample sample;
+    if (FLAGS_useBackgroundThread) {
+        TimingThread timingThread(ascbt.fTarget.gl());
+        SkAutoTDelete<SkThread> nativeThread(new SkThread(TimingThread::Loop, &timingThread));
+        nativeThread->start();
+        sample.fCpu = ascbt.timeSample(loops, &timingThread);
+        nativeThread->join();
+
+        // Use the minimum recorded frame time as the gpu sample
+        double min = SK_ScalarMax;
+        for (int i = 0; i < timingThread.timings().count(); i++) {
+            min = SkTMin(min, timingThread.timings()[i]);
+        }
+        sample.fGpu = min;
+    } else {
+        sample.fCpu = ascbt.timeSample(loops, nullptr);
+    }
+
     ascbt.teardownBench();
     return sample;
@@ -393,6 +563,24 @@ double time_sample(Benchmark* bench, int loops) {
 static const int kOutResultSize = 1024;
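+// Prints one tab-separated result row, tagging the bench name with the given modifier
+// ("cpu" or "gpu").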
+void printResult(const SkTArray<double>& samples, int loops, const char* name, const char* mod) {
+    SkString newName(name);
+    newName.appendf("_%s", mod);
+    Stats stats(samples);
+    const double stddev_percent = 100 * sqrt(stats.var) / stats.mean;
+    SkDebugf("%d\t%s\t%s\t%s\t%s\t%.0f%%\t%s\t%s\t%s\n"
+            , loops
+            , HUMANIZE(stats.min)
+            , HUMANIZE(stats.median)
+            , HUMANIZE(stats.mean)
+            , HUMANIZE(stats.max)
+            , stddev_percent
+            , stats.plot.c_str()
+            , "gpu"
+            , newName.c_str()
+            );
+}
+
 int kilobench_main() {
     kilobench::BenchmarkStream benchStream;
@@ -407,60 +595,63 @@ int kilobench_main() {
     while (Benchmark* b = benchStream.next()) {
         SkAutoTDelete<Benchmark> bench(b);
-        int loops;
-        SkTArray<double> samples;
+        int loops = 1;
+        SkTArray<double> cpuSamples;
+        SkTArray<double> gpuSamples;
         for (int i = 0; i < FLAGS_samples + 1; i++) {
             // We fork off a new process to setup the grcontext and run the test while we wait
-            int childPid = fork();
-            if (childPid > 0) {
-                char result[kOutResultSize];
-                if (read(descriptors[0], result, kOutResultSize) < 0) {
-                    SkFAIL("Failed to read from pipe\n");
-                }
-
-                // if samples == 0 then parse # of loops
-                // else parse float
-                if (i == 0) {
-                    sscanf(result, "%d", &loops);
+            if (FLAGS_useMultiProcess) {
+                int childPid = fork();
+                if (childPid > 0) {
+                    char result[kOutResultSize];
+                    if (read(descriptors[0], result, kOutResultSize) < 0) {
+                        SkFAIL("Failed to read from pipe\n");
+                    }
+
+                    // if i == 0 then parse # of loops
+                    // else parse the cpu / gpu samples
+                    if (i == 0) {
+                        sscanf(result, "%d", &loops);
+                    } else {
+                        sscanf(result, "%lf %lf", &cpuSamples.push_back(),
+                               &gpuSamples.push_back());
+                    }
+
+                    // wait until exit
+                    int status;
+                    waitpid(childPid, &status, 0);
+                } else if (0 == childPid) {
+                    char result[kOutResultSize];
+                    if (i == 0) {
+                        sprintf(result, "%d", kilobench::setup_loops(bench));
+                    } else {
+                        kilobench::Sample sample = kilobench::time_sample(bench, loops);
+                        sprintf(result, "%lf %lf", sample.fCpu, sample.fGpu);
+                    }
+
+                    // Make sure to write the null terminator
+                    if (write(descriptors[1], result, strlen(result) + 1) < 0) {
+                        SkFAIL("Failed to write to pipe\n");
+                    }
+                    return 0;
                 } else {
-                    sscanf(result, "%lf", &samples.push_back());
+                    SkFAIL("Fork failed\n");
                 }
-
-                // wait until exit
-                int status;
-                waitpid(childPid, &status, 0);
-            } else if (0 == childPid) {
-                char result[kOutResultSize];
+            } else {
                 if (i == 0) {
-                    sprintf(result, "%d", kilobench::setup_loops(bench));
+                    loops = kilobench::setup_loops(bench);
                 } else {
-                    sprintf(result, "%lf", kilobench::time_sample(bench, loops));
+                    kilobench::Sample sample = kilobench::time_sample(bench, loops);
+                    cpuSamples.push_back(sample.fCpu);
+                    gpuSamples.push_back(sample.fGpu);
                 }
-
-                // Make sure to write the null terminator
-                if (write(descriptors[1], result, strlen(result) + 1) < 0) {
-                    SkFAIL("Failed to write to pipe\n");
-                }
-                return 0;
-            } else {
-                SkFAIL("Fork failed\n");
             }
         }
-        Stats stats(samples);
-        const double stddev_percent = 100 * sqrt(stats.var) / stats.mean;
-        SkDebugf("%d\t%s\t%s\t%s\t%s\t%.0f%%\t%s\t%s\t%s\n"
-                , loops
-                , HUMANIZE(stats.min)
-                , HUMANIZE(stats.median)
-                , HUMANIZE(stats.mean)
-                , HUMANIZE(stats.max)
-                , stddev_percent
-                , stats.plot.c_str()
-                , "gpu"
-                , bench->getUniqueName()
-                );
-
+        printResult(cpuSamples, loops, bench->getUniqueName(), "cpu");
+        if (FLAGS_useBackgroundThread) {
+            printResult(gpuSamples, loops, bench->getUniqueName(), "gpu");
+        }
     }
     return 0;
 }