Index: content/child/child_thread_impl_perftest.cc |
diff --git a/content/child/child_thread_impl_perftest.cc b/content/child/child_thread_impl_perftest.cc |
new file mode 100644 |
index 0000000000000000000000000000000000000000..abe88032fa5b8c51b9e83cf72de94ffdad5a0aef |
--- /dev/null |
+++ b/content/child/child_thread_impl_perftest.cc |
@@ -0,0 +1,222 @@ |
+// Copyright 2015 The Chromium Authors. All rights reserved. |
+// Use of this source code is governed by a BSD-style license that can be |
+// found in the LICENSE file. |
+ |
+// The idea is to benchmark how the hardware, on different usages of |
+// GpuMemoryBuffer, performs when the native buffer object (bo) is mapped into |
+// the CPU. In particular this test aims to capture the effects of data |
+// coherency and answer the following: |
+// |
+// - measure memory mapping performance of GpuMemoryBuffer using shared memory |
+// (fallback case) and also native implementation of it. |
+// - what if the Renderer process (client) just writes into the buffer object? |
+//  - what's the effect of reading from write-combining (WC) memory? Can we |
+//    avoid read backs? |
+//  - should it be UC and/or WC mapped, to get faster access? |
+//  - what's the effect of clients doing sequential versus non-sequential |
+//    writes? If the latter, a WC mapping may end up being very slow. |
+ |
+#include "base/bind.h" |
+#include "base/command_line.h" |
+#include "base/memory/scoped_vector.h" |
+#include "base/time/time.h" |
+#include "content/child/child_gpu_memory_buffer_manager.h" |
+#include "content/child/child_thread_impl.h" |
+#include "content/common/gpu/client/gpu_memory_buffer_impl.h" |
+#include "content/public/common/content_switches.h" |
+#include "content/public/test/content_browser_test.h" |
+#include "content/public/test/content_browser_test_utils.h" |
+#include "content/shell/browser/shell.h" |
+#include "testing/perf/perf_test.h" |
+#include "url/gurl.h" |
+ |
+namespace content { |
+namespace { |
+ |
+// Buffer manager used by BufferPerfTest::Allocate(). It is assigned in |
+// ChildThreadImplBrowserTest::SetUpOnChildThread() and is null until then. |
+ChildGpuMemoryBufferManager* child_gpu_memory_buffer_manager_ = nullptr; |
dshwang
2015/06/25 10:52:11
nullptr.
why don't BufferPerfTest have this as mem
|
+ |
+// Number of Map()/Unmap() iterations each test performs. Internal linkage is |
+// already provided by the enclosing anonymous namespace. |
+const int kNumRuns = 30; |
dshwang
2015/06/25 10:52:11
'static' is not needed because of anonymous namesp
|
+ |
+// Test parameter: selects whether the test adds the |
+// switches::kEnableNativeGpuMemoryBuffers command-line switch. |
+enum NativeBufferFlag { |
+  kDisableNativeBuffers, |
+  kEnableNativeBuffers |
+}; |
+ |
+// Returns the string passed as the modifier argument of |
+// perf_test::PrintResult() for |flag|: "_native" when native buffers are |
+// enabled, the empty string otherwise. |
+std::string NativeBufferFlagName(NativeBufferFlag flag) { |
+  if (flag == kDisableNativeBuffers) |
+    return ""; |
+  if (flag == kEnableNativeBuffers) |
+    return "_native"; |
+ |
+  // Every enumerator is handled above. |
+  NOTREACHED(); |
+  return ""; |
+} |
+ |
+// Parameter of the currently running test. It is written in |
+// ChildThreadImplGpuMemoryBufferPerfTest::SetUpCommandLine() and read by |
+// BufferPerfTest::Map() to label the printed result. |
+NativeBufferFlag native_buffer_flag_ = kDisableNativeBuffers; |
dshwang
2015/06/25 10:52:11
why don't ChildThreadImplGpuMemoryBufferPerfTest h
|
+ |
+// NOTE(review): nothing else in this file references this enum; confirm |
+// whether it is still needed. |
+enum MemoryOperation { |
+  kMemoryOperationWrite, |
+  kMemoryOperationNoop |
+}; |
dshwang
2015/06/25 10:52:11
unused
|
+ |
+// Owns one GpuMemoryBuffer obtained from child_gpu_memory_buffer_manager_ and |
+// provides the allocate / map / write / unmap steps used by the tests below. |
+// The buffer is 4x4, BGRA_8888, allocated with gfx::GpuMemoryBuffer::MAP. |
+class BufferPerfTest { |
+ public: |
+  BufferPerfTest() |
+      : gpu_memory_buffer_(nullptr), |
+        num_planes_(0), |
+        format_(gfx::GpuMemoryBuffer::BGRA_8888), |
+        buffer_size_(4, 4) {} |
+ |
+  // Allocates the buffer and caches the number of planes for its format. The |
+  // allocation check is a fatal gtest assertion, so on failure this returns |
+  // early and leaves the plane count at 0. |
+  void Allocate() { |
+    gpu_memory_buffer_ = |
+        child_gpu_memory_buffer_manager_->AllocateGpuMemoryBuffer( |
+            buffer_size_, format_, gfx::GpuMemoryBuffer::MAP); |
+    ASSERT_TRUE(gpu_memory_buffer_); |
+ |
+    EXPECT_EQ(format_, gpu_memory_buffer_->GetFormat()); |
+ |
+    num_planes_ = |
+        GpuMemoryBufferImpl::NumberOfPlanesForGpuMemoryBufferFormat(format_); |
+  } |
+ |
+  // Maps the buffer into the array held by |planes| and prints how long the |
+  // Map() call alone took, in microseconds. |operation_name| is forwarded as |
+  // the third argument of perf_test::PrintResult(). |
+  void Map(const scoped_ptr<void* []>& planes, |
+           const std::string& operation_name) { |
+    const std::string flag_name = NativeBufferFlagName(native_buffer_flag_); |
+ |
+    // Only the Map() call itself sits between the two timestamps. |
+    const base::TimeTicks start = base::TimeTicks::Now(); |
+    const bool rv = gpu_memory_buffer_->Map(planes.get()); |
+    const base::TimeTicks end = base::TimeTicks::Now(); |
+    ASSERT_TRUE(rv); |
+    EXPECT_TRUE(gpu_memory_buffer_->IsMapped()); |
+ |
+    // TODO(vignatti): compute the mean time and print to stdout only once. For |
+    // now, printing individual runs is useful because, for example, VGEM has |
+    // much worse performance on its first runs (need to check why). |
+    perf_test::PrintResult( |
dshwang
2015/06/25 10:52:11
This test measures only Map time even in "time_to_
 |
+        "time_to_execute_map", |
+        flag_name, |
+        operation_name, |
+        static_cast<size_t>((end - start).InMicroseconds()), |
+        "us", true); |
+  } |
+ |
+  // Unmaps the buffer and checks that it no longer reports being mapped. |
+  void Unmap() { |
+    gpu_memory_buffer_->Unmap(); |
+    EXPECT_FALSE(gpu_memory_buffer_->IsMapped()); |
+  } |
+ |
+  // Fills every row of every plane with a per-plane byte value |
+  // (0x2a + plane index). |planes| supplies the base pointer of each plane. |
+  void Write(const scoped_ptr<void* []>& planes) { |
+    // Get stride. |
+    scoped_ptr<int[]> strides(new int[num_planes_]); |
+    gpu_memory_buffer_->GetStride(strides.get()); |
+ |
+    for (size_t plane = 0; plane < num_planes_; ++plane) { |
+      size_t row_size_in_bytes = 0; |
+      EXPECT_TRUE(GpuMemoryBufferImpl::RowSizeInBytes( |
+          buffer_size_.width(), format_, plane, &row_size_in_bytes)); |
+ |
+      // One row's worth of the pattern, reused for every row of this plane. |
+      scoped_ptr<char[]> data(new char[row_size_in_bytes]); |
+      memset(data.get(), 0x2a + plane, row_size_in_bytes); |
+ |
+      const size_t height = |
+          buffer_size_.height() / |
+          GpuMemoryBufferImpl::SubsamplingFactor(format_, plane); |
+      for (size_t y = 0; y < height; ++y) { |
+        // Copy |data| to row |y| of |plane| and verify result. |
+        char* row = static_cast<char*>(planes[plane]) + y * strides[plane]; |
+        memcpy(row, data.get(), row_size_in_bytes); |
+        // NOTE(review): the read-back check is compiled only when NDEBUG is |
+        // defined; confirm that this is the intended condition. |
+#if defined(NDEBUG) |
+        EXPECT_EQ(memcmp(row, data.get(), row_size_in_bytes), 0); |
+#endif |
+      } |
+    } |
+  } |
+ |
+  // Number of planes cached by Allocate(); 0 before a successful Allocate(). |
+  size_t GetNumPlanes() const { return num_planes_; } |
+ |
+ private: |
+  scoped_ptr<gfx::GpuMemoryBuffer> gpu_memory_buffer_; |
+  size_t num_planes_; |
+  gfx::GpuMemoryBuffer::Format format_; |
+  gfx::Size buffer_size_; |
+}; |
+ |
+// Browser-test fixture that adds switches::kSingleProcess to the command line |
+// and, during main-thread setup, stores the child thread's |
+// GpuMemoryBufferManager in child_gpu_memory_buffer_manager_. |
+class ChildThreadImplBrowserTest : public ContentBrowserTest { |
+ public: |
+  ChildThreadImplBrowserTest() {} |
+ |
+  // Overridden from BrowserTestBase: |
+  void SetUpCommandLine(base::CommandLine* command_line) override { |
+    command_line->AppendSwitch(switches::kSingleProcess); |
+  } |
+  void SetUpOnMainThread() override { |
+    NavigateToURL(shell(), GURL(url::kAboutBlankURL)); |
+    PostTaskToInProcessRendererAndWait( |
+        base::Bind(&ChildThreadImplBrowserTest::SetUpOnChildThread, this)); |
+  } |
+ |
+ private: |
+  // Posted from SetUpOnMainThread() via PostTaskToInProcessRendererAndWait(); |
+  // captures the buffer manager of the current ChildThreadImpl. |
+  void SetUpOnChildThread() { |
+    child_gpu_memory_buffer_manager_ = |
+        ChildThreadImpl::current()->gpu_memory_buffer_manager(); |
+  } |
+ |
+  DISALLOW_COPY_AND_ASSIGN(ChildThreadImplBrowserTest); |
+}; |
+ |
+// Fixture parameterized on NativeBufferFlag. The parameter is recorded in |
+// native_buffer_flag_ and decides whether the |
+// switches::kEnableNativeGpuMemoryBuffers switch is appended. |
+class ChildThreadImplGpuMemoryBufferPerfTest |
+    : public ChildThreadImplBrowserTest, |
+      public testing::WithParamInterface< |
+          ::testing::tuple<NativeBufferFlag>> { |
+ public: |
+  ChildThreadImplGpuMemoryBufferPerfTest() {} |
+ |
+  // Overridden from BrowserTestBase: |
+  void SetUpCommandLine(base::CommandLine* command_line) override { |
+    ChildThreadImplBrowserTest::SetUpCommandLine(command_line); |
+ |
+    native_buffer_flag_ = ::testing::get<0>(GetParam()); |
+    // kDisableNativeBuffers adds nothing to the command line. |
+    if (native_buffer_flag_ == kEnableNativeBuffers) |
+      command_line->AppendSwitch(switches::kEnableNativeGpuMemoryBuffers); |
+  } |
+ |
+ protected: |
+  // Created by each test body. |
+  scoped_ptr<BufferPerfTest> buffer_; |
+ |
+ private: |
+  DISALLOW_COPY_AND_ASSIGN(ChildThreadImplGpuMemoryBufferPerfTest); |
+}; |
+ |
+// Maps, writes to and unmaps the buffer kNumRuns times. The printed result of |
+// each run is the time taken by the Map() call. |
+IN_PROC_BROWSER_TEST_P(ChildThreadImplGpuMemoryBufferPerfTest, |
+                       Write) { |
+  buffer_.reset(new BufferPerfTest()); |
+  // Allocate() and Map() use fatal assertions, which only return from the |
+  // helper itself. Stop the test here instead of continuing with a buffer |
+  // that failed to allocate or map. |
+  ASSERT_NO_FATAL_FAILURE(buffer_->Allocate()); |
+ |
+  scoped_ptr<void* []> planes(new void* [buffer_->GetNumPlanes()]); |
+ |
+  for (int i = 0; i < kNumRuns; ++i) { |
+    ASSERT_NO_FATAL_FAILURE(buffer_->Map(planes, "Write")); |
+    buffer_->Write(planes); |
dshwang
2015/06/25 10:52:11
I think Map and Unmap should be out of 'for block'
 |
+    buffer_->Unmap(); |
+  } |
+} |
+ |
+// Maps and unmaps the buffer kNumRuns times without touching its contents. |
+IN_PROC_BROWSER_TEST_P(ChildThreadImplGpuMemoryBufferPerfTest, |
+                       Map) { |
+  buffer_.reset(new BufferPerfTest()); |
+  // Allocate() and Map() use fatal assertions, which only return from the |
+  // helper itself. Stop the test here instead of continuing with a buffer |
+  // that failed to allocate or map. |
+  ASSERT_NO_FATAL_FAILURE(buffer_->Allocate()); |
+ |
+  scoped_ptr<void* []> planes(new void* [buffer_->GetNumPlanes()]); |
+ |
+  for (int i = 0; i < kNumRuns; ++i) { |
+    ASSERT_NO_FATAL_FAILURE(buffer_->Map(planes, "Map")); |
+    buffer_->Unmap(); |
+  } |
+} |
+ |
+// Instantiates the tests above once for each NativeBufferFlag value. |
+INSTANTIATE_TEST_CASE_P( |
+    ChildThreadImplGpuMemoryBufferPerfTests, |
+    ChildThreadImplGpuMemoryBufferPerfTest, |
+    ::testing::Values(kDisableNativeBuffers, kEnableNativeBuffers)); |
+} // namespace |
+} // namespace content |