Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1017)

Unified Diff: native_client_sdk/src/examples/demo/life_simd/life.cc

Issue 451883002: Update SIMD version of life demo. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Created 6 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: native_client_sdk/src/examples/demo/life_simd/life.cc
diff --git a/native_client_sdk/src/examples/demo/life_simd/life.cc b/native_client_sdk/src/examples/demo/life_simd/life.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bdd646a4aedc5857b5b8d7274c1645f1bfe83c6e
--- /dev/null
+++ b/native_client_sdk/src/examples/demo/life_simd/life.cc
@@ -0,0 +1,526 @@
+// Copyright 2014 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <assert.h>
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include <unistd.h>
+
+#include <ppapi/c/ppb_input_event.h>
+#include <ppapi/cpp/fullscreen.h>
+#include <ppapi/cpp/input_event.h>
+#include <ppapi/cpp/var.h>
+#include <ppapi/cpp/var_array.h>
+#include <ppapi/cpp/var_array_buffer.h>
+#include <ppapi/cpp/var_dictionary.h>
+
+#include "ppapi_simple/ps.h"
+#include "ppapi_simple/ps_context_2d.h"
+#include "ppapi_simple/ps_event.h"
+#include "ppapi_simple/ps_instance.h"
+#include "ppapi_simple/ps_interface.h"
+#include "ppapi_simple/ps_main.h"
+#include "sdk_util/macros.h"
+#include "sdk_util/thread_pool.h"
+
+using namespace sdk_util; // For sdk_util::ThreadPool
+
+namespace {
+
+#define INLINE inline __attribute__((always_inline))
+
+// BGRA helper macro, for constructing a pixel for a BGRA buffer.
+#define MakeBGRA(b, g, r, a) \
+ (((a) << 24) | ((r) << 16) | ((g) << 8) | (b))
+
+const int kFramesToBenchmark = 100;
+const int kCellAlignment = 0x10;
+
+// 128 bit vector types
+typedef uint8_t u8x16_t __attribute__ ((vector_size (16)));
+
+// Helper function to broadcast x across 16 element vector.
+INLINE u8x16_t broadcast(uint8_t x) {
+ u8x16_t r = {x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x};
+ return r;
+}
+
+// Convert a count value into a live (green) or dead color value.
+const uint32_t kNeighborColors[] = {
+ MakeBGRA(0x00, 0x00, 0x00, 0xFF),
+ MakeBGRA(0x00, 0x00, 0x00, 0xFF),
+ MakeBGRA(0x00, 0x00, 0x00, 0xFF),
+ MakeBGRA(0x00, 0x00, 0x00, 0xFF),
+ MakeBGRA(0x00, 0x00, 0x00, 0xFF),
+ MakeBGRA(0x00, 0xFF, 0x00, 0xFF),
+ MakeBGRA(0x00, 0xFF, 0x00, 0xFF),
+ MakeBGRA(0x00, 0xFF, 0x00, 0xFF),
+ MakeBGRA(0x00, 0x00, 0x00, 0xFF),
+ MakeBGRA(0x00, 0x00, 0x00, 0xFF),
+ MakeBGRA(0x00, 0x00, 0x00, 0xFF),
+ MakeBGRA(0x00, 0x00, 0x00, 0xFF),
+ MakeBGRA(0x00, 0x00, 0x00, 0xFF),
+ MakeBGRA(0x00, 0x00, 0x00, 0xFF),
+ MakeBGRA(0x00, 0x00, 0x00, 0xFF),
+ MakeBGRA(0x00, 0x00, 0x00, 0xFF),
+ MakeBGRA(0x00, 0x00, 0x00, 0xFF),
+ MakeBGRA(0x00, 0x00, 0x00, 0xFF),
+};
+
+// These represent the new health value of a cell based on its neighboring
+// values. The health is binary: either alive or dead.
+const uint8_t kIsAlive[] = {
+ 0, 0, 0, 0, 0, 1, 1, 1, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+// Timer helper for benchmarking. Returns seconds elapsed since program start,
+// as a double.
+timeval start_tv;
+int start_tv_retv = gettimeofday(&start_tv, NULL);
+
+inline double getseconds() {
+ const double usec_to_sec = 0.000001;
+ timeval tv;
+ if ((0 == start_tv_retv) && (0 == gettimeofday(&tv, NULL)))
+ return (tv.tv_sec - start_tv.tv_sec) + tv.tv_usec * usec_to_sec;
+ return 0.0;
+}
+} // namespace
+
+
+class Life {
+ public:
+ Life();
+ virtual ~Life();
+ // Runs a tick of the simulations, update 2D output.
+ void Update();
+ // Handle event from user, or message from JS.
+ void HandleEvent(PSEvent* ps_event);
+ private:
+ void UpdateContext();
+ void DrawCell(int32_t x, int32_t y);
+ void ProcessTouchEvent(pp::TouchInputEvent touches);
binji 2014/08/08 00:31:40 const pp::TouchInputEvent&
nfullagar 2014/08/08 20:51:48 Done.
+ void PostUpdateMessage(const char* message, double value);
+ void StartBenchmark();
+ void EndBenchmark();
+ void Stir();
+ void wSimulate(int y);
+ static void wSimulateEntry(int y, void* data);
+ void Simulate();
+
+ bool simd_;
+ bool multithread_;
+ bool benchmarking_;
+ int benchmark_frame_counter_;
+ double bench_start_time_;
+ double bench_end_time_;
+ uint8_t* cell_in_;
+ uint8_t* cell_out_;
+ int32_t cell_stride_;
+ int32_t width_;
+ int32_t height_;
+ PSContext2D_t* ps_context_;
+ ThreadPool* workers_;
+};
+
+Life::Life() :
+ simd_(true),
+ multithread_(true),
+ benchmarking_(false),
+ benchmark_frame_counter_(0),
+ bench_start_time_(0.0),
+ bench_end_time_(0.0),
+ cell_in_(NULL),
+ cell_out_(NULL),
+ cell_stride_(0),
+ width_(0),
+ height_(0) {
+ ps_context_ = PSContext2DAllocate(PP_IMAGEDATAFORMAT_BGRA_PREMUL);
+ // Query system for number of processors via sysconf()
+ int num_threads = sysconf(_SC_NPROCESSORS_ONLN);
+ if (num_threads < 2)
+ num_threads = 2;
+ workers_ = new ThreadPool(num_threads);
+ PSEventSetFilter(PSE_ALL);
+}
+
+Life::~Life() {
+ delete workers_;
+ PSContext2DFree(ps_context_);
+}
+
+void Life::UpdateContext() {
+ cell_stride_ = (ps_context_->width + kCellAlignment - 1) &
+ ~(kCellAlignment - 1);
binji 2014/08/08 00:31:40 whoops, bug in the previous version, hm?
+ size_t size = cell_stride_ * ps_context_->height;
+
+ if (ps_context_->width != width_ || ps_context_->height != height_) {
+ free(cell_in_);
+ free(cell_out_);
+
+ // Create a new context
+ void* in_buffer = NULL;
+ void* out_buffer = NULL;
+ // alloc buffers aligned on 16 bytes
+ posix_memalign(&in_buffer, kCellAlignment, size);
+ posix_memalign(&out_buffer, kCellAlignment, size);
+ cell_in_ = (uint8_t*) in_buffer;
+ cell_out_ = (uint8_t*) out_buffer;
+
+ memset(cell_out_, 0, size);
+ for (size_t index = 0; index < size; index++) {
+ cell_in_[index] = rand() & 1;
+ }
+ width_ = ps_context_->width;
+ height_ = ps_context_->height;
+ }
+}
+
+void Life::DrawCell(int32_t x, int32_t y) {
+ if (!cell_in_) return;
+ if (x > 0 && x < ps_context_->width - 1 &&
+ y > 0 && y < ps_context_->height - 1) {
+ cell_in_[x - 1 + y * cell_stride_] = 1;
+ cell_in_[x + 1 + y * cell_stride_] = 1;
+ cell_in_[x + (y - 1) * cell_stride_] = 1;
+ cell_in_[x + (y + 1) * cell_stride_] = 1;
+ }
+}
+
+void Life::ProcessTouchEvent(pp::TouchInputEvent touches) {
+ uint32_t count = touches.GetTouchCount(PP_TOUCHLIST_TYPE_TOUCHES);
+ uint32_t i, j;
+ for (i = 0; i < count; i++) {
+ pp::TouchPoint touch =
+ touches.GetTouchByIndex(PP_TOUCHLIST_TYPE_TOUCHES, i);
+ int radius = (int)(touch.radii().x());
+ int x = (int)(touch.position().x());
+ int y = (int)(touch.position().y());
+ // num = 1/100th the area of touch point
+ uint32_t num = (uint32_t)(M_PI * radius * radius / 100.0f);
+ for (j = 0; j < num; j++) {
+ int dx = rand() % (radius * 2) - radius;
+ int dy = rand() % (radius * 2) - radius;
+ // only plot random cells within the touch area
+ if (dx * dx + dy * dy <= radius * radius)
+ DrawCell(x + dx, y + dy);
+ }
+ }
+}
+
+void Life::PostUpdateMessage(const char* message_name, double value) {
+ pp::VarDictionary message;
+ message.Set("message", message_name);
+ message.Set("value", value);
+ PSInterfaceMessaging()->PostMessage(PSGetInstanceId(), message.pp_var());
+}
+
+void Life::StartBenchmark() {
+ printf("Running benchmark... (SIMD: %s, multi-threading: %s, size: %dx%d)\n",
+ simd_ ? "enabled" : "disabled",
+ multithread_ ? "enabled" : "disabled",
+ ps_context_->width,
+ ps_context_->height);
+ benchmarking_ = true;
+ bench_start_time_ = getseconds();
+ benchmark_frame_counter_ = kFramesToBenchmark;
+}
+
+void Life::EndBenchmark() {
+ double total_time;
+ bench_end_time_ = getseconds();
+ benchmarking_ = false;
+ total_time = bench_end_time_ - bench_start_time_;
+ printf("Finished - benchmark took %f seconds\n", total_time);
+ // Send benchmark result to JS.
+ PostUpdateMessage("benchmark_result", total_time);
+}
+
+void Life::HandleEvent(PSEvent* ps_event) {
+ // Give the 2D context a chance to process the event.
+ if (0 != PSContext2DHandleEvent(ps_context_, ps_event)) {
+ UpdateContext();
+ return;
+ }
+
+ switch(ps_event->type) {
+
+ case PSE_INSTANCE_HANDLEINPUT: {
+ pp::InputEvent event(ps_event->as_resource);
+
+ switch(event.GetType()) {
+ case PP_INPUTEVENT_TYPE_MOUSEDOWN:
+ case PP_INPUTEVENT_TYPE_MOUSEMOVE: {
+ pp::MouseInputEvent mouse = pp::MouseInputEvent(event);
+ // If the button is down, draw
+ if (mouse.GetModifiers() & PP_INPUTEVENT_MODIFIER_LEFTBUTTONDOWN) {
+ PP_Point location = mouse.GetPosition();
+ DrawCell(location.x, location.y);
+ }
+ break;
+ }
+
+ case PP_INPUTEVENT_TYPE_TOUCHSTART:
+ case PP_INPUTEVENT_TYPE_TOUCHMOVE: {
+ pp::TouchInputEvent touches = pp::TouchInputEvent(event);
+ ProcessTouchEvent(touches);
+ break;
+ }
+
+ case PP_INPUTEVENT_TYPE_KEYDOWN: {
+ pp::Fullscreen fullscreen(PSInstance::GetInstance());
+ bool isFullscreen = fullscreen.IsFullscreen();
+ fullscreen.SetFullscreen(!isFullscreen);
+ break;
+ }
+
+ default:
+ break;
+ }
+ break; // case PSE_INSTANCE_HANDLEINPUT
+ }
+
+ case PSE_INSTANCE_HANDLEMESSAGE: {
+ // Convert Pepper Simple message to PPAPI C++ vars
+ pp::Var var(ps_event->as_var);
+ if (var.is_dictionary()) {
+ pp::VarDictionary dictionary(var);
+ std::string message = dictionary.Get("message").AsString();
+ if (message == "run_benchmark" && !benchmarking_) {
+ StartBenchmark();
+ } else if (message == "set_simd") {
+ std::string value = dictionary.Get("value").AsString();
+ simd_ = value == "enable";
+ } else if (message == "set_threading") {
+ std::string value = dictionary.Get("value").AsString();
+ multithread_ = value == "enable";
+ }
+ }
+ break; // case PSE_INSTANCE_HANDLEMESSAGE
+ }
+
+ default:
+ break;
+ }
+}
+
+void Life::Stir() {
+ int32_t width = ps_context_->width;
+ int32_t height = ps_context_->height;
+ int32_t stride = cell_stride_;
+ int32_t i;
+ if (cell_in_ == NULL || cell_out_ == NULL)
+ return;
+
+ for (i = 0; i < width; ++i) {
+ cell_in_[i] = rand() & 1;
+ cell_in_[i + (height - 1) * stride] = rand() & 1;
+ }
+ for (i = 0; i < height; ++i) {
+ cell_in_[i * stride] = rand() & 1;
+ cell_in_[i * stride + (width - 1)] = rand() & 1;
+ }
+}
+
+void Life::wSimulate(int y) {
+ // Don't run simulation on top and bottom borders
+ if (y < 1 || y >= ps_context_->height - 1)
+ return;
+
+ // Do neighbor summation; apply rules, output pixel color. Note that a 1 cell
+ // wide perimeter is excluded from the simulation update; only cells from
+ // x = 1 to x < width - 1 and y = 1 to y < height - 1 are updated.
+ uint8_t *src0 = (cell_in_ + (y - 1) * cell_stride_);
+ uint8_t *src1 = src0 + cell_stride_;
+ uint8_t *src2 = src1 + cell_stride_;
+ uint8_t *dst = (cell_out_ + y * cell_stride_) + 1;
+ uint32_t *pixels = static_cast<uint32_t *>(ps_context_->data);
+ uint32_t *pixel_line = static_cast<uint32_t*>
binji 2014/08/08 00:31:40 cast isn't needed, right?
+ (pixels + y * ps_context_->stride / sizeof(uint32_t));
+ int32_t x = 1;
+
+ if (simd_) {
+ const u8x16_t kOne = broadcast(1);
+ const u8x16_t kFour = broadcast(4);
+ const u8x16_t kEight = broadcast(8);
+ const u8x16_t kZero255 = {0, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+ // Prime the src
+ u8x16_t src00 = *reinterpret_cast<u8x16_t*>(&src0[0]);
+ u8x16_t src01 = *reinterpret_cast<u8x16_t*>(&src0[16]);
+ u8x16_t src10 = *reinterpret_cast<u8x16_t*>(&src1[0]);
+ u8x16_t src11 = *reinterpret_cast<u8x16_t*>(&src1[16]);
+ u8x16_t src20 = *reinterpret_cast<u8x16_t*>(&src2[0]);
+ u8x16_t src21 = *reinterpret_cast<u8x16_t*>(&src2[16]);
+
+ // This inner loop is SIMD - each loop iteration will process 16 cells.
+ for (; (x + 15) < (ps_context_->width - 1); x += 16) {
+
+ // Construct jittered source temps, using __builtin_shufflevector(..) to
+ // extract a shifted 16 element vector from the 32 element concatenation
+ // of two source vectors.
+ u8x16_t src0j0 = src00;
+ u8x16_t src0j1 = __builtin_shufflevector(src00, src01,
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ u8x16_t src0j2 = __builtin_shufflevector(src00, src01,
+ 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17);
+ u8x16_t src1j0 = src10;
+ u8x16_t src1j1 = __builtin_shufflevector(src10, src11,
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ u8x16_t src1j2 = __builtin_shufflevector(src10, src11,
+ 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17);
+ u8x16_t src2j0 = src20;
+ u8x16_t src2j1 = __builtin_shufflevector(src20, src21,
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+ u8x16_t src2j2 = __builtin_shufflevector(src20, src21,
+ 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17);
+
+ // Sum the jittered sources to construct neighbor count.
+ u8x16_t count = src0j0 + src0j1 + src0j2 +
+ src1j0 + + src1j2 +
+ src2j0 + src2j1 + src2j2;
+ // Add the center cell.
+ count = count + count + src1j1;
+ // If count > 4 and < 8, center cell will be alive in the next frame.
+ u8x16_t alive1 = count > kFour;
+ u8x16_t alive2 = count < kEight;
+ // Intersect the two comparisons from above.
+ u8x16_t alive = alive1 & alive2;
+
+ // At this point, alive[x] will be one of two values:
+ // 0x00 for a dead cell
+ // 0xFF for an alive cell.
+ //
+ // Next, convert alive cells to green pixel color.
+ // Use __builtin_shufflevector(..) to construct output pixels from
+ // concantination of alive vector and kZero255 const vector.
+ // Indices 0..15 select the 16 cells from alive vector.
+ // Index 16 is zero constant from kZero255 constant vector.
+ // Index 17 is 255 constant from kZero255 constant vector.
+ // Output pixel color values are in BGRABGRABGRABGRA order.
+ // Since each pixel needs 4 bytes of color information, 16 cells will
+ // need to expand to 4 seperate 16 byte pixel splats.
+ u8x16_t pixel0_3 = __builtin_shufflevector(alive, kZero255,
+ 16, 0, 16, 17, 16, 1, 16, 17, 16, 2, 16, 17, 16, 3, 16, 17);
+ u8x16_t pixel4_7 = __builtin_shufflevector(alive, kZero255,
+ 16, 4, 16, 17, 16, 5, 16, 17, 16, 6, 16, 17, 16, 7, 16, 17);
+ u8x16_t pixel8_11 = __builtin_shufflevector(alive, kZero255,
+ 16, 8, 16, 17, 16, 9, 16, 17, 16, 10, 16, 17, 16, 11, 16, 17);
+ u8x16_t pixel12_15 = __builtin_shufflevector(alive, kZero255,
+ 16, 12, 16, 17, 16, 13, 16, 17, 16, 14, 16, 17, 16, 15, 16, 17);
+
+ // Write 16 pixels to output pixel buffer.
+ *reinterpret_cast<u8x16_t*>(pixel_line + 0) = pixel0_3;
+ *reinterpret_cast<u8x16_t*>(pixel_line + 4) = pixel4_7;
+ *reinterpret_cast<u8x16_t*>(pixel_line + 8) = pixel8_11;
+ *reinterpret_cast<u8x16_t*>(pixel_line + 12) = pixel12_15;
+
+ // Convert alive mask to 1 or 0 and store in destination cell array.
+ *reinterpret_cast<u8x16_t*>(dst) = alive & kOne;
+
+ // Increment pointers.
+ pixel_line += 16;
+ dst += 16;
+ src0 += 16;
+ src1 += 16;
+ src2 += 16;
+
+ // Shift source over by 16 cells and read the next 16 cells.
+ src00 = src01;
+ src01 = *reinterpret_cast<u8x16_t*>(&src0[16]);
+ src10 = src11;
+ src11 = *reinterpret_cast<u8x16_t*>(&src1[16]);
+ src20 = src21;
+ src21 = *reinterpret_cast<u8x16_t*>(&src2[16]);
+ }
+ }
+
+ // The SIMD loop above does 16 cells at a time. The loop below is the
+ // regular version which processes one cell at a time. It is used to
+ // finish the remainder of the scanline not handled by the SIMD loop.
+ for (; x < (ps_context_->width - 1); ++x) {
+ // Sum the jittered sources to construct neighbor count.
+ int count = src0[0] + src0[1] + src0[2] +
+ src1[0] + + src1[2] +
+ src2[0] + src2[1] + src2[2];
+ // Add the center cell.
+ count = count + count + src1[1];
+ // Use table lookup indexed by count to determine pixel & alive state.
+ uint32_t color = kNeighborColors[count];
+ *pixel_line++ = color;
+ *dst++ = kIsAlive[count];
+ ++src0;
+ ++src1;
+ ++src2;
+ }
+}
+
+// Static entry point for worker thread.
+void Life::wSimulateEntry(int slice, void* thiz) {
+ static_cast<Life*>(thiz)->wSimulate(slice);
+}
+
+void Life::Simulate() {
+ // Stir up the edges to prevent the simulation from reaching steady state.
+ Stir();
+
+ if (multithread_) {
+ // If multi-threading enabled, dispatch tasks to pool of worker threads.
+ workers_->Dispatch(ps_context_->height, wSimulateEntry, this);
+ } else {
+ // Else manually simulate each line on this thread.
+ for (int y = 0; y < ps_context_->height; y++) {
+ wSimulateEntry(y, this);
+ }
+ }
+ std::swap(cell_in_, cell_out_);
+}
+
+void Life::Update() {
+
+ PSContext2DGetBuffer(ps_context_);
+ if (NULL == ps_context_->data)
+ return;
+
+ // If we somehow have not allocated these pointers yet, skip this frame.
+ if (!cell_in_ || !cell_out_) return;
+
+ // Simulate one (or more if benchmarking) frames
+ do {
+ Simulate();
+ if (!benchmarking_)
+ break;
+ --benchmark_frame_counter_;
+ } while(benchmark_frame_counter_ > 0);
+ if (benchmarking_)
+ EndBenchmark();
+
+ PSContext2DSwapBuffer(ps_context_);
+}
+
+// Starting point for the module. We do not use main since it would
+// collide with main in libppapi_cpp.
+int example_main(int argc, char* argv[]) {
+ Life life;
+ while (true) {
+ PSEvent* ps_event;
+ // Consume all available events
+ while ((ps_event = PSEventTryAcquire()) != NULL) {
+ life.HandleEvent(ps_event);
+ PSEventRelease(ps_event);
+ }
+ // Do simulation, render and present.
+ life.Update();
+ }
+ return 0;
+}
+
+// Register the function to call once the Instance Object is initialized.
+// see: pappi_simple/ps_main.h
+PPAPI_SIMPLE_REGISTER_MAIN(example_main);

Powered by Google App Engine
This is Rietveld 408576698