Index: native_client_sdk/src/examples/demo/earth_simd/earth.cc |
diff --git a/native_client_sdk/src/examples/demo/earth/earth.cc b/native_client_sdk/src/examples/demo/earth_simd/earth.cc |
similarity index 82% |
copy from native_client_sdk/src/examples/demo/earth/earth.cc |
copy to native_client_sdk/src/examples/demo/earth_simd/earth.cc |
index 420448bb1f476dec109ad7c856286b055892feed..0aff7c0f40dfc3247e9ac6c04edc939148a1b633 100644 |
--- a/native_client_sdk/src/examples/demo/earth/earth.cc |
+++ b/native_client_sdk/src/examples/demo/earth_simd/earth.cc |
@@ -30,13 +30,20 @@ |
using namespace sdk_util; // For sdk_util::ThreadPool |
+#define INLINE inline __attribute__((always_inline)) |
+ |
+// 128 bit SIMD vector types |
+typedef uint8_t u8x16_t __attribute__ ((vector_size (16))); |
+typedef int32_t i32x4_t __attribute__ ((vector_size (16))); |
+typedef uint32_t u32x4_t __attribute__ ((vector_size (16))); |
+typedef float f32x4_t __attribute__ ((vector_size (16))); |
+ |
// Global properties used to setup Earth demo. |
namespace { |
const float kPI = M_PI; |
const float kTwoPI = kPI * 2.0f; |
const float kOneOverPI = 1.0f / kPI; |
const float kOneOver2PI = 1.0f / kTwoPI; |
-const float kOneOver255 = 1.0f / 255.0f; |
const int kArcCosineTableSize = 4096; |
const int kFramesToBenchmark = 100; |
const float kZoomMin = 1.0f; |
@@ -58,21 +65,71 @@ inline double getseconds() { |
return 0.0; |
} |
-// RGBA helper functions, used for extracting color from RGBA source image. |
-inline float ExtractR(uint32_t c) { |
- return static_cast<float>(c & 0xFF) * kOneOver255; |
-} |
- |
-inline float ExtractG(uint32_t c) { |
- return static_cast<float>((c & 0xFF00) >> 8) * kOneOver255; |
-} |
- |
-inline float ExtractB(uint32_t c) { |
- return static_cast<float>((c & 0xFF0000) >> 16) * kOneOver255; |
+// SIMD Vector helper functions. |
+// |
+// Note that a compare between two vectors will return a signed integer vector |
+// with the same number of elements, where each element will be all bits set |
+// for true (-1), and all bits clear for false (0). This integer vector can be |
+// useful as a mask. |
+// |
+// Also note that C-style casts do not mutate the bits of a vector - only the |
+// type. Bitwise operators can't operate on float vectors, but it is possible |
+// to cast them temporarily to an integer vector, perform the mask, and cast |
+// them back to float. |
+// |
+// To convert a float vector to an integer vector using truncation, or to |
+// convert an integer vector to a float vector, use __builtin_convertvector(). |
+ |
+INLINE f32x4_t min(f32x4_t a, f32x4_t b) { |
+ i32x4_t m = a < b; |
+ return (f32x4_t)(((i32x4_t)a & m) | ((i32x4_t)b & ~m)); |
+} |
+ |
+INLINE f32x4_t max(f32x4_t a, f32x4_t b) { |
+ i32x4_t m = a > b; |
+ return (f32x4_t)(((i32x4_t)a & m) | ((i32x4_t)b & ~m)); |
+} |
+ |
+INLINE float dot3(f32x4_t a, f32x4_t b) { |
+ f32x4_t c = a * b; |
+ return c[0] + c[1] + c[2]; |
+} |
+ |
+INLINE f32x4_t broadcast(float x) { |
+ f32x4_t r = {x, x, x, x}; |
+ return r; |
+} |
+ |
+// SIMD RGBA helper functions, used for extracting color from RGBA source image. |
+INLINE f32x4_t ExtractRGBA(uint32_t c) { |
+ const f32x4_t kOneOver255 = broadcast(1.0f / 255.0f); |
+ const i32x4_t kZero = {0, 0, 0, 0}; |
+ i32x4_t v = {c, c, c, c}; |
+ // zero extend packed color into 32x4 integer vector |
+ v = (i32x4_t)__builtin_shufflevector((u8x16_t)v, (u8x16_t)kZero, |
+ 0, 16, 16, 16, 1, 16, 16, 16, 2, 16, 16, 16, 3, 16, 16, 16); |
+ // convert color values to float, range 0..1 |
+ f32x4_t f = __builtin_convertvector(v, f32x4_t) * kOneOver255; |
+ return f; |
+} |
+ |
+// SIMD BGRA helper function, for constructing a pixel for a BGRA buffer. |
+INLINE uint32_t PackBGRA(f32x4_t f) { |
+ const f32x4_t kZero = broadcast(0.0f); |
+ const f32x4_t kHalf = broadcast(0.5f); |
+ const f32x4_t k255 = broadcast(255.0f); |
+ f = max(f, kZero); |
+ // Add 0.5 to perform rounding instead of truncation. |
+ f = f * k255 + kHalf; |
+ f = min(f, k255); |
+ i32x4_t i = __builtin_convertvector(f, i32x4_t); |
+ u32x4_t p = (u32x4_t)__builtin_shufflevector((u8x16_t)i, (u8x16_t)i, |
+ 8, 4, 0, 12, 8, 4, 0, 12, 8, 4, 0, 12, 8, 4, 0, 12); |
+ return p[0]; |
} |
// BGRA helper function, for constructing a pixel for a BGRA buffer. |
-inline uint32_t MakeBGRA(uint32_t b, uint32_t g, uint32_t r, uint32_t a) { |
+INLINE uint32_t MakeBGRA(uint32_t b, uint32_t g, uint32_t r, uint32_t a) { |
return (((a) << 24) | ((r) << 16) | ((g) << 8) | (b)); |
} |
@@ -113,7 +170,7 @@ ArcCosine::ArcCosine() { |
// looks up acos(f) using a table and lerping between entries |
// (it is expected that input f is between -1 and 1) |
-float ArcCosine::TableLerp(float f) { |
+INLINE float ArcCosine::TableLerp(float f) { |
float x = (f + 1.0f) * 0.5f; |
x = x * kArcCosineTableSize; |
int ix = static_cast<int>(x); |
@@ -134,25 +191,25 @@ union Convert { |
float AsFloat() { return f; } |
}; |
-inline const int AsInteger(const float f) { |
+INLINE const int AsInteger(const float f) { |
Convert u(f); |
return u.AsInt(); |
} |
-inline const float AsFloat(const int i) { |
+INLINE const float AsFloat(const int i) { |
Convert u(i); |
return u.AsFloat(); |
} |
const long int kOneAsInteger = AsInteger(1.0f); |
-inline float inline_quick_sqrt(float x) { |
+INLINE float inline_quick_sqrt(float x) { |
int i; |
i = (AsInteger(x) >> 1) + (kOneAsInteger >> 1); |
return AsFloat(i); |
} |
-inline float inline_sqrt(float x) { |
+INLINE float inline_sqrt(float x) { |
float y; |
y = inline_quick_sqrt(x); |
y = (y * y + x) / (2.0f * y); |
@@ -160,15 +217,6 @@ inline float inline_sqrt(float x) { |
return y; |
} |
-// takes a -0..1+ color, clamps it to 0..1 and maps it to 0..255 integer |
-inline uint32_t Clamp255(float x) { |
- if (x < 0.0f) { |
- x = 0.0f; |
- } else if (x > 1.0f) { |
- x = 1.0f; |
- } |
- return static_cast<uint32_t>(x * 255.0f); |
-} |
} // namespace |
@@ -376,13 +424,30 @@ inline uint32_t* Planet::wGetAddr(int x, int y) { |
return ps_context_->data + x + y * ps_context_->stride / sizeof(uint32_t); |
} |
-// This is the meat of the ray tracer. Given a pixel span (x0, x1) on |
+// This is the inner loop of the ray tracer. Given a pixel span (x0, x1) on |
// scanline y, shoot rays into the scene and render what they hit. Use |
-// scanline coherence to do a few optimizations |
+// scanline coherence to do a few optimizations. |
+// This version uses portable SIMD 4 element single precision floating point |
+// vectors to perform many of the calculations, and builds only on PNaCl. |
void Planet::wRenderPixelSpan(int x0, int x1, int y) { |
if (!base_tex_ || !night_tex_) |
return; |
- const int kColorBlack = MakeBGRA(0, 0, 0, 0xFF); |
+ const uint32_t kColorBlack = MakeBGRA(0, 0, 0, 0xFF); |
+ const uint32_t kSolidAlpha = MakeBGRA(0, 0, 0, 0xFF); |
+ const f32x4_t kOne = {1.0f, 1.0f, 1.0f, 1.0f}; |
+ const f32x4_t diffuse = {diffuse_r_, diffuse_g_, diffuse_b_, 0.0f}; |
+ const f32x4_t ambient = {ambient_r_, ambient_g_, ambient_b_, 0.0f}; |
+ const f32x4_t light_pos = {light_x_, light_y_, light_z_, 1.0f}; |
+ const f32x4_t planet_pos = {planet_x_, planet_y_, planet_z_, 1.0f}; |
+ const f32x4_t planet_one_over_radius = broadcast(planet_one_over_radius_); |
+ const f32x4_t planet_equator = { |
+ planet_equator_x_, planet_equator_y_, planet_equator_z_, 0.0f}; |
+ const f32x4_t planet_pole = { |
+ planet_pole_x_, planet_pole_y_, planet_pole_z_, 1.0f}; |
+ const f32x4_t planet_pole_x_equator = { |
+ planet_pole_x_equator_x_, planet_pole_x_equator_y_, |
+ planet_pole_x_equator_z_, 0.0f}; |
+ |
float width = ps_context_->width; |
float height = ps_context_->height; |
float min_dim = width < height ? width : height; |
@@ -423,42 +488,30 @@ void Planet::wRenderPixelSpan(int x0, int x1, int y) { |
continue; |
} |
- // calc parametric t value |
+ f32x4_t delta = {dx, dy, dz, 1.0f}; |
+ f32x4_t base = {x0, y0, z0, 1.0f}; |
+ |
+ // Calc parametric t value. |
float t = (-b - inline_sqrt(disc)) / (2.0f * a); |
- float px = x0 + t * dx; |
- float py = y0 + t * dy; |
- float pz = z0 + t * dz; |
- float nx = (px - planet_x_) * planet_one_over_radius_; |
- float ny = (py - planet_y_) * planet_one_over_radius_; |
- float nz = (pz - planet_z_) * planet_one_over_radius_; |
+ |
+ f32x4_t pos = base + broadcast(t) * delta; |
+ f32x4_t normal = (pos - planet_pos) * planet_one_over_radius; |
// Misc raytrace calculations. |
- float Lx = (light_x_ - px); |
- float Ly = (light_y_ - py); |
- float Lz = (light_z_ - pz); |
- float Lq = 1.0f / inline_quick_sqrt(Lx * Lx + Ly * Ly + Lz * Lz); |
- Lx *= Lq; |
- Ly *= Lq; |
- Lz *= Lq; |
- float d = (Lx * nx + Ly * ny + Lz * nz); |
- float pr = (diffuse_r_ * d) + ambient_r_; |
- float pg = (diffuse_g_ * d) + ambient_g_; |
- float pb = (diffuse_b_ * d) + ambient_b_; |
- float ds = -(nx * planet_pole_x_ + |
- ny * planet_pole_y_ + |
- nz * planet_pole_z_); |
+ f32x4_t L = light_pos - pos; |
+ float Lq = 1.0f / inline_quick_sqrt(dot3(L, L)); |
+ L = L * broadcast(Lq); |
+ float d = dot3(L, normal); |
+ f32x4_t p = diffuse * broadcast(d) + ambient; |
+ float ds = -dot3(normal, planet_pole); |
float ang = acos_.TableLerp(ds); |
float v = ang * kOneOverPI; |
- float dp = planet_equator_x_ * nx + |
- planet_equator_y_ * ny + |
- planet_equator_z_ * nz; |
- float w = dp / sin(ang); |
+ float dp = dot3(planet_equator, normal); |
+ float w = dp / sinf(ang); |
if (w > 1.0f) w = 1.0f; |
if (w < -1.0f) w = -1.0f; |
float th = acos_.TableLerp(w) * kOneOver2PI; |
- float dps = planet_pole_x_equator_x_ * nx + |
- planet_pole_x_equator_y_ * ny + |
- planet_pole_x_equator_z_ * nz; |
+ float dps = dot3(planet_pole_x_equator, normal); |
float u; |
if (dps < 0.0f) |
u = th; |
@@ -470,34 +523,21 @@ void Planet::wRenderPixelSpan(int x0, int x1, int y) { |
int ty = static_cast<int>(v * base_tex_->height); |
int offset = tx + ty * base_tex_->width; |
uint32_t base_texel = base_tex_->pixels[offset]; |
- float tr = ExtractR(base_texel); |
- float tg = ExtractG(base_texel); |
- float tb = ExtractB(base_texel); |
- |
- float ipr = 1.0f - pr; |
- if (ipr < 0.0f) ipr = 0.0f; |
- float ipg = 1.0f - pg; |
- if (ipg < 0.0f) ipg = 0.0f; |
- float ipb = 1.0f - pb; |
- if (ipb < 0.0f) ipb = 0.0f; |
+ f32x4_t dc = ExtractRGBA(base_texel); |
// Look up night texel. |
int nix = static_cast<int>(u * night_tex_->width); |
int niy = static_cast<int>(v * night_tex_->height); |
int noffset = nix + niy * night_tex_->width; |
uint32_t night_texel = night_tex_->pixels[noffset]; |
- float nr = ExtractR(night_texel); |
- float ng = ExtractG(night_texel); |
- float nb = ExtractB(night_texel); |
- |
- // Final color value is lerp between day and night texels. |
- unsigned int ir = Clamp255(pr * tr + nr * ipr); |
- unsigned int ig = Clamp255(pg * tg + ng * ipg); |
- unsigned int ib = Clamp255(pb * tb + nb * ipb); |
+ f32x4_t nc = ExtractRGBA(night_texel); |
- unsigned int color = MakeBGRA(ib, ig, ir, 0xFF); |
+ // Blend between daylight (dc) and nighttime (nc) color. |
+ f32x4_t pc = min(p, kOne); |
+ f32x4_t fc = dc * p + nc * (kOne - pc); |
+ uint32_t color = PackBGRA(fc); |
- *pixels = color; |
+ *pixels = color | kSolidAlpha; |
++pixels; |
} |
} |