native_client_sdk/src/examples/demo/earth_simd/earth.cc - Issue 289023002: Initial SIMD demos life and earth for PNaCl.

Unified Diff: native_client_sdk/src/examples/demo/earth_simd/earth.cc

Issue 289023002: Initial SIMD demos life and earth for PNaCl. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Created 6 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« no previous file with comments | « native_client_sdk/src/examples/demo/earth/example.js ('k') | native_client_sdk/src/examples/demo/earth_simd/earth.jpg » ('j') | native_client_sdk/src/examples/demo/earth_simd/index.html » ('J')
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: native_client_sdk/src/examples/demo/earth_simd/earth.cc

diff --git a/native_client_sdk/src/examples/demo/earth/earth.cc b/native_client_sdk/src/examples/demo/earth_simd/earth.cc

similarity index 84%

copy from native_client_sdk/src/examples/demo/earth/earth.cc

copy to native_client_sdk/src/examples/demo/earth_simd/earth.cc

index 420448bb1f476dec109ad7c856286b055892feed..ca3173cff9d5f423cc5a16401c43ad1671638057 100644

--- a/native_client_sdk/src/examples/demo/earth/earth.cc

+++ b/native_client_sdk/src/examples/demo/earth_simd/earth.cc

@@ -30,13 +30,20 @@

using namespace sdk_util; // For sdk_util::ThreadPool

+#define INLINE inline __attribute__((always_inline))

+// 128 bit SIMD vector types

+typedef uint8_t u8x16_t __attribute__ ((vector_size (16)));

+typedef int32_t i32x4_t __attribute__ ((vector_size (16)));

+typedef uint32_t u32x4_t __attribute__ ((vector_size (16)));

+typedef float f32x4_t __attribute__ ((vector_size (16)));

// Global properties used to setup Earth demo.

namespace {

const float kPI = M_PI;

const float kTwoPI = kPI * 2.0f;

const float kOneOverPI = 1.0f / kPI;

const float kOneOver2PI = 1.0f / kTwoPI;

-const float kOneOver255 = 1.0f / 255.0f;

const int kArcCosineTableSize = 4096;

const int kFramesToBenchmark = 100;

const float kZoomMin = 1.0f;

@@ -58,21 +65,56 @@ inline double getseconds() {

return 0.0;

}

-// RGBA helper functions, used for extracting color from RGBA source image.

-inline float ExtractR(uint32_t c) {

- return static_cast<float>(c & 0xFF) * kOneOver255;

+// SIMD Vector helper functions.

+INLINE f32x4_t min(f32x4_t a, f32x4_t b) {

+ i32x4_t m = a < b;

+ return (f32x4_t)(((i32x4_t)a & m) | ((i32x4_t)b & ~m));

+INLINE f32x4_t max(f32x4_t a, f32x4_t b) {

+ i32x4_t m = a > b;

+ return (f32x4_t)(((i32x4_t)a & m) | ((i32x4_t)b & ~m));

+INLINE float dot3(f32x4_t a, f32x4_t b) {

+ f32x4_t c = a * b;

+ return c[0] + c[1] + c[2];

+INLINE f32x4_t broadcast(float x) {

+ f32x4_t r = {x, x, x, x};

+ return r;

}

-inline float ExtractG(uint32_t c) {

- return static_cast<float>((c & 0xFF00) >> 8) * kOneOver255;

+// SIMD RGBA helper functions, used for extracting color from RGBA source image.

+INLINE f32x4_t ExtractRGBA(uint32_t c) {

+ const f32x4_t kOneOver255 = broadcast(1.0f / 255.0f);

+ const i32x4_t kZero = {0, 0, 0, 0};

+ i32x4_t v = {c, c, c, c};

+ // zero extend packed color into 32x4 integer vector

+ v = (i32x4_t)__builtin_shufflevector((u8x16_t)v, (u8x16_t)kZero,

+ 0, 16, 16, 16, 1, 16, 16, 16, 2, 16, 16, 16, 3, 16, 16, 16);

+ // convert color values to float, range 0..1

+ f32x4_t f = __builtin_convertvector(v, f32x4_t) * kOneOver255;

+ return f;

}

-inline float ExtractB(uint32_t c) {

- return static_cast<float>((c & 0xFF0000) >> 16) * kOneOver255;

+// SIMD BGRA helper function, for constructing a pixel for a BGRA buffer.

+INLINE uint32_t PackBGRA(f32x4_t f) {

+ const f32x4_t kZero = broadcast(0.0f);

+ const f32x4_t kHalf = broadcast(0.5f);

+ const f32x4_t k255 = broadcast(255.0f);

+ f = max(f, kZero);

+ f = f * k255 + kHalf;

+ f = min(f, k255);

+ i32x4_t i = __builtin_convertvector(f, i32x4_t);

+ u32x4_t p = (u32x4_t)__builtin_shufflevector((u8x16_t)i, (u8x16_t)i,

+ 8, 4, 0, 12, 8, 4, 0, 12, 8, 4, 0, 12, 8, 4, 0, 12);

+ return p[0];

}

// BGRA helper function, for constructing a pixel for a BGRA buffer.

-inline uint32_t MakeBGRA(uint32_t b, uint32_t g, uint32_t r, uint32_t a) {

+INLINE uint32_t MakeBGRA(uint32_t b, uint32_t g, uint32_t r, uint32_t a) {

return (((a) << 24) | ((r) << 16) | ((g) << 8) | (b));

}

@@ -113,7 +155,7 @@ ArcCosine::ArcCosine() {

// looks up acos(f) using a table and lerping between entries

// (it is expected that input f is between -1 and 1)

-float ArcCosine::TableLerp(float f) {

+INLINE float ArcCosine::TableLerp(float f) {

float x = (f + 1.0f) * 0.5f;

x = x * kArcCosineTableSize;

int ix = static_cast<int>(x);

@@ -134,25 +176,25 @@ union Convert {

float AsFloat() { return f; }

};

-inline const int AsInteger(const float f) {

+INLINE const int AsInteger(const float f) {

Convert u(f);

return u.AsInt();

}

-inline const float AsFloat(const int i) {

+INLINE const float AsFloat(const int i) {

Convert u(i);

return u.AsFloat();

}

const long int kOneAsInteger = AsInteger(1.0f);

-inline float inline_quick_sqrt(float x) {

+INLINE float inline_quick_sqrt(float x) {

int i;

i = (AsInteger(x) >> 1) + (kOneAsInteger >> 1);

return AsFloat(i);

}

-inline float inline_sqrt(float x) {

+INLINE float inline_sqrt(float x) {

float y;

y = inline_quick_sqrt(x);

y = (y * y + x) / (2.0f * y);

@@ -160,15 +202,6 @@ inline float inline_sqrt(float x) {

return y;

}

-// takes a -0..1+ color, clamps it to 0..1 and maps it to 0..255 integer

-inline uint32_t Clamp255(float x) {

- if (x < 0.0f) {

- x = 0.0f;

- } else if (x > 1.0f) {

- x = 1.0f;

- }

- return static_cast<uint32_t>(x * 255.0f);

} // namespace

@@ -376,13 +409,30 @@ inline uint32_t* Planet::wGetAddr(int x, int y) {

return ps_context_->data + x + y * ps_context_->stride / sizeof(uint32_t);

}

-// This is the meat of the ray tracer. Given a pixel span (x0, x1) on

+// This is the inner loop of the ray tracer. Given a pixel span (x0, x1) on

// scanline y, shoot rays into the scene and render what they hit. Use

-// scanline coherence to do a few optimizations

+// scanline coherence to do a few optimizations.

+// This version uses portable SIMD 4 element single precision floating point

+// vectors to perform many of the calculations, and builds only on PNaCl.

void Planet::wRenderPixelSpan(int x0, int x1, int y) {

if (!base_tex_ || !night_tex_)

return;

- const int kColorBlack = MakeBGRA(0, 0, 0, 0xFF);

+ const uint32_t kColorBlack = MakeBGRA(0, 0, 0, 0xFF);

+ const uint32_t kSolidAlpha = MakeBGRA(0, 0, 0, 0xFF);

+ const f32x4_t kOne = {1.0f, 1.0f, 1.0f, 1.0f};

+ const f32x4_t diffuse = {diffuse_r_, diffuse_g_, diffuse_b_, 0.0f};

+ const f32x4_t ambient = {ambient_r_, ambient_g_, ambient_b_, 0.0f};

+ const f32x4_t light_pos = {light_x_, light_y_, light_z_, 1.0f};

+ const f32x4_t planet_pos = {planet_x_, planet_y_, planet_z_, 1.0f};

+ const f32x4_t planet_one_over_radius = broadcast(planet_one_over_radius_);

+ const f32x4_t planet_equator = {

+ planet_equator_x_, planet_equator_y_, planet_equator_z_, 0.0f};

+ const f32x4_t planet_pole = {

+ planet_pole_x_, planet_pole_y_, planet_pole_z_, 1.0f};

+ const f32x4_t planet_pole_x_equator = {

+ planet_pole_x_equator_x_, planet_pole_x_equator_y_,

+ planet_pole_x_equator_z_, 0.0f};

float width = ps_context_->width;

float height = ps_context_->height;

float min_dim = width < height ? width : height;

@@ -423,42 +473,30 @@ void Planet::wRenderPixelSpan(int x0, int x1, int y) {

continue;

}

- // calc parametric t value

+ f32x4_t delta = {dx, dy, dz, 1.0f};

+ f32x4_t base = {x0, y0, z0, 1.0f};

+ // Calc parametric t value.

float t = (-b - inline_sqrt(disc)) / (2.0f * a);

- float px = x0 + t * dx;

- float py = y0 + t * dy;

- float pz = z0 + t * dz;

- float nx = (px - planet_x_) * planet_one_over_radius_;

- float ny = (py - planet_y_) * planet_one_over_radius_;

- float nz = (pz - planet_z_) * planet_one_over_radius_;

+ f32x4_t pos = base + broadcast(t) * delta;

+ f32x4_t normal = (pos - planet_pos) * planet_one_over_radius;

// Misc raytrace calculations.

- float Lx = (light_x_ - px);

- float Ly = (light_y_ - py);

- float Lz = (light_z_ - pz);

- float Lq = 1.0f / inline_quick_sqrt(Lx * Lx + Ly * Ly + Lz * Lz);

- Lx *= Lq;

- Ly *= Lq;

- Lz *= Lq;

- float d = (Lx * nx + Ly * ny + Lz * nz);

- float pr = (diffuse_r_ * d) + ambient_r_;

- float pg = (diffuse_g_ * d) + ambient_g_;

- float pb = (diffuse_b_ * d) + ambient_b_;

- float ds = -(nx * planet_pole_x_ +

- ny * planet_pole_y_ +

- nz * planet_pole_z_);

+ f32x4_t L = light_pos - pos;

+ float Lq = 1.0f / inline_quick_sqrt(dot3(L, L));

+ L = L * broadcast(Lq);

+ float d = dot3(L, normal);

+ f32x4_t p = diffuse * broadcast(d) + ambient;

+ float ds = -dot3(normal, planet_pole);

float ang = acos_.TableLerp(ds);

float v = ang * kOneOverPI;

- float dp = planet_equator_x_ * nx +

- planet_equator_y_ * ny +

- planet_equator_z_ * nz;

- float w = dp / sin(ang);

+ float dp = dot3(planet_equator, normal);

+ float w = dp / sinf(ang);

if (w > 1.0f) w = 1.0f;

if (w < -1.0f) w = -1.0f;

float th = acos_.TableLerp(w) * kOneOver2PI;

- float dps = planet_pole_x_equator_x_ * nx +

- planet_pole_x_equator_y_ * ny +

- planet_pole_x_equator_z_ * nz;

+ float dps = dot3(planet_pole_x_equator, normal);

float u;

if (dps < 0.0f)

u = th;

@@ -470,34 +508,20 @@ void Planet::wRenderPixelSpan(int x0, int x1, int y) {

int ty = static_cast<int>(v * base_tex_->height);

int offset = tx + ty * base_tex_->width;

uint32_t base_texel = base_tex_->pixels[offset];

- float tr = ExtractR(base_texel);

- float tg = ExtractG(base_texel);

- float tb = ExtractB(base_texel);

- float ipr = 1.0f - pr;

binji 2014/05/15 18:58:37 You don't clamp this anymore?

nfullagar 2014/05/15 21:25:13 good catch! (It's only slightly visual when adjusti…

- if (ipr < 0.0f) ipr = 0.0f;

- float ipg = 1.0f - pg;

- if (ipg < 0.0f) ipg = 0.0f;

- float ipb = 1.0f - pb;

- if (ipb < 0.0f) ipb = 0.0f;

+ f32x4_t dc = ExtractRGBA(base_texel);

// Look up night texel.

int nix = static_cast<int>(u * night_tex_->width);

int niy = static_cast<int>(v * night_tex_->height);

int noffset = nix + niy * night_tex_->width;

uint32_t night_texel = night_tex_->pixels[noffset];

- float nr = ExtractR(night_texel);

- float ng = ExtractG(night_texel);

- float nb = ExtractB(night_texel);

- // Final color value is lerp between day and night texels.

- unsigned int ir = Clamp255(pr * tr + nr * ipr);

- unsigned int ig = Clamp255(pg * tg + ng * ipg);

- unsigned int ib = Clamp255(pb * tb + nb * ipb);

+ f32x4_t nc = ExtractRGBA(night_texel);

- unsigned int color = MakeBGRA(ib, ig, ir, 0xFF);

+ // Blend between daylight (dc) and nighttime (nc) color.

+ f32x4_t fc = dc * p + nc * (kOne - p);

+ uint32_t color = PackBGRA(fc);

- *pixels = color;

+ *pixels = color | kSolidAlpha;

++pixels;

}