| Index: media/base/vector_math.cc
|
| diff --git a/media/base/vector_math.cc b/media/base/vector_math.cc
|
| index 578290538ac26cee8080b144a39124ae344f8646..dd963d785b155e6d7bb2f229890ff56e728138f1 100644
|
| --- a/media/base/vector_math.cc
|
| +++ b/media/base/vector_math.cc
|
| @@ -3,80 +3,158 @@
|
| // found in the LICENSE file.
|
|
|
| #include "media/base/vector_math.h"
|
| -#include "media/base/vector_math_testing.h"
|
|
|
| #include <algorithm>
|
|
|
| +#include "base/cpu.h"
|
| #include "base/logging.h"
|
| #include "build/build_config.h"
|
| +#include "media/base/vector_math_testing.h"
|
|
|
| // NaCl does not allow intrinsics.
|
| #if defined(ARCH_CPU_X86_FAMILY) && !defined(OS_NACL)
|
| #include <xmmintrin.h>
|
| +
|
| +using FmacProc = void (*)(const float*, float, int, float*);
|
| +static FmacProc g_fmac_proc_ = nullptr;
|
| +using DotProductProc = float (*)(const float*, const float*, int);
|
| +static DotProductProc g_dotproduct_proc_ = nullptr;
|
| +using ConvolveProc = float (*)(const float*,
|
| + const float*,
|
| + const float*,
|
| + double);
|
| +static ConvolveProc g_convolve_proc_ = nullptr;
|
| +
|
| +// AVX FMAC only performs well on AVX2+ machines, since those machines actually
|
| +// have 256-bit processing units rather than wrappers around 128-bit units.
|
| +#define INITIALIZE() \
|
| + do { \
|
| + CHECK(!g_fmac_proc_); \
|
| + CHECK(!g_convolve_proc_); \
|
| + CHECK(!g_dotproduct_proc_); \
|
| + base::CPU cpu_info; \
|
| + g_fmac_proc_ = cpu_info.has_avx2() ? FMAC_AVX : FMAC_SSE; \
|
| + g_convolve_proc_ = cpu_info.has_avx() ? Convolve_AVX : Convolve_SSE; \
|
| + g_dotproduct_proc_ = cpu_info.has_avx() ? DotProduct_AVX : DotProduct_SSE; \
|
| + } while (0)
|
| +
|
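| +// Note: On x86, Initialize() must be called exactly once before FMAC(),
|
| +// Convolve(), or DotProduct(); the dispatch pointers above start out null.
|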
| +#define CONVOLVE_FUNC g_convolve_proc_
|
| +#define DOTPRODUCT_FUNC g_dotproduct_proc_
|
| +#define FMAC_FUNC g_fmac_proc_
|
| +
|
| // Don't use custom SSE versions where the auto-vectorized C version performs
|
| // better, which is anywhere clang is used.
|
| #if !defined(__clang__)
|
| -#define FMAC_FUNC FMAC_SSE
|
| #define FMUL_FUNC FMUL_SSE
|
| #else
|
| -#define FMAC_FUNC FMAC_C
|
| #define FMUL_FUNC FMUL_C
|
| #endif
|
| #define EWMAAndMaxPower_FUNC EWMAAndMaxPower_SSE
|
| +
|
| #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
|
| +
|
| +// NEON optimized versions.
|
| #include <arm_neon.h>
|
| +#define INITIALIZE()
|
| +#define CONVOLVE_FUNC Convolve_NEON
|
| +#define DOTPRODUCT_FUNC DotProduct_NEON
|
| #define FMAC_FUNC FMAC_NEON
|
| #define FMUL_FUNC FMUL_NEON
|
| #define EWMAAndMaxPower_FUNC EWMAAndMaxPower_NEON
|
| +
|
| #else
|
| +
|
| +// No SIMD optimization versions.
|
| +#define INITIALIZE()
|
| +#define CONVOLVE_FUNC Convolve_C
|
| +#define DOTPRODUCT_FUNC DotProduct_C
|
| #define FMAC_FUNC FMAC_C
|
| #define FMUL_FUNC FMUL_C
|
| #define EWMAAndMaxPower_FUNC EWMAAndMaxPower_C
|
| +
|
| #endif
|
|
|
| namespace media {
|
| namespace vector_math {
|
|
|
| -void FMAC(const float src[], float scale, int len, float dest[]) {
|
| - // Ensure |src| and |dest| are 16-byte aligned.
|
| +void Initialize() {
|
| + INITIALIZE();
|
| +}
|
| +
|
| +float Convolve(const float* src,
|
| + const float* k1,
|
| + const float* k2,
|
| + double kernel_interpolation_factor) {
|
| + return CONVOLVE_FUNC(src, k1, k2, kernel_interpolation_factor);
|
| +}
|
| +
|
| +float Convolve_C(const float* src,
|
| + const float* k1,
|
| + const float* k2,
|
| + double kernel_interpolation_factor) {
|
| + float sum1 = 0;
|
| + float sum2 = 0;
|
| +
|
| + // Generate a single output sample.
|
| + int n = kKernelSize;
|
| + while (n--) {
|
| + sum1 += *src * *k1++;
|
| + sum2 += *src++ * *k2++;
|
| + }
|
| +
|
| + // Linearly interpolate the two "convolutions".
|
| + return static_cast<float>((1.0 - kernel_interpolation_factor) * sum1 +
|
| + kernel_interpolation_factor * sum2);
|
| +}
|
| +
|
| +float DotProduct(const float* a, const float* b, int len) {
|
| + return DOTPRODUCT_FUNC(a, b, len);
|
| +}
|
| +
|
| +float DotProduct_C(const float* a, const float* b, int len) {
|
| + float sum = 0;
|
| + for (int i = 0; i < len; ++i)
|
| + sum += a[i] * b[i];
|
| + return sum;
|
| +}
|
| +
|
| +void FMAC(const float* src, float scale, int len, float* dest) {
|
| + // Ensure |src| and |dest| are aligned.
|
| DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(src) & (kRequiredAlignment - 1));
|
| DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(dest) & (kRequiredAlignment - 1));
|
| return FMAC_FUNC(src, scale, len, dest);
|
| }
|
|
|
| -void FMAC_C(const float src[], float scale, int len, float dest[]) {
|
| +void FMAC_C(const float* src, float scale, int len, float* dest) {
|
| for (int i = 0; i < len; ++i)
|
| dest[i] += src[i] * scale;
|
| }
|
|
|
| -void FMUL(const float src[], float scale, int len, float dest[]) {
|
| - // Ensure |src| and |dest| are 16-byte aligned.
|
| +void FMUL(const float* src, float scale, int len, float* dest) {
|
| + // Ensure |src| and |dest| are aligned.
|
| DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(src) & (kRequiredAlignment - 1));
|
| DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(dest) & (kRequiredAlignment - 1));
|
| return FMUL_FUNC(src, scale, len, dest);
|
| }
|
|
|
| -void FMUL_C(const float src[], float scale, int len, float dest[]) {
|
| +void FMUL_C(const float* src, float scale, int len, float* dest) {
|
| for (int i = 0; i < len; ++i)
|
| dest[i] = src[i] * scale;
|
| }
|
|
|
| -void Crossfade(const float src[], int len, float dest[]) {
|
| - float cf_ratio = 0;
|
| - const float cf_increment = 1.0f / len;
|
| - for (int i = 0; i < len; ++i, cf_ratio += cf_increment)
|
| - dest[i] = (1.0f - cf_ratio) * src[i] + cf_ratio * dest[i];
|
| -}
|
| -
|
| -std::pair<float, float> EWMAAndMaxPower(
|
| - float initial_value, const float src[], int len, float smoothing_factor) {
|
| +std::pair<float, float> EWMAAndMaxPower(float initial_value,
|
| + const float* src,
|
| + int len,
|
| + float smoothing_factor) {
|
| // Ensure |src| is 16-byte aligned.
|
| DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(src) & (kRequiredAlignment - 1));
|
| return EWMAAndMaxPower_FUNC(initial_value, src, len, smoothing_factor);
|
| }
|
|
|
| -std::pair<float, float> EWMAAndMaxPower_C(
|
| - float initial_value, const float src[], int len, float smoothing_factor) {
|
| +std::pair<float, float> EWMAAndMaxPower_C(float initial_value,
|
| + const float* src,
|
| + int len,
|
| + float smoothing_factor) {
|
| std::pair<float, float> result(initial_value, 0.0f);
|
| const float weight_prev = 1.0f - smoothing_factor;
|
| for (int i = 0; i < len; ++i) {
|
| @@ -90,10 +168,75 @@ std::pair<float, float> EWMAAndMaxPower_C(
|
| }
|
|
|
| #if defined(ARCH_CPU_X86_FAMILY) && !defined(OS_NACL)
|
| -void FMUL_SSE(const float src[], float scale, int len, float dest[]) {
|
| +float Convolve_SSE(const float* src,
|
| + const float* k1,
|
| + const float* k2,
|
| + double kernel_interpolation_factor) {
|
| + __m128 m_input;
|
| + __m128 m_sums1 = _mm_setzero_ps();
|
| + __m128 m_sums2 = _mm_setzero_ps();
|
| +
|
| +  // Based on |src| alignment, we need to use loadu or load. Unrolling these
|
| +  // loops hurt performance in local testing.
|
| + if (reinterpret_cast<uintptr_t>(src) & 0x0F) {
|
| + for (int i = 0; i < kKernelSize; i += 4) {
|
| + m_input = _mm_loadu_ps(src + i);
|
| + m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));
|
| + m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));
|
| + }
|
| + } else {
|
| + for (int i = 0; i < kKernelSize; i += 4) {
|
| + m_input = _mm_load_ps(src + i);
|
| + m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));
|
| + m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));
|
| + }
|
| + }
|
| +
|
| + // Linearly interpolate the two "convolutions".
|
| + m_sums1 = _mm_mul_ps(
|
| + m_sums1,
|
| + _mm_set_ps1(static_cast<float>(1.0 - kernel_interpolation_factor)));
|
| + m_sums2 = _mm_mul_ps(
|
| + m_sums2, _mm_set_ps1(static_cast<float>(kernel_interpolation_factor)));
|
| + m_sums1 = _mm_add_ps(m_sums1, m_sums2);
|
| +
|
| + // Sum components together.
|
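| +  // (movehl folds the upper two lanes onto the lower two; the shuffle then
|
| +  // moves lane 1 into lane 0 so the final add_ss yields the full sum.)
|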
| + float result;
|
| + m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1);
|
| + _mm_store_ss(&result,
|
| + _mm_add_ss(m_sums2, _mm_shuffle_ps(m_sums2, m_sums2, 1)));
|
| +
|
| + return result;
|
| +}
|
| +
|
| +float DotProduct_SSE(const float* a, const float* b, int len) {
|
| const int rem = len % 4;
|
| const int last_index = len - rem;
|
| - __m128 m_scale = _mm_set_ps1(scale);
|
| +
|
| + // First sum all components.
|
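| +  // Unlike FMAC()/FMUL(), DotProduct() makes no alignment guarantee, so
|
| +  // unaligned loads are used throughout.
|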
| + __m128 m_sum = _mm_setzero_ps();
|
| + for (int s = 0; s < last_index; s += 4) {
|
| + m_sum =
|
| + _mm_add_ps(m_sum, _mm_mul_ps(_mm_loadu_ps(a + s), _mm_loadu_ps(b + s)));
|
| + }
|
| +
|
| + // Reduce to a single float for this channel. Sadly, SSE1,2 doesn't have a
|
| + // horizontal sum function, so we have to condense manually.
|
| + float sum;
|
| + m_sum = _mm_add_ps(_mm_movehl_ps(m_sum, m_sum), m_sum);
|
| + _mm_store_ss(&sum, _mm_add_ss(m_sum, _mm_shuffle_ps(m_sum, m_sum, 1)));
|
| +
|
| + // Handle any remaining values that wouldn't fit in an SSE pass.
|
| + for (int i = last_index; i < len; ++i)
|
| + sum += a[i] * b[i];
|
| +
|
| + return sum;
|
| +}
|
| +
|
| +void FMUL_SSE(const float* src, float scale, int len, float* dest) {
|
| + const int rem = len % 4;
|
| + const int last_index = len - rem;
|
| + const __m128 m_scale = _mm_set_ps1(scale);
|
| for (int i = 0; i < last_index; i += 4)
|
| _mm_store_ps(dest + i, _mm_mul_ps(_mm_load_ps(src + i), m_scale));
|
|
|
| @@ -102,13 +245,14 @@ void FMUL_SSE(const float src[], float scale, int len, float dest[]) {
|
| dest[i] = src[i] * scale;
|
| }
|
|
|
| -void FMAC_SSE(const float src[], float scale, int len, float dest[]) {
|
| +void FMAC_SSE(const float* src, float scale, int len, float* dest) {
|
| const int rem = len % 4;
|
| const int last_index = len - rem;
|
| - __m128 m_scale = _mm_set_ps1(scale);
|
| + const __m128 m_scale = _mm_set_ps1(scale);
|
| for (int i = 0; i < last_index; i += 4) {
|
| - _mm_store_ps(dest + i, _mm_add_ps(_mm_load_ps(dest + i),
|
| - _mm_mul_ps(_mm_load_ps(src + i), m_scale)));
|
| + _mm_store_ps(dest + i,
|
| + _mm_add_ps(_mm_load_ps(dest + i),
|
| + _mm_mul_ps(_mm_load_ps(src + i), m_scale)));
|
| }
|
|
|
| // Handle any remaining values that wouldn't fit in an SSE pass.
|
| @@ -120,12 +264,12 @@ void FMAC_SSE(const float src[], float scale, int len, float dest[]) {
|
| // needed because compilers other than clang don't support access via
|
| // operator[]().
|
| #define EXTRACT_FLOAT(a, i) \
|
| - (i == 0 ? \
|
| - _mm_cvtss_f32(a) : \
|
| - _mm_cvtss_f32(_mm_shuffle_ps(a, a, i)))
|
| + (i == 0 ? _mm_cvtss_f32(a) : _mm_cvtss_f32(_mm_shuffle_ps(a, a, i)))
|
|
|
| -std::pair<float, float> EWMAAndMaxPower_SSE(
|
| - float initial_value, const float src[], int len, float smoothing_factor) {
|
| +std::pair<float, float> EWMAAndMaxPower_SSE(float initial_value,
|
| + const float* src,
|
| + int len,
|
| + float smoothing_factor) {
|
| // When the recurrence is unrolled, we see that we can split it into 4
|
| // separate lanes of evaluation:
|
| //
|
| @@ -159,10 +303,8 @@ std::pair<float, float> EWMAAndMaxPower_SSE(
|
| const __m128 sample_x4 = _mm_load_ps(src + i);
|
| const __m128 sample_squared_x4 = _mm_mul_ps(sample_x4, sample_x4);
|
| max_x4 = _mm_max_ps(max_x4, sample_squared_x4);
|
| - // Note: The compiler optimizes this to a single multiply-and-accumulate
|
| - // instruction:
|
| - ewma_x4 = _mm_add_ps(ewma_x4,
|
| - _mm_mul_ps(sample_squared_x4, smoothing_factor_x4));
|
| + ewma_x4 =
|
| + _mm_add_ps(ewma_x4, _mm_mul_ps(sample_squared_x4, smoothing_factor_x4));
|
| }
|
|
|
| // y[n] = z[n] + (1-a)^1(z[n-1]) + (1-a)^2(z[n-2]) + (1-a)^3(z[n-3])
|
| @@ -195,13 +337,61 @@ std::pair<float, float> EWMAAndMaxPower_SSE(
|
| #endif
|
|
|
| #if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
|
| -void FMAC_NEON(const float src[], float scale, int len, float dest[]) {
|
| +float Convolve_NEON(const float* src,
|
| + const float* k1,
|
| + const float* k2,
|
| + double kernel_interpolation_factor) {
|
| + float32x4_t m_input;
|
| + float32x4_t m_sums1 = vmovq_n_f32(0);
|
| + float32x4_t m_sums2 = vmovq_n_f32(0);
|
| +
|
| + const float* upper = src + kKernelSize;
|
| +  for (; src < upper;) {
|
| + m_input = vld1q_f32(src);
|
| + src += 4;
|
| + m_sums1 = vmlaq_f32(m_sums1, m_input, vld1q_f32(k1));
|
| + k1 += 4;
|
| + m_sums2 = vmlaq_f32(m_sums2, m_input, vld1q_f32(k2));
|
| + k2 += 4;
|
| + }
|
| +
|
| + // Linearly interpolate the two "convolutions".
|
| + m_sums1 = vmlaq_f32(
|
| + vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)),
|
| + m_sums2, vmovq_n_f32(kernel_interpolation_factor));
|
| +
|
| + // Sum components together.
|
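| +  // (add the high and low halves, then a pairwise add leaves the total in
|
| +  // lane 0.)
|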
| + float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1));
|
| + return vget_lane_f32(vpadd_f32(m_half, m_half), 0);
|
| +}
|
| +
|
| +float DotProduct_NEON(const float* a, const float* b, int len) {
|
| + const int rem = len % 4;
|
| + const int last_index = len - rem;
|
| +
|
| + // First sum all components.
|
| + float32x4_t m_sum = vmovq_n_f32(0);
|
| + for (int s = 0; s < last_index; s += 4)
|
| + m_sum = vmlaq_f32(m_sum, vld1q_f32(a + s), vld1q_f32(b + s));
|
| +
|
| + // Reduce to a single float for this channel.
|
| + float32x2_t m_half = vadd_f32(vget_high_f32(m_sum), vget_low_f32(m_sum));
|
| + float sum = vget_lane_f32(vpadd_f32(m_half, m_half), 0);
|
| +
|
| + // Handle any remaining values that wouldn't fit in an NEON pass.
|
| + for (int i = last_index; i < len; ++i)
|
| + sum += a[i] * b[i];
|
| +
|
| + return sum;
|
| +}
|
| +
|
| +void FMAC_NEON(const float* src, float scale, int len, float* dest) {
|
| const int rem = len % 4;
|
| const int last_index = len - rem;
|
| - float32x4_t m_scale = vmovq_n_f32(scale);
|
| + const float32x4_t m_scale = vmovq_n_f32(scale);
|
| for (int i = 0; i < last_index; i += 4) {
|
| - vst1q_f32(dest + i, vmlaq_f32(
|
| - vld1q_f32(dest + i), vld1q_f32(src + i), m_scale));
|
| + vst1q_f32(dest + i,
|
| + vmlaq_f32(vld1q_f32(dest + i), vld1q_f32(src + i), m_scale));
|
| }
|
|
|
| // Handle any remaining values that wouldn't fit in an NEON pass.
|
| @@ -209,10 +399,10 @@ void FMAC_NEON(const float src[], float scale, int len, float dest[]) {
|
| dest[i] += src[i] * scale;
|
| }
|
|
|
| -void FMUL_NEON(const float src[], float scale, int len, float dest[]) {
|
| +void FMUL_NEON(const float* src, float scale, int len, float* dest) {
|
| const int rem = len % 4;
|
| const int last_index = len - rem;
|
| - float32x4_t m_scale = vmovq_n_f32(scale);
|
| + const float32x4_t m_scale = vmovq_n_f32(scale);
|
| for (int i = 0; i < last_index; i += 4)
|
| vst1q_f32(dest + i, vmulq_f32(vld1q_f32(src + i), m_scale));
|
|
|
| @@ -221,8 +411,10 @@ void FMUL_NEON(const float src[], float scale, int len, float dest[]) {
|
| dest[i] = src[i] * scale;
|
| }
|
|
|
| -std::pair<float, float> EWMAAndMaxPower_NEON(
|
| - float initial_value, const float src[], int len, float smoothing_factor) {
|
| +std::pair<float, float> EWMAAndMaxPower_NEON(float initial_value,
|
| + const float* src,
|
| + int len,
|
| + float smoothing_factor) {
|
| // When the recurrence is unrolled, we see that we can split it into 4
|
| // separate lanes of evaluation:
|
| //
|
|
|