| Index: media/base/vector_math.cc
|
| diff --git a/media/base/vector_math.cc b/media/base/vector_math.cc
|
| index 578290538ac26cee8080b144a39124ae344f8646..dd963d785b155e6d7bb2f229890ff56e728138f1 100644
|
| --- a/media/base/vector_math.cc
|
| +++ b/media/base/vector_math.cc
|
| @@ -3,80 +3,158 @@
|
| // found in the LICENSE file.
|
|
|
| #include "media/base/vector_math.h"
|
| -#include "media/base/vector_math_testing.h"
|
|
|
| #include <algorithm>
|
|
|
| +#include "base/cpu.h"
|
| #include "base/logging.h"
|
| #include "build/build_config.h"
|
| +#include "media/base/vector_math_testing.h"
|
|
|
| // NaCl does not allow intrinsics.
|
| #if defined(ARCH_CPU_X86_FAMILY) && !defined(OS_NACL)
|
| #include <xmmintrin.h>
|
| +
|
| +using FmacProc = void (*)(const float*, float, int, float*);
|
| +static FmacProc g_fmac_proc_ = nullptr;
|
| +using DotProductProc = float (*)(const float*, const float*, int);
|
| +static DotProductProc g_dotproduct_proc_ = nullptr;
|
| +using ConvolveProc = float (*)(const float*,
|
| + const float*,
|
| + const float*,
|
| + double);
|
| +static ConvolveProc g_convolve_proc_ = nullptr;
|
| +
|
| +// AVX FMAC only performs well on AVX2+ machines, since those machines actually
|
| +// have 256-bit processing units rather than wrappers around 128-bit units.
|
| +#define INITIALIZE() \
|
| + do { \
|
| + CHECK(!g_fmac_proc_); \
|
| + CHECK(!g_convolve_proc_); \
|
| + CHECK(!g_dotproduct_proc_); \
|
| + base::CPU cpu_info; \
|
| + g_fmac_proc_ = cpu_info.has_avx2() ? FMAC_AVX : FMAC_SSE; \
|
| + g_convolve_proc_ = cpu_info.has_avx() ? Convolve_AVX : Convolve_SSE; \
|
| + g_dotproduct_proc_ = cpu_info.has_avx() ? DotProduct_AVX : DotProduct_SSE; \
|
| + } while (0)
|
| +
|
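| +// Note: On x86, Initialize() must be called exactly once before FMAC(),
|
| +// Convolve(), or DotProduct(); the dispatch pointers above start out null.
|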
| +#define CONVOLVE_FUNC g_convolve_proc_
|
| +#define DOTPRODUCT_FUNC g_dotproduct_proc_
|
| +#define FMAC_FUNC g_fmac_proc_
|
| +
|
| // Don't use custom SSE versions where the auto-vectorized C version performs
|
| // better, which is anywhere clang is used.
|
| #if !defined(__clang__)
|
| -#define FMAC_FUNC FMAC_SSE
|
| #define FMUL_FUNC FMUL_SSE
|
| #else
|
| -#define FMAC_FUNC FMAC_C
|
| #define FMUL_FUNC FMUL_C
|
| #endif
|
| #define EWMAAndMaxPower_FUNC EWMAAndMaxPower_SSE
|
| +
|
| #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
|
| +
|
| +// NEON optimized versions.
|
| #include <arm_neon.h>
|
| +#define INITIALIZE()
|
| +#define CONVOLVE_FUNC Convolve_NEON
|
| +#define DOTPRODUCT_FUNC DotProduct_NEON
|
| #define FMAC_FUNC FMAC_NEON
|
| #define FMUL_FUNC FMUL_NEON
|
| #define EWMAAndMaxPower_FUNC EWMAAndMaxPower_NEON
|
| +
|
| #else
|
| +
|
| +// No SIMD optimization versions.
|
| +#define INITIALIZE()
|
| +#define CONVOLVE_FUNC Convolve_C
|
| +#define DOTPRODUCT_FUNC DotProduct_C
|
| #define FMAC_FUNC FMAC_C
|
| #define FMUL_FUNC FMUL_C
|
| #define EWMAAndMaxPower_FUNC EWMAAndMaxPower_C
|
| +
|
| #endif
|
|
|
| namespace media {
|
| namespace vector_math {
|
|
|
| -void FMAC(const float src[], float scale, int len, float dest[]) {
|
| - // Ensure |src| and |dest| are 16-byte aligned.
|
| +void Initialize() {
|
| + INITIALIZE();
|
| +}
|
| +
|
| +float Convolve(const float* src,
|
| + const float* k1,
|
| + const float* k2,
|
| + double kernel_interpolation_factor) {
|
| + return CONVOLVE_FUNC(src, k1, k2, kernel_interpolation_factor);
|
| +}
|
| +
|
| +float Convolve_C(const float* src,
|
| + const float* k1,
|
| + const float* k2,
|
| + double kernel_interpolation_factor) {
|
| + float sum1 = 0;
|
| + float sum2 = 0;
|
| +
|
| + // Generate a single output sample.
|
| + int n = kKernelSize;
|
| + while (n--) {
|
| + sum1 += *src * *k1++;
|
| + sum2 += *src++ * *k2++;
|
| + }
|
| +
|
| + // Linearly interpolate the two "convolutions".
|
| + return static_cast<float>((1.0 - kernel_interpolation_factor) * sum1 +
|
| + kernel_interpolation_factor * sum2);
|
| +}
|
| +
|
| +float DotProduct(const float* a, const float* b, int len) {
|
| + return DOTPRODUCT_FUNC(a, b, len);
|
| +}
|
| +
|
| +float DotProduct_C(const float* a, const float* b, int len) {
|
| + float sum = 0;
|
| + for (int i = 0; i < len; ++i)
|
| + sum += a[i] * b[i];
|
| + return sum;
|
| +}
|
| +
|
| +void FMAC(const float* src, float scale, int len, float* dest) {
|
| + // Ensure |src| and |dest| are aligned.
|
| DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(src) & (kRequiredAlignment - 1));
|
| DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(dest) & (kRequiredAlignment - 1));
|
| return FMAC_FUNC(src, scale, len, dest);
|
| }
|
|
|
| -void FMAC_C(const float src[], float scale, int len, float dest[]) {
|
| +void FMAC_C(const float* src, float scale, int len, float* dest) {
|
| for (int i = 0; i < len; ++i)
|
| dest[i] += src[i] * scale;
|
| }
|
|
|
| -void FMUL(const float src[], float scale, int len, float dest[]) {
|
| - // Ensure |src| and |dest| are 16-byte aligned.
|
| +void FMUL(const float* src, float scale, int len, float* dest) {
|
| + // Ensure |src| and |dest| are aligned.
|
| DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(src) & (kRequiredAlignment - 1));
|
| DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(dest) & (kRequiredAlignment - 1));
|
| return FMUL_FUNC(src, scale, len, dest);
|
| }
|
|
|
| -void FMUL_C(const float src[], float scale, int len, float dest[]) {
|
| +void FMUL_C(const float* src, float scale, int len, float* dest) {
|
| for (int i = 0; i < len; ++i)
|
| dest[i] = src[i] * scale;
|
| }
|
|
|
| -void Crossfade(const float src[], int len, float dest[]) {
|
| - float cf_ratio = 0;
|
| - const float cf_increment = 1.0f / len;
|
| - for (int i = 0; i < len; ++i, cf_ratio += cf_increment)
|
| - dest[i] = (1.0f - cf_ratio) * src[i] + cf_ratio * dest[i];
|
| -}
|
| -
|
| -std::pair<float, float> EWMAAndMaxPower(
|
| - float initial_value, const float src[], int len, float smoothing_factor) {
|
| +std::pair<float, float> EWMAAndMaxPower(float initial_value,
|
| + const float* src,
|
| + int len,
|
| + float smoothing_factor) {
|
| // Ensure |src| is 16-byte aligned.
|
| DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(src) & (kRequiredAlignment - 1));
|
| return EWMAAndMaxPower_FUNC(initial_value, src, len, smoothing_factor);
|
| }
|
|
|
| -std::pair<float, float> EWMAAndMaxPower_C(
|
| - float initial_value, const float src[], int len, float smoothing_factor) {
|
| +std::pair<float, float> EWMAAndMaxPower_C(float initial_value,
|
| + const float* src,
|
| + int len,
|
| + float smoothing_factor) {
|
| std::pair<float, float> result(initial_value, 0.0f);
|
| const float weight_prev = 1.0f - smoothing_factor;
|
| for (int i = 0; i < len; ++i) {
|
| @@ -90,10 +168,75 @@ std::pair<float, float> EWMAAndMaxPower_C(
|
| }
|
|
|
| #if defined(ARCH_CPU_X86_FAMILY) && !defined(OS_NACL)
|
| -void FMUL_SSE(const float src[], float scale, int len, float dest[]) {
|
| +float Convolve_SSE(const float* src,
|
| + const float* k1,
|
| + const float* k2,
|
| + double kernel_interpolation_factor) {
|
| + __m128 m_input;
|
| + __m128 m_sums1 = _mm_setzero_ps();
|
| + __m128 m_sums2 = _mm_setzero_ps();
|
| +
|
| +  // Based on |src| alignment, we need to use loadu or load. Unrolling these
|
| +  // loops hurt performance in local testing.
|
| + if (reinterpret_cast<uintptr_t>(src) & 0x0F) {
|
| + for (int i = 0; i < kKernelSize; i += 4) {
|
| + m_input = _mm_loadu_ps(src + i);
|
| + m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));
|
| + m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));
|
| + }
|
| + } else {
|
| + for (int i = 0; i < kKernelSize; i += 4) {
|
| + m_input = _mm_load_ps(src + i);
|
| + m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));
|
| + m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));
|
| + }
|
| + }
|
| +
|
| + // Linearly interpolate the two "convolutions".
|
| + m_sums1 = _mm_mul_ps(
|
| + m_sums1,
|
| + _mm_set_ps1(static_cast<float>(1.0 - kernel_interpolation_factor)));
|
| + m_sums2 = _mm_mul_ps(
|
| + m_sums2, _mm_set_ps1(static_cast<float>(kernel_interpolation_factor)));
|
| + m_sums1 = _mm_add_ps(m_sums1, m_sums2);
|
| +
|
| + // Sum components together.
|
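| +  // (movehl folds the upper two lanes onto the lower two; the shuffle then
|
| +  // moves lane 1 into lane 0 so the final add_ss yields the full sum.)
|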
| + float result;
|
| + m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1);
|
| + _mm_store_ss(&result,
|
| + _mm_add_ss(m_sums2, _mm_shuffle_ps(m_sums2, m_sums2, 1)));
|
| +
|
| + return result;
|
| +}
|
| +
|
| +float DotProduct_SSE(const float* a, const float* b, int len) {
|
| const int rem = len % 4;
|
| const int last_index = len - rem;
|
| - __m128 m_scale = _mm_set_ps1(scale);
|
| +
|
| + // First sum all components.
|
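| +  // Unlike FMAC()/FMUL(), DotProduct() makes no alignment guarantee, so
|
| +  // unaligned loads are used throughout.
|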
| + __m128 m_sum = _mm_setzero_ps();
|
| + for (int s = 0; s < last_index; s += 4) {
|
| + m_sum =
|
| + _mm_add_ps(m_sum, _mm_mul_ps(_mm_loadu_ps(a + s), _mm_loadu_ps(b + s)));
|
| + }
|
| +
|
| + // Reduce to a single float for this channel. Sadly, SSE1,2 doesn't have a
|
| + // horizontal sum function, so we have to condense manually.
|
| + float sum;
|
| + m_sum = _mm_add_ps(_mm_movehl_ps(m_sum, m_sum), m_sum);
|
| + _mm_store_ss(&sum, _mm_add_ss(m_sum, _mm_shuffle_ps(m_sum, m_sum, 1)));
|
| +
|
| + // Handle any remaining values that wouldn't fit in an SSE pass.
|
| + for (int i = last_index; i < len; ++i)
|
| + sum += a[i] * b[i];
|
| +
|
| + return sum;
|
| +}
|
| +
|
| +void FMUL_SSE(const float* src, float scale, int len, float* dest) {
|
| + const int rem = len % 4;
|
| + const int last_index = len - rem;
|
| + const __m128 m_scale = _mm_set_ps1(scale);
|
| for (int i = 0; i < last_index; i += 4)
|
| _mm_store_ps(dest + i, _mm_mul_ps(_mm_load_ps(src + i), m_scale));
|
|
|
| @@ -102,13 +245,14 @@ void FMUL_SSE(const float src[], float scale, int len, float dest[]) {
|
| dest[i] = src[i] * scale;
|
| }
|
|
|
| -void FMAC_SSE(const float src[], float scale, int len, float dest[]) {
|
| +void FMAC_SSE(const float* src, float scale, int len, float* dest) {
|
| const int rem = len % 4;
|
| const int last_index = len - rem;
|
| - __m128 m_scale = _mm_set_ps1(scale);
|
| + const __m128 m_scale = _mm_set_ps1(scale);
|
| for (int i = 0; i < last_index; i += 4) {
|
| - _mm_store_ps(dest + i, _mm_add_ps(_mm_load_ps(dest + i),
|
| - _mm_mul_ps(_mm_load_ps(src + i), m_scale)));
|
| + _mm_store_ps(dest + i,
|
| + _mm_add_ps(_mm_load_ps(dest + i),
|
| + _mm_mul_ps(_mm_load_ps(src + i), m_scale)));
|
| }
|
|
|
| // Handle any remaining values that wouldn't fit in an SSE pass.
|
| @@ -120,12 +264,12 @@ void FMAC_SSE(const float src[], float scale, int len, float dest[]) {
|
| // needed because compilers other than clang don't support access via
|
| // operator[]().
|
| #define EXTRACT_FLOAT(a, i) \
|
| - (i == 0 ? \
|
| - _mm_cvtss_f32(a) : \
|
| - _mm_cvtss_f32(_mm_shuffle_ps(a, a, i)))
|
| + (i == 0 ? _mm_cvtss_f32(a) : _mm_cvtss_f32(_mm_shuffle_ps(a, a, i)))
|
|
|
| -std::pair<float, float> EWMAAndMaxPower_SSE(
|
| - float initial_value, const float src[], int len, float smoothing_factor) {
|
| +std::pair<float, float> EWMAAndMaxPower_SSE(float initial_value,
|
| + const float* src,
|
| + int len,
|
| + float smoothing_factor) {
|
| // When the recurrence is unrolled, we see that we can split it into 4
|
| // separate lanes of evaluation:
|
| //
|
| @@ -159,10 +303,8 @@ std::pair<float, float> EWMAAndMaxPower_SSE(
|
| const __m128 sample_x4 = _mm_load_ps(src + i);
|
| const __m128 sample_squared_x4 = _mm_mul_ps(sample_x4, sample_x4);
|
| max_x4 = _mm_max_ps(max_x4, sample_squared_x4);
|
| - // Note: The compiler optimizes this to a single multiply-and-accumulate
|
| - // instruction:
|
| - ewma_x4 = _mm_add_ps(ewma_x4,
|
| - _mm_mul_ps(sample_squared_x4, smoothing_factor_x4));
|
| + ewma_x4 =
|
| + _mm_add_ps(ewma_x4, _mm_mul_ps(sample_squared_x4, smoothing_factor_x4));
|
| }
|
|
|
| // y[n] = z[n] + (1-a)^1(z[n-1]) + (1-a)^2(z[n-2]) + (1-a)^3(z[n-3])
|
| @@ -195,13 +337,61 @@ std::pair<float, float> EWMAAndMaxPower_SSE(
|
| #endif
|
|
|
| #if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
|
| -void FMAC_NEON(const float src[], float scale, int len, float dest[]) {
|
| +float Convolve_NEON(const float* src,
|
| + const float* k1,
|
| + const float* k2,
|
| + double kernel_interpolation_factor) {
|
| + float32x4_t m_input;
|
| + float32x4_t m_sums1 = vmovq_n_f32(0);
|
| + float32x4_t m_sums2 = vmovq_n_f32(0);
|
| +
|
| + const float* upper = src + kKernelSize;
|
| +  for (; src < upper;) {
|
| + m_input = vld1q_f32(src);
|
| + src += 4;
|
| + m_sums1 = vmlaq_f32(m_sums1, m_input, vld1q_f32(k1));
|
| + k1 += 4;
|
| + m_sums2 = vmlaq_f32(m_sums2, m_input, vld1q_f32(k2));
|
| + k2 += 4;
|
| + }
|
| +
|
| + // Linearly interpolate the two "convolutions".
|
| + m_sums1 = vmlaq_f32(
|
| + vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)),
|
| + m_sums2, vmovq_n_f32(kernel_interpolation_factor));
|
| +
|
| + // Sum components together.
|
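| +  // (add the high and low halves, then a pairwise add leaves the total in
|
| +  // lane 0.)
|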
| + float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1));
|
| + return vget_lane_f32(vpadd_f32(m_half, m_half), 0);
|
| +}
|
| +
|
| +float DotProduct_NEON(const float* a, const float* b, int len) {
|
| + const int rem = len % 4;
|
| + const int last_index = len - rem;
|
| +
|
| + // First sum all components.
|
| + float32x4_t m_sum = vmovq_n_f32(0);
|
| + for (int s = 0; s < last_index; s += 4)
|
| + m_sum = vmlaq_f32(m_sum, vld1q_f32(a + s), vld1q_f32(b + s));
|
| +
|
| + // Reduce to a single float for this channel.
|
| + float32x2_t m_half = vadd_f32(vget_high_f32(m_sum), vget_low_f32(m_sum));
|
| + float sum = vget_lane_f32(vpadd_f32(m_half, m_half), 0);
|
| +
|
| + // Handle any remaining values that wouldn't fit in an NEON pass.
|
| + for (int i = last_index; i < len; ++i)
|
| + sum += a[i] * b[i];
|
| +
|
| + return sum;
|
| +}
|
| +
|
| +void FMAC_NEON(const float* src, float scale, int len, float* dest) {
|
| const int rem = len % 4;
|
| const int last_index = len - rem;
|
| - float32x4_t m_scale = vmovq_n_f32(scale);
|
| + const float32x4_t m_scale = vmovq_n_f32(scale);
|
| for (int i = 0; i < last_index; i += 4) {
|
| - vst1q_f32(dest + i, vmlaq_f32(
|
| - vld1q_f32(dest + i), vld1q_f32(src + i), m_scale));
|
| + vst1q_f32(dest + i,
|
| + vmlaq_f32(vld1q_f32(dest + i), vld1q_f32(src + i), m_scale));
|
| }
|
|
|
| // Handle any remaining values that wouldn't fit in an NEON pass.
|
| @@ -209,10 +399,10 @@ void FMAC_NEON(const float src[], float scale, int len, float dest[]) {
|
| dest[i] += src[i] * scale;
|
| }
|
|
|
| -void FMUL_NEON(const float src[], float scale, int len, float dest[]) {
|
| +void FMUL_NEON(const float* src, float scale, int len, float* dest) {
|
| const int rem = len % 4;
|
| const int last_index = len - rem;
|
| - float32x4_t m_scale = vmovq_n_f32(scale);
|
| + const float32x4_t m_scale = vmovq_n_f32(scale);
|
| for (int i = 0; i < last_index; i += 4)
|
| vst1q_f32(dest + i, vmulq_f32(vld1q_f32(src + i), m_scale));
|
|
|
| @@ -221,8 +411,10 @@ void FMUL_NEON(const float src[], float scale, int len, float dest[]) {
|
| dest[i] = src[i] * scale;
|
| }
|
|
|
| -std::pair<float, float> EWMAAndMaxPower_NEON(
|
| - float initial_value, const float src[], int len, float smoothing_factor) {
|
| +std::pair<float, float> EWMAAndMaxPower_NEON(float initial_value,
|
| + const float* src,
|
| + int len,
|
| + float smoothing_factor) {
|
| // When the recurrence is unrolled, we see that we can split it into 4
|
| // separate lanes of evaluation:
|
| //
|
|
|