Chromium Code Reviews

Unified Diff: media/base/vector_math.cc

Issue 2556993002: Experiment with AVX optimizations for FMAC, FMUL operations.
Patch Set: Created 4 years ago
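
[Note] This patch replaces the compile-time selection of FMAC on x86 with run-time dispatch through function pointers chosen via base::CPU, and adds Convolve and DotProduct entry points that dispatch the same way. A minimal sketch of the dispatch pattern (illustrative only; FMAC_AVX/FMAC_SSE stand in for the real kernels referenced in the diff, and the surrounding scaffolding here is hypothetical):

#include "base/cpu.h"

void FMAC_AVX(const float* src, float scale, int len, float* dest);
void FMAC_SSE(const float* src, float scale, int len, float* dest);

using FmacProc = void (*)(const float*, float, int, float*);
static FmacProc g_fmac_proc_ = nullptr;

void Initialize() {
  // AVX FMAC is only worthwhile on AVX2-class hardware (see the comment in
  // the diff), so the AVX2 bit gates the choice.
  base::CPU cpu_info;
  g_fmac_proc_ = cpu_info.has_avx2() ? FMAC_AVX : FMAC_SSE;
}

void FMAC(const float* src, float scale, int len, float* dest) {
  g_fmac_proc_(src, scale, len, dest);  // Requires a prior Initialize() call.
}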
Index: media/base/vector_math.cc
diff --git a/media/base/vector_math.cc b/media/base/vector_math.cc
index 578290538ac26cee8080b144a39124ae344f8646..dd963d785b155e6d7bb2f229890ff56e728138f1 100644
--- a/media/base/vector_math.cc
+++ b/media/base/vector_math.cc
@@ -3,80 +3,158 @@
// found in the LICENSE file.
#include "media/base/vector_math.h"
-#include "media/base/vector_math_testing.h"
#include <algorithm>
+#include "base/cpu.h"
#include "base/logging.h"
#include "build/build_config.h"
+#include "media/base/vector_math_testing.h"
// NaCl does not allow intrinsics.
#if defined(ARCH_CPU_X86_FAMILY) && !defined(OS_NACL)
#include <xmmintrin.h>
+
+using FmacProc = void (*)(const float*, float, int, float*);
+static FmacProc g_fmac_proc_ = nullptr;
+using DotProductProc = float (*)(const float*, const float*, int);
+static DotProductProc g_dotproduct_proc_ = nullptr;
+using ConvolveProc = float (*)(const float*,
+                               const float*,
+                               const float*,
+                               double);
+static ConvolveProc g_convolve_proc_ = nullptr;
+
+// AVX FMAC only performs well on AVX2+ machines, which have true 256-bit
+// execution units rather than wrappers around 128-bit units.
+#define INITIALIZE()                                                         \
+  do {                                                                       \
+    CHECK(!g_fmac_proc_);                                                    \
+    CHECK(!g_convolve_proc_);                                                \
+    CHECK(!g_dotproduct_proc_);                                              \
+    base::CPU cpu_info;                                                      \
+    g_fmac_proc_ = cpu_info.has_avx2() ? FMAC_AVX : FMAC_SSE;                \
+    g_convolve_proc_ = cpu_info.has_avx() ? Convolve_AVX : Convolve_SSE;     \
+    g_dotproduct_proc_ =                                                     \
+        cpu_info.has_avx() ? DotProduct_AVX : DotProduct_SSE;                \
+  } while (0)
+
+#define CONVOLVE_FUNC g_convolve_proc_
+#define DOTPRODUCT_FUNC g_dotproduct_proc_
+#define FMAC_FUNC g_fmac_proc_
+
// Don't use custom SSE versions where the auto-vectorized C version performs
// better, which is anywhere clang is used.
#if !defined(__clang__)
-#define FMAC_FUNC FMAC_SSE
#define FMUL_FUNC FMUL_SSE
#else
-#define FMAC_FUNC FMAC_C
#define FMUL_FUNC FMUL_C
#endif
#define EWMAAndMaxPower_FUNC EWMAAndMaxPower_SSE
+
#elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
+
+// NEON optimized versions.
#include <arm_neon.h>
+#define INITIALIZE()
+#define CONVOLVE_FUNC Convolve_NEON
+#define DOTPRODUCT_FUNC DotProduct_NEON
#define FMAC_FUNC FMAC_NEON
#define FMUL_FUNC FMUL_NEON
#define EWMAAndMaxPower_FUNC EWMAAndMaxPower_NEON
+
#else
+
+// No SIMD optimization versions.
+#define INITIALIZE()
+#define CONVOLVE_FUNC Convolve_C
+#define DOTPRODUCT_FUNC DotProduct_C
#define FMAC_FUNC FMAC_C
#define FMUL_FUNC FMUL_C
#define EWMAAndMaxPower_FUNC EWMAAndMaxPower_C
+
#endif
namespace media {
namespace vector_math {
-void FMAC(const float src[], float scale, int len, float dest[]) {
-  // Ensure |src| and |dest| are 16-byte aligned.
+void Initialize() {
+  INITIALIZE();
+}
+
+float Convolve(const float* src,
+               const float* k1,
+               const float* k2,
+               double kernel_interpolation_factor) {
+  return CONVOLVE_FUNC(src, k1, k2, kernel_interpolation_factor);
+}
+
+float Convolve_C(const float* src,
+                 const float* k1,
+                 const float* k2,
+                 double kernel_interpolation_factor) {
+  float sum1 = 0;
+  float sum2 = 0;
+
+  // Generate a single output sample.
+  int n = kKernelSize;
+  while (n--) {
+    sum1 += *src * *k1++;
+    sum2 += *src++ * *k2++;
+  }
+
+  // Linearly interpolate the two "convolutions".
+  return static_cast<float>((1.0 - kernel_interpolation_factor) * sum1 +
+                            kernel_interpolation_factor * sum2);
+}
+
+float DotProduct(const float* a, const float* b, int len) {
+  return DOTPRODUCT_FUNC(a, b, len);
+}
+
+float DotProduct_C(const float* a, const float* b, int len) {
+  float sum = 0;
+  for (int i = 0; i < len; ++i)
+    sum += a[i] * b[i];
+  return sum;
+}
+
+void FMAC(const float* src, float scale, int len, float* dest) {
+  // Ensure |src| and |dest| are aligned.
DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(src) & (kRequiredAlignment - 1));
DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(dest) & (kRequiredAlignment - 1));
return FMAC_FUNC(src, scale, len, dest);
}
-void FMAC_C(const float src[], float scale, int len, float dest[]) {
+void FMAC_C(const float* src, float scale, int len, float* dest) {
for (int i = 0; i < len; ++i)
dest[i] += src[i] * scale;
}
-void FMUL(const float src[], float scale, int len, float dest[]) {
-  // Ensure |src| and |dest| are 16-byte aligned.
+void FMUL(const float* src, float scale, int len, float* dest) {
+  // Ensure |src| and |dest| are aligned.
DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(src) & (kRequiredAlignment - 1));
DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(dest) & (kRequiredAlignment - 1));
return FMUL_FUNC(src, scale, len, dest);
}
-void FMUL_C(const float src[], float scale, int len, float dest[]) {
+void FMUL_C(const float* src, float scale, int len, float* dest) {
for (int i = 0; i < len; ++i)
dest[i] = src[i] * scale;
}
-void Crossfade(const float src[], int len, float dest[]) {
-  float cf_ratio = 0;
-  const float cf_increment = 1.0f / len;
-  for (int i = 0; i < len; ++i, cf_ratio += cf_increment)
-    dest[i] = (1.0f - cf_ratio) * src[i] + cf_ratio * dest[i];
-}
-
-std::pair<float, float> EWMAAndMaxPower(
-    float initial_value, const float src[], int len, float smoothing_factor) {
+std::pair<float, float> EWMAAndMaxPower(float initial_value,
+                                        const float* src,
+                                        int len,
+                                        float smoothing_factor) {
// Ensure |src| is 16-byte aligned.
DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(src) & (kRequiredAlignment - 1));
return EWMAAndMaxPower_FUNC(initial_value, src, len, smoothing_factor);
}
-std::pair<float, float> EWMAAndMaxPower_C(
-    float initial_value, const float src[], int len, float smoothing_factor) {
+std::pair<float, float> EWMAAndMaxPower_C(float initial_value,
+                                          const float* src,
+                                          int len,
+                                          float smoothing_factor) {
std::pair<float, float> result(initial_value, 0.0f);
const float weight_prev = 1.0f - smoothing_factor;
for (int i = 0; i < len; ++i) {
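
[Note] A hedged usage sketch of the new dispatching entry points added in this hunk (hypothetical caller; assumes Initialize() has already run and that kKernelSize comes from vector_math.h):

#include "media/base/vector_math.h"

float InterpolatedConvolution(const float* input,
                              const float* kernel_a,
                              const float* kernel_b,
                              double factor) {
  // Dispatches to Convolve_AVX/Convolve_SSE/Convolve_NEON/Convolve_C based on
  // what Initialize() detected. |input| must cover kKernelSize floats.
  return media::vector_math::Convolve(input, kernel_a, kernel_b, factor);
}

float Correlate(const float* a, const float* b, int len) {
  return media::vector_math::DotProduct(a, b, len);
}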
@@ -90,10 +168,75 @@ std::pair<float, float> EWMAAndMaxPower_C(
}
#if defined(ARCH_CPU_X86_FAMILY) && !defined(OS_NACL)
-void FMUL_SSE(const float src[], float scale, int len, float dest[]) {
+float Convolve_SSE(const float* src,
+                   const float* k1,
+                   const float* k2,
+                   double kernel_interpolation_factor) {
+  __m128 m_input;
+  __m128 m_sums1 = _mm_setzero_ps();
+  __m128 m_sums2 = _mm_setzero_ps();
+
+  // Based on |src| alignment, we need to use loadu or load. Unrolling these
+  // loops hurts performance in local testing.
+  if (reinterpret_cast<uintptr_t>(src) & 0x0F) {
+    for (int i = 0; i < kKernelSize; i += 4) {
+      m_input = _mm_loadu_ps(src + i);
+      m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));
+      m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));
+    }
+  } else {
+    for (int i = 0; i < kKernelSize; i += 4) {
+      m_input = _mm_load_ps(src + i);
+      m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));
+      m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));
+    }
+  }
+
+  // Linearly interpolate the two "convolutions".
+  m_sums1 = _mm_mul_ps(
+      m_sums1,
+      _mm_set_ps1(static_cast<float>(1.0 - kernel_interpolation_factor)));
+  m_sums2 = _mm_mul_ps(
+      m_sums2, _mm_set_ps1(static_cast<float>(kernel_interpolation_factor)));
+  m_sums1 = _mm_add_ps(m_sums1, m_sums2);
+
+  // Sum components together.
+  float result;
+  m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1);
+  _mm_store_ss(&result,
+               _mm_add_ss(m_sums2, _mm_shuffle_ps(m_sums2, m_sums2, 1)));
+
+  return result;
+}
+
+float DotProduct_SSE(const float* a, const float* b, int len) {
const int rem = len % 4;
const int last_index = len - rem;
-  __m128 m_scale = _mm_set_ps1(scale);
+
+  // First sum all components.
+  __m128 m_sum = _mm_setzero_ps();
+  for (int s = 0; s < last_index; s += 4) {
+    m_sum = _mm_add_ps(m_sum,
+                       _mm_mul_ps(_mm_loadu_ps(a + s), _mm_loadu_ps(b + s)));
+  }
+
+  // Reduce to a single float for this channel. Sadly, SSE1/SSE2 don't have a
+  // horizontal sum instruction, so we have to condense manually.
+  float sum;
+  m_sum = _mm_add_ps(_mm_movehl_ps(m_sum, m_sum), m_sum);
+  _mm_store_ss(&sum, _mm_add_ss(m_sum, _mm_shuffle_ps(m_sum, m_sum, 1)));
+
+  // Handle any remaining values that wouldn't fit in an SSE pass.
+  for (int i = last_index; i < len; ++i)
+    sum += a[i] * b[i];
+
+  return sum;
+}
+
+void FMUL_SSE(const float* src, float scale, int len, float* dest) {
+  const int rem = len % 4;
+  const int last_index = len - rem;
+  const __m128 m_scale = _mm_set_ps1(scale);
for (int i = 0; i < last_index; i += 4)
_mm_store_ps(dest + i, _mm_mul_ps(_mm_load_ps(src + i), m_scale));
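
[Note] The movehl/shuffle pair in DotProduct_SSE and Convolve_SSE is the usual SSE1 horizontal sum; restated as a standalone helper (illustrative only, not part of the patch):

#include <xmmintrin.h>

static float HorizontalSum(__m128 v) {
  // {v0, v1, v2, v3} + {v2, v3, v2, v3}: lanes 0/1 now hold v0+v2 and v1+v3.
  v = _mm_add_ps(_mm_movehl_ps(v, v), v);
  // Add lane 1 into lane 0, giving (v0 + v2) + (v1 + v3), then extract it.
  float sum;
  _mm_store_ss(&sum, _mm_add_ss(v, _mm_shuffle_ps(v, v, 1)));
  return sum;
}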
@@ -102,13 +245,14 @@ void FMUL_SSE(const float src[], float scale, int len, float dest[]) {
dest[i] = src[i] * scale;
}
-void FMAC_SSE(const float src[], float scale, int len, float dest[]) {
+void FMAC_SSE(const float* src, float scale, int len, float* dest) {
const int rem = len % 4;
const int last_index = len - rem;
-  __m128 m_scale = _mm_set_ps1(scale);
+  const __m128 m_scale = _mm_set_ps1(scale);
for (int i = 0; i < last_index; i += 4) {
-    _mm_store_ps(dest + i, _mm_add_ps(_mm_load_ps(dest + i),
-                                      _mm_mul_ps(_mm_load_ps(src + i), m_scale)));
+    _mm_store_ps(dest + i,
+                 _mm_add_ps(_mm_load_ps(dest + i),
+                            _mm_mul_ps(_mm_load_ps(src + i), m_scale)));
}
// Handle any remaining values that wouldn't fit in an SSE pass.
@@ -120,12 +264,12 @@ void FMAC_SSE(const float src[], float scale, int len, float dest[]) {
// needed because compilers other than clang don't support access via
// operator[]().
#define EXTRACT_FLOAT(a, i) \
-  (i == 0 ? \
-   _mm_cvtss_f32(a) : \
-   _mm_cvtss_f32(_mm_shuffle_ps(a, a, i)))
+  (i == 0 ? _mm_cvtss_f32(a) : _mm_cvtss_f32(_mm_shuffle_ps(a, a, i)))
-std::pair<float, float> EWMAAndMaxPower_SSE(
-    float initial_value, const float src[], int len, float smoothing_factor) {
+std::pair<float, float> EWMAAndMaxPower_SSE(float initial_value,
+                                            const float* src,
+                                            int len,
+                                            float smoothing_factor) {
// When the recurrence is unrolled, we see that we can split it into 4
// separate lanes of evaluation:
//
@@ -159,10 +303,8 @@ std::pair<float, float> EWMAAndMaxPower_SSE(
const __m128 sample_x4 = _mm_load_ps(src + i);
const __m128 sample_squared_x4 = _mm_mul_ps(sample_x4, sample_x4);
max_x4 = _mm_max_ps(max_x4, sample_squared_x4);
-    // Note: The compiler optimizes this to a single multiply-and-accumulate
-    // instruction:
-    ewma_x4 = _mm_add_ps(ewma_x4,
-                         _mm_mul_ps(sample_squared_x4, smoothing_factor_x4));
+    ewma_x4 =
+        _mm_add_ps(ewma_x4, _mm_mul_ps(sample_squared_x4, smoothing_factor_x4));
}
// y[n] = z[n] + (1-a)^1(z[n-1]) + (1-a)^2(z[n-2]) + (1-a)^3(z[n-3])
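
[Note] The four-lane split these comments describe can be sanity-checked in scalar code. A hypothetical standalone check (ignores initial_value seeding and assumes len % 4 == 0):

#include <cstdio>

int main() {
  const float a = 0.25f;     // smoothing_factor
  const float w = 1.0f - a;  // weight_prev
  const float x[8] = {1, 2, 3, 4, 5, 6, 7, 8};

  // Scalar recurrence: y[n] = (1 - a) * y[n-1] + a * x[n]^2.
  float y = 0.0f;
  for (int i = 0; i < 8; ++i)
    y = y * w + x[i] * x[i] * a;

  // Four interleaved lanes, each stepping by 4 with weight (1-a)^4, then
  // recombined as z[n] + (1-a)^1 z[n-1] + (1-a)^2 z[n-2] + (1-a)^3 z[n-3].
  const float w4 = w * w * w * w;
  float z[4] = {0, 0, 0, 0};
  for (int i = 0; i < 8; i += 4) {
    for (int k = 0; k < 4; ++k)
      z[k] = z[k] * w4 + x[i + k] * x[i + k] * a;
  }
  const float y_lanes = z[3] + w * z[2] + w * w * z[1] + w * w * w * z[0];

  printf("scalar=%f lanes=%f\n", y, y_lanes);  // The two should match.
  return 0;
}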
@@ -195,13 +337,61 @@ std::pair<float, float> EWMAAndMaxPower_SSE(
#endif
#if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)
-void FMAC_NEON(const float src[], float scale, int len, float dest[]) {
+float Convolve_NEON(const float* src,
+                    const float* k1,
+                    const float* k2,
+                    double kernel_interpolation_factor) {
+  float32x4_t m_input;
+  float32x4_t m_sums1 = vmovq_n_f32(0);
+  float32x4_t m_sums2 = vmovq_n_f32(0);
+
+  const float* upper = src + kKernelSize;
+  while (src < upper) {
+    m_input = vld1q_f32(src);
+    src += 4;
+    m_sums1 = vmlaq_f32(m_sums1, m_input, vld1q_f32(k1));
+    k1 += 4;
+    m_sums2 = vmlaq_f32(m_sums2, m_input, vld1q_f32(k2));
+    k2 += 4;
+  }
+
+  // Linearly interpolate the two "convolutions".
+  m_sums1 = vmlaq_f32(
+      vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)),
+      m_sums2, vmovq_n_f32(kernel_interpolation_factor));
+
+  // Sum components together.
+  float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1));
+  return vget_lane_f32(vpadd_f32(m_half, m_half), 0);
+}
+
+float DotProduct_NEON(const float* a, const float* b, int len) {
+  const int rem = len % 4;
+  const int last_index = len - rem;
+
+  // First sum all components.
+  float32x4_t m_sum = vmovq_n_f32(0);
+  for (int s = 0; s < last_index; s += 4)
+    m_sum = vmlaq_f32(m_sum, vld1q_f32(a + s), vld1q_f32(b + s));
+
+  // Reduce to a single float for this channel.
+  float32x2_t m_half = vadd_f32(vget_high_f32(m_sum), vget_low_f32(m_sum));
+  float sum = vget_lane_f32(vpadd_f32(m_half, m_half), 0);
+
+  // Handle any remaining values that wouldn't fit in an NEON pass.
+  for (int i = last_index; i < len; ++i)
+    sum += a[i] * b[i];
+
+  return sum;
+}
+
+void FMAC_NEON(const float* src, float scale, int len, float* dest) {
const int rem = len % 4;
const int last_index = len - rem;
-  float32x4_t m_scale = vmovq_n_f32(scale);
+  const float32x4_t m_scale = vmovq_n_f32(scale);
for (int i = 0; i < last_index; i += 4) {
-    vst1q_f32(dest + i, vmlaq_f32(
-        vld1q_f32(dest + i), vld1q_f32(src + i), m_scale));
+    vst1q_f32(dest + i,
+              vmlaq_f32(vld1q_f32(dest + i), vld1q_f32(src + i), m_scale));
}
// Handle any remaining values that wouldn't fit in an NEON pass.
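
[Note] The NEON reduction in Convolve_NEON/DotProduct_NEON above condenses the vector the same way as the SSE path; a scalar view (illustrative only):

// vadd_f32(vget_high_f32(v), vget_low_f32(v)) adds the two halves, then
// vpadd_f32 pairs the remaining two lanes.
float NeonHorizontalSumScalarView(const float v[4]) {
  const float half0 = v[0] + v[2];  // low lane 0 + high lane 0
  const float half1 = v[1] + v[3];  // low lane 1 + high lane 1
  return half0 + half1;             // vpadd_f32 result, lane 0
}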
@@ -209,10 +399,10 @@ void FMAC_NEON(const float src[], float scale, int len, float dest[]) {
dest[i] += src[i] * scale;
}
-void FMUL_NEON(const float src[], float scale, int len, float dest[]) {
+void FMUL_NEON(const float* src, float scale, int len, float* dest) {
const int rem = len % 4;
const int last_index = len - rem;
-  float32x4_t m_scale = vmovq_n_f32(scale);
+  const float32x4_t m_scale = vmovq_n_f32(scale);
for (int i = 0; i < last_index; i += 4)
vst1q_f32(dest + i, vmulq_f32(vld1q_f32(src + i), m_scale));
@@ -221,8 +411,10 @@ void FMUL_NEON(const float src[], float scale, int len, float dest[]) {
dest[i] = src[i] * scale;
}
-std::pair<float, float> EWMAAndMaxPower_NEON(
-    float initial_value, const float src[], int len, float smoothing_factor) {
+std::pair<float, float> EWMAAndMaxPower_NEON(float initial_value,
+                                             const float* src,
+                                             int len,
+                                             float smoothing_factor) {
// When the recurrence is unrolled, we see that we can split it into 4
// separate lanes of evaluation:
//
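
[Note] Callers must respect the alignment DCHECKs in FMAC/FMUL; a minimal hypothetical caller, assuming kRequiredAlignment is the 16 bytes the comments mention:

#include "media/base/vector_math.h"

void ApplyGain() {
  alignas(16) float src[64] = {};
  alignas(16) float dest[64] = {};
  media::vector_math::Initialize();  // Select SIMD kernels once at startup.
  media::vector_math::FMUL(src, 0.5f, 64, dest);   // dest[i] = src[i] * 0.5f
  media::vector_math::FMAC(src, 0.25f, 64, dest);  // dest[i] += src[i] * 0.25f
}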