| OLD | NEW |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "media/base/vector_math.h" | 5 #include "media/base/vector_math.h" |
| 6 #include "media/base/vector_math_testing.h" | |
| 7 | 6 |
| 8 #include <algorithm> | 7 #include <algorithm> |
| 9 | 8 |
| 9 #include "base/cpu.h" |
| 10 #include "base/logging.h" | 10 #include "base/logging.h" |
| 11 #include "build/build_config.h" | 11 #include "build/build_config.h" |
| 12 #include "media/base/vector_math_testing.h" |
| 12 | 13 |
| 13 // NaCl does not allow intrinsics. | 14 // NaCl does not allow intrinsics. |
| 14 #if defined(ARCH_CPU_X86_FAMILY) && !defined(OS_NACL) | 15 #if defined(ARCH_CPU_X86_FAMILY) && !defined(OS_NACL) |
| 15 #include <xmmintrin.h> | 16 #include <xmmintrin.h> |
| 17 |
| 18 using FmacProc = void (*)(const float*, float, int, float*); |
| 19 static FmacProc g_fmac_proc_ = nullptr; |
| 20 using DotProductProc = float (*)(const float*, const float*, int); |
| 21 static DotProductProc g_dotproduct_proc_ = nullptr; |
| 22 using ConvolveProc = float (*)(const float*, |
| 23 const float*, |
| 24 const float*, |
| 25 double); |
| 26 static ConvolveProc g_convolve_proc_ = nullptr; |
| 27 |
| 28 // AVX FMAC only performs well on AVX2+ machines, since those have true |
| 29 // 256-bit execution units rather than wrappers around 128-bit units. |
| 30 #define INITIALIZE() \ |
| 31 do { \ |
| 32 CHECK(!g_fmac_proc_); \ |
| 33 CHECK(!g_convolve_proc_); \ |
| 34 CHECK(!g_dotproduct_proc_); \ |
| 35 base::CPU cpu_info; \ |
| 36 g_fmac_proc_ = cpu_info.has_avx2() ? FMAC_AVX : FMAC_SSE; \ |
| 37 g_convolve_proc_ = cpu_info.has_avx() ? Convolve_AVX : Convolve_SSE; \ |
| 38 g_dotproduct_proc_ = cpu_info.has_avx() ? DotProduct_AVX : DotProduct_SSE; \ |
| 39 } while (0) |
| 40 |
| 41 #define CONVOLVE_FUNC g_convolve_proc_ |
| 42 #define DOTPRODUCT_FUNC g_dotproduct_proc_ |
| 43 #define FMAC_FUNC g_fmac_proc_ |
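The new x86 path replaces compile-time selection with one-time runtime dispatch: probe CPU features once, park the winner in a function pointer, and route every later call through it. A minimal standalone sketch of the same pattern, with a stubbed-out feature probe standing in for base::CPU (all names below are illustrative, not the patch's):

```cpp
#include <cassert>
#include <cstdio>

// Hypothetical stand-in for base::CPU::has_avx2().
static bool HasAvx2() { return false; }

static void FmacScalar(const float* src, float scale, int len, float* dest) {
  for (int i = 0; i < len; ++i)
    dest[i] += src[i] * scale;
}

// In the real file the AVX body lives in a separately compiled translation
// unit built with AVX flags; here it just forwards to the scalar version.
static void FmacAvx2(const float* src, float scale, int len, float* dest) {
  FmacScalar(src, scale, len, dest);
}

using FmacProc = void (*)(const float*, float, int, float*);
static FmacProc g_fmac = nullptr;

void Initialize() {
  assert(!g_fmac);  // Mirrors the CHECK(): initialize exactly once.
  g_fmac = HasAvx2() ? FmacAvx2 : FmacScalar;
}

int main() {
  Initialize();
  float src[4] = {1, 2, 3, 4};
  float dest[4] = {0, 0, 0, 0};
  g_fmac(src, 0.5f, 4, dest);
  std::printf("%f %f\n", dest[0], dest[3]);  // 0.5 2.0
}
```

The CHECKs in INITIALIZE() enforce that the pointers are set exactly once, before any audio thread can race on them.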
| 44 |
| 16 // Don't use custom SSE versions where the auto-vectorized C version performs | 45 // Don't use custom SSE versions where the auto-vectorized C version performs |
| 17 // better, which is anywhere clang is used. | 46 // better, which is anywhere clang is used. |
| 18 #if !defined(__clang__) | 47 #if !defined(__clang__) |
| 19 #define FMAC_FUNC FMAC_SSE | |
| 20 #define FMUL_FUNC FMUL_SSE | 48 #define FMUL_FUNC FMUL_SSE |
| 21 #else | 49 #else |
| 22 #define FMAC_FUNC FMAC_C | |
| 23 #define FMUL_FUNC FMUL_C | 50 #define FMUL_FUNC FMUL_C |
| 24 #endif | 51 #endif |
| 25 #define EWMAAndMaxPower_FUNC EWMAAndMaxPower_SSE | 52 #define EWMAAndMaxPower_FUNC EWMAAndMaxPower_SSE |
| 53 |
| 26 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) | 54 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) |
| 55 |
| 56 // NEON optimized versions. |
| 27 #include <arm_neon.h> | 57 #include <arm_neon.h> |
| 58 #define INITIALIZE() |
| 59 #define CONVOLVE_FUNC Convolve_NEON |
| 60 #define DOTPRODUCT_FUNC DotProduct_NEON |
| 28 #define FMAC_FUNC FMAC_NEON | 61 #define FMAC_FUNC FMAC_NEON |
| 29 #define FMUL_FUNC FMUL_NEON | 62 #define FMUL_FUNC FMUL_NEON |
| 30 #define EWMAAndMaxPower_FUNC EWMAAndMaxPower_NEON | 63 #define EWMAAndMaxPower_FUNC EWMAAndMaxPower_NEON |
| 64 |
| 31 #else | 65 #else |
| 66 |
| 67 // No SIMD optimization versions. |
| 68 #define INITIALIZE() |
| 69 #define CONVOLVE_FUNC Convolve_C |
| 70 #define DOTPRODUCT_FUNC DotProduct_C |
| 32 #define FMAC_FUNC FMAC_C | 71 #define FMAC_FUNC FMAC_C |
| 33 #define FMUL_FUNC FMUL_C | 72 #define FMUL_FUNC FMUL_C |
| 34 #define EWMAAndMaxPower_FUNC EWMAAndMaxPower_C | 73 #define EWMAAndMaxPower_FUNC EWMAAndMaxPower_C |
| 74 |
| 35 #endif | 75 #endif |
| 36 | 76 |
| 37 namespace media { | 77 namespace media { |
| 38 namespace vector_math { | 78 namespace vector_math { |
| 39 | 79 |
| 40 void FMAC(const float src[], float scale, int len, float dest[]) { | 80 void Initialize() { |
| 41 // Ensure |src| and |dest| are 16-byte aligned. | 81 INITIALIZE(); |
| 82 } |
| 83 |
| 84 float Convolve(const float* src, |
| 85 const float* k1, |
| 86 const float* k2, |
| 87 double kernel_interpolation_factor) { |
| 88 return CONVOLVE_FUNC(src, k1, k2, kernel_interpolation_factor); |
| 89 } |
| 90 |
| 91 float Convolve_C(const float* src, |
| 92 const float* k1, |
| 93 const float* k2, |
| 94 double kernel_interpolation_factor) { |
| 95 float sum1 = 0; |
| 96 float sum2 = 0; |
| 97 |
| 98 // Generate a single output sample. |
| 99 int n = kKernelSize; |
| 100 while (n--) { |
| 101 sum1 += *src * *k1++; |
| 102 sum2 += *src++ * *k2++; |
| 103 } |
| 104 |
| 105 // Linearly interpolate the two "convolutions". |
| 106 return static_cast<float>((1.0 - kernel_interpolation_factor) * sum1 + |
| 107 kernel_interpolation_factor * sum2); |
| 108 } |
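For reference, Convolve() runs two kKernelSize-tap filters over the same input window and linearly blends the two sums, which lets a resampler interpolate between kernels precomputed at adjacent offsets. A tiny worked example of the C path (kKernelSize shrunk to 4 purely for illustration; the real constant comes from vector_math.h):

```cpp
#include <cstdio>

// Illustrative kernel size; the real kKernelSize is in vector_math.h.
constexpr int kKernelSize = 4;

float Convolve_C(const float* src, const float* k1, const float* k2,
                 double kernel_interpolation_factor) {
  float sum1 = 0, sum2 = 0;
  int n = kKernelSize;
  while (n--) {
    sum1 += *src * *k1++;
    sum2 += *src++ * *k2++;
  }
  return static_cast<float>((1.0 - kernel_interpolation_factor) * sum1 +
                            kernel_interpolation_factor * sum2);
}

int main() {
  const float src[kKernelSize] = {1, 2, 3, 4};
  const float k1[kKernelSize] = {0.25f, 0.25f, 0.25f, 0.25f};  // box filter
  const float k2[kKernelSize] = {0, 0, 1, 0};                  // picks src[2]
  // factor 0 yields k1's sum (2.5), factor 1 yields k2's (3.0);
  // factor 0.5 blends them: 2.75.
  std::printf("%f\n", Convolve_C(src, k1, k2, 0.5));
}
```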
| 109 |
| 110 float DotProduct(const float* a, const float* b, int len) { |
| 111 return DOTPRODUCT_FUNC(a, b, len); |
| 112 } |
| 113 |
| 114 float DotProduct_C(const float* a, const float* b, int len) { |
| 115 float sum = 0; |
| 116 for (int i = 0; i < len; ++i) |
| 117 sum += a[i] * b[i]; |
| 118 return sum; |
| 119 } |
| 120 |
| 121 void FMAC(const float* src, float scale, int len, float* dest) { |
| 122 // Ensure |src| and |dest| are aligned. |
| 42 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(src) & (kRequiredAlignment - 1)); | 123 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(src) & (kRequiredAlignment - 1)); |
| 43 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(dest) & (kRequiredAlignment - 1)); | 124 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(dest) & (kRequiredAlignment - 1)); |
| 44 return FMAC_FUNC(src, scale, len, dest); | 125 return FMAC_FUNC(src, scale, len, dest); |
| 45 } | 126 } |
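Callers must hand FMAC()/FMUL() buffers aligned to kRequiredAlignment, or the DCHECKs (and the aligned _mm_load_ps/_mm_store_ps paths) will trip. A sketch of satisfying that contract with C++17's std::aligned_alloc; Chromium code would normally go through base::AlignedAlloc from base/memory/aligned_memory.h instead, and the 16 here is just an illustrative value:

```cpp
#include <cstdint>
#include <cstdio>
#include <cstdlib>

// Illustrative stand-in for the constant exported by vector_math.h.
constexpr size_t kRequiredAlignment = 16;

int main() {
  constexpr int len = 64;
  // std::aligned_alloc requires the size be a multiple of the alignment;
  // 64 * 4 = 256 bytes satisfies that. (Not available on MSVC, which would
  // use _aligned_malloc.)
  float* src = static_cast<float*>(
      std::aligned_alloc(kRequiredAlignment, len * sizeof(float)));
  float* dest = static_cast<float*>(
      std::aligned_alloc(kRequiredAlignment, len * sizeof(float)));
  for (int i = 0; i < len; ++i) {
    src[i] = 1.0f;
    dest[i] = 0.0f;
  }
  // Both pointers now satisfy the DCHECKs in FMAC()/FMUL().
  std::printf("aligned: %d\n",
              !(reinterpret_cast<uintptr_t>(src) & (kRequiredAlignment - 1)));
  std::free(src);
  std::free(dest);
}
```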
| 46 | 127 |
| 47 void FMAC_C(const float src[], float scale, int len, float dest[]) { | 128 void FMAC_C(const float* src, float scale, int len, float* dest) { |
| 48 for (int i = 0; i < len; ++i) | 129 for (int i = 0; i < len; ++i) |
| 49 dest[i] += src[i] * scale; | 130 dest[i] += src[i] * scale; |
| 50 } | 131 } |
| 51 | 132 |
| 52 void FMUL(const float src[], float scale, int len, float dest[]) { | 133 void FMUL(const float* src, float scale, int len, float* dest) { |
| 53 // Ensure |src| and |dest| are 16-byte aligned. | 134 // Ensure |src| and |dest| are aligned. |
| 54 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(src) & (kRequiredAlignment - 1)); | 135 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(src) & (kRequiredAlignment - 1)); |
| 55 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(dest) & (kRequiredAlignment - 1)); | 136 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(dest) & (kRequiredAlignment - 1)); |
| 56 return FMUL_FUNC(src, scale, len, dest); | 137 return FMUL_FUNC(src, scale, len, dest); |
| 57 } | 138 } |
| 58 | 139 |
| 59 void FMUL_C(const float src[], float scale, int len, float dest[]) { | 140 void FMUL_C(const float* src, float scale, int len, float* dest) { |
| 60 for (int i = 0; i < len; ++i) | 141 for (int i = 0; i < len; ++i) |
| 61 dest[i] = src[i] * scale; | 142 dest[i] = src[i] * scale; |
| 62 } | 143 } |
| 63 | 144 |
| 64 void Crossfade(const float src[], int len, float dest[]) { | 145 std::pair<float, float> EWMAAndMaxPower(float initial_value, |
| 65 float cf_ratio = 0; | 146 const float* src, |
| 66 const float cf_increment = 1.0f / len; | 147 int len, |
| 67 for (int i = 0; i < len; ++i, cf_ratio += cf_increment) | 148 float smoothing_factor) { |
| 68 dest[i] = (1.0f - cf_ratio) * src[i] + cf_ratio * dest[i]; | |
| 69 } | |
| 70 | |
| 71 std::pair<float, float> EWMAAndMaxPower( | |
| 72 float initial_value, const float src[], int len, float smoothing_factor) { | |
| 73 // Ensure |src| is 16-byte aligned. | 149 // Ensure |src| is aligned. |
| 74 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(src) & (kRequiredAlignment - 1)); | 150 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(src) & (kRequiredAlignment - 1)); |
| 75 return EWMAAndMaxPower_FUNC(initial_value, src, len, smoothing_factor); | 151 return EWMAAndMaxPower_FUNC(initial_value, src, len, smoothing_factor); |
| 76 } | 152 } |
| 77 | 153 |
| 78 std::pair<float, float> EWMAAndMaxPower_C( | 154 std::pair<float, float> EWMAAndMaxPower_C(float initial_value, |
| 79 float initial_value, const float src[], int len, float smoothing_factor) { | 155 const float* src, |
| 156 int len, |
| 157 float smoothing_factor) { |
| 80 std::pair<float, float> result(initial_value, 0.0f); | 158 std::pair<float, float> result(initial_value, 0.0f); |
| 81 const float weight_prev = 1.0f - smoothing_factor; | 159 const float weight_prev = 1.0f - smoothing_factor; |
| 82 for (int i = 0; i < len; ++i) { | 160 for (int i = 0; i < len; ++i) { |
| 83 result.first *= weight_prev; | 161 result.first *= weight_prev; |
| 84 const float sample = src[i]; | 162 const float sample = src[i]; |
| 85 const float sample_squared = sample * sample; | 163 const float sample_squared = sample * sample; |
| 86 result.first += sample_squared * smoothing_factor; | 164 result.first += sample_squared * smoothing_factor; |
| 87 result.second = std::max(result.second, sample_squared); | 165 result.second = std::max(result.second, sample_squared); |
| 88 } | 166 } |
| 89 return result; | 167 return result; |
| 90 } | 168 } |
| 91 | 169 |
| 92 #if defined(ARCH_CPU_X86_FAMILY) && !defined(OS_NACL) | 170 #if defined(ARCH_CPU_X86_FAMILY) && !defined(OS_NACL) |
| 93 void FMUL_SSE(const float src[], float scale, int len, float dest[]) { | 171 float Convolve_SSE(const float* src, |
| 172 const float* k1, |
| 173 const float* k2, |
| 174 double kernel_interpolation_factor) { |
| 175 __m128 m_input; |
| 176 __m128 m_sums1 = _mm_setzero_ps(); |
| 177 __m128 m_sums2 = _mm_setzero_ps(); |
| 178 |
| 179 // Based on |src| alignment, we need to use loadu or load. Unrolling |
| 180 // these loops hurt performance in local testing. |
| 181 if (reinterpret_cast<uintptr_t>(src) & 0x0F) { |
| 182 for (int i = 0; i < kKernelSize; i += 4) { |
| 183 m_input = _mm_loadu_ps(src + i); |
| 184 m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); |
| 185 m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); |
| 186 } |
| 187 } else { |
| 188 for (int i = 0; i < kKernelSize; i += 4) { |
| 189 m_input = _mm_load_ps(src + i); |
| 190 m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); |
| 191 m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); |
| 192 } |
| 193 } |
| 194 |
| 195 // Linearly interpolate the two "convolutions". |
| 196 m_sums1 = _mm_mul_ps( |
| 197 m_sums1, |
| 198 _mm_set_ps1(static_cast<float>(1.0 - kernel_interpolation_factor))); |
| 199 m_sums2 = _mm_mul_ps( |
| 200 m_sums2, _mm_set_ps1(static_cast<float>(kernel_interpolation_factor))); |
| 201 m_sums1 = _mm_add_ps(m_sums1, m_sums2); |
| 202 |
| 203 // Sum components together. |
| 204 float result; |
| 205 m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1); |
| 206 _mm_store_ss(&result, |
| 207 _mm_add_ss(m_sums2, _mm_shuffle_ps(m_sums2, m_sums2, 1))); |
| 208 |
| 209 return result; |
| 210 } |
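The loadu-vs-load split above exists because _mm_load_ps faults on addresses that are not 16-byte aligned, while _mm_loadu_ps accepts any address at some speed cost. A standalone sketch of the same low-four-bits test on a deliberately misaligned pointer:

```cpp
#include <cstdint>
#include <cstdio>
#include <xmmintrin.h>

int main() {
  alignas(16) float buf[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  const float* p = buf + 1;  // deliberately misaligned by 4 bytes
  // The same test Convolve_SSE uses: nonzero low four address bits mean |p|
  // is not 16-byte aligned, so _mm_load_ps would fault; fall back to loadu.
  __m128 v = (reinterpret_cast<uintptr_t>(p) & 0x0F) ? _mm_loadu_ps(p)
                                                     : _mm_load_ps(p);
  float out[4];
  _mm_storeu_ps(out, v);
  std::printf("%f %f %f %f\n", out[0], out[1], out[2], out[3]);  // 1 2 3 4
}
```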
| 211 |
| 212 float DotProduct_SSE(const float* a, const float* b, int len) { |
| 94 const int rem = len % 4; | 213 const int rem = len % 4; |
| 95 const int last_index = len - rem; | 214 const int last_index = len - rem; |
| 96 __m128 m_scale = _mm_set_ps1(scale); | 215 |
| 216 // First sum all components. |
| 217 __m128 m_sum = _mm_setzero_ps(); |
| 218 for (int s = 0; s < last_index; s += 4) { |
| 219 m_sum = |
| 220 _mm_add_ps(m_sum, _mm_mul_ps(_mm_loadu_ps(a + s), _mm_loadu_ps(b + s))); |
| 221 } |
| 222 |
| 223 // Reduce to a single float for this channel. Sadly, SSE1/SSE2 don't have |
| 224 // a horizontal sum instruction, so we have to condense manually. |
| 225 float sum; |
| 226 m_sum = _mm_add_ps(_mm_movehl_ps(m_sum, m_sum), m_sum); |
| 227 _mm_store_ss(&sum, _mm_add_ss(m_sum, _mm_shuffle_ps(m_sum, m_sum, 1))); |
| 228 |
| 229 // Handle any remaining values that wouldn't fit in an SSE pass. |
| 230 for (int i = last_index; i < len; ++i) |
| 231 sum += a[i] * b[i]; |
| 232 |
| 233 return sum; |
| 234 } |
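The movehl/shuffle pair is the stock SSE1 horizontal add that both Convolve_SSE and DotProduct_SSE end with: fold the high two lanes onto the low two, then add lane 1 into lane 0. Isolated as a helper for clarity (the helper name is mine, not the patch's):

```cpp
#include <cstdio>
#include <xmmintrin.h>

// Sums the four lanes of |v| using only SSE1 operations.
static float HorizontalSum(__m128 v) {
  // v = (a, b, c, d), lane 0 first.
  // movehl folds (c, d) onto (a, b): t = (a+c, b+d, ...).
  __m128 t = _mm_add_ps(_mm_movehl_ps(v, v), v);
  float result;
  // Shuffle lane 1 into lane 0 and add: (a+c) + (b+d).
  _mm_store_ss(&result, _mm_add_ss(t, _mm_shuffle_ps(t, t, 1)));
  return result;
}

int main() {
  const __m128 v = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
  std::printf("%f\n", HorizontalSum(v));  // 10.0
}
```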
| 235 |
| 236 void FMUL_SSE(const float* src, float scale, int len, float* dest) { |
| 237 const int rem = len % 4; |
| 238 const int last_index = len - rem; |
| 239 const __m128 m_scale = _mm_set_ps1(scale); |
| 97 for (int i = 0; i < last_index; i += 4) | 240 for (int i = 0; i < last_index; i += 4) |
| 98 _mm_store_ps(dest + i, _mm_mul_ps(_mm_load_ps(src + i), m_scale)); | 241 _mm_store_ps(dest + i, _mm_mul_ps(_mm_load_ps(src + i), m_scale)); |
| 99 | 242 |
| 100 // Handle any remaining values that wouldn't fit in an SSE pass. | 243 // Handle any remaining values that wouldn't fit in an SSE pass. |
| 101 for (int i = last_index; i < len; ++i) | 244 for (int i = last_index; i < len; ++i) |
| 102 dest[i] = src[i] * scale; | 245 dest[i] = src[i] * scale; |
| 103 } | 246 } |
| 104 | 247 |
| 105 void FMAC_SSE(const float src[], float scale, int len, float dest[]) { | 248 void FMAC_SSE(const float* src, float scale, int len, float* dest) { |
| 106 const int rem = len % 4; | 249 const int rem = len % 4; |
| 107 const int last_index = len - rem; | 250 const int last_index = len - rem; |
| 108 __m128 m_scale = _mm_set_ps1(scale); | 251 const __m128 m_scale = _mm_set_ps1(scale); |
| 109 for (int i = 0; i < last_index; i += 4) { | 252 for (int i = 0; i < last_index; i += 4) { |
| 110 _mm_store_ps(dest + i, _mm_add_ps(_mm_load_ps(dest + i), | 253 _mm_store_ps(dest + i, |
| 111 _mm_mul_ps(_mm_load_ps(src + i), m_scale))); | 254 _mm_add_ps(_mm_load_ps(dest + i), |
| 255 _mm_mul_ps(_mm_load_ps(src + i), m_scale))); |
| 112 } | 256 } |
| 113 | 257 |
| 114 // Handle any remaining values that wouldn't fit in an SSE pass. | 258 // Handle any remaining values that wouldn't fit in an SSE pass. |
| 115 for (int i = last_index; i < len; ++i) | 259 for (int i = last_index; i < len; ++i) |
| 116 dest[i] += src[i] * scale; | 260 dest[i] += src[i] * scale; |
| 117 } | 261 } |
| 118 | 262 |
| 119 // Convenience macro to extract float 0 through 3 from the vector |a|. This is | 263 // Convenience macro to extract float 0 through 3 from the vector |a|. This is |
| 120 // needed because compilers other than clang don't support access via | 264 // needed because compilers other than clang don't support access via |
| 121 // operator[](). | 265 // operator[](). |
| 122 #define EXTRACT_FLOAT(a, i) \ | 266 #define EXTRACT_FLOAT(a, i) \ |
| 123 (i == 0 ? \ | 267 (i == 0 ? _mm_cvtss_f32(a) : _mm_cvtss_f32(_mm_shuffle_ps(a, a, i))) |
| 124 _mm_cvtss_f32(a) : \ | |
| 125 _mm_cvtss_f32(_mm_shuffle_ps(a, a, i))) | |
| 126 | 268 |
| 127 std::pair<float, float> EWMAAndMaxPower_SSE( | 269 std::pair<float, float> EWMAAndMaxPower_SSE(float initial_value, |
| 128 float initial_value, const float src[], int len, float smoothing_factor) { | 270 const float* src, |
| 271 int len, |
| 272 float smoothing_factor) { |
| 129 // When the recurrence is unrolled, we see that we can split it into 4 | 273 // When the recurrence is unrolled, we see that we can split it into 4 |
| 130 // separate lanes of evaluation: | 274 // separate lanes of evaluation: |
| 131 // | 275 // |
| 132 // y[n] = a(S[n]^2) + (1-a)(y[n-1]) | 276 // y[n] = a(S[n]^2) + (1-a)(y[n-1]) |
| 133 // = a(S[n]^2) + (1-a)^1(aS[n-1]^2) + (1-a)^2(aS[n-2]^2) + ... | 277 // = a(S[n]^2) + (1-a)^1(aS[n-1]^2) + (1-a)^2(aS[n-2]^2) + ... |
| 134 // = z[n] + (1-a)^1(z[n-1]) + (1-a)^2(z[n-2]) + (1-a)^3(z[n-3]) | 278 // = z[n] + (1-a)^1(z[n-1]) + (1-a)^2(z[n-2]) + (1-a)^3(z[n-3]) |
| 135 // | 279 // |
| 136 // where z[n] = a(S[n]^2) + (1-a)^4(z[n-4]) + (1-a)^8(z[n-8]) + ... | 280 // where z[n] = a(S[n]^2) + (1-a)^4(z[n-4]) + (1-a)^8(z[n-8]) + ... |
| 137 // | 281 // |
| 138 // Thus, the strategy here is to compute z[n], z[n-1], z[n-2], and z[n-3] in | 282 // Thus, the strategy here is to compute z[n], z[n-1], z[n-2], and z[n-3] in |
| (...skipping 13 matching lines...) |
| 152 // Compute z[n], z[n-1], z[n-2], and z[n-3] in parallel in lanes 3, 2, 1 and | 296 // Compute z[n], z[n-1], z[n-2], and z[n-3] in parallel in lanes 3, 2, 1 and |
| 153 // 0, respectively. | 297 // 0, respectively. |
| 154 __m128 max_x4 = _mm_setzero_ps(); | 298 __m128 max_x4 = _mm_setzero_ps(); |
| 155 __m128 ewma_x4 = _mm_setr_ps(0.0f, 0.0f, 0.0f, initial_value); | 299 __m128 ewma_x4 = _mm_setr_ps(0.0f, 0.0f, 0.0f, initial_value); |
| 156 int i; | 300 int i; |
| 157 for (i = 0; i < last_index; i += 4) { | 301 for (i = 0; i < last_index; i += 4) { |
| 158 ewma_x4 = _mm_mul_ps(ewma_x4, weight_prev_4th_x4); | 302 ewma_x4 = _mm_mul_ps(ewma_x4, weight_prev_4th_x4); |
| 159 const __m128 sample_x4 = _mm_load_ps(src + i); | 303 const __m128 sample_x4 = _mm_load_ps(src + i); |
| 160 const __m128 sample_squared_x4 = _mm_mul_ps(sample_x4, sample_x4); | 304 const __m128 sample_squared_x4 = _mm_mul_ps(sample_x4, sample_x4); |
| 161 max_x4 = _mm_max_ps(max_x4, sample_squared_x4); | 305 max_x4 = _mm_max_ps(max_x4, sample_squared_x4); |
| 162 // Note: The compiler optimizes this to a single multiply-and-accumulate | 306 ewma_x4 = |
| 163 // instruction: | 307 _mm_add_ps(ewma_x4, _mm_mul_ps(sample_squared_x4, smoothing_factor_x4)); |
| 164 ewma_x4 = _mm_add_ps(ewma_x4, | |
| 165 _mm_mul_ps(sample_squared_x4, smoothing_factor_x4)); | |
| 166 } | 308 } |
| 167 | 309 |
| 168 // y[n] = z[n] + (1-a)^1(z[n-1]) + (1-a)^2(z[n-2]) + (1-a)^3(z[n-3]) | 310 // y[n] = z[n] + (1-a)^1(z[n-1]) + (1-a)^2(z[n-2]) + (1-a)^3(z[n-3]) |
| 169 float ewma = EXTRACT_FLOAT(ewma_x4, 3); | 311 float ewma = EXTRACT_FLOAT(ewma_x4, 3); |
| 170 ewma_x4 = _mm_mul_ps(ewma_x4, weight_prev_x4); | 312 ewma_x4 = _mm_mul_ps(ewma_x4, weight_prev_x4); |
| 171 ewma += EXTRACT_FLOAT(ewma_x4, 2); | 313 ewma += EXTRACT_FLOAT(ewma_x4, 2); |
| 172 ewma_x4 = _mm_mul_ps(ewma_x4, weight_prev_x4); | 314 ewma_x4 = _mm_mul_ps(ewma_x4, weight_prev_x4); |
| 173 ewma += EXTRACT_FLOAT(ewma_x4, 1); | 315 ewma += EXTRACT_FLOAT(ewma_x4, 1); |
| 174 ewma_x4 = _mm_mul_ss(ewma_x4, weight_prev_x4); | 316 ewma_x4 = _mm_mul_ss(ewma_x4, weight_prev_x4); |
| 175 ewma += EXTRACT_FLOAT(ewma_x4, 0); | 317 ewma += EXTRACT_FLOAT(ewma_x4, 0); |
| (...skipping 12 matching lines...) |
| 188 const float sample_squared = sample * sample; | 330 const float sample_squared = sample * sample; |
| 189 result.first += sample_squared * smoothing_factor; | 331 result.first += sample_squared * smoothing_factor; |
| 190 result.second = std::max(result.second, sample_squared); | 332 result.second = std::max(result.second, sample_squared); |
| 191 } | 333 } |
| 192 | 334 |
| 193 return result; | 335 return result; |
| 194 } | 336 } |
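The lane decomposition in the comment above is easy to sanity-check in scalar code: advancing four interleaved sub-recurrences by (1-a)^4 and recombining them with weights (1-a)^0 through (1-a)^3 must equal the direct recurrence. A hand-rolled check (seeded with y[-1] = 0 for brevity, whereas the SSE code seeds lane 3 with initial_value):

```cpp
#include <cstdio>

int main() {
  const float a = 0.25f;     // smoothing factor
  const float w = 1.0f - a;  // weight_prev
  const float s[8] = {1, -2, 3, 0.5f, -1, 2, 0.25f, 4};

  // Direct recurrence: y[n] = a*s[n]^2 + (1-a)*y[n-1], with y[-1] = 0.
  float y = 0.0f;
  for (float v : s)
    y = a * v * v + w * y;

  // Four lanes, each advanced with weight (1-a)^4, as the SSE loop does.
  float z[4] = {0, 0, 0, 0};
  const float w4 = w * w * w * w;
  for (int n = 0; n < 8; n += 4)
    for (int lane = 0; lane < 4; ++lane)
      z[lane] = a * s[n + lane] * s[n + lane] + w4 * z[lane];

  // y[n] = z[n] + w*z[n-1] + w^2*z[n-2] + w^3*z[n-3].
  const float combined = z[3] + w * z[2] + w * w * z[1] + w * w * w * z[0];
  std::printf("direct=%f lanes=%f\n", y, combined);  // match, up to rounding
}
```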
| 195 #endif | 337 #endif |
| 196 | 338 |
| 197 #if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) | 339 #if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) |
| 198 void FMAC_NEON(const float src[], float scale, int len, float dest[]) { | 340 float Convolve_NEON(const float* src, |
| 341 const float* k1, |
| 342 const float* k2, |
| 343 double kernel_interpolation_factor) { |
| 344 float32x4_t m_input; |
| 345 float32x4_t m_sums1 = vmovq_n_f32(0); |
| 346 float32x4_t m_sums2 = vmovq_n_f32(0); |
| 347 |
| 348 const float* upper = src + kKernelSize; |
| 349 for (; src < upper;) { |
| 350 m_input = vld1q_f32(src); |
| 351 src += 4; |
| 352 m_sums1 = vmlaq_f32(m_sums1, m_input, vld1q_f32(k1)); |
| 353 k1 += 4; |
| 354 m_sums2 = vmlaq_f32(m_sums2, m_input, vld1q_f32(k2)); |
| 355 k2 += 4; |
| 356 } |
| 357 |
| 358 // Linearly interpolate the two "convolutions". |
| 359 m_sums1 = vmlaq_f32( |
| 360 vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)), |
| 361 m_sums2, vmovq_n_f32(kernel_interpolation_factor)); |
| 362 |
| 363 // Sum components together. |
| 364 float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1)); |
| 365 return vget_lane_f32(vpadd_f32(m_half, m_half), 0); |
| 366 } |
| 367 |
| 368 float DotProduct_NEON(const float* a, const float* b, int len) { |
| 199 const int rem = len % 4; | 369 const int rem = len % 4; |
| 200 const int last_index = len - rem; | 370 const int last_index = len - rem; |
| 201 float32x4_t m_scale = vmovq_n_f32(scale); | 371 |
| 372 // First sum all components. |
| 373 float32x4_t m_sum = vmovq_n_f32(0); |
| 374 for (int s = 0; s < last_index; s += 4) |
| 375 m_sum = vmlaq_f32(m_sum, vld1q_f32(a + s), vld1q_f32(b + s)); |
| 376 |
| 377 // Reduce to a single float for this channel. |
| 378 float32x2_t m_half = vadd_f32(vget_high_f32(m_sum), vget_low_f32(m_sum)); |
| 379 float sum = vget_lane_f32(vpadd_f32(m_half, m_half), 0); |
| 380 |
| 381 // Handle any remaining values that wouldn't fit in an NEON pass. |
| 382 for (int i = last_index; i < len; ++i) |
| 383 sum += a[i] * b[i]; |
| 384 |
| 385 return sum; |
| 386 } |
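NEON also lacks a single-instruction horizontal sum; the paths above fold the high and low halves with vadd_f32 and finish with a pairwise vpadd_f32. The same reduction isolated (an ARM-only sketch; compile for a NEON target):

```cpp
#include <arm_neon.h>
#include <cstdio>

// Collapses a float32x4_t to a float the way the NEON paths above do:
// (a, b, c, d) -> (a+c, b+d) -> (a+c)+(b+d) in lane 0 of the pairwise add.
static float HorizontalSum(float32x4_t v) {
  float32x2_t half = vadd_f32(vget_high_f32(v), vget_low_f32(v));
  return vget_lane_f32(vpadd_f32(half, half), 0);
}

int main() {
  const float data[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  std::printf("%f\n", HorizontalSum(vld1q_f32(data)));  // 10.0
}
```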
| 387 |
| 388 void FMAC_NEON(const float* src, float scale, int len, float* dest) { |
| 389 const int rem = len % 4; |
| 390 const int last_index = len - rem; |
| 391 const float32x4_t m_scale = vmovq_n_f32(scale); |
| 202 for (int i = 0; i < last_index; i += 4) { | 392 for (int i = 0; i < last_index; i += 4) { |
| 203 vst1q_f32(dest + i, vmlaq_f32( | 393 vst1q_f32(dest + i, |
| 204 vld1q_f32(dest + i), vld1q_f32(src + i), m_scale)); | 394 vmlaq_f32(vld1q_f32(dest + i), vld1q_f32(src + i), m_scale)); |
| 205 } | 395 } |
| 206 | 396 |
| 207 // Handle any remaining values that wouldn't fit in an NEON pass. | 397 // Handle any remaining values that wouldn't fit in an NEON pass. |
| 208 for (int i = last_index; i < len; ++i) | 398 for (int i = last_index; i < len; ++i) |
| 209 dest[i] += src[i] * scale; | 399 dest[i] += src[i] * scale; |
| 210 } | 400 } |
| 211 | 401 |
| 212 void FMUL_NEON(const float src[], float scale, int len, float dest[]) { | 402 void FMUL_NEON(const float* src, float scale, int len, float* dest) { |
| 213 const int rem = len % 4; | 403 const int rem = len % 4; |
| 214 const int last_index = len - rem; | 404 const int last_index = len - rem; |
| 215 float32x4_t m_scale = vmovq_n_f32(scale); | 405 const float32x4_t m_scale = vmovq_n_f32(scale); |
| 216 for (int i = 0; i < last_index; i += 4) | 406 for (int i = 0; i < last_index; i += 4) |
| 217 vst1q_f32(dest + i, vmulq_f32(vld1q_f32(src + i), m_scale)); | 407 vst1q_f32(dest + i, vmulq_f32(vld1q_f32(src + i), m_scale)); |
| 218 | 408 |
| 219 // Handle any remaining values that wouldn't fit in an NEON pass. | 409 // Handle any remaining values that wouldn't fit in an NEON pass. |
| 220 for (int i = last_index; i < len; ++i) | 410 for (int i = last_index; i < len; ++i) |
| 221 dest[i] = src[i] * scale; | 411 dest[i] = src[i] * scale; |
| 222 } | 412 } |
| 223 | 413 |
| 224 std::pair<float, float> EWMAAndMaxPower_NEON( | 414 std::pair<float, float> EWMAAndMaxPower_NEON(float initial_value, |
| 225 float initial_value, const float src[], int len, float smoothing_factor) { | 415 const float* src, |
| 416 int len, |
| 417 float smoothing_factor) { |
| 226 // When the recurrence is unrolled, we see that we can split it into 4 | 418 // When the recurrence is unrolled, we see that we can split it into 4 |
| 227 // separate lanes of evaluation: | 419 // separate lanes of evaluation: |
| 228 // | 420 // |
| 229 // y[n] = a(S[n]^2) + (1-a)(y[n-1]) | 421 // y[n] = a(S[n]^2) + (1-a)(y[n-1]) |
| 230 // = a(S[n]^2) + (1-a)^1(aS[n-1]^2) + (1-a)^2(aS[n-2]^2) + ... | 422 // = a(S[n]^2) + (1-a)^1(aS[n-1]^2) + (1-a)^2(aS[n-2]^2) + ... |
| 231 // = z[n] + (1-a)^1(z[n-1]) + (1-a)^2(z[n-2]) + (1-a)^3(z[n-3]) | 423 // = z[n] + (1-a)^1(z[n-1]) + (1-a)^2(z[n-2]) + (1-a)^3(z[n-3]) |
| 232 // | 424 // |
| 233 // where z[n] = a(S[n]^2) + (1-a)^4(z[n-4]) + (1-a)^8(z[n-8]) + ... | 425 // where z[n] = a(S[n]^2) + (1-a)^4(z[n-4]) + (1-a)^8(z[n-8]) + ... |
| 234 // | 426 // |
| 235 // Thus, the strategy here is to compute z[n], z[n-1], z[n-2], and z[n-3] in | 427 // Thus, the strategy here is to compute z[n], z[n-1], z[n-2], and z[n-3] in |
| (...skipping 46 matching lines...) |
| 282 result.first += sample_squared * smoothing_factor; | 474 result.first += sample_squared * smoothing_factor; |
| 283 result.second = std::max(result.second, sample_squared); | 475 result.second = std::max(result.second, sample_squared); |
| 284 } | 476 } |
| 285 | 477 |
| 286 return result; | 478 return result; |
| 287 } | 479 } |
| 288 #endif | 480 #endif |
| 289 | 481 |
| 290 } // namespace vector_math | 482 } // namespace vector_math |
| 291 } // namespace media | 483 } // namespace media |