OLD | NEW |
1 /* | 1 /* |
2 * Copyright (C) 2010, Google Inc. All rights reserved. | 2 * Copyright (C) 2010, Google Inc. All rights reserved. |
3 * | 3 * |
4 * Redistribution and use in source and binary forms, with or without | 4 * Redistribution and use in source and binary forms, with or without |
5 * modification, are permitted provided that the following conditions | 5 * modification, are permitted provided that the following conditions |
6 * are met: | 6 * are met: |
7 * 1. Redistributions of source code must retain the above copyright | 7 * 1. Redistributions of source code must retain the above copyright |
8 * notice, this list of conditions and the following disclaimer. | 8 * notice, this list of conditions and the following disclaimer. |
9 * 2. Redistributions in binary form must reproduce the above copyright | 9 * 2. Redistributions in binary form must reproduce the above copyright |
10 * notice, this list of conditions and the following disclaimer in the | 10 * notice, this list of conditions and the following disclaimer in the |
11 * documentation and/or other materials provided with the distribution. | 11 * documentation and/or other materials provided with the distribution. |
12 * | 12 * |
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' AND ANY | 13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' AND ANY |
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | 14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
15 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | 15 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
16 * DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE FOR ANY | 16 * DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE FOR ANY |
17 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES | 17 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
18 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | 18 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
19 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON | 19 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON |
20 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | 20 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
21 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | 21 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
22 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 22 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
23 */ | 23 */ |
24 | 24 |
25 #include "config.h" | 25 #include "config.h" |
26 | 26 |
27 #if ENABLE(WEB_AUDIO) | 27 #if ENABLE(WEB_AUDIO) |
28 | 28 |
29 #include "platform/audio/VectorMath.h" | 29 #include "platform/audio/VectorMath.h" |
| 30 |
30 #include "wtf/Assertions.h" | 31 #include "wtf/Assertions.h" |
31 #include "wtf/CPU.h" | 32 #include <emmintrin.h> |
32 #include <stdint.h> | 33 #include <stdint.h> |
33 | 34 |
34 #if OS(MACOSX) | 35 #define SSE2_MULT_ADD(loadInstr, storeInstr) \ |
35 #include <Accelerate/Accelerate.h> | 36 while (destP < endP) { \ |
36 #endif | 37 pSource = _mm_load_ps(sourceP); \ |
| 38 temp = _mm_mul_ps(pSource, mScale); \ |
| 39 dest = _mm_##loadInstr##_ps(destP); \ |
| 40 dest = _mm_add_ps(dest, temp); \ |
| 41 _mm_##storeInstr##_ps(destP, dest); \ |
| 42 sourceP += 4; \ |
| 43 destP += 4; \ |
| 44 } \ |
37 | 45 |
38 #if CPU(X86) || CPU(X86_64) | 46 #define SSE2_MULT(loadInstr, storeInstr) \ |
39 #include <emmintrin.h> | 47 while (destP < endP) { \ |
40 #endif | 48 pSource1 = _mm_load_ps(source1P); \ |
41 | 49 pSource2 = _mm_##loadInstr##_ps(source2P); \ |
42 #if HAVE(ARM_NEON_INTRINSICS) | 50 dest = _mm_mul_ps(pSource1, pSource2); \ |
43 #include <arm_neon.h> | 51 _mm_##storeInstr##_ps(destP, dest); \ |
44 #endif | 52 source1P += 4; \ |
45 | 53 source2P += 4; \ |
46 #include <math.h> | 54 destP += 4; \ |
47 #include <algorithm> | 55 } \ |
48 | 56 |
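The ## token-pasting in the two macros above lets each call site below pick aligned or unaligned SSE2 load/store variants. As a point of reference, SSE2_MULT_ADD(loadu, storeu) expands to roughly the following loop (an illustrative expansion only; pSource, temp, dest, mScale, sourceP, destP and endP are all declared by the calling function):

    while (destP < endP) {
        pSource = _mm_load_ps(sourceP);      // sourceP has already been brought to 16-byte alignment
        temp = _mm_mul_ps(pSource, mScale);  // scale four frames at once
        dest = _mm_loadu_ps(destP);          // unaligned load of the destination
        dest = _mm_add_ps(dest, temp);
        _mm_storeu_ps(destP, dest);          // unaligned store back
        sourceP += 4;
        destP += 4;
    }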
49 namespace blink { | 57 namespace blink { |
50 | 58 |
51 namespace VectorMath { | 59 namespace VectorMath { |
52 | 60 |
53 #if OS(MACOSX) | |
54 // On the Mac we use the highly optimized versions in Accelerate.framework | |
55 // In 32-bit mode (__ppc__ or __i386__) <Accelerate/Accelerate.h> includes <vecLib/vDSP_translate.h> which defines macros of the same name as | |
56 // our namespaced function names, so we must handle this case differently. Other architectures (64bit, ARM, etc.) do not include this header file. | |
57 | |
58 void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess) | |
59 { | |
60 #if CPU(X86) | |
61 ::vsmul(sourceP, sourceStride, scale, destP, destStride, framesToProcess); | |
62 #else | |
63 vDSP_vsmul(sourceP, sourceStride, scale, destP, destStride, framesToProcess); | |
64 #endif | |
65 } | |
66 | |
67 void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess) | |
68 { | |
69 #if CPU(X86) | |
70 ::vadd(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess); | |
71 #else | |
72 vDSP_vadd(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess); | |
73 #endif | |
74 } | |
75 | |
76 void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess) | |
77 { | |
78 #if CPU(X86) | |
79 ::vmul(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess); | |
80 #else | |
81 vDSP_vmul(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess); | |
82 #endif | |
83 } | |
84 | |
85 void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess) | |
86 { | |
87 DSPSplitComplex sc1; | |
88 DSPSplitComplex sc2; | |
89 DSPSplitComplex dest; | |
90 sc1.realp = const_cast<float*>(real1P); | |
91 sc1.imagp = const_cast<float*>(imag1P); | |
92 sc2.realp = const_cast<float*>(real2P); | |
93 sc2.imagp = const_cast<float*>(imag2P); | |
94 dest.realp = realDestP; | |
95 dest.imagp = imagDestP; | |
96 #if CPU(X86) | |
97 ::zvmul(&sc1, 1, &sc2, 1, &dest, 1, framesToProcess, 1); | |
98 #else | |
99 vDSP_zvmul(&sc1, 1, &sc2, 1, &dest, 1, framesToProcess, 1); | |
100 #endif | |
101 } | |
102 | |
103 void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess) | |
104 { | |
105 vDSP_vsma(sourceP, sourceStride, scale, destP, destStride, destP, destStride, framesToProcess); | |
106 } | |
107 | |
108 void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesToProcess) | |
109 { | |
110 vDSP_maxmgv(sourceP, sourceStride, maxP, framesToProcess); | |
111 } | |
112 | |
113 void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesToProcess) | |
114 { | |
115 vDSP_svesq(const_cast<float*>(sourceP), sourceStride, sumP, framesToProcess); | |
116 } | |
117 | |
118 void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, const float* highThresholdP, float* destP, int destStride, size_t framesToProcess) | |
119 { | |
120 vDSP_vclip(const_cast<float*>(sourceP), sourceStride, const_cast<float*>(lowThresholdP), const_cast<float*>(highThresholdP), destP, destStride, framesToProcess); | |
121 } | |
122 #else | |
123 | |
124 void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess) | 61 void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess) |
125 { | 62 { |
126 int n = framesToProcess; | 63 int n = framesToProcess; |
127 | 64 |
128 #if CPU(X86) || CPU(X86_64) | |
129 if ((sourceStride == 1) && (destStride == 1)) { | 65 if ((sourceStride == 1) && (destStride == 1)) { |
130 float k = *scale; | 66 float k = *scale; |
131 | 67 |
132 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. | 68 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. |
133 while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) { | 69 while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) { |
134 *destP += k * *sourceP; | 70 *destP += k * *sourceP; |
135 sourceP++; | 71 sourceP++; |
136 destP++; | 72 destP++; |
137 n--; | 73 n--; |
138 } | 74 } |
139 | 75 |
140 // Now the sourceP is aligned, use SSE. | 76 // Now the sourceP is aligned, use SSE. |
141 int tailFrames = n % 4; | 77 int tailFrames = n % 4; |
142 const float* endP = destP + n - tailFrames; | 78 const float* endP = destP + n - tailFrames; |
143 | 79 |
144 __m128 pSource; | 80 __m128 pSource; |
145 __m128 dest; | 81 __m128 dest; |
146 __m128 temp; | 82 __m128 temp; |
147 __m128 mScale = _mm_set_ps1(k); | 83 __m128 mScale = _mm_set_ps1(k); |
148 | 84 |
149 bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F); | 85 bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F); |
150 | 86 |
151 #define SSE2_MULT_ADD(loadInstr, storeInstr) \ | |
152 while (destP < endP) \ | |
153 { \ | |
154 pSource = _mm_load_ps(sourceP); \ | |
155 temp = _mm_mul_ps(pSource, mScale); \ | |
156 dest = _mm_##loadInstr##_ps(destP); \ | |
157 dest = _mm_add_ps(dest, temp); \ | |
158 _mm_##storeInstr##_ps(destP, dest); \ | |
159 sourceP += 4; \ | |
160 destP += 4; \ | |
161 } | |
162 | |
163 if (destAligned) | 87 if (destAligned) |
164 SSE2_MULT_ADD(load, store) | 88 SSE2_MULT_ADD(load, store) |
165 else | 89 else |
166 SSE2_MULT_ADD(loadu, storeu) | 90 SSE2_MULT_ADD(loadu, storeu) |
167 | 91 |
168 n = tailFrames; | 92 n = tailFrames; |
169 } | 93 } |
170 #elif HAVE(ARM_NEON_INTRINSICS) | |
171 if ((sourceStride == 1) && (destStride == 1)) { | |
172 int tailFrames = n % 4; | |
173 const float* endP = destP + n - tailFrames; | |
174 | 94 |
175 float32x4_t k = vdupq_n_f32(*scale); | |
176 while (destP < endP) { | |
177 float32x4_t source = vld1q_f32(sourceP); | |
178 float32x4_t dest = vld1q_f32(destP); | |
179 | |
180 dest = vmlaq_f32(dest, source, k); | |
181 vst1q_f32(destP, dest); | |
182 | |
183 sourceP += 4; | |
184 destP += 4; | |
185 } | |
186 n = tailFrames; | |
187 } | |
188 #endif | |
189 while (n) { | 95 while (n) { |
190 *destP += *sourceP * *scale; | 96 *destP += *sourceP * *scale; |
191 sourceP += sourceStride; | 97 sourceP += sourceStride; |
192 destP += destStride; | 98 destP += destStride; |
193 n--; | 99 n--; |
194 } | 100 } |
195 } | 101 } |
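Why the alignment prologue above touches at most three frames: assuming the buffers are at least 4-byte aligned, as float audio buffers are, reinterpret_cast<uintptr_t>(sourceP) & 0x0F can only be 0, 4, 8 or 12, so no more than three single-frame steps (3 * sizeof(float) = 12 bytes) are needed before sourceP reaches a 16-byte boundary where _mm_load_ps is safe. The same head / SSE body / scalar tail structure recurs in the functions below.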
196 | 102 |
197 void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess) | 103 void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess) |
198 { | 104 { |
199 int n = framesToProcess; | 105 int n = framesToProcess; |
200 | 106 |
201 #if CPU(X86) || CPU(X86_64) | |
202 if ((sourceStride == 1) && (destStride == 1)) { | 107 if ((sourceStride == 1) && (destStride == 1)) { |
203 float k = *scale; | 108 float k = *scale; |
204 | 109 |
205 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. | 110 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. |
206 while ((reinterpret_cast<size_t>(sourceP) & 0x0F) && n) { | 111 while ((reinterpret_cast<size_t>(sourceP) & 0x0F) && n) { |
207 *destP = k * *sourceP; | 112 *destP = k * *sourceP; |
208 sourceP++; | 113 sourceP++; |
209 destP++; | 114 destP++; |
210 n--; | 115 n--; |
211 } | 116 } |
(...skipping 28 matching lines...)
240 | 145 |
241 // Non-SSE handling for the remaining frames, which are fewer than 4. | 146 // Non-SSE handling for the remaining frames, which are fewer than 4. |
242 n %= 4; | 147 n %= 4; |
243 while (n) { | 148 while (n) { |
244 *destP = k * *sourceP; | 149 *destP = k * *sourceP; |
245 sourceP++; | 150 sourceP++; |
246 destP++; | 151 destP++; |
247 n--; | 152 n--; |
248 } | 153 } |
249 } else { // If strides are not 1, rollback to normal algorithm. | 154 } else { // If strides are not 1, rollback to normal algorithm. |
250 #elif HAVE(ARM_NEON_INTRINSICS) | |
251 if ((sourceStride == 1) && (destStride == 1)) { | |
252 float k = *scale; | 155 float k = *scale; |
253 int tailFrames = n % 4; | 156 while (n--) { |
254 const float* endP = destP + n - tailFrames; | 157 *destP = k * *sourceP; |
255 | 158 sourceP += sourceStride; |
256 while (destP < endP) { | 159 destP += destStride; |
257 float32x4_t source = vld1q_f32(sourceP); | |
258 vst1q_f32(destP, vmulq_n_f32(source, k)); | |
259 | |
260 sourceP += 4; | |
261 destP += 4; | |
262 } | 160 } |
263 n = tailFrames; | |
264 } | 161 } |
265 #endif | |
266 float k = *scale; | |
267 while (n--) { | |
268 *destP = k * *sourceP; | |
269 sourceP += sourceStride; | |
270 destP += destStride; | |
271 } | |
272 #if CPU(X86) || CPU(X86_64) | |
273 } | |
274 #endif | |
275 } | 162 } |
276 | 163 |
277 void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess) | 164 void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess) |
278 { | 165 { |
279 int n = framesToProcess; | 166 int n = framesToProcess; |
280 | 167 |
281 #if CPU(X86) || CPU(X86_64) | |
282 if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) { | 168 if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) { |
283 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. | 169 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. |
284 while ((reinterpret_cast<size_t>(source1P) & 0x0F) && n) { | 170 while ((reinterpret_cast<size_t>(source1P) & 0x0F) && n) { |
285 *destP = *source1P + *source2P; | 171 *destP = *source1P + *source2P; |
286 source1P++; | 172 source1P++; |
287 source2P++; | 173 source2P++; |
288 destP++; | 174 destP++; |
289 n--; | 175 n--; |
290 } | 176 } |
291 | 177 |
(...skipping 59 matching lines...)
351 // Non-SSE handling for the remaining frames, which are fewer than 4. | 237 // Non-SSE handling for the remaining frames, which are fewer than 4. |
352 n %= 4; | 238 n %= 4; |
353 while (n) { | 239 while (n) { |
354 *destP = *source1P + *source2P; | 240 *destP = *source1P + *source2P; |
355 source1P++; | 241 source1P++; |
356 source2P++; | 242 source2P++; |
357 destP++; | 243 destP++; |
358 n--; | 244 n--; |
359 } | 245 } |
360 } else { // if strides are not 1, rollback to normal algorithm | 246 } else { // if strides are not 1, rollback to normal algorithm |
361 #elif HAVE(ARM_NEON_INTRINSICS) | 247 while (n--) { |
362 if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) { | 248 *destP = *source1P + *source2P; |
363 int tailFrames = n % 4; | 249 source1P += sourceStride1; |
364 const float* endP = destP + n - tailFrames; | 250 source2P += sourceStride2; |
365 | 251 destP += destStride; |
366 while (destP < endP) { | |
367 float32x4_t source1 = vld1q_f32(source1P); | |
368 float32x4_t source2 = vld1q_f32(source2P); | |
369 vst1q_f32(destP, vaddq_f32(source1, source2)); | |
370 | |
371 source1P += 4; | |
372 source2P += 4; | |
373 destP += 4; | |
374 } | 252 } |
375 n = tailFrames; | |
376 } | |
377 #endif | |
378 while (n--) { | |
379 *destP = *source1P + *source2P; | |
380 source1P += sourceStride1; | |
381 source2P += sourceStride2; | |
382 destP += destStride; | |
383 } | |
384 #if CPU(X86) || CPU(X86_64) | |
385 } | |
386 #endif | |
387 } | 253 } |
388 | 254 |
389 void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess) | 255 void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess) |
390 { | 256 { |
391 | |
392 int n = framesToProcess; | 257 int n = framesToProcess; |
393 | 258 |
394 #if CPU(X86) || CPU(X86_64) | |
395 if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) { | 259 if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) { |
396 // If the source1P address is not 16-byte aligned, the first several frames (at most three) should be processed separately. | 260 // If the source1P address is not 16-byte aligned, the first several frames (at most three) should be processed separately. |
397 while ((reinterpret_cast<uintptr_t>(source1P) & 0x0F) && n) { | 261 while ((reinterpret_cast<uintptr_t>(source1P) & 0x0F) && n) { |
398 *destP = *source1P * *source2P; | 262 *destP = *source1P * *source2P; |
399 source1P++; | 263 source1P++; |
400 source2P++; | 264 source2P++; |
401 destP++; | 265 destP++; |
402 n--; | 266 n--; |
403 } | 267 } |
404 | 268 |
405 // Now the source1P address is aligned; start to apply SSE. | 269 // Now the source1P address is aligned; start to apply SSE. |
406 int tailFrames = n % 4; | 270 int tailFrames = n % 4; |
407 const float* endP = destP + n - tailFrames; | 271 const float* endP = destP + n - tailFrames; |
408 __m128 pSource1; | 272 __m128 pSource1; |
409 __m128 pSource2; | 273 __m128 pSource2; |
410 __m128 dest; | 274 __m128 dest; |
411 | 275 |
412 bool source2Aligned = !(reinterpret_cast<uintptr_t>(source2P) & 0x0F); | 276 bool source2Aligned = !(reinterpret_cast<uintptr_t>(source2P) & 0x0F); |
413 bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F); | 277 bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F); |
414 | 278 |
415 #define SSE2_MULT(loadInstr, storeInstr) \ | |
416 while (destP < endP) \ | |
417 { \ | |
418 pSource1 = _mm_load_ps(source1P); \ | |
419 pSource2 = _mm_##loadInstr##_ps(source2P); \ | |
420 dest = _mm_mul_ps(pSource1, pSource2); \ | |
421 _mm_##storeInstr##_ps(destP, dest); \ | |
422 source1P += 4; \ | |
423 source2P += 4; \ | |
424 destP += 4; \ | |
425 } | |
426 | |
427 if (source2Aligned && destAligned) // Both aligned. | 279 if (source2Aligned && destAligned) // Both aligned. |
428 SSE2_MULT(load, store) | 280 SSE2_MULT(load, store) |
429 else if (source2Aligned && !destAligned) // Source2 is aligned but dest not. | 281 else if (source2Aligned && !destAligned) // Source2 is aligned but dest not. |
430 SSE2_MULT(load, storeu) | 282 SSE2_MULT(load, storeu) |
431 else if (!source2Aligned && destAligned) // Dest is aligned but source2 not. | 283 else if (!source2Aligned && destAligned) // Dest is aligned but source2 not. |
432 SSE2_MULT(loadu, store) | 284 SSE2_MULT(loadu, store) |
433 else // Neither aligned. | 285 else // Neither aligned. |
434 SSE2_MULT(loadu, storeu) | 286 SSE2_MULT(loadu, storeu) |
435 | 287 |
436 n = tailFrames; | 288 n = tailFrames; |
437 } | 289 } |
438 #elif HAVE(ARM_NEON_INTRINSICS) | |
439 if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) { | |
440 int tailFrames = n % 4; | |
441 const float* endP = destP + n - tailFrames; | |
442 | 290 |
443 while (destP < endP) { | |
444 float32x4_t source1 = vld1q_f32(source1P); | |
445 float32x4_t source2 = vld1q_f32(source2P); | |
446 vst1q_f32(destP, vmulq_f32(source1, source2)); | |
447 | |
448 source1P += 4; | |
449 source2P += 4; | |
450 destP += 4; | |
451 } | |
452 n = tailFrames; | |
453 } | |
454 #endif | |
455 while (n) { | 291 while (n) { |
456 *destP = *source1P * *source2P; | 292 *destP = *source1P * *source2P; |
457 source1P += sourceStride1; | 293 source1P += sourceStride1; |
458 source2P += sourceStride2; | 294 source2P += sourceStride2; |
459 destP += destStride; | 295 destP += destStride; |
460 n--; | 296 n--; |
461 } | 297 } |
462 } | 298 } |
463 | 299 |
464 void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess) | 300 void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess) |
465 { | 301 { |
466 unsigned i = 0; | 302 unsigned i = 0; |
467 #if CPU(X86) || CPU(X86_64) | 303 |
468 // Only use the SSE optimization in the very common case that all addresses are 16-byte aligned. | 304 // Only use the SSE optimization in the very common case that all addresses are 16-byte aligned. |
469 // Otherwise, fall through to the scalar code below. | 305 // Otherwise, fall through to the scalar code below. |
470 if (!(reinterpret_cast<uintptr_t>(real1P) & 0x0F) | 306 if (!(reinterpret_cast<uintptr_t>(real1P) & 0x0F) |
471 && !(reinterpret_cast<uintptr_t>(imag1P) & 0x0F) | 307 && !(reinterpret_cast<uintptr_t>(imag1P) & 0x0F) |
472 && !(reinterpret_cast<uintptr_t>(real2P) & 0x0F) | 308 && !(reinterpret_cast<uintptr_t>(real2P) & 0x0F) |
473 && !(reinterpret_cast<uintptr_t>(imag2P) & 0x0F) | 309 && !(reinterpret_cast<uintptr_t>(imag2P) & 0x0F) |
474 && !(reinterpret_cast<uintptr_t>(realDestP) & 0x0F) | 310 && !(reinterpret_cast<uintptr_t>(realDestP) & 0x0F) |
475 && !(reinterpret_cast<uintptr_t>(imagDestP) & 0x0F)) { | 311 && !(reinterpret_cast<uintptr_t>(imagDestP) & 0x0F)) { |
476 | 312 |
477 unsigned endSize = framesToProcess - framesToProcess % 4; | 313 unsigned endSize = framesToProcess - framesToProcess % 4; |
478 while (i < endSize) { | 314 while (i < endSize) { |
479 __m128 real1 = _mm_load_ps(real1P + i); | 315 __m128 real1 = _mm_load_ps(real1P + i); |
480 __m128 real2 = _mm_load_ps(real2P + i); | 316 __m128 real2 = _mm_load_ps(real2P + i); |
481 __m128 imag1 = _mm_load_ps(imag1P + i); | 317 __m128 imag1 = _mm_load_ps(imag1P + i); |
482 __m128 imag2 = _mm_load_ps(imag2P + i); | 318 __m128 imag2 = _mm_load_ps(imag2P + i); |
483 __m128 real = _mm_mul_ps(real1, real2); | 319 __m128 real = _mm_mul_ps(real1, real2); |
484 real = _mm_sub_ps(real, _mm_mul_ps(imag1, imag2)); | 320 real = _mm_sub_ps(real, _mm_mul_ps(imag1, imag2)); |
485 __m128 imag = _mm_mul_ps(real1, imag2); | 321 __m128 imag = _mm_mul_ps(real1, imag2); |
486 imag = _mm_add_ps(imag, _mm_mul_ps(imag1, real2)); | 322 imag = _mm_add_ps(imag, _mm_mul_ps(imag1, real2)); |
487 _mm_store_ps(realDestP + i, real); | 323 _mm_store_ps(realDestP + i, real); |
488 _mm_store_ps(imagDestP + i, imag); | 324 _mm_store_ps(imagDestP + i, imag); |
489 i += 4; | 325 i += 4; |
490 } | 326 } |
491 } | 327 } |
492 #elif HAVE(ARM_NEON_INTRINSICS) | |
493 unsigned endSize = framesToProcess - framesToProcess % 4; | |
494 while (i < endSize) { | |
495 float32x4_t real1 = vld1q_f32(real1P + i); | |
496 float32x4_t real2 = vld1q_f32(real2P + i); | |
497 float32x4_t imag1 = vld1q_f32(imag1P + i); | |
498 float32x4_t imag2 = vld1q_f32(imag2P + i); | |
499 | 328 |
500 float32x4_t realResult = vmlsq_f32(vmulq_f32(real1, real2), imag1, imag2); | |
501 float32x4_t imagResult = vmlaq_f32(vmulq_f32(real1, imag2), imag1, real2); | |
502 | |
503 vst1q_f32(realDestP + i, realResult); | |
504 vst1q_f32(imagDestP + i, imagResult); | |
505 | |
506 i += 4; | |
507 } | |
508 #endif | |
509 for (; i < framesToProcess; ++i) { | 329 for (; i < framesToProcess; ++i) { |
510 // Read and compute result before storing them, in case the | 330 // Read and compute result before storing them, in case the |
511 // destination is the same as one of the sources. | 331 // destination is the same as one of the sources. |
512 float realResult = real1P[i] * real2P[i] - imag1P[i] * imag2P[i]; | 332 float realResult = real1P[i] * real2P[i] - imag1P[i] * imag2P[i]; |
513 float imagResult = real1P[i] * imag2P[i] + imag1P[i] * real2P[i]; | 333 float imagResult = real1P[i] * imag2P[i] + imag1P[i] * real2P[i]; |
514 | 334 |
515 realDestP[i] = realResult; | 335 realDestP[i] = realResult; |
516 imagDestP[i] = imagResult; | 336 imagDestP[i] = imagResult; |
517 } | 337 } |
518 } | 338 } |
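For reference, both the SSE block and the scalar loop in zvmul compute the standard complex product element-wise: with a = real1P[i], b = imag1P[i], c = real2P[i] and d = imag2P[i], the result is (a + bi)(c + di) = (ac - bd) + (ad + bc)i, which is exactly realResult and imagResult above.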
519 | 339 |
520 void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesToProcess) | 340 void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesToProcess) |
521 { | 341 { |
522 int n = framesToProcess; | 342 int n = framesToProcess; |
523 float sum = 0; | 343 float sum = 0; |
524 | 344 |
525 #if CPU(X86) || CPU(X86_64) | |
526 if (sourceStride == 1) { | 345 if (sourceStride == 1) { |
527 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. | 346 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. |
528 while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) { | 347 while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) { |
529 float sample = *sourceP; | 348 float sample = *sourceP; |
530 sum += sample * sample; | 349 sum += sample * sample; |
531 sourceP++; | 350 sourceP++; |
532 n--; | 351 n--; |
533 } | 352 } |
534 | 353 |
535 // Now the sourceP is aligned, use SSE. | 354 // Now the sourceP is aligned, use SSE. |
536 int tailFrames = n % 4; | 355 int tailFrames = n % 4; |
537 const float* endP = sourceP + n - tailFrames; | 356 const float* endP = sourceP + n - tailFrames; |
538 __m128 source; | 357 __m128 source; |
539 __m128 mSum = _mm_setzero_ps(); | 358 __m128 mSum = _mm_setzero_ps(); |
540 | 359 |
541 while (sourceP < endP) { | 360 while (sourceP < endP) { |
542 source = _mm_load_ps(sourceP); | 361 source = _mm_load_ps(sourceP); |
543 source = _mm_mul_ps(source, source); | 362 source = _mm_mul_ps(source, source); |
544 mSum = _mm_add_ps(mSum, source); | 363 mSum = _mm_add_ps(mSum, source); |
545 sourceP += 4; | 364 sourceP += 4; |
546 } | 365 } |
547 | 366 |
548 // Summarize the SSE results. | 367 // Summarize the SSE results. |
549 const float* groupSumP = reinterpret_cast<float*>(&mSum); | 368 const float* groupSumP = reinterpret_cast<float*>(&mSum); |
550 sum += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3]; | 369 sum += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3]; |
551 | 370 |
552 n = tailFrames; | 371 n = tailFrames; |
553 } | 372 } |
554 #elif HAVE(ARM_NEON_INTRINSICS) | |
555 if (sourceStride == 1) { | |
556 int tailFrames = n % 4; | |
557 const float* endP = sourceP + n - tailFrames; | |
558 | |
559 float32x4_t fourSum = vdupq_n_f32(0); | |
560 while (sourceP < endP) { | |
561 float32x4_t source = vld1q_f32(sourceP); | |
562 fourSum = vmlaq_f32(fourSum, source, source); | |
563 sourceP += 4; | |
564 } | |
565 float32x2_t twoSum = vadd_f32(vget_low_f32(fourSum), vget_high_f32(fourSum)); | |
566 | |
567 float groupSum[2]; | |
568 vst1_f32(groupSum, twoSum); | |
569 sum += groupSum[0] + groupSum[1]; | |
570 | |
571 n = tailFrames; | |
572 } | |
573 #endif | |
574 | 373 |
575 while (n--) { | 374 while (n--) { |
576 float sample = *sourceP; | 375 float sample = *sourceP; |
577 sum += sample * sample; | 376 sum += sample * sample; |
578 sourceP += sourceStride; | 377 sourceP += sourceStride; |
579 } | 378 } |
580 | 379 |
581 ASSERT(sumP); | 380 ASSERT(sumP); |
582 *sumP = sum; | 381 *sumP = sum; |
583 } | 382 } |
584 | 383 |
585 void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesToProcess) | 384 void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesToProcess) |
586 { | 385 { |
587 int n = framesToProcess; | 386 int n = framesToProcess; |
588 float max = 0; | 387 float max = 0; |
589 | 388 |
590 #if CPU(X86) || CPU(X86_64) | |
591 if (sourceStride == 1) { | 389 if (sourceStride == 1) { |
592 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. | 390 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. |
593 while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) { | 391 while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) { |
594 max = std::max(max, fabsf(*sourceP)); | 392 max = std::max(max, fabsf(*sourceP)); |
595 sourceP++; | 393 sourceP++; |
596 n--; | 394 n--; |
597 } | 395 } |
598 | 396 |
599 // Now the sourceP is aligned, use SSE. | 397 // Now the sourceP is aligned, use SSE. |
600 int tailFrames = n % 4; | 398 int tailFrames = n % 4; |
(...skipping 13 matching lines...)
614 | 412 |
615 // Get max from the SSE results. | 413 // Get max from the SSE results. |
616 const float* groupMaxP = reinterpret_cast<float*>(&mMax); | 414 const float* groupMaxP = reinterpret_cast<float*>(&mMax); |
617 max = std::max(max, groupMaxP[0]); | 415 max = std::max(max, groupMaxP[0]); |
618 max = std::max(max, groupMaxP[1]); | 416 max = std::max(max, groupMaxP[1]); |
619 max = std::max(max, groupMaxP[2]); | 417 max = std::max(max, groupMaxP[2]); |
620 max = std::max(max, groupMaxP[3]); | 418 max = std::max(max, groupMaxP[3]); |
621 | 419 |
622 n = tailFrames; | 420 n = tailFrames; |
623 } | 421 } |
624 #elif HAVE(ARM_NEON_INTRINSICS) | |
625 if (sourceStride == 1) { | |
626 int tailFrames = n % 4; | |
627 const float* endP = sourceP + n - tailFrames; | |
628 | |
629 float32x4_t fourMax = vdupq_n_f32(0); | |
630 while (sourceP < endP) { | |
631 float32x4_t source = vld1q_f32(sourceP); | |
632 fourMax = vmaxq_f32(fourMax, vabsq_f32(source)); | |
633 sourceP += 4; | |
634 } | |
635 float32x2_t twoMax = vmax_f32(vget_low_f32(fourMax), vget_high_f32(fourMax)); | |
636 | |
637 float groupMax[2]; | |
638 vst1_f32(groupMax, twoMax); | |
639 max = std::max(groupMax[0], groupMax[1]); | |
640 | |
641 n = tailFrames; | |
642 } | |
643 #endif | |
644 | 422 |
645 while (n--) { | 423 while (n--) { |
646 max = std::max(max, fabsf(*sourceP)); | 424 max = std::max(max, fabsf(*sourceP)); |
647 sourceP += sourceStride; | 425 sourceP += sourceStride; |
648 } | 426 } |
649 | 427 |
650 ASSERT(maxP); | 428 ASSERT(maxP); |
651 *maxP = max; | 429 *maxP = max; |
652 } | 430 } |
653 | 431 |
654 void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, const float* highThresholdP, float* destP, int destStride, size_t framesToProcess) | 432 void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, const float* highThresholdP, float* destP, int destStride, size_t framesToProcess) |
655 { | 433 { |
656 int n = framesToProcess; | 434 int n = framesToProcess; |
657 float lowThreshold = *lowThresholdP; | 435 float lowThreshold = *lowThresholdP; |
658 float highThreshold = *highThresholdP; | 436 float highThreshold = *highThresholdP; |
659 | 437 |
660 // FIXME: Optimize for SSE2. | 438 // FIXME: Optimize for SSE2. |
661 #if HAVE(ARM_NEON_INTRINSICS) | |
662 if ((sourceStride == 1) && (destStride == 1)) { | |
663 int tailFrames = n % 4; | |
664 const float* endP = destP + n - tailFrames; | |
665 | |
666 float32x4_t low = vdupq_n_f32(lowThreshold); | |
667 float32x4_t high = vdupq_n_f32(highThreshold); | |
668 while (destP < endP) { | |
669 float32x4_t source = vld1q_f32(sourceP); | |
670 vst1q_f32(destP, vmaxq_f32(vminq_f32(source, high), low)); | |
671 sourceP += 4; | |
672 destP += 4; | |
673 } | |
674 n = tailFrames; | |
675 } | |
676 #endif | |
677 while (n--) { | 439 while (n--) { |
678 *destP = std::max(std::min(*sourceP, highThreshold), lowThreshold); | 440 *destP = std::max(std::min(*sourceP, highThreshold), lowThreshold); |
679 sourceP += sourceStride; | 441 sourceP += sourceStride; |
680 destP += destStride; | 442 destP += destStride; |
681 } | 443 } |
682 } | 444 } |
683 | 445 |
684 #endif // OS(MACOSX) | |
685 | |
686 } // namespace VectorMath | 446 } // namespace VectorMath |
687 | 447 |
688 } // namespace blink | 448 } // namespace blink |
689 | 449 |
690 #endif // ENABLE(WEB_AUDIO) | 450 #endif // ENABLE(WEB_AUDIO) |
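A minimal usage sketch of the resulting API (not part of the patch; mixWithGain and peak are hypothetical helper names), showing the contiguous stride-1 case that the SSE2 paths above are written for:

    #include "platform/audio/VectorMath.h"

    // Mix 'frames' samples of 'input' into 'mix' at the given gain,
    // i.e. mix[i] += gain * input[i]. Stride 1 lets vsma() take the SSE2 fast path.
    static void mixWithGain(const float* input, float* mix, float gain, size_t frames)
    {
        blink::VectorMath::vsma(input, 1, &gain, mix, 1, frames);
    }

    // Absolute peak of a contiguous buffer, via vmaxmgv().
    static float peak(const float* samples, size_t frames)
    {
        float maxValue = 0;
        blink::VectorMath::vmaxmgv(samples, 1, &maxValue, frames);
        return maxValue;
    }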