OLD | NEW |
---|---|
1 /* | 1 /* |
2 * Copyright (C) 2010, Google Inc. All rights reserved. | 2 * Copyright (C) 2010, Google Inc. All rights reserved. |
3 * | 3 * |
4 * Redistribution and use in source and binary forms, with or without | 4 * Redistribution and use in source and binary forms, with or without |
5 * modification, are permitted provided that the following conditions | 5 * modification, are permitted provided that the following conditions |
6 * are met: | 6 * are met: |
7 * 1. Redistributions of source code must retain the above copyright | 7 * 1. Redistributions of source code must retain the above copyright |
8 * notice, this list of conditions and the following disclaimer. | 8 * notice, this list of conditions and the following disclaimer. |
9 * 2. Redistributions in binary form must reproduce the above copyright | 9 * 2. Redistributions in binary form must reproduce the above copyright |
10 * notice, this list of conditions and the following disclaimer in the | 10 * notice, this list of conditions and the following disclaimer in the |
(...skipping 161 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
172 | 172 |
173 if (destAligned) | 173 if (destAligned) |
174 SSE2_MULT_ADD(load, store) | 174 SSE2_MULT_ADD(load, store) |
175 else | 175 else |
176 SSE2_MULT_ADD(loadu, storeu) | 176 SSE2_MULT_ADD(loadu, storeu) |
177 | 177 |
178 n = tailFrames; | 178 n = tailFrames; |
179 } | 179 } |
180 #elif HAVE(ARM_NEON_INTRINSICS) | 180 #elif HAVE(ARM_NEON_INTRINSICS) |
181 if ((sourceStride == 1) && (destStride == 1)) { | 181 if ((sourceStride == 1) && (destStride == 1)) { |
182 int tailFrames = n % 4; | 182 unsigned tailFrames = n & 15; |
183 const float* endP = destP + n - tailFrames; | 183 float32x4_t scaleNum = vdupq_n_f32(*scale); |
184 | 184 |
185 float32x4_t k = vdupq_n_f32(*scale); | 185 for (unsigned loopCount = (n >> 4); loopCount > 0; loopCount--) { |
186 while (destP < endP) { | 186 float32x4_t dest0 = vld1q_f32(destP); |
187 float32x4_t source = vld1q_f32(sourceP); | 187 float32x4_t source0 = vld1q_f32(sourceP); |
188 float32x4_t dest = vld1q_f32(destP); | |
189 | 188 |
190 dest = vmlaq_f32(dest, source, k); | 189 float32x4_t dest1 = vld1q_f32(destP + 4); |
191 vst1q_f32(destP, dest); | 190 float32x4_t source1 = vld1q_f32(sourceP + 4); |
192 | 191 |
193 sourceP += 4; | 192 float32x4_t dest2 = vld1q_f32(destP + 8); |
194 destP += 4; | 193 float32x4_t source2 = vld1q_f32(sourceP + 8); |
194 | |
195 float32x4_t dest3 = vld1q_f32(destP + 12); | |
196 float32x4_t source3 = vld1q_f32(sourceP + 12); | |
197 | |
198 float32x4_t result0 = vmlaq_f32(dest0, source0, scaleNum); | |
199 float32x4_t result1 = vmlaq_f32(dest1, source1, scaleNum); | |
200 float32x4_t result2 = vmlaq_f32(dest2, source2, scaleNum); | |
201 float32x4_t result3 = vmlaq_f32(dest3, source3, scaleNum); | |
202 | |
203 vst1q_f32(destP, result0); | |
204 vst1q_f32(destP + 4, result1); | |
205 vst1q_f32(destP + 8, result2); | |
206 vst1q_f32(destP + 12, result3); | |
207 | |
208 sourceP += 16; | |
209 destP += 16; | |
195 } | 210 } |
196 n = tailFrames; | 211 n = tailFrames; |
197 } | 212 } |
198 #endif | 213 #endif |
199 while (n) { | 214 while (n) { |
200 *destP += *sourceP * *scale; | 215 *destP += *sourceP * *scale; |
201 sourceP += sourceStride; | 216 sourceP += sourceStride; |
202 destP += destStride; | 217 destP += destStride; |
203 n--; | 218 n--; |
204 } | 219 } |
(...skipping 46 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
251 // Non-SSE handling for remaining frames which is less than 4. | 266 // Non-SSE handling for remaining frames which is less than 4. |
252 n %= 4; | 267 n %= 4; |
253 while (n) { | 268 while (n) { |
254 *destP = k * *sourceP; | 269 *destP = k * *sourceP; |
255 sourceP++; | 270 sourceP++; |
256 destP++; | 271 destP++; |
257 n--; | 272 n--; |
258 } | 273 } |
259 } else { // If strides are not 1, rollback to normal algorithm. | 274 } else { // If strides are not 1, rollback to normal algorithm. |
260 #elif HAVE(ARM_NEON_INTRINSICS) | 275 #elif HAVE(ARM_NEON_INTRINSICS) |
261 if ((sourceStride == 1) && (destStride == 1)) { | 276 if (sourceStride == 1 && destStride == 1) { |
262 float k = *scale; | 277 unsigned tailFrames = framesToProcess & 15; |
263 int tailFrames = n % 4; | 278 float32x4_t scaleNum = vdupq_n_f32(*scale); |
Raymond Toy
2014/09/22 20:00:36
Is there any performance difference between making
KhNo
2014/09/23 02:07:32
No performance difference for scaleNum, It is just
| |
264 const float* endP = destP + n - tailFrames; | |
265 | 279 |
266 while (destP < endP) { | 280 for (unsigned loopCount = (framesToProcess >> 4); loopCount > 0; loopCount--) {
267 float32x4_t source = vld1q_f32(sourceP); | 281 float32x4_t source0 = vld1q_f32(sourceP); |
268 vst1q_f32(destP, vmulq_n_f32(source, k)); | 282 float32x4_t source1 = vld1q_f32(sourceP + 4); |
283 float32x4_t source2 = vld1q_f32(sourceP + 8); | |
284 float32x4_t source3 = vld1q_f32(sourceP + 12); | |
269 | 285 |
270 sourceP += 4; | 286 float32x4_t result0 = vmulq_f32(source0, scaleNum); |
271 destP += 4; | 287 float32x4_t result1 = vmulq_f32(source1, scaleNum); |
288 float32x4_t result2 = vmulq_f32(source2, scaleNum); | |
289 float32x4_t result3 = vmulq_f32(source3, scaleNum); | |
290 | |
291 vst1q_f32(destP, result0); | |
292 vst1q_f32(destP + 4, result1); | |
293 vst1q_f32(destP + 8, result2); | |
294 vst1q_f32(destP + 12, result3); | |
295 | |
296 sourceP += 16; | |
297 destP += 16; | |
272 } | 298 } |
273 n = tailFrames; | 299 n = tailFrames; |
274 } | 300 } |
275 #endif | 301 #endif |
276 float k = *scale; | 302 float k = *scale; |
277 while (n--) { | 303 while (n--) { |
278 *destP = k * *sourceP; | 304 *destP = k * *sourceP; |
279 sourceP += sourceStride; | 305 sourceP += sourceStride; |
280 destP += destStride; | 306 destP += destStride; |
281 } | 307 } |
(...skipping 80 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
362 n %= 4; | 388 n %= 4; |
363 while (n) { | 389 while (n) { |
364 *destP = *source1P + *source2P; | 390 *destP = *source1P + *source2P; |
365 source1P++; | 391 source1P++; |
366 source2P++; | 392 source2P++; |
367 destP++; | 393 destP++; |
368 n--; | 394 n--; |
369 } | 395 } |
370 } else { // if strides are not 1, rollback to normal algorithm | 396 } else { // if strides are not 1, rollback to normal algorithm |
371 #elif HAVE(ARM_NEON_INTRINSICS) | 397 #elif HAVE(ARM_NEON_INTRINSICS) |
372 if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) { | 398 if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) { |
373 int tailFrames = n % 4; | 399 unsigned tailFrames = framesToProcess & 15; |
374 const float* endP = destP + n - tailFrames; | |
375 | 400 |
376 while (destP < endP) { | 401 for (unsigned loopCount = (framesToProcess >> 4); loopCount > 0; loopCount--) {
377 float32x4_t source1 = vld1q_f32(source1P); | 402 float32x4_t source10 = vld1q_f32(source1P); |
378 float32x4_t source2 = vld1q_f32(source2P); | 403 float32x4_t source20 = vld1q_f32(source2P); |
379 vst1q_f32(destP, vaddq_f32(source1, source2)); | |
380 | 404 |
381 source1P += 4; | 405 float32x4_t source11 = vld1q_f32(source1P + 4); |
382 source2P += 4; | 406 float32x4_t source21 = vld1q_f32(source2P + 4); |
383 destP += 4; | 407 |
408 float32x4_t source12 = vld1q_f32(source1P + 8); | |
409 float32x4_t source22 = vld1q_f32(source2P + 8); | |
410 | |
411 float32x4_t source13 = vld1q_f32(source1P + 12); | |
412 float32x4_t source23 = vld1q_f32(source2P + 12); | |
413 | |
414 float32x4_t result0 = vaddq_f32(source10, source20); | |
415 float32x4_t result1 = vaddq_f32(source11, source21); | |
416 float32x4_t result2 = vaddq_f32(source12, source22); | |
417 float32x4_t result3 = vaddq_f32(source13, source23); | |
418 | |
419 vst1q_f32(destP, result0); | |
420 vst1q_f32(destP + 4, result1); | |
421 vst1q_f32(destP + 8, result2); | |
422 vst1q_f32(destP + 12, result3); | |
423 | |
424 source1P += 16; | |
425 source2P += 16; | |
426 destP += 16; | |
384 } | 427 } |
385 n = tailFrames; | 428 n = tailFrames; |
386 } | 429 } |
387 #endif | 430 #endif |
388 while (n--) { | 431 while (n--) { |
389 *destP = *source1P + *source2P; | 432 *destP = *source1P + *source2P; |
390 source1P += sourceStride1; | 433 source1P += sourceStride1; |
391 source2P += sourceStride2; | 434 source2P += sourceStride2; |
392 destP += destStride; | 435 destP += destStride; |
393 } | 436 } |
(...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
439 else if (source2Aligned && !destAligned) // Source2 is aligned but dest not. | 482 else if (source2Aligned && !destAligned) // Source2 is aligned but dest not. |
440 SSE2_MULT(load, storeu) | 483 SSE2_MULT(load, storeu) |
441 else if (!source2Aligned && destAligned) // Dest is aligned but source2 not. | 484 else if (!source2Aligned && destAligned) // Dest is aligned but source2 not. |
442 SSE2_MULT(loadu, store) | 485 SSE2_MULT(loadu, store) |
443 else // Neither aligned. | 486 else // Neither aligned. |
444 SSE2_MULT(loadu, storeu) | 487 SSE2_MULT(loadu, storeu) |
445 | 488 |
446 n = tailFrames; | 489 n = tailFrames; |
447 } | 490 } |
448 #elif HAVE(ARM_NEON_INTRINSICS) | 491 #elif HAVE(ARM_NEON_INTRINSICS) |
449 if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) { | 492 if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) { |
450 int tailFrames = n % 4; | 493 unsigned tailFrames = n & 15; |
451 const float* endP = destP + n - tailFrames; | |
452 | 494 |
453 while (destP < endP) { | 495 for (unsigned loopCount = (n >> 4); loopCount > 0; loopCount--) { |
454 float32x4_t source1 = vld1q_f32(source1P); | 496 float32x4_t source10 = vld1q_f32(source1P); |
455 float32x4_t source2 = vld1q_f32(source2P); | 497 float32x4_t source20 = vld1q_f32(source2P); |
Raymond Toy
2014/09/22 20:00:35
It might be advantageous to load all of the elemen
KhNo
2014/09/23 02:07:32
Thanks for review, I think also it is better for c
| |
456 vst1q_f32(destP, vmulq_f32(source1, source2)); | |
457 | 498 |
458 source1P += 4; | 499 float32x4_t source11 = vld1q_f32(source1P + 4); |
459 source2P += 4; | 500 float32x4_t source21 = vld1q_f32(source2P + 4); |
460 destP += 4; | 501 |
502 float32x4_t source12 = vld1q_f32(source1P + 8); | |
503 float32x4_t source22 = vld1q_f32(source2P + 8); | |
504 | |
505 float32x4_t source13 = vld1q_f32(source1P + 12); | |
506 float32x4_t source23 = vld1q_f32(source2P + 12); | |
507 | |
508 float32x4_t result0 = vmulq_f32(source10, source20); | |
509 float32x4_t result1 = vmulq_f32(source11, source21); | |
510 float32x4_t result2 = vmulq_f32(source12, source22); | |
511 float32x4_t result3 = vmulq_f32(source13, source23); | |
512 | |
513 vst1q_f32(destP, result0); | |
514 vst1q_f32(destP + 4, result1); | |
515 vst1q_f32(destP + 8, result2); | |
516 vst1q_f32(destP + 12, result3); | |
517 | |
518 source1P += 16; | |
519 source2P += 16; | |
520 destP += 16; | |
461 } | 521 } |
462 n = tailFrames; | 522 n = tailFrames; |
463 } | 523 } |
464 #endif | 524 #endif |
465 while (n) { | 525 while (n) { |
466 *destP = *source1P * *source2P; | 526 *destP = *source1P * *source2P; |
467 source1P += sourceStride1; | 527 source1P += sourceStride1; |
468 source2P += sourceStride2; | 528 source2P += sourceStride2; |
469 destP += destStride; | 529 destP += destStride; |
470 n--; | 530 n--; |
(...skipping 85 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
556 } | 616 } |
557 | 617 |
558 // Summarize the SSE results. | 618 // Summarize the SSE results. |
559 const float* groupSumP = reinterpret_cast<float*>(&mSum); | 619 const float* groupSumP = reinterpret_cast<float*>(&mSum); |
560 sum += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3]; | 620 sum += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3]; |
561 | 621 |
562 n = tailFrames; | 622 n = tailFrames; |
563 } | 623 } |
564 #elif HAVE(ARM_NEON_INTRINSICS) | 624 #elif HAVE(ARM_NEON_INTRINSICS) |
565 if (sourceStride == 1) { | 625 if (sourceStride == 1) { |
566 int tailFrames = n % 4; | 626 unsigned tailFrames = n & 15; |
567 const float* endP = sourceP + n - tailFrames; | |
568 | 627 |
569 float32x4_t fourSum = vdupq_n_f32(0); | 628 float32x4_t result0 = vdupq_n_f32(0); |
570 while (sourceP < endP) { | 629 float32x4_t result1 = result0; |
571 float32x4_t source = vld1q_f32(sourceP); | 630 float32x4_t result2 = result0; |
572 fourSum = vmlaq_f32(fourSum, source, source); | 631 float32x4_t result3 = result0; |
573 sourceP += 4; | 632 |
633 for (unsigned loopCount = (n >> 4); loopCount > 0; loopCount--) { | |
634 float32x4_t source0 = vld1q_f32(sourceP); | |
635 float32x4_t source1 = vld1q_f32(sourceP + 4); | |
636 float32x4_t source2 = vld1q_f32(sourceP + 8); | |
637 float32x4_t source3 = vld1q_f32(sourceP + 12); | |
638 | |
639 result0 = vmlaq_f32(result0, source0, source0); | |
640 result1 = vmlaq_f32(result1, source1, source1); | |
641 result2 = vmlaq_f32(result2, source2, source2); | |
642 result3 = vmlaq_f32(result3, source3, source3); | |
643 | |
644 sourceP += 16; | |
574 } | 645 } |
575 float32x2_t twoSum = vadd_f32(vget_low_f32(fourSum), vget_high_f32(fourS um)); | |
576 | 646 |
577 float groupSum[2]; | 647 result0 = vaddq_f32(result0, result1); |
578 vst1_f32(groupSum, twoSum); | 648 result0 = vaddq_f32(result0, result2); |
579 sum += groupSum[0] + groupSum[1]; | 649 result0 = vaddq_f32(result0, result3); |
650 | |
651 sum += vgetq_lane_f32(result0, 0); | |
652 sum += vgetq_lane_f32(result0, 1); | |
653 sum += vgetq_lane_f32(result0, 2); | |
654 sum += vgetq_lane_f32(result0, 3); | |
580 | 655 |
581 n = tailFrames; | 656 n = tailFrames; |
582 } | 657 } |
583 #endif | 658 #endif |
584 | 659 |
585 while (n--) { | 660 while (n--) { |
586 float sample = *sourceP; | 661 float sample = *sourceP; |
587 sum += sample * sample; | 662 sum += sample * sample; |
588 sourceP += sourceStride; | 663 sourceP += sourceStride; |
589 } | 664 } |
(...skipping 101 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
691 } | 766 } |
692 } | 767 } |
693 | 768 |
694 #endif // OS(MACOSX) | 769 #endif // OS(MACOSX) |
695 | 770 |
696 } // namespace VectorMath | 771 } // namespace VectorMath |
697 | 772 |
698 } // namespace WebCore | 773 } // namespace WebCore |
699 | 774 |
700 #endif // ENABLE(WEB_AUDIO) | 775 #endif // ENABLE(WEB_AUDIO) |
OLD | NEW |