OLD | NEW |
1 /* | 1 /* |
2 * Copyright (C) 2010, Google Inc. All rights reserved. | 2 * Copyright (C) 2010, Google Inc. All rights reserved. |
3 * | 3 * |
4 * Redistribution and use in source and binary forms, with or without | 4 * Redistribution and use in source and binary forms, with or without |
5 * modification, are permitted provided that the following conditions | 5 * modification, are permitted provided that the following conditions |
6 * are met: | 6 * are met: |
7 * 1. Redistributions of source code must retain the above copyright | 7 * 1. Redistributions of source code must retain the above copyright |
8 * notice, this list of conditions and the following disclaimer. | 8 * notice, this list of conditions and the following disclaimer. |
9 * 2. Redistributions in binary form must reproduce the above copyright | 9 * 2. Redistributions in binary form must reproduce the above copyright |
10 * notice, this list of conditions and the following disclaimer in the | 10 * notice, this list of conditions and the following disclaimer in the |
11 * documentation and/or other materials provided with the distribution. | 11 * documentation and/or other materials provided with the distribution. |
12 * | 12 * |
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' AND ANY | 13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' AND |
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | 14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
15 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
16 * DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE FOR ANY | 16 * ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE |
17 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES | 17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
18 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | 18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
19 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON | 19 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
20 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | 20 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
21 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | 21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
22 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH |
| 23 * DAMAGE. |
23 */ | 24 */ |
24 | 25 |
25 #include "platform/audio/VectorMath.h" | 26 #include "platform/audio/VectorMath.h" |
26 #include "wtf/Assertions.h" | 27 #include "wtf/Assertions.h" |
27 #include "wtf/CPU.h" | 28 #include "wtf/CPU.h" |
28 #include "wtf/MathExtras.h" | 29 #include "wtf/MathExtras.h" |
29 #include <stdint.h> | 30 #include <stdint.h> |
30 | 31 |
31 #if OS(MACOSX) | 32 #if OS(MACOSX) |
32 #include <Accelerate/Accelerate.h> | 33 #include <Accelerate/Accelerate.h> |
33 #endif | 34 #endif |
34 | 35 |
35 #if CPU(X86) || CPU(X86_64) | 36 #if CPU(X86) || CPU(X86_64) |
36 #include <emmintrin.h> | 37 #include <emmintrin.h> |
37 #endif | 38 #endif |
38 | 39 |
39 #if HAVE(ARM_NEON_INTRINSICS) | 40 #if HAVE(ARM_NEON_INTRINSICS) |
40 #include <arm_neon.h> | 41 #include <arm_neon.h> |
41 #endif | 42 #endif |
42 | 43 |
43 #include <math.h> | 44 #include <math.h> |
44 #include <algorithm> | 45 #include <algorithm> |
45 | 46 |
46 namespace blink { | 47 namespace blink { |
47 | 48 |
48 namespace VectorMath { | 49 namespace VectorMath { |
49 | 50 |
50 #if OS(MACOSX) | 51 #if OS(MACOSX) |
51 // On the Mac we use the highly optimized versions in Accelerate.framework | 52 // On the Mac we use the highly optimized versions in Accelerate.framework |
52 * In 32-bit mode (__ppc__ or __i386__) <Accelerate/Accelerate.h> includes <vecLib/vDSP_translate.h> which defines macros of the same name as | 53 * In 32-bit mode (__ppc__ or __i386__) <Accelerate/Accelerate.h> includes |
53 * our namespaced function names, so we must handle this case differently. Other architectures (64bit, ARM, etc.) do not include this header file. | 54 * <vecLib/vDSP_translate.h> which defines macros of the same name as |
| 55 * our namespaced function names, so we must handle this case differently. Other |
| 56 * architectures (64bit, ARM, etc.) do not include this header file. |
54 | 57 |
55 void vsmul(const float* sourceP, | 58 void vsmul(const float* sourceP, |
56 int sourceStride, | 59 int sourceStride, |
57 const float* scale, | 60 const float* scale, |
58 float* destP, | 61 float* destP, |
59 int destStride, | 62 int destStride, |
60 size_t framesToProcess) { | 63 size_t framesToProcess) { |
61 #if CPU(X86) | 64 #if CPU(X86) |
62 ::vsmul(sourceP, sourceStride, scale, destP, destStride, framesToProcess); | 65 ::vsmul(sourceP, sourceStride, scale, destP, destStride, framesToProcess); |
63 #else | 66 #else |
(...skipping 99 matching lines...) |
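Note on the chunk above: whichever backend ends up selected (the Accelerate ::vsmul call in the X86 branch, or the paths in the elided lines), vsmul() is a strided scale-and-copy with strides given in floats. A minimal scalar sketch of that contract, shown only to make the semantics explicit (the helper name vsmulScalar is hypothetical, not part of the file):

    static void vsmulScalar(const float* sourceP, int sourceStride,
                            const float* scale, float* destP, int destStride,
                            size_t framesToProcess) {
      const float k = *scale;  // single scale factor applied to every frame
      for (size_t i = 0; i < framesToProcess; ++i)
        destP[i * destStride] = k * sourceP[i * sourceStride];
    }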
163 const float* scale, | 166 const float* scale, |
164 float* destP, | 167 float* destP, |
165 int destStride, | 168 int destStride, |
166 size_t framesToProcess) { | 169 size_t framesToProcess) { |
167 int n = framesToProcess; | 170 int n = framesToProcess; |
168 | 171 |
169 #if CPU(X86) || CPU(X86_64) | 172 #if CPU(X86) || CPU(X86_64) |
170 if ((sourceStride == 1) && (destStride == 1)) { | 173 if ((sourceStride == 1) && (destStride == 1)) { |
171 float k = *scale; | 174 float k = *scale; |
172 | 175 |
173 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. | 176 // If the sourceP address is not 16-byte aligned, the first several frames |
| 177 // (at most three) should be processed separately. |
174 while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) { | 178 while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) { |
175 *destP += k * *sourceP; | 179 *destP += k * *sourceP; |
176 sourceP++; | 180 sourceP++; |
177 destP++; | 181 destP++; |
178 n--; | 182 n--; |
179 } | 183 } |
180 | 184 |
181 // Now the sourceP is aligned, use SSE. | 185 // Now the sourceP is aligned, use SSE. |
182 int tailFrames = n % 4; | 186 int tailFrames = n % 4; |
183 const float* endP = destP + n - tailFrames; | 187 const float* endP = destP + n - tailFrames; |
(...skipping 55 matching lines...) |
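The elided SSE body for this function (dest += scale * source) follows the head/body/tail pattern the comments describe. A hedged, self-contained sketch of that pattern for the stride-1 case, not the verbatim elided code; it relies on <emmintrin.h> and <stdint.h>, which the file already includes, and accesses destP unaligned for simplicity where the real code may special-case it:

    static void vsmaSketchStride1(const float* sourceP, const float* scale,
                                  float* destP, size_t framesToProcess) {
      int n = framesToProcess;
      float k = *scale;
      // Head: scalar frames until sourceP reaches a 16-byte boundary (at most 3).
      while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
        *destP++ += k * *sourceP++;
        n--;
      }
      // Body: four frames per iteration, aligned loads from sourceP.
      __m128 mScale = _mm_set_ps1(k);
      const float* endP = sourceP + (n & ~3);
      while (sourceP < endP) {
        __m128 source = _mm_load_ps(sourceP);
        __m128 dest = _mm_loadu_ps(destP);
        _mm_storeu_ps(destP, _mm_add_ps(dest, _mm_mul_ps(source, mScale)));
        sourceP += 4;
        destP += 4;
      }
      // Tail: whatever frames are left over (n % 4).
      for (n &= 3; n; n--)
        *destP++ += k * *sourceP++;
    }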
239 const float* scale, | 243 const float* scale, |
240 float* destP, | 244 float* destP, |
241 int destStride, | 245 int destStride, |
242 size_t framesToProcess) { | 246 size_t framesToProcess) { |
243 int n = framesToProcess; | 247 int n = framesToProcess; |
244 | 248 |
245 #if CPU(X86) || CPU(X86_64) | 249 #if CPU(X86) || CPU(X86_64) |
246 if ((sourceStride == 1) && (destStride == 1)) { | 250 if ((sourceStride == 1) && (destStride == 1)) { |
247 float k = *scale; | 251 float k = *scale; |
248 | 252 |
249 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. | 253 // If the sourceP address is not 16-byte aligned, the first several frames |
| 254 // (at most three) should be processed separately. |
250 while ((reinterpret_cast<size_t>(sourceP) & 0x0F) && n) { | 255 while ((reinterpret_cast<size_t>(sourceP) & 0x0F) && n) { |
251 *destP = k * *sourceP; | 256 *destP = k * *sourceP; |
252 sourceP++; | 257 sourceP++; |
253 destP++; | 258 destP++; |
254 n--; | 259 n--; |
255 } | 260 } |
256 | 261 |
257 // Now the sourceP address is aligned and start to apply SSE. | 262 // Now the sourceP address is aligned and start to apply SSE. |
258 int group = n / 4; | 263 int group = n / 4; |
259 __m128 mScale = _mm_set_ps1(k); | 264 __m128 mScale = _mm_set_ps1(k); |
(...skipping 61 matching lines...) |
321 int sourceStride1, | 326 int sourceStride1, |
322 const float* source2P, | 327 const float* source2P, |
323 int sourceStride2, | 328 int sourceStride2, |
324 float* destP, | 329 float* destP, |
325 int destStride, | 330 int destStride, |
326 size_t framesToProcess) { | 331 size_t framesToProcess) { |
327 int n = framesToProcess; | 332 int n = framesToProcess; |
328 | 333 |
329 #if CPU(X86) || CPU(X86_64) | 334 #if CPU(X86) || CPU(X86_64) |
330 if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) { | 335 if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) { |
331 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. | 336 // If the sourceP address is not 16-byte aligned, the first several frames |
| 337 // (at most three) should be processed separately. |
332 while ((reinterpret_cast<size_t>(source1P) & 0x0F) && n) { | 338 while ((reinterpret_cast<size_t>(source1P) & 0x0F) && n) { |
333 *destP = *source1P + *source2P; | 339 *destP = *source1P + *source2P; |
334 source1P++; | 340 source1P++; |
335 source2P++; | 341 source2P++; |
336 destP++; | 342 destP++; |
337 n--; | 343 n--; |
338 } | 344 } |
339 | 345 |
340 // Now the source1P address is aligned and start to apply SSE. | 346 // Now the source1P address is aligned and start to apply SSE. |
341 int group = n / 4; | 347 int group = n / 4; |
(...skipping 99 matching lines...) |
441 int sourceStride1, | 447 int sourceStride1, |
442 const float* source2P, | 448 const float* source2P, |
443 int sourceStride2, | 449 int sourceStride2, |
444 float* destP, | 450 float* destP, |
445 int destStride, | 451 int destStride, |
446 size_t framesToProcess) { | 452 size_t framesToProcess) { |
447 int n = framesToProcess; | 453 int n = framesToProcess; |
448 | 454 |
449 #if CPU(X86) || CPU(X86_64) | 455 #if CPU(X86) || CPU(X86_64) |
450 if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) { | 456 if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) { |
451 // If the source1P address is not 16-byte aligned, the first several frames (at most three) should be processed separately. | 457 // If the source1P address is not 16-byte aligned, the first several frames |
| 458 // (at most three) should be processed separately. |
452 while ((reinterpret_cast<uintptr_t>(source1P) & 0x0F) && n) { | 459 while ((reinterpret_cast<uintptr_t>(source1P) & 0x0F) && n) { |
453 *destP = *source1P * *source2P; | 460 *destP = *source1P * *source2P; |
454 source1P++; | 461 source1P++; |
455 source2P++; | 462 source2P++; |
456 destP++; | 463 destP++; |
457 n--; | 464 n--; |
458 } | 465 } |
459 | 466 |
460 // Now the source1P address aligned and start to apply SSE. | 467 // Now the source1P address aligned and start to apply SSE. |
461 int tailFrames = n % 4; | 468 int tailFrames = n % 4; |
(...skipping 57 matching lines...) |
519 | 526 |
520 void zvmul(const float* real1P, | 527 void zvmul(const float* real1P, |
521 const float* imag1P, | 528 const float* imag1P, |
522 const float* real2P, | 529 const float* real2P, |
523 const float* imag2P, | 530 const float* imag2P, |
524 float* realDestP, | 531 float* realDestP, |
525 float* imagDestP, | 532 float* imagDestP, |
526 size_t framesToProcess) { | 533 size_t framesToProcess) { |
527 unsigned i = 0; | 534 unsigned i = 0; |
528 #if CPU(X86) || CPU(X86_64) | 535 #if CPU(X86) || CPU(X86_64) |
529 // Only use the SSE optimization in the very common case that all addresses are 16-byte aligned. | 536 // Only use the SSE optimization in the very common case that all addresses |
530 // Otherwise, fall through to the scalar code below. | 537 // are 16-byte aligned. Otherwise, fall through to the scalar code below. |
531 if (!(reinterpret_cast<uintptr_t>(real1P) & 0x0F) && | 538 if (!(reinterpret_cast<uintptr_t>(real1P) & 0x0F) && |
532 !(reinterpret_cast<uintptr_t>(imag1P) & 0x0F) && | 539 !(reinterpret_cast<uintptr_t>(imag1P) & 0x0F) && |
533 !(reinterpret_cast<uintptr_t>(real2P) & 0x0F) && | 540 !(reinterpret_cast<uintptr_t>(real2P) & 0x0F) && |
534 !(reinterpret_cast<uintptr_t>(imag2P) & 0x0F) && | 541 !(reinterpret_cast<uintptr_t>(imag2P) & 0x0F) && |
535 !(reinterpret_cast<uintptr_t>(realDestP) & 0x0F) && | 542 !(reinterpret_cast<uintptr_t>(realDestP) & 0x0F) && |
536 !(reinterpret_cast<uintptr_t>(imagDestP) & 0x0F)) { | 543 !(reinterpret_cast<uintptr_t>(imagDestP) & 0x0F)) { |
537 unsigned endSize = framesToProcess - framesToProcess % 4; | 544 unsigned endSize = framesToProcess - framesToProcess % 4; |
538 while (i < endSize) { | 545 while (i < endSize) { |
539 __m128 real1 = _mm_load_ps(real1P + i); | 546 __m128 real1 = _mm_load_ps(real1P + i); |
540 __m128 real2 = _mm_load_ps(real2P + i); | 547 __m128 real2 = _mm_load_ps(real2P + i); |
(...skipping 38 matching lines...) |
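For reference, the per-element arithmetic zvmul() computes, whether in the SSE lanes above or in a scalar loop, is the ordinary complex product (real1 + i*imag1)(real2 + i*imag2); temporaries matter because the destination arrays may alias the inputs. A scalar sketch of that arithmetic (loop shape is illustrative, not the verbatim elided code):

    for (size_t j = 0; j < framesToProcess; ++j) {
      float realResult = real1P[j] * real2P[j] - imag1P[j] * imag2P[j];
      float imagResult = real1P[j] * imag2P[j] + imag1P[j] * real2P[j];
      realDestP[j] = realResult;  // write after both reads, so in-place use is safe
      imagDestP[j] = imagResult;
    }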
579 | 586 |
580 void vsvesq(const float* sourceP, | 587 void vsvesq(const float* sourceP, |
581 int sourceStride, | 588 int sourceStride, |
582 float* sumP, | 589 float* sumP, |
583 size_t framesToProcess) { | 590 size_t framesToProcess) { |
584 int n = framesToProcess; | 591 int n = framesToProcess; |
585 float sum = 0; | 592 float sum = 0; |
586 | 593 |
587 #if CPU(X86) || CPU(X86_64) | 594 #if CPU(X86) || CPU(X86_64) |
588 if (sourceStride == 1) { | 595 if (sourceStride == 1) { |
589 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. | 596 // If the sourceP address is not 16-byte aligned, the first several frames |
| 597 // (at most three) should be processed separately. |
590 while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) { | 598 while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) { |
591 float sample = *sourceP; | 599 float sample = *sourceP; |
592 sum += sample * sample; | 600 sum += sample * sample; |
593 sourceP++; | 601 sourceP++; |
594 n--; | 602 n--; |
595 } | 603 } |
596 | 604 |
597 // Now the sourceP is aligned, use SSE. | 605 // Now the sourceP is aligned, use SSE. |
598 int tailFrames = n % 4; | 606 int tailFrames = n % 4; |
599 const float* endP = sourceP + n - tailFrames; | 607 const float* endP = sourceP + n - tailFrames; |
(...skipping 47 matching lines...) |
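A hedged sketch of the kind of SSE body the vsvesq() setup above calls for, not necessarily the exact elided code: accumulate squared samples in four lanes, then fold the lanes back into the scalar sum before the tailFrames samples are handled by the plain loop.

    __m128 mSum = _mm_setzero_ps();
    while (sourceP < endP) {
      __m128 source = _mm_load_ps(sourceP);  // aligned by the loop above
      mSum = _mm_add_ps(mSum, _mm_mul_ps(source, source));
      sourceP += 4;
    }
    float laneSums[4];
    _mm_storeu_ps(laneSums, mSum);           // spill the four partial sums
    sum += laneSums[0] + laneSums[1] + laneSums[2] + laneSums[3];
    // The remaining tailFrames samples then go through the scalar loop.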
647 | 655 |
648 void vmaxmgv(const float* sourceP, | 656 void vmaxmgv(const float* sourceP, |
649 int sourceStride, | 657 int sourceStride, |
650 float* maxP, | 658 float* maxP, |
651 size_t framesToProcess) { | 659 size_t framesToProcess) { |
652 int n = framesToProcess; | 660 int n = framesToProcess; |
653 float max = 0; | 661 float max = 0; |
654 | 662 |
655 #if CPU(X86) || CPU(X86_64) | 663 #if CPU(X86) || CPU(X86_64) |
656 if (sourceStride == 1) { | 664 if (sourceStride == 1) { |
657 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. | 665 // If the sourceP address is not 16-byte aligned, the first several frames |
| 666 // (at most three) should be processed separately. |
658 while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) { | 667 while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) { |
659 max = std::max(max, fabsf(*sourceP)); | 668 max = std::max(max, fabsf(*sourceP)); |
660 sourceP++; | 669 sourceP++; |
661 n--; | 670 n--; |
662 } | 671 } |
663 | 672 |
664 // Now the sourceP is aligned, use SSE. | 673 // Now the sourceP is aligned, use SSE. |
665 int tailFrames = n % 4; | 674 int tailFrames = n % 4; |
666 const float* endP = sourceP + n - tailFrames; | 675 const float* endP = sourceP + n - tailFrames; |
667 __m128 source; | 676 __m128 source; |
668 __m128 mMax = _mm_setzero_ps(); | 677 __m128 mMax = _mm_setzero_ps(); |
669 int mask = 0x7FFFFFFF; | 678 int mask = 0x7FFFFFFF; |
670 __m128 mMask = _mm_set1_ps(*reinterpret_cast<float*>(&mask)); | 679 __m128 mMask = _mm_set1_ps(*reinterpret_cast<float*>(&mask)); |
671 | 680 |
672 while (sourceP < endP) { | 681 while (sourceP < endP) { |
673 source = _mm_load_ps(sourceP); | 682 source = _mm_load_ps(sourceP); |
674 // Calculate the absolute value by anding source with mask, the sign bit is set to 0. | 683 // Calculate the absolute value by anding source with mask, the sign bit |
| 684 // is set to 0. |
675 source = _mm_and_ps(source, mMask); | 685 source = _mm_and_ps(source, mMask); |
676 mMax = _mm_max_ps(mMax, source); | 686 mMax = _mm_max_ps(mMax, source); |
677 sourceP += 4; | 687 sourceP += 4; |
678 } | 688 } |
679 | 689 |
680 // Get max from the SSE results. | 690 // Get max from the SSE results. |
681 const float* groupMaxP = reinterpret_cast<float*>(&mMax); | 691 const float* groupMaxP = reinterpret_cast<float*>(&mMax); |
682 max = std::max(max, groupMaxP[0]); | 692 max = std::max(max, groupMaxP[0]); |
683 max = std::max(max, groupMaxP[1]); | 693 max = std::max(max, groupMaxP[1]); |
684 max = std::max(max, groupMaxP[2]); | 694 max = std::max(max, groupMaxP[2]); |
(...skipping 65 matching lines...) |
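Why the 0x7FFFFFFF mask in the vmaxmgv() loop above behaves like fabsf(): an IEEE-754 single-precision float keeps its sign in the top bit, so clearing that bit leaves only the magnitude. A scalar sketch of the same trick; it uses memcpy for the type pun, so it would additionally need <string.h>, which this file does not include:

    static inline float absViaSignMask(float x) {
      uint32_t bits;
      memcpy(&bits, &x, sizeof bits);  // reinterpret the float's bit pattern
      bits &= 0x7FFFFFFFu;             // clear the sign bit only
      memcpy(&x, &bits, sizeof x);
      return x;                        // == fabsf(x), including for -0.0f
    }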
750 sourceP += sourceStride; | 760 sourceP += sourceStride; |
751 destP += destStride; | 761 destP += destStride; |
752 } | 762 } |
753 } | 763 } |
754 | 764 |
755 #endif // OS(MACOSX) | 765 #endif // OS(MACOSX) |
756 | 766 |
757 } // namespace VectorMath | 767 } // namespace VectorMath |
758 | 768 |
759 } // namespace blink | 769 } // namespace blink |