Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1354)

Side by Side Diff: third_party/WebKit/Source/platform/audio/VectorMath.cpp

Issue 2340583003: Add MSA (MIPS SIMD Arch) optimized VectorMath functions (Closed)
Patch Set: Incorporate review comments Created 4 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | third_party/WebKit/Source/platform/cpu/mips/CommonMacrosMSA.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright (C) 2010, Google Inc. All rights reserved. 2 * Copyright (C) 2010, Google Inc. All rights reserved.
3 * 3 *
4 * Redistribution and use in source and binary forms, with or without 4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions 5 * modification, are permitted provided that the following conditions
6 * are met: 6 * are met:
7 * 1. Redistributions of source code must retain the above copyright 7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer. 8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright 9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the 10 * notice, this list of conditions and the following disclaimer in the
(...skipping 23 matching lines...) Expand all
34 #endif 34 #endif
35 35
36 #if CPU(X86) || CPU(X86_64) 36 #if CPU(X86) || CPU(X86_64)
37 #include <emmintrin.h> 37 #include <emmintrin.h>
38 #endif 38 #endif
39 39
40 #if HAVE(ARM_NEON_INTRINSICS) 40 #if HAVE(ARM_NEON_INTRINSICS)
41 #include <arm_neon.h> 41 #include <arm_neon.h>
42 #endif 42 #endif
43 43
44 #if HAVE(MIPS_MSA_INTRINSICS)
45 #include "platform/cpu/mips/CommonMacrosMSA.h"
46 #endif
47
44 #include <math.h> 48 #include <math.h>
45 #include <algorithm> 49 #include <algorithm>
46 50
47 namespace blink { 51 namespace blink {
48 52
49 namespace VectorMath { 53 namespace VectorMath {
50 54
51 #if OS(MACOSX) 55 #if OS(MACOSX)
52 // On the Mac we use the highly optimized versions in Accelerate.framework 56 // On the Mac we use the highly optimized versions in Accelerate.framework
53 // In 32-bit mode (__ppc__ or __i386__) <Accelerate/Accelerate.h> includes 57 // In 32-bit mode (__ppc__ or __i386__) <Accelerate/Accelerate.h> includes
(...skipping 168 matching lines...) Expand 10 before | Expand all | Expand 10 after
222 float32x4_t dest = vld1q_f32(destP); 226 float32x4_t dest = vld1q_f32(destP);
223 227
224 dest = vmlaq_f32(dest, source, k); 228 dest = vmlaq_f32(dest, source, k);
225 vst1q_f32(destP, dest); 229 vst1q_f32(destP, dest);
226 230
227 sourceP += 4; 231 sourceP += 4;
228 destP += 4; 232 destP += 4;
229 } 233 }
230 n = tailFrames; 234 n = tailFrames;
231 } 235 }
236 #elif HAVE(MIPS_MSA_INTRINSICS)
237 if ((sourceStride == 1) && (destStride == 1)) {
238 float* destPCopy = destP;
239 const v4f32 vScale = (v4f32)__msa_fill_w(*((int32_t*)scale));
240 v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7;
241 v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7;
242
243 for (; n >= 32; n -= 32) {
244 LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6,
245 vSrc7);
246 LD_SP8(destPCopy, 4, vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6,
247 vDst7);
248 VSMA4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vScale);
249 VSMA4(vSrc4, vSrc5, vSrc6, vSrc7, vDst4, vDst5, vDst6, vDst7, vScale);
250 ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP, 4);
251 }
252 }
232 #endif 253 #endif
233 while (n) { 254 while (n) {
234 *destP += *sourceP * *scale; 255 *destP += *sourceP * *scale;
235 sourceP += sourceStride; 256 sourceP += sourceStride;
236 destP += destStride; 257 destP += destStride;
237 n--; 258 n--;
238 } 259 }
239 } 260 }
240 261
241 void vsmul(const float* sourceP, 262 void vsmul(const float* sourceP,
(...skipping 61 matching lines...) Expand 10 before | Expand all | Expand 10 after
303 324
304 while (destP < endP) { 325 while (destP < endP) {
305 float32x4_t source = vld1q_f32(sourceP); 326 float32x4_t source = vld1q_f32(sourceP);
306 vst1q_f32(destP, vmulq_n_f32(source, k)); 327 vst1q_f32(destP, vmulq_n_f32(source, k));
307 328
308 sourceP += 4; 329 sourceP += 4;
309 destP += 4; 330 destP += 4;
310 } 331 }
311 n = tailFrames; 332 n = tailFrames;
312 } 333 }
334 #elif HAVE(MIPS_MSA_INTRINSICS)
335 if ((sourceStride == 1) && (destStride == 1)) {
336 const v4f32 vScale = (v4f32)__msa_fill_w(*((int32_t*)scale));
337 v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7;
338 v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7;
339
340 for (; n >= 32; n -= 32) {
341 LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6,
342 vSrc7);
343 VSMUL4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vScale);
344 VSMUL4(vSrc4, vSrc5, vSrc6, vSrc7, vDst4, vDst5, vDst6, vDst7, vScale);
345 ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP, 4);
346 }
347 }
313 #endif 348 #endif
314 float k = *scale; 349 float k = *scale;
315 while (n--) { 350 while (n--) {
316 *destP = k * *sourceP; 351 *destP = k * *sourceP;
317 sourceP += sourceStride; 352 sourceP += sourceStride;
318 destP += destStride; 353 destP += destStride;
319 } 354 }
320 #if CPU(X86) || CPU(X86_64) 355 #if CPU(X86) || CPU(X86_64)
321 } 356 }
322 #endif 357 #endif
(...skipping 101 matching lines...) Expand 10 before | Expand all | Expand 10 after
424 float32x4_t source1 = vld1q_f32(source1P); 459 float32x4_t source1 = vld1q_f32(source1P);
425 float32x4_t source2 = vld1q_f32(source2P); 460 float32x4_t source2 = vld1q_f32(source2P);
426 vst1q_f32(destP, vaddq_f32(source1, source2)); 461 vst1q_f32(destP, vaddq_f32(source1, source2));
427 462
428 source1P += 4; 463 source1P += 4;
429 source2P += 4; 464 source2P += 4;
430 destP += 4; 465 destP += 4;
431 } 466 }
432 n = tailFrames; 467 n = tailFrames;
433 } 468 }
469 #elif HAVE(MIPS_MSA_INTRINSICS)
470 if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
471 v4f32 vSrc1P0, vSrc1P1, vSrc1P2, vSrc1P3, vSrc1P4, vSrc1P5, vSrc1P6,
472 vSrc1P7;
473 v4f32 vSrc2P0, vSrc2P1, vSrc2P2, vSrc2P3, vSrc2P4, vSrc2P5, vSrc2P6,
474 vSrc2P7;
475 v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7;
476
477 for (; n >= 32; n -= 32) {
478 LD_SP8(source1P, 4, vSrc1P0, vSrc1P1, vSrc1P2, vSrc1P3, vSrc1P4, vSrc1P5,
479 vSrc1P6, vSrc1P7);
480 LD_SP8(source2P, 4, vSrc2P0, vSrc2P1, vSrc2P2, vSrc2P3, vSrc2P4, vSrc2P5,
481 vSrc2P6, vSrc2P7);
482 ADD4(vSrc1P0, vSrc2P0, vSrc1P1, vSrc2P1, vSrc1P2, vSrc2P2, vSrc1P3,
483 vSrc2P3, vDst0, vDst1, vDst2, vDst3);
484 ADD4(vSrc1P4, vSrc2P4, vSrc1P5, vSrc2P5, vSrc1P6, vSrc2P6, vSrc1P7,
485 vSrc2P7, vDst4, vDst5, vDst6, vDst7);
486 ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP, 4);
487 }
488 }
434 #endif 489 #endif
435 while (n--) { 490 while (n--) {
436 *destP = *source1P + *source2P; 491 *destP = *source1P + *source2P;
437 source1P += sourceStride1; 492 source1P += sourceStride1;
438 source2P += sourceStride2; 493 source2P += sourceStride2;
439 destP += destStride; 494 destP += destStride;
440 } 495 }
441 #if CPU(X86) || CPU(X86_64) 496 #if CPU(X86) || CPU(X86_64)
442 } 497 }
443 #endif 498 #endif
(...skipping 63 matching lines...) Expand 10 before | Expand all | Expand 10 after
507 float32x4_t source1 = vld1q_f32(source1P); 562 float32x4_t source1 = vld1q_f32(source1P);
508 float32x4_t source2 = vld1q_f32(source2P); 563 float32x4_t source2 = vld1q_f32(source2P);
509 vst1q_f32(destP, vmulq_f32(source1, source2)); 564 vst1q_f32(destP, vmulq_f32(source1, source2));
510 565
511 source1P += 4; 566 source1P += 4;
512 source2P += 4; 567 source2P += 4;
513 destP += 4; 568 destP += 4;
514 } 569 }
515 n = tailFrames; 570 n = tailFrames;
516 } 571 }
572 #elif HAVE(MIPS_MSA_INTRINSICS)
573 if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
574 v4f32 vSrc1P0, vSrc1P1, vSrc1P2, vSrc1P3, vSrc1P4, vSrc1P5, vSrc1P6,
575 vSrc1P7;
576 v4f32 vSrc2P0, vSrc2P1, vSrc2P2, vSrc2P3, vSrc2P4, vSrc2P5, vSrc2P6,
577 vSrc2P7;
578 v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7;
579
580 for (; n >= 32; n -= 32) {
581 LD_SP8(source1P, 4, vSrc1P0, vSrc1P1, vSrc1P2, vSrc1P3, vSrc1P4, vSrc1P5,
582 vSrc1P6, vSrc1P7);
583 LD_SP8(source2P, 4, vSrc2P0, vSrc2P1, vSrc2P2, vSrc2P3, vSrc2P4, vSrc2P5,
584 vSrc2P6, vSrc2P7);
585 MUL4(vSrc1P0, vSrc2P0, vSrc1P1, vSrc2P1, vSrc1P2, vSrc2P2, vSrc1P3,
586 vSrc2P3, vDst0, vDst1, vDst2, vDst3);
587 MUL4(vSrc1P4, vSrc2P4, vSrc1P5, vSrc2P5, vSrc1P6, vSrc2P6, vSrc1P7,
588 vSrc2P7, vDst4, vDst5, vDst6, vDst7);
589 ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP, 4);
590 }
591 }
517 #endif 592 #endif
518 while (n) { 593 while (n) {
519 *destP = *source1P * *source2P; 594 *destP = *source1P * *source2P;
520 source1P += sourceStride1; 595 source1P += sourceStride1;
521 source2P += sourceStride2; 596 source2P += sourceStride2;
522 destP += destStride; 597 destP += destStride;
523 n--; 598 n--;
524 } 599 }
525 } 600 }
526 601
(...skipping 182 matching lines...) Expand 10 before | Expand all | Expand 10 after
709 } 784 }
710 float32x2_t twoMax = 785 float32x2_t twoMax =
711 vmax_f32(vget_low_f32(fourMax), vget_high_f32(fourMax)); 786 vmax_f32(vget_low_f32(fourMax), vget_high_f32(fourMax));
712 787
713 float groupMax[2]; 788 float groupMax[2];
714 vst1_f32(groupMax, twoMax); 789 vst1_f32(groupMax, twoMax);
715 max = std::max(groupMax[0], groupMax[1]); 790 max = std::max(groupMax[0], groupMax[1]);
716 791
717 n = tailFrames; 792 n = tailFrames;
718 } 793 }
794 #elif HAVE(MIPS_MSA_INTRINSICS)
795 if (sourceStride == 1) {
796 v4f32 vMax = {
797 0,
798 };
799 v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7;
800 const v16i8 vSignBitMask = (v16i8)__msa_fill_w(0x7FFFFFFF);
801
802 for (; n >= 32; n -= 32) {
803 LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6,
804 vSrc7);
805 AND_W4_SP(vSrc0, vSrc1, vSrc2, vSrc3, vSignBitMask);
806 VMAX_W4_SP(vSrc0, vSrc1, vSrc2, vSrc3, vMax);
807 AND_W4_SP(vSrc4, vSrc5, vSrc6, vSrc7, vSignBitMask);
808 VMAX_W4_SP(vSrc4, vSrc5, vSrc6, vSrc7, vMax);
809 }
810
811 max = std::max(max, vMax[0]);
812 max = std::max(max, vMax[1]);
813 max = std::max(max, vMax[2]);
814 max = std::max(max, vMax[3]);
815 }
719 #endif 816 #endif
720 817
721 while (n--) { 818 while (n--) {
722 max = std::max(max, fabsf(*sourceP)); 819 max = std::max(max, fabsf(*sourceP));
723 sourceP += sourceStride; 820 sourceP += sourceStride;
724 } 821 }
725 822
726 ASSERT(maxP); 823 ASSERT(maxP);
727 *maxP = max; 824 *maxP = max;
728 } 825 }
(...skipping 18 matching lines...) Expand all
747 float32x4_t low = vdupq_n_f32(lowThreshold); 844 float32x4_t low = vdupq_n_f32(lowThreshold);
748 float32x4_t high = vdupq_n_f32(highThreshold); 845 float32x4_t high = vdupq_n_f32(highThreshold);
749 while (destP < endP) { 846 while (destP < endP) {
750 float32x4_t source = vld1q_f32(sourceP); 847 float32x4_t source = vld1q_f32(sourceP);
751 vst1q_f32(destP, vmaxq_f32(vminq_f32(source, high), low)); 848 vst1q_f32(destP, vmaxq_f32(vminq_f32(source, high), low));
752 sourceP += 4; 849 sourceP += 4;
753 destP += 4; 850 destP += 4;
754 } 851 }
755 n = tailFrames; 852 n = tailFrames;
756 } 853 }
854 #elif HAVE(MIPS_MSA_INTRINSICS)
855 if ((sourceStride == 1) && (destStride == 1)) {
856 v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7;
857 v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7;
858 const v4f32 vLowThr = (v4f32)__msa_fill_w(*((int32_t*)lowThresholdP));
859 const v4f32 vHighThr = (v4f32)__msa_fill_w(*((int32_t*)highThresholdP));
Raymond Toy 2016/10/04 15:48:07 Use C++ casting here instead of C. However, is th
860
861 for (; n >= 32; n -= 32) {
862 LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6,
863 vSrc7);
864 VCLIP4(vSrc0, vSrc1, vSrc2, vSrc3, vLowThr, vHighThr, vDst0, vDst1, vDst2,
865 vDst3);
866 VCLIP4(vSrc4, vSrc5, vSrc6, vSrc7, vLowThr, vHighThr, vDst4, vDst5, vDst6,
867 vDst7);
868 ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP, 4);
869 }
870 }
757 #endif 871 #endif
758 while (n--) { 872 while (n--) {
759 *destP = clampTo(*sourceP, lowThreshold, highThreshold); 873 *destP = clampTo(*sourceP, lowThreshold, highThreshold);
760 sourceP += sourceStride; 874 sourceP += sourceStride;
761 destP += destStride; 875 destP += destStride;
762 } 876 }
763 } 877 }
764 878
765 #endif // OS(MACOSX) 879 #endif // OS(MACOSX)
766 880
767 } // namespace VectorMath 881 } // namespace VectorMath
768 882
769 } // namespace blink 883 } // namespace blink
OLDNEW
« no previous file with comments | « no previous file | third_party/WebKit/Source/platform/cpu/mips/CommonMacrosMSA.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698