third_party/WebKit/Source/platform/audio/VectorMath.cpp - Issue 2340583003: Add MSA (MIPS SIMD Arch) optimized VectorMath functions

Side by Side Diff: third_party/WebKit/Source/platform/audio/VectorMath.cpp

Issue 2340583003: Add MSA (MIPS SIMD Arch) optimized VectorMath functions (Closed)

Patch Set: Incorporate review comments Created 4 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 /*	1 /*

2 * Copyright (C) 2010, Google Inc. All rights reserved.	2 * Copyright (C) 2010, Google Inc. All rights reserved.

3 *	3 *

4 * Redistribution and use in source and binary forms, with or without	4 * Redistribution and use in source and binary forms, with or without

5 * modification, are permitted provided that the following conditions	5 * modification, are permitted provided that the following conditions

6 * are met:	6 * are met:

7 * 1. Redistributions of source code must retain the above copyright	7 * 1. Redistributions of source code must retain the above copyright

8 * notice, this list of conditions and the following disclaimer.	8 * notice, this list of conditions and the following disclaimer.

9 * 2. Redistributions in binary form must reproduce the above copyright	9 * 2. Redistributions in binary form must reproduce the above copyright

10 * notice, this list of conditions and the following disclaimer in the	10 * notice, this list of conditions and the following disclaimer in the

(...skipping 23 matching lines...) Expand all Loading...
34 #endif	34 #endif

35	35

36 #if CPU(X86) \|\| CPU(X86_64)	36 #if CPU(X86) \|\| CPU(X86_64)

37 #include <emmintrin.h>	37 #include <emmintrin.h>

38 #endif	38 #endif

39	39

40 #if HAVE(ARM_NEON_INTRINSICS)	40 #if HAVE(ARM_NEON_INTRINSICS)

41 #include <arm_neon.h>	41 #include <arm_neon.h>

42 #endif	42 #endif

43	43

	44 #if HAVE(MIPS_MSA_INTRINSICS)

	45 #include "platform/cpu/mips/CommonMacrosMSA.h"

	46 #endif

	47

44 #include <math.h>	48 #include <math.h>

45 #include <algorithm>	49 #include <algorithm>

46	50

47 namespace blink {	51 namespace blink {

48	52

49 namespace VectorMath {	53 namespace VectorMath {

50	54

51 #if OS(MACOSX)	55 #if OS(MACOSX)

52 // On the Mac we use the highly optimized versions in Accelerate.framework	56 // On the Mac we use the highly optimized versions in Accelerate.framework

53 // In 32-bit mode (__ppc__ or __i386__) <Accelerate/Accelerate.h> includes	57 // In 32-bit mode (__ppc__ or __i386__) <Accelerate/Accelerate.h> includes

(...skipping 168 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
222 float32x4_t dest = vld1q_f32(destP);	226 float32x4_t dest = vld1q_f32(destP);

223	227

224 dest = vmlaq_f32(dest, source, k);	228 dest = vmlaq_f32(dest, source, k);

225 vst1q_f32(destP, dest);	229 vst1q_f32(destP, dest);

226	230

227 sourceP += 4;	231 sourceP += 4;

228 destP += 4;	232 destP += 4;

229 }	233 }

230 n = tailFrames;	234 n = tailFrames;

231 }	235 }

	236 #elif HAVE(MIPS_MSA_INTRINSICS)

	237 if ((sourceStride == 1) && (destStride == 1)) {

	238 float* destPCopy = destP;

	239 const v4f32 vScale = (v4f32)__msa_fill_w(((int32_t)scale));

	240 v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7;

	241 v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7;

	242

	243 for (; n >= 32; n -= 32) {

	244 LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6,

	245 vSrc7);

	246 LD_SP8(destPCopy, 4, vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6,

	247 vDst7);

	248 VSMA4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vScale);

	249 VSMA4(vSrc4, vSrc5, vSrc6, vSrc7, vDst4, vDst5, vDst6, vDst7, vScale);

	250 ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP, 4);

	251 }

	252 }

232 #endif	253 #endif

233 while (n) {	254 while (n) {

234 destP += sourceP * *scale;	255 destP += sourceP * *scale;

235 sourceP += sourceStride;	256 sourceP += sourceStride;

236 destP += destStride;	257 destP += destStride;

237 n--;	258 n--;

238 }	259 }

239 }	260 }

240	261

241 void vsmul(const float* sourceP,	262 void vsmul(const float* sourceP,

(...skipping 61 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
303	324

304 while (destP < endP) {	325 while (destP < endP) {

305 float32x4_t source = vld1q_f32(sourceP);	326 float32x4_t source = vld1q_f32(sourceP);

306 vst1q_f32(destP, vmulq_n_f32(source, k));	327 vst1q_f32(destP, vmulq_n_f32(source, k));

307	328

308 sourceP += 4;	329 sourceP += 4;

309 destP += 4;	330 destP += 4;

310 }	331 }

311 n = tailFrames;	332 n = tailFrames;

312 }	333 }

	334 #elif HAVE(MIPS_MSA_INTRINSICS)

	335 if ((sourceStride == 1) && (destStride == 1)) {

	336 const v4f32 vScale = (v4f32)__msa_fill_w(((int32_t)scale));

	337 v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7;

	338 v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7;

	339

	340 for (; n >= 32; n -= 32) {

	341 LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6,

	342 vSrc7);

	343 VSMUL4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vScale);

	344 VSMUL4(vSrc4, vSrc5, vSrc6, vSrc7, vDst4, vDst5, vDst6, vDst7, vScale);

	345 ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP, 4);

	346 }

	347 }

313 #endif	348 #endif

314 float k = *scale;	349 float k = *scale;

315 while (n--) {	350 while (n--) {

316 destP = k *sourceP;	351 destP = k *sourceP;

317 sourceP += sourceStride;	352 sourceP += sourceStride;

318 destP += destStride;	353 destP += destStride;

319 }	354 }

320 #if CPU(X86) \|\| CPU(X86_64)	355 #if CPU(X86) \|\| CPU(X86_64)

321 }	356 }

322 #endif	357 #endif

(...skipping 101 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
424 float32x4_t source1 = vld1q_f32(source1P);	459 float32x4_t source1 = vld1q_f32(source1P);

425 float32x4_t source2 = vld1q_f32(source2P);	460 float32x4_t source2 = vld1q_f32(source2P);

426 vst1q_f32(destP, vaddq_f32(source1, source2));	461 vst1q_f32(destP, vaddq_f32(source1, source2));

427	462

428 source1P += 4;	463 source1P += 4;

429 source2P += 4;	464 source2P += 4;

430 destP += 4;	465 destP += 4;

431 }	466 }

432 n = tailFrames;	467 n = tailFrames;

433 }	468 }

	469 #elif HAVE(MIPS_MSA_INTRINSICS)

	470 if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {

	471 v4f32 vSrc1P0, vSrc1P1, vSrc1P2, vSrc1P3, vSrc1P4, vSrc1P5, vSrc1P6,

	472 vSrc1P7;

	473 v4f32 vSrc2P0, vSrc2P1, vSrc2P2, vSrc2P3, vSrc2P4, vSrc2P5, vSrc2P6,

	474 vSrc2P7;

	475 v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7;

	476

	477 for (; n >= 32; n -= 32) {

	478 LD_SP8(source1P, 4, vSrc1P0, vSrc1P1, vSrc1P2, vSrc1P3, vSrc1P4, vSrc1P5,

	479 vSrc1P6, vSrc1P7);

	480 LD_SP8(source2P, 4, vSrc2P0, vSrc2P1, vSrc2P2, vSrc2P3, vSrc2P4, vSrc2P5,

	481 vSrc2P6, vSrc2P7);

	482 ADD4(vSrc1P0, vSrc2P0, vSrc1P1, vSrc2P1, vSrc1P2, vSrc2P2, vSrc1P3,

	483 vSrc2P3, vDst0, vDst1, vDst2, vDst3);

	484 ADD4(vSrc1P4, vSrc2P4, vSrc1P5, vSrc2P5, vSrc1P6, vSrc2P6, vSrc1P7,

	485 vSrc2P7, vDst4, vDst5, vDst6, vDst7);

	486 ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP, 4);

	487 }

	488 }

434 #endif	489 #endif

435 while (n--) {	490 while (n--) {

436 destP = source1P + *source2P;	491 destP = source1P + *source2P;

437 source1P += sourceStride1;	492 source1P += sourceStride1;

438 source2P += sourceStride2;	493 source2P += sourceStride2;

439 destP += destStride;	494 destP += destStride;

440 }	495 }

441 #if CPU(X86) \|\| CPU(X86_64)	496 #if CPU(X86) \|\| CPU(X86_64)

442 }	497 }

443 #endif	498 #endif

(...skipping 63 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
507 float32x4_t source1 = vld1q_f32(source1P);	562 float32x4_t source1 = vld1q_f32(source1P);

508 float32x4_t source2 = vld1q_f32(source2P);	563 float32x4_t source2 = vld1q_f32(source2P);

509 vst1q_f32(destP, vmulq_f32(source1, source2));	564 vst1q_f32(destP, vmulq_f32(source1, source2));

510	565

511 source1P += 4;	566 source1P += 4;

512 source2P += 4;	567 source2P += 4;

513 destP += 4;	568 destP += 4;

514 }	569 }

515 n = tailFrames;	570 n = tailFrames;

516 }	571 }

	572 #elif HAVE(MIPS_MSA_INTRINSICS)

	573 if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {

	574 v4f32 vSrc1P0, vSrc1P1, vSrc1P2, vSrc1P3, vSrc1P4, vSrc1P5, vSrc1P6,

	575 vSrc1P7;

	576 v4f32 vSrc2P0, vSrc2P1, vSrc2P2, vSrc2P3, vSrc2P4, vSrc2P5, vSrc2P6,

	577 vSrc2P7;

	578 v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7;

	579

	580 for (; n >= 32; n -= 32) {

	581 LD_SP8(source1P, 4, vSrc1P0, vSrc1P1, vSrc1P2, vSrc1P3, vSrc1P4, vSrc1P5,

	582 vSrc1P6, vSrc1P7);

	583 LD_SP8(source2P, 4, vSrc2P0, vSrc2P1, vSrc2P2, vSrc2P3, vSrc2P4, vSrc2P5,

	584 vSrc2P6, vSrc2P7);

	585 MUL4(vSrc1P0, vSrc2P0, vSrc1P1, vSrc2P1, vSrc1P2, vSrc2P2, vSrc1P3,

	586 vSrc2P3, vDst0, vDst1, vDst2, vDst3);

	587 MUL4(vSrc1P4, vSrc2P4, vSrc1P5, vSrc2P5, vSrc1P6, vSrc2P6, vSrc1P7,

	588 vSrc2P7, vDst4, vDst5, vDst6, vDst7);

	589 ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP, 4);

	590 }

	591 }

517 #endif	592 #endif

518 while (n) {	593 while (n) {

519 destP = source1P * *source2P;	594 destP = source1P * *source2P;

520 source1P += sourceStride1;	595 source1P += sourceStride1;

521 source2P += sourceStride2;	596 source2P += sourceStride2;

522 destP += destStride;	597 destP += destStride;

523 n--;	598 n--;

524 }	599 }

525 }	600 }

526	601

(...skipping 182 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
709 }	784 }

710 float32x2_t twoMax =	785 float32x2_t twoMax =

711 vmax_f32(vget_low_f32(fourMax), vget_high_f32(fourMax));	786 vmax_f32(vget_low_f32(fourMax), vget_high_f32(fourMax));

712	787

713 float groupMax[2];	788 float groupMax[2];

714 vst1_f32(groupMax, twoMax);	789 vst1_f32(groupMax, twoMax);

715 max = std::max(groupMax[0], groupMax[1]);	790 max = std::max(groupMax[0], groupMax[1]);

716	791

717 n = tailFrames;	792 n = tailFrames;

718 }	793 }

	794 #elif HAVE(MIPS_MSA_INTRINSICS)

	795 if (sourceStride == 1) {

	796 v4f32 vMax = {

	797 0,

	798 };

	799 v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7;

	800 const v16i8 vSignBitMask = (v16i8)__msa_fill_w(0x7FFFFFFF);

	801

	802 for (; n >= 32; n -= 32) {

	803 LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6,

	804 vSrc7);

	805 AND_W4_SP(vSrc0, vSrc1, vSrc2, vSrc3, vSignBitMask);

	806 VMAX_W4_SP(vSrc0, vSrc1, vSrc2, vSrc3, vMax);

	807 AND_W4_SP(vSrc4, vSrc5, vSrc6, vSrc7, vSignBitMask);

	808 VMAX_W4_SP(vSrc4, vSrc5, vSrc6, vSrc7, vMax);

	809 }

	810

	811 max = std::max(max, vMax[0]);

	812 max = std::max(max, vMax[1]);

	813 max = std::max(max, vMax[2]);

	814 max = std::max(max, vMax[3]);

	815 }

719 #endif	816 #endif

720	817

721 while (n--) {	818 while (n--) {

722 max = std::max(max, fabsf(*sourceP));	819 max = std::max(max, fabsf(*sourceP));

723 sourceP += sourceStride;	820 sourceP += sourceStride;

724 }	821 }

725	822

726 ASSERT(maxP);	823 ASSERT(maxP);

727 *maxP = max;	824 *maxP = max;

728 }	825 }

(...skipping 18 matching lines...) Expand all Loading...
747 float32x4_t low = vdupq_n_f32(lowThreshold);	844 float32x4_t low = vdupq_n_f32(lowThreshold);

748 float32x4_t high = vdupq_n_f32(highThreshold);	845 float32x4_t high = vdupq_n_f32(highThreshold);

749 while (destP < endP) {	846 while (destP < endP) {

750 float32x4_t source = vld1q_f32(sourceP);	847 float32x4_t source = vld1q_f32(sourceP);

751 vst1q_f32(destP, vmaxq_f32(vminq_f32(source, high), low));	848 vst1q_f32(destP, vmaxq_f32(vminq_f32(source, high), low));

752 sourceP += 4;	849 sourceP += 4;

753 destP += 4;	850 destP += 4;

754 }	851 }

755 n = tailFrames;	852 n = tailFrames;

756 }	853 }

	854 #elif HAVE(MIPS_MSA_INTRINSICS)

	855 if ((sourceStride == 1) && (destStride == 1)) {

	856 v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7;

	857 v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7;

	858 const v4f32 vLowThr = (v4f32)__msa_fill_w(((int32_t)lowThresholdP));

	859 const v4f32 vHighThr = (v4f32)__msa_fill_w(((int32_t)highThresholdP));
	Raymond Toy 2016/10/04 15:48:07 Use C++ casting here instead of C. However, is th Use C++ casting here instead of C. However, is this guaranteed to do the right thing with this kind of type punning? I always forget the complicated rules. I always use a union of int and float for this kind of punning.
	860

	861 for (; n >= 32; n -= 32) {

	862 LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6,

	863 vSrc7);

	864 VCLIP4(vSrc0, vSrc1, vSrc2, vSrc3, vLowThr, vHighThr, vDst0, vDst1, vDst2,

	865 vDst3);

	866 VCLIP4(vSrc4, vSrc5, vSrc6, vSrc7, vLowThr, vHighThr, vDst4, vDst5, vDst6,

	867 vDst7);

	868 ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP, 4);

	869 }

	870 }

757 #endif	871 #endif

758 while (n--) {	872 while (n--) {

759 destP = clampTo(sourceP, lowThreshold, highThreshold);	873 destP = clampTo(sourceP, lowThreshold, highThreshold);

760 sourceP += sourceStride;	874 sourceP += sourceStride;

761 destP += destStride;	875 destP += destStride;

762 }	876 }

763 }	877 }

764	878

765 #endif // OS(MACOSX)	879 #endif // OS(MACOSX)

766	880

767 } // namespace VectorMath	881 } // namespace VectorMath

768	882

769 } // namespace blink	883 } // namespace blink

OLD	NEW

« no previous file with comments | « no previous file | third_party/WebKit/Source/platform/cpu/mips/CommonMacrosMSA.h » ('j') | no next file with comments »