third_party/WebKit/Source/platform/audio/VectorMath.cpp - Issue 2340583003: Add MSA (MIPS SIMD Arch) optimized VectorMath functions

Side by Side Diff: third_party/WebKit/Source/platform/audio/VectorMath.cpp

Issue 2340583003: Add MSA (MIPS SIMD Arch) optimized VectorMath functions (Closed)

Patch Set: formatting changes Created 4 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 /*	1 /*

2 * Copyright (C) 2010, Google Inc. All rights reserved.	2 * Copyright (C) 2010, Google Inc. All rights reserved.

3 *	3 *

4 * Redistribution and use in source and binary forms, with or without	4 * Redistribution and use in source and binary forms, with or without

5 * modification, are permitted provided that the following conditions	5 * modification, are permitted provided that the following conditions

6 * are met:	6 * are met:

7 * 1. Redistributions of source code must retain the above copyright	7 * 1. Redistributions of source code must retain the above copyright

8 * notice, this list of conditions and the following disclaimer.	8 * notice, this list of conditions and the following disclaimer.

9 * 2. Redistributions in binary form must reproduce the above copyright	9 * 2. Redistributions in binary form must reproduce the above copyright

10 * notice, this list of conditions and the following disclaimer in the	10 * notice, this list of conditions and the following disclaimer in the

(...skipping 23 matching lines...) Expand all Loading...
34 #endif	34 #endif

35	35

36 #if CPU(X86) \|\| CPU(X86_64)	36 #if CPU(X86) \|\| CPU(X86_64)

37 #include <emmintrin.h>	37 #include <emmintrin.h>

38 #endif	38 #endif

39	39

40 #if HAVE(ARM_NEON_INTRINSICS)	40 #if HAVE(ARM_NEON_INTRINSICS)

41 #include <arm_neon.h>	41 #include <arm_neon.h>

42 #endif	42 #endif

43	43

	44 #if HAVE(MIPS_MSA_INTRINSICS)

	45 #include "platform/cpu/mips/CommonMacrosMSA.h"

	46 #endif

	47

44 #include <math.h>	48 #include <math.h>

45 #include <algorithm>	49 #include <algorithm>

46	50

47 namespace blink {	51 namespace blink {

48	52

49 namespace VectorMath {	53 namespace VectorMath {

50	54

51 #if OS(MACOSX)	55 #if OS(MACOSX)

52 // On the Mac we use the highly optimized versions in Accelerate.framework	56 // On the Mac we use the highly optimized versions in Accelerate.framework

53 // In 32-bit mode (__ppc__ or __i386__) <Accelerate/Accelerate.h> includes	57 // In 32-bit mode (__ppc__ or __i386__) <Accelerate/Accelerate.h> includes

(...skipping 168 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
222 float32x4_t dest = vld1q_f32(destP);	226 float32x4_t dest = vld1q_f32(destP);

223	227

224 dest = vmlaq_f32(dest, source, k);	228 dest = vmlaq_f32(dest, source, k);

225 vst1q_f32(destP, dest);	229 vst1q_f32(destP, dest);

226	230

227 sourceP += 4;	231 sourceP += 4;

228 destP += 4;	232 destP += 4;

229 }	233 }

230 n = tailFrames;	234 n = tailFrames;

231 }	235 }

	236 #elif HAVE(MIPS_MSA_INTRINSICS)

	237 if ((sourceStride == 1) && (destStride == 1)) {

	238 float* destPCopy = destP;

	239 v4f32 vScale;

	240 v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7;

	241 v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7;

	242 FloatInt scaleVal;

	243

	244 scaleVal.floatVal = *scale;

	245 vScale = (v4f32)__msa_fill_w(scaleVal.intVal);

	246

	247 for (; n >= 32; n -= 32) {

	248 LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6,

	249 vSrc7);

	250 LD_SP8(destPCopy, 4, vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6,

	251 vDst7);

	252 VSMA4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vScale);

	253 VSMA4(vSrc4, vSrc5, vSrc6, vSrc7, vDst4, vDst5, vDst6, vDst7, vScale);

	254 ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP, 4);

	255 }

	256 }

232 #endif	257 #endif

233 while (n) {	258 while (n) {

234 destP += sourceP * *scale;	259 destP += sourceP * *scale;

235 sourceP += sourceStride;	260 sourceP += sourceStride;

236 destP += destStride;	261 destP += destStride;

237 n--;	262 n--;

238 }	263 }

239 }	264 }

240	265

241 void vsmul(const float* sourceP,	266 void vsmul(const float* sourceP,

(...skipping 61 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
303	328

304 while (destP < endP) {	329 while (destP < endP) {

305 float32x4_t source = vld1q_f32(sourceP);	330 float32x4_t source = vld1q_f32(sourceP);

306 vst1q_f32(destP, vmulq_n_f32(source, k));	331 vst1q_f32(destP, vmulq_n_f32(source, k));

307	332

308 sourceP += 4;	333 sourceP += 4;

309 destP += 4;	334 destP += 4;

310 }	335 }

311 n = tailFrames;	336 n = tailFrames;

312 }	337 }

	338 #elif HAVE(MIPS_MSA_INTRINSICS)

	339 if ((sourceStride == 1) && (destStride == 1)) {

	340 v4f32 vScale;

	341 v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7;

	342 v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7;

	343 FloatInt scaleVal;

	344

	345 scaleVal.floatVal = *scale;

	346 vScale = (v4f32)__msa_fill_w(scaleVal.intVal);

	347

	348 for (; n >= 32; n -= 32) {

	349 LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6,

	350 vSrc7);

	351 VSMUL4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vScale);

	352 VSMUL4(vSrc4, vSrc5, vSrc6, vSrc7, vDst4, vDst5, vDst6, vDst7, vScale);

	353 ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP, 4);

	354 }

	355 }

313 #endif	356 #endif

314 float k = *scale;	357 float k = *scale;

315 while (n--) {	358 while (n--) {

316 destP = k *sourceP;	359 destP = k *sourceP;

317 sourceP += sourceStride;	360 sourceP += sourceStride;

318 destP += destStride;	361 destP += destStride;

319 }	362 }

320 #if CPU(X86) \|\| CPU(X86_64)	363 #if CPU(X86) \|\| CPU(X86_64)

321 }	364 }

322 #endif	365 #endif

(...skipping 101 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
424 float32x4_t source1 = vld1q_f32(source1P);	467 float32x4_t source1 = vld1q_f32(source1P);

425 float32x4_t source2 = vld1q_f32(source2P);	468 float32x4_t source2 = vld1q_f32(source2P);

426 vst1q_f32(destP, vaddq_f32(source1, source2));	469 vst1q_f32(destP, vaddq_f32(source1, source2));

427	470

428 source1P += 4;	471 source1P += 4;

429 source2P += 4;	472 source2P += 4;

430 destP += 4;	473 destP += 4;

431 }	474 }

432 n = tailFrames;	475 n = tailFrames;

433 }	476 }

	477 #elif HAVE(MIPS_MSA_INTRINSICS)

	478 if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {

	479 v4f32 vSrc1P0, vSrc1P1, vSrc1P2, vSrc1P3, vSrc1P4, vSrc1P5, vSrc1P6,

	480 vSrc1P7;

	481 v4f32 vSrc2P0, vSrc2P1, vSrc2P2, vSrc2P3, vSrc2P4, vSrc2P5, vSrc2P6,

	482 vSrc2P7;

	483 v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7;

	484

	485 for (; n >= 32; n -= 32) {

	486 LD_SP8(source1P, 4, vSrc1P0, vSrc1P1, vSrc1P2, vSrc1P3, vSrc1P4, vSrc1P5,

	487 vSrc1P6, vSrc1P7);

	488 LD_SP8(source2P, 4, vSrc2P0, vSrc2P1, vSrc2P2, vSrc2P3, vSrc2P4, vSrc2P5,

	489 vSrc2P6, vSrc2P7);

	490 ADD4(vSrc1P0, vSrc2P0, vSrc1P1, vSrc2P1, vSrc1P2, vSrc2P2, vSrc1P3,

	491 vSrc2P3, vDst0, vDst1, vDst2, vDst3);

	492 ADD4(vSrc1P4, vSrc2P4, vSrc1P5, vSrc2P5, vSrc1P6, vSrc2P6, vSrc1P7,

	493 vSrc2P7, vDst4, vDst5, vDst6, vDst7);

	494 ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP, 4);

	495 }

	496 }

434 #endif	497 #endif

435 while (n--) {	498 while (n--) {

436 destP = source1P + *source2P;	499 destP = source1P + *source2P;

437 source1P += sourceStride1;	500 source1P += sourceStride1;

438 source2P += sourceStride2;	501 source2P += sourceStride2;

439 destP += destStride;	502 destP += destStride;

440 }	503 }

441 #if CPU(X86) \|\| CPU(X86_64)	504 #if CPU(X86) \|\| CPU(X86_64)

442 }	505 }

443 #endif	506 #endif

(...skipping 63 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
507 float32x4_t source1 = vld1q_f32(source1P);	570 float32x4_t source1 = vld1q_f32(source1P);

508 float32x4_t source2 = vld1q_f32(source2P);	571 float32x4_t source2 = vld1q_f32(source2P);

509 vst1q_f32(destP, vmulq_f32(source1, source2));	572 vst1q_f32(destP, vmulq_f32(source1, source2));

510	573

511 source1P += 4;	574 source1P += 4;

512 source2P += 4;	575 source2P += 4;

513 destP += 4;	576 destP += 4;

514 }	577 }

515 n = tailFrames;	578 n = tailFrames;

516 }	579 }

	580 #elif HAVE(MIPS_MSA_INTRINSICS)

	581 if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {

	582 v4f32 vSrc1P0, vSrc1P1, vSrc1P2, vSrc1P3, vSrc1P4, vSrc1P5, vSrc1P6,

	583 vSrc1P7;

	584 v4f32 vSrc2P0, vSrc2P1, vSrc2P2, vSrc2P3, vSrc2P4, vSrc2P5, vSrc2P6,

	585 vSrc2P7;

	586 v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7;

	587

	588 for (; n >= 32; n -= 32) {

	589 LD_SP8(source1P, 4, vSrc1P0, vSrc1P1, vSrc1P2, vSrc1P3, vSrc1P4, vSrc1P5,

	590 vSrc1P6, vSrc1P7);

	591 LD_SP8(source2P, 4, vSrc2P0, vSrc2P1, vSrc2P2, vSrc2P3, vSrc2P4, vSrc2P5,

	592 vSrc2P6, vSrc2P7);

	593 MUL4(vSrc1P0, vSrc2P0, vSrc1P1, vSrc2P1, vSrc1P2, vSrc2P2, vSrc1P3,

	594 vSrc2P3, vDst0, vDst1, vDst2, vDst3);

	595 MUL4(vSrc1P4, vSrc2P4, vSrc1P5, vSrc2P5, vSrc1P6, vSrc2P6, vSrc1P7,

	596 vSrc2P7, vDst4, vDst5, vDst6, vDst7);

	597 ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP, 4);

	598 }

	599 }

517 #endif	600 #endif

518 while (n) {	601 while (n) {

519 destP = source1P * *source2P;	602 destP = source1P * *source2P;

520 source1P += sourceStride1;	603 source1P += sourceStride1;

521 source2P += sourceStride2;	604 source2P += sourceStride2;

522 destP += destStride;	605 destP += destStride;

523 n--;	606 n--;

524 }	607 }

525 }	608 }

526	609

(...skipping 182 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
709 }	792 }

710 float32x2_t twoMax =	793 float32x2_t twoMax =

711 vmax_f32(vget_low_f32(fourMax), vget_high_f32(fourMax));	794 vmax_f32(vget_low_f32(fourMax), vget_high_f32(fourMax));

712	795

713 float groupMax[2];	796 float groupMax[2];

714 vst1_f32(groupMax, twoMax);	797 vst1_f32(groupMax, twoMax);

715 max = std::max(groupMax[0], groupMax[1]);	798 max = std::max(groupMax[0], groupMax[1]);

716	799

717 n = tailFrames;	800 n = tailFrames;

718 }	801 }

	802 #elif HAVE(MIPS_MSA_INTRINSICS)

	803 if (sourceStride == 1) {

	804 v4f32 vMax = {

	805 0,

	806 };

	807 v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7;

	808 const v16i8 vSignBitMask = (v16i8)__msa_fill_w(0x7FFFFFFF);

	809

	810 for (; n >= 32; n -= 32) {

	811 LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6,

	812 vSrc7);

	813 AND_W4_SP(vSrc0, vSrc1, vSrc2, vSrc3, vSignBitMask);

	814 VMAX_W4_SP(vSrc0, vSrc1, vSrc2, vSrc3, vMax);

	815 AND_W4_SP(vSrc4, vSrc5, vSrc6, vSrc7, vSignBitMask);

	816 VMAX_W4_SP(vSrc4, vSrc5, vSrc6, vSrc7, vMax);

	817 }

	818

	819 max = std::max(max, vMax[0]);

	820 max = std::max(max, vMax[1]);

	821 max = std::max(max, vMax[2]);

	822 max = std::max(max, vMax[3]);

	823 }

719 #endif	824 #endif

720	825

721 while (n--) {	826 while (n--) {

722 max = std::max(max, fabsf(*sourceP));	827 max = std::max(max, fabsf(*sourceP));

723 sourceP += sourceStride;	828 sourceP += sourceStride;

724 }	829 }

725	830

726 ASSERT(maxP);	831 ASSERT(maxP);

727 *maxP = max;	832 *maxP = max;

728 }	833 }

(...skipping 18 matching lines...) Expand all Loading...
747 float32x4_t low = vdupq_n_f32(lowThreshold);	852 float32x4_t low = vdupq_n_f32(lowThreshold);

748 float32x4_t high = vdupq_n_f32(highThreshold);	853 float32x4_t high = vdupq_n_f32(highThreshold);

749 while (destP < endP) {	854 while (destP < endP) {

750 float32x4_t source = vld1q_f32(sourceP);	855 float32x4_t source = vld1q_f32(sourceP);

751 vst1q_f32(destP, vmaxq_f32(vminq_f32(source, high), low));	856 vst1q_f32(destP, vmaxq_f32(vminq_f32(source, high), low));

752 sourceP += 4;	857 sourceP += 4;

753 destP += 4;	858 destP += 4;

754 }	859 }

755 n = tailFrames;	860 n = tailFrames;

756 }	861 }

	862 #elif HAVE(MIPS_MSA_INTRINSICS)

	863 if ((sourceStride == 1) && (destStride == 1)) {

	864 v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7;

	865 v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7;

	866 v4f32 vLowThr, vHighThr;

	867 FloatInt lowThr, highThr;

	868

	869 lowThr.floatVal = lowThreshold;

	870 highThr.floatVal = highThreshold;

	871 vLowThr = (v4f32)__msa_fill_w(lowThr.intVal);

	872 vHighThr = (v4f32)__msa_fill_w(highThr.intVal);

	873

	874 for (; n >= 32; n -= 32) {

	875 LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6,

	876 vSrc7);

	877 VCLIP4(vSrc0, vSrc1, vSrc2, vSrc3, vLowThr, vHighThr, vDst0, vDst1, vDst2,

	878 vDst3);

	879 VCLIP4(vSrc4, vSrc5, vSrc6, vSrc7, vLowThr, vHighThr, vDst4, vDst5, vDst6,

	880 vDst7);

	881 ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP, 4);

	882 }

	883 }

757 #endif	884 #endif

758 while (n--) {	885 while (n--) {

759 destP = clampTo(sourceP, lowThreshold, highThreshold);	886 destP = clampTo(sourceP, lowThreshold, highThreshold);

760 sourceP += sourceStride;	887 sourceP += sourceStride;

761 destP += destStride;	888 destP += destStride;

762 }	889 }

763 }	890 }

764	891

765 #endif // OS(MACOSX)	892 #endif // OS(MACOSX)

766	893

767 } // namespace VectorMath	894 } // namespace VectorMath

768	895

769 } // namespace blink	896 } // namespace blink

OLD	NEW

« no previous file with comments | « no previous file | third_party/WebKit/Source/platform/cpu/mips/CommonMacrosMSA.h » ('j') | no next file with comments »