third_party/WebKit/Source/platform/audio/VectorMath.cpp - Issue 2340583003: Add MSA (MIPS SIMD Arch) optimized VectorMath functions

Unified Diff: third_party/WebKit/Source/platform/audio/VectorMath.cpp

Issue 2340583003: Add MSA (MIPS SIMD Arch) optimized VectorMath functions (Closed)

Patch Set: Incorporate review comments Created 4 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: third_party/WebKit/Source/platform/audio/VectorMath.cpp

diff --git a/third_party/WebKit/Source/platform/audio/VectorMath.cpp b/third_party/WebKit/Source/platform/audio/VectorMath.cpp

index bf0d2ada982376de5c77ecf0b10d5bf76052cd51..c7cf211312fdcb6969a8a9376a91134b3e1cd27a 100644

--- a/third_party/WebKit/Source/platform/audio/VectorMath.cpp

+++ b/third_party/WebKit/Source/platform/audio/VectorMath.cpp

@@ -41,6 +41,10 @@

#include <arm_neon.h>

#endif

+#if HAVE(MIPS_MSA_INTRINSICS)

+#include "platform/cpu/mips/CommonMacrosMSA.h"

+#endif

#include <math.h>

#include <algorithm>

@@ -229,6 +233,23 @@ void vsma(const float* sourceP,

}

n = tailFrames;

}

+#elif HAVE(MIPS_MSA_INTRINSICS)

+ if ((sourceStride == 1) && (destStride == 1)) {

+ float* destPCopy = destP;

+ const v4f32 vScale = (v4f32)__msa_fill_w(*((int32_t*)scale));

+ v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7;

+ v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7;

+ for (; n >= 32; n -= 32) {

+ LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6,

+ vSrc7);

+ LD_SP8(destPCopy, 4, vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6,

+ vDst7);

+ VSMA4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vScale);

+ VSMA4(vSrc4, vSrc5, vSrc6, vSrc7, vDst4, vDst5, vDst6, vDst7, vScale);

+ ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP, 4);

+ }

#endif

while (n) {

*destP += *sourceP * *scale;

@@ -310,6 +331,20 @@ void vsmul(const float* sourceP,

}

n = tailFrames;

}

+#elif HAVE(MIPS_MSA_INTRINSICS)

+ if ((sourceStride == 1) && (destStride == 1)) {

+ const v4f32 vScale = (v4f32)__msa_fill_w(*((int32_t*)scale));

+ v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7;

+ v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7;

+ for (; n >= 32; n -= 32) {

+ LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6,

+ vSrc7);

+ VSMUL4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vScale);

+ VSMUL4(vSrc4, vSrc5, vSrc6, vSrc7, vDst4, vDst5, vDst6, vDst7, vScale);

+ ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP, 4);

+ }

#endif

float k = *scale;

while (n--) {

@@ -431,6 +466,26 @@ void vadd(const float* source1P,

}

n = tailFrames;

}

+#elif HAVE(MIPS_MSA_INTRINSICS)

+ if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {

+ v4f32 vSrc1P0, vSrc1P1, vSrc1P2, vSrc1P3, vSrc1P4, vSrc1P5, vSrc1P6,

+ vSrc1P7;

+ v4f32 vSrc2P0, vSrc2P1, vSrc2P2, vSrc2P3, vSrc2P4, vSrc2P5, vSrc2P6,

+ vSrc2P7;

+ v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7;

+ for (; n >= 32; n -= 32) {

+ LD_SP8(source1P, 4, vSrc1P0, vSrc1P1, vSrc1P2, vSrc1P3, vSrc1P4, vSrc1P5,

+ vSrc1P6, vSrc1P7);

+ LD_SP8(source2P, 4, vSrc2P0, vSrc2P1, vSrc2P2, vSrc2P3, vSrc2P4, vSrc2P5,

+ vSrc2P6, vSrc2P7);

+ ADD4(vSrc1P0, vSrc2P0, vSrc1P1, vSrc2P1, vSrc1P2, vSrc2P2, vSrc1P3,

+ vSrc2P3, vDst0, vDst1, vDst2, vDst3);

+ ADD4(vSrc1P4, vSrc2P4, vSrc1P5, vSrc2P5, vSrc1P6, vSrc2P6, vSrc1P7,

+ vSrc2P7, vDst4, vDst5, vDst6, vDst7);

+ ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP, 4);

+ }

#endif

while (n--) {

*destP = *source1P + *source2P;

@@ -514,6 +569,26 @@ void vmul(const float* source1P,

}

n = tailFrames;

}

+#elif HAVE(MIPS_MSA_INTRINSICS)

+ if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {

+ v4f32 vSrc1P0, vSrc1P1, vSrc1P2, vSrc1P3, vSrc1P4, vSrc1P5, vSrc1P6,

+ vSrc1P7;

+ v4f32 vSrc2P0, vSrc2P1, vSrc2P2, vSrc2P3, vSrc2P4, vSrc2P5, vSrc2P6,

+ vSrc2P7;

+ v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7;

+ for (; n >= 32; n -= 32) {

+ LD_SP8(source1P, 4, vSrc1P0, vSrc1P1, vSrc1P2, vSrc1P3, vSrc1P4, vSrc1P5,

+ vSrc1P6, vSrc1P7);

+ LD_SP8(source2P, 4, vSrc2P0, vSrc2P1, vSrc2P2, vSrc2P3, vSrc2P4, vSrc2P5,

+ vSrc2P6, vSrc2P7);

+ MUL4(vSrc1P0, vSrc2P0, vSrc1P1, vSrc2P1, vSrc1P2, vSrc2P2, vSrc1P3,

+ vSrc2P3, vDst0, vDst1, vDst2, vDst3);

+ MUL4(vSrc1P4, vSrc2P4, vSrc1P5, vSrc2P5, vSrc1P6, vSrc2P6, vSrc1P7,

+ vSrc2P7, vDst4, vDst5, vDst6, vDst7);

+ ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP, 4);

+ }

#endif

while (n) {

*destP = *source1P * *source2P;

@@ -716,6 +791,28 @@ void vmaxmgv(const float* sourceP,

n = tailFrames;

}

+#elif HAVE(MIPS_MSA_INTRINSICS)

+ if (sourceStride == 1) {

+ v4f32 vMax = {

+ 0,

+ };

+ v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7;

+ const v16i8 vSignBitMask = (v16i8)__msa_fill_w(0x7FFFFFFF);

+ for (; n >= 32; n -= 32) {

+ LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6,

+ vSrc7);

+ AND_W4_SP(vSrc0, vSrc1, vSrc2, vSrc3, vSignBitMask);

+ VMAX_W4_SP(vSrc0, vSrc1, vSrc2, vSrc3, vMax);

+ AND_W4_SP(vSrc4, vSrc5, vSrc6, vSrc7, vSignBitMask);

+ VMAX_W4_SP(vSrc4, vSrc5, vSrc6, vSrc7, vMax);

+ }

+ max = std::max(max, vMax[0]);

+ max = std::max(max, vMax[1]);

+ max = std::max(max, vMax[2]);

+ max = std::max(max, vMax[3]);

+ }

#endif

while (n--) {

@@ -754,6 +851,23 @@ void vclip(const float* sourceP,

}

n = tailFrames;

}

+#elif HAVE(MIPS_MSA_INTRINSICS)

+ if ((sourceStride == 1) && (destStride == 1)) {

+ v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7;

+ v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7;

+ const v4f32 vLowThr = (v4f32)__msa_fill_w(*((int32_t*)lowThresholdP));

+ const v4f32 vHighThr = (v4f32)__msa_fill_w(*((int32_t*)highThresholdP));

Raymond Toy 2016/10/04 15:48:07 Use C++ casting here instead of C. However, is th

+ for (; n >= 32; n -= 32) {

+ LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6,

+ vSrc7);

+ VCLIP4(vSrc0, vSrc1, vSrc2, vSrc3, vLowThr, vHighThr, vDst0, vDst1, vDst2,

+ vDst3);

+ VCLIP4(vSrc4, vSrc5, vSrc6, vSrc7, vLowThr, vHighThr, vDst4, vDst5, vDst6,

+ vDst7);

+ ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP, 4);

+ }

#endif

while (n--) {

*destP = clampTo(*sourceP, lowThreshold, highThreshold);

« no previous file with comments | « no previous file | third_party/WebKit/Source/platform/cpu/mips/CommonMacrosMSA.h » ('j') | no next file with comments »