Chromium Code Reviews| Index: third_party/WebKit/Source/platform/audio/VectorMath.cpp |
| diff --git a/third_party/WebKit/Source/platform/audio/VectorMath.cpp b/third_party/WebKit/Source/platform/audio/VectorMath.cpp |
| index 2474d0ed385b84806efcdacf271468b8c7039765..5cbc82c1eb6c9eee4c24f4f2d3baba2e2c981d3d 100644 |
| --- a/third_party/WebKit/Source/platform/audio/VectorMath.cpp |
| +++ b/third_party/WebKit/Source/platform/audio/VectorMath.cpp |
| @@ -40,6 +40,10 @@ |
| #include <arm_neon.h> |
| #endif |
| +#if HAVE(MIPS_MSA_INTRINSICS) |
| +#include "platform/cpu/mips/CommonMacrosMSA.h" |
| +#endif |
| + |
| #include <math.h> |
| #include <algorithm> |
| @@ -182,6 +186,67 @@ void vsma(const float* sourceP, int sourceStride, const float* scale, float* des |
| } |
| n = tailFrames; |
| } |
| +#elif HAVE(MIPS_MSA_INTRINSICS) |
| + if ((sourceStride == 1) && (destStride == 1)) { |
| + float* destPCopy = destP; |
| + const v4f32 vScale = (v4f32) __msa_fill_w(*((int32_t *) scale)); |
| + v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7; |
| + v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7; |
| + |
| + for (; n >= 32; n -= 32) { |
| + LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7); |
|
Raymond Toy
2016/10/03 16:47:06
Are there alignment constraints for sourceP and destP?
Prashant.Patil
2016/10/04 11:47:27
There are no alignment constraints
|
| + LD_SP8(destPCopy, 4, vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7); |
| + VSMA4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vScale); |
| + VSMA4(vSrc4, vSrc5, vSrc6, vSrc7, vDst4, vDst5, vDst6, vDst7, vScale); |
| + ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP, 4); |
| + } |
| + |
| + if (n > 0) { |
| + if (n >= 28) { |
|
Raymond Toy
2016/10/03 16:47:06
Is there really much to be gained in having this case?
Prashant.Patil
2016/10/04 11:47:27
OK. I shall remove all cases below 32.
|
| + LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5); |
| + vSrc6 = LD_SP(sourceP); |
| + sourceP += 4; |
| + LD_SP6(destPCopy, 4, vDst0, vDst1, vDst2, vDst3, vDst4, vDst5); |
| + vDst6 = LD_SP(destPCopy); |
| + destPCopy += 4; |
| + VSMA4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vScale); |
| + VSMA2(vSrc4, vSrc5, vDst4, vDst5, vScale); |
| + vDst6 += vSrc6 * vScale; |
| + ST_SP6(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, destP, 4); |
| + ST_SP(vDst6, destP); |
| + destP += 4; |
| + n -= 28; |
| + } else if (n >= 24) { |
| + LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5); |
| + LD_SP6(destPCopy, 4, vDst0, vDst1, vDst2, vDst3, vDst4, vDst5); |
| + VSMA4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vScale); |
| + VSMA2(vSrc4, vSrc5, vDst4, vDst5, vScale); |
| + ST_SP6(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, destP, 4); |
| + n -= 24; |
| + } else if (n >= 16) { |
| + LD_SP4(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3); |
| + LD_SP4(destPCopy, 4, vDst0, vDst1, vDst2, vDst3); |
| + VSMA4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vScale); |
| + ST_SP4(vDst0, vDst1, vDst2, vDst3, destP, 4); |
| + n -= 16; |
| + } else if (n >= 8) { |
| + LD_SP2(sourceP, 4, vSrc0, vSrc1); |
| + LD_SP2(destPCopy, 4, vDst0, vDst1); |
| + VSMA2(vSrc0, vSrc1, vDst0, vDst1, vScale); |
| + ST_SP2(vDst0, vDst1, destP, 4); |
| + n -= 8; |
| + } |
| + if (n >= 4) { |
| + vSrc0 = LD_SP(sourceP); |
| + vDst0 = LD_SP(destPCopy); |
| + vDst0 += vSrc0 * vScale; |
| + ST_SP(vDst0, destP); |
| + sourceP += 4; |
| + destP += 4; |
| + n -= 4; |
| + } |
| + } |
| + } |
| #endif |
| while (n) { |
| *destP += *sourceP * *scale; |
| @@ -259,6 +324,71 @@ void vsmul(const float* sourceP, int sourceStride, const float* scale, float* de |
| } |
| n = tailFrames; |
| } |
| +#elif HAVE(MIPS_MSA_INTRINSICS) |
| + if ((sourceStride == 1) && (destStride == 1)) { |
| + const v4f32 vScale = (v4f32) __msa_fill_w(*((int32_t *) scale)); |
| + v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7, vSrc8, vSrc9; |
| + v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, vDst8, vDst9; |
| + |
| + for (; n >= 40; n -= 40) { |
|
Raymond Toy
2016/10/03 16:47:06
Is it really worth doing blocks of 40 instead of 32?
Prashant.Patil
2016/10/04 11:47:27
OK. This was done considering 10 cycle vector load
Raymond Toy
2016/10/04 15:37:44
If there is significant gain in doing this, by all
|
| + LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5); |
| + LD_SP4(sourceP, 4, vSrc6, vSrc7, vSrc8, vSrc9); |
| + VSMUL4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vScale); |
| + VSMUL4(vSrc4, vSrc5, vSrc6, vSrc7, vDst4, vDst5, vDst6, vDst7, vScale); |
| + VSMUL2(vSrc8, vSrc9, vDst8, vDst9, vScale); |
| + ST_SP6(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, destP, 4); |
| + ST_SP4(vDst6, vDst7, vDst8, vDst9, destP, 4); |
| + } |
| + |
| + if (n > 0) { |
| + if (n >= 24) { |
| + if (n >= 32) { |
| + LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7); |
| + VSMUL4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vScale); |
| + VSMUL4(vSrc4, vSrc5, vSrc6, vSrc7, vDst4, vDst5, vDst6, vDst7, vScale); |
| + ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP, 4); |
| + n -= 32; |
| + } else if (n >= 28) { |
| + LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5); |
| + vSrc6 = LD_SP(sourceP); |
| + sourceP += 4; |
| + VSMUL4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vScale); |
| + VSMUL2(vSrc4, vSrc5, vDst4, vDst5, vScale); |
| + vDst6 = vSrc6 * vScale; |
| + ST_SP6(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, destP, 4); |
| + ST_SP(vDst6, destP); |
| + destP += 4; |
| + n -= 28; |
| + } else { |
| + LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5); |
| + VSMUL4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vScale); |
| + VSMUL2(vSrc4, vSrc5, vDst4, vDst5, vScale); |
| + ST_SP6(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, destP, 4); |
| + n -= 24; |
| + } |
| + } else { |
| + if (n >= 16) { |
| + LD_SP4(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3); |
| + VSMUL4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vScale); |
| + ST_SP4(vDst0, vDst1, vDst2, vDst3, destP, 4); |
| + n -= 16; |
| + } else if (n >= 8) { |
| + LD_SP2(sourceP, 4, vSrc0, vSrc1); |
| + VSMUL2(vSrc0, vSrc1, vDst0, vDst1, vScale); |
| + ST_SP2(vDst0, vDst1, destP, 4); |
| + n -= 8; |
| + } |
| + } |
| + if (n >= 4) { |
| + vSrc0 = LD_SP(sourceP); |
| + vDst0 = vSrc0 * vScale; |
| + ST_SP(vDst0, destP); |
| + sourceP += 4; |
| + destP += 4; |
| + n -= 4; |
| + } |
| + } |
| + } |
| #endif |
| float k = *scale; |
| while (n--) { |
| @@ -371,6 +501,97 @@ void vadd(const float* source1P, int sourceStride1, const float* source2P, int s |
| } |
| n = tailFrames; |
| } |
| +#elif HAVE(MIPS_MSA_INTRINSICS) |
| + if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) { |
| + v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7; |
| + v4f32 vSrc8, vSrc9, vSrc10, vSrc11, vSrc12, vSrc13, vSrc14, vSrc15; |
| + v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7; |
| + |
| + for (; n >= 32; n -= 32) { |
| + LD_SP8(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc8, vSrc9, vSrc10, vSrc11); |
| + LD_SP8(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7, vSrc12, vSrc13, vSrc14, vSrc15); |
|
Raymond Toy
2016/10/03 16:47:06
Can we pick better names for vSrc[n]? It's really
Prashant.Patil
2016/10/04 11:47:27
Done.
|
| + ADD4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3); |
| + ADD4(vSrc8, vSrc12, vSrc9, vSrc13, vSrc10, vSrc14, vSrc11, vSrc15, vDst4, vDst5, vDst6, vDst7); |
| + ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP, 4); |
| + } |
| + |
| + if (n > 0) { |
| + if (n >= 20) { |
|
Raymond Toy
2016/10/03 16:47:06
Is this really worth doing?  In the typical use case,
Prashant.Patil
2016/10/04 11:47:27
Done.
|
| + if (n >= 28) { |
| + LD_SP6(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc8, vSrc9); |
| + vSrc10 = LD_SP(source1P); |
| + source1P += 4; |
| + LD_SP6(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7, vSrc12, vSrc13); |
| + vSrc14 = LD_SP(source2P); |
| + source2P += 4; |
| + ADD4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3); |
| + ADD2(vSrc8, vSrc12, vSrc9, vSrc13, vDst4, vDst5); |
| + vDst6 = vSrc10 + vSrc14; |
| + ST_SP6(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, destP, 4); |
| + ST_SP(vDst6, destP); |
| + destP += 4; |
| + n -= 28; |
| + } else if (n >= 24) { |
| + LD_SP6(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc8, vSrc9); |
| + LD_SP6(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7, vSrc12, vSrc13); |
| + ADD4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3); |
| + ADD2(vSrc8, vSrc12, vSrc9, vSrc13, vDst4, vDst5); |
| + ST_SP4(vDst0, vDst1, vDst2, vDst3, destP, 4); |
| + ST_SP2(vDst4, vDst5, destP, 4); |
| + n -= 24; |
| + } else { |
| + LD_SP4(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3); |
| + vSrc8 = LD_SP(source1P); |
| + source1P += 4; |
| + LD_SP4(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7); |
| + vSrc12 = LD_SP(source2P); |
| + source2P += 4; |
| + ADD4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3); |
| + vDst4 = vSrc8 + vSrc12; |
| + ST_SP4(vDst0, vDst1, vDst2, vDst3, destP, 4); |
| + ST_SP(vDst4, destP); |
| + destP += 4; |
| + n -= 20; |
| + } |
| + } else if (n >= 4) { |
| + if (n >= 16) { |
| + LD_SP4(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3); |
| + LD_SP4(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7); |
| + ADD4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3); |
| + ST_SP4(vDst0, vDst1, vDst2, vDst3, destP, 4); |
| + n -= 16; |
| + } else if (n >= 12) { |
| + LD_SP2(source1P, 4, vSrc0, vSrc1); |
| + vSrc2 = LD_SP(source1P); |
| + source1P += 4; |
| + LD_SP2(source2P, 4, vSrc4, vSrc5); |
| + vSrc6 = LD_SP(source2P); |
| + source2P += 4; |
| + ADD2(vSrc0, vSrc4, vSrc1, vSrc5, vDst0, vDst1); |
| + vDst2 = vSrc2 + vSrc6; |
| + ST_SP2(vDst0, vDst1, destP, 4); |
| + ST_SP(vDst2, destP); |
| + destP += 4; |
| + n -= 12; |
| + } else if (n >= 8) { |
| + LD_SP2(source1P, 4, vSrc0, vSrc1); |
| + LD_SP2(source2P, 4, vSrc4, vSrc5); |
| + ADD2(vSrc0, vSrc4, vSrc1, vSrc5, vDst0, vDst1); |
| + ST_SP2(vDst0, vDst1, destP, 4); |
| + n -= 8; |
| + } else { |
| + vSrc0 = LD_SP(source1P); |
| + vSrc4 = LD_SP(source2P); |
| + vDst0 = vSrc0 + vSrc4; |
| + ST_SP(vDst0, destP); |
| + source1P += 4; |
| + source2P += 4; |
| + destP += 4; |
| + n -= 4; |
| + } |
| + } |
| + } |
| + } |
| #endif |
| while (n--) { |
| *destP = *source1P + *source2P; |
| @@ -448,6 +669,97 @@ void vmul(const float* source1P, int sourceStride1, const float* source2P, int s |
| } |
| n = tailFrames; |
| } |
| +#elif HAVE(MIPS_MSA_INTRINSICS) |
| + if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) { |
| + v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7; |
| + v4f32 vSrc8, vSrc9, vSrc10, vSrc11, vSrc12, vSrc13, vSrc14, vSrc15; |
| + v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7; |
| + |
| + for (; n >= 32; n -= 32) { |
| + LD_SP8(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc8, vSrc9, vSrc10, vSrc11); |
|
Raymond Toy
2016/10/03 16:47:06
Same comment as in line 512.
Prashant.Patil
2016/10/04 11:47:27
Done.
|
| + LD_SP8(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7, vSrc12, vSrc13, vSrc14, vSrc15); |
| + MUL4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3); |
| + MUL4(vSrc8, vSrc12, vSrc9, vSrc13, vSrc10, vSrc14, vSrc11, vSrc15, vDst4, vDst5, vDst6, vDst7); |
| + ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP, 4); |
| + } |
| + |
| + if (n > 0) { |
| + if (n >= 20) { |
| + if (n >= 28) { |
| + LD_SP6(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc8, vSrc9); |
| + vSrc10 = LD_SP(source1P); |
| + source1P += 4; |
| + LD_SP6(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7, vSrc12, vSrc13); |
| + vSrc14 = LD_SP(source2P); |
| + source2P += 4; |
| + MUL4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3); |
| + MUL2(vSrc8, vSrc12, vSrc9, vSrc13, vDst4, vDst5); |
| + vDst6 = vSrc10 * vSrc14; |
| + ST_SP6(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, destP, 4); |
| + ST_SP(vDst6, destP); |
| + destP += 4; |
| + n -= 28; |
| + } else if (n >= 24) { |
| + LD_SP6(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc8, vSrc9); |
| + LD_SP6(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7, vSrc12, vSrc13); |
| + MUL4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3); |
| + MUL2(vSrc8, vSrc12, vSrc9, vSrc13, vDst4, vDst5); |
| + ST_SP4(vDst0, vDst1, vDst2, vDst3, destP, 4); |
| + ST_SP2(vDst4, vDst5, destP, 4); |
| + n -= 24; |
| + } else { /* n >= 20 */ |
| + LD_SP4(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3); |
| + vSrc8 = LD_SP(source1P); |
| + source1P += 4; |
| + LD_SP4(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7); |
| + vSrc12 = LD_SP(source2P); |
| + source2P += 4; |
| + MUL4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3); |
| + vDst4 = vSrc8 * vSrc12; |
| + ST_SP4(vDst0, vDst1, vDst2, vDst3, destP, 4); |
| + ST_SP(vDst4, destP); |
| + destP += 4; |
| + n -= 20; |
| + } |
| + } else if (n >= 4) { |
| + if (n >= 16) { |
| + LD_SP4(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3); |
| + LD_SP4(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7); |
| + MUL4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3); |
| + ST_SP4(vDst0, vDst1, vDst2, vDst3, destP, 4); |
| + n -= 16; |
| + } else if (n >= 12) { |
| + LD_SP2(source1P, 4, vSrc0, vSrc1); |
| + vSrc2 = LD_SP(source1P); |
| + source1P += 4; |
| + LD_SP2(source2P, 4, vSrc4, vSrc5); |
| + vSrc6 = LD_SP(source2P); |
| + source2P += 4; |
| + MUL2(vSrc0, vSrc4, vSrc1, vSrc5, vDst0, vDst1); |
| + vDst2 = vSrc2 * vSrc6; |
| + ST_SP2(vDst0, vDst1, destP, 4); |
| + ST_SP(vDst2, destP); |
| + destP += 4; |
| + n -= 12; |
| + } else if (n >= 8) { |
| + LD_SP2(source1P, 4, vSrc0, vSrc1); |
| + LD_SP2(source2P, 4, vSrc4, vSrc5); |
| + MUL2(vSrc0, vSrc4, vSrc1, vSrc5, vDst0, vDst1); |
| + ST_SP2(vDst0, vDst1, destP, 4); |
| + n -= 8; |
| + } else { // n >= 4 |
| + vSrc0 = LD_SP(source1P); |
| + vSrc4 = LD_SP(source2P); |
| + vDst0 = vSrc0 * vSrc4; |
| + ST_SP(vDst0, destP); |
| + source1P += 4; |
| + source2P += 4; |
| + destP += 4; |
| + n -= 4; |
| + } |
| + } |
| + } |
| + } |
| #endif |
| while (n) { |
| *destP = *source1P * *source2P; |
| @@ -637,6 +949,62 @@ void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesT |
| n = tailFrames; |
| } |
| +#elif HAVE(MIPS_MSA_INTRINSICS) |
| + if (sourceStride == 1) { |
| + v4f32 vMax = {0, }; |
| + v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7, vSrc8, vSrc9; |
| + const v16i8 vMask = (v16i8) __msa_fill_w(0x7FFFFFFF); |
| + |
| + for (; n >= 40; n -= 40) { |
| + LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5); |
|
Raymond Toy
2016/10/03 16:47:06
Same comment as in line 333.
Prashant.Patil
2016/10/04 11:47:27
Done.
|
| + LD_SP4(sourceP, 4, vSrc6, vSrc7, vSrc8, vSrc9); |
| + VMAXMGV4(vSrc0, vSrc1, vSrc2, vSrc3, vMask, vMax); |
| + VMAXMGV4(vSrc4, vSrc5, vSrc6, vSrc7, vMask, vMax); |
| + VMAXMGV2(vSrc8, vSrc9, vMask, vMax); |
| + } |
| + |
| + if (n > 0) { |
| + if (n >= 32) { |
| + LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7); |
| + VMAXMGV4(vSrc0, vSrc1, vSrc2, vSrc3, vMask, vMax); |
| + VMAXMGV4(vSrc4, vSrc5, vSrc6, vSrc7, vMask, vMax); |
| + n -= 32; |
| + } else if (n >= 28) { |
| + LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5); |
| + vSrc6 = LD_SP(sourceP); |
| + sourceP += 4; |
| + VMAXMGV4(vSrc0, vSrc1, vSrc2, vSrc3, vMask, vMax); |
| + VMAXMGV2(vSrc4, vSrc5, vMask, vMax); |
| + vMax = __msa_fmax_w(vMax, (v4f32) ((v16i8) vSrc6 & vMask)); |
| + n -= 28; |
| + } else if (n >= 24) { |
| + LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5); |
| + VMAXMGV4(vSrc0, vSrc1, vSrc2, vSrc3, vMask, vMax); |
| + VMAXMGV2(vSrc4, vSrc5, vMask, vMax); |
| + n -= 24; |
| + } else if (n >= 16) { |
| + LD_SP4(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3); |
| + VMAXMGV4(vSrc0, vSrc1, vSrc2, vSrc3, vMask, vMax); |
| + n -= 16; |
| + } else if (n >= 8) { |
| + LD_SP2(sourceP, 4, vSrc0, vSrc1); |
| + VMAXMGV2(vSrc0, vSrc1, vMask, vMax); |
| + n -= 8; |
| + } |
| + |
| + if (n >= 4) { |
| + vSrc0 = LD_SP(sourceP); |
| + sourceP += 4; |
| + vMax = __msa_fmax_w(vMax, (v4f32) ((v16i8) vSrc0 & vMask)); |
| + n -= 4; |
| + } |
| + } |
| + |
| + max = std::max(max, vMax[0]); |
| + max = std::max(max, vMax[1]); |
| + max = std::max(max, vMax[2]); |
| + max = std::max(max, vMax[3]); |
| + } |
| #endif |
| while (n--) { |
| @@ -670,6 +1038,68 @@ void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, c |
| } |
| n = tailFrames; |
| } |
| +#elif HAVE(MIPS_MSA_INTRINSICS) |
| + if ((sourceStride == 1) && (destStride == 1)) { |
| + v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7, vSrc8, vSrc9; |
| + v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, vDst8, vDst9; |
| + const v4f32 vLowThr = (v4f32) __msa_fill_w(*((int32_t *) lowThresholdP)); |
| + const v4f32 vHighThr = (v4f32) __msa_fill_w(*((int32_t *) highThresholdP)); |
| + |
| + for (; n >= 40; n -= 40) { |
| + LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7); |
| + LD_SP2(sourceP, 4, vSrc8, vSrc9); |
| + VCLIP4(vSrc0, vSrc1, vSrc2, vSrc3, vLowThr, vHighThr, vDst0, vDst1, vDst2, vDst3); |
| + VCLIP4(vSrc4, vSrc5, vSrc6, vSrc7, vLowThr, vHighThr, vDst4, vDst5, vDst6, vDst7); |
| + VCLIP2(vSrc8, vSrc9, vLowThr, vHighThr, vDst8, vDst9); |
| + ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP, 4); |
| + ST_SP2(vDst8, vDst9, destP, 4); |
| + } |
| + |
| + if (n > 0) { |
| + if (n >= 32) { |
| + LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7); |
| + VCLIP4(vSrc0, vSrc1, vSrc2, vSrc3, vLowThr, vHighThr, vDst0, vDst1, vDst2, vDst3); |
| + VCLIP4(vSrc4, vSrc5, vSrc6, vSrc7, vLowThr, vHighThr, vDst4, vDst5, vDst6, vDst7); |
| + ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP, 4); |
| + n -= 32; |
| + } else if (n >= 28) { |
| + LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5); |
| + vSrc6 = LD_SP(sourceP); |
| + sourceP += 4; |
| + VCLIP4(vSrc0, vSrc1, vSrc2, vSrc3, vLowThr, vHighThr, vDst0, vDst1, vDst2, vDst3); |
| + VCLIP2(vSrc4, vSrc5, vLowThr, vHighThr, vDst4, vDst5); |
| + vDst6 = __msa_fmax_w(__msa_fmin_w(vSrc6, vHighThr), vLowThr); |
| + ST_SP6(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, destP, 4); |
| + ST_SP(vDst6, destP); |
| + destP += 4; |
| + n -= 28; |
| + } else if (n >= 24) { |
| + LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5); |
| + VCLIP4(vSrc0, vSrc1, vSrc2, vSrc3, vLowThr, vHighThr, vDst0, vDst1, vDst2, vDst3); |
| + VCLIP2(vSrc4, vSrc5, vLowThr, vHighThr, vDst4, vDst5); |
| + ST_SP6(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, destP, 4); |
| + n -= 24; |
| + } else if (n >= 16) { |
| + LD_SP4(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3); |
| + VCLIP4(vSrc0, vSrc1, vSrc2, vSrc3, vLowThr, vHighThr, vDst0, vDst1, vDst2, vDst3); |
| + ST_SP4(vDst0, vDst1, vDst2, vDst3, destP, 4); |
| + n -= 16; |
| + } else if (n >= 8) { |
| + LD_SP2(sourceP, 4, vSrc0, vSrc1); |
| + VCLIP2(vSrc0, vSrc1, vLowThr, vHighThr, vDst0, vDst1); |
| + ST_SP2(vDst0, vDst1, destP, 4); |
| + n -= 8; |
| + } |
| + if (n >= 4) { |
| + vSrc0 = LD_SP(sourceP); |
| + sourceP += 4; |
| + vDst0 = __msa_fmax_w(__msa_fmin_w(vSrc0, vHighThr), vLowThr); |
| + ST_SP(vDst0, destP); |
| + destP += 4; |
| + n -= 4; |
| + } |
| + } |
| + } |
| #endif |
| while (n--) { |
| *destP = clampTo(*sourceP, lowThreshold, highThreshold); |