Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(108)

Unified Diff: third_party/WebKit/Source/platform/audio/VectorMath.cpp

Issue 2340583003: Add MSA (MIPS SIMD Arch) optimized VectorMath functions (Closed)
Patch Set: Removing zvmul and vsvesq Created 4 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: third_party/WebKit/Source/platform/audio/VectorMath.cpp
diff --git a/third_party/WebKit/Source/platform/audio/VectorMath.cpp b/third_party/WebKit/Source/platform/audio/VectorMath.cpp
index 2474d0ed385b84806efcdacf271468b8c7039765..5cbc82c1eb6c9eee4c24f4f2d3baba2e2c981d3d 100644
--- a/third_party/WebKit/Source/platform/audio/VectorMath.cpp
+++ b/third_party/WebKit/Source/platform/audio/VectorMath.cpp
@@ -40,6 +40,10 @@
#include <arm_neon.h>
#endif
+#if HAVE(MIPS_MSA_INTRINSICS)
+#include "platform/cpu/mips/CommonMacrosMSA.h"
+#endif
+
#include <math.h>
#include <algorithm>
@@ -182,6 +186,67 @@ void vsma(const float* sourceP, int sourceStride, const float* scale, float* des
}
n = tailFrames;
}
+#elif HAVE(MIPS_MSA_INTRINSICS)
+ if ((sourceStride == 1) && (destStride == 1)) {
+ float* destPCopy = destP;
+ const v4f32 vScale = (v4f32) __msa_fill_w(*((int32_t *) scale));
+ v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7;
+ v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7;
+
+ for (; n >= 32; n -= 32) {
+ LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7);
Raymond Toy 2016/10/03 16:47:06 Are there alignment constraints for sourceP and destP?
Prashant.Patil 2016/10/04 11:47:27 There are no alignment constraints
+ LD_SP8(destPCopy, 4, vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7);
+ VSMA4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vScale);
+ VSMA4(vSrc4, vSrc5, vSrc6, vSrc7, vDst4, vDst5, vDst6, vDst7, vScale);
+ ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP, 4);
+ }
+
+ if (n > 0) {
+ if (n >= 28) {
Raymond Toy 2016/10/03 16:47:06 Is there really much to be gained in having this case?
Prashant.Patil 2016/10/04 11:47:27 OK. I shall remove all cases below 32.
+ LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5);
+ vSrc6 = LD_SP(sourceP);
+ sourceP += 4;
+ LD_SP6(destPCopy, 4, vDst0, vDst1, vDst2, vDst3, vDst4, vDst5);
+ vDst6 = LD_SP(destPCopy);
+ destPCopy += 4;
+ VSMA4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vScale);
+ VSMA2(vSrc4, vSrc5, vDst4, vDst5, vScale);
+ vDst6 += vSrc6 * vScale;
+ ST_SP6(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, destP, 4);
+ ST_SP(vDst6, destP);
+ destP += 4;
+ n -= 28;
+ } else if (n >= 24) {
+ LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5);
+ LD_SP6(destPCopy, 4, vDst0, vDst1, vDst2, vDst3, vDst4, vDst5);
+ VSMA4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vScale);
+ VSMA2(vSrc4, vSrc5, vDst4, vDst5, vScale);
+ ST_SP6(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, destP, 4);
+ n -= 24;
+ } else if (n >= 16) {
+ LD_SP4(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3);
+ LD_SP4(destPCopy, 4, vDst0, vDst1, vDst2, vDst3);
+ VSMA4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vScale);
+ ST_SP4(vDst0, vDst1, vDst2, vDst3, destP, 4);
+ n -= 16;
+ } else if (n >= 8) {
+ LD_SP2(sourceP, 4, vSrc0, vSrc1);
+ LD_SP2(destPCopy, 4, vDst0, vDst1);
+ VSMA2(vSrc0, vSrc1, vDst0, vDst1, vScale);
+ ST_SP2(vDst0, vDst1, destP, 4);
+ n -= 8;
+ }
+ if (n >= 4) {
+ vSrc0 = LD_SP(sourceP);
+ vDst0 = LD_SP(destPCopy);
+ vDst0 += vSrc0 * vScale;
+ ST_SP(vDst0, destP);
+ sourceP += 4;
+ destP += 4;
+ n -= 4;
+ }
+ }
+ }
#endif
while (n) {
*destP += *sourceP * *scale;
@@ -259,6 +324,71 @@ void vsmul(const float* sourceP, int sourceStride, const float* scale, float* de
}
n = tailFrames;
}
+#elif HAVE(MIPS_MSA_INTRINSICS)
+ if ((sourceStride == 1) && (destStride == 1)) {
+ const v4f32 vScale = (v4f32) __msa_fill_w(*((int32_t *) scale));
+ v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7, vSrc8, vSrc9;
+ v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, vDst8, vDst9;
+
+ for (; n >= 40; n -= 40) {
Raymond Toy 2016/10/03 16:47:06 Is it really worth doing blocks of 40 instead of 32?
Prashant.Patil 2016/10/04 11:47:27 OK. This was done considering 10 cycle vector load latency.
Raymond Toy 2016/10/04 15:37:44 If there is significant gain in doing this, by all means keep it.
+ LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5);
+ LD_SP4(sourceP, 4, vSrc6, vSrc7, vSrc8, vSrc9);
+ VSMUL4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vScale);
+ VSMUL4(vSrc4, vSrc5, vSrc6, vSrc7, vDst4, vDst5, vDst6, vDst7, vScale);
+ VSMUL2(vSrc8, vSrc9, vDst8, vDst9, vScale);
+ ST_SP6(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, destP, 4);
+ ST_SP4(vDst6, vDst7, vDst8, vDst9, destP, 4);
+ }
+
+ if (n > 0) {
+ if (n >= 24) {
+ if (n >= 32) {
+ LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7);
+ VSMUL4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vScale);
+ VSMUL4(vSrc4, vSrc5, vSrc6, vSrc7, vDst4, vDst5, vDst6, vDst7, vScale);
+ ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP, 4);
+ n -= 32;
+ } else if (n >= 28) {
+ LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5);
+ vSrc6 = LD_SP(sourceP);
+ sourceP += 4;
+ VSMUL4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vScale);
+ VSMUL2(vSrc4, vSrc5, vDst4, vDst5, vScale);
+ vDst6 = vSrc6 * vScale;
+ ST_SP6(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, destP, 4);
+ ST_SP(vDst6, destP);
+ destP += 4;
+ n -= 28;
+ } else {
+ LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5);
+ VSMUL4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vScale);
+ VSMUL2(vSrc4, vSrc5, vDst4, vDst5, vScale);
+ ST_SP6(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, destP, 4);
+ n -= 24;
+ }
+ } else {
+ if (n >= 16) {
+ LD_SP4(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3);
+ VSMUL4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vScale);
+ ST_SP4(vDst0, vDst1, vDst2, vDst3, destP, 4);
+ n -= 16;
+ } else if (n >= 8) {
+ LD_SP2(sourceP, 4, vSrc0, vSrc1);
+ VSMUL2(vSrc0, vSrc1, vDst0, vDst1, vScale);
+ ST_SP2(vDst0, vDst1, destP, 4);
+ n -= 8;
+ }
+ }
+ if (n >= 4) {
+ vSrc0 = LD_SP(sourceP);
+ vDst0 = vSrc0 * vScale;
+ ST_SP(vDst0, destP);
+ sourceP += 4;
+ destP += 4;
+ n -= 4;
+ }
+ }
+ }
#endif
float k = *scale;
while (n--) {
@@ -371,6 +501,97 @@ void vadd(const float* source1P, int sourceStride1, const float* source2P, int s
}
n = tailFrames;
}
+#elif HAVE(MIPS_MSA_INTRINSICS)
+ if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
+ v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7;
+ v4f32 vSrc8, vSrc9, vSrc10, vSrc11, vSrc12, vSrc13, vSrc14, vSrc15;
+ v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7;
+
+ for (; n >= 32; n -= 32) {
+ LD_SP8(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc8, vSrc9, vSrc10, vSrc11);
+ LD_SP8(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7, vSrc12, vSrc13, vSrc14, vSrc15);
Raymond Toy 2016/10/03 16:47:06 Can we pick better names for vSrc[n]? It's really hard to keep track of which ones hold source1 and which hold source2.
Prashant.Patil 2016/10/04 11:47:27 Done.
+ ADD4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3);
+ ADD4(vSrc8, vSrc12, vSrc9, vSrc13, vSrc10, vSrc14, vSrc11, vSrc15, vDst4, vDst5, vDst6, vDst7);
+ ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP, 4);
+ }
+
+ if (n > 0) {
+ if (n >= 20) {
Raymond Toy 2016/10/03 16:47:06 Is this really worth doing? In the typical use case, the frame count is a multiple of the main block size.
Prashant.Patil 2016/10/04 11:47:27 Done.
+ if (n >= 28) {
+ LD_SP6(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc8, vSrc9);
+ vSrc10 = LD_SP(source1P);
+ source1P += 4;
+ LD_SP6(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7, vSrc12, vSrc13);
+ vSrc14 = LD_SP(source2P);
+ source2P += 4;
+ ADD4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3);
+ ADD2(vSrc8, vSrc12, vSrc9, vSrc13, vDst4, vDst5);
+ vDst6 = vSrc10 + vSrc14;
+ ST_SP6(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, destP, 4);
+ ST_SP(vDst6, destP);
+ destP += 4;
+ n -= 28;
+ } else if (n >= 24) {
+ LD_SP6(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc8, vSrc9);
+ LD_SP6(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7, vSrc12, vSrc13);
+ ADD4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3);
+ ADD2(vSrc8, vSrc12, vSrc9, vSrc13, vDst4, vDst5);
+ ST_SP4(vDst0, vDst1, vDst2, vDst3, destP, 4);
+ ST_SP2(vDst4, vDst5, destP, 4);
+ n -= 24;
+ } else {
+ LD_SP4(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3);
+ vSrc8 = LD_SP(source1P);
+ source1P += 4;
+ LD_SP4(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7);
+ vSrc12 = LD_SP(source2P);
+ source2P += 4;
+ ADD4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3);
+ vDst4 = vSrc8 + vSrc12;
+ ST_SP4(vDst0, vDst1, vDst2, vDst3, destP, 4);
+ ST_SP(vDst4, destP);
+ destP += 4;
+ n -= 20;
+ }
+ } else if (n >= 4) {
+ if (n >= 16) {
+ LD_SP4(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3);
+ LD_SP4(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7);
+ ADD4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3);
+ ST_SP4(vDst0, vDst1, vDst2, vDst3, destP, 4);
+ n -= 16;
+ } else if (n >= 12) {
+ LD_SP2(source1P, 4, vSrc0, vSrc1);
+ vSrc2 = LD_SP(source1P);
+ source1P += 4;
+ LD_SP2(source2P, 4, vSrc4, vSrc5);
+ vSrc6 = LD_SP(source2P);
+ source2P += 4;
+ ADD2(vSrc0, vSrc4, vSrc1, vSrc5, vDst0, vDst1);
+ vDst2 = vSrc2 + vSrc6;
+ ST_SP2(vDst0, vDst1, destP, 4);
+ ST_SP(vDst2, destP);
+ destP += 4;
+ n -= 12;
+ } else if (n >= 8) {
+ LD_SP2(source1P, 4, vSrc0, vSrc1);
+ LD_SP2(source2P, 4, vSrc4, vSrc5);
+ ADD2(vSrc0, vSrc4, vSrc1, vSrc5, vDst0, vDst1);
+ ST_SP2(vDst0, vDst1, destP, 4);
+ n -= 8;
+ } else {
+ vSrc0 = LD_SP(source1P);
+ vSrc4 = LD_SP(source2P);
+ vDst0 = vSrc0 + vSrc4;
+ ST_SP(vDst0, destP);
+ source1P += 4;
+ source2P += 4;
+ destP += 4;
+ n -= 4;
+ }
+ }
+ }
+ }
#endif
while (n--) {
*destP = *source1P + *source2P;
@@ -448,6 +669,97 @@ void vmul(const float* source1P, int sourceStride1, const float* source2P, int s
}
n = tailFrames;
}
+#elif HAVE(MIPS_MSA_INTRINSICS)
+ if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
+ v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7;
+ v4f32 vSrc8, vSrc9, vSrc10, vSrc11, vSrc12, vSrc13, vSrc14, vSrc15;
+ v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7;
+
+ for (; n >= 32; n -= 32) {
+ LD_SP8(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc8, vSrc9, vSrc10, vSrc11);
Raymond Toy 2016/10/03 16:47:06 Same comment as in line 512.
Prashant.Patil 2016/10/04 11:47:27 Done.
+ LD_SP8(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7, vSrc12, vSrc13, vSrc14, vSrc15);
+ MUL4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3);
+ MUL4(vSrc8, vSrc12, vSrc9, vSrc13, vSrc10, vSrc14, vSrc11, vSrc15, vDst4, vDst5, vDst6, vDst7);
+ ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP, 4);
+ }
+
+ if (n > 0) {
+ if (n >= 20) {
+ if (n >= 28) {
+ LD_SP6(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc8, vSrc9);
+ vSrc10 = LD_SP(source1P);
+ source1P += 4;
+ LD_SP6(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7, vSrc12, vSrc13);
+ vSrc14 = LD_SP(source2P);
+ source2P += 4;
+ MUL4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3);
+ MUL2(vSrc8, vSrc12, vSrc9, vSrc13, vDst4, vDst5);
+ vDst6 = vSrc10 * vSrc14;
+ ST_SP6(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, destP, 4);
+ ST_SP(vDst6, destP);
+ destP += 4;
+ n -= 28;
+ } else if (n >= 24) {
+ LD_SP6(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc8, vSrc9);
+ LD_SP6(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7, vSrc12, vSrc13);
+ MUL4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3);
+ MUL2(vSrc8, vSrc12, vSrc9, vSrc13, vDst4, vDst5);
+ ST_SP4(vDst0, vDst1, vDst2, vDst3, destP, 4);
+ ST_SP2(vDst4, vDst5, destP, 4);
+ n -= 24;
+ } else { /* n >= 20 */
+ LD_SP4(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3);
+ vSrc8 = LD_SP(source1P);
+ source1P += 4;
+ LD_SP4(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7);
+ vSrc12 = LD_SP(source2P);
+ source2P += 4;
+ MUL4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3);
+ vDst4 = vSrc8 * vSrc12;
+ ST_SP4(vDst0, vDst1, vDst2, vDst3, destP, 4);
+ ST_SP(vDst4, destP);
+ destP += 4;
+ n -= 20;
+ }
+ } else if (n >= 4) {
+ if (n >= 16) {
+ LD_SP4(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3);
+ LD_SP4(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7);
+ MUL4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3);
+ ST_SP4(vDst0, vDst1, vDst2, vDst3, destP, 4);
+ n -= 16;
+ } else if (n >= 12) {
+ LD_SP2(source1P, 4, vSrc0, vSrc1);
+ vSrc2 = LD_SP(source1P);
+ source1P += 4;
+ LD_SP2(source2P, 4, vSrc4, vSrc5);
+ vSrc6 = LD_SP(source2P);
+ source2P += 4;
+ MUL2(vSrc0, vSrc4, vSrc1, vSrc5, vDst0, vDst1);
+ vDst2 = vSrc2 * vSrc6;
+ ST_SP2(vDst0, vDst1, destP, 4);
+ ST_SP(vDst2, destP);
+ destP += 4;
+ n -= 12;
+ } else if (n >= 8) {
+ LD_SP2(source1P, 4, vSrc0, vSrc1);
+ LD_SP2(source2P, 4, vSrc4, vSrc5);
+ MUL2(vSrc0, vSrc4, vSrc1, vSrc5, vDst0, vDst1);
+ ST_SP2(vDst0, vDst1, destP, 4);
+ n -= 8;
+ } else { // n >= 4
+ vSrc0 = LD_SP(source1P);
+ vSrc4 = LD_SP(source2P);
+ vDst0 = vSrc0 * vSrc4;
+ ST_SP(vDst0, destP);
+ source1P += 4;
+ source2P += 4;
+ destP += 4;
+ n -= 4;
+ }
+ }
+ }
+ }
#endif
while (n) {
*destP = *source1P * *source2P;
@@ -637,6 +949,62 @@ void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesT
n = tailFrames;
}
+#elif HAVE(MIPS_MSA_INTRINSICS)
+ if (sourceStride == 1) {
+ v4f32 vMax = {0, };
+ v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7, vSrc8, vSrc9;
+ const v16i8 vMask = (v16i8) __msa_fill_w(0x7FFFFFFF);
+
+ for (; n >= 40; n -= 40) {
+ LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5);
Raymond Toy 2016/10/03 16:47:06 Same comment as in line 333.
Prashant.Patil 2016/10/04 11:47:27 Done.
+ LD_SP4(sourceP, 4, vSrc6, vSrc7, vSrc8, vSrc9);
+ VMAXMGV4(vSrc0, vSrc1, vSrc2, vSrc3, vMask, vMax);
+ VMAXMGV4(vSrc4, vSrc5, vSrc6, vSrc7, vMask, vMax);
+ VMAXMGV2(vSrc8, vSrc9, vMask, vMax);
+ }
+
+ if (n > 0) {
+ if (n >= 32) {
+ LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7);
+ VMAXMGV4(vSrc0, vSrc1, vSrc2, vSrc3, vMask, vMax);
+ VMAXMGV4(vSrc4, vSrc5, vSrc6, vSrc7, vMask, vMax);
+ n -= 32;
+ } else if (n >= 28) {
+ LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5);
+ vSrc6 = LD_SP(sourceP);
+ sourceP += 4;
+ VMAXMGV4(vSrc0, vSrc1, vSrc2, vSrc3, vMask, vMax);
+ VMAXMGV2(vSrc4, vSrc5, vMask, vMax);
+ vMax = __msa_fmax_w(vMax, (v4f32) ((v16i8) vSrc6 & vMask));
+ n -= 28;
+ } else if (n >= 24) {
+ LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5);
+ VMAXMGV4(vSrc0, vSrc1, vSrc2, vSrc3, vMask, vMax);
+ VMAXMGV2(vSrc4, vSrc5, vMask, vMax);
+ n -= 24;
+ } else if (n >= 16) {
+ LD_SP4(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3);
+ VMAXMGV4(vSrc0, vSrc1, vSrc2, vSrc3, vMask, vMax);
+ n -= 16;
+ } else if (n >= 8) {
+ LD_SP2(sourceP, 4, vSrc0, vSrc1);
+ VMAXMGV2(vSrc0, vSrc1, vMask, vMax);
+ n -= 8;
+ }
+
+ if (n >= 4) {
+ vSrc0 = LD_SP(sourceP);
+ sourceP += 4;
+ vMax = __msa_fmax_w(vMax, (v4f32) ((v16i8) vSrc0 & vMask));
+ n -= 4;
+ }
+ }
+
+ max = std::max(max, vMax[0]);
+ max = std::max(max, vMax[1]);
+ max = std::max(max, vMax[2]);
+ max = std::max(max, vMax[3]);
+ }
#endif
while (n--) {
@@ -670,6 +1038,68 @@ void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, c
}
n = tailFrames;
}
+#elif HAVE(MIPS_MSA_INTRINSICS)
+ if ((sourceStride == 1) && (destStride == 1)) {
+ v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7, vSrc8, vSrc9;
+ v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, vDst8, vDst9;
+ const v4f32 vLowThr = (v4f32) __msa_fill_w(*((int32_t *) lowThresholdP));
+ const v4f32 vHighThr = (v4f32) __msa_fill_w(*((int32_t *) highThresholdP));
+
+ for (; n >= 40; n -= 40) {
+ LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7);
+ LD_SP2(sourceP, 4, vSrc8, vSrc9);
+ VCLIP4(vSrc0, vSrc1, vSrc2, vSrc3, vLowThr, vHighThr, vDst0, vDst1, vDst2, vDst3);
+ VCLIP4(vSrc4, vSrc5, vSrc6, vSrc7, vLowThr, vHighThr, vDst4, vDst5, vDst6, vDst7);
+ VCLIP2(vSrc8, vSrc9, vLowThr, vHighThr, vDst8, vDst9);
+ ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP, 4);
+ ST_SP2(vDst8, vDst9, destP, 4);
+ }
+
+ if (n > 0) {
+ if (n >= 32) {
+ LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7);
+ VCLIP4(vSrc0, vSrc1, vSrc2, vSrc3, vLowThr, vHighThr, vDst0, vDst1, vDst2, vDst3);
+ VCLIP4(vSrc4, vSrc5, vSrc6, vSrc7, vLowThr, vHighThr, vDst4, vDst5, vDst6, vDst7);
+ ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP, 4);
+ n -= 32;
+ } else if (n >= 28) {
+ LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5);
+ vSrc6 = LD_SP(sourceP);
+ sourceP += 4;
+ VCLIP4(vSrc0, vSrc1, vSrc2, vSrc3, vLowThr, vHighThr, vDst0, vDst1, vDst2, vDst3);
+ VCLIP2(vSrc4, vSrc5, vLowThr, vHighThr, vDst4, vDst5);
+ vDst6 = __msa_fmax_w(__msa_fmin_w(vSrc6, vHighThr), vLowThr);
+ ST_SP6(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, destP, 4);
+ ST_SP(vDst6, destP);
+ destP += 4;
+ n -= 28;
+ } else if (n >= 24) {
+ LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5);
+ VCLIP4(vSrc0, vSrc1, vSrc2, vSrc3, vLowThr, vHighThr, vDst0, vDst1, vDst2, vDst3);
+ VCLIP2(vSrc4, vSrc5, vLowThr, vHighThr, vDst4, vDst5);
+ ST_SP6(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, destP, 4);
+ n -= 24;
+ } else if (n >= 16) {
+ LD_SP4(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3);
+ VCLIP4(vSrc0, vSrc1, vSrc2, vSrc3, vLowThr, vHighThr, vDst0, vDst1, vDst2, vDst3);
+ ST_SP4(vDst0, vDst1, vDst2, vDst3, destP, 4);
+ n -= 16;
+ } else if (n >= 8) {
+ LD_SP2(sourceP, 4, vSrc0, vSrc1);
+ VCLIP2(vSrc0, vSrc1, vLowThr, vHighThr, vDst0, vDst1);
+ ST_SP2(vDst0, vDst1, destP, 4);
+ n -= 8;
+ }
+ if (n >= 4) {
+ vSrc0 = LD_SP(sourceP);
+ sourceP += 4;
+ vDst0 = __msa_fmax_w(__msa_fmin_w(vSrc0, vHighThr), vLowThr);
+ ST_SP(vDst0, destP);
+ destP += 4;
+ n -= 4;
+ }
+ }
+ }
#endif
while (n--) {
*destP = clampTo(*sourceP, lowThreshold, highThreshold);

Powered by Google App Engine
This is Rietveld 408576698