Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(85)

Side by Side Diff: third_party/WebKit/Source/platform/audio/VectorMath.cpp

Issue 2340583003: Add MSA (MIPS SIMD Arch) optimized VectorMath functions (Closed)
Patch Set: Removing zvmul and vsvesq Created 4 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* 1 /*
2 * Copyright (C) 2010, Google Inc. All rights reserved. 2 * Copyright (C) 2010, Google Inc. All rights reserved.
3 * 3 *
4 * Redistribution and use in source and binary forms, with or without 4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions 5 * modification, are permitted provided that the following conditions
6 * are met: 6 * are met:
7 * 1. Redistributions of source code must retain the above copyright 7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer. 8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright 9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the 10 * notice, this list of conditions and the following disclaimer in the
(...skipping 22 matching lines...) Expand all
33 #endif 33 #endif
34 34
35 #if CPU(X86) || CPU(X86_64) 35 #if CPU(X86) || CPU(X86_64)
36 #include <emmintrin.h> 36 #include <emmintrin.h>
37 #endif 37 #endif
38 38
39 #if HAVE(ARM_NEON_INTRINSICS) 39 #if HAVE(ARM_NEON_INTRINSICS)
40 #include <arm_neon.h> 40 #include <arm_neon.h>
41 #endif 41 #endif
42 42
43 #if HAVE(MIPS_MSA_INTRINSICS)
44 #include "platform/cpu/mips/CommonMacrosMSA.h"
45 #endif
46
43 #include <math.h> 47 #include <math.h>
44 #include <algorithm> 48 #include <algorithm>
45 49
46 namespace blink { 50 namespace blink {
47 51
48 namespace VectorMath { 52 namespace VectorMath {
49 53
50 #if OS(MACOSX) 54 #if OS(MACOSX)
51 // On the Mac we use the highly optimized versions in Accelerate.framework 55 // On the Mac we use the highly optimized versions in Accelerate.framework
52 // In 32-bit mode (__ppc__ or __i386__) <Accelerate/Accelerate.h> includes <vecL ib/vDSP_translate.h> which defines macros of the same name as 56 // In 32-bit mode (__ppc__ or __i386__) <Accelerate/Accelerate.h> includes <vecL ib/vDSP_translate.h> which defines macros of the same name as
(...skipping 122 matching lines...) Expand 10 before | Expand all | Expand 10 after
175 float32x4_t dest = vld1q_f32(destP); 179 float32x4_t dest = vld1q_f32(destP);
176 180
177 dest = vmlaq_f32(dest, source, k); 181 dest = vmlaq_f32(dest, source, k);
178 vst1q_f32(destP, dest); 182 vst1q_f32(destP, dest);
179 183
180 sourceP += 4; 184 sourceP += 4;
181 destP += 4; 185 destP += 4;
182 } 186 }
183 n = tailFrames; 187 n = tailFrames;
184 } 188 }
189 #elif HAVE(MIPS_MSA_INTRINSICS)
190 if ((sourceStride == 1) && (destStride == 1)) {
191 float* destPCopy = destP;
192 const v4f32 vScale = (v4f32) __msa_fill_w(*((int32_t *) scale));
193 v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7;
194 v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7;
195
196 for (; n >= 32; n -= 32) {
197 LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7);
Raymond Toy 2016/10/03 16:47:06 Are there alignment constraints for sourceP and destP?
Prashant.Patil 2016/10/04 11:47:27 There are no alignment constraints
198 LD_SP8(destPCopy, 4, vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6 , vDst7);
199 VSMA4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vScale );
200 VSMA4(vSrc4, vSrc5, vSrc6, vSrc7, vDst4, vDst5, vDst6, vDst7, vScale );
201 ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP , 4);
202 }
203
204 if (n > 0) {
205 if (n >= 28) {
Raymond Toy 2016/10/03 16:47:06 Is there really much to be gained in having this c
Prashant.Patil 2016/10/04 11:47:27 OK. I shall remove all cases below 32.
206 LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5);
207 vSrc6 = LD_SP(sourceP);
208 sourceP += 4;
209 LD_SP6(destPCopy, 4, vDst0, vDst1, vDst2, vDst3, vDst4, vDst5);
210 vDst6 = LD_SP(destPCopy);
211 destPCopy += 4;
212 VSMA4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vS cale);
213 VSMA2(vSrc4, vSrc5, vDst4, vDst5, vScale);
214 vDst6 += vSrc6 * vScale;
215 ST_SP6(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, destP, 4);
216 ST_SP(vDst6, destP);
217 destP += 4;
218 n -= 28;
219 } else if (n >= 24) {
220 LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5);
221 LD_SP6(destPCopy, 4, vDst0, vDst1, vDst2, vDst3, vDst4, vDst5);
222 VSMA4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vS cale);
223 VSMA2(vSrc4, vSrc5, vDst4, vDst5, vScale);
224 ST_SP6(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, destP, 4);
225 n -= 24;
226 } else if (n >= 16) {
227 LD_SP4(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3);
228 LD_SP4(destPCopy, 4, vDst0, vDst1, vDst2, vDst3);
229 VSMA4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vS cale);
230 ST_SP4(vDst0, vDst1, vDst2, vDst3, destP, 4);
231 n -= 16;
232 } else if (n >= 8) {
233 LD_SP2(sourceP, 4, vSrc0, vSrc1);
234 LD_SP2(destPCopy, 4, vDst0, vDst1);
235 VSMA2(vSrc0, vSrc1, vDst0, vDst1, vScale);
236 ST_SP2(vDst0, vDst1, destP, 4);
237 n -= 8;
238 }
239 if (n >= 4) {
240 vSrc0 = LD_SP(sourceP);
241 vDst0 = LD_SP(destPCopy);
242 vDst0 += vSrc0 * vScale;
243 ST_SP(vDst0, destP);
244 sourceP += 4;
245 destP += 4;
246 n -= 4;
247 }
248 }
249 }
185 #endif 250 #endif
186 while (n) { 251 while (n) {
187 *destP += *sourceP * *scale; 252 *destP += *sourceP * *scale;
188 sourceP += sourceStride; 253 sourceP += sourceStride;
189 destP += destStride; 254 destP += destStride;
190 n--; 255 n--;
191 } 256 }
192 } 257 }
193 258
194 void vsmul(const float* sourceP, int sourceStride, const float* scale, float* de stP, int destStride, size_t framesToProcess) 259 void vsmul(const float* sourceP, int sourceStride, const float* scale, float* de stP, int destStride, size_t framesToProcess)
(...skipping 57 matching lines...) Expand 10 before | Expand all | Expand 10 after
252 317
253 while (destP < endP) { 318 while (destP < endP) {
254 float32x4_t source = vld1q_f32(sourceP); 319 float32x4_t source = vld1q_f32(sourceP);
255 vst1q_f32(destP, vmulq_n_f32(source, k)); 320 vst1q_f32(destP, vmulq_n_f32(source, k));
256 321
257 sourceP += 4; 322 sourceP += 4;
258 destP += 4; 323 destP += 4;
259 } 324 }
260 n = tailFrames; 325 n = tailFrames;
261 } 326 }
327 #elif HAVE(MIPS_MSA_INTRINSICS)
328 if ((sourceStride == 1) && (destStride == 1)) {
329 const v4f32 vScale = (v4f32) __msa_fill_w(*((int32_t *) scale));
330 v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7, vSrc8, vSr c9;
331 v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, vDst8, vDs t9;
332
333 for (; n >= 40; n -= 40) {
Raymond Toy 2016/10/03 16:47:06 Is it really worth doing blocks of 40 instead of 32?
Prashant.Patil 2016/10/04 11:47:27 OK. This was done considering 10 cycle vector load
Raymond Toy 2016/10/04 15:37:44 If there is significant gain in doing this, by all means keep it.
334 LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5);
335 LD_SP4(sourceP, 4, vSrc6, vSrc7, vSrc8, vSrc9);
336 VSMUL4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vScal e);
337 VSMUL4(vSrc4, vSrc5, vSrc6, vSrc7, vDst4, vDst5, vDst6, vDst7, vScal e);
338 VSMUL2(vSrc8, vSrc9, vDst8, vDst9, vScale);
339 ST_SP6(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, destP, 4);
340 ST_SP4(vDst6, vDst7, vDst8, vDst9, destP, 4);
341 }
342
343 if (n > 0) {
344 if (n >= 24) {
345 if (n >= 32) {
346 LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7);
347 VSMUL4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst 3, vScale);
348 VSMUL4(vSrc4, vSrc5, vSrc6, vSrc7, vDst4, vDst5, vDst6, vDst 7, vScale);
349 ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst 7, destP, 4);
350 n -= 32;
351 } else if (n >= 28) {
352 LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5) ;
353 vSrc6 = LD_SP(sourceP);
354 sourceP += 4;
355 VSMUL4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst 3, vScale);
356 VSMUL2(vSrc4, vSrc5, vDst4, vDst5, vScale);
357 vDst6 = vSrc6 * vScale;
358 ST_SP6(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, destP, 4);
359 ST_SP(vDst6, destP);
360 destP += 4;
361 n -= 28;
362 } else {
363 LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5) ;
364 VSMUL4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst 3, vScale);
365 VSMUL2(vSrc4, vSrc5, vDst4, vDst5, vScale);
366 ST_SP6(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, destP, 4);
367 n -= 24;
368 }
369 } else {
370 if (n >= 16) {
371 LD_SP4(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3);
372 VSMUL4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst 3, vScale);
373 ST_SP4(vDst0, vDst1, vDst2, vDst3, destP, 4);
374 n -= 16;
375 } else if (n >= 8) {
376 LD_SP2(sourceP, 4, vSrc0, vSrc1);
377 VSMUL2(vSrc0, vSrc1, vDst0, vDst1, vScale);
378 ST_SP2(vDst0, vDst1, destP, 4);
379 n -= 8;
380 }
381 }
382 if (n >= 4) {
383 vSrc0 = LD_SP(sourceP);
384 vDst0 = vSrc0 * vScale;
385 ST_SP(vDst0, destP);
386 sourceP += 4;
387 destP += 4;
388 n -= 4;
389 }
390 }
391 }
262 #endif 392 #endif
263 float k = *scale; 393 float k = *scale;
264 while (n--) { 394 while (n--) {
265 *destP = k * *sourceP; 395 *destP = k * *sourceP;
266 sourceP += sourceStride; 396 sourceP += sourceStride;
267 destP += destStride; 397 destP += destStride;
268 } 398 }
269 #if CPU(X86) || CPU(X86_64) 399 #if CPU(X86) || CPU(X86_64)
270 } 400 }
271 #endif 401 #endif
(...skipping 92 matching lines...) Expand 10 before | Expand all | Expand 10 after
364 float32x4_t source1 = vld1q_f32(source1P); 494 float32x4_t source1 = vld1q_f32(source1P);
365 float32x4_t source2 = vld1q_f32(source2P); 495 float32x4_t source2 = vld1q_f32(source2P);
366 vst1q_f32(destP, vaddq_f32(source1, source2)); 496 vst1q_f32(destP, vaddq_f32(source1, source2));
367 497
368 source1P += 4; 498 source1P += 4;
369 source2P += 4; 499 source2P += 4;
370 destP += 4; 500 destP += 4;
371 } 501 }
372 n = tailFrames; 502 n = tailFrames;
373 } 503 }
504 #elif HAVE(MIPS_MSA_INTRINSICS)
505 if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
506 v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7;
507 v4f32 vSrc8, vSrc9, vSrc10, vSrc11, vSrc12, vSrc13, vSrc14, vSrc15;
508 v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7;
509
510 for (; n >= 32; n -= 32) {
511 LD_SP8(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc8, vSrc9, vSrc10 , vSrc11);
512 LD_SP8(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7, vSrc12, vSrc13, vSrc 14, vSrc15);
Raymond Toy 2016/10/03 16:47:06 Can we pick better names for vSrc[n]? It's really
Prashant.Patil 2016/10/04 11:47:27 Done.
513 ADD4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3);
514 ADD4(vSrc8, vSrc12, vSrc9, vSrc13, vSrc10, vSrc14, vSrc11, vSrc15, v Dst4, vDst5, vDst6, vDst7);
515 ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP , 4);
516 }
517
518 if (n > 0) {
519 if (n >= 20) {
Raymond Toy 2016/10/03 16:47:06 Is this really worth doing? In the typical use ca
Prashant.Patil 2016/10/04 11:47:27 Done.
520 if (n >= 28) {
521 LD_SP6(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc8, vSrc9 );
522 vSrc10 = LD_SP(source1P);
523 source1P += 4;
524 LD_SP6(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7, vSrc12, vSrc 13);
525 vSrc14 = LD_SP(source2P);
526 source2P += 4;
527 ADD4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3);
528 ADD2(vSrc8, vSrc12, vSrc9, vSrc13, vDst4, vDst5);
529 vDst6 = vSrc10 + vSrc14;
530 ST_SP6(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, destP, 4);
531 ST_SP(vDst6, destP);
532 destP += 4;
533 n -= 28;
534 } else if (n >= 24) {
535 LD_SP6(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc8, vSrc9 );
536 LD_SP6(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7, vSrc12, vSrc 13);
537 ADD4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3);
538 ADD2(vSrc8, vSrc12, vSrc9, vSrc13, vDst4, vDst5);
539 ST_SP4(vDst0, vDst1, vDst2, vDst3, destP, 4);
540 ST_SP2(vDst4, vDst5, destP, 4);
541 n -= 24;
542 } else {
543 LD_SP4(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3);
544 vSrc8 = LD_SP(source1P);
545 source1P += 4;
546 LD_SP4(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7);
547 vSrc12 = LD_SP(source2P);
548 source2P += 4;
549 ADD4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3);
550 vDst4 = vSrc8 + vSrc12;
551 ST_SP4(vDst0, vDst1, vDst2, vDst3, destP, 4);
552 ST_SP(vDst4, destP);
553 destP += 4;
554 n -= 20;
555 }
556 } else if (n >= 4) {
557 if (n >= 16) {
558 LD_SP4(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3);
559 LD_SP4(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7);
560 ADD4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3);
561 ST_SP4(vDst0, vDst1, vDst2, vDst3, destP, 4);
562 n -= 16;
563 } else if (n >= 12) {
564 LD_SP2(source1P, 4, vSrc0, vSrc1);
565 vSrc2 = LD_SP(source1P);
566 source1P += 4;
567 LD_SP2(source2P, 4, vSrc4, vSrc5);
568 vSrc6 = LD_SP(source2P);
569 source2P += 4;
570 ADD2(vSrc0, vSrc4, vSrc1, vSrc5, vDst0, vDst1);
571 vDst2 = vSrc2 + vSrc6;
572 ST_SP2(vDst0, vDst1, destP, 4);
573 ST_SP(vDst2, destP);
574 destP += 4;
575 n -= 12;
576 } else if (n >= 8) {
577 LD_SP2(source1P, 4, vSrc0, vSrc1);
578 LD_SP2(source2P, 4, vSrc4, vSrc5);
579 ADD2(vSrc0, vSrc4, vSrc1, vSrc5, vDst0, vDst1);
580 ST_SP2(vDst0, vDst1, destP, 4);
581 n -= 8;
582 } else {
583 vSrc0 = LD_SP(source1P);
584 vSrc4 = LD_SP(source2P);
585 vDst0 = vSrc0 + vSrc4;
586 ST_SP(vDst0, destP);
587 source1P += 4;
588 source2P += 4;
589 destP += 4;
590 n -= 4;
591 }
592 }
593 }
594 }
374 #endif 595 #endif
375 while (n--) { 596 while (n--) {
376 *destP = *source1P + *source2P; 597 *destP = *source1P + *source2P;
377 source1P += sourceStride1; 598 source1P += sourceStride1;
378 source2P += sourceStride2; 599 source2P += sourceStride2;
379 destP += destStride; 600 destP += destStride;
380 } 601 }
381 #if CPU(X86) || CPU(X86_64) 602 #if CPU(X86) || CPU(X86_64)
382 } 603 }
383 #endif 604 #endif
(...skipping 57 matching lines...) Expand 10 before | Expand all | Expand 10 after
441 float32x4_t source1 = vld1q_f32(source1P); 662 float32x4_t source1 = vld1q_f32(source1P);
442 float32x4_t source2 = vld1q_f32(source2P); 663 float32x4_t source2 = vld1q_f32(source2P);
443 vst1q_f32(destP, vmulq_f32(source1, source2)); 664 vst1q_f32(destP, vmulq_f32(source1, source2));
444 665
445 source1P += 4; 666 source1P += 4;
446 source2P += 4; 667 source2P += 4;
447 destP += 4; 668 destP += 4;
448 } 669 }
449 n = tailFrames; 670 n = tailFrames;
450 } 671 }
672 #elif HAVE(MIPS_MSA_INTRINSICS)
673 if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
674 v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7;
675 v4f32 vSrc8, vSrc9, vSrc10, vSrc11, vSrc12, vSrc13, vSrc14, vSrc15;
676 v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7;
677
678 for (; n >= 32; n -= 32) {
679 LD_SP8(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc8, vSrc9, vSrc10 , vSrc11);
Raymond Toy 2016/10/03 16:47:06 Same comment as in line 512.
Prashant.Patil 2016/10/04 11:47:27 Done.
680 LD_SP8(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7, vSrc12, vSrc13, vSrc 14, vSrc15);
681 MUL4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3);
682 MUL4(vSrc8, vSrc12, vSrc9, vSrc13, vSrc10, vSrc14, vSrc11, vSrc15, v Dst4, vDst5, vDst6, vDst7);
683 ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP , 4);
684 }
685
686 if (n > 0) {
687 if (n >= 20) {
688 if (n >= 28) {
689 LD_SP6(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc8, vSrc9 );
690 vSrc10 = LD_SP(source1P);
691 source1P += 4;
692 LD_SP6(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7, vSrc12, vSrc 13);
693 vSrc14 = LD_SP(source2P);
694 source2P += 4;
695 MUL4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3);
696 MUL2(vSrc8, vSrc12, vSrc9, vSrc13, vDst4, vDst5);
697 vDst6 = vSrc10 * vSrc14;
698 ST_SP6(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, destP, 4);
699 ST_SP(vDst6, destP);
700 destP += 4;
701 n -= 28;
702 } else if (n >= 24) {
703 LD_SP6(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc8, vSrc9 );
704 LD_SP6(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7, vSrc12, vSrc 13);
705 MUL4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3);
706 MUL2(vSrc8, vSrc12, vSrc9, vSrc13, vDst4, vDst5);
707 ST_SP4(vDst0, vDst1, vDst2, vDst3, destP, 4);
708 ST_SP2(vDst4, vDst5, destP, 4);
709 n -= 24;
710 } else { /* n >= 20 */
711 LD_SP4(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3);
712 vSrc8 = LD_SP(source1P);
713 source1P += 4;
714 LD_SP4(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7);
715 vSrc12 = LD_SP(source2P);
716 source2P += 4;
717 MUL4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3);
718 vDst4 = vSrc8 * vSrc12;
719 ST_SP4(vDst0, vDst1, vDst2, vDst3, destP, 4);
720 ST_SP(vDst4, destP);
721 destP += 4;
722 n -= 20;
723 }
724 } else if (n >= 4) {
725 if (n >= 16) {
726 LD_SP4(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3);
727 LD_SP4(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7);
728 MUL4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3);
729 ST_SP4(vDst0, vDst1, vDst2, vDst3, destP, 4);
730 n -= 16;
731 } else if (n >= 12) {
732 LD_SP2(source1P, 4, vSrc0, vSrc1);
733 vSrc2 = LD_SP(source1P);
734 source1P += 4;
735 LD_SP2(source2P, 4, vSrc4, vSrc5);
736 vSrc6 = LD_SP(source2P);
737 source2P += 4;
738 MUL2(vSrc0, vSrc4, vSrc1, vSrc5, vDst0, vDst1);
739 vDst2 = vSrc2 * vSrc6;
740 ST_SP2(vDst0, vDst1, destP, 4);
741 ST_SP(vDst2, destP);
742 destP += 4;
743 n -= 12;
744 } else if (n >= 8) {
745 LD_SP2(source1P, 4, vSrc0, vSrc1);
746 LD_SP2(source2P, 4, vSrc4, vSrc5);
747 MUL2(vSrc0, vSrc4, vSrc1, vSrc5, vDst0, vDst1);
748 ST_SP2(vDst0, vDst1, destP, 4);
749 n -= 8;
750 } else { // n >= 4
751 vSrc0 = LD_SP(source1P);
752 vSrc4 = LD_SP(source2P);
753 vDst0 = vSrc0 * vSrc4;
754 ST_SP(vDst0, destP);
755 source1P += 4;
756 source2P += 4;
757 destP += 4;
758 n -= 4;
759 }
760 }
761 }
762 }
451 #endif 763 #endif
452 while (n) { 764 while (n) {
453 *destP = *source1P * *source2P; 765 *destP = *source1P * *source2P;
454 source1P += sourceStride1; 766 source1P += sourceStride1;
455 source2P += sourceStride2; 767 source2P += sourceStride2;
456 destP += destStride; 768 destP += destStride;
457 n--; 769 n--;
458 } 770 }
459 } 771 }
460 772
(...skipping 169 matching lines...) Expand 10 before | Expand all | Expand 10 after
630 sourceP += 4; 942 sourceP += 4;
631 } 943 }
632 float32x2_t twoMax = vmax_f32(vget_low_f32(fourMax), vget_high_f32(fourM ax)); 944 float32x2_t twoMax = vmax_f32(vget_low_f32(fourMax), vget_high_f32(fourM ax));
633 945
634 float groupMax[2]; 946 float groupMax[2];
635 vst1_f32(groupMax, twoMax); 947 vst1_f32(groupMax, twoMax);
636 max = std::max(groupMax[0], groupMax[1]); 948 max = std::max(groupMax[0], groupMax[1]);
637 949
638 n = tailFrames; 950 n = tailFrames;
639 } 951 }
952 #elif HAVE(MIPS_MSA_INTRINSICS)
953 if (sourceStride == 1) {
954 v4f32 vMax = {0, };
955 v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7, vSrc8, vSr c9;
956 const v16i8 vMask = (v16i8) __msa_fill_w(0x7FFFFFFF);
957
958 for (; n >= 40; n -= 40) {
959 LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5);
Raymond Toy 2016/10/03 16:47:06 Same comment as in line 333.
Prashant.Patil 2016/10/04 11:47:27 Done.
960 LD_SP4(sourceP, 4, vSrc6, vSrc7, vSrc8, vSrc9);
961 VMAXMGV4(vSrc0, vSrc1, vSrc2, vSrc3, vMask, vMax);
962 VMAXMGV4(vSrc4, vSrc5, vSrc6, vSrc7, vMask, vMax);
963 VMAXMGV2(vSrc8, vSrc9, vMask, vMax);
964 }
965
966 if (n > 0) {
967 if (n >= 32) {
968 LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSr c6, vSrc7);
969 VMAXMGV4(vSrc0, vSrc1, vSrc2, vSrc3, vMask, vMax);
970 VMAXMGV4(vSrc4, vSrc5, vSrc6, vSrc7, vMask, vMax);
971 n -= 32;
972 } else if (n >= 28) {
973 LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5);
974 vSrc6 = LD_SP(sourceP);
975 sourceP += 4;
976 VMAXMGV4(vSrc0, vSrc1, vSrc2, vSrc3, vMask, vMax);
977 VMAXMGV2(vSrc4, vSrc5, vMask, vMax);
978 vMax = __msa_fmax_w(vMax, (v4f32) ((v16i8) vSrc6 & vMask));
979 n -= 28;
980 } else if (n >= 24) {
981 LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5);
982 VMAXMGV4(vSrc0, vSrc1, vSrc2, vSrc3, vMask, vMax);
983 VMAXMGV2(vSrc4, vSrc5, vMask, vMax);
984 n -= 24;
985 } else if (n >= 16) {
986 LD_SP4(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3);
987 VMAXMGV4(vSrc0, vSrc1, vSrc2, vSrc3, vMask, vMax);
988 n -= 16;
989 } else if (n >= 8) {
990 LD_SP2(sourceP, 4, vSrc0, vSrc1);
991 VMAXMGV2(vSrc0, vSrc1, vMask, vMax);
992 n -= 8;
993 }
994
995 if (n >= 4) {
996 vSrc0 = LD_SP(sourceP);
997 sourceP += 4;
998 vMax = __msa_fmax_w(vMax, (v4f32) ((v16i8) vSrc0 & vMask));
999 n -= 4;
1000 }
1001 }
1002
1003 max = std::max(max, vMax[0]);
1004 max = std::max(max, vMax[1]);
1005 max = std::max(max, vMax[2]);
1006 max = std::max(max, vMax[3]);
1007 }
640 #endif 1008 #endif
641 1009
642 while (n--) { 1010 while (n--) {
643 max = std::max(max, fabsf(*sourceP)); 1011 max = std::max(max, fabsf(*sourceP));
644 sourceP += sourceStride; 1012 sourceP += sourceStride;
645 } 1013 }
646 1014
647 ASSERT(maxP); 1015 ASSERT(maxP);
648 *maxP = max; 1016 *maxP = max;
649 } 1017 }
(...skipping 13 matching lines...) Expand all
663 float32x4_t low = vdupq_n_f32(lowThreshold); 1031 float32x4_t low = vdupq_n_f32(lowThreshold);
664 float32x4_t high = vdupq_n_f32(highThreshold); 1032 float32x4_t high = vdupq_n_f32(highThreshold);
665 while (destP < endP) { 1033 while (destP < endP) {
666 float32x4_t source = vld1q_f32(sourceP); 1034 float32x4_t source = vld1q_f32(sourceP);
667 vst1q_f32(destP, vmaxq_f32(vminq_f32(source, high), low)); 1035 vst1q_f32(destP, vmaxq_f32(vminq_f32(source, high), low));
668 sourceP += 4; 1036 sourceP += 4;
669 destP += 4; 1037 destP += 4;
670 } 1038 }
671 n = tailFrames; 1039 n = tailFrames;
672 } 1040 }
1041 #elif HAVE(MIPS_MSA_INTRINSICS)
1042 if ((sourceStride == 1) && (destStride == 1)) {
1043 v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7, vSrc8, vSr c9;
1044 v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, vDst8, vDs t9;
1045 const v4f32 vLowThr = (v4f32) __msa_fill_w(*((int32_t *) lowThresholdP)) ;
1046 const v4f32 vHighThr = (v4f32) __msa_fill_w(*((int32_t *) highThresholdP ));
1047
1048 for (; n >= 40; n -= 40) {
1049 LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7);
1050 LD_SP2(sourceP, 4, vSrc8, vSrc9);
1051 VCLIP4(vSrc0, vSrc1, vSrc2, vSrc3, vLowThr, vHighThr, vDst0, vDst1, vDst2, vDst3);
1052 VCLIP4(vSrc4, vSrc5, vSrc6, vSrc7, vLowThr, vHighThr, vDst4, vDst5, vDst6, vDst7);
1053 VCLIP2(vSrc8, vSrc9, vLowThr, vHighThr, vDst8, vDst9);
1054 ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP , 4);
1055 ST_SP2(vDst8, vDst9, destP, 4);
1056 }
1057
1058 if (n > 0) {
1059 if (n >= 32) {
1060 LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSr c6, vSrc7);
1061 VCLIP4(vSrc0, vSrc1, vSrc2, vSrc3, vLowThr, vHighThr, vDst0, vDs t1, vDst2, vDst3);
1062 VCLIP4(vSrc4, vSrc5, vSrc6, vSrc7, vLowThr, vHighThr, vDst4, vDs t5, vDst6, vDst7);
1063 ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, d estP, 4);
1064 n -= 32;
1065 } else if (n >= 28) {
1066 LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5);
1067 vSrc6 = LD_SP(sourceP);
1068 sourceP += 4;
1069 VCLIP4(vSrc0, vSrc1, vSrc2, vSrc3, vLowThr, vHighThr, vDst0, vDs t1, vDst2, vDst3);
1070 VCLIP2(vSrc4, vSrc5, vLowThr, vHighThr, vDst4, vDst5);
1071 vDst6 = __msa_fmax_w(__msa_fmin_w(vSrc6, vHighThr), vLowThr);
1072 ST_SP6(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, destP, 4);
1073 ST_SP(vDst6, destP);
1074 destP += 4;
1075 n -= 28;
1076 } else if (n >= 24) {
1077 LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5);
1078 VCLIP4(vSrc0, vSrc1, vSrc2, vSrc3, vLowThr, vHighThr, vDst0, vDs t1, vDst2, vDst3);
1079 VCLIP2(vSrc4, vSrc5, vLowThr, vHighThr, vDst4, vDst5);
1080 ST_SP6(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, destP, 4);
1081 n -= 24;
1082 } else if (n >= 16) {
1083 LD_SP4(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3);
1084 VCLIP4(vSrc0, vSrc1, vSrc2, vSrc3, vLowThr, vHighThr, vDst0, vDs t1, vDst2, vDst3);
1085 ST_SP4(vDst0, vDst1, vDst2, vDst3, destP, 4);
1086 n -= 16;
1087 } else if (n >= 8) {
1088 LD_SP2(sourceP, 4, vSrc0, vSrc1);
1089 VCLIP2(vSrc0, vSrc1, vLowThr, vHighThr, vDst0, vDst1);
1090 ST_SP2(vDst0, vDst1, destP, 4);
1091 n -= 8;
1092 }
1093 if (n >= 4) {
1094 vSrc0 = LD_SP(sourceP);
1095 sourceP += 4;
1096 vDst0 = __msa_fmax_w(__msa_fmin_w(vSrc0, vHighThr), vLowThr);
1097 ST_SP(vDst0, destP);
1098 destP += 4;
1099 n -= 4;
1100 }
1101 }
1102 }
673 #endif 1103 #endif
674 while (n--) { 1104 while (n--) {
675 *destP = clampTo(*sourceP, lowThreshold, highThreshold); 1105 *destP = clampTo(*sourceP, lowThreshold, highThreshold);
676 sourceP += sourceStride; 1106 sourceP += sourceStride;
677 destP += destStride; 1107 destP += destStride;
678 } 1108 }
679 } 1109 }
680 1110
681 #endif // OS(MACOSX) 1111 #endif // OS(MACOSX)
682 1112
683 } // namespace VectorMath 1113 } // namespace VectorMath
684 1114
685 } // namespace blink 1115 } // namespace blink
686 1116
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698