Chromium Code Reviews| OLD | NEW |
|---|---|
| 1 /* | 1 /* |
| 2 * Copyright (C) 2010, Google Inc. All rights reserved. | 2 * Copyright (C) 2010, Google Inc. All rights reserved. |
| 3 * | 3 * |
| 4 * Redistribution and use in source and binary forms, with or without | 4 * Redistribution and use in source and binary forms, with or without |
| 5 * modification, are permitted provided that the following conditions | 5 * modification, are permitted provided that the following conditions |
| 6 * are met: | 6 * are met: |
| 7 * 1. Redistributions of source code must retain the above copyright | 7 * 1. Redistributions of source code must retain the above copyright |
| 8 * notice, this list of conditions and the following disclaimer. | 8 * notice, this list of conditions and the following disclaimer. |
| 9 * 2. Redistributions in binary form must reproduce the above copyright | 9 * 2. Redistributions in binary form must reproduce the above copyright |
| 10 * notice, this list of conditions and the following disclaimer in the | 10 * notice, this list of conditions and the following disclaimer in the |
| (...skipping 23 matching lines...) Expand all Loading... | |
| 34 #endif | 34 #endif |
| 35 | 35 |
| 36 #if CPU(X86) || CPU(X86_64) | 36 #if CPU(X86) || CPU(X86_64) |
| 37 #include <emmintrin.h> | 37 #include <emmintrin.h> |
| 38 #endif | 38 #endif |
| 39 | 39 |
| 40 #if HAVE(ARM_NEON_INTRINSICS) | 40 #if HAVE(ARM_NEON_INTRINSICS) |
| 41 #include <arm_neon.h> | 41 #include <arm_neon.h> |
| 42 #endif | 42 #endif |
| 43 | 43 |
| 44 #if HAVE(MIPS_MSA_INTRINSICS) | |
| 45 #include "platform/cpu/mips/CommonMacrosMSA.h" | |
| 46 #endif | |
| 47 | |
| 44 #include <math.h> | 48 #include <math.h> |
| 45 #include <algorithm> | 49 #include <algorithm> |
| 46 | 50 |
| 47 namespace blink { | 51 namespace blink { |
| 48 | 52 |
| 49 namespace VectorMath { | 53 namespace VectorMath { |
| 50 | 54 |
| 51 #if OS(MACOSX) | 55 #if OS(MACOSX) |
| 52 // On the Mac we use the highly optimized versions in Accelerate.framework | 56 // On the Mac we use the highly optimized versions in Accelerate.framework |
| 53 // In 32-bit mode (__ppc__ or __i386__) <Accelerate/Accelerate.h> includes | 57 // In 32-bit mode (__ppc__ or __i386__) <Accelerate/Accelerate.h> includes |
| (...skipping 168 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 222 float32x4_t dest = vld1q_f32(destP); | 226 float32x4_t dest = vld1q_f32(destP); |
| 223 | 227 |
| 224 dest = vmlaq_f32(dest, source, k); | 228 dest = vmlaq_f32(dest, source, k); |
| 225 vst1q_f32(destP, dest); | 229 vst1q_f32(destP, dest); |
| 226 | 230 |
| 227 sourceP += 4; | 231 sourceP += 4; |
| 228 destP += 4; | 232 destP += 4; |
| 229 } | 233 } |
| 230 n = tailFrames; | 234 n = tailFrames; |
| 231 } | 235 } |
| 236 #elif HAVE(MIPS_MSA_INTRINSICS) | |
| 237 if ((sourceStride == 1) && (destStride == 1)) { | |
| 238 float* destPCopy = destP; | |
| 239 const v4f32 vScale = (v4f32)__msa_fill_w(FLOAT2INT(*scale)); | |
| 240 v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7; | |
| 241 v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7; | |
| 242 | |
| 243 for (; n >= 32; n -= 32) { | |
| 244 LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, | |
| 245 vSrc7); | |
| 246 LD_SP8(destPCopy, 4, vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, | |
| 247 vDst7); | |
| 248 VSMA4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vScale); | |
| 249 VSMA4(vSrc4, vSrc5, vSrc6, vSrc7, vDst4, vDst5, vDst6, vDst7, vScale); | |
| 250 ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP, 4); | |
| 251 } | |
| 252 } | |
| 232 #endif | 253 #endif |
| 233 while (n) { | 254 while (n) { |
| 234 *destP += *sourceP * *scale; | 255 *destP += *sourceP * *scale; |
| 235 sourceP += sourceStride; | 256 sourceP += sourceStride; |
| 236 destP += destStride; | 257 destP += destStride; |
| 237 n--; | 258 n--; |
| 238 } | 259 } |
| 239 } | 260 } |
| 240 | 261 |
| 241 void vsmul(const float* sourceP, | 262 void vsmul(const float* sourceP, |
| (...skipping 61 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 303 | 324 |
| 304 while (destP < endP) { | 325 while (destP < endP) { |
| 305 float32x4_t source = vld1q_f32(sourceP); | 326 float32x4_t source = vld1q_f32(sourceP); |
| 306 vst1q_f32(destP, vmulq_n_f32(source, k)); | 327 vst1q_f32(destP, vmulq_n_f32(source, k)); |
| 307 | 328 |
| 308 sourceP += 4; | 329 sourceP += 4; |
| 309 destP += 4; | 330 destP += 4; |
| 310 } | 331 } |
| 311 n = tailFrames; | 332 n = tailFrames; |
| 312 } | 333 } |
| 334 #elif HAVE(MIPS_MSA_INTRINSICS) | |
| 335 if ((sourceStride == 1) && (destStride == 1)) { | |
| 336 const v4f32 vScale = (v4f32)__msa_fill_w(FLOAT2INT(*scale)); | |
| 337 v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7; | |
| 338 v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7; | |
| 339 | |
| 340 for (; n >= 32; n -= 32) { | |
| 341 LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, | |
| 342 vSrc7); | |
| 343 VSMUL4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vScale); | |
| 344 VSMUL4(vSrc4, vSrc5, vSrc6, vSrc7, vDst4, vDst5, vDst6, vDst7, vScale); | |
| 345 ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP, 4); | |
| 346 } | |
| 347 } | |
| 313 #endif | 348 #endif |
| 314 float k = *scale; | 349 float k = *scale; |
| 315 while (n--) { | 350 while (n--) { |
| 316 *destP = k * *sourceP; | 351 *destP = k * *sourceP; |
| 317 sourceP += sourceStride; | 352 sourceP += sourceStride; |
| 318 destP += destStride; | 353 destP += destStride; |
| 319 } | 354 } |
| 320 #if CPU(X86) || CPU(X86_64) | 355 #if CPU(X86) || CPU(X86_64) |
| 321 } | 356 } |
| 322 #endif | 357 #endif |
| (...skipping 101 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 424 float32x4_t source1 = vld1q_f32(source1P); | 459 float32x4_t source1 = vld1q_f32(source1P); |
| 425 float32x4_t source2 = vld1q_f32(source2P); | 460 float32x4_t source2 = vld1q_f32(source2P); |
| 426 vst1q_f32(destP, vaddq_f32(source1, source2)); | 461 vst1q_f32(destP, vaddq_f32(source1, source2)); |
| 427 | 462 |
| 428 source1P += 4; | 463 source1P += 4; |
| 429 source2P += 4; | 464 source2P += 4; |
| 430 destP += 4; | 465 destP += 4; |
| 431 } | 466 } |
| 432 n = tailFrames; | 467 n = tailFrames; |
| 433 } | 468 } |
| 469 #elif HAVE(MIPS_MSA_INTRINSICS) | |
| 470 if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) { | |
| 471 v4f32 vSrc1P0, vSrc1P1, vSrc1P2, vSrc1P3, vSrc1P4, vSrc1P5, vSrc1P6, | |
| 472 vSrc1P7; | |
| 473 v4f32 vSrc2P0, vSrc2P1, vSrc2P2, vSrc2P3, vSrc2P4, vSrc2P5, vSrc2P6, | |
| 474 vSrc2P7; | |
| 475 v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7; | |
| 476 | |
| 477 for (; n >= 32; n -= 32) { | |
| 478 LD_SP8(source1P, 4, vSrc1P0, vSrc1P1, vSrc1P2, vSrc1P3, vSrc1P4, vSrc1P5, | |
| 479 vSrc1P6, vSrc1P7); | |
| 480 LD_SP8(source2P, 4, vSrc2P0, vSrc2P1, vSrc2P2, vSrc2P3, vSrc2P4, vSrc2P5, | |
| 481 vSrc2P6, vSrc2P7); | |
| 482 ADD4(vSrc1P0, vSrc2P0, vSrc1P1, vSrc2P1, vSrc1P2, vSrc2P2, vSrc1P3, | |
| 483 vSrc2P3, vDst0, vDst1, vDst2, vDst3); | |
| 484 ADD4(vSrc1P4, vSrc2P4, vSrc1P5, vSrc2P5, vSrc1P6, vSrc2P6, vSrc1P7, | |
| 485 vSrc2P7, vDst4, vDst5, vDst6, vDst7); | |
| 486 ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP, 4); | |
| 487 } | |
| 488 } | |
| 434 #endif | 489 #endif |
| 435 while (n--) { | 490 while (n--) { |
| 436 *destP = *source1P + *source2P; | 491 *destP = *source1P + *source2P; |
| 437 source1P += sourceStride1; | 492 source1P += sourceStride1; |
| 438 source2P += sourceStride2; | 493 source2P += sourceStride2; |
| 439 destP += destStride; | 494 destP += destStride; |
| 440 } | 495 } |
| 441 #if CPU(X86) || CPU(X86_64) | 496 #if CPU(X86) || CPU(X86_64) |
| 442 } | 497 } |
| 443 #endif | 498 #endif |
| (...skipping 63 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 507 float32x4_t source1 = vld1q_f32(source1P); | 562 float32x4_t source1 = vld1q_f32(source1P); |
| 508 float32x4_t source2 = vld1q_f32(source2P); | 563 float32x4_t source2 = vld1q_f32(source2P); |
| 509 vst1q_f32(destP, vmulq_f32(source1, source2)); | 564 vst1q_f32(destP, vmulq_f32(source1, source2)); |
| 510 | 565 |
| 511 source1P += 4; | 566 source1P += 4; |
| 512 source2P += 4; | 567 source2P += 4; |
| 513 destP += 4; | 568 destP += 4; |
| 514 } | 569 } |
| 515 n = tailFrames; | 570 n = tailFrames; |
| 516 } | 571 } |
| 572 #elif HAVE(MIPS_MSA_INTRINSICS) | |
| 573 if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) { | |
| 574 v4f32 vSrc1P0, vSrc1P1, vSrc1P2, vSrc1P3, vSrc1P4, vSrc1P5, vSrc1P6, | |
| 575 vSrc1P7; | |
| 576 v4f32 vSrc2P0, vSrc2P1, vSrc2P2, vSrc2P3, vSrc2P4, vSrc2P5, vSrc2P6, | |
| 577 vSrc2P7; | |
| 578 v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7; | |
| 579 | |
| 580 for (; n >= 32; n -= 32) { | |
| 581 LD_SP8(source1P, 4, vSrc1P0, vSrc1P1, vSrc1P2, vSrc1P3, vSrc1P4, vSrc1P5, | |
| 582 vSrc1P6, vSrc1P7); | |
| 583 LD_SP8(source2P, 4, vSrc2P0, vSrc2P1, vSrc2P2, vSrc2P3, vSrc2P4, vSrc2P5, | |
| 584 vSrc2P6, vSrc2P7); | |
| 585 MUL4(vSrc1P0, vSrc2P0, vSrc1P1, vSrc2P1, vSrc1P2, vSrc2P2, vSrc1P3, | |
| 586 vSrc2P3, vDst0, vDst1, vDst2, vDst3); | |
| 587 MUL4(vSrc1P4, vSrc2P4, vSrc1P5, vSrc2P5, vSrc1P6, vSrc2P6, vSrc1P7, | |
| 588 vSrc2P7, vDst4, vDst5, vDst6, vDst7); | |
| 589 ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP, 4); | |
| 590 } | |
| 591 } | |
| 517 #endif | 592 #endif |
| 518 while (n) { | 593 while (n) { |
| 519 *destP = *source1P * *source2P; | 594 *destP = *source1P * *source2P; |
| 520 source1P += sourceStride1; | 595 source1P += sourceStride1; |
| 521 source2P += sourceStride2; | 596 source2P += sourceStride2; |
| 522 destP += destStride; | 597 destP += destStride; |
| 523 n--; | 598 n--; |
| 524 } | 599 } |
| 525 } | 600 } |
| 526 | 601 |
| (...skipping 182 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 709 } | 784 } |
| 710 float32x2_t twoMax = | 785 float32x2_t twoMax = |
| 711 vmax_f32(vget_low_f32(fourMax), vget_high_f32(fourMax)); | 786 vmax_f32(vget_low_f32(fourMax), vget_high_f32(fourMax)); |
| 712 | 787 |
| 713 float groupMax[2]; | 788 float groupMax[2]; |
| 714 vst1_f32(groupMax, twoMax); | 789 vst1_f32(groupMax, twoMax); |
| 715 max = std::max(groupMax[0], groupMax[1]); | 790 max = std::max(groupMax[0], groupMax[1]); |
| 716 | 791 |
| 717 n = tailFrames; | 792 n = tailFrames; |
| 718 } | 793 } |
| 794 #elif HAVE(MIPS_MSA_INTRINSICS) | |
| 795 if (sourceStride == 1) { | |
| 796 v4f32 vMax = { | |
| 797 0, | |
| 798 }; | |
| 799 v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7; | |
| 800 const v16i8 vSignBitMask = (v16i8)__msa_fill_w(0x7FFFFFFF); | |
| 801 | |
| 802 for (; n >= 32; n -= 32) { | |
| 803 LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, | |
| 804 vSrc7); | |
| 805 AND_W4_SP(vSrc0, vSrc1, vSrc2, vSrc3, vSignBitMask); | |
| 806 VMAX_W4_SP(vSrc0, vSrc1, vSrc2, vSrc3, vMax); | |
| 807 AND_W4_SP(vSrc4, vSrc5, vSrc6, vSrc7, vSignBitMask); | |
| 808 VMAX_W4_SP(vSrc4, vSrc5, vSrc6, vSrc7, vMax); | |
| 809 } | |
| 810 | |
| 811 max = std::max(max, vMax[0]); | |
| 812 max = std::max(max, vMax[1]); | |
| 813 max = std::max(max, vMax[2]); | |
| 814 max = std::max(max, vMax[3]); | |
| 815 } | |
| 719 #endif | 816 #endif |
| 720 | 817 |
| 721 while (n--) { | 818 while (n--) { |
| 722 max = std::max(max, fabsf(*sourceP)); | 819 max = std::max(max, fabsf(*sourceP)); |
| 723 sourceP += sourceStride; | 820 sourceP += sourceStride; |
| 724 } | 821 } |
| 725 | 822 |
| 726 ASSERT(maxP); | 823 ASSERT(maxP); |
| 727 *maxP = max; | 824 *maxP = max; |
| 728 } | 825 } |
| (...skipping 18 matching lines...) Expand all Loading... | |
| 747 float32x4_t low = vdupq_n_f32(lowThreshold); | 844 float32x4_t low = vdupq_n_f32(lowThreshold); |
| 748 float32x4_t high = vdupq_n_f32(highThreshold); | 845 float32x4_t high = vdupq_n_f32(highThreshold); |
| 749 while (destP < endP) { | 846 while (destP < endP) { |
| 750 float32x4_t source = vld1q_f32(sourceP); | 847 float32x4_t source = vld1q_f32(sourceP); |
| 751 vst1q_f32(destP, vmaxq_f32(vminq_f32(source, high), low)); | 848 vst1q_f32(destP, vmaxq_f32(vminq_f32(source, high), low)); |
| 752 sourceP += 4; | 849 sourceP += 4; |
| 753 destP += 4; | 850 destP += 4; |
| 754 } | 851 } |
| 755 n = tailFrames; | 852 n = tailFrames; |
| 756 } | 853 } |
| 854 #elif HAVE(MIPS_MSA_INTRINSICS) | |
| 855 if ((sourceStride == 1) && (destStride == 1)) { | |
| 856 v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7; | |
| 857 v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7; | |
| 858 const v4f32 vLowThr = (v4f32)__msa_fill_w(FLOAT2INT(lowThreshold)); | |
|
Raymond Toy
2016/10/05 17:26:53
Isn't this some kind of gcc/clang extension? FLOAT2INT
Prashant.Patil
2016/10/06 08:27:35
I will remove this macro usage.
| |
| 859 const v4f32 vHighThr = (v4f32)__msa_fill_w(FLOAT2INT(highThreshold)); | |
| 860 | |
| 861 for (; n >= 32; n -= 32) { | |
| 862 LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, | |
| 863 vSrc7); | |
| 864 VCLIP4(vSrc0, vSrc1, vSrc2, vSrc3, vLowThr, vHighThr, vDst0, vDst1, vDst2, | |
| 865 vDst3); | |
| 866 VCLIP4(vSrc4, vSrc5, vSrc6, vSrc7, vLowThr, vHighThr, vDst4, vDst5, vDst6, | |
| 867 vDst7); | |
| 868 ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP, 4); | |
| 869 } | |
| 870 } | |
| 757 #endif | 871 #endif |
| 758 while (n--) { | 872 while (n--) { |
| 759 *destP = clampTo(*sourceP, lowThreshold, highThreshold); | 873 *destP = clampTo(*sourceP, lowThreshold, highThreshold); |
| 760 sourceP += sourceStride; | 874 sourceP += sourceStride; |
| 761 destP += destStride; | 875 destP += destStride; |
| 762 } | 876 } |
| 763 } | 877 } |
| 764 | 878 |
| 765 #endif // OS(MACOSX) | 879 #endif // OS(MACOSX) |
| 766 | 880 |
| 767 } // namespace VectorMath | 881 } // namespace VectorMath |
| 768 | 882 |
| 769 } // namespace blink | 883 } // namespace blink |
| OLD | NEW |