| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (C) 2010, Google Inc. All rights reserved. | 2 * Copyright (C) 2010, Google Inc. All rights reserved. |
| 3 * | 3 * |
| 4 * Redistribution and use in source and binary forms, with or without | 4 * Redistribution and use in source and binary forms, with or without |
| 5 * modification, are permitted provided that the following conditions | 5 * modification, are permitted provided that the following conditions |
| 6 * are met: | 6 * are met: |
| 7 * 1. Redistributions of source code must retain the above copyright | 7 * 1. Redistributions of source code must retain the above copyright |
| 8 * notice, this list of conditions and the following disclaimer. | 8 * notice, this list of conditions and the following disclaimer. |
| 9 * 2. Redistributions in binary form must reproduce the above copyright | 9 * 2. Redistributions in binary form must reproduce the above copyright |
| 10 * notice, this list of conditions and the following disclaimer in the | 10 * notice, this list of conditions and the following disclaimer in the |
| (...skipping 23 matching lines...) Expand all Loading... |
| 34 #endif | 34 #endif |
| 35 | 35 |
| 36 #if CPU(X86) || CPU(X86_64) | 36 #if CPU(X86) || CPU(X86_64) |
| 37 #include <emmintrin.h> | 37 #include <emmintrin.h> |
| 38 #endif | 38 #endif |
| 39 | 39 |
| 40 #if HAVE(ARM_NEON_INTRINSICS) | 40 #if HAVE(ARM_NEON_INTRINSICS) |
| 41 #include <arm_neon.h> | 41 #include <arm_neon.h> |
| 42 #endif | 42 #endif |
| 43 | 43 |
| 44 #if HAVE(MIPS_MSA_INTRINSICS) |
| 45 #include "platform/cpu/mips/CommonMacrosMSA.h" |
| 46 #endif |
| 47 |
| 44 #include <math.h> | 48 #include <math.h> |
| 45 #include <algorithm> | 49 #include <algorithm> |
| 46 | 50 |
| 47 namespace blink { | 51 namespace blink { |
| 48 | 52 |
| 49 namespace VectorMath { | 53 namespace VectorMath { |
| 50 | 54 |
| 51 #if OS(MACOSX) | 55 #if OS(MACOSX) |
| 52 // On the Mac we use the highly optimized versions in Accelerate.framework | 56 // On the Mac we use the highly optimized versions in Accelerate.framework |
| 53 // In 32-bit mode (__ppc__ or __i386__) <Accelerate/Accelerate.h> includes | 57 // In 32-bit mode (__ppc__ or __i386__) <Accelerate/Accelerate.h> includes |
| (...skipping 168 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 222 float32x4_t dest = vld1q_f32(destP); | 226 float32x4_t dest = vld1q_f32(destP); |
| 223 | 227 |
| 224 dest = vmlaq_f32(dest, source, k); | 228 dest = vmlaq_f32(dest, source, k); |
| 225 vst1q_f32(destP, dest); | 229 vst1q_f32(destP, dest); |
| 226 | 230 |
| 227 sourceP += 4; | 231 sourceP += 4; |
| 228 destP += 4; | 232 destP += 4; |
| 229 } | 233 } |
| 230 n = tailFrames; | 234 n = tailFrames; |
| 231 } | 235 } |
| 236 #elif HAVE(MIPS_MSA_INTRINSICS) |
| 237 if ((sourceStride == 1) && (destStride == 1)) { |
| 238 float* destPCopy = destP; |
| 239 v4f32 vScale; |
| 240 v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7; |
| 241 v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7; |
| 242 FloatInt scaleVal; |
| 243 |
| 244 scaleVal.floatVal = *scale; |
| 245 vScale = (v4f32)__msa_fill_w(scaleVal.intVal); |
| 246 |
| 247 for (; n >= 32; n -= 32) { |
| 248 LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, |
| 249 vSrc7); |
| 250 LD_SP8(destPCopy, 4, vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, |
| 251 vDst7); |
| 252 VSMA4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vScale); |
| 253 VSMA4(vSrc4, vSrc5, vSrc6, vSrc7, vDst4, vDst5, vDst6, vDst7, vScale); |
| 254 ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP, 4); |
| 255 } |
| 256 } |
| 232 #endif | 257 #endif |
| 233 while (n) { | 258 while (n) { |
| 234 *destP += *sourceP * *scale; | 259 *destP += *sourceP * *scale; |
| 235 sourceP += sourceStride; | 260 sourceP += sourceStride; |
| 236 destP += destStride; | 261 destP += destStride; |
| 237 n--; | 262 n--; |
| 238 } | 263 } |
| 239 } | 264 } |
| 240 | 265 |
| 241 void vsmul(const float* sourceP, | 266 void vsmul(const float* sourceP, |
| (...skipping 61 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 303 | 328 |
| 304 while (destP < endP) { | 329 while (destP < endP) { |
| 305 float32x4_t source = vld1q_f32(sourceP); | 330 float32x4_t source = vld1q_f32(sourceP); |
| 306 vst1q_f32(destP, vmulq_n_f32(source, k)); | 331 vst1q_f32(destP, vmulq_n_f32(source, k)); |
| 307 | 332 |
| 308 sourceP += 4; | 333 sourceP += 4; |
| 309 destP += 4; | 334 destP += 4; |
| 310 } | 335 } |
| 311 n = tailFrames; | 336 n = tailFrames; |
| 312 } | 337 } |
| 338 #elif HAVE(MIPS_MSA_INTRINSICS) |
| 339 if ((sourceStride == 1) && (destStride == 1)) { |
| 340 v4f32 vScale; |
| 341 v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7; |
| 342 v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7; |
| 343 FloatInt scaleVal; |
| 344 |
| 345 scaleVal.floatVal = *scale; |
| 346 vScale = (v4f32)__msa_fill_w(scaleVal.intVal); |
| 347 |
| 348 for (; n >= 32; n -= 32) { |
| 349 LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, |
| 350 vSrc7); |
| 351 VSMUL4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vScale); |
| 352 VSMUL4(vSrc4, vSrc5, vSrc6, vSrc7, vDst4, vDst5, vDst6, vDst7, vScale); |
| 353 ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP, 4); |
| 354 } |
| 355 } |
| 313 #endif | 356 #endif |
| 314 float k = *scale; | 357 float k = *scale; |
| 315 while (n--) { | 358 while (n--) { |
| 316 *destP = k * *sourceP; | 359 *destP = k * *sourceP; |
| 317 sourceP += sourceStride; | 360 sourceP += sourceStride; |
| 318 destP += destStride; | 361 destP += destStride; |
| 319 } | 362 } |
| 320 #if CPU(X86) || CPU(X86_64) | 363 #if CPU(X86) || CPU(X86_64) |
| 321 } | 364 } |
| 322 #endif | 365 #endif |
| (...skipping 101 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 424 float32x4_t source1 = vld1q_f32(source1P); | 467 float32x4_t source1 = vld1q_f32(source1P); |
| 425 float32x4_t source2 = vld1q_f32(source2P); | 468 float32x4_t source2 = vld1q_f32(source2P); |
| 426 vst1q_f32(destP, vaddq_f32(source1, source2)); | 469 vst1q_f32(destP, vaddq_f32(source1, source2)); |
| 427 | 470 |
| 428 source1P += 4; | 471 source1P += 4; |
| 429 source2P += 4; | 472 source2P += 4; |
| 430 destP += 4; | 473 destP += 4; |
| 431 } | 474 } |
| 432 n = tailFrames; | 475 n = tailFrames; |
| 433 } | 476 } |
| 477 #elif HAVE(MIPS_MSA_INTRINSICS) |
| 478 if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) { |
| 479 v4f32 vSrc1P0, vSrc1P1, vSrc1P2, vSrc1P3, vSrc1P4, vSrc1P5, vSrc1P6, |
| 480 vSrc1P7; |
| 481 v4f32 vSrc2P0, vSrc2P1, vSrc2P2, vSrc2P3, vSrc2P4, vSrc2P5, vSrc2P6, |
| 482 vSrc2P7; |
| 483 v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7; |
| 484 |
| 485 for (; n >= 32; n -= 32) { |
| 486 LD_SP8(source1P, 4, vSrc1P0, vSrc1P1, vSrc1P2, vSrc1P3, vSrc1P4, vSrc1P5, |
| 487 vSrc1P6, vSrc1P7); |
| 488 LD_SP8(source2P, 4, vSrc2P0, vSrc2P1, vSrc2P2, vSrc2P3, vSrc2P4, vSrc2P5, |
| 489 vSrc2P6, vSrc2P7); |
| 490 ADD4(vSrc1P0, vSrc2P0, vSrc1P1, vSrc2P1, vSrc1P2, vSrc2P2, vSrc1P3, |
| 491 vSrc2P3, vDst0, vDst1, vDst2, vDst3); |
| 492 ADD4(vSrc1P4, vSrc2P4, vSrc1P5, vSrc2P5, vSrc1P6, vSrc2P6, vSrc1P7, |
| 493 vSrc2P7, vDst4, vDst5, vDst6, vDst7); |
| 494 ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP, 4); |
| 495 } |
| 496 } |
| 434 #endif | 497 #endif |
| 435 while (n--) { | 498 while (n--) { |
| 436 *destP = *source1P + *source2P; | 499 *destP = *source1P + *source2P; |
| 437 source1P += sourceStride1; | 500 source1P += sourceStride1; |
| 438 source2P += sourceStride2; | 501 source2P += sourceStride2; |
| 439 destP += destStride; | 502 destP += destStride; |
| 440 } | 503 } |
| 441 #if CPU(X86) || CPU(X86_64) | 504 #if CPU(X86) || CPU(X86_64) |
| 442 } | 505 } |
| 443 #endif | 506 #endif |
| (...skipping 63 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 507 float32x4_t source1 = vld1q_f32(source1P); | 570 float32x4_t source1 = vld1q_f32(source1P); |
| 508 float32x4_t source2 = vld1q_f32(source2P); | 571 float32x4_t source2 = vld1q_f32(source2P); |
| 509 vst1q_f32(destP, vmulq_f32(source1, source2)); | 572 vst1q_f32(destP, vmulq_f32(source1, source2)); |
| 510 | 573 |
| 511 source1P += 4; | 574 source1P += 4; |
| 512 source2P += 4; | 575 source2P += 4; |
| 513 destP += 4; | 576 destP += 4; |
| 514 } | 577 } |
| 515 n = tailFrames; | 578 n = tailFrames; |
| 516 } | 579 } |
| 580 #elif HAVE(MIPS_MSA_INTRINSICS) |
| 581 if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) { |
| 582 v4f32 vSrc1P0, vSrc1P1, vSrc1P2, vSrc1P3, vSrc1P4, vSrc1P5, vSrc1P6, |
| 583 vSrc1P7; |
| 584 v4f32 vSrc2P0, vSrc2P1, vSrc2P2, vSrc2P3, vSrc2P4, vSrc2P5, vSrc2P6, |
| 585 vSrc2P7; |
| 586 v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7; |
| 587 |
| 588 for (; n >= 32; n -= 32) { |
| 589 LD_SP8(source1P, 4, vSrc1P0, vSrc1P1, vSrc1P2, vSrc1P3, vSrc1P4, vSrc1P5, |
| 590 vSrc1P6, vSrc1P7); |
| 591 LD_SP8(source2P, 4, vSrc2P0, vSrc2P1, vSrc2P2, vSrc2P3, vSrc2P4, vSrc2P5, |
| 592 vSrc2P6, vSrc2P7); |
| 593 MUL4(vSrc1P0, vSrc2P0, vSrc1P1, vSrc2P1, vSrc1P2, vSrc2P2, vSrc1P3, |
| 594 vSrc2P3, vDst0, vDst1, vDst2, vDst3); |
| 595 MUL4(vSrc1P4, vSrc2P4, vSrc1P5, vSrc2P5, vSrc1P6, vSrc2P6, vSrc1P7, |
| 596 vSrc2P7, vDst4, vDst5, vDst6, vDst7); |
| 597 ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP, 4); |
| 598 } |
| 599 } |
| 517 #endif | 600 #endif |
| 518 while (n) { | 601 while (n) { |
| 519 *destP = *source1P * *source2P; | 602 *destP = *source1P * *source2P; |
| 520 source1P += sourceStride1; | 603 source1P += sourceStride1; |
| 521 source2P += sourceStride2; | 604 source2P += sourceStride2; |
| 522 destP += destStride; | 605 destP += destStride; |
| 523 n--; | 606 n--; |
| 524 } | 607 } |
| 525 } | 608 } |
| 526 | 609 |
| (...skipping 182 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 709 } | 792 } |
| 710 float32x2_t twoMax = | 793 float32x2_t twoMax = |
| 711 vmax_f32(vget_low_f32(fourMax), vget_high_f32(fourMax)); | 794 vmax_f32(vget_low_f32(fourMax), vget_high_f32(fourMax)); |
| 712 | 795 |
| 713 float groupMax[2]; | 796 float groupMax[2]; |
| 714 vst1_f32(groupMax, twoMax); | 797 vst1_f32(groupMax, twoMax); |
| 715 max = std::max(groupMax[0], groupMax[1]); | 798 max = std::max(groupMax[0], groupMax[1]); |
| 716 | 799 |
| 717 n = tailFrames; | 800 n = tailFrames; |
| 718 } | 801 } |
| 802 #elif HAVE(MIPS_MSA_INTRINSICS) |
| 803 if (sourceStride == 1) { |
| 804 v4f32 vMax = { |
| 805 0, |
| 806 }; |
| 807 v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7; |
| 808 const v16i8 vSignBitMask = (v16i8)__msa_fill_w(0x7FFFFFFF); |
| 809 |
| 810 for (; n >= 32; n -= 32) { |
| 811 LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, |
| 812 vSrc7); |
| 813 AND_W4_SP(vSrc0, vSrc1, vSrc2, vSrc3, vSignBitMask); |
| 814 VMAX_W4_SP(vSrc0, vSrc1, vSrc2, vSrc3, vMax); |
| 815 AND_W4_SP(vSrc4, vSrc5, vSrc6, vSrc7, vSignBitMask); |
| 816 VMAX_W4_SP(vSrc4, vSrc5, vSrc6, vSrc7, vMax); |
| 817 } |
| 818 |
| 819 max = std::max(max, vMax[0]); |
| 820 max = std::max(max, vMax[1]); |
| 821 max = std::max(max, vMax[2]); |
| 822 max = std::max(max, vMax[3]); |
| 823 } |
| 719 #endif | 824 #endif |
| 720 | 825 |
| 721 while (n--) { | 826 while (n--) { |
| 722 max = std::max(max, fabsf(*sourceP)); | 827 max = std::max(max, fabsf(*sourceP)); |
| 723 sourceP += sourceStride; | 828 sourceP += sourceStride; |
| 724 } | 829 } |
| 725 | 830 |
| 726 ASSERT(maxP); | 831 ASSERT(maxP); |
| 727 *maxP = max; | 832 *maxP = max; |
| 728 } | 833 } |
| (...skipping 18 matching lines...) Expand all Loading... |
| 747 float32x4_t low = vdupq_n_f32(lowThreshold); | 852 float32x4_t low = vdupq_n_f32(lowThreshold); |
| 748 float32x4_t high = vdupq_n_f32(highThreshold); | 853 float32x4_t high = vdupq_n_f32(highThreshold); |
| 749 while (destP < endP) { | 854 while (destP < endP) { |
| 750 float32x4_t source = vld1q_f32(sourceP); | 855 float32x4_t source = vld1q_f32(sourceP); |
| 751 vst1q_f32(destP, vmaxq_f32(vminq_f32(source, high), low)); | 856 vst1q_f32(destP, vmaxq_f32(vminq_f32(source, high), low)); |
| 752 sourceP += 4; | 857 sourceP += 4; |
| 753 destP += 4; | 858 destP += 4; |
| 754 } | 859 } |
| 755 n = tailFrames; | 860 n = tailFrames; |
| 756 } | 861 } |
| 862 #elif HAVE(MIPS_MSA_INTRINSICS) |
| 863 if ((sourceStride == 1) && (destStride == 1)) { |
| 864 v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7; |
| 865 v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7; |
| 866 v4f32 vLowThr, vHighThr; |
| 867 FloatInt lowThr, highThr; |
| 868 |
| 869 lowThr.floatVal = lowThreshold; |
| 870 highThr.floatVal = highThreshold; |
| 871 vLowThr = (v4f32)__msa_fill_w(lowThr.intVal); |
| 872 vHighThr = (v4f32)__msa_fill_w(highThr.intVal); |
| 873 |
| 874 for (; n >= 32; n -= 32) { |
| 875 LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, |
| 876 vSrc7); |
| 877 VCLIP4(vSrc0, vSrc1, vSrc2, vSrc3, vLowThr, vHighThr, vDst0, vDst1, vDst2, |
| 878 vDst3); |
| 879 VCLIP4(vSrc4, vSrc5, vSrc6, vSrc7, vLowThr, vHighThr, vDst4, vDst5, vDst6, |
| 880 vDst7); |
| 881 ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP, 4); |
| 882 } |
| 883 } |
| 757 #endif | 884 #endif |
| 758 while (n--) { | 885 while (n--) { |
| 759 *destP = clampTo(*sourceP, lowThreshold, highThreshold); | 886 *destP = clampTo(*sourceP, lowThreshold, highThreshold); |
| 760 sourceP += sourceStride; | 887 sourceP += sourceStride; |
| 761 destP += destStride; | 888 destP += destStride; |
| 762 } | 889 } |
| 763 } | 890 } |
| 764 | 891 |
| 765 #endif // OS(MACOSX) | 892 #endif // OS(MACOSX) |
| 766 | 893 |
| 767 } // namespace VectorMath | 894 } // namespace VectorMath |
| 768 | 895 |
| 769 } // namespace blink | 896 } // namespace blink |
| OLD | NEW |