| OLD | NEW |
| 1 // Copyright 2016 The Chromium Authors. All rights reserved. | 1 // Copyright 2016 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #ifndef CommonMacrosMSA_h | 5 #ifndef CommonMacrosMSA_h |
| 6 #define CommonMacrosMSA_h | 6 #define CommonMacrosMSA_h |
| 7 | 7 |
| 8 #include <msa.h> | 8 #include <msa.h> |
| 9 #include <stdint.h> | 9 #include <stdint.h> |
| 10 | 10 |
| 11 #if defined(__clang__) | 11 #if defined(__clang__) |
| 12 #define CLANG_BUILD | 12 #define CLANG_BUILD |
| 13 #endif | 13 #endif |
| 14 | 14 |
| 15 typedef union { |
| 16 int32_t intVal; |
| 17 float floatVal; |
| 18 } FloatInt; |
| 19 |
| 15 #ifdef CLANG_BUILD | 20 #ifdef CLANG_BUILD |
| 16 #define SRLI_B(a, b) __msa_srli_b((v16i8)a, b) | 21 #define SRLI_B(a, b) __msa_srli_b((v16i8)a, b) |
| 17 #define SRLI_H(a, b) __msa_srli_h((v8i16)a, b) | 22 #define SRLI_H(a, b) __msa_srli_h((v8i16)a, b) |
| 18 #define SLLI_B(a, b) __msa_slli_b((v16i8)a, b) | 23 #define SLLI_B(a, b) __msa_slli_b((v16i8)a, b) |
| 19 #define SLLI_H(a, b) __msa_slli_h((v8i16)a, b) | 24 #define SLLI_H(a, b) __msa_slli_h((v8i16)a, b) |
| 20 #define CEQI_B(a, b) __msa_ceqi_b((v16i8)a, b) | 25 #define CEQI_B(a, b) __msa_ceqi_b((v16i8)a, b) |
| 21 #define CEQI_H(a, b) __msa_ceqi_h((v8i16)a, b) | 26 #define CEQI_H(a, b) __msa_ceqi_h((v8i16)a, b) |
| 22 #define ANDI_B(a, b) __msa_andi_b((v16u8)a, b) | 27 #define ANDI_B(a, b) __msa_andi_b((v16u8)a, b) |
| 23 #else | 28 #else |
| 24 #define SRLI_B(a, b) ((v16u8)a >> b) | 29 #define SRLI_B(a, b) ((v16u8)a >> b) |
| (...skipping 632 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 657 { \ | 662 { \ |
| 658 out0 = in0 / in1; \ | 663 out0 = in0 / in1; \ |
| 659 out1 = in2 / in3; \ | 664 out1 = in2 / in3; \ |
| 660 } | 665 } |
| 661 #define DIV4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ | 666 #define DIV4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ |
| 662 { \ | 667 { \ |
| 663 DIV2(in0, in1, in2, in3, out0, out1); \ | 668 DIV2(in0, in1, in2, in3, out0, out1); \ |
| 664 DIV2(in4, in5, in6, in7, out2, out3); \ | 669 DIV2(in4, in5, in6, in7, out2, out3); \ |
| 665 } | 670 } |
| 666 | 671 |
| 672 /* Description : Bitwise AND of 4 vectors with a common mask |
| 673 Arguments : Inputs - in0, in1, in2, in3, mask |
| 674 Outputs - in0, in1, in2, in3 (overwritten in place, cast to RTYPE) |
| 675 Details : 'in0' is bitwise AND'ed with 'mask'; result is written to 'in0' |
| 676 'in1' is bitwise AND'ed with 'mask'; result is written to 'in1' |
| 677 'in2' is bitwise AND'ed with 'mask'; result is written to 'in2' |
| 678 'in3' is bitwise AND'ed with 'mask'; result is written to 'in3' |
| 679 */ |
| 680 #define AND_W4(RTYPE, in0, in1, in2, in3, mask) \ |
| 681 { \ |
| 682 in0 = (RTYPE)((v16i8)in0 & (v16i8)mask); \ |
| 683 in1 = (RTYPE)((v16i8)in1 & (v16i8)mask); \ |
| 684 in2 = (RTYPE)((v16i8)in2 & (v16i8)mask); \ |
| 685 in3 = (RTYPE)((v16i8)in3 & (v16i8)mask); \ |
| 686 } |
| 687 #define AND_W4_SP(...) AND_W4(v4f32, __VA_ARGS__) |
| 688 |
| 689 /* Description : Addition of 2 pairs of vectors |
| 690 Arguments : Inputs - in0, in1, in2, in3 |
| 691 Outputs - out0, out1 |
| 692 Details : Each element in 'in0' is added to 'in1' and result is written |
| 693 to 'out0' |
| 694 Each element in 'in2' is added to 'in3' and result is written |
| 695 to 'out1' |
| 696 */ |
| 697 #define ADD2(in0, in1, in2, in3, out0, out1) \ |
| 698 { \ |
| 699 out0 = in0 + in1; \ |
| 700 out1 = in2 + in3; \ |
| 701 } |
| 702 |
| 703 /* Description : Addition of 4 pairs of vectors |
| 704 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 |
| 705 Outputs - out0, out1, out2, out3 |
| 706 Details : Each element in 'in0' is added to 'in1' and result is written |
| 707 to 'out0' |
| 708 Each element in 'in2' is added to 'in3' and result is written |
| 709 to 'out1' |
| 710 Each element in 'in4' is added to 'in5' and result is written |
| 711 to 'out2' |
| 712 Each element in 'in6' is added to 'in7' and result is written |
| 713 to 'out3' |
| 714 */ |
| 715 #define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ |
| 716 { \ |
| 717 ADD2(in0, in1, in2, in3, out0, out1); \ |
| 718 ADD2(in4, in5, in6, in7, out2, out3); \ |
| 719 } |
| 720 |
| 667 /* Description : Vector Floating-Point Convert from Unsigned Integer | 721 /* Description : Vector Floating-Point Convert from Unsigned Integer |
| 668 Arguments : Inputs - in0, in1 | 722 Arguments : Inputs - in0, in1 |
| 669 Outputs - out0, out1 | 723 Outputs - out0, out1 |
| 670 Details : | |
| 671 */ | 724 */ |
| 672 #define FFINTU_W2(RTYPE, in0, in1, out0, out1) \ | 725 #define FFINTU_W2(RTYPE, in0, in1, out0, out1) \ |
| 673 { \ | 726 { \ |
| 674 out0 = (RTYPE)__msa_ffint_u_w((v4u32)in0); \ | 727 out0 = (RTYPE)__msa_ffint_u_w((v4u32)in0); \ |
| 675 out1 = (RTYPE)__msa_ffint_u_w((v4u32)in1); \ | 728 out1 = (RTYPE)__msa_ffint_u_w((v4u32)in1); \ |
| 676 } | 729 } |
| 677 #define FFINTU_W2_SP(...) FFINTU_W2(v4f32, __VA_ARGS__) | 730 #define FFINTU_W2_SP(...) FFINTU_W2(v4f32, __VA_ARGS__) |
| 678 | 731 |
| 732 /* Description : Vector Floating-Point Convert from Unsigned Integer |
| 733 Arguments : Inputs - in0, in1, in2, in3 |
| 734 Outputs - out0, out1, out2, out3 |
| 735 */ |
| 679 #define FFINTU_W4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \ | 736 #define FFINTU_W4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \ |
| 680 { \ | 737 { \ |
| 681 FFINTU_W2(RTYPE, in0, in1, out0, out1); \ | 738 FFINTU_W2(RTYPE, in0, in1, out0, out1); \ |
| 682 FFINTU_W2(RTYPE, in2, in3, out2, out3); \ | 739 FFINTU_W2(RTYPE, in2, in3, out2, out3); \ |
| 683 } | 740 } |
| 684 #define FFINTU_W4_SP(...) FFINTU_W4(v4f32, __VA_ARGS__) | 741 #define FFINTU_W4_SP(...) FFINTU_W4(v4f32, __VA_ARGS__) |
| 685 | 742 |
| 686 /* Description : Vector Floating-Point Truncate and Convert to Unsigned Integer | 743 /* Description : Vector Floating-Point Truncate and Convert to Unsigned Integer |
| 687 Arguments : Inputs - in0, in1 | 744 Arguments : Inputs - in0, in1 |
| 688 Outputs - out0, out1 | 745 Outputs - out0, out1 |
| 689 Details : | |
| 690 */ | 746 */ |
| 691 #define FTRUNCU_W2(RTYPE, in0, in1, out0, out1) \ | 747 #define FTRUNCU_W2(RTYPE, in0, in1, out0, out1) \ |
| 692 { \ | 748 { \ |
| 693 out0 = (RTYPE)__msa_ftrunc_u_w((v4f32)in0); \ | 749 out0 = (RTYPE)__msa_ftrunc_u_w((v4f32)in0); \ |
| 694 out1 = (RTYPE)__msa_ftrunc_u_w((v4f32)in1); \ | 750 out1 = (RTYPE)__msa_ftrunc_u_w((v4f32)in1); \ |
| 695 } | 751 } |
| 696 #define FTRUNCU_W2_UB(...) FTRUNCU_W2(v16u8, __VA_ARGS__) | 752 #define FTRUNCU_W2_UB(...) FTRUNCU_W2(v16u8, __VA_ARGS__) |
| 697 | 753 |
| 754 /* Description : Vector Floating-Point Truncate and Convert to Unsigned Integer |
| 755 Arguments : Inputs - in0, in1, in2, in3 |
| 756 Outputs - out0, out1, out2, out3 |
| 757 */ |
| 698 #define FTRUNCU_W4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \ | 758 #define FTRUNCU_W4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \ |
| 699 { \ | 759 { \ |
| 700 FTRUNCU_W2(RTYPE, in0, in1, out0, out1); \ | 760 FTRUNCU_W2(RTYPE, in0, in1, out0, out1); \ |
| 701 FTRUNCU_W2(RTYPE, in2, in3, out2, out3); \ | 761 FTRUNCU_W2(RTYPE, in2, in3, out2, out3); \ |
| 702 } | 762 } |
| 703 #define FTRUNCU_W4_UB(...) FTRUNCU_W4(v16u8, __VA_ARGS__) | 763 #define FTRUNCU_W4_UB(...) FTRUNCU_W4(v16u8, __VA_ARGS__) |
| 704 | 764 |
| 765 /* Description : Vector Floating-Point multiply with scale and accumulate |
| 766 Arguments : Inputs - in0, in1, in2, in3, out0, out1, out2, out3, scale |
| 767 Outputs - out0, out1, out2, out3 (outN += inN * scale) |
| 768 */ |
| 769 #define VSMA4(in0, in1, in2, in3, out0, out1, out2, out3, scale) \ |
| 770 { \ |
| 771 out0 += in0 * scale; \ |
| 772 out1 += in1 * scale; \ |
| 773 out2 += in2 * scale; \ |
| 774 out3 += in3 * scale; \ |
| 775 } |
| 776 |
| 777 /* Description : Vector Floating-Point multiply with scale |
| 778 Arguments : Inputs - in0, in1, in2, in3, scale |
| 779 Outputs - out0, out1, out2, out3 (outN = inN * scale) |
| 780 */ |
| 781 #define VSMUL4(in0, in1, in2, in3, out0, out1, out2, out3, scale) \ |
| 782 { \ |
| 783 out0 = in0 * scale; \ |
| 784 out1 = in1 * scale; \ |
| 785 out2 = in2 * scale; \ |
| 786 out3 = in3 * scale; \ |
| 787 } |
| 788 |
| 789 /* Description : Vector Floating-Point max value, accumulated into 'max' |
| 790 Arguments : Inputs - in0, in1, in2, in3, max |
| 791 Outputs - max (updated via __msa_fmax_w with in0, in1, in2, in3 in turn) |
| 792 */ |
| 793 #define VMAX_W4(RTYPE, in0, in1, in2, in3, max) \ |
| 794 { \ |
| 795 max = (RTYPE)__msa_fmax_w((v4f32)max, (v4f32)in0); \ |
| 796 max = (RTYPE)__msa_fmax_w((v4f32)max, (v4f32)in1); \ |
| 797 max = (RTYPE)__msa_fmax_w((v4f32)max, (v4f32)in2); \ |
| 798 max = (RTYPE)__msa_fmax_w((v4f32)max, (v4f32)in3); \ |
| 799 } |
| 800 #define VMAX_W4_SP(...) VMAX_W4(v4f32, __VA_ARGS__) |
| 801 |
| 802 /* Description : Vector Floating-Point clip to min max |
| 803 Arguments : Inputs - in0, in1, in2, in3, min, max |
| 804 Outputs - out0, out1, out2, out3 (outN = __msa_fmax_w(__msa_fmin_w(inN, max), min)) |
| 805 */ |
| 806 #define VCLIP4(in0, in1, in2, in3, min, max, out0, out1, out2, out3) \ |
| 807 { \ |
| 808 out0 = __msa_fmax_w(__msa_fmin_w(in0, max), min); \ |
| 809 out1 = __msa_fmax_w(__msa_fmin_w(in1, max), min); \ |
| 810 out2 = __msa_fmax_w(__msa_fmin_w(in2, max), min); \ |
| 811 out3 = __msa_fmax_w(__msa_fmin_w(in3, max), min); \ |
| 812 } |
| 705 #endif // CommonMacrosMSA_h | 814 #endif // CommonMacrosMSA_h |
| OLD | NEW |